tasks_ncbi
pipes/WDL/tasks/tasks_ncbi.wdl

TASKS tasks_ncbi

File Path pipes/WDL/tasks/tasks_ncbi.wdl
WDL Version 1.0
Type tasks

Tasks

TASKS download_fasta

Inputs

Name Type Description Default
out_prefix String - -
accessions Array[String]+ - -
emailAddress String - -
1 optional input with default value

Command

ncbi.py --version | tee VERSION
ncbi.py fetch_fastas \
    ~{emailAddress} \
    . \
    ~{sep=" " accessions} \
    --combinedFilePrefix ~{out_prefix}

Outputs

Name Type Expression
sequences_fasta File "~{out_prefix}.fasta"
viralngs_version String read_string("VERSION")

Runtime

Key Value
docker docker
memory "7 GB"
cpu 2
dx_instance_type "mem2_ssd1_v2_x2"
maxRetries 2

TASKS download_fasta_from_accession_string

Download GenBank sequences from a space-separated string of accessions (optionally with coverage specifications like 'accession:12.5x') and combine into a single fasta file.
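
A minimal usage sketch (not part of the WDL file itself) showing how this task might be called from a workflow. The import path, accessions, and email address are illustrative assumptions; optional inputs keep their defaults.

version 1.0
import "tasks_ncbi.wdl" as ncbi   # assumed relative path to this tasks file

workflow fetch_refs_example {
  call ncbi.download_fasta_from_accession_string {
    input:
      accession_string = "MN908947.3:30x NC_045512.2",  # illustrative accessions; ':30x' is an optional coverage spec
      out_prefix       = "refs",
      emailAddress     = "user@example.org"
  }
  output {
    File refs_fasta = download_fasta_from_accession_string.sequences_fasta
  }
}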

Inputs

Name Type Description Default
accession_string String - -
out_prefix String - -
emailAddress String - -
1 optional input with default value

Command

set -e
ncbi.py --version | tee VERSION

# Parse accession string to extract just the accession IDs (strip coverage values if present)
python3 <<CODE
import re
accession_string = "~{accession_string}"
# Split by whitespace and extract accession (before colon if present)
accessions = []
for pair in accession_string.split():
    if ':' in pair:
        accessions.append(pair.split(':')[0])
    else:
        accessions.append(pair)

with open('accessions.txt', 'w') as f:
    for acc in accessions:
        f.write(acc + '\n')
CODE

# Download sequences using extracted accessions
ncbi.py fetch_fastas \
    ~{emailAddress} \
    . \
    $(cat accessions.txt | tr '\n' ' ') \
    --combinedFilePrefix ~{out_prefix}

Outputs

Name Type Expression
sequences_fasta File "~{out_prefix}.fasta"
viralngs_version String read_string("VERSION")

Runtime

Key Value
docker docker
memory "7 GB"
cpu 2
dx_instance_type "mem2_ssd1_v2_x2"
maxRetries 2

TASKS download_annotations

Inputs

Name Type Description Default
accessions Array[String]+ - -
emailAddress String - -
combined_out_prefix String - -
1 optional input with default value

Command

set -ex -o pipefail
ncbi.py --version | tee VERSION
ncbi.py fetch_feature_tables \
    ~{emailAddress} \
    ./ \
    ~{sep=" " accessions} \
    --loglevel DEBUG
mkdir -p combined
ncbi.py fetch_fastas \
    ~{emailAddress} \
    ./ \
    ~{sep=" " accessions} \
    --combinedFilePrefix "combined/~{combined_out_prefix}" \
    --forceOverwrite \
    --loglevel DEBUG

Outputs

Name Type Expression
combined_fasta File "combined/~{combined_out_prefix}.fasta"
genomes_fasta Array[File] glob("*.fasta")
features_tbl Array[File] glob("*.tbl")
viralngs_version String read_string("VERSION")

Runtime

Key Value
docker docker
memory "7 GB"
cpu 2
dx_instance_type "mem2_ssd1_v2_x2"
maxRetries 2

TASKS download_ref_genomes_from_tsv

Inputs

Name Type Description Default
ref_genomes_tsv File - -
emailAddress String - -
1 optional input with default value

Command

set -ex -o pipefail
ncbi.py --version | tee VERSION
mkdir -p combined

python3<<CODE
import csv
import phylo.genbank
with open("~{ref_genomes_tsv}", 'rt') as inf:
  reader = csv.DictReader(inf, delimiter='\t',
    fieldnames=['tax_id', 'isolate_prefix', 'taxname', 'accessions']) # backwards support for headerless tsvs
  # for the future: batch all the downloads in a single call and re-organize output files afterwards
  for ref_genome in reader:
    if ref_genome['tax_id'] != 'tax_id': # skip header
      accessions = ref_genome['accessions'].split(':')
      phylo.genbank.fetch_fastas_from_genbank(
        accessionList=accessions,
        destinationDir=".",
        emailAddress="~{emailAddress}",
        forceOverwrite=True,
        combinedFilePrefix="combined/" + '-'.join(accessions),
        removeSeparateFiles=False,
        chunkSize=500)
CODE

Outputs

Name Type Expression
ref_genomes_fastas Array[File] glob("combined/*.fasta")
num_references Int length(ref_genomes_fastas)

Runtime

Key Value
docker docker
memory "7 GB"
cpu 2
dx_instance_type "mem2_ssd1_v2_x2"
maxRetries 2

TASKS sequencing_platform_from_bam

Inputs

Name Type Description Default
bam File - -
1 optional input with default value

Command

set -ex -o pipefail
samtools view -H "~{bam}" | grep '^@RG' | grep -o 'PL:[^[:space:]]*' | cut -d':' -f2 | sort | uniq > BAM_PLATFORMS
if [ $(wc -l < BAM_PLATFORMS) -ne 1 ]; then
  echo "Other: hybrid" > GENBANK_SEQ_TECH
elif grep -qi 'ILLUMINA' BAM_PLATFORMS; then
  echo "Illumina" > GENBANK_SEQ_TECH
elif grep -qi 'ONT' BAM_PLATFORMS; then
  echo "Oxford Nanopore" > GENBANK_SEQ_TECH
elif grep -qi 'PACBIO' BAM_PLATFORMS; then
  echo "PacBio" > GENBANK_SEQ_TECH
elif grep -qi 'IONTORRENT' BAM_PLATFORMS; then
  echo "IonTorrent" > GENBANK_SEQ_TECH
elif grep -qi 'SOLID' BAM_PLATFORMS; then
  echo "SOLiD" > GENBANK_SEQ_TECH
elif grep -qi 'ULTIMA' BAM_PLATFORMS; then
  echo "Ultima" > GENBANK_SEQ_TECH
elif grep -qi 'ELEMENT' BAM_PLATFORMS; then
  echo "Element" > GENBANK_SEQ_TECH
elif grep -qi 'CAPILLARY' BAM_PLATFORMS; then
  echo "Sanger" > GENBANK_SEQ_TECH
else
  echo "Haven't seen this one before!"
  exit 1
fi

Outputs

Name Type Expression
genbank_sequencing_technology String read_string("GENBANK_SEQ_TECH")

Runtime

Key Value
docker docker
memory "3 GB"
cpu 2
dx_instance_type "mem1_ssd1_v2_x2"
maxRetries 2

TASKS align_and_annot_transfer_single

Given a reference genome annotation in TBL format (e.g. from Genbank or RefSeq) and a new genome not in Genbank, produce new annotation files (TBL format with appropriate coordinate conversions) for the new genome. The resulting output can be fed to tbl2asn for Genbank submission.
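
A minimal sketch (import path assumed; input names as documented below) of calling this task from a workflow; the reference fastas and feature tables must be supplied in the same segment order as the assembly.

version 1.0
import "tasks_ncbi.wdl" as ncbi   # assumed relative path to this tasks file

workflow annot_transfer_example {
  input {
    File         assembly_fasta   # new genome, all segments in one fasta
    Array[File]+ ref_fastas       # one reference fasta per segment, same order as the assembly
    Array[File]+ ref_tbls         # one feature table per segment, same order as ref_fastas
  }
  call ncbi.align_and_annot_transfer_single {
    input:
      genome_fasta             = assembly_fasta,
      reference_fastas         = ref_fastas,
      reference_feature_tables = ref_tbls
  }
  output {
    File transferred_tbl = align_and_annot_transfer_single.feature_tbl
  }
}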

Inputs

Name Type Description Default
genome_fasta File New genome, all segments/chromosomes in one fasta file. Must contain the same number of sequences as reference_fastas -
reference_fastas Array[File]+ Reference genome, each segment/chromosome in a separate fasta file, in the exact same count and order as the segments/chromosomes described in genome_fasta. Headers must be Genbank accessions. -
reference_feature_tables Array[File]+ NCBI Genbank feature table, each segment/chromosome in a separate TBL file, in the exact same count and order as the segments/chromosomes described in genome_fasta and reference_fastas. Accession numbers in the TBL files must correspond exactly to those in reference_fastas. -
3 optional inputs with default values

Command

set -e
ncbi.py --version | tee VERSION
mkdir -p out
ncbi.py tbl_transfer_multichr \
    "~{genome_fasta}" \
    out \
    --ref_fastas ~{sep=" " reference_fastas} \
    --ref_tbls ~{sep=" " reference_feature_tables} \
    --oob_clip \
    --loglevel DEBUG
cat out/*.tbl > "~{out_basename}.tbl"

Outputs

Name Type Expression
feature_tbl File "~{out_basename}.tbl"
viralngs_version String read_string("VERSION")

Runtime

Key Value
docker docker
memory machine_mem_gb + " GB"
cpu 8
dx_instance_type "mem2_ssd1_v2_x4"
preemptible 1
maxRetries 2

TASKS structured_comments

Inputs

Name Type Description Default
assembly_stats_tsv File - -
filter_to_ids File? - -
1 optional input with default value

Command

set -e

python3 << CODE
import util.file

samples_to_filter_to = set()
if "~{default="" filter_to_ids}":
    with open("~{default="" filter_to_ids}", 'rt') as inf:
        samples_to_filter_to = set(line.strip() for line in inf)

out_headers = ('SeqID', 'StructuredCommentPrefix', 'Assembly Method', 'Coverage', 'Sequencing Technology', 'StructuredCommentSuffix')
with open("~{out_base}.cmt", 'wt') as outf:
    outf.write('\t'.join(out_headers)+'\n')

    for row in util.file.read_tabfile_dict("~{assembly_stats_tsv}"):
        outrow = dict((h, row.get(h, '')) for h in out_headers)

        if samples_to_filter_to:
          if row['SeqID'] not in samples_to_filter_to:
              continue

        if outrow['Coverage']:
          outrow['Coverage'] = "{}x".format(round(float(outrow['Coverage'])))
        outrow['StructuredCommentPrefix'] = 'Assembly-Data'
        outrow['StructuredCommentSuffix'] = 'Assembly-Data'
        outf.write('\t'.join(outrow[h] for h in out_headers)+'\n')
CODE

Outputs

Name Type Expression
structured_comment_table File "~{out_base}.cmt"

Runtime

Key Value
docker docker
memory "1 GB"
cpu 1
dx_instance_type "mem1_ssd1_v2_x2"
maxRetries 2

TASKS structured_comments_from_aligned_bam

Inputs

Name Type Description Default
aligned_bam File - -
assembly_method String - -
assembly_method_version String - -
4 optional inputs with default values

Command

set -e -o pipefail

cp "~{aligned_bam}" aligned.bam
reports.py coverage_only aligned.bam coverage.txt
cat coverage.txt

samtools view -H "~{aligned_bam}" | grep '^@SQ' | grep -o 'SN:[^[:space:]]*' | cut -d':' -f2 | tee SEQ_IDS

samtools view -H "~{aligned_bam}" | grep '^@RG' | grep -o 'PL:[^[:space:]]*' | cut -d':' -f2 | sort | uniq | tee BAM_PLATFORMS
if [ $(wc -l < BAM_PLATFORMS) -ne 1 ]; then
  echo "Other: hybrid" > GENBANK_SEQ_TECH
elif grep -qi 'ILLUMINA' BAM_PLATFORMS; then
  echo "Illumina" > GENBANK_SEQ_TECH
elif grep -qi 'ONT' BAM_PLATFORMS; then
  echo "Oxford Nanopore" > GENBANK_SEQ_TECH
elif grep -qi 'PACBIO' BAM_PLATFORMS; then
  echo "PacBio" > GENBANK_SEQ_TECH
elif grep -qi 'IONTORRENT' BAM_PLATFORMS; then
  echo "IonTorrent" > GENBANK_SEQ_TECH
elif grep -qi 'SOLID' BAM_PLATFORMS; then
  echo "SOLiD" > GENBANK_SEQ_TECH
elif grep -qi 'ULTIMA' BAM_PLATFORMS; then
  echo "Ultima" > GENBANK_SEQ_TECH
elif grep -qi 'ELEMENT' BAM_PLATFORMS; then
  echo "Element" > GENBANK_SEQ_TECH
elif grep -qi 'CAPILLARY' BAM_PLATFORMS; then
  echo "Sanger" > GENBANK_SEQ_TECH
else
  echo "Haven't seen this one before!"
  exit 1
fi
cat GENBANK_SEQ_TECH

python3 << CODE
import Bio.SeqIO
import csv
import re

# get list of sequence IDs from BAM header
with open("SEQ_IDS") as inf:
    seqids = set(line.strip() for line in inf)
    if ~{true="True" false="False" sanitize_ids}:
      seqids = set(re.sub(r'[^0-9A-Za-z!_-]', '-', x) for x in seqids)

# get sequencing technology from BAM header    
with open("GENBANK_SEQ_TECH", "rt") as inf:
    sequencing_tech = inf.read().strip()

# this header has to be in this specific order -- don't reorder the columns!
out_headers = ('SeqID', 'StructuredCommentPrefix', 'Assembly Method', "~{true="Genome " false="" is_genome_assembly}Coverage", 'Sequencing Technology', 'StructuredCommentSuffix')
with open("~{out_basename}.cmt", 'wt') as outf:
  outf.write('\t'.join(out_headers)+'\n')

  with open("coverage.txt", "rt") as inf:
    for row in csv.DictReader(inf, delimiter='\t'):
      if row.get('sample') and row.get('aln2self_cov_median') and row['sample'] in seqids:
        outrow = {
          'SeqID': row['sample'],
          'Assembly Method': "~{assembly_method} v. ~{assembly_method_version}",  # note: the <tool name> v. <version name> format is required by NCBI, don't remove the " v. "
          'Sequencing Technology': sequencing_tech,
          "~{true="Genome " false="" is_genome_assembly}Coverage": "{}x".format(round(float(row['aln2self_cov_median']))),
          'StructuredCommentPrefix': "~{true="Genome-" false="" is_genome_assembly}Assembly-Data",
          'StructuredCommentSuffix': "~{true="Genome-" false="" is_genome_assembly}Assembly-Data",
        }
        outf.write('\t'.join(outrow[h] for h in out_headers)+'\n')
CODE

Outputs

Name Type Expression
structured_comment_file File "~{out_basename}.cmt"

Runtime

Key Value
docker docker
memory "2 GB"
cpu 2
dx_instance_type "mem1_ssd1_v2_x2"
maxRetries 2

TASKS prefix_fasta_header

Inputs

Name Type Description Default
genome_fasta File - -
prefix String - -
1 optional input with default value

Command

set -e
python3 <<CODE
with open('~{genome_fasta}', 'rt') as inf:
  with open('~{out_basename}.fasta', 'wt') as outf:
    for line in inf:
      if line.startswith('>'):
        line = ">{}{}\n".format('~{prefix}', line.rstrip()[1:])
      outf.write(line)
CODE

Outputs

Name Type Expression
renamed_fasta File "~{out_basename}.fasta"

Runtime

Key Value
docker "python:slim"
memory "1 GB"
cpu 1
dx_instance_type "mem1_ssd1_v2_x2"
maxRetries 2

TASKS rename_fasta_header

Inputs

Name Type Description Default
genome_fasta File - -
new_name String - -
2 optional inputs with default values

Command

set -e
file_utils.py rename_fasta_sequences \
  "~{genome_fasta}" "~{out_basename}.fasta" "~{new_name}"

Outputs

Name Type Expression
renamed_fasta File "~{out_basename}.fasta"

Runtime

Key Value
docker docker
memory "1 GB"
cpu 1
dx_instance_type "mem1_ssd1_v2_x2"
maxRetries 2

TASKS gisaid_meta_prep

Inputs

Name Type Description Default
source_modifier_table File - -
structured_comments File - -
out_name String - -
username String? - -
submitting_lab_name String - -
fasta_filename String? - -
4 optional inputs with default values

Command

python3 << CODE
import os.path
import csv
import json

strict = ~{true="True" false="False" strict}

# institutional mappings
address_map = json.loads('~{address_map}')
authors_map = json.loads('~{authors_map}')
assert "~{submitting_lab_name}" in address_map, f"error: institution '~{submitting_lab_name}' not found in address_map"

# lookup table files to dicts
sample_to_cmt = {}
with open('~{structured_comments}', 'rt') as inf:
  for row in csv.DictReader(inf, delimiter='\t'):
    sample_to_cmt[row['SeqID']] = row

out_headers = ('submitter', 'fn', 'covv_virus_name', 'covv_type', 'covv_passage', 'covv_collection_date', 'covv_location', 'covv_add_location', 'covv_host', 'covv_add_host_info', 'covv_sampling_strategy', 'covv_gender', 'covv_patient_age', 'covv_patient_status', 'covv_specimen', 'covv_outbreak', 'covv_last_vaccinated', 'covv_treatment', 'covv_seq_technology', 'covv_assembly_method', 'covv_coverage', 'covv_orig_lab', 'covv_orig_lab_addr', 'covv_provider_sample_id', 'covv_subm_lab', 'covv_subm_lab_addr', 'covv_subm_sample_id', 'covv_authors', 'covv_comment', 'comment_type')

with open('~{out_name}', 'w', newline='') as outf:
  writer = csv.DictWriter(outf, out_headers, dialect=csv.unix_dialect, quoting=csv.QUOTE_MINIMAL)
  writer.writeheader()

  with open('~{source_modifier_table}', 'rt') as inf:
    for row in csv.DictReader(inf, delimiter='\t'):

      isolation_source = row['isolation_source'].lower()
      #covv_specimen
      if strict:
        valid_isolation_sources = ('clinical', 'environmental')
        assert isolation_source in valid_isolation_sources, f"Metadata error: 'isolation_source' not one of: {valid_isolation_sources}\n{row}"
        assert row['host'] == 'Homo sapiens' or isolation_source == 'environmental', f"Metadata error: 'host' must be 'Homo sapiens' if 'isolation_source' is not 'Environmental'\n{row}"
        assert row['organism']         == 'Severe acute respiratory syndrome coronavirus 2', f"'organism' != 'Severe acute respiratory syndrome coronavirus 2'\n{row}"
        assert row['db_xref']          == 'taxon:2697049', f"Metadata error: 'db_xref' != 'taxon:2697049'\n{row}"

        collected_by = row['collected_by']
        assert collected_by in address_map, f"error: institution '{collected_by}' not found in address_map"
        assert collected_by in authors_map, f"error: institution '{collected_by}' not found in authors_map"

      # PHA4GE/INSDC controlled vocabulary for source information
      # from "Vocabulary" tab of this sheet:
      #   https://github.com/pha4ge/SARS-CoV-2-Contextual-Data-Specification/blob/master/PHA4GE%20SARS-CoV-2%20Contextual%20Data%20Template.xlsx
      gisaid_specimen_source = "unknown"
      if isolation_source == 'clinical':
        gisaid_specimen_source = row.get("body_product",row.get("anatomical_material",row.get("anatomical_part","missing")))
      if isolation_source == 'environmental':
        gisaid_specimen_source = row.get("environmental_material",row.get("environmental_site","missing"))

      writer.writerow({
        'covv_virus_name'     : 'hCoV-19/' +row['Sequence_ID'],
        'covv_collection_date': row['collection_date'],
        'covv_location'       : '~{continent} / ' + row['country'].replace(':',' /'),

        'covv_type'           : 'betacoronavirus',
        'covv_passage'        : 'Original',
        'covv_host'           : 'Human' if isolation_source == 'clinical' else isolation_source.replace("environmental","Environment"),
        'covv_add_host_info'  : 'unknown',
        'covv_gender'         : 'unknown',
        'covv_patient_age'    : 'unknown',
        'covv_patient_status' : 'unknown',
        'covv_specimen'       : gisaid_specimen_source.capitalize(), # capitalization of the first word seems to be the norm for GISAID

        'covv_assembly_method': sample_to_cmt[row['Sequence_ID']]['Assembly Method'],
        'covv_coverage'       : sample_to_cmt[row['Sequence_ID']]['Coverage'],
        'covv_seq_technology' : sample_to_cmt[row['Sequence_ID']]['Sequencing Technology'],

        'covv_orig_lab'       : row['collected_by'],
        'covv_subm_lab'       : "~{submitting_lab_name}",
        'covv_authors'        : authors_map.get(row['collected_by'], 'REQUIRED'),
        'covv_orig_lab_addr'  : address_map.get(row['collected_by'], 'REQUIRED'),
        'covv_subm_lab_addr'  : address_map.get("~{submitting_lab_name}", 'REQUIRED'),

        'submitter'           : "~{default="REQUIRED" username}",
        'fn'                  : "~{default="REQUIRED" fasta_filename}",

        'covv_sampling_strategy'  : row.get('note',''),
      })

CODE

Outputs

Name Type Expression
meta_csv File "~{out_name}"

Runtime

Key Value
docker "python:slim"
memory "1 GB"
cpu 1
dx_instance_type "mem1_ssd1_v2_x2"
maxRetries 2

TASKS lookup_table_by_filename

Inputs

Name Type Description Default
id String - -
mapping_tsv File - -
2 optional inputs with default values

Command

set -e -o pipefail
grep ^"~{id}" ~{mapping_tsv} | cut -f ~{return_col} > OUTVAL

Outputs

Name Type Expression
value String read_string("OUTVAL")

Runtime

Key Value
docker docker
memory "1 GB"
cpu 1
dx_instance_type "mem1_ssd1_v2_x2"
maxRetries 2

TASKS sra_meta_prep

Prepare tables for submission to NCBI's SRA database. This only works on bam files produced by illumina.py illumina_demux --append_run_id in viral-core.
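
A minimal sketch (import path assumed) of a workflow call; the platform, instrument_model, and title values are illustrative and should match your actual run.

version 1.0
import "tasks_ncbi.wdl" as ncbi   # assumed relative path to this tasks file

workflow sra_meta_prep_example {
  input {
    Array[File] cleaned_bams       # named <libraryname>.<flowcell>.<lane>.cleaned.bam
    File        biosample_map      # tsv with 'accession' and 'sample_name' columns
    Array[File] library_metadata   # per-run library metadata tsv(s)
  }
  call ncbi.sra_meta_prep {
    input:
      cleaned_bam_filepaths = cleaned_bams,
      biosample_map         = biosample_map,
      library_metadata      = library_metadata,
      platform              = "ILLUMINA",                          # illustrative
      instrument_model      = "Illumina NovaSeq 6000",             # illustrative
      title                 = "Metagenomic RNA-seq of SARS-CoV-2", # illustrative
      paired                = true
  }
  output {
    File sra_metadata_tsv = sra_meta_prep.sra_metadata
  }
}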

Inputs

Name Type Description Default
cleaned_bam_filepaths Array[File] Unaligned bam files containing cleaned (submittable) reads (*.bam). Declared with localization_optional and stream hints. -
biosample_map File Tab text file with a header and at least two columns named accession and sample_name. 'accession' maps to the BioSample accession number. Any samples without an accession will be omitted from output. 'sample_name' maps to the internal lab sample name used in filenames, samplesheets, and library_metadata files. -
library_metadata Array[File] Tab text file with a header and at least six columns (sample, library_id_per_sample, library_strategy, library_source, library_selection, design_description). See 3rd tab of https://www.ncbi.nlm.nih.gov/core/assets/sra/files/SRA_metadata_acc_example.xlsx for controlled vocabulary and term definition. -
platform String Sequencing platform (one of _LS454, ABI_SOLID, BGISEQ, CAPILLARY, COMPLETE_GENOMICS, HELICOS, ILLUMINA, ION_TORRENT, OXFORD_NANOPORE, PACBIO_SMRT). -
instrument_model String Sequencing instrument model (examples for platform=ILLUMINA: HiSeq X Five, HiSeq X Ten, Illumina Genome Analyzer, Illumina Genome Analyzer II, Illumina Genome Analyzer IIx, Illumina HiScanSQ, Illumina HiSeq 1000, Illumina HiSeq 1500, Illumina HiSeq 2000, Illumina HiSeq 2500, Illumina HiSeq 3000, Illumina HiSeq 4000, Illumina iSeq 100, Illumina NovaSeq 6000, Illumina MiniSeq, Illumina MiSeq, NextSeq 500, NextSeq 550). -
title String Descriptive sentence of the form <method> of <organism>, e.g. Metagenomic RNA-seq of SARS-CoV-2. -
paired Boolean - -
2 optional inputs with default values

Command

python3 << CODE
import os.path
import csv
import util.file

# WDL arrays to python arrays
bam_uris = list(x for x in '~{sep="*" cleaned_bam_filepaths}'.split('*') if x)
library_metadata = list(x for x in '~{sep="*" library_metadata}'.split('*') if x)

# lookup table files to dicts
lib_to_bams = {}
sample_to_biosample = {}
for bam in bam_uris:
  # filename must be <libraryname>.<flowcell>.<lane>.cleaned.bam or <libraryname>.<flowcell>.<lane>.bam
  bam_base = os.path.basename(bam)
  bam_parts = bam_base.split('.')
  assert bam_parts[-1] == 'bam', "filename does not end in .bam -- {}".format(bam) 
  bam_parts = bam_parts[:-1]
  if bam_parts[-1] == 'cleaned':
    bam_parts = bam_parts[:-1]
  assert len(bam_parts) >= 3, "filename does not conform to <libraryname>.<flowcell>.<lane>.cleaned.bam -- {}".format(bam_base)
  lib = '.'.join(bam_parts[:-2]) # drop flowcell and lane
  lib_to_bams.setdefault(lib, [])
  lib_to_bams[lib].append(bam_base)
  print("debug: registering lib={} bam={}".format(lib, bam_base))
with open('~{biosample_map}', 'rt') as inf:
  for row in csv.DictReader(inf, delimiter='\t'):
    sample_to_biosample[row['sample_name']] = row['accession']

# set up SRA metadata table
outrows = []
out_headers = ['biosample_accession', 'library_ID', 'title', 'library_strategy', 'library_source', 'library_selection', 'library_layout', 'platform', 'instrument_model', 'design_description', 'filetype', 'assembly', 'filename']

# iterate through library_metadata entries and produce an output row for each entry
libs_written = set()
for libfile in library_metadata:
  with open(libfile, 'rt') as inf:
    for row in csv.DictReader(inf, delimiter='\t'):
      lib = util.file.string_to_file_name("{}.l{}".format(row['sample'], row['library_id_per_sample']))
      biosample = sample_to_biosample.get(row['sample'],'')
      bams = lib_to_bams.get(lib,[])
      print("debug: sample={} lib={} biosample={}, bams={}".format(row['sample'], lib, biosample, bams))
      if biosample and bams and lib not in libs_written:
        libs_written.add(lib)
        outrows.append({
          'biosample_accession': sample_to_biosample[row['sample']],
          'library_ID': lib,
          'title': "~{title}",
          'library_strategy': row.get('library_strategy',''),
          'library_source': row.get('library_source',''),
          'library_selection': row.get('library_selection',''),
          'library_layout': '~{true="paired" false="single" paired}',
          'platform': '~{platform}',
          'instrument_model': '~{instrument_model}',
          'design_description': row.get('design_description',''),
          'filetype': 'bam',
          'assembly': 'unaligned',
          'files': lib_to_bams[lib],
        })
assert outrows, "failed to prepare any metadata -- output is empty!"

# find library with the most files and add col headers
n_cols = max(len(row['files']) for row in outrows)
for i in range(n_cols-1):
  out_headers.append('filename{}'.format(i+2))

# write output file
with open('~{out_name}', 'wt') as outf:
  outf.write('\t'.join(out_headers)+'\n')
  for row in outrows:
    row['filename'] = row['files'][0]
    for i in range(len(row['files'])):
      row['filename{}'.format(i+1)] = row['files'][i]
    outf.write('\t'.join(row.get(h,'') for h in out_headers)+'\n')
CODE

Outputs

Name Type Expression
sra_metadata File "~{out_name}"
cleaned_bam_uris File write_lines(cleaned_bam_filepaths)

Runtime

Key Value
docker docker
memory "1 GB"
cpu 1
disks "local-disk " + disk_size + " HDD"
disk disk_size + " GB"
dx_instance_type "mem1_ssd1_v2_x2"
maxRetries 2

TASKS biosample_to_table

Reformats a BioSample registration attributes table (attributes.tsv) for ingest into a Terra table.
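
A minimal sketch (import path assumed) of reformatting BioSample attributes for a Terra data table.

version 1.0
import "tasks_ncbi.wdl" as ncbi   # assumed relative path to this tasks file

workflow biosample_to_table_example {
  input {
    File        biosample_attributes_tsv
    Array[File] raw_bams
    File        demux_meta_json
  }
  call ncbi.biosample_to_table {
    input:
      biosample_attributes_tsv = biosample_attributes_tsv,
      raw_bam_filepaths        = raw_bams,
      demux_meta_json          = demux_meta_json
  }
  output {
    File terra_entities_tsv = biosample_to_table.sample_meta_tsv
  }
}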

Inputs

Name Type Description Default
biosample_attributes_tsv File - -
raw_bam_filepaths Array[File] Unaligned bam files containing raw reads (*.bam). Declared with localization_optional and stream hints. -
demux_meta_json File - -
2 optional inputs with default values

Command

set -ex -o pipefail
python3 << CODE
import os.path
import csv
import json

# load demux metadata
with open("~{demux_meta_json}", 'rt') as inf:
  demux_meta_by_file = json.load(inf)

# load list of bams surviving filters
bam_fnames = list(os.path.basename(x) for x in '~{sep="*" raw_bam_filepaths}'.split('*'))
bam_fnames = list(x[:-len('.bam')] if x.endswith('.bam') else x for x in bam_fnames)
bam_fnames = list(x[:-len('.cleaned')] if x.endswith('.cleaned') else x for x in bam_fnames)
print("bam basenames ({}): {}".format(len(bam_fnames), bam_fnames))
sample_to_sanitized = {demux_meta_by_file.get(x, {}).get('sample_original'): demux_meta_by_file.get(x, {}).get('sample') for x in bam_fnames}
if None in sample_to_sanitized:
  del sample_to_sanitized[None]
sample_names_seen = sample_to_sanitized.keys()
print("samples seen ({}): {}".format(len(sample_names_seen), sorted(sample_names_seen)))

# load biosample metadata
biosample_attributes = []
biosample_headers = ['biosample_accession']
with open('~{biosample_attributes_tsv}', 'rt') as inf:
  for row in csv.DictReader(inf, delimiter='\t'):
    if row['sample_name'] in sample_names_seen and row['message'] == "Successfully loaded":
      row['biosample_accession'] = row.get('accession')
      row = dict({k:v for k,v in row.items() if v.strip().lower() not in ('missing', 'na', 'not applicable', 'not collected', '')})
      for k,v in row.items():
        if v and (k not in biosample_headers) and k not in ('message', 'accession'):
          biosample_headers.append(k)
      row['biosample_json'] = json.dumps({k: v for k,v in row.items() if k in biosample_headers})
      biosample_attributes.append(row)
biosample_headers.append('biosample_json')

print("biosample headers ({}): {}".format(len(biosample_headers), biosample_headers))
print("biosample output rows ({})".format(len(biosample_attributes)))
samples_seen_without_biosample = set(sample_names_seen) - set(row['sample_name'] for row in biosample_attributes)
print("samples seen in bams without biosample entries ({}): {}".format(len(samples_seen_without_biosample), sorted(samples_seen_without_biosample)))

# write reformatted table
with open('~{base}.entities.tsv', 'w', newline='') as outf:
  writer = csv.DictWriter(outf, delimiter='\t', fieldnames=["~{sanitized_id_col}"]+biosample_headers, dialect=csv.unix_dialect, quoting=csv.QUOTE_MINIMAL)
  writer.writeheader()
  for row in biosample_attributes:
    outrow = {h: row.get(h, '') for h in biosample_headers}
    outrow["~{sanitized_id_col}"] = sample_to_sanitized[row['sample_name']]
    writer.writerow(outrow)
CODE

Outputs

Name Type Expression
sample_meta_tsv File "~{base}.entities.tsv"

Runtime

Key Value
docker docker
memory "1 GB"
cpu 1
dx_instance_type "mem1_ssd1_v2_x2"
maxRetries 2

TASKS biosample_to_genbank

Prepares two input metadata files for Genbank submission from a BioSample registration attributes table (attributes.tsv), which already contains all of the necessary values. This produces both a Genbank Source Modifier Table and a BioSample ID map file that can be fed into the prepare_genbank task.
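
A minimal sketch (import path assumed; the taxid shown is illustrative) of producing the source modifier table and sample ID list consumed downstream.

version 1.0
import "tasks_ncbi.wdl" as ncbi   # assumed relative path to this tasks file

workflow biosample_to_genbank_example {
  input {
    File biosample_attributes_tsv   # attributes.tsv from BioSample registration
  }
  call ncbi.biosample_to_genbank {
    input:
      biosample_attributes = biosample_attributes_tsv,
      taxid                = 2697049   # illustrative: SARS-CoV-2
  }
  output {
    File src_table = biosample_to_genbank.genbank_source_modifier_table
    File id_list   = biosample_to_genbank.sample_ids
  }
}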

Inputs

Name Type Description Default
biosample_attributes File - -
taxid Int - -
filter_to_ids File? - -
filter_to_accession String? - -
organism_name_override String? - -
sequence_id_override String? - -
isolate_prefix_override String? - -
source_overrides_json File? - -
6 optional inputs with default values

Command

set -e
python3<<CODE
import csv
import json
import re
import os.path

header_key_map = {
    'Sequence_ID':'~{biosample_col_for_fasta_headers}',
    'BioProject':'bioproject_accession',
    'BioSample':'accession',
}
with open("~{write_json(src_to_attr_map)}", 'rt') as inf:
    more_header_key_map = json.load(inf)
for k,v in more_header_key_map.items():
    header_key_map[k] = v
print("header_key_map: {}".format(header_key_map))

out_headers_total = ['Sequence_ID', 'isolate', 'collection_date', 'geo_loc_name', 'collected_by', 'isolation_source', 'organism', 'host', 'note', 'db_xref', 'BioProject', 'BioSample']
samples_to_filter_to = set()
if "~{default="" filter_to_ids}":
    with open("~{filter_to_ids}", 'rt') as inf:
        samples_to_filter_to = set(line.strip() for line in inf)
        print("filtering to samples: {}".format(samples_to_filter_to))
only_accession = "~{default="" filter_to_accession}" if "~{default="" filter_to_accession}" else None
if only_accession:
    print("filtering to biosample: {}".format(only_accession))

# read entire tsv -> biosample_attributes, filtered to only the entries we keep
with open("~{biosample_attributes}", 'rt') as inf_biosample:
  biosample_attributes_reader = csv.DictReader(inf_biosample, delimiter='\t')
  in_headers = biosample_attributes_reader.fieldnames
  if 'accession' not in in_headers:
    assert 'biosample_accession' in in_headers, "no accession column found in ~{biosample_attributes}"
    header_key_map['BioSample'] = 'biosample_accession'
  biosample_attributes = list(row for row in biosample_attributes_reader
    if row.get('message', 'Success').startswith('Success')
    and (not only_accession or row[header_key_map['BioSample']] == only_accession)
    and (not samples_to_filter_to or row[header_key_map['Sequence_ID']] in samples_to_filter_to))
  print("filtered to {} samples".format(len(biosample_attributes)))

# clean out some na values
for row in biosample_attributes:
  for k in ('strain', 'isolate', 'host', 'geo_loc_name', 'collection_date', 'serotype', 'food_origin'):
    if row.get(k, '').lower().strip() in ('na', 'not applicable', 'missing', 'not collected', 'not available', ''):
        row[k] = ''

# override organism_name if provided (this allows us to submit Genbank assemblies for
# specific species even though the metagenomic BioSample may have been registered with a different
# species or none at all)
if "~{default="" organism_name_override}":
    for row in biosample_attributes:
        row['organism'] = "~{default="" organism_name_override}"

# handle special submission types: flu, sc2, noro, dengue
special_bugs = ('Influenza A virus', 'Influenza B virus', 'Influenza C virus',
                'Severe acute respiratory syndrome coronavirus 2',
                'Norovirus', 'Dengue virus')
for special in special_bugs:
    # sanitize organism name if it's a special one
    for row in biosample_attributes:
      if row['organism'].startswith(special):
          row['organism'] = special

    # enforce that special submissions are all the same special thing
    if any(row['organism'] == special for row in biosample_attributes):
      print("special organism found " + special)
      assert all(row['organism'] == special for row in biosample_attributes), "if any samples are {}, all samples must be {}".format(special, special)
      ### Influenza-specific requirements
      if special.startswith('Influenza'):
        print("special organism is Influenza A/B/C")
        # simplify isolate name
        if row.get('strain'):
          header_key_map['isolate'] = 'strain'
        if 'serotype' not in out_headers_total:
          out_headers_total.append('serotype')
        for row in biosample_attributes:
          # simplify isolate name more if it still looks structured with metadata (not allowed for flu submissions)
          if len(row[header_key_map.get('isolate', 'isolate')].split('/')) >= 2:
            row[header_key_map.get('isolate', 'isolate')] = row[header_key_map.get('isolate', 'isolate')].split('/')[-2]
          # populate serotype from name parsing
          match = re.search(r'\(([^()]+)\)+$', row['sample_name'])
          if match:
              row['serotype'] = match.group(1)
              print("found serotype {}". format(row['serotype']))
          # populate host field from name parsing if empty, override milk
          if not row.get('host','').strip():
            match = re.search(r'[^/]+/([^/]+)/[^/]+/[^/]+/[^/]+', row['sample_name'])
            if match:
                row['host'] = match.group(1)
                if row['host'] == 'bovine_milk':
                  row['host'] = 'Cattle'
                assert 'host' in out_headers_total
          # override geo_loc_name if food_origin exists
          if row.get('food_origin','').strip():
              print("overriding geo_loc_name with food_origin")
              row['geo_loc_name'] = row['food_origin']

# prepare output headers
out_headers = list(h for h in out_headers_total if header_key_map.get(h,h) in in_headers)
if 'db_xref' not in out_headers:
    out_headers.append('db_xref')
if 'note' not in out_headers:
    out_headers.append('note')
if 'serotype' in out_headers_total and 'serotype' not in out_headers:
    out_headers.append('serotype')
if 'strain' not in out_headers and any(row['organism'].startswith('Influenza') for row in biosample_attributes):
    out_headers.append('strain')

# manual overrides if provided
if os.path.isfile("~{source_overrides_json}"):
  with open("~{source_overrides_json}", 'rt') as inf:
    for k,v in json.load(inf).items():
      print("overriding {} with {}".format(k,v))
      if k not in out_headers:
        out_headers.append(k)
      for row in biosample_attributes:
        row[k] = v

# write output source modifier table
with open("~{out_basename}.src", 'wt') as outf_smt:
  outf_smt.write('\t'.join(out_headers)+'\n')

  with open("~{out_basename}.sample_ids.txt", 'wt') as outf_ids:
      for row in biosample_attributes:
        # populate output row as a dict
        outrow = dict((h, row.get(header_key_map.get(h,h), '')) for h in out_headers)

        ### Taxon-specific genome naming rules go here
        if outrow['organism'].startswith('Influenza'):
          ### Influenza-specific isolate naming was handled above already and is required to be metadata-free
          # FTP submission pathway requires us to do the naming via the strain field
          type = outrow['organism'].split()[1] # A, B, C, D, etc
          state = outrow['geo_loc_name'].split(':')[1].strip() if ':' in outrow['geo_loc_name'] else outrow['geo_loc_name']
          year = outrow['collection_date'].split('-')[0]
          outrow['strain'] = '/'.join([type, state, outrow['isolate'], year])
          print("new strain name: {}".format(outrow['strain']))
        elif outrow['organism'].startswith('Special taxon with special naming rules'):
          ### Example special case here
          pass
        else:
          ### Most viral taxa can follow this generic model containing structured metadata
          # <short organism name>/<host lowercase>/Country/ST-Institution-LabID/Year
          # e.g. RSV-A/human/USA/MA-Broad-1234/2020
          # e.g. SARS-CoV-2/human/USA/MA-Broad-1234/2020
          # assume that the current isolate name has the structure:
          # <something wrong to be overwritten>/<host>/<country>/<state>-<institution>-<labid>/<year>
          print("old isolate name: {}".format(outrow['isolate']))
          year = outrow['collection_date'].split('-')[0]
          country = outrow['geo_loc_name'].split(':')[0]
          host = outrow['host'].lower()

          # host name mapping to the informal names used by NCBI in sequence titles for certain species
          host_to_informal_name_map = {
            'homo sapiens': 'human',
            'sus scrofa domesticus': 'swine',
            'sus scrofa': 'swine'
          }
          host = host_to_informal_name_map.get(host, host)

          if len(outrow['isolate'].split('/')) >= 2:
            state_inst_labid = outrow['isolate'].split('/')[-2]
          else:
            state_inst_labid = outrow['Sequence_ID']
          name_prefix = outrow['organism'].split('(')[0].split('/')[0].strip()
          if name_prefix.lower().endswith('refseq'):
            name_prefix = name_prefix[:-6].strip()
          if "~{default="" isolate_prefix_override}".strip():
            name_prefix = "~{default="" isolate_prefix_override}".strip()
          outrow['isolate'] = '/'.join([name_prefix, host, country, state_inst_labid, year])
          print("new isolate name: {}".format(outrow['isolate']))

        # some fields are not allowed to be empty
        if not outrow.get('geo_loc_name'):
            outrow['geo_loc_name'] = 'missing'
        if not outrow.get('host'):
            outrow['host'] = 'not applicable'

        # custom db_xref/taxon
        outrow['db_xref'] = "taxon:{}".format(~{taxid})

        # load the purpose of sequencing (or if not, the purpose of sampling) in the note field
        outrow['note'] = row.get('purpose_of_sequencing', row.get('purpose_of_sampling', ''))

        # overwrite sequence ID if requested
        if "~{default="" sequence_id_override}":
            outrow['Sequence_ID'] = "~{default="" sequence_id_override}"

        # sanitize sequence IDs to match fasta headers
        if "~{sanitize_seq_ids}".lower() == 'true':
            outrow['Sequence_ID'] = re.sub(r'[^0-9A-Za-z!_-]', '-', outrow['Sequence_ID'])

        # write isolate name for this sample (it will overwrite and only keep the last one if there are many)
        with open("isolate_name.str", 'wt') as outf_isolate:
            print("writing isolate name: {}".format(outrow['isolate']))
            outf_isolate.write(outrow['isolate']+'\n')

        # write entry for this sample
        sample_name = outrow['Sequence_ID']
        if ~{num_segments}>1:
            for i in range(~{num_segments}):
                outrow['Sequence_ID'] = "{}-{}".format(sample_name, i+1)
                outf_smt.write('\t'.join(outrow[h] for h in out_headers)+'\n')
                outf_ids.write(outrow['Sequence_ID']+'\n')
        else:
            outf_smt.write('\t'.join(outrow[h] for h in out_headers)+'\n')
            outf_ids.write(outrow['Sequence_ID']+'\n')
CODE

Outputs

Name Type Expression
genbank_source_modifier_table File "~{out_basename}.src"
sample_ids File "~{out_basename}.sample_ids.txt"
isolate_name String read_string("isolate_name.str")

Runtime

Key Value
docker docker
memory "1 GB"
cpu 1
dx_instance_type "mem1_ssd1_v2_x2"
maxRetries 2

TASKS generate_author_sbt_file

Generate an NCBI-compatible author sbt file for submission of sequence data to GenBank. Accepts an author string, a defaults yaml file, and a jinja2-format template. Output is comparable to what is generated by http://www.ncbi.nlm.nih.gov/WebSub/template.cgi
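
A minimal sketch (import path assumed; the author string and gs:// URIs are illustrative) of rendering an authors sbt file.

version 1.0
import "tasks_ncbi.wdl" as ncbi   # assumed relative path to this tasks file

workflow author_sbt_example {
  input {
    File  sbt_template   # e.g. gs://pathogen-public-dbs/other-related/author_template.sbt.j2
    File? sbt_defaults   # e.g. gs://pathogen-public-dbs/other-related/default_sbt_values.yaml
  }
  call ncbi.generate_author_sbt_file {
    input:
      author_list   = "Lastname,Firstname, Last-hyphenated,First,M., Last,F.",  # illustrative
      j2_template   = sbt_template,
      defaults_yaml = sbt_defaults
  }
  output {
    File authors_sbt = generate_author_sbt_file.sbt_file
  }
}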

Inputs

Name Type Description Default
author_list String? A string containing a space-delimited list of author surnames, each followed by a first name and (optional) middle initial. Ex. 'Lastname,Firstname, Last-hyphenated,First,M., Last,F.' -
j2_template File An sbt file, optionally containing Jinja2 variables to be filled in based on values present in defaults_yaml, if provided. If no yaml is provided, this file is passed through verbatim. Example: gs://pathogen-public-dbs/other-related/author_template.sbt.j2 -
defaults_yaml File? A YAML file with default values to use for the submitter, submitter affiliation, and author affiliation. It may optionally include authors to add at the start and end of the author_list. Example: gs://pathogen-public-dbs/other-related/default_sbt_values.yaml -
2 optional inputs with default values

Command

set -e

# blank yaml file to be used if the optional input is not specified
touch blank.yml

python3 << CODE
# generates an sbt file of the format returned by:
# http://www.ncbi.nlm.nih.gov/WebSub/template.cgi
import re
import shutil
# external dependencies
import yaml # pyyaml
from jinja2 import Template #jinja2

def render_sbt(author_string, defaults_yaml=None, sbt_out_path="authors.sbt", j2_template="author_template.sbt.j2"):
    # simple version for only initials: #author_re=re.compile(r"\s?(?P<lastname>[\w\'\-\ ]+),(?P<initials>(?:[A-Z]\.){1,3})")
    author_re=re.compile(r"\s?(?P<lastname>[\w\'\-\ ]+),((?P<first>\w[\w\'\-\ ]+\.?),?|(?P<initials>(?:[A-Z]\.)+))(?P<initials_ext>(?:[A-Z]\.)*)")

    authors=[]
    defaults_data_last_authors=[]
    defaults_data = {}

    authors_affil = None
    submitter     = None
    bioproject    = None
    title         = None
    citation      = None

    if defaults_yaml is not None:
        with open(defaults_yaml) as defaults_yaml:
            defaults_data = yaml.load(defaults_yaml, Loader=yaml.FullLoader)

            if defaults_data is not None:
                submitter     = defaults_data.get("submitter")
                bioproject    = defaults_data.get("bioproject")
                title         = defaults_data.get("title")
                citation      = defaults_data.get("citation")
                authors_affil = defaults_data.get("authors_affil")
                
                # authors listed in the YAML defaults go at the start of the author list;
                # "authors_last" entries are appended after the parsed author_list below
                defaults_data_authors = defaults_data.get("authors_start",[])
                for author in defaults_data_authors:
                    authors.append(author)

                defaults_data_last_authors = defaults_data.get("authors_last",[])
    
    for author_match in author_re.finditer(author_string):
        author = {}
        lastname=author_match.group("lastname")
        initials=[]
        if author_match.group("initials"):
            initials.extend(list(filter(None,author_match.group("initials").split("."))))
        if author_match.group("initials_ext"):
            initials.extend(list(filter(None,author_match.group("initials_ext").split("."))))

        first=""
        if author_match.group("first"):
            first=author_match.group("first")
        else:
            first=initials[0]+"."
        author["last"]     = author_match.group("lastname")
        author["first"]    = first
        author["initials"]   = ".".join(initials[1:]) if not author_match.group("first") else ".".join(initials)
        author["initials"]   = author["initials"]+"." if len(author["initials"])>0 else author["initials"]
        
        if author not in authors: # could use less exact match
            authors.append(author)

    for author in defaults_data_last_authors:
        if author not in authors:
            authors.append(author)

    jinja_rendering_kwargs={}
    if authors_affil is not None:
        jinja_rendering_kwargs["authors_affil"]=authors_affil
    if title is not None:
        jinja_rendering_kwargs["title"]=title
    if submitter is not None:
        jinja_rendering_kwargs["submitter"]=submitter
    if citation is not None:
        jinja_rendering_kwargs["citation"]=citation
    if bioproject is not None:
        jinja_rendering_kwargs["bioproject"]=bioproject

    if len(authors) >= 1 or len(jinja_rendering_kwargs) >= 1:
        with open(j2_template) as sbt_template:
            template = Template(sbt_template.read())

        rendered = template.render( authors=authors, 
                                    **jinja_rendering_kwargs)
    
        #print(rendered)
        with open(sbt_out_path,"w") as sbt_out:
            sbt_out.write(rendered)
    else:
        # if no authors were specified, simply copy the template to the output
        shutil.copyfile(j2_template, sbt_out_path)

render_sbt("~{author_list}", defaults_yaml="~{default="blank.yml" defaults_yaml}", sbt_out_path="~{out_base}.sbt", j2_template="~{j2_template}")
CODE

Outputs

Name Type Expression
sbt_file File "~{out_base}.sbt"

Runtime

Key Value
docker docker
memory "1 GB"
cpu 1
dx_instance_type "mem1_ssd1_v2_x2"
maxRetries 2

TASKS table2asn

This task runs NCBI's table2asn, the artist formerly known as tbl2asn
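
A minimal sketch (import path assumed; the organism value is illustrative) showing how the feature table, authors sbt, source modifiers, and structured comments typically feed into table2asn.

version 1.0
import "tasks_ncbi.wdl" as ncbi   # assumed relative path to this tasks file

workflow table2asn_example {
  input {
    File assembly_fasta
    File annotations_tbl       # e.g. feature_tbl from align_and_annot_transfer_single or vadr
    File authors_sbt
    File source_modifiers      # e.g. from biosample_to_genbank
    File structured_comments   # e.g. from structured_comments_from_aligned_bam
  }
  call ncbi.table2asn {
    input:
      assembly_fasta          = assembly_fasta,
      annotations_tbl         = annotations_tbl,
      authors_sbt             = authors_sbt,
      source_modifier_table   = source_modifiers,
      structured_comment_file = structured_comments,
      organism                = "Severe acute respiratory syndrome coronavirus 2"   # illustrative
  }
  output {
    File    submission_sqn = table2asn.genbank_submission_sqn
    Boolean passing        = table2asn.table2asn_passing
  }
}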

Inputs

Name Type Description Default
assembly_fasta File Assembled genome. All chromosomes/segments in one file. -
annotations_tbl File Gene annotations in TBL format corresponding to assembly_fasta. These files are typically output from the ncbi.annot_transfer task. -
authors_sbt File A genbank submission template file (SBT) with the author list, created at https://submit.ncbi.nlm.nih.gov/genbank/template/submission/ -
source_modifier_table File? A tab-delimited text file containing requisite metadata for Genbank (a 'source modifier table'). https://www.ncbi.nlm.nih.gov/WebSub/html/help/genbank-source-table.html -
structured_comment_file File? Structured comment file (*.cmt). -
comment String? Optional comments that can be displayed in the COMMENT section of the Genbank record. This may include any disclaimers about assembly quality or notes about pre-publication availability or requests to discuss pre-publication use with authors. -
organism String The scientific name for the organism being submitted. This is typically the species name and should match the name given by the NCBI Taxonomy database. For more info, see: https://www.ncbi.nlm.nih.gov/Sequin/sequin.hlp.html#Organism -
5 optional inputs with default values

Command

set -ex -o pipefail
table2asn -version | cut -f 2 -d ' ' > TABLE2ASN_VERSION
cp "~{assembly_fasta}" "~{out_basename}.fsa" # input fasta must be in CWD so output files end up next to it
touch "~{out_basename}.val"  # this file isn't produced if no errors/warnings

table2asn \
  -t "~{authors_sbt}" \
  -i "~{out_basename}.fsa" \
  -f "~{annotations_tbl}" \
  ~{'-w "' + structured_comment_file + '"'} \
  -j '[gcode=~{genetic_code}][moltype=~{mol_type}][organism=~{organism}]' \
  ~{'-src-file "' + source_modifier_table + '"'} \
  ~{'-y "' + comment + '"'} \
  -a s -V vb

set +x
echo "table2asn complete"
cat "~{out_basename}.val" | { grep -vi '^Info:' || test $? = 1; } | tee "~{out_basename}.val.no_info"
cat "~{out_basename}.val.no_info" | { grep -vi '^Warning: valid' || test $? = 1; } | tee "~{out_basename}.val.errors_only"

Outputs

Name Type Expression
genbank_submission_sqn File "~{out_basename}.sqn"
genbank_preview_file File "~{out_basename}.gbf"
genbank_validation_file File "~{out_basename}.val"
table2asn_errors Array[String] read_lines("~{out_basename}.val")
table2asn_version String read_string("TABLE2ASN_VERSION")
table2asn_passing Boolean length(read_lines("~{out_basename}.val.errors_only")) == 0

Runtime

Key Value
docker docker
memory machine_mem_gb + " GB"
cpu 2
dx_instance_type "mem1_ssd1_v2_x2"
maxRetries 2
disks "local-disk " + disk_size + " HDD"
disk disk_size + " GB"

TASKS package_special_genbank_ftp_submission

Prepares a zip and xml file for FTP-based NCBI Genbank submission according to instructions at https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/genbank/SARS-CoV-2/.
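
A minimal sketch (import path assumed; the submission name, UID, namespace, and account are placeholders) of packaging a special FTP submission.

version 1.0
import "tasks_ncbi.wdl" as ncbi   # assumed relative path to this tasks file

workflow ftp_package_example {
  input {
    File combined_fasta
    File structured_comments
    File source_modifiers
    File authors_sbt
  }
  call ncbi.package_special_genbank_ftp_submission {
    input:
      sequences_fasta          = combined_fasta,
      structured_comment_table = structured_comments,
      source_modifier_table    = source_modifiers,
      author_template_sbt      = authors_sbt,
      submission_name          = "SARS-CoV-2 batch 2024-01",   # placeholder
      submission_uid           = "mylab-sc2-20240101",         # placeholder; should be unique per submission
      spuid_namespace          = "mylab",                      # placeholder
      account_name             = "my-ncbi-ftp-account"         # placeholder
  }
  output {
    File zip = package_special_genbank_ftp_submission.submission_zip
    File xml = package_special_genbank_ftp_submission.submission_xml
  }
}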

Inputs

Name Type Description Default
sequences_fasta File - -
structured_comment_table File - -
source_modifier_table File - -
author_template_sbt File - -
submission_name String - -
submission_uid String - -
spuid_namespace String - -
account_name String - -
2 optional inputs with default values

Command

set -e

# make the submission zip file
cp "~{sequences_fasta}" sequence.fsa
cp "~{structured_comment_table}" comment.cmt
cp "~{source_modifier_table}" source.src
cp "~{author_template_sbt}" template.sbt
zip "~{submission_uid}.zip" sequence.fsa comment.cmt source.src template.sbt

# make the submission xml file
SUB_NAME="~{submission_name}"
ACCT_NAME="~{account_name}"
SPUID="~{submission_uid}"
cat << EOF > submission.xml
<?xml version="1.0"?>
<Submission>
  <Description>
    <Comment>$SUB_NAME</Comment>
    <Organization type="center" role="owner">
      <Name>$ACCT_NAME</Name>
    </Organization>
  </Description>
  <Action>
    <AddFiles target_db="GenBank">
      <File file_path="$SPUID.zip">
        <DataType>genbank-submission-package</DataType>
      </File>
      <Attribute name="wizard">~{wizard}</Attribute>
      <Identifier>
        <SPUID spuid_namespace="~{spuid_namespace}">$SPUID</SPUID>
      </Identifier>
    </AddFiles>
  </Action>
</Submission>
EOF

# make the (empty) ready file
touch submit.ready

Outputs

Name Type Expression
submission_zip File "~{submission_uid}.zip"
submission_xml File "submission.xml"
submit_ready File "submit.ready"

Runtime

Key Value
docker docker
memory "1 GB"
cpu 1
dx_instance_type "mem1_ssd1_v2_x2"
maxRetries 2

TASKS genbank_special_taxa

This task determines whether the given taxid belongs to a special taxon that NCBI treats differently for GenBank submission, and reports the appropriate submission mechanism along with VADR support and related parameters.
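
A minimal sketch (import path assumed) of querying the submission mechanism and VADR support for a taxid; the taxdump tarball and vadr_by_taxid_tsv are provided by the caller.

version 1.0
import "tasks_ncbi.wdl" as ncbi   # assumed relative path to this tasks file

workflow special_taxa_example {
  input {
    Int  taxid
    File taxdump_tgz
    File vadr_by_taxid_tsv
  }
  call ncbi.genbank_special_taxa {
    input:
      taxid             = taxid,
      taxdump_tgz       = taxdump_tgz,
      vadr_by_taxid_tsv = vadr_by_taxid_tsv
  }
  output {
    Boolean use_table2asn = genbank_special_taxa.table2asn_allowed
    String  mechanism     = genbank_special_taxa.genbank_submission_mechanism
    Boolean vadr_ok       = genbank_special_taxa.vadr_supported
  }
}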

Inputs

Name Type Description Default
taxid Int - -
taxdump_tgz File - -
vadr_by_taxid_tsv File - -
1 optional input with default value

Command

set -e

# unpack the taxdump tarball
mkdir -p taxdump
read_utils.py extract_tarball "~{taxdump_tgz}" taxdump

python3 << CODE
import csv
import metagenomics
import tarfile
import urllib.request
import json
import re
import sys
taxid = ~{taxid}

# load taxdb and retrieve full hierarchy leading to this taxid
taxdb = metagenomics.TaxonomyDb(tax_dir="taxdump", load_nodes=True, load_names=True, load_gis=False)
ancestors = taxdb.get_ordered_ancestors(taxid)
this_and_ancestors = [taxid] + ancestors

# Genbank prohibits normal submissions for SC2, Flu A/B/C, Noro, and Dengue
table2asn_prohibited = {
  11320: "Influenza A virus",
  11520: "Influenza B virus",
  11552: "Influenza C virus",
  2697049: "Severe acute respiratory syndrome coronavirus 2",
  11983: "Norovirus",
  3052464: "Dengue virus"
}
prohibited = any(node in table2asn_prohibited for node in this_and_ancestors)
with open("table2asn_allowed.boolean", "wt") as outf:
  outf.write("false" if prohibited else "true")
with open("genbank_submission_mechanism.str", "wt") as outf:
  if any(node == 11320 for node in this_and_ancestors):
    outf.write("InfluenzaA")
  elif any(node == 11520 for node in this_and_ancestors):
    outf.write("InfluenzaB")
  elif any(node == 11552 for node in this_and_ancestors):
    outf.write("InfluenzaC")
  elif any(node == 2697049 for node in this_and_ancestors):
    outf.write("SARS-CoV-2")
  elif any(node == 11983 for node in this_and_ancestors):
    outf.write("Norovirus")
  elif any(node == 3052464 for node in this_and_ancestors):
    outf.write("Dengue")
  else:
    outf.write("table2asn")

# Genbank requires certain fields to be populated for Flu A and Dengue
if any(node == 11320 for node in this_and_ancestors):
  # flu A needs subtype specified in the serotype column
  with open("genbank_source_overrides.json", "wt") as outf:
    # This taxid matches a specific named strain
    match = re.search(r'\(([^()]+)\)+$', taxdb.names[taxid])
    if match:
      subtype = match.group(1)
      print("found Influenza A subtype {}". format(subtype))
      json.dump({'serotype':subtype}, outf)
    else:
      # Influenza A has a number of taxids at the subtype level
      match = re.search(r'(\w+)\s+subtype$', taxdb.names[taxid])
      if match:
        subtype = match.group(1)
        print("found Influenza A subtype {}". format(subtype))
        json.dump({'serotype':subtype}, outf)
      else:
        print("failed to find Influenza A subtype from taxid {}, taxname {}".format(taxid, taxdb.names[taxid]))
        sys.exit(1)
elif any(node == 3052464 for node in this_and_ancestors):
  # dengue needs serotype specified in the genotype column
  with open("genbank_source_overrides.json", "wt") as outf:
    match = re.search(r'(\d)$', taxdb.names[taxid])
    if match:
      serotype = match.group(1)
      print("found Dengue serotype {}". format(serotype))
      json.dump({'genotype':serotype}, outf)
    else:
      print("failed to find Dengue serotype from taxid {}, taxname {}".format(taxid, taxdb.names[taxid]))
      sys.exit(1)

# VADR is an annotation tool that supports SC2, Flu A/B/C/D, Noro, Dengue, RSV A/B, MPXV, etc
# https://github.com/ncbi/vadr/wiki/Available-VADR-model-files
# Note this table includes some taxa that are subtaxa of others and in those cases, it is ORDERED from
# more specific to less specific (e.g. noro before calici, dengue before flavi, sc2 before corona)
# so use the *first* hit in the table.
# tsv header: tax_id, taxon_name, min_seq_len, max_seq_len, vadr_min_ram_gb, vadr_opts, vadr_model_tar_url
with open("~{vadr_by_taxid_tsv}", 'rt') as inf:
  vadr_supported = list(row for row in csv.DictReader(inf, delimiter='\t'))

out_vadr_supported = False
out_vadr_cli_options = ""
out_vadr_model_tar_url = ""
out_min_genome_length = 0
out_max_genome_length = 1000000
out_vadr_taxid = 0
out_vadr_min_ram_gb = 8
out_vadr_model_tar_subdir = ""
for row in vadr_supported:
  if any(node == int(row['tax_id']) for node in this_and_ancestors):
    out_vadr_taxid = int(row['tax_id'])
    out_vadr_supported = True
    out_vadr_cli_options = row['vadr_opts']
    out_vadr_model_tar_url = row['vadr_model_tar_url']
    if row['min_seq_len']:
      out_min_genome_length = int(row['min_seq_len'])
    if row['max_seq_len']:
      out_max_genome_length = int(row['max_seq_len'])
    if row['vadr_min_ram_gb']:
      out_vadr_min_ram_gb = int(row['vadr_min_ram_gb'])
    if row['vadr_model_tar_subdir']:
      out_vadr_model_tar_subdir = row['vadr_model_tar_subdir']
    break
with open("vadr_supported.boolean", "wt") as outf:
  outf.write("true" if out_vadr_supported else "false")
with open("vadr_cli_options.string", "wt") as outf:
  outf.write(out_vadr_cli_options)
with open("min_genome_length.int", "wt") as outf:
  outf.write(str(out_min_genome_length))
with open("max_genome_length.int", "wt") as outf:
  outf.write(str(out_max_genome_length))
with open("vadr_taxid.int", "wt") as outf:
  outf.write(str(out_vadr_taxid))
with open("vadr_min_ram_gb.int", "wt") as outf:
  outf.write(str(out_vadr_min_ram_gb))
with open("vadr_model_tar_subdir.str", "wt") as outf:
  outf.write(out_vadr_model_tar_subdir)

if out_vadr_model_tar_url:
  urllib.request.urlretrieve(out_vadr_model_tar_url, "vadr_model-~{taxid}.tar.gz")

CODE

Outputs

Name Type Expression
table2asn_allowed Boolean read_boolean("table2asn_allowed.boolean")
genbank_submission_mechanism String read_string("genbank_submission_mechanism.str")
genbank_source_overrides_json File? "genbank_source_overrides.json"
vadr_supported Boolean read_boolean("vadr_supported.boolean")
vadr_cli_options String read_string("vadr_cli_options.string")
vadr_model_tar File? "vadr_model-~{taxid}.tar.gz"
vadr_model_tar_subdir String read_string("vadr_model_tar_subdir.str")
vadr_taxid Int read_int("vadr_taxid.int")
vadr_min_ram_gb Int read_int("vadr_min_ram_gb.int")
min_genome_length Int read_int("min_genome_length.int")
max_genome_length Int read_int("max_genome_length.int")

Runtime

Key Value
docker docker
memory "2 GB"
cpu 1
dx_instance_type "mem1_ssd1_v2_x2"
maxRetries 2

TASKS vadr

Runs NCBI's Viral Annotation DefineR for annotation and QC.
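
A minimal sketch (import path assumed) chaining genbank_special_taxa into vadr so that the model tarball, CLI options, and length bounds come from the taxid lookup.

version 1.0
import "tasks_ncbi.wdl" as ncbi   # assumed relative path to this tasks file

workflow vadr_example {
  input {
    File assembly_fasta
    Int  taxid
    File taxdump_tgz
    File vadr_by_taxid_tsv
  }
  call ncbi.genbank_special_taxa {
    input:
      taxid             = taxid,
      taxdump_tgz       = taxdump_tgz,
      vadr_by_taxid_tsv = vadr_by_taxid_tsv
  }
  call ncbi.vadr {
    input:
      genome_fasta          = assembly_fasta,
      vadr_opts             = genbank_special_taxa.vadr_cli_options,
      minlen                = genbank_special_taxa.min_genome_length,
      maxlen                = genbank_special_taxa.max_genome_length,
      vadr_model_tar        = genbank_special_taxa.vadr_model_tar,
      vadr_model_tar_subdir = genbank_special_taxa.vadr_model_tar_subdir
  }
  output {
    File    feature_tbl = vadr.feature_tbl
    Boolean vadr_pass   = vadr.pass
  }
}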

Inputs

Name Type Description Default
genome_fasta File - -
vadr_opts String? - -
minlen Int? - -
maxlen Int? - -
vadr_model_tar File? - -
vadr_model_tar_subdir String? - -
4 optional inputs with default values

Command

set -e
# unpack custom VADR models or use default
if [ -n "~{vadr_model_tar}" ]; then
  mkdir -p vadr-untar
  tar -C vadr-untar -xzvf "~{vadr_model_tar}"
  # just link the model subdirectory, not the outer wrapper
  ln -s vadr-untar/*/ vadr-models
else
  # use default (distributed with docker image) models
  ln -s /opt/vadr/vadr-models vadr-models
fi
if [ -n "~{default="" vadr_model_tar_subdir}" ]; then
  VADR_MODEL_DIR="vadr-models/~{default="" vadr_model_tar_subdir}"
else
  VADR_MODEL_DIR="vadr-models"
fi

# remove terminal ambiguous nucleotides
/opt/vadr/vadr/miniscripts/fasta-trim-terminal-ambigs.pl \
  "~{genome_fasta}" \
  ~{'--minlen ' + minlen} \
  ~{'--maxlen ' + maxlen} \
  > "~{out_basename}.fasta"

# run VADR
v-annotate.pl \
  ~{default="" vadr_opts} \
  --split --cpu `nproc` \
  --mdir "$VADR_MODEL_DIR" \
  "~{out_basename}.fasta" \
  "~{out_basename}"

# package everything for output
tar -C "~{out_basename}" -czvf "~{out_basename}.vadr.tar.gz" .

# get the gene annotations (feature table)
# the vadr.fail.tbl sometimes contains junk / invalid content that needs to be filtered out
cat "~{out_basename}/~{out_basename}.vadr.pass.tbl" \
    "~{out_basename}/~{out_basename}.vadr.fail.tbl" \
   | sed '/Additional note/,$d' \
    > "~{out_basename}.tbl"

# prep alerts into a tsv file for parsing
cat "~{out_basename}/~{out_basename}.vadr.alt.list" | cut -f 5 | tail -n +2 \
  > "~{out_basename}.vadr.alerts.tsv"
cat "~{out_basename}.vadr.alerts.tsv" | wc -l > NUM_ALERTS

# record peak memory usage
set +o pipefail
{ if [ -f /sys/fs/cgroup/memory.peak ]; then cat /sys/fs/cgroup/memory.peak; elif [ -f /sys/fs/cgroup/memory/memory.peak ]; then cat /sys/fs/cgroup/memory/memory.peak; elif [ -f /sys/fs/cgroup/memory/memory.max_usage_in_bytes ]; then cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes; else echo "0"; fi } | tee MEM_BYTES

Outputs

Name Type Expression
feature_tbl File "~{out_basename}.tbl"
num_alerts Int read_int("NUM_ALERTS")
alerts_list File "~{out_basename}/~{out_basename}.vadr.alt.list"
alerts Array[String] read_lines("~{out_basename}.vadr.alerts.tsv")
outputs_tgz File "~{out_basename}.vadr.tar.gz"
pass Boolean num_alerts == 0
vadr_docker String docker
max_ram_gb Int ceil((read_float("MEM_BYTES") / 1000000000))

Runtime

Key Value
docker docker
memory mem_size + " GB"
cpu cpus
dx_instance_type "mem2_ssd1_v2_x4"
maxRetries 2
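
The optional vadr_model_tar must contain a single top-level wrapper directory holding the model files (the command symlinks vadr-untar/*/ into place); vadr_model_tar_subdir may name a subdirectory within it. A minimal sketch of building such a tarball in Python, assuming a local my-vadr-models/ directory (hypothetical names):

import tarfile

# package the models under one wrapper directory, as the task's untar/symlink step expects
with tarfile.open("vadr_model-custom.tar.gz", "w:gz") as tar:
    tar.add("my-vadr-models", arcname="vadr-models-custom-1.0")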

TASKS package_genbank_submissions

Inputs

Name Type Description Default
genbank_file_manifest Array[String] - -
genbank_submit_files Array[File] - -
spuid_base String - -
spuid_namespace String - -
account_name String - -
authors_sbt File - -
3 optional inputs with default values

Command

set -e

# bring everything into CWD -- at this point we are sure all files are uniquely named / no collisions
cp "~{sep="" "" genbank_submit_files}" .
cp "~{authors_sbt}" template.sbt

python3 << CODE
import json
import zipfile
import csv

# initialize counters and file lists for each submission category
counts_by_type = {}
files_by_type = {}
groups_total = list(m+"_"+v for m in ('table2asn', 'InfluenzaA', 'InfluenzaB', 'InfluenzaC', 'SARS-CoV-2', 'Norovirus', 'Dengue') for v in ('clean', 'warnings'))
for group in groups_total:
  counts_by_type[group] = 0
  files_by_type[group] = []

# read manifest from genbank_single
for genome in json.loads('[~{sep="," genbank_file_manifest}]'):
  group = genome['submission_type'] + '_' + ('clean' if genome['validation_passing'] else 'warnings')
  counts_by_type[group] += 1
  files_by_type[group].extend(genome['files'])

# function for writing submission.xml files
def write_submission_xml(spuid, spuid_namespace, account_name, wizard=None):
  fname = spuid + ".xml"
  submission_name = spuid
  with open(fname, 'wt') as outf:
    outf.write('<?xml version="1.0"?>\n')
    outf.write('<Submission>\n')
    outf.write('  <Description>\n')
    outf.write('    <Comment>{}</Comment>\n'.format(submission_name))
    outf.write('    <Organization type="center" role="owner">\n')
    outf.write('      <Name>{}</Name>\n'.format(account_name))
    outf.write('    </Organization>\n')
    outf.write('  </Description>\n')
    outf.write('  <Action>\n')
    outf.write('    <AddFiles target_db="GenBank">\n')
    outf.write('      <File file_path="{}.zip">\n'.format(spuid))
    outf.write('        <DataType>genbank-submission-package</DataType>\n')
    outf.write('      </File>\n')
    if wizard:
      outf.write('      <Attribute name="wizard">{}</Attribute>\n'.format(wizard))
    outf.write('      <Identifier>\n')
    outf.write('        <SPUID spuid_namespace="{}">{}</SPUID>\n'.format(spuid_namespace, spuid))
    outf.write('      </Identifier>\n')
    outf.write('    </AddFiles>\n')
    outf.write('  </Action>\n')
    outf.write('</Submission>\n')

# write and bundle outputs
for group in groups_total:
  with open("count_"+group+".int", 'wt') as outf:
    outf.write(str(counts_by_type[group]))
  if counts_by_type[group]:
    if group.startswith('table2asn'):
      # bundle the sqn files together and that's it
      with zipfile.ZipFile("~{spuid_base}_" + group + ".zip", 'w') as zf:
        for fname in files_by_type[group]:
          zf.write(fname)
    else:
      # for special submissions, merge each individual file type
      with open("sequences.fsa", 'wt') as out_fsa:
        with open("comment.cmt", 'wt') as out_cmt:
          with open("source.src", 'wt') as out_src:
            src_header = ""
            cmt_header = ""
            for fname in files_by_type[group]:
              if fname.endswith('.fsa') or fname.endswith('.fasta'):
                with open(fname) as inf:
                  out_fsa.write(inf.read())
              elif fname.endswith('.cmt'):
                with open(fname) as inf:
                  header = inf.readline()
                  if cmt_header:
                    assert header == cmt_header
                  else:
                    cmt_header = header
                    out_cmt.write(header)
                  out_cmt.write(inf.read())
              elif fname.endswith('.src'):
                with open(fname) as inf:
                  header = inf.readline()
                  if src_header:
                    assert header == src_header
                  else:
                    src_header = header
                    out_src.write(header)
                  out_src.write(inf.read())
      # make FTP zips and xmls
      if group.startswith('SARS-CoV-2'):
        with zipfile.ZipFile("~{spuid_base}_FTP_" + group + ".zip", 'w') as zf:
          for fname in ('sequences.fsa', 'comment.cmt', 'source.src', 'template.sbt'):
            zf.write(fname)
        wizard = "BankIt_SARSCoV2_api"
        write_submission_xml("~{spuid_base}_FTP_" + group, "~{spuid_namespace}", "~{account_name}", wizard=wizard)
      elif group.startswith('Influenza'):
        with zipfile.ZipFile("~{spuid_base}_FTP_" + group + ".zip", 'w') as zf:
          for fname in ('sequences.fsa', 'comment.cmt', 'source.src', 'template.sbt'):
            zf.write(fname)
        wizard = "BankIt_influenza_api"
        write_submission_xml("~{spuid_base}_FTP_" + group, "~{spuid_namespace}", "~{account_name}", wizard=wizard)
        # make a different src file for web after making FTP zip
        # because for some reason, you can't use the same source.src file for web and FTP for flu
        # strain column is required for FTP and prohibited for web
        with open('source.src', 'rt') as inf:
          reader = csv.DictReader(inf, delimiter='\t')
          out_header = list(h for h in reader.fieldnames if h != 'strain')
          rows = []
          for row in reader:
            del row['strain']
            rows.append(row)
        with open('source.src', 'wt') as outf:
          writer = csv.DictWriter(outf, delimiter='\t', fieldnames=out_header, dialect=csv.unix_dialect, quoting=csv.QUOTE_MINIMAL)
          writer.writeheader()
          for row in rows:
            writer.writerow(row)

      # make web portal zips
      with zipfile.ZipFile("~{spuid_base}_" + group + ".zip", 'w') as zf:
        for fname in ('sequences.fsa', 'comment.cmt', 'source.src', 'template.sbt'):
          zf.write(fname)
CODE

Outputs

Name Type Expression
ftp_submission_files Array[File] glob("~{spuid_base}_FTP_*_clean.*")
submit_sqns_clean_zip File? "~{spuid_base}_table2asn_clean.zip"
submit_sqns_warnings_zip File? "~{spuid_base}_table2asn_warnings.zip"
num_sqns_clean Int read_int("count_table2asn_clean.int")
num_sqns_warnings Int read_int("count_table2asn_warnings.int")
submit_fluA_clean_zip File? "~{spuid_base}_InfluenzaA_clean.zip"
submit_fluA_warnings_zip File? "~{spuid_base}_InfluenzaA_warnings.zip"
num_fluA_clean Int read_int("count_InfluenzaA_clean.int")
num_fluA_warnings Int read_int("count_InfluenzaA_warnings.int")
submit_fluB_clean_zip File? "~{spuid_base}_InfluenzaB_clean.zip"
submit_fluB_warnings_zip File? "~{spuid_base}_InfluenzaB_warnings.zip"
num_fluB_clean Int read_int("count_InfluenzaB_clean.int")
num_fluB_warnings Int read_int("count_InfluenzaB_warnings.int")
submit_fluC_clean_zip File? "~{spuid_base}_InfluenzaC_clean.zip"
submit_fluC_warnings_zip File? "~{spuid_base}_InfluenzaC_warnings.zip"
num_fluC_clean Int read_int("count_InfluenzaC_clean.int")
num_fluC_warnings Int read_int("count_InfluenzaC_warnings.int")
submit_sc2_clean_zip File? "~{spuid_base}_SARS-CoV-2_clean.zip"
submit_sc2_warnings_zip File? "~{spuid_base}_SARS-CoV-2_warnings.zip"
num_sc2_clean Int read_int("count_SARS-CoV-2_clean.int")
num_sc2_warnings Int read_int("count_SARS-CoV-2_warnings.int")
submit_noro_clean_zip File? "~{spuid_base}_Norovirus_clean.zip"
submit_noro_warnings_zip File? "~{spuid_base}_Norovirus_warnings.zip"
num_noro_clean Int read_int("count_Norovirus_clean.int")
num_noro_warnings Int read_int("count_Norovirus_warnings.int")
submit_dengue_clean_zip File? "~{spuid_base}_Dengue_clean.zip"
submit_dengue_warnings_zip File? "~{spuid_base}_Dengue_warnings.zip"
num_dengue_clean Int read_int("count_Dengue_clean.int")
num_dengue_warnings Int read_int("count_Dengue_warnings.int")

Runtime

Key Value
docker docker
memory mem_size + " GB"
cpu cpus
dx_instance_type "mem1_ssd1_v2_x2"
maxRetries 1
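
Each element of genbank_file_manifest is a JSON object; the command above reads its 'submission_type', 'validation_passing', and 'files' keys to route genomes into the per-organism clean/warnings bundles. A minimal sketch of one entry (hypothetical filenames), built in Python:

import json

entry = {
    "submission_type": "SARS-CoV-2",   # one of: table2asn, InfluenzaA/B/C, SARS-CoV-2, Norovirus, Dengue
    "validation_passing": True,        # True routes the genome to the "_clean" bundle, False to "_warnings"
    "files": ["sample1.fsa", "sample1.cmt", "sample1.src"],
}
print(json.dumps(entry))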

tasks_ncbi - WDL Source Code

version 1.0

task download_fasta {
  input {
    String         out_prefix
    Array[String]+ accessions
    String         emailAddress

    String         docker = "quay.io/broadinstitute/viral-phylo:2.5.1.0"
  }

  command {
    ncbi.py --version | tee VERSION
    ncbi.py fetch_fastas \
        ${emailAddress} \
        . \
        ${sep=' ' accessions} \
        --combinedFilePrefix ${out_prefix} \
  }

  output {
    File   sequences_fasta  = "${out_prefix}.fasta"
    String viralngs_version = read_string("VERSION")
  }

  runtime {
    docker: docker
    memory: "7 GB"
    cpu: 2
    dx_instance_type: "mem2_ssd1_v2_x2"
    maxRetries: 2
  }
}

task download_fasta_from_accession_string {
  meta {
    description: "Download GenBank sequences from a space-separated string of accessions (optionally with coverage specifications like 'accession:12.5x') and combine into a single fasta file."
  }

  input {
    String accession_string
    String out_prefix
    String emailAddress

    String docker = "quay.io/broadinstitute/viral-phylo:2.5.1.0"
  }

  command <<<
    set -e
    ncbi.py --version | tee VERSION

    # Parse accession string to extract just the accession IDs (strip coverage values if present)
    python3 <<CODE
    import re
    accession_string = "~{accession_string}"
    # Split by whitespace and extract accession (before colon if present)
    accessions = []
    for pair in accession_string.split():
        if ':' in pair:
            accessions.append(pair.split(':')[0])
        else:
            accessions.append(pair)

    with open('accessions.txt', 'w') as f:
        for acc in accessions:
            f.write(acc + '\n')
    CODE

    # Download sequences using extracted accessions
    ncbi.py fetch_fastas \
        ~{emailAddress} \
        . \
        $(cat accessions.txt | tr '\n' ' ') \
        --combinedFilePrefix ~{out_prefix}
  >>>

  output {
    File   sequences_fasta  = "~{out_prefix}.fasta"
    String viralngs_version = read_string("VERSION")
  }

  runtime {
    docker: docker
    memory: "7 GB"
    cpu: 2
    dx_instance_type: "mem2_ssd1_v2_x2"
    maxRetries: 2
  }
}

task download_annotations {
  input {
    Array[String]+ accessions
    String         emailAddress
    String         combined_out_prefix

    String         docker = "quay.io/broadinstitute/viral-phylo:2.5.1.0"
  }

  command <<<
    set -ex -o pipefail
    ncbi.py --version | tee VERSION
    ncbi.py fetch_feature_tables \
        ~{emailAddress} \
        ./ \
        ~{sep=' ' accessions} \
        --loglevel DEBUG
    mkdir -p combined
    ncbi.py fetch_fastas \
        ~{emailAddress} \
        ./ \
        ~{sep=' ' accessions} \
        --combinedFilePrefix "combined/~{combined_out_prefix}" \
        --forceOverwrite \
        --loglevel DEBUG
  >>>

  output {
    File        combined_fasta   = "combined/~{combined_out_prefix}.fasta"
    Array[File] genomes_fasta    = glob("*.fasta")
    Array[File] features_tbl     = glob("*.tbl")
    String      viralngs_version = read_string("VERSION")
  }

  runtime {
    docker: docker
    memory: "7 GB"
    cpu: 2
    dx_instance_type: "mem2_ssd1_v2_x2"
    maxRetries: 2
  }
}

task download_ref_genomes_from_tsv {
  input {
    File      ref_genomes_tsv    # [tax_id, isolate_prefix, taxname, colon_delim_accession_list]
    String    emailAddress

    String    docker = "quay.io/broadinstitute/viral-phylo:2.5.1.0"
  }

  command <<<
    set -ex -o pipefail
    ncbi.py --version | tee VERSION
    mkdir -p combined

    python3<<CODE
    import csv
    import phylo.genbank
    with open("~{ref_genomes_tsv}", 'rt') as inf:
      reader = csv.DictReader(inf, delimiter='\t',
        fieldnames=['tax_id', 'isolate_prefix', 'taxname', 'accessions']) # backwards support for headerless tsvs
      # for the future: batch all the downloads in a single call and re-organize output files afterwards
      for ref_genome in reader:
        if ref_genome['tax_id'] != 'tax_id': # skip header
          accessions = ref_genome['accessions'].split(':')
          phylo.genbank.fetch_fastas_from_genbank(
            accessionList=accessions,
            destinationDir=".",
            emailAddress="~{emailAddress}",
            forceOverwrite=True,
            combinedFilePrefix="combined/" + '-'.join(accessions),
            removeSeparateFiles=False,
            chunkSize=500)
    CODE
  >>>

  output {
    Array[File] ref_genomes_fastas  = glob("combined/*.fasta")
    Int         num_references      = length(ref_genomes_fastas)
  }

  runtime {
    docker: docker
    memory: "7 GB"
    cpu: 2
    dx_instance_type: "mem2_ssd1_v2_x2"
    maxRetries: 2
  }
}
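
# Hypothetical example of a headerless ref_genomes_tsv row for the task above
# (tab-separated columns: tax_id, isolate_prefix, taxname, colon-delimited accession list):
#   11320    fluA    Influenza A virus    CY121680.1:CY121681.1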

task sequencing_platform_from_bam {
  input {
    File    bam

    String  docker = "quay.io/broadinstitute/viral-core:2.5.1"
  }

  command <<<
    set -ex -o pipefail
    samtools view -H "~{bam}" | grep '^@RG' | grep -o 'PL:[^[:space:]]*' | cut -d':' -f2 | sort | uniq > BAM_PLATFORMS
    if [ $(wc -l < BAM_PLATFORMS) -ne 1 ]; then
      echo "Other: hybrid" > GENBANK_SEQ_TECH
    elif grep -qi 'ILLUMINA' BAM_PLATFORMS; then
      echo "Illumina" > GENBANK_SEQ_TECH
    elif grep -qi 'ONT' BAM_PLATFORMS; then
      echo "Oxford Nanopore" > GENBANK_SEQ_TECH
    elif grep -qi 'PACBIO' BAM_PLATFORMS; then
      echo "PacBio" > GENBANK_SEQ_TECH
    elif grep -qi 'IONTORRENT' BAM_PLATFORMS; then
      echo "IonTorrent" > GENBANK_SEQ_TECH
    elif grep -qi 'SOLID' BAM_PLATFORMS; then
      echo "SOLiD" > GENBANK_SEQ_TECH
    elif grep -qi 'ULTIMA' BAM_PLATFORMS; then
      echo "Ultima" > GENBANK_SEQ_TECH
    elif grep -qi 'ELEMENT' BAM_PLATFORMS; then
      echo "Element" > GENBANK_SEQ_TECH
    elif grep -qi 'CAPILLARY' BAM_PLATFORMS; then
      echo "Sanger" > GENBANK_SEQ_TECH
    else
      echo "Haven't seen this one before!"
      exit 1
    fi
  >>>

  output {
    String  genbank_sequencing_technology  = read_string("GENBANK_SEQ_TECH")
  }

  runtime {
    docker: docker
    memory: "3 GB"
    cpu: 2
    dx_instance_type: "mem1_ssd1_v2_x2"
    maxRetries: 2
  }
}

task align_and_annot_transfer_single {
  meta {
    description: "Given a reference genome annotation in TBL format (e.g. from Genbank or RefSeq) and new genome not in Genbank, produce new annotation files (TBL format with appropriate coordinate conversions) for the new genome. Resulting output can be fed to tbl2asn for Genbank submission."
  }

  input {
    File         genome_fasta
    Array[File]+ reference_fastas
    Array[File]+ reference_feature_tables

    String       out_basename = basename(genome_fasta, '.fasta')
    Int          machine_mem_gb = 30
    String       docker = "quay.io/broadinstitute/viral-phylo:2.5.1.0"
  }

  parameter_meta {
    genome_fasta: {
      description: "New genome, all segments/chromosomes in one fasta file. Must contain the same number of sequences as reference_fasta",
      patterns: ["*.fasta"]
    }
    reference_fastas: {
      description: "Reference genome, each segment/chromosome in a separate fasta file, in the exact same count and order as the segments/chromosomes described in genome_fasta. Headers must be Genbank accessions.",
      patterns: ["*.fasta"]
    }
    reference_feature_tables: {
      description: "NCBI Genbank feature table, each segment/chromosome in a separate TBL file, in the exact same count and order as the segments/chromosomes described in genome_fasta and reference_fastas. Accession numbers in the TBL files must correspond exactly to those in reference_fasta.",
      patterns: ["*.tbl"]
    }
  }

  command <<<
    set -e
    ncbi.py --version | tee VERSION
    mkdir -p out
    ncbi.py tbl_transfer_multichr \
        "~{genome_fasta}" \
        out \
        --ref_fastas ~{sep=' ' reference_fastas} \
        --ref_tbls ~{sep=' ' reference_feature_tables} \
        --oob_clip \
        --loglevel DEBUG
    cat out/*.tbl > "~{out_basename}.tbl"
  >>>

  output {
    File         feature_tbl           = "~{out_basename}.tbl"
    #Array[File]+ genome_per_chr_tbls   = glob("out/*.tbl")
    #Array[File]+ genome_per_chr_fastas = glob("out/*.fasta")
    String       viralngs_version      = read_string("VERSION")
  }

  runtime {
    docker: docker
    memory: machine_mem_gb + " GB"
    cpu: 8
    dx_instance_type: "mem2_ssd1_v2_x4"
    preemptible: 1
    maxRetries: 2
  }
}

task structured_comments {
  input {
    File   assembly_stats_tsv

    File?  filter_to_ids

    String docker = "quay.io/broadinstitute/viral-core:2.5.1"
  }
  String out_base = basename(assembly_stats_tsv, '.txt')
  command <<<
    set -e

    python3 << CODE
    import util.file

    samples_to_filter_to = set()
    if "~{default='' filter_to_ids}":
        with open("~{default='' filter_to_ids}", 'rt') as inf:
            samples_to_filter_to = set(line.strip() for line in inf)

    out_headers = ('SeqID', 'StructuredCommentPrefix', 'Assembly Method', 'Coverage', 'Sequencing Technology', 'StructuredCommentSuffix')
    with open("~{out_base}.cmt", 'wt') as outf:
        outf.write('\t'.join(out_headers)+'\n')

        for row in util.file.read_tabfile_dict("~{assembly_stats_tsv}"):
            outrow = dict((h, row.get(h, '')) for h in out_headers)

            if samples_to_filter_to:
              if row['SeqID'] not in samples_to_filter_to:
                  continue

            if outrow['Coverage']:
              outrow['Coverage'] = "{}x".format(round(float(outrow['Coverage'])))
            outrow['StructuredCommentPrefix'] = 'Assembly-Data'
            outrow['StructuredCommentSuffix'] = 'Assembly-Data'
            outf.write('\t'.join(outrow[h] for h in out_headers)+'\n')
    CODE
  >>>
  output {
    File   structured_comment_table = "~{out_base}.cmt"
  }
  runtime {
    docker: docker
    memory: "1 GB"
    cpu: 1
    dx_instance_type: "mem1_ssd1_v2_x2"
    maxRetries: 2
  }
}

task structured_comments_from_aligned_bam {
  input {
    File    aligned_bam
    String  assembly_method
    String  assembly_method_version

    String  out_basename = basename(aligned_bam, '.bam')
    Boolean is_genome_assembly = true
    Boolean sanitize_ids = true
    String  docker = "quay.io/broadinstitute/viral-core:2.5.1"
  }
  # see https://www.ncbi.nlm.nih.gov/genbank/structuredcomment/
  command <<<
    set -e -o pipefail

    cp "~{aligned_bam}" aligned.bam
    reports.py coverage_only aligned.bam coverage.txt
    cat coverage.txt

    samtools view -H "~{aligned_bam}" | grep '^@SQ' | grep -o 'SN:[^[:space:]]*' | cut -d':' -f2 | tee SEQ_IDS

    samtools view -H "~{aligned_bam}" | grep '^@RG' | grep -o 'PL:[^[:space:]]*' | cut -d':' -f2 | sort | uniq | tee BAM_PLATFORMS
    if [ $(wc -l < BAM_PLATFORMS) -ne 1 ]; then
      echo "Other: hybrid" > GENBANK_SEQ_TECH
    elif grep -qi 'ILLUMINA' BAM_PLATFORMS; then
      echo "Illumina" > GENBANK_SEQ_TECH
    elif grep -qi 'ONT' BAM_PLATFORMS; then
      echo "Oxford Nanopore" > GENBANK_SEQ_TECH
    elif grep -qi 'PACBIO' BAM_PLATFORMS; then
      echo "PacBio" > GENBANK_SEQ_TECH
    elif grep -qi 'IONTORRENT' BAM_PLATFORMS; then
      echo "IonTorrent" > GENBANK_SEQ_TECH
    elif grep -qi 'SOLID' BAM_PLATFORMS; then
      echo "SOLiD" > GENBANK_SEQ_TECH
    elif grep -qi 'ULTIMA' BAM_PLATFORMS; then
      echo "Ultima" > GENBANK_SEQ_TECH
    elif grep -qi 'ELEMENT' BAM_PLATFORMS; then
      echo "Element" > GENBANK_SEQ_TECH
    elif grep -qi 'CAPILLARY' BAM_PLATFORMS; then
      echo "Sanger" > GENBANK_SEQ_TECH
    else
      echo "Haven't seen this one before!"
      exit 1
    fi
    cat GENBANK_SEQ_TECH

    python3 << CODE
    import Bio.SeqIO
    import csv
    import re

    # get list of sequence IDs from BAM header
    with open("SEQ_IDS") as inf:
        seqids = set(line.strip() for line in inf)
        if ~{true="True" false="False" sanitize_ids}:
          seqids = set(re.sub(r'[^0-9A-Za-z!_-]', '-', x) for x in seqids)

    # get sequencing technology from BAM header    
    with open("GENBANK_SEQ_TECH", "rt") as inf:
        sequencing_tech = inf.read().strip()

    # this header has to be in this specific order -- don't reorder the columns!
    out_headers = ('SeqID', 'StructuredCommentPrefix', 'Assembly Method', "~{true='Genome ' false='' is_genome_assembly}Coverage", 'Sequencing Technology', 'StructuredCommentSuffix')
    with open("~{out_basename}.cmt", 'wt') as outf:
      outf.write('\t'.join(out_headers)+'\n')

      with open("coverage.txt", "rt") as inf:
        for row in csv.DictReader(inf, delimiter='\t'):
          if row.get('sample') and row.get('aln2self_cov_median') and row['sample'] in seqids:
            outrow = {
              'SeqID': row['sample'],
              'Assembly Method': "~{assembly_method} v. ~{assembly_method_version}",  # note: the <tool name> v. <version name> format is required by NCBI, don't remove the " v. "
              'Sequencing Technology': sequencing_tech,
              "~{true='Genome ' false='' is_genome_assembly}Coverage": "{}x".format(round(float(row['aln2self_cov_median']))),
              'StructuredCommentPrefix': "~{true='Genome-' false='' is_genome_assembly}Assembly-Data",
              'StructuredCommentSuffix': "~{true='Genome-' false='' is_genome_assembly}Assembly-Data",
            }
            outf.write('\t'.join(outrow[h] for h in out_headers)+'\n')
    CODE
  >>>
  output {
    File   structured_comment_file = "~{out_basename}.cmt"
  }
  runtime {
    docker: docker
    memory: "2 GB"
    cpu: 2
    dx_instance_type: "mem1_ssd1_v2_x2"
    maxRetries: 2
  }
}
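
# Hypothetical example of a row written to the .cmt file above (tab-separated, with
# is_genome_assembly=true, assembly_method="minimap2", assembly_method_version="2.28"):
#   SeqID: sample1, StructuredCommentPrefix: Genome-Assembly-Data,
#   Assembly Method: "minimap2 v. 2.28", Genome Coverage: "85x",
#   Sequencing Technology: "Illumina", StructuredCommentSuffix: Genome-Assembly-Data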

task prefix_fasta_header {
  input {
    File   genome_fasta
    String prefix
    String out_basename = basename(genome_fasta, ".fasta")
  }
  command <<<
    set -e
    python3 <<CODE
    with open('~{genome_fasta}', 'rt') as inf:
      with open('~{out_basename}.fasta', 'wt') as outf:
        for line in inf:
          if line.startswith('>'):
            line = ">{}{}\n".format('~{prefix}', line.rstrip()[1:])
          outf.write(line)
    CODE
  >>>
  output {
    File renamed_fasta = "~{out_basename}.fasta"
  }
  runtime {
    docker: "python:slim"
    memory: "1 GB"
    cpu: 1
    dx_instance_type: "mem1_ssd1_v2_x2"
    maxRetries: 2
  }
}

task rename_fasta_header {
  input {
    File   genome_fasta
    String new_name

    String out_basename = basename(genome_fasta, ".fasta")

    String docker = "quay.io/broadinstitute/viral-core:2.5.1"
  }
  command {
    set -e
    file_utils.py rename_fasta_sequences \
      "~{genome_fasta}" "~{out_basename}.fasta" "~{new_name}"
  }
  output {
    File renamed_fasta = "~{out_basename}.fasta"
  }
  runtime {
    docker: docker
    memory: "1 GB"
    cpu: 1
    dx_instance_type: "mem1_ssd1_v2_x2"
    maxRetries: 2
  }
}

task gisaid_meta_prep {
  input {
    File    source_modifier_table
    File    structured_comments
    String  out_name
    String  continent = "North America"
    Boolean strict = true
    String? username
    String  submitting_lab_name
    String? fasta_filename

    String  address_map = '{}'
    String  authors_map = '{}'
  }
  command <<<
    python3 << CODE
    import os.path
    import csv
    import json

    strict = ~{true="True" false="False" strict}

    # institutional mappings
    address_map = json.loads('~{address_map}')
    authors_map = json.loads('~{authors_map}')
    assert "~{submitting_lab_name}" in address_map, f"error: institution '~{submitting_lab_name}' not found in address_map"

    # lookup table files to dicts
    sample_to_cmt = {}
    with open('~{structured_comments}', 'rt') as inf:
      for row in csv.DictReader(inf, delimiter='\t'):
        sample_to_cmt[row['SeqID']] = row

    out_headers = ('submitter', 'fn', 'covv_virus_name', 'covv_type', 'covv_passage', 'covv_collection_date', 'covv_location', 'covv_add_location', 'covv_host', 'covv_add_host_info', 'covv_sampling_strategy', 'covv_gender', 'covv_patient_age', 'covv_patient_status', 'covv_specimen', 'covv_outbreak', 'covv_last_vaccinated', 'covv_treatment', 'covv_seq_technology', 'covv_assembly_method', 'covv_coverage', 'covv_orig_lab', 'covv_orig_lab_addr', 'covv_provider_sample_id', 'covv_subm_lab', 'covv_subm_lab_addr', 'covv_subm_sample_id', 'covv_authors', 'covv_comment', 'comment_type')

    with open('~{out_name}', 'w', newline='') as outf:
      writer = csv.DictWriter(outf, out_headers, dialect=csv.unix_dialect, quoting=csv.QUOTE_MINIMAL)
      writer.writeheader()

      with open('~{source_modifier_table}', 'rt') as inf:
        for row in csv.DictReader(inf, delimiter='\t'):

          isolation_source = row['isolation_source'].lower()
          #covv_specimen
          if strict:
            valid_isolation_sources = ('clinical', 'environmental')
            assert isolation_source in valid_isolation_sources, f"Metadata error: 'isolation_source' not one of: {valid_isolation_sources}\n{row}"
            assert row['host'] == 'Homo sapiens' or isolation_source == 'environmental', f"Metadata error: 'host' must be 'Homo sapiens' if 'isolation_source' is not 'Environmental'\n{row}"
            assert row['organism']         == 'Severe acute respiratory syndrome coronavirus 2', f"'organism' != 'Severe acute respiratory syndrome coronavirus 2'\n{row}"
            assert row['db_xref']          == 'taxon:2697049', f"Metadata error: 'db_xref' != 'taxon:2697049'\n{row}"

            collected_by = row['collected_by']
            assert collected_by in address_map, f"error: institution '{collected_by}' not found in address_map"
            assert collected_by in authors_map, f"error: institution '{collected_by}' not found in authors_map"

          # PHA4GE/INSDC controlled vocabulary for source information
          # from "Vocabulary" tab of this sheet:
          #   https://github.com/pha4ge/SARS-CoV-2-Contextual-Data-Specification/blob/master/PHA4GE%20SARS-CoV-2%20Contextual%20Data%20Template.xlsx
          gisaid_specimen_source = "unknown"
          if isolation_source == 'clinical':
            gisaid_specimen_source = row.get("body_product",row.get("anatomical_material",row.get("anatomical_part","missing")))
          if isolation_source == 'environmental':
            gisaid_specimen_source = row.get("environmental_material",row.get("environmental_site","missing"))

          writer.writerow({
            'covv_virus_name'     : 'hCoV-19/' +row['Sequence_ID'],
            'covv_collection_date': row['collection_date'],
            'covv_location'       : '~{continent} / ' + row['country'].replace(':',' /'),

            'covv_type'           : 'betacoronavirus',
            'covv_passage'        : 'Original',
            'covv_host'           : 'Human' if isolation_source == 'clinical' else isolation_source.replace("environmental","Environment"),
            'covv_add_host_info'  : 'unknown',
            'covv_gender'         : 'unknown',
            'covv_patient_age'    : 'unknown',
            'covv_patient_status' : 'unknown',
            'covv_specimen'       : gisaid_specimen_source.capitalize(), # capitalization of the first word seems to be the norm for GISAID

            'covv_assembly_method': sample_to_cmt[row['Sequence_ID']]['Assembly Method'],
            'covv_coverage'       : sample_to_cmt[row['Sequence_ID']]['Coverage'],
            'covv_seq_technology' : sample_to_cmt[row['Sequence_ID']]['Sequencing Technology'],

            'covv_orig_lab'       : row['collected_by'],
            'covv_subm_lab'       : "~{submitting_lab_name}",
            'covv_authors'        : authors_map.get(row['collected_by'], 'REQUIRED'),
            'covv_orig_lab_addr'  : address_map.get(row['collected_by'], 'REQUIRED'),
            'covv_subm_lab_addr'  : address_map.get("~{submitting_lab_name}", 'REQUIRED'),

            'submitter'           : "~{default='REQUIRED' username}",
            'fn'                  : "~{default='REQUIRED' fasta_filename}",

            'covv_sampling_strategy'  : row.get('note',''),
          })

    CODE
  >>>
  output {
    File meta_csv = "~{out_name}"
  }
  runtime {
    docker: "python:slim"
    memory: "1 GB"
    cpu: 1
    dx_instance_type: "mem1_ssd1_v2_x2"
    maxRetries: 2
  }
}
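
# The address_map and authors_map inputs above are JSON strings keyed by institution name
# (the same strings that appear in 'collected_by' and submitting_lab_name). Hypothetical example:
#   address_map = '{"Example Public Health Lab": "123 Main St, Boston, MA 02115, USA"}'
#   authors_map = '{"Example Public Health Lab": "Doe,Jane, Smith,J.Q."}'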

task lookup_table_by_filename {
  input {
    String id
    File   mapping_tsv
    Int    return_col = 2

    String docker = "ubuntu"
  }
  command {
    set -e -o pipefail
    grep ^"~{id}" ~{mapping_tsv} | cut -f ~{return_col} > OUTVAL
  }
  output {
    String value = read_string("OUTVAL")
  }
  runtime {
    docker: docker
    memory: "1 GB"
    cpu: 1
    dx_instance_type: "mem1_ssd1_v2_x2"
    maxRetries: 2
  }
}

task sra_meta_prep {
  meta {
    description: "Prepare tables for submission to NCBI's SRA database. This only works on bam files produced by illumina.py illumina_demux --append_run_id in viral-core."
  }
  input {
    Array[File] cleaned_bam_filepaths
    File        biosample_map
    Array[File] library_metadata
    String      platform
    String      instrument_model
    String      title
    Boolean     paired

    String      out_name = "sra_metadata.tsv"
    String      docker="quay.io/broadinstitute/viral-core:2.5.1"
  }
  Int disk_size = 100
  parameter_meta {
    cleaned_bam_filepaths: {
      description: "Unaligned bam files containing cleaned (submittable) reads.",
      localization_optional: true,
      stream: true,
      patterns: ["*.bam"]
    }
    biosample_map: {
      description: "Tab text file with a header and at least two columns named accession and sample_name. 'accession' maps to the BioSample accession number. Any samples without an accession will be omitted from output. 'sample_name' maps to the internal lab sample name used in filenames, samplesheets, and library_metadata files.",
      patterns: ["*.txt", "*.tsv"]
    }
    library_metadata: {
      description: "Tab text file with a header and at least six columns (sample, library_id_per_sample, library_strategy, library_source, library_selection, design_description). See 3rd tab of https://www.ncbi.nlm.nih.gov/core/assets/sra/files/SRA_metadata_acc_example.xlsx for controlled vocabulary and term definition.",
      patterns: ["*.txt", "*.tsv"]
    }
    platform: {
      description: "Sequencing platform (one of _LS454, ABI_SOLID, BGISEQ, CAPILLARY, COMPLETE_GENOMICS, HELICOS, ILLUMINA, ION_TORRENT, OXFORD_NANOPORE, PACBIO_SMRT)."
    }
    instrument_model: {
      description: "Sequencing instrument model (examples for platform=ILLUMINA: HiSeq X Five, HiSeq X Ten, Illumina Genome Analyzer, Illumina Genome Analyzer II, Illumina Genome Analyzer IIx, Illumina HiScanSQ, Illumina HiSeq 1000, Illumina HiSeq 1500, Illumina HiSeq 2000, Illumina HiSeq 2500, Illumina HiSeq 3000, Illumina HiSeq 4000, Illumina iSeq 100, Illumina NovaSeq 6000, Illumina MiniSeq, Illumina MiSeq, NextSeq 500, NextSeq 550)."
    }
    title: {
      description: "Descriptive sentence of the form <method> of <organism>, e.g. Metagenomic RNA-seq of SARS-CoV-2."
    }
  }
  command <<<
    python3 << CODE
    import os.path
    import csv
    import util.file

    # WDL arrays to python arrays
    bam_uris = list(x for x in '~{sep="*" cleaned_bam_filepaths}'.split('*') if x)
    library_metadata = list(x for x in '~{sep="*" library_metadata}'.split('*') if x)

    # lookup table files to dicts
    lib_to_bams = {}
    sample_to_biosample = {}
    for bam in bam_uris:
      # filename must be <libraryname>.<flowcell>.<lane>.cleaned.bam or <libraryname>.<flowcell>.<lane>.bam
      bam_base = os.path.basename(bam)
      bam_parts = bam_base.split('.')
      assert bam_parts[-1] == 'bam', "filename does not end in .bam -- {}".format(bam) 
      bam_parts = bam_parts[:-1]
      if bam_parts[-1] == 'cleaned':
        bam_parts = bam_parts[:-1]
      assert len(bam_parts) >= 3, "filename does not conform to <libraryname>.<flowcell>.<lane>.cleaned.bam -- {}".format(bam_base)
      lib = '.'.join(bam_parts[:-2]) # drop flowcell and lane
      lib_to_bams.setdefault(lib, [])
      lib_to_bams[lib].append(bam_base)
      print("debug: registering lib={} bam={}".format(lib, bam_base))
    with open('~{biosample_map}', 'rt') as inf:
      for row in csv.DictReader(inf, delimiter='\t'):
        sample_to_biosample[row['sample_name']] = row['accession']

    # set up SRA metadata table
    outrows = []
    out_headers = ['biosample_accession', 'library_ID', 'title', 'library_strategy', 'library_source', 'library_selection', 'library_layout', 'platform', 'instrument_model', 'design_description', 'filetype', 'assembly', 'filename']

    # iterate through library_metadata entries and produce an output row for each entry
    libs_written = set()
    for libfile in library_metadata:
      with open(libfile, 'rt') as inf:
        for row in csv.DictReader(inf, delimiter='\t'):
          lib = util.file.string_to_file_name("{}.l{}".format(row['sample'], row['library_id_per_sample']))
          biosample = sample_to_biosample.get(row['sample'],'')
          bams = lib_to_bams.get(lib,[])
          print("debug: sample={} lib={} biosample={}, bams={}".format(row['sample'], lib, biosample, bams))
          if biosample and bams and lib not in libs_written:
            libs_written.add(lib)
            outrows.append({
              'biosample_accession': sample_to_biosample[row['sample']],
              'library_ID': lib,
              'title': "~{title}",
              'library_strategy': row.get('library_strategy',''),
              'library_source': row.get('library_source',''),
              'library_selection': row.get('library_selection',''),
              'library_layout': '~{true="paired" false="single" paired}',
              'platform': '~{platform}',
              'instrument_model': '~{instrument_model}',
              'design_description': row.get('design_description',''),
              'filetype': 'bam',
              'assembly': 'unaligned',
              'files': lib_to_bams[lib],
            })
    assert outrows, "failed to prepare any metadata -- output is empty!"

    # find library with the most files and add col headers
    n_cols = max(len(row['files']) for row in outrows)
    for i in range(n_cols-1):
      out_headers.append('filename{}'.format(i+2))

    # write output file
    with open('~{out_name}', 'wt') as outf:
      outf.write('\t'.join(out_headers)+'\n')
      for row in outrows:
        row['filename'] = row['files'][0]
        for i in range(len(row['files'])):
          row['filename{}'.format(i+1)] = row['files'][i]
        outf.write('\t'.join(row.get(h,'') for h in out_headers)+'\n')
    CODE
  >>>
  output {
    File sra_metadata     = "~{out_name}"
    File cleaned_bam_uris = write_lines(cleaned_bam_filepaths)
  }
  runtime {
    docker: docker
    memory: "1 GB"
    cpu: 1
    disks:  "local-disk " + disk_size + " HDD"
    disk: disk_size + " GB" # TES
    dx_instance_type: "mem1_ssd1_v2_x2"
    maxRetries: 2
  }
}
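
# Note on the table written above: each library produces one row; its first bam goes in the
# 'filename' column and any additional bams for the same library go in extra 'filename2',
# 'filename3', ... columns, sized to the library with the most files.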

task biosample_to_table {
  meta {
    description: "Reformats a BioSample registration attributes table (attributes.tsv) for ingest into a Terra table."
  }
  input {
    File        biosample_attributes_tsv
    Array[File] raw_bam_filepaths
    File        demux_meta_json

    String  sample_table_name  = "sample"
    String  docker = "python:slim"
  }
  String  sanitized_id_col = "entity:~{sample_table_name}_id"
  String base = basename(basename(biosample_attributes_tsv, ".txt"), ".tsv")
  parameter_meta {
    raw_bam_filepaths: {
      description: "Unaligned bam files containing raw reads.",
      localization_optional: true,
      stream: true,
      patterns: ["*.bam"]
    }
  }
  command <<<
    set -ex -o pipefail
    python3 << CODE
    import os.path
    import csv
    import json

    # load demux metadata
    with open("~{demux_meta_json}", 'rt') as inf:
      demux_meta_by_file = json.load(inf)

    # load list of bams surviving filters
    bam_fnames = list(os.path.basename(x) for x in '~{sep="*" raw_bam_filepaths}'.split('*'))
    bam_fnames = list(x[:-len('.bam')] if x.endswith('.bam') else x for x in bam_fnames)
    bam_fnames = list(x[:-len('.cleaned')] if x.endswith('.cleaned') else x for x in bam_fnames)
    print("bam basenames ({}): {}".format(len(bam_fnames), bam_fnames))
    sample_to_sanitized = {demux_meta_by_file.get(x, {}).get('sample_original'): demux_meta_by_file.get(x, {}).get('sample') for x in bam_fnames}
    if None in sample_to_sanitized:
      del sample_to_sanitized[None]
    sample_names_seen = sample_to_sanitized.keys()
    print("samples seen ({}): {}".format(len(sample_names_seen), sorted(sample_names_seen)))

    # load biosample metadata
    biosample_attributes = []
    biosample_headers = ['biosample_accession']
    with open('~{biosample_attributes_tsv}', 'rt') as inf:
      for row in csv.DictReader(inf, delimiter='\t'):
        if row['sample_name'] in sample_names_seen and row['message'] == "Successfully loaded":
          row['biosample_accession'] = row.get('accession')
          row = dict({k:v for k,v in row.items() if v.strip().lower() not in ('missing', 'na', 'not applicable', 'not collected', '')})
          for k,v in row.items():
            if v and (k not in biosample_headers) and k not in ('message', 'accession'):
              biosample_headers.append(k)
          row['biosample_json'] = json.dumps({k: v for k,v in row.items() if k in biosample_headers})
          biosample_attributes.append(row)
    biosample_headers.append('biosample_json')

    print("biosample headers ({}): {}".format(len(biosample_headers), biosample_headers))
    print("biosample output rows ({})".format(len(biosample_attributes)))
    samples_seen_without_biosample = set(sample_names_seen) - set(row['sample_name'] for row in biosample_attributes)
    print("samples seen in bams without biosample entries ({}): {}".format(len(samples_seen_without_biosample), sorted(samples_seen_without_biosample)))

    # write reformatted table
    with open('~{base}.entities.tsv', 'w', newline='') as outf:
      writer = csv.DictWriter(outf, delimiter='\t', fieldnames=["~{sanitized_id_col}"]+biosample_headers, dialect=csv.unix_dialect, quoting=csv.QUOTE_MINIMAL)
      writer.writeheader()
      for row in biosample_attributes:
        outrow = {h: row.get(h, '') for h in biosample_headers}
        outrow["~{sanitized_id_col}"] = sample_to_sanitized[row['sample_name']]
        writer.writerow(outrow)
    CODE
  >>>
  output {
    File sample_meta_tsv = "~{base}.entities.tsv"
  }
  runtime {
    docker: docker
    memory: "1 GB"
    cpu: 1
    dx_instance_type: "mem1_ssd1_v2_x2"
    maxRetries: 2
  }
}

task biosample_to_genbank {
  meta {
    description: "Prepares two input metadata files for Genbank submission based on a BioSample registration attributes table (attributes.tsv) since all of the necessary values are there. This produces both a Genbank Source Modifier Table and a BioSample ID map file that can be fed into the prepare_genbank task."
  }
  input {
    File    biosample_attributes
    Int     taxid
    Int     num_segments = 1
    String  biosample_col_for_fasta_headers = "sample_name"

    File?   filter_to_ids
    String? filter_to_accession
    Map[String,String] src_to_attr_map = {}
    String?  organism_name_override
    String?  sequence_id_override
    String?  isolate_prefix_override
    File?    source_overrides_json # optional set of key-value pairs to override source metadata

    Boolean sanitize_seq_ids = true

    String  out_basename = basename(basename(biosample_attributes, ".txt"), ".tsv")
    String  docker = "python:slim"
  }
  command <<<
    set -e
    python3<<CODE
    import csv
    import json
    import re
    import os.path

    header_key_map = {
        'Sequence_ID':'~{biosample_col_for_fasta_headers}',
        'BioProject':'bioproject_accession',
        'BioSample':'accession',
    }
    with open("~{write_json(src_to_attr_map)}", 'rt') as inf:
        more_header_key_map = json.load(inf)
    for k,v in more_header_key_map.items():
        header_key_map[k] = v
    print("header_key_map: {}".format(header_key_map))

    out_headers_total = ['Sequence_ID', 'isolate', 'collection_date', 'geo_loc_name', 'collected_by', 'isolation_source', 'organism', 'host', 'note', 'db_xref', 'BioProject', 'BioSample']
    samples_to_filter_to = set()
    if "~{default='' filter_to_ids}":
        with open("~{filter_to_ids}", 'rt') as inf:
            samples_to_filter_to = set(line.strip() for line in inf)
            print("filtering to samples: {}".format(samples_to_filter_to))
    only_accession = "~{default='' filter_to_accession}" if "~{default='' filter_to_accession}" else None
    if only_accession:
        print("filtering to biosample: {}".format(only_accession))

    # read entire tsv -> biosample_attributes, filtered to only the entries we keep
    with open("~{biosample_attributes}", 'rt') as inf_biosample:
      biosample_attributes_reader = csv.DictReader(inf_biosample, delimiter='\t')
      in_headers = biosample_attributes_reader.fieldnames
      if 'accession' not in in_headers:
        assert 'biosample_accession' in in_headers, "no accession column found in ~{biosample_attributes}"
        header_key_map['BioSample'] = 'biosample_accession'
      biosample_attributes = list(row for row in biosample_attributes_reader
        if row.get('message', 'Success').startswith('Success')
        and (not only_accession or row[header_key_map['BioSample']] == only_accession)
        and (not samples_to_filter_to or row[header_key_map['Sequence_ID']] in samples_to_filter_to))
      print("filtered to {} samples".format(len(biosample_attributes)))

    # clean out some na values
    for row in biosample_attributes:
      for k in ('strain', 'isolate', 'host', 'geo_loc_name', 'collection_date', 'serotype', 'food_origin'):
        if row.get(k, '').lower().strip() in ('na', 'not applicable', 'missing', 'not collected', 'not available', ''):
            row[k] = ''

    # override organism_name if provided (this allows us to submit Genbank assemblies for
    # specific species even though the metagenomic BioSample may have been registered with a different
    # species or none at all)
    if "~{default='' organism_name_override}":
        for row in biosample_attributes:
            row['organism'] = "~{default='' organism_name_override}"

    # handle special submission types: flu, sc2, noro, dengue
    special_bugs = ('Influenza A virus', 'Influenza B virus', 'Influenza C virus',
                    'Severe acute respiratory syndrome coronavirus 2',
                    'Norovirus', 'Dengue virus')
    for special in special_bugs:
        # sanitize organism name if it's a special one
        for row in biosample_attributes:
          if row['organism'].startswith(special):
              row['organism'] = special

        # enforce that special submissions are all the same special thing
        if any(row['organism'] == special for row in biosample_attributes):
          print("special organism found " + special)
          assert all(row['organism'] == special for row in biosample_attributes), "if any samples are {}, all samples must be {}".format(special, special)
          ### Influenza-specific requirements
          if special.startswith('Influenza'):
            print("special organism is Influenza A/B/C")
            # simplify isolate name
            if row.get('strain'):
              header_key_map['isolate'] = 'strain'
            if 'serotype' not in out_headers_total:
              out_headers_total.append('serotype')
            for row in biosample_attributes:
              # simplify isolate name more if it still looks structured with metadata (not allowed for flu submissions)
              if len(row[header_key_map.get('isolate', 'isolate')].split('/')) >= 2:
                row[header_key_map.get('isolate', 'isolate')] = row[header_key_map.get('isolate', 'isolate')].split('/')[-2]
              # populate serotype from name parsing
              match = re.search(r'\(([^()]+)\)+$', row['sample_name'])
              if match:
                  row['serotype'] = match.group(1)
                  print("found serotype {}". format(row['serotype']))
              # populate host field from name parsing if empty, override milk
              if not row.get('host','').strip():
                match = re.search(r'[^/]+/([^/]+)/[^/]+/[^/]+/[^/]+', row['sample_name'])
                if match:
                    row['host'] = match.group(1)
                    if row['host'] == 'bovine_milk':
                      row['host'] = 'Cattle'
                    assert 'host' in out_headers_total
              # override geo_loc_name if food_origin exists
              if row.get('food_origin','').strip():
                  print("overriding geo_loc_name with food_origin")
                  row['geo_loc_name'] = row['food_origin']

    # prepare output headers
    out_headers = list(h for h in out_headers_total if header_key_map.get(h,h) in in_headers)
    if 'db_xref' not in out_headers:
        out_headers.append('db_xref')
    if 'note' not in out_headers:
        out_headers.append('note')
    if 'serotype' in out_headers_total and 'serotype' not in out_headers:
        out_headers.append('serotype')
    if 'strain' not in out_headers and any(row['organism'].startswith('Influenza') for row in biosample_attributes):
        out_headers.append('strain')

    # manual overrides if provided
    if os.path.isfile("~{source_overrides_json}"):
      with open("~{source_overrides_json}", 'rt') as inf:
        for k,v in json.load(inf).items():
          print("overriding {} with {}".format(k,v))
          if k not in out_headers:
            out_headers.append(k)
          for row in biosample_attributes:
            row[k] = v

    # write output source modifier table
    with open("~{out_basename}.src", 'wt') as outf_smt:
      outf_smt.write('\t'.join(out_headers)+'\n')

      with open("~{out_basename}.sample_ids.txt", 'wt') as outf_ids:
          for row in biosample_attributes:
            # populate output row as a dict
            outrow = dict((h, row.get(header_key_map.get(h,h), '')) for h in out_headers)

            ### Taxon-specific genome naming rules go here
            if outrow['organism'].startswith('Influenza'):
              ### Influenza-specific isolate naming was handled above already and is required to be metadata-free
              # FTP submission pathway requires us to do the naming via the strain field
              type = outrow['organism'].split()[1] # A, B, C, D, etc
              state = outrow['geo_loc_name'].split(':')[1].strip() if ':' in outrow['geo_loc_name'] else outrow['geo_loc_name']
              year = outrow['collection_date'].split('-')[0]
              outrow['strain'] = '/'.join([type, state, outrow['isolate'], year])
              print("new strain name: {}".format(outrow['strain']))
            elif outrow['organism'].startswith('Special taxon with special naming rules'):
              ### Example special case here
              pass
            else:
              ### Most viral taxa can follow this generic model containing structured metadata
              # <short organism name>/<host lowercase>/Country/ST-Institution-LabID/Year
              # e.g. RSV-A/human/USA/MA-Broad-1234/2020
              # e.g. SARS-CoV-2/human/USA/MA-Broad-1234/2020
              # assume that the current isolate name has the structure:
              # <something wrong to be overwritten>/<host>/<country>/<state>-<institution>-<labid>/<year>
              print("old isolate name: {}".format(outrow['isolate']))
              year = outrow['collection_date'].split('-')[0]
              country = outrow['geo_loc_name'].split(':')[0]
              host = outrow['host'].lower()

              # host name mapping to the informal names used by NCBI in sequence titles for certain species
              host_to_informal_name_map = {
                'homo sapiens': 'human',
                'sus scrofa domesticus': 'swine',
                'sus scrofa': 'swine'
              }
              host = host_to_informal_name_map.get(host, host)

              if len(outrow['isolate'].split('/')) >= 2:
                state_inst_labid = outrow['isolate'].split('/')[-2]
              else:
                state_inst_labid = outrow['Sequence_ID']
              name_prefix = outrow['organism'].split('(')[0].split('/')[0].strip()
              if name_prefix.lower().endswith('refseq'):
                name_prefix = name_prefix[:-6].strip()
              if "~{default='' isolate_prefix_override}".strip():
                name_prefix = "~{default='' isolate_prefix_override}".strip()
              outrow['isolate'] = '/'.join([name_prefix, host, country, state_inst_labid, year])
              print("new isolate name: {}".format(outrow['isolate']))

            # some fields are not allowed to be empty
            if not outrow.get('geo_loc_name'):
                outrow['geo_loc_name'] = 'missing'
            if not outrow.get('host'):
                outrow['host'] = 'not applicable'

            # custom db_xref/taxon
            outrow['db_xref'] = "taxon:{}".format(~{taxid})

            # load the purpose of sequencing (or if not, the purpose of sampling) in the note field
            outrow['note'] = row.get('purpose_of_sequencing', row.get('purpose_of_sampling', ''))

            # overwrite sequence ID if requested
            if "~{default='' sequence_id_override}":
                outrow['Sequence_ID'] = "~{default='' sequence_id_override}"

            # sanitize sequence IDs to match fasta headers
            if "~{sanitize_seq_ids}".lower() == 'true':
                outrow['Sequence_ID'] = re.sub(r'[^0-9A-Za-z!_-]', '-', outrow['Sequence_ID'])

            # write isolate name for this sample (it will overwrite and only keep the last one if there are many)
            with open("isolate_name.str", 'wt') as outf_isolate:
                print("writing isolate name: {}".format(outrow['isolate']))
                outf_isolate.write(outrow['isolate']+'\n')

            # write entry for this sample
            sample_name = outrow['Sequence_ID']
            if ~{num_segments}>1:
                for i in range(~{num_segments}):
                    outrow['Sequence_ID'] = "{}-{}".format(sample_name, i+1)
                    outf_smt.write('\t'.join(outrow[h] for h in out_headers)+'\n')
                    outf_ids.write(outrow['Sequence_ID']+'\n')
            else:
                outf_smt.write('\t'.join(outrow[h] for h in out_headers)+'\n')
                outf_ids.write(outrow['Sequence_ID']+'\n')
    CODE
  >>>
  output {
    File   genbank_source_modifier_table = "~{out_basename}.src"
    File   sample_ids                    = "~{out_basename}.sample_ids.txt"
    String isolate_name                  = read_string("isolate_name.str")
  }
  runtime {
    docker: docker
    memory: "1 GB"
    cpu: 1
    dx_instance_type: "mem1_ssd1_v2_x2"
    maxRetries: 2
  }
}
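
# Worked example (hypothetical metadata) of the Influenza strain naming performed above:
#   organism "Influenza A virus", geo_loc_name "USA: Massachusetts", isolate "12345",
#   collection_date "2024-01-02"  ->  strain "A/Massachusetts/12345/2024"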

task generate_author_sbt_file {
  meta {
    description: "Generate an NCBI-compatible author sbt file for submission of sequence data to GenBank. Accepts an author string, a defaults yaml file, and a jinja2-format template. Output is comparable to what is generated by http://www.ncbi.nlm.nih.gov/WebSub/template.cgi"
  }

  input {
    String? author_list
    File    j2_template
    File?   defaults_yaml
    String  out_base = "authors"

    String  docker = "quay.io/broadinstitute/py3-bio:0.1.2"
  }

  parameter_meta {
    author_list: {
      description: "A string containing a space-delimited list with of author surnames separated by first name and (optional) middle initial. Ex. 'Lastname,Firstname, Last-hypenated,First,M., Last,F.'"
    }
    j2_template: {
      description: "an sbt file (optionally) with Jinja2 variables to be filled in based on values present in author_sbt_defaults_yaml, if provided. If no yaml is provided, this file is passed through verbatim. Example: gs://pathogen-public-dbs/other-related/author_template.sbt.j2"
    }
    defaults_yaml: {
      description: "A YAML file with default values to use for the submitter, submitter affiliation, and author affiliation. Optionally including authors at the start and end of the author_list. Example: gs://pathogen-public-dbs/other-related/default_sbt_values.yaml",
      patterns: ["*.yaml","*.yml"]
    }
    out_base: {
      description: "prefix to use for the generated *.sbt output file"
    }
  }
  
  command <<<
    set -e

    # blank yaml file to be used if the optional input is not specified
    touch blank.yml

    python3 << CODE
    # generates an sbt file of the format returned by:
    # http://www.ncbi.nlm.nih.gov/WebSub/template.cgi
    import re
    import shutil
    # external dependencies
    import yaml # pyyaml
    from jinja2 import Template #jinja2

    def render_sbt(author_string, defaults_yaml=None, sbt_out_path="authors.sbt", j2_template="author_template.sbt.j2"):
        # simple version for only initials: #author_re=re.compile(r"\s?(?P<lastname>[\w\'\-\ ]+),(?P<initials>(?:[A-Z]\.){1,3})")
        author_re=re.compile(r"\s?(?P<lastname>[\w\'\-\ ]+),((?P<first>\w[\w\'\-\ ]+\.?),?|(?P<initials>(?:[A-Z]\.)+))(?P<initials_ext>(?:[A-Z]\.)*)")

        authors=[]
        defaults_data_last_authors=[]
        defaults_data = {}

        authors_affil = None
        submitter     = None
        bioproject    = None
        title         = None
        citation      = None

        if defaults_yaml is not None:
            with open(defaults_yaml) as defaults_yaml:
                defaults_data = yaml.load(defaults_yaml, Loader=yaml.FullLoader)

                if defaults_data is not None:
                    submitter     = defaults_data.get("submitter")
                    bioproject    = defaults_data.get("bioproject")
                    title         = defaults_data.get("title")
                    citation      = defaults_data.get("citation")
                    authors_affil = defaults_data.get("authors_affil")
                    
                    defaults_data_authors = defaults_data.get("authors_start",[])
                    for author in defaults_data_authors:
                        authors.extend(author)

                    defaults_data_last_authors = defaults_data.get("authors_last",[])
                    # these trailing authors are appended to the parsed author list below
        
        for author_match in author_re.finditer(author_string):
            author = {}
            lastname=author_match.group("lastname")
            initials=[]
            if author_match.group("initials"):
                initials.extend(list(filter(None,author_match.group("initials").split("."))))
            if author_match.group("initials_ext"):
                initials.extend(list(filter(None,author_match.group("initials_ext").split("."))))

            first=""
            if author_match.group("first"):
                first=author_match.group("first")
            else:
                first=initials[0]+"."
            author["last"]     = author_match.group("lastname")
            author["first"]    = first
            author["initials"]   = ".".join(initials[1:]) if not author_match.group("first") else ".".join(initials)
            author["initials"]   = author["initials"]+"." if len(author["initials"])>0 else author["initials"]
            
            if author not in authors: # could use less exact match
                authors.append(author)

        for author in defaults_data_last_authors:
            if author not in authors:
                authors.append(author)

        jinja_rendering_kwargs={}
        if authors_affil is not None:
            jinja_rendering_kwargs["authors_affil"]=authors_affil
        if title is not None:
            jinja_rendering_kwargs["title"]=title
        if submitter is not None:
            jinja_rendering_kwargs["submitter"]=submitter
        if citation is not None:
            jinja_rendering_kwargs["citation"]=citation
        if bioproject is not None:
            jinja_rendering_kwargs["bioproject"]=bioproject

        if len(authors) >= 1 or len(jinja_rendering_kwargs) >= 1:
            with open(j2_template) as sbt_template:
                template = Template(sbt_template.read())

            rendered = template.render( authors=authors, 
                                        **jinja_rendering_kwargs)
        
            #print(rendered)
            with open(sbt_out_path,"w") as sbt_out:
                sbt_out.write(rendered)
        else:
            # if no authors were specified, simply copy the template to the output
            shutil.copyfile(j2_template, sbt_out_path)

    render_sbt("~{author_list}", defaults_yaml="~{default='blank.yml' defaults_yaml}", sbt_out_path="~{out_base}.sbt", j2_template="~{j2_template}")
    CODE
  >>>
  output {
    File sbt_file = "~{out_base}.sbt"
  }
  runtime {
    docker: docker
    memory: "1 GB"
    cpu: 1
    dx_instance_type: "mem1_ssd1_v2_x2"
    maxRetries: 2
  }
}
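
For reference, the render_sbt() helper above only consults a handful of top-level keys in the optional defaults YAML (submitter, bioproject, title, citation, authors_affil, authors_start, authors_last). Below is a minimal sketch of such a file and how it is read; the values are purely illustrative, and the author entries are shown in the same last/first/initials shape the parsing code builds (the exact schema expected by the Jinja2 template is defined by the template itself).

import yaml

# illustrative defaults file -- only these keys are read by render_sbt() above
example_defaults = """
submitter: "Example Submitting Center"
bioproject: "PRJNA000000"
title: "Example viral genome submission"
citation: "Unpublished"
authors_affil: "Example Institute, 123 Example St, Boston, MA, USA"
authors_start:
  - {last: "Doe", first: "Jane", initials: "Q."}
authors_last:
  - {last: "Smith", first: "John", initials: ""}
"""

defaults_data = yaml.load(example_defaults, Loader=yaml.FullLoader)
print(defaults_data.get("submitter"))      # Example Submitting Center
print(defaults_data.get("authors_start"))  # [{'last': 'Doe', 'first': 'Jane', 'initials': 'Q.'}]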

task table2asn {
  meta {
    description: "This task runs NCBI's table2asn, the artist formerly known as tbl2asn"
  }

  input {
    File         assembly_fasta
    File         annotations_tbl
    File         authors_sbt
    File?        source_modifier_table
    File?        structured_comment_file
    String?      comment
    String       organism
    String       mol_type = "cRNA"
    Int          genetic_code = 1

    String       out_basename = basename(assembly_fasta, ".fasta")
    Int          machine_mem_gb = 8
    String       docker = "quay.io/broadinstitute/viral-phylo:2.5.1.0"  # this could be a simpler docker image, we don't use anything beyond table2asn itself
  }
  Int disk_size = 50

  parameter_meta {
    assembly_fasta: {
      description: "Assembled genome. All chromosomes/segments in one file.",
      patterns: ["*.fasta"]
    }
    annotations_tbl: {
      description: "Gene annotations in TBL format, one per fasta file. Filename basenames must match the assemblies_fasta basenames. These files are typically output from the ncbi.annot_transfer task.",
      patterns: ["*.tbl"]
    }
    authors_sbt: {
      description: "A genbank submission template file (SBT) with the author list, created at https://submit.ncbi.nlm.nih.gov/genbank/template/submission/",
      patterns: ["*.sbt"]
    }
    source_modifier_table: {
      description: "A tab-delimited text file containing requisite metadata for Genbank (a 'source modifier table'). https://www.ncbi.nlm.nih.gov/WebSub/html/help/genbank-source-table.html",
      patterns: ["*.src"]
    }
    structured_comment_file: {
      patterns: ["*.cmt"]
    }
    organism: {
      description: "The scientific name for the organism being submitted. This is typically the species name and should match the name given by the NCBI Taxonomy database. For more info, see: https://www.ncbi.nlm.nih.gov/Sequin/sequin.hlp.html#Organism"
    }
    mol_type: {
      description: "The type of molecule being described. Any value allowed by the INSDC controlled vocabulary may be used here. Valid values are described at http://www.insdc.org/controlled-vocabulary-moltype-qualifier"
    }
    comment: {
      description: "Optional comments that can be displayed in the COMMENT section of the Genbank record. This may include any disclaimers about assembly quality or notes about pre-publication availability or requests to discuss pre-publication use with authors."
    }
  }

  command <<<
    set -ex -o pipefail
    table2asn -version | cut -f 2 -d ' ' > TABLE2ASN_VERSION
    cp "~{assembly_fasta}" "~{out_basename}.fsa" # input fasta must be in CWD so output files end up next to it
    touch "~{out_basename}.val"  # this file isn't produced if no errors/warnings

    table2asn \
      -t "~{authors_sbt}" \
      -i "~{out_basename}.fsa" \
      -f "~{annotations_tbl}" \
      ~{'-w "' + structured_comment_file + '"'} \
      -j '[gcode=~{genetic_code}][moltype=~{mol_type}][organism=~{organism}]' \
      ~{'-src-file "' + source_modifier_table + '"'} \
      ~{'-y "' + comment + '"'} \
      -a s -V vb

    set +x
    echo "table2asn complete"
    cat "~{out_basename}.val" | { grep -vi '^Info:' || test $? = 1; } | tee "~{out_basename}.val.no_info"
    cat "~{out_basename}.val.no_info" | { grep -vi '^Warning: valid' || test $? = 1; } | tee "~{out_basename}.val.errors_only"
  >>>

  output {
    File          genbank_submission_sqn   = "~{out_basename}.sqn"
    File          genbank_preview_file     = "~{out_basename}.gbf"
    File          genbank_validation_file  = "~{out_basename}.val"
    Array[String] table2asn_errors         = read_lines("~{out_basename}.val")
    String        table2asn_version        = read_string("TABLE2ASN_VERSION")
    Boolean       table2asn_passing        = length(read_lines("~{out_basename}.val.errors_only")) == 0
  }

  runtime {
    docker: docker
    memory: machine_mem_gb + " GB"
    cpu: 2
    dx_instance_type: "mem1_ssd1_v2_x2"
    maxRetries: 2
    disks:  "local-disk " + disk_size + " HDD"
    disk: disk_size + " GB" # TES
  }
}
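
The table2asn_passing output above is determined purely by filtering the .val report: drop lines beginning with 'Info:', then drop lines beginning with 'Warning: valid', and call the result clean if nothing remains. A rough Python equivalent of that grep pipeline, for reference only:

def table2asn_passing(val_path):
    """Rough equivalent of the grep filtering in the command block above."""
    with open(val_path) as inf:
        lines = inf.readlines()
    # grep -vi '^Info:'
    no_info = [l for l in lines if not l.lower().startswith("info:")]
    # grep -vi '^Warning: valid'
    errors_only = [l for l in no_info if not l.lower().startswith("warning: valid")]
    return len(errors_only) == 0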

task package_special_genbank_ftp_submission {
  meta {
    description: "Prepares a zip and xml file for FTP-based NCBI Genbank submission according to instructions at https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/genbank/SARS-CoV-2/."
  }
  input {
    File   sequences_fasta
    File   structured_comment_table
    File   source_modifier_table
    File   author_template_sbt
    String submission_name
    String submission_uid
    String spuid_namespace
    String account_name
    String wizard="BankIt_SARSCoV2_api"

    String  docker = "quay.io/broadinstitute/viral-baseimage:0.3.0"
  }
  command <<<
    set -e

    # make the submission zip file
    cp "~{sequences_fasta}" sequence.fsa
    cp "~{structured_comment_table}" comment.cmt
    cp "~{source_modifier_table}" source.src
    cp "~{author_template_sbt}" template.sbt
    zip "~{submission_uid}.zip" sequence.fsa comment.cmt source.src template.sbt

    # make the submission xml file
    SUB_NAME="~{submission_name}"
    ACCT_NAME="~{account_name}"
    SPUID="~{submission_uid}"
    cat << EOF > submission.xml
    <?xml version="1.0"?>
    <Submission>
      <Description>
        <Comment>$SUB_NAME</Comment>
        <Organization type="center" role="owner">
          <Name>$ACCT_NAME</Name>
        </Organization>
      </Description>
      <Action>
        <AddFiles target_db="GenBank">
          <File file_path="$SPUID.zip">
            <DataType>genbank-submission-package</DataType>
          </File>
          <Attribute name="wizard">~{wizard}</Attribute>
          <Identifier>
            <SPUID spuid_namespace="~{spuid_namespace}">$SPUID</SPUID>
          </Identifier>
        </AddFiles>
      </Action>
    </Submission>
    EOF

    # make the (empty) ready file
    touch submit.ready
  >>>
  output {
    File submission_zip = "~{submission_uid}.zip"
    File submission_xml = "submission.xml"
    File submit_ready   = "submit.ready"
  }
  runtime {
    docker: docker
    memory: "1 GB"
    cpu: 1
    dx_instance_type: "mem1_ssd1_v2_x2"
    maxRetries: 2
  }
}
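
Whatever the original input filenames, the zip built above always contains the same four members: sequence.fsa, comment.cmt, source.src, and template.sbt. A small, purely illustrative check (not part of the task) that a package has all four before upload:

import zipfile

EXPECTED_MEMBERS = {"sequence.fsa", "comment.cmt", "source.src", "template.sbt"}

def check_submission_zip(zip_path):
    """Raise if the GenBank FTP package is missing any of the expected member files."""
    with zipfile.ZipFile(zip_path) as zf:
        missing = EXPECTED_MEMBERS - set(zf.namelist())
    if missing:
        raise ValueError("submission zip missing: " + ", ".join(sorted(missing)))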

task genbank_special_taxa {
  meta {
    description: "this task tells you if you have a special taxon that NCBI treats differently"
  }

  input {
    Int     taxid
    File    taxdump_tgz
    File    vadr_by_taxid_tsv # "gs://pathogen-public-dbs/viral-references/annotation/vadr/vadr-by-taxid.tsv"
    String  docker = "quay.io/broadinstitute/viral-classify:2.5.1.0"
  }

  command <<<
    set -e

    # unpack the taxdump tarball
    mkdir -p taxdump
    read_utils.py extract_tarball "~{taxdump_tgz}" taxdump

    python3 << CODE
    import csv
    import metagenomics
    import tarfile
    import urllib.request
    import json
    import re
    import sys
    taxid = ~{taxid}

    # load taxdb and retrieve full hierarchy leading to this taxid
    taxdb = metagenomics.TaxonomyDb(tax_dir="taxdump", load_nodes=True, load_names=True, load_gis=False)
    ancestors = taxdb.get_ordered_ancestors(taxid)
    this_and_ancestors = [taxid] + ancestors

    # Genbank prohibits normal submissions for SC2, Flu A/B/C, Noro, and Dengue
    table2asn_prohibited = {
      11320: "Influenza A virus",
      11520: "Influenza B virus",
      11552: "Influenza C virus",
      2697049: "Severe acute respiratory syndrome coronavirus 2",
      11983: "Norovirus",
      3052464: "Dengue virus"
    }
    prohibited = any(node in table2asn_prohibited for node in this_and_ancestors)
    with open("table2asn_allowed.boolean", "wt") as outf:
      outf.write("false" if prohibited else "true")
    with open("genbank_submission_mechanism.str", "wt") as outf:
      if any(node == 11320 for node in this_and_ancestors):
        outf.write("InfluenzaA")
      elif any(node == 11520 for node in this_and_ancestors):
        outf.write("InfluenzaB")
      elif any(node == 11552 for node in this_and_ancestors):
        outf.write("InfluenzaC")
      elif any(node == 2697049 for node in this_and_ancestors):
        outf.write("SARS-CoV-2")
      elif any(node == 11983 for node in this_and_ancestors):
        outf.write("Norovirus")
      elif any(node == 3052464 for node in this_and_ancestors):
        outf.write("Dengue")
      else:
        outf.write("table2asn")

    # Genbank requires certain fields to be populated for Flu A and Dengue
    if any(node == 11320 for node in this_and_ancestors):
      # flu A needs subtype specified in the serotype column
      with open("genbank_source_overrides.json", "wt") as outf:
        # This taxid matches a specific named strain
        match = re.search(r'\(([^()]+)\)+$', taxdb.names[taxid])
        if match:
          subtype = match.group(1)
          print("found Influenza A subtype {}". format(subtype))
          json.dump({'serotype':subtype}, outf)
        else:
          # Influenza A has a number of taxids at the subtype level
          match = re.search(r'(\w+)\s+subtype$', taxdb.names[taxid])
          if match:
            subtype = match.group(1)
            print("found Influenza A subtype {}". format(subtype))
            json.dump({'serotype':subtype}, outf)
          else:
            print("failed to find Influenza A subtype from taxid {}, taxname {}".format(taxid, taxdb.names[taxid]))
            sys.exit(1)
    elif any(node == 3052464 for node in this_and_ancestors):
      # dengue needs serotype specified in the genotype column
      with open("genbank_source_overrides.json", "wt") as outf:
        match = re.search(r'(\d)$', taxdb.names[taxid])
        if match:
          serotype = match.group(1)
          print("found Dengue serotype {}". format(serotype))
          json.dump({'genotype':serotype}, outf)
        else:
          print("failed to find Dengue serotype from taxid {}, taxname {}".format(taxid, taxdb.names[taxid]))
          sys.exit(1)

    # VADR is an annotation tool that supports SC2, Flu A/B/C/D, Noro, Dengue, RSV A/B, MPXV, etc
    # https://github.com/ncbi/vadr/wiki/Available-VADR-model-files
    # Note this table includes some taxa that are subtaxa of others and in those cases, it is ORDERED from
    # more specific to less specific (e.g. noro before calici, dengue before flavi, sc2 before corona)
    # so use the *first* hit in the table.
    # tsv header: tax_id, taxon_name, min_seq_len, max_seq_len, vadr_min_ram_gb, vadr_opts, vadr_model_tar_url
    with open("~{vadr_by_taxid_tsv}", 'rt') as inf:
      vadr_supported = list(row for row in csv.DictReader(inf, delimiter='\t'))

    out_vadr_supported = False
    out_vadr_cli_options = ""
    out_vadr_model_tar_url = ""
    out_min_genome_length = 0
    out_max_genome_length = 1000000
    out_vadr_taxid = 0
    out_vadr_min_ram_gb = 8
    out_vadr_model_tar_subdir = ""
    for row in vadr_supported:
      if any(node == int(row['tax_id']) for node in this_and_ancestors):
        out_vadr_taxid = int(row['tax_id'])
        out_vadr_supported = True
        out_vadr_cli_options = row['vadr_opts']
        out_vadr_model_tar_url = row['vadr_model_tar_url']
        if row['min_seq_len']:
          out_min_genome_length = int(row['min_seq_len'])
        if row['max_seq_len']:
          out_max_genome_length = int(row['max_seq_len'])
        if row['vadr_min_ram_gb']:
          out_vadr_min_ram_gb = int(row['vadr_min_ram_gb'])
        if row.get('vadr_model_tar_subdir'):  # tolerate tsv files that lack this optional column
          out_vadr_model_tar_subdir = row['vadr_model_tar_subdir']
        break
    with open("vadr_supported.boolean", "wt") as outf:
      outf.write("true" if out_vadr_supported else "false")
    with open("vadr_cli_options.string", "wt") as outf:
      outf.write(out_vadr_cli_options)
    with open("min_genome_length.int", "wt") as outf:
      outf.write(str(out_min_genome_length))
    with open("max_genome_length.int", "wt") as outf:
      outf.write(str(out_max_genome_length))
    with open("vadr_taxid.int", "wt") as outf:
      outf.write(str(out_vadr_taxid))
    with open("vadr_min_ram_gb.int", "wt") as outf:
      outf.write(str(out_vadr_min_ram_gb))
    with open("vadr_model_tar_subdir.str", "wt") as outf:
      outf.write(out_vadr_model_tar_subdir)

    if out_vadr_model_tar_url:
      urllib.request.urlretrieve(out_vadr_model_tar_url, "vadr_model-~{taxid}.tar.gz")

    CODE
  >>>

  output {
    Boolean  table2asn_allowed     = read_boolean("table2asn_allowed.boolean")
    String   genbank_submission_mechanism = read_string("genbank_submission_mechanism.str")
    File?    genbank_source_overrides_json = "genbank_source_overrides.json"
    Boolean  vadr_supported        = read_boolean("vadr_supported.boolean")
    String   vadr_cli_options      = read_string("vadr_cli_options.string")
    File?    vadr_model_tar        = "vadr_model-~{taxid}.tar.gz"
    String   vadr_model_tar_subdir = read_string("vadr_model_tar_subdir.str")
    Int      vadr_taxid            = read_int("vadr_taxid.int")
    Int      vadr_min_ram_gb       = read_int("vadr_min_ram_gb.int")
    Int      min_genome_length     = read_int("min_genome_length.int")
    Int      max_genome_length     = read_int("max_genome_length.int")
  }

  runtime {
    docker: docker
    memory: "2 GB"
    cpu: 1
    dx_instance_type: "mem1_ssd1_v2_x2"
    maxRetries: 2
  }
}
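
As the comments in the command block note, vadr-by-taxid.tsv is ordered from more specific to less specific taxa, and the lookup takes the first row whose tax_id appears anywhere in the query lineage. A minimal sketch of that first-hit-wins lookup with illustrative rows (the real tsv carries additional columns such as vadr_opts and vadr_model_tar_url):

def first_vadr_match(this_and_ancestors, vadr_rows):
    """Return the first tsv row whose tax_id is the query taxon or one of its ancestors."""
    for row in vadr_rows:  # rows are ordered specific -> general, so first hit wins
        if int(row['tax_id']) in this_and_ancestors:
            return row
    return None

# illustrative rows: a more specific taxon listed before its parent family
rows = [
    {'tax_id': '11983', 'taxon_name': 'Norovirus'},
    {'tax_id': '11974', 'taxon_name': 'Caliciviridae'},
]
# both rows sit in a norovirus lineage, but the more specific Norovirus row is returned
print(first_vadr_match([11983, 11974, 10239, 1], rows)['taxon_name'])  # Norovirus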

task vadr {
  meta {
    description: "Runs NCBI's Viral Annotation DefineR for annotation and QC."
  }
  input {
    File    genome_fasta
    String? vadr_opts
    Int?    minlen
    Int?    maxlen
    File?   vadr_model_tar
    String? vadr_model_tar_subdir

    String out_basename = basename(genome_fasta, '.fasta')
    String docker = "mirror.gcr.io/staphb/vadr:1.6.4"
    Int    mem_size = 16  # the RSV model in particular seems to consume 15GB RAM
    Int    cpus = 4
  }
  command <<<
    set -e
    # unpack custom VADR models or use default
    if [ -n "~{vadr_model_tar}" ]; then
      mkdir -p vadr-untar
      tar -C vadr-untar -xzvf "~{vadr_model_tar}"
      # just link the model subdirectory, not the outer wrapper
      ln -s vadr-untar/*/ vadr-models
    else
      # use default (distributed with docker image) models
      ln -s /opt/vadr/vadr-models vadr-models
    fi
    if [ -n "~{default='' vadr_model_tar_subdir}" ]; then
      VADR_MODEL_DIR="vadr-models/~{default='' vadr_model_tar_subdir}"
    else
      VADR_MODEL_DIR="vadr-models"
    fi

    # remove terminal ambiguous nucleotides
    /opt/vadr/vadr/miniscripts/fasta-trim-terminal-ambigs.pl \
      "~{genome_fasta}" \
      ~{'--minlen ' + minlen} \
      ~{'--maxlen ' + maxlen} \
      > "~{out_basename}.fasta"

    # run VADR
    v-annotate.pl \
      ~{default='' vadr_opts} \
      --split --cpu `nproc` \
      --mdir "$VADR_MODEL_DIR" \
      "~{out_basename}.fasta" \
      "~{out_basename}"

    # package everything for output
    tar -C "~{out_basename}" -czvf "~{out_basename}.vadr.tar.gz" .

    # get the gene annotations (feature table)
    # the vadr.fail.tbl sometimes contains junk / invalid content that needs to be filtered out
    cat "~{out_basename}/~{out_basename}.vadr.pass.tbl" \
        "~{out_basename}/~{out_basename}.vadr.fail.tbl" \
       | sed '/Additional note/,$d' \
        > "~{out_basename}.tbl"

    # prep alerts into a tsv file for parsing
    cat "~{out_basename}/~{out_basename}.vadr.alt.list" | cut -f 5 | tail -n +2 \
      > "~{out_basename}.vadr.alerts.tsv"
    cat "~{out_basename}.vadr.alerts.tsv" | wc -l > NUM_ALERTS

    # record peak memory usage
    set +o pipefail
    {
      if   [ -f /sys/fs/cgroup/memory.peak ]; then cat /sys/fs/cgroup/memory.peak
      elif [ -f /sys/fs/cgroup/memory/memory.peak ]; then cat /sys/fs/cgroup/memory/memory.peak
      elif [ -f /sys/fs/cgroup/memory/memory.max_usage_in_bytes ]; then cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes
      else echo "0"
      fi
    } | tee MEM_BYTES
  >>>
  output {
    File                 feature_tbl = "~{out_basename}.tbl"
    Int                  num_alerts  = read_int("NUM_ALERTS")
    File                 alerts_list = "~{out_basename}/~{out_basename}.vadr.alt.list"
    Array[String]        alerts      = read_lines("~{out_basename}.vadr.alerts.tsv")
    File                 outputs_tgz = "~{out_basename}.vadr.tar.gz"
    Boolean              pass        = num_alerts==0
    String               vadr_docker = docker
    Int                  max_ram_gb  = ceil(read_float("MEM_BYTES")/1000000000)
  }
  runtime {
    docker: docker
    memory: mem_size + " GB"
    cpu: cpus
    dx_instance_type: "mem2_ssd1_v2_x4"
    maxRetries: 2
  }
}
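
The feature table emitted above is simply the VADR pass and fail tables concatenated and then truncated at the first 'Additional note' line (the sed expression in the command block). A rough Python equivalent of that clean-up, for reference only:

def combined_feature_table(pass_tbl, fail_tbl):
    """Rough equivalent of the cat | sed '/Additional note/,$d' step above."""
    lines = []
    for path in (pass_tbl, fail_tbl):
        with open(path) as inf:
            lines.extend(inf.readlines())
    out = []
    for line in lines:
        if "Additional note" in line:
            break  # sed drops this line and everything after it
        out.append(line)
    return "".join(out)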

task package_genbank_submissions {
  input {
    Array[String]   genbank_file_manifest
    Array[File]     genbank_submit_files

    String          spuid_base
    String          spuid_namespace
    String          account_name
    File            authors_sbt

    String          docker = "python:slim"
    Int             mem_size = 2
    Int             cpus = 1
  }
  command <<<
    set -e

    # bring everything into CWD -- at this point we are sure all files are uniquely named / no collisions
    cp "~{sep='" "' genbank_submit_files}" .
    cp "~{authors_sbt}" template.sbt

    python3 << CODE
    import json
    import zipfile
    import csv

    # initialize counters and file lists for each submission category
    counts_by_type = {}
    files_by_type = {}
    groups_total = list(m+"_"+v for m in ('table2asn', 'InfluenzaA', 'InfluenzaB', 'InfluenzaC', 'SARS-CoV-2', 'Norovirus', 'Dengue') for v in ('clean', 'warnings'))
    for group in groups_total:
      counts_by_type[group] = 0
      files_by_type[group] = []

    # read manifest from genbank_single
    for genome in json.loads('[~{sep="," genbank_file_manifest}]'):
      group = genome['submission_type'] + '_' + ('clean' if genome['validation_passing'] else 'warnings')
      counts_by_type[group] += 1
      files_by_type[group].extend(genome['files'])

    # function for writing submission.xml files
    def write_submission_xml(spuid, spuid_namespace, account_name, wizard=None):
      fname = spuid + ".xml"
      submission_name = spuid
      with open(fname, 'wt') as outf:
        outf.write('<?xml version="1.0"?>\n')
        outf.write('<Submission>\n')
        outf.write('  <Description>\n')
        outf.write('    <Comment>{}</Comment>\n'.format(submission_name))
        outf.write('    <Organization type="center" role="owner">\n')
        outf.write('      <Name>{}</Name>\n'.format(account_name))
        outf.write('    </Organization>\n')
        outf.write('  </Description>\n')
        outf.write('  <Action>\n')
        outf.write('    <AddFiles target_db="GenBank">\n')
        outf.write('      <File file_path="{}.zip">\n'.format(spuid))
        outf.write('        <DataType>genbank-submission-package</DataType>\n')
        outf.write('      </File>\n')
        if wizard:
          outf.write('      <Attribute name="wizard">{}</Attribute>\n'.format(wizard))
        outf.write('      <Identifier>\n')
        outf.write('        <SPUID spuid_namespace="{}">{}</SPUID>\n'.format(spuid_namespace, spuid))
        outf.write('      </Identifier>\n')
        outf.write('    </AddFiles>\n')
        outf.write('  </Action>\n')
        outf.write('</Submission>\n')

    # write and bundle outputs
    for group in groups_total:
      with open("count_"+group+".int", 'wt') as outf:
        outf.write(str(counts_by_type[group]))
      if counts_by_type[group]:
        if group.startswith('table2asn'):
          # bundle the sqn files together and that's it
          with zipfile.ZipFile("~{spuid_base}_" + group + ".zip", 'w') as zf:
            for fname in files_by_type[group]:
              zf.write(fname)
        else:
          # for special submissions, merge each individual file type
          with open("sequences.fsa", 'wt') as out_fsa:
            with open("comment.cmt", 'wt') as out_cmt:
              with open("source.src", 'wt') as out_src:
                src_header = ""
                cmt_header = ""
                for fname in files_by_type[group]:
                  if fname.endswith('.fsa') or fname.endswith('.fasta'):
                    with open(fname) as inf:
                      out_fsa.write(inf.read())
                  elif fname.endswith('.cmt'):
                    with open(fname) as inf:
                      header = inf.readline()
                      if cmt_header:
                        assert header == cmt_header
                      else:
                        cmt_header = header
                        out_cmt.write(header)
                      out_cmt.write(inf.read())
                  elif fname.endswith('.src'):
                    with open(fname) as inf:
                      header = inf.readline()
                      if src_header:
                        assert header == src_header
                      else:
                        src_header = header
                        out_src.write(header)
                      out_src.write(inf.read())
          # make FTP zips and xmls
          if group.startswith('SARS-CoV-2'):
            with zipfile.ZipFile("~{spuid_base}_FTP_" + group + ".zip", 'w') as zf:
              for fname in ('sequences.fsa', 'comment.cmt', 'source.src', 'template.sbt'):
                zf.write(fname)
            wizard = "BankIt_SARSCoV2_api"
            write_submission_xml("~{spuid_base}_FTP_" + group, "~{spuid_namespace}", "~{account_name}", wizard=wizard)
          elif group.startswith('Influenza'):
            with zipfile.ZipFile("~{spuid_base}_FTP_" + group + ".zip", 'w') as zf:
              for fname in ('sequences.fsa', 'comment.cmt', 'source.src', 'template.sbt'):
                zf.write(fname)
            wizard = "BankIt_influenza_api"
            write_submission_xml("~{spuid_base}_FTP_" + group, "~{spuid_namespace}", "~{account_name}", wizard=wizard)
            # make a different src file for web after making FTP zip
            # because for some reason, you can't use the same source.src file for web and FTP for flu
            # strain column is required for FTP and prohibited for web
            with open('source.src', 'rt') as inf:
              reader = csv.DictReader(inf, delimiter='\t')
              out_header = list(h for h in reader.fieldnames if h != 'strain')
              rows = []
              for row in reader:
                del row['strain']
                rows.append(row)
            with open('source.src', 'wt') as outf:
              writer = csv.DictWriter(outf, delimiter='\t', fieldnames=out_header, dialect=csv.unix_dialect, quoting=csv.QUOTE_MINIMAL)
              writer.writeheader()
              for row in rows:
                writer.writerow(row)

          # make web portal zips
          with zipfile.ZipFile("~{spuid_base}_" + group + ".zip", 'w') as zf:
            for fname in ('sequences.fsa', 'comment.cmt', 'source.src', 'template.sbt'):
              zf.write(fname)
    CODE
  >>>

  output {
    Array[File] ftp_submission_files  = glob("~{spuid_base}_FTP_*_clean.*")
    File? submit_sqns_clean_zip       = "~{spuid_base}_table2asn_clean.zip"
    File? submit_sqns_warnings_zip    = "~{spuid_base}_table2asn_warnings.zip"
    Int   num_sqns_clean              = read_int("count_table2asn_clean.int")
    Int   num_sqns_warnings           = read_int("count_table2asn_warnings.int")
    File? submit_fluA_clean_zip       = "~{spuid_base}_InfluenzaA_clean.zip"
    File? submit_fluA_warnings_zip    = "~{spuid_base}_InfluenzaA_warnings.zip"
    Int   num_fluA_clean              = read_int("count_InfluenzaA_clean.int")
    Int   num_fluA_warnings           = read_int("count_InfluenzaA_warnings.int")
    File? submit_fluB_clean_zip       = "~{spuid_base}_InfluenzaB_clean.zip"
    File? submit_fluB_warnings_zip    = "~{spuid_base}_InfluenzaB_warnings.zip"
    Int   num_fluB_clean              = read_int("count_InfluenzaB_clean.int")
    Int   num_fluB_warnings           = read_int("count_InfluenzaB_warnings.int")
    File? submit_fluC_clean_zip       = "~{spuid_base}_InfluenzaC_clean.zip"
    File? submit_fluC_warnings_zip    = "~{spuid_base}_InfluenzaC_warnings.zip"
    Int   num_fluC_clean              = read_int("count_InfluenzaC_clean.int")
    Int   num_fluC_warnings           = read_int("count_InfluenzaC_warnings.int")
    File? submit_sc2_clean_zip        = "~{spuid_base}_SARS-CoV-2_clean.zip"
    File? submit_sc2_warnings_zip     = "~{spuid_base}_SARS-CoV-2_warnings.zip"
    Int   num_sc2_clean               = read_int("count_SARS-CoV-2_clean.int")
    Int   num_sc2_warnings            = read_int("count_SARS-CoV-2_warnings.int")
    File? submit_noro_clean_zip       = "~{spuid_base}_Norovirus_clean.zip"
    File? submit_noro_warnings_zip    = "~{spuid_base}_Norovirus_warnings.zip"
    Int   num_noro_clean              = read_int("count_Norovirus_clean.int")
    Int   num_noro_warnings           = read_int("count_Norovirus_warnings.int")
    File? submit_dengue_clean_zip     = "~{spuid_base}_Dengue_clean.zip"
    File? submit_dengue_warnings_zip  = "~{spuid_base}_Dengue_warnings.zip"
    Int   num_dengue_clean            = read_int("count_Dengue_clean.int")
    Int   num_dengue_warnings         = read_int("count_Dengue_warnings.int")
  }

  runtime {
    docker: docker
    memory: mem_size + " GB"
    cpu: cpus
    dx_instance_type: "mem1_ssd1_v2_x2"
    maxRetries: 1
  }
}
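
Each entry in genbank_file_manifest is a JSON object; the only fields consumed above are submission_type, validation_passing, and files. A minimal illustrative entry and the group name it maps to:

import json

# illustrative manifest entry (field names taken from the code above; values are examples)
entry = json.loads('{"submission_type": "SARS-CoV-2", "validation_passing": true, '
                   '"files": ["sample1.fsa", "sample1.cmt", "sample1.src"]}')

group = entry['submission_type'] + '_' + ('clean' if entry['validation_passing'] else 'warnings')
print(group)  # SARS-CoV-2_clean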