WORKFLOW
sarscov2_illumina_full
File Path
pipes/WDL/workflows/sarscov2_illumina_full.wdl
WDL Version
1.0
Type
workflow
Imports
Namespace
Path
read_utils
../tasks/tasks_read_utils.wdl
ncbi
../tasks/tasks_ncbi.wdl
nextstrain
../tasks/tasks_nextstrain.wdl
sarscov2
../tasks/tasks_sarscov2.wdl
terra
../tasks/tasks_terra.wdl
assembly
../tasks/tasks_assembly.wdl
utils
../tasks/tasks_utils.wdl
demux_deplete
demux_deplete.wdl
assemble_refbased
assemble_refbased.wdl
sarscov2_batch_relineage
sarscov2_batch_relineage.wdl
sarscov2_biosample_load
sarscov2_biosample_load.wdl
Workflow: sarscov2_illumina_full
Full SARS-CoV-2 analysis workflow starting from raw Illumina flowcell (tar.gz) and metadata and performing assembly, spike-in analysis, qc, lineage assignment, and packaging for data release.
Author: Broad Viral Genomics
Name
Type
Description
Default
flowcell_tgz
File
-
-
reference_fasta
File
Reference genome to align reads to.
-
amplicon_bed_prefix
String
amplicon primers to trim in reference coordinate space (0-based BED format)
-
read_structure
String?
-
-
biosample_attributes
Array[File]
A post-submission attributes file from NCBI BioSample, which is available at https://submit.ncbi.nlm.nih.gov/subs/ and clicking on 'Download attributes file with BioSample accessions'. The 'sample_name' column must match the external_ids used in sample_rename_map (or internal ids if sample_rename_map is omitted).
-
instrument_model
String?
-
-
sra_title
String
-
-
min_genome_coverage
Int?
-
-
sample_rename_map
File?
-
-
workspace_name
String?
-
-
terra_project
String?
-
-
collab_ids_tsv
File?
-
-
gcs_out_metrics
String?
-
-
gcs_out_cdc
String?
-
-
gcs_out_sra
String?
-
-
sample_meta_crsp
File?
-
-
id_salt
File
-
-
biosample_submit_tsv
File?
-
-
bioproject
String
-
-
ftp_config_js
File
-
-
samplesheets
Array[File]+
-
-
spikein_db
File
-
-
bmtaggerDbs
Array[File]?
-
-
blastDbs
Array[File]?
-
-
bwaDbs
Array[File]?
-
-
runinfo
File?
-
-
sequencingCenter
String?
-
-
barcode_columns_to_rev_comp
Array[String]?
-
-
flowcell
String?
-
-
minMismatchDelta
Int?
-
-
maxNoCalls
Int?
-
-
minimumQuality
Int?
-
-
threads
Int?
-
-
runStartDate
String?
-
-
maxRecordsInRam
Int?
-
-
numberOfNegativeControls
Int?
-
-
tileLimit
Int?
-
-
firstTile
Int?
-
-
machine_mem_gb
Int?
-
-
machine_mem_gb
Int?
-
-
query_chunk_size
Int?
-
-
title
String?
-
-
comment
String?
-
-
template
String?
-
-
tag
String?
-
-
ignore_analysis_files
String?
-
-
ignore_sample_names
String?
-
-
sample_names
File?
-
-
exclude_modules
Array[String]?
-
-
module_to_use
Array[String]?
-
-
output_data_format
String?
-
-
config
File?
-
-
config_yaml
String?
-
-
title
String?
-
-
comment
String?
-
-
template
String?
-
-
tag
String?
-
-
ignore_analysis_files
String?
-
-
ignore_sample_names
String?
-
-
sample_names
File?
-
-
exclude_modules
Array[String]?
-
-
module_to_use
Array[String]?
-
-
output_data_format
String?
-
-
config
File?
-
-
config_yaml
String?
-
-
sample_original_name
String?
-
-
novocraft_license
File?
-
-
machine_mem_gb
Int?
-
-
min_keep_length
Int?
-
-
sliding_window
Int?
-
-
primer_offset
Int?
-
-
machine_mem_gb
Int?
-
-
reheader_table
File?
-
-
amplicon_set
String?
-
-
max_coverage_depth
Int?
-
-
base_q_threshold
Int?
-
-
mapping_q_threshold
Int?
-
-
read_length_threshold
Int?
-
-
plotXLimits
String?
-
-
plotYLimits
String?
-
-
machine_mem_gb
Int?
-
-
reheader_table
File?
-
-
max_coverage_depth
Int?
-
-
base_q_threshold
Int?
-
-
mapping_q_threshold
Int?
-
-
read_length_threshold
Int?
-
-
plotXLimits
String?
-
-
plotYLimits
String?
-
-
vadr_model_tar
File?
-
-
vadr_model_tar_subdir
String?
-
-
root_sequence
File?
-
-
auspice_reference_tree_json
File?
-
-
pathogen_json
File?
-
-
gene_annotations_json
File?
-
-
min_length
Int?
-
-
max_ambig
Float?
-
-
analysis_mode
String?
-
-
timezone
String?
-
-
genome_status_json
File?
-
-
max_date
String?
-
-
min_date
String?
-
-
filter_to_ids
File?
-
-
filter_to_accession
String?
-
-
organism_name_override
String?
-
-
sequence_id_override
String?
-
-
isolate_prefix_override
String?
-
-
source_overrides_json
File?
-
-
author_template_sbt
File
-
-
spuid_namespace
String
-
-
account_name
String
-
-
username
String?
-
-
submitting_lab_name
String
-
-
min_date
String?
-
-
voc_list
String?
-
-
voi_list
String?
-
-
204 optional inputs with default values
min_genome_bases
Int
-
24000
max_vadr_alerts
Int
-
0
ntc_max_unambig
Int
-
3000
prod_test
String
-
"Production"
country
String
-
'USA'
ontology_map_states
String
-
'{"AL": "Alabama", "AK": "Alaska", "AZ": "Arizona", "AR": "Arkansas", "CA": "California", "CO": "Colorado", "CT": "Connecticut", "DE": "Delaware", "DC": "District of Columbia", "FL": "Florida", "GA": "Georgia", "HI": "Hawaii", "ID": "Idaho", "IL": "Illinois", "IN": "Indiana", "IA": "Iowa", "KS": "Kansas", "KY": "Kentucky", "LA": "Louisiana", "ME": "Maine", "MD": "Maryland", "MA": "Massachusetts", "MI": "Michigan", "MN": "Minnesota", "MS": "Mississippi", "MO": "Missouri", "MT": "Montana", "NE": "Nebraska", "NV": "Nevada", "NH": "New Hampshire", "NJ": "New Jersey", "NM": "New Mexico", "NY": "New York", "NC": "North Carolina", "ND": "North Dakota", "OH": "Ohio", "OK": "Oklahoma", "OR": "Oregon", "PA": "Pennsylvania", "RI": "Rhode Island", "SC": "South Carolina", "SD": "South Dakota", "TN": "Tennessee", "TX": "Texas", "UT": "Utah", "VT": "Vermont", "VA": "Virginia", "WA": "Washington", "WV": "West Virginia", "WI": "Wisconsin", "WY": "Wyoming"}'
ontology_map_body_part
String
-
'{"AN SWAB": "Anterior Nares", "AN Swab": "Anterior Nares", "Anterior Nares": "Anterior Nares", "Swab": "Upper respiratory tract", "Viral": "Upper respiratory tract", "Null": "Anterior Nares", "NP Swab": "Nasopharynx (NP)", "Nasopharynx (NP)": "Nasopharynx (NP)", "Oropharynx (OP)": "Oropharynx (OP)", "Other": "Not Provided"}'
prefix_map
String
-
'{"Broad Institute Clinical Research Sequencing Platform": "CRSP_", "Massachusetts General Hospital": "MGH_", "Rhode Island Department of Health": "RIDOH_", "Biobot Analytics": "Biobot_", "Flow Health":"FlowHealth_", "Colorado Mesa University":"CMU_", "Capture Diagnostics Hawaii":"Capture_", "Boston Medical Center":"BMC_", "University of Central Florida":"UCF_"}'
org_name_map
String
-
'{"Broad Institute Clinical Research Sequencing Platform": "Broad Institute Clinical Research Sequencing Platform", "Massachusetts General Hospital": "Massachusetts General Hospital", "RIDOH": "Rhode Island Department of Health", "BIOBOT": "Biobot Analytics", "FLOW":"Flow Health", "MESA":"Colorado Mesa University", "CAPTURE":"Capture Diagnostics Hawaii", "BUBMC":"Boston Medical Center", "UCF":"University of Central Florida"}'
allowed_purposes
String
-
'["Baseline surveillance (random sampling)", "Targeted surveillance (non-random sampling)", "Screening for Variants of Concern (VOC)", "Longitudinal surveillance (repeat sampling of individuals)", "Vaccine escape surveillance", "Cluster/Outbreak investigation"]'
sequencing_lab_prefix
String
-
'CDCBI'
docker
String
-
"quay.io/broadinstitute/py3-bio:0.1.2"
docker
String
-
"quay.io/broadinstitute/ncbi-tools:2.11.1"
docker
String
-
"quay.io/broadinstitute/ncbi-tools:2.11.1"
out_suffix
String
-
".txt"
prefer_first
Boolean
-
true
machine_mem_gb
Int
-
7
out_suffix
String
-
".txt"
prefer_first
Boolean
-
true
machine_mem_gb
Int
-
7
drop_empty
Boolean
-
true
drop_header
Boolean
-
true
out_name
String
-
"~{basename(basename(tsv,'.txt'),'.tsv')}-~{col}.txt"
sort_reads
Boolean
-
true
insert_demux_outputs_into_terra_tables
Boolean
-
false
revcomp_i5_indexes
Boolean
-
false
min_reads_per_bam
Int
-
100
default_sample_keys
Array[String]
-
["amplicon_set", "control", "batch_lib", "viral_ct"]
default_filename_keys
Array[String]
-
["spike_in", "batch_lib"]
revcomp
Boolean
-
true
docker
String
-
"quay.io/broadinstitute/py3-bio:0.1.2"
old_id_col
String
-
'internal_id'
new_id_col
String
-
'external_id'
rev_comp_barcodes_before_demux
Boolean
-
false
emit_unmatched_reads_bam
Boolean
-
false
minimumBaseQuality
Int?
-
10
maxMismatches
Int?
-
0
inner_barcode_trim_r1_right_of_barcode
Int
-
10
inner_barcode_predemux_trim_r1_3prime
Int
-
18
inner_barcode_predemux_trim_r2_5prime
Int
-
18
inner_barcode_predemux_trim_r2_3prime
Int
-
18
disk_size
Int
-
2625
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
topNHits
Int
-
3
filter_bam_to_proper_primary_mapped_reads
Boolean
-
true
do_not_require_proper_mapped_pairs_when_filtering
Boolean
-
false
keep_singletons_when_filtering
Boolean
-
false
keep_duplicates_when_filtering
Boolean
-
false
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
clear_tags
Boolean
-
false
tags_to_clear_space_separated
String
-
"XT X0 X1 XA AM SM BQ CT XN OC OP"
cpu
Int
-
8
machine_mem_gb
Int
-
15
docker
String
-
"quay.io/broadinstitute/viral-classify:2.5.1.0"
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
sample_table_name
String
-
"sample"
library_table_name
String
-
"library"
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
prefer_first
Boolean
-
true
machine_mem_gb
Int
-
7
sample_table_name
String
-
"sample"
docker
String
-
"python:slim"
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
docker
String
-
"schaluvadi/pathogen-genomic-surveillance:api-wdl"
out_dir
String
-
"./multiqc-output"
force
Boolean
-
false
full_names
Boolean
-
false
data_dir
Boolean
-
false
no_data_dir
Boolean
-
false
zip_data_dir
Boolean
-
false
export
Boolean
-
false
flat
Boolean
-
false
interactive
Boolean
-
true
lint
Boolean
-
false
pdf
Boolean
-
false
megaQC_upload
Boolean
-
false
docker
String
-
"quay.io/biocontainers/multiqc:1.32--pyhdfd78af_1"
out_dir
String
-
"./multiqc-output"
force
Boolean
-
false
full_names
Boolean
-
false
data_dir
Boolean
-
false
no_data_dir
Boolean
-
false
zip_data_dir
Boolean
-
false
export
Boolean
-
false
flat
Boolean
-
false
interactive
Boolean
-
true
lint
Boolean
-
false
pdf
Boolean
-
false
megaQC_upload
Boolean
-
false
docker
String
-
"quay.io/biocontainers/multiqc:1.32--pyhdfd78af_1"
output_prefix
String
-
"count_summary"
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
aligner
String
-
"minimap2"
align_to_ref_options
Map[String,String]
-
{"novoalign": "-r Random -l 40 -g 40 -x 20 -t 501 -k", "bwa": "-k 12 -B 1", "minimap2": ""}
align_to_self_options
Map[String,String]
-
{"novoalign": "-r Random -l 40 -g 40 -x 20 -t 100", "bwa": "", "minimap2": ""}
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
sample_name
String
-
basename(basename(basename(reads_unmapped_bam,".bam"),".taxfilt"),".clean")
min_quality
Int?
-
1
docker
String
-
"andersenlabapps/ivar:1.3.1"
bam_basename
String
-
basename(aligned_bam,".bam")
disk_size
Int
-
375
run_fastqc
Boolean
-
false
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
disk_size
Int
-
750
machine_mem_gb
Int
-
4
out_basename
String
-
basename(aligned_bam,'.bam')
docker
String
-
"quay.io/broadinstitute/viral-phylo:2.5.1.0"
max_amp_len
Int
-
5000
max_amplicons
Int
-
500
machine_mem_gb
Int
-
32
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
skip_mark_dupes
Boolean
-
false
plot_only_non_duplicates
Boolean
-
false
bin_large_plots
Boolean
-
false
binning_summary_statistic
String?
-
"max"
plot_width_pixels
Int?
-
1100
plot_height_pixels
Int?
-
850
plot_pixels_per_inch
Int?
-
100
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
mark_duplicates
Boolean
-
false
machine_mem_gb
Int
-
15
docker
String
-
"quay.io/broadinstitute/viral-assemble:2.5.1.0"
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
sample_name
String
-
basename(basename(basename(reads_unmapped_bam,".bam"),".taxfilt"),".clean")
run_fastqc
Boolean
-
false
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
disk_size
Int
-
750
machine_mem_gb
Int
-
4
out_basename
String
-
basename(aligned_bam,'.bam')
docker
String
-
"quay.io/broadinstitute/viral-phylo:2.5.1.0"
skip_mark_dupes
Boolean
-
false
plot_only_non_duplicates
Boolean
-
false
bin_large_plots
Boolean
-
false
binning_summary_statistic
String?
-
"max"
plot_width_pixels
Int?
-
1100
plot_height_pixels
Int?
-
850
plot_pixels_per_inch
Int?
-
100
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
add_header
Array[String]
-
[]
out_basename
String
-
basename(genome_fasta,".fasta")
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
out_basename
String
-
basename(genome_fasta,'.fasta')
docker
String
-
"mirror.gcr.io/staphb/vadr:1.6.4"
mem_size
Int
-
16
cpus
Int
-
4
cpus
Int
-
4
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
disk_size
Int
-
750
disk_size
Int
-
150
docker
String
-
"nextstrain/nextclade:3.18.0"
update_dbs_now
Boolean
-
false
docker
String
-
"quay.io/staphb/pangolin:4.3.3-pdata-1.36"
prefer_first
Boolean
-
true
machine_mem_gb
Int
-
7
prefer_first
Boolean
-
true
machine_mem_gb
Int
-
7
default_empty
Int
-
0
docker
String
-
"quay.io/broadinstitute/py3-bio:0.1.2"
out_suffix
String
-
".txt"
prefer_first
Boolean
-
true
machine_mem_gb
Int
-
7
out_suffix
String
-
".txt"
prefer_first
Boolean
-
true
machine_mem_gb
Int
-
7
out_suffix
String
-
".txt"
prefer_first
Boolean
-
true
machine_mem_gb
Int
-
7
collab_ids_idcol
String
-
'external_id'
collab_ids_addcols
Array[String]
-
['collaborator_id', 'hl7_message_id', 'matrix_id']
address_map
String
-
'{}'
authors_map
String
-
'{}'
docker
String
-
"quay.io/broadinstitute/py3-bio:0.1.2"
cpus
Int
-
4
out_fname
String
-
sub(sub(basename(sequences,".zst"),".vcf",".filtered.vcf"),".fasta$",".filtered.fasta")
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
disk_size
Int
-
750
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
disk_size
Int
-
750
out_fname
String
-
sub(sub(basename(sequences,".zst"),".vcf",".filtered.vcf"),".fasta$",".filtered.fasta")
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
disk_size
Int
-
750
biosample_col_for_fasta_headers
String
-
"sample_name"
src_to_attr_map
Map[String,String]
-
{}
sanitize_seq_ids
Boolean
-
true
out_basename
String
-
basename(basename(biosample_attributes,".txt"),".tsv")
docker
String
-
"python:slim"
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
out_fname
String
-
sub(sub(basename(sequences,".zst"),".vcf",".filtered.vcf"),".fasta$",".filtered.fasta")
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
disk_size
Int
-
750
wizard
String
-
"BankIt_SARSCoV2_api"
docker
String
-
"quay.io/broadinstitute/viral-baseimage:0.3.0"
continent
String
-
"North America"
strict
Boolean
-
true
address_map
String
-
'{}'
authors_map
String
-
'{}'
docker
String
-
"schaluvadi/pathogen-genomic-surveillance:api-wdl"
outname
String
-
"~{terra_project}-~{workspace_name}-~{table_name}.tsv"
docker
String
-
"schaluvadi/pathogen-genomic-surveillance:api-wdl"
sequencing_lab
String
-
"Broad Institute"
intro_blurb
String
-
"The Broad Institute Viral Genomics group, in partnership with the Genomics Platform and Data Sciences Platform, has been engaged in viral sequencing of COVID-19 patients since March 2020."
machine_mem_gb
Int
-
7
docker
String
-
"quay.io/broadinstitute/sc2-rmd:0.1.25"
Outputs
Name
Type
Expression
raw_reads_unaligned_bams
Array[File]
demux_deplete.raw_reads_unaligned_bams
cleaned_reads_unaligned_bams
Array[File]
demux_deplete.cleaned_reads_unaligned_bams
cleaned_bams_tiny
Array[File]
demux_deplete.cleaned_bams_tiny
aligned_trimmed_bams
Array[File]
assemble_refbased.align_to_ref_merged_aligned_trimmed_only_bam
meta_by_filename_json
File
demux_deplete.meta_by_filename_json
read_counts_raw
Array[Int]
demux_deplete.read_counts_raw
read_counts_depleted
Array[Int]
demux_deplete.read_counts_depleted
sra_metadata
File
select_first([demux_deplete.sra_metadata])
cleaned_bam_uris
File
select_first([demux_deplete.cleaned_bam_uris])
assemblies_fasta
Array[File]
assemble_refbased.assembly_fasta
max_ntc_bases
Int
ntc_max.out
ntc_rejected_batches
Array[String]
filter_bad_ntc_batches.reject_batches
ntc_rejected_lanes
Array[String]
filter_bad_ntc_batches.reject_lanes
demux_metrics
Array[File]
demux_deplete.demux_metrics
demux_commonBarcodes
Array[File]
demux_deplete.demux_commonBarcodes
demux_outlierBarcodes
Array[File]
demux_deplete.demux_outlierBarcodes
primer_trimmed_read_count
Array[Int]
flatten(assemble_refbased.primer_trimmed_read_count)
primer_trimmed_read_percent
Array[Float]
flatten(assemble_refbased.primer_trimmed_read_percent)
ivar_trim_stats_html
File
ivar_trim_stats.trim_stats_html
ivar_trim_stats_png
File
ivar_trim_stats.trim_stats_png
ivar_trim_stats_tsv
File
ivar_trim_stats.trim_stats_tsv
multiqc_report_raw
File
demux_deplete.multiqc_report_raw
multiqc_report_cleaned
File
demux_deplete.multiqc_report_cleaned
spikein_counts
File
demux_deplete.spikein_counts
picard_metrics_wgs
File
picard_wgs_merge.out_tsv
picard_metrics_alignment
File
picard_alignment_merge.out_tsv
assembly_stats_tsv
File
assembly_meta_tsv.combined
assembly_stats_final_tsv
File
sc2_meta_final.meta_tsv
assembly_stats_relineage_tsv
File
sarscov2_batch_relineage.assembly_stats_relineage_tsv
assembly_stats_final_relineage_tsv
File
sc2_meta_final.meta_tsv
submission_zip
File
package_genbank_ftp_submission.submission_zip
submission_xml
File
package_genbank_ftp_submission.submission_xml
submit_ready
File
package_genbank_ftp_submission.submit_ready
vadr_outputs
Array[File]
select_all(vadr.outputs_tgz)
genbank_source_table
File
biosample_to_genbank.genbank_source_modifier_table
gisaid_fasta
File
prefix_gisaid.renamed_fasta
gisaid_meta_csv
File
gisaid_meta_prep.meta_csv
genbank_fasta
File
submit_genomes.filtered_fasta
nextmeta_tsv
File
nextmeta_prep.nextmeta_tsv
nextclade_all_json
File
sarscov2_batch_relineage.nextclade_all_json
nextclade_all_tsv
File
sarscov2_batch_relineage.nextclade_all_tsv
nextclade_auspice_json
File
sarscov2_batch_relineage.nextclade_auspice_json
nextalign_msa
File
sarscov2_batch_relineage.nextalign_msa
pangolin_report
File
sarscov2_batch_relineage.pangolin_report
pangolin_msa
File
sarscov2_batch_relineage.pangolin_msa
passing_fasta
File
passing_cat.filtered_fasta
assembled_ids
Array[String]
select_all(passing_assembly_ids)
submittable_ids
Array[String]
read_lines(filter_bad_ntc_batches.seqids_kept)
failed_assembly_ids
Array[String]
select_all(failed_assembly_id)
failed_annotation_ids
Array[String]
select_all(failed_annotation_id)
num_read_files
Int
length(demux_deplete.cleaned_reads_unaligned_bams)
num_assembled
Int
length(select_all(passing_assemblies))
num_failed_assembly
Int
length(select_all(failed_assembly_id))
num_submittable
Int
filter_bad_ntc_batches.num_kept
num_failed_annotation
Int
length(select_all(failed_annotation_id))
num_samples
Int
length(group_bams_by_sample.sample_names)
run_date
String
demux_deplete.run_date
run_id
String
demux_deplete.run_id
sequencing_reports
File?
sequencing_report.all_zip
id_map_tsv
File?
sarscov2_biosample_load.id_map_tsv
biosample_attributes_out
Array[File]
select_all(flatten([[sarscov2_biosample_load.biosample_attributes], biosample_attributes]))
data_tables_out
Array[String]
select_first([data_tables.tables, []])
Calls
This workflow calls the following tasks or subworkflows:
No explicit input mappings
Input Mappings (3)
Input
Value
input_tsvs
select_all(flatten([[sarscov2_biosample_load.biosample_attributes], biosample_attributes]))
id_col
'accession'
out_basename
"biosample_attributes-merged"
Input Mappings (2)
Input
Value
tsv
biosample_merge.out_tsv
col
'sample_name'
Input Mappings (6)
Input
Value
flowcell_tgz
flowcell_tgz
biosample_map_tsvs
[biosample_merge.out_tsv]
instrument_model_user_specified
instrument_model
sra_title
sra_title
read_structure
read_structure
sample_rename_map
select_first([sample_rename_map, sarscov2_biosample_load.id_map_tsv])
Input Mappings (1)
Input
Value
bam_filepaths
demux_deplete.cleaned_reads_unaligned_bams
Input Mappings (4)
Input
Value
infile
amplicon_bed_prefix + demux_deplete.meta_by_sample[name_reads.left]["amplicon_set"] + ".bed"
outfilename
demux_deplete.meta_by_sample[name_reads.left]["amplicon_set"] + ".bed"
search
"MN908947.3"
replace
"NC_045512.2"
Input Mappings (7)
Input
Value
reads_unmapped_bams
name_reads.right
reference_fasta
reference_fasta
sample_name
name_reads.left
skip_mark_dupes
ampseq
trim_coords_bed
bed_rename.outfile
major_cutoff
0.75
min_coverage
if defined(min_genome_coverage) then min_genome_coverage else if ampseq then 50 else 3
CALL
TASKS
biosample
↗
→ fetch_row_from_tsv
Input Mappings (4)
Input
Value
tsv
biosample_merge.out_tsv
idx_col
"sample_name"
idx_val
orig_name
set_default_keys
["collection_date", "bioproject_accession", "accession", "collected_by", "geo_loc_name", "host_subject_id", "host_age", "host_sex", "purpose_of_sequencing", "anatomical_material", "anatomical_part", "body_product"]
Input Mappings (2)
Input
Value
genome_fasta
assemble_refbased.assembly_fasta
new_name
orig_name
Input Mappings (4)
Input
Value
genome_fasta
assemble_refbased.assembly_fasta
vadr_opts
"--glsearch -s -r --nomisc --mkey sarscov2 --lowsim5seq 6 --lowsim3seq 6 --alt_fail lowscore,insertnn,deletinn"
minlen
50
maxlen
30000
Input Mappings (2)
Input
Value
infiles
[write_tsv([assembly_tsv_header]), write_tsv(assembly_tsv_row)]
output_name
"assembly_metadata-~{flowcell_id}.tsv"
Input Mappings (5)
Input
Value
flowcell_id
flowcell_id
genomes_fasta
assemble_refbased.assembly_fasta
metadata_annotated_tsv
assembly_meta_tsv.combined
metadata_raw_tsv
assembly_meta_tsv.combined
min_genome_bases
min_genome_bases
Input Mappings (4)
Input
Value
seqid_list
write_lines(select_all(passing_assembly_ids))
demux_meta_by_sample_json
demux_deplete.meta_by_sample_json
assembly_meta_tsv
sarscov2_batch_relineage.assembly_stats_relineage_tsv
ntc_min_unambig
ntc_max_unambig
Input Mappings (1)
Input
Value
list
select_all(ntc_bases)
Input Mappings (3)
Input
Value
ivar_trim_stats_tsv
write_tsv(flatten(assemble_refbased.ivar_trim_stats_tsv))
flowcell
flowcell_id
out_basename
"ivar_trim_stats-~{flowcell_id}"
Input Mappings (3)
Input
Value
input_tsvs
assemble_refbased.picard_metrics_wgs
id_col
'sample_sanitized'
out_basename
"picard_metrics_wgs-~{flowcell_id}"
Input Mappings (3)
Input
Value
input_tsvs
assemble_refbased.picard_metrics_alignment
id_col
'sample_sanitized'
out_basename
"picard_metrics_alignment-~{flowcell_id}"
Input Mappings (3)
Input
Value
input_tsvs
assemble_refbased.picard_metrics_insert_size
id_col
'sample_sanitized'
out_basename
"picard_metrics_insertsize-~{flowcell_id}"
Input Mappings (2)
Input
Value
infiles
assemble_refbased.samtools_ampliconstats_parsed
out_filename
"samtools_ampliconstats-~{flowcell_id}.txt"
Input Mappings (5)
Input
Value
assembly_stats_tsv
sarscov2_batch_relineage.assembly_stats_relineage_tsv
collab_ids_tsv
select_first([collab_ids_tsv, sarscov2_biosample_load.collab_ids_tsv])
drop_file_cols
true
min_unambig
min_genome_bases
genome_status_json
filter_bad_ntc_batches.fail_meta_json
Input Mappings (2)
Input
Value
infiles
select_all(passing_assemblies)
output_name
"assemblies_passing-~{flowcell_id}.prefilter.fasta"
CALL
TASKS
passing_ntc
↗
→ filter_sequences_to_list
Input Mappings (2)
Input
Value
sequences
passing_cat_prefilter.combined
keep_list
[filter_bad_ntc_batches.seqids_kept]
CALL
TASKS
passing_cat
↗
→ filter_sequences_to_list
Input Mappings (3)
Input
Value
sequences
passing_ntc.filtered_fasta
keep_list
[accessioned_samples.out_txt]
out_fname
"assemblies_passing-~{flowcell_id}.fasta"
Input Mappings (2)
Input
Value
sequences
passing_cat.filtered_fasta
keep_list
[write_lines(select_all(submittable_id))]
Input Mappings (4)
Input
Value
biosample_attributes
biosample_merge.out_tsv
num_segments
1
taxid
taxid
filter_to_ids
submittable_filter.ids_kept
Input Mappings (2)
Input
Value
assembly_stats_tsv
write_tsv(flatten([[['SeqID', 'Assembly Method', 'Coverage', 'Sequencing Technology']], select_all(assembly_cmt)]))
filter_to_ids
biosample_to_genbank.sample_ids
CALL
TASKS
submit_genomes
↗
→ filter_sequences_to_list
Input Mappings (2)
Input
Value
sequences
submittable_filter.filtered_fasta
keep_list
[biosample_to_genbank.sample_ids]
Input Mappings (5)
Input
Value
sequences_fasta
submit_genomes.filtered_fasta
source_modifier_table
biosample_to_genbank.genbank_source_modifier_table
structured_comment_table
structured_comments.structured_comment_table
submission_name
flowcell_id
submission_uid
flowcell_id
CALL
TASKS
prefix_gisaid
↗
→ prefix_fasta_header
Input Mappings (3)
Input
Value
genome_fasta
submit_genomes.filtered_fasta
prefix
gisaid_prefix
out_basename
"gisaid-sequences-~{flowcell_id}"
Input Mappings (4)
Input
Value
source_modifier_table
biosample_to_genbank.genbank_source_modifier_table
structured_comments
structured_comments.structured_comment_table
fasta_filename
"gisaid-sequences-~{flowcell_id}.fasta"
out_name
"gisaid-meta-~{flowcell_id}.csv"
Input Mappings (4)
Input
Value
gisaid_meta
gisaid_meta_prep.meta_csv
assembly_meta
sarscov2_batch_relineage.assembly_stats_relineage_tsv
out_name
"nextmeta-~{flowcell_id}.tsv"
filter_to_ids
filter_bad_ntc_batches.seqids_kept
CALL
TASKS
data_tables
↗
→ upload_reads_assemblies_entities_tsv
Input Mappings (5)
Input
Value
workspace_name
select_first([workspace_name])
terra_project
select_first([terra_project])
tsv_file
sarscov2_batch_relineage.assembly_stats_relineage_tsv
cleaned_reads_unaligned_bams_string
demux_deplete.cleaned_reads_unaligned_bams
meta_by_filename_json
demux_deplete.meta_by_filename_json
Input Mappings (4)
Input
Value
workspace_name
select_first([workspace_name])
terra_project
select_first([terra_project])
table_name
'assemblies'
nop_input_string
data_tables.tables[0]
Input Mappings (4)
Input
Value
assembly_stats_tsv
download_entities_tsv.tsv_file
collab_ids_tsv
select_first([collab_ids_tsv, sarscov2_biosample_load.collab_ids_tsv])
max_date
demux_deplete.run_date
min_unambig
min_genome_bases
Input Mappings (2)
Input
Value
infiles
flatten([[assembly_meta_tsv.combined, sc2_meta_final.meta_tsv, ivar_trim_stats.trim_stats_tsv, demux_deplete.multiqc_report_raw, demux_deplete.multiqc_report_cleaned, demux_deplete.spikein_counts, picard_wgs_merge.out_tsv, picard_alignment_merge.out_tsv, picard_insertsize_merge.out_tsv, samtools_ampliconstats_merge.out_tsv, sarscov2_batch_relineage.nextclade_all_json, sarscov2_batch_relineage.nextclade_all_tsv], demux_deplete.demux_metrics])
gcs_uri_prefix
"~{gcs_out_metrics}/~{flowcell_id}/"
Input Mappings (2)
Input
Value
infiles
[sc2_meta_final.meta_tsv, passing_cat.filtered_fasta, gisaid_meta_prep.meta_csv, prefix_gisaid.renamed_fasta, package_genbank_ftp_submission.submission_zip, select_first([demux_deplete.sra_metadata])]
gcs_uri_prefix
"~{gcs_out_cdc}/~{demux_deplete.run_date}/~{flowcell_id}/"
Input Mappings (2)
Input
Value
infiles
assemble_refbased.align_to_ref_merged_aligned_trimmed_only_bam
gcs_uri_prefix
"~{gcs_out_cdc}/~{demux_deplete.run_date}/~{flowcell_id}/rawfiles/"
Input Mappings (2)
Input
Value
infiles
demux_deplete.cleaned_reads_unaligned_bams
gcs_uri_prefix
"~{gcs_out_sra}/~{flowcell_id}/"
Input Mappings (2)
Input
Value
infiles
[select_first([demux_deplete.sra_metadata])]
gcs_uri_prefix
"~{gcs_out_sra}/"
Images
Container images used by tasks in this workflow:
quay.io/broadinstitute/viral-core:2.5.1
Used by 10 tasks:
biosample_merge
picard_wgs_merge
picard_alignment_merge
picard_insertsize_merge
passing_ntc
passing_cat
submittable_filter
structured_comments
submit_genomes
rename_fasta_header
python:slim
Used by 9 tasks:
accessioned_samples
group_bams_by_sample
filter_bad_ntc_batches
ntc_max
biosample_to_genbank
prefix_gisaid
gisaid_meta_prep
nextmeta_prep
biosample
ubuntu
Used by 4 tasks:
assembly_meta_tsv
samtools_ampliconstats_merge
passing_cat_prefilter
bed_rename
⚙️ Parameterized
Configured via input:
docker
Used by 2 tasks:
ivar_trim_stats
sc2_meta_final
⚙️ Parameterized
Configured via input:
docker
Used by 6 tasks:
package_genbank_ftp_submission
gcs_metrics_dump
gcs_cdc_dump
gcs_cdc_dump_reads
gcs_sra_dump_reads
gcs_sra_dump
⚙️ Parameterized
Configured via input:
docker
Used by 2 tasks:
data_tables
download_entities_tsv
⚙️ Parameterized
Configured via input:
docker
⚙️ Parameterized
Configured via input:
docker
Zoom In
Zoom Out
Fit
Reset
🖱️ Scroll to zoom • Drag to pan • Double-click to reset • ESC to close
flowchart TD
Start([sarscov2_illumina_full])
subgraph C1 ["↔️ if length(biosample_attributes) == 0"]
direction TB
N1[/"sarscov2_biosample_load"/]
end
N2["biosample_merge (tsv_join)"]
N3["accessioned_samples (fetch_col_from_tsv)"]
N4[/"demux_deplete"/]
N5["group_bams_by_sample"]
subgraph S1 ["🔃 scatter name_reads in zip(group_bams_by_sample.sample_names, group_bams_by_sample.grouped_bam_filepaths)"]
direction TB
subgraph C2 ["↔️ if ampseq"]
direction TB
N6["bed_rename (sed)"]
end
N7[/"assemble_refbased"/]
N8["biosample (fetch_row_from_tsv)"]
subgraph C3 ["↔️ if assemble_refbased.assembly_length_unambiguous >= min_genome_bases"]
direction TB
N9["rename_fasta_header"]
N10["vadr"]
end
end
N11["assembly_meta_tsv (concatenate)"]
N12[/"sarscov2_batch_relineage"/]
N13["filter_bad_ntc_batches"]
N14["ntc_max (max)"]
N15["ivar_trim_stats"]
N16["picard_wgs_merge (tsv_join)"]
N17["picard_alignment_merge (tsv_join)"]
N18["picard_insertsize_merge (tsv_join)"]
N19["samtools_ampliconstats_merge (cat_except_headers)"]
N20["sc2_meta_final"]
N21["passing_cat_prefilter (concatenate)"]
N22["passing_ntc (filter_sequences_to_list)"]
N23["passing_cat (filter_sequences_to_list)"]
N24["submittable_filter (filter_sequences_to_list)"]
N25["biosample_to_genbank"]
N26["structured_comments"]
N27["submit_genomes (filter_sequences_to_list)"]
N28["package_genbank_ftp_submission (package_special_genbank_ftp_submission)"]
N29["prefix_gisaid (prefix_fasta_header)"]
N30["gisaid_meta_prep"]
N31["nextmeta_prep"]
subgraph C4 ["↔️ if defined(workspace_name) && defined(terra_project)"]
direction TB
N32["data_tables (upload_reads_assemblies_entities_tsv)"]
N33["download_entities_tsv"]
N34["sequencing_report"]
end
subgraph C5 ["↔️ if defined(gcs_out_metrics)"]
direction TB
N35["gcs_metrics_dump (gcs_copy)"]
end
subgraph C6 ["↔️ if defined(gcs_out_cdc)"]
direction TB
N36["gcs_cdc_dump (gcs_copy)"]
N37["gcs_cdc_dump_reads (gcs_copy)"]
end
subgraph C7 ["↔️ if defined(gcs_out_sra)"]
direction TB
N38["gcs_sra_dump_reads (gcs_copy)"]
N39["gcs_sra_dump (gcs_copy)"]
end
N1 --> N2
N2 --> N3
N2 --> N4
N1 --> N4
N4 --> N5
N5 --> N6
N4 --> N6
N5 --> N7
N4 --> N7
N6 --> N7
N5 --> N8
N4 --> N8
N2 --> N8
N5 --> N9
N7 --> N9
N4 --> N9
N5 --> N10
N7 --> N10
N7 --> N11
N4 --> N11
N8 --> N11
N10 --> N11
N11 --> N12
N7 --> N12
N4 --> N12
N4 --> N13
N12 --> N13
N7 --> N14
N7 --> N15
N4 --> N15
N7 --> N16
N7 --> N17
N7 --> N18
N7 --> N19
N13 --> N20
N12 --> N20
N1 --> N20
N9 --> N21
N13 --> N22
N21 --> N22
N22 --> N23
N3 --> N23
N4 --> N24
N23 --> N24
N24 --> N25
N2 --> N25
N7 --> N26
N4 --> N26
N25 --> N26
N24 --> N27
N25 --> N27
N25 --> N28
N26 --> N28
N27 --> N28
N4 --> N28
N27 --> N29
N26 --> N30
N25 --> N30
N30 --> N31
N13 --> N31
N12 --> N31
N4 --> N32
N12 --> N32
N32 --> N33
N4 --> N34
N33 --> N34
N1 --> N34
N17 --> N35
N20 --> N35
N15 --> N35
N11 --> N35
N4 --> N35
N12 --> N35
N16 --> N35
N19 --> N35
N18 --> N35
N23 --> N36
N29 --> N36
N30 --> N36
N20 --> N36
N4 --> N36
N28 --> N36
N7 --> N37
N4 --> N38
N4 --> N39
Start --> N1
N37 --> End([End])
N14 --> End([End])
N34 --> End([End])
N31 --> End([End])
N38 --> End([End])
N36 --> End([End])
N39 --> End([End])
N35 --> End([End])
classDef taskNode fill:#a371f7,stroke:#8b5cf6,stroke-width:2px,color:#fff
classDef workflowNode fill:#58a6ff,stroke:#1f6feb,stroke-width:2px,color:#fff
version 1.0
#DX_SKIP_WORKFLOW
import "../tasks/tasks_read_utils.wdl" as read_utils
import "../tasks/tasks_ncbi.wdl" as ncbi
import "../tasks/tasks_nextstrain.wdl" as nextstrain
import "../tasks/tasks_sarscov2.wdl" as sarscov2
import "../tasks/tasks_terra.wdl" as terra
import "../tasks/tasks_assembly.wdl" as assembly
import "../tasks/tasks_utils.wdl" as utils
import "demux_deplete.wdl"
import "assemble_refbased.wdl"
import "sarscov2_batch_relineage.wdl"
import "sarscov2_biosample_load.wdl"
# End-to-end SARS-CoV-2 pipeline: demux/deplete a raw Illumina flowcell tarball,
# assemble each biosample against a reference, run VADR QC and lineage calling,
# then package GenBank/GISAID/SRA submissions and optional GCS/Terra deliveries.
workflow sarscov2_illumina_full {
meta {
description: "Full SARS-CoV-2 analysis workflow starting from raw Illumina flowcell (tar.gz) and metadata and performing assembly, spike-in analysis, qc, lineage assignment, and packaging for data release."
author: "Broad Viral Genomics"
email: "viral-ngs@broadinstitute.org"
allowNestedInputs: true
}
parameter_meta {
reference_fasta: {
description: "Reference genome to align reads to.",
patterns: ["*.fasta"]
}
amplicon_bed_prefix: {
description: "amplicon primers to trim in reference coordinate space (0-based BED format)",
patterns: ["*.bed"]
}
biosample_attributes: {
description: "A post-submission attributes file from NCBI BioSample, which is available at https://submit.ncbi.nlm.nih.gov/subs/ and clicking on 'Download attributes file with BioSample accessions'. The 'sample_name' column must match the external_ids used in sample_rename_map (or internal ids if sample_rename_map is omitted).",
patterns: ["*.txt", "*.tsv"]
}
}
input {
File flowcell_tgz
File reference_fasta
String amplicon_bed_prefix
String? read_structure
Array[File] biosample_attributes
String? instrument_model
String sra_title
# minimum unambiguous bases for a genome to count as "passing" assembly
Int min_genome_bases = 24000
# genomes with more VADR alerts than this are excluded from submission
Int max_vadr_alerts = 0
# NOTE(review): despite the "max" name, this value is passed to the
# filter_bad_ntc_batches task's ntc_min_unambig input below — confirm intent
Int ntc_max_unambig = 3000
Int? min_genome_coverage
File? sample_rename_map
String? workspace_name
String? terra_project
File? collab_ids_tsv
# optional gs:// URI prefixes; each enables the corresponding delivery block
String? gcs_out_metrics
String? gcs_out_cdc
String? gcs_out_sra
}
# NCBI taxonomy ID for SARS-CoV-2
Int taxid = 2697049
# header prefix required by GISAID for hCoV-19 submissions
String gisaid_prefix = 'hCoV-19/'
# Broad production pipeline only: metadata ETL and NCBI BioSample registration
# Only invoked when the user supplied no biosample_attributes files at all.
if(length(biosample_attributes) == 0) {
call sarscov2_biosample_load.sarscov2_biosample_load
}
# merge biosample attributes tables
# select_all drops the optional load output when the conditional above did not run
call utils.tsv_join as biosample_merge {
input:
input_tsvs = select_all(flatten([[sarscov2_biosample_load.biosample_attributes], biosample_attributes])),
id_col = 'accession',
out_basename = "biosample_attributes-merged"
}
# list of sample names that have BioSample accessions; used later to drop
# un-accessioned genomes (e.g. control libraries) from the passing set
call utils.fetch_col_from_tsv as accessioned_samples {
input:
tsv = biosample_merge.out_tsv,
col = 'sample_name'
}
### demux, deplete, SRA submission prep, fastqc/multiqc
call demux_deplete.demux_deplete {
input:
flowcell_tgz = flowcell_tgz,
biosample_map_tsvs = [biosample_merge.out_tsv],
instrument_model_user_specified = instrument_model,
sra_title = sra_title,
read_structure = read_structure,
# user-provided rename map wins; else fall back to the production ETL's map
sample_rename_map = select_first([sample_rename_map, sarscov2_biosample_load.id_map_tsv])
}
String flowcell_id = demux_deplete.run_id
### gather data by biosample
call read_utils.group_bams_by_sample {
input:
bam_filepaths = demux_deplete.cleaned_reads_unaligned_bams
}
### assembly and analyses per biosample
# name_reads.left = sample name, name_reads.right = its list of cleaned bams
scatter(name_reads in zip(group_bams_by_sample.sample_names, group_bams_by_sample.grouped_bam_filepaths)) {
# a non-empty amplicon_set marks an amplicon-sequenced (vs shotgun) sample
Boolean ampseq = (demux_deplete.meta_by_sample[name_reads.left]["amplicon_set"] != "")
String orig_name = demux_deplete.meta_by_sample[name_reads.left]["sample_original"]
# assemble genome
if (ampseq) {
# rewrite primer BED chrom names from GenBank (MN908947.3) to RefSeq (NC_045512.2)
call utils.sed as bed_rename {
input:
infile = amplicon_bed_prefix + demux_deplete.meta_by_sample[name_reads.left]["amplicon_set"] + ".bed",
outfilename = demux_deplete.meta_by_sample[name_reads.left]["amplicon_set"] + ".bed",
search = "MN908947.3",
replace = "NC_045512.2"
}
}
call assemble_refbased.assemble_refbased {
input:
reads_unmapped_bams = name_reads.right,
reference_fasta = reference_fasta,
sample_name = name_reads.left,
# amplicon data: dupes are expected, so skip dup-marking; trim primers instead
skip_mark_dupes = ampseq,
trim_coords_bed = bed_rename.outfile,
major_cutoff = 0.75,
# explicit override, else 50x for amplicon and 3x for shotgun
min_coverage = if defined(min_genome_coverage) then min_genome_coverage else (if ampseq then 50 else 3)
}
# log controls
# record unambiguous length of no-template controls for contamination QC
if (demux_deplete.meta_by_sample[name_reads.left]["control"] == 'NTC') {
Int ntc_bases = assemble_refbased.assembly_length_unambiguous
}
# grab biosample metadata
# set_default_keys ensures these columns exist (empty) even if absent upstream
call utils.fetch_row_from_tsv as biosample {
input:
tsv = biosample_merge.out_tsv,
idx_col = "sample_name",
idx_val = orig_name,
set_default_keys = ["collection_date", "bioproject_accession", "accession", "collected_by", "geo_loc_name", "host_subject_id", "host_age", "host_sex", "purpose_of_sequencing", "anatomical_material", "anatomical_part", "body_product"]
}
# for genomes that somewhat assemble
if (assemble_refbased.assembly_length_unambiguous >= min_genome_bases) {
call ncbi.rename_fasta_header {
input:
genome_fasta = assemble_refbased.assembly_fasta,
new_name = orig_name
}
File passing_assemblies = rename_fasta_header.renamed_fasta
String passing_assembly_ids = orig_name
# one row of the structured-comment table consumed by ncbi.structured_comments
Array[String] assembly_cmt = [orig_name, "Broad viral-ngs v. " + demux_deplete.demux_viral_core_version, assemble_refbased.assembly_mean_coverage, demux_deplete.instrument_model_inferred]
# VADR annotation & QC
call ncbi.vadr {
input:
genome_fasta = assemble_refbased.assembly_fasta,
vadr_opts = "--glsearch -s -r --nomisc --mkey sarscov2 --lowsim5seq 6 --lowsim3seq 6 --alt_fail lowscore,insertnn,deletinn",
minlen = 50,
maxlen = 30000
}
# partition passing assemblies into submittable vs failed-annotation
if (vadr.num_alerts<=max_vadr_alerts) {
String submittable_id = orig_name
}
if (vadr.num_alerts>max_vadr_alerts) {
String failed_annotation_id = orig_name
}
}
if (assemble_refbased.assembly_length_unambiguous < min_genome_bases) {
String failed_assembly_id = orig_name
}
# per-sample metadata row; column order must match assembly_tsv_header below
Array[String] assembly_tsv_row = [
orig_name,
name_reads.left,
biosample.map["accession"],
flowcell_id,
demux_deplete.run_date,
biosample.map["collection_date"],
biosample.map["geo_loc_name"],
biosample.map["host_subject_id"],
assemble_refbased.assembly_length_unambiguous,
assemble_refbased.assembly_mean_coverage,
assemble_refbased.dist_to_ref_snps,
assemble_refbased.dist_to_ref_indels,
# empty string when VADR did not run (genome failed the min_genome_bases gate)
select_first([vadr.num_alerts, ""]),
assemble_refbased.assembly_fasta,
assemble_refbased.align_to_ref_merged_coverage_plot,
assemble_refbased.align_to_ref_merged_aligned_trimmed_only_bam,
assemble_refbased.replicate_discordant_vcf,
assemble_refbased.align_to_ref_variants_vcf_gz,
select_first([vadr.outputs_tgz, ""]),
demux_deplete.meta_by_sample[name_reads.left]["amplicon_set"],
assemble_refbased.replicate_concordant_sites,
assemble_refbased.replicate_discordant_snps,
assemble_refbased.replicate_discordant_indels,
assemble_refbased.num_read_groups,
assemble_refbased.num_libraries,
assemble_refbased.align_to_ref_merged_reads_aligned,
assemble_refbased.align_to_ref_merged_bases_aligned,
select_first([vadr.alerts_list, ""]),
biosample.map["purpose_of_sequencing"],
biosample.map["collected_by"],
biosample.map["bioproject_accession"],
biosample.map["host_age"],
biosample.map["host_sex"],
# 'zip' column is intentionally left blank
"",
biosample.map["anatomical_material"],
biosample.map["anatomical_part"],
biosample.map["body_product"],
demux_deplete.meta_by_sample[name_reads.left]["viral_ct"]
]
}
# header for the per-sample rows above; keep in sync with assembly_tsv_row
Array[String] assembly_tsv_header = [
'sample', 'sample_sanitized', 'biosample_accession', 'flowcell_id', 'run_date', 'collection_date', 'geo_loc_name', 'host_subject_id',
'assembly_length_unambiguous', 'assembly_mean_coverage',
'dist_to_ref_snps', 'dist_to_ref_indels', 'vadr_num_alerts',
'assembly_fasta', 'coverage_plot', 'aligned_bam',
'replicate_discordant_vcf', 'variants_from_ref_vcf',
'vadr_tgz',
'amplicon_set',
'replicate_concordant_sites', 'replicate_discordant_snps', 'replicate_discordant_indels', 'num_read_groups', 'num_libraries',
'align_to_ref_merged_reads_aligned', 'align_to_ref_merged_bases_aligned',
'vadr_alerts', 'purpose_of_sequencing', 'collected_by', 'bioproject_accession',
'age', 'sex', 'zip', "anatomical_material", "anatomical_part", "body_product",
'Ct'
]
### summary stats
call utils.concatenate as assembly_meta_tsv {
input:
infiles = [write_tsv([assembly_tsv_header]), write_tsv(assembly_tsv_row)],
output_name = "assembly_metadata-~{flowcell_id}.tsv"
}
# nextclade and pangolin on full data set
call sarscov2_batch_relineage.sarscov2_batch_relineage {
input:
flowcell_id = flowcell_id,
genomes_fasta = assemble_refbased.assembly_fasta, # TO DO: can this just be [passing_cat_prefilter.combined]?
metadata_annotated_tsv = assembly_meta_tsv.combined,
metadata_raw_tsv = assembly_meta_tsv.combined,
min_genome_bases = min_genome_bases
}
### mark up the bad batches or lanes where NTCs assemble
call assembly.filter_bad_ntc_batches {
input:
seqid_list = write_lines(select_all(passing_assembly_ids)),
demux_meta_by_sample_json = demux_deplete.meta_by_sample_json,
assembly_meta_tsv = sarscov2_batch_relineage.assembly_stats_relineage_tsv,
# NOTE(review): workflow input ntc_max_unambig feeds the task's
# ntc_min_unambig parameter — verify the threshold direction is intended
ntc_min_unambig = ntc_max_unambig
}
### QC metrics
# largest unambiguous NTC assembly length across the flowcell
call read_utils.max as ntc_max {
input:
list = select_all(ntc_bases)
}
call assembly.ivar_trim_stats {
input:
ivar_trim_stats_tsv = write_tsv(flatten(assemble_refbased.ivar_trim_stats_tsv)),
flowcell = flowcell_id,
out_basename = "ivar_trim_stats-~{flowcell_id}"
}
call utils.tsv_join as picard_wgs_merge {
input:
input_tsvs = assemble_refbased.picard_metrics_wgs,
id_col = 'sample_sanitized',
out_basename = "picard_metrics_wgs-~{flowcell_id}"
}
call utils.tsv_join as picard_alignment_merge {
input:
input_tsvs = assemble_refbased.picard_metrics_alignment,
id_col = 'sample_sanitized',
out_basename = "picard_metrics_alignment-~{flowcell_id}"
}
call utils.tsv_join as picard_insertsize_merge {
input:
input_tsvs = assemble_refbased.picard_metrics_insert_size,
id_col = 'sample_sanitized',
out_basename = "picard_metrics_insertsize-~{flowcell_id}"
}
call utils.cat_except_headers as samtools_ampliconstats_merge {
input:
infiles = assemble_refbased.samtools_ampliconstats_parsed,
out_filename = "samtools_ampliconstats-~{flowcell_id}.txt"
}
### filter and concatenate final sets for delivery ("passing" and "submittable")
call sarscov2.sc2_meta_final {
# this decorates assembly_meta_tsv with collab/internal IDs, genome_status, and many other columns
input:
assembly_stats_tsv = sarscov2_batch_relineage.assembly_stats_relineage_tsv,
collab_ids_tsv = select_first([collab_ids_tsv, sarscov2_biosample_load.collab_ids_tsv]),
drop_file_cols = true,
min_unambig = min_genome_bases,
genome_status_json = filter_bad_ntc_batches.fail_meta_json
}
call utils.concatenate as passing_cat_prefilter {
# this emits a fasta of only genomes that pass min_unambig
input:
infiles = select_all(passing_assemblies),
output_name = "assemblies_passing-~{flowcell_id}.prefilter.fasta"
}
call nextstrain.filter_sequences_to_list as passing_ntc {
# this drops all genomes that are failed_NTC
input:
sequences = passing_cat_prefilter.combined,
keep_list = [filter_bad_ntc_batches.seqids_kept]
}
call nextstrain.filter_sequences_to_list as passing_cat {
# this drops all genomes that don't have BioSample accessions (e.g. control libraries)
input:
sequences = passing_ntc.filtered_fasta,
keep_list = [accessioned_samples.out_txt],
out_fname = "assemblies_passing-~{flowcell_id}.fasta"
}
call nextstrain.filter_sequences_to_list as submittable_filter {
# this drops all failed_annotation (aka VADR fails)
input:
sequences = passing_cat.filtered_fasta,
keep_list = [write_lines(select_all(submittable_id))]
}
### prep genbank submission
call ncbi.biosample_to_genbank {
# this takes a BioSample attributes file and emits a Genbank Source Modifier Table
input:
biosample_attributes = biosample_merge.out_tsv,
num_segments = 1,
taxid = taxid,
filter_to_ids = submittable_filter.ids_kept
}
call ncbi.structured_comments {
input:
assembly_stats_tsv = write_tsv(flatten([[['SeqID','Assembly Method','Coverage','Sequencing Technology']],select_all(assembly_cmt)])),
filter_to_ids = biosample_to_genbank.sample_ids
}
call nextstrain.filter_sequences_to_list as submit_genomes {
input:
sequences = submittable_filter.filtered_fasta,
keep_list = [biosample_to_genbank.sample_ids]
}
call ncbi.package_special_genbank_ftp_submission as package_genbank_ftp_submission {
input:
sequences_fasta = submit_genomes.filtered_fasta,
source_modifier_table = biosample_to_genbank.genbank_source_modifier_table,
structured_comment_table = structured_comments.structured_comment_table,
submission_name = flowcell_id,
submission_uid = flowcell_id
}
### prep gisaid submission
call ncbi.prefix_fasta_header as prefix_gisaid {
input:
genome_fasta = submit_genomes.filtered_fasta,
prefix = gisaid_prefix,
out_basename = "gisaid-sequences-~{flowcell_id}"
}
call ncbi.gisaid_meta_prep {
input:
source_modifier_table = biosample_to_genbank.genbank_source_modifier_table,
structured_comments = structured_comments.structured_comment_table,
# must match the filename implied by prefix_gisaid's out_basename above
fasta_filename = "gisaid-sequences-~{flowcell_id}.fasta",
out_name = "gisaid-meta-~{flowcell_id}.csv"
}
# prep nextmeta-style metadata for private nextstrain build
call nextstrain.nextmeta_prep {
input:
gisaid_meta = gisaid_meta_prep.meta_csv,
assembly_meta = sarscov2_batch_relineage.assembly_stats_relineage_tsv,
out_name = "nextmeta-~{flowcell_id}.tsv",
filter_to_ids = filter_bad_ntc_batches.seqids_kept
}
# create data tables with assembly_meta_tsv if workspace name and project provided
if (defined(workspace_name) && defined(terra_project)) {
call terra.upload_reads_assemblies_entities_tsv as data_tables {
input:
workspace_name = select_first([workspace_name]),
terra_project = select_first([terra_project]),
tsv_file = sarscov2_batch_relineage.assembly_stats_relineage_tsv,
cleaned_reads_unaligned_bams_string = demux_deplete.cleaned_reads_unaligned_bams,
meta_by_filename_json = demux_deplete.meta_by_filename_json
}
call terra.download_entities_tsv {
input:
workspace_name = select_first([workspace_name]),
terra_project = select_first([terra_project]),
table_name = 'assemblies',
# no-op data dependency: forces this download to run after the upload
nop_input_string = data_tables.tables[0]
}
call sarscov2.sequencing_report {
input:
assembly_stats_tsv = download_entities_tsv.tsv_file,
collab_ids_tsv = select_first([collab_ids_tsv, sarscov2_biosample_load.collab_ids_tsv]),
max_date = demux_deplete.run_date,
min_unambig = min_genome_bases
}
}
# bucket deliveries
if(defined(gcs_out_metrics)) {
call terra.gcs_copy as gcs_metrics_dump {
input:
infiles = flatten([[assembly_meta_tsv.combined, sc2_meta_final.meta_tsv, ivar_trim_stats.trim_stats_tsv, demux_deplete.multiqc_report_raw, demux_deplete.multiqc_report_cleaned, demux_deplete.spikein_counts, picard_wgs_merge.out_tsv, picard_alignment_merge.out_tsv, picard_insertsize_merge.out_tsv, samtools_ampliconstats_merge.out_tsv, sarscov2_batch_relineage.nextclade_all_json, sarscov2_batch_relineage.nextclade_all_tsv], demux_deplete.demux_metrics]),
gcs_uri_prefix = "~{gcs_out_metrics}/~{flowcell_id}/"
}
}
if(defined(gcs_out_cdc)) {
call terra.gcs_copy as gcs_cdc_dump {
input:
infiles = [sc2_meta_final.meta_tsv, passing_cat.filtered_fasta, gisaid_meta_prep.meta_csv, prefix_gisaid.renamed_fasta, package_genbank_ftp_submission.submission_zip, select_first([demux_deplete.sra_metadata])],
gcs_uri_prefix = "~{gcs_out_cdc}/~{demux_deplete.run_date}/~{flowcell_id}/"
}
call terra.gcs_copy as gcs_cdc_dump_reads {
input:
infiles = assemble_refbased.align_to_ref_merged_aligned_trimmed_only_bam,
gcs_uri_prefix = "~{gcs_out_cdc}/~{demux_deplete.run_date}/~{flowcell_id}/rawfiles/"
}
}
if(defined(gcs_out_sra)) {
call terra.gcs_copy as gcs_sra_dump_reads {
input:
infiles = demux_deplete.cleaned_reads_unaligned_bams,
gcs_uri_prefix = "~{gcs_out_sra}/~{flowcell_id}/"
}
call terra.gcs_copy as gcs_sra_dump {
input:
infiles = [select_first([demux_deplete.sra_metadata])],
gcs_uri_prefix = "~{gcs_out_sra}/"
}
}
output {
Array[File] raw_reads_unaligned_bams = demux_deplete.raw_reads_unaligned_bams
Array[File] cleaned_reads_unaligned_bams = demux_deplete.cleaned_reads_unaligned_bams
Array[File] cleaned_bams_tiny = demux_deplete.cleaned_bams_tiny
Array[File] aligned_trimmed_bams = assemble_refbased.align_to_ref_merged_aligned_trimmed_only_bam
File meta_by_filename_json = demux_deplete.meta_by_filename_json
Array[Int] read_counts_raw = demux_deplete.read_counts_raw
Array[Int] read_counts_depleted = demux_deplete.read_counts_depleted
File sra_metadata = select_first([demux_deplete.sra_metadata])
File cleaned_bam_uris = select_first([demux_deplete.cleaned_bam_uris])
Array[File] assemblies_fasta = assemble_refbased.assembly_fasta
Int max_ntc_bases = ntc_max.out
Array[String] ntc_rejected_batches = filter_bad_ntc_batches.reject_batches
Array[String] ntc_rejected_lanes = filter_bad_ntc_batches.reject_lanes
Array[File] demux_metrics = demux_deplete.demux_metrics
Array[File] demux_commonBarcodes = demux_deplete.demux_commonBarcodes
Array[File] demux_outlierBarcodes = demux_deplete.demux_outlierBarcodes
Array[Int] primer_trimmed_read_count = flatten(assemble_refbased.primer_trimmed_read_count)
Array[Float] primer_trimmed_read_percent = flatten(assemble_refbased.primer_trimmed_read_percent)
File ivar_trim_stats_html = ivar_trim_stats.trim_stats_html
File ivar_trim_stats_png = ivar_trim_stats.trim_stats_png
File ivar_trim_stats_tsv = ivar_trim_stats.trim_stats_tsv
File multiqc_report_raw = demux_deplete.multiqc_report_raw
File multiqc_report_cleaned = demux_deplete.multiqc_report_cleaned
File spikein_counts = demux_deplete.spikein_counts
File picard_metrics_wgs = picard_wgs_merge.out_tsv
File picard_metrics_alignment = picard_alignment_merge.out_tsv
File assembly_stats_tsv = assembly_meta_tsv.combined
File assembly_stats_final_tsv = sc2_meta_final.meta_tsv
File assembly_stats_relineage_tsv = sarscov2_batch_relineage.assembly_stats_relineage_tsv
# NOTE(review): identical source to assembly_stats_final_tsv above —
# both point at sc2_meta_final.meta_tsv; confirm the duplication is intended
File assembly_stats_final_relineage_tsv = sc2_meta_final.meta_tsv
File submission_zip = package_genbank_ftp_submission.submission_zip
File submission_xml = package_genbank_ftp_submission.submission_xml
File submit_ready = package_genbank_ftp_submission.submit_ready
Array[File] vadr_outputs = select_all(vadr.outputs_tgz)
File genbank_source_table = biosample_to_genbank.genbank_source_modifier_table
File gisaid_fasta = prefix_gisaid.renamed_fasta
File gisaid_meta_csv = gisaid_meta_prep.meta_csv
File genbank_fasta = submit_genomes.filtered_fasta
File nextmeta_tsv = nextmeta_prep.nextmeta_tsv
File nextclade_all_json = sarscov2_batch_relineage.nextclade_all_json
File nextclade_all_tsv = sarscov2_batch_relineage.nextclade_all_tsv
File nextclade_auspice_json = sarscov2_batch_relineage.nextclade_auspice_json
File nextalign_msa = sarscov2_batch_relineage.nextalign_msa
File pangolin_report = sarscov2_batch_relineage.pangolin_report
File pangolin_msa = sarscov2_batch_relineage.pangolin_msa
File passing_fasta = passing_cat.filtered_fasta
Array[String] assembled_ids = select_all(passing_assembly_ids)
Array[String] submittable_ids = read_lines(filter_bad_ntc_batches.seqids_kept)
Array[String] failed_assembly_ids = select_all(failed_assembly_id)
Array[String] failed_annotation_ids = select_all(failed_annotation_id)
Int num_read_files = length(demux_deplete.cleaned_reads_unaligned_bams)
Int num_assembled = length(select_all(passing_assemblies))
Int num_failed_assembly = length(select_all(failed_assembly_id))
Int num_submittable = filter_bad_ntc_batches.num_kept
Int num_failed_annotation = length(select_all(failed_annotation_id))
Int num_samples = length(group_bams_by_sample.sample_names)
String run_date = demux_deplete.run_date
String run_id = demux_deplete.run_id
File? sequencing_reports = sequencing_report.all_zip
File? id_map_tsv = sarscov2_biosample_load.id_map_tsv
Array[File] biosample_attributes_out = select_all(flatten([[sarscov2_biosample_load.biosample_attributes], biosample_attributes]))
Array[String] data_tables_out = select_first([data_tables.tables, []])
}
}
version 1.0
#DX_SKIP_WORKFLOW
import "../tasks/tasks_read_utils.wdl" as read_utils
import "../tasks/tasks_ncbi.wdl" as ncbi
import "../tasks/tasks_nextstrain.wdl" as nextstrain
import "../tasks/tasks_sarscov2.wdl" as sarscov2
import "../tasks/tasks_terra.wdl" as terra
import "../tasks/tasks_assembly.wdl" as assembly
import "../tasks/tasks_utils.wdl" as utils
import "demux_deplete.wdl"
import "assemble_refbased.wdl"
import "sarscov2_batch_relineage.wdl"
import "sarscov2_biosample_load.wdl"
workflow sarscov2_illumina_full {
meta {
description: "Full SARS-CoV-2 analysis workflow starting from raw Illumina flowcell (tar.gz) and metadata and performing assembly, spike-in analysis, qc, lineage assignment, and packaging for data release."
author: "Broad Viral Genomics"
email: "viral-ngs@broadinstitute.org"
allowNestedInputs: true
}
parameter_meta {
reference_fasta: {
description: "Reference genome to align reads to.",
patterns: ["*.fasta"]
}
amplicon_bed_prefix: {
description: "amplicon primers to trim in reference coordinate space (0-based BED format)",
patterns: ["*.bed"]
}
biosample_attributes: {
description: "A post-submission attributes file from NCBI BioSample, which is available at https://submit.ncbi.nlm.nih.gov/subs/ and clicking on 'Download attributes file with BioSample accessions'. The 'sample_name' column must match the external_ids used in sample_rename_map (or internal ids if sample_rename_map is omitted).",
patterns: ["*.txt", "*.tsv"]
}
}
input {
File flowcell_tgz
File reference_fasta
String amplicon_bed_prefix
String? read_structure
Array[File] biosample_attributes
String? instrument_model
String sra_title
Int min_genome_bases = 24000
Int max_vadr_alerts = 0
Int ntc_max_unambig = 3000
Int? min_genome_coverage
File? sample_rename_map
String? workspace_name
String? terra_project
File? collab_ids_tsv
String? gcs_out_metrics
String? gcs_out_cdc
String? gcs_out_sra
}
Int taxid = 2697049
String gisaid_prefix = 'hCoV-19/'
# Broad production pipeline only: metadata ETL and NCBI BioSample registration
if(length(biosample_attributes) == 0) {
call sarscov2_biosample_load.sarscov2_biosample_load
}
# merge biosample attributes tables
call utils.tsv_join as biosample_merge {
input:
input_tsvs = select_all(flatten([[sarscov2_biosample_load.biosample_attributes], biosample_attributes])),
id_col = 'accession',
out_basename = "biosample_attributes-merged"
}
call utils.fetch_col_from_tsv as accessioned_samples {
input:
tsv = biosample_merge.out_tsv,
col = 'sample_name'
}
### demux, deplete, SRA submission prep, fastqc/multiqc
call demux_deplete.demux_deplete {
input:
flowcell_tgz = flowcell_tgz,
biosample_map_tsvs = [biosample_merge.out_tsv],
instrument_model_user_specified = instrument_model,
sra_title = sra_title,
read_structure = read_structure,
sample_rename_map = select_first([sample_rename_map, sarscov2_biosample_load.id_map_tsv])
}
String flowcell_id = demux_deplete.run_id
### gather data by biosample
call read_utils.group_bams_by_sample {
input:
bam_filepaths = demux_deplete.cleaned_reads_unaligned_bams
}
### assembly and analyses per biosample
scatter(name_reads in zip(group_bams_by_sample.sample_names, group_bams_by_sample.grouped_bam_filepaths)) {
Boolean ampseq = (demux_deplete.meta_by_sample[name_reads.left]["amplicon_set"] != "")
String orig_name = demux_deplete.meta_by_sample[name_reads.left]["sample_original"]
# assemble genome
if (ampseq) {
call utils.sed as bed_rename {
input:
infile = amplicon_bed_prefix + demux_deplete.meta_by_sample[name_reads.left]["amplicon_set"] + ".bed",
outfilename = demux_deplete.meta_by_sample[name_reads.left]["amplicon_set"] + ".bed",
search = "MN908947.3",
replace = "NC_045512.2"
}
}
call assemble_refbased.assemble_refbased {
input:
reads_unmapped_bams = name_reads.right,
reference_fasta = reference_fasta,
sample_name = name_reads.left,
skip_mark_dupes = ampseq,
trim_coords_bed = bed_rename.outfile,
major_cutoff = 0.75,
min_coverage = if defined(min_genome_coverage) then min_genome_coverage else (if ampseq then 50 else 3)
}
# log controls
if (demux_deplete.meta_by_sample[name_reads.left]["control"] == 'NTC') {
Int ntc_bases = assemble_refbased.assembly_length_unambiguous
}
# grab biosample metadata
call utils.fetch_row_from_tsv as biosample {
input:
tsv = biosample_merge.out_tsv,
idx_col = "sample_name",
idx_val = orig_name,
set_default_keys = ["collection_date", "bioproject_accession", "accession", "collected_by", "geo_loc_name", "host_subject_id", "host_age", "host_sex", "purpose_of_sequencing", "anatomical_material", "anatomical_part", "body_product"]
}
# for genomes that somewhat assemble
if (assemble_refbased.assembly_length_unambiguous >= min_genome_bases) {
call ncbi.rename_fasta_header {
input:
genome_fasta = assemble_refbased.assembly_fasta,
new_name = orig_name
}
File passing_assemblies = rename_fasta_header.renamed_fasta
String passing_assembly_ids = orig_name
Array[String] assembly_cmt = [orig_name, "Broad viral-ngs v. " + demux_deplete.demux_viral_core_version, assemble_refbased.assembly_mean_coverage, demux_deplete.instrument_model_inferred]
# VADR annotation & QC
call ncbi.vadr {
input:
genome_fasta = assemble_refbased.assembly_fasta,
vadr_opts = "--glsearch -s -r --nomisc --mkey sarscov2 --lowsim5seq 6 --lowsim3seq 6 --alt_fail lowscore,insertnn,deletinn",
minlen = 50,
maxlen = 30000
}
if (vadr.num_alerts<=max_vadr_alerts) {
String submittable_id = orig_name
}
if (vadr.num_alerts>max_vadr_alerts) {
String failed_annotation_id = orig_name
}
}
if (assemble_refbased.assembly_length_unambiguous < min_genome_bases) {
String failed_assembly_id = orig_name
}
Array[String] assembly_tsv_row = [
orig_name,
name_reads.left,
biosample.map["accession"],
flowcell_id,
demux_deplete.run_date,
biosample.map["collection_date"],
biosample.map["geo_loc_name"],
biosample.map["host_subject_id"],
assemble_refbased.assembly_length_unambiguous,
assemble_refbased.assembly_mean_coverage,
assemble_refbased.dist_to_ref_snps,
assemble_refbased.dist_to_ref_indels,
select_first([vadr.num_alerts, ""]),
assemble_refbased.assembly_fasta,
assemble_refbased.align_to_ref_merged_coverage_plot,
assemble_refbased.align_to_ref_merged_aligned_trimmed_only_bam,
assemble_refbased.replicate_discordant_vcf,
assemble_refbased.align_to_ref_variants_vcf_gz,
select_first([vadr.outputs_tgz, ""]),
demux_deplete.meta_by_sample[name_reads.left]["amplicon_set"],
assemble_refbased.replicate_concordant_sites,
assemble_refbased.replicate_discordant_snps,
assemble_refbased.replicate_discordant_indels,
assemble_refbased.num_read_groups,
assemble_refbased.num_libraries,
assemble_refbased.align_to_ref_merged_reads_aligned,
assemble_refbased.align_to_ref_merged_bases_aligned,
select_first([vadr.alerts_list, ""]),
biosample.map["purpose_of_sequencing"],
biosample.map["collected_by"],
biosample.map["bioproject_accession"],
biosample.map["host_age"],
biosample.map["host_sex"],
"",
biosample.map["anatomical_material"],
biosample.map["anatomical_part"],
biosample.map["body_product"],
demux_deplete.meta_by_sample[name_reads.left]["viral_ct"]
]
}
Array[String] assembly_tsv_header = [
'sample', 'sample_sanitized', 'biosample_accession', 'flowcell_id', 'run_date', 'collection_date', 'geo_loc_name', 'host_subject_id',
'assembly_length_unambiguous', 'assembly_mean_coverage',
'dist_to_ref_snps', 'dist_to_ref_indels', 'vadr_num_alerts',
'assembly_fasta', 'coverage_plot', 'aligned_bam',
'replicate_discordant_vcf', 'variants_from_ref_vcf',
'vadr_tgz',
'amplicon_set',
'replicate_concordant_sites', 'replicate_discordant_snps', 'replicate_discordant_indels', 'num_read_groups', 'num_libraries',
'align_to_ref_merged_reads_aligned', 'align_to_ref_merged_bases_aligned',
'vadr_alerts', 'purpose_of_sequencing', 'collected_by', 'bioproject_accession',
'age', 'sex', 'zip', "anatomical_material", "anatomical_part", "body_product",
'Ct'
]
### summary stats
call utils.concatenate as assembly_meta_tsv {
input:
infiles = [write_tsv([assembly_tsv_header]), write_tsv(assembly_tsv_row)],
output_name = "assembly_metadata-~{flowcell_id}.tsv"
}
# nextclade and pangolin on full data set
call sarscov2_batch_relineage.sarscov2_batch_relineage {
input:
flowcell_id = flowcell_id,
genomes_fasta = assemble_refbased.assembly_fasta, # TO DO: can this just be [passing_cat_prefilter.combined]?
metadata_annotated_tsv = assembly_meta_tsv.combined,
metadata_raw_tsv = assembly_meta_tsv.combined,
min_genome_bases = min_genome_bases
}
### mark up the bad batches or lanes where NTCs assemble
call assembly.filter_bad_ntc_batches {
input:
seqid_list = write_lines(select_all(passing_assembly_ids)),
demux_meta_by_sample_json = demux_deplete.meta_by_sample_json,
assembly_meta_tsv = sarscov2_batch_relineage.assembly_stats_relineage_tsv,
ntc_min_unambig = ntc_max_unambig
}
### QC metrics
call read_utils.max as ntc_max {
input:
list = select_all(ntc_bases)
}
call assembly.ivar_trim_stats {
input:
ivar_trim_stats_tsv = write_tsv(flatten(assemble_refbased.ivar_trim_stats_tsv)),
flowcell = flowcell_id,
out_basename = "ivar_trim_stats-~{flowcell_id}"
}
call utils.tsv_join as picard_wgs_merge {
input:
input_tsvs = assemble_refbased.picard_metrics_wgs,
id_col = 'sample_sanitized',
out_basename = "picard_metrics_wgs-~{flowcell_id}"
}
call utils.tsv_join as picard_alignment_merge {
input:
input_tsvs = assemble_refbased.picard_metrics_alignment,
id_col = 'sample_sanitized',
out_basename = "picard_metrics_alignment-~{flowcell_id}"
}
call utils.tsv_join as picard_insertsize_merge {
input:
input_tsvs = assemble_refbased.picard_metrics_insert_size,
id_col = 'sample_sanitized',
out_basename = "picard_metrics_insertsize-~{flowcell_id}"
}
call utils.cat_except_headers as samtools_ampliconstats_merge {
input:
infiles = assemble_refbased.samtools_ampliconstats_parsed,
out_filename = "samtools_ampliconstats-~{flowcell_id}.txt"
}
### filter and concatenate final sets for delivery ("passing" and "submittable")
call sarscov2.sc2_meta_final {
# this decorates assembly_meta_tsv with collab/internal IDs, genome_status, and many other columns
input:
assembly_stats_tsv = sarscov2_batch_relineage.assembly_stats_relineage_tsv,
collab_ids_tsv = select_first([collab_ids_tsv, sarscov2_biosample_load.collab_ids_tsv]),
drop_file_cols = true,
min_unambig = min_genome_bases,
genome_status_json = filter_bad_ntc_batches.fail_meta_json
}
call utils.concatenate as passing_cat_prefilter {
# this emits a fasta of only genomes that pass min_unambig
input:
infiles = select_all(passing_assemblies),
output_name = "assemblies_passing-~{flowcell_id}.prefilter.fasta"
}
call nextstrain.filter_sequences_to_list as passing_ntc {
# this drops all genomes that are failed_NTC
input:
sequences = passing_cat_prefilter.combined,
keep_list = [filter_bad_ntc_batches.seqids_kept]
}
call nextstrain.filter_sequences_to_list as passing_cat {
# this drops all genomes that don't have BioSample accessions (e.g. control libraries)
input:
sequences = passing_ntc.filtered_fasta,
keep_list = [accessioned_samples.out_txt],
out_fname = "assemblies_passing-~{flowcell_id}.fasta"
}
call nextstrain.filter_sequences_to_list as submittable_filter {
# this drops all failed_annotation (aka VADR fails)
input:
sequences = passing_cat.filtered_fasta,
keep_list = [write_lines(select_all(submittable_id))]
}
### prep genbank submission
call ncbi.biosample_to_genbank {
# this takes a BioSample attributes file and emits a Genbank Source Modifier Table
input:
biosample_attributes = biosample_merge.out_tsv,
num_segments = 1,
taxid = taxid,
filter_to_ids = submittable_filter.ids_kept
}
call ncbi.structured_comments {
input:
assembly_stats_tsv = write_tsv(flatten([[['SeqID','Assembly Method','Coverage','Sequencing Technology']],select_all(assembly_cmt)])),
filter_to_ids = biosample_to_genbank.sample_ids
}
call nextstrain.filter_sequences_to_list as submit_genomes {
input:
sequences = submittable_filter.filtered_fasta,
keep_list = [biosample_to_genbank.sample_ids]
}
call ncbi.package_special_genbank_ftp_submission as package_genbank_ftp_submission {
input:
sequences_fasta = submit_genomes.filtered_fasta,
source_modifier_table = biosample_to_genbank.genbank_source_modifier_table,
structured_comment_table = structured_comments.structured_comment_table,
submission_name = flowcell_id,
submission_uid = flowcell_id
}
### prep gisaid submission
call ncbi.prefix_fasta_header as prefix_gisaid {
input:
genome_fasta = submit_genomes.filtered_fasta,
prefix = gisaid_prefix,
out_basename = "gisaid-sequences-~{flowcell_id}"
}
call ncbi.gisaid_meta_prep {
input:
source_modifier_table = biosample_to_genbank.genbank_source_modifier_table,
structured_comments = structured_comments.structured_comment_table,
fasta_filename = "gisaid-sequences-~{flowcell_id}.fasta",
out_name = "gisaid-meta-~{flowcell_id}.csv"
}
# prep nextmeta-style metadata for private nextstrain build
call nextstrain.nextmeta_prep {
# Merge the GISAID csv with the re-lineaged assembly stats into one nextmeta TSV,
# filtered down to the sequence IDs that survived the NTC batch-quality filter.
input:
gisaid_meta = gisaid_meta_prep.meta_csv,
assembly_meta = sarscov2_batch_relineage.assembly_stats_relineage_tsv,
out_name = "nextmeta-~{flowcell_id}.tsv",
filter_to_ids = filter_bad_ntc_batches.seqids_kept
}
# create data tables with assembly_meta_tsv if workspace name and project provided
if (defined(workspace_name) && defined(terra_project)) {
# Step 1: push the re-lineaged assembly stats (plus cleaned-read bam URIs and
# per-file metadata) into Terra data tables for this workspace.
call terra.upload_reads_assemblies_entities_tsv as data_tables {
input:
workspace_name = select_first([workspace_name]),
terra_project = select_first([terra_project]),
tsv_file = sarscov2_batch_relineage.assembly_stats_relineage_tsv,
cleaned_reads_unaligned_bams_string = demux_deplete.cleaned_reads_unaligned_bams,
meta_by_filename_json = demux_deplete.meta_by_filename_json
}
# Step 2: read the 'assemblies' table back out of Terra.
# nop_input_string threads an output of the upload call into this one —
# presumably a no-op input used only to force the download to run after the
# upload completes (TODO confirm against tasks_terra.wdl).
call terra.download_entities_tsv {
input:
workspace_name = select_first([workspace_name]),
terra_project = select_first([terra_project]),
table_name = 'assemblies',
nop_input_string = data_tables.tables[0]
}
# Step 3: build the end-of-run sequencing report from the downloaded table.
# collab_ids_tsv: the workflow-level input wins if provided, otherwise fall
# back to the tsv produced by sarscov2_biosample_load.
call sarscov2.sequencing_report {
input:
assembly_stats_tsv = download_entities_tsv.tsv_file,
collab_ids_tsv = select_first([collab_ids_tsv, sarscov2_biosample_load.collab_ids_tsv]),
max_date = demux_deplete.run_date,
min_unambig = min_genome_bases
}
}
# bucket deliveries
# Optional delivery: copy all QC/metrics artifacts to the metrics bucket,
# namespaced by flowcell_id. Only runs when gcs_out_metrics is provided.
if(defined(gcs_out_metrics)) {
call terra.gcs_copy as gcs_metrics_dump {
input:
infiles = flatten([[assembly_meta_tsv.combined, sc2_meta_final.meta_tsv, ivar_trim_stats.trim_stats_tsv, demux_deplete.multiqc_report_raw, demux_deplete.multiqc_report_cleaned, demux_deplete.spikein_counts, picard_wgs_merge.out_tsv, picard_alignment_merge.out_tsv, picard_insertsize_merge.out_tsv, samtools_ampliconstats_merge.out_tsv, sarscov2_batch_relineage.nextclade_all_json, sarscov2_batch_relineage.nextclade_all_tsv], demux_deplete.demux_metrics]),
gcs_uri_prefix = "~{gcs_out_metrics}/~{flowcell_id}/"
}
}
# Optional delivery: CDC drop, namespaced by run date then flowcell.
# NOTE(review): select_first([demux_deplete.sra_metadata]) will fail the task at
# runtime if sra_metadata is undefined for this run — confirm that the CDC
# delivery path always produces SRA metadata.
if(defined(gcs_out_cdc)) {
# metadata, fastas, and submission packages
call terra.gcs_copy as gcs_cdc_dump {
input:
infiles = [sc2_meta_final.meta_tsv, passing_cat.filtered_fasta, gisaid_meta_prep.meta_csv, prefix_gisaid.renamed_fasta, package_genbank_ftp_submission.submission_zip, select_first([demux_deplete.sra_metadata])],
gcs_uri_prefix = "~{gcs_out_cdc}/~{demux_deplete.run_date}/~{flowcell_id}/"
}
# aligned, primer-trimmed read bams go under rawfiles/
call terra.gcs_copy as gcs_cdc_dump_reads {
input:
infiles = assemble_refbased.align_to_ref_merged_aligned_trimmed_only_bam,
gcs_uri_prefix = "~{gcs_out_cdc}/~{demux_deplete.run_date}/~{flowcell_id}/rawfiles/"
}
}
# Optional delivery: SRA drop — cleaned unaligned bams under the flowcell
# subdirectory, SRA metadata at the bucket root.
if(defined(gcs_out_sra)) {
call terra.gcs_copy as gcs_sra_dump_reads {
input:
infiles = demux_deplete.cleaned_reads_unaligned_bams,
gcs_uri_prefix = "~{gcs_out_sra}/~{flowcell_id}/"
}
call terra.gcs_copy as gcs_sra_dump {
input:
infiles = [select_first([demux_deplete.sra_metadata])],
gcs_uri_prefix = "~{gcs_out_sra}/"
}
}
output {
# --- raw and cleaned reads ---
Array[File] raw_reads_unaligned_bams = demux_deplete.raw_reads_unaligned_bams
Array[File] cleaned_reads_unaligned_bams = demux_deplete.cleaned_reads_unaligned_bams
Array[File] cleaned_bams_tiny = demux_deplete.cleaned_bams_tiny
Array[File] aligned_trimmed_bams = assemble_refbased.align_to_ref_merged_aligned_trimmed_only_bam
File meta_by_filename_json = demux_deplete.meta_by_filename_json
Array[Int] read_counts_raw = demux_deplete.read_counts_raw
Array[Int] read_counts_depleted = demux_deplete.read_counts_depleted
# these select_first calls fail at runtime if demux_deplete did not emit the file
File sra_metadata = select_first([demux_deplete.sra_metadata])
File cleaned_bam_uris = select_first([demux_deplete.cleaned_bam_uris])
# --- assembly and NTC (no-template control) filtering ---
Array[File] assemblies_fasta = assemble_refbased.assembly_fasta
Int max_ntc_bases = ntc_max.out
Array[String] ntc_rejected_batches = filter_bad_ntc_batches.reject_batches
Array[String] ntc_rejected_lanes = filter_bad_ntc_batches.reject_lanes
# --- demux and QC metrics ---
Array[File] demux_metrics = demux_deplete.demux_metrics
Array[File] demux_commonBarcodes = demux_deplete.demux_commonBarcodes
Array[File] demux_outlierBarcodes = demux_deplete.demux_outlierBarcodes
Array[Int] primer_trimmed_read_count = flatten(assemble_refbased.primer_trimmed_read_count)
Array[Float] primer_trimmed_read_percent = flatten(assemble_refbased.primer_trimmed_read_percent)
File ivar_trim_stats_html = ivar_trim_stats.trim_stats_html
File ivar_trim_stats_png = ivar_trim_stats.trim_stats_png
File ivar_trim_stats_tsv = ivar_trim_stats.trim_stats_tsv
File multiqc_report_raw = demux_deplete.multiqc_report_raw
File multiqc_report_cleaned = demux_deplete.multiqc_report_cleaned
File spikein_counts = demux_deplete.spikein_counts
File picard_metrics_wgs = picard_wgs_merge.out_tsv
File picard_metrics_alignment = picard_alignment_merge.out_tsv
# --- assembly metadata tables ---
File assembly_stats_tsv = assembly_meta_tsv.combined
File assembly_stats_final_tsv = sc2_meta_final.meta_tsv
File assembly_stats_relineage_tsv = sarscov2_batch_relineage.assembly_stats_relineage_tsv
# NOTE(review): this is wired to the same file as assembly_stats_final_tsv above;
# a relineage-specific final table may have been intended — confirm upstream task outputs
File assembly_stats_final_relineage_tsv = sc2_meta_final.meta_tsv
# --- GenBank / GISAID submission artifacts ---
File submission_zip = package_genbank_ftp_submission.submission_zip
File submission_xml = package_genbank_ftp_submission.submission_xml
File submit_ready = package_genbank_ftp_submission.submit_ready
Array[File] vadr_outputs = select_all(vadr.outputs_tgz)
File genbank_source_table = biosample_to_genbank.genbank_source_modifier_table
File gisaid_fasta = prefix_gisaid.renamed_fasta
File gisaid_meta_csv = gisaid_meta_prep.meta_csv
File genbank_fasta = submit_genomes.filtered_fasta
# --- nextstrain / lineage outputs ---
File nextmeta_tsv = nextmeta_prep.nextmeta_tsv
File nextclade_all_json = sarscov2_batch_relineage.nextclade_all_json
File nextclade_all_tsv = sarscov2_batch_relineage.nextclade_all_tsv
File nextclade_auspice_json = sarscov2_batch_relineage.nextclade_auspice_json
File nextalign_msa = sarscov2_batch_relineage.nextalign_msa
File pangolin_report = sarscov2_batch_relineage.pangolin_report
File pangolin_msa = sarscov2_batch_relineage.pangolin_msa
# --- summary ID lists and counts ---
File passing_fasta = passing_cat.filtered_fasta
Array[String] assembled_ids = select_all(passing_assembly_ids)
Array[String] submittable_ids = read_lines(filter_bad_ntc_batches.seqids_kept)
Array[String] failed_assembly_ids = select_all(failed_assembly_id)
Array[String] failed_annotation_ids = select_all(failed_annotation_id)
Int num_read_files = length(demux_deplete.cleaned_reads_unaligned_bams)
Int num_assembled = length(select_all(passing_assemblies))
Int num_failed_assembly = length(select_all(failed_assembly_id))
Int num_submittable = filter_bad_ntc_batches.num_kept
Int num_failed_annotation = length(select_all(failed_annotation_id))
Int num_samples = length(group_bams_by_sample.sample_names)
# --- run info and optional Terra/reporting outputs ---
String run_date = demux_deplete.run_date
String run_id = demux_deplete.run_id
# optional: only present when workspace_name and terra_project were provided
File? sequencing_reports = sequencing_report.all_zip
File? id_map_tsv = sarscov2_biosample_load.id_map_tsv
Array[File] biosample_attributes_out = select_all(flatten([[sarscov2_biosample_load.biosample_attributes], biosample_attributes]))
Array[String] data_tables_out = select_first([data_tables.tables, []])
}
}