WORKFLOW
sarscov2_sra_to_genbank
File Path
pipes/WDL/workflows/sarscov2_sra_to_genbank.wdl
WDL Version
1.0
Type
workflow
Imports
Namespace
Path
ncbi
../tasks/tasks_ncbi.wdl
ncbi_tools
../tasks/tasks_ncbi_tools.wdl
reports
../tasks/tasks_reports.wdl
utils
../tasks/tasks_utils.wdl
assemble_refbased
assemble_refbased.wdl
sarscov2_lineages
sarscov2_lineages.wdl
Workflow: sarscov2_sra_to_genbank
Full SARS-CoV-2 analysis workflow starting from SRA data and metadata and performing assembly, spike-in analysis, qc, lineage assignment, and packaging assemblies for data release.
Author: Broad Viral Genomics
Name
Type
Description
Default
SRA_accessions
Array[String]
SRA Run accession numbers (SRR####).
-
reference_fasta
File
Reference genome to align reads to.
-
amplicon_bed_default
File
Amplicon primers to trim in reference coordinate space (0-based BED format). Will only be used if the SRA Experiment Design has a Library Strategy set to AMPLICON (will be ignored on all other values).
-
spikein_db
File
-
-
sample_name
String?
-
-
email_address
String?
-
-
ncbi_api_key
String?
-
-
machine_mem_gb
Int?
-
-
machine_mem_gb
Int?
-
-
sample_original_name
String?
-
-
novocraft_license
File?
-
-
machine_mem_gb
Int?
-
-
min_keep_length
Int?
-
-
sliding_window
Int?
-
-
primer_offset
Int?
-
-
machine_mem_gb
Int?
-
-
reheader_table
File?
-
-
amplicon_set
String?
-
-
max_coverage_depth
Int?
-
-
base_q_threshold
Int?
-
-
mapping_q_threshold
Int?
-
-
read_length_threshold
Int?
-
-
plotXLimits
String?
-
-
plotYLimits
String?
-
-
machine_mem_gb
Int?
-
-
reheader_table
File?
-
-
max_coverage_depth
Int?
-
-
base_q_threshold
Int?
-
-
mapping_q_threshold
Int?
-
-
read_length_threshold
Int?
-
-
plotXLimits
String?
-
-
plotYLimits
String?
-
-
root_sequence
File?
-
-
auspice_reference_tree_json
File?
-
-
pathogen_json
File?
-
-
gene_annotations_json
File?
-
-
min_length
Int?
-
-
max_ambig
Float?
-
-
analysis_mode
String?
-
-
vadr_model_tar
File?
-
-
vadr_model_tar_subdir
String?
-
-
filter_to_accession
String?
-
-
organism_name_override
String?
-
-
sequence_id_override
String?
-
-
isolate_prefix_override
String?
-
-
source_overrides_json
File?
-
-
author_template_sbt
File
-
-
submission_name
String
-
-
submission_uid
String
-
-
spuid_namespace
String
-
-
account_name
String
-
-
username
String?
-
-
submitting_lab_name
String
-
-
fasta_filename
String?
-
-
title
String?
-
-
comment
String?
-
-
template
String?
-
-
tag
String?
-
-
ignore_analysis_files
String?
-
-
ignore_sample_names
String?
-
-
sample_names
File?
-
-
exclude_modules
Array[String]?
-
-
module_to_use
Array[String]?
-
-
output_data_format
String?
-
-
config
File?
-
-
config_yaml
String?
-
-
97 optional inputs with default values
min_genome_bases
Int
-
15000
min_reads_per_bam
Int
-
100
max_vadr_alerts
Int
-
0
docker
String
-
"quay.io/broadinstitute/ncbi-tools:2.11.1"
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
topNHits
Int
-
3
filter_bam_to_proper_primary_mapped_reads
Boolean
-
true
do_not_require_proper_mapped_pairs_when_filtering
Boolean
-
false
keep_singletons_when_filtering
Boolean
-
false
keep_duplicates_when_filtering
Boolean
-
false
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
major_cutoff
Float
-
0.75
align_to_ref_options
Map[String,String]
-
{"novoalign": "-r Random -l 40 -g 40 -x 20 -t 501 -k", "bwa": "-k 12 -B 1", "minimap2": ""}
align_to_self_options
Map[String,String]
-
{"novoalign": "-r Random -l 40 -g 40 -x 20 -t 100", "bwa": "", "minimap2": ""}
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
sample_name
String
-
basename(basename(basename(reads_unmapped_bam,".bam"),".taxfilt"),".clean")
min_quality
Int?
-
1
docker
String
-
"andersenlabapps/ivar:1.3.1"
bam_basename
String
-
basename(aligned_bam,".bam")
disk_size
Int
-
375
run_fastqc
Boolean
-
false
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
disk_size
Int
-
750
machine_mem_gb
Int
-
4
out_basename
String
-
basename(aligned_bam,'.bam')
docker
String
-
"quay.io/broadinstitute/viral-phylo:2.5.1.0"
max_amp_len
Int
-
5000
max_amplicons
Int
-
500
machine_mem_gb
Int
-
32
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
skip_mark_dupes
Boolean
-
false
plot_only_non_duplicates
Boolean
-
false
bin_large_plots
Boolean
-
false
binning_summary_statistic
String?
-
"max"
plot_width_pixels
Int?
-
1100
plot_height_pixels
Int?
-
850
plot_pixels_per_inch
Int?
-
100
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
mark_duplicates
Boolean
-
false
machine_mem_gb
Int
-
15
docker
String
-
"quay.io/broadinstitute/viral-assemble:2.5.1.0"
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
sample_name
String
-
basename(basename(basename(reads_unmapped_bam,".bam"),".taxfilt"),".clean")
run_fastqc
Boolean
-
false
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
disk_size
Int
-
750
machine_mem_gb
Int
-
4
out_basename
String
-
basename(aligned_bam,'.bam')
docker
String
-
"quay.io/broadinstitute/viral-phylo:2.5.1.0"
skip_mark_dupes
Boolean
-
false
plot_only_non_duplicates
Boolean
-
false
bin_large_plots
Boolean
-
false
binning_summary_statistic
String?
-
"max"
plot_width_pixels
Int?
-
1100
plot_height_pixels
Int?
-
850
plot_pixels_per_inch
Int?
-
100
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
out_basename
String
-
basename(genome_fasta,".fasta")
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
disk_size
Int
-
50
docker
String
-
"nextstrain/nextclade:3.18.0"
update_dbs_now
Boolean
-
false
docker
String
-
"quay.io/staphb/pangolin:4.3.3-pdata-1.36"
out_basename
String
-
basename(genome_fasta,'.fasta')
docker
String
-
"mirror.gcr.io/staphb/vadr:1.6.4"
mem_size
Int
-
16
cpus
Int
-
4
cpus
Int
-
4
cpus
Int
-
4
biosample_col_for_fasta_headers
String
-
"sample_name"
src_to_attr_map
Map[String,String]
-
{}
sanitize_seq_ids
Boolean
-
true
out_basename
String
-
basename(basename(biosample_attributes,".txt"),".tsv")
docker
String
-
"python:slim"
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
wizard
String
-
"BankIt_SARSCoV2_api"
docker
String
-
"quay.io/broadinstitute/viral-baseimage:0.3.0"
out_basename
String
-
basename(genome_fasta,".fasta")
continent
String
-
"North America"
address_map
String
-
'{}'
authors_map
String
-
'{}'
out_dir
String
-
"./multiqc-output"
force
Boolean
-
false
full_names
Boolean
-
false
data_dir
Boolean
-
false
no_data_dir
Boolean
-
false
zip_data_dir
Boolean
-
false
export
Boolean
-
false
flat
Boolean
-
false
interactive
Boolean
-
true
lint
Boolean
-
false
pdf
Boolean
-
false
megaQC_upload
Boolean
-
false
docker
String
-
"quay.io/biocontainers/multiqc:1.32--pyhdfd78af_1"
output_prefix
String
-
"count_summary"
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
Outputs
Name
Type
Expression
raw_reads_unaligned_bams
Array[File]
select_all(reads_ubam)
cleaned_reads_unaligned_bams
Array[File]
select_all(reads_ubam)
biosample_attributes
File
group_sra_bams_by_biosample.biosample_attributes_tsv
read_counts_raw
Array[Int]
select_all(num_reads)
read_counts_depleted
Array[Int]
select_all(num_reads)
primer_trimmed_read_count
Array[Int]
flatten(assemble_refbased.primer_trimmed_read_count)
primer_trimmed_read_percent
Array[Float]
flatten(assemble_refbased.primer_trimmed_read_percent)
assemblies_fasta
Array[File]
assemble_refbased.assembly_fasta
passing_assemblies_fasta
Array[File]
select_all(passing_assemblies)
submittable_assemblies_fasta
Array[File]
select_all(submittable_genomes)
multiqc_report_raw
File
multiqc_raw.multiqc_report
spikein_counts
File
spike_summary.count_summary
assembly_stats_tsv
File
assembly_meta_tsv.combined
submission_zip
File
package_genbank_ftp_submission.submission_zip
submission_xml
File
package_genbank_ftp_submission.submission_xml
submit_ready
File
package_genbank_ftp_submission.submit_ready
vadr_outputs
Array[File]
select_all(vadr.outputs_tgz)
genbank_source_table
File
biosample_to_genbank.genbank_source_modifier_table
gisaid_fasta
File
prefix_gisaid.renamed_fasta
gisaid_meta_csv
File
gisaid_meta_prep.meta_csv
assembled_ids
Array[String]
select_all(passing_assembly_ids)
submittable_ids
Array[String]
select_all(submittable_id)
failed_assembly_ids
Array[String]
select_all(failed_assembly_id)
failed_annotation_ids
Array[String]
select_all(failed_annotation_id)
num_read_files
Int
length(Fetch_SRA_to_BAM.reads_ubam)
num_assembled
Int
length(select_all(passing_assemblies))
num_failed_assembly
Int
length(select_all(failed_assembly_id))
num_submittable
Int
length(select_all(submittable_id))
num_failed_annotation
Int
length(select_all(failed_annotation_id))
num_samples
Int
length(group_sra_bams_by_biosample.biosample_accessions)
Calls
This workflow calls the following tasks or subworkflows:
Input Mappings (1)
Input
Value
SRA_ID
sra_id
Input Mappings (1)
Input
Value
reads_bam
Fetch_SRA_to_BAM.reads_ubam
CALL
TASKS
spikein
↗
→ align_and_count
Input Mappings (2)
Input
Value
reads_bam
Fetch_SRA_to_BAM.reads_ubam
ref_db
spikein_db
Input Mappings (5)
Input
Value
biosamples
select_all(biosample_accession)
bam_filepaths
select_all(reads_ubam)
biosample_attributes_jsons
select_all(biosample_attributes_json)
library_strategies
select_all(library_strategy)
seq_platforms
select_all(seq_platform)
Input Mappings (7)
Input
Value
reads_unmapped_bams
samn_bam.right
reference_fasta
reference_fasta
sample_name
samn_bam.left
aligner
"minimap2"
skip_mark_dupes
ampseq
trim_coords_bed
trim_coords_bed
min_coverage
if ampseq then 20 else 3
Input Mappings (2)
Input
Value
genome_fasta
assemble_refbased.assembly_fasta
new_name
orig_name
Input Mappings (1)
Input
Value
genome_fasta
passing_assemblies
Input Mappings (4)
Input
Value
genome_fasta
passing_assemblies
vadr_opts
"--glsearch -s -r --nomisc --mkey sarscov2 --lowsim5seq 6 --lowsim3seq 6 --alt_fail lowscore,insertnn,deletinn"
minlen
50
maxlen
30000
Input Mappings (2)
Input
Value
infiles
[write_tsv([assembly_tsv_header]), write_tsv(assembly_tsv_row)]
output_name
"assembly_metadata.tsv"
Input Mappings (2)
Input
Value
infiles
select_all(submittable_genomes)
output_name
"assemblies.fasta"
Input Mappings (4)
Input
Value
biosample_attributes
group_sra_bams_by_biosample.biosample_attributes_tsv
num_segments
1
taxid
taxid
filter_to_ids
write_lines(select_all(submittable_id))
Input Mappings (2)
Input
Value
assembly_stats_tsv
write_tsv(flatten([[['SeqID', 'Assembly Method', 'Coverage', 'Sequencing Technology']], select_all(assembly_cmt)]))
filter_to_ids
write_lines(select_all(submittable_id))
Input Mappings (3)
Input
Value
sequences_fasta
submit_genomes.combined
source_modifier_table
biosample_to_genbank.genbank_source_modifier_table
structured_comment_table
structured_comments.structured_comment_table
CALL
TASKS
prefix_gisaid
↗
→ prefix_fasta_header
Input Mappings (2)
Input
Value
genome_fasta
submit_genomes.combined
prefix
gisaid_prefix
Input Mappings (4)
Input
Value
source_modifier_table
biosample_to_genbank.genbank_source_modifier_table
structured_comments
structured_comments.structured_comment_table
out_name
"gisaid_meta.csv"
strict
false
Input Mappings (2)
Input
Value
input_files
select_all(fastqc.fastqc_zip)
file_name
"multiqc-raw.html"
CALL
TASKS
spike_summary
↗
→ align_and_count_summary
Input Mappings (1)
Input
Value
counts_txt
select_all(spikein.report)
Images
Container images used by tasks in this workflow:
python:slim
Used by 4 tasks:
group_sra_bams_by_biosample
biosample_to_genbank
prefix_gisaid
gisaid_meta_prep
ubuntu
Used by 2 tasks:
assembly_meta_tsv
submit_genomes
⚙️ Parameterized
Configured via input:
docker
Used by 2 tasks:
structured_comments
rename_fasta_header
⚙️ Parameterized
Configured via input:
docker
Used by 1 task:
package_genbank_ftp_submission
~{docker}
Used by 4 tasks:
multiqc_raw
spike_summary
fastqc
spikein
⚙️ Parameterized
Configured via input:
docker
⚙️ Parameterized
Configured via input:
docker
Zoom In
Zoom Out
Fit
Reset
🖱️ Scroll to zoom • Drag to pan • Double-click to reset • ESC to close
flowchart TD
Start([sarscov2_sra_to_genbank])
subgraph S1 ["🔃 scatter sra_id in SRA_accessions"]
direction TB
N1["Fetch_SRA_to_BAM"]
subgraph C1 ["↔️ if Fetch_SRA_to_BAM.num_reads >= min_reads_per_bam"]
direction TB
N2["fastqc"]
N3["spikeinalign_and_count "]
end
end
N4["group_sra_bams_by_biosample"]
subgraph S2 ["🔃 scatter samn_bam in zip(group_sra_bams_by_biosample.biosample_accessions, group_sra_bams_by_biosample.grouped_bam_filepaths)"]
direction TB
N5[/"assemble_refbased"/]
subgraph C2 ["↔️ if assemble_refbased.assembly_length_unambiguous >= min_genome_bases"]
direction TB
N6["rename_fasta_header"]
N7[/"sarscov2_lineages"/]
N8["vadr"]
end
end
N9["assembly_meta_tsvconcatenate "]
N10["submit_genomesconcatenate "]
N11["biosample_to_genbank"]
N12["structured_comments"]
N13["package_genbank_ftp_submissionpackage_special_genbank_ftp_submission "]
N14["prefix_gisaidprefix_fasta_header "]
N15["gisaid_meta_prep"]
N16["multiqc_rawMultiQC "]
N17["spike_summaryalign_and_count_summary "]
N1 --> N2
N1 --> N3
N1 --> N4
N4 --> N5
N5 --> N6
N4 --> N6
N5 --> N7
N4 --> N7
N6 --> N7
N5 --> N8
N4 --> N8
N6 --> N8
N5 --> N9
N7 --> N9
N8 --> N9
N4 --> N9
N6 --> N10
N4 --> N11
N5 --> N12
N4 --> N12
N10 --> N13
N11 --> N13
N12 --> N13
N10 --> N14
N12 --> N15
N11 --> N15
N2 --> N16
N3 --> N17
Start --> N1
N15 --> End([End])
N14 --> End([End])
N9 --> End([End])
N13 --> End([End])
N17 --> End([End])
N16 --> End([End])
classDef taskNode fill:#a371f7,stroke:#8b5cf6,stroke-width:2px,color:#fff
classDef workflowNode fill:#58a6ff,stroke:#1f6feb,stroke-width:2px,color:#fff
version 1.0
import "../tasks/tasks_ncbi.wdl" as ncbi
import "../tasks/tasks_ncbi_tools.wdl" as ncbi_tools
import "../tasks/tasks_reports.wdl" as reports
import "../tasks/tasks_utils.wdl" as utils
import "assemble_refbased.wdl"
import "sarscov2_lineages.wdl"
workflow sarscov2_sra_to_genbank {
meta {
description: "Full SARS-CoV-2 analysis workflow starting from SRA data and metadata and performing assembly, spike-in analysis, qc, lineage assignment, and packaging assemblies for data release."
author: "Broad Viral Genomics"
email: "viral-ngs@broadinstitute.org"
allowNestedInputs: true
}
parameter_meta {
SRA_accessions: {
description: "SRA Run accession numbers (SRR####)."
}
reference_fasta: {
description: "Reference genome to align reads to.",
patterns: ["*.fasta"]
}
amplicon_bed_default: {
description: "Amplicon primers to trim in reference coordinate space (0-based BED format). Will only be used if the SRA Experiment Design has a Library Strategy set to AMPLICON (will be ignored on all other values).",
patterns: ["*.bed"]
}
}
input {
Array[String] SRA_accessions
File reference_fasta
File amplicon_bed_default
File spikein_db
Int min_genome_bases = 15000
Int min_reads_per_bam = 100
Int max_vadr_alerts = 0
}
Int taxid = 2697049
String gisaid_prefix = 'hCoV-19/'
### retrieve reads and annotation from SRA
scatter(sra_id in SRA_accessions) {
call ncbi_tools.Fetch_SRA_to_BAM {
input:
SRA_ID = sra_id
}
if (Fetch_SRA_to_BAM.num_reads >= min_reads_per_bam) {
File reads_ubam = Fetch_SRA_to_BAM.reads_ubam
File biosample_attributes_json = Fetch_SRA_to_BAM.biosample_attributes_json
String library_strategy = Fetch_SRA_to_BAM.library_strategy
String biosample_accession = Fetch_SRA_to_BAM.biosample_accession
Int num_reads = Fetch_SRA_to_BAM.num_reads
String seq_platform = Fetch_SRA_to_BAM.sequencing_platform
call reports.fastqc {
input:
reads_bam = Fetch_SRA_to_BAM.reads_ubam
}
call reports.align_and_count as spikein {
input:
reads_bam = Fetch_SRA_to_BAM.reads_ubam,
ref_db = spikein_db
}
}
}
### gather data by biosample
call ncbi_tools.group_sra_bams_by_biosample {
input:
biosamples = select_all(biosample_accession),
bam_filepaths = select_all(reads_ubam),
biosample_attributes_jsons = select_all(biosample_attributes_json),
library_strategies = select_all(library_strategy),
seq_platforms = select_all(seq_platform)
}
### assembly and analyses per biosample
scatter(samn_bam in zip(group_sra_bams_by_biosample.biosample_accessions, group_sra_bams_by_biosample.grouped_bam_filepaths)) {
Boolean ampseq = (group_sra_bams_by_biosample.samn_to_library_strategy[samn_bam.left] == "AMPLICON")
String orig_name = group_sra_bams_by_biosample.samn_to_attributes[samn_bam.left]["sample_name"]
# assemble genome
if (ampseq) {
String trim_coords_bed = amplicon_bed_default
}
call assemble_refbased.assemble_refbased {
input:
reads_unmapped_bams = samn_bam.right,
reference_fasta = reference_fasta,
sample_name = samn_bam.left,
aligner = "minimap2",
skip_mark_dupes = ampseq,
trim_coords_bed = trim_coords_bed,
min_coverage = if ampseq then 20 else 3
}
# for genomes that somewhat assemble
if (assemble_refbased.assembly_length_unambiguous >= min_genome_bases) {
call ncbi.rename_fasta_header {
input:
genome_fasta = assemble_refbased.assembly_fasta,
new_name = orig_name
}
File passing_assemblies = rename_fasta_header.renamed_fasta
String passing_assembly_ids = orig_name
Array[String] assembly_cmt = [orig_name, "Broad viral-ngs v. " + assemble_refbased.align_to_ref_viral_core_version, assemble_refbased.assembly_mean_coverage, group_sra_bams_by_biosample.samn_to_sequencing_platform[samn_bam.left]]
# lineage assignment
call sarscov2_lineages.sarscov2_lineages {
input:
genome_fasta = passing_assemblies
}
# VADR annotation & QC
call ncbi.vadr {
input:
genome_fasta = passing_assemblies,
vadr_opts = "--glsearch -s -r --nomisc --mkey sarscov2 --lowsim5seq 6 --lowsim3seq 6 --alt_fail lowscore,insertnn,deletinn",
minlen = 50,
maxlen = 30000
}
if (vadr.num_alerts<=max_vadr_alerts) {
File submittable_genomes = passing_assemblies
String submittable_id = orig_name
}
if (vadr.num_alerts>max_vadr_alerts) {
String failed_annotation_id = orig_name
}
}
if (assemble_refbased.assembly_length_unambiguous < min_genome_bases) {
String failed_assembly_id = orig_name
}
Array[String] assembly_tsv_row = [
orig_name,
samn_bam.left,
basename(select_first([trim_coords_bed, ""])),
assemble_refbased.assembly_mean_coverage,
assemble_refbased.assembly_length_unambiguous,
select_first([sarscov2_lineages.nextclade_clade, ""]),
select_first([sarscov2_lineages.nextclade_aa_subs, ""]),
select_first([sarscov2_lineages.nextclade_aa_dels, ""]),
select_first([sarscov2_lineages.pango_lineage, ""]),
assemble_refbased.dist_to_ref_snps,
assemble_refbased.dist_to_ref_indels,
select_first([vadr.num_alerts, ""]),
assemble_refbased.assembly_fasta,
assemble_refbased.align_to_ref_merged_coverage_plot,
assemble_refbased.align_to_ref_merged_aligned_trimmed_only_bam,
assemble_refbased.replicate_discordant_vcf,
select_first([sarscov2_lineages.nextclade_tsv, ""]),
select_first([sarscov2_lineages.nextclade_json, ""]),
select_first([sarscov2_lineages.pango_lineage_report, ""]),
select_first([vadr.outputs_tgz, ""]),
assemble_refbased.replicate_concordant_sites,
assemble_refbased.replicate_discordant_snps,
assemble_refbased.replicate_discordant_indels,
assemble_refbased.num_read_groups,
assemble_refbased.num_libraries,
assemble_refbased.align_to_ref_merged_reads_aligned,
assemble_refbased.align_to_ref_merged_bases_aligned,
]
}
Array[String] assembly_tsv_header = [
'sample', 'sample_sanitized', 'amplicon_set', 'assembly_mean_coverage', 'assembly_length_unambiguous',
'nextclade_clade', 'nextclade_aa_subs', 'nextclade_aa_dels', 'pango_lineage',
'dist_to_ref_snps', 'dist_to_ref_indels', 'vadr_num_alerts',
'assembly_fasta', 'coverage_plot', 'aligned_bam', 'replicate_discordant_vcf',
'nextclade_tsv', 'nextclade_json', 'pangolin_csv', 'vadr_tgz',
'replicate_concordant_sites', 'replicate_discordant_snps', 'replicate_discordant_indels', 'num_read_groups', 'num_libraries',
'align_to_ref_merged_reads_aligned', 'align_to_ref_merged_bases_aligned',
]
call utils.concatenate as assembly_meta_tsv {
input:
infiles = [write_tsv([assembly_tsv_header]), write_tsv(assembly_tsv_row)],
output_name = "assembly_metadata.tsv"
}
### prep genbank submission
call utils.concatenate as submit_genomes {
input:
infiles = select_all(submittable_genomes),
output_name = "assemblies.fasta"
}
call ncbi.biosample_to_genbank {
input:
biosample_attributes = group_sra_bams_by_biosample.biosample_attributes_tsv,
num_segments = 1,
taxid = taxid,
filter_to_ids = write_lines(select_all(submittable_id))
}
call ncbi.structured_comments {
input:
assembly_stats_tsv = write_tsv(flatten([[['SeqID','Assembly Method','Coverage','Sequencing Technology']],select_all(assembly_cmt)])),
filter_to_ids = write_lines(select_all(submittable_id))
}
call ncbi.package_special_genbank_ftp_submission as package_genbank_ftp_submission {
input:
sequences_fasta = submit_genomes.combined,
source_modifier_table = biosample_to_genbank.genbank_source_modifier_table,
structured_comment_table = structured_comments.structured_comment_table
}
### prep gisaid submission
call ncbi.prefix_fasta_header as prefix_gisaid {
input:
genome_fasta = submit_genomes.combined,
prefix = gisaid_prefix
}
call ncbi.gisaid_meta_prep {
input:
source_modifier_table = biosample_to_genbank.genbank_source_modifier_table,
structured_comments = structured_comments.structured_comment_table,
out_name = "gisaid_meta.csv",
strict = false
}
#### summary stats
call reports.MultiQC as multiqc_raw {
input:
input_files = select_all(fastqc.fastqc_zip),
file_name = "multiqc-raw.html"
}
call reports.align_and_count_summary as spike_summary {
input:
counts_txt = select_all(spikein.report)
}
output {
Array[File] raw_reads_unaligned_bams = select_all(reads_ubam)
Array[File] cleaned_reads_unaligned_bams = select_all(reads_ubam)
File biosample_attributes = group_sra_bams_by_biosample.biosample_attributes_tsv
Array[Int] read_counts_raw = select_all(num_reads)
Array[Int] read_counts_depleted = select_all(num_reads)
Array[Int] primer_trimmed_read_count = flatten(assemble_refbased.primer_trimmed_read_count)
Array[Float] primer_trimmed_read_percent = flatten(assemble_refbased.primer_trimmed_read_percent)
Array[File] assemblies_fasta = assemble_refbased.assembly_fasta
Array[File] passing_assemblies_fasta = select_all(passing_assemblies)
Array[File] submittable_assemblies_fasta = select_all(submittable_genomes)
File multiqc_report_raw = multiqc_raw.multiqc_report
File spikein_counts = spike_summary.count_summary
File assembly_stats_tsv = assembly_meta_tsv.combined
File submission_zip = package_genbank_ftp_submission.submission_zip
File submission_xml = package_genbank_ftp_submission.submission_xml
File submit_ready = package_genbank_ftp_submission.submit_ready
Array[File] vadr_outputs = select_all(vadr.outputs_tgz)
File genbank_source_table = biosample_to_genbank.genbank_source_modifier_table
File gisaid_fasta = prefix_gisaid.renamed_fasta
File gisaid_meta_csv = gisaid_meta_prep.meta_csv
Array[String] assembled_ids = select_all(passing_assembly_ids)
Array[String] submittable_ids = select_all(submittable_id)
Array[String] failed_assembly_ids = select_all(failed_assembly_id)
Array[String] failed_annotation_ids = select_all(failed_annotation_id)
Int num_read_files = length(Fetch_SRA_to_BAM.reads_ubam)
Int num_assembled = length(select_all(passing_assemblies))
Int num_failed_assembly = length(select_all(failed_assembly_id))
Int num_submittable = length(select_all(submittable_id))
Int num_failed_annotation = length(select_all(failed_annotation_id))
Int num_samples = length(group_sra_bams_by_biosample.biosample_accessions)
}
}
version 1.0
import "../tasks/tasks_ncbi.wdl" as ncbi
import "../tasks/tasks_ncbi_tools.wdl" as ncbi_tools
import "../tasks/tasks_reports.wdl" as reports
import "../tasks/tasks_utils.wdl" as utils
import "assemble_refbased.wdl"
import "sarscov2_lineages.wdl"
workflow sarscov2_sra_to_genbank {
meta {
description: "Full SARS-CoV-2 analysis workflow starting from SRA data and metadata and performing assembly, spike-in analysis, qc, lineage assignment, and packaging assemblies for data release."
author: "Broad Viral Genomics"
email: "viral-ngs@broadinstitute.org"
allowNestedInputs: true
}
parameter_meta {
SRA_accessions: {
description: "SRA Run accession numbers (SRR####)."
}
reference_fasta: {
description: "Reference genome to align reads to.",
patterns: ["*.fasta"]
}
amplicon_bed_default: {
description: "Amplicon primers to trim in reference coordinate space (0-based BED format). Will only be used if the SRA Experiment Design has a Library Strategy set to AMPLICON (will be ignored on all other values).",
patterns: ["*.bed"]
}
}
input {
Array[String] SRA_accessions
File reference_fasta
File amplicon_bed_default
File spikein_db
Int min_genome_bases = 15000
Int min_reads_per_bam = 100
Int max_vadr_alerts = 0
}
Int taxid = 2697049
String gisaid_prefix = 'hCoV-19/'
### retrieve reads and annotation from SRA
scatter(sra_id in SRA_accessions) {
call ncbi_tools.Fetch_SRA_to_BAM {
input:
SRA_ID = sra_id
}
if (Fetch_SRA_to_BAM.num_reads >= min_reads_per_bam) {
File reads_ubam = Fetch_SRA_to_BAM.reads_ubam
File biosample_attributes_json = Fetch_SRA_to_BAM.biosample_attributes_json
String library_strategy = Fetch_SRA_to_BAM.library_strategy
String biosample_accession = Fetch_SRA_to_BAM.biosample_accession
Int num_reads = Fetch_SRA_to_BAM.num_reads
String seq_platform = Fetch_SRA_to_BAM.sequencing_platform
call reports.fastqc {
input:
reads_bam = Fetch_SRA_to_BAM.reads_ubam
}
call reports.align_and_count as spikein {
input:
reads_bam = Fetch_SRA_to_BAM.reads_ubam,
ref_db = spikein_db
}
}
}
### gather data by biosample
call ncbi_tools.group_sra_bams_by_biosample {
input:
biosamples = select_all(biosample_accession),
bam_filepaths = select_all(reads_ubam),
biosample_attributes_jsons = select_all(biosample_attributes_json),
library_strategies = select_all(library_strategy),
seq_platforms = select_all(seq_platform)
}
### assembly and analyses per biosample
scatter(samn_bam in zip(group_sra_bams_by_biosample.biosample_accessions, group_sra_bams_by_biosample.grouped_bam_filepaths)) {
Boolean ampseq = (group_sra_bams_by_biosample.samn_to_library_strategy[samn_bam.left] == "AMPLICON")
String orig_name = group_sra_bams_by_biosample.samn_to_attributes[samn_bam.left]["sample_name"]
# assemble genome
if (ampseq) {
String trim_coords_bed = amplicon_bed_default
}
call assemble_refbased.assemble_refbased {
input:
reads_unmapped_bams = samn_bam.right,
reference_fasta = reference_fasta,
sample_name = samn_bam.left,
aligner = "minimap2",
skip_mark_dupes = ampseq,
trim_coords_bed = trim_coords_bed,
min_coverage = if ampseq then 20 else 3
}
# for genomes that somewhat assemble
if (assemble_refbased.assembly_length_unambiguous >= min_genome_bases) {
call ncbi.rename_fasta_header {
input:
genome_fasta = assemble_refbased.assembly_fasta,
new_name = orig_name
}
File passing_assemblies = rename_fasta_header.renamed_fasta
String passing_assembly_ids = orig_name
Array[String] assembly_cmt = [orig_name, "Broad viral-ngs v. " + assemble_refbased.align_to_ref_viral_core_version, assemble_refbased.assembly_mean_coverage, group_sra_bams_by_biosample.samn_to_sequencing_platform[samn_bam.left]]
# lineage assignment
call sarscov2_lineages.sarscov2_lineages {
input:
genome_fasta = passing_assemblies
}
# VADR annotation & QC
call ncbi.vadr {
input:
genome_fasta = passing_assemblies,
vadr_opts = "--glsearch -s -r --nomisc --mkey sarscov2 --lowsim5seq 6 --lowsim3seq 6 --alt_fail lowscore,insertnn,deletinn",
minlen = 50,
maxlen = 30000
}
if (vadr.num_alerts<=max_vadr_alerts) {
File submittable_genomes = passing_assemblies
String submittable_id = orig_name
}
if (vadr.num_alerts>max_vadr_alerts) {
String failed_annotation_id = orig_name
}
}
if (assemble_refbased.assembly_length_unambiguous < min_genome_bases) {
String failed_assembly_id = orig_name
}
Array[String] assembly_tsv_row = [
orig_name,
samn_bam.left,
basename(select_first([trim_coords_bed, ""])),
assemble_refbased.assembly_mean_coverage,
assemble_refbased.assembly_length_unambiguous,
select_first([sarscov2_lineages.nextclade_clade, ""]),
select_first([sarscov2_lineages.nextclade_aa_subs, ""]),
select_first([sarscov2_lineages.nextclade_aa_dels, ""]),
select_first([sarscov2_lineages.pango_lineage, ""]),
assemble_refbased.dist_to_ref_snps,
assemble_refbased.dist_to_ref_indels,
select_first([vadr.num_alerts, ""]),
assemble_refbased.assembly_fasta,
assemble_refbased.align_to_ref_merged_coverage_plot,
assemble_refbased.align_to_ref_merged_aligned_trimmed_only_bam,
assemble_refbased.replicate_discordant_vcf,
select_first([sarscov2_lineages.nextclade_tsv, ""]),
select_first([sarscov2_lineages.nextclade_json, ""]),
select_first([sarscov2_lineages.pango_lineage_report, ""]),
select_first([vadr.outputs_tgz, ""]),
assemble_refbased.replicate_concordant_sites,
assemble_refbased.replicate_discordant_snps,
assemble_refbased.replicate_discordant_indels,
assemble_refbased.num_read_groups,
assemble_refbased.num_libraries,
assemble_refbased.align_to_ref_merged_reads_aligned,
assemble_refbased.align_to_ref_merged_bases_aligned,
]
}
Array[String] assembly_tsv_header = [
'sample', 'sample_sanitized', 'amplicon_set', 'assembly_mean_coverage', 'assembly_length_unambiguous',
'nextclade_clade', 'nextclade_aa_subs', 'nextclade_aa_dels', 'pango_lineage',
'dist_to_ref_snps', 'dist_to_ref_indels', 'vadr_num_alerts',
'assembly_fasta', 'coverage_plot', 'aligned_bam', 'replicate_discordant_vcf',
'nextclade_tsv', 'nextclade_json', 'pangolin_csv', 'vadr_tgz',
'replicate_concordant_sites', 'replicate_discordant_snps', 'replicate_discordant_indels', 'num_read_groups', 'num_libraries',
'align_to_ref_merged_reads_aligned', 'align_to_ref_merged_bases_aligned',
]
call utils.concatenate as assembly_meta_tsv {
input:
infiles = [write_tsv([assembly_tsv_header]), write_tsv(assembly_tsv_row)],
output_name = "assembly_metadata.tsv"
}
### prep genbank submission
call utils.concatenate as submit_genomes {
input:
infiles = select_all(submittable_genomes),
output_name = "assemblies.fasta"
}
call ncbi.biosample_to_genbank {
input:
biosample_attributes = group_sra_bams_by_biosample.biosample_attributes_tsv,
num_segments = 1,
taxid = taxid,
filter_to_ids = write_lines(select_all(submittable_id))
}
call ncbi.structured_comments {
input:
assembly_stats_tsv = write_tsv(flatten([[['SeqID','Assembly Method','Coverage','Sequencing Technology']],select_all(assembly_cmt)])),
filter_to_ids = write_lines(select_all(submittable_id))
}
call ncbi.package_special_genbank_ftp_submission as package_genbank_ftp_submission {
input:
sequences_fasta = submit_genomes.combined,
source_modifier_table = biosample_to_genbank.genbank_source_modifier_table,
structured_comment_table = structured_comments.structured_comment_table
}
### prep gisaid submission
call ncbi.prefix_fasta_header as prefix_gisaid {
input:
genome_fasta = submit_genomes.combined,
prefix = gisaid_prefix
}
call ncbi.gisaid_meta_prep {
input:
source_modifier_table = biosample_to_genbank.genbank_source_modifier_table,
structured_comments = structured_comments.structured_comment_table,
out_name = "gisaid_meta.csv",
strict = false
}
#### summary stats
call reports.MultiQC as multiqc_raw {
input:
input_files = select_all(fastqc.fastqc_zip),
file_name = "multiqc-raw.html"
}
call reports.align_and_count_summary as spike_summary {
input:
counts_txt = select_all(spikein.report)
}
output {
Array[File] raw_reads_unaligned_bams = select_all(reads_ubam)
Array[File] cleaned_reads_unaligned_bams = select_all(reads_ubam)
File biosample_attributes = group_sra_bams_by_biosample.biosample_attributes_tsv
Array[Int] read_counts_raw = select_all(num_reads)
Array[Int] read_counts_depleted = select_all(num_reads)
Array[Int] primer_trimmed_read_count = flatten(assemble_refbased.primer_trimmed_read_count)
Array[Float] primer_trimmed_read_percent = flatten(assemble_refbased.primer_trimmed_read_percent)
Array[File] assemblies_fasta = assemble_refbased.assembly_fasta
Array[File] passing_assemblies_fasta = select_all(passing_assemblies)
Array[File] submittable_assemblies_fasta = select_all(submittable_genomes)
File multiqc_report_raw = multiqc_raw.multiqc_report
File spikein_counts = spike_summary.count_summary
File assembly_stats_tsv = assembly_meta_tsv.combined
File submission_zip = package_genbank_ftp_submission.submission_zip
File submission_xml = package_genbank_ftp_submission.submission_xml
File submit_ready = package_genbank_ftp_submission.submit_ready
Array[File] vadr_outputs = select_all(vadr.outputs_tgz)
File genbank_source_table = biosample_to_genbank.genbank_source_modifier_table
File gisaid_fasta = prefix_gisaid.renamed_fasta
File gisaid_meta_csv = gisaid_meta_prep.meta_csv
Array[String] assembled_ids = select_all(passing_assembly_ids)
Array[String] submittable_ids = select_all(submittable_id)
Array[String] failed_assembly_ids = select_all(failed_assembly_id)
Array[String] failed_annotation_ids = select_all(failed_annotation_id)
Int num_read_files = length(Fetch_SRA_to_BAM.reads_ubam)
Int num_assembled = length(select_all(passing_assemblies))
Int num_failed_assembly = length(select_all(failed_assembly_id))
Int num_submittable = length(select_all(submittable_id))
Int num_failed_annotation = length(select_all(failed_annotation_id))
Int num_samples = length(group_sra_bams_by_biosample.biosample_accessions)
}
}