sarscov2_sra_to_genbank
pipes/WDL/workflows/sarscov2_sra_to_genbank.wdl

WORKFLOW sarscov2_sra_to_genbank

File Path pipes/WDL/workflows/sarscov2_sra_to_genbank.wdl
WDL Version 1.0
Type workflow

Imports

Namespace Path
ncbi ../tasks/tasks_ncbi.wdl
ncbi_tools ../tasks/tasks_ncbi_tools.wdl
reports ../tasks/tasks_reports.wdl
utils ../tasks/tasks_utils.wdl
assemble_refbased assemble_refbased.wdl
sarscov2_lineages sarscov2_lineages.wdl

Workflow: sarscov2_sra_to_genbank

Full SARS-CoV-2 analysis workflow starting from SRA data and metadata and performing assembly, spike-in analysis, qc, lineage assignment, and packaging assemblies for data release.

Author: Broad Viral Genomics
viral-ngs@broadinstitute.org

Inputs

Name Type Description Default
SRA_accessions Array[String] SRA Run accession numbers (SRR####). -
reference_fasta File Reference genome to align reads to. -
amplicon_bed_default File Amplicon primers to trim in reference coordinate space (0-based BED format). Will only be used if the SRA Experiment Design has a Library Strategy set to AMPLICON (will be ignored on all other values). -
spikein_db File - -
sample_name String? - -
email_address String? - -
ncbi_api_key String? - -
machine_mem_gb Int? - -
machine_mem_gb Int? - -
sample_original_name String? - -
novocraft_license File? - -
machine_mem_gb Int? - -
min_keep_length Int? - -
sliding_window Int? - -
primer_offset Int? - -
machine_mem_gb Int? - -
reheader_table File? - -
amplicon_set String? - -
max_coverage_depth Int? - -
base_q_threshold Int? - -
mapping_q_threshold Int? - -
read_length_threshold Int? - -
plotXLimits String? - -
plotYLimits String? - -
machine_mem_gb Int? - -
reheader_table File? - -
max_coverage_depth Int? - -
base_q_threshold Int? - -
mapping_q_threshold Int? - -
read_length_threshold Int? - -
plotXLimits String? - -
plotYLimits String? - -
root_sequence File? - -
auspice_reference_tree_json File? - -
pathogen_json File? - -
gene_annotations_json File? - -
min_length Int? - -
max_ambig Float? - -
analysis_mode String? - -
vadr_model_tar File? - -
vadr_model_tar_subdir String? - -
filter_to_accession String? - -
organism_name_override String? - -
sequence_id_override String? - -
isolate_prefix_override String? - -
source_overrides_json File? - -
author_template_sbt File - -
submission_name String - -
submission_uid String - -
spuid_namespace String - -
account_name String - -
username String? - -
submitting_lab_name String - -
fasta_filename String? - -
title String? - -
comment String? - -
template String? - -
tag String? - -
ignore_analysis_files String? - -
ignore_sample_names String? - -
sample_names File? - -
exclude_modules Array[String]? - -
module_to_use Array[String]? - -
output_data_format String? - -
config File? - -
config_yaml String? - -
97 optional inputs with default values

Outputs

Name Type Expression
raw_reads_unaligned_bams Array[File] select_all(reads_ubam)
cleaned_reads_unaligned_bams Array[File] select_all(reads_ubam)
biosample_attributes File group_sra_bams_by_biosample.biosample_attributes_tsv
read_counts_raw Array[Int] select_all(num_reads)
read_counts_depleted Array[Int] select_all(num_reads)
primer_trimmed_read_count Array[Int] flatten(assemble_refbased.primer_trimmed_read_count)
primer_trimmed_read_percent Array[Float] flatten(assemble_refbased.primer_trimmed_read_percent)
assemblies_fasta Array[File] assemble_refbased.assembly_fasta
passing_assemblies_fasta Array[File] select_all(passing_assemblies)
submittable_assemblies_fasta Array[File] select_all(submittable_genomes)
multiqc_report_raw File multiqc_raw.multiqc_report
spikein_counts File spike_summary.count_summary
assembly_stats_tsv File assembly_meta_tsv.combined
submission_zip File package_genbank_ftp_submission.submission_zip
submission_xml File package_genbank_ftp_submission.submission_xml
submit_ready File package_genbank_ftp_submission.submit_ready
vadr_outputs Array[File] select_all(vadr.outputs_tgz)
genbank_source_table File biosample_to_genbank.genbank_source_modifier_table
gisaid_fasta File prefix_gisaid.renamed_fasta
gisaid_meta_csv File gisaid_meta_prep.meta_csv
assembled_ids Array[String] select_all(passing_assembly_ids)
submittable_ids Array[String] select_all(submittable_id)
failed_assembly_ids Array[String] select_all(failed_assembly_id)
failed_annotation_ids Array[String] select_all(failed_annotation_id)
num_read_files Int length(Fetch_SRA_to_BAM.reads_ubam)
num_assembled Int length(select_all(passing_assemblies))
num_failed_assembly Int length(select_all(failed_assembly_id))
num_submittable Int length(select_all(submittable_id))
num_failed_annotation Int length(select_all(failed_annotation_id))
num_samples Int length(group_sra_bams_by_biosample.biosample_accessions)

Calls

This workflow calls the following tasks or subworkflows:

CALL TASKS Fetch_SRA_to_BAM

Input Mappings (1)
Input Value
SRA_ID sra_id

CALL TASKS fastqc

Input Mappings (1)
Input Value
reads_bam Fetch_SRA_to_BAM.reads_ubam

CALL TASKS spikein → align_and_count

Input Mappings (2)
Input Value
reads_bam Fetch_SRA_to_BAM.reads_ubam
ref_db spikein_db

CALL TASKS group_sra_bams_by_biosample

Input Mappings (5)
Input Value
biosamples select_all(biosample_accession)
bam_filepaths select_all(reads_ubam)
biosample_attributes_jsons select_all(biosample_attributes_json)
library_strategies select_all(library_strategy)
seq_platforms select_all(seq_platform)

CALL WORKFLOW assemble_refbased

Input Mappings (7)
Input Value
reads_unmapped_bams samn_bam.right
reference_fasta reference_fasta
sample_name samn_bam.left
aligner "minimap2"
skip_mark_dupes ampseq
trim_coords_bed trim_coords_bed
min_coverage if ampseq then 20 else 3

CALL TASKS rename_fasta_header

Input Mappings (2)
Input Value
genome_fasta assemble_refbased.assembly_fasta
new_name orig_name

CALL WORKFLOW sarscov2_lineages

Input Mappings (1)
Input Value
genome_fasta passing_assemblies

CALL TASKS vadr

Input Mappings (4)
Input Value
genome_fasta passing_assemblies
vadr_opts "--glsearch -s -r --nomisc --mkey sarscov2 --lowsim5seq 6 --lowsim3seq 6 --alt_fail lowscore,insertnn,deletinn"
minlen 50
maxlen 30000

CALL TASKS assembly_meta_tsv → concatenate

Input Mappings (2)
Input Value
infiles [write_tsv([assembly_tsv_header]), write_tsv(assembly_tsv_row)]
output_name "assembly_metadata.tsv"

CALL TASKS submit_genomes → concatenate

Input Mappings (2)
Input Value
infiles select_all(submittable_genomes)
output_name "assemblies.fasta"

CALL TASKS biosample_to_genbank

Input Mappings (4)
Input Value
biosample_attributes group_sra_bams_by_biosample.biosample_attributes_tsv
num_segments 1
taxid taxid
filter_to_ids write_lines(select_all(submittable_id))

CALL TASKS structured_comments

Input Mappings (2)
Input Value
assembly_stats_tsv write_tsv(flatten([[['SeqID', 'Assembly Method', 'Coverage', 'Sequencing Technology']], select_all(assembly_cmt)]))
filter_to_ids write_lines(select_all(submittable_id))

CALL TASKS package_genbank_ftp_submission → package_special_genbank_ftp_submission

Input Mappings (3)
Input Value
sequences_fasta submit_genomes.combined
source_modifier_table biosample_to_genbank.genbank_source_modifier_table
structured_comment_table structured_comments.structured_comment_table

CALL TASKS prefix_gisaid → prefix_fasta_header

Input Mappings (2)
Input Value
genome_fasta submit_genomes.combined
prefix gisaid_prefix

CALL TASKS gisaid_meta_prep

Input Mappings (4)
Input Value
source_modifier_table biosample_to_genbank.genbank_source_modifier_table
structured_comments structured_comments.structured_comment_table
out_name "gisaid_meta.csv"
strict false

CALL TASKS multiqc_raw → MultiQC

Input Mappings (2)
Input Value
input_files select_all(fastqc.fastqc_zip)
file_name "multiqc-raw.html"

CALL TASKS spike_summary → align_and_count_summary

Input Mappings (1)
Input Value
counts_txt select_all(spikein.report)

Images

Container images used by tasks in this workflow:

🐳 python

python:slim

Used by 4 tasks:
  • group_sra_bams_by_biosample
  • biosample_to_genbank
  • prefix_gisaid
  • gisaid_meta_prep
🐳 ubuntu

ubuntu

Used by 2 tasks:
  • assembly_meta_tsv
  • submit_genomes
🐳 Parameterized Image
⚙️ Parameterized

Configured via input:
docker

Used by 2 tasks:
  • structured_comments
  • rename_fasta_header
🐳 Parameterized Image
⚙️ Parameterized

Configured via input:
docker

Used by 1 task:
  • package_genbank_ftp_submission
🐳 ~{docker}

~{docker}

Used by 4 tasks:
  • multiqc_raw
  • spike_summary
  • fastqc
  • spikein
🐳 Parameterized Image
⚙️ Parameterized

Configured via input:
docker

Used by 1 task:
  • Fetch_SRA_to_BAM
🐳 Parameterized Image
⚙️ Parameterized

Configured via input:
docker

Used by 1 task:
  • vadr
← Back to Index

sarscov2_sra_to_genbank - Workflow Graph

🖱️ Scroll to zoom • Drag to pan • Double-click to reset • ESC to close

sarscov2_sra_to_genbank - WDL Source Code

version 1.0

import "../tasks/tasks_ncbi.wdl" as ncbi
import "../tasks/tasks_ncbi_tools.wdl" as ncbi_tools
import "../tasks/tasks_reports.wdl" as reports
import "../tasks/tasks_utils.wdl" as utils

import "assemble_refbased.wdl"
import "sarscov2_lineages.wdl"

workflow sarscov2_sra_to_genbank {
    meta {
        description: "Full SARS-CoV-2 analysis workflow starting from SRA data and metadata and performing assembly, spike-in analysis, qc, lineage assignment, and packaging assemblies for data release."
        author: "Broad Viral Genomics"
        email:  "viral-ngs@broadinstitute.org"
        allowNestedInputs: true
    }

    parameter_meta {
        SRA_accessions: {
            description: "SRA Run accession numbers (SRR####)."
        }
        reference_fasta: {
            description: "Reference genome to align reads to.",
            patterns: ["*.fasta"]
        }
        amplicon_bed_default: {
            description: "Amplicon primers to trim in reference coordinate space (0-based BED format). Will only be used if the SRA Experiment Design has a Library Strategy set to AMPLICON (will be ignored on all other values).",
            patterns: ["*.bed"]
        }
    }

    input {
        Array[String] SRA_accessions

        File          reference_fasta
        File          amplicon_bed_default
        File          spikein_db

        Int           min_genome_bases = 15000
        Int           min_reads_per_bam = 100
        Int           max_vadr_alerts = 0
    }
    Int     taxid = 2697049
    String  gisaid_prefix = 'hCoV-19/'

    ### retrieve reads and annotation from SRA
    scatter(sra_id in SRA_accessions) {
        call ncbi_tools.Fetch_SRA_to_BAM {
            input:
                SRA_ID = sra_id
        }
        if (Fetch_SRA_to_BAM.num_reads >= min_reads_per_bam) {
            File reads_ubam                = Fetch_SRA_to_BAM.reads_ubam
            File biosample_attributes_json = Fetch_SRA_to_BAM.biosample_attributes_json
            String library_strategy        = Fetch_SRA_to_BAM.library_strategy
            String biosample_accession     = Fetch_SRA_to_BAM.biosample_accession
            Int num_reads                  = Fetch_SRA_to_BAM.num_reads
            String seq_platform            = Fetch_SRA_to_BAM.sequencing_platform
            call reports.fastqc {
                input:
                    reads_bam = Fetch_SRA_to_BAM.reads_ubam
            }
            call reports.align_and_count as spikein {
                input:
                    reads_bam = Fetch_SRA_to_BAM.reads_ubam,
                    ref_db    = spikein_db
            }
        }
    }

    ### gather data by biosample
    call ncbi_tools.group_sra_bams_by_biosample {
        input:
            biosamples                 = select_all(biosample_accession),
            bam_filepaths              = select_all(reads_ubam),
            biosample_attributes_jsons = select_all(biosample_attributes_json),
            library_strategies         = select_all(library_strategy),
            seq_platforms              = select_all(seq_platform)
    }

    ### assembly and analyses per biosample
    scatter(samn_bam in zip(group_sra_bams_by_biosample.biosample_accessions, group_sra_bams_by_biosample.grouped_bam_filepaths)) {
        Boolean ampseq   = (group_sra_bams_by_biosample.samn_to_library_strategy[samn_bam.left] == "AMPLICON")
        String orig_name = group_sra_bams_by_biosample.samn_to_attributes[samn_bam.left]["sample_name"]

        # assemble genome
        if (ampseq) {
            String trim_coords_bed = amplicon_bed_default
        }
        call assemble_refbased.assemble_refbased {
            input:
                reads_unmapped_bams = samn_bam.right,
                reference_fasta     = reference_fasta,
                sample_name         = samn_bam.left,
                aligner             = "minimap2",
                skip_mark_dupes     = ampseq,
                trim_coords_bed     = trim_coords_bed,
                min_coverage        = if ampseq then 20 else 3
        }

        # for genomes that somewhat assemble
        if (assemble_refbased.assembly_length_unambiguous >= min_genome_bases) {
            call ncbi.rename_fasta_header {
              input:
                genome_fasta = assemble_refbased.assembly_fasta,
                new_name     = orig_name
            }

            File passing_assemblies     = rename_fasta_header.renamed_fasta
            String passing_assembly_ids = orig_name
            Array[String] assembly_cmt  = [orig_name, "Broad viral-ngs v. " + assemble_refbased.align_to_ref_viral_core_version, assemble_refbased.assembly_mean_coverage, group_sra_bams_by_biosample.samn_to_sequencing_platform[samn_bam.left]]

            # lineage assignment
            call sarscov2_lineages.sarscov2_lineages {
                input:
                    genome_fasta = passing_assemblies
            }

            # VADR annotation & QC
            call ncbi.vadr {
              input:
                genome_fasta = passing_assemblies,
                vadr_opts = "--glsearch -s -r --nomisc --mkey sarscov2 --lowsim5seq 6 --lowsim3seq 6 --alt_fail lowscore,insertnn,deletinn",
                minlen = 50,
                maxlen = 30000
            }
            if (vadr.num_alerts<=max_vadr_alerts) {
              File submittable_genomes = passing_assemblies
              String submittable_id    = orig_name
            }
            if (vadr.num_alerts>max_vadr_alerts) {
              String failed_annotation_id = orig_name
            }
        }
        if (assemble_refbased.assembly_length_unambiguous < min_genome_bases) {
            String failed_assembly_id = orig_name
        }

        Array[String] assembly_tsv_row = [
            orig_name,
            samn_bam.left,
            basename(select_first([trim_coords_bed, ""])),
            assemble_refbased.assembly_mean_coverage,
            assemble_refbased.assembly_length_unambiguous,
            select_first([sarscov2_lineages.nextclade_clade, ""]),
            select_first([sarscov2_lineages.nextclade_aa_subs, ""]),
            select_first([sarscov2_lineages.nextclade_aa_dels, ""]),
            select_first([sarscov2_lineages.pango_lineage, ""]),
            assemble_refbased.dist_to_ref_snps,
            assemble_refbased.dist_to_ref_indels,
            select_first([vadr.num_alerts, ""]),
            assemble_refbased.assembly_fasta,
            assemble_refbased.align_to_ref_merged_coverage_plot,
            assemble_refbased.align_to_ref_merged_aligned_trimmed_only_bam,
            assemble_refbased.replicate_discordant_vcf,
            select_first([sarscov2_lineages.nextclade_tsv, ""]),
            select_first([sarscov2_lineages.nextclade_json, ""]),
            select_first([sarscov2_lineages.pango_lineage_report, ""]),
            select_first([vadr.outputs_tgz, ""]),
            assemble_refbased.replicate_concordant_sites,
            assemble_refbased.replicate_discordant_snps,
            assemble_refbased.replicate_discordant_indels,
            assemble_refbased.num_read_groups,
            assemble_refbased.num_libraries,
            assemble_refbased.align_to_ref_merged_reads_aligned,
            assemble_refbased.align_to_ref_merged_bases_aligned,
        ]
    }

    Array[String] assembly_tsv_header = [
        'sample', 'sample_sanitized', 'amplicon_set', 'assembly_mean_coverage', 'assembly_length_unambiguous',
        'nextclade_clade', 'nextclade_aa_subs', 'nextclade_aa_dels', 'pango_lineage',
        'dist_to_ref_snps', 'dist_to_ref_indels', 'vadr_num_alerts',
        'assembly_fasta', 'coverage_plot', 'aligned_bam', 'replicate_discordant_vcf',
        'nextclade_tsv', 'nextclade_json', 'pangolin_csv', 'vadr_tgz',
        'replicate_concordant_sites', 'replicate_discordant_snps', 'replicate_discordant_indels', 'num_read_groups', 'num_libraries',
        'align_to_ref_merged_reads_aligned', 'align_to_ref_merged_bases_aligned',
        ]

    call utils.concatenate as assembly_meta_tsv {
      input:
        infiles = [write_tsv([assembly_tsv_header]), write_tsv(assembly_tsv_row)],
        output_name = "assembly_metadata.tsv"
    }

    ### prep genbank submission
    call utils.concatenate as submit_genomes {
      input:
        infiles     = select_all(submittable_genomes),
        output_name = "assemblies.fasta"
    }
    call ncbi.biosample_to_genbank {
      input:
        biosample_attributes = group_sra_bams_by_biosample.biosample_attributes_tsv,
        num_segments         = 1,
        taxid                = taxid,
        filter_to_ids        = write_lines(select_all(submittable_id))
    }
    call ncbi.structured_comments {
      input:
        assembly_stats_tsv = write_tsv(flatten([[['SeqID','Assembly Method','Coverage','Sequencing Technology']],select_all(assembly_cmt)])),
        filter_to_ids      = write_lines(select_all(submittable_id))
    }
    call ncbi.package_special_genbank_ftp_submission as package_genbank_ftp_submission {
      input:
        sequences_fasta          = submit_genomes.combined,
        source_modifier_table    = biosample_to_genbank.genbank_source_modifier_table,
        structured_comment_table = structured_comments.structured_comment_table
    }

    ### prep gisaid submission
    call ncbi.prefix_fasta_header as prefix_gisaid {
      input:
        genome_fasta = submit_genomes.combined,
        prefix       = gisaid_prefix
    }
    call ncbi.gisaid_meta_prep {
      input:
        source_modifier_table = biosample_to_genbank.genbank_source_modifier_table,
        structured_comments   = structured_comments.structured_comment_table,
        out_name              = "gisaid_meta.csv",
        strict                = false
    }

    #### summary stats
    call reports.MultiQC as multiqc_raw {
        input:
            input_files = select_all(fastqc.fastqc_zip),
            file_name   = "multiqc-raw.html"
    }
    call reports.align_and_count_summary as spike_summary {
        input:
            counts_txt = select_all(spikein.report)
    }

    output {
        Array[File]   raw_reads_unaligned_bams     = select_all(reads_ubam)
        Array[File]   cleaned_reads_unaligned_bams = select_all(reads_ubam)
        
        File          biosample_attributes         = group_sra_bams_by_biosample.biosample_attributes_tsv
        
        Array[Int]    read_counts_raw              = select_all(num_reads)
        Array[Int]    read_counts_depleted         = select_all(num_reads)
        
        Array[Int]    primer_trimmed_read_count    = flatten(assemble_refbased.primer_trimmed_read_count)
        Array[Float]  primer_trimmed_read_percent  = flatten(assemble_refbased.primer_trimmed_read_percent)
        
        Array[File]   assemblies_fasta             = assemble_refbased.assembly_fasta
        Array[File]   passing_assemblies_fasta     = select_all(passing_assemblies)
        Array[File]   submittable_assemblies_fasta = select_all(submittable_genomes)
        
        File          multiqc_report_raw           = multiqc_raw.multiqc_report
        File          spikein_counts               = spike_summary.count_summary
        
        File          assembly_stats_tsv           = assembly_meta_tsv.combined
        
        File          submission_zip               = package_genbank_ftp_submission.submission_zip
        File          submission_xml               = package_genbank_ftp_submission.submission_xml
        File          submit_ready                 = package_genbank_ftp_submission.submit_ready
        Array[File]   vadr_outputs                 = select_all(vadr.outputs_tgz)
        File          genbank_source_table         = biosample_to_genbank.genbank_source_modifier_table
        
        File          gisaid_fasta                 = prefix_gisaid.renamed_fasta
        File          gisaid_meta_csv              = gisaid_meta_prep.meta_csv
        
        Array[String] assembled_ids                = select_all(passing_assembly_ids)
        Array[String] submittable_ids              = select_all(submittable_id)
        Array[String] failed_assembly_ids          = select_all(failed_assembly_id)
        Array[String] failed_annotation_ids        = select_all(failed_annotation_id)
        Int           num_read_files               = length(Fetch_SRA_to_BAM.reads_ubam)
        Int           num_assembled                = length(select_all(passing_assemblies))
        Int           num_failed_assembly          = length(select_all(failed_assembly_id))
        Int           num_submittable              = length(select_all(submittable_id))
        Int           num_failed_annotation        = length(select_all(failed_annotation_id))
        Int           num_samples                  = length(group_sra_bams_by_biosample.biosample_accessions)
    }
}