WORKFLOW
augur_from_msa
File Path
pipes/WDL/workflows/augur_from_msa.wdl
WDL Version
1.0
Type
workflow
Imports
Namespace
Path
nextstrain
../tasks/tasks_nextstrain.wdl
reports
../tasks/tasks_reports.wdl
utils
../tasks/tasks_utils.wdl
Build trees, and convert to json representation suitable for Nextstrain visualization. See https://nextstrain.org/docs/getting-started/ and https://nextstrain-augur.readthedocs.io/en/stable/
Author: Broad Viral Genomics
Name
Type
Description
Default
msa_or_vcf
File
Multiple sequence alignment (aligned fasta) or variants (vcf format).
-
sample_metadata
Array[File]+
Metadata in tab-separated text format. See https://nextstrain-augur.readthedocs.io/en/stable/faq/metadata.html for details. At least one tab file must be provided--if multiple are provided, they will be joined via a full left outer join using the 'strain' column as the join ID.
-
ref_fasta
File
A reference genome sequence, passed to clade assignment (used only when clades_tsv is provided). Typically from NCBI RefSeq or similar.
-
genbank_gb
File
A 'genbank' formatted gene annotation file that is used to calculate coding consequences of observed mutations. Must correspond to the same coordinate space as ref_fasta. Typically downloaded from the same NCBI accession number as ref_fasta.
-
auspice_config
File
A file specifying options to customize the auspice export; see: https://nextstrain.github.io/auspice/customise-client/introduction
-
clades_tsv
File?
A TSV file containing clade mutation positions in four columns: [clade gene site alt]; see: https://nextstrain.org/docs/tutorials/defining-clades
-
ancestral_traits_to_infer
Array[String]?
A list of metadata traits to use for ancestral node inference (see https://nextstrain-augur.readthedocs.io/en/stable/usage/cli/traits.html). Multiple traits may be specified; must correspond exactly to column headers in metadata file. Omitting these values will skip ancestral trait inference, and ancestral nodes will not have estimated values for metadata.
-
keep_list
Array[File]?
Optional lists of strain ids to filter inputs down to.
-
mask_bed
File?
Optional list of sites to mask when building trees.
-
exclude_sites
File?
-
-
vcf_reference
File?
-
-
tree_builder_args
String?
-
-
gen_per_year
Int?
-
-
clock_rate
Float?
-
-
clock_std_dev
Float?
-
-
root
String?
-
-
covariance
Boolean?
-
-
precision
Int?
-
-
branch_length_inference
String?
-
-
coalescent
String?
-
-
vcf_reference
File?
-
-
weights
File?
-
-
sampling_bias_correction
Float?
-
-
min_date
Float?
-
-
max_date
Float?
-
-
pivot_interval
Int?
-
-
pivot_interval_units
String?
-
-
narrow_bandwidth
Float?
-
-
wide_bandwidth
Float?
-
-
proportion_wide
Float?
-
-
minimal_frequency
Float?
-
-
stiffness
Float?
-
-
inertia
Float?
-
-
vcf_reference
File?
-
-
root_sequence
File?
-
-
output_vcf
File?
-
-
genes
File?
-
-
vcf_reference_output
File?
-
-
vcf_reference
File?
-
-
lat_longs_tsv
File?
-
-
colors_tsv
File?
-
-
geo_resolutions
Array[String]?
-
-
color_by_metadata
Array[String]?
-
-
description_md
File?
-
-
maintainers
Array[String]?
-
-
title
String?
-
-
50 optional inputs with default values
out_fname
String
-
sub(sub(basename(sequences,".zst"),".vcf",".filtered.vcf"),".fasta$",".filtered.fasta")
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
disk_size
Int
-
750
docker
String
-
"docker.io/nextstrain/base:build-20240318T173028Z"
disk_size
Int
-
750
method
String
-
"iqtree"
substitution_model
String
-
"GTR"
cpus
Int
-
64
machine_mem_gb
Int
-
32
docker
String
-
"docker.io/nextstrain/base:build-20240318T173028Z"
disk_size
Int
-
1250
out_suffix
String
-
".txt"
prefer_first
Boolean
-
true
machine_mem_gb
Int
-
7
generate_timetree
Boolean
-
true
keep_root
Boolean
-
true
keep_polytomies
Boolean
-
false
date_confidence
Boolean
-
true
date_inference
String?
-
"marginal"
clock_filter_iqd
Int?
-
4
divergence_units
String?
-
"mutations"
docker
String
-
"docker.io/nextstrain/base:build-20240318T173028Z"
disk_size
Int
-
750
machine_mem_gb
Int
-
75
confidence
Boolean
-
true
machine_mem_gb
Int
-
32
docker
String
-
"docker.io/nextstrain/base:build-20240318T173028Z"
disk_size
Int
-
750
method
String
-
"kde"
censored
Boolean
-
false
include_internal_nodes
Boolean
-
false
machine_mem_gb
Int
-
64
docker
String
-
"docker.io/nextstrain/base:build-20240318T173028Z"
out_basename
String
-
basename(tree,'.nwk')
disk_size
Int
-
200
inference
String
-
"joint"
keep_ambiguous
Boolean
-
false
infer_ambiguous
Boolean
-
false
keep_overhangs
Boolean
-
false
docker
String
-
"docker.io/nextstrain/base:build-20240318T173028Z"
disk_size
Int
-
300
docker
String
-
"docker.io/nextstrain/base:build-20240318T173028Z"
disk_size
Int
-
300
docker
String
-
"docker.io/nextstrain/base:build-20240318T173028Z"
disk_size
Int
-
300
include_root_sequence
Boolean
-
true
out_basename
String
-
basename(basename(tree,".nwk"),"_timetree")
machine_mem_gb
Int
-
64
docker
String
-
"docker.io/nextstrain/base:build-20240318T173028Z"
disk_size
Int
-
300
Outputs
Name
Type
Expression
masked_alignment
File
augur_mask_sites.masked_sequences
ml_tree
File
draft_augur_tree.aligned_tree
time_tree
File
refine_augur_tree.tree_refined
node_data_jsons
Array[File]
select_all([refine_augur_tree.branch_lengths, ancestral_traits.node_data_json, ancestral_tree.nt_muts_json, translate_augur_tree.aa_muts_json, assign_clades_to_nodes.node_clade_data_json])
auspice_input_json
File
export_auspice_json.virus_json
tip_frequencies_json
File
tip_frequencies.node_data_json
root_sequence_json
File
export_auspice_json.root_sequence_json
Calls
This workflow calls the following tasks or subworkflows:
Input Mappings (2)
Input
Value
sequences
msa_or_vcf
keep_list
keep_list
Input Mappings (2)
Input
Value
sequences
filter_sequences_to_list.filtered_fasta
mask_bed
mask_bed
Input Mappings (1)
Input
Value
msa_or_vcf
augur_mask_sites.masked_sequences
Input Mappings (3)
Input
Value
input_tsvs
sample_metadata
id_col
'strain'
out_basename
"metadata-merged"
Input Mappings (3)
Input
Value
raw_tree
draft_augur_tree.aligned_tree
msa_or_vcf
augur_mask_sites.masked_sequences
metadata
select_first(flatten([[tsv_join.out_tsv], sample_metadata]))
Input Mappings (3)
Input
Value
tree
refine_augur_tree.tree_refined
metadata
select_first(flatten([[tsv_join.out_tsv], sample_metadata]))
columns
select_first([ancestral_traits_to_infer, []])
Input Mappings (2)
Input
Value
tree
refine_augur_tree.tree_refined
metadata
select_first(flatten([[tsv_join.out_tsv], sample_metadata]))
Input Mappings (2)
Input
Value
tree
refine_augur_tree.tree_refined
msa_or_vcf
augur_mask_sites.masked_sequences
Input Mappings (3)
Input
Value
tree
refine_augur_tree.tree_refined
nt_muts
ancestral_tree.nt_muts_json
genbank_gb
genbank_gb
Input Mappings (5)
Input
Value
tree_nwk
refine_augur_tree.tree_refined
nt_muts_json
ancestral_tree.nt_muts_json
aa_muts_json
translate_augur_tree.aa_muts_json
ref_fasta
ref_fasta
clades_tsv
select_first([clades_tsv])
Input Mappings (4)
Input
Value
tree
refine_augur_tree.tree_refined
sample_metadata
select_first(flatten([[tsv_join.out_tsv], sample_metadata]))
node_data_jsons
select_all([refine_augur_tree.branch_lengths, ancestral_traits.node_data_json, ancestral_tree.nt_muts_json, translate_augur_tree.aa_muts_json, assign_clades_to_nodes.node_clade_data_json])
auspice_config
auspice_config
Images
Container images used by tasks in this workflow:
⚙️ Parameterized
Configured via input:
docker
Used by 2 tasks:
filter_sequences_to_list
tsv_join
⚙️ Parameterized
Configured via input:
docker
Used by 9 tasks:
augur_mask_sites
draft_augur_tree
refine_augur_tree
tip_frequencies
ancestral_tree
translate_augur_tree
export_auspice_json
ancestral_traits
assign_clades_to_nodes
Zoom In
Zoom Out
Fit
Reset
🖱️ Scroll to zoom • Drag to pan • Double-click to reset • ESC to close
flowchart TD
Start([augur_from_msa])
N1["filter_sequences_to_list"]
N2["augur_mask_sites"]
N3["draft_augur_tree"]
subgraph C1 ["↔️ if length(sample_metadata) > 1"]
direction TB
N4["tsv_join"]
end
N5["refine_augur_tree"]
subgraph C2 ["↔️ if defined(ancestral_traits_to_infer) && length(select_first([ancestral_traits_to_infer, []])) > 0"]
direction TB
N6["ancestral_traits"]
end
N7["tip_frequencies"]
N8["ancestral_tree"]
N9["translate_augur_tree"]
subgraph C3 ["↔️ if defined(clades_tsv)"]
direction TB
N10["assign_clades_to_nodes"]
end
N11["export_auspice_json"]
N1 --> N2
N2 --> N3
N2 --> N5
N3 --> N5
N4 --> N5
N4 --> N6
N5 --> N6
N4 --> N7
N5 --> N7
N2 --> N8
N5 --> N8
N8 --> N9
N5 --> N9
N8 --> N10
N5 --> N10
N9 --> N10
N10 --> N11
N4 --> N11
N5 --> N11
N9 --> N11
N8 --> N11
N6 --> N11
Start --> N1
Start --> N4
N7 --> End([End])
N11 --> End([End])
classDef taskNode fill:#a371f7,stroke:#8b5cf6,stroke-width:2px,color:#fff
classDef workflowNode fill:#58a6ff,stroke:#1f6feb,stroke-width:2px,color:#fff
version 1.0
import "../tasks/tasks_nextstrain.wdl" as nextstrain
import "../tasks/tasks_reports.wdl" as reports
import "../tasks/tasks_utils.wdl" as utils
workflow augur_from_msa {
    # Takes an existing alignment (or VCF) plus metadata and wires the nextstrain tasks together:
    #   filter_sequences_to_list -> augur_mask_sites -> draft_augur_tree -> refine_augur_tree
    # The refined tree then feeds ancestral_traits (optional), tip_frequencies, ancestral_tree,
    # translate_augur_tree, assign_clades_to_nodes (optional) and finally export_auspice_json.
    meta {
        description: "Build trees, and convert to json representation suitable for Nextstrain visualization. See https://nextstrain.org/docs/getting-started/ and https://nextstrain-augur.readthedocs.io/en/stable/"
        author: "Broad Viral Genomics"
        email: "viral-ngs@broadinstitute.org"
        allowNestedInputs: true
    }

    input {
        File            msa_or_vcf
        Array[File]+    sample_metadata
        File            ref_fasta
        File            genbank_gb
        File            auspice_config
        File?           clades_tsv
        Array[String]?  ancestral_traits_to_infer
        Array[File]?    keep_list
        File?           mask_bed
    }

    parameter_meta {
        msa_or_vcf: {
            description: "Multiple sequence alignment (aligned fasta) or variants (vcf format).",
            patterns: ["*.fasta", "*.fa", "*.vcf", "*.vcf.gz"]
        }
        sample_metadata: {
            description: "Metadata in tab-separated text format. See https://nextstrain-augur.readthedocs.io/en/stable/faq/metadata.html for details. At least one tab file must be provided--if multiple are provided, they will be joined via a full left outer join using the 'strain' column as the join ID.",
            patterns: ["*.txt", "*.tsv"]
        }
        # ref_fasta is only consumed by assign_clades_to_nodes; this workflow has no assembly_fastas input.
        ref_fasta: {
            description: "A reference genome sequence, passed to clade assignment (used only when clades_tsv is provided). Typically from NCBI RefSeq or similar.",
            patterns: ["*.fasta", "*.fa"]
        }
        genbank_gb: {
            description: "A 'genbank' formatted gene annotation file that is used to calculate coding consequences of observed mutations. Must correspond to the same coordinate space as ref_fasta. Typically downloaded from the same NCBI accession number as ref_fasta.",
            patterns: ["*.gb", "*.gbf"]
        }
        ancestral_traits_to_infer: {
            description: "A list of metadata traits to use for ancestral node inference (see https://nextstrain-augur.readthedocs.io/en/stable/usage/cli/traits.html). Multiple traits may be specified; must correspond exactly to column headers in metadata file. Omitting these values will skip ancestral trait inference, and ancestral nodes will not have estimated values for metadata."
        }
        auspice_config: {
            description: "A file specifying options to customize the auspice export; see: https://nextstrain.github.io/auspice/customise-client/introduction",
            patterns: ["*.json", "*.txt"]
        }
        clades_tsv: {
            description: "A TSV file containing clade mutation positions in four columns: [clade gene site alt]; see: https://nextstrain.org/docs/tutorials/defining-clades",
            patterns: ["*.tsv", "*.txt"]
        }
        keep_list: {
            description: "Optional lists of strain ids to filter inputs down to.",
            patterns: ["*.txt", "*.tsv"]
        }
        mask_bed: {
            description: "Optional list of sites to mask when building trees.",
            patterns: ["*.bed"]
        }
    }

    # Restrict the input sequences to keep_list (optional input, forwarded as-is).
    call nextstrain.filter_sequences_to_list {
        input:
            sequences = msa_or_vcf,
            keep_list = keep_list
    }

    # Apply mask_bed (optional input, forwarded as-is) to the filtered sequences.
    call nextstrain.augur_mask_sites {
        input:
            sequences = filter_sequences_to_list.filtered_fasta,
            mask_bed  = mask_bed
    }

    call nextstrain.draft_augur_tree {
        input:
            msa_or_vcf = augur_mask_sites.masked_sequences
    }

    # Only merge metadata when more than one table was supplied.
    if (length(sample_metadata) > 1) {
        call utils.tsv_join {
            input:
                input_tsvs   = sample_metadata,
                id_col       = 'strain',
                out_basename = "metadata-merged"
        }
    }

    # Single source of truth for the metadata table used downstream:
    # the merged table when tsv_join ran, otherwise the first (only) entry of sample_metadata.
    File metadata_tsv = select_first(flatten([[tsv_join.out_tsv], sample_metadata]))

    call nextstrain.refine_augur_tree {
        input:
            raw_tree   = draft_augur_tree.aligned_tree,
            msa_or_vcf = augur_mask_sites.masked_sequences,
            metadata   = metadata_tsv
    }

    # An undefined trait list collapses to [] here, so no separate defined() check is needed.
    Array[String] traits_to_infer = select_first([ancestral_traits_to_infer, []])
    if (length(traits_to_infer) > 0) {
        call nextstrain.ancestral_traits {
            input:
                tree     = refine_augur_tree.tree_refined,
                metadata = metadata_tsv,
                columns  = traits_to_infer
        }
    }

    call nextstrain.tip_frequencies {
        input:
            tree     = refine_augur_tree.tree_refined,
            metadata = metadata_tsv
    }

    call nextstrain.ancestral_tree {
        input:
            tree       = refine_augur_tree.tree_refined,
            msa_or_vcf = augur_mask_sites.masked_sequences
    }

    call nextstrain.translate_augur_tree {
        input:
            tree       = refine_augur_tree.tree_refined,
            nt_muts    = ancestral_tree.nt_muts_json,
            genbank_gb = genbank_gb
    }

    if (defined(clades_tsv)) {
        call nextstrain.assign_clades_to_nodes {
            input:
                tree_nwk     = refine_augur_tree.tree_refined,
                nt_muts_json = ancestral_tree.nt_muts_json,
                aa_muts_json = translate_augur_tree.aa_muts_json,
                ref_fasta    = ref_fasta,
                # select_first unwraps File? to File; safe because of the defined() guard above.
                clades_tsv   = select_first([clades_tsv])
        }
    }

    # Node-data JSONs from every step that ran; select_all drops the outputs of the
    # conditional calls (ancestral_traits, assign_clades_to_nodes) that were skipped.
    # Declared once so the export input and the workflow output cannot drift apart.
    Array[File] collected_node_data = select_all([
        refine_augur_tree.branch_lengths,
        ancestral_traits.node_data_json,
        ancestral_tree.nt_muts_json,
        translate_augur_tree.aa_muts_json,
        assign_clades_to_nodes.node_clade_data_json
    ])

    call nextstrain.export_auspice_json {
        input:
            tree            = refine_augur_tree.tree_refined,
            sample_metadata = metadata_tsv,
            node_data_jsons = collected_node_data,
            auspice_config  = auspice_config
    }

    output {
        File        masked_alignment     = augur_mask_sites.masked_sequences
        File        ml_tree              = draft_augur_tree.aligned_tree
        File        time_tree            = refine_augur_tree.tree_refined
        Array[File] node_data_jsons      = collected_node_data
        File        auspice_input_json   = export_auspice_json.virus_json
        File        tip_frequencies_json = tip_frequencies.node_data_json
        File        root_sequence_json   = export_auspice_json.root_sequence_json
    }
}
version 1.0
import "../tasks/tasks_nextstrain.wdl" as nextstrain
import "../tasks/tasks_reports.wdl" as reports
import "../tasks/tasks_utils.wdl" as utils
workflow augur_from_msa {
    # Takes an existing alignment (or VCF) plus metadata and wires the nextstrain tasks together:
    #   filter_sequences_to_list -> augur_mask_sites -> draft_augur_tree -> refine_augur_tree
    # The refined tree then feeds ancestral_traits (optional), tip_frequencies, ancestral_tree,
    # translate_augur_tree, assign_clades_to_nodes (optional) and finally export_auspice_json.
    meta {
        description: "Build trees, and convert to json representation suitable for Nextstrain visualization. See https://nextstrain.org/docs/getting-started/ and https://nextstrain-augur.readthedocs.io/en/stable/"
        author: "Broad Viral Genomics"
        email: "viral-ngs@broadinstitute.org"
        allowNestedInputs: true
    }

    input {
        File            msa_or_vcf
        Array[File]+    sample_metadata
        File            ref_fasta
        File            genbank_gb
        File            auspice_config
        File?           clades_tsv
        Array[String]?  ancestral_traits_to_infer
        Array[File]?    keep_list
        File?           mask_bed
    }

    parameter_meta {
        msa_or_vcf: {
            description: "Multiple sequence alignment (aligned fasta) or variants (vcf format).",
            patterns: ["*.fasta", "*.fa", "*.vcf", "*.vcf.gz"]
        }
        sample_metadata: {
            description: "Metadata in tab-separated text format. See https://nextstrain-augur.readthedocs.io/en/stable/faq/metadata.html for details. At least one tab file must be provided--if multiple are provided, they will be joined via a full left outer join using the 'strain' column as the join ID.",
            patterns: ["*.txt", "*.tsv"]
        }
        # ref_fasta is only consumed by assign_clades_to_nodes; this workflow has no assembly_fastas input.
        ref_fasta: {
            description: "A reference genome sequence, passed to clade assignment (used only when clades_tsv is provided). Typically from NCBI RefSeq or similar.",
            patterns: ["*.fasta", "*.fa"]
        }
        genbank_gb: {
            description: "A 'genbank' formatted gene annotation file that is used to calculate coding consequences of observed mutations. Must correspond to the same coordinate space as ref_fasta. Typically downloaded from the same NCBI accession number as ref_fasta.",
            patterns: ["*.gb", "*.gbf"]
        }
        ancestral_traits_to_infer: {
            description: "A list of metadata traits to use for ancestral node inference (see https://nextstrain-augur.readthedocs.io/en/stable/usage/cli/traits.html). Multiple traits may be specified; must correspond exactly to column headers in metadata file. Omitting these values will skip ancestral trait inference, and ancestral nodes will not have estimated values for metadata."
        }
        auspice_config: {
            description: "A file specifying options to customize the auspice export; see: https://nextstrain.github.io/auspice/customise-client/introduction",
            patterns: ["*.json", "*.txt"]
        }
        clades_tsv: {
            description: "A TSV file containing clade mutation positions in four columns: [clade gene site alt]; see: https://nextstrain.org/docs/tutorials/defining-clades",
            patterns: ["*.tsv", "*.txt"]
        }
        keep_list: {
            description: "Optional lists of strain ids to filter inputs down to.",
            patterns: ["*.txt", "*.tsv"]
        }
        mask_bed: {
            description: "Optional list of sites to mask when building trees.",
            patterns: ["*.bed"]
        }
    }

    # Restrict the input sequences to keep_list (optional input, forwarded as-is).
    call nextstrain.filter_sequences_to_list {
        input:
            sequences = msa_or_vcf,
            keep_list = keep_list
    }

    # Apply mask_bed (optional input, forwarded as-is) to the filtered sequences.
    call nextstrain.augur_mask_sites {
        input:
            sequences = filter_sequences_to_list.filtered_fasta,
            mask_bed  = mask_bed
    }

    call nextstrain.draft_augur_tree {
        input:
            msa_or_vcf = augur_mask_sites.masked_sequences
    }

    # Only merge metadata when more than one table was supplied.
    if (length(sample_metadata) > 1) {
        call utils.tsv_join {
            input:
                input_tsvs   = sample_metadata,
                id_col       = 'strain',
                out_basename = "metadata-merged"
        }
    }

    # Single source of truth for the metadata table used downstream:
    # the merged table when tsv_join ran, otherwise the first (only) entry of sample_metadata.
    File metadata_tsv = select_first(flatten([[tsv_join.out_tsv], sample_metadata]))

    call nextstrain.refine_augur_tree {
        input:
            raw_tree   = draft_augur_tree.aligned_tree,
            msa_or_vcf = augur_mask_sites.masked_sequences,
            metadata   = metadata_tsv
    }

    # An undefined trait list collapses to [] here, so no separate defined() check is needed.
    Array[String] traits_to_infer = select_first([ancestral_traits_to_infer, []])
    if (length(traits_to_infer) > 0) {
        call nextstrain.ancestral_traits {
            input:
                tree     = refine_augur_tree.tree_refined,
                metadata = metadata_tsv,
                columns  = traits_to_infer
        }
    }

    call nextstrain.tip_frequencies {
        input:
            tree     = refine_augur_tree.tree_refined,
            metadata = metadata_tsv
    }

    call nextstrain.ancestral_tree {
        input:
            tree       = refine_augur_tree.tree_refined,
            msa_or_vcf = augur_mask_sites.masked_sequences
    }

    call nextstrain.translate_augur_tree {
        input:
            tree       = refine_augur_tree.tree_refined,
            nt_muts    = ancestral_tree.nt_muts_json,
            genbank_gb = genbank_gb
    }

    if (defined(clades_tsv)) {
        call nextstrain.assign_clades_to_nodes {
            input:
                tree_nwk     = refine_augur_tree.tree_refined,
                nt_muts_json = ancestral_tree.nt_muts_json,
                aa_muts_json = translate_augur_tree.aa_muts_json,
                ref_fasta    = ref_fasta,
                # select_first unwraps File? to File; safe because of the defined() guard above.
                clades_tsv   = select_first([clades_tsv])
        }
    }

    # Node-data JSONs from every step that ran; select_all drops the outputs of the
    # conditional calls (ancestral_traits, assign_clades_to_nodes) that were skipped.
    # Declared once so the export input and the workflow output cannot drift apart.
    Array[File] collected_node_data = select_all([
        refine_augur_tree.branch_lengths,
        ancestral_traits.node_data_json,
        ancestral_tree.nt_muts_json,
        translate_augur_tree.aa_muts_json,
        assign_clades_to_nodes.node_clade_data_json
    ])

    call nextstrain.export_auspice_json {
        input:
            tree            = refine_augur_tree.tree_refined,
            sample_metadata = metadata_tsv,
            node_data_jsons = collected_node_data,
            auspice_config  = auspice_config
    }

    output {
        File        masked_alignment     = augur_mask_sites.masked_sequences
        File        ml_tree              = draft_augur_tree.aligned_tree
        File        time_tree            = refine_augur_tree.tree_refined
        Array[File] node_data_jsons      = collected_node_data
        File        auspice_input_json   = export_auspice_json.virus_json
        File        tip_frequencies_json = tip_frequencies.node_data_json
        File        root_sequence_json   = export_auspice_json.root_sequence_json
    }
}