WORKFLOW
augur_from_assemblies
File Path
pipes/WDL/workflows/augur_from_assemblies.wdl
WDL Version
1.0
Type
workflow
Imports
Namespace
Path
nextstrain
../tasks/tasks_nextstrain.wdl
utils
../tasks/tasks_utils.wdl
Workflow: augur_from_assemblies
Align assemblies, build trees, and convert to json representation suitable for Nextstrain visualization. See https://nextstrain.org/docs/getting-started/ and https://nextstrain-augur.readthedocs.io/en/stable/
Author: Broad Viral Genomics
Name
Type
Description
Default
assembly_fastas
Array[File]+
Set of assembled genomes to align and build trees. These must represent a single chromosome/segment of a genome only. Fastas may be one-sequence-per-individual or a concatenated multi-fasta (unaligned) or a mixture of the two. They may be compressed (gz, bz2, zst, lz4), uncompressed, or a mixture.
-
contextual_genome_fastas
Array[File]?
Set of near-complete contextual genomes to include in tree build. Each fasta provided must represent a single chromosome/segment of a genome. Fastas may be one-sequence-per-individual or a concatenated multi-fasta (unaligned) or a mixture of the two. They may be compressed (gz, bz2, zst, lz4), uncompressed, or a mixture.
-
sample_metadata_tsvs
Array[File]+
Tab-separated metadata files that contain binning variables and values. Must contain all samples: output will be filtered to the IDs present in these files.
-
ref_fasta
File
A reference assembly (not included in assembly_fastas) to align assembly_fastas against. Typically from NCBI RefSeq or similar.
-
min_unambig_genome
Int
Minimum number of called bases in genome to pass prefilter.
-
clades_tsv
File?
A TSV file containing clade mutation positions in four columns: [clade gene site alt]; see: https://nextstrain.org/docs/tutorials/defining-clades
-
ancestral_traits_to_infer
Array[String]?
A list of metadata traits to use for ancestral node inference (see https://nextstrain-augur.readthedocs.io/en/stable/usage/cli/traits.html). Multiple traits may be specified; must correspond exactly to column headers in metadata file. Omitting these values will skip ancestral trait inference, and ancestral nodes will not have estimated values for metadata.
-
lab_highlight_loc
String?
-
-
sequences_per_group
Int?
-
-
group_by
String?
-
-
include
File?
-
-
exclude
File?
-
-
min_date
Float?
-
-
max_date
Float?
-
-
min_length
Int?
-
-
priority
File?
-
-
subsample_seed
Int?
-
-
exclude_where
Array[String]?
-
-
include_where
Array[String]?
-
-
mask_bed
File?
-
-
exclude_sites
File?
-
-
vcf_reference
File?
-
-
tree_builder_args
String?
-
-
gen_per_year
Int?
-
-
clock_rate
Float?
-
-
clock_std_dev
Float?
-
-
root
String?
-
-
covariance
Boolean?
-
-
precision
Int?
-
-
branch_length_inference
String?
-
-
coalescent
String?
-
-
vcf_reference
File?
-
-
weights
File?
-
-
sampling_bias_correction
Float?
-
-
vcf_reference
File?
-
-
root_sequence
File?
-
-
output_vcf
File?
-
-
genbank_gb
File
-
-
genes
File?
-
-
vcf_reference_output
File?
-
-
vcf_reference
File?
-
-
min_date
Float?
-
-
max_date
Float?
-
-
pivot_interval
Int?
-
-
pivot_interval_units
String?
-
-
narrow_bandwidth
Float?
-
-
wide_bandwidth
Float?
-
-
proportion_wide
Float?
-
-
minimal_frequency
Float?
-
-
stiffness
Float?
-
-
inertia
Float?
-
-
auspice_config
File
-
-
lat_longs_tsv
File?
-
-
colors_tsv
File?
-
-
geo_resolutions
Array[String]?
-
-
color_by_metadata
Array[String]?
-
-
description_md
File?
-
-
maintainers
Array[String]?
-
-
title
String?
-
-
72 optional inputs with default values
make_snps_vcf
Boolean
-
false
cpus
Int
-
4
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
disk_size
Int
-
750
error_on_seq_diff
Boolean
-
false
nextstrain_ncov_repo_commit
String
-
"30435fb9ec8de2f045167fb90adfec12f123e80a"
docker
String
-
"docker.io/nextstrain/base:build-20240318T173028Z"
disk_size
Int
-
750
remove_reference
Boolean
-
false
keep_length
Boolean
-
true
large
Boolean
-
false
memsavetree
Boolean
-
false
docker
String
-
"quay.io/broadinstitute/viral-phylo:2.5.1.0"
mem_size
Int
-
500
cpus
Int
-
64
disk_size
Int
-
750
allow_wildcard_bases
Boolean
-
true
docker
String
-
"quay.io/biocontainers/snp-sites:2.5.1--hed695b0_0"
disk_size
Int
-
750
out_suffix
String
-
".txt"
prefer_first
Boolean
-
true
machine_mem_gb
Int
-
7
table_map
Array[File]
-
[]
docker
String
-
"quay.io/broadinstitute/viral-core:2.5.1"
disk_size
Int
-
50
non_nucleotide
Boolean
-
true
docker
String
-
"docker.io/nextstrain/base:build-20240318T173028Z"
disk_size
Int
-
750
docker
String
-
"docker.io/nextstrain/base:build-20240318T173028Z"
disk_size
Int
-
750
method
String
-
"iqtree"
substitution_model
String
-
"GTR"
cpus
Int
-
64
machine_mem_gb
Int
-
32
docker
String
-
"docker.io/nextstrain/base:build-20240318T173028Z"
disk_size
Int
-
1250
generate_timetree
Boolean
-
true
keep_root
Boolean
-
true
keep_polytomies
Boolean
-
false
date_confidence
Boolean
-
true
date_inference
String?
-
"marginal"
clock_filter_iqd
Int?
-
4
divergence_units
String?
-
"mutations"
docker
String
-
"docker.io/nextstrain/base:build-20240318T173028Z"
disk_size
Int
-
750
machine_mem_gb
Int
-
75
confidence
Boolean
-
true
machine_mem_gb
Int
-
32
docker
String
-
"docker.io/nextstrain/base:build-20240318T173028Z"
disk_size
Int
-
750
inference
String
-
"joint"
keep_ambiguous
Boolean
-
false
infer_ambiguous
Boolean
-
false
keep_overhangs
Boolean
-
false
docker
String
-
"docker.io/nextstrain/base:build-20240318T173028Z"
disk_size
Int
-
300
docker
String
-
"docker.io/nextstrain/base:build-20240318T173028Z"
disk_size
Int
-
300
method
String
-
"kde"
censored
Boolean
-
false
include_internal_nodes
Boolean
-
false
machine_mem_gb
Int
-
64
docker
String
-
"docker.io/nextstrain/base:build-20240318T173028Z"
out_basename
String
-
basename(tree,'.nwk')
disk_size
Int
-
200
docker
String
-
"docker.io/nextstrain/base:build-20240318T173028Z"
disk_size
Int
-
300
include_root_sequence
Boolean
-
true
out_basename
String
-
basename(basename(tree,".nwk"),"_timetree")
machine_mem_gb
Int
-
64
docker
String
-
"docker.io/nextstrain/base:build-20240318T173028Z"
disk_size
Int
-
300
Outputs
Name
Type
Expression
combined_assemblies
File
filter_sequences_by_length.filtered_fasta
multiple_alignment
File
mafft.aligned_sequences
unmasked_snps
File?
snp_sites.snps_vcf
metadata_merged
File
derived_cols.derived_metadata
keep_list
File
fasta_to_ids.ids_txt
subsampled_sequences
File
prefilter.filtered_fasta
sequences_kept
Int
prefilter.sequences_out
masked_alignment
File
augur_mask_sites.masked_sequences
ml_tree
File
draft_augur_tree.aligned_tree
time_tree
File
refine_augur_tree.tree_refined
node_data_jsons
Array[File]
select_all([refine_augur_tree.branch_lengths, ancestral_traits.node_data_json, ancestral_tree.nt_muts_json, translate_augur_tree.aa_muts_json, assign_clades_to_nodes.node_clade_data_json])
auspice_input_json
File
export_auspice_json.virus_json
tip_frequencies_json
File
tip_frequencies.node_data_json
root_sequence_json
File
export_auspice_json.root_sequence_json
Calls
This workflow calls the following tasks or subworkflows:
Input Mappings (2)
Input
Value
infiles
flatten([assembly_fastas, select_first([contextual_genome_fastas, []])])
output_name
"all_samples_combined_assembly.fasta"
Input Mappings (2)
Input
Value
sequences_fasta
zcat.combined
min_non_N
min_unambig_genome
CALL
TASKS
dedup_seqs
↗
→ nextstrain_deduplicate_sequences
Input Mappings (1)
Input
Value
sequences_fasta
filter_sequences_by_length.filtered_fasta
CALL
TASKS
mafft
↗
→ mafft_one_chr
Input Mappings (3)
Input
Value
sequences
dedup_seqs.sequences_deduplicated_fasta
ref_fasta
ref_fasta
basename
"all_samples_aligned.fasta"
Input Mappings (1)
Input
Value
msa_fasta
mafft.aligned_sequences
Input Mappings (3)
Input
Value
input_tsvs
sample_metadata_tsvs
id_col
'strain'
out_basename
"metadata-merged"
Input Mappings (1)
Input
Value
metadata_tsv
select_first(flatten([[tsv_join.out_tsv], sample_metadata_tsvs]))
CALL
TASKS
prefilter
↗
→ filter_subsample_sequences
Input Mappings (2)
Input
Value
sequences_fasta
mafft.aligned_sequences
sample_metadata_tsv
derived_cols.derived_metadata
Input Mappings (1)
Input
Value
sequences_fasta
prefilter.filtered_fasta
Input Mappings (1)
Input
Value
sequences
prefilter.filtered_fasta
Input Mappings (1)
Input
Value
msa_or_vcf
augur_mask_sites.masked_sequences
Input Mappings (3)
Input
Value
raw_tree
draft_augur_tree.aligned_tree
msa_or_vcf
augur_mask_sites.masked_sequences
metadata
derived_cols.derived_metadata
Input Mappings (3)
Input
Value
tree
refine_augur_tree.tree_refined
metadata
derived_cols.derived_metadata
columns
select_first([ancestral_traits_to_infer, []])
Input Mappings (2)
Input
Value
tree
refine_augur_tree.tree_refined
msa_or_vcf
augur_mask_sites.masked_sequences
Input Mappings (2)
Input
Value
tree
refine_augur_tree.tree_refined
nt_muts
ancestral_tree.nt_muts_json
Input Mappings (2)
Input
Value
tree
refine_augur_tree.tree_refined
metadata
derived_cols.derived_metadata
Input Mappings (5)
Input
Value
tree_nwk
refine_augur_tree.tree_refined
nt_muts_json
ancestral_tree.nt_muts_json
aa_muts_json
translate_augur_tree.aa_muts_json
ref_fasta
ref_fasta
clades_tsv
select_first([clades_tsv])
Input Mappings (3)
Input
Value
tree
refine_augur_tree.tree_refined
sample_metadata
derived_cols.derived_metadata
node_data_jsons
select_all([refine_augur_tree.branch_lengths, ancestral_traits.node_data_json, ancestral_tree.nt_muts_json, translate_augur_tree.aa_muts_json, assign_clades_to_nodes.node_clade_data_json])
Images
Container images used by tasks in this workflow:
quay.io/broadinstitute/viral-core:2.5.1
Used by 4 tasks:
zcat
filter_sequences_by_length
derived_cols
tsv_join
⚙️ Parameterized
Configured via input:
docker
Used by 11 tasks:
dedup_seqs
prefilter
augur_mask_sites
draft_augur_tree
refine_augur_tree
ancestral_tree
translate_augur_tree
tip_frequencies
export_auspice_json
ancestral_traits
assign_clades_to_nodes
⚙️ Parameterized
Configured via input:
docker
⚙️ Parameterized
Configured via input:
docker
Zoom In
Zoom Out
Fit
Reset
🖱️ Scroll to zoom • Drag to pan • Double-click to reset • ESC to close
flowchart TD
Start([augur_from_assemblies])
N1["zcat"]
N2["filter_sequences_by_length"]
N3["dedup_seqs (nextstrain_deduplicate_sequences)"]
N4["mafft (mafft_one_chr)"]
subgraph C1 ["↔️ if make_snps_vcf"]
direction TB
N5["snp_sites"]
end
subgraph C2 ["↔️ if length(sample_metadata_tsvs) > 1"]
direction TB
N6["tsv_join"]
end
N7["derived_cols"]
N8["prefilter (filter_subsample_sequences)"]
N9["fasta_to_ids"]
N10["augur_mask_sites"]
N11["draft_augur_tree"]
N12["refine_augur_tree"]
subgraph C3 ["↔️ if defined(ancestral_traits_to_infer) && length(select_first([ancestral_traits_to_infer, []])) > 0"]
direction TB
N13["ancestral_traits"]
end
N14["ancestral_tree"]
N15["translate_augur_tree"]
N16["tip_frequencies"]
subgraph C4 ["↔️ if defined(clades_tsv)"]
direction TB
N17["assign_clades_to_nodes"]
end
N18["export_auspice_json"]
N1 --> N2
N2 --> N3
N3 --> N4
N4 --> N5
N6 --> N7
N7 --> N8
N4 --> N8
N8 --> N9
N8 --> N10
N10 --> N11
N7 --> N12
N10 --> N12
N11 --> N12
N7 --> N13
N12 --> N13
N10 --> N14
N12 --> N14
N14 --> N15
N12 --> N15
N7 --> N16
N12 --> N16
N14 --> N17
N12 --> N17
N15 --> N17
N7 --> N18
N17 --> N18
N12 --> N18
N15 --> N18
N14 --> N18
N13 --> N18
Start --> N1
Start --> N6
N5 --> End([End])
N16 --> End([End])
N9 --> End([End])
N18 --> End([End])
classDef taskNode fill:#a371f7,stroke:#8b5cf6,stroke-width:2px,color:#fff
classDef workflowNode fill:#58a6ff,stroke:#1f6feb,stroke-width:2px,color:#fff
version 1.0
import "../tasks/tasks_nextstrain.wdl" as nextstrain
import "../tasks/tasks_utils.wdl" as utils
# augur_from_assemblies: combines the input assemblies, filters and deduplicates
# them, aligns them against ref_fasta, filters/subsamples using the sample
# metadata, and then runs the augur tree steps whose node-data outputs are
# handed to export_auspice_json.
workflow augur_from_assemblies {
meta {
description: "Align assemblies, build trees, and convert to json representation suitable for Nextstrain visualization. See https://nextstrain.org/docs/getting-started/ and https://nextstrain-augur.readthedocs.io/en/stable/"
author: "Broad Viral Genomics"
email: "viral-ngs@broadinstitute.org"
allowNestedInputs: true
}
# contextual_genome_fastas, clades_tsv and ancestral_traits_to_infer are
# optional; make_snps_vcf defaults to false. All other inputs are required.
input {
Array[File]+ assembly_fastas
Array[File]? contextual_genome_fastas
Array[File]+ sample_metadata_tsvs
File ref_fasta
Int min_unambig_genome
File? clades_tsv
Array[String]? ancestral_traits_to_infer
Boolean make_snps_vcf = false
}
# Per-input descriptions and, for file inputs, the accepted file-name patterns.
# NOTE(review): make_snps_vcf has no entry here.
parameter_meta {
assembly_fastas: {
description: "Set of assembled genomes to align and build trees. These must represent a single chromosome/segment of a genome only. Fastas may be one-sequence-per-individual or a concatenated multi-fasta (unaligned) or a mixture of the two. They may be compressed (gz, bz2, zst, lz4), uncompressed, or a mixture.",
patterns: ["*.fasta", "*.fa", "*.fasta.gz", "*.fasta.zst"]
}
contextual_genome_fastas: {
description: "Set of near-complete contextual genomes to include in tree build. Each fasta provided must represent a single chromosome/segment of a genome. Fastas may be one-sequence-per-individual or a concatenated multi-fasta (unaligned) or a mixture of the two. They may be compressed (gz, bz2, zst, lz4), uncompressed, or a mixture. ",
patterns: ["*.fasta", "*.fa", "*.fasta.gz", "*.fasta.zst"]
}
sample_metadata_tsvs: {
description: "Tab-separated metadata file that contain binning variables and values. Must contain all samples: output will be filtered to the IDs present in this file.",
patterns: ["*.txt", "*.tsv"]
}
ref_fasta: {
description: "A reference assembly (not included in assembly_fastas) to align assembly_fastas against. Typically from NCBI RefSeq or similar.",
patterns: ["*.fasta", "*.fa"]
}
min_unambig_genome: {
description: "Minimum number of called bases in genome to pass prefilter."
}
ancestral_traits_to_infer: {
description: "A list of metadata traits to use for ancestral node inference (see https://nextstrain-augur.readthedocs.io/en/stable/usage/cli/traits.html). Multiple traits may be specified; must correspond exactly to column headers in metadata file. Omitting these values will skip ancestral trait inference, and ancestral nodes will not have estimated values for metadata."
}
clades_tsv: {
description: "A TSV file containing clade mutation positions in four columns: [clade gene site alt]; see: https://nextstrain.org/docs/tutorials/defining-clades",
patterns: ["*.tsv", "*.txt"]
}
}
#### mafft_and_snp
# Feed assembly_fastas plus contextual_genome_fastas to zcat as one flat list;
# select_first substitutes an empty list when no contextual genomes are given.
call utils.zcat {
input:
infiles = flatten([assembly_fastas, select_first([contextual_genome_fastas,[]])]),
output_name = "all_samples_combined_assembly.fasta"
}
# Filter the combined fasta, using min_unambig_genome as the min_non_N threshold.
call utils.filter_sequences_by_length {
input:
sequences_fasta = zcat.combined,
min_non_N = min_unambig_genome
}
# Deduplicate the filtered sequences (task aliased as dedup_seqs).
call nextstrain.nextstrain_deduplicate_sequences as dedup_seqs {
input:
sequences_fasta = filter_sequences_by_length.filtered_fasta
}
# Align the deduplicated sequences against ref_fasta (task aliased as mafft).
call nextstrain.mafft_one_chr as mafft {
input:
sequences = dedup_seqs.sequences_deduplicated_fasta,
ref_fasta = ref_fasta,
basename = "all_samples_aligned.fasta"
}
# snp_sites runs only when make_snps_vcf is true, so its output is optional
# downstream (see the File? unmasked_snps output).
if(make_snps_vcf) {
call nextstrain.snp_sites {
input:
msa_fasta = mafft.aligned_sequences
}
}
#### subsample_by_metadata_with_focal
# tsv_join runs only when more than one metadata TSV is supplied; it is called
# with id_col 'strain'.
if(length(sample_metadata_tsvs)>1) {
call utils.tsv_join {
input:
input_tsvs = sample_metadata_tsvs,
id_col = 'strain',
out_basename = "metadata-merged"
}
}
# Use the joined TSV when tsv_join ran; otherwise select_first falls through to
# the first entry of sample_metadata_tsvs.
call nextstrain.derived_cols {
input:
metadata_tsv = select_first(flatten([[tsv_join.out_tsv], sample_metadata_tsvs]))
}
# Filter/subsample the alignment using the derived metadata (aliased as prefilter).
call nextstrain.filter_subsample_sequences as prefilter {
input:
sequences_fasta = mafft.aligned_sequences,
sample_metadata_tsv = derived_cols.derived_metadata
}
# Run fasta_to_ids on the prefiltered fasta; its ids_txt is exported as keep_list.
call utils.fasta_to_ids {
input:
sequences_fasta = prefilter.filtered_fasta
}
#### augur_from_msa
# Mask sites in the prefiltered alignment; the masked alignment feeds the tree,
# refine and ancestral_tree steps below.
call nextstrain.augur_mask_sites {
input:
sequences = prefilter.filtered_fasta
}
# Build the draft tree (exported as ml_tree) from the masked alignment.
call nextstrain.draft_augur_tree {
input:
msa_or_vcf = augur_mask_sites.masked_sequences
}
# Refine the draft tree using the masked alignment and the derived metadata;
# tree_refined is exported as time_tree and used by every later step.
call nextstrain.refine_augur_tree {
input:
raw_tree = draft_augur_tree.aligned_tree,
msa_or_vcf = augur_mask_sites.masked_sequences,
metadata = derived_cols.derived_metadata
}
# Ancestral trait inference runs only when ancestral_traits_to_infer is both
# provided and non-empty.
if(defined(ancestral_traits_to_infer) && length(select_first([ancestral_traits_to_infer,[]]))>0) {
call nextstrain.ancestral_traits {
input:
tree = refine_augur_tree.tree_refined,
metadata = derived_cols.derived_metadata,
columns = select_first([ancestral_traits_to_infer,[]])
}
}
# Produces nt_muts_json from the refined tree and the masked alignment.
call nextstrain.ancestral_tree {
input:
tree = refine_augur_tree.tree_refined,
msa_or_vcf = augur_mask_sites.masked_sequences
}
# Produces aa_muts_json from the refined tree and nt_muts_json.
call nextstrain.translate_augur_tree {
input:
tree = refine_augur_tree.tree_refined,
nt_muts = ancestral_tree.nt_muts_json
}
# tip_frequencies output is exported directly (tip_frequencies_json); it is not
# included in the node_data_jsons passed to export_auspice_json.
call nextstrain.tip_frequencies {
input:
tree = refine_augur_tree.tree_refined,
metadata = derived_cols.derived_metadata
}
# Clade assignment runs only when clades_tsv is provided; select_first unwraps
# the optional File? for the call.
if(defined(clades_tsv)) {
call nextstrain.assign_clades_to_nodes {
input:
tree_nwk = refine_augur_tree.tree_refined,
nt_muts_json = ancestral_tree.nt_muts_json,
aa_muts_json = translate_augur_tree.aa_muts_json,
ref_fasta = ref_fasta,
clades_tsv = select_first([clades_tsv])
}
}
# select_all drops the node-data files of conditional calls that did not run
# (ancestral_traits, assign_clades_to_nodes).
call nextstrain.export_auspice_json {
input:
tree = refine_augur_tree.tree_refined,
sample_metadata = derived_cols.derived_metadata,
node_data_jsons = select_all([
refine_augur_tree.branch_lengths,
ancestral_traits.node_data_json,
ancestral_tree.nt_muts_json,
translate_augur_tree.aa_muts_json,
assign_clades_to_nodes.node_clade_data_json])
}
output {
# NOTE(review): combined_assemblies is the output of filter_sequences_by_length,
# not the raw zcat.combined file.
File combined_assemblies = filter_sequences_by_length.filtered_fasta
File multiple_alignment = mafft.aligned_sequences
# Optional because snp_sites is inside the make_snps_vcf conditional.
File? unmasked_snps = snp_sites.snps_vcf
File metadata_merged = derived_cols.derived_metadata
File keep_list = fasta_to_ids.ids_txt
File subsampled_sequences = prefilter.filtered_fasta
Int sequences_kept = prefilter.sequences_out
File masked_alignment = augur_mask_sites.masked_sequences
File ml_tree = draft_augur_tree.aligned_tree
File time_tree = refine_augur_tree.tree_refined
# Same select_all list that is passed to export_auspice_json above.
Array[File] node_data_jsons = select_all([
refine_augur_tree.branch_lengths,
ancestral_traits.node_data_json,
ancestral_tree.nt_muts_json,
translate_augur_tree.aa_muts_json,
assign_clades_to_nodes.node_clade_data_json])
File auspice_input_json = export_auspice_json.virus_json
File tip_frequencies_json = tip_frequencies.node_data_json
File root_sequence_json = export_auspice_json.root_sequence_json
}
}
version 1.0
import "../tasks/tasks_nextstrain.wdl" as nextstrain
import "../tasks/tasks_utils.wdl" as utils
# augur_from_assemblies: combines the input assemblies, filters and deduplicates
# them, aligns them against ref_fasta, filters/subsamples using the sample
# metadata, and then runs the augur tree steps whose node-data outputs are
# handed to export_auspice_json.
workflow augur_from_assemblies {
meta {
description: "Align assemblies, build trees, and convert to json representation suitable for Nextstrain visualization. See https://nextstrain.org/docs/getting-started/ and https://nextstrain-augur.readthedocs.io/en/stable/"
author: "Broad Viral Genomics"
email: "viral-ngs@broadinstitute.org"
allowNestedInputs: true
}
# contextual_genome_fastas, clades_tsv and ancestral_traits_to_infer are
# optional; make_snps_vcf defaults to false. All other inputs are required.
input {
Array[File]+ assembly_fastas
Array[File]? contextual_genome_fastas
Array[File]+ sample_metadata_tsvs
File ref_fasta
Int min_unambig_genome
File? clades_tsv
Array[String]? ancestral_traits_to_infer
Boolean make_snps_vcf = false
}
# Per-input descriptions and, for file inputs, the accepted file-name patterns.
# NOTE(review): make_snps_vcf has no entry here.
parameter_meta {
assembly_fastas: {
description: "Set of assembled genomes to align and build trees. These must represent a single chromosome/segment of a genome only. Fastas may be one-sequence-per-individual or a concatenated multi-fasta (unaligned) or a mixture of the two. They may be compressed (gz, bz2, zst, lz4), uncompressed, or a mixture.",
patterns: ["*.fasta", "*.fa", "*.fasta.gz", "*.fasta.zst"]
}
contextual_genome_fastas: {
description: "Set of near-complete contextual genomes to include in tree build. Each fasta provided must represent a single chromosome/segment of a genome. Fastas may be one-sequence-per-individual or a concatenated multi-fasta (unaligned) or a mixture of the two. They may be compressed (gz, bz2, zst, lz4), uncompressed, or a mixture. ",
patterns: ["*.fasta", "*.fa", "*.fasta.gz", "*.fasta.zst"]
}
sample_metadata_tsvs: {
description: "Tab-separated metadata file that contain binning variables and values. Must contain all samples: output will be filtered to the IDs present in this file.",
patterns: ["*.txt", "*.tsv"]
}
ref_fasta: {
description: "A reference assembly (not included in assembly_fastas) to align assembly_fastas against. Typically from NCBI RefSeq or similar.",
patterns: ["*.fasta", "*.fa"]
}
min_unambig_genome: {
description: "Minimum number of called bases in genome to pass prefilter."
}
ancestral_traits_to_infer: {
description: "A list of metadata traits to use for ancestral node inference (see https://nextstrain-augur.readthedocs.io/en/stable/usage/cli/traits.html). Multiple traits may be specified; must correspond exactly to column headers in metadata file. Omitting these values will skip ancestral trait inference, and ancestral nodes will not have estimated values for metadata."
}
clades_tsv: {
description: "A TSV file containing clade mutation positions in four columns: [clade gene site alt]; see: https://nextstrain.org/docs/tutorials/defining-clades",
patterns: ["*.tsv", "*.txt"]
}
}
#### mafft_and_snp
# Feed assembly_fastas plus contextual_genome_fastas to zcat as one flat list;
# select_first substitutes an empty list when no contextual genomes are given.
call utils.zcat {
input:
infiles = flatten([assembly_fastas, select_first([contextual_genome_fastas,[]])]),
output_name = "all_samples_combined_assembly.fasta"
}
# Filter the combined fasta, using min_unambig_genome as the min_non_N threshold.
call utils.filter_sequences_by_length {
input:
sequences_fasta = zcat.combined,
min_non_N = min_unambig_genome
}
# Deduplicate the filtered sequences (task aliased as dedup_seqs).
call nextstrain.nextstrain_deduplicate_sequences as dedup_seqs {
input:
sequences_fasta = filter_sequences_by_length.filtered_fasta
}
# Align the deduplicated sequences against ref_fasta (task aliased as mafft).
call nextstrain.mafft_one_chr as mafft {
input:
sequences = dedup_seqs.sequences_deduplicated_fasta,
ref_fasta = ref_fasta,
basename = "all_samples_aligned.fasta"
}
# snp_sites runs only when make_snps_vcf is true, so its output is optional
# downstream (see the File? unmasked_snps output).
if(make_snps_vcf) {
call nextstrain.snp_sites {
input:
msa_fasta = mafft.aligned_sequences
}
}
#### subsample_by_metadata_with_focal
# tsv_join runs only when more than one metadata TSV is supplied; it is called
# with id_col 'strain'.
if(length(sample_metadata_tsvs)>1) {
call utils.tsv_join {
input:
input_tsvs = sample_metadata_tsvs,
id_col = 'strain',
out_basename = "metadata-merged"
}
}
# Use the joined TSV when tsv_join ran; otherwise select_first falls through to
# the first entry of sample_metadata_tsvs.
call nextstrain.derived_cols {
input:
metadata_tsv = select_first(flatten([[tsv_join.out_tsv], sample_metadata_tsvs]))
}
# Filter/subsample the alignment using the derived metadata (aliased as prefilter).
call nextstrain.filter_subsample_sequences as prefilter {
input:
sequences_fasta = mafft.aligned_sequences,
sample_metadata_tsv = derived_cols.derived_metadata
}
# Run fasta_to_ids on the prefiltered fasta; its ids_txt is exported as keep_list.
call utils.fasta_to_ids {
input:
sequences_fasta = prefilter.filtered_fasta
}
#### augur_from_msa
# Mask sites in the prefiltered alignment; the masked alignment feeds the tree,
# refine and ancestral_tree steps below.
call nextstrain.augur_mask_sites {
input:
sequences = prefilter.filtered_fasta
}
# Build the draft tree (exported as ml_tree) from the masked alignment.
call nextstrain.draft_augur_tree {
input:
msa_or_vcf = augur_mask_sites.masked_sequences
}
# Refine the draft tree using the masked alignment and the derived metadata;
# tree_refined is exported as time_tree and used by every later step.
call nextstrain.refine_augur_tree {
input:
raw_tree = draft_augur_tree.aligned_tree,
msa_or_vcf = augur_mask_sites.masked_sequences,
metadata = derived_cols.derived_metadata
}
# Ancestral trait inference runs only when ancestral_traits_to_infer is both
# provided and non-empty.
if(defined(ancestral_traits_to_infer) && length(select_first([ancestral_traits_to_infer,[]]))>0) {
call nextstrain.ancestral_traits {
input:
tree = refine_augur_tree.tree_refined,
metadata = derived_cols.derived_metadata,
columns = select_first([ancestral_traits_to_infer,[]])
}
}
# Produces nt_muts_json from the refined tree and the masked alignment.
call nextstrain.ancestral_tree {
input:
tree = refine_augur_tree.tree_refined,
msa_or_vcf = augur_mask_sites.masked_sequences
}
# Produces aa_muts_json from the refined tree and nt_muts_json.
call nextstrain.translate_augur_tree {
input:
tree = refine_augur_tree.tree_refined,
nt_muts = ancestral_tree.nt_muts_json
}
# tip_frequencies output is exported directly (tip_frequencies_json); it is not
# included in the node_data_jsons passed to export_auspice_json.
call nextstrain.tip_frequencies {
input:
tree = refine_augur_tree.tree_refined,
metadata = derived_cols.derived_metadata
}
# Clade assignment runs only when clades_tsv is provided; select_first unwraps
# the optional File? for the call.
if(defined(clades_tsv)) {
call nextstrain.assign_clades_to_nodes {
input:
tree_nwk = refine_augur_tree.tree_refined,
nt_muts_json = ancestral_tree.nt_muts_json,
aa_muts_json = translate_augur_tree.aa_muts_json,
ref_fasta = ref_fasta,
clades_tsv = select_first([clades_tsv])
}
}
# select_all drops the node-data files of conditional calls that did not run
# (ancestral_traits, assign_clades_to_nodes).
call nextstrain.export_auspice_json {
input:
tree = refine_augur_tree.tree_refined,
sample_metadata = derived_cols.derived_metadata,
node_data_jsons = select_all([
refine_augur_tree.branch_lengths,
ancestral_traits.node_data_json,
ancestral_tree.nt_muts_json,
translate_augur_tree.aa_muts_json,
assign_clades_to_nodes.node_clade_data_json])
}
output {
# NOTE(review): combined_assemblies is the output of filter_sequences_by_length,
# not the raw zcat.combined file.
File combined_assemblies = filter_sequences_by_length.filtered_fasta
File multiple_alignment = mafft.aligned_sequences
# Optional because snp_sites is inside the make_snps_vcf conditional.
File? unmasked_snps = snp_sites.snps_vcf
File metadata_merged = derived_cols.derived_metadata
File keep_list = fasta_to_ids.ids_txt
File subsampled_sequences = prefilter.filtered_fasta
Int sequences_kept = prefilter.sequences_out
File masked_alignment = augur_mask_sites.masked_sequences
File ml_tree = draft_augur_tree.aligned_tree
File time_tree = refine_augur_tree.tree_refined
# Same select_all list that is passed to export_auspice_json above.
Array[File] node_data_jsons = select_all([
refine_augur_tree.branch_lengths,
ancestral_traits.node_data_json,
ancestral_tree.nt_muts_json,
translate_augur_tree.aa_muts_json,
assign_clades_to_nodes.node_clade_data_json])
File auspice_input_json = export_auspice_json.virus_json
File tip_frequencies_json = tip_frequencies.node_data_json
File root_sequence_json = export_auspice_json.root_sequence_json
}
}