WORKFLOW: scaffold_and_refine_multitaxa
File Path: pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl
WDL Version: 1.0
Type: workflow
Imports
Namespace | Path
assembly | ../tasks/tasks_assembly.wdl
ncbi | ../tasks/tasks_ncbi.wdl
utils | ../tasks/tasks_utils.wdl
assemble_refbased | assemble_refbased.wdl
Workflow: scaffold_and_refine_multitaxa
Scaffold de novo contigs against a set of possible references and subsequently polish with reads. This workflow accepts a very large set of input reference genomes. It will subset the reference genomes to those with ANI hits to the provided contigs/MAGs and cluster the reference hits by any ANI similarity to each other. It will choose the top reference from each cluster and produce one assembly for each cluster. This is intended to allow for the presence of multiple diverse viral taxa (coinfections) while forcing a choice of the best assembly from groups of related reference genomes.
Author: Broad Viral Genomics
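The sketch below is not part of the repository; it illustrates one way this workflow could be imported and called from a parent WDL, passing only its four required top-level inputs. The caller name (example_caller) and the relative import path are assumptions. Nested task inputs such as emailAddress (required by the NCBI download step) would normally be supplied through the runtime inputs JSON, since the workflow enables allowNestedInputs.

version 1.0

# Minimal sketch, assuming the caller sits next to this file in pipes/WDL/workflows/.
import "scaffold_and_refine_multitaxa.wdl" as multitaxa

workflow example_caller {
    input {
        String sample_id
        File   reads_unmapped_bam            # unmapped reads (uBAM)
        File   contigs_fasta                 # de novo contigs / MAGs
        File   taxid_to_ref_accessions_tsv   # reference panel TSV
    }

    # Call the multi-taxa scaffold-and-refine workflow with its required inputs.
    # Optional and nested inputs (skani/nucmer tuning, docker images, emailAddress, ...)
    # are left to their defaults or to the runtime inputs JSON.
    call multitaxa.scaffold_and_refine_multitaxa {
        input:
            sample_id                   = sample_id,
            reads_unmapped_bam          = reads_unmapped_bam,
            contigs_fasta               = contigs_fasta,
            taxid_to_ref_accessions_tsv = taxid_to_ref_accessions_tsv
    }

    output {
        # one metadata row and one assembly per reference cluster with ANI hits
        File        assembly_metadata_tsv = scaffold_and_refine_multitaxa.assembly_stats_by_taxon_tsv
        Array[File] assemblies            = scaffold_and_refine_multitaxa.assembly_all_fastas
    }
}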
Inputs
Required and optional inputs without default values. Because the workflow declares allowNestedInputs, inputs of nested tasks and subworkflows are surfaced here by parameter name only, so the same name (e.g. machine_mem_gb, skani_m) may appear once per task that accepts it.

Name | Type
sample_id | String
sample_name | String?
reads_unmapped_bam | File
contigs_fasta | File
taxid_to_ref_accessions_tsv | File
biosample_accession | String?
emailAddress | String
skani_m | Int?
skani_s | Int?
skani_c | Int?
skani_n | Int?
skani_m | Int?
skani_s | Int?
skani_c | Int?
nucmer_max_gap | Int?
nucmer_min_match | Int?
nucmer_min_cluster | Int?
scaffold_min_contig_len | Int?
scaffold_min_pct_contig_aligned | Float?
machine_mem_gb | Int?
sample_original_name | String?
novocraft_license | File?
trim_coords_bed | File?
machine_mem_gb | Int?
min_keep_length | Int?
sliding_window | Int?
primer_offset | Int?
machine_mem_gb | Int?
reheader_table | File?
amplicon_set | String?
max_coverage_depth | Int?
base_q_threshold | Int?
mapping_q_threshold | Int?
read_length_threshold | Int?
plotXLimits | String?
plotYLimits | String?
machine_mem_gb | Int?
reheader_table | File?
max_coverage_depth | Int?
base_q_threshold | Int?
mapping_q_threshold | Int?
read_length_threshold | Int?
plotXLimits | String?
plotYLimits | String?
65 optional inputs with default values

Name | Type | Default
table_name | String | "sample"
docker | String | "quay.io/broadinstitute/viral-phylo:2.5.1.0"
docker | String | "quay.io/broadinstitute/viral-assemble:2.5.1.0"
machine_mem_gb | Int | 4
cpu | Int | 2
disk_size | Int | 100
disk_size | Int | 375
tar_opts | String | "-z"
aligner | String | "muscle"
replace_length | Int | 55
docker | String | "quay.io/broadinstitute/viral-assemble:2.5.1.0"
sample_name | String | basename(basename(contigs_fasta,".fasta"),".assembly1-spades")
set_default_keys | Array[String] | []
aligner | String | "minimap2"
min_coverage | Int | 3
major_cutoff | Float | 0.75
skip_mark_dupes | Boolean | false
align_to_ref_options | Map[String,String] | {"novoalign": "-r Random -l 40 -g 40 -x 20 -t 501 -k", "bwa": "-k 12 -B 1", "minimap2": ""}
align_to_self_options | Map[String,String] | {"novoalign": "-r Random -l 40 -g 40 -x 20 -t 100", "bwa": "", "minimap2": ""}
docker | String | "quay.io/broadinstitute/viral-core:2.5.1"
sample_name | String | basename(basename(basename(reads_unmapped_bam,".bam"),".taxfilt"),".clean")
min_quality | Int? | 1
docker | String | "andersenlabapps/ivar:1.3.1"
bam_basename | String | basename(aligned_bam,".bam")
disk_size | Int | 375
run_fastqc | Boolean | false
docker | String | "quay.io/broadinstitute/viral-core:2.5.1"
disk_size | Int | 750
machine_mem_gb | Int | 4
out_basename | String | basename(aligned_bam,'.bam')
docker | String | "quay.io/broadinstitute/viral-phylo:2.5.1.0"
max_amp_len | Int | 5000
max_amplicons | Int | 500
machine_mem_gb | Int | 32
docker | String | "quay.io/broadinstitute/viral-core:2.5.1"
docker | String | "quay.io/broadinstitute/viral-core:2.5.1"
skip_mark_dupes | Boolean | false
plot_only_non_duplicates | Boolean | false
bin_large_plots | Boolean | false
binning_summary_statistic | String? | "max"
plot_width_pixels | Int? | 1100
plot_height_pixels | Int? | 850
plot_pixels_per_inch | Int? | 100
docker | String | "quay.io/broadinstitute/viral-core:2.5.1"
mark_duplicates | Boolean | false
machine_mem_gb | Int | 15
docker | String | "quay.io/broadinstitute/viral-assemble:2.5.1.0"
docker | String | "quay.io/broadinstitute/viral-core:2.5.1"
sample_name | String | basename(basename(basename(reads_unmapped_bam,".bam"),".taxfilt"),".clean")
run_fastqc | Boolean | false
docker | String | "quay.io/broadinstitute/viral-core:2.5.1"
disk_size | Int | 750
machine_mem_gb | Int | 4
out_basename | String | basename(aligned_bam,'.bam')
docker | String | "quay.io/broadinstitute/viral-phylo:2.5.1.0"
skip_mark_dupes | Boolean | false
plot_only_non_duplicates | Boolean | false
bin_large_plots | Boolean | false
binning_summary_statistic | String? | "max"
plot_width_pixels | Int? | 1100
plot_height_pixels | Int? | 850
plot_pixels_per_inch | Int? | 100
docker | String | "quay.io/broadinstitute/viral-core:2.5.1"
cpus | Int | 4
cpus | Int | 4
Outputs
Name | Type | Expression
assembly_stats_by_taxon | Array[Map[String,String]] | stats_by_taxon
assembly_stats_by_taxon_tsv | File | select_first([assembly_stats_non_empty.combined, assembly_stats_empty.combined])
assembly_method | String | "viral-ngs/scaffold_and_refine_multitaxa"
skani_num_ref_clusters | Int | length(select_references.matched_reference_clusters_fastas_tars)
skani_contigs_to_refs_dist_tsv | File | select_references.skani_dist_full_tsv
assembly_all_taxids | Array[String] | taxid
assembly_all_taxnames | Array[String] | tax_name
assembly_all_lengths_unambig | Array[Int] | assembly_length_unambiguous
assembly_all_pct_ref_cov | Array[Float] | percent_reference_covered
assembly_all_fastas | Array[File] | assembly_fasta
Calls
This workflow calls the following tasks and subworkflows:

download_ref_genomes_from_tsv (task ncbi.download_ref_genomes_from_tsv), input mappings (1):
  ref_genomes_tsv = taxid_to_ref_accessions_tsv

select_references (task assembly.select_references), input mappings (2):
  reference_genomes_fastas = download_ref_genomes_from_tsv.ref_genomes_fastas
  contigs_fasta = contigs_fasta

tar_extract (task utils.tar_extract), input mappings (1):
  tar_file = ref_cluster_tar

scaffold (task assembly.scaffold), input mappings (6):
  reads_bam = reads_unmapped_bam
  contigs_fasta = contigs_fasta
  reference_genome_fasta = tar_extract.files
  min_length_fraction = 0
  min_unambig = 0
  allow_incomplete_output = true

tax_lookup (task utils.fetch_row_from_tsv), input mappings (4); the expected layout of the lookup TSV is sketched after this list:
  tsv = taxid_to_ref_accessions_tsv
  idx_col = "accessions"
  idx_val = sub(scaffold.scaffolding_chosen_ref_basename, "-", ":")
  add_header = ["taxid", "isolate_prefix", "taxname", "accessions"]

refine (subworkflow assemble_refbased.assemble_refbased), input mappings (3):
  reads_unmapped_bams = [reads_unmapped_bam]
  reference_fasta = scaffold.scaffold_fasta
  sample_name = sample_id + "-" + taxid

assembly_stats_non_empty (task utils.concatenate), input mappings (2):
  infiles = [write_tsv([assembly_header]), write_tsv(select_all(stat_by_taxon))]
  output_name = "assembly_metadata-~{sample_id}.tsv"

assembly_stats_empty (task utils.concatenate), input mappings (2):
  infiles = [write_tsv([assembly_header])]
  output_name = "assembly_metadata-~{sample_id}.tsv"
Images
Container images used by tasks in this workflow:

⚙️ Parameterized (configured via input: docker), used by 1 task: download_ref_genomes_from_tsv
⚙️ Parameterized (configured via input: docker), used by 2 tasks: select_references, scaffold
quay.io/broadinstitute/viral-baseimage:0.3.0 (ubuntu), used by 2 tasks: assembly_stats_non_empty, assembly_stats_empty
flowchart TD
Start([scaffold_and_refine_multitaxa])
N1["download_ref_genomes_from_tsv"]
N2["select_references"]
subgraph S1 ["🔃 scatter ref_cluster_tar in select_references.matched_reference_clusters_fastas_tars"]
direction TB
N3["tar_extract"]
N4["scaffold"]
N5["tax_lookupfetch_row_from_tsv "]
subgraph C1 ["↔️ if scaffold.assembly_preimpute_length_unambiguous > min_scaffold_unambig"]
direction TB
N6[/"refineassemble_refbased "/]
end
end
subgraph C2 ["↔️ if length(select_all(stat_by_taxon)) > 0"]
direction TB
N7["assembly_stats_non_emptyconcatenate "]
end
subgraph C3 ["↔️ if length(select_all(stat_by_taxon)) == 0"]
direction TB
N8["assembly_stats_emptyconcatenate "]
end
N1 --> N2
N2 --> N3
N2 --> N4
N3 --> N4
N4 --> N5
N2 --> N5
N4 --> N6
N2 --> N6
N5 --> N6
Start --> N1
Start --> N7
Start --> N8
N8 --> End([End])
N7 --> End([End])
N6 --> End([End])
classDef taskNode fill:#a371f7,stroke:#8b5cf6,stroke-width:2px,color:#fff
classDef workflowNode fill:#58a6ff,stroke:#1f6feb,stroke-width:2px,color:#fff
version 1.0
import "../tasks/tasks_assembly.wdl" as assembly
import "../tasks/tasks_ncbi.wdl" as ncbi
import "../tasks/tasks_utils.wdl" as utils
import "assemble_refbased.wdl" as assemble_refbased
workflow scaffold_and_refine_multitaxa {
meta {
description: "Scaffold de novo contigs against a set of possible references and subsequently polish with reads. This workflow accepts a very large set of input reference genomes. It will subset the reference genomes to those with ANI hits to the provided contigs/MAGs and cluster the reference hits by any ANI similarity to each other. It will choose the top reference from each cluster and produce one assembly for each cluster. This is intended to allow for the presence of multiple diverse viral taxa (coinfections) while forcing a choice of the best assembly from groups of related reference genomes."
author: "Broad Viral Genomics"
email: "viral-ngs@broadinstitute.org"
allowNestedInputs: true
}
input {
String sample_id
String? sample_name
File reads_unmapped_bam
File contigs_fasta
File taxid_to_ref_accessions_tsv
String? biosample_accession
String table_name = "sample"
}
Int min_scaffold_unambig = 300 # in base-pairs; any scaffolded assembly < this length will not be refined/polished
String sample_original_name = select_first([sample_name, sample_id])
# download (multi-segment) genomes for each reference, fasta filename = dash-concatenated accession list
call ncbi.download_ref_genomes_from_tsv {
input:
ref_genomes_tsv = taxid_to_ref_accessions_tsv
}
# subset reference genomes to those with ANI hits to contigs and cluster reference hits by any ANI similarity to each other
call assembly.select_references {
input:
reference_genomes_fastas = download_ref_genomes_from_tsv.ref_genomes_fastas,
contigs_fasta = contigs_fasta
}
# assemble and produce stats for every reference cluster
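# column order for the per-taxon metadata TSV rows built below; the final "~{table_name}" column
# carries a JSON entity reference ({"entityType", "entityName"}) pointing back to the source sample row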
Array[String] assembly_header = ["entity:assembly_id", "assembly_name", "sample_id", "sample_name", "taxid", "tax_name", "tax_shortname", "assembly_fasta", "aligned_only_reads_bam", "coverage_plot", "assembly_length", "assembly_length_unambiguous", "reads_aligned", "mean_coverage", "percent_reference_covered", "scaffolding_num_segments_recovered", "reference_num_segments_required", "reference_length", "reference_accessions", "skani_num_ref_clusters", "skani_this_cluster_num_refs", "skani_dist_tsv", "scaffolding_ani", "scaffolding_pct_ref_cov", "intermediate_gapfill_fasta", "assembly_preimpute_length_unambiguous", "replicate_concordant_sites", "replicate_discordant_snps", "replicate_discordant_indels", "replicate_discordant_vcf", "isnvsFile", "aligned_bam", "coverage_tsv", "read_pairs_aligned", "bases_aligned", "assembly_method", "assembly_method_version", "biosample_accession", "~{table_name}"]
scatter(ref_cluster_tar in select_references.matched_reference_clusters_fastas_tars) {
call utils.tar_extract {
input:
tar_file = ref_cluster_tar
}
# assemble (scaffold-and-refine) genome against this reference cluster
call assembly.scaffold {
input:
reads_bam = reads_unmapped_bam,
contigs_fasta = contigs_fasta,
reference_genome_fasta = tar_extract.files,
min_length_fraction = 0,
min_unambig = 0,
allow_incomplete_output = true
}
# get taxid and taxname from taxid_to_ref_accessions_tsv
call utils.fetch_row_from_tsv as tax_lookup {
input:
tsv = taxid_to_ref_accessions_tsv,
idx_col = "accessions",
idx_val = sub(scaffold.scaffolding_chosen_ref_basename, "-", ":"),
add_header = ["taxid", "isolate_prefix", "taxname", "accessions"]
}
String taxid = tax_lookup.map["taxid"]
String tax_name = tax_lookup.map["taxname"]
String isolate_prefix = tax_lookup.map["isolate_prefix"]
# polish de novo assembly with reads
if (scaffold.assembly_preimpute_length_unambiguous > min_scaffold_unambig) {
call assemble_refbased.assemble_refbased as refine {
input:
reads_unmapped_bams = [reads_unmapped_bam],
reference_fasta = scaffold.scaffold_fasta,
sample_name = sample_id + "-" + taxid
}
}
# build output tsv row
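# outputs of the conditional "refine" call are optional; select_first() falls back to the
# pre-polish scaffold values (or zero / empty strings) when refinement was skipped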
Int assembly_length_unambiguous = select_first([refine.assembly_length_unambiguous, 0])
Float percent_reference_covered = 1.0 * assembly_length_unambiguous / scaffold.reference_length
File assembly_fasta = select_first([refine.assembly_fasta, scaffold.intermediate_gapfill_fasta])
Map[String, String] stats_by_taxon = {
"entity:assembly_id" : sample_id + "-" + taxid,
"assembly_name" : tax_name + ": " + sample_original_name,
"sample_id" : sample_id,
"sample_name" : sample_original_name,
"taxid" : taxid,
"tax_name" : tax_name,
"tax_shortname" : isolate_prefix,
"assembly_fasta" : assembly_fasta,
"aligned_only_reads_bam" : select_first([refine.align_to_self_merged_aligned_only_bam, ""]),
"coverage_plot" : select_first([refine.align_to_self_merged_coverage_plot, ""]),
"assembly_length" : select_first([refine.assembly_length, "0"]),
"assembly_length_unambiguous" : assembly_length_unambiguous,
"reads_aligned" : select_first([refine.align_to_self_merged_reads_aligned, "0"]),
"mean_coverage" : select_first([refine.align_to_self_merged_mean_coverage, "0"]),
"percent_reference_covered" : percent_reference_covered,
"scaffolding_num_segments_recovered" : scaffold.assembly_num_segments_recovered,
"reference_num_segments_required" : scaffold.reference_num_segments_required,
"reference_length" : scaffold.reference_length,
"reference_accessions" : tax_lookup.map["accessions"],
"skani_num_ref_clusters" : length(select_references.matched_reference_clusters_fastas_tars),
"skani_this_cluster_num_refs" : length(tar_extract.files),
"skani_dist_tsv" : scaffold.scaffolding_stats,
"scaffolding_ani" : scaffold.skani_ani,
"scaffolding_pct_ref_cov" : scaffold.skani_ref_aligned_frac,
"intermediate_gapfill_fasta" : scaffold.intermediate_gapfill_fasta,
"assembly_preimpute_length_unambiguous" : scaffold.assembly_preimpute_length_unambiguous,
"replicate_concordant_sites" : select_first([refine.replicate_concordant_sites, "0"]),
"replicate_discordant_snps" : select_first([refine.replicate_discordant_snps, "0"]),
"replicate_discordant_indels" : select_first([refine.replicate_discordant_indels, "0"]),
"replicate_discordant_vcf" : select_first([refine.replicate_discordant_vcf, ""]),
"isnvsFile" : select_first([refine.align_to_self_isnvs_vcf, ""]),
"aligned_bam" : select_first([refine.align_to_self_merged_aligned_only_bam, ""]),
"coverage_tsv" : select_first([refine.align_to_self_merged_coverage_tsv, ""]),
"read_pairs_aligned" : select_first([refine.align_to_self_merged_read_pairs_aligned, "0"]),
"bases_aligned" : select_first([refine.align_to_self_merged_bases_aligned, "0"]),
"assembly_method" : "viral-ngs/assemble_denovo",
"assembly_method_version" : scaffold.viralngs_version,
"biosample_accession" : select_first([biosample_accession, ""]),
"~{table_name}": '{"entityType":"~{table_name}","entityName":"' + sample_id + '"}'
}
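# flatten the stats map into an ordered row (matching assembly_header), but only for assemblies
# exceeding min_scaffold_unambig; within the outer scatter this yields an optional Array[String]
# per reference cluster, gathered below via select_all()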
if(assembly_length_unambiguous > min_scaffold_unambig) {
scatter(h in assembly_header) {
String stat_by_taxon = stats_by_taxon[h]
}
}
}
### summary stats
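# exactly one of the two branches below runs: either write the header plus all per-cluster rows,
# or (if no assembly passed min_scaffold_unambig) write a header-only TSV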
if (length(select_all(stat_by_taxon)) > 0) {
call utils.concatenate as assembly_stats_non_empty {
input:
infiles = [write_tsv([assembly_header]), write_tsv(select_all(stat_by_taxon))],
output_name = "assembly_metadata-~{sample_id}.tsv"
}
}
if (length(select_all(stat_by_taxon)) == 0) {
call utils.concatenate as assembly_stats_empty {
input:
infiles = [write_tsv([assembly_header])],
output_name = "assembly_metadata-~{sample_id}.tsv"
}
}
output {
Array[Map[String,String]] assembly_stats_by_taxon = stats_by_taxon
File assembly_stats_by_taxon_tsv = select_first([assembly_stats_non_empty.combined, assembly_stats_empty.combined])
String assembly_method = "viral-ngs/scaffold_and_refine_multitaxa"
Int skani_num_ref_clusters = length(select_references.matched_reference_clusters_fastas_tars)
File skani_contigs_to_refs_dist_tsv = select_references.skani_dist_full_tsv
Array[String] assembly_all_taxids = taxid
Array[String] assembly_all_taxnames = tax_name
Array[Int] assembly_all_lengths_unambig = assembly_length_unambiguous
Array[Float] assembly_all_pct_ref_cov = percent_reference_covered
Array[File] assembly_all_fastas = assembly_fasta
}
}