TASKS tasks_metagenomics
| File Path | pipes/WDL/tasks/tasks_metagenomics.wdl |
|---|---|
| WDL Version | 1.0 |
| Type | tasks |
📋 Tasks in this document

- krakenuniq: Runs Krakenuniq classification
- build_krakenuniq_db
- kraken2: Runs Kraken2 classification
- report_primary_kraken_taxa: Interprets a kraken (or kraken2 or krakenuniq) summary report file and emits the primary contributing taxa under a focal taxon of interest
- filter_refs_to_found_taxa: Filters a taxid_to_ref_accessions_tsv to the set of taxa found in a focal_report
- build_kraken2_db: Builds a custom kraken2 database. Outputs tar.zst tarballs of kraken2 database, associated krona taxonomy db, and an ncbi taxdump.tar.gz
- blastx: Runs BLASTx classification
- krona
- krona_merge
- filter_bam_to_taxa
- kaiju
Tasks
TASKS krakenuniq
Runs Krakenuniq classification
Inputs
| Name | Type | Description | Default |
|---|---|---|---|
| reads_unmapped_bam | Array[File] | Reads to classify. May be unmapped or mapped or both, paired-end or single-end. | - |
| krakenuniq_db_tar_lz4 | File | Pre-built Kraken database tarball. | - |
| krona_taxonomy_db_tgz | File | Krona taxonomy database containing a single file: taxonomy.tab, or possibly just a compressed taxonomy.tab | - |
| 2 optional inputs with default values | | | |
Command
set -ex -o pipefail
if [ -z "$TMPDIR" ]; then
export TMPDIR=$(pwd)
fi
DB_DIR=$(mktemp -d --suffix _db)
mkdir -p $DB_DIR/krakenuniq $DB_DIR/krona
metagenomics.py --version | tee VERSION
# decompress DB to $DB_DIR
read_utils.py extract_tarball \
"~{krakenuniq_db_tar_lz4}" $DB_DIR/krakenuniq \
--loglevel=DEBUG
# Support old db tar format
if [ -d "$DB_DIR/krakenuniq/krakenuniq" ]; then
mv $DB_DIR/krakenuniq/krakenuniq/* $DB_DIR/krakenuniq
fi
# unpack krona taxonomy.tab
if [[ "~{krona_taxonomy_db_tgz}" == *.tar.* ]]; then
read_utils.py extract_tarball \
"~{krona_taxonomy_db_tgz}" $DB_DIR/krona \
--loglevel=DEBUG & # we don't need this until later
else
if [[ "~{krona_taxonomy_db_tgz}" == *.zst ]]; then
cat "~{krona_taxonomy_db_tgz}" | zstd -d > $DB_DIR/krona/taxonomy.tab &
elif [[ "~{krona_taxonomy_db_tgz}" == *.gz ]]; then
cat "~{krona_taxonomy_db_tgz}" | pigz -dc > $DB_DIR/krona/taxonomy.tab &
elif [[ "~{krona_taxonomy_db_tgz}" == *.bz2 ]]; then
cat "~{krona_taxonomy_db_tgz}" | bzip2 -dc > $DB_DIR/krona/taxonomy.tab &
else
cp "~{krona_taxonomy_db_tgz}" $DB_DIR/krona/taxonomy.tab &
fi
fi
# prep input and output file names
OUT_READS=fnames_outreads.txt
OUT_REPORTS=fnames_outreports.txt
OUT_BASENAME=basenames_reports.txt
for bam in ~{sep=' ' reads_unmapped_bam}; do
echo "$(basename $bam .bam).krakenuniq-reads.txt.gz" >> $OUT_READS
echo "$(basename $bam .bam)" >> $OUT_BASENAME
echo "$(basename $bam .bam).krakenuniq-summary_report.txt" >> $OUT_REPORTS
done
# execute on all inputs and outputs serially, but with a single
# database load into ram
metagenomics.py krakenuniq \
$DB_DIR/krakenuniq \
"~{sep="" "" reads_unmapped_bam}" \
--outReads $(cat $OUT_READS) \
--outReport $(cat $OUT_REPORTS) \
--loglevel=DEBUG
wait # for krona_taxonomy_db_tgz to download and extract
# Support old db tar format
if [ -d $DB_DIR/krona/taxonomy ]; then
mv $DB_DIR/krona/taxonomy/* $DB_DIR/krona
fi
# run single-threaded krona on up to nproc samples at once
parallel -I ,, \
"metagenomics.py krona \
,,.krakenuniq-summary_report.txt \
$DB_DIR/krona \
,,.krakenuniq-krona.html \
--sample_name ,, \
--noRank --noHits --inputType krakenuniq \
--loglevel=DEBUG" \
::: $(cat $OUT_BASENAME)
# merge all krona reports
ktImportKrona -o krakenuniq.krona.combined.html *.krakenuniq-krona.html
{ if [ -f /sys/fs/cgroup/memory.peak ]; then cat /sys/fs/cgroup/memory.peak; elif [ -f /sys/fs/cgroup/memory/memory.max_usage_in_bytes ]; then cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes; else echo "0"; fi } > MEM_BYTES
Outputs
| Name | Type | Expression |
|---|---|---|
| krakenuniq_classified_reads | Array[File] | glob("*.krakenuniq-reads.txt.gz") |
| krakenuniq_summary_reports | Array[File] | glob("*.krakenuniq-summary_report.txt") |
| krona_report_html | Array[File] | glob("*.krakenuniq-krona.html") |
| krona_report_merged_html | File | "krakenuniq.krona.combined.html" |
| max_ram_gb | Int | ceil((read_float("MEM_BYTES") / 1000000000)) |
| viralngs_version | String | read_string("VERSION") |
Runtime
| Key | Value |
|---|---|
| docker | docker |
| memory | machine_mem_gb + " GB" |
| cpu | 32 |
| disks | "local-disk " + disk_size + " LOCAL" |
| disk | disk_size + " GB" |
| dx_instance_type | "mem3_ssd1_v2_x48" |
| preemptible | 0 |
| maxRetries | 2 |
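As a usage sketch, the call below shows how this task might be invoked from a WDL 1.0 workflow. The workflow name and input bindings are illustrative; the task input and output names match the tables above. As the command notes, a single call classifies all BAMs with one database load into RAM.

```wdl
version 1.0
import "tasks_metagenomics.wdl" as metagenomics

workflow classify_with_krakenuniq {
  input {
    Array[File] bams        # reads to classify, one BAM per sample
    File krakenuniq_db      # pre-built KrakenUniq database tarball
    File krona_db           # krona taxonomy.tab, possibly compressed
  }
  # one call classifies every BAM with a single database load
  call metagenomics.krakenuniq {
    input:
      reads_unmapped_bam    = bams,
      krakenuniq_db_tar_lz4 = krakenuniq_db,
      krona_taxonomy_db_tgz = krona_db
  }
  output {
    Array[File] per_sample_krona = krakenuniq.krona_report_html
    File combined_krona          = krakenuniq.krona_report_merged_html
  }
}
```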
TASKS build_krakenuniq_db
Inputs
| Name | Type | Description | Default |
|---|---|---|---|
| genome_fastas_tarball | File | - | - |
| taxonomy_db_tarball | File | - | - |
| db_basename | String | - | - |
| subsetTaxonomy | Boolean? | - | - |
| minimizerLen | Int? | - | - |
| kmerLen | Int? | - | - |
| maxDbSize | Int? | - | - |
| zstd_compression_level | Int? | - | - |
| 2 optional inputs with default values | | | |
Command
set -ex -o pipefail
if [ -z "$TMPDIR" ]; then
export TMPDIR=$(pwd)
fi
TAXDB_DIR=$(mktemp -d --suffix _taxdb)
FASTAS_DIR=$(mktemp -d --suffix fasta)
DB_DIR="$TMPDIR/~{db_basename}"
mkdir -p $DB_DIR
metagenomics.py --version | tee VERSION
# decompress input tarballs
read_utils.py extract_tarball \
"~{genome_fastas_tarball}" $FASTAS_DIR \
--loglevel=DEBUG
read_utils.py extract_tarball \
"~{taxonomy_db_tarball}" $TAXDB_DIR \
--loglevel=DEBUG
# build database
metagenomics.py krakenuniq_build \
$DB_DIR --library $FASTAS_DIR --taxonomy $TAXDB_DIR \
~{true="--subsetTaxonomy=" false="" subsetTaxonomy} \
~{'--minimizerLen=' + minimizerLen} \
~{'--kmerLen=' + kmerLen} \
~{'--maxDbSize=' + maxDbSize} \
--clean \
--loglevel=DEBUG
# tar it up
tar -c -C $DB_DIR . | zstd ~{"-" + zstd_compression_level} > "~{db_basename}.tar.zst"
Outputs
| Name | Type | Expression |
|---|---|---|
| krakenuniq_db | File | "~{db_basename}.tar.zst" |
| viralngs_version | String | read_string("VERSION") |
Runtime
| Key | Value |
|---|---|
| docker | docker |
| memory | machine_mem_gb + " GB" |
| disks | "local-disk " + disk_size + " LOCAL" |
| disk | disk_size + " GB" |
| cpu | 32 |
| dx_instance_type | "mem3_ssd1_v2_x32" |
| preemptible | 0 |
| maxRetries | 2 |
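The `~{'--kmerLen=' + kmerLen}` placeholders in the command above rely on a standard WDL 1.0 idiom: concatenating a string literal with an undefined optional yields an undefined value, which the placeholder renders as an empty string, so unset options simply drop out of the command line. A minimal, self-contained sketch of the idiom (the task name and echo are hypothetical):

```wdl
version 1.0

task show_optional_flag {
  input {
    Int? kmerLen  # leave undefined and the flag drops out entirely
  }
  command {
    # prints "--kmerLen=31" when kmerLen = 31; prints nothing when unset
    echo ~{'--kmerLen=' + kmerLen}
  }
  output {
    String rendered = read_string(stdout())
  }
  runtime {
    docker: "ubuntu:22.04"
  }
}
```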
TASKS kraken2
Runs Kraken2 classification
Inputs
| Name | Type | Description | Default |
|---|---|---|---|
| reads_bam | File | Reads or contigs to classify. May be unmapped or mapped or both, paired-end or single-end. | - |
| kraken2_db_tgz | File | Pre-built Kraken database tarball containing three files: hash.k2d, opts.k2d, and taxo.k2d. | - |
| krona_taxonomy_db_tgz | File | Krona taxonomy database containing a single file: taxonomy.tab, or possibly just a compressed taxonomy.tab | - |
| min_base_qual | Int? | Minimum base quality used in classification | - |
| 3 optional inputs with default values | | | |
Command
set -ex -o pipefail
if [ -z "$TMPDIR" ]; then
export TMPDIR=$(pwd)
fi
DB_DIR=$(mktemp -d --suffix _db)
mkdir -p $DB_DIR/kraken2 $DB_DIR/krona
# decompress DB to $DB_DIR
read_utils.py extract_tarball \
"~{kraken2_db_tgz}" $DB_DIR/kraken2 \
--loglevel=DEBUG
du -hs $DB_DIR/kraken2
# unpack krona taxonomy.tab
if [[ "~{krona_taxonomy_db_tgz}" == *.tar.* ]]; then
read_utils.py extract_tarball \
"~{krona_taxonomy_db_tgz}" $DB_DIR/krona \
--loglevel=DEBUG & # we don't need this until later
else
if [[ "~{krona_taxonomy_db_tgz}" == *.zst ]]; then
cat "~{krona_taxonomy_db_tgz}" | zstd -d > $DB_DIR/krona/taxonomy.tab &
elif [[ "~{krona_taxonomy_db_tgz}" == *.gz ]]; then
cat "~{krona_taxonomy_db_tgz}" | pigz -dc > $DB_DIR/krona/taxonomy.tab &
elif [[ "~{krona_taxonomy_db_tgz}" == *.bz2 ]]; then
cat "~{krona_taxonomy_db_tgz}" | bzip2 -dc > $DB_DIR/krona/taxonomy.tab &
else
cp "~{krona_taxonomy_db_tgz}" $DB_DIR/krona/taxonomy.tab &
fi
fi
metagenomics.py --version | tee VERSION
if [[ "~{reads_bam}" == *.bam ]]; then
metagenomics.py kraken2 \
$DB_DIR/kraken2 \
"~{reads_bam}" \
--outReads "~{out_basename}".kraken2.reads.txt \
--outReports "~{out_basename}".kraken2.report.txt \
~{"--confidence " + confidence_threshold} \
~{"--min_base_qual " + min_base_qual} \
--loglevel=DEBUG
else # fasta input file: call kraken2 directly
kraken2 \
--db $DB_DIR/kraken2 \
"~{reads_bam}" \
--output "~{out_basename}".kraken2.reads.txt \
--report "~{out_basename}".kraken2.report.txt \
~{"--confidence " + confidence_threshold} \
~{"--min_base_qual " + min_base_qual}
fi
wait # for krona_taxonomy_db_tgz to download and extract
pigz "~{out_basename}".kraken2.reads.txt &
metagenomics.py krona \
"~{out_basename}".kraken2.report.txt \
$DB_DIR/krona \
"~{out_basename}".kraken2.krona.html \
--sample_name "~{out_basename}" \
--noRank --noHits --inputType kraken2 \
--loglevel=DEBUG
wait # pigz reads.txt
{ if [ -f /sys/fs/cgroup/memory.peak ]; then cat /sys/fs/cgroup/memory.peak; elif [ -f /sys/fs/cgroup/memory/memory.max_usage_in_bytes ]; then cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes; else echo "0"; fi } > MEM_BYTES
Outputs
| Name | Type | Expression |
|---|---|---|
| kraken2_reads_report | File | "~{out_basename}.kraken2.reads.txt.gz" |
| kraken2_summary_report | File | "~{out_basename}.kraken2.report.txt" |
| krona_report_html | File | "~{out_basename}.kraken2.krona.html" |
| max_ram_gb | Int | ceil((read_float("MEM_BYTES") / 1000000000)) |
| viralngs_version | String | read_string("VERSION") |
Runtime
| Key | Value |
|---|---|
| docker | docker |
| memory | machine_mem_gb + " GB" |
| cpu | 16 |
| cpuPlatform | "Intel Ice Lake" |
| disks | "local-disk " + disk_size + " LOCAL" |
| disk | disk_size + " GB" |
| dx_instance_type | "mem3_ssd1_v2_x8" |
| preemptible | 2 |
| maxRetries | 2 |
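A usage sketch: the fragment below calls this task on a single BAM from within a workflow that imports this file as `metagenomics`. Input names come from the table above; the bound values are illustrative.

```wdl
call metagenomics.kraken2 {
  input:
    reads_bam             = sample_bam,  # a .fasta input would be passed to the kraken2 CLI directly
    kraken2_db_tgz        = kraken2_db,
    krona_taxonomy_db_tgz = krona_db,
    min_base_qual         = 20           # hypothetical quality floor
}
```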
TASKS report_primary_kraken_taxa
Interprets a kraken (or kraken2 or krakenuniq) summary report file and emits the primary contributing taxa under a focal taxon of interest.
Inputs
| Name | Type | Description | Default |
|---|---|---|---|
| kraken_summary_report | File | - | - |
| 2 optional inputs with default values | | | |
Command
set -e
metagenomics.py taxlevel_plurality "~{kraken_summary_report}" "~{focal_taxon}" "~{out_basename}.ranked_focal_report.tsv"
cat "~{out_basename}.ranked_focal_report.tsv" | head -2 | tail +2 > TOPROW
cut -f 2 TOPROW > NUM_FOCAL
cut -f 4 TOPROW > PCT_OF_FOCAL
cut -f 7 TOPROW > NUM_READS
cut -f 8 TOPROW > TAX_RANK
cut -f 9 TOPROW > TAX_ID
cut -f 10 TOPROW > TAX_NAME
Outputs
| Name | Type | Expression |
|---|---|---|
| focal_tax_name | String | focal_taxon |
| ranked_focal_report | File | "~{out_basename}.ranked_focal_report.tsv" |
| total_focal_reads | Int | read_int("NUM_FOCAL") |
| percent_of_focal | Float | read_float("PCT_OF_FOCAL") |
| num_reads | Int | read_int("NUM_READS") |
| tax_rank | String | read_string("TAX_RANK") |
| tax_id | String | read_string("TAX_ID") |
| tax_name | String | read_string("TAX_NAME") |
Runtime
| Key | Value |
|---|---|
| docker | docker |
| memory | machine_mem_gb + " GB" |
| cpu | 1 |
| disks | "local-disk " + disk_size + " LOCAL" |
| disk | disk_size + " GB" |
| dx_instance_type | "mem1_ssd1_v2_x2" |
| preemptible | 2 |
| maxRetries | 2 |
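This task typically consumes a summary report emitted by kraken2 (or krakenuniq) earlier in the same workflow. A sketch, assuming a preceding `call metagenomics.kraken2` and treating `focal_taxon` (one of the undocumented optional inputs used in the command above) as configurable:

```wdl
call metagenomics.report_primary_kraken_taxa {
  input:
    kraken_summary_report = kraken2.kraken2_summary_report,
    focal_taxon           = "Viruses"  # illustrative focal taxon
}
```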
TASKS filter_refs_to_found_taxa
Filters a taxid_to_ref_accessions_tsv to the set of taxa found in a focal_report.
Inputs
| Name | Type | Description | Default |
|---|---|---|---|
| taxid_to_ref_accessions_tsv | File | - | - |
| focal_report_tsv | File | - | - |
| taxdump_tgz | File | - | - |
| 2 optional inputs with default values | | | |
Command
set -e
mkdir -p taxdump
read_utils.py extract_tarball "~{taxdump_tgz}" taxdump
metagenomics.py filter_taxids_to_focal_hits "~{taxid_to_ref_accessions_tsv}" "~{focal_report_tsv}" taxdump ~{min_read_count} "~{ref_basename}-~{hits_basename}.tsv"
Outputs
| Name | Type | Expression |
|---|---|---|
| filtered_taxid_to_ref_accessions_tsv | File | "~{ref_basename}-~{hits_basename}.tsv" |
Runtime
| Key | Value |
|---|---|
| docker | docker |
| memory | "2 GB" |
| cpu | 1 |
| disks | "local-disk " + disk_size + " LOCAL" |
| disk | disk_size + " GB" |
| dx_instance_type | "mem1_ssd1_v2_x2" |
| preemptible | 2 |
| maxRetries | 2 |
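In practice this task is chained after report_primary_kraken_taxa. A sketch, where `all_refs_tsv` and `taxdump` are assumed workflow-level inputs:

```wdl
call metagenomics.filter_refs_to_found_taxa {
  input:
    taxid_to_ref_accessions_tsv = all_refs_tsv,
    focal_report_tsv            = report_primary_kraken_taxa.ranked_focal_report,
    taxdump_tgz                 = taxdump
}
```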
TASKS build_kraken2_db
Builds a custom kraken2 database. Outputs tar.zst tarballs of kraken2 database, associated krona taxonomy db, and an ncbi taxdump.tar.gz. Requires live internet access if any standard_libraries are specified or if taxonomy_db_tgz is absent.
Inputs
| Name | Type | Description | Default |
|---|---|---|---|
| db_basename | String | A descriptive string used in output filenames. Outputs will be called kraken2-<db_basename>.tar.zst, krona-<db_basename>.tar.zst, and taxdump-<db_basename>.tar.gz | - |
| taxonomy_db_tgz | File? | Optional tarball of kraken2 taxonomy database directory. Omitting this input will cause a fresh download from NCBI at the time of build. | - |
| kmerLen | Int? | (k) K-mer length in bp/aa (Kraken2 defaults: 35 nt, 15 aa) | - |
| minimizerLen | Int? | (l) Minimizer length in bp/aa (Kraken2 defaults: 31 nt, 12 aa) | - |
| minimizerSpaces | Int? | (s) Number of bp/aa in minimizer that are ignored in comparisons (Kraken2 defaults: 7 nt, 0 aa) | - |
| maxDbSize | Int? | - | - |
| zstd_compression_level | Int? | - | - |
| 5 optional inputs with default values | | | |
Command
set -ex -o pipefail
if [ -z "$TMPDIR" ]; then
export TMPDIR=$(pwd)
fi
TAXDB_DIR=$(mktemp -d)
FASTAS_DIR=$(mktemp -d)
KRONA_DIR=$(mktemp -d)
DB_DIR=$(mktemp -d)
metagenomics.py --version | tee VERSION
# prep input taxonomy db, if specified
if [ -n "~{taxonomy_db_tgz}" ]; then
read_utils.py extract_tarball \
"~{taxonomy_db_tgz}" $TAXDB_DIR \
--loglevel=DEBUG &
TAX_INPUT_CMD="--tax_db=$TAXDB_DIR"
else
TAX_INPUT_CMD=""
fi
# prep input custom fastas, if specified
CUSTOM_INPUT_CMD=""
if [ -n "~{sep=" " custom_libraries}" ]; then
CUSTOM_INPUT_CMD="--custom_libraries "
for TGZ in ~{sep=" " custom_libraries}; do
if [[ ($TGZ == *.tar.*) || ($TGZ == *.tgz) ]]; then
read_utils.py extract_tarball \
$TGZ $FASTAS_DIR \
--loglevel=DEBUG &
else
if [[ $TGZ == *.zst ]]; then
cat $TGZ | zstd -d > $FASTAS_DIR/$(basename $TGZ .zst) &
elif [[ $TGZ == *.gz ]]; then
cat $TGZ | pigz -dc > $FASTAS_DIR/$(basename $TGZ .gz) &
elif [[ $TGZ == *.bz2 ]]; then
cat $TGZ | bzip2 -dc > $FASTAS_DIR/$(basename $TGZ .bz2) &
else
CUSTOM_INPUT_CMD="$CUSTOM_INPUT_CMD $TGZ"
fi
fi
done
wait # wait for all decompressions to finish
for FASTA in $FASTAS_DIR/*; do
CUSTOM_INPUT_CMD="$CUSTOM_INPUT_CMD $FASTA"
done
fi
# prep standard libraries, if specified
STD_INPUT_CMD=""
if [ -n "~{sep=" " standard_libraries}" ]; then
STD_INPUT_CMD="--standard_libraries ~{sep=" " standard_libraries}"
fi
# build kraken2 database
wait # wait for all decompressions to finish
metagenomics.py kraken2_build \
$DB_DIR \
$TAX_INPUT_CMD \
$STD_INPUT_CMD \
$CUSTOM_INPUT_CMD \
--taxdump_out "taxdump-~{db_basename}.tar.gz" \
~{true="--protein" false="" protein} \
~{'--kmerLen=' + kmerLen} \
~{'--minimizerLen=' + minimizerLen} \
~{'--minimizerSpaces=' + minimizerSpaces} \
~{'--maxDbSize=' + maxDbSize} \
--loglevel=DEBUG
tar -c -C $DB_DIR . | zstd ~{"-" + zstd_compression_level} > "kraken2-~{db_basename}.tar.zst" &
# build matching krona db
metagenomics.py krona_build \
$KRONA_DIR --taxdump_tar_gz "taxdump-~{db_basename}.tar.gz"
cat $KRONA_DIR/taxonomy.tab | zstd -19 > "krona-~{db_basename}-taxonomy.tab.zst"
wait # tar/zst of kraken2 db
Outputs
| Name | Type | Expression |
|---|---|---|
| kraken2_db | File | "kraken2-~{db_basename}.tar.zst" |
| taxdump_tgz | File | "taxdump-~{db_basename}.tar.gz" |
| krona_db | File | "krona-~{db_basename}-taxonomy.tab.zst" |
| viralngs_version | String | read_string("VERSION") |
Runtime
| Key | Value |
|---|---|
| docker | docker |
| memory | machine_mem_gb + " GB" |
| disks | "local-disk " + disk_size + " LOCAL" |
| disk | disk_size + " GB" |
| cpu | 16 |
| dx_instance_type | "mem3_ssd1_v2_x16" |
| preemptible | 0 |
| maxRetries | 2 |
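A usage sketch: `standard_libraries` appears in the command above but is among the optional inputs not listed in the table, so its binding here is an assumption ("viral" and "human" are standard kraken2-build library names). Recall from the task description that specifying standard libraries requires live internet access at build time.

```wdl
call metagenomics.build_kraken2_db {
  input:
    db_basename        = "viral-custom",
    taxonomy_db_tgz    = taxonomy_tarball,   # omit to trigger a fresh NCBI download
    standard_libraries = ["viral", "human"]  # requires internet access at build time
}
```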
TASKS blastx
Runs BLASTx classification
Inputs
| Name | Type | Description | Default |
|---|---|---|---|
| contigs_fasta | File | Sequences to classify. Use for a small number of longer query sequences (e.g. contigs) | - |
| blast_db_tgz | File | Pre-built BLAST database tarball containing an indexed blast database named 'nr' | - |
| krona_taxonomy_db_tgz | File | Krona taxonomy database: a tarball containing a taxonomy.tab file as well as accession to taxid mapping (a kraken-based taxonomy database will not suffice). | - |
| 2 optional inputs with default values | | | |
Command
set -ex -o pipefail
if [ -z "$TMPDIR" ]; then
export TMPDIR=$(pwd)
fi
DB_DIR=$(mktemp -d --suffix _db)
mkdir -p $DB_DIR/blast $DB_DIR/krona
# decompress DB to $DB_DIR
read_utils.py extract_tarball \
"~{blast_db_tgz}" $DB_DIR/blast \
--loglevel=DEBUG
# unpack krona taxonomy database
read_utils.py extract_tarball \
"~{krona_taxonomy_db_tgz}" $DB_DIR/krona \
--loglevel=DEBUG & # we don't need this until later
blastx -version | tee VERSION
blastx \
-query "~{contigs_fasta}" \
-db $DB_DIR/blast/nr \
-out "~{out_basename}.blastx.contigs.txt" \
-outfmt 7 \
-num_threads $(nproc)
wait # for krona_taxonomy_db_tgz to download and extract
ktImportBLAST \
-i -k \
-tax $DB_DIR/krona \
-o "~{out_basename}.blastx.krona.html" \
"~{out_basename}.blastx.contigs.txt","~{out_basename}"
pigz "~{out_basename}".blastx.contigs.txt
Outputs
| Name | Type | Expression |
|---|---|---|
| blast_report | File | "~{out_basename}.blastx.contigs.txt.gz" |
| krona_report_html | File | "~{out_basename}.blastx.krona.html" |
| blastx_version | String | read_string("VERSION") |
Runtime
| Key | Value |
|---|---|
| docker | docker |
| memory | machine_mem_gb + " GB" |
| cpu | 32 |
| disks | "local-disk " + disk_size + " LOCAL" |
| disk | disk_size + " GB" |
| dx_instance_type | "mem1_ssd1_v2_x36" |
| preemptible | 1 |
| maxRetries | 2 |
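A sketch of a typical call, classifying contigs produced by an upstream assembly step; the `assembled_contigs` and database bindings are illustrative:

```wdl
call metagenomics.blastx {
  input:
    contigs_fasta         = assembled_contigs,  # e.g. output of an upstream assembly task
    blast_db_tgz          = nr_db_tarball,      # must contain an indexed db named 'nr'
    krona_taxonomy_db_tgz = krona_full_db       # needs accession-to-taxid mappings
}
```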
TASKS krona
Inputs
| Name | Type | Description | Default |
|---|---|---|---|
| reports_txt_gz | Array[File]+ | - | - |
| krona_taxonomy_db_tgz | File | - | - |
| input_type | String? | - | - |
| query_column | Int? | - | - |
| taxid_column | Int? | - | - |
| score_column | Int? | - | - |
| magnitude_column | Int? | - | - |
| 3 optional inputs with default values | | | |
Command
set -ex -o pipefail
if [ -z "$TMPDIR" ]; then
export TMPDIR=$(pwd)
fi
DB_DIR=$(mktemp -d --suffix _db)
mkdir -p $DB_DIR/krona
metagenomics.py --version | tee VERSION
# unpack krona taxonomy.tab
if [[ "~{krona_taxonomy_db_tgz}" == *.tar.* ]]; then
read_utils.py extract_tarball \
"~{krona_taxonomy_db_tgz}" $DB_DIR/krona \
--loglevel=DEBUG
else
if [[ "~{krona_taxonomy_db_tgz}" == *.zst ]]; then
cat "~{krona_taxonomy_db_tgz}" | zstd -d > $DB_DIR/krona/taxonomy.tab
elif [[ "~{krona_taxonomy_db_tgz}" == *.gz ]]; then
cat "~{krona_taxonomy_db_tgz}" | pigz -dc > $DB_DIR/krona/taxonomy.tab
elif [[ "~{krona_taxonomy_db_tgz}" == *.bz2 ]]; then
cat "~{krona_taxonomy_db_tgz}" | bzip2 -dc > $DB_DIR/krona/taxonomy.tab
else
cp "~{krona_taxonomy_db_tgz}" $DB_DIR/krona/taxonomy.tab
fi
fi
metagenomics.py krona \
"~{sep="" "" reports_txt_gz}" \
$DB_DIR/krona \
"~{out_basename}.html" \
~{'--inputType=' + input_type} \
~{'--queryColumn=' + query_column} \
~{'--taxidColumn=' + taxid_column} \
~{'--scoreColumn=' + score_column} \
~{'--magnitudeColumn=' + magnitude_column} \
--noRank --noHits \
--loglevel=DEBUG
Outputs
| Name | Type | Expression |
|---|---|---|
| krona_report_html | File | "~{out_basename}.html" |
| viralngs_version | String | read_string("VERSION") |
Runtime
| Key | Value |
|---|---|
| docker | docker |
| memory | machine_mem_gb + " GB" |
| cpu | 1 |
| disks | "local-disk " + disk_size + " HDD" |
| disk | disk_size + " GB" |
| dx_instance_type | "mem1_ssd2_v2_x2" |
| maxRetries | 2 |
TASKS krona_merge
Inputs
| Name | Type | Description | Default |
|---|---|---|---|
| krona_reports | Array[File] | - | - |
| out_basename | String | - | - |
| 2 optional inputs with default values | | | |
Command
set -ex -o pipefail
ktImportKrona | head -2 | tail -1 | cut -f 2-3 -d ' ' | tee VERSION
ktImportKrona -o "~{out_basename}.html" ~{sep=" " krona_reports}
Outputs
| Name | Type | Expression |
|---|---|---|
| krona_report_html | File | "~{out_basename}.html" |
| krona_version | String | read_string("VERSION") |
Runtime
| Key | Value |
|---|---|
| docker | docker |
| memory | machine_mem_gb + " GB" |
| cpu | 1 |
| disks | "local-disk " + disk_size + " HDD" |
| disk | disk_size + " GB" |
| dx_instance_type | "mem1_ssd2_v2_x2" |
| maxRetries | 2 |
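krona_merge pairs naturally with a scatter over samples. The sketch below classifies each BAM with kraken2 and merges the per-sample krona charts; the workflow name and bindings are illustrative, while all task and output names match the tables in this document.

```wdl
version 1.0
import "tasks_metagenomics.wdl" as metagenomics

workflow classify_and_merge {
  input {
    Array[File] bams
    File kraken2_db
    File krona_db
  }
  # classify each sample independently
  scatter (bam in bams) {
    call metagenomics.kraken2 {
      input:
        reads_bam             = bam,
        kraken2_db_tgz        = kraken2_db,
        krona_taxonomy_db_tgz = krona_db
    }
  }
  # gather the scattered per-sample krona HTML reports into one chart
  call metagenomics.krona_merge {
    input:
      krona_reports = kraken2.krona_report_html,
      out_basename  = "all_samples"
  }
  output {
    File merged_krona = krona_merge.krona_report_html
  }
}
```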
TASKS filter_bam_to_taxa
Inputs
| Name | Type | Description | Default |
|---|---|---|---|
| classified_bam | File | - | - |
| classified_reads_txt_gz | File | - | - |
| ncbi_taxonomy_db_tgz | File | - | - |
| taxonomic_names | Array[String]? | - | - |
| taxonomic_ids | Array[Int]? | - | - |
| minimum_hit_groups | Int? | - | - |
| 6 optional inputs with default values | | | |
Command
set -ex -o pipefail
if [ -z "$TMPDIR" ]; then
export TMPDIR=$(pwd)
fi
# find 90% memory
mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)
# decompress taxonomy DB to CWD
read_utils.py extract_tarball \
"~{ncbi_taxonomy_db_tgz}" . \
--loglevel=DEBUG
if [ -d "taxonomy" ]; then mv taxonomy/* .; fi
touch taxfilterargs
TAXNAMELIST="~{write_lines(select_first([taxonomic_names, []]))}"
if [ -n "$(cat $TAXNAMELIST)" ]; then
echo "--taxNames" >> taxfilterargs
cat $TAXNAMELIST >> taxfilterargs
echo "" >> taxfilterargs # cromwell write_lines lacks a final newline, so add one manually
fi
TAXIDLIST="~{write_lines(select_first([taxonomic_ids, []]))}"
if [ -n "$(cat $TAXIDLIST)" ]; then
echo "--taxIDs" >> taxfilterargs
cat $TAXIDLIST >> taxfilterargs
echo "" >> taxfilterargs # cromwell write_lines lacks a final newline, so add one manually
fi
echo "taxfilterargs:"
cat taxfilterargs
metagenomics.py --version | tee VERSION
samtools view -c "~{classified_bam}" | tee classified_taxonomic_filter_read_count_pre &
cat taxfilterargs | grep . | xargs -d '\n' metagenomics.py filter_bam_to_taxa \
"~{classified_bam}" \
"~{classified_reads_txt_gz}" \
"~{out_basename}.bam" \
nodes.dmp \
names.dmp \
~{true="--exclude" false="" exclude_taxa} \
~{true="--without-children" false="" withoutChildren} \
~{'--minimum_hit_groups=' + minimum_hit_groups} \
--out_count COUNT \
--JVMmemory "$mem_in_mb"m \
--loglevel=DEBUG
samtools view -c "~{out_basename}.bam" | tee classified_taxonomic_filter_read_count_post
if [ "~{run_fastqc}" = "true" ]; then
reports.py fastqc "~{out_basename}.bam" "~{out_basename}.fastqc.html"
fi
wait
Outputs
| Name | Type | Expression |
|---|---|---|
| bam_filtered_to_taxa | File | "~{out_basename}.bam" |
| classified_taxonomic_filter_read_count_pre | Int | read_int("classified_taxonomic_filter_read_count_pre") |
| reads_matching_taxa | Int | read_int("COUNT") |
| classified_taxonomic_filter_read_count_post | Int | read_int("classified_taxonomic_filter_read_count_post") |
| viralngs_version | String | read_string("VERSION") |
| fastqc_html_report | File? | "~{out_basename}.fastqc.html" |
Runtime
| Key | Value |
|---|---|
| docker | docker |
| memory | machine_mem_gb + " GB" |
| disks | "local-disk " + disk_size + " LOCAL" |
| disk | disk_size + " GB" |
| cpu | 8 |
| dx_instance_type | "mem3_ssd1_v2_x4" |
| preemptible | 1 |
| maxRetries | 2 |
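A sketch of the usual pairing with kraken2: the per-read classifications drive the filter, keeping only reads assigned to the named taxa. Bindings are illustrative; note that `taxonomic_names` selects a taxon and, by default, its children (unless the withoutChildren option seen in the command is set).

```wdl
call metagenomics.filter_bam_to_taxa {
  input:
    classified_bam          = sample_bam,                    # the reads that were classified
    classified_reads_txt_gz = kraken2.kraken2_reads_report,  # per-read taxon assignments
    ncbi_taxonomy_db_tgz    = taxdump,
    taxonomic_names         = ["Viruses"]                    # keeps child taxa too by default
}
```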
TASKS kaiju
Inputs
| Name | Type | Description | Default |
|---|---|---|---|
| reads_unmapped_bam | File | - | - |
| kaiju_db_lz4 | File | - | - |
| ncbi_taxonomy_db_tgz | File | - | - |
| krona_taxonomy_db_tgz | File | - | - |
| 2 optional inputs with default values | | | |
Command
set -ex -o pipefail
if [ -z "$TMPDIR" ]; then
export TMPDIR=$(pwd)
fi
DB_DIR=$(mktemp -d --suffix _db)
mkdir -p $DB_DIR/kaiju $DB_DIR/krona $DB_DIR/taxonomy
lz4 -dc "~{kaiju_db_lz4}" > $DB_DIR/kaiju/kaiju.fmi
read_utils.py extract_tarball \
"~{ncbi_taxonomy_db_tgz}" $DB_DIR/taxonomy \
--loglevel=DEBUG
# Support old db tar format
if [ -d "$DB_DIR/taxonomy/taxonomy" ]; then
mv $DB_DIR/taxonomy/taxonomy/* $DB_DIR/taxonomy
fi
read_utils.py extract_tarball \
"~{krona_taxonomy_db_tgz}" $DB_DIR/krona \
--loglevel=DEBUG
metagenomics.py --version | tee VERSION
# classify reads
metagenomics.py kaiju \
"~{reads_unmapped_bam}" \
$DB_DIR/kaiju/kaiju.fmi \
$DB_DIR/taxonomy \
"~{input_basename}.kaiju.summary_report.txt" \
--outReads "~{input_basename}.kaiju.reads.txt.gz" \
--loglevel=DEBUG
# run krona
metagenomics.py krona \
"~{input_basename}.kaiju.summary_report.txt" \
$DB_DIR/krona \
"~{input_basename}.kaiju-krona.html" \
--inputType kaiju \
--noRank --noHits \
--loglevel=DEBUG
Outputs
| Name | Type | Expression |
|---|---|---|
| kaiju_report | File | "~{input_basename}.kaiju-summary_report.txt" |
| kaiju_reads | File | "~{input_basename}.kaiju-reads.txt.gz" |
| krona_report_html | File | "~{input_basename}.kaiju-krona.html" |
| viralngs_version | String | read_string("VERSION") |
Runtime
| Key | Value |
|---|---|
| docker | docker |
| memory | machine_mem_gb + " GB" |
| cpu | 16 |
| disks | "local-disk " + disk_size + " LOCAL" |
| disk | disk_size + " GB" |
| dx_instance_type | "mem3_ssd1_v2_x16" |
| maxRetries | 2 |