tasks_reports
pipes/WDL/tasks/tasks_reports.wdl

TASKS tasks_reports

File Path pipes/WDL/tasks/tasks_reports.wdl
WDL Version 1.0
Type tasks

📋 Tasks in this document: alignment_metrics, plot_coverage, merge_coverage_per_position, coverage_report, assembly_bases, fastqc, align_and_count, align_and_count_summary, aggregate_metagenomics_reports, MultiQC, compare_two_genomes

Tasks

TASKS alignment_metrics

Produce various standard metrics and coverage plots via Picard and Samtools for aligned BAM files.

Inputs

Name Type Description Default
aligned_bam File - -
ref_fasta File - -
primers_bed File? - -
amplicon_set String? - -
min_coverage Int? - -
4 optional inputs with default values

Command

set -e
MEM_MB=$(free -m | head -2 | tail -1 | awk '{print $4}')
XMX=$(echo "-Xmx"$MEM_MB"m")
echo "Requesting $MEM_MB MB of RAM for Java"

# requisite Picard fasta indexing
python3<<CODE
import shutil
import util.file
with util.file.fastas_with_sanitized_ids("~{ref_fasta}", use_tmp=True) as sanitized_fastas:
    shutil.copyfile(sanitized_fastas[0], 'reference.fasta')
CODE
picard $XMX CreateSequenceDictionary -R reference.fasta

if [ -s "~{ref_fasta}" ]; then
  # get Picard metrics and clean up the junky outputs
  picard $XMX CollectRawWgsMetrics \
    -R reference.fasta \
    -I "~{aligned_bam}" \
    -O picard_raw.raw_wgs_metrics.txt
  grep -v \# picard_raw.raw_wgs_metrics.txt | grep . | head -2 > picard_clean.raw_wgs_metrics.txt

  picard $XMX CollectAlignmentSummaryMetrics \
    -R reference.fasta \
    -I "~{aligned_bam}" \
    -O picard_raw.alignment_metrics.txt
  grep -v \# picard_raw.alignment_metrics.txt | grep . | head -4 > picard_clean.alignment_metrics.txt

  picard $XMX CollectInsertSizeMetrics \
    -I "~{aligned_bam}" \
    -O picard_raw.insert_size_metrics.txt \
    -H picard_raw.insert_size_metrics.pdf \
    --INCLUDE_DUPLICATES true
  grep -v \# picard_raw.insert_size_metrics.txt | grep . | head -2 > picard_clean.insert_size_metrics.txt
else
  # ref_fasta is empty -> Picard will fail
  touch picard_clean.raw_wgs_metrics.txt picard_clean.alignment_metrics.txt picard_clean.insert_size_metrics.txt
fi

# prepend the sample name in order to facilitate tsv joining later
SAMPLE=$(samtools view -H "~{aligned_bam}" | grep ^@RG | perl -lape 's/^@RG.*SM:(\S+).*$/$1/' | sort | uniq)
echo -e "sample_sanitized\tbam" > prepend.txt
echo -e "$SAMPLE\t~{out_basename}" >> prepend.txt
paste prepend.txt picard_clean.raw_wgs_metrics.txt > "~{out_basename}".raw_wgs_metrics.txt
echo -e "$SAMPLE\t~{out_basename}" >> prepend.txt
echo -e "$SAMPLE\t~{out_basename}" >> prepend.txt
paste prepend.txt picard_clean.alignment_metrics.txt > "~{out_basename}".alignment_metrics.txt
echo -e "sample_sanitized\tbam" > prepend.txt
echo -e "$SAMPLE\t~{out_basename}" >> prepend.txt
paste prepend.txt picard_clean.insert_size_metrics.txt > "~{out_basename}".insert_size_metrics.txt

touch "~{out_basename}".ampliconstats.txt "~{out_basename}".ampliconstats_parsed.txt
echo -e "sample_sanitized\tbam\tamplicon_set\tamplicon_idx\tamplicon_left\tamplicon_right\tFREADS\tFDEPTH\tFPCOV\tFAMP" > "~{out_basename}.ampliconstats_parsed.txt"
if [ -n "~{primers_bed}" ]; then
  # samtools ampliconstats
  cat "~{primers_bed}" | sort -k 1,1 -k 4,4 -t $'\t' > primers-sorted_for_samtools.bed
  set +e # there are just some weird bed files out there -- let them fail silently
  samtools ampliconstats -s -@ $(nproc) \
    ~{'-d ' + min_coverage} \
    ~{'-l ' + max_amp_len} \
    ~{'-a ' + max_amplicons} \
    -o "~{out_basename}".ampliconstats.txt primers-sorted_for_samtools.bed "~{aligned_bam}"

  # parse into our own tsv to facilitate tsv joining later
  if [ -n "~{default="" amplicon_set}" ]; then
    AMPLICON_SET="~{default="" amplicon_set}"
  else
    AMPLICON_SET=$(basename "~{primers_bed}" .bed)
  fi
  grep ^AMPLICON "~{out_basename}".ampliconstats.txt | cut -f 2- > AMPLICON
  grep ^FREADS "~{out_basename}".ampliconstats.txt | cut -f 3- | tr '\t' '\n' > FREADS; echo "" >> FREADS
  grep ^FDEPTH "~{out_basename}".ampliconstats.txt | cut -f 3- | tr '\t' '\n' > FDEPTH; echo "" >> FDEPTH
  grep ^FPCOV  "~{out_basename}".ampliconstats.txt | cut -f 3- | tr '\t' '\n' > FPCOV;  echo "" >> FPCOV
  grep ^FAMP   "~{out_basename}".ampliconstats.txt | cut -f 4 | tail +2 > FAMP
  for i in $(cut -f 1 AMPLICON); do echo -e "$SAMPLE\t~{out_basename}\t$AMPLICON_SET"; done > prepend.txt
  wc -l prepend.txt AMPLICON FREADS FDEPTH FPCOV FAMP
  paste prepend.txt AMPLICON FREADS FDEPTH FPCOV FAMP | grep '\S' >> "~{out_basename}.ampliconstats_parsed.txt"
fi
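
The prepended sample_sanitized and bam columns exist purely so the per-sample metrics tables can be joined or stacked later. As a minimal sketch (not part of the task; pandas and the filenames are assumptions), downstream concatenation of several samples' WGS metrics could look like:

import pandas as pd

# Hypothetical per-sample outputs of this task; each file is a header row plus
# one data row, with sample_sanitized and bam prepended as the first two columns.
metrics_files = ["sampleA.raw_wgs_metrics.txt", "sampleB.raw_wgs_metrics.txt"]

# Because every file shares the same header, a simple concat yields one tidy
# table keyed by sample.
merged = pd.concat(
    (pd.read_csv(f, sep="\t") for f in metrics_files),
    ignore_index=True,
)
merged.to_csv("all_samples.raw_wgs_metrics.tsv", sep="\t", index=False)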

Outputs

Name Type Expression
wgs_metrics File "~{out_basename}.raw_wgs_metrics.txt"
alignment_metrics File "~{out_basename}.alignment_metrics.txt"
insert_size_metrics File "~{out_basename}.insert_size_metrics.txt"
amplicon_stats File "~{out_basename}.ampliconstats.txt"
amplicon_stats_parsed File "~{out_basename}.ampliconstats_parsed.txt"

Runtime

Key Value
docker docker
memory machine_mem_gb + " GB"
cpu 4
disks "local-disk " + disk_size + " HDD"
disk disk_size + " GB"
dx_instance_type "mem3_ssd1_v2_x4"
maxRetries 2

TASKS plot_coverage

Inputs

Name Type Description Default
aligned_reads_bam File - -
sample_name String - -
max_coverage_depth Int? - -
base_q_threshold Int? - -
mapping_q_threshold Int? - -
read_length_threshold Int? - -
plotXLimits String? - -
plotYLimits String? - -
8 optional inputs with default values

Command

set -ex -o pipefail

read_utils.py --version | tee VERSION

samtools view -c ~{aligned_reads_bam} | tee reads_aligned
if [ "$(cat reads_aligned)" != "0" ]; then
  samtools index -@ "$(nproc)" "~{aligned_reads_bam}"

  PLOT_DUPE_OPTION=""
  if [[ "~{skip_mark_dupes}" != "true" ]]; then
    PLOT_DUPE_OPTION="~{true="--plotOnlyNonDuplicates" false="" plot_only_non_duplicates}"
  fi
  
  BINNING_OPTION="~{true="--binLargePlots" false="" bin_large_plots}"

  # plot coverage
  reports.py plot_coverage \
    "~{aligned_reads_bam}" \
    "~{sample_name}.coverage_plot.pdf" \
    --outSummary "~{sample_name}.coverage_plot.txt" \
    --plotFormat pdf \
    --plotWidth 1100 \
    --plotHeight 850 \
    --plotDPI 100 \
    ~{"-m " + max_coverage_depth} \
    ~{"-q " + base_q_threshold} \
    ~{"-Q " + mapping_q_threshold} \
    ~{"-l " + read_length_threshold} \
    ~{"--plotXLimits " + plotXLimits} \
    ~{"--plotYLimits " + plotYLimits} \
    $PLOT_DUPE_OPTION \
    $BINNING_OPTION \
    --binningSummaryStatistic ~{binning_summary_statistic} \
    --plotTitle "~{sample_name} coverage plot" \
    --loglevel=DEBUG

else
  touch ~{sample_name}.coverage_plot.pdf ~{sample_name}.coverage_plot.txt
fi

# collect figures of merit
set +o pipefail # grep will exit 1 if it fails to find the pattern
samtools view -H ~{aligned_reads_bam} | perl -n -e'/^@SQ.*LN:(\d+)/ && print "$1\n"' |  python -c "import sys; print(sum(int(x) for x in sys.stdin))" | tee assembly_length
# report only primary alignments 260=exclude unaligned reads and secondary mappings
samtools view -h -F 260 ~{aligned_reads_bam} | samtools flagstat - | tee ~{sample_name}.flagstat.txt
grep properly ~{sample_name}.flagstat.txt | cut -f 1 -d ' ' | tee read_pairs_aligned
samtools view ~{aligned_reads_bam} | cut -f10 | tr -d '\n' | wc -c | tee bases_aligned
python -c "print (float("$(cat bases_aligned)")/"$(cat assembly_length)") if "$(cat assembly_length)">0 else print(0)" > mean_coverage

Outputs

Name Type Expression
coverage_plot File "~{sample_name}.coverage_plot.pdf"
coverage_tsv File "~{sample_name}.coverage_plot.txt"
assembly_length Int read_int("assembly_length")
reads_aligned Int read_int("reads_aligned")
read_pairs_aligned Int read_int("read_pairs_aligned")
bases_aligned Float read_float("bases_aligned")
mean_coverage Float read_float("mean_coverage")
viralngs_version String read_string("VERSION")

Runtime

Key Value
docker "~{docker}"
memory "7 GB"
cpu 2
disks "local-disk " + disk_size + " LOCAL"
disk disk_size + " GB"
dx_instance_type "mem1_ssd1_v2_x4"
preemptible 1
maxRetries 2

TASKS merge_coverage_per_position

Inputs

Name Type Description Default
coverage_tsvs Array[File]+ - -
ref_fasta File - -
3 optional inputs with default values

Command

set -e

python3<<CODE
import os
import pandas as pd
from functools import reduce
import Bio.SeqIO

# get genome length
genome_length = 0
with open('~{ref_fasta}', 'rt') as inf:
  for seq in Bio.SeqIO.parse(inf, 'fasta'):
    genome_length += len(seq.seq.replace("-",""))

# Loop through a list of file paths and read in each depth.tsv generated as part of assemble_refbased
depths_dfs = []
for in_tsv in ("~{sep="", "" coverage_tsvs}"):
    sample_name = '.'.join(os.path.basename(in_tsv).split('.')[:-2])
    sample_depths_df = pd.read_csv(in_tsv, sep='\t', header=None
        ).rename(columns={0:'Ref',1:'Position',2:sample_name})
    depths_dfs.append(sample_depths_df)

# Condense all depths into a single dataframe
df_merged = reduce(lambda left,right:
    pd.merge(left,right,on=['Ref','Position'],how='outer'),
    depths_dfs)
df_merged = df_merged.fillna(0)

#Create dummy df that contains all positions along the genome
dummy_df = pd.DataFrame([range(1,genome_length)]).T.rename(columns={0:'Position'})
df_merged = df_merged.merge(dummy_df, on='Position', how='right').fillna(0)
df_merged = df_merged.drop(['Ref'], axis=1)
df_merged.to_csv("~{out_report_name}", index=False)
CODE
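
The core of the merge is the reduce over pd.merge(..., how='outer') followed by fillna(0), which zero-fills positions a sample never covered. A minimal sketch with toy data (the sample names and depths are invented):

import pandas as pd
from functools import reduce

# Toy per-sample depth tables (Ref, Position, depth), as produced upstream.
a = pd.DataFrame({"Ref": ["chr1"]*3, "Position": [1, 2, 3], "sampleA": [10, 12, 0]})
b = pd.DataFrame({"Ref": ["chr1"]*2, "Position": [2, 3],    "sampleB": [7, 9]})

merged = reduce(
    lambda left, right: pd.merge(left, right, on=["Ref", "Position"], how="outer"),
    [a, b],
).fillna(0)
# Position 1 gets sampleB == 0.0 thanks to the outer join plus fillna(0).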

Outputs

Name Type Expression
coverage_multi_sample_per_position_csv File out_report_name

Runtime

Key Value
docker "~{docker}"
memory "2 GB"
cpu 2
disks "local-disk " + disk_size + " LOCAL"
disk disk_size + " GB"
dx_instance_type "mem1_ssd2_v2_x4"
maxRetries 2

TASKS coverage_report

Inputs

Name Type Description Default
mapped_bams Array[File]+ - -
3 optional inputs with default values

Command

set -e
reports.py --version | tee VERSION
python3 << CODE
import tools.samtools
import reports
samtools = tools.samtools.SamtoolsTool()
in_bams = list([bam for bam in ["~{sep="", "" mapped_bams}"] if bam and not samtools.isEmpty(bam)])
if in_bams:
  reports.coverage_only(in_bams, "~{out_report_name}")
else:
  with open("~{out_report_name}", "w") as outf:
    outf.write('\t'.join(('sample', 'aln2self_cov_median', 'aln2self_cov_mean', 'aln2self_cov_mean_non0', 'aln2self_cov_1X', 'aln2self_cov_5X', 'aln2self_cov_20X', 'aln2self_cov_100X'))+'\n')
CODE

Outputs

Name Type Expression
coverage_report File out_report_name
viralngs_version String read_string("VERSION")

Runtime

Key Value
docker docker
memory "2 GB"
cpu 2
disks "local-disk " + disk_size + " LOCAL"
disk disk_size + " GB"
dx_instance_type "mem1_ssd2_v2_x4"
maxRetries 2

TASKS assembly_bases

Count bases in a fasta file.

Inputs

Name Type Description Default
fasta File - -
1 optional input with default value

Command

set -e
grep -v '^>' "~{fasta}" | tr -d '\n' | wc -c | tee assembly_length
grep -v '^>' "~{fasta}" | tr -d '\nNn' | wc -c | tee assembly_length_unambiguous

Outputs

Name Type Expression
assembly_length Int read_int("assembly_length")
assembly_length_unambiguous Int read_int("assembly_length_unambiguous")

Runtime

Key Value
docker "~{docker}"
memory "1 GB"
cpu 1
disks "local-disk " + disk_size + " HDD"
disk disk_size + " GB"
dx_instance_type "mem1_ssd1_v2_x2"
maxRetries 2

TASKS fastqc

Inputs

Name Type Description Default
reads_bam File Input reads in BAM format. -
1 optional input with default value

Command

set -ex -o pipefail
reports.py --version | tee VERSION
reports.py fastqc ~{reads_bam} ~{reads_basename}_fastqc.html --out_zip ~{reads_basename}_fastqc.zip

Outputs

Name Type Expression
fastqc_html File "~{reads_basename}_fastqc.html"
fastqc_zip File "~{reads_basename}_fastqc.zip"
viralngs_version String read_string("VERSION")

Runtime

Key Value
memory "2 GB"
cpu 1
docker "~{docker}"
disks "local-disk " + disk_size + " LOCAL"
disk disk_size + " GB"
dx_instance_type "mem1_ssd1_v2_x2"
maxRetries 2

TASKS align_and_count

Inputs

Name Type Description Default
reads_bam File Unaligned reads in BAM format -
ref_db File Reference genome in FASTA format -
machine_mem_gb Int? - -
6 optional inputs with default values

Command

set -ex -o pipefail

read_utils.py --version | tee VERSION

ln -s "~{reads_bam}" "~{reads_basename}.bam"
read_utils.py minimap2_idxstats \
  "~{reads_basename}.bam" \
  "~{ref_db}" \
  --outStats "~{reads_basename}.count.~{ref_basename}.txt.unsorted" \
  ~{true="--filterReadsAfterAlignment" false="" filter_bam_to_proper_primary_mapped_reads} \
  ~{true="--doNotRequirePairsToBeProper" false="" do_not_require_proper_mapped_pairs_when_filtering} \
  ~{true="--keepSingletons" false="" keep_singletons_when_filtering} \
  ~{true="--keepDuplicates" false="" keep_duplicates_when_filtering} \
  --loglevel=DEBUG

sort -b -r -n -k3 "~{reads_basename}.count.~{ref_basename}.txt.unsorted" > "~{reads_basename}.count.~{ref_basename}.txt"
head -n ~{topNHits} "~{reads_basename}.count.~{ref_basename}.txt" > "~{reads_basename}.count.~{ref_basename}.top_~{topNHits}_hits.txt"
TOP_HIT="$(head -1 '~{reads_basename}.count.~{ref_basename}.txt' | cut -f 1 | sed 's/\*/\\*/' | tee '~{reads_basename}.count.~{ref_basename}.top.txt')"

TOTAL_COUNT_OF_TOP_HIT=$(grep -E "^($TOP_HIT)" "~{reads_basename}.count.~{ref_basename}.txt" | cut -f3 | tee TOTAL_COUNT_OF_TOP_HIT)
TOTAL_COUNT_OF_LESSER_HITS=$((grep -vE "^(\*|$TOP_HIT)" "~{reads_basename}.count.~{ref_basename}.txt" || echo "0" ) | cut -f3 | paste -sd+ - | bc -l | tee TOTAL_COUNT_OF_LESSER_HITS)
echo $TOTAL_COUNT_OF_TOP_HIT | tee TOTAL_COUNT_OF_TOP_HIT
echo $TOTAL_COUNT_OF_LESSER_HITS | tee TOTAL_COUNT_OF_LESSER_HITS

if [ $TOTAL_COUNT_OF_LESSER_HITS -ne 0 -o $TOTAL_COUNT_OF_TOP_HIT -ne 0 ]; then
  PCT_MAPPING_TO_LESSER_HITS=$( echo "scale=3; 100 * $TOTAL_COUNT_OF_LESSER_HITS / ($TOTAL_COUNT_OF_LESSER_HITS + $TOTAL_COUNT_OF_TOP_HIT)" | \
    bc -l | awk '{printf "%.3f\n", $0}' | tee '~{reads_basename}.count.~{ref_basename}.pct_lesser_hits_of_mapped.txt' )
else
  echo "PCT_MAPPING_TO_LESSER_HITS cannot be calculated: there were no hits to any sequence"
  PCT_MAPPING_TO_LESSER_HITS=$( echo "null" | tee '~{reads_basename}.count.~{ref_basename}.pct_lesser_hits_of_mapped.txt')
fi

TOTAL_READS_IN_INPUT=$(samtools view -c "~{reads_basename}.bam")
echo $TOTAL_READS_IN_INPUT | tee TOTAL_READS_IN_INPUT
if [ $TOTAL_READS_IN_INPUT -eq 0 ]; then
  echo "no reads in input bam"
  PCT_OF_INPUT_READS_MAPPED=$(echo "0" | tee "~{reads_basename}.count.~{ref_basename}.pct_total_reads_mapped.txt")
  echo "PCT_TOP_HIT_OF_TOTAL_READS cannot be calculated: there were no mapping hits, or no reads"
  PCT_TOP_HIT_OF_TOTAL_READS=$( echo "null" | tee '~{reads_basename}.count.~{ref_basename}.pct_top_hit_of_total_reads.txt')
else
  PCT_OF_INPUT_READS_MAPPED=$( echo "scale=3; 100 * ($TOTAL_COUNT_OF_LESSER_HITS + $TOTAL_COUNT_OF_TOP_HIT) / $TOTAL_READS_IN_INPUT" | \
  bc -l | awk '{printf "%.3f\n", $0}' | tee '~{reads_basename}.count.~{ref_basename}.pct_total_reads_mapped.txt' )

  if [ $TOTAL_COUNT_OF_TOP_HIT -ne 0 ]; then
    PCT_TOP_HIT_OF_TOTAL_READS=$( echo "scale=3; 100 * ($TOTAL_COUNT_OF_TOP_HIT / $TOTAL_READS_IN_INPUT)" | \
      bc -l | awk '{printf "%.3f\n", $0}' | tee '~{reads_basename}.count.~{ref_basename}.pct_top_hit_of_total_reads.txt' )
  else
    echo "PCT_TOP_HIT_OF_TOTAL_READS cannot be calculated: there were no mapping hits, or no reads"
    PCT_TOP_HIT_OF_TOTAL_READS=$( echo "null" | tee '~{reads_basename}.count.~{ref_basename}.pct_top_hit_of_total_reads.txt')
  fi
fi
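
The percentage outputs are straightforward ratios of the counts computed above, formatted to three decimals by bc and awk. A minimal sketch of the same arithmetic with invented counts:

top_hit_reads    = 9_000    # TOTAL_COUNT_OF_TOP_HIT
lesser_hit_reads = 1_000    # TOTAL_COUNT_OF_LESSER_HITS
total_reads      = 20_000   # TOTAL_READS_IN_INPUT (samtools view -c)

mapped = top_hit_reads + lesser_hit_reads
pct_lesser_hits_of_mapped  = 100 * lesser_hit_reads / mapped    # 10.000
pct_total_reads_mapped     = 100 * mapped / total_reads         # 50.000
pct_top_hit_of_total_reads = 100 * top_hit_reads / total_reads  # 45.000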

Outputs

Name Type Expression
report File "~{reads_basename}.count.~{ref_basename}.txt"
report_top_hits File "~{reads_basename}.count.~{ref_basename}.top_~{topNHits}_hits.txt"
top_hit_id String read_string("~{reads_basename}.count.~{ref_basename}.top.txt")
reads_total Int read_int("TOTAL_READS_IN_INPUT")
reads_mapped_top_hit Int read_int("TOTAL_COUNT_OF_TOP_HIT")
reads_mapped Int read_int("TOTAL_COUNT_OF_LESSER_HITS") + read_int("TOTAL_COUNT_OF_TOP_HIT")
pct_total_reads_mapped String read_string('~{reads_basename}.count.~{ref_basename}.pct_total_reads_mapped.txt')
pct_top_hit_of_total_reads String read_string('~{reads_basename}.count.~{ref_basename}.pct_top_hit_of_total_reads.txt')
pct_lesser_hits_of_mapped String read_string('~{reads_basename}.count.~{ref_basename}.pct_lesser_hits_of_mapped.txt')
viralngs_version String read_string("VERSION")

Runtime

Key Value
memory select_first([machine_mem_gb, 15]) + " GB"
cpu 4
docker "~{docker}"
disks "local-disk " + disk_size + " LOCAL"
disk disk_size + " GB"
dx_instance_type "mem1_ssd1_v2_x4"
maxRetries 2

TASKS align_and_count_summary

Inputs

Name Type Description Default
counts_txt Array[File]+ - -
2 optional inputs with default values

Command

set -ex -o pipefail

reports.py --version | tee VERSION
reports.py aggregate_alignment_counts ~{sep=" " counts_txt} "~{output_prefix}".tsv --loglevel=DEBUG

Outputs

Name Type Expression
count_summary File "~{output_prefix}.tsv"
viralngs_version String read_string("VERSION")

Runtime

Key Value
memory "7 GB"
cpu 8
docker "~{docker}"
disks "local-disk " + disk_size + " HDD"
disk disk_size + " GB"
dx_instance_type "mem1_ssd1_v2_x2"
maxRetries 2

TASKS aggregate_metagenomics_reports

Inputs

Name Type Description Default
kraken_summary_reports Array[File]+ - -
4 optional inputs with default values

Command

set -ex -o pipefail

metagenomics.py --version | tee VERSION
metagenomics.py taxlevel_summary \
  ~{sep=" " kraken_summary_reports} \
  --csvOut aggregate_taxa_summary_~{aggregate_taxon_heading}_by_~{aggregate_taxlevel_focus}_top_~{aggregate_top_N_hits}_by_sample.csv \
  --noHist \
  --taxHeading ~{aggregate_taxon_heading_space_separated} \
  --taxlevelFocus ~{aggregate_taxlevel_focus} \
  --zeroFill --includeRoot --topN ~{aggregate_top_N_hits} \
  --loglevel=DEBUG

Outputs

Name Type Expression
krakenuniq_aggregate_taxlevel_summary File "aggregate_taxa_summary_~{aggregate_taxon_heading}_by_~{aggregate_taxlevel_focus}_top_~{aggregate_top_N_hits}_by_sample.csv"
viralngs_version String read_string("VERSION")

Runtime

Key Value
docker "~{docker}"
memory "3 GB"
cpu 1
disks "local-disk " + disk_size + " HDD"
disk disk_size + " GB"
dx_instance_type "mem1_ssd2_v2_x2"
preemptible 0
maxRetries 2

TASKS MultiQC

Inputs

Name Type Description Default
input_files Array[File] - -
title String? - -
comment String? - -
file_name String? - -
template String? - -
tag String? - -
ignore_analysis_files String? - -
ignore_sample_names String? - -
sample_names File? - -
exclude_modules Array[String]? - -
module_to_use Array[String]? - -
output_data_format String? [tsv|yaml|json] default:tsv -
config File? - -
config_yaml String? - -
13 optional inputs with default values

Command

      set -ex -o pipefail

      echo "~{sep="
" input_files}" > input-filenames.txt
      echo "" >> input-filenames.txt

      # Run MultiQC but allow it to fail (it crashes with exit 1 on empty/invalid zip files)
      set +e
      multiqc \
      --file-list input-filenames.txt \
      --outdir "~{out_dir}" \
      ~{true="--force" false="" force} \
      ~{true="--fullnames" false="" full_names} \
      ~{"--title " + title} \
      ~{"--comment " + comment} \
      ~{"--filename " + file_name} \
      ~{"--template " + template} \
      ~{"--tag " + tag} \
      ~{"--ignore " + ignore_analysis_files} \
      ~{"--ignore-samples" + ignore_sample_names} \
      ~{"--sample-names " + sample_names} \
      ~{true="--exclude " false="" defined(exclude_modules)}~{sep=" --exclude " select_first([exclude_modules, []])} \
      ~{true="--module " false="" defined(module_to_use)}~{sep=" --module " select_first([module_to_use, []])} \
      ~{true="--data-dir" false="" data_dir} \
      ~{true="--no-data-dir" false="" no_data_dir} \
      ~{"--data-format " + output_data_format} \
      ~{true="--zip-data-dir" false="" zip_data_dir} \
      ~{true="--export" false="" export} \
      ~{true="--flat" false="" flat} \
      ~{true="--interactive" false="" interactive} \
      ~{true="--lint" false="" lint} \
      ~{true="--pdf" false="" pdf} \
      ~{false="--no-megaqc-upload" true="" megaQC_upload} \
      ~{"--config " + config} \
      ~{"--cl-config " + config_yaml}
      MULTIQC_EXIT_CODE=$?
      set -e

      # Ensure output directory exists (MultiQC may remove it if no results found or if it crashed)
      mkdir -p "~{out_dir}"

      if [ -z "~{file_name}" ]; then
        mv "~{out_dir}/~{report_filename}_report.html" "~{out_dir}/~{report_filename}.html" 2>/dev/null || true
      fi

      # Create placeholder HTML report if MultiQC didn't create one (happens when no valid results found or on crash)
      if [ ! -f "~{out_dir}/~{report_filename}.html" ]; then
        echo "<!DOCTYPE html><html><head><meta charset=\"UTF-8\"><title>MultiQC Report</title></head><body><h1>MultiQC Report</h1><p>No analysis results found in input files.</p></body></html>" > "~{out_dir}/~{report_filename}.html"
      fi

      # Ensure data directory exists before tarring (MultiQC only creates it when results are found)
      mkdir -p "~{out_dir}/~{report_filename}_data"
      tar -c "~{out_dir}/~{report_filename}_data" | gzip -c > "~{report_filename}_data.tar.gz"

Outputs

Name Type Expression
multiqc_report File "~{out_dir}/~{report_filename}.html"
multiqc_data_dir_tarball File "~{report_filename}_data.tar.gz"

Runtime

Key Value
memory "8 GB"
cpu 16
docker "~{docker}"
disks "local-disk " + disk_size + " LOCAL"
disk disk_size + " GB"
dx_instance_type "mem2_ssd1_v2_x2"
maxRetries 2

TASKS compare_two_genomes

Inputs

Name Type Description Default
genome_one File - -
genome_two File - -
out_basename String - -
1 optional input with default value

Command

set -ex -o pipefail
assembly.py --version | tee VERSION
assembly.py alignment_summary "~{genome_one}" "~{genome_two}" --outfileName "~{out_basename}.txt" --printCounts --loglevel=DEBUG
cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC
cat /proc/loadavg > CPU_LOAD
set +o pipefail
{ if [ -f /sys/fs/cgroup/memory.peak ]; then cat /sys/fs/cgroup/memory.peak; elif [ -f /sys/fs/cgroup/memory/memory.peak ]; then cat /sys/fs/cgroup/memory/memory.peak; elif [ -f /sys/fs/cgroup/memory/memory.max_usage_in_bytes ]; then cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes; else echo "0"; fi } > MEM_BYTES
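
MEM_BYTES captures the cgroup peak memory (or 0 if none of the cgroup files exist), and the outputs convert it to whole gigabytes with ceil. As a worked example (the byte count is invented):

import math

mem_bytes = 3_221_225_472                          # e.g. a 3 GiB cgroup peak
max_ram_gb = math.ceil(mem_bytes / 1_000_000_000)  # -> 4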

Outputs

Name Type Expression
comparison_table File "~{out_basename}.txt"
max_ram_gb Int ceil((read_float("MEM_BYTES") / 1000000000))
runtime_sec Int ceil(read_float("UPTIME_SEC"))
cpu_load String read_string("CPU_LOAD")
viralngs_version String read_string("VERSION")

Runtime

Key Value
memory "3 GB"
cpu 2
docker docker
disks "local-disk " + disk_size + " HDD"
disk disk_size + " GB"
dx_instance_type "mem1_ssd1_v2_x2"
preemptible 1
maxRetries 2

tasks_reports - WDL Source Code

version 1.0

task alignment_metrics {
  meta {
      description: "Produce various standard metrics and coverage plots via Picard and Samtools for aligned BAM files."
  }

  input {
    File   aligned_bam
    File   ref_fasta
    File?  primers_bed
    String? amplicon_set
    Int?   min_coverage
    Int    max_amp_len=5000
    Int    max_amplicons=500

    Int    machine_mem_gb=32
    String docker = "quay.io/broadinstitute/viral-core:2.5.1"
  }

  String out_basename = basename(aligned_bam, ".bam")
  Int disk_size = 150

  command <<<
    set -e
    MEM_MB=$(free -m | head -2 | tail -1 | awk '{print $4}')
    XMX=$(echo "-Xmx"$MEM_MB"m")
    echo "Requesting $MEM_MB MB of RAM for Java"

    # requisite Picard fasta indexing
    python3<<CODE
    import shutil
    import util.file
    with util.file.fastas_with_sanitized_ids("~{ref_fasta}", use_tmp=True) as sanitized_fastas:
        shutil.copyfile(sanitized_fastas[0], 'reference.fasta')
    CODE
    picard $XMX CreateSequenceDictionary -R reference.fasta

    if [ -s "~{ref_fasta}" ]; then
      # get Picard metrics and clean up the junky outputs
      picard $XMX CollectRawWgsMetrics \
        -R reference.fasta \
        -I "~{aligned_bam}" \
        -O picard_raw.raw_wgs_metrics.txt
      grep -v \# picard_raw.raw_wgs_metrics.txt | grep . | head -2 > picard_clean.raw_wgs_metrics.txt

      picard $XMX CollectAlignmentSummaryMetrics \
        -R reference.fasta \
        -I "~{aligned_bam}" \
        -O picard_raw.alignment_metrics.txt
      grep -v \# picard_raw.alignment_metrics.txt | grep . | head -4 > picard_clean.alignment_metrics.txt

      picard $XMX CollectInsertSizeMetrics \
        -I "~{aligned_bam}" \
        -O picard_raw.insert_size_metrics.txt \
        -H picard_raw.insert_size_metrics.pdf \
        --INCLUDE_DUPLICATES true
      grep -v \# picard_raw.insert_size_metrics.txt | grep . | head -2 > picard_clean.insert_size_metrics.txt
    else
      # ref_fasta is empty -> Picard will fail
      touch picard_clean.raw_wgs_metrics.txt picard_clean.alignment_metrics.txt picard_clean.insert_size_metrics.txt
    fi

    # prepend the sample name in order to facilitate tsv joining later
    SAMPLE=$(samtools view -H "~{aligned_bam}" | grep ^@RG | perl -lape 's/^@RG.*SM:(\S+).*$/$1/' | sort | uniq)
    echo -e "sample_sanitized\tbam" > prepend.txt
    echo -e "$SAMPLE\t~{out_basename}" >> prepend.txt
    paste prepend.txt picard_clean.raw_wgs_metrics.txt > "~{out_basename}".raw_wgs_metrics.txt
    echo -e "$SAMPLE\t~{out_basename}" >> prepend.txt
    echo -e "$SAMPLE\t~{out_basename}" >> prepend.txt
    paste prepend.txt picard_clean.alignment_metrics.txt > "~{out_basename}".alignment_metrics.txt
    echo -e "sample_sanitized\tbam" > prepend.txt
    echo -e "$SAMPLE\t~{out_basename}" >> prepend.txt
    paste prepend.txt picard_clean.insert_size_metrics.txt > "~{out_basename}".insert_size_metrics.txt

    touch "~{out_basename}".ampliconstats.txt "~{out_basename}".ampliconstats_parsed.txt
    echo -e "sample_sanitized\tbam\tamplicon_set\tamplicon_idx\tamplicon_left\tamplicon_right\tFREADS\tFDEPTH\tFPCOV\tFAMP" > "~{out_basename}.ampliconstats_parsed.txt"
    if [ -n "~{primers_bed}" ]; then
      # samtools ampliconstats
      cat "~{primers_bed}" | sort -k 1,1 -k 4,4 -t $'\t' > primers-sorted_for_samtools.bed
      set +e # there are just some weird bed files out there -- let them fail silently
      samtools ampliconstats -s -@ $(nproc) \
        ~{'-d ' + min_coverage} \
        ~{'-l ' + max_amp_len} \
        ~{'-a ' + max_amplicons} \
        -o "~{out_basename}".ampliconstats.txt primers-sorted_for_samtools.bed "~{aligned_bam}"

      # parse into our own tsv to facilitate tsv joining later
      if [ -n "~{default='' amplicon_set}" ]; then
        AMPLICON_SET="~{default='' amplicon_set}"
      else
        AMPLICON_SET=$(basename "~{primers_bed}" .bed)
      fi
      grep ^AMPLICON "~{out_basename}".ampliconstats.txt | cut -f 2- > AMPLICON
      grep ^FREADS "~{out_basename}".ampliconstats.txt | cut -f 3- | tr '\t' '\n' > FREADS; echo "" >> FREADS
      grep ^FDEPTH "~{out_basename}".ampliconstats.txt | cut -f 3- | tr '\t' '\n' > FDEPTH; echo "" >> FDEPTH
      grep ^FPCOV  "~{out_basename}".ampliconstats.txt | cut -f 3- | tr '\t' '\n' > FPCOV;  echo "" >> FPCOV
      grep ^FAMP   "~{out_basename}".ampliconstats.txt | cut -f 4 | tail +2 > FAMP
      for i in $(cut -f 1 AMPLICON); do echo -e "$SAMPLE\t~{out_basename}\t$AMPLICON_SET"; done > prepend.txt
      wc -l prepend.txt AMPLICON FREADS FDEPTH FPCOV FAMP
      paste prepend.txt AMPLICON FREADS FDEPTH FPCOV FAMP | grep '\S' >> "~{out_basename}.ampliconstats_parsed.txt"
    fi
  >>>

  output {
    File wgs_metrics         = "~{out_basename}.raw_wgs_metrics.txt"
    File alignment_metrics   = "~{out_basename}.alignment_metrics.txt"
    File insert_size_metrics = "~{out_basename}.insert_size_metrics.txt"
    File amplicon_stats      = "~{out_basename}.ampliconstats.txt"
    File amplicon_stats_parsed = "~{out_basename}.ampliconstats_parsed.txt"
  }

  runtime {
    docker: docker
    memory: machine_mem_gb + " GB"
    cpu: 4
    disks:  "local-disk " + disk_size + " HDD"
    disk: disk_size + " GB" # TES
    dx_instance_type: "mem3_ssd1_v2_x4"
    maxRetries: 2
  }
}

task plot_coverage {
  input {
    File    aligned_reads_bam
    String  sample_name

    Boolean skip_mark_dupes           = false
    Boolean plot_only_non_duplicates  = false
    Boolean bin_large_plots           = false
    String? binning_summary_statistic = "max" # max or min

    Int? plot_width_pixels            = 1100
    Int? plot_height_pixels           = 850
    Int? plot_pixels_per_inch         = 100

    Int? max_coverage_depth
    Int? base_q_threshold
    Int? mapping_q_threshold
    Int? read_length_threshold
    String? plotXLimits # of the form "min max" (ints, space between)
    String? plotYLimits # of the form "min max" (ints, space between)

    String  docker = "quay.io/broadinstitute/viral-core:2.5.1"
  }

  Int disk_size = 375
  
  command {
    set -ex -o pipefail

    read_utils.py --version | tee VERSION

    samtools view -c ${aligned_reads_bam} | tee reads_aligned
    if [ "$(cat reads_aligned)" != "0" ]; then
      samtools index -@ "$(nproc)" "${aligned_reads_bam}"

      PLOT_DUPE_OPTION=""
      if [[ "${skip_mark_dupes}" != "true" ]]; then
        PLOT_DUPE_OPTION="${true='--plotOnlyNonDuplicates' false="" plot_only_non_duplicates}"
      fi
      
      BINNING_OPTION="${true='--binLargePlots' false="" bin_large_plots}"

      # plot coverage
      reports.py plot_coverage \
        "${aligned_reads_bam}" \
        "${sample_name}.coverage_plot.pdf" \
        --outSummary "${sample_name}.coverage_plot.txt" \
        --plotFormat pdf \
        --plotWidth 1100 \
        --plotHeight 850 \
        --plotDPI 100 \
        ${"-m " + max_coverage_depth} \
        ${"-q " + base_q_threshold} \
        ${"-Q " + mapping_q_threshold} \
        ${"-l " + read_length_threshold} \
        ${"--plotXLimits " + plotXLimits} \
        ${"--plotYLimits " + plotYLimits} \
        $PLOT_DUPE_OPTION \
        $BINNING_OPTION \
        --binningSummaryStatistic ${binning_summary_statistic} \
        --plotTitle "${sample_name} coverage plot" \
        --loglevel=DEBUG

    else
      touch ${sample_name}.coverage_plot.pdf ${sample_name}.coverage_plot.txt
    fi

    # collect figures of merit
    set +o pipefail # grep will exit 1 if it fails to find the pattern
    samtools view -H ${aligned_reads_bam} | perl -n -e'/^@SQ.*LN:(\d+)/ && print "$1\n"' |  python -c "import sys; print(sum(int(x) for x in sys.stdin))" | tee assembly_length
    # report only primary alignments 260=exclude unaligned reads and secondary mappings
    samtools view -h -F 260 ${aligned_reads_bam} | samtools flagstat - | tee ${sample_name}.flagstat.txt
    grep properly ${sample_name}.flagstat.txt | cut -f 1 -d ' ' | tee read_pairs_aligned
    samtools view ${aligned_reads_bam} | cut -f10 | tr -d '\n' | wc -c | tee bases_aligned
    python -c "print (float("$(cat bases_aligned)")/"$(cat assembly_length)") if "$(cat assembly_length)">0 else print(0)" > mean_coverage
  }

  output {
    File   coverage_plot      = "${sample_name}.coverage_plot.pdf"
    File   coverage_tsv       = "${sample_name}.coverage_plot.txt"
    Int    assembly_length    = read_int("assembly_length")
    Int    reads_aligned      = read_int("reads_aligned")
    Int    read_pairs_aligned = read_int("read_pairs_aligned")
    Float  bases_aligned      = read_float("bases_aligned")
    Float  mean_coverage      = read_float("mean_coverage")
    String viralngs_version   = read_string("VERSION")
  }

  runtime {
    docker: "${docker}"
    memory: "7 GB"
    cpu: 2
    disks:  "local-disk " + disk_size + " LOCAL"
    disk: disk_size + " GB" # TES
    dx_instance_type: "mem1_ssd1_v2_x4"
    preemptible: 1
    maxRetries: 2
  }
}

task merge_coverage_per_position {
  input {
    Array[File]+ coverage_tsvs
    File         ref_fasta

    String       out_report_name = "coverage_report.csv"
    Int          disk_size = 100
    String       docker = "quay.io/broadinstitute/py3-bio:0.1.2"
  }

  command <<<
    set -e

    python3<<CODE
    import os
    import pandas as pd
    from functools import reduce
    import Bio.SeqIO

    # get genome length
    genome_length = 0
    with open('~{ref_fasta}', 'rt') as inf:
      for seq in Bio.SeqIO.parse(inf, 'fasta'):
        genome_length += len(seq.seq.replace("-",""))

    # Loop through a list of file paths and read in each depth.tsv generated as part of assemble_refbased
    depths_dfs = []
    for in_tsv in ("~{sep='", "' coverage_tsvs}"):
        sample_name = '.'.join(os.path.basename(in_tsv).split('.')[:-2])
        sample_depths_df = pd.read_csv(in_tsv, sep='\t', header=None
            ).rename(columns={0:'Ref',1:'Position',2:sample_name})
        depths_dfs.append(sample_depths_df)

    # Condense all depths into a single dataframe
    df_merged = reduce(lambda left,right:
        pd.merge(left,right,on=['Ref','Position'],how='outer'),
        depths_dfs)
    df_merged = df_merged.fillna(0)

    #Create dummy df that contains all positions along the genome
    dummy_df = pd.DataFrame([range(1,genome_length)]).T.rename(columns={0:'Position'})
    df_merged = df_merged.merge(dummy_df, on='Position', how='right').fillna(0)
    df_merged = df_merged.drop(['Ref'], axis=1)
    df_merged.to_csv("~{out_report_name}", index=False)
    CODE
  >>>

  output {
    File   coverage_multi_sample_per_position_csv  = out_report_name
  }

  runtime {
    docker: "${docker}"
    memory: "2 GB"
    cpu: 2
    disks:  "local-disk " + disk_size + " LOCAL"
    disk: disk_size + " GB" # TES
    dx_instance_type: "mem1_ssd2_v2_x4"
    maxRetries: 2
  }
}

task coverage_report {
  input {
    Array[File]+ mapped_bams
    Array[File]  mapped_bam_idx = []  # optional.. speeds it up if you provide it, otherwise we auto-index
    String       out_report_name = "coverage_report.txt"

    String       docker = "quay.io/broadinstitute/viral-core:2.5.1"
  }

  Int disk_size = 375

  command <<<
    set -e
    reports.py --version | tee VERSION
    python3 << CODE
    import tools.samtools
    import reports
    samtools = tools.samtools.SamtoolsTool()
    in_bams = list([bam for bam in ["~{sep='", "' mapped_bams}"] if bam and not samtools.isEmpty(bam)])
    if in_bams:
      reports.coverage_only(in_bams, "~{out_report_name}")
    else:
      with open("~{out_report_name}", "w") as outf:
        outf.write('\t'.join(('sample', 'aln2self_cov_median', 'aln2self_cov_mean', 'aln2self_cov_mean_non0', 'aln2self_cov_1X', 'aln2self_cov_5X', 'aln2self_cov_20X', 'aln2self_cov_100X'))+'\n')
    CODE
  >>>

  output {
    File   coverage_report  = out_report_name
    String viralngs_version = read_string("VERSION")
  }

  runtime {
    docker: docker
    memory: "2 GB"
    cpu: 2
    disks:  "local-disk " + disk_size + " LOCAL"
    disk: disk_size + " GB" # TES
    dx_instance_type: "mem1_ssd2_v2_x4"
    maxRetries: 2
  }
}

task assembly_bases {
    meta {
      description: "Count bases in a fasta file."
    }

    input {
      File   fasta
      String docker ="ubuntu"
    }

    Int disk_size = 50

    command {
        set -e
        grep -v '^>' "~{fasta}" | tr -d '\n' | wc -c | tee assembly_length
        grep -v '^>' "~{fasta}" | tr -d '\nNn' | wc -c | tee assembly_length_unambiguous
    }

    output {
        Int assembly_length             = read_int("assembly_length")
        Int assembly_length_unambiguous = read_int("assembly_length_unambiguous")
    }

    runtime {
        docker: "${docker}"
        memory: "1 GB"
        cpu: 1
        disks:  "local-disk " + disk_size + " HDD"
        disk: disk_size + " GB" # TES
        dx_instance_type: "mem1_ssd1_v2_x2"
        maxRetries: 2
    }
}

task fastqc {
  input {
    File   reads_bam

    String docker = "quay.io/broadinstitute/viral-core:2.5.1"
  }
  parameter_meta {
    reads_bam:{ 
    description: "Input reads in BAM format.",
    category: "required"
    }

  }

  String   reads_basename=basename(reads_bam, ".bam")
  Int disk_size = 375

  command {
    set -ex -o pipefail
    reports.py --version | tee VERSION
    reports.py fastqc ${reads_bam} ${reads_basename}_fastqc.html --out_zip ${reads_basename}_fastqc.zip
  }

  output {
    File   fastqc_html      = "${reads_basename}_fastqc.html"
    File   fastqc_zip       = "${reads_basename}_fastqc.zip"
    String viralngs_version = read_string("VERSION")
  }

  runtime {
    memory: "2 GB"
    cpu: 1
    docker: "${docker}"
    disks:  "local-disk " + disk_size + " LOCAL"
    disk: disk_size + " GB" # TES
    dx_instance_type: "mem1_ssd1_v2_x2"
    maxRetries: 2
  }
}

task align_and_count {
  input {
    File   reads_bam
    File   ref_db
    Int    topNHits = 3

    Boolean filter_bam_to_proper_primary_mapped_reads         = true
    Boolean do_not_require_proper_mapped_pairs_when_filtering = false
    Boolean keep_singletons_when_filtering                    = false
    Boolean keep_duplicates_when_filtering                    = false

    Int?   machine_mem_gb
    String docker = "quay.io/broadinstitute/viral-core:2.5.1"
  }

  String  reads_basename=basename(reads_bam, ".bam")
  String  ref_basename=basename(ref_db, ".fasta")
  Int disk_size = 375

  parameter_meta {
    reads_bam: {
      description: "Unaligned reads in BAM format",
      pattern: ["*.bam"],
      category: "required"
    }
    ref_db: {
      description: "Reference genome in FASTA format",
      pattern: ["*.FASTA"],
      category: "required"
    }
    filter_bam_to_proper_primary_mapped_reads: {
      description: "If specified, reads till be filtered after alignment to include only those flagged as properly paired.",
      category: "optional"
    }
    do_not_require_proper_mapped_pairs_when_filtering: {
      description: "Do not require reads to be properly paired when filtering",
      category: "optional"
    }
    keep_singletons_when_filtering: {
      description: "Keep singletons when filtering",
      category: "optional"
    }
    keep_duplicates_when_filtering: {
      description: "Do not exclude reads marked as duplicates when filtering",
      category: "optional"
    }
  }
  command <<<
    set -ex -o pipefail

    read_utils.py --version | tee VERSION

    ln -s "~{reads_bam}" "~{reads_basename}.bam"
    read_utils.py minimap2_idxstats \
      "~{reads_basename}.bam" \
      "~{ref_db}" \
      --outStats "~{reads_basename}.count.~{ref_basename}.txt.unsorted" \
      ~{true="--filterReadsAfterAlignment"   false="" filter_bam_to_proper_primary_mapped_reads} \
      ~{true="--doNotRequirePairsToBeProper" false="" do_not_require_proper_mapped_pairs_when_filtering} \
      ~{true="--keepSingletons"              false="" keep_singletons_when_filtering} \
      ~{true="--keepDuplicates"              false="" keep_duplicates_when_filtering} \
      --loglevel=DEBUG

    sort -b -r -n -k3 "~{reads_basename}.count.~{ref_basename}.txt.unsorted" > "~{reads_basename}.count.~{ref_basename}.txt"
    head -n ~{topNHits} "~{reads_basename}.count.~{ref_basename}.txt" > "~{reads_basename}.count.~{ref_basename}.top_~{topNHits}_hits.txt"
    TOP_HIT="$(head -1 '~{reads_basename}.count.~{ref_basename}.txt' | cut -f 1 | sed 's/\*/\\*/' | tee '~{reads_basename}.count.~{ref_basename}.top.txt')"

    TOTAL_COUNT_OF_TOP_HIT=$(grep -E "^($TOP_HIT)" "~{reads_basename}.count.~{ref_basename}.txt" | cut -f3 | tee TOTAL_COUNT_OF_TOP_HIT)
    TOTAL_COUNT_OF_LESSER_HITS=$((grep -vE "^(\*|$TOP_HIT)" "~{reads_basename}.count.~{ref_basename}.txt" || echo "0" ) | cut -f3 | paste -sd+ - | bc -l | tee TOTAL_COUNT_OF_LESSER_HITS)
    echo $TOTAL_COUNT_OF_TOP_HIT | tee TOTAL_COUNT_OF_TOP_HIT
    echo $TOTAL_COUNT_OF_LESSER_HITS | tee TOTAL_COUNT_OF_LESSER_HITS

    if [ $TOTAL_COUNT_OF_LESSER_HITS -ne 0 -o $TOTAL_COUNT_OF_TOP_HIT -ne 0 ]; then
      PCT_MAPPING_TO_LESSER_HITS=$( echo "scale=3; 100 * $TOTAL_COUNT_OF_LESSER_HITS / ($TOTAL_COUNT_OF_LESSER_HITS + $TOTAL_COUNT_OF_TOP_HIT)" | \
        bc -l | awk '{printf "%.3f\n", $0}' | tee '~{reads_basename}.count.~{ref_basename}.pct_lesser_hits_of_mapped.txt' )
    else
      echo "PCT_MAPPING_TO_LESSER_HITS cannot be calculated: there were no hits to any sequence"
      PCT_MAPPING_TO_LESSER_HITS=$( echo "null" | tee '~{reads_basename}.count.~{ref_basename}.pct_lesser_hits_of_mapped.txt')
    fi

    TOTAL_READS_IN_INPUT=$(samtools view -c "~{reads_basename}.bam")
    echo $TOTAL_READS_IN_INPUT | tee TOTAL_READS_IN_INPUT
    if [ $TOTAL_READS_IN_INPUT -eq 0 ]; then
      echo "no reads in input bam"
      PCT_OF_INPUT_READS_MAPPED=$(echo "0" | tee "~{reads_basename}.count.~{ref_basename}.pct_total_reads_mapped.txt")
      echo "PCT_TOP_HIT_OF_TOTAL_READS cannot be calculated: there were no mapping hits, or no reads"
      PCT_TOP_HIT_OF_TOTAL_READS=$( echo "null" | tee '~{reads_basename}.count.~{ref_basename}.pct_top_hit_of_total_reads.txt')
    else
      PCT_OF_INPUT_READS_MAPPED=$( echo "scale=3; 100 * ($TOTAL_COUNT_OF_LESSER_HITS + $TOTAL_COUNT_OF_TOP_HIT) / $TOTAL_READS_IN_INPUT" | \
      bc -l | awk '{printf "%.3f\n", $0}' | tee '~{reads_basename}.count.~{ref_basename}.pct_total_reads_mapped.txt' )

      if [ $TOTAL_COUNT_OF_TOP_HIT -ne 0 ]; then
        PCT_TOP_HIT_OF_TOTAL_READS=$( echo "scale=3; 100 * ($TOTAL_COUNT_OF_TOP_HIT / $TOTAL_READS_IN_INPUT)" | \
          bc -l | awk '{printf "%.3f\n", $0}' | tee '~{reads_basename}.count.~{ref_basename}.pct_top_hit_of_total_reads.txt' )
      else
        echo "PCT_TOP_HIT_OF_TOTAL_READS cannot be calculated: there were no mapping hits, or no reads"
        PCT_TOP_HIT_OF_TOTAL_READS=$( echo "null" | tee '~{reads_basename}.count.~{ref_basename}.pct_top_hit_of_total_reads.txt')
      fi
    fi
  >>>

  output {
    File   report           = "~{reads_basename}.count.~{ref_basename}.txt"
    
    File   report_top_hits  = "~{reads_basename}.count.~{ref_basename}.top_~{topNHits}_hits.txt"
    String top_hit_id       = read_string("~{reads_basename}.count.~{ref_basename}.top.txt")

    Int    reads_total          = read_int("TOTAL_READS_IN_INPUT")
    Int    reads_mapped_top_hit = read_int("TOTAL_COUNT_OF_TOP_HIT")
    Int    reads_mapped         = read_int("TOTAL_COUNT_OF_LESSER_HITS") + read_int("TOTAL_COUNT_OF_TOP_HIT")

    String pct_total_reads_mapped     = read_string('~{reads_basename}.count.~{ref_basename}.pct_total_reads_mapped.txt')
    String pct_top_hit_of_total_reads = read_string('~{reads_basename}.count.~{ref_basename}.pct_top_hit_of_total_reads.txt')
    String pct_lesser_hits_of_mapped  = read_string('~{reads_basename}.count.~{ref_basename}.pct_lesser_hits_of_mapped.txt')
    
    String viralngs_version = read_string("VERSION")
  }

  runtime {
    memory: select_first([machine_mem_gb, 15]) + " GB"
    cpu: 4
    docker: "${docker}"
    disks:  "local-disk " + disk_size + " LOCAL"
    disk: disk_size + " GB" # TES
    dx_instance_type: "mem1_ssd1_v2_x4"
    maxRetries: 2
  }
}

task align_and_count_summary {
  input {
    Array[File]+ counts_txt

    String       output_prefix = "count_summary"

    String       docker = "quay.io/broadinstitute/viral-core:2.5.1"
  }

  Int disk_size = 100

  command {
    set -ex -o pipefail

    reports.py --version | tee VERSION
    reports.py aggregate_alignment_counts ${sep=' ' counts_txt} "${output_prefix}".tsv --loglevel=DEBUG
  }

  output {
    File   count_summary    = "${output_prefix}.tsv"
    String viralngs_version = read_string("VERSION")
  }

  runtime {
    memory: "7 GB"
    cpu: 8
    docker: "${docker}"
    disks:  "local-disk " + disk_size + " HDD"
    disk: disk_size + " GB" # TES
    dx_instance_type: "mem1_ssd1_v2_x2"
    maxRetries: 2
  }
}

task aggregate_metagenomics_reports {
  input {
    Array[File]+ kraken_summary_reports 
    String       aggregate_taxon_heading_space_separated  = "Viruses"
    String       aggregate_taxlevel_focus                 = "species"
    Int          aggregate_top_N_hits                     = 5

    String       docker = "quay.io/broadinstitute/viral-classify:2.5.1.0"
  }

  parameter_meta {
    aggregate_taxon_heading_space_separated: { description: "The taxonomic heading to analyze. More than one can be specified." }
    aggregate_taxlevel_focus:                { description: "species,genus,family,order,class,phylum,kingdom,superkingdom" }
    aggregate_top_N_hits:                    { description: "only include the top N hits from a given sample in the aggregate report" }
  }

  String       aggregate_taxon_heading = sub(aggregate_taxon_heading_space_separated, " ", "_") # replace spaces with underscores for use in filename
  Int disk_size = 50

  command {
    set -ex -o pipefail

    metagenomics.py --version | tee VERSION
    metagenomics.py taxlevel_summary \
      ${sep=' ' kraken_summary_reports} \
      --csvOut aggregate_taxa_summary_${aggregate_taxon_heading}_by_${aggregate_taxlevel_focus}_top_${aggregate_top_N_hits}_by_sample.csv \
      --noHist \
      --taxHeading ${aggregate_taxon_heading_space_separated} \
      --taxlevelFocus ${aggregate_taxlevel_focus} \
      --zeroFill --includeRoot --topN ${aggregate_top_N_hits} \
      --loglevel=DEBUG
  }

  output {
    File   krakenuniq_aggregate_taxlevel_summary = "aggregate_taxa_summary_${aggregate_taxon_heading}_by_${aggregate_taxlevel_focus}_top_${aggregate_top_N_hits}_by_sample.csv"
    String viralngs_version                      = read_string("VERSION")
  }

  runtime {
    docker: "${docker}"
    memory: "3 GB"
    cpu: 1
    disks:  "local-disk " + disk_size + " HDD"
    disk: disk_size + " GB" # TES
    dx_instance_type: "mem1_ssd2_v2_x2"
    preemptible: 0
    maxRetries: 2
  }
}

task MultiQC {
  input {
    Array[File]    input_files

    String?        title
    String?        comment
    String?        file_name
    String         out_dir       = "./multiqc-output"
    String?        template
    String?        tag
    String?        ignore_analysis_files
    String?        ignore_sample_names
    File?          sample_names
    Array[String]? exclude_modules
    Array[String]? module_to_use
    String?        output_data_format
    Boolean        force         = false
    Boolean        full_names    = false
    Boolean        data_dir      = false
    Boolean        no_data_dir   = false
    Boolean        zip_data_dir  = false
    Boolean        export        = false
    Boolean        flat          = false
    Boolean        interactive   = true
    Boolean        lint          = false
    Boolean        pdf           = false
    Boolean        megaQC_upload = false # Upload generated report to MegaQC if MegaQC options are found
    File?          config  # directory
    String?        config_yaml

    String         docker = "quay.io/biocontainers/multiqc:1.32--pyhdfd78af_1"
  }

  parameter_meta {
    output_data_format: { description: "[tsv|yaml|json] default:tsv" }
  }

  # use the basename of file_name (sans ".html" extension) if specified; otherwise default to "multiqc"
  String report_filename = if (defined(file_name)) then basename(select_first([file_name]), ".html") else "multiqc"
  Int disk_size = 375

  command {
      set -ex -o pipefail

      echo "${sep='\n' input_files}" > input-filenames.txt
      echo "" >> input-filenames.txt

      # Run MultiQC but allow it to fail (it crashes with exit 1 on empty/invalid zip files)
      set +e
      multiqc \
      --file-list input-filenames.txt \
      --outdir "${out_dir}" \
      ${true="--force" false="" force} \
      ${true="--fullnames" false="" full_names} \
      ${"--title " + title} \
      ${"--comment " + comment} \
      ${"--filename " + file_name} \
      ${"--template " + template} \
      ${"--tag " + tag} \
      ${"--ignore " + ignore_analysis_files} \
      ${"--ignore-samples" + ignore_sample_names} \
      ${"--sample-names " + sample_names} \
      ${true="--exclude " false="" defined(exclude_modules)}${sep=' --exclude ' select_first([exclude_modules,[]])} \
      ${true="--module " false="" defined(module_to_use)}${sep=' --module ' select_first([module_to_use,[]])} \
      ${true="--data-dir" false="" data_dir} \
      ${true="--no-data-dir" false="" no_data_dir} \
      ${"--data-format " + output_data_format} \
      ${true="--zip-data-dir" false="" zip_data_dir} \
      ${true="--export" false="" export} \
      ${true="--flat" false="" flat} \
      ${true="--interactive" false="" interactive} \
      ${true="--lint" false="" lint} \
      ${true="--pdf" false="" pdf} \
      ${false="--no-megaqc-upload" true="" megaQC_upload} \
      ${"--config " + config} \
      ${"--cl-config " + config_yaml }
      MULTIQC_EXIT_CODE=$?
      set -e

      # Ensure output directory exists (MultiQC may remove it if no results found or if it crashed)
      mkdir -p "${out_dir}"

      if [ -z "${file_name}" ]; then
        mv "${out_dir}/${report_filename}_report.html" "${out_dir}/${report_filename}.html" 2>/dev/null || true
      fi

      # Create placeholder HTML report if MultiQC didn't create one (happens when no valid results found or on crash)
      if [ ! -f "${out_dir}/${report_filename}.html" ]; then
        echo "<!DOCTYPE html><html><head><meta charset=\"UTF-8\"><title>MultiQC Report</title></head><body><h1>MultiQC Report</h1><p>No analysis results found in input files.</p></body></html>" > "${out_dir}/${report_filename}.html"
      fi

      # Ensure data directory exists before tarring (MultiQC only creates it when results are found)
      mkdir -p "${out_dir}/${report_filename}_data"
      tar -c "${out_dir}/${report_filename}_data" | gzip -c > "${report_filename}_data.tar.gz"
  }

  output {
      File multiqc_report           = "${out_dir}/${report_filename}.html"
      File multiqc_data_dir_tarball = "${report_filename}_data.tar.gz"
  }

  runtime {
    memory: "8 GB"
    cpu: 16
    docker: "${docker}"
    disks:  "local-disk " + disk_size + " LOCAL"
    disk: disk_size + " GB" # TES
    dx_instance_type: "mem2_ssd1_v2_x2"
    maxRetries: 2
  }
}

task compare_two_genomes {
  input {
    File   genome_one
    File   genome_two
    String out_basename

    String docker = "quay.io/broadinstitute/viral-assemble:2.5.1.0"
  }

  Int disk_size = 50

  command <<<
    set -ex -o pipefail
    assembly.py --version | tee VERSION
    assembly.py alignment_summary "~{genome_one}" "~{genome_two}" --outfileName "~{out_basename}.txt" --printCounts --loglevel=DEBUG
    cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC
    cat /proc/loadavg > CPU_LOAD
    set +o pipefail
    { if [ -f /sys/fs/cgroup/memory.peak ]; then cat /sys/fs/cgroup/memory.peak; elif [ -f /sys/fs/cgroup/memory/memory.peak ]; then cat /sys/fs/cgroup/memory/memory.peak; elif [ -f /sys/fs/cgroup/memory/memory.max_usage_in_bytes ]; then cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes; else echo "0"; fi } > MEM_BYTES
  >>>

  output {
    File   comparison_table = "~{out_basename}.txt"
    Int    max_ram_gb       = ceil(read_float("MEM_BYTES")/1000000000)
    Int    runtime_sec      = ceil(read_float("UPTIME_SEC"))
    String cpu_load         = read_string("CPU_LOAD")
    String viralngs_version = read_string("VERSION")
  }

  runtime {
    memory: "3 GB"
    cpu: 2
    docker: docker
    disks:  "local-disk " + disk_size + " HDD"
    disk: disk_size + " GB" # TES
    dx_instance_type: "mem1_ssd1_v2_x2"
    preemptible: 1
    maxRetries: 2
  }
}