tasks_read_utils
pipes/WDL/tasks/tasks_read_utils.wdl

TASKS tasks_read_utils

File Path pipes/WDL/tasks/tasks_read_utils.wdl
WDL Version 1.0
Type tasks

📋Tasks in this document

Tasks

TASKS max

Inputs

Name Type Description Default
list Array[Int] - -
1 optional input with default value

Command

python3 << CODE
inlist = '~{sep="*" list}'.split('*')
print(str(max(map(int, [x for x in inlist if x]), default = ~{default_empty})))
CODE

Outputs

Name Type Expression
out Int read_int(stdout())

Runtime

Key Value
docker "python:slim"
memory "1 GB"
cpu 1
disks "local-disk " + disk_size + " HDD"
disk disk_size + " GB"
dx_instance_type "mem1_ssd1_v2_x2"
maxRetries 2

TASKS group_bams_by_sample

Inputs

Name Type Description Default
bam_filepaths Array[File] all bam files (localization optional; streamable; patterns: *.bam) -

Command

python3 << CODE
import os.path

# WDL arrays to python arrays
bam_uris = '~{sep="*" bam_filepaths}'.split('*')

# lookup table files to dicts
sample_to_bams = {}
for bam in bam_uris:
  # filename must be <samplename>.l<xxx>.bam
  assert bam.endswith('.bam'), "filename does not end in .bam: {}".format(bam)
  bam_base = os.path.basename(bam)
  i = bam_base.index('.l')
  assert i>0, "filename does not contain a .l -- {}".format(bam)
  sample = bam_base[:i]
  sample_to_bams.setdefault(sample, [])
  sample_to_bams[sample].append(bam)

# write outputs
with open('grouped_bams', 'wt') as out_groups:
  with open('sample_names', 'wt') as out_samples:
    for sample in sorted(sample_to_bams.keys()):
      out_samples.write(sample+'\n')
      out_groups.write('\t'.join(sample_to_bams[sample])+'\n')
CODE

Outputs

Name Type Expression
grouped_bam_filepaths Array[Array[File]+] read_tsv('grouped_bams')
sample_names Array[String] read_lines('sample_names')

Runtime

Key Value
docker "python:slim"
memory "1 GB"
cpu 1
disks "local-disk " + disk_size + " HDD"
disk disk_size + " GB"
dx_instance_type "mem1_ssd1_v2_x2"
maxRetries 2

TASKS get_bam_samplename

Inputs

Name Type Description Default
bam File - -
1 optional input with default value

Command

set -e -o pipefail
samtools view -H "~{bam}" | \
  perl -lane 'if (/^\@RG\t.*SM:(\S+)/) { print "$1" }' | \
  sort | uniq > SAMPLE_NAME

Outputs

Name Type Expression
sample_name String read_string("SAMPLE_NAME")

Runtime

Key Value
docker docker
memory "1 GB"
cpu 1
disks "local-disk " + disk_size + " HDD"
disk disk_size + " GB"
dx_instance_type "mem1_ssd1_v2_x2"
maxRetries 2

TASKS get_sample_meta

Inputs

Name Type Description Default
samplesheets_extended Array[File] - -
1 optional input with default value

Command

python3 << CODE
import os.path
import csv
import json
import util.file

# WDL arrays to python arrays
library_metadata = '~{sep="*" samplesheets_extended}'.split('*')

# lookup table files to dicts
meta = {}
meta_cols = ('sample','amplicon_set','control')
for col in meta_cols:
  meta[col] = {}
for libfile in library_metadata:
  with open(libfile, 'rt') as inf:
    for row in csv.DictReader(inf, delimiter='\t'):
      sanitized = util.file.string_to_file_name(row['sample'])
      for col in meta_cols:
        meta[col].setdefault(sanitized, '')
        if row.get(col):
          meta[col][sanitized] = row[col]

# write outputs
for col in meta_cols:
  with open(col, 'wt') as outf:
    json.dump(meta[col], outf, indent=2)
CODE

Outputs

Name Type Expression
original_names Map[String,String] read_json('sample')
amplicon_set Map[String,String] read_json('amplicon_set')
control Map[String,String] read_json('control')

Runtime

Key Value
docker docker
memory "1 GB"
cpu 1
disks "local-disk " + disk_size + " HDD"
disk disk_size + " GB"
dx_instance_type "mem1_ssd1_v2_x2"
maxRetries 2

TASKS merge_and_reheader_bams

Merge and/or reheader bam files using a mapping table. This task can modify read group tags in a BAM header field for single BAM files or as part of a BAM merge operation. The output is a single BAM file (given one or more input BAMs) and a three-column tab delimited text table that defines: the field, the old value, and the new value (e.g. LB, old_lib_name, new_lib_name or SM, old_sample_name, new_sample_name)

Inputs

Name Type Description Default
in_bams Array[File]+ - -
sample_name String? - -
reheader_table File? - -
5 optional inputs with default values

Command

set -ex -o pipefail

read_utils.py --version | tee VERSION
mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)

if [ ~{length(in_bams)} -gt 1 ]; then
    read_utils.py merge_bams ~{sep=" " in_bams} merged.bam --JVMmemory="$mem_in_mb"m --loglevel DEBUG
else
    echo "Skipping merge, only one input file"
    cp ~{sep=" " in_bams} merged.bam
fi    

# remap all SM values to user specified value
if [ -n "~{sample_name}" ]; then
  # create sample name remapping table based on existing sample names
  samtools view -H merged.bam | perl -n -e'/SM:(\S+)/ && print "SM\t$1\t'"~{sample_name}"'\n"' | sort | uniq >> reheader_table.txt
fi

# remap arbitrary headers using user specified table
if [[ -f "~{reheader_table}" ]]; then
  cat "~{reheader_table}" >> reheader_table.txt
fi

# reheader bam file if requested
if [ -s reheader_table.txt ]; then
  read_utils.py reheader_bam merged.bam reheader_table.txt "~{out_basename}.bam" --loglevel DEBUG
else
  mv merged.bam "~{out_basename}.bam"
fi

# summary stats on merged output
samtools view -c "~{out_basename}.bam" | tee read_count_merged
samtools flagstat "~{out_basename}.bam" | tee "~{out_basename}.bam.flagstat.txt"

# fastqc can be really slow on large files, make it optional
if [ "~{run_fastqc}" = "true" ]; then
  reports.py fastqc "~{out_basename}.bam" "~{out_basename}.fastqc.html"
fi

Outputs

Name Type Expression
out_bam File "~{out_basename}.bam"
read_count Int read_int("read_count_merged")
flagstat File "~{out_basename}.bam.flagstat.txt"
fastqc File? "~{out_basename}.fastqc.html"
viralngs_version String read_string("VERSION")

Runtime

Key Value
docker docker
memory machine_mem_gb + " GB"
cpu 2
disks "local-disk " + disk_size + " LOCAL"
disk disk_size + " GB"
dx_instance_type "mem1_ssd2_v2_x4"
preemptible 0
maxRetries 2

TASKS rmdup_ubam

Perform read deduplication on unaligned reads.

Inputs

Name Type Description Default
reads_unmapped_bam File unaligned reads in BAM format -
3 optional inputs with default values

Command

set -ex -o pipefail
mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)
read_utils.py --version | tee VERSION

read_utils.py rmdup_"~{method}"_bam \
  "~{reads_unmapped_bam}" \
  "~{reads_basename}".dedup.bam \
  --JVMmemory "$mem_in_mb"m \
  --loglevel=DEBUG

samtools view -c "~{reads_basename}.dedup.bam" | tee dedup_read_count_post
reports.py fastqc "~{reads_basename}.dedup.bam" "~{reads_basename}.dedup_fastqc.html" --out_zip "~{reads_basename}.dedup_fastqc.zip"

Outputs

Name Type Expression
dedup_bam File "~{reads_basename}.dedup.bam"
dedup_fastqc File "~{reads_basename}.dedup_fastqc.html"
dedup_fastqc_zip File "~{reads_basename}.dedup_fastqc.zip"
dedup_read_count_post Int read_int("dedup_read_count_post")
viralngs_version String read_string("VERSION")

Runtime

Key Value
docker docker
memory machine_mem_gb + " GB"
cpu 2
disks "local-disk " + disk_size + " LOCAL"
disk disk_size + " GB"
dx_instance_type "mem2_ssd1_v2_x2"
maxRetries 2

TASKS downsample_bams

Downsample reads in a BAM file randomly subsampling to a target read count. Read deduplication can occur either before or after random subsampling, or not at all (default: not at all).

Inputs

Name Type Description Default
reads_bam Array[File]+ - -
readCount Int? - -
machine_mem_gb Int? - -
3 optional inputs with default values

Command

set -ex -o pipefail

# find 90% memory
mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)

if [[ "~{deduplicateBefore}" == "true" ]]; then
  DEDUP_OPTION="--deduplicateBefore"
elif [[ "~{deduplicateAfter}" == "true" ]]; then
  DEDUP_OPTION="--deduplicateAfter"
fi

if [[ "~{deduplicateBefore}" == "true" && "~{deduplicateAfter}" == "true" ]]; then
  echo "deduplicateBefore and deduplicateAfter are mutually exclusive. Only one can be used."
  exit 1
fi
    
read_utils.py --version | tee VERSION

read_utils.py downsample_bams \
    ~{sep=" " reads_bam} \
    --outPath ./output \
    ~{'--readCount=' + readCount} \
    $DEDUP_OPTION \
    --JVMmemory "$mem_in_mb"m

Outputs

Name Type Expression
downsampled_bam Array[File] glob("output/*.downsampled-*.bam")
viralngs_version String read_string("VERSION")

Runtime

Key Value
docker docker
memory select_first([machine_mem_gb, 3]) + " GB"
cpu 4
disks "local-disk " + disk_size + " LOCAL"
disk disk_size + " GB"
dx_instance_type "mem1_ssd1_v2_x4"
maxRetries 2

TASKS FastqToUBAM

Converts FASTQ (paired or single) to uBAM and adds read group information.

Inputs

Name Type Description Default
fastq_1 File Unaligned read1 file in fastq format -
fastq_2 File? Unaligned read2 file in fastq format. This should be empty for single-end read conversion and required for paired-end reads. If provided, it must match fastq_1 in length and order. -
sample_name String Sample name. This is required and will populate the 'SM' read group value and will be used as the output filename (must be filename-friendly). -
library_name String Library name. This is required and will populate the 'LB' read group value. SM & LB combinations must be identical for any sequencing reads generated from the same sequencing library, and must be distinct for any reads generated from different libraries. -
readgroup_name String? - -
platform_unit String? - -
run_date String? - -
platform_name String Sequencing platform. This is required and will populate the 'PL' read group value. Must be one of CAPILLARY, DNBSEQ, ELEMENT, HELICOS, ILLUMINA, IONTORRENT, LS454, ONT, PACBIO, SINGULAR, SOLID, or ULTIMA. -
sequencing_center String? - -
additional_picard_options String? A string containing additional options to pass to picard FastqToSam beyond those made explicitly available as inputs to this task. For valid values, see: https://broadinstitute.github.io/picard/command-line-overview.html#FastqToSam -
4 optional inputs with default values

Command

set -ex -o pipefail

# find 90% memory
mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)

read_utils.py --version | tee VERSION

if [[ ! "~{platform_name}" =~ ^(CAPILLARY|DNBSEQ|ELEMENT|HELICOS|ILLUMINA|IONTORRENT|LS454|ONT|PACBIO|SINGULAR|SOLID|ULTIMA)$ ]]; then
  exit 1
fi

picard -Xmx"$mem_in_mb"m \
  FastqToSam \
  FASTQ="~{fastq_1}" \
  ~{"FASTQ2=" + fastq_2} \
  SAMPLE_NAME="~{sample_name}" \
  LIBRARY_NAME="~{library_name}" \
  OUTPUT="~{sample_name}".bam \
  ~{"READ_GROUP_NAME=" + readgroup_name} \
  ~{"PLATFORM_UNIT=" + platform_unit} \
  ~{"RUN_DATE=" + run_date} \
  ~{"PLATFORM=" + platform_name} \
  ~{"SEQUENCING_CENTER=" + sequencing_center} ~{additional_picard_options}

Outputs

Name Type Expression
unmapped_bam File "~{sample_name}.bam"

Runtime

Key Value
docker docker
cpu cpus
memory mem_gb + " GB"
disks "local-disk " + disk_size + " LOCAL"
disk disk_size + " GB"
dx_instance_type "mem1_ssd1_v2_x2"
maxRetries 2

TASKS read_depths

Inputs

Name Type Description Default
aligned_bam File - -
2 optional inputs with default values

Command

set -e -o pipefail

samtools depth "~{aligned_bam}" > "~{out_basename}.read_depths.txt"

Outputs

Name Type Expression
read_depths File "~{out_basename}.read_depths.txt"

Runtime

Key Value
docker docker
cpu 2
memory "3 GB"
disks "local-disk " + disk_size + " HDD"
disk disk_size + " GB"
dx_instance_type "mem1_ssd1_v2_x2"
maxRetries 2
← Back to Index

tasks_read_utils - WDL Source Code

version 1.0

task max {
  # Return the maximum of an array of integers, or default_empty when the
  # array is empty (or contains only empty entries after serialization).
  input {
    Array[Int] list
    Int        default_empty = 0
  }
  Int disk_size = 10
  command <<<
    python3 << CODE
    values = [int(x) for x in '~{sep="*" list}'.split('*') if x]
    print(max(values, default = ~{default_empty}))
    CODE
  >>>
  output {
    Int out = read_int(stdout())
  }
  runtime {
    docker: "python:slim"
    memory: "1 GB"
    cpu: 1
    disks:  "local-disk " + disk_size + " HDD"
    disk: disk_size + " GB" # TES
    dx_instance_type: "mem1_ssd1_v2_x2"
    maxRetries: 2
  }
}

task group_bams_by_sample {
  # Group a flat list of per-library BAM files into per-sample lists based on
  # the filename convention <samplename>.l<xxx>.bam. Outputs are sorted by
  # sample name, with grouped_bam_filepaths row i corresponding to
  # sample_names entry i.
  input {
    Array[File] bam_filepaths
  }
  Int disk_size = 100
  parameter_meta {
    bam_filepaths: {
      description: "all bam files",
      localization_optional: true,
      stream: true,
      patterns: ["*.bam"]
    }
  }
  command <<<
    python3 << CODE
    import os.path

    # WDL arrays to python arrays
    bam_uris = '~{sep="*" bam_filepaths}'.split('*')

    # lookup table files to dicts
    sample_to_bams = {}
    for bam in bam_uris:
      # filename must be <samplename>.l<xxx>.bam
      assert bam.endswith('.bam'), "filename does not end in .bam: {}".format(bam)
      bam_base = os.path.basename(bam)
      # note: str.index finds the FIRST '.l' in the basename, so everything
      # before that occurrence is taken as the sample name; sample names that
      # themselves contain '.l' will be truncated
      i = bam_base.index('.l')
      assert i>0, "filename does not contain a .l -- {}".format(bam)
      sample = bam_base[:i]
      sample_to_bams.setdefault(sample, [])
      sample_to_bams[sample].append(bam)

    # write outputs
    with open('grouped_bams', 'wt') as out_groups:
      with open('sample_names', 'wt') as out_samples:
        for sample in sorted(sample_to_bams.keys()):
          out_samples.write(sample+'\n')
          out_groups.write('\t'.join(sample_to_bams[sample])+'\n')
    CODE
  >>>
  output {
    # one row of tab-separated BAM paths per sample, same order as sample_names
    Array[Array[File]+] grouped_bam_filepaths = read_tsv('grouped_bams')
    Array[String]       sample_names          = read_lines('sample_names')
  }
  runtime {
    docker: "python:slim"
    memory: "1 GB"
    cpu: 1
    disks:  "local-disk " + disk_size + " HDD"
    disk: disk_size + " GB" # TES
    dx_instance_type: "mem1_ssd1_v2_x2"
    maxRetries: 2
  }
}

task get_bam_samplename {
  # Extract the sample name (SM tag) from the @RG header lines of a BAM file.
  input {
    File    bam
    String  docker = "quay.io/broadinstitute/viral-core:2.5.1"
  }
  # disk: room for the localized input BAM plus a fixed margin
  Int   disk_size = round(size(bam, "GB")) + 50
  command <<<
    set -e -o pipefail
    # pull the distinct SM: values out of the @RG header lines
    samtools view -H "~{bam}" | \
      perl -lane 'if (/^\@RG\t.*SM:(\S+)/) { print "$1" }' | \
      sort | uniq > SAMPLE_NAME
  >>>
  runtime {
    docker: docker
    memory: "1 GB"
    cpu: 1
    disks:  "local-disk " + disk_size + " HDD"
    disk: disk_size + " GB" # TES
    dx_instance_type: "mem1_ssd1_v2_x2"
    maxRetries: 2
  }
  output {
    # NOTE(review): if the BAM carries more than one distinct SM value,
    # SAMPLE_NAME contains multiple lines -- presumably inputs are
    # single-sample; confirm against callers
    String sample_name = read_string("SAMPLE_NAME")
  }
}

task get_sample_meta {
  # Collate per-sample metadata (the 'sample', 'amplicon_set', and 'control'
  # columns) from one or more extended samplesheet TSVs into JSON maps keyed
  # by sanitized sample name. When a sample appears in multiple sheets/rows,
  # later non-empty values overwrite earlier ones.
  input {
    Array[File] samplesheets_extended

    String      docker = "quay.io/broadinstitute/viral-core:2.5.1"
  }
  Int disk_size = 50
  command <<<
    python3 << CODE
    import os.path
    import csv
    import json
    import util.file

    # WDL arrays to python arrays
    library_metadata = '~{sep="*" samplesheets_extended}'.split('*')

    # lookup table files to dicts
    meta = {}
    meta_cols = ('sample','amplicon_set','control')
    for col in meta_cols:
      meta[col] = {}
    for libfile in library_metadata:
      with open(libfile, 'rt') as inf:
        for row in csv.DictReader(inf, delimiter='\t'):
          # map keys are filename-sanitized sample names
          sanitized = util.file.string_to_file_name(row['sample'])
          for col in meta_cols:
            # default to '' so every seen sample has an entry in every map
            meta[col].setdefault(sanitized, '')
            if row.get(col):
              meta[col][sanitized] = row[col]

    # write outputs
    for col in meta_cols:
      with open(col, 'wt') as outf:
        json.dump(meta[col], outf, indent=2)
    CODE
  >>>
  output {
    # maps from sanitized sample name -> column value ('' when absent)
    Map[String,String] original_names = read_json('sample')
    Map[String,String] amplicon_set   = read_json('amplicon_set')
    Map[String,String] control        = read_json('control')
  }
  runtime {
    docker: docker
    memory: "1 GB"
    cpu: 1
    disks:  "local-disk " + disk_size + " HDD"
    disk: disk_size + " GB" # TES
    dx_instance_type: "mem1_ssd1_v2_x2"
    maxRetries: 2
  }
}


task merge_and_reheader_bams {
    meta {
      description: "Merge and/or reheader bam files using a mapping table. This task can modify read group tags in a BAM header field for single BAM files or as part of a BAM merge operation. The output is a single BAM file (given one or more input BAMs) and a three-column tab delimited text table that defines: the field, the old value, and the new value (e.g. LB, old_lib_name, new_lib_name or SM, old_sample_name, new_sample_name)"
    }

    input {
      Array[File]+ in_bams
      String?      sample_name
      File?        reheader_table
      # outputs are named after the first input BAM unless overridden
      String       out_basename = basename(in_bams[0], ".bam")

      Boolean      run_fastqc = false
      String       docker = "quay.io/broadinstitute/viral-core:2.5.1"
      Int          disk_size = 750
      Int          machine_mem_gb = 4
    }
    
    command <<<
        set -ex -o pipefail

        read_utils.py --version | tee VERSION
        mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)

        # merge only when there is more than one input; otherwise just copy
        if [ ~{length(in_bams)} -gt 1 ]; then
            read_utils.py merge_bams ~{sep=' ' in_bams} merged.bam --JVMmemory="$mem_in_mb"m --loglevel DEBUG
        else
            echo "Skipping merge, only one input file"
            cp ~{sep=' ' in_bams} merged.bam
        fi    

        # remap all SM values to user specified value
        if [ -n "~{sample_name}" ]; then
          # create sample name remapping table based on existing sample names
          samtools view -H merged.bam | perl -n -e'/SM:(\S+)/ && print "SM\t$1\t'"~{sample_name}"'\n"' | sort | uniq >> reheader_table.txt
        fi

        # remap arbitrary headers using user specified table
        # (appended after any SM remappings generated above)
        if [[ -f "~{reheader_table}" ]]; then
          cat "~{reheader_table}" >> reheader_table.txt
        fi

        # reheader bam file if requested
        # (reheader_table.txt exists only if one of the branches above ran)
        if [ -s reheader_table.txt ]; then
          read_utils.py reheader_bam merged.bam reheader_table.txt "~{out_basename}.bam" --loglevel DEBUG
        else
          mv merged.bam "~{out_basename}.bam"
        fi

        # summary stats on merged output
        samtools view -c "~{out_basename}.bam" | tee read_count_merged
        samtools flagstat "~{out_basename}.bam" | tee "~{out_basename}.bam.flagstat.txt"

        # fastqc can be really slow on large files, make it optional
        if [ "~{run_fastqc}" = "true" ]; then
          reports.py fastqc "~{out_basename}.bam" "~{out_basename}.fastqc.html"
        fi
    >>>

    output {
        File   out_bam          = "~{out_basename}.bam"
        Int    read_count       = read_int("read_count_merged")
        File   flagstat         = "~{out_basename}.bam.flagstat.txt"
        # only produced when run_fastqc is true
        File?  fastqc           = "~{out_basename}.fastqc.html"
        String viralngs_version = read_string("VERSION")
    }

    runtime {
        docker: docker
        memory: machine_mem_gb + " GB"
        cpu: 2
        disks:  "local-disk " + disk_size + " LOCAL"
        disk: disk_size + " GB" # TES
        dx_instance_type: "mem1_ssd2_v2_x4"
        preemptible: 0
        maxRetries: 2
    }
}

task rmdup_ubam {
  meta {
    description: "Perform read deduplication on unaligned reads."
  }

  input {
    File    reads_unmapped_bam
    String  method = "mvicuna"

    Int     machine_mem_gb = 7
    String  docker = "quay.io/broadinstitute/viral-core:2.5.1"
  }

  # scratch space: fixed baseline plus twice the input size for the deduped copy
  Int disk_size = 375 + 2 * ceil(size(reads_unmapped_bam, "GB"))

  parameter_meta {
    reads_unmapped_bam: { description: "unaligned reads in BAM format", patterns: ["*.bam"] }
    method:             { description: "mvicuna or cdhit" }
  }

  String reads_basename = basename(reads_unmapped_bam, ".bam")
  
  command <<<
    set -ex -o pipefail
    mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)
    read_utils.py --version | tee VERSION

    # dispatch to read_utils.py rmdup_mvicuna_bam or rmdup_cdhit_bam per 'method'
    read_utils.py rmdup_"~{method}"_bam \
      "~{reads_unmapped_bam}" \
      "~{reads_basename}".dedup.bam \
      --JVMmemory "$mem_in_mb"m \
      --loglevel=DEBUG

    # post-dedup read count and fastqc report on the deduplicated BAM
    samtools view -c "~{reads_basename}.dedup.bam" | tee dedup_read_count_post
    reports.py fastqc "~{reads_basename}.dedup.bam" "~{reads_basename}.dedup_fastqc.html" --out_zip "~{reads_basename}.dedup_fastqc.zip"
  >>>

  output {
    File   dedup_bam             = "~{reads_basename}.dedup.bam"
    File   dedup_fastqc          = "~{reads_basename}.dedup_fastqc.html"
    File   dedup_fastqc_zip      = "~{reads_basename}.dedup_fastqc.zip"
    Int    dedup_read_count_post = read_int("dedup_read_count_post")
    String viralngs_version      = read_string("VERSION")
  }

  runtime {
    docker: docker
    memory: machine_mem_gb + " GB"
    cpu:    2
    disks:  "local-disk " + disk_size + " LOCAL"
    disk: disk_size + " GB" # TES
    dx_instance_type: "mem2_ssd1_v2_x2"
    maxRetries: 2
  }
}

task downsample_bams {
  meta {
    description: "Downsample reads in a BAM file randomly subsampling to a target read count. Read deduplication can occur either before or after random subsampling, or not at all (default: not at all)."
    volatile: true
  }

  input {
    Array[File]+ reads_bam
    Int?         readCount
    Boolean      deduplicateBefore = false
    Boolean      deduplicateAfter = false

    Int?         machine_mem_gb
    String       docker = "quay.io/broadinstitute/viral-core:2.5.1"
  }

  Int disk_size = 750

  # NOTE: converted from the legacy `command { ${...} }` form to the heredoc
  # `command <<< ~{...} >>>` form used by every other task in this file, so
  # bash's own ${VAR} syntax no longer collides with WDL placeholders.
  command <<<
    set -ex -o pipefail

    # find 90% memory
    mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)

    # the two dedup modes are mutually exclusive; fail fast before any work
    if [[ "~{deduplicateBefore}" == "true" && "~{deduplicateAfter}" == "true" ]]; then
      echo "deduplicateBefore and deduplicateAfter are mutually exclusive. Only one can be used."
      exit 1
    fi

    DEDUP_OPTION=""
    if [[ "~{deduplicateBefore}" == "true" ]]; then
      DEDUP_OPTION="--deduplicateBefore"
    elif [[ "~{deduplicateAfter}" == "true" ]]; then
      DEDUP_OPTION="--deduplicateAfter"
    fi

    read_utils.py --version | tee VERSION

    # $DEDUP_OPTION is deliberately unquoted: when empty it must expand to
    # no argument at all rather than an empty-string argument
    read_utils.py downsample_bams \
        ~{sep=' ' reads_bam} \
        --outPath ./output \
        ~{'--readCount=' + readCount} \
        $DEDUP_OPTION \
        --JVMmemory "$mem_in_mb"m
  >>>

  output {
    # downsampled BAMs emitted by read_utils.py under ./output
    Array[File] downsampled_bam  = glob("output/*.downsampled-*.bam")
    String      viralngs_version = read_string("VERSION")
  }

  runtime {
    docker: docker
    memory: select_first([machine_mem_gb, 3]) + " GB"
    cpu:    4
    disks:  "local-disk " + disk_size + " LOCAL"
    disk: disk_size + " GB" # TES
    dx_instance_type: "mem1_ssd1_v2_x4"
    maxRetries: 2
  }
}

task FastqToUBAM {
  meta {
    description: "Converts FASTQ (paired or single) to uBAM and adds read group information."
  }
  input {
    File    fastq_1
    File?   fastq_2
    String  sample_name
    String  library_name
    String? readgroup_name
    String? platform_unit
    String? run_date
    String  platform_name
    String? sequencing_center
    String? additional_picard_options

    Int     cpus = 2
    Int     mem_gb = 4
    Int     disk_size = 750
    String  docker = "quay.io/broadinstitute/viral-core:2.5.1"
  }
  parameter_meta {
    fastq_1: { description: "Unaligned read1 file in fastq format", patterns: ["*.fastq", "*.fastq.gz", "*.fq", "*.fq.gz"] }
    fastq_2: { description: "Unaligned read2 file in fastq format. This should be empty for single-end read conversion and required for paired-end reads. If provided, it must match fastq_1 in length and order.", patterns: ["*.fastq", "*.fastq.gz", "*.fq", "*.fq.gz"] }
    sample_name: { description: "Sample name. This is required and will populate the 'SM' read group value and will be used as the output filename (must be filename-friendly)." }
    library_name: { description: "Library name. This is required and will populate the 'LB' read group value. SM & LB combinations must be identical for any sequencing reads generated from the same sequencing library, and must be distinct for any reads generated from different libraries." }
    platform_name: { description: "Sequencing platform. This is required and will populate the 'PL' read group value. Must be one of CAPILLARY, DNBSEQ, ELEMENT, HELICOS, ILLUMINA, IONTORRENT, LS454, ONT, PACBIO, SINGULAR, SOLID, or ULTIMA." }
    additional_picard_options: { description: "A string containing additional options to pass to picard FastqToSam beyond those made explicitly available as inputs to this task. For valid values, see: https://broadinstitute.github.io/picard/command-line-overview.html#FastqToSam" }
  }
  command <<<
      set -ex -o pipefail

      # find 90% memory
      mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)

      read_utils.py --version | tee VERSION

      # validate the PL read group value; fail fast with a diagnostic rather
      # than exiting silently (the accepted set matches the parameter_meta
      # documentation for platform_name)
      if [[ ! "~{platform_name}" =~ ^(CAPILLARY|DNBSEQ|ELEMENT|HELICOS|ILLUMINA|IONTORRENT|LS454|ONT|PACBIO|SINGULAR|SOLID|ULTIMA)$ ]]; then
        echo "ERROR: platform_name '~{platform_name}' is not a valid PL read group value" >&2
        exit 1
      fi

      # optional inputs expand to nothing when unset, so picard only sees
      # the read group fields that were actually provided
      picard -Xmx"$mem_in_mb"m \
        FastqToSam \
        FASTQ="~{fastq_1}" \
        ~{"FASTQ2=" + fastq_2} \
        SAMPLE_NAME="~{sample_name}" \
        LIBRARY_NAME="~{library_name}" \
        OUTPUT="~{sample_name}".bam \
        ~{"READ_GROUP_NAME=" + readgroup_name} \
        ~{"PLATFORM_UNIT=" + platform_unit} \
        ~{"RUN_DATE=" + run_date} \
        ~{"PLATFORM=" + platform_name} \
        ~{"SEQUENCING_CENTER=" + sequencing_center} ~{additional_picard_options}
  >>>
  runtime {
    docker: docker
    cpu: cpus
    memory: mem_gb + " GB"
    disks:  "local-disk " + disk_size + " LOCAL"
    disk: disk_size + " GB" # TES
    dx_instance_type: "mem1_ssd1_v2_x2"
    maxRetries: 2
  }
  output {
    File unmapped_bam = "~{sample_name}.bam"
  }
}

task read_depths {
  # Compute per-position read depths for an aligned BAM via `samtools depth`.
  input {
    File      aligned_bam

    # outputs are named after the input BAM unless overridden
    String    out_basename = basename(aligned_bam, '.bam')
    String    docker = "quay.io/broadinstitute/viral-core:2.5.1"
  }
  Int disk_size = 200
  command <<<
    set -e -o pipefail

    samtools depth "~{aligned_bam}" > "~{out_basename}.read_depths.txt"
  >>>

  output {
    # NOTE(review): samtools depth emits one tab-delimited line per covered
    # position (chrom, pos, depth); zero-depth positions are presumably
    # omitted under default options -- confirm if downstream code assumes
    # complete coverage rows
    File read_depths = "~{out_basename}.read_depths.txt"
  }
  runtime {
    docker: docker
    cpu:    2
    memory: "3 GB"
    disks:  "local-disk " + disk_size + " HDD"
    disk: disk_size + " GB" # TES
    dx_instance_type: "mem1_ssd1_v2_x2"
    maxRetries: 2
  }
}