tasks_intrahost
pipes/WDL/tasks/tasks_intrahost.wdl

TASKS tasks_intrahost

File Path pipes/WDL/tasks/tasks_intrahost.wdl
WDL Version 1.0
Type tasks

📋Tasks in this document

Tasks

TASKS polyphonia_detect_cross_contamination

Inputs

Name Type Description Default
lofreq_vcfs Array[File] VCF file(s) output by LoFreq or GATK; must use reference provided by reference_fasta -
genome_fastas Array[File] Unaligned consensus genome or genomes -
reference_fasta File Reference fasta file -
read_depths Array[File]? Read depth tables (tab-separated, no header: reference name, position, read depth); provide alongside vcf files or heterozygosity tables if min-depth>0 -
masked_positions String? 1-indexed positions to mask (e.g., 1-10,50,55-70) -
masked_positions_file File? 1-indexed positions to mask, one per line -
plate_maps Array[File]? Optional plate map(s) (tab-separated, no header: sample name, plate position (e.g., A8)) -
plate_columns Int? Number of columns in plate (e.g., 1, 2, 3, 4) -
plate_rows Int? Number of rows in plate (e.g., A, B, C, D) -
17 optional inputs with default values

Command

set -e -o pipefail

# commented out until polyphonia can report its own version
#polyphonia --version | tee POLYPHONIA_VERSION

mkdir -p figs
    
# prep plate maps input, if specified
PLATE_MAPS_INPUT=""
if [ -n "~{sep=' ' plate_maps}" ]; then
  PLATE_MAPS_INPUT="--plate-map ~{sep=' ' plate_maps}"
fi

polyphonia cross_contamination \
  --ref ~{reference_fasta} \
  --vcf ~{sep=" " lofreq_vcfs} \
  --consensus ~{sep=" " genome_fastas} \
  --read-depths ~{sep=" " read_depths} \
  ~{'--min-covered ' + min_genome_coverage} \
  ~{'--min-depth ' + min_read_depth} \
  ~{'--min-readcount ' + min_readcount} \
  ~{'--max-mismatches ' + max_mismatches} \
  ~{'--min-matches ' + min_matches} \
  ~{'--min-matches-proportion ' + min_matches_proportion} \
  ~{'--min-maf ' + min_maf} \
  ~{'--masked-positions ' + masked_positions} \
  ~{'--masked-positions-file ' + masked_positions_file} \
  $PLATE_MAPS_INPUT \
  ~{'--plate-size ' + plate_size} \
  ~{'--plate-columns ' + plate_columns} \
  ~{'--plate-rows ' + plate_rows} \
  --compare-direct ~{true="TRUE" false="FALSE" compare_direct_neighbors} \
  --compare-diagonal ~{true="TRUE" false="FALSE" compare_diagonal_neighbors} \
  --compare-row ~{true="TRUE" false="FALSE" compare_full_row} \
  --compare-column ~{true="TRUE" false="FALSE" compare_full_column} \
  --compare-plate ~{true="TRUE" false="FALSE" compare_full_plate} \
  --output ~{out_basename}.txt \
  --out-figures figs \
  --cores `nproc` \
  --verbose TRUE \
  --overwrite TRUE \
  --print-all-iSNVs ~{true="TRUE" false="FALSE" print_all_isnvs} \
  --print-all ~{true="TRUE" false="FALSE" print_all}

Outputs

Name Type Expression
report File "~{out_basename}.txt"
figures Array[File] glob("figs/*")

Runtime

Key Value
docker docker
cpu 4
memory "13 GB"
disks "local-disk " + disk_size + " HDD"
disk disk_size + " GB"
dx_instance_type "mem1_ssd1_v2_x4"
maxRetries 2

TASKS lofreq

Inputs

Name Type Description Default
aligned_bam File - -
reference_fasta File - -
2 optional inputs with default values

Command

set -e -o pipefail

lofreq version | grep version | sed 's/.* \(.*\)/\1/g' | tee LOFREQ_VERSION

# make local copies because CWD is writeable but localization dir isn't always
cp "~{aligned_bam}" aligned.bam
python3<<CODE
import shutil
import util.file
with util.file.fastas_with_sanitized_ids("~{reference_fasta}", use_tmp=True) as sanitized_fastas:
    shutil.copyfile(sanitized_fastas[0], 'reference.fasta')
CODE

# samtools faidx fails if fasta is empty
if [ $(grep -v '^>' reference.fasta | tr -d '\nNn' | wc -c) == "0" ]; then
  touch "~{out_basename}.vcf"
  exit 0
fi

# index for lofreq
samtools faidx reference.fasta
samtools index aligned.bam

# lofreq
lofreq call \
  -f reference.fasta \
  -o "~{out_basename}.vcf" \
  aligned.bam

Outputs

Name Type Expression
report_vcf File "~{out_basename}.vcf"
lofreq_version String read_string("LOFREQ_VERSION")

Runtime

Key Value
docker docker
cpu 2
memory "3 GB"
disks "local-disk " + disk_size + " HDD"
disk disk_size + " GB"
dx_instance_type "mem1_ssd1_v2_x2"
maxRetries 2

TASKS isnvs_per_sample

Inputs

Name Type Description Default
mapped_bam File - -
assembly_fasta File - -
threads Int? - -
minReadsPerStrand Int? - -
maxBias Int? - -
machine_mem_gb Int? - -
3 optional inputs with default values

Command

intrahost.py --version | tee VERSION
intrahost.py vphaser_one_sample \
    ~{mapped_bam} \
    ~{assembly_fasta} \
    vphaser2.~{sample_name}.txt.gz \
    ~{'--vphaserNumThreads=' + threads} \
    ~{true="--removeDoublyMappedReads" false="" removeDoublyMappedReads} \
    ~{'--minReadsEach=' + minReadsPerStrand} \
    ~{'--maxBias=' + maxBias}

Outputs

Name Type Expression
isnvsFile File "vphaser2.~{sample_name}.txt.gz"
viralngs_version String read_string("VERSION")

Runtime

Key Value
docker "~{docker}"
memory select_first([machine_mem_gb, 7]) + " GB"
dx_instance_type "mem1_ssd1_v2_x8"
maxRetries 2

TASKS isnvs_vcf

Inputs

Name Type Description Default
vphaser2Calls Array[File] vphaser output; ex. vphaser2.<sample>.txt.gz -
perSegmentMultiAlignments Array[File] aligned_##.fasta, where ## is segment number -
reference_fasta File - -
snpEffRef Array[String]? list of accessions to build/find snpEff database -
sampleNames Array[String]? list of sample names -
emailAddress String? email address passed to NCBI if we need to download reference sequences -
machine_mem_gb Int? - -
2 optional inputs with default values

Command

set -ex -o pipefail

intrahost.py --version | tee VERSION

SAMPLES="~{sep=" " sampleNames}"
if [ -n "$SAMPLES" ]; then SAMPLES="--samples $SAMPLES"; fi

providedSnpRefAccessions="~{sep=" " snpEffRef}"
if [ -n "$providedSnpRefAccessions" ]; then 
  snpRefAccessions="$providedSnpRefAccessions";
else
  snpRefAccessions="$(python -c "from Bio import SeqIO; print(' '.join(list(s.id for s in SeqIO.parse('~{reference_fasta}', 'fasta'))))")"
fi

echo "snpRefAccessions: $snpRefAccessions"

intrahost.py merge_to_vcf \
    ~{reference_fasta} \
    isnvs.vcf.gz \
    $SAMPLES \
    --isnvs ~{sep=" " vphaser2Calls} \
    --alignments ~{sep=" " perSegmentMultiAlignments} \
    --strip_chr_version \
    ~{true="--naive_filter" false="" naiveFilter} \
    --parse_accession
    
interhost.py snpEff \
    isnvs.vcf.gz \
    $snpRefAccessions \
    isnvs.annot.vcf.gz \
    ~{'--emailAddress=' + emailAddress}

intrahost.py iSNV_table \
    isnvs.annot.vcf.gz \
    isnvs.annot.txt.gz

Outputs

Name Type Expression
isnvs_vcf File "isnvs.vcf.gz"
isnvs_vcf_idx File "isnvs.vcf.gz.tbi"
isnvs_annot_vcf File "isnvs.annot.vcf.gz"
isnvs_annot_vcf_idx File "isnvs.annot.vcf.gz.tbi"
isnvs_annot_txt File "isnvs.annot.txt.gz"
viralngs_version String read_string("VERSION")

Runtime

Key Value
docker "~{docker}"
memory select_first([machine_mem_gb, 4]) + " GB"
dx_instance_type "mem1_ssd1_v2_x4"
maxRetries 2

TASKS annotate_vcf_snpeff

Inputs

Name Type Description Default
in_vcf File input VCF to annotate with snpEff -
ref_fasta File The sequence containing the accession to use for annotation; only used if snpEffRef is not provided. -
snpEffRef Array[String]? list of accessions to build/find snpEff database. If this is not provided, the ID from the reference fasta will be used (it must be a GenBank accession) -
emailAddress String? email address passed to NCBI if we need to download reference sequences -
machine_mem_gb Int? - -
2 optional inputs with default values

Command

set -ex -o pipefail

intrahost.py --version | tee VERSION

providedSnpRefAccessions="~{sep=" " snpEffRef}"
if [ -n "$providedSnpRefAccessions" ]; then 
  snpRefAccessions="$providedSnpRefAccessions";
else
  snpRefAccessions="$(python -c "from Bio import SeqIO; print(' '.join(list(s.id for s in SeqIO.parse('~{ref_fasta}', 'fasta'))))")"
fi
echo "snpRefAccessions: $snpRefAccessions"

vcf_to_use=""
if (file "~{in_vcf}" | grep -q "gzip" ) ; then
  echo "~{in_vcf} is already compressed"
  vcf_to_use="~{in_vcf}"
else
  echo "~{in_vcf} is not compressed; gzipping..."
  bgzip "~{in_vcf}"
  vcf_to_use="~{in_vcf}.gz"
fi

# renames the seq id using the first sequence in the alignment
ref_name=$(head -n1 "~{ref_fasta}" | sed -E 's/^>([^[:space:]]+).*$/\1/g')
ref_name_no_version=$(head -n1 "~{ref_fasta}" | sed -E 's/^>([^[:space:]\.]+).*$/\1/g')
# copy the input or just created gzipped vcf file
cp "$vcf_to_use" "temp.vcf.gz"
# ensure uncompressed
bgzip -d "temp.vcf.gz"
# rename chr field (first col) in vcf
cat "temp.vcf" | sed "s/^1/$ref_name_no_version/" > "temp2.vcf"
    
# output the vcf, removing the reference sequence if present as a sample name
bgzip "temp2.vcf"
tabix -p vcf "temp2.vcf.gz"
bcftools index "temp2.vcf.gz"
bcftools view -s "^$ref_name" "temp2.vcf.gz" > "temp3.vcf"
rm "temp2.vcf.gz"
vcf_to_use="temp3.vcf"

# compress vcf
bgzip -c "$vcf_to_use" > "$vcf_to_use.gz"
vcf_to_use="$vcf_to_use.gz"

# index vcf
echo "Creating vcf index"
bcftools index "$vcf_to_use"
tabix -p vcf "$vcf_to_use"
    
interhost.py snpEff \
    "$vcf_to_use" \
    $snpRefAccessions \
    "~{output_basename}.annot.vcf.gz" \
    ~{'--emailAddress=' + emailAddress}

Outputs

Name Type Expression
annot_vcf_gz File "~{output_basename}.annot.vcf.gz"
annot_vcf_gz_tbi File "~{output_basename}.annot.vcf.gz.tbi"
viralngs_version String read_string("VERSION")

Runtime

Key Value
docker docker
memory select_first([machine_mem_gb, 4]) + " GB"
disks "local-disk " + disk_size + " LOCAL"
disk disk_size + " GB"
dx_instance_type "mem1_ssd1_v2_x4"
maxRetries 2
← Back to Index

tasks_intrahost - WDL Source Code

version 1.0

# Detect potential cross-contamination between samples: minor alleles (iSNVs)
# in one sample that match consensus-level differences of another sample,
# optionally restricted to plate-map neighbors.
task polyphonia_detect_cross_contamination {
  input {
    Array[File]  lofreq_vcfs
    Array[File]  genome_fastas
    File         reference_fasta

    Int          min_readcount       = 0
    Float        min_maf             = 0
    Float        min_genome_coverage = 0.95
    Int          min_read_depth      = 100
    Array[File]? read_depths
    String?      masked_positions
    File?        masked_positions_file
    Int          max_mismatches      = 0
    Int          min_matches         = 3
    Float        min_matches_proportion = 1

    Array[File]? plate_maps
    Int?         plate_size                 = 96
    Int?         plate_columns
    Int?         plate_rows
    Boolean?     compare_direct_neighbors   = true
    Boolean?     compare_diagonal_neighbors = false
    Boolean?     compare_full_row           = false
    Boolean?     compare_full_column        = false
    Boolean?     compare_full_plate         = false

    Boolean?     print_all_isnvs     = true
    Boolean?     print_all           = false

    String       out_basename = "potential_cross_contamination"

    String       docker = "quay.io/broadinstitute/polyphonia:latest"
  }

  Int disk_size = 100

  parameter_meta {
    lofreq_vcfs:         { description: "VCF file(s) output by LoFreq or GATK; must use reference provided by reference_fasta" }
    genome_fastas:       { description: "Unaligned consensus genome or genomes" }
    reference_fasta:     { description: "Reference fasta file" }

    min_readcount:       { description: "Minimum minor allele readcount for position to be considered heterozygous" }
    min_maf:             { description: "Minimum minor allele frequency for position to be considered heterozygous" }
    min_read_depth:      { description: "Minimum read depth for a position to be used for comparison" }
    read_depths:         { description: "Read depth tables (tab-separated, no header: reference name, position, read depth); provide alongside vcf files or heterozygosity tables if min-depth>0" }
    min_genome_coverage: { description: "Minimum proportion genome covered for a sample to be included" }
    masked_positions:    { description: "1-indexed positions to mask (e.g., 1-10,50,55-70)" }
    masked_positions_file: { description: "1-indexed positions to mask, one per line" }
    min_matches:         { description: "Of positions at which the two consensus genomes differ, the minimum number of positions at which contamination is detected as a minor allele" }
    min_matches_proportion: { description: "Of positions at which the two consensus genomes differ, the minimum proportion of positions at which contamination is detected as a minor allele" }
    max_mismatches:      { description: "Maximum allowed bases in contaminating sample consensus not matching contaminated sample alleles" }

    plate_maps:          { description: "Optional plate map(s) (tab-separated, no header: sample name, plate position (e.g., A8))" }
    plate_size:          { description: "Standard plate size (6-well, 12-well, 24, 48, 96, 384, 1536, 3456)" }
    plate_columns:       { description: "Number of columns in plate (e.g., 1, 2, 3, 4)" }
    plate_rows:          { description: "Number of rows in plate (e.g., A, B, C, D)" }
    compare_direct_neighbors:  { description: "Compare direct plate neighbors (left, right, top, bottom)" }
    compare_diagonal_neighbors:{ description: "Compare diagonal plate neighbors (top-right, bottom-right, top-left, bottom-left)" }
    compare_full_row:    { description: "Compare samples in the same row (e.g., row A)" }
    compare_full_column: { description: "Compare samples in the same column (e.g., column 8)" }
    compare_full_plate:  { description: "Compare all samples in the same plate map" }

    print_all_isnvs:     { description: "Include all threshold-passing samples in iSNVs visualizations, including samples without plate neighbors" }
    print_all:           { description: "Output outcomes of all comparisons (all comparisons are marked as potential cross-contamination)" }
  }

  command <<<
    set -e -o pipefail

    # commented out until polyphonia can report its own version
    #polyphonia --version | tee POLYPHONIA_VERSION

    mkdir -p figs

    # Prep plate maps input, if specified.
    # NB: inside a heredoc-style command section, WDL 1.0 only interpolates
    # tilde-brace placeholders; the previous dollar-brace form on this line was
    # handed to bash verbatim and never expanded the plate_maps array.
    # select_first guards against plate_maps being undefined.
    PLATE_MAPS_INPUT=""
    if [ -n "~{sep=' ' select_first([plate_maps, []])}" ]; then
      PLATE_MAPS_INPUT="--plate-map ~{sep=' ' select_first([plate_maps, []])}"
    fi

    polyphonia cross_contamination \
      --ref ~{reference_fasta} \
      --vcf ~{sep=' ' lofreq_vcfs} \
      --consensus ~{sep=' ' genome_fastas} \
      ~{if defined(read_depths) then '--read-depths' else ''} ~{sep=' ' select_first([read_depths, []])} \
      ~{'--min-covered ' + min_genome_coverage} \
      ~{'--min-depth ' + min_read_depth} \
      ~{'--min-readcount ' + min_readcount} \
      ~{'--max-mismatches ' + max_mismatches} \
      ~{'--min-matches ' + min_matches} \
      ~{'--min-matches-proportion ' + min_matches_proportion} \
      ~{'--min-maf ' + min_maf} \
      ~{'--masked-positions ' + masked_positions} \
      ~{'--masked-positions-file ' + masked_positions_file} \
      $PLATE_MAPS_INPUT \
      ~{'--plate-size ' + plate_size} \
      ~{'--plate-columns ' + plate_columns} \
      ~{'--plate-rows ' + plate_rows} \
      --compare-direct ~{true="TRUE" false="FALSE" compare_direct_neighbors} \
      --compare-diagonal ~{true="TRUE" false="FALSE" compare_diagonal_neighbors} \
      --compare-row ~{true="TRUE" false="FALSE" compare_full_row} \
      --compare-column ~{true="TRUE" false="FALSE" compare_full_column} \
      --compare-plate ~{true="TRUE" false="FALSE" compare_full_plate} \
      --output ~{out_basename}.txt \
      --out-figures figs \
      --cores `nproc` \
      --verbose TRUE \
      --overwrite TRUE \
      --print-all-iSNVs ~{true="TRUE" false="FALSE" print_all_isnvs} \
      --print-all ~{true="TRUE" false="FALSE" print_all}
  >>>

  output {
    File        report             = "~{out_basename}.txt"
    Array[File] figures            = glob("figs/*")
    # commented out until polyphonia can report its own version
    #String      polyphonia_version = read_string("POLYPHONIA_VERSION")
  }
  runtime {
    docker: docker
    cpu:    4
    memory: "13 GB"
    disks:  "local-disk " + disk_size + " HDD"
    disk: disk_size + " GB" # TES
    dx_instance_type: "mem1_ssd1_v2_x4"
    maxRetries: 2
  }
}

# Call low-frequency variants (iSNVs) from an aligned BAM with LoFreq,
# against a sanitized local copy of the reference fasta.
task lofreq {
  input {
    File      aligned_bam      # reads aligned against reference_fasta
    File      reference_fasta  # reference used for the alignment

    # default output basename: input bam filename without its .bam extension
    String    out_basename = basename(aligned_bam, '.bam')
    String    docker = "quay.io/broadinstitute/viral-phylo:2.5.1.0"
  }
  Int disk_size = 200
  command <<<
    set -e -o pipefail

    # record the lofreq version for the lofreq_version output
    lofreq version | grep version | sed 's/.* \(.*\)/\1/g' | tee LOFREQ_VERSION

    # make local copies because CWD is writeable but localization dir isn't always
    cp "~{aligned_bam}" aligned.bam
    # copy the reference with sanitized sequence IDs
    # (util.file is presumably provided by the viral-* docker image -- confirm)
    python3<<CODE
    import shutil
    import util.file
    with util.file.fastas_with_sanitized_ids("~{reference_fasta}", use_tmp=True) as sanitized_fastas:
        shutil.copyfile(sanitized_fastas[0], 'reference.fasta')
    CODE

    # samtools faidx fails if fasta is empty
    # (empty / all-N reference: emit an empty vcf and exit successfully)
    if [ $(grep -v '^>' reference.fasta | tr -d '\nNn' | wc -c) == "0" ]; then
      touch "~{out_basename}.vcf"
      exit 0
    fi

    # index for lofreq
    samtools faidx reference.fasta
    samtools index aligned.bam

    # lofreq
    lofreq call \
      -f reference.fasta \
      -o "~{out_basename}.vcf" \
      aligned.bam
  >>>

  output {
    # empty file if the reference contained no non-N bases
    File   report_vcf     = "~{out_basename}.vcf"
    String lofreq_version = read_string("LOFREQ_VERSION")
  }
  runtime {
    docker: docker
    cpu:    2
    memory: "3 GB"
    disks:  "local-disk " + disk_size + " HDD"
    disk: disk_size + " GB" # TES
    dx_instance_type: "mem1_ssd1_v2_x2"
    maxRetries: 2
  }
}

# Call intrahost single-nucleotide variants (iSNVs) for one sample by running
# V-Phaser 2 (via intrahost.py vphaser_one_sample) on its mapped reads.
task isnvs_per_sample {
  input {
    File    mapped_bam      # reads mapped to assembly_fasta
    File    assembly_fasta  # per-sample assembly the reads were mapped against

    Int?    threads            # forwarded as --vphaserNumThreads
    Int?    minReadsPerStrand  # forwarded as --minReadsEach
    Int?    maxBias            # forwarded as --maxBias
    Boolean removeDoublyMappedReads = true

    Int?    machine_mem_gb
    String  docker = "quay.io/broadinstitute/viral-phylo:2.5.1.0"

    # derived from the bam filename, stripping ".mapped"/".all"/".bam"
    # suffixes (e.g. "s1.mapped.bam" -> "s1")
    String  sample_name = basename(basename(basename(mapped_bam, ".bam"), ".all"), ".mapped")
  }


  command {
    set -e -o pipefail

    intrahost.py --version | tee VERSION
    intrahost.py vphaser_one_sample \
        ${mapped_bam} \
        ${assembly_fasta} \
        vphaser2.${sample_name}.txt.gz \
        ${'--vphaserNumThreads=' + threads} \
        ${true="--removeDoublyMappedReads" false="" removeDoublyMappedReads} \
        ${'--minReadsEach=' + minReadsPerStrand} \
        ${'--maxBias=' + maxBias}
  }

  output {
    File   isnvsFile        = "vphaser2.${sample_name}.txt.gz"
    String viralngs_version = read_string("VERSION")
  }
  runtime {
    docker: "${docker}"
    memory: select_first([machine_mem_gb, 7]) + " GB"
    dx_instance_type: "mem1_ssd1_v2_x8"
    maxRetries: 2
  }
}


# Merge per-sample V-Phaser 2 iSNV calls into one multi-sample VCF, annotate
# variant effects with snpEff, and flatten the result into a text table.
task isnvs_vcf {
  input {
    Array[File]    vphaser2Calls              # one vphaser output per sample
    Array[File]    perSegmentMultiAlignments  # one multi-alignment per genome segment
    File           reference_fasta

    Array[String]? snpEffRef
    Array[String]? sampleNames
    String?        emailAddress
    Boolean        naiveFilter = false

    Int?           machine_mem_gb
    String         docker = "quay.io/broadinstitute/viral-phylo:2.5.1.0"
  }

  parameter_meta {
    vphaser2Calls:             { description: "vphaser output; ex. vphaser2.<sample>.txt.gz" }
    perSegmentMultiAlignments: { description: "aligned_##.fasta, where ## is segment number" }
    snpEffRef:                 { description: "list of accessions to build/find snpEff database" }
    sampleNames:               { description: "list of sample names" }
    emailAddress:              { description: "email address passed to NCBI if we need to download reference sequences" }
  }

  command {
    set -ex -o pipefail

    intrahost.py --version | tee VERSION

    # only pass --samples when sample names were provided
    SAMPLES="${sep=' ' sampleNames}"
    if [ -n "$SAMPLES" ]; then SAMPLES="--samples $SAMPLES"; fi

    # snpEff database accessions: use the caller-provided list when given,
    # otherwise fall back to the sequence IDs found in the reference fasta
    # (NOTE(review): presumably those must be GenBank accessions -- confirm)
    providedSnpRefAccessions="${sep=' ' snpEffRef}"
    if [ -n "$providedSnpRefAccessions" ]; then 
      snpRefAccessions="$providedSnpRefAccessions";
    else
      snpRefAccessions="$(python -c "from Bio import SeqIO; print(' '.join(list(s.id for s in SeqIO.parse('${reference_fasta}', 'fasta'))))")"
    fi

    echo "snpRefAccessions: $snpRefAccessions"

    # merge the per-sample calls into a single multi-sample isnvs.vcf.gz
    # (NOTE(review): the .tbi indexes listed in outputs are presumably
    # produced by these intrahost.py/interhost.py invocations -- confirm)
    intrahost.py merge_to_vcf \
        ${reference_fasta} \
        isnvs.vcf.gz \
        $SAMPLES \
        --isnvs ${sep=' ' vphaser2Calls} \
        --alignments ${sep=' ' perSegmentMultiAlignments} \
        --strip_chr_version \
        ${true="--naive_filter" false="" naiveFilter} \
        --parse_accession
        
    # annotate variant effects with snpEff
    interhost.py snpEff \
        isnvs.vcf.gz \
        $snpRefAccessions \
        isnvs.annot.vcf.gz \
        ${'--emailAddress=' + emailAddress}

    # flatten the annotated vcf into a per-iSNV text table
    intrahost.py iSNV_table \
        isnvs.annot.vcf.gz \
        isnvs.annot.txt.gz
  }

  output {
    File   isnvs_vcf           = "isnvs.vcf.gz"
    File   isnvs_vcf_idx       = "isnvs.vcf.gz.tbi"
    File   isnvs_annot_vcf     = "isnvs.annot.vcf.gz"
    File   isnvs_annot_vcf_idx = "isnvs.annot.vcf.gz.tbi"
    File   isnvs_annot_txt     = "isnvs.annot.txt.gz"
    String viralngs_version    = read_string("VERSION")
  }
  runtime {
    docker: "${docker}"
    memory: select_first([machine_mem_gb, 4]) + " GB"
    dx_instance_type: "mem1_ssd1_v2_x4"
    maxRetries: 2
  }
}

# Annotate a VCF with snpEff. Database accessions come from snpEffRef when
# provided, otherwise from the sequence IDs of ref_fasta (which must then be
# GenBank accessions).
task annotate_vcf_snpeff {
  input {
    File           in_vcf
    File           ref_fasta

    Array[String]? snpEffRef
    String?        emailAddress

    Int?           machine_mem_gb
    String         docker = "quay.io/broadinstitute/viral-phylo:2.5.1.0"

    # default output basename: input filename minus its .vcf/.vcf.gz extension
    String         output_basename = basename(basename(in_vcf, ".gz"), ".vcf")
  }

  Int disk_size = 375


  parameter_meta {
    in_vcf:             { description: "input VCF to annotate with snpEff", patterns: ["*.vcf","*.vcf.gz"]}
    ref_fasta:          { description: "The sequence containing the accession to use for annotation; only used if snpEffRef is not provided.", patterns: ["*.fasta","*.fa"] }
    snpEffRef:          { description: "list of accessions to build/find snpEff database. If this is not provided, the ID from the reference fasta will be used (it must be a GenBank accession)" }
    emailAddress:       { description: "email address passed to NCBI if we need to download reference sequences" }
  }

  command {
    set -ex -o pipefail

    intrahost.py --version | tee VERSION

    # snpEff accessions: caller-provided list, else sequence IDs from ref_fasta
    providedSnpRefAccessions="~{sep=' ' snpEffRef}"
    if [ -n "$providedSnpRefAccessions" ]; then 
      snpRefAccessions="$providedSnpRefAccessions";
    else
      snpRefAccessions="$(python -c "from Bio import SeqIO; print(' '.join(list(s.id for s in SeqIO.parse('~{ref_fasta}', 'fasta'))))")"
    fi
    echo "snpRefAccessions: $snpRefAccessions"

    # Make a bgzipped working copy of the input vcf in CWD. Never compress the
    # input in place: the localization directory is not always writeable (the
    # lofreq task in this file makes local copies for the same reason).
    if (file "~{in_vcf}" | grep -q "gzip") ; then
      echo "~{in_vcf} is already compressed"
      cp "~{in_vcf}" "temp.vcf.gz"
    else
      echo "~{in_vcf} is not compressed; gzipping..."
      bgzip -c "~{in_vcf}" > "temp.vcf.gz"
    fi
    # ensure uncompressed
    bgzip -d "temp.vcf.gz"

    # reference seq id taken from the first sequence in ref_fasta, with and
    # without its trailing version suffix (the second sed stops at the dot)
    ref_name=$(head -n1 "~{ref_fasta}" | sed -E 's/^>([^[:space:]]+).*$/\1/g')
    ref_name_no_version=$(head -n1 "~{ref_fasta}" | sed -E 's/^>([^[:space:]\.]+).*$/\1/g')
    # rename chr field (first col) in vcf to match the reference id
    cat "temp.vcf" | sed "s/^1/$ref_name_no_version/" > "temp2.vcf"

    # output the vcf, removing the reference sequence if present as a sample name
    # NOTE(review): bcftools view -s errors out (and, under set -e, fails the
    # task) when $ref_name is not among the sample names -- confirm intended
    bgzip "temp2.vcf"
    tabix -p vcf "temp2.vcf.gz"
    bcftools index "temp2.vcf.gz"
    bcftools view -s "^$ref_name" "temp2.vcf.gz" > "temp3.vcf"
    rm "temp2.vcf.gz"
    vcf_to_use="temp3.vcf"

    # compress vcf
    bgzip -c "$vcf_to_use" > "$vcf_to_use.gz"
    vcf_to_use="$vcf_to_use.gz"

    # index vcf
    echo "Creating vcf index"
    bcftools index "$vcf_to_use"
    tabix -p vcf "$vcf_to_use"

    # run snpEff; writes the .annot.vcf.gz (and its .tbi) emitted as outputs
    interhost.py snpEff \
        "$vcf_to_use" \
        $snpRefAccessions \
        "~{output_basename}.annot.vcf.gz" \
        ~{'--emailAddress=' + emailAddress}
  }

  output {
    File   annot_vcf_gz     = "~{output_basename}.annot.vcf.gz"
    File   annot_vcf_gz_tbi = "~{output_basename}.annot.vcf.gz.tbi"
    String viralngs_version = read_string("VERSION")
  }
  runtime {
    docker: docker
    memory: select_first([machine_mem_gb, 4]) + " GB"
    disks:  "local-disk " + disk_size + " LOCAL"
    disk: disk_size + " GB" # TES
    dx_instance_type: "mem1_ssd1_v2_x4"
    maxRetries: 2
  }
}