tasks_intrahost
pipes/WDL/tasks/tasks_intrahost.wdl

TASKS tasks_intrahost

File Path pipes/WDL/tasks/tasks_intrahost.wdl
WDL Version 1.0
Type tasks

📋Tasks in this document

Tasks

TASKS polyphonia_detect_cross_contamination

Inputs

Name Type Description Default
lofreq_vcfs Array[File] VCF file(s) output by LoFreq or GATK; must use reference provided by reference_fasta -
genome_fastas Array[File] Unaligned consensus genome or genomes -
reference_fasta File Reference fasta file -
read_depths Array[File]? Read depth tables (tab-separated, no header: reference name, position, read depth); provide alongside vcf files or heterozygosity tables if min-depth>0 -
masked_positions String? 1-indexed positions to mask (e.g., 1-10,50,55-70) -
masked_positions_file File? 1-indexed positions to mask, one per line -
plate_maps Array[File]? Optional plate map(s) (tab-separated, no header: sample name, plate position (e.g., A8)) -
plate_columns Int? Number of columns in plate (e.g., 1, 2, 3, 4) -
plate_rows Int? Number of rows in plate (e.g., A, B, C, D) -
17 optional inputs with default values

Command

set -e -o pipefail

# commented out until polyphonia can report its own version
#polyphonia --version | tee POLYPHONIA_VERSION

mkdir -p figs
    
# prep plate maps input, if specified
PLATE_MAPS_INPUT=""
if [ -n "~{sep=' ' plate_maps}" ]; then
  PLATE_MAPS_INPUT="--plate-map ~{sep=' ' plate_maps}"
fi

polyphonia cross_contamination \
  --ref ~{reference_fasta} \
  --vcf ~{sep=" " lofreq_vcfs} \
  --consensus ~{sep=" " genome_fastas} \
  --read-depths ~{sep=" " read_depths} \
  ~{'--min-covered ' + min_genome_coverage} \
  ~{'--min-depth ' + min_read_depth} \
  ~{'--min-readcount ' + min_readcount} \
  ~{'--max-mismatches ' + max_mismatches} \
  ~{'--min-matches ' + min_matches} \
  ~{'--min-matches-proportion ' + min_matches_proportion} \
  ~{'--min-maf ' + min_maf} \
  ~{'--masked-positions ' + masked_positions} \
  ~{'--masked-positions-file ' + masked_positions_file} \
  $PLATE_MAPS_INPUT \
  ~{'--plate-size ' + plate_size} \
  ~{'--plate-columns ' + plate_columns} \
  ~{'--plate-rows ' + plate_rows} \
  --compare-direct ~{true="TRUE" false="FALSE" compare_direct_neighbors} \
  --compare-diagonal ~{true="TRUE" false="FALSE" compare_diagonal_neighbors} \
  --compare-row ~{true="TRUE" false="FALSE" compare_full_row} \
  --compare-column ~{true="TRUE" false="FALSE" compare_full_column} \
  --compare-plate ~{true="TRUE" false="FALSE" compare_full_plate} \
  --output ~{out_basename}.txt \
  --out-figures figs \
  --cores `nproc` \
  --verbose TRUE \
  --overwrite TRUE \
  --print-all-iSNVs ~{true="TRUE" false="FALSE" print_all_isnvs} \
  --print-all ~{true="TRUE" false="FALSE" print_all}

Outputs

Name Type Expression
report File "~{out_basename}.txt"
figures Array[File] glob("figs/*")

Runtime

Key Value
docker docker
cpu 4
memory "13 GB"
disks "local-disk " + disk_size + " HDD"
disk disk_size + " GB"
dx_instance_type "mem1_ssd1_v2_x4"
maxRetries 2

TASKS lofreq

Inputs

Name Type Description Default
aligned_bam File - -
reference_fasta File - -
2 optional inputs with default values

Command

set -e -o pipefail

lofreq version | grep version | sed 's/.* \(.*\)/\1/g' | tee LOFREQ_VERSION

# make local copies because CWD is writeable but localization dir isn't always
cp "~{aligned_bam}" aligned.bam
python3<<CODE
import shutil
import util.file
with util.file.fastas_with_sanitized_ids("~{reference_fasta}", use_tmp=True) as sanitized_fastas:
    shutil.copyfile(sanitized_fastas[0], 'reference.fasta')
CODE

# samtools faidx fails if fasta is empty
if [ $(grep -v '^>' reference.fasta | tr -d '\nNn' | wc -c) == "0" ]; then
  touch "~{out_basename}.vcf"
  exit 0
fi

# index for lofreq
samtools faidx reference.fasta
samtools index aligned.bam

# lofreq
lofreq call \
  -f reference.fasta \
  -o "~{out_basename}.vcf" \
  aligned.bam

Outputs

Name Type Expression
report_vcf File "~{out_basename}.vcf"
lofreq_version String read_string("LOFREQ_VERSION")

Runtime

Key Value
docker docker
cpu 2
memory "3 GB"
disks "local-disk " + disk_size + " HDD"
disk disk_size + " GB"
dx_instance_type "mem1_ssd1_v2_x2"
maxRetries 2

TASKS isnvs_per_sample

Inputs

Name Type Description Default
mapped_bam File - -
assembly_fasta File - -
threads Int? - -
minReadsPerStrand Int? - -
maxBias Int? - -
machine_mem_gb Int? - -
3 optional inputs with default values

Command

intrahost.py --version | tee VERSION
intrahost.py vphaser_one_sample \
    ~{mapped_bam} \
    ~{assembly_fasta} \
    vphaser2.~{sample_name}.txt.gz \
    ~{'--vphaserNumThreads=' + threads} \
    ~{true="--removeDoublyMappedReads" false="" removeDoublyMappedReads} \
    ~{'--minReadsEach=' + minReadsPerStrand} \
    ~{'--maxBias=' + maxBias}

Outputs

Name Type Expression
isnvsFile File "vphaser2.~{sample_name}.txt.gz"
viralngs_version String read_string("VERSION")

Runtime

Key Value
docker "~{docker}"
memory select_first([machine_mem_gb, 7]) + " GB"
dx_instance_type "mem1_ssd1_v2_x8"
maxRetries 2

TASKS isnvs_vcf

Inputs

Name Type Description Default
vphaser2Calls Array[File] vphaser output; ex. vphaser2.<sample>.txt.gz -
perSegmentMultiAlignments Array[File] aligned_##.fasta, where ## is segment number -
reference_fasta File - -
snpEffRef Array[String]? list of accessions to build/find snpEff database -
sampleNames Array[String]? list of sample names -
emailAddress String? email address passed to NCBI if we need to download reference sequences -
machine_mem_gb Int? - -
2 optional inputs with default values

Command

set -ex -o pipefail

intrahost.py --version | tee VERSION

SAMPLES="~{sep=" " sampleNames}"
if [ -n "$SAMPLES" ]; then SAMPLES="--samples $SAMPLES"; fi

providedSnpRefAccessions="~{sep=" " snpEffRef}"
if [ -n "$providedSnpRefAccessions" ]; then 
  snpRefAccessions="$providedSnpRefAccessions";
else
  snpRefAccessions="$(python -c "from Bio import SeqIO; print(' '.join(list(s.id for s in SeqIO.parse('~{reference_fasta}', 'fasta'))))")"
fi

echo "snpRefAccessions: $snpRefAccessions"

intrahost.py merge_to_vcf \
    ~{reference_fasta} \
    isnvs.vcf.gz \
    $SAMPLES \
    --isnvs ~{sep=" " vphaser2Calls} \
    --alignments ~{sep=" " perSegmentMultiAlignments} \
    --strip_chr_version \
    ~{true="--naive_filter" false="" naiveFilter} \
    --parse_accession
    
interhost.py snpEff \
    isnvs.vcf.gz \
    $snpRefAccessions \
    isnvs.annot.vcf.gz \
    ~{'--emailAddress=' + emailAddress}

intrahost.py iSNV_table \
    isnvs.annot.vcf.gz \
    isnvs.annot.txt.gz

Outputs

Name Type Expression
isnvs_vcf File "isnvs.vcf.gz"
isnvs_vcf_idx File "isnvs.vcf.gz.tbi"
isnvs_annot_vcf File "isnvs.annot.vcf.gz"
isnvs_annot_vcf_idx File "isnvs.annot.vcf.gz.tbi"
isnvs_annot_txt File "isnvs.annot.txt.gz"
viralngs_version String read_string("VERSION")

Runtime

Key Value
docker "~{docker}"
memory select_first([machine_mem_gb, 4]) + " GB"
dx_instance_type "mem1_ssd1_v2_x4"
maxRetries 2

TASKS annotate_vcf_snpeff

Inputs

Name Type Description Default
in_vcf File input VCF to annotate with snpEff -
ref_fasta File The sequence containing the accession to use for annotation; only used if snpEffRef is not provided. -
snpEffRef Array[String]? list of accessions to build/find snpEff database. If this is not provided, the ID from the reference fasta will be used (it must be a GenBank accession) -
emailAddress String? email address passed to NCBI if we need to download reference sequences -
machine_mem_gb Int? - -
2 optional inputs with default values

Command

set -ex -o pipefail

intrahost.py --version | tee VERSION

providedSnpRefAccessions="~{sep=" " snpEffRef}"
if [ -n "$providedSnpRefAccessions" ]; then 
  snpRefAccessions="$providedSnpRefAccessions";
else
  snpRefAccessions="$(python -c "from Bio import SeqIO; print(' '.join(list(s.id for s in SeqIO.parse('~{ref_fasta}', 'fasta'))))")"
fi
echo "snpRefAccessions: $snpRefAccessions"

vcf_to_use=""
if (file "~{in_vcf}" | grep -q "gzip" ) ; then
  echo "~{in_vcf} is already compressed"
  vcf_to_use="~{in_vcf}"
else
  echo "~{in_vcf} is not compressed; gzipping..."
  bgzip "~{in_vcf}"
  vcf_to_use="~{in_vcf}.gz"
fi

# renames the seq id using the first sequence in the alignment
ref_name=$(head -n1 "~{ref_fasta}" | sed -E 's/^>([^[:space:]]+).*$/\1/g')
ref_name_no_version=$(head -n1 "~{ref_fasta}" | sed -E 's/^>([^[:space:]\.]+).*$/\1/g')
# copy the input or just created gzipped vcf file
cp "$vcf_to_use" "temp.vcf.gz"
# ensure uncompressed
bgzip -d "temp.vcf.gz"
# rename chr field (first col) in vcf
cat "temp.vcf" | sed "s/^1/$ref_name_no_version/" > "temp2.vcf"
    
# output the vcf, removing the reference sequence if present as a sample name
bgzip "temp2.vcf"
tabix -p vcf "temp2.vcf.gz"
bcftools index "temp2.vcf.gz"
bcftools view -s "^$ref_name" "temp2.vcf.gz" > "temp3.vcf"
rm "temp2.vcf.gz"
vcf_to_use="temp3.vcf"

# compress vcf
bgzip -c "$vcf_to_use" > "$vcf_to_use.gz"
vcf_to_use="$vcf_to_use.gz"

# index vcf
echo "Creating vcf index"
bcftools index "$vcf_to_use"
tabix -p vcf "$vcf_to_use"
    
interhost.py snpEff \
    "$vcf_to_use" \
    $snpRefAccessions \
    "~{output_basename}.annot.vcf.gz" \
    ~{'--emailAddress=' + emailAddress}

Outputs

Name Type Expression
annot_vcf_gz File "~{output_basename}.annot.vcf.gz"
annot_vcf_gz_tbi File "~{output_basename}.annot.vcf.gz.tbi"
viralngs_version String read_string("VERSION")

Runtime

Key Value
docker docker
memory select_first([machine_mem_gb, 4]) + " GB"
disks "local-disk " + disk_size + " LOCAL"
disk disk_size + " GB"
dx_instance_type "mem1_ssd1_v2_x4"
maxRetries 2
← Back to Index

tasks_intrahost - WDL Source Code

version 1.0

# Detect potential cross-contamination between samples: minor alleles (iSNVs)
# in one sample that match consensus-level differences of another sample,
# optionally restricted to plate-map neighbors.
task polyphonia_detect_cross_contamination {
  input {
    Array[File]  lofreq_vcfs
    Array[File]  genome_fastas
    File         reference_fasta

    Int          min_readcount       = 0
    Float        min_maf             = 0
    Float        min_genome_coverage = 0.95
    Int          min_read_depth      = 100
    Array[File]? read_depths
    String?      masked_positions
    File?        masked_positions_file
    Int          max_mismatches      = 0
    Int          min_matches         = 3
    Float        min_matches_proportion = 1

    Array[File]? plate_maps
    Int?         plate_size                 = 96
    Int?         plate_columns
    Int?         plate_rows
    Boolean?     compare_direct_neighbors   = true
    Boolean?     compare_diagonal_neighbors = false
    Boolean?     compare_full_row           = false
    Boolean?     compare_full_column        = false
    Boolean?     compare_full_plate         = false

    Boolean?     print_all_isnvs     = true
    Boolean?     print_all           = false

    String       out_basename = "potential_cross_contamination"

    String       docker = "quay.io/broadinstitute/polyphonia:latest"
  }

  Int disk_size = 100

  parameter_meta {
    lofreq_vcfs:         { description: "VCF file(s) output by LoFreq or GATK; must use reference provided by reference_fasta" }
    genome_fastas:       { description: "Unaligned consensus genome or genomes" }
    reference_fasta:     { description: "Reference fasta file" }

    min_readcount:       { description: "Minimum minor allele readcount for position to be considered heterozygous" }
    min_maf:             { description: "Minimum minor allele frequency for position to be considered heterozygous" }
    min_read_depth:      { description: "Minimum read depth for a position to be used for comparison" }
    read_depths:         { description: "Read depth tables (tab-separated, no header: reference name, position, read depth); provide alongside vcf files or heterozygosity tables if min-depth>0" }
    min_genome_coverage: { description: "Minimum proportion genome covered for a sample to be included" }
    masked_positions:    { description: "1-indexed positions to mask (e.g., 1-10,50,55-70)" }
    masked_positions_file: { description: "1-indexed positions to mask, one per line" }
    min_matches:         { description: "Of positions at which the two consensus genomes differ, the minimum number of positions at which contamination is detected as a minor allele" }
    min_matches_proportion: { description: "Of positions at which the two consensus genomes differ, the minimum proportion of positions at which contamination is detected as a minor allele" }
    max_mismatches:      { description: "Maximum allowed bases in contaminating sample consensus not matching contaminated sample alleles" }

    plate_maps:          { description: "Optional plate map(s) (tab-separated, no header: sample name, plate position (e.g., A8))" }
    plate_size:          { description: "Standard plate size (6-well, 12-well, 24, 48, 96, 384, 1536, 3456)" }
    plate_columns:       { description: "Number of columns in plate (e.g., 1, 2, 3, 4)" }
    plate_rows:          { description: "Number of rows in plate (e.g., A, B, C, D)" }
    compare_direct_neighbors:  { description: "Compare direct plate neighbors (left, right, top, bottom)" }
    compare_diagonal_neighbors:{ description: "Compare diagonal plate neighbors (top-right, bottom-right, top-left, bottom-left)" }
    compare_full_row:    { description: "Compare samples in the same row (e.g., row A)" }
    compare_full_column: { description: "Compare samples in the same column (e.g., column 8)" }
    compare_full_plate:  { description: "Compare all samples in the same plate map" }

    print_all_isnvs:     { description: "Include all threshold-passing samples in iSNVs visualizations, including samples without plate neighbors" }
    print_all:           { description: "Output outcomes of all comparisons (all comparisons are marked as potential cross-contamination)" }
  }

  command <<<
    set -e -o pipefail

    # commented out until polyphonia can report its own version
    #polyphonia --version | tee POLYPHONIA_VERSION

    mkdir -p figs

    # Prep plate maps input, if specified.
    # NB: inside a heredoc-style command section, WDL 1.0 only interpolates
    # tilde-brace placeholders; the previous dollar-brace form on this line was
    # handed to bash verbatim and never expanded the plate_maps array.
    # select_first guards against plate_maps being undefined.
    PLATE_MAPS_INPUT=""
    if [ -n "~{sep=' ' select_first([plate_maps, []])}" ]; then
      PLATE_MAPS_INPUT="--plate-map ~{sep=' ' select_first([plate_maps, []])}"
    fi

    polyphonia cross_contamination \
      --ref ~{reference_fasta} \
      --vcf ~{sep=' ' lofreq_vcfs} \
      --consensus ~{sep=' ' genome_fastas} \
      ~{if defined(read_depths) then '--read-depths' else ''} ~{sep=' ' select_first([read_depths, []])} \
      ~{'--min-covered ' + min_genome_coverage} \
      ~{'--min-depth ' + min_read_depth} \
      ~{'--min-readcount ' + min_readcount} \
      ~{'--max-mismatches ' + max_mismatches} \
      ~{'--min-matches ' + min_matches} \
      ~{'--min-matches-proportion ' + min_matches_proportion} \
      ~{'--min-maf ' + min_maf} \
      ~{'--masked-positions ' + masked_positions} \
      ~{'--masked-positions-file ' + masked_positions_file} \
      $PLATE_MAPS_INPUT \
      ~{'--plate-size ' + plate_size} \
      ~{'--plate-columns ' + plate_columns} \
      ~{'--plate-rows ' + plate_rows} \
      --compare-direct ~{true="TRUE" false="FALSE" compare_direct_neighbors} \
      --compare-diagonal ~{true="TRUE" false="FALSE" compare_diagonal_neighbors} \
      --compare-row ~{true="TRUE" false="FALSE" compare_full_row} \
      --compare-column ~{true="TRUE" false="FALSE" compare_full_column} \
      --compare-plate ~{true="TRUE" false="FALSE" compare_full_plate} \
      --output ~{out_basename}.txt \
      --out-figures figs \
      --cores `nproc` \
      --verbose TRUE \
      --overwrite TRUE \
      --print-all-iSNVs ~{true="TRUE" false="FALSE" print_all_isnvs} \
      --print-all ~{true="TRUE" false="FALSE" print_all}
  >>>

  output {
    File        report             = "~{out_basename}.txt"
    Array[File] figures            = glob("figs/*")
    # commented out until polyphonia can report its own version
    #String      polyphonia_version = read_string("POLYPHONIA_VERSION")
  }
  runtime {
    docker: docker
    cpu:    4
    memory: "13 GB"
    disks:  "local-disk " + disk_size + " HDD"
    disk: disk_size + " GB" # TES
    dx_instance_type: "mem1_ssd1_v2_x4"
    maxRetries: 2
  }
}

# Call low-frequency variants (iSNVs) from an aligned BAM with LoFreq,
# against a sanitized local copy of the reference fasta.
task lofreq {
  input {
    File      aligned_bam      # reads aligned against reference_fasta
    File      reference_fasta  # reference used for the alignment

    # default output basename: input bam filename without its .bam extension
    String    out_basename = basename(aligned_bam, '.bam')
    String    docker = "quay.io/broadinstitute/viral-phylo:2.5.1.0"
  }
  Int disk_size = 200
  command <<<
    set -e -o pipefail

    # record the lofreq version for the lofreq_version output
    lofreq version | grep version | sed 's/.* \(.*\)/\1/g' | tee LOFREQ_VERSION

    # make local copies because CWD is writeable but localization dir isn't always
    cp "~{aligned_bam}" aligned.bam
    # copy the reference with sanitized sequence IDs
    # (util.file is presumably provided by the viral-* docker image -- confirm)
    python3<<CODE
    import shutil
    import util.file
    with util.file.fastas_with_sanitized_ids("~{reference_fasta}", use_tmp=True) as sanitized_fastas:
        shutil.copyfile(sanitized_fastas[0], 'reference.fasta')
    CODE

    # samtools faidx fails if fasta is empty
    # (empty / all-N reference: emit an empty vcf and exit successfully)
    if [ $(grep -v '^>' reference.fasta | tr -d '\nNn' | wc -c) == "0" ]; then
      touch "~{out_basename}.vcf"
      exit 0
    fi

    # index for lofreq
    samtools faidx reference.fasta
    samtools index aligned.bam

    # lofreq
    lofreq call \
      -f reference.fasta \
      -o "~{out_basename}.vcf" \
      aligned.bam
  >>>

  output {
    # empty file if the reference contained no non-N bases
    File   report_vcf     = "~{out_basename}.vcf"
    String lofreq_version = read_string("LOFREQ_VERSION")
  }
  runtime {
    docker: docker
    cpu:    2
    memory: "3 GB"
    disks:  "local-disk " + disk_size + " HDD"
    disk: disk_size + " GB" # TES
    dx_instance_type: "mem1_ssd1_v2_x2"
    maxRetries: 2
  }
}

# Call intrahost single-nucleotide variants (iSNVs) for one sample by running
# V-Phaser 2 (via intrahost.py vphaser_one_sample) on its mapped reads.
task isnvs_per_sample {
  input {
    File    mapped_bam      # reads mapped to assembly_fasta
    File    assembly_fasta  # per-sample assembly the reads were mapped against

    Int?    threads            # forwarded as --vphaserNumThreads
    Int?    minReadsPerStrand  # forwarded as --minReadsEach
    Int?    maxBias            # forwarded as --maxBias
    Boolean removeDoublyMappedReads = true

    Int?    machine_mem_gb
    String  docker = "quay.io/broadinstitute/viral-phylo:2.5.1.0"

    # derived from the bam filename, stripping ".mapped"/".all"/".bam"
    # suffixes (e.g. "s1.mapped.bam" -> "s1")
    String  sample_name = basename(basename(basename(mapped_bam, ".bam"), ".all"), ".mapped")
  }


  command {
    set -e -o pipefail

    intrahost.py --version | tee VERSION
    intrahost.py vphaser_one_sample \
        ${mapped_bam} \
        ${assembly_fasta} \
        vphaser2.${sample_name}.txt.gz \
        ${'--vphaserNumThreads=' + threads} \
        ${true="--removeDoublyMappedReads" false="" removeDoublyMappedReads} \
        ${'--minReadsEach=' + minReadsPerStrand} \
        ${'--maxBias=' + maxBias}
  }

  output {
    File   isnvsFile        = "vphaser2.${sample_name}.txt.gz"
    String viralngs_version = read_string("VERSION")
  }
  runtime {
    docker: "${docker}"
    memory: select_first([machine_mem_gb, 7]) + " GB"
    dx_instance_type: "mem1_ssd1_v2_x8"
    maxRetries: 2
  }
}


# Merge per-sample V-Phaser 2 iSNV calls into one multi-sample VCF, annotate
# variant effects with snpEff, and flatten the result into a text table.
task isnvs_vcf {
  input {
    Array[File]    vphaser2Calls              # one vphaser output per sample
    Array[File]    perSegmentMultiAlignments  # one multi-alignment per genome segment
    File           reference_fasta

    Array[String]? snpEffRef
    Array[String]? sampleNames
    String?        emailAddress
    Boolean        naiveFilter = false

    Int?           machine_mem_gb
    String         docker = "quay.io/broadinstitute/viral-phylo:2.5.1.0"
  }

  parameter_meta {
    vphaser2Calls:             { description: "vphaser output; ex. vphaser2.<sample>.txt.gz" }
    perSegmentMultiAlignments: { description: "aligned_##.fasta, where ## is segment number" }
    snpEffRef:                 { description: "list of accessions to build/find snpEff database" }
    sampleNames:               { description: "list of sample names" }
    emailAddress:              { description: "email address passed to NCBI if we need to download reference sequences" }
  }

  command {
    set -ex -o pipefail

    intrahost.py --version | tee VERSION

    # only pass --samples when sample names were provided
    SAMPLES="${sep=' ' sampleNames}"
    if [ -n "$SAMPLES" ]; then SAMPLES="--samples $SAMPLES"; fi

    # snpEff database accessions: use the caller-provided list when given,
    # otherwise fall back to the sequence IDs found in the reference fasta
    # (NOTE(review): presumably those must be GenBank accessions -- confirm)
    providedSnpRefAccessions="${sep=' ' snpEffRef}"
    if [ -n "$providedSnpRefAccessions" ]; then 
      snpRefAccessions="$providedSnpRefAccessions";
    else
      snpRefAccessions="$(python -c "from Bio import SeqIO; print(' '.join(list(s.id for s in SeqIO.parse('${reference_fasta}', 'fasta'))))")"
    fi

    echo "snpRefAccessions: $snpRefAccessions"

    # merge the per-sample calls into a single multi-sample isnvs.vcf.gz
    # (NOTE(review): the .tbi indexes listed in outputs are presumably
    # produced by these intrahost.py/interhost.py invocations -- confirm)
    intrahost.py merge_to_vcf \
        ${reference_fasta} \
        isnvs.vcf.gz \
        $SAMPLES \
        --isnvs ${sep=' ' vphaser2Calls} \
        --alignments ${sep=' ' perSegmentMultiAlignments} \
        --strip_chr_version \
        ${true="--naive_filter" false="" naiveFilter} \
        --parse_accession
        
    # annotate variant effects with snpEff
    interhost.py snpEff \
        isnvs.vcf.gz \
        $snpRefAccessions \
        isnvs.annot.vcf.gz \
        ${'--emailAddress=' + emailAddress}

    # flatten the annotated vcf into a per-iSNV text table
    intrahost.py iSNV_table \
        isnvs.annot.vcf.gz \
        isnvs.annot.txt.gz
  }

  output {
    File   isnvs_vcf           = "isnvs.vcf.gz"
    File   isnvs_vcf_idx       = "isnvs.vcf.gz.tbi"
    File   isnvs_annot_vcf     = "isnvs.annot.vcf.gz"
    File   isnvs_annot_vcf_idx = "isnvs.annot.vcf.gz.tbi"
    File   isnvs_annot_txt     = "isnvs.annot.txt.gz"
    String viralngs_version    = read_string("VERSION")
  }
  runtime {
    docker: "${docker}"
    memory: select_first([machine_mem_gb, 4]) + " GB"
    dx_instance_type: "mem1_ssd1_v2_x4"
    maxRetries: 2
  }
}

# Annotate a VCF with snpEff. Database accessions come from snpEffRef when
# provided, otherwise from the sequence IDs of ref_fasta (which must then be
# GenBank accessions).
task annotate_vcf_snpeff {
  input {
    File           in_vcf
    File           ref_fasta

    Array[String]? snpEffRef
    String?        emailAddress

    Int?           machine_mem_gb
    String         docker = "quay.io/broadinstitute/viral-phylo:2.5.1.0"

    # default output basename: input filename minus its .vcf/.vcf.gz extension
    String         output_basename = basename(basename(in_vcf, ".gz"), ".vcf")
  }

  Int disk_size = 375


  parameter_meta {
    in_vcf:             { description: "input VCF to annotate with snpEff", patterns: ["*.vcf","*.vcf.gz"]}
    ref_fasta:          { description: "The sequence containing the accession to use for annotation; only used if snpEffRef is not provided.", patterns: ["*.fasta","*.fa"] }
    snpEffRef:          { description: "list of accessions to build/find snpEff database. If this is not provided, the ID from the reference fasta will be used (it must be a GenBank accession)" }
    emailAddress:       { description: "email address passed to NCBI if we need to download reference sequences" }
  }

  command {
    set -ex -o pipefail

    intrahost.py --version | tee VERSION

    # snpEff accessions: caller-provided list, else sequence IDs from ref_fasta
    providedSnpRefAccessions="~{sep=' ' snpEffRef}"
    if [ -n "$providedSnpRefAccessions" ]; then 
      snpRefAccessions="$providedSnpRefAccessions";
    else
      snpRefAccessions="$(python -c "from Bio import SeqIO; print(' '.join(list(s.id for s in SeqIO.parse('~{ref_fasta}', 'fasta'))))")"
    fi
    echo "snpRefAccessions: $snpRefAccessions"

    # Make a bgzipped working copy of the input vcf in CWD. Never compress the
    # input in place: the localization directory is not always writeable (the
    # lofreq task in this file makes local copies for the same reason).
    if (file "~{in_vcf}" | grep -q "gzip") ; then
      echo "~{in_vcf} is already compressed"
      cp "~{in_vcf}" "temp.vcf.gz"
    else
      echo "~{in_vcf} is not compressed; gzipping..."
      bgzip -c "~{in_vcf}" > "temp.vcf.gz"
    fi
    # ensure uncompressed
    bgzip -d "temp.vcf.gz"

    # reference seq id taken from the first sequence in ref_fasta, with and
    # without its trailing version suffix (the second sed stops at the dot)
    ref_name=$(head -n1 "~{ref_fasta}" | sed -E 's/^>([^[:space:]]+).*$/\1/g')
    ref_name_no_version=$(head -n1 "~{ref_fasta}" | sed -E 's/^>([^[:space:]\.]+).*$/\1/g')
    # rename chr field (first col) in vcf to match the reference id
    cat "temp.vcf" | sed "s/^1/$ref_name_no_version/" > "temp2.vcf"

    # output the vcf, removing the reference sequence if present as a sample name
    # NOTE(review): bcftools view -s errors out (and, under set -e, fails the
    # task) when $ref_name is not among the sample names -- confirm intended
    bgzip "temp2.vcf"
    tabix -p vcf "temp2.vcf.gz"
    bcftools index "temp2.vcf.gz"
    bcftools view -s "^$ref_name" "temp2.vcf.gz" > "temp3.vcf"
    rm "temp2.vcf.gz"
    vcf_to_use="temp3.vcf"

    # compress vcf
    bgzip -c "$vcf_to_use" > "$vcf_to_use.gz"
    vcf_to_use="$vcf_to_use.gz"

    # index vcf
    echo "Creating vcf index"
    bcftools index "$vcf_to_use"
    tabix -p vcf "$vcf_to_use"

    # run snpEff; writes the .annot.vcf.gz (and its .tbi) emitted as outputs
    interhost.py snpEff \
        "$vcf_to_use" \
        $snpRefAccessions \
        "~{output_basename}.annot.vcf.gz" \
        ~{'--emailAddress=' + emailAddress}
  }

  output {
    File   annot_vcf_gz     = "~{output_basename}.annot.vcf.gz"
    File   annot_vcf_gz_tbi = "~{output_basename}.annot.vcf.gz.tbi"
    String viralngs_version = read_string("VERSION")
  }
  runtime {
    docker: docker
    memory: select_first([machine_mem_gb, 4]) + " GB"
    disks:  "local-disk " + disk_size + " LOCAL"
    disk: disk_size + " GB" # TES
    dx_instance_type: "mem1_ssd1_v2_x4"
    maxRetries: 2
  }
}