Mercurial > repos > iuc > varscan_somatic

--- a/macros.xml	Thu Mar 28 18:19:00 2019 -0400
+++ b/macros.xml	Fri Aug 16 15:49:54 2019 -0400
@@ -56,9 +56,9 @@
         <param argument="--min-coverage" name="min_coverage" type="integer" value="8" min="1" max="200"
             label="Minimum coverage" help="@HELP@"/>
     </xml>
-    <xml name="min_reads2">
+    <xml name="min_reads2" token_help="Minimum number (default: 2) of variant-supporting reads at a position required to make a call">
         <param argument="--min-reads2" name="min_reads2" type="integer" value="2" min="1" max="200"
-            label="Minimum supporting reads" help="Minimum number of variant-supporting reads at a position required to make a call"/>
+            label="Minimum supporting reads" help="@HELP@"/>
     </xml>
     <xml name="min_avg_qual">
         <param argument="--min-avg-qual" name="min_avg_qual" type="integer" value="15" min="1" max="50"
@@ -68,12 +68,12 @@
     <xml name="min_var_freq" token_value="0.01">
         <param argument="--min-var-freq" name="min_var_freq" type="float" value="@VALUE@" min="0" max="1"
             label="Minimum variant allele frequency"
-            help="Minimum variant allele frequency required for calling a variant"/>
+            help="Minimum variant allele frequency (default: @VALUE@) required for calling a variant"/>
     </xml>
     <xml name="min_freq_for_hom">
         <param argument="--min-freq-for-hom" name="min_freq_for_hom" type="float" value="0.75" min="0" max="1"
             label="Minimum homozygous variant allele frequency"
-            help="Minimum variant allele frequency required for calling a homozygous genotype" />
+            help="Minimum variant allele frequency (default: 0.75) required for calling a homozygous genotype" />
     </xml>
     <xml name="p_value" token_value="0.01"
     token_label="p-value threshold for calling variants"
--- a/varscan.py	Thu Mar 28 18:19:00 2019 -0400
+++ b/varscan.py	Fri Aug 16 15:49:54 2019 -0400
@@ -8,6 +8,7 @@
 import sys
 import tempfile
 import time
+from collections import namedtuple
 from contextlib import ExitStack
 from functools import partial
 from threading import Thread
@@ -15,6 +16,332 @@
 import pysam


+AlleleStats = namedtuple(
+    "AlleleStats",
+    [
+        'reads_total',
+        'reads_fw',
+        'reads_rv',
+        'avg_mapqual',
+        'avg_basequal',
+        'avg_dist_from_center',
+        'avg_mismatch_fraction',
+        'avg_mismatch_qualsum',
+        'avg_clipped_len',
+        'avg_dist_from_3prime'
+    ]
+)
+
+
+def _get_allele_specific_pileup_column_stats(pileups, ref_fetch,
+                                             ignore_md, ignore_nm,
+                                             mm_runs, detect_q2_runs):
+    var_reads_plus = var_reads_minus = 0
+    sum_mapping_qualities = 0
+    sum_base_qualities = 0
+    sum_dist_from_center = 0
+    sum_dist_from_3prime = 0
+    sum_clipped_length = 0
+    sum_unclipped_length = 0
+    sum_mismatch_fractions = 0
+    sum_mismatch_qualities = 0
+
+    for p in pileups:
+        if p.alignment.is_reverse:
+            var_reads_minus += 1
+        else:
+            var_reads_plus += 1
+        sum_mapping_qualities += p.alignment.mapping_quality
+        sum_base_qualities += p.alignment.query_qualities[p.query_position]
+        sum_clipped_length += p.alignment.query_alignment_length
+        unclipped_length = p.alignment.query_length
+        sum_unclipped_length += unclipped_length
+        # The following calculations are all in 1-based coordinates
+        # with respect to the physical 5'-end of the read sequence.
+        if p.alignment.is_reverse:
+            read_base_pos = unclipped_length - p.query_position
+            read_first_aln_pos = (
+                unclipped_length - p.alignment.query_alignment_end + 1
+            )
+            read_last_aln_pos = (
+                unclipped_length - p.alignment.query_alignment_start
+            )
+            qualities_3to5prime = p.alignment.query_alignment_qualities
+        else:
+            read_base_pos = p.query_position + 1
+            read_first_aln_pos = p.alignment.query_alignment_start + 1
+            read_last_aln_pos = p.alignment.query_alignment_end
+            qualities_3to5prime = reversed(
+                p.alignment.query_alignment_qualities
+            )
+
+        read_last_effective_aln_pos = read_last_aln_pos
+        if detect_q2_runs:
+            # Note: the original bam-readcount algorithm always takes
+            # terminal q2 runs into account when determining the
+            # effective 3'-ends of reads.
+            # However, q2 runs have no special meaning since Illumina
+            # pipeline version 1.8 so detecting them is optional
+            # in this code.
+            for qual in qualities_3to5prime:
+                if qual != 2:
+                    break
+                read_last_effective_aln_pos -= 1
+
+        # Note: the original bam-readcount algorithm defines the
+        # read center as:
+        # read_center = p.alignment.query_alignment_length / 2
+        # This is less accurate than the implementation here.
+        read_center = (read_first_aln_pos + read_last_aln_pos) / 2
+        sum_dist_from_center += 1 - abs(
+            read_base_pos - read_center
+        ) / (read_center - 1)
+        # To calculate the distance of base positions from the 3'-ends
+        # of reads bam-readcount uses the formula:
+        # sum_dist_from_3prime += abs(
+        #     read_last_effective_aln_pos - read_base_pos
+        # ) / (unclipped_length - 1)
+        # , which treats base positions on both sides of the effective
+        # 3'-end equally. Since this seems hard to justify, we cap
+        # the distance calculation at 0 for base positions past the
+        # effective 3'-end (which, in turn, should only be possible
+        # with detection of q2 runs).
+        if read_last_effective_aln_pos > read_base_pos:
+            sum_dist_from_3prime += (
+                read_last_effective_aln_pos - read_base_pos
+            ) / (unclipped_length - 1)
+
+        if sum_mismatch_fractions >= 0 or sum_mismatch_qualities >= 0:
+            # sum_mismatch_fractions and sum_mismatch_qualities will be
+            # set to float('nan') if reference info (in form of an MD tag or
+            # an actual reference sequence) was not available for any previous
+            # read in the pileup.
+            # In that case, there is no point to trying to calculate
+            # mismatch stats for other reads anymore.
+
+            # The following mismatch calculations use this logic:
+            # 1. For determining the edit distance between an aligned read and
+            #    the reference, we follow the authoritative definition of the
+            #    NM tag calculation in
+            #    http://samtools.github.io/hts-specs/SAMtags.pdf
+            #
+            #    For historical reasons, the result of this calculation will
+            #    disagree more often than not with NM tag values calculated by
+            #    other tools.
+            #    If precalculated NM tag values are present on the aligned
+            #    reads, these can be given preference through the use_nm flag.
+            #    Doing so will mimick the behavior of bam-readcount, which
+            #    requires and always just looks at NM tags.
+            # 2. For determining mismatch quality sums, a mismatch is defined
+            #    differently and in accordance with the implementation in
+            #    bam-readcount:
+            #    - only mismatches (not inserted or deleted bases) are
+            #      considered
+            #    - 'N' in the reference is considered a match for any read base
+            #    - any matching (case-insensitive) base in reference and read
+            #      is considered a match, even if that base is not one of
+            #      A, C, G or T.
+            # In both 1. and 2. above a '=' in the read is always considered a
+            # match, irrespective of the reference base.
+
+            num_mismatches = 0
+            if not ignore_md:
+                try:
+                    # see if the read has an MD tag, in which case pysam can
+                    # calculate the reference sequence for us
+                    ref_seq = p.alignment.get_reference_sequence().upper()
+                except ValueError:
+                    ignore_md = True
+            if not ignore_nm:
+                try:
+                    num_mismatches = p.alignment.get_tag('NM')
+                except KeyError:
+                    ignore_nm = True
+
+            if ignore_md:
+                if not ref_fetch:
+                    # cannot calculate mismatch stats without ref info
+                    sum_mismatch_qualities = float('nan')
+                    if ignore_nm:
+                        sum_mismatch_fractions = float('nan')
+                    else:
+                        sum_mismatch_fractions += (
+                            num_mismatches / p.alignment.query_alignment_length
+                        )
+                    continue
+                # Without an MD tag we need to extract the relevant part
+                # of the reference from the full sequence by position.
+                ref_positions = p.alignment.get_reference_positions()
+                ref_seq = ref_fetch(
+                    ref_positions[0], ref_positions[-1] + 1
+                ).upper()
+
+            potential_matches = {'A', 'C', 'G', 'T'}
+            aligned_pairs = p.alignment.get_aligned_pairs(matches_only=True)
+            ref_offset = aligned_pairs[0][1]
+            last_mismatch_pos = None
+            mismatch_run_quals = []
+            for qpos, rpos in aligned_pairs:
+                read_base = p.alignment.query_sequence[qpos].upper()
+                if read_base == '=':
+                    # always treat the special read base '=' as a
+                    # match, irrespective of the reference base
+                    continue
+                ref_base = ref_seq[rpos - ref_offset]
+                # see if we have a mismatch to use for mismatch quality sum
+                # calculation
+                if ref_base != 'N' and ref_base != read_base:
+                    if mm_runs:
+                        if (
+                            last_mismatch_pos is None
+                        ) or (
+                            last_mismatch_pos + 1 == qpos
+                        ):
+                            mismatch_run_quals.append(
+                                p.alignment.query_qualities[qpos]
+                            )
+                        else:
+                            sum_mismatch_qualities += max(mismatch_run_quals)
+                            mismatch_run_quals = [
+                                p.alignment.query_qualities[qpos]
+                            ]
+                        last_mismatch_pos = qpos
+                    else:
+                        sum_mismatch_qualities += (
+                            p.alignment.query_qualities[qpos]
+                        )
+                if ignore_nm:
+                    # see if we have a mismatch that increases the edit
+                    # distance according to the SAMtags specs
+                    if (
+                        read_base not in potential_matches
+                    ) or (
+                        ref_base not in potential_matches
+                    ) or (
+                        read_base != ref_base
+                    ):
+                        num_mismatches += 1
+
+            if mismatch_run_quals:
+                sum_mismatch_qualities += max(mismatch_run_quals)
+            if ignore_nm:
+                # use the number of mismatches calculated above,
+                # but add inserted and deleted bases to it
+                cigar_stats = p.alignment.get_cigar_stats()[0]
+                num_mismatches += cigar_stats[1] + cigar_stats[2]
+            sum_mismatch_fractions += (
+                num_mismatches / p.alignment.query_alignment_length
+            )
+
+    return (
+        var_reads_plus,
+        var_reads_minus,
+        sum_mapping_qualities,
+        sum_base_qualities,
+        sum_dist_from_center,
+        sum_mismatch_fractions,
+        sum_mismatch_qualities,
+        sum_clipped_length,
+        sum_dist_from_3prime
+    )
+
+
+def get_allele_stats(reads, pos, alleles,
+                     ref=None, ignore_md=True, ignore_nm=True,
+                     mm_runs=True, detect_q2_runs=False,
+                     pileup_args=None):
+    chrom, start, stop = pos
+    if pileup_args is None:
+        pileup_args = {}
+    if pileup_args.get('stepper') == 'samtools':
+        if pileup_args.get('compute_baq', True) is not False:
+            if 'fastafile' not in pileup_args:
+                pileup_args['fastafile'] = ref
+        # be careful when passing in a custom 'fastafile' option:
+        # providing it slows down pysam tremendously even if the option
+        # isn't actually required.
+
+    pile = reads.pileup(
+        chrom, start, stop,
+        **pileup_args
+    )
+    # apply false-positive filtering a la varscan fpfilter
+    # find the variant site in the pileup columns
+    for pile_column in pile:
+        if pile_column.reference_pos == start:
+            break
+    else:
+        # With no reads covering the genomic position
+        # we can only return "empty" allele stats
+        return [
+            AlleleStats(0, 0, 0, *[float('nan')] * 7)
+            for allele in alleles
+        ]
+
+    # extract required information
+    # overall read depth at the site
+    read_depth = pile_column.get_num_aligned()
+    assert read_depth > 0
+
+    alleles_stats = []
+    for allele in alleles:
+        if '-' in allele:
+            allele, indel_type, indel_after = allele.partition('-')
+            indel_len = -len(indel_after)
+        elif '+' in allele:
+            allele, indel_type, indel_after = allele.partition('+')
+            indel_len = len(indel_after)
+        else:
+            indel_type = None
+
+        stats_it = (
+            p for p in pile_column.pileups
+            # skip reads that don't support the allele we're looking for
+            if (p.query_position is not None)
+            and (p.alignment.query_sequence[p.query_position] == allele)
+        )
+        if indel_type == '-':
+            stats_it = (
+                p for p in stats_it if p.indel == indel_len
+            )
+        elif indel_type == '+':
+            stats_it = (
+                p for p in stats_it if (
+                    p.indel == indel_len
+                ) and (
+                    p.alignment.query_sequence[
+                        p.query_position + 1:
+                        p.query_position + 1 + indel_len
+                    ] == indel_after
+                )
+            )
+        allele_stats = _get_allele_specific_pileup_column_stats(
+            stats_it,
+            partial(
+                pysam.FastaFile.fetch, ref, chrom
+            ) if ref else None,
+            ignore_md,
+            ignore_nm,
+            mm_runs,
+            detect_q2_runs
+        )
+
+        allele_reads_total = allele_stats[0] + allele_stats[1]
+        if allele_reads_total == 0:
+            # No stats without reads!
+            alleles_stats.append(
+                AlleleStats(read_depth, 0, 0, *[float('nan')] * 7)
+            )
+        else:
+            alleles_stats.append(
+                AlleleStats(
+                    read_depth, allele_stats[0], allele_stats[1],
+                    *(i / allele_reads_total for i in allele_stats[2:])
+                )
+            )
+    return alleles_stats
+
+
 class VariantCallingError (RuntimeError):
     """Exception class for issues with samtools and varscan subprocesses."""

@@ -40,13 +367,15 @@

 class VarScanCaller (object):
     def __init__(self, ref_genome, bam_input_files,
-                 max_depth=None,
+                 max_depth=None, count_orphans=False, detect_overlaps=True,
                  min_mapqual=None, min_basequal=None,
                  threads=1, verbose=False, quiet=True
                  ):
         self.ref_genome = ref_genome
         self.bam_input_files = bam_input_files
         self.max_depth = max_depth
+        self.count_orphans = count_orphans
+        self.detect_overlaps = detect_overlaps
         self.min_mapqual = min_mapqual
         self.min_basequal = min_basequal
         self.threads = threads
@@ -67,15 +396,38 @@
         self.tmpfiles = []

     def _get_pysam_pileup_args(self):
-        param_dict = {}
+        # Compute the pileup args dynamically to account for possibly updated
+        # instance attributes.
+
+        # fixed default parameters
+        # Note on the fixed default for 'ignore_overlaps':
+        # In order to use the exact same pileup parameters during variant
+        # calling and postprocessing, we would have to compute the setting
+        # dynamically like so:
+        # if not self.detect_overlaps:
+        #    param_dict['ignore_overlaps'] = False
+        # However, samtools/pysam implement overlap correction by manipulating
+        # (setting to zero) the lower qualities of bases that are part of an
+        # overlap (see
+        # https://sourceforge.net/p/samtools/mailman/message/32793789/). This
+        # manipulation has such a big undesirable effect on base quality stats
+        # calculated during postprocessing that calculating the stats on a
+        # slightly different pileup seems like the lesser evil.
+
+        param_dict = {
+            'ignore_overlaps': False,
+            'compute_baq': False,
+            'stepper': 'samtools',
+        }
+        # user-controllable parameters
+        if self.count_orphans:
+            param_dict['ignore_orphans'] = False
         if self.max_depth is not None:
             param_dict['max_depth'] = self.max_depth
         if self.min_mapqual is not None:
             param_dict['min_mapping_quality'] = self.min_mapqual
         if self.min_basequal is not None:
             param_dict['min_base_quality'] = self.min_basequal
-        param_dict['compute_baq'] = False
-        param_dict['stepper'] = 'samtools'
         return param_dict

     def varcall_parallel(self, normal_purity=None, tumor_purity=None,
@@ -108,6 +460,10 @@
             if value is not None:
                 varcall_engine_options += [option, str(value)]
         pileup_engine_options = ['-B']
+        if self.count_orphans:
+            pileup_engine_options += ['-A']
+        if not self.detect_overlaps:
+            pileup_engine_options += ['-x']
         if self.max_depth is not None:
             pileup_engine_options += ['-d', str(self.max_depth)]
         if self.min_mapqual is not None:
@@ -506,154 +862,12 @@
         del record.format['RD']
         del record.format['DP4']

-    def get_allele_specific_pileup_column_stats(
-        self, allele, pile_column, ref_fetch, use_md=True
-    ):
-        var_reads_plus = var_reads_minus = 0
-        sum_mapping_qualities = 0
-        sum_base_qualities = 0
-        sum_dist_from_center = 0
-        sum_dist_from_3prime = 0
-        sum_clipped_length = 0
-        sum_unclipped_length = 0
-        sum_num_mismatches_as_fraction = 0
-        sum_mismatch_qualities = 0
-
-        for p in pile_column.pileups:
-            # skip reads that don't support the allele we're looking for
-            if p.query_position is None:
-                continue
-            if p.alignment.query_sequence[p.query_position] != allele:
-                continue
-
-            if p.alignment.is_reverse:
-                var_reads_minus += 1
-            else:
-                var_reads_plus += 1
-            sum_mapping_qualities += p.alignment.mapping_quality
-            sum_base_qualities += p.alignment.query_qualities[p.query_position]
-            sum_clipped_length += p.alignment.query_alignment_length
-            unclipped_length = p.alignment.query_length
-            sum_unclipped_length += unclipped_length
-            # The following calculations are all in 1-based coordinates
-            # with respect to the physical 5'-end of the read sequence.
-            if p.alignment.is_reverse:
-                read_base_pos = unclipped_length - p.query_position
-                read_first_aln_pos = (
-                    unclipped_length - p.alignment.query_alignment_end + 1
-                )
-                read_last_aln_pos = (
-                    unclipped_length - p.alignment.query_alignment_start
-                )
-            else:
-                read_base_pos = p.query_position + 1
-                read_first_aln_pos = p.alignment.query_alignment_start + 1
-                read_last_aln_pos = p.alignment.query_alignment_end
-            # Note: the original bam-readcount algorithm uses the less accurate
-            # p.alignment.query_alignment_length / 2 as the read center
-            read_center = (read_first_aln_pos + read_last_aln_pos) / 2
-            sum_dist_from_center += 1 - abs(
-                read_base_pos - read_center
-            ) / (read_center - 1)
-            # Note: the original bam-readcount algorithm uses a more
-            # complicated definition of the 3'-end of a read sequence, which
-            # clips off runs of bases with a quality scores of exactly 2.
-            sum_dist_from_3prime += (
-                read_last_aln_pos - read_base_pos
-            ) / (unclipped_length - 1)
-
-            sum_num_mismatches = 0
-            if use_md:
-                try:
-                    # see if the read has an MD tag, in which case pysam can
-                    # calculate the reference sequence for us
-                    aligned_pairs = p.alignment.get_aligned_pairs(
-                        matches_only=True, with_seq=True
-                    )
-                except ValueError:
-                    use_md = False
-            if use_md:
-                # The ref sequence got calculated from alignment and MD tag.
-                for qpos, rpos, ref_base in aligned_pairs:
-                    # pysam uses lowercase ref bases to indicate mismatches
-                    if (
-                        ref_base == 'a' or ref_base == 't' or
-                        ref_base == 'g' or ref_base == 'c'
-                    ):
-                        sum_num_mismatches += 1
-                        sum_mismatch_qualities += p.alignment.query_qualities[
-                            qpos
-                        ]
-            else:
-                # We need to obtain the aligned portion of the reference
-                # sequence.
-                aligned_pairs = p.alignment.get_aligned_pairs(
-                    matches_only=True
-                )
-                # note that ref bases can be lowercase
-                ref_seq = ref_fetch(
-                    aligned_pairs[0][1], aligned_pairs[-1][1] + 1
-                ).upper()
-                ref_offset = aligned_pairs[0][1]
-
-                for qpos, rpos in p.alignment.get_aligned_pairs(
-                    matches_only=True
-                ):
-                    # see if we have a mismatch to the reference at this
-                    # position, but note that
-                    # - the query base may be lowercase (SAM/BAM spec)
-                    # - an '=', if present in the query seq, indicates a match
-                    #   to the reference (SAM/BAM spec)
-                    # - there cannot be a mismatch with an N in the reference
-                    ref_base = ref_seq[rpos - ref_offset]
-                    read_base = p.alignment.query_sequence[qpos].upper()
-                    if (
-                        read_base != '=' and ref_base != 'N'
-                    ) and (
-                        ref_base != read_base
-                    ):
-                        sum_num_mismatches += 1
-                        sum_mismatch_qualities += p.alignment.query_qualities[
-                            qpos
-                        ]
-            sum_num_mismatches_as_fraction += (
-                sum_num_mismatches / p.alignment.query_alignment_length
-            )
-
-        var_reads_total = var_reads_plus + var_reads_minus
-        if var_reads_total == 0:
-            # No stats without reads!
-            return None
-        avg_mapping_quality = sum_mapping_qualities / var_reads_total
-        avg_basequality = sum_base_qualities / var_reads_total
-        avg_pos_as_fraction = sum_dist_from_center / var_reads_total
-        avg_num_mismatches_as_fraction = (
-            sum_num_mismatches_as_fraction / var_reads_total
-        )
-        avg_sum_mismatch_qualities = sum_mismatch_qualities / var_reads_total
-        avg_clipped_length = sum_clipped_length / var_reads_total
-        avg_distance_to_effective_3p_end = (
-            sum_dist_from_3prime / var_reads_total
-        )
-
-        return (
-            avg_mapping_quality,
-            avg_basequality,
-            var_reads_plus,
-            var_reads_minus,
-            avg_pos_as_fraction,
-            avg_num_mismatches_as_fraction,
-            avg_sum_mismatch_qualities,
-            avg_clipped_length,
-            avg_distance_to_effective_3p_end
-        )
-
     def _postprocess_variant_records(self, invcf,
                                      min_var_count2, min_var_count2_lc,
-                                     min_var_freq2, max_somatic_p,
-                                     max_somatic_p_depth,
+                                     min_var_freq2, min_var_count2_depth,
                                      min_ref_readpos, min_var_readpos,
                                      min_ref_dist3, min_var_dist3,
+                                     detect_q2_runs,
                                      min_ref_len, min_var_len,
                                      max_relative_len_diff,
                                      min_strandedness, min_strand_reads,
@@ -663,6 +877,7 @@
                                      max_mapqual_diff,
                                      max_ref_mmqs, max_var_mmqs,
                                      min_mmqs_diff, max_mmqs_diff,
+                                     ignore_md,
                                      **args):
         # set FILTER field according to Varscan criteria
         # multiple FILTER entries must be separated by semicolons
@@ -681,134 +896,158 @@
             )
             refseq = io_stack.enter_context(pysam.FastaFile(self.ref_genome))
             pileup_args = self._get_pysam_pileup_args()
+            _get_stats = get_allele_stats
             for record in invcf:
-                if any(len(allele) > 1 for allele in record.alleles):
-                    # skip indel postprocessing for the moment
-                    yield record
-                    continue
+                is_indel = False
                 if record.alleles[0] == 'N':
                     # It makes no sense to call SNPs against an unknown
                     # reference base
                     continue
+                if len(record.alleles[0]) > 1:
+                    # a deletion
+                    is_indel = True
+                    alleles = [
+                        record.alleles[1], record.alleles[0].replace(
+                            record.alleles[1], record.alleles[1] + '-', 1
+                        )
+                    ]
+                elif len(record.alleles[1]) > 1:
+                    # an insertion
+                    is_indel = True
+                    alleles = [
+                        record.alleles[0],
+                        record.alleles[1].replace(
+                            record.alleles[0], record.alleles[0] + '+', 1
+                        )
+                    ]
+                else:
+                    # a regular SNV
+                    alleles = record.alleles[:2]
                 # get pileup for genomic region affected by this variant
                 if record.info['SS'] == '2':
                     # a somatic variant => generate pileup from tumor data
-                    pile = tumor_reads.pileup(
-                        record.chrom, record.start, record.stop,
-                        **pileup_args
-                    )
-                    sample_of_interest = 'TUMOR'
+                    reads_of_interest = tumor_reads
+                    calculate_ref_stats = record.samples['TUMOR']['RD'] > 0
                 elif record.info['SS'] in ['1', '3']:
                     # a germline or LOH variant => pileup from normal data
-                    pile = normal_reads.pileup(
-                        record.chrom, record.start, record.stop,
-                        **pileup_args
-                    )
-                    sample_of_interest = 'NORMAL'
+                    reads_of_interest = normal_reads
+                    calculate_ref_stats = record.samples['NORMAL']['RD'] > 0
                 else:
                     # TO DO: figure out if there is anything interesting to do
                     # for SS status codes 0 (reference) and 5 (unknown)
                     yield record
                     continue

-                # apply false-positive filtering a la varscan fpfilter
-                # find the variant site in the pileup columns
-                for pile_column in pile:
-                    if pile_column.reference_pos == record.start:
-                        break
-                # extract required information
-                # overall read depth at the site
-                read_depth = pile_column.get_num_aligned()
-                assert read_depth > 0
                 # no multiallelic sites in varscan
                 assert len(record.alleles) == 2
-                if record.samples[sample_of_interest]['RD'] > 0:
-                    ref_stats, alt_stats = [
-                        self.get_allele_specific_pileup_column_stats(
-                            allele,
-                            pile_column,
-                            partial(
-                                pysam.FastaFile.fetch, refseq, record.chrom
-                            )
-                        )
-                        for allele in record.alleles
-                    ]
+
+                if calculate_ref_stats:
+                    ref_stats, alt_stats = _get_stats(
+                        reads_of_interest,
+                        (record.chrom, record.start, record.stop),
+                        alleles,
+                        refseq,
+                        ignore_md=ignore_md,
+                        ignore_nm=False,
+                        mm_runs=True,
+                        detect_q2_runs=detect_q2_runs,
+                        pileup_args=pileup_args
+                    )
                 else:
                     ref_stats = None
-                    alt_stats = self.get_allele_specific_pileup_column_stats(
-                        record.alleles[1],
-                        pile_column,
-                        partial(pysam.FastaFile.fetch, refseq, record.chrom)
-                    )
+                    alt_stats = _get_stats(
+                        reads_of_interest,
+                        (record.chrom, record.start, record.stop),
+                        alleles[1:2],
+                        refseq,
+                        ignore_md=ignore_md,
+                        ignore_nm=False,
+                        mm_runs=True,
+                        detect_q2_runs=detect_q2_runs,
+                        pileup_args=pileup_args
+                    )[0]
                 ref_count = 0
                 if ref_stats:
-                    ref_count = ref_stats[2] + ref_stats[3]
-                    if ref_stats[1] < min_ref_basequal:
+                    ref_count = ref_stats.reads_fw + ref_stats.reads_rv
+                    if ref_stats.avg_basequal < min_ref_basequal:
                         record.filter.add('RefBaseQual')
                     if ref_count >= 2:
-                        if ref_stats[0] < min_ref_mapqual:
+                        if ref_stats.avg_mapqual < min_ref_mapqual:
                             record.filter.add('RefMapQual')
-                        if ref_stats[4] < min_ref_readpos:
+                        if ref_stats.avg_dist_from_center < min_ref_readpos:
                             record.filter.add('RefReadPos')
-                        # ref_stats[5] (avg_num_mismatches_as_fraction
+                        # ref_stats.avg_mismatch_fraction
                         # is not a filter criterion in VarScan fpfilter
-                        if ref_stats[6] > max_ref_mmqs:
+                        if ref_stats.avg_mismatch_qualsum > max_ref_mmqs:
                             record.filter.add('RefMMQS')
-                        if ref_stats[7] < min_ref_len:
+                        if not is_indel and (
+                            ref_stats.avg_clipped_len < min_ref_len
+                        ):
                             # VarScan fpfilter does not apply this filter
-                            # for indels, but there is no reason
-                            # not to do it.
+                            # for indels, so we do not do it either.
                             record.filter.add('RefAvgRL')
-                        if ref_stats[8] < min_ref_dist3:
+                        if ref_stats.avg_dist_from_3prime < min_ref_dist3:
                             record.filter.add('RefDist3')
                 if alt_stats:
-                    alt_count = alt_stats[2] + alt_stats[3]
-                    if (
-                        alt_count < min_var_count2_lc
-                    ) or (
-                        read_depth >= max_somatic_p_depth and
-                        alt_count < min_var_count2
+                    alt_count = alt_stats.reads_fw + alt_stats.reads_rv
+                    if alt_count < min_var_count2:
+                        if (
+                            (alt_count + ref_count) >= min_var_count2_depth
+                        ) or (
+                            alt_count < min_var_count2_lc
+                        ):
+                            record.filter.add('VarCount')
+                    if alt_count / alt_stats.reads_total < min_var_freq2:
+                        record.filter.add('VarFreq')
+                    if not is_indel and (
+                        alt_stats.avg_basequal < min_var_basequal
                     ):
-                        record.filter.add('VarCount')
-                    if alt_count / read_depth < min_var_freq2:
-                        record.filter.add('VarFreq')
-                    if alt_stats[1] < min_var_basequal:
                         record.filter.add('VarBaseQual')
-                    if alt_count > min_strand_reads:
+                    if alt_count >= min_strand_reads:
                         if (
-                            alt_stats[2] / alt_count < min_strandedness
+                            alt_stats.reads_fw / alt_count < min_strandedness
                         ) or (
-                            alt_stats[3] / alt_count < min_strandedness
+                            alt_stats.reads_rv / alt_count < min_strandedness
                         ):
                             record.filter.add('Strand')
-                    if alt_stats[2] + alt_stats[3] >= 2:
-                        if alt_stats[0] < min_var_mapqual:
+                    if alt_count >= 2:
+                        if alt_stats.avg_mapqual < min_var_mapqual:
                             record.filter.add('VarMapQual')
-                        if alt_stats[4] < min_var_readpos:
+                        if alt_stats.avg_dist_from_center < min_var_readpos:
                             record.filter.add('VarReadPos')
-                        # alt_stats[5] (avg_num_mismatches_as_fraction
+                        # alt_stats.avg_mismatch_fraction
                         # is not a filter criterion in VarScan fpfilter
-                        if alt_stats[6] > max_var_mmqs:
+                        if alt_stats.avg_mismatch_qualsum > max_var_mmqs:
                             record.filter.add('VarMMQS')
-                        if alt_stats[7] < min_var_len:
+                        if not is_indel and (
+                            alt_stats.avg_clipped_len < min_var_len
+                        ):
                             # VarScan fpfilter does not apply this filter
-                            # for indels, but there is no reason
-                            # not to do it.
+                            # for indels, so we do not do it either.
                             record.filter.add('VarAvgRL')
-                        if alt_stats[8] < min_var_dist3:
+                        if alt_stats.avg_dist_from_3prime < min_var_dist3:
                             record.filter.add('VarDist3')
                     if ref_count >= 2 and alt_count >= 2:
-                        if (ref_stats[0] - alt_stats[0]) > max_mapqual_diff:
+                        if (
+                            ref_stats.avg_mapqual - alt_stats.avg_mapqual
+                        ) > max_mapqual_diff:
                             record.filter.add('MapQualDiff')
-                        if (ref_stats[1] - alt_stats[1]) > max_basequal_diff:
+                        if (
+                            ref_stats.avg_basequal - alt_stats.avg_basequal
+                        ) > max_basequal_diff:
                             record.filter.add('MaxBAQdiff')
-                        mmqs_diff = alt_stats[6] - ref_stats[6]
+                        mmqs_diff = (
+                            alt_stats.avg_mismatch_qualsum
+                            - ref_stats.avg_mismatch_qualsum
+                        )
                         if mmqs_diff < min_mmqs_diff:
                             record.filter.add('MinMMQSdiff')
                         if mmqs_diff > max_mmqs_diff:
                             record.filter.add('MMQSdiff')
                         if (
-                            1 - alt_stats[7] / ref_stats[7]
+                            1 -
+                            alt_stats.avg_clipped_len
+                            / ref_stats.avg_clipped_len
                         ) > max_relative_len_diff:
                             record.filter.add('ReadLenDiff')
                 else:
@@ -994,6 +1233,8 @@
     instance_args = {
         k: args.pop(k) for k in [
             'max_depth',
+            'count_orphans',
+            'detect_overlaps',
             'min_mapqual',
             'min_basequal',
             'threads',
@@ -1046,7 +1287,7 @@
     )
     p.add_argument(
         '-s', '--split-output',
-        dest='split_output', action='store_true', default=False,
+        dest='split_output', action='store_true',
         help='indicate that separate output files for SNPs and indels '
              'should be generated (original VarScan behavior). If specified, '
              '%%T in the --ofile file name will be replaced with "snp" and '
@@ -1090,6 +1331,20 @@
              'default: 8000)'
     )
     call_group.add_argument(
+        '--count-orphans',
+        dest='count_orphans', action='store_true',
+        help='Use anomalous read pairs in variant calling '
+             '(samtools mpileup -A option; default: ignore anomalous pairs)'
+    )
+    call_group.add_argument(
+        '--no-detect-overlaps',
+        dest='detect_overlaps', action='store_false',
+        help='Disable automatic read-pair overlap detection by samtools '
+             'mpileup. Overlap detection tries to avoid counting duplicated '
+             'bases twice during variant calling. '
+             '(samtools mpileup -x option; default: use overlap detection)'
+    )
+    call_group.add_argument(
         '--min-basequal',
         dest='min_basequal', type=int,
         default=13,
@@ -1160,7 +1415,15 @@
         dest='min_var_count2_lc', type=int,
         default=2,
         help='Minimum number of variant-supporting reads when depth below '
-             '--somatic-p-depth (default: 2)'
+             '--min-var-count2-depth (default: 2)'
+    )
+    filter_group.add_argument(
+        '--min-var-count2-depth',
+        dest='min_var_count2_depth', type=int,
+        default=10,
+        help='Combined depth of ref- and variant-supporting reads required to '
+             'apply the --min-var-count filter instead of --min-var-count-lc '
+             '(default: 10)'
     )
     filter_group.add_argument(
         '--min-var-freq2',
@@ -1169,18 +1432,6 @@
         help='Minimum variant allele frequency (default: 0.05)'
     )
     filter_group.add_argument(
-        '--max-somatic-p',
-        dest='max_somatic_p', type=float,
-        default=0.05,
-        help='Maximum somatic p-value (default: 0.05)'
-    )
-    filter_group.add_argument(
-        '--max-somatic-p-depth',
-        dest='max_somatic_p_depth', type=int,
-        default=10,
-        help='Depth required to run --max-somatic-p filter (default: 10)'
-    )
-    filter_group.add_argument(
         '--min-ref-readpos',
         dest='min_ref_readpos', type=float,
         default=0.1,
@@ -1209,6 +1460,13 @@
              '3\'end of variant-supporting reads (default: 0.1)'
     )
     filter_group.add_argument(
+        '--detect-q2-runs',
+        dest='detect_q2_runs', action='store_true',
+        help='Look for 3\'-terminal q2 runs and take their lengths into '
+             'account for determining the effective 3\'end of reads '
+             '(default: off)'
+    )
+    filter_group.add_argument(
         '--min-ref-len',
         dest='min_ref_len', type=int,
         default=90,
@@ -1309,5 +1567,13 @@
         default=50,
         help='Maximum mismatch quality sum difference (var - ref; default: 50)'
     )
+    filter_group.add_argument(
+        '--ignore-md',
+        dest='ignore_md', action='store_true',
+        help='Do not rely on read MD tag, even if it is present on the mapped '
+             'reads, and recalculate mismatch quality stats from ref '
+             'alignments instead.'
+    )
+
     args = vars(p.parse_args())
     varscan_call(**args)
--- a/varscan_somatic.xml	Thu Mar 28 18:19:00 2019 -0400
+++ b/varscan_somatic.xml	Fri Aug 16 15:49:54 2019 -0400
@@ -1,4 +1,4 @@
-<tool id="varscan_somatic" name="VarScan somatic" version="@VERSION@.4">
+<tool id="varscan_somatic" name="VarScan somatic" version="@VERSION@.5">
     <description>Call germline/somatic and LOH variants from tumor-normal sample pairs</description>
     <macros>
         <import>macros.xml</import>
@@ -28,6 +28,16 @@
                 text="##FILTER=&lt;ID=RefDist3,Description=" />
             </assert_contents>
         </macro>
+        <macro name="filter_compat_options">
+            <section name="experts_only" title="Compatibility options for experts" expanded="false">
+                <param name="compat_opts" type="select" display="checkboxes" multiple="true" optional="true"
+                label="Compatibility Options for Posterior Variant Filtering"
+                help="">
+                    <option value="--ignore-md" selected="true">Always determine mismatch quality statistics from recalculated read to reference alignments. Ignore read MD tags if present.</option>
+                    <option value="--detect-q2-runs" selected="false">Treat runs of base qualities of value 2 at the 3' end of reads as quality control indicator (Illumina 1.5 compatibility setting)</option>
+                </param>
+            </section>
+        </macro>
     </macros>
     <expand macro="requirements">
         <requirement type="package" version="3.6.7">python</requirement>
@@ -63,8 +73,11 @@
             --threads \${GALAXY_SLOTS:-2}
             #if str($call_params.settings) == "custom":
                 ## samtools mpileup parameters
-                --min-basequal ${call_params.min_avg_qual}
-                --min-mapqual ${call_params.min_mapqual}
+                --min-basequal ${call_params.read_selection.min_basequal}
+                --min-mapqual ${call_params.read_selection.min_mapqual}
+                ${call_params.read_selection.count_orphans}
+                ${call_params.read_selection.detect_overlaps}
+                --max-pileup-depth ${call_params.read_selection.max_pileup_depth}
                 ## VarScan parameters
                 --min-coverage ${call_params.min_coverage}
                 --min-var-count ${call_params.min_reads2}
@@ -75,56 +88,61 @@
             #end if
             #if str($filter_params.settings) == "no_filter":
                 --no-filters
-            #elif str($filter_params.settings) == "dream3_settings":
-                --min-var-count2 3
-                --min-var-count2-lc 1
-                --min-var-freq2 0.05
-                --max-somatic-p 0.05
-                --max-somatic-p-depth 10
-                --min-ref-readpos 0.2
-                --min-var-readpos 0.15
-                --min-ref-dist3 0.2
-                --min-var-dist3 0.15
-                --min-ref-len 90
-                --min-var-len 90
-                --max-len-diff 0.05
-                --min-strandedness 0
-                --min-strand-reads 5
-                --min-ref-basequal 15
-                --min-var-basequal 30
-                --max-basequal-diff 50
-                --min-ref-mapqual 20
-                --min-var-mapqual 30
-                --max-mapqual-diff 10
-                --max-ref-mmqs 50
-                --max-var-mmqs 100
-                --min-mmqs-diff 0
-                --max-mmqs-diff 50
-            #elif str($filter_params.settings) == "custom":
-                --min-var-count2 ${filter_params.min_var_count}
-                --min-var-count2-lc ${filter_params.min_var_count_lc}
-                --min-var-freq2 ${filter_params.min_var_freq2}
-                --max-somatic-p ${filter_params.max_somatic_p}
-                --max-somatic-p-depth ${filter_params.max_somatic_p_depth}
-                --min-ref-readpos ${filter_params.min_ref_readpos}
-                --min-var-readpos ${filter_params.min_var_readpos}
-                --min-ref-dist3 ${filter_params.min_ref_dist3}
-                --min-var-dist3 ${filter_params.min_var_dist3}
-                --min-ref-len ${filter_params.min_ref_len}
-                --min-var-len ${filter_params.min_var_len}
-                --max-len-diff ${filter_params.max_len_diff}
-                --min-strandedness ${filter_params.min_strandedness}
-                --min-strand-reads ${filter_params.min_strand_reads}
-                --min-ref-basequal ${filter_params.min_ref_basequal}
-                --min-var-basequal ${filter_params.min_var_basequal}
-                --max-basequal-diff ${filter_params.max_basequal_diff}
-                --min-ref-mapqual ${filter_params.min_ref_mapqual}
-                --min-var-mapqual ${filter_params.min_var_mapqual}
-                --max-mapqual-diff ${filter_params.max_mapqual_diff}
-                --max-ref-mmqs ${filter_params.max_ref_mmqs}
-                --max-var-mmqs ${filter_params.max_var_mmqs}
-                --min-mmqs-diff ${filter_params.min_mmqs_diff}
-                --max-mmqs-diff ${filter_params.max_mmqs_diff}
+            #else:
+                #if str($filter_params.settings) == "dream3_settings":
+                    --min-var-count2 3
+                    --min-var-count2-lc 1
+                    --min-var-count2-depth 10
+                    --min-var-freq2 0.05
+                    --min-ref-readpos 0.2
+                    --min-var-readpos 0.15
+                    --min-ref-dist3 0.2
+                    --min-var-dist3 0.15
+                    --min-ref-len 90
+                    --min-var-len 90
+                    --max-len-diff 0.05
+                    --min-strandedness 0
+                    --min-strand-reads 5
+                    --min-ref-basequal 15
+                    --min-var-basequal 30
+                    --max-basequal-diff 50
+                    --min-ref-mapqual 20
+                    --min-var-mapqual 30
+                    --max-mapqual-diff 10
+                    --max-ref-mmqs 50
+                    --max-var-mmqs 100
+                    --min-mmqs-diff 0
+                    --max-mmqs-diff 50
+                #elif str($filter_params.settings) == "custom":
+                    --min-var-count2 ${filter_params.min_var_count}
+                    --min-var-count2-lc ${filter_params.min_var_count_lc}
+                    --min-var-count2-depth ${filter_params.min_var_count_depth}
+                    --min-var-freq2 ${filter_params.min_var_freq2}
+                    --min-ref-readpos ${filter_params.min_ref_readpos}
+                    --min-var-readpos ${filter_params.min_var_readpos}
+                    --min-ref-dist3 ${filter_params.min_ref_dist3}
+                    --min-var-dist3 ${filter_params.min_var_dist3}
+                    --min-ref-len ${filter_params.min_ref_len}
+                    --min-var-len ${filter_params.min_var_len}
+                    --max-len-diff ${filter_params.max_len_diff}
+                    --min-strandedness ${filter_params.min_strandedness}
+                    --min-strand-reads ${filter_params.min_strand_reads}
+                    --min-ref-basequal ${filter_params.min_ref_basequal}
+                    --min-var-basequal ${filter_params.min_var_basequal}
+                    --max-basequal-diff ${filter_params.max_basequal_diff}
+                    --min-ref-mapqual ${filter_params.min_ref_mapqual}
+                    --min-var-mapqual ${filter_params.min_var_mapqual}
+                    --max-mapqual-diff ${filter_params.max_mapqual_diff}
+                    --max-ref-mmqs ${filter_params.max_ref_mmqs}
+                    --max-var-mmqs ${filter_params.max_var_mmqs}
+                    --min-mmqs-diff ${filter_params.min_mmqs_diff}
+                    --max-mmqs-diff ${filter_params.max_mmqs_diff}
+                #end if
+                #if $filter_params.experts_only.compat_opts:
+                    #for $opt in str($filter_params.experts_only.compat_opts).split(','):
+                        $opt
+                    #end for
+                #end if
             #end if
             --verbose
             '$ref_genome'
@@ -166,22 +184,35 @@
                 <option value="custom">Customize settings</option>
             </param>
             <when value="custom">
-                <param argument="samtools mpileup -Q" name="min_avg_qual" type="integer" value="13" min="0" max="50"
-                label="Minimum base quality"
-                help="The minimum base quality at the variant position required to use a read for calling" />
-                <param argument="samtools mpileup -q" name="min_mapqual" type="integer" value="0" min="0" max="60"
-                label="Minimum mapping quality"
-                help="The minimum mapping quality required for a read to be considered in variant calling" />
+                <section name="read_selection" title="Read selection" expanded="true"
+                help="The settings in this section control which mapped reads will be used for variant calling.">
+                    <param argument="samtools mpileup -Q" name="min_basequal" type="integer" value="13" min="1" max="50"
+                    label="Minimum base quality"
+                    help="The minimum base quality (default: 13) at a given position required to use a read for calling variants at that site" />
+                    <param argument="samtools mpileup -q" name="min_mapqual" type="integer" value="0" min="0" max="60"
+                    label="Minimum mapping quality"
+                    help="The minimum mapping quality (default: 0) required for a read to be considered in variant calling" />
+                    <param argument="samtools mpileup -A" name="count_orphans" type="boolean" truevalue="--count-orphans" falsevalue="" checked="false"
+                    label="Use reads from anomalously mapped pairs"
+                    help="Applies to paired-end reads only. If set to true, reads from pairs that are flagged as non-proper pairs (SAM/BAM FLAG field 2) will be used in variant calling. The default is to ignore such reads." />
+                    <param argument="samtools mpileup -x" name="detect_overlaps" type="boolean" truevalue="" falsevalue="--no-detect-overlaps" checked="true"
+                    label="Try to correct for read-pair overlaps"
+                    help="Applies to paired-end reads only. If the reads of a pair overlap on the reference, then with this option (on by default), the bases for which both reads provide evidence will be counted only once in variant calling (of the two sequenced bases in the reads, the base with the better base quality will be used)." />
+                    <param argument="samtools mpileup -d" name="max_pileup_depth" type="integer" value="8000" min="4000"
+                    label="Maximum number of reads per site"
+                    help="Restrict the number of reads used for variant calling at a site to this maximum (default: 8000) for each sample. Helps protect against excessive memory usage (and slow tool runs) at sites of extraordinary high coverage." />
+                </section>
                 <expand macro="min_coverage"
-                help="Minimum site coverage required in the normal and in the tumor sample to call a variant. This threshold gets applied after eliminating reads with low base and mapping qualitiy as defined above." />
-                <expand macro="min_reads2" />
+                help="Minimum site coverage (default: 8) required in the normal and in the tumor sample to call a variant. This threshold gets applied after eliminating reads based on the read selection criteria above." />
+                <expand macro="min_reads2"
+                help="Minimum number (default: 2) of variant-supporting reads (after read selection) at a position required to make a call"/>
                 <expand macro="min_var_freq" value="0.1" />
                 <expand macro="min_freq_for_hom" />
                 <expand macro="p_value" value="0.99"
-                help="The p-value threshold used to determine if a variant should be called for either sample" />
+                help="The p-value threshold (default: 0.99) used to determine if a variant should be called for either sample" />
                 <param argument="--somatic-p-value" name="somatic_p_value" type="float" value="0.05" min="0" max="1"
                 label="P-value threshold for calling somatic variants and LOH events"
-                help="The p-value threshold used to determine if read count differences between the normal and the tumor sample justify classification of a variant as somatic or as an LOH event" />
+                help="The p-value threshold (default: 0.05) used to determine if read count differences between the normal and the tumor sample justify classification of a variant as somatic or as an LOH event" />
             </when>
             <when value="varscan_defaults" />
         </conditional>
@@ -192,82 +223,84 @@
                 <option value="no_filter">Do not perform posterior filtering</option>
                 <option value="custom">Customize settings</option>
             </param>
-            <when value="varscan_defaults" />
-            <when value="dream3_settings" />
             <when value="no_filter" />
+            <when value="varscan_defaults">
+                <expand macro="filter_compat_options" />
+            </when>
+            <when value="dream3_settings">
+                <expand macro="filter_compat_options" />
+            </when>
             <when value="custom">
                 <param argument="--min-var-count" name="min_var_count" type="integer" value="4" min="1" max="200"
                 label="Minimum number of variant-supporting reads"
-                help="" />
+                help="(default: 4)" />
                 <param argument="--min-var-count-lc" name="min_var_count_lc" type="integer" value="2" min="1" max="200"
                 label="Low coverage minimum number of variant-supporting reads"
-                help="Will be applied instead of the --min-var-count limit for sites with poor overall (less than --max-somatic-p-depth) coverage" />
+                help="This setting (default: 2) will be applied instead of the --min-var-count limit for sites with poor overall (see threshold below) coverage." />
+                <param argument="--max-somatic-p-depth" name="min_var_count_depth" type="integer" value="10" min="2" max="200"
+                label="Minimum variant allele count threshold"
+                help="Combined depth (default: 10) of ref- and variant-supporting reads required at variant site to apply the (stricter) --min-var-count filter instead of --min-var-count-lc" />
                 <param argument="--min-var-freq" name="min_var_freq2" type="float" value="0.05" min="0" max="1"
                 label="Minimum variant allele frequency"
-                help="" />
-                <param argument="--max-somatic-p" name="max_somatic_p" type="float" value="0.05" min="0" max="1"
-                label="Maximum somatic p-value allowed for a somatic call"
-                help="" />
-                <param argument="--max-somatic-p-depth" name="max_somatic_p_depth" type="integer" value="10" min="2" max="200"
-                label="Depth required at variant site to run --max-somatic-p filter"
-                help="" />
+                help="(default: 0.05)" />
                 <param argument="--min-ref-readpos" name="min_ref_readpos" type="float" value="0.1" min="0" max="1"
                 label="Minimum relative variant position in ref-supporting reads"
-                help="The minimum average relative distance from the ends of ref-supporting reads required for variant sites" />
+                help="The minimum average relative distance from either end of ref-supporting reads (default: 0.1) required for variant sites" />
                 <param argument="--min-var-readpos" name="min_var_readpos" type="float" value="0.1" min="0" max="1"
                 label="Minimum relative variant position in variant-supporting reads"
-                help="The minimum average relative distance from the ends of variant-supporting reads required for variant sites" />
+                help="The minimum average relative distance from either end of variant-supporting reads (default: 0.1) required for variant sites" />
                 <param argument="--min-ref-dist3" name="min_ref_dist3" type="float" value="0.1" min="0" max="1"
                 label="Minimum distance of variant site from 3'-end of ref-supporting reads"
-                help="The minimum average relative distance from the effective 3'end of ref-supporting reads required for variant sites" />
+                help="The minimum average relative distance from the effective 3'end of ref-supporting reads (default: 0.1) required for variant sites. The effective 3'end is defined by the end of the alignment of the read to the reference (or, if the Illumina 1.5 compatibility setting is used, by the first base in 3'->5' direction with a base quality > 2)." />
                 <param argument="--min-var-dist3" name="min_var_dist3" type="float" value="0.1" min="0" max="1"
                 label="Minimum distance of variant site from 3'-end of variant-supporting reads"
-                help="The minimum average relative distance from the effective 3'end of variant-supporting reads required for variant sites" />
+                help="The minimum average relative distance from the effective 3'end of variant-supporting reads (default: 0.1) required for variant sites. The  effective 3'end is defined as above." />
                 <param argument="--min-ref-avgrl" name="min_ref_len" type="integer" value="90" min="0" max="200"
                 label="Minimum length of ref-supporting reads"
-                help="The minimum average trimmed length required for reads supporting the reference allele" />
+                help="The minimum average trimmed length (default: 90) required for reads supporting the reference allele" />
                 <param argument="--min-var-avgrl" name="min_var_len" type="integer" value="90" min="0" max="200"
                 label="Minimum length of variant-supporting reads"
-                help="The minimum average trimmed length required for reads supporting the variant allele" />
+                help="The minimum average trimmed length (default: 90) required for reads supporting the variant allele" />
                 <param argument="--max-rl-diff" name="max_len_diff" type="float" value="0.25" min="0" max="1"
                 label="Maximum relative read length difference"
-                help="The maximum allowed average relative read length difference (ref - var) between reads supporting the reference and the variant allele" />
+                help="The maximum allowed difference (default: 0.25) in the average relative read length (ref - var) between reads supporting the reference and the variant allele" />
                 <param argument="--min-strandedness" name="min_strandedness" type="float" value="0.01" min="0" max="0.5"
                 label="Minimum fraction of variant reads from each strand"
-                help="The minimum fraction of variant reads that are required to come from the forward and from the reverse strand" />
+                help="The minimum fraction (default: 0.01) of variant reads that are required to come from the forward and from the reverse strand" />
                 <param argument="--min-strand-reads" name="min_strand_reads" type="integer" value="5" min="2" max="200"
                 label="Minimum variant allele depth required to apply the --min-strandedness filter"
-                help="" />
+                help="(default: 5)" />
                 <param argument="--min-ref-basequal" name="min_ref_basequal" type="integer" value="15" min="1" max="50"
                 label="Minimum average base quality for the ref allele"
-                help="The minimum average base quality required at the variant site for reads supporting the reference allele" />
+                help="The minimum average base quality (default: 15) required at the variant site for reads supporting the reference allele" />
                 <param argument="--min-var-basequal" name="min_var_basequal" type="integer" value="15" min="1" max="50"
                 label="Minimum average base quality for the variant allele"
-                help="The minimum average base quality required at the variant site for reads supporting the variant allele" />
+                help="The minimum average base quality (default: 15) required at the variant site for reads supporting the variant allele" />
                 <param argument="max-basequal-diff" name="max_basequal_diff" type="integer" value="50" min="0" max="50"
                 label="Maximum base quality difference between ref- and variant-supporting reads"
-                help="The maximum average base quality difference (ref - var) allowed between the variant site positions of reads supporting the reference and the variant allele" />
+                help="The maximum difference (default: 50) in the average base quality (ref - var) allowed between the variant site positions of reads supporting the reference and the variant allele" />
                 <param argument="--min-ref-mapqual" name="min_ref_mapqual" type="integer" value="15" min="1" max="60"
                 label="Minimum average mapping quality of ref-supporting reads"
-                help="The minimum average mapping quality required for reads supporting the reference allele" />
+                help="The minimum average mapping quality (default: 15) required for reads supporting the reference allele" />
                 <param argument="--min-var-mapqual" name="min_var_mapqual" type="integer" value="15" min="1" max="60"
                 label="Minimum average mapping quality of variant-supporting reads"
-                help="The minimum average mapping quality required for reads supporting the variant allele" />
+                help="The minimum average mapping quality (default: 15) required for reads supporting the variant allele" />
                 <param argument="--max-mapqual-diff" name="max_mapqual_diff" type="integer" value="50" min="0" max="60"
                 label="Maximum mapping quality difference between ref- and variant-supporting reads"
-                help="The maximum average mapping quality difference (ref - var) allowed between reads supporting the reference and the variant allele" />
+                help="The maximum difference (default: 50) in the average mapping quality (ref - var) allowed between reads supporting the reference and the variant allele" />
                 <param argument="--max-ref-mmqs" name="max_ref_mmqs" type="integer" value="100" min="0"
                 label="Maximum mismatch base quality sum of ref-supporting reads"
-                help="The maximum mismatch base quality sum allowed for reads supporting the reference allele" />
+                help="The maximum mismatch base quality sum (default: 100) allowed for reads supporting the reference allele" />
                 <param argument="--max-var-mmqs" name="max_var_mmqs" type="integer" value="100" min="0"
                 label="Maximum mismatch base quality sum of var-supporting reads"
-                help="The maximum mismatch base quality sum allowed for reads supporting the variant allele" />
+                help="The maximum mismatch base quality sum (default: 100) allowed for reads supporting the variant allele" />
                 <param argument="--min-mmqs-diff" name="min_mmqs_diff" type="integer" value="0" min="0"
                 label="Minimum difference between mismatch base quality sums of variant- and ref-supporting reads"
-                help="The minimum difference in the mismatch base quality sums (var - ref) required between reads supporting the variant and the reference allele" />
+                help="The minimum difference (default: 0) in the mismatch base quality sums (var - ref) required between reads supporting the variant and the reference allele" />
                 <param argument="--max-mmqs-diff" name="max_mmqs_diff" type="integer" value="50" min="1"
                 label="Maximum difference between mismatch base quality sums of variant- and ref-supporting reads"
-                help="The maximum difference in the mismatch base quality sums (var - ref) allowed between reads supporting the variant and the reference allele" />
+                help="The maximum difference (default: 50) in the mismatch base quality sums (var - ref) allowed between reads supporting the variant and the reference allele" />
+                <expand macro="filter_compat_options" />
             </when>
         </conditional>
     </inputs>
@@ -286,6 +319,7 @@
     </outputs>
     <tests>
         <test expect_num_outputs="1">
+            <!-- run with default settings and genome from history -->
             <conditional name="reference">
                 <param name="source" value="history" />
                 <param name="genome" value="hg19_chrM.fa" />
@@ -305,6 +339,7 @@
             </output>
         </test>
         <test expect_num_outputs="1">
+            <!-- run with default settings and cached genome -->
             <conditional name="reference">
                 <param name="source" value="cached" />
                 <param name="genome" value="hg19mito" />
@@ -324,6 +359,7 @@
             </output>
         </test>
         <test expect_num_outputs="2">
+            <!-- run with default settings and split output -->
             <conditional name="reference">
                 <param name="source" value="history" />
                 <param name="genome" value="hg19_chrM.fa" />
@@ -347,6 +383,7 @@
             </output>
         </test>
         <test expect_num_outputs="1">
+            <!-- run with custom params for variant calling -->
             <conditional name="reference">
                 <param name="source" value="history" />
                 <param name="genome" value="hg19_chrM.fa" />
@@ -355,10 +392,12 @@
             <param name="tumor_bam" value="tumor_chrM.bam" />
             <param name="split_output" value="false" />
             <conditional name="call_params">
+                <section name="read_selection">
+                    <param name="min_basequal" value="5" />
+                </section>
                 <param name="settings" value="custom" />
                 <param name="min_coverage" value="2" />
                 <param name="min_reads2" value="1" />
-                <param name="min_avg_qual" value="5" />
                 <param name="min_var_freq" value="0.01" />
                 <param name="min_freq_for_hom" value="0.66" />
                 <param name="p_value" value="0.97" />
@@ -395,6 +434,7 @@
             </output>
         </test>
         <test expect_num_outputs="1">
+            <!-- run with preconfigured dream3 post-filter settings -->
             <conditional name="reference">
                 <param name="source" value="history" />
                 <param name="genome" value="hg19_chrM.fa" />
@@ -414,6 +454,7 @@
             </output>
         </test>
         <test expect_num_outputs="1">
+            <!-- run without post-filters -->
             <conditional name="reference">
                 <param name="source" value="history" />
                 <param name="genome" value="hg19_chrM.fa" />
@@ -433,6 +474,7 @@
             </output>
         </test>
         <test expect_num_outputs="1">
+            <!-- run with custom post-filters -->
             <conditional name="reference">
                 <param name="source" value="history" />
                 <param name="genome" value="hg19_chrM.fa" />
@@ -490,13 +532,6 @@
 This tool wraps the functionality of the ``varscan somatic`` and the
 ``varscan fpfilter`` command line tools.

-.. class:: infomark
-
-   The wrapper aims at providing the same functionality as the
-   ``varscan fpfilter`` tool, but implements it using ``pysam`` internally.
-   Note that, as one limitation compared to the original ``varscan`` tool,
-   the current version does not apply filters to indels!
-
 The tool is designed to detect genetic variants in a **pair of samples**
 representing normal and tumor tissue from the same individual. It classifies
 the variants, according to their most likely origin, as **somatic** (variant is