comparison varscan.py @ 9:4e97191a1ff7 draft
"planemo upload for repository https://github.com/galaxyproject/iuc/tree/master/tools/varscan commit fcf5ac14629c694f0f64773fab0428b1e78fe156"
| author | iuc |
|---|---|
| date | Fri, 16 Aug 2019 15:49:54 -0400 |
| parents | b79bb8b09822 |
| children | cf8ffc79db67 |
| 8:b79bb8b09822 | 9:4e97191a1ff7 |
|---|---|
| 6 import os | 6 import os |
| 7 import subprocess | 7 import subprocess |
| 8 import sys | 8 import sys |
| 9 import tempfile | 9 import tempfile |
| 10 import time | 10 import time |
| 11 from collections import namedtuple | |
| 11 from contextlib import ExitStack | 12 from contextlib import ExitStack |
| 12 from functools import partial | 13 from functools import partial |
| 13 from threading import Thread | 14 from threading import Thread |
| 14 | 15 |
| 15 import pysam | 16 import pysam |
| 17 | |
| 18 | |
| 19 AlleleStats = namedtuple( | |
| 20 "AlleleStats", | |
| 21 [ | |
| 22 'reads_total', | |
| 23 'reads_fw', | |
| 24 'reads_rv', | |
| 25 'avg_mapqual', | |
| 26 'avg_basequal', | |
| 27 'avg_dist_from_center', | |
| 28 'avg_mismatch_fraction', | |
| 29 'avg_mismatch_qualsum', | |
| 30 'avg_clipped_len', | |
| 31 'avg_dist_from_3prime' | |
| 32 ] | |
| 33 ) | |
| 34 | |
| 35 | |
| 36 def _get_allele_specific_pileup_column_stats(pileups, ref_fetch, | |
| 37 ignore_md, ignore_nm, | |
| 38 mm_runs, detect_q2_runs): | |
| 39 var_reads_plus = var_reads_minus = 0 | |
| 40 sum_mapping_qualities = 0 | |
| 41 sum_base_qualities = 0 | |
| 42 sum_dist_from_center = 0 | |
| 43 sum_dist_from_3prime = 0 | |
| 44 sum_clipped_length = 0 | |
| 45 sum_unclipped_length = 0 | |
| 46 sum_mismatch_fractions = 0 | |
| 47 sum_mismatch_qualities = 0 | |
| 48 | |
| 49 for p in pileups: | |
| 50 if p.alignment.is_reverse: | |
| 51 var_reads_minus += 1 | |
| 52 else: | |
| 53 var_reads_plus += 1 | |
| 54 sum_mapping_qualities += p.alignment.mapping_quality | |
| 55 sum_base_qualities += p.alignment.query_qualities[p.query_position] | |
| 56 sum_clipped_length += p.alignment.query_alignment_length | |
| 57 unclipped_length = p.alignment.query_length | |
| 58 sum_unclipped_length += unclipped_length | |
| 59 # The following calculations are all in 1-based coordinates | |
| 60 # with respect to the physical 5'-end of the read sequence. | |
| 61 if p.alignment.is_reverse: | |
| 62 read_base_pos = unclipped_length - p.query_position | |
| 63 read_first_aln_pos = ( | |
| 64 unclipped_length - p.alignment.query_alignment_end + 1 | |
| 65 ) | |
| 66 read_last_aln_pos = ( | |
| 67 unclipped_length - p.alignment.query_alignment_start | |
| 68 ) | |
| 69 qualities_3to5prime = p.alignment.query_alignment_qualities | |
| 70 else: | |
| 71 read_base_pos = p.query_position + 1 | |
| 72 read_first_aln_pos = p.alignment.query_alignment_start + 1 | |
| 73 read_last_aln_pos = p.alignment.query_alignment_end | |
| 74 qualities_3to5prime = reversed( | |
| 75 p.alignment.query_alignment_qualities | |
| 76 ) | |
| 77 | |
| 78 read_last_effective_aln_pos = read_last_aln_pos | |
| 79 if detect_q2_runs: | |
| 80 # Note: the original bam-readcount algorithm always takes | |
| 81 # terminal q2 runs into account when determining the | |
| 82 # effective 3'-ends of reads. | |
| 83 # However, q2 runs have no special meaning since Illumina | |
| 84 # pipeline version 1.8 so detecting them is optional | |
| 85 # in this code. | |
| 86 for qual in qualities_3to5prime: | |
| 87 if qual != 2: | |
| 88 break | |
| 89 read_last_effective_aln_pos -= 1 | |
| 90 | |
| 91 # Note: the original bam-readcount algorithm defines the | |
| 92 # read center as: | |
| 93 # read_center = p.alignment.query_alignment_length / 2 | |
| 94 # This is less accurate than the implementation here. | |
| 95 read_center = (read_first_aln_pos + read_last_aln_pos) / 2 | |
| 96 sum_dist_from_center += 1 - abs( | |
| 97 read_base_pos - read_center | |
| 98 ) / (read_center - 1) | |
| 99 # To calculate the distance of base positions from the 3'-ends | |
| 100 # of reads bam-readcount uses the formula: | |
| 101 # sum_dist_from_3prime += abs( | |
| 102 # read_last_effective_aln_pos - read_base_pos | |
| 103 # ) / (unclipped_length - 1) | |
| 104 # , which treats base positions on both sides of the effective | |
| 105 # 3'-end equally. Since this seems hard to justify, we cap | |
| 106 # the distance calculation at 0 for base positions past the | |
| 107 # effective 3'-end (which, in turn, should only be possible | |
| 108 # with detection of q2 runs). | |
| 109 if read_last_effective_aln_pos > read_base_pos: | |
| 110 sum_dist_from_3prime += ( | |
| 111 read_last_effective_aln_pos - read_base_pos | |
| 112 ) / (unclipped_length - 1) | |
| 113 | |
| 114 if sum_mismatch_fractions >= 0 or sum_mismatch_qualities >= 0: | |
| 115 # sum_mismatch_fractions and sum_mismatch_qualities will be | |
| 116 # set to float('nan') if reference info (in form of an MD tag or | |
| 117 # an actual reference sequence) was not available for any previous | |
| 118 # read in the pileup. | |
| 119 # In that case, there is no point in trying to calculate | |
| 120 # mismatch stats for other reads anymore. | |
| 121 | |
| 122 # The following mismatch calculations use this logic: | |
| 123 # 1. For determining the edit distance between an aligned read and | |
| 124 # the reference, we follow the authoritative definition of the | |
| 125 # NM tag calculation in | |
| 126 # http://samtools.github.io/hts-specs/SAMtags.pdf | |
| 127 # | |
| 128 # For historical reasons, the result of this calculation will | |
| 129 # disagree more often than not with NM tag values calculated by | |
| 130 # other tools. | |
| 131 # If precalculated NM tag values are present on the aligned | |
| 132 # reads, these can be given preference through the use_nm flag. | |
| 133 # Doing so will mimic the behavior of bam-readcount, which | |
| 134 # requires and always just looks at NM tags. | |
| 135 # 2. For determining mismatch quality sums, a mismatch is defined | |
| 136 # differently and in accordance with the implementation in | |
| 137 # bam-readcount: | |
| 138 # - only mismatches (not inserted or deleted bases) are | |
| 139 # considered | |
| 140 # - 'N' in the reference is considered a match for any read base | |
| 141 # - any matching (case-insensitive) base in reference and read | |
| 142 # is considered a match, even if that base is not one of | |
| 143 # A, C, G or T. | |
| 144 # In both 1. and 2. above a '=' in the read is always considered a | |
| 145 # match, irrespective of the reference base. | |
| 146 | |
| 147 num_mismatches = 0 | |
| 148 if not ignore_md: | |
| 149 try: | |
| 150 # see if the read has an MD tag, in which case pysam can | |
| 151 # calculate the reference sequence for us | |
| 152 ref_seq = p.alignment.get_reference_sequence().upper() | |
| 153 except ValueError: | |
| 154 ignore_md = True | |
| 155 if not ignore_nm: | |
| 156 try: | |
| 157 num_mismatches = p.alignment.get_tag('NM') | |
| 158 except KeyError: | |
| 159 ignore_nm = True | |
| 160 | |
| 161 if ignore_md: | |
| 162 if not ref_fetch: | |
| 163 # cannot calculate mismatch stats without ref info | |
| 164 sum_mismatch_qualities = float('nan') | |
| 165 if ignore_nm: | |
| 166 sum_mismatch_fractions = float('nan') | |
| 167 else: | |
| 168 sum_mismatch_fractions += ( | |
| 169 num_mismatches / p.alignment.query_alignment_length | |
| 170 ) | |
| 171 continue | |
| 172 # Without an MD tag we need to extract the relevant part | |
| 173 # of the reference from the full sequence by position. | |
| 174 ref_positions = p.alignment.get_reference_positions() | |
| 175 ref_seq = ref_fetch( | |
| 176 ref_positions[0], ref_positions[-1] + 1 | |
| 177 ).upper() | |
| 178 | |
| 179 potential_matches = {'A', 'C', 'G', 'T'} | |
| 180 aligned_pairs = p.alignment.get_aligned_pairs(matches_only=True) | |
| 181 ref_offset = aligned_pairs[0][1] | |
| 182 last_mismatch_pos = None | |
| 183 mismatch_run_quals = [] | |
| 184 for qpos, rpos in aligned_pairs: | |
| 185 read_base = p.alignment.query_sequence[qpos].upper() | |
| 186 if read_base == '=': | |
| 187 # always treat the special read base '=' as a | |
| 188 # match, irrespective of the reference base | |
| 189 continue | |
| 190 ref_base = ref_seq[rpos - ref_offset] | |
| 191 # see if we have a mismatch to use for mismatch quality sum | |
| 192 # calculation | |
| 193 if ref_base != 'N' and ref_base != read_base: | |
| 194 if mm_runs: | |
| 195 if ( | |
| 196 last_mismatch_pos is None | |
| 197 ) or ( | |
| 198 last_mismatch_pos + 1 == qpos | |
| 199 ): | |
| 200 mismatch_run_quals.append( | |
| 201 p.alignment.query_qualities[qpos] | |
| 202 ) | |
| 203 else: | |
| 204 sum_mismatch_qualities += max(mismatch_run_quals) | |
| 205 mismatch_run_quals = [ | |
| 206 p.alignment.query_qualities[qpos] | |
| 207 ] | |
| 208 last_mismatch_pos = qpos | |
| 209 else: | |
| 210 sum_mismatch_qualities += ( | |
| 211 p.alignment.query_qualities[qpos] | |
| 212 ) | |
| 213 if ignore_nm: | |
| 214 # see if we have a mismatch that increases the edit | |
| 215 # distance according to the SAMtags specs | |
| 216 if ( | |
| 217 read_base not in potential_matches | |
| 218 ) or ( | |
| 219 ref_base not in potential_matches | |
| 220 ) or ( | |
| 221 read_base != ref_base | |
| 222 ): | |
| 223 num_mismatches += 1 | |
| 224 | |
| 225 if mismatch_run_quals: | |
| 226 sum_mismatch_qualities += max(mismatch_run_quals) | |
| 227 if ignore_nm: | |
| 228 # use the number of mismatches calculated above, | |
| 229 # but add inserted and deleted bases to it | |
| 230 cigar_stats = p.alignment.get_cigar_stats()[0] | |
| 231 num_mismatches += cigar_stats[1] + cigar_stats[2] | |
| 232 sum_mismatch_fractions += ( | |
| 233 num_mismatches / p.alignment.query_alignment_length | |
| 234 ) | |
| 235 | |
| 236 return ( | |
| 237 var_reads_plus, | |
| 238 var_reads_minus, | |
| 239 sum_mapping_qualities, | |
| 240 sum_base_qualities, | |
| 241 sum_dist_from_center, | |
| 242 sum_mismatch_fractions, | |
| 243 sum_mismatch_qualities, | |
| 244 sum_clipped_length, | |
| 245 sum_dist_from_3prime | |
| 246 ) | |
| 247 | |
| 248 | |
| 249 def get_allele_stats(reads, pos, alleles, | |
| 250 ref=None, ignore_md=True, ignore_nm=True, | |
| 251 mm_runs=True, detect_q2_runs=False, | |
| 252 pileup_args=None): | |
| 253 chrom, start, stop = pos | |
| 254 if pileup_args is None: | |
| 255 pileup_args = {} | |
| 256 if pileup_args.get('stepper') == 'samtools': | |
| 257 if pileup_args.get('compute_baq', True) is not False: | |
| 258 if 'fastafile' not in pileup_args: | |
| 259 pileup_args['fastafile'] = ref | |
| 260 # be careful when passing in a custom 'fastafile' option: | |
| 261 # providing it slows down pysam tremendously even if the option | |
| 262 # isn't actually required. | |
| 263 | |
| 264 pile = reads.pileup( | |
| 265 chrom, start, stop, | |
| 266 **pileup_args | |
| 267 ) | |
| 268 # apply false-positive filtering a la varscan fpfilter | |
| 269 # find the variant site in the pileup columns | |
| 270 for pile_column in pile: | |
| 271 if pile_column.reference_pos == start: | |
| 272 break | |
| 273 else: | |
| 274 # With no reads covering the genomic position | |
| 275 # we can only return "empty" allele stats | |
| 276 return [ | |
| 277 AlleleStats(0, 0, 0, *[float('nan')] * 7) | |
| 278 for allele in alleles | |
| 279 ] | |
| 280 | |
| 281 # extract required information | |
| 282 # overall read depth at the site | |
| 283 read_depth = pile_column.get_num_aligned() | |
| 284 assert read_depth > 0 | |
| 285 | |
| 286 alleles_stats = [] | |
| 287 for allele in alleles: | |
| 288 if '-' in allele: | |
| 289 allele, indel_type, indel_after = allele.partition('-') | |
| 290 indel_len = -len(indel_after) | |
| 291 elif '+' in allele: | |
| 292 allele, indel_type, indel_after = allele.partition('+') | |
| 293 indel_len = len(indel_after) | |
| 294 else: | |
| 295 indel_type = None | |
| 296 | |
| 297 stats_it = ( | |
| 298 p for p in pile_column.pileups | |
| 299 # skip reads that don't support the allele we're looking for | |
| 300 if (p.query_position is not None) | |
| 301 and (p.alignment.query_sequence[p.query_position] == allele) | |
| 302 ) | |
| 303 if indel_type == '-': | |
| 304 stats_it = ( | |
| 305 p for p in stats_it if p.indel == indel_len | |
| 306 ) | |
| 307 elif indel_type == '+': | |
| 308 stats_it = ( | |
| 309 p for p in stats_it if ( | |
| 310 p.indel == indel_len | |
| 311 ) and ( | |
| 312 p.alignment.query_sequence[ | |
| 313 p.query_position + 1: | |
| 314 p.query_position + 1 + indel_len | |
| 315 ] == indel_after | |
| 316 ) | |
| 317 ) | |
| 318 allele_stats = _get_allele_specific_pileup_column_stats( | |
| 319 stats_it, | |
| 320 partial( | |
| 321 pysam.FastaFile.fetch, ref, chrom | |
| 322 ) if ref else None, | |
| 323 ignore_md, | |
| 324 ignore_nm, | |
| 325 mm_runs, | |
| 326 detect_q2_runs | |
| 327 ) | |
| 328 | |
| 329 allele_reads_total = allele_stats[0] + allele_stats[1] | |
| 330 if allele_reads_total == 0: | |
| 331 # No stats without reads! | |
| 332 alleles_stats.append( | |
| 333 AlleleStats(read_depth, 0, 0, *[float('nan')] * 7) | |
| 334 ) | |
| 335 else: | |
| 336 alleles_stats.append( | |
| 337 AlleleStats( | |
| 338 read_depth, allele_stats[0], allele_stats[1], | |
| 339 *(i / allele_reads_total for i in allele_stats[2:]) | |
| 340 ) | |
| 341 ) | |
| 342 return alleles_stats | |
| 16 | 343 |
| 17 | 344 |
| 18 class VariantCallingError (RuntimeError): | 345 class VariantCallingError (RuntimeError): |
| 19 """Exception class for issues with samtools and varscan subprocesses.""" | 346 """Exception class for issues with samtools and varscan subprocesses.""" |
| 20 | 347 |
| 38 return msg_header + self.message | 365 return msg_header + self.message |
| 39 | 366 |
| 40 | 367 |
| 41 class VarScanCaller (object): | 368 class VarScanCaller (object): |
| 42 def __init__(self, ref_genome, bam_input_files, | 369 def __init__(self, ref_genome, bam_input_files, |
| 43 max_depth=None, | 370 max_depth=None, count_orphans=False, detect_overlaps=True, |
| 44 min_mapqual=None, min_basequal=None, | 371 min_mapqual=None, min_basequal=None, |
| 45 threads=1, verbose=False, quiet=True | 372 threads=1, verbose=False, quiet=True |
| 46 ): | 373 ): |
| 47 self.ref_genome = ref_genome | 374 self.ref_genome = ref_genome |
| 48 self.bam_input_files = bam_input_files | 375 self.bam_input_files = bam_input_files |
| 49 self.max_depth = max_depth | 376 self.max_depth = max_depth |
| 377 self.count_orphans = count_orphans | |
| 378 self.detect_overlaps = detect_overlaps | |
| 50 self.min_mapqual = min_mapqual | 379 self.min_mapqual = min_mapqual |
| 51 self.min_basequal = min_basequal | 380 self.min_basequal = min_basequal |
| 52 self.threads = threads | 381 self.threads = threads |
| 53 self.verbose = verbose | 382 self.verbose = verbose |
| 54 self.quiet = quiet | 383 self.quiet = quiet |
| 65 mode='wb', suffix='', delete=False, dir=os.getcwd() | 394 mode='wb', suffix='', delete=False, dir=os.getcwd() |
| 66 ) | 395 ) |
| 67 self.tmpfiles = [] | 396 self.tmpfiles = [] |
| 68 | 397 |
| 69 def _get_pysam_pileup_args(self): | 398 def _get_pysam_pileup_args(self): |
| 70 param_dict = {} | 399 # Compute the pileup args dynamically to account for possibly updated |
| 400 # instance attributes. | |
| 401 | |
| 402 # fixed default parameters | |
| 403 # Note on the fixed default for 'ignore_overlaps': | |
| 404 # In order to use the exact same pileup parameters during variant | |
| 405 # calling and postprocessing, we would have to compute the setting | |
| 406 # dynamically like so: | |
| 407 # if not self.detect_overlaps: | |
| 408 # param_dict['ignore_overlaps'] = False | |
| 409 # However, samtools/pysam implement overlap correction by manipulating | |
| 410 # (setting to zero) the lower qualities of bases that are part of an | |
| 411 # overlap (see | |
| 412 # https://sourceforge.net/p/samtools/mailman/message/32793789/). This | |
| 413 # manipulation has such a big undesirable effect on base quality stats | |
| 414 # calculated during postprocessing that calculating the stats on a | |
| 415 # slightly different pileup seems like the lesser evil. | |
| 416 | |
| 417 param_dict = { | |
| 418 'ignore_overlaps': False, | |
| 419 'compute_baq': False, | |
| 420 'stepper': 'samtools', | |
| 421 } | |
| 422 # user-controllable parameters | |
| 423 if self.count_orphans: | |
| 424 param_dict['ignore_orphans'] = False | |
| 71 if self.max_depth is not None: | 425 if self.max_depth is not None: |
| 72 param_dict['max_depth'] = self.max_depth | 426 param_dict['max_depth'] = self.max_depth |
| 73 if self.min_mapqual is not None: | 427 if self.min_mapqual is not None: |
| 74 param_dict['min_mapping_quality'] = self.min_mapqual | 428 param_dict['min_mapping_quality'] = self.min_mapqual |
| 75 if self.min_basequal is not None: | 429 if self.min_basequal is not None: |
| 76 param_dict['min_base_quality'] = self.min_basequal | 430 param_dict['min_base_quality'] = self.min_basequal |
| 77 param_dict['compute_baq'] = False | |
| 78 param_dict['stepper'] = 'samtools' | |
| 79 return param_dict | 431 return param_dict |
| 80 | 432 |
| 81 def varcall_parallel(self, normal_purity=None, tumor_purity=None, | 433 def varcall_parallel(self, normal_purity=None, tumor_purity=None, |
| 82 min_coverage=None, | 434 min_coverage=None, |
| 83 min_var_count=None, | 435 min_var_count=None, |
| 106 varcall_engine_options = [] | 458 varcall_engine_options = [] |
| 107 for option, value in varcall_engine_option_mapping: | 459 for option, value in varcall_engine_option_mapping: |
| 108 if value is not None: | 460 if value is not None: |
| 109 varcall_engine_options += [option, str(value)] | 461 varcall_engine_options += [option, str(value)] |
| 110 pileup_engine_options = ['-B'] | 462 pileup_engine_options = ['-B'] |
| 463 if self.count_orphans: | |
| 464 pileup_engine_options += ['-A'] | |
| 465 if not self.detect_overlaps: | |
| 466 pileup_engine_options += ['-x'] | |
| 111 if self.max_depth is not None: | 467 if self.max_depth is not None: |
| 112 pileup_engine_options += ['-d', str(self.max_depth)] | 468 pileup_engine_options += ['-d', str(self.max_depth)] |
| 113 if self.min_mapqual is not None: | 469 if self.min_mapqual is not None: |
| 114 pileup_engine_options += ['-q', str(self.min_mapqual)] | 470 pileup_engine_options += ['-q', str(self.min_mapqual)] |
| 115 if self.min_basequal is not None: | 471 if self.min_basequal is not None: |
| 504 # FREQ is easy to calculate from AD | 860 # FREQ is easy to calculate from AD |
| 505 del record.format['FREQ'] | 861 del record.format['FREQ'] |
| 506 del record.format['RD'] | 862 del record.format['RD'] |
| 507 del record.format['DP4'] | 863 del record.format['DP4'] |
| 508 | 864 |
| 509 def get_allele_specific_pileup_column_stats( | |
| 510 self, allele, pile_column, ref_fetch, use_md=True | |
| 511 ): | |
| 512 var_reads_plus = var_reads_minus = 0 | |
| 513 sum_mapping_qualities = 0 | |
| 514 sum_base_qualities = 0 | |
| 515 sum_dist_from_center = 0 | |
| 516 sum_dist_from_3prime = 0 | |
| 517 sum_clipped_length = 0 | |
| 518 sum_unclipped_length = 0 | |
| 519 sum_num_mismatches_as_fraction = 0 | |
| 520 sum_mismatch_qualities = 0 | |
| 521 | |
| 522 for p in pile_column.pileups: | |
| 523 # skip reads that don't support the allele we're looking for | |
| 524 if p.query_position is None: | |
| 525 continue | |
| 526 if p.alignment.query_sequence[p.query_position] != allele: | |
| 527 continue | |
| 528 | |
| 529 if p.alignment.is_reverse: | |
| 530 var_reads_minus += 1 | |
| 531 else: | |
| 532 var_reads_plus += 1 | |
| 533 sum_mapping_qualities += p.alignment.mapping_quality | |
| 534 sum_base_qualities += p.alignment.query_qualities[p.query_position] | |
| 535 sum_clipped_length += p.alignment.query_alignment_length | |
| 536 unclipped_length = p.alignment.query_length | |
| 537 sum_unclipped_length += unclipped_length | |
| 538 # The following calculations are all in 1-based coordinates | |
| 539 # with respect to the physical 5'-end of the read sequence. | |
| 540 if p.alignment.is_reverse: | |
| 541 read_base_pos = unclipped_length - p.query_position | |
| 542 read_first_aln_pos = ( | |
| 543 unclipped_length - p.alignment.query_alignment_end + 1 | |
| 544 ) | |
| 545 read_last_aln_pos = ( | |
| 546 unclipped_length - p.alignment.query_alignment_start | |
| 547 ) | |
| 548 else: | |
| 549 read_base_pos = p.query_position + 1 | |
| 550 read_first_aln_pos = p.alignment.query_alignment_start + 1 | |
| 551 read_last_aln_pos = p.alignment.query_alignment_end | |
| 552 # Note: the original bam-readcount algorithm uses the less accurate | |
| 553 # p.alignment.query_alignment_length / 2 as the read center | |
| 554 read_center = (read_first_aln_pos + read_last_aln_pos) / 2 | |
| 555 sum_dist_from_center += 1 - abs( | |
| 556 read_base_pos - read_center | |
| 557 ) / (read_center - 1) | |
| 558 # Note: the original bam-readcount algorithm uses a more | |
| 559 # complicated definition of the 3'-end of a read sequence, which | |
| 560 # clips off runs of bases with quality scores of exactly 2. | |
| 561 sum_dist_from_3prime += ( | |
| 562 read_last_aln_pos - read_base_pos | |
| 563 ) / (unclipped_length - 1) | |
| 564 | |
| 565 sum_num_mismatches = 0 | |
| 566 if use_md: | |
| 567 try: | |
| 568 # see if the read has an MD tag, in which case pysam can | |
| 569 # calculate the reference sequence for us | |
| 570 aligned_pairs = p.alignment.get_aligned_pairs( | |
| 571 matches_only=True, with_seq=True | |
| 572 ) | |
| 573 except ValueError: | |
| 574 use_md = False | |
| 575 if use_md: | |
| 576 # The ref sequence got calculated from alignment and MD tag. | |
| 577 for qpos, rpos, ref_base in aligned_pairs: | |
| 578 # pysam uses lowercase ref bases to indicate mismatches | |
| 579 if ( | |
| 580 ref_base == 'a' or ref_base == 't' or | |
| 581 ref_base == 'g' or ref_base == 'c' | |
| 582 ): | |
| 583 sum_num_mismatches += 1 | |
| 584 sum_mismatch_qualities += p.alignment.query_qualities[ | |
| 585 qpos | |
| 586 ] | |
| 587 else: | |
| 588 # We need to obtain the aligned portion of the reference | |
| 589 # sequence. | |
| 590 aligned_pairs = p.alignment.get_aligned_pairs( | |
| 591 matches_only=True | |
| 592 ) | |
| 593 # note that ref bases can be lowercase | |
| 594 ref_seq = ref_fetch( | |
| 595 aligned_pairs[0][1], aligned_pairs[-1][1] + 1 | |
| 596 ).upper() | |
| 597 ref_offset = aligned_pairs[0][1] | |
| 598 | |
| 599 for qpos, rpos in p.alignment.get_aligned_pairs( | |
| 600 matches_only=True | |
| 601 ): | |
| 602 # see if we have a mismatch to the reference at this | |
| 603 # position, but note that | |
| 604 # - the query base may be lowercase (SAM/BAM spec) | |
| 605 # - an '=', if present in the query seq, indicates a match | |
| 606 # to the reference (SAM/BAM spec) | |
| 607 # - there cannot be a mismatch with an N in the reference | |
| 608 ref_base = ref_seq[rpos - ref_offset] | |
| 609 read_base = p.alignment.query_sequence[qpos].upper() | |
| 610 if ( | |
| 611 read_base != '=' and ref_base != 'N' | |
| 612 ) and ( | |
| 613 ref_base != read_base | |
| 614 ): | |
| 615 sum_num_mismatches += 1 | |
| 616 sum_mismatch_qualities += p.alignment.query_qualities[ | |
| 617 qpos | |
| 618 ] | |
| 619 sum_num_mismatches_as_fraction += ( | |
| 620 sum_num_mismatches / p.alignment.query_alignment_length | |
| 621 ) | |
| 622 | |
| 623 var_reads_total = var_reads_plus + var_reads_minus | |
| 624 if var_reads_total == 0: | |
| 625 # No stats without reads! | |
| 626 return None | |
| 627 avg_mapping_quality = sum_mapping_qualities / var_reads_total | |
| 628 avg_basequality = sum_base_qualities / var_reads_total | |
| 629 avg_pos_as_fraction = sum_dist_from_center / var_reads_total | |
| 630 avg_num_mismatches_as_fraction = ( | |
| 631 sum_num_mismatches_as_fraction / var_reads_total | |
| 632 ) | |
| 633 avg_sum_mismatch_qualities = sum_mismatch_qualities / var_reads_total | |
| 634 avg_clipped_length = sum_clipped_length / var_reads_total | |
| 635 avg_distance_to_effective_3p_end = ( | |
| 636 sum_dist_from_3prime / var_reads_total | |
| 637 ) | |
| 638 | |
| 639 return ( | |
| 640 avg_mapping_quality, | |
| 641 avg_basequality, | |
| 642 var_reads_plus, | |
| 643 var_reads_minus, | |
| 644 avg_pos_as_fraction, | |
| 645 avg_num_mismatches_as_fraction, | |
| 646 avg_sum_mismatch_qualities, | |
| 647 avg_clipped_length, | |
| 648 avg_distance_to_effective_3p_end | |
| 649 ) | |
| 650 | |
| 651 def _postprocess_variant_records(self, invcf, | 865 def _postprocess_variant_records(self, invcf, |
| 652 min_var_count2, min_var_count2_lc, | 866 min_var_count2, min_var_count2_lc, |
| 653 min_var_freq2, max_somatic_p, | 867 min_var_freq2, min_var_count2_depth, |
| 654 max_somatic_p_depth, | |
| 655 min_ref_readpos, min_var_readpos, | 868 min_ref_readpos, min_var_readpos, |
| 656 min_ref_dist3, min_var_dist3, | 869 min_ref_dist3, min_var_dist3, |
| 870 detect_q2_runs, | |
| 657 min_ref_len, min_var_len, | 871 min_ref_len, min_var_len, |
| 658 max_relative_len_diff, | 872 max_relative_len_diff, |
| 659 min_strandedness, min_strand_reads, | 873 min_strandedness, min_strand_reads, |
| 660 min_ref_basequal, min_var_basequal, | 874 min_ref_basequal, min_var_basequal, |
| 661 max_basequal_diff, | 875 max_basequal_diff, |
| 662 min_ref_mapqual, min_var_mapqual, | 876 min_ref_mapqual, min_var_mapqual, |
| 663 max_mapqual_diff, | 877 max_mapqual_diff, |
| 664 max_ref_mmqs, max_var_mmqs, | 878 max_ref_mmqs, max_var_mmqs, |
| 665 min_mmqs_diff, max_mmqs_diff, | 879 min_mmqs_diff, max_mmqs_diff, |
| 880 ignore_md, | |
| 666 **args): | 881 **args): |
| 667 # set FILTER field according to Varscan criteria | 882 # set FILTER field according to Varscan criteria |
| 668 # multiple FILTER entries must be separated by semicolons | 883 # multiple FILTER entries must be separated by semicolons |
| 669 # No filters applied should be indicated with MISSING | 884 # No filters applied should be indicated with MISSING |
| 670 | 885 |
| 679 io_stack.enter_context( | 894 io_stack.enter_context( |
| 680 pysam.Samfile(fn, 'rb')) for fn in self.bam_input_files | 895 pysam.Samfile(fn, 'rb')) for fn in self.bam_input_files |
| 681 ) | 896 ) |
| 682 refseq = io_stack.enter_context(pysam.FastaFile(self.ref_genome)) | 897 refseq = io_stack.enter_context(pysam.FastaFile(self.ref_genome)) |
| 683 pileup_args = self._get_pysam_pileup_args() | 898 pileup_args = self._get_pysam_pileup_args() |
| 899 _get_stats = get_allele_stats | |
| 684 for record in invcf: | 900 for record in invcf: |
| 685 if any(len(allele) > 1 for allele in record.alleles): | 901 is_indel = False |
| 686 # skip indel postprocessing for the moment | |
| 687 yield record | |
| 688 continue | |
| 689 if record.alleles[0] == 'N': | 902 if record.alleles[0] == 'N': |
| 690 # It makes no sense to call SNPs against an unknown | 903 # It makes no sense to call SNPs against an unknown |
| 691 # reference base | 904 # reference base |
| 692 continue | 905 continue |
| 906 if len(record.alleles[0]) > 1: | |
| 907 # a deletion | |
| 908 is_indel = True | |
| 909 alleles = [ | |
| 910 record.alleles[1], record.alleles[0].replace( | |
| 911 record.alleles[1], record.alleles[1] + '-', 1 | |
| 912 ) | |
| 913 ] | |
| 914 elif len(record.alleles[1]) > 1: | |
| 915 # an insertion | |
| 916 is_indel = True | |
| 917 alleles = [ | |
| 918 record.alleles[0], | |
| 919 record.alleles[1].replace( | |
| 920 record.alleles[0], record.alleles[0] + '+', 1 | |
| 921 ) | |
| 922 ] | |
| 923 else: | |
| 924 # a regular SNV | |
| 925 alleles = record.alleles[:2] | |
| 693 # get pileup for genomic region affected by this variant | 926 # get pileup for genomic region affected by this variant |
| 694 if record.info['SS'] == '2': | 927 if record.info['SS'] == '2': |
| 695 # a somatic variant => generate pileup from tumor data | 928 # a somatic variant => generate pileup from tumor data |
| 696 pile = tumor_reads.pileup( | 929 reads_of_interest = tumor_reads |
| 697 record.chrom, record.start, record.stop, | 930 calculate_ref_stats = record.samples['TUMOR']['RD'] > 0 |
| 698 **pileup_args | |
| 699 ) | |
| 700 sample_of_interest = 'TUMOR' | |
| 701 elif record.info['SS'] in ['1', '3']: | 931 elif record.info['SS'] in ['1', '3']: |
| 702 # a germline or LOH variant => pileup from normal data | 932 # a germline or LOH variant => pileup from normal data |
| 703 pile = normal_reads.pileup( | 933 reads_of_interest = normal_reads |
| 704 record.chrom, record.start, record.stop, | 934 calculate_ref_stats = record.samples['NORMAL']['RD'] > 0 |
| 705 **pileup_args | |
| 706 ) | |
| 707 sample_of_interest = 'NORMAL' | |
| 708 else: | 935 else: |
| 709 # TO DO: figure out if there is anything interesting to do | 936 # TO DO: figure out if there is anything interesting to do |
| 710 # for SS status codes 0 (reference) and 5 (unknown) | 937 # for SS status codes 0 (reference) and 5 (unknown) |
| 711 yield record | 938 yield record |
| 712 continue | 939 continue |
| 713 | 940 |
| 714 # apply false-positive filtering a la varscan fpfilter | |
| 715 # find the variant site in the pileup columns | |
| 716 for pile_column in pile: | |
| 717 if pile_column.reference_pos == record.start: | |
| 718 break | |
| 719 # extract required information | |
| 720 # overall read depth at the site | |
| 721 read_depth = pile_column.get_num_aligned() | |
| 722 assert read_depth > 0 | |
| 723 # no multiallelic sites in varscan | 941 # no multiallelic sites in varscan |
| 724 assert len(record.alleles) == 2 | 942 assert len(record.alleles) == 2 |
| 725 if record.samples[sample_of_interest]['RD'] > 0: | 943 |
| 726 ref_stats, alt_stats = [ | 944 if calculate_ref_stats: |
| 727 self.get_allele_specific_pileup_column_stats( | 945 ref_stats, alt_stats = _get_stats( |
| 728 allele, | 946 reads_of_interest, |
| 729 pile_column, | 947 (record.chrom, record.start, record.stop), |
| 730 partial( | 948 alleles, |
| 731 pysam.FastaFile.fetch, refseq, record.chrom | 949 refseq, |
| 732 ) | 950 ignore_md=ignore_md, |
| 733 ) | 951 ignore_nm=False, |
| 734 for allele in record.alleles | 952 mm_runs=True, |
| 735 ] | 953 detect_q2_runs=detect_q2_runs, |
| 954 pileup_args=pileup_args | |
| 955 ) | |
| 736 else: | 956 else: |
| 737 ref_stats = None | 957 ref_stats = None |
| 738 alt_stats = self.get_allele_specific_pileup_column_stats( | 958 alt_stats = _get_stats( |
| 739 record.alleles[1], | 959 reads_of_interest, |
| 740 pile_column, | 960 (record.chrom, record.start, record.stop), |
| 741 partial(pysam.FastaFile.fetch, refseq, record.chrom) | 961 alleles[1:2], |
| 742 ) | 962 refseq, |
| 963 ignore_md=ignore_md, | |
| 964 ignore_nm=False, | |
| 965 mm_runs=True, | |
| 966 detect_q2_runs=detect_q2_runs, | |
| 967 pileup_args=pileup_args | |
| 968 )[0] | |
| 743 ref_count = 0 | 969 ref_count = 0 |
| 744 if ref_stats: | 970 if ref_stats: |
| 745 ref_count = ref_stats[2] + ref_stats[3] | 971 ref_count = ref_stats.reads_fw + ref_stats.reads_rv |
| 746 if ref_stats[1] < min_ref_basequal: | 972 if ref_stats.avg_basequal < min_ref_basequal: |
| 747 record.filter.add('RefBaseQual') | 973 record.filter.add('RefBaseQual') |
| 748 if ref_count >= 2: | 974 if ref_count >= 2: |
| 749 if ref_stats[0] < min_ref_mapqual: | 975 if ref_stats.avg_mapqual < min_ref_mapqual: |
| 750 record.filter.add('RefMapQual') | 976 record.filter.add('RefMapQual') |
| 751 if ref_stats[4] < min_ref_readpos: | 977 if ref_stats.avg_dist_from_center < min_ref_readpos: |
| 752 record.filter.add('RefReadPos') | 978 record.filter.add('RefReadPos') |
| 753 # ref_stats[5] (avg_num_mismatches_as_fraction | 979 # ref_stats.avg_mismatch_fraction |
| 754 # is not a filter criterion in VarScan fpfilter | 980 # is not a filter criterion in VarScan fpfilter |
| 755 if ref_stats[6] > max_ref_mmqs: | 981 if ref_stats.avg_mismatch_qualsum > max_ref_mmqs: |
| 756 record.filter.add('RefMMQS') | 982 record.filter.add('RefMMQS') |
| 757 if ref_stats[7] < min_ref_len: | 983 if not is_indel and ( |
| 984 ref_stats.avg_clipped_len < min_ref_len | |
| 985 ): | |
| 758 # VarScan fpfilter does not apply this filter | 986 # VarScan fpfilter does not apply this filter |
| 759 # for indels, but there is no reason | 987 # for indels, so we do not do it either. |
| 760 # not to do it. | |
| 761 record.filter.add('RefAvgRL') | 988 record.filter.add('RefAvgRL') |
| 762 if ref_stats[8] < min_ref_dist3: | 989 if ref_stats.avg_dist_from_3prime < min_ref_dist3: |
| 763 record.filter.add('RefDist3') | 990 record.filter.add('RefDist3') |
| 764 if alt_stats: | 991 if alt_stats: |
| 765 alt_count = alt_stats[2] + alt_stats[3] | 992 alt_count = alt_stats.reads_fw + alt_stats.reads_rv |
| 766 if ( | 993 if alt_count < min_var_count2: |
| 767 alt_count < min_var_count2_lc | 994 if ( |
| 768 ) or ( | 995 (alt_count + ref_count) >= min_var_count2_depth |
| 769 read_depth >= max_somatic_p_depth and | 996 ) or ( |
| 770 alt_count < min_var_count2 | 997 alt_count < min_var_count2_lc |
| 998 ): | |
| 999 record.filter.add('VarCount') | |
| 1000 if alt_count / alt_stats.reads_total < min_var_freq2: | |
| 1001 record.filter.add('VarFreq') | |
| 1002 if not is_indel and ( | |
| 1003 alt_stats.avg_basequal < min_var_basequal | |
| 771 ): | 1004 ): |
| 772 record.filter.add('VarCount') | |
| 773 if alt_count / read_depth < min_var_freq2: | |
| 774 record.filter.add('VarFreq') | |
| 775 if alt_stats[1] < min_var_basequal: | |
| 776 record.filter.add('VarBaseQual') | 1005 record.filter.add('VarBaseQual') |
| 777 if alt_count > min_strand_reads: | 1006 if alt_count >= min_strand_reads: |
| 778 if ( | 1007 if ( |
| 779 alt_stats[2] / alt_count < min_strandedness | 1008 alt_stats.reads_fw / alt_count < min_strandedness |
| 780 ) or ( | 1009 ) or ( |
| 781 alt_stats[3] / alt_count < min_strandedness | 1010 alt_stats.reads_rv / alt_count < min_strandedness |
| 782 ): | 1011 ): |
| 783 record.filter.add('Strand') | 1012 record.filter.add('Strand') |
| 784 if alt_stats[2] + alt_stats[3] >= 2: | 1013 if alt_count >= 2: |
| 785 if alt_stats[0] < min_var_mapqual: | 1014 if alt_stats.avg_mapqual < min_var_mapqual: |
| 786 record.filter.add('VarMapQual') | 1015 record.filter.add('VarMapQual') |
| 787 if alt_stats[4] < min_var_readpos: | 1016 if alt_stats.avg_dist_from_center < min_var_readpos: |
| 788 record.filter.add('VarReadPos') | 1017 record.filter.add('VarReadPos') |
| 789 # alt_stats[5] (avg_num_mismatches_as_fraction | 1018 # alt_stats.avg_mismatch_fraction |
| 790 # is not a filter criterion in VarScan fpfilter | 1019 # is not a filter criterion in VarScan fpfilter |
| 791 if alt_stats[6] > max_var_mmqs: | 1020 if alt_stats.avg_mismatch_qualsum > max_var_mmqs: |
| 792 record.filter.add('VarMMQS') | 1021 record.filter.add('VarMMQS') |
| 793 if alt_stats[7] < min_var_len: | 1022 if not is_indel and ( |
| 1023 alt_stats.avg_clipped_len < min_var_len | |
| 1024 ): | |
| 794 # VarScan fpfilter does not apply this filter | 1025 # VarScan fpfilter does not apply this filter |
| 795 # for indels, but there is no reason | 1026 # for indels, so we do not do it either. |
| 796 # not to do it. | |
| 797 record.filter.add('VarAvgRL') | 1027 record.filter.add('VarAvgRL') |
| 798 if alt_stats[8] < min_var_dist3: | 1028 if alt_stats.avg_dist_from_3prime < min_var_dist3: |
| 799 record.filter.add('VarDist3') | 1029 record.filter.add('VarDist3') |
| 800 if ref_count >= 2 and alt_count >= 2: | 1030 if ref_count >= 2 and alt_count >= 2: |
| 801 if (ref_stats[0] - alt_stats[0]) > max_mapqual_diff: | 1031 if ( |
| 1032 ref_stats.avg_mapqual - alt_stats.avg_mapqual | |
| 1033 ) > max_mapqual_diff: | |
| 802 record.filter.add('MapQualDiff') | 1034 record.filter.add('MapQualDiff') |
| 803 if (ref_stats[1] - alt_stats[1]) > max_basequal_diff: | 1035 if ( |
| 1036 ref_stats.avg_basequal - alt_stats.avg_basequal | |
| 1037 ) > max_basequal_diff: | |
| 804 record.filter.add('MaxBAQdiff') | 1038 record.filter.add('MaxBAQdiff') |
| 805 mmqs_diff = alt_stats[6] - ref_stats[6] | 1039 mmqs_diff = ( |
| 1040 alt_stats.avg_mismatch_qualsum | |
| 1041 - ref_stats.avg_mismatch_qualsum | |
| 1042 ) | |
| 806 if mmqs_diff < min_mmqs_diff: | 1043 if mmqs_diff < min_mmqs_diff: |
| 807 record.filter.add('MinMMQSdiff') | 1044 record.filter.add('MinMMQSdiff') |
| 808 if mmqs_diff > max_mmqs_diff: | 1045 if mmqs_diff > max_mmqs_diff: |
| 809 record.filter.add('MMQSdiff') | 1046 record.filter.add('MMQSdiff') |
| 810 if ( | 1047 if ( |
| 811 1 - alt_stats[7] / ref_stats[7] | 1048 1 - |
| 1049 alt_stats.avg_clipped_len | |
| 1050 / ref_stats.avg_clipped_len | |
| 812 ) > max_relative_len_diff: | 1051 ) > max_relative_len_diff: |
| 813 record.filter.add('ReadLenDiff') | 1052 record.filter.add('ReadLenDiff') |
| 814 else: | 1053 else: |
| 815 # No variant-supporting reads for this record! | 1054 # No variant-supporting reads for this record! |
| 816 # This can happen in rare cases because of | 1055 # This can happen in rare cases because of |
| 992 out = (output_path, None) | 1231 out = (output_path, None) |
| 993 | 1232 |
| 994 instance_args = { | 1233 instance_args = { |
| 995 k: args.pop(k) for k in [ | 1234 k: args.pop(k) for k in [ |
| 996 'max_depth', | 1235 'max_depth', |
| 1236 'count_orphans', | |
| 1237 'detect_overlaps', | |
| 997 'min_mapqual', | 1238 'min_mapqual', |
| 998 'min_basequal', | 1239 'min_basequal', |
| 999 'threads', | 1240 'threads', |
| 1000 'verbose', | 1241 'verbose', |
| 1001 'quiet' | 1242 'quiet' |
| 1044 'basename for the two output files to be generated (see ' | 1285 'basename for the two output files to be generated (see ' |
| 1045 '-s|--split-output below).' | 1286 '-s|--split-output below).' |
| 1046 ) | 1287 ) |
| 1047 p.add_argument( | 1288 p.add_argument( |
| 1048 '-s', '--split-output', | 1289 '-s', '--split-output', |
| 1049 dest='split_output', action='store_true', default=False, | 1290 dest='split_output', action='store_true', |
| 1050 help='indicate that separate output files for SNPs and indels ' | 1291 help='indicate that separate output files for SNPs and indels ' |
| 1051 'should be generated (original VarScan behavior). If specified, ' | 1292 'should be generated (original VarScan behavior). If specified, ' |
| 1052 '%%T in the --ofile file name will be replaced with "snp" and ' | 1293 '%%T in the --ofile file name will be replaced with "snp" and ' |
| 1053 '"indel" to generate the names of the SNP and indel output ' | 1294 '"indel" to generate the names of the SNP and indel output ' |
| 1054 'files, respectively. If %%T is not found in the file name, it ' | 1295 'files, respectively. If %%T is not found in the file name, it ' |
| 1088 dest='max_depth', type=int, default=8000, | 1329 dest='max_depth', type=int, default=8000, |
| 1089 help='Maximum depth of generated pileups (samtools mpileup -d option; ' | 1330 help='Maximum depth of generated pileups (samtools mpileup -d option; ' |
| 1090 'default: 8000)' | 1331 'default: 8000)' |
| 1091 ) | 1332 ) |
| 1092 call_group.add_argument( | 1333 call_group.add_argument( |
| 1334 '--count-orphans', | |
| 1335 dest='count_orphans', action='store_true', | |
| 1336 help='Use anomalous read pairs in variant calling ' | |
| 1337 '(samtools mpileup -A option; default: ignore anomalous pairs)' | |
| 1338 ) | |
| 1339 call_group.add_argument( | |
| 1340 '--no-detect-overlaps', | |
| 1341 dest='detect_overlaps', action='store_false', | |
| 1342 help='Disable automatic read-pair overlap detection by samtools ' | |
| 1343 'mpileup. Overlap detection tries to avoid counting duplicated ' | |
| 1344 'bases twice during variant calling. ' | |
| 1345 '(samtools mpileup -x option; default: use overlap detection)' | |
| 1346 ) | |
| 1347 call_group.add_argument( | |
| 1093 '--min-basequal', | 1348 '--min-basequal', |
| 1094 dest='min_basequal', type=int, | 1349 dest='min_basequal', type=int, |
| 1095 default=13, | 1350 default=13, |
| 1096 help='Minimum base quality at the variant position to use a read ' | 1351 help='Minimum base quality at the variant position to use a read ' |
| 1097 '(default: 13)' | 1352 '(default: 13)' |
| 1158 filter_group.add_argument( | 1413 filter_group.add_argument( |
| 1159 '--min-var-count2-lc', | 1414 '--min-var-count2-lc', |
| 1160 dest='min_var_count2_lc', type=int, | 1415 dest='min_var_count2_lc', type=int, |
| 1161 default=2, | 1416 default=2, |
| 1162 help='Minimum number of variant-supporting reads when depth below ' | 1417 help='Minimum number of variant-supporting reads when depth below ' |
| 1163 '--somatic-p-depth (default: 2)' | 1418 '--min-var-count2-depth (default: 2)' |
| 1419 ) | |
| 1420 filter_group.add_argument( | |
| 1421 '--min-var-count2-depth', | |
| 1422 dest='min_var_count2_depth', type=int, | |
| 1423 default=10, | |
| 1424 help='Combined depth of ref- and variant-supporting reads required to ' | |
| 1425 'apply the --min-var-count2 filter instead of --min-var-count2-lc ' | |
| 1426 '(default: 10)' | |
| 1164 ) | 1427 ) |
| 1165 filter_group.add_argument( | 1428 filter_group.add_argument( |
| 1166 '--min-var-freq2', | 1429 '--min-var-freq2', |
| 1167 dest='min_var_freq2', type=float, | 1430 dest='min_var_freq2', type=float, |
| 1168 default=0.05, | 1431 default=0.05, |
| 1169 help='Minimum variant allele frequency (default: 0.05)' | 1432 help='Minimum variant allele frequency (default: 0.05)' |
| 1170 ) | |
| 1171 filter_group.add_argument( | |
| 1172 '--max-somatic-p', | |
| 1173 dest='max_somatic_p', type=float, | |
| 1174 default=0.05, | |
| 1175 help='Maximum somatic p-value (default: 0.05)' | |
| 1176 ) | |
| 1177 filter_group.add_argument( | |
| 1178 '--max-somatic-p-depth', | |
| 1179 dest='max_somatic_p_depth', type=int, | |
| 1180 default=10, | |
| 1181 help='Depth required to run --max-somatic-p filter (default: 10)' | |
| 1182 ) | 1433 ) |
| 1183 filter_group.add_argument( | 1434 filter_group.add_argument( |
| 1184 '--min-ref-readpos', | 1435 '--min-ref-readpos', |
| 1185 dest='min_ref_readpos', type=float, | 1436 dest='min_ref_readpos', type=float, |
| 1186 default=0.1, | 1437 default=0.1, |
| 1207 default=0.1, | 1458 default=0.1, |
| 1208 help='Minimum average relative distance of site from the effective ' | 1459 help='Minimum average relative distance of site from the effective ' |
| 1209 '3\'end of variant-supporting reads (default: 0.1)' | 1460 '3\'end of variant-supporting reads (default: 0.1)' |
| 1210 ) | 1461 ) |
| 1211 filter_group.add_argument( | 1462 filter_group.add_argument( |
| 1463 '--detect-q2-runs', | |
| 1464 dest='detect_q2_runs', action='store_true', | |
| 1465 help='Look for 3\'-terminal q2 runs and take their lengths into ' | |
| 1466 'account for determining the effective 3\'end of reads ' | |
| 1467 '(default: off)' | |
| 1468 ) | |
| 1469 filter_group.add_argument( | |
| 1212 '--min-ref-len', | 1470 '--min-ref-len', |
| 1213 dest='min_ref_len', type=int, | 1471 dest='min_ref_len', type=int, |
| 1214 default=90, | 1472 default=90, |
| 1215 help='Minimum average trimmed length of reads supporting the ref ' | 1473 help='Minimum average trimmed length of reads supporting the ref ' |
| 1216 'allele (default: 90)' | 1474 'allele (default: 90)' |
| 1307 '--max-mmqs-diff', | 1565 '--max-mmqs-diff', |
| 1308 dest='max_mmqs_diff', type=int, | 1566 dest='max_mmqs_diff', type=int, |
| 1309 default=50, | 1567 default=50, |
| 1310 help='Maximum mismatch quality sum difference (var - ref; default: 50)' | 1568 help='Maximum mismatch quality sum difference (var - ref; default: 50)' |
| 1311 ) | 1569 ) |
| 1570 filter_group.add_argument( | |
| 1571 '--ignore-md', | |
| 1572 dest='ignore_md', action='store_true', | |
| 1573 help='Do not rely on read MD tag, even if it is present on the mapped ' | |
| 1574 'reads, and recalculate mismatch quality stats from ref ' | |
| 1575 'alignments instead.' | |
| 1576 ) | |
| 1577 | |
| 1312 args = vars(p.parse_args()) | 1578 args = vars(p.parse_args()) |
| 1313 varscan_call(**args) | 1579 varscan_call(**args) |
