comparison varscan.py @ 9:4e97191a1ff7 draft

"planemo upload for repository https://github.com/galaxyproject/iuc/tree/master/tools/varscan commit fcf5ac14629c694f0f64773fab0428b1e78fe156"
author iuc
date Fri, 16 Aug 2019 15:49:54 -0400
parents b79bb8b09822
children cf8ffc79db67
comparison
equal deleted inserted replaced
8:b79bb8b09822 9:4e97191a1ff7
6 import os 6 import os
7 import subprocess 7 import subprocess
8 import sys 8 import sys
9 import tempfile 9 import tempfile
10 import time 10 import time
11 from collections import namedtuple
11 from contextlib import ExitStack 12 from contextlib import ExitStack
12 from functools import partial 13 from functools import partial
13 from threading import Thread 14 from threading import Thread
14 15
15 import pysam 16 import pysam
17
18
# Immutable record of the per-allele read statistics computed by
# get_allele_stats: total read depth at the site, forward/reverse
# supporting-read counts, and seven per-read averages.
AlleleStats = namedtuple(
    'AlleleStats',
    'reads_total reads_fw reads_rv '
    'avg_mapqual avg_basequal '
    'avg_dist_from_center avg_mismatch_fraction avg_mismatch_qualsum '
    'avg_clipped_len avg_dist_from_3prime'
)
34
35
def _get_allele_specific_pileup_column_stats(pileups, ref_fetch,
                                             ignore_md, ignore_nm,
                                             mm_runs, detect_q2_runs):
    """Accumulate raw per-read statistics over a set of pileup reads.

    Parameters:
    pileups -- iterable of pysam pileup reads that all support one
        specific allele at one genomic site
    ref_fetch -- callable taking (start, stop) and returning the
        corresponding reference sequence, or None if no reference
        sequence is available
    ignore_md -- if True, do not try to reconstruct the reference from
        MD tags; once a read without a usable MD tag is seen, this is
        switched on for all remaining reads
    ignore_nm -- if True, do not take edit distances from NM tags;
        switched on for all remaining reads once a read lacks the tag
    mm_runs -- if True, only the highest base quality within each run
        of consecutive mismatches counts towards the mismatch quality
        sum (instead of every mismatched base)
    detect_q2_runs -- if True, a terminal run of bases with quality 2
        is excluded when determining a read's effective 3'-end

    Returns a 9-tuple of raw counts/sums (callers compute averages):
    (reads_fw, reads_rv, sum_mapping_qualities, sum_base_qualities,
     sum_dist_from_center, sum_mismatch_fractions,
     sum_mismatch_qualities, sum_clipped_length, sum_dist_from_3prime).
    The two mismatch sums are float('nan') if the reference information
    required to compute them was unavailable.
    """
    var_reads_plus = var_reads_minus = 0
    sum_mapping_qualities = 0
    sum_base_qualities = 0
    sum_dist_from_center = 0
    sum_dist_from_3prime = 0
    sum_clipped_length = 0
    sum_mismatch_fractions = 0
    sum_mismatch_qualities = 0
    # Bases that can constitute a match according to the SAMtags specs;
    # hoisted out of the read loop because it is loop-invariant.
    potential_matches = {'A', 'C', 'G', 'T'}

    for p in pileups:
        if p.alignment.is_reverse:
            var_reads_minus += 1
        else:
            var_reads_plus += 1
        sum_mapping_qualities += p.alignment.mapping_quality
        sum_base_qualities += p.alignment.query_qualities[p.query_position]
        sum_clipped_length += p.alignment.query_alignment_length
        unclipped_length = p.alignment.query_length
        # The following calculations are all in 1-based coordinates
        # with respect to the physical 5'-end of the read sequence.
        if p.alignment.is_reverse:
            read_base_pos = unclipped_length - p.query_position
            read_first_aln_pos = (
                unclipped_length - p.alignment.query_alignment_end + 1
            )
            read_last_aln_pos = (
                unclipped_length - p.alignment.query_alignment_start
            )
            qualities_3to5prime = p.alignment.query_alignment_qualities
        else:
            read_base_pos = p.query_position + 1
            read_first_aln_pos = p.alignment.query_alignment_start + 1
            read_last_aln_pos = p.alignment.query_alignment_end
            qualities_3to5prime = reversed(
                p.alignment.query_alignment_qualities
            )

        read_last_effective_aln_pos = read_last_aln_pos
        if detect_q2_runs:
            # Note: the original bam-readcount algorithm always takes
            # terminal q2 runs into account when determining the
            # effective 3'-ends of reads.
            # However, q2 runs have no special meaning since Illumina
            # pipeline version 1.8 so detecting them is optional
            # in this code.
            for qual in qualities_3to5prime:
                if qual != 2:
                    break
                read_last_effective_aln_pos -= 1

        # Note: the original bam-readcount algorithm defines the
        # read center as:
        # read_center = p.alignment.query_alignment_length / 2
        # This is less accurate than the implementation here.
        read_center = (read_first_aln_pos + read_last_aln_pos) / 2
        sum_dist_from_center += 1 - abs(
            read_base_pos - read_center
        ) / (read_center - 1)
        # To calculate the distance of base positions from the 3'-ends
        # of reads bam-readcount uses the formula:
        # sum_dist_from_3prime += abs(
        #     read_last_effective_aln_pos - read_base_pos
        # ) / (unclipped_length - 1)
        # , which treats base positions on both sides of the effective
        # 3'-end equally. Since this seems hard to justify, we cap
        # the distance calculation at 0 for base positions past the
        # effective 3'-end (which, in turn, should only be possible
        # with detection of q2 runs).
        if read_last_effective_aln_pos > read_base_pos:
            sum_dist_from_3prime += (
                read_last_effective_aln_pos - read_base_pos
            ) / (unclipped_length - 1)

        if sum_mismatch_fractions >= 0 or sum_mismatch_qualities >= 0:
            # sum_mismatch_fractions and sum_mismatch_qualities will be
            # set to float('nan') if reference info (in form of an MD tag or
            # an actual reference sequence) was not available for any previous
            # read in the pileup.
            # In that case, there is no point to trying to calculate
            # mismatch stats for other reads anymore.

            # The following mismatch calculations use this logic:
            # 1. For determining the edit distance between an aligned read and
            #    the reference, we follow the authoritative definition of the
            #    NM tag calculation in
            #    http://samtools.github.io/hts-specs/SAMtags.pdf
            #
            #    For historical reasons, the result of this calculation will
            #    disagree more often than not with NM tag values calculated by
            #    other tools.
            #    If precalculated NM tag values are present on the aligned
            #    reads, these can be given preference through the use_nm flag.
            #    Doing so will mimick the behavior of bam-readcount, which
            #    requires and always just looks at NM tags.
            # 2. For determining mismatch quality sums, a mismatch is defined
            #    differently and in accordance with the implementation in
            #    bam-readcount:
            #    - only mismatches (not inserted or deleted bases) are
            #      considered
            #    - 'N' in the reference is considered a match for any read base
            #    - any matching (case-insensitive) base in reference and read
            #      is considered a match, even if that base is not one of
            #      A, C, G or T.
            # In both 1. and 2. above a '=' in the read is always considered a
            # match, irrespective of the reference base.

            num_mismatches = 0
            if not ignore_md:
                try:
                    # see if the read has an MD tag, in which case pysam can
                    # calculate the reference sequence for us
                    ref_seq = p.alignment.get_reference_sequence().upper()
                except ValueError:
                    ignore_md = True
            if not ignore_nm:
                try:
                    num_mismatches = p.alignment.get_tag('NM')
                except KeyError:
                    ignore_nm = True

            if ignore_md:
                if not ref_fetch:
                    # cannot calculate mismatch stats without ref info
                    sum_mismatch_qualities = float('nan')
                    if ignore_nm:
                        sum_mismatch_fractions = float('nan')
                    else:
                        sum_mismatch_fractions += (
                            num_mismatches / p.alignment.query_alignment_length
                        )
                    continue
                # Without an MD tag we need to extract the relevant part
                # of the reference from the full sequence by position.
                ref_positions = p.alignment.get_reference_positions()
                ref_seq = ref_fetch(
                    ref_positions[0], ref_positions[-1] + 1
                ).upper()

            aligned_pairs = p.alignment.get_aligned_pairs(matches_only=True)
            ref_offset = aligned_pairs[0][1]
            last_mismatch_pos = None
            mismatch_run_quals = []
            for qpos, rpos in aligned_pairs:
                read_base = p.alignment.query_sequence[qpos].upper()
                if read_base == '=':
                    # always treat the special read base '=' as a
                    # match, irrespective of the reference base
                    continue
                ref_base = ref_seq[rpos - ref_offset]
                # see if we have a mismatch to use for mismatch quality sum
                # calculation
                if ref_base != 'N' and ref_base != read_base:
                    if mm_runs:
                        if (
                            last_mismatch_pos is None
                        ) or (
                            last_mismatch_pos + 1 == qpos
                        ):
                            mismatch_run_quals.append(
                                p.alignment.query_qualities[qpos]
                            )
                        else:
                            sum_mismatch_qualities += max(mismatch_run_quals)
                            mismatch_run_quals = [
                                p.alignment.query_qualities[qpos]
                            ]
                        last_mismatch_pos = qpos
                    else:
                        sum_mismatch_qualities += (
                            p.alignment.query_qualities[qpos]
                        )
                if ignore_nm:
                    # see if we have a mismatch that increases the edit
                    # distance according to the SAMtags specs
                    if (
                        read_base not in potential_matches
                    ) or (
                        ref_base not in potential_matches
                    ) or (
                        read_base != ref_base
                    ):
                        num_mismatches += 1

            if mismatch_run_quals:
                sum_mismatch_qualities += max(mismatch_run_quals)
            if ignore_nm:
                # use the number of mismatches calculated above,
                # but add inserted and deleted bases to it
                cigar_stats = p.alignment.get_cigar_stats()[0]
                num_mismatches += cigar_stats[1] + cigar_stats[2]
            sum_mismatch_fractions += (
                num_mismatches / p.alignment.query_alignment_length
            )

    return (
        var_reads_plus,
        var_reads_minus,
        sum_mapping_qualities,
        sum_base_qualities,
        sum_dist_from_center,
        sum_mismatch_fractions,
        sum_mismatch_qualities,
        sum_clipped_length,
        sum_dist_from_3prime
    )
247
248
def get_allele_stats(reads, pos, alleles,
                     ref=None, ignore_md=True, ignore_nm=True,
                     mm_runs=True, detect_q2_runs=False,
                     pileup_args=None):
    """Return one AlleleStats tuple per allele at a genomic site.

    reads must offer a pysam-style pileup() method; pos is a
    (chrom, start, stop) triple. Indel alleles are encoded as
    'anchor_base+inserted_seq' (insertion) or 'anchor_base-deleted_seq'
    (deletion); all other alleles are matched as plain SNV bases.
    ref, if given, is a pysam.FastaFile used to look up reference
    sequence where needed.

    Side effect: with the 'samtools' stepper and BAQ computation not
    explicitly disabled, 'fastafile' is inserted into the caller's
    pileup_args dict if it is missing.
    """
    chrom, start, stop = pos
    if pileup_args is None:
        pileup_args = {}
    if (
        pileup_args.get('stepper') == 'samtools'
        and pileup_args.get('compute_baq', True) is not False
        and 'fastafile' not in pileup_args
    ):
        # Be careful when passing in a custom 'fastafile' option:
        # providing it slows down pysam tremendously even if the
        # option isn't actually required.
        pileup_args['fastafile'] = ref

    pileup_iter = reads.pileup(chrom, start, stop, **pileup_args)
    # Locate the pileup column covering the variant site itself
    # (used below for false-positive filtering a la varscan fpfilter).
    column = next(
        (col for col in pileup_iter if col.reference_pos == start), None
    )
    if column is None:
        # No reads cover the genomic position -> only "empty" allele
        # stats can be reported.
        return [
            AlleleStats(0, 0, 0, *(float('nan') for _ in range(7)))
            for _ in alleles
        ]

    # overall read depth at the site
    read_depth = column.get_num_aligned()
    assert read_depth > 0

    results = []
    for allele in alleles:
        # Split off a trailing indel specification, if any, recording
        # its sign ('-' => deletion, '+' => insertion) and length.
        indel_op = None
        for sep, sign in (('-', -1), ('+', 1)):
            if sep in allele:
                allele, indel_op, indel_seq = allele.partition(sep)
                indel_len = sign * len(indel_seq)
                break

        # Lazily filter the column down to reads supporting the allele.
        supporting = (
            read for read in column.pileups
            if (read.query_position is not None)
            and (read.alignment.query_sequence[read.query_position] == allele)
        )
        if indel_op == '-':
            supporting = (
                read for read in supporting if read.indel == indel_len
            )
        elif indel_op == '+':
            supporting = (
                read for read in supporting if (
                    read.indel == indel_len
                ) and (
                    read.alignment.query_sequence[
                        read.query_position + 1:
                        read.query_position + 1 + indel_len
                    ] == indel_seq
                )
            )
        raw = _get_allele_specific_pileup_column_stats(
            supporting,
            partial(
                pysam.FastaFile.fetch, ref, chrom
            ) if ref else None,
            ignore_md,
            ignore_nm,
            mm_runs,
            detect_q2_runs
        )

        reads_fw, reads_rv = raw[0], raw[1]
        n_supporting = reads_fw + reads_rv
        if n_supporting == 0:
            # No stats without reads!
            results.append(
                AlleleStats(
                    read_depth, 0, 0, *(float('nan') for _ in range(7))
                )
            )
        else:
            # Turn the raw sums into per-read averages.
            results.append(
                AlleleStats(
                    read_depth, reads_fw, reads_rv,
                    *(total / n_supporting for total in raw[2:])
                )
            )
    return results
16 343
17 344
18 class VariantCallingError (RuntimeError): 345 class VariantCallingError (RuntimeError):
19 """Exception class for issues with samtools and varscan subprocesses.""" 346 """Exception class for issues with samtools and varscan subprocesses."""
20 347
38 return msg_header + self.message 365 return msg_header + self.message
39 366
40 367
41 class VarScanCaller (object): 368 class VarScanCaller (object):
42 def __init__(self, ref_genome, bam_input_files, 369 def __init__(self, ref_genome, bam_input_files,
43 max_depth=None, 370 max_depth=None, count_orphans=False, detect_overlaps=True,
44 min_mapqual=None, min_basequal=None, 371 min_mapqual=None, min_basequal=None,
45 threads=1, verbose=False, quiet=True 372 threads=1, verbose=False, quiet=True
46 ): 373 ):
47 self.ref_genome = ref_genome 374 self.ref_genome = ref_genome
48 self.bam_input_files = bam_input_files 375 self.bam_input_files = bam_input_files
49 self.max_depth = max_depth 376 self.max_depth = max_depth
377 self.count_orphans = count_orphans
378 self.detect_overlaps = detect_overlaps
50 self.min_mapqual = min_mapqual 379 self.min_mapqual = min_mapqual
51 self.min_basequal = min_basequal 380 self.min_basequal = min_basequal
52 self.threads = threads 381 self.threads = threads
53 self.verbose = verbose 382 self.verbose = verbose
54 self.quiet = quiet 383 self.quiet = quiet
65 mode='wb', suffix='', delete=False, dir=os.getcwd() 394 mode='wb', suffix='', delete=False, dir=os.getcwd()
66 ) 395 )
67 self.tmpfiles = [] 396 self.tmpfiles = []
68 397
69 def _get_pysam_pileup_args(self): 398 def _get_pysam_pileup_args(self):
70 param_dict = {} 399 # Compute the pileup args dynamically to account for possibly updated
400 # instance attributes.
401
402 # fixed default parameters
403 # Note on the fixed default for 'ignore_overlaps':
404 # In order to use the exact same pileup parameters during variant
405 # calling and postprocessing, we would have to compute the setting
406 # dynamically like so:
407 # if not self.detect_overlaps:
408 # param_dict['ignore_overlaps'] = False
409 # However, samtools/pysam implement overlap correction by manipulating
410 # (setting to zero) the lower qualities of bases that are part of an
411 # overlap (see
412 # https://sourceforge.net/p/samtools/mailman/message/32793789/). This
413 # manipulation has such a big undesirable effect on base quality stats
414 # calculated during postprocessing that calculating the stats on a
415 # slightly different pileup seems like the lesser evil.
416
417 param_dict = {
418 'ignore_overlaps': False,
419 'compute_baq': False,
420 'stepper': 'samtools',
421 }
422 # user-controllable parameters
423 if self.count_orphans:
424 param_dict['ignore_orphans'] = False
71 if self.max_depth is not None: 425 if self.max_depth is not None:
72 param_dict['max_depth'] = self.max_depth 426 param_dict['max_depth'] = self.max_depth
73 if self.min_mapqual is not None: 427 if self.min_mapqual is not None:
74 param_dict['min_mapping_quality'] = self.min_mapqual 428 param_dict['min_mapping_quality'] = self.min_mapqual
75 if self.min_basequal is not None: 429 if self.min_basequal is not None:
76 param_dict['min_base_quality'] = self.min_basequal 430 param_dict['min_base_quality'] = self.min_basequal
77 param_dict['compute_baq'] = False
78 param_dict['stepper'] = 'samtools'
79 return param_dict 431 return param_dict
80 432
81 def varcall_parallel(self, normal_purity=None, tumor_purity=None, 433 def varcall_parallel(self, normal_purity=None, tumor_purity=None,
82 min_coverage=None, 434 min_coverage=None,
83 min_var_count=None, 435 min_var_count=None,
106 varcall_engine_options = [] 458 varcall_engine_options = []
107 for option, value in varcall_engine_option_mapping: 459 for option, value in varcall_engine_option_mapping:
108 if value is not None: 460 if value is not None:
109 varcall_engine_options += [option, str(value)] 461 varcall_engine_options += [option, str(value)]
110 pileup_engine_options = ['-B'] 462 pileup_engine_options = ['-B']
463 if self.count_orphans:
464 pileup_engine_options += ['-A']
465 if not self.detect_overlaps:
466 pileup_engine_options += ['-x']
111 if self.max_depth is not None: 467 if self.max_depth is not None:
112 pileup_engine_options += ['-d', str(self.max_depth)] 468 pileup_engine_options += ['-d', str(self.max_depth)]
113 if self.min_mapqual is not None: 469 if self.min_mapqual is not None:
114 pileup_engine_options += ['-q', str(self.min_mapqual)] 470 pileup_engine_options += ['-q', str(self.min_mapqual)]
115 if self.min_basequal is not None: 471 if self.min_basequal is not None:
504 # FREQ is easy to calculate from AD 860 # FREQ is easy to calculate from AD
505 del record.format['FREQ'] 861 del record.format['FREQ']
506 del record.format['RD'] 862 del record.format['RD']
507 del record.format['DP4'] 863 del record.format['DP4']
508 864
    def get_allele_specific_pileup_column_stats(
        self, allele, pile_column, ref_fetch, use_md=True
    ):
        """Compute averaged read statistics for one allele in a pileup column.

        allele -- the (SNV) allele whose supporting reads are analyzed
        pile_column -- a pysam pileup column object
        ref_fetch -- callable taking (start, stop) and returning the
            corresponding reference sequence; used only when MD-tag
            based reference reconstruction is unavailable
        use_md -- try to reconstruct the reference from MD tags first;
            note that a failure on one read switches this off for all
            remaining reads of the column

        Returns None when no reads support the allele; otherwise a
        9-tuple:
        (avg_mapping_quality, avg_basequality, var_reads_plus,
         var_reads_minus, avg_pos_as_fraction,
         avg_num_mismatches_as_fraction, avg_sum_mismatch_qualities,
         avg_clipped_length, avg_distance_to_effective_3p_end)
        """
        var_reads_plus = var_reads_minus = 0
        sum_mapping_qualities = 0
        sum_base_qualities = 0
        sum_dist_from_center = 0
        sum_dist_from_3prime = 0
        sum_clipped_length = 0
        # NOTE(review): sum_unclipped_length is accumulated below but
        # never used in the returned statistics.
        sum_unclipped_length = 0
        sum_num_mismatches_as_fraction = 0
        sum_mismatch_qualities = 0

        for p in pile_column.pileups:
            # skip reads that don't support the allele we're looking for
            if p.query_position is None:
                continue
            if p.alignment.query_sequence[p.query_position] != allele:
                continue

            if p.alignment.is_reverse:
                var_reads_minus += 1
            else:
                var_reads_plus += 1
            sum_mapping_qualities += p.alignment.mapping_quality
            sum_base_qualities += p.alignment.query_qualities[p.query_position]
            sum_clipped_length += p.alignment.query_alignment_length
            unclipped_length = p.alignment.query_length
            sum_unclipped_length += unclipped_length
            # The following calculations are all in 1-based coordinates
            # with respect to the physical 5'-end of the read sequence.
            if p.alignment.is_reverse:
                read_base_pos = unclipped_length - p.query_position
                read_first_aln_pos = (
                    unclipped_length - p.alignment.query_alignment_end + 1
                )
                read_last_aln_pos = (
                    unclipped_length - p.alignment.query_alignment_start
                )
            else:
                read_base_pos = p.query_position + 1
                read_first_aln_pos = p.alignment.query_alignment_start + 1
                read_last_aln_pos = p.alignment.query_alignment_end
            # Note: the original bam-readcount algorithm uses the less accurate
            # p.alignment.query_alignment_length / 2 as the read center
            read_center = (read_first_aln_pos + read_last_aln_pos) / 2
            sum_dist_from_center += 1 - abs(
                read_base_pos - read_center
            ) / (read_center - 1)
            # Note: the original bam-readcount algorithm uses a more
            # complicated definition of the 3'-end of a read sequence, which
            # clips off runs of bases with a quality scores of exactly 2.
            sum_dist_from_3prime += (
                read_last_aln_pos - read_base_pos
            ) / (unclipped_length - 1)

            sum_num_mismatches = 0
            if use_md:
                try:
                    # see if the read has an MD tag, in which case pysam can
                    # calculate the reference sequence for us
                    aligned_pairs = p.alignment.get_aligned_pairs(
                        matches_only=True, with_seq=True
                    )
                except ValueError:
                    # no usable MD tag on this read; fall back to the
                    # reference-based path for this and all later reads
                    use_md = False
            if use_md:
                # The ref sequence got calculated from alignment and MD tag.
                for qpos, rpos, ref_base in aligned_pairs:
                    # pysam uses lowercase ref bases to indicate mismatches
                    if (
                        ref_base == 'a' or ref_base == 't' or
                        ref_base == 'g' or ref_base == 'c'
                    ):
                        sum_num_mismatches += 1
                        sum_mismatch_qualities += p.alignment.query_qualities[
                            qpos
                        ]
            else:
                # We need to obtain the aligned portion of the reference
                # sequence.
                aligned_pairs = p.alignment.get_aligned_pairs(
                    matches_only=True
                )
                # note that ref bases can be lowercase
                ref_seq = ref_fetch(
                    aligned_pairs[0][1], aligned_pairs[-1][1] + 1
                ).upper()
                ref_offset = aligned_pairs[0][1]

                for qpos, rpos in p.alignment.get_aligned_pairs(
                    matches_only=True
                ):
                    # see if we have a mismatch to the reference at this
                    # position, but note that
                    # - the query base may be lowercase (SAM/BAM spec)
                    # - an '=', if present in the query seq, indicates a match
                    #   to the reference (SAM/BAM spec)
                    # - there cannot be a mismatch with an N in the reference
                    ref_base = ref_seq[rpos - ref_offset]
                    read_base = p.alignment.query_sequence[qpos].upper()
                    if (
                        read_base != '=' and ref_base != 'N'
                    ) and (
                        ref_base != read_base
                    ):
                        sum_num_mismatches += 1
                        sum_mismatch_qualities += p.alignment.query_qualities[
                            qpos
                        ]
            sum_num_mismatches_as_fraction += (
                sum_num_mismatches / p.alignment.query_alignment_length
            )

        var_reads_total = var_reads_plus + var_reads_minus
        if var_reads_total == 0:
            # No stats without reads!
            return None
        avg_mapping_quality = sum_mapping_qualities / var_reads_total
        avg_basequality = sum_base_qualities / var_reads_total
        avg_pos_as_fraction = sum_dist_from_center / var_reads_total
        avg_num_mismatches_as_fraction = (
            sum_num_mismatches_as_fraction / var_reads_total
        )
        avg_sum_mismatch_qualities = sum_mismatch_qualities / var_reads_total
        avg_clipped_length = sum_clipped_length / var_reads_total
        avg_distance_to_effective_3p_end = (
            sum_dist_from_3prime / var_reads_total
        )

        return (
            avg_mapping_quality,
            avg_basequality,
            var_reads_plus,
            var_reads_minus,
            avg_pos_as_fraction,
            avg_num_mismatches_as_fraction,
            avg_sum_mismatch_qualities,
            avg_clipped_length,
            avg_distance_to_effective_3p_end
        )
650
651 def _postprocess_variant_records(self, invcf, 865 def _postprocess_variant_records(self, invcf,
652 min_var_count2, min_var_count2_lc, 866 min_var_count2, min_var_count2_lc,
653 min_var_freq2, max_somatic_p, 867 min_var_freq2, min_var_count2_depth,
654 max_somatic_p_depth,
655 min_ref_readpos, min_var_readpos, 868 min_ref_readpos, min_var_readpos,
656 min_ref_dist3, min_var_dist3, 869 min_ref_dist3, min_var_dist3,
870 detect_q2_runs,
657 min_ref_len, min_var_len, 871 min_ref_len, min_var_len,
658 max_relative_len_diff, 872 max_relative_len_diff,
659 min_strandedness, min_strand_reads, 873 min_strandedness, min_strand_reads,
660 min_ref_basequal, min_var_basequal, 874 min_ref_basequal, min_var_basequal,
661 max_basequal_diff, 875 max_basequal_diff,
662 min_ref_mapqual, min_var_mapqual, 876 min_ref_mapqual, min_var_mapqual,
663 max_mapqual_diff, 877 max_mapqual_diff,
664 max_ref_mmqs, max_var_mmqs, 878 max_ref_mmqs, max_var_mmqs,
665 min_mmqs_diff, max_mmqs_diff, 879 min_mmqs_diff, max_mmqs_diff,
880 ignore_md,
666 **args): 881 **args):
667 # set FILTER field according to Varscan criteria 882 # set FILTER field according to Varscan criteria
668 # multiple FILTER entries must be separated by semicolons 883 # multiple FILTER entries must be separated by semicolons
669 # No filters applied should be indicated with MISSING 884 # No filters applied should be indicated with MISSING
670 885
679 io_stack.enter_context( 894 io_stack.enter_context(
680 pysam.Samfile(fn, 'rb')) for fn in self.bam_input_files 895 pysam.Samfile(fn, 'rb')) for fn in self.bam_input_files
681 ) 896 )
682 refseq = io_stack.enter_context(pysam.FastaFile(self.ref_genome)) 897 refseq = io_stack.enter_context(pysam.FastaFile(self.ref_genome))
683 pileup_args = self._get_pysam_pileup_args() 898 pileup_args = self._get_pysam_pileup_args()
899 _get_stats = get_allele_stats
684 for record in invcf: 900 for record in invcf:
685 if any(len(allele) > 1 for allele in record.alleles): 901 is_indel = False
686 # skip indel postprocessing for the moment
687 yield record
688 continue
689 if record.alleles[0] == 'N': 902 if record.alleles[0] == 'N':
690 # It makes no sense to call SNPs against an unknown 903 # It makes no sense to call SNPs against an unknown
691 # reference base 904 # reference base
692 continue 905 continue
906 if len(record.alleles[0]) > 1:
907 # a deletion
908 is_indel = True
909 alleles = [
910 record.alleles[1], record.alleles[0].replace(
911 record.alleles[1], record.alleles[1] + '-', 1
912 )
913 ]
914 elif len(record.alleles[1]) > 1:
915 # an insertion
916 is_indel = True
917 alleles = [
918 record.alleles[0],
919 record.alleles[1].replace(
920 record.alleles[0], record.alleles[0] + '+', 1
921 )
922 ]
923 else:
924 # a regular SNV
925 alleles = record.alleles[:2]
693 # get pileup for genomic region affected by this variant 926 # get pileup for genomic region affected by this variant
694 if record.info['SS'] == '2': 927 if record.info['SS'] == '2':
695 # a somatic variant => generate pileup from tumor data 928 # a somatic variant => generate pileup from tumor data
696 pile = tumor_reads.pileup( 929 reads_of_interest = tumor_reads
697 record.chrom, record.start, record.stop, 930 calculate_ref_stats = record.samples['TUMOR']['RD'] > 0
698 **pileup_args
699 )
700 sample_of_interest = 'TUMOR'
701 elif record.info['SS'] in ['1', '3']: 931 elif record.info['SS'] in ['1', '3']:
702 # a germline or LOH variant => pileup from normal data 932 # a germline or LOH variant => pileup from normal data
703 pile = normal_reads.pileup( 933 reads_of_interest = normal_reads
704 record.chrom, record.start, record.stop, 934 calculate_ref_stats = record.samples['NORMAL']['RD'] > 0
705 **pileup_args
706 )
707 sample_of_interest = 'NORMAL'
708 else: 935 else:
709 # TO DO: figure out if there is anything interesting to do 936 # TO DO: figure out if there is anything interesting to do
710 # for SS status codes 0 (reference) and 5 (unknown) 937 # for SS status codes 0 (reference) and 5 (unknown)
711 yield record 938 yield record
712 continue 939 continue
713 940
714 # apply false-positive filtering a la varscan fpfilter
715 # find the variant site in the pileup columns
716 for pile_column in pile:
717 if pile_column.reference_pos == record.start:
718 break
719 # extract required information
720 # overall read depth at the site
721 read_depth = pile_column.get_num_aligned()
722 assert read_depth > 0
723 # no multiallelic sites in varscan 941 # no multiallelic sites in varscan
724 assert len(record.alleles) == 2 942 assert len(record.alleles) == 2
725 if record.samples[sample_of_interest]['RD'] > 0: 943
726 ref_stats, alt_stats = [ 944 if calculate_ref_stats:
727 self.get_allele_specific_pileup_column_stats( 945 ref_stats, alt_stats = _get_stats(
728 allele, 946 reads_of_interest,
729 pile_column, 947 (record.chrom, record.start, record.stop),
730 partial( 948 alleles,
731 pysam.FastaFile.fetch, refseq, record.chrom 949 refseq,
732 ) 950 ignore_md=ignore_md,
733 ) 951 ignore_nm=False,
734 for allele in record.alleles 952 mm_runs=True,
735 ] 953 detect_q2_runs=detect_q2_runs,
954 pileup_args=pileup_args
955 )
736 else: 956 else:
737 ref_stats = None 957 ref_stats = None
738 alt_stats = self.get_allele_specific_pileup_column_stats( 958 alt_stats = _get_stats(
739 record.alleles[1], 959 reads_of_interest,
740 pile_column, 960 (record.chrom, record.start, record.stop),
741 partial(pysam.FastaFile.fetch, refseq, record.chrom) 961 alleles[1:2],
742 ) 962 refseq,
963 ignore_md=ignore_md,
964 ignore_nm=False,
965 mm_runs=True,
966 detect_q2_runs=detect_q2_runs,
967 pileup_args=pileup_args
968 )[0]
743 ref_count = 0 969 ref_count = 0
744 if ref_stats: 970 if ref_stats:
745 ref_count = ref_stats[2] + ref_stats[3] 971 ref_count = ref_stats.reads_fw + ref_stats.reads_rv
746 if ref_stats[1] < min_ref_basequal: 972 if ref_stats.avg_basequal < min_ref_basequal:
747 record.filter.add('RefBaseQual') 973 record.filter.add('RefBaseQual')
748 if ref_count >= 2: 974 if ref_count >= 2:
749 if ref_stats[0] < min_ref_mapqual: 975 if ref_stats.avg_mapqual < min_ref_mapqual:
750 record.filter.add('RefMapQual') 976 record.filter.add('RefMapQual')
751 if ref_stats[4] < min_ref_readpos: 977 if ref_stats.avg_dist_from_center < min_ref_readpos:
752 record.filter.add('RefReadPos') 978 record.filter.add('RefReadPos')
753 # ref_stats[5] (avg_num_mismatches_as_fraction 979 # ref_stats.avg_mismatch_fraction
754 # is not a filter criterion in VarScan fpfilter 980 # is not a filter criterion in VarScan fpfilter
755 if ref_stats[6] > max_ref_mmqs: 981 if ref_stats.avg_mismatch_qualsum > max_ref_mmqs:
756 record.filter.add('RefMMQS') 982 record.filter.add('RefMMQS')
757 if ref_stats[7] < min_ref_len: 983 if not is_indel and (
984 ref_stats.avg_clipped_len < min_ref_len
985 ):
758 # VarScan fpfilter does not apply this filter 986 # VarScan fpfilter does not apply this filter
759 # for indels, but there is no reason 987 # for indels, so we do not do it either.
760 # not to do it.
761 record.filter.add('RefAvgRL') 988 record.filter.add('RefAvgRL')
762 if ref_stats[8] < min_ref_dist3: 989 if ref_stats.avg_dist_from_3prime < min_ref_dist3:
763 record.filter.add('RefDist3') 990 record.filter.add('RefDist3')
764 if alt_stats: 991 if alt_stats:
765 alt_count = alt_stats[2] + alt_stats[3] 992 alt_count = alt_stats.reads_fw + alt_stats.reads_rv
766 if ( 993 if alt_count < min_var_count2:
767 alt_count < min_var_count2_lc 994 if (
768 ) or ( 995 (alt_count + ref_count) >= min_var_count2_depth
769 read_depth >= max_somatic_p_depth and 996 ) or (
770 alt_count < min_var_count2 997 alt_count < min_var_count2_lc
998 ):
999 record.filter.add('VarCount')
1000 if alt_count / alt_stats.reads_total < min_var_freq2:
1001 record.filter.add('VarFreq')
1002 if not is_indel and (
1003 alt_stats.avg_basequal < min_var_basequal
771 ): 1004 ):
772 record.filter.add('VarCount')
773 if alt_count / read_depth < min_var_freq2:
774 record.filter.add('VarFreq')
775 if alt_stats[1] < min_var_basequal:
776 record.filter.add('VarBaseQual') 1005 record.filter.add('VarBaseQual')
777 if alt_count > min_strand_reads: 1006 if alt_count >= min_strand_reads:
778 if ( 1007 if (
779 alt_stats[2] / alt_count < min_strandedness 1008 alt_stats.reads_fw / alt_count < min_strandedness
780 ) or ( 1009 ) or (
781 alt_stats[3] / alt_count < min_strandedness 1010 alt_stats.reads_rv / alt_count < min_strandedness
782 ): 1011 ):
783 record.filter.add('Strand') 1012 record.filter.add('Strand')
784 if alt_stats[2] + alt_stats[3] >= 2: 1013 if alt_count >= 2:
785 if alt_stats[0] < min_var_mapqual: 1014 if alt_stats.avg_mapqual < min_var_mapqual:
786 record.filter.add('VarMapQual') 1015 record.filter.add('VarMapQual')
787 if alt_stats[4] < min_var_readpos: 1016 if alt_stats.avg_dist_from_center < min_var_readpos:
788 record.filter.add('VarReadPos') 1017 record.filter.add('VarReadPos')
789 # alt_stats[5] (avg_num_mismatches_as_fraction 1018 # alt_stats.avg_mismatch_fraction
790 # is not a filter criterion in VarScan fpfilter 1019 # is not a filter criterion in VarScan fpfilter
791 if alt_stats[6] > max_var_mmqs: 1020 if alt_stats.avg_mismatch_qualsum > max_var_mmqs:
792 record.filter.add('VarMMQS') 1021 record.filter.add('VarMMQS')
793 if alt_stats[7] < min_var_len: 1022 if not is_indel and (
1023 alt_stats.avg_clipped_len < min_var_len
1024 ):
794 # VarScan fpfilter does not apply this filter 1025 # VarScan fpfilter does not apply this filter
795 # for indels, but there is no reason 1026 # for indels, so we do not do it either.
796 # not to do it.
797 record.filter.add('VarAvgRL') 1027 record.filter.add('VarAvgRL')
798 if alt_stats[8] < min_var_dist3: 1028 if alt_stats.avg_dist_from_3prime < min_var_dist3:
799 record.filter.add('VarDist3') 1029 record.filter.add('VarDist3')
800 if ref_count >= 2 and alt_count >= 2: 1030 if ref_count >= 2 and alt_count >= 2:
801 if (ref_stats[0] - alt_stats[0]) > max_mapqual_diff: 1031 if (
1032 ref_stats.avg_mapqual - alt_stats.avg_mapqual
1033 ) > max_mapqual_diff:
802 record.filter.add('MapQualDiff') 1034 record.filter.add('MapQualDiff')
803 if (ref_stats[1] - alt_stats[1]) > max_basequal_diff: 1035 if (
1036 ref_stats.avg_basequal - alt_stats.avg_basequal
1037 ) > max_basequal_diff:
804 record.filter.add('MaxBAQdiff') 1038 record.filter.add('MaxBAQdiff')
805 mmqs_diff = alt_stats[6] - ref_stats[6] 1039 mmqs_diff = (
1040 alt_stats.avg_mismatch_qualsum
1041 - ref_stats.avg_mismatch_qualsum
1042 )
806 if mmqs_diff < min_mmqs_diff: 1043 if mmqs_diff < min_mmqs_diff:
807 record.filter.add('MinMMQSdiff') 1044 record.filter.add('MinMMQSdiff')
808 if mmqs_diff > max_mmqs_diff: 1045 if mmqs_diff > max_mmqs_diff:
809 record.filter.add('MMQSdiff') 1046 record.filter.add('MMQSdiff')
810 if ( 1047 if (
811 1 - alt_stats[7] / ref_stats[7] 1048 1 -
1049 alt_stats.avg_clipped_len
1050 / ref_stats.avg_clipped_len
812 ) > max_relative_len_diff: 1051 ) > max_relative_len_diff:
813 record.filter.add('ReadLenDiff') 1052 record.filter.add('ReadLenDiff')
814 else: 1053 else:
815 # No variant-supporting reads for this record! 1054 # No variant-supporting reads for this record!
816 # This can happen in rare cases because of 1055 # This can happen in rare cases because of
992 out = (output_path, None) 1231 out = (output_path, None)
993 1232
994 instance_args = { 1233 instance_args = {
995 k: args.pop(k) for k in [ 1234 k: args.pop(k) for k in [
996 'max_depth', 1235 'max_depth',
1236 'count_orphans',
1237 'detect_overlaps',
997 'min_mapqual', 1238 'min_mapqual',
998 'min_basequal', 1239 'min_basequal',
999 'threads', 1240 'threads',
1000 'verbose', 1241 'verbose',
1001 'quiet' 1242 'quiet'
1044 'basename for the two output files to be generated (see ' 1285 'basename for the two output files to be generated (see '
1045 '-s|--split-output below).' 1286 '-s|--split-output below).'
1046 ) 1287 )
1047 p.add_argument( 1288 p.add_argument(
1048 '-s', '--split-output', 1289 '-s', '--split-output',
1049 dest='split_output', action='store_true', default=False, 1290 dest='split_output', action='store_true',
1050 help='indicate that separate output files for SNPs and indels ' 1291 help='indicate that separate output files for SNPs and indels '
1051 'should be generated (original VarScan behavior). If specified, ' 1292 'should be generated (original VarScan behavior). If specified, '
1052 '%%T in the --ofile file name will be replaced with "snp" and ' 1293 '%%T in the --ofile file name will be replaced with "snp" and '
1053 '"indel" to generate the names of the SNP and indel output ' 1294 '"indel" to generate the names of the SNP and indel output '
1054 'files, respectively. If %%T is not found in the file name, it ' 1295 'files, respectively. If %%T is not found in the file name, it '
1088 dest='max_depth', type=int, default=8000, 1329 dest='max_depth', type=int, default=8000,
1089 help='Maximum depth of generated pileups (samtools mpileup -d option; ' 1330 help='Maximum depth of generated pileups (samtools mpileup -d option; '
1090 'default: 8000)' 1331 'default: 8000)'
1091 ) 1332 )
1092 call_group.add_argument( 1333 call_group.add_argument(
1334 '--count-orphans',
1335 dest='count_orphans', action='store_true',
1336 help='Use anomalous read pairs in variant calling '
1337 '(samtools mpileup -A option; default: ignore anomalous pairs)'
1338 )
1339 call_group.add_argument(
1340 '--no-detect-overlaps',
1341 dest='detect_overlaps', action='store_false',
1342 help='Disable automatic read-pair overlap detection by samtools '
1343 'mpileup. Overlap detection tries to avoid counting duplicated '
1344 'bases twice during variant calling. '
1345 '(samtools mpileup -x option; default: use overlap detection)'
1346 )
1347 call_group.add_argument(
1093 '--min-basequal', 1348 '--min-basequal',
1094 dest='min_basequal', type=int, 1349 dest='min_basequal', type=int,
1095 default=13, 1350 default=13,
1096 help='Minimum base quality at the variant position to use a read ' 1351 help='Minimum base quality at the variant position to use a read '
1097 '(default: 13)' 1352 '(default: 13)'
1158 filter_group.add_argument( 1413 filter_group.add_argument(
1159 '--min-var-count2-lc', 1414 '--min-var-count2-lc',
1160 dest='min_var_count2_lc', type=int, 1415 dest='min_var_count2_lc', type=int,
1161 default=2, 1416 default=2,
1162 help='Minimum number of variant-supporting reads when depth below ' 1417 help='Minimum number of variant-supporting reads when depth below '
1163 '--somatic-p-depth (default: 2)' 1418 '--min-var-count2-depth (default: 2)'
1419 )
1420 filter_group.add_argument(
1421 '--min-var-count2-depth',
1422 dest='min_var_count2_depth', type=int,
1423 default=10,
1424 help='Combined depth of ref- and variant-supporting reads required to '
1425 'apply the --min-var-count2 filter instead of --min-var-count2-lc '
1426 '(default: 10)'
1164 ) 1427 )
1165 filter_group.add_argument( 1428 filter_group.add_argument(
1166 '--min-var-freq2', 1429 '--min-var-freq2',
1167 dest='min_var_freq2', type=float, 1430 dest='min_var_freq2', type=float,
1168 default=0.05, 1431 default=0.05,
1169 help='Minimum variant allele frequency (default: 0.05)' 1432 help='Minimum variant allele frequency (default: 0.05)'
1170 )
1171 filter_group.add_argument(
1172 '--max-somatic-p',
1173 dest='max_somatic_p', type=float,
1174 default=0.05,
1175 help='Maximum somatic p-value (default: 0.05)'
1176 )
1177 filter_group.add_argument(
1178 '--max-somatic-p-depth',
1179 dest='max_somatic_p_depth', type=int,
1180 default=10,
1181 help='Depth required to run --max-somatic-p filter (default: 10)'
1182 ) 1433 )
1183 filter_group.add_argument( 1434 filter_group.add_argument(
1184 '--min-ref-readpos', 1435 '--min-ref-readpos',
1185 dest='min_ref_readpos', type=float, 1436 dest='min_ref_readpos', type=float,
1186 default=0.1, 1437 default=0.1,
1207 default=0.1, 1458 default=0.1,
1208 help='Minimum average relative distance of site from the effective ' 1459 help='Minimum average relative distance of site from the effective '
1209 '3\'end of variant-supporting reads (default: 0.1)' 1460 '3\'end of variant-supporting reads (default: 0.1)'
1210 ) 1461 )
1211 filter_group.add_argument( 1462 filter_group.add_argument(
1463 '--detect-q2-runs',
1464 dest='detect_q2_runs', action='store_true',
1465 help='Look for 3\'-terminal q2 runs and take their lengths into '
1466 'account for determining the effective 3\'end of reads '
1467 '(default: off)'
1468 )
1469 filter_group.add_argument(
1212 '--min-ref-len', 1470 '--min-ref-len',
1213 dest='min_ref_len', type=int, 1471 dest='min_ref_len', type=int,
1214 default=90, 1472 default=90,
1215 help='Minimum average trimmed length of reads supporting the ref ' 1473 help='Minimum average trimmed length of reads supporting the ref '
1216 'allele (default: 90)' 1474 'allele (default: 90)'
1307 '--max-mmqs-diff', 1565 '--max-mmqs-diff',
1308 dest='max_mmqs_diff', type=int, 1566 dest='max_mmqs_diff', type=int,
1309 default=50, 1567 default=50,
1310 help='Maximum mismatch quality sum difference (var - ref; default: 50)' 1568 help='Maximum mismatch quality sum difference (var - ref; default: 50)'
1311 ) 1569 )
1570 filter_group.add_argument(
1571 '--ignore-md',
1572 dest='ignore_md', action='store_true',
1573 help='Do not rely on read MD tag, even if it is present on the mapped '
1574 'reads, and recalculate mismatch quality stats from ref '
1575 'alignments instead.'
1576 )
1577
1312 args = vars(p.parse_args()) 1578 args = vars(p.parse_args())
1313 varscan_call(**args) 1579 varscan_call(**args)