comparison varscan.py @ 2:2fe9ebb98aad draft

planemo upload for repository https://github.com/galaxyproject/iuc/tree/master/tools/varscan commit 30867f1f022bed18ba1c3b8dc9c54226890b3a9c
author iuc
date Tue, 04 Dec 2018 05:15:50 -0500
parents
children 2c66c4025db2
comparison
equal deleted inserted replaced
1:31a38ce7e8ae 2:2fe9ebb98aad
1 #!/usr/bin/env python3
2 from __future__ import print_function
3
4 import argparse
5 import io
6 import os
7 import subprocess
8 import sys
9 import tempfile
10 import time
11 from contextlib import ExitStack
12 from functools import partial
13 from threading import Thread
14
15 import pysam
16
17
18 class VariantCallingError (RuntimeError):
19 """Exception class for issues with samtools and varscan subprocesses."""
20
21 def __init__(self, message=None, call='', error=''):
22 self.message = message
23 self.call = call.strip()
24 self.error = error.strip()
25
26 def __str__(self):
27 if self.message is None:
28 return ''
29 if self.error:
30 msg_header = '"{0}" failed with:\n{1}\n\n'.format(
31 self.call, self.error
32 )
33 else:
34 msg_header = '{0} failed.\n'
35 'No further information about this error is available.\n\n'.format(
36 self.call
37 )
38 return msg_header + self.message
39
40
41 class VarScanCaller (object):
42 def __init__(self, ref_genome, bam_input_files,
43 max_depth=None,
44 min_mapqual=None, min_basequal=None,
45 threads=1, verbose=False, quiet=True
46 ):
47 self.ref_genome = ref_genome
48 self.bam_input_files = bam_input_files
49 self.max_depth = max_depth
50 self.min_mapqual = min_mapqual
51 self.min_basequal = min_basequal
52 self.threads = threads
53 self.verbose = verbose
54 self.quiet = quiet
55
56 with pysam.FastaFile(ref_genome) as ref_fa:
57 self.ref_contigs = ref_fa.references
58 self.ref_lengths = ref_fa.lengths
59
60 self.pileup_engine = ['samtools', 'mpileup']
61 self.varcall_engine = ['varscan', 'somatic']
62 self.requires_stdout_redirect = False
63 self.TemporaryContigVCF = partial(
64 tempfile.NamedTemporaryFile,
65 mode='wb', suffix='', delete=False, dir=os.getcwd()
66 )
67 self.tmpfiles = []
68
69 def _get_pysam_pileup_args(self):
70 param_dict = {}
71 if self.max_depth is not None:
72 param_dict['max_depth'] = self.max_depth
73 if self.min_mapqual is not None:
74 param_dict['min_mapping_quality'] = self.min_mapqual
75 if self.min_basequal is not None:
76 param_dict['min_base_quality'] = self.min_basequal
77 param_dict['compute_baq'] = False
78 param_dict['stepper'] = 'samtools'
79 return param_dict
80
81 def varcall_parallel(self, normal_purity=None, tumor_purity=None,
82 min_coverage=None,
83 min_var_count=None,
84 min_var_freq=None, min_hom_freq=None,
85 p_value=None, somatic_p_value=None,
86 threads=None, verbose=None, quiet=None
87 ):
88 if not threads:
89 threads = self.threads
90 if verbose is None:
91 verbose = self.verbose
92 if quiet is None:
93 quiet = self.quiet
94 # mapping of method parameters to varcall engine command line options
95 varcall_engine_option_mapping = [
96 ('--normal-purity', normal_purity),
97 ('--tumor-purity', tumor_purity),
98 ('--min-coverage', min_coverage),
99 ('--min-reads2', min_var_count),
100 ('--min-var-freq', min_var_freq),
101 ('--min-freq-for-hom', min_hom_freq),
102 ('--p-value', p_value),
103 ('--somatic-p-value', somatic_p_value),
104 ('--min-avg-qual', self.min_basequal)
105 ]
106 varcall_engine_options = []
107 for option, value in varcall_engine_option_mapping:
108 if value is not None:
109 varcall_engine_options += [option, str(value)]
110 pileup_engine_options = ['-B']
111 if self.max_depth is not None:
112 pileup_engine_options += ['-d', str(self.max_depth)]
113 if self.min_mapqual is not None:
114 pileup_engine_options += ['-q', str(self.min_mapqual)]
115 if self.min_basequal is not None:
116 pileup_engine_options += ['-Q', str(self.min_basequal)]
117
118 # Create a tuple of calls to samtools mpileup and varscan for
119 # each contig. The contig name is stored as the third element of
120 # that tuple.
121 # The calls are stored in the reverse order of the contig list so
122 # that they can be popped off later in the original order
123 calls = [(
124 self.pileup_engine + pileup_engine_options + [
125 '-r', contig + ':',
126 '-f', self.ref_genome
127 ] + self.bam_input_files,
128 self.varcall_engine + [
129 '-', '{out}', '--output-vcf', '1', '--mpileup', '1'
130 ] + varcall_engine_options,
131 contig
132 ) for contig in self.ref_contigs[::-1]]
133
134 if verbose:
135 print('Starting variant calling ..')
136
137 # launch subprocesses and monitor their status
138 subprocesses = []
139 error_table = {}
140 tmp_io_started = []
141 tmp_io_finished = []
142 self.tmpfiles = []
143
144 def enqueue_stderr_output(out, stderr_buffer):
145 for line in iter(out.readline, b''):
146 # Eventually we are going to print the contents of
147 # the stderr_buffer to sys.stderr so we can
148 # decode things here using its encoding.
149 # We do a 'backslashreplace' just to be on the safe side.
150 stderr_buffer.write(line.decode(sys.stderr.encoding,
151 'backslashreplace'))
152 out.close()
153
154 try:
155 while subprocesses or calls:
156 while calls and len(subprocesses) < threads:
157 # There are still calls waiting for execution and we
158 # have unoccupied threads so we launch a new combined
159 # call to samtools mpileup and the variant caller.
160
161 # pop the call arguments from our call stack
162 call = calls.pop()
163 # get the name of the contig that this call is going
164 # to work on
165 contig = call[2]
166 # Based on the contig name, generate a readable and
167 # file system-compatible prefix and use it to create
168 # a named temporary file, to which the call output
169 # will be redirected.
170 # At the moment we create the output file we add it to
171 # the list of all temporary output files so that we can
172 # remove it eventually during cleanup.
173 call_out = self.TemporaryContigVCF(
174 prefix=''.join(
175 c if c.isalnum() else '_' for c in contig
176 ) + '_',
177 )
178 # maintain a list of variant call outputs
179 # in the order the subprocesses got launched
180 tmp_io_started.append(call_out.name)
181 if self.requires_stdout_redirect:
182 # redirect stdout to the temporary file just created
183 stdout_p2 = call_out
184 stderr_p2 = subprocess.PIPE
185 else:
186 # variant caller wants to write output to file directly
187 stdout_p2 = subprocess.PIPE
188 stderr_p2 = subprocess.STDOUT
189 call[1][call[1].index('{out}')] = call_out.name
190 call_out.close()
191 # for reporting purposes, join the arguments for the
192 # samtools and the variant caller calls into readable
193 # strings
194 c_str = (' '.join(call[0]), ' '.join(call[1]))
195 error_table[c_str] = [io.StringIO(), io.StringIO()]
196 # start the subprocesses
197 p1 = subprocess.Popen(
198 call[0],
199 stdout=subprocess.PIPE, stderr=subprocess.PIPE
200 )
201 p2 = subprocess.Popen(
202 call[1],
203 stdin=p1.stdout,
204 stdout=stdout_p2,
205 stderr=stderr_p2
206 )
207 # subprocess bookkeeping
208 subprocesses.append((c_str, p1, p2, call_out, contig))
209 # make sure our newly launched call does not block
210 # because its buffers fill up
211 p1.stdout.close()
212 t1 = Thread(
213 target=enqueue_stderr_output,
214 args=(p1.stderr, error_table[c_str][0])
215 )
216 t2 = Thread(
217 target=enqueue_stderr_output,
218 args=(
219 p2.stderr
220 if self.requires_stdout_redirect else
221 p2.stdout,
222 error_table[c_str][1]
223 )
224 )
225 t1.daemon = t2.daemon = True
226 t1.start()
227 t2.start()
228
229 if verbose:
230 print(
231 'Calling variants for contig: {0}'
232 .format(call[2])
233 )
234
235 # monitor all running calls to see if any of them are done
236 for call, p1, p2, ofo, contig in subprocesses:
237 p1_stat = p1.poll()
238 p2_stat = p2.poll()
239 if p1_stat is not None or p2_stat is not None:
240 # There is an outcome for this process!
241 # Lets see what it is
242 if p1_stat or p2_stat:
243 print()
244 print(
245 error_table[call][0].getvalue(),
246 error_table[call][1].getvalue(),
247 file=sys.stderr
248 )
249 raise VariantCallingError(
250 'Variant Calling for contig {0} failed.'
251 .format(contig),
252 call='{0} | {1}'.format(call[0], call[1])
253 )
254 if p1_stat == 0 and p2_stat is None:
255 # VarScan is not handling the no output from
256 # samtools mpileup situation correctly so maybe
257 # that's the issue here
258 last_words = error_table[call][1].getvalue(
259 ).splitlines()[-4:]
260 if len(last_words) < 4 or any(
261 not msg.startswith('Input stream not ready')
262 for msg in last_words
263 ):
264 break
265 # lets give this process a bit more time
266 # VarScan is waiting for input it will never
267 # get, stop it.
268 p2.terminate()
269 subprocesses.remove((call, p1, p2, ofo, contig))
270 ofo.close()
271 break
272 if p2_stat == 0:
273 # Things went fine.
274 # maintain a list of variant call outputs
275 # that finished successfully (in the order
276 # they finished)
277 tmp_io_finished.append(ofo.name)
278 if verbose:
279 print()
280 print('Contig {0} finished.'.format(contig))
281 if not quiet:
282 print()
283 print(
284 'stderr output from samtools mpileup/'
285 'bcftools:'.upper(),
286 file=sys.stderr
287 )
288 print(
289 error_table[call][0].getvalue(),
290 error_table[call][1].getvalue(),
291 file=sys.stderr
292 )
293 # Discard the collected stderr output from
294 # the call, remove the call from the list of
295 # running calls and close its output file.
296 del error_table[call]
297 subprocesses.remove((call, p1, p2, ofo, contig))
298 # Closing the output file is important or we
299 # may hit the file system limit for open files
300 # if there are lots of contigs.
301 ofo.close()
302 break
303 # wait a bit in between monitoring cycles
304 time.sleep(2)
305 finally:
306 for call, p1, p2, ofo, contig in subprocesses:
307 # make sure we do not leave running subprocesses behind
308 for proc in (p1, p2):
309 try:
310 proc.terminate()
311 except Exception:
312 pass
313 # close currently open files
314 ofo.close()
315 # store the files with finished content in the order that
316 # the corresponding jobs were launched
317 self.tmpfiles = [f for f in tmp_io_started if f in tmp_io_finished]
318 # Make sure remaining buffered stderr output of
319 # subprocesses does not get lost.
320 # Currently, we don't screen this output for real errors,
321 # but simply rewrite everything.
322 if not quiet and error_table:
323 print()
324 print(
325 'stderr output from samtools mpileup/bcftools:'.upper(),
326 file=sys.stderr
327 )
328 for call, errors in error_table.items():
329 print(' | '.join(call), ':', file=sys.stderr)
330 print('-' * 20, file=sys.stderr)
331 print('samtools mpileup output:', file=sys.stderr)
332 print(errors[0].getvalue(), file=sys.stderr)
333 print('varscan somatic output:', file=sys.stderr)
334 print(errors[1].getvalue(), file=sys.stderr)
335
336 def _add_ref_contigs_to_header(self, header):
337 for chrom, length in zip(self.ref_contigs, self.ref_lengths):
338 header.add_meta(
339 'contig',
340 items=[('ID', chrom), ('length', length)]
341 )
342
343 def _add_filters_to_header(self, header):
344 varscan_fpfilters = {
345 'VarCount': 'Fewer than {min_var_count2} variant-supporting reads',
346 'VarFreq': 'Variant allele frequency below {min_var_freq2}',
347 'VarAvgRL':
348 'Average clipped length of variant-supporting reads < '
349 '{min_var_len}',
350 'VarReadPos': 'Relative average read position < {min_var_readpos}',
351 'VarDist3':
352 'Average distance to effective 3\' end < {min_var_dist3}',
353 'VarMMQS':
354 'Average mismatch quality sum for variant reads > '
355 '{max_var_mmqs}',
356 'VarMapQual':
357 'Average mapping quality of variant reads < {min_var_mapqual}',
358 'VarBaseQual':
359 'Average base quality of variant reads < {min_var_basequal}',
360 'Strand':
361 'Strand representation of variant reads < {min_strandedness}',
362 'RefAvgRL':
363 'Average clipped length of ref-supporting reads < '
364 '{min_ref_len}',
365 'RefReadPos':
366 'Relative average read position < {min_ref_readpos}',
367 'RefDist3':
368 'Average distance to effective 3\' end < {min_ref_dist3}',
369 'RefMapQual':
370 'Average mapping quality of reference reads < '
371 '{min_ref_mapqual}',
372 'RefBaseQual':
373 'Average base quality of ref-supporting reads < '
374 '{min_ref_basequal}',
375 'RefMMQS':
376 'Average mismatch quality sum for ref-supporting reads > '
377 '{max_ref_mmqs}',
378 'MMQSdiff':
379 'Mismatch quality sum difference (var - ref) > '
380 '{max_mmqs_diff}',
381 'MinMMQSdiff':
382 'Mismatch quality sum difference (var - ref) < '
383 '{max_mmqs_diff}',
384 'MapQualDiff':
385 'Mapping quality difference (ref - var) > {max_mapqual_diff}',
386 'MaxBAQdiff':
387 'Average base quality difference (ref - var) > '
388 '{max_basequal_diff}',
389 'ReadLenDiff':
390 'Average supporting read length difference (ref - var) > '
391 '{max_relative_len_diff}',
392 }
393 for filter_id, description in varscan_fpfilters.items():
394 header.filters.add(filter_id, None, None, description)
395
396 def _add_indel_info_flag_to_header(self, header):
397 header.info.add(
398 'INDEL', 0, 'Flag', 'Indicates that the variant is an INDEL'
399 )
400
401 def _compile_common_header(self, varcall_template, no_filters=False):
402 # fix the header generated by VarScan
403 # by adding reference and contig information
404 common_header = pysam.VariantHeader()
405 common_header.add_meta('reference', value=self.ref_genome)
406 self._add_ref_contigs_to_header(common_header)
407 if not no_filters:
408 # add filter info
409 self._add_filters_to_header(common_header)
410 # change the source information
411 common_header.add_meta('source', value='varscan.py')
412 # declare an INDEL flag for record INFO fields
413 self._add_indel_info_flag_to_header(common_header)
414 # take the remaining metadata from the template header produced by
415 # VarScan
416 with pysam.VariantFile(varcall_template, 'r') as original_data:
417 varscan_header = original_data.header
418 for sample in varscan_header.samples:
419 common_header.samples.add(sample)
420 common_header.merge(varscan_header)
421 return common_header
422
423 def pileup_masker(self, mask):
424 def apply_mask_on_pileup(piled_items):
425 for item, status in zip(piled_items, mask):
426 if status:
427 yield item
428 return apply_mask_on_pileup
429
430 def get_allele_specific_pileup_column_stats(
431 self, allele, pile_column, ref_fetch
432 ):
433 # number of reads supporting the given allele on
434 # forward and reverse strand, and in total
435 var_reads_plus = var_reads_minus = 0
436 var_supp_read_mask = []
437 for base in pile_column.get_query_sequences():
438 if base == allele:
439 # allele supporting read on + strand
440 var_reads_plus += 1
441 var_supp_read_mask.append(True)
442 elif base.upper() == allele:
443 # allele supporting read on - strand
444 var_reads_minus += 1
445 var_supp_read_mask.append(True)
446 else:
447 var_supp_read_mask.append(False)
448 var_reads_total = var_reads_plus + var_reads_minus
449
450 if var_reads_total == 0:
451 # No stats without reads!
452 return None
453
454 var_supp_only = self.pileup_masker(var_supp_read_mask)
455
456 # average mapping quality of the reads supporting the
457 # given allele
458 avg_mapping_quality = sum(
459 mq for mq in var_supp_only(
460 pile_column.get_mapping_qualities()
461 )
462 ) / var_reads_total
463
464 # for the remaining stats we need access to complete
465 # read information
466 piled_reads = [
467 p for p in var_supp_only(pile_column.pileups)
468 ]
469 assert len(piled_reads) == var_reads_total
470 sum_avg_base_qualities = 0
471 sum_dist_from_center = 0
472 sum_dist_from_3prime = 0
473 sum_clipped_length = 0
474 sum_unclipped_length = 0
475 sum_num_mismatches_as_fraction = 0
476 sum_mismatch_qualities = 0
477
478 for p in piled_reads:
479 sum_avg_base_qualities += sum(
480 p.alignment.query_qualities
481 ) / p.alignment.infer_query_length()
482 sum_clipped_length += p.alignment.query_alignment_length
483 unclipped_length = p.alignment.infer_read_length()
484 sum_unclipped_length += unclipped_length
485 read_center = p.alignment.query_alignment_length / 2
486 sum_dist_from_center += 1 - abs(
487 p.query_position - read_center
488 ) / read_center
489 if p.alignment.is_reverse:
490 sum_dist_from_3prime += p.query_position / unclipped_length
491 else:
492 sum_dist_from_3prime += 1 - p.query_position / unclipped_length
493
494 sum_num_mismatches = 0
495 for qpos, rpos in p.alignment.get_aligned_pairs():
496 if qpos is not None and rpos is not None:
497 if p.alignment.query_sequence[qpos] != ref_fetch(
498 rpos, rpos + 1
499 ).upper(): # ref bases can be lowercase!
500 sum_num_mismatches += 1
501 sum_mismatch_qualities += p.alignment.query_qualities[
502 qpos
503 ]
504 sum_num_mismatches_as_fraction += (
505 sum_num_mismatches / p.alignment.query_alignment_length
506 )
507 avg_basequality = sum_avg_base_qualities / var_reads_total
508 avg_pos_as_fraction = sum_dist_from_center / var_reads_total
509 avg_num_mismatches_as_fraction = (
510 sum_num_mismatches_as_fraction / var_reads_total
511 )
512 avg_sum_mismatch_qualities = sum_mismatch_qualities / var_reads_total
513 avg_clipped_length = sum_clipped_length / var_reads_total
514 avg_distance_to_effective_3p_end = (
515 sum_dist_from_3prime / var_reads_total
516 )
517
518 return (
519 avg_mapping_quality,
520 avg_basequality,
521 var_reads_plus,
522 var_reads_minus,
523 avg_pos_as_fraction,
524 avg_num_mismatches_as_fraction,
525 avg_sum_mismatch_qualities,
526 avg_clipped_length,
527 avg_distance_to_effective_3p_end
528 )
529
530 def _postprocess_variant_records(self, invcf,
531 min_var_count2, min_var_count2_lc,
532 min_var_freq2, max_somatic_p,
533 max_somatic_p_depth,
534 min_ref_readpos, min_var_readpos,
535 min_ref_dist3, min_var_dist3,
536 min_ref_len, min_var_len,
537 max_relative_len_diff,
538 min_strandedness, min_strand_reads,
539 min_ref_basequal, min_var_basequal,
540 max_basequal_diff,
541 min_ref_mapqual, min_var_mapqual,
542 max_mapqual_diff,
543 max_ref_mmqs, max_var_mmqs,
544 min_mmqs_diff, max_mmqs_diff,
545 **args):
546 # set FILTER field according to Varscan criteria
547 # multiple FILTER entries must be separated by semicolons
548 # No filters applied should be indicated with MISSING
549
550 # since posterior filters are always applied to just one sample,
551 # a better place to store the info is in the FT genotype field:
552 # can be PASS, '.' to indicate that filters have not been applied,
553 # or a semicolon-separated list of filters that failed
554 # unfortunately, gemini does not support this field
555
556 with ExitStack() as io_stack:
557 normal_reads, tumor_reads = (
558 io_stack.enter_context(
559 pysam.Samfile(fn, 'rb')) for fn in self.bam_input_files
560 )
561 refseq = io_stack.enter_context(pysam.FastaFile(self.ref_genome))
562 pileup_args = self._get_pysam_pileup_args()
563 for record in invcf:
564 if any(len(allele) > 1 for allele in record.alleles):
565 # skip indel postprocessing for the moment
566 yield record
567 continue
568 # get pileup for genomic region affected by this variant
569 if record.info['SS'] == '2':
570 # a somatic variant => generate pileup from tumor data
571 pile = tumor_reads.pileup(
572 record.chrom, record.start, record.stop,
573 **pileup_args
574 )
575 sample_of_interest = 'TUMOR'
576 elif record.info['SS'] in ['1', '3']:
577 # a germline or LOH variant => pileup from normal data
578 pile = normal_reads.pileup(
579 record.chrom, record.start, record.stop,
580 **pileup_args
581 )
582 sample_of_interest = 'NORMAL'
583 else:
584 # TO DO: figure out if there is anything interesting to do
585 # for SS status codes 0 (reference) and 5 (unknown)
586 yield record
587 continue
588
589 # apply false-positive filtering a la varscan fpfilter
590 # find the variant site in the pileup columns
591 for pile_column in pile:
592 if pile_column.reference_pos == record.start:
593 break
594 # extract required information
595 # overall read depth at the site
596 read_depth = pile_column.get_num_aligned()
597 assert read_depth > 0
598 # no multiallelic sites in varscan
599 assert len(record.alleles) == 2
600 if record.samples[sample_of_interest]['RD'] > 0:
601 ref_stats, alt_stats = [
602 self.get_allele_specific_pileup_column_stats(
603 allele,
604 pile_column,
605 partial(
606 pysam.FastaFile.fetch, refseq, record.chrom
607 )
608 )
609 for allele in record.alleles
610 ]
611 else:
612 ref_stats = None
613 alt_stats = self.get_allele_specific_pileup_column_stats(
614 record.alleles[1],
615 pile_column,
616 partial(pysam.FastaFile.fetch, refseq, record.chrom)
617 )
618 ref_count = 0
619 if ref_stats:
620 ref_count = ref_stats[2] + ref_stats[3]
621 if ref_stats[1] < min_ref_basequal:
622 record.filter.add('RefBaseQual')
623 if ref_count >= 2:
624 if ref_stats[0] < min_ref_mapqual:
625 record.filter.add('RefMapQual')
626 if ref_stats[4] < min_ref_readpos:
627 record.filter.add('RefReadPos')
628 # ref_stats[5] (avg_num_mismatches_as_fraction
629 # is not a filter criterion in VarScan fpfilter
630 if ref_stats[6] > max_ref_mmqs:
631 record.filter.add('RefMMQS')
632 if ref_stats[7] < min_ref_len:
633 # VarScan fpfilter does not apply this filter
634 # for indels, but there is no reason
635 # not to do it.
636 record.filter.add('RefAvgRL')
637 if ref_stats[8] < min_ref_dist3:
638 record.filter.add('RefDist3')
639 if alt_stats:
640 alt_count = alt_stats[2] + alt_stats[3]
641 if (
642 alt_count < min_var_count2_lc
643 ) or (
644 read_depth >= max_somatic_p_depth and
645 alt_count < min_var_count2
646 ):
647 record.filter.add('VarCount')
648 if alt_count / read_depth < min_var_freq2:
649 record.filter.add('VarFreq')
650 if alt_stats[1] < min_var_basequal:
651 record.filter.add('VarBaseQual')
652 if alt_count > min_strand_reads:
653 if (
654 alt_stats[2] / alt_count < min_strandedness
655 ) or (
656 alt_stats[3] / alt_count < min_strandedness
657 ):
658 record.filter.add('Strand')
659 if alt_stats[2] + alt_stats[3] >= 2:
660 if alt_stats[0] < min_var_mapqual:
661 record.filter.add('VarMapQual')
662 if alt_stats[4] < min_var_readpos:
663 record.filter.add('VarReadPos')
664 # alt_stats[5] (avg_num_mismatches_as_fraction
665 # is not a filter criterion in VarScan fpfilter
666 if alt_stats[6] > max_var_mmqs:
667 record.filter.add('VarMMQS')
668 if alt_stats[7] < min_var_len:
669 # VarScan fpfilter does not apply this filter
670 # for indels, but there is no reason
671 # not to do it.
672 record.filter.add('VarAvgRL')
673 if alt_stats[8] < min_var_dist3:
674 record.filter.add('VarDist3')
675 if ref_count >= 2 and alt_count >= 2:
676 if (ref_stats[0] - alt_stats[0]) > max_mapqual_diff:
677 record.filter.add('MapQualDiff')
678 if (ref_stats[1] - alt_stats[1]) > max_basequal_diff:
679 record.filter.add('MaxBAQdiff')
680 mmqs_diff = alt_stats[6] - ref_stats[6]
681 if mmqs_diff < min_mmqs_diff:
682 record.filter.add('MinMMQSdiff')
683 if mmqs_diff > max_mmqs_diff:
684 record.filter.add('MMQSdiff')
685 if (
686 1 - alt_stats[7] / ref_stats[7]
687 ) > max_relative_len_diff:
688 record.filter.add('ReadLenDiff')
689 else:
690 # No variant-supporting reads for this record!
691 # This can happen in rare cases because of
692 # samtools mpileup issues, but indicates a
693 # rather unreliable variant call.
694 record.filter.add('VarCount')
695 record.filter.add('VarFreq')
696 yield record
697
698 def _indel_flagged_records(self, vcf):
699 for record in vcf:
700 record.info['INDEL'] = True
701 yield record
702
703 def _merge_generator(self, vcf1, vcf2):
704 try:
705 record1 = next(vcf1)
706 except StopIteration:
707 for record2 in vcf2:
708 yield record2
709 return
710 try:
711 record2 = next(vcf2)
712 except StopIteration:
713 yield record1
714 for record1 in vcf1:
715 yield record1
716 return
717 while True:
718 if (record1.start, record1.stop) < (record2.start, record2.stop):
719 yield record1
720 try:
721 record1 = next(vcf1)
722 except StopIteration:
723 yield record2
724 for record2 in vcf2:
725 yield record2
726 return
727 else:
728 yield record2
729 try:
730 record2 = next(vcf2)
731 except StopIteration:
732 yield record1
733 for record1 in vcf1:
734 yield record1
735 return
736
737 def merge_and_postprocess(self, snps_out, indels_out=None,
738 no_filters=False, **filter_args):
739 temporary_data = self.tmpfiles
740 self.tmpfiles = []
741 temporary_snp_files = [f + '.snp.vcf' for f in temporary_data]
742 temporary_indel_files = [f + '.indel.vcf' for f in temporary_data]
743
744 for f in temporary_data:
745 try:
746 os.remove(f)
747 except Exception:
748 pass
749
750 def noop_gen(data, **kwargs):
751 for d in data:
752 yield d
753
754 if no_filters:
755 apply_filters = noop_gen
756 else:
757 apply_filters = self._postprocess_variant_records
758
759 output_header = self._compile_common_header(
760 temporary_snp_files[0],
761 no_filters
762 )
763 if indels_out is None:
764 with open(snps_out, 'w') as o:
765 o.write(str(output_header).format(**filter_args))
766 for snp_f, indel_f in zip(
767 temporary_snp_files, temporary_indel_files
768 ):
769 with pysam.VariantFile(snp_f, 'r') as snp_invcf:
770 # fix the input header on the fly
771 # to avoid Warnings from htslib about missing contig
772 # info
773 self._add_ref_contigs_to_header(snp_invcf.header)
774 self._add_filters_to_header(snp_invcf.header)
775 self._add_indel_info_flag_to_header(snp_invcf.header)
776 with pysam.VariantFile(indel_f, 'r') as indel_invcf:
777 # fix the input header on the fly
778 # to avoid Warnings from htslib about missing
779 # contig info
780 self._add_ref_contigs_to_header(indel_invcf.header)
781 self._add_filters_to_header(indel_invcf.header)
782 self._add_indel_info_flag_to_header(
783 indel_invcf.header
784 )
785 for record in apply_filters(
786 self._merge_generator(
787 snp_invcf,
788 self._indel_flagged_records(indel_invcf)
789 ),
790 **filter_args
791 ):
792 o.write(str(record))
793 try:
794 os.remove(snp_f)
795 except Exception:
796 pass
797 try:
798 os.remove(indel_f)
799 except Exception:
800 pass
801
802 else:
803 with open(snps_out, 'w') as o:
804 o.write(str(output_header).format(**filter_args))
805 for f in temporary_snp_files:
806 with pysam.VariantFile(f, 'r') as invcf:
807 # fix the input header on the fly
808 # to avoid Warnings from htslib about missing
809 # contig info and errors because of undeclared
810 # filters
811 self._add_ref_contigs_to_header(invcf.header)
812 self._add_filters_to_header(invcf.header)
813 for record in apply_filters(
814 invcf, **filter_args
815 ):
816 o.write(str(record))
817 try:
818 os.remove(f)
819 except Exception:
820 pass
821 with open(indels_out, 'w') as o:
822 o.write(str(output_header))
823 for f in temporary_indel_files:
824 with pysam.VariantFile(f, 'r') as invcf:
825 # fix the input header on the fly
826 # to avoid Warnings from htslib about missing
827 # contig info and errors because of undeclared
828 # filters
829 self._add_ref_contigs_to_header(invcf.header)
830 self._add_filters_to_header(invcf.header)
831 self._add_indel_info_flag_to_header(invcf.header)
832 for record in apply_filters(
833 self._indel_flagged_records(invcf), **filter_args
834 ):
835 o.write(str(record))
836 try:
837 os.remove(f)
838 except Exception:
839 pass
840
841
842 def varscan_call(ref_genome, normal, tumor, output_path, **args):
843 """Preparse arguments and orchestrate calling and postprocessing."""
844
845 if args.pop('split_output'):
846 if '%T' in output_path:
847 out = (
848 output_path.replace('%T', 'snp'),
849 output_path.replace('%T', 'indel')
850 )
851 else:
852 out = (
853 output_path + '.snp',
854 output_path + '.indel'
855 )
856 else:
857 out = (output_path, None)
858
859 instance_args = {
860 k: args.pop(k) for k in [
861 'max_depth',
862 'min_mapqual',
863 'min_basequal',
864 'threads',
865 'verbose',
866 'quiet'
867 ]
868 }
869 varscan_somatic_args = {
870 k: args.pop(k) for k in [
871 'normal_purity',
872 'tumor_purity',
873 'min_coverage',
874 'min_var_count',
875 'min_var_freq',
876 'min_hom_freq',
877 'somatic_p_value',
878 'p_value'
879 ]
880 }
881
882 v = VarScanCaller(ref_genome, [normal, tumor], **instance_args)
883 v.varcall_parallel(**varscan_somatic_args)
884 v.merge_and_postprocess(*out, **args)
885
886
887 if __name__ == '__main__':
888 p = argparse.ArgumentParser()
889 p.add_argument(
890 'ref_genome',
891 metavar='reference_genome',
892 help='the reference genome (in fasta format)'
893 )
894 p.add_argument(
895 '--normal',
896 metavar='BAM_file', required=True,
897 help='the BAM input file of aligned reads from the normal sample'
898 )
899 p.add_argument(
900 '--tumor',
901 metavar='BAM_file', required=True,
902 help='the BAM input file of aligned reads from the tumor sample'
903 )
904 p.add_argument(
905 '-o', '--ofile', required=True,
906 metavar='OFILE', dest='output_path',
907 help='Name of the variant output file. With --split-output, the name '
908 'may use the %%T replacement token or will be used as the '
909 'basename for the two output files to be generated (see '
910 '-s|--split-output below).'
911 )
912 p.add_argument(
913 '-s', '--split-output',
914 dest='split_output', action='store_true', default=False,
915 help='indicate that separate output files for SNPs and indels '
916 'should be generated (original VarScan behavior). If specified, '
917 '%%T in the --ofile file name will be replaced with "snp" and '
918 '"indel" to generate the names of the SNP and indel output '
919 'files, respectively. If %%T is not found in the file name, it '
920 'will get interpreted as a basename to which ".snp"/".indel" '
921 'will be appended.'
922 )
923 p.add_argument(
924 '-t', '--threads',
925 type=int, default=1,
926 help='level of parallelism'
927 )
928 p.add_argument(
929 '-v', '--verbose',
930 action='store_true',
931 help='be verbose about progress'
932 )
933 p.add_argument(
934 '-q', '--quiet',
935 action='store_true',
936 help='suppress output from wrapped tools'
937 )
938 call_group = p.add_argument_group('Variant calling parameters')
939 call_group.add_argument(
940 '--normal-purity',
941 dest='normal_purity', type=float,
942 default=1.0,
943 help='Estimated purity of the normal sample (default: 1.0)'
944 )
945 call_group.add_argument(
946 '--tumor-purity',
947 dest='tumor_purity', type=float,
948 default=1.0,
949 help='Estimated purity of the tumor sample (default: 1.0)'
950 )
951 call_group.add_argument(
952 '--max-pileup-depth',
953 dest='max_depth', type=int, default=8000,
954 help='Maximum depth of generated pileups (samtools mpileup -d option; '
955 'default: 8000)'
956 )
957 call_group.add_argument(
958 '--min-basequal',
959 dest='min_basequal', type=int,
960 default=13,
961 help='Minimum base quality at the variant position to use a read '
962 '(default: 13)'
963 )
964 call_group.add_argument(
965 '--min-mapqual',
966 dest='min_mapqual', type=int,
967 default=0,
968 help='Minimum mapping quality required to use a read '
969 '(default: 0)'
970 )
971 call_group.add_argument(
972 '--min-coverage',
973 dest='min_coverage', type=int,
974 default=8,
975 help='Minimum site coverage required in the normal and in the tumor '
976 'sample to call a variant (default: 8)'
977 )
978 call_group.add_argument(
979 '--min-var-count',
980 dest='min_var_count', type=int,
981 default=2,
982 help='Minimum number of variant-supporting reads required to call a '
983 'variant (default: 2)'
984 )
985 call_group.add_argument(
986 '--min-var-freq',
987 dest='min_var_freq', type=float,
988 default=0.1,
989 help='Minimum variant allele frequency for calling (default: 0.1)'
990 )
991 call_group.add_argument(
992 '--min-hom-freq',
993 dest='min_hom_freq', type=float,
994 default=0.75,
995 help='Minimum variant allele frequency for homozygous call '
996 '(default: 0.75)'
997 )
998 call_group.add_argument(
999 '--p-value',
1000 dest='p_value', type=float,
1001 default=0.99,
1002 help='P-value threshold for heterozygous call (default: 0.99)'
1003 )
1004 call_group.add_argument(
1005 '--somatic-p-value',
1006 dest='somatic_p_value', type=float,
1007 default=0.05,
1008 help='P-value threshold for somatic call (default: 0.05)'
1009 )
1010 filter_group = p.add_argument_group('Posterior variant filter parameters')
1011 filter_group.add_argument(
1012 '--no-filters',
1013 dest='no_filters', action='store_true',
1014 help='Disable all posterior variant filters. '
1015 'If specified, all following options will be ignored'
1016 )
1017 filter_group.add_argument(
1018 '--min-var-count2',
1019 dest='min_var_count2', type=int,
1020 default=4,
1021 help='Minimum number of variant-supporting reads (default: 4)'
1022 )
1023 filter_group.add_argument(
1024 '--min-var-count2-lc',
1025 dest='min_var_count2_lc', type=int,
1026 default=2,
1027 help='Minimum number of variant-supporting reads when depth below '
1028 '--somatic-p-depth (default: 2)'
1029 )
1030 filter_group.add_argument(
1031 '--min-var-freq2',
1032 dest='min_var_freq2', type=float,
1033 default=0.05,
1034 help='Minimum variant allele frequency (default: 0.05)'
1035 )
1036 filter_group.add_argument(
1037 '--max-somatic-p',
1038 dest='max_somatic_p', type=float,
1039 default=0.05,
1040 help='Maximum somatic p-value (default: 0.05)'
1041 )
1042 filter_group.add_argument(
1043 '--max-somatic-p-depth',
1044 dest='max_somatic_p_depth', type=int,
1045 default=10,
1046 help='Depth required to run --max-somatic-p filter (default: 10)'
1047 )
1048 filter_group.add_argument(
1049 '--min-ref-readpos',
1050 dest='min_ref_readpos', type=float,
1051 default=0.1,
1052 help='Minimum average relative distance of site from the ends of '
1053 'ref-supporting reads (default: 0.1)'
1054 )
1055 filter_group.add_argument(
1056 '--min-var-readpos',
1057 dest='min_var_readpos', type=float,
1058 default=0.1,
1059 help='Minimum average relative distance of site from the ends of '
1060 'variant-supporting reads (default: 0.1)'
1061 )
1062 filter_group.add_argument(
1063 '--min-ref-dist3',
1064 dest='min_ref_dist3', type=float,
1065 default=0.1,
1066 help='Minimum average relative distance of site from the effective '
1067 '3\'end of ref-supporting reads (default: 0.1)'
1068 )
1069 filter_group.add_argument(
1070 '--min-var-dist3',
1071 dest='min_var_dist3', type=float,
1072 default=0.1,
1073 help='Minimum average relative distance of site from the effective '
1074 '3\'end of variant-supporting reads (default: 0.1)'
1075 )
1076 filter_group.add_argument(
1077 '--min-ref-len',
1078 dest='min_ref_len', type=int,
1079 default=90,
1080 help='Minimum average trimmed length of reads supporting the ref '
1081 'allele (default: 90)'
1082 )
1083 filter_group.add_argument(
1084 '--min-var-len',
1085 dest='min_var_len', type=int,
1086 default=90,
1087 help='Minimum average trimmed length of reads supporting the variant '
1088 'allele (default: 90)'
1089 )
1090 filter_group.add_argument(
1091 '--max-len-diff',
1092 dest='max_relative_len_diff', type=float,
1093 default=0.25,
1094 help='Maximum average relative read length difference (ref - var; '
1095 'default: 0.25)'
1096 )
1097 filter_group.add_argument(
1098 '--min-strandedness',
1099 dest='min_strandedness', type=float,
1100 default=0.01,
1101 help='Minimum fraction of variant reads from each strand '
1102 '(default: 0.01)'
1103 )
1104 filter_group.add_argument(
1105 '--min-strand-reads',
1106 dest='min_strand_reads', type=int,
1107 default=5,
1108 help='Minimum allele depth required to run --min-strandedness filter '
1109 '(default: 5)'
1110 )
1111 filter_group.add_argument(
1112 '--min-ref-basequal',
1113 dest='min_ref_basequal', type=int,
1114 default=15,
1115 help='Minimum average base quality for the ref allele (default: 15)'
1116 )
1117 filter_group.add_argument(
1118 '--min-var-basequal',
1119 dest='min_var_basequal', type=int,
1120 default=15,
1121 help='Minimum average base quality for the variant allele '
1122 '(default: 15)'
1123 )
1124 filter_group.add_argument(
1125 '--max-basequal-diff',
1126 dest='max_basequal_diff', type=int,
1127 default=50,
1128 help='Maximum average base quality diff (ref - var; default: 50)'
1129 )
1130 filter_group.add_argument(
1131 '--min-ref-mapqual',
1132 dest='min_ref_mapqual', type=int,
1133 default=15,
1134 help='Minimum average mapping quality of reads supporting the ref '
1135 'allele (default: 15)'
1136 )
1137 filter_group.add_argument(
1138 '--min-var-mapqual',
1139 dest='min_var_mapqual', type=int,
1140 default=15,
1141 help='Minimum average mapping quality of reads supporting the variant '
1142 'allele (default: 15)'
1143 )
1144 filter_group.add_argument(
1145 '--max-mapqual-diff',
1146 dest='max_mapqual_diff', type=int,
1147 default=50,
1148 help='Maximum average mapping quality difference (ref - var; '
1149 'default: 50)'
1150 )
1151 filter_group.add_argument(
1152 '--max-ref-mmqs',
1153 dest='max_ref_mmqs', type=int,
1154 default=100,
1155 help='Maximum mismatch quality sum of reads supporting the ref '
1156 'allele (default: 100)'
1157 )
1158 filter_group.add_argument(
1159 '--max-var-mmqs',
1160 dest='max_var_mmqs', type=int,
1161 default=100,
1162 help='Maximum mismatch quality sum of reads supporting the variant '
1163 'allele (default: 100)'
1164 )
1165 filter_group.add_argument(
1166 '--min-mmqs-diff',
1167 dest='min_mmqs_diff', type=int,
1168 default=0,
1169 help='Minimum mismatch quality sum difference (var - ref; default: 0)'
1170 )
1171 filter_group.add_argument(
1172 '--max-mmqs-diff',
1173 dest='max_mmqs_diff', type=int,
1174 default=50,
1175 help='Maximum mismatch quality sum difference (var - ref; default: 50)'
1176 )
1177 args = vars(p.parse_args())
1178 varscan_call(**args)