Mercurial > repos > iuc > varscan_copynumber
diff varscan.py @ 2:4ce3441c407c draft
planemo upload for repository https://github.com/galaxyproject/iuc/tree/master/tools/varscan commit 30867f1f022bed18ba1c3b8dc9c54226890b3a9c
author | iuc |
---|---|
date | Tue, 04 Dec 2018 05:15:15 -0500 |
parents | |
children | 0cb031c7f464 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/varscan.py Tue Dec 04 05:15:15 2018 -0500 @@ -0,0 +1,1178 @@ +#!/usr/bin/env python3 +from __future__ import print_function + +import argparse +import io +import os +import subprocess +import sys +import tempfile +import time +from contextlib import ExitStack +from functools import partial +from threading import Thread + +import pysam + + +class VariantCallingError (RuntimeError): + """Exception class for issues with samtools and varscan subprocesses.""" + + def __init__(self, message=None, call='', error=''): + self.message = message + self.call = call.strip() + self.error = error.strip() + + def __str__(self): + if self.message is None: + return '' + if self.error: + msg_header = '"{0}" failed with:\n{1}\n\n'.format( + self.call, self.error + ) + else: + msg_header = '{0} failed.\n' + 'No further information about this error is available.\n\n'.format( + self.call + ) + return msg_header + self.message + + +class VarScanCaller (object): + def __init__(self, ref_genome, bam_input_files, + max_depth=None, + min_mapqual=None, min_basequal=None, + threads=1, verbose=False, quiet=True + ): + self.ref_genome = ref_genome + self.bam_input_files = bam_input_files + self.max_depth = max_depth + self.min_mapqual = min_mapqual + self.min_basequal = min_basequal + self.threads = threads + self.verbose = verbose + self.quiet = quiet + + with pysam.FastaFile(ref_genome) as ref_fa: + self.ref_contigs = ref_fa.references + self.ref_lengths = ref_fa.lengths + + self.pileup_engine = ['samtools', 'mpileup'] + self.varcall_engine = ['varscan', 'somatic'] + self.requires_stdout_redirect = False + self.TemporaryContigVCF = partial( + tempfile.NamedTemporaryFile, + mode='wb', suffix='', delete=False, dir=os.getcwd() + ) + self.tmpfiles = [] + + def _get_pysam_pileup_args(self): + param_dict = {} + if self.max_depth is not None: + param_dict['max_depth'] = self.max_depth + if self.min_mapqual is not None: + param_dict['min_mapping_quality'] = self.min_mapqual + if self.min_basequal is not None: + param_dict['min_base_quality'] = self.min_basequal + param_dict['compute_baq'] = False + param_dict['stepper'] = 'samtools' + return param_dict + + def varcall_parallel(self, normal_purity=None, tumor_purity=None, + min_coverage=None, + min_var_count=None, + min_var_freq=None, min_hom_freq=None, + p_value=None, somatic_p_value=None, + threads=None, verbose=None, quiet=None + ): + if not threads: + threads = self.threads + if verbose is None: + verbose = self.verbose + if quiet is None: + quiet = self.quiet + # mapping of method parameters to varcall engine command line options + varcall_engine_option_mapping = [ + ('--normal-purity', normal_purity), + ('--tumor-purity', tumor_purity), + ('--min-coverage', min_coverage), + ('--min-reads2', min_var_count), + ('--min-var-freq', min_var_freq), + ('--min-freq-for-hom', min_hom_freq), + ('--p-value', p_value), + ('--somatic-p-value', somatic_p_value), + ('--min-avg-qual', self.min_basequal) + ] + varcall_engine_options = [] + for option, value in varcall_engine_option_mapping: + if value is not None: + varcall_engine_options += [option, str(value)] + pileup_engine_options = ['-B'] + if self.max_depth is not None: + pileup_engine_options += ['-d', str(self.max_depth)] + if self.min_mapqual is not None: + pileup_engine_options += ['-q', str(self.min_mapqual)] + if self.min_basequal is not None: + pileup_engine_options += ['-Q', str(self.min_basequal)] + + # Create a tuple of calls to samtools mpileup and varscan for + # each contig. The contig name is stored as the third element of + # that tuple. + # The calls are stored in the reverse order of the contig list so + # that they can be popped off later in the original order + calls = [( + self.pileup_engine + pileup_engine_options + [ + '-r', contig + ':', + '-f', self.ref_genome + ] + self.bam_input_files, + self.varcall_engine + [ + '-', '{out}', '--output-vcf', '1', '--mpileup', '1' + ] + varcall_engine_options, + contig + ) for contig in self.ref_contigs[::-1]] + + if verbose: + print('Starting variant calling ..') + + # launch subprocesses and monitor their status + subprocesses = [] + error_table = {} + tmp_io_started = [] + tmp_io_finished = [] + self.tmpfiles = [] + + def enqueue_stderr_output(out, stderr_buffer): + for line in iter(out.readline, b''): + # Eventually we are going to print the contents of + # the stderr_buffer to sys.stderr so we can + # decode things here using its encoding. + # We do a 'backslashreplace' just to be on the safe side. + stderr_buffer.write(line.decode(sys.stderr.encoding, + 'backslashreplace')) + out.close() + + try: + while subprocesses or calls: + while calls and len(subprocesses) < threads: + # There are still calls waiting for execution and we + # have unoccupied threads so we launch a new combined + # call to samtools mpileup and the variant caller. + + # pop the call arguments from our call stack + call = calls.pop() + # get the name of the contig that this call is going + # to work on + contig = call[2] + # Based on the contig name, generate a readable and + # file system-compatible prefix and use it to create + # a named temporary file, to which the call output + # will be redirected. + # At the moment we create the output file we add it to + # the list of all temporary output files so that we can + # remove it eventually during cleanup. + call_out = self.TemporaryContigVCF( + prefix=''.join( + c if c.isalnum() else '_' for c in contig + ) + '_', + ) + # maintain a list of variant call outputs + # in the order the subprocesses got launched + tmp_io_started.append(call_out.name) + if self.requires_stdout_redirect: + # redirect stdout to the temporary file just created + stdout_p2 = call_out + stderr_p2 = subprocess.PIPE + else: + # variant caller wants to write output to file directly + stdout_p2 = subprocess.PIPE + stderr_p2 = subprocess.STDOUT + call[1][call[1].index('{out}')] = call_out.name + call_out.close() + # for reporting purposes, join the arguments for the + # samtools and the variant caller calls into readable + # strings + c_str = (' '.join(call[0]), ' '.join(call[1])) + error_table[c_str] = [io.StringIO(), io.StringIO()] + # start the subprocesses + p1 = subprocess.Popen( + call[0], + stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + p2 = subprocess.Popen( + call[1], + stdin=p1.stdout, + stdout=stdout_p2, + stderr=stderr_p2 + ) + # subprocess bookkeeping + subprocesses.append((c_str, p1, p2, call_out, contig)) + # make sure our newly launched call does not block + # because its buffers fill up + p1.stdout.close() + t1 = Thread( + target=enqueue_stderr_output, + args=(p1.stderr, error_table[c_str][0]) + ) + t2 = Thread( + target=enqueue_stderr_output, + args=( + p2.stderr + if self.requires_stdout_redirect else + p2.stdout, + error_table[c_str][1] + ) + ) + t1.daemon = t2.daemon = True + t1.start() + t2.start() + + if verbose: + print( + 'Calling variants for contig: {0}' + .format(call[2]) + ) + + # monitor all running calls to see if any of them are done + for call, p1, p2, ofo, contig in subprocesses: + p1_stat = p1.poll() + p2_stat = p2.poll() + if p1_stat is not None or p2_stat is not None: + # There is an outcome for this process! + # Lets see what it is + if p1_stat or p2_stat: + print() + print( + error_table[call][0].getvalue(), + error_table[call][1].getvalue(), + file=sys.stderr + ) + raise VariantCallingError( + 'Variant Calling for contig {0} failed.' + .format(contig), + call='{0} | {1}'.format(call[0], call[1]) + ) + if p1_stat == 0 and p2_stat is None: + # VarScan is not handling the no output from + # samtools mpileup situation correctly so maybe + # that's the issue here + last_words = error_table[call][1].getvalue( + ).splitlines()[-4:] + if len(last_words) < 4 or any( + not msg.startswith('Input stream not ready') + for msg in last_words + ): + break + # lets give this process a bit more time + # VarScan is waiting for input it will never + # get, stop it. + p2.terminate() + subprocesses.remove((call, p1, p2, ofo, contig)) + ofo.close() + break + if p2_stat == 0: + # Things went fine. + # maintain a list of variant call outputs + # that finished successfully (in the order + # they finished) + tmp_io_finished.append(ofo.name) + if verbose: + print() + print('Contig {0} finished.'.format(contig)) + if not quiet: + print() + print( + 'stderr output from samtools mpileup/' + 'bcftools:'.upper(), + file=sys.stderr + ) + print( + error_table[call][0].getvalue(), + error_table[call][1].getvalue(), + file=sys.stderr + ) + # Discard the collected stderr output from + # the call, remove the call from the list of + # running calls and close its output file. + del error_table[call] + subprocesses.remove((call, p1, p2, ofo, contig)) + # Closing the output file is important or we + # may hit the file system limit for open files + # if there are lots of contigs. + ofo.close() + break + # wait a bit in between monitoring cycles + time.sleep(2) + finally: + for call, p1, p2, ofo, contig in subprocesses: + # make sure we do not leave running subprocesses behind + for proc in (p1, p2): + try: + proc.terminate() + except Exception: + pass + # close currently open files + ofo.close() + # store the files with finished content in the order that + # the corresponding jobs were launched + self.tmpfiles = [f for f in tmp_io_started if f in tmp_io_finished] + # Make sure remaining buffered stderr output of + # subprocesses does not get lost. + # Currently, we don't screen this output for real errors, + # but simply rewrite everything. + if not quiet and error_table: + print() + print( + 'stderr output from samtools mpileup/bcftools:'.upper(), + file=sys.stderr + ) + for call, errors in error_table.items(): + print(' | '.join(call), ':', file=sys.stderr) + print('-' * 20, file=sys.stderr) + print('samtools mpileup output:', file=sys.stderr) + print(errors[0].getvalue(), file=sys.stderr) + print('varscan somatic output:', file=sys.stderr) + print(errors[1].getvalue(), file=sys.stderr) + + def _add_ref_contigs_to_header(self, header): + for chrom, length in zip(self.ref_contigs, self.ref_lengths): + header.add_meta( + 'contig', + items=[('ID', chrom), ('length', length)] + ) + + def _add_filters_to_header(self, header): + varscan_fpfilters = { + 'VarCount': 'Fewer than {min_var_count2} variant-supporting reads', + 'VarFreq': 'Variant allele frequency below {min_var_freq2}', + 'VarAvgRL': + 'Average clipped length of variant-supporting reads < ' + '{min_var_len}', + 'VarReadPos': 'Relative average read position < {min_var_readpos}', + 'VarDist3': + 'Average distance to effective 3\' end < {min_var_dist3}', + 'VarMMQS': + 'Average mismatch quality sum for variant reads > ' + '{max_var_mmqs}', + 'VarMapQual': + 'Average mapping quality of variant reads < {min_var_mapqual}', + 'VarBaseQual': + 'Average base quality of variant reads < {min_var_basequal}', + 'Strand': + 'Strand representation of variant reads < {min_strandedness}', + 'RefAvgRL': + 'Average clipped length of ref-supporting reads < ' + '{min_ref_len}', + 'RefReadPos': + 'Relative average read position < {min_ref_readpos}', + 'RefDist3': + 'Average distance to effective 3\' end < {min_ref_dist3}', + 'RefMapQual': + 'Average mapping quality of reference reads < ' + '{min_ref_mapqual}', + 'RefBaseQual': + 'Average base quality of ref-supporting reads < ' + '{min_ref_basequal}', + 'RefMMQS': + 'Average mismatch quality sum for ref-supporting reads > ' + '{max_ref_mmqs}', + 'MMQSdiff': + 'Mismatch quality sum difference (var - ref) > ' + '{max_mmqs_diff}', + 'MinMMQSdiff': + 'Mismatch quality sum difference (var - ref) < ' + '{max_mmqs_diff}', + 'MapQualDiff': + 'Mapping quality difference (ref - var) > {max_mapqual_diff}', + 'MaxBAQdiff': + 'Average base quality difference (ref - var) > ' + '{max_basequal_diff}', + 'ReadLenDiff': + 'Average supporting read length difference (ref - var) > ' + '{max_relative_len_diff}', + } + for filter_id, description in varscan_fpfilters.items(): + header.filters.add(filter_id, None, None, description) + + def _add_indel_info_flag_to_header(self, header): + header.info.add( + 'INDEL', 0, 'Flag', 'Indicates that the variant is an INDEL' + ) + + def _compile_common_header(self, varcall_template, no_filters=False): + # fix the header generated by VarScan + # by adding reference and contig information + common_header = pysam.VariantHeader() + common_header.add_meta('reference', value=self.ref_genome) + self._add_ref_contigs_to_header(common_header) + if not no_filters: + # add filter info + self._add_filters_to_header(common_header) + # change the source information + common_header.add_meta('source', value='varscan.py') + # declare an INDEL flag for record INFO fields + self._add_indel_info_flag_to_header(common_header) + # take the remaining metadata from the template header produced by + # VarScan + with pysam.VariantFile(varcall_template, 'r') as original_data: + varscan_header = original_data.header + for sample in varscan_header.samples: + common_header.samples.add(sample) + common_header.merge(varscan_header) + return common_header + + def pileup_masker(self, mask): + def apply_mask_on_pileup(piled_items): + for item, status in zip(piled_items, mask): + if status: + yield item + return apply_mask_on_pileup + + def get_allele_specific_pileup_column_stats( + self, allele, pile_column, ref_fetch + ): + # number of reads supporting the given allele on + # forward and reverse strand, and in total + var_reads_plus = var_reads_minus = 0 + var_supp_read_mask = [] + for base in pile_column.get_query_sequences(): + if base == allele: + # allele supporting read on + strand + var_reads_plus += 1 + var_supp_read_mask.append(True) + elif base.upper() == allele: + # allele supporting read on - strand + var_reads_minus += 1 + var_supp_read_mask.append(True) + else: + var_supp_read_mask.append(False) + var_reads_total = var_reads_plus + var_reads_minus + + if var_reads_total == 0: + # No stats without reads! + return None + + var_supp_only = self.pileup_masker(var_supp_read_mask) + + # average mapping quality of the reads supporting the + # given allele + avg_mapping_quality = sum( + mq for mq in var_supp_only( + pile_column.get_mapping_qualities() + ) + ) / var_reads_total + + # for the remaining stats we need access to complete + # read information + piled_reads = [ + p for p in var_supp_only(pile_column.pileups) + ] + assert len(piled_reads) == var_reads_total + sum_avg_base_qualities = 0 + sum_dist_from_center = 0 + sum_dist_from_3prime = 0 + sum_clipped_length = 0 + sum_unclipped_length = 0 + sum_num_mismatches_as_fraction = 0 + sum_mismatch_qualities = 0 + + for p in piled_reads: + sum_avg_base_qualities += sum( + p.alignment.query_qualities + ) / p.alignment.infer_query_length() + sum_clipped_length += p.alignment.query_alignment_length + unclipped_length = p.alignment.infer_read_length() + sum_unclipped_length += unclipped_length + read_center = p.alignment.query_alignment_length / 2 + sum_dist_from_center += 1 - abs( + p.query_position - read_center + ) / read_center + if p.alignment.is_reverse: + sum_dist_from_3prime += p.query_position / unclipped_length + else: + sum_dist_from_3prime += 1 - p.query_position / unclipped_length + + sum_num_mismatches = 0 + for qpos, rpos in p.alignment.get_aligned_pairs(): + if qpos is not None and rpos is not None: + if p.alignment.query_sequence[qpos] != ref_fetch( + rpos, rpos + 1 + ).upper(): # ref bases can be lowercase! + sum_num_mismatches += 1 + sum_mismatch_qualities += p.alignment.query_qualities[ + qpos + ] + sum_num_mismatches_as_fraction += ( + sum_num_mismatches / p.alignment.query_alignment_length + ) + avg_basequality = sum_avg_base_qualities / var_reads_total + avg_pos_as_fraction = sum_dist_from_center / var_reads_total + avg_num_mismatches_as_fraction = ( + sum_num_mismatches_as_fraction / var_reads_total + ) + avg_sum_mismatch_qualities = sum_mismatch_qualities / var_reads_total + avg_clipped_length = sum_clipped_length / var_reads_total + avg_distance_to_effective_3p_end = ( + sum_dist_from_3prime / var_reads_total + ) + + return ( + avg_mapping_quality, + avg_basequality, + var_reads_plus, + var_reads_minus, + avg_pos_as_fraction, + avg_num_mismatches_as_fraction, + avg_sum_mismatch_qualities, + avg_clipped_length, + avg_distance_to_effective_3p_end + ) + + def _postprocess_variant_records(self, invcf, + min_var_count2, min_var_count2_lc, + min_var_freq2, max_somatic_p, + max_somatic_p_depth, + min_ref_readpos, min_var_readpos, + min_ref_dist3, min_var_dist3, + min_ref_len, min_var_len, + max_relative_len_diff, + min_strandedness, min_strand_reads, + min_ref_basequal, min_var_basequal, + max_basequal_diff, + min_ref_mapqual, min_var_mapqual, + max_mapqual_diff, + max_ref_mmqs, max_var_mmqs, + min_mmqs_diff, max_mmqs_diff, + **args): + # set FILTER field according to Varscan criteria + # multiple FILTER entries must be separated by semicolons + # No filters applied should be indicated with MISSING + + # since posterior filters are always applied to just one sample, + # a better place to store the info is in the FT genotype field: + # can be PASS, '.' to indicate that filters have not been applied, + # or a semicolon-separated list of filters that failed + # unfortunately, gemini does not support this field + + with ExitStack() as io_stack: + normal_reads, tumor_reads = ( + io_stack.enter_context( + pysam.Samfile(fn, 'rb')) for fn in self.bam_input_files + ) + refseq = io_stack.enter_context(pysam.FastaFile(self.ref_genome)) + pileup_args = self._get_pysam_pileup_args() + for record in invcf: + if any(len(allele) > 1 for allele in record.alleles): + # skip indel postprocessing for the moment + yield record + continue + # get pileup for genomic region affected by this variant + if record.info['SS'] == '2': + # a somatic variant => generate pileup from tumor data + pile = tumor_reads.pileup( + record.chrom, record.start, record.stop, + **pileup_args + ) + sample_of_interest = 'TUMOR' + elif record.info['SS'] in ['1', '3']: + # a germline or LOH variant => pileup from normal data + pile = normal_reads.pileup( + record.chrom, record.start, record.stop, + **pileup_args + ) + sample_of_interest = 'NORMAL' + else: + # TO DO: figure out if there is anything interesting to do + # for SS status codes 0 (reference) and 5 (unknown) + yield record + continue + + # apply false-positive filtering a la varscan fpfilter + # find the variant site in the pileup columns + for pile_column in pile: + if pile_column.reference_pos == record.start: + break + # extract required information + # overall read depth at the site + read_depth = pile_column.get_num_aligned() + assert read_depth > 0 + # no multiallelic sites in varscan + assert len(record.alleles) == 2 + if record.samples[sample_of_interest]['RD'] > 0: + ref_stats, alt_stats = [ + self.get_allele_specific_pileup_column_stats( + allele, + pile_column, + partial( + pysam.FastaFile.fetch, refseq, record.chrom + ) + ) + for allele in record.alleles + ] + else: + ref_stats = None + alt_stats = self.get_allele_specific_pileup_column_stats( + record.alleles[1], + pile_column, + partial(pysam.FastaFile.fetch, refseq, record.chrom) + ) + ref_count = 0 + if ref_stats: + ref_count = ref_stats[2] + ref_stats[3] + if ref_stats[1] < min_ref_basequal: + record.filter.add('RefBaseQual') + if ref_count >= 2: + if ref_stats[0] < min_ref_mapqual: + record.filter.add('RefMapQual') + if ref_stats[4] < min_ref_readpos: + record.filter.add('RefReadPos') + # ref_stats[5] (avg_num_mismatches_as_fraction + # is not a filter criterion in VarScan fpfilter + if ref_stats[6] > max_ref_mmqs: + record.filter.add('RefMMQS') + if ref_stats[7] < min_ref_len: + # VarScan fpfilter does not apply this filter + # for indels, but there is no reason + # not to do it. + record.filter.add('RefAvgRL') + if ref_stats[8] < min_ref_dist3: + record.filter.add('RefDist3') + if alt_stats: + alt_count = alt_stats[2] + alt_stats[3] + if ( + alt_count < min_var_count2_lc + ) or ( + read_depth >= max_somatic_p_depth and + alt_count < min_var_count2 + ): + record.filter.add('VarCount') + if alt_count / read_depth < min_var_freq2: + record.filter.add('VarFreq') + if alt_stats[1] < min_var_basequal: + record.filter.add('VarBaseQual') + if alt_count > min_strand_reads: + if ( + alt_stats[2] / alt_count < min_strandedness + ) or ( + alt_stats[3] / alt_count < min_strandedness + ): + record.filter.add('Strand') + if alt_stats[2] + alt_stats[3] >= 2: + if alt_stats[0] < min_var_mapqual: + record.filter.add('VarMapQual') + if alt_stats[4] < min_var_readpos: + record.filter.add('VarReadPos') + # alt_stats[5] (avg_num_mismatches_as_fraction + # is not a filter criterion in VarScan fpfilter + if alt_stats[6] > max_var_mmqs: + record.filter.add('VarMMQS') + if alt_stats[7] < min_var_len: + # VarScan fpfilter does not apply this filter + # for indels, but there is no reason + # not to do it. + record.filter.add('VarAvgRL') + if alt_stats[8] < min_var_dist3: + record.filter.add('VarDist3') + if ref_count >= 2 and alt_count >= 2: + if (ref_stats[0] - alt_stats[0]) > max_mapqual_diff: + record.filter.add('MapQualDiff') + if (ref_stats[1] - alt_stats[1]) > max_basequal_diff: + record.filter.add('MaxBAQdiff') + mmqs_diff = alt_stats[6] - ref_stats[6] + if mmqs_diff < min_mmqs_diff: + record.filter.add('MinMMQSdiff') + if mmqs_diff > max_mmqs_diff: + record.filter.add('MMQSdiff') + if ( + 1 - alt_stats[7] / ref_stats[7] + ) > max_relative_len_diff: + record.filter.add('ReadLenDiff') + else: + # No variant-supporting reads for this record! + # This can happen in rare cases because of + # samtools mpileup issues, but indicates a + # rather unreliable variant call. + record.filter.add('VarCount') + record.filter.add('VarFreq') + yield record + + def _indel_flagged_records(self, vcf): + for record in vcf: + record.info['INDEL'] = True + yield record + + def _merge_generator(self, vcf1, vcf2): + try: + record1 = next(vcf1) + except StopIteration: + for record2 in vcf2: + yield record2 + return + try: + record2 = next(vcf2) + except StopIteration: + yield record1 + for record1 in vcf1: + yield record1 + return + while True: + if (record1.start, record1.stop) < (record2.start, record2.stop): + yield record1 + try: + record1 = next(vcf1) + except StopIteration: + yield record2 + for record2 in vcf2: + yield record2 + return + else: + yield record2 + try: + record2 = next(vcf2) + except StopIteration: + yield record1 + for record1 in vcf1: + yield record1 + return + + def merge_and_postprocess(self, snps_out, indels_out=None, + no_filters=False, **filter_args): + temporary_data = self.tmpfiles + self.tmpfiles = [] + temporary_snp_files = [f + '.snp.vcf' for f in temporary_data] + temporary_indel_files = [f + '.indel.vcf' for f in temporary_data] + + for f in temporary_data: + try: + os.remove(f) + except Exception: + pass + + def noop_gen(data, **kwargs): + for d in data: + yield d + + if no_filters: + apply_filters = noop_gen + else: + apply_filters = self._postprocess_variant_records + + output_header = self._compile_common_header( + temporary_snp_files[0], + no_filters + ) + if indels_out is None: + with open(snps_out, 'w') as o: + o.write(str(output_header).format(**filter_args)) + for snp_f, indel_f in zip( + temporary_snp_files, temporary_indel_files + ): + with pysam.VariantFile(snp_f, 'r') as snp_invcf: + # fix the input header on the fly + # to avoid Warnings from htslib about missing contig + # info + self._add_ref_contigs_to_header(snp_invcf.header) + self._add_filters_to_header(snp_invcf.header) + self._add_indel_info_flag_to_header(snp_invcf.header) + with pysam.VariantFile(indel_f, 'r') as indel_invcf: + # fix the input header on the fly + # to avoid Warnings from htslib about missing + # contig info + self._add_ref_contigs_to_header(indel_invcf.header) + self._add_filters_to_header(indel_invcf.header) + self._add_indel_info_flag_to_header( + indel_invcf.header + ) + for record in apply_filters( + self._merge_generator( + snp_invcf, + self._indel_flagged_records(indel_invcf) + ), + **filter_args + ): + o.write(str(record)) + try: + os.remove(snp_f) + except Exception: + pass + try: + os.remove(indel_f) + except Exception: + pass + + else: + with open(snps_out, 'w') as o: + o.write(str(output_header).format(**filter_args)) + for f in temporary_snp_files: + with pysam.VariantFile(f, 'r') as invcf: + # fix the input header on the fly + # to avoid Warnings from htslib about missing + # contig info and errors because of undeclared + # filters + self._add_ref_contigs_to_header(invcf.header) + self._add_filters_to_header(invcf.header) + for record in apply_filters( + invcf, **filter_args + ): + o.write(str(record)) + try: + os.remove(f) + except Exception: + pass + with open(indels_out, 'w') as o: + o.write(str(output_header)) + for f in temporary_indel_files: + with pysam.VariantFile(f, 'r') as invcf: + # fix the input header on the fly + # to avoid Warnings from htslib about missing + # contig info and errors because of undeclared + # filters + self._add_ref_contigs_to_header(invcf.header) + self._add_filters_to_header(invcf.header) + self._add_indel_info_flag_to_header(invcf.header) + for record in apply_filters( + self._indel_flagged_records(invcf), **filter_args + ): + o.write(str(record)) + try: + os.remove(f) + except Exception: + pass + + +def varscan_call(ref_genome, normal, tumor, output_path, **args): + """Preparse arguments and orchestrate calling and postprocessing.""" + + if args.pop('split_output'): + if '%T' in output_path: + out = ( + output_path.replace('%T', 'snp'), + output_path.replace('%T', 'indel') + ) + else: + out = ( + output_path + '.snp', + output_path + '.indel' + ) + else: + out = (output_path, None) + + instance_args = { + k: args.pop(k) for k in [ + 'max_depth', + 'min_mapqual', + 'min_basequal', + 'threads', + 'verbose', + 'quiet' + ] + } + varscan_somatic_args = { + k: args.pop(k) for k in [ + 'normal_purity', + 'tumor_purity', + 'min_coverage', + 'min_var_count', + 'min_var_freq', + 'min_hom_freq', + 'somatic_p_value', + 'p_value' + ] + } + + v = VarScanCaller(ref_genome, [normal, tumor], **instance_args) + v.varcall_parallel(**varscan_somatic_args) + v.merge_and_postprocess(*out, **args) + + +if __name__ == '__main__': + p = argparse.ArgumentParser() + p.add_argument( + 'ref_genome', + metavar='reference_genome', + help='the reference genome (in fasta format)' + ) + p.add_argument( + '--normal', + metavar='BAM_file', required=True, + help='the BAM input file of aligned reads from the normal sample' + ) + p.add_argument( + '--tumor', + metavar='BAM_file', required=True, + help='the BAM input file of aligned reads from the tumor sample' + ) + p.add_argument( + '-o', '--ofile', required=True, + metavar='OFILE', dest='output_path', + help='Name of the variant output file. With --split-output, the name ' + 'may use the %%T replacement token or will be used as the ' + 'basename for the two output files to be generated (see ' + '-s|--split-output below).' + ) + p.add_argument( + '-s', '--split-output', + dest='split_output', action='store_true', default=False, + help='indicate that separate output files for SNPs and indels ' + 'should be generated (original VarScan behavior). If specified, ' + '%%T in the --ofile file name will be replaced with "snp" and ' + '"indel" to generate the names of the SNP and indel output ' + 'files, respectively. If %%T is not found in the file name, it ' + 'will get interpreted as a basename to which ".snp"/".indel" ' + 'will be appended.' + ) + p.add_argument( + '-t', '--threads', + type=int, default=1, + help='level of parallelism' + ) + p.add_argument( + '-v', '--verbose', + action='store_true', + help='be verbose about progress' + ) + p.add_argument( + '-q', '--quiet', + action='store_true', + help='suppress output from wrapped tools' + ) + call_group = p.add_argument_group('Variant calling parameters') + call_group.add_argument( + '--normal-purity', + dest='normal_purity', type=float, + default=1.0, + help='Estimated purity of the normal sample (default: 1.0)' + ) + call_group.add_argument( + '--tumor-purity', + dest='tumor_purity', type=float, + default=1.0, + help='Estimated purity of the tumor sample (default: 1.0)' + ) + call_group.add_argument( + '--max-pileup-depth', + dest='max_depth', type=int, default=8000, + help='Maximum depth of generated pileups (samtools mpileup -d option; ' + 'default: 8000)' + ) + call_group.add_argument( + '--min-basequal', + dest='min_basequal', type=int, + default=13, + help='Minimum base quality at the variant position to use a read ' + '(default: 13)' + ) + call_group.add_argument( + '--min-mapqual', + dest='min_mapqual', type=int, + default=0, + help='Minimum mapping quality required to use a read ' + '(default: 0)' + ) + call_group.add_argument( + '--min-coverage', + dest='min_coverage', type=int, + default=8, + help='Minimum site coverage required in the normal and in the tumor ' + 'sample to call a variant (default: 8)' + ) + call_group.add_argument( + '--min-var-count', + dest='min_var_count', type=int, + default=2, + help='Minimum number of variant-supporting reads required to call a ' + 'variant (default: 2)' + ) + call_group.add_argument( + '--min-var-freq', + dest='min_var_freq', type=float, + default=0.1, + help='Minimum variant allele frequency for calling (default: 0.1)' + ) + call_group.add_argument( + '--min-hom-freq', + dest='min_hom_freq', type=float, + default=0.75, + help='Minimum variant allele frequency for homozygous call ' + '(default: 0.75)' + ) + call_group.add_argument( + '--p-value', + dest='p_value', type=float, + default=0.99, + help='P-value threshold for heterozygous call (default: 0.99)' + ) + call_group.add_argument( + '--somatic-p-value', + dest='somatic_p_value', type=float, + default=0.05, + help='P-value threshold for somatic call (default: 0.05)' + ) + filter_group = p.add_argument_group('Posterior variant filter parameters') + filter_group.add_argument( + '--no-filters', + dest='no_filters', action='store_true', + help='Disable all posterior variant filters. ' + 'If specified, all following options will be ignored' + ) + filter_group.add_argument( + '--min-var-count2', + dest='min_var_count2', type=int, + default=4, + help='Minimum number of variant-supporting reads (default: 4)' + ) + filter_group.add_argument( + '--min-var-count2-lc', + dest='min_var_count2_lc', type=int, + default=2, + help='Minimum number of variant-supporting reads when depth below ' + '--somatic-p-depth (default: 2)' + ) + filter_group.add_argument( + '--min-var-freq2', + dest='min_var_freq2', type=float, + default=0.05, + help='Minimum variant allele frequency (default: 0.05)' + ) + filter_group.add_argument( + '--max-somatic-p', + dest='max_somatic_p', type=float, + default=0.05, + help='Maximum somatic p-value (default: 0.05)' + ) + filter_group.add_argument( + '--max-somatic-p-depth', + dest='max_somatic_p_depth', type=int, + default=10, + help='Depth required to run --max-somatic-p filter (default: 10)' + ) + filter_group.add_argument( + '--min-ref-readpos', + dest='min_ref_readpos', type=float, + default=0.1, + help='Minimum average relative distance of site from the ends of ' + 'ref-supporting reads (default: 0.1)' + ) + filter_group.add_argument( + '--min-var-readpos', + dest='min_var_readpos', type=float, + default=0.1, + help='Minimum average relative distance of site from the ends of ' + 'variant-supporting reads (default: 0.1)' + ) + filter_group.add_argument( + '--min-ref-dist3', + dest='min_ref_dist3', type=float, + default=0.1, + help='Minimum average relative distance of site from the effective ' + '3\'end of ref-supporting reads (default: 0.1)' + ) + filter_group.add_argument( + '--min-var-dist3', + dest='min_var_dist3', type=float, + default=0.1, + help='Minimum average relative distance of site from the effective ' + '3\'end of variant-supporting reads (default: 0.1)' + ) + filter_group.add_argument( + '--min-ref-len', + dest='min_ref_len', type=int, + default=90, + help='Minimum average trimmed length of reads supporting the ref ' + 'allele (default: 90)' + ) + filter_group.add_argument( + '--min-var-len', + dest='min_var_len', type=int, + default=90, + help='Minimum average trimmed length of reads supporting the variant ' + 'allele (default: 90)' + ) + filter_group.add_argument( + '--max-len-diff', + dest='max_relative_len_diff', type=float, + default=0.25, + help='Maximum average relative read length difference (ref - var; ' + 'default: 0.25)' + ) + filter_group.add_argument( + '--min-strandedness', + dest='min_strandedness', type=float, + default=0.01, + help='Minimum fraction of variant reads from each strand ' + '(default: 0.01)' + ) + filter_group.add_argument( + '--min-strand-reads', + dest='min_strand_reads', type=int, + default=5, + help='Minimum allele depth required to run --min-strandedness filter ' + '(default: 5)' + ) + filter_group.add_argument( + '--min-ref-basequal', + dest='min_ref_basequal', type=int, + default=15, + help='Minimum average base quality for the ref allele (default: 15)' + ) + filter_group.add_argument( + '--min-var-basequal', + dest='min_var_basequal', type=int, + default=15, + help='Minimum average base quality for the variant allele ' + '(default: 15)' + ) + filter_group.add_argument( + '--max-basequal-diff', + dest='max_basequal_diff', type=int, + default=50, + help='Maximum average base quality diff (ref - var; default: 50)' + ) + filter_group.add_argument( + '--min-ref-mapqual', + dest='min_ref_mapqual', type=int, + default=15, + help='Minimum average mapping quality of reads supporting the ref ' + 'allele (default: 15)' + ) + filter_group.add_argument( + '--min-var-mapqual', + dest='min_var_mapqual', type=int, + default=15, + help='Minimum average mapping quality of reads supporting the variant ' + 'allele (default: 15)' + ) + filter_group.add_argument( + '--max-mapqual-diff', + dest='max_mapqual_diff', type=int, + default=50, + help='Maximum average mapping quality difference (ref - var; ' + 'default: 50)' + ) + filter_group.add_argument( + '--max-ref-mmqs', + dest='max_ref_mmqs', type=int, + default=100, + help='Maximum mismatch quality sum of reads supporting the ref ' + 'allele (default: 100)' + ) + filter_group.add_argument( + '--max-var-mmqs', + dest='max_var_mmqs', type=int, + default=100, + help='Maximum mismatch quality sum of reads supporting the variant ' + 'allele (default: 100)' + ) + filter_group.add_argument( + '--min-mmqs-diff', + dest='min_mmqs_diff', type=int, + default=0, + help='Minimum mismatch quality sum difference (var - ref; default: 0)' + ) + filter_group.add_argument( + '--max-mmqs-diff', + dest='max_mmqs_diff', type=int, + default=50, + help='Maximum mismatch quality sum difference (var - ref; default: 50)' + ) + args = vars(p.parse_args()) + varscan_call(**args)