varscan_somatic: varscan.py comparison

comparison varscan.py @ 8:b79bb8b09822 draft

planemo upload for repository https://github.com/galaxyproject/iuc/tree/master/tools/varscan commit bd1034af3217cd9fc654d7187bf674a89465755b

author	iuc
date	Thu, 28 Mar 2019 18:19:00 -0400
parents	2657ab48e16a
children	4e97191a1ff7

comparison

equal deleted inserted replaced

-:2657ab48e16a
+:b79bb8b09822
 def _standardize_format_fields(self, header):
 """Add standard FORMAT key declarations to a VCF header."""
 format_field_specs = [
-('GT', '1', 'String', 'Genotype'),
+# pysam will not add quotes around single-word descriptions,
+# which is a violation of the VCF specification.
+# To work around this we are using two words as the
+# GT format description (instead of the commonly used "Genotype").
+# This could be changed once pysam learns to quote correctly.
+('GT', '1', 'String', 'Genotype code'),
 ('GQ', '1', 'Integer', 'Genotype quality'),
 ('DP', '1', 'Integer', 'Read depth'),
 ('AD', 'R', 'Integer', 'Read depth for each allele'),
 ('ADF', 'R', 'Integer',
 'Read depth for each allele on the forward strand'),
 if spec[0] in header.formats:
 header.formats.remove_header(spec[0])
 header.merge(temp_header)
 def _compile_common_header(self, varcall_template, no_filters=False):
-# fix the header generated by VarScan
+# read the header produced by VarScan for use as a template
-# by adding reference and contig information
-common_header = pysam.VariantHeader()
-common_header.add_meta('reference', value=self.ref_genome)
-self._add_ref_contigs_to_header(common_header)
-if not no_filters:
-# add filter info
-self._add_filters_to_header(common_header)
-# change the source information
-common_header.add_meta('source', value='varscan.py')
-# declare an INDEL flag for record INFO fields
-self._add_indel_info_flag_to_header(common_header)
-# generate standard FORMAT field declarations
-# including a correct AD declaration to prevent VarScan's
-# non-standard one from getting propagated
-self._standardize_format_fields(common_header)
-# take the remaining metadata from the template header produced by
-# VarScan
 with pysam.VariantFile(varcall_template, 'r') as original_data:
 varscan_header = original_data.header
 # don't propagate non-standard and redundant FORMAT keys written
 # by VarScan
 varscan_header.formats.remove_header('AD')
 varscan_header.formats.remove_header('FREQ')
 varscan_header.formats.remove_header('RD')
 varscan_header.formats.remove_header('DP4')
+# build a new header containing information not written by VarScan
+common_header = pysam.VariantHeader()
+# copy over sample info from the VarScan template
 for sample in varscan_header.samples:
 common_header.samples.add(sample)
+# add reference information
+common_header.add_meta('reference', value=self.ref_genome)
+# change the source information
+common_header.add_meta('source', value='varscan.py')
+# add contig information
+self._add_ref_contigs_to_header(common_header)
+# declare an INDEL flag for record INFO fields
+self._add_indel_info_flag_to_header(common_header)
+# merge in remaining information from the VarScan template
 common_header.merge(varscan_header)
+# add additional FILTER declarations
+if not no_filters:
+# add filter info
+self._add_filters_to_header(common_header)
+# generate standard FORMAT field declarations
+# including a correct AD declaration to prevent VarScan's
+# non-standard one from getting propagated
+self._standardize_format_fields(common_header)
 return common_header
 def _fix_record_gt_fields(self, record):
 """Migrate non-standard genotype fields to standard ones."""
 # FREQ is easy to calculate from AD
 del record.format['FREQ']
 del record.format['RD']
 del record.format['DP4']
-def pileup_masker(self, mask):
-def apply_mask_on_pileup(piled_items):
-for item, status in zip(piled_items, mask):
-if status:
-yield item
-return apply_mask_on_pileup
 def get_allele_specific_pileup_column_stats(
-self, allele, pile_column, ref_fetch
+self, allele, pile_column, ref_fetch, use_md=True
 ):
-# number of reads supporting the given allele on
-# forward and reverse strand, and in total
 var_reads_plus = var_reads_minus = 0
-var_supp_read_mask = []
+sum_mapping_qualities = 0
-for base in pile_column.get_query_sequences():
+sum_base_qualities = 0
-if base == allele:
-# allele supporting read on + strand
-var_reads_plus += 1
-var_supp_read_mask.append(True)
-elif base.upper() == allele:
-# allele supporting read on - strand
-var_reads_minus += 1
-var_supp_read_mask.append(True)
-else:
-var_supp_read_mask.append(False)
-var_reads_total = var_reads_plus + var_reads_minus
-if var_reads_total == 0:
-# No stats without reads!
-return None
-var_supp_only = self.pileup_masker(var_supp_read_mask)
-# average mapping quality of the reads supporting the
-# given allele
-avg_mapping_quality = sum(
-mq for mq in var_supp_only(
-pile_column.get_mapping_qualities()
-)
-) / var_reads_total
-# for the remaining stats we need access to complete
-# read information
-piled_reads = [
-p for p in var_supp_only(pile_column.pileups)
-]
-assert len(piled_reads) == var_reads_total
-sum_avg_base_qualities = 0
 sum_dist_from_center = 0
 sum_dist_from_3prime = 0
 sum_clipped_length = 0
 sum_unclipped_length = 0
 sum_num_mismatches_as_fraction = 0
 sum_mismatch_qualities = 0
-for p in piled_reads:
+for p in pile_column.pileups:
-sum_avg_base_qualities += sum(
+# skip reads that don't support the allele we're looking for
-p.alignment.query_qualities
+if p.query_position is None:
-) / p.alignment.infer_query_length()
+continue
+if p.alignment.query_sequence[p.query_position] != allele:
+continue
+if p.alignment.is_reverse:
+var_reads_minus += 1
+else:
+var_reads_plus += 1
+sum_mapping_qualities += p.alignment.mapping_quality
+sum_base_qualities += p.alignment.query_qualities[p.query_position]
 sum_clipped_length += p.alignment.query_alignment_length
-unclipped_length = p.alignment.infer_read_length()
+unclipped_length = p.alignment.query_length
 sum_unclipped_length += unclipped_length
-read_center = p.alignment.query_alignment_length / 2
+# The following calculations are all in 1-based coordinates
+# with respect to the physical 5'-end of the read sequence.
+if p.alignment.is_reverse:
+read_base_pos = unclipped_length - p.query_position
+read_first_aln_pos = (
+unclipped_length - p.alignment.query_alignment_end + 1
+)
+read_last_aln_pos = (
+unclipped_length - p.alignment.query_alignment_start
+)
+else:
+read_base_pos = p.query_position + 1
+read_first_aln_pos = p.alignment.query_alignment_start + 1
+read_last_aln_pos = p.alignment.query_alignment_end
+# Note: the original bam-readcount algorithm uses the less accurate
+# p.alignment.query_alignment_length / 2 as the read center
+read_center = (read_first_aln_pos + read_last_aln_pos) / 2
 sum_dist_from_center += 1 - abs(
-p.query_position - read_center
+read_base_pos - read_center
-) / read_center
+) / (read_center - 1)
-if p.alignment.is_reverse:
+# Note: the original bam-readcount algorithm uses a more
-sum_dist_from_3prime += p.query_position / unclipped_length
+# complicated definition of the 3'-end of a read sequence, which
+# clips off runs of bases with a quality scores of exactly 2.
+sum_dist_from_3prime += (
+read_last_aln_pos - read_base_pos
+) / (unclipped_length - 1)
+sum_num_mismatches = 0
+if use_md:
+try:
+# see if the read has an MD tag, in which case pysam can
+# calculate the reference sequence for us
+aligned_pairs = p.alignment.get_aligned_pairs(
+matches_only=True, with_seq=True
+)
+except ValueError:
+use_md = False
+if use_md:
+# The ref sequence got calculated from alignment and MD tag.
+for qpos, rpos, ref_base in aligned_pairs:
+# pysam uses lowercase ref bases to indicate mismatches
+if (
+ref_base == 'a' or ref_base == 't' or
+ref_base == 'g' or ref_base == 'c'
+):
+sum_num_mismatches += 1
+sum_mismatch_qualities += p.alignment.query_qualities[
+qpos
+]
 else:
-sum_dist_from_3prime += 1 - p.query_position / unclipped_length
+# We need to obtain the aligned portion of the reference
+# sequence.
-sum_num_mismatches = 0
+aligned_pairs = p.alignment.get_aligned_pairs(
-for qpos, rpos in p.alignment.get_aligned_pairs():
+matches_only=True
-if qpos is not None and rpos is not None:
+)
-if p.alignment.query_sequence[qpos] != ref_fetch(
+# note that ref bases can be lowercase
-rpos, rpos + 1
+ref_seq = ref_fetch(
-).upper():  # ref bases can be lowercase!
+aligned_pairs[0][1], aligned_pairs[-1][1] + 1
+).upper()
+ref_offset = aligned_pairs[0][1]
+for qpos, rpos in p.alignment.get_aligned_pairs(
+matches_only=True
+):
+# see if we have a mismatch to the reference at this
+# position, but note that
+# - the query base may be lowercase (SAM/BAM spec)
+# - an '=', if present in the query seq, indicates a match
+#   to the reference (SAM/BAM spec)
+# - there cannot be a mismatch with an N in the reference
+ref_base = ref_seq[rpos - ref_offset]
+read_base = p.alignment.query_sequence[qpos].upper()
+if (
+read_base != '=' and ref_base != 'N'
+) and (
+ref_base != read_base
+):
 sum_num_mismatches += 1
 sum_mismatch_qualities += p.alignment.query_qualities[
 qpos
 ]
 sum_num_mismatches_as_fraction += (
 sum_num_mismatches / p.alignment.query_alignment_length
 )
-avg_basequality = sum_avg_base_qualities / var_reads_total
+var_reads_total = var_reads_plus + var_reads_minus
+if var_reads_total == 0:
+# No stats without reads!
+return None
+avg_mapping_quality = sum_mapping_qualities / var_reads_total
+avg_basequality = sum_base_qualities / var_reads_total
 avg_pos_as_fraction = sum_dist_from_center / var_reads_total
 avg_num_mismatches_as_fraction = (
 sum_num_mismatches_as_fraction / var_reads_total
 )
 avg_sum_mismatch_qualities = sum_mismatch_qualities / var_reads_total
 pileup_args = self._get_pysam_pileup_args()
 for record in invcf:
 if any(len(allele) > 1 for allele in record.alleles):
 # skip indel postprocessing for the moment
 yield record
+continue
+if record.alleles[0] == 'N':
+# It makes no sense to call SNPs against an unknown
+# reference base
 continue
 # get pileup for genomic region affected by this variant
 if record.info['SS'] == '2':
 # a somatic variant => generate pileup from tumor data
 pile = tumor_reads.pileup(
 try:
 os.remove(f)
 except Exception:
 pass
-def noop_gen(data, **kwargs):
+def filter_minimal(data, **kwargs):
-for d in data:
+for record in data:
-yield d
+if record.alleles[0] != 'N' or len(record.alleles[1]) > 1:
+# Yield everything except SNPs called against an unknown
+# reference base
+yield record
 if no_filters:
-apply_filters = noop_gen
+apply_filters = filter_minimal
 else:
 apply_filters = self._postprocess_variant_records
 output_header = self._compile_common_header(
 temporary_snp_files[0],

Mercurial > repos > iuc > varscan_somatic

comparison varscan.py @ 8:b79bb8b09822 draft