# HG changeset patch
# User iuc
# Date 1553811540 14400
# Node ID b79bb8b09822420dbaa349a22b23d2b00208f8a9
# Parent  2657ab48e16a576f1fcf5448bb9383ce9a586efa
planemo upload for repository https://github.com/galaxyproject/iuc/tree/master/tools/varscan commit bd1034af3217cd9fc654d7187bf674a89465755b

diff -r 2657ab48e16a -r b79bb8b09822 test-data/hg19_chrM.fa
--- a/test-data/hg19_chrM.fa	Mon Jan 21 12:05:00 2019 -0500
+++ b/test-data/hg19_chrM.fa	Thu Mar 28 18:19:00 2019 -0400
@@ -1,5 +1,5 @@
 >chrM
-GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT
+NNNCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT
 TTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTG
 GAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATT
 CTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACCTACTA
diff -r 2657ab48e16a -r b79bb8b09822 test-data/high_cov_chrM.bam
Binary file test-data/high_cov_chrM.bam has changed
diff -r 2657ab48e16a -r b79bb8b09822 test-data/test-cache/hg19_chrM.fa
--- a/test-data/test-cache/hg19_chrM.fa	Mon Jan 21 12:05:00 2019 -0500
+++ b/test-data/test-cache/hg19_chrM.fa	Thu Mar 28 18:19:00 2019 -0400
@@ -1,5 +1,5 @@
 >chrM
-GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT
+NNNCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT
 TTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTG
 GAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATT
 CTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACCTACTA
diff -r 2657ab48e16a -r b79bb8b09822 varscan.py
--- a/varscan.py	Mon Jan 21 12:05:00 2019 -0500
+++ b/varscan.py	Thu Mar 28 18:19:00 2019 -0400
@@ -402,7 +402,12 @@
         """Add standard FORMAT key declarations to a VCF header."""

         format_field_specs = [
-            ('GT', '1', 'String', 'Genotype'),
+            # pysam will not add quotes around single-word descriptions,
+            # which is a violation of the VCF specification.
+            # To work around this we are using two words as the
+            # GT format description (instead of the commonly used "Genotype").
+            # This could be changed once pysam learns to quote correctly.
+            ('GT', '1', 'String', 'Genotype code'),
             ('GQ', '1', 'Integer', 'Genotype quality'),
             ('DP', '1', 'Integer', 'Read depth'),
             ('AD', 'R', 'Integer', 'Read depth for each allele'),
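
A quick illustration of the pysam quoting quirk that the comment block above documents. This is a minimal sketch outside the patch, not patch code, and the exact output depends on the installed pysam version:

    import pysam

    header = pysam.VariantHeader()
    # With a single-word description, affected pysam versions emit
    # Description=Genotype (unquoted), which violates the VCF spec;
    # the two-word 'Genotype code' chosen above forces the quotes.
    header.add_meta(
        'FORMAT',
        items=[('ID', 'GT'), ('Number', '1'),
               ('Type', 'String'), ('Description', 'Genotype code')]
    )
    print(str(header), end='')
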
@@ -423,24 +428,7 @@
         header.merge(temp_header)

     def _compile_common_header(self, varcall_template, no_filters=False):
-        # fix the header generated by VarScan
-        # by adding reference and contig information
-        common_header = pysam.VariantHeader()
-        common_header.add_meta('reference', value=self.ref_genome)
-        self._add_ref_contigs_to_header(common_header)
-        if not no_filters:
-            # add filter info
-            self._add_filters_to_header(common_header)
-        # change the source information
-        common_header.add_meta('source', value='varscan.py')
-        # declare an INDEL flag for record INFO fields
-        self._add_indel_info_flag_to_header(common_header)
-        # generate standard FORMAT field declarations
-        # including a correct AD declaration to prevent VarScan's
-        # non-standard one from getting propagated
-        self._standardize_format_fields(common_header)
-        # take the remaining metadata from the template header produced by
-        # VarScan
+        # read the header produced by VarScan for use as a template
         with pysam.VariantFile(varcall_template, 'r') as original_data:
             varscan_header = original_data.header
         # don't propagate non-standard and redundant FORMAT keys written
@@ -449,9 +437,29 @@
         varscan_header.formats.remove_header('FREQ')
         varscan_header.formats.remove_header('RD')
         varscan_header.formats.remove_header('DP4')
+        # build a new header containing information not written by VarScan
+        common_header = pysam.VariantHeader()
+        # copy over sample info from the VarScan template
         for sample in varscan_header.samples:
             common_header.samples.add(sample)
+        # add reference information
+        common_header.add_meta('reference', value=self.ref_genome)
+        # change the source information
+        common_header.add_meta('source', value='varscan.py')
+        # add contig information
+        self._add_ref_contigs_to_header(common_header)
+        # declare an INDEL flag for record INFO fields
+        self._add_indel_info_flag_to_header(common_header)
+        # merge in remaining information from the VarScan template
         common_header.merge(varscan_header)
+        # add additional FILTER declarations
+        if not no_filters:
+            # add filter info
+            self._add_filters_to_header(common_header)
+        # generate standard FORMAT field declarations
+        # including a correct AD declaration to prevent VarScan's
+        # non-standard one from getting propagated
+        self._standardize_format_fields(common_header)
         return common_header

     def _fix_record_gt_fields(self, record):
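
The rewrite above inverts the build order of _compile_common_header: the VarScan template is merged into a fresh header first, and the FILTER plus standardized FORMAT declarations are added only afterwards, so the template's removed non-standard keys cannot resurface. A condensed standalone sketch of that flow (build_header and its arguments are illustrative names, not patch code; all pysam calls are the ones the patch itself uses):

    import pysam

    def build_header(varcall_template, ref_genome):
        # read the header produced by VarScan for use as a template
        with pysam.VariantFile(varcall_template, 'r') as original_data:
            varscan_header = original_data.header
        # drop VarScan's non-standard FORMAT declarations up front
        for key in ('AD', 'FREQ', 'RD', 'DP4'):
            varscan_header.formats.remove_header(key)
        common_header = pysam.VariantHeader()
        for sample in varscan_header.samples:
            common_header.samples.add(sample)
        common_header.add_meta('reference', value=ref_genome)
        common_header.add_meta('source', value='varscan.py')
        # merge first, then (re)declare the standard FORMAT keys so the
        # corrected definitions are the ones that reach the output
        common_header.merge(varscan_header)
        return common_header
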
@@ -498,54 +506,12 @@
         del record.format['RD']
         del record.format['DP4']

-    def pileup_masker(self, mask):
-        def apply_mask_on_pileup(piled_items):
-            for item, status in zip(piled_items, mask):
-                if status:
-                    yield item
-        return apply_mask_on_pileup
-
     def get_allele_specific_pileup_column_stats(
-        self, allele, pile_column, ref_fetch
+        self, allele, pile_column, ref_fetch, use_md=True
     ):
-        # number of reads supporting the given allele on
-        # forward and reverse strand, and in total
         var_reads_plus = var_reads_minus = 0
-        var_supp_read_mask = []
-        for base in pile_column.get_query_sequences():
-            if base == allele:
-                # allele supporting read on + strand
-                var_reads_plus += 1
-                var_supp_read_mask.append(True)
-            elif base.upper() == allele:
-                # allele supporting read on - strand
-                var_reads_minus += 1
-                var_supp_read_mask.append(True)
-            else:
-                var_supp_read_mask.append(False)
-        var_reads_total = var_reads_plus + var_reads_minus
-
-        if var_reads_total == 0:
-            # No stats without reads!
-            return None
-
-        var_supp_only = self.pileup_masker(var_supp_read_mask)
-
-        # average mapping quality of the reads supporting the
-        # given allele
-        avg_mapping_quality = sum(
-            mq for mq in var_supp_only(
-                pile_column.get_mapping_qualities()
-            )
-        ) / var_reads_total
-
-        # for the remaining stats we need access to complete
-        # read information
-        piled_reads = [
-            p for p in var_supp_only(pile_column.pileups)
-        ]
-        assert len(piled_reads) == var_reads_total
-        sum_avg_base_qualities = 0
+        sum_mapping_qualities = 0
+        sum_base_qualities = 0
         sum_dist_from_center = 0
         sum_dist_from_3prime = 0
         sum_clipped_length = 0
@@ -553,28 +519,99 @@
         sum_num_mismatches_as_fraction = 0
         sum_mismatch_qualities = 0

-        for p in piled_reads:
-            sum_avg_base_qualities += sum(
-                p.alignment.query_qualities
-            ) / p.alignment.infer_query_length()
+        for p in pile_column.pileups:
+            # skip reads that don't support the allele we're looking for
+            if p.query_position is None:
+                continue
+            if p.alignment.query_sequence[p.query_position] != allele:
+                continue
+
+            if p.alignment.is_reverse:
+                var_reads_minus += 1
+            else:
+                var_reads_plus += 1
+            sum_mapping_qualities += p.alignment.mapping_quality
+            sum_base_qualities += p.alignment.query_qualities[p.query_position]
             sum_clipped_length += p.alignment.query_alignment_length
-            unclipped_length = p.alignment.infer_read_length()
+            unclipped_length = p.alignment.query_length
             sum_unclipped_length += unclipped_length
-            read_center = p.alignment.query_alignment_length / 2
+            # The following calculations are all in 1-based coordinates
+            # with respect to the physical 5'-end of the read sequence.
+            if p.alignment.is_reverse:
+                read_base_pos = unclipped_length - p.query_position
+                read_first_aln_pos = (
+                    unclipped_length - p.alignment.query_alignment_end + 1
+                )
+                read_last_aln_pos = (
+                    unclipped_length - p.alignment.query_alignment_start
+                )
+            else:
+                read_base_pos = p.query_position + 1
+                read_first_aln_pos = p.alignment.query_alignment_start + 1
+                read_last_aln_pos = p.alignment.query_alignment_end
+            # Note: the original bam-readcount algorithm uses the less accurate
+            # p.alignment.query_alignment_length / 2 as the read center
+            read_center = (read_first_aln_pos + read_last_aln_pos) / 2
             sum_dist_from_center += 1 - abs(
-                p.query_position - read_center
-            ) / read_center
-            if p.alignment.is_reverse:
-                sum_dist_from_3prime += p.query_position / unclipped_length
-            else:
-                sum_dist_from_3prime += 1 - p.query_position / unclipped_length
+                read_base_pos - read_center
+            ) / (read_center - 1)
+            # Note: the original bam-readcount algorithm uses a more
+            # complicated definition of the 3'-end of a read sequence, which
+            # clips off runs of bases with quality scores of exactly 2.
+            sum_dist_from_3prime += (
+                read_last_aln_pos - read_base_pos
+            ) / (unclipped_length - 1)

             sum_num_mismatches = 0
-            for qpos, rpos in p.alignment.get_aligned_pairs():
-                if qpos is not None and rpos is not None:
-                    if p.alignment.query_sequence[qpos] != ref_fetch(
-                        rpos, rpos + 1
-                    ).upper():  # ref bases can be lowercase!
+            if use_md:
+                try:
+                    # see if the read has an MD tag, in which case pysam can
+                    # calculate the reference sequence for us
+                    aligned_pairs = p.alignment.get_aligned_pairs(
+                        matches_only=True, with_seq=True
+                    )
+                except ValueError:
+                    use_md = False
+            if use_md:
+                # The ref sequence was calculated from alignment and MD tag.
+                for qpos, rpos, ref_base in aligned_pairs:
+                    # pysam uses lowercase ref bases to indicate mismatches
+                    if (
+                        ref_base == 'a' or ref_base == 't' or
+                        ref_base == 'g' or ref_base == 'c'
+                    ):
+                        sum_num_mismatches += 1
+                        sum_mismatch_qualities += p.alignment.query_qualities[
+                            qpos
+                        ]
+            else:
+                # We need to obtain the aligned portion of the reference
+                # sequence.
+                aligned_pairs = p.alignment.get_aligned_pairs(
+                    matches_only=True
+                )
+                # note that ref bases can be lowercase
+                ref_seq = ref_fetch(
+                    aligned_pairs[0][1], aligned_pairs[-1][1] + 1
+                ).upper()
+                ref_offset = aligned_pairs[0][1]
+
+                for qpos, rpos in p.alignment.get_aligned_pairs(
+                    matches_only=True
+                ):
+                    # see if we have a mismatch to the reference at this
+                    # position, but note that
+                    # - the query base may be lowercase (SAM/BAM spec)
+                    # - an '=', if present in the query seq, indicates a match
+                    #   to the reference (SAM/BAM spec)
+                    # - there cannot be a mismatch with an N in the reference
+                    ref_base = ref_seq[rpos - ref_offset]
+                    read_base = p.alignment.query_sequence[qpos].upper()
+                    if (
+                        read_base != '=' and ref_base != 'N'
+                    ) and (
+                        ref_base != read_base
+                    ):
                         sum_num_mismatches += 1
                         sum_mismatch_qualities += p.alignment.query_qualities[
                             qpos
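
The use_md path above is the heart of the change: when a read carries an MD tag, get_aligned_pairs(matches_only=True, with_seq=True) lets pysam reconstruct the reference sequence, and mismatches can be read off directly from lowercase reference bases without fetching the FASTA. A standalone sketch of that shortcut (count_mismatches is an illustrative helper, not patch code):

    import pysam

    def count_mismatches(read):
        """Count mismatches of one aligned read against the reference."""
        try:
            # with_seq=True requires an MD tag and raises ValueError
            # for reads that lack one
            pairs = read.get_aligned_pairs(matches_only=True, with_seq=True)
        except ValueError:
            return None  # caller must fall back to fetching the reference
        # pysam reports mismatched positions as lowercase reference bases
        return sum(1 for _qpos, _rpos, ref in pairs if ref in 'acgt')
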
@@ -582,7 +619,13 @@
             sum_num_mismatches_as_fraction += (
                 sum_num_mismatches / p.alignment.query_alignment_length
             )
-        avg_basequality = sum_avg_base_qualities / var_reads_total
+
+        var_reads_total = var_reads_plus + var_reads_minus
+        if var_reads_total == 0:
+            # No stats without reads!
+            return None
+        avg_mapping_quality = sum_mapping_qualities / var_reads_total
+        avg_basequality = sum_base_qualities / var_reads_total
         avg_pos_as_fraction = sum_dist_from_center / var_reads_total
         avg_num_mismatches_as_fraction = (
             sum_num_mismatches_as_fraction / var_reads_total
@@ -643,6 +686,10 @@
                 # skip indel postprocessing for the moment
                 yield record
                 continue
+            if record.alleles[0] == 'N':
+                # It makes no sense to call SNPs against an unknown
+                # reference base
+                continue
             # get pileup for genomic region affected by this variant
             if record.info['SS'] == '2':
                 # a somatic variant => generate pileup from tumor data
@@ -825,12 +872,15 @@
             except Exception:
                 pass

-        def noop_gen(data, **kwargs):
-            for d in data:
-                yield d
+        def filter_minimal(data, **kwargs):
+            for record in data:
+                if record.alleles[0] != 'N' or len(record.alleles[1]) > 1:
+                    # Yield everything except SNPs called against an unknown
+                    # reference base
+                    yield record

         if no_filters:
-            apply_filters = noop_gen
+            apply_filters = filter_minimal
         else:
             apply_filters = self._postprocess_variant_records

diff -r 2657ab48e16a -r b79bb8b09822 varscan_somatic.xml
--- a/varscan_somatic.xml	Mon Jan 21 12:05:00 2019 -0500
+++ b/varscan_somatic.xml	Thu Mar 28 18:19:00 2019 -0400
@@ -1,4 +1,4 @@
[changed/added pair on the <tool> element, most likely a tool version bump; the XML markup of this hunk was lost in extraction]
    Call germline/somatic and LOH variants from tumor-normal sample pairs
    macros.xml
@@ -453,6 +453,33 @@
[27 added lines in the tool's test section, presumably a new test using the N-masked reference; the XML markup of this hunk was lost in extraction]
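
For context on the filter_minimal rule: record.alleles starts with REF, and VCF indel records carry an anchor base, so a record is dropped only when REF is exactly 'N' and the first ALT is a single base, i.e. an SNV called against an unknown reference. A worked sketch with made-up allele pairs (not patch code):

    # (REF, first ALT) pairs as they would appear in record.alleles
    examples = [
        ('N', 'A'),    # SNV against unknown reference base -> dropped
        ('N', 'NA'),   # insertion anchored on N            -> kept
        ('NA', 'N'),   # deletion; REF is not just 'N'      -> kept
        ('G', 'T'),    # ordinary SNV                       -> kept
    ]
    for ref, alt in examples:
        keep = ref != 'N' or len(alt) > 1
        print(ref, alt, 'kept' if keep else 'dropped')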