comparison varscan.py @ 8:c24910ad5441 draft

planemo upload for repository https://github.com/galaxyproject/iuc/tree/master/tools/varscan commit bd1034af3217cd9fc654d7187bf674a89465755b
author iuc
date Thu, 28 Mar 2019 18:19:36 -0400
parents 192bd5b78300
children 0bc56f894766
comparison
equal deleted inserted replaced
7:192bd5b78300 8:c24910ad5441
400 400
401 def _standardize_format_fields(self, header): 401 def _standardize_format_fields(self, header):
402 """Add standard FORMAT key declarations to a VCF header.""" 402 """Add standard FORMAT key declarations to a VCF header."""
403 403
404 format_field_specs = [ 404 format_field_specs = [
405 ('GT', '1', 'String', 'Genotype'), 405 # pysam will not add quotes around single-word descriptions,
406 # which is a violation of the VCF specification.
407 # To work around this we are using two words as the
408 # GT format description (instead of the commonly used "Genotype").
409 # This could be changed once pysam learns to quote correctly.
410 ('GT', '1', 'String', 'Genotype code'),
406 ('GQ', '1', 'Integer', 'Genotype quality'), 411 ('GQ', '1', 'Integer', 'Genotype quality'),
407 ('DP', '1', 'Integer', 'Read depth'), 412 ('DP', '1', 'Integer', 'Read depth'),
408 ('AD', 'R', 'Integer', 'Read depth for each allele'), 413 ('AD', 'R', 'Integer', 'Read depth for each allele'),
409 ('ADF', 'R', 'Integer', 414 ('ADF', 'R', 'Integer',
410 'Read depth for each allele on the forward strand'), 415 'Read depth for each allele on the forward strand'),
421 if spec[0] in header.formats: 426 if spec[0] in header.formats:
422 header.formats.remove_header(spec[0]) 427 header.formats.remove_header(spec[0])
423 header.merge(temp_header) 428 header.merge(temp_header)
424 429
425 def _compile_common_header(self, varcall_template, no_filters=False): 430 def _compile_common_header(self, varcall_template, no_filters=False):
426 # fix the header generated by VarScan 431 # read the header produced by VarScan for use as a template
427 # by adding reference and contig information
428 common_header = pysam.VariantHeader()
429 common_header.add_meta('reference', value=self.ref_genome)
430 self._add_ref_contigs_to_header(common_header)
431 if not no_filters:
432 # add filter info
433 self._add_filters_to_header(common_header)
434 # change the source information
435 common_header.add_meta('source', value='varscan.py')
436 # declare an INDEL flag for record INFO fields
437 self._add_indel_info_flag_to_header(common_header)
438 # generate standard FORMAT field declarations
439 # including a correct AD declaration to prevent VarScan's
440 # non-standard one from getting propagated
441 self._standardize_format_fields(common_header)
442 # take the remaining metadata from the template header produced by
443 # VarScan
444 with pysam.VariantFile(varcall_template, 'r') as original_data: 432 with pysam.VariantFile(varcall_template, 'r') as original_data:
445 varscan_header = original_data.header 433 varscan_header = original_data.header
446 # don't propagate non-standard and redundant FORMAT keys written 434 # don't propagate non-standard and redundant FORMAT keys written
447 # by VarScan 435 # by VarScan
448 varscan_header.formats.remove_header('AD') 436 varscan_header.formats.remove_header('AD')
449 varscan_header.formats.remove_header('FREQ') 437 varscan_header.formats.remove_header('FREQ')
450 varscan_header.formats.remove_header('RD') 438 varscan_header.formats.remove_header('RD')
451 varscan_header.formats.remove_header('DP4') 439 varscan_header.formats.remove_header('DP4')
440 # build a new header containing information not written by VarScan
441 common_header = pysam.VariantHeader()
442 # copy over sample info from the VarScan template
452 for sample in varscan_header.samples: 443 for sample in varscan_header.samples:
453 common_header.samples.add(sample) 444 common_header.samples.add(sample)
445 # add reference information
446 common_header.add_meta('reference', value=self.ref_genome)
447 # change the source information
448 common_header.add_meta('source', value='varscan.py')
449 # add contig information
450 self._add_ref_contigs_to_header(common_header)
451 # declare an INDEL flag for record INFO fields
452 self._add_indel_info_flag_to_header(common_header)
453 # merge in remaining information from the VarScan template
454 common_header.merge(varscan_header) 454 common_header.merge(varscan_header)
455 # add additional FILTER declarations
456 if not no_filters:
457 # add filter info
458 self._add_filters_to_header(common_header)
459 # generate standard FORMAT field declarations
460 # including a correct AD declaration to prevent VarScan's
461 # non-standard one from getting propagated
462 self._standardize_format_fields(common_header)
455 return common_header 463 return common_header
456 464
457 def _fix_record_gt_fields(self, record): 465 def _fix_record_gt_fields(self, record):
458 """Migrate non-standard genotype fields to standard ones.""" 466 """Migrate non-standard genotype fields to standard ones."""
459 467
496 # FREQ is easy to calculate from AD 504 # FREQ is easy to calculate from AD
497 del record.format['FREQ'] 505 del record.format['FREQ']
498 del record.format['RD'] 506 del record.format['RD']
499 del record.format['DP4'] 507 del record.format['DP4']
500 508
501 def pileup_masker(self, mask):
502 def apply_mask_on_pileup(piled_items):
503 for item, status in zip(piled_items, mask):
504 if status:
505 yield item
506 return apply_mask_on_pileup
507
508 def get_allele_specific_pileup_column_stats( 509 def get_allele_specific_pileup_column_stats(
509 self, allele, pile_column, ref_fetch 510 self, allele, pile_column, ref_fetch, use_md=True
510 ): 511 ):
511 # number of reads supporting the given allele on
512 # forward and reverse strand, and in total
513 var_reads_plus = var_reads_minus = 0 512 var_reads_plus = var_reads_minus = 0
514 var_supp_read_mask = [] 513 sum_mapping_qualities = 0
515 for base in pile_column.get_query_sequences(): 514 sum_base_qualities = 0
516 if base == allele:
517 # allele supporting read on + strand
518 var_reads_plus += 1
519 var_supp_read_mask.append(True)
520 elif base.upper() == allele:
521 # allele supporting read on - strand
522 var_reads_minus += 1
523 var_supp_read_mask.append(True)
524 else:
525 var_supp_read_mask.append(False)
526 var_reads_total = var_reads_plus + var_reads_minus
527
528 if var_reads_total == 0:
529 # No stats without reads!
530 return None
531
532 var_supp_only = self.pileup_masker(var_supp_read_mask)
533
534 # average mapping quality of the reads supporting the
535 # given allele
536 avg_mapping_quality = sum(
537 mq for mq in var_supp_only(
538 pile_column.get_mapping_qualities()
539 )
540 ) / var_reads_total
541
542 # for the remaining stats we need access to complete
543 # read information
544 piled_reads = [
545 p for p in var_supp_only(pile_column.pileups)
546 ]
547 assert len(piled_reads) == var_reads_total
548 sum_avg_base_qualities = 0
549 sum_dist_from_center = 0 515 sum_dist_from_center = 0
550 sum_dist_from_3prime = 0 516 sum_dist_from_3prime = 0
551 sum_clipped_length = 0 517 sum_clipped_length = 0
552 sum_unclipped_length = 0 518 sum_unclipped_length = 0
553 sum_num_mismatches_as_fraction = 0 519 sum_num_mismatches_as_fraction = 0
554 sum_mismatch_qualities = 0 520 sum_mismatch_qualities = 0
555 521
556 for p in piled_reads: 522 for p in pile_column.pileups:
557 sum_avg_base_qualities += sum( 523 # skip reads that don't support the allele we're looking for
558 p.alignment.query_qualities 524 if p.query_position is None:
559 ) / p.alignment.infer_query_length() 525 continue
526 if p.alignment.query_sequence[p.query_position] != allele:
527 continue
528
529 if p.alignment.is_reverse:
530 var_reads_minus += 1
531 else:
532 var_reads_plus += 1
533 sum_mapping_qualities += p.alignment.mapping_quality
534 sum_base_qualities += p.alignment.query_qualities[p.query_position]
560 sum_clipped_length += p.alignment.query_alignment_length 535 sum_clipped_length += p.alignment.query_alignment_length
561 unclipped_length = p.alignment.infer_read_length() 536 unclipped_length = p.alignment.query_length
562 sum_unclipped_length += unclipped_length 537 sum_unclipped_length += unclipped_length
563 read_center = p.alignment.query_alignment_length / 2 538 # The following calculations are all in 1-based coordinates
539 # with respect to the physical 5'-end of the read sequence.
540 if p.alignment.is_reverse:
541 read_base_pos = unclipped_length - p.query_position
542 read_first_aln_pos = (
543 unclipped_length - p.alignment.query_alignment_end + 1
544 )
545 read_last_aln_pos = (
546 unclipped_length - p.alignment.query_alignment_start
547 )
548 else:
549 read_base_pos = p.query_position + 1
550 read_first_aln_pos = p.alignment.query_alignment_start + 1
551 read_last_aln_pos = p.alignment.query_alignment_end
552 # Note: the original bam-readcount algorithm uses the less accurate
553 # p.alignment.query_alignment_length / 2 as the read center
554 read_center = (read_first_aln_pos + read_last_aln_pos) / 2
564 sum_dist_from_center += 1 - abs( 555 sum_dist_from_center += 1 - abs(
565 p.query_position - read_center 556 read_base_pos - read_center
566 ) / read_center 557 ) / (read_center - 1)
567 if p.alignment.is_reverse: 558 # Note: the original bam-readcount algorithm uses a more
568 sum_dist_from_3prime += p.query_position / unclipped_length 559 # complicated definition of the 3'-end of a read sequence, which
560 # clips off runs of bases with a quality scores of exactly 2.
561 sum_dist_from_3prime += (
562 read_last_aln_pos - read_base_pos
563 ) / (unclipped_length - 1)
564
565 sum_num_mismatches = 0
566 if use_md:
567 try:
568 # see if the read has an MD tag, in which case pysam can
569 # calculate the reference sequence for us
570 aligned_pairs = p.alignment.get_aligned_pairs(
571 matches_only=True, with_seq=True
572 )
573 except ValueError:
574 use_md = False
575 if use_md:
576 # The ref sequence got calculated from alignment and MD tag.
577 for qpos, rpos, ref_base in aligned_pairs:
578 # pysam uses lowercase ref bases to indicate mismatches
579 if (
580 ref_base == 'a' or ref_base == 't' or
581 ref_base == 'g' or ref_base == 'c'
582 ):
583 sum_num_mismatches += 1
584 sum_mismatch_qualities += p.alignment.query_qualities[
585 qpos
586 ]
569 else: 587 else:
570 sum_dist_from_3prime += 1 - p.query_position / unclipped_length 588 # We need to obtain the aligned portion of the reference
571 589 # sequence.
572 sum_num_mismatches = 0 590 aligned_pairs = p.alignment.get_aligned_pairs(
573 for qpos, rpos in p.alignment.get_aligned_pairs(): 591 matches_only=True
574 if qpos is not None and rpos is not None: 592 )
575 if p.alignment.query_sequence[qpos] != ref_fetch( 593 # note that ref bases can be lowercase
576 rpos, rpos + 1 594 ref_seq = ref_fetch(
577 ).upper(): # ref bases can be lowercase! 595 aligned_pairs[0][1], aligned_pairs[-1][1] + 1
596 ).upper()
597 ref_offset = aligned_pairs[0][1]
598
599 for qpos, rpos in p.alignment.get_aligned_pairs(
600 matches_only=True
601 ):
602 # see if we have a mismatch to the reference at this
603 # position, but note that
604 # - the query base may be lowercase (SAM/BAM spec)
605 # - an '=', if present in the query seq, indicates a match
606 # to the reference (SAM/BAM spec)
607 # - there cannot be a mismatch with an N in the reference
608 ref_base = ref_seq[rpos - ref_offset]
609 read_base = p.alignment.query_sequence[qpos].upper()
610 if (
611 read_base != '=' and ref_base != 'N'
612 ) and (
613 ref_base != read_base
614 ):
578 sum_num_mismatches += 1 615 sum_num_mismatches += 1
579 sum_mismatch_qualities += p.alignment.query_qualities[ 616 sum_mismatch_qualities += p.alignment.query_qualities[
580 qpos 617 qpos
581 ] 618 ]
582 sum_num_mismatches_as_fraction += ( 619 sum_num_mismatches_as_fraction += (
583 sum_num_mismatches / p.alignment.query_alignment_length 620 sum_num_mismatches / p.alignment.query_alignment_length
584 ) 621 )
585 avg_basequality = sum_avg_base_qualities / var_reads_total 622
623 var_reads_total = var_reads_plus + var_reads_minus
624 if var_reads_total == 0:
625 # No stats without reads!
626 return None
627 avg_mapping_quality = sum_mapping_qualities / var_reads_total
628 avg_basequality = sum_base_qualities / var_reads_total
586 avg_pos_as_fraction = sum_dist_from_center / var_reads_total 629 avg_pos_as_fraction = sum_dist_from_center / var_reads_total
587 avg_num_mismatches_as_fraction = ( 630 avg_num_mismatches_as_fraction = (
588 sum_num_mismatches_as_fraction / var_reads_total 631 sum_num_mismatches_as_fraction / var_reads_total
589 ) 632 )
590 avg_sum_mismatch_qualities = sum_mismatch_qualities / var_reads_total 633 avg_sum_mismatch_qualities = sum_mismatch_qualities / var_reads_total
640 pileup_args = self._get_pysam_pileup_args() 683 pileup_args = self._get_pysam_pileup_args()
641 for record in invcf: 684 for record in invcf:
642 if any(len(allele) > 1 for allele in record.alleles): 685 if any(len(allele) > 1 for allele in record.alleles):
643 # skip indel postprocessing for the moment 686 # skip indel postprocessing for the moment
644 yield record 687 yield record
688 continue
689 if record.alleles[0] == 'N':
690 # It makes no sense to call SNPs against an unknown
691 # reference base
645 continue 692 continue
646 # get pileup for genomic region affected by this variant 693 # get pileup for genomic region affected by this variant
647 if record.info['SS'] == '2': 694 if record.info['SS'] == '2':
648 # a somatic variant => generate pileup from tumor data 695 # a somatic variant => generate pileup from tumor data
649 pile = tumor_reads.pileup( 696 pile = tumor_reads.pileup(
823 try: 870 try:
824 os.remove(f) 871 os.remove(f)
825 except Exception: 872 except Exception:
826 pass 873 pass
827 874
828 def noop_gen(data, **kwargs): 875 def filter_minimal(data, **kwargs):
829 for d in data: 876 for record in data:
830 yield d 877 if record.alleles[0] != 'N' or len(record.alleles[1]) > 1:
878 # Yield everything except SNPs called against an unknown
879 # reference base
880 yield record
831 881
832 if no_filters: 882 if no_filters:
833 apply_filters = noop_gen 883 apply_filters = filter_minimal
834 else: 884 else:
835 apply_filters = self._postprocess_variant_records 885 apply_filters = self._postprocess_variant_records
836 886
837 output_header = self._compile_common_header( 887 output_header = self._compile_common_header(
838 temporary_snp_files[0], 888 temporary_snp_files[0],