Mercurial > repos > iuc > varscan_somatic
comparison varscan.py @ 8:b79bb8b09822 draft
planemo upload for repository https://github.com/galaxyproject/iuc/tree/master/tools/varscan commit bd1034af3217cd9fc654d7187bf674a89465755b
author | iuc |
---|---|
date | Thu, 28 Mar 2019 18:19:00 -0400 |
parents | 2657ab48e16a |
children | 4e97191a1ff7 |
comparison
equal
deleted
inserted
replaced
7:2657ab48e16a | 8:b79bb8b09822 |
---|---|
400 | 400 |
401 def _standardize_format_fields(self, header): | 401 def _standardize_format_fields(self, header): |
402 """Add standard FORMAT key declarations to a VCF header.""" | 402 """Add standard FORMAT key declarations to a VCF header.""" |
403 | 403 |
404 format_field_specs = [ | 404 format_field_specs = [ |
405 ('GT', '1', 'String', 'Genotype'), | 405 # pysam will not add quotes around single-word descriptions, |
406 # which is a violation of the VCF specification. | |
407 # To work around this we are using two words as the | |
408 # GT format description (instead of the commonly used "Genotype"). | |
409 # This could be changed once pysam learns to quote correctly. | |
410 ('GT', '1', 'String', 'Genotype code'), | |
406 ('GQ', '1', 'Integer', 'Genotype quality'), | 411 ('GQ', '1', 'Integer', 'Genotype quality'), |
407 ('DP', '1', 'Integer', 'Read depth'), | 412 ('DP', '1', 'Integer', 'Read depth'), |
408 ('AD', 'R', 'Integer', 'Read depth for each allele'), | 413 ('AD', 'R', 'Integer', 'Read depth for each allele'), |
409 ('ADF', 'R', 'Integer', | 414 ('ADF', 'R', 'Integer', |
410 'Read depth for each allele on the forward strand'), | 415 'Read depth for each allele on the forward strand'), |
421 if spec[0] in header.formats: | 426 if spec[0] in header.formats: |
422 header.formats.remove_header(spec[0]) | 427 header.formats.remove_header(spec[0]) |
423 header.merge(temp_header) | 428 header.merge(temp_header) |
424 | 429 |
425 def _compile_common_header(self, varcall_template, no_filters=False): | 430 def _compile_common_header(self, varcall_template, no_filters=False): |
426 # fix the header generated by VarScan | 431 # read the header produced by VarScan for use as a template |
427 # by adding reference and contig information | |
428 common_header = pysam.VariantHeader() | |
429 common_header.add_meta('reference', value=self.ref_genome) | |
430 self._add_ref_contigs_to_header(common_header) | |
431 if not no_filters: | |
432 # add filter info | |
433 self._add_filters_to_header(common_header) | |
434 # change the source information | |
435 common_header.add_meta('source', value='varscan.py') | |
436 # declare an INDEL flag for record INFO fields | |
437 self._add_indel_info_flag_to_header(common_header) | |
438 # generate standard FORMAT field declarations | |
439 # including a correct AD declaration to prevent VarScan's | |
440 # non-standard one from getting propagated | |
441 self._standardize_format_fields(common_header) | |
442 # take the remaining metadata from the template header produced by | |
443 # VarScan | |
444 with pysam.VariantFile(varcall_template, 'r') as original_data: | 432 with pysam.VariantFile(varcall_template, 'r') as original_data: |
445 varscan_header = original_data.header | 433 varscan_header = original_data.header |
446 # don't propagate non-standard and redundant FORMAT keys written | 434 # don't propagate non-standard and redundant FORMAT keys written |
447 # by VarScan | 435 # by VarScan |
448 varscan_header.formats.remove_header('AD') | 436 varscan_header.formats.remove_header('AD') |
449 varscan_header.formats.remove_header('FREQ') | 437 varscan_header.formats.remove_header('FREQ') |
450 varscan_header.formats.remove_header('RD') | 438 varscan_header.formats.remove_header('RD') |
451 varscan_header.formats.remove_header('DP4') | 439 varscan_header.formats.remove_header('DP4') |
440 # build a new header containing information not written by VarScan | |
441 common_header = pysam.VariantHeader() | |
442 # copy over sample info from the VarScan template | |
452 for sample in varscan_header.samples: | 443 for sample in varscan_header.samples: |
453 common_header.samples.add(sample) | 444 common_header.samples.add(sample) |
445 # add reference information | |
446 common_header.add_meta('reference', value=self.ref_genome) | |
447 # change the source information | |
448 common_header.add_meta('source', value='varscan.py') | |
449 # add contig information | |
450 self._add_ref_contigs_to_header(common_header) | |
451 # declare an INDEL flag for record INFO fields | |
452 self._add_indel_info_flag_to_header(common_header) | |
453 # merge in remaining information from the VarScan template | |
454 common_header.merge(varscan_header) | 454 common_header.merge(varscan_header) |
455 # add additional FILTER declarations | |
456 if not no_filters: | |
457 # add filter info | |
458 self._add_filters_to_header(common_header) | |
459 # generate standard FORMAT field declarations | |
460 # including a correct AD declaration to prevent VarScan's | |
461 # non-standard one from getting propagated | |
462 self._standardize_format_fields(common_header) | |
455 return common_header | 463 return common_header |
456 | 464 |
457 def _fix_record_gt_fields(self, record): | 465 def _fix_record_gt_fields(self, record): |
458 """Migrate non-standard genotype fields to standard ones.""" | 466 """Migrate non-standard genotype fields to standard ones.""" |
459 | 467 |
496 # FREQ is easy to calculate from AD | 504 # FREQ is easy to calculate from AD |
497 del record.format['FREQ'] | 505 del record.format['FREQ'] |
498 del record.format['RD'] | 506 del record.format['RD'] |
499 del record.format['DP4'] | 507 del record.format['DP4'] |
500 | 508 |
501 def pileup_masker(self, mask): | |
502 def apply_mask_on_pileup(piled_items): | |
503 for item, status in zip(piled_items, mask): | |
504 if status: | |
505 yield item | |
506 return apply_mask_on_pileup | |
507 | |
508 def get_allele_specific_pileup_column_stats( | 509 def get_allele_specific_pileup_column_stats( |
509 self, allele, pile_column, ref_fetch | 510 self, allele, pile_column, ref_fetch, use_md=True |
510 ): | 511 ): |
511 # number of reads supporting the given allele on | |
512 # forward and reverse strand, and in total | |
513 var_reads_plus = var_reads_minus = 0 | 512 var_reads_plus = var_reads_minus = 0 |
514 var_supp_read_mask = [] | 513 sum_mapping_qualities = 0 |
515 for base in pile_column.get_query_sequences(): | 514 sum_base_qualities = 0 |
516 if base == allele: | |
517 # allele supporting read on + strand | |
518 var_reads_plus += 1 | |
519 var_supp_read_mask.append(True) | |
520 elif base.upper() == allele: | |
521 # allele supporting read on - strand | |
522 var_reads_minus += 1 | |
523 var_supp_read_mask.append(True) | |
524 else: | |
525 var_supp_read_mask.append(False) | |
526 var_reads_total = var_reads_plus + var_reads_minus | |
527 | |
528 if var_reads_total == 0: | |
529 # No stats without reads! | |
530 return None | |
531 | |
532 var_supp_only = self.pileup_masker(var_supp_read_mask) | |
533 | |
534 # average mapping quality of the reads supporting the | |
535 # given allele | |
536 avg_mapping_quality = sum( | |
537 mq for mq in var_supp_only( | |
538 pile_column.get_mapping_qualities() | |
539 ) | |
540 ) / var_reads_total | |
541 | |
542 # for the remaining stats we need access to complete | |
543 # read information | |
544 piled_reads = [ | |
545 p for p in var_supp_only(pile_column.pileups) | |
546 ] | |
547 assert len(piled_reads) == var_reads_total | |
548 sum_avg_base_qualities = 0 | |
549 sum_dist_from_center = 0 | 515 sum_dist_from_center = 0 |
550 sum_dist_from_3prime = 0 | 516 sum_dist_from_3prime = 0 |
551 sum_clipped_length = 0 | 517 sum_clipped_length = 0 |
552 sum_unclipped_length = 0 | 518 sum_unclipped_length = 0 |
553 sum_num_mismatches_as_fraction = 0 | 519 sum_num_mismatches_as_fraction = 0 |
554 sum_mismatch_qualities = 0 | 520 sum_mismatch_qualities = 0 |
555 | 521 |
556 for p in piled_reads: | 522 for p in pile_column.pileups: |
557 sum_avg_base_qualities += sum( | 523 # skip reads that don't support the allele we're looking for |
558 p.alignment.query_qualities | 524 if p.query_position is None: |
559 ) / p.alignment.infer_query_length() | 525 continue |
526 if p.alignment.query_sequence[p.query_position] != allele: | |
527 continue | |
528 | |
529 if p.alignment.is_reverse: | |
530 var_reads_minus += 1 | |
531 else: | |
532 var_reads_plus += 1 | |
533 sum_mapping_qualities += p.alignment.mapping_quality | |
534 sum_base_qualities += p.alignment.query_qualities[p.query_position] | |
560 sum_clipped_length += p.alignment.query_alignment_length | 535 sum_clipped_length += p.alignment.query_alignment_length |
561 unclipped_length = p.alignment.infer_read_length() | 536 unclipped_length = p.alignment.query_length |
562 sum_unclipped_length += unclipped_length | 537 sum_unclipped_length += unclipped_length |
563 read_center = p.alignment.query_alignment_length / 2 | 538 # The following calculations are all in 1-based coordinates |
539 # with respect to the physical 5'-end of the read sequence. | |
540 if p.alignment.is_reverse: | |
541 read_base_pos = unclipped_length - p.query_position | |
542 read_first_aln_pos = ( | |
543 unclipped_length - p.alignment.query_alignment_end + 1 | |
544 ) | |
545 read_last_aln_pos = ( | |
546 unclipped_length - p.alignment.query_alignment_start | |
547 ) | |
548 else: | |
549 read_base_pos = p.query_position + 1 | |
550 read_first_aln_pos = p.alignment.query_alignment_start + 1 | |
551 read_last_aln_pos = p.alignment.query_alignment_end | |
552 # Note: the original bam-readcount algorithm uses the less accurate | |
553 # p.alignment.query_alignment_length / 2 as the read center | |
554 read_center = (read_first_aln_pos + read_last_aln_pos) / 2 | |
564 sum_dist_from_center += 1 - abs( | 555 sum_dist_from_center += 1 - abs( |
565 p.query_position - read_center | 556 read_base_pos - read_center |
566 ) / read_center | 557 ) / (read_center - 1) |
567 if p.alignment.is_reverse: | 558 # Note: the original bam-readcount algorithm uses a more |
568 sum_dist_from_3prime += p.query_position / unclipped_length | 559 # complicated definition of the 3'-end of a read sequence, which |
560 # clips off runs of bases with a quality scores of exactly 2. | |
561 sum_dist_from_3prime += ( | |
562 read_last_aln_pos - read_base_pos | |
563 ) / (unclipped_length - 1) | |
564 | |
565 sum_num_mismatches = 0 | |
566 if use_md: | |
567 try: | |
568 # see if the read has an MD tag, in which case pysam can | |
569 # calculate the reference sequence for us | |
570 aligned_pairs = p.alignment.get_aligned_pairs( | |
571 matches_only=True, with_seq=True | |
572 ) | |
573 except ValueError: | |
574 use_md = False | |
575 if use_md: | |
576 # The ref sequence got calculated from alignment and MD tag. | |
577 for qpos, rpos, ref_base in aligned_pairs: | |
578 # pysam uses lowercase ref bases to indicate mismatches | |
579 if ( | |
580 ref_base == 'a' or ref_base == 't' or | |
581 ref_base == 'g' or ref_base == 'c' | |
582 ): | |
583 sum_num_mismatches += 1 | |
584 sum_mismatch_qualities += p.alignment.query_qualities[ | |
585 qpos | |
586 ] | |
569 else: | 587 else: |
570 sum_dist_from_3prime += 1 - p.query_position / unclipped_length | 588 # We need to obtain the aligned portion of the reference |
571 | 589 # sequence. |
572 sum_num_mismatches = 0 | 590 aligned_pairs = p.alignment.get_aligned_pairs( |
573 for qpos, rpos in p.alignment.get_aligned_pairs(): | 591 matches_only=True |
574 if qpos is not None and rpos is not None: | 592 ) |
575 if p.alignment.query_sequence[qpos] != ref_fetch( | 593 # note that ref bases can be lowercase |
576 rpos, rpos + 1 | 594 ref_seq = ref_fetch( |
577 ).upper(): # ref bases can be lowercase! | 595 aligned_pairs[0][1], aligned_pairs[-1][1] + 1 |
596 ).upper() | |
597 ref_offset = aligned_pairs[0][1] | |
598 | |
599 for qpos, rpos in p.alignment.get_aligned_pairs( | |
600 matches_only=True | |
601 ): | |
602 # see if we have a mismatch to the reference at this | |
603 # position, but note that | |
604 # - the query base may be lowercase (SAM/BAM spec) | |
605 # - an '=', if present in the query seq, indicates a match | |
606 # to the reference (SAM/BAM spec) | |
607 # - there cannot be a mismatch with an N in the reference | |
608 ref_base = ref_seq[rpos - ref_offset] | |
609 read_base = p.alignment.query_sequence[qpos].upper() | |
610 if ( | |
611 read_base != '=' and ref_base != 'N' | |
612 ) and ( | |
613 ref_base != read_base | |
614 ): | |
578 sum_num_mismatches += 1 | 615 sum_num_mismatches += 1 |
579 sum_mismatch_qualities += p.alignment.query_qualities[ | 616 sum_mismatch_qualities += p.alignment.query_qualities[ |
580 qpos | 617 qpos |
581 ] | 618 ] |
582 sum_num_mismatches_as_fraction += ( | 619 sum_num_mismatches_as_fraction += ( |
583 sum_num_mismatches / p.alignment.query_alignment_length | 620 sum_num_mismatches / p.alignment.query_alignment_length |
584 ) | 621 ) |
585 avg_basequality = sum_avg_base_qualities / var_reads_total | 622 |
623 var_reads_total = var_reads_plus + var_reads_minus | |
624 if var_reads_total == 0: | |
625 # No stats without reads! | |
626 return None | |
627 avg_mapping_quality = sum_mapping_qualities / var_reads_total | |
628 avg_basequality = sum_base_qualities / var_reads_total | |
586 avg_pos_as_fraction = sum_dist_from_center / var_reads_total | 629 avg_pos_as_fraction = sum_dist_from_center / var_reads_total |
587 avg_num_mismatches_as_fraction = ( | 630 avg_num_mismatches_as_fraction = ( |
588 sum_num_mismatches_as_fraction / var_reads_total | 631 sum_num_mismatches_as_fraction / var_reads_total |
589 ) | 632 ) |
590 avg_sum_mismatch_qualities = sum_mismatch_qualities / var_reads_total | 633 avg_sum_mismatch_qualities = sum_mismatch_qualities / var_reads_total |
640 pileup_args = self._get_pysam_pileup_args() | 683 pileup_args = self._get_pysam_pileup_args() |
641 for record in invcf: | 684 for record in invcf: |
642 if any(len(allele) > 1 for allele in record.alleles): | 685 if any(len(allele) > 1 for allele in record.alleles): |
643 # skip indel postprocessing for the moment | 686 # skip indel postprocessing for the moment |
644 yield record | 687 yield record |
688 continue | |
689 if record.alleles[0] == 'N': | |
690 # It makes no sense to call SNPs against an unknown | |
691 # reference base | |
645 continue | 692 continue |
646 # get pileup for genomic region affected by this variant | 693 # get pileup for genomic region affected by this variant |
647 if record.info['SS'] == '2': | 694 if record.info['SS'] == '2': |
648 # a somatic variant => generate pileup from tumor data | 695 # a somatic variant => generate pileup from tumor data |
649 pile = tumor_reads.pileup( | 696 pile = tumor_reads.pileup( |
823 try: | 870 try: |
824 os.remove(f) | 871 os.remove(f) |
825 except Exception: | 872 except Exception: |
826 pass | 873 pass |
827 | 874 |
828 def noop_gen(data, **kwargs): | 875 def filter_minimal(data, **kwargs): |
829 for d in data: | 876 for record in data: |
830 yield d | 877 if record.alleles[0] != 'N' or len(record.alleles[1]) > 1: |
878 # Yield everything except SNPs called against an unknown | |
879 # reference base | |
880 yield record | |
831 | 881 |
832 if no_filters: | 882 if no_filters: |
833 apply_filters = noop_gen | 883 apply_filters = filter_minimal |
834 else: | 884 else: |
835 apply_filters = self._postprocess_variant_records | 885 apply_filters = self._postprocess_variant_records |
836 | 886 |
837 output_header = self._compile_common_header( | 887 output_header = self._compile_common_header( |
838 temporary_snp_files[0], | 888 temporary_snp_files[0], |