comparison varscan.py @ 6:ab2f908f0b08 draft

planemo upload for repository https://github.com/galaxyproject/iuc/tree/master/tools/varscan commit 4b7660df2384d8c8c6b5a6f02a00d13211e6ff6c
author iuc
date Fri, 18 Jan 2019 19:41:30 -0500
parents d062703d6f13
children 7c5e867820cf
comparison
equal deleted inserted replaced
5:d7e6c347538d 6:ab2f908f0b08
396 def _add_indel_info_flag_to_header(self, header): 396 def _add_indel_info_flag_to_header(self, header):
397 header.info.add( 397 header.info.add(
398 'INDEL', 0, 'Flag', 'Indicates that the variant is an INDEL' 398 'INDEL', 0, 'Flag', 'Indicates that the variant is an INDEL'
399 ) 399 )
400 400
401 def _standardize_format_fields(self, header):
402 """Add standard FORMAT key declarations to a VCF header."""
403
404 format_field_specs = [
405 ('GT', '1', 'String', 'Genotype'),
406 ('GQ', '1', 'Integer', 'Genotype quality'),
407 ('DP', '1', 'Integer', 'Read depth'),
408 ('AD', 'R', 'Integer', 'Read depth for each allele'),
409 ('ADF', 'R', 'Integer',
410 'Read depth for each allele on the forward strand'),
411 ('ADR', 'R', 'Integer',
412 'Read depth for each allele on the reverse strand')
413 ]
414 # Existing FORMAT declarations cannot be overwritten.
415 # The only viable strategy is to mark them as removed,
416 # then merge the header with another one containing the
417 # correct declarations. This is what is implemented below.
418 temp_header = pysam.VariantHeader()
419 for spec in format_field_specs:
420 temp_header.formats.add(*spec)
421 if spec[0] in header.formats:
422 header.formats.remove_header(spec[0])
423 header.merge(temp_header)
424
401 def _compile_common_header(self, varcall_template, no_filters=False): 425 def _compile_common_header(self, varcall_template, no_filters=False):
402 # fix the header generated by VarScan 426 # fix the header generated by VarScan
403 # by adding reference and contig information 427 # by adding reference and contig information
404 common_header = pysam.VariantHeader() 428 common_header = pysam.VariantHeader()
405 common_header.add_meta('reference', value=self.ref_genome) 429 common_header.add_meta('reference', value=self.ref_genome)
409 self._add_filters_to_header(common_header) 433 self._add_filters_to_header(common_header)
410 # change the source information 434 # change the source information
411 common_header.add_meta('source', value='varscan.py') 435 common_header.add_meta('source', value='varscan.py')
412 # declare an INDEL flag for record INFO fields 436 # declare an INDEL flag for record INFO fields
413 self._add_indel_info_flag_to_header(common_header) 437 self._add_indel_info_flag_to_header(common_header)
438 # generate standard FORMAT field declarations
439 # including a correct AD declaration to prevent VarScan's
440 # non-standard one from getting propagated
441 self._standardize_format_fields(common_header)
414 # take the remaining metadata from the template header produced by 442 # take the remaining metadata from the template header produced by
415 # VarScan 443 # VarScan
416 with pysam.VariantFile(varcall_template, 'r') as original_data: 444 with pysam.VariantFile(varcall_template, 'r') as original_data:
417 varscan_header = original_data.header 445 varscan_header = original_data.header
418 for sample in varscan_header.samples: 446 for sample in varscan_header.samples:
419 common_header.samples.add(sample) 447 common_header.samples.add(sample)
420 common_header.merge(varscan_header) 448 common_header.merge(varscan_header)
449 # don't propagate non-standard and redundant FORMAT keys written
450 # by VarScan
451 common_header.formats.remove_header('FREQ')
452 common_header.formats.remove_header('RD')
453 common_header.formats.remove_header('DP4')
421 return common_header 454 return common_header
455
456 def _fix_read_gt_fields(self, read):
457 """Migrate non-standard genotype fields to standard ones."""
458
459 for gt_field in read.samples.values():
460 # generate a standard AD field by combining VarScan's
461 # RD and non-standard AD fields
462 gt_field['AD'] = (gt_field['RD'], gt_field['AD'][0])
463 # split VarScan's DP4 field into the standard fields
464 # ADF and ADR
465 gt_field['ADF'] = (
466 int(gt_field['DP4'][0]), int(gt_field['DP4'][2])
467 )
468 gt_field['ADR'] = (
469 int(gt_field['DP4'][1]), int(gt_field['DP4'][3])
470 )
471 # remove redundant fields
472 # FREQ is easy to calculate from AD
473 del read.format['FREQ']
474 del read.format['RD']
475 del read.format['DP4']
422 476
423 def pileup_masker(self, mask): 477 def pileup_masker(self, mask):
424 def apply_mask_on_pileup(piled_items): 478 def apply_mask_on_pileup(piled_items):
425 for item, status in zip(piled_items, mask): 479 for item, status in zip(piled_items, mask):
426 if status: 480 if status:
771 # to avoid Warnings from htslib about missing contig 825 # to avoid Warnings from htslib about missing contig
772 # info 826 # info
773 self._add_ref_contigs_to_header(snp_invcf.header) 827 self._add_ref_contigs_to_header(snp_invcf.header)
774 self._add_filters_to_header(snp_invcf.header) 828 self._add_filters_to_header(snp_invcf.header)
775 self._add_indel_info_flag_to_header(snp_invcf.header) 829 self._add_indel_info_flag_to_header(snp_invcf.header)
830 self._standardize_format_fields(snp_invcf.header)
776 with pysam.VariantFile(indel_f, 'r') as indel_invcf: 831 with pysam.VariantFile(indel_f, 'r') as indel_invcf:
777 # fix the input header on the fly 832 # fix the input header on the fly
778 # to avoid Warnings from htslib about missing 833 # to avoid Warnings from htslib about missing
779 # contig info 834 # contig info
780 self._add_ref_contigs_to_header(indel_invcf.header) 835 self._add_ref_contigs_to_header(indel_invcf.header)
781 self._add_filters_to_header(indel_invcf.header) 836 self._add_filters_to_header(indel_invcf.header)
782 self._add_indel_info_flag_to_header( 837 self._add_indel_info_flag_to_header(
783 indel_invcf.header 838 indel_invcf.header
784 ) 839 )
840 self._standardize_format_fields(indel_invcf.header)
785 for record in apply_filters( 841 for record in apply_filters(
786 self._merge_generator( 842 self._merge_generator(
787 snp_invcf, 843 snp_invcf,
788 self._indel_flagged_records(indel_invcf) 844 self._indel_flagged_records(indel_invcf)
789 ), 845 ),
790 **filter_args 846 **filter_args
791 ): 847 ):
848 self._fix_read_gt_fields(record)
792 o.write(str(record)) 849 o.write(str(record))
793 try: 850 try:
794 os.remove(snp_f) 851 os.remove(snp_f)
795 except Exception: 852 except Exception:
796 pass 853 pass
808 # to avoid Warnings from htslib about missing 865 # to avoid Warnings from htslib about missing
809 # contig info and errors because of undeclared 866 # contig info and errors because of undeclared
810 # filters 867 # filters
811 self._add_ref_contigs_to_header(invcf.header) 868 self._add_ref_contigs_to_header(invcf.header)
812 self._add_filters_to_header(invcf.header) 869 self._add_filters_to_header(invcf.header)
870 self._standardize_format_fields(invcf.header)
813 for record in apply_filters( 871 for record in apply_filters(
814 invcf, **filter_args 872 invcf, **filter_args
815 ): 873 ):
874 self._fix_read_gt_fields(record)
816 o.write(str(record)) 875 o.write(str(record))
817 try: 876 try:
818 os.remove(f) 877 os.remove(f)
819 except Exception: 878 except Exception:
820 pass 879 pass
827 # contig info and errors because of undeclared 886 # contig info and errors because of undeclared
828 # filters 887 # filters
829 self._add_ref_contigs_to_header(invcf.header) 888 self._add_ref_contigs_to_header(invcf.header)
830 self._add_filters_to_header(invcf.header) 889 self._add_filters_to_header(invcf.header)
831 self._add_indel_info_flag_to_header(invcf.header) 890 self._add_indel_info_flag_to_header(invcf.header)
891 self._standardize_format_fields(invcf.header)
832 for record in apply_filters( 892 for record in apply_filters(
833 self._indel_flagged_records(invcf), **filter_args 893 self._indel_flagged_records(invcf), **filter_args
834 ): 894 ):
895 self._fix_read_gt_fields(record)
835 o.write(str(record)) 896 o.write(str(record))
836 try: 897 try:
837 os.remove(f) 898 os.remove(f)
838 except Exception: 899 except Exception:
839 pass 900 pass