Mercurial > repos > iuc > varscan_somatic
diff varscan.py @ 6:2c66c4025db2 draft
planemo upload for repository https://github.com/galaxyproject/iuc/tree/master/tools/varscan commit 4b7660df2384d8c8c6b5a6f02a00d13211e6ff6c
author | iuc |
---|---|
date | Fri, 18 Jan 2019 19:41:10 -0500 |
parents | 2fe9ebb98aad |
children | 2657ab48e16a |
line wrap: on
line diff
--- a/varscan.py Wed Dec 05 11:16:52 2018 -0500 +++ b/varscan.py Fri Jan 18 19:41:10 2019 -0500 @@ -398,6 +398,30 @@ 'INDEL', 0, 'Flag', 'Indicates that the variant is an INDEL' ) + def _standardize_format_fields(self, header): + """Add standard FORMAT key declarations to a VCF header.""" + + format_field_specs = [ + ('GT', '1', 'String', 'Genotype'), + ('GQ', '1', 'Integer', 'Genotype quality'), + ('DP', '1', 'Integer', 'Read depth'), + ('AD', 'R', 'Integer', 'Read depth for each allele'), + ('ADF', 'R', 'Integer', + 'Read depth for each allele on the forward strand'), + ('ADR', 'R', 'Integer', + 'Read depth for each allele on the reverse strand') + ] + # Existing FORMAT declarations cannot be overwritten. + # The only viable strategy is to mark them as removed, + # then merge the header with another one containing the + # correct declarations. This is what is implemented below. + temp_header = pysam.VariantHeader() + for spec in format_field_specs: + temp_header.formats.add(*spec) + if spec[0] in header.formats: + header.formats.remove_header(spec[0]) + header.merge(temp_header) + def _compile_common_header(self, varcall_template, no_filters=False): # fix the header generated by VarScan # by adding reference and contig information @@ -411,6 +435,10 @@ common_header.add_meta('source', value='varscan.py') # declare an INDEL flag for record INFO fields self._add_indel_info_flag_to_header(common_header) + # generate standard FORMAT field declarations + # including a correct AD declaration to prevent VarScan's + # non-standard one from getting propagated + self._standardize_format_fields(common_header) # take the remaining metadata from the template header produced by # VarScan with pysam.VariantFile(varcall_template, 'r') as original_data: @@ -418,8 +446,34 @@ for sample in varscan_header.samples: common_header.samples.add(sample) common_header.merge(varscan_header) + # don't propagate non-standard and redundant FORMAT keys written + # by VarScan + common_header.formats.remove_header('FREQ') + common_header.formats.remove_header('RD') + common_header.formats.remove_header('DP4') return common_header + def _fix_read_gt_fields(self, read): + """Migrate non-standard genotype fields to standard ones.""" + + for gt_field in read.samples.values(): + # generate a standard AD field by combining VarScan's + # RD and non-standard AD fields + gt_field['AD'] = (gt_field['RD'], gt_field['AD'][0]) + # split VarScan's DP4 field into the standard fields + # ADF and ADR + gt_field['ADF'] = ( + int(gt_field['DP4'][0]), int(gt_field['DP4'][2]) + ) + gt_field['ADR'] = ( + int(gt_field['DP4'][1]), int(gt_field['DP4'][3]) + ) + # remove redundant fields + # FREQ is easy to calculate from AD + del read.format['FREQ'] + del read.format['RD'] + del read.format['DP4'] + def pileup_masker(self, mask): def apply_mask_on_pileup(piled_items): for item, status in zip(piled_items, mask): @@ -773,6 +827,7 @@ self._add_ref_contigs_to_header(snp_invcf.header) self._add_filters_to_header(snp_invcf.header) self._add_indel_info_flag_to_header(snp_invcf.header) + self._standardize_format_fields(snp_invcf.header) with pysam.VariantFile(indel_f, 'r') as indel_invcf: # fix the input header on the fly # to avoid Warnings from htslib about missing @@ -782,6 +837,7 @@ self._add_indel_info_flag_to_header( indel_invcf.header ) + self._standardize_format_fields(indel_invcf.header) for record in apply_filters( self._merge_generator( snp_invcf, @@ -789,6 +845,7 @@ ), **filter_args ): + self._fix_read_gt_fields(record) o.write(str(record)) try: os.remove(snp_f) @@ -810,9 +867,11 @@ # filters self._add_ref_contigs_to_header(invcf.header) self._add_filters_to_header(invcf.header) + self._standardize_format_fields(invcf.header) for record in apply_filters( invcf, **filter_args ): + self._fix_read_gt_fields(record) o.write(str(record)) try: os.remove(f) @@ -829,9 +888,11 @@ self._add_ref_contigs_to_header(invcf.header) self._add_filters_to_header(invcf.header) self._add_indel_info_flag_to_header(invcf.header) + self._standardize_format_fields(invcf.header) for record in apply_filters( self._indel_flagged_records(invcf), **filter_args ): + self._fix_read_gt_fields(record) o.write(str(record)) try: os.remove(f)