varscan_copynumber: varscan.py comparison

comparison varscan.py @ 7:192bd5b78300 draft

planemo upload for repository https://github.com/galaxyproject/iuc/tree/master/tools/varscan commit f5e4d8903b6d69b0507f6a8fc4c2533fc7a15dfa

author	iuc
date	Mon, 21 Jan 2019 12:06:22 -0500
parents	0cb031c7f464
children	c24910ad5441

comparison

equal deleted inserted replaced

-:0cb031c7f464
+:192bd5b78300
 self._standardize_format_fields(common_header)
 # take the remaining metadata from the template header produced by
 # VarScan
 with pysam.VariantFile(varcall_template, 'r') as original_data:
 varscan_header = original_data.header
+# don't propagate non-standard and redundant FORMAT keys written
+# by VarScan
+varscan_header.formats.remove_header('AD')
+varscan_header.formats.remove_header('FREQ')
+varscan_header.formats.remove_header('RD')
+varscan_header.formats.remove_header('DP4')
 for sample in varscan_header.samples:
 common_header.samples.add(sample)
 common_header.merge(varscan_header)
-# don't propagate non-standard and redundant FORMAT keys written
-# by VarScan
-common_header.formats.remove_header('FREQ')
-common_header.formats.remove_header('RD')
-common_header.formats.remove_header('DP4')
 return common_header
-def _fix_read_gt_fields(self, read):
+def _fix_record_gt_fields(self, record):
 """Migrate non-standard genotype fields to standard ones."""
-for gt_field in read.samples.values():
+# The key issue to consider here is that we need to modify
+# genotype field values on a per-sample basis, but htslib
+# reserves memory for the values of all samples upon the first
+# modification of the field in the variant record.
+# => We need to calculate all values first, then fill each
+# genotype field starting with the sample with the largest value.
+new_gt_fields = {'AD': [], 'ADF': [], 'ADR': []}
+# store the current genotype field contents for each sample
+per_sample_gts = record.samples.values()
+# calculate and store the new genotype field values
+for gt_field in per_sample_gts:
 # generate a standard AD field by combining VarScan's
 # RD and non-standard AD fields
-gt_field['AD'] = (gt_field['RD'], gt_field['AD'][0])
+new_gt_fields['AD'].append((gt_field['RD'], gt_field['AD'][0]))
 # split VarScan's DP4 field into the standard fields
 # ADF and ADR
-gt_field['ADF'] = (
+new_gt_fields['ADF'].append(
-int(gt_field['DP4'][0]), int(gt_field['DP4'][2])
+(int(gt_field['DP4'][0]), int(gt_field['DP4'][2]))
 )
-gt_field['ADR'] = (
+new_gt_fields['ADR'].append(
-int(gt_field['DP4'][1]), int(gt_field['DP4'][3])
+(int(gt_field['DP4'][1]), int(gt_field['DP4'][3]))
 )
+# Modify the record's genotype fields.
+# For each field, modify the sample containing the largest
+# value for the field first.
+# Without this precaution we could trigger:
+# "bcf_update_format: Assertion `!fmt->p_free' failed."
+# in vcf.c of htslib resulting in a core dump.
+for field in sorted(new_gt_fields):
+for new_field, sample_fields in sorted(
+zip(new_gt_fields[field], per_sample_gts),
+key=lambda x: max(x[0]),
+reverse=True
+):
+sample_fields[field] = new_field
 # remove redundant fields
 # FREQ is easy to calculate from AD
-del read.format['FREQ']
+del record.format['FREQ']
-del read.format['RD']
+del record.format['RD']
-del read.format['DP4']
+del record.format['DP4']
 def pileup_masker(self, mask):
 def apply_mask_on_pileup(piled_items):
 for item, status in zip(piled_items, mask):
 if status:
 snp_invcf,
 self._indel_flagged_records(indel_invcf)
 ),
 **filter_args
 ):
-self._fix_read_gt_fields(record)
+self._fix_record_gt_fields(record)
 o.write(str(record))
 try:
 os.remove(snp_f)
 except Exception:
 pass
 self._add_filters_to_header(invcf.header)
 self._standardize_format_fields(invcf.header)
 for record in apply_filters(
 invcf, **filter_args
 ):
-self._fix_read_gt_fields(record)
+self._fix_record_gt_fields(record)
 o.write(str(record))
 try:
 os.remove(f)
 except Exception:
 pass
 self._add_indel_info_flag_to_header(invcf.header)
 self._standardize_format_fields(invcf.header)
 for record in apply_filters(
 self._indel_flagged_records(invcf), **filter_args
 ):
-self._fix_read_gt_fields(record)
+self._fix_record_gt_fields(record)
 o.write(str(record))
 try:
 os.remove(f)
 except Exception:
 pass

Mercurial > repos > iuc > varscan_copynumber

comparison varscan.py @ 7:192bd5b78300 draft