comparison py/vcfFunc.py @ 0:6e75a84e9338 draft

planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
author thondeboer
date Tue, 15 May 2018 02:39:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:6e75a84e9338
1 import sys
2 import time
3 import os
4 import re
5 import random
6
7 INCLUDE_HOMS = False
8 INCLUDE_FAIL = False
9 CHOOSE_RANDOM_PLOID_IF_NO_GT_FOUND = True
10
11 def parseLine(splt,colDict,colSamp):
12
13 # check if we want to proceed..
14 ra = splt[colDict['REF']]
15 aa = splt[colDict['ALT']]
16 # enough columns?
17 if len(splt) != len(colDict):
18 return None
19 # exclude homs / filtered?
20 if not(INCLUDE_HOMS) and (aa == '.' or aa == '' or aa == ra):
21 return None
22 if not(INCLUDE_FAIL) and (splt[colDict['FILTER']] != 'PASS' and splt[colDict['FILTER']] != '.'):
23 return None
24
25 # default vals
26 alt_alleles = [aa]
27 alt_freqs = []
28
29 gt_perSamp = []
30
31 # any alt alleles?
32 alt_split = aa.split(',')
33 if len(alt_split) > 1:
34 alt_alleles = alt_split
35
36 # check INFO for AF
37 af = None
38 if 'INFO' in colDict and ';AF=' in ';'+splt[colDict['INFO']]:
39 info = splt[colDict['INFO']]+';'
40 af = re.findall(r"AF=.*?(?=;)",info)[0][3:]
41 if af != None:
42 af_splt = af.split(',')
43 while(len(af_splt) < len(alt_alleles)): # are we lacking enough AF values for some reason?
44 af_splt.append(af_splt[-1]) # phone it in.
45 if len(af_splt) != 0 and af_splt[0] != '.' and af_splt[0] != '': # missing data, yay
46 alt_freqs = [float(n) for n in af_splt]
47 else:
48 alt_freqs = [None]*max([len(alt_alleles),1])
49
50 gt_perSamp = None
51 # if available (i.e. we simulated it) look for WP in info
52 if len(colSamp) == 0 and 'INFO' in colDict and 'WP=' in splt[colDict['INFO']]:
53 info = splt[colDict['INFO']]+';'
54 gt_perSamp = [re.findall(r"WP=.*?(?=;)",info)[0][3:]]
55 else:
56 # if no sample columns, check info for GT
57 if len(colSamp) == 0 and 'INFO' in colDict and 'GT=' in splt[colDict['INFO']]:
58 info = splt[colDict['INFO']]+';'
59 gt_perSamp = [re.findall(r"GT=.*?(?=;)",info)[0][3:]]
60 elif len(colSamp):
61 fmt = ':'+splt[colDict['FORMAT']]+':'
62 if ':GT:' in fmt:
63 gtInd = fmt.split(':').index('GT')
64 gt_perSamp = [splt[colSamp[iii]].split(':')[gtInd-1] for iii in xrange(len(colSamp))]
65 for i in xrange(len(gt_perSamp)):
66 gt_perSamp[i] = gt_perSamp[i].replace('.','0')
67 if gt_perSamp == None:
68 gt_perSamp = [None]*max([len(colSamp),1])
69
70 return (alt_alleles, alt_freqs, gt_perSamp)
71
72
73
74 def parseVCF(vcfPath,tumorNormal=False,ploidy=2):
75
76 tt = time.time()
77 print '--------------------------------'
78 sys.stdout.write('reading input VCF...\n')
79 sys.stdout.flush()
80
81 colDict = {}
82 colSamp = []
83 nSkipped = 0
84 nSkipped_becauseHash = 0
85 allVars = {} # [ref][pos]
86 sampNames = []
87 alreadyPrintedWarning = False
88 f = open(vcfPath,'r')
89 for line in f:
90
91 if line[0] != '#':
92 if len(colDict) == 0:
93 print '\n\nERROR: VCF has no header?\n'+VCF_FILENAME+'\n\n'
94 f.close()
95 exit(1)
96 splt = line[:-1].split('\t')
97 plOut = parseLine(splt,colDict,colSamp)
98 if plOut == None:
99 nSkipped += 1
100 else:
101 (aa, af, gt) = plOut
102
103 # make sure at least one allele somewhere contains the variant
104 if tumorNormal:
105 gtEval = gt[:2]
106 else:
107 gtEval = gt[:1]
108 if None in gtEval:
109 if CHOOSE_RANDOM_PLOID_IF_NO_GT_FOUND:
110 if not alreadyPrintedWarning:
111 print 'Warning: Found variants without a GT field, assuming heterozygous...'
112 alreadyPrintedWarning = True
113 for i in xrange(len(gtEval)):
114 tmp = ['0']*ploidy
115 tmp[random.randint(0,ploidy-1)] = '1'
116 gtEval[i] = '/'.join(tmp)
117 else:
118 # skip because no GT field was found
119 nSkipped += 1
120 continue
121 isNonReference = False
122 for gtVal in gtEval:
123 if gtVal != None:
124 if '1' in gtVal:
125 isNonReference = True
126 if not isNonReference:
127 # skip if no genotype actually contains this variant
128 nSkipped += 1
129 continue
130
131 chrom = splt[0]
132 pos = int(splt[1])
133 ref = splt[3]
134 # skip if position is <= 0
135 if pos <= 0:
136 nSkipped += 1
137 continue
138
139 # hash variants to avoid inserting duplicates (there are some messy VCFs out there...)
140 if chrom not in allVars:
141 allVars[chrom] = {}
142 if pos not in allVars[chrom]:
143 allVars[chrom][pos] = (pos,ref,aa,af,gtEval)
144 else:
145 nSkipped_becauseHash += 1
146
147 else:
148 if line[1] != '#':
149 cols = line[1:-1].split('\t')
150 for i in xrange(len(cols)):
151 if 'FORMAT' in colDict:
152 colSamp.append(i)
153 colDict[cols[i]] = i
154 if len(colSamp):
155 sampNames = cols[-len(colSamp):]
156 if len(colSamp) == 1:
157 pass
158 elif len(colSamp) == 2 and tumorNormal:
159 print 'Detected 2 sample columns in input VCF, assuming tumor/normal.'
160 else:
161 print 'Warning: Multiple sample columns present in input VCF. By default genReads uses only the first column.'
162 else:
163 sampNames = ['Unknown']
164 if tumorNormal:
165 #tumorInd = sampNames.index('TUMOR')
166 #normalInd = sampNames.index('NORMAL')
167 if 'NORMAL' not in sampNames or 'TUMOR' not in sampNames:
168 print '\n\nERROR: Input VCF must have a "NORMAL" and "TUMOR" column.\n'
169 f.close()
170
171 varsOut = {}
172 for r in allVars.keys():
173 varsOut[r] = [allVars[r][k] for k in sorted(allVars[r].keys())]
174
175 print 'found',sum([len(n) for n in allVars.values()]),'valid variants in input vcf.'
176 print ' *',nSkipped,'variants skipped: (qual filtered / ref genotypes / invalid syntax)'
177 print ' *',nSkipped_becauseHash,'variants skipped due to multiple variants found per position'
178 print '--------------------------------'
179 return (sampNames, varsOut)
180
181
182