comparison utilities/vcf_compare_OLD.py @ 0:6e75a84e9338 draft

planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
author thondeboer
date Tue, 15 May 2018 02:39:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:6e75a84e9338
1 #!/usr/bin/env python
2 # encoding: utf-8
3
4 """ **************************************************
5
6 vcf_compare.py
7
8 - compare vcf file produced by workflow to golden vcf produced by simulator
9
10 Written by: Zach Stephens
11 Date: January 20, 2015
12 Contact: zstephe2@illinois.edu
13
14 ************************************************** """
15
16 import sys
17 import os
18 import copy
19 import time
20 import bisect
21 import re
22 import numpy as np
23 import optparse
24
25
EV_BPRANGE = 50 # how far to either side of a particular variant location do we want to check for equivalents?

DEFAULT_QUAL = -666 # if we can't find a qual score, use this instead so we know it's missing

MAX_VAL = 9999999999999 # an unreasonably large value that no reference fasta could conceivably be longer than

DESC = """%prog: vcf comparison script."""
VERS = 0.1

# command-line interface (optparse, since this script targets Python 2)
PARSER = optparse.OptionParser('python %prog [options] -r <ref.fa> -g <golden.vcf> -w <workflow.vcf>',description=DESC,version="%prog v"+str(VERS))

# required arguments (marked with '*' in their help text)
PARSER.add_option('-r', help='* Reference Fasta', dest='REFF', action='store', metavar='<ref.fa>')
PARSER.add_option('-g', help='* Golden VCF', dest='GVCF', action='store', metavar='<golden.vcf>')
PARSER.add_option('-w', help='* Workflow VCF', dest='WVCF', action='store', metavar='<workflow.vcf>')
PARSER.add_option('-o', help='* Output Prefix', dest='OUTF', action='store', metavar='<prefix>')
# optional inputs: mappability track (-m/-M must describe min read length),
# targeted regions (-t/-T go together), and filter thresholds
PARSER.add_option('-m', help='Mappability Track', dest='MTRK', action='store', metavar='<track.bed>')
PARSER.add_option('-M', help='Maptrack Min Len', dest='MTMM', action='store', metavar='<int>')
PARSER.add_option('-t', help='Targetted Regions', dest='TREG', action='store', metavar='<regions.bed>')
PARSER.add_option('-T', help='Min Region Len', dest='MTRL', action='store', metavar='<int>')
PARSER.add_option('-c', help='Coverage Filter Threshold [%default]', dest='DP_THRESH', default=15, action='store', metavar='<int>')
PARSER.add_option('-a', help='Allele Freq Filter Threshold [%default]', dest='AF_THRESH', default=0.3, action='store', metavar='<float>')

# boolean flags
PARSER.add_option('--vcf-out', help="Output Match/FN/FP variants [%default]", dest='VCF_OUT', default=False, action='store_true')
PARSER.add_option('--no-plot', help="No plotting [%default]", dest='NO_PLOT', default=False, action='store_true')
PARSER.add_option('--incl-homs', help="Include homozygous ref calls [%default]", dest='INCL_H', default=False, action='store_true')
PARSER.add_option('--incl-fail', help="Include calls that failed filters [%default]", dest='INCL_F', default=False, action='store_true')
PARSER.add_option('--fast', help="No equivalent variant detection [%default]", dest='FAST', default=False, action='store_true')

# parse sys.argv at import time (this is a standalone script)
(OPTS,ARGS) = PARSER.parse_args()
55
56 REFERENCE = OPTS.REFF
57 GOLDEN_VCF = OPTS.GVCF
58 WORKFLOW_VCF = OPTS.WVCF
59 OUT_PREFIX = OPTS.OUTF
60 MAPTRACK = OPTS.MTRK
61 MIN_READLEN = OPTS.MTMM
62 BEDFILE = OPTS.TREG
63 DP_THRESH = int(OPTS.DP_THRESH)
64 AF_THRESH = float(OPTS.AF_THRESH)
65
66 VCF_OUT = OPTS.VCF_OUT
67 NO_PLOT = OPTS.NO_PLOT
68 INCLUDE_HOMS = OPTS.INCL_H
69 INCLUDE_FAIL = OPTS.INCL_F
70 FAST = OPTS.FAST
71
72 if len(sys.argv[1:]) == 0:
73 PARSER.print_help()
74 exit(1)
75
76 if OPTS.MTRL != None:
77 MINREGIONLEN = int(OPTS.MTRL)
78 else:
79 MINREGIONLEN = None
80
81 if MIN_READLEN == None:
82 MIN_READLEN = 0
83 else:
84 MIN_READLEN = int(MIN_READLEN)
85
86 if REFERENCE == None:
87 print 'Error: No reference provided.'
88 exit(1)
89 if GOLDEN_VCF == None:
90 print 'Error: No golden VCF provided.'
91 exit(1)
92 if WORKFLOW_VCF == None:
93 print 'Error: No workflow VCF provided.'
94 exit(1)
95 if OUT_PREFIX == None:
96 print 'Error: No output prefix provided.'
97 exit(1)
98 if (BEDFILE != None and MINREGIONLEN == None) or (BEDFILE == None and MINREGIONLEN != None):
99 print 'Error: Both -t and -T must be specified'
100 exit(1)
101
# plotting libraries are only needed (and imported) when plots are requested
if NO_PLOT == False:
    import matplotlib
    # 'Agg' backend: render to files without requiring a display
    matplotlib.use('Agg')
    import matplotlib.pyplot as mpl
    from matplotlib_venn import venn2, venn3
    import warnings
    # matplotlib_venn warns about degenerate/empty sets; not useful here
    warnings.filterwarnings("ignore", category=UserWarning, module='matplotlib_venn')
109
AF_STEPS = 20
AF_KEYS = np.linspace(0.0,1.0,AF_STEPS+1)

def quantize_AF(af):
    """Quantize an allele frequency into one of the AF_STEPS+1 histogram bins.

    Values at or above 1.0 land in the top bin (AF_STEPS), values at or
    below 0.0 land in bin 0, and anything in between truncates down to
    int(af * AF_STEPS).
    """
    # truncate to a bin index, then clamp into the valid [0, AF_STEPS] range
    return max(0, min(AF_STEPS, int(af * AF_STEPS)))
120
# minimal VCF header for any vcf files this script writes.
# bugfix: the '##reference=' meta-line was missing its trailing '\n', which
# fused it with the following '##INFO' line into one malformed header line.
VCF_HEADER = '##fileformat=VCFv4.1\n##reference='+REFERENCE+'\n##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">\n##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n'

DP_TOKENS = ['DP','DPU','DPI'] # coverage keys, in the order that we'll look for them
124
def parseLine(splt,colDict,colSamp):
    """Extract coverage, quality and allele-frequency info from one VCF data line.

    splt    -- the tab-split fields of the line
    colDict -- maps header column names (e.g. 'REF','ALT','INFO') to field indices
    colSamp -- field indices of sample columns (everything after FORMAT)

    Returns (cov, qual, alt_alleles, alt_freqs), or None when the line is
    screened out (hom-ref / empty ALT unless INCLUDE_HOMS, failed FILTER
    unless INCLUDE_FAIL).
    """

    # check if we want to proceed..
    ra = splt[colDict['REF']]
    aa = splt[colDict['ALT']]
    if not(INCLUDE_HOMS) and (aa == '.' or aa == '' or aa == ra):
        return None
    if not(INCLUDE_FAIL) and (splt[colDict['FILTER']] != 'PASS' and splt[colDict['FILTER']] != '.'):
        return None

    # default vals
    cov = None
    qual = DEFAULT_QUAL
    alt_alleles = []
    alt_freqs = [None]

    # any alt alleles?
    alt_split = aa.split(',')
    if len(alt_split) > 1:
        alt_alleles = alt_split

    # coverage: check INFO first, then FORMAT/SAMPLE
    for dp_tok in DP_TOKENS:
        if 'INFO' in colDict and dp_tok+'=' in splt[colDict['INFO']]:
            # bugfix: slice off 'TOKEN=' by the token's own length; the old
            # hard-coded [3:] crashed on the 3-character tokens DPU/DPI
            # (int('=123') raises ValueError)
            cov = int(re.findall(re.escape(dp_tok)+r"=[0-9]+",splt[colDict['INFO']])[0][len(dp_tok)+1:])
        elif 'FORMAT' in colDict and len(colSamp):
            # bugfix: membership test on the split FORMAT fields, so a token
            # in the first slot is found too (':DP:' never matched 'DP:GT:...')
            fmt_fields = splt[colDict['FORMAT']].split(':')
            if dp_tok in fmt_fields:
                dpInd = fmt_fields.index(dp_tok)
                cov = int(splt[colSamp[0]].split(':')[dpInd])
        if cov is not None:
            break

    # allele frequency: check INFO first, then FORMAT/SAMPLE
    af = None
    if 'INFO' in colDict and ';AF=' in ';'+splt[colDict['INFO']]:
        info = splt[colDict['INFO']]+';'
        af = re.findall(r"AF=.*?(?=;)",info)[0][3:]
    elif 'FORMAT' in colDict and len(colSamp):
        # same first-slot fix as for coverage above
        fmt_fields = splt[colDict['FORMAT']].split(':')
        if 'AF' in fmt_fields:
            afInd = fmt_fields.index('AF')
            af = splt[colSamp[0]].split(':')[afInd]

    if af is not None:
        af_splt = af.split(',')
        while(len(af_splt) < len(alt_alleles)): # are we lacking enough AF values for some reason?
            af_splt.append(af_splt[-1])         # phone it in.
        if len(af_splt) != 0 and af_splt[0] != '.' and af_splt[0] != '': # missing data, yay
            alt_freqs = [float(n) for n in af_splt]
    else:
        alt_freqs = [None]*max([len(alt_alleles),1])

    # get QUAL if it's interesting
    if 'QUAL' in colDict and splt[colDict['QUAL']] != '.':
        qual = float(splt[colDict['QUAL']])

    return (cov, qual, alt_alleles, alt_freqs)
186
187
def parseVCF(VCF_FILENAME,refName,targRegionsFl,outFile,outBool):
    """Read one VCF and collect all variants on chromosome 'refName'.

    VCF_FILENAME  -- path of the vcf to read
    refName       -- only data lines whose CHROM equals this are considered
    targRegionsFl -- flattened [start,end,start,end,...] list of targeted regions
    outFile       -- open handle for pass-through header lines (may be None)
    outBool       -- copy header lines to outFile only when True (first call)

    Returns an 11-tuple:
    (v_Hashed, v_Alts, v_Cov, v_AF, v_Qual, v_TargLen,
     nBelowMinRLen, line_unique, var_filtered, var_merged, hash_coll)
    """
    v_Hashed = {}   # every (pos,ref,alt) tuple accepted, mapped to 1
    v_posHash = {}  # positions seen so far (to merge same-position variants)
    v_Alts = {}     # (pos,ref,alt) -> list of sibling alt-variants at that site
    v_Cov = {}      # (pos,ref,alt) -> coverage (DP), when available
    v_AF = {}       # (pos,ref,alt) -> first allele frequency
    v_Qual = {}     # (pos,ref,alt) -> QUAL score
    v_TargLen = {}  # (pos,ref,alt) -> length of the targeted region containing it
    nBelowMinRLen = 0
    line_unique = 0 # number of lines in vcf file containing unique variant
    hash_coll = 0 # number of times we saw a hash collision ("per line" so non-unique alt alleles don't get counted multiple times)
    var_filtered = 0 # number of variants excluded due to filters (e.g. hom-refs, qual)
    var_merged = 0 # number of variants we merged into another due to having the same position specified
    colDict = {}    # header column name -> index
    colSamp = []    # indices of sample columns (those after FORMAT)
    for line in open(VCF_FILENAME,'r'):
        if line[0] != '#':
            # data line: the '#CHROM' header must already have been seen
            if len(colDict) == 0:
                print '\n\nError: VCF has no header?\n'+VCF_FILENAME+'\n\n'
                exit(1)
            splt = line[:-1].split('\t')
            if splt[0] == refName:

                var = (int(splt[1]),splt[3],splt[4])
                # odd insertion index means pos falls inside a [start,end) pair
                targInd = bisect.bisect(targRegionsFl,var[0])

                if targInd%2 == 1:
                    targLen = targRegionsFl[targInd]-targRegionsFl[targInd-1]
                    if (BEDFILE != None and targLen >= MINREGIONLEN) or BEDFILE == None:

                        pl_out = parseLine(splt,colDict,colSamp)
                        if pl_out == None:
                            var_filtered += 1
                            continue
                        (cov, qual, aa, af) = pl_out

                        if var not in v_Hashed:

                            vpos = var[0]
                            if vpos in v_posHash:
                                # another variant already sits at this position:
                                # merge this one's alts with the earlier ones
                                if len(aa) == 0:
                                    aa = [var[2]]
                                aa.extend([n[2] for n in v_Hashed.keys() if n[0] == vpos])
                                var_merged += 1
                            v_posHash[vpos] = 1

                            if len(aa):
                                # record each alt allele as its own variant tuple
                                allVars = [(var[0],var[1],n) for n in aa]
                                for i in xrange(len(allVars)):
                                    v_Hashed[allVars[i]] = 1
                                    #if allVars[i] not in v_Alts:
                                    #	v_Alts[allVars[i]] = []
                                    #v_Alts[allVars[i]].extend(allVars)
                                    v_Alts[allVars[i]] = allVars
                            else:
                                v_Hashed[var] = 1

                            if cov != None:
                                v_Cov[var] = cov
                            v_AF[var] = af[0] # only use first AF, even if multiple. fix this later?
                            v_Qual[var] = qual
                            v_TargLen[var] = targLen
                            line_unique += 1

                        else:
                            hash_coll += 1

                    else:
                        nBelowMinRLen += 1
        else:
            # header line: '##' meta-lines are skipped; the single '#CHROM'
            # line gives us the column layout
            if line[1] != '#':
                cols = line[1:-1].split('\t')
                for i in xrange(len(cols)):
                    # every column after FORMAT is treated as a sample column
                    if 'FORMAT' in colDict:
                        colSamp.append(i)
                    colDict[cols[i]] = i
            if VCF_OUT and outBool:
                outBool = False
                outFile.write(line)

    return (v_Hashed, v_Alts, v_Cov, v_AF, v_Qual, v_TargLen, nBelowMinRLen, line_unique, var_filtered, var_merged, hash_coll)
269
270
def condenseByPos(listIn):
    """Collapse variants that share a genomic position into one multi-alt variant.

    listIn -- list of (pos, ref, alt) tuples; not modified.

    Entries whose position is unique are kept in their original order.
    Each group of entries sharing a position is replaced by a single
    (pos, ref, 'altN,...,alt1') tuple whose alts are comma-joined in
    reverse order of appearance; the merged records are appended at the
    end, in ascending position order (the original delete-and-append
    version iterated a plain dict, so its tail order was hash-dependent).
    """
    # single pass: group every entry by position, keeping per-group order
    byPos = {}
    for v in listIn:
        byPos.setdefault(v[0], []).append(v)
    # keep unique-position entries, original order preserved
    condensed = [v for v in listIn if len(byPos[v[0]]) == 1]
    # append one merged record per shared position, in ascending position order
    for pos in sorted(byPos.keys()):
        group = byPos[pos]
        if len(group) > 1:
            condensed.append((group[0][0], group[0][1], ','.join([g[2] for g in group[::-1]])))
    return condensed
296
297
def main():
    """Compare a workflow VCF against a golden (simulated) VCF.

    Uses the module-level config globals. For every sequence in the
    reference fasta: parses both VCFs, matches variants, optionally detects
    equivalent-but-differently-denoted variants (unless --fast), categorizes
    false negatives (unmappable / low coverage / unknown), and finally prints
    summary statistics and, if requested, writes FN/FP vcfs and a venn PDF.
    """

    # index the reference fasta: record byte offsets for each sequence so we
    # can seek straight to its data later
    ref = []
    f = open(REFERENCE,'r')
    nLines = 0
    prevR = None    # name of the sequence currently being scanned
    prevP = None    # file offset where its sequence data starts
    ref_inds = []
    sys.stdout.write('\nindexing reference fasta... ')
    sys.stdout.flush()
    tt = time.time()
    while 1:
        nLines += 1
        data = f.readline()
        if not data:
            # EOF: close out the final sequence
            ref_inds.append( (prevR, prevP, f.tell()-len(data)) )
            break
        if data[0] == '>':
            if prevP != None:
                ref_inds.append( (prevR, prevP, f.tell()-len(data)) )
            prevP = f.tell()
            prevR = data[1:-1]
    print '{0:.3f} (sec)'.format(time.time()-tt)
    # each ref_inds entry is (sequenceName, dataStartOffset, dataEndOffset)

    ztV = 0 # total golden variants
    ztW = 0 # total workflow variants
    znP = 0 # total perfect matches
    zfP = 0 # total false positives
    znF = 0 # total false negatives
    znE = 0 # total equivalent variants detected
    zgF = 0 # total golden variants that were filtered and excluded
    zgR = 0 # total golden variants that were excluded for being redundant
    zgM = 0 # total golden variants that were merged into a single position
    zwF = 0 # total workflow variants that were filtered and excluded
    zwR = 0 # total workflow variants that were excluded for being redundant
    zwM = 0 # total workflow variants that were merged into a single position
    if BEDFILE != None:
        zbM = 0 # golden variants in targeted regions below the -T minimum length

    mappability_vs_FN = {0:0, 1:0} # [0] = # of FNs that were in mappable regions, [1] = # of FNs that were in unmappable regions
    coverage_vs_FN = {} # [C] = # of FNs that were covered by C reads
    alleleBal_vs_FN = {} # [AF] = # of FNs that were heterozygous genotypes with allele freq AF (quantized to multiples of 1/AF_STEPS)
    for n in AF_KEYS:
        alleleBal_vs_FN[n] = 0

    #
    # read in mappability track
    #
    mappability_tracks = {} # indexed by chr string (e.g. 'chr1'), has boolean array
    prevRef = ''
    relevantRegions = []
    if MAPTRACK != None:
        mtf = open(MAPTRACK,'r')
        for line in mtf:
            splt = line.strip().split('\t')
            if prevRef != '' and splt[0] != prevRef:
                # chromosome changed: flush accumulated regions into a 0/1 track
                if len(relevantRegions):
                    myTrack = [0]*(relevantRegions[-1][1]+100)
                    for r in relevantRegions:
                        for ri in xrange(r[0],r[1]):
                            myTrack[ri] = 1
                    mappability_tracks[prevRef] = [n for n in myTrack]
                #
                relevantRegions = []
            if int(splt[3]) >= MIN_READLEN:
                relevantRegions.append((int(splt[1]),int(splt[2])))
            prevRef = splt[0]
        mtf.close()
        # flush the final chromosome's regions
        if len(relevantRegions):
            myTrack = [0]*(relevantRegions[-1][1]+100)
            for r in relevantRegions:
                for ri in xrange(r[0],r[1]):
                    myTrack[ri] = 1
            mappability_tracks[prevRef] = [n for n in myTrack]
        #

    #
    # init vcf output, if desired
    #
    vcfo2 = None    # false-negative output vcf
    vcfo3 = None    # false-positive output vcf
    global vcfo2_firstTime
    global vcfo3_firstTime
    vcfo2_firstTime = False
    vcfo3_firstTime = False
    if VCF_OUT:
        vcfo2 = open(OUT_PREFIX+'_FN.vcf','w')
        vcfo3 = open(OUT_PREFIX+'_FP.vcf','w')
        vcfo2_firstTime = True
        vcfo3_firstTime = True

    #
    # data for plotting FN analysis
    #
    set1 = []   # FN indices attributed to unmappable regions
    set2 = []   # FN indices attributed to low coverage
    set3 = []   # FN indices with no identified reason
    varAdj = 0  # running offset keeping FN indices unique across sequences

    #
    #
    # For each sequence in reference fasta...
    #
    #
    for n_RI in ref_inds:

        refName = n_RI[0]
        # unless --fast, load this sequence's bases for equivalence checking
        if FAST == False:
            f.seek(n_RI[1])
            print 'reading '+refName+'...',
            myDat = f.read(n_RI[2]-n_RI[1]).split('\n')
            myLen = sum([len(m) for m in myDat])
            if sys.version_info >= (2,7):
                print '{:,} bp'.format(myLen)
            else:
                print '{0:} bp'.format(myLen)
            inWidth = len(myDat[0])
            if len(myDat[-1]) == 0: # if last line is empty, remove it.
                del myDat[-1]
            # sanity check: all fasta lines (except the last) share one width
            if inWidth*(len(myDat)-1)+len(myDat[-1]) != myLen:
                print 'fasta column-width not consistent.'
                print myLen, inWidth*(len(myDat)-1)+len(myDat[-1])
                for i in xrange(len(myDat)):
                    if len(myDat[i]) != inWidth:
                        print i, len(myDat[i]), inWidth
                exit(1)

            myDat = bytearray(''.join(myDat)).upper()
            myLen = len(myDat)

        #
        # Parse relevant targeted regions
        #
        targRegionsFl = []  # flattened [start,end,start,end,...]
        if BEDFILE != None:
            bedfile = open(BEDFILE,'r')
            for line in bedfile:
                splt = line.split('\t')
                if splt[0] == refName:
                    targRegionsFl.extend((int(splt[1]),int(splt[2])))
            bedfile.close()
        else:
            # no bed file: one giant region covering any position
            targRegionsFl = [-1,MAX_VAL+1]

        #
        # Parse vcf files
        #
        sys.stdout.write('comparing variation in '+refName+'... ')
        sys.stdout.flush()
        tt = time.time()

        (correctHashed, correctAlts, correctCov, correctAF, correctQual, correctTargLen, correctBelowMinRLen, correctUnique, gFiltered, gMerged, gRedundant) = parseVCF(GOLDEN_VCF, refName, targRegionsFl, vcfo2, vcfo2_firstTime)
        (workflowHashed, workflowAlts, workflowCov, workflowAF, workflowQual, workflowTarLen, workflowBelowMinRLen, workflowUnique, wFiltered, wMerged, wRedundant) = parseVCF(WORKFLOW_VCF, refName, targRegionsFl, vcfo3, vcfo3_firstTime)
        zgF += gFiltered
        zgR += gRedundant
        zgM += gMerged
        zwF += wFiltered
        zwR += wRedundant
        zwM += wMerged

        #
        # Deduce which variants are FP / FN
        #
        solvedInds = {} # positions where golden and workflow agree
        for var in correctHashed.keys():
            if var in workflowHashed or var[0] in solvedInds:
                correctHashed[var] = 2
                # NOTE(review): this inserts var into workflowHashed even when
                # only the position matched — appears intentional (position-level
                # matching), but worth confirming
                workflowHashed[var] = 2
                solvedInds[var[0]] = True
        for var in correctHashed.keys()+workflowHashed.keys():
            if var[0] in solvedInds:
                correctHashed[var] = 2
                workflowHashed[var] = 2
        nPerfect = len(solvedInds)

        # correctHashed[var] = 1: were not found
        #                    = 2: should be discluded because we were found
        #                    = 3: should be discluded because an alt was found
        notFound = [n for n in sorted(correctHashed.keys()) if correctHashed[n] == 1]
        FPvariants = [n for n in sorted(workflowHashed.keys()) if workflowHashed[n] == 1]

        #
        # condense all variants who have alternate alleles and were *not* found to have perfect matches
        # into a single variant again. These will not be included in the candidates for equivalency checking. Sorry!
        #
        notFound = condenseByPos(notFound)
        FPvariants = condenseByPos(FPvariants)

        #
        # tally up some values, if there are no golden variants lets save some CPU cycles and move to the next ref
        #
        totalGoldenVariants = nPerfect + len(notFound)
        totalWorkflowVariants = nPerfect + len(FPvariants)
        if totalGoldenVariants == 0:
            zfP += len(FPvariants)
            ztW += totalWorkflowVariants
            print '{0:.3f} (sec)'.format(time.time()-tt)
            continue

        #
        # let's check for equivalent variants
        #
        if FAST == False:
            delList_i = []  # FPvariants indices explained by an equivalent FN
            delList_j = []  # notFound indices explained by an equivalent FP
            regionsToCheck = []
            # examine a window around every false positive
            for i in xrange(len(FPvariants)):
                pos = FPvariants[i][0]
                regionsToCheck.append((max([pos-EV_BPRANGE-1,0]),min([pos+EV_BPRANGE,len(myDat)-1])))

            for n in regionsToCheck:
                refSection = myDat[n[0]:n[1]]

                # apply all FP variants inside the window to the reference slice
                fpWithin = []
                for i in xrange(len(FPvariants)):
                    m = FPvariants[i]
                    if (m[0] > n[0] and m[0] < n[1]):
                        fpWithin.append((m,i))
                fpWithin = sorted(fpWithin)
                adj = 0
                altSection = copy.deepcopy(refSection)
                for (m,i) in fpWithin:
                    lr = len(m[1])
                    la = len(m[2])
                    dpos = m[0]-n[0]+adj
                    altSection = altSection[:dpos-1] + m[2] + altSection[dpos-1+lr:]
                    adj += la-lr

                # apply all FN variants inside the window to another copy
                nfWithin = []
                for j in xrange(len(notFound)):
                    m = notFound[j]
                    if (m[0] > n[0] and m[0] < n[1]):
                        nfWithin.append((m,j))
                nfWithin = sorted(nfWithin)
                adj = 0
                altSection2 = copy.deepcopy(refSection)
                for (m,j) in nfWithin:
                    lr = len(m[1])
                    la = len(m[2])
                    dpos = m[0]-n[0]+adj
                    altSection2 = altSection2[:dpos-1] + m[2] + altSection2[dpos-1+lr:]
                    adj += la-lr

                # identical resulting sequences => equivalent representations
                if altSection == altSection2:
                    for (m,i) in fpWithin:
                        if i not in delList_i:
                            delList_i.append(i)
                    for (m,j) in nfWithin:
                        if j not in delList_j:
                            delList_j.append(j)

            # remove the equivalent pairs (reverse index order keeps deletes valid)
            nEquiv = 0
            for i in sorted(list(set(delList_i)),reverse=True):
                del FPvariants[i]
            for j in sorted(list(set(delList_j)),reverse=True):
                del notFound[j]
                nEquiv += 1
            nPerfect += nEquiv

        #
        # Tally up errors and whatnot
        #
        ztV += totalGoldenVariants
        ztW += totalWorkflowVariants
        znP += nPerfect
        zfP += len(FPvariants)
        znF += len(notFound)
        if FAST == False:
            znE += nEquiv
        if BEDFILE != None:
            zbM += correctBelowMinRLen

        #
        # try to identify a reason for FN variants:
        #
        venn_data = [[0,0,0] for n in notFound] # [i] = (unmappable, low cov, low het)
        for i in xrange(len(notFound)):
            var = notFound[i]

            noReason = True

            # mappability?
            if MAPTRACK != None:
                if refName in mappability_tracks and var[0] < len(mappability_tracks[refName]):
                    if mappability_tracks[refName][var[0]]:
                        mappability_vs_FN[1] += 1
                        venn_data[i][0] = 1
                        noReason = False
                    else:
                        mappability_vs_FN[0] += 1

            # coverage?
            if var in correctCov:
                c = correctCov[var]
                if c != None:
                    if c not in coverage_vs_FN:
                        coverage_vs_FN[c] = 0
                    coverage_vs_FN[c] += 1
                    if c < DP_THRESH:
                        venn_data[i][1] = 1
                        noReason = False

            # heterozygous genotype messing things up?
            #if var in correctAF:
            #	a = correctAF[var]
            #	if a != None:
            #		a = AF_KEYS[quantize_AF(a)]
            #		if a not in alleleBal_vs_FN:
            #			alleleBal_vs_FN[a] = 0
            #		alleleBal_vs_FN[a] += 1
            #		if a < AF_THRESH:
            #			venn_data[i][2] = 1

            # no reason?
            if noReason:
                venn_data[i][2] += 1

        # fold this sequence's FN categories into the global venn sets
        for i in xrange(len(notFound)):
            if venn_data[i][0]: set1.append(i+varAdj)
            if venn_data[i][1]: set2.append(i+varAdj)
            if venn_data[i][2]: set3.append(i+varAdj)
        varAdj += len(notFound)

        #
        # if desired, write out vcf files.
        #
        notFound = sorted(notFound)
        FPvariants = sorted(FPvariants)
        if VCF_OUT:
            # re-scan the inputs and copy the lines for FN / FP variants verbatim
            for line in open(GOLDEN_VCF,'r'):
                if line[0] != '#':
                    splt = line.split('\t')
                    if splt[0] == refName:
                        var = (int(splt[1]),splt[3],splt[4])
                        if var in notFound:
                            vcfo2.write(line)
            for line in open(WORKFLOW_VCF,'r'):
                if line[0] != '#':
                    splt = line.split('\t')
                    if splt[0] == refName:
                        var = (int(splt[1]),splt[3],splt[4])
                        if var in FPvariants:
                            vcfo3.write(line)

        print '{0:.3f} (sec)'.format(time.time()-tt)

    #
    # close vcf output
    #
    print ''
    if VCF_OUT:
        print OUT_PREFIX+'_FN.vcf'
        print OUT_PREFIX+'_FP.vcf'
        vcfo2.close()
        vcfo3.close()

    #
    # plot some FN stuff
    #
    if NO_PLOT == False:
        nDetected = len(set(set1+set2+set3))
        set1 = set(set1)
        set2 = set(set2)
        set3 = set(set3)

        if len(set1): s1 = 'Unmappable'
        else: s1 = ''
        if len(set2): s2 = 'DP < '+str(DP_THRESH)
        else: s2 = ''
        #if len(set3): s3 = 'AF < '+str(AF_THRESH)
        if len(set3): s3 = 'Unknown'
        else: s3 = ''

        mpl.figure(0)
        tstr1 = 'False Negative Variants (Missed Detections)'
        #tstr2 = str(nDetected)+' / '+str(znF)+' FN variants categorized'
        tstr2 = ''
        # 3-way venn only makes sense when mappability was analyzed
        if MAPTRACK != None:
            v = venn3([set1, set2, set3], (s1, s2, s3))
        else:
            v = venn2([set2, set3], (s2, s3))
        mpl.figtext(0.5,0.95,tstr1,fontdict={'size':14,'weight':'bold'},horizontalalignment='center')
        mpl.figtext(0.5,0.03,tstr2,fontdict={'size':14,'weight':'bold'},horizontalalignment='center')

        ouf = OUT_PREFIX+'_FNvenn.pdf'
        print ouf
        mpl.savefig(ouf)

    #
    # spit out results to console
    #
    print '\n**********************************\n'
    if BEDFILE != None:
        print 'ONLY CONSIDERING VARIANTS FOUND WITHIN TARGETED REGIONS\n\n'
    print 'Total Golden Variants: ',ztV,'\t[',zgF,'filtered,',zgM,'merged,',zgR,'redundant ]'
    print 'Total Workflow Variants:',ztW,'\t[',zwF,'filtered,',zwM,'merged,',zwR,'redundant ]'
    print ''
    if ztV > 0 and ztW > 0:
        print 'Perfect Matches:',znP,'({0:.2f}%)'.format(100.*float(znP)/ztV)
        print 'FN variants: ',znF,'({0:.2f}%)'.format(100.*float(znF)/ztV)
        print 'FP variants: ',zfP#,'({0:.2f}%)'.format(100.*float(zfP)/ztW)
    if FAST == False:
        print '\nNumber of equivalent variants denoted differently between the two vcfs:',znE
    if BEDFILE != None:
        print '\nNumber of golden variants located in targeted regions that were too small to be sampled from:',zbM
    if FAST:
        print "\nWarning! Running with '--fast' means that identical variants denoted differently between the two vcfs will not be detected! The values above may be lower than the true accuracy."
    #if NO_PLOT:
    if True:
        print '\n#unmappable: ',len(set1)
        print '#low_coverage:',len(set2)
        print '#unknown: ',len(set3)
    print '\n**********************************\n'
715
716
717
718
719
# script entry point
if __name__ == '__main__':
    main()