Mercurial > repos > thondeboer > neat_genreads
comparison utilities/vcf_compare_OLD.py @ 0:6e75a84e9338 draft
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
| author | thondeboer |
|---|---|
| date | Tue, 15 May 2018 02:39:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:6e75a84e9338 |
|---|---|
#!/usr/bin/env python
# encoding: utf-8

""" **************************************************

vcf_compare.py

- compare vcf file produced by workflow to golden vcf produced by simulator

Written by: Zach Stephens
Date: January 20, 2015
Contact: zstephe2@illinois.edu

************************************************** """

import sys
import os
import copy
import time
import bisect
import re
import numpy as np
import optparse


EV_BPRANGE = 50 # how far to either side of a particular variant location do we want to check for equivalents?

DEFAULT_QUAL = -666 # if we can't find a qual score, use this instead so we know it's missing

MAX_VAL = 9999999999999 # an unreasonably large value that no reference fasta could conceivably be longer than

# command-line interface (optparse; this Python-2-era script predates argparse use)
DESC = """%prog: vcf comparison script."""
VERS = 0.1

PARSER = optparse.OptionParser('python %prog [options] -r <ref.fa> -g <golden.vcf> -w <workflow.vcf>',description=DESC,version="%prog v"+str(VERS))

# options marked '*' in their help text are required
PARSER.add_option('-r', help='* Reference Fasta', dest='REFF', action='store', metavar='<ref.fa>')
PARSER.add_option('-g', help='* Golden VCF', dest='GVCF', action='store', metavar='<golden.vcf>')
PARSER.add_option('-w', help='* Workflow VCF', dest='WVCF', action='store', metavar='<workflow.vcf>')
PARSER.add_option('-o', help='* Output Prefix', dest='OUTF', action='store', metavar='<prefix>')
PARSER.add_option('-m', help='Mappability Track', dest='MTRK', action='store', metavar='<track.bed>')
PARSER.add_option('-M', help='Maptrack Min Len', dest='MTMM', action='store', metavar='<int>')
PARSER.add_option('-t', help='Targetted Regions', dest='TREG', action='store', metavar='<regions.bed>')
PARSER.add_option('-T', help='Min Region Len', dest='MTRL', action='store', metavar='<int>')
PARSER.add_option('-c', help='Coverage Filter Threshold [%default]', dest='DP_THRESH', default=15, action='store', metavar='<int>')
PARSER.add_option('-a', help='Allele Freq Filter Threshold [%default]', dest='AF_THRESH', default=0.3, action='store', metavar='<float>')

PARSER.add_option('--vcf-out', help="Output Match/FN/FP variants [%default]", dest='VCF_OUT', default=False, action='store_true')
PARSER.add_option('--no-plot', help="No plotting [%default]", dest='NO_PLOT', default=False, action='store_true')
PARSER.add_option('--incl-homs', help="Include homozygous ref calls [%default]", dest='INCL_H', default=False, action='store_true')
PARSER.add_option('--incl-fail', help="Include calls that failed filters [%default]", dest='INCL_F', default=False, action='store_true')
PARSER.add_option('--fast', help="No equivalent variant detection [%default]", dest='FAST', default=False, action='store_true')

(OPTS,ARGS) = PARSER.parse_args()

# unpack options into module-level "constants" used throughout the script
REFERENCE = OPTS.REFF
GOLDEN_VCF = OPTS.GVCF
WORKFLOW_VCF = OPTS.WVCF
OUT_PREFIX = OPTS.OUTF
MAPTRACK = OPTS.MTRK
MIN_READLEN = OPTS.MTMM
BEDFILE = OPTS.TREG
DP_THRESH = int(OPTS.DP_THRESH)
AF_THRESH = float(OPTS.AF_THRESH)

VCF_OUT = OPTS.VCF_OUT
NO_PLOT = OPTS.NO_PLOT
INCLUDE_HOMS = OPTS.INCL_H
INCLUDE_FAIL = OPTS.INCL_F
FAST = OPTS.FAST

# no arguments at all: show usage and bail
if len(sys.argv[1:]) == 0:
    PARSER.print_help()
    exit(1)

if OPTS.MTRL != None:
    MINREGIONLEN = int(OPTS.MTRL)
else:
    MINREGIONLEN = None

# -M defaults to 0 (no minimum) when not given
if MIN_READLEN == None:
    MIN_READLEN = 0
else:
    MIN_READLEN = int(MIN_READLEN)

# validate required arguments
if REFERENCE == None:
    print 'Error: No reference provided.'
    exit(1)
if GOLDEN_VCF == None:
    print 'Error: No golden VCF provided.'
    exit(1)
if WORKFLOW_VCF == None:
    print 'Error: No workflow VCF provided.'
    exit(1)
if OUT_PREFIX == None:
    print 'Error: No output prefix provided.'
    exit(1)
# -t and -T only make sense together
if (BEDFILE != None and MINREGIONLEN == None) or (BEDFILE == None and MINREGIONLEN != None):
    print 'Error: Both -t and -T must be specified'
    exit(1)

# import plotting machinery only when plots were requested;
# 'Agg' backend allows running headless (no display)
if NO_PLOT == False:
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as mpl
    from matplotlib_venn import venn2, venn3
    import warnings
    warnings.filterwarnings("ignore", category=UserWarning, module='matplotlib_venn')

# allele-frequency histogram bins: 0.0 .. 1.0 in steps of 1/AF_STEPS
AF_STEPS = 20
AF_KEYS = np.linspace(0.0,1.0,AF_STEPS+1)
def quantize_AF(af):
    """Map an allele frequency onto an integer bin index in [0, AF_STEPS].

    Values at or below 0.0 land in bin 0, values at or above 1.0 land in
    bin AF_STEPS, everything else is truncated into an intermediate bin.
    """
    if af <= 0.0:
        return 0
    if af >= 1.0:
        return AF_STEPS
    return int(af * AF_STEPS)
| 120 | |
# Minimal VCF header for the optional FN/FP output files. Every '##' meta
# line must be newline-terminated; previously the '##reference=' line was
# fused onto '##INFO=<ID=DP,...>' because the separating '\n' was missing,
# which produced an invalid meta line.
VCF_HEADER = '##fileformat=VCFv4.1\n##reference='+REFERENCE+'\n##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">\n##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n'

DP_TOKENS = ['DP','DPU','DPI'] # in the order that we'll look for them
def parseLine(splt,colDict,colSamp):
    """Parse one VCF data line into (cov, qual, alt_alleles, alt_freqs).

    splt    -- tab-split fields of the line
    colDict -- maps header column names ('REF','ALT','FILTER','INFO',...) to indices
    colSamp -- indices of sample columns (the first one is used with FORMAT)

    Returns None when the line should be skipped (hom-ref unless INCLUDE_HOMS,
    failed filter unless INCLUDE_FAIL). cov is None if no depth annotation was
    found; qual is DEFAULT_QUAL when QUAL is missing; alt_freqs entries are
    None when no AF annotation was found.
    """

    # check if we want to proceed..
    ra = splt[colDict['REF']]
    aa = splt[colDict['ALT']]
    if not(INCLUDE_HOMS) and (aa == '.' or aa == '' or aa == ra):
        return None
    if not(INCLUDE_FAIL) and (splt[colDict['FILTER']] != 'PASS' and splt[colDict['FILTER']] != '.'):
        return None

    # default vals
    cov = None
    qual = DEFAULT_QUAL
    alt_alleles = []
    alt_freqs = [None]

    # any alt alleles?
    alt_split = aa.split(',')
    if len(alt_split) > 1:
        alt_alleles = alt_split

    # cov: try each depth token in priority order; INFO first, FORMAT/SAMPLE second
    for dp_tok in DP_TOKENS:
        if 'INFO' in colDict:
            # anchor on start-of-field so e.g. 'ADP=' can't be mistaken for 'DP='
            hits = re.findall(r"(?:^|;)" + re.escape(dp_tok) + r"=[0-9]+", splt[colDict['INFO']])
            if len(hits):
                # bugfix: strip 'TOKEN=' via split instead of the hard-coded [3:],
                # which mangled values for the 3-character tokens 'DPU' / 'DPI'
                cov = int(hits[0].split('=')[1])
        if cov == None and 'FORMAT' in colDict and len(colSamp):
            # bugfix: pad FORMAT with ':' on BOTH sides so a token in the first
            # FORMAT position (e.g. 'DP:GT') is found as well
            fmt = ':' + splt[colDict['FORMAT']] + ':'
            if ':' + dp_tok + ':' in fmt:
                dpInd = splt[colDict['FORMAT']].split(':').index(dp_tok)
                cov = int(splt[colSamp[0]].split(':')[dpInd])
        if cov != None:
            break

    # check INFO for AF first
    af = None
    if 'INFO' in colDict and ';AF=' in ';'+splt[colDict['INFO']]:
        info = splt[colDict['INFO']]+';'
        # bugfix: anchor on start-of-field so e.g. 'RAF=' can't match as 'AF='
        af = re.findall(r"(?:^|;)AF=.*?(?=;)",info)[0].split('=',1)[1]
    # check FORMAT/SAMPLE for AF second:
    elif 'FORMAT' in colDict and len(colSamp):
        fmt = ':' + splt[colDict['FORMAT']] + ':'
        if ':AF:' in fmt:
            afInd = splt[colDict['FORMAT']].split(':').index('AF')
            af = splt[colSamp[0]].split(':')[afInd]

    if af != None:
        af_splt = af.split(',')
        while(len(af_splt) < len(alt_alleles)): # are we lacking enough AF values for some reason?
            af_splt.append(af_splt[-1])         # phone it in.
        if len(af_splt) != 0 and af_splt[0] != '.' and af_splt[0] != '': # missing data, yay
            alt_freqs = [float(n) for n in af_splt]
    else:
        alt_freqs = [None]*max([len(alt_alleles),1])

    # get QUAL if it's interesting
    if 'QUAL' in colDict and splt[colDict['QUAL']] != '.':
        qual = float(splt[colDict['QUAL']])

    return (cov, qual, alt_alleles, alt_freqs)
| 186 | |
| 187 | |
def parseVCF(VCF_FILENAME,refName,targRegionsFl,outFile,outBool):
    """Read all variants for one reference sequence from a VCF file.

    VCF_FILENAME  -- path of the vcf to parse
    refName       -- only data lines whose CHROM field equals this are used
    targRegionsFl -- flattened [start,end,start,end,...] list of targeted
                     regions; bisect gives an odd index iff pos is inside one
    outFile       -- open handle for optional VCF_OUT echoing (may be None)
    outBool       -- True if the output header still needs to be written

    Returns an 11-tuple:
    (v_Hashed, v_Alts, v_Cov, v_AF, v_Qual, v_TargLen,
     nBelowMinRLen, line_unique, var_filtered, var_merged, hash_coll)
    where each v_* dict is keyed by the variant tuple (pos, ref, alt).
    """
    v_Hashed = {}       # set of all variant tuples seen (value 1; flipped to 2 later by caller)
    v_posHash = {}      # set of positions seen, for merging same-position records
    v_Alts = {}         # variant tuple -> list of sibling alt-variant tuples at that site
    v_Cov = {}          # variant tuple -> read depth (only if one was found)
    v_AF = {}           # variant tuple -> first allele frequency (or None)
    v_Qual = {}         # variant tuple -> QUAL (or DEFAULT_QUAL)
    v_TargLen = {}      # variant tuple -> length of the targeted region containing it
    nBelowMinRLen = 0
    line_unique = 0     # number of lines in vcf file containing unique variant
    hash_coll = 0       # number of times we saw a hash collision ("per line" so non-unique alt alleles don't get counted multiple times)
    var_filtered = 0    # number of variants excluded due to filters (e.g. hom-refs, qual)
    var_merged = 0      # number of variants we merged into another due to having the same position specified
    colDict = {}        # header column name -> index
    colSamp = []        # indices of per-sample columns (those after FORMAT)
    for line in open(VCF_FILENAME,'r'):
        if line[0] != '#':
            # data line; a header must have been seen first to know the columns
            if len(colDict) == 0:
                print '\n\nError: VCF has no header?\n'+VCF_FILENAME+'\n\n'
                exit(1)
            splt = line[:-1].split('\t')
            if splt[0] == refName:

                var = (int(splt[1]),splt[3],splt[4])    # (POS, REF, ALT)
                targInd = bisect.bisect(targRegionsFl,var[0])

                # odd index means POS falls inside a [start,end) targeted region
                if targInd%2 == 1:
                    targLen = targRegionsFl[targInd]-targRegionsFl[targInd-1]
                    if (BEDFILE != None and targLen >= MINREGIONLEN) or BEDFILE == None:

                        pl_out = parseLine(splt,colDict,colSamp)
                        if pl_out == None:
                            var_filtered += 1
                            continue
                        (cov, qual, aa, af) = pl_out

                        if var not in v_Hashed:

                            # merge with any previously-seen variants at this position
                            vpos = var[0]
                            if vpos in v_posHash:
                                if len(aa) == 0:
                                    aa = [var[2]]
                                aa.extend([n[2] for n in v_Hashed.keys() if n[0] == vpos])
                                var_merged += 1
                            v_posHash[vpos] = 1

                            # multi-allelic (or merged): hash each alt separately,
                            # remembering the full sibling list in v_Alts
                            if len(aa):
                                allVars = [(var[0],var[1],n) for n in aa]
                                for i in xrange(len(allVars)):
                                    v_Hashed[allVars[i]] = 1
                                    #if allVars[i] not in v_Alts:
                                    #	v_Alts[allVars[i]] = []
                                    #v_Alts[allVars[i]].extend(allVars)
                                    v_Alts[allVars[i]] = allVars
                            else:
                                v_Hashed[var] = 1

                            if cov != None:
                                v_Cov[var] = cov
                            v_AF[var] = af[0]   # only use first AF, even if multiple. fix this later?
                            v_Qual[var] = qual
                            v_TargLen[var] = targLen
                            line_unique += 1

                        else:
                            hash_coll += 1

                else:
                    nBelowMinRLen += 1
        else:
            # header line: '#CHROM...' (single '#') defines the columns
            if line[1] != '#':
                cols = line[1:-1].split('\t')
                for i in xrange(len(cols)):
                    # every column after FORMAT is a sample column
                    if 'FORMAT' in colDict:
                        colSamp.append(i)
                    colDict[cols[i]] = i
            # NOTE(review): outBool goes False after the first header line, so
            # only that single line is echoed to outFile — verify this is intended
            if VCF_OUT and outBool:
                outBool = False
                outFile.write(line)

    return (v_Hashed, v_Alts, v_Cov, v_AF, v_Qual, v_TargLen, nBelowMinRLen, line_unique, var_filtered, var_merged, hash_coll)
| 269 | |
| 270 | |
def condenseByPos(listIn):
    """Collapse variants that share a position into a single record.

    Variants are (pos, ref, alt) tuples. Positions occurring once pass
    through untouched; positions occurring multiple times are removed and
    re-appended as one tuple whose alt is the comma-joined alts (in reverse
    encounter order), keeping the first variant's pos and ref.
    """
    survivors = [entry for entry in listIn]
    # count how many variants sit at each position
    hits = {}
    for entry in survivors:
        pos = entry[0]
        if pos not in hits:
            hits[pos] = 0
        hits[pos] += 1
    # one bucket per position that occurs more than once
    buckets = {}
    for pos in sorted(hits.keys()):
        if hits[pos] > 1:
            buckets[pos] = []
    # move duplicated-position variants out of the survivor list
    doomed = []
    for idx in range(len(survivors)):
        entry = survivors[idx]
        if entry[0] in buckets:
            buckets[entry[0]].append(entry)
            doomed.append(idx)
    for idx in sorted(doomed, reverse=True):
        del survivors[idx]
    # re-append each bucket as a single comma-joined record
    for grouped in buckets.values():
        merged = (grouped[0][0], grouped[0][1], ','.join([g[2] for g in grouped[::-1]]))
        survivors.append(merged)
    return survivors
| 296 | |
| 297 | |
def main():
    """Index the reference fasta, then for each reference sequence compare
    the workflow VCF against the golden VCF: tally perfect matches, false
    positives/negatives, and (unless --fast) equivalent variants denoted
    differently; optionally write FN/FP vcfs and a venn diagram, then print
    a summary to the console.
    """

    #
    # index reference fasta: build ref_inds as (name, data_start, data_end)
    # byte offsets so each sequence can be seeked to and read on demand
    #
    ref = []        # NOTE(review): never used — candidate for removal
    f = open(REFERENCE,'r')
    nLines = 0      # NOTE(review): counted but never read afterwards
    prevR = None    # name of the sequence currently being scanned
    prevP = None    # offset where that sequence's data begins
    ref_inds = []
    sys.stdout.write('\nindexing reference fasta... ')
    sys.stdout.flush()
    tt = time.time()
    while 1:
        nLines += 1
        data = f.readline()
        if not data:
            # EOF: close out the final sequence
            ref_inds.append( (prevR, prevP, f.tell()-len(data)) )
            break
        if data[0] == '>':
            if prevP != None:
                ref_inds.append( (prevR, prevP, f.tell()-len(data)) )
            prevP = f.tell()
            prevR = data[1:-1]
    print '{0:.3f} (sec)'.format(time.time()-tt)
    #ref_inds = [('chrM', 6, 16909), ('chr1', 16915, 254252549), ('chr2', 254252555, 502315916), ('chr3', 502315922, 704298801), ('chr4', 704298807, 899276169), ('chr5', 899276175, 1083809741), ('chr6', 1083809747, 1258347116), ('chr7', 1258347122, 1420668559), ('chr8', 1420668565, 1569959868), ('chr9', 1569959874, 1713997574), ('chr10', 1713997581, 1852243023), ('chr11', 1852243030, 1989949677), ('chr12', 1989949684, 2126478617), ('chr13', 2126478624, 2243951900), ('chr14', 2243951907, 2353448438), ('chr15', 2353448445, 2458030465), ('chr16', 2458030472, 2550192321), ('chr17', 2550192328, 2633011443), ('chr18', 2633011450, 2712650243), ('chr19', 2712650250, 2772961813), ('chr20', 2772961820, 2837247851), ('chr21', 2837247858, 2886340351), ('chr22', 2886340358, 2938671016), ('chrX', 2938671022, 3097046994), ('chrY', 3097047000, 3157608038)]

    # cumulative tallies across all reference sequences
    ztV = 0 # total golden variants
    ztW = 0 # total workflow variants
    znP = 0 # total perfect matches
    zfP = 0 # total false positives
    znF = 0 # total false negatives
    znE = 0 # total equivalent variants detected
    zgF = 0 # total golden variants that were filtered and excluded
    zgR = 0 # total golden variants that were excluded for being redundant
    zgM = 0 # total golden variants that were merged into a single position
    zwF = 0 # total workflow variants that were filtered and excluded
    zwR = 0 # total workflow variants that were excluded for being redundant
    zwM = 0 # total workflow variants that were merged into a single position
    if BEDFILE != None:
        zbM = 0 # golden variants skipped: targeted region too short to sample

    mappability_vs_FN = {0:0, 1:0}  # [0] = # of FNs that were in mappable regions, [1] = # of FNs that were in unmappable regions
    coverage_vs_FN = {}             # [C] = # of FNs that were covered by C reads
    alleleBal_vs_FN = {}            # [AF] = # of FNs that were heterozygous genotypes with allele freq AF (quantized to multiples of 1/AF_STEPS)
    for n in AF_KEYS:
        alleleBal_vs_FN[n] = 0

    #
    # read in mappability track
    #
    mappability_tracks = {} # indexed by chr string (e.g. 'chr1'), has boolean array
    prevRef = ''
    relevantRegions = []
    if MAPTRACK != None:
        mtf = open(MAPTRACK,'r')
        for line in mtf:
            splt = line.strip().split('\t')
            # chromosome changed: flush the accumulated regions into a 0/1 track
            if prevRef != '' and splt[0] != prevRef:
                # do stuff
                if len(relevantRegions):
                    myTrack = [0]*(relevantRegions[-1][1]+100)
                    for r in relevantRegions:
                        for ri in xrange(r[0],r[1]):
                            myTrack[ri] = 1
                    mappability_tracks[prevRef] = [n for n in myTrack]
                #
                relevantRegions = []
            # only keep regions at least MIN_READLEN long (bed column 4)
            if int(splt[3]) >= MIN_READLEN:
                relevantRegions.append((int(splt[1]),int(splt[2])))
            prevRef = splt[0]
        mtf.close()
        # do stuff (flush the final chromosome)
        if len(relevantRegions):
            myTrack = [0]*(relevantRegions[-1][1]+100)
            for r in relevantRegions:
                for ri in xrange(r[0],r[1]):
                    myTrack[ri] = 1
            mappability_tracks[prevRef] = [n for n in myTrack]
        #

    #
    # init vcf output, if desired
    #
    vcfo2 = None    # FN output vcf
    vcfo3 = None    # FP output vcf
    global vcfo2_firstTime
    global vcfo3_firstTime
    # NOTE(review): these globals are passed to parseVCF but never cleared
    # there (parseVCF only flips its local copy), so the header line is
    # re-echoed for every reference sequence — verify this is intended
    vcfo2_firstTime = False
    vcfo3_firstTime = False
    if VCF_OUT:
        vcfo2 = open(OUT_PREFIX+'_FN.vcf','w')
        vcfo3 = open(OUT_PREFIX+'_FP.vcf','w')
        vcfo2_firstTime = True
        vcfo3_firstTime = True

    #
    # data for plotting FN analysis (global indices across all refs)
    #
    set1 = []   # FN indices deemed unmappable
    set2 = []   # FN indices with low coverage
    set3 = []   # FN indices with no identified reason
    varAdj = 0  # offset so indices stay unique across reference sequences

    #
    #
    # For each sequence in reference fasta...
    #
    #
    for n_RI in ref_inds:

        refName = n_RI[0]
        # equivalent-variant detection needs the actual reference sequence
        if FAST == False:
            f.seek(n_RI[1])
            print 'reading '+refName+'...',
            myDat = f.read(n_RI[2]-n_RI[1]).split('\n')
            myLen = sum([len(m) for m in myDat])
            if sys.version_info >= (2,7):
                print '{:,} bp'.format(myLen)
            else:
                print '{0:} bp'.format(myLen)
            inWidth = len(myDat[0])
            if len(myDat[-1]) == 0: # if last line is empty, remove it.
                del myDat[-1]
            # sanity-check that all fasta lines share the same width
            if inWidth*(len(myDat)-1)+len(myDat[-1]) != myLen:
                print 'fasta column-width not consistent.'
                print myLen, inWidth*(len(myDat)-1)+len(myDat[-1])
                for i in xrange(len(myDat)):
                    if len(myDat[i]) != inWidth:
                        print i, len(myDat[i]), inWidth
                exit(1)

            myDat = bytearray(''.join(myDat)).upper()
            myLen = len(myDat)

        #
        # Parse relevant targeted regions
        #
        targRegionsFl = []  # flattened [start,end,start,end,...] for bisect
        if BEDFILE != None:
            bedfile = open(BEDFILE,'r')
            for line in bedfile:
                splt = line.split('\t')
                if splt[0] == refName:
                    targRegionsFl.extend((int(splt[1]),int(splt[2])))
            bedfile.close()
        else:
            # no bed file: one giant region covering everything
            targRegionsFl = [-1,MAX_VAL+1]

        #
        # Parse vcf files
        #
        sys.stdout.write('comparing variation in '+refName+'... ')
        sys.stdout.flush()
        tt = time.time()

        (correctHashed, correctAlts, correctCov, correctAF, correctQual, correctTargLen, correctBelowMinRLen, correctUnique, gFiltered, gMerged, gRedundant) = parseVCF(GOLDEN_VCF, refName, targRegionsFl, vcfo2, vcfo2_firstTime)
        (workflowHashed, workflowAlts, workflowCov, workflowAF, workflowQual, workflowTarLen, workflowBelowMinRLen, workflowUnique, wFiltered, wMerged, wRedundant) = parseVCF(WORKFLOW_VCF, refName, targRegionsFl, vcfo3, vcfo3_firstTime)
        zgF += gFiltered
        zgR += gRedundant
        zgM += gMerged
        zwF += wFiltered
        zwR += wRedundant
        zwM += wMerged

        #
        # Deduce which variants are FP / FN
        #
        solvedInds = {} # positions where golden and workflow agree
        for var in correctHashed.keys():
            if var in workflowHashed or var[0] in solvedInds:
                correctHashed[var] = 2
                workflowHashed[var] = 2
                solvedInds[var[0]] = True
        # second pass: any variant at a solved position is also excluded
        for var in correctHashed.keys()+workflowHashed.keys():
            if var[0] in solvedInds:
                correctHashed[var] = 2
                workflowHashed[var] = 2
        nPerfect = len(solvedInds)

        # correctHashed[var] = 1: were not found
        #                    = 2: should be discluded because we were found
        #                    = 3: should be discluded because an alt was found
        notFound = [n for n in sorted(correctHashed.keys()) if correctHashed[n] == 1]
        FPvariants = [n for n in sorted(workflowHashed.keys()) if workflowHashed[n] == 1]

        #
        # condense all variants who have alternate alleles and were *not* found to have perfect matches
        # into a single variant again. These will not be included in the candidates for equivalency checking. Sorry!
        #
        notFound = condenseByPos(notFound)
        FPvariants = condenseByPos(FPvariants)

        #
        # tally up some values, if there are no golden variants lets save some CPU cycles and move to the next ref
        #
        totalGoldenVariants = nPerfect + len(notFound)
        totalWorkflowVariants = nPerfect + len(FPvariants)
        if totalGoldenVariants == 0:
            zfP += len(FPvariants)
            ztW += totalWorkflowVariants
            print '{0:.3f} (sec)'.format(time.time()-tt)
            continue

        #
        # let's check for equivalent variants
        #
        if FAST == False:
            delList_i = []      # FPvariants indices resolved as equivalent
            delList_j = []      # notFound indices resolved as equivalent
            regionsToCheck = [] # +/- EV_BPRANGE window around each FP
            for i in xrange(len(FPvariants)):
                pos = FPvariants[i][0]
                regionsToCheck.append((max([pos-EV_BPRANGE-1,0]),min([pos+EV_BPRANGE,len(myDat)-1])))

            for n in regionsToCheck:
                refSection = myDat[n[0]:n[1]]

                # apply all FP variants inside this window to the reference
                fpWithin = []
                for i in xrange(len(FPvariants)):
                    m = FPvariants[i]
                    if (m[0] > n[0] and m[0] < n[1]):
                        fpWithin.append((m,i))
                fpWithin = sorted(fpWithin)
                adj = 0
                altSection = copy.deepcopy(refSection)
                for (m,i) in fpWithin:
                    lr = len(m[1])
                    la = len(m[2])
                    dpos = m[0]-n[0]+adj
                    altSection = altSection[:dpos-1] + m[2] + altSection[dpos-1+lr:]
                    adj += la-lr

                # apply all FN (golden, not found) variants the same way
                nfWithin = []
                for j in xrange(len(notFound)):
                    m = notFound[j]
                    if (m[0] > n[0] and m[0] < n[1]):
                        nfWithin.append((m,j))
                nfWithin = sorted(nfWithin)
                adj = 0
                altSection2 = copy.deepcopy(refSection)
                for (m,j) in nfWithin:
                    lr = len(m[1])
                    la = len(m[2])
                    dpos = m[0]-n[0]+adj
                    altSection2 = altSection2[:dpos-1] + m[2] + altSection2[dpos-1+lr:]
                    adj += la-lr

                # same resulting sequence => the two call sets are equivalent here
                if altSection == altSection2:
                    for (m,i) in fpWithin:
                        if i not in delList_i:
                            delList_i.append(i)
                    for (m,j) in nfWithin:
                        if j not in delList_j:
                            delList_j.append(j)

            nEquiv = 0
            for i in sorted(list(set(delList_i)),reverse=True):
                del FPvariants[i]
            for j in sorted(list(set(delList_j)),reverse=True):
                del notFound[j]
                nEquiv += 1
            nPerfect += nEquiv

        #
        # Tally up errors and whatnot
        #
        ztV += totalGoldenVariants
        ztW += totalWorkflowVariants
        znP += nPerfect
        zfP += len(FPvariants)
        znF += len(notFound)
        if FAST == False:
            znE += nEquiv
        if BEDFILE != None:
            zbM += correctBelowMinRLen

        #
        # try to identify a reason for FN variants:
        #

        venn_data = [[0,0,0] for n in notFound] # [i] = (unmappable, low cov, low het)
        for i in xrange(len(notFound)):
            var = notFound[i]

            noReason = True

            # mappability?
            # NOTE(review): the track holds 1 for regions passing the
            # MIN_READLEN filter, yet a 1 feeds the 'Unmappable' venn set —
            # verify the polarity of this test
            if MAPTRACK != None:
                if refName in mappability_tracks and var[0] < len(mappability_tracks[refName]):
                    if mappability_tracks[refName][var[0]]:
                        mappability_vs_FN[1] += 1
                        venn_data[i][0] = 1
                        noReason = False
                    else:
                        mappability_vs_FN[0] += 1

            # coverage?
            if var in correctCov:
                c = correctCov[var]
                if c != None:
                    if c not in coverage_vs_FN:
                        coverage_vs_FN[c] = 0
                    coverage_vs_FN[c] += 1
                    if c < DP_THRESH:
                        venn_data[i][1] = 1
                        noReason = False

            # heterozygous genotype messing things up?
            #if var in correctAF:
            #	a = correctAF[var]
            #	if a != None:
            #		a = AF_KEYS[quantize_AF(a)]
            #		if a not in alleleBal_vs_FN:
            #			alleleBal_vs_FN[a] = 0
            #		alleleBal_vs_FN[a] += 1
            #		if a < AF_THRESH:
            #			venn_data[i][2] = 1

            # no reason?
            if noReason:
                venn_data[i][2] += 1

        # convert per-ref venn flags into globally-indexed membership sets
        for i in xrange(len(notFound)):
            if venn_data[i][0]: set1.append(i+varAdj)
            if venn_data[i][1]: set2.append(i+varAdj)
            if venn_data[i][2]: set3.append(i+varAdj)
        varAdj += len(notFound)

        #
        # if desired, write out vcf files.
        #
        notFound = sorted(notFound)
        FPvariants = sorted(FPvariants)
        if VCF_OUT:
            # re-scan the inputs so output lines keep their original formatting
            for line in open(GOLDEN_VCF,'r'):
                if line[0] != '#':
                    splt = line.split('\t')
                    if splt[0] == refName:
                        var = (int(splt[1]),splt[3],splt[4])
                        if var in notFound:
                            vcfo2.write(line)
            for line in open(WORKFLOW_VCF,'r'):
                if line[0] != '#':
                    splt = line.split('\t')
                    if splt[0] == refName:
                        var = (int(splt[1]),splt[3],splt[4])
                        if var in FPvariants:
                            vcfo3.write(line)

        print '{0:.3f} (sec)'.format(time.time()-tt)

    #
    # close vcf output
    #
    print ''
    if VCF_OUT:
        print OUT_PREFIX+'_FN.vcf'
        print OUT_PREFIX+'_FP.vcf'
        vcfo2.close()
        vcfo3.close()

    #
    # plot some FN stuff
    #
    if NO_PLOT == False:
        nDetected = len(set(set1+set2+set3))
        set1 = set(set1)
        set2 = set(set2)
        set3 = set(set3)

        # only label venn circles that are non-empty
        if len(set1): s1 = 'Unmappable'
        else: s1 = ''
        if len(set2): s2 = 'DP < '+str(DP_THRESH)
        else: s2 = ''
        #if len(set3): s3 = 'AF < '+str(AF_THRESH)
        if len(set3): s3 = 'Unknown'
        else: s3 = ''

        mpl.figure(0)
        tstr1 = 'False Negative Variants (Missed Detections)'
        #tstr2 = str(nDetected)+' / '+str(znF)+' FN variants categorized'
        tstr2 = ''
        if MAPTRACK != None:
            v = venn3([set1, set2, set3], (s1, s2, s3))
        else:
            v = venn2([set2, set3], (s2, s3))
        mpl.figtext(0.5,0.95,tstr1,fontdict={'size':14,'weight':'bold'},horizontalalignment='center')
        mpl.figtext(0.5,0.03,tstr2,fontdict={'size':14,'weight':'bold'},horizontalalignment='center')

        ouf = OUT_PREFIX+'_FNvenn.pdf'
        print ouf
        mpl.savefig(ouf)

    #
    # spit out results to console
    #
    print '\n**********************************\n'
    if BEDFILE != None:
        print 'ONLY CONSIDERING VARIANTS FOUND WITHIN TARGETED REGIONS\n\n'
    print 'Total Golden Variants: ',ztV,'\t[',zgF,'filtered,',zgM,'merged,',zgR,'redundant ]'
    print 'Total Workflow Variants:',ztW,'\t[',zwF,'filtered,',zwM,'merged,',zwR,'redundant ]'
    print ''
    if ztV > 0 and ztW > 0:
        print 'Perfect Matches:',znP,'({0:.2f}%)'.format(100.*float(znP)/ztV)
        print 'FN variants: ',znF,'({0:.2f}%)'.format(100.*float(znF)/ztV)
        print 'FP variants: ',zfP#,'({0:.2f}%)'.format(100.*float(zfP)/ztW)
    if FAST == False:
        print '\nNumber of equivalent variants denoted differently between the two vcfs:',znE
    if BEDFILE != None:
        print '\nNumber of golden variants located in targeted regions that were too small to be sampled from:',zbM
    if FAST:
        print "\nWarning! Running with '--fast' means that identical variants denoted differently between the two vcfs will not be detected! The values above may be lower than the true accuracy."
    #if NO_PLOT:
    if True:
        print '\n#unmappable: ',len(set1)
        print '#low_coverage:',len(set2)
        print '#unknown: ',len(set3)
    print '\n**********************************\n'
| 715 | |
| 716 | |
| 717 | |
| 718 | |
| 719 | |
# script entry point
if __name__ == '__main__':
    main()
