Mercurial > repos > thondeboer > neat_genreads
diff utilities/vcf_compare_OLD.py @ 0:6e75a84e9338 draft
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
| author | thondeboer |
|---|---|
| date | Tue, 15 May 2018 02:39:53 -0400 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utilities/vcf_compare_OLD.py Tue May 15 02:39:53 2018 -0400 @@ -0,0 +1,721 @@ +#!/usr/bin/env python +# encoding: utf-8 + +""" ************************************************** + +vcf_compare.py + +- compare vcf file produced by workflow to golden vcf produced by simulator + +Written by: Zach Stephens +Date: January 20, 2015 +Contact: zstephe2@illinois.edu + +************************************************** """ + +import sys +import os +import copy +import time +import bisect +import re +import numpy as np +import optparse + + +EV_BPRANGE = 50 # how far to either side of a particular variant location do we want to check for equivalents? + +DEFAULT_QUAL = -666 # if we can't find a qual score, use this instead so we know it's missing + +MAX_VAL = 9999999999999 # an unreasonably large value that no reference fasta could concievably be longer than + +DESC = """%prog: vcf comparison script.""" +VERS = 0.1 + +PARSER = optparse.OptionParser('python %prog [options] -r <ref.fa> -g <golden.vcf> -w <workflow.vcf>',description=DESC,version="%prog v"+str(VERS)) + +PARSER.add_option('-r', help='* Reference Fasta', dest='REFF', action='store', metavar='<ref.fa>') +PARSER.add_option('-g', help='* Golden VCF', dest='GVCF', action='store', metavar='<golden.vcf>') +PARSER.add_option('-w', help='* Workflow VCF', dest='WVCF', action='store', metavar='<workflow.vcf>') +PARSER.add_option('-o', help='* Output Prefix', dest='OUTF', action='store', metavar='<prefix>') +PARSER.add_option('-m', help='Mappability Track', dest='MTRK', action='store', metavar='<track.bed>') +PARSER.add_option('-M', help='Maptrack Min Len', dest='MTMM', action='store', metavar='<int>') +PARSER.add_option('-t', help='Targetted Regions', dest='TREG', action='store', metavar='<regions.bed>') +PARSER.add_option('-T', help='Min Region Len', dest='MTRL', action='store', metavar='<int>') +PARSER.add_option('-c', help='Coverage Filter Threshold [%default]', dest='DP_THRESH', default=15, action='store', metavar='<int>') +PARSER.add_option('-a', help='Allele Freq Filter Threshold [%default]', dest='AF_THRESH', default=0.3, action='store', metavar='<float>') + +PARSER.add_option('--vcf-out', help="Output Match/FN/FP variants [%default]", dest='VCF_OUT', default=False, action='store_true') +PARSER.add_option('--no-plot', help="No plotting [%default]", dest='NO_PLOT', default=False, action='store_true') +PARSER.add_option('--incl-homs', help="Include homozygous ref calls [%default]", dest='INCL_H', default=False, action='store_true') +PARSER.add_option('--incl-fail', help="Include calls that failed filters [%default]", dest='INCL_F', default=False, action='store_true') +PARSER.add_option('--fast', help="No equivalent variant detection [%default]", dest='FAST', default=False, action='store_true') + +(OPTS,ARGS) = PARSER.parse_args() + +REFERENCE = OPTS.REFF +GOLDEN_VCF = OPTS.GVCF +WORKFLOW_VCF = OPTS.WVCF +OUT_PREFIX = OPTS.OUTF +MAPTRACK = OPTS.MTRK +MIN_READLEN = OPTS.MTMM +BEDFILE = OPTS.TREG +DP_THRESH = int(OPTS.DP_THRESH) +AF_THRESH = float(OPTS.AF_THRESH) + +VCF_OUT = OPTS.VCF_OUT +NO_PLOT = OPTS.NO_PLOT +INCLUDE_HOMS = OPTS.INCL_H +INCLUDE_FAIL = OPTS.INCL_F +FAST = OPTS.FAST + +if len(sys.argv[1:]) == 0: + PARSER.print_help() + exit(1) + +if OPTS.MTRL != None: + MINREGIONLEN = int(OPTS.MTRL) +else: + MINREGIONLEN = None + +if MIN_READLEN == None: + MIN_READLEN = 0 +else: + MIN_READLEN = int(MIN_READLEN) + +if REFERENCE == None: + print 'Error: No reference provided.' + exit(1) +if GOLDEN_VCF == None: + print 'Error: No golden VCF provided.' + exit(1) +if WORKFLOW_VCF == None: + print 'Error: No workflow VCF provided.' + exit(1) +if OUT_PREFIX == None: + print 'Error: No output prefix provided.' + exit(1) +if (BEDFILE != None and MINREGIONLEN == None) or (BEDFILE == None and MINREGIONLEN != None): + print 'Error: Both -t and -T must be specified' + exit(1) + +if NO_PLOT == False: + import matplotlib + matplotlib.use('Agg') + import matplotlib.pyplot as mpl + from matplotlib_venn import venn2, venn3 + import warnings + warnings.filterwarnings("ignore", category=UserWarning, module='matplotlib_venn') + +AF_STEPS = 20 +AF_KEYS = np.linspace(0.0,1.0,AF_STEPS+1) + +def quantize_AF(af): + if af >= 1.0: + return AF_STEPS + elif af <= 0.0: + return 0 + else: + return int(af*AF_STEPS) + +VCF_HEADER = '##fileformat=VCFv4.1\n##reference='+REFERENCE+'##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">\n##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n' + +DP_TOKENS = ['DP','DPU','DPI'] # in the order that we'll look for them + +def parseLine(splt,colDict,colSamp): + + # check if we want to proceed.. + ra = splt[colDict['REF']] + aa = splt[colDict['ALT']] + if not(INCLUDE_HOMS) and (aa == '.' or aa == '' or aa == ra): + return None + if not(INCLUDE_FAIL) and (splt[colDict['FILTER']] != 'PASS' and splt[colDict['FILTER']] != '.'): + return None + + # default vals + cov = None + qual = DEFAULT_QUAL + alt_alleles = [] + alt_freqs = [None] + + # any alt alleles? + alt_split = aa.split(',') + if len(alt_split) > 1: + alt_alleles = alt_split + + # cov + for dp_tok in DP_TOKENS: + # check INFO for DP first + if 'INFO' in colDict and dp_tok+'=' in splt[colDict['INFO']]: + cov = int(re.findall(re.escape(dp_tok)+r"=[0-9]+",splt[colDict['INFO']])[0][3:]) + # check FORMAT/SAMPLE for DP second: + elif 'FORMAT' in colDict and len(colSamp): + format = splt[colDict['FORMAT']]+':' + if ':'+dp_tok+':' in format: + dpInd = format.split(':').index(dp_tok) + cov = int(splt[colSamp[0]].split(':')[dpInd]) + if cov != None: + break + + # check INFO for AF first + af = None + if 'INFO' in colDict and ';AF=' in ';'+splt[colDict['INFO']]: + info = splt[colDict['INFO']]+';' + af = re.findall(r"AF=.*?(?=;)",info)[0][3:] + # check FORMAT/SAMPLE for AF second: + elif 'FORMAT' in colDict and len(colSamp): + format = splt[colDict['FORMAT']]+':' + if ':AF:' in format: + afInd = splt[colDict['FORMAT']].split(':').index('AF') + af = splt[colSamp[0]].split(':')[afInd] + + if af != None: + af_splt = af.split(',') + while(len(af_splt) < len(alt_alleles)): # are we lacking enough AF values for some reason? + af_splt.append(af_splt[-1]) # phone it in. + if len(af_splt) != 0 and af_splt[0] != '.' and af_splt[0] != '': # missing data, yay + alt_freqs = [float(n) for n in af_splt] + else: + alt_freqs = [None]*max([len(alt_alleles),1]) + + # get QUAL if it's interesting + if 'QUAL' in colDict and splt[colDict['QUAL']] != '.': + qual = float(splt[colDict['QUAL']]) + + return (cov, qual, alt_alleles, alt_freqs) + + +def parseVCF(VCF_FILENAME,refName,targRegionsFl,outFile,outBool): + v_Hashed = {} + v_posHash = {} + v_Alts = {} + v_Cov = {} + v_AF = {} + v_Qual = {} + v_TargLen = {} + nBelowMinRLen = 0 + line_unique = 0 # number of lines in vcf file containing unique variant + hash_coll = 0 # number of times we saw a hash collision ("per line" so non-unique alt alleles don't get counted multiple times) + var_filtered = 0 # number of variants excluded due to filters (e.g. hom-refs, qual) + var_merged = 0 # number of variants we merged into another due to having the same position specified + colDict = {} + colSamp = [] + for line in open(VCF_FILENAME,'r'): + if line[0] != '#': + if len(colDict) == 0: + print '\n\nError: VCF has no header?\n'+VCF_FILENAME+'\n\n' + exit(1) + splt = line[:-1].split('\t') + if splt[0] == refName: + + var = (int(splt[1]),splt[3],splt[4]) + targInd = bisect.bisect(targRegionsFl,var[0]) + + if targInd%2 == 1: + targLen = targRegionsFl[targInd]-targRegionsFl[targInd-1] + if (BEDFILE != None and targLen >= MINREGIONLEN) or BEDFILE == None: + + pl_out = parseLine(splt,colDict,colSamp) + if pl_out == None: + var_filtered += 1 + continue + (cov, qual, aa, af) = pl_out + + if var not in v_Hashed: + + vpos = var[0] + if vpos in v_posHash: + if len(aa) == 0: + aa = [var[2]] + aa.extend([n[2] for n in v_Hashed.keys() if n[0] == vpos]) + var_merged += 1 + v_posHash[vpos] = 1 + + if len(aa): + allVars = [(var[0],var[1],n) for n in aa] + for i in xrange(len(allVars)): + v_Hashed[allVars[i]] = 1 + #if allVars[i] not in v_Alts: + # v_Alts[allVars[i]] = [] + #v_Alts[allVars[i]].extend(allVars) + v_Alts[allVars[i]] = allVars + else: + v_Hashed[var] = 1 + + if cov != None: + v_Cov[var] = cov + v_AF[var] = af[0] # only use first AF, even if multiple. fix this later? + v_Qual[var] = qual + v_TargLen[var] = targLen + line_unique += 1 + + else: + hash_coll += 1 + + else: + nBelowMinRLen += 1 + else: + if line[1] != '#': + cols = line[1:-1].split('\t') + for i in xrange(len(cols)): + if 'FORMAT' in colDict: + colSamp.append(i) + colDict[cols[i]] = i + if VCF_OUT and outBool: + outBool = False + outFile.write(line) + + return (v_Hashed, v_Alts, v_Cov, v_AF, v_Qual, v_TargLen, nBelowMinRLen, line_unique, var_filtered, var_merged, hash_coll) + + +def condenseByPos(listIn): + varListOfInterest = [n for n in listIn] + indCount = {} + for n in varListOfInterest: + c = n[0] + if c not in indCount: + indCount[c] = 0 + indCount[c] += 1 + #nonUniqueDict = {n:[] for n in sorted(indCount.keys()) if indCount[n] > 1} # the python 2.7 way + nonUniqueDict = {} + for n in sorted(indCount.keys()): + if indCount[n] > 1: + nonUniqueDict[n] = [] + delList = [] + for i in xrange(len(varListOfInterest)): + if varListOfInterest[i][0] in nonUniqueDict: + nonUniqueDict[varListOfInterest[i][0]].append(varListOfInterest[i]) + delList.append(i) + delList = sorted(delList,reverse=True) + for di in delList: + del varListOfInterest[di] + for v in nonUniqueDict.values(): + var = (v[0][0],v[0][1],','.join([n[2] for n in v[::-1]])) + varListOfInterest.append(var) + return varListOfInterest + + +def main(): + + ref = [] + f = open(REFERENCE,'r') + nLines = 0 + prevR = None + prevP = None + ref_inds = [] + sys.stdout.write('\nindexing reference fasta... ') + sys.stdout.flush() + tt = time.time() + while 1: + nLines += 1 + data = f.readline() + if not data: + ref_inds.append( (prevR, prevP, f.tell()-len(data)) ) + break + if data[0] == '>': + if prevP != None: + ref_inds.append( (prevR, prevP, f.tell()-len(data)) ) + prevP = f.tell() + prevR = data[1:-1] + print '{0:.3f} (sec)'.format(time.time()-tt) + #ref_inds = [('chrM', 6, 16909), ('chr1', 16915, 254252549), ('chr2', 254252555, 502315916), ('chr3', 502315922, 704298801), ('chr4', 704298807, 899276169), ('chr5', 899276175, 1083809741), ('chr6', 1083809747, 1258347116), ('chr7', 1258347122, 1420668559), ('chr8', 1420668565, 1569959868), ('chr9', 1569959874, 1713997574), ('chr10', 1713997581, 1852243023), ('chr11', 1852243030, 1989949677), ('chr12', 1989949684, 2126478617), ('chr13', 2126478624, 2243951900), ('chr14', 2243951907, 2353448438), ('chr15', 2353448445, 2458030465), ('chr16', 2458030472, 2550192321), ('chr17', 2550192328, 2633011443), ('chr18', 2633011450, 2712650243), ('chr19', 2712650250, 2772961813), ('chr20', 2772961820, 2837247851), ('chr21', 2837247858, 2886340351), ('chr22', 2886340358, 2938671016), ('chrX', 2938671022, 3097046994), ('chrY', 3097047000, 3157608038)] + + ztV = 0 # total golden variants + ztW = 0 # total workflow variants + znP = 0 # total perfect matches + zfP = 0 # total false positives + znF = 0 # total false negatives + znE = 0 # total equivalent variants detected + zgF = 0 # total golden variants that were filtered and excluded + zgR = 0 # total golden variants that were excluded for being redundant + zgM = 0 # total golden variants that were merged into a single position + zwF = 0 # total workflow variants that were filtered and excluded + zwR = 0 # total workflow variants that were excluded for being redundant + zwM = 0 # total workflow variants that were merged into a single position + if BEDFILE != None: + zbM = 0 + + mappability_vs_FN = {0:0, 1:0} # [0] = # of FNs that were in mappable regions, [1] = # of FNs that were in unmappable regions + coverage_vs_FN = {} # [C] = # of FNs that were covered by C reads + alleleBal_vs_FN = {} # [AF] = # of FNs that were heterozygous genotypes with allele freq AF (quantized to multiples of 1/AF_STEPS) + for n in AF_KEYS: + alleleBal_vs_FN[n] = 0 + + # + # read in mappability track + # + mappability_tracks = {} # indexed by chr string (e.g. 'chr1'), has boolean array + prevRef = '' + relevantRegions = [] + if MAPTRACK != None: + mtf = open(MAPTRACK,'r') + for line in mtf: + splt = line.strip().split('\t') + if prevRef != '' and splt[0] != prevRef: + # do stuff + if len(relevantRegions): + myTrack = [0]*(relevantRegions[-1][1]+100) + for r in relevantRegions: + for ri in xrange(r[0],r[1]): + myTrack[ri] = 1 + mappability_tracks[prevRef] = [n for n in myTrack] + # + relevantRegions = [] + if int(splt[3]) >= MIN_READLEN: + relevantRegions.append((int(splt[1]),int(splt[2]))) + prevRef = splt[0] + mtf.close() + # do stuff + if len(relevantRegions): + myTrack = [0]*(relevantRegions[-1][1]+100) + for r in relevantRegions: + for ri in xrange(r[0],r[1]): + myTrack[ri] = 1 + mappability_tracks[prevRef] = [n for n in myTrack] + # + + # + # init vcf output, if desired + # + vcfo2 = None + vcfo3 = None + global vcfo2_firstTime + global vcfo3_firstTime + vcfo2_firstTime = False + vcfo3_firstTime = False + if VCF_OUT: + vcfo2 = open(OUT_PREFIX+'_FN.vcf','w') + vcfo3 = open(OUT_PREFIX+'_FP.vcf','w') + vcfo2_firstTime = True + vcfo3_firstTime = True + + # + # data for plotting FN analysis + # + set1 = [] + set2 = [] + set3 = [] + varAdj = 0 + + # + # + # For each sequence in reference fasta... + # + # + for n_RI in ref_inds: + + refName = n_RI[0] + if FAST == False: + f.seek(n_RI[1]) + print 'reading '+refName+'...', + myDat = f.read(n_RI[2]-n_RI[1]).split('\n') + myLen = sum([len(m) for m in myDat]) + if sys.version_info >= (2,7): + print '{:,} bp'.format(myLen) + else: + print '{0:} bp'.format(myLen) + inWidth = len(myDat[0]) + if len(myDat[-1]) == 0: # if last line is empty, remove it. + del myDat[-1] + if inWidth*(len(myDat)-1)+len(myDat[-1]) != myLen: + print 'fasta column-width not consistent.' + print myLen, inWidth*(len(myDat)-1)+len(myDat[-1]) + for i in xrange(len(myDat)): + if len(myDat[i]) != inWidth: + print i, len(myDat[i]), inWidth + exit(1) + + myDat = bytearray(''.join(myDat)).upper() + myLen = len(myDat) + + # + # Parse relevant targeted regions + # + targRegionsFl = [] + if BEDFILE != None: + bedfile = open(BEDFILE,'r') + for line in bedfile: + splt = line.split('\t') + if splt[0] == refName: + targRegionsFl.extend((int(splt[1]),int(splt[2]))) + bedfile.close() + else: + targRegionsFl = [-1,MAX_VAL+1] + + # + # Parse vcf files + # + sys.stdout.write('comparing variation in '+refName+'... ') + sys.stdout.flush() + tt = time.time() + + (correctHashed, correctAlts, correctCov, correctAF, correctQual, correctTargLen, correctBelowMinRLen, correctUnique, gFiltered, gMerged, gRedundant) = parseVCF(GOLDEN_VCF, refName, targRegionsFl, vcfo2, vcfo2_firstTime) + (workflowHashed, workflowAlts, workflowCov, workflowAF, workflowQual, workflowTarLen, workflowBelowMinRLen, workflowUnique, wFiltered, wMerged, wRedundant) = parseVCF(WORKFLOW_VCF, refName, targRegionsFl, vcfo3, vcfo3_firstTime) + zgF += gFiltered + zgR += gRedundant + zgM += gMerged + zwF += wFiltered + zwR += wRedundant + zwM += wMerged + + # + # Deduce which variants are FP / FN + # + solvedInds = {} + for var in correctHashed.keys(): + if var in workflowHashed or var[0] in solvedInds: + correctHashed[var] = 2 + workflowHashed[var] = 2 + solvedInds[var[0]] = True + for var in correctHashed.keys()+workflowHashed.keys(): + if var[0] in solvedInds: + correctHashed[var] = 2 + workflowHashed[var] = 2 + nPerfect = len(solvedInds) + + # correctHashed[var] = 1: were not found + # = 2: should be discluded because we were found + # = 3: should be discluded because an alt was found + notFound = [n for n in sorted(correctHashed.keys()) if correctHashed[n] == 1] + FPvariants = [n for n in sorted(workflowHashed.keys()) if workflowHashed[n] == 1] + + # + # condense all variants who have alternate alleles and were *not* found to have perfect matches + # into a single variant again. These will not be included in the candidates for equivalency checking. Sorry! + # + notFound = condenseByPos(notFound) + FPvariants = condenseByPos(FPvariants) + + # + # tally up some values, if there are no golden variants lets save some CPU cycles and move to the next ref + # + totalGoldenVariants = nPerfect + len(notFound) + totalWorkflowVariants = nPerfect + len(FPvariants) + if totalGoldenVariants == 0: + zfP += len(FPvariants) + ztW += totalWorkflowVariants + print '{0:.3f} (sec)'.format(time.time()-tt) + continue + + # + # let's check for equivalent variants + # + if FAST == False: + delList_i = [] + delList_j = [] + regionsToCheck = [] + for i in xrange(len(FPvariants)): + pos = FPvariants[i][0] + regionsToCheck.append((max([pos-EV_BPRANGE-1,0]),min([pos+EV_BPRANGE,len(myDat)-1]))) + + for n in regionsToCheck: + refSection = myDat[n[0]:n[1]] + + fpWithin = [] + for i in xrange(len(FPvariants)): + m = FPvariants[i] + if (m[0] > n[0] and m[0] < n[1]): + fpWithin.append((m,i)) + fpWithin = sorted(fpWithin) + adj = 0 + altSection = copy.deepcopy(refSection) + for (m,i) in fpWithin: + lr = len(m[1]) + la = len(m[2]) + dpos = m[0]-n[0]+adj + altSection = altSection[:dpos-1] + m[2] + altSection[dpos-1+lr:] + adj += la-lr + + nfWithin = [] + for j in xrange(len(notFound)): + m = notFound[j] + if (m[0] > n[0] and m[0] < n[1]): + nfWithin.append((m,j)) + nfWithin = sorted(nfWithin) + adj = 0 + altSection2 = copy.deepcopy(refSection) + for (m,j) in nfWithin: + lr = len(m[1]) + la = len(m[2]) + dpos = m[0]-n[0]+adj + altSection2 = altSection2[:dpos-1] + m[2] + altSection2[dpos-1+lr:] + adj += la-lr + + if altSection == altSection2: + for (m,i) in fpWithin: + if i not in delList_i: + delList_i.append(i) + for (m,j) in nfWithin: + if j not in delList_j: + delList_j.append(j) + + nEquiv = 0 + for i in sorted(list(set(delList_i)),reverse=True): + del FPvariants[i] + for j in sorted(list(set(delList_j)),reverse=True): + del notFound[j] + nEquiv += 1 + nPerfect += nEquiv + + # + # Tally up errors and whatnot + # + ztV += totalGoldenVariants + ztW += totalWorkflowVariants + znP += nPerfect + zfP += len(FPvariants) + znF += len(notFound) + if FAST == False: + znE += nEquiv + if BEDFILE != None: + zbM += correctBelowMinRLen + + # + # try to identify a reason for FN variants: + # + + venn_data = [[0,0,0] for n in notFound] # [i] = (unmappable, low cov, low het) + for i in xrange(len(notFound)): + var = notFound[i] + + noReason = True + + # mappability? + if MAPTRACK != None: + if refName in mappability_tracks and var[0] < len(mappability_tracks[refName]): + if mappability_tracks[refName][var[0]]: + mappability_vs_FN[1] += 1 + venn_data[i][0] = 1 + noReason = False + else: + mappability_vs_FN[0] += 1 + + # coverage? + if var in correctCov: + c = correctCov[var] + if c != None: + if c not in coverage_vs_FN: + coverage_vs_FN[c] = 0 + coverage_vs_FN[c] += 1 + if c < DP_THRESH: + venn_data[i][1] = 1 + noReason = False + + # heterozygous genotype messing things up? + #if var in correctAF: + # a = correctAF[var] + # if a != None: + # a = AF_KEYS[quantize_AF(a)] + # if a not in alleleBal_vs_FN: + # alleleBal_vs_FN[a] = 0 + # alleleBal_vs_FN[a] += 1 + # if a < AF_THRESH: + # venn_data[i][2] = 1 + + # no reason? + if noReason: + venn_data[i][2] += 1 + + for i in xrange(len(notFound)): + if venn_data[i][0]: set1.append(i+varAdj) + if venn_data[i][1]: set2.append(i+varAdj) + if venn_data[i][2]: set3.append(i+varAdj) + varAdj += len(notFound) + + # + # if desired, write out vcf files. + # + notFound = sorted(notFound) + FPvariants = sorted(FPvariants) + if VCF_OUT: + for line in open(GOLDEN_VCF,'r'): + if line[0] != '#': + splt = line.split('\t') + if splt[0] == refName: + var = (int(splt[1]),splt[3],splt[4]) + if var in notFound: + vcfo2.write(line) + for line in open(WORKFLOW_VCF,'r'): + if line[0] != '#': + splt = line.split('\t') + if splt[0] == refName: + var = (int(splt[1]),splt[3],splt[4]) + if var in FPvariants: + vcfo3.write(line) + + print '{0:.3f} (sec)'.format(time.time()-tt) + + # + # close vcf output + # + print '' + if VCF_OUT: + print OUT_PREFIX+'_FN.vcf' + print OUT_PREFIX+'_FP.vcf' + vcfo2.close() + vcfo3.close() + + # + # plot some FN stuff + # + if NO_PLOT == False: + nDetected = len(set(set1+set2+set3)) + set1 = set(set1) + set2 = set(set2) + set3 = set(set3) + + if len(set1): s1 = 'Unmappable' + else: s1 = '' + if len(set2): s2 = 'DP < '+str(DP_THRESH) + else: s2 = '' + #if len(set3): s3 = 'AF < '+str(AF_THRESH) + if len(set3): s3 = 'Unknown' + else: s3 = '' + + mpl.figure(0) + tstr1 = 'False Negative Variants (Missed Detections)' + #tstr2 = str(nDetected)+' / '+str(znF)+' FN variants categorized' + tstr2 = '' + if MAPTRACK != None: + v = venn3([set1, set2, set3], (s1, s2, s3)) + else: + v = venn2([set2, set3], (s2, s3)) + mpl.figtext(0.5,0.95,tstr1,fontdict={'size':14,'weight':'bold'},horizontalalignment='center') + mpl.figtext(0.5,0.03,tstr2,fontdict={'size':14,'weight':'bold'},horizontalalignment='center') + + ouf = OUT_PREFIX+'_FNvenn.pdf' + print ouf + mpl.savefig(ouf) + + # + # spit out results to console + # + print '\n**********************************\n' + if BEDFILE != None: + print 'ONLY CONSIDERING VARIANTS FOUND WITHIN TARGETED REGIONS\n\n' + print 'Total Golden Variants: ',ztV,'\t[',zgF,'filtered,',zgM,'merged,',zgR,'redundant ]' + print 'Total Workflow Variants:',ztW,'\t[',zwF,'filtered,',zwM,'merged,',zwR,'redundant ]' + print '' + if ztV > 0 and ztW > 0: + print 'Perfect Matches:',znP,'({0:.2f}%)'.format(100.*float(znP)/ztV) + print 'FN variants: ',znF,'({0:.2f}%)'.format(100.*float(znF)/ztV) + print 'FP variants: ',zfP#,'({0:.2f}%)'.format(100.*float(zfP)/ztW) + if FAST == False: + print '\nNumber of equivalent variants denoted differently between the two vcfs:',znE + if BEDFILE != None: + print '\nNumber of golden variants located in targeted regions that were too small to be sampled from:',zbM + if FAST: + print "\nWarning! Running with '--fast' means that identical variants denoted differently between the two vcfs will not be detected! The values above may be lower than the true accuracy." + #if NO_PLOT: + if True: + print '\n#unmappable: ',len(set1) + print '#low_coverage:',len(set2) + print '#unknown: ',len(set3) + print '\n**********************************\n' + + + + + +if __name__ == '__main__': + main()
