Mercurial > repos > thondeboer > neat_genreads
diff py/vcfFunc.py @ 0:6e75a84e9338 draft
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
| author | thondeboer |
|---|---|
| date | Tue, 15 May 2018 02:39:53 -0400 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/py/vcfFunc.py Tue May 15 02:39:53 2018 -0400 @@ -0,0 +1,182 @@ +import sys +import time +import os +import re +import random + +INCLUDE_HOMS = False +INCLUDE_FAIL = False +CHOOSE_RANDOM_PLOID_IF_NO_GT_FOUND = True + +def parseLine(splt,colDict,colSamp): + + # check if we want to proceed.. + ra = splt[colDict['REF']] + aa = splt[colDict['ALT']] + # enough columns? + if len(splt) != len(colDict): + return None + # exclude homs / filtered? + if not(INCLUDE_HOMS) and (aa == '.' or aa == '' or aa == ra): + return None + if not(INCLUDE_FAIL) and (splt[colDict['FILTER']] != 'PASS' and splt[colDict['FILTER']] != '.'): + return None + + # default vals + alt_alleles = [aa] + alt_freqs = [] + + gt_perSamp = [] + + # any alt alleles? + alt_split = aa.split(',') + if len(alt_split) > 1: + alt_alleles = alt_split + + # check INFO for AF + af = None + if 'INFO' in colDict and ';AF=' in ';'+splt[colDict['INFO']]: + info = splt[colDict['INFO']]+';' + af = re.findall(r"AF=.*?(?=;)",info)[0][3:] + if af != None: + af_splt = af.split(',') + while(len(af_splt) < len(alt_alleles)): # are we lacking enough AF values for some reason? + af_splt.append(af_splt[-1]) # phone it in. + if len(af_splt) != 0 and af_splt[0] != '.' and af_splt[0] != '': # missing data, yay + alt_freqs = [float(n) for n in af_splt] + else: + alt_freqs = [None]*max([len(alt_alleles),1]) + + gt_perSamp = None + # if available (i.e. we simulated it) look for WP in info + if len(colSamp) == 0 and 'INFO' in colDict and 'WP=' in splt[colDict['INFO']]: + info = splt[colDict['INFO']]+';' + gt_perSamp = [re.findall(r"WP=.*?(?=;)",info)[0][3:]] + else: + # if no sample columns, check info for GT + if len(colSamp) == 0 and 'INFO' in colDict and 'GT=' in splt[colDict['INFO']]: + info = splt[colDict['INFO']]+';' + gt_perSamp = [re.findall(r"GT=.*?(?=;)",info)[0][3:]] + elif len(colSamp): + fmt = ':'+splt[colDict['FORMAT']]+':' + if ':GT:' in fmt: + gtInd = fmt.split(':').index('GT') + gt_perSamp = [splt[colSamp[iii]].split(':')[gtInd-1] for iii in xrange(len(colSamp))] + for i in xrange(len(gt_perSamp)): + gt_perSamp[i] = gt_perSamp[i].replace('.','0') + if gt_perSamp == None: + gt_perSamp = [None]*max([len(colSamp),1]) + + return (alt_alleles, alt_freqs, gt_perSamp) + + + +def parseVCF(vcfPath,tumorNormal=False,ploidy=2): + + tt = time.time() + print '--------------------------------' + sys.stdout.write('reading input VCF...\n') + sys.stdout.flush() + + colDict = {} + colSamp = [] + nSkipped = 0 + nSkipped_becauseHash = 0 + allVars = {} # [ref][pos] + sampNames = [] + alreadyPrintedWarning = False + f = open(vcfPath,'r') + for line in f: + + if line[0] != '#': + if len(colDict) == 0: + print '\n\nERROR: VCF has no header?\n'+VCF_FILENAME+'\n\n' + f.close() + exit(1) + splt = line[:-1].split('\t') + plOut = parseLine(splt,colDict,colSamp) + if plOut == None: + nSkipped += 1 + else: + (aa, af, gt) = plOut + + # make sure at least one allele somewhere contains the variant + if tumorNormal: + gtEval = gt[:2] + else: + gtEval = gt[:1] + if None in gtEval: + if CHOOSE_RANDOM_PLOID_IF_NO_GT_FOUND: + if not alreadyPrintedWarning: + print 'Warning: Found variants without a GT field, assuming heterozygous...' + alreadyPrintedWarning = True + for i in xrange(len(gtEval)): + tmp = ['0']*ploidy + tmp[random.randint(0,ploidy-1)] = '1' + gtEval[i] = '/'.join(tmp) + else: + # skip because no GT field was found + nSkipped += 1 + continue + isNonReference = False + for gtVal in gtEval: + if gtVal != None: + if '1' in gtVal: + isNonReference = True + if not isNonReference: + # skip if no genotype actually contains this variant + nSkipped += 1 + continue + + chrom = splt[0] + pos = int(splt[1]) + ref = splt[3] + # skip if position is <= 0 + if pos <= 0: + nSkipped += 1 + continue + + # hash variants to avoid inserting duplicates (there are some messy VCFs out there...) + if chrom not in allVars: + allVars[chrom] = {} + if pos not in allVars[chrom]: + allVars[chrom][pos] = (pos,ref,aa,af,gtEval) + else: + nSkipped_becauseHash += 1 + + else: + if line[1] != '#': + cols = line[1:-1].split('\t') + for i in xrange(len(cols)): + if 'FORMAT' in colDict: + colSamp.append(i) + colDict[cols[i]] = i + if len(colSamp): + sampNames = cols[-len(colSamp):] + if len(colSamp) == 1: + pass + elif len(colSamp) == 2 and tumorNormal: + print 'Detected 2 sample columns in input VCF, assuming tumor/normal.' + else: + print 'Warning: Multiple sample columns present in input VCF. By default genReads uses only the first column.' + else: + sampNames = ['Unknown'] + if tumorNormal: + #tumorInd = sampNames.index('TUMOR') + #normalInd = sampNames.index('NORMAL') + if 'NORMAL' not in sampNames or 'TUMOR' not in sampNames: + print '\n\nERROR: Input VCF must have a "NORMAL" and "TUMOR" column.\n' + f.close() + + varsOut = {} + for r in allVars.keys(): + varsOut[r] = [allVars[r][k] for k in sorted(allVars[r].keys())] + + print 'found',sum([len(n) for n in allVars.values()]),'valid variants in input vcf.' + print ' *',nSkipped,'variants skipped: (qual filtered / ref genotypes / invalid syntax)' + print ' *',nSkipped_becauseHash,'variants skipped due to multiple variants found per position' + print '--------------------------------' + return (sampNames, varsOut) + + +
