Mercurial > repos > thondeboer > neat_genreads
comparison py/vcfFunc.py @ 0:6e75a84e9338 draft
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
| author | thondeboer |
|---|---|
| date | Tue, 15 May 2018 02:39:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:6e75a84e9338 |
|---|---|
| 1 import sys | |
| 2 import time | |
| 3 import os | |
| 4 import re | |
| 5 import random | |
| 6 | |
| 7 INCLUDE_HOMS = False | |
| 8 INCLUDE_FAIL = False | |
| 9 CHOOSE_RANDOM_PLOID_IF_NO_GT_FOUND = True | |
| 10 | |
| 11 def parseLine(splt,colDict,colSamp): | |
| 12 | |
| 13 # check if we want to proceed.. | |
| 14 ra = splt[colDict['REF']] | |
| 15 aa = splt[colDict['ALT']] | |
| 16 # enough columns? | |
| 17 if len(splt) != len(colDict): | |
| 18 return None | |
| 19 # exclude homs / filtered? | |
| 20 if not(INCLUDE_HOMS) and (aa == '.' or aa == '' or aa == ra): | |
| 21 return None | |
| 22 if not(INCLUDE_FAIL) and (splt[colDict['FILTER']] != 'PASS' and splt[colDict['FILTER']] != '.'): | |
| 23 return None | |
| 24 | |
| 25 # default vals | |
| 26 alt_alleles = [aa] | |
| 27 alt_freqs = [] | |
| 28 | |
| 29 gt_perSamp = [] | |
| 30 | |
| 31 # any alt alleles? | |
| 32 alt_split = aa.split(',') | |
| 33 if len(alt_split) > 1: | |
| 34 alt_alleles = alt_split | |
| 35 | |
| 36 # check INFO for AF | |
| 37 af = None | |
| 38 if 'INFO' in colDict and ';AF=' in ';'+splt[colDict['INFO']]: | |
| 39 info = splt[colDict['INFO']]+';' | |
| 40 af = re.findall(r"AF=.*?(?=;)",info)[0][3:] | |
| 41 if af != None: | |
| 42 af_splt = af.split(',') | |
| 43 while(len(af_splt) < len(alt_alleles)): # are we lacking enough AF values for some reason? | |
| 44 af_splt.append(af_splt[-1]) # phone it in. | |
| 45 if len(af_splt) != 0 and af_splt[0] != '.' and af_splt[0] != '': # missing data, yay | |
| 46 alt_freqs = [float(n) for n in af_splt] | |
| 47 else: | |
| 48 alt_freqs = [None]*max([len(alt_alleles),1]) | |
| 49 | |
| 50 gt_perSamp = None | |
| 51 # if available (i.e. we simulated it) look for WP in info | |
| 52 if len(colSamp) == 0 and 'INFO' in colDict and 'WP=' in splt[colDict['INFO']]: | |
| 53 info = splt[colDict['INFO']]+';' | |
| 54 gt_perSamp = [re.findall(r"WP=.*?(?=;)",info)[0][3:]] | |
| 55 else: | |
| 56 # if no sample columns, check info for GT | |
| 57 if len(colSamp) == 0 and 'INFO' in colDict and 'GT=' in splt[colDict['INFO']]: | |
| 58 info = splt[colDict['INFO']]+';' | |
| 59 gt_perSamp = [re.findall(r"GT=.*?(?=;)",info)[0][3:]] | |
| 60 elif len(colSamp): | |
| 61 fmt = ':'+splt[colDict['FORMAT']]+':' | |
| 62 if ':GT:' in fmt: | |
| 63 gtInd = fmt.split(':').index('GT') | |
| 64 gt_perSamp = [splt[colSamp[iii]].split(':')[gtInd-1] for iii in xrange(len(colSamp))] | |
| 65 for i in xrange(len(gt_perSamp)): | |
| 66 gt_perSamp[i] = gt_perSamp[i].replace('.','0') | |
| 67 if gt_perSamp == None: | |
| 68 gt_perSamp = [None]*max([len(colSamp),1]) | |
| 69 | |
| 70 return (alt_alleles, alt_freqs, gt_perSamp) | |
| 71 | |
| 72 | |
| 73 | |
| 74 def parseVCF(vcfPath,tumorNormal=False,ploidy=2): | |
| 75 | |
| 76 tt = time.time() | |
| 77 print '--------------------------------' | |
| 78 sys.stdout.write('reading input VCF...\n') | |
| 79 sys.stdout.flush() | |
| 80 | |
| 81 colDict = {} | |
| 82 colSamp = [] | |
| 83 nSkipped = 0 | |
| 84 nSkipped_becauseHash = 0 | |
| 85 allVars = {} # [ref][pos] | |
| 86 sampNames = [] | |
| 87 alreadyPrintedWarning = False | |
| 88 f = open(vcfPath,'r') | |
| 89 for line in f: | |
| 90 | |
| 91 if line[0] != '#': | |
| 92 if len(colDict) == 0: | |
| 93 print '\n\nERROR: VCF has no header?\n'+VCF_FILENAME+'\n\n' | |
| 94 f.close() | |
| 95 exit(1) | |
| 96 splt = line[:-1].split('\t') | |
| 97 plOut = parseLine(splt,colDict,colSamp) | |
| 98 if plOut == None: | |
| 99 nSkipped += 1 | |
| 100 else: | |
| 101 (aa, af, gt) = plOut | |
| 102 | |
| 103 # make sure at least one allele somewhere contains the variant | |
| 104 if tumorNormal: | |
| 105 gtEval = gt[:2] | |
| 106 else: | |
| 107 gtEval = gt[:1] | |
| 108 if None in gtEval: | |
| 109 if CHOOSE_RANDOM_PLOID_IF_NO_GT_FOUND: | |
| 110 if not alreadyPrintedWarning: | |
| 111 print 'Warning: Found variants without a GT field, assuming heterozygous...' | |
| 112 alreadyPrintedWarning = True | |
| 113 for i in xrange(len(gtEval)): | |
| 114 tmp = ['0']*ploidy | |
| 115 tmp[random.randint(0,ploidy-1)] = '1' | |
| 116 gtEval[i] = '/'.join(tmp) | |
| 117 else: | |
| 118 # skip because no GT field was found | |
| 119 nSkipped += 1 | |
| 120 continue | |
| 121 isNonReference = False | |
| 122 for gtVal in gtEval: | |
| 123 if gtVal != None: | |
| 124 if '1' in gtVal: | |
| 125 isNonReference = True | |
| 126 if not isNonReference: | |
| 127 # skip if no genotype actually contains this variant | |
| 128 nSkipped += 1 | |
| 129 continue | |
| 130 | |
| 131 chrom = splt[0] | |
| 132 pos = int(splt[1]) | |
| 133 ref = splt[3] | |
| 134 # skip if position is <= 0 | |
| 135 if pos <= 0: | |
| 136 nSkipped += 1 | |
| 137 continue | |
| 138 | |
| 139 # hash variants to avoid inserting duplicates (there are some messy VCFs out there...) | |
| 140 if chrom not in allVars: | |
| 141 allVars[chrom] = {} | |
| 142 if pos not in allVars[chrom]: | |
| 143 allVars[chrom][pos] = (pos,ref,aa,af,gtEval) | |
| 144 else: | |
| 145 nSkipped_becauseHash += 1 | |
| 146 | |
| 147 else: | |
| 148 if line[1] != '#': | |
| 149 cols = line[1:-1].split('\t') | |
| 150 for i in xrange(len(cols)): | |
| 151 if 'FORMAT' in colDict: | |
| 152 colSamp.append(i) | |
| 153 colDict[cols[i]] = i | |
| 154 if len(colSamp): | |
| 155 sampNames = cols[-len(colSamp):] | |
| 156 if len(colSamp) == 1: | |
| 157 pass | |
| 158 elif len(colSamp) == 2 and tumorNormal: | |
| 159 print 'Detected 2 sample columns in input VCF, assuming tumor/normal.' | |
| 160 else: | |
| 161 print 'Warning: Multiple sample columns present in input VCF. By default genReads uses only the first column.' | |
| 162 else: | |
| 163 sampNames = ['Unknown'] | |
| 164 if tumorNormal: | |
| 165 #tumorInd = sampNames.index('TUMOR') | |
| 166 #normalInd = sampNames.index('NORMAL') | |
| 167 if 'NORMAL' not in sampNames or 'TUMOR' not in sampNames: | |
| 168 print '\n\nERROR: Input VCF must have a "NORMAL" and "TUMOR" column.\n' | |
| 169 f.close() | |
| 170 | |
| 171 varsOut = {} | |
| 172 for r in allVars.keys(): | |
| 173 varsOut[r] = [allVars[r][k] for k in sorted(allVars[r].keys())] | |
| 174 | |
| 175 print 'found',sum([len(n) for n in allVars.values()]),'valid variants in input vcf.' | |
| 176 print ' *',nSkipped,'variants skipped: (qual filtered / ref genotypes / invalid syntax)' | |
| 177 print ' *',nSkipped_becauseHash,'variants skipped due to multiple variants found per position' | |
| 178 print '--------------------------------' | |
| 179 return (sampNames, varsOut) | |
| 180 | |
| 181 | |
| 182 |
