Mercurial > repos > thondeboer > neat_genreads
comparison utilities/genMutModel.py @ 0:6e75a84e9338 draft
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
| author | thondeboer |
|---|---|
| date | Tue, 15 May 2018 02:39:53 -0400 |
| parents | |
| children |
Comparison view legend: lines are marked as equal, deleted, inserted, or replaced.
| -1:000000000000 | 0:6e75a84e9338 |
|---|---|
#!/usr/bin/env python

import sys
import os
import re
import bisect
import pickle
import argparse
import numpy as np
#matplotlib is not used as far as i can see
#import matplotlib.pyplot as mpl

# absolute path to the directory above this script
SIM_PATH = '/'.join(os.path.realpath(__file__).split('/')[:-2])
sys.path.append(SIM_PATH+'/py/')

from refFunc import indexRef

# reference contig names accepted by default: 1-29 / x / y / mt (several
# capitalizations), each with and without a 'chr' prefix
REF_WHITELIST = [str(n) for n in xrange(1,30)] + ['x','y','X','Y','mt','Mt','MT']
REF_WHITELIST += ['chr'+n for n in REF_WHITELIST]
# canonical nucleotides, and all 64 trinucleotide strings built from them
VALID_NUCL = ['A','C','G','T']
VALID_TRINUC = [VALID_NUCL[i]+VALID_NUCL[j]+VALID_NUCL[k] for i in xrange(len(VALID_NUCL)) for j in xrange(len(VALID_NUCL)) for k in xrange(len(VALID_NUCL))]
# if parsing a dbsnp vcf, and no CAF= is found in info tag, use this as default val for population freq
VCF_DEFAULT_POP_FREQ = 0.00001


#########################################################
#				VARIOUS HELPER FUNCTIONS				#
#########################################################
| 30 | |
| 31 | |
# given a reference index, grab the sequence string of a specified reference
def getChrFromFasta(refPath, ref_inds, chrName):
    """Return the sequence of contig chrName from the fasta at refPath.

    ref_inds entries are (name, seq_start_offset, seq_end_offset) tuples as
    produced by indexRef(); the raw byte span is read and newlines stripped.

    Raises ValueError if chrName is not present in the index (previously this
    fell through to an unbound local and crashed with a NameError).
    """
    ref_inds_i = None
    for ind in ref_inds:
        if ind[0] == chrName:
            ref_inds_i = ind
            break
    if ref_inds_i is None:
        raise ValueError('sequence "%s" not found in reference index' % chrName)
    refFile = open(refPath, 'r')
    try:
        refFile.seek(ref_inds_i[1])
        myDat = ''.join(refFile.read(ref_inds_i[2] - ref_inds_i[1]).split('\n'))
    finally:
        # previously the handle was never closed (leaked on every call)
        refFile.close()
    return myDat
| 42 | |
# cluster a sorted list
def clusterList(l, delta):
    """Partition the sorted list l into runs where consecutive elements are
    at most delta apart; returns the runs as a list of lists."""
    clusters = [[l[0]]]
    for value in l[1:]:
        # extend the current run if the gap is small enough, else start a new one
        if value - clusters[-1][-1] <= delta:
            clusters[-1].append(value)
        else:
            clusters.append([value])
    return clusters
| 57 | |
def list_2_countDict(l):
    """Tally the occurrences of each element of l into a plain dict
    mapping element -> count."""
    counts = {}
    for item in l:
        counts[item] = counts.get(item, 0) + 1
    return counts
| 65 | |
def getBedTracks(fn):
    """Read a tab-separated BED file and return a dict mapping chromosome
    name -> flat interleaved [start0, end0, start1, end1, ...] list.

    The flat per-chromosome layout is what isInBed() binary-searches over.
    Raises ValueError (from int()) on malformed coordinate columns.
    """
    trackDict = {}
    # 'with' guarantees the handle is closed even if a malformed line raises
    # (previously an exception here leaked the open file)
    with open(fn, 'r') as f:
        for line in f:
            splt = line.strip().split('\t')
            if splt[0] not in trackDict:
                trackDict[splt[0]] = []
            trackDict[splt[0]].extend([int(splt[1]), int(splt[2])])
    return trackDict
| 76 | |
def getTrackLen(trackDict):
    """Return the total number of reference bases covered by a bed track dict.

    trackDict maps chromosome -> flat [start, end, start, end, ...] list (as
    produced by getBedTracks); each interval is treated as inclusive of both
    endpoints, hence the +1 per pair.
    """
    totSum = 0
    for positions in trackDict.values():
        # walk the flat list one interval (two entries) at a time;
        # range() instead of Py2-only xrange() keeps this helper portable
        for i in range(0, len(positions), 2):
            totSum += positions[i+1] - positions[i] + 1
    return totSum
| 83 | |
def isInBed(track, ind):
    """Return True if position ind falls inside (or on a boundary of) one of
    the intervals in the flat [start, end, start, end, ...] list 'track'."""
    insertion = bisect.bisect(track, ind)
    # an odd insertion point means ind landed strictly between a start and its end
    if insertion % 2 == 1:
        return True
    # otherwise ind may still coincide exactly with an interval boundary
    return insertion < len(track) and track[insertion - 1] == ind
| 92 | |
| 93 ## return the mean distance to the median of a cluster | |
| 94 #def mean_dist_from_median(c): | |
| 95 # centroid = np.median([n for n in c]) | |
| 96 # dists = [] | |
| 97 # for n in c: | |
| 98 # dists.append(abs(n-centroid)) | |
| 99 # return np.mean(dists) | |
| 100 # | |
| 101 ## get median value from counting dictionary | |
| 102 #def quick_median(countDict): | |
| 103 # midPoint = sum(countDict.values())/2 | |
| 104 # mySum = 0 | |
| 105 # myInd = 0 | |
| 106 # sk = sorted(countDict.keys()) | |
| 107 # while mySum < midPoint: | |
| 108 # mySum += countDict[sk[myInd]] | |
| 109 # if mySum >= midPoint: | |
| 110 # break | |
| 111 # myInd += 1 | |
| 112 # return myInd | |
| 113 # | |
| 114 ## get median deviation from median of counting dictionary | |
| 115 #def median_deviation_from_median(countDict): | |
| 116 # myMedian = quick_median(countDict) | |
| 117 # deviations = {} | |
| 118 # for k in sorted(countDict.keys()): | |
| 119 # d = abs(k-myMedian) | |
| 120 # deviations[d] = countDict[k] | |
| 121 # return quick_median(deviations) | |
| 122 | |
| 123 | |
#################################################
#				PARSE INPUT OPTIONS				#
#################################################


parser = argparse.ArgumentParser(description='genMutModel.py')
parser.add_argument('-r', type=str, required=True, metavar='<str>', help="* ref.fa")
parser.add_argument('-m', type=str, required=True, metavar='<str>', help="* mutations.tsv [.vcf]")
parser.add_argument('-o', type=str, required=True, metavar='<str>', help="* output.p")
parser.add_argument('-bi', type=str, required=False, metavar='<str>', default=None, help="only_use_these_regions.bed")
parser.add_argument('-be', type=str, required=False, metavar='<str>', default=None, help="exclude_these_regions.bed")
parser.add_argument('--save-trinuc', required=False,action='store_true', default=False, help='save trinuc counts for ref')
parser.add_argument('--no-whitelist',required=False,action='store_true', default=False, help='allow any non-standard ref')
parser.add_argument('--skip-common', required=False,action='store_true', default=False, help='do not save common snps + high mut regions')
args = parser.parse_args()
# unpack the command-line options into the module-level names main() reads
(REF, TSV, OUT_PICKLE, SAVE_TRINUC, NO_WHITELIST, SKIP_COMMON) = (args.r, args.m, args.o, args.save_trinuc, args.no_whitelist, args.skip_common)

# MYBED is None (no region filtering), or a tuple (trackDict, flag) where
# flag True  = only keep variants INSIDE the bed regions (-bi)
# flag False = only keep variants OUTSIDE the bed regions (-be)
# note: -bi takes precedence if both are given
MYBED = None
if args.bi != None:
    print 'only considering variants in specified bed regions...'
    MYBED = (getBedTracks(args.bi),True)
elif args.be != None:
    print 'only considering variants outside of specified bed regions...'
    MYBED = (getBedTracks(args.be),False)

# the input variant format is inferred purely from the file extension
if TSV[-4:] == '.vcf':
    IS_VCF = True
elif TSV[-4:] == '.tsv':
    IS_VCF = False
else:
    print '\nError: Unknown format for mutation input.\n'
    exit(1)
| 156 | |
| 157 | |
#####################################
#				main()				#
#####################################


def main():
    """Tabulate an empirical mutation model from an input variant file
    (.vcf or .tsv) against a reference fasta, and pickle the resulting
    statistics to OUT_PICKLE.

    Reads the module-level options parsed above (REF, TSV, OUT_PICKLE,
    SAVE_TRINUC, NO_WHITELIST, SKIP_COMMON, MYBED, IS_VCF).
    """

    ref_inds = indexRef(REF)
    refList = [n[0] for n in ref_inds]

    # how many times do we observe each trinucleotide in the reference (and input bed region, if present)?
    TRINUC_REF_COUNT = {}
    TRINUC_BED_COUNT = {}
    printBedWarning = True
    # [(trinuc_a, trinuc_b)] = # of times we observed a mutation from trinuc_a into trinuc_b
    TRINUC_TRANSITION_COUNT = {}
    # total count of SNPs
    SNP_COUNT = 0
    # overall SNP transition probabilities
    SNP_TRANSITION_COUNT = {}
    # total count of indels, indexed by length
    INDEL_COUNT = {}
    # tabulate how much non-N reference sequence we've eaten through
    TOTAL_REFLEN = 0
    # detect variants that occur in a significant percentage of the input samples (pos,ref,alt,pop_fraction)
    COMMON_VARIANTS = []
    # tabulate how many unique donors we've encountered (this is useful for identifying common variants)
    TOTAL_DONORS = {}
    # identify regions that have significantly higher local mutation rates than the average
    HIGH_MUT_REGIONS = []

    # load and process variants in each reference sequence individually, for memory reasons...
    for refName in refList:

        if (refName not in REF_WHITELIST) and (not NO_WHITELIST):
            print refName,'is not in our whitelist, skipping...'
            continue

        print 'reading reference "'+refName+'"...'
        refSequence = getChrFromFasta(REF,ref_inds,refName).upper()
        TOTAL_REFLEN += len(refSequence) - refSequence.count('N')

        # list to be used for counting variants that occur multiple times in file (i.e. in multiple samples)
        VDAT_COMMON = []


        """ ##########################################################################
        ###                    COUNT TRINUCLEOTIDES IN REF                         ###
        ########################################################################## """


        if MYBED != None:
            if printBedWarning:
                print "since you're using a bed input, we have to count trinucs in bed region even if you specified a trinuc count file for the reference..."
                printBedWarning = False
            # reconcile '1' vs 'chr1' naming between the reference and the bed file
            if refName in MYBED[0]:
                refKey = refName
            elif ('chr' in refName) and (refName not in MYBED[0]) and (refName[3:] in MYBED[0]):
                refKey = refName[3:]
            elif ('chr' not in refName) and (refName not in MYBED[0]) and ('chr'+refName in MYBED[0]):
                refKey = 'chr'+refName
            # NOTE(review): if none of the three cases above match, refKey keeps its
            # value from a previous loop iteration (or is unbound on the first one)
            # -- confirm bed chromosome naming always matches the reference
            if refKey in MYBED[0]:
                # count trinucs only within the bed intervals on this contig
                subRegions = [(MYBED[0][refKey][n],MYBED[0][refKey][n+1]) for n in xrange(0,len(MYBED[0][refKey]),2)]
                for sr in subRegions:
                    for i in xrange(sr[0],sr[1]+1-2):
                        trinuc = refSequence[i:i+3]
                        if not trinuc in VALID_TRINUC:
                            continue	# skip if trinuc contains invalid characters, or not in specified bed region
                        if trinuc not in TRINUC_BED_COUNT:
                            TRINUC_BED_COUNT[trinuc] = 0
                        TRINUC_BED_COUNT[trinuc] += 1

        if not os.path.isfile(REF+'.trinucCounts'):
            print 'counting trinucleotides in reference...'
            for i in xrange(len(refSequence)-2):
                if i%1000000 == 0 and i > 0:
                    # progress indicator for long contigs
                    print i,'/',len(refSequence)
                    #break
                trinuc = refSequence[i:i+3]
                if not trinuc in VALID_TRINUC:
                    continue	# skip if trinuc contains invalid characters
                if trinuc not in TRINUC_REF_COUNT:
                    TRINUC_REF_COUNT[trinuc] = 0
                TRINUC_REF_COUNT[trinuc] += 1
        else:
            print 'skipping trinuc counts (for whole reference) because we found a file...'


        """ ##########################################################################
        ###                        READ INPUT VARIANTS                             ###
        ########################################################################## """


        print 'reading input variants...'
        f = open(TSV,'r')
        isFirst = True
        for line in f:

            if IS_VCF and line[0] == '#':
                continue
            if isFirst:
                if IS_VCF:
                    # hard-code index values based on expected columns in vcf
                    (c1,c2,c3,m1,m2,m3) = (0,1,1,3,3,4)
                else:
                    # determine columns of fields we're interested in
                    splt = line.strip().split('\t')
                    (c1,c2,c3) = (splt.index('chromosome'),splt.index('chromosome_start'),splt.index('chromosome_end'))
                    (m1,m2,m3) = (splt.index('reference_genome_allele'),splt.index('mutated_from_allele'),splt.index('mutated_to_allele'))
                    (d_id) = (splt.index('icgc_donor_id'))
                isFirst = False
                continue

            splt = line.strip().split('\t')
            # we have -1 because tsv/vcf coords are 1-based, and our reference string index is 0-based
            [chrName,chrStart,chrEnd] = [splt[c1],int(splt[c2])-1,int(splt[c3])-1]
            [allele_ref,allele_normal,allele_tumor] = [splt[m1].upper(),splt[m2].upper(),splt[m3].upper()]
            if IS_VCF:
                if len(allele_ref) != len(allele_tumor):
                    # indels in tsv don't include the preserved first nucleotide, so lets trim the vcf alleles
                    [allele_ref,allele_normal,allele_tumor] = [allele_ref[1:],allele_normal[1:],allele_tumor[1:]]
                # represent an empty (fully-trimmed) allele as '-' to match the tsv convention
                if not allele_ref: allele_ref = '-'
                if not allele_normal: allele_normal = '-'
                if not allele_tumor: allele_tumor = '-'
                # if alternate alleles are present, lets just ignore this variant. I may come back and improve this later
                if ',' in allele_tumor:
                    continue
                vcf_info = ';'+splt[7]+';'
            else:
                [donor_id] = [splt[d_id]]

            # if we encounter a multi-np (i.e. 3 nucl --> 3 different nucl), let's skip it for now...
            if ('-' not in allele_normal and '-' not in allele_tumor) and (len(allele_normal) > 1 or len(allele_tumor) > 1):
                print 'skipping a complex variant...'
                continue

            # to deal with '1' vs 'chr1' references, manually change names. this is hacky and bad.
            if 'chr' not in chrName:
                chrName = 'chr'+chrName
            if 'chr' not in refName:
                refName = 'chr'+refName
            # skip irrelevant variants
            if chrName != refName:
                continue

            # if variant is outside the regions we're interested in (if specified), skip it...
            if MYBED != None:
                refKey = refName
                if not refKey in MYBED[0] and refKey[3:] in MYBED[0]:	# account for 1 vs chr1, again...
                    refKey = refKey[3:]
                if refKey not in MYBED[0]:
                    inBed = False
                else:
                    inBed = isInBed(MYBED[0][refKey],chrStart)
                # MYBED[1] is True for include-regions (-bi), False for exclude-regions (-be)
                if inBed != MYBED[1]:
                    continue

            # we want only snps
            # so, no '-' characters allowed, and chrStart must be same as chrEnd
            if '-' not in allele_normal and '-' not in allele_tumor and chrStart == chrEnd:
                trinuc_ref = refSequence[chrStart-1:chrStart+2]
                if not trinuc_ref in VALID_TRINUC:
                    continue	# skip ref trinuc with invalid characters
                # only consider positions where ref allele in tsv matches the nucleotide in our reference
                if allele_ref == trinuc_ref[1]:
                    trinuc_normal = refSequence[chrStart-1] + allele_normal + refSequence[chrStart+1]
                    trinuc_tumor = refSequence[chrStart-1] + allele_tumor + refSequence[chrStart+1]
                    if not trinuc_normal in VALID_TRINUC or not trinuc_tumor in VALID_TRINUC:
                        continue	# skip if mutation contains invalid char
                    key = (trinuc_normal,trinuc_tumor)
                    if key not in TRINUC_TRANSITION_COUNT:
                        TRINUC_TRANSITION_COUNT[key] = 0
                    TRINUC_TRANSITION_COUNT[key] += 1
                    SNP_COUNT += 1
                    key2 = (allele_normal,allele_tumor)
                    if key2 not in SNP_TRANSITION_COUNT:
                        SNP_TRANSITION_COUNT[key2] = 0
                    SNP_TRANSITION_COUNT[key2] += 1

                    if IS_VCF:
                        # population frequency comes from the CAF= info tag if present (dbSNP style)
                        myPopFreq = VCF_DEFAULT_POP_FREQ
                        if ';CAF=' in vcf_info:
                            cafStr = re.findall(r";CAF=.*?(?=;)",vcf_info)[0]
                            if ',' in cafStr:
                                myPopFreq = float(cafStr[5:].split(',')[1])
                        VDAT_COMMON.append((chrStart,allele_ref,allele_normal,allele_tumor,myPopFreq))
                    else:
                        VDAT_COMMON.append((chrStart,allele_ref,allele_normal,allele_tumor))
                        TOTAL_DONORS[donor_id] = True
                else:
                    print '\nError: ref allele in variant call does not match reference.\n'
                    exit(1)

            # now let's look for indels...
            if '-' in allele_normal: len_normal = 0
            else: len_normal = len(allele_normal)
            if '-' in allele_tumor: len_tumor = 0
            else: len_tumor = len(allele_tumor)
            if len_normal != len_tumor:
                # positive length = insertion, negative = deletion
                indel_len = len_tumor - len_normal
                if indel_len not in INDEL_COUNT:
                    INDEL_COUNT[indel_len] = 0
                INDEL_COUNT[indel_len] += 1

                if IS_VCF:
                    myPopFreq = VCF_DEFAULT_POP_FREQ
                    if ';CAF=' in vcf_info:
                        cafStr = re.findall(r";CAF=.*?(?=;)",vcf_info)[0]
                        if ',' in cafStr:
                            myPopFreq = float(cafStr[5:].split(',')[1])
                    VDAT_COMMON.append((chrStart,allele_ref,allele_normal,allele_tumor,myPopFreq))
                else:
                    VDAT_COMMON.append((chrStart,allele_ref,allele_normal,allele_tumor))
                    TOTAL_DONORS[donor_id] = True
        f.close()

        # if we didn't find anything, skip ahead along to the next reference sequence
        if not len(VDAT_COMMON):
            print 'Found no variants for this reference, moving along...'
            continue

        #
        # identify common mutations
        #
        percentile_var = 95
        if IS_VCF:
            # common = population frequency at or above the 95th percentile
            minVal = np.percentile([n[4] for n in VDAT_COMMON],percentile_var)
            for k in sorted(VDAT_COMMON):
                if k[4] >= minVal:
                    COMMON_VARIANTS.append((refName,k[0],k[1],k[3],k[4]))
            VDAT_COMMON = {(n[0],n[1],n[2],n[3]):n[4] for n in VDAT_COMMON}
        else:
            # common = observed in a 95th-percentile number of distinct donors
            N_DONORS = len(TOTAL_DONORS)
            VDAT_COMMON = list_2_countDict(VDAT_COMMON)
            minVal = int(np.percentile(VDAT_COMMON.values(),percentile_var))
            for k in sorted(VDAT_COMMON.keys()):
                if VDAT_COMMON[k] >= minVal:
                    COMMON_VARIANTS.append((refName,k[0],k[1],k[3],VDAT_COMMON[k]/float(N_DONORS)))

        #
        # identify areas that have contained significantly higher random mutation rates
        #
        dist_thresh = 2000		# max gap between variants clustered together
        percentile_clust = 97	# variant-density percentile defining "high"
        qptn = 1000				# quantize region edges to multiples of this
        # identify regions with disproportionately more variants in them
        VARIANT_POS = sorted([n[0] for n in VDAT_COMMON.keys()])
        clustered_pos = clusterList(VARIANT_POS,dist_thresh)
        byLen = [(len(clustered_pos[i]),min(clustered_pos[i]),max(clustered_pos[i]),i) for i in xrange(len(clustered_pos))]
        #byLen = sorted(byLen,reverse=True)
        #minLen = int(np.percentile([n[0] for n in byLen],percentile_clust))
        #byLen = [n for n in byLen if n[0] >= minLen]
        candidate_regions = []
        for n in byLen:
            # pad each cluster by dist_thresh and snap its edges to qptn boundaries
            bi = int((n[1]-dist_thresh)/float(qptn))*qptn
            bf = int((n[2]+dist_thresh)/float(qptn))*qptn
            # store (variants per base, clamped start, clamped end)
            candidate_regions.append((n[0]/float(bf-bi),max([0,bi]),min([len(refSequence),bf])))
        minVal = np.percentile([n[0] for n in candidate_regions],percentile_clust)
        for n in candidate_regions:
            if n[0] >= minVal:
                HIGH_MUT_REGIONS.append((refName,n[1],n[2],n[0]))
        # collapse overlapping regions
        for i in xrange(len(HIGH_MUT_REGIONS)-1,0,-1):
            if HIGH_MUT_REGIONS[i-1][2] >= HIGH_MUT_REGIONS[i][1] and HIGH_MUT_REGIONS[i-1][0] == HIGH_MUT_REGIONS[i][0]:
                avgMutRate = 0.5*HIGH_MUT_REGIONS[i-1][3]+0.5*HIGH_MUT_REGIONS[i][3]	# not accurate, but I'm lazy
                HIGH_MUT_REGIONS[i-1] = (HIGH_MUT_REGIONS[i-1][0], HIGH_MUT_REGIONS[i-1][1], HIGH_MUT_REGIONS[i][2], avgMutRate)
                del HIGH_MUT_REGIONS[i]

    #
    # if we didn't count ref trinucs because we found file, read in ref counts from file now
    #
    if os.path.isfile(REF+'.trinucCounts'):
        print 'reading pre-computed trinuc counts...'
        f = open(REF+'.trinucCounts','r')
        for line in f:
            splt = line.strip().split('\t')
            TRINUC_REF_COUNT[splt[0]] = int(splt[1])
        f.close()
    # otherwise, save trinuc counts to file, if desired
    elif SAVE_TRINUC:
        if MYBED != None:
            print 'unable to save trinuc counts to file because using input bed region...'
        else:
            print 'saving trinuc counts to file...'
            f = open(REF+'.trinucCounts','w')
            for trinuc in sorted(TRINUC_REF_COUNT.keys()):
                f.write(trinuc+'\t'+str(TRINUC_REF_COUNT[trinuc])+'\n')
            f.close()

    #
    # if using an input bed region, make necessary adjustments to trinuc ref counts based on the bed region trinuc counts
    #
    if MYBED != None:
        if MYBED[1] == True:	# we are restricting our attention to bed regions, so ONLY use bed region trinuc counts
            TRINUC_REF_COUNT = TRINUC_BED_COUNT
        else:	# we are only looking outside bed regions, so subtract bed region trinucs from entire reference trinucs
            for k in TRINUC_REF_COUNT.keys():
                if k in TRINUC_BED_COUNT:
                    TRINUC_REF_COUNT[k] -= TRINUC_BED_COUNT[k]

    # if for some reason we didn't find any valid input variants, exit gracefully...
    totalVar = SNP_COUNT + sum(INDEL_COUNT.values())
    if totalVar == 0:
        print '\nError: No valid variants were found, model could not be created. (Are you using the correct reference?)\n'
        exit(1)


    """ ##########################################################################
    ###                       COMPUTE PROBABILITIES                            ###
    ########################################################################## """


    #for k in sorted(TRINUC_REF_COUNT.keys()):
    #	print k, TRINUC_REF_COUNT[k]
    #
    #for k in sorted(TRINUC_TRANSITION_COUNT.keys()):
    #	print k, TRINUC_TRANSITION_COUNT[k]

    # frequency that each trinuc mutated into anything else
    TRINUC_MUT_PROB = {}
    # frequency that a trinuc mutates into another trinuc, given that it mutated
    TRINUC_TRANS_PROBS = {}
    # frequency of snp transitions, given a snp occurs.
    SNP_TRANS_FREQ = {}

    for trinuc in sorted(TRINUC_REF_COUNT.keys()):
        # total number of observed mutations out of this trinuc context
        myCount = 0
        for k in sorted(TRINUC_TRANSITION_COUNT.keys()):
            if k[0] == trinuc:
                myCount += TRINUC_TRANSITION_COUNT[k]
        TRINUC_MUT_PROB[trinuc] = myCount / float(TRINUC_REF_COUNT[trinuc])
        for k in sorted(TRINUC_TRANSITION_COUNT.keys()):
            if k[0] == trinuc:
                TRINUC_TRANS_PROBS[k] = TRINUC_TRANSITION_COUNT[k] / float(myCount)

    for n1 in VALID_NUCL:
        rollingTot = sum([SNP_TRANSITION_COUNT[(n1,n2)] for n2 in VALID_NUCL if (n1,n2) in SNP_TRANSITION_COUNT])
        for n2 in VALID_NUCL:
            key2 = (n1,n2)
            if key2 in SNP_TRANSITION_COUNT:
                SNP_TRANS_FREQ[key2] = SNP_TRANSITION_COUNT[key2] / float(rollingTot)

    # compute average snp and indel frequencies
    SNP_FREQ = SNP_COUNT/float(totalVar)
    AVG_INDEL_FREQ = 1.-SNP_FREQ
    INDEL_FREQ = {k:(INDEL_COUNT[k]/float(totalVar))/AVG_INDEL_FREQ for k in INDEL_COUNT.keys()}
    # average mutation rate = variants / applicable reference length
    if MYBED != None:
        if MYBED[1] == True:
            AVG_MUT_RATE = totalVar/float(getTrackLen(MYBED[0]))
        else:
            AVG_MUT_RATE = totalVar/float(TOTAL_REFLEN - getTrackLen(MYBED[0]))
    else:
        AVG_MUT_RATE = totalVar/float(TOTAL_REFLEN)

    #
    # if values weren't found in data, appropriately append null entries
    #
    printTrinucWarning = False
    for trinuc in VALID_TRINUC:
        trinuc_mut = [trinuc[0]+n+trinuc[2] for n in VALID_NUCL if n != trinuc[1]]
        if trinuc not in TRINUC_MUT_PROB:
            TRINUC_MUT_PROB[trinuc] = 0.
            printTrinucWarning = True
        for trinuc2 in trinuc_mut:
            if (trinuc,trinuc2) not in TRINUC_TRANS_PROBS:
                TRINUC_TRANS_PROBS[(trinuc,trinuc2)] = 0.
                printTrinucWarning = True
    if printTrinucWarning:
        print 'Warning: Some trinucleotides transitions were not encountered in the input dataset, probabilities of 0.0 have been assigned to these events.'

    #
    # print some stuff
    #
    for k in sorted(TRINUC_MUT_PROB.keys()):
        print 'p('+k+' mutates) =',TRINUC_MUT_PROB[k]

    for k in sorted(TRINUC_TRANS_PROBS.keys()):
        print 'p('+k[0]+' --> '+k[1]+' | '+k[0]+' mutates) =',TRINUC_TRANS_PROBS[k]

    for k in sorted(INDEL_FREQ.keys()):
        if k > 0:
            print 'p(ins length = '+str(abs(k))+' | indel occurs) =',INDEL_FREQ[k]
        else:
            print 'p(del length = '+str(abs(k))+' | indel occurs) =',INDEL_FREQ[k]

    for k in sorted(SNP_TRANS_FREQ.keys()):
        print 'p('+k[0]+' --> '+k[1]+' | SNP occurs) =',SNP_TRANS_FREQ[k]

    #for n in COMMON_VARIANTS:
    #	print n

    #for n in HIGH_MUT_REGIONS:
    #	print n

    print 'p(snp) =',SNP_FREQ
    print 'p(indel) =',AVG_INDEL_FREQ
    print 'overall average mut rate:',AVG_MUT_RATE
    print 'total variants processed:',totalVar

    #
    # save variables to file
    #
    if SKIP_COMMON:
        OUT_DICT = {'AVG_MUT_RATE':AVG_MUT_RATE,
                    'SNP_FREQ':SNP_FREQ,
                    'SNP_TRANS_FREQ':SNP_TRANS_FREQ,
                    'INDEL_FREQ':INDEL_FREQ,
                    'TRINUC_MUT_PROB':TRINUC_MUT_PROB,
                    'TRINUC_TRANS_PROBS':TRINUC_TRANS_PROBS}
    else:
        OUT_DICT = {'AVG_MUT_RATE':AVG_MUT_RATE,
                    'SNP_FREQ':SNP_FREQ,
                    'SNP_TRANS_FREQ':SNP_TRANS_FREQ,
                    'INDEL_FREQ':INDEL_FREQ,
                    'TRINUC_MUT_PROB':TRINUC_MUT_PROB,
                    'TRINUC_TRANS_PROBS':TRINUC_TRANS_PROBS,
                    'COMMON_VARIANTS':COMMON_VARIANTS,
                    'HIGH_MUT_REGIONS':HIGH_MUT_REGIONS}
    # NOTE(review): the output file handle passed to pickle.dump is never
    # explicitly closed (relies on interpreter teardown to flush)
    pickle.dump( OUT_DICT, open( OUT_PICKLE, "wb" ) )


if __name__ == "__main__":
    main()
| 580 | |
| 581 | |
| 582 | |
| 583 |
