comparison utilities/genMutModel.py @ 0:6e75a84e9338 draft

planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
author thondeboer
date Tue, 15 May 2018 02:39:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:6e75a84e9338
1 #!/usr/bin/env python
2
3 import sys
4 import os
5 import re
6 import bisect
7 import pickle
8 import argparse
9 import numpy as np
10 #matplotlib is not used as far as i can see
11 #import matplotlib.pyplot as mpl
12
13 # absolute path to the directory above this script
14 SIM_PATH = '/'.join(os.path.realpath(__file__).split('/')[:-2])
15 sys.path.append(SIM_PATH+'/py/')
16
17 from refFunc import indexRef
18
# References we accept by default: numeric chromosomes 1-29, sex chromosomes,
# mitochondria, plus 'chr'-prefixed versions of all of the above.
# (range replaces the Python-2-only xrange; the resulting list is identical.)
REF_WHITELIST = [str(n) for n in range(1, 30)] + ['x', 'y', 'X', 'Y', 'mt', 'Mt', 'MT']
REF_WHITELIST += ['chr' + n for n in REF_WHITELIST]
VALID_NUCL = ['A', 'C', 'G', 'T']
# every 3-mer over {A,C,G,T}, in the same order as the original index-based comprehension
VALID_TRINUC = [n1 + n2 + n3 for n1 in VALID_NUCL for n2 in VALID_NUCL for n3 in VALID_NUCL]
# if parsing a dbsnp vcf, and no CAF= is found in info tag, use this as default val for population freq
VCF_DEFAULT_POP_FREQ = 0.00001
25
26
27 #########################################################
28 # VARIOUS HELPER FUNCTIONS #
29 #########################################################
30
31
32 # given a reference index, grab the sequence string of a specified reference
# given a reference index, grab the sequence string of a specified reference
def getChrFromFasta(refPath, ref_inds, chrName):
    """Return the newline-stripped sequence of chrName from the fasta at refPath.

    ref_inds is a list of (name, seq_start_offset, seq_end_offset) entries as
    produced by indexRef(); the offsets are byte positions into the fasta file.

    Fixes vs. the original: the file handle is now closed via a context manager
    (it was leaked), and an unknown chrName raises a clear ValueError instead of
    an accidental NameError on the unbound loop variable.
    """
    ref_inds_i = None
    for entry in ref_inds:
        if entry[0] == chrName:
            ref_inds_i = entry
            break
    if ref_inds_i is None:
        raise ValueError('sequence "%s" not found in reference index' % chrName)
    with open(refPath, 'r') as refFile:
        refFile.seek(ref_inds_i[1])
        # read the raw byte span for this sequence and strip the line breaks
        myDat = ''.join(refFile.read(ref_inds_i[2] - ref_inds_i[1]).split('\n'))
    return myDat
42
43 # cluster a sorted list
# cluster a sorted list
def clusterList(l, delta):
    """Group a sorted list of numbers into clusters of nearby values.

    Consecutive values whose gap is <= delta fall into the same cluster;
    a larger gap starts a new cluster. Returns a list of lists, preserving
    input order.

    Fix: an empty input now returns [] instead of raising IndexError on l[0].
    """
    if not l:
        return []
    outList = [[l[0]]]
    prevVal = l[0]
    for n in l[1:]:
        if n - prevVal <= delta:
            outList[-1].append(n)   # close enough: extend the current cluster
        else:
            outList.append([n])     # gap too large: open a new cluster
        prevVal = n
    return outList
57
def list_2_countDict(l):
    """Tally the elements of l into a {value: occurrence_count} dict."""
    cDict = {}
    for item in l:
        # default to 0 for unseen values, then bump the tally
        cDict[item] = cDict.get(item, 0) + 1
    return cDict
65
def getBedTracks(fn):
    """Parse a bed file into {chromosome: [start1, end1, start2, end2, ...]}.

    Coordinates are kept as a flat list of alternating start/end ints in file
    order (downstream code assumes sorted, non-overlapping intervals).

    Fixes vs. the original: the handle is closed via a context manager even if
    parsing raises, and blank or short lines are skipped instead of raising
    IndexError.
    """
    trackDict = {}
    with open(fn, 'r') as f:
        for line in f:
            splt = line.strip().split('\t')
            if len(splt) < 3:
                continue    # skip blank / malformed lines
            trackDict.setdefault(splt[0], [])
            trackDict[splt[0]].extend([int(splt[1]), int(splt[2])])
    return trackDict
76
def getTrackLen(trackDict):
    """Total number of bases covered by a getBedTracks()-style track dict.

    Each (start, end) pair is counted end-inclusive, hence the +1 per interval.
    (range replaces the Python-2-only xrange; iteration over values avoids the
    redundant key lookup.)
    """
    totSum = 0
    for coords in trackDict.values():
        totSum += sum(coords[i + 1] - coords[i] + 1 for i in range(0, len(coords), 2))
    return totSum
83
def isInBed(track, ind):
    """Return True when position ind lies inside the flat start/end coordinate
    list produced by getBedTracks().

    An odd insertion point means ind landed strictly between a region's start
    and end boundaries. The follow-up equality check additionally accepts a
    position that coincides with a stored boundary coordinate (except the very
    last one, which the original logic also rejected).
    """
    insertionPoint = bisect.bisect_right(track, ind)
    if insertionPoint % 2 == 1:
        return True
    # even insertion point: in bed only if ind equals the preceding boundary
    return insertionPoint < len(track) and track[insertionPoint - 1] == ind
92
93 ## return the mean distance to the median of a cluster
94 #def mean_dist_from_median(c):
95 # centroid = np.median([n for n in c])
96 # dists = []
97 # for n in c:
98 # dists.append(abs(n-centroid))
99 # return np.mean(dists)
100 #
101 ## get median value from counting dictionary
102 #def quick_median(countDict):
103 # midPoint = sum(countDict.values())/2
104 # mySum = 0
105 # myInd = 0
106 # sk = sorted(countDict.keys())
107 # while mySum < midPoint:
108 # mySum += countDict[sk[myInd]]
109 # if mySum >= midPoint:
110 # break
111 # myInd += 1
112 # return myInd
113 #
114 ## get median deviation from median of counting dictionary
115 #def median_deviation_from_median(countDict):
116 # myMedian = quick_median(countDict)
117 # deviations = {}
118 # for k in sorted(countDict.keys()):
119 # d = abs(k-myMedian)
120 # deviations[d] = countDict[k]
121 # return quick_median(deviations)
122
123
#################################################
#              PARSE INPUT OPTIONS              #
#################################################


parser = argparse.ArgumentParser(description='genMutModel.py')
parser.add_argument('-r',  type=str, required=True,  metavar='<str>', help="* ref.fa")
parser.add_argument('-m',  type=str, required=True,  metavar='<str>', help="* mutations.tsv [.vcf]")
parser.add_argument('-o',  type=str, required=True,  metavar='<str>', help="* output.p")
parser.add_argument('-bi', type=str, required=False, metavar='<str>', default=None, help="only_use_these_regions.bed")
parser.add_argument('-be', type=str, required=False, metavar='<str>', default=None, help="exclude_these_regions.bed")
parser.add_argument('--save-trinuc',  required=False, action='store_true', default=False, help='save trinuc counts for ref')
parser.add_argument('--no-whitelist', required=False, action='store_true', default=False, help='allow any non-standard ref')
parser.add_argument('--skip-common',  required=False, action='store_true', default=False, help='do not save common snps + high mut regions')
args = parser.parse_args()
(REF, TSV, OUT_PICKLE, SAVE_TRINUC, NO_WHITELIST, SKIP_COMMON) = (args.r, args.m, args.o, args.save_trinuc, args.no_whitelist, args.skip_common)

# MYBED is None (no region filtering) or a (trackDict, keepInside) tuple where
# keepInside=True keeps only variants inside the bed regions, False excludes them.
MYBED = None
if args.bi is not None:         # identity check instead of '!= None'
	print('only considering variants in specified bed regions...')
	MYBED = (getBedTracks(args.bi), True)
elif args.be is not None:
	print('only considering variants outside of specified bed regions...')
	MYBED = (getBedTracks(args.be), False)

# infer the input-variant format from the file extension
if TSV.endswith('.vcf'):
	IS_VCF = True
elif TSV.endswith('.tsv'):
	IS_VCF = False
else:
	print('\nError: Unknown format for mutation input.\n')
	sys.exit(1)     # sys.exit instead of the interactive-only exit()
156
157
158 #####################################
159 # main() #
160 #####################################
161
162
def main():
    """Build an empirical mutation model from a reference fasta and a variant file.

    Uses the module-level configuration (REF, TSV, OUT_PICKLE, MYBED, IS_VCF,
    flags): tallies trinucleotide-context mutation counts, overall SNP
    transition counts and indel-length counts per whitelisted chromosome,
    collects common variants and high-mutation regions, converts the counts to
    probabilities, and pickles the resulting model dict to OUT_PICKLE.
    """

    ref_inds = indexRef(REF)
    refList = [n[0] for n in ref_inds]

    # how many times do we observe each trinucleotide in the reference (and input bed region, if present)?
    TRINUC_REF_COUNT = {}
    TRINUC_BED_COUNT = {}
    printBedWarning = True
    # [(trinuc_a, trinuc_b)] = # of times we observed a mutation from trinuc_a into trinuc_b
    TRINUC_TRANSITION_COUNT = {}
    # total count of SNPs
    SNP_COUNT = 0
    # overall SNP transition probabilities
    SNP_TRANSITION_COUNT = {}
    # total count of indels, indexed by length
    INDEL_COUNT = {}
    # tabulate how much non-N reference sequence we've eaten through
    TOTAL_REFLEN = 0
    # detect variants that occur in a significant percentage of the input samples (pos,ref,alt,pop_fraction)
    COMMON_VARIANTS = []
    # tabulate how many unique donors we've encountered (this is useful for identifying common variants)
    TOTAL_DONORS = {}
    # identify regions that have significantly higher local mutation rates than the average
    HIGH_MUT_REGIONS = []

    # load and process variants in each reference sequence individually, for memory reasons...
    for refName in refList:

        if (refName not in REF_WHITELIST) and (not NO_WHITELIST):
            print refName,'is not in our whitelist, skipping...'
            continue

        print 'reading reference "'+refName+'"...'
        refSequence = getChrFromFasta(REF,ref_inds,refName).upper()
        # N bases are excluded from the usable reference length
        TOTAL_REFLEN += len(refSequence) - refSequence.count('N')

        # list to be used for counting variants that occur multiple times in file (i.e. in multiple samples)
        VDAT_COMMON = []


        """ ##########################################################################
        ###                  COUNT TRINUCLEOTIDES IN REF                          ###
        ########################################################################## """


        if MYBED != None:
            if printBedWarning:
                print "since you're using a bed input, we have to count trinucs in bed region even if you specified a trinuc count file for the reference..."
                printBedWarning = False
            # reconcile '1' vs 'chr1' naming between the reference and the bed file
            if refName in MYBED[0]:
                refKey = refName
            elif ('chr' in refName) and (refName not in MYBED[0]) and (refName[3:] in MYBED[0]):
                refKey = refName[3:]
            elif ('chr' not in refName) and (refName not in MYBED[0]) and ('chr'+refName in MYBED[0]):
                refKey = 'chr'+refName
            # NOTE(review): if none of the branches above matched, refKey may be
            # unbound (first iteration) or stale from a previous chromosome -- confirm.
            if refKey in MYBED[0]:
                # pair up the flat [start,end,start,end,...] coordinate list
                subRegions = [(MYBED[0][refKey][n],MYBED[0][refKey][n+1]) for n in xrange(0,len(MYBED[0][refKey]),2)]
                for sr in subRegions:
                    # +1-2: end-inclusive region, minus room for the 3-mer window
                    for i in xrange(sr[0],sr[1]+1-2):
                        trinuc = refSequence[i:i+3]
                        if not trinuc in VALID_TRINUC:
                            continue # skip if trinuc contains invalid characters, or not in specified bed region
                        if trinuc not in TRINUC_BED_COUNT:
                            TRINUC_BED_COUNT[trinuc] = 0
                        TRINUC_BED_COUNT[trinuc] += 1

        # count trinucs across the whole reference, unless a cached count file exists
        if not os.path.isfile(REF+'.trinucCounts'):
            print 'counting trinucleotides in reference...'
            for i in xrange(len(refSequence)-2):
                if i%1000000 == 0 and i > 0:
                    # progress indicator for long chromosomes
                    print i,'/',len(refSequence)
                    #break
                trinuc = refSequence[i:i+3]
                if not trinuc in VALID_TRINUC:
                    continue # skip if trinuc contains invalid characters
                if trinuc not in TRINUC_REF_COUNT:
                    TRINUC_REF_COUNT[trinuc] = 0
                TRINUC_REF_COUNT[trinuc] += 1
        else:
            print 'skipping trinuc counts (for whole reference) because we found a file...'


        """ ##########################################################################
        ###                       READ INPUT VARIANTS                              ###
        ########################################################################## """


        print 'reading input variants...'
        f = open(TSV,'r')
        isFirst = True
        for line in f:

            if IS_VCF and line[0] == '#':
                continue
            if isFirst:
                if IS_VCF:
                    # hard-code index values based on expected columns in vcf
                    (c1,c2,c3,m1,m2,m3) = (0,1,1,3,3,4)
                else:
                    # determine columns of fields we're interested in
                    splt = line.strip().split('\t')
                    (c1,c2,c3) = (splt.index('chromosome'),splt.index('chromosome_start'),splt.index('chromosome_end'))
                    (m1,m2,m3) = (splt.index('reference_genome_allele'),splt.index('mutated_from_allele'),splt.index('mutated_to_allele'))
                    (d_id) = (splt.index('icgc_donor_id'))
                isFirst = False
                # NOTE(review): for VCF input this 'continue' discards the first
                # data record (there is no header row to consume) -- confirm intent.
                continue

            splt = line.strip().split('\t')
            # we have -1 because tsv/vcf coords are 1-based, and our reference string index is 0-based
            [chrName,chrStart,chrEnd] = [splt[c1],int(splt[c2])-1,int(splt[c3])-1]
            [allele_ref,allele_normal,allele_tumor] = [splt[m1].upper(),splt[m2].upper(),splt[m3].upper()]
            if IS_VCF:
                if len(allele_ref) != len(allele_tumor):
                    # indels in tsv don't include the preserved first nucleotide, so lets trim the vcf alleles
                    [allele_ref,allele_normal,allele_tumor] = [allele_ref[1:],allele_normal[1:],allele_tumor[1:]]
                # an allele emptied by the trim above becomes the tsv-style '-' marker
                if not allele_ref: allele_ref = '-'
                if not allele_normal: allele_normal = '-'
                if not allele_tumor: allele_tumor = '-'
                # if alternate alleles are present, lets just ignore this variant. I may come back and improve this later
                if ',' in allele_tumor:
                    continue
                # pad with ';' so the ';CAF=' lookup below works at either end of INFO
                vcf_info = ';'+splt[7]+';'
            else:
                [donor_id] = [splt[d_id]]

            # if we encounter a multi-np (i.e. 3 nucl --> 3 different nucl), let's skip it for now...
            if ('-' not in allele_normal and '-' not in allele_tumor) and (len(allele_normal) > 1 or len(allele_tumor) > 1):
                print 'skipping a complex variant...'
                continue

            # to deal with '1' vs 'chr1' references, manually change names. this is hacky and bad.
            if 'chr' not in chrName:
                chrName = 'chr'+chrName
            if 'chr' not in refName:
                refName = 'chr'+refName
            # skip irrelevant variants
            if chrName != refName:
                continue

            # if variant is outside the regions we're interested in (if specified), skip it...
            if MYBED != None:
                refKey = refName
                if not refKey in MYBED[0] and refKey[3:] in MYBED[0]: # account for 1 vs chr1, again...
                    refKey = refKey[3:]
                if refKey not in MYBED[0]:
                    inBed = False
                else:
                    inBed = isInBed(MYBED[0][refKey],chrStart)
                # MYBED[1] is True for include-mode, False for exclude-mode
                if inBed != MYBED[1]:
                    continue

            # we want only snps
            # so, no '-' characters allowed, and chrStart must be same as chrEnd
            if '-' not in allele_normal and '-' not in allele_tumor and chrStart == chrEnd:
                # trinucleotide context centered on the variant position
                trinuc_ref = refSequence[chrStart-1:chrStart+2]
                if not trinuc_ref in VALID_TRINUC:
                    continue # skip ref trinuc with invalid characters
                # only consider positions where ref allele in tsv matches the nucleotide in our reference
                if allele_ref == trinuc_ref[1]:
                    trinuc_normal = refSequence[chrStart-1] + allele_normal + refSequence[chrStart+1]
                    trinuc_tumor = refSequence[chrStart-1] + allele_tumor + refSequence[chrStart+1]
                    if not trinuc_normal in VALID_TRINUC or not trinuc_tumor in VALID_TRINUC:
                        continue # skip if mutation contains invalid char
                    key = (trinuc_normal,trinuc_tumor)
                    if key not in TRINUC_TRANSITION_COUNT:
                        TRINUC_TRANSITION_COUNT[key] = 0
                    TRINUC_TRANSITION_COUNT[key] += 1
                    SNP_COUNT += 1
                    key2 = (allele_normal,allele_tumor)
                    if key2 not in SNP_TRANSITION_COUNT:
                        SNP_TRANSITION_COUNT[key2] = 0
                    SNP_TRANSITION_COUNT[key2] += 1

                    if IS_VCF:
                        # population frequency: second value of CAF= if present, else default
                        myPopFreq = VCF_DEFAULT_POP_FREQ
                        if ';CAF=' in vcf_info:
                            cafStr = re.findall(r";CAF=.*?(?=;)",vcf_info)[0]
                            if ',' in cafStr:
                                myPopFreq = float(cafStr[5:].split(',')[1])
                        VDAT_COMMON.append((chrStart,allele_ref,allele_normal,allele_tumor,myPopFreq))
                    else:
                        VDAT_COMMON.append((chrStart,allele_ref,allele_normal,allele_tumor))
                        TOTAL_DONORS[donor_id] = True
                else:
                    print '\nError: ref allele in variant call does not match reference.\n'
                    exit(1)

            # now let's look for indels...
            if '-' in allele_normal: len_normal = 0
            else: len_normal = len(allele_normal)
            if '-' in allele_tumor: len_tumor = 0
            else: len_tumor = len(allele_tumor)
            if len_normal != len_tumor:
                # positive length = insertion, negative = deletion
                indel_len = len_tumor - len_normal
                if indel_len not in INDEL_COUNT:
                    INDEL_COUNT[indel_len] = 0
                INDEL_COUNT[indel_len] += 1

                if IS_VCF:
                    myPopFreq = VCF_DEFAULT_POP_FREQ
                    if ';CAF=' in vcf_info:
                        cafStr = re.findall(r";CAF=.*?(?=;)",vcf_info)[0]
                        if ',' in cafStr:
                            myPopFreq = float(cafStr[5:].split(',')[1])
                    VDAT_COMMON.append((chrStart,allele_ref,allele_normal,allele_tumor,myPopFreq))
                else:
                    VDAT_COMMON.append((chrStart,allele_ref,allele_normal,allele_tumor))
                    TOTAL_DONORS[donor_id] = True
        f.close()

        # if we didn't find anything, skip ahead along to the next reference sequence
        if not len(VDAT_COMMON):
            print 'Found no variants for this reference, moving along...'
            continue

        #
        # identify common mutations
        #
        percentile_var = 95
        if IS_VCF:
            # VCF path: "common" means population frequency in the top 5%
            minVal = np.percentile([n[4] for n in VDAT_COMMON],percentile_var)
            for k in sorted(VDAT_COMMON):
                if k[4] >= minVal:
                    COMMON_VARIANTS.append((refName,k[0],k[1],k[3],k[4]))
            VDAT_COMMON = {(n[0],n[1],n[2],n[3]):n[4] for n in VDAT_COMMON}
        else:
            # TSV path: "common" means observed in a top-5% number of donors
            N_DONORS = len(TOTAL_DONORS)
            VDAT_COMMON = list_2_countDict(VDAT_COMMON)
            minVal = int(np.percentile(VDAT_COMMON.values(),percentile_var))
            for k in sorted(VDAT_COMMON.keys()):
                if VDAT_COMMON[k] >= minVal:
                    COMMON_VARIANTS.append((refName,k[0],k[1],k[3],VDAT_COMMON[k]/float(N_DONORS)))

        #
        # identify areas that have contained significantly higher random mutation rates
        #
        dist_thresh = 2000
        percentile_clust = 97
        qptn = 1000
        # identify regions with disproportionately more variants in them
        VARIANT_POS = sorted([n[0] for n in VDAT_COMMON.keys()])
        clustered_pos = clusterList(VARIANT_POS,dist_thresh)
        byLen = [(len(clustered_pos[i]),min(clustered_pos[i]),max(clustered_pos[i]),i) for i in xrange(len(clustered_pos))]
        #byLen = sorted(byLen,reverse=True)
        #minLen = int(np.percentile([n[0] for n in byLen],percentile_clust))
        #byLen = [n for n in byLen if n[0] >= minLen]
        candidate_regions = []
        for n in byLen:
            # expand each cluster by dist_thresh and quantize bounds to qptn
            bi = int((n[1]-dist_thresh)/float(qptn))*qptn
            bf = int((n[2]+dist_thresh)/float(qptn))*qptn
            # (variants per base, clamped start, clamped end)
            candidate_regions.append((n[0]/float(bf-bi),max([0,bi]),min([len(refSequence),bf])))
        minVal = np.percentile([n[0] for n in candidate_regions],percentile_clust)
        for n in candidate_regions:
            if n[0] >= minVal:
                HIGH_MUT_REGIONS.append((refName,n[1],n[2],n[0]))
        # collapse overlapping regions
        for i in xrange(len(HIGH_MUT_REGIONS)-1,0,-1):
            if HIGH_MUT_REGIONS[i-1][2] >= HIGH_MUT_REGIONS[i][1] and HIGH_MUT_REGIONS[i-1][0] == HIGH_MUT_REGIONS[i][0]:
                avgMutRate = 0.5*HIGH_MUT_REGIONS[i-1][3]+0.5*HIGH_MUT_REGIONS[i][3] # not accurate, but I'm lazy
                HIGH_MUT_REGIONS[i-1] = (HIGH_MUT_REGIONS[i-1][0], HIGH_MUT_REGIONS[i-1][1], HIGH_MUT_REGIONS[i][2], avgMutRate)
                del HIGH_MUT_REGIONS[i]

    #
    # if we didn't count ref trinucs because we found file, read in ref counts from file now
    #
    if os.path.isfile(REF+'.trinucCounts'):
        print 'reading pre-computed trinuc counts...'
        f = open(REF+'.trinucCounts','r')
        for line in f:
            splt = line.strip().split('\t')
            TRINUC_REF_COUNT[splt[0]] = int(splt[1])
        f.close()
    # otherwise, save trinuc counts to file, if desired
    elif SAVE_TRINUC:
        if MYBED != None:
            print 'unable to save trinuc counts to file because using input bed region...'
        else:
            print 'saving trinuc counts to file...'
            f = open(REF+'.trinucCounts','w')
            for trinuc in sorted(TRINUC_REF_COUNT.keys()):
                f.write(trinuc+'\t'+str(TRINUC_REF_COUNT[trinuc])+'\n')
            f.close()

    #
    # if using an input bed region, make necessary adjustments to trinuc ref counts based on the bed region trinuc counts
    #
    if MYBED != None:
        if MYBED[1] == True: # we are restricting our attention to bed regions, so ONLY use bed region trinuc counts
            TRINUC_REF_COUNT = TRINUC_BED_COUNT
        else: # we are only looking outside bed regions, so subtract bed region trinucs from entire reference trinucs
            for k in TRINUC_REF_COUNT.keys():
                if k in TRINUC_BED_COUNT:
                    TRINUC_REF_COUNT[k] -= TRINUC_BED_COUNT[k]

    # if for some reason we didn't find any valid input variants, exit gracefully...
    totalVar = SNP_COUNT + sum(INDEL_COUNT.values())
    if totalVar == 0:
        print '\nError: No valid variants were found, model could not be created. (Are you using the correct reference?)\n'
        exit(1)


    """ ##########################################################################
    ###                       COMPUTE PROBABILITIES                            ###
    ########################################################################## """


    #for k in sorted(TRINUC_REF_COUNT.keys()):
    #	print k, TRINUC_REF_COUNT[k]
    #
    #for k in sorted(TRINUC_TRANSITION_COUNT.keys()):
    #	print k, TRINUC_TRANSITION_COUNT[k]

    # frequency that each trinuc mutated into anything else
    TRINUC_MUT_PROB = {}
    # frequency that a trinuc mutates into another trinuc, given that it mutated
    TRINUC_TRANS_PROBS = {}
    # frequency of snp transitions, given a snp occurs.
    SNP_TRANS_FREQ = {}

    for trinuc in sorted(TRINUC_REF_COUNT.keys()):
        # total mutations observed whose source context is this trinuc
        myCount = 0
        for k in sorted(TRINUC_TRANSITION_COUNT.keys()):
            if k[0] == trinuc:
                myCount += TRINUC_TRANSITION_COUNT[k]
        TRINUC_MUT_PROB[trinuc] = myCount / float(TRINUC_REF_COUNT[trinuc])
        # conditional probability of each destination trinuc, given a mutation occurred
        for k in sorted(TRINUC_TRANSITION_COUNT.keys()):
            if k[0] == trinuc:
                TRINUC_TRANS_PROBS[k] = TRINUC_TRANSITION_COUNT[k] / float(myCount)

    for n1 in VALID_NUCL:
        # normalize SNP transitions out of n1 into conditional frequencies
        rollingTot = sum([SNP_TRANSITION_COUNT[(n1,n2)] for n2 in VALID_NUCL if (n1,n2) in SNP_TRANSITION_COUNT])
        for n2 in VALID_NUCL:
            key2 = (n1,n2)
            if key2 in SNP_TRANSITION_COUNT:
                SNP_TRANS_FREQ[key2] = SNP_TRANSITION_COUNT[key2] / float(rollingTot)

    # compute average snp and indel frequencies
    SNP_FREQ = SNP_COUNT/float(totalVar)
    AVG_INDEL_FREQ = 1.-SNP_FREQ
    # per-length indel frequency, conditional on an indel occurring
    INDEL_FREQ = {k:(INDEL_COUNT[k]/float(totalVar))/AVG_INDEL_FREQ for k in INDEL_COUNT.keys()}
    if MYBED != None:
        if MYBED[1] == True:
            AVG_MUT_RATE = totalVar/float(getTrackLen(MYBED[0]))
        else:
            AVG_MUT_RATE = totalVar/float(TOTAL_REFLEN - getTrackLen(MYBED[0]))
    else:
        AVG_MUT_RATE = totalVar/float(TOTAL_REFLEN)

    #
    # if values weren't found in data, appropriately append null entries
    #
    printTrinucWarning = False
    for trinuc in VALID_TRINUC:
        # the three possible mutated contexts sharing this trinuc's flanks
        trinuc_mut = [trinuc[0]+n+trinuc[2] for n in VALID_NUCL if n != trinuc[1]]
        if trinuc not in TRINUC_MUT_PROB:
            TRINUC_MUT_PROB[trinuc] = 0.
            printTrinucWarning = True
        for trinuc2 in trinuc_mut:
            if (trinuc,trinuc2) not in TRINUC_TRANS_PROBS:
                TRINUC_TRANS_PROBS[(trinuc,trinuc2)] = 0.
                printTrinucWarning = True
    if printTrinucWarning:
        print 'Warning: Some trinucleotides transitions were not encountered in the input dataset, probabilities of 0.0 have been assigned to these events.'

    #
    # print some stuff
    #
    for k in sorted(TRINUC_MUT_PROB.keys()):
        print 'p('+k+' mutates) =',TRINUC_MUT_PROB[k]

    for k in sorted(TRINUC_TRANS_PROBS.keys()):
        print 'p('+k[0]+' --> '+k[1]+' | '+k[0]+' mutates) =',TRINUC_TRANS_PROBS[k]

    for k in sorted(INDEL_FREQ.keys()):
        if k > 0:
            print 'p(ins length = '+str(abs(k))+' | indel occurs) =',INDEL_FREQ[k]
        else:
            print 'p(del length = '+str(abs(k))+' | indel occurs) =',INDEL_FREQ[k]

    for k in sorted(SNP_TRANS_FREQ.keys()):
        print 'p('+k[0]+' --> '+k[1]+' | SNP occurs) =',SNP_TRANS_FREQ[k]

    #for n in COMMON_VARIANTS:
    #	print n

    #for n in HIGH_MUT_REGIONS:
    #	print n

    print 'p(snp) =',SNP_FREQ
    print 'p(indel) =',AVG_INDEL_FREQ
    print 'overall average mut rate:',AVG_MUT_RATE
    print 'total variants processed:',totalVar

    #
    # save variables to file
    #
    if SKIP_COMMON:
        OUT_DICT = {'AVG_MUT_RATE':AVG_MUT_RATE,
                    'SNP_FREQ':SNP_FREQ,
                    'SNP_TRANS_FREQ':SNP_TRANS_FREQ,
                    'INDEL_FREQ':INDEL_FREQ,
                    'TRINUC_MUT_PROB':TRINUC_MUT_PROB,
                    'TRINUC_TRANS_PROBS':TRINUC_TRANS_PROBS}
    else:
        OUT_DICT = {'AVG_MUT_RATE':AVG_MUT_RATE,
                    'SNP_FREQ':SNP_FREQ,
                    'SNP_TRANS_FREQ':SNP_TRANS_FREQ,
                    'INDEL_FREQ':INDEL_FREQ,
                    'TRINUC_MUT_PROB':TRINUC_MUT_PROB,
                    'TRINUC_TRANS_PROBS':TRINUC_TRANS_PROBS,
                    'COMMON_VARIANTS':COMMON_VARIANTS,
                    'HIGH_MUT_REGIONS':HIGH_MUT_REGIONS}
    # NOTE(review): the output file handle is never explicitly closed
    pickle.dump( OUT_DICT, open( OUT_PICKLE, "wb" ) )
576
577
# standard script entry point: build the mutation model when run directly
if __name__ == "__main__":
    main()
580
581
582
583