diff utilities/genMutModel.py @ 0:6e75a84e9338 draft
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
| author | thondeboer |
|---|---|
| date | Tue, 15 May 2018 02:39:53 -0400 |
| parents | |
| children | |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utilities/genMutModel.py	Tue May 15 02:39:53 2018 -0400
@@ -0,0 +1,583 @@
#!/usr/bin/env python

import sys
import os
import re
import bisect
import pickle
import argparse
import numpy as np
#matplotlib is not used as far as i can see
#import matplotlib.pyplot as mpl

# absolute path to the directory above this script
SIM_PATH = '/'.join(os.path.realpath(__file__).split('/')[:-2])
sys.path.append(SIM_PATH+'/py/')

from refFunc import indexRef

REF_WHITELIST = [str(n) for n in xrange(1,30)] + ['x','y','X','Y','mt','Mt','MT']
REF_WHITELIST += ['chr'+n for n in REF_WHITELIST]
VALID_NUCL = ['A','C','G','T']
VALID_TRINUC = [VALID_NUCL[i]+VALID_NUCL[j]+VALID_NUCL[k] for i in xrange(len(VALID_NUCL)) for j in xrange(len(VALID_NUCL)) for k in xrange(len(VALID_NUCL))]
# if parsing a dbsnp vcf, and no CAF= is found in info tag, use this as default val for population freq
VCF_DEFAULT_POP_FREQ = 0.00001


#########################################################
#               VARIOUS HELPER FUNCTIONS                #
#########################################################


# given a reference index, grab the sequence string of a specified reference
def getChrFromFasta(refPath,ref_inds,chrName):
    for i in xrange(len(ref_inds)):
        if ref_inds[i][0] == chrName:
            ref_inds_i = ref_inds[i]
            break
    refFile = open(refPath,'r')
    refFile.seek(ref_inds_i[1])
    myDat = ''.join(refFile.read(ref_inds_i[2]-ref_inds_i[1]).split('\n'))
    return myDat

# cluster a sorted list
def clusterList(l,delta):
    outList = [[l[0]]]
    prevVal = l[0]
    currentInd = 0
    for n in l[1:]:
        if n-prevVal <= delta:
            outList[currentInd].append(n)
        else:
            currentInd += 1
            outList.append([])
            outList[currentInd].append(n)
        prevVal = n
    return outList

def list_2_countDict(l):
    cDict = {}
    for n in l:
        if n not in cDict:
            cDict[n] = 0
        cDict[n] += 1
    return cDict

def getBedTracks(fn):
    f = open(fn,'r')
    trackDict = {}
    for line in f:
        splt = line.strip().split('\t')
        if splt[0] not in trackDict:
            trackDict[splt[0]] = []
        trackDict[splt[0]].extend([int(splt[1]),int(splt[2])])
    f.close()
    return trackDict

def getTrackLen(trackDict):
    totSum = 0
    for k in trackDict.keys():
        for i in xrange(0,len(trackDict[k]),2):
            totSum += trackDict[k][i+1] - trackDict[k][i] + 1
    return totSum

def isInBed(track,ind):
    myInd = bisect.bisect(track,ind)
    if myInd&1:
        return True
    if myInd < len(track):
        if track[myInd-1] == ind:
            return True
    return False
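
# Usage sketch for the bed-track helpers above (illustrative values only): getBedTracks()
# stores each chromosome's intervals as a flat, sorted list of alternating start/end
# coordinates, and isInBed() tests membership using the parity of bisect.bisect(), e.g.
# for a hypothetical track [10, 20, 100, 200]:
#     isInBed([10, 20, 100, 200], 15)   # True  -- 15 lies inside the interval 10..20
#     isInBed([10, 20, 100, 200], 50)   # False -- 50 lies between the two intervals
#     isInBed([10, 20, 100, 200], 20)   # True  -- positions matching an interval endpoint count as inside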
## return the mean distance to the median of a cluster
#def mean_dist_from_median(c):
#    centroid = np.median([n for n in c])
#    dists = []
#    for n in c:
#        dists.append(abs(n-centroid))
#    return np.mean(dists)
#
## get median value from counting dictionary
#def quick_median(countDict):
#    midPoint = sum(countDict.values())/2
#    mySum = 0
#    myInd = 0
#    sk = sorted(countDict.keys())
#    while mySum < midPoint:
#        mySum += countDict[sk[myInd]]
#        if mySum >= midPoint:
#            break
#        myInd += 1
#    return myInd
#
## get median deviation from median of counting dictionary
#def median_deviation_from_median(countDict):
#    myMedian = quick_median(countDict)
#    deviations = {}
#    for k in sorted(countDict.keys()):
#        d = abs(k-myMedian)
#        deviations[d] = countDict[k]
#    return quick_median(deviations)


#################################################
#             PARSE INPUT OPTIONS               #
#################################################


parser = argparse.ArgumentParser(description='genMutModel.py')
parser.add_argument('-r',  type=str, required=True,  metavar='<str>', help="* ref.fa")
parser.add_argument('-m',  type=str, required=True,  metavar='<str>', help="* mutations.tsv [.vcf]")
parser.add_argument('-o',  type=str, required=True,  metavar='<str>', help="* output.p")
parser.add_argument('-bi', type=str, required=False, metavar='<str>', default=None, help="only_use_these_regions.bed")
parser.add_argument('-be', type=str, required=False, metavar='<str>', default=None, help="exclude_these_regions.bed")
parser.add_argument('--save-trinuc',  required=False, action='store_true', default=False, help='save trinuc counts for ref')
parser.add_argument('--no-whitelist', required=False, action='store_true', default=False, help='allow any non-standard ref')
parser.add_argument('--skip-common',  required=False, action='store_true', default=False, help='do not save common snps + high mut regions')
args = parser.parse_args()
(REF, TSV, OUT_PICKLE, SAVE_TRINUC, NO_WHITELIST, SKIP_COMMON) = (args.r, args.m, args.o, args.save_trinuc, args.no_whitelist, args.skip_common)

MYBED = None
if args.bi != None:
    print 'only considering variants in specified bed regions...'
    MYBED = (getBedTracks(args.bi),True)
elif args.be != None:
    print 'only considering variants outside of specified bed regions...'
    MYBED = (getBedTracks(args.be),False)

if TSV[-4:] == '.vcf':
    IS_VCF = True
elif TSV[-4:] == '.tsv':
    IS_VCF = False
else:
    print '\nError: Unknown format for mutation input.\n'
    exit(1)


#####################################
#              main()               #
#####################################


def main():

    ref_inds = indexRef(REF)
    refList  = [n[0] for n in ref_inds]

    # how many times do we observe each trinucleotide in the reference (and input bed region, if present)?
    TRINUC_REF_COUNT = {}
    TRINUC_BED_COUNT = {}
    printBedWarning  = True
    # [(trinuc_a, trinuc_b)] = # of times we observed a mutation from trinuc_a into trinuc_b
    TRINUC_TRANSITION_COUNT = {}
    # total count of SNPs
    SNP_COUNT = 0
    # overall SNP transition probabilities
    SNP_TRANSITION_COUNT = {}
    # total count of indels, indexed by length
    INDEL_COUNT = {}
    # tabulate how much non-N reference sequence we've eaten through
    TOTAL_REFLEN = 0
    # detect variants that occur in a significant percentage of the input samples (pos,ref,alt,pop_fraction)
    COMMON_VARIANTS = []
    # tabulate how many unique donors we've encountered (this is useful for identifying common variants)
    TOTAL_DONORS = {}
    # identify regions that have significantly higher local mutation rates than the average
    HIGH_MUT_REGIONS = []

    # load and process variants in each reference sequence individually, for memory reasons...
    for refName in refList:

        if (refName not in REF_WHITELIST) and (not NO_WHITELIST):
            print refName,'is not in our whitelist, skipping...'
            continue

        print 'reading reference "'+refName+'"...'
        refSequence = getChrFromFasta(REF,ref_inds,refName).upper()
        TOTAL_REFLEN += len(refSequence) - refSequence.count('N')

        # list to be used for counting variants that occur multiple times in file (i.e. in multiple samples)
        VDAT_COMMON = []


        """ ##########################################################################
            ###                  COUNT TRINUCLEOTIDES IN REF                       ###
            ########################################################################## """


        if MYBED != None:
            if printBedWarning:
                print "since you're using a bed input, we have to count trinucs in bed region even if you specified a trinuc count file for the reference..."
                printBedWarning = False
            if refName in MYBED[0]:
                refKey = refName
            elif ('chr' in refName) and (refName not in MYBED[0]) and (refName[3:] in MYBED[0]):
                refKey = refName[3:]
            elif ('chr' not in refName) and (refName not in MYBED[0]) and ('chr'+refName in MYBED[0]):
                refKey = 'chr'+refName
            if refKey in MYBED[0]:
                subRegions = [(MYBED[0][refKey][n],MYBED[0][refKey][n+1]) for n in xrange(0,len(MYBED[0][refKey]),2)]
                for sr in subRegions:
                    for i in xrange(sr[0],sr[1]+1-2):
                        trinuc = refSequence[i:i+3]
                        if not trinuc in VALID_TRINUC:
                            continue    # skip if trinuc contains invalid characters, or not in specified bed region
                        if trinuc not in TRINUC_BED_COUNT:
                            TRINUC_BED_COUNT[trinuc] = 0
                        TRINUC_BED_COUNT[trinuc] += 1

        if not os.path.isfile(REF+'.trinucCounts'):
            print 'counting trinucleotides in reference...'
            for i in xrange(len(refSequence)-2):
                if i%1000000 == 0 and i > 0:
                    print i,'/',len(refSequence)
                    #break
                trinuc = refSequence[i:i+3]
                if not trinuc in VALID_TRINUC:
                    continue    # skip if trinuc contains invalid characters
                if trinuc not in TRINUC_REF_COUNT:
                    TRINUC_REF_COUNT[trinuc] = 0
                TRINUC_REF_COUNT[trinuc] += 1
        else:
            print 'skipping trinuc counts (for whole reference) because we found a file...'


        """ ##########################################################################
            ###                      READ INPUT VARIANTS                           ###
            ########################################################################## """


        print 'reading input variants...'
        f = open(TSV,'r')
        isFirst = True
        for line in f:

            if IS_VCF and line[0] == '#':
                continue
            if isFirst:
                if IS_VCF:
                    # hard-code index values based on expected columns in vcf
                    (c1,c2,c3,m1,m2,m3) = (0,1,1,3,3,4)
                else:
                    # determine columns of fields we're interested in
                    splt = line.strip().split('\t')
                    (c1,c2,c3) = (splt.index('chromosome'),splt.index('chromosome_start'),splt.index('chromosome_end'))
                    (m1,m2,m3) = (splt.index('reference_genome_allele'),splt.index('mutated_from_allele'),splt.index('mutated_to_allele'))
                    (d_id) = (splt.index('icgc_donor_id'))
                isFirst = False
                continue

            splt = line.strip().split('\t')
            # we have -1 because tsv/vcf coords are 1-based, and our reference string index is 0-based
            [chrName,chrStart,chrEnd] = [splt[c1],int(splt[c2])-1,int(splt[c3])-1]
            [allele_ref,allele_normal,allele_tumor] = [splt[m1].upper(),splt[m2].upper(),splt[m3].upper()]
            if IS_VCF:
                if len(allele_ref) != len(allele_tumor):
                    # indels in tsv don't include the preserved first nucleotide, so let's trim the vcf alleles
                    [allele_ref,allele_normal,allele_tumor] = [allele_ref[1:],allele_normal[1:],allele_tumor[1:]]
                if not allele_ref: allele_ref = '-'
                if not allele_normal: allele_normal = '-'
                if not allele_tumor: allele_tumor = '-'
                # if alternate alleles are present, let's just ignore this variant. I may come back and improve this later
                if ',' in allele_tumor:
                    continue
                vcf_info = ';'+splt[7]+';'
            else:
                [donor_id] = [splt[d_id]]

            # if we encounter a multi-nucleotide polymorphism (i.e. 3 nucl --> 3 different nucl), let's skip it for now...
            if ('-' not in allele_normal and '-' not in allele_tumor) and (len(allele_normal) > 1 or len(allele_tumor) > 1):
                print 'skipping a complex variant...'
                continue

            # to deal with '1' vs 'chr1' references, manually change names. this is hacky and bad.
            if 'chr' not in chrName:
                chrName = 'chr'+chrName
            if 'chr' not in refName:
                refName = 'chr'+refName
            # skip irrelevant variants
            if chrName != refName:
                continue

            # if variant is outside the regions we're interested in (if specified), skip it...
            if MYBED != None:
                refKey = refName
                if not refKey in MYBED[0] and refKey[3:] in MYBED[0]:    # account for 1 vs chr1, again...
                    refKey = refKey[3:]
                if refKey not in MYBED[0]:
                    inBed = False
                else:
                    inBed = isInBed(MYBED[0][refKey],chrStart)
                if inBed != MYBED[1]:
                    continue

            # we want only snps
            # so, no '-' characters allowed, and chrStart must be same as chrEnd
            if '-' not in allele_normal and '-' not in allele_tumor and chrStart == chrEnd:
                trinuc_ref = refSequence[chrStart-1:chrStart+2]
                if not trinuc_ref in VALID_TRINUC:
                    continue    # skip ref trinuc with invalid characters
                # only consider positions where ref allele in tsv matches the nucleotide in our reference
                if allele_ref == trinuc_ref[1]:
                    trinuc_normal = refSequence[chrStart-1] + allele_normal + refSequence[chrStart+1]
                    trinuc_tumor  = refSequence[chrStart-1] + allele_tumor + refSequence[chrStart+1]
                    if not trinuc_normal in VALID_TRINUC or not trinuc_tumor in VALID_TRINUC:
                        continue    # skip if mutation contains invalid char
                    key = (trinuc_normal,trinuc_tumor)
                    if key not in TRINUC_TRANSITION_COUNT:
                        TRINUC_TRANSITION_COUNT[key] = 0
                    TRINUC_TRANSITION_COUNT[key] += 1
                    SNP_COUNT += 1
                    key2 = (allele_normal,allele_tumor)
                    if key2 not in SNP_TRANSITION_COUNT:
                        SNP_TRANSITION_COUNT[key2] = 0
                    SNP_TRANSITION_COUNT[key2] += 1

                    if IS_VCF:
                        myPopFreq = VCF_DEFAULT_POP_FREQ
                        if ';CAF=' in vcf_info:
                            cafStr = re.findall(r";CAF=.*?(?=;)",vcf_info)[0]
                            if ',' in cafStr:
                                myPopFreq = float(cafStr[5:].split(',')[1])
                        VDAT_COMMON.append((chrStart,allele_ref,allele_normal,allele_tumor,myPopFreq))
                    else:
                        VDAT_COMMON.append((chrStart,allele_ref,allele_normal,allele_tumor))
                        TOTAL_DONORS[donor_id] = True
                else:
                    print '\nError: ref allele in variant call does not match reference.\n'
                    exit(1)

            # now let's look for indels...
            if '-' in allele_normal: len_normal = 0
            else: len_normal = len(allele_normal)
            if '-' in allele_tumor: len_tumor = 0
            else: len_tumor = len(allele_tumor)
            if len_normal != len_tumor:
                indel_len = len_tumor - len_normal
                if indel_len not in INDEL_COUNT:
                    INDEL_COUNT[indel_len] = 0
                INDEL_COUNT[indel_len] += 1

                if IS_VCF:
                    myPopFreq = VCF_DEFAULT_POP_FREQ
                    if ';CAF=' in vcf_info:
                        cafStr = re.findall(r";CAF=.*?(?=;)",vcf_info)[0]
                        if ',' in cafStr:
                            myPopFreq = float(cafStr[5:].split(',')[1])
                    VDAT_COMMON.append((chrStart,allele_ref,allele_normal,allele_tumor,myPopFreq))
                else:
                    VDAT_COMMON.append((chrStart,allele_ref,allele_normal,allele_tumor))
                    TOTAL_DONORS[donor_id] = True
        f.close()

        # if we didn't find anything, skip ahead to the next reference sequence
        if not len(VDAT_COMMON):
            print 'Found no variants for this reference, moving along...'
            continue

        #
        # identify common mutations
        #
        percentile_var = 95
        if IS_VCF:
            minVal = np.percentile([n[4] for n in VDAT_COMMON],percentile_var)
            for k in sorted(VDAT_COMMON):
                if k[4] >= minVal:
                    COMMON_VARIANTS.append((refName,k[0],k[1],k[3],k[4]))
            VDAT_COMMON = {(n[0],n[1],n[2],n[3]):n[4] for n in VDAT_COMMON}
        else:
            N_DONORS = len(TOTAL_DONORS)
            VDAT_COMMON = list_2_countDict(VDAT_COMMON)
            minVal = int(np.percentile(VDAT_COMMON.values(),percentile_var))
            for k in sorted(VDAT_COMMON.keys()):
                if VDAT_COMMON[k] >= minVal:
                    COMMON_VARIANTS.append((refName,k[0],k[1],k[3],VDAT_COMMON[k]/float(N_DONORS)))

        #
        # identify areas that have significantly higher mutation rates than average
        #
        dist_thresh      = 2000
        percentile_clust = 97
        qptn             = 1000
        # identify regions with disproportionately more variants in them
        VARIANT_POS   = sorted([n[0] for n in VDAT_COMMON.keys()])
        clustered_pos = clusterList(VARIANT_POS,dist_thresh)
        byLen  = [(len(clustered_pos[i]),min(clustered_pos[i]),max(clustered_pos[i]),i) for i in xrange(len(clustered_pos))]
        #byLen  = sorted(byLen,reverse=True)
        #minLen = int(np.percentile([n[0] for n in byLen],percentile_clust))
        #byLen  = [n for n in byLen if n[0] >= minLen]
        candidate_regions = []
        for n in byLen:
            bi = int((n[1]-dist_thresh)/float(qptn))*qptn
            bf = int((n[2]+dist_thresh)/float(qptn))*qptn
            candidate_regions.append((n[0]/float(bf-bi),max([0,bi]),min([len(refSequence),bf])))
        minVal = np.percentile([n[0] for n in candidate_regions],percentile_clust)
        for n in candidate_regions:
            if n[0] >= minVal:
                HIGH_MUT_REGIONS.append((refName,n[1],n[2],n[0]))
        # collapse overlapping regions
        for i in xrange(len(HIGH_MUT_REGIONS)-1,0,-1):
            if HIGH_MUT_REGIONS[i-1][2] >= HIGH_MUT_REGIONS[i][1] and HIGH_MUT_REGIONS[i-1][0] == HIGH_MUT_REGIONS[i][0]:
                avgMutRate = 0.5*HIGH_MUT_REGIONS[i-1][3]+0.5*HIGH_MUT_REGIONS[i][3]    # not accurate, but I'm lazy
                HIGH_MUT_REGIONS[i-1] = (HIGH_MUT_REGIONS[i-1][0], HIGH_MUT_REGIONS[i-1][1], HIGH_MUT_REGIONS[i][2], avgMutRate)
                del HIGH_MUT_REGIONS[i]

    #
    # if we didn't count ref trinucs because we found a file, read in ref counts from file now
    #
    if os.path.isfile(REF+'.trinucCounts'):
        print 'reading pre-computed trinuc counts...'
        f = open(REF+'.trinucCounts','r')
        for line in f:
            splt = line.strip().split('\t')
            TRINUC_REF_COUNT[splt[0]] = int(splt[1])
        f.close()
    # otherwise, save trinuc counts to file, if desired
    elif SAVE_TRINUC:
        if MYBED != None:
            print 'unable to save trinuc counts to file because using input bed region...'
        else:
            print 'saving trinuc counts to file...'
            f = open(REF+'.trinucCounts','w')
            for trinuc in sorted(TRINUC_REF_COUNT.keys()):
                f.write(trinuc+'\t'+str(TRINUC_REF_COUNT[trinuc])+'\n')
            f.close()

    #
    # if using an input bed region, make necessary adjustments to trinuc ref counts based on the bed region trinuc counts
    #
    if MYBED != None:
        if MYBED[1] == True:    # we are restricting our attention to bed regions, so ONLY use bed region trinuc counts
            TRINUC_REF_COUNT = TRINUC_BED_COUNT
        else:                   # we are only looking outside bed regions, so subtract bed region trinucs from entire reference trinucs
            for k in TRINUC_REF_COUNT.keys():
                if k in TRINUC_BED_COUNT:
                    TRINUC_REF_COUNT[k] -= TRINUC_BED_COUNT[k]

    # if for some reason we didn't find any valid input variants, exit gracefully...
    totalVar = SNP_COUNT + sum(INDEL_COUNT.values())
    if totalVar == 0:
        print '\nError: No valid variants were found, model could not be created. (Are you using the correct reference?)\n'
        exit(1)


    """ ##########################################################################
        ###                      COMPUTE PROBABILITIES                         ###
        ########################################################################## """


    #for k in sorted(TRINUC_REF_COUNT.keys()):
    #    print k, TRINUC_REF_COUNT[k]
    #
    #for k in sorted(TRINUC_TRANSITION_COUNT.keys()):
    #    print k, TRINUC_TRANSITION_COUNT[k]

    # frequency that each trinuc mutated into anything else
    TRINUC_MUT_PROB = {}
    # frequency that a trinuc mutates into another trinuc, given that it mutated
    TRINUC_TRANS_PROBS = {}
    # frequency of snp transitions, given a snp occurs.
    SNP_TRANS_FREQ = {}

    for trinuc in sorted(TRINUC_REF_COUNT.keys()):
        myCount = 0
        for k in sorted(TRINUC_TRANSITION_COUNT.keys()):
            if k[0] == trinuc:
                myCount += TRINUC_TRANSITION_COUNT[k]
        TRINUC_MUT_PROB[trinuc] = myCount / float(TRINUC_REF_COUNT[trinuc])
        for k in sorted(TRINUC_TRANSITION_COUNT.keys()):
            if k[0] == trinuc:
                TRINUC_TRANS_PROBS[k] = TRINUC_TRANSITION_COUNT[k] / float(myCount)

    for n1 in VALID_NUCL:
        rollingTot = sum([SNP_TRANSITION_COUNT[(n1,n2)] for n2 in VALID_NUCL if (n1,n2) in SNP_TRANSITION_COUNT])
        for n2 in VALID_NUCL:
            key2 = (n1,n2)
            if key2 in SNP_TRANSITION_COUNT:
                SNP_TRANS_FREQ[key2] = SNP_TRANSITION_COUNT[key2] / float(rollingTot)

    # compute average snp and indel frequencies
    SNP_FREQ       = SNP_COUNT/float(totalVar)
    AVG_INDEL_FREQ = 1.-SNP_FREQ
    INDEL_FREQ     = {k:(INDEL_COUNT[k]/float(totalVar))/AVG_INDEL_FREQ for k in INDEL_COUNT.keys()}
    if MYBED != None:
        if MYBED[1] == True:
            AVG_MUT_RATE = totalVar/float(getTrackLen(MYBED[0]))
        else:
            AVG_MUT_RATE = totalVar/float(TOTAL_REFLEN - getTrackLen(MYBED[0]))
    else:
        AVG_MUT_RATE = totalVar/float(TOTAL_REFLEN)

    #
    # if values weren't found in data, appropriately append null entries
    #
    printTrinucWarning = False
    for trinuc in VALID_TRINUC:
        trinuc_mut = [trinuc[0]+n+trinuc[2] for n in VALID_NUCL if n != trinuc[1]]
        if trinuc not in TRINUC_MUT_PROB:
            TRINUC_MUT_PROB[trinuc] = 0.
            printTrinucWarning = True
        for trinuc2 in trinuc_mut:
            if (trinuc,trinuc2) not in TRINUC_TRANS_PROBS:
                TRINUC_TRANS_PROBS[(trinuc,trinuc2)] = 0.
                printTrinucWarning = True
    if printTrinucWarning:
        print 'Warning: Some trinucleotide transitions were not encountered in the input dataset, probabilities of 0.0 have been assigned to these events.'

    #
    # print some stuff
    #
    for k in sorted(TRINUC_MUT_PROB.keys()):
        print 'p('+k+' mutates) =',TRINUC_MUT_PROB[k]

    for k in sorted(TRINUC_TRANS_PROBS.keys()):
        print 'p('+k[0]+' --> '+k[1]+' | '+k[0]+' mutates) =',TRINUC_TRANS_PROBS[k]

    for k in sorted(INDEL_FREQ.keys()):
        if k > 0:
            print 'p(ins length = '+str(abs(k))+' | indel occurs) =',INDEL_FREQ[k]
        else:
            print 'p(del length = '+str(abs(k))+' | indel occurs) =',INDEL_FREQ[k]

    for k in sorted(SNP_TRANS_FREQ.keys()):
        print 'p('+k[0]+' --> '+k[1]+' | SNP occurs) =',SNP_TRANS_FREQ[k]

    #for n in COMMON_VARIANTS:
    #    print n

    #for n in HIGH_MUT_REGIONS:
    #    print n

    print 'p(snp)   =',SNP_FREQ
    print 'p(indel) =',AVG_INDEL_FREQ
    print 'overall average mut rate:',AVG_MUT_RATE
    print 'total variants processed:',totalVar

    #
    # save variables to file
    #
    if SKIP_COMMON:
        OUT_DICT = {'AVG_MUT_RATE':AVG_MUT_RATE,
                    'SNP_FREQ':SNP_FREQ,
                    'SNP_TRANS_FREQ':SNP_TRANS_FREQ,
                    'INDEL_FREQ':INDEL_FREQ,
                    'TRINUC_MUT_PROB':TRINUC_MUT_PROB,
                    'TRINUC_TRANS_PROBS':TRINUC_TRANS_PROBS}
    else:
        OUT_DICT = {'AVG_MUT_RATE':AVG_MUT_RATE,
                    'SNP_FREQ':SNP_FREQ,
                    'SNP_TRANS_FREQ':SNP_TRANS_FREQ,
                    'INDEL_FREQ':INDEL_FREQ,
                    'TRINUC_MUT_PROB':TRINUC_MUT_PROB,
                    'TRINUC_TRANS_PROBS':TRINUC_TRANS_PROBS,
                    'COMMON_VARIANTS':COMMON_VARIANTS,
                    'HIGH_MUT_REGIONS':HIGH_MUT_REGIONS}
    pickle.dump( OUT_DICT, open( OUT_PICKLE, "wb" ) )


if __name__ == "__main__":
    main()
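
For quick inspection, the model written to the -o path is a standard Python pickle whose keys match the OUT_DICT saved above (AVG_MUT_RATE, SNP_FREQ, SNP_TRANS_FREQ, INDEL_FREQ, TRINUC_MUT_PROB, TRINUC_TRANS_PROBS, plus COMMON_VARIANTS and HIGH_MUT_REGIONS unless --skip-common was given). A minimal sketch of loading it back, assuming a hypothetical run such as python genMutModel.py -r ref.fa -m mutations.vcf -o mut_model.p (all file names are placeholders):

    import pickle

    # load the mutation model produced by genMutModel.py (path is a placeholder)
    model = pickle.load(open('mut_model.p', 'rb'))

    print model['AVG_MUT_RATE']                        # overall average mutation rate
    print model['SNP_FREQ']                            # fraction of observed variants that were SNPs
    print model['INDEL_FREQ']                          # relative frequency of each indel length, given an indel
    print model['TRINUC_TRANS_PROBS'][('ACA','AGA')]   # p(ACA --> AGA | ACA mutates)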
