| 0 | 1 ### import libraries ### | 
|  | 2 import sys | 
|  | 3 import collections, math | 
|  | 4 import heapq | 
|  | 5 import itertools | 
|  | 6 | 
|  | 7 | 
|  | 8 | 
|  | 9 ### basic function ### | 
|  | 10 def permuterepeat(n,rlist): | 
|  | 11     f = math.factorial | 
|  | 12     nfac=f(n) | 
|  | 13     rfaclist=[f(i) for i in rlist] | 
|  | 14     for rfac in rfaclist: | 
|  | 15         nfac=nfac/rfac | 
|  | 16     return nfac | 
|  | 17 | 
|  | 18 def nCr(n,r): | 
|  | 19     f = math.factorial | 
|  | 20     return f(n) / f(r) / f(n-r) | 
|  | 21 | 
|  | 22 def averagelist(a,b,expectedlevelofminor): | 
|  | 23     product=[] | 
|  | 24     for i in range(len(a)): | 
|  | 25         product.append((1-expectedlevelofminor)*a[i]+expectedlevelofminor*b[i]) | 
|  | 26 | 
|  | 27     return product | 
|  | 28 | 
|  | 29 def complement_base(read): | 
|  | 30     collect='' | 
|  | 31     for i in read: | 
|  | 32         if i.upper()=='A': | 
|  | 33             collect+='T' | 
|  | 34         elif i.upper()=='T': | 
|  | 35             collect+='A' | 
|  | 36         elif i.upper()=='C': | 
|  | 37             collect+='G' | 
|  | 38         elif i.upper()=='G': | 
|  | 39             collect+='C' | 
|  | 40     return collect | 
|  | 41 def makeallpossible(read): | 
|  | 42     collect=[] | 
|  | 43     for i in range(len(read)): | 
|  | 44         tmp= read[i:]+read[:i] | 
|  | 45         collect.append(tmp) | 
|  | 46         collect.append(complement_base(tmp)) | 
|  | 47     return collect | 
|  | 48 | 
|  | 49 def motifsimplify(base): | 
|  | 50     '''str--> str | 
|  | 51     ''' | 
|  | 52     motiflength=len(base) | 
|  | 53     temp=list(set(ALLMOTIF[motiflength]).intersection(set(makeallpossible(base)))) | 
|  | 54 | 
|  | 55     return temp[0] | 
|  | 56 | 
|  | 57 def majorallele(seq): | 
|  | 58     binseq=list(set(seq)) | 
|  | 59     binseq.sort(reverse=True)   # highly mutate mode | 
|  | 60     #binseq.sort()              # majority mode | 
|  | 61     storeform='' | 
|  | 62     storevalue=0 | 
|  | 63     for i in binseq: | 
|  | 64         if seq.count(i)>storevalue: | 
|  | 65             storeform=i | 
|  | 66             storevalue=seq.count(i) | 
|  | 67 | 
|  | 68     return int(storeform) | 
|  | 69 | 
|  | 70 ### decide global parameter ### | 
|  | 71 COORDINATECOLUMN=1 | 
|  | 72 ALLELECOLUMN=2 | 
|  | 73 MOTIFCOLUMN=3 | 
|  | 74 inputname=sys.argv[1] | 
|  | 75 errorprofile=sys.argv[2] | 
|  | 76 EXPECTEDLEVELOFMINOR=float(sys.argv[3]) | 
|  | 77 if EXPECTEDLEVELOFMINOR >0.5: | 
|  | 78 	try: | 
|  | 79 		errorexpectcontribution=int('a') | 
|  | 80 	except Exception, eee: | 
|  | 81 		print eee | 
|  | 82 		stop_err("Expected contribution of minor allele must be at least 0 and not more than 0.5") | 
|  | 83 MINIMUMMUTABLE=0 ###1.2*(1.0/(10**8))  #http://www.ncbi.nlm.nih.gov/pubmed/22914163 Kong et al 2012 | 
|  | 84 | 
|  | 85 | 
|  | 86 ## Fixed global variable | 
|  | 87 ALLREPEATTYPE=[1,2,3,4] | 
|  | 88 ALLREPEATTYPENAME=['mono','di','tri','tetra'] | 
|  | 89 monomotif=['A','C'] | 
|  | 90 dimotif=['AC','AG','AT','CG'] | 
|  | 91 trimotif=['AAC','AAG','AAT','ACC','ACG','ACT','AGC','AGG','ATC','CCG'] | 
|  | 92 tetramotif=['AAAC','AAAG','AAAT','AACC','AACG','AACT','AAGC','AAGG','AAGT','AATC','AATG','AATT',\ | 
|  | 93 'ACAG','ACAT','ACCC','ACCG','ACCT','ACGC','ACGG','ACGT','ACTC','ACTG','AGAT','AGCC','AGCG','AGCT',\ | 
|  | 94 'AGGC','AGGG','ATCC','ATCG','ATGC','CCCG','CCGG','AGTC'] | 
|  | 95 ALLMOTIF={1:monomotif,2:dimotif,3:trimotif,4:tetramotif} | 
|  | 96 monorange=range(5,60) | 
|  | 97 dirange=range(6,60) | 
|  | 98 trirange=range(9,60) | 
|  | 99 tetrarange=range(12,80) | 
|  | 100 ALLRANGE={1:monorange,2:dirange,3:trirange,4:tetrarange} | 
|  | 101 | 
|  | 102 ######################################### | 
|  | 103 ######## Prob calculation sector ######## | 
|  | 104 ######################################### | 
|  | 105 def multinomial_prob(majorallele,STRlength,motif,probdatabase): | 
|  | 106     '''int,int,str,dict-->int | 
|  | 107     ### get prob for each STRlength to be generated from major allele | 
|  | 108     ''' | 
|  | 109     #print (majorallele,STRlength,motif) | 
|  | 110     prob=probdatabase[len(motif)][motif][majorallele][STRlength] | 
|  | 111     return prob | 
|  | 112 | 
|  | 113 ################################################ | 
|  | 114 ######## error model database sector ########### | 
|  | 115 ################################################ | 
|  | 116 | 
|  | 117 ## structure generator | 
|  | 118 errormodeldatabase={1:{},2:{},3:{},4:{}} | 
|  | 119 sumbymajoralleledatabase={1:{},2:{},3:{},4:{}} | 
|  | 120 for repeattype in ALLREPEATTYPE: | 
|  | 121     for motif in ALLMOTIF[repeattype]: | 
|  | 122         errormodeldatabase[repeattype][motif]={} | 
|  | 123         sumbymajoralleledatabase[repeattype][motif]={} | 
|  | 124         for motifsize1 in ALLRANGE[repeattype]: | 
|  | 125             errormodeldatabase[repeattype][motif][motifsize1]={} | 
|  | 126             sumbymajoralleledatabase[repeattype][motif][motifsize1]=0 | 
|  | 127             for motifsize2 in ALLRANGE[repeattype]: | 
|  | 128                 errormodeldatabase[repeattype][motif][motifsize1][motifsize2]=MINIMUMMUTABLE | 
|  | 129 #print errormodeldatabase | 
|  | 130 ## read database | 
|  | 131 | 
|  | 132 ## get read count for each major allele | 
|  | 133 fd=open(errorprofile) | 
|  | 134 lines=fd.readlines() | 
|  | 135 for line in lines: | 
|  | 136     temp=line.strip().split('\t') | 
|  | 137     t_major=int(temp[0]) | 
|  | 138     t_count=int(temp[2]) | 
|  | 139     motif=temp[3] | 
|  | 140     sumbymajoralleledatabase[len(motif)][motif][t_major]+=t_count | 
|  | 141 fd.close() | 
|  | 142 ##print sumbymajoralleledatabase | 
|  | 143 | 
|  | 144 ## get probability | 
|  | 145 fd=open(errorprofile) | 
|  | 146 lines=fd.readlines() | 
|  | 147 for line in lines: | 
|  | 148     temp=line.strip().split('\t') | 
|  | 149     t_major=int(temp[0]) | 
|  | 150     t_read=int(temp[1]) | 
|  | 151     t_count=int(temp[2]) | 
|  | 152     motif=temp[3] | 
|  | 153     if sumbymajoralleledatabase[len(motif)][motif][t_major]>0: | 
|  | 154         errormodeldatabase[len(motif)][motif][t_major][t_read]=t_count/(sumbymajoralleledatabase[len(motif)][motif][t_major]*1.0) | 
|  | 155         #errormodeldatabase[repeattype][motif][t_major][t_read]=math.log(t_count/(sumbymajorallele[t_major]*1.0)) | 
|  | 156 | 
|  | 157     #else: | 
|  | 158     #    errormodeldatabase[repeattype][motif][t_major][t_read]=0 | 
|  | 159 fd.close() | 
|  | 160 #print errormodeldatabase | 
|  | 161 #print math.log(100,10) | 
|  | 162 ######################################### | 
|  | 163 ######## input reading sector ########### | 
|  | 164 ######################################### | 
|  | 165 | 
|  | 166 | 
|  | 167 | 
|  | 168 fd = open(inputname) | 
|  | 169 ##fd=open('sampleinput_C.txt') | 
|  | 170 lines=fd.xreadlines() | 
|  | 171 for line in lines: | 
|  | 172     i_read=[] | 
|  | 173     i2_read=[] | 
|  | 174     temp=line.strip().split('\t') | 
|  | 175     i_coordinate=temp[COORDINATECOLUMN-1] | 
|  | 176     i_motif=motifsimplify(temp[MOTIFCOLUMN-1]) | 
|  | 177     i_read=temp[ALLELECOLUMN-1].split(',') | 
|  | 178     i_read=map(int,i_read) | 
|  | 179     depth=len(i_read) | 
|  | 180     heteromajor1=int(temp[6]) | 
|  | 181     heteromajor2=int(temp[7]) | 
|  | 182 | 
|  | 183 ### calculate the change to detect combination (using error profile) | 
|  | 184     heterozygous_collector=0 | 
|  | 185     alist=[multinomial_prob(heteromajor1,x,i_motif,errormodeldatabase)for x in i_read] | 
|  | 186     blist=[multinomial_prob(heteromajor2,x,i_motif,errormodeldatabase)for x in i_read] | 
|  | 187 | 
|  | 188     ablist=averagelist(alist,blist,EXPECTEDLEVELOFMINOR) | 
|  | 189 | 
|  | 190     if 0 in ablist: | 
|  | 191         continue | 
|  | 192     heterozygous_collector=reduce(lambda y, z: y*z,ablist ) | 
|  | 193 | 
|  | 194 ### prob of combination (using multinomial distribution) | 
|  | 195     frequency_distribution=[len(list(group)) for key, group in itertools.groupby(i_read)] | 
|  | 196     ## print frequency_distribution | 
|  | 197     expandbypermutation=permuterepeat(depth,frequency_distribution) | 
|  | 198 | 
|  | 199     print line.strip()+'\t'+str(heterozygous_collector)+'\t'+str(expandbypermutation)+'\t'+str(expandbypermutation*heterozygous_collector)+'\t'+str(depth) |