Mercurial > repos > arkarachai-fungtammasan > microsatellite_ngs
comparison heteroprob.py @ 0:20ab85af9505
Uploaded
author | arkarachai-fungtammasan |
---|---|
date | Fri, 03 Oct 2014 20:54:30 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:20ab85af9505 |
---|---|
1 ### import libraries ### | |
2 import sys | |
3 import collections, math | |
4 import heapq | |
5 import itertools | |
6 | |
7 | |
8 | |
9 ### basic function ### | |
10 def permuterepeat(n,rlist): | |
11 f = math.factorial | |
12 nfac=f(n) | |
13 rfaclist=[f(i) for i in rlist] | |
14 for rfac in rfaclist: | |
15 nfac=nfac/rfac | |
16 return nfac | |
17 | |
18 def nCr(n,r): | |
19 f = math.factorial | |
20 return f(n) / f(r) / f(n-r) | |
21 | |
22 def averagelist(a,b,expectedlevelofminor): | |
23 product=[] | |
24 for i in range(len(a)): | |
25 product.append((1-expectedlevelofminor)*a[i]+expectedlevelofminor*b[i]) | |
26 | |
27 return product | |
28 | |
29 def complement_base(read): | |
30 collect='' | |
31 for i in read: | |
32 if i.upper()=='A': | |
33 collect+='T' | |
34 elif i.upper()=='T': | |
35 collect+='A' | |
36 elif i.upper()=='C': | |
37 collect+='G' | |
38 elif i.upper()=='G': | |
39 collect+='C' | |
40 return collect | |
41 def makeallpossible(read): | |
42 collect=[] | |
43 for i in range(len(read)): | |
44 tmp= read[i:]+read[:i] | |
45 collect.append(tmp) | |
46 collect.append(complement_base(tmp)) | |
47 return collect | |
48 | |
49 def motifsimplify(base): | |
50 '''str--> str | |
51 ''' | |
52 motiflength=len(base) | |
53 temp=list(set(ALLMOTIF[motiflength]).intersection(set(makeallpossible(base)))) | |
54 | |
55 return temp[0] | |
56 | |
57 def majorallele(seq): | |
58 binseq=list(set(seq)) | |
59 binseq.sort(reverse=True) # highly mutate mode | |
60 #binseq.sort() # majority mode | |
61 storeform='' | |
62 storevalue=0 | |
63 for i in binseq: | |
64 if seq.count(i)>storevalue: | |
65 storeform=i | |
66 storevalue=seq.count(i) | |
67 | |
68 return int(storeform) | |
69 | |
70 ### decide global parameter ### | |
71 COORDINATECOLUMN=1 | |
72 ALLELECOLUMN=2 | |
73 MOTIFCOLUMN=3 | |
74 inputname=sys.argv[1] | |
75 errorprofile=sys.argv[2] | |
76 EXPECTEDLEVELOFMINOR=float(sys.argv[3]) | |
77 if EXPECTEDLEVELOFMINOR >0.5: | |
78 try: | |
79 errorexpectcontribution=int('a') | |
80 except Exception, eee: | |
81 print eee | |
82 stop_err("Expected contribution of minor allele must be at least 0 and not more than 0.5") | |
83 MINIMUMMUTABLE=0 ###1.2*(1.0/(10**8)) #http://www.ncbi.nlm.nih.gov/pubmed/22914163 Kong et al 2012 | |
84 | |
85 | |
86 ## Fixed global variable | |
87 ALLREPEATTYPE=[1,2,3,4] | |
88 ALLREPEATTYPENAME=['mono','di','tri','tetra'] | |
89 monomotif=['A','C'] | |
90 dimotif=['AC','AG','AT','CG'] | |
91 trimotif=['AAC','AAG','AAT','ACC','ACG','ACT','AGC','AGG','ATC','CCG'] | |
92 tetramotif=['AAAC','AAAG','AAAT','AACC','AACG','AACT','AAGC','AAGG','AAGT','AATC','AATG','AATT',\ | |
93 'ACAG','ACAT','ACCC','ACCG','ACCT','ACGC','ACGG','ACGT','ACTC','ACTG','AGAT','AGCC','AGCG','AGCT',\ | |
94 'AGGC','AGGG','ATCC','ATCG','ATGC','CCCG','CCGG','AGTC'] | |
95 ALLMOTIF={1:monomotif,2:dimotif,3:trimotif,4:tetramotif} | |
96 monorange=range(5,60) | |
97 dirange=range(6,60) | |
98 trirange=range(9,60) | |
99 tetrarange=range(12,80) | |
100 ALLRANGE={1:monorange,2:dirange,3:trirange,4:tetrarange} | |
101 | |
102 ######################################### | |
103 ######## Prob calculation sector ######## | |
104 ######################################### | |
105 def multinomial_prob(majorallele,STRlength,motif,probdatabase): | |
106 '''int,int,str,dict-->int | |
107 ### get prob for each STRlength to be generated from major allele | |
108 ''' | |
109 #print (majorallele,STRlength,motif) | |
110 prob=probdatabase[len(motif)][motif][majorallele][STRlength] | |
111 return prob | |
112 | |
113 ################################################ | |
114 ######## error model database sector ########### | |
115 ################################################ | |
116 | |
117 ## structure generator | |
118 errormodeldatabase={1:{},2:{},3:{},4:{}} | |
119 sumbymajoralleledatabase={1:{},2:{},3:{},4:{}} | |
120 for repeattype in ALLREPEATTYPE: | |
121 for motif in ALLMOTIF[repeattype]: | |
122 errormodeldatabase[repeattype][motif]={} | |
123 sumbymajoralleledatabase[repeattype][motif]={} | |
124 for motifsize1 in ALLRANGE[repeattype]: | |
125 errormodeldatabase[repeattype][motif][motifsize1]={} | |
126 sumbymajoralleledatabase[repeattype][motif][motifsize1]=0 | |
127 for motifsize2 in ALLRANGE[repeattype]: | |
128 errormodeldatabase[repeattype][motif][motifsize1][motifsize2]=MINIMUMMUTABLE | |
129 #print errormodeldatabase | |
130 ## read database | |
131 | |
132 ## get read count for each major allele | |
133 fd=open(errorprofile) | |
134 lines=fd.readlines() | |
135 for line in lines: | |
136 temp=line.strip().split('\t') | |
137 t_major=int(temp[0]) | |
138 t_count=int(temp[2]) | |
139 motif=temp[3] | |
140 sumbymajoralleledatabase[len(motif)][motif][t_major]+=t_count | |
141 fd.close() | |
142 ##print sumbymajoralleledatabase | |
143 | |
144 ## get probability | |
145 fd=open(errorprofile) | |
146 lines=fd.readlines() | |
147 for line in lines: | |
148 temp=line.strip().split('\t') | |
149 t_major=int(temp[0]) | |
150 t_read=int(temp[1]) | |
151 t_count=int(temp[2]) | |
152 motif=temp[3] | |
153 if sumbymajoralleledatabase[len(motif)][motif][t_major]>0: | |
154 errormodeldatabase[len(motif)][motif][t_major][t_read]=t_count/(sumbymajoralleledatabase[len(motif)][motif][t_major]*1.0) | |
155 #errormodeldatabase[repeattype][motif][t_major][t_read]=math.log(t_count/(sumbymajorallele[t_major]*1.0)) | |
156 | |
157 #else: | |
158 # errormodeldatabase[repeattype][motif][t_major][t_read]=0 | |
159 fd.close() | |
160 #print errormodeldatabase | |
161 #print math.log(100,10) | |
162 ######################################### | |
163 ######## input reading sector ########### | |
164 ######################################### | |
165 | |
166 | |
167 | |
168 fd = open(inputname) | |
169 ##fd=open('sampleinput_C.txt') | |
170 lines=fd.xreadlines() | |
171 for line in lines: | |
172 i_read=[] | |
173 i2_read=[] | |
174 temp=line.strip().split('\t') | |
175 i_coordinate=temp[COORDINATECOLUMN-1] | |
176 i_motif=motifsimplify(temp[MOTIFCOLUMN-1]) | |
177 i_read=temp[ALLELECOLUMN-1].split(',') | |
178 i_read=map(int,i_read) | |
179 depth=len(i_read) | |
180 heteromajor1=int(temp[6]) | |
181 heteromajor2=int(temp[7]) | |
182 | |
183 ### calculate the change to detect combination (using error profile) | |
184 heterozygous_collector=0 | |
185 alist=[multinomial_prob(heteromajor1,x,i_motif,errormodeldatabase)for x in i_read] | |
186 blist=[multinomial_prob(heteromajor2,x,i_motif,errormodeldatabase)for x in i_read] | |
187 | |
188 ablist=averagelist(alist,blist,EXPECTEDLEVELOFMINOR) | |
189 | |
190 if 0 in ablist: | |
191 continue | |
192 heterozygous_collector=reduce(lambda y, z: y*z,ablist ) | |
193 | |
194 ### prob of combination (using multinomial distribution) | |
195 frequency_distribution=[len(list(group)) for key, group in itertools.groupby(i_read)] | |
196 ## print frequency_distribution | |
197 expandbypermutation=permuterepeat(depth,frequency_distribution) | |
198 | |
199 print line.strip()+'\t'+str(heterozygous_collector)+'\t'+str(expandbypermutation)+'\t'+str(expandbypermutation*heterozygous_collector)+'\t'+str(depth) |