Mercurial > repos > xuebing > sharplabtool
comparison tools/ilmn_pacbio/cov_model.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
1 #!/usr/bin/env python | |
2 from optparse import OptionParser, SUPPRESS_HELP | |
3 import os, random, quake | |
4 | |
5 ############################################################ | |
6 # cov_model.py | |
7 # | |
8 # Given a file of kmer counts, reports the cutoff to use | |
9 # to separate trusted/untrusted kmers. | |
10 ############################################################ | |
11 | |
12 ############################################################ | |
13 # main | |
14 ############################################################ | |
15 def main(): | |
16 usage = 'usage: %prog [options] <counts file>' | |
17 parser = OptionParser(usage) | |
18 parser.add_option('--int', dest='count_kmers', action='store_true', default=False, help='Kmers were counted as integers w/o the use of quality values [default: %default]') | |
19 parser.add_option('--ratio', dest='ratio', type='int', default=200, help='Likelihood ratio to set trusted/untrusted cutoff [default: %default]') | |
20 parser.add_option('--no_sample', dest='no_sample', action='store_true', default=False, help='Do not sample kmer coverages into kmers.txt because its already done [default: %default]') | |
21 # help='Model kmer coverage as a function of GC content of kmers [default: %default]' | |
22 parser.add_option('--gc', dest='model_gc', action='store_true', default=False, help=SUPPRESS_HELP) | |
23 (options, args) = parser.parse_args() | |
24 | |
25 if len(args) != 1: | |
26 parser.error('Must provide kmers counts file') | |
27 else: | |
28 ctsf = args[0] | |
29 | |
30 if options.count_kmers: | |
31 model_cutoff(ctsf, options.ratio) | |
32 print 'Cutoff: %s' % open('cutoff.txt').readline().rstrip() | |
33 | |
34 else: | |
35 if options.model_gc: | |
36 model_q_gc_cutoffs(ctsf, 25000, options.ratio) | |
37 else: | |
38 model_q_cutoff(ctsf, 50000, options.ratio, options.no_sample) | |
39 print 'Cutoff: %s' % open('cutoff.txt').readline().rstrip() | |
40 | |
41 | |
42 ############################################################ | |
43 # model_cutoff | |
44 # | |
45 # Make a histogram of kmers to give to R to learn the cutoff | |
46 ############################################################ | |
47 def model_cutoff(ctsf, ratio): | |
48 # make kmer histogram | |
49 cov_max = 0 | |
50 for line in open(ctsf): | |
51 cov = int(line.split()[1]) | |
52 if cov > cov_max: | |
53 cov_max = cov | |
54 | |
55 kmer_hist = [0]*cov_max | |
56 for line in open(ctsf): | |
57 cov = int(line.split()[1]) | |
58 kmer_hist[cov-1] += 1 | |
59 | |
60 cov_out = open('kmers.hist', 'w') | |
61 for cov in range(0,cov_max): | |
62 if kmer_hist[cov]: | |
63 print >> cov_out, '%d\t%d' % (cov+1,kmer_hist[cov]) | |
64 cov_out.close() | |
65 | |
66 os.system('R --slave --args %d < %s/cov_model.r 2> r.log' % (ratio,quake.quake_dir)) | |
67 | |
68 | |
69 ############################################################ | |
70 # model_q_cutoff | |
71 # | |
72 # Sample kmers to give to R to learn the cutoff | |
73 # 'div100' is necessary when the number of kmers is too | |
74 # large for random.sample, so we only consider every 100th | |
75 # kmer. | |
76 ############################################################ | |
77 def model_q_cutoff(ctsf, sample, ratio, no_sample=False): | |
78 if not no_sample: | |
79 # count number of kmer coverages | |
80 num_covs = 0 | |
81 for line in open(ctsf): | |
82 num_covs += 1 | |
83 | |
84 # choose random kmer coverages | |
85 div100 = False | |
86 if sample >= num_covs: | |
87 rand_covs = range(num_covs) | |
88 else: | |
89 if num_covs > 1000000000: | |
90 div100 = True | |
91 rand_covs = random.sample(xrange(num_covs/100), sample) | |
92 else: | |
93 rand_covs = random.sample(xrange(num_covs), sample) | |
94 rand_covs.sort() | |
95 | |
96 # print to file | |
97 out = open('kmers.txt', 'w') | |
98 kmer_i = 0 | |
99 rand_i = 0 | |
100 for line in open(ctsf): | |
101 if div100: | |
102 if kmer_i % 100 == 0 and kmer_i/100 == rand_covs[rand_i]: | |
103 print >> out, line.split()[1] | |
104 rand_i += 1 | |
105 if rand_i >= sample: | |
106 break | |
107 else: | |
108 if kmer_i == rand_covs[rand_i]: | |
109 print >> out, line.split()[1] | |
110 rand_i += 1 | |
111 if rand_i >= sample: | |
112 break | |
113 kmer_i += 1 | |
114 out.close() | |
115 | |
116 os.system('R --slave --args %d < %s/cov_model_qmer.r 2> r.log' % (ratio,quake.quake_dir)) | |
117 | |
118 | |
119 ############################################################ | |
120 # model_q_gc_cutoffs | |
121 # | |
122 # Sample kmers to give to R to learn the cutoff for each | |
123 # GC value | |
124 ############################################################ | |
125 def model_q_gc_cutoffs(ctsf, sample, ratio): | |
126 # count number of kmer coverages at each at | |
127 k = len(open(ctsf).readline().split()[0]) | |
128 num_covs_at = [0]*(k+1) | |
129 for line in open(ctsf): | |
130 kmer = line.split()[0] | |
131 num_covs_at[count_at(kmer)] += 1 | |
132 | |
133 # for each AT bin | |
134 at_cutoffs = [] | |
135 for at in range(1,k): | |
136 # sample covs | |
137 if sample >= num_covs_at[at]: | |
138 rand_covs = range(num_covs_at[at]) | |
139 else: | |
140 rand_covs = random.sample(xrange(num_covs_at[at]), sample) | |
141 rand_covs.sort() | |
142 | |
143 # print to file | |
144 out = open('kmers.txt', 'w') | |
145 kmer_i = 0 | |
146 rand_i = 0 | |
147 for line in open(ctsf): | |
148 (kmer,cov) = line.split() | |
149 if count_at(kmer) == at: | |
150 if kmer_i == rand_covs[rand_i]: | |
151 print >> out, cov | |
152 rand_i += 1 | |
153 if rand_i >= sample: | |
154 break | |
155 kmer_i += 1 | |
156 out.close() | |
157 | |
158 os.system('R --slave --args %d < %s/cov_model_qmer.r 2> r%d.log' % (ratio,quake.quake_dir,at)) | |
159 | |
160 at_cutoffs.append( open('cutoff.txt').readline().rstrip() ) | |
161 if at in [1,k-1]: # setting extremes to next closests | |
162 at_cutoffs.append( open('cutoff.txt').readline().rstrip() ) | |
163 | |
164 os.system('mv kmers.txt kmers.at%d.txt' % at) | |
165 os.system('mv cutoff.txt cutoff.at%d.txt' % at) | |
166 | |
167 out = open('cutoffs.gc.txt','w') | |
168 print >> out, '\n'.join(at_cutoffs) | |
169 out.close() | |
170 | |
171 | |
172 ############################################################ | |
173 # model_q_gc_cutoffs_bigmem | |
174 # | |
175 # Sample kmers to give to R to learn the cutoff for each | |
176 # GC value | |
177 ############################################################ | |
178 def model_q_gc_cutoffs_bigmem(ctsf, sample, ratio): | |
179 # input coverages | |
180 k = 0 | |
181 for line in open(ctsf): | |
182 (kmer,cov) = line.split() | |
183 if k == 0: | |
184 k = len(kmer) | |
185 at_covs = ['']*(k+1) | |
186 else: | |
187 at = count_at(kmer) | |
188 if at_covs[at]: | |
189 at_covs[at].append(cov) | |
190 else: | |
191 at_covs[at] = [cov] | |
192 | |
193 for at in range(1,k): | |
194 print '%d %d' % (at,len(at_covs[at])) | |
195 | |
196 # for each AT bin | |
197 at_cutoffs = [] | |
198 for at in range(1,k): | |
199 # sample covs | |
200 if sample >= len(at_covs[at]): | |
201 rand_covs = at_covs[at] | |
202 else: | |
203 rand_covs = random.sample(at_covs[at], sample) | |
204 | |
205 # print to file | |
206 out = open('kmers.txt', 'w') | |
207 for rc in rand_covs: | |
208 print >> out, rc | |
209 out.close() | |
210 | |
211 os.system('R --slave --args %d < %s/cov_model_qmer.r 2> r%d.log' % (ratio,quake.quake_dir,at)) | |
212 | |
213 at_cutoffs.append( open('cutoff.txt').readline().rstrip() ) | |
214 if at in [1,k-1]: # setting extremes to next closests | |
215 at_cutoffs.append( open('cutoff.txt').readline().rstrip() ) | |
216 | |
217 os.system('mv kmers.txt kmers.at%d.txt' % at) | |
218 os.system('mv cutoff.txt cutoff.at%d.txt' % at) | |
219 | |
220 out = open('cutoffs.gc.txt','w') | |
221 print >> out, '\n'.join(at_cutoffs) | |
222 out.close() | |
223 | |
224 | |
225 ############################################################ | |
226 # count_at | |
227 # | |
228 # Count A's and T's in the given sequence | |
229 ############################################################ | |
230 def count_at(seq): | |
231 return len([nt for nt in seq if nt in ['A','T']]) | |
232 | |
233 | |
234 ############################################################ | |
235 # __main__ | |
236 ############################################################ | |
237 if __name__ == '__main__': | |
238 main() |