Mercurial > repos > xuebing > sharplabtool
comparison tools/regVariation/microsats_mutability.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
1 #!/usr/bin/env python | |
2 #Guruprasad Ananda | |
3 """ | |
4 This tool computes microsatellite mutability for the orthologous microsatellites fetched from 'Extract Orthologous Microsatellites from pair-wise alignments' tool. | |
5 """ | |
6 from galaxy import eggs | |
7 import sys, string, re, commands, tempfile, os, fileinput | |
8 from galaxy.tools.util.galaxyops import * | |
9 from bx.intervals.io import * | |
10 from bx.intervals.operations import quicksect | |
11 | |
12 fout = open(sys.argv[2],'w') | |
13 p_group = int(sys.argv[3]) #primary "group-by" feature | |
14 p_bin_size = int(sys.argv[4]) | |
15 s_group = int(sys.argv[5]) #sub-group by feature | |
16 s_bin_size = int(sys.argv[6]) | |
17 mono_threshold = 9 | |
18 non_mono_threshold = 4 | |
19 p_group_cols = [p_group, p_group+7] | |
20 s_group_cols = [s_group, s_group+7] | |
21 num_generations = int(sys.argv[7]) | |
22 region = sys.argv[8] | |
23 int_file = sys.argv[9] | |
24 if int_file != "None": #User has specified an interval file | |
25 try: | |
26 fint = open(int_file, 'r') | |
27 dbkey_i = sys.argv[10] | |
28 chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[11] ) | |
29 except: | |
30 stop_err("Unable to open input Interval file") | |
31 | |
32 def stop_err(msg): | |
33 sys.stderr.write(msg) | |
34 sys.exit() | |
35 | |
36 def reverse_complement(text): | |
37 DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" ) | |
38 comp = [ch for ch in text.translate(DNA_COMP)] | |
39 comp.reverse() | |
40 return "".join(comp) | |
41 | |
42 def get_unique_elems(elems): | |
43 seen=set() | |
44 return[x for x in elems if x not in seen and not seen.add(x)] | |
45 | |
46 def get_binned_lists(uniqlist, binsize): | |
47 binnedlist=[] | |
48 uniqlist.sort() | |
49 start = int(uniqlist[0]) | |
50 bin_ind=0 | |
51 l_ind=0 | |
52 binnedlist.append([]) | |
53 while l_ind < len(uniqlist): | |
54 elem = int(uniqlist[l_ind]) | |
55 if elem in range(start,start+binsize): | |
56 binnedlist[bin_ind].append(elem) | |
57 else: | |
58 start += binsize | |
59 bin_ind += 1 | |
60 binnedlist.append([]) | |
61 binnedlist[bin_ind].append(elem) | |
62 l_ind += 1 | |
63 return binnedlist | |
64 | |
65 def fetch_weight(H,C,t): | |
66 if (H-(C-H)) < t: | |
67 return 2.0 | |
68 else: | |
69 return 1.0 | |
70 | |
71 def mutabilityEstimator(repeats1,repeats2,thresholds): | |
72 mut_num = 0.0 #Mutability Numerator | |
73 mut_den = 0.0 #Mutability denominator | |
74 for ind,H in enumerate(repeats1): | |
75 C = repeats2[ind] | |
76 t = thresholds[ind] | |
77 w = fetch_weight(H,C,t) | |
78 mut_num += ((H-C)*(H-C)*w) | |
79 mut_den += w | |
80 return [mut_num, mut_den] | |
81 | |
82 def output_writer(blk, blk_lines): | |
83 global winspecies, speciesind | |
84 all_elems_1=[] | |
85 all_elems_2=[] | |
86 all_s_elems_1=[] | |
87 all_s_elems_2=[] | |
88 for bline in blk_lines: | |
89 if not(bline): | |
90 continue | |
91 items = bline.split('\t') | |
92 seq1 = items[1] | |
93 start1 = items[2] | |
94 end1 = items[3] | |
95 seq2 = items[8] | |
96 start2 = items[9] | |
97 end2 = items[10] | |
98 if p_group_cols[0] == 6: | |
99 items[p_group_cols[0]] = int(items[p_group_cols[0]]) | |
100 items[p_group_cols[1]] = int(items[p_group_cols[1]]) | |
101 if s_group_cols[0] == 6: | |
102 items[s_group_cols[0]] = int(items[s_group_cols[0]]) | |
103 items[s_group_cols[1]] = int(items[s_group_cols[1]]) | |
104 all_elems_1.append(items[p_group_cols[0]]) #primary col elements for species 1 | |
105 all_elems_2.append(items[p_group_cols[1]]) #primary col elements for species 2 | |
106 if s_group_cols[0] != -1: #sub-group is not None | |
107 all_s_elems_1.append(items[s_group_cols[0]]) #secondary col elements for species 1 | |
108 all_s_elems_2.append(items[s_group_cols[1]]) #secondary col elements for species 2 | |
109 uniq_elems_1 = get_unique_elems(all_elems_1) | |
110 uniq_elems_2 = get_unique_elems(all_elems_2) | |
111 if s_group_cols[0] != -1: | |
112 uniq_s_elems_1 = get_unique_elems(all_s_elems_1) | |
113 uniq_s_elems_2 = get_unique_elems(all_s_elems_2) | |
114 mut1={} | |
115 mut2={} | |
116 count1 = {} | |
117 count2 = {} | |
118 """ | |
119 if p_group_cols[0] == 7: #i.e. the option chosen is group-by unit(AG, GTC, etc) | |
120 uniq_elems_1 = get_unique_units(j.sort(lambda x, y: len(x)-len(y))) | |
121 """ | |
122 if p_group_cols[0] == 6: #i.e. the option chosen is group-by repeat number. | |
123 uniq_elems_1 = get_binned_lists(uniq_elems_1,p_bin_size) | |
124 uniq_elems_2 = get_binned_lists(uniq_elems_2,p_bin_size) | |
125 | |
126 if s_group_cols[0] == 6: #i.e. the option chosen is subgroup-by repeat number. | |
127 uniq_s_elems_1 = get_binned_lists(uniq_s_elems_1,s_bin_size) | |
128 uniq_s_elems_2 = get_binned_lists(uniq_s_elems_2,s_bin_size) | |
129 | |
130 for pitem1 in uniq_elems_1: | |
131 #repeats1 = [] | |
132 #repeats2 = [] | |
133 thresholds = [] | |
134 if s_group_cols[0] != -1: #Sub-group by feature is not None | |
135 for sitem1 in uniq_s_elems_1: | |
136 repeats1 = [] | |
137 repeats2 = [] | |
138 if type(sitem1) == type(''): | |
139 sitem1 = sitem1.strip() | |
140 for bline in blk_lines: | |
141 belems = bline.split('\t') | |
142 if type(pitem1) == list: | |
143 if p_group_cols[0] == 6: | |
144 belems[p_group_cols[0]] = int(belems[p_group_cols[0]]) | |
145 if belems[p_group_cols[0]] in pitem1: | |
146 if belems[s_group_cols[0]]==sitem1: | |
147 repeats1.append(int(belems[6])) | |
148 repeats2.append(int(belems[13])) | |
149 if belems[4] == 'mononucleotide': | |
150 thresholds.append(mono_threshold) | |
151 else: | |
152 thresholds.append(non_mono_threshold) | |
153 mut1[str(pitem1)+'\t'+str(sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds) | |
154 if region == 'align': | |
155 count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2)) | |
156 else: | |
157 if winspecies == 1: | |
158 count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats1) | |
159 elif winspecies == 2: | |
160 count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats2) | |
161 else: | |
162 if type(sitem1) == list: | |
163 if s_group_cols[0] == 6: | |
164 belems[s_group_cols[0]] = int(belems[s_group_cols[0]]) | |
165 if belems[p_group_cols[0]]==pitem1 and belems[s_group_cols[0]] in sitem1: | |
166 repeats1.append(int(belems[6])) | |
167 repeats2.append(int(belems[13])) | |
168 if belems[4] == 'mononucleotide': | |
169 thresholds.append(mono_threshold) | |
170 else: | |
171 thresholds.append(non_mono_threshold) | |
172 mut1["%s\t%s" %(pitem1,sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds) | |
173 if region == 'align': | |
174 count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2)) | |
175 else: | |
176 if winspecies == 1: | |
177 count1[str(pitem1)+'\t'+str(sitem1)]=sum(repeats1) | |
178 elif winspecies == 2: | |
179 count1[str(pitem1)+'\t'+str(sitem1)]=sum(repeats2) | |
180 else: | |
181 if belems[p_group_cols[0]]==pitem1 and belems[s_group_cols[0]]==sitem1: | |
182 repeats1.append(int(belems[6])) | |
183 repeats2.append(int(belems[13])) | |
184 if belems[4] == 'mononucleotide': | |
185 thresholds.append(mono_threshold) | |
186 else: | |
187 thresholds.append(non_mono_threshold) | |
188 mut1["%s\t%s" %(pitem1,sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds) | |
189 if region == 'align': | |
190 count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2)) | |
191 else: | |
192 if winspecies == 1: | |
193 count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats1) | |
194 elif winspecies == 2: | |
195 count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats2) | |
196 else: #Sub-group by feature is None | |
197 for bline in blk_lines: | |
198 belems = bline.split('\t') | |
199 if type(pitem1) == list: | |
200 #print >>sys.stderr, "item: " + str(item1) | |
201 if p_group_cols[0] == 6: | |
202 belems[p_group_cols[0]] = int(belems[p_group_cols[0]]) | |
203 if belems[p_group_cols[0]] in pitem1: | |
204 repeats1.append(int(belems[6])) | |
205 repeats2.append(int(belems[13])) | |
206 if belems[4] == 'mononucleotide': | |
207 thresholds.append(mono_threshold) | |
208 else: | |
209 thresholds.append(non_mono_threshold) | |
210 else: | |
211 if belems[p_group_cols[0]]==pitem1: | |
212 repeats1.append(int(belems[6])) | |
213 repeats2.append(int(belems[13])) | |
214 if belems[4] == 'mononucleotide': | |
215 thresholds.append(mono_threshold) | |
216 else: | |
217 thresholds.append(non_mono_threshold) | |
218 mut1["%s" %(pitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds) | |
219 if region == 'align': | |
220 count1["%s" %(pitem1)]=min(sum(repeats1),sum(repeats2)) | |
221 else: | |
222 if winspecies == 1: | |
223 count1[str(pitem1)]=sum(repeats1) | |
224 elif winspecies == 2: | |
225 count1[str(pitem1)]=sum(repeats2) | |
226 | |
227 for pitem2 in uniq_elems_2: | |
228 #repeats1 = [] | |
229 #repeats2 = [] | |
230 thresholds = [] | |
231 if s_group_cols[0] != -1: #Sub-group by feature is not None | |
232 for sitem2 in uniq_s_elems_2: | |
233 repeats1 = [] | |
234 repeats2 = [] | |
235 if type(sitem2)==type(''): | |
236 sitem2 = sitem2.strip() | |
237 for bline in blk_lines: | |
238 belems = bline.split('\t') | |
239 if type(pitem2) == list: | |
240 if p_group_cols[0] == 6: | |
241 belems[p_group_cols[1]] = int(belems[p_group_cols[1]]) | |
242 if belems[p_group_cols[1]] in pitem2 and belems[s_group_cols[1]]==sitem2: | |
243 repeats2.append(int(belems[13])) | |
244 repeats1.append(int(belems[6])) | |
245 if belems[4] == 'mononucleotide': | |
246 thresholds.append(mono_threshold) | |
247 else: | |
248 thresholds.append(non_mono_threshold) | |
249 mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds) | |
250 #count2[str(pitem2)+'\t'+str(sitem2)]=len(repeats2) | |
251 if region == 'align': | |
252 count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2)) | |
253 else: | |
254 if winspecies == 1: | |
255 count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2) | |
256 elif winspecies == 2: | |
257 count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1) | |
258 else: | |
259 if type(sitem2) == list: | |
260 if s_group_cols[0] == 6: | |
261 belems[s_group_cols[1]] = int(belems[s_group_cols[1]]) | |
262 if belems[p_group_cols[1]]==pitem2 and belems[s_group_cols[1]] in sitem2: | |
263 repeats2.append(int(belems[13])) | |
264 repeats1.append(int(belems[6])) | |
265 if belems[4] == 'mononucleotide': | |
266 thresholds.append(mono_threshold) | |
267 else: | |
268 thresholds.append(non_mono_threshold) | |
269 mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds) | |
270 if region == 'align': | |
271 count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2)) | |
272 else: | |
273 if winspecies == 1: | |
274 count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2) | |
275 elif winspecies == 2: | |
276 count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1) | |
277 else: | |
278 if belems[p_group_cols[1]]==pitem2 and belems[s_group_cols[1]]==sitem2: | |
279 repeats1.append(int(belems[13])) | |
280 repeats2.append(int(belems[6])) | |
281 if belems[4] == 'mononucleotide': | |
282 thresholds.append(mono_threshold) | |
283 else: | |
284 thresholds.append(non_mono_threshold) | |
285 mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds) | |
286 if region == 'align': | |
287 count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2)) | |
288 else: | |
289 if winspecies == 1: | |
290 count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2) | |
291 elif winspecies == 2: | |
292 count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1) | |
293 else: #Sub-group by feature is None | |
294 for bline in blk_lines: | |
295 belems = bline.split('\t') | |
296 if type(pitem2) == list: | |
297 if p_group_cols[0] == 6: | |
298 belems[p_group_cols[1]] = int(belems[p_group_cols[1]]) | |
299 if belems[p_group_cols[1]] in pitem2: | |
300 repeats2.append(int(belems[13])) | |
301 repeats1.append(int(belems[6])) | |
302 if belems[4] == 'mononucleotide': | |
303 thresholds.append(mono_threshold) | |
304 else: | |
305 thresholds.append(non_mono_threshold) | |
306 else: | |
307 if belems[p_group_cols[1]]==pitem2: | |
308 repeats2.append(int(belems[13])) | |
309 repeats1.append(int(belems[6])) | |
310 if belems[4] == 'mononucleotide': | |
311 thresholds.append(mono_threshold) | |
312 else: | |
313 thresholds.append(non_mono_threshold) | |
314 mut2["%s" %(pitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds) | |
315 if region == 'align': | |
316 count2["%s" %(pitem2)]=min(sum(repeats1),sum(repeats2)) | |
317 else: | |
318 if winspecies == 1: | |
319 count2["%s" %(pitem2)]=sum(repeats2) | |
320 elif winspecies == 2: | |
321 count2["%s" %(pitem2)]=sum(repeats1) | |
322 for key in mut1.keys(): | |
323 if key in mut2.keys(): | |
324 mut = (mut1[key][0]+mut2[key][0])/(mut1[key][1]+mut2[key][1]) | |
325 count = count1[key] | |
326 del mut2[key] | |
327 else: | |
328 unit_found = False | |
329 if p_group_cols[0] == 7 or s_group_cols[0] == 7: #if it is Repeat Unit (AG, GCT etc.) check for reverse-complements too | |
330 if p_group_cols[0] == 7: | |
331 this,other = 0,1 | |
332 else: | |
333 this,other = 1,0 | |
334 groups1 = key.split('\t') | |
335 mutn = mut1[key][0] | |
336 mutd = mut1[key][1] | |
337 count = 0 | |
338 for key2 in mut2.keys(): | |
339 groups2 = key2.split('\t') | |
340 if groups1[other] == groups2[other]: | |
341 if groups1[this] in groups2[this]*2 or reverse_complement(groups1[this]) in groups2[this]*2: | |
342 #mut = (mut1[key][0]+mut2[key2][0])/(mut1[key][1]+mut2[key2][1]) | |
343 mutn += mut2[key2][0] | |
344 mutd += mut2[key2][1] | |
345 count += int(count2[key2]) | |
346 unit_found = True | |
347 del mut2[key2] | |
348 #break | |
349 if unit_found: | |
350 mut = mutn/mutd | |
351 else: | |
352 mut = mut1[key][0]/mut1[key][1] | |
353 count = count1[key] | |
354 mut = "%.2e" %(mut/num_generations) | |
355 if region == 'align': | |
356 print >>fout, str(blk) + '\t'+seq1 + '\t' + seq2 + '\t' +key.strip()+ '\t'+str(mut) + '\t'+ str(count) | |
357 elif region == 'win': | |
358 fout.write("%s\t%s\t%s\t%s\n" %(blk,key.strip(),mut,count)) | |
359 fout.flush() | |
360 | |
361 #catch any remaining repeats, for instance if the orthologous position contained different repeat units | |
362 for remaining_key in mut2.keys(): | |
363 mut = mut2[remaining_key][0]/mut2[remaining_key][1] | |
364 mut = "%.2e" %(mut/num_generations) | |
365 count = count2[remaining_key] | |
366 if region == 'align': | |
367 print >>fout, str(blk) + '\t'+seq1 + '\t'+seq2 + '\t'+remaining_key.strip()+ '\t'+str(mut)+ '\t'+ str(count) | |
368 elif region == 'win': | |
369 fout.write("%s\t%s\t%s\t%s\n" %(blk,remaining_key.strip(),mut,count)) | |
370 fout.flush() | |
371 #print >>fout, blk + '\t'+remaining_key.strip()+ '\t'+str(mut)+ '\t'+ str(count) | |
372 | |
373 def counter(node, start, end, report_func): | |
374 if start <= node.start < end and start < node.end <= end: | |
375 report_func(node) | |
376 if node.right: | |
377 counter(node.right, start, end, report_func) | |
378 if node.left: | |
379 counter(node.left, start, end, report_func) | |
380 elif node.start < start and node.right: | |
381 counter(node.right, start, end, report_func) | |
382 elif node.start >= end and node.left and node.left.maxend > start: | |
383 counter(node.left, start, end, report_func) | |
384 | |
385 | |
386 def main(): | |
387 infile = sys.argv[1] | |
388 | |
389 for i, line in enumerate( file ( infile )): | |
390 line = line.rstrip('\r\n') | |
391 if len( line )>0 and not line.startswith( '#' ): | |
392 elems = line.split( '\t' ) | |
393 break | |
394 if i == 30: | |
395 break # Hopefully we'll never get here... | |
396 | |
397 if len( elems ) != 15: | |
398 stop_err( "This tool only works on tabular data output by 'Extract Orthologous Microsatellites from pair-wise alignments' tool. The data in your input dataset is either missing or not formatted properly." ) | |
399 global winspecies, speciesind | |
400 if region == 'win': | |
401 if dbkey_i in elems[1]: | |
402 winspecies = 1 | |
403 speciesind = 1 | |
404 elif dbkey_i in elems[8]: | |
405 winspecies = 2 | |
406 speciesind = 8 | |
407 else: | |
408 stop_err("The species build corresponding to your interval file is not present in the Microsatellite file.") | |
409 | |
410 fin = open(infile, 'r') | |
411 skipped = 0 | |
412 blk=0 | |
413 win=0 | |
414 linestr="" | |
415 | |
416 if region == 'win': | |
417 | |
418 msats = NiceReaderWrapper( fileinput.FileInput( infile ), | |
419 chrom_col = speciesind, | |
420 start_col = speciesind+1, | |
421 end_col = speciesind+2, | |
422 strand_col = -1, | |
423 fix_strand = True) | |
424 msatTree = quicksect.IntervalTree() | |
425 for item in msats: | |
426 if type( item ) is GenomicInterval: | |
427 msatTree.insert( item, msats.linenum, item.fields ) | |
428 | |
429 for iline in fint: | |
430 try: | |
431 iline = iline.rstrip('\r\n') | |
432 if not(iline) or iline == "": | |
433 continue | |
434 ielems = iline.strip("\r\n").split('\t') | |
435 ichr = ielems[chr_col_i] | |
436 istart = int(ielems[start_col_i]) | |
437 iend = int(ielems[end_col_i]) | |
438 isrc = "%s.%s" %(dbkey_i,ichr) | |
439 if isrc not in msatTree.chroms: | |
440 continue | |
441 result = [] | |
442 root = msatTree.chroms[isrc] #root node for the chrom | |
443 counter(root, istart, iend, lambda node: result.append( node )) | |
444 if not(result): | |
445 continue | |
446 tmpfile1 = tempfile.NamedTemporaryFile('wb+') | |
447 for node in result: | |
448 tmpfile1.write("%s\n" % "\t".join( node.other )) | |
449 | |
450 tmpfile1.seek(0) | |
451 output_writer(iline, tmpfile1.readlines()) | |
452 except: | |
453 skipped+=1 | |
454 if skipped: | |
455 print "Skipped %d intervals as invalid." %(skipped) | |
456 elif region == 'align': | |
457 if s_group_cols[0] != -1: | |
458 print >>fout, "#Window\tSpecies_1\tSpecies_2\tGroupby_Feature\tSubGroupby_Feature\tMutability\tCount" | |
459 else: | |
460 print >>fout, "#Window\tSpecies_1\tWindow_Start\tWindow_End\tSpecies_2\tGroupby_Feature\tMutability\tCount" | |
461 prev_bnum = -1 | |
462 try: | |
463 for line in fin: | |
464 line = line.strip("\r\n") | |
465 if not(line) or line == "": | |
466 continue | |
467 elems = line.split('\t') | |
468 try: | |
469 assert int(elems[0]) | |
470 assert len(elems) == 15 | |
471 except: | |
472 continue | |
473 new_bnum = int(elems[0]) | |
474 if new_bnum != prev_bnum: | |
475 if prev_bnum != -1: | |
476 output_writer(prev_bnum, linestr.strip().replace('\r','\n').split('\n')) | |
477 linestr = line + "\n" | |
478 else: | |
479 linestr += line | |
480 linestr += "\n" | |
481 prev_bnum = new_bnum | |
482 output_writer(prev_bnum, linestr.strip().replace('\r','\n').split('\n')) | |
483 except Exception, ea: | |
484 print >>sys.stderr, ea | |
485 skipped += 1 | |
486 if skipped: | |
487 print "Skipped %d lines as invalid." %(skipped) | |
488 if __name__ == "__main__": | |
489 main() |