Mercurial > repos > xuebing > sharplabtool
comparison tools/regVariation/getIndelRates_3way.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
1 #!/usr/bin/env python | |
2 #Guruprasad Ananda | |
3 | |
4 from galaxy import eggs | |
5 import pkg_resources | |
6 pkg_resources.require( "bx-python" ) | |
7 | |
8 import sys, os, tempfile | |
9 import traceback | |
10 import fileinput | |
11 from warnings import warn | |
12 | |
13 from galaxy.tools.util.galaxyops import * | |
14 from bx.intervals.io import * | |
15 | |
16 from bx.intervals.operations import quicksect | |
17 | |
18 def stop_err(msg): | |
19 sys.stderr.write(msg) | |
20 sys.exit() | |
21 | |
22 def counter(node, start, end, sort_col): | |
23 global full, blk_len, blk_list | |
24 if node.start < start: | |
25 if node.right: | |
26 counter(node.right, start, end, sort_col) | |
27 elif start <= node.start <= end and start <= node.end <= end: | |
28 full += 1 | |
29 if node.other[0] not in blk_list: | |
30 blk_list.append(node.other[0]) | |
31 blk_len += int(node.other[sort_col+2]) | |
32 if node.left and node.left.maxend > start: | |
33 counter(node.left, start, end, sort_col) | |
34 if node.right: | |
35 counter(node.right, start, end, sort_col) | |
36 elif node.start > end: | |
37 if node.left: | |
38 counter(node.left, start, end, sort_col) | |
39 | |
40 | |
41 infile = sys.argv[1] | |
42 fout = open(sys.argv[2],'w') | |
43 int_file = sys.argv[3] | |
44 if int_file != "None": #User has specified an interval file | |
45 try: | |
46 fint = open(int_file, 'r') | |
47 dbkey_i = sys.argv[4] | |
48 chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[5] ) | |
49 except: | |
50 stop_err("Unable to open input Interval file") | |
51 | |
52 def main(): | |
53 | |
54 for i, line in enumerate( file ( infile )): | |
55 line = line.rstrip('\r\n') | |
56 if len( line )>0 and not line.startswith( '#' ): | |
57 elems = line.split( '\t' ) | |
58 break | |
59 if i == 30: | |
60 break # Hopefully we'll never get here... | |
61 | |
62 if len( elems ) != 18: | |
63 stop_err( "This tool only works on tabular data output by 'Fetch Indels from 3-way alignments' tool. The data in your input dataset is either missing or not formatted properly." ) | |
64 | |
65 for i, line in enumerate( file ( infile )): | |
66 line = line.rstrip('\r\n') | |
67 elems = line.split('\t') | |
68 try: | |
69 assert int(elems[0]) | |
70 assert len(elems) == 18 | |
71 if int_file != "None": | |
72 if dbkey_i not in elems[3] and dbkey_i not in elems[8] and dbkey_i not in elems[13]: | |
73 stop_err("The species build corresponding to your interval file is not present in the Indel file.") | |
74 if dbkey_i in elems[3]: | |
75 sort_col = 4 | |
76 elif dbkey_i in elems[8]: | |
77 sort_col = 9 | |
78 elif dbkey_i in elems[13]: | |
79 sort_col = 14 | |
80 else: | |
81 species = [] | |
82 species.append( elems[3].split('.')[0] ) | |
83 species.append( elems[8].split('.')[0] ) | |
84 species.append( elems[13].split('.')[0] ) | |
85 sort_col = 0 #Based on block numbers | |
86 break | |
87 except: | |
88 continue | |
89 | |
90 | |
91 fin = open(infile, 'r') | |
92 skipped = 0 | |
93 | |
94 if int_file == "None": | |
95 sorted_infile = tempfile.NamedTemporaryFile() | |
96 cmdline = "sort -n -k"+str(1)+" -o "+sorted_infile.name+" "+infile | |
97 try: | |
98 os.system(cmdline) | |
99 except: | |
100 stop_err("Encountered error while sorting the input file.") | |
101 print >>fout, "#Block\t%s_InsRate\t%s_InsRate\t%s_InsRate\t%s_DelRate\t%s_DelRate\t%s_DelRate" %(species[0],species[1],species[2],species[0],species[1],species[2]) | |
102 prev_bnum = -1 | |
103 sorted_infile.seek(0) | |
104 for line in sorted_infile.readlines(): | |
105 line = line.rstrip('\r\n') | |
106 elems = line.split('\t') | |
107 try: | |
108 assert int(elems[0]) | |
109 assert len(elems) == 18 | |
110 new_bnum = int(elems[0]) | |
111 if new_bnum != prev_bnum: | |
112 if prev_bnum != -1: | |
113 irate = [] | |
114 drate = [] | |
115 for i,elem in enumerate(inserts): | |
116 try: | |
117 irate.append(str("%.2e" %(inserts[i]/blen[i]))) | |
118 except: | |
119 irate.append('0') | |
120 try: | |
121 drate.append(str("%.2e" %(deletes[i]/blen[i]))) | |
122 except: | |
123 drate.append('0') | |
124 print >>fout, "%s\t%s\t%s" %(prev_bnum, '\t'.join(irate) , '\t'.join(drate)) | |
125 inserts = [0.0, 0.0, 0.0] | |
126 deletes = [0.0, 0.0, 0.0] | |
127 blen = [] | |
128 blen.append( int(elems[6]) ) | |
129 blen.append( int(elems[11]) ) | |
130 blen.append( int(elems[16]) ) | |
131 line_sp = elems[1].split('.')[0] | |
132 sp_ind = species.index(line_sp) | |
133 if elems[1].endswith('insert'): | |
134 inserts[sp_ind] += 1 | |
135 elif elems[1].endswith('delete'): | |
136 deletes[sp_ind] += 1 | |
137 prev_bnum = new_bnum | |
138 except Exception, ei: | |
139 #print >>sys.stderr, ei | |
140 continue | |
141 irate = [] | |
142 drate = [] | |
143 for i,elem in enumerate(inserts): | |
144 try: | |
145 irate.append(str("%.2e" %(inserts[i]/blen[i]))) | |
146 except: | |
147 irate.append('0') | |
148 try: | |
149 drate.append(str("%.2e" %(deletes[i]/blen[i]))) | |
150 except: | |
151 drate.append('0') | |
152 print >>fout, "%s\t%s\t%s" %(prev_bnum, '\t'.join(irate) , '\t'.join(drate)) | |
153 sys.exit() | |
154 | |
155 | |
156 inf = open(infile, 'r') | |
157 start_met = False | |
158 end_met = False | |
159 sp_file = tempfile.NamedTemporaryFile() | |
160 for n, line in enumerate(inf): | |
161 line = line.rstrip('\r\n') | |
162 elems = line.split('\t') | |
163 try: | |
164 assert int(elems[0]) | |
165 assert len(elems) == 18 | |
166 if dbkey_i not in elems[1]: | |
167 if not(start_met): | |
168 continue | |
169 else: | |
170 sp_end = n | |
171 break | |
172 else: | |
173 print >>sp_file, line | |
174 if not(start_met): | |
175 start_met = True | |
176 sp_start = n | |
177 except: | |
178 continue | |
179 | |
180 try: | |
181 assert sp_end | |
182 except: | |
183 sp_end = n+1 | |
184 | |
185 sp_file.seek(0) | |
186 win = NiceReaderWrapper( fileinput.FileInput( int_file ), | |
187 chrom_col=chr_col_i, | |
188 start_col=start_col_i, | |
189 end_col=end_col_i, | |
190 strand_col=strand_col_i, | |
191 fix_strand=True) | |
192 | |
193 indel = NiceReaderWrapper( fileinput.FileInput( sp_file.name ), | |
194 chrom_col=1, | |
195 start_col=sort_col, | |
196 end_col=sort_col+1, | |
197 strand_col=-1, | |
198 fix_strand=True) | |
199 | |
200 indelTree = quicksect.IntervalTree() | |
201 for item in indel: | |
202 if type( item ) is GenomicInterval: | |
203 indelTree.insert( item, indel.linenum, item.fields ) | |
204 result=[] | |
205 | |
206 global full, blk_len, blk_list | |
207 for interval in win: | |
208 if type( interval ) is Header: | |
209 pass | |
210 if type( interval ) is Comment: | |
211 pass | |
212 elif type( interval ) == GenomicInterval: | |
213 chrom = interval.chrom | |
214 start = int(interval.start) | |
215 end = int(interval.end) | |
216 if start > end: | |
217 warn( "Interval start after end!" ) | |
218 ins_chr = "%s.%s_insert" %(dbkey_i,chrom) | |
219 del_chr = "%s.%s_delete" %(dbkey_i,chrom) | |
220 irate = 0 | |
221 drate = 0 | |
222 if ins_chr not in indelTree.chroms and del_chr not in indelTree.chroms: | |
223 pass | |
224 else: | |
225 if ins_chr in indelTree.chroms: | |
226 full = 0.0 | |
227 blk_len = 0 | |
228 blk_list = [] | |
229 root = indelTree.chroms[ins_chr] #root node for the chrom insertion tree | |
230 counter(root, start, end, sort_col) | |
231 if blk_len: | |
232 irate = full/blk_len | |
233 | |
234 if del_chr in indelTree.chroms: | |
235 full = 0.0 | |
236 blk_len = 0 | |
237 blk_list = [] | |
238 root = indelTree.chroms[del_chr] #root node for the chrom insertion tree | |
239 counter(root, start, end, sort_col) | |
240 if blk_len: | |
241 drate = full/blk_len | |
242 | |
243 interval.fields.append(str("%.2e" %irate)) | |
244 interval.fields.append(str("%.2e" %drate)) | |
245 print >>fout, "\t".join(interval.fields) | |
246 fout.flush() | |
247 | |
248 if __name__ == "__main__": | |
249 main() |