comparison tools/regVariation/getIndelRates_3way.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9071e359b9a3
1 #!/usr/bin/env python
2 #Guruprasad Ananda
3
4 from galaxy import eggs
5 import pkg_resources
6 pkg_resources.require( "bx-python" )
7
8 import sys, os, tempfile
9 import traceback
10 import fileinput
11 from warnings import warn
12
13 from galaxy.tools.util.galaxyops import *
14 from bx.intervals.io import *
15
16 from bx.intervals.operations import quicksect
17
18 def stop_err(msg):
19 sys.stderr.write(msg)
20 sys.exit()
21
22 def counter(node, start, end, sort_col):
23 global full, blk_len, blk_list
24 if node.start < start:
25 if node.right:
26 counter(node.right, start, end, sort_col)
27 elif start <= node.start <= end and start <= node.end <= end:
28 full += 1
29 if node.other[0] not in blk_list:
30 blk_list.append(node.other[0])
31 blk_len += int(node.other[sort_col+2])
32 if node.left and node.left.maxend > start:
33 counter(node.left, start, end, sort_col)
34 if node.right:
35 counter(node.right, start, end, sort_col)
36 elif node.start > end:
37 if node.left:
38 counter(node.left, start, end, sort_col)
39
40
41 infile = sys.argv[1]
42 fout = open(sys.argv[2],'w')
43 int_file = sys.argv[3]
44 if int_file != "None": #User has specified an interval file
45 try:
46 fint = open(int_file, 'r')
47 dbkey_i = sys.argv[4]
48 chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[5] )
49 except:
50 stop_err("Unable to open input Interval file")
51
52 def main():
53
54 for i, line in enumerate( file ( infile )):
55 line = line.rstrip('\r\n')
56 if len( line )>0 and not line.startswith( '#' ):
57 elems = line.split( '\t' )
58 break
59 if i == 30:
60 break # Hopefully we'll never get here...
61
62 if len( elems ) != 18:
63 stop_err( "This tool only works on tabular data output by 'Fetch Indels from 3-way alignments' tool. The data in your input dataset is either missing or not formatted properly." )
64
65 for i, line in enumerate( file ( infile )):
66 line = line.rstrip('\r\n')
67 elems = line.split('\t')
68 try:
69 assert int(elems[0])
70 assert len(elems) == 18
71 if int_file != "None":
72 if dbkey_i not in elems[3] and dbkey_i not in elems[8] and dbkey_i not in elems[13]:
73 stop_err("The species build corresponding to your interval file is not present in the Indel file.")
74 if dbkey_i in elems[3]:
75 sort_col = 4
76 elif dbkey_i in elems[8]:
77 sort_col = 9
78 elif dbkey_i in elems[13]:
79 sort_col = 14
80 else:
81 species = []
82 species.append( elems[3].split('.')[0] )
83 species.append( elems[8].split('.')[0] )
84 species.append( elems[13].split('.')[0] )
85 sort_col = 0 #Based on block numbers
86 break
87 except:
88 continue
89
90
91 fin = open(infile, 'r')
92 skipped = 0
93
94 if int_file == "None":
95 sorted_infile = tempfile.NamedTemporaryFile()
96 cmdline = "sort -n -k"+str(1)+" -o "+sorted_infile.name+" "+infile
97 try:
98 os.system(cmdline)
99 except:
100 stop_err("Encountered error while sorting the input file.")
101 print >>fout, "#Block\t%s_InsRate\t%s_InsRate\t%s_InsRate\t%s_DelRate\t%s_DelRate\t%s_DelRate" %(species[0],species[1],species[2],species[0],species[1],species[2])
102 prev_bnum = -1
103 sorted_infile.seek(0)
104 for line in sorted_infile.readlines():
105 line = line.rstrip('\r\n')
106 elems = line.split('\t')
107 try:
108 assert int(elems[0])
109 assert len(elems) == 18
110 new_bnum = int(elems[0])
111 if new_bnum != prev_bnum:
112 if prev_bnum != -1:
113 irate = []
114 drate = []
115 for i,elem in enumerate(inserts):
116 try:
117 irate.append(str("%.2e" %(inserts[i]/blen[i])))
118 except:
119 irate.append('0')
120 try:
121 drate.append(str("%.2e" %(deletes[i]/blen[i])))
122 except:
123 drate.append('0')
124 print >>fout, "%s\t%s\t%s" %(prev_bnum, '\t'.join(irate) , '\t'.join(drate))
125 inserts = [0.0, 0.0, 0.0]
126 deletes = [0.0, 0.0, 0.0]
127 blen = []
128 blen.append( int(elems[6]) )
129 blen.append( int(elems[11]) )
130 blen.append( int(elems[16]) )
131 line_sp = elems[1].split('.')[0]
132 sp_ind = species.index(line_sp)
133 if elems[1].endswith('insert'):
134 inserts[sp_ind] += 1
135 elif elems[1].endswith('delete'):
136 deletes[sp_ind] += 1
137 prev_bnum = new_bnum
138 except Exception, ei:
139 #print >>sys.stderr, ei
140 continue
141 irate = []
142 drate = []
143 for i,elem in enumerate(inserts):
144 try:
145 irate.append(str("%.2e" %(inserts[i]/blen[i])))
146 except:
147 irate.append('0')
148 try:
149 drate.append(str("%.2e" %(deletes[i]/blen[i])))
150 except:
151 drate.append('0')
152 print >>fout, "%s\t%s\t%s" %(prev_bnum, '\t'.join(irate) , '\t'.join(drate))
153 sys.exit()
154
155
156 inf = open(infile, 'r')
157 start_met = False
158 end_met = False
159 sp_file = tempfile.NamedTemporaryFile()
160 for n, line in enumerate(inf):
161 line = line.rstrip('\r\n')
162 elems = line.split('\t')
163 try:
164 assert int(elems[0])
165 assert len(elems) == 18
166 if dbkey_i not in elems[1]:
167 if not(start_met):
168 continue
169 else:
170 sp_end = n
171 break
172 else:
173 print >>sp_file, line
174 if not(start_met):
175 start_met = True
176 sp_start = n
177 except:
178 continue
179
180 try:
181 assert sp_end
182 except:
183 sp_end = n+1
184
185 sp_file.seek(0)
186 win = NiceReaderWrapper( fileinput.FileInput( int_file ),
187 chrom_col=chr_col_i,
188 start_col=start_col_i,
189 end_col=end_col_i,
190 strand_col=strand_col_i,
191 fix_strand=True)
192
193 indel = NiceReaderWrapper( fileinput.FileInput( sp_file.name ),
194 chrom_col=1,
195 start_col=sort_col,
196 end_col=sort_col+1,
197 strand_col=-1,
198 fix_strand=True)
199
200 indelTree = quicksect.IntervalTree()
201 for item in indel:
202 if type( item ) is GenomicInterval:
203 indelTree.insert( item, indel.linenum, item.fields )
204 result=[]
205
206 global full, blk_len, blk_list
207 for interval in win:
208 if type( interval ) is Header:
209 pass
210 if type( interval ) is Comment:
211 pass
212 elif type( interval ) == GenomicInterval:
213 chrom = interval.chrom
214 start = int(interval.start)
215 end = int(interval.end)
216 if start > end:
217 warn( "Interval start after end!" )
218 ins_chr = "%s.%s_insert" %(dbkey_i,chrom)
219 del_chr = "%s.%s_delete" %(dbkey_i,chrom)
220 irate = 0
221 drate = 0
222 if ins_chr not in indelTree.chroms and del_chr not in indelTree.chroms:
223 pass
224 else:
225 if ins_chr in indelTree.chroms:
226 full = 0.0
227 blk_len = 0
228 blk_list = []
229 root = indelTree.chroms[ins_chr] #root node for the chrom insertion tree
230 counter(root, start, end, sort_col)
231 if blk_len:
232 irate = full/blk_len
233
234 if del_chr in indelTree.chroms:
235 full = 0.0
236 blk_len = 0
237 blk_list = []
238 root = indelTree.chroms[del_chr] #root node for the chrom insertion tree
239 counter(root, start, end, sort_col)
240 if blk_len:
241 drate = full/blk_len
242
243 interval.fields.append(str("%.2e" %irate))
244 interval.fields.append(str("%.2e" %drate))
245 print >>fout, "\t".join(interval.fields)
246 fout.flush()
247
248 if __name__ == "__main__":
249 main()