0
|
1 #!/usr/bin/env python
|
|
2 #Guruprasad Ananda
|
|
3
|
|
4 from galaxy import eggs
|
|
5 import pkg_resources
|
|
6 pkg_resources.require( "bx-python" )
|
|
7
|
|
8 import sys, os, tempfile
|
|
9 import traceback
|
|
10 import fileinput
|
|
11 from warnings import warn
|
|
12
|
|
13 from galaxy.tools.util.galaxyops import *
|
|
14 from bx.intervals.io import *
|
|
15
|
|
16 from bx.intervals.operations import quicksect
|
|
17
|
|
18 def stop_err(msg):
|
|
19 sys.stderr.write(msg)
|
|
20 sys.exit()
|
|
21
|
|
22 def counter(node, start, end, sort_col):
|
|
23 global full, blk_len, blk_list
|
|
24 if node.start < start:
|
|
25 if node.right:
|
|
26 counter(node.right, start, end, sort_col)
|
|
27 elif start <= node.start <= end and start <= node.end <= end:
|
|
28 full += 1
|
|
29 if node.other[0] not in blk_list:
|
|
30 blk_list.append(node.other[0])
|
|
31 blk_len += int(node.other[sort_col+2])
|
|
32 if node.left and node.left.maxend > start:
|
|
33 counter(node.left, start, end, sort_col)
|
|
34 if node.right:
|
|
35 counter(node.right, start, end, sort_col)
|
|
36 elif node.start > end:
|
|
37 if node.left:
|
|
38 counter(node.left, start, end, sort_col)
|
|
39
|
|
40
|
|
41 infile = sys.argv[1]
|
|
42 fout = open(sys.argv[2],'w')
|
|
43 int_file = sys.argv[3]
|
|
44 if int_file != "None": #User has specified an interval file
|
|
45 try:
|
|
46 fint = open(int_file, 'r')
|
|
47 dbkey_i = sys.argv[4]
|
|
48 chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[5] )
|
|
49 except:
|
|
50 stop_err("Unable to open input Interval file")
|
|
51
|
|
52 def main():
|
|
53
|
|
54 for i, line in enumerate( file ( infile )):
|
|
55 line = line.rstrip('\r\n')
|
|
56 if len( line )>0 and not line.startswith( '#' ):
|
|
57 elems = line.split( '\t' )
|
|
58 break
|
|
59 if i == 30:
|
|
60 break # Hopefully we'll never get here...
|
|
61
|
|
62 if len( elems ) != 18:
|
|
63 stop_err( "This tool only works on tabular data output by 'Fetch Indels from 3-way alignments' tool. The data in your input dataset is either missing or not formatted properly." )
|
|
64
|
|
65 for i, line in enumerate( file ( infile )):
|
|
66 line = line.rstrip('\r\n')
|
|
67 elems = line.split('\t')
|
|
68 try:
|
|
69 assert int(elems[0])
|
|
70 assert len(elems) == 18
|
|
71 if int_file != "None":
|
|
72 if dbkey_i not in elems[3] and dbkey_i not in elems[8] and dbkey_i not in elems[13]:
|
|
73 stop_err("The species build corresponding to your interval file is not present in the Indel file.")
|
|
74 if dbkey_i in elems[3]:
|
|
75 sort_col = 4
|
|
76 elif dbkey_i in elems[8]:
|
|
77 sort_col = 9
|
|
78 elif dbkey_i in elems[13]:
|
|
79 sort_col = 14
|
|
80 else:
|
|
81 species = []
|
|
82 species.append( elems[3].split('.')[0] )
|
|
83 species.append( elems[8].split('.')[0] )
|
|
84 species.append( elems[13].split('.')[0] )
|
|
85 sort_col = 0 #Based on block numbers
|
|
86 break
|
|
87 except:
|
|
88 continue
|
|
89
|
|
90
|
|
91 fin = open(infile, 'r')
|
|
92 skipped = 0
|
|
93
|
|
94 if int_file == "None":
|
|
95 sorted_infile = tempfile.NamedTemporaryFile()
|
|
96 cmdline = "sort -n -k"+str(1)+" -o "+sorted_infile.name+" "+infile
|
|
97 try:
|
|
98 os.system(cmdline)
|
|
99 except:
|
|
100 stop_err("Encountered error while sorting the input file.")
|
|
101 print >>fout, "#Block\t%s_InsRate\t%s_InsRate\t%s_InsRate\t%s_DelRate\t%s_DelRate\t%s_DelRate" %(species[0],species[1],species[2],species[0],species[1],species[2])
|
|
102 prev_bnum = -1
|
|
103 sorted_infile.seek(0)
|
|
104 for line in sorted_infile.readlines():
|
|
105 line = line.rstrip('\r\n')
|
|
106 elems = line.split('\t')
|
|
107 try:
|
|
108 assert int(elems[0])
|
|
109 assert len(elems) == 18
|
|
110 new_bnum = int(elems[0])
|
|
111 if new_bnum != prev_bnum:
|
|
112 if prev_bnum != -1:
|
|
113 irate = []
|
|
114 drate = []
|
|
115 for i,elem in enumerate(inserts):
|
|
116 try:
|
|
117 irate.append(str("%.2e" %(inserts[i]/blen[i])))
|
|
118 except:
|
|
119 irate.append('0')
|
|
120 try:
|
|
121 drate.append(str("%.2e" %(deletes[i]/blen[i])))
|
|
122 except:
|
|
123 drate.append('0')
|
|
124 print >>fout, "%s\t%s\t%s" %(prev_bnum, '\t'.join(irate) , '\t'.join(drate))
|
|
125 inserts = [0.0, 0.0, 0.0]
|
|
126 deletes = [0.0, 0.0, 0.0]
|
|
127 blen = []
|
|
128 blen.append( int(elems[6]) )
|
|
129 blen.append( int(elems[11]) )
|
|
130 blen.append( int(elems[16]) )
|
|
131 line_sp = elems[1].split('.')[0]
|
|
132 sp_ind = species.index(line_sp)
|
|
133 if elems[1].endswith('insert'):
|
|
134 inserts[sp_ind] += 1
|
|
135 elif elems[1].endswith('delete'):
|
|
136 deletes[sp_ind] += 1
|
|
137 prev_bnum = new_bnum
|
|
138 except Exception, ei:
|
|
139 #print >>sys.stderr, ei
|
|
140 continue
|
|
141 irate = []
|
|
142 drate = []
|
|
143 for i,elem in enumerate(inserts):
|
|
144 try:
|
|
145 irate.append(str("%.2e" %(inserts[i]/blen[i])))
|
|
146 except:
|
|
147 irate.append('0')
|
|
148 try:
|
|
149 drate.append(str("%.2e" %(deletes[i]/blen[i])))
|
|
150 except:
|
|
151 drate.append('0')
|
|
152 print >>fout, "%s\t%s\t%s" %(prev_bnum, '\t'.join(irate) , '\t'.join(drate))
|
|
153 sys.exit()
|
|
154
|
|
155
|
|
156 inf = open(infile, 'r')
|
|
157 start_met = False
|
|
158 end_met = False
|
|
159 sp_file = tempfile.NamedTemporaryFile()
|
|
160 for n, line in enumerate(inf):
|
|
161 line = line.rstrip('\r\n')
|
|
162 elems = line.split('\t')
|
|
163 try:
|
|
164 assert int(elems[0])
|
|
165 assert len(elems) == 18
|
|
166 if dbkey_i not in elems[1]:
|
|
167 if not(start_met):
|
|
168 continue
|
|
169 else:
|
|
170 sp_end = n
|
|
171 break
|
|
172 else:
|
|
173 print >>sp_file, line
|
|
174 if not(start_met):
|
|
175 start_met = True
|
|
176 sp_start = n
|
|
177 except:
|
|
178 continue
|
|
179
|
|
180 try:
|
|
181 assert sp_end
|
|
182 except:
|
|
183 sp_end = n+1
|
|
184
|
|
185 sp_file.seek(0)
|
|
186 win = NiceReaderWrapper( fileinput.FileInput( int_file ),
|
|
187 chrom_col=chr_col_i,
|
|
188 start_col=start_col_i,
|
|
189 end_col=end_col_i,
|
|
190 strand_col=strand_col_i,
|
|
191 fix_strand=True)
|
|
192
|
|
193 indel = NiceReaderWrapper( fileinput.FileInput( sp_file.name ),
|
|
194 chrom_col=1,
|
|
195 start_col=sort_col,
|
|
196 end_col=sort_col+1,
|
|
197 strand_col=-1,
|
|
198 fix_strand=True)
|
|
199
|
|
200 indelTree = quicksect.IntervalTree()
|
|
201 for item in indel:
|
|
202 if type( item ) is GenomicInterval:
|
|
203 indelTree.insert( item, indel.linenum, item.fields )
|
|
204 result=[]
|
|
205
|
|
206 global full, blk_len, blk_list
|
|
207 for interval in win:
|
|
208 if type( interval ) is Header:
|
|
209 pass
|
|
210 if type( interval ) is Comment:
|
|
211 pass
|
|
212 elif type( interval ) == GenomicInterval:
|
|
213 chrom = interval.chrom
|
|
214 start = int(interval.start)
|
|
215 end = int(interval.end)
|
|
216 if start > end:
|
|
217 warn( "Interval start after end!" )
|
|
218 ins_chr = "%s.%s_insert" %(dbkey_i,chrom)
|
|
219 del_chr = "%s.%s_delete" %(dbkey_i,chrom)
|
|
220 irate = 0
|
|
221 drate = 0
|
|
222 if ins_chr not in indelTree.chroms and del_chr not in indelTree.chroms:
|
|
223 pass
|
|
224 else:
|
|
225 if ins_chr in indelTree.chroms:
|
|
226 full = 0.0
|
|
227 blk_len = 0
|
|
228 blk_list = []
|
|
229 root = indelTree.chroms[ins_chr] #root node for the chrom insertion tree
|
|
230 counter(root, start, end, sort_col)
|
|
231 if blk_len:
|
|
232 irate = full/blk_len
|
|
233
|
|
234 if del_chr in indelTree.chroms:
|
|
235 full = 0.0
|
|
236 blk_len = 0
|
|
237 blk_list = []
|
|
238 root = indelTree.chroms[del_chr] #root node for the chrom insertion tree
|
|
239 counter(root, start, end, sort_col)
|
|
240 if blk_len:
|
|
241 drate = full/blk_len
|
|
242
|
|
243 interval.fields.append(str("%.2e" %irate))
|
|
244 interval.fields.append(str("%.2e" %drate))
|
|
245 print >>fout, "\t".join(interval.fields)
|
|
246 fout.flush()
|
|
247
|
|
248 if __name__ == "__main__":
|
|
249 main() |