0
|
1 #!/usr/bin/env python
|
|
2 #Guruprasad Ananda
|
|
3 """
|
|
4 Uses SPUTNIK to fetch microsatellites and extracts orthologous repeats from the sputnik output.
|
|
5 """
|
|
6 from galaxy import eggs
|
|
7 import sys, os, tempfile, string, math, re
|
|
8
|
|
9 def reverse_complement(text):
|
|
10 DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" )
|
|
11 comp = [ch for ch in text.translate(DNA_COMP)]
|
|
12 comp.reverse()
|
|
13 return "".join(comp)
|
|
14
|
|
15 def main():
|
|
16 if len(sys.argv) != 8:
|
|
17 print >>sys.stderr, "Insufficient number of arguments."
|
|
18 sys.exit()
|
|
19
|
|
20 infile = open(sys.argv[1],'r')
|
|
21 separation = int(sys.argv[2])
|
|
22 outfile = sys.argv[3]
|
|
23 align_type = sys.argv[4]
|
|
24 if align_type == "2way":
|
|
25 align_type_len = 2
|
|
26 elif align_type == "3way":
|
|
27 align_type_len = 3
|
|
28 mono_threshold = int(sys.argv[5])
|
|
29 non_mono_threshold = int(sys.argv[6])
|
|
30 allow_different_units = int(sys.argv[7])
|
|
31
|
|
32 print "Min distance = %d bp; Min threshold for mono repeats = %d; Min threshold for non-mono repeats = %d; Allow different motifs = %s" %(separation, mono_threshold, non_mono_threshold, allow_different_units==1)
|
|
33 try:
|
|
34 fout = open(outfile, "w")
|
|
35 print >>fout, "#Block\tSeq1_Name\tSeq1_Start\tSeq1_End\tSeq1_Type\tSeq1_Length\tSeq1_RepeatNumber\tSeq1_Unit\tSeq2_Name\tSeq2_Start\tSeq2_End\tSeq2_Type\tSeq2_Length\tSeq2_RepeatNumber\tSeq2_Unit"
|
|
36 #sputnik_cmd = os.path.join(os.path.split(sys.argv[0])[0], "sputnik")
|
|
37 sputnik_cmd = "sputnik"
|
|
38 input = infile.read()
|
|
39 skipped = 0
|
|
40 block_num = 0
|
|
41 input = input.replace('\r','\n')
|
|
42 for block in input.split('\n\n'):
|
|
43 block_num += 1
|
|
44 tmpin = tempfile.NamedTemporaryFile()
|
|
45 tmpout = tempfile.NamedTemporaryFile()
|
|
46 tmpin.write(block.strip())
|
|
47 blk = tmpin.read()
|
|
48 cmdline = sputnik_cmd + " " + tmpin.name + " > /dev/null 2>&1 >> " + tmpout.name
|
|
49 try:
|
|
50 os.system(cmdline)
|
|
51 except Exception, es:
|
|
52 continue
|
|
53 sputnik_out = tmpout.read()
|
|
54 tmpin.close()
|
|
55 tmpout.close()
|
|
56 if sputnik_out != "":
|
|
57 if len(block.split('>')[1:]) != 2: #len(sputnik_out.split('>')):
|
|
58 skipped += 1
|
|
59 continue
|
|
60 align_block = block.strip().split('>')
|
|
61
|
|
62 lendict = {'mononucleotide':1, 'dinucleotide':2, 'trinucleotide':3, 'tetranucleotide':4, 'pentanucleotide':5, 'hexanucleotide':6}
|
|
63 blockdict={}
|
|
64 r=0
|
|
65 namelist=[]
|
|
66 for k,sput_block in enumerate(sputnik_out.split('>')[1:]):
|
|
67 whole_seq = ''.join(align_block[k+1].split('\n')[1:]).replace('\n','').strip()
|
|
68 p = re.compile('\n(\S*nucleotide)')
|
|
69 repeats = p.split(sput_block.strip())
|
|
70 repeats_count = len(repeats)
|
|
71 j = 1
|
|
72 name = repeats[0].strip()
|
|
73 try:
|
|
74 coords = re.search('\d+[-_:]\d+',name).group()
|
|
75 coords = coords.replace('_','-').replace(':','-')
|
|
76 except Exception, e:
|
|
77 coords = '0-0'
|
|
78 pass
|
|
79 r += 1
|
|
80 blockdict[r]={}
|
|
81 try:
|
|
82 sp_name = name[:name.index('.')]
|
|
83 chr_name = name[name.index('.'):name.index('(')]
|
|
84 namelist.append(sp_name + chr_name)
|
|
85 except:
|
|
86 namelist.append(name[:20])
|
|
87 while j < repeats_count:
|
|
88 try:
|
|
89 if repeats[j].strip() not in lendict:
|
|
90 j += 2
|
|
91 continue
|
|
92
|
|
93 if blockdict[r].has_key('types'):
|
|
94 blockdict[r]['types'].append(repeats[j].strip()) #type of microsat
|
|
95 else:
|
|
96 blockdict[r]['types'] = [repeats[j].strip()] #type of microsat
|
|
97
|
|
98 sequence = ''.join(align_block[r].split('\n')[1:]).replace('\n','').strip()
|
|
99 start = int(repeats[j+1].split('--')[0].split(':')[0].strip())
|
|
100 #check to see if there are gaps before the start of the repeat, and change the start accordingly
|
|
101 sgaps = 0
|
|
102 ch_pos = start - 1
|
|
103 while ch_pos >= 0:
|
|
104 if whole_seq[ch_pos] == '-':
|
|
105 sgaps += 1
|
|
106 else:
|
|
107 break #break at the 1st non-gap character
|
|
108 ch_pos -= 1
|
|
109 if blockdict[r].has_key('starts'):
|
|
110 blockdict[r]['starts'].append(start+sgaps) #start co-ords adjusted with alignment co-ords to include GAPS
|
|
111 else:
|
|
112 blockdict[r]['starts'] = [start+sgaps]
|
|
113
|
|
114 end = int(repeats[j+1].split('--')[0].split(':')[1].strip())
|
|
115 #check to see if there are gaps after the end of the repeat, and change the end accordingly
|
|
116 egaps = 0
|
|
117 for ch in whole_seq[end:]:
|
|
118 if ch == '-':
|
|
119 egaps += 1
|
|
120 else:
|
|
121 break #break at the 1st non-gap character
|
|
122 if blockdict[r].has_key('ends'):
|
|
123 blockdict[r]['ends'].append(end+egaps) #end co-ords adjusted with alignment co-ords to include GAPS
|
|
124 else:
|
|
125 blockdict[r]['ends'] = [end+egaps]
|
|
126
|
|
127 repeat_seq = ''.join(repeats[j+1].replace('\r','\n').split('\n')[1:]).strip() #Repeat Sequence
|
|
128 repeat_len = repeats[j+1].split('--')[1].split()[1].strip()
|
|
129 gap_count = repeat_seq.count('-')
|
|
130 #print repeats[j+1].split('--')[1], len(repeat_seq), repeat_len, gap_count
|
|
131 repeat_len = str(int(repeat_len) - gap_count)
|
|
132
|
|
133 rel_start = blockdict[r]['starts'][-1]
|
|
134 gaps_before_start = whole_seq[:rel_start].count('-')
|
|
135
|
|
136 if blockdict[r].has_key('gaps_before_start'):
|
|
137 blockdict[r]['gaps_before_start'].append(gaps_before_start) #lengths
|
|
138 else:
|
|
139 blockdict[r]['gaps_before_start'] = [gaps_before_start] #lengths
|
|
140
|
|
141 whole_seq_start= int(coords.split('-')[0])
|
|
142 if blockdict[r].has_key('whole_seq_start'):
|
|
143 blockdict[r]['whole_seq_start'].append(whole_seq_start) #lengths
|
|
144 else:
|
|
145 blockdict[r]['whole_seq_start'] = [whole_seq_start] #lengths
|
|
146
|
|
147 if blockdict[r].has_key('lengths'):
|
|
148 blockdict[r]['lengths'].append(repeat_len) #lengths
|
|
149 else:
|
|
150 blockdict[r]['lengths'] = [repeat_len] #lengths
|
|
151
|
|
152 if blockdict[r].has_key('counts'):
|
|
153 blockdict[r]['counts'].append(str(int(repeat_len)/lendict[repeats[j].strip()])) #Repeat Unit
|
|
154 else:
|
|
155 blockdict[r]['counts'] = [str(int(repeat_len)/lendict[repeats[j].strip()])] #Repeat Unit
|
|
156
|
|
157 if blockdict[r].has_key('units'):
|
|
158 blockdict[r]['units'].append(repeat_seq[:lendict[repeats[j].strip()]]) #Repeat Unit
|
|
159 else:
|
|
160 blockdict[r]['units'] = [repeat_seq[:lendict[repeats[j].strip()]]] #Repeat Unit
|
|
161
|
|
162 except Exception, eh:
|
|
163 pass
|
|
164 j+=2
|
|
165 #check the co-ords of all repeats corresponding to a sequence and remove adjacent repeats separated by less than the user-specified 'separation'.
|
|
166 delete_index_list = []
|
|
167 for ind, item in enumerate(blockdict[r]['ends']):
|
|
168 try:
|
|
169 if blockdict[r]['starts'][ind+1]-item < separation:
|
|
170 if ind not in delete_index_list:
|
|
171 delete_index_list.append(ind)
|
|
172 if ind+1 not in delete_index_list:
|
|
173 delete_index_list.append(ind+1)
|
|
174 except Exception, ek:
|
|
175 pass
|
|
176 for index in delete_index_list: #mark them for deletion
|
|
177 try:
|
|
178 blockdict[r]['starts'][index] = 'marked'
|
|
179 blockdict[r]['ends'][index] = 'marked'
|
|
180 blockdict[r]['types'][index] = 'marked'
|
|
181 blockdict[r]['gaps_before_start'][index] = 'marked'
|
|
182 blockdict[r]['whole_seq_start'][index] = 'marked'
|
|
183 blockdict[r]['lengths'][index] = 'marked'
|
|
184 blockdict[r]['counts'][index] = 'marked'
|
|
185 blockdict[r]['units'][index] = 'marked'
|
|
186 except Exception, ej:
|
|
187 pass
|
|
188 #remove 'marked' elements from all the lists
|
|
189 """
|
|
190 for key in blockdict[r].keys():
|
|
191 for elem in blockdict[r][key]:
|
|
192 if elem == 'marked':
|
|
193 blockdict[r][key].remove(elem)
|
|
194 """
|
|
195 #print blockdict
|
|
196
|
|
197 #make sure that the blockdict has keys for both the species
|
|
198 if (1 not in blockdict) or (2 not in blockdict):
|
|
199 continue
|
|
200
|
|
201 visited_2 = [0 for x in range(len(blockdict[2]['starts']))]
|
|
202 for ind1,coord_s1 in enumerate(blockdict[1]['starts']):
|
|
203 if coord_s1 == 'marked':
|
|
204 continue
|
|
205 coord_e1 = blockdict[1]['ends'][ind1]
|
|
206 out = []
|
|
207 for ind2,coord_s2 in enumerate(blockdict[2]['starts']):
|
|
208 if coord_s2 == 'marked':
|
|
209 visited_2[ind2] = 1
|
|
210 continue
|
|
211 coord_e2 = blockdict[2]['ends'][ind2]
|
|
212 #skip if the 2 repeats are not of the same type or don't have the same repeating unit.
|
|
213 if allow_different_units == 0:
|
|
214 if (blockdict[1]['types'][ind1] != blockdict[2]['types'][ind2]):
|
|
215 continue
|
|
216 else:
|
|
217 if (blockdict[1]['units'][ind1] not in blockdict[2]['units'][ind2]*2) and (reverse_complement(blockdict[1]['units'][ind1]) not in blockdict[2]['units'][ind2]*2):
|
|
218 continue
|
|
219 #print >>sys.stderr, (reverse_complement(blockdict[1]['units'][ind1]) not in blockdict[2]['units'][ind2]*2)
|
|
220 #skip if the repeat number thresholds are not met
|
|
221 if blockdict[1]['types'][ind1] == 'mononucleotide':
|
|
222 if (int(blockdict[1]['counts'][ind1]) < mono_threshold):
|
|
223 continue
|
|
224 else:
|
|
225 if (int(blockdict[1]['counts'][ind1]) < non_mono_threshold):
|
|
226 continue
|
|
227
|
|
228 if blockdict[2]['types'][ind2] == 'mononucleotide':
|
|
229 if (int(blockdict[2]['counts'][ind2]) < mono_threshold):
|
|
230 continue
|
|
231 else:
|
|
232 if (int(blockdict[2]['counts'][ind2]) < non_mono_threshold):
|
|
233 continue
|
|
234 #print "s1,e1=%s,%s; s2,e2=%s,%s" %(coord_s1,coord_e1,coord_s2,coord_e2)
|
|
235 if (coord_s1 in range(coord_s2,coord_e2)) or (coord_e1 in range(coord_s2,coord_e2)):
|
|
236 out.append(str(block_num))
|
|
237 out.append(namelist[0])
|
|
238 rel_start = blockdict[1]['whole_seq_start'][ind1] + coord_s1 - blockdict[1]['gaps_before_start'][ind1]
|
|
239 rel_end = rel_start + int(blockdict[1]['lengths'][ind1])
|
|
240 out.append(str(rel_start))
|
|
241 out.append(str(rel_end))
|
|
242 out.append(blockdict[1]['types'][ind1])
|
|
243 out.append(blockdict[1]['lengths'][ind1])
|
|
244 out.append(blockdict[1]['counts'][ind1])
|
|
245 out.append(blockdict[1]['units'][ind1])
|
|
246 out.append(namelist[1])
|
|
247 rel_start = blockdict[2]['whole_seq_start'][ind2] + coord_s2 - blockdict[2]['gaps_before_start'][ind2]
|
|
248 rel_end = rel_start + int(blockdict[2]['lengths'][ind2])
|
|
249 out.append(str(rel_start))
|
|
250 out.append(str(rel_end))
|
|
251 out.append(blockdict[2]['types'][ind2])
|
|
252 out.append(blockdict[2]['lengths'][ind2])
|
|
253 out.append(blockdict[2]['counts'][ind2])
|
|
254 out.append(blockdict[2]['units'][ind2])
|
|
255 print >>fout, '\t'.join(out)
|
|
256 visited_2[ind2] = 1
|
|
257 out=[]
|
|
258
|
|
259 if 0 in visited_2: #there are still some elements in 2nd set which haven't found orthologs yet.
|
|
260 for ind2, coord_s2 in enumerate(blockdict[2]['starts']):
|
|
261 if coord_s2 == 'marked':
|
|
262 continue
|
|
263 if visited_2[ind] != 0:
|
|
264 continue
|
|
265 coord_e2 = blockdict[2]['ends'][ind2]
|
|
266 out = []
|
|
267 for ind1,coord_s1 in enumerate(blockdict[1]['starts']):
|
|
268 if coord_s1 == 'marked':
|
|
269 continue
|
|
270 coord_e1 = blockdict[1]['ends'][ind1]
|
|
271 #skip if the 2 repeats are not of the same type or don't have the same repeating unit.
|
|
272 if allow_different_units == 0:
|
|
273 if (blockdict[1]['types'][ind1] != blockdict[2]['types'][ind2]):
|
|
274 continue
|
|
275 else:
|
|
276 if (blockdict[1]['units'][ind1] not in blockdict[2]['units'][ind2]*2):# and reverse_complement(blockdict[1]['units'][ind1]) not in blockdict[2]['units'][ind2]*2:
|
|
277 continue
|
|
278 #skip if the repeat number thresholds are not met
|
|
279 if blockdict[1]['types'][ind1] == 'mononucleotide':
|
|
280 if (int(blockdict[1]['counts'][ind1]) < mono_threshold):
|
|
281 continue
|
|
282 else:
|
|
283 if (int(blockdict[1]['counts'][ind1]) < non_mono_threshold):
|
|
284 continue
|
|
285
|
|
286 if blockdict[2]['types'][ind2] == 'mononucleotide':
|
|
287 if (int(blockdict[2]['counts'][ind2]) < mono_threshold):
|
|
288 continue
|
|
289 else:
|
|
290 if (int(blockdict[2]['counts'][ind2]) < non_mono_threshold):
|
|
291 continue
|
|
292
|
|
293 if (coord_s2 in range(coord_s1,coord_e1)) or (coord_e2 in range(coord_s1,coord_e1)):
|
|
294 out.append(str(block_num))
|
|
295 out.append(namelist[0])
|
|
296 rel_start = blockdict[1]['whole_seq_start'][ind1] + coord_s1 - blockdict[1]['gaps_before_start'][ind1]
|
|
297 rel_end = rel_start + int(blockdict[1]['lengths'][ind1])
|
|
298 out.append(str(rel_start))
|
|
299 out.append(str(rel_end))
|
|
300 out.append(blockdict[1]['types'][ind1])
|
|
301 out.append(blockdict[1]['lengths'][ind1])
|
|
302 out.append(blockdict[1]['counts'][ind1])
|
|
303 out.append(blockdict[1]['units'][ind1])
|
|
304 out.append(namelist[1])
|
|
305 rel_start = blockdict[2]['whole_seq_start'][ind2] + coord_s2 - blockdict[2]['gaps_before_start'][ind2]
|
|
306 rel_end = rel_start + int(blockdict[2]['lengths'][ind2])
|
|
307 out.append(str(rel_start))
|
|
308 out.append(str(rel_end))
|
|
309 out.append(blockdict[2]['types'][ind2])
|
|
310 out.append(blockdict[2]['lengths'][ind2])
|
|
311 out.append(blockdict[2]['counts'][ind2])
|
|
312 out.append(blockdict[2]['units'][ind2])
|
|
313 print >>fout, '\t'.join(out)
|
|
314 visited_2[ind2] = 1
|
|
315 out=[]
|
|
316
|
|
317 #print >>fout, blockdict
|
|
318 except Exception, exc:
|
|
319 print >>sys.stderr, "type(exc),args,exc: %s, %s, %s" %(type(exc), exc.args, exc)
|
|
320
|
|
321 if __name__ == "__main__":
|
|
322 main()
|
|
323
|