18
|
1 #!/usr/bin/env python
|
|
2
|
|
3 ##@file
|
|
4 # Convert the output from Refalign (MSA program) into the 'fasta' format.
|
|
5 # Usually used before subsequent analysis such as the estimation of deletion rate.
|
|
6 #
|
|
7 # usage: refalign2fasta.py [ options ]
|
|
8 # options:
|
|
9 # -h: this help
|
|
10 # -i: name of the input file (output from refalign)
|
|
11 # -r: name of the reference sequence (discard if not provided)
|
|
12 # -g: for the gaps, keep only deletions ('d'), only insertions ('i') or both (default='id')
|
|
13 # -o: name of the output file (default=inFileName'.fa_aln',format='fasta')
|
|
14
|
|
15 import os
|
|
16 import sys
|
|
17 import getopt
|
|
18 import exceptions
|
|
19
|
|
20 if not os.environ.has_key( "REPET_PATH" ):
|
|
21 print "ERROR: no environment variable REPET_PATH"
|
|
22 sys.exit(1)
|
|
23 sys.path.append( os.environ["REPET_PATH"] )
|
|
24
|
|
25 import pyRepet.seq.Bioseq
|
|
26
|
|
27
|
|
28 def help():
|
|
29 """
|
|
30 Give the list of the command-line options.
|
|
31 """
|
|
32 print
|
|
33 print "usage:",sys.argv[0]," [ options ]"
|
|
34 print "options:"
|
|
35 print " -h: this help"
|
|
36 print " -i: name of the input file (output from refalign)"
|
|
37 print " -r: name of the reference sequence (discard if not provided)"
|
|
38 print " -g: for the gaps, keep only deletions ('d'), only insertions ('i') or both (default='id')"
|
|
39 print " -o: name of the output file (default=inFileName'.fa_aln',format='fasta')"
|
|
40 print
|
|
41
|
|
42
|
|
43 def getAlignments( inFileName ):
|
|
44 """
|
|
45 Retrieve the alignments from the input file.
|
|
46
|
|
47 @param inFileName: name of the input file
|
|
48 @type: string
|
|
49
|
|
50 @return: list of alignments ( refseq, seq, header of seq )
|
|
51 @rtype: list of 3d-tuples
|
|
52 """
|
|
53
|
|
54 lAlign = []
|
|
55
|
|
56 inFile = open( inFileName, "r" )
|
|
57 line = inFile.readline()
|
|
58 while True:
|
|
59 if line == "":
|
|
60 break
|
|
61 refseq, seq, label = line[:-1].split("\t")[:3]
|
|
62 lAlign.append( ( refseq, seq, label ) )
|
|
63 line = inFile.readline()
|
|
64 inFile.close()
|
|
65
|
|
66 return lAlign
|
|
67
|
|
68
|
|
69 def getGaps( seq ):
|
|
70 """
|
|
71 Get the gaps on a sequence, start coordinate and length. The start
|
|
72 coordinate of a gap is the # of the nucleotide after which it starts.
|
|
73
|
|
74 @param seq: sequence to analyse
|
|
75 @type seq: string
|
|
76
|
|
77 @return: list of gaps ( start coordinate, length )
|
|
78 @rtype: list of 2d-tuples
|
|
79 """
|
|
80
|
|
81 prev = "N"
|
|
82 lGapsOnSeq = []
|
|
83 i = 0
|
|
84 lengthGap = 0
|
|
85 for c in seq:
|
|
86 if c == "-" and prev != "-":
|
|
87 startGap = i
|
|
88 if c != "-" and prev == "-":
|
|
89 lGapsOnSeq.append( ( startGap, lengthGap ) )
|
|
90 lengthGap = 0
|
|
91 if c != "-":
|
|
92 i += 1
|
|
93 else:
|
|
94 lengthGap += 1
|
|
95 prev = c
|
|
96
|
|
97 # case with a gap at the end of the sequence
|
|
98 if seq[ len(seq) - 1 ] == "-":
|
|
99 lGapsOnSeq.append( ( startGap, lengthGap ) )
|
|
100
|
|
101 return lGapsOnSeq
|
|
102
|
|
103
|
|
104 def getGapsOnRefSeq( lAlign ):
|
|
105 """
|
|
106 Retrieve the gaps on the ref seq in all the alignments.
|
|
107
|
|
108 @param lAlign: list of alignments ( refseq, seq, header of seq )
|
|
109 @type lAlign: list of 3d-tuples
|
|
110
|
|
111 @return: list of gaps per alignment
|
|
112 @rtype: list of lists of 2d-tuples
|
|
113 """
|
|
114
|
|
115 lGapsOnRef = []
|
|
116
|
|
117 for align in lAlign:
|
|
118 refseq = align[0]
|
|
119 lGapsOnRef.append( getGaps( refseq ) )
|
|
120
|
|
121 return lGapsOnRef
|
|
122
|
|
123
|
|
124 def insertGap( seq, startGap, lengthGap ):
|
|
125 """
|
|
126 Get a new seq by inserting a gap in the give seq.
|
|
127
|
|
128 @param seq: sequence
|
|
129 @type seq: string
|
|
130
|
|
131 @param startGap:
|
|
132 @type: startGap: integer
|
|
133
|
|
134 @param lengthGap: length of the gap
|
|
135 @type lengthGap: integer
|
|
136
|
|
137 @return: new seq made from the give seq by inserting the gap
|
|
138 @rtype: string
|
|
139 """
|
|
140
|
|
141 new_seq = seq[:startGap] + (lengthGap*'-') + seq[startGap:]
|
|
142 return new_seq
|
|
143
|
|
144
|
|
145 def insertListGaps( inSeq, lGaps ):
|
|
146 """
|
|
147 Insert all the gaps from the list into the sequence.
|
|
148
|
|
149 @param inSeq: sequence
|
|
150 @type inSeq: string
|
|
151
|
|
152 @param lGaps: list of gaps ( start coordinate, length )
|
|
153 @type: list of 2d-tuples
|
|
154
|
|
155 @return: sequence with the gaps
|
|
156 @rtype: string
|
|
157 """
|
|
158
|
|
159 # insert gaps from the right to the left
|
|
160 lGaps.sort()
|
|
161 lGaps.reverse()
|
|
162
|
|
163 prevPos = 0
|
|
164 outSeq = inSeq
|
|
165
|
|
166 for startGap, lengthGap in lGaps:
|
|
167 if startGap != prevPos:
|
|
168 outSeq = insertGap( outSeq, startGap, lengthGap )
|
|
169 prevPos = startGap
|
|
170
|
|
171 return outSeq
|
|
172
|
|
173
|
|
174 def insertGapsInRefSeq( lAlign, lGapsOnRefSeqPerAlign, refseqName ):
|
|
175 """
|
|
176 Get the ref seq with all its gaps inserted.
|
|
177
|
|
178 @param lAlign: list of alignments ( refseq, seq, header of seq )
|
|
179 @type lAlign: list of 3d-tuples
|
|
180
|
|
181 @param lGapsOnRefSeqPerAlign: list of list of gaps on the seq ref per alignment
|
|
182 @type lGapsOnRefSeqPerAlign: list of list of 2d-tuples
|
|
183
|
|
184 @param refseqName: name of the reference sequence
|
|
185 @type refseqName: string
|
|
186
|
|
187 @return: ref seq with all its gaps inserted
|
|
188 @rtype: string
|
|
189 """
|
|
190
|
|
191 # retrieve the initial ref seq, ie without any gap
|
|
192 initRefSeq = lAlign[0][0].replace("-","")
|
|
193
|
|
194 # convert the list of lists of gaps into a list of gaps
|
|
195 flat_lGaps = []
|
|
196 for gaps in lGapsOnRefSeqPerAlign:
|
|
197 flat_lGaps.extend( gaps )
|
|
198
|
|
199 # insert the gaps in the sequence of ref seq
|
|
200 finalRefSeq = insertListGaps( initRefSeq, flat_lGaps )
|
|
201
|
|
202 Align_refseq = ( refseqName, finalRefSeq )
|
|
203
|
|
204 return Align_refseq
|
|
205
|
|
206
|
|
207 def insertgap_seq( lAlign, lGapsOnRefSeqPerAlign ):
|
|
208 """
|
|
209 Insert in the sequences all the gaps found on the ref seq.
|
|
210
|
|
211 @param lAlign: list of alignments ( refseq, seq, header of seq )
|
|
212 @type lAlign: list of 3d-tuples
|
|
213
|
|
214 @param lGapsOnRefSeqPerAlign: list of list of gaps on the seq ref per alignment
|
|
215 @type lGapsOnRefSeqPerAlign: list of list of 2d-tuples
|
|
216
|
|
217 @return: list of lists (sequence with gaps, header)
|
|
218 @rtype: list of 2d-tuples
|
|
219
|
|
220 insert dans les seq les gaps donnes par la liste de liste de gap
|
|
221 retourne une liste des nouvelles seq apres insertion des gaps
|
|
222 """
|
|
223
|
|
224 # for each gap, add the nb of the alignment in which it has been found
|
|
225 newlistgap_seq =[]
|
|
226 alignID = 0
|
|
227 for lGaps in lGapsOnRefSeqPerAlign:
|
|
228 for startGap,lengthGap in lGaps:
|
|
229 newlistgap_seq.append( ( startGap, lengthGap, alignID ) )
|
|
230 alignID += 1
|
|
231 newlistgap_seq.sort()
|
|
232
|
|
233 Align_seq = []
|
|
234
|
|
235 # for each alignment
|
|
236 for j in xrange(0,len(lAlign)):
|
|
237
|
|
238 #create a new list = list of gaps to be inserted in a given seq
|
|
239 newlist = []
|
|
240 offset = 0
|
|
241 longmax = 0
|
|
242 longself = 0
|
|
243 posprec = 0
|
|
244 for startGap, lengthGap, alignID in newlistgap_seq:
|
|
245 # when 2 gaps have the same start, we keep the longer one
|
|
246 if startGap != posprec:
|
|
247 if longmax > longself:
|
|
248 newlist.append( (posprec + offset, longmax - longself) )
|
|
249 offset += longself
|
|
250 longmax=0
|
|
251 longself=0
|
|
252 #lorsque le numero de la seq du tuple a la meme valeur que j
|
|
253 #on stocke la valeur de lengthGap
|
|
254 if j == alignID:
|
|
255 longself = lengthGap
|
|
256 #sinon on prend comme valeur de longmax la valeur maximale de longmax et lengthGap
|
|
257 else:
|
|
258 longmax = max(longmax, lengthGap)
|
|
259 posprec = startGap
|
|
260 if longmax > longself:
|
|
261 newlist.append((posprec + offset, longmax - longself))
|
|
262
|
|
263 newSeq = insertListGaps( (lAlign[j][1]), newlist )
|
|
264 header = lAlign[j][2]
|
|
265 Align_seq.append( ( header, newSeq ) )
|
|
266
|
|
267 return Align_seq
|
|
268
|
|
269
|
|
270 def getSeqWithDeletions( lAlign ):
|
|
271 """
|
|
272 Get the sequences by putting gaps only when they correspond to a deletion compare to ref seq.
|
|
273 Used for instance when we want to estimate the deletion rate.
|
|
274
|
|
275 @param lAlign: list of alignments ( refseq, seq, header of seq )
|
|
276 @type lAlign: list of 3d-tuples
|
|
277
|
|
278 @return: list of lists ( header, sequence with gaps )
|
|
279 @rtype: list of 2d-tuples
|
|
280 """
|
|
281
|
|
282 Align_seq = []
|
|
283
|
|
284 for align in lAlign:
|
|
285 refseq = align[0]
|
|
286 seq = align[1]
|
|
287 header = align[2]
|
|
288 newSeq = ""
|
|
289 for i in xrange(0,len(refseq)):
|
|
290 if refseq[i] != "-":
|
|
291 newSeq += seq[i]
|
|
292 Align_seq.append( ( header, newSeq ) )
|
|
293
|
|
294 return Align_seq
|
|
295
|
|
296
|
|
297 def saveMSA( outFileName, Align_seq, Align_seqref=None ):
|
|
298 """
|
|
299 Save the results as a multiple sequence alignment (MSA) in the 'fasta' format.
|
|
300
|
|
301 @param outFileName: name of the output file
|
|
302 @type outFileName: string
|
|
303
|
|
304 @param Align_seqref: sequence of ref seq
|
|
305 @type Align_seqref: string
|
|
306 """
|
|
307
|
|
308 outFile = open( outFileName, "w" )
|
|
309 bs = pyRepet.seq.Bioseq.Bioseq()
|
|
310
|
|
311 # if provided, save the ref seq
|
|
312 if Align_seqref != None:
|
|
313 bs.header = Align_seqref[0]
|
|
314 bs.sequence = Align_seqref[1]
|
|
315 bs.write( outFile )
|
|
316
|
|
317 # save the other sequences
|
|
318 for i in Align_seq:
|
|
319 bs.header = i[0]
|
|
320 bs.sequence = i[1]
|
|
321 bs.write( outFile )
|
|
322
|
|
323 outFile.close()
|
|
324
|
|
325
|
|
326 def saveOnlyWithDeletions( lAlign, refseqName, outFileName ):
|
|
327 Align_seq = getSeqWithDeletions( lAlign )
|
|
328 if refseqName != "":
|
|
329 Align_seqref = ( refseqName, lAlign[0][0].replace("-","") )
|
|
330 saveMSA( outFileName, Align_seq, Align_seqref )
|
|
331 else:
|
|
332 saveMSA( outFileName, Align_seq )
|
|
333
|
|
334
|
|
335 def main():
|
|
336
|
|
337 inFileName = ""
|
|
338 refseqName = ""
|
|
339 keepGap = "id"
|
|
340 outFileName = ""
|
|
341 global verbose
|
|
342 verbose = 0
|
|
343
|
|
344 try:
|
|
345 opts, args = getopt.getopt(sys.argv[1:],"hi:r:g:o:v:")
|
|
346 except getopt.GetoptError, err:
|
|
347 print str(err)
|
|
348 help()
|
|
349 sys.exit(1)
|
|
350 for o,a in opts:
|
|
351 if o == "-h":
|
|
352 help()
|
|
353 sys.exit(0)
|
|
354 elif o == "-i":
|
|
355 inFileName = a
|
|
356 elif o == "-r":
|
|
357 refseqName = a
|
|
358 elif o == "-g":
|
|
359 keepGap = a
|
|
360 elif o == "-o":
|
|
361 outFileName = a
|
|
362 elif o == "-v":
|
|
363 verbose = int(a)
|
|
364
|
|
365 if inFileName == "":
|
|
366 print "ERROR: missing input file name"
|
|
367 help()
|
|
368 sys.exit(1)
|
|
369
|
|
370 if verbose > 0:
|
|
371 print "START %s" % (sys.argv[0].split("/")[-1])
|
|
372 sys.stdout.flush()
|
|
373
|
|
374 lAlign = getAlignments( inFileName )
|
|
375 if verbose > 0:
|
|
376 print "nb of alignments: %i" % ( len(lAlign) )
|
|
377 sys.stdout.flush()
|
|
378
|
|
379 if outFileName == "":
|
|
380 outFileName = "%s.fa_aln" % ( inFileName )
|
|
381 if verbose > 0:
|
|
382 print "output file: '%s'" % ( outFileName )
|
|
383
|
|
384 if keepGap == "id":
|
|
385 lGapsOnRefSeqPerAlign = getGapsOnRefSeq( lAlign )
|
|
386 Align_seq = insertgap_seq( lAlign, lGapsOnRefSeqPerAlign )
|
|
387 if refseqName != "":
|
|
388 Align_seqref = insertGapsInRefSeq( lAlign, lGapsOnRefSeqPerAlign, refseqName )
|
|
389 saveMSA( outFileName, Align_seq, Align_seqref )
|
|
390 else:
|
|
391 saveMSA( outFileName, Align_seq )
|
|
392
|
|
393 elif keepGap == "d":
|
|
394 saveOnlyWithDeletions( lAlign, refseqName, outFileName )
|
|
395
|
|
396 elif keepGap == "i":
|
|
397 print "ERROR: '-g i' not yet available"
|
|
398 sys.exit(1)
|
|
399
|
|
400 if verbose > 0:
|
|
401 print "END %s" % (sys.argv[0].split("/")[-1])
|
|
402 sys.stdout.flush()
|
|
403
|
|
404 return 0
|
|
405
|
|
406
|
|
407 if __name__ == "__main__" :
|
|
408 main ()
|