annotate commons/tools/refalign2fasta.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
18
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
1 #!/usr/bin/env python
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
2
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
3 ##@file
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
4 # Convert the output from Refalign (MSA program) into the 'fasta' format.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
5 # Usually used before subsequent analysis such as the estimation of deletion rate.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
6 #
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
7 # usage: refalign2fasta.py [ options ]
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
8 # options:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
9 # -h: this help
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
10 # -i: name of the input file (output from refalign)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
11 # -r: name of the reference sequence (discard if not provided)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
12 # -g: for the gaps, keep only deletions ('d'), only insertions ('i') or both (default='id')
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
13 # -o: name of the output file (default=inFileName'.fa_aln',format='fasta')
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
14
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
15 import os
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
16 import sys
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
17 import getopt
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
18 import exceptions
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
19
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
20 if not os.environ.has_key( "REPET_PATH" ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
21 print "ERROR: no environment variable REPET_PATH"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
22 sys.exit(1)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
23 sys.path.append( os.environ["REPET_PATH"] )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
24
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
25 import pyRepet.seq.Bioseq
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
26
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
27
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
28 def help():
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
29 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
30 Give the list of the command-line options.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
31 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
32 print
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
33 print "usage:",sys.argv[0]," [ options ]"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
34 print "options:"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
35 print " -h: this help"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
36 print " -i: name of the input file (output from refalign)"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
37 print " -r: name of the reference sequence (discard if not provided)"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
38 print " -g: for the gaps, keep only deletions ('d'), only insertions ('i') or both (default='id')"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
39 print " -o: name of the output file (default=inFileName'.fa_aln',format='fasta')"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
40 print
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
41
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
42
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
43 def getAlignments( inFileName ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
44 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
45 Retrieve the alignments from the input file.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
46
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
47 @param inFileName: name of the input file
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
48 @type: string
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
49
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
50 @return: list of alignments ( refseq, seq, header of seq )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
51 @rtype: list of 3d-tuples
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
52 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
53
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
54 lAlign = []
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
55
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
56 inFile = open( inFileName, "r" )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
57 line = inFile.readline()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
58 while True:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
59 if line == "":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
60 break
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
61 refseq, seq, label = line[:-1].split("\t")[:3]
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
62 lAlign.append( ( refseq, seq, label ) )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
63 line = inFile.readline()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
64 inFile.close()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
65
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
66 return lAlign
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
67
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
68
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
69 def getGaps( seq ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
70 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
71 Get the gaps on a sequence, start coordinate and length. The start
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
72 coordinate of a gap is the # of the nucleotide after which it starts.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
73
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
74 @param seq: sequence to analyse
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
75 @type seq: string
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
76
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
77 @return: list of gaps ( start coordinate, length )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
78 @rtype: list of 2d-tuples
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
79 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
80
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
81 prev = "N"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
82 lGapsOnSeq = []
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
83 i = 0
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
84 lengthGap = 0
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
85 for c in seq:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
86 if c == "-" and prev != "-":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
87 startGap = i
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
88 if c != "-" and prev == "-":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
89 lGapsOnSeq.append( ( startGap, lengthGap ) )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
90 lengthGap = 0
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
91 if c != "-":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
92 i += 1
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
93 else:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
94 lengthGap += 1
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
95 prev = c
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
96
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
97 # case with a gap at the end of the sequence
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
98 if seq[ len(seq) - 1 ] == "-":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
99 lGapsOnSeq.append( ( startGap, lengthGap ) )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
100
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
101 return lGapsOnSeq
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
102
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
103
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
104 def getGapsOnRefSeq( lAlign ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
105 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
106 Retrieve the gaps on the ref seq in all the alignments.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
107
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
108 @param lAlign: list of alignments ( refseq, seq, header of seq )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
109 @type lAlign: list of 3d-tuples
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
110
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
111 @return: list of gaps per alignment
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
112 @rtype: list of lists of 2d-tuples
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
113 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
114
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
115 lGapsOnRef = []
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
116
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
117 for align in lAlign:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
118 refseq = align[0]
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
119 lGapsOnRef.append( getGaps( refseq ) )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
120
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
121 return lGapsOnRef
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
122
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
123
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
124 def insertGap( seq, startGap, lengthGap ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
125 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
126 Get a new seq by inserting a gap in the give seq.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
127
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
128 @param seq: sequence
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
129 @type seq: string
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
130
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
131 @param startGap:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
132 @type: startGap: integer
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
133
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
134 @param lengthGap: length of the gap
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
135 @type lengthGap: integer
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
136
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
137 @return: new seq made from the give seq by inserting the gap
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
138 @rtype: string
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
139 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
140
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
141 new_seq = seq[:startGap] + (lengthGap*'-') + seq[startGap:]
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
142 return new_seq
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
143
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
144
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
145 def insertListGaps( inSeq, lGaps ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
146 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
147 Insert all the gaps from the list into the sequence.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
148
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
149 @param inSeq: sequence
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
150 @type inSeq: string
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
151
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
152 @param lGaps: list of gaps ( start coordinate, length )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
153 @type: list of 2d-tuples
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
154
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
155 @return: sequence with the gaps
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
156 @rtype: string
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
157 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
158
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
159 # insert gaps from the right to the left
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
160 lGaps.sort()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
161 lGaps.reverse()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
162
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
163 prevPos = 0
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
164 outSeq = inSeq
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
165
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
166 for startGap, lengthGap in lGaps:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
167 if startGap != prevPos:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
168 outSeq = insertGap( outSeq, startGap, lengthGap )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
169 prevPos = startGap
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
170
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
171 return outSeq
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
172
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
173
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
174 def insertGapsInRefSeq( lAlign, lGapsOnRefSeqPerAlign, refseqName ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
175 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
176 Get the ref seq with all its gaps inserted.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
177
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
178 @param lAlign: list of alignments ( refseq, seq, header of seq )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
179 @type lAlign: list of 3d-tuples
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
180
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
181 @param lGapsOnRefSeqPerAlign: list of list of gaps on the seq ref per alignment
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
182 @type lGapsOnRefSeqPerAlign: list of list of 2d-tuples
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
183
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
184 @param refseqName: name of the reference sequence
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
185 @type refseqName: string
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
186
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
187 @return: ref seq with all its gaps inserted
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
188 @rtype: string
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
189 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
190
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
191 # retrieve the initial ref seq, ie without any gap
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
192 initRefSeq = lAlign[0][0].replace("-","")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
193
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
194 # convert the list of lists of gaps into a list of gaps
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
195 flat_lGaps = []
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
196 for gaps in lGapsOnRefSeqPerAlign:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
197 flat_lGaps.extend( gaps )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
198
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
199 # insert the gaps in the sequence of ref seq
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
200 finalRefSeq = insertListGaps( initRefSeq, flat_lGaps )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
201
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
202 Align_refseq = ( refseqName, finalRefSeq )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
203
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
204 return Align_refseq
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
205
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
206
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
207 def insertgap_seq( lAlign, lGapsOnRefSeqPerAlign ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
208 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
209 Insert in the sequences all the gaps found on the ref seq.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
210
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
211 @param lAlign: list of alignments ( refseq, seq, header of seq )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
212 @type lAlign: list of 3d-tuples
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
213
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
214 @param lGapsOnRefSeqPerAlign: list of list of gaps on the seq ref per alignment
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
215 @type lGapsOnRefSeqPerAlign: list of list of 2d-tuples
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
216
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
217 @return: list of lists (sequence with gaps, header)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
218 @rtype: list of 2d-tuples
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
219
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
220 insert dans les seq les gaps donnes par la liste de liste de gap
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
221 retourne une liste des nouvelles seq apres insertion des gaps
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
222 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
223
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
224 # for each gap, add the nb of the alignment in which it has been found
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
225 newlistgap_seq =[]
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
226 alignID = 0
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
227 for lGaps in lGapsOnRefSeqPerAlign:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
228 for startGap,lengthGap in lGaps:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
229 newlistgap_seq.append( ( startGap, lengthGap, alignID ) )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
230 alignID += 1
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
231 newlistgap_seq.sort()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
232
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
233 Align_seq = []
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
234
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
235 # for each alignment
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
236 for j in xrange(0,len(lAlign)):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
237
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
238 #create a new list = list of gaps to be inserted in a given seq
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
239 newlist = []
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
240 offset = 0
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
241 longmax = 0
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
242 longself = 0
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
243 posprec = 0
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
244 for startGap, lengthGap, alignID in newlistgap_seq:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
245 # when 2 gaps have the same start, we keep the longer one
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
246 if startGap != posprec:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
247 if longmax > longself:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
248 newlist.append( (posprec + offset, longmax - longself) )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
249 offset += longself
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
250 longmax=0
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
251 longself=0
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
252 #lorsque le numero de la seq du tuple a la meme valeur que j
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
253 #on stocke la valeur de lengthGap
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
254 if j == alignID:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
255 longself = lengthGap
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
256 #sinon on prend comme valeur de longmax la valeur maximale de longmax et lengthGap
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
257 else:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
258 longmax = max(longmax, lengthGap)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
259 posprec = startGap
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
260 if longmax > longself:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
261 newlist.append((posprec + offset, longmax - longself))
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
262
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
263 newSeq = insertListGaps( (lAlign[j][1]), newlist )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
264 header = lAlign[j][2]
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
265 Align_seq.append( ( header, newSeq ) )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
266
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
267 return Align_seq
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
268
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
269
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
270 def getSeqWithDeletions( lAlign ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
271 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
272 Get the sequences by putting gaps only when they correspond to a deletion compare to ref seq.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
273 Used for instance when we want to estimate the deletion rate.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
274
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
275 @param lAlign: list of alignments ( refseq, seq, header of seq )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
276 @type lAlign: list of 3d-tuples
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
277
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
278 @return: list of lists ( header, sequence with gaps )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
279 @rtype: list of 2d-tuples
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
280 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
281
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
282 Align_seq = []
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
283
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
284 for align in lAlign:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
285 refseq = align[0]
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
286 seq = align[1]
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
287 header = align[2]
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
288 newSeq = ""
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
289 for i in xrange(0,len(refseq)):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
290 if refseq[i] != "-":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
291 newSeq += seq[i]
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
292 Align_seq.append( ( header, newSeq ) )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
293
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
294 return Align_seq
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
295
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
296
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
297 def saveMSA( outFileName, Align_seq, Align_seqref=None ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
298 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
299 Save the results as a multiple sequence alignment (MSA) in the 'fasta' format.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
300
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
301 @param outFileName: name of the output file
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
302 @type outFileName: string
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
303
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
304 @param Align_seqref: sequence of ref seq
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
305 @type Align_seqref: string
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
306 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
307
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
308 outFile = open( outFileName, "w" )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
309 bs = pyRepet.seq.Bioseq.Bioseq()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
310
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
311 # if provided, save the ref seq
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
312 if Align_seqref != None:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
313 bs.header = Align_seqref[0]
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
314 bs.sequence = Align_seqref[1]
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
315 bs.write( outFile )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
316
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
317 # save the other sequences
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
318 for i in Align_seq:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
319 bs.header = i[0]
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
320 bs.sequence = i[1]
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
321 bs.write( outFile )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
322
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
323 outFile.close()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
324
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
325
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
326 def saveOnlyWithDeletions( lAlign, refseqName, outFileName ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
327 Align_seq = getSeqWithDeletions( lAlign )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
328 if refseqName != "":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
329 Align_seqref = ( refseqName, lAlign[0][0].replace("-","") )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
330 saveMSA( outFileName, Align_seq, Align_seqref )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
331 else:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
332 saveMSA( outFileName, Align_seq )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
333
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
334
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
335 def main():
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
336
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
337 inFileName = ""
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
338 refseqName = ""
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
339 keepGap = "id"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
340 outFileName = ""
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
341 global verbose
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
342 verbose = 0
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
343
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
344 try:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
345 opts, args = getopt.getopt(sys.argv[1:],"hi:r:g:o:v:")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
346 except getopt.GetoptError, err:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
347 print str(err)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
348 help()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
349 sys.exit(1)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
350 for o,a in opts:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
351 if o == "-h":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
352 help()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
353 sys.exit(0)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
354 elif o == "-i":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
355 inFileName = a
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
356 elif o == "-r":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
357 refseqName = a
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
358 elif o == "-g":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
359 keepGap = a
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
360 elif o == "-o":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
361 outFileName = a
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
362 elif o == "-v":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
363 verbose = int(a)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
364
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
365 if inFileName == "":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
366 print "ERROR: missing input file name"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
367 help()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
368 sys.exit(1)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
369
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
370 if verbose > 0:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
371 print "START %s" % (sys.argv[0].split("/")[-1])
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
372 sys.stdout.flush()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
373
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
374 lAlign = getAlignments( inFileName )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
375 if verbose > 0:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
376 print "nb of alignments: %i" % ( len(lAlign) )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
377 sys.stdout.flush()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
378
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
379 if outFileName == "":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
380 outFileName = "%s.fa_aln" % ( inFileName )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
381 if verbose > 0:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
382 print "output file: '%s'" % ( outFileName )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
383
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
384 if keepGap == "id":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
385 lGapsOnRefSeqPerAlign = getGapsOnRefSeq( lAlign )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
386 Align_seq = insertgap_seq( lAlign, lGapsOnRefSeqPerAlign )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
387 if refseqName != "":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
388 Align_seqref = insertGapsInRefSeq( lAlign, lGapsOnRefSeqPerAlign, refseqName )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
389 saveMSA( outFileName, Align_seq, Align_seqref )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
390 else:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
391 saveMSA( outFileName, Align_seq )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
392
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
393 elif keepGap == "d":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
394 saveOnlyWithDeletions( lAlign, refseqName, outFileName )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
395
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
396 elif keepGap == "i":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
397 print "ERROR: '-g i' not yet available"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
398 sys.exit(1)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
399
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
400 if verbose > 0:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
401 print "END %s" % (sys.argv[0].split("/")[-1])
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
402 sys.stdout.flush()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
403
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
404 return 0
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
405
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
406
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
407 if __name__ == "__main__" :
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
408 main ()