Mercurial > repos > davidmurphy > codonlogo
comparison corebio/ssearch_io/fasta.py @ 0:c55bdc2fb9fa
Uploaded
author | davidmurphy |
---|---|
date | Thu, 27 Oct 2011 12:09:09 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c55bdc2fb9fa |
---|---|
1 # Copyright (c) 2006 John Gilman | |
2 # | |
3 # This software is distributed under the MIT Open Source License. | |
4 # <http://www.opensource.org/licenses/mit-license.html> | |
5 # | |
6 # Permission is hereby granted, free of charge, to any person obtaining a | |
7 # copy of this software and associated documentation files (the "Software"), | |
8 # to deal in the Software without restriction, including without limitation | |
9 # the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
10 # and/or sell copies of the Software, and to permit persons to whom the | |
11 # Software is furnished to do so, subject to the following conditions: | |
12 # | |
13 # The above copyright notice and this permission notice shall be included | |
14 # in all copies or substantial portions of the Software. | |
15 # | |
16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
17 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
18 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
19 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
20 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
21 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
22 # THE SOFTWARE. | |
23 | |
24 | |
25 | |
26 | |
27 """Read the output of a fasta sequence similarity search. | |
28 | |
29 FASTA is a DNA and Protein sequence alignment software package first described | |
30 by David J. Lipman and William R. Pearson in 1985. In addition to rapid | |
31 heuristic search methods, the FASTA package provides SSEARCH, an implementation | |
32 of the optimal Smith Waterman algorithm. | |
33 | |
34 The module can parse the output from fasta, ssearch and other search programs | |
35 in the fasta collection. It will parse both default ('-m 1') and compact | |
36 ('-m 9 -d 0') output. | |
37 | |
38 Refs: | |
39 ftp.virginia.edu/pub/fasta | |
40 http://en.wikipedia.org/wiki/FASTA | |
41 """ | |
42 | |
43 | |
44 | |
45 from corebio.utils import Reiterate, Token, isblank | |
46 from corebio.ssearch_io import Report, Result, Hit, Annotation, Alignment | |
47 from math import floor | |
48 import re | |
49 | |
50 __all__ = 'read' | |
51 | |
52 _rangere = re.compile(r"\((\d+)-\d+:(\d+)-\d+\)") | |
53 | |
54 | |
55 def read(fin) : | |
56 """Read and parse a fasta search output file. | |
57 | |
58 returns: a list of Results | |
59 """ | |
60 scanner = _scan(fin) | |
61 | |
62 report = None | |
63 result = None | |
64 hit = None | |
65 #query_seq = None | |
66 #target_seq = None | |
67 alignment_num = 0 | |
68 | |
69 | |
70 for token in scanner : | |
71 #print token | |
72 typeof = token.typeof | |
73 value = token.data | |
74 | |
75 if typeof == 'begin_report' : | |
76 report = Report() | |
77 elif typeof == 'algorithm' : | |
78 report.algorithm = value | |
79 elif typeof == 'algorithm_version' : | |
80 report.algorithm_version = value | |
81 elif typeof == 'algorithm_reference' : | |
82 report.algorithm_reference = value | |
83 elif typeof == 'database_name' : | |
84 report.database_name = value | |
85 elif typeof == 'database_letters' : | |
86 report.database_letters = value | |
87 elif typeof == 'database_entries' : | |
88 report.database_entries = value | |
89 elif typeof == 'end_report' : | |
90 # Final sanity checking | |
91 break | |
92 elif typeof == 'parameter' : | |
93 key = value[0] | |
94 value = value[1] | |
95 report.parameters[key] = value | |
96 | |
97 elif typeof == 'begin_result' : | |
98 result = Result() | |
99 report.results.append(result) | |
100 | |
101 elif typeof == 'query_name' : | |
102 result.query.name = value | |
103 elif typeof == 'query_description' : | |
104 result.query.description = value | |
105 elif typeof == 'end_result' : | |
106 pass | |
107 | |
108 elif typeof == 'begin_hit' : | |
109 hit = Hit() | |
110 elif typeof == 'target_name' : | |
111 hit.target.name = value | |
112 elif typeof == 'target_description' : | |
113 hit.target.description = value | |
114 elif typeof == 'target_length' : | |
115 hit.target.length = value | |
116 elif typeof == 'raw_score' : | |
117 hit.raw_score = value | |
118 elif typeof == 'bit_score' : | |
119 hit.bit_score = value | |
120 elif typeof == 'significance' : | |
121 hit.significance = value | |
122 elif typeof == 'end_hit' : | |
123 result.hits.append(hit) | |
124 hit = None | |
125 | |
126 elif typeof == 'begin_alignment' : | |
127 alignment = Alignment() | |
128 tseq = [] | |
129 qseq = [] | |
130 elif typeof == 'end_alignment' : | |
131 tseq = ''.join(tseq) | |
132 qseq = ''.join(qseq) | |
133 L = max (len(tseq), len(qseq) ) | |
134 tseq = tseq.ljust(L).replace(' ', '.') | |
135 qseq = qseq.ljust(L).replace(' ', '.') | |
136 alignment.query_seq = tseq | |
137 alignment.target_seq = qseq | |
138 result.hits[alignment_num].alignments.append(alignment) | |
139 alignment_num+=1 | |
140 tseq = None | |
141 qseq = None | |
142 elif typeof == 'target_seq' : | |
143 tseq += value | |
144 elif typeof == 'query_seq' : | |
145 qseq += value | |
146 elif typeof == 'alignment_raw_score' : | |
147 alignment.raw_score = value | |
148 | |
149 elif typeof == 'alignment_bit_score' : | |
150 alignment.bit_score = value | |
151 elif typeof == 'alignment_significance' : | |
152 alignment.significance = value | |
153 elif typeof == 'alignment_length' : | |
154 alignment.length = value | |
155 elif typeof == 'alignment_similar' : | |
156 alignment.similar = value | |
157 elif typeof == 'alignment_identical' : | |
158 alignment.identical = value | |
159 elif typeof == 'alignment_query_start' : | |
160 alignment.query_start = value | |
161 elif typeof == 'alignment_target_start' : | |
162 alignment.target_start = value | |
163 | |
164 else: | |
165 # Should never get here. | |
166 raise RuntimeError("Unrecoverable internal parse error (SPE)") | |
167 pass | |
168 | |
169 | |
170 return report | |
171 # End method read() | |
172 | |
173 | |
174 def _scan(fin) : | |
175 | |
176 def next_nonempty(i) : | |
177 L = i.next() | |
178 while L.strip() == '': L = i.next() | |
179 return L | |
180 | |
181 | |
182 lines = Reiterate(iter(fin)) | |
183 try : | |
184 | |
185 yield Token("begin_report", lineno= lines.index()) | |
186 | |
187 # find header line : "SSEARCH searches a sequence data bank" | |
188 L = lines.next() | |
189 | |
190 if L[0] == '#' : | |
191 yield Token("parameter", ("command", L[1:].strip()), lines.index()) | |
192 L = lines.next() | |
193 | |
194 while not L : L= lines.next() | |
195 algorithm = L.split()[0] | |
196 expected = [ "SSEARCH", "FASTA","TFASTA","FASTX", | |
197 "FASTY","TFASTX","TFASTY"] | |
198 if algorithm not in expected: | |
199 raise ValueError("Parse failed: line %d" % lines.index() ) | |
200 yield Token ("algorithm", algorithm, lines.index() ) | |
201 | |
202 # Next line should be the version | |
203 L = lines.next() | |
204 if not L.startswith(" version") : | |
205 raise ValueError("Parse failed: Cannot find version.") | |
206 yield Token( "algorithm_version", L[8:].split()[0].strip(), lines.index()) | |
207 | |
208 # Algorithm reference | |
209 L = lines.next() | |
210 if not L.startswith("Please cite:") : | |
211 raise ValueError("Parse failed: Expecting citation" + L) | |
212 cite = lines.next().strip() + ' ' + lines.next().strip() | |
213 yield Token( "algorithm_reference", cite) | |
214 | |
215 # Find line "searching testset.fa library" | |
216 L = lines.next() | |
217 while not L.startswith("searching") : L = lines.next() | |
218 yield Token("database_name", L[10:-8], lines.index() ) | |
219 | |
220 # Results | |
221 L = lines.next() | |
222 while isblank(L) : L = lines.next() | |
223 if ">>>" not in L : | |
224 raise ValueError("Parse failed on line %d: " % lines.index()) | |
225 | |
226 while ">>>" in L : | |
227 yield Token("begin_result", lineno= lines.index()) | |
228 index = L.find('>>>') | |
229 (name, description) = L[index+3:].split(' ',1) | |
230 yield Token("query_name", name, lines.index()) | |
231 yield Token("query_description", description, lines.index()) | |
232 | |
233 while not L.startswith("The best scores are:") : | |
234 L = lines.next() | |
235 L = lines.next() | |
236 # hits | |
237 while not isblank(L) : | |
238 lineno = lines.index() | |
239 desc = L[0:49] | |
240 yield Token("begin_hit", lineno= lineno) | |
241 yield Token("target_description", desc, lineno, 0) | |
242 yield Token("target_name", desc.split(' ',1)[0], lineno, 0) | |
243 yield Token("target_length", int(L[52:56]), lineno, 52) | |
244 fields = L[57:].split() | |
245 raw, bit, sig = fields[0], fields[1], fields[2] | |
246 #print raw, bit, sig | |
247 yield Token("raw_score", float(raw), lineno, 57) | |
248 yield Token("bit_score", float(bit), lineno) | |
249 yield Token("significance", float(sig), lineno) | |
250 yield Token("end_hit", lineno=lineno) | |
251 L = lines.next() | |
252 | |
253 # Optimal alignment information | |
254 L = next_nonempty(lines) | |
255 #print ">>>", L, L.startswith('>>') | |
256 while L.startswith('>>'): | |
257 if L.startswith('>>>') : break | |
258 | |
259 yield Token("begin_alignment", lineno=lines.index() ) | |
260 | |
261 # 1 2 3 4 | |
262 #01234567890123456789012345678901234567890123456789 | |
263 # s-w opt: 46 Z-score: 70.7 bits: 18.5 E(): 3.6 | |
264 L = lines.next() | |
265 fields = L.split() | |
266 raw, bit, sig = fields[2], fields[6], fields[8] | |
267 yield Token("alignment_raw_score", float(raw), lineno) | |
268 yield Token("alignment_bit_score", float(bit), lineno) | |
269 yield Token("alignment_significance", float(sig), lineno) | |
270 | |
271 #Smith-Waterman score: 46; 38.095% identity (71.429% similar) in 21 aa overlap (2-22:36-56) | |
272 L = lines.next() | |
273 lineno = lines.index() | |
274 fields = L.split() | |
275 assert( len(fields) ==12) | |
276 alen = int(fields[8]) | |
277 identical = int( floor(0.5+alen* float(fields[3][:-1])/100.)) | |
278 similar = int( floor(0.5+alen* float(fields[3][:-1])/100.)) | |
279 yield Token("alignment_length", alen, lineno) | |
280 yield Token("alignment_similar", similar, lineno) | |
281 yield Token("alignment_identical", identical, lineno) | |
282 | |
283 m = _rangere.match( fields[11]) | |
284 assert (m is not None) | |
285 yield Token("alignment_query_start", int(m.group(1))-1, lineno) | |
286 yield Token("alignment_target_start", int(m.group(2))-1, lineno) | |
287 | |
288 | |
289 count = 1 | |
290 while True: | |
291 L = lines.next() | |
292 count += 1 | |
293 | |
294 | |
295 | |
296 if L.startswith('>>'): break | |
297 if '>>>' in L: | |
298 lines.push(L) | |
299 break | |
300 if 'residues' in L and 'sequences' in L : | |
301 lines.push(L) | |
302 break | |
303 if not L or L[0].isspace() : continue | |
304 | |
305 | |
306 # there are 2 lines before the first query sequence (but | |
307 # we started the count at 1). There is 1 line between query | |
308 # and target, 3 lines between target and query, unless the | |
309 # query ends before the ends and the target wraps onto another | |
310 # Then there are two lines between target and target. | |
311 | |
312 # Smith-Waterman score: 34; 35.294% identity ... | |
313 # | |
314 # 30 40 50 60 70 | |
315 # d1pfsa EGFLHLEDKPHPLQCQFFVESVIPAGSYQVPYRINVNNG-RPELAFDFKAMKRA | |
316 # : . . .:: .: .:: | |
317 # d8rxna MKKYVCTVCGYEYDPAEGDPDNGVKPGTSFDDLPADWVCPVCGA | |
318 # 10 20 30 40 | |
319 # | |
320 # d8rxna PKSEFEAA | |
321 # 50 | |
322 | |
323 lineno=lines.index() | |
324 if count==4 : | |
325 yield Token("query_seq", L[7:].rstrip(), lineno) | |
326 else : | |
327 yield Token("target_seq", L[7:].rstrip(),lineno) | |
328 count = 0 | |
329 | |
330 yield Token("end_alignment", lineno=lines.index() ) | |
331 yield Token("end_result", lineno= lines.index()) | |
332 L = next_nonempty(lines) | |
333 # End results | |
334 | |
335 # "13355 residues in 93 query sequences" | |
336 # "13355 residues in 93 library sequences" | |
337 #print '>>', L | |
338 LL = L.split() | |
339 yield Token("database_letters",int(LL[0]), lines.index() ) | |
340 yield Token("database_entries", int(LL[3]), lines.index() ) | |
341 | |
342 yield Token("end_report", lineno= lines.index()) | |
343 except StopIteration : | |
344 raise ValueError("Premature end of file ") | |
345 | |
346 | |
347 | |
348 | |
349 |