comparison corebio/ssearch_io/blastxml.py @ 7:8d676bbd1f2d

Uploaded
author davidmurphy
date Mon, 16 Jan 2012 07:03:36 -0500
parents c55bdc2fb9fa
children
comparison
equal deleted inserted replaced
6:4a4aca3d57c9 7:8d676bbd1f2d
1
2 # Copyright (c) 2006 John Gilman
3 #
4 # This software is distributed under the MIT Open Source License.
5 # <http://www.opensource.org/licenses/mit-license.html>
6 #
7 # Permission is hereby granted, free of charge, to any person obtaining a
8 # copy of this software and associated documentation files (the "Software"),
9 # to deal in the Software without restriction, including without limitation
10 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 # and/or sell copies of the Software, and to permit persons to whom the
12 # Software is furnished to do so, subject to the following conditions:
13 #
14 # The above copyright notice and this permission notice shall be included
15 # in all copies or substantial portions of the Software.
16 #
17 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 # THE SOFTWARE.
24
25
26 """Read BLAST XML output.
27
28 The DTD is available at
29 http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.mod.dtd
30
31 """
32
33 # See also
34 #
35 # http://bugzilla.open-bio.org/show_bug.cgi?id=1933
36 #http://portal.open-bio.org/pipermail/biojava-dev/2004-December/002513.html
37
38
39 from corebio.ssearch_io import Report, Result, Hit, Annotation, Alignment
40
41 import xml.sax
42 from xml.sax.handler import ContentHandler
43
# __all__ must be a sequence of names.  A bare string is iterated character
# by character, which makes "from ... import *" look for 'r', 'e', 'a', 'd'.
__all__ = ['read']


def read(fin):
    """Parse BLAST XML output and return the resulting Report.

    fin is passed unchanged to the SAX parser's parse() method (typically
    an open file object).

    Returns the Report object that the content handler fills in while the
    document is parsed; the per-query Result objects are appended to its
    ``results`` list.

    Raises ValueError if the SAX parser reports a parse error.
    """
    parser = xml.sax.make_parser()
    handler = _BlastHandler()
    parser.setContentHandler(handler)

    # Switch off validation, namespaces and external entities.
    # To avoid ValueError: unknown url type: NCBI_BlastOutput.dtd
    parser.setFeature(xml.sax.handler.feature_validation, 0)
    parser.setFeature(xml.sax.handler.feature_namespaces, 0)
    parser.setFeature(xml.sax.handler.feature_external_pes, 0)
    parser.setFeature(xml.sax.handler.feature_external_ges, 0)

    try:
        parser.parse(fin)
    except xml.sax.SAXParseException as e:
        # "as" form works on Python 2.6+ and Python 3 alike.
        raise ValueError("Cannot parse file; " + str(e))
    return handler.report
64
class _BlastHandler(ContentHandler):
    """SAX content handler that turns BLAST XML events into a Report.

    A new Report is created when the document starts.  Every <Iteration>
    appends a Result to report.results, every <Hit> appends a Hit to the
    current result, and every <Hsp> appends an Alignment to the current hit.
    When a leaf element closes, its text is copied onto the matching object
    according to the tables below.  Elements not mentioned there (including
    Hsp_query-to and Hsp_hit-to) are ignored.
    """

    # Leaf elements stored as plain attributes:
    #   element name -> (owner, attribute name, converter)
    # "owner" selects the receiving object (see _owner).  A converter of None
    # stores the whitespace-stripped text unchanged.
    _ATTRIBUTES = {
        'BlastOutput_program': ('report', 'algorithm', None),
        'BlastOutput_reference': ('report', 'algorithm_reference', None),
        'BlastOutput_db': ('report', 'database_name', None),

        'Iteration_query-ID': ('query', 'name', None),
        'Iteration_query-def': ('query', 'description', None),
        'Iteration_query-len': ('query', 'length', int),

        'Hit_id': ('target', 'name', None),
        'Hit_def': ('target', 'description', None),
        'Hit_accession': ('target', 'accession', None),
        'Hit_len': ('target', 'length', int),

        'Hsp_bit-score': ('hsp', 'bit_score', float),
        'Hsp_score': ('hsp', 'raw_score', float),
        'Hsp_evalue': ('hsp', 'significance', float),
        # Start positions are stored minus one (presumably converting
        # BLAST's 1-based coordinates to 0-based offsets).
        'Hsp_query-from': ('hsp', 'query_start', lambda text: int(text) - 1),
        'Hsp_hit-from': ('hsp', 'target_start', lambda text: int(text) - 1),
        'Hsp_query-frame': ('hsp', 'query_frame', int),
        'Hsp_hit-frame': ('hsp', 'target_frame', int),
        'Hsp_identity': ('hsp', 'identical', int),
        'Hsp_positive': ('hsp', 'similar', int),
        'Hsp_gaps': ('hsp', 'gaps', int),
        'Hsp_align-len': ('hsp', 'length', int),
        'Hsp_qseq': ('hsp', 'query_seq', None),
        'Hsp_hseq': ('hsp', 'target_seq', None),
    }

    # Parameters_* elements stored as text in report.parameters:
    #   element name -> dictionary key
    _PARAMETERS = {
        'Parameters_matrix': 'matrix',
        'Parameters_expect': 'expect',
        'Parameters_include': 'include',
        'Parameters_sc-match': 'sc-match',
        'Parameters_sc-mismatch': 'sc-mismatch',
        'Parameters_gap-open': 'gap-open',
        'Parameters_gap-extend': 'gap-extend',
        'Parameters_filter': 'filter',
        'Parameters_pattern': 'pattern',
        'Parameters_entrez-query': 'entrez-query',
    }

    # Statistics_* elements stored in the current result's statistics:
    #   element name -> (dictionary key, converter)
    _STATISTICS = {
        'Statistics_db-num': ('db-num', int),
        'Statistics_db-len': ('db-len', int),
        'Statistics_hsp-len': ('hsp-len', int),
        'Statistics_eff-space': ('eff-space', float),
        'Statistics_kappa': ('kappa', float),
        'Statistics_lambda': ('lambda', float),
        'Statistics_entropy': ('entropy', float),
    }

    def __init__(self):
        """Start with no report and no open iteration, hit or HSP."""
        # ContentHandler is called explicitly rather than through super()
        # because it is an old-style class on Python 2.
        ContentHandler.__init__(self)
        self._content = []      # text chunks of the element being read
        self.report = None      # Report, created in startDocument()
        self._result = None     # Result of the <Iteration> being read
        self._hit = None        # Hit currently open, if any
        self._hsp = None        # Alignment of the <Hsp> currently open

    def characters(self, ch):
        """Buffer a chunk of character data until the element closes."""
        self._content.append(ch)

    def startDocument(self):
        """Create the Report that the rest of the document populates."""
        self.report = Report()

    def endDocument(self):
        """Nothing to finalise."""
        pass

    def startElement(self, name, attr):
        """Open a new Result, Hit or Alignment for container elements."""
        # Forget text seen before this tag (normally just indentation) so
        # that endElement() sees exactly the text of the element it closes.
        self._content = []

        if name == 'Iteration':
            self._result = Result()
            self.report.results.append(self._result)
        elif name == 'Hit':
            self._hit = Hit()
            self._result.hits.append(self._hit)
        elif name == 'Hsp':
            self._hsp = Alignment()
            self._hit.alignments.append(self._hsp)

    def endElement(self, name):
        """Store the text of a closing leaf element on its owner object."""
        raw = ''.join(self._content)
        self._content = []
        content = raw.strip()

        if name == 'Hit':
            self._hit = None
        elif name == 'Hsp':
            self._hsp = None
        elif name == 'BlastOutput_version':
            # Usually "<program> <version> [...]"; keep the second token.
            # Fall back to the whole text instead of raising IndexError
            # when fewer than two tokens are present.
            tokens = content.split()
            if len(tokens) > 1:
                self.report.algorithm_version = tokens[1]
            else:
                self.report.algorithm_version = content
        elif name == 'Hsp_midline':
            # Spaces in the midline are alignment columns, so only line
            # breaks are trimmed; stripping spaces would shift the midline
            # relative to query_seq / target_seq.
            self._hsp.mid_seq = raw.strip('\r\n')
        elif name in self._ATTRIBUTES:
            owner, attribute, convert = self._ATTRIBUTES[name]
            value = content if convert is None else convert(content)
            setattr(self._owner(owner), attribute, value)
        elif name in self._PARAMETERS:
            self.report.parameters[self._PARAMETERS[name]] = content
        elif name in self._STATISTICS:
            key, convert = self._STATISTICS[name]
            self._result.statistics[key] = convert(content)
        # Every other element is deliberately ignored.

    def _owner(self, kind):
        """Return the object that receives values for owner tag *kind*."""
        # Resolved lazily: e.g. no result exists yet while the report-level
        # BlastOutput_* elements are being read.
        if kind == 'report':
            return self.report
        if kind == 'query':
            return self._result.query
        if kind == 'target':
            return self._hit.target
        return self._hsp
247
248
249