0
|
1
|
|
2 # Copyright (c) 2006 John Gilman
|
|
3 #
|
|
4 # This software is distributed under the MIT Open Source License.
|
|
5 # <http://www.opensource.org/licenses/mit-license.html>
|
|
6 #
|
|
7 # Permission is hereby granted, free of charge, to any person obtaining a
|
|
8 # copy of this software and associated documentation files (the "Software"),
|
|
9 # to deal in the Software without restriction, including without limitation
|
|
10 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
11 # and/or sell copies of the Software, and to permit persons to whom the
|
|
12 # Software is furnished to do so, subject to the following conditions:
|
|
13 #
|
|
14 # The above copyright notice and this permission notice shall be included
|
|
15 # in all copies or substantial portions of the Software.
|
|
16 #
|
|
17 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
23 # THE SOFTWARE.
|
|
24
|
|
25
|
|
26 """Read BLAST XML output.
|
|
27
|
|
28 The DTD is available at
|
|
29 http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.mod.dtd
|
|
30
|
|
31 """
|
|
32
|
|
33 # See also
|
|
34 #
|
|
35 # http://bugzilla.open-bio.org/show_bug.cgi?id=1933
|
|
36 #http://portal.open-bio.org/pipermail/biojava-dev/2004-December/002513.html
|
|
37
|
|
38
|
|
39 from corebio.ssearch_io import Report, Result, Hit, Annotation, Alignment
|
|
40
|
|
41 import xml.sax
|
|
42 from xml.sax.handler import ContentHandler
|
|
43
|
|
# Public API of this module. ``__all__`` must be a sequence of names: a bare
# string would be iterated character by character by ``from ... import *``.
__all__ = ['read']
|
|
45
|
|
def read(fin):
    """Read BLAST XML output and return the parsed report.

    ``fin`` is handed unchanged to the SAX parser's ``parse`` method.
    (Presumably a file name or an open file-like object -- confirm against
    callers.)

    Returns the ``Report`` instance built by the content handler; the
    per-iteration results are appended to ``report.results`` as they are
    encountered.

    Raises ValueError if the SAX parser reports a parse error.
    """
    parser = xml.sax.make_parser()
    handler = _BlastHandler()
    parser.setContentHandler(handler)

    # To avoid ValueError: unknown url type: NCBI_BlastOutput.dtd
    # Validation and external entity loading are switched off so the parser
    # never tries to fetch the DTD referenced by the document.
    parser.setFeature(xml.sax.handler.feature_validation, 0)
    parser.setFeature(xml.sax.handler.feature_namespaces, 0)
    parser.setFeature(xml.sax.handler.feature_external_pes, 0)
    parser.setFeature(xml.sax.handler.feature_external_ges, 0)

    try:
        parser.parse(fin)
    except xml.sax.SAXParseException as e:
        # Surface parser failures as a plain ValueError for callers.
        raise ValueError("Cannot parse file; " + str(e))
    return handler.report
|
|
64
|
|
def _version_from_banner(banner):
    """Return the version token of a BLAST version banner.

    The second whitespace-separated token is taken as the version (e.g.
    ``"blastp 2.2.12 [Aug-07-2005]"`` gives ``"2.2.12"``). When the banner
    has fewer than two tokens it is returned unchanged instead of raising
    IndexError.
    """
    tokens = banner.split()
    if len(tokens) > 1:
        return tokens[1]
    return banner


def _as_text(text):
    """Return the element text unchanged."""
    return text


def _zero_based(text):
    """Parse an integer coordinate and shift it down by one.

    Presumably converts BLAST's 1-based positions to 0-based offsets.
    """
    return int(text) - 1


class _BlastHandler(ContentHandler):
    """SAX content handler that fills a Report from BLAST XML events.

    Opening tags create the container objects (a Result per ``Iteration``,
    a Hit per ``Hit``, an Alignment per ``Hsp``); closing tags store the
    accumulated, whitespace-stripped element text on the current object.
    Elements that appear in none of the tables below are ignored.
    """

    # Closing tag -> (attribute of the Report, converter).
    _REPORT_FIELDS = {
        'BlastOutput_program': ('algorithm', _as_text),
        'BlastOutput_version': ('algorithm_version', _version_from_banner),
        'BlastOutput_reference': ('algorithm_reference', _as_text),
        'BlastOutput_db': ('database_name', _as_text),
    }

    # Closing tag -> key in ``report.parameters`` (values are kept as text).
    _PARAMETER_KEYS = {
        'Parameters_matrix': 'matrix',
        'Parameters_expect': 'expect',
        'Parameters_include': 'include',
        'Parameters_sc-match': 'sc-match',
        'Parameters_sc-mismatch': 'sc-mismatch',
        'Parameters_gap-open': 'gap-open',
        'Parameters_gap-extend': 'gap-extend',
        'Parameters_filter': 'filter',
        'Parameters_pattern': 'pattern',
        'Parameters_entrez-query': 'entrez-query',
    }

    # Closing tag -> (attribute of ``result.query``, converter).
    _QUERY_FIELDS = {
        'Iteration_query-ID': ('name', _as_text),
        'Iteration_query-def': ('description', _as_text),
        'Iteration_query-len': ('length', int),
    }

    # Closing tag -> (key in ``result.statistics``, converter).
    _STATISTICS = {
        'Statistics_db-num': ('db-num', int),
        'Statistics_db-len': ('db-len', int),
        'Statistics_hsp-len': ('hsp-len', int),
        'Statistics_eff-space': ('eff-space', float),
        'Statistics_kappa': ('kappa', float),
        'Statistics_lambda': ('lambda', float),
        'Statistics_entropy': ('entropy', float),
    }

    # Closing tag -> (attribute of ``hit.target``, converter).
    _HIT_FIELDS = {
        'Hit_id': ('name', _as_text),
        'Hit_def': ('description', _as_text),
        'Hit_accession': ('accession', _as_text),
        'Hit_len': ('length', int),
    }

    # Closing tag -> (attribute of the current alignment, converter).
    # Hsp_query-to and Hsp_hit-to are deliberately absent: the end
    # coordinates are not stored.
    # NOTE(review): the midline is stripped like every other element, so
    # leading blanks are lost -- confirm that consumers do not need them to
    # keep the midline aligned with the query/target sequences.
    _HSP_FIELDS = {
        'Hsp_bit-score': ('bit_score', float),
        'Hsp_score': ('raw_score', float),
        'Hsp_evalue': ('significance', float),
        'Hsp_query-from': ('query_start', _zero_based),
        'Hsp_hit-from': ('target_start', _zero_based),
        'Hsp_query-frame': ('query_frame', int),
        'Hsp_hit-frame': ('target_frame', int),
        'Hsp_identity': ('identical', int),
        'Hsp_positive': ('similar', int),
        'Hsp_gaps': ('gaps', int),
        'Hsp_align-len': ('length', int),
        'Hsp_qseq': ('query_seq', _as_text),
        'Hsp_hseq': ('target_seq', _as_text),
        'Hsp_midline': ('mid_seq', _as_text),
    }

    def __init__(self):
        """Create a handler with no report and no open result/hit/HSP."""
        ContentHandler.__init__(self)
        self._content = []    # text chunks of the element being read
        self.report = None    # created in startDocument
        self._result = None   # Result of the most recent <Iteration>
        self._hit = None      # Hit currently open, if any
        self._hsp = None      # Alignment currently open, if any

    def characters(self, ch):
        """Buffer a chunk of character data until the element closes."""
        self._content.append(ch)

    def startDocument(self):
        """Start a fresh Report for the document being parsed."""
        self.report = Report()

    def endDocument(self):
        """Nothing to finalise; the report is complete as built."""
        pass

    def startElement(self, name, attr):
        """Create and register the container object for an opening tag.

        Only ``Iteration``, ``Hit`` and ``Hsp`` create objects; every other
        element (and all attributes) is ignored here.
        """
        if name == 'Iteration':
            self._result = Result()
            self.report.results.append(self._result)
        elif name == 'Hit':
            self._hit = Hit()
            self._result.hits.append(self._hit)
        elif name == 'Hsp':
            self._hsp = Alignment()
            self._hit.alignments.append(self._hsp)

    def endElement(self, name):
        """Store the text of the element that just closed.

        The buffered text is joined, stripped and the buffer reset on every
        closing tag, whether or not the element is recognised. The value is
        converted before the destination object is touched, so a malformed
        number raises from the converter first.
        """
        content = ''.join(self._content).strip()
        self._content = []

        if name == 'Hit':
            self._hit = None
        elif name == 'Hsp':
            self._hsp = None
        elif name in self._REPORT_FIELDS:
            attribute, convert = self._REPORT_FIELDS[name]
            value = convert(content)
            setattr(self.report, attribute, value)
        elif name in self._PARAMETER_KEYS:
            self.report.parameters[self._PARAMETER_KEYS[name]] = content
        elif name in self._QUERY_FIELDS:
            attribute, convert = self._QUERY_FIELDS[name]
            value = convert(content)
            setattr(self._result.query, attribute, value)
        elif name in self._STATISTICS:
            key, convert = self._STATISTICS[name]
            value = convert(content)
            self._result.statistics[key] = value
        elif name in self._HIT_FIELDS:
            attribute, convert = self._HIT_FIELDS[name]
            value = convert(content)
            setattr(self._hit.target, attribute, value)
        elif name in self._HSP_FIELDS:
            attribute, convert = self._HSP_FIELDS[name]
            value = convert(content)
            setattr(self._hsp, attribute, value)
        # Any other element carries nothing that is stored.
|
|
247
|
|
248
|
|
249
|