Mercurial > repos > davidmurphy > codonlogo
diff corebio/ssearch_io/blastxml.py @ 0:c55bdc2fb9fa
Uploaded
author | davidmurphy |
---|---|
date | Thu, 27 Oct 2011 12:09:09 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/corebio/ssearch_io/blastxml.py Thu Oct 27 12:09:09 2011 -0400 @@ -0,0 +1,249 @@ + +# Copyright (c) 2006 John Gilman +# +# This software is distributed under the MIT Open Source License. +# <http://www.opensource.org/licenses/mit-license.html> +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + + +"""Read BLAST XML output. + +The DTD is available at +http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.mod.dtd + +""" + +# See also +# +# http://bugzilla.open-bio.org/show_bug.cgi?id=1933 +#http://portal.open-bio.org/pipermail/biojava-dev/2004-December/002513.html + + +from corebio.ssearch_io import Report, Result, Hit, Annotation, Alignment + +import xml.sax +from xml.sax.handler import ContentHandler + +__all__ = 'read' + +def read(fin): + """Read BLAST xml output and return a list of Result objects. + """ + parser = xml.sax.make_parser() + handler = _BlastHandler() + parser.setContentHandler(handler) + + #To avoid ValueError: unknown url type: NCBI_BlastOutput.dtd + parser.setFeature(xml.sax.handler.feature_validation, 0) + parser.setFeature(xml.sax.handler.feature_namespaces, 0) + parser.setFeature(xml.sax.handler.feature_external_pes, 0) + parser.setFeature(xml.sax.handler.feature_external_ges, 0) + + try : + parser.parse(fin) + except xml.sax.SAXParseException, e : + raise ValueError( "Cannot parse file; "+str(e)) + return handler.report + +class _BlastHandler( ContentHandler) : + def __init__(self): + """ + """ + ContentHandler.__init__(self) + self._content = [] + self.report = None + self._result = None + self._hit = None + self._hsp = None + + + def characters(self, ch): + self._content.append(ch) + + def startDocument(self): + self.report = Report() + + def endDocument(self) : + pass + + def startElement(self, name, attr): + if name == 'BlastOutput' : + pass + elif name == 'Iteration' : + result = Result() + self._result = result + self.report.results.append(result) + elif name == 'Parameters' : + pass + elif name == 'Statistics' : + pass + elif name == 'Hit' : + self._hit = Hit() + self._result.hits.append(self._hit) + elif name == 'Hsp' : + self._hsp = Alignment() + self._hit.alignments.append(self._hsp) + else : + pass + + + def endElement(self, name): + content = ''.join(self._content).strip() + self._content = [] + + report = self.report + result = self._result + hsp = self._hsp + hit = self._hit + + if name == 'BlastOutput' : + pass + elif name == 'BlastOutput_program' : + report.algorithm = content + elif name == 'BlastOutput_version' : + report.algorithm_version = content.split()[1] + elif name == 'BlastOutput_reference' : + report.algorithm_reference = content + elif name == 'BlastOutput_db' : + report.database_name = content + elif name == 'BlastOutput_query-ID' : pass + elif name == 'BlastOutput_query-def' : pass + elif name == 'BlastOutput_query-len' : pass + elif name == 'BlastOutput_query-seq' : pass + elif name == 'BlastOutput_param' : pass + elif name == 'BlastOutput_iterations' : pass + elif name == 'BlastOutput_mbstat' : pass + + elif name == 'Iteration' : pass + elif name == 'Iteration_iter-num' : pass + elif name == 'Iteration_query-ID' : + result.query.name = content + elif name == 'Iteration_query-def' : + result.query.description = content + elif name == 'Iteration_query-len' : + result.query.length = int(content) + elif name == 'Iteration_hits' : pass + elif name == 'Iteration_stat' : pass + elif name == 'Iteration_message' : pass + + elif name == 'Parameters' : + pass + elif name == 'Parameters_matrix' : + report.parameters['matrix'] = content + elif name == 'Parameters_expect' : + report.parameters['expect'] = content + elif name == 'Parameters_include' : + report.parameters['include'] = content + elif name == 'Parameters_sc-match' : + report.parameters['sc-match'] = content + elif name == 'Parameters_sc-mismatch' : + report.parameters['sc-mismatch'] = content + elif name == 'Parameters_gap-open' : + report.parameters['gap-open'] = content + elif name == 'Parameters_gap-extend' : + report.parameters['gap-extend'] = content + elif name == 'Parameters_filter' : + report.parameters['filter'] = content + elif name == 'Parameters_pattern' : + report.parameters['pattern'] = content + elif name == 'Parameters_entrez-query' : + report.parameters['entrez-query'] = content + + elif name == 'Statistics' : + pass + elif name == 'Statistics_db-num' : + result.statistics['db-num'] = int(content) + elif name == 'Statistics_db-len' : + result.statistics['db-len'] = int(content) + elif name == 'Statistics_hsp-len' : + result.statistics['hsp-len'] = int(content) + elif name == 'Statistics_eff-space' : + result.statistics['eff-space'] = float(content) + elif name == 'Statistics_kappa' : + result.statistics['kappa'] = float(content) + elif name == 'Statistics_lambda' : + result.statistics['lambda'] = float(content) + elif name == 'Statistics_entropy' : + result.statistics['entropy'] = float(content) + + elif name == 'Hit' : + self._hit = None + elif name == 'Hit_num' : + pass + elif name == 'Hit_id' : + hit.target.name = content + elif name == 'Hit_def' : + hit.target.description = content + elif name == 'Hit_accession' : + hit.target.accession = content + elif name == 'Hit_len' : + hit.target.length = int(content) + elif name == 'Hit_hsps' : + pass + + elif name == 'Hsp' : + self._hsp = None + elif name == 'Hsp_num' : + pass + elif name == 'Hsp_bit-score' : + hsp.bit_score = float(content) + elif name == 'Hsp_score' : + hsp.raw_score = float(content) + elif name == 'Hsp_evalue' : + hsp.significance = float(content) + elif name == 'Hsp_query-from' : + hsp.query_start = int(content) -1 + elif name == 'Hsp_query-to' : + #hsp.query_end= int(content) + pass + elif name == 'Hsp_hit-from' : + hsp.target_start = int(content) -1 + elif name == 'Hsp_hit-to' : + #hsp.target_end = int(content) + pass + elif name == 'Hsp_pattern-from' : + pass + elif name == 'Hsp_pattern-to' : + pass + elif name == 'Hsp_query-frame' : + hsp.query_frame = int(content) + elif name == 'Hsp_hit-frame' : + hsp.target_frame = int(content) + elif name == 'Hsp_identity' : + hsp.identical = int(content) + elif name == 'Hsp_positive' : + hsp.similar = int(content) + elif name == 'Hsp_gaps' : + hsp.gaps = int(content) + elif name == 'Hsp_align-len' : + hsp.length = int(content) + elif name == 'Hsp_density' : + pass + elif name == 'Hsp_qseq' : + hsp.query_seq = content + elif name == 'Hsp_hseq' : + hsp.target_seq = content + elif name == 'Hsp_midline' : + hsp.mid_seq = content + else : + pass + + +