# view corebio/ssearch_io/blastxml.py @ 14:778f03497adb
#
# Uploaded
# author davidmurphy
# date Fri, 24 Feb 2012 11:37:26 -0500
# parents c55bdc2fb9fa
# children
# line wrap: on
# line source


#  Copyright (c) 2006 John Gilman
#
#  This software is distributed under the MIT Open Source License.
#  <http://www.opensource.org/licenses/mit-license.html>
#
#  Permission is hereby granted, free of charge, to any person obtaining a 
#  copy of this software and associated documentation files (the "Software"),
#  to deal in the Software without restriction, including without limitation
#  the rights to use, copy, modify, merge, publish, distribute, sublicense,
#  and/or sell copies of the Software, and to permit persons to whom the
#  Software is furnished to do so, subject to the following conditions:
#
#  The above copyright notice and this permission notice shall be included
#  in all copies or substantial portions of the Software.
#
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 
#  THE SOFTWARE.


"""Read BLAST XML output.

The DTD is available at
http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.mod.dtd

"""

# See also:
#   http://bugzilla.open-bio.org/show_bug.cgi?id=1933
#   http://portal.open-bio.org/pipermail/biojava-dev/2004-December/002513.html


from corebio.ssearch_io import Report, Result, Hit, Annotation, Alignment

import xml.sax
from xml.sax.handler import ContentHandler

# Public API of this module.  This must be a sequence of names: a bare string
# would be iterated character by character by "from ... import *".
__all__ = ['read']

def read(fin):
    """Read BLAST XML output and return the parsed Report.

    fin -- the XML source, passed unchanged to the SAX parser's ``parse``
           method.

    Returns the Report object built by the content handler while parsing.

    Raises ValueError if the SAX parser reports a parse error.
    """
    handler = _BlastHandler()
    parser = xml.sax.make_parser()
    parser.setContentHandler(handler)

    # Switch off validation, namespace processing and the loading of external
    # entities.  This avoids "ValueError: unknown url type:
    # NCBI_BlastOutput.dtd" when the document refers to the BLAST DTD.
    disabled_features = (
        xml.sax.handler.feature_validation,
        xml.sax.handler.feature_namespaces,
        xml.sax.handler.feature_external_pes,
        xml.sax.handler.feature_external_ges,
    )
    for feature in disabled_features:
        parser.setFeature(feature, 0)

    try:
        parser.parse(fin)
    except xml.sax.SAXParseException as error:
        # Report malformed input with a builtin exception type so callers
        # do not have to know about the SAX exception hierarchy.
        raise ValueError("Cannot parse file; " + str(error))
    return handler.report

class _BlastHandler(ContentHandler):
    """SAX content handler that assembles a Report from BLAST XML output.

    A new Report is created when the document starts.  Each <Iteration>
    element appends a Result to ``report.results``, each <Hit> appends a Hit
    to the current result and each <Hsp> appends an Alignment to the current
    hit.  When one of the leaf elements listed in the tables below closes,
    its stripped text is stored on the object that is current at that time.
    Elements that appear in none of the tables are ignored.

    In the tables a converter of None means "store the text unchanged".
    """

    # Element name -> attribute of the report (stored as text).
    _REPORT_FIELDS = {
        'BlastOutput_program': 'algorithm',
        'BlastOutput_reference': 'algorithm_reference',
        'BlastOutput_db': 'database_name',
    }

    # Element name -> key in report.parameters (stored as text).
    _PARAMETER_KEYS = {
        'Parameters_matrix': 'matrix',
        'Parameters_expect': 'expect',
        'Parameters_include': 'include',
        'Parameters_sc-match': 'sc-match',
        'Parameters_sc-mismatch': 'sc-mismatch',
        'Parameters_gap-open': 'gap-open',
        'Parameters_gap-extend': 'gap-extend',
        'Parameters_filter': 'filter',
        'Parameters_pattern': 'pattern',
        'Parameters_entrez-query': 'entrez-query',
    }

    # Element name -> (key in result.statistics, converter).
    _STATISTICS = {
        'Statistics_db-num': ('db-num', int),
        'Statistics_db-len': ('db-len', int),
        'Statistics_hsp-len': ('hsp-len', int),
        'Statistics_eff-space': ('eff-space', float),
        'Statistics_kappa': ('kappa', float),
        'Statistics_lambda': ('lambda', float),
        'Statistics_entropy': ('entropy', float),
    }

    # Element name -> (attribute of result.query, converter).
    _QUERY_FIELDS = {
        'Iteration_query-ID': ('name', None),
        'Iteration_query-def': ('description', None),
        'Iteration_query-len': ('length', int),
    }

    # Element name -> (attribute of hit.target, converter).
    _TARGET_FIELDS = {
        'Hit_id': ('name', None),
        'Hit_def': ('description', None),
        'Hit_accession': ('accession', None),
        'Hit_len': ('length', int),
    }

    # Element name -> (attribute of the current alignment, converter).
    # Hsp_query-to and Hsp_hit-to are deliberately absent: they are not
    # stored.
    _HSP_FIELDS = {
        'Hsp_bit-score': ('bit_score', float),
        'Hsp_score': ('raw_score', float),
        'Hsp_evalue': ('significance', float),
        'Hsp_query-frame': ('query_frame', int),
        'Hsp_hit-frame': ('target_frame', int),
        'Hsp_identity': ('identical', int),
        'Hsp_positive': ('similar', int),
        'Hsp_gaps': ('gaps', int),
        'Hsp_align-len': ('length', int),
        'Hsp_qseq': ('query_seq', None),
        'Hsp_hseq': ('target_seq', None),
        'Hsp_midline': ('mid_seq', None),
    }

    # Element name -> attribute of the current alignment.  The integer in
    # the file is stored minus one.
    _HSP_START_FIELDS = {
        'Hsp_query-from': 'query_start',
        'Hsp_hit-from': 'target_start',
    }

    def __init__(self):
        """Create a handler with no report and no element in progress."""
        ContentHandler.__init__(self)
        self._content = []     # Text chunks seen since the last end tag
        self.report = None     # Created by startDocument()
        self._result = None    # Result of the most recent <Iteration>
        self._hit = None       # Hit of the <Hit> currently open
        self._hsp = None       # Alignment of the <Hsp> currently open

    def characters(self, ch):
        """Buffer a chunk of character data until the element closes."""
        self._content.append(ch)

    def startDocument(self):
        """Start a fresh Report for the document being parsed."""
        self.report = Report()

    def endDocument(self):
        """Nothing to finalise; the report is complete at this point."""
        pass

    def startElement(self, name, attr):
        """Open a new Result, Hit or Alignment for container elements.

        All other element names are ignored here; their text is handled in
        endElement().
        """
        if name == 'Iteration':
            self._result = Result()
            self.report.results.append(self._result)
        elif name == 'Hit':
            self._hit = Hit()
            self._result.hits.append(self._hit)
        elif name == 'Hsp':
            self._hsp = Alignment()
            self._hit.alignments.append(self._hsp)

    def endElement(self, name):
        """Store the text of the element that just closed.

        The buffered character data is joined, stripped of surrounding
        whitespace and the buffer is reset.  Closing <Hit> or <Hsp> clears
        the corresponding current object.

        Raises ValueError if a numeric field does not hold a valid number.
        """
        text = ''.join(self._content).strip()
        self._content = []

        if name == 'Hit':
            self._hit = None
        elif name == 'Hsp':
            self._hsp = None
        elif name == 'BlastOutput_version':
            self.report.algorithm_version = self._version_number(text)
        elif name in self._REPORT_FIELDS:
            setattr(self.report, self._REPORT_FIELDS[name], text)
        elif name in self._PARAMETER_KEYS:
            self.report.parameters[self._PARAMETER_KEYS[name]] = text
        elif name in self._STATISTICS:
            key, convert = self._STATISTICS[name]
            value = self._parse(text, convert)
            self._result.statistics[key] = value
        elif name in self._QUERY_FIELDS:
            attribute, convert = self._QUERY_FIELDS[name]
            value = self._parse(text, convert)
            setattr(self._result.query, attribute, value)
        elif name in self._TARGET_FIELDS:
            attribute, convert = self._TARGET_FIELDS[name]
            value = self._parse(text, convert)
            setattr(self._hit.target, attribute, value)
        elif name in self._HSP_START_FIELDS:
            value = int(text) - 1
            setattr(self._hsp, self._HSP_START_FIELDS[name], value)
        elif name in self._HSP_FIELDS:
            attribute, convert = self._HSP_FIELDS[name]
            value = self._parse(text, convert)
            setattr(self._hsp, attribute, value)

    def _parse(self, text, convert):
        """Return text passed through convert, or unchanged if it is None."""
        if convert is None:
            return text
        return convert(text)

    def _version_number(self, text):
        """Return the version part of a <BlastOutput_version> string.

        The second whitespace-separated token is used when there is one
        (e.g. 'BLASTP 2.2.26+' gives '2.2.26+').  Shorter strings are
        returned unchanged instead of raising IndexError.
        """
        tokens = text.split()
        if len(tokens) > 1:
            return tokens[1]
        return text