Mercurial > repos > davidmurphy > codonlogo
diff corebio/ssearch_io/__init__.py @ 0:c55bdc2fb9fa
Uploaded
author | davidmurphy |
---|---|
date | Thu, 27 Oct 2011 12:09:09 -0400 |
parents | |
children |
line wrap: on
line diff
#  Copyright (c) 2006 John Gilman
#
#  This software is distributed under the MIT Open Source License.
#  <http://www.opensource.org/licenses/mit-license.html>
#
#  Permission is hereby granted, free of charge, to any person obtaining a
#  copy of this software and associated documentation files (the "Software"),
#  to deal in the Software without restriction, including without limitation
#  the rights to use, copy, modify, merge, publish, distribute, sublicense,
#  and/or sell copies of the Software, and to permit persons to whom the
#  Software is furnished to do so, subject to the following conditions:
#
#  The above copyright notice and this permission notice shall be included
#  in all copies or substantial portions of the Software.
#
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
#  THE SOFTWARE.

""" Parse the output of BLAST and similar sequence search analysis reports.

The result of a sequence database search is represented by the Report class.
    o Each Report contains one or more results, one for each database query.
    o Each Result contains one or more hits
    o Each Hit may contain one or more Alignments (High scoring Sequence pairs)

CoreBio is often capable of guessing the correct format:
>>> from corebio import ssearch_io
>>> afile = open("test_corebio/data/ssearch/ssearch_out.txt")
>>> report = ssearch_io.read(afile)
>>> print(report)

Alternatively, each report type has a separate module. Each module defines a
read(fin) method that can parse that report format.

>>> from corebio.ssearch_io import fasta
>>> report = fasta.read( open("test_corebio/data/ssearch/ssearch_out.txt") )
>>> print(report)

Module              Application         Comments
---------------------------------------------------------------------------
fasta               FASTA / SSEARCH     Default (-m 1) or compact (-m 9 -d 0)
blastxml            NCBI Blast          NCBI XML format

Status: Beta
"""
# Dev. References :
#   Inspired by Bioperls searchIO system
#       http://www.bioperl.org/wiki/HOWTO:SearchIO

__all__ = ['read', 'Report', 'Result',
           'Hit', 'Annotation', 'Alignment']


from corebio.utils import stdrepr


def read(fin):
    """ Read and parse an analysis report.

    Each known parser module (fasta, then blastxml) is tried in turn. The
    result of the first parser that does not raise ValueError is returned.

    fin -- A file-like object holding the report. It must support seek(0),
           because the stream is rewound after every failed parse attempt.

    returns :
        A database search Report.
    raises :
        ValueError - If the file cannot be parsed
    """
    # The parser modules are imported at call time rather than at module
    # load. NOTE(review): presumably because they import the classes defined
    # below from this package -- confirm before hoisting to the top of file.
    from corebio.ssearch_io import fasta
    from corebio.ssearch_io import blastxml

    parsers = (fasta, blastxml)
    for p in parsers:
        try:
            return p.read(fin)
        except ValueError:
            # This parser does not recognise the format; try the next one.
            pass
        # Rewind so the next parser sees the report from the beginning.
        fin.seek(0)  # FIXME. Non seekable stdin?

    raise ValueError("Cannot parse sequence file: Tried fasta and blastxml")


class Report(object):
    """The results of a database search. The Report contains a list of 1 or more
    Results, one for each query. Each query result contains a list of hits.
    Each Hit contains a list of HSP's (High scoring segment pairs).

    The structure of the report will vary somewhat depending on the source.

    algorithm           -- e.g. 'BLASTX'
    algorithm_version   -- e.g. '2.2.4 [Aug-26-2002]'
    algorithm_reference --
    database_name       -- e.g. 'test.fa'
    database_letters    -- number of residues in database e.g. 1291
    database_entries    -- number of database entries

    parameters          -- Dictionary of parameters used in search

    results             -- A list of Results, one per query
    """
    __slots__ = ['algorithm', 'algorithm_version', 'algorithm_reference',
                 'database_name', 'database_letters', 'database_entries',
                 'parameters', 'results']

    def __init__(self):
        # Every slot starts as None; the containers are then replaced with
        # fresh, per-instance empty objects.
        for name in self.__slots__:
            setattr(self, name, None)
        self.parameters = {}
        self.results = []

    def __repr__(self):
        return stdrepr(self)


class Result(object):
    """ The result from searching a database with a single query sequence.

    query       -- Information about the query sequence
    statistics  -- A dictionary of search statistics
    hits        -- A list of Hits
    """
    __slots__ = ['query', 'statistics', 'hits']

    def __init__(self):
        for name in self.__slots__:
            setattr(self, name, None)
        self.query = Annotation()
        self.statistics = {}
        self.hits = []

    def __repr__(self):
        return stdrepr(self)


class Hit(object):
    """ A search hit between a query sequence and a subject sequence.
    Each hit may have one or more Alignments

    target       -- Information about the target sequence.
    raw_score    -- Typically the significance of the hit in bits, e.g. 92.0
    bit_score    -- (not described by the original author; presumably the
                    score in bits -- confirm against the parser modules)
    significance -- Typically evalue. e.g '2e-022'
    alignments   -- A list of alignments between subject and target
    """
    __slots__ = ['target', 'raw_score', 'bit_score', 'significance',
                 'alignments']

    def __init__(self):
        for name in self.__slots__:
            setattr(self, name, None)
        self.target = Annotation()
        self.alignments = []

    def __repr__(self):
        return stdrepr(self)


class Annotation(object):
    """ Information about a subject or query sequence.

    name        -- subject sequence name, e.g. '443893|124775'
    description -- e.g. 'LaForas sequence'
    length      -- subject sequence length, e.g. 331
    locus       -- e.g. '124775'
    accession   -- e.g. '443893'
    """
    # Fixme: change into generic sequence annotation class?
    __slots__ = ['name', 'description', 'length', 'locus', 'accession', ]

    def __init__(self):
        for name in self.__slots__:
            setattr(self, name, None)

    def __repr__(self):
        return stdrepr(self)


class Alignment(object):
    """An alignment between query and subject sequences.
    For BLAST, these are High scoring Segment pairs (HSPs)

    raw_score    -- Typically significance of the hit in bits, e.g. 92.0
    bit_score    -- (not described by the original author; presumably the
                    score in bits -- confirm against the parser modules)
    significance -- Typically evalue. e.g '2e-022'

    similar      -- number of conserved residues #FIXME either frac or num
    identical    -- number of identical residues
    gaps         -- number of gaps
    length       -- length of the alignment

    query_seq    -- query string from alignment
    target_seq   -- hit string from alignment
    mid_seq      --

    query_start  --
    query_frame  --

    target_start --
    target_frame --

    """
    __slots__ = ['raw_score', 'bit_score', 'significance', 'similar',
                 'identical', 'gaps', 'length', 'query_seq', 'target_seq',
                 'mid_seq', 'query_start', 'query_frame', 'target_start',
                 'target_frame']

    def __init__(self):
        for name in self.__slots__:
            setattr(self, name, None)

    def __repr__(self):
        return stdrepr(self)