comparison corebio/ssearch_io/__init__.py @ 0:c55bdc2fb9fa

Uploaded
author davidmurphy
date Thu, 27 Oct 2011 12:09:09 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c55bdc2fb9fa
1
2 # Copyright (c) 2006 John Gilman
3 #
4 # This software is distributed under the MIT Open Source License.
5 # <http://www.opensource.org/licenses/mit-license.html>
6 #
7 # Permission is hereby granted, free of charge, to any person obtaining a
8 # copy of this software and associated documentation files (the "Software"),
9 # to deal in the Software without restriction, including without limitation
10 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 # and/or sell copies of the Software, and to permit persons to whom the
12 # Software is furnished to do so, subject to the following conditions:
13 #
14 # The above copyright notice and this permission notice shall be included
15 # in all copies or substantial portions of the Software.
16 #
17 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 # THE SOFTWARE.
24
25 """ Parse the output of BLAST and similar sequence search analysis reports.
26
27 The result of a sequence database search is represented by the Report class.
28 o Each Report contains one or more results, one for each database query.
29 o Each Result contains one or more hits
30 o Each Hit may contain one or more Alignments (High scoring Sequence pairs)
31
32 CoreBio is often capable of guessing the correct format:
33 >>> from corebio import ssearch_io
34 >>> afile = open("test_corebio/data/ssearch/ssearch_out.txt")
35 >>> report = ssearch_io.read(afile)
36 >>> print report
37
38 Alternatively, each report type has a separate module. Each module defines a
39 read(fin) method that can parse that report format.
40
41 >>> from corebio.ssearch_io import fasta
42 >>> report = fasta.read( open("test_corebio/data/ssearch/ssearch_out.txt") )
43 >>> print report
44
45 Module Application Comments
46 ---------------------------------------------------------------------------
47 fasta FASTA / SSEARCH Default (-m 1) or compact (-m 9 -d 0)
48 blastxml NCBI Blast NCBI XML format
49
50 Status: Beta
51 """
52 # Dev. References :
53 # Inspired by BioPerl's SearchIO system
54 # http://www.bioperl.org/wiki/HOWTO:SearchIO
55
# Public names exported by ``from corebio.ssearch_io import *``.
__all__ = [
    'read',
    'Report',
    'Result',
    'Hit',
    'Annotation',
    'Alignment',
]
58
59
60 from corebio.utils import stdrepr
61
def read(fin):
    """Read and parse a sequence search analysis report.

    The fasta parser is tried first, then blastxml; the value returned by
    the first parser that accepts the input is handed back to the caller.

    arguments :
        fin -- An open file-like object holding the report. It must support
               seek(0): the stream is rewound after every failed parse
               attempt so that the next parser starts from the beginning.
    returns :
        A database search Report.
    raises :
        ValueError - If the file cannot be parsed
    """
    # Absolute imports work on both Python 2 and Python 3, unlike the
    # implicit relative form ("import fasta"). They are kept local to the
    # function as in the original (presumably to avoid a circular import
    # with the parser modules -- confirm before hoisting to module level).
    from corebio.ssearch_io import fasta
    from corebio.ssearch_io import blastxml

    for parser in (fasta, blastxml):
        try:
            return parser.read(fin)
        except ValueError:
            # Not this parser's format; fall through and try the next one.
            pass
        # Rewind so the next parser sees the whole file again.
        fin.seek(0)  # FIXME: fails on non-seekable streams (e.g. piped stdin)

    raise ValueError("Cannot parse sequence file: Tried fasta and blastxml")
82
83
84
class Report(object):
    """The outcome of a database search.

    A Report holds a list of one or more Results, one for each query. Each
    Result holds a list of Hits, and each Hit holds a list of HSPs (High
    scoring Segment Pairs).

    The structure of the report will vary somewhat depending on the source.

    Attributes (every slot starts as None unless noted otherwise):
        algorithm           -- e.g. 'BLASTX'
        algorithm_version   -- e.g. '2.2.4 [Aug-26-2002]'
        algorithm_reference --
        database_name       -- e.g. 'test.fa'
        database_letters    -- number of residues in database, e.g. 1291
        database_entries    -- number of database entries

        parameters          -- Dictionary of parameters used in search
                               (starts as an empty dict)

        results             -- A list of Results, one per query
                               (starts as an empty list)
    """
    __slots__ = [
        'algorithm',
        'algorithm_version',
        'algorithm_reference',
        'database_name',
        'database_letters',
        'database_entries',
        'parameters',
        'results',
    ]

    def __init__(self):
        # Give every slot a value up front so that reading a field that was
        # never filled in yields None instead of raising AttributeError.
        for slot in self.__slots__:
            setattr(self, slot, None)
        self.results = []
        self.parameters = {}

    def __repr__(self):
        # Delegates to the stdrepr helper imported from corebio.utils.
        return stdrepr(self)
113
114
class Result(object):
    """The result of searching a database with a single query sequence.

    Attributes:
        query      -- Information about the query sequence (an Annotation,
                      created empty by the constructor)
        statistics -- A dictionary of search statistics (starts empty)
        hits       -- A list of Hits (starts empty)
    """
    __slots__ = [
        'query',
        'statistics',
        'hits',
    ]

    def __init__(self):
        # Default every slot to None first, then install the real defaults.
        for slot in self.__slots__:
            setattr(self, slot, None)
        self.query = Annotation()
        self.statistics = {}
        self.hits = []

    def __repr__(self):
        # Delegates to the stdrepr helper imported from corebio.utils.
        return stdrepr(self)
132
133
class Hit(object):
    """A search hit between a query sequence and a target sequence.

    Each hit may have one or more Alignments.

    Attributes (every slot starts as None unless noted otherwise):
        target       -- Information about the target sequence (an
                        Annotation, created empty by the constructor)
        raw_score    -- Typically the significance of the hit in bits,
                        e.g. 92.0
        bit_score    -- Not described by the original author; presumably
                        the score in bits (TODO confirm against the parsers)
        significance -- Typically evalue, e.g. '2e-022'
        alignments   -- A list of alignments between query and target
                        (starts as an empty list)
    """
    __slots__ = [
        'target',
        'raw_score',
        'bit_score',
        'significance',
        'alignments',
    ]

    def __init__(self):
        # Default every slot to None first, then install the real defaults.
        for slot in self.__slots__:
            setattr(self, slot, None)
        self.target = Annotation()
        self.alignments = []

    def __repr__(self):
        # Delegates to the stdrepr helper imported from corebio.utils.
        return stdrepr(self)
152
class Annotation(object):
    """Descriptive information about a subject or query sequence.

    Attributes (every slot starts as None):
        name        -- subject sequence name, e.g. '443893|124775'
        description -- e.g. 'LaForas sequence'
        length      -- subject sequence length, e.g. 331
        locus       -- e.g. '124775'
        accession   -- e.g. '443893'
    """
    # Fixme: change into generic sequence annotation class?
    __slots__ = [
        'name',
        'description',
        'length',
        'locus',
        'accession',
    ]

    def __init__(self):
        # Every field begins as None; callers fill in what they know.
        for slot in self.__slots__:
            setattr(self, slot, None)

    def __repr__(self):
        # Delegates to the stdrepr helper imported from corebio.utils.
        return stdrepr(self)
171
class Alignment(object):
    """A single alignment between a query and a subject sequence.

    For BLAST, these are High scoring Segment Pairs (HSPs).

    Attributes (every slot starts as None):
        raw_score    -- Typically significance of the hit in bits, e.g. 92.0
        bit_score    -- Not described by the original author; presumably
                        the score in bits (TODO confirm against the parsers)
        significance -- Typically evalue, e.g. '2e-022'

        similar      -- number of conserved residues
                        (FIXME: either a fraction or a count)
        identical    -- number of identical residues
        gaps         -- number of gaps
        length       -- length of the alignment

        query_seq    -- query string from alignment
        target_seq   -- hit string from alignment
        mid_seq      --

        query_start  --
        query_frame  --

        target_start --
        target_frame --
    """
    __slots__ = [
        'raw_score',
        'bit_score',
        'significance',
        'similar',
        'identical',
        'gaps',
        'length',
        'query_seq',
        'target_seq',
        'mid_seq',
        'query_start',
        'query_frame',
        'target_start',
        'target_frame',
    ]

    def __init__(self):
        # Every field begins as None; parsers fill in what the report gives.
        for slot in self.__slots__:
            setattr(self, slot, None)

    def __repr__(self):
        # Delegates to the stdrepr helper imported from corebio.utils.
        return stdrepr(self)
206
207