0
|
1
|
|
2 # Copyright (c) 2006 John Gilman
|
|
3 #
|
|
4 # This software is distributed under the MIT Open Source License.
|
|
5 # <http://www.opensource.org/licenses/mit-license.html>
|
|
6 #
|
|
7 # Permission is hereby granted, free of charge, to any person obtaining a
|
|
8 # copy of this software and associated documentation files (the "Software"),
|
|
9 # to deal in the Software without restriction, including without limitation
|
|
10 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
11 # and/or sell copies of the Software, and to permit persons to whom the
|
|
12 # Software is furnished to do so, subject to the following conditions:
|
|
13 #
|
|
14 # The above copyright notice and this permission notice shall be included
|
|
15 # in all copies or substantial portions of the Software.
|
|
16 #
|
|
17 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
23 # THE SOFTWARE.
|
|
24
|
|
25 """ Parse the output of BLAST and similar sequence search analysis reports.
|
|
26
|
|
27 The result of a sequence database search is represented by the Report class.
|
|
28 o Each Report contains one or more results, one for each database query.
|
|
29 o Each Result contains one or more hits
|
|
30 o Each Hit may contain one or more Alignments (High scoring Sequence pairs)
|
|
31
|
|
32 CoreBio is often capable of guessing the correct format:
|
|
33 >>> from corebio import ssearch_io
|
|
34 >>> afile = open("test_corebio/data/ssearch/ssearch_out.txt")
|
|
35 >>> report = ssearch_io.read(afile)
|
|
36 >>> print report
|
|
37
|
|
38 Alternatively, each report type has a separate module. Each module defines a
|
|
39 read(fin) method that can parse that report format.
|
|
40
|
|
41 >>> from corebio.ssearch_io import fasta
|
|
42 >>> report = fasta.read( open("test_corebio/data/ssearch/ssearch_out.txt") )
|
|
43 >>> print report
|
|
44
|
|
45 Module Application Comments
|
|
46 ---------------------------------------------------------------------------
|
|
47 fasta FASTA / SSEARCH Default (-m 1) or compact (-m 9 -d 0)
|
|
48 blastxml NCBI Blast NCBI XML format
|
|
49
|
|
50 Status: Beta
|
|
51 """
|
|
52 # Dev. References :
|
|
53 # Inspired by BioPerl's SearchIO system
|
|
54 # http://www.bioperl.org/wiki/HOWTO:SearchIO
|
|
55
|
|
# Names exported by a star-import of this module.
__all__ = [
    'read',
    'Report',
    'Result',
    'Hit',
    'Annotation',
    'Alignment',
]
|
|
58
|
|
59
|
|
60 from corebio.utils import stdrepr
|
|
61
|
|
def read(fin):
    """Read and parse a sequence search analysis report.

    Each known parser module is tried in turn (fasta first, then blastxml).
    A parser rejects the input by raising ValueError; the stream is then
    rewound to its beginning and the next parser is tried.

    arguments :
        fin -- A file-like object holding the report. It must support
               seek(), because it is rewound after every failed attempt.
    returns :
        A database search Report (whatever the first successful parser's
        read() returns).
    raises :
        ValueError - If the file cannot be parsed by any parser
    """
    # Imported at call time, as in the original. Absolute package imports
    # are used so the lookup behaves the same under Python 2 and Python 3
    # (an implicit relative "import fasta" only works on Python 2).
    from corebio.ssearch_io import fasta
    from corebio.ssearch_io import blastxml

    for parser in (fasta, blastxml):
        try:
            return parser.read(fin)
        except ValueError:
            # Not this parser's format: fall through and try the next one.
            pass
        # FIXME: seek() fails on non-seekable input such as a piped stdin.
        fin.seek(0)

    raise ValueError("Cannot parse sequence file: Tried fasta and blastxml")
|
|
82
|
|
83
|
|
84
|
|
class Report(object):
    """The outcome of a database search.

    A Report holds a list of one or more Results, one for each query.
    Each Result contains a list of Hits, and each Hit contains a list of
    Alignments (for BLAST, the high scoring segment pairs, or HSPs).

    The structure of the report will vary somewhat depending on the source.

    algorithm           -- e.g. 'BLASTX'
    algorithm_version   -- e.g. '2.2.4 [Aug-26-2002]'
    algorithm_reference --
    database_name       -- e.g. 'test.fa'
    database_letters    -- number of residues in database, e.g. 1291
    database_entries    -- number of database entries

    parameters          -- Dictionary of parameters used in search

    results             -- A list of Results, one per query
    """
    __slots__ = [
        'algorithm',
        'algorithm_version',
        'algorithm_reference',
        'database_name',
        'database_letters',
        'database_entries',
        'parameters',
        'results',
    ]

    def __init__(self):
        # Define every slot up front so that reading a field that was never
        # filled in yields None instead of raising AttributeError.
        for slot in self.__slots__:
            setattr(self, slot, None)
        # The container fields start out empty rather than None.
        self.parameters = {}
        self.results = []

    def __repr__(self):
        return stdrepr(self)
|
|
113
|
|
114
|
|
class Result(object):
    """The outcome of searching a database with one query sequence.

    query      -- Information about the query sequence (an Annotation)
    statistics -- A dictionary of search statistics
    hits       -- A list of Hits
    """
    __slots__ = ['query', 'statistics', 'hits']

    def __init__(self):
        # Define every slot first, then install the real starting values.
        for slot in self.__slots__:
            setattr(self, slot, None)
        self.query = Annotation()
        self.statistics = {}
        self.hits = []

    def __repr__(self):
        return stdrepr(self)
|
|
132
|
|
133
|
|
class Hit(object):
    """A search hit between a query sequence and a subject sequence.

    Each hit may have one or more Alignments.

    target       -- Information about the target sequence (an Annotation)
    raw_score    -- Typically the significance of the hit in bits, e.g. 92.0
    bit_score    -- Not described in the original documentation; presumably
                    the bit score reported by the search tool (TODO confirm)
    significance -- Typically evalue, e.g. '2e-022'
    alignments   -- A list of alignments between subject and target
    """
    __slots__ = [
        'target',
        'raw_score',
        'bit_score',
        'significance',
        'alignments',
    ]

    def __init__(self):
        # Scores stay None until a parser fills them in.
        for slot in self.__slots__:
            setattr(self, slot, None)
        self.target = Annotation()
        self.alignments = []

    def __repr__(self):
        return stdrepr(self)
|
|
152
|
|
class Annotation(object):
    """Descriptive information about a subject or query sequence.

    name        -- subject sequence name, e.g. '443893|124775'
    description -- e.g. 'LaForas sequence'
    length      -- subject sequence length, e.g. 331
    locus       -- e.g. '124775'
    accession   -- e.g. '443893'
    """
    # Fixme: change into generic sequence annotation class?
    __slots__ = ['name', 'description', 'length', 'locus', 'accession']

    def __init__(self):
        # All fields begin as None; nothing is required at construction.
        for slot in self.__slots__:
            setattr(self, slot, None)

    def __repr__(self):
        return stdrepr(self)
|
|
171
|
|
class Alignment(object):
    """An alignment between query and subject sequences.

    For BLAST, these are High scoring Segment Pairs (HSPs).

    raw_score    -- Typically significance of the hit in bits, e.g. 92.0
    bit_score    -- Not described in the original documentation
    significance -- Typically evalue, e.g. '2e-022'

    similar      -- number of conserved residues  #FIXME either frac or num
    identical    -- number of identical residues
    gaps         -- number of gaps
    length       -- length of the alignment

    query_seq    -- query string from alignment
    target_seq   -- hit string from alignment
    mid_seq      --

    query_start  --
    query_frame  --

    target_start --
    target_frame --
    """
    __slots__ = [
        'raw_score',
        'bit_score',
        'significance',
        'similar',
        'identical',
        'gaps',
        'length',
        'query_seq',
        'target_seq',
        'mid_seq',
        'query_start',
        'query_frame',
        'target_start',
        'target_frame',
    ]

    def __init__(self):
        # Every field starts as None and is filled in later by a parser.
        for slot in self.__slots__:
            setattr(self, slot, None)

    def __repr__(self):
        return stdrepr(self)
|
|
206
|
|
207 |