Mercurial > repos > davidmurphy > codonlogo
comparison corebio/ssearch_io/__init__.py @ 0:c55bdc2fb9fa
Uploaded
author | davidmurphy |
---|---|
date | Thu, 27 Oct 2011 12:09:09 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c55bdc2fb9fa |
---|---|
1 | |
2 # Copyright (c) 2006 John Gilman | |
3 # | |
4 # This software is distributed under the MIT Open Source License. | |
5 # <http://www.opensource.org/licenses/mit-license.html> | |
6 # | |
7 # Permission is hereby granted, free of charge, to any person obtaining a | |
8 # copy of this software and associated documentation files (the "Software"), | |
9 # to deal in the Software without restriction, including without limitation | |
10 # the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
11 # and/or sell copies of the Software, and to permit persons to whom the | |
12 # Software is furnished to do so, subject to the following conditions: | |
13 # | |
14 # The above copyright notice and this permission notice shall be included | |
15 # in all copies or substantial portions of the Software. | |
16 # | |
17 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
23 # THE SOFTWARE. | |
24 | |
25 """ Parse the output of BLAST and similar sequence search analysis reports. | |
26 | |
27 The result of a sequence database search is represented by the Report class. | |
28 o Each Report contains one or more results, one for each database query. | |
29 o Each Result contains one or more hits | |
30 o Each Hit may contain one or more Alignments (High-scoring Segment Pairs) | |
31 | |
32 CoreBio is often capable of guessing the correct format: | |
33 >>> from corebio import ssearch_io | |
34 >>> afile = open("test_corebio/data/ssearch/ssearch_out.txt") | |
35 >>> report = ssearch_io.read(afile) | |
36 >>> print report | |
37 | |
38 Alternatively, each report type has a separate module. Each module defines a | |
39 read(fin) method that can parse that report format. | |
40 | |
41 >>> from corebio.ssearch_io import fasta | |
42 >>> report = fasta.read( open("test_corebio/data/ssearch/ssearch_out.txt") ) | |
43 >>> print report | |
44 | |
45 Module Application Comments | |
46 --------------------------------------------------------------------------- | |
47 fasta FASTA / SSEARCH Default (-m 1) or compact (-m 9 -d 0) | |
48 blastxml NCBI Blast NCBI XML format | |
49 | |
50 Status: Beta | |
51 """ | |
52 # Dev. References : | |
53 # Inspired by BioPerl's SearchIO system | |
54 # http://www.bioperl.org/wiki/HOWTO:SearchIO | |
55 | |
56 __all__ = ['read', 'Report', 'Result', | |
57 'Hit','Annotation', 'Alignment'] | |
58 | |
59 | |
60 from corebio.utils import stdrepr | |
61 | |
def read(fin):
    """Read and parse an analysis report, guessing its format.

    Each known parser module (fasta, then blastxml) is tried in turn; the
    first one that does not raise ValueError supplies the result.

    arguments :
        fin -- A file-like object holding the report. It must support
               seek(0), because the stream is rewound after every failed
               parse attempt.
    returns :
        A database search Report.
    raises :
        ValueError - If the file cannot be parsed
    """
    # Imported here rather than at module level, as in the original code.
    # Spelled as absolute imports so that they resolve the same way whether
    # or not implicit relative imports are available.
    from corebio.ssearch_io import fasta
    from corebio.ssearch_io import blastxml

    parsers = (fasta, blastxml)
    for parser in parsers:
        try:
            return parser.read(fin)
        except ValueError:
            # This parser rejected the input; fall through and try the next.
            pass
        # Rewind so that the next parser sees the report from the start.
        fin.seek(0)  # FIXME. Non seekable stdin?

    raise ValueError("Cannot parse sequence file: Tried fasta and blastxml")
82 | |
83 | |
84 | |
class Report(object):
    """The results of a database search.

    A Report contains a list of one or more Results, one for each query.
    Each Result contains a list of Hits, and each Hit contains a list of
    HSPs (High-scoring Segment Pairs).

    The structure of the report will vary somewhat depending on the source.

    algorithm           -- e.g. 'BLASTX'
    algorithm_version   -- e.g. '2.2.4 [Aug-26-2002]'
    algorithm_reference --
    database_name       -- e.g. 'test.fa'
    database_letters    -- number of residues in database e.g. 1291
    database_entries    -- number of database entries

    parameters          -- Dictionary of parameters used in search

    results             -- A list of Results, one per query
    """
    __slots__ = ['algorithm', 'algorithm_version', 'algorithm_reference',
                 'database_name', 'database_letters', 'database_entries',
                 'parameters', 'results']

    def __init__(self):
        # Start every slot at None so that reading an attribute that was
        # never filled in does not raise AttributeError.
        for name in self.__slots__:
            setattr(self, name, None)
        # Containers get a fresh, per-instance empty value.
        self.parameters = {}
        self.results = []

    def __repr__(self):
        return stdrepr(self)
113 | |
114 | |
class Result(object):
    """The result from searching a database with a single query sequence.

    query      -- Information about the query sequence (an Annotation)
    statistics -- A dictionary of search statistics
    hits       -- A list of Hits
    """
    __slots__ = ['query', 'statistics', 'hits']

    def __init__(self):
        # Start every slot at None, then install the real defaults below.
        for name in self.__slots__:
            setattr(self, name, None)
        self.query = Annotation()
        self.statistics = {}
        self.hits = []

    def __repr__(self):
        return stdrepr(self)
132 | |
133 | |
class Hit(object):
    """A search hit between a query sequence and a target sequence.

    Each hit may have one or more Alignments.

    target       -- Information about the target sequence (an Annotation).
    raw_score    -- Typically the significance of the hit in bits, e.g. 92.0
    bit_score    -- Not described by the original documentation.
                    NOTE(review): presumably the score in bits, which would
                    overlap with the raw_score description above -- confirm
                    against the parser modules.
    significance -- Typically evalue. e.g '2e-022'
    alignments   -- A list of alignments between query and target
    """
    __slots__ = ['target', 'raw_score', 'bit_score', 'significance',
                 'alignments']

    def __init__(self):
        # Start every slot at None, then install the real defaults below.
        for name in self.__slots__:
            setattr(self, name, None)
        self.target = Annotation()
        self.alignments = []

    def __repr__(self):
        return stdrepr(self)
152 | |
class Annotation(object):
    """Information about a subject or query sequence.

    name        -- subject sequence name, e.g. '443893|124775'
    description -- e.g. 'LaForas sequence'
    length      -- subject sequence length, e.g. 331
    locus       -- e.g. '124775'
    accession   -- e.g. '443893'
    """
    # Fixme: change into generic sequence annotation class?
    __slots__ = ['name', 'description', 'length', 'locus', 'accession']

    def __init__(self):
        # Every field starts out unset (None); nothing else is initialised.
        for name in self.__slots__:
            setattr(self, name, None)

    def __repr__(self):
        return stdrepr(self)
171 | |
class Alignment(object):
    """An alignment between query and subject sequences.

    For BLAST, these are High-scoring Segment Pairs (HSPs).

    raw_score    -- Typically significance of the hit in bits, e.g. 92.0
    bit_score    -- Not described by the original documentation.
                    NOTE(review): presumably the score in bits, which would
                    overlap with the raw_score description above -- confirm
                    against the parser modules.
    significance -- Typically evalue. e.g '2e-022'

    similar      -- number of conserved residues
                    #FIXME either fraction or number
    identical    -- number of identical residues
    gaps         -- number of gaps
    length       -- length of the alignment

    query_seq    -- query string from alignment
    target_seq   -- hit string from alignment
    mid_seq      --

    query_start  --
    query_frame  --

    target_start --
    target_frame --
    """
    __slots__ = ['raw_score', 'bit_score', 'significance', 'similar',
                 'identical', 'gaps', 'length', 'query_seq', 'target_seq',
                 'mid_seq', 'query_start', 'query_frame', 'target_start',
                 'target_frame']

    def __init__(self):
        # Every field starts out unset (None); nothing else is initialised.
        for name in self.__slots__:
            setattr(self, name, None)

    def __repr__(self):
        return stdrepr(self)
206 | |
207 |