Mercurial > repos > davidmurphy > codonlogo
comparison corebio/seq_io/__init__.py @ 4:4d47ab2b7bcc
Uploaded
author | davidmurphy |
---|---|
date | Fri, 13 Jan 2012 07:18:19 -0500 |
parents | c55bdc2fb9fa |
children |
comparison
equal
deleted
inserted
replaced
3:09d2dac9ef73 | 4:4d47ab2b7bcc |
---|---|
1 | |
2 # Copyright (c) 2005 Gavin E. Crooks <gec@threeplusone.com> | |
3 # Copyright (c) 2006, The Regents of the University of California, through | |
4 # Lawrence Berkeley National Laboratory (subject to receipt of any required | |
5 # approvals from the U.S. Dept. of Energy). All rights reserved. | |
6 | |
7 # This software is distributed under the new BSD Open Source License. | |
8 # <http://www.opensource.org/licenses/bsd-license.html> | |
9 # | |
10 # Redistribution and use in source and binary forms, with or without | |
11 # modification, are permitted provided that the following conditions are met: | |
12 # | |
13 # (1) Redistributions of source code must retain the above copyright notice, | |
14 # this list of conditions and the following disclaimer. | |
15 # | |
16 # (2) Redistributions in binary form must reproduce the above copyright | |
17 # notice, this list of conditions and the following disclaimer in the | |
18 # documentation and or other materials provided with the distribution. | |
19 # | |
20 # (3) Neither the name of the University of California, Lawrence Berkeley | |
21 # National Laboratory, U.S. Dept. of Energy nor the names of its contributors | |
22 # may be used to endorse or promote products derived from this software | |
23 # without specific prior written permission. | |
24 # | |
25 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
26 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
27 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
28 # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
29 # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
30 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
31 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
32 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
33 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
34 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
35 # POSSIBILITY OF SUCH DAMAGE. | |
36 | |
37 | |
38 | |
39 | |
40 | |
41 """ Sequence file reading and writing. | |
42 | |
43 Biological sequence data is stored and transmitted using a wide variety of | |
44 different file formats. This package provides convenient methods to read and | |
45 write several of these file formats. | |
46 | |
47 CoreBio is often capable of guessing the correct file type, either from the | |
48 file extension or the structure of the file: | |
49 >>> import corebio.seq_io | |
50 >>> afile = open("test_corebio/data/cap.fa") | |
51 >>> seqs = corebio.seq_io.read(afile) | |
52 | |
53 Alternatively, each sequence file type has a separate module named FILETYPE_io | |
54 (e.g. fasta_io, clustal_io). | |
55 >>> import corebio.seq_io.fasta_io | |
56 >>> afile = open("test_corebio/data/cap.fa") | |
57 >>> seqs = corebio.seq_io.fasta_io.read( afile ) | |
58 | |
59 Sequence data can also be written back to files: | |
60 >>> fout = open("out.fa", "w") | |
61 >>> corebio.seq_io.fasta_io.write( fout, seqs ) | |
62 | |
63 | |
64 Supported File Formats | |
65 ---------------------- | |
66 | |
67 Module Name Extension read write features | |
68 --------------------------------------------------------------------------- | |
69 array_io array, flatfile yes yes none | |
70 clustal_io clustalw aln yes yes | |
71 fasta_io fasta, Pearson fa yes yes none | |
72 genbank_io genbank gb yes | |
73 intelligenetics_io intelligenetics ig yes yes | |
74 msf_io msf msf yes | |
75 nbrf_io nbrf, pir pir yes | |
76 nexus_io nexus nexus yes | |
77 phylip_io phylip phy yes | |
78 plain_io plain, raw txt yes yes none | |
79 table_io table tbl yes yes none | |
80 | |
81 Each IO module defines one or more of the following functions and variables: | |
82 | |
83 read(afile, alphabet=None) | |
84 Read a file of sequence data and return a SeqList, a collection | |
85 of Seq's (Alphabetic strings) and features. | |
86 | |
87 read_seq(afile, alphabet=None) | |
88 Read a single sequence from a file. | |
89 | |
90 iter_seq(afile, alphabet =None) | |
91 Iterate over the sequences in a file. | |
92 | |
93 index(afile, alphabet = None) | |
94 Instead of loading all of the sequences into memory, scan the file and | |
95 return an index map that will load sequences on demand. Typically not | |
96 implemented for formats with interleaved sequences. | |
97 | |
98 write(afile, seqlist) | |
99 Write a collection of sequences to the specified file. | |
100 | |
101 write_seq(afile, seq) | |
102 Write one sequence to the file. Only implemented for non-interleaved, | |
103 headerless formats, such as fasta and plain. | |
104 | |
105 example | |
106 A string containing a short example of the file format | |
107 | |
108 names | |
109 A list of synonyms for the file format. e.g. for fasta_io, ( 'fasta', | |
110 'pearson', 'fa'). The first entry is the preferred format name. | |
111 | |
112 extensions | |
113 A list of file name extensions used for this file format. e.g. | |
114 fasta_io.extensions is ('fa', 'fasta', 'fast', 'seq', 'fsa', 'fst', 'nt', | |
115 'aa','fna','mpfa'). The preferred or standard extension is first in the | |
116 list. | |
117 | |
118 | |
119 Attributes : | |
120 - formats -- Available seq_io format parsers | |
121 - format_names -- A map between format names and format parsers. | |
122 - format_extensions -- A map between filename extensions and parsers. | |
123 | |
124 """ | |
125 | |
126 # Dev. References : | |
127 # | |
128 # - http://iubio.bio.indiana.edu/soft/molbio/readseq/java/Readseq2-help.html | |
129 # - http://www.ebi.ac.uk/help/formats_frame.html | |
130 # - http://www.cmbi.kun.nl/bioinf/tools/crab_pir.html | |
131 # - http://bioperl.org/HOWTOs/html/SeqIO.html | |
132 # - http://emboss.sourceforge.net/docs/themes/SequenceFormats.html | |
133 # - http://www.cse.ucsc.edu/research/compbio/a2m-desc.html (a2m) | |
134 # - http://www.genomatix.de/online_help/help/sequence_formats.html | |
135 | |
136 from corebio.seq import * | |
137 | |
138 import clustal_io | |
139 import fasta_io | |
140 import msf_io | |
141 import nbrf_io | |
142 import nexus_io | |
143 import plain_io | |
144 import phylip_io | |
145 #import null_io | |
146 import stockholm_io | |
147 import intelligenetics_io | |
148 import table_io | |
149 import array_io | |
150 import genbank_io | |
151 | |
# Names exported by "from corebio.seq_io import *": the per-format io
# submodules followed by the package-level helpers.
# NOTE(review): 'null_io' is listed here although its import is commented
# out in the import block -- confirm the submodule still exists.
__all__ = [
    'clustal_io', 'fasta_io', 'msf_io', 'nbrf_io', 'nexus_io',
    'plain_io', 'phylip_io', 'null_io', 'stockholm_io',
    'intelligenetics_io', 'table_io', 'array_io', 'genbank_io',
    'read',
    'formats',
    'format_names',
    'format_extensions',
]

# The format modules consulted by format_names() and format_extensions().
formats = (
    clustal_io,
    fasta_io,
    plain_io,
    msf_io,
    genbank_io,
    nbrf_io,
    nexus_io,
    phylip_io,
    stockholm_io,
    intelligenetics_io,
    table_io,
    array_io,
)
"""Available seq_io formats"""
174 | |
175 | |
def format_names():
    """Build a dict that maps each format name to its format module.

    Every module in the module-level 'formats' tuple contributes each entry
    of its 'names' attribute as a key.
    """
    name_map = {}
    for module in formats:
        for alias in module.names:
            # Sanity check: two format modules must never claim the same name.
            assert alias not in name_map
            name_map[alias] = module
    return name_map
185 | |
def format_extensions():
    """Build a dict that maps each filename extension to its format module.

    Every module in the module-level 'formats' tuple contributes each entry
    of its 'extensions' attribute as a key.
    """
    extension_map = {}
    for module in formats:
        for suffix in module.extensions:
            # Sanity check: an extension must belong to exactly one module.
            assert suffix not in extension_map
            extension_map[suffix] = module
    return extension_map
195 | |
196 | |
# Ordered list of parsers that are tried, one after another, on files whose
# format is unknown. A parser must raise an exception when it is handed a
# file in any format that appears later in this list.
#
# Roughly ordered from most to least common format, with two exceptions:
# 'nbrf_io' precedes 'fasta_io' because nbrf looks like fasta with extras,
# and 'array_io' comes last because it is very general.
_parsers = (
    nbrf_io,
    fasta_io,
    clustal_io,
    phylip_io,
    genbank_io,
    stockholm_io,
    msf_io,
    nexus_io,
    table_io,
    array_io,
)
205 | |
206 | |
def _get_parsers(fin):
    """Return the parser modules to try on 'fin', most likely one first.

    The result is a fresh list built from the module-level '_parsers' tuple.
    If 'fin' has a 'name' containing a '.', the text after the last '.' is
    looked up first among the format names and then among the format
    extensions; a match becomes the first parser in the list. Without a
    match, the first entry of '_parsers' stays in front.
    """
    name_map = format_names()
    extension_map = format_extensions()
    candidates = list(_parsers)
    preferred = candidates[0]

    # If a filename is available, use its extension to guess the format.
    if hasattr(fin, "name") and '.' in fin.name:
        suffix = fin.name.rsplit('.', 1)[-1]
        # Format names take priority over filename extensions.
        for table in (name_map, extension_map):
            if suffix in table:
                preferred = table[suffix]
                break

    # Put the preferred parser at the head of the list. It is inserted even
    # when it was not one of the default candidates.
    if preferred in candidates:
        candidates.remove(preferred)
    candidates.insert(0, preferred)

    return candidates
228 | |
229 | |
230 | |
def read(fin, alphabet=None):
    """ Read a sequence file and attempt to guess its format.

    The candidate parsers are ordered by _get_parsers(), which uses the
    filename extension (if available) to pick the first one to try. Each
    parser is then tried in turn until one succeeds.

    arguments :
        fin      -- An open file object. It is rewound with seek(0) after
                    each failed parse attempt.
        alphabet -- Passed through Alphabet() and then to each parser.
    returns :
        The result of the first parser whose read() does not raise
        ValueError (a SeqList, according to the package documentation).
    raises :
        ValueError - If none of the parsers can parse the file. A ValueError
            raised by an individual parser is swallowed and the next parser
            is tried; other exception types propagate to the caller.
    """
    alphabet = Alphabet(alphabet)
    parsers = _get_parsers(fin)

    # Iterate over the list computed above rather than calling _get_parsers()
    # a second time, so that the parsers actually tried and the parsers named
    # in the error message below are one and the same.
    for p in parsers:
        try:
            return p.read(fin, alphabet)
        except ValueError:
            # Wrong format guess: fall through and try the next parser.
            pass
        # Rewind so the next parser starts from the beginning of the file.
        fin.seek(0)     # FIXME. Non seekable stdin?

    names = ", ".join([p.names[0] for p in parsers])
    raise ValueError("Cannot parse sequence file: Tried %s " % names)
256 | |
257 | |
258 | |
259 | |
260 | |
261 |