comparison corebio/seq_io/__init__.py @ 0:c55bdc2fb9fa

Uploaded
author davidmurphy
date Thu, 27 Oct 2011 12:09:09 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c55bdc2fb9fa
1
2 # Copyright (c) 2005 Gavin E. Crooks <gec@threeplusone.com>
3 # Copyright (c) 2006, The Regents of the University of California, through
4 # Lawrence Berkeley National Laboratory (subject to receipt of any required
5 # approvals from the U.S. Dept. of Energy). All rights reserved.
6
7 # This software is distributed under the new BSD Open Source License.
8 # <http://www.opensource.org/licenses/bsd-license.html>
9 #
10 # Redistribution and use in source and binary forms, with or without
11 # modification, are permitted provided that the following conditions are met:
12 #
13 # (1) Redistributions of source code must retain the above copyright notice,
14 # this list of conditions and the following disclaimer.
15 #
16 # (2) Redistributions in binary form must reproduce the above copyright
17 # notice, this list of conditions and the following disclaimer in the
18 # documentation and or other materials provided with the distribution.
19 #
20 # (3) Neither the name of the University of California, Lawrence Berkeley
21 # National Laboratory, U.S. Dept. of Energy nor the names of its contributors
22 # may be used to endorse or promote products derived from this software
23 # without specific prior written permission.
24 #
25 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
26 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
29 # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 # POSSIBILITY OF SUCH DAMAGE.
36
37
38
39
40
41 """ Sequence file reading and writing.
42
43 Biological sequence data is stored and transmitted using a wide variety of
44 different file formats. This package provides convenient methods to read and
45 write several of these file formats.
46
47 CoreBio is often capable of guessing the correct file type, either from the
48 file extension or the structure of the file:
49 >>> import corebio.seq_io
50 >>> afile = open("test_corebio/data/cap.fa")
51 >>> seqs = corebio.seq_io.read(afile)
52
53 Alternatively, each sequence file type has a separate module named FILETYPE_io
54 (e.g. fasta_io, clustal_io).
55 >>> import corebio.seq_io.fasta_io
56 >>> afile = open("test_corebio/data/cap.fa")
57 >>> seqs = corebio.seq_io.fasta_io.read( afile )
58
59 Sequence data can also be written back to files:
60 >>> fout = open("out.fa", "w")
61 >>> corebio.seq_io.fasta_io.write( fout, seqs )
62
63
64 Supported File Formats
65 ----------------------
66
67 Module Name Extension read write features
68 ---------------------------------------------------------------------------
69 array_io array, flatfile yes yes none
70 clustal_io clustalw aln yes yes
71 fasta_io fasta, Pearson fa yes yes none
72 genbank_io genbank gb yes
73 intelligenetics_io intelligenetics ig yes yes
74 msf_io msf msf yes
75 nbrf_io nbrf, pir pir yes
76 nexus_io nexus nexus yes
77 phylip_io phylip phy yes
78 plain_io plain, raw txt yes yes none
79 table_io table tbl yes yes none
80
81 Each IO module defines one or more of the following functions and variables:
82
83 read(afile, alphabet=None)
84 Read a file of sequence data and return a SeqList, a collection
85 of Seq's (Alphabetic strings) and features.
86
87 read_seq(afile, alphabet=None)
88 Read a single sequence from a file.
89
90 iter_seq(afile, alphabet =None)
91 Iterate over the sequences in a file.
92
93 index(afile, alphabet = None)
94 Instead of loading all of the sequences into memory, scan the file and
95 return an index map that will load sequences on demand. Typically not
96 implemented for formats with interleaved sequences.
97
98 write(afile, seqlist)
99 Write a collection of sequences to the specified file.
100
101 write_seq(afile, seq)
102 Write one sequence to the file. Only implemented for non-interleaved,
103 headerless formats, such as fasta and plain.
104
105 example
106 A string containing a short example of the file format
107
108 names
109 A list of synonyms for the file format. e.g. for fasta_io, ( 'fasta',
110 'pearson', 'fa'). The first entry is the preferred format name.
111
112 extensions
113 A list of file name extensions used for this file format. e.g.
114 fasta_io.extensions is ('fa', 'fasta', 'fast', 'seq', 'fsa', 'fst', 'nt',
115 'aa','fna','mpfa'). The preferred or standard extension is first in the
116 list.
117
118
119 Attributes :
120 - formats -- Available seq_io format parsers
121 - format_names -- A map between format names and format parsers.
122 - format_extensions -- A map between filename extensions and parsers.
123
124 """
125
126 # Dev. References :
127 #
128 # - http://iubio.bio.indiana.edu/soft/molbio/readseq/java/Readseq2-help.html
129 # - http://www.ebi.ac.uk/help/formats_frame.html
130 # - http://www.cmbi.kun.nl/bioinf/tools/crab_pir.html
131 # - http://bioperl.org/HOWTOs/html/SeqIO.html
132 # - http://emboss.sourceforge.net/docs/themes/SequenceFormats.html
133 # - http://www.cse.ucsc.edu/research/compbio/a2m-desc.html (a2m)
134 # - http://www.genomatix.de/online_help/help/sequence_formats.html
135
136 from corebio.seq import *
137
138 import clustal_io
139 import fasta_io
140 import msf_io
141 import nbrf_io
142 import nexus_io
143 import plain_io
144 import phylip_io
145 #import null_io
146 import stockholm_io
147 import intelligenetics_io
148 import table_io
149 import array_io
150 import genbank_io
151
# Names exported by ``from corebio.seq_io import *``: the per-format IO
# modules first, then the package-level helpers.
__all__ = [
    "clustal_io",
    "fasta_io",
    "msf_io",
    "nbrf_io",
    "nexus_io",
    "plain_io",
    "phylip_io",
    "null_io",
    "stockholm_io",
    "intelligenetics_io",
    "table_io",
    "array_io",
    "genbank_io",
    "read",
    "formats",
    "format_names",
    "format_extensions",
]
171
# The format modules that format_names() and format_extensions() scan when
# building their lookup tables.
formats = (
    clustal_io,
    fasta_io,
    plain_io,
    msf_io,
    genbank_io,
    nbrf_io,
    nexus_io,
    phylip_io,
    stockholm_io,
    intelligenetics_io,
    table_io,
    array_io,
)
"""Available seq_io formats"""
174
175
def format_names():
    """Build a dict mapping every format name to its format module.

    Each module in ``formats`` contributes all entries of its ``names``
    attribute as keys, with the module itself as the value.

    raises :
        AssertionError - If two format modules claim the same name.
    """
    name_map = {}
    for module in formats:
        for alias in module.names:
            # A name may belong to exactly one format module.
            assert alias not in name_map
            name_map[alias] = module
    return name_map
185
def format_extensions():
    """Build a dict mapping every filename extension to its format module.

    Each module in ``formats`` contributes all entries of its ``extensions``
    attribute as keys, with the module itself as the value.

    raises :
        AssertionError - If two format modules claim the same extension.
    """
    extension_map = {}
    for module in formats:
        for suffix in module.extensions:
            # An extension may belong to exactly one format module.
            assert suffix not in extension_map
            extension_map[suffix] = module
    return extension_map
195
196
# seq_io._parsers holds, in order, the sequence parsers that are tried in turn
# on files of unknown format. Each parser must raise an exception when fed a
# format further down the list.
#
# The general trend is most common to least common file format. However,
# 'nbrf_io' comes before 'fasta_io' because nbrf looks like fasta with extras,
# and 'array_io' is last, since it is very general.
_parsers = (
    nbrf_io,
    fasta_io,
    clustal_io,
    phylip_io,
    genbank_io,
    stockholm_io,
    msf_io,
    nexus_io,
    table_io,
    array_io,
)
205
206
def _get_parsers(fin):
    """Return the list of parser modules to try on `fin`, best guess first.

    The list starts as a copy of the module-level ``_parsers`` tuple. If
    `fin` has a ``name`` attribute containing a '.', the text after the last
    '.' is looked up first among the format names and then among the format
    extensions; a match becomes the best guess. The best guess (by default the
    first entry of ``_parsers``) is moved, or inserted, at the front.
    """
    by_name = format_names()
    by_extension = format_extensions()

    ordered = list(_parsers)
    preferred = ordered[0]

    # If a filename is supplied, use what follows its last '.' as a hint.
    if hasattr(fin, "name") and '.' in fin.name:
        suffix = fin.name.rsplit('.', 1)[-1]
        # Format names take precedence over filename extensions.
        for table in (by_name, by_extension):
            if suffix in table:
                preferred = table[suffix]
                break

    # Put the preferred parser first without listing it twice. It may not be
    # in the default list at all, in which case it is simply prepended.
    if preferred in ordered:
        ordered.remove(preferred)
    ordered.insert(0, preferred)

    return ordered
228
229
230
def read(fin, alphabet=None):
    """Read a sequence file and attempt to guess its format.

    The candidate parsers are ordered by ``_get_parsers``, which uses the
    filename extension (if available) to choose the first parser to try.
    Each parser is then tried in turn until one succeeds.

    arguments :
        fin      - File-like object to read from. It is rewound with
                   ``fin.seek(0)`` after every failed attempt.
        alphabet - Passed through ``Alphabet()`` and then handed to each
                   parser's ``read``.
    returns :
        SeqList - The result of the first parser whose ``read`` succeeds.
    raises :
        ValueError - If none of the candidate parsers can parse the file.
    """
    alphabet = Alphabet(alphabet)

    # Build the ordered parser list once: the same list drives the attempts
    # below and the "Tried ..." part of the error message.
    parsers = _get_parsers(fin)

    for parser in parsers:
        try:
            return parser.read(fin, alphabet)
        except ValueError:
            # Wrong format for this parser; fall through and try the next.
            pass
        # FIXME: seek() fails on non-seekable streams such as stdin.
        fin.seek(0)

    names = ", ".join([p.names[0] for p in parsers])
    raise ValueError("Cannot parse sequence file: Tried %s " % names)
256
257
258
259
260
261