0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 # Copyright (c) 2005 Gavin E. Crooks <gec@threeplusone.com>
|
|
4 #
|
|
5 # This software is distributed under the MIT Open Source License.
|
|
6 # <http://www.opensource.org/licenses/mit-license.html>
|
|
7 #
|
|
8 # Permission is hereby granted, free of charge, to any person obtaining a
|
|
9 # copy of this software and associated documentation files (the "Software"),
|
|
10 # to deal in the Software without restriction, including without limitation
|
|
11 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
12 # and/or sell copies of the Software, and to permit persons to whom the
|
|
13 # Software is furnished to do so, subject to the following conditions:
|
|
14 #
|
|
15 # The above copyright notice and this permission notice shall be included
|
|
16 # in all copies or substantial portions of the Software.
|
|
17 #
|
|
18 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
19 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
20 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
21 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
22 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
23 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
24 # THE SOFTWARE.
|
|
25 #
|
|
26
|
|
27 """Read and write sequence information in FASTA format.
|
|
28
|
|
29 This is a very common format for unannotated biological sequence data,
|
|
30 accepted by many multiple sequence alignment programs. Each sequence
|
|
31 consists of a single-line description, followed by lines of sequence data.
|
|
32 The first character of the description line is a greater-than (">") symbol
|
|
33 in the first column. The first word of the description is often the name or
|
|
34 ID of the sequence. Fasta files containing multiple sequences have one
|
|
35 sequence listed right after another.
|
|
36
|
|
37
|
|
38 Example Fasta File ::
|
|
39
|
|
40 >Lamprey GLOBIN V - SEA LAMPREY
|
|
41 PIVDTGSVA-P------------------LSAAEKTKIRSAWAPVYSTY---ETSGVDILVKFFTSTPAAQEFFPKFKGL
|
|
42 TT-----ADQLKKSA---DVRWHA-ERIINAVNDAVASMDDTEKMS--MKL-RDLSGKH----AKSFQV-----DPQYFK
|
|
43 VLAAVI-AD-TVAAGD--AGFEKLMSM------I---CILLR----S-----A-----Y------------
|
|
44 >Hagfish GLOBIN III - ATLANTIC HAGFISH
|
|
45 PITDHGQPP-T------------------LSEGDKKAIRESWPQIYKNF---EQNSLAVLLEFLKKFPKAQDSFPKFSAK
|
|
46 KS-------HLEQDP---AVKLQA-EVIINAVNHTIGLMDKEAAMK--KYL-KDLSTKH----STEFQV-----NPDMFK
|
|
47 ELSAVF-VS-TMG-GK--AAYEKLFSI------I---ATLLR----S-----T-----YDA----------
|
|
48 >Frog HEMOGLOBIN BETA CHAIN - EDIBLE FROG
|
|
49 ----------GS-----------------------DLVSGFWGKV--DA---HKIGGEALARLLVVYPWTQRYFTTFGNL
|
|
50 GSADAIC-----HNA---KVLAHG-EKVLAAIGEGLKHPENLKAHY--AKL-SEYHSNK----LHVDPANFRLLGNVFIT
|
|
51 VLARHF-QH-EFTPELQ-HALEAHFCA------V---GDALA----K-----A-----YH-----------
|
|
52
|
|
53
|
|
54 """
|
|
55 import re
|
|
56 from corebio.utils import *
|
|
57 from corebio.seq import *
|
|
58 from corebio.seq_io import *
|
|
59
|
|
60
|
|
61 names = ( 'fasta', 'pearson', 'fa')
|
|
62 extensions = ('fa', 'fasta', 'fast', 'seq', 'fsa', 'fst', 'nt', 'aa','fna','mpfa', 'faa', 'fnn','mfasta')
|
|
63
|
|
64
|
|
65 example = """
|
|
66 >Lamprey GLOBIN V - SEA LAMPREY
|
|
67 PIVDTGSVA-P------------------LSAAEKTKIRSAWAPVYSTY---ETSGVDILVKFFTSTPAAQEFFPKFKGL
|
|
68 TT-----ADQLKKSA---DVRWHA-ERIINAVNDAVASMDDTEKMS--MKL-RDLSGKH----AKSFQV-----DPQYFK
|
|
69 VLAAVI-AD-TVAAGD--AGFEKLMSM------I---CILLR----S-----A-----Y------------
|
|
70
|
|
71 >Hagfish GLOBIN III - ATLANTIC HAGFISH
|
|
72 PITDHGQPP-T------------------LSEGDKKAIRESWPQIYKNF---EQNSLAVLLEFLKKFPKAQDSFPKFSAK
|
|
73 KS-------HLEQDP---AVKLQA-EVIINAVNHTIGLMDKEAAMK--KYL-KDLSTKH----STEFQV-----NPDMFK
|
|
74 ELSAVF-VS-TMG-GK--AAYEKLFSI------I---ATLLR----S-----T-----YDA----------
|
|
75
|
|
76 >Frog HEMOGLOBIN BETA CHAIN - EDIBLE FROG
|
|
77 ----------GS-----------------------DLVSGFWGKV--DA---HKIGGEALARLLVVYPWTQRYFTTFGNL
|
|
78 GSADAIC-----HNA---KVLAHG-EKVLAAIGEGLKHPENLKAHY--AKL-SEYHSNK----LHVDPANFRLLGNVFIT
|
|
79 VLARHF-QH-EFTPELQ-HALEAHFCA------V---GDALA----K-----A-----YH-----------
|
|
80
|
|
81 """
|
|
82
|
|
83
|
|
84 def read(fin, alphabet=None):
|
|
85 """Read and parse a fasta file.
|
|
86
|
|
87 Args:
|
|
88 fin -- A stream or file to read
|
|
89 alphabet -- The expected alphabet of the data, if given
|
|
90 Returns:
|
|
91 SeqList -- A list of sequences
|
|
92 Raises:
|
|
93 ValueError -- If the file is unparsable
|
|
94 """
|
|
95 seqs = [ s for s in iterseq(fin, alphabet)]
|
|
96 name = names[0]
|
|
97 if hasattr(fin, "name") : name = fin.name
|
|
98 return SeqList(seqs, name=name)
|
|
99
|
|
100
|
|
101 def readseq(fin, alphabet=None) :
|
|
102 """Read one sequence from the file, starting
|
|
103 from the current file position."""
|
|
104 return iterseq(fin, alphabet).next()
|
|
105
|
|
106
|
|
107 def iterseq(fin, alphabet=None):
|
|
108 """ Parse a fasta file and generate sequences.
|
|
109
|
|
110 Args:
|
|
111 fin -- A stream or file to read
|
|
112 alphabet -- The expected alphabet of the data, if given
|
|
113 Yeilds:
|
|
114 Seq -- One alphabetic sequence at a time.
|
|
115 Raises:
|
|
116 ValueError -- If the file is unparsable
|
|
117 """
|
|
118 alphabet = Alphabet(alphabet)
|
|
119
|
|
120 seqs = []
|
|
121 comments = [] # FIXME: comments before first sequence are lost.
|
|
122 header = None
|
|
123 header_lineno = -1
|
|
124
|
|
125 def build_seq(seqs,alphabet, header, header_lineno,comments) :
|
|
126 try :
|
|
127 name = header.split(' ',1)[0]
|
|
128 if comments :
|
|
129 header += '\n' + '\n'.join(comments)
|
|
130 s = Seq( "".join(seqs), alphabet, name=name, description=header)
|
|
131 except ValueError:
|
|
132 raise ValueError(
|
|
133 "Parsed failed with sequence starting at line %d: "
|
|
134 "Character not in alphabet: %s" % (header_lineno, alphabet) )
|
|
135 return s
|
|
136
|
|
137 for lineno, line in enumerate(fin) :
|
|
138 line = line.strip()
|
|
139 if line == '' : continue
|
|
140 if line.startswith('>') :
|
|
141 if header is not None :
|
|
142 yield build_seq(seqs,alphabet, header, header_lineno, comments)
|
|
143 header = None
|
|
144 seqs = []
|
|
145 header = line[1:]
|
|
146 header_lineno = lineno
|
|
147 comments = []
|
|
148 elif line.startswith(';') :
|
|
149 # Optional (and unusual) comment line
|
|
150 comments.append(line[1:])
|
|
151 else :
|
|
152 if header is None :
|
|
153 raise ValueError (
|
|
154 "Parse failed on line %d: sequence before header"
|
|
155 % (lineno) )
|
|
156 seqs.append(line)
|
|
157
|
|
158 if not seqs: return
|
|
159 yield build_seq(seqs,alphabet, header, header_lineno, comments)
|
|
160
|
|
161
|
|
162 def write(fout, seqs):
|
|
163 """Write a fasta file.
|
|
164
|
|
165 Args:
|
|
166 fout -- A writable stream.
|
|
167 seqs -- A list of Seq's
|
|
168 """
|
|
169 if seqs.description :
|
|
170 for line in seqs.description.splitlines():
|
|
171 print >>fout, ';'+ line
|
|
172 for s in seqs :
|
|
173 writeseq(fout, s)
|
|
174
|
|
175
|
|
176 def writeseq(afile, seq):
|
|
177 """ Write a single sequence in fasta format.
|
|
178
|
|
179 Args:
|
|
180 afile -- A writable stream.
|
|
181 seq -- A Seq instance
|
|
182 """
|
|
183
|
|
184 header = seq.description or seq.name or ''
|
|
185
|
|
186 # We prepend '>' to the first header line
|
|
187 # Additional lines start with ';' to indicate comment lines
|
|
188 if header :
|
|
189 header = header.splitlines()
|
|
190 print >>afile, '>'+header[0]
|
|
191 if len(header) > 1 :
|
|
192 for h in header[1:] :
|
|
193 print >>afile, ';' +h
|
|
194 else :
|
|
195 print >>afile, '>'
|
|
196
|
|
197 L = len(seq)
|
|
198 line_length = 80
|
|
199 for n in range (1+ L/line_length) :
|
|
200 print >>afile, seq[n * line_length: (n+1) * line_length]
|
|
201 print >>afile
|
|
202
|
|
203
|
|
204 def index(afile, alphabet=None) :
|
|
205 """Return a FileIndex for the fasta file. Sequences can be retrieved
|
|
206 by item number or name.
|
|
207 """
|
|
208 def parser( afile) :
|
|
209 return readseq(afile, alphabet)
|
|
210
|
|
211 key = re.compile(r"^>\s*(\S*)")
|
|
212 def linekey( line):
|
|
213 k = key.search(line)
|
|
214 if k is None : return None
|
|
215 return k.group(1)
|
|
216
|
|
217 return FileIndex(afile, linekey, parser)
|
|
218
|
|
219
|
|
220
|
|
221
|
|
222
|
|
223
|
|
224
|
|
225
|
|
226
|
|
227
|
|
228
|
|
229
|
|
230 |