annotate corebio/seq_io/intelligenetics_io.py @ 7:8d676bbd1f2d

Uploaded
author davidmurphy
date Mon, 16 Jan 2012 07:03:36 -0500
parents c55bdc2fb9fa
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1 #!/usr/bin/env python
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
2
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
3 # Copyright (c) 2005 Gavin E. Crooks <gec@threeplusone.com>
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
4 #
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
5 # This software is distributed under the MIT Open Source License.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
6 # <http://www.opensource.org/licenses/mit-license.html>
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
7 #
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
8 # Permission is hereby granted, free of charge, to any person obtaining a
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
9 # copy of this software and associated documentation files (the "Software"),
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
10 # to deal in the Software without restriction, including without limitation
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
11 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
12 # and/or sell copies of the Software, and to permit persons to whom the
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
13 # Software is furnished to do so, subject to the following conditions:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
14 #
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
15 # The above copyright notice and this permission notice shall be included
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
16 # in all copies or substantial portions of the Software.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
17 #
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
18 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
19 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
20 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
21 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
22 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
23 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
24 # THE SOFTWARE.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
25 #
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
26
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
27 """Read and write sequence information in IntelliGenetics format.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
28
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
29 A sequence file in IG format can contain several sequences, each consisting of a
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
30 number of comment lines that must begin with a semicolon (";"), a line with the
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
31 sequence name and the sequence itself terminated with the termination character
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
32 '1' for linear or '2' for circular sequences. The termination character is
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
33 de facto optional.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
34
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
35 --- Example IG File ---
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
36
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
37 ;H.sapiens fau mRNA, 518 bases
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
38 HSFAU
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
39 ttcctctttctcgactccatcttcgcggtagctgggaccgccgttcagtc
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
40 actcttaagtcttttgtaattctggctttctctaataaaaaagccactta
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
41 gttcagtcaaaaaaaaaa1
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
42 ;H.sapiens fau 1 gene, 2016 bases
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
43 HSFAU1
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
44 ctaccattttccctctcgattctatatgtacactcgggacaagttctcct
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
45 gatcgaaaacggcaaaactaaggccccaagtaggaatgccttagttttcg
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
46 gggttaacaatgattaacactgagcctcacacccacgcgatgccctcagc
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
47 tcctcgctcagcgctctcaccaacagccgtagcccgcagccccgctggac
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
48 accggttctccatccccgcagcgtagcccggaacatggtagctgccatct
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
49 ttacctgctacgccagccttctgtgcgcgcaactgtctggtcccgcccc2
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
50
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
51 """
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
52
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
53 from corebio.utils import *
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
54 from corebio.seq import *
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
55 from corebio.seq_io import *
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
56
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
57
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
# Names by which this sequence format is known.
names = ('intelligenetics', 'ig', 'stanford',)

# File extensions for this format. The trailing comma is required: without it
# ('ig') is just the string 'ig', and iterating over it yields 'i' and 'g'.
extensions = ('ig',)


# A small two-record IG file: each record has one ';' comment line, a name
# line, and sequence data ending in '1' (linear) or '2' (circular).
example = """
;H.sapiens fau mRNA, 518 bases
HSFAU
ttcctctttctcgactccatcttcgcggtagctgggaccgccgttcagtc
actcttaagtcttttgtaattctggctttctctaataaaaaagccactta
gttcagtcaaaaaaaaaa1
;H.sapiens fau 1 gene, 2016 bases
HSFAU1
ctaccattttccctctcgattctatatgtacactcgggacaagttctcct
gatcgaaaacggcaaaactaaggccccaagtaggaatgccttagttttcg
gggttaacaatgattaacactgagcctcacacccacgcgatgccctcagc
tcctcgctcagcgctctcaccaacagccgtagcccgcagccccgctggac
accggttctccatccccgcagcgtagcccggaacatggtagctgccatct
ttacctgctacgccagccttctgtgcgcgcaactgtctggtcccgcccc2
"""
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
77
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
78
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
79
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
80
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def read(fin, alphabet=None):
    """Read an entire IG file and return its sequences.

    This drains the iterseq() generator and wraps the result in a SeqList.

    Args:
        fin -- A stream or file to read
        alphabet -- The expected alphabet of the data, if given
    Returns:
        SeqList -- A list of sequences
    Raises:
        ValueError -- If the file is unparsable
    """
    return SeqList(list(iterseq(fin, alphabet)))
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
94
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
95
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def iterseq(fin, alphabet=None):
    """Parse an IG file and generate sequences.

    Lines are stripped and blank lines are skipped. Lines starting with ';'
    are comments and are collected (without the ';') into the description of
    the next sequence. The first non-comment line of a record is taken as the
    sequence name; the following lines are sequence data. A record ends at a
    data line whose last character is '1' or '2' (the terminator is dropped),
    at the next comment line, or at the end of the input.

    Args:
        fin -- A stream or file to read
        alphabet -- The expected alphabet of the data, if given
    Yields:
        Seq -- One alphabetic sequence at a time.
    Raises:
        ValueError -- If the file is unparsable
    """
    alphabet = Alphabet(alphabet)

    seqs = []
    header = []
    start_lineno = -1
    name = None

    def build_seq(seqs, alphabet, name, comments, lineno):
        # Join the collected pieces into a Seq; a ValueError from Seq is
        # re-raised with the record's starting line attached.
        desc = '\n'.join(comments)
        try:
            return Seq("".join(seqs), alphabet, name=name, description=desc)
        except ValueError:
            raise ValueError(
                "Parsed failed with sequence starting at line %d: "
                "Character not in alphabet: %s" % (lineno, alphabet))

    for lineno, line in enumerate(fin):
        line = line.strip()
        if line == '':
            continue

        if line.startswith(';'):
            if seqs:
                # A comment after sequence data means the previous record
                # ended without an explicit '1'/'2' terminator.
                yield build_seq(seqs, alphabet, name, header, start_lineno)
                header = []
                seqs = []
                name = None
            if not header:
                # Only the first comment line marks where the record starts.
                start_lineno = lineno
            header.append(line[1:])
        elif not name:
            if not header:
                # Record without comment lines: it starts at its name line.
                start_lineno = lineno
            name = line
        elif line[-1] == '1' or line[-1] == '2':
            # Explicit end of sequence; strip the terminator character.
            seqs.append(remove_whitespace(line[0:-1]))
            yield build_seq(seqs, alphabet, name, header, start_lineno)
            header = []
            seqs = []
            name = None
        else:
            seqs.append(remove_whitespace(line))

    # Final record with no terminator before end of input.
    if seqs:
        yield build_seq(seqs, alphabet, name, header, start_lineno)
    return
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
151
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
152
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
153
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
154
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
155
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def write(fout, seqs):
    """Write a collection of sequences to a stream in IG format.

    Each sequence is handed to writeseq() in iteration order.

    Args:
        fout -- A writable stream.
        seqs -- A list of Seq's
    Raises:
        ValueError -- If a sequence is missing a name
    """
    for record in seqs:
        writeseq(fout, record)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
167
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
168
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def writeseq(fout, seq):
    """Write a single sequence in IG format.

    The output is: one ';'-prefixed line per line of the description, the
    sequence name on its own line, the sequence data in lines of at most 80
    characters, and a trailing blank line. No '1'/'2' terminator is written.

    Args:
        fout -- A writable stream.
        seq  -- A Seq instance
    Raises:
        ValueError -- If a sequence is missing a name
    """
    # Validate first so that a failure leaves nothing half-written on fout.
    if not seq.name:
        raise ValueError(
            "Write failed with missing sequence name: %s" % str(seq))

    desc = seq.description or ''

    # We prepend ';' to each line
    for h in desc.splitlines():
        fout.write(';' + h + '\n')

    # fout.write() is used instead of the Python 2 only 'print >>' statement
    # so the function behaves the same on Python 2 and Python 3.
    fout.write('%s\n' % seq.name)

    line_length = 80
    # Stepping by line_length avoids emitting an extra empty line when the
    # sequence length is an exact multiple of line_length.
    for start in range(0, len(seq), line_length):
        fout.write('%s\n' % seq[start:start + line_length])

    # Blank line after the record.
    fout.write('\n')
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
194
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
195
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
196
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
197
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
198
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
199
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
200
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
201
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
202
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
203