0
|
1
|
|
2 # Copyright (c) 2005 Gavin E. Crooks <gec@threeplusone.com>
|
|
3 #
|
|
4 # This software is distributed under the MIT Open Source License.
|
|
5 # <http://www.opensource.org/licenses/mit-license.html>
|
|
6 #
|
|
7 # Permission is hereby granted, free of charge, to any person obtaining a
|
|
8 # copy of this software and associated documentation files (the "Software"),
|
|
9 # to deal in the Software without restriction, including without limitation
|
|
10 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
11 # and/or sell copies of the Software, and to permit persons to whom the
|
|
12 # Software is furnished to do so, subject to the following conditions:
|
|
13 #
|
|
14 # The above copyright notice and this permission notice shall be included
|
|
15 # in all copies or substantial portions of the Software.
|
|
16 #
|
|
17 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
23 # THE SOFTWARE.
|
|
24 #
|
|
25
|
|
26 """Read a multiple sequence alignment in STOCKHOLM format.
|
|
27
|
|
28 This file format is used by PFAM and HMMER. At present, all annotation
|
|
29 information is ignored.
|
|
30
|
|
31 See:
|
|
32 - http://www.cgb.ki.se/cgb/groups/sonnhammer/Stockholm.html
|
|
33 - HMMER manual
|
|
34
|
|
35 """
|
|
36
|
|
37 import re
|
|
38
|
|
39 from corebio.utils import *
|
|
40 from corebio.seq import *
|
|
41 from corebio.seq_io import *
|
|
42
|
|
43
|
|
44
|
|
45 example = """
|
|
46 # STOCKHOLM 1.0
|
|
47 #=GF ID CBS
|
|
48 #=GF AC PF00571
|
|
49 #=GF DE CBS domain
|
|
50 #=GF AU Bateman A
|
|
51 #=GF CC CBS domains are small intracellular modules mostly found
|
|
52 #=GF CC in 2 or four copies within a protein.
|
|
53 #=GF SQ 67
|
|
54 #=GS O31698/18-71 AC O31698
|
|
55 #=GS O83071/192-246 AC O83071
|
|
56 #=GS O83071/259-312 AC O83071
|
|
57 #=GS O31698/88-139 AC O31698
|
|
58 #=GS O31698/88-139 OS Bacillus subtilis
|
|
59 O83071/192-246 MTCRAQLIAVPRASSLAE..AIACAQKM....RVSRVPVYERS
|
|
60 #=GR O83071/192-246 SA 999887756453524252..55152525....36463774777
|
|
61 O83071/259-312 MQHVSAPVFVFECTRLAY..VQHKLRAH....SRAVAIVLDEY
|
|
62 #=GR O83071/259-312 SS CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEE
|
|
63 O31698/18-71 MIEADKVAHVQVGNNLEH..ALLVLTKT....GYTAIPVLDPS
|
|
64 #=GR O31698/18-71 SS CCCHHHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEHHH
|
|
65 O31698/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE
|
|
66 #=GR O31698/88-139 SS CCCCCCCHHHHHHHHHHH..HEEEEEEE....EEEEEEEEEEH
|
|
67 #=GC SS_cons CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEH
|
|
68 O31699/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE
|
|
69 #=GR O31699/88-139 AS ________________*__________________________
|
|
70 #=GR_O31699/88-139_IN ____________1______________2__________0____
|
|
71 //
|
|
72 """
|
|
73
|
|
74
|
|
75
|
|
76 names = ("stockholm", "pfam",)
|
|
77 extensions = ('sth', 'stockholm', 'align')
|
|
78
|
|
79
|
|
80 header_line = re.compile(r'#\s+STOCKHOLM\s+1.\d\s+$')
|
|
81
|
|
82 def iterseq(fin, alphabet=None):
|
|
83 """Iterate over the sequences in the file."""
|
|
84 # Default implementation
|
|
85 return iter(read(fin, alphabet) )
|
|
86
|
|
87
|
|
88 def read(fin, alphabet=None) :
|
|
89 alphabet = Alphabet(alphabet)
|
|
90 seq_ids = []
|
|
91 seqs = []
|
|
92 block_count = 0
|
|
93
|
|
94
|
|
95 for token in _scan(fin):
|
|
96 if token.typeof== "begin_block":
|
|
97 block_count = 0
|
|
98 elif token.typeof == "seq_id":
|
|
99 if len(seqs) <= block_count :
|
|
100 seq_ids.append(token.data)
|
|
101 seqs.append([])
|
|
102 elif token.typeof == "seq":
|
|
103 if not alphabet.alphabetic(token.data) :
|
|
104 raise ValueError (
|
|
105 "Character on line: %d not in alphabet: %s : %s" % (
|
|
106 token.lineno, alphabet, token.data) )
|
|
107 seqs[block_count].append(token.data)
|
|
108 block_count +=1
|
|
109
|
|
110
|
|
111 seqs = [ Seq("".join(s), alphabet, name= i) for s,i in zip(seqs,seq_ids)]
|
|
112 return SeqList(seqs)
|
|
113
|
|
114
|
|
115 def _scan( fin ):
|
|
116
|
|
117 header, body, block = range(3)
|
|
118
|
|
119 yield Token("begin")
|
|
120 state = header
|
|
121 for L, line in enumerate(fin):
|
|
122
|
|
123
|
|
124 if state==header :
|
|
125 if line.isspace() : continue
|
|
126 m = header_line.match(line)
|
|
127 state = body
|
|
128 if m is not None :
|
|
129 # print "header: ", m.group()
|
|
130 yield Token("header", m.group() )
|
|
131 continue
|
|
132 else :
|
|
133 raise ValueError("Parse error on line: %d" % L)
|
|
134
|
|
135
|
|
136 if state == body :
|
|
137 if line.isspace() : continue
|
|
138 yield Token("begin_block")
|
|
139 state = block
|
|
140 # fall through to block
|
|
141
|
|
142 if state == block:
|
|
143 if line.isspace() :
|
|
144 yield Token("end_block")
|
|
145 state = body
|
|
146 continue
|
|
147 if line.strip() == '//' :
|
|
148 yield Token("end_block")
|
|
149 return
|
|
150
|
|
151
|
|
152 if line[0] =='#' : # Comment or annotation line
|
|
153 continue
|
|
154
|
|
155 name_seq = line.split(None,1) # Split into two parts at first whitespace
|
|
156 if len(name_seq) != 2 :
|
|
157 raise ValueError("Parse error on line: %d" % L)
|
|
158
|
|
159
|
|
160 yield Token("seq_id", name_seq[0].strip() )
|
|
161 yield Token("seq", name_seq[1].strip() )
|
|
162 continue
|
|
163
|
|
164 # END state blocks. If I ever get here something has gone terrible wrong
|
|
165 raise RuntimeError()
|
|
166
|
|
167
|
|
168
|
|
169
|
|
170
|
|
171
|
|
172
|