comparison corebio/seq_io/stockholm_io.py @ 4:4d47ab2b7bcc

Uploaded
author davidmurphy
date Fri, 13 Jan 2012 07:18:19 -0500
parents c55bdc2fb9fa
children
comparison
equal deleted inserted replaced
3:09d2dac9ef73 4:4d47ab2b7bcc
1
2 # Copyright (c) 2005 Gavin E. Crooks <gec@threeplusone.com>
3 #
4 # This software is distributed under the MIT Open Source License.
5 # <http://www.opensource.org/licenses/mit-license.html>
6 #
7 # Permission is hereby granted, free of charge, to any person obtaining a
8 # copy of this software and associated documentation files (the "Software"),
9 # to deal in the Software without restriction, including without limitation
10 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 # and/or sell copies of the Software, and to permit persons to whom the
12 # Software is furnished to do so, subject to the following conditions:
13 #
14 # The above copyright notice and this permission notice shall be included
15 # in all copies or substantial portions of the Software.
16 #
17 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 # THE SOFTWARE.
24 #
25
26 """Read a multiple sequence alignment in STOCKHOLM format.
27
28 This file format is used by PFAM and HMMER. At present, all annotation
29 information is ignored.
30
31 See:
32 - http://www.cgb.ki.se/cgb/groups/sonnhammer/Stockholm.html
33 - HMMER manual
34
35 """
36
37 import re
38
39 from corebio.utils import *
40 from corebio.seq import *
41 from corebio.seq_io import *
42
43
44
45 example = """
46 # STOCKHOLM 1.0
47 #=GF ID CBS
48 #=GF AC PF00571
49 #=GF DE CBS domain
50 #=GF AU Bateman A
51 #=GF CC CBS domains are small intracellular modules mostly found
52 #=GF CC in 2 or four copies within a protein.
53 #=GF SQ 67
54 #=GS O31698/18-71 AC O31698
55 #=GS O83071/192-246 AC O83071
56 #=GS O83071/259-312 AC O83071
57 #=GS O31698/88-139 AC O31698
58 #=GS O31698/88-139 OS Bacillus subtilis
59 O83071/192-246 MTCRAQLIAVPRASSLAE..AIACAQKM....RVSRVPVYERS
60 #=GR O83071/192-246 SA 999887756453524252..55152525....36463774777
61 O83071/259-312 MQHVSAPVFVFECTRLAY..VQHKLRAH....SRAVAIVLDEY
62 #=GR O83071/259-312 SS CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEE
63 O31698/18-71 MIEADKVAHVQVGNNLEH..ALLVLTKT....GYTAIPVLDPS
64 #=GR O31698/18-71 SS CCCHHHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEHHH
65 O31698/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE
66 #=GR O31698/88-139 SS CCCCCCCHHHHHHHHHHH..HEEEEEEE....EEEEEEEEEEH
67 #=GC SS_cons CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEH
68 O31699/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE
69 #=GR O31699/88-139 AS ________________*__________________________
70 #=GR_O31699/88-139_IN ____________1______________2__________0____
71 //
72 """
73
74
75
76 names = ("stockholm", "pfam",)
77 extensions = ('sth', 'stockholm', 'align')
78
79
80 header_line = re.compile(r'#\s+STOCKHOLM\s+1.\d\s+$')
81
82 def iterseq(fin, alphabet=None):
83 """Iterate over the sequences in the file."""
84 # Default implementation
85 return iter(read(fin, alphabet) )
86
87
88 def read(fin, alphabet=None) :
89 alphabet = Alphabet(alphabet)
90 seq_ids = []
91 seqs = []
92 block_count = 0
93
94
95 for token in _scan(fin):
96 if token.typeof== "begin_block":
97 block_count = 0
98 elif token.typeof == "seq_id":
99 if len(seqs) <= block_count :
100 seq_ids.append(token.data)
101 seqs.append([])
102 elif token.typeof == "seq":
103 if not alphabet.alphabetic(token.data) :
104 raise ValueError (
105 "Character on line: %d not in alphabet: %s : %s" % (
106 token.lineno, alphabet, token.data) )
107 seqs[block_count].append(token.data)
108 block_count +=1
109
110
111 seqs = [ Seq("".join(s), alphabet, name= i) for s,i in zip(seqs,seq_ids)]
112 return SeqList(seqs)
113
114
115 def _scan( fin ):
116
117 header, body, block = range(3)
118
119 yield Token("begin")
120 state = header
121 for L, line in enumerate(fin):
122
123
124 if state==header :
125 if line.isspace() : continue
126 m = header_line.match(line)
127 state = body
128 if m is not None :
129 # print "header: ", m.group()
130 yield Token("header", m.group() )
131 continue
132 else :
133 raise ValueError("Parse error on line: %d" % L)
134
135
136 if state == body :
137 if line.isspace() : continue
138 yield Token("begin_block")
139 state = block
140 # fall through to block
141
142 if state == block:
143 if line.isspace() :
144 yield Token("end_block")
145 state = body
146 continue
147 if line.strip() == '//' :
148 yield Token("end_block")
149 return
150
151
152 if line[0] =='#' : # Comment or annotation line
153 continue
154
155 name_seq = line.split(None,1) # Split into two parts at first whitespace
156 if len(name_seq) != 2 :
157 raise ValueError("Parse error on line: %d" % L)
158
159
160 yield Token("seq_id", name_seq[0].strip() )
161 yield Token("seq", name_seq[1].strip() )
162 continue
163
164 # END state blocks. If I ever get here something has gone terrible wrong
165 raise RuntimeError()
166
167
168
169
170
171
172