Mercurial > repos > davidmurphy > codonlogo
comparison corebio/seq_io/stockholm_io.py @ 4:4d47ab2b7bcc
Uploaded
author | davidmurphy |
---|---|
date | Fri, 13 Jan 2012 07:18:19 -0500 |
parents | c55bdc2fb9fa |
children |
comparison
equal
deleted
inserted
replaced
3:09d2dac9ef73 | 4:4d47ab2b7bcc |
---|---|
1 | |
2 # Copyright (c) 2005 Gavin E. Crooks <gec@threeplusone.com> | |
3 # | |
4 # This software is distributed under the MIT Open Source License. | |
5 # <http://www.opensource.org/licenses/mit-license.html> | |
6 # | |
7 # Permission is hereby granted, free of charge, to any person obtaining a | |
8 # copy of this software and associated documentation files (the "Software"), | |
9 # to deal in the Software without restriction, including without limitation | |
10 # the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
11 # and/or sell copies of the Software, and to permit persons to whom the | |
12 # Software is furnished to do so, subject to the following conditions: | |
13 # | |
14 # The above copyright notice and this permission notice shall be included | |
15 # in all copies or substantial portions of the Software. | |
16 # | |
17 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
23 # THE SOFTWARE. | |
24 # | |
25 | |
26 """Read a multiple sequence alignment in STOCKHOLM format. | |
27 | |
28 This file format is used by PFAM and HMMER. At present, all annotation | |
29 information is ignored. | |
30 | |
31 See: | |
32 - http://www.cgb.ki.se/cgb/groups/sonnhammer/Stockholm.html | |
33 - HMMER manual | |
34 | |
35 """ | |
36 | |
37 import re | |
38 | |
39 from corebio.utils import * | |
40 from corebio.seq import * | |
41 from corebio.seq_io import * | |
42 | |
43 | |
44 | |
45 example = """ | |
46 # STOCKHOLM 1.0 | |
47 #=GF ID CBS | |
48 #=GF AC PF00571 | |
49 #=GF DE CBS domain | |
50 #=GF AU Bateman A | |
51 #=GF CC CBS domains are small intracellular modules mostly found | |
52 #=GF CC in 2 or four copies within a protein. | |
53 #=GF SQ 67 | |
54 #=GS O31698/18-71 AC O31698 | |
55 #=GS O83071/192-246 AC O83071 | |
56 #=GS O83071/259-312 AC O83071 | |
57 #=GS O31698/88-139 AC O31698 | |
58 #=GS O31698/88-139 OS Bacillus subtilis | |
59 O83071/192-246 MTCRAQLIAVPRASSLAE..AIACAQKM....RVSRVPVYERS | |
60 #=GR O83071/192-246 SA 999887756453524252..55152525....36463774777 | |
61 O83071/259-312 MQHVSAPVFVFECTRLAY..VQHKLRAH....SRAVAIVLDEY | |
62 #=GR O83071/259-312 SS CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEE | |
63 O31698/18-71 MIEADKVAHVQVGNNLEH..ALLVLTKT....GYTAIPVLDPS | |
64 #=GR O31698/18-71 SS CCCHHHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEHHH | |
65 O31698/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE | |
66 #=GR O31698/88-139 SS CCCCCCCHHHHHHHHHHH..HEEEEEEE....EEEEEEEEEEH | |
67 #=GC SS_cons CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEH | |
68 O31699/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE | |
69 #=GR O31699/88-139 AS ________________*__________________________ | |
70 #=GR_O31699/88-139_IN ____________1______________2__________0____ | |
71 // | |
72 """ | |
73 | |
74 | |
75 | |
76 names = ("stockholm", "pfam",) | |
77 extensions = ('sth', 'stockholm', 'align') | |
78 | |
79 | |
80 header_line = re.compile(r'#\s+STOCKHOLM\s+1.\d\s+$') | |
81 | |
82 def iterseq(fin, alphabet=None): | |
83 """Iterate over the sequences in the file.""" | |
84 # Default implementation | |
85 return iter(read(fin, alphabet) ) | |
86 | |
87 | |
88 def read(fin, alphabet=None) : | |
89 alphabet = Alphabet(alphabet) | |
90 seq_ids = [] | |
91 seqs = [] | |
92 block_count = 0 | |
93 | |
94 | |
95 for token in _scan(fin): | |
96 if token.typeof== "begin_block": | |
97 block_count = 0 | |
98 elif token.typeof == "seq_id": | |
99 if len(seqs) <= block_count : | |
100 seq_ids.append(token.data) | |
101 seqs.append([]) | |
102 elif token.typeof == "seq": | |
103 if not alphabet.alphabetic(token.data) : | |
104 raise ValueError ( | |
105 "Character on line: %d not in alphabet: %s : %s" % ( | |
106 token.lineno, alphabet, token.data) ) | |
107 seqs[block_count].append(token.data) | |
108 block_count +=1 | |
109 | |
110 | |
111 seqs = [ Seq("".join(s), alphabet, name= i) for s,i in zip(seqs,seq_ids)] | |
112 return SeqList(seqs) | |
113 | |
114 | |
115 def _scan( fin ): | |
116 | |
117 header, body, block = range(3) | |
118 | |
119 yield Token("begin") | |
120 state = header | |
121 for L, line in enumerate(fin): | |
122 | |
123 | |
124 if state==header : | |
125 if line.isspace() : continue | |
126 m = header_line.match(line) | |
127 state = body | |
128 if m is not None : | |
129 # print "header: ", m.group() | |
130 yield Token("header", m.group() ) | |
131 continue | |
132 else : | |
133 raise ValueError("Parse error on line: %d" % L) | |
134 | |
135 | |
136 if state == body : | |
137 if line.isspace() : continue | |
138 yield Token("begin_block") | |
139 state = block | |
140 # fall through to block | |
141 | |
142 if state == block: | |
143 if line.isspace() : | |
144 yield Token("end_block") | |
145 state = body | |
146 continue | |
147 if line.strip() == '//' : | |
148 yield Token("end_block") | |
149 return | |
150 | |
151 | |
152 if line[0] =='#' : # Comment or annotation line | |
153 continue | |
154 | |
155 name_seq = line.split(None,1) # Split into two parts at first whitespace | |
156 if len(name_seq) != 2 : | |
157 raise ValueError("Parse error on line: %d" % L) | |
158 | |
159 | |
160 yield Token("seq_id", name_seq[0].strip() ) | |
161 yield Token("seq", name_seq[1].strip() ) | |
162 continue | |
163 | |
164 # END state blocks. If I ever get here something has gone terrible wrong | |
165 raise RuntimeError() | |
166 | |
167 | |
168 | |
169 | |
170 | |
171 | |
172 |