Mercurial > repos > davidmurphy > codonlogo
comparison corebio/seq_io/clustal_io.py @ 0:c55bdc2fb9fa
Uploaded
author | davidmurphy |
---|---|
date | Thu, 27 Oct 2011 12:09:09 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c55bdc2fb9fa |
---|---|
1 | |
2 # Copyright (c) 2005 Gavin E. Crooks <gec@threeplusone.com> | |
3 # | |
4 # This software is distributed under the MIT Open Source License. | |
5 # <http://www.opensource.org/licenses/mit-license.html> | |
6 # | |
7 # Permission is hereby granted, free of charge, to any person obtaining a | |
8 # copy of this software and associated documentation files (the "Software"), | |
9 # to deal in the Software without restriction, including without limitation | |
10 # the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
11 # and/or sell copies of the Software, and to permit persons to whom the | |
12 # Software is furnished to do so, subject to the following conditions: | |
13 # | |
14 # The above copyright notice and this permission notice shall be included | |
15 # in all copies or substantial portions of the Software. | |
16 # | |
17 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
23 # THE SOFTWARE. | |
24 # | |
25 | |
26 """ Read and write the CLUSTAL sequence file format. | |
27 | |
28 See : | |
29 - http://www.cmpharm.ucsf.edu/~goh/Treecorr/sampleAlignment.html | |
30 - http://www.bioperl.org/wiki/ClustalW_multiple_alignment_format | |
31 | |
32 Ref : | |
33 - Higgins D., Thompson J., Gibson T., Thompson J.D., Higgins D.G., Gibson | |
34 T.J. (1994). CLUSTAL W: improving the sensitivity of progressive multiple | |
35 sequence alignment through sequence weighting, position-specific gap | |
36 penalties and weight matrix choice. Nucleic Acids Res. 22:4673-4680. | |
37 """ | |
38 | |
39 # TODO: What happens if CLUSTAL is not the first line of the file? | |
40 | |
41 | |
42 import re | |
43 | |
44 from corebio.utils import * | |
45 from corebio.seq import * | |
46 from corebio.seq_io import * | |
47 | |
48 __all__ = ('example', 'names', 'extensions', 'read') | |
49 | |
50 example = """ | |
51 CLUSTAL W (1.81) multiple sequence alignment | |
52 | |
53 | |
54 CXCR3_MOUSE --------------------------LENSTSPYDYGENESD-------FSDSPPCPQDF | |
55 BLR_HUMAN --------------------------LENLEDLF-WELDRLD------NYNDTSLVENH- | |
56 CXCR1_HUMAN --------------------------MSNITDPQMWDFDDLN-------FTGMPPADEDY | |
57 CXCR4_MURINE -----------------------------------YTSDN---------YSGSGDYDSNK | |
58 : : :.. .. | |
59 | |
60 CXCR3_MOUSE -SL-------NFDRTFLPALYSLLFLLGLLGNGAVAAVLLSQRTALSSTDTFLLHLAVAD | |
61 BLR_HUMAN --LC-PATMASFKAVFVPVAYSLIFLLGVIGNVLVLVILERHRQTRSSTETFLFHLAVAD | |
62 CXCR1_HUMAN -SPC-MLETETLNKYVVIIAYALVFLLSLLGNSLVMLVILYSRVGRSVTDVYLLNLALAD | |
63 CXCR4_MURINE -EPC-RDENVHFNRIFLPTIYFIIFLTGIVGNGLVILVMGYQKKLRSMTDKYRLHLSVAD | |
64 :. .: * ::** .::** * :: : * *: : ::*::** | |
65 | |
66 CXCR3_MOUSE VLLVLTLPLWAVDAA-VQWVFGPGLCKVAGALFNINFYAGAFLLACISFDRYLSIVHATQ | |
67 BLR_HUMAN LLLVFILPFAVAEGS-VGWVLGTFLCKTVIALHKVNFYCSSLLLACIAVDRYLAIVHAVH | |
68 CXCR1_HUMAN LLFALTLPIWAASKV-NGWIFGTFLCKVVSLLKEVNFYSGILLLACISVDRYLAIVHATR | |
69 CXCR4_MURINE LLFVITLPFWAVDAM-ADWYFGKFLCKAVHIIYTVNLYSSVLILAFISLDRYLAIVHATN | |
70 :*:.: **: ... * :* ***.. : :*:*.. ::** *:.****:****.. | |
71 """ | |
72 | |
73 | |
74 | |
75 names = ("clustal", "clustalw",) | |
76 extensions = ('aln',) | |
77 | |
78 | |
79 header_line = re.compile(r'(CLUSTAL.*)$') | |
80 | |
81 # (sequence_id) (Sequence) (Optional sequence number) | |
82 seq_line = re.compile(r'(\s*\S+\s+)(\S+)\s*(\d*)$') | |
83 | |
84 # Saved group includes variable length leading space. | |
85 # Must consult a seq_line to figure out how long the leading spoace is since | |
86 # the maximum CLUSTAL ids length (normally 10 characters) can be changed. | |
87 match_line = re.compile(r'([\s:\.\*]*)$') | |
88 | |
89 | |
90 def iterseq(fin, alphabet=None): | |
91 """Iterate over the sequences in the file.""" | |
92 # Default implementation | |
93 return iter(read(fin, alphabet) ) | |
94 | |
95 | |
96 def read(fin, alphabet=None) : | |
97 alphabet = Alphabet(alphabet) | |
98 seq_ids = [] | |
99 seqs = [] | |
100 block_count = 0 | |
101 | |
102 | |
103 for token in _scan(fin): | |
104 if token.typeof== "begin_block": | |
105 block_count = 0 | |
106 elif token.typeof == "seq_id": | |
107 if len(seqs) <= block_count : | |
108 seq_ids.append(token.data) | |
109 seqs.append([]) | |
110 elif token.typeof == "seq": | |
111 if not alphabet.alphabetic(token.data) : | |
112 raise ValueError( | |
113 "Character on line: %d not in alphabet: %s : %s" % ( | |
114 token.lineno, alphabet, token.data) ) | |
115 seqs[block_count].append(token.data) | |
116 block_count +=1 | |
117 | |
118 | |
119 seqs = [ Seq("".join(s), alphabet, name= i) for s,i in zip(seqs,seq_ids)] | |
120 return SeqList(seqs) | |
121 | |
122 | |
123 # 1) The word "CLUSTAL" should be the first word on the first line of the file. | |
124 # (But sometimes isn't.) | |
125 # 2) The alignment is displayed in blocks of fixed length. | |
126 # 3) Each line in the block corresponds to one sequence. | |
127 # 4) Each sequence line starts with a sequence name followed by at least one | |
128 # space and then the sequence. | |
129 | |
130 def _scan( fin ): | |
131 """Scan a clustal format MSA file and yeild tokens. | |
132 The basic file structure is | |
133 begin_document | |
134 header? | |
135 (begin_block | |
136 (seq_id seq seq_index?)+ | |
137 match_line? | |
138 end_block)* | |
139 end_document | |
140 | |
141 Usage: | |
142 for token in scan(clustal_file): | |
143 do_somthing(token) | |
144 """ | |
145 header, body, block = range(3) | |
146 | |
147 yield Token("begin") | |
148 leader_width = -1 | |
149 state = header | |
150 for L, line in enumerate(fin): | |
151 if state==header : | |
152 if line.isspace() : continue | |
153 m = header_line.match(line) | |
154 state = body | |
155 if m is not None : | |
156 yield Token("header", m.group() ) | |
157 continue | |
158 else : | |
159 raise ValueError("Cannot find required header") | |
160 | |
161 | |
162 if state == body : | |
163 if line.isspace() : continue | |
164 yield Token("begin_block") | |
165 state = block | |
166 # fall through to block | |
167 | |
168 if state == block: | |
169 if line.isspace() : | |
170 yield Token("end_block") | |
171 state = body | |
172 continue | |
173 | |
174 m = match_line.match(line) | |
175 if m is not None : | |
176 yield Token("match_line", line[leader_width:-1]) | |
177 continue | |
178 | |
179 m = seq_line.match(line) | |
180 if m is None: | |
181 raise ValueError("Parse error on line: %d" % L) | |
182 leader_width = len(m.group(1)) | |
183 yield Token("seq_id", m.group(1).strip() ) | |
184 yield Token("seq", m.group(2).strip() ) | |
185 if m.group(3) : | |
186 yield Token("seq_num", m.group(3)) | |
187 continue | |
188 | |
189 # END state blocks. If I ever get here something has gone terrible wrong | |
190 raise RuntimeError() | |
191 | |
192 if state==block: | |
193 yield Token("end_block") | |
194 yield Token("end") | |
195 return | |
196 | |
197 def write(fout, seqs) : | |
198 """Write 'seqs' to 'fout' as text in clustal format""" | |
199 header = "CLUSTAL W (1.81) multiple sequence alignment" | |
200 name_width = 17 | |
201 seq_width = 60 | |
202 | |
203 print >>fout, header | |
204 print >>fout | |
205 print >>fout | |
206 | |
207 L = 0 | |
208 for s in seqs: L = max(L, len(s)) | |
209 | |
210 for block in range(0, L, seq_width): | |
211 for s in seqs : | |
212 start = min(block, len(s)) | |
213 end = min( start+seq_width, len(s)) | |
214 print >>fout, s.name.ljust(name_width), | |
215 print >>fout, s[start:end] | |
216 print >>fout | |
217 | |
218 | |
219 | |
220 | |
221 | |
222 |