Mercurial > repos > davidmurphy > codonlogo
comparison corebio/seq_io/clustal_io.py @ 4:4d47ab2b7bcc
Uploaded
| author | davidmurphy |
|---|---|
| date | Fri, 13 Jan 2012 07:18:19 -0500 |
| parents | c55bdc2fb9fa |
| children |
comparison
equal
deleted
inserted
replaced
| 3:09d2dac9ef73 | 4:4d47ab2b7bcc |
|---|---|
| 1 | |
| 2 # Copyright (c) 2005 Gavin E. Crooks <gec@threeplusone.com> | |
| 3 # | |
| 4 # This software is distributed under the MIT Open Source License. | |
| 5 # <http://www.opensource.org/licenses/mit-license.html> | |
| 6 # | |
| 7 # Permission is hereby granted, free of charge, to any person obtaining a | |
| 8 # copy of this software and associated documentation files (the "Software"), | |
| 9 # to deal in the Software without restriction, including without limitation | |
| 10 # the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
| 11 # and/or sell copies of the Software, and to permit persons to whom the | |
| 12 # Software is furnished to do so, subject to the following conditions: | |
| 13 # | |
| 14 # The above copyright notice and this permission notice shall be included | |
| 15 # in all copies or substantial portions of the Software. | |
| 16 # | |
| 17 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| 18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| 19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| 20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| 21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| 22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
| 23 # THE SOFTWARE. | |
| 24 # | |
| 25 | |
| 26 """ Read and write the CLUSTAL sequence file format. | |
| 27 | |
| 28 See : | |
| 29 - http://www.cmpharm.ucsf.edu/~goh/Treecorr/sampleAlignment.html | |
| 30 - http://www.bioperl.org/wiki/ClustalW_multiple_alignment_format | |
| 31 | |
| 32 Ref : | |
| 33 - Higgins D., Thompson J., Gibson T., Thompson J.D., Higgins D.G., Gibson | |
| 34 T.J. (1994). CLUSTAL W: improving the sensitivity of progressive multiple | |
| 35 sequence alignment through sequence weighting, position-specific gap | |
| 36 penalties and weight matrix choice. Nucleic Acids Res. 22:4673-4680. | |
| 37 """ | |
| 38 | |
| 39 # TODO: What happens if CLUSTAL is not the first line of the file? | |
| 40 | |
| 41 | |
| 42 import re | |
| 43 | |
| 44 from corebio.utils import * | |
| 45 from corebio.seq import * | |
| 46 from corebio.seq_io import * | |
| 47 | |
| 48 __all__ = ('example', 'names', 'extensions', 'read') | |
| 49 | |
| 50 example = """ | |
| 51 CLUSTAL W (1.81) multiple sequence alignment | |
| 52 | |
| 53 | |
| 54 CXCR3_MOUSE --------------------------LENSTSPYDYGENESD-------FSDSPPCPQDF | |
| 55 BLR_HUMAN --------------------------LENLEDLF-WELDRLD------NYNDTSLVENH- | |
| 56 CXCR1_HUMAN --------------------------MSNITDPQMWDFDDLN-------FTGMPPADEDY | |
| 57 CXCR4_MURINE -----------------------------------YTSDN---------YSGSGDYDSNK | |
| 58 : : :.. .. | |
| 59 | |
| 60 CXCR3_MOUSE -SL-------NFDRTFLPALYSLLFLLGLLGNGAVAAVLLSQRTALSSTDTFLLHLAVAD | |
| 61 BLR_HUMAN --LC-PATMASFKAVFVPVAYSLIFLLGVIGNVLVLVILERHRQTRSSTETFLFHLAVAD | |
| 62 CXCR1_HUMAN -SPC-MLETETLNKYVVIIAYALVFLLSLLGNSLVMLVILYSRVGRSVTDVYLLNLALAD | |
| 63 CXCR4_MURINE -EPC-RDENVHFNRIFLPTIYFIIFLTGIVGNGLVILVMGYQKKLRSMTDKYRLHLSVAD | |
| 64 :. .: * ::** .::** * :: : * *: : ::*::** | |
| 65 | |
| 66 CXCR3_MOUSE VLLVLTLPLWAVDAA-VQWVFGPGLCKVAGALFNINFYAGAFLLACISFDRYLSIVHATQ | |
| 67 BLR_HUMAN LLLVFILPFAVAEGS-VGWVLGTFLCKTVIALHKVNFYCSSLLLACIAVDRYLAIVHAVH | |
| 68 CXCR1_HUMAN LLFALTLPIWAASKV-NGWIFGTFLCKVVSLLKEVNFYSGILLLACISVDRYLAIVHATR | |
| 69 CXCR4_MURINE LLFVITLPFWAVDAM-ADWYFGKFLCKAVHIIYTVNLYSSVLILAFISLDRYLAIVHATN | |
| 70 :*:.: **: ... * :* ***.. : :*:*.. ::** *:.****:****.. | |
| 71 """ | |
| 72 | |
| 73 | |
| 74 | |
| 75 names = ("clustal", "clustalw",) | |
| 76 extensions = ('aln',) | |
| 77 | |
| 78 | |
| 79 header_line = re.compile(r'(CLUSTAL.*)$') | |
| 80 | |
| 81 # (sequence_id) (Sequence) (Optional sequence number) | |
| 82 seq_line = re.compile(r'(\s*\S+\s+)(\S+)\s*(\d*)$') | |
| 83 | |
| 84 # Saved group includes variable length leading space. | |
| 85 # Must consult a seq_line to figure out how long the leading spoace is since | |
| 86 # the maximum CLUSTAL ids length (normally 10 characters) can be changed. | |
| 87 match_line = re.compile(r'([\s:\.\*]*)$') | |
| 88 | |
| 89 | |
| 90 def iterseq(fin, alphabet=None): | |
| 91 """Iterate over the sequences in the file.""" | |
| 92 # Default implementation | |
| 93 return iter(read(fin, alphabet) ) | |
| 94 | |
| 95 | |
| 96 def read(fin, alphabet=None) : | |
| 97 alphabet = Alphabet(alphabet) | |
| 98 seq_ids = [] | |
| 99 seqs = [] | |
| 100 block_count = 0 | |
| 101 | |
| 102 | |
| 103 for token in _scan(fin): | |
| 104 if token.typeof== "begin_block": | |
| 105 block_count = 0 | |
| 106 elif token.typeof == "seq_id": | |
| 107 if len(seqs) <= block_count : | |
| 108 seq_ids.append(token.data) | |
| 109 seqs.append([]) | |
| 110 elif token.typeof == "seq": | |
| 111 if not alphabet.alphabetic(token.data) : | |
| 112 raise ValueError( | |
| 113 "Character on line: %d not in alphabet: %s : %s" % ( | |
| 114 token.lineno, alphabet, token.data) ) | |
| 115 seqs[block_count].append(token.data) | |
| 116 block_count +=1 | |
| 117 | |
| 118 | |
| 119 seqs = [ Seq("".join(s), alphabet, name= i) for s,i in zip(seqs,seq_ids)] | |
| 120 return SeqList(seqs) | |
| 121 | |
| 122 | |
| 123 # 1) The word "CLUSTAL" should be the first word on the first line of the file. | |
| 124 # (But sometimes isn't.) | |
| 125 # 2) The alignment is displayed in blocks of fixed length. | |
| 126 # 3) Each line in the block corresponds to one sequence. | |
| 127 # 4) Each sequence line starts with a sequence name followed by at least one | |
| 128 # space and then the sequence. | |
| 129 | |
| 130 def _scan( fin ): | |
| 131 """Scan a clustal format MSA file and yeild tokens. | |
| 132 The basic file structure is | |
| 133 begin_document | |
| 134 header? | |
| 135 (begin_block | |
| 136 (seq_id seq seq_index?)+ | |
| 137 match_line? | |
| 138 end_block)* | |
| 139 end_document | |
| 140 | |
| 141 Usage: | |
| 142 for token in scan(clustal_file): | |
| 143 do_somthing(token) | |
| 144 """ | |
| 145 header, body, block = range(3) | |
| 146 | |
| 147 yield Token("begin") | |
| 148 leader_width = -1 | |
| 149 state = header | |
| 150 for L, line in enumerate(fin): | |
| 151 if state==header : | |
| 152 if line.isspace() : continue | |
| 153 m = header_line.match(line) | |
| 154 state = body | |
| 155 if m is not None : | |
| 156 yield Token("header", m.group() ) | |
| 157 continue | |
| 158 else : | |
| 159 raise ValueError("Cannot find required header") | |
| 160 | |
| 161 | |
| 162 if state == body : | |
| 163 if line.isspace() : continue | |
| 164 yield Token("begin_block") | |
| 165 state = block | |
| 166 # fall through to block | |
| 167 | |
| 168 if state == block: | |
| 169 if line.isspace() : | |
| 170 yield Token("end_block") | |
| 171 state = body | |
| 172 continue | |
| 173 | |
| 174 m = match_line.match(line) | |
| 175 if m is not None : | |
| 176 yield Token("match_line", line[leader_width:-1]) | |
| 177 continue | |
| 178 | |
| 179 m = seq_line.match(line) | |
| 180 if m is None: | |
| 181 raise ValueError("Parse error on line: %d" % L) | |
| 182 leader_width = len(m.group(1)) | |
| 183 yield Token("seq_id", m.group(1).strip() ) | |
| 184 yield Token("seq", m.group(2).strip() ) | |
| 185 if m.group(3) : | |
| 186 yield Token("seq_num", m.group(3)) | |
| 187 continue | |
| 188 | |
| 189 # END state blocks. If I ever get here something has gone terrible wrong | |
| 190 raise RuntimeError() | |
| 191 | |
| 192 if state==block: | |
| 193 yield Token("end_block") | |
| 194 yield Token("end") | |
| 195 return | |
| 196 | |
| 197 def write(fout, seqs) : | |
| 198 """Write 'seqs' to 'fout' as text in clustal format""" | |
| 199 header = "CLUSTAL W (1.81) multiple sequence alignment" | |
| 200 name_width = 17 | |
| 201 seq_width = 60 | |
| 202 | |
| 203 print >>fout, header | |
| 204 print >>fout | |
| 205 print >>fout | |
| 206 | |
| 207 L = 0 | |
| 208 for s in seqs: L = max(L, len(s)) | |
| 209 | |
| 210 for block in range(0, L, seq_width): | |
| 211 for s in seqs : | |
| 212 start = min(block, len(s)) | |
| 213 end = min( start+seq_width, len(s)) | |
| 214 print >>fout, s.name.ljust(name_width), | |
| 215 print >>fout, s[start:end] | |
| 216 print >>fout | |
| 217 | |
| 218 | |
| 219 | |
| 220 | |
| 221 | |
| 222 |
