annotate corebio/seq_io/msf_io.py @ 12:b819394a2634

Uploaded
author davidmurphy
date Wed, 22 Feb 2012 06:42:17 -0500
parents c55bdc2fb9fa
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1 #!/usr/bin/env python
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
2
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
3 # Copyright (c) 2005 Clare Gollnick <cgollnick@berkeley.edu>
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
4 #
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
5 # This software is distributed under the MIT Open Source License.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
6 # <http://www.opensource.org/licenses/mit-license.html>
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
7 #
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
8 # Permission is hereby granted, free of charge, to any person obtaining a
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
9 # copy of this software and associated documentation files (the "Software"),
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
10 # to deal in the Software without restriction, including without limitation
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
11 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
12 # and/or sell copies of the Software, and to permit persons to whom the
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
13 # Software is furnished to do so, subject to the following conditions:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
14 #
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
15 # The above copyright notice and this permission notice shall be included
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
16 # in all copies or substantial portions of the Software.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
17 #
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
18 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
19 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
20 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
21 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
22 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
23 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
24 # THE SOFTWARE.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
25 #
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
26
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
27
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
28 """Read sequence information in MSF format.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
29
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
30 This is a file format for biological sequence data. The sequences are interleaved and each line is labeled with the sequence name. The MSF format can be identified in one or more of the following ways:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
31 1. The word PileUp on the first line (optional)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
32 2. the word !!AA_MULTIPLE_ALIGNMENT or !!NA_MULTIPLE_ALIGNMENT at the start of the file (optional)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
33 3. the word MSF on the first line of the file, and the characters ".." at the end of this line (optional)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
34 4. A header containing sequence information followed by a line with the characters "//"
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
35 """
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
36 example= """
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
37
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
38 PileUp
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
39
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
40
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
41 MSF: 64 Type: P Check: 767 ..
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
42
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
43 Name: Cow Len: 100 Check: 3761 Weight: 1.00
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
44 Name: Carp Len: 100 Check: 1550 Weight: 1.00
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
45 Name: Chicken Len: 100 Check: 2397 Weight: 1.00
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
46 Name: Human Len: 100 Check: 9021 Weight: 1.00
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
47 Name: Loach Len: 100 Check: 984 Weight: 1.00
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
48 Name: Mouse Len: 100 Check: 2993 Weight: 1.00
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
49
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
50
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
51 //
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
52
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
53
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
54 Cow MAYPMQLGFQ DATSPIMEEL LHFHDHTLMI VFLISSLVLY IISLMLTTKL
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
55 Carp MAHPTQLGFK DAAMPVMEEL LHFHDHALMI VLLISTLVLY IITAMVSTKL
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
56 Chicken MANHSQLGFQ DASSPIMEEL VEFHDHALMV ALAICSLVLY LLTLMLMEKL
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
57 Human MAHAAQVGLQ DATSPIMEEL ITFHDHALMI IFLICFLVLY ALFLTLTTKL
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
58 Loach MAHPTQLGFQ DAASPVMEEL LHFHDHALMI VFLISALVLY VIITTVSTKL
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
59 Mouse MAYPFQLGLQ DATSPIMEEL MNFHDHTLMI VFLISSLVLY IISLMLTTKL
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
60
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
61
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
62
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
63 Cow THTSTMDAQE VETIWTILPA IILILIALPS LRILYMMDEI NNPSLTVKTM
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
64 Carp TNKYILDSQE IEIVWTILPA VILVLIALPS LRILYLMDEI NDPHLTIKAM
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
65 Chicken S.SNTVDAQE VELIWTILPA IVLVLLALPS LQILYMMDEI DEPDLTLKAI
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
66 Human TNTNISDAQE METVWTILPA IILVLIALPS LRILYMTDEV NDPSLTIKSI
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
67 Loach TNMYILDSQE IEIVWTVLPA LILILIALPS LRILYLMDEI NDPHLTIKAM
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
68 Mouse THTSTMDAQE VETIWTILPA VILIMIALPS LRILYMMDEI NNPVLTVKTM
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
69
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
70 """
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
71
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
72 import re
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
73
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
74 from corebio.seq import *
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
75 from corebio.seq_io import *
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
76 from corebio.utils import *
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
77
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
# Names by which this format is known.
# NOTE(review): presumably consumed by the corebio.seq_io format registry
# to look up this parser by name -- confirm against the caller.
names = ('msf', 'gcg-msf', 'gcg', 'PileUp')

# File name extensions associated with this format.
# The trailing comma is required: ('msf') is just the string 'msf', and
# iterating over it would yield the single characters 'm', 's' and 'f'.
extensions = ('msf',)

# Header terminator: a line that starts with "//" followed only by
# optional whitespace.
end_header = re.compile(r'(//)(\s*)$')

# A sequence line: optional leading whitespace, a whitespace-free label
# (group 1), at least one whitespace character, then the rest of the line
# (group 2). The class [\S\s.?] matches any character at all, so group 2
# may contain embedded spaces and the trailing newline.
seq_line = re.compile(r'\s*(\S+)\s+([\S\s.?]+)$')
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
83
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def iterseq(fin, alphabet=None):
    """Return an iterator over the sequences parsed from `fin`.

    The input is parsed in full by `read` first; this function only wraps
    the resulting collection in an iterator. `alphabet` is forwarded to
    `read` unchanged.
    """
    parsed = read(fin, alphabet)
    return iter(parsed)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
88
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
89
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
90
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def read(fin, alphabet=None):
    """Parse MSF-formatted lines from `fin` and return a SeqList.

    The token stream produced by `_line_is` is consumed as follows:
    every block restarts the row counter, the first time a row index is
    seen its label is recorded, and each "seq" token is appended to the
    fragment list of the current row. Fragments belonging to the same row
    are concatenated into one `Seq` named after that row's label.

    Raises:
        ValueError: if a sequence fragment is rejected by
            `alphabet.alphabetic`, or if no sequence labels were found.
    """
    alphabet = Alphabet(alphabet)
    labels = []
    fragments = []
    row = 0

    for token in _line_is(fin):
        kind = token.typeof

        if kind == "begin_block":
            # A new interleaved block starts again at the first sequence.
            row = 0
            continue

        if kind == "seq_id":
            # Only register a label the first time this row is reached.
            if row >= len(fragments):
                labels.append(token.data)
                fragments.append([])
            continue

        if kind != "seq":
            continue

        if not alphabet.alphabetic(token.data):
            raise ValueError(
                "Character on line: %d not in alphabet: %s : %s" % (
                    token.lineno, alphabet, token.data))
        fragments[row].append(token.data)
        row += 1

    if not labels:
        raise ValueError("Parse error, possible wrong format")

    assembled = [
        Seq("".join(parts), alphabet, name=label)
        for parts, label in zip(fragments, labels)
    ]
    return SeqList(assembled)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
116
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def _line_is(fin):
    """Scan the lines of `fin` and yield a stream of Tokens.

    Tokens are emitted in this order: "begin" once, "end_header" when the
    "//" separator line is found, then for each run of non-blank lines a
    "begin_block", a ("seq_id", "seq") pair per sequence line, and an
    "end_block" when a blank line closes the run. Lines whose label and
    remainder are both purely numeric are skipped.

    Raises:
        ValueError: if a non-blank line inside the body does not match
            `seq_line`; the message carries the zero-based line index.
    """
    in_header, between_blocks, in_block = 0, 1, 2

    yield Token("begin")
    mode = in_header
    for index, text in enumerate(fin):
        blank = text.isspace()

        if mode == in_header:
            # Everything up to and including the "//" line is header.
            if not blank and end_header.match(text) is not None:
                yield Token("end_header")
                mode = between_blocks
            continue

        if blank:
            # A blank line closes an open block; otherwise it is ignored.
            if mode == in_block:
                yield Token("end_block")
                mode = between_blocks
            continue

        if mode == between_blocks:
            # First non-blank line after a gap opens a new block and is
            # itself processed as a line of that block.
            yield Token("begin_block")
            mode = in_block

        found = seq_line.match(text)
        if found is None:
            raise ValueError("Parse error on line: %d" % index)

        label = found.group(1)
        residues = found.group(2)
        if label.isdigit() and residues.strip().isdigit():
            # Numeric label and numeric remainder: not sequence data.
            continue

        yield Token("seq_id", label.strip())
        # Drop the column-grouping whitespace inside the sequence text.
        yield Token("seq", "".join(residues.split()).strip())
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
151
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
152
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
153
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
154
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
155
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
156