comparison corebio/seq_io/msf_io.py @ 0:c55bdc2fb9fa

Uploaded
author davidmurphy
date Thu, 27 Oct 2011 12:09:09 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c55bdc2fb9fa
1 #!/usr/bin/env python
2
3 # Copyright (c) 2005 Clare Gollnick <cgollnick@berkeley.edu>
4 #
5 # This software is distributed under the MIT Open Source License.
6 # <http://www.opensource.org/licenses/mit-license.html>
7 #
8 # Permission is hereby granted, free of charge, to any person obtaining a
9 # copy of this software and associated documentation files (the "Software"),
10 # to deal in the Software without restriction, including without limitation
11 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 # and/or sell copies of the Software, and to permit persons to whom the
13 # Software is furnished to do so, subject to the following conditions:
14 #
15 # The above copyright notice and this permission notice shall be included
16 # in all copies or substantial portions of the Software.
17 #
18 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 # THE SOFTWARE.
25 #
26
27
28 """Read sequence information in MSF format.
29
30 This is a file format for biological sequence data. The sequences are interleaved, and each line is labeled with the sequence name. The MSF format can be identified in one or more of the following ways:
31 1. The word PileUp on the first line (optional)
32 2. The word !!AA_MULTIPLE_ALIGNMENT or !!NA_MULTIPLE_ALIGNMENT at the start of the file (optional)
33 3. The word MSF on the first line of the file, and the characters ".." at the end of this line (optional)
34 4. A header containing sequence information followed by a line with the characters "//"
35 """
# Sample MSF input: a "PileUp" line, an "MSF: ... .." line, six "Name:" entries
# (Cow, Carp, Chicken, Human, Loach, Mouse), the "//" separator, and two
# interleaved blocks of sequence lines labelled with those names.
36 example= """
37
38 PileUp
39
40
41 MSF: 64 Type: P Check: 767 ..
42
43 Name: Cow Len: 100 Check: 3761 Weight: 1.00
44 Name: Carp Len: 100 Check: 1550 Weight: 1.00
45 Name: Chicken Len: 100 Check: 2397 Weight: 1.00
46 Name: Human Len: 100 Check: 9021 Weight: 1.00
47 Name: Loach Len: 100 Check: 984 Weight: 1.00
48 Name: Mouse Len: 100 Check: 2993 Weight: 1.00
49
50
51 //
52
53
54 Cow MAYPMQLGFQ DATSPIMEEL LHFHDHTLMI VFLISSLVLY IISLMLTTKL
55 Carp MAHPTQLGFK DAAMPVMEEL LHFHDHALMI VLLISTLVLY IITAMVSTKL
56 Chicken MANHSQLGFQ DASSPIMEEL VEFHDHALMV ALAICSLVLY LLTLMLMEKL
57 Human MAHAAQVGLQ DATSPIMEEL ITFHDHALMI IFLICFLVLY ALFLTLTTKL
58 Loach MAHPTQLGFQ DAASPVMEEL LHFHDHALMI VFLISALVLY VIITTVSTKL
59 Mouse MAYPFQLGLQ DATSPIMEEL MNFHDHTLMI VFLISSLVLY IISLMLTTKL
60
61
62
63 Cow THTSTMDAQE VETIWTILPA IILILIALPS LRILYMMDEI NNPSLTVKTM
64 Carp TNKYILDSQE IEIVWTILPA VILVLIALPS LRILYLMDEI NDPHLTIKAM
65 Chicken S.SNTVDAQE VELIWTILPA IVLVLLALPS LQILYMMDEI DEPDLTLKAI
66 Human TNTNISDAQE METVWTILPA IILVLIALPS LRILYMTDEV NDPSLTIKSI
67 Loach TNMYILDSQE IEIVWTVLPA LILILIALPS LRILYLMDEI NDPHLTIKAM
68 Mouse THTSTMDAQE VETIWTILPA VILIMIALPS LRILYMMDEI NNPVLTVKTM
69
70 """
71
72 import re
73
74 from corebio.seq import *
75 from corebio.seq_io import *
76 from corebio.utils import *
77
# Identifiers for this format (presumably used by the seq_io package to select
# this module by name -- confirm against the package's registry).
names = ('msf', 'gcg-msf', 'gcg', 'PileUp')

# File extensions for this format. The trailing comma is required: ('msf') is
# just the string 'msf', so iterating it would yield 'm', 's', 'f' and a
# membership test such as `'m' in extensions` would wrongly succeed.
extensions = ('msf',)

# End of header: "//" at the start of the line, followed only by whitespace.
end_header = re.compile(r'(//)(\s*)$')

# Sequence line: optional leading whitespace, one non-blank token (group 1),
# whitespace, then the remainder of the line (group 2).
seq_line = re.compile(r'\s*(\S+)\s+([\S\s.?]+)$')
83
def iterseq(fin, alphabet=None):
    """Return an iterator over the sequences parsed from `fin`.

    The input is parsed in full by `read` first; the iterator then walks
    the resulting collection.
    """
    parsed = read(fin, alphabet)
    return iter(parsed)
88
89
90
def read(fin, alphabet=None):
    """Parse MSF-formatted input and return its sequences as a SeqList.

    Tokens from `_line_is` are consumed in order. Each block restarts a row
    counter; the n-th sequence line of every block is appended to the n-th
    sequence. A name is recorded only when its row has not been seen in an
    earlier block.

    Raises:
        ValueError: if a sequence fragment is rejected by
            `alphabet.alphabetic`, or if no sequence names were found at all.
    """
    alphabet = Alphabet(alphabet)
    labels = []      # sequence names, in order of first appearance
    fragments = []   # fragments[i] collects the pieces of sequence i
    row = 0          # index of the current sequence within the block

    for token in _line_is(fin):
        kind = token.typeof
        if kind == "begin_block":
            row = 0
        elif kind == "seq_id":
            # Only rows not seen in an earlier block introduce a new sequence.
            if row >= len(fragments):
                labels.append(token.data)
                fragments.append([])
        elif kind == "seq":
            if not alphabet.alphabetic(token.data):
                raise ValueError(
                    "Character on line: %d not in alphabet: %s : %s" % (
                        token.lineno, alphabet, token.data))
            fragments[row].append(token.data)
            row += 1

    if not labels:
        raise ValueError("Parse error, possible wrong format")

    records = []
    for pieces, label in zip(fragments, labels):
        records.append(Seq("".join(pieces), alphabet, name=label))
    return SeqList(records)
116
def _line_is(fin):
    """Tokenize MSF input line by line, yielding Token objects.

    The generator moves through three states:

    * header -- every line is ignored until one matches `end_header`;
      an "end_header" token is then emitted.
    * body   -- blank lines between blocks are skipped; the first non-blank
      line emits "begin_block" and is then handled as a block line.
    * block  -- a blank line emits "end_block"; any other line must match
      `seq_line` and emits a "seq_id" token followed by a "seq" token whose
      data has all whitespace removed.

    A "begin" token is emitted before any input is read. No "end_block"
    token is emitted if the input ends while still inside a block.

    Raises:
        ValueError: if a non-blank line inside a block does not match
            `seq_line`. The reported line number is 1-based.
    """
    header, body, block = range(3)
    yield Token("begin")
    state = header
    for L, line in enumerate(fin):
        if state == header:
            # Nothing before the "//" line is tokenized.
            if end_header.match(line) is not None:
                yield Token("end_header")
                state = body
            continue

        if state == body:
            if line.isspace():
                continue
            yield Token("begin_block")
            state = block
            # Fall through: this same line is the first line of the block.

        if state == block:
            if line.isspace():
                yield Token("end_block")
                state = body
                continue
            m = seq_line.match(line)
            if m is None:
                # enumerate() counts from 0; report a 1-based line number.
                raise ValueError("Parse error on line: %d" % (L + 1))
            label, residues = m.group(1), m.group(2)
            if label.isdigit() and residues.strip().isdigit():
                # Both fields are purely numeric: presumably a column-position
                # ruler rather than a sequence, so it is skipped.
                continue
            # NOTE(review): tokens are created without a line number, although
            # read() formats token.lineno in its error message -- confirm what
            # Token defaults to.
            yield Token("seq_id", label)
            yield Token("seq", "".join(residues.split()))
151
152
153
154
155
156