Mercurial > repos > davidmurphy > codonlogo
comparison corebio/seq_io/nbrf_io.py @ 0:c55bdc2fb9fa
Uploaded
author | davidmurphy |
---|---|
date | Thu, 27 Oct 2011 12:09:09 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c55bdc2fb9fa |
---|---|
1 | |
2 # Copyright (c) 2006, The Regents of the University of California, through | |
3 # Lawrence Berkeley National Laboratory (subject to receipt of any required | |
4 # approvals from the U.S. Dept. of Energy). All rights reserved. | |
5 | |
6 # This software is distributed under the new BSD Open Source License. | |
7 # <http://www.opensource.org/licenses/bsd-license.html> | |
8 # | |
9 # Redistribution and use in source and binary forms, with or without | |
10 # modification, are permitted provided that the following conditions are met: | |
11 # | |
12 # (1) Redistributions of source code must retain the above copyright notice, | |
13 # this list of conditions and the following disclaimer. | |
14 # | |
15 # (2) Redistributions in binary form must reproduce the above copyright | |
16 # notice, this list of conditions and the following disclaimer in the | |
17 # documentation and or other materials provided with the distribution. | |
18 # | |
19 # (3) Neither the name of the University of California, Lawrence Berkeley | |
20 # National Laboratory, U.S. Dept. of Energy nor the names of its contributors | |
21 # may be used to endorse or promote products derived from this software | |
22 # without specific prior written permission. | |
23 # | |
24 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
25 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
26 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
27 # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
28 # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
29 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
30 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
31 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
32 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
33 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
34 # POSSIBILITY OF SUCH DAMAGE. | |
35 | |
36 """Sequence IO for NBRF/PIR format. | |
37 | |
38 The format is similar to fasta. The header line consists of '>', a two- | |
39 letter sequence type (P1, F1, DL, DC, RL, RC, or XX), a semicolon, and a | |
40 sequence ID. The next line is a textual description of the sequence, | |
41 followed by one or more lines containing the sequence data. The end of | |
42 the sequence is marked by a "*" (asterisk) character. | |
43 | |
44 type_code -- A map between NBRF two letter type codes and Alphabets. | |
45 | |
46 | |
47 see: http://www.cmbi.kun.nl/bioinf/tools/crab_pir.html | |
48 | |
49 --- Example NBRF File --- | |
50 | |
51 >P1;CRAB_ANAPL | |
52 ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN). | |
53 MDITIHNPLI RRPLFSWLAP SRIFDQIFGE HLQESELLPA SPSLSPFLMR | |
54 SPIFRMPSWL ETGLSEMRLE KDKFSVNLDV KHFSPEELKV KVLGDMVEIH | |
55 GKHEERQDEH GFIAREFNRK YRIPADVDPL TITSSLSLDG VLTVSAPRKQ | |
56 SDVPERSIPI TREEKPAIAG AQRK* | |
57 | |
58 >P1;CRAB_BOVIN | |
59 ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN). | |
60 MDIAIHHPWI RRPFFPFHSP SRLFDQFFGE HLLESDLFPA STSLSPFYLR | |
61 PPSFLRAPSW IDTGLSEMRL EKDRFSVNLD VKHFSPEELK VKVLGDVIEV | |
62 HGKHEERQDE HGFISREFHR KYRIPADVDP LAITSSLSSD GVLTVNGPRK | |
63 QASGPERTIP ITREEKPAVT AAPKK* | |
64 | |
65 """ | |
66 | |
67 from corebio.utils import * | |
68 from corebio.seq import * | |
69 from corebio.seq_io import * | |
70 | |
# Names and file extensions associated with the NBRF/PIR format.
# NOTE(review): presumably consulted by corebio.seq_io when selecting a
# parser -- confirm against the package's format registry.
names = ('nbrf', 'pir')
extensions = ('nbrf', 'pir', 'ali')


# A map between NBRF two-letter sequence type codes and Alphabets.
type_code = {
    # Proteins
    'P1': protein_alphabet,   # Protein (complete)
    'F1': protein_alphabet,   # Protein (fragment)
    # DNA
    'DL': dna_alphabet,       # DNA (linear)
    'DC': dna_alphabet,       # DNA (circular)
    # RNA
    'RC': rna_alphabet,       # RNA (circular)
    'RL': rna_alphabet,       # RNA (linear)
    'N3': rna_alphabet,       # tRNA
    'N1': rna_alphabet,       # other functional RNA
    # Anything else
    'XX': generic_alphabet,
}
88 | |
def read(fin, alphabet=None):
    """Read and parse an NBRF/PIR sequence file.

    Every record produced by iterseq() is collected and wrapped in a
    SeqList.

    Args:
        fin -- A stream or file to read
        alphabet -- The expected alphabet of the data. If not supplied, then
            an appropriate alphabet will be inferred from the data.
    Returns:
        SeqList -- A list of sequences
    Raises:
        ValueError -- If the file is unparsable
    """
    return SeqList(list(iterseq(fin, alphabet)))
103 | |
104 | |
105 | |
def iterseq(fin, alphabet=None):
    """Generate sequences from an NBRF/PIR file.

    The parser is a three-state machine:
      body     -- between records; blank lines are skipped and a line
                  starting with '>' opens a new record,
      header   -- the single line after '>' is taken as the description,
      sequence -- data lines are accumulated (white space removed) until
                  a line ending in '*' closes the record.

    Arguments:
        fin -- A stream or file to read (any iterable of lines).
        alphabet -- The alphabet to give every sequence. If not supplied,
            the alphabet is looked up in type_code using the two-letter
            type code of each record header.
    Yields:
        Seq -- One sequence per record, with its name and description set
            from the record header lines.
    Raises:
        ValueError -- On a parse error: unexpected text between records, a
            malformed header line, an unknown sequence type code, or a
            final record that is not terminated by '*'.
    """
    body, header, sequence = range(3)  # Internal parser states

    state = body
    seq_id = None
    seq_desc = None
    seq_alpha = None
    seqs = []

    # NOTE: line numbers reported in error messages are zero-based, since
    # they come straight from enumerate().
    for lineno, line in enumerate(fin):
        if state == body:
            if not line.strip():
                # Blank lines between records carry no information.
                continue
            if line[0] != '>':
                raise ValueError("Parse error on line: %d" % lineno)

            # A header is '>' + type code + ';' + sequence id. Exactly one
            # semicolon is required.
            fields = line[1:].split(';')
            if len(fields) != 2:
                raise ValueError("Parse error on line: %d" % lineno)
            seq_type, seq_id = fields

            if alphabet:
                seq_alpha = alphabet
            else:
                try:
                    seq_alpha = type_code[seq_type]
                except KeyError:
                    # Report unknown codes as the documented parse error
                    # rather than leaking a KeyError to callers.
                    raise ValueError(
                        "Unknown sequence type code %r on line: %d"
                        % (seq_type, lineno))
            state = header

        elif state == header:
            # The line after the header is the description, even if empty.
            seq_desc = line.strip()
            state = sequence

        else:  # state == sequence
            data = "".join(line.split())  # Strip out white space
            if not data.endswith('*'):
                # Ordinary data line. A blank line yields an empty string,
                # which must not be indexed and adds nothing.
                if data:
                    seqs.append(data)
                continue

            # End of sequence data: drop the '*' terminator and emit.
            seqs.append(data[:-1])
            yield Seq("".join(seqs), name=seq_id.strip(),
                      description=seq_desc, alphabet=seq_alpha)

            state = body
            seq_id = None
            seq_desc = None
            seqs = []

    if state != body:
        # Input ended inside a record; fail loudly rather than silently
        # dropping the unterminated sequence.
        raise ValueError(
            "Parse error: unexpected end of file inside a sequence record")
168 | |
169 |