0
|
1 #!/usr/bin/env python
|
|
2
|
|
3
|
|
4 """Read GenBank flat files.
|
|
5
|
|
6 Currently only reads sequence data and not annotations.
|
|
7
|
|
8 """
|
|
9 from corebio.utils import *
|
|
10 from corebio.seq import *
|
|
11
|
|
12
|
|
# Name(s) identifying the file format handled by this module.
names = ('genbank',)
# File name extensions associated with that format.
extensions = ('gb', 'genbank', 'gbk')
|
|
15
|
|
16
|
|
17
|
|
def read(fin, alphabet=None):
    """Parse every genbank record in a file and collect the sequences.

    Args:
        fin -- A stream or file to read
        alphabet -- The expected alphabet of the data, if given

    Returns:
        SeqList -- A list of sequences

    Raises:
        ValueError -- If the file is unparsable
    """
    # Drain the record iterator completely before wrapping the result.
    records = list(iterseq(fin, alphabet))
    return SeqList(records)
|
|
33
|
|
34
|
|
def iterseq(fin, alphabet=None):
    """Iterate over genbank records.

    Only the sequence data that follows each ORIGIN line is read; the rest
    of every record is skipped.

    Args:
        fin -- A stream or file to read
        alphabet -- The expected alphabet of the data, if given

    Yields:
        Seq -- One alphabetic sequence at a time.

    Raises:
        ValueError -- If the file is unparsable
    """
    alphabet = Alphabet(alphabet)

    # Sequence fragments of the record currently being read.
    seq = []

    def notblank(string):
        return not isblank(string)

    lines = Reiterate(iter(fin))

    while True:
        # This loop has no exit of its own: it ends when the line source is
        # exhausted. Reiterate.filter is expected to signal that by raising
        # StopIteration (NOTE(review): confirm against corebio.utils).
        # A StopIteration escaping a generator body is turned into a
        # RuntimeError by PEP 479, so finish the generator explicitly.
        try:
            line = lines.filter(notblank)
        except StopIteration:
            return
        if not line.startswith('LOCUS'):
            raise ValueError(
                "Cannot find start of record at line %d" % lines.index())

        # Skip ahead to the sequence section or to the end-of-record marker.
        try:
            line = lines.filter(lambda s: s.startswith(('ORIGIN', '//')))
        except StopIteration:
            # Input ended inside a record; end the iteration without
            # yielding anything for the incomplete record.
            return

        if line.startswith('//'):
            # No sequence data
            yield Seq('', alphabet)
        else:
            for line in lines:
                if line.startswith('//'):
                    yield Seq(''.join(seq), alphabet)
                    seq = []
                    break
                # Drop the first whitespace-separated field (presumably the
                # position counter of a GenBank sequence line) and keep the
                # remaining chunks.
                seq.extend(line.split()[1:])
|
|
77
|
|
78
|
|
79
|
|
80
|
|
81
|
|
82
|
|
83
|
|
84
|
|
85 |