annotate corebio/seq_io/genbank_io.py @ 14:778f03497adb

Uploaded
author davidmurphy
date Fri, 24 Feb 2012 11:37:26 -0500
parents c55bdc2fb9fa
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1 #!/usr/bin/env python
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
2
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
3
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
4 """Read GenBank flat files.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
5
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
6 Currently only reads sequence data and not annotations.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
7
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
8 """
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
9 from corebio.utils import *
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
10 from corebio.seq import *
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
11
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
12
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
# Format identifiers and filename extensions declared by this reader module.
names = ('genbank',)
extensions = ('gb', 'genbank', 'gbk')
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
15
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
16
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
17
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def read(fin, alphabet=None):
    """Read and parse a file of genbank records.

    Every record produced by iterseq() is collected and wrapped in a SeqList.

    Args:
    fin -- A stream or file to read
    alphabet -- The expected alphabet of the data, if given

    Returns:
    SeqList -- A list of sequences

    Raises:
    ValueError -- If the file is unparsable
    """
    # Materialise the generator first so SeqList receives a real list,
    # exactly as the original comprehension did.
    seqs = list(iterseq(fin, alphabet))
    return SeqList(seqs)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
33
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
34
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def iterseq(fin, alphabet=None):
    """Iterate over genbank records.

    Each record must begin (after optional blank lines) with a line starting
    with 'LOCUS'. Lines are then skipped up to the first 'ORIGIN' or '//'
    line. If '//' comes first, the record has no sequence data and an empty
    Seq is yielded. Otherwise, the lines after 'ORIGIN' are read up to the
    closing '//'; the first whitespace-separated token of each line is
    dropped and the remaining tokens are joined to form the sequence.

    Args:
    fin -- A stream or file to read
    alphabet -- The expected alphabet of the data, if given

    Yields:
    Seq -- One alphabetic sequence at a time.

    Raises:
    ValueError -- If the file is unparsable
    """
    alphabet = Alphabet(alphabet)

    def notblank(string):
        return not isblank(string)

    def section_end(string):
        # Either the start of the sequence block or the end of the record.
        return string.startswith(('ORIGIN', '//'))

    lines = Reiterate(iter(fin))

    while True:
        # NOTE(review): lines.filter() is expected to raise StopIteration once
        # the input is exhausted -- the original loop had no other way to
        # finish. Confirm against Reiterate. Catch it explicitly: since
        # PEP 479 a StopIteration escaping a generator body is turned into
        # RuntimeError instead of ending the iteration.
        try:
            line = lines.filter(notblank)
        except StopIteration:
            return

        if not line.startswith('LOCUS'):
            raise ValueError(
                "Cannot find start of record at line %d"% lines.index() )

        try:
            line = lines.filter(section_end)
        except StopIteration:
            # Input ended inside a record header; stop quietly, matching the
            # previous end-of-input behaviour.
            return

        if line.startswith('//'):
            # No sequence data
            yield Seq('', alphabet)
        else:
            seq = []
            for line in lines:
                if line.startswith('//'):
                    yield Seq(''.join(seq), alphabet)
                    break
                # Drop the first token of the line and keep the rest
                # as sequence chunks.
                seq.extend(line.split()[1:])
            # If the input ends before '//', the partial sequence is
            # discarded and the next lines.filter() call ends the iteration.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
77
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
78
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
79
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
80
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
81
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
82
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
83
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
84
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
85