annotate tools/genome_diversity/cdblib.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #!/usr/bin/env python2.5
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 '''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 Manipulate DJB's Constant Databases. These are 2 level disk-based hash tables
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5 that efficiently handle many keys, while remaining space-efficient.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7 http://cr.yp.to/cdb.html
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9 When generated databases are only used with Python code, consider using hash()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10 rather than djb_hash() for a tidy speedup.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11 '''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
13 from _struct import Struct
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
14 from itertools import chain
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
17 def py_djb_hash(s):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
18 '''Return the value of DJB's hash function for the given 8-bit string.'''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19 h = 5381
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20 for c in s:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21 h = (((h << 5) + h) ^ ord(c)) & 0xffffffff
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22 return h
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
24 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
25 from _cdblib import djb_hash
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
26 except ImportError:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
27 djb_hash = py_djb_hash
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
28
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
29 read_2_le4 = Struct('<LL').unpack
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
30 write_2_le4 = Struct('<LL').pack
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
31
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
32
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
33 class Reader(object):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
34 '''A dictionary-like object for reading a Constant Database accessed
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
35 through a string or string-like sequence, such as mmap.mmap().'''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
36
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
37 def __init__(self, data, hashfn=djb_hash):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
38 '''Create an instance reading from a sequence and using hashfn to hash
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
39 keys.'''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
40 if len(data) < 2048:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
41 raise IOError('CDB too small')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
42
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
43 self.data = data
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
44 self.hashfn = hashfn
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
45
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
46 self.index = [read_2_le4(data[i:i+8]) for i in xrange(0, 2048, 8)]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
47 self.table_start = min(p[0] for p in self.index)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
48 # Assume load load factor is 0.5 like official CDB.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
49 self.length = sum(p[1] >> 1 for p in self.index)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
50
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
51 def iteritems(self):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
52 '''Like dict.iteritems(). Items are returned in insertion order.'''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
53 pos = 2048
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
54 while pos < self.table_start:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
55 klen, dlen = read_2_le4(self.data[pos:pos+8])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
56 pos += 8
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
57
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
58 key = self.data[pos:pos+klen]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
59 pos += klen
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
60
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
61 data = self.data[pos:pos+dlen]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
62 pos += dlen
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
63
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
64 yield key, data
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
65
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
66 def items(self):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
67 '''Like dict.items().'''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
68 return list(self.iteritems())
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
69
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
70 def iterkeys(self):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
71 '''Like dict.iterkeys().'''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
72 return (p[0] for p in self.iteritems())
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
73 __iter__ = iterkeys
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
74
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
75 def itervalues(self):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
76 '''Like dict.itervalues().'''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
77 return (p[1] for p in self.iteritems())
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
78
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
79 def keys(self):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
80 '''Like dict.keys().'''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
81 return [p[0] for p in self.iteritems()]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
82
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
83 def values(self):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
84 '''Like dict.values().'''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
85 return [p[1] for p in self.iteritems()]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
86
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
87 def __getitem__(self, key):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
88 '''Like dict.__getitem__().'''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
89 value = self.get(key)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
90 if value is None:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
91 raise KeyError(key)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
92 return value
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
93
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
94 def has_key(self, key):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
95 '''Return True if key exists in the database.'''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
96 return self.get(key) is not None
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
97 __contains__ = has_key
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
98
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
99 def __len__(self):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
100 '''Return the number of records in the database.'''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
101 return self.length
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
102
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
103 def gets(self, key):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
104 '''Yield values for key in insertion order.'''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
105 # Truncate to 32 bits and remove sign.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
106 h = self.hashfn(key) & 0xffffffff
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
107 start, nslots = self.index[h & 0xff]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
108
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
109 if nslots:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
110 end = start + (nslots << 3)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
111 slot_off = start + (((h >> 8) % nslots) << 3)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
112
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
113 for pos in chain(xrange(slot_off, end, 8),
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
114 xrange(start, slot_off, 8)):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
115 rec_h, rec_pos = read_2_le4(self.data[pos:pos+8])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
116
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
117 if not rec_h:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
118 break
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
119 elif rec_h == h:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
120 klen, dlen = read_2_le4(self.data[rec_pos:rec_pos+8])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
121 rec_pos += 8
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
122
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
123 if self.data[rec_pos:rec_pos+klen] == key:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
124 rec_pos += klen
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
125 yield self.data[rec_pos:rec_pos+dlen]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
126
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
127 def get(self, key, default=None):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
128 '''Get the first value for key, returning default if missing.'''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
129 # Avoid exception catch when handling default case; much faster.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
130 return chain(self.gets(key), (default,)).next()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
131
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
132 def getint(self, key, default=None, base=0):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
133 '''Get the first value for key converted it to an int, returning
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
134 default if missing.'''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
135 value = self.get(key, default)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
136 if value is not default:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
137 return int(value, base)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
138 return value
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
139
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
140 def getints(self, key, base=0):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
141 '''Yield values for key in insertion order after converting to int.'''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
142 return (int(v, base) for v in self.gets(key))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
143
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
144 def getstring(self, key, default=None, encoding='utf-8'):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
145 '''Get the first value for key decoded as unicode, returning default if
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
146 not found.'''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
147 value = self.get(key, default)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
148 if value is not default:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
149 return value.decode(encoding)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
150 return value
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
151
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
152 def getstrings(self, key, encoding='utf-8'):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
153 '''Yield values for key in insertion order after decoding as
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
154 unicode.'''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
155 return (v.decode(encoding) for v in self.gets(key))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
156
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
157
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
158 class Writer(object):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
159 '''Object for building new Constant Databases, and writing them to a
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
160 seekable file-like object.'''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
161
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
162 def __init__(self, fp, hashfn=djb_hash):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
163 '''Create an instance writing to a file-like object, using hashfn to
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
164 hash keys.'''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
165 self.fp = fp
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
166 self.hashfn = hashfn
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
167
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
168 fp.write('\x00' * 2048)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
169 self._unordered = [[] for i in xrange(256)]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
170
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
171 def put(self, key, value=''):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
172 '''Write a string key/value pair to the output file.'''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
173 assert type(key) is str and type(value) is str
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
174
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
175 pos = self.fp.tell()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
176 self.fp.write(write_2_le4(len(key), len(value)))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
177 self.fp.write(key)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
178 self.fp.write(value)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
179
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
180 h = self.hashfn(key) & 0xffffffff
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
181 self._unordered[h & 0xff].append((h, pos))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
182
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
183 def puts(self, key, values):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
184 '''Write more than one value for the same key to the output file.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
185 Equivalent to calling put() in a loop.'''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
186 for value in values:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
187 self.put(key, value)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
188
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
189 def putint(self, key, value):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
190 '''Write an integer as a base-10 string associated with the given key
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
191 to the output file.'''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
192 self.put(key, str(value))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
193
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
194 def putints(self, key, values):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
195 '''Write zero or more integers for the same key to the output file.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
196 Equivalent to calling putint() in a loop.'''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
197 self.puts(key, (str(value) for value in values))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
198
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
199 def putstring(self, key, value, encoding='utf-8'):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
200 '''Write a unicode string associated with the given key to the output
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
201 file after encoding it as UTF-8 or the given encoding.'''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
202 self.put(key, unicode.encode(value, encoding))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
203
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
204 def putstrings(self, key, values, encoding='utf-8'):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
205 '''Write zero or more unicode strings to the output file. Equivalent to
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
206 calling putstring() in a loop.'''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
207 self.puts(key, (unicode.encode(value, encoding) for value in values))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
208
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
209 def finalize(self):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
210 '''Write the final hash tables to the output file, and write out its
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
211 index. The output file remains open upon return.'''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
212 index = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
213 for tbl in self._unordered:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
214 length = len(tbl) << 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
215 ordered = [(0, 0)] * length
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
216 for pair in tbl:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
217 where = (pair[0] >> 8) % length
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
218 for i in chain(xrange(where, length), xrange(0, where)):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
219 if not ordered[i][0]:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
220 ordered[i] = pair
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
221 break
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
222
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
223 index.append((self.fp.tell(), length))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
224 for pair in ordered:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
225 self.fp.write(write_2_le4(*pair))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
226
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
227 self.fp.seek(0)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
228 for pair in index:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
229 self.fp.write(write_2_le4(*pair))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
230 self.fp = None # prevent double finalize()