annotate cdblib.py @ 28:184d14e4270d

Update to Miller Lab devshed revision 4ede22dd5500
author Richard Burhans <burhans@bx.psu.edu>
date Wed, 17 Jul 2013 12:46:46 -0400
parents 2c498d40ecde
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
1 #!/usr/bin/env python
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
2
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
3 '''
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
4 Manipulate DJB's Constant Databases. These are 2 level disk-based hash tables
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
5 that efficiently handle many keys, while remaining space-efficient.
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
6
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
7 http://cr.yp.to/cdb.html
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
8
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
9 When generated databases are only used with Python code, consider using hash()
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
10 rather than djb_hash() for a tidy speedup.
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
11 '''
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
12
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
13 from _struct import Struct
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
14 from itertools import chain
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
15
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
16
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
17 def py_djb_hash(s):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
18 '''Return the value of DJB's hash function for the given 8-bit string.'''
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
19 h = 5381
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
20 for c in s:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
21 h = (((h << 5) + h) ^ ord(c)) & 0xffffffff
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
22 return h
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
23
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
24 try:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
25 from _cdblib import djb_hash
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
26 except ImportError:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
27 djb_hash = py_djb_hash
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
28
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
29 read_2_le4 = Struct('<LL').unpack
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
30 write_2_le4 = Struct('<LL').pack
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
31
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
32
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
33 class Reader(object):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
34 '''A dictionary-like object for reading a Constant Database accessed
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
35 through a string or string-like sequence, such as mmap.mmap().'''
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
36
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
37 def __init__(self, data, hashfn=djb_hash):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
38 '''Create an instance reading from a sequence and using hashfn to hash
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
39 keys.'''
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
40 if len(data) < 2048:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
41 raise IOError('CDB too small')
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
42
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
43 self.data = data
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
44 self.hashfn = hashfn
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
45
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
46 self.index = [read_2_le4(data[i:i+8]) for i in xrange(0, 2048, 8)]
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
47 self.table_start = min(p[0] for p in self.index)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
48 # Assume load load factor is 0.5 like official CDB.
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
49 self.length = sum(p[1] >> 1 for p in self.index)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
50
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
51 def iteritems(self):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
52 '''Like dict.iteritems(). Items are returned in insertion order.'''
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
53 pos = 2048
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
54 while pos < self.table_start:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
55 klen, dlen = read_2_le4(self.data[pos:pos+8])
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
56 pos += 8
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
57
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
58 key = self.data[pos:pos+klen]
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
59 pos += klen
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
60
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
61 data = self.data[pos:pos+dlen]
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
62 pos += dlen
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
63
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
64 yield key, data
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
65
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
66 def items(self):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
67 '''Like dict.items().'''
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
68 return list(self.iteritems())
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
69
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
70 def iterkeys(self):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
71 '''Like dict.iterkeys().'''
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
72 return (p[0] for p in self.iteritems())
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
73 __iter__ = iterkeys
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
74
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
75 def itervalues(self):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
76 '''Like dict.itervalues().'''
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
77 return (p[1] for p in self.iteritems())
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
78
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
79 def keys(self):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
80 '''Like dict.keys().'''
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
81 return [p[0] for p in self.iteritems()]
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
82
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
83 def values(self):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
84 '''Like dict.values().'''
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
85 return [p[1] for p in self.iteritems()]
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
86
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
87 def __getitem__(self, key):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
88 '''Like dict.__getitem__().'''
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
89 value = self.get(key)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
90 if value is None:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
91 raise KeyError(key)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
92 return value
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
93
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
94 def has_key(self, key):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
95 '''Return True if key exists in the database.'''
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
96 return self.get(key) is not None
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
97 __contains__ = has_key
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
98
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
99 def __len__(self):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
100 '''Return the number of records in the database.'''
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
101 return self.length
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
102
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
103 def gets(self, key):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
104 '''Yield values for key in insertion order.'''
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
105 # Truncate to 32 bits and remove sign.
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
106 h = self.hashfn(key) & 0xffffffff
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
107 start, nslots = self.index[h & 0xff]
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
108
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
109 if nslots:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
110 end = start + (nslots << 3)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
111 slot_off = start + (((h >> 8) % nslots) << 3)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
112
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
113 for pos in chain(xrange(slot_off, end, 8),
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
114 xrange(start, slot_off, 8)):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
115 rec_h, rec_pos = read_2_le4(self.data[pos:pos+8])
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
116
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
117 if not rec_h:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
118 break
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
119 elif rec_h == h:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
120 klen, dlen = read_2_le4(self.data[rec_pos:rec_pos+8])
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
121 rec_pos += 8
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
122
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
123 if self.data[rec_pos:rec_pos+klen] == key:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
124 rec_pos += klen
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
125 yield self.data[rec_pos:rec_pos+dlen]
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
126
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
127 def get(self, key, default=None):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
128 '''Get the first value for key, returning default if missing.'''
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
129 # Avoid exception catch when handling default case; much faster.
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
130 return chain(self.gets(key), (default,)).next()
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
131
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
132 def getint(self, key, default=None, base=0):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
133 '''Get the first value for key converted it to an int, returning
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
134 default if missing.'''
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
135 value = self.get(key, default)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
136 if value is not default:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
137 return int(value, base)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
138 return value
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
139
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
140 def getints(self, key, base=0):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
141 '''Yield values for key in insertion order after converting to int.'''
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
142 return (int(v, base) for v in self.gets(key))
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
143
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
144 def getstring(self, key, default=None, encoding='utf-8'):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
145 '''Get the first value for key decoded as unicode, returning default if
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
146 not found.'''
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
147 value = self.get(key, default)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
148 if value is not default:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
149 return value.decode(encoding)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
150 return value
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
151
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
152 def getstrings(self, key, encoding='utf-8'):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
153 '''Yield values for key in insertion order after decoding as
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
154 unicode.'''
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
155 return (v.decode(encoding) for v in self.gets(key))
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
156
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
157
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
158 class Writer(object):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
159 '''Object for building new Constant Databases, and writing them to a
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
160 seekable file-like object.'''
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
161
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
162 def __init__(self, fp, hashfn=djb_hash):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
163 '''Create an instance writing to a file-like object, using hashfn to
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
164 hash keys.'''
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
165 self.fp = fp
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
166 self.hashfn = hashfn
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
167
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
168 fp.write('\x00' * 2048)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
169 self._unordered = [[] for i in xrange(256)]
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
170
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
171 def put(self, key, value=''):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
172 '''Write a string key/value pair to the output file.'''
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
173 assert type(key) is str and type(value) is str
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
174
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
175 pos = self.fp.tell()
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
176 self.fp.write(write_2_le4(len(key), len(value)))
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
177 self.fp.write(key)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
178 self.fp.write(value)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
179
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
180 h = self.hashfn(key) & 0xffffffff
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
181 self._unordered[h & 0xff].append((h, pos))
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
182
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
183 def puts(self, key, values):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
184 '''Write more than one value for the same key to the output file.
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
185 Equivalent to calling put() in a loop.'''
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
186 for value in values:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
187 self.put(key, value)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
188
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
189 def putint(self, key, value):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
190 '''Write an integer as a base-10 string associated with the given key
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
191 to the output file.'''
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
192 self.put(key, str(value))
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
193
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
194 def putints(self, key, values):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
195 '''Write zero or more integers for the same key to the output file.
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
196 Equivalent to calling putint() in a loop.'''
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
197 self.puts(key, (str(value) for value in values))
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
198
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
199 def putstring(self, key, value, encoding='utf-8'):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
200 '''Write a unicode string associated with the given key to the output
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
201 file after encoding it as UTF-8 or the given encoding.'''
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
202 self.put(key, unicode.encode(value, encoding))
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
203
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
204 def putstrings(self, key, values, encoding='utf-8'):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
205 '''Write zero or more unicode strings to the output file. Equivalent to
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
206 calling putstring() in a loop.'''
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
207 self.puts(key, (unicode.encode(value, encoding) for value in values))
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
208
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
209 def finalize(self):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
210 '''Write the final hash tables to the output file, and write out its
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
211 index. The output file remains open upon return.'''
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
212 index = []
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
213 for tbl in self._unordered:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
214 length = len(tbl) << 1
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
215 ordered = [(0, 0)] * length
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
216 for pair in tbl:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
217 where = (pair[0] >> 8) % length
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
218 for i in chain(xrange(where, length), xrange(0, where)):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
219 if not ordered[i][0]:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
220 ordered[i] = pair
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
221 break
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
222
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
223 index.append((self.fp.tell(), length))
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
224 for pair in ordered:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
225 self.fp.write(write_2_le4(*pair))
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
226
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
227 self.fp.seek(0)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
228 for pair in index:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
229 self.fp.write(write_2_le4(*pair))
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
230 self.fp = None # prevent double finalize()