|
2 | 2 | from math import ceil, floor, exp, log
|
3 | 3 | class Bloom:
|
4 | 4 | ''' Implements the basic bloom filter using bytearray and MD5 hashing
|
5 |
| - If we let k be the number of hash functions involved, m the size of the bloom filter, n be the expected number of elemeents coming in, then |
| 5 | + If k is the number of hashes, m the size of the bloom filter, and n the expected number of elements, then |
6 | 6 | k = (m/n) ln 2 and m = -(n ln p)/(ln 2)^2 are asymptotically optimal.
|
7 | 7 | However, as bytearrays are used, m is made divisible by eight, currently pessimistically.
|
8 | 8 | '''
|
9 | 9 |
|
10 |
| - def hash(s, seed, m): |
11 |
| - ''' Compute one hash of at least m bits, returned as a bytes object of whatever length ''' |
12 |
| - return hashlib.md5(s.encode()).digest() |
| 10 | + def _hash(s, seed, m): |
| 11 | + ''' Compute one hash of >= m bits (must be constant), returned as a bytes object of whatever length ''' |
| 12 | + return hashlib.md5(s.encode(), seed).digest() |
13 | 13 |
|
14 |
| - def khashes(s, k, m): |
15 |
| - ''' Make k hashes of length m from one string; return as integer list ''' |
| 14 | + def _khashes(s, k, m): |
| 15 | + ''' Make k hashes of length m from one string; return as integer list |
| 16 | + This is currently done by computing as many hashes as needed to get k indices. |
| 17 | + A more-efficient alternative would be using |
| 18 | + Kirsch, A. and Mitzenmacher, M. (2008), Less hashing, same performance: Building a better Bloom filter. |
| 19 | + Random Struct. Alg., 33: 187-218. doi:10.1002/rsa.20208''' |
| 20 | + hashes = [_hash(s, 0)] |
| 21 | + hashlen = len(hashes[0]) << 3 |
| 22 | + hashes += [] |
16 | 23 |
|
17 | 24 |
|
18 | 25 | def __init__(self, n, p):
|
|
0 commit comments