URL deduplication with a Bloom filter: how it works and a Python implementation
Posted by CrossPython
Editor's note: this article, compiled by the cha138.com editors, introduces URL deduplication with a Bloom filter, covering how the data structure works and a Python implementation. It is offered as a reference.
Original source: https://blog.csdn.net/a1368783069/article/details/52137417
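Before the full module, it helps to see how a Bloom filter is sized. For a target capacity n and false-positive rate p, the standard formulas give k = ceil(log2(1/p)) hash functions and roughly M = n * |ln p| / (ln 2)^2 bits in total. The snippet below is a minimal illustration of those formulas; the bloom_parameters helper is written for this post and is not part of pybloom. The pybloom code that follows does the equivalent arithmetic, except that it splits the bit array into k equal slices and doubles the capacity term because it assumes the array is only half full.

import math

def bloom_parameters(capacity, error_rate):
    """Illustrative helper (not part of pybloom): standard Bloom filter sizing.

    k = number of hash functions = ceil(log2(1 / p))
    M = total number of bits     = ceil(n * |ln p| / (ln 2) ** 2)
    """
    k = int(math.ceil(math.log(1.0 / error_rate, 2)))
    M = int(math.ceil(capacity * abs(math.log(error_rate)) / (math.log(2) ** 2)))
    return k, M

# Example: 100,000 URLs with a 0.1% false-positive target
k, M = bloom_parameters(100000, 0.001)
print(k, M)  # 10 hash functions, about 1.44 million bits (~175 KiB)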
# -*- encoding: utf-8 -*-
"""This module implements a bloom filter probabilistic data structure and
a Scalable Bloom Filter that grows in size as you add more items to it
without increasing the false positive error_rate.

Requires the bitarray library: http://pypi.python.org/pypi/bitarray/

>>> from pybloom import BloomFilter
>>> f = BloomFilter(capacity=10000, error_rate=0.001)
>>> for i in range(0, f.capacity):
...     _ = f.add(i)
...
>>> 0 in f
True
>>> f.capacity in f
False
>>> len(f) <= f.capacity
True
>>> abs((len(f) / float(f.capacity)) - 1.0) <= f.error_rate
True

>>> from pybloom import ScalableBloomFilter
>>> sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
>>> count = 10000
>>> for i in range(0, count):
...     _ = sbf.add(i)
...
>>> sbf.capacity > count
True
>>> len(sbf) <= count
True
>>> abs((len(sbf) / float(count)) - 1.0) <= sbf.error_rate
True

"""
import math
import hashlib
from struct import unpack, pack, calcsize

try:
    import bitarray
except ImportError:
    raise ImportError('pybloom requires bitarray >= 0.3.4')

__version__ = '1.1'
__author__ = "Jay Baird <[email protected]>, Bob Ippolito <[email protected]>, Marius Eriksen <[email protected]>, Alex Brasetvik <[email protected]>"


def make_hashfuncs(num_slices, num_bits):
    if num_bits >= (1 << 31):
        fmt_code, chunk_size = 'Q', 8
    elif num_bits >= (1 << 15):
        fmt_code, chunk_size = 'I', 4
    else:
        fmt_code, chunk_size = 'H', 2
    total_hash_bits = 8 * num_slices * chunk_size
    if total_hash_bits > 384:
        hashfn = hashlib.sha512
    elif total_hash_bits > 256:
        hashfn = hashlib.sha384
    elif total_hash_bits > 160:
        hashfn = hashlib.sha256
    elif total_hash_bits > 128:
        hashfn = hashlib.sha1
    else:
        hashfn = hashlib.md5
    fmt = fmt_code * (hashfn().digest_size // chunk_size)
    num_salts, extra = divmod(num_slices, len(fmt))
    if extra:
        num_salts += 1
    salts = [hashfn(hashfn(pack('I', i)).digest()) for i in range(num_salts)]

    def _make_hashfuncs(key):
        # hashlib needs bytes, so hash the key's string form (Python 3 adaptation)
        key = str(key).encode("utf-8")
        rval = []
        for salt in salts:
            h = salt.copy()
            h.update(key)
            rval.extend(uint % num_bits for uint in unpack(fmt, h.digest()))
        del rval[num_slices:]
        return rval

    return _make_hashfuncs


class BloomFilter(object):
    FILE_FMT = '<dQQQQ'

    def __init__(self, capacity, error_rate=0.001):
        """Implements a space-efficient probabilistic data structure

        capacity
            this BloomFilter must be able to store at least *capacity*
            elements while maintaining no more than *error_rate* chance
            of false positives
        error_rate
            the error_rate of the filter returning false positives. This
            determines the filter's capacity. Inserting more than capacity
            elements greatly increases the chance of false positives.

        >>> b = BloomFilter(capacity=100000, error_rate=0.001)
        >>> b.add("test")
        False
        >>> "test" in b
        True

        """
        if not (0 < error_rate < 1):
            raise ValueError("Error_Rate must be between 0 and 1.")
        if not capacity > 0:
            raise ValueError("Capacity must be > 0")
        # given M = num_bits, k = num_slices, p = error_rate, n = capacity
        # solving for m = bits_per_slice
        # n ~= M * ((ln(2) ** 2) / abs(ln(P)))
        # n ~= (k * m) * ((ln(2) ** 2) / abs(ln(P)))
        # m ~= n * abs(ln(P)) / (k * (ln(2) ** 2))
        num_slices = int(math.ceil(math.log(1 / error_rate, 2)))
        # the error_rate constraint assumes a fill rate of 1/2
        # so we double the capacity to simplify the API
        bits_per_slice = int(math.ceil(
            (2 * capacity * abs(math.log(error_rate))) /
            (num_slices * (math.log(2) ** 2))))
        self._setup(error_rate, num_slices, bits_per_slice, capacity, 0)
        self.bitarray = bitarray.bitarray(self.num_bits, endian='little')
        self.bitarray.setall(False)

    def _setup(self, error_rate, num_slices, bits_per_slice, capacity, count):
        self.error_rate = error_rate
        self.num_slices = num_slices
        self.bits_per_slice = bits_per_slice
        self.capacity = capacity
        self.num_bits = num_slices * bits_per_slice
        self.count = count
        self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice)

    def __contains__(self, key):
        """Tests a key's membership in this bloom filter.

        >>> b = BloomFilter(capacity=100)
        >>> b.add("hello")
        False
        >>> "hello" in b
        True

        """
        bits_per_slice = self.bits_per_slice
        bitarray = self.bitarray
        if not isinstance(key, list):
            hashes = self.make_hashes(key)
        else:
            hashes = key
        offset = 0
        for k in hashes:
            if not bitarray[offset + k]:
                return False
            offset += bits_per_slice
        return True

    def __len__(self):
        """Return the number of keys stored by this bloom filter."""
        return self.count

    def add(self, key, skip_check=False):
        """Adds a key to this bloom filter. If the key already exists in this
        filter it will return True. Otherwise False.

        >>> b = BloomFilter(capacity=100)
        >>> b.add("hello")
        False
        >>> b.add("hello")
        True

        """
        bitarray = self.bitarray
        bits_per_slice = self.bits_per_slice
        hashes = self.make_hashes(key)
        if not skip_check and hashes in self:
            return True
        if self.count > self.capacity:
            raise IndexError("BloomFilter is at capacity")
        offset = 0
        for k in hashes:
            self.bitarray[offset + k] = True
            offset += bits_per_slice
        self.count += 1
        return False

    def copy(self):
        """Return a copy of this bloom filter."""
        new_filter = BloomFilter(self.capacity, self.error_rate)
        new_filter.bitarray = self.bitarray.copy()
        return new_filter

    def union(self, other):
        """Calculates the union of the two underlying bitarrays and returns
        a new bloom filter object."""
        if self.capacity != other.capacity or self.error_rate != other.error_rate:
            raise ValueError(
                "Unioning filters requires both filters to have the same capacity and error rate")
        new_bloom = self.copy()
        new_bloom.bitarray = new_bloom.bitarray | other.bitarray
        return new_bloom

    def __or__(self, other):
        return self.union(other)

    def intersection(self, other):
        """Calculates the intersection of the two underlying bitarrays and
        returns a new bloom filter object."""
        if self.capacity != other.capacity or self.error_rate != other.error_rate:
            raise ValueError(
                "Intersecting filters requires both filters to have equal capacity and error rate")
        new_bloom = self.copy()
        new_bloom.bitarray = new_bloom.bitarray & other.bitarray
        return new_bloom

    def __and__(self, other):
        return self.intersection(other)

    def tofile(self, f):
        """Write the bloom filter to file object `f'. Underlying bits
        are written as machine values. This is much more space
        efficient than pickling the object."""
        f.write(pack(self.FILE_FMT, self.error_rate, self.num_slices,
                     self.bits_per_slice, self.capacity, self.count))
        self.bitarray.tofile(f)

    @classmethod
    def fromfile(cls, f, n=-1):
        """Read a bloom filter from file-object `f' serialized with
        ``BloomFilter.tofile''. If `n' > 0 read only so many bytes."""
        headerlen = calcsize(cls.FILE_FMT)

        if 0 < n < headerlen:
            raise ValueError('n too small!')

        filter = cls(1)  # Bogus instantiation, we will `_setup'.
        filter._setup(*unpack(cls.FILE_FMT, f.read(headerlen)))
        filter.bitarray = bitarray.bitarray(endian='little')
        if n > 0:
            filter.bitarray.fromfile(f, n - headerlen)
        else:
            filter.bitarray.fromfile(f)
        if filter.num_bits != filter.bitarray.length() and \
                (filter.num_bits + (8 - filter.num_bits % 8)
                 != filter.bitarray.length()):
            raise ValueError('Bit length mismatch!')

        return filter

    def __getstate__(self):
        d = self.__dict__.copy()
        del d['make_hashes']
        return d

    def __setstate__(self, d):
        self.__dict__.update(d)
        self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice)


class ScalableBloomFilter(object):
    SMALL_SET_GROWTH = 2  # slower, but takes up less memory
    LARGE_SET_GROWTH = 4  # faster, but takes up more memory
    FILE_FMT = '<idQd'

    def __init__(self, initial_capacity=100, error_rate=0.001,
                 mode=SMALL_SET_GROWTH):
        """Implements a space-efficient probabilistic data structure that
        grows as more items are added while maintaining a steady false
        positive rate

        initial_capacity
            the initial capacity of the filter
        error_rate
            the error_rate of the filter returning false positives. This
            determines the filter's capacity. Going over capacity greatly
            increases the chance of false positives.
        mode
            can be either ScalableBloomFilter.SMALL_SET_GROWTH or
            ScalableBloomFilter.LARGE_SET_GROWTH. SMALL_SET_GROWTH is slower
            but uses less memory. LARGE_SET_GROWTH is faster but consumes
            memory faster.

        >>> b = ScalableBloomFilter(initial_capacity=512, error_rate=0.001, mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        >>> b.add("test")
        False
        >>> "test" in b
        True
        >>> unicode_string = u'?'
        >>> b.add(unicode_string)
        False
        >>> unicode_string in b
        True

        """
        if not error_rate or error_rate < 0:
            raise ValueError("Error_Rate must be a decimal greater than 0.")
        self._setup(mode, 0.9, initial_capacity, error_rate)
        self.filters = []

    def _setup(self, mode, ratio, initial_capacity, error_rate):
        self.scale = mode
        self.ratio = ratio
        self.initial_capacity = initial_capacity
        self.error_rate = error_rate

    def __contains__(self, key):
        """Tests a key's membership in this bloom filter.

        >>> b = ScalableBloomFilter(initial_capacity=100, error_rate=0.001, mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        >>> b.add("hello")
        False
        >>> "hello" in b
        True

        """
        for f in reversed(self.filters):
            if key in f:
                return True
        return False

    def add(self, key):
        """Adds a key to this bloom filter.
        If the key already exists in this filter it will return True.
        Otherwise False.

        >>> b = ScalableBloomFilter(initial_capacity=100, error_rate=0.001, mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        >>> b.add("hello")
        False
        >>> b.add("hello")
        True

        """
        if key in self:
            return True
        filter = self.filters[-1] if self.filters else None
        if filter is None or filter.count >= filter.capacity:
            num_filters = len(self.filters)
            filter = BloomFilter(
                capacity=self.initial_capacity * (self.scale ** num_filters),
                error_rate=self.error_rate * (self.ratio ** num_filters))
            self.filters.append(filter)
        filter.add(key, skip_check=True)
        return False

    @property
    def capacity(self):
        """Returns the total capacity for all filters in this SBF"""
        return sum([f.capacity for f in self.filters])

    @property
    def count(self):
        return len(self)

    def tofile(self, f):
        """Serialize this ScalableBloomFilter into the file-object `f'."""
        f.write(pack(self.FILE_FMT, self.scale, self.ratio,
                     self.initial_capacity, self.error_rate))

        # Write #-of-filters
        f.write(pack('<l', len(self.filters)))

        if len(self.filters) > 0:
            # Then each filter directly, with a header describing
            # their lengths.
            headerpos = f.tell()
            headerfmt = '<' + 'Q' * (len(self.filters))
            # placeholder bytes, overwritten below with the real sizes
            # (bytes literal so this works on a binary file in Python 3)
            f.write(b'.' * calcsize(headerfmt))
            filter_sizes = []
            for filter in self.filters:
                begin = f.tell()
                filter.tofile(f)
                filter_sizes.append(f.tell() - begin)

            f.seek(headerpos)
            f.write(pack(headerfmt, *filter_sizes))

    @classmethod
    def fromfile(cls, f):
        """Deserialize the ScalableBloomFilter in file object `f'."""
        filter = cls()
        filter._setup(*unpack(cls.FILE_FMT, f.read(calcsize(cls.FILE_FMT))))
        nfilters, = unpack('<l', f.read(calcsize('<l')))
        if nfilters > 0:
            header_fmt = '<' + 'Q' * nfilters
            bytes = f.read(calcsize(header_fmt))
            filter_lengths = unpack(header_fmt, bytes)
            for fl in filter_lengths:
                filter.filters.append(BloomFilter.fromfile(f, fl))
        else:
            filter.filters = []

        return filter

    def __len__(self):
        """Returns the total number of elements stored in this SBF"""
        return sum([f.count for f in self.filters])


if __name__ == "__main__":
    import doctest
    doctest.testmod()
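Applied to URL deduplication, the topic of this post, the classes above can be used as follows. This is a minimal sketch: it assumes the module above has been saved as pybloom.py, and the URLs are made up for illustration. Keep in mind that a Bloom filter can return false positives, so a URL may occasionally be skipped even though it was never crawled, but a URL that was already added will never be reported as new.

# Minimal URL-deduplication sketch built on the ScalableBloomFilter defined above.
# Assumes the module above is saved as pybloom.py; the URLs are illustrative.
from pybloom import ScalableBloomFilter

seen = ScalableBloomFilter(initial_capacity=100000, error_rate=0.0001)

def is_new_url(url):
    """Return True the first time a URL is seen, False for (probable) repeats.

    add() returns True when the key was (probably) already present, so a
    new URL is exactly one where add() returns False.
    """
    return not seen.add(url)

for url in ["http://example.com/a", "http://example.com/b", "http://example.com/a"]:
    if is_new_url(url):
        print("crawl:", url)
    else:
        print("skip duplicate:", url)

For a long-running crawler, the filter can also be persisted between runs with the tofile() and fromfile() methods shown in the module.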
The above is the main content on URL deduplication with a Bloom filter, its principle and Python implementation. If it did not solve your problem, the following articles may help:

Optimizing Scrapy crawler deduplication efficiency: integrating the Bloom Filter algorithm
Hardcore | Redis Bloom Filter: principles and practice