Mercurial > mozreview > gecko / file revision / mfbt/BloomFilter.h@98e2b4ee9775a2a2b5cc1bb8a390bfe901814f38

summary |
shortlog |
changelog |
pushlog |
graph |
tags |
bookmarks |
branches |
files |
changeset |
file |
latest |
revisions |
annotate |
diff |
comparison |
raw |
help

mfbt/BloomFilter.h

author | Sylvestre Ledru <sledru@mozilla.com> |

Mon, 06 Nov 2017 13:22:06 +0100 | |

changeset 694109 | 98e2b4ee9775a2a2b5cc1bb8a390bfe901814f38 |

parent 189518 | cf068fd95d3cef2e75205ae37c937bfaee01506f |

permissions | -rw-r--r-- |

reformat all
MozReview-Commit-ID: ApY4gI1IbKr

/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ /* vim: set ts=8 sts=2 et sw=2 tw=80: */ /* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ /* * A counting Bloom filter implementation. This allows consumers to * do fast probabilistic "is item X in set Y?" testing which will * never answer "no" when the correct answer is "yes" (but might * incorrectly answer "yes" when the correct answer is "no"). */ #ifndef mozilla_BloomFilter_h #define mozilla_BloomFilter_h #include "mozilla/Assertions.h" #include "mozilla/Likely.h" #include <stdint.h> #include <string.h> namespace mozilla { /* * This class implements a counting Bloom filter as described at * <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with * 8-bit counters. This allows quick probabilistic answers to the * question "is object X in set Y?" where the contents of Y might not * be time-invariant. The probabilistic nature of the test means that * sometimes the answer will be "yes" when it should be "no". If the * answer is "no", then X is guaranteed not to be in Y. * * The filter is parametrized on KeySize, which is the size of the key * generated by each of hash functions used by the filter, in bits, * and the type of object T being added and removed. T must implement * a |uint32_t hash() const| method which returns a uint32_t hash key * that will be used to generate the two separate hash functions for * the Bloom filter. This hash key MUST be well-distributed for good * results! KeySize is not allowed to be larger than 16. * * The filter uses exactly 2**KeySize bytes of memory. From now on we * will refer to the memory used by the filter as M. * * The expected rate of incorrect "yes" answers depends on M and on * the number N of objects in set Y. As long as N is small compared * to M, the rate of such answers is expected to be approximately * 4*(N/M)**2 for this filter. In practice, if Y has a few hundred * elements then using a KeySize of 12 gives a reasonably low * incorrect answer rate. A KeySize of 12 has the additional benefit * of using exactly one page for the filter in typical hardware * configurations. */ template <unsigned KeySize, class T> class BloomFilter { /* * A counting Bloom filter with 8-bit counters. For now we assume * that having two hash functions is enough, but we may revisit that * decision later. * * The filter uses an array with 2**KeySize entries. * * Assuming a well-distributed hash function, a Bloom filter with * array size M containing N elements and * using k hash function has expected false positive rate exactly * * $ (1 - (1 - 1/M)^{kN})^k $ * * because each array slot has a * * $ (1 - 1/M)^{kN} $ * * chance of being 0, and the expected false positive rate is the * probability that all of the k hash functions will hit a nonzero * slot. * * For reasonable assumptions (M large, kN large, which should both * hold if we're worried about false positives) about M and kN this * becomes approximately * * $$ (1 - \exp(-kN/M))^k $$ * * For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$, * or in other words * * $$ N/M = -0.5 * \ln(1 - \sqrt(r)) $$ * * where r is the false positive rate. This can be used to compute * the desired KeySize for a given load N and false positive rate r. * * If N/M is assumed small, then the false positive rate can * further be approximated as 4*N^2/M^2. So increasing KeySize by * 1, which doubles M, reduces the false positive rate by about a * factor of 4, and a false positive rate of 1% corresponds to * about M/N == 20. * * What this means in practice is that for a few hundred keys using a * KeySize of 12 gives false positive rates on the order of 0.25-4%. * * Similarly, using a KeySize of 10 would lead to a 4% false * positive rate for N == 100 and to quite bad false positive * rates for larger N. */ public: BloomFilter() { static_assert(KeySize <= kKeyShift, "KeySize too big"); // Should we have a custom operator new using calloc instead and // require that we're allocated via the operator? clear(); } /* * Clear the filter. This should be done before reusing it, because * just removing all items doesn't clear counters that hit the upper * bound. */ void clear(); /* * Add an item to the filter. */ void add(const T* aValue); /* * Remove an item from the filter. */ void remove(const T* aValue); /* * Check whether the filter might contain an item. This can * sometimes return true even if the item is not in the filter, * but will never return false for items that are actually in the * filter. */ bool mightContain(const T* aValue) const; /* * Methods for add/remove/contain when we already have a hash computed */ void add(uint32_t aHash); void remove(uint32_t aHash); bool mightContain(uint32_t aHash) const; private: static const size_t kArraySize = (1 << KeySize); static const uint32_t kKeyMask = (1 << KeySize) - 1; static const uint32_t kKeyShift = 16; static uint32_t hash1(uint32_t aHash) { return aHash & kKeyMask; } static uint32_t hash2(uint32_t aHash) { return (aHash >> kKeyShift) & kKeyMask; } uint8_t& firstSlot(uint32_t aHash) { return mCounters[hash1(aHash)]; } uint8_t& secondSlot(uint32_t aHash) { return mCounters[hash2(aHash)]; } const uint8_t& firstSlot(uint32_t aHash) const { return mCounters[hash1(aHash)]; } const uint8_t& secondSlot(uint32_t aHash) const { return mCounters[hash2(aHash)]; } static bool full(const uint8_t& aSlot) { return aSlot == UINT8_MAX; } uint8_t mCounters[kArraySize]; }; template <unsigned KeySize, class T> inline void BloomFilter<KeySize, T>::clear() { memset(mCounters, 0, kArraySize); } template <unsigned KeySize, class T> inline void BloomFilter<KeySize, T>::add(uint32_t aHash) { uint8_t& slot1 = firstSlot(aHash); if (MOZ_LIKELY(!full(slot1))) { ++slot1; } uint8_t& slot2 = secondSlot(aHash); if (MOZ_LIKELY(!full(slot2))) { ++slot2; } } template <unsigned KeySize, class T> MOZ_ALWAYS_INLINE void BloomFilter<KeySize, T>::add(const T* aValue) { uint32_t hash = aValue->hash(); return add(hash); } template <unsigned KeySize, class T> inline void BloomFilter<KeySize, T>::remove(uint32_t aHash) { // If the slots are full, we don't know whether we bumped them to be // there when we added or not, so just leave them full. uint8_t& slot1 = firstSlot(aHash); if (MOZ_LIKELY(!full(slot1))) { --slot1; } uint8_t& slot2 = secondSlot(aHash); if (MOZ_LIKELY(!full(slot2))) { --slot2; } } template <unsigned KeySize, class T> MOZ_ALWAYS_INLINE void BloomFilter<KeySize, T>::remove(const T* aValue) { uint32_t hash = aValue->hash(); remove(hash); } template <unsigned KeySize, class T> MOZ_ALWAYS_INLINE bool BloomFilter<KeySize, T>::mightContain( uint32_t aHash) const { // Check that all the slots for this hash contain something return firstSlot(aHash) && secondSlot(aHash); } template <unsigned KeySize, class T> MOZ_ALWAYS_INLINE bool BloomFilter<KeySize, T>::mightContain( const T* aValue) const { uint32_t hash = aValue->hash(); return mightContain(hash); } } // namespace mozilla #endif /* mozilla_BloomFilter_h */