File: Shared\Utilities\BloomFilter.cs
Web Access
Project: ..\..\..\src\Workspaces\Core\Portable\Microsoft.CodeAnalysis.Workspaces.csproj (Microsoft.CodeAnalysis.Workspaces)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
#nullable disable
 
using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
 
namespace Microsoft.CodeAnalysis.Shared.Utilities
{
    internal partial class BloomFilter
    {
        // From MurmurHash:
        // 'm' and 'r' are mixing constants generated off-line.
        // The values for m and r are chosen through experimentation and 
        // supported by evidence that they work well.
        private const uint Compute_Hash_m = 0x5bd1e995;
        private const int Compute_Hash_r = 24;
 
        private readonly BitArray _bitArray;
        private readonly int _hashFunctionCount;
        private readonly bool _isCaseSensitive;
 
        /// <summary><![CDATA[
        /// 1) n  = Number of items in the filter
        /// 
        /// 2) p = Probability of false positives, (a double between 0 and 1).
        /// 
        /// 3) m = Number of bits in the filter
        /// 
        /// 4) k = Number of hash functions
        /// 
        /// m = ceil((n * log(p)) / log(1.0 / (pow(2.0, log(2.0)))))
        /// 
        /// k = round(log(2.0) * m / n)
        /// ]]></summary>
        public BloomFilter(int expectedCount, double falsePositiveProbability, bool isCaseSensitive)
        {
            var m = Math.Max(1, ComputeM(expectedCount, falsePositiveProbability));
            var k = Math.Max(1, ComputeK(expectedCount, falsePositiveProbability));
 
            // We must have size in even bytes, so that when we deserialize from bytes we get a bit array with the same count.
            // The count is used by the hash functions.
            var sizeInEvenBytes = (m + 7) & ~7;
 
            _bitArray = new BitArray(length: sizeInEvenBytes);
            _hashFunctionCount = k;
            _isCaseSensitive = isCaseSensitive;
        }
 
        public BloomFilter(double falsePositiveProbability, bool isCaseSensitive, ICollection<string> values)
            : this(values.Count, falsePositiveProbability, isCaseSensitive)
        {
            AddRange(values);
        }
 
        public BloomFilter(
            double falsePositiveProbability,
            ICollection<string> stringValues,
            ICollection<long> longValues)
            : this(stringValues.Count + longValues.Count, falsePositiveProbability, isCaseSensitive: false)
        {
            AddRange(stringValues);
            AddRange(longValues);
        }
 
        private BloomFilter(BitArray bitArray, int hashFunctionCount, bool isCaseSensitive)
        {
            _bitArray = bitArray ?? throw new ArgumentNullException(nameof(bitArray));
            _hashFunctionCount = hashFunctionCount;
            _isCaseSensitive = isCaseSensitive;
        }
 
        // m = ceil((n * log(p)) / log(1.0 / (pow(2.0, log(2.0)))))
        private static int ComputeM(int expectedCount, double falsePositiveProbability)
        {
            var p = falsePositiveProbability;
            double n = expectedCount;
 
            var numerator = n * Math.Log(p);
            var denominator = Math.Log(1.0 / Math.Pow(2.0, Math.Log(2.0)));
            return unchecked((int)Math.Ceiling(numerator / denominator));
        }
 
        // k = round(log(2.0) * m / n)
        private static int ComputeK(int expectedCount, double falsePositiveProbability)
        {
            double n = expectedCount;
            double m = ComputeM(expectedCount, falsePositiveProbability);
 
            var temp = Math.Log(2.0) * m / n;
            return unchecked((int)Math.Round(temp));
        }
 
        /// <summary>
        /// Modification of the murmurhash2 algorithm.  Code is simpler because it operates over
        /// strings instead of byte arrays.  Because each string character is two bytes, it is known
        /// that the input will be an even number of bytes (though not necessarily a multiple of 4).
        /// 
        /// This is needed over the normal 'string.GetHashCode()' because we need to be able to generate
        /// 'k' different well distributed hashes for any given string s.  Also, we want to be able to
        /// generate these hashes without allocating any memory.  My ideal solution would be to use an
        /// MD5 hash.  However, there appears to be no way to do MD5 in .NET where you can:
        /// 
        /// a) feed it individual values instead of a byte[]
        /// 
        /// b) have the hash computed into a byte[] you provide instead of a newly allocated one
        /// 
        /// Generating 'k' pieces of garbage on each insert and lookup seems very wasteful.  So,
        /// instead, we use murmur hash since it provides well distributed values, allows for a
        /// seed, and allocates no memory.
        /// 
        /// Murmur hash is public domain.  Actual code is included below as reference.
        /// </summary>
        private int ComputeHash(string key, int seed)
        {
            unchecked
            {
                // Initialize the hash to a 'random' value
 
                var numberOfCharsLeft = key.Length;
                var h = (uint)(seed ^ numberOfCharsLeft);
 
                // Mix 4 bytes at a time into the hash.  NOTE: 4 bytes is two chars, so we iterate
                // through the string two chars at a time.
                var index = 0;
                while (numberOfCharsLeft >= 2)
                {
                    var c1 = GetCharacter(key, index);
                    var c2 = GetCharacter(key, index + 1);
 
                    h = CombineTwoCharacters(h, c1, c2);
 
                    index += 2;
                    numberOfCharsLeft -= 2;
                }
 
                // Handle the last char (or 2 bytes) if they exist.  This happens if the original string had
                // odd length.
                if (numberOfCharsLeft == 1)
                {
                    var c = GetCharacter(key, index);
                    h = CombineLastCharacter(h, c);
                }
 
                // Do a few final mixes of the hash to ensure the last few bytes are well-incorporated.
 
                h = FinalMix(h);
 
                return (int)h;
            }
        }
 
        private static int ComputeHash(long key, int seed)
        {
            // This is a duplicate of ComputeHash(string key, int seed).  However, because
            // we only have 64bits to encode we just unroll that function here.  See
            // Other function for documentation on what's going on here.
            unchecked
            {
                // Initialize the hash to a 'random' value
 
                var numberOfCharsLeft = 4;
                var h = (uint)(seed ^ numberOfCharsLeft);
 
                // Mix 4 bytes at a time into the hash.  NOTE: 4 bytes is two chars, so we iterate
                // through the long two chars at a time.
                var index = 0;
                while (numberOfCharsLeft >= 2)
                {
                    var c1 = GetCharacter(key, index);
                    var c2 = GetCharacter(key, index + 1);
 
                    h = CombineTwoCharacters(h, c1, c2);
 
                    index += 2;
                    numberOfCharsLeft -= 2;
                }
 
                Debug.Assert(numberOfCharsLeft == 0);
 
                // Do a few final mixes of the hash to ensure the last few bytes are well-incorporated.
                h = FinalMix(h);
 
                return (int)h;
            }
        }
 
        private static uint CombineLastCharacter(uint h, uint c)
        {
            unchecked
            {
                h ^= c;
                h *= Compute_Hash_m;
                return h;
            }
        }
 
        private static uint FinalMix(uint h)
        {
            unchecked
            {
                h ^= h >> 13;
                h *= Compute_Hash_m;
                h ^= h >> 15;
                return h;
            }
        }
 
        private static uint CombineTwoCharacters(uint h, uint c1, uint c2)
        {
            unchecked
            {
                var k = c1 | (c2 << 16);
 
                k *= Compute_Hash_m;
                k ^= k >> Compute_Hash_r;
                k *= Compute_Hash_m;
 
                h *= Compute_Hash_m;
                h ^= k;
 
                return h;
            }
        }
 
        private char GetCharacter(string key, int index)
        {
            var c = key[index];
            return _isCaseSensitive ? c : char.ToLowerInvariant(c);
        }
 
        private static char GetCharacter(long key, int index)
        {
            Debug.Assert(index <= 3);
            return (char)(key >> (16 * index));
        }
 
#if false
        //-----------------------------------------------------------------------------
        // MurmurHash2, by Austin Appleby
        //
        // Note - This code makes a few assumptions about how your machine behaves -
        // 1. We can read a 4-byte value from any address without crashing
        // 2. sizeof(int) == 4
        //
        // And it has a few limitations -
        // 1. It will not work incrementally.
        // 2. It will not produce the same results on little-endian and big-endian
        //    machines.
        unsigned int MurmurHash2(const void* key, int len, unsigned int seed)
        {
            // 'm' and 'r' are mixing constants generated offline.
            // The values for m and r are chosen through experimentation and 
            // supported by evidence that they work well.
            
            const unsigned int m = 0x5bd1e995;
            const int r = 24;
 
            // Initialize the hash to a 'random' value
            unsigned int h = seed ^ len;
 
            // Mix 4 bytes at a time into the hash
            const unsigned char* data = (const unsigned char*)key;
 
            while(len >= 4)
            {
                unsigned int k = *(unsigned int*)data;
 
                k *= m; 
                k ^= k >> r; 
                k *= m; 
 
                h *= m; 
                h ^= k;
 
                data += 4;
                len -= 4;
            }
    
            // Handle the last few bytes of the input array
            switch(len)
            {
                case 3: h ^= data[2] << 16;
                case 2: h ^= data[1] << 8;
                case 1: h ^= data[0];
                        h *= m;
            };
 
            // Do a few final mixes of the hash to ensure the last few
            // bytes are well-incorporated.
 
            h ^= h >> 13;
            h *= m;
            h ^= h >> 15;
 
            return h;
        } 
#endif
 
        public void AddRange(IEnumerable<string> values)
        {
            foreach (var v in values)
            {
                Add(v);
            }
        }
 
        public void AddRange(IEnumerable<long> values)
        {
            foreach (var v in values)
            {
                Add(v);
            }
        }
 
        public void Add(string value)
        {
            for (var i = 0; i < _hashFunctionCount; i++)
            {
                _bitArray[GetBitArrayIndex(value, i)] = true;
            }
        }
 
        private int GetBitArrayIndex(string value, int i)
        {
            var hash = ComputeHash(value, i);
            hash %= _bitArray.Length;
            return Math.Abs(hash);
        }
 
        public void Add(long value)
        {
            for (var i = 0; i < _hashFunctionCount; i++)
            {
                _bitArray[GetBitArrayIndex(value, i)] = true;
            }
        }
 
        private int GetBitArrayIndex(long value, int i)
        {
            var hash = ComputeHash(value, i);
            hash %= _bitArray.Length;
            return Math.Abs(hash);
        }
 
        public bool ProbablyContains(string value)
        {
            for (var i = 0; i < _hashFunctionCount; i++)
            {
                if (!_bitArray[GetBitArrayIndex(value, i)])
                {
                    return false;
                }
            }
 
            return true;
        }
 
        public bool ProbablyContains(long value)
        {
            for (var i = 0; i < _hashFunctionCount; i++)
            {
                if (!_bitArray[GetBitArrayIndex(value, i)])
                {
                    return false;
                }
            }
 
            return true;
        }
 
        public bool IsEquivalent(BloomFilter filter)
        {
            return IsEquivalent(_bitArray, filter._bitArray)
                && _hashFunctionCount == filter._hashFunctionCount
                && _isCaseSensitive == filter._isCaseSensitive;
        }
 
        private static bool IsEquivalent(BitArray array1, BitArray array2)
        {
            if (array1.Length != array2.Length)
            {
                return false;
            }
 
            for (var i = 0; i < array1.Length; i++)
            {
                if (array1[i] != array2[i])
                {
                    return false;
                }
            }
 
            return true;
        }
    }
}