Skip to content

Instantly share code, notes, and snippets.

@develar
Created November 30, 2021 16:53

Revisions

  1. develar created this gist Nov 30, 2021.
    464 changes: 464 additions & 0 deletions BlockFilter.java
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,464 @@
    // Copyright 2000-2021 Jim Apple. Use of this source code is governed by the Apache 2.0 license that can be found in https://github.com/jbapple/libfilter/blob/master/LICENSE.txt.
    package com.intellij.util.lang;

    import java.nio.ByteBuffer;

    /**
    * BlockFilter is a block Bloom filter (from Putze et al.'s "Cache-, Hash- and
    * Space-Efficient Bloom Filters") with some twists:
    * <ol>
    * <li> Each block is a split Bloom filter - see Section 2.1 of Broder and Mitzenmacher's
    * "Network Applications of Bloom Filters: A Survey".
    * <li> The number of bits set during each key addition is constant in order to try and
    * take advantage of SIMD instructions.
    * </ol>
    *
    * https://github.com/jbapple/libfilter
    */
    @SuppressWarnings("CommentedOutCode")
    public final class BlockFilter {
    private final int[] payload;
    private final int bucketCount;

    public BlockFilter(int[] payload) {
    this.payload = payload;
    this.bucketCount = payload.length / 8;
    }

    public BlockFilter(int numberOfDistinctValues, double falsePositiveProbability) {
    payload = new int[Math.max(8, (getBytesNeeded(numberOfDistinctValues, falsePositiveProbability) / 4) / 8 * 8)];
    bucketCount = payload.length / 8;
    }

    public BlockFilter(ByteBuffer buffer) {
    int size = buffer.getInt();
    payload = new int[size];
    buffer.asIntBuffer().get(payload);
    buffer.position(buffer.position() + (payload.length * Integer.BYTES));

    bucketCount = payload.length / 8;
    }

    public int sizeInBytes() {
    return Integer.BYTES + (payload.length * Integer.BYTES);
    }

    public void write(ByteBuffer buffer) {
    buffer.putInt(payload.length);
    buffer.asIntBuffer().put(payload);
    buffer.position(buffer.position() + (payload.length * Integer.BYTES));
    }

    /**
    * Calculates the expected false positive probability from the size (in bytes) and fill
    * factor (in number of distinct values, <code>ndv</code>).
    *
    * @param ndv the number of distinct values
    * @param bytes the size of the filter
    * @return the false positive probability
    */
    private static double computeFalsePositiveProbability(double ndv, double bytes) {
    double word_bits = 32;
    double bucket_words = 8;
    double hash_bits = 32;
    // From equation 3 of Putze et al.
    //
    // m = bytes * CHAR_BIT
    // n = ndv
    // c = m / n
    // b = m / (bucket_words * word_bits)
    // B = n * c / b = n * (m / n) / (m / (bucket_words * word_bits)) = m / (m /
    // (bucket_words * word_bits)) = bucket_words * word_bits k = number of hashes, in our
    // case: bucket_words
    if (ndv == 0) return 0.0;
    if (bytes <= 0) return 1.0;
    //noinspection PointlessArithmeticExpression
    if (1.0 * ndv / (1.0 * bytes * 8) > 3) {
    return 1.0;
    }

    double result = 0;
    double lam = bucket_words * word_bits / ((bytes * 8) / ndv);
    double logLam = Math.log(lam);
    double log1collide = -hash_bits * Math.log(2.0);
    int MAX_J = 10000;
    for (int j = 0; j < MAX_J; ++j) {
    int i = MAX_J - 1 - j;
    double logP = i * logLam - lam - logGamma(i + 1);
    double logFinner = bucket_words * Math.log(1.0 - Math.pow(1.0 - 1.0 / word_bits, i));
    double logCollide = Math.log(i) + log1collide;
    result += Math.exp(logP + logFinner) + Math.exp(logP + logCollide);
    }
    //noinspection ManualMinMaxCalculation
    return (result > 1.0) ? 1.0 : result;
    }

    /**
    * Calculates the number of bytes needed to create a filter with the given number of
    * distinct values and false positive probability.
    *
    * @param ndv the number of distinct values
    * @param fpp the desired false positive probability
    * @return the number of bytes needed
    */
    public static int getBytesNeeded(int ndv, double fpp) {
    double word_bits = 32;
    double bucket_words = 8;
    int bucket_bytes = (int)((word_bits * bucket_words) / 8);
    double result = 1;
    while (computeFalsePositiveProbability(ndv, result) > fpp) {
    result *= 2;
    }
    if (result <= bucket_bytes) return bucket_bytes;
    double lo = 0;
    while (lo + 1 < result) {
    double mid = lo + (result - lo) / 2;
    double test = computeFalsePositiveProbability(ndv, mid);
    if (test < fpp) {
    result = mid;
    }
    else if (test == fpp) {
    result = ((mid + bucket_bytes - 1) / bucket_bytes) * bucket_bytes;
    if (result > Integer.MAX_VALUE) return Integer.MAX_VALUE;
    return (int)result;
    }
    else {
    lo = mid;
    }
    }
    result = ((result + bucket_bytes - 1) / bucket_bytes) * bucket_bytes;
    if (result > Integer.MAX_VALUE) return Integer.MAX_VALUE;
    return (int)result;
    }

    /**
    * Calculates the number of distinct elements that can be stored in a filter of size
    * <code>bytes</code> without exceeding the given false positive probability.
    *
    * @param bytes the size of the filter
    * @param fpp the desired false positive probability
    * @return the maximum number of distinct values that can be added
    */
    public static double getTotalHashCapacity(double bytes, double fpp) {
    double result = 1;
    while (computeFalsePositiveProbability(result, bytes) < fpp) {
    result *= 2;
    }
    if (result == 1) return 0;
    double lo = 0;
    while (lo + 1 < result) {
    double mid = lo + (result - lo) / 2;
    double test = computeFalsePositiveProbability(mid, bytes);
    if (test < fpp) {
    lo = mid;
    }
    else if (test == fpp) {
    return mid;
    }
    else {
    result = mid;
    }
    }
    return lo;
    }

    private static final int[] INTERNAL_HASH_SEEDS = {0x47b6137b, 0x44974d91, 0x8824ad5b,
    0xa2b7289d, 0x705495c7, 0x2df1424b, 0x9efc4947, 0x5c6bfb31};

    private static int index(long hash, int num_buckets) {
    return (int)(((hash >>> 32) * ((long)num_buckets)) >>> 32);
    }

    private static void makeMask(long hash, int[] result) {
    // the loops are done individually for easier auto-vectorization by the JIT
    for (int i = 0; i < 8; ++i) {
    result[0] = (int)hash * INTERNAL_HASH_SEEDS[i];
    }
    for (int i = 0; i < 8; ++i) {
    result[i] >>>= (32 - 5);
    }
    for (int i = 0; i < 8; ++i) {
    result[i] = 1 << result[i];
    }
    }

    /**
    * Add a hash value to the filter.
    * <p>
    * Do not mix with <code>FindHash32</code> - a hash value that is present with
    * <code>FindHash64(x)</code> will not be present when calling
    * <code>FindHash32((int)x)</code>.
    * <p>
    * Do not pass values to this function, only their hashes.
    * <p>
    * <em>Hashes must be 64-bits</em>. 32-bit hashes will result in a useless filter where
    * <code>FindHash64</code> always returns <code>true</code> for any input.
    *
    * @param hash the 64-bit hash value of the element you wish to insert
    */
    public void addHash64(long hash) {
    int bucketIndex = index(hash, bucketCount);
    int[] mask = {0xca11, 8, 6, 7, 5, 3, 0, 9};
    makeMask(hash, mask);
    for (int i = 0; i < 8; ++i) {
    payload[bucketIndex * 8 + i] = mask[i] | payload[bucketIndex * 8 + i];
    }
    }

    /**
    * Find a hash value in the filter.
    * <p>
    * Do not mix with <code>AddHash32</code> - a hash value that is added with
    * <code>AddHash32</code> will not be present when calling <code>FindHash64</code>.
    * <p>
    * Do not pass values to this function, only their hashes.
    * <p>
    * <em>Hashes must be 64-bits</em>. 32-bit hashes will return incorrect results.
    *
    * @param hash the 64-bit hash value of the element you are checking the presence of
    */
    public boolean mightContain(long hash) {
    int bucketIndex = index(hash, bucketCount);
    int[] mask = {0xca11, 8, 6, 7, 5, 3, 0, 9};
    makeMask(hash, mask);
    for (int i = 0; i < 8; ++i) {
    if (0 == (payload[bucketIndex * 8 + i] & mask[i])) {
    return false;
    }
    }
    return true;
    }

    //private static final long REHASH_32 = (((long)0xd1012a3a) << 32) | (long)0x7a1f4a8a;

    ///**
    // * Add a hash value to the filter.
    // * <p>
    // * Do not mix with <code>FindHash64</code> - a hash value that is present with
    // * <code>FindHash64(x)</code> will not be present when calling
    // * <code>FindHash32((int)x)</code>.
    // *
    // * @param hash the 32-bit hash value of the element you wish to insert
    // */
    //public boolean AddHash32(int hash) {
    // long hash64 = (((REHASH_32 * (long)hash) >>> 32) << 32) | hash;
    // int bucket_idx = index(hash64, payload.length / 8);
    // int[] mask = {0xca11, 8, 6, 7, 5, 3, 0, 9};
    // makeMask(hash, mask);
    // for (int i = 0; i < 8; ++i) {
    // payload[bucket_idx * 8 + i] = mask[i] | payload[bucket_idx * 8 + i];
    // }
    // return true;
    //}

    ///**
    // * Find a hash value in the filter.
    // * <p>
    // * Do not mix with <code>AddHash64</code> - a hash value that is added with
    // * <code>AddHash64</code> will not be present when calling <code>FindHash32</code>.
    // *
    // * @param hash the 32-bit hash value of the element you are checking the presence of
    // */
    //public boolean FindHash32(int hash) {
    // long hash64 = (((REHASH_32 * (long)hash) >>> 32) << 32) | hash;
    // int bucket_idx = index(hash64, payload.length / 8);
    // int[] mask = {0xca11, 8, 6, 7, 5, 3, 0, 9};
    // makeMask(hash, mask);
    // for (int i = 0; i < 8; ++i) {
    // if (0 == (payload[bucket_idx * 8 + i] & mask[i])) {
    // return false;
    // }
    // }
    // return true;
    //}

    private static double logGamma(double x) {
    double ret;

    if (Double.isNaN(x) || (x <= 0.0)) {
    ret = Double.NaN;
    }
    else if (x < 0.5) {
    return logGamma1p(x) - Math.log(x);
    }
    else if (x <= 2.5) {
    return logGamma1p((x - 0.5) - 0.5);
    }
    else if (x <= 8.0) {
    final int n = (int)Math.floor(x - 1.5);
    double prod = 1.0;
    for (int i = 1; i <= n; i++) {
    prod *= x - i;
    }
    return logGamma1p(x - (n + 1)) + Math.log(prod);
    }
    else {
    double sum = lanczos(x);
    double tmp = x + LANCZOS_G + .5;
    ret = ((x + .5) * Math.log(tmp)) - tmp +
    HALF_LOG_2_PI + Math.log(sum / x);
    }

    return ret;
    }

    @SuppressWarnings("DuplicatedCode")
    private static double invGamma1pm1(final double x) {
    if (x < -0.5) {
    throw new RuntimeException("NumberIsTooSmallException");
    }
    if (x > 1.5) {
    throw new RuntimeException("NumberIsTooLargeException");
    }

    final double ret;
    final double t = x <= 0.5 ? x : (x - 0.5) - 0.5;
    if (t < 0.0) {
    final double a = INV_GAMMA1P_M1_A0 + t * INV_GAMMA1P_M1_A1;
    double b = INV_GAMMA1P_M1_B8;
    b = INV_GAMMA1P_M1_B7 + t * b;
    b = INV_GAMMA1P_M1_B6 + t * b;
    b = INV_GAMMA1P_M1_B5 + t * b;
    b = INV_GAMMA1P_M1_B4 + t * b;
    b = INV_GAMMA1P_M1_B3 + t * b;
    b = INV_GAMMA1P_M1_B2 + t * b;
    b = INV_GAMMA1P_M1_B1 + t * b;
    b = 1.0 + t * b;

    double c = INV_GAMMA1P_M1_C13 + t * (a / b);
    c = INV_GAMMA1P_M1_C12 + t * c;
    c = INV_GAMMA1P_M1_C11 + t * c;
    c = INV_GAMMA1P_M1_C10 + t * c;
    c = INV_GAMMA1P_M1_C9 + t * c;
    c = INV_GAMMA1P_M1_C8 + t * c;
    c = INV_GAMMA1P_M1_C7 + t * c;
    c = INV_GAMMA1P_M1_C6 + t * c;
    c = INV_GAMMA1P_M1_C5 + t * c;
    c = INV_GAMMA1P_M1_C4 + t * c;
    c = INV_GAMMA1P_M1_C3 + t * c;
    c = INV_GAMMA1P_M1_C2 + t * c;
    c = INV_GAMMA1P_M1_C1 + t * c;
    c = INV_GAMMA1P_M1_C + t * c;
    if (x > 0.5) {
    ret = t * c / x;
    }
    else {
    ret = x * ((c + 0.5) + 0.5);
    }
    }
    else {
    double p = INV_GAMMA1P_M1_P6;
    p = INV_GAMMA1P_M1_P5 + t * p;
    p = INV_GAMMA1P_M1_P4 + t * p;
    p = INV_GAMMA1P_M1_P3 + t * p;
    p = INV_GAMMA1P_M1_P2 + t * p;
    p = INV_GAMMA1P_M1_P1 + t * p;
    p = INV_GAMMA1P_M1_P0 + t * p;

    double q = INV_GAMMA1P_M1_Q4;
    q = INV_GAMMA1P_M1_Q3 + t * q;
    q = INV_GAMMA1P_M1_Q2 + t * q;
    q = INV_GAMMA1P_M1_Q1 + t * q;
    q = 1.0 + t * q;

    double c = INV_GAMMA1P_M1_C13 + (p / q) * t;
    c = INV_GAMMA1P_M1_C12 + t * c;
    c = INV_GAMMA1P_M1_C11 + t * c;
    c = INV_GAMMA1P_M1_C10 + t * c;
    c = INV_GAMMA1P_M1_C9 + t * c;
    c = INV_GAMMA1P_M1_C8 + t * c;
    c = INV_GAMMA1P_M1_C7 + t * c;
    c = INV_GAMMA1P_M1_C6 + t * c;
    c = INV_GAMMA1P_M1_C5 + t * c;
    c = INV_GAMMA1P_M1_C4 + t * c;
    c = INV_GAMMA1P_M1_C3 + t * c;
    c = INV_GAMMA1P_M1_C2 + t * c;
    c = INV_GAMMA1P_M1_C1 + t * c;
    c = INV_GAMMA1P_M1_C0 + t * c;

    if (x > 0.5) {
    ret = (t / x) * ((c - 0.5) - 0.5);
    }
    else {
    ret = x * c;
    }
    }

    return ret;
    }

    private static double lanczos(final double x) {
    double sum = 0.0;
    for (int i = LANCZOS.length - 1; i > 0; --i) {
    sum += (LANCZOS[i] / (x + i));
    }
    return sum + LANCZOS[0];
    }

    private static double logGamma1p(final double x) {
    if (x < -0.5) {
    throw new RuntimeException("NumberIsTooSmallException");
    }
    if (x > 1.5) {
    throw new RuntimeException("NumberIsTooLargeException");
    }
    return -Math.log1p(invGamma1pm1(x));
    }

    private static final double HALF_LOG_2_PI = 0.5 * Math.log(2.0 * Math.PI);
    private static final double LANCZOS_G = 607.0 / 128.0;
    private static final double[] LANCZOS = {
    0.99999999999999709182,
    57.156235665862923517,
    -59.597960355475491248,
    14.136097974741747174,
    -0.49191381609762019978,
    .33994649984811888699e-4,
    .46523628927048575665e-4,
    -.98374475304879564677e-4,
    .15808870322491248884e-3,
    -.21026444172410488319e-3,
    .21743961811521264320e-3,
    -.16431810653676389022e-3,
    .84418223983852743293e-4,
    -.26190838401581408670e-4,
    .36899182659531622704e-5,
    };

    private static final double INV_GAMMA1P_M1_A0 = .611609510448141581788E-08;
    private static final double INV_GAMMA1P_M1_A1 = .624730830116465516210E-08;
    private static final double INV_GAMMA1P_M1_B1 = .203610414066806987300E+00;
    private static final double INV_GAMMA1P_M1_B2 = .266205348428949217746E-01;
    private static final double INV_GAMMA1P_M1_B3 = .493944979382446875238E-03;
    private static final double INV_GAMMA1P_M1_B4 = -.851419432440314906588E-05;
    private static final double INV_GAMMA1P_M1_B5 = -.643045481779353022248E-05;
    private static final double INV_GAMMA1P_M1_B6 = .992641840672773722196E-06;
    private static final double INV_GAMMA1P_M1_B7 = -.607761895722825260739E-07;
    private static final double INV_GAMMA1P_M1_B8 = .195755836614639731882E-09;
    private static final double INV_GAMMA1P_M1_P0 = .6116095104481415817861E-08;
    private static final double INV_GAMMA1P_M1_P1 = .6871674113067198736152E-08;
    private static final double INV_GAMMA1P_M1_P2 = .6820161668496170657918E-09;
    private static final double INV_GAMMA1P_M1_P3 = .4686843322948848031080E-10;
    private static final double INV_GAMMA1P_M1_P4 = .1572833027710446286995E-11;
    private static final double INV_GAMMA1P_M1_P5 = -.1249441572276366213222E-12;
    private static final double INV_GAMMA1P_M1_P6 = .4343529937408594255178E-14;
    private static final double INV_GAMMA1P_M1_Q1 = .3056961078365221025009E+00;
    private static final double INV_GAMMA1P_M1_Q2 = .5464213086042296536016E-01;
    private static final double INV_GAMMA1P_M1_Q3 = .4956830093825887312020E-02;
    private static final double INV_GAMMA1P_M1_Q4 = .2692369466186361192876E-03;
    private static final double INV_GAMMA1P_M1_C = -.422784335098467139393487909917598E+00;
    private static final double INV_GAMMA1P_M1_C0 = .577215664901532860606512090082402E+00;
    private static final double INV_GAMMA1P_M1_C1 = -.655878071520253881077019515145390E+00;
    private static final double INV_GAMMA1P_M1_C2 = -.420026350340952355290039348754298E-01;
    private static final double INV_GAMMA1P_M1_C3 = .166538611382291489501700795102105E+00;
    private static final double INV_GAMMA1P_M1_C4 = -.421977345555443367482083012891874E-01;
    private static final double INV_GAMMA1P_M1_C5 = -.962197152787697356211492167234820E-02;
    private static final double INV_GAMMA1P_M1_C6 = .721894324666309954239501034044657E-02;
    private static final double INV_GAMMA1P_M1_C7 = -.116516759185906511211397108401839E-02;
    private static final double INV_GAMMA1P_M1_C8 = -.215241674114950972815729963053648E-03;
    private static final double INV_GAMMA1P_M1_C9 = .128050282388116186153198626328164E-03;
    private static final double INV_GAMMA1P_M1_C10 = -.201348547807882386556893914210218E-04;
    private static final double INV_GAMMA1P_M1_C11 = -.125049348214267065734535947383309E-05;
    private static final double INV_GAMMA1P_M1_C12 = .113302723198169588237412962033074E-05;
    private static final double INV_GAMMA1P_M1_C13 = -.205633841697760710345015413002057E-06;
    }