Skip to content

Instantly share code, notes, and snippets.

@joelonsql
Created June 2, 2021 08:55

Revisions

  1. joelonsql created this gist Jun 2, 2021.
    172 changes: 172 additions & 0 deletions hyperloglog-demo.sql
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,172 @@
    --
    -- Here comes some general advise on how to use HyperLogLog
    -- for someone who wants to implement a YouTube-like service
    -- using the awesome https://github.com/citusdata/postgresql-hll PostgreSQL extension
    --

    CREATE TABLE counter (
    id text NOT NULL,
    sketch hll NOT NULL DEFAULT hll_empty(),
    PRIMARY KEY (id)
    );

    -- register new thing to count
    INSERT INTO counter (id) VALUES ('my_cool_cat_video');

    --
    -- get the HLL-sketch datastructure
    -- and also estimate the cardinality (=number of unique views) for it
    --
    -- read-only operation, thus horizontally scalable
    --
    SELECT
    sketch,
    hll_cardinality(sketch)
    FROM counter WHERE id = 'my_cool_cat_video';

    --
    -- each time the video is watched,
    -- the client should mutate the sketch-data structure
    --
    SELECT hll_add([current sketch value], hll_hash_bigint([the userid that watched the video]));
    --
    -- if the new value returned by hll_add() is different than the [current sketch value],
    -- which will happen more and more infrequent over time, thus it scales well,
    -- this will cause a write operation to the database:
    --
    UPDATE counter SET sketch = [new sketch value] WHERE id = 'my_cool_cat_video';
    --
    -- the hll_add(...,hll_hash_bigint()) is read-only can be computed by any PostgreSQL instance,
    -- running anywhere, and doesn't need to run on the same instance that keeps the actual
    -- value to update, thus is scales horizontally.
    --
    --
    -- Example:
    --

    INSERT INTO counter (id) VALUES ('my_cool_cat_video');

    SELECT
    sketch,
    hll_cardinality(sketch)
    FROM counter WHERE id = 'my_cool_cat_video';

    /*
    sketch | hll_cardinality
    ----------+-----------------
    \x118b7f | 0
    (1 row)
    */

    SELECT hll_add('\x118b7f'::hll, hll_hash_bigint(12345));
    /*
    hll_add
    --------------------------
    \x128b7f3cd2dbd3ee832997
    (1 row)
    */

    SELECT hll_cardinality('\x128b7f3cd2dbd3ee832997'::hll);

    /*
    hll_cardinality
    -----------------
    1
    (1 row)
    */

    --
    -- same user watches same video again, value is unchanged:
    --

    SELECT hll_add('\x128b7f3cd2dbd3ee832997'::hll, hll_hash_bigint(12345));
    /*
    hll_add
    --------------------------
    \x128b7f3cd2dbd3ee832997
    (1 row)
    */

    --
    -- some other user watches the same video, value this time changes:
    --
    SELECT hll_add('\x128b7f3cd2dbd3ee832997'::hll, hll_hash_bigint(54321));
    /*
    hll_add
    ------------------------------------------
    \x128b7ffdda8f47ee4c2b8f3cd2dbd3ee832997
    (1 row)
    */

    SELECT hll_cardinality('\x128b7ffdda8f47ee4c2b8f3cd2dbd3ee832997'::hll);
    /*
    hll_cardinality
    -----------------
    2
    (1 row)
    */

    --
    -- after thousands of users have watched the video,
    -- the counter will not be 100% accurate and will just be an estimation,
    -- but a good estimation with predictable error:
    -- "The HyperLogLog algorithm is able to estimate cardinalities of > 109 with a typical accuracy (standard error) of 2%, using 1.5 kB of memory."
    -- https://en.wikipedia.org/wiki/HyperLogLog
    --

    --
    -- Simulate 10000 users watching the video:
    --
    DO $$
    DECLARE
    _sketch hll;
    BEGIN
    _sketch := hll_empty();
    FOR i IN 1..10000 LOOP
    _sketch := hll_add(_sketch, hll_hash_bigint(i));
    END LOOP;
    RAISE NOTICE 'count estimation: %, sketch value: %', hll_cardinality(_sketch), _sketch;
    END
    $$;

    /*
    NOTICE: count estimation: 9923.837380245724, sketch value: \x148b7f10c46114621884540c43210681106210c82220a31884539c433088618c6618c4310462214a2210621048310063288822144228c6409044188c120421310a421022194e320cc3110a2088c520c4118d04208a410c23110c31904420862114661846348822188853944320842188832902210c661144220c2510c8329cc119426184683086420825398e208ce311081008e4110e3094a4188832112721882218221108220c652904321083304a51048230c4910cc2120c339ca20104219421194a4190a329127188a413043198a820c8318c25110a3210431908121042194c2091021244208c861906410c812086318c4638086288e221465388851142219c6320c6200ca42008319ca218866418a20886328445210821904808c641086218c8218d023104318082110631206628c8218c6310465408c72104319c63188881904331045190c46106218cc7304812108420c6338c6510c4220c85094251988339043110883144418c6619467210a218442108642088610822208611188318ca331823194c61906708c82104671886220863095842986410c6311442208a641442208432944311863214c209423190a81148228c6418ca73946420c83304e8594c4210642804338883084854104431064290603884318824388243144320ca320844184422142440c4618c6120c23211053086228443208822104610c8518c84288633186229501210441148228842214631946419ce21248311062214c21246231445284e41988508962110c319c432888418c65204453108518ca210c861182721085114a320c8318c8810864110621106228881214e5304a3388c720c81194a211863191471148419cc419cc1190431188210c4118c62108831842732082090412946211c2220c23388c409028188a329468198e011c81290a21188320ce320c42210620142818469111041906438c83314a2214e211c222844520862490a32088120c4510c82110861906229c821884221902295041944310c83194631882811883110c2188a619483108632104210c8121862288441948529081188631886718886198232886220844188821884510c4321065508a510063294641148318843190831b88800c6308c6328c673886311066210631908521c8329465109011086328c421882228ce220c44206012106310863390c62048538c6311084208632842220c4419063190c1190470946510c4418042110840886730c872186228863518433848218c42290e120c62084e51908430cc5308422084268c45210831108228883208e43104320d0310c42120831186210c84198632a4431082130ca6194c328466218c320c0710c83208a12102220c4418c231946221464110a3194a61046728c83110a519464108633886310c6221cc21886321464204631884520d01194620a0a6304c411047204a22908720c22190e4294a2284c4298c538ca730cc51804119885194601186318c630946420c8239862088222186248c413906608c4810c6511483128620a4c619823214641286441c8428461288a32088131c4518c6628c6138cc30882318c432884628c8328c43208c308cc2390431086319ca10a484190a418c242846410c432108618462188a21944528c671206420866288642990420c4318c43388651844110845108843082628864084630906611043190a4090a30844218c263144418486198a3194a228c4308cc2090c310d84288620a0a21048411464108c4108c308c421146511084108431046511044118412146410824
    */

    --
    -- if we now simulate letting userid 10001 watch the video,
    -- it is likely the value will not change, since it only changes
    -- for some values, which statistically over time makes a good estimation.
    --
    -- let's see if the HLL value is unchanged, when we simulate userid 10001 watching the video:
    --
    SELECT
    '\x148b7f10c46114621884540c43210681106210c82220a31884539c433088618c6618c4310462214a2210621048310063288822144228c6409044188c120421310a421022194e320cc3110a2088c520c4118d04208a410c23110c31904420862114661846348822188853944320842188832902210c661144220c2510c8329cc119426184683086420825398e208ce311081008e4110e3094a4188832112721882218221108220c652904321083304a51048230c4910cc2120c339ca20104219421194a4190a329127188a413043198a820c8318c25110a3210431908121042194c2091021244208c861906410c812086318c4638086288e221465388851142219c6320c6200ca42008319ca218866418a20886328445210821904808c641086218c8218d023104318082110631206628c8218c6310465408c72104319c63188881904331045190c46106218cc7304812108420c6338c6510c4220c85094251988339043110883144418c6619467210a218442108642088610822208611188318ca331823194c61906708c82104671886220863095842986410c6311442208a641442208432944311863214c209423190a81148228c6418ca73946420c83304e8594c4210642804338883084854104431064290603884318824388243144320ca320844184422142440c4618c6120c23211053086228443208822104610c8518c84288633186229501210441148228842214631946419ce21248311062214c21246231445284e41988508962110c319c432888418c65204453108518ca210c861182721085114a320c8318c8810864110621106228881214e5304a3388c720c81194a211863191471148419cc419cc1190431188210c4118c62108831842732082090412946211c2220c23388c409028188a329468198e011c81290a21188320ce320c42210620142818469111041906438c83314a2214e211c222844520862490a32088120c4510c82110861906229c821884221902295041944310c83194631882811883110c2188a619483108632104210c8121862288441948529081188631886718886198232886220844188821884510c4321065508a510063294641148318843190831b88800c6308c6328c673886311066210631908521c8329465109011086328c421882228ce220c44206012106310863390c62048538c6311084208632842220c4419063190c1190470946510c4418042110840886730c872186228863518433848218c42290e120c62084e51908430cc5308422084268c45210831108228883208e43104320d0310c42120831186210c84198632a4431082130ca6194c328466218c320c0710c83208a12102220c4418c231946221464110a3194a61046728c83110a519464108633886310c6221cc21886321464204631884520d01194620a0a6304c411047204a22908720c22190e4294a2284c4298c538ca730cc51804119885194601186318c630946420c8239862088222186248c413906608c4810c6511483128620a4c619823214641286441c8428461288a32088131c4518c6628c6138cc30882318c432884628c8328c43208c308cc2390431086319ca10a484190a418c242846410c432108618462188a21944528c671206420866288642990420c4318c43388651844110845108843082628864084630906611043190a4090a30844218c263144418486198a3194a228c4308cc2090c310d84288620a0a21048411464108c4108c308c421146511084108431046511044118412146410824'::hll
    =
    hll_add
    (
    '\x148b7f10c46114621884540c43210681106210c82220a31884539c433088618c6618c4310462214a2210621048310063288822144228c6409044188c120421310a421022194e320cc3110a2088c520c4118d04208a410c23110c31904420862114661846348822188853944320842188832902210c661144220c2510c8329cc119426184683086420825398e208ce311081008e4110e3094a4188832112721882218221108220c652904321083304a51048230c4910cc2120c339ca20104219421194a4190a329127188a413043198a820c8318c25110a3210431908121042194c2091021244208c861906410c812086318c4638086288e221465388851142219c6320c6200ca42008319ca218866418a20886328445210821904808c641086218c8218d023104318082110631206628c8218c6310465408c72104319c63188881904331045190c46106218cc7304812108420c6338c6510c4220c85094251988339043110883144418c6619467210a218442108642088610822208611188318ca331823194c61906708c82104671886220863095842986410c6311442208a641442208432944311863214c209423190a81148228c6418ca73946420c83304e8594c4210642804338883084854104431064290603884318824388243144320ca320844184422142440c4618c6120c23211053086228443208822104610c8518c84288633186229501210441148228842214631946419ce21248311062214c21246231445284e41988508962110c319c432888418c65204453108518ca210c861182721085114a320c8318c8810864110621106228881214e5304a3388c720c81194a211863191471148419cc419cc1190431188210c4118c62108831842732082090412946211c2220c23388c409028188a329468198e011c81290a21188320ce320c42210620142818469111041906438c83314a2214e211c222844520862490a32088120c4510c82110861906229c821884221902295041944310c83194631882811883110c2188a619483108632104210c8121862288441948529081188631886718886198232886220844188821884510c4321065508a510063294641148318843190831b88800c6308c6328c673886311066210631908521c8329465109011086328c421882228ce220c44206012106310863390c62048538c6311084208632842220c4419063190c1190470946510c4418042110840886730c872186228863518433848218c42290e120c62084e51908430cc5308422084268c45210831108228883208e43104320d0310c42120831186210c84198632a4431082130ca6194c328466218c320c0710c83208a12102220c4418c231946221464110a3194a61046728c83110a519464108633886310c6221cc21886321464204631884520d01194620a0a6304c411047204a22908720c22190e4294a2284c4298c538ca730cc51804119885194601186318c630946420c8239862088222186248c413906608c4810c6511483128620a4c619823214641286441c8428461288a32088131c4518c6628c6138cc30882318c432884628c8328c43208c308cc2390431086319ca10a484190a418c242846410c432108618462188a21944528c671206420866288642990420c4318c43388651844110845108843082628864084630906611043190a4090a30844218c263144418486198a3194a228c4308cc2090c310d84288620a0a21048411464108c4108c308c421146511084108431046511044118412146410824'::hll,
    hll_hash_bigint(10001)
    ) AS test_if_unchanged
    ;
    /*
    test_if_unchanged
    -------------------
    t
    (1 row)
    */

    --
    -- Yes! We were lucky, the hash of 10001 didn't cause any update of the value,
    -- we thus didn't need to update the value in the database, and saved a write-operation this time.
    --

    --
    -- The HyperLogLog extension allows controlling how often such updates occur,
    -- and how good estimations is desired, using parameters to the hll_empty() function,
    -- see the documentation for details. By default, it keeps exact count for small values,
    -- which is probably a good, since it's more notable if there are differences for small values,
    -- since a user might know some friend watched their video, but the counter is not updated.
    -- But for bigger values, it switches over to estimation, which is probably OK since users
    -- won't notice if the value is off by a few values.
    --