Skip to content

Instantly share code, notes, and snippets.

@devdazed
Created October 11, 2012 16:14

Revisions

  1. devdazed revised this gist Oct 11, 2012. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion lp_counters.py
    Original file line number Diff line number Diff line change
    @@ -64,4 +64,4 @@ def run_counts(tries, size):
    print " Actual: %d" % tries
    print " Error: %f%%" % abs((1 - (float(count) / float(tries))) * 100)

    run_counts(1000000000, 4096)
    run_counts(1000000000, 16384)
  2. devdazed revised this gist Oct 11, 2012. 1 changed file with 0 additions and 2 deletions.
    2 changes: 0 additions & 2 deletions lp_counters.py
    Original file line number Diff line number Diff line change
    @@ -59,8 +59,6 @@ def run_counts(tries, size):
    break

    lpc.increment(i)

    print " Counting"
    count = lpc.current_count()
    print " Count: %d" % count
    print " Actual: %d" % tries
  3. devdazed revised this gist Oct 11, 2012. 1 changed file with 4 additions and 4 deletions.
    8 changes: 4 additions & 4 deletions lp_counters.py
    Original file line number Diff line number Diff line change
    @@ -6,10 +6,10 @@
    http://highlyscalable.wordpress.com/2012/05/01/probabilistic-structures-web-analytics-data-mining/
    Installation:
    pip install murmur
    pip install smhasher
    pip install bitarray
    """
    import murmur
    import smhasher
    import math
    import itertools
    from bitarray import bitarray
    @@ -44,7 +44,7 @@ def increment(self, item):
    """
    Counts an item
    """
    mm_hash = murmur.string_hash(str(item))
    mm_hash = smhasher.murmur3_x64_128(str(item))
    offset = mm_hash % self.bit_map.length()
    self.bit_map[offset] = True

    @@ -66,4 +66,4 @@ def run_counts(tries, size):
    print " Actual: %d" % tries
    print " Error: %f%%" % abs((1 - (float(count) / float(tries))) * 100)

    run_counts(1000000000, 32768)
    run_counts(1000000000, 4096)
  4. devdazed revised this gist Oct 11, 2012. 1 changed file with 6 additions and 1 deletion.
    7 changes: 6 additions & 1 deletion lp_counters.py
    Original file line number Diff line number Diff line change
    @@ -2,6 +2,7 @@
    Simple Linear Probabilistic Counters
    Credit for idea goes to:
    http://highscalability.com/blog/2012/4/5/big-data-counting-how-to-count-a-billion-distinct-objects-us.html
    http://highlyscalable.wordpress.com/2012/05/01/probabilistic-structures-web-analytics-data-mining/
    Installation:
    @@ -53,8 +54,12 @@ def run_counts(tries, size):
    lpc = LPCounter(size)
    print "%d Kilobyte Bitmap" % size
    print " Incrementing %d items" % tries
    for i in itertools.count(tries):
    for i in itertools.count(0):
    if i > tries:
    break

    lpc.increment(i)

    print " Counting"
    count = lpc.current_count()
    print " Count: %d" % count
  5. devdazed revised this gist Oct 11, 2012. 1 changed file with 3 additions and 4 deletions.
    7 changes: 3 additions & 4 deletions lp_counters.py
    Original file line number Diff line number Diff line change
    @@ -1,8 +1,7 @@
    """
    Simple Linear Probabalistic Counters
    Simple Linear Probabilistic Counters
    Credit for idea goes to:
    http://highscalability.com/blog/2012/4/5/big-data-counting-how-to-count-a-billion-distinct-objects-us.html
    http://highlyscalable.wordpress.com/2012/05/01/probabilistic-structures-web-analytics-data-mining/
    Installation:
    @@ -17,7 +16,7 @@

    class LPCounter(object):
    """
    Real Time Linear Probablistic Counter of Unique Items
    Real Time Linear Probabilistic Counter of Unique Items
    """

    def __init__(self, max_space):
    @@ -62,4 +61,4 @@ def run_counts(tries, size):
    print " Actual: %d" % tries
    print " Error: %f%%" % abs((1 - (float(count) / float(tries))) * 100)

    run_counts(1000000000, 32768)
    run_counts(1000000000, 32768)
  6. devdazed revised this gist Oct 11, 2012. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions lp_counters.py
    Original file line number Diff line number Diff line change
    @@ -2,6 +2,7 @@
    Simple Linear Probabalistic Counters
    Credit for idea goes to:
    http://highscalability.com/blog/2012/4/5/big-data-counting-how-to-count-a-billion-distinct-objects-us.html
    http://highlyscalable.wordpress.com/2012/05/01/probabilistic-structures-web-analytics-data-mining/
    Installation:
  7. devdazed revised this gist Oct 11, 2012. 1 changed file with 3 additions and 0 deletions.
    3 changes: 3 additions & 0 deletions lp_counters.py
    Original file line number Diff line number Diff line change
    @@ -1,6 +1,9 @@
    """
    Simple Linear Probabalistic Counters
    Credit for idea goes to:
    http://highlyscalable.wordpress.com/2012/05/01/probabilistic-structures-web-analytics-data-mining/
    Installation:
    pip install murmur
    pip install bitarray
  8. devdazed revised this gist Oct 11, 2012. 1 changed file with 3 additions and 2 deletions.
    5 changes: 3 additions & 2 deletions lp_counters.py
    Original file line number Diff line number Diff line change
    @@ -7,6 +7,7 @@
    """
    import murmur
    import math
    import itertools
    from bitarray import bitarray


    @@ -49,12 +50,12 @@ def run_counts(tries, size):
    lpc = LPCounter(size)
    print "%d Kilobyte Bitmap" % size
    print " Incrementing %d items" % tries
    for i in range(1, tries):
    for i in itertools.count(tries):
    lpc.increment(i)
    print " Counting"
    count = lpc.current_count()
    print " Count: %d" % count
    print " Actual: %d" % tries
    print " Error: %f%%" % abs((1 - (float(count) / float(tries))) * 100)

    run_counts(100000, 5)
    run_counts(1000000000, 32768)
  9. devdazed created this gist Oct 11, 2012.
    60 changes: 60 additions & 0 deletions lp_counters.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,60 @@
    """
    Simple Linear Probabalistic Counters
    Installation:
    pip install murmur
    pip install bitarray
    """
    import murmur
    import math
    from bitarray import bitarray


    class LPCounter(object):
    """
    Real Time Linear Probablistic Counter of Unique Items
    """

    def __init__(self, max_space):
    """
    Initializes the counter, max_space is the maximum amount of memory
    (in KB) you want to use to maintain your counter, more memory=more
    accurate
    """
    self.bit_map = bitarray(8 * 1024 * max_space)
    self.bit_map.setall(False)

    def current_count(self):
    """
    Gets the current value of the bitmap, to do that we follow the formula:
    -size * ln(unset_bits/size)
    """
    ratio = float(self.bit_map.count(False)) / float(self.bit_map.length())
    if ratio == 0.0:
    return self.bit_map.length()
    else:
    return -self.bit_map.length() * math.log(ratio)

    def increment(self, item):
    """
    Counts an item
    """
    mm_hash = murmur.string_hash(str(item))
    offset = mm_hash % self.bit_map.length()
    self.bit_map[offset] = True

    if __name__ == '__main__':
    def run_counts(tries, size):
    """Run some counts"""
    lpc = LPCounter(size)
    print "%d Kilobyte Bitmap" % size
    print " Incrementing %d items" % tries
    for i in range(1, tries):
    lpc.increment(i)
    print " Counting"
    count = lpc.current_count()
    print " Count: %d" % count
    print " Actual: %d" % tries
    print " Error: %f%%" % abs((1 - (float(count) / float(tries))) * 100)

    run_counts(100000, 5)