Created
October 11, 2012 16:14
Revisions
-
devdazed revised this gist
Oct 11, 2012 . 1 changed file with 1 addition and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -64,4 +64,4 @@ def run_counts(tries, size): print " Actual: %d" % tries print " Error: %f%%" % abs((1 - (float(count) / float(tries))) * 100) run_counts(1000000000, 16384) -
devdazed revised this gist
Oct 11, 2012 . 1 changed file with 0 additions and 2 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -59,8 +59,6 @@ def run_counts(tries, size): break lpc.increment(i) count = lpc.current_count() print " Count: %d" % count print " Actual: %d" % tries -
devdazed revised this gist
Oct 11, 2012 . 1 changed file with 4 additions and 4 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -6,10 +6,10 @@ http://highlyscalable.wordpress.com/2012/05/01/probabilistic-structures-web-analytics-data-mining/ Installation: pip install smhasher pip install bitarray """ import smhasher import math import itertools from bitarray import bitarray @@ -44,7 +44,7 @@ def increment(self, item): """ Counts an item """ mm_hash = smhasher.murmur3_x64_128(str(item)) offset = mm_hash % self.bit_map.length() self.bit_map[offset] = True @@ -66,4 +66,4 @@ def run_counts(tries, size): print " Actual: %d" % tries print " Error: %f%%" % abs((1 - (float(count) / float(tries))) * 100) run_counts(1000000000, 4096) -
devdazed revised this gist
Oct 11, 2012 . 1 changed file with 6 additions and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -2,6 +2,7 @@ Simple Linear Probabilistic Counters Credit for idea goes to: http://highscalability.com/blog/2012/4/5/big-data-counting-how-to-count-a-billion-distinct-objects-us.html http://highlyscalable.wordpress.com/2012/05/01/probabilistic-structures-web-analytics-data-mining/ Installation: @@ -53,8 +54,12 @@ def run_counts(tries, size): lpc = LPCounter(size) print "%d Kilobyte Bitmap" % size print " Incrementing %d items" % tries for i in itertools.count(0): if i > tries: break lpc.increment(i) print " Counting" count = lpc.current_count() print " Count: %d" % count -
devdazed revised this gist
Oct 11, 2012 . 1 changed file with 3 additions and 4 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,8 +1,7 @@ """ Simple Linear Probabilistic Counters Credit for idea goes to: http://highlyscalable.wordpress.com/2012/05/01/probabilistic-structures-web-analytics-data-mining/ Installation: @@ -17,7 +16,7 @@ class LPCounter(object): """ Real Time Linear Probabilistic Counter of Unique Items """ def __init__(self, max_space): @@ -62,4 +61,4 @@ def run_counts(tries, size): print " Actual: %d" % tries print " Error: %f%%" % abs((1 - (float(count) / float(tries))) * 100) run_counts(1000000000, 32768) -
devdazed revised this gist
Oct 11, 2012 . 1 changed file with 1 addition and 0 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -2,6 +2,7 @@ Simple Linear Probabalistic Counters Credit for idea goes to: http://highscalability.com/blog/2012/4/5/big-data-counting-how-to-count-a-billion-distinct-objects-us.html http://highlyscalable.wordpress.com/2012/05/01/probabilistic-structures-web-analytics-data-mining/ Installation: -
devdazed revised this gist
Oct 11, 2012 . 1 changed file with 3 additions and 0 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,6 +1,9 @@ """ Simple Linear Probabalistic Counters Credit for idea goes to: http://highlyscalable.wordpress.com/2012/05/01/probabilistic-structures-web-analytics-data-mining/ Installation: pip install murmur pip install bitarray -
devdazed revised this gist
Oct 11, 2012 . 1 changed file with 3 additions and 2 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -7,6 +7,7 @@ """ import murmur import math import itertools from bitarray import bitarray @@ -49,12 +50,12 @@ def run_counts(tries, size): lpc = LPCounter(size) print "%d Kilobyte Bitmap" % size print " Incrementing %d items" % tries for i in itertools.count(tries): lpc.increment(i) print " Counting" count = lpc.current_count() print " Count: %d" % count print " Actual: %d" % tries print " Error: %f%%" % abs((1 - (float(count) / float(tries))) * 100) run_counts(1000000000, 32768) -
devdazed created this gist
Oct 11, 2012 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,60 @@ """ Simple Linear Probabalistic Counters Installation: pip install murmur pip install bitarray """ import murmur import math from bitarray import bitarray class LPCounter(object): """ Real Time Linear Probablistic Counter of Unique Items """ def __init__(self, max_space): """ Initializes the counter, max_space is the maximum amount of memory (in KB) you want to use to maintain your counter, more memory=more accurate """ self.bit_map = bitarray(8 * 1024 * max_space) self.bit_map.setall(False) def current_count(self): """ Gets the current value of the bitmap, to do that we follow the formula: -size * ln(unset_bits/size) """ ratio = float(self.bit_map.count(False)) / float(self.bit_map.length()) if ratio == 0.0: return self.bit_map.length() else: return -self.bit_map.length() * math.log(ratio) def increment(self, item): """ Counts an item """ mm_hash = murmur.string_hash(str(item)) offset = mm_hash % self.bit_map.length() self.bit_map[offset] = True if __name__ == '__main__': def run_counts(tries, size): """Run some counts""" lpc = LPCounter(size) print "%d Kilobyte Bitmap" % size print " Incrementing %d items" % tries for i in range(1, tries): lpc.increment(i) print " Counting" count = lpc.current_count() print " Count: %d" % count print " Actual: %d" % tries print " Error: %f%%" % abs((1 - (float(count) / float(tries))) * 100) run_counts(100000, 5)