Created
March 22, 2013 20:36
-
-
Save ConstantineLignos/5224537 to your computer and use it in GitHub Desktop.
Sample items (i.e., words) from a list of items based on their frequencies. Assumes the input file contains lines of the form "<count> <item>".
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""Randomly sample items from a list of items with their frequencies.""" | |
import sys | |
import random | |
from operator import itemgetter | |
def parse_countword_line(line): | |
"""Parse a line of the form '<count> <word>' into a tuple (word, count).""" | |
count, word = line.split() | |
return (word, int(count)) | |
def load_word_counts(path): | |
"""Load a wordlist file into a dictionary of form {word: count}.""" | |
with open(path, 'rU') as wordlist: | |
return dict(parse_countword_line(line) for line in wordlist) | |
def counts_to_freqs(countdict): | |
"""Convert a dictionary with counts as values to frequencies.""" | |
# Convert total to float so dividing by it will produce a float | |
total = float(sum(countdict.itervalues())) | |
# Convert counts to frequencies | |
return {word: (count / total) for word, count in countdict.iteritems()} | |
def sample_n_keys(freq_dict, numkeys): | |
"""Sample numkeys keys from freq_dict, whose values are frequencies. | |
Uses Shannon-Miller-Selfridge sampling. It may be possible to speed | |
up sampling by using bisection instead of summing, but this is fast | |
enough.""" | |
# Sort into word and freq lists. Sorting speeds up sampling by | |
# putting frequent items first. | |
words, freqs = zip(*sorted(freq_dict.items(), key=itemgetter(1), | |
reverse=True)) | |
# Sampling loop, run numkeys times | |
for _ in range(numkeys): | |
# Draw a random number and sum frequencies until we hit it | |
rand = random.random() | |
freq_sum = 0.0 | |
for idx, freq in enumerate(freqs): | |
freq_sum += freq | |
if freq_sum > rand: | |
yield words[idx] | |
# After yielding, return to the sampling loop | |
break | |
def main(): | |
"""Parse arguments, get the frequencies, and print the samples.""" | |
try: | |
wordlist_path = sys.argv[1] | |
n_samples = int(sys.argv[2]) | |
except IndexError: | |
print "Usage: freqsampler wordlist nsamples" | |
sys.exit(1) | |
word_freqs = counts_to_freqs(load_word_counts(wordlist_path)) | |
for key in sample_n_keys(word_freqs, n_samples): | |
print key | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment