ConstantineLignos · March 22, 2013 20:36
diff --git a/freqsampler.py b/freqsampler.py
 #!/usr/bin/env python
 """Randomly sample items from a list of items with their frequencies."""


 import sys
 import random
 from operator import itemgetter


 def parse_countword_line(line):
    """Parse a line of the form '<count> <word>' into a tuple (word, count)."""
    count, word = line.split()
    return (word, int(count))


 def load_word_counts(path):
    """Load a wordlist file into a dictionary of form {word: count}."""
    with open(path, 'rU') as wordlist:
        return dict(parse_countword_line(line) for line in wordlist)


 def counts_to_freqs(countdict):
    """Convert a dictionary with counts as values to frequencies."""
    # Convert total to float so dividing by it will produce a float
    total = float(sum(countdict.itervalues()))
    # Convert counts to frequencies
    return {word: (count / total) for word, count in countdict.iteritems()}


 def sample_n_keys(freq_dict, numkeys):
    """Sample numkeys keys from freq_dict, whose values are frequencies.

    Uses Shannon-Miller-Selfridge sampling. It may be possible to speed
    up sampling by using bisection instead of summing, but this is fast
    enough."""

    # Sort into word and freq lists. Sorting speeds up sampling by
    # putting frequent items first.
    words, freqs = zip(*sorted(freq_dict.items(), key=itemgetter(1),
                               reverse=True))
    # Sampling loop, run numkeys times
    for _ in range(numkeys):
        # Draw a random number and sum frequencies until we hit it
        rand = random.random()
        freq_sum = 0.0
        for idx, freq in enumerate(freqs):
            freq_sum += freq
            if freq_sum > rand:
                yield words[idx]
                # After yielding, return to the sampling loop
                break


 def main():
    """Parse arguments, get the frequencies, and print the samples."""
    try:
        wordlist_path = sys.argv[1]
        n_samples = int(sys.argv[2])
    except IndexError:
        print "Usage: freqsampler wordlist nsamples"
        sys.exit(1)

    word_freqs = counts_to_freqs(load_word_counts(wordlist_path))
    for key in sample_n_keys(word_freqs, n_samples):
        print key


 if __name__ == "__main__":
    main()
	#!/usr/bin/env python
	"""Randomly sample items from a list of items with their frequencies."""


	import sys
	import random
	from operator import itemgetter


	def parse_countword_line(line):
	"""Parse a line of the form '<count> <word>' into a tuple (word, count)."""
	count, word = line.split()
	return (word, int(count))


	def load_word_counts(path):
	"""Load a wordlist file into a dictionary of form {word: count}."""
	with open(path, 'rU') as wordlist:
	return dict(parse_countword_line(line) for line in wordlist)


	def counts_to_freqs(countdict):
	"""Convert a dictionary with counts as values to frequencies."""
	# Convert total to float so dividing by it will produce a float
	total = float(sum(countdict.itervalues()))
	# Convert counts to frequencies
	return {word: (count / total) for word, count in countdict.iteritems()}


	def sample_n_keys(freq_dict, numkeys):
	"""Sample numkeys keys from freq_dict, whose values are frequencies.

	Uses Shannon-Miller-Selfridge sampling. It may be possible to speed
	up sampling by using bisection instead of summing, but this is fast
	enough."""

	# Sort into word and freq lists. Sorting speeds up sampling by
	# putting frequent items first.
	words, freqs = zip(*sorted(freq_dict.items(), key=itemgetter(1),
	reverse=True))
	# Sampling loop, run numkeys times
	for _ in range(numkeys):
	# Draw a random number and sum frequencies until we hit it
	rand = random.random()
	freq_sum = 0.0
	for idx, freq in enumerate(freqs):
	freq_sum += freq
	if freq_sum > rand:
	yield words[idx]
	# After yielding, return to the sampling loop
	break


	def main():
	"""Parse arguments, get the frequencies, and print the samples."""
	try:
	wordlist_path = sys.argv[1]
	n_samples = int(sys.argv[2])
	except IndexError:
	print "Usage: freqsampler wordlist nsamples"
	sys.exit(1)

	word_freqs = counts_to_freqs(load_word_counts(wordlist_path))
	for key in sample_n_keys(word_freqs, n_samples):
	print key


	if __name__ == "__main__":
	main()