Created
November 4, 2021 17:55
-
-
Save joodicator/a414fe4bf92bf166457981d75d5f22a2 to your computer and use it in GitHub Desktop.
estimate_scrabble_word_count.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from itertools import combinations | |
from functools import reduce | |
from operator import mul | |
from math import factorial, perm | |
from collections import Counter | |
import re | |
# English letter frequencies from | |
# <https://en.wikipedia.org/wiki/Letter_frequency>. | |
eng_letter_freqs = { | |
'A': 0.078, | |
'B': 0.02, | |
'C': 0.04, | |
'D': 0.038, | |
'E': 0.11, | |
'F': 0.014, | |
'G': 0.03, | |
'H': 0.023, | |
'I': 0.086, | |
'J': 0.0021, | |
'K': 0.0097, | |
'L': 0.053, | |
'M': 0.027, | |
'N': 0.072, | |
'O': 0.061, | |
'P': 0.028, | |
'Q': 0.0019, | |
'R': 0.073, | |
'S': 0.087, | |
'T': 0.067, | |
'U': 0.033, | |
'V': 0.01, | |
'W': 0.0091, | |
'X': 0.0027, | |
'Y': 0.016, | |
'Z': 0.0044, | |
} | |
assert 0.99 < sum(eng_letter_freqs.values()) < 1.01, \ | |
str(sum(eng_letter_freqs.values())) | |
# Valid SOWPODS words of each length, from | |
# <https://en.wikipedia.org/wiki/Collins_Scrabble_Words#Word_count>. | |
valid_words_of_length = { | |
2: 127, | |
3: 1347, | |
4: 5638, | |
5: 12972, | |
6: 23033, | |
7: 34342, | |
8: 42150, | |
} | |
CHAIN_RE = re.compile(r'(.)\1*') | |
def number_of_words(rack): | |
rack = sorted(rack) | |
# The number of times each letters occurs in the rack: | |
multiplicity = Counter(rack) | |
# Will be updated to contain the estimated answer: | |
count = 0.0 | |
for word_length in range(2, 9): | |
# The number of ways to arrange `word_length` symbols, assuming all of the | |
# symbols are distinct and that order is significant: | |
num_permutations = factorial(word_length) | |
for letters in combinations(rack, word_length): | |
# #(W : W is a permutation of `letters`, W is a valid word) | |
# = P(W is a permutation of `letters` | W is a valid word) | |
# * #(W is a valid word) | |
# ~= P(L = letters[0]) * ... * P(L = letters[-1]) | |
# * #(W is a permutation of `letters`) | |
# * #(W is a valid word) | |
# An estimate of the number of permutations of `letters` which are | |
# valid words: | |
partial_count = reduce(mul, (eng_letter_freqs[l] for l in letters)) \ | |
* num_permutations * valid_words_of_length[word_length] | |
# Examine each (contiguous) set of repeated letters within `letters`: | |
i = 0 | |
while i < len(letters): | |
letter = letters[i] | |
j = i + 1 | |
while j < len(letters) and letters[j] == letter: | |
j += 1 | |
# `c` is the number of times `letter` is repeated within `letters`, and | |
# `d` is the number of times `letter` is repeated within `rack`: | |
c, d, i = j-i, multiplicity[letter], j | |
if c == d == 1: continue | |
# Correct for the fact that due to this string of `c` letters: | |
# 1. the number of times this value of `letters` is encountered | |
# is `d!/(c! (d-c)!)` times too large; and | |
# 2. the value of `num_permutations` is `c!` times too large; | |
# and hence: `partial_count` is `d!/(d-c)!` times too large: | |
partial_count /= perm(d, c) | |
count += partial_count | |
return count | |
if __name__ == '__main__': | |
import sys | |
for arg in sys.argv[1:]: | |
arg = arg.upper() | |
print(f'{arg}\t{number_of_words(arg)}') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment