Created
July 13, 2018 00:46
-
-
Save dovinmu/e034a019566e4c93cfe126a3994b1959 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def loadWordSet():
    """Load the set of words that acrostic candidates are matched against.

    The system dictionary at ``/usr/share/dict/american-english`` is tried
    first. If it cannot be read, ``words_alpha.txt`` in the current working
    directory (from https://github.com/dwyl/english-words) is used instead.

    Returns:
        set: one entry per line of the word list that was read, with
        surrounding whitespace stripped. If neither file can be read a
        message is printed and whatever was collected (possibly an empty
        set) is returned; no exception is raised for a missing file.
    """
    s = set()
    try:
        with open('/usr/share/dict/american-english', 'r') as f:
            s.update(line.strip() for line in f)
    # Only file-access and decoding failures trigger the fallback, so that
    # Ctrl-C and genuine programming errors are not silently swallowed.
    except (OSError, UnicodeError):
        print("Could not find built-in dictionary")
        # rely on https://github.com/dwyl/english-words.git
        try:
            with open('words_alpha.txt') as f:
                s.update(line.strip() for line in f)
        except (OSError, UnicodeError):
            print("You'll need to download the file 'words_alpha.txt' from https://github.com/dwyl/english-words into this folder")
    return s
def getPlaintext(url):
    """Fetch ``url`` with an HTTP GET and return the response body as text."""
    # Imported at call time, so requests is only needed when a URL is
    # actually fetched.
    import requests
    response = requests.get(url)
    body = response.text
    return body
def getFileText(fname):
    """Return the whole contents of the text file at ``fname`` as one string."""
    with open(fname) as source:
        contents = source.read()
    return contents
def getParagraphList(text):
    """Split ``text`` into a list of paragraphs separated by blank lines.

    A line shorter than two characters (an empty line, or the lone carriage
    return left behind by CRLF line endings) ends the current paragraph.
    Every other line is stripped and appended to the current paragraph
    followed by a single space, so each returned paragraph ends with a
    trailing space.

    The assumption that paragraphs start with letters seems to hold up well
    in real-world texts, so no further cleaning of the text is attempted.

    Args:
        text: the raw text, with lines separated by ``'\\n'``.

    Returns:
        list of str: the paragraphs in order of appearance, including a
        final paragraph that is not followed by a blank line.
    """
    result = []
    parts = []  # stripped lines of the paragraph currently being collected
    for line in text.split('\n'):
        if len(line) < 2:
            if parts:
                result.append(' '.join(parts) + ' ')
                parts = []
            continue
        parts.append(line.strip())
    # The text need not end with a blank line; flush the last paragraph so
    # it is not silently dropped.
    if parts:
        result.append(' '.join(parts) + ' ')
    return result
def getWordList(text):
    """Return the words of ``text`` in order, with digits and punctuation removed.

    The text is split into paragraphs with ``getParagraphList``; digits and
    ASCII punctuation are deleted from each paragraph (not replaced by a
    space, so ``well-known`` becomes ``wellknown``), and the result is split
    on single spaces. Empty fragments are discarded.

    Args:
        text: the raw text to tokenize.

    Returns:
        list of str: the cleaned, non-empty words.
    """
    # The backslash must be written doubled: a bare "\;" is an invalid escape
    # sequence and triggers a SyntaxWarning on recent Python versions.
    strip_table = str.maketrans('', '', '\'"1234567890-=!@#$%^&*()_+{}[]|\\;:,.<>/?~`')
    result = []
    for par in getParagraphList(text):
        # one pass over the paragraph instead of one replace() per character
        cleaned = par.translate(strip_table)
        result.extend(word for word in cleaned.split(' ') if word)
    return result
def findAcrostic(text, wordset, level='paragraph', min_word_len=4, max_word_len=20):
    """Print the acrostics spelled by the initials of paragraphs or words.

    The first character of each paragraph (``level='paragraph'``) or word
    (``level='word'``) is lowercased and appended to a sliding buffer. After
    each addition the longest prefix of the buffer found in ``wordset`` is
    reported and removed from the buffer; when the buffer grows beyond
    ``max_word_len`` its oldest character is dropped. Once the input is
    exhausted the remaining buffer is drained the same way, so initials near
    the end of the text are also tried as starting positions.

    For every match a header line is printed: the word in upper case, the
    index just before its first contributing item, ``-``, and the index of
    the most recently scanned item. The contributing items follow:
    paragraphs one per line, abbreviated to their first 100 characters;
    words capitalised on a single line.

    In paragraph mode, paragraphs shorter than two characters contribute no
    initial and are skipped.

    Args:
        text: the raw text to search.
        wordset: container of lowercase words supporting ``in``.
        level: ``'paragraph'`` or ``'word'``; any other value prints nothing.
        min_word_len: shortest word length to report.
        max_word_len: longest word length to report, and the buffer capacity.

    Returns:
        None. Results are written to standard output.
    """
    if level == 'paragraph':
        items = getParagraphList(text)
        usable = [i for i, par in enumerate(items) if len(par) >= 2]
        show = _print_paragraph_match
    elif level == 'word':
        items = getWordList(text)
        usable = range(len(items))
        show = _print_word_match
    else:
        return

    charbuf = ''
    # sources[k] is the index in items of the item that supplied charbuf[k];
    # tracking it keeps the printed items right even when some were skipped.
    sources = []

    def report(j):
        print(charbuf[:j].upper(), sources[0] - 1, '-', sources[-1])
        show(items, sources[:j])

    for i in usable:
        charbuf += items[i][0].lower()
        sources.append(i)
        j = _longest_prefix_word(charbuf, wordset, min_word_len, max_word_len)
        if j:
            report(j)
            charbuf, sources = charbuf[j:], sources[j:]
        # allow the buffer to overflow by one, then slide it forward
        if len(charbuf) > max_word_len:
            charbuf, sources = charbuf[1:], sources[1:]

    # Drain the buffer: without this, words that begin within the last
    # max_word_len initials are never tested from their own start position.
    while charbuf:
        j = _longest_prefix_word(charbuf, wordset, min_word_len, max_word_len)
        if j:
            report(j)
        step = j or 1
        charbuf, sources = charbuf[step:], sources[step:]


def _longest_prefix_word(charbuf, wordset, min_word_len, max_word_len):
    """Return the length of the longest word in ``wordset`` that starts ``charbuf``, or 0."""
    # The lower bound is clamped to 1 so an empty prefix is never a match.
    for j in range(min(len(charbuf), max_word_len), max(min_word_len, 1) - 1, -1):
        if charbuf[:j] in wordset:
            return j
    return 0


def _print_paragraph_match(items, indices):
    """Print each matched paragraph on its own line, abbreviated, then a blank gap."""
    for idx in indices:
        print(items[idx][0], items[idx][1:100] + '...', end='\n')
    print('\n')


def _print_word_match(items, indices):
    """Print the matched words on one line with their initials capitalised, then a blank gap."""
    for idx in indices:
        print(items[idx][0].upper() + items[idx][1:], end=' ')
    print('\n')
def gutenbergAcrostic(url, wordset):
    """Run the paragraph-level and then the word-level acrostic search on one text.

    A ``url`` beginning with ``/`` is read as a local file through
    ``getFileText``; anything else is downloaded through ``getPlaintext``.
    Paragraph acrostics are reported from 4 letters up, word acrostics from
    7 letters up. All results are printed.
    """
    is_local_path = url[:1] == '/'
    text = getFileText(url) if is_local_path else getPlaintext(url)

    searches = (
        ('\t\t paragraphs', 'paragraph', 4),
        ('\t\t words', 'word', 7),
    )
    for heading, level, shortest in searches:
        print(heading.upper())
        findAcrostic(text, wordset, level=level, min_word_len=shortest)
if __name__ == "__main__":
    # (title, plain-text URL on gutenberg.org) for each book to scan
    books = (
        ('A Christmas Carol', 'http://www.gutenberg.org/cache/epub/46/pg46.txt'),
        ('Pride and Prejudice', 'http://www.gutenberg.org/files/1342/1342-0.txt'),
        ("Alice's Adventures in Wonderland", 'http://www.gutenberg.org/files/11/11-0.txt'),
    )
    # load the dictionary once and reuse it for every book
    wordset = loadWordSet()
    for name, url in books:
        print('\n\t\t' + name.upper())
        gutenbergAcrostic(url, wordset)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment