Skip to content

Instantly share code, notes, and snippets.

@joelkuiper
Created January 30, 2020 13:22
Show Gist options
  • Save joelkuiper/171128b93af94d268046bbd73fd4e489 to your computer and use it in GitHub Desktop.
import logging
import multiprocessing as mp, os
import re
import gensim.parsing.preprocessing as preprocessing
from nltk.tokenize.treebank import TreebankWordTokenizer
# Timestamped INFO logging so long preprocessing/training runs can be monitored.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Matches any run of CRLF / LF / vertical-tab characters (collapsed inside process()).
LINEBREAK_REGEX = re.compile(r'((\r\n)|[\n\v])+', flags=re.U|re.M)
# Matches literal newlines AND escaped "\n"/"\r" two-character sequences left in the text.
NEWLINES = re.compile(r'\n|\r|\\n|\\r', flags=re.U|re.M)
# Single module-level Penn-Treebank-style tokenizer instance, reused by process().
tokenizer = TreebankWordTokenizer()
# Number matcher: a sign-optional numeral bounded by non-word characters.
# Alternatives, in priority order:
#   1,234,567.89  (comma thousands, optional decimal point)
#   1 234 567,89  (space/dot thousands, optional decimal comma)
#   .5 / 3,14     (bare decimal)
#   plain digit runs
_NUMBER_FORMS = "|".join([
    r"([1-9]\d{0,2}(,\d{3})+(\.\d*)?)",
    r"([1-9]\d{0,2}([ .]\d{3})+(,\d*)?)",
    r"(\d*?[.,]\d+)",
    r"\d+",
])
RE_NUMBER = re.compile(
    r"(?:^|(?<=[^\w,.]))[+-]?"
    + "(" + _NUMBER_FORMS + ")"
    + r"(?:$|(?=\b))")


def replace_numbers(text, replace_with='-NUM-'):
    """Return ``text`` with every number occurrence substituted by ``replace_with``."""
    return RE_NUMBER.sub(replace_with, text)
def process(line):
    """Normalize one sentence and append it to the shared ``out`` file.

    Pipeline: strip HTML tags, drop hard line breaks, Treebank-tokenize the
    sentence between -BOS-/-EOS- sentinels, mask numbers with -NUM-, remove
    any leftover literal/escaped newlines, then write one line to ``out``.

    Parameters
    ----------
    line : str or bytes
        Raw sentence. Bytes are accepted because chunks are read in binary
        mode upstream ('rb' in process_wrapper); the str-pattern regexes and
        strip_tags below would raise TypeError on undecoded bytes.
    """
    if isinstance(line, bytes):
        line = line.decode('utf-8', errors='replace')
    line = preprocessing.strip_tags(line)
    line = LINEBREAK_REGEX.sub('', line)
    # Second positional arg presumably enables parenthesis conversion
    # (-LRB-/-RRB-) in NLTK's Treebank tokenizer -- TODO confirm.
    line = " ".join(["-BOS-"] + tokenizer.tokenize(line, True) + ["-EOS-"])
    line = replace_numbers(line)
    line = NEWLINES.sub('', line)
    out.write(line + "\n")
# Path of the raw one-sentence-per-line corpus to normalize.
input_file = "Downloads/sentences.txt"
# Shared output handle, append mode; written by process() as a side effect.
# NOTE(review): worker processes inherit this handle via fork, so concurrent
# appends from several processes may interleave -- verify output integrity,
# or serialize writes through the parent / a queue.
out = open("sentences_norm_punct.txt", "a+")
def process_wrapper(chunkStart, chunkSize):
    """Worker entry point: normalize every line in one byte range of ``input_file``.

    Parameters
    ----------
    chunkStart : int
        Byte offset where the chunk begins (produced by chunkify()).
    chunkSize : int
        Number of bytes to read; chunkify() aligns chunk ends to newlines,
        so the slice contains only whole lines.
    """
    with open(input_file, 'rb') as f:
        f.seek(chunkStart)
        for raw in f.read(chunkSize).splitlines():
            # Decode here: the file is opened in binary mode, but the
            # downstream regexes and tokenizer operate on str.
            process(raw.decode('utf-8', errors='replace'))
def chunkify(fname, size=1024 * 1024):
    """Yield ``(start, length)`` byte ranges of *fname*, each ending on a line boundary.

    Chunks are contiguous and cover the whole file; each is roughly *size*
    bytes, extended to the next newline so no line is split across chunks.

    Parameters
    ----------
    fname : str
        Path of the file to partition.
    size : int, optional
        Approximate chunk size in bytes (default 1 MiB).

    Fixes over the previous version: the handle is closed via ``with`` even
    if the generator is abandoned; no phantom chunk is yielded when a chunk
    boundary lands exactly on EOF; the final chunk's length is clamped to
    the file size instead of overshooting it.
    """
    file_end = os.path.getsize(fname)
    with open(fname, 'rb') as f:
        chunk_start = 0
        while chunk_start < file_end:
            f.seek(chunk_start + size)   # jump ~size bytes ahead...
            f.readline()                 # ...then extend to the next newline
            chunk_end = min(f.tell(), file_end)
            yield chunk_start, chunk_end - chunk_start
            chunk_start = chunk_end
#init objects
# Fan the file's byte ranges out to a pool of worker processes; each worker
# normalizes its chunk and appends results to the shared output file.
NUM_CORES = 22
pool = mp.Pool(NUM_CORES)
jobs = []
#create jobs
for chunkStart, chunkSize in chunkify(input_file):
    jobs.append(pool.apply_async(process_wrapper, (chunkStart, chunkSize)))
#wait for all jobs to finish
for job in jobs:
    job.get()  # blocks until done; re-raises any exception from the worker
#clean up
pool.close()
# NOTE(review): no pool.join() before continuing, and workers write to the
# fork-inherited `out` handle -- child buffers may flush after this point or
# interleave. Confirm the output file is complete before training on it.
out.close()
# Train FastText embeddings on the normalized corpus and save the full model.
from gensim.models.fasttext import FastText
# NOTE(review): `size=` is the gensim 3.x keyword (renamed `vector_size` in
# gensim 4.x) -- pin the gensim version or this call breaks on upgrade.
model = FastText(size=128, window=5, min_count=20, workers=NUM_CORES)
model.build_vocab(corpus_file="sentences_norm_punct.txt")
# NOTE(review): with corpus_file, gensim's train() documentation recommends
# total_words=model.corpus_total_words; total_examples is the convention for
# sentence iterables -- verify this combination against the gensim docs.
model.train(corpus_file="sentences_norm_punct.txt", total_examples=model.corpus_count, epochs=5)
model.save("fasttext_128_cbow")
# --- Earlier experiment kept for reference: skip-gram + hierarchical softmax ---
# from gensim.models.fasttext import FastText
# from gensim.models.word2vec import LineSentence
# logging.info("Training model")
# model = FastText(corpus_file="sentences_norm_punct.txt", workers=NUM_CORES, hs=1, sg=1, min_count=25, size=128)
# logging.info("Saving model")
# model.wv.save("fasttext_128_norm")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment