Created
January 30, 2020 13:22
-
-
Save joelkuiper/171128b93af94d268046bbd73fd4e489 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging | |
import multiprocessing as mp, os | |
import re | |
import gensim.parsing.preprocessing as preprocessing | |
from nltk.tokenize.treebank import TreebankWordTokenizer | |
# Timestamped logging so long preprocessing/training runs can be followed.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Collapses runs of real line-break control characters (CRLF, LF, vertical tab).
LINEBREAK_REGEX = re.compile(r'((\r\n)|[\n\v])+', flags=re.U|re.M)
# Also matches the literal two-character escape sequences "\n" / "\r"
# appearing in the text itself (note the \\n / \\r alternatives).
NEWLINES = re.compile(r'\n|\r|\\n|\\r', flags=re.U|re.M)
# Shared Penn-Treebank-style word tokenizer (NLTK); used by process() below.
tokenizer = TreebankWordTokenizer()
# Standalone numbers: comma-grouped thousands ("1,234.5"), European-style
# grouping with space or dot ("1 234,5" / "1.234,5"), bare decimals
# (".5", "3,14"), or plain digit runs — optionally signed, and anchored so
# digits embedded in words are left alone.
RE_NUMBER = re.compile(
    r"(?:^|(?<=[^\w,.]))[+-]?"
    r"(([1-9]\d{0,2}(,\d{3})+(\.\d*)?)|([1-9]\d{0,2}([ .]\d{3})+(,\d*)?)|(\d*?[.,]\d+)|\d+)"
    r"(?:$|(?=\b))")


def replace_numbers(text, replace_with='-NUM-'):
    """Substitute every number occurring in *text* with the *replace_with* token."""
    return re.sub(RE_NUMBER, replace_with, text)
def process(line):
    """Normalize one raw sentence and append it to the shared output file.

    Pipeline: strip HTML/XML tags, remove hard line breaks, tokenize with
    the Treebank tokenizer wrapping the sentence in -BOS-/-EOS- markers,
    replace numbers with -NUM-, and drop any remaining escaped newlines.
    """
    # Chunks are read in binary mode upstream, so lines may arrive as
    # bytes; the str-based regexes and gensim's strip_tags would raise a
    # TypeError on bytes input — decode defensively first.
    if isinstance(line, bytes):
        line = line.decode('utf-8', errors='replace')
    line = preprocessing.strip_tags(line)
    line = re.sub(LINEBREAK_REGEX, '', line)
    # Second arg True: convert_parentheses — (/) become -LRB-/-RRB- tokens.
    line = " ".join(["-BOS-"] + tokenizer.tokenize(line, True) + ["-EOS-"])
    line = replace_numbers(line)
    line = re.sub(NEWLINES, '', line)
    # NOTE(review): `out` is a module-global handle inherited by every worker
    # process; concurrent appends may interleave across workers — confirm
    # that line-level interleaving is acceptable for this corpus.
    out.write(line + "\n")
# Path of the raw corpus to normalize (one sentence per line).
input_file = "Downloads/sentences.txt"
# Shared output handle; opened in append mode at import time and inherited
# by the worker processes created below.  Closed at the end of the script.
out = open("sentences_norm_punct.txt", "a+")
def process_wrapper(chunkStart, chunkSize):
    """Normalize every line inside one byte-range chunk of ``input_file``.

    Runs inside a worker process; ``chunkStart``/``chunkSize`` come from
    ``chunkify`` and are aligned on line breaks, so no line is split.
    """
    # Binary mode is required so seek() offsets match chunkify's byte offsets.
    with open(input_file, 'rb') as f:
        f.seek(chunkStart)
        lines = f.read(chunkSize).splitlines()
    for raw in lines:
        # Decode here: the downstream regexes and tokenizer operate on str,
        # while the binary read above yields bytes (the original passed
        # bytes straight through, which raises TypeError in process()).
        process(raw.decode('utf-8', errors='replace'))
def chunkify(fname, size=1024*1024):
    """Yield ``(start_offset, length)`` byte ranges covering ``fname``.

    Each chunk is roughly ``size`` bytes, extended forward to the next
    newline so no line is split across chunks.  The final chunk's length
    may extend past EOF; reading it simply returns fewer bytes.
    """
    fileEnd = os.path.getsize(fname)
    # ``with`` guarantees the handle is closed even when the consumer
    # abandons the generator early (the original leaked it in that case).
    with open(fname, 'rb') as f:
        chunkEnd = f.tell()
        while True:
            chunkStart = chunkEnd
            f.seek(size, 1)   # skip ahead roughly one chunk...
            f.readline()      # ...then advance to the end of that line
            chunkEnd = f.tell()
            yield chunkStart, chunkEnd - chunkStart
            if chunkEnd > fileEnd:
                break
# --- Parallel corpus normalization ----------------------------------------
# NOTE(review): this runs at import time; consider guarding the script body
# with `if __name__ == "__main__":` — required on spawn-based platforms.
NUM_CORES = 22
pool = mp.Pool(NUM_CORES)
jobs = []

# Fan out one job per line-aligned chunk of the input corpus.
for chunkStart, chunkSize in chunkify(input_file):
    jobs.append(pool.apply_async(process_wrapper, (chunkStart, chunkSize)))

# Block until every chunk has been processed; get() also re-raises any
# exception that occurred inside a worker.
for job in jobs:
    job.get()

# Clean up: close AND join the pool so worker processes fully exit (and
# flush their inherited copies of the output handle) before the normalized
# file is read by the training step below.  The original omitted join(),
# so training could observe a partially flushed file.
pool.close()
pool.join()
out.close()
# --- Train FastText embeddings on the normalized corpus -------------------
from gensim.models.fasttext import FastText
# 128-dim vectors, 5-token window, ignore words seen fewer than 20 times.
# No sg flag is passed, so training uses gensim's default mode (CBOW) —
# matching the "cbow" save name below.
model = FastText(size=128, window=5, min_count=20, workers=NUM_CORES)
model.build_vocab(corpus_file="sentences_norm_punct.txt")
# corpus_file mode streams the text file directly from optimized C code.
# NOTE(review): gensim's docs suggest also passing
# total_words=model.corpus_total_words when using corpus_file — confirm
# against the installed gensim version.
model.train(corpus_file="sentences_norm_punct.txt", total_examples=model.corpus_count, epochs=5)
model.save("fasttext_128_cbow")
# from gensim.models.fasttext import FastText | |
# from gensim.models.word2vec import LineSentence | |
# logging.info("Training model") | |
# model = FastText(corpus_file="sentences_norm_punct.txt", workers=NUM_CORES, hs=1, sg=1, min_count=25, size=128) | |
# logging.info("Saving model") | |
# model.wv.save("fasttext_128_norm") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment