Created
January 30, 2020 13:22
-
-
Save joelkuiper/171128b93af94d268046bbd73fd4e489 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging | |
import multiprocessing as mp, os | |
import re | |
import gensim.parsing.preprocessing as preprocessing | |
from nltk.tokenize.treebank import TreebankWordTokenizer | |
# Timestamped logging so long preprocessing/training runs can be followed.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Collapses runs of real line-break control characters (CRLF, LF, vertical tab).
LINEBREAK_REGEX = re.compile(r'((\r\n)|[\n\v])+', flags=re.U|re.M)
# Also matches the literal two-character escape sequences "\n" / "\r"
# appearing in the text itself (note the \\n / \\r alternatives).
NEWLINES = re.compile(r'\n|\r|\\n|\\r', flags=re.U|re.M)
# Shared Penn-Treebank-style word tokenizer (NLTK); used by process() below.
tokenizer = TreebankWordTokenizer()
# Standalone numbers: comma-grouped thousands ("1,234.5"), European-style
# grouping with space or dot ("1 234,5" / "1.234,5"), bare decimals
# (".5", "3,14"), or plain digit runs — optionally signed, and anchored so
# digits embedded in words are left alone.
RE_NUMBER = re.compile(
    r"(?:^|(?<=[^\w,.]))[+-]?"
    r"(([1-9]\d{0,2}(,\d{3})+(\.\d*)?)|([1-9]\d{0,2}([ .]\d{3})+(,\d*)?)|(\d*?[.,]\d+)|\d+)"
    r"(?:$|(?=\b))")


def replace_numbers(text, replace_with='-NUM-'):
    """Substitute every number occurring in *text* with the *replace_with* token."""
    return re.sub(RE_NUMBER, replace_with, text)
def process(line):
    """Normalize one raw sentence and append it to the shared output file.

    Pipeline: strip HTML/XML tags, remove hard line breaks, tokenize with
    the Treebank tokenizer wrapping the sentence in -BOS-/-EOS- markers,
    replace numbers with -NUM-, and drop any remaining escaped newlines.
    """
    # Chunks are read in binary mode upstream, so lines may arrive as
    # bytes; the str-based regexes and gensim's strip_tags would raise a
    # TypeError on bytes input — decode defensively first.
    if isinstance(line, bytes):
        line = line.decode('utf-8', errors='replace')
    line = preprocessing.strip_tags(line)
    line = re.sub(LINEBREAK_REGEX, '', line)
    # Second arg True: convert_parentheses — (/) become -LRB-/-RRB- tokens.
    line = " ".join(["-BOS-"] + tokenizer.tokenize(line, True) + ["-EOS-"])
    line = replace_numbers(line)
    line = re.sub(NEWLINES, '', line)
    # NOTE(review): `out` is a module-global handle inherited by every worker
    # process; concurrent appends may interleave across workers — confirm
    # that line-level interleaving is acceptable for this corpus.
    out.write(line + "\n")
# Path of the raw corpus to normalize (one sentence per line).
input_file = "Downloads/sentences.txt"
# Shared output handle; opened in append mode at import time and inherited
# by the worker processes created below.  Closed at the end of the script.
out = open("sentences_norm_punct.txt", "a+")
def process_wrapper(chunkStart, chunkSize):
    """Normalize every line inside one byte-range chunk of ``input_file``.

    Runs inside a worker process; ``chunkStart``/``chunkSize`` come from
    ``chunkify`` and are aligned on line breaks, so no line is split.
    """
    # Binary mode is required so seek() offsets match chunkify's byte offsets.
    with open(input_file, 'rb') as f:
        f.seek(chunkStart)
        lines = f.read(chunkSize).splitlines()
    for raw in lines:
        # Decode here: the downstream regexes and tokenizer operate on str,
        # while the binary read above yields bytes (the original passed
        # bytes straight through, which raises TypeError in process()).
        process(raw.decode('utf-8', errors='replace'))
def chunkify(fname, size=1024*1024):
    """Yield ``(start_offset, length)`` byte ranges covering ``fname``.

    Each chunk is roughly ``size`` bytes, extended forward to the next
    newline so no line is split across chunks.  The final chunk's length
    may extend past EOF; reading it simply returns fewer bytes.
    """
    fileEnd = os.path.getsize(fname)
    # ``with`` guarantees the handle is closed even when the consumer
    # abandons the generator early (the original leaked it in that case).
    with open(fname, 'rb') as f:
        chunkEnd = f.tell()
        while True:
            chunkStart = chunkEnd
            f.seek(size, 1)   # skip ahead roughly one chunk...
            f.readline()      # ...then advance to the end of that line
            chunkEnd = f.tell()
            yield chunkStart, chunkEnd - chunkStart
            if chunkEnd > fileEnd:
                break
# --- Parallel corpus normalization ----------------------------------------
# NOTE(review): this runs at import time; consider guarding the script body
# with `if __name__ == "__main__":` — required on spawn-based platforms.
NUM_CORES = 22
pool = mp.Pool(NUM_CORES)
jobs = []

# Fan out one job per line-aligned chunk of the input corpus.
for chunkStart, chunkSize in chunkify(input_file):
    jobs.append(pool.apply_async(process_wrapper, (chunkStart, chunkSize)))

# Block until every chunk has been processed; get() also re-raises any
# exception that occurred inside a worker.
for job in jobs:
    job.get()

# Clean up: close AND join the pool so worker processes fully exit (and
# flush their inherited copies of the output handle) before the normalized
# file is read by the training step below.  The original omitted join(),
# so training could observe a partially flushed file.
pool.close()
pool.join()
out.close()
# --- Train FastText embeddings on the normalized corpus -------------------
from gensim.models.fasttext import FastText
# 128-dim vectors, 5-token window, ignore words seen fewer than 20 times.
# No sg flag is passed, so training uses gensim's default mode (CBOW) —
# matching the "cbow" save name below.
model = FastText(size=128, window=5, min_count=20, workers=NUM_CORES)
model.build_vocab(corpus_file="sentences_norm_punct.txt")
# corpus_file mode streams the text file directly from optimized C code.
# NOTE(review): gensim's docs suggest also passing
# total_words=model.corpus_total_words when using corpus_file — confirm
# against the installed gensim version.
model.train(corpus_file="sentences_norm_punct.txt", total_examples=model.corpus_count, epochs=5)
model.save("fasttext_128_cbow")
# from gensim.models.fasttext import FastText | |
# from gensim.models.word2vec import LineSentence | |
# logging.info("Training model") | |
# model = FastText(corpus_file="sentences_norm_punct.txt", workers=NUM_CORES, hs=1, sg=1, min_count=25, size=128) | |
# logging.info("Saving model") | |
# model.wv.save("fasttext_128_norm") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment