Last active
January 25, 2019 10:20
-
-
Save naoh16/eabd11ed010b450963c108b2436eac4f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""
mkdfa.py
Python 3 implementation of mkdfa.pl.
Copyright (c) 2019 Sunao Hara.
This script is released under the MIT license.
https://opensource.org/licenses/mit-license.php
"""
import sys | |
import os | |
import shutil | |
import tempfile | |
import re | |
import codecs | |
import subprocess | |
# Directory containing this script; the helper binaries are looked up there.
JULIUS_BIN = os.path.dirname(os.path.abspath(__file__))

# Executables carry an ".exe" suffix on Windows only.
_EXE_SUFFIX = '.exe' if sys.platform == 'win32' else ''

# Full paths of the external tools invoked by call_mkfa().
CMD_MKFA = JULIUS_BIN + '/mkfa' + _EXE_SUFFIX
CMD_DFA_MINIMIZE = JULIUS_BIN + '/dfa_minimize' + _EXE_SUFFIX
# generate reverse grammar file
def gen_reverse_grammar(gramfile, rgramfile):
    """Write a copy of a grammar file with each rule's right-hand side reversed.

    A rule line has the form ``LEFT: SYM1 SYM2 ...``.  Text from ``#`` to the
    end of a line is a comment and is discarded; lines that contain no ``:``
    once the comment is removed are skipped.  The left-hand side is copied
    verbatim and the right-hand symbols are written in reverse order,
    separated by single spaces.  A summary with the number of rules is
    printed to stdout.

    Args:
        gramfile: Path of the grammar file to read (UTF-8).
        rgramfile: Path of the reversed grammar file to write (UTF-8).
    """
    results = []
    with codecs.open(gramfile, "r", 'utf-8') as fin:
        for line in fin:
            # Keep only the text *before* '#'; the remainder is a comment.
            line = line.split('#', 1)[0]
            left, colon, right = line.partition(':')
            if not colon:
                # Blank line, comment-only line, or not a rule.
                continue
            # split() without arguments breaks on any run of whitespace, so
            # tab-separated symbols are reversed correctly as well.
            symbols = right.split()
            symbols.reverse()
            results.append(left + ':' + ' '.join(symbols))

    # Write with the same encoding the input was read with, instead of the
    # platform default, so non-ASCII symbols survive on every platform.
    with open(rgramfile, "w", encoding='utf-8') as fout:
        for rule in results:
            fout.write(rule + '\n')

    print("%s has %d rules" % (gramfile, len(results)))
def extract_vocafile(src, catefile, termfile):
    """Extract the category names of a voca file into two listing files.

    A line of ``src`` that starts with ``%`` declares a category; its name is
    the rest of the line with surrounding whitespace removed.  Every other
    non-blank line is counted as a word.  A summary with both counts is
    printed to stdout.

    Args:
        src: Path of the voca file to read (UTF-8).
        catefile: Output path; receives one ``#<category>`` line per category.
        termfile: Output path; receives one ``<index> <category>`` line per
            category, with indices counted from 0 in order of appearance.
    """
    categories = []
    n_words = 0
    with codecs.open(src, "r", 'utf-8') as fin:
        for line in fin:
            if not line.strip():
                continue
            if line.startswith('%'):
                categories.append(line[1:].strip())
            else:
                n_words += 1

    with codecs.open(catefile, "w", 'utf-8') as fout:
        for category in categories:
            print("#%s" % category, file=fout)

    with codecs.open(termfile, "w", 'utf-8') as fout:
        for termid, category in enumerate(categories):
            print("%d %s" % (termid, category), file=fout)

    # Report the file that was actually read (the ``src`` parameter) rather
    # than relying on a module-level variable that may not exist.
    print("%s has %d categories and %d words" % (src, len(categories), n_words))
def vocafile2dictfile(vocafile, dictfile):
    """Convert a voca file into a dict file.

    Lines starting with ``%`` open a new category and advance the category
    index (the first category is 0).  Every other non-blank line is a word
    entry ``<name> <phones...>`` and is written as
    ``<category index>\\t[<name>]\\t<phones>``.

    Args:
        vocafile: Path of the voca file to read (UTF-8).
        dictfile: Path of the dict file to write (UTF-8).
    """
    with codecs.open(vocafile, "r", 'utf-8') as fin, \
            codecs.open(dictfile, "w", 'utf-8') as fout:
        category_id = -1  # becomes 0 at the first '%' line
        for line in fin:
            line = line.strip()
            if not line:
                continue
            if line.startswith('%'):
                category_id += 1
                continue
            # Split on the first run of *any* whitespace so tab-separated
            # entries work too; an entry without phones gets an empty
            # pronunciation instead of raising ValueError.
            fields = line.split(None, 1)
            name = fields[0]
            phones = fields[1] if len(fields) > 1 else ''
            print("%d\t[%s]\t%s" % (category_id, name, phones), file=fout)
def call_mkfa(rgramfile, tmpvocafile, dfafile, tmpprefix):
    """Run mkfa, then dfa_minimize when available, to produce ``dfafile``.

    mkfa is invoked with ``-e1`` and writes ``<tmpprefix>.dfa`` and
    ``<tmpprefix>.h``.  If the dfa_minimize executable exists next to this
    script it is run on the intermediate DFA to produce ``dfafile``;
    otherwise a warning is printed and the intermediate DFA is copied
    unchanged.  Both intermediate files are removed before returning, also
    when a step fails.

    Args:
        rgramfile: Grammar file passed to mkfa via ``-fg``.
        tmpvocafile: Voca file passed to mkfa via ``-fv``.
        dfafile: Path of the final DFA file.
        tmpprefix: Path prefix for the intermediate ``.dfa`` / ``.h`` files.

    Raises:
        subprocess.CalledProcessError: If mkfa or dfa_minimize exits with a
            non-zero status.
    """
    tmp_dfa = tmpprefix + '.dfa'
    tmp_header = tmpprefix + '.h'
    try:
        # check=True: stop here when mkfa fails instead of continuing with
        # a missing or incomplete intermediate DFA.
        subprocess.run(
            [CMD_MKFA, '-e1',
             '-fg', rgramfile,
             '-fv', tmpvocafile,
             '-fo', tmp_dfa,
             '-fh', tmp_header],
            check=True)
        print("---")
        if os.path.exists(CMD_DFA_MINIMIZE):
            subprocess.run([CMD_DFA_MINIMIZE, tmp_dfa, '-o', dfafile],
                           check=True)
        else:
            print("Warning: dfa_minimize not found in the same place as mkdfa.py")
            print("Warning: no minimization performed")
            shutil.copyfile(tmp_dfa, dfafile)
    finally:
        # Best-effort cleanup: a file may be absent if mkfa failed early.
        for path in (tmp_dfa, tmp_header):
            try:
                os.unlink(path)
            except FileNotFoundError:
                pass
if __name__ == '__main__':
    # Command-line entry point: from <prefix>.grammar and <prefix>.voca,
    # generate <prefix>.dfa, <prefix>.term and <prefix>.dict.
    if len(sys.argv) < 2:
        print("usage: %s prefix" % os.path.basename(sys.argv[0]),
              file=sys.stderr)
        sys.exit(1)

    corename = sys.argv[1]
    gramfile = corename + ".grammar"
    vocafile = corename + ".voca"
    dfafile = corename + ".dfa"
    dictfile = corename + ".dict"
    termfile = corename + ".term"

    # Use a private, uniquely named directory for the intermediate files so
    # concurrent runs cannot overwrite each other's temporaries.
    tmpdir = tempfile.mkdtemp(prefix="_julius_mkdfa_tmp_")
    tmpprefix = os.path.join(tmpdir, "g")
    tmpvocafile = tmpprefix + ".voca"
    rgramfile = tmpprefix + ".grammar"

    try:
        # generate reverse grammar file
        gen_reverse_grammar(gramfile, rgramfile)

        # make temporary voca for mkfa (include only category info)
        extract_vocafile(vocafile, tmpvocafile, termfile)
        print('---')

        call_mkfa(rgramfile, tmpvocafile, dfafile, tmpprefix)
        vocafile2dictfile(vocafile, dictfile)
        print('---')
        print("generated %s %s %s" % (dfafile, termfile, dictfile))
    finally:
        # Remove every intermediate file, whether or not the steps succeeded.
        shutil.rmtree(tmpdir, ignore_errors=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment