hayakawa · November 30, 2017 05:02 · Nov 30, 2017
diff --git a/normalize_neologd_filter.py b/normalize_neologd_filter.py
@@ -0,0 +1,69 @@
+# encoding: utf8
+from __future__ import unicode_literals
+import re
+import unicodedata
+import sys
+
+################################################################
+# This is a modified version.
+#
+# If you need the original source code, see below:
+# https://github.com/neologd/mecab-ipadic-neologd/wiki/Regexp
+#
+# (Keyword for search: 'normalize_neologd.py')
+################################################################
+
+def unicode_normalize(cls, s):
+    """Apply NFKC normalization only to runs of characters listed in ``cls``.
+
+    ``cls`` is the body of a regex character class (ranges allowed).  Every
+    maximal run of matching characters in ``s`` is NFKC-normalized as a
+    unit; all other text is passed through unchanged.  Afterwards every
+    fullwidth hyphen-minus in the whole string is replaced by ASCII ``-``.
+    """
+    run_pattern = re.compile('([{}]+)'.format(cls))
+
+    def nfkc_run(match):
+        # Normalize the whole run at once so that combining sequences
+        # inside a run can compose.
+        return unicodedata.normalize('NFKC', match.group(0))
+
+    normalized = run_pattern.sub(nfkc_run, s)
+    return re.sub('－', '-', normalized)
+
+def remove_extra_spaces(s):
+    """Collapse runs of spaces and drop spaces that touch CJK text.
+
+    Runs of ASCII and ideographic spaces are first reduced to one ASCII
+    space.  A space is then removed when it sits between two characters of
+    the CJK blocks listed below, or between such a character and a Basic
+    Latin character (in either order).  No rule removes a space between two
+    Basic Latin characters.
+    """
+    basic_latin = '\u0000-\u007F'
+    cjk_ranges = [
+        '\u4E00-\u9FFF',  # CJK UNIFIED IDEOGRAPHS
+        '\u3040-\u309F',  # HIRAGANA
+        '\u30A0-\u30FF',  # KATAKANA
+        '\u3000-\u303F',  # CJK SYMBOLS AND PUNCTUATION
+        '\uFF00-\uFFEF',  # HALFWIDTH AND FULLWIDTH FORMS
+    ]
+    blocks = ''.join(cjk_ranges)
+
+    def strip_space_between(left, right, text):
+        pair = re.compile('([{}]) ([{}])'.format(left, right))
+        # A single pass misses overlapping pairs such as "A B C" (the "B"
+        # is consumed by the first match), so repeat until nothing changes.
+        text, replaced = pair.subn(r'\1\2', text)
+        while replaced:
+            text, replaced = pair.subn(r'\1\2', text)
+        return text
+
+    s = re.sub('[ 　]+', ' ', s)
+    class_pairs = (
+        (blocks, blocks),
+        (blocks, basic_latin),
+        (basic_latin, blocks),
+    )
+    for left, right in class_pairs:
+        s = strip_space_between(left, right, s)
+    return s
+
+def normalize_neologd(s):
+    """Normalize a line of text with the (modified) NEologd rules.
+
+    Steps, in order: strip surrounding whitespace; NFKC-normalize the
+    characters in the first class below; unify hyphen variants to ``-`` and
+    long-vowel/dash variants to ``ー``; delete tilde variants; map ASCII
+    punctuation to fullwidth forms so the space-removal step treats it as
+    CJK text; remove extra spaces; NFKC-normalize the fullwidth punctuation
+    back (except the characters intentionally kept); finally turn the
+    curly quotes into ASCII quotes.
+    """
+    halfwidth = '!"#$%&\'()*+,-./:;<=>?@[¥]^_`{|}~｡､･｢｣'
+    fullwidth = '！”＃＄％＆’（）＊＋，－．／：；＜＝＞？＠［￥］＾＿｀｛｜｝〜。、・「」'
+    to_fullwidth = dict(zip(map(ord, halfwidth), map(ord, fullwidth)))
+
+    text = unicode_normalize('０-９Ａ-Ｚａ-ｚ｡-ﾟ', s.strip())
+
+    substitutions = (
+        ('[˗֊‐‑‒–⁃⁻₋−]+', '-'),  # normalize hyphens
+        ('[﹣－ｰ—―─━ー]+', 'ー'),  # normalize choonpus
+        ('[~∼∾〜〰～]', ''),  # remove tildes
+    )
+    for pattern, replacement in substitutions:
+        text = re.sub(pattern, replacement, text)
+
+    text = text.translate(to_fullwidth)
+    text = remove_extra_spaces(text)
+    # keep ＝,・,「,」
+    text = unicode_normalize('！”＃＄％＆’（）＊＋，－．／：；＜＞？＠［￥］＾＿｀｛｜｝〜', text)
+
+    for pattern, replacement in (('[’]', '\''), ('[”]', '"')):
+        text = re.sub(pattern, replacement, text)
+    return text
+
+if __name__ == "__main__":
+    # Filter mode: normalize each line read from stdin and print the result.
+    for line in sys.stdin:
+        normalized = normalize_neologd(line)
+        print(normalized)