Skip to content

Instantly share code, notes, and snippets.

@hayakawa
Created November 30, 2017 05:02

Revisions

  1. hayakawa created this gist Nov 30, 2017.
    69 changes: 69 additions & 0 deletions normalize_neologd_filter.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,69 @@
    # encoding: utf8
    from __future__ import unicode_literals
    import re
    import unicodedata
    import sys

    ################################################################
    # This is a modified version.
    #
    # If you need the original source code, see below:
    # https://github.com/neologd/mecab-ipadic-neologd/wiki/Regexp
    #
    # (Keyword for search: 'normalize_neologd.py')
    ################################################################

    def unicode_normalize(cls, s):
        """Apply NFKC normalization to the parts of ``s`` that match ``cls``.

        Args:
            cls: Body of a regex character class (the text that goes between
                ``[`` and ``]``) selecting the characters to normalize.
            s: Text to process.

        Returns:
            ``s`` with every run of ``cls`` characters replaced by its NFKC
            form, all other text left untouched, and every remaining
            FULLWIDTH HYPHEN-MINUS (U+FF0D) replaced by ASCII ``-``.
        """
        pt = re.compile('([{}]+)'.format(cls))

        def norm(c):
            return unicodedata.normalize('NFKC', c) if pt.match(c) else c

        # The capturing group makes split() return the matched runs as well,
        # so joining the pieces rebuilds the complete string.
        s = ''.join(norm(x) for x in pt.split(s))
        # Spelled as an escape on purpose: a literal U+FF0D is easily folded
        # to ASCII '-' by width-normalizing tools, which would turn this
        # replacement into a no-op.
        s = s.replace('\uFF0D', '-')
        return s

    def remove_extra_spaces(s):
        """Collapse runs of spaces and drop single spaces that touch CJK text.

        Runs of ASCII spaces and IDEOGRAPHIC SPACE (U+3000) are first reduced
        to one ASCII space. A remaining space is then removed when it sits
        between two characters of the CJK blocks listed below, or between one
        such character and a Basic Latin character (in either order). Spaces
        between two Basic Latin characters are kept.

        Args:
            s: Text to clean up.

        Returns:
            The text with the redundant spaces removed.
        """
        # U+3000 is written as an escape so it cannot be silently folded into
        # a second ASCII space inside the character class.
        s = re.sub('[ \u3000]+', ' ', s)
        blocks = ''.join((
            '\u4E00-\u9FFF',  # CJK UNIFIED IDEOGRAPHS
            '\u3040-\u309F',  # HIRAGANA
            '\u30A0-\u30FF',  # KATAKANA
            '\u3000-\u303F',  # CJK SYMBOLS AND PUNCTUATION
            '\uFF00-\uFFEF',  # HALFWIDTH AND FULLWIDTH FORMS
        ))
        basic_latin = '\u0000-\u007F'

        def remove_space_between(cls1, cls2, s):
            # Look-arounds do not consume the neighbouring characters, so one
            # pass also handles chains such as "A B C" without rescanning.
            p = re.compile('(?<=[{}]) (?=[{}])'.format(cls1, cls2))
            return p.sub('', s)

        s = remove_space_between(blocks, blocks, s)
        s = remove_space_between(blocks, basic_latin, s)
        s = remove_space_between(basic_latin, blocks, s)
        return s

    def normalize_neologd(s):
        """Normalize Japanese text in the style of the NEologd wiki recipe.

        Steps, in order:
          1. strip surrounding whitespace;
          2. NFKC-normalize fullwidth digits/Latin letters and halfwidth
             katakana/CJK punctuation;
          3. unify hyphen look-alikes to ``-`` and long-vowel-mark
             look-alikes to U+30FC, and delete tildes;
          4. widen ASCII punctuation, remove extra spaces, then narrow the
             punctuation again except FULLWIDTH EQUALS SIGN, which stays wide;
          5. turn the curly quotes U+2019 / U+201D into ``'`` and ``"``.

        Every non-ASCII character is written as a ``\\u`` escape. Literal
        fullwidth/halfwidth characters are easily folded to ASCII by tooling,
        which changes the meaning of the character classes (for example
        U+FE63, ``-``, U+30FC reads as a reversed range and is rejected by
        ``re``).

        Args:
            s: Text to normalize.

        Returns:
            The normalized text.
        """
        s = s.strip()

        # FULLWIDTH DIGIT 0-9, FULLWIDTH LATIN A-Z / a-z, and the halfwidth
        # CJK punctuation + halfwidth katakana range U+FF61..U+FF9F.
        s = unicode_normalize(
            '\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A\uFF61-\uFF9F', s)

        def maketrans(f, t):
            return {ord(x): ord(y) for x, y in zip(f, t)}

        # normalize hyphens
        s = re.sub(
            '[\u02D7\u058A\u2010\u2011\u2012\u2013\u2043\u207B\u208B\u2212]+',
            '-', s)
        # normalize choonpus (long vowel marks) to KATAKANA-HIRAGANA
        # PROLONGED SOUND MARK
        s = re.sub(
            '[\uFE63\uFF0D\uFF70\u2014\u2015\u2500\u2501\u30FC]+',
            '\u30FC', s)
        # remove tildes
        s = re.sub('[~\u223C\u223E\u301C\u3030\uFF5E]', '', s)

        # ASCII punctuation (with YEN SIGN U+00A5 in the backslash slot) ...
        ascii_punct = '!"#$%&\'()*+,-./:;<=>?@[\u00A5]^_`{|}~'
        # ... and its wide counterpart, position by position. The quotes map
        # to U+201D / U+2019 and the tilde to WAVE DASH U+301C.
        wide_punct = (
            '\uFF01\u201D\uFF03\uFF04\uFF05\uFF06\u2019\uFF08\uFF09\uFF0A'
            '\uFF0B\uFF0C\uFF0D\uFF0E\uFF0F'
            '\uFF1A\uFF1B\uFF1C\uFF1D\uFF1E\uFF1F\uFF20'
            '\uFF3B\uFFE5\uFF3D\uFF3E\uFF3F\uFF40'
            '\uFF5B\uFF5C\uFF5D\u301C'
        )
        # Halfwidth ideographic full stop, comma, middle dot and corner
        # brackets, and their regular-width forms.
        halfwidth_cjk_punct = '\uFF61\uFF64\uFF65\uFF62\uFF63'
        regular_cjk_punct = '\u3002\u3001\u30FB\u300C\u300D'

        s = s.translate(
            maketrans(ascii_punct + halfwidth_cjk_punct,
                      wide_punct + regular_cjk_punct))

        s = remove_extra_spaces(s)
        # Narrow the punctuation again; FULLWIDTH EQUALS SIGN (U+FF1D) is left
        # out of the class, and U+30FB / U+300C / U+300D are not in it either,
        # so those four stay as they are.
        s = unicode_normalize(wide_punct.replace('\uFF1D', ''), s)
        s = s.replace('\u2019', '\'')
        s = s.replace('\u201D', '"')
        return s

    if __name__ == "__main__":
        # Filter mode: normalize every line read from standard input and
        # write the result to standard output, one line per input line.
        for line in sys.stdin:
            normalized = normalize_neologd(line)
            print(normalized)