Created
October 14, 2023 06:22
-
-
Save ChenyangGao/15452e9b93a9aea6c5951e537558d5a4 to your computer and use it in GitHub Desktop.
Unihan (UNICODE HAN DATABASE) Character Information Query Tool
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# coding: utf-8

"""Unihan (UNICODE HAN DATABASE) Character Information Query Tool

Latest version:
    - https://www.unicode.org/reports/tr38/
Unihan source:
    - https://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip
    - https://www.unicode.org/Public/UCD/latest/ucd/
Other recommended projects:
    - https://pypi.org/project/cihai/
    - https://libunihan.sourceforge.net
"""
# Module metadata. ``__all__`` limits ``from <module> import *`` to the
# database object and the two public functions.
__author__ = "ChenyangGao <https://chenyanggao.github.io/>"
__version__ = (0, 0, 1)
__all__ = ["UNIHANDB", "info", "updatedb"]
import pickle as _pickle
from pathlib import Path as _Path
from tempfile import TemporaryDirectory as _TemporaryDirectory
from urllib.request import urlopen as _urlopen, urlretrieve as _urlretrieve
from zipfile import ZipFile as _ZipFile
# Location of the pickled database: a file named "unihan.pkl" next to this
# module. ``__file__`` is undefined in some execution contexts (e.g. code
# pasted into an interactive session); in that case fall back to a path
# relative to the current working directory.
try:
    UNIHANDB_FILE = str(_Path(__file__).with_name("unihan.pkl"))
except NameError:
    UNIHANDB_FILE = "unihan.pkl"
def _get_last_modified_time():
    """Return the "last modified" stamp shown for Unihan.zip in the UCD listing.

    Downloads the HTML directory listing, locates the table cell that
    immediately follows the ``Unihan.zip`` link and returns the first 16
    bytes of that cell decoded as ASCII.

    NOTE(review): the 16-character slice presumably matches a
    ``YYYY-MM-DD HH:MM`` stamp; this depends on the server's listing
    format -- confirm if the page layout changes.

    :return: The 16-character stamp as ``str``.
    :raise ValueError: If the expected marker is not present in the listing.
    """
    url = "https://www.unicode.org/Public/UCD/latest/ucd/"
    # Close the HTTP response deterministically instead of leaking it.
    with _urlopen(url) as response:
        content = response.read()
    find_text = b'>Unihan.zip</a></td><td align="right">'
    pos = content.find(find_text)
    if pos < 0:
        # Same exception type ``bytes.index`` would raise, with a usable message.
        raise ValueError(f"cannot find the Unihan.zip entry in the listing at {url}")
    idx = pos + len(find_text)
    return content[idx:idx+16].decode("ascii")
def _reporthook(blocks, block_size, total_size): | |
downloaded_size = blocks * block_size | |
if downloaded_size >= total_size: | |
print("\r\x1b[K", end="") | |
else: | |
pct = downloaded_size * 100 / total_size | |
print(f'\rdownloading | {downloaded_size} of {total_size} | {pct:.6f}%', end="") | |
def updatedb():
    """Update the local Unihan database.

    Data Source: https://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip

    Fetches the remote "last modified" stamp and compares it with the stamp
    stored under the ``""`` key of the in-memory ``UNIHANDB``. If the local
    copy is at least as new, a message is printed and nothing else happens.
    Otherwise Unihan.zip is downloaded to a temporary directory, every
    member file is parsed, the global ``UNIHANDB`` is replaced and the new
    database is pickled to ``UNIHANDB_FILE``.

    Resulting layout: ``{"": <stamp>, <codepoint int>: {"char": ...,
    "codepoint": ..., "ucn": ..., <field>: <value>, ...}, ...}``.
    """
    global UNIHANDB
    last_modified_time = _get_last_modified_time()
    try:
        if UNIHANDB[""] >= last_modified_time:
            print("^_^ already the latest version")
            return
    except (NameError, KeyError):
        # NameError: UNIHANDB is not defined yet (no local cache was loaded).
        # KeyError: the loaded cache carries no stamp, so treat it as stale.
        pass
    url = "https://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip"
    unihandb = {"": last_modified_time}
    with _TemporaryDirectory() as tmpdir:
        path = _Path(tmpdir) / "Unihan.zip"
        _urlretrieve(url, path, reporthook=_reporthook)
        with _ZipFile(path) as zf:
            for filename in zf.namelist():
                with zf.open(filename) as f:
                    for raw in f:
                        # Data lines start with "U" (e.g. "U+4E00 ...");
                        # everything else (comments, blanks) is skipped.
                        if not raw.startswith(b"U"):
                            continue
                        # Strip only the line terminator, so CRLF input and
                        # an unterminated last line are both handled.
                        line = raw.decode("utf-8").rstrip("\r\n")
                        ucn, field, value = line.split(maxsplit=2)
                        codepoint = int(ucn[2:], 16)
                        entry = unihandb.get(codepoint)
                        if entry is None:
                            # First field seen for this code point: seed the
                            # record with its identifying keys.
                            entry = unihandb[codepoint] = {
                                "char": chr(codepoint),
                                "codepoint": codepoint,
                                "ucn": ucn,
                            }
                        entry[field] = value
    UNIHANDB = unihandb
    # Close (and thereby flush) the cache file deterministically.
    with open(UNIHANDB_FILE, "wb") as cache:
        _pickle.dump(unihandb, cache)
def info(char: int | str) -> dict: | |
"""Query the information of a Unihan (UNICODE HAN DATABASE) character. | |
:Reference: | |
- https://www.unicode.org/reports/tr38/ | |
- https://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip | |
:param char: A unicode code point, URN (Uniform Resource Name) or single character of Unihan. | |
:return: Information related to the Unihan character. | |
""" | |
if isinstance(char, str): | |
if char.startswith("U+"): | |
codepoint = int(char[2:], 16) | |
else: | |
codepoint = ord(char) | |
else: | |
codepoint = char | |
try: | |
return dict(UNIHANDB[codepoint]) | |
except KeyError as e: | |
raise ValueError(f"not a unihan: {char!r}") from e | |
# The local Unihan database.
# Load the pickled cache written by updatedb(); if it is missing, truncated
# or otherwise unreadable as a pickle, rebuild it from the upstream data.
# SECURITY NOTE: unpickling can execute arbitrary code, so UNIHANDB_FILE must
# only ever be a file produced by updatedb() on this machine.
try:
    with open(UNIHANDB_FILE, "rb") as _cache:
        UNIHANDB = _pickle.load(_cache)
except (FileNotFoundError, EOFError, _pickle.UnpicklingError):
    updatedb()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment