Skip to content

Instantly share code, notes, and snippets.

@yeiichi
Last active June 20, 2025 23:18
Show Gist options
  • Save yeiichi/df0cfb26c5632730f79e40bfc4a7f3cd to your computer and use it in GitHub Desktop.
Save yeiichi/df0cfb26c5632730f79e40bfc4a7f3cd to your computer and use it in GitHub Desktop.
Extract numeric chunks from a string
#!/usr/bin/env python3
import re
import unicodedata
from typing import List
# Constants
# Kanji digit characters, grouped by the Arabic digit each group maps to.
KANJI_DIGITS = (
    '〇零'  # 0
    '一壱'  # 1
    '二弐'  # 2
    '三参'  # 3
    '四'  # 4
    '五伍'  # 5
    '六七八九'  # 6, 7, 8, 9
)
# Replacement digits, position-aligned with KANJI_DIGITS (one entry per
# kanji character, so a digit repeats when several kanji map to it).
ARABIC_DIGITS = '00' '11' '22' '33' '4' '55' '6789'
# Translation table for str.translate(): kanji digit -> Arabic digit.
KANJI_TO_ARABIC_MAPPING = str.maketrans(KANJI_DIGITS, ARABIC_DIGITS)
# Unicode normalization form applied before digit runs are extracted.
UNICODE_NORMALIZATION_FORM = 'NFKC'
def kanji_to_arabic(input_string: str) -> str:
    """
    Converts Kanji numerals in the input string to Arabic numerals.

    The conversion is character-by-character: every character found in
    KANJI_DIGITS is replaced by the digit at the same position in
    ARABIC_DIGITS (so '壱五四八' becomes '1548'). Characters that are not
    in the table, including place-value kanji such as '十', pass through
    unchanged.

    Args:
        input_string (str): The string containing Kanji numerals.

    Returns:
        str: The string with Kanji numerals converted to Arabic numerals.
    """
    return input_string.translate(KANJI_TO_ARABIC_MAPPING)
def extract_number_chunks(input_string: str) -> List[str]:
    """
    Extracts numeric chunks from a string after normalizing it.

    The input is normalized with UNICODE_NORMALIZATION_FORM first, so
    compatibility forms such as full-width digits are folded to their
    plain equivalents before matching. Each maximal run of digits is then
    returned as-is, which keeps leading zeros (e.g. '001').

    Args:
        input_string (str): The input string.

    Returns:
        List[str]: The digit runs, in order of appearance. Empty when the
        string contains no digits.
    """
    normalized_string = unicodedata.normalize(UNICODE_NORMALIZATION_FORM, input_string)
    # NOTE: on str patterns '\d' matches any Unicode decimal digit, not
    # only ASCII 0-9.
    return re.findall(r'\d+', normalized_string)
def main(input_string: str) -> List[str]:
    """
    Extracts numeric chunks from a string, treating Kanji digits as digits.

    Args:
        input_string (str): The raw input string.

    Returns:
        List[str]: The digit runs found after Kanji digits have been
        converted to Arabic digits, in order of appearance.
    """
    # Normalize BEFORE translating: compatibility characters (for example
    # the circled ideograph '㊀') only become the plain kanji digits that
    # kanji_to_arabic() recognizes once they are normalized, so translating
    # first would silently skip them. extract_number_chunks() normalizes
    # again, which is harmless on already-normalized text.
    normalized_string = unicodedata.normalize(UNICODE_NORMALIZATION_FORM, input_string)
    return extract_number_chunks(kanji_to_arabic(normalized_string))
if __name__ == '__main__':
    # Self-check run only when the file is executed as a script: the sample
    # mixes ASCII digits, kanji digits and assorted separators.
    test_string = '001-1223鶴の壱五四八番_455/67::89'
    expected_result = ['001', '1223', '1548', '455', '67', '89']
    print("Test String:", test_string)
    print("Expected result:", expected_result)
    result = main(test_string)
    assert result == expected_result, "Assertion failed!"
    print("Assertion passed!")  # Optional: Feedback that assertion was successful
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment