Last active
June 20, 2025 23:18
-
-
Save yeiichi/df0cfb26c5632730f79e40bfc4a7f3cd to your computer and use it in GitHub Desktop.
Extract numeric chunks from a string
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import re | |
import unicodedata | |
from typing import List | |
# Constants
# Each Kanji digit paired with the Arabic digit it stands for. Several digits
# have more than one Kanji spelling (e.g. both '一' and '壱' map to '1'), so
# the Arabic side repeats.
_KANJI_DIGIT_PAIRS = (
    ('〇', '0'), ('零', '0'),
    ('一', '1'), ('壱', '1'),
    ('二', '2'), ('弐', '2'),
    ('三', '3'), ('参', '3'),
    ('四', '4'),
    ('五', '5'), ('伍', '5'),
    ('六', '6'),
    ('七', '7'),
    ('八', '8'),
    ('九', '9'),
)
KANJI_DIGITS = ''.join(kanji for kanji, _ in _KANJI_DIGIT_PAIRS)
ARABIC_DIGITS = ''.join(digit for _, digit in _KANJI_DIGIT_PAIRS)
# Translation table for str.translate(): one Kanji character -> one digit.
KANJI_TO_ARABIC_MAPPING = str.maketrans(KANJI_DIGITS, ARABIC_DIGITS)
# Normalization form applied before digits are searched for.
UNICODE_NORMALIZATION_FORM = 'NFKC'
def kanji_to_arabic(input_string: str) -> str:
    """
    Replaces every Kanji digit in the input with its Arabic counterpart.

    The replacement is character-for-character via KANJI_TO_ARABIC_MAPPING;
    characters that are not in the table are left untouched.

    Args:
        input_string (str): Text that may contain Kanji digits.
    Returns:
        str: The same text with each mapped Kanji digit replaced by 0-9.
    """
    converted = input_string.translate(KANJI_TO_ARABIC_MAPPING)
    return converted
def extract_number_chunks(input_string: str) -> List[str]:
    """
    Collects every run of consecutive digits from the normalized input.

    The string is first normalized with UNICODE_NORMALIZATION_FORM, then
    scanned left to right for maximal digit runs.

    Args:
        input_string (str): The text to scan.
    Returns:
        List[str]: The digit runs as strings, in the order they occur.
    """
    text = unicodedata.normalize(UNICODE_NORMALIZATION_FORM, input_string)
    # The pattern has no capture groups, so each match's full text is the chunk.
    return [match.group() for match in re.finditer(r'\d+', text)]
def main(input_string: str) -> List[str]:
    """
    Extracts numeric chunks from a string, treating Kanji digits as digits.

    The input is normalized *before* the Kanji translation. Compatibility
    forms of Kanji digits (for example KANGXI RADICAL ONE, U+2F00, or
    CIRCLED IDEOGRAPH ONE, U+3280) only become plain Kanji once normalized;
    translating first would leave them unconverted and they would be lost.

    Args:
        input_string (str): The input string.
    Returns:
        List[str]: The digit runs found, in order of appearance.
    """
    normalized_string = unicodedata.normalize(UNICODE_NORMALIZATION_FORM, input_string)
    return extract_number_chunks(kanji_to_arabic(normalized_string))
if __name__ == '__main__':
    # Self-check run only when the file is executed as a script: a mixed
    # sample containing digit runs and Kanji digits must yield these chunks.
    sample = '001-1223鶴の壱五四八番_455/67::89'
    wanted = ['001', '1223', '1548', '455', '67', '89']

    print("Test String:", sample)
    print("Expected result:", wanted)

    actual = main(sample)
    assert actual == wanted, "Assertion failed!"
    print("Assertion passed!")  # Only reached when the check above held
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment