Skip to content

Instantly share code, notes, and snippets.

@Hanaasagi
Created January 17, 2025 03:49
Show Gist options
  • Save Hanaasagi/2d06c4e83c182c460766d811162836fb to your computer and use it in GitHub Desktop.
Save Hanaasagi/2d06c4e83c182c460766d811162836fb to your computer and use it in GitHub Desktop.
from enum import Enum, auto
"""
START:
SIGN -> SIGN
DIGIT -> INT
"0" -> ZERO
"." -> P_DOT
ERROR -> ERROR
SIGN:
DIGIT -> INT
"0" -> ZERO
"." -> P_DOT
ERROR -> ERROR
ZERO:
"x" | "X" -> HEX
"o" | "O" -> OCT
"b" | "B" -> BIN
"." -> P_DOT
DIGIT -> INT # (Warning: Invalid octal sequence, e.g., 09)
"e" | "E" -> EXP
END -> END
INT:
DIGIT -> INT
"." -> DOT
"e" | "E" -> EXP
END -> END
P_DOT:
DIGIT -> FRAC
ERROR -> ERROR
DOT:
DIGIT -> FRAC
END -> END # end of input only, e.g. "2."
ERROR -> ERROR
FRAC:
DIGIT -> FRAC
"e" | "E" -> EXP
END -> END
EXP:
"+" | "-" -> EXP_SIGN
DIGIT -> EXP_INT
ERROR -> ERROR
EXP_SIGN:
DIGIT -> EXP_INT
ERROR -> ERROR
EXP_INT:
DIGIT -> EXP_INT
END -> END
HEX:
HEX_DIGIT -> HEX
END -> END
OCT:
OCT_DIGIT -> OCT
END -> END
BIN:
"0" | "1" -> BIN
END -> END
END:
ERROR -> ERROR
ERROR:
ERROR -> ERROR
"""
class State(Enum):
    """States of the numeric-literal automaton driven by ``NumberParser``."""

    START = auto()  # nothing consumed yet
    SIGN = auto()  # leading "+" or "-" consumed
    ZERO = auto()  # a single leading "0" consumed
    INT = auto()  # inside the decimal integer digits
    P_DOT = auto()  # "." consumed with no integer digits before it
    DOT = auto()  # "." consumed directly after integer digits
    FRAC = auto()  # inside the fractional digits
    EXP = auto()  # exponent marker "e"/"E" consumed
    EXP_SIGN = auto()  # sign of the exponent consumed
    EXP_INT = auto()  # inside the exponent digits
    HEX = auto()  # at least one hexadecimal digit consumed
    OCT = auto()  # at least one octal digit consumed
    BIN = auto()  # at least one binary digit consumed
    END = auto()  # a character that cannot extend the literal was seen
    ERROR = auto()  # the input is not a valid literal
    # Appended after the original members so their values stay unchanged.
    HEX_PREFIX = auto()  # "0x"/"0X" consumed, no digit yet
    OCT_PREFIX = auto()  # "0o"/"0O" consumed, no digit yet
    BIN_PREFIX = auto()  # "0b"/"0B" consumed, no digit yet


class TokenType(Enum):
    """Classification returned by ``NumberParser.parse``."""

    NUMBER = auto()
    ERROR = auto()


class NumberParser:
    """Recognise numeric literals with a table-driven state machine.

    Accepted tokens, after an optional leading "+" or "-"::

        0x HEX+  |  0o OCT+  |  0b BIN+          (prefix letter in either case)
        DIGITS [EXP]
        DIGITS "."
        [DIGITS] "." DIGITS [EXP]

    where ``EXP`` is ``e`` or ``E``, an optional sign, and ``DIGITS``.
    Leading zeros are allowed ("01"). A single "_" may separate two digits
    of the same digit run ("1_000", "0xA_b"); it may not be doubled, lead,
    trail, or touch a ".", an exponent marker or a radix prefix. Only ASCII
    digits count as digits.

    Where this class and the transition sketch in the module-level string
    disagree (a "." after a lone "0", the radix-prefix states), this class
    is authoritative.
    """

    _DEC_DIGITS = "0123456789"
    _HEX_DIGITS = "0123456789abcdefABCDEF"
    _OCT_DIGITS = "01234567"
    _BIN_DIGITS = "01"

    # state -> ordered (accepted characters, next state) pairs; first match wins.
    _TRANSITIONS = {
        State.START: (
            ("+-", State.SIGN),
            ("0", State.ZERO),
            (_DEC_DIGITS, State.INT),
            (".", State.P_DOT),
        ),
        State.SIGN: (
            ("0", State.ZERO),
            (_DEC_DIGITS, State.INT),
            (".", State.P_DOT),
        ),
        State.ZERO: (
            ("xX", State.HEX_PREFIX),
            ("oO", State.OCT_PREFIX),
            ("bB", State.BIN_PREFIX),
            # "0." is as valid as "2.", so go to the accepting DOT state.
            (".", State.DOT),
            (_DEC_DIGITS, State.INT),  # leading zero, e.g. "01"
            ("eE", State.EXP),
        ),
        State.INT: (
            (_DEC_DIGITS, State.INT),
            (".", State.DOT),
            ("eE", State.EXP),
        ),
        State.P_DOT: ((_DEC_DIGITS, State.FRAC),),
        State.DOT: ((_DEC_DIGITS, State.FRAC),),
        State.FRAC: (
            (_DEC_DIGITS, State.FRAC),
            ("eE", State.EXP),
        ),
        State.EXP: (
            ("+-", State.EXP_SIGN),
            (_DEC_DIGITS, State.EXP_INT),
        ),
        State.EXP_SIGN: ((_DEC_DIGITS, State.EXP_INT),),
        State.EXP_INT: ((_DEC_DIGITS, State.EXP_INT),),
        # A radix prefix must be followed by at least one digit.
        State.HEX_PREFIX: ((_HEX_DIGITS, State.HEX),),
        State.OCT_PREFIX: ((_OCT_DIGITS, State.OCT),),
        State.BIN_PREFIX: ((_BIN_DIGITS, State.BIN),),
        State.HEX: ((_HEX_DIGITS, State.HEX),),
        State.OCT: ((_OCT_DIGITS, State.OCT),),
        State.BIN: ((_BIN_DIGITS, State.BIN),),
    }

    # States entered by consuming a digit. A non-matching character in one of
    # these moves to END; in every other state it moves to ERROR.
    _DIGIT_STATES = frozenset(
        {
            State.ZERO,
            State.INT,
            State.FRAC,
            State.EXP_INT,
            State.HEX,
            State.OCT,
            State.BIN,
        }
    )

    # States in which the input may legitimately stop.
    _ACCEPTING = _DIGIT_STATES | {State.DOT}

    def __init__(self):
        self.state = State.START

    def transition(self, char: str) -> None:
        """Advance ``self.state`` by one input character.

        Args:
            char: The next character of the token. Anything that is not
                exactly one character moves the machine to ``State.ERROR``.
        """
        if len(char) != 1:
            # "" would otherwise satisfy every `in` test below.
            self.state = State.ERROR
            return
        for accepted, target in self._TRANSITIONS.get(self.state, ()):
            if char in accepted:
                self.state = target
                return
        if self.state in self._DIGIT_STATES:
            self.state = State.END
        else:
            self.state = State.ERROR

    def parse(self, token: str) -> TokenType:
        """Classify ``token`` as a numeric literal or an error.

        Args:
            token: The complete candidate literal.

        Returns:
            ``TokenType.NUMBER`` if the whole token is a valid literal,
            otherwise ``TokenType.ERROR``.
        """
        self.state = State.START
        separator_pending = False  # True right after a "_" was skipped
        for char in token:
            if char == "_":
                # A separator must directly follow a digit and not be doubled.
                if separator_pending or self.state not in self._DIGIT_STATES:
                    self.state = State.ERROR
                    return TokenType.ERROR
                separator_pending = True
                continue
            self.transition(char)
            if separator_pending:
                # The character after "_" must continue the digit run.
                if self.state not in self._DIGIT_STATES:
                    self.state = State.ERROR
                    return TokenType.ERROR
                separator_pending = False
            if self.state is State.ERROR:
                return TokenType.ERROR
        if separator_pending:
            # Trailing "_" with no digit after it.
            return TokenType.ERROR
        if self.state in self._ACCEPTING:
            return TokenType.NUMBER
        return TokenType.ERROR
def test_parser():
    """Run the sample tokens through ``NumberParser`` and verify the results.

    Every token and its classification is printed. After all tokens have
    been processed, any token whose classification differs from the
    expected one is reported.

    Raises:
        AssertionError: If at least one token was classified unexpectedly.
    """
    parser = NumberParser()
    good_cases = [
        "0",
        "0.0",
        "-0",
        "+0",
        "123",
        "-123",
        "+123",
        "0.2",
        "2.123456",
        ".2",
        "2.",
        "-2.5",
        "+2.5",
        "1e3",
        "1e+3",
        "1e-3",
        "-1e-3",
        "+1e3",
        "0x3F",
        "0o77",
        "0b1010",
        "-0x3A",
        "+0b101",
        "0e0",
        "-0e0",
        "+0e0",
        "123.456",
        "1e1000",
        "1e-1000",
        "1e+1000",
        "1_000",
        "1_000_000",
        "1_000.000_1",
        "1_000_000e10",
        "0b0",
        "0x0",
        "0o0",
        "-1.",
        "+1.",
        "01",
    ]
    bad_cases = [
        "invalid",
        "1e",
        "0b102",
        "0xG",
        "0o08",
        "0o9",
        "0bb1",
        "0.2.",
        "0.2.2",
        "-",
        "+",
        ".",
        "1e+",
        "1e-",
        "e3",
        "1.2.3",
        "1e+1.5",
        "+-123",
        "-+123",
        "0b123",
        "0._2",
        "_3",
        "FF",
        "1e3e2",
    ]
    suites = (
        ("GOOD", good_cases, TokenType.NUMBER),
        ("BAD", bad_cases, TokenType.ERROR),
    )
    mismatches = []
    for label, cases, expected in suites:
        print("-" * 20, label, "-" * 20)
        for case in cases:
            result = parser.parse(case)
            print(f"Token: {case:16} -> Type: {result.name}")
            if result is not expected:
                mismatches.append(
                    f"{case!r}: expected {expected.name}, got {result.name}"
                )
    # Fail only after printing everything so the full report stays visible.
    if mismatches:
        raise AssertionError("unexpected results: " + "; ".join(mismatches))


if __name__ == "__main__":
    test_parser()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment