(munii) lexers!
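Three takes on the same small lexer for the munii language, in order: OCaml, Python, and Rust. Each one tracks a start/current pair of cursors into the source, scans one token per step, and returns either a token list or a positioned error.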
(* munii lexer: OCaml *)

module Char = struct
  include Char

  let is_alpha : t -> bool = function
    | 'A' .. 'Z' | 'a' .. 'z' | '_' -> true
    | _ -> false
  ;;

  let is_digit : t -> bool = function
    | '0' .. '9' -> true
    | _ -> false
  ;;

  let is_alphanumeric (char : t) : bool = is_alpha char || is_digit char
end

module Iter = struct
  (* A tiny trampoline: [loop f a] applies [f] until it returns [Base]. *)
  type ('a, 'b) t =
    | Base of 'a
    | Continue of 'b

  let return (value : 'a) : ('a, 'b) t = Base value
  let continue (value : 'b) : ('a, 'b) t = Continue value

  let rec loop (f : 'a -> ('b, 'a) t) (a : 'a) : 'b =
    match f a with
    | Base b -> b
    | Continue a' -> loop f a'
  ;;
end

module List = struct
  include List

  let contains : 'a t -> 'a -> bool = Fun.flip mem
end

module Option = struct
  include Option

  (* [option or default] unwraps [option], falling back to [default]. *)
  let ( or ) (option : 'a t) (default : 'a) : 'a = value ~default option
end

let ( or ) = Option.( or )
module TokenKind = struct
  type t =
    | IDENTIFIER
    | INTEGER
    | STRING
    | UNIT
    | DYNAMIC
    | FUN
    | LET
    | BRACKET_LEFT
    | BRACKET_RIGHT
    | PAREN_LEFT
    | PAREN_RIGHT
    | DOT
    | COMMA
    | SEMICOLON
    | EQUAL
    | COLON_EQUAL
    | COLON
    | BANG_EQUAL
    | GREATER
    | GREATER_EQUAL
    | LESS
    | LESS_EQUAL
    | PLUS
    | MINUS
    | ASTERISK
    | SLASH
    | PERCENT
    | VERTICAL_BAR
    | MINUS_GREATER
    | EOF

  let get_keyword : string -> t option = function
    | "dynamic" -> Some DYNAMIC
    | "fun" -> Some FUN
    | "let" -> Some LET
    | _ -> None
  ;;

  let render : t -> string = function
    | ASTERISK -> "ASTERISK"
    | BANG_EQUAL -> "BANG_EQUAL"
    | BRACKET_LEFT -> "BRACKET_LEFT"
    | BRACKET_RIGHT -> "BRACKET_RIGHT"
    | COLON -> "COLON"
    | COLON_EQUAL -> "COLON_EQUAL"
    | COMMA -> "COMMA"
    | DOT -> "DOT"
    | DYNAMIC -> "DYNAMIC"
    | EOF -> "EOF"
    | EQUAL -> "EQUAL"
    | FUN -> "FUN"
    | GREATER -> "GREATER"
    | GREATER_EQUAL -> "GREATER_EQUAL"
    | IDENTIFIER -> "IDENTIFIER"
    | INTEGER -> "INTEGER"
    | LESS -> "LESS"
    | LESS_EQUAL -> "LESS_EQUAL"
    | LET -> "LET"
    | MINUS -> "MINUS"
    | MINUS_GREATER -> "MINUS_GREATER"
    | PAREN_LEFT -> "PAREN_LEFT"
    | PAREN_RIGHT -> "PAREN_RIGHT"
    | PERCENT -> "PERCENT"
    | PLUS -> "PLUS"
    | SEMICOLON -> "SEMICOLON"
    | SLASH -> "SLASH"
    | STRING -> "STRING"
    | UNIT -> "UNIT"
    | VERTICAL_BAR -> "VERTICAL_BAR"
  ;;
end
module Token = struct
  type t =
    { kind : TokenKind.t
    ; lexeme : string
    ; position : int
    }

  let render : t -> string = function
    | { kind; lexeme; position } ->
      Printf.sprintf "%s '%s' %d" (TokenKind.render kind) (String.escaped lexeme) position
  ;;
end

module Err = struct
  (* [kind] is an extensible variant so each phase can add its own errors. *)
  type kind = ..

  type t =
    { kind : kind
    ; context : string
    ; position : int
    }
end
module Lexer = struct
  type Err.kind +=
    | InvalidCharacterLiteral of
        { lexeme : string
        ; reason : string
        }
    | UnrecognizedToken of
        { lexeme : string
        ; potential_matches : string list
        }
    | UnterminatedStringLiteral

  type t =
    { source : string
    ; start : int ref
    ; current : int ref
    ; tokens : Token.t list ref
    }

  let create (source : string) : t =
    { source; start = ref 0; current = ref 0; tokens = ref [] }
  ;;

  let is_at_end : t -> bool = function
    | { current; source; _ } -> !current >= String.length source
  ;;

  let synchronize_start : t -> unit = function
    | { start; current; _ } -> start := !current
  ;;

  let get_current_lexeme : t -> string = function
    | { source; start; current; _ } -> String.sub source !start (!current - !start)
  ;;

  let build_token (lexer : t) (kind : TokenKind.t) : Token.t =
    { kind; lexeme = get_current_lexeme lexer; position = !(lexer.start) }
  ;;

  (* [peek] yields the next character as a one-character string, or a NUL
     sentinel at end of input. *)
  let peek (lexer : t) : string =
    match is_at_end lexer with
    | true -> "\x00"
    | false -> String.sub lexer.source !(lexer.current) 1
  ;;

  let advance : t -> unit = function
    | { current; _ } -> current := !current + 1
  ;;

  let consume (lexer : t) : string =
    let char = peek lexer in
    advance lexer;
    char
  ;;

  (* Consume the next character iff it satisfies [predicate]. *)
  let satisfies_predicate (lexer : t) (predicate : string -> bool) : bool =
    match predicate (peek lexer) with
    | false -> false
    | true ->
      advance lexer;
      true
  ;;

  let matches (lexer : t) (char : string) : bool = satisfies_predicate lexer (( = ) char)

  let matches_one_of (lexer : t) (chars : string list) : bool =
    satisfies_predicate lexer (List.contains chars)
  ;;

  let not_matches (lexer : t) (char : string) : bool =
    satisfies_predicate lexer (( <> ) char)
  ;;

  (* Consume the next character iff it is not one of [chars]. *)
  let matches_none_of (lexer : t) (chars : string list) : bool =
    satisfies_predicate lexer (fun char -> not (List.contains chars char))
  ;;

  let scan_string (lexer : t) : (TokenKind.t, Err.t) result =
    while matches_none_of lexer [ "\""; "\n"; "\x00" ] do
      ()
    done;
    match matches lexer "\"" with
    | false ->
      Error
        { kind = UnterminatedStringLiteral
        ; context = lexer.source
        ; position = !(lexer.start)
        }
    | true -> Ok TokenKind.STRING
  ;;

  let scan_integer (lexer : t) : (TokenKind.t, Err.t) result =
    while satisfies_predicate lexer (String.for_all Char.is_digit) do
      ()
    done;
    Ok TokenKind.INTEGER
  ;;

  let scan_keyword_or_identifier (lexer : t) : (TokenKind.t, Err.t) result =
    while satisfies_predicate lexer (String.for_all Char.is_alphanumeric) do
      ()
    done;
    Ok (get_current_lexeme lexer |> TokenKind.get_keyword or TokenKind.IDENTIFIER)
  ;;
  let scan_token (lexer : t) : (Token.t, Err.t) result =
    let open Iter in
    loop
      (fun () : ((TokenKind.t, Err.t) result, unit) Iter.t ->
        match consume lexer with
        | "\x00" -> return (Ok TokenKind.EOF)
        | " " | "\r" | "\t" | "\n" ->
          synchronize_start lexer;
          continue ()
        (* Two-character tokens are matched before their one-character
           prefixes. *)
        | ":" when matches lexer "=" -> return (Ok TokenKind.COLON_EQUAL)
        | "!" when matches lexer "=" -> return (Ok TokenKind.BANG_EQUAL)
        | ">" when matches lexer "=" -> return (Ok TokenKind.GREATER_EQUAL)
        | "<" when matches lexer "=" -> return (Ok TokenKind.LESS_EQUAL)
        | "-" when matches lexer ">" -> return (Ok TokenKind.MINUS_GREATER)
        | "(" when matches lexer ")" -> return (Ok TokenKind.UNIT)
        | "[" -> return (Ok TokenKind.BRACKET_LEFT)
        | "]" -> return (Ok TokenKind.BRACKET_RIGHT)
        | "(" -> return (Ok TokenKind.PAREN_LEFT)
        | ")" -> return (Ok TokenKind.PAREN_RIGHT)
        | "." -> return (Ok TokenKind.DOT)
        | "," -> return (Ok TokenKind.COMMA)
        | ";" -> return (Ok TokenKind.SEMICOLON)
        | ":" -> return (Ok TokenKind.COLON)
        | "=" -> return (Ok TokenKind.EQUAL)
        | ">" -> return (Ok TokenKind.GREATER)
        | "<" -> return (Ok TokenKind.LESS)
        | "+" -> return (Ok TokenKind.PLUS)
        | "-" -> return (Ok TokenKind.MINUS)
        | "*" -> return (Ok TokenKind.ASTERISK)
        | "/" -> return (Ok TokenKind.SLASH)
        | "%" -> return (Ok TokenKind.PERCENT)
        | "|" -> return (Ok TokenKind.VERTICAL_BAR)
        | "\"" -> return (scan_string lexer)
        | char when String.for_all Char.is_digit char -> return (scan_integer lexer)
        | char when String.for_all Char.is_alpha char ->
          return (scan_keyword_or_identifier lexer)
        | char ->
          return
            (Error
               { Err.kind = UnrecognizedToken { lexeme = char; potential_matches = [] }
               ; context = lexer.source
               ; position = !(lexer.start)
               }))
      ()
    |> Result.map (build_token lexer)
  ;;
  let run (lexer : t) : (Token.t list, Err.t) result =
    let aux () =
      Iter.loop
        (fun () : ((unit, Err.t) result, unit) Iter.t ->
          match is_at_end lexer with
          | true -> Iter.return (Ok ())
          | false ->
            (* Reset [start] before every token, not just after whitespace,
               so adjacent tokens do not absorb the previous lexeme. *)
            synchronize_start lexer;
            (match scan_token lexer with
             | Ok token ->
               lexer.tokens := token :: !(lexer.tokens);
               Iter.continue ()
             | Error error -> Iter.return (Error error)))
        ()
    in
    match aux () with
    | Error error -> Error error
    | Ok () ->
      synchronize_start lexer;
      lexer.tokens := build_token lexer TokenKind.EOF :: !(lexer.tokens);
      Ok (List.rev !(lexer.tokens))
  ;;
end
let lex (source : string) : (Token.t list, Err.t) result =
  Lexer.create source |> Lexer.run
;;
let print_error (error : Err.t) : unit =
  Printf.eprintf "Error at position %d in: %s\n" error.Err.position error.Err.context
;;

let source = "let x = 3"

let tokens =
  match lex source with
  | Error error ->
    print_error error;
    exit 1
  | Ok tokens -> tokens
;;

let () = List.iter (fun token -> Printf.printf "%s\n" (Token.render token)) tokens
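Run against its built-in source ("let x = 3"), the OCaml version should print one token per line in Token.render's format, along these lines:

LET 'let' 0
IDENTIFIER 'x' 4
EQUAL '=' 6
INTEGER '3' 8
EOF '' 9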
# ruff: noqa: PGH004
# ruff: noqa
"""
munii
"""

from __future__ import annotations

import abc
import enum
import sys
import typing

import attrs
import result

if typing.TYPE_CHECKING:
    import collections.abc


class TokenKind(enum.Enum):
    IDENTIFIER = enum.auto()
    INTEGER = enum.auto()
    STRING = enum.auto()
    UNIT = enum.auto()
    # *- Keywords -* #
    DYNAMIC = enum.auto()
    FUN = enum.auto()
    LET = enum.auto()
    BRACKET_LEFT = enum.auto()
    BRACKET_RIGHT = enum.auto()
    PAREN_LEFT = enum.auto()
    PAREN_RIGHT = enum.auto()
    DOT = enum.auto()
    COMMA = enum.auto()
    SEMICOLON = enum.auto()
    EQUAL = enum.auto()
    COLON_EQUAL = enum.auto()
    COLON = enum.auto()
    BANG_EQUAL = enum.auto()
    GREATER = enum.auto()
    GREATER_EQUAL = enum.auto()
    LESS = enum.auto()
    LESS_EQUAL = enum.auto()
    PLUS = enum.auto()
    MINUS = enum.auto()
    ASTERISK = enum.auto()
    SLASH = enum.auto()
    PERCENT = enum.auto()
    VERTICAL_BAR = enum.auto()
    MINUS_GREATER = enum.auto()
    EOF = enum.auto()


LEXEME_TO_KEYWORD_MAPPING = {
    "dynamic": TokenKind.DYNAMIC,
    "fun": TokenKind.FUN,
    "let": TokenKind.LET,
}
class Token(typing.NamedTuple):
    kind: TokenKind
    lexeme: str
    position: int


class MuniiErrorVisitor[R_co](typing.Protocol):
    def visit_invalid_character_literal_error(
        self,
        error: InvalidCharacterLiteralError,
    ) -> R_co: ...

    def visit_unrecognized_token_error(
        self,
        error: UnrecognizedTokenError,
    ) -> R_co: ...

    def visit_unterminated_string_literal_error(
        self,
        error: UnterminatedStringLiteralError,
    ) -> R_co: ...


@attrs.frozen
class MuniiErrorBase(abc.ABC):
    context: str
    position: int

    @abc.abstractmethod
    def accept[R](self, visitor: MuniiErrorVisitor[R]) -> R:
        pass


@attrs.frozen
class InvalidCharacterLiteralError(MuniiErrorBase):
    token: Token
    reason: str

    @typing.override
    def accept[R](self, visitor: MuniiErrorVisitor[R]) -> R:
        return visitor.visit_invalid_character_literal_error(self)


@attrs.frozen
class UnrecognizedTokenError(MuniiErrorBase):
    lexeme: str
    potential_matches: list[str] = attrs.field(factory=list)

    @typing.override
    def accept[R](self, visitor: MuniiErrorVisitor[R]) -> R:
        return visitor.visit_unrecognized_token_error(self)


@attrs.frozen
class UnterminatedStringLiteralError(MuniiErrorBase):
    @typing.override
    def accept[R](self, visitor: MuniiErrorVisitor[R]) -> R:
        return visitor.visit_unterminated_string_literal_error(self)


type MuniiError = (
    InvalidCharacterLiteralError
    | UnrecognizedTokenError
    | UnterminatedStringLiteralError
)
@attrs.define
class Lexer:
    # *- Parameters -* #
    source: typing.Final[str]
    # *- State -* #
    start: int = attrs.field(init=False, default=0)
    current: int = attrs.field(init=False, default=0)
    tokens: list[Token] = attrs.field(init=False, factory=list)

    def is_at_end(self) -> bool:
        return self.current >= len(self.source)

    def synchronize_start(self) -> None:
        self.start = self.current

    def get_current_lexeme(self) -> str:
        return self.source[self.start : self.current]

    def build_token(self, kind: TokenKind) -> Token:
        return Token(kind, self.get_current_lexeme(), self.start)

    def peek(self) -> str:
        # NUL acts as the end-of-input sentinel.
        if self.is_at_end():
            return "\0"
        return self.source[self.current]

    def advance(self) -> None:
        self.current += 1

    def consume(self) -> str:
        char = self.peek()
        self.advance()
        return char

    def satisfies_predicate(
        self,
        predicate: collections.abc.Callable[[str], bool],
    ) -> bool:
        # Consume the next character iff it satisfies the predicate.
        if predicate(self.peek()):
            self.advance()
            return True
        return False

    def matches(self, char: str) -> bool:
        return self.satisfies_predicate(lambda current: current == char)

    def matches_one_of(self, *chars: str) -> bool:
        return self.satisfies_predicate(lambda current: current in chars)

    def not_matches(self, char: str) -> bool:
        return self.satisfies_predicate(lambda current: current != char)

    def matches_none_of(self, *chars: str) -> bool:
        return self.satisfies_predicate(lambda current: current not in chars)

    def scan_string(self) -> result.Result[TokenKind, MuniiError]:
        while self.matches_none_of('"', "\n", "\0"):
            continue

        if not self.matches('"'):
            return result.Err(
                UnterminatedStringLiteralError(
                    self.source,
                    self.start,
                )
            )

        return result.Ok(TokenKind.STRING)

    def scan_integer(self) -> result.Result[TokenKind, MuniiError]:
        while self.satisfies_predicate(str.isdecimal):
            continue

        return result.Ok(TokenKind.INTEGER)

    def scan_keyword_or_identifier(self) -> TokenKind:
        # Identifier continuation characters: letters, digits, and "_".
        while self.satisfies_predicate(lambda char: char.isalnum() or char == "_"):
            continue

        lexeme = self.get_current_lexeme()
        return LEXEME_TO_KEYWORD_MAPPING.get(lexeme, TokenKind.IDENTIFIER)
    def scan_token(self) -> result.Result[Token, MuniiError]:
        token_kind: TokenKind | None = None

        while token_kind is None:
            match self.consume():
                case "\0":
                    token_kind = TokenKind.EOF
                case " " | "\r" | "\t" | "\n":
                    self.synchronize_start()
                    continue
                # Two-character tokens are matched before their one-character
                # prefixes.
                case ":" if self.matches("="):
                    token_kind = TokenKind.COLON_EQUAL
                case "!" if self.matches("="):
                    token_kind = TokenKind.BANG_EQUAL
                case ">" if self.matches("="):
                    token_kind = TokenKind.GREATER_EQUAL
                case "<" if self.matches("="):
                    token_kind = TokenKind.LESS_EQUAL
                case "-" if self.matches(">"):
                    token_kind = TokenKind.MINUS_GREATER
                case "(" if self.matches(")"):
                    token_kind = TokenKind.UNIT
                case "[":
                    token_kind = TokenKind.BRACKET_LEFT
                case "]":
                    token_kind = TokenKind.BRACKET_RIGHT
                case "(":
                    token_kind = TokenKind.PAREN_LEFT
                case ")":
                    token_kind = TokenKind.PAREN_RIGHT
                case ".":
                    token_kind = TokenKind.DOT
                case ",":
                    token_kind = TokenKind.COMMA
                case ";":
                    token_kind = TokenKind.SEMICOLON
                case ":":
                    token_kind = TokenKind.COLON
                case "=":
                    token_kind = TokenKind.EQUAL
                case ">":
                    token_kind = TokenKind.GREATER
                case "<":
                    token_kind = TokenKind.LESS
                case "+":
                    token_kind = TokenKind.PLUS
                case "-":
                    token_kind = TokenKind.MINUS
                case "*":
                    token_kind = TokenKind.ASTERISK
                case "/":
                    token_kind = TokenKind.SLASH
                case "%":
                    token_kind = TokenKind.PERCENT
                case "|":
                    token_kind = TokenKind.VERTICAL_BAR
                case '"':
                    token_kind_result = self.scan_string()
                    if isinstance(token_kind_result, result.Err):
                        return token_kind_result
                    token_kind = token_kind_result.unwrap()
                case char if char.isdecimal():
                    token_kind_result = self.scan_integer()
                    if isinstance(token_kind_result, result.Err):
                        return token_kind_result
                    token_kind = token_kind_result.unwrap()
                case char if char.isidentifier():
                    token_kind = self.scan_keyword_or_identifier()
                case _:
                    return result.Err(
                        UnrecognizedTokenError(
                            self.source,
                            self.start,
                            self.get_current_lexeme(),
                        )
                    )

        return result.Ok(self.build_token(token_kind))

    def run(self) -> result.Result[list[Token], MuniiError]:
        while not self.is_at_end():
            self.synchronize_start()
            token_result = self.scan_token()

            if isinstance(token_result, result.Err):
                return token_result

            self.tokens.append(token_result.unwrap())

        self.synchronize_start()
        self.tokens.append(self.build_token(TokenKind.EOF))

        return result.Ok(self.tokens)
def main() -> None:
    """
    Main function.
    """
    source = "let x = 3"

    match Lexer(source).run():
        case result.Ok(tokens):
            for token in tokens:
                print(token)
        case result.Err(error):
            print(f"\x1b[1;31mError:\x1b[22;39m {error}", file=sys.stderr)


if __name__ == "__main__":
    main()
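The Python version targets Python 3.12+ (PEP 695 generics, typing.override, the type alias statement) and needs the third-party attrs and result packages. A minimal sanity-check sketch, assuming the file above is saved under the hypothetical module name munii_lexer.py:

# munii_lexer is a hypothetical module name for the file above.
from munii_lexer import Lexer, TokenKind

tokens = Lexer("()").run().unwrap()
# "()" lexes as a single UNIT token rather than PAREN_LEFT/PAREN_RIGHT,
# because two-character tokens are matched first; run() appends EOF.
assert [t.kind for t in tokens] == [TokenKind.UNIT, TokenKind.EOF]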
// munii lexer: Rust
use std::fmt;

trait IsMuniiIdentifier {
    fn is_identifier(self) -> bool;
}

impl IsMuniiIdentifier for char {
    fn is_identifier(self) -> bool {
        matches!(self, 'A'..='Z' | 'a'..='z' | '_')
    }
}

#[derive(Clone, Copy, Debug)]
enum TokenKind {
    Identifier,
    Integer,
    String,
    Unit,
    Dynamic,
    Fun,
    Let,
    BracketLeft,
    BracketRight,
    ParenLeft,
    ParenRight,
    Dot,
    Comma,
    Semicolon,
    Equal,
    ColonEqual,
    Colon,
    BangEqual,
    Greater,
    GreaterEqual,
    Less,
    LessEqual,
    Plus,
    Minus,
    Asterisk,
    Slash,
    Percent,
    VerticalBar,
    MinusGreater,
    Eof,
}

impl TokenKind {
    fn get_keyword(lexeme: String) -> Option<Self> {
        match lexeme.as_str() {
            "dynamic" => Some(Self::Dynamic),
            "fun" => Some(Self::Fun),
            "let" => Some(Self::Let),
            _ => None,
        }
    }
}

impl fmt::Display for TokenKind {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{:?}", self)
    }
}

#[derive(Clone, Debug)]
struct Token {
    kind: TokenKind,
    lexeme: String,
    position: usize,
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{:?}", self)
    }
}
#[derive(Clone, Debug)]
enum ErrKind {
    UnrecognizedToken {
        token: char,
        potential_matches: Vec<String>,
    },
    UnterminatedStringLiteral,
}

#[derive(Clone, Debug)]
struct Error {
    kind: ErrKind,
    context: String,
    position: usize,
}

struct Lexer {
    source: String,
    start: usize,
    current: usize,
    tokens: Vec<Token>,
}

impl Lexer {
    fn create(source: String) -> Self {
        Self {
            source,
            start: 0,
            current: 0,
            tokens: vec![],
        }
    }

    fn is_at_end(&self) -> bool {
        self.current >= self.source.len()
    }

    fn synchronize_start(&mut self) {
        self.start = self.current
    }

    fn get_current_lexeme(&self) -> String {
        self.source[self.start..self.current].to_string()
    }

    fn build_token(&self, kind: TokenKind) -> Token {
        Token {
            kind,
            lexeme: self.get_current_lexeme(),
            position: self.start,
        }
    }

    /// Next character, or NUL as the end-of-input sentinel. Char and byte
    /// offsets coincide here because the source is assumed to be ASCII.
    fn peek(&self) -> char {
        self.source.chars().nth(self.current).unwrap_or('\x00')
    }

    fn advance(&mut self) {
        self.current += 1
    }

    fn consume(&mut self) -> char {
        let char = self.peek();
        self.advance();
        char
    }

    /// Consume the next character iff it satisfies `predicate`.
    fn satisfies_predicate<F>(&mut self, predicate: F) -> bool
    where
        F: Fn(char) -> bool,
    {
        match predicate(self.peek()) {
            false => false,
            true => {
                self.advance();
                true
            }
        }
    }

    fn matches(&mut self, char: char) -> bool {
        self.satisfies_predicate(|peeked| char == peeked)
    }

    fn matches_one_of(&mut self, chars: &[char]) -> bool {
        self.satisfies_predicate(|peeked| chars.contains(&peeked))
    }

    fn not_matches(&mut self, char: char) -> bool {
        self.satisfies_predicate(|peeked| char != peeked)
    }

    /// Consume the next character iff it is not one of `chars`.
    fn matches_none_of(&mut self, chars: &[char]) -> bool {
        self.satisfies_predicate(|peeked| !chars.contains(&peeked))
    }

    fn scan_string(&mut self) -> Result<TokenKind, Error> {
        while self.matches_none_of(&['"', '\n', '\x00']) {}

        match self.matches('"') {
            false => Err(Error {
                kind: ErrKind::UnterminatedStringLiteral,
                context: self.source.clone(),
                position: self.start,
            }),
            true => Ok(TokenKind::String),
        }
    }

    fn scan_integer(&mut self) -> Result<TokenKind, Error> {
        while self.satisfies_predicate(|char| char.is_ascii_digit()) {}

        Ok(TokenKind::Integer)
    }

    fn scan_keyword_or_identifier(&mut self) -> TokenKind {
        // Identifier continuation also allows digits.
        while self.satisfies_predicate(|char| char.is_identifier() || char.is_ascii_digit()) {}

        TokenKind::get_keyword(self.get_current_lexeme()).unwrap_or(TokenKind::Identifier)
    }
    fn scan_token(&mut self) -> Result<Token, Error> {
        let mut token_kind: Option<TokenKind> = None;

        while token_kind.is_none() {
            match self.consume() {
                '\0' => token_kind = Some(TokenKind::Eof),
                ' ' | '\r' | '\t' | '\n' => {
                    self.synchronize_start();
                    continue;
                }
                // Two-character tokens are matched before their
                // one-character prefixes.
                ':' if self.matches('=') => token_kind = Some(TokenKind::ColonEqual),
                '!' if self.matches('=') => token_kind = Some(TokenKind::BangEqual),
                '>' if self.matches('=') => token_kind = Some(TokenKind::GreaterEqual),
                '<' if self.matches('=') => token_kind = Some(TokenKind::LessEqual),
                '-' if self.matches('>') => token_kind = Some(TokenKind::MinusGreater),
                '(' if self.matches(')') => token_kind = Some(TokenKind::Unit),
                '[' => token_kind = Some(TokenKind::BracketLeft),
                ']' => token_kind = Some(TokenKind::BracketRight),
                '(' => token_kind = Some(TokenKind::ParenLeft),
                ')' => token_kind = Some(TokenKind::ParenRight),
                '.' => token_kind = Some(TokenKind::Dot),
                ',' => token_kind = Some(TokenKind::Comma),
                ';' => token_kind = Some(TokenKind::Semicolon),
                ':' => token_kind = Some(TokenKind::Colon),
                '=' => token_kind = Some(TokenKind::Equal),
                '>' => token_kind = Some(TokenKind::Greater),
                '<' => token_kind = Some(TokenKind::Less),
                '+' => token_kind = Some(TokenKind::Plus),
                '-' => token_kind = Some(TokenKind::Minus),
                '*' => token_kind = Some(TokenKind::Asterisk),
                '/' => token_kind = Some(TokenKind::Slash),
                '%' => token_kind = Some(TokenKind::Percent),
                '|' => token_kind = Some(TokenKind::VerticalBar),
                '"' => token_kind = Some(self.scan_string()?),
                '0'..='9' => token_kind = Some(self.scan_integer()?),
                char if char.is_identifier() => {
                    token_kind = Some(self.scan_keyword_or_identifier())
                }
                char => {
                    return Err(Error {
                        kind: ErrKind::UnrecognizedToken {
                            token: char,
                            potential_matches: vec![],
                        },
                        context: self.source.clone(),
                        position: self.start,
                    })
                }
            }
        }

        Ok(self.build_token(token_kind.unwrap()))
    }
    fn run(&mut self) -> Result<(), Error> {
        while !self.is_at_end() {
            self.synchronize_start();
            let token = self.scan_token()?;
            self.tokens.push(token);
        }

        self.synchronize_start();
        self.tokens.push(self.build_token(TokenKind::Eof));

        Ok(())
    }

    fn tokens(self) -> Vec<Token> {
        self.tokens
    }
}

fn lex(source: String) -> Result<Vec<Token>, Error> {
    let mut lexer = Lexer::create(source);
    lexer.run()?;

    Ok(lexer.tokens())
}

fn print_error(error: Error) {
    eprintln!("\x1b[1;31mError:\x1b[22;39m {:#?}", error);
}

fn main() {
    let source = "let x = 3 $".to_string();

    match lex(source) {
        Ok(tokens) => {
            println!("{:#?}", tokens);
        }
        Err(error) => print_error(error),
    }
}
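Unlike the other two versions, the Rust main feeds the lexer "let x = 3 $"; no rule matches the stray '$', so lex should return an UnrecognizedToken error, which print_error pretty-prints to stderr instead of a token list.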
These snippets contain bugs, but finding them is left as an exercise to the reader.