Last active
December 13, 2024 18:26
-
-
Save shreyassanthu77/1a8dc996df6a7742e0277dd7b8668b3f to your computer and use it in GitHub Desktop.
A lexer in Zig that does stuff... ya know, like the stuff lexers do.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const std = @import("std"); | |
const log = std.log.scoped(.lexer); | |
const Allocator = std.mem.Allocator; | |
const ArenaAllocator = std.heap.ArenaAllocator; | |
const utils = @import("utils.zig"); | |
pub const tok = @import("token.zig"); | |
pub const Token = tok.Token; | |
/// Arena that holds the lexer itself (see `init`) as well as the string
/// buffers and error messages produced while lexing.
arena: ArenaAllocator, | |
/// Source text being tokenized.
src: []const u8, | |
/// Byte offset into `src` of the next unread character.
i: usize = 0, | |
/// Current line; starts at 1 and is incremented by `consume` on '\n'.
line: u32 = 1, | |
/// Current column; starts at 1, reset to 1 after a newline and otherwise
/// incremented once per consumed byte.
column: u32 = 1, | |
/// Location where the token currently being lexed begins. Left undefined
/// until `next` assigns it, which it does before producing any token.
start: tok.Location = undefined, | |
/// This file is itself the lexer struct.
const Self = @This(); | |
/// Creates a lexer over `src`.
///
/// The lexer is allocated from its own arena (backed by `allocator`) and a
/// copy of that arena is stored inside it, so `deinit` can release everything
/// in one call. Panics with "OOM" if the allocation fails.
pub fn init(allocator: Allocator, src: []const u8) *Self {
    var lexer_arena = ArenaAllocator.init(allocator);
    const self = lexer_arena.allocator().create(Self) catch @panic("OOM");
    // The arena is copied only after allocating from it, so the stored copy
    // already tracks the buffer that holds `self`.
    self.* = Self{
        .src = src,
        .arena = lexer_arena,
    };
    return self;
}
/// Frees everything allocated through the lexer's arena.
///
/// `init` allocates the lexer itself from this arena, so `self` must not be
/// used after this call.
pub inline fn deinit(self: *Self) void {
    // Take the arena by value first: the memory behind `self` is about to go.
    const owned_arena = self.arena;
    owned_arena.deinit();
}
/// Returns the next token from the source.
///
/// Leading ASCII whitespace is skipped and `self.start` is set to the location
/// of the first remaining character, so every token (error tokens included)
/// spans from there to wherever lexing stopped. Yields `.eof` when no input is
/// left and an error token for a character that no rule accepts.
pub fn next(self: *Self) Token {
    self.consumeWhile(std.ascii.isWhitespace);
    self.start = self.loc();

    const first = self.peek() orelse return self.make(.eof);
    switch (first) {
        'a'...'z', 'A'...'Z', '_' => return self.nextIdent(),
        '0'...'9' => return self.nextNumber(),
        // nextString can only fail on allocation; report that as an error token.
        '"', '\'', '`' => return self.nextString(first) catch
            Token.init(.{ .err = "Out of memory" }, self.span(self.start)),

        // Single-character tokens.
        ';' => return self.makeAndConsume(.semicolon),
        '.' => return self.makeAndConsume(.dot),
        ',' => return self.makeAndConsume(.comma),
        '(' => return self.makeAndConsume(.lparen),
        ')' => return self.makeAndConsume(.rparen),
        '{' => return self.makeAndConsume(.lsquirly),
        '}' => return self.makeAndConsume(.rsquirly),
        '[' => return self.makeAndConsume(.lbracket),
        ']' => return self.makeAndConsume(.rbracket),
        '~' => return self.makeAndConsume(.tilde),
        '@' => return self.makeAndConsume(.at),
        ':' => return self.makeAndConsume(.colon),

        // Operators with an optional second character: consume the first,
        // then try each possible continuation in order.
        '=' => {
            _ = self.consume();
            if (self.optional(utils.char('='))) return self.make(.eq_eq);
            return self.make(.eq);
        },
        '!' => {
            _ = self.consume();
            if (self.optional(utils.char('='))) return self.make(.bang_eq);
            return self.make(.bang);
        },
        '<' => {
            _ = self.consume();
            if (self.optional(utils.char('='))) return self.make(.lt_eq);
            return self.make(.lt);
        },
        '>' => {
            _ = self.consume();
            if (self.optional(utils.char('='))) return self.make(.gt_eq);
            return self.make(.gt);
        },
        '&' => {
            _ = self.consume();
            if (self.optional(utils.char('&'))) return self.make(.ampersand_ampersand);
            if (self.optional(utils.char('='))) return self.make(.ampersand_eq);
            return self.make(.ampersand);
        },
        '|' => {
            _ = self.consume();
            if (self.optional(utils.char('|'))) return self.make(.pipe_pipe);
            if (self.optional(utils.char('='))) return self.make(.pipe_eq);
            if (self.optional(utils.char('>'))) return self.make(.pipe_gt);
            return self.make(.pipe);
        },
        '+' => {
            _ = self.consume();
            if (self.optional(utils.char('='))) return self.make(.plus_eq);
            return self.make(.plus);
        },
        '-' => {
            _ = self.consume();
            if (self.optional(utils.char('='))) return self.make(.minus_eq);
            return self.make(.minus);
        },
        '*' => {
            _ = self.consume();
            if (self.optional(utils.char('='))) return self.make(.star_eq);
            if (self.optional(utils.char('*'))) return self.make(.star_star);
            return self.make(.star);
        },
        '/' => {
            _ = self.consume();
            if (self.optional(utils.char('='))) return self.make(.slash_eq);
            if (self.optional(utils.char('/'))) return self.nextSingleLineComment();
            if (self.optional(utils.char('*'))) return self.nextMultiLineComment();
            return self.make(.slash);
        },
        '%' => {
            _ = self.consume();
            if (self.optional(utils.char('='))) return self.make(.percent_eq);
            return self.make(.percent);
        },
        '^' => {
            _ = self.consume();
            if (self.optional(utils.char('='))) return self.make(.caret_eq);
            return self.make(.caret);
        },
        '?' => {
            _ = self.consume();
            if (self.optional(utils.char('?'))) return self.make(.question_question);
            return self.make(.question);
        },

        else => {
            // Consume the offending byte so the next call makes progress.
            _ = self.consume();
            return self.err("unexpected character: '{c}'", .{first});
        },
    }
}
/// Lexes an identifier: `[a-zA-Z_][a-zA-Z0-9_]*`.
///
/// The first character is consumed unconditionally; `next` only dispatches
/// here when it can start an identifier. The token's text is a slice of `src`.
inline fn nextIdent(self: *Self) Token {
    const begin = self.i;
    _ = self.consume(); // leading character, already validated by the caller
    self.consumeWhile(utils.isIdentChar);
    return self.make(.{ .ident = self.src[begin..self.i] });
}
/// Lexes a numeric literal.
///
/// number: int + fraction? + exp?
///       | bin
///       | oct
///       | hex
///
/// int: [0-9] | [1-9][0-9]+
/// fraction: \.[0-9]+
/// exp: [eE] [-+]? [0-9]+
///
/// bin: 0b[01]+
/// oct: 0o[0-7]+
/// hex: 0x[0-9a-fA-F]+(.[0-9a-fA-F]+)?([pP][-+]?[0-9]+)?
///
/// Returns an `.int` token when the parsed value is integral and fits in
/// `i128`, a `.float` token otherwise, and an error token for malformed
/// literals (leading zero, exponent without digits, unparsable text).
inline fn nextNumber(self: *Self) Token {
    const number_start = self.i;
    const first = self.consume();
    if (first == '0') {
        if (self.peek()) |p| {
            switch (p) {
                'b' => return self.nextBinary(),
                'o' => return self.nextOctal(),
                'x' => return self.nextHex(),
                '0'...'9' => return self.err("leading zero in number", .{}),
                else => {},
            }
        }
    }
    self.consumeWhile(std.ascii.isDigit);

    // A '.' only starts a fraction when a digit follows it (fraction is
    // `\.[0-9]+`); otherwise it is left in place for the next token.
    const has_fraction = self.i + 1 < self.src.len and
        self.src[self.i] == '.' and
        std.ascii.isDigit(self.src[self.i + 1]);
    if (has_fraction) {
        _ = self.consume(); // consume '.'
        self.consumeWhile(std.ascii.isDigit);
    }

    if (self.optional(utils.exponent)) {
        _ = self.optional(utils.sign);
        const p = self.peek();
        if (p == null or !std.ascii.isDigit(p.?)) {
            return self.err("expected digit after exponent", .{});
        }
        self.consumeWhile(std.ascii.isDigit);
    }

    const number = self.src[number_start..self.i];
    const f: f128 = std.fmt.parseFloat(f128, number) catch {
        return self.err("failed to parse number", .{});
    };

    // Integral values become i128 tokens, but only when they fit:
    // @intFromFloat is illegal behavior for values outside the destination
    // range (e.g. `1e100`) and for infinity. No sign is lexed here, so only
    // the upper bound 2^127 needs checking.
    const int_limit: f128 = 0x1p127;
    if (f == @trunc(f) and f < int_limit) {
        const int: i128 = @intFromFloat(f);
        return self.make(.{ .int = int });
    }
    return self.make(.{ .float = f });
}
/// Lexes the digits of a binary literal (`0b[01]+`).
///
/// Called with the leading '0' already consumed and the cursor on 'b'.
/// Produces an `.int` token, or an error token if the digits do not parse
/// as an `i128` (which includes the case of no digits at all).
inline fn nextBinary(self: *Self) Token {
    _ = self.consume(); // skip the 'b' marker
    const digits_begin = self.i;
    self.consumeWhile(utils.isBinaryDigit);
    const digits = self.src[digits_begin..self.i];
    if (std.fmt.parseInt(i128, digits, 2)) |value| {
        return self.make(.{ .int = value });
    } else |_| {
        return self.err("failed to parse binary number {s}", .{digits});
    }
}
/// Lexes the digits of an octal literal (`0o[0-7]+`).
///
/// Called with the leading '0' already consumed and the cursor on 'o'.
/// Produces an `.int` token, or an error token if the digits do not parse
/// as an `i128` (which includes the case of no digits at all).
inline fn nextOctal(self: *Self) Token {
    _ = self.consume(); // skip the 'o' marker
    const digits_begin = self.i;
    self.consumeWhile(utils.isOctalDigit);
    const digits = self.src[digits_begin..self.i];
    if (std.fmt.parseInt(i128, digits, 8)) |value| {
        return self.make(.{ .int = value });
    } else |_| {
        return self.err("failed to parse octal number {s}", .{digits});
    }
}
/// Lexes a hexadecimal literal:
/// `0x[0-9a-fA-F]+(.[0-9a-fA-F]+)?([pP][-+]?[0-9]+)?`.
///
/// Called with the leading '0' already consumed and the cursor on 'x'. The
/// whole literal, including the `0x` prefix, is handed to
/// `std.fmt.parseFloat`. Returns an `.int` token when the value is integral
/// and fits in `i128`, a `.float` token otherwise, and an error token when
/// the exponent has no digits or the text does not parse.
inline fn nextHex(self: *Self) Token {
    const number_start = self.i - 1; // include '0'
    _ = self.consume(); // consume 'x'
    self.consumeWhile(utils.isHexDigit);

    // A '.' only starts a fraction when a hex digit follows it; otherwise it
    // is left in place for the next token.
    const has_fraction = self.i + 1 < self.src.len and
        self.src[self.i] == '.' and
        utils.isHexDigit(self.src[self.i + 1]);
    if (has_fraction) {
        _ = self.consume(); // consume '.'
        self.consumeWhile(utils.isHexDigit);
    }

    if (self.optional(utils.hexPower)) {
        _ = self.optional(utils.sign);
        // The binary exponent of a hex float is written in decimal digits.
        const p = self.peek();
        if (p == null or !std.ascii.isDigit(p.?)) {
            return self.err("expected digit after hex power", .{});
        }
        self.consumeWhile(std.ascii.isDigit);
    }

    const number = self.src[number_start..self.i];
    const f: f128 = std.fmt.parseFloat(f128, number) catch {
        return self.err("failed to parse hex number {s}", .{number});
    };

    // Integral values become i128 tokens, but only when they fit:
    // @intFromFloat is illegal behavior for values outside the destination
    // range (e.g. `0x1p200`) and for infinity. No sign is lexed here, so only
    // the upper bound 2^127 needs checking.
    const int_limit: f128 = 0x1p127;
    if (f == @trunc(f) and f < int_limit) {
        const int: i128 = @intFromFloat(f);
        return self.make(.{ .int = int });
    }
    return self.make(.{ .float = f });
}
/// Lexes a string literal delimited by `quote` ('"', '\'' or '`').
///
/// The decoded contents are built in a buffer allocated from the lexer's
/// arena. Supported escapes: `\n`, `\r`, `\t`, `\\`, `\u` followed by hex
/// digits (every hex digit that follows is taken as part of the code point,
/// which is appended UTF-8 encoded), and a backslash before the string's own
/// quote character. A raw newline is only accepted inside backtick strings.
///
/// Returns a `.string` token, or an error token for a bad escape, a raw
/// newline in a non-backtick string, or input that ends before the closing
/// quote. Fails with an error only when the buffer cannot be allocated.
inline fn nextString(self: *Self, quote: u8) !Token {
    _ = self.consume(); // consume opening quote
    const alloc = self.arena.allocator();
    var buf = std.ArrayList(u8).init(alloc);
    while (self.peek()) |ch| {
        switch (ch) {
            '\n' => {
                if (quote == '`') {
                    try buf.append(self.consume());
                } else {
                    return self.err("unexpected newline in string, use `...` for multiline strings", .{});
                }
            },
            '\\' => {
                _ = self.consume(); // consume backslash
                const escape = self.peek() orelse {
                    return self.err("unexpected end of string", .{});
                };
                switch (escape) {
                    'n', 'r', 't', '\\' => {
                        _ = self.consume(); // consume the escape letter
                        try buf.append(switch (escape) {
                            'n' => '\n',
                            'r' => '\r',
                            't' => '\t',
                            else => '\\',
                        });
                    },
                    'u' => {
                        _ = self.consume(); // consume 'u'
                        const i = self.i;
                        self.consumeWhile(utils.isHexDigit);
                        const hex = self.src[i..self.i];
                        // Also rejects `\u` with no digits (empty input).
                        const codepoint: u21 = std.fmt.parseInt(u21, hex, 16) catch {
                            return self.err("failed to parse unicode codepoint", .{});
                        };
                        var bytes = [_]u8{ 0, 0, 0, 0, 0, 0 };
                        const n = std.unicode.utf8Encode(codepoint, &bytes) catch {
                            return self.err("failed to encode unicode codepoint", .{});
                        };
                        if (n == 0) {
                            return self.err("invalid unicode codepoint", .{});
                        }
                        try buf.appendSlice(bytes[0..n]);
                    },
                    '"', '\'', '`' => {
                        // Only the quote that delimits this string may be escaped.
                        if (escape == quote) {
                            try buf.append(self.consume());
                        } else {
                            return self.err("invalid escape sequence", .{});
                        }
                    },
                    else => return self.err("unknown escape sequence: '\\{c}'", .{escape}),
                }
            },
            else => {
                if (ch == quote) {
                    break;
                }
                try buf.append(self.consume());
            },
        }
    }
    // The loop only breaks on the closing quote. If nothing is left to peek,
    // the input ended first, and that must not be reported as a valid string.
    if (self.peek() == null) {
        return self.err("unterminated string", .{});
    }
    const string = try buf.toOwnedSlice();
    _ = self.consume(); // consume closing quote
    return self.make(.{ .string = string });
}
/// Lexes the remainder of a `//` comment; both slashes are already consumed.
///
/// Any further '/' characters directly after the opener are skipped. The
/// token text is the slice of `src` from there up to (not including) the next
/// newline or the end of input.
inline fn nextSingleLineComment(self: *Self) Token {
    self.consumeWhile(utils.char('/')); // swallow extra slashes, e.g. `///`
    const text_begin = self.i;
    self.consumeUntill(utils.newline);
    return self.make(.{ .comment = self.src[text_begin..self.i] });
}
/// Lexes the remainder of a `/* ... */` comment; the opening `/*` is already
/// consumed.
///
/// Returns a `.comment` token whose text is the slice of `src` between the
/// opener and the closing `*/`. If the input ends before `*/` is found, an
/// error token is returned instead, since there is no closing delimiter to
/// strip from the slice.
inline fn nextMultiLineComment(self: *Self) Token {
    const start = self.i;
    while (self.peek()) |_| {
        self.consumeUntill(utils.char('*'));
        _ = self.consume(); // the '*' (a no-op at end of input)
        if (self.optional(utils.char('/'))) {
            // `self.i` is just past "*/", so drop those two bytes.
            return self.make(.{ .comment = self.src[start .. self.i - 2] });
        }
    }
    return self.err("unterminated multi-line comment", .{});
}
/// Builds a token holding `value` that spans from `self.start` to the current
/// position.
inline fn make(self: *Self, value: Token.Value) Token {
    const token_span = self.span(self.start);
    return Token.init(value, token_span);
}
/// Consumes one character, then builds a token holding `value` that spans
/// from `self.start` to the position after that character.
inline fn makeAndConsume(self: *Self, value: Token.Value) Token {
    _ = self.consume();
    return self.make(value);
}
/// Builds an error token spanning from `self.start` to the current position.
/// `fmt` and `args` are forwarded to `Token.err` together with the lexer's
/// arena allocator.
inline fn err(self: *Self, comptime fmt: []const u8, args: anytype) Token {
    const allocator = self.arena.allocator();
    const token_span = self.span(self.start);
    return Token.err(allocator, fmt, args, token_span);
}
/// Consumes the next character if there is one and `pred` accepts it.
/// Returns whether a character was consumed.
inline fn optional(self: *Self, pred: fn (u8) bool) bool {
    if (self.peek()) |ch| {
        if (pred(ch)) {
            _ = self.consume();
            return true;
        }
    }
    return false;
}
/// Consumes characters for as long as `pred` accepts them, stopping at the
/// first rejected character or at the end of input.
inline fn consumeWhile(self: *Self, pred: fn (u8) bool) void {
    while (self.peek()) |ch| {
        if (!pred(ch)) return;
        _ = self.consume();
    }
}
/// Consumes characters until `pred` accepts one (that character is left
/// unconsumed) or the end of input is reached.
inline fn consumeUntill(self: *Self, pred: fn (u8) bool) void {
    while (self.peek()) |ch| {
        if (pred(ch)) return;
        _ = self.consume();
    }
}
/// Returns the next unread byte without consuming it, or `null` at the end of
/// input.
inline fn peek(self: *Self) ?u8 {
    return if (self.i < self.src.len) self.src[self.i] else null;
}
/// Consumes and returns the next byte, updating the line/column counters.
///
/// A newline moves to column 1 of the next line; any other byte advances the
/// column by one. At the end of input nothing changes and 0 is returned.
fn consume(self: *Self) u8 {
    const ch = self.peek() orelse return 0;
    self.i += 1;
    switch (ch) {
        '\n' => {
            self.line += 1;
            self.column = 1;
        },
        else => self.column += 1,
    }
    return ch;
}
/// Returns the lexer's current position (byte offset, line, column) as a
/// `tok.Location`.
inline fn loc(self: *Self) tok.Location {
    const line = self.line;
    const column = self.column;
    return tok.Location.init(@intCast(self.i), line, column);
}
/// Returns the span from `start` to the lexer's current position.
inline fn span(self: *Self, start: tok.Location) tok.Span {
    const end = self.loc();
    return tok.Span.init(start, end);
}
/// Adapter that exposes the lexer as an optional-returning iterator, suitable
/// for `while (it.next()) |token|` loops.
pub const Iterator = struct {
    /// Lexer the tokens are pulled from.
    parent: *Self,
    /// Set once the `.eof` token has been handed out.
    done: bool = false,

    /// Returns the next token. The `.eof` token is yielded exactly once;
    /// every call after that returns `null`.
    pub fn next(it: *@This()) ?Token {
        if (it.done) return null;
        const token = it.parent.next();
        it.done = (token.value == .eof);
        return token;
    }
};
/// Returns an `Iterator` that pulls tokens from this lexer.
pub fn iter(self: *Self) Iterator {
    return .{ .parent = self };
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const std = @import("std"); | |
const ast = @import("ast"); | |
pub const Span = ast.Span; | |
pub const Location = ast.Location; | |
const Allocator = std.mem.Allocator; | |
/// A lexed token: its payload plus the source span it covers.
pub const Token = struct {
    span: Span,
    value: Value,

    /// Token kind together with any payload it carries.
    pub const Value = union(enum) {
        /// Lexing error; the payload is the message.
        err: []const u8,
        eof,

        // Tokens carrying data.
        ident: []const u8,
        int: i128,
        float: f128,
        string: []const u8,
        comment: []const u8,

        // Punctuation.
        semicolon,
        dot,
        comma,
        lparen,
        rparen,
        lsquirly,
        rsquirly,
        lbracket,
        rbracket,

        // Operators.
        eq,
        eq_eq,
        bang,
        bang_eq,
        lt,
        lt_eq,
        gt,
        gt_eq,
        ampersand,
        ampersand_ampersand,
        ampersand_eq,
        pipe,
        pipe_pipe,
        pipe_eq,
        pipe_gt, // |>
        tilde,
        plus,
        plus_eq,
        minus,
        minus_eq,
        star,
        star_star,
        star_eq,
        slash,
        slash_eq,
        percent,
        percent_eq,
        caret,
        caret_eq,
        at,
        colon,
        question,
        question_question,
    };

    const Self = @This();

    /// Builds a token from a value and the span it covers.
    pub inline fn init(value: Value, span: Span) Self {
        return .{ .span = span, .value = value };
    }

    /// Builds an error token whose message is `fmt` formatted with `args`,
    /// allocated from `allocator`. If that allocation fails, the static
    /// message "allocation failure" is used instead.
    pub inline fn err(allocator: Allocator, comptime fmt: []const u8, args: anytype, span: Span) Self {
        const message: []const u8 = std.fmt.allocPrint(allocator, fmt, args) catch "allocation failure";
        return .{ .span = span, .value = .{ .err = message } };
    }

    /// Prints the token via `std.debug.print` as
    /// `line:col-line:col <description>` followed by a newline. Tokens
    /// without a payload are described by their tag name.
    pub fn print(self: Self) void {
        const from = self.span.start;
        const to = self.span.end;
        std.debug.print("{d}:{d}-{d}:{d} ", .{ from.line, from.column, to.line, to.column });
        switch (self.value) {
            .err => |message| std.debug.print("error: {s}", .{message}),
            .eof => std.debug.print("eof", .{}),
            .ident => |name| std.debug.print("ident: {s}", .{name}),
            .int => |number| std.debug.print("int: {d}", .{number}),
            .float => |number| std.debug.print("float: {d}", .{number}),
            .string => |text| std.debug.print("string: {s}", .{text}),
            .comment => |text| std.debug.print("comment: {s}", .{text}),
            else => std.debug.print("{s}", .{@tagName(self.value)}),
        }
        std.debug.print("\n", .{});
    }
};
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const std = @import("std"); | |
/// Returns true for characters allowed inside an identifier: ASCII letters,
/// ASCII digits and '_'.
pub fn isIdentChar(c: u8) bool {
    return switch (c) {
        '_', 'a'...'z', 'A'...'Z', '0'...'9' => true,
        else => false,
    };
}
/// Returns true for the binary digits '0' and '1'.
pub fn isBinaryDigit(c: u8) bool {
    return switch (c) {
        '0', '1' => true,
        else => false,
    };
}
/// Returns true for the octal digits '0' through '7'.
pub fn isOctalDigit(c: u8) bool {
    return switch (c) {
        '0'...'7' => true,
        else => false,
    };
}
/// Returns true for hexadecimal digits: '0'-'9', 'a'-'f' and 'A'-'F'.
pub fn isHexDigit(c: u8) bool {
    return switch (c) {
        '0'...'9', 'a'...'f', 'A'...'F' => true,
        else => false,
    };
}
/// Returns true when `c` is the line feed character.
pub fn newline(c: u8) bool {
    return switch (c) {
        '\n' => true,
        else => false,
    };
}
/// Returns true when `c` is '.'.
pub fn dot(c: u8) bool {
    return switch (c) {
        '.' => true,
        else => false,
    };
}
/// Returns true for the decimal exponent markers 'e' and 'E'.
pub fn exponent(c: u8) bool {
    return switch (c) {
        'e', 'E' => true,
        else => false,
    };
}
/// Returns true for the hex-float exponent markers 'p' and 'P'.
pub fn hexPower(c: u8) bool {
    return switch (c) {
        'p', 'P' => true,
        else => false,
    };
}
/// Returns true for the sign characters '+' and '-'.
pub fn sign(c: u8) bool {
    return switch (c) {
        '+', '-' => true,
        else => false,
    };
}
/// Returns a predicate that is true only for the byte `expected`.
///
/// `expected` is comptime-known, so each distinct character gets its own
/// function that can be passed wherever a `fn (u8) bool` is needed.
pub fn char(comptime expected: u8) fn (c: u8) bool {
    const Matcher = struct {
        fn matches(actual: u8) bool {
            return actual == expected;
        }
    };
    return Matcher.matches;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment