Skip to content

Instantly share code, notes, and snippets.

@shreyassanthu77
Last active December 13, 2024 18:26
Show Gist options
  • Save shreyassanthu77/1a8dc996df6a7742e0277dd7b8668b3f to your computer and use it in GitHub Desktop.
Save shreyassanthu77/1a8dc996df6a7742e0277dd7b8668b3f to your computer and use it in GitHub Desktop.
A lexer in Zig: it turns source text into tokens (identifiers, numbers, strings, comments, and operators).
const std = @import("std");
const log = std.log.scoped(.lexer);
const Allocator = std.mem.Allocator;
const ArenaAllocator = std.heap.ArenaAllocator;
const utils = @import("utils.zig");
pub const tok = @import("token.zig");
pub const Token = tok.Token;
/// Arena that holds the lexer itself (see `init`) and everything the lexer allocates.
arena: ArenaAllocator,
/// Source text being tokenized.
src: []const u8,
/// Byte offset into `src` of the next unread byte.
i: usize = 0,
/// Line of the next unread byte; starts at 1 and is incremented for each '\n' consumed.
line: u32 = 1,
/// Column of the next unread byte; starts at 1 and resets to 1 after each '\n' consumed.
column: u32 = 1,
/// Location at which the token currently being lexed began; assigned at the top of `next`.
start: tok.Location = undefined,
const Self = @This();
/// Creates a lexer over `src` that lives inside its own arena.
///
/// The arena is backed by `allocator`, and the lexer struct is the first thing
/// allocated from it, so `deinit` releases the lexer together with everything
/// it allocated. Panics with "OOM" if that first allocation fails.
pub fn init(allocator: Allocator, src: []const u8) *Self {
    var arena = ArenaAllocator.init(allocator);
    const self = arena.allocator().create(Self) catch @panic("OOM");
    // Store the arena only after allocating from it, so the copy kept in the
    // lexer already knows about the buffer that holds the lexer.
    self.* = Self{ .arena = arena, .src = src };
    return self;
}
/// Releases the arena. Because `init` allocates the lexer from that same
/// arena, `self` must not be used after this call.
pub inline fn deinit(self: *Self) void {
    const arena = self.arena;
    arena.deinit();
}
/// Scans and returns the next token, skipping any leading whitespace.
///
/// Returns an `.eof` token when the input is exhausted. Lexical problems are
/// reported in-band as `.err` tokens rather than as Zig errors. Comments are
/// returned as `.comment` tokens, not skipped.
pub fn next(self: *Self) Token {
    self.consumeWhile(std.ascii.isWhitespace);
    self.start = self.loc();
    const ch = self.peek() orelse return self.make(.eof);

    switch (ch) {
        // Multi-byte lexemes handled by dedicated scanners.
        'a'...'z', 'A'...'Z', '_' => return self.nextIdent(),
        '0'...'9' => return self.nextNumber(),
        '"', '\'', '`' => return self.nextString(ch) catch
            Token.init(.{ .err = "Out of memory" }, self.span(self.start)),

        // Punctuation that is always a single byte.
        ';' => return self.makeAndConsume(.semicolon),
        '.' => return self.makeAndConsume(.dot),
        ',' => return self.makeAndConsume(.comma),
        '(' => return self.makeAndConsume(.lparen),
        ')' => return self.makeAndConsume(.rparen),
        '{' => return self.makeAndConsume(.lsquirly),
        '}' => return self.makeAndConsume(.rsquirly),
        '[' => return self.makeAndConsume(.lbracket),
        ']' => return self.makeAndConsume(.rbracket),
        '~' => return self.makeAndConsume(.tilde),
        '@' => return self.makeAndConsume(.at),
        ':' => return self.makeAndConsume(.colon),

        // Operators that may be extended by a following byte. The first byte
        // is consumed, then each possible continuation is tried in order.
        '=' => {
            _ = self.consume();
            if (self.optional(utils.char('='))) return self.make(.eq_eq);
            return self.make(.eq);
        },
        '!' => {
            _ = self.consume();
            if (self.optional(utils.char('='))) return self.make(.bang_eq);
            return self.make(.bang);
        },
        '<' => {
            _ = self.consume();
            if (self.optional(utils.char('='))) return self.make(.lt_eq);
            return self.make(.lt);
        },
        '>' => {
            _ = self.consume();
            if (self.optional(utils.char('='))) return self.make(.gt_eq);
            return self.make(.gt);
        },
        '&' => {
            _ = self.consume();
            if (self.optional(utils.char('&'))) return self.make(.ampersand_ampersand);
            if (self.optional(utils.char('='))) return self.make(.ampersand_eq);
            return self.make(.ampersand);
        },
        '|' => {
            _ = self.consume();
            if (self.optional(utils.char('|'))) return self.make(.pipe_pipe);
            if (self.optional(utils.char('='))) return self.make(.pipe_eq);
            if (self.optional(utils.char('>'))) return self.make(.pipe_gt);
            return self.make(.pipe);
        },
        '+' => {
            _ = self.consume();
            if (self.optional(utils.char('='))) return self.make(.plus_eq);
            return self.make(.plus);
        },
        '-' => {
            _ = self.consume();
            if (self.optional(utils.char('='))) return self.make(.minus_eq);
            return self.make(.minus);
        },
        '*' => {
            _ = self.consume();
            if (self.optional(utils.char('='))) return self.make(.star_eq);
            if (self.optional(utils.char('*'))) return self.make(.star_star);
            return self.make(.star);
        },
        '/' => {
            _ = self.consume();
            if (self.optional(utils.char('='))) return self.make(.slash_eq);
            if (self.optional(utils.char('/'))) return self.nextSingleLineComment();
            if (self.optional(utils.char('*'))) return self.nextMultiLineComment();
            return self.make(.slash);
        },
        '%' => {
            _ = self.consume();
            if (self.optional(utils.char('='))) return self.make(.percent_eq);
            return self.make(.percent);
        },
        '^' => {
            _ = self.consume();
            if (self.optional(utils.char('='))) return self.make(.caret_eq);
            return self.make(.caret);
        },
        '?' => {
            _ = self.consume();
            if (self.optional(utils.char('?'))) return self.make(.question_question);
            return self.make(.question);
        },

        // Anything else: consume the byte so the lexer keeps making progress,
        // then report it.
        else => {
            _ = self.consume();
            return self.err("unexpected character: '{c}'", .{ch});
        },
    }
}
/// Lexes an identifier: `[a-zA-Z_][a-zA-Z0-9_]*`.
///
/// `next` only dispatches here when the current byte is a valid first
/// character. The token text is a slice of `src`, not a copy.
inline fn nextIdent(self: *Self) Token {
    const begin = self.i;
    _ = self.consume(); // first character, already checked by the caller
    while (self.peek()) |c| {
        if (!utils.isIdentChar(c)) break;
        _ = self.consume();
    }
    return self.make(.{ .ident = self.src[begin..self.i] });
}
/// Lexes a numeric literal.
///
/// number: int + fraction? + exp?
///       | bin
///       | oct
///       | hex
///
/// int: [0-9] | [1-9][0-9]+
/// fraction: \.[0-9]+
/// exp: [eE] [-+]? [0-9]+
///
/// bin: 0b[01]+
/// oct: 0o[0-7]+
/// hex: 0x[0-9a-fA-F]+(.[0-9a-fA-F]+)?([pP][-+]?[0-9]+)?
///
/// Decimal literals are parsed as `f128`. A value with no fractional part that
/// fits in `i128` produces an `.int` token; every other value (fractional, or
/// too large for `i128`) produces a `.float` token. Malformed literals produce
/// an `.err` token.
///
/// NOTE: a '.' directly after the integer digits is consumed even when no
/// digits follow it.
inline fn nextNumber(self: *Self) Token {
    const number_start = self.i;
    const first = self.consume();
    if (first == '0') {
        if (self.peek()) |p| {
            switch (p) {
                'b' => return self.nextBinary(),
                'o' => return self.nextOctal(),
                'x' => return self.nextHex(),
                '0'...'9' => return self.err("leading zero in number", .{}),
                else => {},
            }
        }
    }
    self.consumeWhile(std.ascii.isDigit);
    if (self.optional(utils.dot)) {
        self.consumeWhile(std.ascii.isDigit);
    }
    if (self.optional(utils.exponent)) {
        _ = self.optional(utils.sign);
        // At least one digit must follow the exponent marker / sign.
        const p = self.peek() orelse {
            return self.err("expected digit after exponent", .{});
        };
        if (!std.ascii.isDigit(p)) {
            return self.err("expected digit after exponent", .{});
        }
        self.consumeWhile(std.ascii.isDigit);
    }
    const number = self.src[number_start..self.i];
    const f: f128 = std.fmt.parseFloat(f128, number) catch {
        return self.err("failed to parse number", .{});
    };
    // An integral value is reported as an i128, but only when it is in range:
    // @intFromFloat on an out-of-range or infinite value (e.g. `1e100`) is
    // illegal behaviour. 2^127 is the first magnitude that no longer fits.
    const int_limit: f128 = 0x1p127;
    if (f == @trunc(f) and f >= -int_limit and f < int_limit) {
        const int: i128 = @intFromFloat(f);
        return self.make(.{ .int = int });
    }
    return self.make(.{ .float = f });
}
/// Lexes a binary literal. Called by `nextNumber` after the leading `0` has
/// been consumed, with the current byte being the `b` marker. An empty or
/// unparsable digit run yields an `.err` token.
inline fn nextBinary(self: *Self) Token {
    _ = self.consume(); // skip the 'b' marker
    const digits_start = self.i;
    self.consumeWhile(utils.isBinaryDigit);
    const digits = self.src[digits_start..self.i];
    if (std.fmt.parseInt(i128, digits, 2)) |value| {
        return self.make(.{ .int = value });
    } else |_| {
        return self.err("failed to parse binary number {s}", .{digits});
    }
}
/// Lexes an octal literal. Called by `nextNumber` after the leading `0` has
/// been consumed, with the current byte being the `o` marker. An empty or
/// unparsable digit run yields an `.err` token.
inline fn nextOctal(self: *Self) Token {
    _ = self.consume(); // skip the 'o' marker
    const digits_start = self.i;
    self.consumeWhile(utils.isOctalDigit);
    const digits = self.src[digits_start..self.i];
    if (std.fmt.parseInt(i128, digits, 8)) |value| {
        return self.make(.{ .int = value });
    } else |_| {
        return self.err("failed to parse octal number {s}", .{digits});
    }
}
/// Lexes a hexadecimal literal:
/// `0x[0-9a-fA-F]+(.[0-9a-fA-F]+)?([pP][-+]?[0-9]+)?`.
///
/// Called by `nextNumber` after the leading `0` has been consumed, with the
/// current byte being the `x` marker. The whole literal (including `0x`) is
/// handed to `std.fmt.parseFloat`. A value with no fractional part that fits
/// in `i128` produces an `.int` token, any other value a `.float` token, and a
/// malformed literal an `.err` token.
inline fn nextHex(self: *Self) Token {
    const number_start = self.i - 1; // include the already-consumed '0'
    _ = self.consume(); // consume 'x'
    self.consumeWhile(utils.isHexDigit);
    if (self.optional(utils.dot)) {
        self.consumeWhile(utils.isHexDigit);
    }
    if (self.optional(utils.hexPower)) {
        _ = self.optional(utils.sign);
        // The binary exponent is written in decimal, so only [0-9] may follow.
        const p = self.peek();
        if (p == null or !std.ascii.isDigit(p.?)) {
            return self.err("expected digit after hex power", .{});
        }
        self.consumeWhile(std.ascii.isDigit);
    }
    const number = self.src[number_start..self.i];
    const f: f128 = std.fmt.parseFloat(f128, number) catch {
        return self.err("failed to parse hex number {s}", .{number});
    };
    // An integral value is reported as an i128, but only when it is in range:
    // @intFromFloat on an out-of-range or infinite value is illegal behaviour.
    // 2^127 is the first magnitude that no longer fits.
    const int_limit: f128 = 0x1p127;
    if (f == @trunc(f) and f >= -int_limit and f < int_limit) {
        const int: i128 = @intFromFloat(f);
        return self.make(.{ .int = int });
    }
    return self.make(.{ .float = f });
}
/// Lexes a string literal delimited by `quote` (one of `"`, `'` or `` ` ``).
///
/// The current byte must be the opening quote. Supported escapes are `\n`,
/// `\r`, `\t`, `\\`, `\u` followed by hex digits (encoded as UTF-8), and the
/// delimiter itself preceded by a backslash. A raw newline is only accepted
/// inside backtick strings. The unescaped contents are built in memory from
/// the lexer's arena.
///
/// Malformed or unterminated strings produce an `.err` token. The only Zig
/// error returned is an allocation failure while building the contents.
inline fn nextString(self: *Self, quote: u8) !Token {
    _ = self.consume(); // consume opening quote
    const alloc = self.arena.allocator();
    var buf = std.ArrayList(u8).init(alloc);
    while (self.peek()) |ch| {
        switch (ch) {
            '\n' => {
                if (quote == '`') {
                    try buf.append(self.consume());
                } else {
                    return self.err("unexpected newline in string, use `...` for multiline strings", .{});
                }
            },
            '\\' => {
                _ = self.consume(); // consume backslash
                const escape = self.peek() orelse {
                    return self.err("unexpected end of string", .{});
                };
                switch (escape) {
                    'n' => {
                        _ = self.consume();
                        try buf.append('\n');
                    },
                    'r' => {
                        _ = self.consume();
                        try buf.append('\r');
                    },
                    't' => {
                        _ = self.consume();
                        try buf.append('\t');
                    },
                    '\\' => {
                        _ = self.consume();
                        try buf.append('\\');
                    },
                    'u' => {
                        _ = self.consume(); // consume 'u'
                        // Every hex digit that follows is taken as part of the code point.
                        const hex_start = self.i;
                        self.consumeWhile(utils.isHexDigit);
                        const hex = self.src[hex_start..self.i];
                        const codepoint: u21 = std.fmt.parseInt(u21, hex, 16) catch {
                            return self.err("failed to parse unicode codepoint", .{});
                        };
                        var bytes: [4]u8 = undefined; // UTF-8 needs at most 4 bytes
                        const n = std.unicode.utf8Encode(codepoint, &bytes) catch {
                            return self.err("failed to encode unicode codepoint", .{});
                        };
                        try buf.appendSlice(bytes[0..n]);
                    },
                    '"', '\'', '`' => {
                        // Only the string's own delimiter may be escaped.
                        if (escape == quote) {
                            try buf.append(self.consume());
                        } else {
                            return self.err("invalid escape sequence", .{});
                        }
                    },
                    else => return self.err("unknown escape sequence: '\\{c}'", .{escape}),
                }
            },
            else => {
                if (ch == quote) {
                    const string = try buf.toOwnedSlice();
                    _ = self.consume(); // consume closing quote
                    return self.make(.{ .string = string });
                }
                try buf.append(self.consume());
            },
        }
    }
    // Ran out of input before the closing quote was seen.
    return self.err("unterminated string", .{});
}
/// Lexes the remainder of a `//` comment; `next` has already consumed the two
/// opening slashes. Any further leading slashes are skipped, and the token
/// text runs up to (not including) the next newline or the end of input.
inline fn nextSingleLineComment(self: *Self) Token {
    while (self.optional(utils.char('/'))) {}
    const text_start = self.i;
    while (self.peek()) |c| {
        if (utils.newline(c)) break;
        _ = self.consume();
    }
    return self.make(.{ .comment = self.src[text_start..self.i] });
}
/// Lexes the remainder of a `/* ... */` comment; `next` has already consumed
/// the opening `/*`. The token text is the slice of `src` between the
/// delimiters. If the input ends before a closing `*/` is found, an `.err`
/// token is returned instead.
inline fn nextMultiLineComment(self: *Self) Token {
    const start = self.i;
    while (self.peek() != null) {
        self.consumeUntill(utils.char('*'));
        _ = self.consume(); // the '*' (does nothing at end of input)
        if (self.optional(utils.char('/'))) {
            // Leave the closing "*/" out of the comment text.
            return self.make(.{ .comment = self.src[start .. self.i - 2] });
        }
    }
    return self.err("unterminated multi-line comment", .{});
}
/// Builds a token holding `value` whose span runs from `self.start` to the
/// current position.
inline fn make(self: *Self, value: Token.Value) Token {
    const token_span = self.span(self.start);
    return Token.init(value, token_span);
}
/// Consumes one byte, then builds a token holding `value` whose span runs from
/// `self.start` to the position after that byte.
inline fn makeAndConsume(self: *Self, value: Token.Value) Token {
    _ = self.consume();
    const token_span = self.span(self.start);
    return Token.init(value, token_span);
}
/// Builds an error token for the current token span. The message is formatted
/// from `fmt` and `args` by `Token.err`, using the lexer's arena allocator.
inline fn err(self: *Self, comptime fmt: []const u8, args: anytype) Token {
    const allocator = self.arena.allocator();
    const token_span = self.span(self.start);
    return Token.err(allocator, fmt, args, token_span);
}
/// Consumes the next byte if there is one and `pred` accepts it.
/// Returns whether a byte was consumed.
inline fn optional(self: *Self, pred: fn (u8) bool) bool {
    if (self.peek()) |ch| {
        if (pred(ch)) {
            _ = self.consume();
            return true;
        }
    }
    return false;
}
/// Consumes bytes for as long as `pred` accepts them, stopping at the first
/// rejected byte or at the end of input.
inline fn consumeWhile(self: *Self, pred: fn (u8) bool) void {
    while (self.peek()) |ch| {
        if (!pred(ch)) break;
        _ = self.consume();
    }
}
/// Consumes bytes until `pred` accepts one (that byte is left unconsumed) or
/// the end of input is reached.
inline fn consumeUntill(self: *Self, pred: fn (u8) bool) void {
    while (self.peek()) |ch| {
        if (pred(ch)) break;
        _ = self.consume();
    }
}
/// Returns the next unread byte without consuming it, or `null` at end of input.
inline fn peek(self: *Self) ?u8 {
    return if (self.i < self.src.len) self.src[self.i] else null;
}
/// Consumes and returns the next byte, updating `line` and `column`.
/// At end of input nothing is consumed and 0 is returned.
fn consume(self: *Self) u8 {
    if (self.i >= self.src.len) return 0;

    const byte = self.src[self.i];
    self.i += 1;
    switch (byte) {
        // A newline moves to the first column of the next line.
        '\n' => {
            self.line += 1;
            self.column = 1;
        },
        else => self.column += 1,
    }
    return byte;
}
/// Captures the current position (byte offset `i`, `line`, `column`) as a `tok.Location`.
inline fn loc(self: *Self) tok.Location {
    // `i` is a usize; @intCast converts it to the integer type `Location.init` expects.
    return tok.Location.init(@intCast(self.i), self.line, self.column);
}
/// Builds a span that begins at `start` and ends at the current position.
inline fn span(self: *Self, start: tok.Location) tok.Span {
    const end = self.loc();
    return tok.Span.init(start, end);
}
/// Adapter for `while (it.next()) |token|` loops over a lexer.
/// The `.eof` token is yielded once; every later call returns `null`.
pub const Iterator = struct {
    /// Lexer the tokens are pulled from.
    parent: *Self,
    /// Set once the `.eof` token has been handed out.
    done: bool = false,

    pub fn next(it: *@This()) ?Token {
        if (it.done) return null;

        const token = it.parent.next();
        // `done` is known to be false here, so this only ever flips it to true.
        it.done = token.value == .eof;
        return token;
    }
};
/// Returns an `Iterator` that pulls tokens from this lexer.
pub fn iter(self: *Self) Iterator {
    return .{ .parent = self };
}
const std = @import("std");
const ast = @import("ast");
pub const Span = ast.Span;
pub const Location = ast.Location;
const Allocator = std.mem.Allocator;
/// A lexed token: a payload (`value`) plus the source range it covers (`span`).
pub const Token = struct {
    span: Span,
    value: Value,

    /// Every kind of token; kinds that carry data have a payload.
    pub const Value = union(enum) {
        // Special tokens.
        err: []const u8,
        eof,

        // Tokens carrying a payload.
        ident: []const u8,
        int: i128,
        float: f128,
        string: []const u8,
        comment: []const u8,

        // Punctuation.
        semicolon,
        dot,
        comma,
        lparen,
        rparen,
        lsquirly,
        rsquirly,
        lbracket,
        rbracket,

        // Operators.
        eq,
        eq_eq,
        bang,
        bang_eq,
        lt,
        lt_eq,
        gt,
        gt_eq,
        ampersand,
        ampersand_ampersand,
        ampersand_eq,
        pipe,
        pipe_pipe,
        pipe_eq,
        pipe_gt, // |>
        tilde,
        plus,
        plus_eq,
        minus,
        minus_eq,
        star,
        star_star,
        star_eq,
        slash,
        slash_eq,
        percent,
        percent_eq,
        caret,
        caret_eq,
        at,
        colon,
        question,
        question_question,
    };

    const Self = @This();

    /// Builds a token from a value and the span it covers.
    pub inline fn init(value: Value, span: Span) Self {
        return .{ .span = span, .value = value };
    }

    /// Builds an `.err` token whose message is formatted from `fmt` and `args`
    /// using `allocator`. If formatting cannot allocate, the message falls back
    /// to the static string "allocation failure".
    pub inline fn err(allocator: Allocator, comptime fmt: []const u8, args: anytype, span: Span) Self {
        if (std.fmt.allocPrint(allocator, fmt, args)) |msg| {
            return Self{ .span = span, .value = .{ .err = msg } };
        } else |_| {
            return Self{ .span = span, .value = .{ .err = "allocation failure" } };
        }
    }

    /// Writes the token via `std.debug.print` as
    /// `startLine:startCol-endLine:endCol <description>` followed by a newline.
    pub fn print(self: Self) void {
        const from = self.span.start;
        const to = self.span.end;
        std.debug.print("{d}:{d}-{d}:{d} ", .{ from.line, from.column, to.line, to.column });

        switch (self.value) {
            .err => |msg| std.debug.print("error: {s}", .{msg}),
            .eof => std.debug.print("eof", .{}),
            .ident => |name| std.debug.print("ident: {s}", .{name}),
            .int => |n| std.debug.print("int: {d}", .{n}),
            .float => |x| std.debug.print("float: {d}", .{x}),
            .string => |text| std.debug.print("string: {s}", .{text}),
            .comment => |text| std.debug.print("comment: {s}", .{text}),
            // Payload-free tokens are printed by their tag name.
            else => std.debug.print("{s}", .{@tagName(self.value)}),
        }

        std.debug.print("\n", .{});
    }
};
const std = @import("std");
/// Returns true for bytes allowed inside an identifier: ASCII letters,
/// ASCII digits and underscore.
pub fn isIdentChar(c: u8) bool {
    return switch (c) {
        '_', 'a'...'z', 'A'...'Z', '0'...'9' => true,
        else => false,
    };
}
/// Returns true for the binary digits '0' and '1'.
pub fn isBinaryDigit(c: u8) bool {
    return switch (c) {
        '0', '1' => true,
        else => false,
    };
}
/// Returns true for the octal digits '0' through '7'.
pub fn isOctalDigit(c: u8) bool {
    return switch (c) {
        '0'...'7' => true,
        else => false,
    };
}
/// Returns true for hexadecimal digits: '0'-'9', 'a'-'f' and 'A'-'F'.
pub fn isHexDigit(c: u8) bool {
    return switch (c) {
        '0'...'9', 'a'...'f', 'A'...'F' => true,
        else => false,
    };
}
/// Returns true only for the line-feed byte '\n'.
pub fn newline(c: u8) bool {
    return switch (c) {
        '\n' => true,
        else => false,
    };
}
/// Returns true only for the '.' byte.
pub fn dot(c: u8) bool {
    return switch (c) {
        '.' => true,
        else => false,
    };
}
/// Returns true for a decimal exponent marker: 'e' or 'E'.
pub fn exponent(c: u8) bool {
    return switch (c) {
        'e', 'E' => true,
        else => false,
    };
}
/// Returns true for a hexadecimal-float exponent marker: 'p' or 'P'.
pub fn hexPower(c: u8) bool {
    return switch (c) {
        'p', 'P' => true,
        else => false,
    };
}
/// Returns true for a sign byte: '+' or '-'.
pub fn sign(c: u8) bool {
    return switch (c) {
        '+', '-' => true,
        else => false,
    };
}
/// Returns a predicate that is true only for the byte `expected`.
/// `expected` is fixed at compile time, so each distinct byte gets its own
/// generated function.
pub fn char(comptime expected: u8) fn (c: u8) bool {
    const Matcher = struct {
        pub fn is(c: u8) bool {
            return expected == c;
        }
    };
    return Matcher.is;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment