Last active
August 26, 2025 00:12
-
-
Save shreyassanthu77/6c49394c7694a887c4589271aeafa041 to your computer and use it in GitHub Desktop.
A very minimal and dumb XML parser written in Zig
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const std = @import("std");

/// This file is itself the tokenizer struct; `Self` names that type.
const Self = @This();

/// NUL-terminated XML text to scan. Tokens returned by `next` are slices
/// into this buffer.
source: [:0]const u8,
/// Byte offset of the next unread character in `source`.
pos: usize = 0,
/// Tokenizer state carried over between calls to `next`.
state: State = .start,
/// Scans `source` from `pos` and returns the next token, advancing `pos`.
///
/// Token sequence for an element:
///   * `<name`            -> `.open_tag = "name"`
///   * `attr="value"`     -> `.attribute` (one per attribute, quotes stripped)
///   * `>`                -> `.open_tag_end`
///   * `/>`               -> `.close_tag = ""` (self-closing)
///   * `</name>`          -> `.close_tag = "name"`
/// Character data is returned as `.text_content` with leading and trailing
/// whitespace removed; whitespace-only runs produce no token. Processing
/// instructions (`<?...?>`), comments (`<!--...-->`) and other `<!...>`
/// declarations are skipped. All returned slices point into `source`; no
/// entity decoding is performed.
///
/// Returns `.eof` at the terminating NUL when between tokens.
///
/// Errors:
///   * `error.UnexpectedEOF` - input ended inside a tag or attribute.
///   * `error.UnterminatedProcessingInstruction` - `<?` without `?>`.
///   * `error.UnterminatedComment` - `<!--` without `-->`, or `<!` without `>`.
///   * `error.UnexpectedCharacter` - invalid character right after `<` / `</`.
///   * `error.EmptyTagName` - a name was expected but no name character found.
///   * `error.ExpectedAttributeValue` - attribute name not followed by
///     `=` and a quoted value.
pub fn next(self: *Self) !Token {
    // Start of the slice currently being scanned (name, value or text).
    var start = self.pos;
    // Offset of the first byte of a trailing whitespace run in text content,
    // or 0 when the text scanned so far does not end in whitespace.
    var text_end: usize = 0;
    // Quote character (`"` or `'`) that opened the current attribute value.
    var quote: u8 = 0;
    var tok: Token = .eof;
    parsing: switch (self.state) {
        .start => switch (self.source[self.pos]) {
            0 => return .eof,
            ' ', '\t', '\r', '\n' => {
                self.pos += 1;
                continue :parsing .start;
            },
            '<' => {
                self.pos += 1;
                continue :parsing .saw_lt;
            },
            else => {
                start = self.pos;
                text_end = 0;
                continue :parsing .scanning_text_content;
            },
        },
        .saw_lt => switch (self.source[self.pos]) {
            0 => return error.UnexpectedEOF,
            '?' => {
                // Processing instruction: skip everything up to and including `?>`.
                self.pos += 1;
                const close = std.mem.indexOfPos(u8, self.source, self.pos, "?>") orelse
                    return error.UnterminatedProcessingInstruction;
                self.pos = close + 2;
                continue :parsing .start;
            },
            '!' => {
                self.pos += 1;
                if (std.mem.startsWith(u8, self.source[self.pos..], "--")) {
                    // Comment: it may legally contain `>`, so only `-->` ends it.
                    const close = std.mem.indexOfPos(u8, self.source, self.pos + 2, "-->") orelse
                        return error.UnterminatedComment;
                    self.pos = close + 3;
                } else {
                    // Any other `<!...>` declaration is skipped up to the first `>`.
                    // CDATA sections and DOCTYPE internal subsets are not understood.
                    const close = std.mem.indexOfScalarPos(u8, self.source, self.pos, '>') orelse
                        return error.UnterminatedComment;
                    self.pos = close + 1;
                }
                continue :parsing .start;
            },
            '/' => {
                // `</`: remember that the upcoming name belongs to a closing tag.
                self.pos += 1;
                tok = .{ .close_tag = "" };
                continue :parsing .saw_lt;
            },
            // `:` is a valid XML name start character (namespace prefixes).
            'a'...'z', 'A'...'Z', '_', ':' => {
                start = self.pos;
                self.pos += 1;
                continue :parsing .scanning_name;
            },
            else => return error.UnexpectedCharacter,
        },
        inline .scanning_name,
        .scanning_attribute_name,
        => |s| switch (self.source[self.pos]) {
            0 => return error.UnexpectedEOF,
            // `:` and `.` are accepted so that names such as `xlink:href` scan.
            'a'...'z', 'A'...'Z', '0'...'9', '_', '-', ':', '.' => {
                self.pos += 1;
                continue :parsing s;
            },
            else => {
                // Reached for attribute names when the first character is not
                // a name character (the scan did not advance).
                if (self.pos == start) return error.EmptyTagName;
                const name = self.source[start..self.pos];
                if (s == .scanning_name) {
                    if (tok == .close_tag) {
                        // `</name`: keep scanning in this call until `>`.
                        tok = .{ .close_tag = name };
                        continue :parsing .scanned_name;
                    } else {
                        // `<name`: emit now; the next call resumes inside the tag.
                        self.state = .scanned_name;
                        tok = .{ .open_tag = name };
                    }
                } else if (s == .scanning_attribute_name) {
                    // The value is filled in by `.scanning_attribute_value`
                    // before the token is returned.
                    tok = .{ .attribute = .{
                        .name = name,
                        .value = undefined,
                    } };
                    continue :parsing .scanned_attribute_name;
                } else comptime unreachable;
            },
        },
        .scanned_name => switch (self.source[self.pos]) {
            0 => return error.UnexpectedEOF,
            ' ', '\t', '\r', '\n' => {
                self.pos += 1;
                continue :parsing .scanned_name;
            },
            '/' => {
                // Self-closing marker; reported as a close tag with an empty name.
                self.pos += 1;
                tok = .{ .close_tag = "" };
                continue :parsing .scanned_name;
            },
            '>' => {
                self.pos += 1;
                self.state = .start;
                if (tok != .close_tag) {
                    tok = .open_tag_end;
                }
            },
            else => {
                start = self.pos;
                continue :parsing .scanning_attribute_name;
            },
        },
        .scanned_attribute_name => switch (self.source[self.pos]) {
            0 => return error.UnexpectedEOF,
            ' ', '\t', '\r', '\n' => {
                self.pos += 1;
                continue :parsing .scanned_attribute_name;
            },
            '=' => {
                self.pos += 1;
                continue :parsing .expect_attribute_value;
            },
            else => return error.ExpectedAttributeValue,
        },
        .expect_attribute_value => switch (self.source[self.pos]) {
            0 => return error.UnexpectedEOF,
            // Whitespace is allowed on both sides of `=`.
            ' ', '\t', '\r', '\n' => {
                self.pos += 1;
                continue :parsing .expect_attribute_value;
            },
            '"', '\'' => |q| {
                self.pos += 1;
                start = self.pos;
                quote = q;
                continue :parsing .scanning_attribute_value;
            },
            else => return error.ExpectedAttributeValue,
        },
        .scanning_attribute_value => switch (self.source[self.pos]) {
            0 => return error.UnexpectedEOF,
            '"', '\'' => |q| {
                // The other quote character is ordinary content of the value.
                if (q != quote) {
                    self.pos += 1;
                    continue :parsing .scanning_attribute_value;
                }
                const value = self.source[start..self.pos];
                self.pos += 1;
                self.state = .scanned_name;
                tok.attribute.value = value;
            },
            else => {
                self.pos += 1;
                continue :parsing .scanning_attribute_value;
            },
        },
        .scanning_text_content => switch (self.source[self.pos]) {
            0, '<' => {
                // Drop the trailing whitespace run, if any.
                const end = if (text_end == 0) self.pos else text_end;
                // Defensive: this state is only entered on a text character,
                // so the text is not expected to be empty here.
                if (end == start) continue :parsing .saw_lt;
                self.state = .start;
                const text = self.source[start..end];
                tok = .{ .text_content = text };
            },
            ' ', '\t', '\r', '\n' => {
                if (text_end == 0) text_end = self.pos;
                self.pos += 1;
                continue :parsing .scanning_text_content;
            },
            else => {
                // Non-whitespace: any whitespace seen so far is interior, not trailing.
                text_end = 0;
                self.pos += 1;
                continue :parsing .scanning_text_content;
            },
        },
    }
    return tok;
}
/// A lexical unit produced by the tokenizer. All string payloads are plain
/// byte slices; nothing here owns memory.
pub const Token = union(enum) {
    /// End of input.
    eof,
    /// `<name`: start of an opening tag; payload is the tag name.
    open_tag: []const u8,
    /// `name="value"` inside an opening tag; `value` excludes the quotes.
    attribute: struct {
        name: []const u8,
        value: []const u8,
    },
    /// `>` that finishes an opening tag (the element may have children).
    open_tag_end,
    /// Character data between tags.
    text_content: []const u8,
    /// `</name>` (payload is the name) or a self-closing `/>` (payload is "").
    close_tag: []const u8,

    /// Writes a short human-readable description of the token to `writer`.
    /// Propagates any error reported by `writer`.
    pub fn format(this: @This(), writer: *std.io.Writer) !void {
        switch (this) {
            .eof => try writer.writeAll("eof"),
            .open_tag => |name| try writer.print("open_tag: '{s}'", .{name}),
            .close_tag => |name| try writer.print("close_tag: '{s}'", .{name}),
            .open_tag_end => try writer.writeAll("open_tag_end"),
            // `value` is a plain slice, not an optional, so it is printed
            // unconditionally (unwrapping it with `if (...) |v|` does not compile).
            .attribute => |attr| try writer.print(
                "attribute: '{s}' = '{s}'",
                .{ attr.name, attr.value },
            ),
            .text_content => |text| try writer.print("text_content: '{s}'", .{text}),
        }
    }
};
/// States of the tokenizer's state machine, dispatched on by `next`.
const State = enum {
    /// Between tokens: skipping whitespace, looking for `<` or text.
    start,
    /// Just consumed `<` (or `</`).
    saw_lt,
    /// Inside a tag name.
    scanning_name,
    /// After a tag name or attribute: expecting an attribute, `/` or `>`.
    scanned_name,
    /// Inside an attribute name.
    scanning_attribute_name,
    /// After an attribute name: expecting `=`.
    scanned_attribute_name,
    /// After `=`: expecting an opening quote.
    expect_attribute_value,
    /// Inside a quoted attribute value.
    scanning_attribute_value,
    /// Inside character data between tags.
    scanning_text_content,
};
/// Convenience alias so callers can write `xml.parse(allocator)`.
pub const parse = Node.parse;

/// A node of the parsed document tree: either an element or a run of text.
pub const Node = union(enum) {
    element: Element,
    text: []const u8,

    /// A single `name="value"` pair of an element.
    pub const Attribute = struct {
        name: []const u8,
        value: []const u8,
    };

    /// An element with its tag name, attributes and child nodes in document order.
    pub const Element = struct {
        tag: []const u8,
        attributes: []const Attribute,
        children: []const Node,
    };

    /// Result of `parse`. The `attributes` and `children` slices are allocated
    /// from `arena`; names, values and text are the slices handed out by the
    /// tokenizer and are not copied.
    pub const Parsed = struct {
        root: Node,
        arena: std.heap.ArenaAllocator,

        /// Frees everything allocated while building the tree.
        pub fn deinit(self: *Parsed) void {
            self.arena.deinit();
        }
    };

    /// Reads one top-level node from `xml`.
    ///
    /// Returns `null` when the tokenizer is already at end of input, a text
    /// node when the first token is text, or the element tree when the first
    /// token opens a tag. Any other first token yields `error.UnexpectedToken`.
    /// Tokenizer and allocation errors are propagated; the arena is released
    /// on every error path.
    pub fn parse(xml: *Self, allocator: std.mem.Allocator) !?Parsed {
        var arena = std.heap.ArenaAllocator.init(allocator);
        errdefer arena.deinit();
        const alloc = arena.allocator();

        const root: Node = switch (try xml.next()) {
            .eof => return null,
            .open_tag => |tag_name| try parseNode(alloc, xml, tag_name),
            .text_content => |text| Node{ .text = text },
            else => return error.UnexpectedToken,
        };
        return Parsed{
            .root = root,
            .arena = arena,
        };
    }

    /// Builds the element called `name`, whose `open_tag` token has already
    /// been consumed, by reading tokens until the matching close tag.
    ///
    /// Fails with `error.UnexpectedEOF` if input ends first and with
    /// `error.UnexpectedClosingTag` if the closing tag does not match.
    fn parseNode(allocator: std.mem.Allocator, xml: *Self, name: []const u8) !Node {
        var child_list: std.ArrayList(Node) = .empty;
        errdefer child_list.deinit(allocator);
        var attribute_list: std.ArrayList(Attribute) = .empty;
        errdefer attribute_list.deinit(allocator);

        // Stays true until `>` ends the opening tag; while true the only
        // acceptable close is the self-closing form, reported with an empty name.
        var is_self_closing = true;

        while (true) {
            const token = try xml.next();
            switch (token) {
                .eof => return error.UnexpectedEOF,
                .attribute => |attr| try attribute_list.append(allocator, .{
                    .name = attr.name,
                    .value = attr.value,
                }),
                .open_tag_end => is_self_closing = false,
                .open_tag => |child_name| {
                    const child = try parseNode(allocator, xml, child_name);
                    try child_list.append(allocator, child);
                },
                .text_content => |text| try child_list.append(allocator, .{ .text = text }),
                .close_tag => |closing_name| {
                    const expected_name: []const u8 = if (is_self_closing) "" else name;
                    if (!std.mem.eql(u8, closing_name, expected_name)) {
                        return error.UnexpectedClosingTag;
                    }
                    const children = try child_list.toOwnedSlice(allocator);
                    const attributes = try attribute_list.toOwnedSlice(allocator);
                    return Node{
                        .element = .{
                            .tag = name,
                            .attributes = attributes,
                            .children = children,
                        },
                    };
                },
            }
        }
    }
};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment