Skip to content

Instantly share code, notes, and snippets.

@shreyassanthu77
Last active August 26, 2025 00:12
Show Gist options
  • Save shreyassanthu77/6c49394c7694a887c4589271aeafa041 to your computer and use it in GitHub Desktop.
Save shreyassanthu77/6c49394c7694a887c4589271aeafa041 to your computer and use it in GitHub Desktop.
A very minimal and dumb XML parser written in Zig
const std = @import("std");
// This file is itself the tokenizer struct; `Self` names that struct.
const Self = @This();
/// Input being tokenized. It is sentinel-terminated: `next` indexes it one
/// byte at a time and treats a 0 byte as the end of input.
source: [:0]const u8,
/// Byte offset into `source` of the next unread byte.
pos: usize = 0,
/// State in which the following call to `next` resumes. `next` only ever
/// stores `.start` or `.scanned_name` here; the other states are transient
/// within a single call.
state: State = .start,
/// Scans and returns the next token from `source`, advancing `pos`.
///
/// Token sequence for an element: `open_tag` (the name), zero or more
/// `attribute`s, then either `open_tag_end` for `>` or `close_tag` with an
/// empty name for a self-closing `/>`. `</name>` yields `close_tag` with the
/// name. Text between tags is returned as `text_content` with leading and
/// trailing whitespace excluded. `.eof` is returned once the terminating 0
/// byte is reached between tokens.
///
/// Processing instructions (`<? ... ?>`) and comments (`<!-- ... -->`) are
/// skipped. Any other `<!` construct is skipped up to the first `>`.
///
/// All returned slices point into `source`; nothing is allocated.
///
/// Errors: `UnexpectedEOF`, `UnexpectedCharacter`, `EmptyTagName`,
/// `ExpectedAttributeValue`, `UnterminatedProcessingInstruction`,
/// `UnterminatedComment`.
pub fn next(self: *Self) !Token {
    // Start of the slice currently being scanned (name, value or text).
    var start = self.pos;
    // Where the current run of trailing whitespace in text began, if any.
    var trailing_ws_start: ?usize = null;
    // Quote character that opened the attribute value being scanned.
    var quote: u8 = 0;
    var tok: Token = .eof;
    parsing: switch (self.state) {
        .start => switch (self.source[self.pos]) {
            0 => return .eof,
            ' ', '\t', '\r', '\n' => {
                self.pos += 1;
                continue :parsing .start;
            },
            '<' => {
                self.pos += 1;
                continue :parsing .saw_lt;
            },
            else => {
                start = self.pos;
                trailing_ws_start = null;
                continue :parsing .scanning_text_content;
            },
        },
        .saw_lt => switch (self.source[self.pos]) {
            0 => return error.UnexpectedEOF,
            '?' => {
                // Processing instruction: skip past the closing "?>".
                self.pos += 1;
                const close = std.mem.indexOfPos(u8, self.source, self.pos, "?>") orelse
                    return error.UnterminatedProcessingInstruction;
                self.pos = close + 2;
                continue :parsing .start;
            },
            '!' => {
                self.pos += 1;
                const rest: []const u8 = self.source[self.pos..];
                if (std.mem.startsWith(u8, rest, "--")) {
                    // Comment: it may contain '>', so only "-->" ends it.
                    // Search from index 2 so the opening "--" cannot be
                    // reused as part of the terminator.
                    const close = std.mem.indexOfPos(u8, rest, 2, "-->") orelse
                        return error.UnterminatedComment;
                    self.pos += close + 3;
                } else {
                    // Other declarations: skip to the first '>'.
                    const close = std.mem.indexOfScalar(u8, rest, '>') orelse
                        return error.UnterminatedComment;
                    self.pos += close + 1;
                }
                continue :parsing .start;
            },
            '/' => {
                // "</": remember that this is a closing tag, then scan its
                // name exactly like an opening tag's name.
                self.pos += 1;
                tok = .{ .close_tag = "" };
                continue :parsing .saw_lt;
            },
            'a'...'z', 'A'...'Z', '_' => {
                start = self.pos;
                self.pos += 1;
                continue :parsing .scanning_name;
            },
            else => return error.UnexpectedCharacter,
        },
        inline .scanning_name,
        .scanning_attribute_name,
        => |s| switch (self.source[self.pos]) {
            0 => return error.UnexpectedEOF,
            'a'...'z', 'A'...'Z', '0'...'9', '_', '-' => {
                self.pos += 1;
                continue :parsing s;
            },
            else => {
                if (self.pos == start) return error.EmptyTagName;
                const name = self.source[start..self.pos];
                if (s == .scanning_name) {
                    if (tok == .close_tag) {
                        // Closing tag: keep going until its '>' is consumed.
                        tok = .{ .close_tag = name };
                        continue :parsing .scanned_name;
                    } else {
                        // Opening tag: emit the name now and resume inside
                        // the tag on the following call.
                        self.state = .scanned_name;
                        tok = .{ .open_tag = name };
                    }
                } else if (s == .scanning_attribute_name) {
                    // The value is filled in by `.scanning_attribute_value`
                    // before the token is returned.
                    tok = .{ .attribute = .{
                        .name = name,
                        .value = undefined,
                    } };
                    continue :parsing .scanned_attribute_name;
                } else comptime unreachable;
            },
        },
        .scanned_name => switch (self.source[self.pos]) {
            0 => return error.UnexpectedEOF,
            ' ', '\t', '\r', '\n' => {
                self.pos += 1;
                continue :parsing .scanned_name;
            },
            '/' => {
                // Self-closing tag: reported as a close tag with no name.
                self.pos += 1;
                tok = .{ .close_tag = "" };
                continue :parsing .scanned_name;
            },
            '>' => {
                self.pos += 1;
                self.state = .start;
                if (tok != .close_tag) {
                    tok = .open_tag_end;
                }
            },
            else => {
                start = self.pos;
                continue :parsing .scanning_attribute_name;
            },
        },
        .scanned_attribute_name => switch (self.source[self.pos]) {
            0 => return error.UnexpectedEOF,
            ' ', '\t', '\r', '\n' => {
                self.pos += 1;
                continue :parsing .scanned_attribute_name;
            },
            '=' => {
                self.pos += 1;
                continue :parsing .expect_attribute_value;
            },
            else => return error.ExpectedAttributeValue,
        },
        .expect_attribute_value => switch (self.source[self.pos]) {
            0 => return error.UnexpectedEOF,
            // Whitespace is allowed after '=' just as it is before it.
            ' ', '\t', '\r', '\n' => {
                self.pos += 1;
                continue :parsing .expect_attribute_value;
            },
            '"', '\'' => |q| {
                self.pos += 1;
                start = self.pos;
                quote = q;
                continue :parsing .scanning_attribute_value;
            },
            else => return error.ExpectedAttributeValue,
        },
        .scanning_attribute_value => switch (self.source[self.pos]) {
            0 => return error.UnexpectedEOF,
            '"', '\'' => |q| {
                if (q != quote) {
                    // The other kind of quote is ordinary value content.
                    self.pos += 1;
                    continue :parsing .scanning_attribute_value;
                }
                const value = self.source[start..self.pos];
                self.pos += 1;
                self.state = .scanned_name;
                tok.attribute.value = value;
            },
            else => {
                self.pos += 1;
                continue :parsing .scanning_attribute_value;
            },
        },
        .scanning_text_content => switch (self.source[self.pos]) {
            0, '<' => {
                // Text always begins on a non-whitespace byte, so the slice
                // is never empty; cut it before any trailing whitespace.
                const end = trailing_ws_start orelse self.pos;
                self.state = .start;
                tok = .{ .text_content = self.source[start..end] };
            },
            ' ', '\t', '\r', '\n' => {
                if (trailing_ws_start == null) trailing_ws_start = self.pos;
                self.pos += 1;
                continue :parsing .scanning_text_content;
            },
            else => {
                // More text followed the whitespace, so it was interior.
                trailing_ws_start = null;
                self.pos += 1;
                continue :parsing .scanning_text_content;
            },
        },
    }
    return tok;
}
/// A single lexical unit produced by `next`. All slices point into the
/// tokenizer's `source`.
pub const Token = union(enum) {
    /// End of input reached between tokens.
    eof,
    /// Name of an opening tag (`<name`), emitted before its attributes.
    open_tag: []const u8,
    /// One `name="value"` or `name='value'` pair; `value` excludes the quotes.
    attribute: struct {
        name: []const u8,
        value: []const u8,
    },
    /// The `>` that ends an opening tag which is not self-closing.
    open_tag_end,
    /// Text between tags, without leading or trailing whitespace.
    text_content: []const u8,
    /// Tag name for `</name>`, or an empty string for a self-closing `/>`.
    close_tag: []const u8,

    /// Writes a human-readable rendering of the token to `writer`.
    pub fn format(this: @This(), writer: *std.io.Writer) !void {
        switch (this) {
            .eof => try writer.writeAll("eof"),
            .open_tag => |name| try writer.print("open_tag: '{s}'", .{name}),
            .close_tag => |name| try writer.print("close_tag: '{s}'", .{name}),
            .open_tag_end => try writer.writeAll("open_tag_end"),
            // `value` is a plain slice, not an optional, so it is always printed.
            .attribute => |attr| try writer.print(
                "attribute: '{s}' = '{s}'",
                .{ attr.name, attr.value },
            ),
            .text_content => |text| try writer.print("text_content: '{s}'", .{text}),
        }
    }
};
/// States of the tokenizer's state machine driven by `next`.
const State = enum {
    /// Between tokens: whitespace is skipped, `<` begins markup, anything
    /// else begins text content.
    start,
    /// Just consumed `<` (or `</`); the next byte decides what kind of
    /// markup follows.
    saw_lt,
    /// Inside a tag name.
    scanning_name,
    /// Inside a tag after its name or after an attribute: whitespace is
    /// skipped, then `/`, `>` or the start of an attribute name is expected.
    scanned_name,
    /// Inside an attribute name.
    scanning_attribute_name,
    /// After an attribute name: whitespace is skipped, then `=` is expected.
    scanned_attribute_name,
    /// After `=`: the opening quote of the value is expected.
    expect_attribute_value,
    /// Inside a quoted attribute value, until the matching quote.
    scanning_attribute_value,
    /// Inside text content, until `<` or the end of input.
    scanning_text_content,
};
/// Alias for `Node.parse`, exposed on the tokenizer type itself.
pub const parse = Node.parse;
/// A node of the parsed document tree: either an element or a run of text.
/// String slices are the ones returned by the tokenizer; the attribute and
/// child arrays are allocated from the arena held by `Parsed`.
pub const Node = union(enum) {
    element: Element,
    text: []const u8,

    /// A single attribute of an element.
    pub const Attribute = struct {
        name: []const u8,
        value: []const u8,
    };

    /// An element with its tag name, attributes and child nodes.
    pub const Element = struct {
        tag: []const u8,
        attributes: []const Attribute,
        children: []const Node,
    };

    /// Result of `parse`: the root node plus the arena backing the tree.
    pub const Parsed = struct {
        root: Node,
        arena: std.heap.ArenaAllocator,

        /// Frees every allocation made while building the tree.
        pub fn deinit(self: *Parsed) void {
            self.arena.deinit();
        }
    };

    /// Reads one top-level node from `xml`.
    ///
    /// Returns `null` when the tokenizer is already at end of input, a text
    /// node when the first token is text, or the fully parsed element when
    /// the first token opens a tag. Any other first token yields
    /// `error.UnexpectedToken`. The caller must call `deinit` on the result.
    pub fn parse(xml: *Self, allocator: std.mem.Allocator) !?Parsed {
        var arena = std.heap.ArenaAllocator.init(allocator);
        errdefer arena.deinit();

        const root: Node = switch (try xml.next()) {
            .eof => return null,
            .open_tag => |tag_name| try parseNode(arena.allocator(), xml, tag_name),
            .text_content => |text| Node{ .text = text },
            else => return error.UnexpectedToken,
        };
        // Copy the arena only after all allocations are done, so the
        // returned value carries its final state.
        return Parsed{ .root = root, .arena = arena };
    }

    /// Parses the remainder of the element named `name`, whose `open_tag`
    /// token has already been consumed, recursing into child elements.
    fn parseNode(allocator: std.mem.Allocator, xml: *Self, name: []const u8) !Node {
        var child_list: std.ArrayList(Node) = .empty;
        errdefer child_list.deinit(allocator);
        var attribute_list: std.ArrayList(Attribute) = .empty;
        errdefer attribute_list.deinit(allocator);

        // Set once the opening tag's `>` has been seen; until then the only
        // acceptable close is the nameless one produced by `/>`.
        var saw_open_tag_end = false;

        while (true) {
            const token = try xml.next();
            switch (token) {
                .eof => return error.UnexpectedEOF,
                .attribute => |attr| try attribute_list.append(allocator, .{
                    .name = attr.name,
                    .value = attr.value,
                }),
                .open_tag_end => saw_open_tag_end = true,
                .open_tag => |child_name| {
                    const child = try parseNode(allocator, xml, child_name);
                    try child_list.append(allocator, child);
                },
                .text_content => |text| try child_list.append(allocator, .{ .text = text }),
                .close_tag => |closing| {
                    const expected: []const u8 = if (saw_open_tag_end) name else "";
                    if (!std.mem.eql(u8, closing, expected)) {
                        return error.UnexpectedClosingTag;
                    }
                    const children = try child_list.toOwnedSlice(allocator);
                    const attributes = try attribute_list.toOwnedSlice(allocator);
                    return Node{ .element = .{
                        .tag = name,
                        .attributes = attributes,
                        .children = children,
                    } };
                },
            }
        }
    }
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment