Created
February 19, 2025 12:37
-
-
Save qknight/968e11fe2b1c7531493a3665ddcd7aad to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# main.rs | |
#[macro_use] | |
extern crate html5ever; | |
extern crate markup5ever_rcdom as rcdom; | |
use html5ever::parse_document; | |
use html5ever::tendril::TendrilSink; | |
use rcdom::{Handle, NodeData, RcDom}; | |
use std::io::Cursor; | |
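// Debug dump of the DOM tree, not proper HTML serialization: attribute values are
// printed unescaped and every element (even a void one like <br>) gets a close tag.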
fn walk(indent: usize, handle: &Handle) { | |
let node = handle; | |
for _ in 0..indent { | |
print!(" "); | |
} | |
let close_tag = match node.data { | |
NodeData::Document => {println!(""); None }, | |
NodeData::Doctype { | |
ref name, | |
ref public_id, | |
ref system_id, | |
} => { | |
let mut doctype = format!("<!DOCTYPE {}", name); | |
if !public_id.is_empty() { | |
doctype += &format!(" \"{}\"", public_id); | |
} | |
if !system_id.is_empty() { | |
doctype += &format!(" \"{}\"", system_id); | |
} | |
doctype += ">"; | |
println!("{}", doctype); | |
None | |
}, | |
NodeData::Text { ref contents } => { | |
println!("{}", contents.borrow().escape_default()); | |
None | |
}, | |
NodeData::Comment { ref contents } => { | |
println!("<!-- {} -->", contents.escape_default()); | |
None | |
}, | |
NodeData::Element { | |
ref name, | |
ref attrs, | |
.. | |
} => { | |
assert!(name.ns == ns!(html)); | |
print!("<{}", name.local); | |
for attr in attrs.borrow().iter() { | |
assert!(attr.name.ns == ns!()); | |
print!(" {}=\"{}\"", attr.name.local, attr.value); | |
} | |
println!(">"); | |
Some(format!("</{}>", name.local)) | |
}, | |
NodeData::ProcessingInstruction { .. } => { | |
unreachable!(); | |
None | |
}, | |
}; | |
for child in node.children.borrow().iter() { | |
walk(indent + 2, child); | |
} | |
match close_tag { | |
Some(tag) => { | |
println!("{}{}", " ".repeat(indent), tag); | |
}, | |
None => {} | |
} | |
} | |
fn main() { | |
let html = "<!DOCTYPE html><!-- asdf --><html><head><title>Test</title></head><body><h1>Hello, World!</h1></body></html>"; | |
let dom = parse_document(RcDom::default(), Default::default()) | |
.from_utf8() | |
.read_from(&mut Cursor::new(html.as_bytes())) | |
.unwrap(); | |
walk(0, &dom.document); | |
if !dom.errors.borrow().is_empty() { | |
println!("\nParse errors:"); | |
for err in dom.errors.borrow().iter() { | |
println!(" {err}"); | |
} | |
} | |
} | |
# Cargo.toml | |
[package] | |
name = "z" | |
version = "0.1.0" | |
edition = "2021" | |
[dependencies] | |
chrono = "0.4.39" | |
html5ever = { version = "0.29.0", features = ["trace_tokenizer"] } | |
lazy_static = "1.5.0" | |
markup5ever = "0.14.0" | |
markup5ever_rcdom = "0.5.0-unofficial" | |
prettyish-html = "0.1.1" | |
regex = "1.11.1" | |
time-util = { version = "0.3.4", features = ["chrono"] } | |
Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.41s | |
Running `target/debug/z` | |
# output | |
<!DOCTYPE html> | |
<!-- asdf --> | |
<html> | |
<head> | |
<title> | |
Test | |
</title> | |
</head> | |
<body> | |
<h1> | |
Hello, World! | |
</h1> | |
</body> | |
</html> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment