Last active
July 11, 2023 18:48
-
-
Save wvengen/3a93e1f55d5d412b1d1b197150550edd to your computer and use it in GitHub Desktop.
Parsing an XML stream from an HTTP endpoint in Ruby.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
#
# Example of parsing a remote XML stream.
#
# Fetches data from an HTTP endpoint, and processes it bit by bit, without
# loading the whole file into memory.
#
# This example shows recent articles from the W3C blog feed.
#
require 'nokogiri' | |
require 'net/http' | |
# URL of the feed that is fetched and parsed as a stream.
URL = 'https://www.w3.org/blog/news/feed/atom'.freeze

# Local name and namespace URI of the XML elements to extract.
TAG_NAME = 'entry'.freeze
TAG_NAMESPACE = 'http://www.w3.org/2005/Atom'.freeze

puts("The most recent W3C blog articles are:")

# Set up the request; it is only sent later, once the streaming code
# opens the connection.
uri = URI(URL)
req = Net::HTTP::Get.new(uri.request_uri)
# Read the response in a separate thread, using a pipe to communicate: the
# background thread writes each received body chunk to +wr+, while the main
# thread lets Nokogiri pull from +rd+. This way the document is never held
# in memory as a whole.
IO.pipe do |rd, wr|
  rd.binmode
  wr.binmode

  reader_thread = nil
  parsed = false
  begin
    reader_thread = Thread.new do
      # Ensure that exceptions bubble up to the main thread. This is set from
      # inside the thread so that there is no window in which a failure could
      # go unnoticed.
      Thread.current.abort_on_exception = true
      begin
        Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https') do |http|
          http.request(req) do |response|
            response.value # raise error when response is not successful
            response.read_body { |chunk| wr.write(chunk) }
          end
        end
      ensure
        # Always close the write end (on success, on failure and when the
        # thread is killed), so the reading side sees end-of-file instead of
        # waiting forever.
        wr.close
      end
    end

    # wait until data is ready, so that early errors don't confuse Nokogiri
    IO.select([rd])

    # parse the incoming data chunk by chunk
    reader = Nokogiri::XML.Reader(rd) { |cfg| cfg.recover.nonet.compact }
    reader.each do |node|
      # only opening tags of the desired element in the desired namespace
      next unless node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
      next unless node.namespace_uri == TAG_NAMESPACE
      next unless node.local_name == TAG_NAME

      # now that we have the desired fragment, put it to use
      doc = Nokogiri::XML(node.outer_xml) { |cfg| cfg.recover.nonet.compact }
      title = doc.children.xpath('atom:title', atom: TAG_NAMESPACE)
      # the title text is run through the HTML parser before printing
      # (presumably to decode entities embedded in the feed)
      puts("- #{Nokogiri::HTML(title.text).text}")
    end
    parsed = true
  ensure
    if reader_thread
      # Don't hang with a full pipe buffer when parsing was aborted: nobody
      # reads from the pipe anymore, so a blocked writer must be stopped.
      reader_thread.kill unless parsed
      # Let the reader thread finish cleanly (re-raises its error, if any).
      reader_thread.join
    end
  end
end

puts("Done!")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment