Created
March 8, 2013 08:32
-
-
Save coffeeaddict/5115012 to your computer and use it in GitHub Desktop.
HTML 2 Markdown using a SAX Parser. (WIP)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# html2markdown.rb | |
require 'nokogiri' | |
module Html2Markdown | |
# SAX handler that turns a stream of HTML parse events into Markdown text.
#
# The parser calls #start_element, #characters and #end_element. Every tag
# name is lower-cased, passed through MAP, and then dispatched to an optional
# `handle_<tag>` (opening) / `end_<tag>` (closing) method. Tags without a
# handler are ignored. The generated Markdown accumulates in a buffer that is
# exposed through #captured.
class HtmlDocument < Nokogiri::XML::SAX::Document
  # Tags that are rewritten to another tag before dispatch.
  MAP = { "b" => "strong", "i" => "em" }.freeze

  # Elements whose direct text content is copied to the output. This must be
  # an Array (%w): with a plain String, include? is a substring test and
  # raises TypeError for a nil element.
  TEXT_ELEMENTS = %w[p li h1 h2 h3 h4 h5 strong b i em code].freeze

  def initialize
    @list = nil           # kind of the innermost open list (:ul / :ol / nil)
    @list_depth = 0       # number of currently open lists
    @ol_count = 0         # last number emitted in the innermost ordered list
    @element = nil        # innermost currently open element
    @element_stack = []   # open elements, outermost first
    @list_stack = []      # saved [@list, @ol_count] of the enclosing lists
    # String.new keeps the buffer mutable even under frozen string literals.
    @capture_buffer = String.new("")
    super()
  end

  # @return [String] the Markdown produced so far
  def captured
    @capture_buffer
  end

  # SAX callback for text nodes. Text is kept only when the innermost open
  # element is one of TEXT_ELEMENTS; anything else is reported on stderr.
  #
  # @param string [String] the text content
  def characters(string)
    if TEXT_ELEMENTS.include?(@element)
      @capture_buffer << string
    else
      $stderr.puts "Would have added #{string}, but do not know #{@element}"
    end
  end

  # SAX callback for an opening tag.
  #
  # @param name [String] tag name as reported by the parser
  # @param attributes [Array] attributes of the tag; passed on only to
  #   handlers that declare a required parameter
  def start_element(name, attributes = [])
    name = canonical_name(name)
    @element_stack.push(name)
    @element = name

    handler = :"handle_#{name}"
    return unless respond_to?(handler)

    if method(handler).arity > 0
      send(handler, attributes)
    else
      send(handler)
    end
  end

  # SAX callback for a closing tag. Prefers `end_<tag>`; when that does not
  # exist it re-runs `handle_<tag>`, which is how symmetric markers such as
  # `**`, `*` and "`" get closed.
  #
  # @param name [String] tag name as reported by the parser
  def end_element(name)
    name = canonical_name(name)
    close_element(name)

    handler = :"end_#{name}"
    handler = :"handle_#{name}" unless respond_to?(handler)
    send(handler) if respond_to?(handler)
  end

  # Opening <p> emits nothing; the paragraph break is written on close.
  def handle_p
  end
  alias_method :handle_br, :handle_p

  def end_p
    @capture_buffer << "\n\n"
  end

  def end_br
    @capture_buffer << "\n"
  end

  def handle_strong
    @capture_buffer << "**"
  end

  def handle_em
    @capture_buffer << "*"
  end

  # handle_h1..handle_h5 write the "#" prefix, end_h1..end_h5 the line break.
  (1..5).each do |level|
    prefix = "#{'#' * level} "
    define_method(:"handle_h#{level}") { @capture_buffer << prefix }
    define_method(:"end_h#{level}") { @capture_buffer << "\n" }
  end

  def handle_code
    @capture_buffer << "`"
  end

  def handle_ul
    open_list(:ul)
  end

  def handle_ol
    open_list(:ol)
  end

  # Closes the innermost list and restores the kind and numbering of the
  # enclosing list, so items that follow a nested list keep their marker.
  def end_ul
    @list, @ol_count = @list_stack.pop || [nil, 0]
    @list_depth = [@list_depth - 1, 0].max
  end
  alias_method :end_ol, :end_ul

  # Writes the item marker: "N. " inside an ordered list, "* " otherwise.
  def handle_li
    # Clamp at zero so an <li> outside any list does not raise on " " * -1.
    indent = " " * [@list_depth - 1, 0].max
    mark = @list == :ol ? "#{@ol_count += 1}. " : "* "
    @capture_buffer << indent << mark
  end

  def end_li
    @capture_buffer << "\n"
  end

  private

  # Lower-cases the tag without mutating the parser's string, then applies MAP.
  def canonical_name(name)
    tag = name.downcase
    MAP.fetch(tag, tag)
  end

  # Drops `name` (and anything left open inside it) from the element stack
  # and makes the enclosing element current again.
  def close_element(name)
    index = @element_stack.rindex(name)
    @element_stack.slice!(index..-1) if index
    @element = @element_stack.last
  end

  # Remembers the enclosing list's state and starts a new list of `kind`.
  def open_list(kind)
    @list_stack.push([@list, @ol_count])
    @list = kind
    @ol_count = 0
    @list_depth += 1
  end
end
# Converts HTML into Markdown.
#
# Builds a fresh HtmlDocument handler, runs Nokogiri's HTML SAX parser over
# +object+ with it, and returns whatever the handler captured.
#
# @param object the HTML input, handed unchanged to the parser's #parse
# @return [String] the captured Markdown
def self.convert(object)
  HtmlDocument.new.tap { |handler|
    Nokogiri::HTML::SAX::Parser.new(handler).parse(object)
  }.captured
end
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment