-
-
Save treffynnon/2881418 to your computer and use it in GitHub Desktop.
Tumblr to Hakyll (Markdown and reStructuredText) migration
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Installer for the tumblr migration script on Debian-based systems.
#
# Optionally installs the apt packages and ruby gems the script needs, then
# downloads tumblr.rb into the current directory, makes it executable and
# creates the _posts and _images output directories.

# Prompt with "$1" and succeed when the answer is empty, "y" or "Y".
# -r stops read from treating backslashes in the answer as escapes.
confirm() {
    read -r -p "$1"
    [[ -z "$REPLY" || "$REPLY" == "y" || "$REPLY" == "Y" ]]
}

echo "This script will download and install the tumblr migration script"
echo "script and its dependencies on Debian systems."
echo " "

if confirm "Install dependencies from apt-get? [Yn] "; then
    echo "Installing the dependencies..."
    echo " "
    sudo apt-get install libxslt-dev libxml2-dev ruby rubygems pandoc
fi

if confirm "Install required ruby gems? [Yn] "; then
    echo "Installing the gems..."
    echo " "
    sudo gem install nokogiri pandoc-ruby mime-types sanitize
fi

# -O overwrites an existing copy; without it a re-run saves tumblr.rb.1 and
# the chmod calls below would act on the stale file.
if ! wget -O tumblr.rb https://raw.github.com/gist/2881418/tumblr.rb; then
    echo "Failed to download tumblr.rb" >&2
    exit 1
fi
chmod +x tumblr.rb
chmod g+w tumblr.rb

# -p keeps a re-run from failing when the directories already exist.
mkdir -p _posts _images

echo "Update TUMBLR_DOMAIN in tumblr.rb to the address of your tumblr blog and"
echo "set AUTHOR to your name"
echo " "
echo "To convert the body of posts to a different format using pandoc change"
echo "CONVERT_TO to a format pandoc understands. The default value of false"
echo "bypasses the conversion process altogether. For example to convert to"
echo "Markdown set it to md and for reStructuredText set it to rst."
echo " "
echo "Then run ./tumblr.rb to do the download."
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# Script to import tumblr posts into local markdown posts ready to be consumed by Hakyll. | |
# Inspired by New Bamboo's post: | |
# http://blog.new-bamboo.co.uk/2009/2/20/migrating-from-mephisto-to-jekyll | |
# | |
# Supports post types: regular, quote, link, photo, video and audio | |
# Saves local copies of images | |
# | |
# Changes in this fork: | |
# ===================== | |
# | |
# - Ability to convert the body of the post to another format such as .md or .rst | |
# - File name slug truncated to protect from over length file names | |
# - Moved images to current dir | |
# - Added install and usage instructions | |
# - Fix title cleaning by stripping new lines and HTML and trimming and squeezing spaces | |
# | |
# Installation | |
# ============ | |
# | |
# Either run the install script that accompanies this file or follow the manual instructions below.
# | |
# Install deps on Ubuntu: | |
# ----------------------- | |
# | |
# sudo apt-get install libxslt-dev libxml2-dev ruby rubygems pandoc | |
# | |
# Install required ruby gems: | |
# --------------------------- | |
# | |
# sudo gem install mime-types nokogiri pandoc-ruby sanitize
# | |
# Setup: | |
# ------ | |
# | |
# mkdir _images _posts | |
# sudo chmod +x tumblr.rb | |
# | |
# Update `TUMBLR_DOMAIN` below to the address of your tumblr blog and set | |
# `AUTHOR` to your name. | |
# | |
# To convert the body of posts to a different format using pandoc change | |
# `CONVERT_TO` to a format pandoc understands. The default value of false | |
# bypasses the conversion process altogether. For example to convert to | |
# Markdown set it to `md` and for reStructuredText set it to `rst`. | |
# | |
# Usage: | |
# ------ | |
# | |
# ./tumblr.rb | |
require 'rubygems' | |
require 'open-uri' | |
require 'nokogiri' | |
require 'net/http' | |
require 'mime/types' | |
require 'fileutils' | |
require 'pathname' | |
require 'date' | |
require 'pandoc-ruby' | |
require 'sanitize' | |
# Configuration
#
# String settings are frozen so they cannot be mutated in place by accident.

# Base URL of the tumblr blog; the API read path is appended to it.
TUMBLR_DOMAIN = "http://blog.example.org".freeze
# Directory the generated post files are written to.
WRITE_DIRECTORY = "_posts".freeze
# Directory downloaded images are stored under.
IMAGE_DIRECTORY = "_images".freeze
# Value written to the `author:` field of every generated post.
AUTHOR = "".freeze
# Pandoc output format for post bodies (also used as the post file extension),
# or false to leave the HTML untouched and write .html files.
CONVERT_TO = false
# Performs an HTTP GET on +uri_str+, following 3xx redirections.
#
# uri_str - absolute URL to request (String)
# limit   - number of requests that may still be made before giving up
#
# Returns the Net::HTTPSuccess response.
# Raises ArgumentError when the redirect chain is longer than +limit+; any
# other non-success response is raised through Net::HTTPResponse#error!.
def fetch(uri_str, limit = 10)
  raise ArgumentError, 'HTTP redirect too deep' if limit == 0

  response = Net::HTTP.get_response(URI.parse(uri_str))
  case response
  when Net::HTTPSuccess
    response
  when Net::HTTPRedirection
    # The Location header may be relative ("/new/path"); resolve it against
    # the URL that was just requested so the next hop keeps scheme and host.
    fetch(URI.join(uri_str, response['location']).to_s, limit - 1)
  else
    response.error!
  end
end
# Saves a local copy of a tumblr-hosted image and returns its site-relative uri.
#
# The image is downloaded with +fetch+ and written to
# IMAGE_DIRECTORY/<host><path>, with the file extension rewritten to match the
# response's content type ("jpeg" is normalised to "jpg"). When the content
# type is missing or unknown to MIME::Types the extension from the URL is kept.
#
# uri_str - absolute URL of the image (String)
#
# Returns "/images/<host><path>" (String).
# NOTE(review): the returned prefix is "/images" while files are stored under
# IMAGE_DIRECTORY - presumably the site build maps one to the other; confirm.
def fetch_img(uri_str)
  uri = URI.parse(uri_str)
  resp = fetch(uri_str)

  # Drop parameters such as "; charset=binary" before the registry lookup.
  content_type = resp["content-type"].to_s.split(";").first.to_s.strip
  mime_type = MIME::Types[content_type].first

  # build our local image path
  path = "#{uri.host}#{uri.path}"

  # rewrite extension, but only when the content type is recognised
  extension = mime_type && mime_type.extensions.first
  extension = "jpg" if extension == "jpeg"
  path = "#{path.chomp(File.extname(path))}.#{extension}" if extension

  print "Image: #{uri_str} --> #{path}\n"

  local_path = "#{IMAGE_DIRECTORY}/#{path}"
  FileUtils.mkdir_p File.dirname(local_path)
  # File.binwrite opens, writes and closes the file in binary mode in one call.
  File.binwrite(local_path, resp.body)

  "/images/#{path}"
end
# Returns the text content of the first element under +node+ that matches the
# CSS +selector+, or nil when there is no such element.
def first_content(node, selector)
  tag = node.css(selector).first
  tag && tag.content
end

# Tumblr api only returns 50 posts per call, so keep requesting pages until one
# comes back empty.
post_offset = 0
posts_returned = -1
while posts_returned != 0
  path = TUMBLR_DOMAIN + "/api/read?num=50&filter=none&start=#{post_offset}"

  # Connect to Tumblr and read the API source.
  # Kernel#open no longer opens URLs on Ruby 3+, so use open-uri's URI.open.
  URI.open(path) do |xml|
    doc = Nokogiri::XML.parse(xml)
    posts = doc.css("post")
    posts_returned = posts.count
    post_offset += posts.count

    posts.each do |post_tag|
      # Gather data about each post
      date = Date.parse(post_tag.attributes["date"].content)
      id = post_tag.css("@id").first.content
      slug = first_content(post_tag, "slug")
      type = post_tag.attributes["type"].content
      tags = post_tag.css("tag").map { |t| t.content }
      title = nil
      text = nil
      body = nil

      # Optional elements are read through first_content so a post that lacks
      # one is handled (or skipped below) instead of crashing on nil.
      case type
      when "regular"
        title = first_content(post_tag, "regular-title")
        body = first_content(post_tag, "regular-body")
      when "quote"
        text = first_content(post_tag, "quote-text")
        source = first_content(post_tag, "quote-source")
        body = "> #{text}"
        body += "\n\n#{source}" if source
      when "link"
        text = first_content(post_tag, "link-text")
        link = first_content(post_tag, "link-url")
        body = "<a href=\"#{link}\">#{text}</a>"
        description = first_content(post_tag, "link-description")
        body += "\n\n#{description}" if description
      when "photo"
        # Without a photoset the post itself carries the photo-url elements;
        # with one, every <photo> contributes its first photo-url.
        photo_tags = post_tag.css("photoset").first.nil? ? [post_tag] : post_tag.css("photo")
        body = photo_tags.map do |photo_tag|
          "<img src=\"#{fetch_img(photo_tag.css("photo-url").first.content)}\" />"
        end.join
        text = first_content(post_tag, "photo-caption")
        body += "\n\n#{text}" if text
      when "video"
        text = first_content(post_tag, "video-caption")
        body = first_content(post_tag, "video-source")
      when "audio"
        text = first_content(post_tag, "audio-caption")
        body = first_content(post_tag, "audio-player")
      else
        print "ERROR: Post type not supported\n"
        next
      end

      unless title || text
        print "ERROR: Post title and text are nil: #{id}\n"
        next
      end

      # title defaults to the post text: strip html, collapse all whitespace
      # (including new lines) to single spaces and trim
      title ||= text
      title = Sanitize.clean(title).gsub(/\s+/, " ").strip
      title = title.length > 64 ? (title[0, 64] + "…") : title # limit length to W3C maximum title length

      # create the slug if necessary and build a _post filename
      if slug.nil?
        slug = title.gsub(/(\s|[^a-zA-Z0-9])/, "-").gsub(/-+/, '-').gsub(/-$/, '').downcase
      end
      # A title without ASCII letters or digits yields an empty slug; fall back
      # to the post id so such posts do not overwrite each other's files.
      slug = id if slug.empty?
      slug = slug[0, 155]

      filename_extension = CONVERT_TO || "html"
      filename = "#{date.strftime("%Y-%m-%d")}-#{slug}.#{filename_extension}"

      # if there's no post, we give up.
      next unless body

      if CONVERT_TO
        body = PandocRuby.new(body, :from => :html, :to => CONVERT_TO)
      end

      tagcode = tags.empty? ? "" : "\ntags: #{tags.join(", ")}"

      hakyll_post = [
        "---",
        "title: #{title}",
        "author: #{AUTHOR}#{tagcode}",
        "---",
        body.to_s
      ].join("\n") + "\n"

      # Write files; the block form closes the handle even if the write raises.
      puts filename
      File.open("#{WRITE_DIRECTORY}/#{filename}", "w") { |file| file.write(hakyll_post) }
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment