treffynnon · June 6, 2012 11:40
diff --git a/install.sh b/install.sh
 #!/usr/bin/env bash
 #
 # Downloads tumblr.rb into the current directory, optionally installing the
 # apt packages and ruby gems it needs first, then creates the output
 # directories and prints the remaining manual configuration steps.

 # Ask a yes/no question (default yes).
 # $1 - the question text; " [Yn] " is appended to form the prompt.
 # Succeeds when the reply is empty, "y" or "Y".
 confirm() {
     read -r -p "$1 [Yn] "
     [[ -z "$REPLY" || "$REPLY" == "y" || "$REPLY" == "Y" ]]
 }

 echo "This script will download and install the tumblr migration"
 echo "script and its dependencies on Debian systems."
 echo " "

 if confirm "Install dependencies from apt-get?"; then
     echo "Installing the dependencies..."
     echo " "
     sudo apt-get install libxslt-dev libxml2-dev ruby rubygems pandoc
 fi

 if confirm "Install required ruby gems?"; then
     echo "Installing the gems..."
     echo " "
     sudo gem install nokogiri pandoc-ruby mime-types sanitize
 fi

 # Stop here if the download fails: everything below needs tumblr.rb.
 if ! wget https://raw.github.com/gist/2881418/tumblr.rb; then
     echo "Failed to download tumblr.rb" >&2
     exit 1
 fi
 chmod +x tumblr.rb
 chmod g+w tumblr.rb
 # -p so that running the installer a second time does not fail
 mkdir -p _posts _images

 echo "Update TUMBLR_DOMAIN in tumblr.rb to the address of your tumblr blog and"
 echo "set AUTHOR to your name"
 echo " "
 echo "To convert the body of posts to a different format using pandoc change"
 echo "CONVERT_TO to a format pandoc understands. The default value of false"
 echo "bypasses the conversion process altogether. For example to convert to"
 echo "Markdown set it to md and for reStructuredText set it to rst."
 echo " "
 echo "Then run ./tumblr.rb to do the download."
diff --git a/tumblr.rb b/tumblr.rb
 #!/usr/bin/env ruby

 # Script to import tumblr posts into local markdown posts ready to be consumed by Hakyll.
 # Inspired by New Bamboo's post:
 # http://blog.new-bamboo.co.uk/2009/2/20/migrating-from-mephisto-to-jekyll
 #
 # Supports post types: regular, quote, link, photo, video and audio
 # Saves local copies of images
 #
 # Changes in this fork:
 # =====================
 #
 #  - Ability to convert the body of the post to another format such as .md or .rst
 #  - File name slug truncated to protect from over length file names
 #  - Moved images to current dir
 #  - Added install and usage instructions
 #  - Fix title cleaning by stripping new lines and HTML and trimming and squeezing spaces
 #
 # Installation
 # ============
 #
 # Either run the accompanying install.sh script or follow the manual instructions below.
 #
 # Install deps on Ubuntu:
 # -----------------------
 #
 #     sudo apt-get install libxslt-dev libxml2-dev ruby rubygems pandoc
 #
 # Install required ruby gems:
 # ---------------------------
 #
 #     sudo gem install mime-types nokogiri pandoc-ruby sanitize
 #
 # Setup:
 # ------
 #
 #     mkdir _images _posts
 #     sudo chmod +x tumblr.rb
 #
 #   Update `TUMBLR_DOMAIN` below to the address of your tumblr blog and set 
 #   `AUTHOR` to your name.
 #
 #   To convert the body of posts to a different format using pandoc change
 #   `CONVERT_TO` to a format pandoc understands. The default value of false
 #   bypasses the conversion process altogether. For example to convert to
 #   Markdown set it to `md` and for reStructuredText set it to `rst`.
 #
 # Usage:
 # ------
 #
 #     ./tumblr.rb

 require 'rubygems'
 require 'open-uri'
 require 'nokogiri'
 require 'net/http'
 require 'mime/types'
 require 'fileutils'
 require 'pathname'
 require 'date'
 require 'pandoc-ruby'
 require 'sanitize'

 # Configuration
 #
 # TUMBLR_DOMAIN   - base URL of the blog to export; "/api/read?..." is
 #                   appended to it, so leave off any trailing slash.
 # WRITE_DIRECTORY - directory the generated post files are written to.
 # IMAGE_DIRECTORY - directory downloaded images are stored under.
 # AUTHOR          - value written to the "author" field of every post.
 # CONVERT_TO      - false to keep post bodies as HTML, otherwise the pandoc
 #                   output format; it is also used as the file extension.
 #
 # The strings are frozen so they cannot be modified by accident at runtime.
 TUMBLR_DOMAIN   = "http://blog.example.org".freeze
 WRITE_DIRECTORY = "_posts".freeze
 IMAGE_DIRECTORY = "_images".freeze
 AUTHOR          = "".freeze
 CONVERT_TO      = false

 # Fetch a URL over HTTP, following 3xx redirection.
 #
 # uri_str - absolute URL to request, as a String.
 # limit   - number of requests that may still be made (default 10).
 #
 # Returns the successful response object. Raises ArgumentError once the
 # redirect budget is used up; any response that is neither a success nor a
 # redirect is raised via the response's error! method.
 def fetch(uri_str, limit = 10)
   raise ArgumentError, 'HTTP redirect too deep' if limit <= 0

   response = Net::HTTP.get_response(URI.parse(uri_str))
   case response
   when Net::HTTPSuccess
     response
   when Net::HTTPRedirection
     # The Location header may be relative; resolve it against the URL that
     # was just requested so the next hop keeps its scheme and host.
     fetch(URI.join(uri_str, response['location']).to_s, limit - 1)
   else
     response.error!
   end
 end

 # Save a local copy of a tumblr-hosted image and return the relative uri.
 #
 # uri_str - absolute URL of the image, as a String.
 #
 # The image is downloaded with fetch and written below IMAGE_DIRECTORY at
 # "<host><path>". When the response's Content-Type maps to a known file
 # extension, the path's extension is rewritten to it ("jpeg" becomes "jpg").
 #
 # Returns the String "/images/<host><path>".
 def fetch_img(uri_str)
   uri = URI.parse(uri_str)
   resp = fetch(uri_str)

   # build our local image path
   path = "#{uri.host}#{uri.path}"

   # Rewrite the extension from the reported MIME type. The header can be
   # absent, or name a type the registry does not know or that has no
   # extension; in those cases keep the path exactly as the URL had it.
   content_type = resp["content-type"]
   mime_type = content_type ? MIME::Types[content_type].first : nil
   extension = mime_type ? mime_type.extensions.first : nil
   if extension
     extension = "jpg" if extension == "jpeg"
     path = "#{path.chomp(File.extname(path))}.#{extension}"
   end

   print "Image: #{uri_str} --> #{path}\n"

   local_path = "#{IMAGE_DIRECTORY}/#{path}"
   FileUtils.mkdir_p File.dirname(local_path)
   # File.open, not Kernel#open: the target is always a plain local file
   File.open(local_path, "wb") { |file| file.write(resp.body) }

   "/images/#{path}"
 end

 # Tumblr api only returns 50 posts per call, so keep requesting pages until
 # one comes back without any posts.
 post_offset = 0
 posts_returned = -1

 # Content of the first element matching +selector+ below +node+, or nil
 # when there is no such element.
 tag_content = lambda do |node, selector|
   tag = node.css(selector).first
   tag.nil? ? nil : tag.content
 end

 while posts_returned != 0

   path = TUMBLR_DOMAIN + "/api/read?num=50&filter=none&start=#{post_offset}"

   # Connect to Tumblr and read the API source. URI#open (added by open-uri)
   # is called explicitly because Kernel#open no longer accepts URLs in Ruby 3.
   URI.parse(path).open do |xml|
     doc = Nokogiri::XML.parse(xml)
     posts = doc.css("post")
     posts_returned = posts.count
     post_offset += posts.count
     posts.each do |post_tag|

       # Gather data about each post
       date = Date.parse(post_tag.attributes["date"].content)
       id = post_tag.css("@id").first.content
       slug = tag_content.call(post_tag, "slug")
       type = post_tag.attributes["type"].content
       tags = post_tag.css("tag").map { |t| t.content }
       title = nil
       text = nil
       body = nil

       # Build the body for each supported type. A missing element leaves its
       # value nil, so the checks further down skip that one post instead of
       # the whole import stopping on a NoMethodError.
       if type == "regular"
         title = tag_content.call(post_tag, "regular-title")
         body = tag_content.call(post_tag, "regular-body")
       elsif type == "quote"
         text = tag_content.call(post_tag, "quote-text")
         source = tag_content.call(post_tag, "quote-source")
         body = "> #{text}"
         body += "\n\n" + source unless source.nil?
       elsif type == "link"
         text = tag_content.call(post_tag, "link-text")
         link = tag_content.call(post_tag, "link-url")
         unless link.nil?
           body = "<a href=\"#{link}\">#{text}</a>"
           desc = tag_content.call(post_tag, "link-description")
           body << "\n\n#{desc}" unless desc.nil?
         end
       elsif type == "photo"
         body = ""

         # without a photoset the post itself carries the photo-url;
         # with one, every <photo> element is looked at in turn
         photo_nodes = post_tag.css("photoset").first.nil? ? [post_tag] : post_tag.css("photo")
         photo_nodes.each do |photo_tag|
           url = tag_content.call(photo_tag, "photo-url")
           body += "<img src=\"#{fetch_img(url)}\" />" unless url.nil?
         end
         text = tag_content.call(post_tag, "photo-caption")
         body += "\n\n#{text}"
       elsif type == "video"
         text = tag_content.call(post_tag, "video-caption")
         body = tag_content.call(post_tag, "video-source")
       elsif type == "audio"
         text = tag_content.call(post_tag, "audio-caption")
         body = tag_content.call(post_tag, "audio-player")
       else
         print "ERROR: Post type not supported\n"
         next
       end

       if !title && !text
         print "ERROR: Post title and text are nil: #{id}\n"
         next
       end

       # title defaults to the post text; strip html, then collapse every run
       # of whitespace (new lines included) to one space and trim the ends
       title ||= text
       title = Sanitize.clean(title).gsub(/\s+/, " ").strip
       title = title.length > 64 ? (title[0,64] + "…") : title # cap the title at 64 characters

       # create the slug if necessary and build a _post filename
       if slug.nil? || slug.empty?
         slug = title.gsub(/[^a-zA-Z0-9]/, "-").gsub(/-+/, '-').gsub(/-$/, '').downcase
       end
       # a title with no usable characters still needs a distinct file name
       slug = id if slug.empty?
       slug = slug[0, 155]

       filename_extension = CONVERT_TO ? CONVERT_TO : "html"
       filename = "#{date.strftime("%Y-%m-%d")}-#{slug}.#{filename_extension}"

       # if there's no post, we give up.
       next unless body

       if CONVERT_TO
         body = PandocRuby.new(body, :from => :html, :to => CONVERT_TO)
       end

       tagcode = ""
       if tags.size > 0
         tagcode = "\ntags: #{tags.join(", ")}"
       end

       hakyll_post = <<-EOPOST
 ---
 title: #{title}
 author: #{AUTHOR}#{ tagcode }
 ---

 #{body}
 EOPOST

       # Write files; the block form closes the file even if the write fails
       puts "#{ filename }"
       File.open("#{WRITE_DIRECTORY}/#{filename}", "w") { |file| file.write(hakyll_post) }

     end
   end
 end
	#!/usr/bin/env bash
	#
	# Downloads tumblr.rb into the current directory, optionally installing the
	# apt packages and ruby gems it needs first, then creates the output
	# directories and prints the remaining manual configuration steps.

	# Ask a yes/no question (default yes).
	# $1 - the question text; " [Yn] " is appended to form the prompt.
	# Succeeds when the reply is empty, "y" or "Y".
	confirm() {
	    read -r -p "$1 [Yn] "
	    [[ -z "$REPLY" || "$REPLY" == "y" || "$REPLY" == "Y" ]]
	}

	echo "This script will download and install the tumblr migration"
	echo "script and its dependencies on Debian systems."
	echo " "

	if confirm "Install dependencies from apt-get?"; then
	    echo "Installing the dependencies..."
	    echo " "
	    sudo apt-get install libxslt-dev libxml2-dev ruby rubygems pandoc
	fi

	if confirm "Install required ruby gems?"; then
	    echo "Installing the gems..."
	    echo " "
	    sudo gem install nokogiri pandoc-ruby mime-types sanitize
	fi

	# Stop here if the download fails: everything below needs tumblr.rb.
	if ! wget https://raw.github.com/gist/2881418/tumblr.rb; then
	    echo "Failed to download tumblr.rb" >&2
	    exit 1
	fi
	chmod +x tumblr.rb
	chmod g+w tumblr.rb
	# -p so that running the installer a second time does not fail
	mkdir -p _posts _images

	echo "Update TUMBLR_DOMAIN in tumblr.rb to the address of your tumblr blog and"
	echo "set AUTHOR to your name"
	echo " "
	echo "To convert the body of posts to a different format using pandoc change"
	echo "CONVERT_TO to a format pandoc understands. The default value of false"
	echo "bypasses the conversion process altogether. For example to convert to"
	echo "Markdown set it to md and for reStructuredText set it to rst."
	echo " "
	echo "Then run ./tumblr.rb to do the download."
	#!/usr/bin/env ruby

	# Script to import tumblr posts into local markdown posts ready to be consumed by Hakyll.
	# Inspired by New Bamboo's post:
	# http://blog.new-bamboo.co.uk/2009/2/20/migrating-from-mephisto-to-jekyll
	#
	# Supports post types: regular, quote, link, photo, video and audio
	# Saves local copies of images
	#
	# Changes in this fork:
	# =====================
	#
	# - Ability to convert the body of the post to another format such as .md or .rst
	# - File name slug truncated to protect from over length file names
	# - Moved images to current dir
	# - Added install and usage instructions
	# - Fix title cleaning by stripping new lines and HTML and trimming and squeezing spaces
	#
	# Installation
	# ============
	#
	# Either run the accompanying install.sh script or follow the manual instructions below.
	#
	# Install deps on Ubuntu:
	# -----------------------
	#
	# sudo apt-get install libxslt-dev libxml2-dev ruby rubygems pandoc
	#
	# Install required ruby gems:
	# ---------------------------
	#
	# sudo gem install mime-types nokogiri pandoc-ruby sanitize
	#
	# Setup:
	# ------
	#
	# mkdir _images _posts
	# sudo chmod +x tumblr.rb
	#
	# Update `TUMBLR_DOMAIN` below to the address of your tumblr blog and set
	# `AUTHOR` to your name.
	#
	# To convert the body of posts to a different format using pandoc change
	# `CONVERT_TO` to a format pandoc understands. The default value of false
	# bypasses the conversion process altogether. For example to convert to
	# Markdown set it to `md` and for reStructuredText set it to `rst`.
	#
	# Usage:
	# ------
	#
	# ./tumblr.rb

	require 'rubygems'
	require 'open-uri'
	require 'nokogiri'
	require 'net/http'
	require 'mime/types'
	require 'fileutils'
	require 'pathname'
	require 'date'
	require 'pandoc-ruby'
	require 'sanitize'

	# Configuration
	#
	# TUMBLR_DOMAIN   - base URL of the blog to export; "/api/read?..." is
	#                   appended to it, so leave off any trailing slash.
	# WRITE_DIRECTORY - directory the generated post files are written to.
	# IMAGE_DIRECTORY - directory downloaded images are stored under.
	# AUTHOR          - value written to the "author" field of every post.
	# CONVERT_TO      - false to keep post bodies as HTML, otherwise the pandoc
	#                   output format; it is also used as the file extension.
	#
	# The strings are frozen so they cannot be modified by accident at runtime.
	TUMBLR_DOMAIN   = "http://blog.example.org".freeze
	WRITE_DIRECTORY = "_posts".freeze
	IMAGE_DIRECTORY = "_images".freeze
	AUTHOR          = "".freeze
	CONVERT_TO      = false

	# Fetch a URL over HTTP, following 3xx redirection.
	#
	# uri_str - absolute URL to request, as a String.
	# limit   - number of requests that may still be made (default 10).
	#
	# Returns the successful response object. Raises ArgumentError once the
	# redirect budget is used up; any response that is neither a success nor a
	# redirect is raised via the response's error! method.
	def fetch(uri_str, limit = 10)
	  raise ArgumentError, 'HTTP redirect too deep' if limit <= 0

	  response = Net::HTTP.get_response(URI.parse(uri_str))
	  case response
	  when Net::HTTPSuccess
	    response
	  when Net::HTTPRedirection
	    # The Location header may be relative; resolve it against the URL that
	    # was just requested so the next hop keeps its scheme and host.
	    fetch(URI.join(uri_str, response['location']).to_s, limit - 1)
	  else
	    response.error!
	  end
	end

	# Save a local copy of a tumblr-hosted image and return the relative uri.
	#
	# uri_str - absolute URL of the image, as a String.
	#
	# The image is downloaded with fetch and written below IMAGE_DIRECTORY at
	# "<host><path>". When the response's Content-Type maps to a known file
	# extension, the path's extension is rewritten to it ("jpeg" becomes "jpg").
	#
	# Returns the String "/images/<host><path>".
	def fetch_img(uri_str)
	  uri = URI.parse(uri_str)
	  resp = fetch(uri_str)

	  # build our local image path
	  path = "#{uri.host}#{uri.path}"

	  # Rewrite the extension from the reported MIME type. The header can be
	  # absent, or name a type the registry does not know or that has no
	  # extension; in those cases keep the path exactly as the URL had it.
	  content_type = resp["content-type"]
	  mime_type = content_type ? MIME::Types[content_type].first : nil
	  extension = mime_type ? mime_type.extensions.first : nil
	  if extension
	    extension = "jpg" if extension == "jpeg"
	    path = "#{path.chomp(File.extname(path))}.#{extension}"
	  end

	  print "Image: #{uri_str} --> #{path}\n"

	  local_path = "#{IMAGE_DIRECTORY}/#{path}"
	  FileUtils.mkdir_p File.dirname(local_path)
	  # File.open, not Kernel#open: the target is always a plain local file
	  File.open(local_path, "wb") { |file| file.write(resp.body) }

	  "/images/#{path}"
	end

	# Tumblr api only returns 50 posts per call, so keep requesting pages until
	# one comes back without any posts.
	post_offset = 0
	posts_returned = -1

	# Content of the first element matching +selector+ below +node+, or nil
	# when there is no such element.
	tag_content = lambda do |node, selector|
	  tag = node.css(selector).first
	  tag.nil? ? nil : tag.content
	end

	while posts_returned != 0

	  path = TUMBLR_DOMAIN + "/api/read?num=50&filter=none&start=#{post_offset}"

	  # Connect to Tumblr and read the API source. URI#open (added by open-uri)
	  # is called explicitly because Kernel#open no longer accepts URLs in Ruby 3.
	  URI.parse(path).open do |xml|
	    doc = Nokogiri::XML.parse(xml)
	    posts = doc.css("post")
	    posts_returned = posts.count
	    post_offset += posts.count
	    posts.each do |post_tag|

	      # Gather data about each post
	      date = Date.parse(post_tag.attributes["date"].content)
	      id = post_tag.css("@id").first.content
	      slug = tag_content.call(post_tag, "slug")
	      type = post_tag.attributes["type"].content
	      tags = post_tag.css("tag").map { |t| t.content }
	      title = nil
	      text = nil
	      body = nil

	      # Build the body for each supported type. A missing element leaves its
	      # value nil, so the checks further down skip that one post instead of
	      # the whole import stopping on a NoMethodError.
	      if type == "regular"
	        title = tag_content.call(post_tag, "regular-title")
	        body = tag_content.call(post_tag, "regular-body")
	      elsif type == "quote"
	        text = tag_content.call(post_tag, "quote-text")
	        source = tag_content.call(post_tag, "quote-source")
	        body = "> #{text}"
	        body += "\n\n" + source unless source.nil?
	      elsif type == "link"
	        text = tag_content.call(post_tag, "link-text")
	        link = tag_content.call(post_tag, "link-url")
	        unless link.nil?
	          body = "<a href=\"#{link}\">#{text}</a>"
	          desc = tag_content.call(post_tag, "link-description")
	          body << "\n\n#{desc}" unless desc.nil?
	        end
	      elsif type == "photo"
	        body = ""

	        # without a photoset the post itself carries the photo-url;
	        # with one, every <photo> element is looked at in turn
	        photo_nodes = post_tag.css("photoset").first.nil? ? [post_tag] : post_tag.css("photo")
	        photo_nodes.each do |photo_tag|
	          url = tag_content.call(photo_tag, "photo-url")
	          body += "<img src=\"#{fetch_img(url)}\" />" unless url.nil?
	        end
	        text = tag_content.call(post_tag, "photo-caption")
	        body += "\n\n#{text}"
	      elsif type == "video"
	        text = tag_content.call(post_tag, "video-caption")
	        body = tag_content.call(post_tag, "video-source")
	      elsif type == "audio"
	        text = tag_content.call(post_tag, "audio-caption")
	        body = tag_content.call(post_tag, "audio-player")
	      else
	        print "ERROR: Post type not supported\n"
	        next
	      end

	      if !title && !text
	        print "ERROR: Post title and text are nil: #{id}\n"
	        next
	      end

	      # title defaults to the post text; strip html, then collapse every run
	      # of whitespace (new lines included) to one space and trim the ends
	      title ||= text
	      title = Sanitize.clean(title).gsub(/\s+/, " ").strip
	      title = title.length > 64 ? (title[0,64] + "…") : title # cap the title at 64 characters

	      # create the slug if necessary and build a _post filename
	      if slug.nil? || slug.empty?
	        slug = title.gsub(/[^a-zA-Z0-9]/, "-").gsub(/-+/, '-').gsub(/-$/, '').downcase
	      end
	      # a title with no usable characters still needs a distinct file name
	      slug = id if slug.empty?
	      slug = slug[0, 155]

	      filename_extension = CONVERT_TO ? CONVERT_TO : "html"
	      filename = "#{date.strftime("%Y-%m-%d")}-#{slug}.#{filename_extension}"

	      # if there's no post, we give up.
	      next unless body

	      if CONVERT_TO
	        body = PandocRuby.new(body, :from => :html, :to => CONVERT_TO)
	      end

	      tagcode = ""
	      if tags.size > 0
	        tagcode = "\ntags: #{tags.join(", ")}"
	      end

	      hakyll_post = <<-EOPOST
	---
	title: #{title}
	author: #{AUTHOR}#{ tagcode }
	---

	#{body}
	EOPOST

	      # Write files; the block form closes the file even if the write fails
	      puts "#{ filename }"
	      File.open("#{WRITE_DIRECTORY}/#{filename}", "w") { |file| file.write(hakyll_post) }

	    end
	  end
	end