Skip to content

Instantly share code, notes, and snippets.

@sci-phi
Forked from harrisj/article_fetcher.rb
Created June 22, 2012 20:24

Revisions

  1. Jacob Harris created this gist Jun 22, 2012.
    49 changes: 49 additions & 0 deletions article_fetcher.rb
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,49 @@
    # encoding: UTF-8

    require 'rubygems'
    require 'rest-client'
    require 'nokogiri'

    # Downloads New York Times pages and extracts their body text and the
    # quoted passages inside that text.
    class ArticleFetcher
      # Returns the body text for +url+, choosing a scraper from the hostname.
      #
      # Video pages and URLs that match none of the known nytimes.com patterns
      # yield an empty string, so the result always responds to String methods
      # such as +empty?+ and +scan+.
      #
      # @param url [String] page address
      # @return [String] extracted text, or '' when nothing can be extracted
      def self.fetch_text(url)
        case url
        when /video\.nytimes\.com/
          ''
        when /\.blogs\.nytimes\.com/, /dealbook\.nytimes\.com/
          fetch_blog_text(url)
        when /\.nytimes\.com/
          fetch_article_text(url)
        else
          # Without this branch the method returned nil for unknown hosts.
          ''
        end
      end

      # Returns the text inside <div class="entry-content"> of a blog page.
      #
      # @param url [String] blog post address
      # @return [String]
      def self.fetch_blog_text(url)
        fetch_css_text(url, "div.entry-content")
      end

      # Returns the text inside <div class="articleBody"> of an article page.
      # A +pagewanted=all+ query parameter is appended to the address first
      # (presumably to request the unpaginated view -- confirm against the site).
      #
      # @param url [String] article address
      # @return [String]
      def self.fetch_article_text(url)
        separator = url =~ /\?/ ? "&" : "?"
        fetch_css_text(url + separator + "pagewanted=all", "div.articleBody")
      end

      # Collects every passage enclosed in straight or curly double quotes, one
      # per line; a comma that ends a passage is turned into a period.
      #
      # @param text [String]
      # @return [String] newline-separated quotes, '' when there are none
      def self.extract_quotes(text)
        out = text.scan(/["“]([^"”]*)[”"]/m).join("\n")
        out.gsub(/,\n/m, ".\n")
      end

      # Fetches +url+ and returns only the quoted passages of its text.
      #
      # @param url [String]
      # @return [String]
      def self.fetch_quotes(url)
        extract_quotes(fetch_text(url))
      end

      # Downloads +url+, parses it as HTML and returns the inner text of every
      # node matching the CSS +selector+.
      def self.fetch_css_text(url, selector)
        response_html = RestClient.get(url).to_s
        Nokogiri::HTML(response_html).css(selector).inner_text
      end
      private_class_method :fetch_css_text
    end
    61 changes: 61 additions & 0 deletions nytimes_ebooks.rb
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,61 @@
    $:.unshift(File.dirname(__FILE__))

    require 'rubygems'
    require 'article_fetcher'
    require 'markov'
    require 'open-uri'
    require 'simple-rss'
    require 'colorize'
    require 'twitter_db'

    # Feed whose items are turned into candidate tweets.
    RSS_FEED_URL = 'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml'

    rss = SimpleRSS.parse open(RSS_FEED_URL)

    Tweet.connect_to_db

    # When DEBUG is set, items already in the database are processed again.
    debugging = ENV['DEBUG']

    rss.items.each do |item|
      next if Tweet.in_db?(item.guid) && !debugging

      text = ArticleFetcher.fetch_text(item.guid)
      # Skip items with no usable text. The nil check covers a fetcher that
      # returns nothing for an address it does not recognise.
      next if text.nil? || text.empty?

      quotes = ArticleFetcher.extract_quotes(text)

      # Too little quoted material: feed the whole article text to the chain.
      if quotes.length < 400
        puts "Not using quotes for this one: #{item.guid}"
        quotes = text
      end

      # Remove the periods from dotted initialisms and from a fixed list of
      # abbreviations (presumably so they are not taken for sentence ends --
      # confirm against MarkovChainer).
      quotes.gsub!(/(([A-Z]\.)+)/) {|w| w.gsub('.', '')}
      quotes.gsub!(/(Dr|Mr|Mrs|Gov|Amb|Hon|Ave)\./, '\1')

      markov = MarkovChainer.new(1)
      markov.add_text(quotes)

      # Up to five attempts at a sentence that does not look like it contains
      # a name; the last attempt is used even if it still matches.
      retries = 5
      while retries > 0
        retries -= 1
        body = markov.generate_sentence
        case body
        when /[A-Z][A-Z]+/, /^.+\b[A-Z][a-z]+/, /(Dr|Mr|Mrs|Gov|Rep)\s/
          puts "Retrying since this has a name in it: #{body}"
        else
          break
        end
      end

      # Cap at 100 characters, then drop the trailing (possibly cut) word.
      if body.length > 100
        body = body[0,100]
        body.gsub!(/\s+\S+$/, '')
      end

      # Strip a final period.
      body.gsub!(/\.$/, '')

      Tweet.queue(item.title, body, item.guid)
    end

    Tweet.post_pending
    152 changes: 152 additions & 0 deletions twitter_db.rb
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,152 @@
    require 'rubygems'
    require 'sqlite3'
    require 'bitly'
    gem 'activesupport', '~> 2.3.11'
    gem 'activerecord', '~> 2.3.11'
    require 'active_support'
    require 'active_record'
    gem 'twitter', '~> 2.4.0'
    require 'twitter'

    # SQLite database file, located in the same directory as this file.
    DB_PATH = File.join(File.dirname(__FILE__), "tweets.db")
    # Switch the bitly gem to version 3 of its API.
    Bitly.use_api_version_3

    # OAuth consumer and access-token settings for the twitter gem.
    # The values shown here are placeholders.
    Twitter.configure do |config|
    config.consumer_key = 'REDACTED'
    config.consumer_secret = 'REDACTED'
    config.oauth_token = 'REDACTED'
    config.oauth_token_secret = 'REDACTED'
    end

    # Schema for the tweet store: a +tweets+ table holding queued and posted
    # tweets, and a +tweet_metadata+ table holding the next allowed post time.
    class CreateTwitterDb < ActiveRecord::Migration
      # Creates both tables and the two indexes on +tweets+.
      def self.up
        create_table(:tweets) do |table|
          table.string(:nyt_title)
          table.string(:body)
          table.string(:expanded_link)
          table.boolean(:posted, :default => false, :null => false)
          table.boolean(:expired, :default => false, :null => false)
          table.datetime(:created_at)
          table.datetime(:posted_at)
        end

        # Columns used for lookups: pending tweets and duplicate detection.
        [:posted, :expanded_link].each do |column|
          add_index(:tweets, column)
        end

        create_table(:tweet_metadata) do |table|
          table.datetime(:next_post_at)
        end
      end
    end

    # Holds the earliest time at which the next tweet may be posted, stored in
    # the first row of the +tweet_metadata+ table.
    class TweetMetadata < ActiveRecord::Base
      set_table_name 'tweet_metadata'

      # True when no row exists yet or the stored +next_post_at+ is in the past.
      def self.can_post_again?
        record = first
        return true if record.nil?

        record.next_post_at < Time.now
      end

      # Pushes +next_post_at+ to now + 5 minutes + rand(10.minutes), creating
      # the row when the table is still empty.
      def self.tweet_posted
        delay = 5.minutes + rand(10.minutes)
        upcoming = Time.now + delay

        record = first
        return create(:next_post_at => upcoming) if record.nil?

        record.update_attribute(:next_post_at, upcoming)
      end
    end

    # A tweet generated from a New York Times article, either waiting to be
    # posted or already posted.
    class Tweet < ActiveRecord::Base
      BITLY_KEY = 'REDACTED'

      # Tweets that are neither posted nor expired.
      named_scope :pending, :conditions => {:posted => false, :expired => false}

      # Connects ActiveRecord to the SQLite file at DB_PATH and creates the
      # schema when that file did not exist beforehand.
      def self.connect_to_db
        # Checked before the connection is established.
        # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
        should_create = !File.exist?(DB_PATH)

        ActiveRecord::Base.establish_connection({
          :adapter => 'sqlite3',
          :database => DB_PATH
        })

        create_db if should_create
      end

      # Shortens +link+ through bit.ly, reusing one client per process.
      #
      # @param link [String] full article URL
      # @return [String] the short URL
      def self.shorten_link(link)
        @bitly ||= Bitly.new('nytimesebooks', BITLY_KEY)

        @bitly.shorten(link, :history => 1).short_url
      end

      # True when a tweet for +link+ has already been stored.
      def self.in_db?(link)
        exists?(:expanded_link => link)
      end

      # Stores a new pending tweet made of +text+ followed by a short link.
      #
      # With DEBUG set in the environment nothing is stored or shortened; the
      # title, link and text are only printed. A link that is already in the
      # database is ignored.
      #
      # @param title [String] article headline
      # @param text [String] generated sentence
      # @param link [String] full article URL
      # @return [nil]
      def self.queue(title, text, link)
        if ENV['DEBUG']
          puts title
          puts link
          puts text.colorize(:red)
          return
        end

        return if in_db?(link)

        short_link = shorten_link(link)
        # Trailing whitespace is removed so exactly one space precedes the link.
        body = "#{text.gsub(/\s+$/, '')} #{short_link}"
        create :nyt_title => title, :body => body, :expanded_link => link

        puts title
        puts link
        puts body.colorize(:red)
      end

      # Flags this tweet as posted.
      #
      # @param add_timeout [Boolean] when true, also schedules the next
      #   allowed post time through TweetMetadata.tweet_posted
      def mark_posted!(add_timeout=true)
        update_attribute(:posted, true)
        TweetMetadata.tweet_posted if add_timeout
      end

      # Posts the first pending tweet, if there is one and the posting delay
      # has elapsed.
      #
      # A "Status is a duplicate" error marks the tweet as posted without
      # scheduling a new delay; any other error is re-raised.
      def self.post_pending
        tweet = pending.first
        return if tweet.nil?

        unless TweetMetadata.can_post_again?
          puts "Can't post yet"
          return
        end

        begin
          Twitter.update(tweet.body)
          tweet.mark_posted!
        rescue => ex
          raise ex unless ex.message =~ /Status is a duplicate/

          tweet.mark_posted!(false)
        end
      end

      # Runs the migration that creates the tables.
      #
      # NOTE: this method has always been publicly callable. The bare
      # `private` that used to precede it does not apply to `def self.`
      # methods, so it was removed; visibility is unchanged.
      def self.create_db
        CreateTwitterDb.up
      end
    end