Revisions
-
Jacob Harris created this gist
Jun 22, 2012. There are no files selected for viewing
# encoding: UTF-8
require 'rubygems'
require 'rest-client'
require 'nokogiri'

# Downloads New York Times pages and pulls out their body text and the
# quoted passages contained in that text.
class ArticleFetcher
  # Returns the body text for +url+, picking a scraper by hostname pattern.
  #
  # Video pages yield an empty string. A URL that matches none of the
  # patterns (or a nil URL) also yields an empty string rather than nil,
  # so callers can always call String methods such as +empty?+ on the result.
  def self.fetch_text(url)
    case url
    when /video\.nytimes\.com/
      ''
    when /\.blogs\.nytimes\.com/, /dealbook\.nytimes\.com/
      fetch_blog_text(url)
    when /\.nytimes\.com/
      fetch_article_text(url)
    else
      ''
    end
  end

  # Returns the text inside the "div.entry-content" elements of a blog page.
  def self.fetch_blog_text(url)
    fetch_inner_text(url, "div.entry-content")
  end

  # Returns the text inside the "div.articleBody" elements of an article page.
  # A "pagewanted=all" query parameter is appended to the URL before fetching.
  def self.fetch_article_text(url)
    # Use "&" when the URL already carries a query string, "?" otherwise.
    separator = url.include?('?') ? '&' : '?'
    fetch_inner_text("#{url}#{separator}pagewanted=all", "div.articleBody")
  end

  # Returns every passage of +text+ enclosed in straight or curly double
  # quotes, one per line. A quote that ends in a comma has that comma turned
  # into a period when another quote follows it.
  def self.extract_quotes(text)
    out = text.scan(/["“]([^"”]*)[”"]/m).join("\n")
    out.gsub(/,\n/m, ".\n")
  end

  # Fetches +url+ and returns only its quoted passages.
  def self.fetch_quotes(url)
    extract_quotes(fetch_text(url))
  end

  # GETs +url+, parses the response as HTML and returns the concatenated
  # text of every node matching the CSS +selector+.
  def self.fetch_inner_text(url, selector)
    response = RestClient.get(url)
    Nokogiri::HTML(response.to_s).css(selector).inner_text
  end
  private_class_method :fetch_inner_text
end
# Driver script: reads the NYT home page RSS feed, builds a Markov-generated
# sentence from each new article and queues it as a tweet.

# Put this script's directory on the load path so the local requires resolve.
$:.unshift(File.dirname(__FILE__))

require 'rubygems'
require 'article_fetcher'
require 'markov'
require 'open-uri'
require 'simple-rss'
require 'colorize'
require 'twitter_db'

RSS_FEED_URL = 'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml'

feed = SimpleRSS.parse open(RSS_FEED_URL)
Tweet.connect_to_db
debug_mode = ENV['DEBUG']

feed.items.each do |entry|
  link = entry.guid

  # Articles already stored are skipped, except when DEBUG is set.
  already_queued = Tweet.in_db?(link)
  next if already_queued && !debug_mode

  article = ArticleFetcher.fetch_text(link)
  next if article.empty?

  # Prefer the quoted passages; fall back to the whole article when there
  # are fewer than 400 characters of quotes.
  corpus = ArticleFetcher.extract_quotes(article)
  unless corpus.length >= 400
    puts "Not using quotes for this one: #{link}"
    corpus = article
  end

  # Strip the periods from initialisms ("U.S." -> "US") and from common
  # abbreviations ("Mr." -> "Mr").
  corpus.gsub!(/(([A-Z]\.)+)/) { |initials| initials.gsub('.', '') }
  corpus.gsub!(/(Dr|Mr|Mrs|Gov|Amb|Hon|Ave)\./, '\1')

  chain = MarkovChainer.new(1)
  chain.add_text(corpus)

  # Up to five attempts at a sentence with no name-like token; the last
  # attempt is used regardless of what it contains.
  sentence = nil
  5.times do
    sentence = chain.generate_sentence
    case sentence
    when /[A-Z][A-Z]+/, /^.+\b[A-Z][a-z]+/, /(Dr|Mr|Mrs|Gov|Rep)\s/
      puts "Retrying since this has a name in it: #{sentence}"
    else
      break
    end
  end

  # Cap at 100 characters, then drop the trailing (possibly cut-off) word.
  sentence = sentence[0, 100].gsub(/\s+\S+$/, '') if sentence.length > 100
  sentence.gsub!(/\.$/, '')

  Tweet.queue(entry.title, sentence, link)
end

Tweet.post_pending
require 'rubygems'
require 'sqlite3'
require 'bitly'

gem 'activesupport', '~> 2.3.11'
gem 'activerecord', '~> 2.3.11'
require 'active_support'
require 'active_record'

gem 'twitter', '~> 2.4.0'
require 'twitter'

# SQLite database file, stored next to this source file.
DB_PATH = File.join(File.dirname(__FILE__), "tweets.db")

Bitly.use_api_version_3

# NOTE(review): the credentials below are placeholders ('REDACTED'); real
# values have to be supplied before anything can be posted.
Twitter.configure do |config|
  config.consumer_key = 'REDACTED'
  config.consumer_secret = 'REDACTED'
  config.oauth_token = 'REDACTED'
  config.oauth_token_secret = 'REDACTED'
end

# Schema for the tweet queue: a +tweets+ table (indexed on +posted+ and
# +expanded_link+) and a +tweet_metadata+ table holding the next post time.
class CreateTwitterDb < ActiveRecord::Migration
  def self.up
    create_table :tweets do |t|
      t.string :nyt_title
      t.string :body
      t.string :expanded_link
      t.boolean :posted, :default => false, :null => false
      t.boolean :expired, :default => false, :null => false
      t.datetime :created_at
      t.datetime :posted_at
    end
    add_index :tweets, :posted
    add_index :tweets, :expanded_link

    create_table :tweet_metadata do |t|
      t.datetime :next_post_at
    end
  end
end

# Rate-limit bookkeeping. Only the first row of the table is read or updated.
class TweetMetadata < ActiveRecord::Base
  set_table_name 'tweet_metadata'

  # True when no row exists yet, when the stored time is NULL, or when the
  # stored +next_post_at+ is already in the past.
  def self.can_post_again?
    record = first
    return true if record.nil? || record.next_post_at.nil?
    record.next_post_at < Time.now
  end

  # Records that a tweet was just posted: the next post is allowed
  # 5 minutes plus a random offset below 10 minutes from now.
  def self.tweet_posted
    next_time = Time.now + 5.minutes + rand(10.minutes)
    record = first
    if record
      record.update_attribute(:next_post_at, next_time)
    else
      create :next_post_at => next_time
    end
  end
end

# A queued (or already posted) tweet generated for one article link.
class Tweet < ActiveRecord::Base
  BITLY_KEY = 'REDACTED'

  # Tweets that are neither posted nor expired.
  named_scope :pending, :conditions => {:posted => false, :expired => false}

  # Connects ActiveRecord to the SQLite file at DB_PATH and creates the
  # schema when that file did not exist beforehand.
  def self.connect_to_db
    # Must be evaluated before connecting so a fresh database is detected.
    # File.exist? replaces the deprecated File.exists? (removed in Ruby 3.2).
    needs_schema = !File.exist?(DB_PATH)

    ActiveRecord::Base.establish_connection({
      :adapter => 'sqlite3',
      :database => DB_PATH
    })

    create_db if needs_schema
  end

  # Returns the bit.ly short URL for +link+. The Bitly client is built on
  # first use and reused afterwards.
  def self.shorten_link(link)
    @bitly ||= Bitly.new('nytimesebooks', BITLY_KEY)
    @bitly.shorten(link, :history => 1).short_url
  end

  # True when a tweet for +link+ is already stored.
  def self.in_db?(link)
    exists?(:expanded_link => link)
  end

  # Stores a tweet made of +text+ (trailing whitespace removed) followed by
  # the shortened +link+, then echoes it to stdout. Does nothing when a
  # tweet for +link+ is already stored.
  #
  # When the DEBUG environment variable is set, the tweet is only printed:
  # nothing is shortened or written to the database.
  #
  # String#colorize is not provided by anything this file requires; the
  # caller has to load the colorize gem first.
  def self.queue(title, text, link)
    if ENV['DEBUG']
      puts title
      puts link
      puts text.colorize(:red)
      return
    end

    return if in_db?(link)

    short_link = shorten_link(link)
    body = "#{text.gsub(/\s+$/, '')} #{short_link}"
    create :nyt_title => title, :body => body, :expanded_link => link

    puts title
    puts link
    puts body.colorize(:red)
    # Errors deliberately propagate; the original rescue was disabled:
    # rescue => ex
    #   puts "ERROR #{ex.message} for #{text} #{link}"
  end

  # Flags this tweet as posted. With +add_timeout+ (the default) it also
  # schedules the next allowed post time via TweetMetadata.tweet_posted.
  #
  # NOTE(review): the posted_at column is never written here - confirm
  # whether it should be.
  def mark_posted!(add_timeout=true)
    update_attribute(:posted, true)
    TweetMetadata.tweet_posted if add_timeout
  end

  # Posts the first pending tweet, if there is one and the rate limit
  # allows it. A "Status is a duplicate" error from Twitter marks the tweet
  # as posted without scheduling a new timeout; any other error is re-raised.
  def self.post_pending
    t = pending.first
    return if t.nil?

    # Checked once, so the message and the decision cannot disagree.
    unless TweetMetadata.can_post_again?
      puts "Can't post yet"
      return
    end

    begin
      Twitter.update(t.body)
      t.mark_posted!
    rescue => ex
      raise unless ex.message =~ /Status is a duplicate/
      t.mark_posted!(false)
    end
  end

  # NOTE(review): `private` does not apply to methods defined with
  # `def self.`, so create_db is still publicly callable. Left unchanged to
  # keep the existing interface; private_class_method would enforce it.
  private

  # Runs the schema migration against the connected database.
  def self.create_db
    CreateTwitterDb.up
  end
end