Skip to content

Instantly share code, notes, and snippets.

@chrisZingel
Created March 17, 2014 03:24
Show Gist options
  • Save chrisZingel/9593454 to your computer and use it in GitHub Desktop.
Save chrisZingel/9593454 to your computer and use it in GitHub Desktop.
Search Engine ranking of keywords
# encoding=UTF-8
require 'rubygems'
require 'faraday'
require 'nokogiri'
require 'open-uri'
require 'yaml'
require 'ostruct'
# Rake task that queries a search engine for a set of keywords and records
# the rank position of results matching a site of interest.
namespace :api do
desc 'Search Engine Ranking'
task :search_engine_ranking => :environment do
# Abort the entire rake run with a fatal message (guard helper used below).
def env_die
abort 'Fatal error: You need to specify a valid ENV.'
end
# Ensure that ENV is set to something
if ENV['RAILS_ENV'].nil?
env_die
end
# Wraps a single search-engine query: builds a Faraday connection, fetches
# result pages, extracts result records via caller-supplied rules (lambdas
# stored on an OpenStruct), and persists the ranked results through the
# SearchEngineRequest model.
class SearchEngine
attr_accessor :params, :conn, :response, :results, :search_engine_request,:count
# keywords - Array of search terms; joined with "+" for the query string.
# base_url - root URL of the search engine, e.g. 'https://www.bing.com'.
# The required block receives @params (an OpenStruct) so the caller can
# attach scraping rules: rule_to_indentify_records, rule_to_extract_record
# and (optionally) rule_for_setup.
def initialize(keywords, base_url)
@params = OpenStruct.new
@count =0
@params.base_url = base_url
@params.path ="/search?q=" + keywords.join("+")
@conn = Faraday.new(:url => @params.base_url) do |faraday|
faraday.response :logger # log requests to STDOUT
faraday.request :url_encoded # form-encode POST params
faraday.adapter Faraday.default_adapter # make requests with Net::HTTP
end
yield(@params)
# Run the optional setup rule in this instance's context (instance_exec)
# so the lambda can call get_action and other instance methods directly.
self.instance_exec(params, &params.rule_for_setup) unless params.rule_for_setup.nil?
@search_engine_request = SearchEngineRequest.create(keywords: keywords.join("+"),
site: base_url)
end
# Perform a GET against the connection and memoize the response.
# NOTE(review): Faraday's #get block yields the request being built, not
# the response — the block param name 'response' is misleading; this sets
# a *request* Cookie header. Confirm against the Faraday version in use.
def get_action(path=params.path)
puts "request on #{path}"
@response = conn.get path do |response|
response.headers['Cookie'] = params.cookie unless params.cookie.nil?
end
end
# Parse the most recent response body and return the result nodes matched
# by the caller-supplied CSS selector rule.
def extract_records
doc = Nokogiri::HTML response.body
doc.css(self.instance_exec(&params.rule_to_indentify_records))
end
# Run the extraction rule over each record, accumulating rank order, url,
# path and match flag; skips records whose match value is nil, then saves.
def process_results
@results =[]
extract_records.each do |record|
@count +=1
match, url, path = self.instance_exec(record, &params.rule_to_extract_record)
@results << {order_id: @count, url: url, path: path, match?: match } unless match.nil?
end
save_results
end
# Persist each extracted result; path/url are truncated to 101 chars —
# presumably to fit DB column limits, verify against the schema.
def save_results
results.each do | record|
puts record
search_engine_request.search_engine_results.create({ rank_order: record[:order_id],
match: record[:match?],
path: record[:path][0..100],
url: record[:url][0..100] })
end
end
# Fetch paginated result pages, then process.
# NOTE(review): 0.step(10,10) yields only 0 and 10, so exactly two pages
# are requested — and process_results parses only the *last* response.
# Confirm whether more pages (and per-page processing) were intended.
def loop_through_the_pages
0.step(10,10).each do |start|
get_action(params.url_first_part + start.to_s + params.url_last_part)
end
process_results
end
end
# Example usage: rank the given keywords' results for the site of interest.
url = 'https://www.xxxx.com'
keywords =["ecommerce", "spree"]
search_engine = SearchEngine.new(keywords,url) do |params|
# CSS selector identifying one search result per node.
params.rule_to_indentify_records = lambda{ '#results li'}
# Given one result node, return [match, url, path]; [] skips the record.
params.rule_to_extract_record = lambda{ |record|
search_url = begin
# BUG FIX: the original used `i.name="href"` (assignment), which is
# always truthy, so #find returned the FIRST attribute node no matter
# its name. Compare with == to actually locate the href attribute
# (mirrors the correct lookup in rule_for_setup below).
record.css('h3 > a')[0].attribute_nodes.find{|i| i.name == "href"}.value
rescue
""
end
match = begin
record.text.match(%r{xxxxx what you are interested in xxxxxx}).nil? ? false : true
rescue
false
end
# Split "http://host/path" into its host and path named captures.
url_path_match = search_url.match(/.*http:\/\/(?<url>.*?)\/(?<path>.*)/)
url_path_match.nil? ? [] : [match,url_path_match["url"], url_path_match["path"]]
}
# One-off setup: capture the session cookie from the first results page
# and derive the pagination URL template (prefix/suffix around `first=N`).
params.rule_for_setup = lambda{ |p|
response = get_action(p.path)
p.cookie = response.headers['set-cookie']
doc = Nokogiri::HTML response.body
raw_url =doc.css('.sb_pag li a')[2].attribute_nodes.find{|i| i.name =="href"}.value
match_url=raw_url.match(/(?<first_part>.+?first=)\d+(?<last_part>.*)/)
p.url_first_part, p.url_last_part = [match_url["first_part"], match_url["last_part"]]
}
end
search_engine.loop_through_the_pages
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment