Search Engine ranking of keywords
# encoding: UTF-8
require 'rubygems'
require 'faraday'
require 'nokogiri'
require 'open-uri'
require 'yaml'
require 'ostruct'

# Rake task to record where a set of keywords ranks in a search engine's results.
namespace :api do
  desc 'Search Engine Ranking'
  task :search_engine_ranking => :environment do
    def env_die
      abort 'Fatal error: You need to specify a valid RAILS_ENV.'
    end

    # Ensure that RAILS_ENV is set to something
    env_die if ENV['RAILS_ENV'].nil?

    class SearchEngine
      attr_accessor :params, :conn, :response, :results, :search_engine_request, :count

      def initialize(keywords, base_url)
        @params = OpenStruct.new
        @count = 0
        @params.base_url = base_url
        @params.path = "/search?q=" + keywords.join("+")
        @conn = Faraday.new(:url => @params.base_url) do |faraday|
          faraday.response :logger                  # log requests to STDOUT
          faraday.request  :url_encoded             # form-encode POST params
          faraday.adapter  Faraday.default_adapter  # make requests with Net::HTTP
        end
        # The caller supplies the scraping rules (lambdas) on the params struct.
        yield(@params)
        self.instance_exec(params, &params.rule_for_setup) unless params.rule_for_setup.nil?
        @search_engine_request = SearchEngineRequest.create(keywords: keywords.join("+"),
                                                            site: base_url)
      end

      def get_action(path = params.path)
        puts "request on #{path}"
        # Faraday yields the outgoing request here, so the cookie is set before sending.
        @response = conn.get path do |request|
          request.headers['Cookie'] = params.cookie unless params.cookie.nil?
        end
      end

      def extract_records
        doc = Nokogiri::HTML response.body
        doc.css(self.instance_exec(&params.rule_to_identify_records))
      end

      def process_results
        @results = []
        extract_records.each do |record|
          @count += 1
          match, url, path = self.instance_exec(record, &params.rule_to_extract_record)
          @results << { order_id: @count, url: url, path: path, match?: match } unless match.nil?
        end
        save_results
      end

      def save_results
        results.each do |record|
          puts record
          search_engine_request.search_engine_results.create({ rank_order: record[:order_id],
                                                               match: record[:match?],
                                                               path: record[:path][0..100],
                                                               url: record[:url][0..100] })
        end
      end

      def loop_through_the_pages
        # Fetch the first two result pages (offsets 0 and 10) and rank the records on each.
        0.step(10, 10).each do |start|
          get_action(params.url_first_part + start.to_s + params.url_last_part)
          process_results
        end
      end
    end

    url = 'https://www.xxxx.com'
    keywords = ["ecommerce", "spree"]
    search_engine = SearchEngine.new(keywords, url) do |params|
      # CSS selector that picks out each organic result on the page.
      params.rule_to_identify_records = lambda { '#results li' }
      # Given one result node, return [match, url, path], or [] if it cannot be parsed.
      params.rule_to_extract_record = lambda { |record|
        search_url = begin
          record.css('h3 > a')[0].attribute_nodes.find { |i| i.name == "href" }.value
        rescue
          ""
        end
        match = begin
          record.text.match(%r{xxxxx what you are interested in xxxxxx}).nil? ? false : true
        rescue
          false
        end
        url_path_match = search_url.match(/.*http:\/\/(?<url>.*?)\/(?<path>.*)/)
        url_path_match.nil? ? [] : [match, url_path_match["url"], url_path_match["path"]]
      }
      # First request: grab the session cookie and work out the pagination URL pattern.
      params.rule_for_setup = lambda { |p|
        response = get_action(p.path)
        p.cookie = response.headers['set-cookie']
        doc = Nokogiri::HTML response.body
        raw_url = doc.css('.sb_pag li a')[2].attribute_nodes.find { |i| i.name == "href" }.value
        match_url = raw_url.match(/(?<first_part>.+?first=)\d+(?<last_part>.*)/)
        p.url_first_part, p.url_last_part = [match_url["first_part"], match_url["last_part"]]
      }
    end
    search_engine.loop_through_the_pages
  end
end
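
A minimal sketch of how the task would be invoked, assuming the file lives in a Rails app that also defines the SearchEngineRequest and SearchEngineResult models used above (neither is part of this gist), and with whatever environment name you actually run against:

    RAILS_ENV=production bundle exec rake api:search_engine_ranking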