General Scraper
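A small, configuration-driven job-board scraper in Ruby: ScraperDSL fetches a listing page with HTTParty, parses it with Nokogiri, follows each job link, upserts ScraperJob records keyed by a SHA-256 digest of the link's href, and marks records for postings that have disappeared as expired. Per-site selectors live in ScraperStructure and can come either from a database table or from a small block-based DSL.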
require "httparty"
require "nokogiri"
require "digest"
require "uri"

class ScraperDSL
  # Readers for home_url/jobs_url/structure would clash with the DSL methods
  # below, so expose only the writer the database-driven driver code needs.
  attr_writer :structure

  def initialize(name)
    @name = name
    @structure = ScraperStructure.new
  end

  def home_url(url)
    @home_url = url
  end

  def jobs_url(url)
    @jobs_url = url
  end

  def structure(&block)
    @structure.instance_eval(&block)
  end

  # Fetch the live job list, upsert each posting, then expire any job we have
  # on record for this site that no longer appears on the site.
  def scrape
    all_jobs_from_db = ScraperJob.where(site_name: @name).pluck(:unique_job_ref_id)
    active_jobs_from_site = fetch_active_jobs_from_site
    update_inactive_jobs(all_jobs_from_db, active_jobs_from_site)
    ScraperJob.reindex
  end

  private

  def fetch_active_jobs_from_site
    active_jobs = []
    begin
      response = HTTParty.get(@jobs_url, timeout: 3)
      doc = Nokogiri::HTML(response.body)
      doc.css(@structure.html_job_link_class).each do |link|
        # Job links are usually relative; resolve them against the home URL.
        job_url = URI.join(@home_url, link["href"]).to_s
        process_job_link(job_url, link, active_jobs)
      end
    rescue => e
      log_error(@jobs_url, e.message)
    end
    active_jobs
  end

  def process_job_link(job_url, link, active_jobs)
    job_response = HTTParty.get(job_url, timeout: 3)
    job_doc = Nokogiri::HTML(job_response.body)
    # The href is stable per posting, so its digest serves as the upsert key.
    unique_job_ref_id = Digest::SHA256.hexdigest(link["href"])
    job = find_or_initialize_job(unique_job_ref_id)
    if job.new_record? || job.expired?
      update_job_attributes(job, job_doc, unique_job_ref_id)
      job.save
    end
    active_jobs << unique_job_ref_id
  rescue => e
    log_error(job_url, e.message)
  end

  def find_or_initialize_job(unique_job_ref_id)
    ScraperJob.find_or_initialize_by(unique_job_ref_id: unique_job_ref_id)
  end

  def update_job_attributes(job, job_doc, unique_job_ref_id)
    job.attributes = {
      title: extract_text(job_doc, @structure.html_title),
      company_name: extract_company_name(job_doc),
      body: extract_text(job_doc, @structure.html_description),
      city: extract_text(job_doc, @structure.html_city)&.slice(0, 30),
      state: extract_text(job_doc, @structure.html_state),
      country: extract_country_name(job_doc),
      compensation: extract_text(job_doc, @structure.html_compensation),
      site_name: @name,
      unique_job_ref_id: unique_job_ref_id,
      expired: false
    }
  end

  def extract_text(doc, xpath)
    return if xpath.nil?

    doc.at_xpath(xpath)&.text&.strip
  end

  def extract_company_name(doc)
    # Assumes text like "Senior Engineer at Acme"; splitting on " at " (with
    # spaces) keeps company names that merely contain the letters "at" intact.
    extract_text(doc, @structure.html_company)&.split(" at ")&.last&.strip
  end

  def extract_country_name(doc)
    # Assumes text like "Berlin - Germany"; keep the part after the dash.
    extract_text(doc, @structure.html_country)&.split("-")&.last&.strip
  end

  def update_inactive_jobs(all_jobs_from_db, active_jobs_from_site)
    inactive_jobs = all_jobs_from_db - active_jobs_from_site
    ScraperJob.where(unique_job_ref_id: inactive_jobs, site_name: @name).update_all(expired: true)
  end

  def log_error(url, message)
    puts "#{url}: #{message}"
  end
end
class ScraperStructure
  attr_accessor :html_job_link_class, :html_title, :html_company, :html_description,
                :html_city, :html_state, :html_country, :html_compensation

  def job_link_class(css_class)
    @html_job_link_class = css_class
  end

  def title(xpath)
    @html_title = xpath
  end

  def company(xpath)
    @html_company = xpath
  end

  def description(xpath)
    @html_description = xpath
  end

  def city(xpath)
    @html_city = xpath
  end

  def state(xpath)
    @html_state = xpath
  end

  def country(xpath)
    @html_country = xpath
  end

  def compensation(xpath)
    @html_compensation = xpath
  end
end
def define_scraper(name, &block)
  scraper = ScraperDSL.new(name)
  scraper.instance_eval(&block)
  scraper.scrape
end
# Assuming you have a Structure model in your database
Structure.where(active: true).find_each do |site_structure|
  structure = ScraperStructure.new
  structure.job_link_class(site_structure.html_job_link_class)
  structure.title(site_structure.html_title)
  structure.company(site_structure.html_company)
  structure.description(site_structure.html_description)
  structure.city(site_structure.html_city)
  structure.state(site_structure.html_state)
  structure.country(site_structure.html_country)
  structure.compensation(site_structure.html_compensation)

  scraper = ScraperDSL.new(site_structure.site_name)
  scraper.home_url(site_structure.home_url)
  scraper.jobs_url(site_structure.jobs_url)
  scraper.structure = structure
  scraper.scrape
end
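The Structure model itself isn't part of the gist; the loop above only assumes one row of selectors per active site. A minimal migration that would satisfy those reads might look like this (every column name here is inferred from the loop, not from a real schema):

class CreateStructures < ActiveRecord::Migration[7.0]
  def change
    create_table :structures do |t|
      t.string  :site_name
      t.string  :home_url
      t.string  :jobs_url
      t.string  :html_job_link_class   # CSS selector for job links on the index page
      t.string  :html_title            # XPath selectors for the detail page follow
      t.string  :html_company
      t.string  :html_description
      t.string  :html_city
      t.string  :html_state
      t.string  :html_country
      t.string  :html_compensation
      t.boolean :active, default: true
      t.timestamps
    end
  end
end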
# or DSL style for one site
define_scraper "GreenHouse" do
  home_url "https://www.greenhouse.io"
  jobs_url "https://www.greenhouse.io/jobs"

  structure do
    job_link_class ".job-link"
    title "//h1[@class='title']"
    company "//div[@class='company']"
    description "//div[@class='description']"
    city "//span[@class='city']"
    state "//span[@class='state']"
    country "//span[@class='country']"
    compensation "//span[@class='compensation']"
  end
end
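ScraperJob is likewise assumed rather than shown. From the calls above it needs the columns written in update_job_attributes, a boolean expired column (ActiveRecord then provides the expired? query method automatically), and a class-level reindex — the latter is what Searchkick adds when a model declares searchkick, which is one plausible reading of ScraperJob.reindex. A hedged sketch:

class ScraperJob < ApplicationRecord
  searchkick  # assumption: ScraperJob.reindex comes from Searchkick

  # unique_job_ref_id is the SHA-256 of the posting's href, used as the upsert key
  validates :unique_job_ref_id, presence: true, uniqueness: true

  scope :active, -> { where(expired: false) }
end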