General Scraper
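A small, configuration-driven job-board scraper in Ruby: ScraperDSL fetches a listing page with HTTParty, parses it with Nokogiri, follows each job link, upserts ScraperJob records keyed by a SHA-256 digest of the link's href, and marks records for postings that have disappeared as expired. Per-site selectors live in ScraperStructure and can come either from a database table or from a small block-based DSL.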
require "httparty"
require "nokogiri"
require "digest"
require "uri"

class ScraperDSL
  # Readers for home_url/jobs_url/structure would clash with the DSL methods
  # below, so expose only the writer the database-driven driver code needs.
  attr_writer :structure

  def initialize(name)
    @name = name
    @structure = ScraperStructure.new
  end

  def home_url(url)
    @home_url = url
  end

  def jobs_url(url)
    @jobs_url = url
  end

  def structure(&block)
    @structure.instance_eval(&block)
  end

  # Fetch the live job list, upsert each posting, then expire any job we have
  # on record for this site that no longer appears on the site.
  def scrape
    all_jobs_from_db = ScraperJob.where(site_name: @name).pluck(:unique_job_ref_id)
    active_jobs_from_site = fetch_active_jobs_from_site
    update_inactive_jobs(all_jobs_from_db, active_jobs_from_site)
    ScraperJob.reindex
  end

  private

  def fetch_active_jobs_from_site
    active_jobs = []
    begin
      response = HTTParty.get(@jobs_url, timeout: 3)
      doc = Nokogiri::HTML(response.body)
      doc.css(@structure.html_job_link_class).each do |link|
        # Job links are usually relative; resolve them against the home URL.
        job_url = URI.join(@home_url, link["href"]).to_s
        process_job_link(job_url, link, active_jobs)
      end
    rescue => e
      log_error(@jobs_url, e.message)
    end
    active_jobs
  end

  def process_job_link(job_url, link, active_jobs)
    job_response = HTTParty.get(job_url, timeout: 3)
    job_doc = Nokogiri::HTML(job_response.body)
    # The href is stable per posting, so its digest serves as the upsert key.
    unique_job_ref_id = Digest::SHA256.hexdigest(link["href"])
    job = find_or_initialize_job(unique_job_ref_id)
    if job.new_record? || job.expired?
      update_job_attributes(job, job_doc, unique_job_ref_id)
      job.save
    end
    active_jobs << unique_job_ref_id
  rescue => e
    log_error(job_url, e.message)
  end

  def find_or_initialize_job(unique_job_ref_id)
    ScraperJob.find_or_initialize_by(unique_job_ref_id: unique_job_ref_id)
  end

  def update_job_attributes(job, job_doc, unique_job_ref_id)
    job.attributes = {
      title: extract_text(job_doc, @structure.html_title),
      company_name: extract_company_name(job_doc),
      body: extract_text(job_doc, @structure.html_description),
      city: extract_text(job_doc, @structure.html_city)&.slice(0, 30),
      state: extract_text(job_doc, @structure.html_state),
      country: extract_country_name(job_doc),
      compensation: extract_text(job_doc, @structure.html_compensation),
      site_name: @name,
      unique_job_ref_id: unique_job_ref_id,
      expired: false
    }
  end

  def extract_text(doc, xpath)
    return if xpath.nil?

    doc.at_xpath(xpath)&.text&.strip
  end

  def extract_company_name(doc)
    # Assumes text like "Senior Engineer at Acme"; splitting on " at " (with
    # spaces) keeps company names that merely contain the letters "at" intact.
    extract_text(doc, @structure.html_company)&.split(" at ")&.last&.strip
  end

  def extract_country_name(doc)
    # Assumes text like "Berlin - Germany"; keep the part after the dash.
    extract_text(doc, @structure.html_country)&.split("-")&.last&.strip
  end

  def update_inactive_jobs(all_jobs_from_db, active_jobs_from_site)
    inactive_jobs = all_jobs_from_db - active_jobs_from_site
    ScraperJob.where(unique_job_ref_id: inactive_jobs, site_name: @name).update_all(expired: true)
  end

  def log_error(url, message)
    puts "#{url}: #{message}"
  end
end
class ScraperStructure
  attr_accessor :html_job_link_class, :html_title, :html_company, :html_description,
                :html_city, :html_state, :html_country, :html_compensation

  def job_link_class(css_class)
    @html_job_link_class = css_class
  end

  def title(xpath)
    @html_title = xpath
  end

  def company(xpath)
    @html_company = xpath
  end

  def description(xpath)
    @html_description = xpath
  end

  def city(xpath)
    @html_city = xpath
  end

  def state(xpath)
    @html_state = xpath
  end

  def country(xpath)
    @html_country = xpath
  end

  def compensation(xpath)
    @html_compensation = xpath
  end
end
def define_scraper(name, &block)
  scraper = ScraperDSL.new(name)
  scraper.instance_eval(&block)
  scraper.scrape
end
# Assuming you have a Structure model in your database
Structure.where(active: true).find_each do |site_structure|
  structure = ScraperStructure.new
  structure.job_link_class(site_structure.html_job_link_class)
  structure.title(site_structure.html_title)
  structure.company(site_structure.html_company)
  structure.description(site_structure.html_description)
  structure.city(site_structure.html_city)
  structure.state(site_structure.html_state)
  structure.country(site_structure.html_country)
  structure.compensation(site_structure.html_compensation)

  scraper = ScraperDSL.new(site_structure.site_name)
  scraper.home_url(site_structure.home_url)
  scraper.jobs_url(site_structure.jobs_url)
  scraper.structure = structure
  scraper.scrape
end
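The Structure model itself isn't part of the gist; the loop above only assumes one row of selectors per active site. A minimal migration that would satisfy those reads might look like this (every column name here is inferred from the loop, not from a real schema):

class CreateStructures < ActiveRecord::Migration[7.0]
  def change
    create_table :structures do |t|
      t.string  :site_name
      t.string  :home_url
      t.string  :jobs_url
      t.string  :html_job_link_class   # CSS selector for job links on the index page
      t.string  :html_title            # XPath selectors for the detail page follow
      t.string  :html_company
      t.string  :html_description
      t.string  :html_city
      t.string  :html_state
      t.string  :html_country
      t.string  :html_compensation
      t.boolean :active, default: true
      t.timestamps
    end
  end
end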
# or DSL style for one site
define_scraper "GreenHouse" do
  home_url "https://www.greenhouse.io"
  jobs_url "https://www.greenhouse.io/jobs"

  structure do
    job_link_class ".job-link"
    title "//h1[@class='title']"
    company "//div[@class='company']"
    description "//div[@class='description']"
    city "//span[@class='city']"
    state "//span[@class='state']"
    country "//span[@class='country']"
    compensation "//span[@class='compensation']"
  end
end
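ScraperJob is likewise assumed rather than shown. From the calls above it needs the columns written in update_job_attributes, a boolean expired column (ActiveRecord then provides the expired? query method automatically), and a class-level reindex — the latter is what Searchkick adds when a model declares searchkick, which is one plausible reading of ScraperJob.reindex. A hedged sketch:

class ScraperJob < ApplicationRecord
  searchkick  # assumption: ScraperJob.reindex comes from Searchkick

  # unique_job_ref_id is the SHA-256 of the posting's href, used as the upsert key
  validates :unique_job_ref_id, presence: true, uniqueness: true

  scope :active, -> { where(expired: false) }
end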