Skip to content

Instantly share code, notes, and snippets.

@chsh
Last active January 12, 2021 22:44
Show Gist options
  • Select an option

  • Save chsh/19051bd89d9b4eaf4066a9f3a40c5ae8 to your computer and use it in GitHub Desktop.

Select an option

Save chsh/19051bd89d9b4eaf4066a9f3a40c5ae8 to your computer and use it in GitHub Desktop.
Crawl all data using the openBD API
# Creates the open_bds table: one row per ISBN, with a JSONB +content+
# column, a +last_crawled_at+ timestamp and the standard Rails timestamps.
class CreateOpenBds < ActiveRecord::Migration
  def change
    create_table :open_bds do |table|
      table.string   :isbn, null: false
      table.jsonb    :content
      table.datetime :last_crawled_at
      table.timestamps null: false
    end

    # Each ISBN may appear only once.
    add_index :open_bds, :isbn, unique: true
    # GIN index over the JSONB document.
    add_index :open_bds, :content, using: :gin

    # Plain single-column indexes on the timestamp columns.
    %i[last_crawled_at created_at updated_at].each do |column|
      add_index :open_bds, column
    end
  end
end
# ActiveRecord model holding one row per ISBN together with the JSON document
# fetched for it from https://api.openbd.jp.
#
# Entry point: OpenBd.crawl!
class OpenBd < ActiveRecord::Base
  concerning :CrawlerFeature do
    included do
      # Rows that have never been crawled.
      scope :not_crawled, -> { where(last_crawled_at: nil) }

      # Rows that have never been crawled, or whose last crawl is more than
      # 7 days before +now+ (defaults to the current time).
      scope :expired, ->(now = nil) {
        now ||= Time.zone.now
        where('last_crawled_at is null OR last_crawled_at < ?', now - 7.days)
      }
    end

    class_methods do
      # Creates rows for every ISBN reported by the coverage endpoint that is
      # not stored yet, then (re)fetches the content of every expired row.
      #
      # now - reference time used to decide which rows are expired;
      #       defaults to Time.zone.now.
      def crawl!(now = nil)
        now ||= Time.zone.now
        generate_records
        crawl_in_batches(now)
      end

      private

      # Inserts an empty row (isbn only) for each ISBN returned by +coverage+
      # that does not have a row yet. Works in slices of 10,000 ISBNs so the
      # existence check is a single query per slice.
      def generate_records
        coverage.each_slice(10_000) do |isbns|
          saved_isbns = where(isbn: isbns).pluck(:isbn)
          (isbns - saved_isbns).each do |new_isbn|
            where(isbn: new_isbn).create
          end
        end
      end

      # Fetches content for all expired rows, 5000 rows per API request, and
      # stores it together with the time of the update.
      #
      # ISBNs that were requested but for which nothing was stored are
      # appended to log/batch.log.
      def crawl_in_batches(now = nil)
        now ||= Time.zone.now
        expired(now).find_in_batches(batch_size: 5000) do |batch|
          isbn2rec = batch.index_by(&:isbn)
          loaded_isbns = []

          get(isbn2rec.keys).each do |data|
            if data.present? && data['summary'].present?
              isbn = data['summary']['isbn']
              rec = isbn2rec[isbn]
              # The response may carry an ISBN that is not a key of this
              # batch; skip it instead of calling #update on nil. The
              # requested ISBN then shows up in the "NOT LOADED" log below.
              next if rec.nil?

              rec.update last_crawled_at: Time.zone.now, content: data
              loaded_isbns << isbn
            else
              puts "Empty data!"
            end
          end

          # One array subtraction per batch instead of one per response entry.
          missing_isbns = isbn2rec.keys - loaded_isbns
          next if missing_isbns.empty?

          File.open('log/batch.log', 'a+') do |f|
            f.puts "NOT LOADED ISBNS=#{missing_isbns}"
          end
        end
      end

      # GETs /v1/coverage and returns the parsed JSON body (used by
      # +generate_records+ as the list of all known ISBNs).
      def coverage
        response = conn.get '/v1/coverage'
        JSON.parse(response.body)
      end

      # POSTs the given ISBNs (comma-joined) to /v1/get and returns the
      # parsed JSON body.
      def get(isbns)
        response = conn.post '/v1/get', isbn: isbns.join(',')
        JSON.parse(response.body)
      end

      # Builds a Faraday connection to the openBD API host.
      def conn
        Faraday.new(url: 'https://api.openbd.jp')
      end
    end
  end
end
@chsh
Copy link
Copy Markdown
Author

chsh commented Jan 25, 2017

usage

OpenBd.crawl!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment