Last active
July 2, 2022 18:45
-
-
Save connorshea/b6fe08a5d1d28bf88f252a91dd184c80 to your computer and use it in GitHub Desktop.
Proof-of-concept script that scrapes the GDQ schedule page and then pulls the vglist and IGDB IDs from the vglist API. This'd then be used to pull cover data and such from IGDB. It doesn't get every game in the schedule, but it can get most of them automatically, which will reduce the amount of manual effort involved in setting up the data for a…
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Prints coverage statistics for the schedule data stored in gdq.json: how many
# entries are games, how many of them carry vglist / IGDB IDs, and which games
# are still missing an ID.
require 'json'

# The scraper script writes gdq.json beside its own source file, so resolve the
# path relative to this file instead of the current working directory.
gdq_schedule = JSON.parse(File.read(File.join(__dir__, 'gdq.json')))

# Entries that are not flagged as non-game (recaps, pre-show, finale, ...).
games = gdq_schedule.reject { |entry| entry['non_game'] }
game_count = games.count

# Share of game entries that `count` represents, as a percentage rounded to two
# decimals. Returns 0.0 when there are no game entries to avoid printing NaN.
percent_of_games = lambda do |count|
  return 0.0 if game_count.zero?

  (count.fdiv(game_count) * 100).round(2)
end

# Prints a blank line, the heading, and then one bullet per entry name.
print_game_names = lambda do |heading, entries|
  puts
  puts heading
  entries.each { |entry| puts "- #{entry['name']}" }
end

puts "Total entries: #{gdq_schedule.count}"
puts "Game-only entries: #{game_count}"

# These counts run over every entry, matching the original report.
entries_with_vglist_ids = gdq_schedule.count { |entry| !entry['vglist_id'].nil? }
puts "Entries with vglist IDs: #{entries_with_vglist_ids} (#{percent_of_games.call(entries_with_vglist_ids)}%)"

entries_with_igdb_ids = gdq_schedule.count { |entry| !entry['igdb_id'].nil? }
puts "Entries with IGDB IDs: #{entries_with_igdb_ids} (#{percent_of_games.call(entries_with_igdb_ids)}%)"

games_without_vglist_ids = games.select { |entry| entry['vglist_id'].nil? }
games_without_igdb_ids = games.select { |entry| entry['igdb_id'].nil? }

print_game_names.call('Games without vglist IDs:', games_without_vglist_ids)
print_game_names.call('Games without IGDB IDs:', games_without_igdb_ids)

# Test the fields directly rather than subtracting one name list from the
# other, so duplicate names in the schedule cannot hide an entry.
print_game_names.call(
  'Games with vglist IDs and no IGDB ID:',
  games_without_igdb_ids.reject { |entry| entry['vglist_id'].nil? }
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'bundler/inline' | |
gemfile do | |
source 'https://rubygems.org' | |
gem 'nokogiri' | |
gem 'graphql-client', '~> 0.18.0' | |
gem 'debug' | |
end | |
require 'json' | |
require 'open-uri' | |
require 'net/http' | |
require 'nokogiri' | |
require "graphql/client" | |
require "graphql/client/http" | |
require 'debug' | |
# For comparing using Levenshtein Distance. | |
# https://stackoverflow.com/questions/16323571/measure-the-distance-between-two-strings-with-ruby | |
require "rubygems/text" | |
# GraphQL client setup for the vglist API.
#
# NOTE: loading this module performs a network request, because `Schema` is
# fetched from the endpoint when the constant is assigned.
module VGListGraphQL
  # HTTP adapter pointed at the vglist GraphQL endpoint.
  HTTP = GraphQL::Client::HTTP.new("https://vglist.co/graphql") do
    # Headers attached to each request. The credentials are read from the
    # VGLIST_EMAIL and VGLIST_TOKEN environment variables; an unset variable
    # yields a nil header value here.
    # NOTE(review): presumably the API rejects requests without these
    # credentials - confirm, and consider failing fast when they are unset.
    def headers(_context)
      {
        "User-Agent": "GDQ Schedule Parser",
        "X-User-Email": ENV['VGLIST_EMAIL'],
        "X-User-Token": ENV['VGLIST_TOKEN'],
        "Content-Type": "application/json",
        "Accept": "*/*"
      }
    end
  end

  # Fetch latest schema on init, this will make a network request.
  Schema = GraphQL::Client.load_schema(HTTP)

  # Client used to parse and execute queries against the loaded schema.
  Client = GraphQL::Client.new(schema: Schema, execute: HTTP)
end
# Helpers for turning scraped GDQ schedule cell text into structured values and
# for matching a game name against vglist search results.
class GDQHelper
  # Searches vglist by name and returns up to 10 candidate games with their
  # vglist ID, name and IGDB ID.
  GAME_SEARCH_QUERY = VGListGraphQL::Client.parse <<~GRAPHQL
    query($name: String!) {
      gameSearch(query: $name, first: 10) {
        nodes {
          id
          name
          igdbId
        }
      }
    }
  GRAPHQL

  # Largest Levenshtein distance at which two lowercased names still count as
  # the same game.
  MAX_NAME_DISTANCE = 2

  # Substitutions applied to both names before the final equality check.
  NAME_REPLACEMENTS = { '&' => 'and' }.freeze

  # Levenshtein distance borrowed from RubyGems (needs `rubygems/text`).
  # Built once here instead of creating a throwaway class on every comparison.
  # https://stackoverflow.com/questions/16323571/measure-the-distance-between-two-strings-with-ruby
  LEVENSHTEIN_DISTANCE = Object.new.extend(Gem::Text).method(:levenshtein_distance)

  private_constant :MAX_NAME_DISTANCE, :NAME_REPLACEMENTS, :LEVENSHTEIN_DISTANCE

  # Given a string like '1:30:00', convert to an integer number of seconds.
  # Returns 0 when the string does not have exactly three ':'-separated parts.
  def self.estimate_to_seconds(estimate)
    parts = estimate.split(':').map(&:to_i)
    return 0 unless parts.length == 3

    hours, minutes, seconds = parts
    (hours * 60 * 60) + (minutes * 60) + seconds
  end

  # Given a string like 'Foo, Bar', return an array of `['Foo', 'Bar']`.
  def self.parse_runners(runners)
    runners.split(',').map(&:strip)
  end

  # Given a string like 'Any% — PC', return the category and platform as a
  # 2-tuple (`['Any%', 'PC']`). The platform is whatever follows the last em
  # dash; everything before it is the category, so 'Any% — No OoB — PC' gives
  # `['Any% — No OoB', 'PC']`. A string without an em dash is returned as both
  # category and platform. Returns nil for nil or when nothing can be split out
  # (e.g. an empty string).
  def self.parse_category_and_platform(string)
    return nil if string.nil?

    parts = string.split('—')
    return nil if parts.empty?

    *category_parts, platform = parts
    # Only the leading parts belong to the category; the last one is the platform.
    category = category_parts.empty? ? platform : category_parts.join('—')
    [category.strip, platform.strip]
  end

  # Scrub the name to remove a 'BONUS GAME -' / 'BONUS GAME 1 -' marker
  # (case-insensitive, the number is optional and may have several digits).
  def self.name_scrubber(name)
    name.gsub(/BONUS GAME(?: \d*)? -/i, '').strip
  end

  # Whether this is a bonus game.
  def self.bonus_game?(name)
    name.downcase.start_with?('bonus game')
  end

  # Whether this is a non-game entry in the schedule (recaps, pre-show, finale).
  def self.non_game?(name)
    name.start_with?('Daily Recap') || ['Pre-Show', 'Finale', 'Event Recap'].include?(name)
  end

  # Look the game up on vglist and return `[vglist_id, igdb_id]`. Both values
  # are nil when no single matching game was found.
  def self.get_vglist_and_igdb_id(name)
    vglist_game = vglist_game_query(name)
    [vglist_game&.id&.to_i, vglist_game&.igdb_id]
  end

  # Search vglist for `name` and return the single best-matching node, or nil
  # when there is no match, the match is ambiguous, or the response has no data.
  def self.vglist_game_query(name)
    response = VGListGraphQL::Client.query(GAME_SEARCH_QUERY, variables: { name: name })
    game_nodes = response.data&.game_search&.nodes
    if game_nodes.nil?
      # Skip this game rather than aborting the whole run on one bad response.
      warn "vglist search for #{name.inspect} returned no data, skipping."
      return nil
    end

    exact_matches = game_nodes.select { |node| node.name == name }
    # If there's just one game that has the exact name we want, return it.
    return exact_matches.first if exact_matches.length == 1
    # "Shadow of the Colossus" is the name of two different games due to the
    # remaster, so return nil when several games share the exact name rather
    # than potentially choosing the wrong one.
    return nil if exact_matches.length > 1

    # Otherwise fall back to name-closeness, and only accept an unambiguous hit.
    similar_matches = game_nodes.select { |node| games_have_same_name?(name, node.name) }
    similar_matches.length == 1 ? similar_matches.first : nil
  end

  # Whether two names refer to the same game: equal ignoring case, within
  # MAX_NAME_DISTANCE edits of each other, or equal after NAME_REPLACEMENTS.
  # NOTE(review): the edit-distance rule also accepts pairs such as
  # 'portal' / 'portal 2' - confirm that this is acceptable.
  def self.games_have_same_name?(name1, name2)
    name1 = name1.downcase
    name2 = name2.downcase
    return true if name1 == name2
    return true if LEVENSHTEIN_DISTANCE.call(name1, name2) <= MAX_NAME_DISTANCE

    normalize = lambda do |value|
      NAME_REPLACEMENTS.reduce(value) { |result, (before, after)| result.gsub(before, after).strip }
    end
    normalize.call(name1) == normalize.call(name2)
  end
end
# Known schedule pages: archived snapshots of past events plus the live page.
SCHEDULE_URLS = {
  agdq2018: 'https://web.archive.org/web/20171202003955/https://gamesdonequick.com/schedule',
  sgdq2018: 'https://web.archive.org/web/20180428144327/https://gamesdonequick.com/schedule',
  agdq2019: 'https://web.archive.org/web/20190104080309/https://gamesdonequick.com/schedule',
  sgdq2019: 'https://web.archive.org/web/20190531022612/https://gamesdonequick.com/schedule',
  agdq2020: 'https://web.archive.org/web/20200121063630/https://gamesdonequick.com/schedule',
  sgdq2020: 'https://web.archive.org/web/20200810014929/https://gamesdonequick.com/schedule',
  agdq2021: 'https://web.archive.org/web/20210107025302/https://gamesdonequick.com/schedule',
  sgdq2021: 'https://web.archive.org/web/20210528184033/https://gamesdonequick.com/schedule',
  agdq2022: 'https://web.archive.org/web/20220106230609/https://gamesdonequick.com/schedule',
  current: 'https://gamesdonequick.com/schedule'
}.freeze

# The schedule page to scrape; swap the key to scrape an archived event.
GDQ_SCHEDULE_URL = SCHEDULE_URLS[:current]
# GDQ_SCHEDULE_URL = SCHEDULE_URLS[:sgdq2018]

response = Net::HTTP.get_response(URI.parse(GDQ_SCHEDULE_URL))
# Stop before parsing an error or redirect page, which would otherwise
# overwrite gdq.json with an empty list.
abort "Failed to fetch #{GDQ_SCHEDULE_URL} (HTTP #{response.code})." unless response.is_a?(Net::HTTPSuccess)

doc = Nokogiri::HTML(response.body)

# Each run has a main row (name, runners) and, usually, a `second-row`
# (estimate, category/platform, commentator). They are paired up by position.
rows = doc.css('#runTable tbody tr:not(.second-row)')
second_rows = doc.css('#runTable tbody tr.second-row')

# Decoded, whitespace-trimmed text of a node, or '' when the node is missing.
# `text` is used rather than `to_s` because `to_s` serializes the node as HTML,
# which would leave entities such as `&amp;` in the game names.
node_text = ->(node) { node.nil? ? '' : node.text.strip }

gdq_games_list = rows.each_with_index.map do |row, i|
  # `second_row` can be nil, which can happen for older GDQ schedule pages
  # where the Finale has no second row.
  second_row = second_rows[i]
  name = node_text.call(row.children[3].children[0])

  game = {
    name: GDQHelper.name_scrubber(name),
    bonus_game: GDQHelper.bonus_game?(name),
    non_game: GDQHelper.non_game?(name),
    category: nil,
    platform: nil,
    vglist_id: nil,
    igdb_id: nil,
    estimate: nil,
    commentator: nil
  }

  unless second_row.nil?
    game[:category], game[:platform] =
      GDQHelper.parse_category_and_platform(node_text.call(second_row.children[3].children[0]))
    game[:estimate] = GDQHelper.estimate_to_seconds(node_text.call(second_row.children[1].children[2]))
    game[:commentator] = node_text.call(second_row.children[5].children[1])
  end

  game[:runners] = GDQHelper.parse_runners(node_text.call(row.children[5].children[0]))

  # Add vglist_id and igdb_id unless this is a non-game entry, no reason to
  # waste API requests on non-game entries.
  unless game[:non_game]
    game[:vglist_id], game[:igdb_id] = GDQHelper.get_vglist_and_igdb_id(game[:name])
    # Sleep for 1 second between lookups because we don't want to spam the vglist API.
    sleep 1
  end

  game
end

File.write(File.join(__dir__, 'gdq.json'), JSON.pretty_generate(gdq_games_list))
puts 'Written to file.'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment