Convert a vgmdb.net album webpage to a structured representation
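# Usage: python <this script> https://vgmdb.net/album/<id>
# Fetches the album page and prints the structured album data as JSON to stdout.
# Third-party dependencies: requests, beautifulsoup4.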
import datetime
import json
import re
import sys

import requests
from bs4 import BeautifulSoup


def convert(url: str, soup: BeautifulSoup) -> dict:
    res = {}
    res['url'] = url

    # Title: the <h1> carries the title in one or more languages, tagged by lang
    res['title'] = {}
    h1_title_elems = soup.select('#innermain h1 .albumtitle')
    for h1_title in h1_title_elems:
        res['title'][h1_title.get('lang')] = h1_title.text.strip('/ ')
    other_title_elems = soup.select('#innermain > div span.albumtitle')
    other_titles = []
    for other_title_elem in other_title_elems:
        # titles rendered on separate lines (<br>) are split apart via
        # get_text's separator
        titles = other_title_elem.get_text(separator='\n').split('\n')
        for title in titles:
            title = title.strip()
            if title and title not in res['title'].values() and title not in other_titles:
                other_titles.append(title)
    if other_titles:
        res['title']['other'] = other_titles

    # cover: the main cover image is set as an inline CSS background-image
    coverElem = soup.select_one('#coverart')
    if coverElem:
        sty = coverElem.get('style')
        if sty:
            res['coverUrl'] = sty.split("url('")[1].split("')")[0]

    # album info
    res['info'] = {}
    info_tables = soup.select('#album_infobit_large')
    if info_tables:
        info_rows = info_tables[0].select('tr')
        for row in info_rows:
            pair = row.select('td')
            if len(pair) < 2:
                continue
            child_table = pair[1].select_one('table')
            if not child_table:
                res['info'][pair[0].text.strip()] = pair[1].text.strip()
            else:
                # e.g. the drop-down after the Catalog Number
                span = pair[1].select_one('span')
                script = span.select_one('script')
                if script:
                    script.decompose()
                res['info'][pair[0].text.strip()] = span.text.strip()
                # child table
                name = child_table.select_one('td.thead').text.strip()
                values = [td.text.strip() for td in child_table.select('td:not(.thead)')]
                res['info'][name] = values

    # credits
    res['credits'] = {}
    if len(info_tables) > 1:
        credit_rows = info_tables[1].select('tr')
        for row in credit_rows:
            pair = row.select('td')
            res['credits'][pair[0].text.strip()] = pair[1].text.strip()

    # tracklist
    # find available languages
    available_langs = [li.text.strip() for li in soup.select('#tlnav li')]
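    # Each direct <span> child of #tracklist holds one language tab's rendering
    # of the same tracklist. The first pass (i == 0) builds the disc and track
    # structure; subsequent passes only fill in the translated track titles.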
    res['discs'] = []
    tracklist_divs = soup.select('#tracklist > span')
    for i, div in enumerate(tracklist_divs):
        disc_names = [s.text.strip() for s in div.find_all(
            lambda t: t.name == 'span' and 'Disc' in t.text and 'length' not in t.text,
            recursive=False)]
        disc_descs = [s.text.strip() for s in div.find_all('span', {'class': 'label'}, recursive=False)]
        disc_lengths = [s.text.strip()[12:] for s in div.find_all(
            lambda t: t.name == 'span' and 'Disc length' in t.text, recursive=False)]
        disc_table_elems = div.select('table.role')
        for j, disc_table_elem in enumerate(disc_table_elems):
            if i == 0:
                disc = {}
                disc['name'] = disc_names[j]
                if disc_descs:
                    disc['desc'] = disc_descs[j]
                if disc_lengths:
                    disc['length'] = disc_lengths[j]
                disc['tracks'] = []
                res['discs'].append(disc)
            else:
                disc = res['discs'][j]
            track_rows = disc_table_elem.select('tr')
            for k, row in enumerate(track_rows):
                tds = row.select('td')
                if i == 0:
                    track = {}
                    track['number'] = tds[0].text.strip()
                    track['title'] = {}
                    if len(tds) > 2:
                        track['length'] = tds[2].text.strip()
                    disc['tracks'].append(track)
                else:
                    track = disc['tracks'][k]
                track['title'][available_langs[i]] = tds[1].text.strip()

    # notes
    notes_elem = soup.select_one('#notes')
    if notes_elem:
        res['notes'] = notes_elem.encode_contents().decode()

    # album stats
    res['stats'] = {}
    keys = ['Category', 'Products represented', 'Platforms represented']
    for key in keys:
        b = soup.find('b', {'class': 'label'}, string=key)
        if b:
            res['stats'][key] = b.parent.text.strip()[len(key) + 1:].strip()
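
    # Outbound links go through vgmdb's /redirect/<id>/<target> indirection;
    # get_links unwraps them to the target URL where the pattern matches.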
    def get_links(header, key):
        headerElem = soup.find('h3', string=header)
        if headerElem:
            res[key] = []
            links = headerElem.parent.parent.find_next_sibling().select('a')
            for link in links:
                a = {}
                a['name'] = link.text.strip()
                m = re.match(r'/redirect/\d+/(.*)', link.get('href'))
                if m:
                    a['url'] = m.group(1)
                else:
                    a['url'] = link.get('href')
                res[key].append(a)

    # available at / websites
    get_links('Available at', 'availableAt')
    get_links('Websites', 'websites')

    # covers
    res['covers'] = []
    cover_elems = soup.select('#cover_gallery a')
    for cover_elem in cover_elems:
        cover = {}
        label = cover_elem.select_one('.label')
        if label:
            cover['label'] = label.text.strip()
        cover['url'] = cover_elem.get('href')
        res['covers'].append(cover)

    res['time'] = datetime.datetime.now().isoformat()
    return res


if __name__ == '__main__':
    url = sys.argv[1]
    if not url.startswith('https://vgmdb.net/album/'):
        raise ValueError('URL must start with https://vgmdb.net/album/')
    # fetch the album page
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    data = convert(url, soup)
    print(json.dumps(data, indent=2, ensure_ascii=False))