"""Convert a vgmdb.net album webpage to a structured representation."""
import json
import sys
import re
import datetime

import requests
from bs4 import BeautifulSoup


def convert(url: str, soup: BeautifulSoup) -> dict:
    res = {}
    res['url'] = url

    # Title: the <h1> holds one <span class="albumtitle"> per language, keyed
    # by its lang attribute.
    res['title'] = {}
    h1_title_elems = soup.select('#innermain h1 .albumtitle')
    for h1_title in h1_title_elems:
        res['title'][h1_title.get('lang')] = h1_title.text.strip('/ ')
    # Titles rendered outside the <h1> (alternate spellings etc.). The parts
    # are separated by <br> tags, which .text drops entirely, so iterate over
    # the individual text nodes instead of splitting on '<br>'.
    other_title_elems = soup.select('#innermain > div span.albumtitle')
    other_titles = []
    for other_title_elem in other_title_elems:
        for title in other_title_elem.stripped_strings:
            if title not in res['title'].values() and title not in other_titles:
                other_titles.append(title)
    if other_titles:
        res['title']['other'] = other_titles
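    # res['title'] now maps lang codes to names, with any extra renderings
    # under 'other'; the values here are hypothetical:
    #   {'en': 'Example Album', 'ja': '...', 'other': ['Alternate Title']}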
    # Cover: the art is applied as an inline CSS background-image, so pull the
    # URL out of the url('...') wrapper in the style attribute.
    cover_art_elem = soup.select_one('#coverart')
    if cover_art_elem:
        sty = cover_art_elem.get('style')
        if sty and "url('" in sty:
            res['coverUrl'] = sty.split("url('")[1].split("')")[0]
    # Album info: the first #album_infobit_large table holds label/value rows.
    res['info'] = {}
    info_tables = soup.select('#album_infobit_large')
    if info_tables:
        info_rows = info_tables[0].select('tr')
        for row in info_rows:
            pair = row.select('td')
            if len(pair) < 2:
                continue
            child_table = pair[1].select_one('table')
            if not child_table:
                res['info'][pair[0].text.strip()] = pair[1].text.strip()
            else:
                # e.g. the drop-down after the Catalog Number: the visible
                # value lives in a <span>, minus its inline <script>...
                span = pair[1].select_one('span')
                script = span.select_one('script')
                if script:
                    script.decompose()
                res['info'][pair[0].text.strip()] = span.text.strip()
                # ...and the alternatives live in a child table whose header
                # cell names the list.
                name = child_table.select_one('td.thead').text.strip()
                values = [td.text.strip() for td in child_table.select('td:not(.thead)')]
                res['info'][name] = values
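    # res['info'] now holds entries such as (keys and values hypothetical):
    #   {'Catalog Number': 'ABCD-1234', 'Release Date': 'Apr 27, 2025',
    #    'Other Versions': ['ABCD-1234', 'ABCD-1235']}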
    # Credits: the second #album_infobit_large table, same label/value layout.
    res['credits'] = {}
    if len(info_tables) > 1:
        credit_rows = info_tables[1].select('tr')
        for row in credit_rows:
            pair = row.select('td')
            if len(pair) < 2:
                continue
            res['credits'][pair[0].text.strip()] = pair[1].text.strip()
    # Tracklist: #tracklist contains one <span> per language tab (the tabs
    # themselves are listed in #tlnav). The first pass (i == 0) builds the
    # discs and tracks; later passes only fill in that language's titles.
    available_langs = [li.text.strip() for li in soup.select('#tlnav li')]
    res['discs'] = []
    tracklist_divs = soup.select('#tracklist > span')
    for i, div in enumerate(tracklist_divs):
        disc_names = [s.text.strip() for s in div.find_all(
            lambda t: t.name == 'span' and 'Disc' in t.text and 'length' not in t.text,
            recursive=False)]
        disc_descs = [s.text.strip() for s in div.find_all('span', {'class': 'label'}, recursive=False)]
        # 'Disc length x:xx' -> 'x:xx'
        disc_lengths = [s.text.strip()[12:] for s in div.find_all(
            lambda t: t.name == 'span' and 'Disc length' in t.text, recursive=False)]
        disc_table_elems = div.select('table.role')
        for j, disc_table_elem in enumerate(disc_table_elems):
            if i == 0:
                disc = {}
                disc['name'] = disc_names[j]
                if disc_descs:
                    disc['desc'] = disc_descs[j]
                if disc_lengths:
                    disc['length'] = disc_lengths[j]
                disc['tracks'] = []
                res['discs'].append(disc)
            else:
                disc = res['discs'][j]
            track_rows = disc_table_elem.select('tr')
            for k, row in enumerate(track_rows):
                tds = row.select('td')
                if i == 0:
                    track = {}
                    track['number'] = tds[0].text.strip()
                    track['title'] = {}
                    if len(tds) > 2:
                        track['length'] = tds[2].text.strip()
                    disc['tracks'].append(track)
                else:
                    track = disc['tracks'][k]
                track['title'][available_langs[i]] = tds[1].text.strip()
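    # res['discs'] ends up shaped like (values hypothetical):
    #   [{'name': 'Disc 1', 'length': '54:32', 'tracks': [
    #       {'number': '01', 'title': {'English': 'Opening Theme'}, 'length': '2:34'}]}]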
    # Notes: keep the inner HTML, since notes frequently contain markup.
    notes_elem = soup.select_one('#notes')
    if notes_elem:
        res['notes'] = notes_elem.encode_contents().decode()

    # Album stats: each stat renders as '<b class="label">Key</b>: value', so
    # drop the key plus its trailing separator from the parent's text.
    res['stats'] = {}
    keys = ['Category', 'Products represented', 'Platforms represented']
    for key in keys:
        b = soup.find('b', {'class': 'label'}, string=key)
        if b:
            res['stats'][key] = b.parent.text.strip()[len(key) + 1:].strip()
    def get_links(header, key):
        """Collect the links under the given <h3> section header into res[key]."""
        header_elem = soup.find('h3', string=header)
        if header_elem:
            res[key] = []
            links = header_elem.parent.parent.find_next_sibling().select('a')
            for link in links:
                a = {}
                a['name'] = link.text.strip()
                # Unwrap vgmdb's /redirect/<id>/<target> outbound links.
                m = re.match(r'/redirect/\d+/(.*)', link.get('href'))
                if m:
                    a['url'] = m.group(1)
                else:
                    a['url'] = link.get('href')
                res[key].append(a)
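    # e.g. a (hypothetical) href of '/redirect/12345/www.example.com/album'
    # is recorded as {'name': 'Example Store', 'url': 'www.example.com/album'}.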
    # External links
    get_links('Available at', 'availableAt')
    get_links('Websites', 'websites')

    # Covers: every image in the gallery, with its label when present.
    res['covers'] = []
    cover_elems = soup.select('#cover_gallery a')
    for cover_elem in cover_elems:
        cover = {}
        label = cover_elem.select_one('.label')
        if label:
            cover['label'] = label.text.strip()
        cover['url'] = cover_elem.get('href')
        res['covers'].append(cover)

    # Scrape timestamp
    res['time'] = datetime.datetime.now().isoformat()
    return res


if __name__ == '__main__':
    url = sys.argv[1]
    if not url.startswith('https://vgmdb.net/album/'):
        raise ValueError('URL must start with https://vgmdb.net/album/')
    # Fetch the album page, failing loudly on HTTP errors.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    data = convert(url, soup)
    print(json.dumps(data, indent=2, ensure_ascii=False))
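
# Example invocation (the script name, album ID, and output values here are
# all hypothetical):
#
#   python album.py https://vgmdb.net/album/12345
#
# prints something shaped like:
#
#   {
#     "url": "https://vgmdb.net/album/12345",
#     "title": {"en": "Example Album"},
#     "info": {"Catalog Number": "ABCD-1234"},
#     "credits": {"Composer": "..."},
#     "discs": [...],
#     "time": "2025-04-27T08:58:00"
#   }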