"""Count tag names and URL hosts across ``offset_*.json`` files in the CWD.

Each file is expected to hold a JSON object whose ``"list"`` entry maps item
ids to article records.  Two ``collections.Counter`` objects are printed: one
keyed by tag name, one keyed by the network location (host) of each article
URL found under ``CONTENT_URL_KEYS``.
"""

import collections
import json
import pathlib
import urllib.parse

# Article-record keys whose values are URLs; the host of each one is counted.
CONTENT_URL_KEYS = ("given_url", "resolved_url")


def count_tags_and_hosts(offsets):
    """Tally tag names and URL hosts over parsed offset pages.

    Args:
        offsets: Iterable of decoded JSON objects.  Each must have a
            ``"list"`` entry that is either empty or a mapping of item id to
            article record.  An article record must carry ``"item_id"`` and
            every key in ``CONTENT_URL_KEYS``; ``"tags"`` is optional and,
            when present, is a mapping keyed by tag name.

    Returns:
        A ``(tags, urls)`` tuple of ``collections.Counter`` objects.

    Raises:
        ValueError: If a record's ``"item_id"`` differs from the key it is
            stored under.
        KeyError: If ``"list"``, ``"item_id"`` or one of the URL keys is
            missing.
    """
    tags = collections.Counter()
    urls = collections.Counter()
    for offset in offsets:
        articles = offset["list"]
        if not articles:
            # An empty page has nothing to count.  Skip it rather than stop:
            # pages are not guaranteed to arrive in any particular order, so
            # stopping here could silently drop later, non-empty pages.
            continue
        for item_id, article in articles.items():
            # Explicit check instead of ``assert`` so it survives ``python -O``.
            if item_id != article["item_id"]:
                raise ValueError(
                    f"item id mismatch: key {item_id!r} != "
                    f"record {article['item_id']!r}"
                )
            # Pass the keys view, not the mapping itself: Counter.update()
            # given a mapping would add its *values* as counts.
            tags.update(article.get("tags", {}).keys())
            urls.update(
                urllib.parse.urlsplit(article[content_url_key]).netloc
                for content_url_key in CONTENT_URL_KEYS
            )
    return tags, urls


def _iter_offsets(directory):
    """Yield the decoded JSON content of each ``offset_*.json`` in *directory*."""
    # Sorted so the traversal (and thus tie order in the printed Counters)
    # does not depend on filesystem enumeration order.
    for offset_path in sorted(directory.glob("offset_*.json")):
        with offset_path.open("r", encoding="utf-8") as f:
            yield json.load(f)


if __name__ == "__main__":
    tags, urls = count_tags_and_hosts(_iter_offsets(pathlib.Path.cwd()))
    print(tags)
    print(urls)