import collections
import json
import pathlib
import urllib.parse

# Article fields whose URL host is tallied for every article.
CONTENT_URL_KEYS = ("given_url", "resolved_url")


def count_tags_and_hosts(directory):
    """Tally tags and URL hosts across the ``offset_*.json`` files in *directory*.

    Each file must hold a JSON object with a ``"list"`` entry that maps item
    ids to article objects. An empty ``"list"`` (of any container type) is
    skipped. NOTE(review): the field names look like saved Pocket API
    responses -- confirm against whatever produces these files.

    Args:
        directory: Path (``str`` or ``pathlib.Path``) of the directory to scan.

    Returns:
        A ``(tags, urls)`` tuple of ``collections.Counter`` objects: ``tags``
        counts each key of an article's optional ``"tags"`` mapping, ``urls``
        counts the network location of every field in ``CONTENT_URL_KEYS``.

    Raises:
        ValueError: If an article's ``"item_id"`` differs from the key it is
            stored under.
        KeyError: If a file has no ``"list"`` entry or an article has no
            ``"item_id"``.
    """
    tags = collections.Counter()
    urls = collections.Counter()
    # glob() yields paths in arbitrary order; sorting keeps the counters'
    # insertion order (and therefore the printed output) reproducible.
    for offset_path in sorted(pathlib.Path(directory).glob("offset_*.json")):
        # Only the parse needs the file handle; release it before processing.
        with offset_path.open("r", encoding="utf-8") as f:
            offset = json.load(f)
        articles = offset["list"]
        if not articles:
            # An empty page says nothing about the other files, so skip it
            # rather than abandoning the whole scan.
            continue
        for item_id, article in articles.items():
            if item_id != article["item_id"]:
                raise ValueError(
                    f"{offset_path}: key {item_id!r} does not match "
                    f"item_id {article['item_id']!r}"
                )
            # Pass the keys view, not the mapping itself: Counter.update()
            # would otherwise try to add the mapping's values as counts.
            tags.update(article.get("tags", {}).keys())
            for content_url_key in CONTENT_URL_KEYS:
                url = article.get(content_url_key)
                if url is None:
                    # Field absent (or null) for this article: nothing to count.
                    continue
                urls[urllib.parse.urlsplit(url).netloc] += 1
    return tags, urls


if __name__ == "__main__":
    tags, urls = count_tags_and_hosts(pathlib.Path.cwd())
    print(tags)
    print(urls)