Last active
November 3, 2022 21:40
-
-
Save senko/b031f61f61d89f96e165659d3f022784 to your computer and use it in GitHub Desktop.
Get the most popular languages on Hacker News
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
#
# Calculate a top list of programming languages based on HN stories/comments/points
# More info: https://blog.senko.net/relative-popularity-of-programming-languages-on-hacker-news
from datetime import datetime, timedelta
from json import dump, load
from os import makedirs
from os.path import join, exists
from tempfile import gettempdir
from typing import Optional

import requests
# Endpoint that every search request is sent to.
HN_SEARCH_URL = "https://hn.algolia.com/api/v1/search_by_date"

# Fixed query parameters merged into every request by get_page().
# NOTE(review): presumably these restrict the search to stories, match the
# query against titles only, and ask for up to 1000 hits per page -- confirm
# against the search API documentation.
HN_SEARCH_PARAMS = dict(
    tags="story",
    restrictSearchableAttributes="title",
    hitsPerPage="1000",
)
# Languages to rank. Each name is searched for as the quoted phrase
# "in <name>" (see get_results). "C", "C++" and "C#" as well as "Scheme" and
# "Racket" get special treatment in postprocess_results().
LANGUAGES = [
    "C", "C++", "C#",
    "Clojure", "Dart",
    "Elixir", "Erlang",
    "F#", "Go", "Haskell",
    "Java", "JavaScript",
    "Kotlin", "Lisp", "Lua",
    "PHP", "Python",
    "Racket", "Ruby", "Rust",
    "Scala", "Scheme", "Swift",
    "TypeScript", "Zig",
]
def get_page(query: str, created_since: datetime, page: int) -> dict:
    """Fetch a single page of search results from the HN search API.

    Args:
        query: Search string sent as the ``query`` parameter.
        created_since: Lower bound on story creation time; it is sent as a
            ``created_at_i > <unix timestamp>`` numeric filter.
        page: Page index. It is only sent when non-zero, so ``0`` requests
            the API's default (first) page.

    Returns:
        The decoded JSON response body. Callers in this file read its
        ``"hits"`` and ``"nbPages"`` entries.

    Raises:
        requests.HTTPError: If the API responds with an error status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    params = {
        "query": query,
        # Whole seconds: the filter compares against an integer timestamp
        # field, so the fractional part of timestamp() carries no information.
        "numericFilters": f"created_at_i > {int(created_since.timestamp())}",
        # Shared parameters come last so they win on key clashes, exactly as
        # a dict.update() after construction would.
        **HN_SEARCH_PARAMS,
    }
    if page:
        params["page"] = page

    # Without a timeout requests waits indefinitely on an unresponsive server.
    resp = requests.get(HN_SEARCH_URL, params=params, timeout=30)
    resp.raise_for_status()
    return resp.json()
def get_results(lang: str) -> list[dict]:
    """Collect all stories from the last 365 days whose search matches ``"in <lang>"``.

    The language name is wrapped into the quoted phrase ``"in <lang>"`` and
    every result page reported by the API is fetched via get_page().

    Args:
        lang: Language name to search for, e.g. ``"Python"``.

    Returns:
        The concatenated ``"hits"`` lists of all fetched pages.
    """
    query = f'"in {lang}"'
    created_since = datetime.now() - timedelta(days=365)

    results = []
    page = 0
    while True:
        response = get_page(query, created_since, page)
        results.extend(response["hits"])
        page += 1
        # Pages are numbered 0 .. nbPages - 1, so stop once the next index
        # would fall outside that range. Comparing before incrementing would
        # issue one extra request past the last page.
        if page >= response.get("nbPages", 0):
            break
    return results
def download_language_results(data_dir: str, lang: str) -> list[dict]:
    """Return the search results for ``lang``, using a JSON file cache.

    Results are cached in ``<data_dir>/<lang>.json``. If that file exists its
    content is returned without contacting the API; otherwise the results are
    fetched with get_results(), written to the file and returned.

    Args:
        data_dir: Directory holding the cache files. It is created on demand.
        lang: Language name; also used as the cache file's base name.

    Returns:
        The list of result records for the language.
    """
    fname = join(data_dir, lang + ".json")
    if exists(fname):
        with open(fname, "r", encoding="utf-8") as fp:
            return load(fp)

    results = get_results(lang)

    # The cache directory does not exist on a first run; without this the
    # open() below fails with FileNotFoundError after the download succeeded.
    makedirs(data_dir, exist_ok=True)
    with open(fname, "w", encoding="utf-8") as fp:
        dump(results, fp)
    return results
def get_all_results(data_dir: str) -> dict[str,list[dict]]:
    """Load the results for every entry of LANGUAGES.

    Args:
        data_dir: Cache directory handed to download_language_results().

    Returns:
        A dict mapping each language name to its list of result records,
        in LANGUAGES order.
    """
    return {
        language: download_language_results(data_dir, language)
        for language in LANGUAGES
    }
def postprocess_results(results: dict[str,list[dict]]) -> dict[str,list[dict]]:
    """Clean up overlapping result sets, modifying ``results`` in place.

    Two adjustments are made:

    * Stories that appear in the "C++" or "C#" results are removed from the
      "C" results, because all C++/C# results also match C.
    * The "Racket" results are merged into "Scheme" and the "Racket" entry
      is removed. A story already present under "Scheme" is not added a
      second time, so it is not counted twice.

    Entries that are absent are skipped, which makes it safe to call the
    function again on an already processed dict.

    Args:
        results: Mapping of language name to result records; each record
            must carry an ``"objectID"`` key.

    Returns:
        The same ``results`` dict, after modification.
    """
    # All C++/C# results also match C, so we remove them in postproc
    non_c_ids = set()
    for lang in ("C++", "C#"):
        non_c_ids.update(item["objectID"] for item in results.get(lang, ()))
    if "C" in results:
        results["C"] = [
            item for item in results["C"] if item["objectID"] not in non_c_ids
        ]

    # Add Racket results to Scheme, skipping stories Scheme already has
    racket_results = results.pop("Racket", None)
    if racket_results:
        scheme_results = results.setdefault("Scheme", [])
        seen_ids = {item["objectID"] for item in scheme_results}
        for item in racket_results:
            if item["objectID"] not in seen_ids:
                seen_ids.add(item["objectID"])
                scheme_results.append(item)

    return results
def calculate_score(results: dict[str,list[dict]]) -> dict[str, dict[str, int]]:
    """Aggregate per-language totals from the result records.

    Args:
        results: Mapping of language name to its list of story records.

    Returns:
        A dict mapping each language to ``{"stories", "comments", "points"}``:
        the number of records and the sums of their ``"num_comments"`` and
        ``"points"`` values.
    """
    score = {}
    for lang, stories in results.items():
        score[lang] = {
            "stories": len(stories),
            # A missing or null value counts as 0 so that a single incomplete
            # record cannot abort the whole tally.
            "comments": sum(s.get("num_comments") or 0 for s in stories),
            "points": sum(s.get("points") or 0 for s in stories),
        }
    return score
def sort_by_stories(score: dict[str, dict[str, int]]) -> list[str]:
    """Return the language names ordered by story count, highest first.

    Languages with equal counts keep the relative order they have in
    ``score``.
    """
    # Iterating a dict yields its keys, so no .keys() call is needed.
    return sorted(score, key=lambda lang: score[lang]["stories"], reverse=True)
def sort_by_comments(score: dict[str, dict[str, int]]) -> list[str]:
    """Return the language names ordered by comment count, highest first.

    Languages with equal counts keep the relative order they have in
    ``score``.
    """
    # Iterating a dict yields its keys, so no .keys() call is needed.
    return sorted(score, key=lambda lang: score[lang]["comments"], reverse=True)
def sort_by_points(score: dict[str, dict[str, int]]) -> list[str]:
    """Return the language names ordered by total points, highest first.

    Languages with equal totals keep the relative order they have in
    ``score``.
    """
    # Iterating a dict yields its keys, so no .keys() call is needed.
    return sorted(score, key=lambda lang: score[lang]["points"], reverse=True)
def generate_top_list(data_dir):
    """Build the complete ranking data set.

    Loads the results for all languages (using ``data_dir`` as the cache
    directory), post-processes them, computes the per-language scores and
    ranks the languages by each metric.

    Returns a dict with the keys ``"score"`` (per-language totals) and
    ``"by_stories"``, ``"by_comments"``, ``"by_points"`` (language names in
    ranked order).
    """
    raw_results = get_all_results(data_dir)
    cleaned_results = postprocess_results(raw_results)
    score = calculate_score(cleaned_results)

    top_list = {"score": score}
    top_list["by_stories"] = sort_by_stories(score)
    top_list["by_comments"] = sort_by_comments(score)
    top_list["by_points"] = sort_by_points(score)
    return top_list
def print_top_list(top_list):
    """Print the three rankings (stories, comments, points) to stdout.

    ``top_list`` is the dict produced by generate_top_list(): ``"score"``
    holds the per-language totals and each ``"by_*"`` entry holds the
    language names in ranked order.
    """
    # (heading, ranking key in top_list, metric key in the score entry).
    # The metric key doubles as the unit label printed after the number.
    sections = (
        ("By number of stories:", "by_stories", "stories"),
        ("\nBy number of comments:", "by_comments", "comments"),
        ("\nBy points:", "by_points", "points"),
    )
    for heading, ranking_key, metric in sections:
        print(heading)
        for place, lang in enumerate(top_list[ranking_key], start=1):
            print(f" {place:>2}. {lang} ({top_list['score'][lang][metric]} {metric})")
if __name__ == "__main__":
    # Cached API responses live in an "hn" directory under the system temp dir.
    cache_dir = join(gettempdir(), "hn")
    top_list = generate_top_list(cache_dir)
    print_top_list(top_list)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment