Created
May 16, 2026 14:07
-
-
Save ggorlen/88aa800318495d3536e777207dc8353f to your computer and use it in GitHub Desktop.
better git stats
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| # Git stats are misleading because some people squash merge and some people | |
| # create merge commits, bringing their history of commits from the branch onto | |
| # master and making it look like they did way more work relative to squashers. | |
| # | |
| # This tries to treat merge commits as if they were squash merged so repo stats | |
| # are actually normalized/comparable. It also merges by identical name and email. | |
| # | |
| # I'm sure this isn't 100% accurate, particularly for LOC, but anecdotally seems | |
| # to give reasonable results. | |
| # | |
| # See also: | |
| # - https://chatgpt.com/c/6a086d4c-da14-83ea-b0e8-5a887e199692 | |
| # - https://stackoverflow.com/questions/49002013 | |
| import subprocess | |
| import sys | |
| import json | |
| from collections import defaultdict | |
def git(*args):
    """Run ``git`` with the given arguments and return its stdout as text.

    Args:
        *args: Command-line arguments placed after the ``git`` executable.

    Returns:
        The captured standard output, decoded as UTF-8.

    Raises:
        subprocess.CalledProcessError: If git exits with a non-zero status.
    """
    command = ["git"]
    command.extend(args)
    return subprocess.check_output(command, text=True, encoding="utf-8")
def find(parent, x):
    """Return the representative (root) of ``x`` in a union-find forest.

    Args:
        parent: Dict mapping each element to its parent; a root maps to
            itself. Mutated in place.
        x: Element to look up. It is registered as its own root if it is
            not yet present in ``parent``.

    Returns:
        The root element of the set containing ``x``.
    """
    parent.setdefault(x, x)
    node = x
    while True:
        above = parent[node]
        if above == node:
            return node
        # Shorten the path as we climb: point the current node at its
        # grandparent, then continue from there.
        grandparent = parent[above]
        parent[node] = grandparent
        node = grandparent
def union(parent, a, b):
    """Merge the sets containing ``a`` and ``b`` in a union-find forest.

    Args:
        parent: Dict-based union-find structure; mutated in place.
        a: Element whose root stays the root of the merged set.
        b: Element whose root is attached underneath ``a``'s root.
    """
    root_a, root_b = find(parent, a), find(parent, b)
    if root_a == root_b:
        # Already in the same set; nothing to link.
        return
    parent[root_b] = root_a
def parse_commits():
    """Read the first-parent history and link author names with emails.

    Runs ``git log --first-parent`` and, for every commit, unions the
    author's name and email in a union-find structure so that identities
    sharing either one end up in the same group.

    Returns:
        A ``(commits, parent)`` tuple where ``commits`` is a list of
        ``(commit_hash, name, email, parents)`` tuples (``parents`` is a list
        of parent hashes, empty for a root commit) and ``parent`` is the
        union-find dict keyed by ``"name:<name>"`` / ``"email:<email>"``.

    Raises:
        subprocess.CalledProcessError: If the ``git log`` call fails.
    """
    # Fields are delimited with the ASCII unit separator (git's %x1f) rather
    # than "|": a pipe is a legal character in an author name or email and
    # would make the four-way unpacking below raise ValueError.
    field_sep = "\x1f"
    commits = []
    parent = {}
    log = git(
        "log",
        "--first-parent",
        "--format=%H%x1f%aN%x1f%aE%x1f%P",
    )
    # Split on "\n" only: str.splitlines() also breaks on other boundary
    # characters (e.g. U+2028) that could appear inside an author name.
    for line in log.split("\n"):
        if not line:
            continue
        commit_hash, name, email, parents = line.split(field_sep)
        parents = parents.split()
        commits.append((commit_hash, name, email, parents))
        union(parent, f"name:{name}", f"email:{email}")
    return commits, parent
def parse_loc():
    """Collect added/deleted line totals per commit on the first-parent chain.

    Parses ``git log --first-parent --numstat`` output in which each commit is
    introduced by a ``COMMIT:<hash>`` line followed by tab-separated
    ``added<TAB>deleted<TAB>path`` rows.

    Returns:
        Dict mapping commit hash to an ``(added, deleted)`` tuple of ints.

    Raises:
        subprocess.CalledProcessError: If the ``git log`` call fails.
    """
    marker = "COMMIT:"
    raw = git(
        "log",
        "--first-parent",
        "--format=COMMIT:%H",
        "--numstat",
    )

    commit_loc = {}
    sha = None
    plus = minus = 0
    for row in raw.splitlines():
        if row.startswith(marker):
            # A new header closes out the totals of the previous commit.
            if sha is not None:
                commit_loc[sha] = (plus, minus)
            sha, plus, minus = row[len(marker):], 0, 0
        elif row.strip():
            fields = row.split("\t")
            if len(fields) >= 3:
                ins, dels, _ = fields
                # A "-" in a count column carries no number, so it adds 0.
                plus += 0 if ins == "-" else int(ins)
                minus += 0 if dels == "-" else int(dels)

    # Flush the last commit, which has no following header line.
    if sha is not None:
        commit_loc[sha] = (plus, minus)
    return commit_loc
def aggregate(commits, uf_parent, commit_loc):
    """Tally commits and line counts per author group.

    Every commit that is not hidden counts once for its author group, with the
    line totals recorded for it in ``commit_loc``. For a merge commit, the
    commits reachable from the merged-in parent(s) but not from the first
    parent are marked hidden so that they are not counted again if they show
    up later in ``commits`` (i.e. the merge is treated like a squash).

    Args:
        commits: Iterable of ``(commit_hash, name, email, parents)`` tuples.
        uf_parent: Union-find dict linking ``"name:<name>"`` and
            ``"email:<email>"`` keys; may be mutated by ``find``.
        commit_loc: Dict mapping commit hash to ``(added, deleted)``; commits
            missing from it contribute ``(0, 0)``.

    Returns:
        A tuple ``(counts, loc_added, loc_deleted, names, emails)`` of
        defaultdicts keyed by group: commit count, lines added, lines
        deleted, ``{name: occurrences}`` and the set of emails.

    Raises:
        subprocess.CalledProcessError: If a ``git rev-list`` call fails.
    """
    counts = defaultdict(int)
    loc_added = defaultdict(int)
    loc_deleted = defaultdict(int)
    names = defaultdict(lambda: defaultdict(int))
    emails = defaultdict(set)
    hidden = set()
    for commit_hash, name, email, parents in commits:
        group = find(uf_parent, f"name:{name}")
        # Names and emails are recorded even for hidden commits so the group
        # still knows every identity that belongs to it.
        names[group][name] += 1
        emails[group].add(email)
        if commit_hash in hidden:
            continue
        added, deleted = commit_loc.get(commit_hash, (0, 0))
        counts[group] += 1
        loc_added[group] += added
        loc_deleted[group] += deleted
        if len(parents) >= 2:
            # Handles octopus merges too: hide everything reachable from any
            # merged-in parent that is not already reachable from the first
            # parent.
            main_parent, *merged_parents = parents
            merged = git(
                "rev-list",
                *merged_parents,
                f"^{main_parent}",
            ).splitlines()
            hidden.update(merged)
    return counts, loc_added, loc_deleted, names, emails
def format_results(counts, loc_added, loc_deleted, names, emails):
    """Turn the per-group tallies into a list of report rows.

    Args:
        counts: Mapping of group to commit count; also determines which
            groups are reported.
        loc_added: Mapping of group to total lines added.
        loc_deleted: Mapping of group to total lines deleted.
        names: Mapping of group to ``{name: occurrences}``.
        emails: Mapping of group to an iterable of email addresses.

    Returns:
        A list of dicts with keys ``name``, ``emails``, ``commits``,
        ``loc_added`` and ``loc_deleted``, ordered by commit count descending.
        ``name`` is the most frequently used name in the group and ``emails``
        is sorted.
    """

    def by_count(pair):
        return pair[1]

    ranked = sorted(counts.items(), key=by_count, reverse=True)
    return [
        {
            # The most common spelling of the author's name represents the
            # whole group.
            "name": max(names[group].items(), key=by_count)[0],
            "emails": sorted(emails[group]),
            "commits": total,
            "loc_added": loc_added[group],
            "loc_deleted": loc_deleted[group],
        }
        for group, total in ranked
    ]
def main():
    """Compute the normalized author statistics and print them.

    Prints JSON when ``--json`` is present in ``sys.argv``; otherwise prints
    one aligned text line per author group with commit count, lines added,
    lines deleted, name and emails.
    """
    want_json = "--json" in sys.argv

    commits, uf_parent = parse_commits()
    tallies = aggregate(commits, uf_parent, parse_loc())
    results = format_results(*tallies)

    if want_json:
        print(json.dumps(results, indent=2))
        return

    for row in results:
        print(
            f"{row['commits']:5} "
            f"+{row['loc_added']:8,d} "
            f"-{row['loc_deleted']:8,d} "
            f"{row['name']} <{', '.join(row['emails'])}>"
        )


# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment