import argparse import os import re import subprocess import tempfile import requests def get_repo(owner, repo): """Check the repo out to a temporary directory""" tempdir = tempfile.mkdtemp() url = "http://github.com/%s/%s.git" % (owner, repo) subprocess.check_output(['git', 'clone', url, tempdir]) return tempdir def get_shas(checkout): """List all reachable commits from a checkout""" os.chdir(checkout) commits = subprocess.check_output(['git', 'repack']) packdir = os.path.join(checkout, '.git', 'objects', 'pack') packfile = [filename for filename in os.listdir(packdir) if '.idx' in filename] packfile = os.path.join(packdir, packfile[0]) shas = subprocess.check_output(['git', 'verify-pack', '-v', packfile]) shas = {sha[:4] for sha in shas.split("\n") if 'object' not in sha and sha} return shas def potential_hashes(checkout, shas=None): """Get all 4 digit hashes that aren't in our known list""" if shas is None: shas = get_shas(checkout) all_possibles = {'%04x' % possible for possible in xrange(16**4)} candidates = all_possibles - shas return candidates def get_parents_for_commit(owner, repo, commit): """Get the parents from the API. We can't use git as it won't show us hidden commits, but we have the full hash so we can use the API and avoid hitting the frontend""" url = "https://api.github.com/repos/%s/%s/commits/%s" % (owner, repo, commit) response = requests.get(url).json() for parent in response['parents']: yield parent['sha'] def get_all_ancestors(owner, repo, commit, break_on=None): """Given a hash, iterate over all of its ancestors, with an optional set of commits we have seen already to ignore.""" if break_on is None: break_on = set() # Recurse over the parents, so we can follow both sides of a merge parents = get_parents_for_commit(owner, repo, commit) for parent in parents: # We've seen this commit before, stop processing if parent[:4] not in break_on: yield parent for ancestor in get_all_ancestors(owner, repo, parent, break_on): yield ancestor def check_candidate(owner, repo, candidate, known_shas): """Given repo information and a 4 digit prefix, try and find that commit, then iterate over its ancestors if it exists. Otherwise return an empty iterator""" url = 'https://github.com/%s/%s/commit/%s' % (owner, repo, candidate) response = requests.get(url) if response.status_code == 200: # This is a real commit, find the full hash full_hash = re.compile('/commit/(%s.*?)\"' % candidate).findall(response.text)[0] # Make a note of the prefix, so we don't re-scan this tree known_shas.add(full_hash[:4]) yield full_hash # Yield from all ancestors of this commit, until we hit a known commit for ancestor in get_all_ancestors(owner, repo, full_hash, known_shas): known_shas.add(ancestor[:4]) yield ancestor else: return def main(): parser = argparse.ArgumentParser(description='Find unreachable commits') parser.add_argument('owner', type=str, help='the repository owner') parser.add_argument('repo', type=str, help='the repository name') args = parser.parse_args() # Get the git repo and find the candidates we will check for repository = get_repo(args.owner, args.repo) print "Extracting known identifiers" known_shas = get_shas(repository) potential_commits = potential_hashes(repository, shas=known_shas) print "There are %d potential commits to check for" % (len(potential_commits)) # Keep track of the last completion percentage, so we can report to the user during big scans percentage_complete = 0 for i, candidate in enumerate(potential_commits): current_percentage = round(((i+1.0) / len(potential_commits)) * 100) # Loop over ancestors for a 4 digit hash. If the hash doesn't exist, no ancestors are returned for parent in check_candidate(args.owner, args.repo, candidate, known_shas): print "Found hidden commit https://github.com/%s/%s/commit/%s" % (args.owner, args.repo, parent) if current_percentage > percentage_complete: print "%d%% complete" % (int(current_percentage)) percentage_complete = current_percentage if __name__ == '__main__': main()