import argparse
import os
import re
import subprocess
import tempfile

import requests


def get_repo(owner, repo):
    """Check the repo out to a temporary directory"""
    tempdir = tempfile.mkdtemp()
    url = "http://github.com/%s/%s.git" % (owner, repo)
    subprocess.check_output(['git', 'clone', url, tempdir])
    return tempdir


def get_shas(checkout):
    """List all reachable commits from a checkout"""
    os.chdir(checkout)
    commits = subprocess.check_output(['git', 'repack'])
    packdir = os.path.join(checkout, '.git', 'objects', 'pack')
    packfile = [filename for filename in os.listdir(packdir) if '.idx' in filename]
    packfile = os.path.join(packdir, packfile[0])
    shas = subprocess.check_output(['git', 'verify-pack', '-v', packfile])
    shas = {sha[:4] for sha in shas.split("\n") if 'object' not in sha and sha}
    return shas


def potential_hashes(checkout, shas=None):
    """Get all 4 digit hashes that aren't in our known list"""
    if shas is None:
        shas = get_shas(checkout)
    all_possibles = {'%04x' % possible for possible in xrange(16**4)}
    candidates = all_possibles - shas
    return candidates


def get_parents_for_commit(owner, repo, commit):
    """Get the parents from the API. We can't use git as it won't show us hidden
    commits, but we have the full hash so we can use the API and avoid hitting
    the frontend"""
    url = "https://api.github.com/repos/%s/%s/commits/%s" % (owner, repo, commit)
    response = requests.get(url).json()
    for parent in response['parents']:
        yield parent['sha']


def get_all_ancestors(owner, repo, commit, break_on=None):
    """Given a hash, iterate over all of its ancestors, with an optional
    set of commits we have seen already to ignore."""
    if break_on is None:
        break_on = set()
    # Recurse over the parents, so we can follow both sides of a merge
    parents = get_parents_for_commit(owner, repo, commit)
    for parent in parents:
        # We've seen this commit before, stop processing
        if parent[:4] not in break_on:
            yield parent
            for ancestor in get_all_ancestors(owner, repo, parent, break_on):
                yield ancestor


def check_candidate(owner, repo, candidate, known_shas):
    """Given repo information and a 4 digit prefix, try and find that commit,
    then iterate over its ancestors if it exists. Otherwise return an empty
    iterator"""
    url = 'https://github.com/%s/%s/commit/%s' % (owner, repo, candidate)
    response = requests.get(url)
    if response.status_code == 200:
        # This is a real commit, find the full hash
        full_hash = re.compile('/commit/(%s.*?)\"' % candidate).findall(response.text)[0]
        # Make a note of the prefix, so we don't re-scan this tree
        known_shas.add(full_hash[:4])
        yield full_hash
        # Yield from all ancestors of this commit, until we hit a known commit
        for ancestor in get_all_ancestors(owner, repo, full_hash, known_shas):
            known_shas.add(ancestor[:4])
            yield ancestor
    else:
        return


def main():
    parser = argparse.ArgumentParser(description='Find unreachable commits')
    parser.add_argument('owner', type=str,
                        help='the repository owner')
    parser.add_argument('repo', type=str,
                        help='the repository name')
    args = parser.parse_args()

    # Get the git repo and find the candidates we will check for
    repository = get_repo(args.owner, args.repo)
    print "Extracting known identifiers"
    known_shas = get_shas(repository)
    potential_commits = potential_hashes(repository, shas=known_shas)
    print "There are %d potential commits to check for" % (len(potential_commits))

    # Keep track of the last completion percentage, so we can report to the user during big scans
    percentage_complete = 0
    for i, candidate in enumerate(potential_commits):
        current_percentage = round(((i+1.0) / len(potential_commits)) * 100)
        # Loop over ancestors for a 4 digit hash. If the hash doesn't exist, no ancestors are returned
        for parent in check_candidate(args.owner, args.repo, candidate, known_shas):
            print "Found hidden commit https://github.com/%s/%s/commit/%s" % (args.owner, args.repo, parent)
        if current_percentage  > percentage_complete:
            print "%d%% complete" % (int(current_percentage))
            percentage_complete  = current_percentage

if __name__ == '__main__':
    main()