-
-
Save afternoon/1433794 to your computer and use it in GitHub Desktop.
| #!/usr/bin/python | |
| # | |
| # git-slim | |
| # | |
| # Remove big files from git repo history. | |
| # | |
| # Requires GitPython (https://github.com/gitpython-developers/GitPython) | |
| # | |
| # References: | |
| # - http://help.github.com/remove-sensitive-data/ | |
| # - http://stackoverflow.com/questions/4444091/git-filter-branch-to-delete-large-file | |
| # - http://stackoverflow.com/questions/1029969/why-is-my-git-repository-so-big/1036595#1036595 | |
| # - http://stackoverflow.com/questions/460331/git-finding-a-filename-from-a-sha1 | |
| from glob import glob | |
| from git import Repo | |
| from os.path import getsize | |
| from re import split | |
| from shutil import rmtree | |
| from sys import argv, exit, stdout | |
def print_activity(start, end='done'):
    '''Decorator factory which logs info like "Doing something: done".

    start -- label written to stdout, followed by ': ', before the wrapped
             function runs.
    end   -- text printed after the wrapped function returns
             (default 'done').

    Returns a decorator. The decorated function forwards all positional and
    keyword arguments and returns the wrapped function's result unchanged.
    If the wrapped function raises, the end text is not printed.
    '''
    # Local import so this block does not need a new file-level import.
    from functools import wraps

    def decorate(f):
        @wraps(f)  # keep the wrapped function's __name__ and __doc__
        def wrapped(*args, **kwargs):
            stdout.write('%s: ' % start)
            # Flush so the label is visible while a slow call is running.
            stdout.flush()
            result = f(*args, **kwargs)
            # Single-argument parenthesised print works on Python 2 and 3.
            print(end)
            return result
        return wrapped
    return decorate
def slim_main():
    '''Command-line entry point: slim the repo named by the first argument.

    Falls back to the current working directory when no argument is given.
    A keyboard interrupt during the run ends the program with exit status 0.
    '''
    if len(argv) > 1:
        repo_dir = argv[1]
    else:
        repo_dir = '.'
    try:
        slim(repo_dir)
    except KeyboardInterrupt:
        exit(0)
def slim(repo_dir):
    '''Run the whole slimming workflow against the repo at repo_dir.

    Steps, in order: open the repo, prep it, measure it, interactively
    remove blobs, tidy up, measure again and report the difference.
    '''
    repo = Repo(repo_dir)
    prep(repo)
    size_before = repo_size(repo)
    slim_blobs(repo)
    tidy_up(repo)
    size_after = repo_size(repo)
    ok_done(size_before, size_after)
def repo_size(r):
    '''Return the total size in bytes of every file under the repo's git dir.

    r -- object exposing git_dir, the path of the repository's git directory.

    os.path.getsize() on a directory only reports the size of the directory
    entry itself, so the tree is walked and the individual files are summed.
    '''
    # Local import: only os.path.getsize is imported at file level.
    import os

    total = 0
    for dirpath, _dirnames, filenames in os.walk(r.git_dir):
        for name in filenames:
            # lstat counts a symlink as itself, so a dangling link cannot
            # raise and linked files are not counted twice.
            total += os.lstat(os.path.join(dirpath, name)).st_size
    return total
def prep(r):
    '''Prep a repo by running GC and repacking.

    r -- the repo object; must expose is_dirty as a method or as a plain
         boolean attribute.

    Raises Exception('repo is dirty') when the repo reports itself dirty;
    in that case nothing is garbage collected or repacked.
    '''
    # Some GitPython releases expose is_dirty as a property rather than a
    # method; calling it there fails with "'bool' object is not callable".
    dirty = r.is_dirty
    if callable(dirty):
        dirty = dirty()
    if dirty:
        raise Exception('repo is dirty')
    gc(r)
    repack(r)
def slim_blobs(r):
    '''Reduce repo size by listing blobs in size order and asking the user if
    they would like to remove them.

    For each pack blob (largest first) whose path has not been offered yet,
    the user is prompted:
      'y' or empty answer -- mark the path for removal
      'd'                 -- done; stop asking
      anything else       -- keep the path
    All marked paths are then handed to remove_files() in one call.
    '''
    # raw_input only exists on Python 2; input is its Python 3 equivalent.
    try:
        ask = raw_input
    except NameError:
        ask = input

    pack_blobs = list_pack_blobs_by_size(r)
    index = blob_index(r)
    seen = set()  # paths already offered; set gives O(1) membership tests
    targets = []
    for sha, size in pack_blobs:
        if sha not in index:
            print('%s not in blob index' % sha)
            continue
        blob_path, commit_hexsha = index[sha]
        if blob_path in seen:
            # Another version of the same path was already offered.
            continue
        prompt = 'Remove %s (%s at %s)? [Y/n/d] ' % \
            (blob_path, format_size(size), commit_hexsha[:7])
        answer = ask(prompt).strip().lower()
        if answer == 'd':
            break
        if answer in ('y', ''):
            targets.append(blob_path)
        seen.add(blob_path)
    remove_files(r, targets)
def blob_index(r):
    '''Build index of paths of blobs in the repo. Iterates across all files in
    all commits returned by r.iter_commits() and records the blob used.

    Returns a dict mapping blob hexsha -> (blob path, str(commit)). When a
    blob appears in several commits, the entry from the commit iterated last
    wins. Progress is written to stdout on a single, rewritten line.
    '''
    desc = 'Indexing blobs in commits: '
    index = {}
    # Materialise the commits so the total is known for the progress counter.
    commits = list(r.iter_commits())
    commits_len = len(commits)
    blob_predicate = lambda i, d: i.type == 'blob'
    for i, commit in enumerate(commits, 1):
        # '\r' returns to the line start so the counter updates in place.
        stdout.write('\r%s(%s/%s)' % (desc, i, commits_len))
        stdout.flush()
        for blob in commit.tree.traverse(predicate=blob_predicate):
            index[blob.hexsha] = blob.path, str(commit)
    # Single-argument parenthesised print works on Python 2 and 3.
    print('\r%sdone ' % desc)
    return index
@print_activity('Listing pack blobs')
def list_pack_blobs_by_size(r):
    '''Return the repo's pack blobs as a list ordered largest first.

    Entries come from list_pack_blobs(); the second element of each entry is
    the sort key. Entries with equal keys keep their original relative order.
    '''
    ranked = list(list_pack_blobs(r))
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    return ranked
def list_pack_blobs(r):
    '''Call git verify-pack to dump info about blobs in the repo's packs.

    r -- repo object exposing git_dir and a git command wrapper.

    Returns an iterable of the entries produced by extract_blob_info(). When
    the repo has no pack index files an empty list is returned, because
    running verify-pack without any file argument fails.
    '''
    pack_index_glob = r.git_dir + '/objects/pack/pack-*.idx'
    # Sorted so the command line (and therefore the output order) is stable.
    pack_index_files = sorted(glob(pack_index_glob))
    if not pack_index_files:
        return []
    pack_info = r.git.verify_pack(*pack_index_files, verbose=True)
    return extract_blob_info(pack_info)
def extract_blob_info(pack_info):
    '''Extract info about blobs in a pack from text returned by git
    verify-pack.

    pack_info -- the verbose verify-pack output as one string.

    Yields a (first field, int(fourth field)) tuple for every line whose
    second whitespace-separated field is 'blob'. Lines with fewer than four
    fields are skipped instead of raising IndexError.
    '''
    for line in pack_info.split('\n'):
        bits = line.split()
        # The fourth field is read below, so require at least four fields.
        if len(bits) > 3 and bits[1] == 'blob':
            yield bits[0], int(bits[3])
def format_size(num):
    '''Format numbers as file sizes. From hurry.filesize.

    num -- a size in bytes; may be negative (for example a size delta when
           something grew).

    Returns a string such as '512 bytes', '3KB' or '-2MB', scaled by powers
    of 1024 and rounded to a whole number. Values of 1024 TB and above are
    reported in PB instead of falling through and returning None.
    '''
    # Scale on the magnitude so negative values pick the right unit too.
    sign = '-' if num < 0 else ''
    num = abs(num)
    for unit in (' bytes', 'KB', 'MB', 'GB', 'TB'):
        if num < 1024.0:
            return '%s%.0f%s' % (sign, num, unit)
        num /= 1024.0
    return '%s%.0f%s' % (sign, num, 'PB')
@print_activity('Removing files from repo history')
def remove_files(r, fs):
    '''Run git rm for each file in list against each commit using git
    filter-branch. Completely removes files from the history of HEAD.

    r  -- repo object exposing a git command wrapper.
    fs -- list of repo-relative paths to remove; nothing is run when empty.

    Only HEAD is rewritten, matching the original behaviour.
    '''
    if not fs:
        return
    # The index filter string is evaluated by a shell, so each path must be
    # quoted or names with spaces/metacharacters break or inject commands.
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote
    # todo: check file list doesn't exceed max command length
    filelist = ' '.join(quote(f) for f in fs)
    # '--' stops a path that starts with '-' being parsed as an option.
    r.git.filter_branch('--index-filter',
                        'git rm --cached --ignore-unmatch -- %s' % filelist,
                        '--prune-empty',
                        'HEAD')
def tidy_up(r):
    '''Tidy up after slimming so space held by removed objects can be
    recovered.

    Runs, in order: removal of the original refs, reflog expiry, garbage
    collection and a repack.
    '''
    for step in (rm_original_refs, expire_reflog, gc, repack):
        step(r)
@print_activity('Removing original refs')
def rm_original_refs(r):
    '''Delete the refs/original/ directory inside the repo's git dir.

    Errors (including the directory not existing) are ignored.
    '''
    original_refs_dir = r.git_dir + '/refs/original/'
    rmtree(original_refs_dir, ignore_errors=True)
@print_activity('Expiring reflog')
def expire_reflog(r):
    '''Run git reflog expire --expire=now --all on the repo.'''
    reflog_args = ('expire', '--expire=now', '--all')
    r.git.reflog(*reflog_args)
@print_activity('Garbage collecting')
def gc(r):
    '''Run git gc on the repo with the prune option enabled.'''
    git_cmd = r.git
    git_cmd.gc(prune=True)
@print_activity('Repacking')
def repack(r):
    '''Run git repack on the repo with the a, d and q options enabled.'''
    git_cmd = r.git
    git_cmd.repack(a=True, d=True, q=True)
def ok_done(old_size, new_size):
    '''Print a summary of the space saved and the follow-up steps.

    old_size -- repo size in bytes before slimming.
    new_size -- repo size in bytes after slimming.
    '''
    delta = format_size(old_size - new_size)
    old_f = format_size(old_size)
    new_f = format_size(new_size)
    # Single-argument parenthesised print works on Python 2 and 3.
    print('\nRepo slimmed by %s (reduced from %s to %s)'
          % (delta, old_f, new_f))
    # The git flag is spelled --aggressive; the old hint had a typo.
    print('(Running \'git gc --aggressive --prune\' may reclaim further '
          'space)\n')
    print('Next run \'git push origin --all --force\'')
    print('Then re-clone all copies of the repo')
    print('Warning: If an old clone is used, big objects may reappear')
# Run the slimmer only when executed as a script, not when imported.
if __name__ == '__main__':
    slim_main()
Thanks for the little script. It made this time-consuming process somewhat easier. Thanks again.
Dustin
This exploded for me:
Garbage collecting: done
Repacking: done
Listing pack blobs: done
Traceback (most recent call last):
File "/Users/benson/bin/git-slim.py", line 208, in <module>
slim_main()
File "/Users/benson/bin/git-slim.py", line 40, in slim_main
slim(repo_dir)
File "/Users/benson/bin/git-slim.py", line 49, in slim
slim_blobs(r)
File "/Users/benson/bin/git-slim.py", line 73, in slim_blobs
index = blob_index(r)
File "/Users/benson/bin/git-slim.py", line 102, in blob_index
commits = list(r.iter_commits())
File "/Library/Python/2.7/site-packages/git/repo/base.py", line 423, in iter_commits
rev = self.head.commit
File "/Library/Python/2.7/site-packages/git/refs/symbolic.py", line 168, in _get_commit
obj = self._get_object()
File "/Library/Python/2.7/site-packages/git/refs/symbolic.py", line 161, in _get_object
return Object.new_from_sha(self.repo, hex_to_bin(self.dereference_recursive(self.repo, self.path)))
File "/Library/Python/2.7/site-packages/git/objects/base.py", line 64, in new_from_sha
oinfo = repo.odb.info(sha1)
File "/Library/Python/2.7/site-packages/gitdb/db/base.py", line 256, in info
return self._db_query(sha).info(sha)
File "/Library/Python/2.7/site-packages/gitdb/db/loose.py", line 162, in info
m = self._map_loose_object(sha)
File "/Library/Python/2.7/site-packages/gitdb/db/loose.py", line 146, in _map_loose_object
raise BadObject(sha)
gitdb.exc.BadObject: BadObject: 59a968e4b7bf20039a9314c383a7bb5aa955b53c
Quick look at the code: Won’t it only run git filter-branch on the current branch instead of the whole repo?
And since this is such a potentially destructive script, you might want to add a notice about that.
@bimargulies, this script failed for me as well, with the identical error. For the time being this script probably shouldn't be used, especially since as @Chronial points out it's potentially destructive.
python git-slim.py
Garbage collecting: done
Repacking: done
Listing pack blobs: done
Traceback (most recent call last):
File "git-slim.py", line 208, in <module>
slim_main()
File "git-slim.py", line 40, in slim_main
slim(repo_dir)
File "git-slim.py", line 49, in slim
slim_blobs(r)
File "git-slim.py", line 73, in slim_blobs
index = blob_index(r)
File "git-slim.py", line 102, in blob_index
commits = list(r.iter_commits())
File "/Library/Python/2.7/site-packages/GitPython-0.3.2.RC1-py2.7.egg/git/repo/base.py", line 424, in iter_commits
rev = self.head.commit
File "/Library/Python/2.7/site-packages/GitPython-0.3.2.RC1-py2.7.egg/git/refs/symbolic.py", line 168, in _get_commit
obj = self._get_object()
File "/Library/Python/2.7/site-packages/GitPython-0.3.2.RC1-py2.7.egg/git/refs/symbolic.py", line 161, in _get_object
return Object.new_from_sha(self.repo, hex_to_bin(self.dereference_recursive(self.repo, self.path)))
File "/Library/Python/2.7/site-packages/GitPython-0.3.2.RC1-py2.7.egg/git/objects/base.py", line 64, in new_from_sha
oinfo = repo.odb.info(sha1)
File "/Library/Python/2.7/site-packages/gitdb-0.5.4-py2.7-macosx-10.9-intel.egg/gitdb/db/base.py", line 256, in info
return self._db_query(sha).info(sha)
File "/Library/Python/2.7/site-packages/gitdb-0.5.4-py2.7-macosx-10.9-intel.egg/gitdb/db/loose.py", line 162, in info
m = self._map_loose_object(sha)
File "/Library/Python/2.7/site-packages/gitdb-0.5.4-py2.7-macosx-10.9-intel.egg/gitdb/db/loose.py", line 146, in _map_loose_object
raise BadObject(sha)
gitdb.exc.BadObject: BadObject: 45bb1fbbb91af857c8566fd30fe59d6dfee0d63d
Failed on OS X 10.10
python git-slim.py
Traceback (most recent call last):
File "git-slim.py", line 208, in <module>
slim_main()
File "git-slim.py", line 40, in slim_main
slim(repo_dir)
File "git-slim.py", line 47, in slim
prep(r)
File "git-slim.py", line 61, in prep
if r.is_dirty():
TypeError: 'bool' object is not callable
I get this error:
File "git-slim.py", line 16, in <module>
from git import Repo
ImportError: No module named git
This means the library is not found. Where can I get the library for this?
@greenspray, you can google how to install gitpython.
I got the following error:
File "git-slim.py", line 30
print end
^
SyntaxError: Missing parentheses in call to 'print'. Did you mean print(end)?
I got the following error:
File "git-slim.py", line 30
print end
^
SyntaxError: Missing parentheses in call to 'print'. Did you mean print(end)?
I got the same error, I'm using Python 3.6.3
Worked perfectly
If your system has both python 3 and python 2 then this script is python2 friendly so ... python2 ./git-slim.py
This works amazingly awesome! I had to run git gc --aggressive --prune to get it to take effect, but after that it was magic!
Mega thanks!