#!/usr/bin/perl
#
# Generate a listing of all paths ever used in the repository, along with the
# disk space used by the path throughout the entire history.  Note that sizes
# for trees are cumulative; they include the sizes of all of the paths below
# them, in addition to the tree storage itself. All sizes are in bytes, and
# reflect git's delta and zlib compression.
#
# One caveat is that this is just the _current_ on-disk size. The on-disk size
# of each object may change if git repacks and chooses different delta bases,
# for example. Likewise, the mapping of paths to objects may change (e.g., a
# given blob object may appear at multiple paths, and we assign it to only one
# path slot).  So take it as a rough guide; removing particular paths
# from your history may not remove exactly that many bytes from the repository
# size.

use strict;
use warnings;

# One accumulator per report section.  Every node is a hash holding a
# cumulative "size"; path nodes additionally get a "child" hash keyed by
# path component (filled in by add_to_bucket).
my $commits = { size => 0 };
my $tags = { size => 0 };
my $tree = { size => 0 };

# rev-list prints "<oid> <path>" for every reachable object, and cat-file
# hands whatever follows the object name back as %(rest), so each line read
# here has the form "<disk-size> <type> <path>".  The path is empty for
# commits, tags and root trees.
open(my $git, '-|',
     'git rev-list --objects --all |
      git cat-file --batch-check="%(objectsize:disk) %(objecttype) %(rest)"
     ') or die "unable to start git: $!\n";
while (my $line = <$git>) {
  my ($size, $type, $name) = $line =~ /^(\d+) (.*?) (.*)/;
  unless (defined $size) {
    # Anything that is not "<size> <type> <rest>" (for example cat-file's
    # "<oid> missing" report) carries no size to account for.
    warn "skipping unexpected line from git: $line";
    next;
  }

  if ($type eq 'commit') {
    add_to_bucket($commits, '', $size);
  } elsif ($type eq 'tag') {
    add_to_bucket($tags, '', $size);
  } else {
    # Trees and blobs are both charged to the path they were reached by.
    add_to_bucket($tree, $name, $size);
  }
}

# Closing a piped handle waits for the child and fails if it exited
# non-zero.  Without this check a broken git invocation (not a repository,
# git not installed) would silently yield an all-zero report.  Note that the
# shell only reports the status of the last command in the pipeline.
unless (close($git)) {
  die($! ? "error reading from git: $!\n"
         : "git pipeline exited with wait status $?\n");
}

show_bucket($commits, 'COMMITS', 0);
show_bucket($tags, 'TAGS', 0);
show_bucket($tree, '/', 0);

# Charge $size bytes to $node and to every node on the slash-separated path
# $name below it, creating missing nodes (with size 0) on the way down.
#
#   $node - hash ref with a numeric "size" and an optional "child" hash
#   $name - path relative to $node; an empty string charges only $node
#   $size - number of bytes to add at each level
sub add_to_bucket {
  my ($node, $name, $size) = @_;

  my $cursor = $node;
  $cursor->{size} += $size;

  # Descend one path component at a time so that every ancestor directory
  # accumulates the size of everything stored beneath it.
  foreach my $component (split('/', $name)) {
    $cursor->{child}->{$component} ||= { size => 0 };
    $cursor = $cursor->{child}->{$component};
    $cursor->{size} += $size;
  }
}

# Print one line for $node (its size right-aligned in 10 columns, followed
# by $name), then recurse into its children in sorted key order.
#
#   $node   - hash ref with "size" and an optional "child" hash of sub-nodes
#   $name   - label printed for this node; children of a node labelled '/'
#             are printed under their bare key, all others as "$name/<key>"
#   $indent - nesting depth; each level adds two leading spaces
sub show_bucket {
  my ($node, $name, $indent) = @_;

  print '  ' x $indent;
  printf("%10s %s\n", $node->{size}, $name);

  # The root label is never used as a prefix, so its children do not end
  # up with a leading slash.
  my $at_root = ($name eq '/');
  foreach my $entry (sort keys %{ $node->{child} }) {
    my $label;
    if ($at_root) {
      $label = $entry;
    } else {
      $label = "$name/$entry";
    }
    show_bucket($node->{child}->{$entry}, $label, $indent + 1);
  }
}