1#!/usr/bin/perl
2#
3# This tool will print vaguely pretty information about a pack.  It
4# expects the output of "git verify-pack -v" as input on stdin.
5#
6# $ git verify-pack -v | packinfo.pl
7#
8# This prints some full-pack statistics; currently "all sizes", "all
9# path sizes", "tree sizes", "tree path sizes", and "depths".
10#
11# * "all sizes" stats are across every object size in the file;
12#   full sizes for base objects, and delta size for deltas.
# * "all path sizes" stats are across all objects' "path sizes".
#   A path size is the sum of the sizes in the delta chain, including the
#   base object.  In other words, it's how many bytes need to be read to
#   reassemble the file from deltas.
17# * "tree sizes" are object sizes grouped into delta trees.
18# * "tree path sizes" are path sizes grouped into delta trees.
19# * "depths" should be obvious.
20#
21# When run as:
22#
23# $ git verify-pack -v | packinfo.pl -tree
24#
25# the trees of objects are output along with the stats.  This looks
26# like:
27#
28#   0 commit 031321c6...      803      803
29#
30#   0   blob 03156f21...     1767     1767
31#   1    blob f52a9d7f...       10     1777
32#   2     blob a8cc5739...       51     1828
33#   3      blob 660e90b1...       15     1843
34#   4       blob 0cb8e3bb...       33     1876
35#   2     blob e48607f0...      311     2088
36#      size: count 6 total 2187 min 10 max 1767 mean 364.50 median 51 std_dev 635.85
37# path size: count 6 total 11179 min 1767 max 2088 mean 1863.17 median 1843 std_dev 107.26
38#
39# The first number after the sha1 is the object size, the second
40# number is the path size.  The statistics are across all objects in
41# the previous delta tree.  Obviously they are omitted for trees of
42# one object.
43#
44# When run as:
45#
46# $ git verify-pack -v | packinfo.pl -tree -filenames
47#
48# it adds filenames to the tree.  Getting this information is slow:
49#
50#   0   blob 03156f21...     1767     1767 Documentation/git-lost-found.txt @ tags/v1.2.0~142
51#   1    blob f52a9d7f...       10     1777 Documentation/git-lost-found.txt @ tags/v1.5.0-rc1~74
52#   2     blob a8cc5739...       51     1828 Documentation/git-lost+found.txt @ tags/v0.99.9h^0
53#   3      blob 660e90b1...       15     1843 Documentation/git-lost+found.txt @ master~3222^2~2
54#   4       blob 0cb8e3bb...       33     1876 Documentation/git-lost+found.txt @ master~3222^2~3
55#   2     blob e48607f0...      311     2088 Documentation/git-lost-found.txt @ tags/v1.5.2-rc3~4
56#      size: count 6 total 2187 min 10 max 1767 mean 364.50 median 51 std_dev 635.85
57# path size: count 6 total 11179 min 1767 max 2088 mean 1863.17 median 1843 std_dev 107.26
58#
59# When run as:
60#
61# $ git verify-pack -v | packinfo.pl -dump
62#
63# it prints out "sha1 size pathsize depth" for each sha1 in lexical
64# order.
65#
66# 000079a2eaef17b7eae70e1f0f635557ea67b644 30 472 7
67# 00013cafe6980411aa6fdd940784917b5ff50f0a 44 1542 4
68# 000182eacf99cde27d5916aa415921924b82972c 499 499 0
69# ...
70#
71# This is handy for comparing two packs.  Adding "-filenames" will add
72# filenames, as per "-tree -filenames" above.
73
74use strict;
75use Getopt::Long;
76
# Command-line switches (all off by default):
#   -tree       print each delta tree as it is walked
#   -filenames  annotate objects with "path @ commit-name" (slow)
#   -dump       print one line per object instead of the summary statistics
my $filenames = 0;
my $tree = 0;
my $dump = 0;
# GetOptions reports unknown switches on STDERR and returns false; stop
# here rather than silently producing output for a mistyped switch.
GetOptions("tree" => \$tree,
           "filenames" => \$filenames,
           "dump" => \$dump)
    or die "usage: $0 [-tree] [-filenames] [-dump] < verify-pack-output\n";

# Tables filled in from the input; hashes are keyed by object name.
my %parents;    # object -> delta base (undef for non-delta objects)
my %children;   # delta base -> list of objects stored as deltas against it
my %sizes;      # object -> size field of its input line
my @roots;      # objects without a delta base: the roots of the delta trees
my %paths;      # object -> "path @ commit-name" (only filled by -filenames)
my %types;      # object -> type field of its input line
my @commits;    # every object of type "commit", in input order
my %names;      # commit -> name reported by "git name-rev --all"
my %depths;     # object -> delta depth (0 when the line has none)
my @depths;     # every object's delta depth, in input order
94
# Read "git verify-pack -v" output from stdin.  Each object line is split
# on whitespace into
#   object-name type size size-in-pack offset [depth base-object-name]
# where the last two fields are only present for deltified objects.
# Lines whose first field is not an object name are skipped.
while (<STDIN>) {
    my ($sha1, $type, $size, $space, $offset, $depth, $parent) = split(/\s+/, $_);
    # Object names are 40 hex digits (SHA-1) or 64 hex digits (SHA-256).
    next unless (defined $sha1 && $sha1 =~ /^(?:[0-9a-f]{40}|[0-9a-f]{64})$/);

    # Non-delta lines carry no depth field; treat them as depth 0.
    $depth ||= 0;
    $depths{$sha1} = $depth;
    push(@depths, $depth);

    push(@commits, $sha1) if ($type eq 'commit');
    $parents{$sha1} = $parent;
    $types{$sha1} = $type;
    $sizes{$sha1} = $size;

    if ($parent) {
        # Deltified object: hang it under its base.
        push(@{$children{$parent}}, $sha1);
    } else {
        # No base object: this is the root of a delta tree.
        push(@roots, $sha1);
    }
}
107
# With -filenames, label every object listed by "git ls-tree" with
# "path @ commit-name".  This runs git once per commit seen in the input
# (slow), so it is only done when -tree or -dump will print the labels.
if ($filenames && ($tree || $dump)) {
    # List-form pipe opens run git directly, without a shell, and each
    # open/close is checked so a missing or failing git is reported
    # instead of silently yielding no names.
    open(my $names_fh, '-|', 'git', 'name-rev', '--all')
        or die "cannot run git name-rev: $!\n";
    while (my $line = <$names_fh>) {
        if ($line =~ /^(\S+)\s+(.*)$/) {
            my ($sha1, $name) = ($1, $2);
            $names{$sha1} = $name;
        }
    }
    close($names_fh)
        or warn "git name-rev --all failed (status $?)\n";

    for my $commit (@commits) {
        # A commit that name-rev did not list is labelled by its own
        # object name rather than by an empty string.
        my $name = defined $names{$commit} ? $names{$commit} : $commit;
        print STDERR "Plumbing tree $name\n";
        open(my $tree_fh, '-|', 'git', 'ls-tree', '-t', '-r', $commit)
            or die "cannot run git ls-tree: $!\n";
        while (my $line = <$tree_fh>) {
            # Fields: mode, type, object name, path.
            if ($line =~ /^(\S+)\s+(\S+)\s+(\S+)\s+(.*)$/) {
                my ($sha1, $path) = ($3, $4);
                # An object seen in several commits keeps the last label.
                $paths{$sha1} = "$path @ $name";
            }
        }
        close($tree_fh)
            or warn "git ls-tree $commit failed (status $?)\n";
    }
}
131
# Compute summary statistics for a list of numbers.
#
# Takes the numbers as its argument list and returns the list
#   (count, total, min, max, mean, median, std_dev)
# The median is the element at index int(count / 2) of the sorted data
# (the upper of the two middle values when count is even), and std_dev
# is the population standard deviation (the squared differences are
# divided by count).
# An empty argument list returns seven zeros instead of dying with a
# division by zero.
sub stats {
    my @data = sort {$a <=> $b} @_;
    my $count = scalar @data;
    return (0, 0, 0, 0, 0, 0, 0) unless $count;

    my $total = 0;
    $total += $_ for @data;
    my $mean = $total / $count;

    my $diff_sum = 0;
    $diff_sum += ($_ - $mean)**2 for @data;

    # @data is sorted, so its ends are the minimum and the maximum.
    return ($count, $total, $data[0], $data[-1], $mean,
            $data[int($count / 2)], sqrt($diff_sum / $count));
}
150
# Print one line of summary statistics on stdout.
# The first argument is the label that starts the line; all remaining
# arguments are the numbers, which are handed to stats().
sub print_stats {
    my ($label, @values) = @_;
    my @summary = stats(@values);
    # @summary is (count, total, min, max, mean, median, std_dev), in the
    # same order as the placeholders below.
    printf("%s: count %s total %s min %s max %s mean %.2f median %s std_dev %.2f\n",
           $label, @summary);
}
157
# Accumulators appended to by dig().
my (@sizes, @path_sizes);           # emptied before each delta tree is walked
my (@all_sizes, @all_path_sizes);   # never emptied: cover every object walked
my %path_sizes;                     # object name -> path size
163
# Walk the delta tree below $sha1 in pre-order (every object before its
# children, children in the order they were recorded).  For each object
# the size and the path size are appended to the per-tree and whole-pack
# accumulators and the path size is stored in %path_sizes; with -tree one
# line per object is printed as well.
#
# Arguments: the object to start at, its depth in the tree, and the
# path size accumulated above it (the caller passes 0, 0 for a root).
# An object's path size is that accumulated value plus its own size.
sub dig {
    # Explicit stack of [object, depth, path size above it] entries.
    my @pending = ([@_]);
    while (my $entry = pop @pending) {
        my ($sha1, $depth, $path_size) = @$entry;
        my $size = $sizes{$sha1};
        $path_size += $size;

        push(@sizes, $size);
        push(@all_sizes, $size);
        push(@path_sizes, $path_size);
        push(@all_path_sizes, $path_size);
        $path_sizes{$sha1} = $path_size;

        if ($tree) {
            my $indent = " " x $depth;
            printf("%3d%s %6s %s %8d %8d %s\n",
                   $depth, $indent, $types{$sha1},
                   $sha1, $size, $path_size, $paths{$sha1});
        }

        # Push the children in reverse so that the first child is popped
        # (and therefore visited) first.
        my $kids = $children{$sha1} || [];
        push(@pending,
             map { [$_, $depth + 1, $path_size] } reverse @$kids);
    }
}
181
# Per-tree totals: one entry per delta tree, i.e. per root object.
my @tree_sizes;
my @tree_path_sizes;

foreach my $root (@roots) {
    # Start every tree with empty per-tree accumulators; dig() fills them.
    @sizes = ();
    @path_sizes = ();
    dig($root, 0, 0);

    # Element 1 of the list returned by stats() is the total.
    push(@tree_sizes, (stats(@sizes))[1]);
    push(@tree_path_sizes, (stats(@path_sizes))[1]);

    next unless $tree;

    # Per-tree statistics are only printed for trees of several objects.
    if (@sizes > 1) {
        print_stats("     size", @sizes);
        print_stats("path size", @path_sizes);
    }
    print "\n";
}
201
# Final output: with -dump, one "sha1 size pathsize depth path" line per
# object in lexical order of object name; otherwise the whole-pack
# statistics.
if (!$dump) {
    my @reports = (
        ["      all sizes", \@all_sizes],
        [" all path sizes", \@all_path_sizes],
        ["     tree sizes", \@tree_sizes],
        ["tree path sizes", \@tree_path_sizes],
        ["         depths", \@depths],
    );
    for my $report (@reports) {
        my ($label, $data) = @$report;
        print_stats($label, @$data);
    }
} else {
    for my $sha1 (sort keys %sizes) {
        print join(" ", $sha1, $sizes{$sha1}, $path_sizes{$sha1},
                   $depths{$sha1}, $paths{$sha1}) . "\n";
    }
}
213