#!/usr/local/bin/perl
#
# This tool will print vaguely pretty information about a pack.  It
# expects the output of "git verify-pack -v" as input on stdin.
#
# $ git verify-pack -v | packinfo.pl
#
# This prints some full-pack statistics; currently "all sizes", "all
# path sizes", "tree sizes", "tree path sizes", and "depths".
#
# * "all sizes" stats are across every object size in the file;
#   full sizes for base objects, and delta size for deltas.
# * "all path sizes" stats are across all objects' "path sizes".
#   A path size is the sum of the size of the delta chain, including the
#   base object.  In other words, it's how many bytes need to be read to
#   reassemble the file from deltas.
# * "tree sizes" are object sizes grouped into delta trees.
# * "tree path sizes" are path sizes grouped into delta trees.
# * "depths" should be obvious.
#
# When run as:
#
# $ git verify-pack -v | packinfo.pl -tree
#
# the trees of objects are output along with the stats.  This looks
# like:
#
#   0 commit 031321c6...      803      803
#
#   0   blob 03156f21...     1767     1767
#   1    blob f52a9d7f...       10     1777
#   2     blob a8cc5739...       51     1828
#   3      blob 660e90b1...       15     1843
#   4       blob 0cb8e3bb...       33     1876
#   2     blob e48607f0...      311     2088
#  size: count 6 total 2187 min 10 max 1767 mean 364.50 median 51 std_dev 635.85
# path size: count 6 total 11179 min 1767 max 2088 mean 1863.17 median 1843 std_dev 107.26
#
# The first number after the sha1 is the object size, the second
# number is the path size.  The statistics are across all objects in
# the previous delta tree.  Obviously they are omitted for trees of
# one object.
#
# When run as:
#
# $ git verify-pack -v | packinfo.pl -tree -filenames
#
# it adds filenames to the tree.  Getting this information is slow:
#
#   0   blob 03156f21...     1767     1767 Documentation/git-lost-found.txt @ tags/v1.2.0~142
#   1    blob f52a9d7f...       10     1777 Documentation/git-lost-found.txt @ tags/v1.5.0-rc1~74
#   2     blob a8cc5739...       51     1828 Documentation/git-lost+found.txt @ tags/v0.99.9h^0
#   3      blob 660e90b1...       15     1843 Documentation/git-lost+found.txt @ master~3222^2~2
#   4       blob 0cb8e3bb...       33     1876 Documentation/git-lost+found.txt @ master~3222^2~3
#   2     blob e48607f0...      311     2088 Documentation/git-lost-found.txt @ tags/v1.5.2-rc3~4
#  size: count 6 total 2187 min 10 max 1767 mean 364.50 median 51 std_dev 635.85
# path size: count 6 total 11179 min 1767 max 2088 mean 1863.17 median 1843 std_dev 107.26
#
# When run as:
#
# $ git verify-pack -v | packinfo.pl -dump
#
# it prints out "sha1 size pathsize depth" for each sha1 in lexical
# order.
#
# 000079a2eaef17b7eae70e1f0f635557ea67b644 30 472 7
# 00013cafe6980411aa6fdd940784917b5ff50f0a 44 1542 4
# 000182eacf99cde27d5916aa415921924b82972c 499 499 0
# ...
#
# This is handy for comparing two packs.  Adding "-filenames" will add
# filenames, as per "-tree -filenames" above.
# --- Strictures and command-line options -------------------------------

use strict;
use warnings;
use Getopt::Long;

my $filenames = 0;    # -filenames: annotate objects with "path @ commit-name"
my $tree = 0;         # -tree: print every delta tree plus per-tree stats
my $dump = 0;         # -dump: print one "sha1 size pathsize depth" line per object
GetOptions("tree" => \$tree,
           "filenames" => \$filenames,
           "dump" => \$dump);

# --- Data gathered from the "git verify-pack -v" listing ---------------

my %parents;     # object id -> delta base id (undef for non-delta objects)
my %children;    # delta base id -> [ids of objects stored as deltas against it]
my %sizes;       # object id -> size column of the listing
my @roots;       # ids of non-delta objects, in input order
my %paths;       # object id -> "path @ commit-name" (only with -filenames)
my %types;       # object id -> type column of the listing
my @commits;     # ids of all objects whose type is "commit"
my %names;       # commit id -> name reported by "git name-rev --all"
my %depths;      # object id -> delta depth (0 for non-delta objects)
my @depths;      # every delta depth, in input order

# An object id is 40 hex digits (SHA-1) or 64 hex digits (SHA-256).
my $oid_re = qr/^(?:[0-9a-f]{40}|[0-9a-f]{64})$/;

while (my $line = <STDIN>) {
    # Columns: id type size size-in-pack offset [depth base-id].
    # The two columns between size and depth are not needed here.
    my ($sha1, $type, $size, undef, undef, $depth, $parent) = split(' ', $line);

    # Skip blank lines, summary lines and anything else that is not an
    # object record.
    next unless (defined $sha1 && $sha1 =~ $oid_re);
    next unless (defined $type && defined $size);

    $depths{$sha1} = $depth || 0;
    push(@depths, $depth || 0);
    push(@commits, $sha1) if ($type eq 'commit');
    $parents{$sha1} = $parent;
    $types{$sha1} = $type;
    $sizes{$sha1} = $size;

    if ($parent) {
        push(@{$children{$parent}}, $sha1);
    } else {
        # No base id: this object is the root of a delta tree.
        push(@roots, $sha1);
    }
}

# Open a read pipe from "git @args" without going through a shell.
# Returns the filehandle, or nothing (after a warning) if git could not
# be started.
sub open_git {
    my @args = @_;
    my $fh;
    unless (open($fh, '-|', 'git', @args)) {
        warn "cannot run git @args: $!\n";
        return;
    }
    return $fh;
}

# --- Optional filename lookup (slow: one "git ls-tree" per commit) -----

if ($filenames && ($tree || $dump)) {
    if (my $names_fh = open_git('name-rev', '--all')) {
        while (my $line = <$names_fh>) {
            if ($line =~ /^(\S+)\s+(.*)$/) {
                $names{$1} = $2;
            }
        }
        close($names_fh)
            or warn "git name-rev --all exited with status " . ($? >> 8) . "\n";
    }

    for my $commit (@commits) {
        # A commit without a known name is shown with an empty name.
        my $name = defined $names{$commit} ? $names{$commit} : '';
        print STDERR "Plumbing tree $name\n";

        my $tree_fh = open_git('ls-tree', '-t', '-r', $commit);
        next unless $tree_fh;
        while (my $line = <$tree_fh>) {
            # Fields: mode, type, object id, path.
            if ($line =~ /^(\S+)\s+(\S+)\s+(\S+)\s+(.*)$/) {
                my ($sha1, $path) = ($3, $4);
                # When an object appears in several commits the last
                # commit processed wins.
                $paths{$sha1} = "$path @ $name";
            }
        }
        close($tree_fh)
            or warn "git ls-tree $commit exited with status " . ($? >> 8) . "\n";
    }
}

# --- Statistics helpers ------------------------------------------------

# Return the numeric sum of the arguments (0 for an empty list).
sub total {
    my $sum = 0;
    $sum += $_ for @_;
    return $sum;
}

# Compute summary statistics for a list of numbers.
#
# Returns the list (count, total, min, max, mean, median, std_dev).
# The median is the element at index int(count / 2) of the sorted data,
# i.e. the upper middle element for an even count.  std_dev is the
# population standard deviation.  An empty list yields all zeros rather
# than dying on a division by zero.
sub stats {
    my @data = sort {$a <=> $b} @_;
    my $count = scalar @data;
    return (0, 0, 0, 0, 0, 0, 0) unless $count;

    my $min = $data[0];
    my $max = $data[$#data];
    my $total = total(@data);
    my $mean = $total / $count;
    my $median = $data[int($count / 2)];
    my $diff_sum = total(map { ($_ - $mean)**2 } @data);
    my $std_dev = sqrt($diff_sum / $count);
    return ($count, $total, $min, $max, $mean, $median, $std_dev);
}

# Print one line of statistics for the given numbers, prefixed by $name.
sub print_stats {
    my $name = shift;
    my ($count, $total, $min, $max, $mean, $median, $std_dev) = stats(@_);
    printf("%s: count %s total %s min %s max %s mean %.2f median %s std_dev %.2f\n",
           $name, $count, $total, $min, $max, $mean, $median, $std_dev);
}

# --- Delta tree traversal ----------------------------------------------

my @sizes;             # object sizes of the delta tree currently being walked
my @path_sizes;        # path sizes of the delta tree currently being walked
my @all_sizes;         # object sizes of every object visited
my @all_path_sizes;    # path sizes of every object visited
my %path_sizes;        # object id -> path size

# Walk the delta tree rooted at $sha1 in pre-order (each object before
# the deltas based on it, children in input order).
#
#   $sha1      - id of the object to start from
#   $depth     - depth to report for $sha1
#   $path_size - sum of the sizes of the objects on the chain above $sha1
#
# Every visited object is recorded in @sizes, @all_sizes, @path_sizes,
# @all_path_sizes and %path_sizes, and printed when -tree is in effect.
# An explicit stack is used instead of recursion so that long delta
# chains do not trigger "Deep recursion" warnings.
sub dig {
    my ($sha1, $depth, $path_size) = @_;
    my @todo = ([$sha1, $depth, $path_size]);

    while (@todo) {
        my ($id, $level, $base_size) = @{pop(@todo)};
        my $size = $sizes{$id};
        my $chain_size = $base_size + $size;

        push(@sizes, $size);
        push(@all_sizes, $size);
        push(@path_sizes, $chain_size);
        push(@all_path_sizes, $chain_size);
        $path_sizes{$id} = $chain_size;

        if ($tree) {
            printf("%3d%s %6s %s %8d %8d %s\n",
                   $level, (" " x $level), $types{$id},
                   $id, $size, $chain_size,
                   defined $paths{$id} ? $paths{$id} : '');
        }

        # Push the children reversed so the first child is popped (and
        # fully explored) first.
        my $kids = $children{$id} || [];
        push(@todo, map { [$_, $level + 1, $chain_size] } reverse @$kids);
    }
    return;
}

my @tree_sizes;         # per delta tree: sum of object sizes
my @tree_path_sizes;    # per delta tree: sum of path sizes

for my $root (@roots) {
    @sizes = ();
    @path_sizes = ();
    dig($root, 0, 0);
    push(@tree_sizes, total(@sizes));
    push(@tree_path_sizes, total(@path_sizes));
    if ($tree) {
        # Statistics are pointless for a tree of one object.
        if (@sizes > 1) {
            print_stats(" size", @sizes);
            print_stats("path size", @path_sizes);
        }
        print "\n";
    }
}

# --- Final report ------------------------------------------------------

if ($dump) {
    for my $sha1 (sort keys %sizes) {
        # An object whose delta base is missing from the listing is never
        # visited by dig() and so has no path size.
        my $path_size = defined $path_sizes{$sha1} ? $path_sizes{$sha1} : '';
        my $path = defined $paths{$sha1} ? $paths{$sha1} : '';
        print "$sha1 $sizes{$sha1} $path_size $depths{$sha1} $path\n";
    }
} else {
    print_stats(" all sizes", @all_sizes);
    print_stats(" all path sizes", @all_path_sizes);
    print_stats(" tree sizes", @tree_sizes);
    print_stats("tree path sizes", @tree_path_sizes);
    print_stats(" depths", @depths);
}