On Fri, May 9, 2025 at 2:45 AM Patrick Steinhardt <ps@xxxxxx> wrote: > > The "stats" directory contains a couple of scripts to do some statistics > on a repository: > > - "git-common-hash" shows the longest common hash prefixes and can be > used to determine the minimum prefix length to use for object names > to be unique. The script has last been touched in 53474eb92ff > (contrib: update stats/mailmap script, 2012-12-12) and searching for > it on the internet doesn't really surface any potential use cases or > even mentions of it. > > Modern Git also shouldn't really need this tool as it knows to > automatically scale printed prefixes via some heuristics. > > - "mailmap.pl" performs some statistics on the number of mailmapped > commits in a repository. It has last been modified in 53474eb92ff > (contrib: update stats/mailmap script, 2012-12-12) and has since > been bitrotting. It doesn't even compile nowadays anymore: > > $ perl contrib/stats/mailmap.pl > Experimental keys on scalar is now forbidden at contrib/stats/mailmap.pl line 57. > Type of arg 1 to keys must be hash or array (not hash element) at contrib/stats/mailmap.pl line 57, near "}) " > Experimental keys on scalar is now forbidden at contrib/stats/mailmap.pl line 57. > Type of arg 1 to keys must be hash or array (not private variable) at contrib/stats/mailmap.pl line 57, near "$h)" > Experimental keys on scalar is now forbidden at contrib/stats/mailmap.pl line 64. > Type of arg 1 to keys must be hash or array (not private variable) at contrib/stats/mailmap.pl line 64, near "$h)" > Execution of contrib/stats/mailmap.pl aborted due to compilation errors. > > This should be good-enough signal to indicate that nodoby is using > this script at all anymore. s/nodoby/nobody/ > - "packinfo.pl" takes the output from git-verify-pack(1) and performs > some pretty printing thereof. On the one hand it reformats the > output to be easier to read and provide some summaries. On the other > hand it may also print filenames of blobs. > > The script has last been touched in 3b1eb124932 (contrib: update > packinfo.pl to not use dashed commands, 2008-10-17), but it still > works nowadays. Even so, it is quite unlikely that anybody is still > using it. And if the provided information really was useful we > should rather think about moving it into git-verify-pack(1) itself. > > Remove the whole directory. > > Signed-off-by: Patrick Steinhardt <ps@xxxxxx> > --- > contrib/stats/git-common-hash | 26 ------ > contrib/stats/mailmap.pl | 70 -------------- > contrib/stats/packinfo.pl | 212 ------------------------------------------ > 3 files changed, 308 deletions(-) > > diff --git a/contrib/stats/git-common-hash b/contrib/stats/git-common-hash > deleted file mode 100755 > index e27fd088be1..00000000000 > --- a/contrib/stats/git-common-hash > +++ /dev/null > @@ -1,26 +0,0 @@ > -#!/bin/sh > - > -# This script displays the distribution of longest common hash prefixes. > -# This can be used to determine the minimum prefix length to use > -# for object names to be unique. > - > -git rev-list --objects --all | sort | perl -lne ' > - substr($_, 40) = ""; > - # uncomment next line for a distribution of bits instead of hex chars > - # $_ = unpack("B*",pack("H*",$_)); > - if (defined $p) { > - ($p ^ $_) =~ /^(\0*)/; > - $common = length $1; > - if (defined $pcommon) { > - $count[$pcommon > $common ? $pcommon : $common]++; > - } else { > - $count[$common]++; # first item > - } > - } > - $p = $_; > - $pcommon = $common; > - END { > - $count[$common]++; # last item > - print "$_: $count[$_]" for 0..$#count; > - } > -' > diff --git a/contrib/stats/mailmap.pl b/contrib/stats/mailmap.pl > deleted file mode 100755 > index 9513f5e35b4..00000000000 > --- a/contrib/stats/mailmap.pl > +++ /dev/null > @@ -1,70 +0,0 @@ > -#!/usr/bin/perl > - > -use warnings 'all'; > -use strict; > -use Getopt::Long; > - > -my $match_emails; > -my $match_names; > -my $order_by = 'count'; > -Getopt::Long::Configure(qw(bundling)); > -GetOptions( > - 'emails|e!' => \$match_emails, > - 'names|n!' => \$match_names, > - 'count|c' => sub { $order_by = 'count' }, > - 'time|t' => sub { $order_by = 'stamp' }, > -) or exit 1; > -$match_emails = 1 unless $match_names; > - > -my $email = {}; > -my $name = {}; > - > -open(my $fh, '-|', "git log --format='%at <%aE> %aN'"); > -while(<$fh>) { > - my ($t, $e, $n) = /(\S+) <(\S+)> (.*)/; > - mark($email, $e, $n, $t); > - mark($name, $n, $e, $t); > -} > -close($fh); > - > -if ($match_emails) { > - foreach my $e (dups($email)) { > - foreach my $n (vals($email->{$e})) { > - show($n, $e, $email->{$e}->{$n}); > - } > - print "\n"; > - } > -} > -if ($match_names) { > - foreach my $n (dups($name)) { > - foreach my $e (vals($name->{$n})) { > - show($n, $e, $name->{$n}->{$e}); > - } > - print "\n"; > - } > -} > -exit 0; > - > -sub mark { > - my ($h, $k, $v, $t) = @_; > - my $e = $h->{$k}->{$v} ||= { count => 0, stamp => 0 }; > - $e->{count}++; > - $e->{stamp} = $t unless $t < $e->{stamp}; > -} > - > -sub dups { > - my $h = shift; > - return grep { keys($h->{$_}) > 1 } keys($h); > -} > - > -sub vals { > - my $h = shift; > - return sort { > - $h->{$b}->{$order_by} <=> $h->{$a}->{$order_by} > - } keys($h); > -} > - > -sub show { > - my ($n, $e, $h) = @_; > - print "$n <$e> ($h->{$order_by})\n"; > -} > diff --git a/contrib/stats/packinfo.pl b/contrib/stats/packinfo.pl > deleted file mode 100755 > index be188c0f11d..00000000000 > --- a/contrib/stats/packinfo.pl > +++ /dev/null > @@ -1,212 +0,0 @@ > -#!/usr/bin/perl > -# > -# This tool will print vaguely pretty information about a pack. It > -# expects the output of "git verify-pack -v" as input on stdin. > -# > -# $ git verify-pack -v | packinfo.pl > -# > -# This prints some full-pack statistics; currently "all sizes", "all > -# path sizes", "tree sizes", "tree path sizes", and "depths". > -# > -# * "all sizes" stats are across every object size in the file; > -# full sizes for base objects, and delta size for deltas. > -# * "all path sizes" stats are across all object's "path sizes". > -# A path size is the sum of the size of the delta chain, including the > -# base object. In other words, it's how many bytes need be read to > -# reassemble the file from deltas. > -# * "tree sizes" are object sizes grouped into delta trees. > -# * "tree path sizes" are path sizes grouped into delta trees. > -# * "depths" should be obvious. > -# > -# When run as: > -# > -# $ git verify-pack -v | packinfo.pl -tree > -# > -# the trees of objects are output along with the stats. This looks > -# like: > -# > -# 0 commit 031321c6... 803 803 > -# > -# 0 blob 03156f21... 1767 1767 > -# 1 blob f52a9d7f... 10 1777 > -# 2 blob a8cc5739... 51 1828 > -# 3 blob 660e90b1... 15 1843 > -# 4 blob 0cb8e3bb... 33 1876 > -# 2 blob e48607f0... 311 2088 > -# size: count 6 total 2187 min 10 max 1767 mean 364.50 median 51 std_dev 635.85 > -# path size: count 6 total 11179 min 1767 max 2088 mean 1863.17 median 1843 std_dev 107.26 > -# > -# The first number after the sha1 is the object size, the second > -# number is the path size. The statistics are across all objects in > -# the previous delta tree. Obviously they are omitted for trees of > -# one object. > -# > -# When run as: > -# > -# $ git verify-pack -v | packinfo.pl -tree -filenames > -# > -# it adds filenames to the tree. Getting this information is slow: > -# > -# 0 blob 03156f21... 1767 1767 Documentation/git-lost-found.txt @ tags/v1.2.0~142 > -# 1 blob f52a9d7f... 10 1777 Documentation/git-lost-found.txt @ tags/v1.5.0-rc1~74 > -# 2 blob a8cc5739... 51 1828 Documentation/git-lost+found.txt @ tags/v0.99.9h^0 > -# 3 blob 660e90b1... 15 1843 Documentation/git-lost+found.txt @ master~3222^2~2 > -# 4 blob 0cb8e3bb... 33 1876 Documentation/git-lost+found.txt @ master~3222^2~3 > -# 2 blob e48607f0... 311 2088 Documentation/git-lost-found.txt @ tags/v1.5.2-rc3~4 > -# size: count 6 total 2187 min 10 max 1767 mean 364.50 median 51 std_dev 635.85 > -# path size: count 6 total 11179 min 1767 max 2088 mean 1863.17 median 1843 std_dev 107.26 > -# > -# When run as: > -# > -# $ git verify-pack -v | packinfo.pl -dump > -# > -# it prints out "sha1 size pathsize depth" for each sha1 in lexical > -# order. > -# > -# 000079a2eaef17b7eae70e1f0f635557ea67b644 30 472 7 > -# 00013cafe6980411aa6fdd940784917b5ff50f0a 44 1542 4 > -# 000182eacf99cde27d5916aa415921924b82972c 499 499 0 > -# ... > -# > -# This is handy for comparing two packs. Adding "-filenames" will add > -# filenames, as per "-tree -filenames" above. > - > -use strict; > -use Getopt::Long; > - > -my $filenames = 0; > -my $tree = 0; > -my $dump = 0; > -GetOptions("tree" => \$tree, > - "filenames" => \$filenames, > - "dump" => \$dump); > - > -my %parents; > -my %children; > -my %sizes; > -my @roots; > -my %paths; > -my %types; > -my @commits; > -my %names; > -my %depths; > -my @depths; > - > -while (<STDIN>) { > - my ($sha1, $type, $size, $space, $offset, $depth, $parent) = split(/\s+/, $_); > - next unless ($sha1 =~ /^[0-9a-f]{40}$/); > - $depths{$sha1} = $depth || 0; > - push(@depths, $depth || 0); > - push(@commits, $sha1) if ($type eq 'commit'); > - push(@roots, $sha1) unless $parent; > - $parents{$sha1} = $parent; > - $types{$sha1} = $type; > - push(@{$children{$parent}}, $sha1); > - $sizes{$sha1} = $size; > -} > - > -if ($filenames && ($tree || $dump)) { > - open(NAMES, "git name-rev --all|"); > - while (<NAMES>) { > - if (/^(\S+)\s+(.*)$/) { > - my ($sha1, $name) = ($1, $2); > - $names{$sha1} = $name; > - } > - } > - close NAMES; > - > - for my $commit (@commits) { > - my $name = $names{$commit}; > - open(TREE, "git ls-tree -t -r $commit|"); > - print STDERR "Plumbing tree $name\n"; > - while (<TREE>) { > - if (/^(\S+)\s+(\S+)\s+(\S+)\s+(.*)$/) { > - my ($mode, $type, $sha1, $path) = ($1, $2, $3, $4); > - $paths{$sha1} = "$path @ $name"; > - } > - } > - close TREE; > - } > -} > - > -sub stats { > - my @data = sort {$a <=> $b} @_; > - my $min = $data[0]; > - my $max = $data[$#data]; > - my $total = 0; > - my $count = scalar @data; > - for my $datum (@data) { > - $total += $datum; > - } > - my $mean = $total / $count; > - my $median = $data[int(@data / 2)]; > - my $diff_sum = 0; > - for my $datum (@data) { > - $diff_sum += ($datum - $mean)**2; > - } > - my $std_dev = sqrt($diff_sum / $count); > - return ($count, $total, $min, $max, $mean, $median, $std_dev); > -} > - > -sub print_stats { > - my $name = shift; > - my ($count, $total, $min, $max, $mean, $median, $std_dev) = stats(@_); > - printf("%s: count %s total %s min %s max %s mean %.2f median %s std_dev %.2f\n", > - $name, $count, $total, $min, $max, $mean, $median, $std_dev); > -} > - > -my @sizes; > -my @path_sizes; > -my @all_sizes; > -my @all_path_sizes; > -my %path_sizes; > - > -sub dig { > - my ($sha1, $depth, $path_size) = @_; > - $path_size += $sizes{$sha1}; > - push(@sizes, $sizes{$sha1}); > - push(@all_sizes, $sizes{$sha1}); > - push(@path_sizes, $path_size); > - push(@all_path_sizes, $path_size); > - $path_sizes{$sha1} = $path_size; > - if ($tree) { > - printf("%3d%s %6s %s %8d %8d %s\n", > - $depth, (" " x $depth), $types{$sha1}, > - $sha1, $sizes{$sha1}, $path_size, $paths{$sha1}); > - } > - for my $child (@{$children{$sha1}}) { > - dig($child, $depth + 1, $path_size); > - } > -} > - > -my @tree_sizes; > -my @tree_path_sizes; > - > -for my $root (@roots) { > - undef @sizes; > - undef @path_sizes; > - dig($root, 0, 0); > - my ($aa, $sz_total) = stats(@sizes); > - my ($bb, $psz_total) = stats(@path_sizes); > - push(@tree_sizes, $sz_total); > - push(@tree_path_sizes, $psz_total); > - if ($tree) { > - if (@sizes > 1) { > - print_stats(" size", @sizes); > - print_stats("path size", @path_sizes); > - } > - print "\n"; > - } > -} > - > -if ($dump) { > - for my $sha1 (sort keys %sizes) { > - print "$sha1 $sizes{$sha1} $path_sizes{$sha1} $depths{$sha1} $paths{$sha1}\n"; > - } > -} else { > - print_stats(" all sizes", @all_sizes); > - print_stats(" all path sizes", @all_path_sizes); > - print_stats(" tree sizes", @tree_sizes); > - print_stats("tree path sizes", @tree_path_sizes); > - print_stats(" depths", @depths); > -} > > -- > 2.49.0.1077.gc0e912fd4c.dirty > >