Re: [PATCH v2 11/11] contrib: remove "stats" directory

Elijah Newren <newren@xxxxxxxxx> · Fri, 9 May 2025 16:31:42 -0700



On Fri, May 9, 2025 at 2:45 AM Patrick Steinhardt <ps@xxxxxx> wrote:
>
> The "stats" directory contains a couple of scripts to do some statistics
> on a repository:
>
>   - "git-common-hash" shows the longest common hash prefixes and can be
>     used to determine the minimum prefix length to use for object names
>     to be unique. The script was last touched in 53474eb92ff
>     (contrib: update stats/mailmap script, 2012-12-12) and searching for
>     it on the internet doesn't really surface any potential use cases or
>     even mentions of it.
>
>     Modern Git also shouldn't really need this tool as it knows to
>     automatically scale printed prefixes via some heuristics.
>
>   - "mailmap.pl" performs some statistics on the number of mailmapped
>     commits in a repository. It was last modified in 53474eb92ff
>     (contrib: update stats/mailmap script, 2012-12-12) and has since
>     been bitrotting. It doesn't even compile anymore:
>
>         $ perl contrib/stats/mailmap.pl
>         Experimental keys on scalar is now forbidden at contrib/stats/mailmap.pl line 57.
>         Type of arg 1 to keys must be hash or array (not hash element) at contrib/stats/mailmap.pl line 57, near "}) "
>         Experimental keys on scalar is now forbidden at contrib/stats/mailmap.pl line 57.
>         Type of arg 1 to keys must be hash or array (not private variable) at contrib/stats/mailmap.pl line 57, near "$h)"
>         Experimental keys on scalar is now forbidden at contrib/stats/mailmap.pl line 64.
>         Type of arg 1 to keys must be hash or array (not private variable) at contrib/stats/mailmap.pl line 64, near "$h)"
>         Execution of contrib/stats/mailmap.pl aborted due to compilation errors.
>
>     This should be good-enough signal to indicate that nodoby is using
>     this script at all anymore.

s/nodoby/nobody/

>   - "packinfo.pl" takes the output from git-verify-pack(1) and performs
>     some pretty printing thereof. On the one hand it reformats the
>     output to be easier to read and provide some summaries. On the other
>     hand it may also print filenames of blobs.
>
>     The script was last touched in 3b1eb124932 (contrib: update
>     packinfo.pl to not use dashed commands, 2008-10-17), but it still
>     works nowadays. Even so, it is quite unlikely that anybody is still
>     using it. And if the provided information really was useful we
>     should rather think about moving it into git-verify-pack(1) itself.
>
> Remove the whole directory.
>
> Signed-off-by: Patrick Steinhardt <ps@xxxxxx>
> ---
>  contrib/stats/git-common-hash |  26 ------
>  contrib/stats/mailmap.pl      |  70 --------------
>  contrib/stats/packinfo.pl     | 212 ------------------------------------------
>  3 files changed, 308 deletions(-)
>
> diff --git a/contrib/stats/git-common-hash b/contrib/stats/git-common-hash
> deleted file mode 100755
> index e27fd088be1..00000000000
> --- a/contrib/stats/git-common-hash
> +++ /dev/null
> @@ -1,26 +0,0 @@
> -#!/bin/sh
> -
> -# This script displays the distribution of longest common hash prefixes.
> -# This can be used to determine the minimum prefix length to use
> -# for object names to be unique.
> -
> -git rev-list --objects --all | sort | perl -lne '
> -  substr($_, 40) = "";
> -  # uncomment next line for a distribution of bits instead of hex chars
> -  # $_ = unpack("B*",pack("H*",$_));
> -  if (defined $p) {
> -    ($p ^ $_) =~ /^(\0*)/;
> -    $common = length $1;
> -    if (defined $pcommon) {
> -      $count[$pcommon > $common ? $pcommon : $common]++;
> -    } else {
> -      $count[$common]++; # first item
> -    }
> -  }
> -  $p = $_;
> -  $pcommon = $common;
> -  END {
> -    $count[$common]++; # last item
> -    print "$_: $count[$_]" for 0..$#count;
> -  }
> -'
> diff --git a/contrib/stats/mailmap.pl b/contrib/stats/mailmap.pl
> deleted file mode 100755
> index 9513f5e35b4..00000000000
> --- a/contrib/stats/mailmap.pl
> +++ /dev/null
> @@ -1,70 +0,0 @@
> -#!/usr/bin/perl
> -
> -use warnings 'all';
> -use strict;
> -use Getopt::Long;
> -
> -my $match_emails;
> -my $match_names;
> -my $order_by = 'count';
> -Getopt::Long::Configure(qw(bundling));
> -GetOptions(
> -       'emails|e!' => \$match_emails,
> -       'names|n!'  => \$match_names,
> -       'count|c'   => sub { $order_by = 'count' },
> -       'time|t'    => sub { $order_by = 'stamp' },
> -) or exit 1;
> -$match_emails = 1 unless $match_names;
> -
> -my $email = {};
> -my $name = {};
> -
> -open(my $fh, '-|', "git log --format='%at <%aE> %aN'");
> -while(<$fh>) {
> -       my ($t, $e, $n) = /(\S+) <(\S+)> (.*)/;
> -       mark($email, $e, $n, $t);
> -       mark($name, $n, $e, $t);
> -}
> -close($fh);
> -
> -if ($match_emails) {
> -       foreach my $e (dups($email)) {
> -               foreach my $n (vals($email->{$e})) {
> -                       show($n, $e, $email->{$e}->{$n});
> -               }
> -               print "\n";
> -       }
> -}
> -if ($match_names) {
> -       foreach my $n (dups($name)) {
> -               foreach my $e (vals($name->{$n})) {
> -                       show($n, $e, $name->{$n}->{$e});
> -               }
> -               print "\n";
> -       }
> -}
> -exit 0;
> -
> -sub mark {
> -       my ($h, $k, $v, $t) = @_;
> -       my $e = $h->{$k}->{$v} ||= { count => 0, stamp => 0 };
> -       $e->{count}++;
> -       $e->{stamp} = $t unless $t < $e->{stamp};
> -}
> -
> -sub dups {
> -       my $h = shift;
> -       return grep { keys($h->{$_}) > 1 } keys($h);
> -}
> -
> -sub vals {
> -       my $h = shift;
> -       return sort {
> -               $h->{$b}->{$order_by} <=> $h->{$a}->{$order_by}
> -       } keys($h);
> -}
> -
> -sub show {
> -       my ($n, $e, $h) = @_;
> -       print "$n <$e> ($h->{$order_by})\n";
> -}
> diff --git a/contrib/stats/packinfo.pl b/contrib/stats/packinfo.pl
> deleted file mode 100755
> index be188c0f11d..00000000000
> --- a/contrib/stats/packinfo.pl
> +++ /dev/null
> @@ -1,212 +0,0 @@
> -#!/usr/bin/perl
> -#
> -# This tool will print vaguely pretty information about a pack.  It
> -# expects the output of "git verify-pack -v" as input on stdin.
> -#
> -# $ git verify-pack -v | packinfo.pl
> -#
> -# This prints some full-pack statistics; currently "all sizes", "all
> -# path sizes", "tree sizes", "tree path sizes", and "depths".
> -#
> -# * "all sizes" stats are across every object size in the file;
> -#   full sizes for base objects, and delta size for deltas.
> -# * "all path sizes" stats are across all object's "path sizes".
> -#   A path size is the sum of the size of the delta chain, including the
> -#   base object.  In other words, it's how many bytes need be read to
> -#   reassemble the file from deltas.
> -# * "tree sizes" are object sizes grouped into delta trees.
> -# * "tree path sizes" are path sizes grouped into delta trees.
> -# * "depths" should be obvious.
> -#
> -# When run as:
> -#
> -# $ git verify-pack -v | packinfo.pl -tree
> -#
> -# the trees of objects are output along with the stats.  This looks
> -# like:
> -#
> -#   0 commit 031321c6...      803      803
> -#
> -#   0   blob 03156f21...     1767     1767
> -#   1    blob f52a9d7f...       10     1777
> -#   2     blob a8cc5739...       51     1828
> -#   3      blob 660e90b1...       15     1843
> -#   4       blob 0cb8e3bb...       33     1876
> -#   2     blob e48607f0...      311     2088
> -#      size: count 6 total 2187 min 10 max 1767 mean 364.50 median 51 std_dev 635.85
> -# path size: count 6 total 11179 min 1767 max 2088 mean 1863.17 median 1843 std_dev 107.26
> -#
> -# The first number after the sha1 is the object size, the second
> -# number is the path size.  The statistics are across all objects in
> -# the previous delta tree.  Obviously they are omitted for trees of
> -# one object.
> -#
> -# When run as:
> -#
> -# $ git verify-pack -v | packinfo.pl -tree -filenames
> -#
> -# it adds filenames to the tree.  Getting this information is slow:
> -#
> -#   0   blob 03156f21...     1767     1767 Documentation/git-lost-found.txt @ tags/v1.2.0~142
> -#   1    blob f52a9d7f...       10     1777 Documentation/git-lost-found.txt @ tags/v1.5.0-rc1~74
> -#   2     blob a8cc5739...       51     1828 Documentation/git-lost+found.txt @ tags/v0.99.9h^0
> -#   3      blob 660e90b1...       15     1843 Documentation/git-lost+found.txt @ master~3222^2~2
> -#   4       blob 0cb8e3bb...       33     1876 Documentation/git-lost+found.txt @ master~3222^2~3
> -#   2     blob e48607f0...      311     2088 Documentation/git-lost-found.txt @ tags/v1.5.2-rc3~4
> -#      size: count 6 total 2187 min 10 max 1767 mean 364.50 median 51 std_dev 635.85
> -# path size: count 6 total 11179 min 1767 max 2088 mean 1863.17 median 1843 std_dev 107.26
> -#
> -# When run as:
> -#
> -# $ git verify-pack -v | packinfo.pl -dump
> -#
> -# it prints out "sha1 size pathsize depth" for each sha1 in lexical
> -# order.
> -#
> -# 000079a2eaef17b7eae70e1f0f635557ea67b644 30 472 7
> -# 00013cafe6980411aa6fdd940784917b5ff50f0a 44 1542 4
> -# 000182eacf99cde27d5916aa415921924b82972c 499 499 0
> -# ...
> -#
> -# This is handy for comparing two packs.  Adding "-filenames" will add
> -# filenames, as per "-tree -filenames" above.
> -
> -use strict;
> -use Getopt::Long;
> -
> -my $filenames = 0;
> -my $tree = 0;
> -my $dump = 0;
> -GetOptions("tree" => \$tree,
> -           "filenames" => \$filenames,
> -           "dump" => \$dump);
> -
> -my %parents;
> -my %children;
> -my %sizes;
> -my @roots;
> -my %paths;
> -my %types;
> -my @commits;
> -my %names;
> -my %depths;
> -my @depths;
> -
> -while (<STDIN>) {
> -    my ($sha1, $type, $size, $space, $offset, $depth, $parent) = split(/\s+/, $_);
> -    next unless ($sha1 =~ /^[0-9a-f]{40}$/);
> -    $depths{$sha1} = $depth || 0;
> -    push(@depths, $depth || 0);
> -    push(@commits, $sha1) if ($type eq 'commit');
> -    push(@roots, $sha1) unless $parent;
> -    $parents{$sha1} = $parent;
> -    $types{$sha1} = $type;
> -    push(@{$children{$parent}}, $sha1);
> -    $sizes{$sha1} = $size;
> -}
> -
> -if ($filenames && ($tree || $dump)) {
> -    open(NAMES, "git name-rev --all|");
> -    while (<NAMES>) {
> -        if (/^(\S+)\s+(.*)$/) {
> -            my ($sha1, $name) = ($1, $2);
> -            $names{$sha1} = $name;
> -        }
> -    }
> -    close NAMES;
> -
> -    for my $commit (@commits) {
> -        my $name = $names{$commit};
> -        open(TREE, "git ls-tree -t -r $commit|");
> -        print STDERR "Plumbing tree $name\n";
> -        while (<TREE>) {
> -            if (/^(\S+)\s+(\S+)\s+(\S+)\s+(.*)$/) {
> -                my ($mode, $type, $sha1, $path) = ($1, $2, $3, $4);
> -                $paths{$sha1} = "$path @ $name";
> -            }
> -        }
> -        close TREE;
> -    }
> -}
> -
> -sub stats {
> -    my @data = sort {$a <=> $b} @_;
> -    my $min = $data[0];
> -    my $max = $data[$#data];
> -    my $total = 0;
> -    my $count = scalar @data;
> -    for my $datum (@data) {
> -        $total += $datum;
> -    }
> -    my $mean = $total / $count;
> -    my $median = $data[int(@data / 2)];
> -    my $diff_sum = 0;
> -    for my $datum (@data) {
> -        $diff_sum += ($datum - $mean)**2;
> -    }
> -    my $std_dev = sqrt($diff_sum / $count);
> -    return ($count, $total, $min, $max, $mean, $median, $std_dev);
> -}
> -
> -sub print_stats {
> -    my $name = shift;
> -    my ($count, $total, $min, $max, $mean, $median, $std_dev) = stats(@_);
> -    printf("%s: count %s total %s min %s max %s mean %.2f median %s std_dev %.2f\n",
> -           $name, $count, $total, $min, $max, $mean, $median, $std_dev);
> -}
> -
> -my @sizes;
> -my @path_sizes;
> -my @all_sizes;
> -my @all_path_sizes;
> -my %path_sizes;
> -
> -sub dig {
> -    my ($sha1, $depth, $path_size) = @_;
> -    $path_size += $sizes{$sha1};
> -    push(@sizes, $sizes{$sha1});
> -    push(@all_sizes, $sizes{$sha1});
> -    push(@path_sizes, $path_size);
> -    push(@all_path_sizes, $path_size);
> -    $path_sizes{$sha1} = $path_size;
> -    if ($tree) {
> -        printf("%3d%s %6s %s %8d %8d %s\n",
> -               $depth, (" " x $depth), $types{$sha1},
> -               $sha1, $sizes{$sha1}, $path_size, $paths{$sha1});
> -    }
> -    for my $child (@{$children{$sha1}}) {
> -        dig($child, $depth + 1, $path_size);
> -    }
> -}
> -
> -my @tree_sizes;
> -my @tree_path_sizes;
> -
> -for my $root (@roots) {
> -    undef @sizes;
> -    undef @path_sizes;
> -    dig($root, 0, 0);
> -    my ($aa, $sz_total) = stats(@sizes);
> -    my ($bb, $psz_total) = stats(@path_sizes);
> -    push(@tree_sizes, $sz_total);
> -    push(@tree_path_sizes, $psz_total);
> -    if ($tree) {
> -        if (@sizes > 1) {
> -            print_stats("     size", @sizes);
> -            print_stats("path size", @path_sizes);
> -        }
> -        print "\n";
> -    }
> -}
> -
> -if ($dump) {
> -    for my $sha1 (sort keys %sizes) {
> -        print "$sha1 $sizes{$sha1} $path_sizes{$sha1} $depths{$sha1} $paths{$sha1}\n";
> -    }
> -} else {
> -    print_stats("      all sizes", @all_sizes);
> -    print_stats(" all path sizes", @all_path_sizes);
> -    print_stats("     tree sizes", @tree_sizes);
> -    print_stats("tree path sizes", @tree_path_sizes);
> -    print_stats("         depths", @depths);
> -}
>
> --
> 2.49.0.1077.gc0e912fd4c.dirty
>
>