1#!/usr/bin/env perl
2package Porting::updateAUTHORS;
3use strict;
4use warnings;
5use Getopt::Long qw(GetOptions);
6use Pod::Usage qw(pod2usage);
7use Data::Dumper;
8use Encode qw(encode_utf8 decode_utf8 decode);
9
10# The style of this file is determined by:
11#
12# perltidy -w -ple -bbb -bbc -bbs -nolq -l=80 -noll -nola -nwls='=' \
13#   -isbc -nolc -otr -kis -ci=4 -se -sot -sct -nsbl -pt=2 -fs  \
14#   -fsb='#start-no-tidy' -fse='#end-no-tidy'
15
16# Info and config for passing to git log.
17#   %an: author name
18#   %aN: author name (respecting .mailmap, see git-shortlog(1) or git-blame(1))
19#   %ae: author email
20#   %aE: author email (respecting .mailmap, see git-shortlog(1) or git-blame(1))
21#   %cn: committer name
22#   %cN: committer name (respecting .mailmap, see git-shortlog(1) or git-blame(1))
23#   %ce: committer email
24#   %cE: committer email (respecting .mailmap, see git-shortlog(1) or git-blame(1))
25#   %H: commit hash
26#   %h: abbreviated commit hash
27#   %s: subject
28#   %x00: print a byte from a hex code
29
30my %field_spec= (
31    "an" => "author_name",
32    "aN" => "author_name_mm",
33    "ae" => "author_email",
34    "aE" => "author_email_mm",
35    "cn" => "committer_name",
36    "cN" => "committer_name_mm",
37    "ce" => "committer_email",
38    "cE" => "committer_email_mm",
39    "H"  => "commit_hash",
40    "h"  => "abbrev_hash",
41    "s"  => "commit_subject",
42);
43
44my @field_codes= sort keys %field_spec;
45my @field_names= map { $field_spec{$_} } @field_codes;
46my $tformat= join "%x00", map { "%" . $_ } @field_codes;
47
48sub _make_name_author_info {
49    my ($author_info, $commit_info, $name_key)= @_;
50    (my $email_key= $name_key) =~ s/name/email/;
51    my $email= $commit_info->{$email_key};
52    my $name= $commit_info->{$name_key};
53
54    my $line= $author_info->{"email2line"}{$email}
55        // $author_info->{"name2line"}{$name};
56
57    $line //= sprintf "%-31s<%s>",
58        $commit_info->{$name_key}, $commit_info->{$email_key};
59    return $line;
60}
61
62sub _make_name_simple {
63    my ($commit_info, $key)= @_;
64    my $name_key= $key . "_name";
65    my $email_key= $key . "_email";
66    return sprintf "%s <%s>", $commit_info->{$name_key},
67        lc($commit_info->{$email_key});
68}
69
70sub read_commit_log {
71    my ($author_info, $mailmap_info)= @_;
72    $author_info ||= {};
73    open my $fh, qq(git log --pretty='tformat:$tformat' |);
74
75    while (defined(my $line= <$fh>)) {
76        chomp $line;
77        $line= decode_utf8($line);
78        my $commit_info= {};
79        @{$commit_info}{@field_names}= split /\0/, $line, 0 + @field_names;
80
81        my $author_name_mm= _make_name_author_info($author_info, $commit_info,
82            "author_name_mm");
83
84        my $committer_name_mm=
85            _make_name_author_info($author_info, $commit_info,
86            "committer_name_mm");
87
88        my $author_name_real= _make_name_simple($commit_info, "author");
89
90        my $committer_name_real= _make_name_simple($commit_info, "committer");
91
92        _check_name_mailmap(
93            $mailmap_info, $author_name_mm, $author_name_real,
94            $commit_info,  "author name"
95        );
96        _check_name_mailmap($mailmap_info, $committer_name_mm,
97            $committer_name_real, $commit_info, "committer name");
98
99        $author_info->{"lines"}{$author_name_mm}++;
100        $author_info->{"lines"}{$committer_name_mm}++;
101    }
102    return $author_info;
103}
104
105sub read_authors {
106    my ($authors_file)= @_;
107    $authors_file ||= "AUTHORS";
108
109    my @authors_preamble;
110    open my $in_fh, "<", $authors_file
111        or die "Failed to open for read '$authors_file': $!";
112    while (defined(my $line= <$in_fh>)) {
113        chomp $line;
114        push @authors_preamble, $line;
115        if ($line =~ /^--/) {
116            last;
117        }
118    }
119    my %author_info;
120    while (defined(my $line= <$in_fh>)) {
121        chomp $line;
122        $line= decode_utf8($line);
123        my ($name, $email);
124        my $copy= $line;
125        $copy =~ s/\s+\z//;
126        if ($copy =~ s/<([^<>]*)>//) {
127            $email= $1;
128        }
129        elsif ($copy =~ s/\s+(\@\w+)\z//) {
130            $email= $1;
131        }
132        $copy =~ s/\s+\z//;
133        $name= $copy;
134        $email //= "unknown";
135        $email= lc($email);
136
137        $author_info{"lines"}{$line}++;
138        $author_info{"email2line"}{$email}= $line
139            if $email and $email ne "unknown";
140        $author_info{"name2line"}{$name}= $line
141            if $name and $name ne "unknown";
142        $author_info{"email2name"}{ lc($email) }= $name
143            if $email
144            and $name
145            and $email ne "unknown";
146        $author_info{"name2email"}{$name}= $email
147            if $name and $name ne "unknown";
148    }
149    close $in_fh
150        or die "Failed to close '$authors_file': $!";
151    return (\%author_info, \@authors_preamble);
152}
153
154sub update_authors {
155    my ($author_info, $authors_preamble, $authors_file)= @_;
156    $authors_file ||= "AUTHORS";
157    my $authors_file_new= $authors_file . ".new";
158    open my $out_fh, ">", $authors_file_new
159        or die "Failed to open for write '$authors_file_new': $!";
160    binmode $out_fh;
161    foreach my $line (@$authors_preamble) {
162        print $out_fh encode_utf8($line), "\n"
163            or die "Failed to print to '$authors_file_new': $!";
164    }
165    foreach my $author (_sorted_hash_keys($author_info->{"lines"})) {
166        next if $author =~ /^unknown/;
167        if ($author =~ s/\s*<unknown>\z//) {
168            next if $author =~ /^\w+$/;
169        }
170        print $out_fh encode_utf8($author), "\n"
171            or die "Failed to print to '$authors_file_new': $!";
172    }
173    close $out_fh
174        or die "Failed to close '$authors_file_new': $!";
175    rename $authors_file_new, $authors_file
176        or die "Failed to rename '$authors_file_new' to '$authors_file':$!";
177    return 1;    # ok
178}
179
180sub read_mailmap {
181    my ($mailmap_file)= @_;
182    $mailmap_file ||= ".mailmap";
183
184    open my $in, "<", $mailmap_file
185        or die "Failed to read '$mailmap_file': $!";
186    my %mailmap_hash;
187    my @mailmap_preamble;
188    my $line_num= 0;
189    while (defined(my $line= <$in>)) {
190        ++$line_num;
191        next unless $line =~ /\S/;
192        chomp($line);
193        $line= decode_utf8($line);
194        if ($line =~ /^#/) {
195            if (!keys %mailmap_hash) {
196                push @mailmap_preamble, $line;
197            }
198            else {
199                die encode_utf8 "Not expecting comments after header ",
200                    "finished at line $line_num!\nLine: $line\n";
201            }
202        }
203        else {
204            $mailmap_hash{$line}= $line_num;
205        }
206    }
207    close $in;
208    return \%mailmap_hash, \@mailmap_preamble;
209}
210
211# this can be used to extract data from the checkAUTHORS data
212sub merge_mailmap_with_AUTHORS_and_checkAUTHORS_data {
213    my ($mailmap_hash, $author_info)= @_;
214    require 'Porting/checkAUTHORS.pl' or die "No authors?";
215    my ($map, $preferred_email_or_github)=
216        Porting::checkAUTHORS::generate_known_author_map();
217
218    foreach my $old (sort keys %$preferred_email_or_github) {
219        my $new= $preferred_email_or_github->{$old};
220        next if $old !~ /\@/ or $new !~ /\@/ or $new eq $old;
221        my $name= $author_info->{"email2name"}{$new};
222        if ($name) {
223            my $line= "$name <$new> <$old>";
224            $mailmap_hash->{$line}++;
225        }
226    }
227    return 1;    # ok
228}
229
230sub _sorted_hash_keys {
231    my ($hash)= @_;
232    my @sorted= sort { lc($a) cmp lc($b) || $a cmp $b } keys %$hash;
233    return @sorted;
234}
235
236sub update_mailmap {
237    my ($mailmap_hash, $mailmap_preamble, $mailmap_file)= @_;
238    $mailmap_file ||= ".mailmap";
239
240    my $mailmap_file_new= $mailmap_file . "_new";
241    open my $out, ">", $mailmap_file_new
242        or die "Failed to write '$mailmap_file_new':$!";
243    binmode $out;
244    foreach my $line (@$mailmap_preamble, _sorted_hash_keys($mailmap_hash),) {
245        print $out encode_utf8($line), "\n"
246            or die "Failed to print to '$mailmap_file': $!";
247    }
248    close $out;
249    rename $mailmap_file_new, $mailmap_file
250        or die "Failed to rename '$mailmap_file_new' to '$mailmap_file':$!";
251    return 1;    # ok
252}
253
254sub parse_mailmap_hash {
255    my ($mailmap_hash)= @_;
256    my @recs;
257    foreach my $line (sort keys %$mailmap_hash) {
258        my $line_num= $mailmap_hash->{$line};
259        $line =~ /^ \s* (?: ( [^<>]*? ) \s+ )? <([^<>]*)>
260                (?: \s+ (?: ( [^<>]*? ) \s+ )? <([^<>]*)> )? \s* \z /x
261            or die encode_utf8 "Failed to parse line num $line_num: '$line'";
262        if (!$1 or !$2) {
263            die encode_utf8 "Both preferred name and email are mandatory ",
264                "in line num $line_num: '$line'";
265        }
266
267        # [ preferred_name, preferred_email, other_name, other_email ]
268        push @recs, [ $1, $2, $3, $4, $line_num ];
269    }
270    return \@recs;
271}
272
273sub _safe_set_key {
274    my ($hash, $root_key, $key, $val, $pretty_name)= @_;
275    $hash->{$root_key}{$key} //= $val;
276    my $prev= $hash->{$root_key}{$key};
277    if ($prev ne $val) {
278        die encode_utf8 "Collision on mapping $root_key: "
279            . " '$key' maps to '$prev' and '$val'\n";
280    }
281}
282
283my $O2P= "other2preferred";
284my $O2PN= "other2preferred_name";
285my $O2PE= "other2preferred_email";
286my $P2O= "preferred2other";
287my $N2P= "name2preferred";
288my $E2P= "email2preferred";
289
290my $blurb= "";    # FIXME - replace with a nice message
291
292sub _check_name_mailmap {
293    my ($mailmap_info, $auth_name, $raw_name, $commit_info, $descr)= @_;
294    my $name= $auth_name;
295    $name =~ s/<([^<>]+)>/<\L$1\E>/
296        or $name =~ s/(\s)(\@\w+)\z/$1<\L$2\E>/
297        or $name .= " <unknown>";
298
299    $name =~ s/\s+/ /g;
300
301    if (!$mailmap_info->{$P2O}{$name}) {
302        warn encode_utf8 sprintf "Unknown %s '%s' in commit %s '%s'\n%s",
303            $descr,
304            $name, $commit_info->{"abbrev_hash"},
305            $commit_info->{"commit_subject"},
306            $blurb;
307        $mailmap_info->{add}{"$name $raw_name"}++;
308        return 0;
309    }
310    elsif (!$mailmap_info->{$P2O}{$name}{$raw_name}) {
311        $mailmap_info->{add}{"$name $raw_name"}++;
312    }
313    return 1;
314}
315
316sub check_fix_mailmap_hash {
317    my ($mailmap_hash, $authors_info)= @_;
318    my $parsed= parse_mailmap_hash($mailmap_hash);
319    my @fixed;
320    my %seen_map;
321    my %pref_groups;
322
323    # first pass through the data, do any conversions, eg, LC
324    # the email address, decode any MIME-Header style email addresses.
325    # We also correct any preferred name entries so they match what
326    # we already have in AUTHORS, and check that there aren't collisions
327    # or other issues in the data.
328    foreach my $rec (@$parsed) {
329        my ($pname, $pemail, $oname, $oemail, $line_num)= @$rec;
330        $pemail= lc($pemail);
331        $oemail= lc($oemail) if defined $oemail;
332        if ($pname =~ /=\?UTF-8\?/) {
333            $pname= decode("MIME-Header", $pname);
334        }
335        my $auth_email= $authors_info->{"name2email"}{$pname};
336        if ($auth_email) {
337            ## this name exists in authors, so use its email data for pemail
338            $pemail= $auth_email;
339        }
340        my $auth_name= $authors_info->{"email2name"}{$pemail};
341        if ($auth_name) {
342            ## this email exists in authors, so use its name data for pname
343            $pname= $auth_name;
344        }
345
346        # neither name nor email exist in authors.
347        if ($pname ne "unknown") {
348            if (my $email= $seen_map{"name"}{$pname}) {
349                ## we have seen this pname before, check the pemail
350                ## is consistent
351                if ($email ne $pemail) {
352                    warn encode_utf8 "Inconsistent emails for name '$pname'"
353                        . " at line num $line_num: keeping '$email',"
354                        . " ignoring '$pemail'\n";
355                    $pemail= $email;
356                }
357            }
358            else {
359                $seen_map{"name"}{$pname}= $pemail;
360            }
361        }
362        if ($pemail ne "unknown") {
363            if (my $name= $seen_map{"email"}{$pemail}) {
364                ## we have seen this preferred_email before, check the preferred_name
365                ## is consistent
366                if ($name ne $pname) {
367                    warn encode_utf8 "Inconsistent name for email '$pemail'"
368                        . " at line num $line_num: keeping '$name', ignoring"
369                        . " '$pname'\n";
370                    $pname= $name;
371                }
372            }
373            else {
374                $seen_map{"email"}{$pemail}= $pname;
375            }
376        }
377
378        # Build an index of "preferred name/email" to other-email, other name
379        # we use this later to remove redundant entries missing a name.
380        $pref_groups{"$pname $pemail"}{$oemail}{ $oname || "" }=
381            [ $pname, $pemail, $oname, $oemail, $line_num ];
382    }
383
384    # this removes entries like
385    # Joe <blogs> <whatever>
386    # where there is a corresponding
387    # Joe <blogs> Joe X <blogs>
388    foreach my $pref (_sorted_hash_keys(\%pref_groups)) {
389        my $entries= $pref_groups{$pref};
390        foreach my $email (_sorted_hash_keys($entries)) {
391            my @names= _sorted_hash_keys($entries->{$email});
392            if ($names[0] eq "" and @names > 1) {
393                shift @names;
394            }
395            foreach my $name (@names) {
396                push @fixed, $entries->{$email}{$name};
397            }
398        }
399    }
400
401    # final pass through the dataset, build up a database
402    # we will use later for checks and updates, and reconstruct
403    # the canonical entries.
404    my $new_mailmap_hash= {};
405    my $mailmap_info=     {};
406    foreach my $rec (@fixed) {
407        my ($pname, $pemail, $oname, $oemail, $line_num)= @$rec;
408        my $preferred= "$pname <$pemail>";
409        my $other;
410        if (defined $oemail) {
411            $other= $oname ? "$oname <$oemail>" : "<$oemail>";
412        }
413        if ($other and $other ne "<unknown>") {
414            _safe_set_key($mailmap_info, $O2P,  $other, $preferred);
415            _safe_set_key($mailmap_info, $O2PN, $other, $pname);
416            _safe_set_key($mailmap_info, $O2PE, $other, $pemail);
417        }
418        $mailmap_info->{$P2O}{$preferred}{$other}++;
419        if ($pname ne "unknown") {
420            _safe_set_key($mailmap_info, $N2P, $pname, $preferred);
421        }
422        if ($pemail ne "unknown") {
423            _safe_set_key($mailmap_info, $E2P, $pemail, $preferred);
424        }
425        my $line= $preferred;
426        $line .= " $other" if $other;
427        $new_mailmap_hash->{$line}= $line_num;
428    }
429    return ($new_mailmap_hash, $mailmap_info);
430}
431
432sub add_new_mailmap_entries {
433    my ($mailmap_hash, $mailmap_info, $mailmap_file)= @_;
434
435    my $mailmap_add= $mailmap_info->{add}
436        or return 0;
437
438    my $num= 0;
439    for my $new (sort keys %$mailmap_add) {
440        !$mailmap_hash->{$new}++ or next;
441        warn encode_utf8 "Updating '$mailmap_file' with: $new\n";
442        $num++;
443    }
444    return $num;
445}
446
447sub read_and_update {
448    my ($authors_file, $mailmap_file)= @_;
449
450    # read the authors file and extract the info it contains
451    my ($author_info, $authors_preamble)= read_authors($authors_file);
452
453    # read the mailmap file.
454    my ($orig_mailmap_hash, $mailmap_preamble)= read_mailmap($mailmap_file);
455
456    # check and possibly fix the mailmap data, and build a set of precomputed
457    # datasets to work with it.
458    my ($mailmap_hash, $mailmap_info)=
459        check_fix_mailmap_hash($orig_mailmap_hash, $author_info);
460
461    # update the mailmap based on any check or fixes we just did,
462    # we always write even if we did not do any changes.
463    update_mailmap($mailmap_hash, $mailmap_preamble, $mailmap_file);
464
465    # read the commits names using git log, and compares and checks
466    # them against the data we have in authors.
467    read_commit_log($author_info, $mailmap_info);
468
469    # update the authors file with any changes, we always write,
470    # but we may not change anything
471    update_authors($author_info, $authors_preamble, $authors_file);
472
473    # check if we discovered new email data from the commits that
474    # we need to write back to disk.
475    add_new_mailmap_entries($mailmap_hash, $mailmap_info, $mailmap_file)
476        and update_mailmap($mailmap_hash, $mailmap_preamble,
477        $mailmap_file, $mailmap_info);
478
479    return undef;
480}
481
482sub main {
483    local $Data::Dumper::Sortkeys= 1;
484    my $authors_file= "AUTHORS";
485    my $mailmap_file= ".mailmap";
486    my $show_man= 0;
487    my $show_help= 0;
488
489    ## Parse options and print usage if there is a syntax error,
490    ## or if usage was explicitly requested.
491    GetOptions(
492        'help|?'                      => \$show_help,
493        'man'                         => \$show_man,
494        'authors_file|authors-file=s' => \$authors_file,
495        'mailmap_file|mailmap-file=s' => \$mailmap_file,
496    ) or pod2usage(2);
497    pod2usage(1)             if $show_help;
498    pod2usage(-verbose => 2) if $show_man;
499
500    read_and_update($authors_file, $mailmap_file);
501    return 0;    # 0 for no error - intended for exit();
502}
503
504exit(main()) unless caller;
505
5061;
507__END__
508
509=head1 NAME
510
511Porting/updateAUTHORS.pl - Automatically update AUTHORS and .mailmap
512based on commit data.
513
514=head1 SYNOPSIS
515
516Porting/updateAUTHORS.pl
517
518 Options:
519   --help               brief help message
520   --man                full documentation
521   --authors-file=FILE  override default location of AUTHORS
522   --mailmap-file=FILE  override default location of .mailmap
523
524=head1 OPTIONS
525
526=over 4
527
528=item --help
529
530Print a brief help message and exits.
531
532=item --man
533
534Prints the manual page and exits.
535
536=item --authors-file=FILE
537
538=item --authors_file=FILE
539
540Override the default location of the authors file, which is "AUTHORS" in
541the current directory.
542
543=item --mailmap-file=FILE
544
545=item --mailmap_file=FILE
546
547Override the default location of the mailmap file, which is ".mailmap"
548in the current directory.
549
550=back
551
552=head1 DESCRIPTION
553
554This program will automatically manage updates to the AUTHORS file and
555.mailmap file based on the data in our commits and the data in the files
556themselves. It uses no other sources of data. Expects to be run from
557the root a git repo of perl.
558
559In simple, execute the script and it will either die with a helpful
560message or it will update the files as necessary, possibly not at all if
561there is no need to do so. Note it will actually rewrite the files at
562least once, but it may not actually make any changes to their content.
563Thus to use the script is currently required that the files are
564modifiable.
565
566Review the changes it makes to make sure they are sane. If they are
567commit. If they are not then update the AUTHORS or .mailmap files as is
568appropriate and run the tool again. Typically you shouldn't need to do
569either unless you are changing the default name or email for a user. For
570instance if a person currently listed in the AUTHORS file whishes to
571change their preferred name or email then change it in the AUTHORS file
572and run the script again. I am not sure when you might need to directly
573modify .mailmap, usually modifying the AUTHORS file should suffice.
574
575=head1 FUNCTIONS
576
577Note that the file can also be used as a package. If you require the
578file then you can access the functions located within the package
579C<Porting::updateAUTHORS>. These are as follows:
580
581=over 4
582
583=item add_new_mailmap_entries($mailmap_hash, $mailmap_info, $mailmap_file)
584
585If any additions were identified while reading the commits this will
586inject them into the mailmap_hash so they can be written out. Returns a
587count of additions found.
588
589=item check_fix_mailmap_hash($mailmap_hash, $authors_info)
590
591Analyzes the data contained the in the .mailmap file and applies any
592automated fixes which are required and which it can automatically
593perform. Returns a hash of adjusted entries and a hash with additional
594metadata about the mailmap entries.
595
596=item main()
597
598This implements the command line version of this module, handle command
599line options, etc.
600
601=item merge_mailmap_with_AUTHORS_and_checkAUTHORS_data
602
603This is a utility function that combines data from this tool with data
604contained in F<Porting/checkAUTHORS.pl> it is not used directly, but was
605used to cleanup and generate the current version of the .mailmap file.
606
607=item parse_mailmap_hash($mailmap_hash)
608
609Takes a mailmap_hash and parses it and returns it as an array of array
610records with the contents:
611
612    [ $preferred_name, $preferred_email,
613      $other_name, $other_email,
614      $line_num ]
615
616=item read_and_update($authors_file, $mailmap_file)
617
618Wraps the other functions in this library and implements the logic and
619intent of this tool. Takes two arguments, the authors file name, and the
620mailmap file name. Returns nothing but may modify the AUTHORS file
621or the .mailmap file. Requires that both files are editable.
622
623=item read_commit_log($authors_info, $mailmap_info)
624
625Read the commit log and find any new names it contains.
626
627=item read_authors($authors_file)
628
629Read the AUTHORS file and return data about it.
630
631=item read_mailmap($mailmap_file)
632
633Read the .mailmap file and return data about it.
634
635=item update_authors($authors_info, $authors_preamble, $authors_file)
636
637Write out an updated AUTHORS file. This is done atomically
638using a rename, we will not leave a half modified file in
639the repo.
640
641=item update_mailmap($mm_hash, $mm_preamble, $mailmap_file, $mm_info)
642
643Write out an updated .mailmap file. This is done atomically
644using a rename, we will not leave a half modified file in
645the repo.
646
647=back
648
649=head1 TODO
650
651More documentation and testing.
652
653=head1 SEE ALSO
654
655F<Porting/checkAUTHORS.pl>
656
657=head1 AUTHOR
658
659Yves Orton <demerphq@gmail.com>
660
661=cut
662