package MinimalPerfectHash;
use strict;
use warnings;
use Data::Dumper;
use Carp;
use Text::Wrap;
use List::Util qw(shuffle min);

use warnings 'FATAL' => 'all';

# The style of this file is determined by:
#
# perltidy -w -ple -bbb -bbc -bbs -nolq -l=80 -noll -nola -nwls='=' \
#   -isbc -nolc -otr -kis -ci=4 -se -sot -sct -nsbl -pt=2 -fs  \
#   -fsb='##!' -fse='##.'

# Naming conventions
# * The public API, consisting of methods, uses "normal" sub names with
#   no leading underscore.
# * Private subs are prefixed with a single underscore.
# * Private methods are prefixed with two underscores. (There is only
#   one at the time of writing this comment)

use constant {
    FNV32_PRIME => 16777619,
    U8_MAX      => 0xFF,
    U16_MAX     => 0xFFFF,
    U32_MAX     => 0xFFFFFFFF,
    INF         => 1e9,
};

our $DEBUG= $ENV{DEBUG} || 0;    # our so we can use local on it
my $RSHIFT= 8;
my $MASK= U32_MAX;
my $MAX_SEED2= U16_MAX;          # currently the same, but not necessarily.
my $IS_32BIT= !eval { pack "Q", 1 };

sub new {
    my ($class, %self)= @_;

    my $source_hash= $self{source_hash}
        or die "'source_hash' is a required parameter in $class->new()\n";

    my $length_all_keys= 0;
    $length_all_keys += length($_) for keys %$source_hash;
    $self{length_all_keys}= $length_all_keys;

    $self{max_attempts} ||= 16;    # pick a number, any number...

    $self{base_name} ||= "mph";
    my $base_name= $self{base_name};

    $self{prefix} ||= uc($base_name);

    $self{h_file}      ||= $base_name . "_algo.h";
    $self{c_file}      ||= $base_name . "_test.c";
    $self{t_file}      ||= $base_name . "_test.pl";
    $self{blob_name}   ||= $base_name . "_blob";
    $self{struct_name} ||= $base_name . "_struct";
    $self{table_name}  ||= $base_name . "_table";
    $self{match_name}  ||= $base_name . "_match";

    my $split_strategy;
    $self{simple_split} //= 0;
    if ($self{simple_split}) {
        $self{split_strategy}= "simple";
        $self{randomize_squeeze}= 0;
    }
    else {
        $self{split_strategy}= "squeeze";
        $self{randomize_squeeze} //= 1;
    }
    if ($self{randomize_squeeze}) {
        $self{max_same_in_squeeze} //= 5;
        if (defined $self{srand_seed_was}) {
            $self{srand_seed}= delete $self{srand_seed_was};
        }
        elsif (!defined $self{srand_seed}) {
            $self{srand_seed}= srand();
        }
        else {
            srand($self{srand_seed});
        }
        print "SRAND_SEED= $self{srand_seed}\n" if $DEBUG;
    }
    else {
        $self{max_same}= 3;
        delete $self{srand_seed};
    }
    return bless \%self, $class;
}
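
# Typical use, as a sketch (it mirrors the self-test block at the bottom
# of this file); %keys_to_defines below is a placeholder for your own map
# of keys to C define names:
#
#   my $mph= MinimalPerfectHash->new(
#       source_hash => \%keys_to_defines,
#       base_name   => "mph",
#   );
#   $mph->make_files_split_keys();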

# The basic idea is that you have a two level structure, and effectively
# hash the key twice.
#
# The first hash finds a bucket in the array which contains a seed which
# is used for the second hash, which then leads to a bucket with key
# data which is compared against to determine if the key is a match.
#
# If the first hash finds no seed, then the key cannot match.
#
# In our case we cheat a bit, and hash the key only once, but use the
# low bits for the first lookup and the high-bits for the second.
#
# So for instance:
#
#           h= (h >> RSHIFT) ^ s;
#
# is how the second hash is computed. We right shift the original hash
# value and then xor in the seed2, which will be non-zero.
#
# That then gives us the bucket which contains the key data we need to
# match for a valid key.
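
# The pure-Perl sketch below is illustrative only and is not used by the
# module; it assumes $seed1 and $second_level are the two values returned
# by build_perfect_hash() and $n is 0 + @$second_level. It mirrors the
# generated C matcher produced further down in this file.
sub _demo_lookup {
    my ($key, $seed1, $second_level, $n)= @_;
    my $h= _fnv1a_32($key, $seed1);

    # first level: the low bits pick the bucket that holds seed2
    my $seed2= $second_level->[ $h % $n ]{seed2}
        or return undef;    # a seed2 of 0 means the key cannot match

    # second level: shift, xor in seed2, and pick the final bucket
    my $idx= ((($h >> $RSHIFT) ^ $seed2) & $MASK) % $n;
    my $row= $second_level->[$idx];
    return defined $row->{key} && $row->{key} eq $key ? $row : undef;
}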

sub _fnv1a_32 {
    my ($key, $seed)= @_;
    use integer;

    my $hash= 0 + $seed;
    foreach my $char (split //, $key) {
        $hash= $hash ^ ord($char);

        # the & U32_MAX is to simulate 32 bit ints on a 64 bit integer Perl.
        $hash= ($hash * FNV32_PRIME) & U32_MAX;
    }

    # The hash can end up negative on 32 bit Perls due to use integer being
    # in scope. This is equivalent to casting it to a U32.
    $hash= unpack "V", pack "l", $hash
        if $IS_32BIT;

    return $hash;
}

sub build_perfect_hash {
    my ($self)= @_;

    my $source_hash= $self->{source_hash};
    my $max_attempts= $self->{max_attempts};

    my $n= 0 + keys %$source_hash;
    print "Building a minimal perfect hash from $n keys.\n"
        if $DEBUG;
    my $seed1= unpack("N", "Perl") - 1;

    TRY:
    for (my $attempt= 1 ; $attempt < $max_attempts ; $attempt++) {
        my ($hash_to_key, $key_to_hash, $key_buckets);
        SEED1:
        for ($seed1++ ; 1 ; $seed1++) {
            print "Trying seed $seed1\n"
                if $DEBUG;
            my %hash_to_key;
            my %key_to_hash;
            my %key_buckets;
            my %shifted;
            foreach my $key (sort keys %$source_hash) {
                my $h= _fnv1a_32($key, $seed1);
                next SEED1 if exists $hash_to_key{$h};
                next SEED1 if $shifted{ ($h >> $RSHIFT) & $MASK }++;
                $hash_to_key{$h}= $key;
                $key_to_hash{$key}= $h;
                push @{ $key_buckets{ $h % $n } }, $key;
            }
            $hash_to_key= \%hash_to_key;
            $key_to_hash= \%key_to_hash;
            $key_buckets= \%key_buckets;
            last SEED1;
        }
        my $second_level=
            _build_mph_level2($hash_to_key, $key_to_hash, $key_buckets);
        if ($second_level) {
            $self->{seed1}= $seed1;
            $self->{second_level}= $second_level;
            return $seed1, $second_level;
        }
    }
    die sprintf "After %d attempts failed to construct a minimal perfect "
        . "hash with %d keys.\nWe are using fnv32(), perhaps this "
        . "hash function isn't good enough?\n",
        $max_attempts, $n;
}

sub _build_mph_level2 {
    my ($hash_to_key, $key_to_hash, $key_buckets)= @_;

    my $n= 0 + keys %$key_to_hash;

    # Loop over the key_buckets, processing the buckets with the most
    # items in them first, and the ones with the least items in them last.
    # This maximizes the chance we can find a $seed2 that "disambiguates"
    # the items that collide in a single bucket.
    #
    # With a decent hash function we will have a typical long tail
    # distribution of items per bucket, with relatively few buckets with
    # the most collisions in them, and the vast majority of buckets
    # having no collisions. By processing the ones with the most items
    # in them first the "easy" cases don't get in the way of finding a
    # solution for the hard cases. The buckets can be divided into three
    # levels of difficulty to solve "hard", "medium" and "trivial".
    #
    # * Hard buckets have more than one item in them.
    # * Medium buckets have one item whose hash is above $MAX_SEED2.
    # * Trivial buckets have one item whose hash is not above $MAX_SEED2.
    #
    # Each type of bucket uses a different algorithm to solve. Note that
    # a "classical" two level hash would only have "hard" and "trivial"
    # buckets, but since we support having a larger hash value than we
    # allow for a $seed2 we have three.

    my @first_level;
    my @second_level;
    my @singles_high;
    my @singles_low;

    print "Finding mappings for buckets with collisions.\n"
        if $DEBUG;

    FIRST_IDX:
    foreach my $first_idx (
        sort {
            @{ $key_buckets->{$b} } <=> @{ $key_buckets->{$a} }
                || $a <=> $b
        } keys %$key_buckets
        )
    {
        my $keys= $key_buckets->{$first_idx};
        if (@$keys == 1) {

            # buckets with a single item in them can use a simpler and
            # faster algorithm to find a bucket than buckets with more
            # than one item.

            # however keys whose $hash2 is above $MAX_SEED2 need to be
            # processed first, and will use one strategy, while the rest
            # of the singletons should be processed last, and can use
            # an even simpler and more efficient strategy.
            my $key= $keys->[0];
            my $hash2= ($key_to_hash->{$key} >> $RSHIFT) & $MASK;
            if ($hash2 > $MAX_SEED2) {
                push @singles_high, [ $first_idx, $hash2, $key ];
            }
            else {
                push @singles_low, [ $first_idx, $hash2, $key ];
            }
            next FIRST_IDX;
        }

        # This loop handles items with more than one key in the same
        # bucket. We need to find a $seed2 that causes the operation
        #
        #    ($hash ^ $seed2) % $n
        #
        # to map those keys into different empty buckets. If we cannot
        # find such a $seed2 then we need to recompute everything with a
        # new seed.
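        #
        # As a tiny illustration (numbers invented): with $n == 8 and two
        # colliding keys whose shifted hashes are 0x0102 and 0x0304, trying
        # $seed2 == 1 gives buckets (0x0103 % 8) == 3 and (0x0305 % 8) == 5;
        # if both of those slots in @second_level are still empty, that
        # $seed2 wins, otherwise we keep incrementing $seed2.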
        SEED2:
        for (my $seed2= 1 ; $seed2 <= $MAX_SEED2 ; $seed2++) {
            my @idx= map {
                ((($key_to_hash->{$_} >> $RSHIFT) ^ $seed2) & $MASK) % $n
            } @$keys;
            my %seen;
            next SEED2 if grep { $second_level[$_] || $seen{$_}++ } @idx;
            $first_level[$first_idx]= $seed2;
            @second_level[@idx]= map { _make_bucket_info($_) } @$keys;
            next FIRST_IDX;
        }

        # If we get here then we failed to find a $seed2 which results
        # in the colliding items being mapped to different empty buckets.
        # So we have to rehash everything with a different $seed1.
        print "Failed to disambiguate colliding keys. Trying new seed1.\n"
            if $DEBUG;
        return;
    }

    # Now fill in the singletons using a much simpler and faster
    # way to compute the seed2. Since we only have to worry about
    # a single seed, we merely need to fill in all the empty slots
    # and we can always compute a mask that when xor'ed with $base
    # maps to the empty slot.
    print "Finding mappings for buckets with no collisions.\n"
        if $DEBUG;

    # sort @singles_low so that for the simple algorithm we do not end
    # up mapping a 0 hash to the 0 bucket, which would result in a
    # $seed2 of 0. Our logic avoids comparing the key when the $seed2 is
    # 0, so we need to avoid having a seed2 of 0. This rule is not
    # strictly required, but it cuts down on string comparisons at the
    # cost of a relatively cheap numeric comparison. If you change this
    # make sure you update the generated C code.

    ##!
    @singles_low= sort {
        $b->[1] <=> $a->[1] ||    # sort by $hash2
        $a->[0] <=> $b->[0]       # then by $first_idx
    } @singles_low;
    ##.

    my $scan_idx= 0;    # used to find empty buckets for the "simple" case.
    SINGLES:
    foreach my $tuple (@singles_high, @singles_low) {
        my ($first_idx, $hash2, $key)= @$tuple;
        my ($seed2, $idx);
        if ($hash2 > $MAX_SEED2) {

            # The $hash2 is larger than the maximum value of $seed2.
            # This means that we cannot simply map this item into
            # whichever bucket we choose using xor. Instead we loop
            # through the possible $seed2 values, checking whether each
            # one lands us in an empty bucket. Empty buckets should be
            # fairly common, so this loop should execute relatively few
            # times. It also minimizes the chance that we cannot find a
            # solution at all.
            for my $i (1 .. $MAX_SEED2) {
                $idx= (($hash2 ^ $i) & $MASK) % $n;
                if (!$second_level[$idx]) {
                    $seed2= $i;
                    last;
                }
            }

            # If we failed to find a solution we need to go back to
            # beginning and try a different key.
            if (!defined $seed2) {
                print "No viable seed2 for singleton. Trying new seed1.\n"
                    if $DEBUG;
                return;
            }
        }
        else {
            # since $hash2 <= $MAX_SEED2 we can trivially map the item
            # to any bucket we choose using xor. So we find the next
            # empty bucket with the loop below, and then map this item
            # into it.
            SCAN:
            while ($second_level[$scan_idx]) {
                $scan_idx++;
            }

            # note that we don't need to mod $n here, as
            #
            #   $hash2 ^ $seed2 == $idx
            #
            # and $idx is already in the range 0 .. $n-1
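            #
            # for example (values invented): if $hash2 == 0x13 and the
            # next empty slot is $scan_idx == 5, then $seed2 becomes
            # 0x13 ^ 5 == 0x16, and at lookup time 0x13 ^ 0x16 == 5
            # lands the key straight in that empty slot.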

            $seed2= $hash2 ^ $scan_idx;

            # increment $scan_idx after stashing its old value into $idx
            # as by the end of this iteration of the SINGLES loop we
            # will have filled $second_level[$scan_idx] and we need not
            # check it in the SCAN while loop.
            $idx= $scan_idx++;
        }

        # sanity check $idx.
        die "WTF, \$idx should be less than \$n ($idx vs $n)"
            unless $idx < $n;

        die "Bad seed2 for first_idx: $first_idx." if $seed2 == 0;

        # and finally we are done, we have found the final bucket
        # location for this key.
        $first_level[$first_idx]= $seed2;
        $second_level[$idx]= _make_bucket_info($key);
    }

    # now that we are done we can go through and fill in the idx and
    # seed2 as appropriate. We store idx into the hashes even though it
    # is not strictly necessary as it simplifies some of the code that
    # processes the @second_level bucket info array later.
    foreach my $idx (0 .. $n - 1) {
        $second_level[$idx]{seed2}= $first_level[$idx] || 0;
        $second_level[$idx]{idx}= $idx;
    }

    return \@second_level;
}

sub _make_bucket_info {
    my ($key)= @_;
    return +{
        key   => $key,
        seed2 => undef,    # will be filled in later
        idx   => undef,    # will be filled in later
    };
}

sub _sort_keys_longest_first {
    my ($hash)= shift;
    my @keys= sort { length($b) <=> length($a) || $a cmp $b } keys %$hash;
    return \@keys;
}

# This sub constructs a blob of characters which can be used to
# reconstruct the keys of the $hash that is passed in to it, usually
# by splitting the keys into two parts, a prefix and a suffix. This
# allows prefixes and suffixes to be reused for more than one original
# key.
#
# It returns a string that contains every prefix and suffix chosen, and
# a hash that contains each key in the argument $hash with each value
# being the position where it is split, using the length of the key to
# indicate it need not be split at all.
#
# If $preprocess is false the process starts with an empty buffer and
# populates it as it adds each new key; if $preprocess is true then it
# tries to split each key at the '=' sign which is often present in
# Unicode property names and composes the initial buffer from these
# fragments.
#
# It performs multiple passes trying to find the ideal split point to
# produce a minimal buffer, returning the smallest buffer it can.
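#
# As a small illustration (not necessarily the blob this code would
# actually produce): for the keys "gc=lu" and "sc=lu" the blob "gc=lusc="
# contains "gc=lu" whole (so its split point is its full length, 5),
# while "sc=lu" is split into the prefix "sc=" (found at offset 5) and
# the suffix "lu" (found at offset 3).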
sub _build_split_words_simple {
    my ($hash, $length_all_keys, $preprocess)= @_;
    my %appended;
    my $blob= "";
    if ($preprocess) {
        my %parts;
        foreach my $key (@{ _sort_keys_longest_first($hash) }) {
            my ($prefix, $suffix);
            if ($key =~ /^([^=]+=)([^=]+)\z/) {
                ($prefix, $suffix)= ($1, $2);
                $parts{$suffix}++;

                #$parts{$prefix}++;
            }
            else {
                $prefix= $key;
                $parts{$prefix}++;
            }

        }
        foreach my $part (@{ _sort_keys_longest_first(\%parts) }) {
            $blob .= $part;
        }
        printf "Using preprocessing, initial blob size is %d chars.\n",
            length($blob)
            if $DEBUG;
    }
    else {
        print "No preprocessing, starting with an empty blob.\n"
            if $DEBUG;
    }
    my ($res, $old_res, $added, $passes);

    REDO:
    $res= {};
    $added= 0;
    $passes++;

    KEY:
    foreach my $key (@{ _sort_keys_longest_first($hash) }) {
        next if exists $res->{$key};
        if (index($blob, $key) >= 0) {
            my $idx= length($key);
            if ($DEBUG > 1 and $old_res and $old_res->{$key} != $idx) {
                print "changing: $key => $old_res->{$key} : $idx\n";
            }
            $res->{$key}= $idx;
            next KEY;
        }
        my $best= length($key);
        my $append= $key;
        my $best_prefix= $key;
        my $best_suffix= "";
        my $min= 1;
        foreach my $idx (reverse $min .. length($key) - 1) {
            my $prefix= substr($key, 0, $idx);
            my $suffix= substr($key, $idx);
            my $i1= index($blob, $prefix) >= 0;
            my $i2= index($blob, $suffix) >= 0;
            if ($i1 and $i2) {
                if ($DEBUG > 1 and $old_res and $old_res->{$key} != $idx) {
                    print "changing: $key => $old_res->{$key} : $idx\n";
                }
                $res->{$key}= $idx;
                $appended{$prefix}++;
                $appended{$suffix}++;
                next KEY;
            }
            elsif ($i1) {
                if (length $suffix <= length $append) {
                    $best= $idx;
                    $append= $suffix;
                    $best_prefix= $prefix;
                    $best_suffix= $suffix;
                }
            }
            elsif ($i2) {
                if (length $prefix <= length $append) {
                    $best= $idx;
                    $append= $prefix;
                    $best_prefix= $prefix;
                    $best_suffix= $suffix;
                }
            }
        }
        if ($DEBUG > 1 and $old_res and $old_res->{$key} != $best) {
            print "changing: $key => $old_res->{$key} : $best\n";
        }

        $res->{$key}= $best;
        $blob .= $append;
        $added += length($append);
        $appended{$best_prefix}++;
        $appended{$best_suffix}++;
    }
    if ($added) {
        if ($added < length $blob) {
            printf "Appended %d chars. Blob is %d chars long.\n",
                $added, length($blob)
                if $DEBUG;
        }
        else {
            printf "Blob is %d chars long.\n", $added
                if $DEBUG;
        }
    }
    elsif ($passes > 1) {
        print "Blob needed no changes.\n"
            if $DEBUG;
    }
    my $new_blob= "";
    foreach my $part (@{ _sort_keys_longest_first(\%appended) }) {
        $new_blob .= $part unless index($new_blob, $part) >= 0;
    }
    if (length($new_blob) < length($blob)) {
        printf "Uncorrected new blob length of %d chars is smaller.\n"
            . "  Correcting new blob...%s",
            length($new_blob), $DEBUG > 1 ? "\n" : " "
            if $DEBUG;
        $blob= $new_blob;
        $old_res= $res;
        %appended= ();
        goto REDO;
    }
    else {
        printf "After %d passes final blob length is %d chars.\n"
            . "This is %.2f%% of the raw key length of %d chars.\n\n",
            $passes, length($blob), 100 * length($blob) / $length_all_keys,
            $length_all_keys
            if $DEBUG;
    }

    # sanity check
    die sprintf "not same size? %d != %d", 0 + keys %$res, 0 + keys %$hash
        unless keys %$res == keys %$hash;
    return ($blob, $res, $length_all_keys);
}

# Find all the positions where $word can be found in $$buf_ref,
# including overlapping positions. The data is cached into the
# $offsets_hash. Used by the _squeeze algorithm.
sub _get_offsets {
    my ($offsets_hash, $buf_ref, $word)= @_;
    return $offsets_hash->{$word}
        if defined $offsets_hash->{$word};

    my @offsets;
    my $from= 0;

    while (1) {
        my $i= index($$buf_ref, $word, $from);
        last if $i == -1;
        push @offsets, $i;
        $from= $i + 1;
    }

    $offsets_hash->{$word}= \@offsets;
    return \@offsets;
}

# Increments the popularity data for the characters at
# $ofs .. $ofs + $len - 1 by $diff. Used by the _squeeze algorithm
sub _inc_popularity {
    my ($popularity, $ofs, $len, $diff)= @_;
    for my $idx ($ofs .. $ofs + $len - 1) {
        $popularity->[$idx] += $diff;
    }
}

# Returns a summary hash about the popularity of the characters
# $ofs .. $ofs + $len - 1. Used by the _squeeze algorithm
sub _get_popularity {
    my ($popularity, $ofs, $len)= @_;
    my $res= {
        reused_digits => 0,
        popularity    => 0,
    };
    my $min_pop= undef;
    for my $idx ($ofs .. $ofs + $len - 1) {
        if ($popularity->[$idx] >= INF) {
            $res->{reused_digits}++;
        }
        else {
            my $pop= $popularity->[$idx];
            if (!defined $min_pop || $pop < $min_pop) {
                $min_pop= $pop;
            }
        }
    }
    $res->{popularity}= $min_pop // 0;
    return $res;
}

# Merge the popularity data produced by _get_popularity() for the prefix
# and suffix of a word together. Used by the _squeeze algorithm
sub _merge_score {
    my ($s1, $s2)= @_;
    return +{
        reused_digits => $s1->{reused_digits} + $s2->{reused_digits},
        popularity    => min($s1->{popularity}, $s2->{popularity}),
    };
}

# Initialize the popularity and offsets data for a word.
# Used by the _squeeze algorithm
sub _init_popularity {
    my ($offsets_hash, $popularity, $buf_ref, $word, $diff)= @_;
    my $offsets= _get_offsets($offsets_hash, $buf_ref, $word);
    my $len= length $word;
    for my $ofs (@$offsets) {
        for my $idx ($ofs .. $ofs + $len - 1) {
            $popularity->[$idx] += $diff;
        }
    }
}

# Compare the popularity data for two possible candidates
# for solving a given word. Used by the _squeeze algorithm
sub _compare_score {
    my ($s1, $s2)= @_;
    if ($s1->{reused_digits} != $s2->{reused_digits}) {
        return $s1->{reused_digits} <=> $s2->{reused_digits};
    }
    return $s1->{popularity} <=> $s2->{popularity};
}

# Find the most popular offset for a word in $$buf_ref.
# Used by the _squeeze algorithm
sub _most_popular_offset {
    my ($offsets_hash, $popularity, $buf_ref, $word)= @_;
    my $best_score= {
        reused_digits => -1,
        popularity    => -1,
    };
    my $best_pos= -1;
    my $offsets_ary= _get_offsets($offsets_hash, $buf_ref, $word);
    my $wlen= length $word;
    for my $i (@$offsets_ary) {
        my $score= _get_popularity($popularity, $i, $wlen);
        if (_compare_score($score, $best_score) > 0) {
            $best_score= $score;
            $best_pos= $i;
            if ($best_score->{reused_digits} == $wlen) {
                last;
            }
        }
    }
    return +{
        position => $best_pos,
        score    => $best_score,
    };
}

# The _squeeze algorithm. Attempts to squeeze unused characters out of a
# buffer of split words. If a given prefix or suffix can be found in more
# than one place, and the overall split decisions can be reorganized so
# that some of those occurrences are never used, the characters that are
# no longer used are removed.
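#
# For example (illustrative): if the buffer is "catdogat" and every word
# that was being served by the trailing "at" can also be found inside
# "cat", the characters of the trailing "at" never get marked as used
# and are squeezed out, shrinking the buffer to "catdog".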
sub _squeeze {
    my ($words, $word_count, $splits, $buf_ref)= @_;
    print "Squeezing...\n" if $DEBUG;
    my %offsets_hash;
    my %split_points;
    my $n= length $$buf_ref;
    my @popularity= (0) x $n;

    for my $word (sort keys %$word_count) {
        my $count= $word_count->{$word};
        _init_popularity(\%offsets_hash, \@popularity, $buf_ref, $word,
            $count / length($word));
    }

    WORD:
    for my $word (@$words) {
        my $best_pos1= -1;
        my $best_pos2= -1;
        my $best_score= {
            reused_digits => -1,
            popularity    => -1,
        };
        my $best_split;

        my $cand=
            _most_popular_offset(\%offsets_hash, \@popularity, $buf_ref, $word);
        if ($cand->{position} != -1) {
            my $cand_score= $cand->{score};
            if ($cand_score->{reused_digits} == length($word)) {
                $split_points{$word}= 0;
                next WORD;
            }
            elsif (_compare_score($cand_score, $best_score) > 0) {
                $best_score= $cand_score;
                $best_pos1= $cand->{position};
                $best_pos2= -1;
                $best_split= undef;
            }
        }

        for my $split (@{ $splits->{$word} }) {
            my $cand2=
                _most_popular_offset(\%offsets_hash, \@popularity, $buf_ref,
                $split->{w2});
            next if $cand2->{position} == -1;

            my $cand1=
                _most_popular_offset(\%offsets_hash, \@popularity, $buf_ref,
                $split->{w1});
            next if $cand1->{position} == -1;

            my $cand_score= _merge_score($cand1->{score}, $cand2->{score});

            if ($cand_score->{reused_digits} == length($word)) {
                $split_points{$word}= $split->{split_point};
                next WORD;
            }
            if (_compare_score($cand_score, $best_score) > 0) {
                $best_score= $cand_score;
                $best_pos1= $cand1->{position};
                $best_pos2= $cand2->{position};
                $best_split= $split;
            }
        }

        # apply high pop to used characters of the champion
        if (defined $best_split) {
            _inc_popularity(\@popularity, $best_pos1,
                length($best_split->{w1}), INF);
            _inc_popularity(\@popularity, $best_pos2,
                length($best_split->{w2}), INF);
            $split_points{$word}= $best_split->{split_point};
        }
        else {
            _inc_popularity(\@popularity, $best_pos1, length($word), INF);
            $split_points{$word}= 0;
        }
    }

    my $res= "";
    my @chars= split '', $$buf_ref;
    for my $i (0 .. $n - 1) {
        if ($popularity[$i] >= INF) {
            $res .= $chars[$i];
        }
    }
    printf "%d -> %d\n", $n, length($res) if $DEBUG;

    # This algorithm chooses to "split" full strings at 0, so that the
    # prefix is empty and the suffix contains the full key, but the
    # minimal perfect hash logic wants it the other way around, as we do
    # the prefix check first, so we correct it at the end here.
    $split_points{$_} ||= length($_) for keys %split_points;

    return ($res, \%split_points);
}

# compute an initial covering buffer for a set of words,
# including split data.
sub _initial_covering_buf {
    my ($words, $splits)= @_;
    my $res= "";
    WORD:
    for my $word (@$words) {
        if (index($res, $word) != -1) {
            next WORD;
        }
        else {
            for my $split (@{ $splits->{$word} }) {
                if (   index($res, $split->{w1}) != -1
                    && index($res, $split->{w2}) != -1)
                {
                    next WORD;
                }
            }
        }
        $res .= $word;
    }
    return $res;
}

sub build_split_words_squeeze {
    my ($self)= @_;
    # Thanks to Ilya Sashcheka for this algorithm

    my $hash= $self->{source_hash};
    my $length_all_keys= $self->{length_all_keys};
    my $randomize= $self->{randomize_squeeze};
    my $max_same= $self->{max_same_in_squeeze};

    my @words= sort keys %$hash;
    my %splits;
    my $split_points;

    for my $word (@words) {
        my $word_splits= [];
        my $wlen= length $word;
        for my $i (1 .. $wlen - 1) {
            ##!
            push @$word_splits,
                +{
                    w1          => substr($word, 0, $i),
                    w2          => substr($word, $i),
                    split_point => $i,
                };
            ##.
        }
        $splits{$word}= $word_splits;
    }

    my %word_count;
    for my $word (@words) {
        $word_count{$word}++;
        for my $split (@{ $splits{$word} }) {
            $word_count{ $split->{w1} }++;
            $word_count{ $split->{w2} }++;
        }
    }

    @words= sort { length($a) <=> length($b) || $a cmp $b } @words;
    my $buf= _initial_covering_buf(\@words, \%splits);

    printf "Pre squeeze buffer: %s\n", $buf        if $DEBUG > 1;
    printf "Pre squeeze length: %d\n", length $buf if $DEBUG;

    my $same= 0;
    my $counter= 0;
    my $reverse_under= 2;
    while ($same < $max_same) {
        my ($new_buf, $new_split_points)=
            _squeeze(\@words, \%word_count, \%splits, \$buf);
        if (!$split_points or length($new_buf) < length($buf)) {
            $buf= $new_buf;
            $split_points= $new_split_points;
            $same= 0;
        }
        else {
            if ($same < $reverse_under or !$randomize) {
                print "reversing words....\n" if $DEBUG;
                @words= reverse @words;
            }
            else {
                print "shuffling words....\n" if $DEBUG;
                @words= shuffle @words;
                $reverse_under= 1;
            }
            $same++;
        }
    }

    printf "Final length: %d\n", length($buf) if $DEBUG;

    $self->{blob}= $buf;
    $self->{split_points}= $split_points;

    return $buf, $split_points;
}

sub build_split_words_simple {
    my ($self)= @_;

    my $hash= $self->{source_hash};
    my $length_all_keys= $self->{length_all_keys};

    my ($blob, $split_points)=
        _build_split_words_simple($hash, $length_all_keys, 0);

    my ($blob2, $split_points2)=
        _build_split_words_simple($hash, $length_all_keys, 1);

    if (length($blob) > length($blob2)) {
        printf "Using preprocess-smart blob. Length is %d chars. (vs %d)\n",
            length $blob2, length $blob
            if $DEBUG;
        $blob= $blob2;
        $split_points= $split_points2;
    }
    else {
        printf "Using greedy-smart blob. Length is %d chars. (vs %d)\n",
            length $blob, length $blob2
            if $DEBUG;
    }
    $self->{blob}= $blob;
    $self->{split_points}= $split_points;

    return $blob, $split_points;
}

sub build_split_words {
    my ($self)= @_;

    # The _simple algorithm does not compress nearly as well as the
    # _squeeze algorithm, although it uses less memory and will likely
    # be faster, especially if randomization is enabled. The default
    # is to use _squeeze as our hash is not that large (~8k keys).
    my ($buf, $split_words);
    if ($self->{simple_split}) {
        ($buf, $split_words)= $self->build_split_words_simple();
    }
    else {
        ($buf, $split_words)= $self->build_split_words_squeeze();
    }
    foreach my $key (sort keys %$split_words) {
        my $point= $split_words->{$key};
        my $prefix= substr($key, 0, $point);
        my $suffix= substr($key, $point);
        if (index($buf, $prefix) < 0) {
            die "Failed to find prefix '$prefix' for '$key'";
        }
        if (length $suffix and index($buf, $suffix) < 0) {
            die "Failed to find suffix '$suffix' for '$key'";
        }
    }
    return ($buf, $split_words);
}

sub blob_as_code {
    my ($self)= @_;
    my $blob= $self->{blob};
    my $blob_name= $self->{blob_name};

    # output the blob as C code.
    my @code= (sprintf "STATIC const unsigned char %s[] =\n", $blob_name);
    my $blob_len= length $blob;
    while (length($blob)) {
        push @code, sprintf qq(    "%s"), substr($blob, 0, 65, "");
        push @code, length $blob ? "\n" : ";\n";
    }
    push @code, "/* $blob_name length: $blob_len */\n";
    return $self->{blob_as_code}= join "", @code;
}

sub print_includes {
    my ($self, $ofh)= @_;
    print $ofh "#include <stdio.h>\n";
    print $ofh "#include <string.h>\n";
    print $ofh "#include <stdint.h>\n";
    print $ofh "\n";
}

sub print_defines {
    my ($self, $ofh)= @_;
    my $defines= $self->{defines_hash};

    my $key_len;
    foreach my $def (keys %$defines) {
        $key_len //= length $def;
        $key_len= length $def if $key_len < length $def;
    }
    foreach my $def (sort keys %$defines) {
        printf $ofh "#define %*s %5d\n", -$key_len, $def, $defines->{$def};
    }
    print $ofh "\n";
}

sub build_array_of_struct {
    my ($self)= @_;
    my $second_level= $self->{second_level};
    my $blob= $self->{blob};

    my %defines;
    my %tests;
    my @rows;
    foreach my $row (@$second_level) {
        if (!defined $row->{idx} or !defined $row->{value}) {
            die "panic: No idx or value key in row data:", Dumper($row);
        }
        $defines{ $row->{value} }= $row->{idx} + 1;
        $tests{ $row->{key} }= $defines{ $row->{value} };
        ##!
        my @u16= (
            $row->{seed2},
            index($blob, $row->{prefix}),
            index($blob, $row->{suffix}),
        );
        $_ > U16_MAX and die "panic: value exceeds range of U16"
            for @u16;
        my @u8= (
            length($row->{prefix}),
            length($row->{suffix}),
        );
        $_ > U8_MAX and die "panic: value exceeds range of U8"
            for @u8;
        push @rows, sprintf "  { %5d, %5d, %5d, %3d, %3d, %s }   /* %s%s */",
            @u16, @u8, $row->{value}, $row->{prefix}, $row->{suffix};
        ##.
    }
    $self->{rows_array}= \@rows;
    $self->{defines_hash}= \%defines;
    $self->{tests_hash}= \%tests;
    return \@rows, \%defines, \%tests;
}

sub make_algo {
    my ($self)= @_;

    my (
        $second_level, $seed1,     $length_all_keys, $blob,
        $rows_array,   $blob_name, $struct_name,     $table_name,
        $match_name,   $prefix,    $split_strategy,  $srand_seed,
        )
        = @{$self}{ qw(
            second_level   seed1       length_all_keys   blob
            rows_array     blob_name   struct_name       table_name
            match_name     prefix      split_strategy    srand_seed
        ) };

    my $n= 0 + @$second_level;
    my $data_size= $n * 8 + length $blob;

    my @code= "#define ${prefix}_VALt I16\n\n";
    push @code, "/*\n";
    push @code, sprintf "generator script: %s\n", $0;
    push @code, sprintf "split strategy: %s\n",   $split_strategy;
    push @code, sprintf "srand: %d\n", $srand_seed
        if defined $srand_seed;
    push @code, sprintf "rows: %s\n",                $n;
    push @code, sprintf "seed: %s\n",                $seed1;
    push @code, sprintf "full length of keys: %d\n", $length_all_keys;
    push @code, sprintf "blob length: %d\n",         length $blob;
    push @code, sprintf "ref length: %d\n",          0 + @$second_level * 8;
    push @code, sprintf "data size: %d (%%%.2f)\n", $data_size,
        ($data_size / $length_all_keys) * 100;
    push @code, "*/\n\n";

    push @code, $self->blob_as_code();
    push @code, <<"EOF_CODE";

struct $struct_name {
    U16 seed2;
    U16 pfx;
    U16 sfx;
    U8  pfx_len;
    U8  sfx_len;
    ${prefix}_VALt value;
};

EOF_CODE

    push @code, "#define ${prefix}_RSHIFT $RSHIFT\n";
    push @code, "#define ${prefix}_BUCKETS $n\n\n";
    push @code, sprintf "STATIC const U32 ${prefix}_SEED1 = 0x%08x;\n", $seed1;
    push @code, sprintf "STATIC const U32 ${prefix}_FNV32_PRIME = 0x%08x;\n\n",
        FNV32_PRIME;

    push @code, "/* The comments give the input key for the row it is in */\n";
    push @code,
        "STATIC const struct $struct_name $table_name\[${prefix}_BUCKETS] = {\n",
        join(",\n", @$rows_array) . "\n};\n\n";
    push @code, <<"EOF_CODE";
${prefix}_VALt
$match_name( const unsigned char * const key, const U16 key_len ) {
    const unsigned char * ptr= key;
    const unsigned char * ptr_end= key + key_len;
    U32 h= ${prefix}_SEED1;
    U32 s;
    U32 n;
    /* this is FNV-1a 32bit unrolled. */
    do {
        h ^= NATIVE_TO_LATIN1(*ptr);    /* table collated in Latin1 */
        h *= ${prefix}_FNV32_PRIME;
    } while ( ++ptr < ptr_end );
    n= h % ${prefix}_BUCKETS;
    s = $table_name\[n].seed2;
    if (s) {
        h= (h >> ${prefix}_RSHIFT) ^ s;
        n = h % ${prefix}_BUCKETS;
        if (
            ( $table_name\[n].pfx_len + $table_name\[n].sfx_len == key_len ) &&
            ( memcmp($blob_name + $table_name\[n].pfx, key, $table_name\[n].pfx_len) == 0 ) &&
            ( !$table_name\[n].sfx_len || memcmp($blob_name + $table_name\[n].sfx,
                key + $table_name\[n].pfx_len, $table_name\[n].sfx_len) == 0 )
        ) {
            return $table_name\[n].value;
        }
    }
    return 0;
}
EOF_CODE

    return $self->{algo_code}= join "", @code;
}

sub __ofh {
    my ($self, $to, $default_key)= @_;

    $to //= $self->{$default_key};

    my $ofh;
    if (ref $to) {
        $ofh= $to;
    }
    else {
        open $ofh, ">", $to
            or die "Failed to open '$to': $!";
    }
    return $ofh;
}

sub print_algo {
    my ($self, $to)= @_;

    my $ofh= $self->__ofh($to, "h_file");

    my $code= $self->make_algo();
    print $ofh $code;
}

sub print_main {
    my ($self, $ofh)= @_;
    my ($h_file, $match_name, $prefix)= @{$self}{qw(h_file match_name prefix)};
    print $ofh <<"EOF_CODE";
#include "$h_file"

int main(int argc, char *argv[]){
    int i;
    for (i=1; i<argc; i++) {
        unsigned char *key = (unsigned char *)argv[i];
        int key_len = strlen(argv[i]);
        printf("key: %s got: %d\\n", key, $match_name((unsigned char *)key,key_len));
    }
    return 0;
}
EOF_CODE
}

# output the test Perl code.
sub print_tests {
    my ($self, $to)= @_;
    my $tests_hash= $self->{tests_hash};

    my $ofh= $self->__ofh($to, "t_file");

    my $num_tests= 2 + keys %$tests_hash;
    print $ofh
        "use strict;\nuse warnings;\nuse Test::More tests => $num_tests;\nmy \@res;";
    my $bytes= 0;
    my @tests= sort keys %$tests_hash;
    print $ofh
        "\@res=`./mph_test '$tests[0]/should-not-match' 'should-not-match/$tests[0]'`;\n";
    print $ofh "ok( \$res[0] =~ /got: 0/,'proper prefix does not match');\n";
    print $ofh "ok( \$res[1] =~ /got: 0/,'proper suffix does not match');\n";

    while (@tests) {
        my @batch= splice @tests, 0, 10;
        my $batch_args= join " ", map { "'$_'" } @batch;
        print $ofh "\@res=`./mph_test $batch_args`;\n";
        foreach my $i (0 .. $#batch) {
            my $key= $batch[$i];
            my $want= $tests_hash->{$key};
            print $ofh
                "ok(\$res[$i]=~/got: (\\d+)/ && \$1 == $want, '$key');\n";
        }
    }
}

sub print_test_binary {
    my ($self, $to)= @_;

    my $ofh= $self->__ofh($to, "c_file");

    $self->print_includes($ofh);
    $self->print_defines($ofh);
    $self->print_main($ofh);
}

sub make_mph_with_split_keys {
    my ($self)= @_;

    my $hash= $self->{source_hash};
    my $length_all_keys= $self->{length_all_keys};

    my ($blob, $split_points)= $self->build_split_words();

    my ($seed1, $second_level)= $self->build_perfect_hash();

    # add prefix/suffix data into the bucket info in @$second_level
    foreach my $bucket_info (@$second_level) {
        my $key= $bucket_info->{key};
        my $sp= $split_points->{$key} // die "no split_point data for '$key'\n";

        my ($prefix, $suffix)= unpack "A${sp}A*", $key;
        $bucket_info->{prefix}= $prefix;
        $bucket_info->{suffix}= $suffix;
        $bucket_info->{value}= $hash->{$key};
    }
    my ($rows, $defines, $tests)= $self->build_array_of_struct();
    return 1;
}

sub make_files_split_keys {
    my ($self)= @_;

    $self->make_mph_with_split_keys();
    $self->print_algo();
    $self->print_test_binary();
    $self->print_tests();
}

unless (caller) {
    my %hash;
    {
        no warnings;
        do "../perl/lib/unicore/UCD.pl";
        %hash= %utf8::loose_to_file_of;
    }
    if ($ENV{MERGE_KEYS}) {
        my @keys= keys %hash;
        foreach my $loose (keys %utf8::loose_property_name_of) {
            my $to= $utf8::loose_property_name_of{$loose};
            next if $to eq $loose;
            foreach my $key (@keys) {
                my $copy= $key;
                if ($copy =~ s/^\Q$to\E(=|\z)/$loose$1/) {

                    $hash{$copy}= $key;
                }
            }
        }
    }
    foreach my $key (keys %hash) {
        my $munged= uc($key);
        $munged =~ s/\W/__/g;
        $hash{$key}= $munged;
    }

    my $name= shift @ARGV;
    $name ||= "mph";
    my $obj= __PACKAGE__->new(
        source_hash => \%hash,
        base_name   => $name
    );
    $obj->make_files_split_keys();
}

1;
__END__
