package MinimalPerfectHash;
use strict;
use warnings;
use Data::Dumper;
use Carp;
use Text::Wrap;
use List::Util qw(shuffle min);

use warnings 'FATAL' => 'all';

# The style of this file is determined by:
#
# perltidy -w -ple -bbb -bbc -bbs -nolq -l=80 -noll -nola -nwls='=' \
#   -isbc -nolc -otr -kis -ci=4 -se -sot -sct -nsbl -pt=2 -fs \
#   -fsb='##!' -fse='##.'

# Naming conventions
# * The public API, consisting of methods, uses "normal" sub names with
#   no leading underscore.
# * Private subs are prefixed with a single underscore.
# * Private methods are prefixed with two underscores. (There is only
#   one at the time of writing this comment)

use constant {
    FNV32_PRIME => 16777619,
    U8_MAX      => 0xFF,
    U16_MAX     => 0xFFFF,
    U32_MAX     => 0xFFFFFFFF,
    INF         => 1e9,
};

our $DEBUG= $ENV{DEBUG} || 0;    # our so we can use local on it
my $RSHIFT= 8;
my $MASK= U32_MAX;
my $MAX_SEED2= U16_MAX;          # currently the same, but not necessarily.
my $IS_32BIT= !eval { pack "Q", 1 };

# Constructor.
#
#   MinimalPerfectHash->new(source_hash => \%key_to_value, %opts)
#
# 'source_hash' is required: its keys are the strings the generated C
# code will match, its values the constants the C code will return.
# All other options are optional; file and symbol names default to
# values derived from 'base_name' (itself defaulting to "mph").
sub new {
    my ($class, %self)= @_;

    my $source_hash= $self{source_hash}
        or die "'source_hash' is a required parameter in $class->new()\n";

    # Total length of all keys, used later for compression statistics.
    my $length_all_keys= 0;
    $length_all_keys += length($_) for keys %$source_hash;
    $self{length_all_keys}= $length_all_keys;

    $self{max_attempts} ||= 16;    # pick a number, any number...

    $self{base_name} ||= "mph";
    my $base_name= $self{base_name};

    $self{prefix} ||= uc($base_name);

    $self{h_file}      ||= $base_name . "_algo.h";
    $self{c_file}      ||= $base_name . "_test.c";
    $self{t_file}      ||= $base_name . "_test.pl";
    $self{blob_name}   ||= $base_name . "_blob";
    $self{struct_name} ||= $base_name . "_struct";
    $self{table_name}  ||= $base_name . "_table";
    $self{match_name}  ||= $base_name . "_match";

    $self{simple_split} //= 0;
    if ($self{simple_split}) {
        $self{split_strategy}= "simple";
        $self{randomize_squeeze}= 0;
    }
    else {
        $self{split_strategy}= "squeeze";
        $self{randomize_squeeze} //= 1;
    }
    if ($self{randomize_squeeze}) {
        $self{max_same_in_squeeze} //= 5;
        if (defined $self{srand_seed_was}) {
            # Reproducing an earlier run: reuse its recorded seed.
            $self{srand_seed}= delete $self{srand_seed_was};
        }
        elsif (!defined $self{srand_seed}) {
            $self{srand_seed}= srand();
        }
        else {
            srand($self{srand_seed});
        }
        print "SRAND_SEED= $self{srand_seed}\n" if $DEBUG;
    }
    else {
        # FIX: this used to set the never-read key 'max_same', leaving
        # 'max_same_in_squeeze' undefined; build_split_words_squeeze()
        # reads 'max_same_in_squeeze' and would then die with an
        # uninitialized-value error under FATAL warnings whenever the
        # squeeze strategy was used with randomization disabled.
        $self{max_same_in_squeeze} //= 3;
        delete $self{srand_seed};
    }
    return bless \%self, $class;
}

# The basic idea is that you have a two level structure, and effectively
# hash the key twice.
#
# The first hash finds a bucket in the array which contains a seed which
# is used for the second hash, which then leads to a bucket with key
# data which is compared against to determine if the key is a match.
#
# If the first hash finds no seed, then the key cannot match.
#
# In our case we cheat a bit, and hash the key only once, but use the
# low bits for the first lookup and the high-bits for the second.
#
# So for instance:
#
#   h= (h >> RSHIFT) ^ s;
#
# is how the second hash is computed. We right shift the original hash
# value and then xor in the seed2, which will be non-zero.
#
# That then gives us the bucket which contains the key data we need to
# match for a valid key.

# _fnv1a_32($key, $seed)
#
# Compute the 32-bit FNV-1a hash of $key starting from state $seed.
# Returns the hash as an unsigned 32-bit integer.
sub _fnv1a_32 {
    my ($key, $seed)= @_;
    use integer;

    my $hash= 0 + $seed;
    foreach my $char (split //, $key) {
        $hash= $hash ^ ord($char);

        # the & U32_MAX is to simulate 32 bit ints on a 64 bit integer Perl.
        $hash= ($hash * FNV32_PRIME) & U32_MAX;
    }

    # The hash can end up negative on 32 bit Perls due to use integer being
    # in scope. This is equivalent to casting it to an U32.
    $hash= unpack "V", pack "l", $hash
        if $IS_32BIT;

    return $hash;
}

# $self->build_perfect_hash()
#
# Find a $seed1 for which _fnv1a_32() maps every key of the source hash
# to a distinct 32-bit value AND for which the second-level placement
# (_build_mph_level2) succeeds. Stores and returns ($seed1,
# $second_level). Dies after max_attempts failed second-level builds.
sub build_perfect_hash {
    my ($self)= @_;

    my $source_hash= $self->{source_hash};
    my $max_attempts= $self->{max_attempts};

    my $n= 0 + keys %$source_hash;
    print "Building a minimal perfect hash from $n keys.\n"
        if $DEBUG;
    my $seed1= unpack("N", "Perl") - 1;

    TRY:
    # FIX: was '$attempt < $max_attempts', which performed only
    # max_attempts - 1 attempts while the die message below claimed
    # $max_attempts were made.
    for (my $attempt= 1 ; $attempt <= $max_attempts ; $attempt++) {
        my ($hash_to_key, $key_to_hash, $key_buckets);
        SEED1:
        for ($seed1++ ; 1 ; $seed1++) {
            print "Trying seed $seed1\n"
                if $DEBUG;
            my %hash_to_key;
            my %key_to_hash;
            my %key_buckets;
            my %shifted;
            foreach my $key (sort keys %$source_hash) {
                my $h= _fnv1a_32($key, $seed1);

                # requirement: no two keys share a full hash value, and
                # no two keys share the right-shifted (second level) value.
                next SEED1 if exists $hash_to_key{$h};
                next SEED1 if $shifted{ ($h >> $RSHIFT) & $MASK }++;
                $hash_to_key{$h}= $key;
                $key_to_hash{$key}= $h;
                push @{ $key_buckets{ $h % $n } }, $key;
            }
            $hash_to_key= \%hash_to_key;
            $key_to_hash= \%key_to_hash;
            $key_buckets= \%key_buckets;
            last SEED1;
        }
        my $second_level=
            _build_mph_level2($hash_to_key, $key_to_hash, $key_buckets);
        if ($second_level) {
            $self->{seed1}= $seed1;
            $self->{second_level}= $second_level;
            return $seed1, $second_level;
        }
    }
    die sprintf "After %d attempts failed to construct a minimal perfect "
        . "hash with %d keys.\nWe are using fnv32(), perhaps this "
        . "hash function isn't good enough?\n",
        $max_attempts, $n;
}

# _build_mph_level2($hash_to_key, $key_to_hash, $key_buckets)
#
# Given the first-level bucketing produced in build_perfect_hash(),
# find a seed2 for every first-level bucket so that all keys land in
# distinct second-level buckets. Returns an arrayref of bucket info
# records on success, or the empty list if no solution exists for this
# seed1 (caller then retries with a new seed1).
sub _build_mph_level2 {
    my ($hash_to_key, $key_to_hash, $key_buckets)= @_;

    my $n= 0 + keys %$key_to_hash;

    # Loop over the key_buckets, processing the buckets with the most
    # items in them first, and the ones with the least items in them last.
    # This maximizes the chance we can find a $seed2 that "disambiguates"
    # the items that collide in a single bucket.
    #
    # With a decent hash function we will have a typical long tail
    # distribution of items per bucket, with relatively few buckets with
    # the most collisions in them, and the vast majority of buckets
    # having no collisions. By processing the ones with the most items
    # in them first the "easy" cases don't get in the way of finding a
    # solution for the hard cases. The buckets can be divided into three
    # levels of difficulty to solve "hard", "medium" and "trivial".
    #
    # * Hard buckets have more than one item in them.
    # * Medium buckets have one item whose hash is above $MAX_SEED2.
    # * Trivial buckets have one item whose hash is not above $MAX_SEED2.
    #
    # Each type of bucket uses a different algorithm to solve. Note that
    # a "classical" two level hash would only have "hard" and "trivial"
    # buckets, but since we support having a larger hash value than we
    # allow for a $seed2 we have three.

    my @first_level;
    my @second_level;
    my @singles_high;
    my @singles_low;

    print "Finding mappings for buckets with collisions.\n"
        if $DEBUG;

    FIRST_IDX:
    foreach my $first_idx (
        sort {
            @{ $key_buckets->{$b} } <=> @{ $key_buckets->{$a} }
                || $a <=> $b
        } keys %$key_buckets
    )
    {
        my $keys= $key_buckets->{$first_idx};
        if (@$keys == 1) {

            # buckets with a single item in them can use a simpler
            # and faster algorithm to find a bucket than those with
            # buckets with more than one item.

            # however keys whose $hash2 is above $MAX_SEED2 need to be
            # processed first, and will use one strategy, while the rest
            # of the singletons should be processed last, and can use
            # an even simpler and more efficient strategy.
            my $key= $keys->[0];
            my $hash2= ($key_to_hash->{$key} >> $RSHIFT) & $MASK;
            if ($hash2 > $MAX_SEED2) {
                push @singles_high, [ $first_idx, $hash2, $key ];
            }
            else {
                push @singles_low, [ $first_idx, $hash2, $key ];
            }
            next FIRST_IDX;
        }

        # This loop handles items with more than one key in the same
        # bucket. We need to find a $seed2 that causes the operation
        #
        #   ($hash ^ $seed2) % $n
        #
        # to map those keys into different empty buckets. If we cannot
        # find such a $seed2 then we need to recompute everything with a
        # new seed.
        SEED2:
        for (my $seed2= 1 ; $seed2 <= $MAX_SEED2 ; $seed2++) {
            my @idx= map {
                ((($key_to_hash->{$_} >> $RSHIFT) ^ $seed2) & $MASK) % $n
            } @$keys;
            my %seen;

            # reject this seed2 if any target bucket is occupied or if
            # two of our own keys map to the same bucket.
            next SEED2 if grep { $second_level[$_] || $seen{$_}++ } @idx;
            $first_level[$first_idx]= $seed2;
            @second_level[@idx]= map { _make_bucket_info($_) } @$keys;
            next FIRST_IDX;
        }

        # If we get here then we failed to find a $seed2 which results
        # in the colliding items being mapped to different empty buckets.
        # So we have to rehash everything with a different $seed1.
        print "Failed to disambiguate colliding keys. Trying new seed1.\n"
            if $DEBUG;
        return;
    }

    # Now fill in the singletons using a much simpler and faster
    # way to compute the seed2. Since we only have to worry about
    # a single seed, we merely need to fill in all the empty slots
    # and we can always compute a mask that when xor'ed with $base
    # maps to the empty slot.
    print "Finding mappings for buckets with no collisions.\n"
        if $DEBUG;

    # sort @singles_low so that for the simple algorithm we do not end
    # up mapping a 0 hash to the 0 bucket, which would result in a
    # $seed2 of 0. Our logic avoids comparing the key when the $seed2 is
    # 0, so we need to avoid having a seed2 of 0. This rule is not
    # strictly required, but it cuts down on string comparisons at the
    # cost of a relatively cheap numeric comparison. If you change this
    # make sure you update the generated C code.

    ##!
    @singles_low= sort {
        $b->[1] <=> $a->[1] ||    # sort by $hash2
            $a->[0] <=> $b->[0]   # then by $first_idx
    } @singles_low;
    ##.

    my $scan_idx= 0;    # used to find empty buckets for the "simple" case.
    SINGLES:
    foreach my $tuple (@singles_high, @singles_low) {
        my ($first_idx, $hash2, $key)= @$tuple;
        my ($seed2, $idx);
        if ($hash2 > $MAX_SEED2) {

            # The $hash2 is larger than the maximum value of $seed2.
            # This means that we cannot simply map this item into
            # whichever bucket we choose using xor. Instead we loop
            # through the possible $seed2 values checking to see if it
            # results in us landing in an empty bucket, which should be
            # fairly common which means this loop should execute
            # relatively few times. It also minimizes the chance that we
            # cannot find a solution at all.
            for my $i (1 .. $MAX_SEED2) {
                $idx= (($hash2 ^ $i) & $MASK) % $n;
                if (!$second_level[$idx]) {
                    $seed2= $i;
                    last;
                }
            }

            # If we failed to find a solution we need to go back to
            # beginning and try a different key.
            if (!defined $seed2) {
                print "No viable seed2 for singleton. Trying new seed1.\n"
                    if $DEBUG;
                return;
            }
        }
        else {
            # since $hash2 <= $MAX_SEED2 we can trivially map the item
            # to any bucket we choose using xor. So we find the next
            # empty bucket with the loop below, and then map this item
            # into it.
            SCAN:
            while ($second_level[$scan_idx]) {
                $scan_idx++;
            }

            # note that we don't need to mod $n here, as
            #
            #   $hash2 ^ $seed2 == $idx
            #
            # and $idx is already in the interval (0, $n-1)

            $seed2= $hash2 ^ $scan_idx;

            # increment $scan_idx after stashing its old value into $idx
            # as by the end of this iteration of the SINGLES loop we
            # will have filled $second_level[$scan_idx] and we need not
            # check it in the SCAN while loop.
            $idx= $scan_idx++;
        }

        # sanity check $idx.
        die "WTF, \$idx should be less than \$n ($idx vs $n)"
            unless $idx < $n;

        die "Bad seed2 for first_idx: $first_idx." if $seed2 == 0;

        # and finally we are done, we have found the final bucket
        # location for this key.
        $first_level[$first_idx]= $seed2;
        $second_level[$idx]= _make_bucket_info($key);
    }

    # now that we are done we can go through and fill in the idx and
    # seed2 as appropriate. We store idx into the hashes even though it
    # is not stricly necessary as it simplifies some of the code that
    # processes the @second_level bucket info array later.
    foreach my $idx (0 .. $n - 1) {
        $second_level[$idx]{seed2}= $first_level[$idx] || 0;
        $second_level[$idx]{idx}= $idx;
    }

    return \@second_level;
}

# Construct the initial bucket-info record for $key; the seed2 and idx
# slots are populated later by _build_mph_level2().
sub _make_bucket_info {
    my ($key)= @_;
    return +{
        key   => $key,
        seed2 => undef,    # will be filled in later
        idx   => undef,    # will be filled in later
    };
}

# Return an arrayref of the keys of %$hash sorted longest first, ties
# broken lexicographically so the order is deterministic.
sub _sort_keys_longest_first {
    my ($hash)= shift;
    my @keys= sort { length($b) <=> length($a) || $a cmp $b } keys %$hash;
    return \@keys;
}

# This sub constructs a blob of characters which can be used to
# reconstruct the keys of the $hash that is passed in to it, possibly
# and likely by splitting the keys into two parts, a prefix and a
# suffix. This allows prefixes and suffixes to be reused for more than
# one original key.
#
# It returns a string that contains every prefix and suffix chosen, and
# a hash that contains each key in the argument $hash with each value
# being the position where it is split, using the length of the key to
# indicate it need not be split at all.
#
# If $preprocess is false the process starts with an empty buffer and
# populates it as it adds each new key, if $preprocess is true then it
# tries to split each key at the '=' sign which is often present in
# Unicode property names and composes the initial buffer from these
# fragments.
#
# It performs multiple passes trying to find the ideal split point to
# produce a minimal buffer, returning the smallest buffer it can.
sub _build_split_words_simple {
    my ($hash, $length_all_keys, $preprocess)= @_;

    my %used_parts;    # counts of every prefix/suffix we actually used
    my $blob= "";

    if ($preprocess) {

        # Seed the blob from the fragments obtained by splitting each
        # key at an '=' sign, when it has one.
        my %parts;
        foreach my $key (@{ _sort_keys_longest_first($hash) }) {
            if ($key =~ /^[^=]+=([^=]+)\z/) {

                # only the part after the '=' is counted here; the
                # prefix part is deliberately left out.
                $parts{$1}++;
            }
            else {
                $parts{$key}++;
            }
        }
        $blob .= $_ for @{ _sort_keys_longest_first(\%parts) };
        printf "Using preprocessing, initial blob size is %d chars.\n",
            length($blob)
            if $DEBUG;
    }
    else {
        print "No preprocessing, starting with an empty blob.\n"
            if $DEBUG;
    }

    my ($split_of, $prev_split_of, $added_len, $pass_count);

    REDO:
    $split_of= {};
    $added_len= 0;
    $pass_count++;

    KEY:
    foreach my $key (@{ _sort_keys_longest_first($hash) }) {
        next if exists $split_of->{$key};

        # Whole key already present in the blob: "split" at its full
        # length, meaning no split is needed.
        if (index($blob, $key) >= 0) {
            my $idx= length($key);
            if ($DEBUG > 1 and $prev_split_of and $prev_split_of->{$key} != $idx) {
                print "changing: $key => $prev_split_of->{$key} : $idx\n";
            }
            $split_of->{$key}= $idx;
            next KEY;
        }

        # Otherwise search every split point, longest prefix first, for
        # one where both halves are already in the blob. Failing that,
        # remember the split that requires appending the least text.
        my $best_point= length($key);
        my $to_append= $key;
        my $best_prefix= $key;
        my $best_suffix= "";
        my $min= 1;
        foreach my $idx (reverse $min .. length($key) - 1) {
            my $prefix= substr($key, 0, $idx);
            my $suffix= substr($key, $idx);
            my $have_prefix= index($blob, $prefix) >= 0;
            my $have_suffix= index($blob, $suffix) >= 0;
            if ($have_prefix and $have_suffix) {
                if ($DEBUG > 1
                    and $prev_split_of
                    and $prev_split_of->{$key} != $idx)
                {
                    print "changing: $key => $prev_split_of->{$key} : $idx\n";
                }
                $split_of->{$key}= $idx;
                $used_parts{$prefix}++;
                $used_parts{$suffix}++;
                next KEY;
            }
            elsif ($have_prefix) {
                if (length $suffix <= length $to_append) {
                    $best_point= $idx;
                    $to_append= $suffix;
                    $best_prefix= $prefix;
                    $best_suffix= $suffix;
                }
            }
            elsif ($have_suffix) {
                if (length $prefix <= length $to_append) {
                    $best_point= $idx;
                    $to_append= $prefix;
                    $best_prefix= $prefix;
                    $best_suffix= $suffix;
                }
            }
        }
        if ($DEBUG > 1 and $prev_split_of and $prev_split_of->{$key} != $best_point) {
            print "changing: $key => $prev_split_of->{$key} : $best_point\n";
        }

        $split_of->{$key}= $best_point;
        $blob .= $to_append;
        $added_len += length($to_append);
        $used_parts{$best_prefix}++;
        $used_parts{$best_suffix}++;
    }

    if ($added_len) {
        if ($added_len < length $blob) {
            printf "Appended %d chars. Blob is %d chars long.\n",
                $added_len, length($blob)
                if $DEBUG;
        }
        else {
            printf "Blob is %d chars long.\n", $added_len
                if $DEBUG;
        }
    }
    elsif ($pass_count > 1) {
        print "Blob needed no changes.\n"
            if $DEBUG;
    }

    # Rebuild a candidate blob out of only the parts we actually used;
    # if that is shorter, adopt it and redo the whole pass.
    my $new_blob= "";
    foreach my $part (@{ _sort_keys_longest_first(\%used_parts) }) {
        $new_blob .= $part unless index($new_blob, $part) >= 0;
    }
    if (length($new_blob) < length($blob)) {
        printf "Uncorrected new blob length of %d chars is smaller.\n"
            . "    Correcting new blob...%s",
            length($new_blob), $DEBUG > 1 ? "\n" : " "
            if $DEBUG;
        $blob= $new_blob;
        $prev_split_of= $split_of;
        %used_parts= ();
        goto REDO;
    }
    else {
        printf "After %d passes final blob length is %d chars.\n"
            . "This is %.2f%% of the raw key length of %d chars.\n\n",
            $pass_count, length($blob), 100 * length($blob) / $length_all_keys,
            $length_all_keys
            if $DEBUG;
    }

    # sanity check
    die sprintf "not same size? %d != %d", 0 + keys %$split_of, 0 + keys %$hash
        unless keys %$split_of == keys %$hash;
    return ($blob, $split_of, $length_all_keys);
}

# Find all the positions where $word can be found in $$buf_ref,
# including overlapping positions. The data is cached into the
# $offsets_hash. Used by the _squeeze algorithm.
sub _get_offsets {
    my ($offsets_hash, $buf_ref, $word)= @_;

    my $cached= $offsets_hash->{$word};
    return $cached if defined $cached;

    my @positions;
    my $search_from= 0;
    while ((my $pos= index($$buf_ref, $word, $search_from)) != -1) {
        push @positions, $pos;
        $search_from= $pos + 1;    # step one char so overlaps are found
    }

    return $offsets_hash->{$word}= \@positions;
}

# Increments the popularity data for the characters at
# $ofs .. $ofs + $len - 1 by $diff. Used by the _squeeze algorithm
sub _inc_popularity {
    my ($popularity, $ofs, $len, $diff)= @_;
    $_ += $diff for @{$popularity}[ $ofs .. $ofs + $len - 1 ];
}

# Returns a summary hash about the popularity of the characters
# $ofs .. $ofs + $len - 1. Used by the _squeeze algorithm
sub _get_popularity {
    my ($popularity, $ofs, $len)= @_;

    my $reused= 0;     # characters already locked in by earlier words
    my $min_pop;       # lowest popularity among the remaining characters
    for my $pos ($ofs .. $ofs + $len - 1) {
        my $pop= $popularity->[$pos];
        if ($pop >= INF) {
            $reused++;
        }
        elsif (!defined $min_pop || $pop < $min_pop) {
            $min_pop= $pop;
        }
    }
    return +{
        reused_digits => $reused,
        popularity    => $min_pop // 0,
    };
}

# Merge the popularity data produced by _get_popularity() for the prefix
# and suffix of a word together.
# Used by the _squeeze algorithm
sub _merge_score {
    my ($left, $right)= @_;
    my %merged= (
        reused_digits => $left->{reused_digits} + $right->{reused_digits},
        popularity    => min($left->{popularity}, $right->{popularity}),
    );
    return \%merged;
}

# Initialize the popularity and offsets data for a word.
# Used by the _squeeze algorithm
sub _init_popularity {
    my ($offsets_hash, $popularity, $buf_ref, $word, $diff)= @_;

    my $word_len= length $word;
    foreach my $start (@{ _get_offsets($offsets_hash, $buf_ref, $word) }) {
        $popularity->[$_] += $diff for $start .. $start + $word_len - 1;
    }
}

# Compare the popularity data for two possible candidates
# for solving a given word. Used by the _squeeze algorithm
sub _compare_score {
    my ($s1, $s2)= @_;

    # more reused characters wins; popularity breaks ties.
    return $s1->{reused_digits} <=> $s2->{reused_digits}
        || $s1->{popularity} <=> $s2->{popularity};
}

# Find the most popular offset for a word in $$buf_ref.
# Used by the _squeeze algorithm
sub _most_popular_offset {
    my ($offsets_hash, $popularity, $buf_ref, $word)= @_;

    my $champion_score= {
        reused_digits => -1,
        popularity    => -1,
    };
    my $champion_pos= -1;

    my $wlen= length $word;
    OFFSET:
    foreach my $pos (@{ _get_offsets($offsets_hash, $buf_ref, $word) }) {
        my $score= _get_popularity($popularity, $pos, $wlen);
        next OFFSET unless _compare_score($score, $champion_score) > 0;
        ($champion_score, $champion_pos)= ($score, $pos);

        # reusing every character cannot be beaten: stop early.
        last OFFSET if $champion_score->{reused_digits} == $wlen;
    }

    return +{
        position => $champion_pos,
        score    => $champion_score,
    };
}

# The _squeeze algorithm. Attempt to squeeze out unused characters from
# a buffer of split words.
# If there are multiple places where a given
# prefix or suffix can be found and the overall split decisions can be
# reorganized so some of them are never used it removes the ones that
# are not used.
sub _squeeze {
    my ($words, $word_count, $splits, $buf_ref)= @_;
    print "Squeezing...\n" if $DEBUG;
    my %offsets_hash;
    my %split_points;
    my $n= length $$buf_ref;

    # FIX: this was 'my @popularity= 0 x $n;' which is *scalar* string
    # repetition and produced a single "000...0" element instead of $n
    # zero elements. '(0) x $n' is list repetition, giving each of the
    # $n character slots its own zero-initialized counter.
    my @popularity= (0) x $n;

    for my $word (sort keys %$word_count) {
        my $count= $word_count->{$word};
        _init_popularity(\%offsets_hash, \@popularity, $buf_ref, $word,
            $count / length($word));
    }

    WORD:
    for my $word (@$words) {
        my $best_pos1= -1;
        my $best_pos2= -1;
        my $best_score= {
            reused_digits => -1,
            popularity    => -1,
        };
        my $best_split;

        # first consider placing the word whole...
        my $cand=
            _most_popular_offset(\%offsets_hash, \@popularity, $buf_ref, $word);
        if ($cand->{position} != -1) {
            my $cand_score= $cand->{score};
            if ($cand_score->{reused_digits} == length($word)) {
                $split_points{$word}= 0;
                next WORD;
            }
            elsif (_compare_score($cand_score, $best_score) > 0) {
                $best_score= $cand_score;
                $best_pos1= $cand->{position};
                $best_pos2= -1;
                $best_split= undef;
            }
        }

        # ... then every possible prefix/suffix split of it.
        for my $split (@{ $splits->{$word} }) {
            my $cand2=
                _most_popular_offset(\%offsets_hash, \@popularity, $buf_ref,
                    $split->{w2});
            next if $cand2->{position} == -1;

            my $cand1=
                _most_popular_offset(\%offsets_hash, \@popularity, $buf_ref,
                    $split->{w1});
            next if $cand1->{position} == -1;

            my $cand_score= _merge_score($cand1->{score}, $cand2->{score});

            if ($cand_score->{reused_digits} == length($word)) {
                $split_points{$word}= $split->{split_point};
                next WORD;
            }
            if (_compare_score($cand_score, $best_score) > 0) {
                $best_score= $cand_score;
                $best_pos1= $cand1->{position};
                $best_pos2= $cand2->{position};
                $best_split= $split;
            }
        }

        # apply high pop to used characters of the champion
        if (defined $best_split) {
            _inc_popularity(\@popularity, $best_pos1,
                length($best_split->{w1}), INF);
            _inc_popularity(\@popularity, $best_pos2,
                length($best_split->{w2}), INF);
            $split_points{$word}= $best_split->{split_point};
        }
        else {
            _inc_popularity(\@popularity, $best_pos1, length($word), INF);
            $split_points{$word}= 0;
        }
    }

    # keep only the characters some champion placement actually uses.
    my $res= "";
    my @chars= split '', $$buf_ref;
    for my $i (0 .. $n - 1) {
        if ($popularity[$i] >= INF) {
            $res .= $chars[$i];
        }
    }
    printf "%d -> %d\n", $n, length($res) if $DEBUG;

    # This algorithm chooses to "split" full strings at 0, so that the
    # prefix is empty and the suffix contains the full key, but the
    # minimal perfect hash logic wants it the other way around, as we do
    # the prefix check first. so we correct it at the end here.
    $split_points{$_} ||= length($_) for keys %split_points;

    return ($res, \%split_points);
}

# compute an initial covering buffer for a set of words,
# including split data. A word is appended only if neither the word
# itself nor both halves of one of its splits are already present.
sub _initial_covering_buf {
    my ($words, $splits)= @_;
    my $res= "";
    WORD:
    for my $word (@$words) {
        if (index($res, $word) != -1) {
            next WORD;
        }
        else {
            for my $split (@{ $splits->{$word} }) {
                if (    index($res, $split->{w1}) != -1
                    && index($res, $split->{w2}) != -1)
                {
                    next WORD;
                }
            }
        }
        $res .= $word;
    }
    return $res;
}

# $self->build_split_words_squeeze()
#
# Build the key blob with the iterative _squeeze algorithm, retrying
# with reversed (and, when randomization is enabled, shuffled) word
# orders until max_same_in_squeeze iterations in a row fail to shrink
# the buffer. Stores and returns ($blob, $split_points).
sub build_split_words_squeeze {
    my ($self)= @_;

    # Thanks to Ilya Sashcheka for this algorithm

    my $hash= $self->{source_hash};
    my $randomize= $self->{randomize_squeeze};
    my $max_same= $self->{max_same_in_squeeze};

    my @words= sort keys %$hash;
    my %splits;
    my $split_points;

    # every possible (prefix, suffix) split of every word.
    for my $word (@words) {
        my $word_splits= [];
        my $wlen= length $word;
        for my $i (1 .. $wlen - 1) {
            ##!
            push @$word_splits,
                +{
                w1          => substr($word, 0, $i),
                w2          => substr($word, $i),
                split_point => $i,
                };
            ##.
        }
        $splits{$word}= $word_splits;
    }

    my %word_count;
    for my $word (@words) {
        $word_count{$word}++;
        for my $split (@{ $splits{$word} }) {
            $word_count{ $split->{w1} }++;
            $word_count{ $split->{w2} }++;
        }
    }

    @words= sort { length($a) <=> length($b) || $a cmp $b } @words;
    my $buf= _initial_covering_buf(\@words, \%splits);

    printf "Pre squeeze buffer: %s\n", $buf if $DEBUG > 1;
    printf "Pre squeeze length: %d\n", length $buf if $DEBUG;

    my $same= 0;
    my $reverse_under= 2;
    while ($same < $max_same) {
        my ($new_buf, $new_split_points)=
            _squeeze(\@words, \%word_count, \%splits, \$buf);
        if (!$split_points or length($new_buf) < length($buf)) {
            $buf= $new_buf;
            $split_points= $new_split_points;
            $same= 0;
        }
        else {
            if ($same < $reverse_under or !$randomize) {
                print "reversing words....\n" if $DEBUG;
                @words= reverse @words;
            }
            else {
                print "shuffling words....\n" if $DEBUG;
                @words= shuffle @words;
                $reverse_under= 1;
            }
            $same++;
        }
    }

    printf "Final length: %d\n", length($buf) if $DEBUG;

    $self->{blob}= $buf;
    $self->{split_points}= $split_points;

    return $buf, $split_points;
}

# $self->build_split_words_simple()
#
# Build the key blob with the greedy _build_split_words_simple() sub,
# both with and without '=' preprocessing, and keep whichever result
# is smaller. Stores and returns ($blob, $split_points).
sub build_split_words_simple {
    my ($self)= @_;

    my $hash= $self->{source_hash};
    my $length_all_keys= $self->{length_all_keys};

    my ($blob, $split_points)=
        _build_split_words_simple($hash, $length_all_keys, 0);

    my ($blob2, $split_points2)=
        _build_split_words_simple($hash, $length_all_keys, 1);

    if (length($blob) > length($blob2)) {
        printf "Using preprocess-smart blob. Length is %d chars. (vs %d)\n",
            length $blob2, length $blob
            if $DEBUG;
        $blob= $blob2;
        $split_points= $split_points2;
    }
    else {
        printf "Using greedy-smart blob. Length is %d chars. (vs %d)\n",
            length $blob, length $blob2
            if $DEBUG;
    }
    $self->{blob}= $blob;
    $self->{split_points}= $split_points;

    return $blob, $split_points;
}

# $self->build_split_words()
#
# Dispatch to the configured split strategy and then verify that every
# key's prefix and suffix can actually be found in the resulting blob.
sub build_split_words {
    my ($self)= @_;

    # The _simple algorithm does not compress nearly as well as the
    # _squeeze algorithm, although it uses less memory and will likely
    # be faster, especially if randomization is enabled. The default
    # is to use _squeeze as our hash is not that large (~8k keys).
    my ($buf, $split_words);
    if ($self->{simple_split}) {
        ($buf, $split_words)= $self->build_split_words_simple();
    }
    else {
        ($buf, $split_words)= $self->build_split_words_squeeze();
    }
    foreach my $key (sort keys %$split_words) {
        my $point= $split_words->{$key};
        my $prefix= substr($key, 0, $point);
        my $suffix= substr($key, $point);
        if (index($buf, $prefix) < 0) {
            die "Failed to find prefix '$prefix' for '$key'";
        }
        if (length $suffix and index($buf, $suffix) < 0) {
            die "Failed to find suffix '$suffix' for '$key'";
        }
    }
    return ($buf, $split_words);
}

# $self->blob_as_code()
#
# Render $self->{blob} as a C string-literal declaration, wrapped at 65
# characters per line. Caches and returns the generated text.
sub blob_as_code {
    my ($self)= @_;
    my $blob= $self->{blob};
    my $blob_name= $self->{blob_name};

    # output the blob as C code.
    my @code= (sprintf "STATIC const unsigned char %s[] =\n", $blob_name);
    my $blob_len= length $blob;
    while (length($blob)) {
        push @code, sprintf qq( "%s"), substr($blob, 0, 65, "");
        push @code, length $blob ? "\n" : ";\n";
    }
    push @code, "/* $blob_name length: $blob_len */\n";
    return $self->{blob_as_code}= join "", @code;
}

# Write the standard C includes needed by the test program to $ofh.
sub print_includes {
    my ($self, $ofh)= @_;
    print $ofh "#include <stdio.h>\n";
    print $ofh "#include <string.h>\n";
    print $ofh "#include <stdint.h>\n";
    print $ofh "\n";
}

# Write one aligned "#define NAME value" line per entry of the
# defines_hash (built by build_array_of_struct) to $ofh.
sub print_defines {
    my ($self, $ofh)= @_;
    my $defines= $self->{defines_hash};

    # widest name determines the alignment column.
    my $key_len;
    foreach my $def (keys %$defines) {
        $key_len //= length $def;
        $key_len= length $def if $key_len < length $def;
    }
    foreach my $def (sort keys %$defines) {
        printf $ofh "#define %*s %5d\n", -$key_len, $def, $defines->{$def};
    }
    print $ofh "\n";
}

# $self->build_array_of_struct()
#
# Render each second-level bucket as a C struct initializer row, and
# build the #define map and the key => expected-value test map.
# NOTE(review): each row's {value} and {prefix}/{suffix} are filled in
# by make_mph_with_split_keys()/callers outside this sub.
sub build_array_of_struct {
    my ($self)= @_;
    my $second_level= $self->{second_level};
    my $blob= $self->{blob};

    my %defines;
    my %tests;
    my @rows;
    foreach my $row (@$second_level) {
        if (!defined $row->{idx} or !defined $row->{value}) {
            die "panic: No idx or value key in row data:", Dumper($row);
        }
        $defines{ $row->{value} }= $row->{idx} + 1;
        $tests{ $row->{key} }= $defines{ $row->{value} };
        ##!
        my @u16= (
            $row->{seed2},
            index($blob, $row->{prefix}),
            index($blob, $row->{suffix}),
        );
        $_ > U16_MAX and die "panic: value exceeds range of U16"
            for @u16;
        my @u8= (
            length($row->{prefix}),
            length($row->{suffix}),
        );
        $_ > U8_MAX and die "panic: value exceeds range of U8"
            for @u8;
        push @rows, sprintf "  { %5d, %5d, %5d, %3d, %3d, %s } /* %s%s */",
            @u16, @u8, $row->{value}, $row->{prefix}, $row->{suffix};
        ##.
    }
    $self->{rows_array}= \@rows;
    $self->{defines_hash}= \%defines;
    $self->{tests_hash}= \%tests;
    return \@rows, \%defines, \%tests;
}

# $self->make_algo()
#
# Assemble the complete C header text: stats comment, blob, struct
# declaration, lookup table and the matcher function. Caches and
# returns it.
sub make_algo {
    my ($self)= @_;

    my (
        $second_level, $seed1, $length_all_keys, $blob,
        $rows_array, $blob_name, $struct_name, $table_name,
        $match_name, $prefix, $split_strategy, $srand_seed,
    )
    = @{$self}{ qw(
        second_level seed1 length_all_keys blob
        rows_array blob_name struct_name table_name
        match_name prefix split_strategy srand_seed
    ) };

    my $n= 0 + @$second_level;
    my $data_size= $n * 8 + length $blob;

    my @code= "#define ${prefix}_VALt I16\n\n";
    push @code, "/*\n";
    push @code, sprintf "generator script: %s\n", $0;
    push @code, sprintf "split strategy: %s\n", $split_strategy;
    push @code, sprintf "srand: %d\n", $srand_seed
        if defined $srand_seed;
    push @code, sprintf "rows: %s\n", $n;
    push @code, sprintf "seed: %s\n", $seed1;
    push @code, sprintf "full length of keys: %d\n", $length_all_keys;
    push @code, sprintf "blob length: %d\n", length $blob;
    push @code, sprintf "ref length: %d\n", 0 + @$second_level * 8;
    push @code, sprintf "data size: %d (%%%.2f)\n", $data_size,
        ($data_size / $length_all_keys) * 100;
    push @code, "*/\n\n";

    push @code, $self->blob_as_code();
    push @code, <<"EOF_CODE";

struct $struct_name {
    U16 seed2;
    U16 pfx;
    U16 sfx;
    U8  pfx_len;
    U8  sfx_len;
    ${prefix}_VALt value;
};

EOF_CODE

    push @code, "#define ${prefix}_RSHIFT $RSHIFT\n";
    push @code, "#define ${prefix}_BUCKETS $n\n\n";
    push @code, sprintf "STATIC const U32 ${prefix}_SEED1 = 0x%08x;\n", $seed1;
    push @code, sprintf "STATIC const U32 ${prefix}_FNV32_PRIME = 0x%08x;\n\n",
        FNV32_PRIME;

    push @code, "/* The comments give the input key for the row it is in */\n";
    push @code,
        "STATIC const struct $struct_name $table_name\[${prefix}_BUCKETS] = {\n",
        join(",\n", @$rows_array) . "\n};\n\n";
    push @code, <<"EOF_CODE";
${prefix}_VALt
$match_name( const unsigned char * const key, const U16 key_len ) {
    const unsigned char * ptr= key;
    const unsigned char * ptr_end= key + key_len;
    U32 h= ${prefix}_SEED1;
    U32 s;
    U32 n;
    /* this is FNV-1a 32bit unrolled. */
    do {
        h ^= NATIVE_TO_LATIN1(*ptr);    /* table collated in Latin1 */
        h *= ${prefix}_FNV32_PRIME;
    } while ( ++ptr < ptr_end );
    n= h % ${prefix}_BUCKETS;
    s = $table_name\[n].seed2;
    if (s) {
        h= (h >> ${prefix}_RSHIFT) ^ s;
        n = h % ${prefix}_BUCKETS;
        if (
            ( $table_name\[n].pfx_len + $table_name\[n].sfx_len == key_len ) &&
            ( memcmp($blob_name + $table_name\[n].pfx, key, $table_name\[n].pfx_len) == 0 ) &&
            ( !$table_name\[n].sfx_len || memcmp($blob_name + $table_name\[n].sfx,
                key + $table_name\[n].pfx_len, $table_name\[n].sfx_len) == 0 )
        ) {
            return $table_name\[n].value;
        }
    }
    return 0;
}
EOF_CODE

    return $self->{algo_code}= join "", @code;
}

# Resolve $to into an output filehandle: a ref is used as-is, a string
# is opened for writing, and undef falls back to $self->{$default_key}.
sub __ofh {
    my ($self, $to, $default_key)= @_;

    $to //= $self->{$default_key};

    my $ofh;
    if (ref $to) {
        $ofh= $to;
    }
    else {
        open $ofh, ">", $to
            or die "Failed to open '$to': $!";
    }
    return $ofh;
}

# Write the generated header (make_algo) to $to, defaulting to h_file.
sub print_algo {
    my ($self, $to)= @_;

    my $ofh= $self->__ofh($to, "h_file");

    my $code= $self->make_algo();

    # FIX: this used to be 'print $to $code', which bypassed the handle
    # we just obtained and broke when $to was undef (defaulted inside
    # __ofh) or a plain filename string.
    print $ofh $code;
}

# Write the C main() of the test program to $ofh: it looks up each
# command line argument and prints the value the matcher returned.
sub print_main {
    my ($self, $ofh)= @_;
    my ($h_file, $match_name, $prefix)= @{$self}{qw(h_file match_name prefix)};
    print $ofh <<"EOF_CODE";
#include "$h_file"

int main(int argc, char *argv[]){
    int i;
    for (i=1; i<argc; i++) {
        unsigned char *key = (unsigned char *)argv[i];
        int key_len = strlen(argv[i]);
        printf("key: %s got: %d\\n", key, $match_name((unsigned char *)key,key_len));
    }
    return 0;
}
EOF_CODE
}

# output the test Perl code.
# NOTE(review): the generated script runs the binary as './mph_test'
# regardless of the configured c_file/base_name — confirm intended.
sub print_tests {
    my ($self, $to)= @_;
    my $tests_hash= $self->{tests_hash};

    my $ofh= $self->__ofh($to, "t_file");

    my $num_tests= 2 + keys %$tests_hash;
    print $ofh
        "use strict;\nuse warnings;\nuse Test::More tests => $num_tests;\nmy \@res;";
    my $bytes= 0;
    my @tests= sort keys %$tests_hash;
    print $ofh
        "\@res=`./mph_test '$tests[0]/should-not-match' 'should-not-match/$tests[0]'`;\n";
    print $ofh "ok( \$res[0] =~ /got: 0/,'proper prefix does not match');\n";
    print $ofh "ok( \$res[1] =~ /got: 0/,'proper suffix does not match');\n";

    # run the keys through the binary in batches of ten per invocation.
    while (@tests) {
        my @batch= splice @tests, 0, 10;
        my $batch_args= join " ", map { "'$_'" } @batch;
        print $ofh "\@res=`./mph_test $batch_args`;\n";
        foreach my $i (0 .. $#batch) {
            my $key= $batch[$i];
            my $want= $tests_hash->{$key};
            print $ofh
                "ok(\$res[$i]=~/got: (\\d+)/ && \$1 == $want, '$key');\n";
        }
    }
}

# Write the C test program (includes, defines, main) to $to,
# defaulting to c_file.
sub print_test_binary {
    my ($self, $to)= @_;

    my $ofh= $self->__ofh($to, "c_file");

    $self->print_includes($ofh);
    $self->print_defines($ofh);
    $self->print_main($ofh);
}

sub make_mph_with_split_keys {
    my ($self)= @_;

    my $hash= $self->{source_hash};
    my $length_all_keys= $self->{length_all_keys};

    my ($blob, $split_points)= $self->build_split_words();

    my ($seed1, $second_level)= $self->build_perfect_hash();

    # add prefix/suffix data into the bucket info in @$second_level
    foreach my $bucket_info (@$second_level) {
        my $key= $bucket_info->{key};
        my $sp= $split_points->{$key} // die "no split_point data for '$key'\n";

        my ($prefix, $suffix)= unpack "A${sp}A*", $key;
        $bucket_info->{prefix}= $prefix;
        $bucket_info->{suffix}= $suffix;
$bucket_info->{value}= $hash->{$key}; 1197 } 1198 my ($rows, $defines, $tests)= $self->build_array_of_struct(); 1199 return 1; 1200} 1201 1202sub make_files_split_keys { 1203 my ($self)= @_; 1204 1205 $self->make_mph_with_split_keys(); 1206 $self->print_algo(); 1207 $self->print_test_binary(); 1208 $self->print_tests(); 1209} 1210 1211unless (caller) { 1212 my %hash; 1213 { 1214 no warnings; 1215 do "../perl/lib/unicore/UCD.pl"; 1216 %hash= %utf8::loose_to_file_of; 1217 } 1218 if ($ENV{MERGE_KEYS}) { 1219 my @keys= keys %hash; 1220 foreach my $loose (keys %utf8::loose_property_name_of) { 1221 my $to= $utf8::loose_property_name_of{$loose}; 1222 next if $to eq $loose; 1223 foreach my $key (@keys) { 1224 my $copy= $key; 1225 if ($copy =~ s/^\Q$to\E(=|\z)/$loose$1/) { 1226 1227 $hash{$copy}= $key; 1228 } 1229 } 1230 } 1231 } 1232 foreach my $key (keys %hash) { 1233 my $munged= uc($key); 1234 $munged =~ s/\W/__/g; 1235 $hash{$key}= $munged; 1236 } 1237 1238 my $name= shift @ARGV; 1239 $name ||= "mph"; 1240 my $obj= __PACKAGE__->new( 1241 source_hash => \%hash, 1242 base_name => $name 1243 ); 1244 $obj->make_files_split_keys(); 1245} 1246 12471; 1248__END__ 1249