1package MinimalPerfectHash; 2use strict; 3use warnings; 4use Data::Dumper; 5use Carp; 6use Text::Wrap; 7use bignum; # Otherwise fails on 32-bit systems 8 9my $DEBUG= 0; 10my $RSHIFT= 8; 11my $FNV_CONST= 16777619; 12 13# The basic idea is that you have a two level structure, and effectively 14# hash the key twice. 15# 16# The first hash finds a bucket in the array which contains a seed which 17# is used for the second hash, which then leads to a bucket with key 18# data which is compared against to determine if the key is a match. 19# 20# If the first hash finds no seed, then the key cannot match. 21# 22# In our case we cheat a bit, and hash the key only once, but use the 23# low bits for the first lookup and the high-bits for the second. 24# 25# So for instance: 26# 27# h= (h >> RSHIFT) ^ s; 28# 29# is how the second hash is computed. We right shift the original hash 30# value and then xor in the seed2, which will be non-zero. 31# 32# That then gives us the bucket which contains the key data we need to 33# match for a valid key. 34 35sub _fnv { 36 my ($key, $seed)= @_; 37 my $hash = 0+$seed; 38 foreach my $char (split //, $key) { 39 $hash = $hash ^ ord($char); 40 $hash = ($hash * $FNV_CONST) & 0xFFFFFFFF; 41 } 42 return $hash; 43} 44 45sub build_perfect_hash { 46 my ($hash, $split_pos)= @_; 47 48 my $n= 0+keys %$hash; 49 my $max_h= 0xFFFFFFFF; 50 $max_h -= $max_h % $n; # this just avoids a tiny bit bias 51 my $seed1= unpack("N", "Perl") - 1; 52 my $hash_to_key; 53 my $key_to_hash; 54 my $length_all_keys; 55 my $key_buckets; 56 SEED1: 57 for ($seed1++;1;$seed1++) { 58 my %hash_to_key; 59 my %key_to_hash; 60 my %key_buckets; 61 my %high; 62 $length_all_keys= 0; 63 foreach my $key (sort keys %$hash) { 64 $length_all_keys += length $key; 65 my $h= _fnv($key,$seed1); 66 next SEED1 if $h >= $max_h; # check if this hash would bias, and if so find a new seed 67 next SEED1 if exists $hash_to_key{$h}; 68 next SEED1 if $high{$h >> $RSHIFT}++; 69 $hash_to_key{$h}= $key; 70 $key_to_hash{$key}= $h; 71 push @{$key_buckets{$h % $n}}, $key; 72 } 73 $hash_to_key= \%hash_to_key; 74 $key_to_hash= \%key_to_hash; 75 $key_buckets= \%key_buckets; 76 last SEED1; 77 } 78 79 my %token; 80 my @first_level; 81 my @second_level; 82 foreach my $first_idx (sort { @{$key_buckets->{$b}} <=> @{$key_buckets->{$a}} || $a <=> $b } keys %$key_buckets) { 83 my $keys= $key_buckets->{$first_idx}; 84 #printf "got %d keys in bucket %d\n", 0+@$keys, $first_idx; 85 my $seed2; 86 SEED2: 87 for ($seed2=1;1;$seed2++) { 88 goto FIND_SEED if $seed2 > 0xFFFF; 89 my @idx= map { 90 ( ( ( $key_to_hash->{$_} >> $RSHIFT ) ^ $seed2 ) & 0xFFFFFFFF ) % $n 91 } @$keys; 92 my %seen; 93 next SEED2 if grep { $second_level[$_] || $seen{$_}++ } @idx; 94 $first_level[$first_idx]= $seed2; 95 @second_level[@idx]= map { 96 my $sp= $split_pos->{$_} // die "no split pos for '$_':$!"; 97 my ($prefix,$suffix)= unpack "A${sp}A*", $_; 98 99 +{ 100 key => $_, 101 prefix => $prefix, 102 suffix => $suffix, 103 hash => $key_to_hash->{$_}, 104 value => $hash->{$_}, 105 seed2 => 0, 106 } 107 } @$keys; 108 last; 109 } 110 111 } 112 $second_level[$_]{seed2}= $first_level[$_]||0, $second_level[$_]{idx}= $_ for 0 .. $#second_level; 113 114 return $seed1, \@second_level, $length_all_keys; 115} 116 117sub build_split_words { 118 my ($hash, $preprocess, $blob, $old_res)= @_; 119 my %appended; 120 $blob //= ""; 121 if ($preprocess) { 122 my %parts; 123 foreach my $key (sort {length($b) <=> length($a) || $a cmp $b } keys %$hash) { 124 my ($prefix,$suffix); 125 if ($key=~/^([^=]+=)([^=]+)\z/) { 126 ($prefix,$suffix)= ($1, $2); 127 $parts{$suffix}++; 128 #$parts{$prefix}++; 129 } else { 130 $prefix= $key; 131 $parts{$prefix}++; 132 } 133 134 } 135 foreach my $key (sort {length($b) <=> length($a) || $a cmp $b } keys %parts) { 136 $blob .= $key . "\0"; 137 } 138 printf "Using preprocessing, initial blob size %d\n", length($blob); 139 } else { 140 printf "No preprocessing, initial blob size %d\n", length($blob); 141 } 142 my %res; 143 144 REDO: 145 %res= (); 146 KEY: 147 foreach my $key ( 148 sort { 149 (length($b) <=> length($a)) || 150 ($a cmp $b) 151 } 152 keys %$hash 153 ) { 154 next if exists $res{$key}; 155 if (index($blob,$key) >= 0 ) { 156 my $idx= length($key); 157 if ($DEBUG and $old_res and $old_res->{$key} != $idx) { 158 print "changing: $key => $old_res->{$key} : $idx\n"; 159 } 160 $res{$key}= $idx; 161 next KEY; 162 } 163 my $best= length($key); 164 my $append= $key; 165 my $min= 0; #length $key >= 4 ? 4 : 0; 166 my $best_prefix; 167 my $best_suffix; 168 foreach my $idx (reverse $min .. length($key)) { 169 my $prefix= substr($key,0,$idx); 170 my $suffix= substr($key,$idx); 171 my $i1= index($blob,$prefix)>=0; 172 my $i2= index($blob,$suffix)>=0; 173 if ($i1 and $i2) { 174 if ($DEBUG and $old_res and $old_res->{$key} != $idx) { 175 print "changing: $key => $old_res->{$key} : $idx\n"; 176 } 177 $res{$key}= $idx; 178 $appended{$prefix}++; 179 $appended{$suffix}++; 180 next KEY; 181 } elsif ($i1) { 182 if (length $suffix <= length $append) { 183 $best= $idx; 184 $append= $suffix; 185 $best_prefix= $prefix; 186 $best_suffix= $suffix; 187 } 188 } elsif ($i2) { 189 if (length $prefix <= length $append) { 190 $best= $idx; 191 $append= $prefix; 192 $best_prefix= $prefix; 193 $best_suffix= $suffix; 194 } 195 } 196 } 197 if ($DEBUG and $old_res and $old_res->{$key} != $best) { 198 print "changing: $key => $old_res->{$key} : $best\n"; 199 } 200 #print "$best_prefix|$best_suffix => $best => $append\n"; 201 $res{$key}= $best; 202 $blob .= $append; 203 $appended{$best_prefix}++; 204 $appended{$best_suffix}++; 205 } 206 my $b2 = ""; 207 foreach my $key (sort { length($b) <=> length($a) || $a cmp $b } keys %appended) { 208 $b2 .= $key unless index($b2,$key)>=0; 209 } 210 if (length($b2)<length($blob)) { 211 printf "Length old blob: %d length new blob: %d, recomputing using new blob\n", length($blob),length($b2); 212 $blob= $b2; 213 %appended=(); 214 goto REDO; 215 } else { 216 printf "Length old blob: %d length new blob: %d, keeping old blob\n", length($blob),length($b2); 217 } 218 die sprintf "not same size? %d != %d", 0+keys %res, 0+keys %$hash unless keys %res == keys %$hash; 219 return ($blob,\%res); 220} 221 222 223sub blob_as_code { 224 my ($blob,$blob_name)= @_; 225 226 $blob_name ||= "mph_blob"; 227 228 # output the blob as C code. 229 my @code= (sprintf "STATIC const unsigned char %s[] =\n",$blob_name); 230 my $blob_len= length $blob; 231 while (length($blob)) { 232 push @code, sprintf qq( "%s"), substr($blob,0,65,""); 233 push @code, length $blob ? "\n" : ";\n"; 234 } 235 push @code, "/* $blob_name length: $blob_len */\n"; 236 return join "",@code; 237} 238 239sub print_includes { 240 my $ofh= shift; 241 print $ofh "#include <stdio.h>\n"; 242 print $ofh "#include <string.h>\n"; 243 print $ofh "#include <stdint.h>\n"; 244 print $ofh "\n"; 245} 246 247sub print_defines { 248 my ($ofh,$defines)= @_; 249 250 my $key_len; 251 foreach my $def (keys %$defines) { 252 $key_len //= length $def; 253 $key_len= length $def if $key_len < length $def; 254 } 255 foreach my $def (sort keys %$defines) { 256 printf $ofh "#define %*s %5d\n", -$key_len, $def, $defines->{$def}; 257 } 258 print $ofh "\n"; 259} 260 261 262sub build_array_of_struct { 263 my ($second_level,$blob)= @_; 264 265 my %defines; 266 my %tests; 267 my @rows; 268 foreach my $row (@$second_level) { 269 $defines{$row->{value}}= $row->{idx}+1; 270 $tests{$row->{key}}= $defines{$row->{value}}; 271 my @u16= ( 272 $row->{seed2}, 273 index($blob,$row->{prefix}//0), 274 index($blob,$row->{suffix}//0), 275 ); 276 $_ > 0xFFFF and die "panic: value exceeds range of U16" 277 for @u16; 278 my @u8= ( 279 length($row->{prefix}), 280 length($row->{suffix}), 281 ); 282 $_ > 0xFF and die "panic: value exceeds range of U8" 283 for @u8; 284 push @rows, sprintf(" { %5d, %5d, %5d, %3d, %3d, %s } /* %s%s */", 285 @u16, @u8, $row->{value}, $row->{prefix}, $row->{suffix}); 286 } 287 return \@rows,\%defines,\%tests; 288} 289 290sub make_algo { 291 my ($second_level, $seed1, $length_all_keys, $smart_blob, $rows, 292 $blob_name, $struct_name, $table_name, $match_name, $prefix) = @_; 293 294 $blob_name ||= "mph_blob"; 295 $struct_name ||= "mph_struct"; 296 $table_name ||= "mph_table"; 297 $prefix ||= "MPH"; 298 299 my $n= 0+@$second_level; 300 my $data_size= 0+@$second_level * 8 + length $smart_blob; 301 302 my @code = "#define ${prefix}_VALt I16\n\n"; 303 push @code, "/*\n"; 304 push @code, sprintf "rows: %s\n", $n; 305 push @code, sprintf "seed: %s\n", $seed1; 306 push @code, sprintf "full length of keys: %d\n", $length_all_keys; 307 push @code, sprintf "blob length: %d\n", length $smart_blob; 308 push @code, sprintf "ref length: %d\n", 0+@$second_level * 8; 309 push @code, sprintf "data size: %d (%%%.2f)\n", $data_size, ($data_size / $length_all_keys) * 100; 310 push @code, "*/\n\n"; 311 312 push @code, blob_as_code($smart_blob, $blob_name); 313 push @code, <<"EOF_CODE"; 314 315struct $struct_name { 316 U16 seed2; 317 U16 pfx; 318 U16 sfx; 319 U8 pfx_len; 320 U8 sfx_len; 321 ${prefix}_VALt value; 322}; 323 324EOF_CODE 325 326 push @code, "#define ${prefix}_RSHIFT $RSHIFT\n"; 327 push @code, "#define ${prefix}_BUCKETS $n\n\n"; 328 push @code, sprintf "STATIC const U32 ${prefix}_SEED1 = 0x%08x;\n", $seed1; 329 push @code, sprintf "STATIC const U32 ${prefix}_FNV_CONST = 0x%08x;\n\n", $FNV_CONST; 330 331 push @code, "/* The comments give the input key for the row it is in */\n"; 332 push @code, "STATIC const struct $struct_name $table_name\[${prefix}_BUCKETS] = {\n", join(",\n", @$rows)."\n};\n\n"; 333 push @code, <<"EOF_CODE"; 334${prefix}_VALt $match_name( const unsigned char * const key, const U16 key_len ) { 335 const unsigned char * ptr= key; 336 const unsigned char * ptr_end= key + key_len; 337 U32 h= ${prefix}_SEED1; 338 U32 s; 339 U32 n; 340 do { 341 h ^= NATIVE_TO_LATIN1(*ptr); /* table collated in Latin1 */ 342 h *= ${prefix}_FNV_CONST; 343 } while ( ++ptr < ptr_end ); 344 n= h % ${prefix}_BUCKETS; 345 s = $table_name\[n].seed2; 346 if (s) { 347 h= (h >> ${prefix}_RSHIFT) ^ s; 348 n = h % ${prefix}_BUCKETS; 349 if ( 350 ( $table_name\[n].pfx_len + $table_name\[n].sfx_len == key_len ) && 351 ( memcmp($blob_name + $table_name\[n].pfx, key, $table_name\[n].pfx_len) == 0 ) && 352 ( !$table_name\[n].sfx_len || memcmp($blob_name + $table_name\[n].sfx, 353 key + $table_name\[n].pfx_len, $table_name\[n].sfx_len) == 0 ) 354 ) { 355 return $table_name\[n].value; 356 } 357 } 358 return 0; 359} 360EOF_CODE 361 362 return join "", @code; 363} 364 365sub print_algo { 366 my ($ofh, $second_level, $seed1, $long_blob, $smart_blob, $rows, 367 $blob_name, $struct_name, $table_name, $match_name ) = @_; 368 369 if (!ref $ofh) { 370 my $file= $ofh; 371 undef $ofh; 372 open $ofh, ">", $file 373 or die "Failed to open '$file': $!"; 374 } 375 376 my $code = make_algo( 377 $second_level, $seed1, $long_blob, $smart_blob, $rows, 378 $blob_name, $struct_name, $table_name, $match_name ); 379 print $ofh $code; 380} 381 382sub print_main { 383 my ($ofh,$h_file,$match_name,$prefix)=@_; 384 print $ofh <<"EOF_CODE"; 385#include "$h_file" 386 387int main(int argc, char *argv[]){ 388 int i; 389 for (i=1; i<argc; i++) { 390 unsigned char *key = (unsigned char *)argv[i]; 391 int key_len = strlen(argv[i]); 392 printf("key: %s got: %d\\n", key, $match_name((unsigned char *)key,key_len)); 393 } 394 return 0; 395} 396EOF_CODE 397} 398 399# output the test Perl code. 400sub print_tests { 401 my ($file, $tests_hash)= @_; 402 open my $ofh, ">", $file 403 or die "Failed to open '$file' for writing: $!"; 404 my $num_tests= 2 + keys %$tests_hash; 405 print $ofh "use strict;\nuse warnings;\nuse Test::More tests => $num_tests;\nmy \@res;"; 406 my $bytes= 0; 407 my @tests= sort keys %$tests_hash; 408 print $ofh "\@res=`./mph_test '$tests[0]/should-not-match' 'should-not-match/$tests[0]'`;\n"; 409 print $ofh "ok( \$res[0] =~ /got: 0/,'proper prefix does not match');\n"; 410 print $ofh "ok( \$res[1] =~ /got: 0/,'proper suffix does not match');\n"; 411 while (@tests) { 412 my @batch= splice @tests,0,10; 413 my $batch_args= join " ", map { "'$_'" } @batch; 414 print $ofh "\@res=`./mph_test $batch_args`;\n"; 415 foreach my $i (0..$#batch) { 416 my $key= $batch[$i]; 417 my $want= $tests_hash->{$key}; 418 print $ofh "ok(\$res[$i]=~/got: (\\d+)/ && \$1 == $want, '$key');\n"; 419 } 420 } 421 close $ofh; 422} 423 424sub print_test_binary { 425 my ($file,$h_file, $second_level, $seed1, $length_all_keys, 426 $smart_blob, $rows, $defines, $match_name, $prefix)= @_; 427 open my $ofh, ">", $file 428 or die "Failed to open '$file': $!"; 429 print_includes($ofh); 430 print_defines($ofh, $defines); 431 print_main($ofh,$h_file,$match_name,$prefix); 432 close $ofh; 433} 434 435sub make_mph_from_hash { 436 my $hash= shift; 437 438 # we do this twice because often we can find longer prefixes on the second pass. 439 my ($smart_blob, $res_to_split)= build_split_words($hash,0); 440 { 441 my ($smart_blob2, $res_to_split2)= build_split_words($hash,1); 442 if (length($smart_blob) > length($smart_blob2)) { 443 printf "Using preprocess-smart blob, length: %d (vs %d)\n", length $smart_blob2, length $smart_blob; 444 $smart_blob= $smart_blob2; 445 $res_to_split= $res_to_split2; 446 } else { 447 printf "Using greedy-smart blob, length: %d (vs %d)\n", length $smart_blob, length $smart_blob2; 448 } 449 } 450 my ($seed1, $second_level, $length_all_keys)= build_perfect_hash($hash, $res_to_split); 451 my ($rows, $defines, $tests)= build_array_of_struct($second_level, $smart_blob); 452 return ($second_level, $seed1, $length_all_keys, $smart_blob, $rows, $defines, $tests); 453} 454 455sub make_files { 456 my ($hash,$base_name)= @_; 457 458 my $h_name= $base_name . "_algo.h"; 459 my $c_name= $base_name . "_test.c"; 460 my $p_name= $base_name . "_test.pl"; 461 my $blob_name= $base_name . "_blob"; 462 my $struct_name= $base_name . "_bucket_info"; 463 my $table_name= $base_name . "_table"; 464 my $match_name= $base_name . "_match"; 465 my $prefix= uc($base_name); 466 467 my ($second_level, $seed1, $length_all_keys, 468 $smart_blob, $rows, $defines, $tests)= make_mph_from_hash( $hash ); 469 print_algo( $h_name, 470 $second_level, $seed1, $length_all_keys, $smart_blob, $rows, 471 $blob_name, $struct_name, $table_name, $match_name, $prefix ); 472 print_test_binary( $c_name, $h_name, $second_level, $seed1, $length_all_keys, 473 $smart_blob, $rows, $defines, 474 $match_name, $prefix ); 475 print_tests( $p_name, $tests ); 476} 477 478unless (caller) { 479 my %hash; 480 { 481 no warnings; 482 do "../perl/lib/unicore/UCD.pl"; 483 %hash= %utf8::loose_to_file_of; 484 } 485 if ($ENV{MERGE_KEYS}) { 486 my @keys= keys %hash; 487 foreach my $loose (keys %utf8::loose_property_name_of) { 488 my $to= $utf8::loose_property_name_of{$loose}; 489 next if $to eq $loose; 490 foreach my $key (@keys) { 491 my $copy= $key; 492 if ($copy=~s/^\Q$to\E(=|\z)/$loose$1/) { 493 #print "$key => $copy\n"; 494 $hash{$copy}= $key; 495 } 496 } 497 } 498 } 499 foreach my $key (keys %hash) { 500 my $munged= uc($key); 501 $munged=~s/\W/__/g; 502 $hash{$key} = $munged; 503 } 504 505 my $name= shift @ARGV; 506 $name ||= "mph"; 507 make_files(\%hash,$name); 508} 509 5101; 511__END__ 512