1#---------------------------------------------------------------------- 2# 3# PerfectHash.pm 4# Perl module that constructs minimal perfect hash functions 5# 6# This code constructs a minimal perfect hash function for the given 7# set of keys, using an algorithm described in 8# "An optimal algorithm for generating minimal perfect hash functions" 9# by Czech, Havas and Majewski in Information Processing Letters, 10# 43(5):256-264, October 1992. 11# This implementation is loosely based on NetBSD's "nbperf", 12# which was written by Joerg Sonnenberger. 13# 14# The resulting hash function is perfect in the sense that if the presented 15# key is one of the original set, it will return the key's index in the set 16# (in range 0..N-1). However, the caller must still verify the match, 17# as false positives are possible. Also, the hash function may return 18# values that are out of range (negative or >= N), due to summing unrelated 19# hashtable entries. This indicates that the presented key is definitely 20# not in the set. 21# 22# 23# Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group 24# Portions Copyright (c) 1994, Regents of the University of California 25# 26# src/tools/PerfectHash.pm 27# 28#---------------------------------------------------------------------- 29 30package PerfectHash; 31 32use strict; 33use warnings; 34 35 36# At runtime, we'll compute two simple hash functions of the input key, 37# and use them to index into a mapping table. The hash functions are just 38# multiply-and-add in uint32 arithmetic, with different multipliers and 39# initial seeds. All the complexity in this module is concerned with 40# selecting hash parameters that will work and building the mapping table. 41 42# We support making case-insensitive hash functions, though this only 43# works for a strict-ASCII interpretation of case insensitivity, 44# ie, A-Z maps onto a-z and nothing else. 45my $case_fold = 0; 46 47 48# 49# Construct a C function implementing a perfect hash for the given keys. 50# The C function definition is returned as a string. 51# 52# The keys should be passed as an array reference. They can be any set 53# of Perl strings; it is caller's responsibility that there not be any 54# duplicates. (Note that the "strings" can be binary data, but hashing 55# e.g. OIDs has endianness hazards that callers must overcome.) 56# 57# The name to use for the function is specified as the second argument. 58# It will be a global function by default, but the caller may prepend 59# "static " to the result string if it wants a static function. 60# 61# Additional options can be specified as keyword-style arguments: 62# 63# case_fold => bool 64# If specified as true, the hash function is case-insensitive, for the 65# limited idea of case-insensitivity explained above. 66# 67# fixed_key_length => N 68# If specified, all keys are assumed to have length N bytes, and the 69# hash function signature will be just "int f(const void *key)" 70# rather than "int f(const void *key, size_t keylen)". 71# 72sub generate_hash_function 73{ 74 my ($keys_ref, $funcname, %options) = @_; 75 76 # It's not worth passing this around as a parameter; just use a global. 77 $case_fold = $options{case_fold} || 0; 78 79 # Try different hash function parameters until we find a set that works 80 # for these keys. The multipliers are chosen to be primes that are cheap 81 # to calculate via shift-and-add, so don't change them without care. 82 # (Commonly, random seeds are tried, but we want reproducible results 83 # from this program so we don't do that.) 84 my $hash_mult1 = 31; 85 my $hash_mult2; 86 my $hash_seed1; 87 my $hash_seed2; 88 my @subresult; 89 FIND_PARAMS: 90 foreach (127, 257, 521, 1033, 2053) 91 { 92 $hash_mult2 = $_; # "foreach $hash_mult2" doesn't work 93 for ($hash_seed1 = 0; $hash_seed1 < 10; $hash_seed1++) 94 { 95 for ($hash_seed2 = 0; $hash_seed2 < 10; $hash_seed2++) 96 { 97 @subresult = _construct_hash_table( 98 $keys_ref, $hash_mult1, $hash_mult2, 99 $hash_seed1, $hash_seed2); 100 last FIND_PARAMS if @subresult; 101 } 102 } 103 } 104 105 # Choke if we couldn't find a workable set of parameters. 106 die "failed to generate perfect hash" if !@subresult; 107 108 # Extract info from _construct_hash_table's result array. 109 my $elemtype = $subresult[0]; 110 my @hashtab = @{ $subresult[1] }; 111 my $nhash = scalar(@hashtab); 112 113 # OK, construct the hash function definition including the hash table. 114 my $f = ''; 115 $f .= sprintf "int\n"; 116 if (defined $options{fixed_key_length}) 117 { 118 $f .= sprintf "%s(const void *key)\n{\n", $funcname; 119 } 120 else 121 { 122 $f .= sprintf "%s(const void *key, size_t keylen)\n{\n", $funcname; 123 } 124 $f .= sprintf "\tstatic const %s h[%d] = {\n", $elemtype, $nhash; 125 for (my $i = 0; $i < $nhash; $i++) 126 { 127 $f .= sprintf "%s%6d,%s", 128 ($i % 8 == 0 ? "\t\t" : " "), 129 $hashtab[$i], 130 ($i % 8 == 7 ? "\n" : ""); 131 } 132 $f .= sprintf "\n" if ($nhash % 8 != 0); 133 $f .= sprintf "\t};\n\n"; 134 $f .= sprintf "\tconst unsigned char *k = (const unsigned char *) key;\n"; 135 $f .= sprintf "\tsize_t\t\tkeylen = %d;\n", $options{fixed_key_length} 136 if (defined $options{fixed_key_length}); 137 $f .= sprintf "\tuint32\t\ta = %d;\n", $hash_seed1; 138 $f .= sprintf "\tuint32\t\tb = %d;\n\n", $hash_seed2; 139 $f .= sprintf "\twhile (keylen--)\n\t{\n"; 140 $f .= sprintf "\t\tunsigned char c = *k++"; 141 $f .= sprintf " | 0x20" if $case_fold; # see comment below 142 $f .= sprintf ";\n\n"; 143 $f .= sprintf "\t\ta = a * %d + c;\n", $hash_mult1; 144 $f .= sprintf "\t\tb = b * %d + c;\n", $hash_mult2; 145 $f .= sprintf "\t}\n"; 146 $f .= sprintf "\treturn h[a %% %d] + h[b %% %d];\n", $nhash, $nhash; 147 $f .= sprintf "}\n"; 148 149 return $f; 150} 151 152 153# Calculate a hash function as the run-time code will do. 154# 155# If we are making a case-insensitive hash function, we implement that 156# by OR'ing 0x20 into each byte of the key. This correctly transforms 157# upper-case ASCII into lower-case ASCII, while not changing digits or 158# dollar signs. (It does change '_', as well as other characters not 159# likely to appear in keywords; this has little effect on the hash's 160# ability to discriminate keywords.) 161sub _calc_hash 162{ 163 my ($key, $mult, $seed) = @_; 164 165 my $result = $seed; 166 for my $c (split //, $key) 167 { 168 my $cn = ord($c); 169 $cn |= 0x20 if $case_fold; 170 $result = ($result * $mult + $cn) % 4294967296; 171 } 172 return $result; 173} 174 175 176# Attempt to construct a mapping table for a minimal perfect hash function 177# for the given keys, using the specified hash parameters. 178# 179# Returns an array containing the mapping table element type name as the 180# first element, and a ref to an array of the table values as the second. 181# 182# Returns an empty array on failure; then caller should choose different 183# hash parameter(s) and try again. 184sub _construct_hash_table 185{ 186 my ($keys_ref, $hash_mult1, $hash_mult2, $hash_seed1, $hash_seed2) = @_; 187 my @keys = @{$keys_ref}; 188 189 # This algorithm is based on a graph whose edges correspond to the 190 # keys and whose vertices correspond to entries of the mapping table. 191 # A key's edge links the two vertices whose indexes are the outputs of 192 # the two hash functions for that key. For K keys, the mapping 193 # table must have at least 2*K+1 entries, guaranteeing that there's at 194 # least one unused entry. (In principle, larger mapping tables make it 195 # easier to find a workable hash and increase the number of inputs that 196 # can be rejected due to touching unused hashtable entries. In practice, 197 # neither effect seems strong enough to justify using a larger table.) 198 my $nedges = scalar @keys; # number of edges 199 my $nverts = 2 * $nedges + 1; # number of vertices 200 201 # However, it would be very bad if $nverts were exactly equal to either 202 # $hash_mult1 or $hash_mult2: effectively, that hash function would be 203 # sensitive to only the last byte of each key. Cases where $nverts is a 204 # multiple of either multiplier likewise lose information. (But $nverts 205 # can't actually divide them, if they've been intelligently chosen as 206 # primes.) We can avoid such problems by adjusting the table size. 207 while ($nverts % $hash_mult1 == 0 208 || $nverts % $hash_mult2 == 0) 209 { 210 $nverts++; 211 } 212 213 # Initialize the array of edges. 214 my @E = (); 215 foreach my $kw (@keys) 216 { 217 # Calculate hashes for this key. 218 # The hashes are immediately reduced modulo the mapping table size. 219 my $hash1 = _calc_hash($kw, $hash_mult1, $hash_seed1) % $nverts; 220 my $hash2 = _calc_hash($kw, $hash_mult2, $hash_seed2) % $nverts; 221 222 # If the two hashes are the same for any key, we have to fail 223 # since this edge would itself form a cycle in the graph. 224 return () if $hash1 == $hash2; 225 226 # Add the edge for this key. 227 push @E, { left => $hash1, right => $hash2 }; 228 } 229 230 # Initialize the array of vertices, giving them all empty lists 231 # of associated edges. (The lists will be hashes of edge numbers.) 232 my @V = (); 233 for (my $v = 0; $v < $nverts; $v++) 234 { 235 push @V, { edges => {} }; 236 } 237 238 # Insert each edge in the lists of edges connected to its vertices. 239 for (my $e = 0; $e < $nedges; $e++) 240 { 241 my $v = $E[$e]{left}; 242 $V[$v]{edges}->{$e} = 1; 243 244 $v = $E[$e]{right}; 245 $V[$v]{edges}->{$e} = 1; 246 } 247 248 # Now we attempt to prove the graph acyclic. 249 # A cycle-free graph is either empty or has some vertex of degree 1. 250 # Removing the edge attached to that vertex doesn't change this property, 251 # so doing that repeatedly will reduce the size of the graph. 252 # If the graph is empty at the end of the process, it was acyclic. 253 # We track the order of edge removal so that the next phase can process 254 # them in reverse order of removal. 255 my @output_order = (); 256 257 # Consider each vertex as a possible starting point for edge-removal. 258 for (my $startv = 0; $startv < $nverts; $startv++) 259 { 260 my $v = $startv; 261 262 # If vertex v is of degree 1 (i.e. exactly 1 edge connects to it), 263 # remove that edge, and then consider the edge's other vertex to see 264 # if it is now of degree 1. The inner loop repeats until reaching a 265 # vertex not of degree 1. 266 while (scalar(keys(%{ $V[$v]{edges} })) == 1) 267 { 268 # Unlink its only edge. 269 my $e = (keys(%{ $V[$v]{edges} }))[0]; 270 delete($V[$v]{edges}->{$e}); 271 272 # Unlink the edge from its other vertex, too. 273 my $v2 = $E[$e]{left}; 274 $v2 = $E[$e]{right} if ($v2 == $v); 275 delete($V[$v2]{edges}->{$e}); 276 277 # Push e onto the front of the output-order list. 278 unshift @output_order, $e; 279 280 # Consider v2 on next iteration of inner loop. 281 $v = $v2; 282 } 283 } 284 285 # We succeeded only if all edges were removed from the graph. 286 return () if (scalar(@output_order) != $nedges); 287 288 # OK, build the hash table of size $nverts. 289 my @hashtab = (0) x $nverts; 290 # We need a "visited" flag array in this step, too. 291 my @visited = (0) x $nverts; 292 293 # The goal is that for any key, the sum of the hash table entries for 294 # its first and second hash values is the desired output (i.e., the key 295 # number). By assigning hash table values in the selected edge order, 296 # we can guarantee that that's true. This works because the edge first 297 # removed from the graph (and hence last to be visited here) must have 298 # at least one vertex it shared with no other edge; hence it will have at 299 # least one vertex (hashtable entry) still unvisited when we reach it here, 300 # and we can assign that unvisited entry a value that makes the sum come 301 # out as we wish. By induction, the same holds for all the other edges. 302 foreach my $e (@output_order) 303 { 304 my $l = $E[$e]{left}; 305 my $r = $E[$e]{right}; 306 if (!$visited[$l]) 307 { 308 # $hashtab[$r] might be zero, or some previously assigned value. 309 $hashtab[$l] = $e - $hashtab[$r]; 310 } 311 else 312 { 313 die "oops, doubly used hashtab entry" if $visited[$r]; 314 # $hashtab[$l] might be zero, or some previously assigned value. 315 $hashtab[$r] = $e - $hashtab[$l]; 316 } 317 # Now freeze both of these hashtab entries. 318 $visited[$l] = 1; 319 $visited[$r] = 1; 320 } 321 322 # Detect range of values needed in hash table. 323 my $hmin = $nedges; 324 my $hmax = 0; 325 for (my $v = 0; $v < $nverts; $v++) 326 { 327 $hmin = $hashtab[$v] if $hashtab[$v] < $hmin; 328 $hmax = $hashtab[$v] if $hashtab[$v] > $hmax; 329 } 330 331 # Choose width of hashtable entries. In addition to the actual values, 332 # we need to be able to store a flag for unused entries, and we wish to 333 # have the property that adding any other entry value to the flag gives 334 # an out-of-range result (>= $nedges). 335 my $elemtype; 336 my $unused_flag; 337 338 if ( $hmin >= -0x7F 339 && $hmax <= 0x7F 340 && $hmin + 0x7F >= $nedges) 341 { 342 # int8 will work 343 $elemtype = 'int8'; 344 $unused_flag = 0x7F; 345 } 346 elsif ($hmin >= -0x7FFF 347 && $hmax <= 0x7FFF 348 && $hmin + 0x7FFF >= $nedges) 349 { 350 # int16 will work 351 $elemtype = 'int16'; 352 $unused_flag = 0x7FFF; 353 } 354 elsif ($hmin >= -0x7FFFFFFF 355 && $hmax <= 0x7FFFFFFF 356 && $hmin + 0x3FFFFFFF >= $nedges) 357 { 358 # int32 will work 359 $elemtype = 'int32'; 360 $unused_flag = 0x3FFFFFFF; 361 } 362 else 363 { 364 die "hash table values too wide"; 365 } 366 367 # Set any unvisited hashtable entries to $unused_flag. 368 for (my $v = 0; $v < $nverts; $v++) 369 { 370 $hashtab[$v] = $unused_flag if !$visited[$v]; 371 } 372 373 return ($elemtype, \@hashtab); 374} 375 3761; 377