1#----------------------------------------------------------------------
2#
3# PerfectHash.pm
4#    Perl module that constructs minimal perfect hash functions
5#
6# This code constructs a minimal perfect hash function for the given
7# set of keys, using an algorithm described in
8# "An optimal algorithm for generating minimal perfect hash functions"
9# by Czech, Havas and Majewski in Information Processing Letters,
10# 43(5):256-264, October 1992.
11# This implementation is loosely based on NetBSD's "nbperf",
12# which was written by Joerg Sonnenberger.
13#
14# The resulting hash function is perfect in the sense that if the presented
15# key is one of the original set, it will return the key's index in the set
16# (in range 0..N-1).  However, the caller must still verify the match,
17# as false positives are possible.  Also, the hash function may return
18# values that are out of range (negative or >= N), due to summing unrelated
19# hashtable entries.  This indicates that the presented key is definitely
20# not in the set.
21#
22#
23# Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
24# Portions Copyright (c) 1994, Regents of the University of California
25#
26# src/tools/PerfectHash.pm
27#
28#----------------------------------------------------------------------
29
30package PerfectHash;
31
32use strict;
33use warnings;
34
35
36# At runtime, we'll compute two simple hash functions of the input key,
37# and use them to index into a mapping table.  The hash functions are just
38# multiply-and-add in uint32 arithmetic, with different multipliers and
39# initial seeds.  All the complexity in this module is concerned with
40# selecting hash parameters that will work and building the mapping table.
41
42# We support making case-insensitive hash functions, though this only
43# works for a strict-ASCII interpretation of case insensitivity,
44# ie, A-Z maps onto a-z and nothing else.
45my $case_fold = 0;
46
47
48#
49# Construct a C function implementing a perfect hash for the given keys.
50# The C function definition is returned as a string.
51#
52# The keys should be passed as an array reference.  They can be any set
53# of Perl strings; it is caller's responsibility that there not be any
54# duplicates.  (Note that the "strings" can be binary data, but hashing
55# e.g. OIDs has endianness hazards that callers must overcome.)
56#
57# The name to use for the function is specified as the second argument.
58# It will be a global function by default, but the caller may prepend
59# "static " to the result string if it wants a static function.
60#
61# Additional options can be specified as keyword-style arguments:
62#
63# case_fold => bool
64# If specified as true, the hash function is case-insensitive, for the
65# limited idea of case-insensitivity explained above.
66#
67# fixed_key_length => N
68# If specified, all keys are assumed to have length N bytes, and the
69# hash function signature will be just "int f(const void *key)"
70# rather than "int f(const void *key, size_t keylen)".
71#
72sub generate_hash_function
73{
74	my ($keys_ref, $funcname, %options) = @_;
75
76	# It's not worth passing this around as a parameter; just use a global.
77	$case_fold = $options{case_fold} || 0;
78
79	# Try different hash function parameters until we find a set that works
80	# for these keys.  The multipliers are chosen to be primes that are cheap
81	# to calculate via shift-and-add, so don't change them without care.
82	# (Commonly, random seeds are tried, but we want reproducible results
83	# from this program so we don't do that.)
84	my $hash_mult1 = 31;
85	my $hash_mult2;
86	my $hash_seed1;
87	my $hash_seed2;
88	my @subresult;
89  FIND_PARAMS:
90	foreach (127, 257, 521, 1033, 2053)
91	{
92		$hash_mult2 = $_;    # "foreach $hash_mult2" doesn't work
93		for ($hash_seed1 = 0; $hash_seed1 < 10; $hash_seed1++)
94		{
95			for ($hash_seed2 = 0; $hash_seed2 < 10; $hash_seed2++)
96			{
97				@subresult = _construct_hash_table(
98					$keys_ref,   $hash_mult1, $hash_mult2,
99					$hash_seed1, $hash_seed2);
100				last FIND_PARAMS if @subresult;
101			}
102		}
103	}
104
105	# Choke if we couldn't find a workable set of parameters.
106	die "failed to generate perfect hash" if !@subresult;
107
108	# Extract info from _construct_hash_table's result array.
109	my $elemtype = $subresult[0];
110	my @hashtab  = @{ $subresult[1] };
111	my $nhash    = scalar(@hashtab);
112
113	# OK, construct the hash function definition including the hash table.
114	my $f = '';
115	$f .= sprintf "int\n";
116	if (defined $options{fixed_key_length})
117	{
118		$f .= sprintf "%s(const void *key)\n{\n", $funcname;
119	}
120	else
121	{
122		$f .= sprintf "%s(const void *key, size_t keylen)\n{\n", $funcname;
123	}
124	$f .= sprintf "\tstatic const %s h[%d] = {\n", $elemtype, $nhash;
125	for (my $i = 0; $i < $nhash; $i++)
126	{
127		$f .= sprintf "%s%6d,%s",
128		  ($i % 8 == 0 ? "\t\t" : " "),
129		  $hashtab[$i],
130		  ($i % 8 == 7 ? "\n" : "");
131	}
132	$f .= sprintf "\n" if ($nhash % 8 != 0);
133	$f .= sprintf "\t};\n\n";
134	$f .= sprintf "\tconst unsigned char *k = (const unsigned char *) key;\n";
135	$f .= sprintf "\tsize_t\t\tkeylen = %d;\n", $options{fixed_key_length}
136	  if (defined $options{fixed_key_length});
137	$f .= sprintf "\tuint32\t\ta = %d;\n",   $hash_seed1;
138	$f .= sprintf "\tuint32\t\tb = %d;\n\n", $hash_seed2;
139	$f .= sprintf "\twhile (keylen--)\n\t{\n";
140	$f .= sprintf "\t\tunsigned char c = *k++";
141	$f .= sprintf " | 0x20" if $case_fold;    # see comment below
142	$f .= sprintf ";\n\n";
143	$f .= sprintf "\t\ta = a * %d + c;\n", $hash_mult1;
144	$f .= sprintf "\t\tb = b * %d + c;\n", $hash_mult2;
145	$f .= sprintf "\t}\n";
146	$f .= sprintf "\treturn h[a %% %d] + h[b %% %d];\n", $nhash, $nhash;
147	$f .= sprintf "}\n";
148
149	return $f;
150}
151
152
153# Calculate a hash function as the run-time code will do.
154#
155# If we are making a case-insensitive hash function, we implement that
156# by OR'ing 0x20 into each byte of the key.  This correctly transforms
157# upper-case ASCII into lower-case ASCII, while not changing digits or
158# dollar signs.  (It does change '_', as well as other characters not
159# likely to appear in keywords; this has little effect on the hash's
160# ability to discriminate keywords.)
161sub _calc_hash
162{
163	my ($key, $mult, $seed) = @_;
164
165	my $result = $seed;
166	for my $c (split //, $key)
167	{
168		my $cn = ord($c);
169		$cn |= 0x20 if $case_fold;
170		$result = ($result * $mult + $cn) % 4294967296;
171	}
172	return $result;
173}
174
175
176# Attempt to construct a mapping table for a minimal perfect hash function
177# for the given keys, using the specified hash parameters.
178#
179# Returns an array containing the mapping table element type name as the
180# first element, and a ref to an array of the table values as the second.
181#
182# Returns an empty array on failure; then caller should choose different
183# hash parameter(s) and try again.
184sub _construct_hash_table
185{
186	my ($keys_ref, $hash_mult1, $hash_mult2, $hash_seed1, $hash_seed2) = @_;
187	my @keys = @{$keys_ref};
188
189	# This algorithm is based on a graph whose edges correspond to the
190	# keys and whose vertices correspond to entries of the mapping table.
191	# A key's edge links the two vertices whose indexes are the outputs of
192	# the two hash functions for that key.  For K keys, the mapping
193	# table must have at least 2*K+1 entries, guaranteeing that there's at
194	# least one unused entry.  (In principle, larger mapping tables make it
195	# easier to find a workable hash and increase the number of inputs that
196	# can be rejected due to touching unused hashtable entries.  In practice,
197	# neither effect seems strong enough to justify using a larger table.)
198	my $nedges = scalar @keys;       # number of edges
199	my $nverts = 2 * $nedges + 1;    # number of vertices
200
201	# However, it would be very bad if $nverts were exactly equal to either
202	# $hash_mult1 or $hash_mult2: effectively, that hash function would be
203	# sensitive to only the last byte of each key.  Cases where $nverts is a
204	# multiple of either multiplier likewise lose information.  (But $nverts
205	# can't actually divide them, if they've been intelligently chosen as
206	# primes.)  We can avoid such problems by adjusting the table size.
207	while ($nverts % $hash_mult1 == 0
208		|| $nverts % $hash_mult2 == 0)
209	{
210		$nverts++;
211	}
212
213	# Initialize the array of edges.
214	my @E = ();
215	foreach my $kw (@keys)
216	{
217		# Calculate hashes for this key.
218		# The hashes are immediately reduced modulo the mapping table size.
219		my $hash1 = _calc_hash($kw, $hash_mult1, $hash_seed1) % $nverts;
220		my $hash2 = _calc_hash($kw, $hash_mult2, $hash_seed2) % $nverts;
221
222		# If the two hashes are the same for any key, we have to fail
223		# since this edge would itself form a cycle in the graph.
224		return () if $hash1 == $hash2;
225
226		# Add the edge for this key.
227		push @E, { left => $hash1, right => $hash2 };
228	}
229
230	# Initialize the array of vertices, giving them all empty lists
231	# of associated edges.  (The lists will be hashes of edge numbers.)
232	my @V = ();
233	for (my $v = 0; $v < $nverts; $v++)
234	{
235		push @V, { edges => {} };
236	}
237
238	# Insert each edge in the lists of edges connected to its vertices.
239	for (my $e = 0; $e < $nedges; $e++)
240	{
241		my $v = $E[$e]{left};
242		$V[$v]{edges}->{$e} = 1;
243
244		$v = $E[$e]{right};
245		$V[$v]{edges}->{$e} = 1;
246	}
247
248	# Now we attempt to prove the graph acyclic.
249	# A cycle-free graph is either empty or has some vertex of degree 1.
250	# Removing the edge attached to that vertex doesn't change this property,
251	# so doing that repeatedly will reduce the size of the graph.
252	# If the graph is empty at the end of the process, it was acyclic.
253	# We track the order of edge removal so that the next phase can process
254	# them in reverse order of removal.
255	my @output_order = ();
256
257	# Consider each vertex as a possible starting point for edge-removal.
258	for (my $startv = 0; $startv < $nverts; $startv++)
259	{
260		my $v = $startv;
261
262		# If vertex v is of degree 1 (i.e. exactly 1 edge connects to it),
263		# remove that edge, and then consider the edge's other vertex to see
264		# if it is now of degree 1.  The inner loop repeats until reaching a
265		# vertex not of degree 1.
266		while (scalar(keys(%{ $V[$v]{edges} })) == 1)
267		{
268			# Unlink its only edge.
269			my $e = (keys(%{ $V[$v]{edges} }))[0];
270			delete($V[$v]{edges}->{$e});
271
272			# Unlink the edge from its other vertex, too.
273			my $v2 = $E[$e]{left};
274			$v2 = $E[$e]{right} if ($v2 == $v);
275			delete($V[$v2]{edges}->{$e});
276
277			# Push e onto the front of the output-order list.
278			unshift @output_order, $e;
279
280			# Consider v2 on next iteration of inner loop.
281			$v = $v2;
282		}
283	}
284
285	# We succeeded only if all edges were removed from the graph.
286	return () if (scalar(@output_order) != $nedges);
287
288	# OK, build the hash table of size $nverts.
289	my @hashtab = (0) x $nverts;
290	# We need a "visited" flag array in this step, too.
291	my @visited = (0) x $nverts;
292
293	# The goal is that for any key, the sum of the hash table entries for
294	# its first and second hash values is the desired output (i.e., the key
295	# number).  By assigning hash table values in the selected edge order,
296	# we can guarantee that that's true.  This works because the edge first
297	# removed from the graph (and hence last to be visited here) must have
298	# at least one vertex it shared with no other edge; hence it will have at
299	# least one vertex (hashtable entry) still unvisited when we reach it here,
300	# and we can assign that unvisited entry a value that makes the sum come
301	# out as we wish.  By induction, the same holds for all the other edges.
302	foreach my $e (@output_order)
303	{
304		my $l = $E[$e]{left};
305		my $r = $E[$e]{right};
306		if (!$visited[$l])
307		{
308			# $hashtab[$r] might be zero, or some previously assigned value.
309			$hashtab[$l] = $e - $hashtab[$r];
310		}
311		else
312		{
313			die "oops, doubly used hashtab entry" if $visited[$r];
314			# $hashtab[$l] might be zero, or some previously assigned value.
315			$hashtab[$r] = $e - $hashtab[$l];
316		}
317		# Now freeze both of these hashtab entries.
318		$visited[$l] = 1;
319		$visited[$r] = 1;
320	}
321
322	# Detect range of values needed in hash table.
323	my $hmin = $nedges;
324	my $hmax = 0;
325	for (my $v = 0; $v < $nverts; $v++)
326	{
327		$hmin = $hashtab[$v] if $hashtab[$v] < $hmin;
328		$hmax = $hashtab[$v] if $hashtab[$v] > $hmax;
329	}
330
331	# Choose width of hashtable entries.  In addition to the actual values,
332	# we need to be able to store a flag for unused entries, and we wish to
333	# have the property that adding any other entry value to the flag gives
334	# an out-of-range result (>= $nedges).
335	my $elemtype;
336	my $unused_flag;
337
338	if (   $hmin >= -0x7F
339		&& $hmax <= 0x7F
340		&& $hmin + 0x7F >= $nedges)
341	{
342		# int8 will work
343		$elemtype    = 'int8';
344		$unused_flag = 0x7F;
345	}
346	elsif ($hmin >= -0x7FFF
347		&& $hmax <= 0x7FFF
348		&& $hmin + 0x7FFF >= $nedges)
349	{
350		# int16 will work
351		$elemtype    = 'int16';
352		$unused_flag = 0x7FFF;
353	}
354	elsif ($hmin >= -0x7FFFFFFF
355		&& $hmax <= 0x7FFFFFFF
356		&& $hmin + 0x3FFFFFFF >= $nedges)
357	{
358		# int32 will work
359		$elemtype    = 'int32';
360		$unused_flag = 0x3FFFFFFF;
361	}
362	else
363	{
364		die "hash table values too wide";
365	}
366
367	# Set any unvisited hashtable entries to $unused_flag.
368	for (my $v = 0; $v < $nverts; $v++)
369	{
370		$hashtab[$v] = $unused_flag if !$visited[$v];
371	}
372
373	return ($elemtype, \@hashtab);
374}
375
3761;
377