1 /* Search algorithm.
2 Copyright (C) 1989-1998, 2000, 2002 Free Software Foundation, Inc.
3 Written by Douglas C. Schmidt <schmidt@ics.uci.edu>
4 and Bruno Haible <bruno@clisp.org>.
5
6 This file is part of GNU GPERF.
7
8 GNU GPERF is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
12
13 GNU GPERF is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING.
20 If not, write to the Free Software Foundation, Inc.,
21 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
22
23 /* Specification. */
24 #include "search.h"
25
26 #include <stdio.h>
27 #include <stdlib.h> /* declares exit(), rand(), srand() */
28 #include <string.h> /* declares memset(), memcmp() */
29 #include <time.h> /* declares time() */
30 #include <math.h> /* declares exp() */
31 #include <limits.h> /* defines INT_MIN, INT_MAX, UINT_MAX */
32 #include "options.h"
33 #include "hash-table.h"
34
35 /* ================================ Theory ================================= */
36
37 /* The general form of the hash function is
38
39 hash (keyword) = sum (asso_values[keyword[i] + alpha_inc[i]] : i in Pos)
40 + len (keyword)
41
42 where Pos is a set of byte positions,
43 each alpha_inc[i] is a nonnegative integer,
44 each asso_values[c] is a nonnegative integer,
45 len (keyword) is the keyword's length if !option[NOLENGTH], or 0 otherwise.
46
47 Theorem 1: If all keywords are different, there is a set Pos such that
48 all tuples (keyword[i] : i in Pos) are different.
49
50 Theorem 2: If all tuples (keyword[i] : i in Pos) are different, there
51 are nonnegative integers alpha_inc[i] such that all multisets
52 {keyword[i] + alpha_inc[i] : i in Pos} are different.
53
54 Define selchars[keyword] := {keyword[i] + alpha_inc[i] : i in Pos}.
55
56 Theorem 3: If all multisets selchars[keyword] are different, there are
57 nonnegative integers asso_values[c] such that all hash values
58 sum (asso_values[c] : c in selchars[keyword]) are different.
59
60 Based on these three facts, we find the hash function in three steps:
61
62 Step 1 (Finding good byte positions):
63 Find a set Pos, as small as possible, such that all tuples
64 (keyword[i] : i in Pos) are different.
65
66 Step 2 (Finding good alpha increments):
67 Find nonnegative integers alpha_inc[i], as many of them as possible being
68 zero, and the others being as small as possible, such that all multisets
69 {keyword[i] + alpha_inc[i] : i in Pos} are different.
70
71 Step 3 (Finding good asso_values):
72 Find asso_values[c] such that all hash (keyword) are different.
73
74 In other words, each step finds a projection that is injective on the
75 given finite set:
76 proj1 : String --> Map (Pos --> N)
77 proj2 : Map (Pos --> N) --> Map (Pos --> N) / S(Pos)
78 proj3 : Map (Pos --> N) / S(Pos) --> N
79 where
80 N denotes the set of nonnegative integers,
81 Map (A --> B) := Hom_Set (A, B) is the set of maps from A to B, and
82 S(Pos) is the symmetric group over Pos.
83
84 This was the theory for option[NOLENGTH]; if !option[NOLENGTH], slight
85 modifications apply:
86 proj1 : String --> Map (Pos --> N) x N
87 proj2 : Map (Pos --> N) x N --> Map (Pos --> N) / S(Pos) x N
88 proj3 : Map (Pos --> N) / S(Pos) x N --> N
89
90 For a case-insensitive hash function, the general form is
91
92 hash (keyword) =
93 sum (asso_values[alpha_unify[keyword[i] + alpha_inc[i]]] : i in Pos)
94 + len (keyword)
95
96 where alpha_unify[c] is chosen so that an upper/lower case change in
97 keyword[i] doesn't change alpha_unify[keyword[i] + alpha_inc[i]].
98 */
99
100 /* ==================== Initialization and Preparation ===================== */
101
Search(KeywordExt_List * list)102 Search::Search (KeywordExt_List *list)
103 : _head (list)
104 {
105 }
106
107 void
prepare()108 Search::prepare ()
109 {
110 /* Compute the total number of keywords. */
111 _total_keys = 0;
112 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
113 _total_keys++;
114
115 /* Compute the minimum and maximum keyword length. */
116 _max_key_len = INT_MIN;
117 _min_key_len = INT_MAX;
118 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
119 {
120 KeywordExt *keyword = temp->first();
121
122 if (_max_key_len < keyword->_allchars_length)
123 _max_key_len = keyword->_allchars_length;
124 if (_min_key_len > keyword->_allchars_length)
125 _min_key_len = keyword->_allchars_length;
126 }
127
128 /* Exit program if an empty string is used as keyword, since the comparison
129 expressions don't work correctly for looking up an empty string. */
130 if (_min_key_len == 0)
131 {
132 fprintf (stderr, "Empty input keyword is not allowed.\n"
133 "To recognize an empty input keyword, your code should check for\n"
134 "len == 0 before calling the gperf generated lookup function.\n");
135 exit (1);
136 }
137
138 /* Exit program if the characters in the keywords are not in the required
139 range. */
140 if (option[SEVENBIT])
141 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
142 {
143 KeywordExt *keyword = temp->first();
144
145 const char *k = keyword->_allchars;
146 for (int i = keyword->_allchars_length; i > 0; k++, i--)
147 if (!(static_cast<unsigned char>(*k) < 128))
148 {
149 fprintf (stderr, "Option --seven-bit has been specified,\n"
150 "but keyword \"%.*s\" contains non-ASCII characters.\n"
151 "Try removing option --seven-bit.\n",
152 keyword->_allchars_length, keyword->_allchars);
153 exit (1);
154 }
155 }
156 }
157
158 /* ====================== Finding good byte positions ====================== */
159
160 /* Computes the upper bound on the indices passed to asso_values[],
161 assuming no alpha_increments. */
162 unsigned int
compute_alpha_size() const163 Search::compute_alpha_size () const
164 {
165 return (option[SEVENBIT] ? 128 : 256);
166 }
167
168 /* Computes the unification rules between different asso_values[c],
169 assuming no alpha_increments. */
170 unsigned int *
compute_alpha_unify() const171 Search::compute_alpha_unify () const
172 {
173 if (option[UPPERLOWER])
174 {
175 /* Uppercase to lowercase mapping. */
176 unsigned int alpha_size = compute_alpha_size();
177 unsigned int *alpha_unify = new unsigned int[alpha_size];
178 for (unsigned int c = 0; c < alpha_size; c++)
179 alpha_unify[c] = c;
180 for (unsigned int c = 'A'; c <= 'Z'; c++)
181 alpha_unify[c] = c + ('a'-'A');
182 return alpha_unify;
183 }
184 else
185 /* Identity mapping. */
186 return NULL;
187 }
188
189 /* Initializes each keyword's _selchars array. */
190 void
init_selchars_tuple(const Positions & positions,const unsigned int * alpha_unify) const191 Search::init_selchars_tuple (const Positions& positions, const unsigned int *alpha_unify) const
192 {
193 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
194 temp->first()->init_selchars_tuple(positions, alpha_unify);
195 }
196
197 /* Deletes each keyword's _selchars array. */
198 void
delete_selchars() const199 Search::delete_selchars () const
200 {
201 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
202 temp->first()->delete_selchars();
203 }
204
205 /* Count the duplicate keywords that occur with a given set of positions.
206 In other words, it returns the difference
207 # K - # proj1 (K)
208 where K is the multiset of given keywords. */
209 unsigned int
count_duplicates_tuple(const Positions & positions,const unsigned int * alpha_unify) const210 Search::count_duplicates_tuple (const Positions& positions, const unsigned int *alpha_unify) const
211 {
212 /* Run through the keyword list and count the duplicates incrementally.
213 The result does not depend on the order of the keyword list, thanks to
214 the formula above. */
215 init_selchars_tuple (positions, alpha_unify);
216
217 unsigned int count = 0;
218 {
219 Hash_Table representatives (_total_keys, option[NOLENGTH]);
220 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
221 {
222 KeywordExt *keyword = temp->first();
223 if (representatives.insert (keyword))
224 count++;
225 }
226 }
227
228 delete_selchars ();
229
230 return count;
231 }
232
233 /* Find good key positions. */
234
235 void
find_positions()236 Search::find_positions ()
237 {
238 /* If the user gave the key positions, we use them. */
239 if (option[POSITIONS])
240 {
241 _key_positions = option.get_key_positions();
242 return;
243 }
244
245 /* Compute preliminary alpha_unify table. */
246 unsigned int *alpha_unify = compute_alpha_unify ();
247
248 /* 1. Find positions that must occur in order to distinguish duplicates. */
249 Positions mandatory;
250
251 if (!option[DUP])
252 {
253 for (KeywordExt_List *l1 = _head; l1 && l1->rest(); l1 = l1->rest())
254 {
255 KeywordExt *keyword1 = l1->first();
256 for (KeywordExt_List *l2 = l1->rest(); l2; l2 = l2->rest())
257 {
258 KeywordExt *keyword2 = l2->first();
259
260 /* If keyword1 and keyword2 have the same length and differ
261 in just one position, and it is not the last character,
262 this position is mandatory. */
263 if (keyword1->_allchars_length == keyword2->_allchars_length)
264 {
265 int n = keyword1->_allchars_length;
266 int i;
267 for (i = 0; i < n - 1; i++)
268 {
269 unsigned char c1 = keyword1->_allchars[i];
270 unsigned char c2 = keyword2->_allchars[i];
271 if (option[UPPERLOWER])
272 {
273 if (c1 >= 'A' && c1 <= 'Z')
274 c1 += 'a' - 'A';
275 if (c2 >= 'A' && c2 <= 'Z')
276 c2 += 'a' - 'A';
277 }
278 if (c1 != c2)
279 break;
280 }
281 if (i < n - 1)
282 {
283 int j;
284 for (j = i + 1; j < n; j++)
285 {
286 unsigned char c1 = keyword1->_allchars[j];
287 unsigned char c2 = keyword2->_allchars[j];
288 if (option[UPPERLOWER])
289 {
290 if (c1 >= 'A' && c1 <= 'Z')
291 c1 += 'a' - 'A';
292 if (c2 >= 'A' && c2 <= 'Z')
293 c2 += 'a' - 'A';
294 }
295 if (c1 != c2)
296 break;
297 }
298 if (j >= n)
299 {
300 /* Position i is mandatory. */
301 if (!mandatory.contains (i))
302 mandatory.add (i);
303 }
304 }
305 }
306 }
307 }
308 }
309
310 /* 2. Add positions, as long as this decreases the duplicates count. */
311 int imax = (_max_key_len - 1 < Positions::MAX_KEY_POS - 1
312 ? _max_key_len - 1 : Positions::MAX_KEY_POS - 1);
313 Positions current = mandatory;
314 unsigned int current_duplicates_count =
315 count_duplicates_tuple (current, alpha_unify);
316 for (;;)
317 {
318 Positions best;
319 unsigned int best_duplicates_count = UINT_MAX;
320
321 for (int i = imax; i >= -1; i--)
322 if (!current.contains (i))
323 {
324 Positions tryal = current;
325 tryal.add (i);
326 unsigned int try_duplicates_count =
327 count_duplicates_tuple (tryal, alpha_unify);
328
329 /* We prefer 'try' to 'best' if it produces less duplicates,
330 or if it produces the same number of duplicates but with
331 a more efficient hash function. */
332 if (try_duplicates_count < best_duplicates_count
333 || (try_duplicates_count == best_duplicates_count && i >= 0))
334 {
335 best = tryal;
336 best_duplicates_count = try_duplicates_count;
337 }
338 }
339
340 /* Stop adding positions when it gives no improvement. */
341 if (best_duplicates_count >= current_duplicates_count)
342 break;
343
344 current = best;
345 current_duplicates_count = best_duplicates_count;
346 }
347
348 /* 3. Remove positions, as long as this doesn't increase the duplicates
349 count. */
350 for (;;)
351 {
352 Positions best;
353 unsigned int best_duplicates_count = UINT_MAX;
354
355 for (int i = imax; i >= -1; i--)
356 if (current.contains (i) && !mandatory.contains (i))
357 {
358 Positions tryal = current;
359 tryal.remove (i);
360 unsigned int try_duplicates_count =
361 count_duplicates_tuple (tryal, alpha_unify);
362
363 /* We prefer 'try' to 'best' if it produces less duplicates,
364 or if it produces the same number of duplicates but with
365 a more efficient hash function. */
366 if (try_duplicates_count < best_duplicates_count
367 || (try_duplicates_count == best_duplicates_count && i == -1))
368 {
369 best = tryal;
370 best_duplicates_count = try_duplicates_count;
371 }
372 }
373
374 /* Stop removing positions when it gives no improvement. */
375 if (best_duplicates_count > current_duplicates_count)
376 break;
377
378 current = best;
379 current_duplicates_count = best_duplicates_count;
380 }
381
382 /* 4. Replace two positions by one, as long as this doesn't increase the
383 duplicates count. */
384 for (;;)
385 {
386 Positions best;
387 unsigned int best_duplicates_count = UINT_MAX;
388
389 for (int i1 = imax; i1 >= -1; i1--)
390 if (current.contains (i1) && !mandatory.contains (i1))
391 for (int i2 = imax; i2 >= -1; i2--)
392 if (current.contains (i2) && !mandatory.contains (i2) && i2 != i1)
393 for (int i3 = imax; i3 >= 0; i3--)
394 if (!current.contains (i3))
395 {
396 Positions tryal = current;
397 tryal.remove (i1);
398 tryal.remove (i2);
399 tryal.add (i3);
400 unsigned int try_duplicates_count =
401 count_duplicates_tuple (tryal, alpha_unify);
402
403 /* We prefer 'try' to 'best' if it produces less duplicates,
404 or if it produces the same number of duplicates but with
405 a more efficient hash function. */
406 if (try_duplicates_count < best_duplicates_count
407 || (try_duplicates_count == best_duplicates_count
408 && (i1 == -1 || i2 == -1 || i3 >= 0)))
409 {
410 best = tryal;
411 best_duplicates_count = try_duplicates_count;
412 }
413 }
414
415 /* Stop removing positions when it gives no improvement. */
416 if (best_duplicates_count > current_duplicates_count)
417 break;
418
419 current = best;
420 current_duplicates_count = best_duplicates_count;
421 }
422
423 /* That's it. Hope it's good enough. */
424 _key_positions = current;
425
426 if (option[DEBUG])
427 {
428 /* Print the result. */
429 fprintf (stderr, "\nComputed positions: ");
430 PositionReverseIterator iter = _key_positions.reviterator();
431 bool seen_lastchar = false;
432 bool first = true;
433 for (int i; (i = iter.next ()) != PositionReverseIterator::EOS; )
434 {
435 if (!first)
436 fprintf (stderr, ", ");
437 if (i == Positions::LASTCHAR)
438 seen_lastchar = true;
439 else
440 {
441 fprintf (stderr, "%d", i + 1);
442 first = false;
443 }
444 }
445 if (seen_lastchar)
446 {
447 if (!first)
448 fprintf (stderr, ", ");
449 fprintf (stderr, "$");
450 }
451 fprintf (stderr, "\n");
452 }
453
454 /* Free preliminary alpha_unify table. */
455 delete[] alpha_unify;
456 }
457
458 /* Count the duplicate keywords that occur with the found set of positions.
459 In other words, it returns the difference
460 # K - # proj1 (K)
461 where K is the multiset of given keywords. */
462 unsigned int
count_duplicates_tuple() const463 Search::count_duplicates_tuple () const
464 {
465 unsigned int *alpha_unify = compute_alpha_unify ();
466 unsigned int count = count_duplicates_tuple (_key_positions, alpha_unify);
467 delete[] alpha_unify;
468 return count;
469 }
470
471 /* ===================== Finding good alpha increments ===================== */
472
473 /* Computes the upper bound on the indices passed to asso_values[]. */
474 unsigned int
compute_alpha_size(const unsigned int * alpha_inc) const475 Search::compute_alpha_size (const unsigned int *alpha_inc) const
476 {
477 unsigned int max_alpha_inc = 0;
478 for (int i = 0; i < _max_key_len; i++)
479 if (max_alpha_inc < alpha_inc[i])
480 max_alpha_inc = alpha_inc[i];
481 return (option[SEVENBIT] ? 128 : 256) + max_alpha_inc;
482 }
483
484 /* Computes the unification rules between different asso_values[c]. */
485 unsigned int *
compute_alpha_unify(const Positions & positions,const unsigned int * alpha_inc) const486 Search::compute_alpha_unify (const Positions& positions, const unsigned int *alpha_inc) const
487 {
488 if (option[UPPERLOWER])
489 {
490 /* Without alpha increments, we would simply unify
491 'A' -> 'a', ..., 'Z' -> 'z'.
492 But when a keyword contains at position i a character c,
493 we have the constraint
494 asso_values[tolower(c) + alpha_inc[i]] ==
495 asso_values[toupper(c) + alpha_inc[i]].
496 This introduces a unification
497 toupper(c) + alpha_inc[i] -> tolower(c) + alpha_inc[i].
498 Note that this unification can extend outside the range of
499 ASCII letters! But still every unified character pair is at
500 a distance of 'a'-'A' = 32, or (after chained unification)
501 at a multiple of 32. So in the end the alpha_unify vector has
502 the form c -> c + 32 * f(c) where f(c) is a nonnegative
503 integer. */
504 unsigned int alpha_size = compute_alpha_size (alpha_inc);
505
506 unsigned int *alpha_unify = new unsigned int[alpha_size];
507 for (unsigned int c = 0; c < alpha_size; c++)
508 alpha_unify[c] = c;
509
510 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
511 {
512 KeywordExt *keyword = temp->first();
513
514 /* Iterate through the selected character positions. */
515 PositionIterator iter = positions.iterator(keyword->_allchars_length);
516
517 for (int i; (i = iter.next ()) != PositionIterator::EOS; )
518 {
519 unsigned int c;
520 if (i == Positions::LASTCHAR)
521 c = static_cast<unsigned char>(keyword->_allchars[keyword->_allchars_length - 1]);
522 else if (i < keyword->_allchars_length)
523 c = static_cast<unsigned char>(keyword->_allchars[i]);
524 else
525 abort ();
526 if (c >= 'A' && c <= 'Z')
527 c += 'a' - 'A';
528 if (c >= 'a' && c <= 'z')
529 {
530 if (i != Positions::LASTCHAR)
531 c += alpha_inc[i];
532 /* Unify c with c - ('a'-'A'). */
533 unsigned int d = alpha_unify[c];
534 unsigned int b = c - ('a'-'A');
535 for (int a = b; a >= 0 && alpha_unify[a] == b; a -= ('a'-'A'))
536 alpha_unify[a] = d;
537 }
538 }
539 }
540 return alpha_unify;
541 }
542 else
543 /* Identity mapping. */
544 return NULL;
545 }
546
547 /* Initializes each keyword's _selchars array. */
548 void
init_selchars_multiset(const Positions & positions,const unsigned int * alpha_unify,const unsigned int * alpha_inc) const549 Search::init_selchars_multiset (const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc) const
550 {
551 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
552 temp->first()->init_selchars_multiset(positions, alpha_unify, alpha_inc);
553 }
554
555 /* Count the duplicate keywords that occur with the given set of positions
556 and a given alpha_inc[] array.
557 In other words, it returns the difference
558 # K - # proj2 (proj1 (K))
559 where K is the multiset of given keywords. */
560 unsigned int
count_duplicates_multiset(const unsigned int * alpha_inc) const561 Search::count_duplicates_multiset (const unsigned int *alpha_inc) const
562 {
563 /* Run through the keyword list and count the duplicates incrementally.
564 The result does not depend on the order of the keyword list, thanks to
565 the formula above. */
566 unsigned int *alpha_unify = compute_alpha_unify (_key_positions, alpha_inc);
567 init_selchars_multiset (_key_positions, alpha_unify, alpha_inc);
568
569 unsigned int count = 0;
570 {
571 Hash_Table representatives (_total_keys, option[NOLENGTH]);
572 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
573 {
574 KeywordExt *keyword = temp->first();
575 if (representatives.insert (keyword))
576 count++;
577 }
578 }
579
580 delete_selchars ();
581 delete[] alpha_unify;
582
583 return count;
584 }
585
586 /* Find good _alpha_inc[]. */
587
588 void
find_alpha_inc()589 Search::find_alpha_inc ()
590 {
591 /* The goal is to choose _alpha_inc[] such that it doesn't introduce
592 artificial duplicates.
593 In other words, the goal is # proj2 (proj1 (K)) = # proj1 (K). */
594 unsigned int duplicates_goal = count_duplicates_tuple ();
595
596 /* Start with zero increments. This is sufficient in most cases. */
597 unsigned int *current = new unsigned int [_max_key_len];
598 for (int i = 0; i < _max_key_len; i++)
599 current[i] = 0;
600 unsigned int current_duplicates_count = count_duplicates_multiset (current);
601
602 if (current_duplicates_count > duplicates_goal)
603 {
604 /* Look which _alpha_inc[i] we are free to increment. */
605 unsigned int nindices;
606 {
607 nindices = 0;
608 PositionIterator iter = _key_positions.iterator(_max_key_len);
609 for (;;)
610 {
611 int key_pos = iter.next ();
612 if (key_pos == PositionIterator::EOS)
613 break;
614 if (key_pos != Positions::LASTCHAR)
615 nindices++;
616 }
617 }
618
619 unsigned int *indices = new unsigned int[nindices];
620 {
621 unsigned int j = 0;
622 PositionIterator iter = _key_positions.iterator(_max_key_len);
623 for (;;)
624 {
625 int key_pos = iter.next ();
626 if (key_pos == PositionIterator::EOS)
627 break;
628 if (key_pos != Positions::LASTCHAR)
629 indices[j++] = key_pos;
630 }
631 if (!(j == nindices))
632 abort ();
633 }
634
635 /* Perform several rounds of searching for a good alpha increment.
636 Each round reduces the number of artificial collisions by adding
637 an increment in a single key position. */
638 unsigned int *best = new unsigned int[_max_key_len];
639 unsigned int *tryal = new unsigned int[_max_key_len];
640 do
641 {
642 /* An increment of 1 is not always enough. Try higher increments
643 also. */
644 for (unsigned int inc = 1; ; inc++)
645 {
646 unsigned int best_duplicates_count = UINT_MAX;
647
648 for (unsigned int j = 0; j < nindices; j++)
649 {
650 memcpy (tryal, current, _max_key_len * sizeof (unsigned int));
651 tryal[indices[j]] += inc;
652 unsigned int try_duplicates_count =
653 count_duplicates_multiset (tryal);
654
655 /* We prefer 'try' to 'best' if it produces less
656 duplicates. */
657 if (try_duplicates_count < best_duplicates_count)
658 {
659 memcpy (best, tryal, _max_key_len * sizeof (unsigned int));
660 best_duplicates_count = try_duplicates_count;
661 }
662 }
663
664 /* Stop this round when we got an improvement. */
665 if (best_duplicates_count < current_duplicates_count)
666 {
667 memcpy (current, best, _max_key_len * sizeof (unsigned int));
668 current_duplicates_count = best_duplicates_count;
669 break;
670 }
671 }
672 }
673 while (current_duplicates_count > duplicates_goal);
674 delete[] tryal;
675 delete[] best;
676
677 if (option[DEBUG])
678 {
679 /* Print the result. */
680 fprintf (stderr, "\nComputed alpha increments: ");
681 bool first = true;
682 for (unsigned int j = nindices; j-- > 0; )
683 if (current[indices[j]] != 0)
684 {
685 if (!first)
686 fprintf (stderr, ", ");
687 fprintf (stderr, "%u:+%u",
688 indices[j] + 1, current[indices[j]]);
689 first = false;
690 }
691 fprintf (stderr, "\n");
692 }
693 delete[] indices;
694 }
695
696 _alpha_inc = current;
697 _alpha_size = compute_alpha_size (_alpha_inc);
698 _alpha_unify = compute_alpha_unify (_key_positions, _alpha_inc);
699 }
700
701 /* ======================= Finding good asso_values ======================== */
702
703 /* Initializes the asso_values[] related parameters. */
704
705 void
prepare_asso_values()706 Search::prepare_asso_values ()
707 {
708 KeywordExt_List *temp;
709
710 /* Initialize each keyword's _selchars array. */
711 init_selchars_multiset(_key_positions, _alpha_unify, _alpha_inc);
712
713 /* Compute the maximum _selchars_length over all keywords. */
714 _max_selchars_length = _key_positions.iterator(_max_key_len).remaining();
715
716 /* Check for duplicates, i.e. keywords with the same _selchars array
717 (and - if !option[NOLENGTH] - also the same length).
718 We deal with these by building an equivalence class, so that only
719 1 keyword is representative of the entire collection. Only this
720 representative remains in the keyword list; the others are accessible
721 through the _duplicate_link chain, starting at the representative.
722 This *greatly* simplifies processing during later stages of the program.
723 Set _total_duplicates and _list_len = _total_keys - _total_duplicates. */
724 {
725 _list_len = _total_keys;
726 _total_duplicates = 0;
727 /* Make hash table for efficiency. */
728 Hash_Table representatives (_list_len, option[NOLENGTH]);
729
730 KeywordExt_List *prev = NULL; /* list node before temp */
731 for (temp = _head; temp; )
732 {
733 KeywordExt *keyword = temp->first();
734 KeywordExt *other_keyword = representatives.insert (keyword);
735 KeywordExt_List *garbage = NULL;
736
737 if (other_keyword)
738 {
739 _total_duplicates++;
740 _list_len--;
741 /* Remove keyword from the main list. */
742 prev->rest() = temp->rest();
743 garbage = temp;
744 /* And insert it on other_keyword's duplicate list. */
745 keyword->_duplicate_link = other_keyword->_duplicate_link;
746 other_keyword->_duplicate_link = keyword;
747
748 /* Complain if user hasn't enabled the duplicate option. */
749 if (!option[DUP] || option[DEBUG])
750 {
751 fprintf (stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"",
752 keyword->_allchars_length, keyword->_allchars,
753 other_keyword->_allchars_length, other_keyword->_allchars);
754 for (int j = 0; j < keyword->_selchars_length; j++)
755 putc (keyword->_selchars[j], stderr);
756 fprintf (stderr, "\".\n");
757 }
758 }
759 else
760 {
761 keyword->_duplicate_link = NULL;
762 prev = temp;
763 }
764 temp = temp->rest();
765 if (garbage)
766 delete garbage;
767 }
768 if (option[DEBUG])
769 representatives.dump();
770 }
771
772 /* Exit program if duplicates exists and option[DUP] not set, since we
773 don't want to continue in this case. (We don't want to turn on
774 option[DUP] implicitly, because the generated code is usually much
775 slower. */
776 if (_total_duplicates)
777 {
778 if (option[DUP])
779 fprintf (stderr, "%d input keys have identical hash values, examine output carefully...\n",
780 _total_duplicates);
781 else
782 {
783 fprintf (stderr, "%d input keys have identical hash values,\n",
784 _total_duplicates);
785 if (option[POSITIONS])
786 fprintf (stderr, "try different key positions or use option -D.\n");
787 else
788 fprintf (stderr, "use option -D.\n");
789 exit (1);
790 }
791 }
792
793 /* Compute the occurrences of each character in the alphabet. */
794 _occurrences = new int[_alpha_size];
795 memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0]));
796 for (temp = _head; temp; temp = temp->rest())
797 {
798 KeywordExt *keyword = temp->first();
799 const unsigned int *ptr = keyword->_selchars;
800 for (int count = keyword->_selchars_length; count > 0; ptr++, count--)
801 _occurrences[*ptr]++;
802 }
803
804 /* Memory allocation. */
805 _asso_values = new int[_alpha_size];
806
807 int non_linked_length = _list_len;
808 unsigned int asso_value_max;
809
810 asso_value_max =
811 static_cast<unsigned int>(non_linked_length * option.get_size_multiple());
812 /* Round up to the next power of two. This makes it easy to ensure
813 an _asso_value[c] is >= 0 and < asso_value_max. Also, the jump value
814 being odd, it guarantees that Search::try_asso_value() will iterate
815 through different values for _asso_value[c]. */
816 if (asso_value_max == 0)
817 asso_value_max = 1;
818 asso_value_max |= asso_value_max >> 1;
819 asso_value_max |= asso_value_max >> 2;
820 asso_value_max |= asso_value_max >> 4;
821 asso_value_max |= asso_value_max >> 8;
822 asso_value_max |= asso_value_max >> 16;
823 asso_value_max++;
824 _asso_value_max = asso_value_max;
825
826 /* Given the bound for _asso_values[c], we have a bound for the possible
827 hash values, as computed in compute_hash(). */
828 _max_hash_value = (option[NOLENGTH] ? 0 : _max_key_len)
829 + (_asso_value_max - 1) * _max_selchars_length;
830 /* Allocate a sparse bit vector for detection of collisions of hash
831 values. */
832 _collision_detector = new Bool_Array (_max_hash_value + 1);
833
834 if (option[DEBUG])
835 {
836 fprintf (stderr, "total non-linked keys = %d\nmaximum associated value is %d"
837 "\nmaximum size of generated hash table is %d\n",
838 non_linked_length, asso_value_max, _max_hash_value);
839
840 int field_width;
841
842 field_width = 0;
843 {
844 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
845 {
846 KeywordExt *keyword = temp->first();
847 if (field_width < keyword->_selchars_length)
848 field_width = keyword->_selchars_length;
849 }
850 }
851
852 fprintf (stderr, "\ndumping the keyword list without duplicates\n");
853 fprintf (stderr, "keyword #, %*s, keyword\n", field_width, "keysig");
854 int i = 0;
855 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
856 {
857 KeywordExt *keyword = temp->first();
858 fprintf (stderr, "%9d, ", ++i);
859 if (field_width > keyword->_selchars_length)
860 fprintf (stderr, "%*s", field_width - keyword->_selchars_length, "");
861 for (int j = 0; j < keyword->_selchars_length; j++)
862 putc (keyword->_selchars[j], stderr);
863 fprintf (stderr, ", %.*s\n",
864 keyword->_allchars_length, keyword->_allchars);
865 }
866 fprintf (stderr, "\nend of keyword list\n\n");
867 }
868
869 if (option[RANDOM] || option.get_jump () == 0)
870 /* We will use rand(), so initialize the random number generator. */
871 srand (static_cast<long>(time (0)));
872
873 _initial_asso_value = (option[RANDOM] ? -1 : option.get_initial_asso_value ());
874 _jump = option.get_jump ();
875 }
876
877 /* Finds some _asso_values[] that fit. */
878
879 /* The idea is to choose the _asso_values[] one by one, in a way that
880 a choice that has been made never needs to be undone later. This
881 means that we split the work into several steps. Each step chooses
882 one or more _asso_values[c]. The result of choosing one or more
883 _asso_values[c] is that the partitioning of the keyword set gets
884 broader.
885 Look at this partitioning: After every step, the _asso_values[] of a
886 certain set C of characters are undetermined. (At the beginning, C
887 is the set of characters c with _occurrences[c] > 0. At the end, C
888 is empty.) To each keyword K, we associate the multiset of _selchars
889 for which the _asso_values[] are undetermined:
890 K --> K->_selchars intersect C.
891 Consider two keywords equivalent if their value under this mapping is
892 the same. This introduces an equivalence relation on the set of
893 keywords. The equivalence classes partition the keyword set. (At the
894 beginning, the partition is the finest possible: each K is an equivalence
895 class by itself, because all K have a different _selchars. At the end,
896 all K have been merged into a single equivalence class.)
897 The partition before a step is always a refinement of the partition
898 after the step.
899 We choose the steps in such a way that the partition really becomes
900 broader at each step. (A step that only chooses an _asso_values[c]
901 without changing the partition is better merged with the previous step,
902 to avoid useless backtracking.) */
903
904 struct EquivalenceClass
905 {
906 /* The keywords in this equivalence class. */
907 KeywordExt_List * _keywords;
908 KeywordExt_List * _keywords_last;
909 /* The number of keywords in this equivalence class. */
910 unsigned int _cardinality;
911 /* The undetermined selected characters for the keywords in this
912 equivalence class, as a canonically reordered multiset. */
913 unsigned int * _undetermined_chars;
914 unsigned int _undetermined_chars_length;
915
916 EquivalenceClass * _next;
917 };
918
919 struct Step
920 {
921 /* The characters whose values are being determined in this step. */
922 unsigned int _changing_count;
923 unsigned int * _changing;
924 /* Exclusive upper bound for the _asso_values[c] of this step.
925 A power of 2. */
926 unsigned int _asso_value_max;
927 /* The characters whose values will be determined after this step. */
928 bool * _undetermined;
929 /* The keyword set partition after this step. */
930 EquivalenceClass * _partition;
931 /* The expected number of iterations in this step. */
932 double _expected_lower;
933 double _expected_upper;
934
935 Step * _next;
936 };
937
938 static inline bool
equals(const unsigned int * ptr1,const unsigned int * ptr2,unsigned int len)939 equals (const unsigned int *ptr1, const unsigned int *ptr2, unsigned int len)
940 {
941 while (len > 0)
942 {
943 if (*ptr1 != *ptr2)
944 return false;
945 ptr1++;
946 ptr2++;
947 len--;
948 }
949 return true;
950 }
951
952 EquivalenceClass *
compute_partition(bool * undetermined) const953 Search::compute_partition (bool *undetermined) const
954 {
955 EquivalenceClass *partition = NULL;
956 EquivalenceClass *partition_last = NULL;
957 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
958 {
959 KeywordExt *keyword = temp->first();
960
961 /* Compute the undetermined characters for this keyword. */
962 unsigned int *undetermined_chars =
963 new unsigned int[keyword->_selchars_length];
964 unsigned int undetermined_chars_length = 0;
965
966 for (int i = 0; i < keyword->_selchars_length; i++)
967 if (undetermined[keyword->_selchars[i]])
968 undetermined_chars[undetermined_chars_length++] = keyword->_selchars[i];
969
970 /* Look up the equivalence class to which this keyword belongs. */
971 EquivalenceClass *equclass;
972 for (equclass = partition; equclass; equclass = equclass->_next)
973 if (equclass->_undetermined_chars_length == undetermined_chars_length
974 && equals (equclass->_undetermined_chars, undetermined_chars,
975 undetermined_chars_length))
976 break;
977 if (equclass == NULL)
978 {
979 equclass = new EquivalenceClass();
980 equclass->_keywords = NULL;
981 equclass->_keywords_last = NULL;
982 equclass->_cardinality = 0;
983 equclass->_undetermined_chars = undetermined_chars;
984 equclass->_undetermined_chars_length = undetermined_chars_length;
985 equclass->_next = NULL;
986 if (partition)
987 partition_last->_next = equclass;
988 else
989 partition = equclass;
990 partition_last = equclass;
991 }
992 else
993 delete[] undetermined_chars;
994
995 /* Add the keyword to the equivalence class. */
996 KeywordExt_List *cons = new KeywordExt_List(keyword);
997 if (equclass->_keywords)
998 equclass->_keywords_last->rest() = cons;
999 else
1000 equclass->_keywords = cons;
1001 equclass->_keywords_last = cons;
1002 equclass->_cardinality++;
1003 }
1004
1005 /* Free some of the allocated memory. The caller doesn't need it. */
1006 for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
1007 delete[] cls->_undetermined_chars;
1008
1009 return partition;
1010 }
1011
1012 static void
delete_partition(EquivalenceClass * partition)1013 delete_partition (EquivalenceClass *partition)
1014 {
1015 while (partition != NULL)
1016 {
1017 EquivalenceClass *equclass = partition;
1018 partition = equclass->_next;
1019 delete_list (equclass->_keywords);
1020 //delete[] equclass->_undetermined_chars; // already freed above
1021 delete equclass;
1022 }
1023 }
1024
1025 /* Compute the possible number of collisions when _asso_values[c] is
1026 chosen, leading to the given partition. */
1027 unsigned int
count_possible_collisions(EquivalenceClass * partition,unsigned int c) const1028 Search::count_possible_collisions (EquivalenceClass *partition, unsigned int c) const
1029 {
1030 /* Every equivalence class p is split according to the frequency of
1031 occurrence of c, leading to equivalence classes p1, p2, ...
1032 This leads to (|p|^2 - |p1|^2 - |p2|^2 - ...)/2 possible collisions.
1033 Return the sum of this expression over all equivalence classes. */
1034 unsigned int sum = 0;
1035 unsigned int m = _max_selchars_length;
1036 unsigned int *split_cardinalities = new unsigned int[m + 1];
1037 for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
1038 {
1039 for (unsigned int i = 0; i <= m; i++)
1040 split_cardinalities[i] = 0;
1041
1042 for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
1043 {
1044 KeywordExt *keyword = temp->first();
1045
1046 unsigned int count = 0;
1047 for (int i = 0; i < keyword->_selchars_length; i++)
1048 if (keyword->_selchars[i] == c)
1049 count++;
1050
1051 split_cardinalities[count]++;
1052 }
1053
1054 sum += cls->_cardinality * cls->_cardinality;
1055 for (unsigned int i = 0; i <= m; i++)
1056 sum -= split_cardinalities[i] * split_cardinalities[i];
1057 }
1058 delete[] split_cardinalities;
1059 return sum;
1060 }
1061
1062 /* Test whether adding c to the undetermined characters changes the given
1063 partition. */
1064 bool
unchanged_partition(EquivalenceClass * partition,unsigned int c) const1065 Search::unchanged_partition (EquivalenceClass *partition, unsigned int c) const
1066 {
1067 for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
1068 {
1069 unsigned int first_count = UINT_MAX;
1070
1071 for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
1072 {
1073 KeywordExt *keyword = temp->first();
1074
1075 unsigned int count = 0;
1076 for (int i = 0; i < keyword->_selchars_length; i++)
1077 if (keyword->_selchars[i] == c)
1078 count++;
1079
1080 if (temp == cls->_keywords)
1081 first_count = count;
1082 else if (count != first_count)
1083 /* c would split this equivalence class. */
1084 return false;
1085 }
1086 }
1087 return true;
1088 }
1089
1090 void
find_asso_values()1091 Search::find_asso_values ()
1092 {
1093 Step *steps;
1094
1095 /* Determine the steps, starting with the last one. */
1096 {
1097 bool *undetermined;
1098 bool *determined;
1099
1100 steps = NULL;
1101
1102 undetermined = new bool[_alpha_size];
1103 for (unsigned int c = 0; c < _alpha_size; c++)
1104 undetermined[c] = false;
1105
1106 determined = new bool[_alpha_size];
1107 for (unsigned int c = 0; c < _alpha_size; c++)
1108 determined[c] = true;
1109
1110 for (;;)
1111 {
1112 /* Compute the partition that needs to be refined. */
1113 EquivalenceClass *partition = compute_partition (undetermined);
1114
1115 /* Determine the main character to be chosen in this step.
1116 Choosing such a character c has the effect of splitting every
1117 equivalence class (according the the frequency of occurrence of c).
1118 We choose the c with the minimum number of possible collisions,
1119 so that characters which lead to a large number of collisions get
1120 handled early during the search. */
1121 unsigned int chosen_c;
1122 unsigned int chosen_possible_collisions;
1123 {
1124 unsigned int best_c = 0;
1125 unsigned int best_possible_collisions = UINT_MAX;
1126 for (unsigned int c = 0; c < _alpha_size; c++)
1127 if (_occurrences[c] > 0 && determined[c])
1128 {
1129 unsigned int possible_collisions =
1130 count_possible_collisions (partition, c);
1131 if (possible_collisions < best_possible_collisions)
1132 {
1133 best_c = c;
1134 best_possible_collisions = possible_collisions;
1135 }
1136 }
1137 if (best_possible_collisions == UINT_MAX)
1138 {
1139 /* All c with _occurrences[c] > 0 are undetermined. We are
1140 are the starting situation and don't need any more step. */
1141 delete_partition (partition);
1142 break;
1143 }
1144 chosen_c = best_c;
1145 chosen_possible_collisions = best_possible_collisions;
1146 }
1147
1148 /* We need one more step. */
1149 Step *step = new Step();
1150
1151 step->_undetermined = new bool[_alpha_size];
1152 memcpy (step->_undetermined, undetermined, _alpha_size*sizeof(bool));
1153
1154 step->_partition = partition;
1155
1156 /* Now determine how the equivalence classes will be before this
1157 step. */
1158 undetermined[chosen_c] = true;
1159 partition = compute_partition (undetermined);
1160
1161 /* Now determine which other characters should be determined in this
1162 step, because they will not change the equivalence classes at
1163 this point. It is the set of all c which, for all equivalence
1164 classes, have the same frequency of occurrence in every keyword
1165 of the equivalence class. */
1166 for (unsigned int c = 0; c < _alpha_size; c++)
1167 if (_occurrences[c] > 0 && determined[c]
1168 && unchanged_partition (partition, c))
1169 {
1170 undetermined[c] = true;
1171 determined[c] = false;
1172 }
1173
1174 /* main_c must be one of these. */
1175 if (determined[chosen_c])
1176 abort ();
1177
1178 /* Now the set of changing characters of this step. */
1179 unsigned int changing_count;
1180
1181 changing_count = 0;
1182 for (unsigned int c = 0; c < _alpha_size; c++)
1183 if (undetermined[c] && !step->_undetermined[c])
1184 changing_count++;
1185
1186 unsigned int *changing = new unsigned int[changing_count];
1187 changing_count = 0;
1188 for (unsigned int c = 0; c < _alpha_size; c++)
1189 if (undetermined[c] && !step->_undetermined[c])
1190 changing[changing_count++] = c;
1191
1192 step->_changing = changing;
1193 step->_changing_count = changing_count;
1194
1195 step->_asso_value_max = _asso_value_max;
1196
1197 step->_expected_lower =
1198 exp (static_cast<double>(chosen_possible_collisions)
1199 / static_cast<double>(_max_hash_value));
1200 step->_expected_upper =
1201 exp (static_cast<double>(chosen_possible_collisions)
1202 / static_cast<double>(_asso_value_max));
1203
1204 delete_partition (partition);
1205
1206 step->_next = steps;
1207 steps = step;
1208 }
1209
1210 delete[] determined;
1211 delete[] undetermined;
1212 }
1213
1214 if (option[DEBUG])
1215 {
1216 unsigned int stepno = 0;
1217 for (Step *step = steps; step; step = step->_next)
1218 {
1219 stepno++;
1220 fprintf (stderr, "Step %u chooses _asso_values[", stepno);
1221 for (unsigned int i = 0; i < step->_changing_count; i++)
1222 {
1223 if (i > 0)
1224 fprintf (stderr, ",");
1225 fprintf (stderr, "'%c'", step->_changing[i]);
1226 }
1227 fprintf (stderr, "], expected number of iterations between %g and %g.\n",
1228 step->_expected_lower, step->_expected_upper);
1229 fprintf (stderr, "Keyword equivalence classes:\n");
1230 for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next)
1231 {
1232 fprintf (stderr, "\n");
1233 for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
1234 {
1235 KeywordExt *keyword = temp->first();
1236 fprintf (stderr, " %.*s\n",
1237 keyword->_allchars_length, keyword->_allchars);
1238 }
1239 }
1240 fprintf (stderr, "\n");
1241 }
1242 }
1243
1244 /* Initialize _asso_values[]. (The value given here matters only
1245 for those c which occur in all keywords with equal multiplicity.) */
1246 for (unsigned int c = 0; c < _alpha_size; c++)
1247 _asso_values[c] = 0;
1248
1249 unsigned int stepno = 0;
1250 for (Step *step = steps; step; step = step->_next)
1251 {
1252 stepno++;
1253
1254 /* Initialize the asso_values[]. */
1255 unsigned int k = step->_changing_count;
1256 for (unsigned int i = 0; i < k; i++)
1257 {
1258 unsigned int c = step->_changing[i];
1259 _asso_values[c] =
1260 (_initial_asso_value < 0 ? rand () : _initial_asso_value)
1261 & (step->_asso_value_max - 1);
1262 }
1263
1264 unsigned int iterations = 0;
1265 unsigned int *iter = new unsigned int[k];
1266 for (unsigned int i = 0; i < k; i++)
1267 iter[i] = 0;
1268 unsigned int ii = (_jump != 0 ? k - 1 : 0);
1269
1270 for (;;)
1271 {
1272 /* Test whether these asso_values[] lead to collisions among
1273 the equivalence classes that should be collision-free. */
1274 bool has_collision = false;
1275 for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next)
1276 {
1277 /* Iteration Number array is a win, O(1) initialization time! */
1278 _collision_detector->clear ();
1279
1280 for (KeywordExt_List *ptr = cls->_keywords; ptr; ptr = ptr->rest())
1281 {
1282 KeywordExt *keyword = ptr->first();
1283
1284 /* Compute the new hash code for the keyword, leaving apart
1285 the yet undetermined asso_values[]. */
1286 int hashcode;
1287 {
1288 int sum = option[NOLENGTH] ? 0 : keyword->_allchars_length;
1289 const unsigned int *p = keyword->_selchars;
1290 int i = keyword->_selchars_length;
1291 for (; i > 0; p++, i--)
1292 if (!step->_undetermined[*p])
1293 sum += _asso_values[*p];
1294 hashcode = sum;
1295 }
1296
1297 /* See whether it collides with another keyword's hash code,
1298 from the same equivalence class. */
1299 if (_collision_detector->set_bit (hashcode))
1300 {
1301 has_collision = true;
1302 break;
1303 }
1304 }
1305
1306 /* Don't need to continue looking at the other equivalence
1307 classes if we already have found a collision. */
1308 if (has_collision)
1309 break;
1310 }
1311
1312 iterations++;
1313 if (!has_collision)
1314 break;
1315
1316 /* Try other asso_values[]. */
1317 if (_jump != 0)
1318 {
1319 /* The way we try various values for
1320 asso_values[step->_changing[0],...step->_changing[k-1]]
1321 is like this:
1322 for (bound = 0,1,...)
1323 for (ii = 0,...,k-1)
1324 iter[ii] := bound
1325 iter[0..ii-1] := values <= bound
1326 iter[ii+1..k-1] := values < bound
1327 and
1328 asso_values[step->_changing[i]] =
1329 _initial_asso_value + iter[i] * _jump.
1330 This makes it more likely to find small asso_values[].
1331 */
1332 unsigned int bound = iter[ii];
1333 unsigned int i = 0;
1334 while (i < ii)
1335 {
1336 unsigned int c = step->_changing[i];
1337 iter[i]++;
1338 _asso_values[c] =
1339 (_asso_values[c] + _jump) & (step->_asso_value_max - 1);
1340 if (iter[i] <= bound)
1341 goto found_next;
1342 _asso_values[c] =
1343 (_asso_values[c] - iter[i] * _jump)
1344 & (step->_asso_value_max - 1);
1345 iter[i] = 0;
1346 i++;
1347 }
1348 i = ii + 1;
1349 while (i < k)
1350 {
1351 unsigned int c = step->_changing[i];
1352 iter[i]++;
1353 _asso_values[c] =
1354 (_asso_values[c] + _jump) & (step->_asso_value_max - 1);
1355 if (iter[i] < bound)
1356 goto found_next;
1357 _asso_values[c] =
1358 (_asso_values[c] - iter[i] * _jump)
1359 & (step->_asso_value_max - 1);
1360 iter[i] = 0;
1361 i++;
1362 }
1363 /* Switch from one ii to the next. */
1364 {
1365 unsigned int c = step->_changing[ii];
1366 _asso_values[c] =
1367 (_asso_values[c] - bound * _jump)
1368 & (step->_asso_value_max - 1);
1369 iter[ii] = 0;
1370 }
1371 /* Here all iter[i] == 0. */
1372 ii++;
1373 if (ii == k)
1374 {
1375 ii = 0;
1376 bound++;
1377 if (bound == step->_asso_value_max)
1378 {
1379 /* Out of search space! We can either backtrack, or
1380 increase the available search space of this step.
1381 It seems simpler to choose the latter solution. */
1382 step->_asso_value_max = 2 * step->_asso_value_max;
1383 if (step->_asso_value_max > _asso_value_max)
1384 {
1385 _asso_value_max = step->_asso_value_max;
1386 /* Reinitialize _max_hash_value. */
1387 _max_hash_value =
1388 (option[NOLENGTH] ? 0 : _max_key_len)
1389 + (_asso_value_max - 1) * _max_selchars_length;
1390 /* Reinitialize _collision_detector. */
1391 delete _collision_detector;
1392 _collision_detector =
1393 new Bool_Array (_max_hash_value + 1);
1394 }
1395 }
1396 }
1397 {
1398 unsigned int c = step->_changing[ii];
1399 iter[ii] = bound;
1400 _asso_values[c] =
1401 (_asso_values[c] + bound * _jump)
1402 & (step->_asso_value_max - 1);
1403 }
1404 found_next: ;
1405 }
1406 else
1407 {
1408 /* Random. */
1409 unsigned int c = step->_changing[ii];
1410 _asso_values[c] =
1411 (_asso_values[c] + rand ()) & (step->_asso_value_max - 1);
1412 /* Next time, change the next c. */
1413 ii++;
1414 if (ii == k)
1415 ii = 0;
1416 }
1417 }
1418 delete[] iter;
1419
1420 if (option[DEBUG])
1421 {
1422 fprintf (stderr, "Step %u chose _asso_values[", stepno);
1423 for (unsigned int i = 0; i < step->_changing_count; i++)
1424 {
1425 if (i > 0)
1426 fprintf (stderr, ",");
1427 fprintf (stderr, "'%c'", step->_changing[i]);
1428 }
1429 fprintf (stderr, "] in %u iterations.\n", iterations);
1430 }
1431 }
1432
1433 /* Free allocated memory. */
1434 while (steps != NULL)
1435 {
1436 Step *step = steps;
1437 steps = step->_next;
1438 delete[] step->_changing;
1439 delete[] step->_undetermined;
1440 delete_partition (step->_partition);
1441 delete step;
1442 }
1443 }
1444
1445 /* Computes a keyword's hash value, relative to the current _asso_values[],
1446 and stores it in keyword->_hash_value. */
1447
1448 inline int
compute_hash(KeywordExt * keyword) const1449 Search::compute_hash (KeywordExt *keyword) const
1450 {
1451 int sum = option[NOLENGTH] ? 0 : keyword->_allchars_length;
1452
1453 const unsigned int *p = keyword->_selchars;
1454 int i = keyword->_selchars_length;
1455 for (; i > 0; p++, i--)
1456 sum += _asso_values[*p];
1457
1458 return keyword->_hash_value = sum;
1459 }
1460
1461 /* Finds good _asso_values[]. */
1462
1463 void
find_good_asso_values()1464 Search::find_good_asso_values ()
1465 {
1466 prepare_asso_values ();
1467
1468 /* Search for good _asso_values[]. */
1469 int asso_iteration;
1470 if ((asso_iteration = option.get_asso_iterations ()) == 0)
1471 /* Try only the given _initial_asso_value and _jump. */
1472 find_asso_values ();
1473 else
1474 {
1475 /* Try different pairs of _initial_asso_value and _jump, in the
1476 following order:
1477 (0, 1)
1478 (1, 1)
1479 (2, 1) (0, 3)
1480 (3, 1) (1, 3)
1481 (4, 1) (2, 3) (0, 5)
1482 (5, 1) (3, 3) (1, 5)
1483 ..... */
1484 KeywordExt_List *saved_head = _head;
1485 int best_initial_asso_value = 0;
1486 int best_jump = 1;
1487 int *best_asso_values = new int[_alpha_size];
1488 int best_collisions = INT_MAX;
1489 int best_max_hash_value = INT_MAX;
1490
1491 _initial_asso_value = 0; _jump = 1;
1492 for (;;)
1493 {
1494 /* Restore the keyword list in its original order. */
1495 _head = copy_list (saved_head);
1496 /* Find good _asso_values[]. */
1497 find_asso_values ();
1498 /* Test whether it is the best solution so far. */
1499 int collisions = 0;
1500 int max_hash_value = INT_MIN;
1501 _collision_detector->clear ();
1502 for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest())
1503 {
1504 KeywordExt *keyword = ptr->first();
1505 int hashcode = compute_hash (keyword);
1506 if (max_hash_value < hashcode)
1507 max_hash_value = hashcode;
1508 if (_collision_detector->set_bit (hashcode))
1509 collisions++;
1510 }
1511 if (collisions < best_collisions
1512 || (collisions == best_collisions
1513 && max_hash_value < best_max_hash_value))
1514 {
1515 memcpy (best_asso_values, _asso_values,
1516 _alpha_size * sizeof (_asso_values[0]));
1517 best_collisions = collisions;
1518 best_max_hash_value = max_hash_value;
1519 }
1520 /* Delete the copied keyword list. */
1521 delete_list (_head);
1522
1523 if (--asso_iteration == 0)
1524 break;
1525 /* Prepare for next iteration. */
1526 if (_initial_asso_value >= 2)
1527 _initial_asso_value -= 2, _jump += 2;
1528 else
1529 _initial_asso_value += _jump, _jump = 1;
1530 }
1531 _head = saved_head;
1532 /* Install the best found asso_values. */
1533 _initial_asso_value = best_initial_asso_value;
1534 _jump = best_jump;
1535 memcpy (_asso_values, best_asso_values,
1536 _alpha_size * sizeof (_asso_values[0]));
1537 delete[] best_asso_values;
1538 /* The keywords' _hash_value fields are recomputed below. */
1539 }
1540 }
1541
1542 /* ========================================================================= */
1543
1544 /* Comparison function for sorting by increasing _hash_value. */
1545 static bool
less_by_hash_value(KeywordExt * keyword1,KeywordExt * keyword2)1546 less_by_hash_value (KeywordExt *keyword1, KeywordExt *keyword2)
1547 {
1548 return keyword1->_hash_value < keyword2->_hash_value;
1549 }
1550
1551 /* Sorts the keyword list by hash value. */
1552
1553 void
sort()1554 Search::sort ()
1555 {
1556 _head = mergesort_list (_head, less_by_hash_value);
1557 }
1558
1559 void
optimize()1560 Search::optimize ()
1561 {
1562 /* Preparations. */
1563 prepare ();
1564
1565 /* Step 1: Finding good byte positions. */
1566 find_positions ();
1567
1568 /* Step 2: Finding good alpha increments. */
1569 find_alpha_inc ();
1570
1571 /* Step 3: Finding good asso_values. */
1572 find_good_asso_values ();
1573
1574 /* Make one final check, just to make sure nothing weird happened.... */
1575 _collision_detector->clear ();
1576 for (KeywordExt_List *curr_ptr = _head; curr_ptr; curr_ptr = curr_ptr->rest())
1577 {
1578 KeywordExt *curr = curr_ptr->first();
1579 unsigned int hashcode = compute_hash (curr);
1580 if (_collision_detector->set_bit (hashcode))
1581 {
1582 /* This shouldn't happen. proj1, proj2, proj3 must have been
1583 computed to be injective on the given keyword set. */
1584 fprintf (stderr,
1585 "\nInternal error, unexpected duplicate hash code\n");
1586 if (option[POSITIONS])
1587 fprintf (stderr, "try options -m or -r, or use new key positions.\n\n");
1588 else
1589 fprintf (stderr, "try options -m or -r.\n\n");
1590 exit (1);
1591 }
1592 }
1593
1594 /* Sorts the keyword list by hash value. */
1595 sort ();
1596
1597 /* Set unused asso_values[c] to max_hash_value + 1. This is not absolutely
1598 necessary, but speeds up the lookup function in many cases of lookup
1599 failure: no string comparison is needed once the hash value of a string
1600 is larger than the hash value of any keyword. */
1601 int max_hash_value;
1602 {
1603 KeywordExt_List *temp;
1604 for (temp = _head; temp->rest(); temp = temp->rest())
1605 ;
1606 max_hash_value = temp->first()->_hash_value;
1607 }
1608 for (unsigned int c = 0; c < _alpha_size; c++)
1609 if (_occurrences[c] == 0)
1610 _asso_values[c] = max_hash_value + 1;
1611
1612 /* Propagate unified asso_values. */
1613 if (_alpha_unify)
1614 for (unsigned int c = 0; c < _alpha_size; c++)
1615 if (_alpha_unify[c] != c)
1616 _asso_values[c] = _asso_values[_alpha_unify[c]];
1617 }
1618
1619 /* Prints out some diagnostics upon completion. */
1620
~Search()1621 Search::~Search ()
1622 {
1623 delete _collision_detector;
1624 if (option[DEBUG])
1625 {
1626 fprintf (stderr, "\ndumping occurrence and associated values tables\n");
1627
1628 for (unsigned int i = 0; i < _alpha_size; i++)
1629 if (_occurrences[i])
1630 fprintf (stderr, "asso_values[%c] = %6d, occurrences[%c] = %6d\n",
1631 i, _asso_values[i], i, _occurrences[i]);
1632
1633 fprintf (stderr, "end table dumping\n");
1634
1635 fprintf (stderr, "\nDumping key list information:\ntotal non-static linked keywords = %d"
1636 "\ntotal keywords = %d\ntotal duplicates = %d\nmaximum key length = %d\n",
1637 _list_len, _total_keys, _total_duplicates, _max_key_len);
1638
1639 int field_width = _max_selchars_length;
1640 fprintf (stderr, "\nList contents are:\n(hash value, key length, index, %*s, keyword):\n",
1641 field_width, "selchars");
1642 for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest())
1643 {
1644 fprintf (stderr, "%11d,%11d,%6d, ",
1645 ptr->first()->_hash_value, ptr->first()->_allchars_length, ptr->first()->_final_index);
1646 if (field_width > ptr->first()->_selchars_length)
1647 fprintf (stderr, "%*s", field_width - ptr->first()->_selchars_length, "");
1648 for (int j = 0; j < ptr->first()->_selchars_length; j++)
1649 putc (ptr->first()->_selchars[j], stderr);
1650 fprintf (stderr, ", %.*s\n",
1651 ptr->first()->_allchars_length, ptr->first()->_allchars);
1652 }
1653
1654 fprintf (stderr, "End dumping list.\n\n");
1655 }
1656 delete[] _asso_values;
1657 delete[] _occurrences;
1658 delete[] _alpha_unify;
1659 delete[] _alpha_inc;
1660 }
1661