1 /* Search algorithm.
2    Copyright (C) 1989-1998, 2000, 2002 Free Software Foundation, Inc.
3    Written by Douglas C. Schmidt <schmidt@ics.uci.edu>
4    and Bruno Haible <bruno@clisp.org>.
5 
6    This file is part of GNU GPERF.
7 
8    GNU GPERF is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 2, or (at your option)
11    any later version.
12 
13    GNU GPERF is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17 
18    You should have received a copy of the GNU General Public License
19    along with this program; see the file COPYING.
20    If not, write to the Free Software Foundation, Inc.,
21    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
22 
23 /* Specification. */
24 #include "search.h"
25 
26 #include <stdio.h>
27 #include <stdlib.h> /* declares exit(), rand(), srand() */
28 #include <string.h> /* declares memset(), memcmp() */
29 #include <time.h> /* declares time() */
30 #include <math.h> /* declares exp() */
31 #include <limits.h> /* defines INT_MIN, INT_MAX, UINT_MAX */
32 #include "options.h"
33 #include "hash-table.h"
34 
35 /* ================================ Theory ================================= */
36 
37 /* The general form of the hash function is
38 
39       hash (keyword) = sum (asso_values[keyword[i] + alpha_inc[i]] : i in Pos)
40                        + len (keyword)
41 
42    where Pos is a set of byte positions,
43    each alpha_inc[i] is a nonnegative integer,
44    each asso_values[c] is a nonnegative integer,
45    len (keyword) is the keyword's length if !option[NOLENGTH], or 0 otherwise.
46 
47    Theorem 1: If all keywords are different, there is a set Pos such that
48    all tuples (keyword[i] : i in Pos) are different.
49 
50    Theorem 2: If all tuples (keyword[i] : i in Pos) are different, there
51    are nonnegative integers alpha_inc[i] such that all multisets
52    {keyword[i] + alpha_inc[i] : i in Pos} are different.
53 
54    Define selchars[keyword] := {keyword[i] + alpha_inc[i] : i in Pos}.
55 
56    Theorem 3: If all multisets selchars[keyword] are different, there are
57    nonnegative integers asso_values[c] such that all hash values
58    sum (asso_values[c] : c in selchars[keyword]) are different.
59 
60    Based on these three facts, we find the hash function in three steps:
61 
62    Step 1 (Finding good byte positions):
63    Find a set Pos, as small as possible, such that all tuples
64    (keyword[i] : i in Pos) are different.
65 
66    Step 2 (Finding good alpha increments):
67    Find nonnegative integers alpha_inc[i], as many of them as possible being
68    zero, and the others being as small as possible, such that all multisets
69    {keyword[i] + alpha_inc[i] : i in Pos} are different.
70 
71    Step 3 (Finding good asso_values):
72    Find asso_values[c] such that all hash (keyword) are different.
73 
74    In other words, each step finds a projection that is injective on the
75    given finite set:
76      proj1 : String --> Map (Pos --> N)
77      proj2 : Map (Pos --> N) --> Map (Pos --> N) / S(Pos)
78      proj3 : Map (Pos --> N) / S(Pos) --> N
79    where
80      N denotes the set of nonnegative integers,
81      Map (A --> B) := Hom_Set (A, B) is the set of maps from A to B, and
82      S(Pos) is the symmetric group over Pos.
83 
84    This was the theory for option[NOLENGTH]; if !option[NOLENGTH], slight
85    modifications apply:
86      proj1 : String --> Map (Pos --> N) x N
87      proj2 : Map (Pos --> N) x N --> Map (Pos --> N) / S(Pos) x N
88      proj3 : Map (Pos --> N) / S(Pos) x N --> N
89 
90    For a case-insensitive hash function, the general form is
91 
92       hash (keyword) =
93         sum (asso_values[alpha_unify[keyword[i] + alpha_inc[i]]] : i in Pos)
94         + len (keyword)
95 
96    where alpha_unify[c] is chosen so that an upper/lower case change in
97    keyword[i] doesn't change  alpha_unify[keyword[i] + alpha_inc[i]].
98  */
99 
100 /* ==================== Initialization and Preparation ===================== */
101 
Search(KeywordExt_List * list)102 Search::Search (KeywordExt_List *list)
103   : _head (list)
104 {
105 }
106 
107 void
prepare()108 Search::prepare ()
109 {
110   /* Compute the total number of keywords.  */
111   _total_keys = 0;
112   for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
113     _total_keys++;
114 
115   /* Compute the minimum and maximum keyword length.  */
116   _max_key_len = INT_MIN;
117   _min_key_len = INT_MAX;
118   for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
119     {
120       KeywordExt *keyword = temp->first();
121 
122       if (_max_key_len < keyword->_allchars_length)
123         _max_key_len = keyword->_allchars_length;
124       if (_min_key_len > keyword->_allchars_length)
125         _min_key_len = keyword->_allchars_length;
126     }
127 
128   /* Exit program if an empty string is used as keyword, since the comparison
129      expressions don't work correctly for looking up an empty string.  */
130   if (_min_key_len == 0)
131     {
132       fprintf (stderr, "Empty input keyword is not allowed.\n"
133                "To recognize an empty input keyword, your code should check for\n"
134                "len == 0 before calling the gperf generated lookup function.\n");
135       exit (1);
136     }
137 
138   /* Exit program if the characters in the keywords are not in the required
139      range.  */
140   if (option[SEVENBIT])
141     for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
142       {
143         KeywordExt *keyword = temp->first();
144 
145         const char *k = keyword->_allchars;
146         for (int i = keyword->_allchars_length; i > 0; k++, i--)
147           if (!(static_cast<unsigned char>(*k) < 128))
148             {
149               fprintf (stderr, "Option --seven-bit has been specified,\n"
150                        "but keyword \"%.*s\" contains non-ASCII characters.\n"
151                        "Try removing option --seven-bit.\n",
152                        keyword->_allchars_length, keyword->_allchars);
153               exit (1);
154             }
155       }
156 }
157 
158 /* ====================== Finding good byte positions ====================== */
159 
160 /* Computes the upper bound on the indices passed to asso_values[],
161    assuming no alpha_increments.  */
162 unsigned int
compute_alpha_size() const163 Search::compute_alpha_size () const
164 {
165   return (option[SEVENBIT] ? 128 : 256);
166 }
167 
168 /* Computes the unification rules between different asso_values[c],
169    assuming no alpha_increments.  */
170 unsigned int *
compute_alpha_unify() const171 Search::compute_alpha_unify () const
172 {
173   if (option[UPPERLOWER])
174     {
175       /* Uppercase to lowercase mapping.  */
176       unsigned int alpha_size = compute_alpha_size();
177       unsigned int *alpha_unify = new unsigned int[alpha_size];
178       for (unsigned int c = 0; c < alpha_size; c++)
179         alpha_unify[c] = c;
180       for (unsigned int c = 'A'; c <= 'Z'; c++)
181         alpha_unify[c] = c + ('a'-'A');
182       return alpha_unify;
183     }
184   else
185     /* Identity mapping.  */
186     return NULL;
187 }
188 
189 /* Initializes each keyword's _selchars array.  */
190 void
init_selchars_tuple(const Positions & positions,const unsigned int * alpha_unify) const191 Search::init_selchars_tuple (const Positions& positions, const unsigned int *alpha_unify) const
192 {
193   for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
194     temp->first()->init_selchars_tuple(positions, alpha_unify);
195 }
196 
197 /* Deletes each keyword's _selchars array.  */
198 void
delete_selchars() const199 Search::delete_selchars () const
200 {
201   for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
202     temp->first()->delete_selchars();
203 }
204 
205 /* Count the duplicate keywords that occur with a given set of positions.
206    In other words, it returns the difference
207      # K - # proj1 (K)
208    where K is the multiset of given keywords.  */
209 unsigned int
count_duplicates_tuple(const Positions & positions,const unsigned int * alpha_unify) const210 Search::count_duplicates_tuple (const Positions& positions, const unsigned int *alpha_unify) const
211 {
212   /* Run through the keyword list and count the duplicates incrementally.
213      The result does not depend on the order of the keyword list, thanks to
214      the formula above.  */
215   init_selchars_tuple (positions, alpha_unify);
216 
217   unsigned int count = 0;
218   {
219     Hash_Table representatives (_total_keys, option[NOLENGTH]);
220     for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
221       {
222         KeywordExt *keyword = temp->first();
223         if (representatives.insert (keyword))
224           count++;
225       }
226   }
227 
228   delete_selchars ();
229 
230   return count;
231 }
232 
233 /* Find good key positions.  */
234 
235 void
find_positions()236 Search::find_positions ()
237 {
238   /* If the user gave the key positions, we use them.  */
239   if (option[POSITIONS])
240     {
241       _key_positions = option.get_key_positions();
242       return;
243     }
244 
245   /* Compute preliminary alpha_unify table.  */
246   unsigned int *alpha_unify = compute_alpha_unify ();
247 
248   /* 1. Find positions that must occur in order to distinguish duplicates.  */
249   Positions mandatory;
250 
251   if (!option[DUP])
252     {
253       for (KeywordExt_List *l1 = _head; l1 && l1->rest(); l1 = l1->rest())
254         {
255           KeywordExt *keyword1 = l1->first();
256           for (KeywordExt_List *l2 = l1->rest(); l2; l2 = l2->rest())
257             {
258               KeywordExt *keyword2 = l2->first();
259 
260               /* If keyword1 and keyword2 have the same length and differ
261                  in just one position, and it is not the last character,
262                  this position is mandatory.  */
263               if (keyword1->_allchars_length == keyword2->_allchars_length)
264                 {
265                   int n = keyword1->_allchars_length;
266                   int i;
267                   for (i = 0; i < n - 1; i++)
268                     {
269                       unsigned char c1 = keyword1->_allchars[i];
270                       unsigned char c2 = keyword2->_allchars[i];
271                       if (option[UPPERLOWER])
272                         {
273                           if (c1 >= 'A' && c1 <= 'Z')
274                             c1 += 'a' - 'A';
275                           if (c2 >= 'A' && c2 <= 'Z')
276                             c2 += 'a' - 'A';
277                         }
278                       if (c1 != c2)
279                         break;
280                     }
281                   if (i < n - 1)
282                     {
283                       int j;
284                       for (j = i + 1; j < n; j++)
285                         {
286                           unsigned char c1 = keyword1->_allchars[j];
287                           unsigned char c2 = keyword2->_allchars[j];
288                           if (option[UPPERLOWER])
289                             {
290                               if (c1 >= 'A' && c1 <= 'Z')
291                                 c1 += 'a' - 'A';
292                               if (c2 >= 'A' && c2 <= 'Z')
293                                 c2 += 'a' - 'A';
294                             }
295                           if (c1 != c2)
296                             break;
297                         }
298                       if (j >= n)
299                         {
300                           /* Position i is mandatory.  */
301                           if (!mandatory.contains (i))
302                             mandatory.add (i);
303                         }
304                     }
305                 }
306             }
307         }
308     }
309 
310   /* 2. Add positions, as long as this decreases the duplicates count.  */
311   int imax = (_max_key_len - 1 < Positions::MAX_KEY_POS - 1
312               ? _max_key_len - 1 : Positions::MAX_KEY_POS - 1);
313   Positions current = mandatory;
314   unsigned int current_duplicates_count =
315     count_duplicates_tuple (current, alpha_unify);
316   for (;;)
317     {
318       Positions best;
319       unsigned int best_duplicates_count = UINT_MAX;
320 
321       for (int i = imax; i >= -1; i--)
322         if (!current.contains (i))
323           {
324             Positions tryal = current;
325             tryal.add (i);
326             unsigned int try_duplicates_count =
327               count_duplicates_tuple (tryal, alpha_unify);
328 
329             /* We prefer 'try' to 'best' if it produces less duplicates,
330                or if it produces the same number of duplicates but with
331                a more efficient hash function.  */
332             if (try_duplicates_count < best_duplicates_count
333                 || (try_duplicates_count == best_duplicates_count && i >= 0))
334               {
335                 best = tryal;
336                 best_duplicates_count = try_duplicates_count;
337               }
338           }
339 
340       /* Stop adding positions when it gives no improvement.  */
341       if (best_duplicates_count >= current_duplicates_count)
342         break;
343 
344       current = best;
345       current_duplicates_count = best_duplicates_count;
346     }
347 
348   /* 3. Remove positions, as long as this doesn't increase the duplicates
349      count.  */
350   for (;;)
351     {
352       Positions best;
353       unsigned int best_duplicates_count = UINT_MAX;
354 
355       for (int i = imax; i >= -1; i--)
356         if (current.contains (i) && !mandatory.contains (i))
357           {
358             Positions tryal = current;
359             tryal.remove (i);
360             unsigned int try_duplicates_count =
361               count_duplicates_tuple (tryal, alpha_unify);
362 
363             /* We prefer 'try' to 'best' if it produces less duplicates,
364                or if it produces the same number of duplicates but with
365                a more efficient hash function.  */
366             if (try_duplicates_count < best_duplicates_count
367                 || (try_duplicates_count == best_duplicates_count && i == -1))
368               {
369                 best = tryal;
370                 best_duplicates_count = try_duplicates_count;
371               }
372           }
373 
374       /* Stop removing positions when it gives no improvement.  */
375       if (best_duplicates_count > current_duplicates_count)
376         break;
377 
378       current = best;
379       current_duplicates_count = best_duplicates_count;
380     }
381 
382   /* 4. Replace two positions by one, as long as this doesn't increase the
383      duplicates count.  */
384   for (;;)
385     {
386       Positions best;
387       unsigned int best_duplicates_count = UINT_MAX;
388 
389       for (int i1 = imax; i1 >= -1; i1--)
390         if (current.contains (i1) && !mandatory.contains (i1))
391           for (int i2 = imax; i2 >= -1; i2--)
392             if (current.contains (i2) && !mandatory.contains (i2) && i2 != i1)
393               for (int i3 = imax; i3 >= 0; i3--)
394                 if (!current.contains (i3))
395                   {
396                     Positions tryal = current;
397                     tryal.remove (i1);
398                     tryal.remove (i2);
399                     tryal.add (i3);
400                     unsigned int try_duplicates_count =
401                       count_duplicates_tuple (tryal, alpha_unify);
402 
403                     /* We prefer 'try' to 'best' if it produces less duplicates,
404                        or if it produces the same number of duplicates but with
405                        a more efficient hash function.  */
406                     if (try_duplicates_count < best_duplicates_count
407                         || (try_duplicates_count == best_duplicates_count
408                             && (i1 == -1 || i2 == -1 || i3 >= 0)))
409                       {
410                         best = tryal;
411                         best_duplicates_count = try_duplicates_count;
412                       }
413                   }
414 
415       /* Stop removing positions when it gives no improvement.  */
416       if (best_duplicates_count > current_duplicates_count)
417         break;
418 
419       current = best;
420       current_duplicates_count = best_duplicates_count;
421     }
422 
423   /* That's it.  Hope it's good enough.  */
424   _key_positions = current;
425 
426   if (option[DEBUG])
427     {
428       /* Print the result.  */
429       fprintf (stderr, "\nComputed positions: ");
430       PositionReverseIterator iter = _key_positions.reviterator();
431       bool seen_lastchar = false;
432       bool first = true;
433       for (int i; (i = iter.next ()) != PositionReverseIterator::EOS; )
434         {
435           if (!first)
436             fprintf (stderr, ", ");
437           if (i == Positions::LASTCHAR)
438             seen_lastchar = true;
439           else
440             {
441               fprintf (stderr, "%d", i + 1);
442               first = false;
443             }
444         }
445       if (seen_lastchar)
446         {
447           if (!first)
448             fprintf (stderr, ", ");
449           fprintf (stderr, "$");
450         }
451       fprintf (stderr, "\n");
452     }
453 
454   /* Free preliminary alpha_unify table.  */
455   delete[] alpha_unify;
456 }
457 
458 /* Count the duplicate keywords that occur with the found set of positions.
459    In other words, it returns the difference
460      # K - # proj1 (K)
461    where K is the multiset of given keywords.  */
462 unsigned int
count_duplicates_tuple() const463 Search::count_duplicates_tuple () const
464 {
465   unsigned int *alpha_unify = compute_alpha_unify ();
466   unsigned int count = count_duplicates_tuple (_key_positions, alpha_unify);
467   delete[] alpha_unify;
468   return count;
469 }
470 
471 /* ===================== Finding good alpha increments ===================== */
472 
473 /* Computes the upper bound on the indices passed to asso_values[].  */
474 unsigned int
compute_alpha_size(const unsigned int * alpha_inc) const475 Search::compute_alpha_size (const unsigned int *alpha_inc) const
476 {
477   unsigned int max_alpha_inc = 0;
478   for (int i = 0; i < _max_key_len; i++)
479     if (max_alpha_inc < alpha_inc[i])
480       max_alpha_inc = alpha_inc[i];
481   return (option[SEVENBIT] ? 128 : 256) + max_alpha_inc;
482 }
483 
484 /* Computes the unification rules between different asso_values[c].  */
485 unsigned int *
compute_alpha_unify(const Positions & positions,const unsigned int * alpha_inc) const486 Search::compute_alpha_unify (const Positions& positions, const unsigned int *alpha_inc) const
487 {
488   if (option[UPPERLOWER])
489     {
490       /* Without alpha increments, we would simply unify
491            'A' -> 'a', ..., 'Z' -> 'z'.
492          But when a keyword contains at position i a character c,
493          we have the constraint
494             asso_values[tolower(c) + alpha_inc[i]] ==
495             asso_values[toupper(c) + alpha_inc[i]].
496          This introduces a unification
497            toupper(c) + alpha_inc[i] -> tolower(c) + alpha_inc[i].
498          Note that this unification can extend outside the range of
499          ASCII letters!  But still every unified character pair is at
500          a distance of 'a'-'A' = 32, or (after chained unification)
501          at a multiple of 32.  So in the end the alpha_unify vector has
502          the form    c -> c + 32 * f(c)   where f(c) is a nonnegative
503          integer.  */
504       unsigned int alpha_size = compute_alpha_size (alpha_inc);
505 
506       unsigned int *alpha_unify = new unsigned int[alpha_size];
507       for (unsigned int c = 0; c < alpha_size; c++)
508         alpha_unify[c] = c;
509 
510       for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
511         {
512           KeywordExt *keyword = temp->first();
513 
514           /* Iterate through the selected character positions.  */
515           PositionIterator iter = positions.iterator(keyword->_allchars_length);
516 
517           for (int i; (i = iter.next ()) != PositionIterator::EOS; )
518             {
519               unsigned int c;
520               if (i == Positions::LASTCHAR)
521                 c = static_cast<unsigned char>(keyword->_allchars[keyword->_allchars_length - 1]);
522               else if (i < keyword->_allchars_length)
523                 c = static_cast<unsigned char>(keyword->_allchars[i]);
524               else
525                 abort ();
526               if (c >= 'A' && c <= 'Z')
527                 c += 'a' - 'A';
528               if (c >= 'a' && c <= 'z')
529                 {
530                   if (i != Positions::LASTCHAR)
531                     c += alpha_inc[i];
532                   /* Unify c with c - ('a'-'A').  */
533                   unsigned int d = alpha_unify[c];
534                   unsigned int b = c - ('a'-'A');
535                   for (int a = b; a >= 0 && alpha_unify[a] == b; a -= ('a'-'A'))
536                     alpha_unify[a] = d;
537                 }
538             }
539         }
540       return alpha_unify;
541     }
542   else
543     /* Identity mapping.  */
544     return NULL;
545 }
546 
547 /* Initializes each keyword's _selchars array.  */
548 void
init_selchars_multiset(const Positions & positions,const unsigned int * alpha_unify,const unsigned int * alpha_inc) const549 Search::init_selchars_multiset (const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc) const
550 {
551   for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
552     temp->first()->init_selchars_multiset(positions, alpha_unify, alpha_inc);
553 }
554 
555 /* Count the duplicate keywords that occur with the given set of positions
556    and a given alpha_inc[] array.
557    In other words, it returns the difference
558      # K - # proj2 (proj1 (K))
559    where K is the multiset of given keywords.  */
560 unsigned int
count_duplicates_multiset(const unsigned int * alpha_inc) const561 Search::count_duplicates_multiset (const unsigned int *alpha_inc) const
562 {
563   /* Run through the keyword list and count the duplicates incrementally.
564      The result does not depend on the order of the keyword list, thanks to
565      the formula above.  */
566   unsigned int *alpha_unify = compute_alpha_unify (_key_positions, alpha_inc);
567   init_selchars_multiset (_key_positions, alpha_unify, alpha_inc);
568 
569   unsigned int count = 0;
570   {
571     Hash_Table representatives (_total_keys, option[NOLENGTH]);
572     for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
573       {
574         KeywordExt *keyword = temp->first();
575         if (representatives.insert (keyword))
576           count++;
577       }
578   }
579 
580   delete_selchars ();
581   delete[] alpha_unify;
582 
583   return count;
584 }
585 
586 /* Find good _alpha_inc[].  */
587 
588 void
find_alpha_inc()589 Search::find_alpha_inc ()
590 {
591   /* The goal is to choose _alpha_inc[] such that it doesn't introduce
592      artificial duplicates.
593      In other words, the goal is  # proj2 (proj1 (K)) = # proj1 (K).  */
594   unsigned int duplicates_goal = count_duplicates_tuple ();
595 
596   /* Start with zero increments.  This is sufficient in most cases.  */
597   unsigned int *current = new unsigned int [_max_key_len];
598   for (int i = 0; i < _max_key_len; i++)
599     current[i] = 0;
600   unsigned int current_duplicates_count = count_duplicates_multiset (current);
601 
602   if (current_duplicates_count > duplicates_goal)
603     {
604       /* Look which _alpha_inc[i] we are free to increment.  */
605       unsigned int nindices;
606       {
607         nindices = 0;
608         PositionIterator iter = _key_positions.iterator(_max_key_len);
609         for (;;)
610           {
611             int key_pos = iter.next ();
612             if (key_pos == PositionIterator::EOS)
613               break;
614             if (key_pos != Positions::LASTCHAR)
615               nindices++;
616           }
617       }
618 
619       unsigned int *indices = new unsigned int[nindices];
620       {
621         unsigned int j = 0;
622         PositionIterator iter = _key_positions.iterator(_max_key_len);
623         for (;;)
624           {
625             int key_pos = iter.next ();
626             if (key_pos == PositionIterator::EOS)
627               break;
628             if (key_pos != Positions::LASTCHAR)
629               indices[j++] = key_pos;
630           }
631         if (!(j == nindices))
632           abort ();
633       }
634 
635       /* Perform several rounds of searching for a good alpha increment.
636          Each round reduces the number of artificial collisions by adding
637          an increment in a single key position.  */
638       unsigned int *best = new unsigned int[_max_key_len];
639       unsigned int *tryal = new unsigned int[_max_key_len];
640       do
641         {
642           /* An increment of 1 is not always enough.  Try higher increments
643              also.  */
644           for (unsigned int inc = 1; ; inc++)
645             {
646               unsigned int best_duplicates_count = UINT_MAX;
647 
648               for (unsigned int j = 0; j < nindices; j++)
649                 {
650                   memcpy (tryal, current, _max_key_len * sizeof (unsigned int));
651                   tryal[indices[j]] += inc;
652                   unsigned int try_duplicates_count =
653                     count_duplicates_multiset (tryal);
654 
655                   /* We prefer 'try' to 'best' if it produces less
656                      duplicates.  */
657                   if (try_duplicates_count < best_duplicates_count)
658                     {
659                       memcpy (best, tryal, _max_key_len * sizeof (unsigned int));
660                       best_duplicates_count = try_duplicates_count;
661                     }
662                 }
663 
664               /* Stop this round when we got an improvement.  */
665               if (best_duplicates_count < current_duplicates_count)
666                 {
667                   memcpy (current, best, _max_key_len * sizeof (unsigned int));
668                   current_duplicates_count = best_duplicates_count;
669                   break;
670                 }
671             }
672         }
673       while (current_duplicates_count > duplicates_goal);
674       delete[] tryal;
675       delete[] best;
676 
677       if (option[DEBUG])
678         {
679           /* Print the result.  */
680           fprintf (stderr, "\nComputed alpha increments: ");
681           bool first = true;
682           for (unsigned int j = nindices; j-- > 0; )
683             if (current[indices[j]] != 0)
684               {
685                 if (!first)
686                   fprintf (stderr, ", ");
687                 fprintf (stderr, "%u:+%u",
688                          indices[j] + 1, current[indices[j]]);
689                 first = false;
690               }
691           fprintf (stderr, "\n");
692         }
693       delete[] indices;
694     }
695 
696   _alpha_inc = current;
697   _alpha_size = compute_alpha_size (_alpha_inc);
698   _alpha_unify = compute_alpha_unify (_key_positions, _alpha_inc);
699 }
700 
701 /* ======================= Finding good asso_values ======================== */
702 
703 /* Initializes the asso_values[] related parameters.  */
704 
705 void
prepare_asso_values()706 Search::prepare_asso_values ()
707 {
708   KeywordExt_List *temp;
709 
710   /* Initialize each keyword's _selchars array.  */
711   init_selchars_multiset(_key_positions, _alpha_unify, _alpha_inc);
712 
713   /* Compute the maximum _selchars_length over all keywords.  */
714   _max_selchars_length = _key_positions.iterator(_max_key_len).remaining();
715 
716   /* Check for duplicates, i.e. keywords with the same _selchars array
717      (and - if !option[NOLENGTH] - also the same length).
718      We deal with these by building an equivalence class, so that only
719      1 keyword is representative of the entire collection.  Only this
720      representative remains in the keyword list; the others are accessible
721      through the _duplicate_link chain, starting at the representative.
722      This *greatly* simplifies processing during later stages of the program.
723      Set _total_duplicates and _list_len = _total_keys - _total_duplicates.  */
724   {
725     _list_len = _total_keys;
726     _total_duplicates = 0;
727     /* Make hash table for efficiency.  */
728     Hash_Table representatives (_list_len, option[NOLENGTH]);
729 
730     KeywordExt_List *prev = NULL; /* list node before temp */
731     for (temp = _head; temp; )
732       {
733         KeywordExt *keyword = temp->first();
734         KeywordExt *other_keyword = representatives.insert (keyword);
735         KeywordExt_List *garbage = NULL;
736 
737         if (other_keyword)
738           {
739             _total_duplicates++;
740             _list_len--;
741             /* Remove keyword from the main list.  */
742             prev->rest() = temp->rest();
743             garbage = temp;
744             /* And insert it on other_keyword's duplicate list.  */
745             keyword->_duplicate_link = other_keyword->_duplicate_link;
746             other_keyword->_duplicate_link = keyword;
747 
748             /* Complain if user hasn't enabled the duplicate option. */
749             if (!option[DUP] || option[DEBUG])
750               {
751                 fprintf (stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"",
752                          keyword->_allchars_length, keyword->_allchars,
753                          other_keyword->_allchars_length, other_keyword->_allchars);
754                 for (int j = 0; j < keyword->_selchars_length; j++)
755                   putc (keyword->_selchars[j], stderr);
756                 fprintf (stderr, "\".\n");
757               }
758           }
759         else
760           {
761             keyword->_duplicate_link = NULL;
762             prev = temp;
763           }
764         temp = temp->rest();
765         if (garbage)
766           delete garbage;
767       }
768     if (option[DEBUG])
769       representatives.dump();
770   }
771 
772   /* Exit program if duplicates exists and option[DUP] not set, since we
773      don't want to continue in this case.  (We don't want to turn on
774      option[DUP] implicitly, because the generated code is usually much
775      slower.  */
776   if (_total_duplicates)
777     {
778       if (option[DUP])
779         fprintf (stderr, "%d input keys have identical hash values, examine output carefully...\n",
780                          _total_duplicates);
781       else
782         {
783           fprintf (stderr, "%d input keys have identical hash values,\n",
784                            _total_duplicates);
785           if (option[POSITIONS])
786             fprintf (stderr, "try different key positions or use option -D.\n");
787           else
788             fprintf (stderr, "use option -D.\n");
789           exit (1);
790         }
791     }
792 
793   /* Compute the occurrences of each character in the alphabet.  */
794   _occurrences = new int[_alpha_size];
795   memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0]));
796   for (temp = _head; temp; temp = temp->rest())
797     {
798       KeywordExt *keyword = temp->first();
799       const unsigned int *ptr = keyword->_selchars;
800       for (int count = keyword->_selchars_length; count > 0; ptr++, count--)
801         _occurrences[*ptr]++;
802     }
803 
804   /* Memory allocation.  */
805   _asso_values = new int[_alpha_size];
806 
807   int non_linked_length = _list_len;
808   unsigned int asso_value_max;
809 
810   asso_value_max =
811     static_cast<unsigned int>(non_linked_length * option.get_size_multiple());
812   /* Round up to the next power of two.  This makes it easy to ensure
813      an _asso_value[c] is >= 0 and < asso_value_max.  Also, the jump value
814      being odd, it guarantees that Search::try_asso_value() will iterate
815      through different values for _asso_value[c].  */
816   if (asso_value_max == 0)
817     asso_value_max = 1;
818   asso_value_max |= asso_value_max >> 1;
819   asso_value_max |= asso_value_max >> 2;
820   asso_value_max |= asso_value_max >> 4;
821   asso_value_max |= asso_value_max >> 8;
822   asso_value_max |= asso_value_max >> 16;
823   asso_value_max++;
824   _asso_value_max = asso_value_max;
825 
826   /* Given the bound for _asso_values[c], we have a bound for the possible
827      hash values, as computed in compute_hash().  */
828   _max_hash_value = (option[NOLENGTH] ? 0 : _max_key_len)
829                     + (_asso_value_max - 1) * _max_selchars_length;
830   /* Allocate a sparse bit vector for detection of collisions of hash
831      values.  */
832   _collision_detector = new Bool_Array (_max_hash_value + 1);
833 
834   if (option[DEBUG])
835     {
836       fprintf (stderr, "total non-linked keys = %d\nmaximum associated value is %d"
837                "\nmaximum size of generated hash table is %d\n",
838                non_linked_length, asso_value_max, _max_hash_value);
839 
840       int field_width;
841 
842       field_width = 0;
843       {
844         for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
845           {
846             KeywordExt *keyword = temp->first();
847             if (field_width < keyword->_selchars_length)
848               field_width = keyword->_selchars_length;
849           }
850       }
851 
852       fprintf (stderr, "\ndumping the keyword list without duplicates\n");
853       fprintf (stderr, "keyword #, %*s, keyword\n", field_width, "keysig");
854       int i = 0;
855       for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
856         {
857           KeywordExt *keyword = temp->first();
858           fprintf (stderr, "%9d, ", ++i);
859           if (field_width > keyword->_selchars_length)
860             fprintf (stderr, "%*s", field_width - keyword->_selchars_length, "");
861           for (int j = 0; j < keyword->_selchars_length; j++)
862             putc (keyword->_selchars[j], stderr);
863           fprintf (stderr, ", %.*s\n",
864                    keyword->_allchars_length, keyword->_allchars);
865         }
866       fprintf (stderr, "\nend of keyword list\n\n");
867     }
868 
869   if (option[RANDOM] || option.get_jump () == 0)
870     /* We will use rand(), so initialize the random number generator.  */
871     srand (static_cast<long>(time (0)));
872 
873   _initial_asso_value = (option[RANDOM] ? -1 : option.get_initial_asso_value ());
874   _jump = option.get_jump ();
875 }
876 
877 /* Finds some _asso_values[] that fit.  */
878 
879 /* The idea is to choose the _asso_values[] one by one, in a way that
880    a choice that has been made never needs to be undone later.  This
881    means that we split the work into several steps.  Each step chooses
882    one or more _asso_values[c].  The result of choosing one or more
883    _asso_values[c] is that the partitioning of the keyword set gets
884    broader.
885    Look at this partitioning:  After every step, the _asso_values[] of a
886    certain set C of characters are undetermined.  (At the beginning, C
887    is the set of characters c with _occurrences[c] > 0.  At the end, C
888    is empty.)  To each keyword K, we associate the multiset of _selchars
889    for which the _asso_values[] are undetermined:
890                     K  -->  K->_selchars intersect C.
891    Consider two keywords equivalent if their value under this mapping is
892    the same.  This introduces an equivalence relation on the set of
893    keywords.  The equivalence classes partition the keyword set.  (At the
894    beginning, the partition is the finest possible: each K is an equivalence
895    class by itself, because all K have a different _selchars.  At the end,
896    all K have been merged into a single equivalence class.)
897    The partition before a step is always a refinement of the partition
898    after the step.
899    We choose the steps in such a way that the partition really becomes
900    broader at each step.  (A step that only chooses an _asso_values[c]
901    without changing the partition is better merged with the previous step,
902    to avoid useless backtracking.)  */
903 
904 struct EquivalenceClass
905 {
906   /* The keywords in this equivalence class.  */
907   KeywordExt_List *     _keywords;
908   KeywordExt_List *     _keywords_last;
909   /* The number of keywords in this equivalence class.  */
910   unsigned int          _cardinality;
911   /* The undetermined selected characters for the keywords in this
912      equivalence class, as a canonically reordered multiset.  */
913   unsigned int *        _undetermined_chars;
914   unsigned int          _undetermined_chars_length;
915 
916   EquivalenceClass *    _next;
917 };
918 
919 struct Step
920 {
921   /* The characters whose values are being determined in this step.  */
922   unsigned int          _changing_count;
923   unsigned int *        _changing;
924   /* Exclusive upper bound for the _asso_values[c] of this step.
925      A power of 2.  */
926   unsigned int          _asso_value_max;
927   /* The characters whose values will be determined after this step.  */
928   bool *                _undetermined;
929   /* The keyword set partition after this step.  */
930   EquivalenceClass *    _partition;
931   /* The expected number of iterations in this step.  */
932   double                _expected_lower;
933   double                _expected_upper;
934 
935   Step *                _next;
936 };
937 
938 static inline bool
equals(const unsigned int * ptr1,const unsigned int * ptr2,unsigned int len)939 equals (const unsigned int *ptr1, const unsigned int *ptr2, unsigned int len)
940 {
941   while (len > 0)
942     {
943       if (*ptr1 != *ptr2)
944         return false;
945       ptr1++;
946       ptr2++;
947       len--;
948     }
949   return true;
950 }
951 
952 EquivalenceClass *
compute_partition(bool * undetermined) const953 Search::compute_partition (bool *undetermined) const
954 {
955   EquivalenceClass *partition = NULL;
956   EquivalenceClass *partition_last = NULL;
957   for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
958     {
959       KeywordExt *keyword = temp->first();
960 
961       /* Compute the undetermined characters for this keyword.  */
962       unsigned int *undetermined_chars =
963         new unsigned int[keyword->_selchars_length];
964       unsigned int undetermined_chars_length = 0;
965 
966       for (int i = 0; i < keyword->_selchars_length; i++)
967         if (undetermined[keyword->_selchars[i]])
968           undetermined_chars[undetermined_chars_length++] = keyword->_selchars[i];
969 
970       /* Look up the equivalence class to which this keyword belongs.  */
971       EquivalenceClass *equclass;
972       for (equclass = partition; equclass; equclass = equclass->_next)
973         if (equclass->_undetermined_chars_length == undetermined_chars_length
974             && equals (equclass->_undetermined_chars, undetermined_chars,
975                        undetermined_chars_length))
976           break;
977       if (equclass == NULL)
978         {
979           equclass = new EquivalenceClass();
980           equclass->_keywords = NULL;
981           equclass->_keywords_last = NULL;
982           equclass->_cardinality = 0;
983           equclass->_undetermined_chars = undetermined_chars;
984           equclass->_undetermined_chars_length = undetermined_chars_length;
985           equclass->_next = NULL;
986           if (partition)
987             partition_last->_next = equclass;
988           else
989             partition = equclass;
990           partition_last = equclass;
991         }
992       else
993         delete[] undetermined_chars;
994 
995       /* Add the keyword to the equivalence class.  */
996       KeywordExt_List *cons = new KeywordExt_List(keyword);
997       if (equclass->_keywords)
998         equclass->_keywords_last->rest() = cons;
999       else
1000         equclass->_keywords = cons;
1001       equclass->_keywords_last = cons;
1002       equclass->_cardinality++;
1003     }
1004 
1005   /* Free some of the allocated memory.  The caller doesn't need it.  */
1006   for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
1007     delete[] cls->_undetermined_chars;
1008 
1009   return partition;
1010 }
1011 
1012 static void
delete_partition(EquivalenceClass * partition)1013 delete_partition (EquivalenceClass *partition)
1014 {
1015   while (partition != NULL)
1016     {
1017       EquivalenceClass *equclass = partition;
1018       partition = equclass->_next;
1019       delete_list (equclass->_keywords);
1020       //delete[] equclass->_undetermined_chars; // already freed above
1021       delete equclass;
1022     }
1023 }
1024 
1025 /* Compute the possible number of collisions when _asso_values[c] is
1026    chosen, leading to the given partition.  */
1027 unsigned int
count_possible_collisions(EquivalenceClass * partition,unsigned int c) const1028 Search::count_possible_collisions (EquivalenceClass *partition, unsigned int c) const
1029 {
1030   /* Every equivalence class p is split according to the frequency of
1031      occurrence of c, leading to equivalence classes p1, p2, ...
1032      This leads to   (|p|^2 - |p1|^2 - |p2|^2 - ...)/2  possible collisions.
1033      Return the sum of this expression over all equivalence classes.  */
1034   unsigned int sum = 0;
1035   unsigned int m = _max_selchars_length;
1036   unsigned int *split_cardinalities = new unsigned int[m + 1];
1037   for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
1038     {
1039       for (unsigned int i = 0; i <= m; i++)
1040         split_cardinalities[i] = 0;
1041 
1042       for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
1043         {
1044           KeywordExt *keyword = temp->first();
1045 
1046           unsigned int count = 0;
1047           for (int i = 0; i < keyword->_selchars_length; i++)
1048             if (keyword->_selchars[i] == c)
1049               count++;
1050 
1051           split_cardinalities[count]++;
1052         }
1053 
1054       sum += cls->_cardinality * cls->_cardinality;
1055       for (unsigned int i = 0; i <= m; i++)
1056         sum -= split_cardinalities[i] * split_cardinalities[i];
1057     }
1058   delete[] split_cardinalities;
1059   return sum;
1060 }
1061 
1062 /* Test whether adding c to the undetermined characters changes the given
1063    partition.  */
1064 bool
unchanged_partition(EquivalenceClass * partition,unsigned int c) const1065 Search::unchanged_partition (EquivalenceClass *partition, unsigned int c) const
1066 {
1067   for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
1068     {
1069       unsigned int first_count = UINT_MAX;
1070 
1071       for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
1072         {
1073           KeywordExt *keyword = temp->first();
1074 
1075           unsigned int count = 0;
1076           for (int i = 0; i < keyword->_selchars_length; i++)
1077             if (keyword->_selchars[i] == c)
1078               count++;
1079 
1080           if (temp == cls->_keywords)
1081             first_count = count;
1082           else if (count != first_count)
1083             /* c would split this equivalence class.  */
1084             return false;
1085         }
1086     }
1087   return true;
1088 }
1089 
1090 void
find_asso_values()1091 Search::find_asso_values ()
1092 {
1093   Step *steps;
1094 
1095   /* Determine the steps, starting with the last one.  */
1096   {
1097     bool *undetermined;
1098     bool *determined;
1099 
1100     steps = NULL;
1101 
1102     undetermined = new bool[_alpha_size];
1103     for (unsigned int c = 0; c < _alpha_size; c++)
1104       undetermined[c] = false;
1105 
1106     determined = new bool[_alpha_size];
1107     for (unsigned int c = 0; c < _alpha_size; c++)
1108       determined[c] = true;
1109 
1110     for (;;)
1111       {
1112         /* Compute the partition that needs to be refined.  */
1113         EquivalenceClass *partition = compute_partition (undetermined);
1114 
1115         /* Determine the main character to be chosen in this step.
1116            Choosing such a character c has the effect of splitting every
1117            equivalence class (according the the frequency of occurrence of c).
1118            We choose the c with the minimum number of possible collisions,
1119            so that characters which lead to a large number of collisions get
1120            handled early during the search.  */
1121         unsigned int chosen_c;
1122         unsigned int chosen_possible_collisions;
1123         {
1124           unsigned int best_c = 0;
1125           unsigned int best_possible_collisions = UINT_MAX;
1126           for (unsigned int c = 0; c < _alpha_size; c++)
1127             if (_occurrences[c] > 0 && determined[c])
1128               {
1129                 unsigned int possible_collisions =
1130                   count_possible_collisions (partition, c);
1131                 if (possible_collisions < best_possible_collisions)
1132                   {
1133                     best_c = c;
1134                     best_possible_collisions = possible_collisions;
1135                   }
1136               }
1137           if (best_possible_collisions == UINT_MAX)
1138             {
1139               /* All c with _occurrences[c] > 0 are undetermined.  We are
1140                  are the starting situation and don't need any more step.  */
1141               delete_partition (partition);
1142               break;
1143             }
1144           chosen_c = best_c;
1145           chosen_possible_collisions = best_possible_collisions;
1146         }
1147 
1148         /* We need one more step.  */
1149         Step *step = new Step();
1150 
1151         step->_undetermined = new bool[_alpha_size];
1152         memcpy (step->_undetermined, undetermined, _alpha_size*sizeof(bool));
1153 
1154         step->_partition = partition;
1155 
1156         /* Now determine how the equivalence classes will be before this
1157            step.  */
1158         undetermined[chosen_c] = true;
1159         partition = compute_partition (undetermined);
1160 
1161         /* Now determine which other characters should be determined in this
1162            step, because they will not change the equivalence classes at
1163            this point.  It is the set of all c which, for all equivalence
1164            classes, have the same frequency of occurrence in every keyword
1165            of the equivalence class.  */
1166         for (unsigned int c = 0; c < _alpha_size; c++)
1167           if (_occurrences[c] > 0 && determined[c]
1168               && unchanged_partition (partition, c))
1169             {
1170               undetermined[c] = true;
1171               determined[c] = false;
1172             }
1173 
1174         /* main_c must be one of these.  */
1175         if (determined[chosen_c])
1176           abort ();
1177 
1178         /* Now the set of changing characters of this step.  */
1179         unsigned int changing_count;
1180 
1181         changing_count = 0;
1182         for (unsigned int c = 0; c < _alpha_size; c++)
1183           if (undetermined[c] && !step->_undetermined[c])
1184             changing_count++;
1185 
1186         unsigned int *changing = new unsigned int[changing_count];
1187         changing_count = 0;
1188         for (unsigned int c = 0; c < _alpha_size; c++)
1189           if (undetermined[c] && !step->_undetermined[c])
1190             changing[changing_count++] = c;
1191 
1192         step->_changing = changing;
1193         step->_changing_count = changing_count;
1194 
1195         step->_asso_value_max = _asso_value_max;
1196 
1197         step->_expected_lower =
1198           exp (static_cast<double>(chosen_possible_collisions)
1199                / static_cast<double>(_max_hash_value));
1200         step->_expected_upper =
1201           exp (static_cast<double>(chosen_possible_collisions)
1202                / static_cast<double>(_asso_value_max));
1203 
1204         delete_partition (partition);
1205 
1206         step->_next = steps;
1207         steps = step;
1208       }
1209 
1210     delete[] determined;
1211     delete[] undetermined;
1212   }
1213 
1214   if (option[DEBUG])
1215     {
1216       unsigned int stepno = 0;
1217       for (Step *step = steps; step; step = step->_next)
1218         {
1219           stepno++;
1220           fprintf (stderr, "Step %u chooses _asso_values[", stepno);
1221           for (unsigned int i = 0; i < step->_changing_count; i++)
1222             {
1223               if (i > 0)
1224                 fprintf (stderr, ",");
1225               fprintf (stderr, "'%c'", step->_changing[i]);
1226             }
1227           fprintf (stderr, "], expected number of iterations between %g and %g.\n",
1228                    step->_expected_lower, step->_expected_upper);
1229           fprintf (stderr, "Keyword equivalence classes:\n");
1230           for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next)
1231             {
1232               fprintf (stderr, "\n");
1233               for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
1234                 {
1235                   KeywordExt *keyword = temp->first();
1236                   fprintf (stderr, "  %.*s\n",
1237                            keyword->_allchars_length, keyword->_allchars);
1238                 }
1239             }
1240           fprintf (stderr, "\n");
1241         }
1242     }
1243 
1244   /* Initialize _asso_values[].  (The value given here matters only
1245      for those c which occur in all keywords with equal multiplicity.)  */
1246   for (unsigned int c = 0; c < _alpha_size; c++)
1247     _asso_values[c] = 0;
1248 
1249   unsigned int stepno = 0;
1250   for (Step *step = steps; step; step = step->_next)
1251     {
1252       stepno++;
1253 
1254       /* Initialize the asso_values[].  */
1255       unsigned int k = step->_changing_count;
1256       for (unsigned int i = 0; i < k; i++)
1257         {
1258           unsigned int c = step->_changing[i];
1259           _asso_values[c] =
1260             (_initial_asso_value < 0 ? rand () : _initial_asso_value)
1261             & (step->_asso_value_max - 1);
1262         }
1263 
1264       unsigned int iterations = 0;
1265       unsigned int *iter = new unsigned int[k];
1266       for (unsigned int i = 0; i < k; i++)
1267         iter[i] = 0;
1268       unsigned int ii = (_jump != 0 ? k - 1 : 0);
1269 
1270       for (;;)
1271         {
1272           /* Test whether these asso_values[] lead to collisions among
1273              the equivalence classes that should be collision-free.  */
1274           bool has_collision = false;
1275           for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next)
1276             {
1277               /* Iteration Number array is a win, O(1) initialization time!  */
1278               _collision_detector->clear ();
1279 
1280               for (KeywordExt_List *ptr = cls->_keywords; ptr; ptr = ptr->rest())
1281                 {
1282                   KeywordExt *keyword = ptr->first();
1283 
1284                   /* Compute the new hash code for the keyword, leaving apart
1285                      the yet undetermined asso_values[].  */
1286                   int hashcode;
1287                   {
1288                     int sum = option[NOLENGTH] ? 0 : keyword->_allchars_length;
1289                     const unsigned int *p = keyword->_selchars;
1290                     int i = keyword->_selchars_length;
1291                     for (; i > 0; p++, i--)
1292                       if (!step->_undetermined[*p])
1293                         sum += _asso_values[*p];
1294                     hashcode = sum;
1295                   }
1296 
1297                   /* See whether it collides with another keyword's hash code,
1298                      from the same equivalence class.  */
1299                   if (_collision_detector->set_bit (hashcode))
1300                     {
1301                       has_collision = true;
1302                       break;
1303                     }
1304                 }
1305 
1306               /* Don't need to continue looking at the other equivalence
1307                  classes if we already have found a collision.  */
1308               if (has_collision)
1309                 break;
1310             }
1311 
1312           iterations++;
1313           if (!has_collision)
1314             break;
1315 
1316           /* Try other asso_values[].  */
1317           if (_jump != 0)
1318             {
1319               /* The way we try various values for
1320                    asso_values[step->_changing[0],...step->_changing[k-1]]
1321                  is like this:
1322                  for (bound = 0,1,...)
1323                    for (ii = 0,...,k-1)
1324                      iter[ii] := bound
1325                      iter[0..ii-1] := values <= bound
1326                      iter[ii+1..k-1] := values < bound
1327                  and
1328                    asso_values[step->_changing[i]] =
1329                      _initial_asso_value + iter[i] * _jump.
1330                  This makes it more likely to find small asso_values[].
1331                */
1332               unsigned int bound = iter[ii];
1333               unsigned int i = 0;
1334               while (i < ii)
1335                 {
1336                   unsigned int c = step->_changing[i];
1337                   iter[i]++;
1338                   _asso_values[c] =
1339                     (_asso_values[c] + _jump) & (step->_asso_value_max - 1);
1340                   if (iter[i] <= bound)
1341                     goto found_next;
1342                   _asso_values[c] =
1343                     (_asso_values[c] - iter[i] * _jump)
1344                     & (step->_asso_value_max - 1);
1345                   iter[i] = 0;
1346                   i++;
1347                 }
1348               i = ii + 1;
1349               while (i < k)
1350                 {
1351                   unsigned int c = step->_changing[i];
1352                   iter[i]++;
1353                   _asso_values[c] =
1354                     (_asso_values[c] + _jump) & (step->_asso_value_max - 1);
1355                   if (iter[i] < bound)
1356                     goto found_next;
1357                   _asso_values[c] =
1358                     (_asso_values[c] - iter[i] * _jump)
1359                     & (step->_asso_value_max - 1);
1360                   iter[i] = 0;
1361                   i++;
1362                 }
1363               /* Switch from one ii to the next.  */
1364               {
1365                 unsigned int c = step->_changing[ii];
1366                 _asso_values[c] =
1367                   (_asso_values[c] - bound * _jump)
1368                   & (step->_asso_value_max - 1);
1369                 iter[ii] = 0;
1370               }
1371               /* Here all iter[i] == 0.  */
1372               ii++;
1373               if (ii == k)
1374                 {
1375                   ii = 0;
1376                   bound++;
1377                   if (bound == step->_asso_value_max)
1378                     {
1379                       /* Out of search space!  We can either backtrack, or
1380                          increase the available search space of this step.
1381                          It seems simpler to choose the latter solution.  */
1382                       step->_asso_value_max = 2 * step->_asso_value_max;
1383                       if (step->_asso_value_max > _asso_value_max)
1384                         {
1385                           _asso_value_max = step->_asso_value_max;
1386                           /* Reinitialize _max_hash_value.  */
1387                           _max_hash_value =
1388                             (option[NOLENGTH] ? 0 : _max_key_len)
1389                             + (_asso_value_max - 1) * _max_selchars_length;
1390                           /* Reinitialize _collision_detector.  */
1391                           delete _collision_detector;
1392                           _collision_detector =
1393                             new Bool_Array (_max_hash_value + 1);
1394                         }
1395                     }
1396                 }
1397               {
1398                 unsigned int c = step->_changing[ii];
1399                 iter[ii] = bound;
1400                 _asso_values[c] =
1401                   (_asso_values[c] + bound * _jump)
1402                   & (step->_asso_value_max - 1);
1403               }
1404              found_next: ;
1405             }
1406           else
1407             {
1408               /* Random.  */
1409               unsigned int c = step->_changing[ii];
1410               _asso_values[c] =
1411                 (_asso_values[c] + rand ()) & (step->_asso_value_max - 1);
1412               /* Next time, change the next c.  */
1413               ii++;
1414               if (ii == k)
1415                 ii = 0;
1416             }
1417         }
1418       delete[] iter;
1419 
1420       if (option[DEBUG])
1421         {
1422           fprintf (stderr, "Step %u chose _asso_values[", stepno);
1423           for (unsigned int i = 0; i < step->_changing_count; i++)
1424             {
1425               if (i > 0)
1426                 fprintf (stderr, ",");
1427               fprintf (stderr, "'%c'", step->_changing[i]);
1428             }
1429           fprintf (stderr, "] in %u iterations.\n", iterations);
1430         }
1431     }
1432 
1433   /* Free allocated memory.  */
1434   while (steps != NULL)
1435     {
1436       Step *step = steps;
1437       steps = step->_next;
1438       delete[] step->_changing;
1439       delete[] step->_undetermined;
1440       delete_partition (step->_partition);
1441       delete step;
1442     }
1443 }
1444 
1445 /* Computes a keyword's hash value, relative to the current _asso_values[],
1446    and stores it in keyword->_hash_value.  */
1447 
1448 inline int
compute_hash(KeywordExt * keyword) const1449 Search::compute_hash (KeywordExt *keyword) const
1450 {
1451   int sum = option[NOLENGTH] ? 0 : keyword->_allchars_length;
1452 
1453   const unsigned int *p = keyword->_selchars;
1454   int i = keyword->_selchars_length;
1455   for (; i > 0; p++, i--)
1456     sum += _asso_values[*p];
1457 
1458   return keyword->_hash_value = sum;
1459 }
1460 
1461 /* Finds good _asso_values[].  */
1462 
1463 void
find_good_asso_values()1464 Search::find_good_asso_values ()
1465 {
1466   prepare_asso_values ();
1467 
1468   /* Search for good _asso_values[].  */
1469   int asso_iteration;
1470   if ((asso_iteration = option.get_asso_iterations ()) == 0)
1471     /* Try only the given _initial_asso_value and _jump.  */
1472     find_asso_values ();
1473   else
1474     {
1475       /* Try different pairs of _initial_asso_value and _jump, in the
1476          following order:
1477            (0, 1)
1478            (1, 1)
1479            (2, 1) (0, 3)
1480            (3, 1) (1, 3)
1481            (4, 1) (2, 3) (0, 5)
1482            (5, 1) (3, 3) (1, 5)
1483            ..... */
1484       KeywordExt_List *saved_head = _head;
1485       int best_initial_asso_value = 0;
1486       int best_jump = 1;
1487       int *best_asso_values = new int[_alpha_size];
1488       int best_collisions = INT_MAX;
1489       int best_max_hash_value = INT_MAX;
1490 
1491       _initial_asso_value = 0; _jump = 1;
1492       for (;;)
1493         {
1494           /* Restore the keyword list in its original order.  */
1495           _head = copy_list (saved_head);
1496           /* Find good _asso_values[].  */
1497           find_asso_values ();
1498           /* Test whether it is the best solution so far.  */
1499           int collisions = 0;
1500           int max_hash_value = INT_MIN;
1501           _collision_detector->clear ();
1502           for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest())
1503             {
1504               KeywordExt *keyword = ptr->first();
1505               int hashcode = compute_hash (keyword);
1506               if (max_hash_value < hashcode)
1507                 max_hash_value = hashcode;
1508               if (_collision_detector->set_bit (hashcode))
1509                 collisions++;
1510             }
1511           if (collisions < best_collisions
1512               || (collisions == best_collisions
1513                   && max_hash_value < best_max_hash_value))
1514             {
1515               memcpy (best_asso_values, _asso_values,
1516                       _alpha_size * sizeof (_asso_values[0]));
1517               best_collisions = collisions;
1518               best_max_hash_value = max_hash_value;
1519             }
1520           /* Delete the copied keyword list.  */
1521           delete_list (_head);
1522 
1523           if (--asso_iteration == 0)
1524             break;
1525           /* Prepare for next iteration.  */
1526           if (_initial_asso_value >= 2)
1527             _initial_asso_value -= 2, _jump += 2;
1528           else
1529             _initial_asso_value += _jump, _jump = 1;
1530         }
1531       _head = saved_head;
1532       /* Install the best found asso_values.  */
1533       _initial_asso_value = best_initial_asso_value;
1534       _jump = best_jump;
1535       memcpy (_asso_values, best_asso_values,
1536               _alpha_size * sizeof (_asso_values[0]));
1537       delete[] best_asso_values;
1538       /* The keywords' _hash_value fields are recomputed below.  */
1539     }
1540 }
1541 
1542 /* ========================================================================= */
1543 
1544 /* Comparison function for sorting by increasing _hash_value.  */
1545 static bool
less_by_hash_value(KeywordExt * keyword1,KeywordExt * keyword2)1546 less_by_hash_value (KeywordExt *keyword1, KeywordExt *keyword2)
1547 {
1548   return keyword1->_hash_value < keyword2->_hash_value;
1549 }
1550 
1551 /* Sorts the keyword list by hash value.  */
1552 
1553 void
sort()1554 Search::sort ()
1555 {
1556   _head = mergesort_list (_head, less_by_hash_value);
1557 }
1558 
1559 void
optimize()1560 Search::optimize ()
1561 {
1562   /* Preparations.  */
1563   prepare ();
1564 
1565   /* Step 1: Finding good byte positions.  */
1566   find_positions ();
1567 
1568   /* Step 2: Finding good alpha increments.  */
1569   find_alpha_inc ();
1570 
1571   /* Step 3: Finding good asso_values.  */
1572   find_good_asso_values ();
1573 
1574   /* Make one final check, just to make sure nothing weird happened.... */
1575   _collision_detector->clear ();
1576   for (KeywordExt_List *curr_ptr = _head; curr_ptr; curr_ptr = curr_ptr->rest())
1577     {
1578       KeywordExt *curr = curr_ptr->first();
1579       unsigned int hashcode = compute_hash (curr);
1580       if (_collision_detector->set_bit (hashcode))
1581         {
1582           /* This shouldn't happen.  proj1, proj2, proj3 must have been
1583              computed to be injective on the given keyword set.  */
1584           fprintf (stderr,
1585                    "\nInternal error, unexpected duplicate hash code\n");
1586           if (option[POSITIONS])
1587             fprintf (stderr, "try options -m or -r, or use new key positions.\n\n");
1588           else
1589             fprintf (stderr, "try options -m or -r.\n\n");
1590           exit (1);
1591         }
1592     }
1593 
1594   /* Sorts the keyword list by hash value.  */
1595   sort ();
1596 
1597   /* Set unused asso_values[c] to max_hash_value + 1.  This is not absolutely
1598      necessary, but speeds up the lookup function in many cases of lookup
1599      failure: no string comparison is needed once the hash value of a string
1600      is larger than the hash value of any keyword.  */
1601   int max_hash_value;
1602   {
1603     KeywordExt_List *temp;
1604     for (temp = _head; temp->rest(); temp = temp->rest())
1605       ;
1606     max_hash_value = temp->first()->_hash_value;
1607   }
1608   for (unsigned int c = 0; c < _alpha_size; c++)
1609     if (_occurrences[c] == 0)
1610       _asso_values[c] = max_hash_value + 1;
1611 
1612   /* Propagate unified asso_values.  */
1613   if (_alpha_unify)
1614     for (unsigned int c = 0; c < _alpha_size; c++)
1615       if (_alpha_unify[c] != c)
1616         _asso_values[c] = _asso_values[_alpha_unify[c]];
1617 }
1618 
1619 /* Prints out some diagnostics upon completion.  */
1620 
~Search()1621 Search::~Search ()
1622 {
1623   delete _collision_detector;
1624   if (option[DEBUG])
1625     {
1626       fprintf (stderr, "\ndumping occurrence and associated values tables\n");
1627 
1628       for (unsigned int i = 0; i < _alpha_size; i++)
1629         if (_occurrences[i])
1630           fprintf (stderr, "asso_values[%c] = %6d, occurrences[%c] = %6d\n",
1631                    i, _asso_values[i], i, _occurrences[i]);
1632 
1633       fprintf (stderr, "end table dumping\n");
1634 
1635       fprintf (stderr, "\nDumping key list information:\ntotal non-static linked keywords = %d"
1636                "\ntotal keywords = %d\ntotal duplicates = %d\nmaximum key length = %d\n",
1637                _list_len, _total_keys, _total_duplicates, _max_key_len);
1638 
1639       int field_width = _max_selchars_length;
1640       fprintf (stderr, "\nList contents are:\n(hash value, key length, index, %*s, keyword):\n",
1641                field_width, "selchars");
1642       for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest())
1643         {
1644           fprintf (stderr, "%11d,%11d,%6d, ",
1645                    ptr->first()->_hash_value, ptr->first()->_allchars_length, ptr->first()->_final_index);
1646           if (field_width > ptr->first()->_selchars_length)
1647             fprintf (stderr, "%*s", field_width - ptr->first()->_selchars_length, "");
1648           for (int j = 0; j < ptr->first()->_selchars_length; j++)
1649             putc (ptr->first()->_selchars[j], stderr);
1650           fprintf (stderr, ", %.*s\n",
1651                    ptr->first()->_allchars_length, ptr->first()->_allchars);
1652         }
1653 
1654       fprintf (stderr, "End dumping list.\n\n");
1655     }
1656   delete[] _asso_values;
1657   delete[] _occurrences;
1658   delete[] _alpha_unify;
1659   delete[] _alpha_inc;
1660 }
1661