163eb84d1Schristos /* Fast fuzzy searching among messages.
263eb84d1Schristos    Copyright (C) 2006 Free Software Foundation, Inc.
363eb84d1Schristos    Written by Bruno Haible <bruno@clisp.org>, 2006.
463eb84d1Schristos 
563eb84d1Schristos    This program is free software; you can redistribute it and/or modify
663eb84d1Schristos    it under the terms of the GNU General Public License as published by
763eb84d1Schristos    the Free Software Foundation; either version 2, or (at your option)
863eb84d1Schristos    any later version.
963eb84d1Schristos 
1063eb84d1Schristos    This program is distributed in the hope that it will be useful,
1163eb84d1Schristos    but WITHOUT ANY WARRANTY; without even the implied warranty of
1263eb84d1Schristos    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
1363eb84d1Schristos    GNU General Public License for more details.
1463eb84d1Schristos 
1563eb84d1Schristos    You should have received a copy of the GNU General Public License
1663eb84d1Schristos    along with this program; if not, write to the Free Software Foundation,
1763eb84d1Schristos    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
1863eb84d1Schristos 
1963eb84d1Schristos #ifdef HAVE_CONFIG_H
2063eb84d1Schristos # include <config.h>
2163eb84d1Schristos #endif
2263eb84d1Schristos 
2363eb84d1Schristos /* Specification.  */
2463eb84d1Schristos #include "msgl-fsearch.h"
2563eb84d1Schristos 
2663eb84d1Schristos #include <math.h>
2763eb84d1Schristos #include <stdlib.h>
2863eb84d1Schristos 
2963eb84d1Schristos #include "xalloc.h"
3063eb84d1Schristos #include "po-charset.h"
3163eb84d1Schristos 
3263eb84d1Schristos 
3363eb84d1Schristos /* Fuzzy searching of L strings in a large set of N messages (assuming
3463eb84d1Schristos    they have all the same small size) takes O(L * N) when using repeated
3563eb84d1Schristos    fstrcmp() calls.  When for example L = 800 and N = 69000, this is slow.
3663eb84d1Schristos 
3763eb84d1Schristos    So we preprocess the set of N messages, yielding a data structure
3863eb84d1Schristos    that allows to select the similar messages for any given string, and
3963eb84d1Schristos    apply fstrcmp() only to the search result.  This allows to reduce the
4063eb84d1Schristos    time to something between O(L * 1) and O(L * N) - depending on the
4163eb84d1Schristos    structure of the strings.  In the example with L = 800 and N = 69000,
4263eb84d1Schristos    memory use increases by a factor of 2 and the time decreases from
4363eb84d1Schristos    9068 s to 230 s.
4463eb84d1Schristos 
4563eb84d1Schristos    The data structure is a hash table mapping each n-gram (here with n=4)
4663eb84d1Schristos    occurring in the message strings to the list of messages that contains
4763eb84d1Schristos    it.  For example, if the message list is
4863eb84d1Schristos       [0] "close"
4963eb84d1Schristos       [1] "losers"
5063eb84d1Schristos    the index is a hash table mapping
5163eb84d1Schristos       "clos" -> { 0 }
5263eb84d1Schristos       "lose" -> { 0, 1 }
5363eb84d1Schristos       "oser" -> { 1 }
5463eb84d1Schristos       "sers" -> { 1 }
5563eb84d1Schristos    Searching the similar messages to, say, "closest", is done by looking up
5663eb84d1Schristos    all its 4-grams in the hash table and summing up the results:
5763eb84d1Schristos       "clos" -> { 0 }
5863eb84d1Schristos       "lose" -> { 0, 1 }
5963eb84d1Schristos       "oses" -> {}
6063eb84d1Schristos       "sest" -> {}
6163eb84d1Schristos    => total: { 2x0, 1x1 }
6263eb84d1Schristos    The list is sorted according to decreasing frequency: { 0, 1, ... }
6363eb84d1Schristos    and only the first few messages from this frequency list are passed to
6463eb84d1Schristos    fstrcmp().
6563eb84d1Schristos 
6663eb84d1Schristos    The n-gram similarity and the fstrcmp() similarity are quite different
6763eb84d1Schristos    metrics.  For example, "close" and "c l o s e" have no n-grams in common
6863eb84d1Schristos    (even for n as small as n = 2); however, fstrcmp() will find that they
6963eb84d1Schristos    are quite similar (= 0.71).  Conversely, "AAAA BBBB ... ZZZZ" and
7063eb84d1Schristos    "ZZZZ ... BBBB AAAA" have many 4-grams in common, but their fstrcmp() is
7163eb84d1Schristos    only 0.02.  Also, repeated n-grams don't have the same effect on the
7263eb84d1Schristos    two similarity measures.  But all this doesn't matter much in practice.
7363eb84d1Schristos 
7463eb84d1Schristos    We chose n = 4 because for alphabetic languages, with n = 3 the occurrence
7563eb84d1Schristos    lists are likely too long.  (Well, with ideographic languages like Chinese,
7663eb84d1Schristos    n = 3 might actually be quite good.  Anyway, n = 4 is not bad in this case
7763eb84d1Schristos    either.)
7863eb84d1Schristos 
7963eb84d1Schristos    The units are characters in the current encoding.  Not just bytes,
8063eb84d1Schristos    because 4 consecutive bytes in UTF-8 or GB18030 don't mean much.  */
8163eb84d1Schristos 
8263eb84d1Schristos /* Each message is represented by its index in the message list.  */
8363eb84d1Schristos typedef unsigned int index_ty;
8463eb84d1Schristos 
8563eb84d1Schristos /* An index list has its allocated size and length tacked at the front.
8663eb84d1Schristos    The indices are sorted in ascending order.  */
8763eb84d1Schristos typedef index_ty *index_list_ty;
8863eb84d1Schristos #define IL_ALLOCATED 0
8963eb84d1Schristos #define IL_LENGTH 1
9063eb84d1Schristos 
9163eb84d1Schristos /* Create a new index list containing only a given index.  */
9263eb84d1Schristos static inline index_list_ty
new_index(index_ty idx)9363eb84d1Schristos new_index (index_ty idx)
9463eb84d1Schristos {
9563eb84d1Schristos   index_ty *list = (index_ty *) xmalloc ((2 + 1) * sizeof (index_ty));
9663eb84d1Schristos   list[IL_ALLOCATED] = 1;
9763eb84d1Schristos   list[IL_LENGTH] = 1;
9863eb84d1Schristos   list[2] = idx;
9963eb84d1Schristos 
10063eb84d1Schristos   return list;
10163eb84d1Schristos }
10263eb84d1Schristos 
10363eb84d1Schristos /* Add a given index, greater or equal than all previous indices, to an
10463eb84d1Schristos    index list.
10563eb84d1Schristos    Return the new index list, if it had to be reallocated, or NULL if it
10663eb84d1Schristos    didn't change.  */
10763eb84d1Schristos static inline index_list_ty
addlast_index(index_list_ty list,index_ty idx)10863eb84d1Schristos addlast_index (index_list_ty list, index_ty idx)
10963eb84d1Schristos {
11063eb84d1Schristos   index_list_ty result;
11163eb84d1Schristos   size_t length = list[IL_LENGTH];
11263eb84d1Schristos 
11363eb84d1Schristos   /* Look whether it should be inserted.  */
11463eb84d1Schristos   if (length > 0 && list[2 + (length - 1)] == idx)
11563eb84d1Schristos     return NULL;
11663eb84d1Schristos 
11763eb84d1Schristos   /* Now make room for one more list element.  */
11863eb84d1Schristos   result = NULL;
11963eb84d1Schristos   if (length == list[IL_ALLOCATED])
12063eb84d1Schristos     {
12163eb84d1Schristos       size_t new_allocated = 2 * length - (length >> 6);
12263eb84d1Schristos       list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty));
12363eb84d1Schristos       list[IL_ALLOCATED] = new_allocated;
12463eb84d1Schristos       result = list;
12563eb84d1Schristos     }
12663eb84d1Schristos   list[2 + length] = idx;
12763eb84d1Schristos   list[IL_LENGTH] = length + 1;
12863eb84d1Schristos   return result;
12963eb84d1Schristos }
13063eb84d1Schristos 
13163eb84d1Schristos /* Add a given index to an index list.
13263eb84d1Schristos    Return the new index list, if it had to be reallocated, or NULL if it
13363eb84d1Schristos    didn't change.  */
13463eb84d1Schristos static inline index_list_ty
add_index(index_list_ty list,index_ty idx)13563eb84d1Schristos add_index (index_list_ty list, index_ty idx)
13663eb84d1Schristos {
13763eb84d1Schristos   index_list_ty result;
13863eb84d1Schristos   size_t length = list[IL_LENGTH];
13963eb84d1Schristos 
14063eb84d1Schristos   /* Look where it should be inserted.  */
14163eb84d1Schristos   size_t lo = 0;
14263eb84d1Schristos   size_t hi = length;
14363eb84d1Schristos   while (lo < hi)
14463eb84d1Schristos     {
14563eb84d1Schristos       /* Here we know that idx must be inserted at a position >= lo, <= hi.  */
14663eb84d1Schristos       size_t mid = (lo + hi) / 2; /* lo <= mid < hi */
14763eb84d1Schristos       index_ty val = list[2 + mid];
14863eb84d1Schristos       if (val < idx)
14963eb84d1Schristos 	lo = mid + 1;
15063eb84d1Schristos       else if (val > idx)
15163eb84d1Schristos 	hi = mid;
15263eb84d1Schristos       else
15363eb84d1Schristos 	return NULL;
15463eb84d1Schristos     }
15563eb84d1Schristos 
15663eb84d1Schristos   /* Now make room for one more list element.  */
15763eb84d1Schristos   result = NULL;
15863eb84d1Schristos   if (length == list[IL_ALLOCATED])
15963eb84d1Schristos     {
16063eb84d1Schristos       size_t new_allocated = 2 * length - (length >> 6);
16163eb84d1Schristos       list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty));
16263eb84d1Schristos       list[IL_ALLOCATED] = new_allocated;
16363eb84d1Schristos       result = list;
16463eb84d1Schristos     }
16563eb84d1Schristos   list[IL_LENGTH] = length + 1;
16663eb84d1Schristos   for (; length > hi; length--)
16763eb84d1Schristos     list[2 + length] = list[1 + length];
16863eb84d1Schristos   list[2 + length] = idx;
16963eb84d1Schristos   return result;
17063eb84d1Schristos }
17163eb84d1Schristos 
17263eb84d1Schristos /* We use 4-grams, therefore strings with less than 4 characters cannot be
17363eb84d1Schristos    handled through the 4-grams table and need to be handled specially.
17463eb84d1Schristos    Since every character occupies at most 4 bytes (see po-charset.c),
17563eb84d1Schristos    this means the size of such short strings is bounded by:  */
17663eb84d1Schristos #define SHORT_STRING_MAX_CHARACTERS (4 - 1)
17763eb84d1Schristos #define SHORT_STRING_MAX_BYTES (SHORT_STRING_MAX_CHARACTERS * 4)
17863eb84d1Schristos 
17963eb84d1Schristos /* Such short strings are handled by direct comparison with all messages
18063eb84d1Schristos    of appropriate size.  Note that for two strings of length 0 <= l1 <= l2,
18163eb84d1Schristos    fstrcmp() is <= 2 * l1 / (l1 + l2).  Since we are only interested in
18263eb84d1Schristos    fstrcmp() values >= FUZZY_THRESHOLD, we can for a given string of length l
18363eb84d1Schristos    limit the search to lengths l' in the range
18463eb84d1Schristos      l / (2 / FUZZY_THRESHOLD - 1) <= l' <= l * (2 / FUZZY_THRESHOLD - 1)
18563eb84d1Schristos    Thus we need the list of the short strings up to length:  */
186*99c2fe91Schristos #define SHORT_MSG_MAX (int) (SHORT_STRING_MAX_BYTES * (2 * FUZZY_THRESHOLD_INV - 1))
18763eb84d1Schristos 
18863eb84d1Schristos /* A fuzzy index contains a hash table mapping all n-grams to their
18963eb84d1Schristos    occurrences list.  */
19063eb84d1Schristos struct message_fuzzy_index_ty
19163eb84d1Schristos {
19263eb84d1Schristos   message_ty **messages;
19363eb84d1Schristos   character_iterator_t iterator;
19463eb84d1Schristos   hash_table gram4;
19563eb84d1Schristos   size_t firstfew;
19663eb84d1Schristos   message_list_ty *short_messages[SHORT_MSG_MAX + 1];
19763eb84d1Schristos };
19863eb84d1Schristos 
19963eb84d1Schristos /* Allocate a fuzzy index corresponding to a given list of messages.
20063eb84d1Schristos    The list of messages and the msgctxt and msgid fields of the messages
20163eb84d1Schristos    inside it must not be modified while the returned fuzzy index is in use.  */
20263eb84d1Schristos message_fuzzy_index_ty *
message_fuzzy_index_alloc(const message_list_ty * mlp,const char * canon_charset)20363eb84d1Schristos message_fuzzy_index_alloc (const message_list_ty *mlp,
20463eb84d1Schristos 			   const char *canon_charset)
20563eb84d1Schristos {
20663eb84d1Schristos   message_fuzzy_index_ty *findex =
20763eb84d1Schristos     (message_fuzzy_index_ty *) xmalloc (sizeof (message_fuzzy_index_ty));
20863eb84d1Schristos   size_t count = mlp->nitems;
20963eb84d1Schristos   size_t j;
21063eb84d1Schristos   size_t l;
21163eb84d1Schristos 
21263eb84d1Schristos   findex->messages = mlp->item;
21363eb84d1Schristos   findex->iterator = po_charset_character_iterator (canon_charset);
21463eb84d1Schristos 
21563eb84d1Schristos   /* Setup hash table.  */
21663eb84d1Schristos   if (hash_init (&findex->gram4, 10 * count) < 0)
21763eb84d1Schristos     xalloc_die ();
21863eb84d1Schristos   for (j = 0; j < count; j++)
21963eb84d1Schristos     {
22063eb84d1Schristos       message_ty *mp = mlp->item[j];
22163eb84d1Schristos 
22263eb84d1Schristos       if (mp->msgstr != NULL && mp->msgstr[0] != '\0')
22363eb84d1Schristos 	{
22463eb84d1Schristos 	  const char *str = mp->msgid;
22563eb84d1Schristos 
22663eb84d1Schristos 	  /* Let p0 < p1 < p2 < p3 < p4 walk through the string.  */
22763eb84d1Schristos 	  const char *p0 = str;
22863eb84d1Schristos 	  if (*p0 != '\0')
22963eb84d1Schristos 	    {
23063eb84d1Schristos 	      const char *p1 = p0 + findex->iterator (p0);
23163eb84d1Schristos 	      if (*p1 != '\0')
23263eb84d1Schristos 		{
23363eb84d1Schristos 		  const char *p2 = p1 + findex->iterator (p1);
23463eb84d1Schristos 		  if (*p2 != '\0')
23563eb84d1Schristos 		    {
23663eb84d1Schristos 		      const char *p3 = p2 + findex->iterator (p2);
23763eb84d1Schristos 		      if (*p3 != '\0')
23863eb84d1Schristos 			{
23963eb84d1Schristos 			  const char *p4 = p3 + findex->iterator (p3);
24063eb84d1Schristos 			  for (;;)
24163eb84d1Schristos 			    {
24263eb84d1Schristos 			      /* The segment from p0 to p4 is a 4-gram of
24363eb84d1Schristos 				 characters.  Add a hash table entry that maps
24463eb84d1Schristos 				 it to the index j, or extend the existing
24563eb84d1Schristos 				 hash table entry accordingly.  */
24663eb84d1Schristos 			      void *found;
24763eb84d1Schristos 
24863eb84d1Schristos 			      if (hash_find_entry (&findex->gram4, p0, p4 - p0,
24963eb84d1Schristos 						   &found) == 0)
25063eb84d1Schristos 				{
25163eb84d1Schristos 				  index_list_ty list = (index_list_ty) found;
25263eb84d1Schristos 				  list = addlast_index (list, j);
25363eb84d1Schristos 				  if (list != NULL)
25463eb84d1Schristos 				    hash_set_value (&findex->gram4, p0, p4 - p0,
25563eb84d1Schristos 						    list);
25663eb84d1Schristos 				}
25763eb84d1Schristos 			      else
25863eb84d1Schristos 				hash_insert_entry (&findex->gram4, p0, p4 - p0,
25963eb84d1Schristos 						   new_index (j));
26063eb84d1Schristos 
26163eb84d1Schristos 			      /* Advance.  */
26263eb84d1Schristos 			      if (*p4 == '\0')
26363eb84d1Schristos 				break;
26463eb84d1Schristos 			      p0 = p1;
26563eb84d1Schristos 			      p1 = p2;
26663eb84d1Schristos 			      p2 = p3;
26763eb84d1Schristos 			      p3 = p4;
26863eb84d1Schristos 			      p4 = p4 + findex->iterator (p4);
26963eb84d1Schristos 			    }
27063eb84d1Schristos 			}
27163eb84d1Schristos 		    }
27263eb84d1Schristos 		}
27363eb84d1Schristos 	    }
27463eb84d1Schristos 	}
27563eb84d1Schristos     }
27663eb84d1Schristos 
27763eb84d1Schristos   /* Shrink memory used by the hash table.  */
27863eb84d1Schristos   {
27963eb84d1Schristos     void *iter;
28063eb84d1Schristos     const void *key;
28163eb84d1Schristos     size_t keylen;
28263eb84d1Schristos     void **valuep;
28363eb84d1Schristos 
28463eb84d1Schristos     iter = NULL;
28563eb84d1Schristos     while (hash_iterate_modify (&findex->gram4, &iter, &key, &keylen, &valuep)
28663eb84d1Schristos 	   == 0)
28763eb84d1Schristos       {
28863eb84d1Schristos 	index_list_ty list = (index_list_ty) *valuep;
28963eb84d1Schristos 	index_ty length = list[IL_LENGTH];
29063eb84d1Schristos 
29163eb84d1Schristos 	if (length < list[IL_ALLOCATED])
29263eb84d1Schristos 	  {
29363eb84d1Schristos 	    list[IL_ALLOCATED] = length;
29463eb84d1Schristos 	    *valuep = xrealloc (list, (2 + length) * sizeof (index_ty));
29563eb84d1Schristos 	  }
29663eb84d1Schristos       }
29763eb84d1Schristos   }
29863eb84d1Schristos 
29963eb84d1Schristos   findex->firstfew = (int) sqrt ((double) count);
30063eb84d1Schristos   if (findex->firstfew < 10)
30163eb84d1Schristos     findex->firstfew = 10;
30263eb84d1Schristos 
30363eb84d1Schristos   /* Setup lists of short messages.  */
30463eb84d1Schristos   for (l = 0; l <= SHORT_MSG_MAX; l++)
30563eb84d1Schristos     findex->short_messages[l] = message_list_alloc (false);
30663eb84d1Schristos   for (j = 0; j < count; j++)
30763eb84d1Schristos     {
30863eb84d1Schristos       message_ty *mp = mlp->item[j];
30963eb84d1Schristos 
31063eb84d1Schristos       if (mp->msgstr != NULL && mp->msgstr[0] != '\0')
31163eb84d1Schristos 	{
31263eb84d1Schristos 	  const char *str = mp->msgid;
31363eb84d1Schristos 	  size_t len = strlen (str);
31463eb84d1Schristos 
31563eb84d1Schristos 	  if (len <= SHORT_MSG_MAX)
31663eb84d1Schristos 	    message_list_append (findex->short_messages[len], mp);
31763eb84d1Schristos 	}
31863eb84d1Schristos     }
31963eb84d1Schristos 
32063eb84d1Schristos   /* Shrink memory used by the lists of short messages.  */
32163eb84d1Schristos   for (l = 0; l <= SHORT_MSG_MAX; l++)
32263eb84d1Schristos     {
32363eb84d1Schristos       message_list_ty *mlp = findex->short_messages[l];
32463eb84d1Schristos 
32563eb84d1Schristos       if (mlp->nitems < mlp->nitems_max)
32663eb84d1Schristos 	{
32763eb84d1Schristos 	  mlp->nitems_max = mlp->nitems;
32863eb84d1Schristos 	  mlp->item =
32963eb84d1Schristos 	    (message_ty **)
33063eb84d1Schristos 	    xrealloc (mlp->item, mlp->nitems_max * sizeof (message_ty *));
33163eb84d1Schristos 	}
33263eb84d1Schristos     }
33363eb84d1Schristos 
33463eb84d1Schristos   return findex;
33563eb84d1Schristos }
33663eb84d1Schristos 
33763eb84d1Schristos /* An index with multiplicity.  */
33863eb84d1Schristos struct mult_index
33963eb84d1Schristos {
34063eb84d1Schristos   index_ty index;
34163eb84d1Schristos   unsigned int count;
34263eb84d1Schristos };
34363eb84d1Schristos 
34463eb84d1Schristos /* A list of indices with multiplicity.
34563eb84d1Schristos    The indices are sorted in ascending order.  */
34663eb84d1Schristos struct mult_index_list
34763eb84d1Schristos {
34863eb84d1Schristos   struct mult_index *item;
34963eb84d1Schristos   size_t nitems;
35063eb84d1Schristos   size_t nitems_max;
35163eb84d1Schristos   /* Work area.  */
35263eb84d1Schristos   struct mult_index *item2;
35363eb84d1Schristos   size_t nitems2_max;
35463eb84d1Schristos };
35563eb84d1Schristos 
35663eb84d1Schristos /* Initialize an empty list of indices with multiplicity.  */
35763eb84d1Schristos static inline void
mult_index_list_init(struct mult_index_list * accu)35863eb84d1Schristos mult_index_list_init (struct mult_index_list *accu)
35963eb84d1Schristos {
36063eb84d1Schristos   accu->nitems = 0;
36163eb84d1Schristos   accu->nitems_max = 0;
36263eb84d1Schristos   accu->item = NULL;
36363eb84d1Schristos   accu->nitems2_max = 0;
36463eb84d1Schristos   accu->item2 = NULL;
36563eb84d1Schristos }
36663eb84d1Schristos 
36763eb84d1Schristos /* Add an index list to a list of indices with multiplicity.  */
36863eb84d1Schristos static inline void
mult_index_list_accumulate(struct mult_index_list * accu,index_list_ty list)36963eb84d1Schristos mult_index_list_accumulate (struct mult_index_list *accu, index_list_ty list)
37063eb84d1Schristos {
37163eb84d1Schristos   size_t len1 = accu->nitems;
37263eb84d1Schristos   size_t len2 = list[IL_LENGTH];
37363eb84d1Schristos   size_t need = len1 + len2;
37463eb84d1Schristos   struct mult_index *ptr1;
37563eb84d1Schristos   struct mult_index *ptr1_end;
37663eb84d1Schristos   index_ty *ptr2;
37763eb84d1Schristos   index_ty *ptr2_end;
37863eb84d1Schristos   struct mult_index *destptr;
37963eb84d1Schristos 
38063eb84d1Schristos   /* Make the work area large enough.  */
38163eb84d1Schristos   if (accu->nitems2_max < need)
38263eb84d1Schristos     {
38363eb84d1Schristos       size_t new_max = 2 * accu->nitems2_max + 1;
38463eb84d1Schristos 
38563eb84d1Schristos       if (new_max < need)
38663eb84d1Schristos 	new_max = need;
38763eb84d1Schristos       if (accu->item2 != NULL)
38863eb84d1Schristos 	free (accu->item2);
38963eb84d1Schristos       accu->item2 =
39063eb84d1Schristos 	(struct mult_index *) xmalloc (new_max * sizeof (struct mult_index));
39163eb84d1Schristos       accu->nitems2_max = new_max;
39263eb84d1Schristos     }
39363eb84d1Schristos 
39463eb84d1Schristos   /* Make a linear pass through accu and list simultaneously.  */
39563eb84d1Schristos   ptr1 = accu->item;
39663eb84d1Schristos   ptr1_end = ptr1 + len1;
39763eb84d1Schristos   ptr2 = list + 2;
39863eb84d1Schristos   ptr2_end = ptr2 + len2;
39963eb84d1Schristos   destptr = accu->item2;
40063eb84d1Schristos   while (ptr1 < ptr1_end && ptr2 < ptr2_end)
40163eb84d1Schristos     {
40263eb84d1Schristos       if (ptr1->index < *ptr2)
40363eb84d1Schristos 	{
40463eb84d1Schristos 	  *destptr = *ptr1;
40563eb84d1Schristos 	  ptr1++;
40663eb84d1Schristos 	}
40763eb84d1Schristos       else if (ptr1->index > *ptr2)
40863eb84d1Schristos 	{
40963eb84d1Schristos 	  destptr->index = *ptr2;
41063eb84d1Schristos 	  destptr->count = 1;
41163eb84d1Schristos 	  ptr2++;
41263eb84d1Schristos 	}
41363eb84d1Schristos       else /* ptr1->index == list[2 + i2] */
41463eb84d1Schristos 	{
41563eb84d1Schristos 	  destptr->index = ptr1->index;
41663eb84d1Schristos 	  destptr->count = ptr1->count + 1;
41763eb84d1Schristos 	  ptr1++;
41863eb84d1Schristos 	  ptr2++;
41963eb84d1Schristos 	}
42063eb84d1Schristos       destptr++;
42163eb84d1Schristos     }
42263eb84d1Schristos   while (ptr1 < ptr1_end)
42363eb84d1Schristos     {
42463eb84d1Schristos       *destptr = *ptr1;
42563eb84d1Schristos       ptr1++;
42663eb84d1Schristos       destptr++;
42763eb84d1Schristos     }
42863eb84d1Schristos   while (ptr2 < ptr2_end)
42963eb84d1Schristos     {
43063eb84d1Schristos       destptr->index = *ptr2;
43163eb84d1Schristos       destptr->count = 1;
43263eb84d1Schristos       ptr2++;
43363eb84d1Schristos       destptr++;
43463eb84d1Schristos     }
43563eb84d1Schristos 
43663eb84d1Schristos   /* Swap accu->item and accu->item2.  */
43763eb84d1Schristos   {
43863eb84d1Schristos     struct mult_index *dest = accu->item2;
43963eb84d1Schristos     size_t dest_max = accu->nitems2_max;
44063eb84d1Schristos 
44163eb84d1Schristos     accu->item2 = accu->item;
44263eb84d1Schristos     accu->nitems2_max = accu->nitems_max;
44363eb84d1Schristos 
44463eb84d1Schristos     accu->item = dest;
44563eb84d1Schristos     accu->nitems = destptr - dest;
44663eb84d1Schristos     accu->nitems_max = dest_max;
44763eb84d1Schristos   }
44863eb84d1Schristos }
44963eb84d1Schristos 
45063eb84d1Schristos /* Compares two indices with multiplicity, according to their multiplicity.  */
45163eb84d1Schristos static int
mult_index_compare(const void * p1,const void * p2)45263eb84d1Schristos mult_index_compare (const void *p1, const void *p2)
45363eb84d1Schristos {
45463eb84d1Schristos   const struct mult_index *ptr1 = (const struct mult_index *) p1;
45563eb84d1Schristos   const struct mult_index *ptr2 = (const struct mult_index *) p2;
45663eb84d1Schristos 
45763eb84d1Schristos   if (ptr1->count < ptr2->count)
45863eb84d1Schristos     return 1;
45963eb84d1Schristos   if (ptr1->count > ptr2->count)
46063eb84d1Schristos     return -1;
46163eb84d1Schristos   /* For reproduceable results, also take into account the indices.  */
46263eb84d1Schristos   if (ptr1->index > ptr2->index)
46363eb84d1Schristos     return 1;
46463eb84d1Schristos   if (ptr1->index < ptr2->index)
46563eb84d1Schristos     return -1;
46663eb84d1Schristos   return 0;
46763eb84d1Schristos }
46863eb84d1Schristos 
46963eb84d1Schristos /* Sort a list of indices with multiplicity according to decreasing
47063eb84d1Schristos    multiplicity.  */
47163eb84d1Schristos static inline void
mult_index_list_sort(struct mult_index_list * accu)47263eb84d1Schristos mult_index_list_sort (struct mult_index_list *accu)
47363eb84d1Schristos {
47463eb84d1Schristos   if (accu->nitems > 1)
47563eb84d1Schristos     qsort (accu->item, accu->nitems, sizeof (struct mult_index),
47663eb84d1Schristos 	   mult_index_compare);
47763eb84d1Schristos }
47863eb84d1Schristos 
47963eb84d1Schristos /* Frees a list of indices with multiplicity.  */
48063eb84d1Schristos static inline void
mult_index_list_free(struct mult_index_list * accu)48163eb84d1Schristos mult_index_list_free (struct mult_index_list *accu)
48263eb84d1Schristos {
48363eb84d1Schristos   if (accu->item != NULL)
48463eb84d1Schristos     free (accu->item);
48563eb84d1Schristos   if (accu->item2 != NULL)
48663eb84d1Schristos     free (accu->item2);
48763eb84d1Schristos }
48863eb84d1Schristos 
48963eb84d1Schristos /* Find a good match for the given msgctxt and msgid in the given fuzzy index.
49063eb84d1Schristos    The match does not need to be optimal.  */
49163eb84d1Schristos message_ty *
message_fuzzy_index_search(message_fuzzy_index_ty * findex,const char * msgctxt,const char * msgid)49263eb84d1Schristos message_fuzzy_index_search (message_fuzzy_index_ty *findex,
49363eb84d1Schristos 			    const char *msgctxt, const char *msgid)
49463eb84d1Schristos {
49563eb84d1Schristos   const char *str = msgid;
49663eb84d1Schristos 
49763eb84d1Schristos   /* Let p0 < p1 < p2 < p3 < p4 walk through the string.  */
49863eb84d1Schristos   const char *p0 = str;
49963eb84d1Schristos   if (*p0 != '\0')
50063eb84d1Schristos     {
50163eb84d1Schristos       const char *p1 = p0 + findex->iterator (p0);
50263eb84d1Schristos       if (*p1 != '\0')
50363eb84d1Schristos 	{
50463eb84d1Schristos 	  const char *p2 = p1 + findex->iterator (p1);
50563eb84d1Schristos 	  if (*p2 != '\0')
50663eb84d1Schristos 	    {
50763eb84d1Schristos 	      const char *p3 = p2 + findex->iterator (p2);
50863eb84d1Schristos 	      if (*p3 != '\0')
50963eb84d1Schristos 		{
51063eb84d1Schristos 		  const char *p4 = p3 + findex->iterator (p3);
51163eb84d1Schristos 		  struct mult_index_list accu;
51263eb84d1Schristos 
51363eb84d1Schristos 		  mult_index_list_init (&accu);
51463eb84d1Schristos 		  for (;;)
51563eb84d1Schristos 		    {
51663eb84d1Schristos 		      /* The segment from p0 to p4 is a 4-gram of
51763eb84d1Schristos 			 characters.  Get the hash table entry containing
51863eb84d1Schristos 			 a list of indices, and add it to the accu.  */
51963eb84d1Schristos 		      void *found;
52063eb84d1Schristos 
52163eb84d1Schristos 		      if (hash_find_entry (&findex->gram4, p0, p4 - p0,
52263eb84d1Schristos 					   &found) == 0)
52363eb84d1Schristos 			{
52463eb84d1Schristos 			  index_list_ty list = (index_list_ty) found;
52563eb84d1Schristos 			  mult_index_list_accumulate (&accu, list);
52663eb84d1Schristos 			}
52763eb84d1Schristos 
52863eb84d1Schristos 		      /* Advance.  */
52963eb84d1Schristos 		      if (*p4 == '\0')
53063eb84d1Schristos 			break;
53163eb84d1Schristos 		      p0 = p1;
53263eb84d1Schristos 		      p1 = p2;
53363eb84d1Schristos 		      p2 = p3;
53463eb84d1Schristos 		      p3 = p4;
53563eb84d1Schristos 		      p4 = p4 + findex->iterator (p4);
53663eb84d1Schristos 		    }
53763eb84d1Schristos 
53863eb84d1Schristos 		  /* Sort in decreasing count order.  */
53963eb84d1Schristos 		  mult_index_list_sort (&accu);
54063eb84d1Schristos 
54163eb84d1Schristos 		  /* Take the first few messages from this sorted list, and
54263eb84d1Schristos 		     maximize the fstrcmp() result.  */
54363eb84d1Schristos 		  {
54463eb84d1Schristos 		    size_t count;
54563eb84d1Schristos 		    struct mult_index *ptr;
54663eb84d1Schristos 		    message_ty *best_mp;
54763eb84d1Schristos 		    double best_weight;
54863eb84d1Schristos 
54963eb84d1Schristos 		    count = findex->firstfew;
55063eb84d1Schristos 		    if (count > accu.nitems)
55163eb84d1Schristos 		      count = accu.nitems;
55263eb84d1Schristos 
55363eb84d1Schristos 		    best_weight = FUZZY_THRESHOLD;
55463eb84d1Schristos 		    best_mp = NULL;
55563eb84d1Schristos 		    for (ptr = accu.item; count > 0; ptr++, count--)
55663eb84d1Schristos 		      {
55763eb84d1Schristos 			message_ty *mp = findex->messages[ptr->index];
55863eb84d1Schristos 			double weight =
55963eb84d1Schristos 			  fuzzy_search_goal_function (mp, msgctxt, msgid);
56063eb84d1Schristos 
56163eb84d1Schristos 			if (weight > best_weight)
56263eb84d1Schristos 			  {
56363eb84d1Schristos 			    best_weight = weight;
56463eb84d1Schristos 			    best_mp = mp;
56563eb84d1Schristos 			  }
56663eb84d1Schristos 		      }
56763eb84d1Schristos 
56863eb84d1Schristos 		    mult_index_list_free (&accu);
56963eb84d1Schristos 
57063eb84d1Schristos 		    return best_mp;
57163eb84d1Schristos 		  }
57263eb84d1Schristos 		}
57363eb84d1Schristos 	    }
57463eb84d1Schristos 	}
57563eb84d1Schristos     }
57663eb84d1Schristos 
57763eb84d1Schristos   /* The string had less than 4 characters.  */
57863eb84d1Schristos   {
57963eb84d1Schristos     size_t l = strlen (str);
58063eb84d1Schristos     size_t lmin, lmax;
58163eb84d1Schristos     message_ty *best_mp;
58263eb84d1Schristos     double best_weight;
58363eb84d1Schristos 
58463eb84d1Schristos     if (!(l <= SHORT_STRING_MAX_BYTES))
58563eb84d1Schristos       abort ();
58663eb84d1Schristos 
58763eb84d1Schristos     /* Walk through those short messages which have an appropriate length.
58863eb84d1Schristos        See the comment before SHORT_MSG_MAX.  */
58963eb84d1Schristos     lmin = (int) ceil (l / (2 / FUZZY_THRESHOLD - 1));
59063eb84d1Schristos     lmax = (int) (l * (2 / FUZZY_THRESHOLD - 1));
59163eb84d1Schristos     if (!(lmax <= SHORT_MSG_MAX))
59263eb84d1Schristos       abort ();
59363eb84d1Schristos 
59463eb84d1Schristos     best_weight = FUZZY_THRESHOLD;
59563eb84d1Schristos     best_mp = NULL;
59663eb84d1Schristos     for (l = lmin; l <= lmax; l++)
59763eb84d1Schristos       {
59863eb84d1Schristos 	message_list_ty *mlp = findex->short_messages[l];
59963eb84d1Schristos 	size_t j;
60063eb84d1Schristos 
60163eb84d1Schristos 	for (j = 0; j < mlp->nitems; j++)
60263eb84d1Schristos 	  {
60363eb84d1Schristos 	    message_ty *mp = mlp->item[j];
60463eb84d1Schristos 	    double weight = fuzzy_search_goal_function (mp, msgctxt, msgid);
60563eb84d1Schristos 
60663eb84d1Schristos 	    if (weight > best_weight)
60763eb84d1Schristos 	      {
60863eb84d1Schristos 		best_weight = weight;
60963eb84d1Schristos 		best_mp = mp;
61063eb84d1Schristos 	      }
61163eb84d1Schristos 	  }
61263eb84d1Schristos       }
61363eb84d1Schristos 
61463eb84d1Schristos     return best_mp;
61563eb84d1Schristos   }
61663eb84d1Schristos }
61763eb84d1Schristos 
61863eb84d1Schristos /* Free a fuzzy index.  */
61963eb84d1Schristos void
message_fuzzy_index_free(message_fuzzy_index_ty * findex)62063eb84d1Schristos message_fuzzy_index_free (message_fuzzy_index_ty *findex)
62163eb84d1Schristos {
62263eb84d1Schristos   size_t l;
62363eb84d1Schristos   void *iter;
62463eb84d1Schristos   const void *key;
62563eb84d1Schristos   size_t keylen;
62663eb84d1Schristos   void *data;
62763eb84d1Schristos 
62863eb84d1Schristos   /* Free the short lists.  */
62963eb84d1Schristos   for (l = 0; l <= SHORT_MSG_MAX; l++)
63063eb84d1Schristos     message_list_free (findex->short_messages[l], 1);
63163eb84d1Schristos 
63263eb84d1Schristos   /* Free the index lists occurring as values in the hash tables.  */
63363eb84d1Schristos   iter = NULL;
63463eb84d1Schristos   while (hash_iterate (&findex->gram4, &iter, &key, &keylen, &data) == 0)
63563eb84d1Schristos     free ((index_list_ty *) data);
63663eb84d1Schristos   /* Free the hash table itself.  */
63763eb84d1Schristos   hash_destroy (&findex->gram4);
63863eb84d1Schristos 
63963eb84d1Schristos   free (findex);
64063eb84d1Schristos }
641