1 /* Fast fuzzy searching among messages.
2 Copyright (C) 2006, 2008, 2011, 2013 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2006.
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17
18 #ifdef HAVE_CONFIG_H
19 # include <config.h>
20 #endif
21
22 /* Specification. */
23 #include "msgl-fsearch.h"
24
25 #include <math.h>
26 #include <stdlib.h>
27
28 #include "xalloc.h"
29 #include "po-charset.h"
30
31
32 /* Fuzzy searching of L strings in a large set of N messages (assuming
33 they have all the same small size) takes O(L * N) when using repeated
34 fstrcmp() calls. When for example L = 800 and N = 69000, this is slow.
35
36 So we preprocess the set of N messages, yielding a data structure
37 that allows to select the similar messages for any given string, and
38 apply fstrcmp() only to the search result. This allows to reduce the
39 time to something between O(L * 1) and O(L * N) - depending on the
40 structure of the strings. In the example with L = 800 and N = 69000,
41 memory use increases by a factor of 2 and the time decreases from
42 9068 s to 230 s.
43
44 The data structure is a hash table mapping each n-gram (here with n=4)
45 occurring in the message strings to the list of messages that contains
46 it. For example, if the message list is
47 [0] "close"
48 [1] "losers"
49 the index is a hash table mapping
50 "clos" -> { 0 }
51 "lose" -> { 0, 1 }
52 "oser" -> { 1 }
53 "sers" -> { 1 }
54 Searching the similar messages to, say, "closest", is done by looking up
55 all its 4-grams in the hash table and summing up the results:
56 "clos" -> { 0 }
57 "lose" -> { 0, 1 }
58 "oses" -> {}
59 "sest" -> {}
60 => total: { 2x0, 1x1 }
61 The list is sorted according to decreasing frequency: { 0, 1, ... }
62 and only the first few messages from this frequency list are passed to
63 fstrcmp().
64
65 The n-gram similarity and the fstrcmp() similarity are quite different
66 metrics. For example, "close" and "c l o s e" have no n-grams in common
67 (even for n as small as n = 2); however, fstrcmp() will find that they
68 are quite similar (= 0.71). Conversely, "AAAA BBBB ... ZZZZ" and
69 "ZZZZ ... BBBB AAAA" have many 4-grams in common, but their fstrcmp() is
70 only 0.02. Also, repeated n-grams don't have the same effect on the
71 two similarity measures. But all this doesn't matter much in practice.
72
73 We chose n = 4 because for alphabetic languages, with n = 3 the occurrence
74 lists are likely too long. (Well, with ideographic languages like Chinese,
75 n = 3 might actually be quite good. Anyway, n = 4 is not bad in this case
76 either.)
77
78 The units are characters in the current encoding. Not just bytes,
79 because 4 consecutive bytes in UTF-8 or GB18030 don't mean much. */
80
81 /* Each message is represented by its index in the message list. */
82 typedef unsigned int index_ty;
83
84 /* An index list has its allocated size and length tacked at the front.
85 The indices are sorted in ascending order. */
86 typedef index_ty *index_list_ty;
87 #define IL_ALLOCATED 0
88 #define IL_LENGTH 1
89
90 /* Create a new index list containing only a given index. */
91 static inline index_list_ty
new_index(index_ty idx)92 new_index (index_ty idx)
93 {
94 index_ty *list = XNMALLOC (2 + 1, index_ty);
95 list[IL_ALLOCATED] = 1;
96 list[IL_LENGTH] = 1;
97 list[2] = idx;
98
99 return list;
100 }
101
102 /* Add a given index, greater or equal than all previous indices, to an
103 index list.
104 Return the new index list, if it had to be reallocated, or NULL if it
105 didn't change. */
106 static inline index_list_ty
addlast_index(index_list_ty list,index_ty idx)107 addlast_index (index_list_ty list, index_ty idx)
108 {
109 index_list_ty result;
110 size_t length = list[IL_LENGTH];
111
112 /* Look whether it should be inserted. */
113 if (length > 0 && list[2 + (length - 1)] == idx)
114 return NULL;
115
116 /* Now make room for one more list element. */
117 result = NULL;
118 if (length == list[IL_ALLOCATED])
119 {
120 size_t new_allocated = 2 * length - (length >> 6);
121 list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty));
122 list[IL_ALLOCATED] = new_allocated;
123 result = list;
124 }
125 list[2 + length] = idx;
126 list[IL_LENGTH] = length + 1;
127 return result;
128 }
129
130 /* Add a given index to an index list.
131 Return the new index list, if it had to be reallocated, or NULL if it
132 didn't change. */
133 static inline index_list_ty
add_index(index_list_ty list,index_ty idx)134 add_index (index_list_ty list, index_ty idx)
135 {
136 index_list_ty result;
137 size_t length = list[IL_LENGTH];
138
139 /* Look where it should be inserted. */
140 size_t lo = 0;
141 size_t hi = length;
142 while (lo < hi)
143 {
144 /* Here we know that idx must be inserted at a position >= lo, <= hi. */
145 size_t mid = (lo + hi) / 2; /* lo <= mid < hi */
146 index_ty val = list[2 + mid];
147 if (val < idx)
148 lo = mid + 1;
149 else if (val > idx)
150 hi = mid;
151 else
152 return NULL;
153 }
154
155 /* Now make room for one more list element. */
156 result = NULL;
157 if (length == list[IL_ALLOCATED])
158 {
159 size_t new_allocated = 2 * length - (length >> 6);
160 list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty));
161 list[IL_ALLOCATED] = new_allocated;
162 result = list;
163 }
164 list[IL_LENGTH] = length + 1;
165 for (; length > hi; length--)
166 list[2 + length] = list[1 + length];
167 list[2 + length] = idx;
168 return result;
169 }
170
171 /* We use 4-grams, therefore strings with less than 4 characters cannot be
172 handled through the 4-grams table and need to be handled specially.
173 Since every character occupies at most 4 bytes (see po-charset.c),
174 this means the size of such short strings is bounded by: */
175 #define SHORT_STRING_MAX_CHARACTERS (4 - 1)
176 #define SHORT_STRING_MAX_BYTES (SHORT_STRING_MAX_CHARACTERS * 4)
177
178 /* Such short strings are handled by direct comparison with all messages
179 of appropriate size. Note that for two strings of length 0 <= l1 <= l2,
180 fstrcmp() is <= 2 * l1 / (l1 + l2). Since we are only interested in
181 fstrcmp() values >= FUZZY_THRESHOLD, we can for a given string of length l
182 limit the search to lengths l' in the range
183 l / (2 / FUZZY_THRESHOLD - 1) <= l' <= l * (2 / FUZZY_THRESHOLD - 1)
184 Thus we need the list of the short strings up to length: */
185 #if !defined __SUNPRO_C
186 # define SHORT_MSG_MAX (int) (SHORT_STRING_MAX_BYTES * (2 / FUZZY_THRESHOLD - 1))
187 #else
188 /* Sun C on Solaris 8 cannot compute this constant expression. */
189 # define SHORT_MSG_MAX 28
190 #endif
191
192 /* A fuzzy index contains a hash table mapping all n-grams to their
193 occurrences list. */
194 struct message_fuzzy_index_ty
195 {
196 message_ty **messages;
197 character_iterator_t iterator;
198 hash_table gram4;
199 size_t firstfew;
200 message_list_ty **short_messages;
201 };
202
203 /* Allocate a fuzzy index corresponding to a given list of messages.
204 The list of messages and the msgctxt and msgid fields of the messages
205 inside it must not be modified while the returned fuzzy index is in use. */
206 message_fuzzy_index_ty *
message_fuzzy_index_alloc(const message_list_ty * mlp,const char * canon_charset)207 message_fuzzy_index_alloc (const message_list_ty *mlp,
208 const char *canon_charset)
209 {
210 message_fuzzy_index_ty *findex = XMALLOC (message_fuzzy_index_ty);
211 size_t count = mlp->nitems;
212 size_t j;
213 size_t l;
214
215 findex->messages = mlp->item;
216 findex->iterator = po_charset_character_iterator (canon_charset);
217
218 /* Setup hash table. */
219 if (hash_init (&findex->gram4, 10 * count) < 0)
220 xalloc_die ();
221 for (j = 0; j < count; j++)
222 {
223 message_ty *mp = mlp->item[j];
224
225 if (mp->msgstr != NULL && mp->msgstr[0] != '\0')
226 {
227 const char *str = mp->msgid;
228
229 /* Let p0 < p1 < p2 < p3 < p4 walk through the string. */
230 const char *p0 = str;
231 if (*p0 != '\0')
232 {
233 const char *p1 = p0 + findex->iterator (p0);
234 if (*p1 != '\0')
235 {
236 const char *p2 = p1 + findex->iterator (p1);
237 if (*p2 != '\0')
238 {
239 const char *p3 = p2 + findex->iterator (p2);
240 if (*p3 != '\0')
241 {
242 const char *p4 = p3 + findex->iterator (p3);
243 for (;;)
244 {
245 /* The segment from p0 to p4 is a 4-gram of
246 characters. Add a hash table entry that maps
247 it to the index j, or extend the existing
248 hash table entry accordingly. */
249 void *found;
250
251 if (hash_find_entry (&findex->gram4, p0, p4 - p0,
252 &found) == 0)
253 {
254 index_list_ty list = (index_list_ty) found;
255 list = addlast_index (list, j);
256 if (list != NULL)
257 hash_set_value (&findex->gram4, p0, p4 - p0,
258 list);
259 }
260 else
261 hash_insert_entry (&findex->gram4, p0, p4 - p0,
262 new_index (j));
263
264 /* Advance. */
265 if (*p4 == '\0')
266 break;
267 p0 = p1;
268 p1 = p2;
269 p2 = p3;
270 p3 = p4;
271 p4 = p4 + findex->iterator (p4);
272 }
273 }
274 }
275 }
276 }
277 }
278 }
279
280 /* Shrink memory used by the hash table. */
281 {
282 void *iter;
283 const void *key;
284 size_t keylen;
285 void **valuep;
286
287 iter = NULL;
288 while (hash_iterate_modify (&findex->gram4, &iter, &key, &keylen, &valuep)
289 == 0)
290 {
291 index_list_ty list = (index_list_ty) *valuep;
292 index_ty length = list[IL_LENGTH];
293
294 if (length < list[IL_ALLOCATED])
295 {
296 list[IL_ALLOCATED] = length;
297 *valuep = xrealloc (list, (2 + length) * sizeof (index_ty));
298 }
299 }
300 }
301
302 findex->firstfew = (int) sqrt ((double) count);
303 if (findex->firstfew < 10)
304 findex->firstfew = 10;
305
306 /* Setup lists of short messages. */
307 findex->short_messages = XNMALLOC (SHORT_MSG_MAX + 1, message_list_ty *);
308 for (l = 0; l <= SHORT_MSG_MAX; l++)
309 findex->short_messages[l] = message_list_alloc (false);
310 for (j = 0; j < count; j++)
311 {
312 message_ty *mp = mlp->item[j];
313
314 if (mp->msgstr != NULL && mp->msgstr[0] != '\0')
315 {
316 const char *str = mp->msgid;
317 size_t len = strlen (str);
318
319 if (len <= SHORT_MSG_MAX)
320 message_list_append (findex->short_messages[len], mp);
321 }
322 }
323
324 /* Shrink memory used by the lists of short messages. */
325 for (l = 0; l <= SHORT_MSG_MAX; l++)
326 {
327 message_list_ty *mlp = findex->short_messages[l];
328
329 if (mlp->nitems < mlp->nitems_max)
330 {
331 mlp->nitems_max = mlp->nitems;
332 mlp->item =
333 (message_ty **)
334 xrealloc (mlp->item, mlp->nitems_max * sizeof (message_ty *));
335 }
336 }
337
338 return findex;
339 }
340
341 /* An index with multiplicity. */
342 struct mult_index
343 {
344 index_ty index;
345 unsigned int count;
346 };
347
348 /* A list of indices with multiplicity.
349 The indices are sorted in ascending order. */
350 struct mult_index_list
351 {
352 struct mult_index *item;
353 size_t nitems;
354 size_t nitems_max;
355 /* Work area. */
356 struct mult_index *item2;
357 size_t nitems2_max;
358 };
359
360 /* Initialize an empty list of indices with multiplicity. */
361 static inline void
mult_index_list_init(struct mult_index_list * accu)362 mult_index_list_init (struct mult_index_list *accu)
363 {
364 accu->nitems = 0;
365 accu->nitems_max = 0;
366 accu->item = NULL;
367 accu->nitems2_max = 0;
368 accu->item2 = NULL;
369 }
370
371 /* Add an index list to a list of indices with multiplicity. */
372 static inline void
mult_index_list_accumulate(struct mult_index_list * accu,index_list_ty list)373 mult_index_list_accumulate (struct mult_index_list *accu, index_list_ty list)
374 {
375 size_t len1 = accu->nitems;
376 size_t len2 = list[IL_LENGTH];
377 size_t need = len1 + len2;
378 struct mult_index *ptr1;
379 struct mult_index *ptr1_end;
380 index_ty *ptr2;
381 index_ty *ptr2_end;
382 struct mult_index *destptr;
383
384 /* Make the work area large enough. */
385 if (accu->nitems2_max < need)
386 {
387 size_t new_max = 2 * accu->nitems2_max + 1;
388
389 if (new_max < need)
390 new_max = need;
391 if (accu->item2 != NULL)
392 free (accu->item2);
393 accu->item2 = XNMALLOC (new_max, struct mult_index);
394 accu->nitems2_max = new_max;
395 }
396
397 /* Make a linear pass through accu and list simultaneously. */
398 ptr1 = accu->item;
399 ptr1_end = ptr1 + len1;
400 ptr2 = list + 2;
401 ptr2_end = ptr2 + len2;
402 destptr = accu->item2;
403 while (ptr1 < ptr1_end && ptr2 < ptr2_end)
404 {
405 if (ptr1->index < *ptr2)
406 {
407 *destptr = *ptr1;
408 ptr1++;
409 }
410 else if (ptr1->index > *ptr2)
411 {
412 destptr->index = *ptr2;
413 destptr->count = 1;
414 ptr2++;
415 }
416 else /* ptr1->index == list[2 + i2] */
417 {
418 destptr->index = ptr1->index;
419 destptr->count = ptr1->count + 1;
420 ptr1++;
421 ptr2++;
422 }
423 destptr++;
424 }
425 while (ptr1 < ptr1_end)
426 {
427 *destptr = *ptr1;
428 ptr1++;
429 destptr++;
430 }
431 while (ptr2 < ptr2_end)
432 {
433 destptr->index = *ptr2;
434 destptr->count = 1;
435 ptr2++;
436 destptr++;
437 }
438
439 /* Swap accu->item and accu->item2. */
440 {
441 struct mult_index *dest = accu->item2;
442 size_t dest_max = accu->nitems2_max;
443
444 accu->item2 = accu->item;
445 accu->nitems2_max = accu->nitems_max;
446
447 accu->item = dest;
448 accu->nitems = destptr - dest;
449 accu->nitems_max = dest_max;
450 }
451 }
452
453 /* Compares two indices with multiplicity, according to their multiplicity. */
454 static int
mult_index_compare(const void * p1,const void * p2)455 mult_index_compare (const void *p1, const void *p2)
456 {
457 const struct mult_index *ptr1 = (const struct mult_index *) p1;
458 const struct mult_index *ptr2 = (const struct mult_index *) p2;
459
460 if (ptr1->count < ptr2->count)
461 return 1;
462 if (ptr1->count > ptr2->count)
463 return -1;
464 /* For reproduceable results, also take into account the indices. */
465 if (ptr1->index > ptr2->index)
466 return 1;
467 if (ptr1->index < ptr2->index)
468 return -1;
469 return 0;
470 }
471
472 /* Sort a list of indices with multiplicity according to decreasing
473 multiplicity. */
474 static inline void
mult_index_list_sort(struct mult_index_list * accu)475 mult_index_list_sort (struct mult_index_list *accu)
476 {
477 if (accu->nitems > 1)
478 qsort (accu->item, accu->nitems, sizeof (struct mult_index),
479 mult_index_compare);
480 }
481
482 /* Frees a list of indices with multiplicity. */
483 static inline void
mult_index_list_free(struct mult_index_list * accu)484 mult_index_list_free (struct mult_index_list *accu)
485 {
486 if (accu->item != NULL)
487 free (accu->item);
488 if (accu->item2 != NULL)
489 free (accu->item2);
490 }
491
492 /* Find a good match for the given msgctxt and msgid in the given fuzzy index.
493 The match does not need to be optimal.
494 Ignore matches for which the fuzzy_search_goal_function is < LOWER_BOUND.
495 LOWER_BOUND must be >= FUZZY_THRESHOLD.
496 If HEURISTIC is true, only the few best messages among the list - according
497 to a certain heuristic - are considered. If HEURISTIC is false, all
498 messages with a fuzzy_search_goal_function > FUZZY_THRESHOLD are considered,
499 like in message_list_search_fuzzy (except that in ambiguous cases where
500 several best matches exist, message_list_search_fuzzy chooses the one with
501 the smallest index whereas message_fuzzy_index_search makes a better
502 choice). */
503 message_ty *
message_fuzzy_index_search(message_fuzzy_index_ty * findex,const char * msgctxt,const char * msgid,double lower_bound,bool heuristic)504 message_fuzzy_index_search (message_fuzzy_index_ty *findex,
505 const char *msgctxt, const char *msgid,
506 double lower_bound,
507 bool heuristic)
508 {
509 const char *str = msgid;
510
511 /* Let p0 < p1 < p2 < p3 < p4 walk through the string. */
512 const char *p0 = str;
513 if (*p0 != '\0')
514 {
515 const char *p1 = p0 + findex->iterator (p0);
516 if (*p1 != '\0')
517 {
518 const char *p2 = p1 + findex->iterator (p1);
519 if (*p2 != '\0')
520 {
521 const char *p3 = p2 + findex->iterator (p2);
522 if (*p3 != '\0')
523 {
524 const char *p4 = p3 + findex->iterator (p3);
525 struct mult_index_list accu;
526
527 mult_index_list_init (&accu);
528 for (;;)
529 {
530 /* The segment from p0 to p4 is a 4-gram of
531 characters. Get the hash table entry containing
532 a list of indices, and add it to the accu. */
533 void *found;
534
535 if (hash_find_entry (&findex->gram4, p0, p4 - p0,
536 &found) == 0)
537 {
538 index_list_ty list = (index_list_ty) found;
539 mult_index_list_accumulate (&accu, list);
540 }
541
542 /* Advance. */
543 if (*p4 == '\0')
544 break;
545 p0 = p1;
546 p1 = p2;
547 p2 = p3;
548 p3 = p4;
549 p4 = p4 + findex->iterator (p4);
550 }
551
552 /* Sort in decreasing count order. */
553 mult_index_list_sort (&accu);
554
555 /* Iterate over this sorted list, and maximize the
556 fuzzy_search_goal_function() result.
557 If HEURISTIC is true, take only the first few messages.
558 If HEURISTIC is false, consider all messages - to match
559 the behaviour of message_list_search_fuzzy -, but process
560 them in the order of the sorted list. This increases
561 the chances that the later calls to fstrcmp_bounded() (via
562 fuzzy_search_goal_function()) terminate quickly, thanks
563 to the best_weight which will be quite high already after
564 the first few messages. */
565 {
566 size_t count;
567 struct mult_index *ptr;
568 message_ty *best_mp;
569 double best_weight;
570
571 count = accu.nitems;
572 if (heuristic)
573 {
574 if (count > findex->firstfew)
575 count = findex->firstfew;
576 }
577
578 best_weight = lower_bound;
579 best_mp = NULL;
580 for (ptr = accu.item; count > 0; ptr++, count--)
581 {
582 message_ty *mp = findex->messages[ptr->index];
583 double weight =
584 fuzzy_search_goal_function (mp, msgctxt, msgid,
585 best_weight);
586
587 if (weight > best_weight)
588 {
589 best_weight = weight;
590 best_mp = mp;
591 }
592 }
593
594 mult_index_list_free (&accu);
595
596 return best_mp;
597 }
598 }
599 }
600 }
601 }
602
603 /* The string had less than 4 characters. */
604 {
605 size_t l = strlen (str);
606 size_t lmin, lmax;
607 message_ty *best_mp;
608 double best_weight;
609
610 if (!(l <= SHORT_STRING_MAX_BYTES))
611 abort ();
612
613 /* Walk through those short messages which have an appropriate length.
614 See the comment before SHORT_MSG_MAX. */
615 lmin = (int) ceil (l / (2 / FUZZY_THRESHOLD - 1));
616 lmax = (int) (l * (2 / FUZZY_THRESHOLD - 1));
617 if (!(lmax <= SHORT_MSG_MAX))
618 abort ();
619
620 best_weight = lower_bound;
621 best_mp = NULL;
622 for (l = lmin; l <= lmax; l++)
623 {
624 message_list_ty *mlp = findex->short_messages[l];
625 size_t j;
626
627 for (j = 0; j < mlp->nitems; j++)
628 {
629 message_ty *mp = mlp->item[j];
630 double weight =
631 fuzzy_search_goal_function (mp, msgctxt, msgid, best_weight);
632
633 if (weight > best_weight)
634 {
635 best_weight = weight;
636 best_mp = mp;
637 }
638 }
639 }
640
641 return best_mp;
642 }
643 }
644
645 /* Free a fuzzy index. */
646 void
message_fuzzy_index_free(message_fuzzy_index_ty * findex)647 message_fuzzy_index_free (message_fuzzy_index_ty *findex)
648 {
649 size_t l;
650 void *iter;
651 const void *key;
652 size_t keylen;
653 void *data;
654
655 /* Free the short lists. */
656 for (l = 0; l <= SHORT_MSG_MAX; l++)
657 message_list_free (findex->short_messages[l], 1);
658 free (findex->short_messages);
659
660 /* Free the index lists occurring as values in the hash tables. */
661 iter = NULL;
662 while (hash_iterate (&findex->gram4, &iter, &key, &keylen, &data) == 0)
663 free ((index_list_ty *) data);
664 /* Free the hash table itself. */
665 hash_destroy (&findex->gram4);
666
667 free (findex);
668 }
669