1 /* dzl-fuzzy-index-cursor.c
2  *
3  * Copyright (C) 2016 Christian Hergert <christian@hergert.me>
4  *
5  * This program is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #define G_LOG_DOMAIN "dzl-fuzzy-index-cursor"
20 
21 #include "config.h"
22 
23 #include <string.h>
24 
25 #include "search/dzl-fuzzy-index-cursor.h"
26 #include "search/dzl-fuzzy-index-match.h"
27 #include "search/dzl-fuzzy-index-private.h"
28 #include "util/dzl-macros.h"
29 #include "util/dzl-int-pair.h"
30 
31 struct _DzlFuzzyIndexCursor
32 {
33   GObject          object;
34 
35   DzlFuzzyIndex   *index;
36   gchar           *query;
37   GVariantDict    *tables;
38   GArray          *matches;
39   guint            max_matches;
40   guint            case_sensitive : 1;
41 };
42 
43 typedef struct
44 {
45   guint position;
46   guint lookaside_id;
47 } DzlFuzzyIndexItem;
48 
49 typedef struct
50 {
51   const gchar *key;
52   guint        document_id;
53   gfloat       score;
54   guint        priority;
55 } DzlFuzzyMatch;
56 
57 typedef struct
58 {
59   DzlFuzzyIndex                   *index;
60   const DzlFuzzyIndexItem * const *tables;
61   const gsize                     *tables_n_elements;
62   gint                            *tables_state;
63   guint                            n_tables;
64   guint                            max_matches;
65   const gchar                     *needle;
66   GHashTable                      *matches;
67 } DzlFuzzyLookup;
68 
69 enum {
70   PROP_0,
71   PROP_CASE_SENSITIVE,
72   PROP_INDEX,
73   PROP_TABLES,
74   PROP_MAX_MATCHES,
75   PROP_QUERY,
76   N_PROPS
77 };
78 
79 static void async_initable_iface_init (GAsyncInitableIface *iface);
80 static void list_model_iface_init     (GListModelInterface *iface);
81 
82 static GParamSpec *properties [N_PROPS];
83 
G_DEFINE_TYPE_WITH_CODE(DzlFuzzyIndexCursor,dzl_fuzzy_index_cursor,G_TYPE_OBJECT,G_IMPLEMENT_INTERFACE (G_TYPE_ASYNC_INITABLE,async_initable_iface_init)G_IMPLEMENT_INTERFACE (G_TYPE_LIST_MODEL,list_model_iface_init))84 G_DEFINE_TYPE_WITH_CODE (DzlFuzzyIndexCursor, dzl_fuzzy_index_cursor, G_TYPE_OBJECT,
85                          G_IMPLEMENT_INTERFACE (G_TYPE_ASYNC_INITABLE, async_initable_iface_init)
86                          G_IMPLEMENT_INTERFACE (G_TYPE_LIST_MODEL, list_model_iface_init))
87 
88 static inline gfloat
89 pointer_to_float (gpointer ptr)
90 {
91   union {
92     gpointer ptr;
93 #if GLIB_SIZEOF_VOID_P == 8
94     gdouble fval;
95 #else
96     gfloat fval;
97 #endif
98   } convert;
99   convert.ptr = ptr;
100   return (gfloat)convert.fval;
101 }
102 
103 static inline gpointer
float_to_pointer(gfloat fval)104 float_to_pointer (gfloat fval)
105 {
106   union {
107     gpointer ptr;
108 #if GLIB_SIZEOF_VOID_P == 8
109     gdouble fval;
110 #else
111     gfloat fval;
112 #endif
113   } convert;
114   convert.fval = fval;
115   return convert.ptr;
116 }
117 
118 /* Not guaranteed, so assert to be sure */
119 G_STATIC_ASSERT (sizeof(gdouble) == 8);
120 G_STATIC_ASSERT (sizeof(gfloat) == 4);
121 
122 static void
dzl_fuzzy_index_cursor_finalize(GObject * object)123 dzl_fuzzy_index_cursor_finalize (GObject *object)
124 {
125   DzlFuzzyIndexCursor *self = (DzlFuzzyIndexCursor *)object;
126 
127   g_clear_object (&self->index);
128   g_clear_pointer (&self->query, g_free);
129   g_clear_pointer (&self->matches, g_array_unref);
130   g_clear_pointer (&self->tables, g_variant_dict_unref);
131 
132   G_OBJECT_CLASS (dzl_fuzzy_index_cursor_parent_class)->finalize (object);
133 }
134 
135 static void
dzl_fuzzy_index_cursor_get_property(GObject * object,guint prop_id,GValue * value,GParamSpec * pspec)136 dzl_fuzzy_index_cursor_get_property (GObject    *object,
137                                  guint       prop_id,
138                                  GValue     *value,
139                                  GParamSpec *pspec)
140 {
141   DzlFuzzyIndexCursor *self = DZL_FUZZY_INDEX_CURSOR(object);
142 
143   switch (prop_id)
144     {
145     case PROP_CASE_SENSITIVE:
146       g_value_set_boolean (value, self->case_sensitive);
147       break;
148 
149     case PROP_INDEX:
150       g_value_set_object (value, self->index);
151       break;
152 
153     case PROP_MAX_MATCHES:
154       g_value_set_uint (value, self->max_matches);
155       break;
156 
157     case PROP_QUERY:
158       g_value_set_string (value, self->query);
159       break;
160 
161     default:
162       G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
163     }
164 }
165 
166 static void
dzl_fuzzy_index_cursor_set_property(GObject * object,guint prop_id,const GValue * value,GParamSpec * pspec)167 dzl_fuzzy_index_cursor_set_property (GObject      *object,
168                                  guint         prop_id,
169                                  const GValue *value,
170                                  GParamSpec   *pspec)
171 {
172   DzlFuzzyIndexCursor *self = DZL_FUZZY_INDEX_CURSOR(object);
173 
174   switch (prop_id)
175     {
176     case PROP_CASE_SENSITIVE:
177       self->case_sensitive = g_value_get_boolean (value);
178       break;
179 
180     case PROP_INDEX:
181       self->index = g_value_dup_object (value);
182       break;
183 
184     case PROP_TABLES:
185       self->tables = g_value_dup_boxed (value);
186       break;
187 
188     case PROP_MAX_MATCHES:
189       self->max_matches = g_value_get_uint (value);
190       break;
191 
192     case PROP_QUERY:
193       self->query = g_value_dup_string (value);
194       break;
195 
196     default:
197       G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
198     }
199 }
200 
201 static void
dzl_fuzzy_index_cursor_class_init(DzlFuzzyIndexCursorClass * klass)202 dzl_fuzzy_index_cursor_class_init (DzlFuzzyIndexCursorClass *klass)
203 {
204   GObjectClass *object_class = G_OBJECT_CLASS (klass);
205 
206   object_class->finalize = dzl_fuzzy_index_cursor_finalize;
207   object_class->get_property = dzl_fuzzy_index_cursor_get_property;
208   object_class->set_property = dzl_fuzzy_index_cursor_set_property;
209 
210   properties [PROP_CASE_SENSITIVE] =
211     g_param_spec_boolean ("case-sensitive",
212                           "Case Sensitive",
213                           "Case Sensitive",
214                           FALSE,
215                           (G_PARAM_READWRITE | G_PARAM_CONSTRUCT_ONLY | G_PARAM_STATIC_STRINGS));
216 
217   properties [PROP_INDEX] =
218     g_param_spec_object ("index",
219                          "Index",
220                          "The index this cursor is iterating",
221                          DZL_TYPE_FUZZY_INDEX,
222                          (G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY | G_PARAM_STATIC_STRINGS));
223 
224   properties [PROP_TABLES] =
225     g_param_spec_boxed ("tables",
226                         "Tables",
227                         "The dictionary of character indexes",
228                         G_TYPE_VARIANT_DICT,
229                         (G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY | G_PARAM_STATIC_STRINGS));
230 
231   properties [PROP_QUERY] =
232     g_param_spec_string ("query",
233                          "Query",
234                          "The query for the index",
235                          NULL,
236                          (G_PARAM_READWRITE | G_PARAM_CONSTRUCT_ONLY | G_PARAM_STATIC_STRINGS));
237 
238   properties [PROP_MAX_MATCHES] =
239     g_param_spec_uint ("max-matches",
240                        "Max Matches",
241                        "The max number of matches to display",
242                        0,
243                        G_MAXUINT,
244                        0,
245                        (G_PARAM_READWRITE | G_PARAM_CONSTRUCT_ONLY | G_PARAM_STATIC_STRINGS));
246 
247   g_object_class_install_properties (object_class, N_PROPS, properties);
248 }
249 
250 static void
dzl_fuzzy_index_cursor_init(DzlFuzzyIndexCursor * self)251 dzl_fuzzy_index_cursor_init (DzlFuzzyIndexCursor *self)
252 {
253   self->matches = g_array_new (FALSE, FALSE, sizeof (DzlFuzzyMatch));
254 }
255 
256 static gint
fuzzy_match_compare(gconstpointer a,gconstpointer b)257 fuzzy_match_compare (gconstpointer a,
258                      gconstpointer b)
259 {
260   const DzlFuzzyMatch *ma = a;
261   const DzlFuzzyMatch *mb = b;
262 
263   if (ma->score < mb->score)
264     return 1;
265   else if (ma->score > mb->score)
266     return -1;
267 
268   return strcmp (ma->key, mb->key);
269 }
270 
271 static gboolean
fuzzy_do_match(const DzlFuzzyLookup * lookup,const DzlFuzzyIndexItem * item,guint table_index,gint score)272 fuzzy_do_match (const DzlFuzzyLookup    *lookup,
273                 const DzlFuzzyIndexItem *item,
274                 guint                    table_index,
275                 gint                     score)
276 {
277   const DzlFuzzyIndexItem *table;
278   const DzlFuzzyIndexItem *iter;
279   gssize n_elements;
280   gint *state;
281   gint iter_score;
282 
283   g_assert (lookup != NULL);
284   g_assert (item != NULL);
285   g_assert (score >= 0);
286 
287   table = lookup->tables [table_index];
288   n_elements = (gssize)lookup->tables_n_elements [table_index];
289   state = &lookup->tables_state [table_index];
290 
291   for (; state [0] < n_elements; state [0]++)
292     {
293       DzlIntPair *lookup_pair;
294       gboolean contains_document;
295 
296       iter = &table [state [0]];
297 
298       if ((iter->lookaside_id < item->lookaside_id) ||
299           ((iter->lookaside_id == item->lookaside_id) &&
300            (iter->position <= item->position)))
301         continue;
302       else if (iter->lookaside_id > item->lookaside_id)
303         break;
304 
305       iter_score = score + (iter->position - item->position);
306 
307       if (table_index + 1 < lookup->n_tables)
308         {
309           if (fuzzy_do_match (lookup, iter, table_index + 1, iter_score))
310             return TRUE;
311           continue;
312         }
313 
314       contains_document = g_hash_table_lookup_extended (lookup->matches,
315                                                         GUINT_TO_POINTER (item->lookaside_id),
316                                                         NULL,
317                                                         (gpointer *)&lookup_pair);
318 
319       if (!contains_document || iter_score < dzl_int_pair_first (lookup_pair))
320         g_hash_table_insert (lookup->matches,
321                              GUINT_TO_POINTER (item->lookaside_id),
322                              dzl_int_pair_new (iter_score, iter->position));
323 
324       return TRUE;
325     }
326 
327   return FALSE;
328 }
329 
330 static void
dzl_fuzzy_index_cursor_worker(GTask * task,gpointer source_object,gpointer task_data,GCancellable * cancellable)331 dzl_fuzzy_index_cursor_worker (GTask        *task,
332                                gpointer      source_object,
333                                gpointer      task_data,
334                                GCancellable *cancellable)
335 {
336   DzlFuzzyIndexCursor *self = source_object;
337   g_autoptr(GHashTable) matches = NULL;
338   g_autoptr(GHashTable) by_document = NULL;
339   g_autoptr(GPtrArray) tables = NULL;
340   g_autoptr(GArray) tables_n_elements = NULL;
341   g_autofree gint *tables_state = NULL;
342   g_autofree gchar *freeme = NULL;
343   const gchar *query;
344   DzlFuzzyLookup lookup = { 0 };
345   GHashTableIter iter;
346   const gchar *str;
347   gpointer key, value;
348   guint i;
349 
350   g_assert (DZL_IS_FUZZY_INDEX_CURSOR (self));
351   g_assert (G_IS_TASK (task));
352 
353   if (g_task_return_error_if_cancelled (task))
354     return;
355 
356   /* No matches with empty query */
357   if (self->query == NULL || *self->query == '\0')
358     goto cleanup;
359 
360   /* If we are not case-sensitive, we need to downcase the query string */
361   query = self->query;
362   if (!self->case_sensitive)
363     query = freeme = g_utf8_casefold (query, -1);
364 
365   tables = g_ptr_array_new ();
366   tables_n_elements = g_array_new (FALSE, FALSE, sizeof (gsize));
367   matches = g_hash_table_new_full (NULL, NULL, NULL, (GDestroyNotify)dzl_int_pair_free);
368 
369   for (str = query; *str; str = g_utf8_next_char (str))
370     {
371       gunichar ch = g_utf8_get_char (str);
372       g_autoptr(GVariant) table = NULL;
373       gconstpointer fixed;
374       gsize n_elements;
375       gchar char_key[8];
376 
377       if (g_unichar_isspace (ch))
378         continue;
379 
380       char_key [g_unichar_to_utf8 (ch, char_key)] = '\0';
381       table = g_variant_dict_lookup_value (self->tables,
382                                            char_key,
383                                            (const GVariantType *)"a(uu)");
384 
385       /* No possible matches, missing table for character */
386       if (table == NULL)
387         goto cleanup;
388 
389       fixed = g_variant_get_fixed_array (table, &n_elements, sizeof (DzlFuzzyIndexItem));
390       g_array_append_val (tables_n_elements, n_elements);
391       g_ptr_array_add (tables, (gpointer)fixed);
392     }
393 
394   if (tables->len == 0)
395     goto cleanup;
396 
397   g_assert (tables->len > 0);
398   g_assert (tables->len == tables_n_elements->len);
399 
400   tables_state = g_new0 (gint, tables->len);
401 
402   lookup.index = self->index;
403   lookup.matches = matches;
404   lookup.tables = (const DzlFuzzyIndexItem * const *)tables->pdata;
405   lookup.tables_n_elements = (const gsize *)(gpointer)tables_n_elements->data;
406   lookup.tables_state = tables_state;
407   lookup.n_tables = tables->len;
408   lookup.needle = query;
409   lookup.max_matches = self->max_matches;
410 
411   if G_LIKELY (lookup.n_tables > 1)
412     {
413       for (i = 0; i < lookup.tables_n_elements[0]; i++)
414         {
415           const DzlFuzzyIndexItem *item = &lookup.tables[0][i];
416 
417           fuzzy_do_match (&lookup, item, 1, MIN (16, item->position * 2));
418         }
419     }
420   else
421     {
422       guint last_id = G_MAXUINT;
423 
424       for (i = 0; i < lookup.tables_n_elements[0]; i++)
425         {
426           const DzlFuzzyIndexItem *item = &lookup.tables[0][i];
427           DzlFuzzyMatch match;
428 
429           if (item->lookaside_id != last_id)
430             {
431               last_id = item->lookaside_id;
432 
433               if G_UNLIKELY (!_dzl_fuzzy_index_resolve (self->index,
434                                                         item->lookaside_id,
435                                                         &match.document_id,
436                                                         &match.key,
437                                                         &match.priority,
438                                                         item->position,
439                                                         item->position,
440                                                         &match.score))
441                 continue;
442 
443               g_array_append_val (self->matches, match);
444             }
445         }
446 
447       goto cleanup;
448     }
449 
450   if (g_task_return_error_if_cancelled (task))
451     return;
452 
453   by_document = g_hash_table_new (NULL, NULL);
454 
455   g_hash_table_iter_init (&iter, matches);
456 
457   while (g_hash_table_iter_next (&iter, &key, &value))
458     {
459       DzlIntPair *pair = value;
460       guint score = dzl_int_pair_first (pair);
461       guint last_offset = dzl_int_pair_second (pair);
462       gpointer other_score;
463       DzlFuzzyMatch match = {0};
464       guint lookaside_id = GPOINTER_TO_UINT (key);
465 
466       if G_UNLIKELY (!_dzl_fuzzy_index_resolve (self->index,
467                                                 lookaside_id,
468                                                 &match.document_id,
469                                                 &match.key,
470                                                 &match.priority,
471                                                 score,
472                                                 last_offset,
473                                                 &match.score))
474         continue;
475 
476       if (g_hash_table_lookup_extended (by_document,
477                                         GUINT_TO_POINTER (match.document_id),
478                                         NULL,
479                                         &other_score) &&
480           match.score <= pointer_to_float (other_score))
481         continue;
482 
483       g_hash_table_insert (by_document,
484                            GUINT_TO_POINTER (match.document_id),
485                            float_to_pointer (match.score));
486 
487       g_array_append_val (self->matches, match);
488     }
489 
490   /*
491    * Because we have to do the deduplication of documents after
492    * searching all the potential matches, we could have duplicates
493    * in the array. So we need to filter them out. Not that big
494    * of a deal, since we can do remove_fast on the index and
495    * we have to sort them afterwards anyway. Potentially, we could
496    * do both at once if we felt like doing our own sort.
497    */
498   for (i = 0; i < self->matches->len; i++)
499     {
500       DzlFuzzyMatch *match;
501       gpointer other_score;
502 
503     next:
504       match = &g_array_index (self->matches, DzlFuzzyMatch, i);
505       other_score = g_hash_table_lookup (by_document, GUINT_TO_POINTER (match->document_id));
506 
507       /*
508        * This item should have been discarded, but we didn't have enough
509        * information at the time we built the array.
510        */
511       if (pointer_to_float (other_score) < match->score)
512         {
513           g_array_remove_index_fast (self->matches, i);
514           if (i < self->matches->len - 1)
515             goto next;
516         }
517     }
518 
519   if (g_task_return_error_if_cancelled (task))
520     return;
521 
522 cleanup:
523   if (self->matches != NULL)
524     {
525       g_array_sort (self->matches, fuzzy_match_compare);
526       if (lookup.max_matches > 0 && lookup.max_matches < self->matches->len)
527         g_array_set_size (self->matches, lookup.max_matches);
528     }
529 
530   g_task_return_boolean (task, TRUE);
531 }
532 
533 static void
dzl_fuzzy_index_cursor_init_async(GAsyncInitable * initable,gint io_priority,GCancellable * cancellable,GAsyncReadyCallback callback,gpointer user_data)534 dzl_fuzzy_index_cursor_init_async (GAsyncInitable      *initable,
535                                    gint                 io_priority,
536                                    GCancellable        *cancellable,
537                                    GAsyncReadyCallback  callback,
538                                    gpointer             user_data)
539 {
540   DzlFuzzyIndexCursor *self = (DzlFuzzyIndexCursor *)initable;
541   g_autoptr(GTask) task = NULL;
542 
543   g_assert (DZL_IS_FUZZY_INDEX_CURSOR (self));
544   g_assert (!cancellable || G_IS_CANCELLABLE (cancellable));
545 
546   task = g_task_new (self, cancellable, callback, user_data);
547   g_task_set_source_tag (task, dzl_fuzzy_index_cursor_init_async);
548   g_task_set_priority (task, io_priority);
549   g_task_set_check_cancellable (task, FALSE);
550   g_task_run_in_thread (task, dzl_fuzzy_index_cursor_worker);
551 }
552 
553 static gboolean
dzl_fuzzy_index_cursor_init_finish(GAsyncInitable * initiable,GAsyncResult * result,GError ** error)554 dzl_fuzzy_index_cursor_init_finish (GAsyncInitable  *initiable,
555                                     GAsyncResult    *result,
556                                     GError         **error)
557 {
558   g_assert (DZL_IS_FUZZY_INDEX_CURSOR (initiable));
559   g_assert (G_IS_TASK (result));
560 
561   return g_task_propagate_boolean (G_TASK (result), error);
562 }
563 
564 static void
async_initable_iface_init(GAsyncInitableIface * iface)565 async_initable_iface_init (GAsyncInitableIface *iface)
566 {
567   iface->init_async = dzl_fuzzy_index_cursor_init_async;
568   iface->init_finish = dzl_fuzzy_index_cursor_init_finish;
569 }
570 
571 static GType
dzl_fuzzy_index_cursor_get_item_type(GListModel * model)572 dzl_fuzzy_index_cursor_get_item_type (GListModel *model)
573 {
574   return DZL_TYPE_FUZZY_INDEX_MATCH;
575 }
576 
577 static guint
dzl_fuzzy_index_cursor_get_n_items(GListModel * model)578 dzl_fuzzy_index_cursor_get_n_items (GListModel *model)
579 {
580   DzlFuzzyIndexCursor *self = (DzlFuzzyIndexCursor *)model;
581 
582   g_assert (DZL_IS_FUZZY_INDEX_CURSOR (self));
583 
584   return self->matches->len;
585 }
586 
587 static gpointer
dzl_fuzzy_index_cursor_get_item(GListModel * model,guint position)588 dzl_fuzzy_index_cursor_get_item (GListModel *model,
589                                  guint       position)
590 {
591   DzlFuzzyIndexCursor *self = (DzlFuzzyIndexCursor *)model;
592   g_autoptr(GVariant) document = NULL;
593   DzlFuzzyMatch *match;
594 
595   g_assert (DZL_IS_FUZZY_INDEX_CURSOR (self));
596   g_assert (position < self->matches->len);
597 
598   match = &g_array_index (self->matches, DzlFuzzyMatch, position);
599 
600   document = _dzl_fuzzy_index_lookup_document (self->index, match->document_id);
601 
602   return g_object_new (DZL_TYPE_FUZZY_INDEX_MATCH,
603                        "document", document,
604                        "key", match->key,
605                        "score", match->score,
606                        "priority", match->priority,
607                        NULL);
608 }
609 
610 static void
list_model_iface_init(GListModelInterface * iface)611 list_model_iface_init (GListModelInterface *iface)
612 {
613   iface->get_item_type = dzl_fuzzy_index_cursor_get_item_type;
614   iface->get_n_items = dzl_fuzzy_index_cursor_get_n_items;
615   iface->get_item = dzl_fuzzy_index_cursor_get_item;
616 }
617 
618 /**
619  * dzl_fuzzy_index_cursor_get_index:
620  * @self: A #DzlFuzzyIndexCursor
621  *
622  * Gets the index the cursor is iterating.
623  *
624  * Returns: (transfer none): A #DzlFuzzyIndex.
625  */
626 DzlFuzzyIndex *
dzl_fuzzy_index_cursor_get_index(DzlFuzzyIndexCursor * self)627 dzl_fuzzy_index_cursor_get_index (DzlFuzzyIndexCursor *self)
628 {
629   g_return_val_if_fail (DZL_IS_FUZZY_INDEX_CURSOR (self), NULL);
630 
631   return self->index;
632 }
633