1 /* dzl-fuzzy-index-cursor.c
2 *
3 * Copyright (C) 2016 Christian Hergert <christian@hergert.me>
4 *
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19 #define G_LOG_DOMAIN "dzl-fuzzy-index-cursor"
20
21 #include "config.h"
22
23 #include <string.h>
24
25 #include "search/dzl-fuzzy-index-cursor.h"
26 #include "search/dzl-fuzzy-index-match.h"
27 #include "search/dzl-fuzzy-index-private.h"
28 #include "util/dzl-macros.h"
29 #include "util/dzl-int-pair.h"
30
31 struct _DzlFuzzyIndexCursor
32 {
33 GObject object;
34
35 DzlFuzzyIndex *index;
36 gchar *query;
37 GVariantDict *tables;
38 GArray *matches;
39 guint max_matches;
40 guint case_sensitive : 1;
41 };
42
43 typedef struct
44 {
45 guint position;
46 guint lookaside_id;
47 } DzlFuzzyIndexItem;
48
49 typedef struct
50 {
51 const gchar *key;
52 guint document_id;
53 gfloat score;
54 guint priority;
55 } DzlFuzzyMatch;
56
57 typedef struct
58 {
59 DzlFuzzyIndex *index;
60 const DzlFuzzyIndexItem * const *tables;
61 const gsize *tables_n_elements;
62 gint *tables_state;
63 guint n_tables;
64 guint max_matches;
65 const gchar *needle;
66 GHashTable *matches;
67 } DzlFuzzyLookup;
68
69 enum {
70 PROP_0,
71 PROP_CASE_SENSITIVE,
72 PROP_INDEX,
73 PROP_TABLES,
74 PROP_MAX_MATCHES,
75 PROP_QUERY,
76 N_PROPS
77 };
78
79 static void async_initable_iface_init (GAsyncInitableIface *iface);
80 static void list_model_iface_init (GListModelInterface *iface);
81
82 static GParamSpec *properties [N_PROPS];
83
G_DEFINE_TYPE_WITH_CODE(DzlFuzzyIndexCursor,dzl_fuzzy_index_cursor,G_TYPE_OBJECT,G_IMPLEMENT_INTERFACE (G_TYPE_ASYNC_INITABLE,async_initable_iface_init)G_IMPLEMENT_INTERFACE (G_TYPE_LIST_MODEL,list_model_iface_init))84 G_DEFINE_TYPE_WITH_CODE (DzlFuzzyIndexCursor, dzl_fuzzy_index_cursor, G_TYPE_OBJECT,
85 G_IMPLEMENT_INTERFACE (G_TYPE_ASYNC_INITABLE, async_initable_iface_init)
86 G_IMPLEMENT_INTERFACE (G_TYPE_LIST_MODEL, list_model_iface_init))
87
88 static inline gfloat
89 pointer_to_float (gpointer ptr)
90 {
91 union {
92 gpointer ptr;
93 #if GLIB_SIZEOF_VOID_P == 8
94 gdouble fval;
95 #else
96 gfloat fval;
97 #endif
98 } convert;
99 convert.ptr = ptr;
100 return (gfloat)convert.fval;
101 }
102
103 static inline gpointer
float_to_pointer(gfloat fval)104 float_to_pointer (gfloat fval)
105 {
106 union {
107 gpointer ptr;
108 #if GLIB_SIZEOF_VOID_P == 8
109 gdouble fval;
110 #else
111 gfloat fval;
112 #endif
113 } convert;
114 convert.fval = fval;
115 return convert.ptr;
116 }
117
118 /* Not guaranteed, so assert to be sure */
119 G_STATIC_ASSERT (sizeof(gdouble) == 8);
120 G_STATIC_ASSERT (sizeof(gfloat) == 4);
121
122 static void
dzl_fuzzy_index_cursor_finalize(GObject * object)123 dzl_fuzzy_index_cursor_finalize (GObject *object)
124 {
125 DzlFuzzyIndexCursor *self = (DzlFuzzyIndexCursor *)object;
126
127 g_clear_object (&self->index);
128 g_clear_pointer (&self->query, g_free);
129 g_clear_pointer (&self->matches, g_array_unref);
130 g_clear_pointer (&self->tables, g_variant_dict_unref);
131
132 G_OBJECT_CLASS (dzl_fuzzy_index_cursor_parent_class)->finalize (object);
133 }
134
135 static void
dzl_fuzzy_index_cursor_get_property(GObject * object,guint prop_id,GValue * value,GParamSpec * pspec)136 dzl_fuzzy_index_cursor_get_property (GObject *object,
137 guint prop_id,
138 GValue *value,
139 GParamSpec *pspec)
140 {
141 DzlFuzzyIndexCursor *self = DZL_FUZZY_INDEX_CURSOR(object);
142
143 switch (prop_id)
144 {
145 case PROP_CASE_SENSITIVE:
146 g_value_set_boolean (value, self->case_sensitive);
147 break;
148
149 case PROP_INDEX:
150 g_value_set_object (value, self->index);
151 break;
152
153 case PROP_MAX_MATCHES:
154 g_value_set_uint (value, self->max_matches);
155 break;
156
157 case PROP_QUERY:
158 g_value_set_string (value, self->query);
159 break;
160
161 default:
162 G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
163 }
164 }
165
166 static void
dzl_fuzzy_index_cursor_set_property(GObject * object,guint prop_id,const GValue * value,GParamSpec * pspec)167 dzl_fuzzy_index_cursor_set_property (GObject *object,
168 guint prop_id,
169 const GValue *value,
170 GParamSpec *pspec)
171 {
172 DzlFuzzyIndexCursor *self = DZL_FUZZY_INDEX_CURSOR(object);
173
174 switch (prop_id)
175 {
176 case PROP_CASE_SENSITIVE:
177 self->case_sensitive = g_value_get_boolean (value);
178 break;
179
180 case PROP_INDEX:
181 self->index = g_value_dup_object (value);
182 break;
183
184 case PROP_TABLES:
185 self->tables = g_value_dup_boxed (value);
186 break;
187
188 case PROP_MAX_MATCHES:
189 self->max_matches = g_value_get_uint (value);
190 break;
191
192 case PROP_QUERY:
193 self->query = g_value_dup_string (value);
194 break;
195
196 default:
197 G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
198 }
199 }
200
201 static void
dzl_fuzzy_index_cursor_class_init(DzlFuzzyIndexCursorClass * klass)202 dzl_fuzzy_index_cursor_class_init (DzlFuzzyIndexCursorClass *klass)
203 {
204 GObjectClass *object_class = G_OBJECT_CLASS (klass);
205
206 object_class->finalize = dzl_fuzzy_index_cursor_finalize;
207 object_class->get_property = dzl_fuzzy_index_cursor_get_property;
208 object_class->set_property = dzl_fuzzy_index_cursor_set_property;
209
210 properties [PROP_CASE_SENSITIVE] =
211 g_param_spec_boolean ("case-sensitive",
212 "Case Sensitive",
213 "Case Sensitive",
214 FALSE,
215 (G_PARAM_READWRITE | G_PARAM_CONSTRUCT_ONLY | G_PARAM_STATIC_STRINGS));
216
217 properties [PROP_INDEX] =
218 g_param_spec_object ("index",
219 "Index",
220 "The index this cursor is iterating",
221 DZL_TYPE_FUZZY_INDEX,
222 (G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY | G_PARAM_STATIC_STRINGS));
223
224 properties [PROP_TABLES] =
225 g_param_spec_boxed ("tables",
226 "Tables",
227 "The dictionary of character indexes",
228 G_TYPE_VARIANT_DICT,
229 (G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY | G_PARAM_STATIC_STRINGS));
230
231 properties [PROP_QUERY] =
232 g_param_spec_string ("query",
233 "Query",
234 "The query for the index",
235 NULL,
236 (G_PARAM_READWRITE | G_PARAM_CONSTRUCT_ONLY | G_PARAM_STATIC_STRINGS));
237
238 properties [PROP_MAX_MATCHES] =
239 g_param_spec_uint ("max-matches",
240 "Max Matches",
241 "The max number of matches to display",
242 0,
243 G_MAXUINT,
244 0,
245 (G_PARAM_READWRITE | G_PARAM_CONSTRUCT_ONLY | G_PARAM_STATIC_STRINGS));
246
247 g_object_class_install_properties (object_class, N_PROPS, properties);
248 }
249
250 static void
dzl_fuzzy_index_cursor_init(DzlFuzzyIndexCursor * self)251 dzl_fuzzy_index_cursor_init (DzlFuzzyIndexCursor *self)
252 {
253 self->matches = g_array_new (FALSE, FALSE, sizeof (DzlFuzzyMatch));
254 }
255
256 static gint
fuzzy_match_compare(gconstpointer a,gconstpointer b)257 fuzzy_match_compare (gconstpointer a,
258 gconstpointer b)
259 {
260 const DzlFuzzyMatch *ma = a;
261 const DzlFuzzyMatch *mb = b;
262
263 if (ma->score < mb->score)
264 return 1;
265 else if (ma->score > mb->score)
266 return -1;
267
268 return strcmp (ma->key, mb->key);
269 }
270
271 static gboolean
fuzzy_do_match(const DzlFuzzyLookup * lookup,const DzlFuzzyIndexItem * item,guint table_index,gint score)272 fuzzy_do_match (const DzlFuzzyLookup *lookup,
273 const DzlFuzzyIndexItem *item,
274 guint table_index,
275 gint score)
276 {
277 const DzlFuzzyIndexItem *table;
278 const DzlFuzzyIndexItem *iter;
279 gssize n_elements;
280 gint *state;
281 gint iter_score;
282
283 g_assert (lookup != NULL);
284 g_assert (item != NULL);
285 g_assert (score >= 0);
286
287 table = lookup->tables [table_index];
288 n_elements = (gssize)lookup->tables_n_elements [table_index];
289 state = &lookup->tables_state [table_index];
290
291 for (; state [0] < n_elements; state [0]++)
292 {
293 DzlIntPair *lookup_pair;
294 gboolean contains_document;
295
296 iter = &table [state [0]];
297
298 if ((iter->lookaside_id < item->lookaside_id) ||
299 ((iter->lookaside_id == item->lookaside_id) &&
300 (iter->position <= item->position)))
301 continue;
302 else if (iter->lookaside_id > item->lookaside_id)
303 break;
304
305 iter_score = score + (iter->position - item->position);
306
307 if (table_index + 1 < lookup->n_tables)
308 {
309 if (fuzzy_do_match (lookup, iter, table_index + 1, iter_score))
310 return TRUE;
311 continue;
312 }
313
314 contains_document = g_hash_table_lookup_extended (lookup->matches,
315 GUINT_TO_POINTER (item->lookaside_id),
316 NULL,
317 (gpointer *)&lookup_pair);
318
319 if (!contains_document || iter_score < dzl_int_pair_first (lookup_pair))
320 g_hash_table_insert (lookup->matches,
321 GUINT_TO_POINTER (item->lookaside_id),
322 dzl_int_pair_new (iter_score, iter->position));
323
324 return TRUE;
325 }
326
327 return FALSE;
328 }
329
330 static void
dzl_fuzzy_index_cursor_worker(GTask * task,gpointer source_object,gpointer task_data,GCancellable * cancellable)331 dzl_fuzzy_index_cursor_worker (GTask *task,
332 gpointer source_object,
333 gpointer task_data,
334 GCancellable *cancellable)
335 {
336 DzlFuzzyIndexCursor *self = source_object;
337 g_autoptr(GHashTable) matches = NULL;
338 g_autoptr(GHashTable) by_document = NULL;
339 g_autoptr(GPtrArray) tables = NULL;
340 g_autoptr(GArray) tables_n_elements = NULL;
341 g_autofree gint *tables_state = NULL;
342 g_autofree gchar *freeme = NULL;
343 const gchar *query;
344 DzlFuzzyLookup lookup = { 0 };
345 GHashTableIter iter;
346 const gchar *str;
347 gpointer key, value;
348 guint i;
349
350 g_assert (DZL_IS_FUZZY_INDEX_CURSOR (self));
351 g_assert (G_IS_TASK (task));
352
353 if (g_task_return_error_if_cancelled (task))
354 return;
355
356 /* No matches with empty query */
357 if (self->query == NULL || *self->query == '\0')
358 goto cleanup;
359
360 /* If we are not case-sensitive, we need to downcase the query string */
361 query = self->query;
362 if (!self->case_sensitive)
363 query = freeme = g_utf8_casefold (query, -1);
364
365 tables = g_ptr_array_new ();
366 tables_n_elements = g_array_new (FALSE, FALSE, sizeof (gsize));
367 matches = g_hash_table_new_full (NULL, NULL, NULL, (GDestroyNotify)dzl_int_pair_free);
368
369 for (str = query; *str; str = g_utf8_next_char (str))
370 {
371 gunichar ch = g_utf8_get_char (str);
372 g_autoptr(GVariant) table = NULL;
373 gconstpointer fixed;
374 gsize n_elements;
375 gchar char_key[8];
376
377 if (g_unichar_isspace (ch))
378 continue;
379
380 char_key [g_unichar_to_utf8 (ch, char_key)] = '\0';
381 table = g_variant_dict_lookup_value (self->tables,
382 char_key,
383 (const GVariantType *)"a(uu)");
384
385 /* No possible matches, missing table for character */
386 if (table == NULL)
387 goto cleanup;
388
389 fixed = g_variant_get_fixed_array (table, &n_elements, sizeof (DzlFuzzyIndexItem));
390 g_array_append_val (tables_n_elements, n_elements);
391 g_ptr_array_add (tables, (gpointer)fixed);
392 }
393
394 if (tables->len == 0)
395 goto cleanup;
396
397 g_assert (tables->len > 0);
398 g_assert (tables->len == tables_n_elements->len);
399
400 tables_state = g_new0 (gint, tables->len);
401
402 lookup.index = self->index;
403 lookup.matches = matches;
404 lookup.tables = (const DzlFuzzyIndexItem * const *)tables->pdata;
405 lookup.tables_n_elements = (const gsize *)(gpointer)tables_n_elements->data;
406 lookup.tables_state = tables_state;
407 lookup.n_tables = tables->len;
408 lookup.needle = query;
409 lookup.max_matches = self->max_matches;
410
411 if G_LIKELY (lookup.n_tables > 1)
412 {
413 for (i = 0; i < lookup.tables_n_elements[0]; i++)
414 {
415 const DzlFuzzyIndexItem *item = &lookup.tables[0][i];
416
417 fuzzy_do_match (&lookup, item, 1, MIN (16, item->position * 2));
418 }
419 }
420 else
421 {
422 guint last_id = G_MAXUINT;
423
424 for (i = 0; i < lookup.tables_n_elements[0]; i++)
425 {
426 const DzlFuzzyIndexItem *item = &lookup.tables[0][i];
427 DzlFuzzyMatch match;
428
429 if (item->lookaside_id != last_id)
430 {
431 last_id = item->lookaside_id;
432
433 if G_UNLIKELY (!_dzl_fuzzy_index_resolve (self->index,
434 item->lookaside_id,
435 &match.document_id,
436 &match.key,
437 &match.priority,
438 item->position,
439 item->position,
440 &match.score))
441 continue;
442
443 g_array_append_val (self->matches, match);
444 }
445 }
446
447 goto cleanup;
448 }
449
450 if (g_task_return_error_if_cancelled (task))
451 return;
452
453 by_document = g_hash_table_new (NULL, NULL);
454
455 g_hash_table_iter_init (&iter, matches);
456
457 while (g_hash_table_iter_next (&iter, &key, &value))
458 {
459 DzlIntPair *pair = value;
460 guint score = dzl_int_pair_first (pair);
461 guint last_offset = dzl_int_pair_second (pair);
462 gpointer other_score;
463 DzlFuzzyMatch match = {0};
464 guint lookaside_id = GPOINTER_TO_UINT (key);
465
466 if G_UNLIKELY (!_dzl_fuzzy_index_resolve (self->index,
467 lookaside_id,
468 &match.document_id,
469 &match.key,
470 &match.priority,
471 score,
472 last_offset,
473 &match.score))
474 continue;
475
476 if (g_hash_table_lookup_extended (by_document,
477 GUINT_TO_POINTER (match.document_id),
478 NULL,
479 &other_score) &&
480 match.score <= pointer_to_float (other_score))
481 continue;
482
483 g_hash_table_insert (by_document,
484 GUINT_TO_POINTER (match.document_id),
485 float_to_pointer (match.score));
486
487 g_array_append_val (self->matches, match);
488 }
489
490 /*
491 * Because we have to do the deduplication of documents after
492 * searching all the potential matches, we could have duplicates
493 * in the array. So we need to filter them out. Not that big
494 * of a deal, since we can do remove_fast on the index and
495 * we have to sort them afterwards anyway. Potentially, we could
496 * do both at once if we felt like doing our own sort.
497 */
498 for (i = 0; i < self->matches->len; i++)
499 {
500 DzlFuzzyMatch *match;
501 gpointer other_score;
502
503 next:
504 match = &g_array_index (self->matches, DzlFuzzyMatch, i);
505 other_score = g_hash_table_lookup (by_document, GUINT_TO_POINTER (match->document_id));
506
507 /*
508 * This item should have been discarded, but we didn't have enough
509 * information at the time we built the array.
510 */
511 if (pointer_to_float (other_score) < match->score)
512 {
513 g_array_remove_index_fast (self->matches, i);
514 if (i < self->matches->len - 1)
515 goto next;
516 }
517 }
518
519 if (g_task_return_error_if_cancelled (task))
520 return;
521
522 cleanup:
523 if (self->matches != NULL)
524 {
525 g_array_sort (self->matches, fuzzy_match_compare);
526 if (lookup.max_matches > 0 && lookup.max_matches < self->matches->len)
527 g_array_set_size (self->matches, lookup.max_matches);
528 }
529
530 g_task_return_boolean (task, TRUE);
531 }
532
533 static void
dzl_fuzzy_index_cursor_init_async(GAsyncInitable * initable,gint io_priority,GCancellable * cancellable,GAsyncReadyCallback callback,gpointer user_data)534 dzl_fuzzy_index_cursor_init_async (GAsyncInitable *initable,
535 gint io_priority,
536 GCancellable *cancellable,
537 GAsyncReadyCallback callback,
538 gpointer user_data)
539 {
540 DzlFuzzyIndexCursor *self = (DzlFuzzyIndexCursor *)initable;
541 g_autoptr(GTask) task = NULL;
542
543 g_assert (DZL_IS_FUZZY_INDEX_CURSOR (self));
544 g_assert (!cancellable || G_IS_CANCELLABLE (cancellable));
545
546 task = g_task_new (self, cancellable, callback, user_data);
547 g_task_set_source_tag (task, dzl_fuzzy_index_cursor_init_async);
548 g_task_set_priority (task, io_priority);
549 g_task_set_check_cancellable (task, FALSE);
550 g_task_run_in_thread (task, dzl_fuzzy_index_cursor_worker);
551 }
552
553 static gboolean
dzl_fuzzy_index_cursor_init_finish(GAsyncInitable * initiable,GAsyncResult * result,GError ** error)554 dzl_fuzzy_index_cursor_init_finish (GAsyncInitable *initiable,
555 GAsyncResult *result,
556 GError **error)
557 {
558 g_assert (DZL_IS_FUZZY_INDEX_CURSOR (initiable));
559 g_assert (G_IS_TASK (result));
560
561 return g_task_propagate_boolean (G_TASK (result), error);
562 }
563
564 static void
async_initable_iface_init(GAsyncInitableIface * iface)565 async_initable_iface_init (GAsyncInitableIface *iface)
566 {
567 iface->init_async = dzl_fuzzy_index_cursor_init_async;
568 iface->init_finish = dzl_fuzzy_index_cursor_init_finish;
569 }
570
571 static GType
dzl_fuzzy_index_cursor_get_item_type(GListModel * model)572 dzl_fuzzy_index_cursor_get_item_type (GListModel *model)
573 {
574 return DZL_TYPE_FUZZY_INDEX_MATCH;
575 }
576
577 static guint
dzl_fuzzy_index_cursor_get_n_items(GListModel * model)578 dzl_fuzzy_index_cursor_get_n_items (GListModel *model)
579 {
580 DzlFuzzyIndexCursor *self = (DzlFuzzyIndexCursor *)model;
581
582 g_assert (DZL_IS_FUZZY_INDEX_CURSOR (self));
583
584 return self->matches->len;
585 }
586
587 static gpointer
dzl_fuzzy_index_cursor_get_item(GListModel * model,guint position)588 dzl_fuzzy_index_cursor_get_item (GListModel *model,
589 guint position)
590 {
591 DzlFuzzyIndexCursor *self = (DzlFuzzyIndexCursor *)model;
592 g_autoptr(GVariant) document = NULL;
593 DzlFuzzyMatch *match;
594
595 g_assert (DZL_IS_FUZZY_INDEX_CURSOR (self));
596 g_assert (position < self->matches->len);
597
598 match = &g_array_index (self->matches, DzlFuzzyMatch, position);
599
600 document = _dzl_fuzzy_index_lookup_document (self->index, match->document_id);
601
602 return g_object_new (DZL_TYPE_FUZZY_INDEX_MATCH,
603 "document", document,
604 "key", match->key,
605 "score", match->score,
606 "priority", match->priority,
607 NULL);
608 }
609
610 static void
list_model_iface_init(GListModelInterface * iface)611 list_model_iface_init (GListModelInterface *iface)
612 {
613 iface->get_item_type = dzl_fuzzy_index_cursor_get_item_type;
614 iface->get_n_items = dzl_fuzzy_index_cursor_get_n_items;
615 iface->get_item = dzl_fuzzy_index_cursor_get_item;
616 }
617
618 /**
619 * dzl_fuzzy_index_cursor_get_index:
620 * @self: A #DzlFuzzyIndexCursor
621 *
622 * Gets the index the cursor is iterating.
623 *
624 * Returns: (transfer none): A #DzlFuzzyIndex.
625 */
626 DzlFuzzyIndex *
dzl_fuzzy_index_cursor_get_index(DzlFuzzyIndexCursor * self)627 dzl_fuzzy_index_cursor_get_index (DzlFuzzyIndexCursor *self)
628 {
629 g_return_val_if_fail (DZL_IS_FUZZY_INDEX_CURSOR (self), NULL);
630
631 return self->index;
632 }
633