1 /*
2  * matcher.c
3  * Copyright (C) 2014 Kovid Goyal <kovid at kovidgoyal.net>
4  *
5  * Distributed under terms of the GPL3 license.
6  */
7 
8 #define NO_ICU_TO_PYTHON
9 #define NO_PYTHON_TO_ICU32
10 #include "icu_calibre_utils.h"
11 #include <float.h>
12 #include <stdbool.h>
13 
14 #ifdef _MSC_VER
15 // inline does not work with the visual studio C compiler
16 #define inline
17 #endif
18 
19 #define MAX(x, y) ((x > y) ? x : y)
20 #define nullfree(x) if(x != NULL) free(x); x = NULL;
21 
22 // Algorithm to sort items by subsequence score {{{
23 typedef struct {
24     double score;
25     int32_t *positions;
26 } MemoryItem;
27 
alloc_memory(int32_t needle_len,int32_t max_haystack_len)28 static MemoryItem*** alloc_memory(int32_t needle_len, int32_t max_haystack_len) {
29     MemoryItem ***ans = NULL, **d1 = NULL, *d2 = NULL;
30     size_t num = max_haystack_len * max_haystack_len * needle_len;
31     size_t position_sz = needle_len * sizeof(int32_t);
32     size_t sz = (num * (sizeof(MemoryItem) + position_sz)) + (max_haystack_len * sizeof(MemoryItem**)) + (needle_len * sizeof(MemoryItem*));
33     int32_t hidx, nidx, last_idx, i, j;
34     char *base = NULL;
35 
36     ans = (MemoryItem***) calloc(sz, 1);
37     if (ans != NULL) {
38         d1 = (MemoryItem**)(ans + max_haystack_len);
39         d2 = (MemoryItem*) (d1 + max_haystack_len * needle_len );
40         for (i = 0; i < max_haystack_len; i++) {
41             ans[i] = d1 + i * needle_len;
42             for (j = 0; j < needle_len; j++) d1[i*needle_len + j] = d2 + j;
43         }
44 
45         base = ((char*)ans) + (sizeof(MemoryItem**)*max_haystack_len) + (sizeof(MemoryItem*)*needle_len) + (sizeof(MemoryItem)*max_haystack_len);
46 
47         for (hidx = 0; hidx < max_haystack_len; hidx++) {
48             for (nidx = 0; nidx < needle_len; nidx++) {
49                 for (last_idx = 0; last_idx < max_haystack_len; last_idx++) {
50                     ans[hidx][nidx][last_idx].positions = (int32_t*)base;
51                     base += position_sz;
52                 }
53             }
54         }
55     }
56     return ans;
57 }
58 
clear_memory(MemoryItem *** mem,int32_t needle_len,int32_t max_haystack_len)59 static void clear_memory(MemoryItem ***mem, int32_t needle_len, int32_t max_haystack_len) {
60     int32_t hidx, nidx, last_idx;
61     for (hidx = 0; hidx < max_haystack_len; hidx++) {
62         for (nidx = 0; nidx < needle_len; nidx++) {
63             for (last_idx = 0; last_idx < max_haystack_len; last_idx++) {
64                 mem[hidx][nidx][last_idx].score = DBL_MAX;
65             }
66         }
67     }
68 }
69 
70 typedef struct {
71     int32_t hidx;
72     int32_t nidx;
73     int32_t last_idx;
74     double score;
75     int32_t *positions;
76 } StackItem;
77 
78 typedef struct {
79     ssize_t pos;
80     int32_t needle_len;
81     size_t size;
82     StackItem *items;
83 } Stack;
84 
alloc_stack(Stack * stack,int32_t needle_len,int32_t max_haystack_len)85 static void alloc_stack(Stack *stack, int32_t needle_len, int32_t max_haystack_len) {
86     StackItem *ans = NULL;
87     char *base = NULL;
88     size_t num = max_haystack_len * needle_len;
89     size_t position_sz = needle_len * sizeof(int32_t);
90     size_t sz = sizeof(StackItem) + position_sz;
91     size_t i = 0;
92 
93     stack->needle_len = needle_len;
94     stack->pos = -1;
95     stack->size = num;
96     ans = (StackItem*) calloc(num, sz);
97     if (ans != NULL) {
98         base = (char*)(ans + num);
99         for (i = 0; i < num; i++, base += position_sz) ans[i].positions = (int32_t*) base;
100         stack->items = ans;
101     }
102 }
103 
stack_clear(Stack * stack)104 static void stack_clear(Stack *stack) { stack->pos = -1; }
105 
stack_push(Stack * stack,int32_t hidx,int32_t nidx,int32_t last_idx,double score,int32_t * positions)106 static void stack_push(Stack *stack, int32_t hidx, int32_t nidx, int32_t last_idx, double score, int32_t *positions) {
107     StackItem *si = &(stack->items[++stack->pos]);
108     si->hidx = hidx; si->nidx = nidx; si->last_idx = last_idx; si->score = score;
109     memcpy(si->positions, positions, sizeof(*positions) * stack->needle_len);
110 }
111 
stack_pop(Stack * stack,int32_t * hidx,int32_t * nidx,int32_t * last_idx,double * score,int32_t * positions)112 static void stack_pop(Stack *stack, int32_t *hidx, int32_t *nidx, int32_t *last_idx, double *score, int32_t *positions) {
113     StackItem *si = &(stack->items[stack->pos--]);
114     *hidx = si->hidx; *nidx = si->nidx; *last_idx = si->last_idx; *score = si->score;
115     memcpy(positions, si->positions, sizeof(*positions) * stack->needle_len);
116 }
117 
118 typedef struct {
119     UChar *haystack;
120     int32_t haystack_len;
121     UChar *needle;
122     int32_t needle_len;
123     double max_score_per_char;
124     MemoryItem ***memo;
125     UChar *level1;
126     UChar *level2;
127     UChar *level3;
128 } MatchInfo;
129 
130 typedef struct {
131     double score;
132     int32_t *positions;
133 } Match;
134 
135 
calc_score_for_char(MatchInfo * m,UChar32 last,UChar32 current,int32_t distance_from_last_match)136 static double calc_score_for_char(MatchInfo *m, UChar32 last, UChar32 current, int32_t distance_from_last_match) {
137     double factor = 1.0;
138     double ans = m->max_score_per_char;
139 
140     if (u_strchr32(m->level1, last) != NULL)
141         factor = 0.9;
142     else if (u_strchr32(m->level2, last) != NULL)
143         factor = 0.8;
144     else if (u_isULowercase(last) && u_isUUppercase(current))
145         factor = 0.8;  // CamelCase
146     else if (u_strchr32(m->level3, last) != NULL)
147         factor = 0.7;
148     else
149         // If last is not a special char, factor diminishes
150         // as distance from last matched char increases
151         factor = (1.0 / distance_from_last_match) * 0.75;
152     return ans * factor;
153 }
154 
convert_positions(int32_t * positions,int32_t * final_positions,UChar * string,int32_t char_len,int32_t byte_len,double score)155 static void convert_positions(int32_t *positions, int32_t *final_positions, UChar *string, int32_t char_len, int32_t byte_len, double score) {
156     // The positions array stores character positions as byte offsets in string, convert them into character offsets
157     int32_t i, *end;
158 
159     if (score == 0.0) { for (i = 0; i < char_len; i++) final_positions[i] = -1; return; }
160 
161     end = final_positions + char_len;
162     for (i = 0; i < byte_len && final_positions < end; i++) {
163         if (positions[i] == -1) continue;
164 #if PY_VERSION_HEX >= 0x03030000
165         *final_positions = u_countChar32(string, positions[i]);
166 #else
167 #ifdef Py_UNICODE_WIDE
168         *final_positions = u_countChar32(string, positions[i]);
169 #else
170         *final_positions = positions[i];
171 #endif
172 #endif
173         final_positions += 1;
174     }
175 }
176 
process_item(MatchInfo * m,Stack * stack,int32_t * final_positions,UStringSearch ** searches)177 static double process_item(MatchInfo *m, Stack *stack, int32_t *final_positions, UStringSearch **searches) {
178     UChar32 hc, lc;
179     double final_score = 0.0, score = 0.0, score_for_char = 0.0;
180     int32_t pos, i, j, hidx, nidx, last_idx, distance, *positions = final_positions + m->needle_len;
181     MemoryItem mem = {0};
182     UStringSearch *search = NULL;
183     UErrorCode status = U_ZERO_ERROR;
184 
185     stack_push(stack, 0, 0, 0, 0.0, final_positions);
186 
187     while (stack->pos >= 0) {
188         stack_pop(stack, &hidx, &nidx, &last_idx, &score, positions);
189         mem = m->memo[hidx][nidx][last_idx];
190         if (mem.score == DBL_MAX) {
191             // No memoized result, calculate the score
192             for (i = nidx; i < m->needle_len;) {
193                 nidx = i;
194                 U16_FWD_1(m->needle, i, m->needle_len);// i now points to next char in needle
195                 search = searches[nidx];
196                 if (search == NULL || m->haystack_len - hidx < m->needle_len - nidx) { score = 0.0; break; }
197                 status = U_ZERO_ERROR; // We ignore any errors as we already know that hidx is correct
198                 usearch_setOffset(search, hidx, &status);
199                 status = U_ZERO_ERROR;
200                 pos = usearch_next(search, &status);
201                 if (pos == USEARCH_DONE) { score = 0.0; break; } // No matches found
202                 distance = u_countChar32(m->haystack + last_idx, pos - last_idx);
203                 if (distance <= 1) score_for_char = m->max_score_per_char;
204                 else {
205                     U16_GET(m->haystack, 0, pos, m->haystack_len, hc);
206                     j = pos;
207                     U16_PREV(m->haystack, 0, j, lc); // lc is the prev character
208                     score_for_char = calc_score_for_char(m, lc, hc, distance);
209                 }
210                 j = pos;
211                 U16_NEXT(m->haystack, j, m->haystack_len, hc);
212                 hidx = j;
213                 if (m->haystack_len - hidx >= m->needle_len - nidx) stack_push(stack, hidx, nidx, last_idx, score, positions);
214                 last_idx = pos;
215                 positions[nidx] = pos;
216                 score += score_for_char;
217             } // for(i) iterate over needle
218             mem.score = score; memcpy(mem.positions, positions, sizeof(*positions) * m->needle_len);
219 
220         } else {
221             score = mem.score; memcpy(positions, mem.positions, sizeof(*positions) * m->needle_len);
222         }
223         // We have calculated the score for this hidx, nidx, last_idx combination, update final_score and final_positions, if needed
224         if (score > final_score) {
225             final_score = score;
226             memcpy(final_positions, positions, sizeof(*positions) * m->needle_len);
227         }
228     }
229     return final_score;
230 }
231 
create_searches(UStringSearch ** searches,UChar * haystack,int32_t haystack_len,UChar * needle,int32_t needle_len,UCollator * collator)232 static bool create_searches(UStringSearch **searches, UChar *haystack, int32_t haystack_len, UChar *needle, int32_t needle_len, UCollator *collator) {
233     int32_t i = 0, pos = 0;
234     UErrorCode status = U_ZERO_ERROR;
235 
236     while (i < needle_len) {
237         pos = i;
238         U16_FWD_1(needle, i, needle_len);
239         if (pos == i) break;
240         searches[pos] = usearch_openFromCollator(needle + pos, i - pos, haystack, haystack_len, collator, NULL, &status);
241         if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); searches[pos] = NULL; return false; }
242     }
243 
244     return true;
245 }
246 
free_searches(UStringSearch ** searches,int32_t count)247 static void free_searches(UStringSearch **searches, int32_t count) {
248     int32_t i = 0;
249     for (i = 0; i < count; i++) {
250         if (searches[i] != NULL) usearch_close(searches[i]);
251         searches[i] = NULL;
252     }
253 }
254 
match(UChar ** items,int32_t * item_lengths,uint32_t item_count,UChar * needle,Match * match_results,int32_t * final_positions,int32_t needle_char_len,UCollator * collator,UChar * level1,UChar * level2,UChar * level3)255 static bool match(UChar **items, int32_t *item_lengths, uint32_t item_count, UChar *needle, Match *match_results, int32_t *final_positions, int32_t needle_char_len, UCollator *collator, UChar *level1, UChar *level2, UChar *level3) {
256     Stack stack = {0};
257     int32_t i = 0, maxhl = 0;
258     int32_t r = 0, *positions = NULL;
259     MatchInfo *matches = NULL;
260     bool ok = false;
261     MemoryItem ***memo = NULL;
262     int32_t needle_len = u_strlen(needle);
263     UStringSearch **searches = NULL;
264 
265     if (needle_len <= 0 || item_count <= 0) {
266         for (i = 0; i < (int32_t)item_count; i++) match_results[i].score = 0.0;
267         ok = true;
268         goto end;
269     }
270 
271     matches = (MatchInfo*)calloc(item_count, sizeof(MatchInfo));
272     positions = (int32_t*)calloc(2*needle_len, sizeof(int32_t)); // One set of positions is the final answer and one set is working space
273     searches = (UStringSearch**) calloc(needle_len, sizeof(UStringSearch*));
274     if (matches == NULL || positions == NULL || searches == NULL) {PyErr_NoMemory(); goto end;}
275 
276     for (i = 0; i < (int32_t)item_count; i++) {
277         matches[i].haystack = items[i];
278         matches[i].haystack_len = item_lengths[i];
279         matches[i].needle = needle;
280         matches[i].needle_len = needle_len;
281         matches[i].max_score_per_char = (1.0 / matches[i].haystack_len + 1.0 / needle_len) / 2.0;
282         matches[i].level1 = level1;
283         matches[i].level2 = level2;
284         matches[i].level3 = level3;
285         maxhl = MAX(maxhl, matches[i].haystack_len);
286     }
287 
288     if (maxhl <= 0) {
289         for (i = 0; i < (int32_t)item_count; i++) match_results[i].score = 0.0;
290         ok = true;
291         goto end;
292     }
293 
294     alloc_stack(&stack, needle_len, maxhl);
295     memo = alloc_memory(needle_len, maxhl);
296     if (stack.items == NULL || memo == NULL) {PyErr_NoMemory(); goto end;}
297 
298     for (i = 0; i < (int32_t)item_count; i++) {
299         for (r = 0; r < needle_len; r++)  positions[r] = -1;
300         stack_clear(&stack);
301         clear_memory(memo, needle_len, matches[i].haystack_len);
302         free_searches(searches, needle_len);
303         if (!create_searches(searches, matches[i].haystack, matches[i].haystack_len, needle, needle_len, collator)) goto end;
304         matches[i].memo = memo;
305         match_results[i].score = process_item(&matches[i], &stack, positions, searches);
306         convert_positions(positions, final_positions + i * needle_char_len, matches[i].haystack, needle_char_len, needle_len, match_results[i].score);
307     }
308 
309     ok = true;
310 end:
311     nullfree(positions);
312     nullfree(stack.items);
313     nullfree(matches);
314     nullfree(memo);
315     if (searches != NULL) { free_searches(searches, needle_len); nullfree(searches); }
316     return ok;
317 }
318 
319 // }}}
320 
321 // Matcher object definition {{{
322 typedef struct {
323     PyObject_HEAD
324     // Type-specific fields go here.
325     UChar **items;
326     uint32_t item_count;
327     int32_t *item_lengths;
328     UChar *level1;
329     UChar *level2;
330     UChar *level3;
331     UCollator *collator;
332 } Matcher;
333 
334 // Matcher.__init__() {{{
335 
free_matcher(Matcher * self)336 static void free_matcher(Matcher *self) {
337     uint32_t i = 0;
338     if (self->items != NULL) {
339         for (i = 0; i < self->item_count; i++) { nullfree(self->items[i]); }
340     }
341     nullfree(self->items); nullfree(self->item_lengths);
342     nullfree(self->level1); nullfree(self->level2); nullfree(self->level3);
343     if (self->collator != NULL) { ucol_close(self->collator); self->collator = NULL; }
344 }
345 static void
Matcher_dealloc(Matcher * self)346 Matcher_dealloc(Matcher* self)
347 {
348     free_matcher(self);
349     Py_TYPE(self)->tp_free((PyObject*)self);
350 }
351 
352 #define alloc_uchar(x) (x * 3 + 1)
353 static int
Matcher_init(Matcher * self,PyObject * args,PyObject * kwds)354 Matcher_init(Matcher *self, PyObject *args, PyObject *kwds)
355 {
356     PyObject *items = NULL, *p = NULL, *py_items = NULL, *level1 = NULL, *level2 = NULL, *level3 = NULL, *collator = NULL;
357     int32_t i = 0;
358     UErrorCode status = U_ZERO_ERROR;
359     UCollator *col = NULL;
360 
361     if (!PyArg_ParseTuple(args, "OOOOO", &items, &collator, &level1, &level2, &level3)) return -1;
362 
363     // Clone the passed in collator (cloning is needed as collators are not thread safe)
364     if (!PyCapsule_CheckExact(collator)) { PyErr_SetString(PyExc_TypeError, "Collator must be a capsule"); return -1; }
365     col = (UCollator*)PyCapsule_GetPointer(collator, NULL);
366     if (col == NULL) return -1;
367     self->collator = ucol_safeClone(col, NULL, NULL, &status);
368     col = NULL;
369     if (U_FAILURE(status)) { self->collator = NULL; PyErr_SetString(PyExc_ValueError, u_errorName(status)); return -1; }
370 
371     py_items = PySequence_Fast(items,  "Must pass in two sequence objects");
372     if (py_items == NULL) goto end;
373     self->item_count = (uint32_t)PySequence_Size(items);
374 
375     self->items = (UChar**)calloc(self->item_count, sizeof(UChar*));
376     self->item_lengths = (int32_t*)calloc(self->item_count, sizeof(uint32_t));
377     self->level1 = python_to_icu(level1, NULL);
378     self->level2 = python_to_icu(level2, NULL);
379     self->level3 = python_to_icu(level3, NULL);
380 
381     if (self->items == NULL || self->item_lengths == NULL ) { PyErr_NoMemory(); goto end; }
382     if (self->level1 == NULL || self->level2 == NULL || self->level3 == NULL) goto end;
383 
384     for (i = 0; i < (int32_t)self->item_count; i++) {
385         p = PySequence_Fast_GET_ITEM(py_items, i);
386         self->items[i] = python_to_icu(p, self->item_lengths + i);
387         if (self->items[i] == NULL) { PyErr_NoMemory(); goto end; }
388     }
389 
390 end:
391     Py_XDECREF(py_items);
392     if (PyErr_Occurred()) { free_matcher(self); }
393     return (PyErr_Occurred()) ? -1 : 0;
394 }
395 // Matcher.__init__() }}}
396 
397 // Matcher.calculate_scores {{{
398 static PyObject *
Matcher_calculate_scores(Matcher * self,PyObject * args)399 Matcher_calculate_scores(Matcher *self, PyObject *args) {
400     int32_t *final_positions = NULL, *p;
401     Match *matches = NULL;
402     bool ok = false;
403     uint32_t i = 0, needle_char_len = 0, j = 0;
404     PyObject *items = NULL, *score = NULL, *positions = NULL, *pneedle = NULL;
405     UChar *needle = NULL;
406 
407     if (!PyArg_ParseTuple(args, "O", &pneedle)) return NULL;
408 
409     needle = python_to_icu(pneedle, NULL);
410     if (needle == NULL) return NULL;
411     needle_char_len = u_countChar32(needle, -1);
412     items = PyTuple_New(self->item_count);
413     positions = PyTuple_New(self->item_count);
414     matches = (Match*)calloc(self->item_count, sizeof(Match));
415     final_positions = (int32_t*) calloc(needle_char_len * self->item_count, sizeof(int32_t));
416     if (items == NULL || matches == NULL || final_positions == NULL || positions == NULL) {PyErr_NoMemory(); goto end;}
417 
418     for (i = 0; i < self->item_count; i++) {
419         score = PyTuple_New(needle_char_len);
420         if (score == NULL) { PyErr_NoMemory(); goto end; }
421         PyTuple_SET_ITEM(positions, (Py_ssize_t)i, score);
422     }
423 
424     Py_BEGIN_ALLOW_THREADS;
425     ok = match(self->items, self->item_lengths, self->item_count, needle, matches, final_positions, needle_char_len, self->collator, self->level1, self->level2, self->level3);
426     Py_END_ALLOW_THREADS;
427 
428     if (ok) {
429         for (i = 0; i < self->item_count; i++) {
430             score = PyFloat_FromDouble(matches[i].score);
431             if (score == NULL) { PyErr_NoMemory(); goto end; }
432             PyTuple_SET_ITEM(items, (Py_ssize_t)i, score);
433             p = final_positions + (i * needle_char_len);
434             for (j = 0; j < needle_char_len; j++) {
435                 score = PyLong_FromLong((long)p[j]);
436                 if (score == NULL) { PyErr_NoMemory(); goto end; }
437                 PyTuple_SET_ITEM(PyTuple_GET_ITEM(positions, (Py_ssize_t)i), (Py_ssize_t)j, score);
438             }
439         }
440     } else { PyErr_NoMemory(); goto end; }
441 
442 end:
443     nullfree(needle);
444     nullfree(matches);
445     nullfree(final_positions);
446     if (PyErr_Occurred()) { Py_XDECREF(items); items = NULL; Py_XDECREF(positions); positions = NULL; return NULL; }
447     return Py_BuildValue("NN", items, positions);
448 } // }}}
449 
450 static PyMethodDef Matcher_methods[] = {
451     {"calculate_scores", (PyCFunction)Matcher_calculate_scores, METH_VARARGS,
452      "calculate_scores(query) -> Return the scores for all items given query as a tuple."
453     },
454 
455     {NULL, NULL}  /* Sentinel */
456 };
457 
458 
459 // }}}
460 
461 static PyTypeObject MatcherType = { // {{{
462     PyVarObject_HEAD_INIT(NULL, 0)
463     /* tp_name           */ "matcher.Matcher",
464     /* tp_basicsiz       */ sizeof(Matcher),
465     /* tp_itemsize       */ 0,
466     /* tp_dealloc        */ (destructor)Matcher_dealloc,
467     /* tp_print          */ 0,
468     /* tp_getattr        */ 0,
469     /* tp_setattr        */ 0,
470     /* tp_as_async       */ 0,
471     /* tp_repr           */ 0,
472     /* tp_as_number      */ 0,
473     /* tp_as_sequence    */ 0,
474     /* tp_as_mapping     */ 0,
475     /* tp_hash           */ 0,
476     /* tp_call           */ 0,
477     /* tp_str            */ 0,
478     /* tp_getattro       */ 0,
479     /* tp_setattro       */ 0,
480     /* tp_as_buffer      */ 0,
481     /* tp_flags          */ Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE,
482     /* tp_doc            */ "Matcher",
483     /* tp_traverse       */ 0,
484     /* tp_clear          */ 0,
485     /* tp_richcompare    */ 0,
486     /* tp_weaklistoffset */ 0,
487     /* tp_iter           */ 0,
488     /* tp_iternext       */ 0,
489     /* tp_methods        */ Matcher_methods,
490     /* tp_members        */ 0,
491     /* tp_getset         */ 0,
492     /* tp_base           */ 0,
493     /* tp_dict           */ 0,
494     /* tp_descr_get      */ 0,
495     /* tp_descr_set      */ 0,
496     /* tp_dictoffset     */ 0,
497     /* tp_init           */ (initproc)Matcher_init,
498     /* tp_alloc          */ 0,
499     /* tp_new            */ PyType_GenericNew,
500 }; // }}}
501 
502 static int
exec_module(PyObject * mod)503 exec_module(PyObject *mod) {
504     if (PyType_Ready(&MatcherType) < 0) return -1;
505     Py_INCREF(&MatcherType);
506     if(PyModule_AddObject(mod, "Matcher", (PyObject *)&MatcherType) < 0) {
507         Py_DECREF(&MatcherType);
508         return -1;
509     }
510 	return 0;
511 }
512 
513 static PyModuleDef_Slot slots[] = { {Py_mod_exec, exec_module}, {0, NULL} };
514 
515 static struct PyModuleDef module_def = {
516     .m_base     = PyModuleDef_HEAD_INIT,
517     .m_name     = "matcher",
518     .m_doc      = "Find subsequence matches.",
519     .m_slots    = slots,
520 };
521 
PyInit_matcher(void)522 CALIBRE_MODINIT_FUNC PyInit_matcher(void) { return PyModuleDef_Init(&module_def); }
523