1 /*
2 * matcher.c
3 * Copyright (C) 2014 Kovid Goyal <kovid at kovidgoyal.net>
4 *
5 * Distributed under terms of the GPL3 license.
6 */
7
8 #define NO_ICU_TO_PYTHON
9 #define NO_PYTHON_TO_ICU32
10 #include "icu_calibre_utils.h"
11 #include <float.h>
12 #include <stdbool.h>
13
14 #ifdef _MSC_VER
15 // inline does not work with the visual studio C compiler
16 #define inline
17 #endif
18
19 #define MAX(x, y) ((x > y) ? x : y)
20 #define nullfree(x) if(x != NULL) free(x); x = NULL;
21
22 // Algorithm to sort items by subsequence score {{{
23 typedef struct {
24 double score;
25 int32_t *positions;
26 } MemoryItem;
27
alloc_memory(int32_t needle_len,int32_t max_haystack_len)28 static MemoryItem*** alloc_memory(int32_t needle_len, int32_t max_haystack_len) {
29 MemoryItem ***ans = NULL, **d1 = NULL, *d2 = NULL;
30 size_t num = max_haystack_len * max_haystack_len * needle_len;
31 size_t position_sz = needle_len * sizeof(int32_t);
32 size_t sz = (num * (sizeof(MemoryItem) + position_sz)) + (max_haystack_len * sizeof(MemoryItem**)) + (needle_len * sizeof(MemoryItem*));
33 int32_t hidx, nidx, last_idx, i, j;
34 char *base = NULL;
35
36 ans = (MemoryItem***) calloc(sz, 1);
37 if (ans != NULL) {
38 d1 = (MemoryItem**)(ans + max_haystack_len);
39 d2 = (MemoryItem*) (d1 + max_haystack_len * needle_len );
40 for (i = 0; i < max_haystack_len; i++) {
41 ans[i] = d1 + i * needle_len;
42 for (j = 0; j < needle_len; j++) d1[i*needle_len + j] = d2 + j;
43 }
44
45 base = ((char*)ans) + (sizeof(MemoryItem**)*max_haystack_len) + (sizeof(MemoryItem*)*needle_len) + (sizeof(MemoryItem)*max_haystack_len);
46
47 for (hidx = 0; hidx < max_haystack_len; hidx++) {
48 for (nidx = 0; nidx < needle_len; nidx++) {
49 for (last_idx = 0; last_idx < max_haystack_len; last_idx++) {
50 ans[hidx][nidx][last_idx].positions = (int32_t*)base;
51 base += position_sz;
52 }
53 }
54 }
55 }
56 return ans;
57 }
58
clear_memory(MemoryItem *** mem,int32_t needle_len,int32_t max_haystack_len)59 static void clear_memory(MemoryItem ***mem, int32_t needle_len, int32_t max_haystack_len) {
60 int32_t hidx, nidx, last_idx;
61 for (hidx = 0; hidx < max_haystack_len; hidx++) {
62 for (nidx = 0; nidx < needle_len; nidx++) {
63 for (last_idx = 0; last_idx < max_haystack_len; last_idx++) {
64 mem[hidx][nidx][last_idx].score = DBL_MAX;
65 }
66 }
67 }
68 }
69
70 typedef struct {
71 int32_t hidx;
72 int32_t nidx;
73 int32_t last_idx;
74 double score;
75 int32_t *positions;
76 } StackItem;
77
78 typedef struct {
79 ssize_t pos;
80 int32_t needle_len;
81 size_t size;
82 StackItem *items;
83 } Stack;
84
alloc_stack(Stack * stack,int32_t needle_len,int32_t max_haystack_len)85 static void alloc_stack(Stack *stack, int32_t needle_len, int32_t max_haystack_len) {
86 StackItem *ans = NULL;
87 char *base = NULL;
88 size_t num = max_haystack_len * needle_len;
89 size_t position_sz = needle_len * sizeof(int32_t);
90 size_t sz = sizeof(StackItem) + position_sz;
91 size_t i = 0;
92
93 stack->needle_len = needle_len;
94 stack->pos = -1;
95 stack->size = num;
96 ans = (StackItem*) calloc(num, sz);
97 if (ans != NULL) {
98 base = (char*)(ans + num);
99 for (i = 0; i < num; i++, base += position_sz) ans[i].positions = (int32_t*) base;
100 stack->items = ans;
101 }
102 }
103
stack_clear(Stack * stack)104 static void stack_clear(Stack *stack) { stack->pos = -1; }
105
stack_push(Stack * stack,int32_t hidx,int32_t nidx,int32_t last_idx,double score,int32_t * positions)106 static void stack_push(Stack *stack, int32_t hidx, int32_t nidx, int32_t last_idx, double score, int32_t *positions) {
107 StackItem *si = &(stack->items[++stack->pos]);
108 si->hidx = hidx; si->nidx = nidx; si->last_idx = last_idx; si->score = score;
109 memcpy(si->positions, positions, sizeof(*positions) * stack->needle_len);
110 }
111
stack_pop(Stack * stack,int32_t * hidx,int32_t * nidx,int32_t * last_idx,double * score,int32_t * positions)112 static void stack_pop(Stack *stack, int32_t *hidx, int32_t *nidx, int32_t *last_idx, double *score, int32_t *positions) {
113 StackItem *si = &(stack->items[stack->pos--]);
114 *hidx = si->hidx; *nidx = si->nidx; *last_idx = si->last_idx; *score = si->score;
115 memcpy(positions, si->positions, sizeof(*positions) * stack->needle_len);
116 }
117
118 typedef struct {
119 UChar *haystack;
120 int32_t haystack_len;
121 UChar *needle;
122 int32_t needle_len;
123 double max_score_per_char;
124 MemoryItem ***memo;
125 UChar *level1;
126 UChar *level2;
127 UChar *level3;
128 } MatchInfo;
129
130 typedef struct {
131 double score;
132 int32_t *positions;
133 } Match;
134
135
calc_score_for_char(MatchInfo * m,UChar32 last,UChar32 current,int32_t distance_from_last_match)136 static double calc_score_for_char(MatchInfo *m, UChar32 last, UChar32 current, int32_t distance_from_last_match) {
137 double factor = 1.0;
138 double ans = m->max_score_per_char;
139
140 if (u_strchr32(m->level1, last) != NULL)
141 factor = 0.9;
142 else if (u_strchr32(m->level2, last) != NULL)
143 factor = 0.8;
144 else if (u_isULowercase(last) && u_isUUppercase(current))
145 factor = 0.8; // CamelCase
146 else if (u_strchr32(m->level3, last) != NULL)
147 factor = 0.7;
148 else
149 // If last is not a special char, factor diminishes
150 // as distance from last matched char increases
151 factor = (1.0 / distance_from_last_match) * 0.75;
152 return ans * factor;
153 }
154
convert_positions(int32_t * positions,int32_t * final_positions,UChar * string,int32_t char_len,int32_t byte_len,double score)155 static void convert_positions(int32_t *positions, int32_t *final_positions, UChar *string, int32_t char_len, int32_t byte_len, double score) {
156 // The positions array stores character positions as byte offsets in string, convert them into character offsets
157 int32_t i, *end;
158
159 if (score == 0.0) { for (i = 0; i < char_len; i++) final_positions[i] = -1; return; }
160
161 end = final_positions + char_len;
162 for (i = 0; i < byte_len && final_positions < end; i++) {
163 if (positions[i] == -1) continue;
164 #if PY_VERSION_HEX >= 0x03030000
165 *final_positions = u_countChar32(string, positions[i]);
166 #else
167 #ifdef Py_UNICODE_WIDE
168 *final_positions = u_countChar32(string, positions[i]);
169 #else
170 *final_positions = positions[i];
171 #endif
172 #endif
173 final_positions += 1;
174 }
175 }
176
process_item(MatchInfo * m,Stack * stack,int32_t * final_positions,UStringSearch ** searches)177 static double process_item(MatchInfo *m, Stack *stack, int32_t *final_positions, UStringSearch **searches) {
178 UChar32 hc, lc;
179 double final_score = 0.0, score = 0.0, score_for_char = 0.0;
180 int32_t pos, i, j, hidx, nidx, last_idx, distance, *positions = final_positions + m->needle_len;
181 MemoryItem mem = {0};
182 UStringSearch *search = NULL;
183 UErrorCode status = U_ZERO_ERROR;
184
185 stack_push(stack, 0, 0, 0, 0.0, final_positions);
186
187 while (stack->pos >= 0) {
188 stack_pop(stack, &hidx, &nidx, &last_idx, &score, positions);
189 mem = m->memo[hidx][nidx][last_idx];
190 if (mem.score == DBL_MAX) {
191 // No memoized result, calculate the score
192 for (i = nidx; i < m->needle_len;) {
193 nidx = i;
194 U16_FWD_1(m->needle, i, m->needle_len);// i now points to next char in needle
195 search = searches[nidx];
196 if (search == NULL || m->haystack_len - hidx < m->needle_len - nidx) { score = 0.0; break; }
197 status = U_ZERO_ERROR; // We ignore any errors as we already know that hidx is correct
198 usearch_setOffset(search, hidx, &status);
199 status = U_ZERO_ERROR;
200 pos = usearch_next(search, &status);
201 if (pos == USEARCH_DONE) { score = 0.0; break; } // No matches found
202 distance = u_countChar32(m->haystack + last_idx, pos - last_idx);
203 if (distance <= 1) score_for_char = m->max_score_per_char;
204 else {
205 U16_GET(m->haystack, 0, pos, m->haystack_len, hc);
206 j = pos;
207 U16_PREV(m->haystack, 0, j, lc); // lc is the prev character
208 score_for_char = calc_score_for_char(m, lc, hc, distance);
209 }
210 j = pos;
211 U16_NEXT(m->haystack, j, m->haystack_len, hc);
212 hidx = j;
213 if (m->haystack_len - hidx >= m->needle_len - nidx) stack_push(stack, hidx, nidx, last_idx, score, positions);
214 last_idx = pos;
215 positions[nidx] = pos;
216 score += score_for_char;
217 } // for(i) iterate over needle
218 mem.score = score; memcpy(mem.positions, positions, sizeof(*positions) * m->needle_len);
219
220 } else {
221 score = mem.score; memcpy(positions, mem.positions, sizeof(*positions) * m->needle_len);
222 }
223 // We have calculated the score for this hidx, nidx, last_idx combination, update final_score and final_positions, if needed
224 if (score > final_score) {
225 final_score = score;
226 memcpy(final_positions, positions, sizeof(*positions) * m->needle_len);
227 }
228 }
229 return final_score;
230 }
231
create_searches(UStringSearch ** searches,UChar * haystack,int32_t haystack_len,UChar * needle,int32_t needle_len,UCollator * collator)232 static bool create_searches(UStringSearch **searches, UChar *haystack, int32_t haystack_len, UChar *needle, int32_t needle_len, UCollator *collator) {
233 int32_t i = 0, pos = 0;
234 UErrorCode status = U_ZERO_ERROR;
235
236 while (i < needle_len) {
237 pos = i;
238 U16_FWD_1(needle, i, needle_len);
239 if (pos == i) break;
240 searches[pos] = usearch_openFromCollator(needle + pos, i - pos, haystack, haystack_len, collator, NULL, &status);
241 if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); searches[pos] = NULL; return false; }
242 }
243
244 return true;
245 }
246
free_searches(UStringSearch ** searches,int32_t count)247 static void free_searches(UStringSearch **searches, int32_t count) {
248 int32_t i = 0;
249 for (i = 0; i < count; i++) {
250 if (searches[i] != NULL) usearch_close(searches[i]);
251 searches[i] = NULL;
252 }
253 }
254
match(UChar ** items,int32_t * item_lengths,uint32_t item_count,UChar * needle,Match * match_results,int32_t * final_positions,int32_t needle_char_len,UCollator * collator,UChar * level1,UChar * level2,UChar * level3)255 static bool match(UChar **items, int32_t *item_lengths, uint32_t item_count, UChar *needle, Match *match_results, int32_t *final_positions, int32_t needle_char_len, UCollator *collator, UChar *level1, UChar *level2, UChar *level3) {
256 Stack stack = {0};
257 int32_t i = 0, maxhl = 0;
258 int32_t r = 0, *positions = NULL;
259 MatchInfo *matches = NULL;
260 bool ok = false;
261 MemoryItem ***memo = NULL;
262 int32_t needle_len = u_strlen(needle);
263 UStringSearch **searches = NULL;
264
265 if (needle_len <= 0 || item_count <= 0) {
266 for (i = 0; i < (int32_t)item_count; i++) match_results[i].score = 0.0;
267 ok = true;
268 goto end;
269 }
270
271 matches = (MatchInfo*)calloc(item_count, sizeof(MatchInfo));
272 positions = (int32_t*)calloc(2*needle_len, sizeof(int32_t)); // One set of positions is the final answer and one set is working space
273 searches = (UStringSearch**) calloc(needle_len, sizeof(UStringSearch*));
274 if (matches == NULL || positions == NULL || searches == NULL) {PyErr_NoMemory(); goto end;}
275
276 for (i = 0; i < (int32_t)item_count; i++) {
277 matches[i].haystack = items[i];
278 matches[i].haystack_len = item_lengths[i];
279 matches[i].needle = needle;
280 matches[i].needle_len = needle_len;
281 matches[i].max_score_per_char = (1.0 / matches[i].haystack_len + 1.0 / needle_len) / 2.0;
282 matches[i].level1 = level1;
283 matches[i].level2 = level2;
284 matches[i].level3 = level3;
285 maxhl = MAX(maxhl, matches[i].haystack_len);
286 }
287
288 if (maxhl <= 0) {
289 for (i = 0; i < (int32_t)item_count; i++) match_results[i].score = 0.0;
290 ok = true;
291 goto end;
292 }
293
294 alloc_stack(&stack, needle_len, maxhl);
295 memo = alloc_memory(needle_len, maxhl);
296 if (stack.items == NULL || memo == NULL) {PyErr_NoMemory(); goto end;}
297
298 for (i = 0; i < (int32_t)item_count; i++) {
299 for (r = 0; r < needle_len; r++) positions[r] = -1;
300 stack_clear(&stack);
301 clear_memory(memo, needle_len, matches[i].haystack_len);
302 free_searches(searches, needle_len);
303 if (!create_searches(searches, matches[i].haystack, matches[i].haystack_len, needle, needle_len, collator)) goto end;
304 matches[i].memo = memo;
305 match_results[i].score = process_item(&matches[i], &stack, positions, searches);
306 convert_positions(positions, final_positions + i * needle_char_len, matches[i].haystack, needle_char_len, needle_len, match_results[i].score);
307 }
308
309 ok = true;
310 end:
311 nullfree(positions);
312 nullfree(stack.items);
313 nullfree(matches);
314 nullfree(memo);
315 if (searches != NULL) { free_searches(searches, needle_len); nullfree(searches); }
316 return ok;
317 }
318
319 // }}}
320
321 // Matcher object definition {{{
322 typedef struct {
323 PyObject_HEAD
324 // Type-specific fields go here.
325 UChar **items;
326 uint32_t item_count;
327 int32_t *item_lengths;
328 UChar *level1;
329 UChar *level2;
330 UChar *level3;
331 UCollator *collator;
332 } Matcher;
333
334 // Matcher.__init__() {{{
335
free_matcher(Matcher * self)336 static void free_matcher(Matcher *self) {
337 uint32_t i = 0;
338 if (self->items != NULL) {
339 for (i = 0; i < self->item_count; i++) { nullfree(self->items[i]); }
340 }
341 nullfree(self->items); nullfree(self->item_lengths);
342 nullfree(self->level1); nullfree(self->level2); nullfree(self->level3);
343 if (self->collator != NULL) { ucol_close(self->collator); self->collator = NULL; }
344 }
345 static void
Matcher_dealloc(Matcher * self)346 Matcher_dealloc(Matcher* self)
347 {
348 free_matcher(self);
349 Py_TYPE(self)->tp_free((PyObject*)self);
350 }
351
352 #define alloc_uchar(x) (x * 3 + 1)
353 static int
Matcher_init(Matcher * self,PyObject * args,PyObject * kwds)354 Matcher_init(Matcher *self, PyObject *args, PyObject *kwds)
355 {
356 PyObject *items = NULL, *p = NULL, *py_items = NULL, *level1 = NULL, *level2 = NULL, *level3 = NULL, *collator = NULL;
357 int32_t i = 0;
358 UErrorCode status = U_ZERO_ERROR;
359 UCollator *col = NULL;
360
361 if (!PyArg_ParseTuple(args, "OOOOO", &items, &collator, &level1, &level2, &level3)) return -1;
362
363 // Clone the passed in collator (cloning is needed as collators are not thread safe)
364 if (!PyCapsule_CheckExact(collator)) { PyErr_SetString(PyExc_TypeError, "Collator must be a capsule"); return -1; }
365 col = (UCollator*)PyCapsule_GetPointer(collator, NULL);
366 if (col == NULL) return -1;
367 self->collator = ucol_safeClone(col, NULL, NULL, &status);
368 col = NULL;
369 if (U_FAILURE(status)) { self->collator = NULL; PyErr_SetString(PyExc_ValueError, u_errorName(status)); return -1; }
370
371 py_items = PySequence_Fast(items, "Must pass in two sequence objects");
372 if (py_items == NULL) goto end;
373 self->item_count = (uint32_t)PySequence_Size(items);
374
375 self->items = (UChar**)calloc(self->item_count, sizeof(UChar*));
376 self->item_lengths = (int32_t*)calloc(self->item_count, sizeof(uint32_t));
377 self->level1 = python_to_icu(level1, NULL);
378 self->level2 = python_to_icu(level2, NULL);
379 self->level3 = python_to_icu(level3, NULL);
380
381 if (self->items == NULL || self->item_lengths == NULL ) { PyErr_NoMemory(); goto end; }
382 if (self->level1 == NULL || self->level2 == NULL || self->level3 == NULL) goto end;
383
384 for (i = 0; i < (int32_t)self->item_count; i++) {
385 p = PySequence_Fast_GET_ITEM(py_items, i);
386 self->items[i] = python_to_icu(p, self->item_lengths + i);
387 if (self->items[i] == NULL) { PyErr_NoMemory(); goto end; }
388 }
389
390 end:
391 Py_XDECREF(py_items);
392 if (PyErr_Occurred()) { free_matcher(self); }
393 return (PyErr_Occurred()) ? -1 : 0;
394 }
395 // Matcher.__init__() }}}
396
397 // Matcher.calculate_scores {{{
398 static PyObject *
Matcher_calculate_scores(Matcher * self,PyObject * args)399 Matcher_calculate_scores(Matcher *self, PyObject *args) {
400 int32_t *final_positions = NULL, *p;
401 Match *matches = NULL;
402 bool ok = false;
403 uint32_t i = 0, needle_char_len = 0, j = 0;
404 PyObject *items = NULL, *score = NULL, *positions = NULL, *pneedle = NULL;
405 UChar *needle = NULL;
406
407 if (!PyArg_ParseTuple(args, "O", &pneedle)) return NULL;
408
409 needle = python_to_icu(pneedle, NULL);
410 if (needle == NULL) return NULL;
411 needle_char_len = u_countChar32(needle, -1);
412 items = PyTuple_New(self->item_count);
413 positions = PyTuple_New(self->item_count);
414 matches = (Match*)calloc(self->item_count, sizeof(Match));
415 final_positions = (int32_t*) calloc(needle_char_len * self->item_count, sizeof(int32_t));
416 if (items == NULL || matches == NULL || final_positions == NULL || positions == NULL) {PyErr_NoMemory(); goto end;}
417
418 for (i = 0; i < self->item_count; i++) {
419 score = PyTuple_New(needle_char_len);
420 if (score == NULL) { PyErr_NoMemory(); goto end; }
421 PyTuple_SET_ITEM(positions, (Py_ssize_t)i, score);
422 }
423
424 Py_BEGIN_ALLOW_THREADS;
425 ok = match(self->items, self->item_lengths, self->item_count, needle, matches, final_positions, needle_char_len, self->collator, self->level1, self->level2, self->level3);
426 Py_END_ALLOW_THREADS;
427
428 if (ok) {
429 for (i = 0; i < self->item_count; i++) {
430 score = PyFloat_FromDouble(matches[i].score);
431 if (score == NULL) { PyErr_NoMemory(); goto end; }
432 PyTuple_SET_ITEM(items, (Py_ssize_t)i, score);
433 p = final_positions + (i * needle_char_len);
434 for (j = 0; j < needle_char_len; j++) {
435 score = PyLong_FromLong((long)p[j]);
436 if (score == NULL) { PyErr_NoMemory(); goto end; }
437 PyTuple_SET_ITEM(PyTuple_GET_ITEM(positions, (Py_ssize_t)i), (Py_ssize_t)j, score);
438 }
439 }
440 } else { PyErr_NoMemory(); goto end; }
441
442 end:
443 nullfree(needle);
444 nullfree(matches);
445 nullfree(final_positions);
446 if (PyErr_Occurred()) { Py_XDECREF(items); items = NULL; Py_XDECREF(positions); positions = NULL; return NULL; }
447 return Py_BuildValue("NN", items, positions);
448 } // }}}
449
450 static PyMethodDef Matcher_methods[] = {
451 {"calculate_scores", (PyCFunction)Matcher_calculate_scores, METH_VARARGS,
452 "calculate_scores(query) -> Return the scores for all items given query as a tuple."
453 },
454
455 {NULL, NULL} /* Sentinel */
456 };
457
458
459 // }}}
460
461 static PyTypeObject MatcherType = { // {{{
462 PyVarObject_HEAD_INIT(NULL, 0)
463 /* tp_name */ "matcher.Matcher",
464 /* tp_basicsiz */ sizeof(Matcher),
465 /* tp_itemsize */ 0,
466 /* tp_dealloc */ (destructor)Matcher_dealloc,
467 /* tp_print */ 0,
468 /* tp_getattr */ 0,
469 /* tp_setattr */ 0,
470 /* tp_as_async */ 0,
471 /* tp_repr */ 0,
472 /* tp_as_number */ 0,
473 /* tp_as_sequence */ 0,
474 /* tp_as_mapping */ 0,
475 /* tp_hash */ 0,
476 /* tp_call */ 0,
477 /* tp_str */ 0,
478 /* tp_getattro */ 0,
479 /* tp_setattro */ 0,
480 /* tp_as_buffer */ 0,
481 /* tp_flags */ Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE,
482 /* tp_doc */ "Matcher",
483 /* tp_traverse */ 0,
484 /* tp_clear */ 0,
485 /* tp_richcompare */ 0,
486 /* tp_weaklistoffset */ 0,
487 /* tp_iter */ 0,
488 /* tp_iternext */ 0,
489 /* tp_methods */ Matcher_methods,
490 /* tp_members */ 0,
491 /* tp_getset */ 0,
492 /* tp_base */ 0,
493 /* tp_dict */ 0,
494 /* tp_descr_get */ 0,
495 /* tp_descr_set */ 0,
496 /* tp_dictoffset */ 0,
497 /* tp_init */ (initproc)Matcher_init,
498 /* tp_alloc */ 0,
499 /* tp_new */ PyType_GenericNew,
500 }; // }}}
501
502 static int
exec_module(PyObject * mod)503 exec_module(PyObject *mod) {
504 if (PyType_Ready(&MatcherType) < 0) return -1;
505 Py_INCREF(&MatcherType);
506 if(PyModule_AddObject(mod, "Matcher", (PyObject *)&MatcherType) < 0) {
507 Py_DECREF(&MatcherType);
508 return -1;
509 }
510 return 0;
511 }
512
513 static PyModuleDef_Slot slots[] = { {Py_mod_exec, exec_module}, {0, NULL} };
514
515 static struct PyModuleDef module_def = {
516 .m_base = PyModuleDef_HEAD_INIT,
517 .m_name = "matcher",
518 .m_doc = "Find subsequence matches.",
519 .m_slots = slots,
520 };
521
PyInit_matcher(void)522 CALIBRE_MODINIT_FUNC PyInit_matcher(void) { return PyModuleDef_Init(&module_def); }
523