1 /*
2 * Copyright (C) 2008,2009 Aliaksey Kandratsenka
3 *
4 * This program is free software: you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation; either version 3 of the
7 * License, or (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see
16 * `http://www.gnu.org/licenses/'.
17 */
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <string.h>
21 #include <ctype.h>
22 #include <assert.h>
23 #include <inttypes.h>
24
25 #include "scorer.h"
26 #include "compiler.h"
27
28 int scorer_utf8_mode = 1;
29
30 #define PROPER_WORD_START 0x100000
31 #define WILD_WORD_START 0x201
32 #define ADJACENT 0x400
33
34 #define max(a,b) ({__typeof__ (a) ____a = (a); __typeof__ (b) ____b = (b); ____b > ____a ? ____b : ____a; })
35
36 #if 0
37 #define dprintf(...) printf(__VA_ARGS__)
38 #else
39 __attribute__((format (printf, 1, 2))) __attribute__((unused))
40 static inline
____empty_printf(char * format,...)41 void ____empty_printf(char *format, ...)
42 {
43 }
44 #define dprintf(...) ____empty_printf(__VA_ARGS__)
45 #endif
46
47 static
delimiter_p_full(char ch)48 int delimiter_p_full(char ch)
49 {
50 return (ch == '.' || ch == '_' || ch == '/' || ch == '*' || ch == ' ' || ch == '\'' || ch == '"');
51 }
52
53 static
54 char delimiter_table[256];
55
56 static
57 char tolower_table[256];
58
prepare_scorer(void)59 void prepare_scorer(void)
60 {
61 int i;
62 for (i = 0; i < 256; i++) {
63 char delimiter = delimiter_p_full(i);
64 delimiter_table[i] = delimiter;
65 tolower_table[i] = tolower(i);
66 if (i == '-')
67 tolower_table[i] = '_';
68 }
69 }
70
71 static inline
delimiter_p(char ch)72 int delimiter_p(char ch)
73 {
74 return delimiter_table[(unsigned char)ch];
75 }
76
77 static inline
normalize_char(char ch,unsigned * is_delimiter)78 char normalize_char(char ch, unsigned *is_delimiter)
79 {
80 ch = tolower_table[(unsigned char)ch];
81 if (is_delimiter)
82 *is_delimiter = delimiter_p(ch);
83 return ch;
84 }
85
86 #define MAX_PAT_LENGTH 32
87
88 static
initialize_prepared_pattern(const char * pattern,struct prepared_pattern * rv)89 void initialize_prepared_pattern(const char *pattern,
90 struct prepared_pattern *rv)
91 {
92 unsigned previous_delimiter = 1;
93 int i;
94
95 for (i=0; i<rv->pat_length; i++) {
96 char ch = pattern[i];
97 rv->start_of_pattern_word[i] = previous_delimiter || ('A' <= ch && ch <= 'Z');
98 rv->translated_pattern[i] = normalize_char(ch, &previous_delimiter);
99 }
100
101 char fc_count = 0;
102 for (i = 0; i < rv->pat_length && fc_count < 4; i++) {
103 char ch = rv->translated_pattern[i];
104 if (!delimiter_p(ch)) {
105 rv->first_chars[fc_count*2] = ch;
106 rv->first_chars[fc_count*2+1] = toupper(ch);
107 fc_count++;
108 }
109 }
110 rv->fc_count = fc_count;
111 }
112
prepare_pattern(const struct scorer_query * query)113 struct prepared_pattern *prepare_pattern(const struct scorer_query *query)
114 {
115 const char *pattern = query->pattern;
116 unsigned pat_length = strlen(pattern);
117
118 if (pat_length > MAX_PAT_LENGTH)
119 pat_length = MAX_PAT_LENGTH;
120
121 char *start_of_pattern_word = pat_length ? malloc(pat_length) : 0;
122 char *translated_pattern = pat_length ? malloc(pat_length) : 0;
123
124 struct prepared_pattern *rv = malloc(sizeof(struct prepared_pattern));
125 rv->translated_pattern = translated_pattern;
126 rv->start_of_pattern_word = start_of_pattern_word;
127 rv->pat_length = pat_length;
128
129 if (pat_length)
130 initialize_prepared_pattern(pattern, rv);
131 return rv;
132 }
133
free_prepared_pattern(struct prepared_pattern * p)134 void free_prepared_pattern(struct prepared_pattern *p)
135 {
136 if (!p)
137 return;
138 free(p->start_of_pattern_word);
139 free(p->translated_pattern);
140 free(p);
141 }
142
143 static inline __attribute__((always_inline))
score_string_prepared_inline(const unsigned pat_length,const char * string,const struct scorer_query * query,const struct prepared_pattern * prepared_pattern,const unsigned string_length,unsigned * match)144 int score_string_prepared_inline(const unsigned pat_length,
145 const char *string,
146 const struct scorer_query *query,
147 const struct prepared_pattern *prepared_pattern,
148 const unsigned string_length,
149 unsigned* match)
150 {
151 struct position {
152 int this_score;
153 int score;
154 int amount;
155 };
156
157 struct position state[string_length][__BUILTIN_CONSTANT(pat_length) ? pat_length : MAX_PAT_LENGTH];
158 char *start_of_pattern_word = prepared_pattern->start_of_pattern_word;
159 char *translated_pattern = prepared_pattern->translated_pattern;
160 unsigned i;
161 unsigned k;
162 unsigned previous_delimiter;
163 int score;
164
165 if (pat_length == 0)
166 return 0;
167
168 if (string_length == 0)
169 return -1;
170
171 dprintf("scoring string '%.*s' for pattern '%s'\n", string_length, string, query->pattern);
172
173 #if 1
174 if (prepared_pattern->fc_count > 0) {
175 const char *p = string;
176 char ch;
177
178 #define I(i) \
179 do { \
180 ch = prepared_pattern->first_chars[i*2]; \
181 if (ch) { \
182 const char *next = memchr(p, ch, string + string_length - p); \
183 const char *next2 = memchr(p, prepared_pattern->first_chars[i*2+1], string + string_length - p); \
184 if (!next && !next2) \
185 return -1; \
186 if (!next) \
187 next = next2; \
188 else if (!next2) \
189 next2 = next; \
190 p = ((next < next2) ? next : next2) + 1; \
191 } \
192 } while (0)
193
194 I(0);
195
196 if (prepared_pattern->fc_count > 1) {
197 I(1);
198 if (prepared_pattern->fc_count > 2) {
199 I(2);
200 if (prepared_pattern->fc_count > 3)
201 I(3);
202 }
203 }
204
205 #undef I
206 }
207 #endif
208
209 memset(state, -1, sizeof(state));
210
211 previous_delimiter = 1;
212
213 for (i=0; i<string_length; i++) {
214 char ch = string[i];
215 unsigned max_k;
216 unsigned at_word_start = previous_delimiter || ('A' <= ch && ch <= 'Z');
217 ch = normalize_char(ch, &previous_delimiter);
218 if (ch == translated_pattern[0]) {
219 int amount = at_word_start ? PROPER_WORD_START : 0;
220 if (i > 0 && state[i-1][0].score > amount)
221 state[i][0].score = state[i-1][0].score;
222 else
223 state[i][0].score = amount;
224 state[i][0].this_score = amount;
225 state[i][0].amount = amount;
226 } else {
227 state[i][0].this_score = -1;
228 state[i][0].amount = 0;
229 state[i][0].score = i > 0 ? state[i-1][0].score : -1;
230 }
231 if (!__BUILTIN_CONSTANT(pat_length) || pat_length > 1) {
232 max_k = i+1;
233 max_k = (max_k > pat_length) ? pat_length : max_k;
234 for (k=1; k<max_k; k++) {
235 char pat_ch = translated_pattern[k];
236 int prev_score;
237 char prev_pattern;
238 int amount;
239 int this_score;
240 prev_score = state[i-1][k-1].score;
241 if (pat_ch == '_') { // always match '_'
242 state[i][k].score = prev_score;
243 state[i][k].amount = 0;
244 state[i][k].this_score = prev_score;
245 continue;
246 }
247 if (ch != pat_ch) {
248 cont:
249 state[i][k].score = state[i-1][k].score;
250 state[i][k].amount = 0;
251 state[i][k].this_score = -1;
252 continue;
253 }
254 if (prev_score < 0)
255 goto cont;
256 amount = 0;
257 prev_pattern = translated_pattern[k-1];
258 if (at_word_start)
259 amount = start_of_pattern_word[k] ? PROPER_WORD_START : WILD_WORD_START;
260 // '_' demands proper word start after itself, and nothing less
261 if (prev_pattern == '_')
262 if (amount != PROPER_WORD_START)
263 goto cont;
264 this_score = prev_score + amount;
265
266 // if current pattern byte is
267 // continuation of utf8 sequence, then
268 // it must match adjacently.
269 if (__EXPECT(scorer_utf8_mode, 1) && __EXPECT(utf8_continuation_p(pat_ch), 0)) {
270 if (state[i-1][k-1].this_score < 0)
271 goto cont;
272 this_score = 0; // force win
273 // of next candidate
274 }
275 if (state[i-1][k-1].this_score >= 0 && !delimiter_p(pat_ch)) {
276 int candidate = state[i-1][k-1].this_score + ADJACENT;
277 if (candidate > this_score)
278 this_score = candidate, amount = ADJACENT;
279 }
280 state[i][k].this_score = this_score;
281 state[i][k].amount = amount;
282 state[i][k].score = max(state[i-1][k].score, this_score);
283 }
284 }
285 }
286
287 score = state[string_length-1][pat_length-1].score;
288
289 dprintf("score: %d\n", score);
290
291 if (!match || score < 0)
292 return score;
293
294 {
295 int t = score;
296 k = string_length;
297 for (i=pat_length-1; i<(unsigned)-1; i--) {
298 if (query->right_match) {
299 for (k=k-1; k>=i; k--)
300 if (state[k][i].this_score == t)
301 break;
302 assert(k>=i);
303 } else {
304 int last_k = k;
305 for (k=0; k<last_k; k++)
306 if (state[k][i].this_score == t)
307 break;
308 assert(k<string_length);
309 }
310 match_again:
311 match[i] = k;
312 if (translated_pattern[i] == '_')
313 match[i] = SCORER_MATCH_NONE;
314 if (__EXPECT(scorer_utf8_mode, 1)
315 && __EXPECT(utf8_continuation_p(translated_pattern[i]), 0)) {
316 match[i] = SCORER_MATCH_NONE;
317 }
318 t -= state[k][i].amount;
319 if (state[k][i].amount == ADJACENT) {
320 assert(i>0);
321 k--;
322 i--;
323 goto match_again;
324 }
325 }
326 assert(t == 0);
327 }
328
329 return score;
330 }
331
332
333 /* partial compilation for pat_length == 1 */
334 static __attribute__((noinline))
score_string_prepared_1(const char * string,const struct scorer_query * query,const struct prepared_pattern * prepared_pattern,const unsigned string_length,unsigned * match)335 int score_string_prepared_1(const char *string,
336 const struct scorer_query *query,
337 const struct prepared_pattern *prepared_pattern,
338 const unsigned string_length,
339 unsigned* match)
340 {
341 return score_string_prepared_inline(1, string, query, prepared_pattern, string_length, match);
342 }
343
344 /* partial compilation for pat_length == 2 */
345 static __attribute__((noinline))
score_string_prepared_2(const char * string,const struct scorer_query * query,const struct prepared_pattern * prepared_pattern,const unsigned string_length,unsigned * match)346 int score_string_prepared_2(const char *string,
347 const struct scorer_query *query,
348 const struct prepared_pattern *prepared_pattern,
349 const unsigned string_length,
350 unsigned* match)
351 {
352 return score_string_prepared_inline(2, string, query, prepared_pattern, string_length, match);
353 }
354
score_string_prepared(const char * string,const struct scorer_query * query,const struct prepared_pattern * prepared_pattern,const unsigned string_length,unsigned * match)355 int score_string_prepared(const char *string,
356 const struct scorer_query *query,
357 const struct prepared_pattern *prepared_pattern,
358 const unsigned string_length,
359 unsigned* match)
360 {
361 if (!prepared_pattern)
362 return -1;
363
364 if (prepared_pattern->pat_length == 1) {
365 return score_string_prepared_1(string, query, prepared_pattern, string_length, match);
366 }
367
368 if (prepared_pattern->pat_length == 2) {
369 return score_string_prepared_2(string, query, prepared_pattern, string_length, match);
370 }
371
372 return score_string_prepared_inline(prepared_pattern->pat_length, string, query, prepared_pattern, string_length, match);
373 }
374
score_string(const char * string,const struct scorer_query * query,const unsigned string_length,unsigned * match)375 int score_string(const char *string,
376 const struct scorer_query *query,
377 const unsigned string_length,
378 unsigned* match)
379 {
380 const char *pattern = query->pattern;
381 unsigned pat_length = strlen(pattern);
382 char start_of_pattern_word[pat_length];
383 char translated_pattern[pat_length];
384
385 if (pat_length == 0)
386 return 0;
387
388 if (string_length == 0)
389 return -1;
390
391 if (pat_length > MAX_PAT_LENGTH)
392 pat_length = MAX_PAT_LENGTH;
393
394
395 struct prepared_pattern p = {
396 .translated_pattern = translated_pattern,
397 .start_of_pattern_word = start_of_pattern_word,
398 .pat_length = pat_length
399 };
400
401 if (pat_length)
402 initialize_prepared_pattern(pattern, &p);
403
404 return score_string_prepared(string, query, &p, string_length, match);
405 }
406
score_simple_string(const char * string,const char * pattern,unsigned * match)407 int score_simple_string(const char *string, const char *pattern, unsigned *match)
408 {
409 struct scorer_query qry = {.pattern = pattern,.right_match = 0};
410 unsigned string_length = strlen(string);
411 return score_string(string, &qry, string_length, match);
412 }
413