1 /*
2  * Copyright (C) 2008,2009 Aliaksey Kandratsenka
3  *
4  * This program is free software: you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License as
6  * published by the Free Software Foundation; either version 3 of the
7  * License, or (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12  * General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program.  If not, see
16  * `http://www.gnu.org/licenses/'.
17  */
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <string.h>
21 #include <ctype.h>
22 #include <assert.h>
23 #include <inttypes.h>
24 
25 #include "scorer.h"
26 #include "compiler.h"
27 
28 int scorer_utf8_mode = 1;
29 
30 #define PROPER_WORD_START 0x100000
31 #define WILD_WORD_START 0x201
32 #define ADJACENT 0x400
33 
34 #define max(a,b) ({__typeof__ (a) ____a = (a); __typeof__ (b) ____b = (b); ____b > ____a ? ____b : ____a; })
35 
36 #if 0
37 #define dprintf(...) printf(__VA_ARGS__)
38 #else
39 __attribute__((format (printf, 1, 2))) __attribute__((unused))
40 static inline
____empty_printf(char * format,...)41 void ____empty_printf(char *format, ...)
42 {
43 }
44 #define dprintf(...) ____empty_printf(__VA_ARGS__)
45 #endif
46 
47 static
delimiter_p_full(char ch)48 int delimiter_p_full(char ch)
49 {
50 	return (ch == '.' || ch == '_' || ch == '/' || ch == '*' || ch == ' ' || ch == '\'' || ch == '"');
51 }
52 
53 static
54 char delimiter_table[256];
55 
56 static
57 char tolower_table[256];
58 
prepare_scorer(void)59 void prepare_scorer(void)
60 {
61 	int i;
62 	for (i = 0; i < 256; i++) {
63 		char delimiter = delimiter_p_full(i);
64 		delimiter_table[i] = delimiter;
65 		tolower_table[i] = tolower(i);
66 		if (i == '-')
67 			tolower_table[i] = '_';
68 	}
69 }
70 
71 static inline
delimiter_p(char ch)72 int delimiter_p(char ch)
73 {
74 	return delimiter_table[(unsigned char)ch];
75 }
76 
77 static inline
normalize_char(char ch,unsigned * is_delimiter)78 char normalize_char(char ch, unsigned *is_delimiter)
79 {
80 	ch = tolower_table[(unsigned char)ch];
81 	if (is_delimiter)
82 		*is_delimiter = delimiter_p(ch);
83 	return ch;
84 }
85 
86 #define MAX_PAT_LENGTH 32
87 
88 static
initialize_prepared_pattern(const char * pattern,struct prepared_pattern * rv)89 void initialize_prepared_pattern(const char *pattern,
90 				 struct prepared_pattern *rv)
91 {
92 	unsigned previous_delimiter = 1;
93 	int i;
94 
95 	for (i=0; i<rv->pat_length; i++) {
96 		char ch = pattern[i];
97 		rv->start_of_pattern_word[i] = previous_delimiter || ('A' <= ch && ch <= 'Z');
98 		rv->translated_pattern[i] = normalize_char(ch, &previous_delimiter);
99 	}
100 
101 	char fc_count = 0;
102 	for (i = 0; i < rv->pat_length && fc_count < 4; i++) {
103 		char ch = rv->translated_pattern[i];
104 		if (!delimiter_p(ch)) {
105 			rv->first_chars[fc_count*2] = ch;
106 			rv->first_chars[fc_count*2+1] = toupper(ch);
107 			fc_count++;
108 		}
109 	}
110 	rv->fc_count = fc_count;
111 }
112 
prepare_pattern(const struct scorer_query * query)113 struct prepared_pattern *prepare_pattern(const struct scorer_query *query)
114 {
115 	const char *pattern = query->pattern;
116 	unsigned pat_length = strlen(pattern);
117 
118 	if (pat_length > MAX_PAT_LENGTH)
119 		pat_length = MAX_PAT_LENGTH;
120 
121 	char *start_of_pattern_word = pat_length ? malloc(pat_length) : 0;
122 	char *translated_pattern = pat_length ? malloc(pat_length) : 0;
123 
124 	struct prepared_pattern *rv = malloc(sizeof(struct prepared_pattern));
125 	rv->translated_pattern = translated_pattern;
126 	rv->start_of_pattern_word = start_of_pattern_word;
127 	rv->pat_length = pat_length;
128 
129 	if (pat_length)
130 		initialize_prepared_pattern(pattern, rv);
131 	return rv;
132 }
133 
free_prepared_pattern(struct prepared_pattern * p)134 void free_prepared_pattern(struct prepared_pattern *p)
135 {
136 	if (!p)
137 		return;
138 	free(p->start_of_pattern_word);
139 	free(p->translated_pattern);
140 	free(p);
141 }
142 
143 static inline __attribute__((always_inline))
score_string_prepared_inline(const unsigned pat_length,const char * string,const struct scorer_query * query,const struct prepared_pattern * prepared_pattern,const unsigned string_length,unsigned * match)144 int score_string_prepared_inline(const unsigned pat_length,
145 				 const char *string,
146 				 const struct scorer_query *query,
147 				 const struct prepared_pattern *prepared_pattern,
148 				 const unsigned string_length,
149 				 unsigned* match)
150 {
151 	struct position {
152 		int this_score;
153 		int score;
154 		int amount;
155 	};
156 
157 	struct position state[string_length][__BUILTIN_CONSTANT(pat_length) ? pat_length : MAX_PAT_LENGTH];
158 	char *start_of_pattern_word = prepared_pattern->start_of_pattern_word;
159 	char *translated_pattern = prepared_pattern->translated_pattern;
160 	unsigned i;
161 	unsigned k;
162 	unsigned previous_delimiter;
163 	int score;
164 
165 	if (pat_length == 0)
166 		return 0;
167 
168 	if (string_length == 0)
169 		return -1;
170 
171 	dprintf("scoring string '%.*s' for pattern '%s'\n", string_length, string, query->pattern);
172 
173 #if 1
174 	if (prepared_pattern->fc_count > 0) {
175 		const char *p = string;
176 		char ch;
177 
178 #define I(i) \
179 		do { \
180 			ch = prepared_pattern->first_chars[i*2];	\
181 			if (ch) {					\
182 				const char *next = memchr(p, ch, string + string_length - p); \
183 				const char *next2 = memchr(p, prepared_pattern->first_chars[i*2+1], string + string_length - p); \
184 				if (!next && !next2)			\
185 					return -1;			\
186 				if (!next)				\
187 					next = next2;			\
188 				else if (!next2)			\
189 				next2 = next;				\
190 				p = ((next < next2) ? next : next2) + 1; \
191 			} \
192 		} while (0)
193 
194 		I(0);
195 
196 		if (prepared_pattern->fc_count > 1) {
197 			I(1);
198 			if (prepared_pattern->fc_count > 2) {
199 				I(2);
200 				if (prepared_pattern->fc_count > 3)
201 					I(3);
202 			}
203 		}
204 
205 #undef I
206 	}
207 #endif
208 
209 	memset(state, -1, sizeof(state));
210 
211 	previous_delimiter = 1;
212 
213 	for (i=0; i<string_length; i++) {
214 		char ch = string[i];
215 		unsigned max_k;
216 		unsigned at_word_start = previous_delimiter || ('A' <= ch && ch <= 'Z');
217 		ch = normalize_char(ch, &previous_delimiter);
218 		if (ch == translated_pattern[0]) {
219 			int amount = at_word_start ? PROPER_WORD_START : 0;
220 			if (i > 0 && state[i-1][0].score > amount)
221 				state[i][0].score = state[i-1][0].score;
222 			else
223 				state[i][0].score = amount;
224 			state[i][0].this_score = amount;
225 			state[i][0].amount = amount;
226 		} else {
227 			state[i][0].this_score = -1;
228 			state[i][0].amount = 0;
229 			state[i][0].score = i > 0 ? state[i-1][0].score : -1;
230 		}
231 		if (!__BUILTIN_CONSTANT(pat_length) || pat_length > 1) {
232 			max_k = i+1;
233 			max_k = (max_k > pat_length) ? pat_length : max_k;
234 			for (k=1; k<max_k; k++) {
235 				char pat_ch = translated_pattern[k];
236 				int prev_score;
237 				char prev_pattern;
238 				int amount;
239 				int this_score;
240 				prev_score = state[i-1][k-1].score;
241 				if (pat_ch == '_') { // always match '_'
242 					state[i][k].score = prev_score;
243 					state[i][k].amount = 0;
244 					state[i][k].this_score = prev_score;
245 					continue;
246 				}
247 				if (ch != pat_ch) {
248 				cont:
249 					state[i][k].score = state[i-1][k].score;
250 					state[i][k].amount = 0;
251 					state[i][k].this_score = -1;
252 					continue;
253 				}
254 				if (prev_score < 0)
255 					goto cont;
256 				amount = 0;
257 				prev_pattern = translated_pattern[k-1];
258 				if (at_word_start)
259 					amount = start_of_pattern_word[k] ? PROPER_WORD_START : WILD_WORD_START;
260 				// '_' demands proper word start after itself, and nothing less
261 				if (prev_pattern == '_')
262 					if (amount != PROPER_WORD_START)
263 						goto cont;
264 				this_score = prev_score + amount;
265 
266 				// if current pattern byte is
267 				// continuation of utf8 sequence, then
268 				// it must match adjacently.
269 				if (__EXPECT(scorer_utf8_mode, 1) && __EXPECT(utf8_continuation_p(pat_ch), 0)) {
270 					if (state[i-1][k-1].this_score < 0)
271 						goto cont;
272 					this_score = 0; // force win
273 							// of next candidate
274 				}
275 				if (state[i-1][k-1].this_score >= 0 && !delimiter_p(pat_ch)) {
276 					int candidate = state[i-1][k-1].this_score + ADJACENT;
277 					if (candidate > this_score)
278 						this_score = candidate, amount = ADJACENT;
279 				}
280 				state[i][k].this_score = this_score;
281 				state[i][k].amount = amount;
282 				state[i][k].score = max(state[i-1][k].score, this_score);
283 			}
284 		}
285 	}
286 
287 	score = state[string_length-1][pat_length-1].score;
288 
289 	dprintf("score: %d\n", score);
290 
291 	if (!match || score < 0)
292 		return score;
293 
294 	{
295 		int t = score;
296 		k = string_length;
297 		for (i=pat_length-1; i<(unsigned)-1; i--) {
298 			if (query->right_match) {
299 				for (k=k-1; k>=i; k--)
300 					if (state[k][i].this_score == t)
301 						break;
302 				assert(k>=i);
303 			} else {
304 				int last_k = k;
305 				for (k=0; k<last_k; k++)
306 					if (state[k][i].this_score == t)
307 						break;
308 				assert(k<string_length);
309 			}
310 		match_again:
311 			match[i] = k;
312 			if (translated_pattern[i] == '_')
313 				match[i] = SCORER_MATCH_NONE;
314 			if (__EXPECT(scorer_utf8_mode, 1)
315 			    && __EXPECT(utf8_continuation_p(translated_pattern[i]), 0)) {
316 				match[i] = SCORER_MATCH_NONE;
317 			}
318 			t -= state[k][i].amount;
319 			if (state[k][i].amount == ADJACENT) {
320 				assert(i>0);
321 				k--;
322 				i--;
323 				goto match_again;
324 			}
325 		}
326 		assert(t == 0);
327 	}
328 
329 	return score;
330 }
331 
332 
333 /* partial compilation for pat_length == 1 */
334 static __attribute__((noinline))
score_string_prepared_1(const char * string,const struct scorer_query * query,const struct prepared_pattern * prepared_pattern,const unsigned string_length,unsigned * match)335 int score_string_prepared_1(const char *string,
336 			    const struct scorer_query *query,
337 			    const struct prepared_pattern *prepared_pattern,
338 			    const unsigned string_length,
339 			    unsigned* match)
340 {
341 	return score_string_prepared_inline(1, string, query, prepared_pattern, string_length, match);
342 }
343 
344 /* partial compilation for pat_length == 2 */
345 static __attribute__((noinline))
score_string_prepared_2(const char * string,const struct scorer_query * query,const struct prepared_pattern * prepared_pattern,const unsigned string_length,unsigned * match)346 int score_string_prepared_2(const char *string,
347 			    const struct scorer_query *query,
348 			    const struct prepared_pattern *prepared_pattern,
349 			    const unsigned string_length,
350 			    unsigned* match)
351 {
352 	return score_string_prepared_inline(2, string, query, prepared_pattern, string_length, match);
353 }
354 
score_string_prepared(const char * string,const struct scorer_query * query,const struct prepared_pattern * prepared_pattern,const unsigned string_length,unsigned * match)355 int score_string_prepared(const char *string,
356 			  const struct scorer_query *query,
357 			  const struct prepared_pattern *prepared_pattern,
358 			  const unsigned string_length,
359 			  unsigned* match)
360 {
361 	if (!prepared_pattern)
362 		return -1;
363 
364 	if (prepared_pattern->pat_length == 1) {
365 		return score_string_prepared_1(string, query, prepared_pattern, string_length, match);
366 	}
367 
368 	if (prepared_pattern->pat_length == 2) {
369 		return score_string_prepared_2(string, query, prepared_pattern, string_length, match);
370 	}
371 
372 	return score_string_prepared_inline(prepared_pattern->pat_length, string, query, prepared_pattern, string_length, match);
373 }
374 
score_string(const char * string,const struct scorer_query * query,const unsigned string_length,unsigned * match)375 int score_string(const char *string,
376 		 const struct scorer_query *query,
377 		 const unsigned string_length,
378 		 unsigned* match)
379 {
380 	const char *pattern = query->pattern;
381 	unsigned pat_length = strlen(pattern);
382 	char start_of_pattern_word[pat_length];
383 	char translated_pattern[pat_length];
384 
385 	if (pat_length == 0)
386 		return 0;
387 
388 	if (string_length == 0)
389 		return -1;
390 
391 	if (pat_length > MAX_PAT_LENGTH)
392 		pat_length = MAX_PAT_LENGTH;
393 
394 
395 	struct prepared_pattern p = {
396 		.translated_pattern = translated_pattern,
397 		.start_of_pattern_word = start_of_pattern_word,
398 		.pat_length = pat_length
399 	};
400 
401 	if (pat_length)
402 		initialize_prepared_pattern(pattern, &p);
403 
404 	return score_string_prepared(string, query, &p, string_length, match);
405 }
406 
score_simple_string(const char * string,const char * pattern,unsigned * match)407 int score_simple_string(const char *string, const char *pattern, unsigned *match)
408 {
409 	struct scorer_query qry = {.pattern = pattern,.right_match = 0};
410 	unsigned string_length = strlen(string);
411 	return score_string(string, &qry, string_length, match);
412 }
413