1 /*
2
3 Copyright (C) by Jarkko Hietaniemi, 1998,1999,2000,2001,2002,2003,2006.
4 All Rights Reserved.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of either:
8
9 a) the GNU Library General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any
11 later version, or
12
13 b) the "Artistic License" which comes with Perl source code.
14
15 Other free software licensing schemes are negotiable.
16
17 Furthermore:
18
19 (1) This software is provided as-is, without warranties or
20 obligations of any kind.
21
22 (2) You shall include this copyright notice intact in all copies
23 and derived materials.
24
25 */
26
27 /*
28
29 $Id: apse.c,v 1.1 1999/06/23 16:09:13 jhi Exp jhi $
30
31 */
32
33 #include "apse.h"
34
35 #include <stdio.h>
36 #include <string.h>
37 #include <stdlib.h>
38 #include <ctype.h>
39 #include <assert.h>
40
41 #define APSE_BITS_IN_BITVEC (8*sizeof(apse_vec_t))
42
43 #define APSE_CHAR_MAX 256
44
45 #ifdef APSE_DEBUGGING
46 #define APSE_DEBUG(x) x
47 #else
48 #define APSE_DEBUG(x)
49 #endif
50
51 #define APSE_BIT(i) ((apse_vec_t)1 << ((i)%APSE_BITS_IN_BITVEC))
52 #define APSE_IDX(p, q, i) ((p)*(q)+((i)/APSE_BITS_IN_BITVEC))
53 #define APSE_BIT_SET(bv, p, q, i) ((bv[APSE_IDX(p, q, i)] |= APSE_BIT(i)))
54 #define APSE_BIT_CLR(bv, p, q, i) ((bv[APSE_IDX(p, q, i)] &= ~APSE_BIT(i)))
55 #define APSE_BIT_TST(bv, p, q, i) ((bv[APSE_IDX(p, q, i)] & APSE_BIT(i)))
56
57 #define APSE_MATCH_STATE_BOT 0
58 #define APSE_MATCH_STATE_SEARCH 1
59 #define APSE_MATCH_STATE_BEGIN 2
60 #define APSE_MATCH_STATE_FAIL 3
61 #define APSE_MATCH_STATE_GREEDY 4
62 #define APSE_MATCH_STATE_END 5
63 #define APSE_MATCH_STATE_EOT 6
64
65 #define APSE_TEST_HIGH_BIT(i) \
66 (((i) & ((apse_vec_t)1 << (APSE_BITS_IN_BITVEC - 1))) ? 1 : 0)
67
68 /* In case you are reading the TR 91-11 of University of Arizona, page 6:
69 * j+1 state
70 * j prev_state
71 * d i
72 * d-1 prev_i
73 */
74
75 #define APSE_NEXT_EXACT(state, prev_state, text, i, carry) \
76 (state[i] = ((prev_state[i] << 1 | carry) & text))
77
78 #define APSE_NEXT_APPROX(state, prev_state, text, i, prev_i, carry) \
79 (state[i] = (((prev_state[i] << 1) & text) | \
80 prev_state[prev_i] | \
81 ((state[prev_i] | prev_state[prev_i]) << 1) | \
82 carry))
83
84 #define APSE_NEXT_COMMON(state, prev_state, text, i) \
85 (state[i] = (prev_state[i] << 1) & text)
86
87 #define APSE_NEXT_INSERT(state, prev_state, i, prev_i) \
88 (state[i] |= prev_state[prev_i])
89
90 #define APSE_NEXT_DELETE(state, i, prev_i) \
91 (state[i] |= (state[prev_i] << 1))
92
93 #define APSE_NEXT_SUBSTI(state, prev_state, i, prev_i) \
94 (state[i] |= (prev_state[prev_i] << 1))
95
96 #define APSE_NEXT_CARRY(state, i, carry) \
97 (state[i] |= carry)
98
99 #define APSE_EXACT_MATCH_BEGIN(ap) (ap->state[0] & 1)
100
101 #define APSE_APPROX_MATCH_BEGIN(ap) \
102 (ap->state[ap->largest_distance + ap->match_begin_bitvector] > \
103 ap->match_begin_prefix && \
104 ap->state[ap->largest_distance + ap->match_begin_bitvector] & \
105 ap->match_begin_prefix)
106
107 #define APSE_PREFIX_DELETE_MASK(ap) \
108 do { if (ap->edit_deletions < ap->edit_distance && \
109 ap->text_position < ap->edit_distance) \
110 ap->state[h] &= ap->match_begin_bitmask; } while (0)
111
112 #define APSE_DEBUG_SINGLE(ap, i) \
113 APSE_DEBUG(printf("%c %2ld %2ld %s\n", \
114 isprint(ap->text[ap->text_position])? \
115 ap->text[ap->text_position]:'.', \
116 ap->text_position, i, \
117 _apse_fbin(ap->state[i], \
118 ap->pattern_size, 1)))
119
120
121 #define APSE_DEBUG_MULTIPLE_FIRST(ap, i) \
122 APSE_DEBUG(printf("%c %2ld %2ld", \
123 isprint(ap->text[ap->text_position])? \
124 ap->text[ap->text_position]:'.', \
125 ap->text_position, i))
126
127 #define APSE_DEBUG_MULTIPLE_REST(ap, i, j) \
128 APSE_DEBUG(printf(" %s", \
129 _apse_fbin(ap->state[j], \
130 ap->pattern_size, \
131 i == ap->bitvectors_in_state-1)))
132
133
134 #ifdef APSE_DEBUGGING
135 static char *_apse_fbin(apse_vec_t v, apse_size_t n, apse_bool_t last);
136
_apse_fbin(apse_vec_t v,apse_size_t n,apse_bool_t last)137 static char *_apse_fbin(apse_vec_t v, apse_size_t n, apse_bool_t last) {
138 static char s[APSE_BITS_IN_BITVEC + 1] = { 0 }; /* non-reentrant */
139
140 if (v) {
141 static const char *b =
142 "0000100001001100001010100110111000011001010111010011101101111111";
143 apse_size_t i;
144
145 for (i = 0; i < APSE_BITS_IN_BITVEC && i < n && v; i += 4) {
146 (void)memcpy(s + i, b + ((v & 0x0f) << 2), (size_t)4);
147 v >>= 4;
148 }
149
150 if (i < APSE_BITS_IN_BITVEC)
151 memset(s + i, '0', APSE_BITS_IN_BITVEC - i);
152 } else
153 memset(s, '0', APSE_BITS_IN_BITVEC);
154
155 if (last)
156 s[n % APSE_BITS_IN_BITVEC] = 0;
157
158 return s;
159 }
160 #endif
161
162 /* The code begins. */
163
apse_set_pattern(apse_t * ap,unsigned char * pattern,apse_size_t pattern_size)164 apse_bool_t apse_set_pattern(apse_t* ap,
165 unsigned char* pattern,
166 apse_size_t pattern_size) {
167 apse_size_t i;
168
169 if (ap->case_mask)
170 free(ap->case_mask);
171 if (ap->fold_mask)
172 free(ap->fold_mask);
173
174 ap->pattern_mask = 0;
175 ap->fold_mask = 0;
176 ap->case_mask = 0;
177
178 ap->is_greedy = 0;
179
180 ap->prev_equal = 0;
181 ap->prev_active = 0;
182
183 ap->pattern_size = pattern_size;
184 ap->bitvectors_in_state = (pattern_size - 1)/APSE_BITS_IN_BITVEC + 1;
185
186 if (ap->edit_distance)
187 ap->largest_distance = ap->edit_distance * ap->bitvectors_in_state;
188 else
189 ap->largest_distance = 0;
190
191 ap->bytes_in_state = ap->bitvectors_in_state * sizeof(apse_vec_t);
192
193 ap->case_mask = calloc((apse_size_t)APSE_CHAR_MAX, ap->bytes_in_state);
194 if (ap->case_mask == 0)
195 goto out;
196
197 for (i = 0; i < pattern_size; i++)
198 APSE_BIT_SET(ap->case_mask,
199 (unsigned)pattern[i],
200 ap->bitvectors_in_state, i);
201
202 ap->pattern_mask = ap->case_mask;
203
204 ap->match_end_bitmask =
205 (apse_vec_t)1 << ((pattern_size - 1) % APSE_BITS_IN_BITVEC);
206
207 out:
208 if (ap && ap->case_mask)
209 return 1;
210 else {
211 if (ap->case_mask)
212 free(ap->case_mask);
213 if (ap)
214 free(ap);
215 return 0;
216 }
217 }
218
apse_set_greedy(apse_t * ap,apse_bool_t greedy)219 void apse_set_greedy(apse_t *ap, apse_bool_t greedy) {
220 ap->is_greedy = greedy;
221 }
222
apse_get_greedy(apse_t * ap)223 apse_bool_t apse_get_greedy(apse_t *ap) {
224 return ap->is_greedy;
225 }
226
apse_set_match_bot_callback(apse_t * ap,void * (* match_bot_callback)(apse_t * ap))227 void apse_set_match_bot_callback(apse_t *ap,
228 void* (*match_bot_callback)(apse_t* ap)) {
229 ap->match_bot_callback = match_bot_callback;
230 }
231
apse_set_match_begin_callback(apse_t * ap,void * (* match_begin_callback)(apse_t * ap))232 void apse_set_match_begin_callback(apse_t *ap,
233 void* (*match_begin_callback)(apse_t* ap)) {
234 ap->match_begin_callback = match_begin_callback;
235 }
236
apse_set_match_fail_callback(apse_t * ap,void * (* match_fail_callback)(apse_t * ap))237 void apse_set_match_fail_callback(apse_t *ap,
238 void* (*match_fail_callback)(apse_t* ap)) {
239 ap->match_fail_callback = match_fail_callback;
240 }
241
apse_set_match_end_callback(apse_t * ap,void * (* match_end_callback)(apse_t * ap))242 void apse_set_match_end_callback(apse_t *ap,
243 void* (*match_end_callback)(apse_t* ap)) {
244 ap->match_end_callback = match_end_callback;
245 }
246
apse_set_match_eot_callback(apse_t * ap,void * (* match_eot_callback)(apse_t * ap))247 void apse_set_match_eot_callback(apse_t *ap,
248 void* (*match_eot_callback)(apse_t* ap)) {
249 ap->match_eot_callback = match_eot_callback;
250 }
251
apse_get_match_bot_callback(apse_t * ap)252 void* (*apse_get_match_bot_callback(apse_t * ap))(apse_t *ap) {
253 return ap->match_bot_callback;
254 }
255
apse_get_match_begin_callback(apse_t * ap)256 void* (*apse_get_match_begin_callback(apse_t * ap))(apse_t *ap) {
257 return ap->match_begin_callback;
258 }
259
apse_get_match_fail_callback(apse_t * ap)260 void* (*apse_get_match_fail_callback(apse_t * ap))(apse_t *ap) {
261 return ap->match_fail_callback;
262 }
263
apse_get_match_end_callback(apse_t * ap)264 void* (*apse_get_match_end_callback(apse_t * ap))(apse_t *ap) {
265 return ap->match_end_callback;
266 }
267
apse_get_match_eot_callback(apse_t * ap)268 void* (*apse_get_match_eot_callback(apse_t * ap))(apse_t *ap) {
269 return ap->match_eot_callback;
270 }
271
_apse_wrap_slice(apse_t * ap,apse_ssize_t begin_in,apse_ssize_t size_in,apse_ssize_t * begin_out,apse_ssize_t * size_out)272 static int _apse_wrap_slice(apse_t* ap,
273 apse_ssize_t begin_in,
274 apse_ssize_t size_in,
275 apse_ssize_t* begin_out,
276 apse_ssize_t* size_out) {
277 if (begin_in < 0) {
278 if ((apse_size_t)-begin_in > ap->pattern_size)
279 return 0;
280 begin_in = ap->pattern_size + begin_in;
281 }
282
283 if (size_in < 0) {
284 if (-size_in > begin_in)
285 return 0;
286 size_in = -size_in;
287 begin_in -= size_in;
288 }
289
290 if ((apse_size_t)begin_in >= ap->pattern_size)
291 return 0;
292
293 if ((apse_size_t)begin_in + size_in > ap->pattern_size)
294 size_in = ap->pattern_size - begin_in;
295
296 if (begin_out)
297 *begin_out = begin_in;
298
299 if (size_out)
300 *size_out = size_in;
301
302 return 1;
303 }
304
apse_set_anychar(apse_t * ap,apse_ssize_t pattern_index)305 apse_bool_t apse_set_anychar(apse_t *ap, apse_ssize_t pattern_index) {
306 apse_size_t bitvectors_in_state = ap->bitvectors_in_state;
307 apse_ssize_t true_index, i;
308 apse_bool_t okay = 0;
309
310 if (!_apse_wrap_slice(ap, pattern_index, (apse_ssize_t)1,
311 &true_index, 0))
312 goto out;
313
314 for (i = 0; i < APSE_CHAR_MAX; i++)
315 APSE_BIT_SET(ap->case_mask,
316 i, bitvectors_in_state, pattern_index);
317 if (ap->fold_mask)
318 for (i = 0; i < APSE_CHAR_MAX; i++)
319 APSE_BIT_SET(ap->fold_mask,
320 i, bitvectors_in_state, pattern_index);
321
322 okay = 1;
323
324 out:
325
326 return okay;
327 }
328
apse_set_charset(apse_t * ap,apse_ssize_t pattern_index,unsigned char * set,apse_size_t set_size,apse_bool_t complement)329 apse_bool_t apse_set_charset(apse_t* ap,
330 apse_ssize_t pattern_index,
331 unsigned char* set,
332 apse_size_t set_size,
333 apse_bool_t complement) {
334 apse_size_t bitvectors_in_state = ap->bitvectors_in_state;
335 apse_ssize_t true_index;
336 apse_bool_t okay = 0;
337 apse_size_t i;
338
339 if (!_apse_wrap_slice(ap, pattern_index, (apse_ssize_t)1,
340 &true_index, 0))
341 goto out;
342
343 if (complement) {
344 for (i = 0; i < set_size; i++)
345 APSE_BIT_CLR(ap->case_mask,
346 (unsigned)set[i],
347 bitvectors_in_state, true_index);
348 } else {
349 for (i = 0; i < set_size; i++)
350 APSE_BIT_SET(ap->case_mask,
351 (unsigned)set[i],
352 bitvectors_in_state, true_index);
353 }
354 if (ap->fold_mask)
355 apse_set_caseignore_slice(ap, pattern_index,
356 (apse_ssize_t)1, (apse_bool_t)1);
357
358 okay = 1;
359
360 out:
361
362 return okay;
363 }
364
_apse_reset_state(apse_t * ap)365 static void _apse_reset_state(apse_t* ap) {
366 apse_size_t i, j;
367
368 (void)memset(ap->state, 0, ap->bytes_in_all_states);
369 (void)memset(ap->prev_state, 0, ap->bytes_in_all_states);
370
371 ap->prev_equal = 0;
372 ap->prev_active = 0;
373
374 for (i = 1; i <= ap->edit_distance; i++) {
375 for (j = 0; j < i; j++) {
376 #ifdef APSE_DEBUGGING
377 int k = APSE_IDX(i, ap->bitvectors_in_state, j);
378 int l = ap->bytes_in_all_states/sizeof(apse_vec_t);
379 assert (k < l);
380 #endif
381 APSE_BIT_SET(ap->prev_state, i, ap->bitvectors_in_state, j);
382 }
383 }
384 }
385
apse_set_text_position(apse_t * ap,apse_size_t text_position)386 apse_bool_t apse_set_text_position(apse_t *ap,
387 apse_size_t text_position) {
388 ap->text_position = text_position;
389
390 return 1;
391 }
392
apse_get_text_position(apse_t * ap)393 apse_size_t apse_get_text_position(apse_t *ap) {
394 return ap->text_position;
395 }
396
apse_set_text_initial_position(apse_t * ap,apse_size_t text_initial_position)397 apse_bool_t apse_set_text_initial_position(apse_t *ap,
398 apse_size_t text_initial_position) {
399 ap->text_initial_position = text_initial_position;
400
401 return 1;
402 }
403
apse_get_text_initial_position(apse_t * ap)404 apse_size_t apse_get_text_initial_position(apse_t *ap) {
405 return ap->text_initial_position;
406 }
407
apse_set_text_final_position(apse_t * ap,apse_size_t text_final_position)408 apse_bool_t apse_set_text_final_position(apse_t *ap,
409 apse_size_t text_final_position) {
410 ap->text_final_position = text_final_position;
411
412 return 1;
413 }
414
apse_get_text_final_position(apse_t * ap)415 apse_size_t apse_get_text_final_position(apse_t *ap) {
416 return ap->text_final_position;
417 }
418
apse_set_text_position_range(apse_t * ap,apse_size_t text_position_range)419 apse_bool_t apse_set_text_position_range(apse_t *ap,
420 apse_size_t text_position_range) {
421 ap->text_position_range = text_position_range;
422
423 return 1;
424 }
425
apse_get_text_position_range(apse_t * ap)426 apse_size_t apse_get_text_position_range(apse_t *ap) {
427 return ap->text_position_range;
428 }
429
apse_reset(apse_t * ap)430 void apse_reset(apse_t *ap) {
431 _apse_reset_state(ap);
432
433 ap->text_position = ap->text_initial_position;
434 #if 0
435 ap->text_position_range = APSE_MATCH_BAD; /* Do not reset this. */
436 #endif
437
438 ap->match_state = APSE_MATCH_STATE_BOT;
439 ap->match_begin = APSE_MATCH_BAD;
440 ap->match_end = APSE_MATCH_BAD;
441 }
442
apse_set_edit_distance(apse_t * ap,apse_size_t edit_distance)443 apse_bool_t apse_set_edit_distance(apse_t *ap, apse_size_t edit_distance) {
444 /* TODO: waste not--reuse if possible */
445
446 if (ap->state)
447 free(ap->state);
448 if (ap->prev_state)
449 free(ap->prev_state);
450
451 if (edit_distance >= ap->pattern_size)
452 edit_distance = ap->pattern_size;
453
454 ap->edit_distance = edit_distance;
455
456 ap->bytes_in_all_states = (edit_distance + 1) * ap->bytes_in_state;
457
458 ap->state = ap->prev_state = 0;
459
460 ap->state = calloc(edit_distance + 1, ap->bytes_in_state);
461 if (ap->state == 0)
462 goto out;
463
464 ap->prev_state = calloc(edit_distance + 1, ap->bytes_in_state);
465 if (ap->prev_state == 0)
466 goto out;
467
468 apse_reset(ap);
469
470 if (!ap->has_different_distances) {
471 ap->edit_insertions = edit_distance;
472 ap->edit_deletions = edit_distance;
473 ap->edit_substitutions = edit_distance;
474 }
475
476 if (ap->edit_distance && ap->bitvectors_in_state)
477 ap->largest_distance = ap->edit_distance * ap->bitvectors_in_state;
478 else
479 ap->largest_distance = 0;
480
481 ap->match_begin_bitvector =
482 (edit_distance + 1) / APSE_BITS_IN_BITVEC;
483 ap->match_begin_prefix = ((apse_vec_t)1 << edit_distance) - 1;
484 ap->match_begin_bitmask =
485 ((apse_vec_t)1 << edit_distance) - 1;
486
487 ap->match_end_bitvector =
488 (ap->pattern_size - 1) / APSE_BITS_IN_BITVEC;
489
490 #ifdef APSE_DEBUGGING
491 if (ap->has_different_distances) {
492 printf("(edit distances: ");
493 printf("insertions = %ld, deletions = %ld, substitutions = %ld)\n",
494 ap->edit_insertions,
495 ap->edit_deletions,
496 ap->edit_substitutions);
497 } else
498 printf("(edit_distance = %ld)\n", ap->edit_distance);
499 #endif
500
501 out:
502 return ap->state && ap->prev_state;
503 }
504
apse_get_edit_distance(apse_t * ap)505 apse_size_t apse_get_edit_distance(apse_t *ap) {
506 return ap->edit_distance;
507 }
508
apse_set_minimal_distance(apse_t * ap,apse_bool_t minimal)509 apse_bool_t apse_set_minimal_distance(apse_t* ap, apse_bool_t minimal) {
510 ap->use_minimal_distance = minimal;
511 return 1;
512 }
513
apse_get_minimal_distance(apse_t * ap)514 apse_bool_t apse_get_minimal_distance(apse_t *ap) {
515 return ap->use_minimal_distance;
516 }
517
apse_set_exact_slice(apse_t * ap,apse_ssize_t exact_begin,apse_ssize_t exact_size,apse_bool_t exact)518 apse_bool_t apse_set_exact_slice(apse_t* ap,
519 apse_ssize_t exact_begin,
520 apse_ssize_t exact_size,
521 apse_bool_t exact) {
522 apse_ssize_t true_begin, true_size;
523 apse_bool_t okay = 0;
524 apse_size_t i, j;
525
526 if (!ap->exact_mask) {
527
528 ap->exact_mask = calloc((size_t)1, ap->bytes_in_state);
529 if (ap->exact_mask == 0)
530 goto out;
531
532 ap->exact_positions = 0;
533 }
534
535 if (!_apse_wrap_slice(ap, exact_begin, exact_size,
536 &true_begin, &true_size))
537 goto out;
538
539 if (exact) {
540 for (i = true_begin, j = true_begin + true_size;
541 i < j && i < ap->pattern_size; i++) {
542 if (!APSE_BIT_TST(ap->exact_mask, 0, 0, i))
543 ap->exact_positions++;
544 APSE_BIT_SET(ap->exact_mask, 0, 0, i);
545 }
546 } else {
547 for (i = true_begin, j = true_begin + true_size;
548 i < j && i < ap->pattern_size; i++) {
549 if (APSE_BIT_TST(ap->exact_mask, 0, 0, i))
550 ap->exact_positions--;
551 APSE_BIT_CLR(ap->exact_mask, 0, 0, i);
552 }
553 }
554
555 okay = 1;
556
557 out:
558 return okay;
559 }
560
apse_set_caseignore_slice(apse_t * ap,apse_ssize_t caseignore_begin,apse_ssize_t caseignore_size,apse_bool_t caseignore)561 apse_bool_t apse_set_caseignore_slice(apse_t* ap,
562 apse_ssize_t caseignore_begin,
563 apse_ssize_t caseignore_size,
564 apse_bool_t caseignore) {
565 apse_size_t i, j;
566 int k;
567 apse_ssize_t true_begin, true_size;
568 apse_bool_t okay = 0;
569
570 if (!ap->fold_mask) {
571
572 ap->fold_mask = calloc((apse_size_t)APSE_CHAR_MAX,
573 ap->bytes_in_state);
574 if (ap->fold_mask == 0)
575 goto out;
576
577 memcpy(ap->fold_mask,
578 ap->case_mask,
579 APSE_CHAR_MAX * ap->bytes_in_state);
580
581 ap->pattern_mask = ap->fold_mask;
582 }
583
584 if (!_apse_wrap_slice(ap, caseignore_begin, caseignore_size,
585 &true_begin, &true_size))
586 goto out;
587
588 if (caseignore) {
589 for (i = true_begin, j = true_begin + true_size;
590 i < j && i < ap->pattern_size; i++) {
591 for (k = 0; k < APSE_CHAR_MAX; k++) {
592 if (APSE_BIT_TST(ap->case_mask,
593 k, ap->bitvectors_in_state, i)) {
594 if (isupper(k))
595 APSE_BIT_SET(ap->fold_mask,
596 tolower(k),
597 ap->bitvectors_in_state, i);
598 else if (islower(k))
599 APSE_BIT_SET(ap->fold_mask,
600 toupper(k),
601 ap->bitvectors_in_state, i);
602 }
603 }
604 }
605 } else {
606 for (i = true_begin, j = true_begin + true_size;
607 i < j && i < ap->pattern_size; i++) {
608 for (k = 0; k < APSE_CHAR_MAX; k++) {
609 if (APSE_BIT_TST(ap->case_mask,
610 k, ap->bitvectors_in_state, i)) {
611 if (isupper(k))
612 APSE_BIT_CLR(ap->fold_mask,
613 tolower(k),
614 ap->bitvectors_in_state, i);
615 else
616 if (islower(k))
617 APSE_BIT_CLR(ap->fold_mask,
618 toupper(k),
619 ap->bitvectors_in_state, i);
620 }
621 }
622 }
623 }
624
625 okay = 1;
626
627 out:
628 return okay;
629 }
630
apse_destroy(apse_t * ap)631 void apse_destroy(apse_t *ap) {
632 if (ap->case_mask) free(ap->case_mask);
633 if (ap->fold_mask) free(ap->fold_mask);
634 if (ap->state) free(ap->state);
635 if (ap->prev_state) free(ap->prev_state);
636 if (ap->exact_mask) free(ap->exact_mask);
637 free(ap);
638 }
639
apse_create(unsigned char * pattern,apse_size_t pattern_size,apse_size_t edit_distance)640 apse_t *apse_create(unsigned char* pattern,
641 apse_size_t pattern_size,
642 apse_size_t edit_distance) {
643 apse_t *ap;
644 apse_bool_t okay = 0;
645
646 APSE_DEBUG(printf("(apse version %u.%u)\n",
647 APSE_MAJOR_VERSION, APSE_MINOR_VERSION));
648
649 APSE_DEBUG(
650 printf("(pattern = \"%s\", pattern_size = %ld)\n",
651 pattern, pattern_size));
652
653 ap = calloc((size_t)1, sizeof(*ap));
654 if (ap == 0)
655 return 0;
656
657 ap->pattern_size = 0;
658 ap->pattern_mask = 0;
659
660 ap->edit_distance = 0;
661 ap->has_different_distances = 0;
662 ap->edit_insertions = 0;
663 ap->edit_deletions = 0;
664 ap->edit_substitutions = 0;
665 ap->use_minimal_distance = 0;
666
667 ap->bitvectors_in_state = 0;
668 ap->bytes_in_state = 0;
669 ap->bytes_in_all_states = 0;
670 ap->largest_distance = 0;
671
672 ap->text = 0;
673 ap->text_size = 0;
674 ap->text_position = 0;
675 ap->text_initial_position = 0;
676 ap->text_final_position = APSE_MATCH_BAD;
677 ap->text_position_range = APSE_MATCH_BAD;
678
679 ap->state = 0;
680 ap->prev_state = 0;
681 ap->match_begin_bitmask = 0;
682 ap->match_begin_prefix = 0;
683 ap->match_end_bitvector = 0;
684 ap->match_end_bitmask = 0;
685 ap->match_state = APSE_MATCH_STATE_BOT;
686 ap->match_begin = APSE_MATCH_BAD;
687 ap->match_end = APSE_MATCH_BAD;
688
689 ap->match_bot_callback = 0;
690 ap->match_begin_callback = 0;
691 ap->match_fail_callback = 0;
692 ap->match_end_callback = 0;
693 ap->match_eot_callback = 0;
694
695 ap->exact_positions = 0;
696 ap->exact_mask = 0;
697
698 ap->is_greedy = 0;
699
700 ap->custom_data = 0;
701 ap->custom_data_size = 0;
702
703 if (!apse_set_pattern(ap, (unsigned char *)pattern, pattern_size))
704 goto out;
705
706 if (!apse_set_edit_distance(ap, edit_distance))
707 goto out;
708
709 ap->edit_insertions = ap->edit_deletions =
710 ap->edit_substitutions = ap->edit_distance;
711
712 ap->largest_distance = edit_distance * ap->bitvectors_in_state;
713
714 #ifdef APSE_DEBUGGING
715 printf("(size of bitvector = %ld, bitvectors_in_state = %ld)\n",
716 (long)sizeof(apse_vec_t), ap->bitvectors_in_state);
717 printf("(bytes_in_state = %ld, states = %ld, bytes_in_all_states = %ld)\n",
718 ap->bytes_in_state, ap->edit_distance + 1, ap->bytes_in_all_states);
719 printf("(match_begin_bitvector = %ld, match_begin_bitmask = %s)\n",
720 ap->match_begin_bitvector,
721 _apse_fbin(ap->match_begin_bitmask, ap->pattern_size, 1));
722 printf("(match_end_bitvector = %ld, match_end_bitmask = %s)\n",
723 ap->match_end_bitvector,
724 _apse_fbin(ap->match_end_bitmask, ap->pattern_size, 1));
725 printf("(largest_distance = %ld, match_begin_prefix = %s)\n",
726 ap->largest_distance,
727 _apse_fbin(ap->match_begin_prefix, ap->pattern_size, 1));
728 if (ap->bitvectors_in_state == 1)
729 printf("(single bitvector");
730 else
731 printf("(multiple bitvectors");
732 printf(")\n");
733 #endif
734 okay = 1;
735
736 out:
737 if (!okay) {
738 apse_destroy(ap);
739 ap = 0;
740 }
741
742 return ap;
743 }
744
apse_set_insertions(apse_t * ap,apse_size_t insertions)745 apse_bool_t apse_set_insertions(apse_t *ap, apse_size_t insertions) {
746 apse_bool_t okay = 0;
747
748 if (insertions > ap->edit_distance)
749 insertions = ap->edit_distance;
750 ap->edit_insertions = insertions;
751 ap->has_different_distances = 1;
752
753 APSE_DEBUG((printf("(edit distances: insertions = %ld, deletions = %ld, substitutions = %ld)\n",
754 ap->edit_insertions,
755 ap->edit_deletions,
756 ap->edit_substitutions)));
757
758 okay = 1;
759
760 return okay;
761 }
762
apse_get_insertions(apse_t * ap)763 apse_size_t apse_get_insertions(apse_t *ap) {
764 if (ap->has_different_distances)
765 return ap->edit_insertions;
766 else
767 return ap->edit_distance;
768 }
769
apse_set_deletions(apse_t * ap,apse_size_t deletions)770 apse_bool_t apse_set_deletions(apse_t *ap, apse_size_t deletions) {
771 apse_bool_t okay = 0;
772
773 if (deletions > ap->edit_distance)
774 deletions = ap->edit_distance;
775 ap->edit_deletions = deletions;
776 ap->has_different_distances = 1;
777
778 APSE_DEBUG((printf("(edit distances: insertions = %ld, deletions = %ld, substitutions = %ld)\n",
779 ap->edit_insertions,
780 ap->edit_deletions,
781 ap->edit_substitutions)));
782
783 okay = 1;
784
785 return okay;
786 }
787
apse_get_deletions(apse_t * ap)788 apse_size_t apse_get_deletions(apse_t *ap) {
789 if (ap->has_different_distances)
790 return ap->edit_deletions;
791 else
792 return ap->edit_distance;
793 }
794
apse_set_substitutions(apse_t * ap,apse_size_t substitutions)795 apse_bool_t apse_set_substitutions(apse_t *ap, apse_size_t substitutions) {
796 apse_bool_t okay = 0;
797
798 if (substitutions > ap->edit_distance)
799 substitutions = ap->edit_distance;
800 ap->edit_substitutions = substitutions;
801 ap->has_different_distances = 1;
802
803 APSE_DEBUG((printf("(edit distances: insertions = %ld, deletions = %ld, substitutions = %ld)\n",
804 ap->edit_insertions,
805 ap->edit_deletions,
806 ap->edit_substitutions)));
807
808 okay = 1;
809
810 return okay;
811 }
812
apse_get_substitutions(apse_t * ap)813 apse_size_t apse_get_substitutions(apse_t *ap) {
814 if (ap->has_different_distances)
815 return ap->edit_substitutions;
816 else
817 return ap->edit_distance;
818 }
819
820 #ifdef APSE_DEBUGGING
apse_match_state_name(apse_t * ap)821 static const char* apse_match_state_name(apse_t* ap) {
822 switch (ap->match_state) {
823 case APSE_MATCH_STATE_BOT: return "BOT";
824 case APSE_MATCH_STATE_SEARCH: return "SEARCH";
825 case APSE_MATCH_STATE_BEGIN: return "BEGIN";
826 case APSE_MATCH_STATE_FAIL: return "FAIL";
827 case APSE_MATCH_STATE_GREEDY: return "GREEDY";
828 case APSE_MATCH_STATE_END: return "END";
829 case APSE_MATCH_STATE_EOT: return "EOT";
830 default: return "***UNKNOWN***";
831 }
832 }
833 #endif
834
_apse_match_bot(apse_t * ap)835 static void _apse_match_bot(apse_t *ap) {
836 APSE_DEBUG(printf("(text = \"%*.*s\", text_size = %ld)\n",
837 (int)ap->text_size, (int)ap->text_size, ap->text,
838 ap->text_size));
839 apse_reset(ap);
840 APSE_DEBUG(printf("(match state %s)\n", apse_match_state_name(ap)));
841 APSE_DEBUG(printf("(text begin %ld)\n", ap->text_position));
842 if (ap->match_bot_callback)
843 ap->match_bot_callback(ap);
844 }
845
_apse_match_begin(apse_t * ap)846 static void _apse_match_begin(apse_t *ap) {
847 ap->match_state = APSE_MATCH_STATE_BEGIN;
848 APSE_DEBUG(printf("(match state %s)\n", apse_match_state_name(ap)));
849 APSE_DEBUG(printf("(match begin %ld)\n", ap->text_position));
850 ap->match_begin = ap->text_position;
851 if (ap->match_begin_callback)
852 ap->match_begin_callback(ap);
853 }
854
_apse_match_fail(apse_t * ap)855 static void _apse_match_fail(apse_t *ap) {
856 ap->match_state = APSE_MATCH_STATE_FAIL;
857 APSE_DEBUG(printf("(match state %s)\n", apse_match_state_name(ap)));
858 ap->match_begin = APSE_MATCH_BAD;
859 APSE_DEBUG(printf("(match fail %ld)\n", ap->text_position));
860 if (ap->match_fail_callback)
861 ap->match_fail_callback(ap);
862 ap->match_state = APSE_MATCH_STATE_SEARCH;
863 APSE_DEBUG(printf("(match state %s)\n", apse_match_state_name(ap)));
864 }
865
_apse_match_end(apse_t * ap)866 static void _apse_match_end(apse_t *ap) {
867 ap->match_state = APSE_MATCH_STATE_END;
868 APSE_DEBUG(printf("(match state %s)\n", apse_match_state_name(ap)));
869 #ifdef APSE_DEBUGGING
870 printf("(match end %ld)\n", ap->match_end);
871 printf("(match string \"%.*s\")\n",
872 (int)(ap->match_end - ap->match_begin + 1),
873 ap->text + ap->match_begin);
874 printf("(match length %ld)\n",
875 ap->match_end - ap->match_begin + 1);
876 #endif
877 if (ap->match_end_callback)
878 ap->match_end_callback(ap);
879 ap->match_state = APSE_MATCH_STATE_SEARCH;
880 APSE_DEBUG(printf("(match state %s)\n", apse_match_state_name(ap)));
881 }
882
_apse_match_eot(apse_t * ap)883 static void _apse_match_eot(apse_t *ap) {
884 ap->match_state = APSE_MATCH_STATE_EOT;
885 APSE_DEBUG(printf("(match state %s)\n", apse_match_state_name(ap)));
886 ap->text_position = ap->text_size;
887 APSE_DEBUG(printf("(text end %ld)\n", ap->text_position));
888 if (ap->match_eot_callback)
889 ap->match_eot_callback(ap);
890 }
891
_apse_match_next_state(apse_t * ap)892 static apse_bool_t _apse_match_next_state(apse_t *ap) {
893 apse_size_t h, i, j, k;
894 apse_vec_t match;
895
896 k = ap->edit_distance * ap->bitvectors_in_state;
897
898 switch (ap->match_state) {
899 case APSE_MATCH_STATE_SEARCH:
900 if (APSE_EXACT_MATCH_BEGIN(ap) || APSE_APPROX_MATCH_BEGIN(ap))
901 _apse_match_begin(ap);
902 break;
903 case APSE_MATCH_STATE_BEGIN:
904 {
905 apse_size_t equal = 0;
906 apse_size_t active = 0;
907
908 for (h = 0;
909 h <= k;
910 h += ap->bitvectors_in_state) {
911 for (i = h, j = h + ap->bitvectors_in_state - 1; i < j; j--)
912 if (ap->state[j] != ap->prev_state[j])
913 break;
914 if (ap->prev_state[j] == ap->state[j])
915 equal++;
916 if (ap->state[j])
917 active++;
918 }
919 #ifdef APSE_DEBUGGING
920 printf("(equal = %d, active = %d)\n", equal, active);
921 #endif
922 if ((equal == ap->edit_distance + 1 &&
923 ap->is_greedy == 0)
924 ||
925 (equal < ap->prev_equal &&
926 ap->prev_active &&
927 active > ap->prev_active &&
928 ap->text_position - ap->match_begin < 8 * ap->bytes_in_state &&
929 !APSE_BIT_TST(ap->state,
930 ap->edit_distance,
931 ap->bitvectors_in_state,
932 ap->text_position - ap->match_begin))) {
933 ap->match_begin = ap->text_position;
934 #ifdef APSE_DEBUGGING
935 printf("(slide begin %d)\n", ap->match_begin);
936 #endif
937 }
938 else if (active == 0)
939 _apse_match_fail(ap);
940 ap->prev_equal = equal;
941 ap->prev_active = active;
942 }
943 break;
944 default:
945 break;
946 }
947
948 for (match = 0, h = 0;
949 h <= k;
950 h += ap->bitvectors_in_state)
951 match |= ap->state[h + ap->match_end_bitvector];
952
953 if (match & ap->match_end_bitmask) {
954 if (ap->match_state == APSE_MATCH_STATE_BEGIN) {
955 if (ap->is_greedy) {
956 ap->match_state = APSE_MATCH_STATE_GREEDY;
957 APSE_DEBUG(printf("(match state %s)\n",
958 apse_match_state_name(ap)));
959 APSE_DEBUG(
960 printf("(greedy match continue %ld)\n",
961 ap->text_position));
962 } else {
963 ap->match_state = APSE_MATCH_STATE_END;
964 ap->match_end = ap->text_position;
965 }
966 }
967 } else if (ap->match_state == APSE_MATCH_STATE_GREEDY) {
968 ap->match_state = APSE_MATCH_STATE_END;
969 APSE_DEBUG(printf("(match state %s)\n", apse_match_state_name(ap)));
970 ap->match_end = ap->text_position - 1;
971 APSE_DEBUG(printf("(match end %ld)\n", ap->match_end));
972 }
973
974 return ap->match_state;
975 }
976
_apse_exact_multiple(apse_t * ap)977 static void _apse_exact_multiple(apse_t* ap) {
978 apse_size_t h;
979 apse_size_t g = ap->edit_distance * ap->bitvectors_in_state;
980
981 for (h = 0; h < ap->bitvectors_in_state; h++)
982 ap->state[g + h] &= ~ap->exact_mask[h];
983 }
984
_apse_match_single_simple(apse_t * ap)985 static apse_bool_t _apse_match_single_simple(apse_t *ap) {
986 /* single apse_vec_t, edit_distance */
987
988 APSE_DEBUG(printf("(match single simple)\n"));
989 for ( ; ap->text_position < ap->text_size; ap->text_position++) {
990 apse_vec_t t =
991 ap->pattern_mask[(unsigned)ap->text[ap->text_position] *
992 ap->bitvectors_in_state];
993 apse_size_t h, g;
994 APSE_NEXT_EXACT(ap->state, ap->prev_state, t, (apse_size_t)0, 1);
995 APSE_DEBUG_SINGLE(ap, (apse_size_t)0);
996
997 for (g = 0, h = 1; h <= ap->edit_distance; g = h, h++) {
998 APSE_NEXT_APPROX(ap->state, ap->prev_state, t, h, g, 1);
999 APSE_DEBUG_SINGLE(ap, h);
1000 }
1001
1002 if (ap->exact_positions)
1003 ap->state[ap->edit_distance] &= ~ap->exact_mask[0];
1004
1005 if (_apse_match_next_state(ap) == APSE_MATCH_STATE_END)
1006 return 1;
1007
1008 (void)memcpy(ap->prev_state, ap->state, ap->bytes_in_all_states);
1009 }
1010
1011 return 0;
1012 }
1013
_apse_match_multiple_simple(apse_t * ap)1014 static apse_bool_t _apse_match_multiple_simple(apse_t *ap) {
1015 /* multiple apse_vec_t:s, has_different_distances */
1016 apse_size_t h, i;
1017
1018 APSE_DEBUG(printf("(match multiple simple)\n"));
1019 for ( ; ap->text_position < ap->text_size; ap->text_position++) {
1020 apse_vec_t *t =
1021 ap->pattern_mask +
1022 (unsigned)ap->text[ap->text_position] * ap->bitvectors_in_state;
1023 apse_vec_t c, d;
1024
1025 APSE_DEBUG_MULTIPLE_FIRST(ap, (apse_size_t)0);
1026 for (c = 1, i = 0; i < ap->bitvectors_in_state; i++, c = d) {
1027 d = APSE_TEST_HIGH_BIT(ap->state[i]);
1028 APSE_NEXT_EXACT(ap->state, ap->prev_state, t[i], i, c);
1029 APSE_DEBUG_MULTIPLE_REST(ap, i, i);
1030 }
1031 APSE_DEBUG(printf("\n"));
1032
1033 for (h = 1; h <= ap->edit_distance; h++) {
1034 apse_size_t kj = h * ap->bitvectors_in_state,
1035 jj = kj - ap->bitvectors_in_state;
1036
1037 APSE_DEBUG_MULTIPLE_FIRST(ap, h);
1038 for (c = 1, i = 0;
1039 i < ap->bitvectors_in_state;
1040 i++, kj++, jj++, c = d) {
1041 d = APSE_TEST_HIGH_BIT(ap->state[kj]);
1042 APSE_NEXT_APPROX(ap->state, ap->prev_state,
1043 t[i], kj, jj, c);
1044 APSE_DEBUG_MULTIPLE_REST(ap, i, kj);
1045 }
1046 APSE_DEBUG(printf("\n"));
1047 }
1048
1049 if (ap->exact_positions)
1050 _apse_exact_multiple(ap);
1051
1052 if (_apse_match_next_state(ap) == APSE_MATCH_STATE_END)
1053 return 1;
1054
1055 (void)memcpy(ap->prev_state, ap->state,
1056 ap->bytes_in_all_states);
1057 }
1058
1059 return 0;
1060 }
1061
_apse_match_single_complex(apse_t * ap)1062 static apse_bool_t _apse_match_single_complex(apse_t *ap) {
1063 /* single apse_vec_t, has_different_distances */
1064 APSE_DEBUG(printf("(match single complex)\n"));
1065 for ( ; ap->text_position < ap->text_size; ap->text_position++) {
1066 unsigned char o = ap->text[ap->text_position];
1067 apse_vec_t t =
1068 ap->pattern_mask[(unsigned int)o * ap->bitvectors_in_state];
1069 apse_size_t h, g;
1070
1071 APSE_NEXT_EXACT(ap->state, ap->prev_state, t, (apse_size_t)0, 1);
1072 APSE_DEBUG_SINGLE(ap, (apse_size_t)0);
1073
1074 for (g = 0, h = 1; h <= ap->edit_distance; g = h, h++) {
1075 apse_bool_t has_insertions = h <= ap->edit_insertions;
1076 apse_bool_t has_deletions = h <= ap->edit_deletions;
1077 apse_bool_t has_substitutions = h <= ap->edit_substitutions;
1078
1079 APSE_NEXT_COMMON(ap->state, ap->prev_state, t, h);
1080 if (has_insertions)
1081 APSE_NEXT_INSERT(ap->state, ap->prev_state, h, g);
1082 if (has_deletions)
1083 APSE_NEXT_DELETE(ap->state, h, g);
1084 if (has_substitutions)
1085 APSE_NEXT_SUBSTI(ap->state, ap->prev_state, h, g);
1086 APSE_NEXT_CARRY(ap->state, h,
1087 has_deletions || has_substitutions ? 1 : 0);
1088 APSE_PREFIX_DELETE_MASK(ap);
1089 APSE_DEBUG_SINGLE(ap, h);
1090 }
1091
1092 if (ap->exact_positions)
1093 ap->state[ap->edit_distance] &= ~ap->exact_mask[0];
1094
1095 if (_apse_match_next_state(ap) == APSE_MATCH_STATE_END)
1096 return 1;
1097
1098 (void)memcpy(ap->prev_state, ap->state,
1099 ap->bytes_in_all_states);
1100
1101 }
1102
1103 return 0;
1104 }
1105
_apse_match_multiple_complex(apse_t * ap)1106 static apse_bool_t _apse_match_multiple_complex(apse_t *ap) {
1107 /* multiple apse_vec_t:s, has_different_distances */
1108 apse_size_t h, i;
1109
1110 APSE_DEBUG(printf("(match multiple complex)\n"));
1111 for ( ; ap->text_position < ap->text_size; ap->text_position++) {
1112 unsigned char o = ap->text[ap->text_position];
1113 apse_vec_t *t =
1114 ap->pattern_mask + (unsigned)o * ap->bitvectors_in_state;
1115 apse_vec_t c, d;
1116
1117 APSE_DEBUG_MULTIPLE_FIRST(ap, (apse_size_t)0);
1118 for (c = 1, i = 0; i < ap->bitvectors_in_state; i++, c = d) {
1119 d = APSE_TEST_HIGH_BIT(ap->state[i]);
1120 APSE_NEXT_EXACT(ap->state, ap->prev_state, t[i], i, c);
1121 APSE_DEBUG_MULTIPLE_REST(ap, i, i);
1122 }
1123 APSE_DEBUG(printf("\n"));
1124
1125 for (h = 1; h <= ap->edit_distance; h++) {
1126 apse_size_t
1127 kj = h * ap->bitvectors_in_state,
1128 jj = kj - ap->bitvectors_in_state;
1129
1130 apse_bool_t has_insertions = h <= ap->edit_insertions;
1131 apse_bool_t has_deletions = h <= ap->edit_deletions;
1132 apse_bool_t has_substitutions = h <= ap->edit_substitutions;
1133
1134 APSE_DEBUG_MULTIPLE_FIRST(ap, h);
1135 /* Is there such a thing as too much manual optimization? */
1136 if (has_insertions) {
1137 if (has_deletions && has_substitutions) {
1138 for (c = 1, i = 0;
1139 i < ap->bitvectors_in_state;
1140 i++, kj++, jj++, c = d) {
1141 d = APSE_TEST_HIGH_BIT(ap->state[kj]);
1142 APSE_NEXT_COMMON(ap->state, ap->prev_state, t[i], kj);
1143 APSE_NEXT_INSERT(ap->state, ap->prev_state, kj, jj);
1144 APSE_NEXT_DELETE(ap->state, kj, jj);
1145 APSE_NEXT_SUBSTI(ap->state, ap->prev_state, kj, jj);
1146 APSE_NEXT_CARRY(ap->state, kj, c);
1147 APSE_PREFIX_DELETE_MASK(ap);
1148 APSE_DEBUG_MULTIPLE_REST(ap, i, kj);
1149 }
1150 } else if (has_deletions) {
1151 for (c = 1, i = 0;
1152 i < ap->bitvectors_in_state;
1153 i++, kj++, jj++, c = d) {
1154 d = APSE_TEST_HIGH_BIT(ap->state[kj]);
1155 APSE_NEXT_COMMON(ap->state, ap->prev_state, t[i], kj);
1156 APSE_NEXT_INSERT(ap->state, ap->prev_state, kj, jj);
1157 APSE_NEXT_DELETE(ap->state, kj, jj);
1158 APSE_NEXT_CARRY(ap->state, kj, c);
1159 APSE_PREFIX_DELETE_MASK(ap);
1160 APSE_DEBUG_MULTIPLE_REST(ap, i, kj);
1161 }
1162 } else if (has_substitutions) {
1163 for (c = 1, i = 0;
1164 i < ap->bitvectors_in_state;
1165 i++, kj++, jj++, c = d) {
1166 d = APSE_TEST_HIGH_BIT(ap->state[kj]);
1167 APSE_NEXT_COMMON(ap->state, ap->prev_state, t[i], kj);
1168 APSE_NEXT_INSERT(ap->state, ap->prev_state, kj, jj);
1169 APSE_NEXT_SUBSTI(ap->state, ap->prev_state, kj, jj);
1170 APSE_NEXT_CARRY(ap->state, kj, c);
1171 APSE_PREFIX_DELETE_MASK(ap);
1172 APSE_DEBUG_MULTIPLE_REST(ap, i, kj);
1173 }
1174 } else {
1175 for (c = 1, i = 0;
1176 i < ap->bitvectors_in_state;
1177 i++, kj++, jj++, c = d) {
1178 d = APSE_TEST_HIGH_BIT(ap->state[kj]);
1179 APSE_NEXT_COMMON(ap->state, ap->prev_state, t[i], kj);
1180 APSE_NEXT_INSERT(ap->state, ap->prev_state, kj, jj);
1181 APSE_NEXT_CARRY(ap->state, kj, c);
1182 APSE_PREFIX_DELETE_MASK(ap);
1183 APSE_DEBUG_MULTIPLE_REST(ap, i, kj);
1184 }
1185 }
1186 } else {
1187 if (has_deletions && has_substitutions) {
1188 for (c = 1, i = 0;
1189 i < ap->bitvectors_in_state;
1190 i++, kj++, jj++, c = d) {
1191 d = APSE_TEST_HIGH_BIT(ap->state[kj]);
1192 APSE_NEXT_COMMON(ap->state, ap->prev_state, t[i], kj);
1193 APSE_NEXT_DELETE(ap->state, kj, jj);
1194 APSE_NEXT_SUBSTI(ap->state, ap->prev_state, kj, jj);
1195 APSE_NEXT_CARRY(ap->state, kj, c);
1196 APSE_PREFIX_DELETE_MASK(ap);
1197 APSE_DEBUG_MULTIPLE_REST(ap, i, kj);
1198 }
1199 } else if (has_deletions) {
1200 for (c = 1, i = 0;
1201 i < ap->bitvectors_in_state;
1202 i++, kj++, jj++, c = d) {
1203 d = APSE_TEST_HIGH_BIT(ap->state[kj]);
1204 APSE_NEXT_COMMON(ap->state, ap->prev_state, t[i], kj);
1205 APSE_NEXT_DELETE(ap->state, kj, jj);
1206 APSE_NEXT_CARRY(ap->state, kj, c);
1207 APSE_PREFIX_DELETE_MASK(ap);
1208 APSE_DEBUG_MULTIPLE_REST(ap, i, kj);
1209 }
1210 } else if (has_substitutions) {
1211 for (c = 1, i = 0;
1212 i < ap->bitvectors_in_state;
1213 i++, kj++, jj++, c = d) {
1214 d = APSE_TEST_HIGH_BIT(ap->state[kj]);
1215 APSE_NEXT_COMMON(ap->state, ap->prev_state, t[i], kj);
1216 APSE_NEXT_SUBSTI(ap->state, ap->prev_state, kj, jj);
1217 APSE_NEXT_CARRY(ap->state, kj, c);
1218 APSE_PREFIX_DELETE_MASK(ap);
1219 APSE_DEBUG_MULTIPLE_REST(ap, i, kj);
1220 }
1221 }
1222 }
1223 APSE_DEBUG(printf("\n"));
1224
1225
1226 if (ap->exact_positions)
1227 _apse_exact_multiple(ap);
1228
1229 if (_apse_match_next_state(ap) == APSE_MATCH_STATE_END)
1230 return 1;
1231
1232 (void)memcpy(ap->prev_state, ap->state,
1233 ap->bytes_in_all_states);
1234 }
1235 }
1236
1237 return 0;
1238 }
1239
__apse_match(apse_t * ap,unsigned char * text,apse_size_t text_size)1240 static apse_bool_t __apse_match(apse_t *ap,
1241 unsigned char *text,
1242 apse_size_t text_size) {
1243 apse_bool_t did_match = 0;
1244
1245 APSE_DEBUG(printf("(match enter)\n"));
1246
1247 if (ap->match_state == APSE_MATCH_STATE_BOT) {
1248 ap->text = text;
1249 if (ap->text_final_position == APSE_MATCH_BAD)
1250 ap->text_size = text_size;
1251 else
1252 ap->text_size =
1253 ap->text_final_position > text_size ?
1254 text_size : ap->text_final_position + 1;
1255 _apse_match_bot(ap);
1256 } else if (ap->match_state == APSE_MATCH_STATE_EOT)
1257 goto leave;
1258
1259 if (ap->edit_deletions >= ap->pattern_size ||
1260 ap->edit_substitutions >= ap->pattern_size) {
1261 ap->match_state = APSE_MATCH_STATE_END;
1262 ap->match_begin = ap->text_initial_position;
1263 ap->match_end = ap->text_size - 1;
1264 ap->text_position = ap->text_size;
1265 goto out;
1266 }
1267
1268 if (ap->pattern_size - ap->edit_deletions >
1269 ap->text_size - ap->text_initial_position) {
1270 ap->match_state = APSE_MATCH_STATE_EOT;
1271 ap->text_position = ap->text_size;
1272 goto out;
1273 }
1274
1275 if (text_size + ap->edit_distance < ap->pattern_size + ap->text_position) {
1276 ap->text_position = ap->text_size;
1277 goto eot;
1278 }
1279
1280 if (ap->match_state == APSE_MATCH_STATE_SEARCH) {
1281 ap->text_position++;
1282 _apse_reset_state(ap);
1283 }
1284
1285 if (ap->text_position_range != APSE_MATCH_BAD &&
1286 ap->text_position - ap->text_initial_position >
1287 ap->text_position_range) {
1288 ap->match_state = APSE_MATCH_STATE_END;
1289 goto eot;
1290 }
1291
1292 ap->match_state = APSE_MATCH_STATE_SEARCH;
1293 APSE_DEBUG(printf("(match state %s)\n", apse_match_state_name(ap)));
1294 APSE_DEBUG(printf("(match search %ld)\n", ap->text_position));
1295
1296 if (ap->has_different_distances) {
1297 if (ap->bitvectors_in_state == 1) {
1298 if (_apse_match_single_complex(ap))
1299 goto out;
1300 } else {
1301 if (_apse_match_multiple_complex(ap))
1302 goto out;
1303 }
1304 } else {
1305 if (ap->bitvectors_in_state == 1) {
1306 if (_apse_match_single_simple(ap))
1307 goto out;
1308 } else {
1309 if (_apse_match_multiple_simple(ap))
1310 goto out;
1311 }
1312 }
1313
1314 out:
1315
1316 if (ap->match_state == APSE_MATCH_STATE_GREEDY) {
1317 ap->match_state = APSE_MATCH_STATE_END;
1318 ap->match_end = ap->text_position - 1;
1319 APSE_DEBUG(printf("(greedy match end %ld)\n", ap->match_end));
1320 }
1321
1322 if (ap->match_state == APSE_MATCH_STATE_END) {
1323 _apse_match_end(ap);
1324 did_match = 1;
1325 }
1326
1327 eot:
1328
1329 if (ap->text_position == ap->text_size)
1330 _apse_match_eot(ap);
1331
1332 leave:
1333
1334 APSE_DEBUG(printf("(match leave)\n"));
1335
1336 return did_match;
1337 }
1338
_apse_match(apse_t * ap,unsigned char * text,apse_size_t text_size)1339 static apse_bool_t _apse_match(apse_t *ap,
1340 unsigned char *text,
1341 apse_size_t text_size) {
1342 if (ap->use_minimal_distance) {
1343 apse_set_edit_distance(ap, 0);
1344 if (__apse_match(ap, text, text_size))
1345 return 1;
1346 else {
1347 apse_size_t minimal_edit_distance;
1348 apse_size_t previous_edit_distance = 0;
1349 apse_size_t next_edit_distance;
1350
1351 for (next_edit_distance = 1;
1352 next_edit_distance <= ap->pattern_size;
1353 next_edit_distance *= 2) {
1354 apse_set_edit_distance(ap, next_edit_distance);
1355 if (__apse_match(ap, text, text_size))
1356 break;
1357 previous_edit_distance = next_edit_distance;
1358 }
1359 minimal_edit_distance = next_edit_distance;
1360 if (next_edit_distance > 1) {
1361 do {
1362 minimal_edit_distance =
1363 (previous_edit_distance + next_edit_distance) / 2;
1364 if (minimal_edit_distance == previous_edit_distance)
1365 break;
1366 apse_set_edit_distance(ap, minimal_edit_distance);
1367 if (__apse_match(ap, text, text_size))
1368 next_edit_distance = minimal_edit_distance;
1369 else
1370 previous_edit_distance = minimal_edit_distance;
1371 } while (previous_edit_distance <= next_edit_distance);
1372 if (!__apse_match(ap, text, text_size))
1373 minimal_edit_distance++;
1374 }
1375 apse_set_edit_distance(ap, minimal_edit_distance);
1376 __apse_match(ap, text, text_size);
1377
1378 return 1;
1379 }
1380 } else
1381 return __apse_match(ap, text, text_size);
1382 }
1383
apse_match(apse_t * ap,unsigned char * text,apse_size_t text_size)1384 apse_bool_t apse_match(apse_t *ap,
1385 unsigned char *text, apse_size_t text_size) {
1386 apse_bool_t did_match = _apse_match(ap, text, text_size);
1387
1388 _apse_match_eot(ap);
1389 apse_reset(ap);
1390
1391 return did_match;
1392 }
1393
apse_match_next(apse_t * ap,unsigned char * text,apse_size_t text_size)1394 apse_bool_t apse_match_next(apse_t *ap,
1395 unsigned char *text, apse_size_t text_size) {
1396 apse_bool_t did_match = _apse_match(ap, text, text_size);
1397
1398 if (!did_match)
1399 ap->match_state = APSE_MATCH_STATE_BOT;
1400
1401 return did_match;
1402 }
1403
apse_index(apse_t * ap,unsigned char * text,apse_size_t text_size)1404 apse_ssize_t apse_index(apse_t *ap,
1405 unsigned char *text, apse_size_t text_size) {
1406 apse_size_t did_match = _apse_match(ap, text, text_size);
1407 _apse_match_eot(ap);
1408 ap->match_state = APSE_MATCH_STATE_BOT;
1409
1410 return did_match ? ap->match_begin : APSE_MATCH_BAD;
1411 }
1412
apse_index_next(apse_t * ap,unsigned char * text,apse_size_t text_size)1413 apse_ssize_t apse_index_next(apse_t *ap,
1414 unsigned char *text, apse_size_t text_size) {
1415 apse_bool_t did_match = _apse_match(ap, text, text_size);
1416
1417 if (!did_match)
1418 ap->match_state = APSE_MATCH_STATE_BOT;
1419
1420 return did_match ? ap->match_begin : APSE_MATCH_BAD;
1421 }
1422
_apse_slice(apse_t * ap,unsigned char * text,apse_size_t text_size,apse_size_t * match_begin,apse_size_t * match_size)1423 static apse_bool_t _apse_slice(apse_t *ap,
1424 unsigned char *text,
1425 apse_size_t text_size,
1426 apse_size_t *match_begin,
1427 apse_size_t *match_size) {
1428 apse_bool_t did_match = _apse_match(ap, text, text_size);
1429
1430 if (did_match) {
1431 if (match_begin)
1432 *match_begin = ap->match_begin;
1433 if (match_size)
1434 *match_size = ap->match_end - ap->match_begin + 1;
1435 } else {
1436 if (match_begin)
1437 *match_begin = APSE_MATCH_BAD;
1438 if (match_size)
1439 *match_size = APSE_MATCH_BAD;
1440 }
1441
1442 return did_match;
1443 }
1444
apse_slice(apse_t * ap,unsigned char * text,apse_size_t text_size,apse_size_t * match_begin,apse_size_t * match_size)1445 apse_bool_t apse_slice(apse_t *ap,
1446 unsigned char *text,
1447 apse_size_t text_size,
1448 apse_size_t *match_begin,
1449 apse_size_t *match_size) {
1450 apse_bool_t did_match =
1451 _apse_slice(ap, text, text_size, match_begin, match_size);
1452
1453 _apse_match_eot(ap);
1454 ap->match_state = APSE_MATCH_STATE_BOT;
1455
1456 return did_match;
1457 }
1458
apse_slice_next(apse_t * ap,unsigned char * text,apse_size_t text_size,apse_size_t * match_begin,apse_size_t * match_size)1459 apse_bool_t apse_slice_next(apse_t* ap,
1460 unsigned char* text,
1461 apse_size_t text_size,
1462 apse_size_t* match_begin,
1463 apse_size_t* match_size) {
1464 apse_bool_t did_match =
1465 _apse_slice(ap, text, text_size, match_begin, match_size);
1466
1467 if (!did_match)
1468 ap->match_state = APSE_MATCH_STATE_BOT;
1469
1470 return did_match;
1471 }
1472
apse_set_custom_data(apse_t * ap,void * custom_data,apse_size_t custom_data_size)1473 void apse_set_custom_data(apse_t* ap,
1474 void* custom_data,
1475 apse_size_t custom_data_size) {
1476 ap->custom_data = custom_data;
1477 ap->custom_data_size = custom_data_size;
1478 }
1479
apse_get_custom_data(apse_t * ap,apse_size_t * custom_data_size)1480 void* apse_get_custom_data(apse_t* ap,
1481 apse_size_t* custom_data_size) {
1482 if (custom_data_size)
1483 *custom_data_size = ap->custom_data_size;
1484 return ap->custom_data;
1485 }
1486