1 /*
2 
3 Copyright (C) by Jarkko Hietaniemi, 1998,1999,2000,2001,2002,2003,2006.
4 All Rights Reserved.
5 
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of either:
8 
9 a) the GNU Library General Public License as published by the Free
10    Software Foundation; either version 2, or (at your option) any
11    later version, or
12 
13 b) the "Artistic License" which comes with Perl source code.
14 
15 Other free software licensing schemes are negotiable.
16 
17 Furthermore:
18 
19 (1) This software is provided as-is, without warranties or
20     obligations of any kind.
21 
22 (2) You shall include this copyright notice intact in all copies
23     and derived materials.
24 
25 */
26 
27 /*
28 
29   $Id: apse.c,v 1.1 1999/06/23 16:09:13 jhi Exp jhi $
30 
31 */
32 
33 #include "apse.h"
34 
35 #include <stdio.h>
36 #include <string.h>
37 #include <stdlib.h>
38 #include <ctype.h>
39 #include <assert.h>
40 
41 #define APSE_BITS_IN_BITVEC	(8*sizeof(apse_vec_t))
42 
43 #define APSE_CHAR_MAX		256
44 
45 #ifdef APSE_DEBUGGING
46 #define APSE_DEBUG(x) x
47 #else
48 #define APSE_DEBUG(x)
49 #endif
50 
51 #define APSE_BIT(i)		((apse_vec_t)1 << ((i)%APSE_BITS_IN_BITVEC))
52 #define APSE_IDX(p, q, i)	((p)*(q)+((i)/APSE_BITS_IN_BITVEC))
53 #define APSE_BIT_SET(bv, p, q, i) ((bv[APSE_IDX(p, q, i)] |=  APSE_BIT(i)))
54 #define APSE_BIT_CLR(bv, p, q, i) ((bv[APSE_IDX(p, q, i)] &= ~APSE_BIT(i)))
55 #define APSE_BIT_TST(bv, p, q, i) ((bv[APSE_IDX(p, q, i)] &   APSE_BIT(i)))
56 
57 #define APSE_MATCH_STATE_BOT		0
58 #define APSE_MATCH_STATE_SEARCH		1
59 #define APSE_MATCH_STATE_BEGIN		2
60 #define APSE_MATCH_STATE_FAIL		3
61 #define APSE_MATCH_STATE_GREEDY		4
62 #define APSE_MATCH_STATE_END		5
63 #define APSE_MATCH_STATE_EOT		6
64 
65 #define APSE_TEST_HIGH_BIT(i)	\
66 	(((i) & ((apse_vec_t)1 << (APSE_BITS_IN_BITVEC - 1))) ? 1 : 0)
67 
68 /* In case you are reading the TR 91-11 of University of Arizona, page 6:
69  * j+1	state
70  * j	prev_state
71  * d	i
72  * d-1	prev_i
73  */
74 
75 #define APSE_NEXT_EXACT(state, prev_state, text, i, carry)		\
76 	(state[i] = ((prev_state[i] << 1 | carry) & text))
77 
78 #define APSE_NEXT_APPROX(state, prev_state, text, i, prev_i, carry)	\
79 	(state[i]  = (((prev_state[i] << 1) & text)		  |	\
80 			prev_state[prev_i] 			  |	\
81 		      ((state[prev_i] | prev_state[prev_i]) << 1) |	\
82 			carry))
83 
84 #define APSE_NEXT_COMMON(state, prev_state, text, i)			\
85 	(state[i]  = (prev_state[i] << 1) & text)
86 
87 #define APSE_NEXT_INSERT(state, prev_state, i, prev_i)			\
88 	(state[i] |= prev_state[prev_i])
89 
90 #define APSE_NEXT_DELETE(state, i, prev_i)				\
91 	(state[i] |= (state[prev_i] << 1))
92 
93 #define APSE_NEXT_SUBSTI(state, prev_state, i, prev_i)			\
94 	(state[i] |= (prev_state[prev_i] << 1))
95 
96 #define APSE_NEXT_CARRY(state, i, carry)				\
97 	(state[i] |= carry)
98 
99 #define APSE_EXACT_MATCH_BEGIN(ap)	(ap->state[0] & 1)
100 
101 #define APSE_APPROX_MATCH_BEGIN(ap)	\
102 	(ap->state[ap->largest_distance + ap->match_begin_bitvector] > \
103 	 ap->match_begin_prefix && \
104 	 ap->state[ap->largest_distance + ap->match_begin_bitvector] & \
105 	 ap->match_begin_prefix)
106 
107 #define APSE_PREFIX_DELETE_MASK(ap)	\
108 	    do { if (ap->edit_deletions < ap->edit_distance &&	\
109 		     ap->text_position  < ap->edit_distance)	\
110  	             ap->state[h] &= ap->match_begin_bitmask; } while (0)
111 
112 #define APSE_DEBUG_SINGLE(ap, i)		\
113 	APSE_DEBUG(printf("%c %2ld %2ld %s\n",				\
114 			  isprint(ap->text[ap->text_position])?	\
115 			  ap->text[ap->text_position]:'.',		\
116 			  ap->text_position, i, 			\
117 			  _apse_fbin(ap->state[i],			\
118 				     ap->pattern_size, 1)))
119 
120 
121 #define APSE_DEBUG_MULTIPLE_FIRST(ap, i)	\
122 	APSE_DEBUG(printf("%c %2ld %2ld",				\
123 			  isprint(ap->text[ap->text_position])?	\
124 			  ap->text[ap->text_position]:'.',		\
125 			  ap->text_position, i))
126 
127 #define APSE_DEBUG_MULTIPLE_REST(ap, i, j)	\
128 	APSE_DEBUG(printf(" %s",					\
129 			  _apse_fbin(ap->state[j],			\
130 				     ap->pattern_size, 			\
131 				     i == ap->bitvectors_in_state-1)))
132 
133 
134 #ifdef APSE_DEBUGGING
135 static char *_apse_fbin(apse_vec_t v, apse_size_t n, apse_bool_t last);
136 
_apse_fbin(apse_vec_t v,apse_size_t n,apse_bool_t last)137 static char *_apse_fbin(apse_vec_t v, apse_size_t n, apse_bool_t last) {
138     static char s[APSE_BITS_IN_BITVEC + 1] = { 0 }; /* non-reentrant */
139 
140     if (v) {
141 	static const char *b =
142 	    "0000100001001100001010100110111000011001010111010011101101111111";
143 	apse_size_t i;
144 
145 	for (i = 0; i < APSE_BITS_IN_BITVEC && i < n && v; i += 4) {
146 	    (void)memcpy(s + i, b + ((v & 0x0f) << 2), (size_t)4);
147 	    v >>= 4;
148 	}
149 
150 	if (i < APSE_BITS_IN_BITVEC)
151 	    memset(s + i, '0', APSE_BITS_IN_BITVEC - i);
152     } else
153 	memset(s, '0', APSE_BITS_IN_BITVEC);
154 
155     if (last)
156 	s[n % APSE_BITS_IN_BITVEC] = 0;
157 
158     return s;
159 }
160 #endif
161 
162 /* The code begins. */
163 
apse_set_pattern(apse_t * ap,unsigned char * pattern,apse_size_t pattern_size)164 apse_bool_t apse_set_pattern(apse_t*		ap,
165 			     unsigned char*	pattern,
166 			     apse_size_t	pattern_size) {
167     apse_size_t	i;
168 
169     if (ap->case_mask)
170 	free(ap->case_mask);
171     if (ap->fold_mask)
172 	free(ap->fold_mask);
173 
174     ap->pattern_mask = 0;
175     ap->fold_mask    = 0;
176     ap->case_mask    = 0;
177 
178     ap->is_greedy    = 0;
179 
180     ap->prev_equal   = 0;
181     ap->prev_active  = 0;
182 
183     ap->pattern_size = pattern_size;
184     ap->bitvectors_in_state = (pattern_size - 1)/APSE_BITS_IN_BITVEC + 1;
185 
186     if (ap->edit_distance)
187 	ap->largest_distance = ap->edit_distance * ap->bitvectors_in_state;
188     else
189 	ap->largest_distance =  0;
190 
191     ap->bytes_in_state = ap->bitvectors_in_state * sizeof(apse_vec_t);
192 
193     ap->case_mask = calloc((apse_size_t)APSE_CHAR_MAX, ap->bytes_in_state);
194     if (ap->case_mask == 0)
195 	goto out;
196 
197     for (i = 0; i < pattern_size; i++)
198 	APSE_BIT_SET(ap->case_mask,
199 		     (unsigned)pattern[i],
200 		     ap->bitvectors_in_state, i);
201 
202     ap->pattern_mask = ap->case_mask;
203 
204     ap->match_end_bitmask =
205 	(apse_vec_t)1 << ((pattern_size - 1) % APSE_BITS_IN_BITVEC);
206 
207 out:
208     if (ap && ap->case_mask)
209 	return 1;
210     else {
211 	if (ap->case_mask)
212 	    free(ap->case_mask);
213 	if (ap)
214 	    free(ap);
215 	return 0;
216     }
217 }
218 
apse_set_greedy(apse_t * ap,apse_bool_t greedy)219 void apse_set_greedy(apse_t *ap, apse_bool_t greedy) {
220     ap->is_greedy = greedy;
221 }
222 
apse_get_greedy(apse_t * ap)223 apse_bool_t apse_get_greedy(apse_t *ap) {
224     return ap->is_greedy;
225 }
226 
apse_set_match_bot_callback(apse_t * ap,void * (* match_bot_callback)(apse_t * ap))227 void apse_set_match_bot_callback(apse_t *ap,
228 				 void* (*match_bot_callback)(apse_t* ap)) {
229     ap->match_bot_callback = match_bot_callback;
230 }
231 
apse_set_match_begin_callback(apse_t * ap,void * (* match_begin_callback)(apse_t * ap))232 void apse_set_match_begin_callback(apse_t *ap,
233 				   void* (*match_begin_callback)(apse_t* ap)) {
234     ap->match_begin_callback = match_begin_callback;
235 }
236 
apse_set_match_fail_callback(apse_t * ap,void * (* match_fail_callback)(apse_t * ap))237 void apse_set_match_fail_callback(apse_t *ap,
238 				  void* (*match_fail_callback)(apse_t* ap)) {
239     ap->match_fail_callback = match_fail_callback;
240 }
241 
apse_set_match_end_callback(apse_t * ap,void * (* match_end_callback)(apse_t * ap))242 void apse_set_match_end_callback(apse_t *ap,
243 				 void* (*match_end_callback)(apse_t* ap)) {
244     ap->match_end_callback = match_end_callback;
245 }
246 
apse_set_match_eot_callback(apse_t * ap,void * (* match_eot_callback)(apse_t * ap))247 void apse_set_match_eot_callback(apse_t *ap,
248 				 void* (*match_eot_callback)(apse_t* ap)) {
249     ap->match_eot_callback = match_eot_callback;
250 }
251 
apse_get_match_bot_callback(apse_t * ap)252 void* (*apse_get_match_bot_callback(apse_t * ap))(apse_t *ap) {
253     return ap->match_bot_callback;
254 }
255 
apse_get_match_begin_callback(apse_t * ap)256 void* (*apse_get_match_begin_callback(apse_t * ap))(apse_t *ap) {
257     return ap->match_begin_callback;
258 }
259 
apse_get_match_fail_callback(apse_t * ap)260 void* (*apse_get_match_fail_callback(apse_t * ap))(apse_t *ap) {
261     return ap->match_fail_callback;
262 }
263 
apse_get_match_end_callback(apse_t * ap)264 void* (*apse_get_match_end_callback(apse_t * ap))(apse_t *ap) {
265     return ap->match_end_callback;
266 }
267 
apse_get_match_eot_callback(apse_t * ap)268 void* (*apse_get_match_eot_callback(apse_t * ap))(apse_t *ap) {
269     return ap->match_eot_callback;
270 }
271 
_apse_wrap_slice(apse_t * ap,apse_ssize_t begin_in,apse_ssize_t size_in,apse_ssize_t * begin_out,apse_ssize_t * size_out)272 static int _apse_wrap_slice(apse_t*		ap,
273 			    apse_ssize_t	begin_in,
274 			    apse_ssize_t	size_in,
275 			    apse_ssize_t*	begin_out,
276 			    apse_ssize_t*	size_out) {
277     if (begin_in < 0) {
278 	if ((apse_size_t)-begin_in > ap->pattern_size)
279 	    return 0;
280 	begin_in = ap->pattern_size + begin_in;
281     }
282 
283     if (size_in < 0) {
284 	if (-size_in > begin_in)
285 	    return 0;
286 	size_in   = -size_in;
287 	begin_in -=  size_in;
288     }
289 
290     if ((apse_size_t)begin_in >= ap->pattern_size)
291 	return 0;
292 
293     if ((apse_size_t)begin_in + size_in > ap->pattern_size)
294 	size_in = ap->pattern_size - begin_in;
295 
296     if (begin_out)
297 	*begin_out = begin_in;
298 
299     if (size_out)
300 	*size_out = size_in;
301 
302     return 1;
303 }
304 
apse_set_anychar(apse_t * ap,apse_ssize_t pattern_index)305 apse_bool_t apse_set_anychar(apse_t *ap, apse_ssize_t pattern_index) {
306     apse_size_t	bitvectors_in_state = ap->bitvectors_in_state;
307     apse_ssize_t	true_index, i;
308     apse_bool_t	okay = 0;
309 
310     if (!_apse_wrap_slice(ap, pattern_index, (apse_ssize_t)1,
311 			      &true_index, 0))
312 	goto out;
313 
314     for (i = 0; i < APSE_CHAR_MAX; i++)
315 	APSE_BIT_SET(ap->case_mask,
316 		      i, bitvectors_in_state, pattern_index);
317     if (ap->fold_mask)
318 	for (i = 0; i < APSE_CHAR_MAX; i++)
319 	    APSE_BIT_SET(ap->fold_mask,
320 			  i, bitvectors_in_state, pattern_index);
321 
322     okay = 1;
323 
324   out:
325 
326     return okay;
327 }
328 
apse_set_charset(apse_t * ap,apse_ssize_t pattern_index,unsigned char * set,apse_size_t set_size,apse_bool_t complement)329 apse_bool_t apse_set_charset(apse_t*		ap,
330 			     apse_ssize_t	pattern_index,
331 			     unsigned char*	set,
332 			     apse_size_t	set_size,
333 			     apse_bool_t	complement) {
334     apse_size_t	bitvectors_in_state = ap->bitvectors_in_state;
335     apse_ssize_t	true_index;
336     apse_bool_t	okay = 0;
337     apse_size_t i;
338 
339     if (!_apse_wrap_slice(ap, pattern_index, (apse_ssize_t)1,
340 			      &true_index, 0))
341 	goto out;
342 
343     if (complement) {
344 	for (i = 0; i < set_size; i++)
345 	    APSE_BIT_CLR(ap->case_mask,
346 			  (unsigned)set[i],
347 			  bitvectors_in_state, true_index);
348     } else {
349 	for (i = 0; i < set_size; i++)
350 	    APSE_BIT_SET(ap->case_mask,
351 			  (unsigned)set[i],
352 			  bitvectors_in_state, true_index);
353     }
354     if (ap->fold_mask)
355 	apse_set_caseignore_slice(ap, pattern_index,
356 				   (apse_ssize_t)1, (apse_bool_t)1);
357 
358     okay = 1;
359 
360  out:
361 
362     return okay;
363 }
364 
_apse_reset_state(apse_t * ap)365 static void _apse_reset_state(apse_t* ap) {
366     apse_size_t	i, j;
367 
368     (void)memset(ap->state,      0, ap->bytes_in_all_states);
369     (void)memset(ap->prev_state, 0, ap->bytes_in_all_states);
370 
371     ap->prev_equal  = 0;
372     ap->prev_active = 0;
373 
374     for (i = 1; i <= ap->edit_distance; i++) {
375         for (j = 0; j < i; j++) {
376 #ifdef APSE_DEBUGGING
377             int k = APSE_IDX(i, ap->bitvectors_in_state, j);
378 	    int l = ap->bytes_in_all_states/sizeof(apse_vec_t);
379 	    assert (k < l);
380 #endif
381 	    APSE_BIT_SET(ap->prev_state, i, ap->bitvectors_in_state, j);
382 	}
383     }
384 }
385 
apse_set_text_position(apse_t * ap,apse_size_t text_position)386 apse_bool_t apse_set_text_position(apse_t *ap,
387 					 apse_size_t text_position) {
388     ap->text_position = text_position;
389 
390     return 1;
391 }
392 
apse_get_text_position(apse_t * ap)393 apse_size_t apse_get_text_position(apse_t *ap) {
394     return ap->text_position;
395 }
396 
apse_set_text_initial_position(apse_t * ap,apse_size_t text_initial_position)397 apse_bool_t apse_set_text_initial_position(apse_t *ap,
398 					   apse_size_t text_initial_position) {
399     ap->text_initial_position = text_initial_position;
400 
401     return 1;
402 }
403 
apse_get_text_initial_position(apse_t * ap)404 apse_size_t apse_get_text_initial_position(apse_t *ap) {
405     return ap->text_initial_position;
406 }
407 
apse_set_text_final_position(apse_t * ap,apse_size_t text_final_position)408 apse_bool_t apse_set_text_final_position(apse_t *ap,
409 					 apse_size_t text_final_position) {
410     ap->text_final_position = text_final_position;
411 
412     return 1;
413 }
414 
apse_get_text_final_position(apse_t * ap)415 apse_size_t apse_get_text_final_position(apse_t *ap) {
416     return ap->text_final_position;
417 }
418 
apse_set_text_position_range(apse_t * ap,apse_size_t text_position_range)419 apse_bool_t apse_set_text_position_range(apse_t *ap,
420 					 apse_size_t text_position_range) {
421     ap->text_position_range = text_position_range;
422 
423     return 1;
424 }
425 
apse_get_text_position_range(apse_t * ap)426 apse_size_t apse_get_text_position_range(apse_t *ap) {
427     return ap->text_position_range;
428 }
429 
apse_reset(apse_t * ap)430 void apse_reset(apse_t *ap) {
431     _apse_reset_state(ap);
432 
433     ap->text_position = ap->text_initial_position;
434 #if 0
435     ap->text_position_range = APSE_MATCH_BAD; /* Do not reset this. */
436 #endif
437 
438     ap->match_state = APSE_MATCH_STATE_BOT;
439     ap->match_begin = APSE_MATCH_BAD;
440     ap->match_end   = APSE_MATCH_BAD;
441 }
442 
apse_set_edit_distance(apse_t * ap,apse_size_t edit_distance)443 apse_bool_t apse_set_edit_distance(apse_t *ap, apse_size_t edit_distance) {
444     /* TODO: waste not--reuse if possible */
445 
446     if (ap->state)
447 	free(ap->state);
448     if (ap->prev_state)
449 	free(ap->prev_state);
450 
451     if (edit_distance >= ap->pattern_size)
452         edit_distance = ap->pattern_size;
453 
454     ap->edit_distance = edit_distance;
455 
456     ap->bytes_in_all_states = (edit_distance + 1) * ap->bytes_in_state;
457 
458     ap->state = ap->prev_state = 0;
459 
460     ap->state = calloc(edit_distance + 1, ap->bytes_in_state);
461     if (ap->state == 0)
462 	goto out;
463 
464     ap->prev_state = calloc(edit_distance + 1, ap->bytes_in_state);
465     if (ap->prev_state == 0)
466 	goto out;
467 
468     apse_reset(ap);
469 
470     if (!ap->has_different_distances) {
471 	ap->edit_insertions		= edit_distance;
472 	ap->edit_deletions		= edit_distance;
473 	ap->edit_substitutions		= edit_distance;
474     }
475 
476     if (ap->edit_distance && ap->bitvectors_in_state)
477 	ap->largest_distance = ap->edit_distance * ap->bitvectors_in_state;
478     else
479 	ap->largest_distance =  0;
480 
481     ap->match_begin_bitvector	=
482 	(edit_distance + 1) / APSE_BITS_IN_BITVEC;
483     ap->match_begin_prefix = ((apse_vec_t)1 << edit_distance) - 1;
484     ap->match_begin_bitmask	=
485 	((apse_vec_t)1 << edit_distance) - 1;
486 
487     ap->match_end_bitvector =
488 	(ap->pattern_size - 1) / APSE_BITS_IN_BITVEC;
489 
490 #ifdef APSE_DEBUGGING
491     if (ap->has_different_distances) {
492 	printf("(edit distances: ");
493 	printf("insertions = %ld, deletions = %ld, substitutions = %ld)\n",
494 	       ap->edit_insertions,
495 	       ap->edit_deletions,
496 	       ap->edit_substitutions);
497     } else
498 	printf("(edit_distance = %ld)\n", ap->edit_distance);
499 #endif
500 
501 out:
502     return ap->state && ap->prev_state;
503 }
504 
apse_get_edit_distance(apse_t * ap)505 apse_size_t apse_get_edit_distance(apse_t *ap) {
506     return ap->edit_distance;
507 }
508 
apse_set_minimal_distance(apse_t * ap,apse_bool_t minimal)509 apse_bool_t apse_set_minimal_distance(apse_t* ap, apse_bool_t minimal) {
510     ap->use_minimal_distance = minimal;
511     return 1;
512 }
513 
apse_get_minimal_distance(apse_t * ap)514 apse_bool_t apse_get_minimal_distance(apse_t *ap) {
515     return ap->use_minimal_distance;
516 }
517 
apse_set_exact_slice(apse_t * ap,apse_ssize_t exact_begin,apse_ssize_t exact_size,apse_bool_t exact)518 apse_bool_t apse_set_exact_slice(apse_t*	ap,
519 				 apse_ssize_t	exact_begin,
520 				 apse_ssize_t	exact_size,
521 				 apse_bool_t	exact) {
522     apse_ssize_t	true_begin, true_size;
523     apse_bool_t	okay = 0;
524     apse_size_t i, j;
525 
526     if (!ap->exact_mask) {
527 
528 	ap->exact_mask = calloc((size_t)1, ap->bytes_in_state);
529 	if (ap->exact_mask == 0)
530 	    goto out;
531 
532 	ap->exact_positions = 0;
533     }
534 
535     if (!_apse_wrap_slice(ap, exact_begin, exact_size,
536 			      &true_begin, &true_size))
537 	goto out;
538 
539     if (exact) {
540 	for (i = true_begin, j = true_begin +  true_size;
541 	     i < j && i < ap->pattern_size; i++) {
542 	    if (!APSE_BIT_TST(ap->exact_mask, 0, 0, i))
543 		ap->exact_positions++;
544 	    APSE_BIT_SET(ap->exact_mask, 0, 0, i);
545 	}
546     } else {
547 	for (i = true_begin, j = true_begin +  true_size;
548 	     i < j && i < ap->pattern_size; i++) {
549 	    if (APSE_BIT_TST(ap->exact_mask, 0, 0, i))
550 		ap->exact_positions--;
551 	    APSE_BIT_CLR(ap->exact_mask, 0, 0, i);
552 	}
553     }
554 
555     okay = 1;
556 
557 out:
558     return okay;
559 }
560 
apse_set_caseignore_slice(apse_t * ap,apse_ssize_t caseignore_begin,apse_ssize_t caseignore_size,apse_bool_t caseignore)561 apse_bool_t apse_set_caseignore_slice(apse_t*		ap,
562 				      apse_ssize_t	caseignore_begin,
563 				      apse_ssize_t	caseignore_size,
564 				      apse_bool_t	caseignore) {
565     apse_size_t		i, j;
566     int			k;
567     apse_ssize_t	true_begin, true_size;
568     apse_bool_t		okay = 0;
569 
570     if (!ap->fold_mask) {
571 
572 	ap->fold_mask = calloc((apse_size_t)APSE_CHAR_MAX,
573 			       ap->bytes_in_state);
574 	if (ap->fold_mask == 0)
575 	    goto out;
576 
577 	memcpy(ap->fold_mask,
578 	       ap->case_mask,
579 	       APSE_CHAR_MAX * ap->bytes_in_state);
580 
581 	ap->pattern_mask = ap->fold_mask;
582     }
583 
584     if (!_apse_wrap_slice(ap, caseignore_begin, caseignore_size,
585 			      &true_begin, &true_size))
586 	goto out;
587 
588     if (caseignore) {
589 	for (i = true_begin, j = true_begin +  true_size;
590 	     i < j && i < ap->pattern_size; i++) {
591 	    for (k = 0; k < APSE_CHAR_MAX; k++) {
592 		if (APSE_BIT_TST(ap->case_mask,
593 				 k, ap->bitvectors_in_state, i)) {
594 		    if (isupper(k))
595 			APSE_BIT_SET(ap->fold_mask,
596 				     tolower(k),
597 				     ap->bitvectors_in_state, i);
598 		    else if (islower(k))
599 			APSE_BIT_SET(ap->fold_mask,
600 				     toupper(k),
601 				     ap->bitvectors_in_state, i);
602 		}
603 	    }
604 	}
605     } else {
606 	for (i = true_begin, j = true_begin +  true_size;
607 	     i < j && i < ap->pattern_size; i++) {
608 	    for (k = 0; k < APSE_CHAR_MAX; k++) {
609 		if (APSE_BIT_TST(ap->case_mask,
610 				 k, ap->bitvectors_in_state, i)) {
611 		    if (isupper(k))
612 			APSE_BIT_CLR(ap->fold_mask,
613 				     tolower(k),
614 				     ap->bitvectors_in_state, i);
615 		    else
616 			if (islower(k))
617 			    APSE_BIT_CLR(ap->fold_mask,
618 					 toupper(k),
619 					 ap->bitvectors_in_state, i);
620 		}
621 	    }
622 	}
623     }
624 
625     okay = 1;
626 
627 out:
628     return okay;
629 }
630 
apse_destroy(apse_t * ap)631 void apse_destroy(apse_t *ap) {
632     if (ap->case_mask)		free(ap->case_mask);
633     if (ap->fold_mask)		free(ap->fold_mask);
634     if (ap->state)		free(ap->state);
635     if (ap->prev_state)		free(ap->prev_state);
636     if (ap->exact_mask)	 	free(ap->exact_mask);
637     free(ap);
638 }
639 
apse_create(unsigned char * pattern,apse_size_t pattern_size,apse_size_t edit_distance)640 apse_t *apse_create(unsigned char*	pattern,
641 		    apse_size_t		pattern_size,
642 		    apse_size_t		edit_distance) {
643     apse_t		*ap;
644     apse_bool_t	okay = 0;
645 
646     APSE_DEBUG(printf("(apse version %u.%u)\n",
647 		      APSE_MAJOR_VERSION, APSE_MINOR_VERSION));
648 
649     APSE_DEBUG(
650 	printf("(pattern = \"%s\", pattern_size = %ld)\n",
651 	       pattern, pattern_size));
652 
653     ap = calloc((size_t)1, sizeof(*ap));
654     if (ap == 0)
655 	return 0;
656 
657     ap->pattern_size		= 0;
658     ap->pattern_mask		= 0;
659 
660     ap->edit_distance		= 0;
661     ap->has_different_distances = 0;
662     ap->edit_insertions		= 0;
663     ap->edit_deletions		= 0;
664     ap->edit_substitutions	= 0;
665     ap->use_minimal_distance	= 0;
666 
667     ap->bitvectors_in_state	= 0;
668     ap->bytes_in_state		= 0;
669     ap->bytes_in_all_states	= 0;
670     ap->largest_distance	= 0;
671 
672     ap->text			= 0;
673     ap->text_size		= 0;
674     ap->text_position		= 0;
675     ap->text_initial_position	= 0;
676     ap->text_final_position	= APSE_MATCH_BAD;
677     ap->text_position_range	= APSE_MATCH_BAD;
678 
679     ap->state			= 0;
680     ap->prev_state		= 0;
681     ap->match_begin_bitmask	= 0;
682     ap->match_begin_prefix	= 0;
683     ap->match_end_bitvector	= 0;
684     ap->match_end_bitmask	= 0;
685     ap->match_state		= APSE_MATCH_STATE_BOT;
686     ap->match_begin		= APSE_MATCH_BAD;
687     ap->match_end		= APSE_MATCH_BAD;
688 
689     ap->match_bot_callback	= 0;
690     ap->match_begin_callback	= 0;
691     ap->match_fail_callback	= 0;
692     ap->match_end_callback	= 0;
693     ap->match_eot_callback	= 0;
694 
695     ap->exact_positions		= 0;
696     ap->exact_mask		= 0;
697 
698     ap->is_greedy		= 0;
699 
700     ap->custom_data		= 0;
701     ap->custom_data_size	= 0;
702 
703     if (!apse_set_pattern(ap, (unsigned char *)pattern, pattern_size))
704 	goto out;
705 
706     if (!apse_set_edit_distance(ap, edit_distance))
707 	goto out;
708 
709     ap->edit_insertions = ap->edit_deletions =
710 	ap->edit_substitutions = ap->edit_distance;
711 
712     ap->largest_distance = edit_distance * ap->bitvectors_in_state;
713 
714 #ifdef APSE_DEBUGGING
715     printf("(size of bitvector = %ld, bitvectors_in_state = %ld)\n",
716 	   (long)sizeof(apse_vec_t), ap->bitvectors_in_state);
717     printf("(bytes_in_state = %ld, states = %ld, bytes_in_all_states = %ld)\n",
718 	   ap->bytes_in_state, ap->edit_distance + 1, ap->bytes_in_all_states);
719     printf("(match_begin_bitvector = %ld, match_begin_bitmask = %s)\n",
720 	   ap->match_begin_bitvector,
721 	   _apse_fbin(ap->match_begin_bitmask, ap->pattern_size, 1));
722     printf("(match_end_bitvector = %ld, match_end_bitmask = %s)\n",
723 	   ap->match_end_bitvector,
724 	   _apse_fbin(ap->match_end_bitmask, ap->pattern_size, 1));
725     printf("(largest_distance = %ld, match_begin_prefix = %s)\n",
726 	   ap->largest_distance,
727 	   _apse_fbin(ap->match_begin_prefix, ap->pattern_size, 1));
728     if (ap->bitvectors_in_state == 1)
729 	printf("(single bitvector");
730     else
731 	printf("(multiple bitvectors");
732     printf(")\n");
733 #endif
734     okay = 1;
735 
736  out:
737     if (!okay) {
738 	apse_destroy(ap);
739 	ap = 0;
740     }
741 
742     return ap;
743 }
744 
apse_set_insertions(apse_t * ap,apse_size_t insertions)745 apse_bool_t apse_set_insertions(apse_t *ap, apse_size_t insertions) {
746     apse_bool_t	okay = 0;
747 
748     if (insertions > ap->edit_distance)
749 	insertions = ap->edit_distance;
750     ap->edit_insertions = insertions;
751     ap->has_different_distances = 1;
752 
753     APSE_DEBUG((printf("(edit distances: insertions = %ld, deletions = %ld, substitutions = %ld)\n",
754 		       ap->edit_insertions,
755 		       ap->edit_deletions,
756 		       ap->edit_substitutions)));
757 
758     okay = 1;
759 
760     return okay;
761 }
762 
apse_get_insertions(apse_t * ap)763 apse_size_t apse_get_insertions(apse_t *ap) {
764     if (ap->has_different_distances)
765 	return ap->edit_insertions;
766     else
767 	return ap->edit_distance;
768 }
769 
apse_set_deletions(apse_t * ap,apse_size_t deletions)770 apse_bool_t apse_set_deletions(apse_t *ap, apse_size_t deletions) {
771     apse_bool_t	okay = 0;
772 
773     if (deletions > ap->edit_distance)
774 	deletions = ap->edit_distance;
775     ap->edit_deletions = deletions;
776     ap->has_different_distances = 1;
777 
778     APSE_DEBUG((printf("(edit distances: insertions = %ld, deletions = %ld, substitutions = %ld)\n",
779 		       ap->edit_insertions,
780 		       ap->edit_deletions,
781 		       ap->edit_substitutions)));
782 
783     okay = 1;
784 
785     return okay;
786 }
787 
apse_get_deletions(apse_t * ap)788 apse_size_t apse_get_deletions(apse_t *ap) {
789     if (ap->has_different_distances)
790 	return ap->edit_deletions;
791     else
792 	return ap->edit_distance;
793 }
794 
apse_set_substitutions(apse_t * ap,apse_size_t substitutions)795 apse_bool_t apse_set_substitutions(apse_t *ap, apse_size_t substitutions) {
796     apse_bool_t	okay = 0;
797 
798     if (substitutions > ap->edit_distance)
799 	substitutions = ap->edit_distance;
800     ap->edit_substitutions = substitutions;
801     ap->has_different_distances = 1;
802 
803     APSE_DEBUG((printf("(edit distances: insertions = %ld, deletions = %ld, substitutions = %ld)\n",
804 		       ap->edit_insertions,
805 		       ap->edit_deletions,
806 		       ap->edit_substitutions)));
807 
808     okay = 1;
809 
810     return okay;
811 }
812 
apse_get_substitutions(apse_t * ap)813 apse_size_t apse_get_substitutions(apse_t *ap) {
814     if (ap->has_different_distances)
815 	return ap->edit_substitutions;
816     else
817 	return ap->edit_distance;
818 }
819 
820 #ifdef APSE_DEBUGGING
apse_match_state_name(apse_t * ap)821 static const char* apse_match_state_name(apse_t* ap) {
822     switch (ap->match_state) {
823     case APSE_MATCH_STATE_BOT:		return "BOT";
824     case APSE_MATCH_STATE_SEARCH:	return "SEARCH";
825     case APSE_MATCH_STATE_BEGIN:	return "BEGIN";
826     case APSE_MATCH_STATE_FAIL:		return "FAIL";
827     case APSE_MATCH_STATE_GREEDY:	return "GREEDY";
828     case APSE_MATCH_STATE_END:		return "END";
829     case APSE_MATCH_STATE_EOT:		return "EOT";
830     default:				return "***UNKNOWN***";
831     }
832 }
833 #endif
834 
_apse_match_bot(apse_t * ap)835 static void _apse_match_bot(apse_t *ap) {
836     APSE_DEBUG(printf("(text = \"%*.*s\", text_size = %ld)\n",
837 		       (int)ap->text_size, (int)ap->text_size, ap->text,
838 		       ap->text_size));
839     apse_reset(ap);
840     APSE_DEBUG(printf("(match state %s)\n", apse_match_state_name(ap)));
841     APSE_DEBUG(printf("(text begin %ld)\n", ap->text_position));
842     if (ap->match_bot_callback)
843 	ap->match_bot_callback(ap);
844 }
845 
_apse_match_begin(apse_t * ap)846 static void _apse_match_begin(apse_t *ap) {
847     ap->match_state = APSE_MATCH_STATE_BEGIN;
848     APSE_DEBUG(printf("(match state %s)\n", apse_match_state_name(ap)));
849     APSE_DEBUG(printf("(match begin %ld)\n", ap->text_position));
850     ap->match_begin = ap->text_position;
851     if (ap->match_begin_callback)
852 	ap->match_begin_callback(ap);
853 }
854 
_apse_match_fail(apse_t * ap)855 static void _apse_match_fail(apse_t *ap) {
856     ap->match_state = APSE_MATCH_STATE_FAIL;
857     APSE_DEBUG(printf("(match state %s)\n", apse_match_state_name(ap)));
858     ap->match_begin = APSE_MATCH_BAD;
859     APSE_DEBUG(printf("(match fail %ld)\n", ap->text_position));
860     if (ap->match_fail_callback)
861 	ap->match_fail_callback(ap);
862     ap->match_state = APSE_MATCH_STATE_SEARCH;
863     APSE_DEBUG(printf("(match state %s)\n", apse_match_state_name(ap)));
864 }
865 
_apse_match_end(apse_t * ap)866 static void _apse_match_end(apse_t *ap) {
867     ap->match_state = APSE_MATCH_STATE_END;
868     APSE_DEBUG(printf("(match state %s)\n", apse_match_state_name(ap)));
869 #ifdef APSE_DEBUGGING
870     printf("(match end %ld)\n", ap->match_end);
871     printf("(match string \"%.*s\")\n",
872 	   (int)(ap->match_end - ap->match_begin + 1),
873 	   ap->text + ap->match_begin);
874     printf("(match length %ld)\n",
875 	   ap->match_end - ap->match_begin + 1);
876 #endif
877     if (ap->match_end_callback)
878 	ap->match_end_callback(ap);
879     ap->match_state = APSE_MATCH_STATE_SEARCH;
880     APSE_DEBUG(printf("(match state %s)\n", apse_match_state_name(ap)));
881 }
882 
_apse_match_eot(apse_t * ap)883 static void _apse_match_eot(apse_t *ap) {
884     ap->match_state = APSE_MATCH_STATE_EOT;
885     APSE_DEBUG(printf("(match state %s)\n", apse_match_state_name(ap)));
886     ap->text_position = ap->text_size;
887     APSE_DEBUG(printf("(text end %ld)\n", ap->text_position));
888     if (ap->match_eot_callback)
889 	ap->match_eot_callback(ap);
890 }
891 
_apse_match_next_state(apse_t * ap)892 static apse_bool_t _apse_match_next_state(apse_t *ap) {
893     apse_size_t	h, i, j, k;
894     apse_vec_t	match;
895 
896     k = ap->edit_distance * ap->bitvectors_in_state;
897 
898     switch (ap->match_state) {
899     case APSE_MATCH_STATE_SEARCH:
900 	if (APSE_EXACT_MATCH_BEGIN(ap) || APSE_APPROX_MATCH_BEGIN(ap))
901 	    _apse_match_begin(ap);
902 	break;
903     case APSE_MATCH_STATE_BEGIN:
904 	{
905 	    apse_size_t		equal	= 0;
906 	    apse_size_t		active	= 0;
907 
908 	    for (h = 0;
909 		 h <= k;
910 		 h += ap->bitvectors_in_state) {
911 	        for (i = h, j = h + ap->bitvectors_in_state - 1; i < j; j--)
912 		    if (ap->state[j] != ap->prev_state[j])
913 			break;
914 		if (ap->prev_state[j] == ap->state[j])
915 		    equal++;
916 		if (ap->state[j])
917 		    active++;
918 	    }
919 #ifdef APSE_DEBUGGING
920 	    printf("(equal = %d, active = %d)\n", equal, active);
921 #endif
922 	    if ((equal == ap->edit_distance + 1 &&
923 		 ap->is_greedy == 0)
924 		||
925 		(equal < ap->prev_equal &&
926 		 ap->prev_active &&
927 		 active > ap->prev_active &&
928 		 ap->text_position - ap->match_begin < 8 * ap->bytes_in_state &&
929 		 !APSE_BIT_TST(ap->state,
930 			       ap->edit_distance,
931 			       ap->bitvectors_in_state,
932 			       ap->text_position - ap->match_begin))) {
933 		ap->match_begin = ap->text_position;
934 #ifdef APSE_DEBUGGING
935 		printf("(slide begin %d)\n", ap->match_begin);
936 #endif
937 	    }
938 	    else if (active == 0)
939 		_apse_match_fail(ap);
940 	    ap->prev_equal  = equal;
941 	    ap->prev_active = active;
942 	}
943 	break;
944     default:
945 	break;
946     }
947 
948     for (match = 0, h = 0;
949 	 h <= k;
950 	 h += ap->bitvectors_in_state)
951 	match |= ap->state[h + ap->match_end_bitvector];
952 
953     if (match & ap->match_end_bitmask) {
954 	if (ap->match_state == APSE_MATCH_STATE_BEGIN) {
955 	    if (ap->is_greedy) {
956 		ap->match_state = APSE_MATCH_STATE_GREEDY;
957 		APSE_DEBUG(printf("(match state %s)\n",
958 				   apse_match_state_name(ap)));
959 		APSE_DEBUG(
960 		    printf("(greedy match continue %ld)\n",
961 			   ap->text_position));
962 	    } else {
963 		ap->match_state = APSE_MATCH_STATE_END;
964 		ap->match_end   = ap->text_position;
965 	    }
966 	}
967     } else if (ap->match_state == APSE_MATCH_STATE_GREEDY) {
968 	ap->match_state = APSE_MATCH_STATE_END;
969 	APSE_DEBUG(printf("(match state %s)\n", apse_match_state_name(ap)));
970 	ap->match_end	= ap->text_position - 1;
971 	APSE_DEBUG(printf("(match end %ld)\n",  ap->match_end));
972     }
973 
974     return ap->match_state;
975 }
976 
_apse_exact_multiple(apse_t * ap)977 static void _apse_exact_multiple(apse_t* ap) {
978     apse_size_t	h;
979     apse_size_t	g = ap->edit_distance * ap->bitvectors_in_state;
980 
981     for (h = 0; h < ap->bitvectors_in_state; h++)
982 	ap->state[g + h] &= ~ap->exact_mask[h];
983 }
984 
_apse_match_single_simple(apse_t * ap)985 static apse_bool_t _apse_match_single_simple(apse_t *ap) {
986     /* single apse_vec_t, edit_distance */
987 
988     APSE_DEBUG(printf("(match single simple)\n"));
989     for ( ; ap->text_position < ap->text_size; ap->text_position++) {
990 	apse_vec_t	t =
991       	    ap->pattern_mask[(unsigned)ap->text[ap->text_position] *
992 			    ap->bitvectors_in_state];
993 	apse_size_t	h, g;
994 	APSE_NEXT_EXACT(ap->state, ap->prev_state, t, (apse_size_t)0, 1);
995 	APSE_DEBUG_SINGLE(ap, (apse_size_t)0);
996 
997 	for (g = 0, h = 1; h <= ap->edit_distance; g = h, h++) {
998 	    APSE_NEXT_APPROX(ap->state, ap->prev_state, t, h, g, 1);
999 	    APSE_DEBUG_SINGLE(ap, h);
1000 	}
1001 
1002 	if (ap->exact_positions)
1003 	    ap->state[ap->edit_distance] &= ~ap->exact_mask[0];
1004 
1005 	if (_apse_match_next_state(ap) == APSE_MATCH_STATE_END)
1006 	    return 1;
1007 
1008 	(void)memcpy(ap->prev_state, ap->state, ap->bytes_in_all_states);
1009     }
1010 
1011     return 0;
1012 }
1013 
_apse_match_multiple_simple(apse_t * ap)1014 static apse_bool_t _apse_match_multiple_simple(apse_t *ap) {
1015     /* multiple apse_vec_t:s, has_different_distances */
1016     apse_size_t	h, i;
1017 
1018     APSE_DEBUG(printf("(match multiple simple)\n"));
1019     for ( ; ap->text_position < ap->text_size; ap->text_position++) {
1020 	apse_vec_t	*t =
1021 	    ap->pattern_mask +
1022 	    (unsigned)ap->text[ap->text_position] * ap->bitvectors_in_state;
1023 	apse_vec_t	c, d;
1024 
1025 	APSE_DEBUG_MULTIPLE_FIRST(ap, (apse_size_t)0);
1026 	for (c = 1, i = 0; i < ap->bitvectors_in_state; i++, c = d) {
1027 	    d = APSE_TEST_HIGH_BIT(ap->state[i]);
1028 	    APSE_NEXT_EXACT(ap->state, ap->prev_state, t[i], i, c);
1029 	    APSE_DEBUG_MULTIPLE_REST(ap, i, i);
1030 	}
1031 	APSE_DEBUG(printf("\n"));
1032 
1033 	for (h = 1; h <= ap->edit_distance; h++) {
1034 	    apse_size_t	kj = h * ap->bitvectors_in_state,
1035 			jj = kj - ap->bitvectors_in_state;
1036 
1037 	    APSE_DEBUG_MULTIPLE_FIRST(ap, h);
1038 	    for (c = 1, i = 0;
1039 		 i < ap->bitvectors_in_state;
1040 		 i++, kj++, jj++, c = d) {
1041 		d = APSE_TEST_HIGH_BIT(ap->state[kj]);
1042 		APSE_NEXT_APPROX(ap->state, ap->prev_state,
1043 				  t[i], kj, jj, c);
1044 		APSE_DEBUG_MULTIPLE_REST(ap, i, kj);
1045 	    }
1046 	    APSE_DEBUG(printf("\n"));
1047 	}
1048 
1049 	if (ap->exact_positions)
1050 	    _apse_exact_multiple(ap);
1051 
1052 	if (_apse_match_next_state(ap) == APSE_MATCH_STATE_END)
1053 	    return 1;
1054 
1055 	(void)memcpy(ap->prev_state, ap->state,
1056 		     ap->bytes_in_all_states);
1057     }
1058 
1059     return 0;
1060 }
1061 
_apse_match_single_complex(apse_t * ap)1062 static apse_bool_t _apse_match_single_complex(apse_t *ap) {
1063     /* single apse_vec_t, has_different_distances */
1064     APSE_DEBUG(printf("(match single complex)\n"));
1065     for ( ; ap->text_position < ap->text_size; ap->text_position++) {
1066 	unsigned char	o = ap->text[ap->text_position];
1067 	apse_vec_t	t =
1068 	    ap->pattern_mask[(unsigned int)o * ap->bitvectors_in_state];
1069 	apse_size_t	h, g;
1070 
1071 	APSE_NEXT_EXACT(ap->state, ap->prev_state, t, (apse_size_t)0, 1);
1072 	APSE_DEBUG_SINGLE(ap, (apse_size_t)0);
1073 
1074 	for (g = 0, h = 1; h <= ap->edit_distance; g = h, h++) {
1075 	    apse_bool_t has_insertions    = h <= ap->edit_insertions;
1076 	    apse_bool_t has_deletions     = h <= ap->edit_deletions;
1077 	    apse_bool_t has_substitutions = h <= ap->edit_substitutions;
1078 
1079 	    APSE_NEXT_COMMON(ap->state, ap->prev_state, t, h);
1080 	    if (has_insertions)
1081 		APSE_NEXT_INSERT(ap->state, ap->prev_state, h, g);
1082 	    if (has_deletions)
1083 		APSE_NEXT_DELETE(ap->state, h, g);
1084 	    if (has_substitutions)
1085 		APSE_NEXT_SUBSTI(ap->state, ap->prev_state, h, g);
1086 	    APSE_NEXT_CARRY(ap->state, h,
1087 			    has_deletions || has_substitutions ? 1 : 0);
1088 	    APSE_PREFIX_DELETE_MASK(ap);
1089 	    APSE_DEBUG_SINGLE(ap, h);
1090 	}
1091 
1092 	if (ap->exact_positions)
1093 	    ap->state[ap->edit_distance] &= ~ap->exact_mask[0];
1094 
1095 	if (_apse_match_next_state(ap) == APSE_MATCH_STATE_END)
1096 	    return 1;
1097 
1098 	(void)memcpy(ap->prev_state, ap->state,
1099 		     ap->bytes_in_all_states);
1100 
1101     }
1102 
1103     return 0;
1104 }
1105 
_apse_match_multiple_complex(apse_t * ap)1106 static apse_bool_t _apse_match_multiple_complex(apse_t *ap) {
1107     /* multiple apse_vec_t:s, has_different_distances */
1108     apse_size_t	h, i;
1109 
1110     APSE_DEBUG(printf("(match multiple complex)\n"));
1111     for ( ; ap->text_position < ap->text_size; ap->text_position++) {
1112 	unsigned char	o = ap->text[ap->text_position];
1113 	apse_vec_t	*t =
1114 	    ap->pattern_mask + (unsigned)o * ap->bitvectors_in_state;
1115 	apse_vec_t	c, d;
1116 
1117 	APSE_DEBUG_MULTIPLE_FIRST(ap, (apse_size_t)0);
1118 	for (c = 1, i = 0; i < ap->bitvectors_in_state; i++, c = d) {
1119 	    d = APSE_TEST_HIGH_BIT(ap->state[i]);
1120 	    APSE_NEXT_EXACT(ap->state, ap->prev_state, t[i], i, c);
1121 	    APSE_DEBUG_MULTIPLE_REST(ap, i, i);
1122 	}
1123 	APSE_DEBUG(printf("\n"));
1124 
1125 	for (h = 1; h <= ap->edit_distance; h++) {
1126 	    apse_size_t
1127 		kj = h * ap->bitvectors_in_state,
1128 		jj = kj - ap->bitvectors_in_state;
1129 
1130 	    apse_bool_t	has_insertions    = h <= ap->edit_insertions;
1131 	    apse_bool_t	has_deletions     = h <= ap->edit_deletions;
1132 	    apse_bool_t	has_substitutions = h <= ap->edit_substitutions;
1133 
1134 	    APSE_DEBUG_MULTIPLE_FIRST(ap, h);
1135 	    /* Is there such a thing as too much manual optimization? */
1136 	    if (has_insertions) {
1137 		if (has_deletions && has_substitutions) {
1138 		    for (c = 1, i = 0;
1139 			 i < ap->bitvectors_in_state;
1140 			 i++, kj++, jj++, c = d) {
1141 			d = APSE_TEST_HIGH_BIT(ap->state[kj]);
1142 			APSE_NEXT_COMMON(ap->state, ap->prev_state, t[i], kj);
1143 			APSE_NEXT_INSERT(ap->state, ap->prev_state, kj, jj);
1144 			APSE_NEXT_DELETE(ap->state, kj, jj);
1145 			APSE_NEXT_SUBSTI(ap->state, ap->prev_state, kj, jj);
1146 			APSE_NEXT_CARRY(ap->state, kj, c);
1147 			APSE_PREFIX_DELETE_MASK(ap);
1148 			APSE_DEBUG_MULTIPLE_REST(ap, i, kj);
1149 		    }
1150 		} else if (has_deletions) {
1151 		    for (c = 1, i = 0;
1152 			 i < ap->bitvectors_in_state;
1153 			 i++, kj++, jj++, c = d) {
1154 			d = APSE_TEST_HIGH_BIT(ap->state[kj]);
1155 			APSE_NEXT_COMMON(ap->state, ap->prev_state, t[i], kj);
1156 			APSE_NEXT_INSERT(ap->state, ap->prev_state, kj, jj);
1157 			APSE_NEXT_DELETE(ap->state, kj, jj);
1158 			APSE_NEXT_CARRY(ap->state, kj, c);
1159 			APSE_PREFIX_DELETE_MASK(ap);
1160 			APSE_DEBUG_MULTIPLE_REST(ap, i, kj);
1161 		    }
1162 		} else if (has_substitutions) {
1163 		    for (c = 1, i = 0;
1164 			 i < ap->bitvectors_in_state;
1165 			 i++, kj++, jj++, c = d) {
1166 			d = APSE_TEST_HIGH_BIT(ap->state[kj]);
1167 			APSE_NEXT_COMMON(ap->state, ap->prev_state, t[i], kj);
1168 			APSE_NEXT_INSERT(ap->state, ap->prev_state, kj, jj);
1169 			APSE_NEXT_SUBSTI(ap->state, ap->prev_state, kj, jj);
1170 			APSE_NEXT_CARRY(ap->state, kj, c);
1171 			APSE_PREFIX_DELETE_MASK(ap);
1172 			APSE_DEBUG_MULTIPLE_REST(ap, i, kj);
1173 		    }
1174 		} else {
1175 		    for (c = 1, i = 0;
1176 			 i < ap->bitvectors_in_state;
1177 			 i++, kj++, jj++, c = d) {
1178 			d = APSE_TEST_HIGH_BIT(ap->state[kj]);
1179 			APSE_NEXT_COMMON(ap->state, ap->prev_state, t[i], kj);
1180 			APSE_NEXT_INSERT(ap->state, ap->prev_state, kj, jj);
1181 			APSE_NEXT_CARRY(ap->state, kj, c);
1182 			APSE_PREFIX_DELETE_MASK(ap);
1183 			APSE_DEBUG_MULTIPLE_REST(ap, i, kj);
1184 		    }
1185 		}
1186 	    } else {
1187 		if (has_deletions && has_substitutions) {
1188 		    for (c = 1, i = 0;
1189 			 i < ap->bitvectors_in_state;
1190 			 i++, kj++, jj++, c = d) {
1191 			d = APSE_TEST_HIGH_BIT(ap->state[kj]);
1192 			APSE_NEXT_COMMON(ap->state, ap->prev_state, t[i], kj);
1193 			APSE_NEXT_DELETE(ap->state, kj, jj);
1194 			APSE_NEXT_SUBSTI(ap->state, ap->prev_state, kj, jj);
1195 			APSE_NEXT_CARRY(ap->state, kj, c);
1196 			APSE_PREFIX_DELETE_MASK(ap);
1197 			APSE_DEBUG_MULTIPLE_REST(ap, i, kj);
1198 		    }
1199 		} else if (has_deletions) {
1200 		    for (c = 1, i = 0;
1201 			 i < ap->bitvectors_in_state;
1202 			 i++, kj++, jj++, c = d) {
1203 			d = APSE_TEST_HIGH_BIT(ap->state[kj]);
1204 			APSE_NEXT_COMMON(ap->state, ap->prev_state, t[i], kj);
1205 			APSE_NEXT_DELETE(ap->state, kj, jj);
1206 			APSE_NEXT_CARRY(ap->state, kj, c);
1207 			APSE_PREFIX_DELETE_MASK(ap);
1208 			APSE_DEBUG_MULTIPLE_REST(ap, i, kj);
1209 		    }
1210 		} else if (has_substitutions) {
1211 		    for (c = 1, i = 0;
1212 			 i < ap->bitvectors_in_state;
1213 			 i++, kj++, jj++, c = d) {
1214 			d = APSE_TEST_HIGH_BIT(ap->state[kj]);
1215 			APSE_NEXT_COMMON(ap->state, ap->prev_state, t[i], kj);
1216 			APSE_NEXT_SUBSTI(ap->state, ap->prev_state, kj, jj);
1217 			APSE_NEXT_CARRY(ap->state, kj, c);
1218 			APSE_PREFIX_DELETE_MASK(ap);
1219 			APSE_DEBUG_MULTIPLE_REST(ap, i, kj);
1220 		    }
1221 		}
1222 	    }
1223 	    APSE_DEBUG(printf("\n"));
1224 
1225 
1226 	    if (ap->exact_positions)
1227 		_apse_exact_multiple(ap);
1228 
1229 	    if (_apse_match_next_state(ap) == APSE_MATCH_STATE_END)
1230 		return 1;
1231 
1232 	    (void)memcpy(ap->prev_state, ap->state,
1233 			 ap->bytes_in_all_states);
1234 	}
1235     }
1236 
1237     return 0;
1238 }
1239 
__apse_match(apse_t * ap,unsigned char * text,apse_size_t text_size)1240 static apse_bool_t __apse_match(apse_t		*ap,
1241 				unsigned char	*text,
1242 				apse_size_t	text_size) {
1243     apse_bool_t	did_match = 0;
1244 
1245     APSE_DEBUG(printf("(match enter)\n"));
1246 
1247     if (ap->match_state == APSE_MATCH_STATE_BOT) {
1248 	ap->text      = text;
1249 	if (ap->text_final_position == APSE_MATCH_BAD)
1250 	    ap->text_size = text_size;
1251 	else
1252 	    ap->text_size =
1253 		ap->text_final_position > text_size ?
1254 		    text_size : ap->text_final_position + 1;
1255 	_apse_match_bot(ap);
1256     } else if (ap->match_state == APSE_MATCH_STATE_EOT)
1257 	goto leave;
1258 
1259     if (ap->edit_deletions     >= ap->pattern_size ||
1260 	ap->edit_substitutions >= ap->pattern_size) {
1261 	ap->match_state   = APSE_MATCH_STATE_END;
1262 	ap->match_begin   = ap->text_initial_position;
1263 	ap->match_end     = ap->text_size - 1;
1264 	ap->text_position = ap->text_size;
1265 	goto out;
1266     }
1267 
1268     if (ap->pattern_size - ap->edit_deletions >
1269 	ap->text_size - ap->text_initial_position) {
1270 	ap->match_state   = APSE_MATCH_STATE_EOT;
1271 	ap->text_position = ap->text_size;
1272 	goto out;
1273     }
1274 
1275     if (text_size + ap->edit_distance < ap->pattern_size + ap->text_position) {
1276 	ap->text_position = ap->text_size;
1277 	goto eot;
1278     }
1279 
1280     if (ap->match_state == APSE_MATCH_STATE_SEARCH) {
1281 	ap->text_position++;
1282 	_apse_reset_state(ap);
1283     }
1284 
1285     if (ap->text_position_range != APSE_MATCH_BAD &&
1286 	ap->text_position - ap->text_initial_position >
1287 	ap->text_position_range) {
1288 	ap->match_state   = APSE_MATCH_STATE_END;
1289 	goto eot;
1290     }
1291 
1292     ap->match_state = APSE_MATCH_STATE_SEARCH;
1293     APSE_DEBUG(printf("(match state %s)\n", apse_match_state_name(ap)));
1294     APSE_DEBUG(printf("(match search %ld)\n", ap->text_position));
1295 
1296     if (ap->has_different_distances) {
1297 	if (ap->bitvectors_in_state == 1) {
1298 	    if (_apse_match_single_complex(ap))
1299 		goto out;
1300 	} else {
1301 	    if (_apse_match_multiple_complex(ap))
1302 		goto out;
1303 	}
1304     } else {
1305 	if (ap->bitvectors_in_state == 1) {
1306 	    if (_apse_match_single_simple(ap))
1307 		goto out;
1308 	} else {
1309 	    if (_apse_match_multiple_simple(ap))
1310 		goto out;
1311 	}
1312     }
1313 
1314  out:
1315 
1316     if (ap->match_state == APSE_MATCH_STATE_GREEDY) {
1317 	ap->match_state = APSE_MATCH_STATE_END;
1318 	ap->match_end   = ap->text_position - 1;
1319 	APSE_DEBUG(printf("(greedy match end %ld)\n", ap->match_end));
1320     }
1321 
1322     if (ap->match_state == APSE_MATCH_STATE_END) {
1323 	_apse_match_end(ap);
1324 	did_match = 1;
1325     }
1326 
1327   eot:
1328 
1329     if (ap->text_position == ap->text_size)
1330 	_apse_match_eot(ap);
1331 
1332   leave:
1333 
1334     APSE_DEBUG(printf("(match leave)\n"));
1335 
1336     return did_match;
1337 }
1338 
_apse_match(apse_t * ap,unsigned char * text,apse_size_t text_size)1339 static apse_bool_t _apse_match(apse_t 		*ap,
1340 			       unsigned char	*text,
1341 			       apse_size_t	text_size) {
1342     if (ap->use_minimal_distance) {
1343 	apse_set_edit_distance(ap, 0);
1344 	if (__apse_match(ap, text, text_size))
1345 	    return 1;
1346 	else {
1347 	    apse_size_t minimal_edit_distance;
1348 	    apse_size_t previous_edit_distance = 0;
1349 	    apse_size_t next_edit_distance;
1350 
1351 	    for (next_edit_distance = 1;
1352 		 next_edit_distance <= ap->pattern_size;
1353 		 next_edit_distance *= 2) {
1354 		apse_set_edit_distance(ap, next_edit_distance);
1355 		if (__apse_match(ap, text, text_size))
1356 		    break;
1357 		previous_edit_distance = next_edit_distance;
1358 	    }
1359 	    minimal_edit_distance = next_edit_distance;
1360 	    if (next_edit_distance > 1) {
1361 		do {
1362 		    minimal_edit_distance =
1363 			(previous_edit_distance + next_edit_distance) / 2;
1364 		    if (minimal_edit_distance == previous_edit_distance)
1365 			break;
1366 		    apse_set_edit_distance(ap, minimal_edit_distance);
1367 		    if (__apse_match(ap, text, text_size))
1368 			next_edit_distance     = minimal_edit_distance;
1369 		    else
1370 			previous_edit_distance = minimal_edit_distance;
1371 		} while (previous_edit_distance <= next_edit_distance);
1372 		if (!__apse_match(ap, text, text_size))
1373 		    minimal_edit_distance++;
1374 	    }
1375 	    apse_set_edit_distance(ap, minimal_edit_distance);
1376 	    __apse_match(ap, text, text_size);
1377 
1378 	    return 1;
1379 	}
1380     } else
1381 	return __apse_match(ap, text, text_size);
1382 }
1383 
apse_match(apse_t * ap,unsigned char * text,apse_size_t text_size)1384 apse_bool_t apse_match(apse_t *ap,
1385 		       unsigned char *text, apse_size_t text_size) {
1386     apse_bool_t did_match = _apse_match(ap, text, text_size);
1387 
1388     _apse_match_eot(ap);
1389     apse_reset(ap);
1390 
1391     return did_match;
1392 }
1393 
apse_match_next(apse_t * ap,unsigned char * text,apse_size_t text_size)1394 apse_bool_t apse_match_next(apse_t *ap,
1395 			    unsigned char *text, apse_size_t text_size) {
1396     apse_bool_t did_match = _apse_match(ap, text, text_size);
1397 
1398     if (!did_match)
1399 	ap->match_state = APSE_MATCH_STATE_BOT;
1400 
1401     return did_match;
1402 }
1403 
apse_index(apse_t * ap,unsigned char * text,apse_size_t text_size)1404 apse_ssize_t apse_index(apse_t *ap,
1405 			 unsigned char *text, apse_size_t text_size) {
1406     apse_size_t did_match = _apse_match(ap, text, text_size);
1407     _apse_match_eot(ap);
1408     ap->match_state = APSE_MATCH_STATE_BOT;
1409 
1410     return did_match ? ap->match_begin : APSE_MATCH_BAD;
1411 }
1412 
apse_index_next(apse_t * ap,unsigned char * text,apse_size_t text_size)1413 apse_ssize_t apse_index_next(apse_t *ap,
1414 			     unsigned char *text, apse_size_t text_size) {
1415     apse_bool_t did_match = _apse_match(ap, text, text_size);
1416 
1417     if (!did_match)
1418 	ap->match_state = APSE_MATCH_STATE_BOT;
1419 
1420     return did_match ? ap->match_begin : APSE_MATCH_BAD;
1421 }
1422 
_apse_slice(apse_t * ap,unsigned char * text,apse_size_t text_size,apse_size_t * match_begin,apse_size_t * match_size)1423 static apse_bool_t _apse_slice(apse_t *ap,
1424 			       unsigned char *text,
1425 			       apse_size_t text_size,
1426 			       apse_size_t *match_begin,
1427 			       apse_size_t *match_size) {
1428     apse_bool_t did_match = _apse_match(ap, text, text_size);
1429 
1430     if (did_match) {
1431 	if (match_begin)
1432 	    *match_begin = ap->match_begin;
1433 	if (match_size)
1434 	    *match_size	 = ap->match_end - ap->match_begin + 1;
1435     } else {
1436 	if (match_begin)
1437 	    *match_begin = APSE_MATCH_BAD;
1438 	if (match_size)
1439 	    *match_size	 = APSE_MATCH_BAD;
1440     }
1441 
1442     return did_match;
1443 }
1444 
apse_slice(apse_t * ap,unsigned char * text,apse_size_t text_size,apse_size_t * match_begin,apse_size_t * match_size)1445 apse_bool_t apse_slice(apse_t *ap,
1446 		       unsigned char *text,
1447 		       apse_size_t text_size,
1448 		       apse_size_t *match_begin,
1449 		       apse_size_t *match_size) {
1450     apse_bool_t did_match =
1451         _apse_slice(ap, text, text_size, match_begin, match_size);
1452 
1453     _apse_match_eot(ap);
1454     ap->match_state = APSE_MATCH_STATE_BOT;
1455 
1456     return did_match;
1457 }
1458 
apse_slice_next(apse_t * ap,unsigned char * text,apse_size_t text_size,apse_size_t * match_begin,apse_size_t * match_size)1459 apse_bool_t apse_slice_next(apse_t*		ap,
1460 			    unsigned char*	text,
1461 			    apse_size_t		text_size,
1462 			    apse_size_t*	match_begin,
1463 			    apse_size_t*	match_size) {
1464     apse_bool_t did_match =
1465         _apse_slice(ap, text, text_size, match_begin, match_size);
1466 
1467     if (!did_match)
1468 	ap->match_state = APSE_MATCH_STATE_BOT;
1469 
1470     return did_match;
1471 }
1472 
apse_set_custom_data(apse_t * ap,void * custom_data,apse_size_t custom_data_size)1473 void apse_set_custom_data(apse_t*	ap,
1474 			  void*		custom_data,
1475 			  apse_size_t	custom_data_size) {
1476     ap->custom_data      = custom_data;
1477     ap->custom_data_size = custom_data_size;
1478 }
1479 
apse_get_custom_data(apse_t * ap,apse_size_t * custom_data_size)1480 void* apse_get_custom_data(apse_t*	ap,
1481 			   apse_size_t*	custom_data_size) {
1482     if (custom_data_size)
1483 	*custom_data_size = ap->custom_data_size;
1484     return ap->custom_data;
1485 }
1486