1 /*
2   mxbmse -- Fast Boyer Moore Search Algorithm (Version 0.9)
3 
4   The implementation is reentrant and thread safe. While the
5   general ideas behind the Boyer Moore algorithm are in the public
6   domain, this implementation falls under the following copyright:
7 
8   Copyright (c) 1997-2000, Marc-Andre Lemburg; mailto:mal@lemburg.com
9   Copyright (c) 2000-2002, eGenix.com Software GmbH; mailto:info@egenix.com
10 
11                         All Rights Reserved
12 
13   See the documentation for copying information or contact the author
14   (mal@lemburg.com).
15 */
16 
17 /* to turn on the debugging printfs (DPRINTF):*/
18 /* #define MAL_DEBUG */
19 
20 /* Logging file used by debugging facility */
21 #ifndef MAL_DEBUG_OUTPUTFILE
22 # define MAL_DEBUG_OUTPUTFILE "mxTextSearch.log"
23 #endif
24 
25 #ifdef MAL_DEBUG_WITH_PYTHON
26 # include "mx.h"
27 #endif
28 
29 #include "mxstdlib.h"
30 #include "mxbmse.h"
31 
32 /* --- Fast Boyer-Moore Implementation (8-bit) ---------------------------- */
33 
bm_init(char * match,int match_len)34 mxbmse_data *bm_init(char *match,
35 		     int match_len)
36 {
37     mxbmse_data *c;
38     int i;
39     BM_SHIFT_TYPE *shift;
40     char *m;
41 
42     c = newstruct(mxbmse_data);
43     c->match = match;
44     c->match_len = match_len;
45     c->eom = match + match_len - 1;
46 
47     /* Length 1 matching does not use a shift table */
48     if (match_len == 1)
49 	return c;
50 
51     /* Init shift table */
52     for ( shift = c->shift, i = 256; i > 0; i--, shift++ )
53 	*shift = (BM_SHIFT_TYPE) match_len;
54 
55     DPRINTF("shift table for match='%s'\n",match);
56     for ( shift = c->shift, m = match, i = match_len - 1;
57 	  i >= 0;
58 	  i--, m++ ) {
59 	shift[ (unsigned char) *m ] = (BM_SHIFT_TYPE) i;
60 	DPRINTF("  char = '%c'  shift = %i\n", *m, i);
61     }
62 
63     return c;
64 }
65 
bm_free(mxbmse_data * c)66 void bm_free(mxbmse_data *c)
67 {
68     if (c)
69 	free(c);
70 }
71 
bm_search(mxbmse_data * c,char * text,int start,int text_len)72 int bm_search(mxbmse_data *c,
73 	      char *text,
74 	      int start,
75 	      int text_len)
76 {
77     register char *pt;
78     register char *eot = text + text_len;
79 
80     /* Error check */
81     if (c == NULL)
82 	return -1;
83 
84     /* Init text pointer */
85     pt = text + start + c->match_len - 1;
86 
87     DPRINTF("Init :  %2i %20.20s \t text: %2i %20.20s\n",
88 	    c->match_len,c->match,start,text+start);
89 
90     if (c->match_len > 1)
91 	for (;;) {
92 	    register char *pm;
93 
94 	    pm = c->eom;
95 
96 	    for (;pt < eot && *pt != *pm;
97 		  pt += c->shift[(unsigned char) *pt]);
98 
99 	    if (pt >= eot)
100 		break;
101 
102 	    /* First char matches.. what about the others ? */
103 	    {
104 		register int im = c->match_len;
105 
106 		do {
107 		    DPRINTF("=match: %2i '%20.20s' \t text: '%20.20s'\n",
108 			    im,pm,pt);
109 		    if (--im == 0)
110 			/* Match */
111 			return pt - text + c->match_len;
112 		    pt--;
113 		    pm--;
114 		} while (*pt == *pm);
115 
116 		/* Mismatch after match: use shift-table */
117 		{
118 		    register int a,b;
119 
120 		    a = c->shift[(unsigned char) *pt];
121 		    b = c->match_len - im + 1;
122 		    DPRINTF("!match: %2i '%20.20s' \t text: '%20.20s' "
123 			    "(sh=%i)\n",
124 			    im,pm,pt,max(a,b));
125 		    pt += (a > b) ? a : b;
126 		}
127 	    }
128 
129 	}
130 
131     /* Special case: matching string has length 1 */
132     else {
133 	register char m = *c->eom;
134 
135 	for (;pt < eot; pt++)
136 	    if (*pt == m)
137 		/* Match */
138 		return pt - text + 1;
139     }
140 
141     return start; /* no match */
142 }
143 
144 /* bm search using the translate table -- 45% slower */
145 
bm_tr_search(mxbmse_data * c,char * text,int start,int text_len,char * tr)146 int bm_tr_search(mxbmse_data *c,
147 		 char *text,
148 		 int start,
149 		 int text_len,
150 		 char *tr)
151 {
152     register char *pt;
153     register char *eot = text + text_len;
154 
155     /* Error check */
156     if (c == NULL)
157 	return -1;
158 
159     /* Init text pointer */
160     pt = text + start + c->match_len - 1;
161 
162     DPRINTF("Init :  %2i '%20.20s' \t text: %2i '%20.20s'\n",
163 	    c->match_len,c->match,start,text+start);
164 
165     if (c->match_len > 1)
166 	for (;;) {
167 	    register char *pm;
168 
169 	    pm = c->eom;
170 
171 	    for (;pt < eot && tr[(unsigned char) *pt] != *pm;
172 		 pt += c->shift[(unsigned char) tr[(unsigned char) *pt]]);
173 
174 	    if (pt >= eot)
175 		break;
176 
177 	    /* First char matches.. what about the others ? */
178 	    {
179 		register int im = c->match_len;
180 
181 		do {
182 		    DPRINTF("=match: %2i '%20.20s' \t text: '%20.20s'\n",
183 			    im,pm,pt);
184 		    if (--im == 0)
185 			/* Match */
186 			return pt - text + c->match_len;
187 		    pt--;
188 		    pm--;
189 		} while (tr[(unsigned char) *pt] == *pm);
190 
191 		/* Mismatch after match: use shift-table */
192 		{
193 		    register int a,b;
194 
195 		    a = c->shift[(unsigned char) tr[(unsigned char) *pt]];
196 		    b = c->match_len - im + 1;
197 		    DPRINTF("!match: %2i '%20.20s' \t text: '%20.20s' "
198 			    "(sh=%i)\n",
199 			    im,pm,pt,max(a,b));
200 		    pt += (a > b)?a:b;
201 		}
202 	    }
203 
204 	}
205 
206     /* Special case: matching string has length 1 */
207     else {
208 	register char m = *c->eom;
209 
210 	for (;pt < eot; pt++)
211 	    if (*pt == m)
212 		/* Match */
213 		return pt - text + 1;
214     }
215 
216     return start; /* no match */
217 }
218 
219