1 /***********************************************************************
2 *
3 * Regular expression interface definitions for GNU Smalltalk
4 *
5 *
6 ***********************************************************************/
7
8 /***********************************************************************
9 *
10 * Copyright 2001, 2002, 2006, 2008 Free Software Foundation, Inc.
11 * Written by Paolo Bonzini and Dragomir Milevojevic.
12 *
13 * This file is part of GNU Smalltalk.
14 *
15 * GNU Smalltalk is free software; you can redistribute it and/or modify it
16 * under the terms of the GNU General Public License as published by the Free
17 * Software Foundation; either version 2, or (at your option) any later
18 * version.
19 *
20 * Linking GNU Smalltalk statically or dynamically with other modules is
21 * making a combined work based on GNU Smalltalk. Thus, the terms and
22 * conditions of the GNU General Public License cover the whole
23 * combination.
24 *
25 * In addition, as a special exception, the Free Software Foundation
26 * give you permission to combine GNU Smalltalk with free software
27 * programs or libraries that are released under the GNU LGPL and with
28 * independent programs running under the GNU Smalltalk virtual machine.
29 *
30 * You may copy and distribute such a system following the terms of the
31 * GNU GPL for GNU Smalltalk and the licenses of the other code
32 * concerned, provided that you include the source code of that other
33 * code when and as the GNU GPL requires distribution of source code.
34 *
35 * Note that people who make modified versions of GNU Smalltalk are not
36 * obligated to grant this special exception for their modified
37 * versions; it is their choice whether to do so. The GNU General
38 * Public License gives permission to release a modified version without
39 * this exception; this exception also makes it possible to release a
40 * modified version which carries forward this exception.
41 *
42 * GNU Smalltalk is distributed in the hope that it will be useful, but WITHOUT
43 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
44 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
45 * more details.
46 *
47 * You should have received a copy of the GNU General Public License along with
48 * GNU Smalltalk; see the file COPYING. If not, write to the Free Software
49 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
50 *
51 ***********************************************************************/
52
53 #ifdef HAVE_CONFIG_H
54 #include "config.h"
55 #endif
56
57 #include "gstpriv.h"
58 #include "regex.h"
59 #include "re.h"
60
61 #if STDC_HEADERS
62 #include <stdlib.h>
63 #include <string.h>
64 #endif
65
66 /* Regex caching facility */
67
68 #define REGEX_CACHE_SIZE 10
69
70 typedef enum RegexCachingEnum
71 {
72 REGEX_NOT_CACHED,
73 REGEX_CACHE_HIT,
74 REGEX_CACHE_MISS
75 }
76 RegexCaching;
77
78 typedef struct RegexCacheEntry
79 {
80 OOP patternOOP;
81 struct pre_pattern_buffer *regex;
82 }
83 RegexCacheEntry;
84
85 static RegexCaching lookupRegex (OOP patternOOP,
86 struct pre_pattern_buffer **pRegex);
87 static const char *compileRegex (OOP patternOOP,
88 struct pre_pattern_buffer *regex);
89 static struct pre_pattern_buffer *allocateNewRegex (void);
90 static void markRegexAsMRU (int i);
91 static void init_re (void);
92
93 static RegexCacheEntry cache[REGEX_CACHE_SIZE];
94
95 /* Smalltalk globals */
96 static OOP regexClassOOP, resultsClassOOP;
97
98 /* Allocate a buffer to be passed to the regular expression matcher */
99 struct pre_pattern_buffer *
allocateNewRegex(void)100 allocateNewRegex (void)
101 {
102 struct pre_pattern_buffer *regex;
103 regex = (struct pre_pattern_buffer *)
104 calloc (1, sizeof (struct pre_pattern_buffer));
105
106 regex->allocated = 0;
107 regex->fastmap = malloc (1 << BYTEWIDTH);
108
109 return regex;
110 }
111
112 /* Compile a pattern that's stored into an OOP. Answer an error, or NULL. */
113 const char *
compileRegex(OOP patternOOP,struct pre_pattern_buffer * regex)114 compileRegex (OOP patternOOP, struct pre_pattern_buffer *regex)
115 {
116 int patternLength;
117 const char *pattern;
118 const char *ress;
119
120 pattern = &STRING_OOP_AT (OOP_TO_OBJ (patternOOP), 1);
121 patternLength = _gst_basic_size (patternOOP);
122
123 /* compile pattern */
124 ress = pre_compile_pattern (pattern, patternLength, regex);
125 return ress;
126 }
127
128 /* Move the i-th entry of the cache to the first position */
129 void
markRegexAsMRU(int i)130 markRegexAsMRU (int i)
131 {
132 RegexCacheEntry saved;
133 int j;
134
135 saved = cache[i];
136 for (j = i; j > 0; j--)
137 cache[j] = cache[j - 1];
138
139 cache[0] = saved;
140 }
141
142 /* If patternOOP is not a Regex, answer REGEX_NOT_CACHED. Else look
143 it up in the cache and move it to its top (so that it is marked as
144 most recently used). Answer REGEX_CACHE_HIT if it is found, or
145 REGEX_CACHE_MISS if it is not.
146
147 pRegex will point to the compiled regex if there is a cache hit, else
148 lookupRegex will only initialize it, and it will be the caller's
149 responsibility to compile the regex into the buffer that is returned.
150 If the patternOOP is not a Regex (i.e. REGEX_NOT_CACHED returned), the
151 caller will also have to free the buffer pointed to by pRegex. */
152 RegexCaching
lookupRegex(OOP patternOOP,struct pre_pattern_buffer ** pRegex)153 lookupRegex (OOP patternOOP, struct pre_pattern_buffer **pRegex)
154 {
155 int i;
156 RegexCaching result;
157
158 if (!IS_OOP_READONLY (patternOOP))
159 {
160 *pRegex = allocateNewRegex ();
161 return REGEX_NOT_CACHED;
162 }
163
164 /* Search for the Regex object in the cache */
165 for (i = 0; i < REGEX_CACHE_SIZE; i++)
166 if (cache[i].patternOOP == patternOOP)
167 break;
168
169 if (i < REGEX_CACHE_SIZE)
170 result = REGEX_CACHE_HIT;
171
172 else
173 {
174 /* Kick out the least recently used regexp */
175 i--;
176 result = REGEX_CACHE_MISS;
177
178 /* Register the objects we're caching with the virtual machine */
179 if (cache[i].patternOOP)
180 _gst_unregister_oop (cache[i].patternOOP);
181
182 _gst_register_oop (patternOOP);
183 cache[i].patternOOP = patternOOP;
184 }
185
186 /* Mark the object as most recently used */
187 if (!cache[i].regex)
188 cache[i].regex = allocateNewRegex ();
189
190 markRegexAsMRU (i);
191 *pRegex = cache[0].regex;
192 return result;
193 }
194
195 /* Create a Regex object. We look for one that points to the same string
196 in the cache (so that we can optimize a loop that repeatedly calls
197 asRegex; if none is found, we create one ex-novo.
198 Note that Regex and String objects have the same layout; only, Regexes
199 are read-only so that we can support this kind of "interning" them. */
200 OOP
_gst_re_make_cacheable(OOP patternOOP)201 _gst_re_make_cacheable (OOP patternOOP)
202 {
203 OOP regexOOP;
204 const char *pattern;
205 char *regex;
206 struct pre_pattern_buffer *compiled;
207 int patternLength;
208 int i;
209
210 if (!regexClassOOP)
211 init_re ();
212
213 if (IS_OOP_READONLY (patternOOP))
214 return patternOOP;
215
216 /* Search in the cache */
217 patternLength = _gst_basic_size (patternOOP);
218 pattern = &STRING_OOP_AT (OOP_TO_OBJ (patternOOP), 1);
219
220 for (i = 0; i < REGEX_CACHE_SIZE; i++)
221 {
222 if (!cache[i].regex)
223 break;
224
225 regexOOP = cache[i].patternOOP;
226 regex = &STRING_OOP_AT (OOP_TO_OBJ (regexOOP), 1);
227 if (_gst_basic_size (regexOOP) == patternLength &&
228 memcmp (regex, pattern, patternLength) == 0)
229 {
230 markRegexAsMRU (i);
231 return regexOOP;
232 }
233 }
234
235 /* No way, must allocate a new Regex object */
236 regexOOP = _gst_object_alloc (regexClassOOP, patternLength);
237 regex = &STRING_OOP_AT (OOP_TO_OBJ (regexOOP), 1);
238 memcpy (regex, pattern, patternLength);
239
240 /* Put it in the cache (we must compile it to check that it
241 * is well-formed).
242 */
243 lookupRegex (regexOOP, &compiled);
244 if (compileRegex (patternOOP, compiled) != NULL)
245 return _gst_nil_oop;
246 else
247 return regexOOP;
248 }
249
250
251 typedef struct _gst_interval
252 {
253 OBJ_HEADER;
254 OOP fromOOP;
255 OOP toOOP;
256 OOP stepOOP;
257 } *gst_interval;
258
259 typedef struct _gst_registers
260 {
261 OBJ_HEADER;
262 OOP subjectOOP;
263 OOP fromOOP;
264 OOP toOOP;
265 OOP registersOOP;
266 OOP matchOOP;
267 OOP cacheOOP;
268 } *gst_registers;
269
270 static OOP
make_re_results(OOP srcOOP,struct pre_registers * regs)271 make_re_results (OOP srcOOP, struct pre_registers *regs)
272 {
273 OOP resultsOOP;
274 gst_registers results;
275
276 int i;
277 if (!regs->beg || regs->beg[0] == -1)
278 return _gst_nil_oop;
279
280 resultsOOP = _gst_object_alloc (resultsClassOOP, 0);
281 results = (gst_registers) OOP_TO_OBJ (resultsOOP);
282 results->subjectOOP = srcOOP;
283 results->fromOOP = FROM_INT (regs->beg[0] + 1);
284 results->toOOP = FROM_INT (regs->end[0]);
285 if (regs->num_regs > 1)
286 {
287 OOP registersOOP = _gst_object_alloc (_gst_array_class, regs->num_regs - 1);
288 results = (gst_registers) OOP_TO_OBJ (resultsOOP);
289 results->registersOOP = registersOOP;
290 }
291
292 for (i = 1; i < regs->num_regs; i++)
293 {
294 OOP intervalOOP;
295 if (regs->beg[i] == -1)
296 intervalOOP = _gst_nil_oop;
297 else
298 {
299 gst_interval interval;
300 intervalOOP = _gst_object_alloc (_gst_interval_class, 0);
301 interval = (gst_interval) OOP_TO_OBJ (intervalOOP);
302 interval->fromOOP = FROM_INT (regs->beg[i] + 1);
303 interval->toOOP = FROM_INT (regs->end[i]);
304 interval->stepOOP = FROM_INT (1);
305 }
306
307 /* We need to reload results as it may be invalidated by GC. */
308 results = (gst_registers) OOP_TO_OBJ (resultsOOP);
309 _gst_oop_at_put (results->registersOOP, i - 1, intervalOOP);
310 }
311
312 return resultsOOP;
313 }
314
315 /* Search helper function */
316
317 OOP
_gst_re_search(OOP srcOOP,OOP patternOOP,int from,int to)318 _gst_re_search (OOP srcOOP, OOP patternOOP, int from, int to)
319 {
320 const char *src;
321 struct pre_pattern_buffer *regex;
322 struct pre_registers *regs;
323 RegexCaching caching;
324 OOP resultOOP;
325
326 if (!regexClassOOP)
327 init_re ();
328
329 caching = lookupRegex (patternOOP, ®ex);
330 if (caching != REGEX_CACHE_HIT && compileRegex (patternOOP, regex) != NULL)
331 return NULL;
332
333 /* now search */
334 src = &STRING_OOP_AT (OOP_TO_OBJ (srcOOP), 1);
335 regs = (struct pre_registers *) calloc (1, sizeof (struct pre_registers));
336 pre_search (regex, src, to, from - 1, to - from + 1, regs);
337
338 if (caching == REGEX_NOT_CACHED)
339 pre_free_pattern (regex);
340
341 resultOOP = make_re_results (srcOOP, regs);
342 pre_free_registers(regs);
343 free(regs);
344 return resultOOP;
345 }
346
347
348 /* Match helper function */
349
350 int
_gst_re_match(OOP srcOOP,OOP patternOOP,int from,int to)351 _gst_re_match (OOP srcOOP, OOP patternOOP, int from, int to)
352 {
353 int res = 0;
354 const char *src;
355 struct pre_pattern_buffer *regex;
356 RegexCaching caching;
357
358 if (!regexClassOOP)
359 init_re ();
360
361 caching = lookupRegex (patternOOP, ®ex);
362 if (caching != REGEX_CACHE_HIT && compileRegex (patternOOP, regex) != NULL)
363 return -100;
364
365 /* now search */
366 src = &STRING_OOP_AT (OOP_TO_OBJ (srcOOP), 1);
367 res = pre_match (regex, src, to, from - 1, NULL);
368
369 if (caching == REGEX_NOT_CACHED)
370 pre_free_pattern (regex);
371
372 return res;
373 }
374
375
376
377 /* Initialize regex.c */
378 static void
init_re(void)379 init_re (void)
380 {
381 /* This is a ASCII downcase-table. We don't make any assumptions
382 about what bytes >=128 are, so can't downcase them. */
383 static const char casetable[256] =
384 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
385 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
386 ' ', '!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/',
387 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?',
388 '@', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
389 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[', '\\', ']', '^', '_',
390 '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
391 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', 127,
392 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
393 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
394 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
395 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
396 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
397 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
398 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
399 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255};
400 pre_set_casetable (casetable);
401
402 regexClassOOP = _gst_class_name_to_oop ("Regex");
403 resultsClassOOP = _gst_class_name_to_oop ("Kernel.MatchingRegexResults");
404 }
405