1 /***********************************************************************
2  *
3  *	Regular expression interface definitions for GNU Smalltalk
4  *
5  *
6  ***********************************************************************/
7 
8 /***********************************************************************
9  *
10  * Copyright 2001, 2002, 2006, 2008 Free Software Foundation, Inc.
11  * Written by Paolo Bonzini and Dragomir Milevojevic.
12  *
13  * This file is part of GNU Smalltalk.
14  *
15  * GNU Smalltalk is free software; you can redistribute it and/or modify it
16  * under the terms of the GNU General Public License as published by the Free
17  * Software Foundation; either version 2, or (at your option) any later
18  * version.
19  *
20  * Linking GNU Smalltalk statically or dynamically with other modules is
21  * making a combined work based on GNU Smalltalk.  Thus, the terms and
22  * conditions of the GNU General Public License cover the whole
23  * combination.
24  *
25  * In addition, as a special exception, the Free Software Foundation
26  * give you permission to combine GNU Smalltalk with free software
27  * programs or libraries that are released under the GNU LGPL and with
28  * independent programs running under the GNU Smalltalk virtual machine.
29  *
30  * You may copy and distribute such a system following the terms of the
31  * GNU GPL for GNU Smalltalk and the licenses of the other code
32  * concerned, provided that you include the source code of that other
33  * code when and as the GNU GPL requires distribution of source code.
34  *
35  * Note that people who make modified versions of GNU Smalltalk are not
36  * obligated to grant this special exception for their modified
37  * versions; it is their choice whether to do so.  The GNU General
38  * Public License gives permission to release a modified version without
39  * this exception; this exception also makes it possible to release a
40  * modified version which carries forward this exception.
41  *
42  * GNU Smalltalk is distributed in the hope that it will be useful, but WITHOUT
43  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
44  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
45  * more details.
46  *
47  * You should have received a copy of the GNU General Public License along with
48  * GNU Smalltalk; see the file COPYING.  If not, write to the Free Software
49  * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
50  *
51  ***********************************************************************/
52 
53 #ifdef HAVE_CONFIG_H
54 #include "config.h"
55 #endif
56 
57 #include "gstpriv.h"
58 #include "regex.h"
59 #include "re.h"
60 
61 #if STDC_HEADERS
62 #include <stdlib.h>
63 #include <string.h>
64 #endif
65 
66 /* Regex caching facility */
67 
68 #define REGEX_CACHE_SIZE 10
69 
70 typedef enum RegexCachingEnum
71 {
72   REGEX_NOT_CACHED,
73   REGEX_CACHE_HIT,
74   REGEX_CACHE_MISS
75 }
76 RegexCaching;
77 
78 typedef struct RegexCacheEntry
79 {
80   OOP patternOOP;
81   struct pre_pattern_buffer *regex;
82 }
83 RegexCacheEntry;
84 
85 static RegexCaching lookupRegex (OOP patternOOP,
86 				 struct pre_pattern_buffer **pRegex);
87 static const char *compileRegex (OOP patternOOP,
88 				 struct pre_pattern_buffer *regex);
89 static struct pre_pattern_buffer *allocateNewRegex (void);
90 static void markRegexAsMRU (int i);
91 static void init_re (void);
92 
93 static RegexCacheEntry cache[REGEX_CACHE_SIZE];
94 
95 /* Smalltalk globals */
96 static OOP regexClassOOP, resultsClassOOP;
97 
98 /* Allocate a buffer to be passed to the regular expression matcher */
99 struct pre_pattern_buffer *
allocateNewRegex(void)100 allocateNewRegex (void)
101 {
102   struct pre_pattern_buffer *regex;
103   regex = (struct pre_pattern_buffer *)
104     calloc (1, sizeof (struct pre_pattern_buffer));
105 
106   regex->allocated = 0;
107   regex->fastmap = malloc (1 << BYTEWIDTH);
108 
109   return regex;
110 }
111 
112 /* Compile a pattern that's stored into an OOP.  Answer an error, or NULL. */
113 const char *
compileRegex(OOP patternOOP,struct pre_pattern_buffer * regex)114 compileRegex (OOP patternOOP, struct pre_pattern_buffer *regex)
115 {
116   int patternLength;
117   const char *pattern;
118   const char *ress;
119 
120   pattern = &STRING_OOP_AT (OOP_TO_OBJ (patternOOP), 1);
121   patternLength = _gst_basic_size (patternOOP);
122 
123   /* compile pattern */
124   ress = pre_compile_pattern (pattern, patternLength, regex);
125   return ress;
126 }
127 
128 /* Move the i-th entry of the cache to the first position */
129 void
markRegexAsMRU(int i)130 markRegexAsMRU (int i)
131 {
132   RegexCacheEntry saved;
133   int j;
134 
135   saved = cache[i];
136   for (j = i; j > 0; j--)
137     cache[j] = cache[j - 1];
138 
139   cache[0] = saved;
140 }
141 
142 /* If patternOOP is not a Regex, answer REGEX_NOT_CACHED.  Else look
143    it up in the cache and move it to its top (so that it is marked as
144    most recently used).  Answer REGEX_CACHE_HIT if it is found, or
145    REGEX_CACHE_MISS if it is not.
146 
147    pRegex will point to the compiled regex if there is a cache hit, else
148    lookupRegex will only initialize it, and it will be the caller's
149    responsibility to compile the regex into the buffer that is returned.
150    If the patternOOP is not a Regex (i.e. REGEX_NOT_CACHED returned), the
151    caller will also have to free the buffer pointed to by pRegex.  */
152 RegexCaching
lookupRegex(OOP patternOOP,struct pre_pattern_buffer ** pRegex)153 lookupRegex (OOP patternOOP, struct pre_pattern_buffer **pRegex)
154 {
155   int i;
156   RegexCaching result;
157 
158   if (!IS_OOP_READONLY (patternOOP))
159     {
160       *pRegex = allocateNewRegex ();
161       return REGEX_NOT_CACHED;
162     }
163 
164   /* Search for the Regex object in the cache */
165   for (i = 0; i < REGEX_CACHE_SIZE; i++)
166     if (cache[i].patternOOP == patternOOP)
167       break;
168 
169   if (i < REGEX_CACHE_SIZE)
170     result = REGEX_CACHE_HIT;
171 
172   else
173     {
174       /* Kick out the least recently used regexp */
175       i--;
176       result = REGEX_CACHE_MISS;
177 
178       /* Register the objects we're caching with the virtual machine */
179       if (cache[i].patternOOP)
180 	_gst_unregister_oop (cache[i].patternOOP);
181 
182       _gst_register_oop (patternOOP);
183       cache[i].patternOOP = patternOOP;
184     }
185 
186   /* Mark the object as most recently used */
187   if (!cache[i].regex)
188     cache[i].regex = allocateNewRegex ();
189 
190   markRegexAsMRU (i);
191   *pRegex = cache[0].regex;
192   return result;
193 }
194 
195 /* Create a Regex object.  We look for one that points to the same string
196    in the cache (so that we can optimize a loop that repeatedly calls
197    asRegex; if none is found, we create one ex-novo.
198    Note that Regex and String objects have the same layout; only, Regexes
199    are read-only so that we can support this kind of "interning" them.  */
200 OOP
_gst_re_make_cacheable(OOP patternOOP)201 _gst_re_make_cacheable (OOP patternOOP)
202 {
203   OOP regexOOP;
204   const char *pattern;
205   char *regex;
206   struct pre_pattern_buffer *compiled;
207   int patternLength;
208   int i;
209 
210   if (!regexClassOOP)
211     init_re ();
212 
213   if (IS_OOP_READONLY (patternOOP))
214     return patternOOP;
215 
216   /* Search in the cache */
217   patternLength = _gst_basic_size (patternOOP);
218   pattern = &STRING_OOP_AT (OOP_TO_OBJ (patternOOP), 1);
219 
220   for (i = 0; i < REGEX_CACHE_SIZE; i++)
221     {
222       if (!cache[i].regex)
223 	break;
224 
225       regexOOP = cache[i].patternOOP;
226       regex = &STRING_OOP_AT (OOP_TO_OBJ (regexOOP), 1);
227       if (_gst_basic_size (regexOOP) == patternLength &&
228 	  memcmp (regex, pattern, patternLength) == 0)
229 	{
230 	  markRegexAsMRU (i);
231 	  return regexOOP;
232 	}
233     }
234 
235   /* No way, must allocate a new Regex object */
236   regexOOP = _gst_object_alloc (regexClassOOP, patternLength);
237   regex = &STRING_OOP_AT (OOP_TO_OBJ (regexOOP), 1);
238   memcpy (regex, pattern, patternLength);
239 
240   /* Put it in the cache (we must compile it to check that it
241    * is well-formed).
242    */
243   lookupRegex (regexOOP, &compiled);
244   if (compileRegex (patternOOP, compiled) != NULL)
245     return _gst_nil_oop;
246   else
247     return regexOOP;
248 }
249 
250 
251 typedef struct _gst_interval
252 {
253   OBJ_HEADER;
254   OOP fromOOP;
255   OOP toOOP;
256   OOP stepOOP;
257 } *gst_interval;
258 
259 typedef struct _gst_registers
260 {
261   OBJ_HEADER;
262   OOP subjectOOP;
263   OOP fromOOP;
264   OOP toOOP;
265   OOP registersOOP;
266   OOP matchOOP;
267   OOP cacheOOP;
268 } *gst_registers;
269 
270 static OOP
make_re_results(OOP srcOOP,struct pre_registers * regs)271 make_re_results (OOP srcOOP, struct pre_registers *regs)
272 {
273   OOP resultsOOP;
274   gst_registers results;
275 
276   int i;
277   if (!regs->beg || regs->beg[0] == -1)
278     return _gst_nil_oop;
279 
280   resultsOOP = _gst_object_alloc (resultsClassOOP, 0);
281   results = (gst_registers) OOP_TO_OBJ (resultsOOP);
282   results->subjectOOP = srcOOP;
283   results->fromOOP = FROM_INT (regs->beg[0] + 1);
284   results->toOOP = FROM_INT (regs->end[0]);
285   if (regs->num_regs > 1)
286     {
287       OOP registersOOP = _gst_object_alloc (_gst_array_class, regs->num_regs - 1);
288       results = (gst_registers) OOP_TO_OBJ (resultsOOP);
289       results->registersOOP = registersOOP;
290     }
291 
292   for (i = 1; i < regs->num_regs; i++)
293     {
294       OOP intervalOOP;
295       if (regs->beg[i] == -1)
296 	intervalOOP = _gst_nil_oop;
297       else
298 	{
299           gst_interval interval;
300 	  intervalOOP = _gst_object_alloc (_gst_interval_class, 0);
301           interval = (gst_interval) OOP_TO_OBJ (intervalOOP);
302           interval->fromOOP = FROM_INT (regs->beg[i] + 1);
303           interval->toOOP = FROM_INT (regs->end[i]);
304           interval->stepOOP = FROM_INT (1);
305 	}
306 
307       /* We need to reload results as it may be invalidated by GC.  */
308       results = (gst_registers) OOP_TO_OBJ (resultsOOP);
309       _gst_oop_at_put (results->registersOOP, i - 1, intervalOOP);
310     }
311 
312   return resultsOOP;
313 }
314 
315 /* Search helper function */
316 
317 OOP
_gst_re_search(OOP srcOOP,OOP patternOOP,int from,int to)318 _gst_re_search (OOP srcOOP, OOP patternOOP, int from, int to)
319 {
320   const char *src;
321   struct pre_pattern_buffer *regex;
322   struct pre_registers *regs;
323   RegexCaching caching;
324   OOP resultOOP;
325 
326   if (!regexClassOOP)
327     init_re ();
328 
329   caching = lookupRegex (patternOOP, &regex);
330   if (caching != REGEX_CACHE_HIT && compileRegex (patternOOP, regex) != NULL)
331     return NULL;
332 
333   /* now search */
334   src = &STRING_OOP_AT (OOP_TO_OBJ (srcOOP), 1);
335   regs = (struct pre_registers *) calloc (1, sizeof (struct pre_registers));
336   pre_search (regex, src, to, from - 1, to - from + 1, regs);
337 
338   if (caching == REGEX_NOT_CACHED)
339     pre_free_pattern (regex);
340 
341   resultOOP = make_re_results (srcOOP, regs);
342   pre_free_registers(regs);
343   free(regs);
344   return resultOOP;
345 }
346 
347 
348 /* Match helper function */
349 
350 int
_gst_re_match(OOP srcOOP,OOP patternOOP,int from,int to)351 _gst_re_match (OOP srcOOP, OOP patternOOP, int from, int to)
352 {
353   int res = 0;
354   const char *src;
355   struct pre_pattern_buffer *regex;
356   RegexCaching caching;
357 
358   if (!regexClassOOP)
359     init_re ();
360 
361   caching = lookupRegex (patternOOP, &regex);
362   if (caching != REGEX_CACHE_HIT && compileRegex (patternOOP, regex) != NULL)
363     return -100;
364 
365   /* now search */
366   src = &STRING_OOP_AT (OOP_TO_OBJ (srcOOP), 1);
367   res = pre_match (regex, src, to, from - 1, NULL);
368 
369   if (caching == REGEX_NOT_CACHED)
370     pre_free_pattern (regex);
371 
372   return res;
373 }
374 
375 
376 
377 /* Initialize regex.c */
378 static void
init_re(void)379 init_re (void)
380 {
381   /* This is a ASCII downcase-table.  We don't make any assumptions
382      about what bytes >=128 are, so can't downcase them. */
383   static const char casetable[256] =
384     {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
385      16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
386      ' ', '!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/',
387      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?',
388      '@', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
389      'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[', '\\', ']', '^', '_',
390      '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
391      'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', 127,
392      128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
393      144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
394      160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
395      176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
396      192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
397      208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
398      224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
399      240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255};
400   pre_set_casetable (casetable);
401 
402   regexClassOOP = _gst_class_name_to_oop ("Regex");
403   resultsClassOOP = _gst_class_name_to_oop ("Kernel.MatchingRegexResults");
404 }
405