1 /* find.c -- Searching and replacing
2    Copyright (C) 1993, 1994 John Harper <john@dcs.warwick.ac.uk>
3    $Id$
4 
5    This file is part of Jade.
6 
7    Jade is free software; you can redistribute it and/or modify it
8    under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 2, or (at your option)
10    any later version.
11 
12    Jade is distributed in the hope that it will be useful, but
13    WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16 
17    You should have received a copy of the GNU General Public License
18    along with Jade; see the file COPYING.	If not, write to
19    the Free Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
20 
21 #define _GNU_SOURCE
22 
23 #include "repint.h"
24 
25 #include <string.h>
26 #include <ctype.h>
27 #include <stdlib.h>
28 #include <assert.h>
29 
30 /* Hooks for dealing with the rep_reg_obj match type. */
31 void (*rep_regsub_fun)(int, rep_regsubs *, char *, char *, void *);
32 int (*rep_regsublen_fun)(int, rep_regsubs *, char *, void *);
33 
34 
35 /* Compiling regexps. */
36 
37 /* A linked list is used to store all recently-used regexps in MRU
38    order. At GC the regexps at the tail of the list are freed to
39    satisfy the size limit.
40 
41    It might be better to use a hash-table. But by experience it seems
42    that the cache is usually quite small, and therefore searching the
43    list each compilation isn't too bad (and it makes the gc easier).
44 
45    Also, the hit-ratio is very good (as I'm typing this, ~0.97) */
46 
47 struct cached_regexp {
48     struct cached_regexp *next;
49     repv regexp;
50     rep_regexp *compiled;
51 };
52 
53 static struct cached_regexp *cached_regexps;	/* should be a hash table? */
54 static int regexp_hits, regexp_misses;
55 static int regexp_cache_limit = 1024;
56 
57 DEFSYM(regexp_error, "regexp-error");
58 DEFSTRING(err_regexp_error, "Regexp error");
59 
60 rep_regexp *
rep_compile_regexp(repv re)61 rep_compile_regexp(repv re)
62 {
63     struct cached_regexp **x = &cached_regexps;
64     int re_len;
65     assert(rep_STRINGP(re));
66     re_len = rep_STRING_LEN(re);
67     while(*x != 0)
68     {
69 	repv saved_re = (*x)->regexp;
70 	assert(rep_STRINGP(saved_re));
71 	if(saved_re == re
72 	   || (rep_STRING_LEN(saved_re) == re_len
73 	       && memcmp(rep_STR(saved_re), rep_STR(re), re_len) == 0))
74 	{
75 	    /* Found it. Move this node to the head of the list. Then
76 	       return the compiled copy. */
77 	    struct cached_regexp *this = *x;
78 	    if(x != &cached_regexps)
79 	    {
80 		*x = this->next;
81 		this->next = cached_regexps;
82 		cached_regexps = this;
83 	    }
84 	    regexp_hits++;
85 	    return this->compiled;
86 	}
87 	x = &((*x)->next);
88     }
89 
90     /* No cached copy. Compile it, then add it to the cache. */
91     {
92 	struct cached_regexp *this;
93 	rep_regexp *compiled = rep_regcomp(rep_STR(re));
94 	if(compiled != 0)
95 	{
96 	    this = rep_alloc(sizeof(struct cached_regexp));
97 	    if(this != 0)
98 	    {
99 		this->regexp = re;
100 		this->compiled = compiled;
101 		this->next = cached_regexps;
102 		cached_regexps = this;
103 		regexp_misses++;
104 		rep_data_after_gc += (sizeof(struct cached_regexp)
105 				  + compiled->regsize);
106 		return compiled;
107 	    }
108 	}
109 	return 0;
110     }
111 }
112 
113 /* Remove any cached compilation of STRING from the regexp cache */
114 void
rep_string_modified(repv string)115 rep_string_modified (repv string)
116 {
117     struct cached_regexp **x;
118     for (x = &cached_regexps; *x != 0; x = &((*x)->next))
119     {
120 	if ((*x)->regexp == string)
121 	{
122 	    /* found the string, remove it from the cache */
123 	    struct cached_regexp *ptr = *x;
124 	    *x = ptr->next;
125 	    free (ptr->compiled);
126 	    rep_free (ptr);
127 	}
128     }
129 }
130 
131 /* Called at GC */
132 static void
mark_cached_regexps(void)133 mark_cached_regexps(void)
134 {
135     unsigned long total = 0;
136     struct cached_regexp *x = cached_regexps, *xp = 0;
137     while(x != 0 && total < regexp_cache_limit)
138     {
139 	assert(rep_STRINGP(x->regexp));
140 	rep_MARKVAL(x->regexp);
141 	total += sizeof(struct cached_regexp) + x->compiled->regsize;
142 	xp = x;
143 	x = x->next;
144     }
145     if(xp != 0)
146     {
147 	/* Free all following regexps */
148 	x = xp->next;
149 	xp->next = 0;
150 	while(x != 0)
151 	{
152 	    xp = x->next;
153 	    free(x->compiled);
154 	    rep_free(x);
155 	    x = xp;
156 	}
157     }
158 }
159 
160 /* Free all cached regexps */
161 static void
release_cached_regexps(void)162 release_cached_regexps(void)
163 {
164     struct cached_regexp *x = cached_regexps;
165     cached_regexps = 0;
166     while(x != 0)
167     {
168 	struct cached_regexp *next = x->next;
169 	free(x->compiled);
170 	rep_free(x);
171 	x = next;
172     }
173 }
174 
175 
176 /* Storing regexp context. */
177 
178 /* Storage for remembering where the last match was.
179    last_match_data is the string or buffer that was matched against.
180    last_matches is a copy of the subexpression data of the last match.  */
181 static rep_regtype last_match_type;
182 static repv last_match_data;
183 static rep_regsubs last_matches;
184 
185 struct rep_saved_regexp_data *rep_saved_matches;
186 
187 void
rep_update_last_match(repv data,rep_regexp * prog)188 rep_update_last_match(repv data, rep_regexp *prog)
189 {
190     last_match_type = prog->lasttype;
191     last_match_data = data;
192     memcpy(&last_matches, &prog->matches, sizeof(last_matches));
193 }
194 
195 /* Called by GC */
196 void
rep_mark_regexp_data(void)197 rep_mark_regexp_data(void)
198 {
199     struct rep_saved_regexp_data *sd;
200 
201     /* Don't keep too many cached REs through GC. */
202     mark_cached_regexps();
203 
204     if(last_match_type == rep_reg_obj)
205     {
206 	int i;
207 	for(i = 0; i < rep_NSUBEXP; i++)
208 	{
209 	    rep_MARKVAL(last_matches.obj.startp[i]);
210 	    rep_MARKVAL(last_matches.obj.endp[i]);
211 	}
212     }
213     rep_MARKVAL(last_match_data);
214 
215     for(sd = rep_saved_matches; sd != 0; sd = sd->next)
216     {
217 	if(sd->type == rep_reg_obj)
218 	{
219 	    int i;
220 	    for(i = 0; i < rep_NSUBEXP; i++)
221 	    {
222 		rep_MARKVAL(sd->matches.obj.startp[i]);
223 		rep_MARKVAL(sd->matches.obj.endp[i]);
224 	    }
225 	}
226 	rep_MARKVAL(sd->data);
227     }
228 }
229 
230 /* Fix the match buffers to reflect matching a string from START to END. */
231 void
rep_set_string_match(repv obj,repv start,repv end)232 rep_set_string_match(repv obj, repv start, repv end)
233 {
234     int i;
235     last_match_data = obj;
236     last_match_type = rep_reg_obj;
237     last_matches.obj.startp[0] = start;
238     last_matches.obj.endp[0] = end;
239     for(i = 1; i < rep_NSUBEXP; i++)
240     {
241 	last_matches.obj.startp[i] = rep_NULL;
242 	last_matches.obj.endp[i] = rep_NULL;
243     }
244 }
245 
246 void
rep_push_regexp_data(struct rep_saved_regexp_data * sd)247 rep_push_regexp_data(struct rep_saved_regexp_data *sd)
248 {
249     sd->type = last_match_type;
250     sd->data = last_match_data;
251     memcpy(&sd->matches, &last_matches, sizeof(rep_regsubs));
252     sd->next = rep_saved_matches;
253     rep_saved_matches = sd;
254 }
255 
256 void
rep_pop_regexp_data(void)257 rep_pop_regexp_data(void)
258 {
259     struct rep_saved_regexp_data *sd = rep_saved_matches;
260     rep_saved_matches = sd->next;
261     last_match_type = sd->type;
262     last_match_data = sd->data;
263     memcpy(&last_matches, &sd->matches, sizeof(rep_regsubs));
264 }
265 
266 
267 /* Simple string matching */
268 
269 DEFUN("string-match", Fstring_match, Sstring_match, (repv re, repv str, repv start, repv nocasep), rep_Subr4) /*
270 ::doc:rep.regexp#string-match::
271 string-match REGEXP STRING [START] [IGNORE-CASE-P]
272 
273 Return t if REGEXP matches STRING. Updates the match data.
274 
275 When defined, START is the index of the first character to start
276 matching at (counting from zero). When IGNORE-CASE-P is non-nil the
277 case of matched strings are ignored. Note that character classes are
278 still case-significant.
279 ::end:: */
280 {
281     rep_regexp *prog;
282     long xstart;
283     rep_DECLARE1(re, rep_STRINGP);
284     rep_DECLARE2(str, rep_STRINGP);
285     rep_DECLARE3_OPT(start, rep_INTP);
286     xstart = rep_INTP(start) ? rep_INT(start) : 0;
287     prog = rep_compile_regexp(re);
288     if(prog)
289     {
290 	repv res;
291 	if(rep_regexec2(prog, rep_STR(str) + xstart,
292 			(rep_NILP(nocasep) ? 0 : rep_REG_NOCASE)
293 			| (xstart == 0 ? 0 : rep_REG_NOTBOL)))
294 	{
295 	    rep_update_last_match(str, prog);
296 	    res = Qt;
297 	}
298 	else
299 	    res = Qnil;
300 	return(res);
301     }
302     return rep_NULL;
303 }
304 
305 DEFUN("string-looking-at", Fstring_looking_at, Sstring_looking_at, (repv re, repv string, repv start, repv nocasep), rep_Subr4) /*
306 ::doc:rep.regexp#string-looking-at::
307 string-looking-at REGEXP STRING [START] [IGNORE-CASE-P]
308 
309 Returns t if REGEXP matches the STRING (starting at character START).
310 Updates the match data.
311 ::end:: */
312 {
313     rep_regexp *prog;
314     long xstart;
315     rep_DECLARE1(re, rep_STRINGP);
316     rep_DECLARE2(string, rep_STRINGP);
317     rep_DECLARE3_OPT(start, rep_INTP);
318     xstart = rep_INTP(start) ? rep_INT(start) : 0;
319     prog = rep_compile_regexp(re);
320     if(prog != NULL)
321     {
322 	repv res;
323 	if(rep_regmatch_string(prog, rep_STR(string) + xstart,
324 			       (rep_NILP(nocasep) ? 0 : rep_REG_NOCASE)
325 			       | (xstart == 0 ? 0 : rep_REG_NOTBOL)))
326 	{
327 	    rep_update_last_match(string, prog);
328 	    res = Qt;
329 	}
330 	else
331 	    res = Qnil;
332 	return res;
333     }
334     return rep_NULL;
335 }
336 
337 DEFUN("expand-last-match", Fexpand_last_match, Sexpand_last_match, (repv template), rep_Subr1) /*
338 ::doc:rep.regexp#expand-last-match::
339 expand-last-match TEMPLATE-STRING
340 
341 Expand the saved expressions from the most recent successfully matched
342 regexp according to TEMPLATE-STRING, a string that may contain any of
343 the following escape sequences,
344 
345   \0, \&   whole string matched by REGEXP
346   \N	   N'th parenthensized expression (1 <= N <= 9)
347 ::end:: */
348 {
349     long len;
350     repv string;
351     rep_DECLARE1(template, rep_STRINGP);
352     len = (*rep_regsublen_fun)(last_match_type, &last_matches,
353 			       rep_STR(template), rep_PTR(last_match_data));
354     string = rep_make_string(len);
355     (*rep_regsub_fun)(last_match_type, &last_matches,
356 		      rep_STR(template), rep_STR(string),
357 		      rep_PTR(last_match_data));
358     return string;
359 }
360 
361 DEFUN("match-start", Fmatch_start, Smatch_start, (repv exp), rep_Subr1) /*
362 ::doc:rep.regexp#match-start::
363 match-start [EXPRESSION-INDEX]
364 
365 Return the position which the EXPRESSION-INDEX'th parenthesised expression
366 started at in the last successful regexp match. If EXPRESSION-INDEX is
367 nil or 0 the start of the whole match is returned instead.
368 The returned value will either be a position if the last match was in a
369 buffer, or an integer if the last match was in a string (i.e. regexp-match).
370 ::end:: */
371 {
372     long i;
373     rep_DECLARE1_OPT(exp, rep_INTP);
374     if(rep_INTP(exp))
375     {
376 	i = rep_INT(exp);
377 	if((i >= rep_NSUBEXP) || (i < 0))
378 	    return(rep_signal_arg_error(exp, 1));
379     }
380     else
381 	i = 0;
382     if(last_match_type == rep_reg_obj)
383     {
384 	if(last_matches.obj.startp[i] != rep_NULL)
385 	    return last_matches.obj.startp[i];
386 	return Qnil;
387     }
388     else
389     {
390 	if(last_matches.string.startp[i] == NULL)
391 	    return(Qnil);
392 	i = last_matches.string.startp[i] - (char *)rep_STR(last_match_data);
393 	return(rep_MAKE_INT(i));
394     }
395 }
396 
397 DEFUN("match-end", Fmatch_end, Smatch_end, (repv exp), rep_Subr1) /*
398 ::doc:rep.regexp#match-end::
399 match-end [EXPRESSION-INDEX]
400 
401 Return the position which the EXPRESSION-INDEX'th parenthesised expression
402 ended at in the last successful regexp match. If EXPRESSION-INDEX is
403 nil or 0 the end of the whole match is returned instead.
404 The returned value will either be a position if the last match was in a
405 buffer, or an integer if the last match was in a string (i.e. regexp-match).
406 ::end:: */
407 {
408     long i;
409     rep_DECLARE1_OPT(exp, rep_INTP);
410     if(rep_INTP(exp))
411     {
412 	i = rep_INT(exp);
413 	if((i >= rep_NSUBEXP) || (i < 0))
414 	    return rep_signal_arg_error(exp, 1);
415     }
416     else
417 	i = 0;
418     if(last_match_type == rep_reg_obj)
419     {
420 	if(last_matches.obj.endp[i] != rep_NULL)
421 	    return last_matches.obj.endp[i];
422 	return Qnil;
423     }
424     else
425     {
426 	if(last_matches.string.endp[i] == NULL)
427 	    return(Qnil);
428 	i = last_matches.string.endp[i] - (char *)rep_STR(last_match_data);
429 	return(rep_MAKE_INT(i));
430     }
431 }
432 
433 DEFUN("quote-regexp", Fquote_regexp, Squote_regexp, (repv str), rep_Subr1) /*
434 ::doc:rep.regexp#quote-regexp::
435 quote-regexp STRING
436 
437 Returns a new version of STRING, any characters which the regexp routines
438 treat specially (asterisks, square brackets, etc...) is quoted by the escape
439 character `\'. If the STRING does not contain any regexp meta-characters
440 it is returned as-is (un-copied).
441 ::end:: */
442 {
443     char *buf, *s;
444     int buflen = 128, slen, i = 0;
445     rep_bool quoted = rep_FALSE;
446     repv res = rep_NULL;
447     rep_DECLARE1(str, rep_STRINGP);
448     s = rep_STR(str);
449     slen = rep_STRING_LEN(str);
450     buf = rep_alloc(buflen);
451     if(!buf)
452 	goto error;
453     while(slen-- > 0)
454     {
455 	char c;
456 	/* Ensure string is long enough, this saves doing this twice. */
457 	if(i + 2 >= buflen)
458 	{
459 	    int newlen = buflen * 2;
460 	    char *newbuf = rep_alloc(newlen);
461 	    if(!newbuf)
462 		goto error;
463 	    memcpy(newbuf, buf, i);
464 	    rep_free(buf);
465 	    buf = newbuf;
466 	    buflen = newlen;
467 	}
468 	switch(c = *s++)
469 	{
470 	case '*':
471 	case '+':
472 	case '?':
473 	case '.':
474 	case '[':
475 	case ']':
476 	case '(':
477 	case ')':
478 	case '|':
479 	case '^':
480 	case '$':
481 	case '\\':	 /* do I want to do this? */
482 	    /* quote this character */
483 	    buf[i++] = '\\';
484 	    buf[i++] = c;
485 	    quoted = rep_TRUE;
486 	    break;
487 	default:
488 	    buf[i++] = c;
489 	    break;
490 	}
491     }
492     if(!quoted)
493 	res = str;
494     else
495 	res = rep_string_dupn(buf, i);
496 error:
497     if(buf)
498 	rep_free(buf);
499     return(res);
500 }
501 
502 DEFUN("regexp-cache-control", Fregexp_cache_control,
503       Sregexp_cache_control, (repv limit), rep_Subr1) /*
504 ::doc:rep.regexp#regexp-cache-control::
505 regexp-cache-control [SOFT-LIMIT]
506 
507 If SOFT-LIMIT is defined, it specifies the maximum number of bytes that
508 the regexp cache may occupy after garbage collection.
509 
510 Returns (SOFT-LIMIT CURRENT-SIZE CURRENT-ENTRIES HITS MISSES).
511 ::end:: */
512 {
513     int current_size = 0, current_items = 0;
514     struct cached_regexp *x;
515 
516     rep_DECLARE1_OPT(limit, rep_INTP);
517     if(rep_INTP(limit) && rep_INT(limit) >= 0)
518 	regexp_cache_limit = rep_INT(limit);
519 
520     x = cached_regexps;
521     while(x != 0)
522     {
523 	current_items++;
524 	current_size += sizeof(struct cached_regexp) + x->compiled->regsize;
525 	x = x->next;
526     }
527 
528     return rep_list_5(rep_MAKE_INT(regexp_cache_limit),
529 		  rep_MAKE_INT(current_size), rep_MAKE_INT(current_items),
530 		  rep_MAKE_INT(regexp_hits), rep_MAKE_INT(regexp_misses));
531 }
532 
533 void
rep_regerror(char * err)534 rep_regerror(char *err)
535 {
536     Fsignal(Qregexp_error, rep_LIST_1(rep_string_dup(err)));
537 }
538 
539 void
rep_find_init(void)540 rep_find_init(void)
541 {
542     repv tem = rep_push_structure ("rep.regexp");
543     rep_ADD_SUBR(Sstring_match);
544     rep_ADD_SUBR(Sstring_looking_at);
545     rep_ADD_SUBR(Sexpand_last_match);
546     rep_ADD_SUBR(Smatch_start);
547     rep_ADD_SUBR(Smatch_end);
548     rep_ADD_SUBR(Squote_regexp);
549     rep_ADD_SUBR(Sregexp_cache_control);
550     rep_pop_structure (tem);
551 
552     rep_INTERN(regexp_error); rep_ERROR(regexp_error);
553     rep_regsub_fun = rep_default_regsub;
554     rep_regsublen_fun = rep_default_regsublen;
555 }
556 
557 void
rep_find_kill(void)558 rep_find_kill(void)
559 {
560     release_cached_regexps();
561 }
562