1 /* find.c -- Searching and replacing
2 Copyright (C) 1993, 1994 John Harper <john@dcs.warwick.ac.uk>
3 $Id$
4
5 This file is part of Jade.
6
7 Jade is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 Jade is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with Jade; see the file COPYING. If not, write to
19 the Free Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20
21 #define _GNU_SOURCE
22
23 #include "repint.h"
24
25 #include <string.h>
26 #include <ctype.h>
27 #include <stdlib.h>
28 #include <assert.h>
29
30 /* Hooks for dealing with the rep_reg_obj match type. */
31 void (*rep_regsub_fun)(int, rep_regsubs *, char *, char *, void *);
32 int (*rep_regsublen_fun)(int, rep_regsubs *, char *, void *);
33
34
35 /* Compiling regexps. */
36
37 /* A linked list is used to store all recently-used regexps in MRU
38 order. At GC the regexps at the tail of the list are freed to
39 satisfy the size limit.
40
41 It might be better to use a hash-table. But by experience it seems
42 that the cache is usually quite small, and therefore searching the
43 list each compilation isn't too bad (and it makes the gc easier).
44
45 Also, the hit-ratio is very good (as I'm typing this, ~0.97) */
46
47 struct cached_regexp {
48 struct cached_regexp *next;
49 repv regexp;
50 rep_regexp *compiled;
51 };
52
53 static struct cached_regexp *cached_regexps; /* should be a hash table? */
54 static int regexp_hits, regexp_misses;
55 static int regexp_cache_limit = 1024;
56
57 DEFSYM(regexp_error, "regexp-error");
58 DEFSTRING(err_regexp_error, "Regexp error");
59
60 rep_regexp *
rep_compile_regexp(repv re)61 rep_compile_regexp(repv re)
62 {
63 struct cached_regexp **x = &cached_regexps;
64 int re_len;
65 assert(rep_STRINGP(re));
66 re_len = rep_STRING_LEN(re);
67 while(*x != 0)
68 {
69 repv saved_re = (*x)->regexp;
70 assert(rep_STRINGP(saved_re));
71 if(saved_re == re
72 || (rep_STRING_LEN(saved_re) == re_len
73 && memcmp(rep_STR(saved_re), rep_STR(re), re_len) == 0))
74 {
75 /* Found it. Move this node to the head of the list. Then
76 return the compiled copy. */
77 struct cached_regexp *this = *x;
78 if(x != &cached_regexps)
79 {
80 *x = this->next;
81 this->next = cached_regexps;
82 cached_regexps = this;
83 }
84 regexp_hits++;
85 return this->compiled;
86 }
87 x = &((*x)->next);
88 }
89
90 /* No cached copy. Compile it, then add it to the cache. */
91 {
92 struct cached_regexp *this;
93 rep_regexp *compiled = rep_regcomp(rep_STR(re));
94 if(compiled != 0)
95 {
96 this = rep_alloc(sizeof(struct cached_regexp));
97 if(this != 0)
98 {
99 this->regexp = re;
100 this->compiled = compiled;
101 this->next = cached_regexps;
102 cached_regexps = this;
103 regexp_misses++;
104 rep_data_after_gc += (sizeof(struct cached_regexp)
105 + compiled->regsize);
106 return compiled;
107 }
108 }
109 return 0;
110 }
111 }
112
113 /* Remove any cached compilation of STRING from the regexp cache */
114 void
rep_string_modified(repv string)115 rep_string_modified (repv string)
116 {
117 struct cached_regexp **x;
118 for (x = &cached_regexps; *x != 0; x = &((*x)->next))
119 {
120 if ((*x)->regexp == string)
121 {
122 /* found the string, remove it from the cache */
123 struct cached_regexp *ptr = *x;
124 *x = ptr->next;
125 free (ptr->compiled);
126 rep_free (ptr);
127 }
128 }
129 }
130
131 /* Called at GC */
132 static void
mark_cached_regexps(void)133 mark_cached_regexps(void)
134 {
135 unsigned long total = 0;
136 struct cached_regexp *x = cached_regexps, *xp = 0;
137 while(x != 0 && total < regexp_cache_limit)
138 {
139 assert(rep_STRINGP(x->regexp));
140 rep_MARKVAL(x->regexp);
141 total += sizeof(struct cached_regexp) + x->compiled->regsize;
142 xp = x;
143 x = x->next;
144 }
145 if(xp != 0)
146 {
147 /* Free all following regexps */
148 x = xp->next;
149 xp->next = 0;
150 while(x != 0)
151 {
152 xp = x->next;
153 free(x->compiled);
154 rep_free(x);
155 x = xp;
156 }
157 }
158 }
159
160 /* Free all cached regexps */
161 static void
release_cached_regexps(void)162 release_cached_regexps(void)
163 {
164 struct cached_regexp *x = cached_regexps;
165 cached_regexps = 0;
166 while(x != 0)
167 {
168 struct cached_regexp *next = x->next;
169 free(x->compiled);
170 rep_free(x);
171 x = next;
172 }
173 }
174
175
176 /* Storing regexp context. */
177
178 /* Storage for remembering where the last match was.
179 last_match_data is the string or buffer that was matched against.
180 last_matches is a copy of the subexpression data of the last match. */
181 static rep_regtype last_match_type;
182 static repv last_match_data;
183 static rep_regsubs last_matches;
184
185 struct rep_saved_regexp_data *rep_saved_matches;
186
187 void
rep_update_last_match(repv data,rep_regexp * prog)188 rep_update_last_match(repv data, rep_regexp *prog)
189 {
190 last_match_type = prog->lasttype;
191 last_match_data = data;
192 memcpy(&last_matches, &prog->matches, sizeof(last_matches));
193 }
194
195 /* Called by GC */
196 void
rep_mark_regexp_data(void)197 rep_mark_regexp_data(void)
198 {
199 struct rep_saved_regexp_data *sd;
200
201 /* Don't keep too many cached REs through GC. */
202 mark_cached_regexps();
203
204 if(last_match_type == rep_reg_obj)
205 {
206 int i;
207 for(i = 0; i < rep_NSUBEXP; i++)
208 {
209 rep_MARKVAL(last_matches.obj.startp[i]);
210 rep_MARKVAL(last_matches.obj.endp[i]);
211 }
212 }
213 rep_MARKVAL(last_match_data);
214
215 for(sd = rep_saved_matches; sd != 0; sd = sd->next)
216 {
217 if(sd->type == rep_reg_obj)
218 {
219 int i;
220 for(i = 0; i < rep_NSUBEXP; i++)
221 {
222 rep_MARKVAL(sd->matches.obj.startp[i]);
223 rep_MARKVAL(sd->matches.obj.endp[i]);
224 }
225 }
226 rep_MARKVAL(sd->data);
227 }
228 }
229
230 /* Fix the match buffers to reflect matching a string from START to END. */
231 void
rep_set_string_match(repv obj,repv start,repv end)232 rep_set_string_match(repv obj, repv start, repv end)
233 {
234 int i;
235 last_match_data = obj;
236 last_match_type = rep_reg_obj;
237 last_matches.obj.startp[0] = start;
238 last_matches.obj.endp[0] = end;
239 for(i = 1; i < rep_NSUBEXP; i++)
240 {
241 last_matches.obj.startp[i] = rep_NULL;
242 last_matches.obj.endp[i] = rep_NULL;
243 }
244 }
245
246 void
rep_push_regexp_data(struct rep_saved_regexp_data * sd)247 rep_push_regexp_data(struct rep_saved_regexp_data *sd)
248 {
249 sd->type = last_match_type;
250 sd->data = last_match_data;
251 memcpy(&sd->matches, &last_matches, sizeof(rep_regsubs));
252 sd->next = rep_saved_matches;
253 rep_saved_matches = sd;
254 }
255
256 void
rep_pop_regexp_data(void)257 rep_pop_regexp_data(void)
258 {
259 struct rep_saved_regexp_data *sd = rep_saved_matches;
260 rep_saved_matches = sd->next;
261 last_match_type = sd->type;
262 last_match_data = sd->data;
263 memcpy(&last_matches, &sd->matches, sizeof(rep_regsubs));
264 }
265
266
267 /* Simple string matching */
268
269 DEFUN("string-match", Fstring_match, Sstring_match, (repv re, repv str, repv start, repv nocasep), rep_Subr4) /*
270 ::doc:rep.regexp#string-match::
271 string-match REGEXP STRING [START] [IGNORE-CASE-P]
272
273 Return t if REGEXP matches STRING. Updates the match data.
274
275 When defined, START is the index of the first character to start
276 matching at (counting from zero). When IGNORE-CASE-P is non-nil the
277 case of matched strings are ignored. Note that character classes are
278 still case-significant.
279 ::end:: */
280 {
281 rep_regexp *prog;
282 long xstart;
283 rep_DECLARE1(re, rep_STRINGP);
284 rep_DECLARE2(str, rep_STRINGP);
285 rep_DECLARE3_OPT(start, rep_INTP);
286 xstart = rep_INTP(start) ? rep_INT(start) : 0;
287 prog = rep_compile_regexp(re);
288 if(prog)
289 {
290 repv res;
291 if(rep_regexec2(prog, rep_STR(str) + xstart,
292 (rep_NILP(nocasep) ? 0 : rep_REG_NOCASE)
293 | (xstart == 0 ? 0 : rep_REG_NOTBOL)))
294 {
295 rep_update_last_match(str, prog);
296 res = Qt;
297 }
298 else
299 res = Qnil;
300 return(res);
301 }
302 return rep_NULL;
303 }
304
305 DEFUN("string-looking-at", Fstring_looking_at, Sstring_looking_at, (repv re, repv string, repv start, repv nocasep), rep_Subr4) /*
306 ::doc:rep.regexp#string-looking-at::
307 string-looking-at REGEXP STRING [START] [IGNORE-CASE-P]
308
309 Returns t if REGEXP matches the STRING (starting at character START).
310 Updates the match data.
311 ::end:: */
312 {
313 rep_regexp *prog;
314 long xstart;
315 rep_DECLARE1(re, rep_STRINGP);
316 rep_DECLARE2(string, rep_STRINGP);
317 rep_DECLARE3_OPT(start, rep_INTP);
318 xstart = rep_INTP(start) ? rep_INT(start) : 0;
319 prog = rep_compile_regexp(re);
320 if(prog != NULL)
321 {
322 repv res;
323 if(rep_regmatch_string(prog, rep_STR(string) + xstart,
324 (rep_NILP(nocasep) ? 0 : rep_REG_NOCASE)
325 | (xstart == 0 ? 0 : rep_REG_NOTBOL)))
326 {
327 rep_update_last_match(string, prog);
328 res = Qt;
329 }
330 else
331 res = Qnil;
332 return res;
333 }
334 return rep_NULL;
335 }
336
337 DEFUN("expand-last-match", Fexpand_last_match, Sexpand_last_match, (repv template), rep_Subr1) /*
338 ::doc:rep.regexp#expand-last-match::
339 expand-last-match TEMPLATE-STRING
340
341 Expand the saved expressions from the most recent successfully matched
342 regexp according to TEMPLATE-STRING, a string that may contain any of
343 the following escape sequences,
344
345 \0, \& whole string matched by REGEXP
346 \N N'th parenthensized expression (1 <= N <= 9)
347 ::end:: */
348 {
349 long len;
350 repv string;
351 rep_DECLARE1(template, rep_STRINGP);
352 len = (*rep_regsublen_fun)(last_match_type, &last_matches,
353 rep_STR(template), rep_PTR(last_match_data));
354 string = rep_make_string(len);
355 (*rep_regsub_fun)(last_match_type, &last_matches,
356 rep_STR(template), rep_STR(string),
357 rep_PTR(last_match_data));
358 return string;
359 }
360
361 DEFUN("match-start", Fmatch_start, Smatch_start, (repv exp), rep_Subr1) /*
362 ::doc:rep.regexp#match-start::
363 match-start [EXPRESSION-INDEX]
364
365 Return the position which the EXPRESSION-INDEX'th parenthesised expression
366 started at in the last successful regexp match. If EXPRESSION-INDEX is
367 nil or 0 the start of the whole match is returned instead.
368 The returned value will either be a position if the last match was in a
369 buffer, or an integer if the last match was in a string (i.e. regexp-match).
370 ::end:: */
371 {
372 long i;
373 rep_DECLARE1_OPT(exp, rep_INTP);
374 if(rep_INTP(exp))
375 {
376 i = rep_INT(exp);
377 if((i >= rep_NSUBEXP) || (i < 0))
378 return(rep_signal_arg_error(exp, 1));
379 }
380 else
381 i = 0;
382 if(last_match_type == rep_reg_obj)
383 {
384 if(last_matches.obj.startp[i] != rep_NULL)
385 return last_matches.obj.startp[i];
386 return Qnil;
387 }
388 else
389 {
390 if(last_matches.string.startp[i] == NULL)
391 return(Qnil);
392 i = last_matches.string.startp[i] - (char *)rep_STR(last_match_data);
393 return(rep_MAKE_INT(i));
394 }
395 }
396
397 DEFUN("match-end", Fmatch_end, Smatch_end, (repv exp), rep_Subr1) /*
398 ::doc:rep.regexp#match-end::
399 match-end [EXPRESSION-INDEX]
400
401 Return the position which the EXPRESSION-INDEX'th parenthesised expression
402 ended at in the last successful regexp match. If EXPRESSION-INDEX is
403 nil or 0 the end of the whole match is returned instead.
404 The returned value will either be a position if the last match was in a
405 buffer, or an integer if the last match was in a string (i.e. regexp-match).
406 ::end:: */
407 {
408 long i;
409 rep_DECLARE1_OPT(exp, rep_INTP);
410 if(rep_INTP(exp))
411 {
412 i = rep_INT(exp);
413 if((i >= rep_NSUBEXP) || (i < 0))
414 return rep_signal_arg_error(exp, 1);
415 }
416 else
417 i = 0;
418 if(last_match_type == rep_reg_obj)
419 {
420 if(last_matches.obj.endp[i] != rep_NULL)
421 return last_matches.obj.endp[i];
422 return Qnil;
423 }
424 else
425 {
426 if(last_matches.string.endp[i] == NULL)
427 return(Qnil);
428 i = last_matches.string.endp[i] - (char *)rep_STR(last_match_data);
429 return(rep_MAKE_INT(i));
430 }
431 }
432
433 DEFUN("quote-regexp", Fquote_regexp, Squote_regexp, (repv str), rep_Subr1) /*
434 ::doc:rep.regexp#quote-regexp::
435 quote-regexp STRING
436
437 Returns a new version of STRING, any characters which the regexp routines
438 treat specially (asterisks, square brackets, etc...) is quoted by the escape
439 character `\'. If the STRING does not contain any regexp meta-characters
440 it is returned as-is (un-copied).
441 ::end:: */
442 {
443 char *buf, *s;
444 int buflen = 128, slen, i = 0;
445 rep_bool quoted = rep_FALSE;
446 repv res = rep_NULL;
447 rep_DECLARE1(str, rep_STRINGP);
448 s = rep_STR(str);
449 slen = rep_STRING_LEN(str);
450 buf = rep_alloc(buflen);
451 if(!buf)
452 goto error;
453 while(slen-- > 0)
454 {
455 char c;
456 /* Ensure string is long enough, this saves doing this twice. */
457 if(i + 2 >= buflen)
458 {
459 int newlen = buflen * 2;
460 char *newbuf = rep_alloc(newlen);
461 if(!newbuf)
462 goto error;
463 memcpy(newbuf, buf, i);
464 rep_free(buf);
465 buf = newbuf;
466 buflen = newlen;
467 }
468 switch(c = *s++)
469 {
470 case '*':
471 case '+':
472 case '?':
473 case '.':
474 case '[':
475 case ']':
476 case '(':
477 case ')':
478 case '|':
479 case '^':
480 case '$':
481 case '\\': /* do I want to do this? */
482 /* quote this character */
483 buf[i++] = '\\';
484 buf[i++] = c;
485 quoted = rep_TRUE;
486 break;
487 default:
488 buf[i++] = c;
489 break;
490 }
491 }
492 if(!quoted)
493 res = str;
494 else
495 res = rep_string_dupn(buf, i);
496 error:
497 if(buf)
498 rep_free(buf);
499 return(res);
500 }
501
502 DEFUN("regexp-cache-control", Fregexp_cache_control,
503 Sregexp_cache_control, (repv limit), rep_Subr1) /*
504 ::doc:rep.regexp#regexp-cache-control::
505 regexp-cache-control [SOFT-LIMIT]
506
507 If SOFT-LIMIT is defined, it specifies the maximum number of bytes that
508 the regexp cache may occupy after garbage collection.
509
510 Returns (SOFT-LIMIT CURRENT-SIZE CURRENT-ENTRIES HITS MISSES).
511 ::end:: */
512 {
513 int current_size = 0, current_items = 0;
514 struct cached_regexp *x;
515
516 rep_DECLARE1_OPT(limit, rep_INTP);
517 if(rep_INTP(limit) && rep_INT(limit) >= 0)
518 regexp_cache_limit = rep_INT(limit);
519
520 x = cached_regexps;
521 while(x != 0)
522 {
523 current_items++;
524 current_size += sizeof(struct cached_regexp) + x->compiled->regsize;
525 x = x->next;
526 }
527
528 return rep_list_5(rep_MAKE_INT(regexp_cache_limit),
529 rep_MAKE_INT(current_size), rep_MAKE_INT(current_items),
530 rep_MAKE_INT(regexp_hits), rep_MAKE_INT(regexp_misses));
531 }
532
533 void
rep_regerror(char * err)534 rep_regerror(char *err)
535 {
536 Fsignal(Qregexp_error, rep_LIST_1(rep_string_dup(err)));
537 }
538
539 void
rep_find_init(void)540 rep_find_init(void)
541 {
542 repv tem = rep_push_structure ("rep.regexp");
543 rep_ADD_SUBR(Sstring_match);
544 rep_ADD_SUBR(Sstring_looking_at);
545 rep_ADD_SUBR(Sexpand_last_match);
546 rep_ADD_SUBR(Smatch_start);
547 rep_ADD_SUBR(Smatch_end);
548 rep_ADD_SUBR(Squote_regexp);
549 rep_ADD_SUBR(Sregexp_cache_control);
550 rep_pop_structure (tem);
551
552 rep_INTERN(regexp_error); rep_ERROR(regexp_error);
553 rep_regsub_fun = rep_default_regsub;
554 rep_regsublen_fun = rep_default_regsublen;
555 }
556
557 void
rep_find_kill(void)558 rep_find_kill(void)
559 {
560 release_cached_regexps();
561 }
562