1 /*------------------------------------------------------------------
2 * Regular Expression Wrapper and Cache
3 * Written 1998, 2002, 2004 by Lars Duening.
4 * Share and Enjoy!
5 *------------------------------------------------------------------
6 * This module serves as wrapper around optional alternative
7 * regular expression implementations. It wraps the unique regexp
8 * structures into a ref-counted structure, and lets the API hand
9 * out glorified pointers to this internal structure.
10 *
11 * Beware! rx_exec() stores result data in the regexp structure, so the
12 * same pattern must not be used in two concurrent rx_compile/rx_exec pairs.
13 *
14 #ifdef RXCACHE_TABLE
15 * Additionally, the regular expressions are held in a cache.
16 * Usage of the cache can reduce the setup time
17 * for regexps by factor 4; the actual regexp matching showed up
18 * in experiments as being fast enough to make a cache for the
19 * results worthless.
20 *
21 * Compiled expressions are stored together with their generator
22 * strings in a hash table, hashed over the generator string content.
23 *
24 * The table sizes are specified in config.h as follows:
25 * RXCACHE_TABLE: size of the expression hash table
26 #endif
27 *
28 * TODO: Using shared strings as cache-indices can speed things up,
29 * TODO:: especially when knowing where to find the hashvalue from
30 * TODO:: the strtable.
31 * TODO: Use in-table chaining to improve the use of the table space
32 * TODO:: and reduce the number of collisions.
33 * TODO: Separate the results out from HS-Regexp, so that the compiled
34 * TODO:: program and the results can be held separate.
35 *------------------------------------------------------------------
36 */
37
38 /*--------------------------------------------------------------------*/
39
40 #include "driver.h"
41
42 #include <stdio.h>
43 #include <string.h>
44
45 #include "mregex.h"
46
47 #include "comm.h" /* add_message() */
48 #include "gcollect.h"
49 #include "hash.h"
50 #include "interpret.h"
51 #include "main.h"
52 #include "mstrings.h"
53 #include "pkg-pcre.h"
54 #include "regexp.h"
55 #include "simulate.h"
56 #include "strfuns.h"
57 #include "svalue.h"
58 #include "xalloc.h"
59
60 #include "../mudlib/sys/debug_info.h"
61 #include "../mudlib/sys/driver_hook.h"
62 #include "../mudlib/sys/regexp.h"
63
64 /*--------------------------------------------------------------------*/
65
66 /* --- struct regdata_s: the regexp structure wrapper ---
67 *
68 * This structure wraps the actual regexp structure into a common
69 * format.
70 *
71 * The regular expressions are compiled as needed, with the pointers
72 * serving as flags:
73 * .pProg != NULL: PCRE version compiled
74 * .rx != NULL: traditional regex version compiled.
75 *
76 #ifdef RXCACHE_TABLE
77 * The regexp cache embeds this structure into a larger one which
78 * it uses to manage the cached expressions.
79 #endif
80 */
81
82 typedef struct regdata_s {
83 p_uint ref; /* Number of refs */
84 int opt; /* Additional options, but no package flags */
85 /* -- PCRE -- */
86 pcre * pProg; /* The generated regular expression */
87 pcre_extra * pHints; /* Study data */
88 int num_subs; /* Number of elements in pSubs */
89 int * pSubs; /* Substring offsets + workarea */
90 int res; /* Result of last rx_exec() */
91 /* -- Traditional -- */
92 regexp * rx; /* The actual regular expression */
93 } regdata_t;
94
95 #ifdef RXCACHE_TABLE
96
97 /* --- struct RxHashEntry: One expression hashtable entry ---
98 * Derived from regexp_i_t
99 */
100
101 typedef struct RxHashEntry {
102 regdata_t base; /* The base regexp_t structure */
103
104 string_t * pString; /* Generator string, a counted tabled string
105 * NULL if unused */
106 whash_t hString; /* Hash of pString */
107 size_t size; /* Size of regexp expressions for statistics */
108 } RxHashEntry;
109
110 #endif /* RXCHACHE_TABLE */
111
112 /* --- struct regexp_s: the regexp structure pimpl ---
113 *
114 * This structure is a decorated pointer to the regdata_t structure,
115 * allowing to store unique options and data for shared regexps.
116 *
117 * Since this structure is recreated for every rx_compile() call,
118 * it stores the selection flags of which regexp package to use
119 * for this call.
120 */
121
122 struct regexp_s {
123 int opt; /* Additional options, incl the package flags. */
124 regdata_t * data; /* Counted pointer to the internal structure */
125 };
126
127 /*--------------------------------------------------------------------*/
128
129 /* --- Variables --- */
130
131 #ifdef RXCACHE_TABLE
132
133 static RxHashEntry * xtable[RXCACHE_TABLE]; /* The Expression Hashtable */
134
135 /* Expression cache statistics */
136 static uint32 iNumXRequests = 0; /* Number of calls to rx_compile() */
137 static uint32 iNumXFound = 0; /* Number of calls satisfied from table */
138 static uint32 iNumXCollisions = 0; /* Number of hashcollisions */
139 static uint32 iNumXEntries = 0; /* Number of used cache entries */
140 static uint32 iXSizeAlloc = 0; /* Dynamic memory held in regexp structs */
141
142 #endif /* RXCACHE_TABLE */
143
144 static size_t pcre_malloc_size;
145 /* Accumulated size from pcre_malloc() calls. Used when creating
146 * a new PCRE to capture its allocated size for statistics.
147 */
148
149 static const char* pcre_malloc_err;
150 /* Set by the caller of PCRE functions so that an out-of-memory
151 * condition in the pcre_xalloc() wrapper can get the proper
152 * error message.
153 */
154
155 /*--------------------------------------------------------------------*/
156 /* --- Macros --- */
157
158 /* regdata_t *ref_regdata(regdata_t *r)
159 * Add another ref to regdata <r> and return the regdata <r>.
160 */
161
162 #define ref_regdata(r) ((r)->ref++, (r))
163
164 /* void free_regdata(regdata_t *r)
165 * Subtract one ref from regdata <r>, and free the regdata fully if
166 * the refcount reaches zero.
167 */
168
169 static void rx_free_data (regdata_t * expr); /* forward */
170 static void rx_free_subdata (regdata_t * expr); /* forward */
171
172 #define free_regdata(r) MACRO( if (--((r)->ref) <= 0) rx_free_data(r); )
173
174 #ifdef RXCACHE_TABLE
175
176 /* Hash functions, inspired by interpret.c */
177
178 #if !( (RXCACHE_TABLE) & (RXCACHE_TABLE)-1 )
179 #define RxStrHash(s) ((s) & ((RXCACHE_TABLE)-1))
180 #else
181 #define RxStrHash(s) ((s) % RXCACHE_TABLE)
182 #endif
183
184 #endif /* RXCHACHE_TABLE */
185
186 /*--------------------------------------------------------------------*/
187 static void *
pcre_xalloc(size_t size)188 pcre_xalloc (size_t size)
189
190 /* Wrapper function so that PCRE will use our special allocator. */
191
192 {
193 void * p;
194
195 if (!pcre_malloc_err)
196 pcre_malloc_err = "in PCRE function";
197 xallocate(p, size, pcre_malloc_err);
198 pcre_malloc_size += size;
199 return p;
200 } /* pcre_xalloc() */
201
202 /*--------------------------------------------------------------------*/
203 const char *
rx_pcre_version(void)204 rx_pcre_version (void)
205
206 /* Return a string with the name and version of the PCRE package.
207 */
208
209 {
210 static char buf[40];
211 sprintf(buf, "%d.%d", PCRE_MAJOR, PCRE_MINOR);
212 # ifdef USE_BUILTIN_PCRE
213 strcat(buf, " (builtin)");
214 # endif
215 return buf;
216 } /* rx_pcre_version() */
217
218 /*--------------------------------------------------------------------*/
rx_init(void)219 void rx_init(void)
220
221 /* Initialise the module. */
222
223 {
224 #ifdef RXCACHE_TABLE
225 memset(xtable, 0, sizeof(xtable));
226 #endif
227
228 pcre_malloc = pcre_xalloc;
229 pcre_free = xfree;
230 } /* rx_init() */
231
232 /*--------------------------------------------------------------------*/
233 static const char *
get_error_message(int code,int package)234 get_error_message (int code, int package)
235
236 /* Return a constant string with the error message for <code>
237 * generated by <package>.
238 * If <code> is not an error, NULL is returned.
239 */
240
241 {
242 if (package & RE_PCRE)
243 {
244 const char* text;
245
246 if (code >= 0)
247 return NULL;
248 switch (code)
249 {
250 case PCRE_ERROR_NOMATCH:
251 text = "too many capturing parentheses"; break;
252 case PCRE_ERROR_NULL:
253 text = "code or subject NULL"; break;
254 case PCRE_ERROR_BADOPTION:
255 text = "unknown option specified"; break;
256 case PCRE_ERROR_BADMAGIC:
257 text = "regex memory invalid"; break;
258 case PCRE_ERROR_UNKNOWN_NODE:
259 text = "regex memory violated"; break;
260 case PCRE_ERROR_NOMEMORY:
261 text = "out of memory"; break;
262 case RE_ERROR_BACKTRACK:
263 text = "too many backtracks"; break;
264 default:
265 text = "unknown internal error"; break;
266 }
267 return text;
268 }
269
270 if (package & RE_TRADITIONAL)
271 {
272 const char* text;
273
274 if (code >= 0)
275 return NULL;
276 switch (code)
277 {
278 case RE_ERROR_NULL:
279 text = "program or text NULL"; break;
280 case RE_ERROR_CORRUPT:
281 text = "regex memory invalid"; break;
282 case RE_ERROR_BACKTRACK:
283 text = "too many backtracks"; break;
284 default:
285 text = "unknown internal error"; break;
286 }
287 return text;
288 }
289
290 return NULL;
291 } /* get_error_message() */
292
293 /*--------------------------------------------------------------------*/
294 const char *
rx_error_message(int code,const regexp_t * pRegexp)295 rx_error_message (int code, const regexp_t * pRegexp)
296
297 /* Return a constant string with the error message for <code>
298 * and regexp <pRegexp>.
299 * If <code> is not an error, NULL is returned.
300 */
301
302 {
303 return get_error_message(code, pRegexp->opt);
304 } /* rx_error_message() */
305
306 /*--------------------------------------------------------------------*/
307 static Bool
rx_compile_re(string_t * expr,int opt,Bool from_ed,regdata_t * rdata)308 rx_compile_re (string_t * expr, int opt, Bool from_ed, regdata_t * rdata)
309
310 /* Compile the expression <expr> with the options <opt> into a traditional
311 * expression and store it into *<rdata>.
312 * On success, return TRUE;
313 * On failure, if <from_ed> is FALSE, an error is thrown.
314 * On failure, if <from_ed> is TRUE, the error message is printed directly
315 * to the user and the function returns with FALSE.
316 */
317
318 {
319 char * pErrmsg;
320 int erridx;
321
322 regexp * pRegexp = NULL;
323
324 /* Sanitize the <opt> value */
325 opt = opt & RE_EXCOMPATIBLE & ~(RE_PACKAGE_MASK);
326
327 /* Compile the RE */
328 pRegexp = hs_regcomp((unsigned char *)get_txt(expr)
329 , opt & RE_EXCOMPATIBLE
330 , &pErrmsg, &erridx);
331 if (NULL == pRegexp)
332 {
333 rx_free_subdata(rdata); /* Might have PCRE data in it already */
334 if (from_ed)
335 add_message("re: %s at offset %d\n", pErrmsg, erridx);
336 else
337 errorf("re: %s at offset %d\n", pErrmsg, erridx);
338 return MY_FALSE;
339 }
340
341 /* Compilation complete - store the result in the outgoing regdata
342 * structure.
343 */
344 rdata->rx = pRegexp;
345
346 return MY_TRUE;
347 } /* rx_compile_re() */
348
349 /*--------------------------------------------------------------------*/
350 static size_t
rx_compile_pcre(string_t * expr,int opt,Bool from_ed,regdata_t * rdata)351 rx_compile_pcre (string_t * expr, int opt, Bool from_ed, regdata_t * rdata)
352
353 /* Compile the expression <expr> with the options <opt> into a PCRE
354 * expression and store it into *<rdata>.
355 * On success, return the accumulated size of the expression.
356 * On failure, if <from_ed> is FALSE, an error is thrown.
357 * On failure, if <from_ed> is TRUE, the error message is printed directly
358 * to the user and the function returns with 0.
359 */
360
361 {
362 const char * pErrmsg;
363 int erridx;
364
365 pcre * pProg = NULL; /* The generated regular expression */
366 pcre_extra * pHints = NULL; /* Study data */
367 int * pSubs; /* Capturing and work area */
368 int pcre_opt; /* <opt> translated into PCRE opts */
369 int num_subs; /* Number of capturing parentheses */
370
371 /* Sanitize the <opt> value */
372 opt = opt & ~(RE_EXCOMPATIBLE) & ~(RE_PACKAGE_MASK);
373
374 /* Determine the RE compilation options */
375
376 pcre_opt = 0;
377 if (opt & RE_CASELESS) pcre_opt |= PCRE_CASELESS;
378 if (opt & RE_MULTILINE) pcre_opt |= PCRE_MULTILINE;
379 if (opt & RE_DOTALL) pcre_opt |= PCRE_DOTALL;
380 if (opt & RE_EXTENDED) pcre_opt |= PCRE_EXTENDED;
381 if (opt & RE_DOLLAR_ENDONLY) pcre_opt |= PCRE_DOLLAR_ENDONLY;
382 if (opt & RE_UNGREEDY) pcre_opt |= PCRE_UNGREEDY;
383
384 /* Compile the RE */
385 pcre_malloc_size = 0;
386 pcre_malloc_err = "compiling regex";
387 pProg = pcre_compile(get_txt(expr), pcre_opt, &pErrmsg, &erridx, NULL);
388
389 if (NULL == pProg)
390 {
391 rx_free_subdata(rdata); /* Might have HS data in it already */
392 if (from_ed)
393 add_message("pcre: %s at offset %d\n", pErrmsg, erridx);
394 else
395 errorf("pcre: %s at offset %d\n", pErrmsg, erridx);
396 return 0;
397 }
398
399 pcre_malloc_err = "studying regex";
400 pHints = pcre_study(pProg, 0, &pErrmsg);
401
402 if (pErrmsg)
403 {
404 xfree(pProg);
405 if (from_ed)
406 add_message("pcre: %s\n", pErrmsg);
407 else
408 errorf("pcre: %s\n", pErrmsg);
409 return 0;
410 }
411 /* We have to ensure to have an initialized pHints structure for
412 setting the recursion limit later on */
413 if (pHints == NULL)
414 {
415 pcre_malloc_err = "allocating memory for pHints section in regexp";
416 pHints = (pcre_extra *)pcre_xalloc(sizeof(pcre_extra));
417 if (pHints == NULL)
418 {
419 if (from_ed)
420 add_message("pcre: Could not allocate memory for pHints\n");
421 else
422 errorf("pcre: Could not allocate memory for pHints\n");
423 return 0;
424 }
425 pHints->flags = 0;
426 }
427
428 {
429 int rc;
430
431 rc = pcre_fullinfo(pProg, pHints, PCRE_INFO_CAPTURECOUNT, &num_subs);
432 if (rc != 0)
433 {
434 xfree(pProg);
435 xfree(pHints);
436 if (from_ed)
437 add_message("pcre: %s\n", get_error_message(rc, RE_PCRE));
438 else
439 errorf("pcre: %s\n", get_error_message(rc, RE_PCRE));
440 return 0;
441 }
442 num_subs = 3 * (num_subs+1);
443 }
444
445 pSubs = xalloc(num_subs * sizeof (*pSubs));
446 if (pSubs == NULL)
447 {
448 xfree(pProg);
449 xfree(pHints);
450 outofmem(num_subs * sizeof (*pSubs), "regexp work area");
451 }
452 pcre_malloc_size += num_subs * sizeof (*pSubs);
453
454 /* Compilation complete - store the result in the outgoing regdata
455 * structure.
456 */
457
458 rdata->pProg = pProg;
459 rdata->pHints = pHints;
460 rdata->pSubs = pSubs;
461 rdata->num_subs = num_subs;
462 rdata->res = 0;
463
464 return pcre_malloc_size;
465 } /* rx_compile_pcre() */
466
467 /*--------------------------------------------------------------------*/
468 static regdata_t *
rx_compile_data(string_t * expr,int opt,Bool from_ed)469 rx_compile_data (string_t * expr, int opt, Bool from_ed)
470
471 /* Compile a regdata structure from the expression <expr>, according
472 * to the options in <opt>. If <from_ed> is TRUE, the RE is used
473 * from the editor, so error messages will be printed directly
474 * to the user.
475 *
476 * If possible, take a ready-compiled structure from the hashtable,
477 * else enter the newly compiled structure into the table.
478 *
479 * The caller gets his own reference to the structure, which he has
480 * to free_regdata() after use.
481 */
482
483 {
484 size_t pcre_size;
485 regdata_t rdata; /* Local rdata structure to hold compiled expression */
486
487 #ifdef RXCACHE_TABLE
488 whash_t hExpr;
489 int h;
490 RxHashEntry *pHash;
491 #endif
492
493 #ifdef RXCACHE_TABLE
494 iNumXRequests++;
495
496 hExpr = mstr_get_hash(expr);
497 h = RxStrHash(hExpr);
498 pHash = xtable[h];
499
500 /* Look for a ready-compiled regexp */
501 if (pHash != NULL
502 && pHash->pString != NULL
503 && pHash->hString == hExpr
504 && pHash->base.opt == (opt & ~(RE_PACKAGE_MASK))
505 && mstreq(pHash->pString, expr)
506 )
507 {
508 iNumXFound++;
509
510 /* Regexp found, but it may not have been compiled for us yet.
511 */
512 if ((opt & RE_PCRE) && !pHash->base.pProg)
513 {
514 pHash->size = rx_compile_pcre(expr, opt, from_ed, &(pHash->base));
515 if (!pHash->size)
516 return NULL;
517 }
518 if ((opt & RE_TRADITIONAL) && !pHash->base.rx)
519 {
520 if (!rx_compile_re(expr, opt, from_ed, &(pHash->base)))
521 return NULL;
522 }
523 return ref_regdata(&(pHash->base));
524 }
525 #endif
526
527 /* Regexp not found: compile a new one.
528 */
529
530 pcre_size = 0;
531 memset(&rdata, 0, sizeof(rdata));
532
533 if (opt & RE_PCRE)
534 {
535 pcre_size = rx_compile_pcre(expr, opt, from_ed, &rdata);
536 if (!pcre_size)
537 {
538 return NULL;
539 }
540 }
541 if (opt & RE_TRADITIONAL)
542 {
543 if (!rx_compile_re(expr, opt, from_ed, &rdata))
544 {
545 return NULL;
546 }
547 }
548
549 #ifndef RXCACHE_TABLE
550
551 /* Wrap up the new regular expression and return it */
552 {
553 regdata_t * rc;
554
555 rc = xalloc(sizeof(*rc));
556 if (!rc)
557 {
558 rx_free_subdata(&rdata);
559 outofmem(sizeof(*rc), "Regexp data structure");
560 return NULL;
561 }
562
563 memcpy(rc, &rdata, sizeof(*rc));
564
565 rc->ref = 1;
566 rc->opt = opt & ~(RE_PACKAGE_MASK);
567 return rc;
568 }
569 #else
570
571 /* Wrap up the new regular expression and enter it into the table */
572 expr = make_tabled_from(expr); /* for faster comparisons */
573
574 if (NULL != pHash)
575 {
576 iNumXCollisions++;
577 iNumXEntries--;
578 iXSizeAlloc -= sizeof(*pHash) + pHash->size;
579 free_regdata(&(pHash->base));
580 }
581
582 pHash = xalloc(sizeof(*pHash));
583 if (!pHash)
584 {
585 rx_free_subdata(&rdata);
586 outofmem(sizeof(*pHash), "Regexp cache structure");
587 return NULL;
588 }
589 xtable[h] = pHash;
590
591 memcpy(&(pHash->base), &rdata, sizeof(pHash->base));
592
593 pHash->base.ref = 1;
594 pHash->base.opt = opt & ~(RE_PACKAGE_MASK);
595 pHash->pString = expr; /* refs are transferred */
596 pHash->hString = hExpr;
597 pHash->size = pcre_size;
598
599 iNumXEntries++;
600 iXSizeAlloc += sizeof(*pHash) + pHash->size;
601
602 return ref_regdata((regdata_t *)pHash);
603 #endif /* RXCACHE_TABLE */
604 } /* rx_compile_data() */
605
606 /*--------------------------------------------------------------------*/
607 regexp_t *
rx_compile(string_t * expr,int opt,Bool from_ed)608 rx_compile (string_t * expr, int opt, Bool from_ed)
609
610 /* Compile a regexp structure from the expression <expr>, according
611 * to the options in <opt>. If <from_ed> is TRUE, the RE is used
612 * from the editor, so error messages will be printed directly
613 * to the user.
614 *
615 * If possible, take a ready-compiled structure from the hashtable,
616 * else enter the newly compiled structure into the table.
617 *
618 * The caller gets his own unique regexp structure, which he has
619 * to free_regexp() after use.
620 */
621
622 {
623 regdata_t * pData;
624 regexp_t * pRegexp = NULL;
625
626 /* Determine for which package to compile */
627 if (!(opt & RE_PACKAGE_MASK))
628 {
629 if (driver_hook[H_REGEXP_PACKAGE].u.number)
630 opt |= driver_hook[H_REGEXP_PACKAGE].u.number;
631 else
632 opt |= regex_package;
633 }
634
635 if (!(opt & RE_PACKAGE_MASK))
636 {
637 if (from_ed)
638 add_message("Missing specification of which regexp package to use.\n");
639 else
640 errorf("Missing specification of which regexp package to use.\n");
641 return NULL;
642 }
643
644 pData = rx_compile_data(expr, opt, from_ed);
645 if (pData)
646 {
647 pRegexp = xalloc(sizeof(*pRegexp));
648 if (pRegexp == NULL)
649 {
650 free_regdata(pData);
651 outofmem(sizeof(*pRegexp), "regexp structure");
652 return NULL;
653 }
654
655 pRegexp->opt = opt;
656 pRegexp->data = pData;
657 }
658
659 return pRegexp;
660 } /* rx_compile() */
661
662 /*-------------------------------------------------------------------------*/
663 int
rx_exec(regexp_t * pRegexp,string_t * string,size_t start)664 rx_exec (regexp_t *pRegexp, string_t * string, size_t start)
665
666 /* Match the regexp <pRegexp> against the <string>, starting the match
667 * at the position <start>.
668 *
669 * Return a positive number if pattern matched, 0 if it did not match,
670 * or a negative error code (this can be printed with rx_error_message()).
671 */
672
673 {
674 regdata_t * prog = pRegexp->data;
675
676 if (pRegexp->opt & RE_PCRE)
677 {
678 int rc;
679 int pcre_opt;
680 pcre_extra * pHints; /* Study data */
681
682 /* Determine the RE compilation options */
683
684 pcre_opt = 0;
685 if (prog->opt & RE_ANCHORED) pcre_opt |= PCRE_ANCHORED;
686 if (prog->opt & RE_NOTBOL) pcre_opt |= PCRE_NOTBOL;
687 if (prog->opt & RE_NOTEOL) pcre_opt |= PCRE_NOTEOL;
688 if (prog->opt & RE_NOTEMPTY) pcre_opt |= PCRE_NOTEMPTY;
689
690 pHints = prog->pHints;
691 /* If LD_PCRE_RECURSION_LIMIT is defined we set a limit for match. If
692 * PCRE_EXTRA_MATCH_LIMIT_RECURSION is defined, we have a new libpcre,
693 * which supports limiting the recursions. Otherwise we have to limit
694 * the no of calls to match().
695 * TODO: Instead of the conditional compilation we should update the
696 * TODO::built-in pcre package.
697 */
698 #ifdef LD_PCRE_RECURSION_LIMIT
699 #ifdef PCRE_EXTRA_MATCH_LIMIT_RECURSION
700 pHints->flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
701 pHints->match_limit_recursion = LD_PCRE_RECURSION_LIMIT;
702 #else
703 pHints->flags |= PCRE_EXTRA_MATCH_LIMIT;
704 pHints->match_limit = LD_PCRE_RECURSION_LIMIT;
705 #endif /* PCRE_EXTRA_MATCH_LIMIT_RECURSION */
706 #else /* LD_PCRE_RECURSION_LIMIT */
707 #ifdef PCRE_EXTRA_MATCH_LIMIT_RECURSION
708 pHints->flags &= ~PCRE_EXTRA_MATCH_LIMIT_RECURSION
709 #else
710 pHints->flags &= ~PCRE_EXTRA_MATCH_LIMIT;
711 #endif /* PCRE_EXTRA_MATCH_LIMIT_RECURSION */
712 #endif /* LD_PCRE_RECURSION_LIMIT */
713
714 rc = pcre_exec( prog->pProg, pHints
715 , get_txt(string), mstrsize(string), start, pcre_opt
716 , prog->pSubs, prog->num_subs
717 );
718 prog->res = rc;
719
720 /* Reverse the roles of return codes 0 (not enough entries in subs[])
721 * and PCRE_ERROR_NOMATCH.
722 */
723 if (rc == PCRE_ERROR_NOMATCH) rc = 0;
724 else if (rc == 0) rc = PCRE_ERROR_NOMATCH;
725
726 return rc;
727 } /* if (use pcre) */
728
729 /* Fallback: Traditional regexp */
730 return hs_regexec(prog->rx, get_txt(string)+start, get_txt(string));
731 } /* rx_exec() */
732
733 /*-------------------------------------------------------------------------*/
734 int
rx_exec_str(regexp_t * pRegexp,char * string,char * start)735 rx_exec_str (regexp_t *pRegexp, char * string, char * start)
736
737 /* Match the regexp <pRegexp> against the <string> whose real begin
738 * is at <start>.
739 *
740 * Return a positive number if pattern matched, 0 if it did not match,
741 * or a negative error code (this can be printed with rx_error_message()).
742 *
743 * This method is used by ed().
744 */
745
746 {
747 regdata_t * prog = pRegexp->data;
748
749 if (pRegexp->opt & RE_PCRE)
750 {
751 int rc;
752 int pcre_opt;
753 pcre_extra * pHints; /* Study data */
754
755 /* Determine the RE compilation options */
756
757 pcre_opt = 0;
758 if (prog->opt & RE_ANCHORED) pcre_opt |= PCRE_ANCHORED;
759 if (prog->opt & RE_NOTBOL) pcre_opt |= PCRE_NOTBOL;
760 if (prog->opt & RE_NOTEOL) pcre_opt |= PCRE_NOTEOL;
761 if (prog->opt & RE_NOTEMPTY) pcre_opt |= PCRE_NOTEMPTY;
762
763 pHints = prog->pHints;
764 /* If LD_PCRE_RECURSION_LIMIT is defined we set a limit for match. If
765 * PCRE_EXTRA_MATCH_LIMIT_RECURSION is defined, we have a new libpcre,
766 * which supports limiting the recursions. Otherwise we have to limit
767 * the no of calls to match().
768 * TODO: Instead of the conditional compilation we should update the
769 * TODO::built-in pcre package.
770 */
771 #ifdef LD_PCRE_RECURSION_LIMIT
772 #ifdef PCRE_EXTRA_MATCH_LIMIT_RECURSION
773 pHints->flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
774 pHints->match_limit_recursion = LD_PCRE_RECURSION_LIMIT;
775 #else
776 pHints->flags |= PCRE_EXTRA_MATCH_LIMIT;
777 pHints->match_limit = LD_PCRE_RECURSION_LIMIT;
778 #endif /* PCRE_EXTRA_MATCH_LIMIT_RECURSION */
779 #else /* LD_PCRE_RECURSION_LIMIT */
780 #ifdef PCRE_EXTRA_MATCH_LIMIT_RECURSION
781 pHints->flags &= ~PCRE_EXTRA_MATCH_LIMIT_RECURSION
782 #else
783 pHints->flags &= ~PCRE_EXTRA_MATCH_LIMIT;
784 #endif /* PCRE_EXTRA_MATCH_LIMIT_RECURSION */
785 #endif /* LD_PCRE_RECURSION_LIMIT */
786
787 rc = pcre_exec( prog->pProg, pHints
788 , start, strlen(start), string - start, pcre_opt
789 , prog->pSubs, prog->num_subs
790 );
791 prog->res = rc;
792
793 /* Reverse the roles of return codes 0 (not enough entries in subs[])
794 * and PCRE_ERROR_NOMATCH.
795 */
796 if (rc == PCRE_ERROR_NOMATCH) rc = 0;
797 else if (rc == 0) rc = PCRE_ERROR_NOMATCH;
798
799 return rc;
800 } /* if (use pcre) */
801
802 /* Fallback: Traditional regexp */
803 return hs_regexec(prog->rx, string, start);
804 } /* rx_exec_str() */
805
806 /*-------------------------------------------------------------------------*/
807 int
rx_num_matches(regexp_t * pRegexp)808 rx_num_matches (regexp_t *pRegexp)
809
810 /* After a successful match of <pRegexp>, return the number of matched
811 * expressions and parenthesized expressions.
812 * In other words: the result is at least 1 (for the fully matched
813 * expressions), plus the number of matched '()' sub expressions.
814 *
815 * The function tries to detect when it is called after an unsuccessful
816 * match and return 0 in that case.
817 */
818
819 {
820 if (pRegexp->opt & RE_PCRE)
821 {
822 return pRegexp->data->res >= 1 ? pRegexp->data->res : 0;
823 } /* if (use pcre) */
824
825 /* Fallback: Traditional regexp */
826 {
827 regdata_t * prog = pRegexp->data;
828 p_uint num, i;
829
830 for (num = 0, i = 0
831 ; i < sizeof(prog->rx->startp) / sizeof(prog->rx->startp[0])
832 ; i++)
833 {
834 if (prog->rx->startp[i] != NULL
835 && prog->rx->endp[i] != NULL
836 )
837 num++;
838 }
839
840 return num;
841 }
842 } /* rx_num_matches() */
843
844 /*-------------------------------------------------------------------------*/
845 Bool
rx_get_match_n(regexp_t * pRegexp,string_t * str,int n,size_t * start,size_t * end)846 rx_get_match_n (regexp_t *pRegexp, string_t * str, int n, size_t * start, size_t * end)
847
848 /* After a successful match of <pRegexp> against <str>, store the start
849 * and end position of the match <n> in *<start> and *<end> and retuern TRUE.
850 * The end position is in fact the position of the first character after
851 * the match.
852 *
853 * <n> is 0 for the whole expression, and any positive number for the
854 * matched subexpressions.
855 *
856 * If the requested match does not exist, *<start> and *<end> are both
857 * set to 0 and the function returns FALSE.
858 */
859
860 {
861 Bool rc;
862 regdata_t * prog = pRegexp->data;
863
864 if (pRegexp->opt & RE_PCRE)
865 {
866 if (n < 0
867 || n >= prog->res
868 || prog->pSubs[2*n] < 0
869 || prog->pSubs[2*n+1] < 0
870 )
871 {
872 *start = 0;
873 *end = 0;
874 rc = MY_FALSE;
875 }
876 else
877 {
878 *start = (size_t)prog->pSubs[2*n];
879 *end = (size_t)prog->pSubs[2*n+1];
880 rc = MY_TRUE;
881 }
882 return rc;
883 } /* if (use pcre) */
884
885 /* Fallback: Traditional regexp */
886 if (n < 0
887 || n >= (int)(sizeof(prog->rx->startp) / sizeof(prog->rx->startp[0]))
888 || prog->rx->startp[n] == NULL
889 || prog->rx->endp[n] == NULL
890 )
891 {
892 *start = 0;
893 *end = 0;
894 rc = MY_FALSE;
895 }
896 else
897 {
898 *start = prog->rx->startp[n] - get_txt(str);
899 *end = prog->rx->endp[n] - get_txt(str);
900 rc = MY_TRUE;
901 }
902
903 return rc;
904 } /* rx_get_match_n() */
905
906 /*-------------------------------------------------------------------------*/
907 void
rx_get_match(regexp_t * pRegexp,string_t * str,size_t * start,size_t * end)908 rx_get_match (regexp_t *pRegexp, string_t * str, size_t * start, size_t * end)
909
910 /* After a successful match of <pRegexp> against <str>, return the start
911 * and end position of the match in *<start> and *<end>. The end
912 * position is in fact the position of the first character after the match.
913 */
914
915 {
916 regdata_t * prog = pRegexp->data;
917
918 if (pRegexp->opt & RE_PCRE)
919 {
920 *start = (size_t)prog->pSubs[0];
921 *end = (size_t)prog->pSubs[1];
922 return;
923 } /* if (use pcre) */
924
925 /* Fallback: Traditional regexp */
926 *start = prog->rx->startp[0] - get_txt(str);
927 *end = prog->rx->endp[0] - get_txt(str);
928 } /* rx_get_match() */
929
930 /*-------------------------------------------------------------------------*/
931 void
rx_get_match_str(regexp_t * pRegexp,char * str,size_t * start,size_t * end)932 rx_get_match_str (regexp_t *pRegexp, char * str, size_t * start, size_t * end)
933
934 /* After a successful match of <pRegexp> against <str>, return the start
935 * and end position of the match in *<start> and *<end>. The end
936 * position is in fact the position of the first character after the match.
937 */
938
939 {
940 regdata_t * prog = pRegexp->data;
941
942 if (pRegexp->opt & RE_PCRE)
943 {
944 *start = (size_t)prog->pSubs[0];
945 *end = (size_t)prog->pSubs[1];
946 return;
947 } /* if (use pcre) */
948
949 /* Fallback: Traditional regexp */
950 *start = prog->rx->startp[0] - str;
951 *end = prog->rx->endp[0] - str;
952 } /* rx_get_match_str() */
953
954 /*-------------------------------------------------------------------------*/
955 string_t *
rx_sub(regexp_t * pRegexp,string_t * source,string_t * subst)956 rx_sub (regexp_t *pRegexp, string_t *source, string_t *subst)
957
958 /* <pRegexp> describes a regexp match in string <source>. Take the
959 * replacement string <subst> and substitute any matched subparentheses.
960 * The result is a new string with one reference.
961 *
962 * Returns NULL when out of memory.
963 */
964
965 {
966 Bool copyPass; /* Pass indicator */
967 size_t len; /* Computed length of the result */
968 string_t * result; /* Result string */
969 regdata_t * prog = pRegexp->data;
970
971 result = NULL;
972
973 /* Make two passes over the the string: one to compute the size
974 * of the result, the second to create the result.
975 */
976 copyPass = MY_FALSE;
977 len = 0;
978 do
979 {
980 char * src;
981 char * dst = NULL;
982 size_t left;
983
984 left = mstrsize(subst);
985 src = get_txt(subst);
986 if (copyPass)
987 dst = get_txt(result);
988
989 while (left-- > 0)
990 {
991 int no;
992 char c;
993
994 c = *src++;
995 if (c == '&')
996 no = 0;
997 else if (c == '\\' && '0' <= *src && *src <= '9')
998 {
999 no = *src++ - '0';
1000 left--;
1001 }
1002 else
1003 no = -1;
1004
1005 if (no < 0) /* Ordinary character. */
1006 {
1007 if (c == '\\' && (*src == '\\' || *src == '&'))
1008 {
1009 c = *src++;
1010 left--;
1011 }
1012 if (copyPass)
1013 *dst++ = c;
1014 else
1015 len++;
1016 }
1017 else
1018 {
1019 if (pRegexp->opt & RE_PCRE)
1020 {
1021 if (no < prog->res
1022 && prog->pSubs[2*no] != -1
1023 && prog->pSubs[2*no+1] != -1
1024 )
1025 {
1026 size_t start = (size_t)prog->pSubs[2*no];
1027 size_t sublen = (size_t)prog->pSubs[2*no+1] - start;
1028
1029 if (copyPass)
1030 {
1031 memcpy(dst, get_txt(source)+start, sublen);
1032 dst += sublen;
1033 }
1034 else
1035 {
1036 len += sublen;
1037 }
1038 }
1039 } /* if (use pcre) */
1040
1041 if (pRegexp->opt & RE_TRADITIONAL)
1042 {
1043 if (no < (int)(sizeof(prog->rx->startp) / sizeof(prog->rx->startp[0]))
1044 && prog->rx->startp[no] != NULL
1045 && prog->rx->endp[no] != NULL
1046 )
1047 {
1048 size_t sublen = prog->rx->endp[no] - prog->rx->startp[no];
1049
1050 if (copyPass)
1051 {
1052 memcpy(dst, prog->rx->startp[no], sublen);
1053 dst += sublen;
1054 }
1055 else
1056 {
1057 len += sublen;
1058 }
1059 }
1060 } /* if (use traditional) */
1061 }
1062 } /* while(left-- > 0) */
1063
1064 /* End of pass */
1065 if (!copyPass)
1066 {
1067 result = alloc_mstring(len);
1068 if (!result)
1069 return NULL;
1070 }
1071 copyPass = !copyPass;
1072 } while (copyPass);
1073
1074 return result;
1075 } /* rx_sub() */
1076
1077 /*-------------------------------------------------------------------------*/
1078 string_t *
rx_sub_str(regexp_t * pRegexp,char * source,char * subst)1079 rx_sub_str (regexp_t *pRegexp, char *source, char *subst)
1080
1081 /* <pRegexp> describes a regexp match in string <source>. Take the
1082 * replacement string <subst> and substitute any matched subparentheses.
1083 * The result is a new string with one reference.
1084 *
1085 * Returns NULL when out of memory.
1086 */
1087
1088 {
1089 string_t *m_source;
1090 string_t *m_subst;
1091 string_t *rc;
1092
1093 m_source = new_mstring(source);
1094 if (!m_source)
1095 {
1096 return NULL;
1097 }
1098
1099 m_subst = new_mstring(subst);
1100 if (!m_subst)
1101 {
1102 free_mstring(m_source);
1103 return NULL;
1104 }
1105 rc = rx_sub(pRegexp, m_source, m_subst);
1106 free_mstring(m_source);
1107 free_mstring(m_subst);
1108 return rc;
1109 } /* rx_sub_str() */
1110
1111 /*-------------------------------------------------------------------------*/
1112 Bool
rx_reganch(regexp_t * pRegexp)1113 rx_reganch (regexp_t *pRegexp)
1114
1115 /* If <pRegexp> is a traditional regexp, return TRUE if rx->reganch
1116 * is not NULL. If <pRegexp> is a PCRE, always return FALSE.
1117 *
1118 * Used by ed.c
1119 */
1120
1121 {
1122 if (pRegexp->opt & RE_PCRE)
1123 return MY_FALSE;
1124
1125 /* Traditional regexp */
1126 return pRegexp->data->rx->reganch != 0;
1127 } /* rx_reganch() */
1128
1129 /*--------------------------------------------------------------------*/
1130 static void
rx_free_subdata(regdata_t * expr)1131 rx_free_subdata (regdata_t * expr)
1132
1133 /* Deallocate all associated data in regdata structure <expr>.
1134 */
1135
1136 {
1137 if (expr->pSubs) xfree(expr->pSubs); expr->pSubs = NULL;
1138 if (expr->pHints) xfree(expr->pHints); expr->pHints = NULL;
1139 if (expr->pProg) xfree(expr->pProg); expr->pProg = NULL;
1140 if (expr->rx) xfree(expr->rx); expr->rx = NULL;
1141 } /* rx_free_subdata() */
1142
1143 /*--------------------------------------------------------------------*/
1144 static void
rx_free_data(regdata_t * expr)1145 rx_free_data (regdata_t * expr)
1146
1147 /* Deallocate a regdata structure <expr> and all associated data.
1148 #ifdef RXCACHE_TABLE
1149 * <expr> is in fact a RxHashEntry structure.
1150 #endif
1151 */
1152
1153 {
1154 #ifdef RXCACHE_TABLE
1155 {
1156 RxHashEntry * pHash = (RxHashEntry *)expr;
1157 free_mstring(pHash->pString);
1158 }
1159 #endif
1160 rx_free_subdata(expr);
1161 xfree(expr);
1162 } /* rx_free_data() */
1163
1164 /*--------------------------------------------------------------------*/
1165 void
free_regexp(regexp_t * expr)1166 free_regexp (regexp_t * expr)
1167
1168 /* Deallocate a regexp structure <expr> and all associated data.
1169 */
1170
1171 {
1172 free_regdata(expr->data);
1173 xfree(expr);
1174 } /* free_regexp() */
1175
1176 /*--------------------------------------------------------------------*/
1177 size_t
rxcache_status(strbuf_t * sbuf,Bool verbose)1178 rxcache_status (strbuf_t *sbuf, Bool verbose)
1179
1180 /* Gather (and optionally print) the statistics from the rxcache.
1181 * Return the amount of memory used.
1182 */
1183
1184 {
1185 #ifdef RXCACHE_TABLE
1186
1187 uint32 iNumXReq; /* Number of rx_compile() requests, made non-zero */
1188
1189 #if defined(__MWERKS__) && !defined(WARN_ALL)
1190 # pragma warn_largeargs off
1191 #endif
1192
1193 /* In verbose mode, print the statistics */
1194 if (verbose)
1195 {
1196 strbuf_add(sbuf, "\nRegexp cache status:\n");
1197 strbuf_add(sbuf, "--------------------\n");
1198 strbuf_addf(sbuf, "Expressions in cache: %"PRIu32" (%.1f%%)\n"
1199 , iNumXEntries, 100.0 * (float)iNumXEntries / RXCACHE_TABLE);
1200 strbuf_addf(sbuf, "Memory allocated: %"PRIu32"\n", iXSizeAlloc);
1201 iNumXReq = iNumXRequests ? iNumXRequests : 1;
1202 strbuf_addf(sbuf
1203 , "Requests: %"PRIu32" - Found: %"PRIu32" (%.1f%%) - "
1204 "Coll: %"PRIu32" (%.1f%% req/%.1f%% entries)\n"
1205 , iNumXRequests, iNumXFound, 100.0 * (float)iNumXFound/(float)iNumXReq
1206 , iNumXCollisions, 100.0 * (float)iNumXCollisions/(float)iNumXReq
1207 , 100.0 * (float)iNumXCollisions/(iNumXEntries ? iNumXEntries : 1)
1208 );
1209 }
1210 else
1211 {
1212 strbuf_addf(sbuf, "Regexp cache:\t\t\t%8"PRId32" %9"PRIu32"\n",
1213 iNumXEntries, iXSizeAlloc);
1214 }
1215
1216 return iXSizeAlloc;
1217
1218 #if defined(__MWERKS__)
1219 # pragma warn_largeargs reset
1220 #endif
1221
1222 #else
1223
1224 return 0;
1225 #endif
1226 } /* rxcache_status() */
1227
1228 /*-------------------------------------------------------------------------*/
1229 void
rxcache_dinfo_status(svalue_t * svp,int value)1230 rxcache_dinfo_status (svalue_t *svp, int value)
1231
1232 /* Return the rxcache information for debug_info(DINFO_DATA, DID_STATUS).
1233 * <svp> points to the svalue block for the result, this function fills in
1234 * the spots for the object table.
1235 * If <value> is -1, <svp> points indeed to a value block; other it is
1236 * the index of the desired value and <svp> points to a single svalue.
1237 */
1238
1239 {
1240 #ifdef RXCACHE_TABLE
1241
1242 #define ST_NUMBER(which,code) \
1243 if (value == -1) svp[which].u.number = code; \
1244 else if (value == which) svp->u.number = code
1245
1246 ST_NUMBER(DID_ST_RX_CACHED, iNumXEntries);
1247 ST_NUMBER(DID_ST_RX_TABLE, RXCACHE_TABLE);
1248 ST_NUMBER(DID_ST_RX_TABLE_SIZE, iXSizeAlloc);
1249 ST_NUMBER(DID_ST_RX_REQUESTS, iNumXRequests);
1250 ST_NUMBER(DID_ST_RX_REQ_FOUND, iNumXFound);
1251 ST_NUMBER(DID_ST_RX_REQ_COLL, iNumXCollisions);
1252
1253 #undef ST_NUMBER
1254 #endif
1255 } /* rxcache_dinfo_status() */
1256
1257 /*--------------------------------------------------------------------*/
1258 #if defined(GC_SUPPORT)
1259
1260 /*--------------------------------------------------------------------*/
1261 void
clear_rxcache_refs(void)1262 clear_rxcache_refs (void)
1263
1264 /* Clear all the refcounts in the hashtables.
1265 * The refs of the shared strings and of the memory blocks are
1266 * not of our concern.
1267 */
1268
1269 {
1270 #ifdef RXCACHE_TABLE
1271 int i;
1272
1273 for (i = 0; i < RXCACHE_TABLE; i++)
1274 if (NULL != xtable[i])
1275 {
1276 xtable[i]->base.ref = 0;
1277 }
1278 #endif
1279 } /* clear_rxcache_refs() */
1280
1281 /*--------------------------------------------------------------------*/
1282 void
clear_regexp_ref(regexp_t * pRegexp)1283 clear_regexp_ref (regexp_t * pRegexp)
1284
1285 /* Clear the refcount for <pRegexp>.
1286 */
1287
1288 {
1289 if (pRegexp)
1290 pRegexp->data->ref = 0;
1291 } /* clear_regexp_ref() */
1292
1293 /*--------------------------------------------------------------------*/
1294 void
count_regdata_ref(regdata_t * pRegexp)1295 count_regdata_ref (regdata_t * pRegexp)
1296
1297 /* Mark all memory associated with one regexp structure and count
1298 * the refs.
1299 * This function is called both from rxcache as well as from ed.
1300 */
1301
1302 {
1303 note_malloced_block_ref(pRegexp);
1304 if (pRegexp->pProg)
1305 {
1306 note_malloced_block_ref(pRegexp->pProg);
1307 if (pRegexp->pHints)
1308 note_malloced_block_ref(pRegexp->pHints);
1309 if (pRegexp->pSubs)
1310 note_malloced_block_ref(pRegexp->pSubs);
1311 }
1312 if (pRegexp->rx)
1313 note_malloced_block_ref(pRegexp->rx);
1314 #ifdef RXCACHE_TABLE
1315 count_ref_from_string(((RxHashEntry *)pRegexp)->pString);
1316 #endif
1317 pRegexp->ref++;
1318 } /* count_regdata_ref() */
1319
1320 /*--------------------------------------------------------------------*/
1321 void
count_regexp_ref(regexp_t * pRegexp)1322 count_regexp_ref (regexp_t * pRegexp)
1323
1324 /* Mark all memory associated with one regexp structure and count
1325 * the refs.
1326 * This function is called both from rxcache as well as from ed.
1327 */
1328
1329 {
1330 note_malloced_block_ref(pRegexp);
1331 count_regdata_ref(pRegexp->data);
1332 } /* count_regexp_ref() */
1333
1334 /*--------------------------------------------------------------------*/
1335 void
count_rxcache_refs(void)1336 count_rxcache_refs (void)
1337
1338 /* Mark all memory referenced from the hashtables. */
1339
1340 {
1341 #ifdef RXCACHE_TABLE
1342 int i;
1343
1344 for (i = 0; i < RXCACHE_TABLE; i++)
1345 {
1346 if (NULL != xtable[i])
1347 {
1348 count_regdata_ref((regdata_t *)xtable[i]);
1349 }
1350 } /* for (i) */
1351 #endif
1352
1353 } /* count_rxcache_refs() */
1354
1355 #endif /* if GC_SUPPORT */
1356
1357 /*====================================================================*/
1358
1359