1 /*------------------------------------------------------------------
2  * Regular Expression Wrapper and Cache
3  * Written 1998, 2002, 2004 by Lars Duening.
4  * Share and Enjoy!
5  *------------------------------------------------------------------
6  * This module serves as wrapper around optional alternative
7  * regular expression implementations. It wraps the unique regexp
8  * structures into a ref-counted structure, and lets the API hand
9  * out glorified pointers to this internal structure.
10  *
11  * Beware! rx_exec() stores result data in the regexp structure, so the
12  * same pattern must not be used in two concurrent rx_compile/rx_exec pairs.
13  *
14 #ifdef RXCACHE_TABLE
15  * Additionally, the regular expressions are held in a cache.
16  * Usage of the cache can reduce the setup time
17  * for regexps by factor 4; the actual regexp matching showed up
18  * in experiments as being fast enough to make a cache for the
19  * results worthless.
20  *
21  * Compiled expressions are stored together with their generator
22  * strings in a hash table, hashed over the generator string content.
23  *
24  * The table sizes are specified in config.h as follows:
25  *   RXCACHE_TABLE: size of the expression hash table
26 #endif
27  *
28  * TODO: Using shared strings as cache-indices can speed things up,
29  * TODO:: especially when knowing where to find the hashvalue from
30  * TODO:: the strtable.
31  * TODO: Use in-table chaining to improve the use of the table space
32  * TODO:: and reduce the number of collisions.
33  * TODO: Separate the results out from HS-Regexp, so that the compiled
34  * TODO:: program and the results can be held separate.
35  *------------------------------------------------------------------
36  */
37 
38 /*--------------------------------------------------------------------*/
39 
40 #include "driver.h"
41 
42 #include <stdio.h>
43 #include <string.h>
44 
45 #include "mregex.h"
46 
47 #include "comm.h" /* add_message() */
48 #include "gcollect.h"
49 #include "hash.h"
50 #include "interpret.h"
51 #include "main.h"
52 #include "mstrings.h"
53 #include "pkg-pcre.h"
54 #include "regexp.h"
55 #include "simulate.h"
56 #include "strfuns.h"
57 #include "svalue.h"
58 #include "xalloc.h"
59 
60 #include "../mudlib/sys/debug_info.h"
61 #include "../mudlib/sys/driver_hook.h"
62 #include "../mudlib/sys/regexp.h"
63 
64 /*--------------------------------------------------------------------*/
65 
66 /* --- struct regdata_s: the regexp structure wrapper ---
67  *
68  * This structure wraps the actual regexp structure into a common
69  * format.
70  *
71  * The regular expressions are compiled as needed, with the pointers
72  * serving as flags:
73  *   .pProg != NULL: PCRE version compiled
74  *   .rx    != NULL: traditional regex version compiled.
75  *
76 #ifdef RXCACHE_TABLE
77  * The regexp cache embeds this structure into a larger one which
78  * it uses to manage the cached expressions.
79 #endif
80  */
81 
82 typedef struct regdata_s {
83     p_uint        ref;       /* Number of refs */
84     int           opt;       /* Additional options, but no package flags */
85     /* -- PCRE -- */
86     pcre        * pProg;     /* The generated regular expression */
87     pcre_extra  * pHints;    /* Study data */
88     int           num_subs;  /* Number of elements in pSubs */
89     int         * pSubs;     /* Substring offsets + workarea */
90     int           res;       /* Result of last rx_exec() */
91     /* -- Traditional -- */
92     regexp      * rx;        /* The actual regular expression */
93 } regdata_t;
94 
95 #ifdef RXCACHE_TABLE
96 
97 /* --- struct RxHashEntry: One expression hashtable entry ---
98  * Derived from regexp_i_t
99  */
100 
101 typedef struct RxHashEntry {
102     regdata_t base;      /* The base regexp_t structure */
103 
104     string_t * pString;  /* Generator string, a counted tabled string
105                           * NULL if unused */
106     whash_t    hString;  /* Hash of pString */
107     size_t     size;     /* Size of regexp expressions for statistics */
108 } RxHashEntry;
109 
110 #endif /* RXCHACHE_TABLE */
111 
112 /* --- struct regexp_s: the regexp structure pimpl ---
113  *
114  * This structure is a decorated pointer to the regdata_t structure,
115  * allowing to store unique options and data for shared regexps.
116  *
117  * Since this structure is recreated for every rx_compile() call,
118  * it stores the selection flags of which regexp package to use
119  * for this call.
120  */
121 
122 struct regexp_s {
123     int         opt;   /* Additional options, incl the package flags. */
124     regdata_t * data;  /* Counted pointer to the internal structure */
125 };
126 
127 /*--------------------------------------------------------------------*/
128 
129 /* --- Variables --- */
130 
131 #ifdef RXCACHE_TABLE
132 
133 static RxHashEntry * xtable[RXCACHE_TABLE];  /* The Expression Hashtable */
134 
135 /* Expression cache statistics */
136 static uint32 iNumXRequests   = 0;  /* Number of calls to rx_compile() */
137 static uint32 iNumXFound      = 0;  /* Number of calls satisfied from table */
138 static uint32 iNumXCollisions = 0;  /* Number of hashcollisions */
139 static uint32 iNumXEntries    = 0;  /* Number of used cache entries */
140 static uint32 iXSizeAlloc     = 0;  /* Dynamic memory held in regexp structs */
141 
142 #endif /* RXCACHE_TABLE */
143 
144 static size_t pcre_malloc_size;
145   /* Accumulated size from pcre_malloc() calls. Used when creating
146    * a new PCRE to capture its allocated size for statistics.
147    */
148 
149 static const char* pcre_malloc_err;
150   /* Set by the caller of PCRE functions so that an out-of-memory
151    * condition in the pcre_xalloc() wrapper can get the proper
152    * error message.
153    */
154 
155 /*--------------------------------------------------------------------*/
156 /* --- Macros --- */
157 
158 /* regdata_t *ref_regdata(regdata_t *r)
159  *   Add another ref to regdata <r> and return the regdata <r>.
160  */
161 
162 #define ref_regdata(r) ((r)->ref++, (r))
163 
164 /* void free_regdata(regdata_t *r)
165  *   Subtract one ref from regdata <r>, and free the regdata fully if
166  *   the refcount reaches zero.
167  */
168 
169 static void rx_free_data (regdata_t * expr); /* forward */
170 static void rx_free_subdata (regdata_t * expr); /* forward */
171 
172 #define free_regdata(r) MACRO( if (--((r)->ref) <= 0) rx_free_data(r); )
173 
174 #ifdef RXCACHE_TABLE
175 
176 /* Hash functions, inspired by interpret.c */
177 
178 #if !( (RXCACHE_TABLE) & (RXCACHE_TABLE)-1 )
179 #define RxStrHash(s) ((s) & ((RXCACHE_TABLE)-1))
180 #else
181 #define RxStrHash(s) ((s) % RXCACHE_TABLE)
182 #endif
183 
184 #endif /* RXCHACHE_TABLE */
185 
186 /*--------------------------------------------------------------------*/
187 static void *
pcre_xalloc(size_t size)188 pcre_xalloc (size_t size)
189 
190 /* Wrapper function so that PCRE will use our special allocator. */
191 
192 {
193     void * p;
194 
195     if (!pcre_malloc_err)
196         pcre_malloc_err = "in PCRE function";
197     xallocate(p, size, pcre_malloc_err);
198     pcre_malloc_size += size;
199     return p;
200 } /* pcre_xalloc() */
201 
202 /*--------------------------------------------------------------------*/
203 const char *
rx_pcre_version(void)204 rx_pcre_version (void)
205 
206 /* Return a string with the name and version of the PCRE package.
207  */
208 
209 {
210     static char buf[40];
211     sprintf(buf, "%d.%d", PCRE_MAJOR, PCRE_MINOR);
212 #    ifdef USE_BUILTIN_PCRE
213     strcat(buf, " (builtin)");
214 #    endif
215     return buf;
216 } /* rx_pcre_version() */
217 
218 /*--------------------------------------------------------------------*/
rx_init(void)219 void rx_init(void)
220 
221 /* Initialise the module. */
222 
223 {
224 #ifdef RXCACHE_TABLE
225     memset(xtable, 0, sizeof(xtable));
226 #endif
227 
228     pcre_malloc = pcre_xalloc;
229     pcre_free = xfree;
230 } /* rx_init() */
231 
232 /*--------------------------------------------------------------------*/
233 static const char *
get_error_message(int code,int package)234 get_error_message (int code, int package)
235 
236 /* Return a constant string with the error message for <code>
237  * generated by <package>.
238  * If <code> is not an error, NULL is returned.
239  */
240 
241 {
242     if (package & RE_PCRE)
243     {
244         const char* text;
245 
246         if (code >= 0)
247             return NULL;
248         switch (code)
249         {
250         case PCRE_ERROR_NOMATCH:
251             text = "too many capturing parentheses"; break;
252         case PCRE_ERROR_NULL:
253             text = "code or subject NULL"; break;
254         case PCRE_ERROR_BADOPTION:
255             text = "unknown option specified"; break;
256         case PCRE_ERROR_BADMAGIC:
257             text = "regex memory invalid"; break;
258         case PCRE_ERROR_UNKNOWN_NODE:
259             text = "regex memory violated"; break;
260         case PCRE_ERROR_NOMEMORY:
261             text = "out of memory"; break;
262         case RE_ERROR_BACKTRACK:
263             text = "too many backtracks"; break;
264         default:
265             text = "unknown internal error"; break;
266         }
267         return text;
268     }
269 
270     if (package & RE_TRADITIONAL)
271     {
272         const char* text;
273 
274         if (code >= 0)
275             return NULL;
276         switch (code)
277         {
278         case RE_ERROR_NULL:
279             text = "program or text NULL"; break;
280         case RE_ERROR_CORRUPT:
281             text = "regex memory invalid"; break;
282         case RE_ERROR_BACKTRACK:
283             text = "too many backtracks"; break;
284         default:
285             text = "unknown internal error"; break;
286         }
287         return text;
288     }
289 
290     return NULL;
291 }  /* get_error_message() */
292 
293 /*--------------------------------------------------------------------*/
294 const char *
rx_error_message(int code,const regexp_t * pRegexp)295 rx_error_message (int code, const regexp_t * pRegexp)
296 
297 /* Return a constant string with the error message for <code>
298  * and regexp <pRegexp>.
299  * If <code> is not an error, NULL is returned.
300  */
301 
302 {
303     return get_error_message(code, pRegexp->opt);
304 }  /* rx_error_message() */
305 
306 /*--------------------------------------------------------------------*/
307 static Bool
rx_compile_re(string_t * expr,int opt,Bool from_ed,regdata_t * rdata)308 rx_compile_re (string_t * expr, int opt, Bool from_ed, regdata_t * rdata)
309 
310 /* Compile the expression <expr> with the options <opt> into a traditional
311  * expression and store it into *<rdata>.
312  * On success, return TRUE;
313  * On failure, if <from_ed> is FALSE, an error is thrown.
314  * On failure, if <from_ed> is TRUE, the error message is printed directly
315  * to the user and the function returns with  FALSE.
316  */
317 
318 {
319     char   * pErrmsg;
320     int      erridx;
321 
322     regexp * pRegexp = NULL;
323 
324     /* Sanitize the <opt> value */
325     opt = opt & RE_EXCOMPATIBLE & ~(RE_PACKAGE_MASK);
326 
327     /* Compile the RE */
328     pRegexp = hs_regcomp((unsigned char *)get_txt(expr)
329                         , opt & RE_EXCOMPATIBLE
330                         , &pErrmsg, &erridx);
331     if (NULL == pRegexp)
332     {
333         rx_free_subdata(rdata); /* Might have PCRE data in it already */
334         if (from_ed)
335             add_message("re: %s at offset %d\n", pErrmsg, erridx);
336         else
337             errorf("re: %s at offset %d\n", pErrmsg, erridx);
338         return MY_FALSE;
339     }
340 
341     /* Compilation complete - store the result in the outgoing regdata
342      * structure.
343      */
344     rdata->rx = pRegexp;
345 
346     return MY_TRUE;
347 } /* rx_compile_re() */
348 
349 /*--------------------------------------------------------------------*/
350 static size_t
rx_compile_pcre(string_t * expr,int opt,Bool from_ed,regdata_t * rdata)351 rx_compile_pcre (string_t * expr, int opt, Bool from_ed, regdata_t * rdata)
352 
353 /* Compile the expression <expr> with the options <opt> into a PCRE
354  * expression and store it into *<rdata>.
355  * On success, return the accumulated size of the expression.
356  * On failure, if <from_ed> is FALSE, an error is thrown.
357  * On failure, if <from_ed> is TRUE, the error message is printed directly
358  * to the user and the function returns with 0.
359  */
360 
361 {
362     const char * pErrmsg;
363     int          erridx;
364 
365     pcre       * pProg = NULL;     /* The generated regular expression */
366     pcre_extra * pHints = NULL;    /* Study data */
367     int        * pSubs;     /* Capturing and work area */
368     int          pcre_opt;  /* <opt> translated into PCRE opts */
369     int          num_subs;  /* Number of capturing parentheses */
370 
371     /* Sanitize the <opt> value */
372     opt = opt & ~(RE_EXCOMPATIBLE) & ~(RE_PACKAGE_MASK);
373 
374     /* Determine the RE compilation options */
375 
376     pcre_opt = 0;
377     if (opt & RE_CASELESS)       pcre_opt |= PCRE_CASELESS;
378     if (opt & RE_MULTILINE)      pcre_opt |= PCRE_MULTILINE;
379     if (opt & RE_DOTALL)         pcre_opt |= PCRE_DOTALL;
380     if (opt & RE_EXTENDED)       pcre_opt |= PCRE_EXTENDED;
381     if (opt & RE_DOLLAR_ENDONLY) pcre_opt |= PCRE_DOLLAR_ENDONLY;
382     if (opt & RE_UNGREEDY)       pcre_opt |= PCRE_UNGREEDY;
383 
384     /* Compile the RE */
385     pcre_malloc_size = 0;
386     pcre_malloc_err  = "compiling regex";
387     pProg = pcre_compile(get_txt(expr), pcre_opt, &pErrmsg, &erridx, NULL);
388 
389     if (NULL == pProg)
390     {
391         rx_free_subdata(rdata); /* Might have HS data in it already */
392         if (from_ed)
393             add_message("pcre: %s at offset %d\n", pErrmsg, erridx);
394         else
395             errorf("pcre: %s at offset %d\n", pErrmsg, erridx);
396         return 0;
397     }
398 
399     pcre_malloc_err  = "studying regex";
400     pHints = pcre_study(pProg, 0, &pErrmsg);
401 
402     if (pErrmsg)
403     {
404         xfree(pProg);
405         if (from_ed)
406             add_message("pcre: %s\n", pErrmsg);
407         else
408             errorf("pcre: %s\n", pErrmsg);
409         return 0;
410     }
411     /* We have to ensure to have an initialized pHints structure for
412        setting the recursion limit later on */
413     if (pHints == NULL)
414     {
415         pcre_malloc_err = "allocating memory for pHints section in regexp";
416         pHints = (pcre_extra *)pcre_xalloc(sizeof(pcre_extra));
417         if (pHints == NULL)
418         {
419             if (from_ed)
420                 add_message("pcre: Could not allocate memory for pHints\n");
421             else
422                 errorf("pcre: Could not allocate memory for pHints\n");
423             return 0;
424         }
425         pHints->flags = 0;
426     }
427 
428     {
429        int rc;
430 
431         rc = pcre_fullinfo(pProg, pHints, PCRE_INFO_CAPTURECOUNT, &num_subs);
432         if (rc != 0)
433         {
434             xfree(pProg);
435             xfree(pHints);
436             if (from_ed)
437                 add_message("pcre: %s\n", get_error_message(rc, RE_PCRE));
438             else
439                 errorf("pcre: %s\n", get_error_message(rc, RE_PCRE));
440             return 0;
441         }
442         num_subs = 3 * (num_subs+1);
443     }
444 
445     pSubs = xalloc(num_subs * sizeof (*pSubs));
446     if (pSubs == NULL)
447     {
448         xfree(pProg);
449         xfree(pHints);
450         outofmem(num_subs * sizeof (*pSubs), "regexp work area");
451     }
452     pcre_malloc_size += num_subs * sizeof (*pSubs);
453 
454     /* Compilation complete - store the result in the outgoing regdata
455      * structure.
456      */
457 
458     rdata->pProg = pProg;
459     rdata->pHints = pHints;
460     rdata->pSubs = pSubs;
461     rdata->num_subs = num_subs;
462     rdata->res = 0;
463 
464     return pcre_malloc_size;
465 } /* rx_compile_pcre() */
466 
467 /*--------------------------------------------------------------------*/
468 static regdata_t *
rx_compile_data(string_t * expr,int opt,Bool from_ed)469 rx_compile_data (string_t * expr, int opt, Bool from_ed)
470 
471 /* Compile a regdata structure from the expression <expr>, according
472  * to the options in <opt>. If <from_ed> is TRUE, the RE is used
473  * from the editor, so error messages will be printed directly
474  * to the user.
475  *
476  * If possible, take a ready-compiled structure from the hashtable,
477  * else enter the newly compiled structure into the table.
478  *
479  * The caller gets his own reference to the structure, which he has
480  * to free_regdata() after use.
481  */
482 
483 {
484     size_t    pcre_size;
485     regdata_t rdata; /* Local rdata structure to hold compiled expression */
486 
487 #ifdef RXCACHE_TABLE
488     whash_t hExpr;
489     int h;
490     RxHashEntry *pHash;
491 #endif
492 
493 #ifdef RXCACHE_TABLE
494     iNumXRequests++;
495 
496     hExpr = mstr_get_hash(expr);
497     h = RxStrHash(hExpr);
498     pHash = xtable[h];
499 
500     /* Look for a ready-compiled regexp */
501     if (pHash != NULL
502      && pHash->pString != NULL
503      && pHash->hString == hExpr
504      && pHash->base.opt == (opt & ~(RE_PACKAGE_MASK))
505      && mstreq(pHash->pString, expr)
506        )
507     {
508         iNumXFound++;
509 
510         /* Regexp found, but it may not have been compiled for us yet.
511          */
512         if ((opt & RE_PCRE) && !pHash->base.pProg)
513         {
514             pHash->size = rx_compile_pcre(expr, opt, from_ed, &(pHash->base));
515             if (!pHash->size)
516                 return NULL;
517         }
518         if ((opt & RE_TRADITIONAL) && !pHash->base.rx)
519         {
520             if (!rx_compile_re(expr, opt, from_ed, &(pHash->base)))
521                 return NULL;
522         }
523         return ref_regdata(&(pHash->base));
524     }
525 #endif
526 
527     /* Regexp not found: compile a new one.
528      */
529 
530     pcre_size = 0;
531     memset(&rdata, 0, sizeof(rdata));
532 
533     if (opt & RE_PCRE)
534     {
535         pcre_size = rx_compile_pcre(expr, opt, from_ed, &rdata);
536         if (!pcre_size)
537         {
538             return NULL;
539         }
540     }
541     if (opt & RE_TRADITIONAL)
542     {
543         if (!rx_compile_re(expr, opt, from_ed, &rdata))
544         {
545             return NULL;
546         }
547     }
548 
549 #ifndef RXCACHE_TABLE
550 
551     /* Wrap up the new regular expression and return it */
552     {
553         regdata_t * rc;
554 
555         rc = xalloc(sizeof(*rc));
556         if (!rc)
557         {
558             rx_free_subdata(&rdata);
559             outofmem(sizeof(*rc), "Regexp data structure");
560             return NULL;
561         }
562 
563         memcpy(rc, &rdata, sizeof(*rc));
564 
565         rc->ref = 1;
566         rc->opt = opt & ~(RE_PACKAGE_MASK);
567         return rc;
568     }
569 #else
570 
571     /* Wrap up the new regular expression and enter it into the table */
572     expr = make_tabled_from(expr); /* for faster comparisons */
573 
574     if (NULL != pHash)
575     {
576         iNumXCollisions++;
577         iNumXEntries--;
578         iXSizeAlloc -= sizeof(*pHash) + pHash->size;
579         free_regdata(&(pHash->base));
580     }
581 
582     pHash = xalloc(sizeof(*pHash));
583     if (!pHash)
584     {
585         rx_free_subdata(&rdata);
586         outofmem(sizeof(*pHash), "Regexp cache structure");
587         return NULL;
588     }
589     xtable[h] = pHash;
590 
591     memcpy(&(pHash->base), &rdata, sizeof(pHash->base));
592 
593     pHash->base.ref = 1;
594     pHash->base.opt = opt & ~(RE_PACKAGE_MASK);
595     pHash->pString = expr; /* refs are transferred */
596     pHash->hString = hExpr;
597     pHash->size = pcre_size;
598 
599     iNumXEntries++;
600     iXSizeAlloc += sizeof(*pHash) + pHash->size;
601 
602     return ref_regdata((regdata_t *)pHash);
603 #endif /* RXCACHE_TABLE */
604 } /* rx_compile_data() */
605 
606 /*--------------------------------------------------------------------*/
607 regexp_t *
rx_compile(string_t * expr,int opt,Bool from_ed)608 rx_compile (string_t * expr, int opt, Bool from_ed)
609 
610 /* Compile a regexp structure from the expression <expr>, according
611  * to the options in <opt>. If <from_ed> is TRUE, the RE is used
612  * from the editor, so error messages will be printed directly
613  * to the user.
614  *
615  * If possible, take a ready-compiled structure from the hashtable,
616  * else enter the newly compiled structure into the table.
617  *
618  * The caller gets his own unique regexp structure, which he has
619  * to free_regexp() after use.
620  */
621 
622 {
623     regdata_t * pData;
624     regexp_t * pRegexp = NULL;
625 
626     /* Determine for which package to compile */
627     if (!(opt & RE_PACKAGE_MASK))
628     {
629         if (driver_hook[H_REGEXP_PACKAGE].u.number)
630             opt |= driver_hook[H_REGEXP_PACKAGE].u.number;
631         else
632             opt |= regex_package;
633     }
634 
635     if (!(opt & RE_PACKAGE_MASK))
636     {
637         if (from_ed)
638             add_message("Missing specification of which regexp package to use.\n");
639         else
640             errorf("Missing specification of which regexp package to use.\n");
641         return NULL;
642     }
643 
644     pData = rx_compile_data(expr, opt, from_ed);
645     if (pData)
646     {
647         pRegexp = xalloc(sizeof(*pRegexp));
648         if (pRegexp == NULL)
649         {
650             free_regdata(pData);
651             outofmem(sizeof(*pRegexp), "regexp structure");
652             return NULL;
653         }
654 
655         pRegexp->opt = opt;
656         pRegexp->data = pData;
657     }
658 
659     return pRegexp;
660 } /* rx_compile() */
661 
662 /*-------------------------------------------------------------------------*/
663 int
rx_exec(regexp_t * pRegexp,string_t * string,size_t start)664 rx_exec (regexp_t *pRegexp, string_t * string, size_t start)
665 
666 /* Match the regexp <pRegexp> against the <string>, starting the match
667  * at the position <start>.
668  *
669  * Return a positive number if pattern matched, 0 if it did not match,
670  * or a negative error code (this can be printed with rx_error_message()).
671  */
672 
673 {
674     regdata_t * prog = pRegexp->data;
675 
676     if (pRegexp->opt & RE_PCRE)
677     {
678         int rc;
679         int pcre_opt;
680         pcre_extra * pHints;    /* Study data */
681 
682         /* Determine the RE compilation options */
683 
684         pcre_opt = 0;
685         if (prog->opt & RE_ANCHORED) pcre_opt |= PCRE_ANCHORED;
686         if (prog->opt & RE_NOTBOL)   pcre_opt |= PCRE_NOTBOL;
687         if (prog->opt & RE_NOTEOL)   pcre_opt |= PCRE_NOTEOL;
688         if (prog->opt & RE_NOTEMPTY) pcre_opt |= PCRE_NOTEMPTY;
689 
690         pHints = prog->pHints;
691         /* If LD_PCRE_RECURSION_LIMIT is defined we set a limit for match. If
692          * PCRE_EXTRA_MATCH_LIMIT_RECURSION is defined, we have a new libpcre,
693          * which supports limiting the recursions. Otherwise we have to limit
694          * the no of calls to match().
695          * TODO: Instead of the conditional compilation we should update the
696          * TODO::built-in pcre package.
697          */
698 #ifdef LD_PCRE_RECURSION_LIMIT
699 #ifdef PCRE_EXTRA_MATCH_LIMIT_RECURSION
700         pHints->flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
701         pHints->match_limit_recursion = LD_PCRE_RECURSION_LIMIT;
702 #else
703         pHints->flags |= PCRE_EXTRA_MATCH_LIMIT;
704         pHints->match_limit = LD_PCRE_RECURSION_LIMIT;
705 #endif /* PCRE_EXTRA_MATCH_LIMIT_RECURSION */
706 #else  /* LD_PCRE_RECURSION_LIMIT */
707 #ifdef PCRE_EXTRA_MATCH_LIMIT_RECURSION
708         pHints->flags &= ~PCRE_EXTRA_MATCH_LIMIT_RECURSION
709 #else
710         pHints->flags &= ~PCRE_EXTRA_MATCH_LIMIT;
711 #endif  /* PCRE_EXTRA_MATCH_LIMIT_RECURSION */
712 #endif  /* LD_PCRE_RECURSION_LIMIT */
713 
714         rc = pcre_exec( prog->pProg, pHints
715                       , get_txt(string), mstrsize(string), start, pcre_opt
716                       , prog->pSubs, prog->num_subs
717                       );
718         prog->res = rc;
719 
720         /* Reverse the roles of return codes 0 (not enough entries in subs[])
721          * and PCRE_ERROR_NOMATCH.
722          */
723         if (rc == PCRE_ERROR_NOMATCH) rc = 0;
724         else if (rc == 0) rc = PCRE_ERROR_NOMATCH;
725 
726         return rc;
727     } /* if (use pcre) */
728 
729     /* Fallback: Traditional regexp */
730     return hs_regexec(prog->rx, get_txt(string)+start, get_txt(string));
731 } /* rx_exec() */
732 
733 /*-------------------------------------------------------------------------*/
734 int
rx_exec_str(regexp_t * pRegexp,char * string,char * start)735 rx_exec_str (regexp_t *pRegexp, char * string, char * start)
736 
737 /* Match the regexp <pRegexp> against the <string> whose real begin
738  * is at <start>.
739  *
740  * Return a positive number if pattern matched, 0 if it did not match,
741  * or a negative error code (this can be printed with rx_error_message()).
742  *
743  * This method is used by ed().
744  */
745 
746 {
747     regdata_t * prog = pRegexp->data;
748 
749     if (pRegexp->opt & RE_PCRE)
750     {
751         int rc;
752         int pcre_opt;
753         pcre_extra * pHints;    /* Study data */
754 
755         /* Determine the RE compilation options */
756 
757         pcre_opt = 0;
758         if (prog->opt & RE_ANCHORED) pcre_opt |= PCRE_ANCHORED;
759         if (prog->opt & RE_NOTBOL)   pcre_opt |= PCRE_NOTBOL;
760         if (prog->opt & RE_NOTEOL)   pcre_opt |= PCRE_NOTEOL;
761         if (prog->opt & RE_NOTEMPTY) pcre_opt |= PCRE_NOTEMPTY;
762 
763         pHints = prog->pHints;
764         /* If LD_PCRE_RECURSION_LIMIT is defined we set a limit for match. If
765          * PCRE_EXTRA_MATCH_LIMIT_RECURSION is defined, we have a new libpcre,
766          * which supports limiting the recursions. Otherwise we have to limit
767          * the no of calls to match().
768          * TODO: Instead of the conditional compilation we should update the
769          * TODO::built-in pcre package.
770          */
771 #ifdef LD_PCRE_RECURSION_LIMIT
772 #ifdef PCRE_EXTRA_MATCH_LIMIT_RECURSION
773         pHints->flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
774         pHints->match_limit_recursion = LD_PCRE_RECURSION_LIMIT;
775 #else
776         pHints->flags |= PCRE_EXTRA_MATCH_LIMIT;
777         pHints->match_limit = LD_PCRE_RECURSION_LIMIT;
778 #endif /* PCRE_EXTRA_MATCH_LIMIT_RECURSION */
779 #else  /* LD_PCRE_RECURSION_LIMIT */
780 #ifdef PCRE_EXTRA_MATCH_LIMIT_RECURSION
781         pHints->flags &= ~PCRE_EXTRA_MATCH_LIMIT_RECURSION
782 #else
783         pHints->flags &= ~PCRE_EXTRA_MATCH_LIMIT;
784 #endif  /* PCRE_EXTRA_MATCH_LIMIT_RECURSION */
785 #endif  /* LD_PCRE_RECURSION_LIMIT */
786 
787         rc = pcre_exec( prog->pProg, pHints
788                       , start, strlen(start), string - start, pcre_opt
789                       , prog->pSubs, prog->num_subs
790                       );
791         prog->res = rc;
792 
793         /* Reverse the roles of return codes 0 (not enough entries in subs[])
794          * and PCRE_ERROR_NOMATCH.
795          */
796         if (rc == PCRE_ERROR_NOMATCH) rc = 0;
797         else if (rc == 0) rc = PCRE_ERROR_NOMATCH;
798 
799         return rc;
800     } /* if (use pcre) */
801 
802     /* Fallback: Traditional regexp */
803     return hs_regexec(prog->rx, string, start);
804 } /* rx_exec_str() */
805 
806 /*-------------------------------------------------------------------------*/
807 int
rx_num_matches(regexp_t * pRegexp)808 rx_num_matches (regexp_t *pRegexp)
809 
810 /* After a successful match of <pRegexp>, return the number of matched
811  * expressions and parenthesized expressions.
812  * In other words: the result is at least 1 (for the fully matched
813  * expressions), plus the number of matched '()' sub expressions.
814  *
815  * The function tries to detect when it is called after an unsuccessful
816  * match and return 0 in that case.
817  */
818 
819 {
820     if (pRegexp->opt & RE_PCRE)
821     {
822         return pRegexp->data->res >= 1 ? pRegexp->data->res : 0;
823     } /* if (use pcre) */
824 
825     /* Fallback: Traditional regexp */
826     {
827         regdata_t * prog = pRegexp->data;
828         p_uint num, i;
829 
830         for (num = 0, i = 0
831             ; i < sizeof(prog->rx->startp) / sizeof(prog->rx->startp[0])
832             ; i++)
833         {
834             if (prog->rx->startp[i] != NULL
835              && prog->rx->endp[i] != NULL
836                )
837                 num++;
838         }
839 
840         return num;
841     }
842 } /* rx_num_matches() */
843 
844 /*-------------------------------------------------------------------------*/
845 Bool
rx_get_match_n(regexp_t * pRegexp,string_t * str,int n,size_t * start,size_t * end)846 rx_get_match_n (regexp_t *pRegexp, string_t * str, int n, size_t * start, size_t * end)
847 
848 /* After a successful match of <pRegexp> against <str>, store the start
849  * and end position of the match <n> in *<start> and *<end> and retuern TRUE.
850  * The end position is in fact the position of the first character after
851  * the match.
852  *
853  * <n> is 0 for the whole expression, and any positive number for the
854  * matched subexpressions.
855  *
856  * If the requested match does not exist, *<start> and *<end> are both
857  * set to 0 and the function returns FALSE.
858  */
859 
860 {
861     Bool rc;
862     regdata_t * prog = pRegexp->data;
863 
864     if (pRegexp->opt & RE_PCRE)
865     {
866         if (n < 0
867          || n >= prog->res
868          || prog->pSubs[2*n] < 0
869          || prog->pSubs[2*n+1] < 0
870            )
871         {
872             *start = 0;
873             *end = 0;
874             rc = MY_FALSE;
875         }
876         else
877         {
878             *start = (size_t)prog->pSubs[2*n];
879             *end = (size_t)prog->pSubs[2*n+1];
880             rc = MY_TRUE;
881         }
882         return rc;
883     } /* if (use pcre) */
884 
885     /* Fallback: Traditional regexp */
886     if (n < 0
887      || n >= (int)(sizeof(prog->rx->startp) / sizeof(prog->rx->startp[0]))
888      || prog->rx->startp[n] == NULL
889      || prog->rx->endp[n] == NULL
890        )
891     {
892         *start = 0;
893         *end = 0;
894         rc = MY_FALSE;
895     }
896     else
897     {
898         *start = prog->rx->startp[n] - get_txt(str);
899         *end = prog->rx->endp[n] - get_txt(str);
900         rc = MY_TRUE;
901     }
902 
903     return rc;
904 } /* rx_get_match_n() */
905 
906 /*-------------------------------------------------------------------------*/
907 void
rx_get_match(regexp_t * pRegexp,string_t * str,size_t * start,size_t * end)908 rx_get_match (regexp_t *pRegexp, string_t * str, size_t * start, size_t * end)
909 
910 /* After a successful match of <pRegexp> against <str>, return the start
911  * and end position of the match in *<start> and *<end>. The end
912  * position is in fact the position of the first character after the match.
913  */
914 
915 {
916     regdata_t * prog = pRegexp->data;
917 
918     if (pRegexp->opt & RE_PCRE)
919     {
920         *start = (size_t)prog->pSubs[0];
921         *end = (size_t)prog->pSubs[1];
922         return;
923     } /* if (use pcre) */
924 
925     /* Fallback: Traditional regexp */
926     *start = prog->rx->startp[0] - get_txt(str);
927     *end = prog->rx->endp[0] - get_txt(str);
928 } /* rx_get_match() */
929 
930 /*-------------------------------------------------------------------------*/
931 void
rx_get_match_str(regexp_t * pRegexp,char * str,size_t * start,size_t * end)932 rx_get_match_str (regexp_t *pRegexp, char * str, size_t * start, size_t * end)
933 
934 /* After a successful match of <pRegexp> against <str>, return the start
935  * and end position of the match in *<start> and *<end>. The end
936  * position is in fact the position of the first character after the match.
937  */
938 
939 {
940     regdata_t * prog = pRegexp->data;
941 
942     if (pRegexp->opt & RE_PCRE)
943     {
944         *start = (size_t)prog->pSubs[0];
945         *end = (size_t)prog->pSubs[1];
946         return;
947     } /* if (use pcre) */
948 
949     /* Fallback: Traditional regexp */
950     *start = prog->rx->startp[0] - str;
951     *end = prog->rx->endp[0] - str;
952 } /* rx_get_match_str() */
953 
954 /*-------------------------------------------------------------------------*/
955 string_t *
rx_sub(regexp_t * pRegexp,string_t * source,string_t * subst)956 rx_sub (regexp_t *pRegexp, string_t *source, string_t *subst)
957 
958 /* <pRegexp> describes a regexp match in string <source>. Take the
959  * replacement string <subst> and substitute any matched subparentheses.
960  * The result is a new string with one reference.
961  *
962  * Returns NULL when out of memory.
963  */
964 
965 {
966     Bool   copyPass;   /* Pass indicator */
967     size_t len;        /* Computed length of the result */
968     string_t * result; /* Result string */
969     regdata_t * prog = pRegexp->data;
970 
971     result = NULL;
972 
973     /* Make two passes over the the string: one to compute the size
974      * of the result, the second to create the result.
975      */
976     copyPass = MY_FALSE;
977     len = 0;
978     do
979     {
980         char * src;
981         char * dst = NULL;
982         size_t left;
983 
984         left = mstrsize(subst);
985         src = get_txt(subst);
986         if (copyPass)
987             dst = get_txt(result);
988 
989         while (left-- > 0)
990         {
991             int no;
992             char c;
993 
994             c = *src++;
995             if (c == '&')
996                 no = 0;
997             else if (c == '\\' && '0' <= *src && *src <= '9')
998             {
999                 no = *src++ - '0';
1000                 left--;
1001             }
1002             else
1003                 no = -1;
1004 
1005             if (no < 0) /* Ordinary character. */
1006             {
1007                 if (c == '\\' && (*src == '\\' || *src == '&'))
1008                 {
1009                     c = *src++;
1010                     left--;
1011                 }
1012                 if (copyPass)
1013                     *dst++ = c;
1014                 else
1015                     len++;
1016             }
1017             else
1018             {
1019                 if (pRegexp->opt & RE_PCRE)
1020                 {
1021                     if (no < prog->res
1022                      && prog->pSubs[2*no] != -1
1023                      && prog->pSubs[2*no+1] != -1
1024                        )
1025                     {
1026                         size_t start = (size_t)prog->pSubs[2*no];
1027                         size_t sublen = (size_t)prog->pSubs[2*no+1] - start;
1028 
1029                         if (copyPass)
1030                         {
1031                             memcpy(dst, get_txt(source)+start, sublen);
1032                             dst += sublen;
1033                         }
1034                         else
1035                         {
1036                             len += sublen;
1037                         }
1038                     }
1039                 } /* if (use pcre) */
1040 
1041                 if (pRegexp->opt & RE_TRADITIONAL)
1042                 {
1043                     if (no < (int)(sizeof(prog->rx->startp) / sizeof(prog->rx->startp[0]))
1044                      && prog->rx->startp[no] != NULL
1045                      && prog->rx->endp[no] != NULL
1046                        )
1047                     {
1048                         size_t sublen = prog->rx->endp[no] - prog->rx->startp[no];
1049 
1050                         if (copyPass)
1051                         {
1052                             memcpy(dst, prog->rx->startp[no], sublen);
1053                             dst += sublen;
1054                         }
1055                         else
1056                         {
1057                             len += sublen;
1058                         }
1059                     }
1060                 } /* if (use traditional) */
1061             }
1062         } /* while(left-- > 0) */
1063 
1064         /* End of pass */
1065         if (!copyPass)
1066         {
1067             result = alloc_mstring(len);
1068             if (!result)
1069                 return NULL;
1070         }
1071         copyPass = !copyPass;
1072     } while (copyPass);
1073 
1074     return result;
1075 } /* rx_sub() */
1076 
1077 /*-------------------------------------------------------------------------*/
1078 string_t *
rx_sub_str(regexp_t * pRegexp,char * source,char * subst)1079 rx_sub_str (regexp_t *pRegexp, char *source, char *subst)
1080 
1081 /* <pRegexp> describes a regexp match in string <source>. Take the
1082  * replacement string <subst> and substitute any matched subparentheses.
1083  * The result is a new string with one reference.
1084  *
1085  * Returns NULL when out of memory.
1086  */
1087 
1088 {
1089     string_t *m_source;
1090     string_t *m_subst;
1091     string_t *rc;
1092 
1093     m_source = new_mstring(source);
1094     if (!m_source)
1095     {
1096         return NULL;
1097     }
1098 
1099     m_subst = new_mstring(subst);
1100     if (!m_subst)
1101     {
1102         free_mstring(m_source);
1103         return NULL;
1104     }
1105     rc = rx_sub(pRegexp, m_source, m_subst);
1106     free_mstring(m_source);
1107     free_mstring(m_subst);
1108     return rc;
1109 } /* rx_sub_str() */
1110 
1111 /*-------------------------------------------------------------------------*/
1112 Bool
rx_reganch(regexp_t * pRegexp)1113 rx_reganch (regexp_t *pRegexp)
1114 
1115 /* If <pRegexp> is a traditional regexp, return TRUE if rx->reganch
1116  * is not NULL. If <pRegexp> is a PCRE, always return FALSE.
1117  *
1118  * Used by ed.c
1119  */
1120 
1121 {
1122     if (pRegexp->opt & RE_PCRE)
1123         return MY_FALSE;
1124 
1125     /* Traditional regexp */
1126     return pRegexp->data->rx->reganch != 0;
1127 } /* rx_reganch() */
1128 
1129 /*--------------------------------------------------------------------*/
1130 static void
rx_free_subdata(regdata_t * expr)1131 rx_free_subdata (regdata_t * expr)
1132 
1133 /* Deallocate all associated data in regdata structure <expr>.
1134  */
1135 
1136 {
1137     if (expr->pSubs)  xfree(expr->pSubs);  expr->pSubs = NULL;
1138     if (expr->pHints) xfree(expr->pHints); expr->pHints = NULL;
1139     if (expr->pProg)  xfree(expr->pProg);  expr->pProg = NULL;
1140     if (expr->rx)     xfree(expr->rx);     expr->rx = NULL;
1141 } /* rx_free_subdata() */
1142 
1143 /*--------------------------------------------------------------------*/
1144 static void
rx_free_data(regdata_t * expr)1145 rx_free_data (regdata_t * expr)
1146 
1147 /* Deallocate a regdata structure <expr> and all associated data.
1148 #ifdef RXCACHE_TABLE
1149  * <expr> is in fact a RxHashEntry structure.
1150 #endif
1151  */
1152 
1153 {
1154 #ifdef RXCACHE_TABLE
1155     {
1156         RxHashEntry * pHash = (RxHashEntry *)expr;
1157         free_mstring(pHash->pString);
1158     }
1159 #endif
1160     rx_free_subdata(expr);
1161     xfree(expr);
1162 } /* rx_free_data() */
1163 
1164 /*--------------------------------------------------------------------*/
1165 void
free_regexp(regexp_t * expr)1166 free_regexp (regexp_t * expr)
1167 
1168 /* Deallocate a regexp structure <expr> and all associated data.
1169  */
1170 
1171 {
1172     free_regdata(expr->data);
1173     xfree(expr);
1174 } /* free_regexp() */
1175 
1176 /*--------------------------------------------------------------------*/
1177 size_t
rxcache_status(strbuf_t * sbuf,Bool verbose)1178 rxcache_status (strbuf_t *sbuf, Bool verbose)
1179 
1180 /* Gather (and optionally print) the statistics from the rxcache.
1181  * Return the amount of memory used.
1182  */
1183 
1184 {
1185 #ifdef RXCACHE_TABLE
1186 
1187     uint32 iNumXReq;  /* Number of rx_compile() requests, made non-zero */
1188 
1189 #if defined(__MWERKS__) && !defined(WARN_ALL)
1190 #    pragma warn_largeargs off
1191 #endif
1192 
1193     /* In verbose mode, print the statistics */
1194     if (verbose)
1195     {
1196         strbuf_add(sbuf, "\nRegexp cache status:\n");
1197         strbuf_add(sbuf,   "--------------------\n");
1198         strbuf_addf(sbuf, "Expressions in cache:  %"PRIu32" (%.1f%%)\n"
1199                    , iNumXEntries, 100.0 * (float)iNumXEntries / RXCACHE_TABLE);
1200         strbuf_addf(sbuf, "Memory allocated:      %"PRIu32"\n", iXSizeAlloc);
1201         iNumXReq = iNumXRequests ? iNumXRequests : 1;
1202         strbuf_addf(sbuf
1203                , "Requests: %"PRIu32" - Found: %"PRIu32" (%.1f%%) - "
1204                "Coll: %"PRIu32" (%.1f%% req/%.1f%% entries)\n"
1205                , iNumXRequests, iNumXFound, 100.0 * (float)iNumXFound/(float)iNumXReq
1206                , iNumXCollisions, 100.0 * (float)iNumXCollisions/(float)iNumXReq
1207                , 100.0 * (float)iNumXCollisions/(iNumXEntries ? iNumXEntries : 1)
1208                );
1209     }
1210     else
1211     {
1212         strbuf_addf(sbuf, "Regexp cache:\t\t\t%8"PRId32" %9"PRIu32"\n",
1213             iNumXEntries, iXSizeAlloc);
1214     }
1215 
1216     return iXSizeAlloc;
1217 
1218 #if defined(__MWERKS__)
1219 #    pragma warn_largeargs reset
1220 #endif
1221 
1222 #else
1223 
1224     return 0;
1225 #endif
1226 } /* rxcache_status() */
1227 
1228 /*-------------------------------------------------------------------------*/
1229 void
rxcache_dinfo_status(svalue_t * svp,int value)1230 rxcache_dinfo_status (svalue_t *svp, int value)
1231 
1232 /* Return the rxcache information for debug_info(DINFO_DATA, DID_STATUS).
1233  * <svp> points to the svalue block for the result, this function fills in
1234  * the spots for the object table.
1235  * If <value> is -1, <svp> points indeed to a value block; other it is
1236  * the index of the desired value and <svp> points to a single svalue.
1237  */
1238 
1239 {
1240 #ifdef RXCACHE_TABLE
1241 
1242 #define ST_NUMBER(which,code) \
1243     if (value == -1) svp[which].u.number = code; \
1244     else if (value == which) svp->u.number = code
1245 
1246     ST_NUMBER(DID_ST_RX_CACHED, iNumXEntries);
1247     ST_NUMBER(DID_ST_RX_TABLE, RXCACHE_TABLE);
1248     ST_NUMBER(DID_ST_RX_TABLE_SIZE, iXSizeAlloc);
1249     ST_NUMBER(DID_ST_RX_REQUESTS, iNumXRequests);
1250     ST_NUMBER(DID_ST_RX_REQ_FOUND, iNumXFound);
1251     ST_NUMBER(DID_ST_RX_REQ_COLL, iNumXCollisions);
1252 
1253 #undef ST_NUMBER
1254 #endif
1255 } /* rxcache_dinfo_status() */
1256 
1257 /*--------------------------------------------------------------------*/
1258 #if defined(GC_SUPPORT)
1259 
1260 /*--------------------------------------------------------------------*/
1261 void
clear_rxcache_refs(void)1262 clear_rxcache_refs (void)
1263 
1264 /* Clear all the refcounts in the hashtables.
1265  * The refs of the shared strings and of the memory blocks are
1266  * not of our concern.
1267  */
1268 
1269 {
1270 #ifdef RXCACHE_TABLE
1271     int i;
1272 
1273     for (i = 0; i < RXCACHE_TABLE; i++)
1274         if (NULL != xtable[i])
1275         {
1276             xtable[i]->base.ref = 0;
1277         }
1278 #endif
1279 } /* clear_rxcache_refs() */
1280 
1281 /*--------------------------------------------------------------------*/
1282 void
clear_regexp_ref(regexp_t * pRegexp)1283 clear_regexp_ref (regexp_t * pRegexp)
1284 
1285 /* Clear the refcount for <pRegexp>.
1286  */
1287 
1288 {
1289     if (pRegexp)
1290         pRegexp->data->ref = 0;
1291 } /* clear_regexp_ref() */
1292 
1293 /*--------------------------------------------------------------------*/
1294 void
count_regdata_ref(regdata_t * pRegexp)1295 count_regdata_ref (regdata_t * pRegexp)
1296 
1297 /* Mark all memory associated with one regexp structure and count
1298  * the refs.
1299  * This function is called both from rxcache as well as from ed.
1300  */
1301 
1302 {
1303     note_malloced_block_ref(pRegexp);
1304     if (pRegexp->pProg)
1305     {
1306         note_malloced_block_ref(pRegexp->pProg);
1307         if (pRegexp->pHints)
1308             note_malloced_block_ref(pRegexp->pHints);
1309         if (pRegexp->pSubs)
1310             note_malloced_block_ref(pRegexp->pSubs);
1311     }
1312     if (pRegexp->rx)
1313         note_malloced_block_ref(pRegexp->rx);
1314 #ifdef RXCACHE_TABLE
1315     count_ref_from_string(((RxHashEntry *)pRegexp)->pString);
1316 #endif
1317     pRegexp->ref++;
1318 } /* count_regdata_ref() */
1319 
1320 /*--------------------------------------------------------------------*/
1321 void
count_regexp_ref(regexp_t * pRegexp)1322 count_regexp_ref (regexp_t * pRegexp)
1323 
1324 /* Mark all memory associated with one regexp structure and count
1325  * the refs.
1326  * This function is called both from rxcache as well as from ed.
1327  */
1328 
1329 {
1330     note_malloced_block_ref(pRegexp);
1331     count_regdata_ref(pRegexp->data);
1332 } /* count_regexp_ref() */
1333 
1334 /*--------------------------------------------------------------------*/
1335 void
count_rxcache_refs(void)1336 count_rxcache_refs (void)
1337 
1338 /* Mark all memory referenced from the hashtables. */
1339 
1340 {
1341 #ifdef RXCACHE_TABLE
1342     int i;
1343 
1344     for (i = 0; i < RXCACHE_TABLE; i++)
1345     {
1346         if (NULL != xtable[i])
1347         {
1348             count_regdata_ref((regdata_t *)xtable[i]);
1349         }
1350     } /* for (i) */
1351 #endif
1352 
1353 } /* count_rxcache_refs() */
1354 
1355 #endif /* if GC_SUPPORT */
1356 
1357 /*====================================================================*/
1358 
1359