1 /* -*- mode: c ; c-file-style: "canonware-c-style" -*-
2  ******************************************************************************
3  *
4  * Copyright (C) 1996-2005 Jason Evans <jasone@canonware.com>.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice(s), this list of conditions and the following disclaimer
12  *    unmodified other than the allowable addition of one or more
13  *    copyright notices.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice(s), this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
20  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE
23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
26  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
28  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
29  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  ******************************************************************************
32  *
33  * Version: Onyx 5.1.2
34  *
35  ******************************************************************************/
36 
37 #define CW_NXO_REGSUB_C_
38 
39 #include "../include/libonyx/libonyx.h"
40 #include "../include/libonyx/nxa_l.h"
41 #include "../include/libonyx/nxo_l.h"
42 #include "../include/libonyx/nxo_regex_l.h"
43 #include "../include/libonyx/nxo_regsub_l.h"
44 #include "../include/libonyx/nxo_thread_l.h"
45 
46 /* Do the work of initializing a regsub, but don't do any of the typical
47  * GC-related initialization, so that this function can be used for the case
48  * where a regsub is temporarily constructed for a single subst. */
49 static cw_nxn_t
nxo_p_regsub_init(cw_nxoe_regsub_t * a_regsub,const char * a_pattern,uint32_t a_plen,bool a_global,bool a_insensitive,bool a_multiline,bool a_singleline,const char * a_template,uint32_t a_tlen)50 nxo_p_regsub_init(cw_nxoe_regsub_t *a_regsub, const char *a_pattern,
51 		  uint32_t a_plen, bool a_global,
52 		  bool a_insensitive, bool a_multiline,
53 		  bool a_singleline, const char *a_template,
54 		  uint32_t a_tlen)
55 {
56     cw_nxn_t retval;
57     char *pattern;
58     const char *errptr;
59     int options, erroffset, capturecount;
60     enum
61     {
62 	TSTATE_START,
63 	TSTATE_BS_CONT
64     } tstate;
65     uint32_t i, beg, end, voff;
66 
67     nxoe_l_new(&a_regsub->nxoe, NXOT_REGSUB, false);
68 
69     /* Create a '\0'-terminated copy of a_pattern. */
70     pattern = (char *) nxa_malloc(a_plen + 1);
71     memcpy(pattern, a_pattern, a_plen);
72     pattern[a_plen] = '\0';
73 
74     /* Translate options to a format usable by pcre_compile(). */
75     options = 0;
76     /* $i. */
77     if (a_insensitive)
78     {
79 	options |= PCRE_CASELESS;
80     }
81     /* $m. */
82     if (a_multiline)
83     {
84 	options |= PCRE_MULTILINE;
85     }
86     /* $s. */
87     if (a_singleline)
88     {
89 	options |= PCRE_DOTALL;
90     }
91 
92     /* Store the global flag.  This information is not needed in this function,
93      * but gets used when actually doing substitutions. */
94     a_regsub->global = a_global;
95 
96     /* Compile the regex. */
97     a_regsub->pcre = pcre_compile(pattern, options, &errptr, &erroffset, NULL);
98     nxa_free(pattern, a_plen + 1);
99     if (a_regsub->pcre == NULL)
100     {
101 	retval = NXN_regexerror;
102 	goto RETURN;
103     }
104 
105     /* Call pcre_study(), which may improve matching performance. */
106     a_regsub->extra = pcre_study(a_regsub->pcre, 0, &errptr);
107     if (errptr != NULL)
108     {
109 	free(a_regsub->pcre);
110 	retval = NXN_regexerror;
111 	goto RETURN;
112     }
113 
114     /* Get capturecount and the amount of space that was allocated for
115      * a_regsub->pcre and a_regsub->extra. */
116     if ((pcre_fullinfo(a_regsub->pcre, a_regsub->extra, PCRE_INFO_CAPTURECOUNT,
117 		       &capturecount) != 0)
118 	|| (pcre_fullinfo(a_regsub->pcre, a_regsub->extra, PCRE_INFO_SIZE,
119 			  &a_regsub->size) != 0)
120 	|| (pcre_fullinfo(a_regsub->pcre, a_regsub->extra, PCRE_INFO_STUDYSIZE,
121 			  &a_regsub->studysize) != 0)
122 	)
123     {
124 	free(a_regsub->pcre);
125 	if (a_regsub->extra != NULL)
126 	{
127 	    free(a_regsub->extra);
128 	}
129 	retval = NXN_regexerror;
130 	goto RETURN;
131     }
132 
133     /* Use capturecount to calculate the size of vector needed for pcre_exec()
134      * calls. */
135     a_regsub->ovcnt = (capturecount + 1) * 3;
136 
137     /* Make a copy of a_template. */
138     if (a_tlen > 0)
139     {
140 	a_regsub->template = (char *) nxa_malloc(a_tlen);
141 	memcpy(a_regsub->template, a_template, a_tlen);
142     }
143     else
144     {
145 	a_regsub->template = NULL;
146     }
147     a_regsub->tlen = a_tlen;
148 
149     /* Parse a_template and construct a vector from it.  Do this in two passes,
150      * since having to reallocate is likely to be more expensive than parsing
151      * twice. */
152     for (i = beg = end = a_regsub->vlen = 0, tstate = TSTATE_START;
153 	 i < a_tlen;
154 	 i++)
155     {
156 	switch (tstate)
157 	{
158 	    case TSTATE_START:
159 	    {
160 		switch (a_regsub->template[i])
161 		{
162 		    case '\\':
163 		    {
164 			end = i;
165 			tstate = TSTATE_BS_CONT;
166 			break;
167 		    }
168 		    default:
169 		    {
170 			break;
171 		    }
172 		}
173 		break;
174 	    }
175 	    case TSTATE_BS_CONT:
176 	    {
177 		switch (a_regsub->template[i])
178 		{
179 		    case '1': case '2': case '3': case '4': case '5': case '6':
180 		    case '7': case '8': case '9':
181 		    {
182 			/* Preceding plain text, if any. */
183 			if (end > beg)
184 			{
185 			    a_regsub->vlen++;
186 			}
187 
188 			/* Subpattern substitution. */
189 			a_regsub->vlen++;
190 			beg = end = i + 1;
191 			tstate = TSTATE_START;
192 			break;
193 		    }
194 		    case '\\':
195 		    {
196 			/* Stay in this state (ignore extra leading '\'
197 			 * characters. */
198 			end = i;
199 			break;
200 		    }
201 		    default:
202 		    {
203 			/* Ignore. */
204 			tstate = TSTATE_START;
205 			break;
206 		    }
207 		}
208 		break;
209 	    }
210 	    default:
211 	    {
212 		cw_not_reached();
213 	    }
214 	}
215     }
216     if (beg < i)
217     {
218 	/* Normal characters after last subpattern substitution. */
219 	a_regsub->vlen++;
220     }
221 
222     /* Initialize the vector, now that we know how big to make it. */
223     if (a_regsub->vlen > 0)
224     {
225 	a_regsub->vec
226 	    = (cw_nxoe_regsub_telm_t *) nxa_malloc(sizeof(cw_nxoe_regsub_telm_t)
227 						   * a_regsub->vlen);
228     }
229     else
230     {
231 	a_regsub->vec = NULL;
232     }
233 
234     for (i = beg = end = voff = 0, tstate = TSTATE_START;
235 	 i < a_tlen;
236 	 i++)
237     {
238 	switch (tstate)
239 	{
240 	    case TSTATE_START:
241 	    {
242 		switch (a_regsub->template[i])
243 		{
244 		    case '\\':
245 		    {
246 			end = i;
247 			tstate = TSTATE_BS_CONT;
248 			break;
249 		    }
250 		    default:
251 		    {
252 			break;
253 		    }
254 		}
255 		break;
256 	    }
257 	    case TSTATE_BS_CONT:
258 	    {
259 		switch (a_regsub->template[i])
260 		{
261 		    case '1': case '2': case '3': case '4': case '5': case '6':
262 		    case '7': case '8': case '9':
263 		    {
264 			/* Preceding plain text, if any. */
265 			if (end > beg)
266 			{
267 			    a_regsub->vec[voff].str = &a_regsub->template[beg];
268 			    a_regsub->vec[voff].len = end - beg;
269 			    voff++;
270 			}
271 
272 			/* Subpattern substitution. */
273 			a_regsub->vec[voff].str = NULL;
274 			a_regsub->vec[voff].len
275 			    = (uint32_t) (a_regsub->template[i] - '0');
276 			voff++;
277 			beg = end = i + 1;
278 			tstate = TSTATE_START;
279 			break;
280 		    }
281 		    case '\\':
282 		    {
283 			/* Stay in this state (ignore extra leading '\'
284 			 * characters. */
285 			end = i;
286 			break;
287 		    }
288 		    default:
289 		    {
290 			/* Ignore. */
291 			tstate = TSTATE_START;
292 			break;
293 		    }
294 		}
295 		break;
296 	    }
297 	    default:
298 	    {
299 		cw_not_reached();
300 	    }
301 	}
302     }
303     if (beg < i)
304     {
305 	/* Normal characters after last subpattern substitution. */
306 	a_regsub->vec[voff].str = &a_regsub->template[beg];
307 	a_regsub->vec[voff].len = i - beg;
308 #ifdef CW_DBG
309 	/* Make following assertion possible. */
310 	voff++;
311 #endif
312     }
313     cw_assert(voff == a_regsub->vlen);
314 
315     retval = NXN_ZERO;
316     RETURN:
317     return retval;
318 }
319 
320 CW_P_INLINE void
nxo_p_regsub_append(char ** r_ostr,uint32_t * r_omax,uint32_t * r_olen,const char * a_istr,uint32_t a_ilen)321 nxo_p_regsub_append(char **r_ostr, uint32_t *r_omax,
322 		    uint32_t *r_olen, const char *a_istr,
323 		    uint32_t a_ilen)
324 {
325     uint32_t omax;
326 
327     /* Expand *r_ostr, if necessary. */
328     for (omax = *r_omax; *r_olen + a_ilen > omax; omax *= 2)
329     {
330 	/* Do nothing. */
331     }
332     if (omax != *r_omax)
333     {
334 	*r_ostr = nxa_realloc(*r_ostr, omax, *r_omax);
335 	*r_omax = omax;
336     }
337 
338     /* Copy and adjust *r_olen. */
339     memcpy(&(*r_ostr)[*r_olen], a_istr, a_ilen);
340     *r_olen += a_ilen;
341 }
342 
343 static uint32_t
nxo_p_regsub_subst(cw_nxoe_regsub_t * a_regsub,cw_nxo_t * a_thread,cw_nxo_t * a_input,cw_nxo_t * r_output)344 nxo_p_regsub_subst(cw_nxoe_regsub_t *a_regsub, cw_nxo_t *a_thread,
345 		   cw_nxo_t *a_input, cw_nxo_t *r_output)
346 {
347     uint32_t retval = 0;
348     cw_nxo_regex_cache_t *cache;
349     uint32_t scnt, ilen, ioff, olen, omax, v;
350     char *istr, *ostr;
351 
352     cache = nxo_l_thread_regex_cache_get(a_thread);
353 
354     /* Allocate or extend the vector for passing to pcre_exec(), if
355      * necessary. */
356     if (cache->ovp == NULL)
357     {
358 	cache->ovp = nxa_malloc(sizeof(int) * a_regsub->ovcnt);
359 	cache->ovcnt = a_regsub->ovcnt;
360     }
361     else if (cache->ovcnt < a_regsub->ovcnt)
362     {
363 	cache->ovp = nxa_realloc(cache->ovp, sizeof(int) * a_regsub->ovcnt,
364 				 sizeof(int) * cache->ovcnt);
365 	cache->ovcnt = a_regsub->ovcnt;
366     }
367 
368     /* Allocate a temporary output string that is as large as the input string.
369      * If ostr overflows, iteratively double its size.  omax tracks the current
370      * allocation size of ostr, and olen tracks how full ostr is. */
371     ilen = omax = nxo_string_len_get(a_input);
372     olen = 0;
373     if (omax == 0)
374     {
375 	/* It is possible for a pattern to match the empty string, then
376 	 * substitute a non-empty string.  Therefore, handle the empty input
377 	 * string case. */
378 	omax = 8;
379     }
380     istr = nxo_string_get(a_input);
381     ostr = nxa_malloc(omax);
382 
383     /* Iteratively look for matches. */
384     for (scnt = ioff = 0;
385 	 ioff < ilen && (a_regsub->global || scnt < 1);
386 	 scnt++, ioff = (uint32_t) cache->ovp[1])
387     {
388 	/* Look for a match. */
389 	nxo_string_lock(a_input);
390 	cache->mcnt = pcre_exec(a_regsub->pcre, a_regsub->extra, (char *) istr,
391 				ilen, ioff, 0, cache->ovp, cache->ovcnt);
392 	nxo_string_unlock(a_input);
393 	if (cache->mcnt <= 0)
394 	{
395 	    switch (cache->mcnt)
396 	    {
397 		case 0:
398 		case PCRE_ERROR_NOMATCH:
399 		{
400 		    /* No match found.  Not an error. */
401 		    goto DONE;
402 		}
403 		case PCRE_ERROR_NOMEMORY:
404 		{
405 		    xep_throw(CW_ONYXX_OOM);
406 		}
407 		case PCRE_ERROR_NULL:
408 		case PCRE_ERROR_BADOPTION:
409 		case PCRE_ERROR_BADMAGIC:
410 		case PCRE_ERROR_UNKNOWN_NODE:
411 		default:
412 		{
413 		    cw_not_reached();
414 		}
415 	    }
416 	}
417 
418 	/* Copy any data between the end of the previous substitution and the
419 	 * beginning of the current substitution. */
420 	if (ioff < (uint32_t) cache->ovp[0])
421 	{
422 	    nxo_p_regsub_append(&ostr, &omax, &olen,
423 				&istr[ioff],
424 				(uint32_t) cache->ovp[0] - ioff);
425 	}
426 
427 	/* Substitute. */
428 	for (v = 0; v < a_regsub->vlen; v++)
429 	{
430 	    if (a_regsub->vec[v].str != NULL)
431 	    {
432 		/* Copy from template string. */
433 		nxo_p_regsub_append(&ostr, &omax, &olen, a_regsub->vec[v].str,
434 				    a_regsub->vec[v].len);
435 	    }
436 	    else
437 	    {
438 		/* Substitute subpattern match, if the subpattern was
439 		 * matched. */
440 		if (a_regsub->vec[v].len < cache->mcnt
441 		    && cache->ovp[a_regsub->vec[v].len * 2] != -1)
442 		{
443 
444 		    nxo_p_regsub_append(&ostr, &omax, &olen,
445 					&istr
446 					[cache->ovp[a_regsub->vec[v].len * 2]],
447 					cache->ovp[a_regsub->vec[v].len * 2 + 1]
448 					- cache->ovp[a_regsub->vec[v].len * 2]);
449 		}
450 	    }
451 	}
452 
453 	/* Increment substitution count. */
454 	retval++;
455     }
456     DONE:
457     /* If there are trailing bytes after the last match, copy them. */
458     if (ioff < ilen)
459     {
460 	nxo_p_regsub_append(&ostr, &omax, &olen, &istr[ioff], ilen - ioff);
461     }
462 
463     /* Create an Onyx string and copy ostr to it. */
464     if (retval > 0)
465     {
466 	nxo_string_new(r_output, nxo_thread_currentlocking(a_thread), olen);
467 	if (olen > 0)
468 	{
469 	    nxo_string_set(r_output, 0, ostr, olen);
470 	}
471     }
472     else
473     {
474 	/* No substitution done.  Dup the input string. */
475 	nxo_dup(r_output, a_input);
476     }
477 
478     /* Clean up. */
479     nxa_free(ostr, omax);
480 
481     return retval;
482 }
483 
484 cw_nxn_t
nxo_regsub_new(cw_nxo_t * a_nxo,const char * a_pattern,uint32_t a_plen,bool a_global,bool a_insensitive,bool a_multiline,bool a_singleline,const char * a_template,uint32_t a_tlen)485 nxo_regsub_new(cw_nxo_t *a_nxo, const char *a_pattern, uint32_t a_plen,
486 	       bool a_global, bool a_insensitive,
487 	       bool a_multiline, bool a_singleline,
488 	       const char *a_template, uint32_t a_tlen)
489 {
490     cw_nxn_t retval;
491     cw_nxoe_regsub_t *regsub;
492 
493     regsub = (cw_nxoe_regsub_t *) nxa_malloc(sizeof(cw_nxoe_regsub_t));
494 
495     retval = nxo_p_regsub_init(regsub, a_pattern, a_plen, a_global,
496 			       a_insensitive, a_multiline, a_singleline,
497 			       a_template, a_tlen);
498     if (retval)
499     {
500 	nxa_free(regsub, sizeof(cw_nxoe_regsub_t));
501 	goto RETURN;
502     }
503 
504     /* Tell the GC about the space being taken up by regsub->pcre and
505      * regsub->extra. */
506     nxa_l_count_adjust((cw_nxoi_t) regsub->size + regsub->studysize);
507 
508     /* Create a reference to the regsub object. */
509     nxo_no_new(a_nxo);
510     a_nxo->o.nxoe = (cw_nxoe_t *) regsub;
511     nxo_p_type_set(a_nxo, NXOT_REGSUB);
512 
513     /* Register the regsub object with the GC. */
514     nxa_l_gc_register((cw_nxoe_t *) regsub);
515 
516     retval = NXN_ZERO;
517     RETURN:
518     return retval;
519 }
520 
521 void
nxo_regsub_subst(cw_nxo_t * a_nxo,cw_nxo_t * a_thread,cw_nxo_t * a_input,cw_nxo_t * r_output,uint32_t * r_count)522 nxo_regsub_subst(cw_nxo_t *a_nxo, cw_nxo_t *a_thread, cw_nxo_t *a_input,
523 		 cw_nxo_t *r_output, uint32_t *r_count)
524 {
525     cw_nxoe_regsub_t *regsub;
526 
527     cw_check_ptr(a_nxo);
528     cw_dassert(a_nxo->magic == CW_NXO_MAGIC);
529     cw_assert(nxo_type_get(a_nxo) == NXOT_REGSUB);
530 
531     regsub = (cw_nxoe_regsub_t *) a_nxo->o.nxoe;
532 
533     cw_check_ptr(regsub);
534     cw_dassert(regsub->nxoe.magic == CW_NXOE_MAGIC);
535     cw_assert(regsub->nxoe.type == NXOT_REGSUB);
536 
537     *r_count = nxo_p_regsub_subst(regsub, a_thread, a_input, r_output);
538 }
539 
540 /* Do a subst without creating a regsub object, in order to avoid putting
541  * pressure on the GC. */
542 cw_nxn_t
nxo_regsub_nonew_subst(cw_nxo_t * a_thread,const char * a_pattern,uint32_t a_plen,bool a_global,bool a_insensitive,bool a_multiline,bool a_singleline,const char * a_template,uint32_t a_tlen,cw_nxo_t * a_input,cw_nxo_t * r_output,uint32_t * r_count)543 nxo_regsub_nonew_subst(cw_nxo_t *a_thread, const char *a_pattern,
544 		       uint32_t a_plen, bool a_global,
545 		       bool a_insensitive, bool a_multiline,
546 		       bool a_singleline, const char *a_template,
547 		       uint32_t a_tlen, cw_nxo_t *a_input,
548 		       cw_nxo_t *r_output, uint32_t *r_count)
549 {
550     cw_nxn_t retval;
551     cw_nxoe_regsub_t regsub;
552 
553     retval = nxo_p_regsub_init(&regsub, a_pattern, a_plen, a_global,
554 			       a_insensitive, a_multiline, a_singleline,
555 			       a_template, a_tlen);
556     if (retval)
557     {
558 	goto RETURN;
559     }
560 
561     *r_count = nxo_p_regsub_subst(&regsub, a_thread, a_input, r_output);
562 
563     /* Clean up memory. */
564 
565     if (regsub.vec != NULL)
566     {
567 	nxa_free(regsub.vec, sizeof(cw_nxoe_regsub_telm_t) * regsub.vlen);
568     }
569 
570     if (regsub.template != NULL)
571     {
572 	nxa_free(regsub.template, regsub.tlen);
573     }
574 
575     free(regsub.pcre);
576     if (regsub.extra != NULL)
577     {
578 	free(regsub.extra);
579     }
580 
581     retval = NXN_ZERO;
582     RETURN:
583     return retval;
584 }
585