1 /* -*- mode: c ; c-file-style: "canonware-c-style" -*-
2 ******************************************************************************
3 *
4 * Copyright (C) 1996-2005 Jason Evans <jasone@canonware.com>.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice(s), this list of conditions and the following disclaimer
12 * unmodified other than the allowable addition of one or more
13 * copyright notices.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice(s), this list of conditions and the following disclaimer in
16 * the documentation and/or other materials provided with the
17 * distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
20 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
26 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
28 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
29 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 *
31 ******************************************************************************
32 *
33 * Version: Onyx 5.1.2
34 *
35 ******************************************************************************/
36
37 #define CW_NXO_REGSUB_C_
38
39 #include "../include/libonyx/libonyx.h"
40 #include "../include/libonyx/nxa_l.h"
41 #include "../include/libonyx/nxo_l.h"
42 #include "../include/libonyx/nxo_regex_l.h"
43 #include "../include/libonyx/nxo_regsub_l.h"
44 #include "../include/libonyx/nxo_thread_l.h"
45
46 /* Do the work of initializing a regsub, but don't do any of the typical
47 * GC-related initialization, so that this function can be used for the case
48 * where a regsub is temporarily constructed for a single subst. */
49 static cw_nxn_t
nxo_p_regsub_init(cw_nxoe_regsub_t * a_regsub,const char * a_pattern,uint32_t a_plen,bool a_global,bool a_insensitive,bool a_multiline,bool a_singleline,const char * a_template,uint32_t a_tlen)50 nxo_p_regsub_init(cw_nxoe_regsub_t *a_regsub, const char *a_pattern,
51 uint32_t a_plen, bool a_global,
52 bool a_insensitive, bool a_multiline,
53 bool a_singleline, const char *a_template,
54 uint32_t a_tlen)
55 {
56 cw_nxn_t retval;
57 char *pattern;
58 const char *errptr;
59 int options, erroffset, capturecount;
60 enum
61 {
62 TSTATE_START,
63 TSTATE_BS_CONT
64 } tstate;
65 uint32_t i, beg, end, voff;
66
67 nxoe_l_new(&a_regsub->nxoe, NXOT_REGSUB, false);
68
69 /* Create a '\0'-terminated copy of a_pattern. */
70 pattern = (char *) nxa_malloc(a_plen + 1);
71 memcpy(pattern, a_pattern, a_plen);
72 pattern[a_plen] = '\0';
73
74 /* Translate options to a format usable by pcre_compile(). */
75 options = 0;
76 /* $i. */
77 if (a_insensitive)
78 {
79 options |= PCRE_CASELESS;
80 }
81 /* $m. */
82 if (a_multiline)
83 {
84 options |= PCRE_MULTILINE;
85 }
86 /* $s. */
87 if (a_singleline)
88 {
89 options |= PCRE_DOTALL;
90 }
91
92 /* Store the global flag. This information is not needed in this function,
93 * but gets used when actually doing substitutions. */
94 a_regsub->global = a_global;
95
96 /* Compile the regex. */
97 a_regsub->pcre = pcre_compile(pattern, options, &errptr, &erroffset, NULL);
98 nxa_free(pattern, a_plen + 1);
99 if (a_regsub->pcre == NULL)
100 {
101 retval = NXN_regexerror;
102 goto RETURN;
103 }
104
105 /* Call pcre_study(), which may improve matching performance. */
106 a_regsub->extra = pcre_study(a_regsub->pcre, 0, &errptr);
107 if (errptr != NULL)
108 {
109 free(a_regsub->pcre);
110 retval = NXN_regexerror;
111 goto RETURN;
112 }
113
114 /* Get capturecount and the amount of space that was allocated for
115 * a_regsub->pcre and a_regsub->extra. */
116 if ((pcre_fullinfo(a_regsub->pcre, a_regsub->extra, PCRE_INFO_CAPTURECOUNT,
117 &capturecount) != 0)
118 || (pcre_fullinfo(a_regsub->pcre, a_regsub->extra, PCRE_INFO_SIZE,
119 &a_regsub->size) != 0)
120 || (pcre_fullinfo(a_regsub->pcre, a_regsub->extra, PCRE_INFO_STUDYSIZE,
121 &a_regsub->studysize) != 0)
122 )
123 {
124 free(a_regsub->pcre);
125 if (a_regsub->extra != NULL)
126 {
127 free(a_regsub->extra);
128 }
129 retval = NXN_regexerror;
130 goto RETURN;
131 }
132
133 /* Use capturecount to calculate the size of vector needed for pcre_exec()
134 * calls. */
135 a_regsub->ovcnt = (capturecount + 1) * 3;
136
137 /* Make a copy of a_template. */
138 if (a_tlen > 0)
139 {
140 a_regsub->template = (char *) nxa_malloc(a_tlen);
141 memcpy(a_regsub->template, a_template, a_tlen);
142 }
143 else
144 {
145 a_regsub->template = NULL;
146 }
147 a_regsub->tlen = a_tlen;
148
149 /* Parse a_template and construct a vector from it. Do this in two passes,
150 * since having to reallocate is likely to be more expensive than parsing
151 * twice. */
152 for (i = beg = end = a_regsub->vlen = 0, tstate = TSTATE_START;
153 i < a_tlen;
154 i++)
155 {
156 switch (tstate)
157 {
158 case TSTATE_START:
159 {
160 switch (a_regsub->template[i])
161 {
162 case '\\':
163 {
164 end = i;
165 tstate = TSTATE_BS_CONT;
166 break;
167 }
168 default:
169 {
170 break;
171 }
172 }
173 break;
174 }
175 case TSTATE_BS_CONT:
176 {
177 switch (a_regsub->template[i])
178 {
179 case '1': case '2': case '3': case '4': case '5': case '6':
180 case '7': case '8': case '9':
181 {
182 /* Preceding plain text, if any. */
183 if (end > beg)
184 {
185 a_regsub->vlen++;
186 }
187
188 /* Subpattern substitution. */
189 a_regsub->vlen++;
190 beg = end = i + 1;
191 tstate = TSTATE_START;
192 break;
193 }
194 case '\\':
195 {
196 /* Stay in this state (ignore extra leading '\'
197 * characters. */
198 end = i;
199 break;
200 }
201 default:
202 {
203 /* Ignore. */
204 tstate = TSTATE_START;
205 break;
206 }
207 }
208 break;
209 }
210 default:
211 {
212 cw_not_reached();
213 }
214 }
215 }
216 if (beg < i)
217 {
218 /* Normal characters after last subpattern substitution. */
219 a_regsub->vlen++;
220 }
221
222 /* Initialize the vector, now that we know how big to make it. */
223 if (a_regsub->vlen > 0)
224 {
225 a_regsub->vec
226 = (cw_nxoe_regsub_telm_t *) nxa_malloc(sizeof(cw_nxoe_regsub_telm_t)
227 * a_regsub->vlen);
228 }
229 else
230 {
231 a_regsub->vec = NULL;
232 }
233
234 for (i = beg = end = voff = 0, tstate = TSTATE_START;
235 i < a_tlen;
236 i++)
237 {
238 switch (tstate)
239 {
240 case TSTATE_START:
241 {
242 switch (a_regsub->template[i])
243 {
244 case '\\':
245 {
246 end = i;
247 tstate = TSTATE_BS_CONT;
248 break;
249 }
250 default:
251 {
252 break;
253 }
254 }
255 break;
256 }
257 case TSTATE_BS_CONT:
258 {
259 switch (a_regsub->template[i])
260 {
261 case '1': case '2': case '3': case '4': case '5': case '6':
262 case '7': case '8': case '9':
263 {
264 /* Preceding plain text, if any. */
265 if (end > beg)
266 {
267 a_regsub->vec[voff].str = &a_regsub->template[beg];
268 a_regsub->vec[voff].len = end - beg;
269 voff++;
270 }
271
272 /* Subpattern substitution. */
273 a_regsub->vec[voff].str = NULL;
274 a_regsub->vec[voff].len
275 = (uint32_t) (a_regsub->template[i] - '0');
276 voff++;
277 beg = end = i + 1;
278 tstate = TSTATE_START;
279 break;
280 }
281 case '\\':
282 {
283 /* Stay in this state (ignore extra leading '\'
284 * characters. */
285 end = i;
286 break;
287 }
288 default:
289 {
290 /* Ignore. */
291 tstate = TSTATE_START;
292 break;
293 }
294 }
295 break;
296 }
297 default:
298 {
299 cw_not_reached();
300 }
301 }
302 }
303 if (beg < i)
304 {
305 /* Normal characters after last subpattern substitution. */
306 a_regsub->vec[voff].str = &a_regsub->template[beg];
307 a_regsub->vec[voff].len = i - beg;
308 #ifdef CW_DBG
309 /* Make following assertion possible. */
310 voff++;
311 #endif
312 }
313 cw_assert(voff == a_regsub->vlen);
314
315 retval = NXN_ZERO;
316 RETURN:
317 return retval;
318 }
319
320 CW_P_INLINE void
nxo_p_regsub_append(char ** r_ostr,uint32_t * r_omax,uint32_t * r_olen,const char * a_istr,uint32_t a_ilen)321 nxo_p_regsub_append(char **r_ostr, uint32_t *r_omax,
322 uint32_t *r_olen, const char *a_istr,
323 uint32_t a_ilen)
324 {
325 uint32_t omax;
326
327 /* Expand *r_ostr, if necessary. */
328 for (omax = *r_omax; *r_olen + a_ilen > omax; omax *= 2)
329 {
330 /* Do nothing. */
331 }
332 if (omax != *r_omax)
333 {
334 *r_ostr = nxa_realloc(*r_ostr, omax, *r_omax);
335 *r_omax = omax;
336 }
337
338 /* Copy and adjust *r_olen. */
339 memcpy(&(*r_ostr)[*r_olen], a_istr, a_ilen);
340 *r_olen += a_ilen;
341 }
342
343 static uint32_t
nxo_p_regsub_subst(cw_nxoe_regsub_t * a_regsub,cw_nxo_t * a_thread,cw_nxo_t * a_input,cw_nxo_t * r_output)344 nxo_p_regsub_subst(cw_nxoe_regsub_t *a_regsub, cw_nxo_t *a_thread,
345 cw_nxo_t *a_input, cw_nxo_t *r_output)
346 {
347 uint32_t retval = 0;
348 cw_nxo_regex_cache_t *cache;
349 uint32_t scnt, ilen, ioff, olen, omax, v;
350 char *istr, *ostr;
351
352 cache = nxo_l_thread_regex_cache_get(a_thread);
353
354 /* Allocate or extend the vector for passing to pcre_exec(), if
355 * necessary. */
356 if (cache->ovp == NULL)
357 {
358 cache->ovp = nxa_malloc(sizeof(int) * a_regsub->ovcnt);
359 cache->ovcnt = a_regsub->ovcnt;
360 }
361 else if (cache->ovcnt < a_regsub->ovcnt)
362 {
363 cache->ovp = nxa_realloc(cache->ovp, sizeof(int) * a_regsub->ovcnt,
364 sizeof(int) * cache->ovcnt);
365 cache->ovcnt = a_regsub->ovcnt;
366 }
367
368 /* Allocate a temporary output string that is as large as the input string.
369 * If ostr overflows, iteratively double its size. omax tracks the current
370 * allocation size of ostr, and olen tracks how full ostr is. */
371 ilen = omax = nxo_string_len_get(a_input);
372 olen = 0;
373 if (omax == 0)
374 {
375 /* It is possible for a pattern to match the empty string, then
376 * substitute a non-empty string. Therefore, handle the empty input
377 * string case. */
378 omax = 8;
379 }
380 istr = nxo_string_get(a_input);
381 ostr = nxa_malloc(omax);
382
383 /* Iteratively look for matches. */
384 for (scnt = ioff = 0;
385 ioff < ilen && (a_regsub->global || scnt < 1);
386 scnt++, ioff = (uint32_t) cache->ovp[1])
387 {
388 /* Look for a match. */
389 nxo_string_lock(a_input);
390 cache->mcnt = pcre_exec(a_regsub->pcre, a_regsub->extra, (char *) istr,
391 ilen, ioff, 0, cache->ovp, cache->ovcnt);
392 nxo_string_unlock(a_input);
393 if (cache->mcnt <= 0)
394 {
395 switch (cache->mcnt)
396 {
397 case 0:
398 case PCRE_ERROR_NOMATCH:
399 {
400 /* No match found. Not an error. */
401 goto DONE;
402 }
403 case PCRE_ERROR_NOMEMORY:
404 {
405 xep_throw(CW_ONYXX_OOM);
406 }
407 case PCRE_ERROR_NULL:
408 case PCRE_ERROR_BADOPTION:
409 case PCRE_ERROR_BADMAGIC:
410 case PCRE_ERROR_UNKNOWN_NODE:
411 default:
412 {
413 cw_not_reached();
414 }
415 }
416 }
417
418 /* Copy any data between the end of the previous substitution and the
419 * beginning of the current substitution. */
420 if (ioff < (uint32_t) cache->ovp[0])
421 {
422 nxo_p_regsub_append(&ostr, &omax, &olen,
423 &istr[ioff],
424 (uint32_t) cache->ovp[0] - ioff);
425 }
426
427 /* Substitute. */
428 for (v = 0; v < a_regsub->vlen; v++)
429 {
430 if (a_regsub->vec[v].str != NULL)
431 {
432 /* Copy from template string. */
433 nxo_p_regsub_append(&ostr, &omax, &olen, a_regsub->vec[v].str,
434 a_regsub->vec[v].len);
435 }
436 else
437 {
438 /* Substitute subpattern match, if the subpattern was
439 * matched. */
440 if (a_regsub->vec[v].len < cache->mcnt
441 && cache->ovp[a_regsub->vec[v].len * 2] != -1)
442 {
443
444 nxo_p_regsub_append(&ostr, &omax, &olen,
445 &istr
446 [cache->ovp[a_regsub->vec[v].len * 2]],
447 cache->ovp[a_regsub->vec[v].len * 2 + 1]
448 - cache->ovp[a_regsub->vec[v].len * 2]);
449 }
450 }
451 }
452
453 /* Increment substitution count. */
454 retval++;
455 }
456 DONE:
457 /* If there are trailing bytes after the last match, copy them. */
458 if (ioff < ilen)
459 {
460 nxo_p_regsub_append(&ostr, &omax, &olen, &istr[ioff], ilen - ioff);
461 }
462
463 /* Create an Onyx string and copy ostr to it. */
464 if (retval > 0)
465 {
466 nxo_string_new(r_output, nxo_thread_currentlocking(a_thread), olen);
467 if (olen > 0)
468 {
469 nxo_string_set(r_output, 0, ostr, olen);
470 }
471 }
472 else
473 {
474 /* No substitution done. Dup the input string. */
475 nxo_dup(r_output, a_input);
476 }
477
478 /* Clean up. */
479 nxa_free(ostr, omax);
480
481 return retval;
482 }
483
484 cw_nxn_t
nxo_regsub_new(cw_nxo_t * a_nxo,const char * a_pattern,uint32_t a_plen,bool a_global,bool a_insensitive,bool a_multiline,bool a_singleline,const char * a_template,uint32_t a_tlen)485 nxo_regsub_new(cw_nxo_t *a_nxo, const char *a_pattern, uint32_t a_plen,
486 bool a_global, bool a_insensitive,
487 bool a_multiline, bool a_singleline,
488 const char *a_template, uint32_t a_tlen)
489 {
490 cw_nxn_t retval;
491 cw_nxoe_regsub_t *regsub;
492
493 regsub = (cw_nxoe_regsub_t *) nxa_malloc(sizeof(cw_nxoe_regsub_t));
494
495 retval = nxo_p_regsub_init(regsub, a_pattern, a_plen, a_global,
496 a_insensitive, a_multiline, a_singleline,
497 a_template, a_tlen);
498 if (retval)
499 {
500 nxa_free(regsub, sizeof(cw_nxoe_regsub_t));
501 goto RETURN;
502 }
503
504 /* Tell the GC about the space being taken up by regsub->pcre and
505 * regsub->extra. */
506 nxa_l_count_adjust((cw_nxoi_t) regsub->size + regsub->studysize);
507
508 /* Create a reference to the regsub object. */
509 nxo_no_new(a_nxo);
510 a_nxo->o.nxoe = (cw_nxoe_t *) regsub;
511 nxo_p_type_set(a_nxo, NXOT_REGSUB);
512
513 /* Register the regsub object with the GC. */
514 nxa_l_gc_register((cw_nxoe_t *) regsub);
515
516 retval = NXN_ZERO;
517 RETURN:
518 return retval;
519 }
520
521 void
nxo_regsub_subst(cw_nxo_t * a_nxo,cw_nxo_t * a_thread,cw_nxo_t * a_input,cw_nxo_t * r_output,uint32_t * r_count)522 nxo_regsub_subst(cw_nxo_t *a_nxo, cw_nxo_t *a_thread, cw_nxo_t *a_input,
523 cw_nxo_t *r_output, uint32_t *r_count)
524 {
525 cw_nxoe_regsub_t *regsub;
526
527 cw_check_ptr(a_nxo);
528 cw_dassert(a_nxo->magic == CW_NXO_MAGIC);
529 cw_assert(nxo_type_get(a_nxo) == NXOT_REGSUB);
530
531 regsub = (cw_nxoe_regsub_t *) a_nxo->o.nxoe;
532
533 cw_check_ptr(regsub);
534 cw_dassert(regsub->nxoe.magic == CW_NXOE_MAGIC);
535 cw_assert(regsub->nxoe.type == NXOT_REGSUB);
536
537 *r_count = nxo_p_regsub_subst(regsub, a_thread, a_input, r_output);
538 }
539
540 /* Do a subst without creating a regsub object, in order to avoid putting
541 * pressure on the GC. */
542 cw_nxn_t
nxo_regsub_nonew_subst(cw_nxo_t * a_thread,const char * a_pattern,uint32_t a_plen,bool a_global,bool a_insensitive,bool a_multiline,bool a_singleline,const char * a_template,uint32_t a_tlen,cw_nxo_t * a_input,cw_nxo_t * r_output,uint32_t * r_count)543 nxo_regsub_nonew_subst(cw_nxo_t *a_thread, const char *a_pattern,
544 uint32_t a_plen, bool a_global,
545 bool a_insensitive, bool a_multiline,
546 bool a_singleline, const char *a_template,
547 uint32_t a_tlen, cw_nxo_t *a_input,
548 cw_nxo_t *r_output, uint32_t *r_count)
549 {
550 cw_nxn_t retval;
551 cw_nxoe_regsub_t regsub;
552
553 retval = nxo_p_regsub_init(®sub, a_pattern, a_plen, a_global,
554 a_insensitive, a_multiline, a_singleline,
555 a_template, a_tlen);
556 if (retval)
557 {
558 goto RETURN;
559 }
560
561 *r_count = nxo_p_regsub_subst(®sub, a_thread, a_input, r_output);
562
563 /* Clean up memory. */
564
565 if (regsub.vec != NULL)
566 {
567 nxa_free(regsub.vec, sizeof(cw_nxoe_regsub_telm_t) * regsub.vlen);
568 }
569
570 if (regsub.template != NULL)
571 {
572 nxa_free(regsub.template, regsub.tlen);
573 }
574
575 free(regsub.pcre);
576 if (regsub.extra != NULL)
577 {
578 free(regsub.extra);
579 }
580
581 retval = NXN_ZERO;
582 RETURN:
583 return retval;
584 }
585