1 /**********************************************************************
2 
3   re.c -
4 
5   $Author: nagachika $
6   created at: Mon Aug  9 18:24:49 JST 1993
7 
8   Copyright (C) 1993-2007 Yukihiro Matsumoto
9 
10 **********************************************************************/
11 
12 #include "ruby/encoding.h"
13 #include "ruby/re.h"
14 #include "ruby/util.h"
15 #include "internal.h"
16 #include "regint.h"
17 #include "encindex.h"
18 #include <ctype.h>
19 
/* Exception class used for regexp errors in this file (see rb_reg_raise
 * and rb_enc_reg_error_desc). */
VALUE rb_eRegexpError;

/* Fixed-size character buffer that receives an Onigmo error message;
 * make_regexp() fills one through onig_error_code_to_str(). */
typedef char onig_errmsg_buffer[ONIG_MAX_ERROR_MESSAGE_LEN];
/* Copies msg into an onig_errmsg_buffer, bounded by the buffer size. */
#define errcpy(err, msg) strlcpy((err), (msg), ONIG_MAX_ERROR_MESSAGE_LEN)

/* Start / end entry of capture group `no`.  Both macros expand to an access
 * through a variable named `regs`, which must be in scope at the use site. */
#define BEG(no) (regs->beg[(no)])
#define END(no) (regs->end[(no)])
27 
#if 'a' == 97   /* it's ascii */
/*
 * Byte translation table indexed by an unsigned char value.  The entries
 * for 'A'..'Z' hold the matching lower-case letter ('a'..'z'); every other
 * entry maps a byte to itself.  rb_memcicmp() runs both operands through
 * this table before comparing them.
 */
static const char casetable[] = {
        '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
        '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
        '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
        '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
        /* ' '     '!'     '"'     '#'     '$'     '%'     '&'     ''' */
        '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
        /* '('     ')'     '*'     '+'     ','     '-'     '.'     '/' */
        '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
        /* '0'     '1'     '2'     '3'     '4'     '5'     '6'     '7' */
        '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
        /* '8'     '9'     ':'     ';'     '<'     '='     '>'     '?' */
        '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
        /* '@'     'A'     'B'     'C'     'D'     'E'     'F'     'G' */
        '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
        /* 'H'     'I'     'J'     'K'     'L'     'M'     'N'     'O' */
        '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
        /* 'P'     'Q'     'R'     'S'     'T'     'U'     'V'     'W' */
        '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
        /* 'X'     'Y'     'Z'     '['     '\'     ']'     '^'     '_' */
        '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
        /* '`'     'a'     'b'     'c'     'd'     'e'     'f'     'g' */
        '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
        /* 'h'     'i'     'j'     'k'     'l'     'm'     'n'     'o' */
        '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
        /* 'p'     'q'     'r'     's'     't'     'u'     'v'     'w' */
        '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
        /* 'x'     'y'     'z'     '{'     '|'     '}'     '~' */
        '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
        /* bytes 0200-0377 (non-ASCII) are left untouched */
        '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
        '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
        '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
        '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
        '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
        '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
        '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
        '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
        '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
        '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
        '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
        '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
        '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
        '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
        '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
        '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
};
#else
/* The table above is only valid for ASCII-based execution character sets. */
# error >>> "You lose. You will need a translation table for your character set." <<<
#endif
78 
79 int
rb_memcicmp(const void * x,const void * y,long len)80 rb_memcicmp(const void *x, const void *y, long len)
81 {
82     const unsigned char *p1 = x, *p2 = y;
83     int tmp;
84 
85     while (len--) {
86 	if ((tmp = casetable[(unsigned)*p1++] - casetable[(unsigned)*p2++]))
87 	    return tmp;
88     }
89     return 0;
90 }
91 
92 #ifdef HAVE_MEMMEM
/*
 * Short-pattern search, memmem() flavour (built when HAVE_MEMMEM is set).
 *
 * xs, m - needle and its length in bytes
 * ys, n - haystack and its length in bytes
 *
 * Returns the byte offset of the first occurrence of xs inside ys,
 * or -1 when memmem() finds none.
 */
static inline long
rb_memsearch_ss(const unsigned char *xs, long m, const unsigned char *ys, long n)
{
    const unsigned char *hit = memmem(ys, n, xs, m);

    if (hit == NULL) {
        return -1;
    }
    return hit - ys;
}
103 #else
104 static inline long
rb_memsearch_ss(const unsigned char * xs,long m,const unsigned char * ys,long n)105 rb_memsearch_ss(const unsigned char *xs, long m, const unsigned char *ys, long n)
106 {
107     const unsigned char *x = xs, *xe = xs + m;
108     const unsigned char *y = ys, *ye = ys + n;
109 #define VALUE_MAX ((VALUE)~(VALUE)0)
110     VALUE hx, hy, mask = VALUE_MAX >> ((SIZEOF_VALUE - m) * CHAR_BIT);
111 
112     if (m > SIZEOF_VALUE)
113 	rb_bug("!!too long pattern string!!");
114 
115     if (!(y = memchr(y, *x, n - m + 1)))
116 	return -1;
117 
118     /* Prepare hash value */
119     for (hx = *x++, hy = *y++; x < xe; ++x, ++y) {
120 	hx <<= CHAR_BIT;
121 	hy <<= CHAR_BIT;
122 	hx |= *x;
123 	hy |= *y;
124     }
125     /* Searching */
126     while (hx != hy) {
127 	if (y == ye)
128 	    return -1;
129 	hy <<= CHAR_BIT;
130 	hy |= *y;
131 	hy &= mask;
132 	y++;
133     }
134     return y - ys - m;
135 }
136 #endif
137 
/*
 * Quick Search style substring search over raw bytes.
 *
 * A 256-entry table records, for each byte value, how far the window may
 * advance when that byte is the one immediately following the current
 * window: m + 1 for bytes absent from the pattern, otherwise the distance
 * from the byte's last occurrence in the pattern to the pattern's end.
 *
 * xs, m - needle and its length in bytes
 * ys, n - haystack and its length in bytes
 *
 * Returns the byte offset of the first match, or -1 when there is none.
 */
static inline long
rb_memsearch_qs(const unsigned char *xs, long m, const unsigned char *ys, long n)
{
    long shift[256];
    long i, pos;
    const long last = n - m;    /* highest offset at which xs still fits */

    /* Preprocessing */
    for (i = 0; i < 256; ++i)
        shift[i] = m + 1;
    for (i = 0; i < m; ++i)
        shift[xs[i]] = m - i;

    /* Searching */
    for (pos = 0; pos <= last; pos += shift[ys[pos + m]]) {
        if (*xs == ys[pos] && memcmp(xs, ys + pos, m) == 0)
            return pos;
        /*
         * The final window has no byte after it; stop here rather than
         * letting the loop increment read ys[n], one past the haystack.
         */
        if (pos == last)
            break;
    }
    return -1;
}
157 
/*
 * Maps the byte sequence starting at x to a slot in the range 0..511.
 *
 * - Lead bytes below 0xC0 or at/above 0xF5 are hashed on their own and land
 *   in the upper half (byte value + 256).
 * - Lead bytes 0xC0-0xDF, 0xE0-0xEF and 0xF0-0xF4 additionally mix in the
 *   next 1, 2 or 3 bytes respectively; the result is truncated to 8 bits
 *   and lands in the lower half.
 *
 * The caller must make sure the trailing bytes that get mixed in are readable.
 */
static inline unsigned int
rb_memsearch_qs_utf8_hash(const unsigned char *x)
{
    const unsigned int mix = 8353;
    const unsigned int lead = *x;
    unsigned int h = lead;
    int trailing, k;

    if (lead < 0xC0 || lead >= 0xF5) {
        return lead + 256;
    }

    if (lead < 0xE0) {
        trailing = 1;
    }
    else if (lead < 0xF0) {
        trailing = 2;
    }
    else {
        trailing = 3;
    }

    for (k = 1; k <= trailing; k++) {
        h = h * mix + x[k];
    }
    return (unsigned char)h;
}
189 
190 static inline long
rb_memsearch_qs_utf8(const unsigned char * xs,long m,const unsigned char * ys,long n)191 rb_memsearch_qs_utf8(const unsigned char *xs, long m, const unsigned char *ys, long n)
192 {
193     const unsigned char *x = xs, *xe = xs + m;
194     const unsigned char *y = ys;
195     VALUE i, qstable[512];
196 
197     /* Preprocessing */
198     for (i = 0; i < 512; ++i) {
199 	qstable[i] = m + 1;
200     }
201     for (; x < xe; ++x) {
202 	qstable[rb_memsearch_qs_utf8_hash(x)] = xe - x;
203     }
204     /* Searching */
205     for (; y + m <= ys + n; y += qstable[rb_memsearch_qs_utf8_hash(y+m)]) {
206 	if (*xs == *y && memcmp(xs, y, m) == 0)
207 	    return y - ys;
208     }
209     return -1;
210 }
211 
/*
 * Substring search that only tests offsets which are multiples of two
 * bytes (one code unit), so a match can never start in the middle of a unit.
 *
 * xs, m - needle and its length in bytes
 * ys, n - haystack and its length in bytes
 *
 * Returns the byte offset of the first aligned match, or -1.
 */
static inline long
rb_memsearch_wchar(const unsigned char *xs, long m, const unsigned char *ys, long n)
{
    enum {unit = 2};
    const unsigned char first = *xs;
    const long last = n - m;    /* highest offset at which xs still fits */
    long off;

    for (off = 0; off <= last; off += unit) {
        /* cheap first-byte test before comparing the remainder */
        if (ys[off] == first && memcmp(xs + 1, ys + off + 1, m - 1) == 0) {
            return off;
        }
    }
    return -1;
}
224 
/*
 * Substring search that only tests offsets which are multiples of four
 * bytes (one code unit), so a match can never start in the middle of a unit.
 *
 * xs, m - needle and its length in bytes
 * ys, n - haystack and its length in bytes
 *
 * Returns the byte offset of the first aligned match, or -1.
 */
static inline long
rb_memsearch_qchar(const unsigned char *xs, long m, const unsigned char *ys, long n)
{
    enum {unit = 4};
    const unsigned char first = *xs;
    const long last = n - m;    /* highest offset at which xs still fits */
    long off;

    for (off = 0; off <= last; off += unit) {
        /* cheap first-byte test before comparing the remainder */
        if (ys[off] == first && memcmp(xs + 1, ys + off + 1, m - 1) == 0) {
            return off;
        }
    }
    return -1;
}
237 
238 long
rb_memsearch(const void * x0,long m,const void * y0,long n,rb_encoding * enc)239 rb_memsearch(const void *x0, long m, const void *y0, long n, rb_encoding *enc)
240 {
241     const unsigned char *x = x0, *y = y0;
242 
243     if (m > n) return -1;
244     else if (m == n) {
245 	return memcmp(x0, y0, m) == 0 ? 0 : -1;
246     }
247     else if (m < 1) {
248 	return 0;
249     }
250     else if (m == 1) {
251 	const unsigned char *ys = memchr(y, *x, n);
252 
253 	if (ys)
254 	    return ys - y;
255 	else
256 	    return -1;
257     }
258     else if (LIKELY(rb_enc_mbminlen(enc) == 1)) {
259 	if (m <= SIZEOF_VALUE) {
260 	    return rb_memsearch_ss(x0, m, y0, n);
261 	}
262 	else if (enc == rb_utf8_encoding()){
263 	    return rb_memsearch_qs_utf8(x0, m, y0, n);
264 	}
265     }
266     else if (LIKELY(rb_enc_mbminlen(enc) == 2)) {
267 	return rb_memsearch_wchar(x0, m, y0, n);
268     }
269     else if (LIKELY(rb_enc_mbminlen(enc) == 4)) {
270 	return rb_memsearch_qchar(x0, m, y0, n);
271     }
272     return rb_memsearch_qs(x0, m, y0, n);
273 }
274 
/* Per-object flag bits kept in RBASIC(re)->flags. */
/* NOTE(review): presumably marks a Regexp created from a literal - confirm. */
#define REG_LITERAL FL_USER5
/* When set, rb_reg_desc() appends "n" to the rendered option letters. */
#define REG_ENCODING_NONE FL_USER6

/* NOTE(review): not used in the visible part of this file; presumably marks
 * a Regexp whose encoding was fixed explicitly - confirm. */
#define KCODE_FIXED FL_USER4

/* Onigmo option bits that can be given as i / m / x. */
#define ARG_REG_OPTION_MASK \
    (ONIG_OPTION_IGNORECASE|ONIG_OPTION_MULTILINE|ONIG_OPTION_EXTEND)
/* Values stored in *option by rb_char_to_option_kcode(): FIXED for the
 * 'e', 's' and 'u' letters, NONE for 'n'. */
#define ARG_ENCODING_FIXED    16
#define ARG_ENCODING_NONE     32
284 
285 static int
char_to_option(int c)286 char_to_option(int c)
287 {
288     int val;
289 
290     switch (c) {
291       case 'i':
292 	val = ONIG_OPTION_IGNORECASE;
293 	break;
294       case 'x':
295 	val = ONIG_OPTION_EXTEND;
296 	break;
297       case 'm':
298 	val = ONIG_OPTION_MULTILINE;
299 	break;
300       default:
301 	val = 0;
302 	break;
303     }
304     return val;
305 }
306 
307 static char *
option_to_str(char str[4],int options)308 option_to_str(char str[4], int options)
309 {
310     char *p = str;
311     if (options & ONIG_OPTION_MULTILINE) *p++ = 'm';
312     if (options & ONIG_OPTION_IGNORECASE) *p++ = 'i';
313     if (options & ONIG_OPTION_EXTEND) *p++ = 'x';
314     *p = 0;
315     return str;
316 }
317 
318 extern int
rb_char_to_option_kcode(int c,int * option,int * kcode)319 rb_char_to_option_kcode(int c, int *option, int *kcode)
320 {
321     *option = 0;
322 
323     switch (c) {
324       case 'n':
325         *kcode = rb_ascii8bit_encindex();
326         return (*option = ARG_ENCODING_NONE);
327       case 'e':
328 	*kcode = ENCINDEX_EUC_JP;
329 	break;
330       case 's':
331 	*kcode = ENCINDEX_Windows_31J;
332 	break;
333       case 'u':
334 	*kcode = rb_utf8_encindex();
335 	break;
336       default:
337 	*kcode = -1;
338 	return (*option = char_to_option(c));
339     }
340     *option = ARG_ENCODING_FIXED;
341     return 1;
342 }
343 
344 static void
rb_reg_check(VALUE re)345 rb_reg_check(VALUE re)
346 {
347     if (!RREGEXP_PTR(re) || !RREGEXP_SRC(re) || !RREGEXP_SRC_PTR(re)) {
348 	rb_raise(rb_eTypeError, "uninitialized Regexp");
349     }
350 }
351 
/*
 * Appends a printable rendering of a regexp source to str.
 *
 * str    - destination string buffer (appended to)
 * s, len - regexp source bytes and their length
 * enc    - encoding of the source
 * resenc - encoding of the result; when NULL, multibyte characters are
 *          copied through unchanged instead of being escaped
 * term   - delimiter character that gets a backslash put in front of it
 *
 * A first pass decides whether the source can be copied verbatim; if not,
 * a second pass copies it character by character, escaping as needed.
 */
static void
rb_reg_expr_str(VALUE str, const char *s, long len,
                rb_encoding *enc, rb_encoding *resenc, int term)
{
    const char *p, *pend;
    int cr = ENC_CODERANGE_UNKNOWN;
    int need_escape = 0;
    int c, clen;

    p = s; pend = p + len;
    rb_str_coderange_scan_restartable(p, pend, enc, &cr);
    /* Pass 1: look for anything that forces escaping. */
    if (rb_enc_asciicompat(enc) && ENC_CODERANGE_CLEAN_P(cr)) {
        while (p < pend) {
            c = rb_enc_ascget(p, pend, &clen, enc);
            if (c == -1) {
                /* non-ASCII character: fine only if no transcoding is needed */
                if (enc == resenc) {
                    p += mbclen(p, pend, enc);
                }
                else {
                    need_escape = 1;
                    break;
                }
            }
            else if (c != term && rb_enc_isprint(c, enc)) {
                p += clen;
            }
            else {
                /* the delimiter itself or a non-printable ASCII character */
                need_escape = 1;
                break;
            }
        }
    }
    else {
        /* ASCII-incompatible encoding or a coderange that is not clean */
        need_escape = 1;
    }

    if (!need_escape) {
        rb_str_buf_cat(str, s, len);
    }
    else {
        /* Pass 2: copy character by character, escaping where required. */
        int unicode_p = rb_enc_unicode_p(enc);
        p = s;
        while (p<pend) {
            c = rb_enc_ascget(p, pend, &clen, enc);
            if (c == '\\' && p+clen < pend) {
                /* keep an existing escape (backslash + next character) intact */
                int n = clen + mbclen(p+clen, pend, enc);
                rb_str_buf_cat(str, p, n);
                p += n;
                continue;
            }
            else if (c == -1) {
                clen = rb_enc_precise_mbclen(p, pend, enc);
                if (!MBCLEN_CHARFOUND_P(clen)) {
                    /* invalid byte: emit it as \xHH and advance one byte */
                    c = (unsigned char)*p;
                    clen = 1;
                    goto hex;
                }
                if (resenc) {
                    /* valid multibyte character: append in escaped form */
                    unsigned int c = rb_enc_mbc_to_codepoint(p, pend, enc);
                    rb_str_buf_cat_escaped_char(str, c, unicode_p);
                }
                else {
                    /* no result encoding given: copy the bytes unchanged */
                    clen = MBCLEN_CHARFOUND_LEN(clen);
                    rb_str_buf_cat(str, p, clen);
                }
            }
            else if (c == term) {
                /* protect the delimiter with a backslash */
                char c = '\\';
                rb_str_buf_cat(str, &c, 1);
                rb_str_buf_cat(str, p, clen);
            }
            else if (rb_enc_isprint(c, enc)) {
                rb_str_buf_cat(str, p, clen);
            }
            else if (!rb_enc_isspace(c, enc)) {
                char b[8];

                /* also the jump target for invalid bytes found above */
              hex:
                snprintf(b, sizeof(b), "\\x%02X", c);
                rb_str_buf_cat(str, b, 4);
            }
            else {
                /* whitespace is copied as is */
                rb_str_buf_cat(str, p, clen);
            }
            p += clen;
        }
    }
}
440 
441 static VALUE
rb_reg_desc(const char * s,long len,VALUE re)442 rb_reg_desc(const char *s, long len, VALUE re)
443 {
444     rb_encoding *enc = rb_enc_get(re);
445     VALUE str = rb_str_buf_new2("/");
446     rb_encoding *resenc = rb_default_internal_encoding();
447     if (resenc == NULL) resenc = rb_default_external_encoding();
448 
449     if (re && rb_enc_asciicompat(enc)) {
450 	rb_enc_copy(str, re);
451     }
452     else {
453 	rb_enc_associate(str, rb_usascii_encoding());
454     }
455     rb_reg_expr_str(str, s, len, enc, resenc, '/');
456     rb_str_buf_cat2(str, "/");
457     if (re) {
458 	char opts[4];
459 	rb_reg_check(re);
460 	if (*option_to_str(opts, RREGEXP_PTR(re)->options))
461 	    rb_str_buf_cat2(str, opts);
462 	if (RBASIC(re)->flags & REG_ENCODING_NONE)
463 	    rb_str_buf_cat2(str, "n");
464     }
465     OBJ_INFECT(str, re);
466     return str;
467 }
468 
469 
470 /*
471  *  call-seq:
472  *      rxp.source   -> str
473  *
474  *  Returns the original string of the pattern.
475  *
476  *      /ab+c/ix.source #=> "ab+c"
477  *
478  *  Note that escape sequences are retained as is.
479  *
480  *     /\x20\+/.source  #=> "\\x20\\+"
481  *
482  */
483 
484 static VALUE
rb_reg_source(VALUE re)485 rb_reg_source(VALUE re)
486 {
487     VALUE str;
488 
489     rb_reg_check(re);
490     str = rb_str_dup(RREGEXP_SRC(re));
491     if (OBJ_TAINTED(re)) OBJ_TAINT(str);
492     return str;
493 }
494 
495 /*
496  * call-seq:
497  *    rxp.inspect   -> string
498  *
499  * Produce a nicely formatted string-version of _rxp_. Perhaps surprisingly,
500  * <code>#inspect</code> actually produces the more natural version of
501  * the string than <code>#to_s</code>.
502  *
503  *      /ab+c/ix.inspect        #=> "/ab+c/ix"
504  *
505  */
506 
507 static VALUE
rb_reg_inspect(VALUE re)508 rb_reg_inspect(VALUE re)
509 {
510     if (!RREGEXP_PTR(re) || !RREGEXP_SRC(re) || !RREGEXP_SRC_PTR(re)) {
511         return rb_any_to_s(re);
512     }
513     return rb_reg_desc(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), re);
514 }
515 
516 static VALUE rb_reg_str_with_term(VALUE re, int term);
517 
518 /*
519  *  call-seq:
520  *     rxp.to_s   -> str
521  *
 *  Returns a string containing the regular expression and its options (using the
 *  <code>(?opts:source)</code> notation). This string can be fed back in to
 *  <code>Regexp::new</code> to create a regular expression with the same semantics as
525  *  the original. (However, <code>Regexp#==</code> may not return true when
526  *  comparing the two, as the source of the regular expression itself may
527  *  differ, as the example shows).  <code>Regexp#inspect</code> produces a
528  *  generally more readable version of <i>rxp</i>.
529  *
530  *      r1 = /ab+c/ix           #=> /ab+c/ix
531  *      s1 = r1.to_s            #=> "(?ix-m:ab+c)"
532  *      r2 = Regexp.new(s1)     #=> /(?ix-m:ab+c)/
533  *      r1 == r2                #=> false
534  *      r1.source               #=> "ab+c"
535  *      r2.source               #=> "(?ix-m:ab+c)"
536  */
537 
538 static VALUE
rb_reg_to_s(VALUE re)539 rb_reg_to_s(VALUE re)
540 {
541     return rb_reg_str_with_term(re, '/');
542 }
543 
/*
 * Renders re in the "(?opts-opts:source)" notation.
 *
 * re   - an initialized Regexp (checked with rb_reg_check)
 * term - delimiter character passed on to rb_reg_expr_str for escaping
 *
 * If the source itself already has the shape "(?opts-opts:...)" and the
 * inner part compiles on its own with those options, the wrapper is
 * stripped and its options are folded into the ones emitted, so that the
 * result does not nest one option group inside another.
 *
 * Returns a new string with re's encoding, infected by re.
 */
static VALUE
rb_reg_str_with_term(VALUE re, int term)
{
    int options, opt;
    const int embeddable = ONIG_OPTION_MULTILINE|ONIG_OPTION_IGNORECASE|ONIG_OPTION_EXTEND;
    long len;
    const UChar* ptr;
    VALUE str = rb_str_buf_new2("(?");
    /* room for '-', up to three option letters and the terminating NUL */
    char optbuf[5];
    rb_encoding *enc = rb_enc_get(re);

    rb_reg_check(re);

    rb_enc_copy(str, re);
    options = RREGEXP_PTR(re)->options;
    ptr = (UChar*)RREGEXP_SRC_PTR(re);
    len = RREGEXP_SRC_LEN(re);
  again:
    if (len >= 4 && ptr[0] == '(' && ptr[1] == '?') {
        /* stays non-zero unless the unwrapped body compiles successfully */
        int err = 1;
        ptr += 2;
        /* option letters to switch on */
        if ((len -= 2) > 0) {
            do {
                opt = char_to_option((int )*ptr);
                if (opt != 0) {
                    options |= opt;
                }
                else {
                    break;
                }
                ++ptr;
            } while (--len > 0);
        }
        /* option letters to switch off, introduced by '-' */
        if (len > 1 && *ptr == '-') {
            ++ptr;
            --len;
            do {
                opt = char_to_option((int )*ptr);
                if (opt != 0) {
                    options &= ~opt;
                }
                else {
                    break;
                }
                ++ptr;
            } while (--len > 0);
        }
        /* "(?opts)" with no body: keep the options and look at what follows */
        if (*ptr == ')') {
            --len;
            ++ptr;
            goto again;
        }
        /* "(?opts:body)": try compiling the body alone with these options */
        if (*ptr == ':' && ptr[len-1] == ')') {
            Regexp *rp;
            /* ruby_verbose is switched off for the duration of the trial compile */
            VALUE verbose = ruby_verbose;
            ruby_verbose = Qfalse;

            ++ptr;
            len -= 2;
            err = onig_new(&rp, ptr, ptr + len, options,
                           enc, OnigDefaultSyntax, NULL);
            onig_free(rp);
            ruby_verbose = verbose;
        }
        /* unwrapping failed: go back to the untouched source and options */
        if (err) {
            options = RREGEXP_PTR(re)->options;
            ptr = (UChar*)RREGEXP_SRC_PTR(re);
            len = RREGEXP_SRC_LEN(re);
        }
    }

    /* enabled options ... */
    if (*option_to_str(optbuf, options)) rb_str_buf_cat2(str, optbuf);

    /* ... followed by "-" and the disabled ones, unless all three are on */
    if ((options & embeddable) != embeddable) {
        optbuf[0] = '-';
        option_to_str(optbuf + 1, ~options);
        rb_str_buf_cat2(str, optbuf);
    }

    rb_str_buf_cat2(str, ":");
    if (rb_enc_asciicompat(enc)) {
        rb_reg_expr_str(str, (char*)ptr, len, enc, NULL, term);
        rb_str_buf_cat2(str, ")");
    }
    else {
        /*
         * ASCII-incompatible encoding: the prefix built so far is ASCII
         * text, so it is completed with ")" and converted to enc as a
         * whole; the converted ")" is then cut off again, the source is
         * appended, and the saved ")" is put back at the end.
         */
        const char *s, *e;
        char *paren;
        ptrdiff_t n;
        rb_str_buf_cat2(str, ")");
        rb_enc_associate(str, rb_usascii_encoding());
        str = rb_str_encode(str, rb_enc_from_encoding(enc), 0, Qnil);

        /* save the bytes of the encoded ")" (the last character) in paren */
        s = RSTRING_PTR(str);
        e = RSTRING_END(str);
        s = rb_enc_left_char_head(s, e-1, e, enc);
        n = e - s;
        paren = ALLOCA_N(char, n);
        memcpy(paren, s, n);
        rb_str_resize(str, RSTRING_LEN(str) - n);

        rb_reg_expr_str(str, (char*)ptr, len, enc, NULL, term);
        rb_str_buf_cat(str, paren, n);
    }
    rb_enc_copy(str, re);

    OBJ_INFECT(str, re);
    return str;
}
653 
654 NORETURN(static void rb_reg_raise(const char *s, long len, const char *err, VALUE re));
655 
656 static void
rb_reg_raise(const char * s,long len,const char * err,VALUE re)657 rb_reg_raise(const char *s, long len, const char *err, VALUE re)
658 {
659     VALUE desc = rb_reg_desc(s, len, re);
660 
661     rb_raise(rb_eRegexpError, "%s: %"PRIsVALUE, err, desc);
662 }
663 
664 static VALUE
rb_enc_reg_error_desc(const char * s,long len,rb_encoding * enc,int options,const char * err)665 rb_enc_reg_error_desc(const char *s, long len, rb_encoding *enc, int options, const char *err)
666 {
667     char opts[6];
668     VALUE desc = rb_str_buf_new2(err);
669     rb_encoding *resenc = rb_default_internal_encoding();
670     if (resenc == NULL) resenc = rb_default_external_encoding();
671 
672     rb_enc_associate(desc, enc);
673     rb_str_buf_cat2(desc, ": /");
674     rb_reg_expr_str(desc, s, len, enc, resenc, '/');
675     opts[0] = '/';
676     option_to_str(opts + 1, options);
677     rb_str_buf_cat2(desc, opts);
678     return rb_exc_new3(rb_eRegexpError, desc);
679 }
680 
681 NORETURN(static void rb_enc_reg_raise(const char *s, long len, rb_encoding *enc, int options, const char *err));
682 
683 static void
rb_enc_reg_raise(const char * s,long len,rb_encoding * enc,int options,const char * err)684 rb_enc_reg_raise(const char *s, long len, rb_encoding *enc, int options, const char *err)
685 {
686     rb_exc_raise(rb_enc_reg_error_desc(s, len, enc, options, err));
687 }
688 
689 static VALUE
rb_reg_error_desc(VALUE str,int options,const char * err)690 rb_reg_error_desc(VALUE str, int options, const char *err)
691 {
692     return rb_enc_reg_error_desc(RSTRING_PTR(str), RSTRING_LEN(str),
693 				 rb_enc_get(str), options, err);
694 }
695 
696 NORETURN(static void rb_reg_raise_str(VALUE str, int options, const char *err));
697 
698 static void
rb_reg_raise_str(VALUE str,int options,const char * err)699 rb_reg_raise_str(VALUE str, int options, const char *err)
700 {
701     rb_exc_raise(rb_reg_error_desc(str, options, err));
702 }
703 
704 
705 /*
706  *  call-seq:
707  *     rxp.casefold?   -> true or false
708  *
709  *  Returns the value of the case-insensitive flag.
710  *
711  *      /a/.casefold?           #=> false
712  *      /a/i.casefold?          #=> true
713  *      /(?i:a)/.casefold?      #=> false
714  */
715 
716 static VALUE
rb_reg_casefold_p(VALUE re)717 rb_reg_casefold_p(VALUE re)
718 {
719     rb_reg_check(re);
720     if (RREGEXP_PTR(re)->options & ONIG_OPTION_IGNORECASE) return Qtrue;
721     return Qfalse;
722 }
723 
724 
725 /*
726  *  call-seq:
727  *     rxp.options   -> integer
728  *
729  *  Returns the set of bits corresponding to the options used when creating this
 *  Regexp (see <code>Regexp::new</code> for details). Note that additional bits
731  *  may be set in the returned options: these are used internally by the regular
732  *  expression code. These extra bits are ignored if the options are passed to
733  *  <code>Regexp::new</code>.
734  *
735  *     Regexp::IGNORECASE                  #=> 1
736  *     Regexp::EXTENDED                    #=> 2
737  *     Regexp::MULTILINE                   #=> 4
738  *
739  *     /cat/.options                       #=> 0
740  *     /cat/ix.options                     #=> 3
741  *     Regexp.new('cat', true).options     #=> 1
742  *     /\xa1\xa2/e.options                 #=> 16
743  *
744  *     r = /cat/ix
745  *     Regexp.new(r.source, r.options)     #=> /cat/ix
746  */
747 
748 static VALUE
rb_reg_options_m(VALUE re)749 rb_reg_options_m(VALUE re)
750 {
751     int options = rb_reg_options(re);
752     return INT2NUM(options);
753 }
754 
755 static int
reg_names_iter(const OnigUChar * name,const OnigUChar * name_end,int back_num,int * back_refs,OnigRegex regex,void * arg)756 reg_names_iter(const OnigUChar *name, const OnigUChar *name_end,
757           int back_num, int *back_refs, OnigRegex regex, void *arg)
758 {
759     VALUE ary = (VALUE)arg;
760     rb_ary_push(ary, rb_enc_str_new((const char *)name, name_end-name, regex->enc));
761     return 0;
762 }
763 
764 /*
765  * call-seq:
766  *    rxp.names   -> [name1, name2, ...]
767  *
768  * Returns a list of names of captures as an array of strings.
769  *
770  *     /(?<foo>.)(?<bar>.)(?<baz>.)/.names
771  *     #=> ["foo", "bar", "baz"]
772  *
773  *     /(?<foo>.)(?<foo>.)/.names
774  *     #=> ["foo"]
775  *
776  *     /(.)(.)/.names
777  *     #=> []
778  */
779 
780 static VALUE
rb_reg_names(VALUE re)781 rb_reg_names(VALUE re)
782 {
783     VALUE ary;
784     rb_reg_check(re);
785     ary = rb_ary_new_capa(onig_number_of_names(RREGEXP_PTR(re)));
786     onig_foreach_name(RREGEXP_PTR(re), reg_names_iter, (void*)ary);
787     return ary;
788 }
789 
790 static int
reg_named_captures_iter(const OnigUChar * name,const OnigUChar * name_end,int back_num,int * back_refs,OnigRegex regex,void * arg)791 reg_named_captures_iter(const OnigUChar *name, const OnigUChar *name_end,
792           int back_num, int *back_refs, OnigRegex regex, void *arg)
793 {
794     VALUE hash = (VALUE)arg;
795     VALUE ary = rb_ary_new2(back_num);
796     int i;
797 
798     for (i = 0; i < back_num; i++)
799         rb_ary_store(ary, i, INT2NUM(back_refs[i]));
800 
801     rb_hash_aset(hash, rb_str_new((const char*)name, name_end-name),ary);
802 
803     return 0;
804 }
805 
806 /*
807  * call-seq:
808  *    rxp.named_captures  -> hash
809  *
810  * Returns a hash representing information about named captures of <i>rxp</i>.
811  *
812  * A key of the hash is a name of the named captures.
 * A value of the hash is an array which is a list of indexes of the corresponding
814  * named captures.
815  *
816  *    /(?<foo>.)(?<bar>.)/.named_captures
817  *    #=> {"foo"=>[1], "bar"=>[2]}
818  *
819  *    /(?<foo>.)(?<foo>.)/.named_captures
820  *    #=> {"foo"=>[1, 2]}
821  *
822  * If there are no named captures, an empty hash is returned.
823  *
824  *    /(.)(.)/.named_captures
825  *    #=> {}
826  */
827 
828 static VALUE
rb_reg_named_captures(VALUE re)829 rb_reg_named_captures(VALUE re)
830 {
831     regex_t *reg = (rb_reg_check(re), RREGEXP_PTR(re));
832     VALUE hash = rb_hash_new_with_size(onig_number_of_names(reg));
833     onig_foreach_name(reg, reg_named_captures_iter, (void*)hash);
834     return hash;
835 }
836 
837 static int
onig_new_with_source(regex_t ** reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,const OnigSyntaxType * syntax,OnigErrorInfo * einfo,const char * sourcefile,int sourceline)838 onig_new_with_source(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
839 		     OnigOptionType option, OnigEncoding enc, const OnigSyntaxType* syntax,
840 		     OnigErrorInfo* einfo, const char *sourcefile, int sourceline)
841 {
842     int r;
843 
844     *reg = (regex_t* )malloc(sizeof(regex_t));
845     if (IS_NULL(*reg)) return ONIGERR_MEMORY;
846 
847     r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
848     if (r) goto err;
849 
850     r = onig_compile_ruby(*reg, pattern, pattern_end, einfo, sourcefile, sourceline);
851     if (r) {
852       err:
853 	onig_free(*reg);
854 	*reg = NULL;
855     }
856     return r;
857 }
858 
859 static Regexp*
make_regexp(const char * s,long len,rb_encoding * enc,int flags,onig_errmsg_buffer err,const char * sourcefile,int sourceline)860 make_regexp(const char *s, long len, rb_encoding *enc, int flags, onig_errmsg_buffer err,
861 	const char *sourcefile, int sourceline)
862 {
863     Regexp *rp;
864     int r;
865     OnigErrorInfo einfo;
866 
867     /* Handle escaped characters first. */
868 
869     /* Build a copy of the string (in dest) with the
870        escaped characters translated,  and generate the regex
871        from that.
872     */
873 
874     r = onig_new_with_source(&rp, (UChar*)s, (UChar*)(s + len), flags,
875 		 enc, OnigDefaultSyntax, &einfo, sourcefile, sourceline);
876     if (r) {
877 	onig_error_code_to_str((UChar*)err, r, &einfo);
878 	return 0;
879     }
880     return rp;
881 }
882 
883 
884 /*
885  *  Document-class: MatchData
886  *
887  *  <code>MatchData</code> encapsulates the result of matching a Regexp against
 *  a string. It is returned by Regexp#match and
889  *  String#match, and also stored in a global variable returned by
890  *  Regexp.last_match.
891  *
892  *  Usage:
893  *
894  *      url = 'https://docs.ruby-lang.org/en/2.5.0/MatchData.html'
895  *      m = url.match(/(\d\.?)+/)   # => #<MatchData "2.5.0" 1:"0">
896  *      m.string                    # => "https://docs.ruby-lang.org/en/2.5.0/MatchData.html"
897  *      m.regexp                    # => /(\d\.?)+/
898  *      # entire matched substring:
899  *      m[0]                        # => "2.5.0"
900  *
901  *      # Working with unnamed captures
902  *      m = url.match(%r{([^/]+)/([^/]+)\.html$})
903  *      m.captures                  # => ["2.5.0", "MatchData"]
904  *      m[1]                        # => "2.5.0"
905  *      m.values_at(1, 2)           # => ["2.5.0", "MatchData"]
906  *
907  *      # Working with named captures
908  *      m = url.match(%r{(?<version>[^/]+)/(?<module>[^/]+)\.html$})
909  *      m.captures                  # => ["2.5.0", "MatchData"]
910  *      m.named_captures            # => {"version"=>"2.5.0", "module"=>"MatchData"}
911  *      m[:version]                 # => "2.5.0"
912  *      m.values_at(:version, :module)
913  *                                  # => ["2.5.0", "MatchData"]
 *      # Numerical indexes work, too
915  *      m[1]                        # => "2.5.0"
916  *      m.values_at(1, 2)           # => ["2.5.0", "MatchData"]
917  *
918  *  == Global variables equivalence
919  *
920  *  Parts of last <code>MatchData</code> (returned by Regexp.last_match) are also
921  *  aliased as global variables:
922  *
923  *  * <code>$~</code> is <code>Regexp.last_match</code>;
924  *  * <code>$&</code> is <code>Regexp.last_match[0]</code>;
925  *  * <code>$1</code>, <code>$2</code>, and so on are
926  *    <code>Regexp.last_match[i]</code> (captures by number);
927  *  * <code>$`</code> is <code>Regexp.last_match.pre_match</code>;
928  *  * <code>$'</code> is <code>Regexp.last_match.post_match</code>;
929  *  * <code>$+</code> is <code>Regexp.last_match[-1]</code> (the last capture).
930  *
931  *  See also "Special global variables" section in Regexp documentation.
932  */
933 
934 VALUE rb_cMatch;
935 
936 static VALUE
match_alloc(VALUE klass)937 match_alloc(VALUE klass)
938 {
939     NEWOBJ_OF(match, struct RMatch, klass, T_MATCH);
940 
941     match->str = 0;
942     match->rmatch = 0;
943     match->regexp = 0;
944     match->rmatch = ZALLOC(struct rmatch);
945 
946     return (VALUE)match;
947 }
948 
949 int
rb_reg_region_copy(struct re_registers * to,const struct re_registers * from)950 rb_reg_region_copy(struct re_registers *to, const struct re_registers *from)
951 {
952     onig_region_copy(to, (OnigRegion *)from);
953     if (to->allocated) return 0;
954     rb_gc();
955     onig_region_copy(to, (OnigRegion *)from);
956     if (to->allocated) return 0;
957     return ONIGERR_MEMORY;
958 }
959 
/* A byte offset paired with the character offset that corresponds to it. */
typedef struct {
    long byte_pos;
    long char_pos;
} pair_t;

/*
 * qsort()/bsearch() comparator ordering pair_t entries by byte_pos.
 * Returns a negative, zero or positive value when pair1 sorts before,
 * equal to, or after pair2.
 */
static int
pair_byte_cmp(const void *pair1, const void *pair2)
{
    const pair_t *lhs = pair1;
    const pair_t *rhs = pair2;
    const long diff = lhs->byte_pos - rhs->byte_pos;

#if SIZEOF_LONG > SIZEOF_INT
    /* the difference may not fit in an int, so reduce it to its sign */
    if (diff > 0) return 1;
    if (diff < 0) return -1;
    return 0;
#else
    return (int)diff;
#endif
}
975 
976 static void
update_char_offset(VALUE match)977 update_char_offset(VALUE match)
978 {
979     struct rmatch *rm = RMATCH(match)->rmatch;
980     struct re_registers *regs;
981     int i, num_regs, num_pos;
982     long c;
983     char *s, *p, *q;
984     rb_encoding *enc;
985     pair_t *pairs;
986 
987     if (rm->char_offset_updated)
988         return;
989 
990     regs = &rm->regs;
991     num_regs = rm->regs.num_regs;
992 
993     if (rm->char_offset_num_allocated < num_regs) {
994         REALLOC_N(rm->char_offset, struct rmatch_offset, num_regs);
995         rm->char_offset_num_allocated = num_regs;
996     }
997 
998     enc = rb_enc_get(RMATCH(match)->str);
999     if (rb_enc_mbmaxlen(enc) == 1) {
1000         for (i = 0; i < num_regs; i++) {
1001             rm->char_offset[i].beg = BEG(i);
1002             rm->char_offset[i].end = END(i);
1003         }
1004         rm->char_offset_updated = 1;
1005         return;
1006     }
1007 
1008     pairs = ALLOCA_N(pair_t, num_regs*2);
1009     num_pos = 0;
1010     for (i = 0; i < num_regs; i++) {
1011         if (BEG(i) < 0)
1012             continue;
1013         pairs[num_pos++].byte_pos = BEG(i);
1014         pairs[num_pos++].byte_pos = END(i);
1015     }
1016     qsort(pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
1017 
1018     s = p = RSTRING_PTR(RMATCH(match)->str);
1019     c = 0;
1020     for (i = 0; i < num_pos; i++) {
1021         q = s + pairs[i].byte_pos;
1022         c += rb_enc_strlen(p, q, enc);
1023         pairs[i].char_pos = c;
1024         p = q;
1025     }
1026 
1027     for (i = 0; i < num_regs; i++) {
1028         pair_t key, *found;
1029         if (BEG(i) < 0) {
1030             rm->char_offset[i].beg = -1;
1031             rm->char_offset[i].end = -1;
1032             continue;
1033         }
1034 
1035         key.byte_pos = BEG(i);
1036         found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
1037         rm->char_offset[i].beg = found->char_pos;
1038 
1039         key.byte_pos = END(i);
1040         found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
1041         rm->char_offset[i].end = found->char_pos;
1042     }
1043 
1044     rm->char_offset_updated = 1;
1045 }
1046 
1047 static void
match_check(VALUE match)1048 match_check(VALUE match)
1049 {
1050     if (!RMATCH(match)->regexp) {
1051 	rb_raise(rb_eTypeError, "uninitialized Match");
1052     }
1053 }
1054 
1055 /* :nodoc: */
1056 static VALUE
match_init_copy(VALUE obj,VALUE orig)1057 match_init_copy(VALUE obj, VALUE orig)
1058 {
1059     struct rmatch *rm;
1060 
1061     if (!OBJ_INIT_COPY(obj, orig)) return obj;
1062 
1063     RMATCH(obj)->str = RMATCH(orig)->str;
1064     RMATCH(obj)->regexp = RMATCH(orig)->regexp;
1065 
1066     rm = RMATCH(obj)->rmatch;
1067     if (rb_reg_region_copy(&rm->regs, RMATCH_REGS(orig)))
1068 	rb_memerror();
1069 
1070     if (!RMATCH(orig)->rmatch->char_offset_updated) {
1071         rm->char_offset_updated = 0;
1072     }
1073     else {
1074         if (rm->char_offset_num_allocated < rm->regs.num_regs) {
1075             REALLOC_N(rm->char_offset, struct rmatch_offset, rm->regs.num_regs);
1076             rm->char_offset_num_allocated = rm->regs.num_regs;
1077         }
1078         MEMCPY(rm->char_offset, RMATCH(orig)->rmatch->char_offset,
1079                struct rmatch_offset, rm->regs.num_regs);
1080         rm->char_offset_updated = 1;
1081 	RB_GC_GUARD(orig);
1082     }
1083 
1084     return obj;
1085 }
1086 
1087 
1088 /*
1089  * call-seq:
1090  *    mtch.regexp   -> regexp
1091  *
1092  * Returns the regexp.
1093  *
1094  *     m = /a.*b/.match("abc")
1095  *     m.regexp #=> /a.*b/
1096  */
1097 
1098 static VALUE
match_regexp(VALUE match)1099 match_regexp(VALUE match)
1100 {
1101     VALUE regexp;
1102     match_check(match);
1103     regexp = RMATCH(match)->regexp;
1104     if (NIL_P(regexp)) {
1105 	VALUE str = rb_reg_nth_match(0, match);
1106 	regexp = rb_reg_regcomp(rb_reg_quote(str));
1107 	RMATCH(match)->regexp = regexp;
1108     }
1109     return regexp;
1110 }
1111 
1112 /*
1113  * call-seq:
1114  *    mtch.names   -> [name1, name2, ...]
1115  *
1116  * Returns a list of names of captures as an array of strings.
 * It is the same as mtch.regexp.names.
1118  *
1119  *     /(?<foo>.)(?<bar>.)(?<baz>.)/.match("hoge").names
1120  *     #=> ["foo", "bar", "baz"]
1121  *
1122  *     m = /(?<x>.)(?<y>.)?/.match("a") #=> #<MatchData "a" x:"a" y:nil>
1123  *     m.names                          #=> ["x", "y"]
1124  */
1125 
1126 static VALUE
match_names(VALUE match)1127 match_names(VALUE match)
1128 {
1129     match_check(match);
1130     if (NIL_P(RMATCH(match)->regexp))
1131 	return rb_ary_new_capa(0);
1132     return rb_reg_names(RMATCH(match)->regexp);
1133 }
1134 
1135 /*
1136  *  call-seq:
1137  *     mtch.length   -> integer
1138  *     mtch.size     -> integer
1139  *
1140  *  Returns the number of elements in the match array.
1141  *
1142  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1143  *     m.length   #=> 5
1144  *     m.size     #=> 5
1145  */
1146 
1147 static VALUE
match_size(VALUE match)1148 match_size(VALUE match)
1149 {
1150     match_check(match);
1151     return INT2FIX(RMATCH_REGS(match)->num_regs);
1152 }
1153 
1154 static int name_to_backref_number(struct re_registers *, VALUE, const char*, const char*);
1155 
1156 static int
match_backref_number(VALUE match,VALUE backref)1157 match_backref_number(VALUE match, VALUE backref)
1158 {
1159     const char *name;
1160     int num;
1161 
1162     struct re_registers *regs = RMATCH_REGS(match);
1163     VALUE regexp = RMATCH(match)->regexp;
1164 
1165     match_check(match);
1166     if (SYMBOL_P(backref)) {
1167 	backref = rb_sym2str(backref);
1168     }
1169     else if (!RB_TYPE_P(backref, T_STRING)) {
1170 	return NUM2INT(backref);
1171     }
1172     name = StringValueCStr(backref);
1173 
1174     num = name_to_backref_number(regs, regexp, name, name + strlen(name));
1175 
1176     if (num < 1) {
1177         rb_raise(rb_eIndexError, "undefined group name reference: %s", name);
1178     }
1179 
1180     return num;
1181 }
1182 
1183 int
rb_reg_backref_number(VALUE match,VALUE backref)1184 rb_reg_backref_number(VALUE match, VALUE backref)
1185 {
1186     return match_backref_number(match, backref);
1187 }
1188 
1189 /*
1190  *  call-seq:
1191  *     mtch.offset(n)   -> array
1192  *
1193  *  Returns a two-element array containing the beginning and ending offsets of
1194  *  the <em>n</em>th match.
1195  *  <em>n</em> can be a string or symbol to reference a named capture.
1196  *
1197  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1198  *     m.offset(0)      #=> [1, 7]
1199  *     m.offset(4)      #=> [6, 7]
1200  *
1201  *     m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
1202  *     p m.offset(:foo) #=> [0, 1]
1203  *     p m.offset(:bar) #=> [2, 3]
1204  *
1205  */
1206 
1207 static VALUE
match_offset(VALUE match,VALUE n)1208 match_offset(VALUE match, VALUE n)
1209 {
1210     int i = match_backref_number(match, n);
1211     struct re_registers *regs = RMATCH_REGS(match);
1212 
1213     match_check(match);
1214     if (i < 0 || regs->num_regs <= i)
1215 	rb_raise(rb_eIndexError, "index %d out of matches", i);
1216 
1217     if (BEG(i) < 0)
1218 	return rb_assoc_new(Qnil, Qnil);
1219 
1220     update_char_offset(match);
1221     return rb_assoc_new(INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg),
1222 			INT2FIX(RMATCH(match)->rmatch->char_offset[i].end));
1223 }
1224 
1225 
1226 /*
1227  *  call-seq:
1228  *     mtch.begin(n)   -> integer
1229  *
1230  *  Returns the offset of the start of the <em>n</em>th element of the match
1231  *  array in the string.
1232  *  <em>n</em> can be a string or symbol to reference a named capture.
1233  *
1234  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1235  *     m.begin(0)       #=> 1
1236  *     m.begin(2)       #=> 2
1237  *
1238  *     m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
1239  *     p m.begin(:foo)  #=> 0
1240  *     p m.begin(:bar)  #=> 2
1241  */
1242 
1243 static VALUE
match_begin(VALUE match,VALUE n)1244 match_begin(VALUE match, VALUE n)
1245 {
1246     int i = match_backref_number(match, n);
1247     struct re_registers *regs = RMATCH_REGS(match);
1248 
1249     match_check(match);
1250     if (i < 0 || regs->num_regs <= i)
1251 	rb_raise(rb_eIndexError, "index %d out of matches", i);
1252 
1253     if (BEG(i) < 0)
1254 	return Qnil;
1255 
1256     update_char_offset(match);
1257     return INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg);
1258 }
1259 
1260 
1261 /*
1262  *  call-seq:
1263  *     mtch.end(n)   -> integer
1264  *
1265  *  Returns the offset of the character immediately following the end of the
1266  *  <em>n</em>th element of the match array in the string.
1267  *  <em>n</em> can be a string or symbol to reference a named capture.
1268  *
1269  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1270  *     m.end(0)         #=> 7
1271  *     m.end(2)         #=> 3
1272  *
1273  *     m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
1274  *     p m.end(:foo)    #=> 1
1275  *     p m.end(:bar)    #=> 3
1276  */
1277 
1278 static VALUE
match_end(VALUE match,VALUE n)1279 match_end(VALUE match, VALUE n)
1280 {
1281     int i = match_backref_number(match, n);
1282     struct re_registers *regs = RMATCH_REGS(match);
1283 
1284     match_check(match);
1285     if (i < 0 || regs->num_regs <= i)
1286 	rb_raise(rb_eIndexError, "index %d out of matches", i);
1287 
1288     if (BEG(i) < 0)
1289 	return Qnil;
1290 
1291     update_char_offset(match);
1292     return INT2FIX(RMATCH(match)->rmatch->char_offset[i].end);
1293 }
1294 
1295 #define MATCH_BUSY FL_USER2
1296 
/*
 * Set the MATCH_BUSY flag on +match+.  While the flag is set,
 * rb_backref_set_string(), rb_reg_search0() and rb_reg_start_with_p() do
 * not reuse this object and allocate a fresh match instead.
 */
void
rb_match_busy(VALUE match)
{
    FL_SET(match, MATCH_BUSY);
}
1302 
/*
 * Clear the MATCH_BUSY flag on +match+, undoing rb_match_busy(): the
 * object becomes eligible again for reuse by the search functions that
 * test this flag.
 */
void
rb_match_unbusy(VALUE match)
{
    FL_UNSET(match, MATCH_BUSY);
}
1308 
1309 int
rb_match_count(VALUE match)1310 rb_match_count(VALUE match)
1311 {
1312     struct re_registers *regs;
1313     if (NIL_P(match)) return -1;
1314     regs = RMATCH_REGS(match);
1315     if (!regs) return -1;
1316     return regs->num_regs;
1317 }
1318 
1319 int
rb_match_nth_defined(int nth,VALUE match)1320 rb_match_nth_defined(int nth, VALUE match)
1321 {
1322     struct re_registers *regs;
1323     if (NIL_P(match)) return FALSE;
1324     regs = RMATCH_REGS(match);
1325     if (!regs) return FALSE;
1326     if (nth >= regs->num_regs) {
1327 	return FALSE;
1328     }
1329     if (nth < 0) {
1330 	nth += regs->num_regs;
1331 	if (nth <= 0) return FALSE;
1332     }
1333     return (BEG(nth) != -1);
1334 }
1335 
1336 static void
match_set_string(VALUE m,VALUE string,long pos,long len)1337 match_set_string(VALUE m, VALUE string, long pos, long len)
1338 {
1339     struct RMatch *match = (struct RMatch *)m;
1340     struct rmatch *rmatch = match->rmatch;
1341 
1342     match->str = string;
1343     match->regexp = Qnil;
1344     onig_region_resize(&rmatch->regs, 1);
1345     rmatch->regs.beg[0] = pos;
1346     rmatch->regs.end[0] = pos + len;
1347     rmatch->char_offset_updated = 0;
1348     OBJ_INFECT(match, string);
1349 }
1350 
1351 void
rb_backref_set_string(VALUE string,long pos,long len)1352 rb_backref_set_string(VALUE string, long pos, long len)
1353 {
1354     VALUE match = rb_backref_get();
1355     if (NIL_P(match) || FL_TEST(match, MATCH_BUSY)) {
1356 	match = match_alloc(rb_cMatch);
1357     }
1358     match_set_string(match, string, pos, len);
1359     rb_backref_set(match);
1360 }
1361 
1362 /*
1363  *  call-seq:
1364  *     rxp.fixed_encoding?   -> true or false
1365  *
1366  *  Returns false if rxp is applicable to
1367  *  a string with any ASCII compatible encoding.
1368  *  Returns true otherwise.
1369  *
1370  *      r = /a/
1371  *      r.fixed_encoding?                               #=> false
1372  *      r =~ "\u{6666} a"                               #=> 2
1373  *      r =~ "\xa1\xa2 a".force_encoding("euc-jp")      #=> 2
1374  *      r =~ "abc".force_encoding("euc-jp")             #=> 0
1375  *
1376  *      r = /a/u
1377  *      r.fixed_encoding?                               #=> true
1378  *      r.encoding                                      #=> #<Encoding:UTF-8>
1379  *      r =~ "\u{6666} a"                               #=> 2
1380  *      r =~ "\xa1\xa2".force_encoding("euc-jp")        #=> Encoding::CompatibilityError
1381  *      r =~ "abc".force_encoding("euc-jp")             #=> 0
1382  *
1383  *      r = /\u{6666}/
1384  *      r.fixed_encoding?                               #=> true
1385  *      r.encoding                                      #=> #<Encoding:UTF-8>
1386  *      r =~ "\u{6666} a"                               #=> 0
1387  *      r =~ "\xa1\xa2".force_encoding("euc-jp")        #=> Encoding::CompatibilityError
1388  *      r =~ "abc".force_encoding("euc-jp")             #=> nil
1389  */
1390 
1391 static VALUE
rb_reg_fixed_encoding_p(VALUE re)1392 rb_reg_fixed_encoding_p(VALUE re)
1393 {
1394     if (FL_TEST(re, KCODE_FIXED))
1395         return Qtrue;
1396     else
1397         return Qfalse;
1398 }
1399 
1400 static VALUE
1401 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
1402         rb_encoding **fixed_enc, onig_errmsg_buffer err);
1403 
1404 NORETURN(static void reg_enc_error(VALUE re, VALUE str));
1405 
1406 static void
reg_enc_error(VALUE re,VALUE str)1407 reg_enc_error(VALUE re, VALUE str)
1408 {
1409     rb_raise(rb_eEncCompatError,
1410 	     "incompatible encoding regexp match (%s regexp with %s string)",
1411 	     rb_enc_name(rb_enc_get(re)),
1412 	     rb_enc_name(rb_enc_get(str)));
1413 }
1414 
1415 static inline int
str_coderange(VALUE str)1416 str_coderange(VALUE str)
1417 {
1418     int cr = ENC_CODERANGE(str);
1419     if (cr == ENC_CODERANGE_UNKNOWN) {
1420 	cr = rb_enc_str_coderange(str);
1421     }
1422     return cr;
1423 }
1424 
1425 static rb_encoding*
rb_reg_prepare_enc(VALUE re,VALUE str,int warn)1426 rb_reg_prepare_enc(VALUE re, VALUE str, int warn)
1427 {
1428     rb_encoding *enc = 0;
1429     int cr = str_coderange(str);
1430 
1431     if (cr == ENC_CODERANGE_BROKEN) {
1432         rb_raise(rb_eArgError,
1433             "invalid byte sequence in %s",
1434             rb_enc_name(rb_enc_get(str)));
1435     }
1436 
1437     rb_reg_check(re);
1438     enc = rb_enc_get(str);
1439     if (RREGEXP_PTR(re)->enc == enc) {
1440     }
1441     else if (cr == ENC_CODERANGE_7BIT &&
1442 	    RREGEXP_PTR(re)->enc == rb_usascii_encoding()) {
1443 	enc = RREGEXP_PTR(re)->enc;
1444     }
1445     else if (!rb_enc_asciicompat(enc)) {
1446 	reg_enc_error(re, str);
1447     }
1448     else if (rb_reg_fixed_encoding_p(re)) {
1449         if ((!rb_enc_asciicompat(RREGEXP_PTR(re)->enc) ||
1450 	     cr != ENC_CODERANGE_7BIT)) {
1451 	    reg_enc_error(re, str);
1452 	}
1453 	enc = RREGEXP_PTR(re)->enc;
1454     }
1455     else if (warn && (RBASIC(re)->flags & REG_ENCODING_NONE) &&
1456 	enc != rb_ascii8bit_encoding() &&
1457 	cr != ENC_CODERANGE_7BIT) {
1458 	rb_warn("historical binary regexp match /.../n against %s string",
1459 		rb_enc_name(enc));
1460     }
1461     return enc;
1462 }
1463 
1464 regex_t *
rb_reg_prepare_re0(VALUE re,VALUE str,onig_errmsg_buffer err)1465 rb_reg_prepare_re0(VALUE re, VALUE str, onig_errmsg_buffer err)
1466 {
1467     regex_t *reg = RREGEXP_PTR(re);
1468     int r;
1469     OnigErrorInfo einfo;
1470     const char *pattern;
1471     VALUE unescaped;
1472     rb_encoding *fixed_enc = 0;
1473     rb_encoding *enc = rb_reg_prepare_enc(re, str, 1);
1474 
1475     if (reg->enc == enc) return reg;
1476 
1477     rb_reg_check(re);
1478     reg = RREGEXP_PTR(re);
1479     pattern = RREGEXP_SRC_PTR(re);
1480 
1481     unescaped = rb_reg_preprocess(
1482 	pattern, pattern + RREGEXP_SRC_LEN(re), enc,
1483 	&fixed_enc, err);
1484 
1485     if (unescaped == Qnil) {
1486 	rb_raise(rb_eArgError, "regexp preprocess failed: %s", err);
1487     }
1488 
1489     r = onig_new(&reg, (UChar* )RSTRING_PTR(unescaped),
1490 		 (UChar* )(RSTRING_PTR(unescaped) + RSTRING_LEN(unescaped)),
1491 		 reg->options, enc,
1492 		 OnigDefaultSyntax, &einfo);
1493     if (r) {
1494 	onig_error_code_to_str((UChar*)err, r, &einfo);
1495 	rb_reg_raise(pattern, RREGEXP_SRC_LEN(re), err, re);
1496     }
1497 
1498     RB_GC_GUARD(unescaped);
1499     return reg;
1500 }
1501 
1502 regex_t *
rb_reg_prepare_re(VALUE re,VALUE str)1503 rb_reg_prepare_re(VALUE re, VALUE str)
1504 {
1505     onig_errmsg_buffer err = "";
1506     return rb_reg_prepare_re0(re, str, err);
1507 }
1508 
1509 long
rb_reg_adjust_startpos(VALUE re,VALUE str,long pos,int reverse)1510 rb_reg_adjust_startpos(VALUE re, VALUE str, long pos, int reverse)
1511 {
1512     long range;
1513     rb_encoding *enc;
1514     UChar *p, *string;
1515 
1516     enc = rb_reg_prepare_enc(re, str, 0);
1517 
1518     if (reverse) {
1519 	range = -pos;
1520     }
1521     else {
1522 	range = RSTRING_LEN(str) - pos;
1523     }
1524 
1525     if (pos > 0 && ONIGENC_MBC_MAXLEN(enc) != 1 && pos < RSTRING_LEN(str)) {
1526 	 string = (UChar*)RSTRING_PTR(str);
1527 
1528 	 if (range > 0) {
1529 	      p = onigenc_get_right_adjust_char_head(enc, string, string + pos, string + RSTRING_LEN(str));
1530 	 }
1531 	 else {
1532 	      p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, string, string + pos, string + RSTRING_LEN(str));
1533 	 }
1534 	 return p - string;
1535     }
1536 
1537     return pos;
1538 }
1539 
1540 /* returns byte offset */
1541 long
rb_reg_search0(VALUE re,VALUE str,long pos,int reverse,int set_backref_str)1542 rb_reg_search0(VALUE re, VALUE str, long pos, int reverse, int set_backref_str)
1543 {
1544     long result;
1545     VALUE match;
1546     struct re_registers regi, *regs = &regi;
1547     char *range = RSTRING_PTR(str);
1548     regex_t *reg;
1549     int tmpreg;
1550     onig_errmsg_buffer err = "";
1551 
1552     if (pos > RSTRING_LEN(str) || pos < 0) {
1553 	rb_backref_set(Qnil);
1554 	return -1;
1555     }
1556 
1557     reg = rb_reg_prepare_re0(re, str, err);
1558     tmpreg = reg != RREGEXP_PTR(re);
1559     if (!tmpreg) RREGEXP(re)->usecnt++;
1560 
1561     match = rb_backref_get();
1562     if (!NIL_P(match)) {
1563 	if (FL_TEST(match, MATCH_BUSY)) {
1564 	    match = Qnil;
1565 	}
1566 	else {
1567 	    regs = RMATCH_REGS(match);
1568 	}
1569     }
1570     if (NIL_P(match)) {
1571 	MEMZERO(regs, struct re_registers, 1);
1572     }
1573     if (!reverse) {
1574 	range += RSTRING_LEN(str);
1575     }
1576     result = onig_search(reg,
1577 			 (UChar*)(RSTRING_PTR(str)),
1578 			 ((UChar*)(RSTRING_PTR(str)) + RSTRING_LEN(str)),
1579 			 ((UChar*)(RSTRING_PTR(str)) + pos),
1580 			 ((UChar*)range),
1581 			 regs, ONIG_OPTION_NONE);
1582     if (!tmpreg) RREGEXP(re)->usecnt--;
1583     if (tmpreg) {
1584 	if (RREGEXP(re)->usecnt) {
1585 	    onig_free(reg);
1586 	}
1587 	else {
1588 	    onig_free(RREGEXP_PTR(re));
1589 	    RREGEXP_PTR(re) = reg;
1590 	}
1591     }
1592     if (result < 0) {
1593 	if (regs == &regi)
1594 	    onig_region_free(regs, 0);
1595 	if (result == ONIG_MISMATCH) {
1596 	    rb_backref_set(Qnil);
1597 	    return result;
1598 	}
1599 	else {
1600 	    onig_error_code_to_str((UChar*)err, (int)result);
1601 	    rb_reg_raise(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), err, re);
1602 	}
1603     }
1604 
1605     if (NIL_P(match)) {
1606 	int err;
1607 	match = match_alloc(rb_cMatch);
1608 	err = rb_reg_region_copy(RMATCH_REGS(match), regs);
1609 	onig_region_free(regs, 0);
1610 	if (err) rb_memerror();
1611     }
1612     else {
1613 	FL_UNSET(match, FL_TAINT);
1614     }
1615 
1616     if (set_backref_str) {
1617 	RMATCH(match)->str = rb_str_new4(str);
1618 	OBJ_INFECT(match, str);
1619     }
1620 
1621     RMATCH(match)->regexp = re;
1622     RMATCH(match)->rmatch->char_offset_updated = 0;
1623     rb_backref_set(match);
1624 
1625     OBJ_INFECT(match, re);
1626 
1627     return result;
1628 }
1629 
1630 long
rb_reg_search(VALUE re,VALUE str,long pos,int reverse)1631 rb_reg_search(VALUE re, VALUE str, long pos, int reverse)
1632 {
1633     return rb_reg_search0(re, str, pos, reverse, 1);
1634 }
1635 
1636 bool
rb_reg_start_with_p(VALUE re,VALUE str)1637 rb_reg_start_with_p(VALUE re, VALUE str)
1638 {
1639     long result;
1640     VALUE match;
1641     struct re_registers regi, *regs = &regi;
1642     regex_t *reg;
1643     int tmpreg;
1644     onig_errmsg_buffer err = "";
1645 
1646     reg = rb_reg_prepare_re0(re, str, err);
1647     tmpreg = reg != RREGEXP_PTR(re);
1648     if (!tmpreg) RREGEXP(re)->usecnt++;
1649 
1650     match = rb_backref_get();
1651     if (!NIL_P(match)) {
1652 	if (FL_TEST(match, MATCH_BUSY)) {
1653 	    match = Qnil;
1654 	}
1655 	else {
1656 	    regs = RMATCH_REGS(match);
1657 	}
1658     }
1659     if (NIL_P(match)) {
1660 	MEMZERO(regs, struct re_registers, 1);
1661     }
1662     result = onig_match(reg,
1663 	    (UChar*)(RSTRING_PTR(str)),
1664 	    ((UChar*)(RSTRING_PTR(str)) + RSTRING_LEN(str)),
1665 	    (UChar*)(RSTRING_PTR(str)),
1666 	    regs, ONIG_OPTION_NONE);
1667     if (!tmpreg) RREGEXP(re)->usecnt--;
1668     if (tmpreg) {
1669 	if (RREGEXP(re)->usecnt) {
1670 	    onig_free(reg);
1671 	}
1672 	else {
1673 	    onig_free(RREGEXP_PTR(re));
1674 	    RREGEXP_PTR(re) = reg;
1675 	}
1676     }
1677     if (result < 0) {
1678 	if (regs == &regi)
1679 	    onig_region_free(regs, 0);
1680 	if (result == ONIG_MISMATCH) {
1681 	    rb_backref_set(Qnil);
1682 	    return false;
1683 	}
1684 	else {
1685 	    onig_error_code_to_str((UChar*)err, (int)result);
1686 	    rb_reg_raise(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), err, re);
1687 	}
1688     }
1689 
1690     if (NIL_P(match)) {
1691 	int err;
1692 	match = match_alloc(rb_cMatch);
1693 	err = rb_reg_region_copy(RMATCH_REGS(match), regs);
1694 	onig_region_free(regs, 0);
1695 	if (err) rb_memerror();
1696     }
1697     else {
1698 	FL_UNSET(match, FL_TAINT);
1699     }
1700 
1701     RMATCH(match)->str = rb_str_new4(str);
1702     OBJ_INFECT(match, str);
1703 
1704     RMATCH(match)->regexp = re;
1705     RMATCH(match)->rmatch->char_offset_updated = 0;
1706     rb_backref_set(match);
1707 
1708     OBJ_INFECT(match, re);
1709 
1710     return true;
1711 }
1712 
1713 VALUE
rb_reg_nth_defined(int nth,VALUE match)1714 rb_reg_nth_defined(int nth, VALUE match)
1715 {
1716     struct re_registers *regs;
1717     if (NIL_P(match)) return Qnil;
1718     match_check(match);
1719     regs = RMATCH_REGS(match);
1720     if (nth >= regs->num_regs) {
1721 	return Qnil;
1722     }
1723     if (nth < 0) {
1724 	nth += regs->num_regs;
1725 	if (nth <= 0) return Qnil;
1726     }
1727     if (BEG(nth) == -1) return Qfalse;
1728     return Qtrue;
1729 }
1730 
1731 VALUE
rb_reg_nth_match(int nth,VALUE match)1732 rb_reg_nth_match(int nth, VALUE match)
1733 {
1734     VALUE str;
1735     long start, end, len;
1736     struct re_registers *regs;
1737 
1738     if (NIL_P(match)) return Qnil;
1739     match_check(match);
1740     regs = RMATCH_REGS(match);
1741     if (nth >= regs->num_regs) {
1742 	return Qnil;
1743     }
1744     if (nth < 0) {
1745 	nth += regs->num_regs;
1746 	if (nth <= 0) return Qnil;
1747     }
1748     start = BEG(nth);
1749     if (start == -1) return Qnil;
1750     end = END(nth);
1751     len = end - start;
1752     str = rb_str_subseq(RMATCH(match)->str, start, len);
1753     OBJ_INFECT(str, match);
1754     return str;
1755 }
1756 
1757 VALUE
rb_reg_last_match(VALUE match)1758 rb_reg_last_match(VALUE match)
1759 {
1760     return rb_reg_nth_match(0, match);
1761 }
1762 
1763 
1764 /*
1765  *  call-seq:
1766  *     mtch.pre_match   -> str
1767  *
1768  *  Returns the portion of the original string before the current match.
1769  *  Equivalent to the special variable <code>$`</code>.
1770  *
1771  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1772  *     m.pre_match   #=> "T"
1773  */
1774 
1775 VALUE
rb_reg_match_pre(VALUE match)1776 rb_reg_match_pre(VALUE match)
1777 {
1778     VALUE str;
1779     struct re_registers *regs;
1780 
1781     if (NIL_P(match)) return Qnil;
1782     match_check(match);
1783     regs = RMATCH_REGS(match);
1784     if (BEG(0) == -1) return Qnil;
1785     str = rb_str_subseq(RMATCH(match)->str, 0, BEG(0));
1786     if (OBJ_TAINTED(match)) OBJ_TAINT(str);
1787     return str;
1788 }
1789 
1790 
1791 /*
1792  *  call-seq:
1793  *     mtch.post_match   -> str
1794  *
1795  *  Returns the portion of the original string after the current match.
1796  *  Equivalent to the special variable <code>$'</code>.
1797  *
1798  *     m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie")
1799  *     m.post_match   #=> ": The Movie"
1800  */
1801 
1802 VALUE
rb_reg_match_post(VALUE match)1803 rb_reg_match_post(VALUE match)
1804 {
1805     VALUE str;
1806     long pos;
1807     struct re_registers *regs;
1808 
1809     if (NIL_P(match)) return Qnil;
1810     match_check(match);
1811     regs = RMATCH_REGS(match);
1812     if (BEG(0) == -1) return Qnil;
1813     str = RMATCH(match)->str;
1814     pos = END(0);
1815     str = rb_str_subseq(str, pos, RSTRING_LEN(str) - pos);
1816     if (OBJ_TAINTED(match)) OBJ_TAINT(str);
1817     return str;
1818 }
1819 
1820 VALUE
rb_reg_match_last(VALUE match)1821 rb_reg_match_last(VALUE match)
1822 {
1823     int i;
1824     struct re_registers *regs;
1825 
1826     if (NIL_P(match)) return Qnil;
1827     match_check(match);
1828     regs = RMATCH_REGS(match);
1829     if (BEG(0) == -1) return Qnil;
1830 
1831     for (i=regs->num_regs-1; BEG(i) == -1 && i > 0; i--)
1832 	;
1833     if (i == 0) return Qnil;
1834     return rb_reg_nth_match(i, match);
1835 }
1836 
1837 static VALUE
last_match_getter(void)1838 last_match_getter(void)
1839 {
1840     return rb_reg_last_match(rb_backref_get());
1841 }
1842 
1843 static VALUE
prematch_getter(void)1844 prematch_getter(void)
1845 {
1846     return rb_reg_match_pre(rb_backref_get());
1847 }
1848 
1849 static VALUE
postmatch_getter(void)1850 postmatch_getter(void)
1851 {
1852     return rb_reg_match_post(rb_backref_get());
1853 }
1854 
1855 static VALUE
last_paren_match_getter(void)1856 last_paren_match_getter(void)
1857 {
1858     return rb_reg_match_last(rb_backref_get());
1859 }
1860 
1861 static VALUE
match_array(VALUE match,int start)1862 match_array(VALUE match, int start)
1863 {
1864     struct re_registers *regs;
1865     VALUE ary;
1866     VALUE target;
1867     int i;
1868     int taint = OBJ_TAINTED(match);
1869 
1870     match_check(match);
1871     regs = RMATCH_REGS(match);
1872     ary = rb_ary_new2(regs->num_regs);
1873     target = RMATCH(match)->str;
1874 
1875     for (i=start; i<regs->num_regs; i++) {
1876 	if (regs->beg[i] == -1) {
1877 	    rb_ary_push(ary, Qnil);
1878 	}
1879 	else {
1880 	    VALUE str = rb_str_subseq(target, regs->beg[i], regs->end[i]-regs->beg[i]);
1881 	    if (taint) OBJ_TAINT(str);
1882 	    rb_ary_push(ary, str);
1883 	}
1884     }
1885     return ary;
1886 }
1887 
1888 
1889 /*
1890  *  call-seq:
1891  *     mtch.to_a   -> anArray
1892  *
1893  *  Returns the array of matches.
1894  *
1895  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1896  *     m.to_a   #=> ["HX1138", "H", "X", "113", "8"]
1897  *
1898  *  Because <code>to_a</code> is called when expanding
1899  *  <code>*</code><em>variable</em>, there's a useful assignment
1900  *  shortcut for extracting matched fields. This is slightly slower than
1901  *  accessing the fields directly (as an intermediate array is
1902  *  generated).
1903  *
1904  *     all,f1,f2,f3 = * /(.)(.)(\d+)(\d)/.match("THX1138.")
1905  *     all   #=> "HX1138"
1906  *     f1    #=> "H"
1907  *     f2    #=> "X"
1908  *     f3    #=> "113"
1909  */
1910 
1911 static VALUE
match_to_a(VALUE match)1912 match_to_a(VALUE match)
1913 {
1914     return match_array(match, 0);
1915 }
1916 
1917 
1918 /*
1919  *  call-seq:
1920  *     mtch.captures   -> array
1921  *
1922  *  Returns the array of captures; equivalent to <code>mtch.to_a[1..-1]</code>.
1923  *
1924  *     f1,f2,f3,f4 = /(.)(.)(\d+)(\d)/.match("THX1138.").captures
1925  *     f1    #=> "H"
1926  *     f2    #=> "X"
1927  *     f3    #=> "113"
1928  *     f4    #=> "8"
1929  */
1930 static VALUE
match_captures(VALUE match)1931 match_captures(VALUE match)
1932 {
1933     return match_array(match, 1);
1934 }
1935 
1936 static int
name_to_backref_number(struct re_registers * regs,VALUE regexp,const char * name,const char * name_end)1937 name_to_backref_number(struct re_registers *regs, VALUE regexp, const char* name, const char* name_end)
1938 {
1939     if (NIL_P(regexp)) return -1;
1940     return onig_name_to_backref_number(RREGEXP_PTR(regexp),
1941 	(const unsigned char *)name, (const unsigned char *)name_end, regs);
1942 }
1943 
1944 NORETURN(static void name_to_backref_error(VALUE name));
/*
 * Raise IndexError reporting +name+ as an undefined group name.  Never
 * returns.
 *
 * NOTE(review): the space in "% "PRIsVALUE looks like a deliberate format
 * flag rather than a typo -- confirm against the rb_raise format
 * documentation before changing it.
 */
static void
name_to_backref_error(VALUE name)
{
    rb_raise(rb_eIndexError, "undefined group name reference: % "PRIsVALUE,
	     name);
}
1951 
/*
 * Group-name lookup guarded by two checks: yields 0 when +re+ is nil or
 * when the encodings of the regexp source and +name+ are not compatible,
 * otherwise the result of name_to_backref_number().  +re+ is evaluated
 * more than once.
 */
#define NAME_TO_NUMBER(regs, re, name, name_ptr, name_end)	\
    ((NIL_P(re) || !rb_enc_compatible(RREGEXP_SRC(re), (name))) ? 0 : \
     name_to_backref_number((regs), (re), (name_ptr), (name_end)))
1956 
1957 static int
namev_to_backref_number(struct re_registers * regs,VALUE re,VALUE name)1958 namev_to_backref_number(struct re_registers *regs, VALUE re, VALUE name)
1959 {
1960     int num;
1961 
1962     if (SYMBOL_P(name)) {
1963 	name = rb_sym2str(name);
1964     }
1965     else if (!RB_TYPE_P(name, T_STRING)) {
1966 	return -1;
1967     }
1968     num = NAME_TO_NUMBER(regs, re, name,
1969 			 RSTRING_PTR(name), RSTRING_END(name));
1970     if (num < 1) {
1971 	name_to_backref_error(name);
1972     }
1973     return num;
1974 }
1975 
1976 static VALUE
match_ary_subseq(VALUE match,long beg,long len,VALUE result)1977 match_ary_subseq(VALUE match, long beg, long len, VALUE result)
1978 {
1979     long olen = RMATCH_REGS(match)->num_regs;
1980     long j, end = olen < beg+len ? olen : beg+len;
1981     if (NIL_P(result)) result = rb_ary_new_capa(len);
1982     if (len == 0) return result;
1983 
1984     for (j = beg; j < end; j++) {
1985 	rb_ary_push(result, rb_reg_nth_match((int)j, match));
1986     }
1987     if (beg + len > j) {
1988 	rb_ary_resize(result, RARRAY_LEN(result) + (beg + len) - j);
1989     }
1990     return result;
1991 }
1992 
1993 static VALUE
match_ary_aref(VALUE match,VALUE idx,VALUE result)1994 match_ary_aref(VALUE match, VALUE idx, VALUE result)
1995 {
1996     long beg, len;
1997     int num_regs = RMATCH_REGS(match)->num_regs;
1998 
1999     /* check if idx is Range */
2000     switch (rb_range_beg_len(idx, &beg, &len, (long)num_regs, !NIL_P(result))) {
2001       case Qfalse:
2002 	if (NIL_P(result)) return rb_reg_nth_match(NUM2INT(idx), match);
2003 	rb_ary_push(result, rb_reg_nth_match(NUM2INT(idx), match));
2004 	return result;
2005       case Qnil:
2006 	return Qnil;
2007       default:
2008 	return match_ary_subseq(match, beg, len, result);
2009     }
2010 }
2011 
2012 /*
2013  *  call-seq:
2014  *     mtch[i]               -> str or nil
2015  *     mtch[start, length]   -> array
2016  *     mtch[range]           -> array
2017  *     mtch[name]            -> str or nil
2018  *
2019  *  Match Reference -- <code>MatchData</code> acts as an array, and may be
2020  *  accessed using the normal array indexing techniques.  <code>mtch[0]</code>
2021  *  is equivalent to the special variable <code>$&</code>, and returns the
2022  *  entire matched string.  <code>mtch[1]</code>, <code>mtch[2]</code>, and so
2023  *  on return the values of the matched backreferences (portions of the
2024  *  pattern between parentheses).
2025  *
2026  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
2027  *     m          #=> #<MatchData "HX1138" 1:"H" 2:"X" 3:"113" 4:"8">
2028  *     m[0]       #=> "HX1138"
2029  *     m[1, 2]    #=> ["H", "X"]
2030  *     m[1..3]    #=> ["H", "X", "113"]
2031  *     m[-3, 2]   #=> ["X", "113"]
2032  *
2033  *     m = /(?<foo>a+)b/.match("ccaaab")
2034  *     m          #=> #<MatchData "aaab" foo:"aaa">
2035  *     m["foo"]   #=> "aaa"
2036  *     m[:foo]    #=> "aaa"
2037  */
2038 
2039 static VALUE
match_aref(int argc,VALUE * argv,VALUE match)2040 match_aref(int argc, VALUE *argv, VALUE match)
2041 {
2042     VALUE idx, length;
2043 
2044     match_check(match);
2045     rb_scan_args(argc, argv, "11", &idx, &length);
2046 
2047     if (NIL_P(length)) {
2048 	if (FIXNUM_P(idx)) {
2049 	    return rb_reg_nth_match(FIX2INT(idx), match);
2050 	}
2051 	else {
2052 	    int num = namev_to_backref_number(RMATCH_REGS(match), RMATCH(match)->regexp, idx);
2053 	    if (num >= 0) {
2054 		return rb_reg_nth_match(num, match);
2055 	    }
2056 	    else {
2057 		return match_ary_aref(match, idx, Qnil);
2058 	    }
2059 	}
2060     }
2061     else {
2062 	long beg = NUM2LONG(idx);
2063 	long len = NUM2LONG(length);
2064 	long num_regs = RMATCH_REGS(match)->num_regs;
2065 	if (len < 0) {
2066 	    return Qnil;
2067 	}
2068 	if (beg < 0) {
2069 	    beg += num_regs;
2070 	    if (beg < 0) return Qnil;
2071 	}
2072 	else if (beg > num_regs) {
2073 	    return Qnil;
2074 	}
2075 	else if (beg+len > num_regs) {
2076 	    len = num_regs - beg;
2077 	}
2078 	return match_ary_subseq(match, beg, len, Qnil);
2079     }
2080 }
2081 
2082 /*
2083  *  call-seq:
2084  *
2085  *     mtch.values_at(index, ...)   -> array
2086  *
2087  *  Uses each <i>index</i> to access the matching values, returning an array of
2088  *  the corresponding matches.
2089  *
2090  *     m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie")
2091  *     m.to_a               #=> ["HX1138", "H", "X", "113", "8"]
2092  *     m.values_at(0, 2, -2)   #=> ["HX1138", "X", "113"]
2093  *
2094  *     m = /(?<a>\d+) *(?<op>[+\-*\/]) *(?<b>\d+)/.match("1 + 2")
2095  *     m.to_a               #=> ["1 + 2", "1", "+", "2"]
2096  *     m.values_at(:a, :b, :op) #=> ["1", "2", "+"]
2097  */
2098 
2099 static VALUE
match_values_at(int argc,VALUE * argv,VALUE match)2100 match_values_at(int argc, VALUE *argv, VALUE match)
2101 {
2102     VALUE result;
2103     int i;
2104 
2105     match_check(match);
2106     result = rb_ary_new2(argc);
2107 
2108     for (i=0; i<argc; i++) {
2109 	if (FIXNUM_P(argv[i])) {
2110 	    rb_ary_push(result, rb_reg_nth_match(FIX2INT(argv[i]), match));
2111 	}
2112 	else {
2113 	    int num = namev_to_backref_number(RMATCH_REGS(match), RMATCH(match)->regexp, argv[i]);
2114 	    if (num >= 0) {
2115 		rb_ary_push(result, rb_reg_nth_match(num, match));
2116 	    }
2117 	    else {
2118 		match_ary_aref(match, argv[i], result);
2119 	    }
2120 	}
2121     }
2122     return result;
2123 }
2124 
2125 
2126 /*
2127  *  call-seq:
2128  *     mtch.to_s   -> str
2129  *
2130  *  Returns the entire matched string.
2131  *
2132  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
2133  *     m.to_s   #=> "HX1138"
2134  */
2135 
2136 static VALUE
match_to_s(VALUE match)2137 match_to_s(VALUE match)
2138 {
2139     VALUE str = rb_reg_last_match(match);
2140 
2141     match_check(match);
2142     if (NIL_P(str)) str = rb_str_new(0,0);
2143     if (OBJ_TAINTED(match)) OBJ_TAINT(str);
2144     if (OBJ_TAINTED(RMATCH(match)->str)) OBJ_TAINT(str);
2145     return str;
2146 }
2147 
2148 static int
match_named_captures_iter(const OnigUChar * name,const OnigUChar * name_end,int back_num,int * back_refs,OnigRegex regex,void * arg)2149 match_named_captures_iter(const OnigUChar *name, const OnigUChar *name_end,
2150 	int back_num, int *back_refs, OnigRegex regex, void *arg) {
2151     struct MEMO *memo = MEMO_CAST(arg);
2152     VALUE hash = memo->v1;
2153     VALUE match = memo->v2;
2154 
2155     VALUE key = rb_enc_str_new((const char *)name, name_end-name, regex->enc);
2156     VALUE value;
2157 
2158     int i;
2159     int found = 0;
2160 
2161     for (i = 0; i < back_num; i++) {
2162 	value = rb_reg_nth_match(back_refs[i], match);
2163 	if (RTEST(value)) {
2164 	    rb_hash_aset(hash, key, value);
2165 	    found = 1;
2166 	}
2167     }
2168 
2169     if (found == 0) {
2170 	rb_hash_aset(hash, key, Qnil);
2171     }
2172 
2173     return 0;
2174 }
2175 
2176 /*
2177  *  call-seq:
2178  *     mtch.named_captures -> hash
2179  *
 *  Returns a Hash of the named captures.
 *
 *  Each key of the hash is the name of a named capture group.
 *  Each value is the string of the last successful capture of the
 *  corresponding group, or +nil+ if the group did not match.
2185  *
2186  *     m = /(?<a>.)(?<b>.)/.match("01")
2187  *     m.named_captures #=> {"a" => "0", "b" => "1"}
2188  *
2189  *     m = /(?<a>.)(?<b>.)?/.match("0")
2190  *     m.named_captures #=> {"a" => "0", "b" => nil}
2191  *
2192  *     m = /(?<a>.)(?<a>.)/.match("01")
2193  *     m.named_captures #=> {"a" => "1"}
2194  *
2195  *     m = /(?<a>x)|(?<a>y)/.match("x")
2196  *     m.named_captures #=> {"a" => "x"}
2197  */
2198 
2199 static VALUE
match_named_captures(VALUE match)2200 match_named_captures(VALUE match)
2201 {
2202     VALUE hash;
2203     struct MEMO *memo;
2204 
2205     match_check(match);
2206     if (NIL_P(RMATCH(match)->regexp))
2207 	return rb_hash_new();
2208 
2209     hash = rb_hash_new();
2210     memo = MEMO_NEW(hash, match, 0);
2211 
2212     onig_foreach_name(RREGEXP(RMATCH(match)->regexp)->ptr, match_named_captures_iter, (void*)memo);
2213 
2214     return hash;
2215 }
2216 
2217 /*
2218  *  call-seq:
2219  *     mtch.string   -> str
2220  *
2221  *  Returns a frozen copy of the string passed in to <code>match</code>.
2222  *
2223  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
2224  *     m.string   #=> "THX1138."
2225  */
2226 
2227 static VALUE
match_string(VALUE match)2228 match_string(VALUE match)
2229 {
2230     match_check(match);
2231     return RMATCH(match)->str;	/* str is frozen */
2232 }
2233 
2234 struct backref_name_tag {
2235     const UChar *name;
2236     long len;
2237 };
2238 
2239 static int
match_inspect_name_iter(const OnigUChar * name,const OnigUChar * name_end,int back_num,int * back_refs,OnigRegex regex,void * arg0)2240 match_inspect_name_iter(const OnigUChar *name, const OnigUChar *name_end,
2241           int back_num, int *back_refs, OnigRegex regex, void *arg0)
2242 {
2243     struct backref_name_tag *arg = (struct backref_name_tag *)arg0;
2244     int i;
2245 
2246     for (i = 0; i < back_num; i++) {
2247         arg[back_refs[i]].name = name;
2248         arg[back_refs[i]].len = name_end - name;
2249     }
2250     return 0;
2251 }
2252 
2253 /*
2254  * call-seq:
2255  *    mtch.inspect   -> str
2256  *
2257  * Returns a printable version of <i>mtch</i>.
2258  *
2259  *     puts /.$/.match("foo").inspect
2260  *     #=> #<MatchData "o">
2261  *
2262  *     puts /(.)(.)(.)/.match("foo").inspect
2263  *     #=> #<MatchData "foo" 1:"f" 2:"o" 3:"o">
2264  *
2265  *     puts /(.)(.)?(.)/.match("fo").inspect
2266  *     #=> #<MatchData "fo" 1:"f" 2:nil 3:"o">
2267  *
2268  *     puts /(?<foo>.)(?<bar>.)(?<baz>.)/.match("hoge").inspect
2269  *     #=> #<MatchData "hog" foo:"h" bar:"o" baz:"g">
2270  *
2271  */
2272 
2273 static VALUE
match_inspect(VALUE match)2274 match_inspect(VALUE match)
2275 {
2276     VALUE cname = rb_class_path(rb_obj_class(match));
2277     VALUE str;
2278     int i;
2279     struct re_registers *regs = RMATCH_REGS(match);
2280     int num_regs = regs->num_regs;
2281     struct backref_name_tag *names;
2282     VALUE regexp = RMATCH(match)->regexp;
2283 
2284     if (regexp == 0) {
2285         return rb_sprintf("#<%"PRIsVALUE":%p>", cname, (void*)match);
2286     }
2287     else if (NIL_P(regexp)) {
2288         return rb_sprintf("#<%"PRIsVALUE": %"PRIsVALUE">",
2289 			  cname, rb_reg_nth_match(0, match));
2290     }
2291 
2292     names = ALLOCA_N(struct backref_name_tag, num_regs);
2293     MEMZERO(names, struct backref_name_tag, num_regs);
2294 
2295     onig_foreach_name(RREGEXP_PTR(regexp),
2296             match_inspect_name_iter, names);
2297 
2298     str = rb_str_buf_new2("#<");
2299     rb_str_append(str, cname);
2300 
2301     for (i = 0; i < num_regs; i++) {
2302         VALUE v;
2303         rb_str_buf_cat2(str, " ");
2304         if (0 < i) {
2305             if (names[i].name)
2306                 rb_str_buf_cat(str, (const char *)names[i].name, names[i].len);
2307             else {
2308                 rb_str_catf(str, "%d", i);
2309             }
2310             rb_str_buf_cat2(str, ":");
2311         }
2312         v = rb_reg_nth_match(i, match);
2313         if (v == Qnil)
2314             rb_str_buf_cat2(str, "nil");
2315         else
2316             rb_str_buf_append(str, rb_str_inspect(v));
2317     }
2318     rb_str_buf_cat2(str, ">");
2319 
2320     return str;
2321 }
2322 
2323 VALUE rb_cRegexp;
2324 
2325 static int
read_escaped_byte(const char ** pp,const char * end,onig_errmsg_buffer err)2326 read_escaped_byte(const char **pp, const char *end, onig_errmsg_buffer err)
2327 {
2328     const char *p = *pp;
2329     int code;
2330     int meta_prefix = 0, ctrl_prefix = 0;
2331     size_t len;
2332 
2333     if (p == end || *p++ != '\\') {
2334         errcpy(err, "too short escaped multibyte character");
2335         return -1;
2336     }
2337 
2338 again:
2339     if (p == end) {
2340         errcpy(err, "too short escape sequence");
2341         return -1;
2342     }
2343     switch (*p++) {
2344       case '\\': code = '\\'; break;
2345       case 'n': code = '\n'; break;
2346       case 't': code = '\t'; break;
2347       case 'r': code = '\r'; break;
2348       case 'f': code = '\f'; break;
2349       case 'v': code = '\013'; break;
2350       case 'a': code = '\007'; break;
2351       case 'e': code = '\033'; break;
2352 
2353       /* \OOO */
2354       case '0': case '1': case '2': case '3':
2355       case '4': case '5': case '6': case '7':
2356         p--;
2357         code = scan_oct(p, end < p+3 ? end-p : 3, &len);
2358         p += len;
2359         break;
2360 
2361       case 'x': /* \xHH */
2362         code = scan_hex(p, end < p+2 ? end-p : 2, &len);
2363         if (len < 1) {
2364             errcpy(err, "invalid hex escape");
2365             return -1;
2366         }
2367         p += len;
2368         break;
2369 
2370       case 'M': /* \M-X, \M-\C-X, \M-\cX */
2371         if (meta_prefix) {
2372             errcpy(err, "duplicate meta escape");
2373             return -1;
2374         }
2375         meta_prefix = 1;
2376         if (p+1 < end && *p++ == '-' && (*p & 0x80) == 0) {
2377             if (*p == '\\') {
2378                 p++;
2379                 goto again;
2380             }
2381             else {
2382                 code = *p++;
2383                 break;
2384             }
2385         }
2386         errcpy(err, "too short meta escape");
2387         return -1;
2388 
2389       case 'C': /* \C-X, \C-\M-X */
2390         if (p == end || *p++ != '-') {
2391             errcpy(err, "too short control escape");
2392             return -1;
2393         }
2394       case 'c': /* \cX, \c\M-X */
2395         if (ctrl_prefix) {
2396             errcpy(err, "duplicate control escape");
2397             return -1;
2398         }
2399         ctrl_prefix = 1;
2400         if (p < end && (*p & 0x80) == 0) {
2401             if (*p == '\\') {
2402                 p++;
2403                 goto again;
2404             }
2405             else {
2406                 code = *p++;
2407                 break;
2408             }
2409         }
2410         errcpy(err, "too short control escape");
2411         return -1;
2412 
2413       default:
2414         errcpy(err, "unexpected escape sequence");
2415         return -1;
2416     }
2417     if (code < 0 || 0xff < code) {
2418         errcpy(err, "invalid escape code");
2419         return -1;
2420     }
2421 
2422     if (ctrl_prefix)
2423         code &= 0x1f;
2424     if (meta_prefix)
2425         code |= 0x80;
2426 
2427     *pp = p;
2428     return code;
2429 }
2430 
2431 static int
unescape_escaped_nonascii(const char ** pp,const char * end,rb_encoding * enc,VALUE buf,rb_encoding ** encp,onig_errmsg_buffer err)2432 unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc,
2433         VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
2434 {
2435     const char *p = *pp;
2436     int chmaxlen = rb_enc_mbmaxlen(enc);
2437     unsigned char *area = ALLOCA_N(unsigned char, chmaxlen);
2438     char *chbuf = (char *)area;
2439     int chlen = 0;
2440     int byte;
2441     int l;
2442 
2443     memset(chbuf, 0, chmaxlen);
2444 
2445     byte = read_escaped_byte(&p, end, err);
2446     if (byte == -1) {
2447         return -1;
2448     }
2449 
2450     area[chlen++] = byte;
2451     while (chlen < chmaxlen &&
2452            MBCLEN_NEEDMORE_P(rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc))) {
2453         byte = read_escaped_byte(&p, end, err);
2454         if (byte == -1) {
2455             return -1;
2456         }
2457         area[chlen++] = byte;
2458     }
2459 
2460     l = rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc);
2461     if (MBCLEN_INVALID_P(l)) {
2462         errcpy(err, "invalid multibyte escape");
2463         return -1;
2464     }
2465     if (1 < chlen || (area[0] & 0x80)) {
2466         rb_str_buf_cat(buf, chbuf, chlen);
2467 
2468         if (*encp == 0)
2469             *encp = enc;
2470         else if (*encp != enc) {
2471             errcpy(err, "escaped non ASCII character in UTF-8 regexp");
2472             return -1;
2473         }
2474     }
2475     else {
2476         char escbuf[5];
2477         snprintf(escbuf, sizeof(escbuf), "\\x%02X", area[0]&0xff);
2478         rb_str_buf_cat(buf, escbuf, 4);
2479     }
2480     *pp = p;
2481     return 0;
2482 }
2483 
2484 static int
check_unicode_range(unsigned long code,onig_errmsg_buffer err)2485 check_unicode_range(unsigned long code, onig_errmsg_buffer err)
2486 {
2487     if ((0xd800 <= code && code <= 0xdfff) || /* Surrogates */
2488         0x10ffff < code) {
2489         errcpy(err, "invalid Unicode range");
2490         return -1;
2491     }
2492     return 0;
2493 }
2494 
2495 static int
append_utf8(unsigned long uv,VALUE buf,rb_encoding ** encp,onig_errmsg_buffer err)2496 append_utf8(unsigned long uv,
2497         VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
2498 {
2499     if (check_unicode_range(uv, err) != 0)
2500         return -1;
2501     if (uv < 0x80) {
2502         char escbuf[5];
2503         snprintf(escbuf, sizeof(escbuf), "\\x%02X", (int)uv);
2504         rb_str_buf_cat(buf, escbuf, 4);
2505     }
2506     else {
2507         int len;
2508         char utf8buf[6];
2509         len = rb_uv_to_utf8(utf8buf, uv);
2510         rb_str_buf_cat(buf, utf8buf, len);
2511 
2512         if (*encp == 0)
2513             *encp = rb_utf8_encoding();
2514         else if (*encp != rb_utf8_encoding()) {
2515             errcpy(err, "UTF-8 character in non UTF-8 regexp");
2516             return -1;
2517         }
2518     }
2519     return 0;
2520 }
2521 
2522 static int
unescape_unicode_list(const char ** pp,const char * end,VALUE buf,rb_encoding ** encp,onig_errmsg_buffer err)2523 unescape_unicode_list(const char **pp, const char *end,
2524         VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
2525 {
2526     const char *p = *pp;
2527     int has_unicode = 0;
2528     unsigned long code;
2529     size_t len;
2530 
2531     while (p < end && ISSPACE(*p)) p++;
2532 
2533     while (1) {
2534         code = ruby_scan_hex(p, end-p, &len);
2535         if (len == 0)
2536             break;
2537         if (6 < len) { /* max 10FFFF */
2538             errcpy(err, "invalid Unicode range");
2539             return -1;
2540         }
2541         p += len;
2542         if (append_utf8(code, buf, encp, err) != 0)
2543             return -1;
2544         has_unicode = 1;
2545 
2546         while (p < end && ISSPACE(*p)) p++;
2547     }
2548 
2549     if (has_unicode == 0) {
2550         errcpy(err, "invalid Unicode list");
2551         return -1;
2552     }
2553 
2554     *pp = p;
2555 
2556     return 0;
2557 }
2558 
2559 static int
unescape_unicode_bmp(const char ** pp,const char * end,VALUE buf,rb_encoding ** encp,onig_errmsg_buffer err)2560 unescape_unicode_bmp(const char **pp, const char *end,
2561         VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
2562 {
2563     const char *p = *pp;
2564     size_t len;
2565     unsigned long code;
2566 
2567     if (end < p+4) {
2568         errcpy(err, "invalid Unicode escape");
2569         return -1;
2570     }
2571     code = ruby_scan_hex(p, 4, &len);
2572     if (len != 4) {
2573         errcpy(err, "invalid Unicode escape");
2574         return -1;
2575     }
2576     if (append_utf8(code, buf, encp, err) != 0)
2577         return -1;
2578     *pp = p + 4;
2579     return 0;
2580 }
2581 
2582 static int
unescape_nonascii(const char * p,const char * end,rb_encoding * enc,VALUE buf,rb_encoding ** encp,int * has_property,onig_errmsg_buffer err)2583 unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
2584         VALUE buf, rb_encoding **encp, int *has_property,
2585         onig_errmsg_buffer err)
2586 {
2587     unsigned char c;
2588     char smallbuf[2];
2589 
2590     while (p < end) {
2591         int chlen = rb_enc_precise_mbclen(p, end, enc);
2592         if (!MBCLEN_CHARFOUND_P(chlen)) {
2593           invalid_multibyte:
2594             errcpy(err, "invalid multibyte character");
2595             return -1;
2596         }
2597         chlen = MBCLEN_CHARFOUND_LEN(chlen);
2598         if (1 < chlen || (*p & 0x80)) {
2599           multibyte:
2600             rb_str_buf_cat(buf, p, chlen);
2601             p += chlen;
2602             if (*encp == 0)
2603                 *encp = enc;
2604             else if (*encp != enc) {
2605                 errcpy(err, "non ASCII character in UTF-8 regexp");
2606                 return -1;
2607             }
2608             continue;
2609         }
2610 
2611         switch (c = *p++) {
2612           case '\\':
2613             if (p == end) {
2614                 errcpy(err, "too short escape sequence");
2615                 return -1;
2616             }
2617             chlen = rb_enc_precise_mbclen(p, end, enc);
2618             if (!MBCLEN_CHARFOUND_P(chlen)) {
2619                 goto invalid_multibyte;
2620             }
2621             if ((chlen = MBCLEN_CHARFOUND_LEN(chlen)) > 1) {
2622 		/* include the previous backslash */
2623                 --p;
2624                 ++chlen;
2625                 goto multibyte;
2626             }
2627             switch (c = *p++) {
2628               case '1': case '2': case '3':
2629               case '4': case '5': case '6': case '7': /* \O, \OO, \OOO or backref */
2630                 {
2631                     size_t len = end-(p-1), octlen;
2632                     if (ruby_scan_oct(p-1, len < 3 ? len : 3, &octlen) <= 0177) {
2633                         /* backref or 7bit octal.
2634                            no need to unescape anyway.
2635                            re-escaping may break backref */
2636                         goto escape_asis;
2637                     }
2638                 }
2639                 /* xxx: How about more than 199 subexpressions? */
2640 
2641               case '0': /* \0, \0O, \0OO */
2642 
2643               case 'x': /* \xHH */
2644               case 'c': /* \cX, \c\M-X */
2645               case 'C': /* \C-X, \C-\M-X */
2646               case 'M': /* \M-X, \M-\C-X, \M-\cX */
2647                 p = p-2;
2648 		if (enc == rb_usascii_encoding()) {
2649 		    const char *pbeg = p;
2650                     int byte = read_escaped_byte(&p, end, err);
2651                     if (byte == -1) return -1;
2652                     c = byte;
2653 		    rb_str_buf_cat(buf, pbeg, p-pbeg);
2654 		}
2655 		else {
2656 		    if (unescape_escaped_nonascii(&p, end, enc, buf, encp, err) != 0)
2657 			return -1;
2658 		}
2659                 break;
2660 
2661               case 'u':
2662                 if (p == end) {
2663                     errcpy(err, "too short escape sequence");
2664                     return -1;
2665                 }
2666                 if (*p == '{') {
2667                     /* \u{H HH HHH HHHH HHHHH HHHHHH ...} */
2668                     p++;
2669                     if (unescape_unicode_list(&p, end, buf, encp, err) != 0)
2670                         return -1;
2671                     if (p == end || *p++ != '}') {
2672                         errcpy(err, "invalid Unicode list");
2673                         return -1;
2674                     }
2675                     break;
2676                 }
2677                 else {
2678                     /* \uHHHH */
2679                     if (unescape_unicode_bmp(&p, end, buf, encp, err) != 0)
2680                         return -1;
2681                     break;
2682                 }
2683 
2684               case 'p': /* \p{Hiragana} */
2685               case 'P':
2686                 if (!*encp) {
2687                     *has_property = 1;
2688                 }
2689                 goto escape_asis;
2690 
2691               default: /* \n, \\, \d, \9, etc. */
2692 escape_asis:
2693                 smallbuf[0] = '\\';
2694                 smallbuf[1] = c;
2695                 rb_str_buf_cat(buf, smallbuf, 2);
2696                 break;
2697             }
2698             break;
2699 
2700           default:
2701             rb_str_buf_cat(buf, (char *)&c, 1);
2702             break;
2703         }
2704     }
2705 
2706     return 0;
2707 }
2708 
2709 static VALUE
rb_reg_preprocess(const char * p,const char * end,rb_encoding * enc,rb_encoding ** fixed_enc,onig_errmsg_buffer err)2710 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
2711         rb_encoding **fixed_enc, onig_errmsg_buffer err)
2712 {
2713     VALUE buf;
2714     int has_property = 0;
2715 
2716     buf = rb_str_buf_new(0);
2717 
2718     if (rb_enc_asciicompat(enc))
2719         *fixed_enc = 0;
2720     else {
2721         *fixed_enc = enc;
2722         rb_enc_associate(buf, enc);
2723     }
2724 
2725     if (unescape_nonascii(p, end, enc, buf, fixed_enc, &has_property, err) != 0)
2726         return Qnil;
2727 
2728     if (has_property && !*fixed_enc) {
2729         *fixed_enc = enc;
2730     }
2731 
2732     if (*fixed_enc) {
2733         rb_enc_associate(buf, *fixed_enc);
2734     }
2735 
2736     return buf;
2737 }
2738 
2739 VALUE
rb_reg_check_preprocess(VALUE str)2740 rb_reg_check_preprocess(VALUE str)
2741 {
2742     rb_encoding *fixed_enc = 0;
2743     onig_errmsg_buffer err = "";
2744     VALUE buf;
2745     char *p, *end;
2746     rb_encoding *enc;
2747 
2748     StringValue(str);
2749     p = RSTRING_PTR(str);
2750     end = p + RSTRING_LEN(str);
2751     enc = rb_enc_get(str);
2752 
2753     buf = rb_reg_preprocess(p, end, enc, &fixed_enc, err);
2754     RB_GC_GUARD(str);
2755 
2756     if (buf == Qnil) {
2757 	return rb_reg_error_desc(str, 0, err);
2758     }
2759     return Qnil;
2760 }
2761 
2762 static VALUE
rb_reg_preprocess_dregexp(VALUE ary,int options)2763 rb_reg_preprocess_dregexp(VALUE ary, int options)
2764 {
2765     rb_encoding *fixed_enc = 0;
2766     rb_encoding *regexp_enc = 0;
2767     onig_errmsg_buffer err = "";
2768     int i;
2769     VALUE result = 0;
2770     rb_encoding *ascii8bit = rb_ascii8bit_encoding();
2771 
2772     if (RARRAY_LEN(ary) == 0) {
2773         rb_raise(rb_eArgError, "no arguments given");
2774     }
2775 
2776     for (i = 0; i < RARRAY_LEN(ary); i++) {
2777         VALUE str = RARRAY_AREF(ary, i);
2778         VALUE buf;
2779         char *p, *end;
2780         rb_encoding *src_enc;
2781 
2782 	src_enc = rb_enc_get(str);
2783 	if (options & ARG_ENCODING_NONE &&
2784 		src_enc != ascii8bit) {
2785 	    if (str_coderange(str) != ENC_CODERANGE_7BIT)
2786 		rb_raise(rb_eRegexpError, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
2787 	    else
2788 		src_enc = ascii8bit;
2789 	}
2790 
2791         StringValue(str);
2792         p = RSTRING_PTR(str);
2793         end = p + RSTRING_LEN(str);
2794 
2795         buf = rb_reg_preprocess(p, end, src_enc, &fixed_enc, err);
2796 
2797         if (buf == Qnil)
2798             rb_raise(rb_eArgError, "%s", err);
2799 
2800         if (fixed_enc != 0) {
2801             if (regexp_enc != 0 && regexp_enc != fixed_enc) {
2802                 rb_raise(rb_eRegexpError, "encoding mismatch in dynamic regexp : %s and %s",
2803                          rb_enc_name(regexp_enc), rb_enc_name(fixed_enc));
2804             }
2805             regexp_enc = fixed_enc;
2806         }
2807 
2808         if (!result)
2809             result = rb_str_new3(str);
2810         else
2811             rb_str_buf_append(result, str);
2812     }
2813     if (regexp_enc) {
2814         rb_enc_associate(result, regexp_enc);
2815     }
2816 
2817     return result;
2818 }
2819 
2820 static int
rb_reg_initialize(VALUE obj,const char * s,long len,rb_encoding * enc,int options,onig_errmsg_buffer err,const char * sourcefile,int sourceline)2821 rb_reg_initialize(VALUE obj, const char *s, long len, rb_encoding *enc,
2822 		  int options, onig_errmsg_buffer err,
2823 		  const char *sourcefile, int sourceline)
2824 {
2825     struct RRegexp *re = RREGEXP(obj);
2826     VALUE unescaped;
2827     rb_encoding *fixed_enc = 0;
2828     rb_encoding *a_enc = rb_ascii8bit_encoding();
2829 
2830     rb_check_frozen(obj);
2831     if (FL_TEST(obj, REG_LITERAL))
2832 	rb_raise(rb_eSecurityError, "can't modify literal regexp");
2833     if (re->ptr)
2834         rb_raise(rb_eTypeError, "already initialized regexp");
2835     re->ptr = 0;
2836 
2837     if (rb_enc_dummy_p(enc)) {
2838 	errcpy(err, "can't make regexp with dummy encoding");
2839 	return -1;
2840     }
2841 
2842     unescaped = rb_reg_preprocess(s, s+len, enc, &fixed_enc, err);
2843     if (unescaped == Qnil)
2844         return -1;
2845 
2846     if (fixed_enc) {
2847 	if ((fixed_enc != enc && (options & ARG_ENCODING_FIXED)) ||
2848             (fixed_enc != a_enc && (options & ARG_ENCODING_NONE))) {
2849 	    errcpy(err, "incompatible character encoding");
2850 	    return -1;
2851 	}
2852         if (fixed_enc != a_enc) {
2853 	    options |= ARG_ENCODING_FIXED;
2854 	    enc = fixed_enc;
2855 	}
2856     }
2857     else if (!(options & ARG_ENCODING_FIXED)) {
2858        enc = rb_usascii_encoding();
2859     }
2860 
2861     rb_enc_associate((VALUE)re, enc);
2862     if ((options & ARG_ENCODING_FIXED) || fixed_enc) {
2863 	re->basic.flags |= KCODE_FIXED;
2864     }
2865     if (options & ARG_ENCODING_NONE) {
2866         re->basic.flags |= REG_ENCODING_NONE;
2867     }
2868 
2869     re->ptr = make_regexp(RSTRING_PTR(unescaped), RSTRING_LEN(unescaped), enc,
2870 			  options & ARG_REG_OPTION_MASK, err,
2871 			  sourcefile, sourceline);
2872     if (!re->ptr) return -1;
2873     RB_GC_GUARD(unescaped);
2874     return 0;
2875 }
2876 
2877 static void
reg_set_source(VALUE reg,VALUE str,rb_encoding * enc)2878 reg_set_source(VALUE reg, VALUE str, rb_encoding *enc)
2879 {
2880     rb_encoding *regenc = rb_enc_get(reg);
2881     if (regenc != enc) {
2882 	str = rb_enc_associate(rb_str_dup(str), enc = regenc);
2883     }
2884     RB_OBJ_WRITE(reg, &RREGEXP(reg)->src, rb_fstring(str));
2885 }
2886 
2887 static int
rb_reg_initialize_str(VALUE obj,VALUE str,int options,onig_errmsg_buffer err,const char * sourcefile,int sourceline)2888 rb_reg_initialize_str(VALUE obj, VALUE str, int options, onig_errmsg_buffer err,
2889 	const char *sourcefile, int sourceline)
2890 {
2891     int ret;
2892     rb_encoding *str_enc = rb_enc_get(str), *enc = str_enc;
2893     if (options & ARG_ENCODING_NONE) {
2894         rb_encoding *ascii8bit = rb_ascii8bit_encoding();
2895         if (enc != ascii8bit) {
2896             if (str_coderange(str) != ENC_CODERANGE_7BIT) {
2897                 errcpy(err, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
2898                 return -1;
2899             }
2900             enc = ascii8bit;
2901         }
2902     }
2903     ret = rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), enc,
2904 			    options, err, sourcefile, sourceline);
2905     OBJ_INFECT(obj, str);
2906     if (ret == 0) reg_set_source(obj, str, str_enc);
2907     return ret;
2908 }
2909 
2910 static VALUE
rb_reg_s_alloc(VALUE klass)2911 rb_reg_s_alloc(VALUE klass)
2912 {
2913     NEWOBJ_OF(re, struct RRegexp, klass, T_REGEXP | (RGENGC_WB_PROTECTED_REGEXP ? FL_WB_PROTECTED : 0));
2914 
2915     re->ptr = 0;
2916     RB_OBJ_WRITE(re, &re->src, 0);
2917     re->usecnt = 0;
2918 
2919     return (VALUE)re;
2920 }
2921 
2922 VALUE
rb_reg_alloc(void)2923 rb_reg_alloc(void)
2924 {
2925     return rb_reg_s_alloc(rb_cRegexp);
2926 }
2927 
2928 VALUE
rb_reg_new_str(VALUE s,int options)2929 rb_reg_new_str(VALUE s, int options)
2930 {
2931     return rb_reg_init_str(rb_reg_alloc(), s, options);
2932 }
2933 
2934 VALUE
rb_reg_init_str(VALUE re,VALUE s,int options)2935 rb_reg_init_str(VALUE re, VALUE s, int options)
2936 {
2937     onig_errmsg_buffer err = "";
2938 
2939     if (rb_reg_initialize_str(re, s, options, err, NULL, 0) != 0) {
2940 	rb_reg_raise_str(s, options, err);
2941     }
2942 
2943     return re;
2944 }
2945 
2946 static VALUE
rb_reg_init_str_enc(VALUE re,VALUE s,rb_encoding * enc,int options)2947 rb_reg_init_str_enc(VALUE re, VALUE s, rb_encoding *enc, int options)
2948 {
2949     onig_errmsg_buffer err = "";
2950 
2951     if (rb_reg_initialize(re, RSTRING_PTR(s), RSTRING_LEN(s),
2952 			  enc, options, err, NULL, 0) != 0) {
2953 	rb_reg_raise_str(s, options, err);
2954     }
2955     reg_set_source(re, s, enc);
2956 
2957     return re;
2958 }
2959 
2960 MJIT_FUNC_EXPORTED VALUE
rb_reg_new_ary(VALUE ary,int opt)2961 rb_reg_new_ary(VALUE ary, int opt)
2962 {
2963     return rb_reg_new_str(rb_reg_preprocess_dregexp(ary, opt), opt);
2964 }
2965 
2966 VALUE
rb_enc_reg_new(const char * s,long len,rb_encoding * enc,int options)2967 rb_enc_reg_new(const char *s, long len, rb_encoding *enc, int options)
2968 {
2969     VALUE re = rb_reg_alloc();
2970     onig_errmsg_buffer err = "";
2971 
2972     if (rb_reg_initialize(re, s, len, enc, options, err, NULL, 0) != 0) {
2973 	rb_enc_reg_raise(s, len, enc, options, err);
2974     }
2975     RB_OBJ_WRITE(re, &RREGEXP(re)->src, rb_fstring(rb_enc_str_new(s, len, enc)));
2976 
2977     return re;
2978 }
2979 
2980 VALUE
rb_reg_new(const char * s,long len,int options)2981 rb_reg_new(const char *s, long len, int options)
2982 {
2983     return rb_enc_reg_new(s, len, rb_ascii8bit_encoding(), options);
2984 }
2985 
2986 VALUE
rb_reg_compile(VALUE str,int options,const char * sourcefile,int sourceline)2987 rb_reg_compile(VALUE str, int options, const char *sourcefile, int sourceline)
2988 {
2989     VALUE re = rb_reg_alloc();
2990     onig_errmsg_buffer err = "";
2991 
2992     if (!str) str = rb_str_new(0,0);
2993     if (rb_reg_initialize_str(re, str, options, err, sourcefile, sourceline) != 0) {
2994 	rb_set_errinfo(rb_reg_error_desc(str, options, err));
2995 	return Qnil;
2996     }
2997     FL_SET(re, REG_LITERAL);
2998     return re;
2999 }
3000 
3001 static VALUE reg_cache;
3002 
3003 VALUE
rb_reg_regcomp(VALUE str)3004 rb_reg_regcomp(VALUE str)
3005 {
3006     if (reg_cache && RREGEXP_SRC_LEN(reg_cache) == RSTRING_LEN(str)
3007 	&& ENCODING_GET(reg_cache) == ENCODING_GET(str)
3008 	&& memcmp(RREGEXP_SRC_PTR(reg_cache), RSTRING_PTR(str), RSTRING_LEN(str)) == 0)
3009 	return reg_cache;
3010 
3011     return reg_cache = rb_reg_new_str(str, 0);
3012 }
3013 
3014 static st_index_t reg_hash(VALUE re);
3015 /*
3016  * call-seq:
3017  *   rxp.hash   -> integer
3018  *
3019  * Produce a hash based on the text and options of this regular expression.
3020  *
3021  * See also Object#hash.
3022  */
3023 
3024 static VALUE
rb_reg_hash(VALUE re)3025 rb_reg_hash(VALUE re)
3026 {
3027     st_index_t hashval = reg_hash(re);
3028     return ST2FIX(hashval);
3029 }
3030 
3031 static st_index_t
reg_hash(VALUE re)3032 reg_hash(VALUE re)
3033 {
3034     st_index_t hashval;
3035 
3036     rb_reg_check(re);
3037     hashval = RREGEXP_PTR(re)->options;
3038     hashval = rb_hash_uint(hashval, rb_memhash(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re)));
3039     return rb_hash_end(hashval);
3040 }
3041 
3042 
3043 /*
3044  *  call-seq:
3045  *     rxp == other_rxp      -> true or false
3046  *     rxp.eql?(other_rxp)   -> true or false
3047  *
3048  *  Equality---Two regexps are equal if their patterns are identical, they have
3049  *  the same character set code, and their <code>casefold?</code> values are the
3050  *  same.
3051  *
3052  *     /abc/  == /abc/x   #=> false
3053  *     /abc/  == /abc/i   #=> false
3054  *     /abc/  == /abc/u   #=> false
3055  *     /abc/u == /abc/n   #=> false
3056  */
3057 
3058 static VALUE
rb_reg_equal(VALUE re1,VALUE re2)3059 rb_reg_equal(VALUE re1, VALUE re2)
3060 {
3061     if (re1 == re2) return Qtrue;
3062     if (!RB_TYPE_P(re2, T_REGEXP)) return Qfalse;
3063     rb_reg_check(re1); rb_reg_check(re2);
3064     if (FL_TEST(re1, KCODE_FIXED) != FL_TEST(re2, KCODE_FIXED)) return Qfalse;
3065     if (RREGEXP_PTR(re1)->options != RREGEXP_PTR(re2)->options) return Qfalse;
3066     if (RREGEXP_SRC_LEN(re1) != RREGEXP_SRC_LEN(re2)) return Qfalse;
3067     if (ENCODING_GET(re1) != ENCODING_GET(re2)) return Qfalse;
3068     if (memcmp(RREGEXP_SRC_PTR(re1), RREGEXP_SRC_PTR(re2), RREGEXP_SRC_LEN(re1)) == 0) {
3069 	return Qtrue;
3070     }
3071     return Qfalse;
3072 }
3073 
3074 /*
3075  * call-seq:
3076  *    mtch.hash   -> integer
3077  *
3078  * Produce a hash based on the target string, regexp and matched
3079  * positions of this matchdata.
3080  *
3081  * See also Object#hash.
3082  */
3083 
3084 static VALUE
match_hash(VALUE match)3085 match_hash(VALUE match)
3086 {
3087     const struct re_registers *regs;
3088     st_index_t hashval;
3089 
3090     match_check(match);
3091     hashval = rb_hash_start(rb_str_hash(RMATCH(match)->str));
3092     hashval = rb_hash_uint(hashval, reg_hash(match_regexp(match)));
3093     regs = RMATCH_REGS(match);
3094     hashval = rb_hash_uint(hashval, regs->num_regs);
3095     hashval = rb_hash_uint(hashval, rb_memhash(regs->beg, regs->num_regs * sizeof(*regs->beg)));
3096     hashval = rb_hash_uint(hashval, rb_memhash(regs->end, regs->num_regs * sizeof(*regs->end)));
3097     hashval = rb_hash_end(hashval);
3098     return ST2FIX(hashval);
3099 }
3100 
3101 /*
3102  * call-seq:
3103  *    mtch == mtch2   -> true or false
3104  *    mtch.eql?(mtch2)   -> true or false
3105  *
3106  *  Equality---Two matchdata are equal if their target strings,
3107  *  patterns, and matched positions are identical.
3108  */
3109 
3110 static VALUE
match_equal(VALUE match1,VALUE match2)3111 match_equal(VALUE match1, VALUE match2)
3112 {
3113     const struct re_registers *regs1, *regs2;
3114 
3115     if (match1 == match2) return Qtrue;
3116     if (!RB_TYPE_P(match2, T_MATCH)) return Qfalse;
3117     if (!RMATCH(match1)->regexp || !RMATCH(match2)->regexp) return Qfalse;
3118     if (!rb_str_equal(RMATCH(match1)->str, RMATCH(match2)->str)) return Qfalse;
3119     if (!rb_reg_equal(match_regexp(match1), match_regexp(match2))) return Qfalse;
3120     regs1 = RMATCH_REGS(match1);
3121     regs2 = RMATCH_REGS(match2);
3122     if (regs1->num_regs != regs2->num_regs) return Qfalse;
3123     if (memcmp(regs1->beg, regs2->beg, regs1->num_regs * sizeof(*regs1->beg))) return Qfalse;
3124     if (memcmp(regs1->end, regs2->end, regs1->num_regs * sizeof(*regs1->end))) return Qfalse;
3125     return Qtrue;
3126 }
3127 
3128 static VALUE
reg_operand(VALUE s,int check)3129 reg_operand(VALUE s, int check)
3130 {
3131     if (SYMBOL_P(s)) {
3132 	return rb_sym2str(s);
3133     }
3134     else if (RB_TYPE_P(s, T_STRING)) {
3135         return s;
3136     }
3137     else {
3138         return check ? rb_str_to_str(s) : rb_check_string_type(s);
3139     }
3140 }
3141 
3142 static long
reg_match_pos(VALUE re,VALUE * strp,long pos)3143 reg_match_pos(VALUE re, VALUE *strp, long pos)
3144 {
3145     VALUE str = *strp;
3146 
3147     if (NIL_P(str)) {
3148 	rb_backref_set(Qnil);
3149 	return -1;
3150     }
3151     *strp = str = reg_operand(str, TRUE);
3152     if (pos != 0) {
3153 	if (pos < 0) {
3154 	    VALUE l = rb_str_length(str);
3155 	    pos += NUM2INT(l);
3156 	    if (pos < 0) {
3157 		return pos;
3158 	    }
3159 	}
3160 	pos = rb_str_offset(str, pos);
3161     }
3162     return rb_reg_search(re, str, pos, 0);
3163 }
3164 
3165 /*
3166  *  call-seq:
3167  *     rxp =~ str    -> integer or nil
3168  *
3169  *  Match---Matches <i>rxp</i> against <i>str</i>.
3170  *
3171  *     /at/ =~ "input data"   #=> 7
3172  *     /ax/ =~ "input data"   #=> nil
3173  *
3174  *  If <code>=~</code> is used with a regexp literal with named captures,
3175  *  the captured strings (or nil) are assigned to local variables named
3176  *  after the captures.
3177  *
3178  *     /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "  x = y  "
3179  *     p lhs    #=> "x"
3180  *     p rhs    #=> "y"
3181  *
3182  *  If it is not matched, nil is assigned for the variables.
3183  *
3184  *     /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "  x = "
3185  *     p lhs    #=> nil
3186  *     p rhs    #=> nil
3187  *
3188  *  This assignment is implemented in the Ruby parser.
3189  *  The parser detects 'regexp-literal =~ expression' for the assignment.
3190  *  The regexp must be a literal without interpolation and placed at left hand side.
3191  *
3192  *  The assignment does not occur if the regexp is not a literal.
3193  *
3194  *     re = /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/
3195  *     re =~ "  x = y  "
3196  *     p lhs    # undefined local variable
3197  *     p rhs    # undefined local variable
3198  *
3199  *  A regexp interpolation, <code>#{}</code>, also disables
3200  *  the assignment.
3201  *
3202  *     rhs_pat = /(?<rhs>\w+)/
3203  *     /(?<lhs>\w+)\s*=\s*#{rhs_pat}/ =~ "x = y"
3204  *     p lhs    # undefined local variable
3205  *
3206  *  The assignment does not occur if the regexp is placed at the right hand side.
3207  *
3208  *    "  x = y  " =~ /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/
3209  *    p lhs, rhs # undefined local variable
3210  *
3211  */
3212 
3213 VALUE
rb_reg_match(VALUE re,VALUE str)3214 rb_reg_match(VALUE re, VALUE str)
3215 {
3216     long pos = reg_match_pos(re, &str, 0);
3217     if (pos < 0) return Qnil;
3218     pos = rb_str_sublen(str, pos);
3219     return LONG2FIX(pos);
3220 }
3221 
3222 /*
3223  *  call-seq:
3224  *     rxp === str   -> true or false
3225  *
3226  *  Case Equality---Used in case statements.
3227  *
3228  *     a = "HELLO"
3229  *     case a
3230  *     when /\A[a-z]*\z/; print "Lower case\n"
3231  *     when /\A[A-Z]*\z/; print "Upper case\n"
3232  *     else;              print "Mixed case\n"
3233  *     end
3234  *     #=> "Upper case"
3235  *
3236  *  Following a regular expression literal with the #=== operator allows you to
3237  *  compare against a String.
3238  *
3239  *	/^[a-z]*$/ === "HELLO" #=> false
3240  *	/^[A-Z]*$/ === "HELLO" #=> true
3241  */
3242 
3243 VALUE
rb_reg_eqq(VALUE re,VALUE str)3244 rb_reg_eqq(VALUE re, VALUE str)
3245 {
3246     long start;
3247 
3248     str = reg_operand(str, FALSE);
3249     if (NIL_P(str)) {
3250 	rb_backref_set(Qnil);
3251 	return Qfalse;
3252     }
3253     start = rb_reg_search(re, str, 0, 0);
3254     if (start < 0) {
3255 	return Qfalse;
3256     }
3257     return Qtrue;
3258 }
3259 
3260 
3261 /*
3262  *  call-seq:
3263  *     ~ rxp   -> integer or nil
3264  *
3265  *  Match---Matches <i>rxp</i> against the contents of <code>$_</code>.
3266  *  Equivalent to <code><i>rxp</i> =~ $_</code>.
3267  *
3268  *     $_ = "input data"
3269  *     ~ /at/   #=> 7
3270  */
3271 
3272 VALUE
rb_reg_match2(VALUE re)3273 rb_reg_match2(VALUE re)
3274 {
3275     long start;
3276     VALUE line = rb_lastline_get();
3277 
3278     if (!RB_TYPE_P(line, T_STRING)) {
3279 	rb_backref_set(Qnil);
3280 	return Qnil;
3281     }
3282 
3283     start = rb_reg_search(re, line, 0, 0);
3284     if (start < 0) {
3285 	return Qnil;
3286     }
3287     start = rb_str_sublen(line, start);
3288     return LONG2FIX(start);
3289 }
3290 
3291 
3292 /*
3293  *  call-seq:
3294  *     rxp.match(str)       -> matchdata or nil
3295  *     rxp.match(str,pos)   -> matchdata or nil
3296  *
3297  *  Returns a <code>MatchData</code> object describing the match, or
3298  *  <code>nil</code> if there was no match. This is equivalent to retrieving the
3299  *  value of the special variable <code>$~</code> following a normal match.
3300  *  If the second parameter is present, it specifies the position in the string
3301  *  to begin the search.
3302  *
3303  *     /(.)(.)(.)/.match("abc")[2]   #=> "b"
3304  *     /(.)(.)/.match("abc", 1)[2]   #=> "c"
3305  *
3306  *  If a block is given and the match succeeds, the block is invoked with the
3307  *  MatchData, so that you can write
3308  *
3309  *     /M(.*)/.match("Matz") do |m|
3310  *       puts m[0]
3311  *       puts m[1]
3312  *     end
3313  *
3314  *  instead of
3315  *
3316  *     if m = /M(.*)/.match("Matz")
3317  *       puts m[0]
3318  *       puts m[1]
3319  *     end
3320  *
3321  *  The return value is a value from block execution in this case.
3322  */
3323 
3324 static VALUE
rb_reg_match_m(int argc,VALUE * argv,VALUE re)3325 rb_reg_match_m(int argc, VALUE *argv, VALUE re)
3326 {
3327     VALUE result, str, initpos;
3328     long pos;
3329 
3330     if (rb_scan_args(argc, argv, "11", &str, &initpos) == 2) {
3331 	pos = NUM2LONG(initpos);
3332     }
3333     else {
3334 	pos = 0;
3335     }
3336 
3337     pos = reg_match_pos(re, &str, pos);
3338     if (pos < 0) {
3339 	rb_backref_set(Qnil);
3340 	return Qnil;
3341     }
3342     result = rb_backref_get();
3343     rb_match_busy(result);
3344     if (!NIL_P(result) && rb_block_given_p()) {
3345 	return rb_yield(result);
3346     }
3347     return result;
3348 }
3349 
3350 /*
3351  *  call-seq:
3352  *     rxp.match?(str)       -> true or false
3353  *     rxp.match?(str,pos)   -> true or false
3354  *
3355  *  Returns <code>true</code> or <code>false</code> to indicate whether the
3356  *  regexp matches, without updating $~ and other related variables.
3357  *  If the second parameter is present, it specifies the position in the string
3358  *  to begin the search.
3359  *
3360  *     /R.../.match?("Ruby")    #=> true
3361  *     /R.../.match?("Ruby", 1) #=> false
3362  *     /P.../.match?("Ruby")    #=> false
3363  *     $&                       #=> nil
3364  */
3365 
3366 static VALUE
rb_reg_match_m_p(int argc,VALUE * argv,VALUE re)3367 rb_reg_match_m_p(int argc, VALUE *argv, VALUE re)
3368 {
3369     long pos = rb_check_arity(argc, 1, 2) > 1 ? NUM2LONG(argv[1]) : 0;
3370     return rb_reg_match_p(re, argv[0], pos);
3371 }
3372 
3373 VALUE
rb_reg_match_p(VALUE re,VALUE str,long pos)3374 rb_reg_match_p(VALUE re, VALUE str, long pos)
3375 {
3376     regex_t *reg;
3377     onig_errmsg_buffer err = "";
3378     OnigPosition result;
3379     const UChar *start, *end;
3380     int tmpreg;
3381 
3382     if (NIL_P(str)) return Qfalse;
3383     str = SYMBOL_P(str) ? rb_sym2str(str) : StringValue(str);
3384     if (pos) {
3385 	if (pos < 0) {
3386 	    pos += NUM2LONG(rb_str_length(str));
3387 	    if (pos < 0) return Qfalse;
3388 	}
3389 	if (pos > 0) {
3390 	    long len = 1;
3391 	    const char *beg = rb_str_subpos(str, pos, &len);
3392 	    if (!beg) return Qfalse;
3393 	    pos = beg - RSTRING_PTR(str);
3394 	}
3395     }
3396     reg = rb_reg_prepare_re0(re, str, err);
3397     tmpreg = reg != RREGEXP_PTR(re);
3398     if (!tmpreg) RREGEXP(re)->usecnt++;
3399     start = ((UChar*)RSTRING_PTR(str));
3400     end = start + RSTRING_LEN(str);
3401     result = onig_search(reg, start, end, start + pos, end,
3402 			 NULL, ONIG_OPTION_NONE);
3403     if (!tmpreg) RREGEXP(re)->usecnt--;
3404     if (tmpreg) {
3405 	if (RREGEXP(re)->usecnt) {
3406 	    onig_free(reg);
3407 	}
3408 	else {
3409 	    onig_free(RREGEXP_PTR(re));
3410 	    RREGEXP_PTR(re) = reg;
3411 	}
3412     }
3413     if (result < 0) {
3414 	if (result == ONIG_MISMATCH) {
3415 	    return Qfalse;
3416 	}
3417 	else {
3418 	    onig_error_code_to_str((UChar*)err, (int)result);
3419 	    rb_reg_raise(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), err, re);
3420 	}
3421     }
3422     return Qtrue;
3423 }
3424 
3425 /*
3426  * Document-method: compile
3427  *
3428  * Alias for <code>Regexp.new</code>
3429  */
3430 
3431 /*
3432  *  call-seq:
3433  *     Regexp.new(string, [options])       -> regexp
3434  *     Regexp.new(regexp)                  -> regexp
3435  *     Regexp.compile(string, [options])   -> regexp
3436  *     Regexp.compile(regexp)              -> regexp
3437  *
3438  *  Constructs a new regular expression from +pattern+, which can be either a
3439  *  String or a Regexp (in which case that regexp's options are propagated),
3440  *  and new options may not be specified (a change as of Ruby 1.8).
3441  *
3442  *  If +options+ is an Integer, it should be one or more of the constants
3443  *  Regexp::EXTENDED, Regexp::IGNORECASE, and Regexp::MULTILINE,
3444  *  <em>or</em>-ed together.  Otherwise, if +options+ is not
3445  *  +nil+ or +false+, the regexp will be case insensitive.
3446  *
3447  *    r1 = Regexp.new('^a-z+:\\s+\w+') #=> /^a-z+:\s+\w+/
3448  *    r2 = Regexp.new('cat', true)     #=> /cat/i
3449  *    r3 = Regexp.new(r2)              #=> /cat/i
3450  *    r4 = Regexp.new('dog', Regexp::EXTENDED | Regexp::IGNORECASE) #=> /dog/ix
3451  */
3452 
3453 static VALUE
rb_reg_initialize_m(int argc,VALUE * argv,VALUE self)3454 rb_reg_initialize_m(int argc, VALUE *argv, VALUE self)
3455 {
3456     int flags = 0;
3457     VALUE str;
3458     rb_encoding *enc = 0;
3459 
3460     rb_check_arity(argc, 1, 3);
3461     if (RB_TYPE_P(argv[0], T_REGEXP)) {
3462 	VALUE re = argv[0];
3463 
3464 	if (argc > 1) {
3465 	    rb_warn("flags ignored");
3466 	}
3467 	rb_reg_check(re);
3468 	flags = rb_reg_options(re);
3469 	str = RREGEXP_SRC(re);
3470     }
3471     else {
3472 	if (argc >= 2) {
3473 	    if (FIXNUM_P(argv[1])) flags = FIX2INT(argv[1]);
3474 	    else if (RTEST(argv[1])) flags = ONIG_OPTION_IGNORECASE;
3475 	}
3476 	if (argc == 3 && !NIL_P(argv[2])) {
3477 	    char *kcode = StringValuePtr(argv[2]);
3478 	    if (kcode[0] == 'n' || kcode[0] == 'N') {
3479 		enc = rb_ascii8bit_encoding();
3480 		flags |= ARG_ENCODING_NONE;
3481 	    }
3482 	    else {
3483 		rb_warn("encoding option is ignored - %s", kcode);
3484 	    }
3485 	}
3486 	str = StringValue(argv[0]);
3487     }
3488     if (enc && rb_enc_get(str) != enc)
3489 	rb_reg_init_str_enc(self, str, enc, flags);
3490     else
3491 	rb_reg_init_str(self, str, flags);
3492     return self;
3493 }
3494 
3495 VALUE
rb_reg_quote(VALUE str)3496 rb_reg_quote(VALUE str)
3497 {
3498     rb_encoding *enc = rb_enc_get(str);
3499     char *s, *send, *t;
3500     VALUE tmp;
3501     int c, clen;
3502     int ascii_only = rb_enc_str_asciionly_p(str);
3503 
3504     s = RSTRING_PTR(str);
3505     send = s + RSTRING_LEN(str);
3506     while (s < send) {
3507         c = rb_enc_ascget(s, send, &clen, enc);
3508 	if (c == -1) {
3509             s += mbclen(s, send, enc);
3510 	    continue;
3511 	}
3512 	switch (c) {
3513 	  case '[': case ']': case '{': case '}':
3514 	  case '(': case ')': case '|': case '-':
3515 	  case '*': case '.': case '\\':
3516 	  case '?': case '+': case '^': case '$':
3517 	  case ' ': case '#':
3518 	  case '\t': case '\f': case '\v': case '\n': case '\r':
3519 	    goto meta_found;
3520 	}
3521         s += clen;
3522     }
3523     tmp = rb_str_new3(str);
3524     if (ascii_only) {
3525         rb_enc_associate(tmp, rb_usascii_encoding());
3526     }
3527     return tmp;
3528 
3529   meta_found:
3530     tmp = rb_str_new(0, RSTRING_LEN(str)*2);
3531     if (ascii_only) {
3532         rb_enc_associate(tmp, rb_usascii_encoding());
3533     }
3534     else {
3535         rb_enc_copy(tmp, str);
3536     }
3537     t = RSTRING_PTR(tmp);
3538     /* copy upto metacharacter */
3539     memcpy(t, RSTRING_PTR(str), s - RSTRING_PTR(str));
3540     t += s - RSTRING_PTR(str);
3541 
3542     while (s < send) {
3543         c = rb_enc_ascget(s, send, &clen, enc);
3544 	if (c == -1) {
3545 	    int n = mbclen(s, send, enc);
3546 
3547 	    while (n--)
3548 		*t++ = *s++;
3549 	    continue;
3550 	}
3551         s += clen;
3552 	switch (c) {
3553 	  case '[': case ']': case '{': case '}':
3554 	  case '(': case ')': case '|': case '-':
3555 	  case '*': case '.': case '\\':
3556 	  case '?': case '+': case '^': case '$':
3557 	  case '#':
3558             t += rb_enc_mbcput('\\', t, enc);
3559 	    break;
3560 	  case ' ':
3561             t += rb_enc_mbcput('\\', t, enc);
3562             t += rb_enc_mbcput(' ', t, enc);
3563 	    continue;
3564 	  case '\t':
3565             t += rb_enc_mbcput('\\', t, enc);
3566             t += rb_enc_mbcput('t', t, enc);
3567 	    continue;
3568 	  case '\n':
3569             t += rb_enc_mbcput('\\', t, enc);
3570             t += rb_enc_mbcput('n', t, enc);
3571 	    continue;
3572 	  case '\r':
3573             t += rb_enc_mbcput('\\', t, enc);
3574             t += rb_enc_mbcput('r', t, enc);
3575 	    continue;
3576 	  case '\f':
3577             t += rb_enc_mbcput('\\', t, enc);
3578             t += rb_enc_mbcput('f', t, enc);
3579 	    continue;
3580 	  case '\v':
3581             t += rb_enc_mbcput('\\', t, enc);
3582             t += rb_enc_mbcput('v', t, enc);
3583 	    continue;
3584 	}
3585         t += rb_enc_mbcput(c, t, enc);
3586     }
3587     rb_str_resize(tmp, t - RSTRING_PTR(tmp));
3588     OBJ_INFECT(tmp, str);
3589     return tmp;
3590 }
3591 
3592 
3593 /*
3594  *  call-seq:
3595  *     Regexp.escape(str)   -> string
3596  *     Regexp.quote(str)    -> string
3597  *
3598  *  Escapes any characters that would have special meaning in a regular
3599  *  expression. Returns a new escaped string, or self if no characters are
3600  *  escaped.  For any string,
3601  *  <code>Regexp.new(Regexp.escape(<i>str</i>))=~<i>str</i></code> will be true.
3602  *
3603  *     Regexp.escape('\*?{}.')   #=> \\\*\?\{\}\.
3604  *
3605  */
3606 
3607 static VALUE
rb_reg_s_quote(VALUE c,VALUE str)3608 rb_reg_s_quote(VALUE c, VALUE str)
3609 {
3610     return rb_reg_quote(reg_operand(str, TRUE));
3611 }
3612 
3613 int
rb_reg_options(VALUE re)3614 rb_reg_options(VALUE re)
3615 {
3616     int options;
3617 
3618     rb_reg_check(re);
3619     options = RREGEXP_PTR(re)->options & ARG_REG_OPTION_MASK;
3620     if (RBASIC(re)->flags & KCODE_FIXED) options |= ARG_ENCODING_FIXED;
3621     if (RBASIC(re)->flags & REG_ENCODING_NONE) options |= ARG_ENCODING_NONE;
3622     return options;
3623 }
3624 
3625 VALUE
rb_check_regexp_type(VALUE re)3626 rb_check_regexp_type(VALUE re)
3627 {
3628     return rb_check_convert_type(re, T_REGEXP, "Regexp", "to_regexp");
3629 }
3630 
3631 /*
3632  *  call-seq:
3633  *     Regexp.try_convert(obj) -> re or nil
3634  *
3635  *  Try to convert <i>obj</i> into a Regexp, using to_regexp method.
3636  *  Returns converted regexp or nil if <i>obj</i> cannot be converted
3637  *  for any reason.
3638  *
3639  *     Regexp.try_convert(/re/)         #=> /re/
3640  *     Regexp.try_convert("re")         #=> nil
3641  *
3642  *     o = Object.new
3643  *     Regexp.try_convert(o)            #=> nil
3644  *     def o.to_regexp() /foo/ end
3645  *     Regexp.try_convert(o)            #=> /foo/
3646  *
3647  */
3648 static VALUE
rb_reg_s_try_convert(VALUE dummy,VALUE re)3649 rb_reg_s_try_convert(VALUE dummy, VALUE re)
3650 {
3651     return rb_check_regexp_type(re);
3652 }
3653 
3654 static VALUE
rb_reg_s_union(VALUE self,VALUE args0)3655 rb_reg_s_union(VALUE self, VALUE args0)
3656 {
3657     long argc = RARRAY_LEN(args0);
3658 
3659     if (argc == 0) {
3660         VALUE args[1];
3661         args[0] = rb_str_new2("(?!)");
3662         return rb_class_new_instance(1, args, rb_cRegexp);
3663     }
3664     else if (argc == 1) {
3665         VALUE arg = rb_ary_entry(args0, 0);
3666         VALUE re = rb_check_regexp_type(arg);
3667         if (!NIL_P(re))
3668             return re;
3669         else {
3670             VALUE quoted;
3671             quoted = rb_reg_s_quote(Qnil, arg);
3672             return rb_reg_new_str(quoted, 0);
3673         }
3674     }
3675     else {
3676 	int i;
3677 	VALUE source = rb_str_buf_new(0);
3678 	rb_encoding *result_enc;
3679 
3680         int has_asciionly = 0;
3681         rb_encoding *has_ascii_compat_fixed = 0;
3682         rb_encoding *has_ascii_incompat = 0;
3683 
3684 	for (i = 0; i < argc; i++) {
3685 	    volatile VALUE v;
3686 	    VALUE e = rb_ary_entry(args0, i);
3687 
3688 	    if (0 < i)
3689 		rb_str_buf_cat_ascii(source, "|");
3690 
3691 	    v = rb_check_regexp_type(e);
3692 	    if (!NIL_P(v)) {
3693                 rb_encoding *enc = rb_enc_get(v);
3694                 if (!rb_enc_asciicompat(enc)) {
3695                     if (!has_ascii_incompat)
3696                         has_ascii_incompat = enc;
3697                     else if (has_ascii_incompat != enc)
3698                         rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
3699                             rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
3700                 }
3701                 else if (rb_reg_fixed_encoding_p(v)) {
3702                     if (!has_ascii_compat_fixed)
3703                         has_ascii_compat_fixed = enc;
3704                     else if (has_ascii_compat_fixed != enc)
3705                         rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
3706                             rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
3707                 }
3708                 else {
3709                     has_asciionly = 1;
3710                 }
3711 		v = rb_reg_str_with_term(v, -1);
3712 	    }
3713 	    else {
3714                 rb_encoding *enc;
3715                 StringValue(e);
3716                 enc = rb_enc_get(e);
3717                 if (!rb_enc_asciicompat(enc)) {
3718                     if (!has_ascii_incompat)
3719                         has_ascii_incompat = enc;
3720                     else if (has_ascii_incompat != enc)
3721                         rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
3722                             rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
3723                 }
3724                 else if (rb_enc_str_asciionly_p(e)) {
3725                     has_asciionly = 1;
3726                 }
3727                 else {
3728                     if (!has_ascii_compat_fixed)
3729                         has_ascii_compat_fixed = enc;
3730                     else if (has_ascii_compat_fixed != enc)
3731                         rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
3732                             rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
3733                 }
3734 		v = rb_reg_s_quote(Qnil, e);
3735 	    }
3736             if (has_ascii_incompat) {
3737                 if (has_asciionly) {
3738                     rb_raise(rb_eArgError, "ASCII incompatible encoding: %s",
3739                         rb_enc_name(has_ascii_incompat));
3740                 }
3741                 if (has_ascii_compat_fixed) {
3742                     rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
3743                         rb_enc_name(has_ascii_incompat), rb_enc_name(has_ascii_compat_fixed));
3744                 }
3745             }
3746 
3747             if (i == 0) {
3748                 rb_enc_copy(source, v);
3749             }
3750 	    rb_str_append(source, v);
3751 	}
3752 
3753         if (has_ascii_incompat) {
3754             result_enc = has_ascii_incompat;
3755         }
3756         else if (has_ascii_compat_fixed) {
3757             result_enc = has_ascii_compat_fixed;
3758         }
3759         else {
3760             result_enc = rb_ascii8bit_encoding();
3761         }
3762 
3763         rb_enc_associate(source, result_enc);
3764         return rb_class_new_instance(1, &source, rb_cRegexp);
3765     }
3766 }
3767 
3768 /*
3769  *  call-seq:
3770  *     Regexp.union(pat1, pat2, ...)            -> new_regexp
3771  *     Regexp.union(pats_ary)                   -> new_regexp
3772  *
3773  *  Return a <code>Regexp</code> object that is the union of the given
3774  *  <em>pattern</em>s, i.e., will match any of its parts. The <em>pattern</em>s
3775  *  can be Regexp objects, in which case their options will be preserved, or
3776  *  Strings. If no patterns are given, returns <code>/(?!)/</code>.
3777  *  The behavior is unspecified if any given <em>pattern</em> contains capture.
3778  *
3779  *     Regexp.union                         #=> /(?!)/
3780  *     Regexp.union("penzance")             #=> /penzance/
3781  *     Regexp.union("a+b*c")                #=> /a\+b\*c/
3782  *     Regexp.union("skiing", "sledding")   #=> /skiing|sledding/
3783  *     Regexp.union(["skiing", "sledding"]) #=> /skiing|sledding/
3784  *     Regexp.union(/dogs/, /cats/i)        #=> /(?-mix:dogs)|(?i-mx:cats)/
3785  *
3786  *  Note: ::union tries to convert each argument into a regular expression
3787  *  via #to_regexp.
3788  */
3789 static VALUE
rb_reg_s_union_m(VALUE self,VALUE args)3790 rb_reg_s_union_m(VALUE self, VALUE args)
3791 {
3792     VALUE v;
3793     if (RARRAY_LEN(args) == 1 &&
3794         !NIL_P(v = rb_check_array_type(rb_ary_entry(args, 0)))) {
3795         return rb_reg_s_union(self, v);
3796     }
3797     return rb_reg_s_union(self, args);
3798 }
3799 
3800 /* :nodoc: */
3801 static VALUE
rb_reg_init_copy(VALUE copy,VALUE re)3802 rb_reg_init_copy(VALUE copy, VALUE re)
3803 {
3804     if (!OBJ_INIT_COPY(copy, re)) return copy;
3805     rb_reg_check(re);
3806     return rb_reg_init_str(copy, RREGEXP_SRC(re), rb_reg_options(re));
3807 }
3808 
/*
 * Expands the backslash sequences of the replacement template STR against
 * a match described by REGS on the subject string SRC (REGEXP is the
 * pattern, and may be nil).
 *
 * Recognized sequences:
 *   \1..\9    numbered group (only when REGEXP is non-nil and
 *             onig_noname_group_capture_is_active() holds; otherwise the
 *             sequence is dropped)
 *   \k<name>  named group; raises via name_to_backref_error() for an
 *             unknown name and RuntimeError for a missing '>'
 *   \0, \&    the whole match
 *   \`        the part of SRC before the match
 *   \'        the part of SRC after the match
 *   \+        the highest-numbered group that participated in the match
 *   \\        a single backslash
 * Any other escaped character is copied through together with its
 * backslash.
 *
 * Returns STR itself when no backslash sequence is found (the result
 * buffer is only created on the first one), otherwise the new string.
 */
3809 VALUE
rb_reg_regsub(VALUE str,VALUE src,struct re_registers * regs,VALUE regexp)3810 rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp)
3811 {
3812     VALUE val = 0;
3813     char *p, *s, *e;
3814     int no, clen;
3815     rb_encoding *str_enc = rb_enc_get(str);
3816     rb_encoding *src_enc = rb_enc_get(src);
3817     int acompat = rb_enc_asciicompat(str_enc);
/* Fetch one ASCII character at s (or -1 if it is not ASCII), storing its
 * byte length in *cl.  For ASCII-compatible encodings the first byte is
 * inspected directly; otherwise rb_enc_ascget() is used. */
3818 #define ASCGET(s,e,cl) (acompat ? (*(cl)=1,ISASCII((s)[0])?(s)[0]:-1) : rb_enc_ascget((s), (e), (cl), str_enc))
3819 
    /* p: start of template text not yet copied; s: scan cursor; e: end. */
3820     p = s = RSTRING_PTR(str);
3821     e = s + RSTRING_LEN(str);
3822 
3823     while (s < e) {
3824         int c = ASCGET(s, e, &clen);
3825 	char *ss;
3826 
3827 	if (c == -1) {
3828 	    s += mbclen(s, e, str_enc);
3829 	    continue;
3830 	}
	/* ss remembers where the (possible) backslash starts. */
3831 	ss = s;
3832         s += clen;
3833 
	/* Only a backslash that is not the last character starts a sequence. */
3834 	if (c != '\\' || s == e) continue;
3835 
3836 	if (!val) {
3837 	    val = rb_str_buf_new(ss-p);
3838 	}
	/* Flush the literal text that precedes the backslash. */
3839         rb_enc_str_buf_cat(val, p, ss-p, str_enc);
3840 
3841         c = ASCGET(s, e, &clen);
3842         if (c == -1) {
	    /* Backslash followed by a non-ASCII character: copy both as-is. */
3843             s += mbclen(s, e, str_enc);
3844 	    rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
3845             p = s;
3846 	    continue;
3847         }
3848         s += clen;
3849 
	/* From here on the sequence is considered consumed; branches that
	 * "continue" without appending anything therefore drop it. */
3850 	p = s;
3851 	switch (c) {
3852 	  case '1': case '2': case '3': case '4':
3853 	  case '5': case '6': case '7': case '8': case '9':
3854             if (!NIL_P(regexp) && onig_noname_group_capture_is_active(RREGEXP_PTR(regexp))) {
3855                 no = c - '0';
3856             }
3857             else {
3858                 continue;
3859             }
3860 	    break;
3861 
3862           case 'k':
3863             if (s < e && ASCGET(s, e, &clen) == '<') {
3864                 char *name, *name_end;
3865 
		/* Scan forward to the closing '>' of \k<name>. */
3866                 name_end = name = s + clen;
3867                 while (name_end < e) {
3868                     c = ASCGET(name_end, e, &clen);
3869                     if (c == '>') break;
3870                     name_end += c == -1 ? mbclen(name_end, e, str_enc) : clen;
3871                 }
3872                 if (name_end < e) {
3873 		    VALUE n = rb_str_subseq(str, (long)(name - RSTRING_PTR(str)),
3874 					    (long)(name_end - name));
3875 		    if ((no = NAME_TO_NUMBER(regs, regexp, n, name, name_end)) < 1) {
3876 			name_to_backref_error(n);
3877 		    }
		    /* Resume scanning just past the '>'. */
3878                     p = s = name_end + clen;
3879                     break;
3880                 }
3881                 else {
3882                     rb_raise(rb_eRuntimeError, "invalid group name reference format");
3883                 }
3884             }
3885 
	    /* "\k" not followed by '<': keep the two characters literally. */
3886             rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
3887             continue;
3888 
3889           case '0':
3890 	  case '&':
3891 	    no = 0;
3892 	    break;
3893 
3894 	  case '`':
	    /* Text of src before the start of the whole match. */
3895 	    rb_enc_str_buf_cat(val, RSTRING_PTR(src), BEG(0), src_enc);
3896 	    continue;
3897 
3898 	  case '\'':
	    /* Text of src after the end of the whole match. */
3899 	    rb_enc_str_buf_cat(val, RSTRING_PTR(src)+END(0), RSTRING_LEN(src)-END(0), src_enc);
3900 	    continue;
3901 
3902 	  case '+':
	    /* Walk down from the last register to the first one that is set;
	     * if only register 0 remains, nothing is emitted. */
3903 	    no = regs->num_regs-1;
3904 	    while (BEG(no) == -1 && no > 0) no--;
3905 	    if (no == 0) continue;
3906 	    break;
3907 
3908 	  case '\\':
	    /* Escaped backslash: emit just the second backslash. */
3909 	    rb_enc_str_buf_cat(val, s-clen, clen, str_enc);
3910 	    continue;
3911 
3912 	  default:
	    /* Unknown sequence: copy the backslash and the character. */
3913 	    rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
3914 	    continue;
3915 	}
3916 
	/* Append the text of group `no`, if it exists and took part in the match. */
3917 	if (no >= 0) {
3918 	    if (no >= regs->num_regs) continue;
3919 	    if (BEG(no) == -1) continue;
3920 	    rb_enc_str_buf_cat(val, RSTRING_PTR(src)+BEG(no), END(no)-BEG(no), src_enc);
3921 	}
3922     }
3923 
    /* No sequence was seen: the template is returned unchanged. */
3924     if (!val) return str;
    /* Otherwise flush the trailing literal text. */
3925     if (p < e) {
3926         rb_enc_str_buf_cat(val, p, e-p, str_enc);
3927     }
3928 
3929     return val;
3930 }
3931 
3932 static VALUE
kcode_getter(void)3933 kcode_getter(void)
3934 {
3935     rb_warn("variable $KCODE is no longer effective");
3936     return Qnil;
3937 }
3938 
3939 static void
kcode_setter(VALUE val,ID id)3940 kcode_setter(VALUE val, ID id)
3941 {
3942     rb_warn("variable $KCODE is no longer effective; ignored");
3943 }
3944 
3945 static VALUE
ignorecase_getter(void)3946 ignorecase_getter(void)
3947 {
3948     rb_warn("variable $= is no longer effective");
3949     return Qfalse;
3950 }
3951 
3952 static void
ignorecase_setter(VALUE val,ID id)3953 ignorecase_setter(VALUE val, ID id)
3954 {
3955     rb_warn("variable $= is no longer effective; ignored");
3956 }
3957 
3958 static VALUE
match_getter(void)3959 match_getter(void)
3960 {
3961     VALUE match = rb_backref_get();
3962 
3963     if (NIL_P(match)) return Qnil;
3964     rb_match_busy(match);
3965     return match;
3966 }
3967 
3968 static void
match_setter(VALUE val)3969 match_setter(VALUE val)
3970 {
3971     if (!NIL_P(val)) {
3972 	Check_Type(val, T_MATCH);
3973     }
3974     rb_backref_set(val);
3975 }
3976 
3977 /*
3978  *  call-seq:
3979  *     Regexp.last_match           -> matchdata
3980  *     Regexp.last_match(n)        -> str
3981  *
3982  *  The first form returns the MatchData object generated by the
3983  *  last successful pattern match.  Equivalent to reading the special global
3984  *  variable <code>$~</code> (see Special global variables in Regexp for
3985  *  details).
3986  *
3987  *  The second form returns the <i>n</i>th field in this MatchData object.
3988  *  _n_ can be a string or symbol to reference a named capture.
3989  *
3990  *  Note that the last_match is local to the thread and method scope of the
3991  *  method that did the pattern match.
3992  *
3993  *     /c(.)t/ =~ 'cat'        #=> 0
3994  *     Regexp.last_match       #=> #<MatchData "cat" 1:"a">
3995  *     Regexp.last_match(0)    #=> "cat"
3996  *     Regexp.last_match(1)    #=> "a"
3997  *     Regexp.last_match(2)    #=> nil
3998  *
3999  *     /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "var = val"
4000  *     Regexp.last_match       #=> #<MatchData "var = val" lhs:"var" rhs:"val">
4001  *     Regexp.last_match(:lhs) #=> "var"
4002  *     Regexp.last_match(:rhs) #=> "val"
4003  */
4004 
4005 static VALUE
rb_reg_s_last_match(int argc,VALUE * argv)4006 rb_reg_s_last_match(int argc, VALUE *argv)
4007 {
4008     if (rb_check_arity(argc, 0, 1) == 1) {
4009         VALUE match = rb_backref_get();
4010         int n;
4011         if (NIL_P(match)) return Qnil;
4012         n = match_backref_number(match, argv[0]);
4013 	return rb_reg_nth_match(n, match);
4014     }
4015     return match_getter();
4016 }
4017 
/*
 * Warning callback handed to the regex engine through
 * onig_set_warn_func() and onig_set_verb_warn_func() in Init_Regexp().
 *
 * The message is forwarded to rb_warn() through a fixed "%s" format so
 * that any '%' inside s is printed literally rather than interpreted as
 * a format directive.
 */
static void
re_warn(const char *s)
{
    rb_warn("%s", s);
}
4023 
/*
 *  Document-class: RegexpError
 *
 *  Raised when given an invalid regular expression.
 *
 *     Regexp.new("?")
 *
 *  <em>raises the exception:</em>
 *
 *     RegexpError: target of repeat operator is not specified: /?/
 */
4035 
/*
 *  Document-class: Regexp
 *
 *  A <code>Regexp</code> holds a regular expression, used to match a pattern
 *  against strings. Regexps are created using the <code>/.../</code> and
 *  <code>%r{...}</code> literals, and by the <code>Regexp::new</code>
 *  constructor.
 *
 *  :include: doc/regexp.rdoc
 */
4046 
4047 void
Init_Regexp(void)4048 Init_Regexp(void)
4049 {
4050     rb_eRegexpError = rb_define_class("RegexpError", rb_eStandardError);
4051 
4052     onigenc_set_default_encoding(ONIG_ENCODING_ASCII);
4053     onig_set_warn_func(re_warn);
4054     onig_set_verb_warn_func(re_warn);
4055 
4056     rb_define_virtual_variable("$~", match_getter, match_setter);
4057     rb_define_virtual_variable("$&", last_match_getter, 0);
4058     rb_define_virtual_variable("$`", prematch_getter, 0);
4059     rb_define_virtual_variable("$'", postmatch_getter, 0);
4060     rb_define_virtual_variable("$+", last_paren_match_getter, 0);
4061 
4062     rb_define_virtual_variable("$=", ignorecase_getter, ignorecase_setter);
4063     rb_define_virtual_variable("$KCODE", kcode_getter, kcode_setter);
4064     rb_define_virtual_variable("$-K", kcode_getter, kcode_setter);
4065 
4066     rb_cRegexp = rb_define_class("Regexp", rb_cObject);
4067     rb_define_alloc_func(rb_cRegexp, rb_reg_s_alloc);
4068     rb_define_singleton_method(rb_cRegexp, "compile", rb_class_new_instance, -1);
4069     rb_define_singleton_method(rb_cRegexp, "quote", rb_reg_s_quote, 1);
4070     rb_define_singleton_method(rb_cRegexp, "escape", rb_reg_s_quote, 1);
4071     rb_define_singleton_method(rb_cRegexp, "union", rb_reg_s_union_m, -2);
4072     rb_define_singleton_method(rb_cRegexp, "last_match", rb_reg_s_last_match, -1);
4073     rb_define_singleton_method(rb_cRegexp, "try_convert", rb_reg_s_try_convert, 1);
4074 
4075     rb_define_method(rb_cRegexp, "initialize", rb_reg_initialize_m, -1);
4076     rb_define_method(rb_cRegexp, "initialize_copy", rb_reg_init_copy, 1);
4077     rb_define_method(rb_cRegexp, "hash", rb_reg_hash, 0);
4078     rb_define_method(rb_cRegexp, "eql?", rb_reg_equal, 1);
4079     rb_define_method(rb_cRegexp, "==", rb_reg_equal, 1);
4080     rb_define_method(rb_cRegexp, "=~", rb_reg_match, 1);
4081     rb_define_method(rb_cRegexp, "===", rb_reg_eqq, 1);
4082     rb_define_method(rb_cRegexp, "~", rb_reg_match2, 0);
4083     rb_define_method(rb_cRegexp, "match", rb_reg_match_m, -1);
4084     rb_define_method(rb_cRegexp, "match?", rb_reg_match_m_p, -1);
4085     rb_define_method(rb_cRegexp, "to_s", rb_reg_to_s, 0);
4086     rb_define_method(rb_cRegexp, "inspect", rb_reg_inspect, 0);
4087     rb_define_method(rb_cRegexp, "source", rb_reg_source, 0);
4088     rb_define_method(rb_cRegexp, "casefold?", rb_reg_casefold_p, 0);
4089     rb_define_method(rb_cRegexp, "options", rb_reg_options_m, 0);
4090     rb_define_method(rb_cRegexp, "encoding", rb_obj_encoding, 0); /* in encoding.c */
4091     rb_define_method(rb_cRegexp, "fixed_encoding?", rb_reg_fixed_encoding_p, 0);
4092     rb_define_method(rb_cRegexp, "names", rb_reg_names, 0);
4093     rb_define_method(rb_cRegexp, "named_captures", rb_reg_named_captures, 0);
4094 
4095     /* see Regexp.options and Regexp.new */
4096     rb_define_const(rb_cRegexp, "IGNORECASE", INT2FIX(ONIG_OPTION_IGNORECASE));
4097     /* see Regexp.options and Regexp.new */
4098     rb_define_const(rb_cRegexp, "EXTENDED", INT2FIX(ONIG_OPTION_EXTEND));
4099     /* see Regexp.options and Regexp.new */
4100     rb_define_const(rb_cRegexp, "MULTILINE", INT2FIX(ONIG_OPTION_MULTILINE));
4101     /* see Regexp.options and Regexp.new */
4102     rb_define_const(rb_cRegexp, "FIXEDENCODING", INT2FIX(ARG_ENCODING_FIXED));
4103     /* see Regexp.options and Regexp.new */
4104     rb_define_const(rb_cRegexp, "NOENCODING", INT2FIX(ARG_ENCODING_NONE));
4105 
4106     rb_global_variable(&reg_cache);
4107 
4108     rb_cMatch  = rb_define_class("MatchData", rb_cObject);
4109     rb_define_alloc_func(rb_cMatch, match_alloc);
4110     rb_undef_method(CLASS_OF(rb_cMatch), "new");
4111 
4112     rb_define_method(rb_cMatch, "initialize_copy", match_init_copy, 1);
4113     rb_define_method(rb_cMatch, "regexp", match_regexp, 0);
4114     rb_define_method(rb_cMatch, "names", match_names, 0);
4115     rb_define_method(rb_cMatch, "size", match_size, 0);
4116     rb_define_method(rb_cMatch, "length", match_size, 0);
4117     rb_define_method(rb_cMatch, "offset", match_offset, 1);
4118     rb_define_method(rb_cMatch, "begin", match_begin, 1);
4119     rb_define_method(rb_cMatch, "end", match_end, 1);
4120     rb_define_method(rb_cMatch, "to_a", match_to_a, 0);
4121     rb_define_method(rb_cMatch, "[]", match_aref, -1);
4122     rb_define_method(rb_cMatch, "captures", match_captures, 0);
4123     rb_define_method(rb_cMatch, "named_captures", match_named_captures, 0);
4124     rb_define_method(rb_cMatch, "values_at", match_values_at, -1);
4125     rb_define_method(rb_cMatch, "pre_match", rb_reg_match_pre, 0);
4126     rb_define_method(rb_cMatch, "post_match", rb_reg_match_post, 0);
4127     rb_define_method(rb_cMatch, "to_s", match_to_s, 0);
4128     rb_define_method(rb_cMatch, "inspect", match_inspect, 0);
4129     rb_define_method(rb_cMatch, "string", match_string, 0);
4130     rb_define_method(rb_cMatch, "hash", match_hash, 0);
4131     rb_define_method(rb_cMatch, "eql?", match_equal, 1);
4132     rb_define_method(rb_cMatch, "==", match_equal, 1);
4133 }
4134