1 /**********************************************************************
2 
3   string.c -
4 
5   $Author: nagachika $
6   created at: Mon Aug  9 17:12:58 JST 1993
7 
8   Copyright (C) 1993-2007 Yukihiro Matsumoto
9   Copyright (C) 2000  Network Applied Communication Laboratory, Inc.
10   Copyright (C) 2000  Information-technology Promotion Agency, Japan
11 
12 **********************************************************************/
13 
14 #include "ruby/encoding.h"
15 #include "ruby/re.h"
16 #include "internal.h"
17 #include "encindex.h"
18 #include "probes.h"
19 #include "gc.h"
20 #include "ruby_assert.h"
21 #include "id.h"
22 #include "debug_counter.h"
23 #include "ruby/util.h"
24 
/* Shorthand for entry `no` of regs->beg / regs->end.  Both macros expect a
 * pointer variable named `regs` to be in scope at the point of use
 * (presumably the match registers of a regexp match -- confirm at callers). */
#define BEG(no) (regs->beg[(no)])
#define END(no) (regs->end[(no)])
27 
28 #include <errno.h>
29 #include <math.h>
30 #include <ctype.h>
31 
32 #ifdef HAVE_UNISTD_H
33 #include <unistd.h>
34 #endif
35 
36 #if defined HAVE_CRYPT_R
37 # if defined HAVE_CRYPT_H
38 # include <crypt.h>
39 # endif
40 #elif !defined HAVE_CRYPT
41 # include "missing/crypt.h"
42 # define HAVE_CRYPT_R 1
43 #endif
44 
/* Compile-time switch, currently off; the original note says "next major". */
#define STRING_ENUMERATORS_WANTARRAY 0 /* next major */

/* Remove any macro definitions of the following names so that this file can
 * refer to (and, for several of them, define) real functions with these
 * names.  NOTE(review): the macros are presumably supplied by the public
 * headers included above -- confirm. */
#undef rb_str_new
#undef rb_usascii_str_new
#undef rb_utf8_str_new
#undef rb_enc_str_new
#undef rb_str_new_cstr
#undef rb_tainted_str_new_cstr
#undef rb_usascii_str_new_cstr
#undef rb_utf8_str_new_cstr
#undef rb_enc_str_new_cstr
#undef rb_external_str_new_cstr
#undef rb_locale_str_new_cstr
#undef rb_str_dup_frozen
#undef rb_str_buf_new_cstr
#undef rb_str_buf_cat
#undef rb_str_buf_cat2
#undef rb_str_cat2
#undef rb_str_cat_cstr
#undef rb_fstring_cstr
#undef rb_fstring_enc_cstr
66 
/* Forward declaration; the definition is not in this part of the file. */
static VALUE rb_str_clear(VALUE str);

/* Global class handles.  rb_cString is the class passed to the string
 * constructors below; rb_cSymbol is presumably the Symbol class (both are
 * assigned outside this part of the file -- confirm). */
VALUE rb_cString;
VALUE rb_cSymbol;
71 
72 /* FLAGS of RString
73  *
74  * 1:     RSTRING_NOEMBED
75  * 2:     STR_SHARED (== ELTS_SHARED)
76  * 2-6:   RSTRING_EMBED_LEN (5 bits == 32)
77  * 5:     STR_SHARED_ROOT (RSTRING_NOEMBED==1 && STR_SHARED == 0, there may be
78  *                         other strings that rely on this string's buffer)
79  * 6:     STR_IS_SHARED_M (shared, when RSTRING_NOEMBED==1 && klass==0)
80  * 7:     STR_TMPLOCK
81  * 8-9:   ENC_CODERANGE (2 bits)
82  * 10-16: ENCODING (7 bits == 128)
83  * 17:    RSTRING_FSTR
84  * 18:    STR_NOFREE
85  * 19:    STR_FAKESTR
86  */
87 
/* NOTE(review): presumably an upper bound, in bytes, on a single character
 * in any supported encoding -- confirm against its users. */
#define RUBY_MAX_CHAR_LEN 16
/* String-private flag bits; the bit numbers match the FLAGS table above. */
#define STR_SHARED_ROOT FL_USER5
#define STR_IS_SHARED_M FL_USER6
#define STR_TMPLOCK FL_USER7
#define STR_NOFREE FL_USER18
#define STR_FAKESTR FL_USER19
94 
/* Set the STR_NOEMBED flag on str and zero the embedded-length bit field. */
#define STR_SET_NOEMBED(str) do {\
    FL_SET((str), STR_NOEMBED);\
    STR_SET_EMBED_LEN((str), 0);\
} while (0)
/* Clear both STR_NOEMBED and STR_NOFREE on str. */
#define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOEMBED|STR_NOFREE))
/* Store n in the embedded-length bit field of str's flags word.  n is
 * evaluated exactly once (captured in tmp_n); str is expanded twice. */
#define STR_SET_EMBED_LEN(str, n) do { \
    long tmp_n = (n);\
    RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
    RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
} while (0)
105 
/* Set the length of str to n, writing either the embedded-length bits in
 * the flags or the heap length field depending on STR_EMBED_P(str).  No
 * buffer is resized and no terminator is written here. */
#define STR_SET_LEN(str, n) do { \
    if (STR_EMBED_P(str)) {\
	STR_SET_EMBED_LEN((str), (n));\
    }\
    else {\
	RSTRING(str)->as.heap.len = (n);\
    }\
} while (0)

/* Decrease the length of str by one, using the same embedded/heap split as
 * STR_SET_LEN.  There is no check that the length is positive. */
#define STR_DEC_LEN(str) do {\
    if (STR_EMBED_P(str)) {\
	long n = RSTRING_LEN(str);\
	n--;\
	STR_SET_EMBED_LEN((str), n);\
    }\
    else {\
	RSTRING(str)->as.heap.len--;\
    }\
} while (0)
125 
/* Terminator length of str: the minimum character length (mbminlen) of the
 * encoding returned by rb_enc_get(str). */
#define TERM_LEN(str) rb_enc_mbminlen(rb_enc_get(str))
/* Zero the terminator at ptr.  The first byte is always stored directly;
 * memset() is used in addition only when termlen is greater than 1, which
 * is marked as the unlikely case.  Both arguments are evaluated once. */
#define TERM_FILL(ptr, termlen) do {\
    char *const term_fill_ptr = (ptr);\
    const int term_fill_len = (termlen);\
    *term_fill_ptr = '\0';\
    if (UNLIKELY(term_fill_len > 1))\
	memset(term_fill_ptr, 0, term_fill_len);\
} while (0)
134 
/* RESIZE_CAPA_TERM using the terminator length derived from str's encoding. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)
/* Give str room for `capacity` bytes plus a `termlen`-byte terminator.
 *
 * - Embedded string whose new capacity is still embeddable: nothing happens.
 * - Embedded string that no longer fits: a heap buffer of
 *   capacity + termlen bytes is allocated, the current RSTRING_LEN(str)
 *   bytes are copied into it (the terminator is not copied), and the string
 *   is switched to the heap representation with that length and capacity.
 * - Heap string: asserted not to be STR_SHARED, then the buffer is
 *   reallocated to capacity + termlen bytes and the capacity field updated.
 *
 * The length of the string is never changed, and the macro arguments are
 * expanded more than once. */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
	if (!STR_EMBEDDABLE_P(capacity, termlen)) {\
	    char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
	    const long tlen = RSTRING_LEN(str);\
	    memcpy(tmp, RSTRING_PTR(str), tlen);\
	    RSTRING(str)->as.heap.ptr = tmp;\
	    RSTRING(str)->as.heap.len = tlen;\
            STR_SET_NOEMBED(str);\
	    RSTRING(str)->as.heap.aux.capa = (capacity);\
	}\
    }\
    else {\
	assert(!FL_TEST((str), STR_SHARED)); \
	SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
			(size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
	RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
158 
/* Record shared_str as the string that str shares with.
 *
 * Does nothing when str carries STR_FAKESTR.  Otherwise the reference is
 * stored in str's heap.aux.shared through RB_OBJ_WRITE, str is flagged
 * STR_SHARED and shared_str is flagged STR_SHARED_ROOT.  A shared_str whose
 * class is 0 additionally gets STR_IS_SHARED_M. */
#define STR_SET_SHARED(str, shared_str) do { \
    if (!FL_TEST(str, STR_FAKESTR)) { \
	RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
	FL_SET((str), STR_SHARED); \
        FL_SET((shared_str), STR_SHARED_ROOT); \
	if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
	    FL_SET_RAW((shared_str), STR_IS_SHARED_M); \
    } \
} while (0)
168 
/* Heap buffer pointer of str (reads the heap member without checking that
 * str is actually non-embedded). */
#define STR_HEAP_PTR(str)  (RSTRING(str)->as.heap.ptr)
/* Heap capacity of str plus its terminator length, as a size_t. */
#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))

/* Encoding lookup routed through get_encoding(). */
#define STR_ENC_GET(str) get_encoding(str)

/* SHARABLE_MIDDLE_SUBSTRING defaults to 0 unless the build defines it. */
#if !defined SHARABLE_MIDDLE_SUBSTRING
# define SHARABLE_MIDDLE_SUBSTRING 0
#endif
/* With the default of 0 only a range that reaches `end`
 * (beg + len == end) satisfies SHARABLE_SUBSTRING_P; otherwise every range
 * does. */
#if !SHARABLE_MIDDLE_SUBSTRING
#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
#else
#define SHARABLE_SUBSTRING_P(beg, len, end) 1
#endif

/* True when len bytes plus a termlen-byte terminator fit in
 * RSTRING_EMBED_LEN_MAX + 1 bytes. */
#define STR_EMBEDDABLE_P(len, termlen) \
    ((len) <= RSTRING_EMBED_LEN_MAX + 1 - (termlen))
185 
/* Forward declarations of file-local helpers that are used before their
 * definitions (several are defined beyond this part of the file). */
static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
static VALUE str_new_shared(VALUE klass, VALUE str);
static VALUE str_new_frozen(VALUE klass, VALUE orig);
static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
static inline void str_modifiable(VALUE str);
static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
193 
194 static inline void
str_make_independent(VALUE str)195 str_make_independent(VALUE str)
196 {
197     long len = RSTRING_LEN(str);
198     int termlen = TERM_LEN(str);
199     str_make_independent_expand((str), len, 0L, termlen);
200 }
201 
202 /* symbols for [up|down|swap]case/capitalize options */
203 static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
204 
205 static rb_encoding *
get_actual_encoding(const int encidx,VALUE str)206 get_actual_encoding(const int encidx, VALUE str)
207 {
208     const unsigned char *q;
209 
210     switch (encidx) {
211       case ENCINDEX_UTF_16:
212 	if (RSTRING_LEN(str) < 2) break;
213 	q = (const unsigned char *)RSTRING_PTR(str);
214 	if (q[0] == 0xFE && q[1] == 0xFF) {
215 	    return rb_enc_get_from_index(ENCINDEX_UTF_16BE);
216 	}
217 	if (q[0] == 0xFF && q[1] == 0xFE) {
218 	    return rb_enc_get_from_index(ENCINDEX_UTF_16LE);
219 	}
220 	return rb_ascii8bit_encoding();
221       case ENCINDEX_UTF_32:
222 	if (RSTRING_LEN(str) < 4) break;
223 	q = (const unsigned char *)RSTRING_PTR(str);
224 	if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) {
225 	    return rb_enc_get_from_index(ENCINDEX_UTF_32BE);
226 	}
227 	if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) {
228 	    return rb_enc_get_from_index(ENCINDEX_UTF_32LE);
229 	}
230 	return rb_ascii8bit_encoding();
231     }
232     return rb_enc_from_index(encidx);
233 }
234 
235 static rb_encoding *
get_encoding(VALUE str)236 get_encoding(VALUE str)
237 {
238     return get_actual_encoding(ENCODING_GET(str), str);
239 }
240 
241 static void
mustnot_broken(VALUE str)242 mustnot_broken(VALUE str)
243 {
244     if (is_broken_string(str)) {
245 	rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
246     }
247 }
248 
249 static void
mustnot_wchar(VALUE str)250 mustnot_wchar(VALUE str)
251 {
252     rb_encoding *enc = STR_ENC_GET(str);
253     if (rb_enc_mbminlen(enc) > 1) {
254 	rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
255     }
256 }
257 
/* Forward declarations; both functions are defined further down. */
static int fstring_cmp(VALUE a, VALUE b);

static VALUE register_fstring(VALUE str);

/* st hash type for a table keyed by strings: keys are compared with
 * fstring_cmp() and hashed with rb_str_hash(). */
const struct st_hash_type rb_fstring_hash_type = {
    fstring_cmp,
    rb_str_hash,
};
266 
/* True when str has neither FL_TAINT nor FL_EXIVAR set and its class is
 * exactly rb_cString. */
#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_TAINT|FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
268 
269 static int
fstr_update_callback(st_data_t * key,st_data_t * value,st_data_t arg,int existing)270 fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t arg, int existing)
271 {
272     VALUE *fstr = (VALUE *)arg;
273     VALUE str = (VALUE)*key;
274 
275     if (existing) {
276 	/* because of lazy sweep, str may be unmarked already and swept
277 	 * at next time */
278 
279 	if (rb_objspace_garbage_object_p(str)) {
280 	    *fstr = Qundef;
281 	    return ST_DELETE;
282 	}
283 
284 	*fstr = str;
285 	return ST_STOP;
286     }
287     else {
288 	if (FL_TEST_RAW(str, STR_FAKESTR)) {
289 	    str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
290 				 RSTRING(str)->as.heap.len,
291 				 ENCODING_GET(str));
292 	    OBJ_FREEZE_RAW(str);
293 	}
294 	else {
295 	    str = str_new_frozen(rb_cString, str);
296 	    if (STR_SHARED_P(str)) { /* str should not be shared */
297 		/* shared substring  */
298 		str_make_independent(str);
299 		assert(OBJ_FROZEN(str));
300 	    }
301 	    if (!BARE_STRING_P(str)) {
302 		str = str_new_frozen(rb_cString, str);
303 	    }
304 	}
305 	RBASIC(str)->flags |= RSTRING_FSTR;
306 
307 	*key = *value = *fstr = str;
308 	return ST_CONTINUE;
309     }
310 }
311 
312 RUBY_FUNC_EXPORTED
313 VALUE
rb_fstring(VALUE str)314 rb_fstring(VALUE str)
315 {
316     VALUE fstr;
317     int bare;
318 
319     Check_Type(str, T_STRING);
320 
321     if (FL_TEST(str, RSTRING_FSTR))
322 	return str;
323 
324     bare = BARE_STRING_P(str);
325     if (!bare) {
326         if (STR_EMBED_P(str)) {
327             OBJ_FREEZE_RAW(str);
328             return str;
329         }
330         if (FL_TEST_RAW(str, STR_NOEMBED|STR_SHARED_ROOT|STR_SHARED) == (STR_NOEMBED|STR_SHARED_ROOT)) {
331             assert(OBJ_FROZEN(str));
332             return str;
333         }
334     }
335 
336     fstr = register_fstring(str);
337 
338     if (!bare) {
339 	str_replace_shared_without_enc(str, fstr);
340 	OBJ_FREEZE_RAW(str);
341 	return str;
342     }
343     return fstr;
344 }
345 
346 static VALUE
register_fstring(VALUE str)347 register_fstring(VALUE str)
348 {
349     VALUE ret;
350     st_table *frozen_strings = rb_vm_fstring_table();
351 
352     do {
353 	ret = str;
354 	st_update(frozen_strings, (st_data_t)str,
355 		  fstr_update_callback, (st_data_t)&ret);
356     } while (ret == Qundef);
357 
358     assert(OBJ_FROZEN(ret));
359     assert(!FL_TEST_RAW(ret, STR_FAKESTR));
360     assert(!FL_TEST_RAW(ret, FL_EXIVAR));
361     assert(!FL_TEST_RAW(ret, FL_TAINT));
362     assert(RBASIC_CLASS(ret) == rb_cString);
363     return ret;
364 }
365 
366 static VALUE
setup_fake_str(struct RString * fake_str,const char * name,long len,int encidx)367 setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
368 {
369     fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
370     /* SHARED to be allocated by the callback */
371 
372     ENCODING_SET_INLINED((VALUE)fake_str, encidx);
373 
374     RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
375     fake_str->as.heap.len = len;
376     fake_str->as.heap.ptr = (char *)name;
377     fake_str->as.heap.aux.capa = len;
378     return (VALUE)fake_str;
379 }
380 
/*
 * Set up a fake string that refers to a static string literal.
 */
384 VALUE
rb_setup_fake_str(struct RString * fake_str,const char * name,long len,rb_encoding * enc)385 rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
386 {
387     return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
388 }
389 
/*
 * rb_fstring_new and the rb_fstring_cstr family create or look up a frozen
 * shared string that refers to a static string literal.  `ptr` must
 * point to a constant string.
 */
395 MJIT_FUNC_EXPORTED VALUE
rb_fstring_new(const char * ptr,long len)396 rb_fstring_new(const char *ptr, long len)
397 {
398     struct RString fake_str;
399     return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII));
400 }
401 
402 VALUE
rb_fstring_enc_new(const char * ptr,long len,rb_encoding * enc)403 rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
404 {
405     struct RString fake_str;
406     return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc));
407 }
408 
409 VALUE
rb_fstring_cstr(const char * ptr)410 rb_fstring_cstr(const char *ptr)
411 {
412     return rb_fstring_new(ptr, strlen(ptr));
413 }
414 
415 VALUE
rb_fstring_enc_cstr(const char * ptr,rb_encoding * enc)416 rb_fstring_enc_cstr(const char *ptr, rb_encoding *enc)
417 {
418     return rb_fstring_enc_new(ptr, strlen(ptr), enc);
419 }
420 
421 static int
fstring_set_class_i(st_data_t key,st_data_t val,st_data_t arg)422 fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
423 {
424     RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
425     return ST_CONTINUE;
426 }
427 
428 static int
fstring_cmp(VALUE a,VALUE b)429 fstring_cmp(VALUE a, VALUE b)
430 {
431     long alen, blen;
432     const char *aptr, *bptr;
433     RSTRING_GETMEM(a, aptr, alen);
434     RSTRING_GETMEM(b, bptr, blen);
435     return (alen != blen ||
436 	    ENCODING_GET(a) != ENCODING_GET(b) ||
437 	    memcmp(aptr, bptr, alen) != 0);
438 }
439 
440 static inline int
single_byte_optimizable(VALUE str)441 single_byte_optimizable(VALUE str)
442 {
443     rb_encoding *enc;
444 
445     /* Conservative.  It may be ENC_CODERANGE_UNKNOWN. */
446     if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
447         return 1;
448 
449     enc = STR_ENC_GET(str);
450     if (rb_enc_mbmaxlen(enc) == 1)
451         return 1;
452 
453     /* Conservative.  Possibly single byte.
454      * "\xa1" in Shift_JIS for example. */
455     return 0;
456 }
457 
458 VALUE rb_fs;
459 
460 static inline const char *
search_nonascii(const char * p,const char * e)461 search_nonascii(const char *p, const char *e)
462 {
463     const uintptr_t *s, *t;
464 
465 #if defined(__STDC_VERSION) && (__STDC_VERSION__ >= 199901L)
466 # if SIZEOF_UINTPTR_T == 8
467 #  define NONASCII_MASK UINT64_C(0x8080808080808080)
468 # elif SIZEOF_UINTPTR_T == 4
469 #  define NONASCII_MASK UINT32_C(0x80808080)
470 # else
471 #  error "don't know what to do."
472 # endif
473 #else
474 # if SIZEOF_UINTPTR_T == 8
475 #  define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
476 # elif SIZEOF_UINTPTR_T == 4
477 #  define NONASCII_MASK 0x80808080UL /* or...? */
478 # else
479 #  error "don't know what to do."
480 # endif
481 #endif
482 
483     if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
484 #if !UNALIGNED_WORD_ACCESS
485 	if ((uintptr_t)p % SIZEOF_VOIDP) {
486 	    int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
487 	    p += l;
488 	    switch (l) {
489 	      default: UNREACHABLE;
490 #if SIZEOF_VOIDP > 4
491 	      case 7: if (p[-7]&0x80) return p-7;
492 	      case 6: if (p[-6]&0x80) return p-6;
493 	      case 5: if (p[-5]&0x80) return p-5;
494 	      case 4: if (p[-4]&0x80) return p-4;
495 #endif
496 	      case 3: if (p[-3]&0x80) return p-3;
497 	      case 2: if (p[-2]&0x80) return p-2;
498 	      case 1: if (p[-1]&0x80) return p-1;
499 	      case 0: break;
500 	    }
501 	}
502 #endif
503 #if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
504 #define aligned_ptr(value) \
505         __builtin_assume_aligned((value), sizeof(uintptr_t))
506 #else
507 #define aligned_ptr(value) (uintptr_t *)(value)
508 #endif
509 	s = aligned_ptr(p);
510 	t = aligned_ptr(e - (SIZEOF_VOIDP-1));
511 #undef aligned_ptr
512 	for (;s < t; s++) {
513 	    if (*s & NONASCII_MASK) {
514 #ifdef WORDS_BIGENDIAN
515 		return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
516 #else
517 		return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
518 #endif
519 	    }
520 	}
521 	p = (const char *)s;
522     }
523 
524     switch (e - p) {
525       default: UNREACHABLE;
526 #if SIZEOF_VOIDP > 4
527       case 7: if (e[-7]&0x80) return e-7;
528       case 6: if (e[-6]&0x80) return e-6;
529       case 5: if (e[-5]&0x80) return e-5;
530       case 4: if (e[-4]&0x80) return e-4;
531 #endif
532       case 3: if (e[-3]&0x80) return e-3;
533       case 2: if (e[-2]&0x80) return e-2;
534       case 1: if (e[-1]&0x80) return e-1;
535       case 0: return NULL;
536     }
537 }
538 
539 static int
coderange_scan(const char * p,long len,rb_encoding * enc)540 coderange_scan(const char *p, long len, rb_encoding *enc)
541 {
542     const char *e = p + len;
543 
544     if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
545         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
546         p = search_nonascii(p, e);
547         return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
548     }
549 
550     if (rb_enc_asciicompat(enc)) {
551         p = search_nonascii(p, e);
552         if (!p) return ENC_CODERANGE_7BIT;
553         for (;;) {
554             int ret = rb_enc_precise_mbclen(p, e, enc);
555             if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
556             p += MBCLEN_CHARFOUND_LEN(ret);
557             if (p == e) break;
558             p = search_nonascii(p, e);
559             if (!p) break;
560         }
561     }
562     else {
563         while (p < e) {
564             int ret = rb_enc_precise_mbclen(p, e, enc);
565             if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
566             p += MBCLEN_CHARFOUND_LEN(ret);
567         }
568     }
569     return ENC_CODERANGE_VALID;
570 }
571 
572 long
rb_str_coderange_scan_restartable(const char * s,const char * e,rb_encoding * enc,int * cr)573 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
574 {
575     const char *p = s;
576 
577     if (*cr == ENC_CODERANGE_BROKEN)
578 	return e - s;
579 
580     if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
581 	/* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
582 	if (*cr == ENC_CODERANGE_VALID) return e - s;
583 	p = search_nonascii(p, e);
584         *cr = p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
585 	return e - s;
586     }
587     else if (rb_enc_asciicompat(enc)) {
588 	p = search_nonascii(p, e);
589 	if (!p) {
590 	    if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
591 	    return e - s;
592 	}
593 	for (;;) {
594 	    int ret = rb_enc_precise_mbclen(p, e, enc);
595 	    if (!MBCLEN_CHARFOUND_P(ret)) {
596 		*cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
597 		return p - s;
598 	    }
599 	    p += MBCLEN_CHARFOUND_LEN(ret);
600 	    if (p == e) break;
601 	    p = search_nonascii(p, e);
602 	    if (!p) break;
603 	}
604     }
605     else {
606 	while (p < e) {
607 	    int ret = rb_enc_precise_mbclen(p, e, enc);
608 	    if (!MBCLEN_CHARFOUND_P(ret)) {
609 		*cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
610 		return p - s;
611 	    }
612 	    p += MBCLEN_CHARFOUND_LEN(ret);
613 	}
614     }
615     *cr = ENC_CODERANGE_VALID;
616     return e - s;
617 }
618 
619 static inline void
str_enc_copy(VALUE str1,VALUE str2)620 str_enc_copy(VALUE str1, VALUE str2)
621 {
622     rb_enc_set_index(str1, ENCODING_GET(str2));
623 }
624 
625 static void
rb_enc_cr_str_copy_for_substr(VALUE dest,VALUE src)626 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
627 {
628     /* this function is designed for copying encoding and coderange
629      * from src to new string "dest" which is made from the part of src.
630      */
631     str_enc_copy(dest, src);
632     if (RSTRING_LEN(dest) == 0) {
633 	if (!rb_enc_asciicompat(STR_ENC_GET(src)))
634 	    ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
635 	else
636 	    ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
637 	return;
638     }
639     switch (ENC_CODERANGE(src)) {
640       case ENC_CODERANGE_7BIT:
641 	ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
642 	break;
643       case ENC_CODERANGE_VALID:
644 	if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
645 	    search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
646 	    ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
647 	else
648 	    ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
649 	break;
650       default:
651 	break;
652     }
653 }
654 
655 static void
rb_enc_cr_str_exact_copy(VALUE dest,VALUE src)656 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
657 {
658     str_enc_copy(dest, src);
659     ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
660 }
661 
662 int
rb_enc_str_coderange(VALUE str)663 rb_enc_str_coderange(VALUE str)
664 {
665     int cr = ENC_CODERANGE(str);
666 
667     if (cr == ENC_CODERANGE_UNKNOWN) {
668 	int encidx = ENCODING_GET(str);
669 	rb_encoding *enc = rb_enc_from_index(encidx);
670 	if (rb_enc_mbminlen(enc) > 1 && rb_enc_dummy_p(enc) &&
671             rb_enc_mbminlen(enc = get_actual_encoding(encidx, str)) == 1) {
672 	    cr = ENC_CODERANGE_BROKEN;
673 	}
674 	else {
675 	    cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str),
676                                 enc);
677 	}
678         ENC_CODERANGE_SET(str, cr);
679     }
680     return cr;
681 }
682 
683 int
rb_enc_str_asciionly_p(VALUE str)684 rb_enc_str_asciionly_p(VALUE str)
685 {
686     rb_encoding *enc = STR_ENC_GET(str);
687 
688     if (!rb_enc_asciicompat(enc))
689         return FALSE;
690     else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
691         return TRUE;
692     return FALSE;
693 }
694 
695 static inline void
str_mod_check(VALUE s,const char * p,long len)696 str_mod_check(VALUE s, const char *p, long len)
697 {
698     if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
699 	rb_raise(rb_eRuntimeError, "string modified");
700     }
701 }
702 
703 static size_t
str_capacity(VALUE str,const int termlen)704 str_capacity(VALUE str, const int termlen)
705 {
706     if (STR_EMBED_P(str)) {
707 	return (RSTRING_EMBED_LEN_MAX + 1 - termlen);
708     }
709     else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
710 	return RSTRING(str)->as.heap.len;
711     }
712     else {
713 	return RSTRING(str)->as.heap.aux.capa;
714     }
715 }
716 
717 size_t
rb_str_capacity(VALUE str)718 rb_str_capacity(VALUE str)
719 {
720     return str_capacity(str, TERM_LEN(str));
721 }
722 
723 static inline void
must_not_null(const char * ptr)724 must_not_null(const char *ptr)
725 {
726     if (!ptr) {
727 	rb_raise(rb_eArgError, "NULL pointer given");
728     }
729 }
730 
731 static inline VALUE
str_alloc(VALUE klass)732 str_alloc(VALUE klass)
733 {
734     NEWOBJ_OF(str, struct RString, klass, T_STRING | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0));
735     return (VALUE)str;
736 }
737 
738 static inline VALUE
empty_str_alloc(VALUE klass)739 empty_str_alloc(VALUE klass)
740 {
741     RUBY_DTRACE_CREATE_HOOK(STRING, 0);
742     return str_alloc(klass);
743 }
744 
745 static VALUE
str_new0(VALUE klass,const char * ptr,long len,int termlen)746 str_new0(VALUE klass, const char *ptr, long len, int termlen)
747 {
748     VALUE str;
749 
750     if (len < 0) {
751 	rb_raise(rb_eArgError, "negative string size (or size too big)");
752     }
753 
754     RUBY_DTRACE_CREATE_HOOK(STRING, len);
755 
756     str = str_alloc(klass);
757     if (!STR_EMBEDDABLE_P(len, termlen)) {
758 	RSTRING(str)->as.heap.aux.capa = len;
759 	RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)len + termlen);
760 	STR_SET_NOEMBED(str);
761     }
762     else if (len == 0) {
763 	ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
764     }
765     if (ptr) {
766 	memcpy(RSTRING_PTR(str), ptr, len);
767     }
768     STR_SET_LEN(str, len);
769     TERM_FILL(RSTRING_PTR(str) + len, termlen);
770     return str;
771 }
772 
773 static VALUE
str_new(VALUE klass,const char * ptr,long len)774 str_new(VALUE klass, const char *ptr, long len)
775 {
776     return str_new0(klass, ptr, len, 1);
777 }
778 
779 VALUE
rb_str_new(const char * ptr,long len)780 rb_str_new(const char *ptr, long len)
781 {
782     return str_new(rb_cString, ptr, len);
783 }
784 
785 VALUE
rb_usascii_str_new(const char * ptr,long len)786 rb_usascii_str_new(const char *ptr, long len)
787 {
788     VALUE str = rb_str_new(ptr, len);
789     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
790     return str;
791 }
792 
793 VALUE
rb_utf8_str_new(const char * ptr,long len)794 rb_utf8_str_new(const char *ptr, long len)
795 {
796     VALUE str = str_new(rb_cString, ptr, len);
797     rb_enc_associate_index(str, rb_utf8_encindex());
798     return str;
799 }
800 
801 VALUE
rb_enc_str_new(const char * ptr,long len,rb_encoding * enc)802 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
803 {
804     VALUE str;
805 
806     if (!enc) return rb_str_new(ptr, len);
807 
808     str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
809     rb_enc_associate(str, enc);
810     return str;
811 }
812 
813 VALUE
rb_str_new_cstr(const char * ptr)814 rb_str_new_cstr(const char *ptr)
815 {
816     must_not_null(ptr);
817     /* rb_str_new_cstr() can take pointer from non-malloc-generated
818      * memory regions, and that cannot be detected by the MSAN.  Just
819      * trust the programmer that the argument passed here is a sane C
820      * string. */
821     __msan_unpoison_string(ptr);
822     return rb_str_new(ptr, strlen(ptr));
823 }
824 
825 VALUE
rb_usascii_str_new_cstr(const char * ptr)826 rb_usascii_str_new_cstr(const char *ptr)
827 {
828     VALUE str = rb_str_new_cstr(ptr);
829     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
830     return str;
831 }
832 
833 VALUE
rb_utf8_str_new_cstr(const char * ptr)834 rb_utf8_str_new_cstr(const char *ptr)
835 {
836     VALUE str = rb_str_new_cstr(ptr);
837     rb_enc_associate_index(str, rb_utf8_encindex());
838     return str;
839 }
840 
841 VALUE
rb_enc_str_new_cstr(const char * ptr,rb_encoding * enc)842 rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
843 {
844     must_not_null(ptr);
845     if (rb_enc_mbminlen(enc) != 1) {
846 	rb_raise(rb_eArgError, "wchar encoding given");
847     }
848     return rb_enc_str_new(ptr, strlen(ptr), enc);
849 }
850 
851 static VALUE
str_new_static(VALUE klass,const char * ptr,long len,int encindex)852 str_new_static(VALUE klass, const char *ptr, long len, int encindex)
853 {
854     VALUE str;
855 
856     if (len < 0) {
857 	rb_raise(rb_eArgError, "negative string size (or size too big)");
858     }
859 
860     if (!ptr) {
861 	rb_encoding *enc = rb_enc_get_from_index(encindex);
862 	str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
863     }
864     else {
865 	RUBY_DTRACE_CREATE_HOOK(STRING, len);
866 	str = str_alloc(klass);
867 	RSTRING(str)->as.heap.len = len;
868 	RSTRING(str)->as.heap.ptr = (char *)ptr;
869 	RSTRING(str)->as.heap.aux.capa = len;
870 	STR_SET_NOEMBED(str);
871 	RBASIC(str)->flags |= STR_NOFREE;
872     }
873     rb_enc_associate_index(str, encindex);
874     return str;
875 }
876 
877 VALUE
rb_str_new_static(const char * ptr,long len)878 rb_str_new_static(const char *ptr, long len)
879 {
880     return str_new_static(rb_cString, ptr, len, 0);
881 }
882 
883 VALUE
rb_usascii_str_new_static(const char * ptr,long len)884 rb_usascii_str_new_static(const char *ptr, long len)
885 {
886     return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
887 }
888 
889 VALUE
rb_utf8_str_new_static(const char * ptr,long len)890 rb_utf8_str_new_static(const char *ptr, long len)
891 {
892     return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
893 }
894 
895 VALUE
rb_enc_str_new_static(const char * ptr,long len,rb_encoding * enc)896 rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
897 {
898     return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
899 }
900 
901 VALUE
rb_tainted_str_new(const char * ptr,long len)902 rb_tainted_str_new(const char *ptr, long len)
903 {
904     VALUE str = rb_str_new(ptr, len);
905 
906     OBJ_TAINT(str);
907     return str;
908 }
909 
910 static VALUE
rb_tainted_str_new_with_enc(const char * ptr,long len,rb_encoding * enc)911 rb_tainted_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
912 {
913     VALUE str = rb_enc_str_new(ptr, len, enc);
914 
915     OBJ_TAINT(str);
916     return str;
917 }
918 
919 VALUE
rb_tainted_str_new_cstr(const char * ptr)920 rb_tainted_str_new_cstr(const char *ptr)
921 {
922     VALUE str = rb_str_new_cstr(ptr);
923 
924     OBJ_TAINT(str);
925     return str;
926 }
927 
/* Forward declaration; defined below, after its first callers. */
static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
				   rb_encoding *from, rb_encoding *to,
				   int ecflags, VALUE ecopts);
931 
932 VALUE
rb_str_conv_enc_opts(VALUE str,rb_encoding * from,rb_encoding * to,int ecflags,VALUE ecopts)933 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
934 {
935     long len;
936     const char *ptr;
937     VALUE newstr;
938 
939     if (!to) return str;
940     if (!from) from = rb_enc_get(str);
941     if (from == to) return str;
942     if ((rb_enc_asciicompat(to) && is_ascii_string(str)) ||
943 	to == rb_ascii8bit_encoding()) {
944 	if (STR_ENC_GET(str) != to) {
945 	    str = rb_str_dup(str);
946 	    rb_enc_associate(str, to);
947 	}
948 	return str;
949     }
950 
951     RSTRING_GETMEM(str, ptr, len);
952     newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
953 				   from, to, ecflags, ecopts);
954     if (NIL_P(newstr)) {
955 	/* some error, return original */
956 	return str;
957     }
958     OBJ_INFECT(newstr, str);
959     return newstr;
960 }
961 
962 VALUE
rb_str_cat_conv_enc_opts(VALUE newstr,long ofs,const char * ptr,long len,rb_encoding * from,int ecflags,VALUE ecopts)963 rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
964 			 rb_encoding *from, int ecflags, VALUE ecopts)
965 {
966     long olen;
967 
968     olen = RSTRING_LEN(newstr);
969     if (ofs < -olen || olen < ofs)
970         rb_raise(rb_eIndexError, "index %ld out of string", ofs);
971     if (ofs < 0) ofs += olen;
972     if (!from) {
973 	STR_SET_LEN(newstr, ofs);
974 	return rb_str_cat(newstr, ptr, len);
975     }
976 
977     rb_str_modify(newstr);
978     return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
979 				 rb_enc_get(newstr),
980 				 ecflags, ecopts);
981 }
982 
983 VALUE
rb_str_initialize(VALUE str,const char * ptr,long len,rb_encoding * enc)984 rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
985 {
986     STR_SET_LEN(str, 0);
987     rb_enc_associate(str, enc);
988     rb_str_cat(str, ptr, len);
989     return str;
990 }
991 
/*
 * Transcode len bytes at ptr from encoding `from` to encoding `to`, writing
 * the output into newstr starting at byte offset ofs.
 *
 * newstr is grown as needed.  On success its length is set to the end of
 * the converted output, it is associated with `to`, and it is returned.
 * Qnil is returned when the converter cannot be opened or when conversion
 * ends with any result other than econv_finished.
 */
static VALUE
str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
		      rb_encoding *from, rb_encoding *to,
		      int ecflags, VALUE ecopts)
{
    rb_econv_t *ec;
    rb_econv_result_t ret;
    long olen;
    VALUE econv_wrapper;
    const unsigned char *start, *sp;
    unsigned char *dest, *dp;
    size_t converted_output = (size_t)ofs;

    /* olen tracks the usable size of the destination buffer */
    olen = rb_str_capacity(newstr);

    /* NOTE(review): the class-less wrapper object presumably lets the GC
     * release the converter if an exception unwinds past this function --
     * confirm. */
    econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
    RBASIC_CLEAR_CLASS(econv_wrapper);
    ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
    if (!ec) return Qnil;
    DATA_PTR(econv_wrapper) = ec;

    sp = (unsigned char*)ptr;
    start = sp;
    /* Each iteration re-reads the buffer pointer (the string may have been
     * reallocated by the resize below), resumes output at converted_output,
     * and converts as much as fits.  The loop repeats only while the
     * converter reports a full destination buffer. */
    while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
	   (dp = dest + converted_output),
	   (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
	   ret == econv_destination_buffer_full) {
	/* destination buffer short */
	size_t converted_input = sp - start;
	size_t rest = len - converted_input;
	converted_output = dp - dest;
	rb_str_set_len(newstr, converted_output);
	/* Estimate the remaining output from the output/input ratio seen so
	 * far, guarding the multiplication against overflow; with no usable
	 * ratio, grow by the current size instead. */
	if (converted_input && converted_output &&
	    rest < (LONG_MAX / converted_output)) {
	    rest = (rest * converted_output) / converted_input;
	}
	else {
	    rest = olen;
	}
	/* always grow by at least two bytes */
	olen += rest < 2 ? 2 : rest;
	rb_str_resize(newstr, olen);
    }
    /* Detach the converter from the wrapper before closing it, then hand
     * the wrapper object back to the GC immediately. */
    DATA_PTR(econv_wrapper) = 0;
    rb_econv_close(ec);
    rb_gc_force_recycle(econv_wrapper);
    switch (ret) {
      case econv_finished:
	len = dp - (unsigned char*)RSTRING_PTR(newstr);
	rb_str_set_len(newstr, len);
	rb_enc_associate(newstr, to);
	return newstr;

      default:
	return Qnil;
    }
}
1048 
1049 VALUE
rb_str_conv_enc(VALUE str,rb_encoding * from,rb_encoding * to)1050 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
1051 {
1052     return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1053 }
1054 
1055 VALUE
rb_external_str_new_with_enc(const char * ptr,long len,rb_encoding * eenc)1056 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
1057 {
1058     rb_encoding *ienc;
1059     VALUE str;
1060     const int eidx = rb_enc_to_index(eenc);
1061 
1062     if (!ptr) {
1063 	return rb_tainted_str_new_with_enc(ptr, len, eenc);
1064     }
1065 
1066     /* ASCII-8BIT case, no conversion */
1067     if ((eidx == rb_ascii8bit_encindex()) ||
1068 	(eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1069 	return rb_tainted_str_new(ptr, len);
1070     }
1071     /* no default_internal or same encoding, no conversion */
1072     ienc = rb_default_internal_encoding();
1073     if (!ienc || eenc == ienc) {
1074 	return rb_tainted_str_new_with_enc(ptr, len, eenc);
1075     }
1076     /* ASCII compatible, and ASCII only string, no conversion in
1077      * default_internal */
1078     if ((eidx == rb_ascii8bit_encindex()) ||
1079 	(eidx == rb_usascii_encindex()) ||
1080 	(rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1081 	return rb_tainted_str_new_with_enc(ptr, len, ienc);
1082     }
1083     /* convert from the given encoding to default_internal */
1084     str = rb_tainted_str_new_with_enc(NULL, 0, ienc);
1085     /* when the conversion failed for some reason, just ignore the
1086      * default_internal and result in the given encoding as-is. */
1087     if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1088 	rb_str_initialize(str, ptr, len, eenc);
1089     }
1090     return str;
1091 }
1092 
1093 VALUE
rb_external_str_with_enc(VALUE str,rb_encoding * eenc)1094 rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1095 {
1096     int eidx = rb_enc_to_index(eenc);
1097     if (eidx == rb_usascii_encindex() &&
1098 	rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
1099 	rb_enc_associate_index(str, rb_ascii8bit_encindex());
1100 	return str;
1101     }
1102     rb_enc_associate_index(str, eidx);
1103     return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1104 }
1105 
1106 VALUE
rb_external_str_new(const char * ptr,long len)1107 rb_external_str_new(const char *ptr, long len)
1108 {
1109     return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1110 }
1111 
1112 VALUE
rb_external_str_new_cstr(const char * ptr)1113 rb_external_str_new_cstr(const char *ptr)
1114 {
1115     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1116 }
1117 
1118 VALUE
rb_locale_str_new(const char * ptr,long len)1119 rb_locale_str_new(const char *ptr, long len)
1120 {
1121     return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1122 }
1123 
1124 VALUE
rb_locale_str_new_cstr(const char * ptr)1125 rb_locale_str_new_cstr(const char *ptr)
1126 {
1127     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1128 }
1129 
1130 VALUE
rb_filesystem_str_new(const char * ptr,long len)1131 rb_filesystem_str_new(const char *ptr, long len)
1132 {
1133     return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1134 }
1135 
1136 VALUE
rb_filesystem_str_new_cstr(const char * ptr)1137 rb_filesystem_str_new_cstr(const char *ptr)
1138 {
1139     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1140 }
1141 
1142 VALUE
rb_str_export(VALUE str)1143 rb_str_export(VALUE str)
1144 {
1145     return rb_str_conv_enc(str, STR_ENC_GET(str), rb_default_external_encoding());
1146 }
1147 
1148 VALUE
rb_str_export_locale(VALUE str)1149 rb_str_export_locale(VALUE str)
1150 {
1151     return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
1152 }
1153 
1154 VALUE
rb_str_export_to_enc(VALUE str,rb_encoding * enc)1155 rb_str_export_to_enc(VALUE str, rb_encoding *enc)
1156 {
1157     return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1158 }
1159 
1160 static VALUE
str_replace_shared_without_enc(VALUE str2,VALUE str)1161 str_replace_shared_without_enc(VALUE str2, VALUE str)
1162 {
1163     const int termlen = TERM_LEN(str);
1164     char *ptr;
1165     long len;
1166 
1167     RSTRING_GETMEM(str, ptr, len);
1168     if (STR_EMBEDDABLE_P(len, termlen)) {
1169 	char *ptr2 = RSTRING(str2)->as.ary;
1170 	STR_SET_EMBED(str2);
1171 	memcpy(ptr2, RSTRING_PTR(str), len);
1172 	STR_SET_EMBED_LEN(str2, len);
1173 	TERM_FILL(ptr2+len, termlen);
1174     }
1175     else {
1176         VALUE root;
1177         if (STR_SHARED_P(str)) {
1178             root = RSTRING(str)->as.heap.aux.shared;
1179             RSTRING_GETMEM(str, ptr, len);
1180         }
1181         else {
1182             root = rb_str_new_frozen(str);
1183             RSTRING_GETMEM(root, ptr, len);
1184         }
1185         if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1186             if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1187                 rb_fatal("about to free a possible shared root");
1188             }
1189             char *ptr2 = STR_HEAP_PTR(str2);
1190             if (ptr2 != ptr) {
1191                 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1192             }
1193         }
1194 	FL_SET(str2, STR_NOEMBED);
1195 	RSTRING(str2)->as.heap.len = len;
1196 	RSTRING(str2)->as.heap.ptr = ptr;
1197 	STR_SET_SHARED(str2, root);
1198     }
1199     return str2;
1200 }
1201 
1202 static VALUE
str_replace_shared(VALUE str2,VALUE str)1203 str_replace_shared(VALUE str2, VALUE str)
1204 {
1205     str_replace_shared_without_enc(str2, str);
1206     rb_enc_cr_str_exact_copy(str2, str);
1207     return str2;
1208 }
1209 
1210 static VALUE
str_new_shared(VALUE klass,VALUE str)1211 str_new_shared(VALUE klass, VALUE str)
1212 {
1213     return str_replace_shared(str_alloc(klass), str);
1214 }
1215 
1216 VALUE
rb_str_new_shared(VALUE str)1217 rb_str_new_shared(VALUE str)
1218 {
1219     VALUE str2 = str_new_shared(rb_obj_class(str), str);
1220 
1221     OBJ_INFECT(str2, str);
1222     return str2;
1223 }
1224 
1225 VALUE
rb_str_new_frozen(VALUE orig)1226 rb_str_new_frozen(VALUE orig)
1227 {
1228     VALUE str;
1229 
1230     if (OBJ_FROZEN(orig)) return orig;
1231 
1232     str = str_new_frozen(rb_obj_class(orig), orig);
1233     OBJ_INFECT(str, orig);
1234     return str;
1235 }
1236 
1237 VALUE
rb_str_tmp_frozen_acquire(VALUE orig)1238 rb_str_tmp_frozen_acquire(VALUE orig)
1239 {
1240     VALUE tmp;
1241 
1242     if (OBJ_FROZEN_RAW(orig)) return orig;
1243 
1244     tmp = str_new_frozen(0, orig);
1245     OBJ_INFECT(tmp, orig);
1246 
1247     return tmp;
1248 }
1249 
1250 void
rb_str_tmp_frozen_release(VALUE orig,VALUE tmp)1251 rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1252 {
1253     if (RBASIC_CLASS(tmp) != 0)
1254 	return;
1255 
1256     if (STR_EMBED_P(tmp)) {
1257 	assert(OBJ_FROZEN_RAW(tmp));
1258 	rb_gc_force_recycle(tmp);
1259     }
1260     else if (FL_TEST_RAW(orig, STR_SHARED) &&
1261 	    !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
1262 	VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1263 
1264 	if (shared == tmp && !FL_TEST_RAW(tmp, STR_IS_SHARED_M)) {
1265 	    FL_UNSET_RAW(orig, STR_SHARED);
1266 	    assert(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1267 	    assert(RSTRING(orig)->as.heap.len == RSTRING(tmp)->as.heap.len);
1268 	    RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1269 	    RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1270 	    assert(OBJ_FROZEN_RAW(tmp));
1271 	    rb_gc_force_recycle(tmp);
1272 	}
1273     }
1274 }
1275 
1276 static VALUE
str_new_frozen(VALUE klass,VALUE orig)1277 str_new_frozen(VALUE klass, VALUE orig)
1278 {
1279     VALUE str;
1280 
1281     if (STR_EMBED_P(orig)) {
1282 	str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
1283     }
1284     else {
1285 	if (FL_TEST_RAW(orig, STR_SHARED)) {
1286 	    VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1287 	    long ofs = RSTRING(orig)->as.heap.ptr - RSTRING(shared)->as.heap.ptr;
1288 	    long rest = RSTRING(shared)->as.heap.len - ofs - RSTRING(orig)->as.heap.len;
1289 	    assert(!STR_EMBED_P(shared));
1290 	    assert(OBJ_FROZEN(shared));
1291 
1292 	    if ((ofs > 0) || (rest > 0) ||
1293 		(klass != RBASIC(shared)->klass) ||
1294 		((RBASIC(shared)->flags ^ RBASIC(orig)->flags) & FL_TAINT) ||
1295 		ENCODING_GET(shared) != ENCODING_GET(orig)) {
1296 		str = str_new_shared(klass, shared);
1297 		RSTRING(str)->as.heap.ptr += ofs;
1298 		RSTRING(str)->as.heap.len -= ofs + rest;
1299 	    }
1300 	    else {
1301 		if (RBASIC_CLASS(shared) == 0)
1302 		    FL_SET_RAW(shared, STR_IS_SHARED_M);
1303 		return shared;
1304 	    }
1305 	}
1306 	else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1307 	    str = str_alloc(klass);
1308 	    STR_SET_EMBED(str);
1309 	    memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1310 	    STR_SET_EMBED_LEN(str, RSTRING_LEN(orig));
1311 	    TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1312 	}
1313 	else {
1314 	    str = str_alloc(klass);
1315 	    STR_SET_NOEMBED(str);
1316 	    RSTRING(str)->as.heap.len = RSTRING_LEN(orig);
1317 	    RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1318 	    RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1319 	    RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1320 	    RBASIC(orig)->flags &= ~STR_NOFREE;
1321 	    STR_SET_SHARED(orig, str);
1322 	    if (klass == 0)
1323 		FL_UNSET_RAW(str, STR_IS_SHARED_M);
1324 	}
1325     }
1326 
1327     rb_enc_cr_str_exact_copy(str, orig);
1328     OBJ_FREEZE(str);
1329     return str;
1330 }
1331 
1332 VALUE
rb_str_new_with_class(VALUE obj,const char * ptr,long len)1333 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1334 {
1335     return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj));
1336 }
1337 
1338 static VALUE
str_new_empty(VALUE str)1339 str_new_empty(VALUE str)
1340 {
1341     VALUE v = rb_str_new_with_class(str, 0, 0);
1342     rb_enc_copy(v, str);
1343     OBJ_INFECT(v, str);
1344     return v;
1345 }
1346 
1347 #define STR_BUF_MIN_SIZE 127
1348 STATIC_ASSERT(STR_BUF_MIN_SIZE, STR_BUF_MIN_SIZE > RSTRING_EMBED_LEN_MAX);
1349 
1350 VALUE
rb_str_buf_new(long capa)1351 rb_str_buf_new(long capa)
1352 {
1353     VALUE str = str_alloc(rb_cString);
1354 
1355     if (capa < STR_BUF_MIN_SIZE) {
1356 	capa = STR_BUF_MIN_SIZE;
1357     }
1358     FL_SET(str, STR_NOEMBED);
1359     RSTRING(str)->as.heap.aux.capa = capa;
1360     RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1361     RSTRING(str)->as.heap.ptr[0] = '\0';
1362 
1363     return str;
1364 }
1365 
1366 VALUE
rb_str_buf_new_cstr(const char * ptr)1367 rb_str_buf_new_cstr(const char *ptr)
1368 {
1369     VALUE str;
1370     long len = strlen(ptr);
1371 
1372     str = rb_str_buf_new(len);
1373     rb_str_buf_cat(str, ptr, len);
1374 
1375     return str;
1376 }
1377 
1378 VALUE
rb_str_tmp_new(long len)1379 rb_str_tmp_new(long len)
1380 {
1381     return str_new(0, 0, len);
1382 }
1383 
1384 void
rb_str_free(VALUE str)1385 rb_str_free(VALUE str)
1386 {
1387     if (FL_TEST(str, RSTRING_FSTR)) {
1388 	st_data_t fstr = (st_data_t)str;
1389 	st_delete(rb_vm_fstring_table(), &fstr, NULL);
1390 	RB_DEBUG_COUNTER_INC(obj_str_fstr);
1391     }
1392 
1393     if (STR_EMBED_P(str)) {
1394 	RB_DEBUG_COUNTER_INC(obj_str_embed);
1395     }
1396     else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1397 	(void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1398 	(void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1399     }
1400     else {
1401 	RB_DEBUG_COUNTER_INC(obj_str_ptr);
1402 	ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1403     }
1404 }
1405 
1406 RUBY_FUNC_EXPORTED size_t
rb_str_memsize(VALUE str)1407 rb_str_memsize(VALUE str)
1408 {
1409     if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1410 	return STR_HEAP_SIZE(str);
1411     }
1412     else {
1413 	return 0;
1414     }
1415 }
1416 
1417 VALUE
rb_str_to_str(VALUE str)1418 rb_str_to_str(VALUE str)
1419 {
1420     return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1421 }
1422 
1423 static inline void str_discard(VALUE str);
1424 static void str_shared_replace(VALUE str, VALUE str2);
1425 
1426 void
rb_str_shared_replace(VALUE str,VALUE str2)1427 rb_str_shared_replace(VALUE str, VALUE str2)
1428 {
1429     if (str != str2) str_shared_replace(str, str2);
1430 }
1431 
1432 static void
str_shared_replace(VALUE str,VALUE str2)1433 str_shared_replace(VALUE str, VALUE str2)
1434 {
1435     rb_encoding *enc;
1436     int cr;
1437     int termlen;
1438 
1439     RUBY_ASSERT(str2 != str);
1440     enc = STR_ENC_GET(str2);
1441     cr = ENC_CODERANGE(str2);
1442     str_discard(str);
1443     OBJ_INFECT(str, str2);
1444     termlen = rb_enc_mbminlen(enc);
1445 
1446     if (STR_EMBEDDABLE_P(RSTRING_LEN(str2), termlen)) {
1447 	STR_SET_EMBED(str);
1448 	memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1449 	STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
1450         rb_enc_associate(str, enc);
1451         ENC_CODERANGE_SET(str, cr);
1452     }
1453     else {
1454 	STR_SET_NOEMBED(str);
1455 	FL_UNSET(str, STR_SHARED);
1456 	RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1457 	RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
1458 
1459 	if (FL_TEST(str2, STR_SHARED)) {
1460 	    VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1461 	    STR_SET_SHARED(str, shared);
1462 	}
1463 	else {
1464 	    RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1465 	}
1466 
1467 	/* abandon str2 */
1468 	STR_SET_EMBED(str2);
1469 	RSTRING_PTR(str2)[0] = 0;
1470 	STR_SET_EMBED_LEN(str2, 0);
1471 	rb_enc_associate(str, enc);
1472 	ENC_CODERANGE_SET(str, cr);
1473     }
1474 }
1475 
1476 VALUE
rb_obj_as_string(VALUE obj)1477 rb_obj_as_string(VALUE obj)
1478 {
1479     VALUE str;
1480 
1481     if (RB_TYPE_P(obj, T_STRING)) {
1482 	return obj;
1483     }
1484     str = rb_funcall(obj, idTo_s, 0);
1485     return rb_obj_as_string_result(str, obj);
1486 }
1487 
1488 MJIT_FUNC_EXPORTED VALUE
rb_obj_as_string_result(VALUE str,VALUE obj)1489 rb_obj_as_string_result(VALUE str, VALUE obj)
1490 {
1491     if (!RB_TYPE_P(str, T_STRING))
1492 	return rb_any_to_s(obj);
1493     if (!FL_TEST_RAW(str, RSTRING_FSTR) && FL_ABLE(obj))
1494 	/* fstring must not be tainted, at least */
1495 	OBJ_INFECT_RAW(str, obj);
1496     return str;
1497 }
1498 
1499 static VALUE
str_replace(VALUE str,VALUE str2)1500 str_replace(VALUE str, VALUE str2)
1501 {
1502     long len;
1503 
1504     len = RSTRING_LEN(str2);
1505     if (STR_SHARED_P(str2)) {
1506 	VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1507 	assert(OBJ_FROZEN(shared));
1508 	STR_SET_NOEMBED(str);
1509 	RSTRING(str)->as.heap.len = len;
1510 	RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1511 	STR_SET_SHARED(str, shared);
1512 	rb_enc_cr_str_exact_copy(str, str2);
1513     }
1514     else {
1515 	str_replace_shared(str, str2);
1516     }
1517 
1518     OBJ_INFECT(str, str2);
1519     return str;
1520 }
1521 
1522 static inline VALUE
str_duplicate(VALUE klass,VALUE str)1523 str_duplicate(VALUE klass, VALUE str)
1524 {
1525     enum {embed_size = RSTRING_EMBED_LEN_MAX + 1};
1526     const VALUE flag_mask =
1527 	RSTRING_NOEMBED | RSTRING_EMBED_LEN_MASK |
1528 	ENC_CODERANGE_MASK | ENCODING_MASK |
1529 	FL_TAINT | FL_FREEZE
1530 	;
1531     VALUE flags = FL_TEST_RAW(str, flag_mask);
1532     VALUE dup = str_alloc(klass);
1533     MEMCPY(RSTRING(dup)->as.ary, RSTRING(str)->as.ary,
1534 	   char, embed_size);
1535     if (flags & STR_NOEMBED) {
1536         if (FL_TEST_RAW(str, STR_SHARED)) {
1537             str = RSTRING(str)->as.heap.aux.shared;
1538         }
1539         else if (UNLIKELY(!(flags & FL_FREEZE))) {
1540             str = str_new_frozen(klass, str);
1541             FL_SET_RAW(str, flags & FL_TAINT);
1542             flags = FL_TEST_RAW(str, flag_mask);
1543 	}
1544 	if (flags & STR_NOEMBED) {
1545 	    RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, str);
1546 	    flags |= STR_SHARED;
1547 	}
1548 	else {
1549 	    MEMCPY(RSTRING(dup)->as.ary, RSTRING(str)->as.ary,
1550 		   char, embed_size);
1551 	}
1552     }
1553     FL_SET_RAW(dup, flags & ~FL_FREEZE);
1554     return dup;
1555 }
1556 
1557 VALUE
rb_str_dup(VALUE str)1558 rb_str_dup(VALUE str)
1559 {
1560     return str_duplicate(rb_obj_class(str), str);
1561 }
1562 
1563 VALUE
rb_str_resurrect(VALUE str)1564 rb_str_resurrect(VALUE str)
1565 {
1566     RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1567     return str_duplicate(rb_cString, str);
1568 }
1569 
1570 /*
1571  *  call-seq:
1572  *     String.new(str="")                   -> new_str
1573  *     String.new(str="", encoding: enc)    -> new_str
1574  *     String.new(str="", capacity: size)   -> new_str
1575  *
1576  *  Returns a new string object containing a copy of <i>str</i>.
1577  *
1578  *  The optional <i>encoding</i> keyword argument specifies the encoding
1579  *  of the new string.
1580  *  If not specified, the encoding of <i>str</i> is used
1581  *  (or ASCII-8BIT, if <i>str</i> is not specified).
1582  *
1583  *  The optional <i>capacity</i> keyword argument specifies the size
1584  *  of the internal buffer.
1585  *  This may improve performance, when the string will be concatenated many
1586  *  times (causing many realloc calls).
1587  */
1588 
1589 static VALUE
rb_str_init(int argc,VALUE * argv,VALUE str)1590 rb_str_init(int argc, VALUE *argv, VALUE str)
1591 {
1592     static ID keyword_ids[2];
1593     VALUE orig, opt, venc, vcapa;
1594     VALUE kwargs[2];
1595     rb_encoding *enc = 0;
1596     int n;
1597 
1598     if (!keyword_ids[0]) {
1599 	keyword_ids[0] = rb_id_encoding();
1600 	CONST_ID(keyword_ids[1], "capacity");
1601     }
1602 
1603     n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1604     if (!NIL_P(opt)) {
1605 	rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
1606 	venc = kwargs[0];
1607 	vcapa = kwargs[1];
1608 	if (venc != Qundef && !NIL_P(venc)) {
1609 	    enc = rb_to_encoding(venc);
1610 	}
1611 	if (vcapa != Qundef && !NIL_P(vcapa)) {
1612 	    long capa = NUM2LONG(vcapa);
1613 	    long len = 0;
1614 	    int termlen = enc ? rb_enc_mbminlen(enc) : 1;
1615 
1616 	    if (capa < STR_BUF_MIN_SIZE) {
1617 		capa = STR_BUF_MIN_SIZE;
1618 	    }
1619 	    if (n == 1) {
1620 		StringValue(orig);
1621 		len = RSTRING_LEN(orig);
1622 		if (capa < len) {
1623 		    capa = len;
1624 		}
1625 		if (orig == str) n = 0;
1626 	    }
1627 	    str_modifiable(str);
1628 	    if (STR_EMBED_P(str)) { /* make noembed always */
1629                 char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1630                 memcpy(new_ptr, RSTRING(str)->as.ary, RSTRING_EMBED_LEN_MAX + 1);
1631                 RSTRING(str)->as.heap.ptr = new_ptr;
1632             }
1633             else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
1634                 const size_t size = (size_t)capa + termlen;
1635                 const char *const old_ptr = RSTRING_PTR(str);
1636                 const size_t osize = RSTRING(str)->as.heap.len + TERM_LEN(str);
1637                 char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1638                 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
1639                 FL_UNSET_RAW(str, STR_SHARED);
1640                 RSTRING(str)->as.heap.ptr = new_ptr;
1641 	    }
1642 	    else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
1643 		SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
1644 			(size_t)capa + termlen, STR_HEAP_SIZE(str));
1645 	    }
1646 	    RSTRING(str)->as.heap.len = len;
1647 	    TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
1648 	    if (n == 1) {
1649 		memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
1650 		rb_enc_cr_str_exact_copy(str, orig);
1651 	    }
1652 	    FL_SET(str, STR_NOEMBED);
1653 	    RSTRING(str)->as.heap.aux.capa = capa;
1654 	}
1655 	else if (n == 1) {
1656 	    rb_str_replace(str, orig);
1657 	}
1658 	if (enc) {
1659 	    rb_enc_associate(str, enc);
1660 	    ENC_CODERANGE_CLEAR(str);
1661 	}
1662     }
1663     else if (n == 1) {
1664 	rb_str_replace(str, orig);
1665     }
1666     return str;
1667 }
1668 
#ifdef NONASCII_MASK
/* True unless +c+ has the 10xxxxxx bit pattern. */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * A UTF-8 leading byte looks like 0xxxxxxx or 11xxxxxx
 * (see http://en.wikipedia.org/wiki/UTF-8), so for a single byte:
 *
 * if (!(byte & 0x80))
 *   byte |= 0x40;          // turn on bit6
 * return ((byte>>6) & 1);  // bit6 tells whether this byte is leading or not.
 *
 * This function applies that test to every byte of the word at once and
 * returns how many of them are leading bytes.
 */
static inline uintptr_t
count_utf8_lead_bytes_with_word(const uintptr_t *s)
{
    const uintptr_t word = *s;
    uintptr_t lead;

    /* After this, bit0 of each byte says whether that byte is a leading byte. */
    lead = ((word >> 6) | (~word >> 7)) & (NONASCII_MASK >> 7);

    /* Sum the per-byte bits. */
#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
    /* use only if it can use POPCNT */
    return rb_popcount_intptr(lead);
#else
    lead += (lead >> 8);
    lead += (lead >> 16);
# if SIZEOF_VOIDP == 8
    lead += (lead >> 32);
# endif
    return (lead & 0xF);
#endif
}
#endif
1708 
1709 static inline long
enc_strlen(const char * p,const char * e,rb_encoding * enc,int cr)1710 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
1711 {
1712     long c;
1713     const char *q;
1714 
1715     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1716 	long diff = (long)(e - p);
1717 	return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
1718     }
1719 #ifdef NONASCII_MASK
1720     else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
1721 	uintptr_t len = 0;
1722 	if ((int)sizeof(uintptr_t) * 2 < e - p) {
1723 	    const uintptr_t *s, *t;
1724 	    const uintptr_t lowbits = sizeof(uintptr_t) - 1;
1725 	    s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
1726 	    t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
1727 	    while (p < (const char *)s) {
1728 		if (is_utf8_lead_byte(*p)) len++;
1729 		p++;
1730 	    }
1731 	    while (s < t) {
1732 		len += count_utf8_lead_bytes_with_word(s);
1733 		s++;
1734 	    }
1735 	    p = (const char *)s;
1736 	}
1737 	while (p < e) {
1738 	    if (is_utf8_lead_byte(*p)) len++;
1739 	    p++;
1740 	}
1741 	return (long)len;
1742     }
1743 #endif
1744     else if (rb_enc_asciicompat(enc)) {
1745         c = 0;
1746 	if (ENC_CODERANGE_CLEAN_P(cr)) {
1747 	    while (p < e) {
1748 		if (ISASCII(*p)) {
1749 		    q = search_nonascii(p, e);
1750 		    if (!q)
1751 			return c + (e - p);
1752 		    c += q - p;
1753 		    p = q;
1754 		}
1755 		p += rb_enc_fast_mbclen(p, e, enc);
1756 		c++;
1757 	    }
1758 	}
1759 	else {
1760 	    while (p < e) {
1761 		if (ISASCII(*p)) {
1762 		    q = search_nonascii(p, e);
1763 		    if (!q)
1764 			return c + (e - p);
1765 		    c += q - p;
1766 		    p = q;
1767 		}
1768 		p += rb_enc_mbclen(p, e, enc);
1769 		c++;
1770 	    }
1771 	}
1772         return c;
1773     }
1774 
1775     for (c=0; p<e; c++) {
1776         p += rb_enc_mbclen(p, e, enc);
1777     }
1778     return c;
1779 }
1780 
1781 long
rb_enc_strlen(const char * p,const char * e,rb_encoding * enc)1782 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
1783 {
1784     return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
1785 }
1786 
1787 /* To get strlen with cr
1788  * Note that given cr is not used.
1789  */
1790 long
rb_enc_strlen_cr(const char * p,const char * e,rb_encoding * enc,int * cr)1791 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
1792 {
1793     long c;
1794     const char *q;
1795     int ret;
1796 
1797     *cr = 0;
1798     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1799 	long diff = (long)(e - p);
1800 	return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
1801     }
1802     else if (rb_enc_asciicompat(enc)) {
1803 	c = 0;
1804 	while (p < e) {
1805 	    if (ISASCII(*p)) {
1806 		q = search_nonascii(p, e);
1807 		if (!q) {
1808 		    if (!*cr) *cr = ENC_CODERANGE_7BIT;
1809 		    return c + (e - p);
1810 		}
1811 		c += q - p;
1812 		p = q;
1813 	    }
1814 	    ret = rb_enc_precise_mbclen(p, e, enc);
1815 	    if (MBCLEN_CHARFOUND_P(ret)) {
1816 		*cr |= ENC_CODERANGE_VALID;
1817 		p += MBCLEN_CHARFOUND_LEN(ret);
1818 	    }
1819 	    else {
1820 		*cr = ENC_CODERANGE_BROKEN;
1821 		p++;
1822 	    }
1823 	    c++;
1824 	}
1825 	if (!*cr) *cr = ENC_CODERANGE_7BIT;
1826 	return c;
1827     }
1828 
1829     for (c=0; p<e; c++) {
1830 	ret = rb_enc_precise_mbclen(p, e, enc);
1831 	if (MBCLEN_CHARFOUND_P(ret)) {
1832 	    *cr |= ENC_CODERANGE_VALID;
1833 	    p += MBCLEN_CHARFOUND_LEN(ret);
1834 	}
1835 	else {
1836 	    *cr = ENC_CODERANGE_BROKEN;
1837             if (p + rb_enc_mbminlen(enc) <= e)
1838                 p += rb_enc_mbminlen(enc);
1839             else
1840                 p = e;
1841 	}
1842     }
1843     if (!*cr) *cr = ENC_CODERANGE_7BIT;
1844     return c;
1845 }
1846 
1847 /* enc must be str's enc or rb_enc_check(str, str2) */
1848 static long
str_strlen(VALUE str,rb_encoding * enc)1849 str_strlen(VALUE str, rb_encoding *enc)
1850 {
1851     const char *p, *e;
1852     int cr;
1853 
1854     if (single_byte_optimizable(str)) return RSTRING_LEN(str);
1855     if (!enc) enc = STR_ENC_GET(str);
1856     p = RSTRING_PTR(str);
1857     e = RSTRING_END(str);
1858     cr = ENC_CODERANGE(str);
1859 
1860     if (cr == ENC_CODERANGE_UNKNOWN) {
1861 	long n = rb_enc_strlen_cr(p, e, enc, &cr);
1862 	if (cr) ENC_CODERANGE_SET(str, cr);
1863 	return n;
1864     }
1865     else {
1866 	return enc_strlen(p, e, enc, cr);
1867     }
1868 }
1869 
1870 long
rb_str_strlen(VALUE str)1871 rb_str_strlen(VALUE str)
1872 {
1873     return str_strlen(str, NULL);
1874 }
1875 
1876 /*
1877  *  call-seq:
1878  *     str.length   -> integer
1879  *     str.size     -> integer
1880  *
1881  *  Returns the character length of <i>str</i>.
1882  */
1883 
1884 VALUE
rb_str_length(VALUE str)1885 rb_str_length(VALUE str)
1886 {
1887     return LONG2NUM(str_strlen(str, NULL));
1888 }
1889 
1890 /*
1891  *  call-seq:
1892  *     str.bytesize  -> integer
1893  *
1894  *  Returns the length of +str+ in bytes.
1895  *
1896  *    "\x80\u3042".bytesize  #=> 4
1897  *    "hello".bytesize       #=> 5
1898  */
1899 
1900 static VALUE
rb_str_bytesize(VALUE str)1901 rb_str_bytesize(VALUE str)
1902 {
1903     return LONG2NUM(RSTRING_LEN(str));
1904 }
1905 
1906 /*
1907  *  call-seq:
1908  *     str.empty?   -> true or false
1909  *
1910  *  Returns <code>true</code> if <i>str</i> has a length of zero.
1911  *
1912  *     "hello".empty?   #=> false
1913  *     " ".empty?       #=> false
1914  *     "".empty?        #=> true
1915  */
1916 
1917 static VALUE
rb_str_empty(VALUE str)1918 rb_str_empty(VALUE str)
1919 {
1920     if (RSTRING_LEN(str) == 0)
1921 	return Qtrue;
1922     return Qfalse;
1923 }
1924 
1925 /*
1926  *  call-seq:
1927  *     str + other_str   -> new_str
1928  *
1929  *  Concatenation---Returns a new <code>String</code> containing
1930  *  <i>other_str</i> concatenated to <i>str</i>.
1931  *
1932  *     "Hello from " + self.to_s   #=> "Hello from main"
1933  */
1934 
1935 VALUE
rb_str_plus(VALUE str1,VALUE str2)1936 rb_str_plus(VALUE str1, VALUE str2)
1937 {
1938     VALUE str3;
1939     rb_encoding *enc;
1940     char *ptr1, *ptr2, *ptr3;
1941     long len1, len2;
1942     int termlen;
1943 
1944     StringValue(str2);
1945     enc = rb_enc_check_str(str1, str2);
1946     RSTRING_GETMEM(str1, ptr1, len1);
1947     RSTRING_GETMEM(str2, ptr2, len2);
1948     termlen = rb_enc_mbminlen(enc);
1949     if (len1 > LONG_MAX - len2) {
1950 	rb_raise(rb_eArgError, "string size too big");
1951     }
1952     str3 = str_new0(rb_cString, 0, len1+len2, termlen);
1953     ptr3 = RSTRING_PTR(str3);
1954     memcpy(ptr3, ptr1, len1);
1955     memcpy(ptr3+len1, ptr2, len2);
1956     TERM_FILL(&ptr3[len1+len2], termlen);
1957 
1958     FL_SET_RAW(str3, OBJ_TAINTED_RAW(str1) | OBJ_TAINTED_RAW(str2));
1959     ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
1960 			   ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
1961     RB_GC_GUARD(str1);
1962     RB_GC_GUARD(str2);
1963     return str3;
1964 }
1965 
1966 /*
1967  *  call-seq:
1968  *     str * integer   -> new_str
1969  *
1970  *  Copy --- Returns a new String containing +integer+ copies of the receiver.
1971  *  +integer+ must be greater than or equal to 0.
1972  *
1973  *     "Ho! " * 3   #=> "Ho! Ho! Ho! "
1974  *     "Ho! " * 0   #=> ""
1975  */
1976 
1977 VALUE
rb_str_times(VALUE str,VALUE times)1978 rb_str_times(VALUE str, VALUE times)
1979 {
1980     VALUE str2;
1981     long n, len;
1982     char *ptr2;
1983     int termlen;
1984 
1985     if (times == INT2FIX(1)) {
1986 	return rb_str_dup(str);
1987     }
1988     if (times == INT2FIX(0)) {
1989 	str2 = str_alloc(rb_obj_class(str));
1990 	rb_enc_copy(str2, str);
1991 	OBJ_INFECT(str2, str);
1992 	return str2;
1993     }
1994     len = NUM2LONG(times);
1995     if (len < 0) {
1996 	rb_raise(rb_eArgError, "negative argument");
1997     }
1998     if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
1999        str2 = str_alloc(rb_obj_class(str));
2000        if (!STR_EMBEDDABLE_P(len, 1)) {
2001            RSTRING(str2)->as.heap.aux.capa = len;
2002            RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2003            STR_SET_NOEMBED(str2);
2004        }
2005        STR_SET_LEN(str2, len);
2006        rb_enc_copy(str2, str);
2007        OBJ_INFECT(str2, str);
2008        return str2;
2009     }
2010     if (len && LONG_MAX/len <  RSTRING_LEN(str)) {
2011 	rb_raise(rb_eArgError, "argument too big");
2012     }
2013 
2014     len *= RSTRING_LEN(str);
2015     termlen = TERM_LEN(str);
2016     str2 = str_new0(rb_obj_class(str), 0, len, termlen);
2017     ptr2 = RSTRING_PTR(str2);
2018     if (len) {
2019         n = RSTRING_LEN(str);
2020         memcpy(ptr2, RSTRING_PTR(str), n);
2021         while (n <= len/2) {
2022             memcpy(ptr2 + n, ptr2, n);
2023             n *= 2;
2024         }
2025         memcpy(ptr2 + n, ptr2, len-n);
2026     }
2027     STR_SET_LEN(str2, len);
2028     TERM_FILL(&ptr2[len], termlen);
2029     OBJ_INFECT(str2, str);
2030     rb_enc_cr_str_copy_for_substr(str2, str);
2031 
2032     return str2;
2033 }
2034 
2035 /*
2036  *  call-seq:
2037  *     str % arg   -> new_str
2038  *
2039  *  Format---Uses <i>str</i> as a format specification, and returns the result
2040  *  of applying it to <i>arg</i>. If the format specification contains more than
2041  *  one substitution, then <i>arg</i> must be an <code>Array</code> or <code>Hash</code>
2042  *  containing the values to be substituted. See <code>Kernel::sprintf</code> for
2043  *  details of the format string.
2044  *
2045  *     "%05d" % 123                              #=> "00123"
2046  *     "%-5s: %016x" % [ "ID", self.object_id ]  #=> "ID   : 00002b054ec93168"
2047  *     "foo = %{foo}" % { :foo => 'bar' }        #=> "foo = bar"
2048  */
2049 
2050 static VALUE
rb_str_format_m(VALUE str,VALUE arg)2051 rb_str_format_m(VALUE str, VALUE arg)
2052 {
2053     VALUE tmp = rb_check_array_type(arg);
2054 
2055     if (!NIL_P(tmp)) {
2056         return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2057     }
2058     return rb_str_format(1, &arg, str);
2059 }
2060 
2061 static inline void
rb_check_lockedtmp(VALUE str)2062 rb_check_lockedtmp(VALUE str)
2063 {
2064     if (FL_TEST(str, STR_TMPLOCK)) {
2065 	rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2066     }
2067 }
2068 
2069 static inline void
str_modifiable(VALUE str)2070 str_modifiable(VALUE str)
2071 {
2072     rb_check_lockedtmp(str);
2073     rb_check_frozen(str);
2074 }
2075 
2076 static inline int
str_dependent_p(VALUE str)2077 str_dependent_p(VALUE str)
2078 {
2079     if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2080 	return 0;
2081     }
2082     else {
2083 	return 1;
2084     }
2085 }
2086 
2087 static inline int
str_independent(VALUE str)2088 str_independent(VALUE str)
2089 {
2090     str_modifiable(str);
2091     return !str_dependent_p(str);
2092 }
2093 
2094 static void
str_make_independent_expand(VALUE str,long len,long expand,const int termlen)2095 str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2096 {
2097     char *ptr;
2098     char *oldptr;
2099     long capa = len + expand;
2100 
2101     if (len > capa) len = capa;
2102 
2103     if (!STR_EMBED_P(str) && STR_EMBEDDABLE_P(capa, termlen)) {
2104 	ptr = RSTRING(str)->as.heap.ptr;
2105 	STR_SET_EMBED(str);
2106 	memcpy(RSTRING(str)->as.ary, ptr, len);
2107 	TERM_FILL(RSTRING(str)->as.ary + len, termlen);
2108 	STR_SET_EMBED_LEN(str, len);
2109 	return;
2110     }
2111 
2112     ptr = ALLOC_N(char, (size_t)capa + termlen);
2113     oldptr = RSTRING_PTR(str);
2114     if (oldptr) {
2115 	memcpy(ptr, oldptr, len);
2116     }
2117     if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2118         xfree(oldptr);
2119     }
2120     STR_SET_NOEMBED(str);
2121     FL_UNSET(str, STR_SHARED|STR_NOFREE);
2122     TERM_FILL(ptr + len, termlen);
2123     RSTRING(str)->as.heap.ptr = ptr;
2124     RSTRING(str)->as.heap.len = len;
2125     RSTRING(str)->as.heap.aux.capa = capa;
2126 }
2127 
2128 void
rb_str_modify(VALUE str)2129 rb_str_modify(VALUE str)
2130 {
2131     if (!str_independent(str))
2132 	str_make_independent(str);
2133     ENC_CODERANGE_CLEAR(str);
2134 }
2135 
2136 void
rb_str_modify_expand(VALUE str,long expand)2137 rb_str_modify_expand(VALUE str, long expand)
2138 {
2139     int termlen = TERM_LEN(str);
2140     long len = RSTRING_LEN(str);
2141 
2142     if (expand < 0) {
2143 	rb_raise(rb_eArgError, "negative expanding string size");
2144     }
2145     if (expand > LONG_MAX - len) {
2146 	rb_raise(rb_eArgError, "string size too big");
2147     }
2148 
2149     if (!str_independent(str)) {
2150 	str_make_independent_expand(str, len, expand, termlen);
2151     }
2152     else if (expand > 0) {
2153 	RESIZE_CAPA_TERM(str, len + expand, termlen);
2154     }
2155     ENC_CODERANGE_CLEAR(str);
2156 }
2157 
2158 /* As rb_str_modify(), but don't clear coderange */
2159 static void
str_modify_keep_cr(VALUE str)2160 str_modify_keep_cr(VALUE str)
2161 {
2162     if (!str_independent(str))
2163 	str_make_independent(str);
2164     if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
2165 	/* Force re-scan later */
2166 	ENC_CODERANGE_CLEAR(str);
2167 }
2168 
2169 static inline void
str_discard(VALUE str)2170 str_discard(VALUE str)
2171 {
2172     str_modifiable(str);
2173     if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2174 	ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2175 	RSTRING(str)->as.heap.ptr = 0;
2176 	RSTRING(str)->as.heap.len = 0;
2177     }
2178 }
2179 
2180 void
rb_must_asciicompat(VALUE str)2181 rb_must_asciicompat(VALUE str)
2182 {
2183     rb_encoding *enc = rb_enc_get(str);
2184     if (!rb_enc_asciicompat(enc)) {
2185 	rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2186     }
2187 }
2188 
2189 VALUE
rb_string_value(volatile VALUE * ptr)2190 rb_string_value(volatile VALUE *ptr)
2191 {
2192     VALUE s = *ptr;
2193     if (!RB_TYPE_P(s, T_STRING)) {
2194 	s = rb_str_to_str(s);
2195 	*ptr = s;
2196     }
2197     return s;
2198 }
2199 
2200 char *
rb_string_value_ptr(volatile VALUE * ptr)2201 rb_string_value_ptr(volatile VALUE *ptr)
2202 {
2203     VALUE str = rb_string_value(ptr);
2204     return RSTRING_PTR(str);
2205 }
2206 
/*
 * Return 1 when the first n bytes at s are all zero (also when n <= 0),
 * and 0 as soon as a non-zero byte is seen.
 */
static int
zero_filled(const char *s, int n)
{
    while (n > 0) {
        if (*s != '\0') return 0;
        s++;
        n--;
    }
    return 1;
}
2215 
2216 static const char *
str_null_char(const char * s,long len,const int minlen,rb_encoding * enc)2217 str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2218 {
2219     const char *e = s + len;
2220 
2221     for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2222 	if (zero_filled(s, minlen)) return s;
2223     }
2224     return 0;
2225 }
2226 
2227 static char *
str_fill_term(VALUE str,char * s,long len,int termlen)2228 str_fill_term(VALUE str, char *s, long len, int termlen)
2229 {
2230     /* This function assumes that (capa + termlen) bytes of memory
2231      * is allocated, like many other functions in this file.
2232      */
2233     if (str_dependent_p(str)) {
2234 	if (!zero_filled(s + len, termlen))
2235 	    str_make_independent_expand(str, len, 0L, termlen);
2236     }
2237     else {
2238 	TERM_FILL(s + len, termlen);
2239 	return s;
2240     }
2241     return RSTRING_PTR(str);
2242 }
2243 
2244 void
rb_str_change_terminator_length(VALUE str,const int oldtermlen,const int termlen)2245 rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2246 {
2247     long capa = str_capacity(str, oldtermlen) + oldtermlen;
2248     long len = RSTRING_LEN(str);
2249 
2250     assert(capa >= len);
2251     if (capa - len < termlen) {
2252 	rb_check_lockedtmp(str);
2253 	str_make_independent_expand(str, len, 0L, termlen);
2254     }
2255     else if (str_dependent_p(str)) {
2256 	if (termlen > oldtermlen)
2257 	    str_make_independent_expand(str, len, 0L, termlen);
2258     }
2259     else {
2260 	if (!STR_EMBED_P(str)) {
2261 	    /* modify capa instead of realloc */
2262 	    assert(!FL_TEST((str), STR_SHARED));
2263 	    RSTRING(str)->as.heap.aux.capa = capa - termlen;
2264 	}
2265 	if (termlen > oldtermlen) {
2266 	    TERM_FILL(RSTRING_PTR(str) + len, termlen);
2267 	}
2268     }
2269 
2270     return;
2271 }
2272 
2273 static char *
str_null_check(VALUE str,int * w)2274 str_null_check(VALUE str, int *w)
2275 {
2276     char *s = RSTRING_PTR(str);
2277     long len = RSTRING_LEN(str);
2278     rb_encoding *enc = rb_enc_get(str);
2279     const int minlen = rb_enc_mbminlen(enc);
2280 
2281     if (minlen > 1) {
2282 	*w = 1;
2283 	if (str_null_char(s, len, minlen, enc)) {
2284 	    return NULL;
2285 	}
2286 	return str_fill_term(str, s, len, minlen);
2287     }
2288     *w = 0;
2289     if (!s || memchr(s, 0, len)) {
2290 	return NULL;
2291     }
2292     if (s[len]) {
2293 	s = str_fill_term(str, s, len, minlen);
2294     }
2295     return s;
2296 }
2297 
2298 char *
rb_str_to_cstr(VALUE str)2299 rb_str_to_cstr(VALUE str)
2300 {
2301     int w;
2302     return str_null_check(str, &w);
2303 }
2304 
2305 char *
rb_string_value_cstr(volatile VALUE * ptr)2306 rb_string_value_cstr(volatile VALUE *ptr)
2307 {
2308     VALUE str = rb_string_value(ptr);
2309     int w;
2310     char *s = str_null_check(str, &w);
2311     if (!s) {
2312 	if (w) {
2313 	    rb_raise(rb_eArgError, "string contains null char");
2314 	}
2315 	rb_raise(rb_eArgError, "string contains null byte");
2316     }
2317     return s;
2318 }
2319 
2320 char *
rb_str_fill_terminator(VALUE str,const int newminlen)2321 rb_str_fill_terminator(VALUE str, const int newminlen)
2322 {
2323     char *s = RSTRING_PTR(str);
2324     long len = RSTRING_LEN(str);
2325     return str_fill_term(str, s, len, newminlen);
2326 }
2327 
2328 VALUE
rb_check_string_type(VALUE str)2329 rb_check_string_type(VALUE str)
2330 {
2331     str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2332     return str;
2333 }
2334 
2335 /*
2336  *  call-seq:
2337  *     String.try_convert(obj) -> string or nil
2338  *
2339  *  Try to convert <i>obj</i> into a String, using to_str method.
2340  *  Returns converted string or nil if <i>obj</i> cannot be converted
2341  *  for any reason.
2342  *
2343  *     String.try_convert("str")     #=> "str"
2344  *     String.try_convert(/re/)      #=> nil
2345  */
2346 static VALUE
rb_str_s_try_convert(VALUE dummy,VALUE str)2347 rb_str_s_try_convert(VALUE dummy, VALUE str)
2348 {
2349     return rb_check_string_type(str);
2350 }
2351 
/*
 * Advance from p by up to *nthp characters within [p, e) and return the
 * resulting position, never beyond e.  The local counter nth is written
 * back through nthp before returning; only the two scanning branches
 * decrement it, the two fixed-width branches write it back unchanged.
 */
static char*
str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
{
    long nth = *nthp;
    if (rb_enc_mbmaxlen(enc) == 1) {
        /* every character is one byte */
        p += nth;
    }
    else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
        /* fixed-width encoding: plain multiplication */
        p += nth * rb_enc_mbmaxlen(enc);
    }
    else if (rb_enc_asciicompat(enc)) {
        const char *p2, *e2;
        int n;

        while (p < e && 0 < nth) {
            /* e2: where p would land if the next nth characters were all one byte */
            e2 = p + nth;
            if (e < e2) {
                /* fewer than nth bytes remain, so the end is reached first */
                *nthp = nth;
                return (char *)e;
            }
            if (ISASCII(*p)) {
                /* skip a run of ASCII bytes in one step */
                p2 = search_nonascii(p, e2);
                if (!p2) {
		    nth -= e2 - p;
		    *nthp = nth;
                    return (char *)e2;
                }
                nth -= p2 - p;
                p = p2;
            }
            /* step over one character as measured by rb_enc_mbclen() */
            n = rb_enc_mbclen(p, e, enc);
            p += n;
            nth--;
        }
        *nthp = nth;
        if (nth != 0) {
            return (char *)e;
        }
        return (char *)p;
    }
    else {
        /* generic variable-width encoding: one character per iteration */
        while (p < e && nth--) {
            p += rb_enc_mbclen(p, e, enc);
        }
    }
    /* clamp: the arithmetic above may have stepped past the end */
    if (p > e) p = e;
    *nthp = nth;
    return (char*)p;
}
2401 
2402 char*
rb_enc_nth(const char * p,const char * e,long nth,rb_encoding * enc)2403 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2404 {
2405     return str_nth_len(p, e, &nth, enc);
2406 }
2407 
2408 static char*
str_nth(const char * p,const char * e,long nth,rb_encoding * enc,int singlebyte)2409 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2410 {
2411     if (singlebyte)
2412 	p += nth;
2413     else {
2414 	p = str_nth_len(p, e, &nth, enc);
2415     }
2416     if (!p) return 0;
2417     if (p > e) p = e;
2418     return (char *)p;
2419 }
2420 
2421 /* char offset to byte offset */
2422 static long
str_offset(const char * p,const char * e,long nth,rb_encoding * enc,int singlebyte)2423 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2424 {
2425     const char *pp = str_nth(p, e, nth, enc, singlebyte);
2426     if (!pp) return e - p;
2427     return pp - p;
2428 }
2429 
2430 long
rb_str_offset(VALUE str,long pos)2431 rb_str_offset(VALUE str, long pos)
2432 {
2433     return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2434 		      STR_ENC_GET(str), single_byte_optimizable(str));
2435 }
2436 
2437 #ifdef NONASCII_MASK
/*
 * Advance p through [p, e), counting bytes for which is_utf8_lead_byte()
 * is true, and stop at the lead byte reached when the count *nthp has
 * been used up, or at e.  The remaining count is written back through
 * nthp.  When both the range and the count are large enough, the middle
 * part is consumed one pointer-sized word at a time.
 */
static char *
str_utf8_nth(const char *p, const char *e, long *nthp)
{
    long nth = *nthp;
    if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
	const uintptr_t *s, *t;
	const uintptr_t lowbits = SIZEOF_VOIDP - 1;
	/* s: p rounded up to a word boundary; t: e rounded down to one */
	s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
	t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
	/* byte-wise until p reaches the first word boundary */
	while (p < (const char *)s) {
	    if (is_utf8_lead_byte(*p)) nth--;
	    p++;
	}
	/* word-wise while at least a word's worth of characters is still wanted */
	do {
	    nth -= count_utf8_lead_bytes_with_word(s);
	    s++;
	} while (s < t && (int)SIZEOF_VOIDP <= nth);
	p = (char *)s;
    }
    /* byte-wise tail: stop on the lead byte seen once the count is zero */
    while (p < e) {
	if (is_utf8_lead_byte(*p)) {
	    if (nth == 0) break;
	    nth--;
	}
	p++;
    }
    *nthp = nth;
    return (char *)p;
}
2467 
/*
 * Byte distance from p to the position str_utf8_nth() reports for nth
 * characters within [p, e).
 */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *pos = str_utf8_nth(p, e, &remaining);

    return pos - p;
}
2474 #endif
2475 
2476 /* byte offset to char offset */
2477 long
rb_str_sublen(VALUE str,long pos)2478 rb_str_sublen(VALUE str, long pos)
2479 {
2480     if (single_byte_optimizable(str) || pos < 0)
2481         return pos;
2482     else {
2483 	char *p = RSTRING_PTR(str);
2484         return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
2485     }
2486 }
2487 
2488 VALUE
rb_str_subseq(VALUE str,long beg,long len)2489 rb_str_subseq(VALUE str, long beg, long len)
2490 {
2491     VALUE str2;
2492 
2493     if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) &&
2494 	SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
2495 	long olen;
2496 	str2 = rb_str_new_shared(rb_str_new_frozen(str));
2497 	RSTRING(str2)->as.heap.ptr += beg;
2498 	olen = RSTRING(str2)->as.heap.len;
2499 	if (olen > len) RSTRING(str2)->as.heap.len = len;
2500     }
2501     else {
2502         str2 = rb_str_new_with_class(str, RSTRING_PTR(str)+beg, len);
2503 	RB_GC_GUARD(str);
2504     }
2505 
2506     rb_enc_cr_str_copy_for_substr(str2, str);
2507     OBJ_INFECT(str2, str);
2508 
2509     return str2;
2510 }
2511 
2512 char *
rb_str_subpos(VALUE str,long beg,long * lenp)2513 rb_str_subpos(VALUE str, long beg, long *lenp)
2514 {
2515     long len = *lenp;
2516     long slen = -1L;
2517     long blen = RSTRING_LEN(str);
2518     rb_encoding *enc = STR_ENC_GET(str);
2519     char *p, *s = RSTRING_PTR(str), *e = s + blen;
2520 
2521     if (len < 0) return 0;
2522     if (!blen) {
2523 	len = 0;
2524     }
2525     if (single_byte_optimizable(str)) {
2526 	if (beg > blen) return 0;
2527 	if (beg < 0) {
2528 	    beg += blen;
2529 	    if (beg < 0) return 0;
2530 	}
2531 	if (len > blen - beg)
2532 	    len = blen - beg;
2533 	if (len < 0) return 0;
2534 	p = s + beg;
2535 	goto end;
2536     }
2537     if (beg < 0) {
2538 	if (len > -beg) len = -beg;
2539 	if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
2540 	    beg = -beg;
2541 	    while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
2542 	    p = e;
2543 	    if (!p) return 0;
2544 	    while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
2545 	    if (!p) return 0;
2546 	    len = e - p;
2547 	    goto end;
2548 	}
2549 	else {
2550 	    slen = str_strlen(str, enc);
2551 	    beg += slen;
2552 	    if (beg < 0) return 0;
2553 	    p = s + beg;
2554 	    if (len == 0) goto end;
2555 	}
2556     }
2557     else if (beg > 0 && beg > RSTRING_LEN(str)) {
2558 	return 0;
2559     }
2560     if (len == 0) {
2561 	if (beg > str_strlen(str, enc)) return 0; /* str's enc */
2562 	p = s + beg;
2563     }
2564 #ifdef NONASCII_MASK
2565     else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
2566         enc == rb_utf8_encoding()) {
2567         p = str_utf8_nth(s, e, &beg);
2568         if (beg > 0) return 0;
2569         len = str_utf8_offset(p, e, len);
2570     }
2571 #endif
2572     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2573 	int char_sz = rb_enc_mbmaxlen(enc);
2574 
2575 	p = s + beg * char_sz;
2576 	if (p > e) {
2577 	    return 0;
2578 	}
2579         else if (len * char_sz > e - p)
2580             len = e - p;
2581         else
2582 	    len *= char_sz;
2583     }
2584     else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
2585 	if (beg > 0) return 0;
2586 	len = 0;
2587     }
2588     else {
2589 	len = str_offset(p, e, len, enc, 0);
2590     }
2591   end:
2592     *lenp = len;
2593     RB_GC_GUARD(str);
2594     return p;
2595 }
2596 
2597 static VALUE str_substr(VALUE str, long beg, long len, int empty);
2598 
2599 VALUE
rb_str_substr(VALUE str,long beg,long len)2600 rb_str_substr(VALUE str, long beg, long len)
2601 {
2602     return str_substr(str, beg, len, TRUE);
2603 }
2604 
2605 static VALUE
str_substr(VALUE str,long beg,long len,int empty)2606 str_substr(VALUE str, long beg, long len, int empty)
2607 {
2608     VALUE str2;
2609     char *p = rb_str_subpos(str, beg, &len);
2610 
2611     if (!p) return Qnil;
2612     if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) &&
2613 	SHARABLE_SUBSTRING_P(p, len, RSTRING_END(str))) {
2614 	long ofs = p - RSTRING_PTR(str);
2615 	str2 = rb_str_new_frozen(str);
2616 	str2 = str_new_shared(rb_obj_class(str2), str2);
2617 	RSTRING(str2)->as.heap.ptr += ofs;
2618 	RSTRING(str2)->as.heap.len = len;
2619 	ENC_CODERANGE_CLEAR(str2);
2620     }
2621     else {
2622 	if (!len && !empty) return Qnil;
2623 	str2 = rb_str_new_with_class(str, p, len);
2624 	OBJ_INFECT(str2, str);
2625 	RB_GC_GUARD(str);
2626     }
2627     rb_enc_cr_str_copy_for_substr(str2, str);
2628 
2629     return str2;
2630 }
2631 
2632 VALUE
rb_str_freeze(VALUE str)2633 rb_str_freeze(VALUE str)
2634 {
2635     if (OBJ_FROZEN(str)) return str;
2636     rb_str_resize(str, RSTRING_LEN(str));
2637     return rb_obj_freeze(str);
2638 }
2639 
2640 
2641 /*
2642  * call-seq:
2643  *   +str  -> str (mutable)
2644  *
2645  * If the string is frozen, then return duplicated mutable string.
2646  *
2647  * If the string is not frozen, then return the string itself.
2648  */
2649 static VALUE
str_uplus(VALUE str)2650 str_uplus(VALUE str)
2651 {
2652     if (OBJ_FROZEN(str)) {
2653 	return rb_str_dup(str);
2654     }
2655     else {
2656 	return str;
2657     }
2658 }
2659 
2660 /*
2661  * call-seq:
2662  *   -str  -> str (frozen)
2663  *
2664  * Returns a frozen, possibly pre-existing copy of the string.
2665  *
 * The string will be deduplicated as long as it is not tainted
 * and has no instance variables set on it.
2668  */
2669 static VALUE
str_uminus(VALUE str)2670 str_uminus(VALUE str)
2671 {
2672     return rb_fstring(str);
2673 }
2674 
/*
 * NOTE(review): RUBY_ALIAS_FUNCTION is defined outside this chunk; it
 * presumably emits rb_str_dup_frozen as a function that forwards to
 * rb_str_new_frozen -- confirm against its definition.
 */
RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
/* From this point on, rb_str_dup_frozen in this file expands to rb_str_new_frozen. */
#define rb_str_dup_frozen rb_str_new_frozen
2677 
2678 VALUE
2679 rb_str_locktmp(VALUE str)
2680 {
2681     if (FL_TEST(str, STR_TMPLOCK)) {
2682 	rb_raise(rb_eRuntimeError, "temporal locking already locked string");
2683     }
2684     FL_SET(str, STR_TMPLOCK);
2685     return str;
2686 }
2687 
2688 VALUE
rb_str_unlocktmp(VALUE str)2689 rb_str_unlocktmp(VALUE str)
2690 {
2691     if (!FL_TEST(str, STR_TMPLOCK)) {
2692 	rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
2693     }
2694     FL_UNSET(str, STR_TMPLOCK);
2695     return str;
2696 }
2697 
2698 RUBY_FUNC_EXPORTED VALUE
rb_str_locktmp_ensure(VALUE str,VALUE (* func)(VALUE),VALUE arg)2699 rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
2700 {
2701     rb_str_locktmp(str);
2702     return rb_ensure(func, arg, rb_str_unlocktmp, str);
2703 }
2704 
2705 void
rb_str_set_len(VALUE str,long len)2706 rb_str_set_len(VALUE str, long len)
2707 {
2708     long capa;
2709     const int termlen = TERM_LEN(str);
2710 
2711     str_modifiable(str);
2712     if (STR_SHARED_P(str)) {
2713 	rb_raise(rb_eRuntimeError, "can't set length of shared string");
2714     }
2715     if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
2716 	rb_bug("probable buffer overflow: %ld for %ld", len, capa);
2717     }
2718     STR_SET_LEN(str, len);
2719     TERM_FILL(&RSTRING_PTR(str)[len], termlen);
2720 }
2721 
/*
 * Resize str to exactly len bytes and return str.  Raises ArgumentError
 * for a negative len; str_independent() may raise if str is not
 * modifiable.  The coderange is cleared before any resizing decision.
 * Depending on the current representation and on len the string stays
 * embedded, becomes embedded, gets a private buffer, or has its heap
 * buffer reallocated.
 */
VALUE
rb_str_resize(VALUE str, long len)
{
    long slen;
    int independent;

    if (len < 0) {
	rb_raise(rb_eArgError, "negative string size (or size too big)");
    }

    independent = str_independent(str);
    ENC_CODERANGE_CLEAR(str);
    slen = RSTRING_LEN(str);

    {
	long capa;
	const int termlen = TERM_LEN(str);
	if (STR_EMBED_P(str)) {
	    if (len == slen) return str;
	    if (STR_EMBEDDABLE_P(len, termlen)) {
		/* stays embedded: only length and terminator change */
		STR_SET_EMBED_LEN(str, len);
		TERM_FILL(RSTRING(str)->as.ary + len, termlen);
		return str;
	    }
	    /* no longer fits: move to a heap buffer, then fall through to set len */
	    str_make_independent_expand(str, slen, len - slen, termlen);
	}
	else if (STR_EMBEDDABLE_P(len, termlen)) {
	    /* heap string that now fits embedded: copy the kept bytes in */
	    char *ptr = STR_HEAP_PTR(str);
	    STR_SET_EMBED(str);
	    if (slen > len) slen = len;
	    if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
	    TERM_FILL(RSTRING(str)->as.ary + len, termlen);
	    STR_SET_EMBED_LEN(str, len);
	    /* the old heap buffer is freed only if str was independent */
	    if (independent) ruby_xfree(ptr);
	    return str;
	}
	else if (!independent) {
	    if (len == slen) return str;
	    str_make_independent_expand(str, slen, len - slen, termlen);
	}
	else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
		 (capa - len) > (len < 1024 ? len : 1024)) {
	    /* reallocate when too small, or when the slack exceeds min(len, 1024) */
	    SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
	                    (size_t)len + termlen, STR_HEAP_SIZE(str));
	    RSTRING(str)->as.heap.aux.capa = len;
	}
	else if (len == slen) return str;
	/* common tail for every heap path that did not return above */
	RSTRING(str)->as.heap.len = len;
	TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
    }
    return str;
}
2774 
2775 static VALUE
str_buf_cat(VALUE str,const char * ptr,long len)2776 str_buf_cat(VALUE str, const char *ptr, long len)
2777 {
2778     long capa, total, olen, off = -1;
2779     char *sptr;
2780     const int termlen = TERM_LEN(str);
2781     assert(termlen < RSTRING_EMBED_LEN_MAX + 1); /* < (LONG_MAX/2) */
2782 
2783     RSTRING_GETMEM(str, sptr, olen);
2784     if (ptr >= sptr && ptr <= sptr + olen) {
2785         off = ptr - sptr;
2786     }
2787     rb_str_modify(str);
2788     if (len == 0) return 0;
2789     if (STR_EMBED_P(str)) {
2790 	capa = RSTRING_EMBED_LEN_MAX + 1 - termlen;
2791 	sptr = RSTRING(str)->as.ary;
2792 	olen = RSTRING_EMBED_LEN(str);
2793     }
2794     else {
2795 	capa = RSTRING(str)->as.heap.aux.capa;
2796 	sptr = RSTRING(str)->as.heap.ptr;
2797 	olen = RSTRING(str)->as.heap.len;
2798     }
2799     if (olen > LONG_MAX - len) {
2800 	rb_raise(rb_eArgError, "string sizes too big");
2801     }
2802     total = olen + len;
2803     if (capa < total) {
2804 	if (total >= LONG_MAX / 2) {
2805 	    capa = total;
2806 	}
2807 	while (total > capa) {
2808 	    capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
2809 	}
2810 	RESIZE_CAPA_TERM(str, capa, termlen);
2811 	sptr = RSTRING_PTR(str);
2812     }
2813     if (off != -1) {
2814         ptr = sptr + off;
2815     }
2816     memcpy(sptr + olen, ptr, len);
2817     STR_SET_LEN(str, total);
2818     TERM_FILL(sptr + total, termlen); /* sentinel */
2819 
2820     return str;
2821 }
2822 
2823 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
2824 
2825 VALUE
rb_str_cat(VALUE str,const char * ptr,long len)2826 rb_str_cat(VALUE str, const char *ptr, long len)
2827 {
2828     if (len == 0) return str;
2829     if (len < 0) {
2830 	rb_raise(rb_eArgError, "negative string size (or size too big)");
2831     }
2832     return str_buf_cat(str, ptr, len);
2833 }
2834 
2835 VALUE
rb_str_cat_cstr(VALUE str,const char * ptr)2836 rb_str_cat_cstr(VALUE str, const char *ptr)
2837 {
2838     must_not_null(ptr);
2839     return rb_str_buf_cat(str, ptr, strlen(ptr));
2840 }
2841 
/*
 * NOTE(review): RUBY_ALIAS_FUNCTION is defined outside this chunk; each
 * line below presumably emits the first-named function as a forwarder to
 * the second with the listed arguments -- confirm against its definition.
 *   rb_str_buf_cat  -> rb_str_cat
 *   rb_str_buf_cat2 -> rb_str_cat_cstr
 *   rb_str_cat2     -> rb_str_cat_cstr
 */
RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
2845 
/*
 * Append len bytes at ptr, tagged with encoding index ptr_encindex and
 * coderange ptr_cr, to str, then set str's resulting encoding index and
 * coderange.
 *
 * ptr_cr_ret, when non-NULL, receives the coderange of ptr as known
 * after any scan done here.  It is not written on the early returns or
 * on the direct jump to the error inside the ASCII-incompatible branch.
 *
 * Raises Encoding::CompatibilityError when the two encodings cannot be
 * combined, and ArgumentError when len is negative.  Note that the
 * negative-length check is made only immediately before the append.
 */
static VALUE
rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
    int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
{
    int str_encindex = ENCODING_GET(str);
    int res_encindex;
    int str_cr, res_cr;
    rb_encoding *str_enc, *ptr_enc;

    /* an empty receiver is treated as 7bit whatever its stored coderange */
    str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;

    if (str_encindex == ptr_encindex) {
	/* same encoding: scan ptr only when str's coderange is known */
	if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
            ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
        }
    }
    else {
	str_enc = rb_enc_from_index(str_encindex);
	ptr_enc = rb_enc_from_index(ptr_encindex);
        if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
            /* an ASCII-incompatible side: only an empty side can be combined */
            if (len == 0)
                return str;
            if (RSTRING_LEN(str) == 0) {
                rb_str_buf_cat(str, ptr, len);
                ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
                return str;
            }
            goto incompatible;
        }
	if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
	    ptr_cr = coderange_scan(ptr, len, ptr_enc);
	}
        if (str_cr == ENC_CODERANGE_UNKNOWN) {
            if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
                str_cr = rb_enc_str_coderange(str);
            }
        }
    }
    if (ptr_cr_ret)
        *ptr_cr_ret = ptr_cr;

    /* different encodings can be combined only if at least one side is 7bit */
    if (str_encindex != ptr_encindex &&
        str_cr != ENC_CODERANGE_7BIT &&
        ptr_cr != ENC_CODERANGE_7BIT) {
	str_enc = rb_enc_from_index(str_encindex);
	ptr_enc = rb_enc_from_index(ptr_encindex);
      incompatible:
        rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
		 rb_enc_name(str_enc), rb_enc_name(ptr_enc));
    }

    /* choose the encoding index and coderange of the result */
    if (str_cr == ENC_CODERANGE_UNKNOWN) {
        res_encindex = str_encindex;
        res_cr = ENC_CODERANGE_UNKNOWN;
    }
    else if (str_cr == ENC_CODERANGE_7BIT) {
        if (ptr_cr == ENC_CODERANGE_7BIT) {
            res_encindex = str_encindex;
            res_cr = ENC_CODERANGE_7BIT;
        }
        else {
            /* a 7bit receiver takes over the appended data's encoding */
            res_encindex = ptr_encindex;
            res_cr = ptr_cr;
        }
    }
    else if (str_cr == ENC_CODERANGE_VALID) {
        res_encindex = str_encindex;
	if (ENC_CODERANGE_CLEAN_P(ptr_cr))
	    res_cr = str_cr;
	else
	    res_cr = ptr_cr;
    }
    else { /* str_cr == ENC_CODERANGE_BROKEN */
        res_encindex = str_encindex;
        res_cr = str_cr;
        /* a non-empty append resets the result to unknown */
        if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
    }

    if (len < 0) {
	rb_raise(rb_eArgError, "negative string size (or size too big)");
    }
    str_buf_cat(str, ptr, len);
    ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
    return str;
}
2931 
2932 VALUE
rb_enc_str_buf_cat(VALUE str,const char * ptr,long len,rb_encoding * ptr_enc)2933 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
2934 {
2935     return rb_enc_cr_str_buf_cat(str, ptr, len,
2936         rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
2937 }
2938 
2939 VALUE
rb_str_buf_cat_ascii(VALUE str,const char * ptr)2940 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
2941 {
2942     /* ptr must reference NUL terminated ASCII string. */
2943     int encindex = ENCODING_GET(str);
2944     rb_encoding *enc = rb_enc_from_index(encindex);
2945     if (rb_enc_asciicompat(enc)) {
2946         return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
2947             encindex, ENC_CODERANGE_7BIT, 0);
2948     }
2949     else {
2950         char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
2951         while (*ptr) {
2952             unsigned int c = (unsigned char)*ptr;
2953             int len = rb_enc_codelen(c, enc);
2954             rb_enc_mbcput(c, buf, enc);
2955             rb_enc_cr_str_buf_cat(str, buf, len,
2956                 encindex, ENC_CODERANGE_VALID, 0);
2957             ptr++;
2958         }
2959         return str;
2960     }
2961 }
2962 
2963 VALUE
rb_str_buf_append(VALUE str,VALUE str2)2964 rb_str_buf_append(VALUE str, VALUE str2)
2965 {
2966     int str2_cr;
2967 
2968     str2_cr = ENC_CODERANGE(str2);
2969 
2970     rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
2971         ENCODING_GET(str2), str2_cr, &str2_cr);
2972 
2973     OBJ_INFECT(str, str2);
2974     ENC_CODERANGE_SET(str2, str2_cr);
2975 
2976     return str;
2977 }
2978 
2979 VALUE
rb_str_append(VALUE str,VALUE str2)2980 rb_str_append(VALUE str, VALUE str2)
2981 {
2982     StringValue(str2);
2983     return rb_str_buf_append(str, str2);
2984 }
2985 
2986 #define MIN_PRE_ALLOC_SIZE 48
2987 
2988 MJIT_FUNC_EXPORTED VALUE
rb_str_concat_literals(size_t num,const VALUE * strary)2989 rb_str_concat_literals(size_t num, const VALUE *strary)
2990 {
2991     VALUE str;
2992     size_t i, s;
2993     long len = 1;
2994 
2995     if (UNLIKELY(!num)) return rb_str_new(0, 0);
2996     if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
2997 
2998     for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
2999     if (LIKELY(len < MIN_PRE_ALLOC_SIZE)) {
3000 	str = rb_str_resurrect(strary[0]);
3001 	s = 1;
3002     }
3003     else {
3004 	str = rb_str_buf_new(len);
3005 	rb_enc_copy(str, strary[0]);
3006 	s = 0;
3007     }
3008 
3009     for (i = s; i < num; ++i) {
3010 	const VALUE v = strary[i];
3011 	int encidx = ENCODING_GET(v);
3012 
3013 	rb_enc_cr_str_buf_cat(str, RSTRING_PTR(v), RSTRING_LEN(v),
3014 			      encidx, ENC_CODERANGE(v), NULL);
3015 	OBJ_INFECT_RAW(str, v);
3016 	if (encidx != ENCINDEX_US_ASCII) {
3017 	    if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3018 		rb_enc_set_index(str, encidx);
3019 	}
3020     }
3021     return str;
3022 }
3023 
3024 /*
3025  *  call-seq:
3026  *     str.concat(obj1, obj2, ...)          -> str
3027  *
3028  *  Concatenates the given object(s) to <i>str</i>. If an object is an
3029  *  <code>Integer</code>, it is considered a codepoint and converted
3030  *  to a character before concatenation.
3031  *
3032  *  +concat+ can take multiple arguments, and all the arguments are
3033  *  concatenated in order.
3034  *
3035  *     a = "hello "
3036  *     a.concat("world", 33)      #=> "hello world!"
3037  *     a                          #=> "hello world!"
3038  *
3039  *     b = "sn"
3040  *     b.concat("_", b, "_", b)   #=> "sn_sn_sn"
3041  *
3042  *  See also String#<<, which takes a single argument.
3043  */
3044 static VALUE
rb_str_concat_multi(int argc,VALUE * argv,VALUE str)3045 rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3046 {
3047     str_modifiable(str);
3048 
3049     if (argc == 1) {
3050 	return rb_str_concat(str, argv[0]);
3051     }
3052     else if (argc > 1) {
3053 	int i;
3054 	VALUE arg_str = rb_str_tmp_new(0);
3055 	rb_enc_copy(arg_str, str);
3056 	for (i = 0; i < argc; i++) {
3057 	    rb_str_concat(arg_str, argv[i]);
3058 	}
3059 	rb_str_buf_append(str, arg_str);
3060     }
3061 
3062     return str;
3063 }
3064 
3065 /*
3066  *  call-seq:
3067  *     str << obj      -> str
3068  *     str << integer  -> str
3069  *
3070  *  Appends the given object to <i>str</i>. If the object is an
3071  *  <code>Integer</code>, it is considered a codepoint and converted
3072  *  to a character before being appended.
3073  *
3074  *     a = "hello "
3075  *     a << "world"   #=> "hello world"
3076  *     a << 33        #=> "hello world!"
3077  *
3078  *  See also String#concat, which takes multiple arguments.
3079  */
3080 VALUE
rb_str_concat(VALUE str1,VALUE str2)3081 rb_str_concat(VALUE str1, VALUE str2)
3082 {
3083     unsigned int code;
3084     rb_encoding *enc = STR_ENC_GET(str1);
3085     int encidx;
3086 
3087     if (RB_INTEGER_TYPE_P(str2)) {
3088 	if (rb_num_to_uint(str2, &code) == 0) {
3089 	}
3090 	else if (FIXNUM_P(str2)) {
3091 	    rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3092 	}
3093 	else {
3094 	    rb_raise(rb_eRangeError, "bignum out of char range");
3095 	}
3096     }
3097     else {
3098 	return rb_str_append(str1, str2);
3099     }
3100 
3101     encidx = rb_enc_to_index(enc);
3102     if (encidx == ENCINDEX_ASCII || encidx == ENCINDEX_US_ASCII) {
3103 	/* US-ASCII automatically extended to ASCII-8BIT */
3104 	char buf[1];
3105 	buf[0] = (char)code;
3106 	if (code > 0xFF) {
3107 	    rb_raise(rb_eRangeError, "%u out of char range", code);
3108 	}
3109 	rb_str_cat(str1, buf, 1);
3110 	if (encidx == ENCINDEX_US_ASCII && code > 127) {
3111 	    rb_enc_associate_index(str1, ENCINDEX_ASCII);
3112 	    ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID);
3113 	}
3114     }
3115     else {
3116 	long pos = RSTRING_LEN(str1);
3117 	int cr = ENC_CODERANGE(str1);
3118 	int len;
3119 	char *buf;
3120 
3121 	switch (len = rb_enc_codelen(code, enc)) {
3122 	  case ONIGERR_INVALID_CODE_POINT_VALUE:
3123 	    rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3124 	    break;
3125 	  case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3126 	  case 0:
3127 	    rb_raise(rb_eRangeError, "%u out of char range", code);
3128 	    break;
3129 	}
3130 	buf = ALLOCA_N(char, len + 1);
3131 	rb_enc_mbcput(code, buf, enc);
3132 	if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3133 	    rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3134 	}
3135 	rb_str_resize(str1, pos+len);
3136 	memcpy(RSTRING_PTR(str1) + pos, buf, len);
3137 	if (cr == ENC_CODERANGE_7BIT && code > 127)
3138 	    cr = ENC_CODERANGE_VALID;
3139 	ENC_CODERANGE_SET(str1, cr);
3140     }
3141     return str1;
3142 }
3143 
3144 /*
3145  *  call-seq:
3146  *     str.prepend(other_str1, other_str2, ...)  -> str
3147  *
3148  *  Prepend---Prepend the given strings to <i>str</i>.
3149  *
3150  *     a = "!"
3151  *     a.prepend("hello ", "world") #=> "hello world!"
3152  *     a                            #=> "hello world!"
3153  *
3154  *  See also String#concat.
3155  */
3156 
3157 static VALUE
rb_str_prepend_multi(int argc,VALUE * argv,VALUE str)3158 rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
3159 {
3160     str_modifiable(str);
3161 
3162     if (argc == 1) {
3163 	rb_str_update(str, 0L, 0L, argv[0]);
3164     }
3165     else if (argc > 1) {
3166 	int i;
3167 	VALUE arg_str = rb_str_tmp_new(0);
3168 	rb_enc_copy(arg_str, str);
3169 	for (i = 0; i < argc; i++) {
3170 	    rb_str_append(arg_str, argv[i]);
3171 	}
3172 	rb_str_update(str, 0L, 0L, arg_str);
3173     }
3174 
3175     return str;
3176 }
3177 
3178 st_index_t
rb_str_hash(VALUE str)3179 rb_str_hash(VALUE str)
3180 {
3181     int e = ENCODING_GET(str);
3182     if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
3183 	e = 0;
3184     }
3185     return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
3186 }
3187 
3188 int
rb_str_hash_cmp(VALUE str1,VALUE str2)3189 rb_str_hash_cmp(VALUE str1, VALUE str2)
3190 {
3191     long len1, len2;
3192     const char *ptr1, *ptr2;
3193     RSTRING_GETMEM(str1, ptr1, len1);
3194     RSTRING_GETMEM(str2, ptr2, len2);
3195     return (len1 != len2 ||
3196 	    !rb_str_comparable(str1, str2) ||
3197 	    memcmp(ptr1, ptr2, len1) != 0);
3198 }
3199 
3200 /*
3201  * call-seq:
3202  *    str.hash   -> integer
3203  *
3204  * Returns a hash based on the string's length, content and encoding.
3205  *
3206  * See also Object#hash.
3207  */
3208 
3209 static VALUE
rb_str_hash_m(VALUE str)3210 rb_str_hash_m(VALUE str)
3211 {
3212     st_index_t hval = rb_str_hash(str);
3213     return ST2FIX(hval);
3214 }
3215 
3216 #define lesser(a,b) (((a)>(b))?(b):(a))
3217 
3218 int
rb_str_comparable(VALUE str1,VALUE str2)3219 rb_str_comparable(VALUE str1, VALUE str2)
3220 {
3221     int idx1, idx2;
3222     int rc1, rc2;
3223 
3224     if (RSTRING_LEN(str1) == 0) return TRUE;
3225     if (RSTRING_LEN(str2) == 0) return TRUE;
3226     idx1 = ENCODING_GET(str1);
3227     idx2 = ENCODING_GET(str2);
3228     if (idx1 == idx2) return TRUE;
3229     rc1 = rb_enc_str_coderange(str1);
3230     rc2 = rb_enc_str_coderange(str2);
3231     if (rc1 == ENC_CODERANGE_7BIT) {
3232 	if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
3233 	if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
3234 	    return TRUE;
3235     }
3236     if (rc2 == ENC_CODERANGE_7BIT) {
3237 	if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
3238 	    return TRUE;
3239     }
3240     return FALSE;
3241 }
3242 
3243 int
rb_str_cmp(VALUE str1,VALUE str2)3244 rb_str_cmp(VALUE str1, VALUE str2)
3245 {
3246     long len1, len2;
3247     const char *ptr1, *ptr2;
3248     int retval;
3249 
3250     if (str1 == str2) return 0;
3251     RSTRING_GETMEM(str1, ptr1, len1);
3252     RSTRING_GETMEM(str2, ptr2, len2);
3253     if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
3254 	if (len1 == len2) {
3255 	    if (!rb_str_comparable(str1, str2)) {
3256 		if (ENCODING_GET(str1) > ENCODING_GET(str2))
3257 		    return 1;
3258 		return -1;
3259 	    }
3260 	    return 0;
3261 	}
3262 	if (len1 > len2) return 1;
3263 	return -1;
3264     }
3265     if (retval > 0) return 1;
3266     return -1;
3267 }
3268 
3269 /* expect tail call optimization */
3270 static VALUE
str_eql(const VALUE str1,const VALUE str2)3271 str_eql(const VALUE str1, const VALUE str2)
3272 {
3273     const long len = RSTRING_LEN(str1);
3274     const char *ptr1, *ptr2;
3275 
3276     if (len != RSTRING_LEN(str2)) return Qfalse;
3277     if (!rb_str_comparable(str1, str2)) return Qfalse;
3278     if ((ptr1 = RSTRING_PTR(str1)) == (ptr2 = RSTRING_PTR(str2)))
3279 	return Qtrue;
3280     if (memcmp(ptr1, ptr2, len) == 0)
3281 	return Qtrue;
3282     return Qfalse;
3283 }
3284 
3285 /*
3286  *  call-seq:
3287  *     str == obj    -> true or false
3288  *     str === obj   -> true or false
3289  *
3290  *  Equality---Returns whether +str+ == +obj+, similar to Object#==.
3291  *
3292  *  If +obj+ is not an instance of String but responds to +to_str+, then the
3293  *  two strings are compared using <code>obj.==</code>.
3294  *
3295  *  Otherwise, returns similarly to String#eql?, comparing length and content.
3296  */
3297 
3298 VALUE
rb_str_equal(VALUE str1,VALUE str2)3299 rb_str_equal(VALUE str1, VALUE str2)
3300 {
3301     if (str1 == str2) return Qtrue;
3302     if (!RB_TYPE_P(str2, T_STRING)) {
3303 	if (!rb_respond_to(str2, idTo_str)) {
3304 	    return Qfalse;
3305 	}
3306 	return rb_equal(str2, str1);
3307     }
3308     return str_eql(str1, str2);
3309 }
3310 
3311 /*
3312  * call-seq:
3313  *   str.eql?(other)   -> true or false
3314  *
3315  * Two strings are equal if they have the same length and content.
3316  */
3317 
3318 MJIT_FUNC_EXPORTED VALUE
rb_str_eql(VALUE str1,VALUE str2)3319 rb_str_eql(VALUE str1, VALUE str2)
3320 {
3321     if (str1 == str2) return Qtrue;
3322     if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
3323     return str_eql(str1, str2);
3324 }
3325 
3326 /*
3327  *  call-seq:
3328  *     string <=> other_string   -> -1, 0, +1, or nil
3329  *
3330  *  Comparison---Returns -1, 0, +1, or +nil+ depending on whether +string+ is
3331  *  less than, equal to, or greater than +other_string+.
3332  *
3333  *  +nil+ is returned if the two values are incomparable.
3334  *
3335  *  If the strings are of different lengths, and the strings are equal when
3336  *  compared up to the shortest length, then the longer string is considered
3337  *  greater than the shorter one.
3338  *
3339  *  <code><=></code> is the basis for the methods <code><</code>,
3340  *  <code><=</code>, <code>></code>, <code>>=</code>, and
3341  *  <code>between?</code>, included from module Comparable. The method
3342  *  String#== does not use Comparable#==.
3343  *
3344  *     "abcdef" <=> "abcde"     #=> 1
3345  *     "abcdef" <=> "abcdef"    #=> 0
3346  *     "abcdef" <=> "abcdefg"   #=> -1
3347  *     "abcdef" <=> "ABCDEF"    #=> 1
3348  *     "abcdef" <=> 1           #=> nil
3349  */
3350 
3351 static VALUE
rb_str_cmp_m(VALUE str1,VALUE str2)3352 rb_str_cmp_m(VALUE str1, VALUE str2)
3353 {
3354     int result;
3355     VALUE s = rb_check_string_type(str2);
3356     if (NIL_P(s)) {
3357 	return rb_invcmp(str1, str2);
3358     }
3359     result = rb_str_cmp(str1, s);
3360     return INT2FIX(result);
3361 }
3362 
3363 static VALUE str_casecmp(VALUE str1, VALUE str2);
3364 static VALUE str_casecmp_p(VALUE str1, VALUE str2);
3365 
3366 /*
3367  *  call-seq:
3368  *     str.casecmp(other_str)   -> -1, 0, +1, or nil
3369  *
3370  *  Case-insensitive version of <code>String#<=></code>.
3371  *  Currently, case-insensitivity only works on characters A-Z/a-z,
3372  *  not all of Unicode. This is different from String#casecmp?.
3373  *
3374  *     "aBcDeF".casecmp("abcde")     #=> 1
3375  *     "aBcDeF".casecmp("abcdef")    #=> 0
3376  *     "aBcDeF".casecmp("abcdefg")   #=> -1
3377  *     "abcdef".casecmp("ABCDEF")    #=> 0
3378  *
3379  *  +nil+ is returned if the two strings have incompatible encodings,
3380  *  or if +other_str+ is not a string.
3381  *
3382  *     "foo".casecmp(2)   #=> nil
3383  *     "\u{e4 f6 fc}".encode("ISO-8859-1").casecmp("\u{c4 d6 dc}")   #=> nil
3384  */
3385 
3386 static VALUE
rb_str_casecmp(VALUE str1,VALUE str2)3387 rb_str_casecmp(VALUE str1, VALUE str2)
3388 {
3389     VALUE s = rb_check_string_type(str2);
3390     if (NIL_P(s)) {
3391 	return Qnil;
3392     }
3393     return str_casecmp(str1, s);
3394 }
3395 
3396 static VALUE
str_casecmp(VALUE str1,VALUE str2)3397 str_casecmp(VALUE str1, VALUE str2)
3398 {
3399     long len;
3400     rb_encoding *enc;
3401     char *p1, *p1end, *p2, *p2end;
3402 
3403     enc = rb_enc_compatible(str1, str2);
3404     if (!enc) {
3405 	return Qnil;
3406     }
3407 
3408     p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
3409     p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
3410     if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
3411 	while (p1 < p1end && p2 < p2end) {
3412 	    if (*p1 != *p2) {
3413 		unsigned int c1 = TOUPPER(*p1 & 0xff);
3414 		unsigned int c2 = TOUPPER(*p2 & 0xff);
3415                 if (c1 != c2)
3416                     return INT2FIX(c1 < c2 ? -1 : 1);
3417 	    }
3418 	    p1++;
3419 	    p2++;
3420 	}
3421     }
3422     else {
3423 	while (p1 < p1end && p2 < p2end) {
3424             int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
3425             int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
3426 
3427             if (0 <= c1 && 0 <= c2) {
3428                 c1 = TOUPPER(c1);
3429                 c2 = TOUPPER(c2);
3430                 if (c1 != c2)
3431                     return INT2FIX(c1 < c2 ? -1 : 1);
3432             }
3433             else {
3434                 int r;
3435                 l1 = rb_enc_mbclen(p1, p1end, enc);
3436                 l2 = rb_enc_mbclen(p2, p2end, enc);
3437                 len = l1 < l2 ? l1 : l2;
3438                 r = memcmp(p1, p2, len);
3439                 if (r != 0)
3440                     return INT2FIX(r < 0 ? -1 : 1);
3441                 if (l1 != l2)
3442                     return INT2FIX(l1 < l2 ? -1 : 1);
3443             }
3444 	    p1 += l1;
3445 	    p2 += l2;
3446 	}
3447     }
3448     if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
3449     if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
3450     return INT2FIX(-1);
3451 }
3452 
3453 /*
3454  *  call-seq:
3455  *     str.casecmp?(other_str)   -> true, false, or nil
3456  *
3457  *  Returns +true+ if +str+ and +other_str+ are equal after
3458  *  Unicode case folding, +false+ if they are not equal.
3459  *
3460  *     "aBcDeF".casecmp?("abcde")     #=> false
3461  *     "aBcDeF".casecmp?("abcdef")    #=> true
3462  *     "aBcDeF".casecmp?("abcdefg")   #=> false
3463  *     "abcdef".casecmp?("ABCDEF")    #=> true
3464  *     "\u{e4 f6 fc}".casecmp?("\u{c4 d6 dc}")   #=> true
3465  *
3466  *  +nil+ is returned if the two strings have incompatible encodings,
3467  *  or if +other_str+ is not a string.
3468  *
3469  *     "foo".casecmp?(2)   #=> nil
3470  *     "\u{e4 f6 fc}".encode("ISO-8859-1").casecmp?("\u{c4 d6 dc}")   #=> nil
3471  */
3472 
3473 static VALUE
rb_str_casecmp_p(VALUE str1,VALUE str2)3474 rb_str_casecmp_p(VALUE str1, VALUE str2)
3475 {
3476     VALUE s = rb_check_string_type(str2);
3477     if (NIL_P(s)) {
3478 	return Qnil;
3479     }
3480     return str_casecmp_p(str1, s);
3481 }
3482 
3483 static VALUE
str_casecmp_p(VALUE str1,VALUE str2)3484 str_casecmp_p(VALUE str1, VALUE str2)
3485 {
3486     rb_encoding *enc;
3487     VALUE folded_str1, folded_str2;
3488     VALUE fold_opt = sym_fold;
3489 
3490     enc = rb_enc_compatible(str1, str2);
3491     if (!enc) {
3492 	return Qnil;
3493     }
3494 
3495     folded_str1 = rb_str_downcase(1, &fold_opt, str1);
3496     folded_str2 = rb_str_downcase(1, &fold_opt, str2);
3497 
3498     return rb_str_eql(folded_str1, folded_str2);
3499 }
3500 
3501 static long
strseq_core(const char * str_ptr,const char * str_ptr_end,long str_len,const char * sub_ptr,long sub_len,long offset,rb_encoding * enc)3502 strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
3503 	    const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
3504 {
3505     const char *search_start = str_ptr;
3506     long pos, search_len = str_len - offset;
3507 
3508     for (;;) {
3509 	const char *t;
3510 	pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3511 	if (pos < 0) return pos;
3512 	t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
3513 	if (t == search_start + pos) break;
3514 	search_len -= t - search_start;
3515 	if (search_len <= 0) return -1;
3516 	offset += t - search_start;
3517 	search_start = t;
3518     }
3519     return pos + offset;
3520 }
3521 
3522 #define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
3523 
3524 static long
rb_strseq_index(VALUE str,VALUE sub,long offset,int in_byte)3525 rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
3526 {
3527     const char *str_ptr, *str_ptr_end, *sub_ptr;
3528     long str_len, sub_len;
3529     int single_byte = single_byte_optimizable(str);
3530     rb_encoding *enc;
3531 
3532     enc = rb_enc_check(str, sub);
3533     if (is_broken_string(sub)) return -1;
3534 
3535     str_ptr = RSTRING_PTR(str);
3536     str_ptr_end = RSTRING_END(str);
3537     str_len = RSTRING_LEN(str);
3538     sub_ptr = RSTRING_PTR(sub);
3539     sub_len = RSTRING_LEN(sub);
3540 
3541     if (str_len < sub_len) return -1;
3542 
3543     if (offset != 0) {
3544 	long str_len_char, sub_len_char;
3545 	str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
3546 	sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
3547 	if (offset < 0) {
3548 	    offset += str_len_char;
3549 	    if (offset < 0) return -1;
3550 	}
3551 	if (str_len_char - offset < sub_len_char) return -1;
3552 	if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
3553 	str_ptr += offset;
3554     }
3555     if (sub_len == 0) return offset;
3556 
3557     /* need proceed one character at a time */
3558     return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
3559 }
3560 
3561 
3562 /*
3563  *  call-seq:
3564  *     str.index(substring [, offset])   -> integer or nil
3565  *     str.index(regexp [, offset])      -> integer or nil
3566  *
3567  *  Returns the index of the first occurrence of the given <i>substring</i> or
3568  *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
3569  *  found. If the second parameter is present, it specifies the position in the
3570  *  string to begin the search.
3571  *
3572  *     "hello".index('e')             #=> 1
3573  *     "hello".index('lo')            #=> 3
3574  *     "hello".index('a')             #=> nil
3575  *     "hello".index(?e)              #=> 1
3576  *     "hello".index(/[aeiou]/, -3)   #=> 4
3577  */
3578 
3579 static VALUE
rb_str_index_m(int argc,VALUE * argv,VALUE str)3580 rb_str_index_m(int argc, VALUE *argv, VALUE str)
3581 {
3582     VALUE sub;
3583     VALUE initpos;
3584     long pos;
3585 
3586     if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
3587 	pos = NUM2LONG(initpos);
3588     }
3589     else {
3590 	pos = 0;
3591     }
3592     if (pos < 0) {
3593 	pos += str_strlen(str, NULL);
3594 	if (pos < 0) {
3595 	    if (RB_TYPE_P(sub, T_REGEXP)) {
3596 		rb_backref_set(Qnil);
3597 	    }
3598 	    return Qnil;
3599 	}
3600     }
3601 
3602     if (SPECIAL_CONST_P(sub)) goto generic;
3603     switch (BUILTIN_TYPE(sub)) {
3604       case T_REGEXP:
3605 	if (pos > str_strlen(str, NULL))
3606 	    return Qnil;
3607 	pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3608 			 rb_enc_check(str, sub), single_byte_optimizable(str));
3609 
3610 	pos = rb_reg_search(sub, str, pos, 0);
3611 	pos = rb_str_sublen(str, pos);
3612 	break;
3613 
3614       generic:
3615       default: {
3616 	VALUE tmp;
3617 
3618 	tmp = rb_check_string_type(sub);
3619 	if (NIL_P(tmp)) {
3620 	    rb_raise(rb_eTypeError, "type mismatch: %s given",
3621 		     rb_obj_classname(sub));
3622 	}
3623 	sub = tmp;
3624       }
3625 	/* fall through */
3626       case T_STRING:
3627 	pos = rb_str_index(str, sub, pos);
3628 	pos = rb_str_sublen(str, pos);
3629 	break;
3630     }
3631 
3632     if (pos == -1) return Qnil;
3633     return LONG2NUM(pos);
3634 }
3635 
3636 #ifdef HAVE_MEMRCHR
3637 static long
str_rindex(VALUE str,VALUE sub,const char * s,long pos,rb_encoding * enc)3638 str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
3639 {
3640     char *hit, *adjusted;
3641     int c;
3642     long slen, searchlen;
3643     char *sbeg, *e, *t;
3644 
3645     slen = RSTRING_LEN(sub);
3646     if (slen == 0) return pos;
3647     sbeg = RSTRING_PTR(str);
3648     e = RSTRING_END(str);
3649     t = RSTRING_PTR(sub);
3650     c = *t & 0xff;
3651     searchlen = s - sbeg + 1;
3652 
3653     do {
3654 	hit = memrchr(sbeg, c, searchlen);
3655 	if (!hit) break;
3656 	adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
3657 	if (hit != adjusted) {
3658 	    searchlen = adjusted - sbeg;
3659 	    continue;
3660 	}
3661 	if (memcmp(hit, t, slen) == 0)
3662 	    return rb_str_sublen(str, hit - sbeg);
3663 	searchlen = adjusted - sbeg;
3664     } while (searchlen > 0);
3665 
3666     return -1;
3667 }
3668 #else
3669 static long
str_rindex(VALUE str,VALUE sub,const char * s,long pos,rb_encoding * enc)3670 str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
3671 {
3672     long slen;
3673     char *sbeg, *e, *t;
3674 
3675     sbeg = RSTRING_PTR(str);
3676     e = RSTRING_END(str);
3677     t = RSTRING_PTR(sub);
3678     slen = RSTRING_LEN(sub);
3679 
3680     while (s) {
3681 	if (memcmp(s, t, slen) == 0) {
3682 	    return pos;
3683 	}
3684 	if (pos == 0) break;
3685 	pos--;
3686 	s = rb_enc_prev_char(sbeg, s, e, enc);
3687     }
3688 
3689     return -1;
3690 }
3691 #endif
3692 
3693 static long
rb_str_rindex(VALUE str,VALUE sub,long pos)3694 rb_str_rindex(VALUE str, VALUE sub, long pos)
3695 {
3696     long len, slen;
3697     char *sbeg, *s;
3698     rb_encoding *enc;
3699     int singlebyte;
3700 
3701     enc = rb_enc_check(str, sub);
3702     if (is_broken_string(sub)) return -1;
3703     singlebyte = single_byte_optimizable(str);
3704     len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
3705     slen = str_strlen(sub, enc); /* rb_enc_check */
3706 
3707     /* substring longer than string */
3708     if (len < slen) return -1;
3709     if (len - pos < slen) pos = len - slen;
3710     if (len == 0) return pos;
3711 
3712     sbeg = RSTRING_PTR(str);
3713 
3714     if (pos == 0) {
3715 	if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
3716 	    return 0;
3717 	else
3718 	    return -1;
3719     }
3720 
3721     s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
3722     return str_rindex(str, sub, s, pos, enc);
3723 }
3724 
3725 
3726 /*
3727  *  call-seq:
3728  *     str.rindex(substring [, integer])   -> integer or nil
3729  *     str.rindex(regexp [, integer])   -> integer or nil
3730  *
3731  *  Returns the index of the last occurrence of the given <i>substring</i> or
3732  *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
3733  *  found. If the second parameter is present, it specifies the position in the
3734  *  string to end the search---characters beyond this point will not be
3735  *  considered.
3736  *
3737  *     "hello".rindex('e')             #=> 1
3738  *     "hello".rindex('l')             #=> 3
3739  *     "hello".rindex('a')             #=> nil
3740  *     "hello".rindex(?e)              #=> 1
3741  *     "hello".rindex(/[aeiou]/, -2)   #=> 1
3742  */
3743 
3744 static VALUE
rb_str_rindex_m(int argc,VALUE * argv,VALUE str)3745 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
3746 {
3747     VALUE sub;
3748     VALUE vpos;
3749     rb_encoding *enc = STR_ENC_GET(str);
3750     long pos, len = str_strlen(str, enc); /* str's enc */
3751 
3752     if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
3753 	pos = NUM2LONG(vpos);
3754 	if (pos < 0) {
3755 	    pos += len;
3756 	    if (pos < 0) {
3757 		if (RB_TYPE_P(sub, T_REGEXP)) {
3758 		    rb_backref_set(Qnil);
3759 		}
3760 		return Qnil;
3761 	    }
3762 	}
3763 	if (pos > len) pos = len;
3764     }
3765     else {
3766 	pos = len;
3767     }
3768 
3769     if (SPECIAL_CONST_P(sub)) goto generic;
3770     switch (BUILTIN_TYPE(sub)) {
3771       case T_REGEXP:
3772 	/* enc = rb_get_check(str, sub); */
3773 	pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3774 			 enc, single_byte_optimizable(str));
3775 
3776 	pos = rb_reg_search(sub, str, pos, 1);
3777 	pos = rb_str_sublen(str, pos);
3778 	if (pos >= 0) return LONG2NUM(pos);
3779 	break;
3780 
3781       generic:
3782       default: {
3783 	VALUE tmp;
3784 
3785 	tmp = rb_check_string_type(sub);
3786 	if (NIL_P(tmp)) {
3787 	    rb_raise(rb_eTypeError, "type mismatch: %s given",
3788 		     rb_obj_classname(sub));
3789 	}
3790 	sub = tmp;
3791       }
3792 	/* fall through */
3793       case T_STRING:
3794 	pos = rb_str_rindex(str, sub, pos);
3795 	if (pos >= 0) return LONG2NUM(pos);
3796 	break;
3797     }
3798     return Qnil;
3799 }
3800 
3801 /*
3802  *  call-seq:
3803  *     str =~ obj   -> integer or nil
3804  *
3805  *  Match---If <i>obj</i> is a <code>Regexp</code>, use it as a pattern to match
3806  *  against <i>str</i>, and return the position where the match starts, or
3807  *  <code>nil</code> if there is no match. Otherwise, invokes
3808  *  <i>obj.=~</i>, passing <i>str</i> as an argument. The default
3809  *  <code>=~</code> in <code>Object</code> returns <code>nil</code>.
3810  *
3811  *  Note: <code>str =~ regexp</code> is not the same as
3812  *  <code>regexp =~ str</code>. Strings captured from named capture groups
3813  *  are assigned to local variables only in the second case.
3814  *
3815  *     "cat o' 9 tails" =~ /\d/   #=> 7
3816  *     "cat o' 9 tails" =~ 9      #=> nil
3817  */
3818 
3819 static VALUE
rb_str_match(VALUE x,VALUE y)3820 rb_str_match(VALUE x, VALUE y)
3821 {
3822     if (SPECIAL_CONST_P(y)) goto generic;
3823     switch (BUILTIN_TYPE(y)) {
3824       case T_STRING:
3825 	rb_raise(rb_eTypeError, "type mismatch: String given");
3826 
3827       case T_REGEXP:
3828 	return rb_reg_match(y, x);
3829 
3830       generic:
3831       default:
3832 	return rb_funcall(y, idEqTilde, 1, x);
3833     }
3834 }
3835 
3836 
3837 static VALUE get_pat(VALUE);
3838 
3839 
3840 /*
3841  *  call-seq:
3842  *     str.match(pattern)        -> matchdata or nil
3843  *     str.match(pattern, pos)   -> matchdata or nil
3844  *
3845  *  Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one),
3846  *  then invokes its <code>match</code> method on <i>str</i>.  If the second
3847  *  parameter is present, it specifies the position in the string to begin the
3848  *  search.
3849  *
3850  *     'hello'.match('(.)\1')      #=> #<MatchData "ll" 1:"l">
3851  *     'hello'.match('(.)\1')[0]   #=> "ll"
3852  *     'hello'.match(/(.)\1/)[0]   #=> "ll"
3853  *     'hello'.match(/(.)\1/, 3)   #=> nil
3854  *     'hello'.match('xx')         #=> nil
3855  *
3856  *  If a block is given, invokes the block with the MatchData if the match succeeds, so
3857  *  that you can write
3858  *
3859  *     str.match(pat) {|m| ...}
3860  *
3861  *  instead of
3862  *
3863  *     if m = str.match(pat)
3864  *       ...
3865  *     end
3866  *
3867  *  The return value is a value from block execution in this case.
3868  */
3869 
3870 static VALUE
rb_str_match_m(int argc,VALUE * argv,VALUE str)3871 rb_str_match_m(int argc, VALUE *argv, VALUE str)
3872 {
3873     VALUE re, result;
3874     if (argc < 1)
3875 	rb_check_arity(argc, 1, 2);
3876     re = argv[0];
3877     argv[0] = str;
3878     result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
3879     if (!NIL_P(result) && rb_block_given_p()) {
3880 	return rb_yield(result);
3881     }
3882     return result;
3883 }
3884 
3885 /*
3886  *  call-seq:
3887  *     str.match?(pattern)        -> true or false
3888  *     str.match?(pattern, pos)   -> true or false
3889  *
3890  *  Converts _pattern_ to a +Regexp+ (if it isn't already one), then
3891  *  returns +true+ or +false+ to indicate whether the regexp
3892  *  matches _str_, without updating <code>$~</code> and other
3893  *  related variables.  If the second parameter is present, it
3894  *  specifies the position in the string to begin the search.
3895  *
3896  *     "Ruby".match?(/R.../)    #=> true
3897  *     "Ruby".match?(/R.../, 1) #=> false
3898  *     "Ruby".match?(/P.../)    #=> false
3899  *     $&                       #=> nil
3900  */
3901 
3902 static VALUE
rb_str_match_m_p(int argc,VALUE * argv,VALUE str)3903 rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
3904 {
3905     VALUE re;
3906     rb_check_arity(argc, 1, 2);
3907     re = get_pat(argv[0]);
3908     return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
3909 }
3910 
3911 enum neighbor_char {
3912     NEIGHBOR_NOT_CHAR,
3913     NEIGHBOR_FOUND,
3914     NEIGHBOR_WRAPPED
3915 };
3916 
3917 static enum neighbor_char
enc_succ_char(char * p,long len,rb_encoding * enc)3918 enc_succ_char(char *p, long len, rb_encoding *enc)
3919 {
3920     long i;
3921     int l;
3922 
3923     if (rb_enc_mbminlen(enc) > 1) {
3924 	/* wchar, trivial case */
3925 	int r = rb_enc_precise_mbclen(p, p + len, enc), c;
3926 	if (!MBCLEN_CHARFOUND_P(r)) {
3927 	    return NEIGHBOR_NOT_CHAR;
3928 	}
3929 	c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
3930 	l = rb_enc_code_to_mbclen(c, enc);
3931 	if (!l) return NEIGHBOR_NOT_CHAR;
3932 	if (l != len) return NEIGHBOR_WRAPPED;
3933 	rb_enc_mbcput(c, p, enc);
3934 	r = rb_enc_precise_mbclen(p, p + len, enc);
3935 	if (!MBCLEN_CHARFOUND_P(r)) {
3936 	    return NEIGHBOR_NOT_CHAR;
3937 	}
3938 	return NEIGHBOR_FOUND;
3939     }
3940     while (1) {
3941         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
3942             p[i] = '\0';
3943         if (i < 0)
3944             return NEIGHBOR_WRAPPED;
3945         ++((unsigned char*)p)[i];
3946         l = rb_enc_precise_mbclen(p, p+len, enc);
3947         if (MBCLEN_CHARFOUND_P(l)) {
3948             l = MBCLEN_CHARFOUND_LEN(l);
3949             if (l == len) {
3950                 return NEIGHBOR_FOUND;
3951             }
3952             else {
3953                 memset(p+l, 0xff, len-l);
3954             }
3955         }
3956         if (MBCLEN_INVALID_P(l) && i < len-1) {
3957             long len2;
3958             int l2;
3959             for (len2 = len-1; 0 < len2; len2--) {
3960                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
3961                 if (!MBCLEN_INVALID_P(l2))
3962                     break;
3963             }
3964             memset(p+len2+1, 0xff, len-(len2+1));
3965         }
3966     }
3967 }
3968 
3969 static enum neighbor_char
enc_pred_char(char * p,long len,rb_encoding * enc)3970 enc_pred_char(char *p, long len, rb_encoding *enc)
3971 {
3972     long i;
3973     int l;
3974     if (rb_enc_mbminlen(enc) > 1) {
3975 	/* wchar, trivial case */
3976 	int r = rb_enc_precise_mbclen(p, p + len, enc), c;
3977 	if (!MBCLEN_CHARFOUND_P(r)) {
3978 	    return NEIGHBOR_NOT_CHAR;
3979 	}
3980 	c = rb_enc_mbc_to_codepoint(p, p + len, enc);
3981 	if (!c) return NEIGHBOR_NOT_CHAR;
3982 	--c;
3983 	l = rb_enc_code_to_mbclen(c, enc);
3984 	if (!l) return NEIGHBOR_NOT_CHAR;
3985 	if (l != len) return NEIGHBOR_WRAPPED;
3986 	rb_enc_mbcput(c, p, enc);
3987 	r = rb_enc_precise_mbclen(p, p + len, enc);
3988 	if (!MBCLEN_CHARFOUND_P(r)) {
3989 	    return NEIGHBOR_NOT_CHAR;
3990 	}
3991 	return NEIGHBOR_FOUND;
3992     }
3993     while (1) {
3994         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
3995             p[i] = '\xff';
3996         if (i < 0)
3997             return NEIGHBOR_WRAPPED;
3998         --((unsigned char*)p)[i];
3999         l = rb_enc_precise_mbclen(p, p+len, enc);
4000         if (MBCLEN_CHARFOUND_P(l)) {
4001             l = MBCLEN_CHARFOUND_LEN(l);
4002             if (l == len) {
4003                 return NEIGHBOR_FOUND;
4004             }
4005             else {
4006                 memset(p+l, 0, len-l);
4007             }
4008         }
4009         if (MBCLEN_INVALID_P(l) && i < len-1) {
4010             long len2;
4011             int l2;
4012             for (len2 = len-1; 0 < len2; len2--) {
4013                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4014                 if (!MBCLEN_INVALID_P(l2))
4015                     break;
4016             }
4017             memset(p+len2+1, 0, len-(len2+1));
4018         }
4019     }
4020 }
4021 
4022 /*
4023   overwrite +p+ by succeeding letter in +enc+ and returns
4024   NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
4025   When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
4026   assuming each ranges are successive, and mbclen
4027   never change in each ranges.
4028   NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
4029   character.
4030  */
4031 static enum neighbor_char
enc_succ_alnum_char(char * p,long len,rb_encoding * enc,char * carry)4032 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
4033 {
4034     enum neighbor_char ret;
4035     unsigned int c;
4036     int ctype;
4037     int range;
4038     char save[ONIGENC_CODE_TO_MBC_MAXLEN];
4039 
4040     /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
4041     int try;
4042     const int max_gaps = 1;
4043 
4044     c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4045     if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
4046         ctype = ONIGENC_CTYPE_DIGIT;
4047     else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
4048         ctype = ONIGENC_CTYPE_ALPHA;
4049     else
4050         return NEIGHBOR_NOT_CHAR;
4051 
4052     MEMCPY(save, p, char, len);
4053     for (try = 0; try <= max_gaps; ++try) {
4054 	ret = enc_succ_char(p, len, enc);
4055 	if (ret == NEIGHBOR_FOUND) {
4056 	    c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4057 	    if (rb_enc_isctype(c, ctype, enc))
4058 		return NEIGHBOR_FOUND;
4059 	}
4060     }
4061     MEMCPY(p, save, char, len);
4062     range = 1;
4063     while (1) {
4064         MEMCPY(save, p, char, len);
4065         ret = enc_pred_char(p, len, enc);
4066         if (ret == NEIGHBOR_FOUND) {
4067             c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4068             if (!rb_enc_isctype(c, ctype, enc)) {
4069                 MEMCPY(p, save, char, len);
4070                 break;
4071             }
4072         }
4073         else {
4074             MEMCPY(p, save, char, len);
4075             break;
4076         }
4077         range++;
4078     }
4079     if (range == 1) {
4080         return NEIGHBOR_NOT_CHAR;
4081     }
4082 
4083     if (ctype != ONIGENC_CTYPE_DIGIT) {
4084         MEMCPY(carry, p, char, len);
4085         return NEIGHBOR_WRAPPED;
4086     }
4087 
4088     MEMCPY(carry, p, char, len);
4089     enc_succ_char(carry, len, enc);
4090     return NEIGHBOR_WRAPPED;
4091 }
4092 
4093 
4094 static VALUE str_succ(VALUE str);
4095 
4096 /*
4097  *  call-seq:
4098  *     str.succ   -> new_str
4099  *     str.next   -> new_str
4100  *
4101  *  Returns the successor to <i>str</i>. The successor is calculated by
4102  *  incrementing characters starting from the rightmost alphanumeric (or
4103  *  the rightmost character if there are no alphanumerics) in the
4104  *  string. Incrementing a digit always results in another digit, and
4105  *  incrementing a letter results in another letter of the same case.
4106  *  Incrementing nonalphanumerics uses the underlying character set's
4107  *  collating sequence.
4108  *
4109  *  If the increment generates a ``carry,'' the character to the left of
4110  *  it is incremented. This process repeats until there is no carry,
4111  *  adding an additional character if necessary.
4112  *
4113  *     "abcd".succ        #=> "abce"
4114  *     "THX1138".succ     #=> "THX1139"
4115  *     "<<koala>>".succ   #=> "<<koalb>>"
4116  *     "1999zzz".succ     #=> "2000aaa"
4117  *     "ZZZ9999".succ     #=> "AAAA0000"
4118  *     "***".succ         #=> "**+"
4119  */
4120 
4121 VALUE
rb_str_succ(VALUE orig)4122 rb_str_succ(VALUE orig)
4123 {
4124     VALUE str;
4125     str = rb_str_new_with_class(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
4126     rb_enc_cr_str_copy_for_substr(str, orig);
4127     OBJ_INFECT(str, orig);
4128     return str_succ(str);
4129 }
4130 
/*
 * Replaces the contents of +str+ with its successor, in place, and
 * returns +str+.  An empty string is returned untouched.
 *
 * Characters are visited from the end of the string towards the start.
 * Each one is handed to enc_succ_alnum_char(): a NEIGHBOR_NOT_CHAR result
 * skips the character, NEIGHBOR_FOUND finishes the job, and
 * NEIGHBOR_WRAPPED records the carry and moves on to the character on the
 * left.  If that pass never wrapped a character, a second pass applies
 * enc_succ_char() to the characters instead, again from the right.
 * Whatever carry is left at the end is inserted at byte offset carry_pos,
 * growing the string by carry_len bytes.
 */
static VALUE
str_succ(VALUE str)
{
    rb_encoding *enc;
    char *sbeg, *s, *e, *last_alnum = 0;
    int c = -1;			/* stays -1 until some character wraps */
    long l, slen;
    /* default carry: the single byte 0x01 */
    char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
    long carry_pos = 0, carry_len = 1;
    enum neighbor_char neighbor = NEIGHBOR_FOUND;

    slen = RSTRING_LEN(str);
    if (slen == 0) return str;

    enc = STR_ENC_GET(str);
    sbeg = RSTRING_PTR(str);
    s = e = sbeg + slen;

    /* first pass: alphanumeric increment, right to left */
    while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
	/* after skipping a non-alnum character, stop carrying once the
	 * kind flips between letter and digit relative to the last
	 * wrapped character */
	if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
	    if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
		ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
		s = last_alnum;
		break;
	    }
	}
	l = rb_enc_precise_mbclen(s, e, enc);
	if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
	l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
        neighbor = enc_succ_alnum_char(s, l, enc, carry);
        switch (neighbor) {
	  case NEIGHBOR_NOT_CHAR:
	    continue;
	  case NEIGHBOR_FOUND:
	    return str;
	  case NEIGHBOR_WRAPPED:
	    last_alnum = s;
	    break;
	}
	/* wrapped: remember where the carry would have to be inserted */
        c = 1;
        carry_pos = s - sbeg;
        carry_len = l;
    }
    if (c == -1) {		/* str contains no alnum */
	/* second pass: plain enc_succ_char() on each character, working
	 * on a temporary copy so a NEIGHBOR_NOT_CHAR result leaves the
	 * original bytes alone */
	s = e;
	while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
            enum neighbor_char neighbor;
	    char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
	    l = rb_enc_precise_mbclen(s, e, enc);
	    if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
	    l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
	    MEMCPY(tmp, s, char, l);
	    neighbor = enc_succ_char(tmp, l, enc);
	    switch (neighbor) {
	      case NEIGHBOR_FOUND:
		MEMCPY(s, tmp, char, l);
                return str;
		break;
	      case NEIGHBOR_WRAPPED:
		MEMCPY(s, tmp, char, l);
		break;
	      case NEIGHBOR_NOT_CHAR:
		break;
	    }
            if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
                /* wrapped to \0...\0.  search next valid char. */
                enc_succ_char(s, l, enc);
            }
	    /* for non-ASCII-compatible encodings the carry is a copy of
	     * the character just written rather than the default byte */
            if (!rb_enc_asciicompat(enc)) {
                MEMCPY(carry, s, char, l);
                carry_len = l;
            }
            carry_pos = s - sbeg;
	}
	ENC_CODERANGE_SET(str, ENC_CODERANGE_UNKNOWN);
    }
    /* every visited character wrapped: make room and insert the carry
     * at carry_pos */
    RESIZE_CAPA(str, slen + carry_len);
    sbeg = RSTRING_PTR(str);	/* re-read the pointer after RESIZE_CAPA */
    s = sbeg + carry_pos;
    memmove(s + carry_len, s, slen - carry_pos);
    memmove(s, carry, carry_len);
    slen += carry_len;
    STR_SET_LEN(str, slen);
    TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
    rb_enc_str_coderange(str);
    return str;
}
4218 
4219 
4220 /*
4221  *  call-seq:
4222  *     str.succ!   -> str
4223  *     str.next!   -> str
4224  *
4225  *  Equivalent to <code>String#succ</code>, but modifies the receiver in
4226  *  place.
4227  */
4228 
4229 static VALUE
rb_str_succ_bang(VALUE str)4230 rb_str_succ_bang(VALUE str)
4231 {
4232     rb_str_modify(str);
4233     str_succ(str);
4234     return str;
4235 }
4236 
/* Returns 1 if each of the first +len+ bytes at +s+ satisfies ISDIGIT,
 * 0 otherwise.  A non-positive +len+ yields 1. */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) {
            return 0;
        }
    }
    return 1;
}
4246 
4247 static int
str_upto_i(VALUE str,VALUE arg)4248 str_upto_i(VALUE str, VALUE arg)
4249 {
4250     rb_yield(str);
4251     return 0;
4252 }
4253 
4254 /*
4255  *  call-seq:
4256  *     str.upto(other_str, exclusive=false) {|s| block }   -> str
4257  *     str.upto(other_str, exclusive=false)                -> an_enumerator
4258  *
4259  *  Iterates through successive values, starting at <i>str</i> and
4260  *  ending at <i>other_str</i> inclusive, passing each value in turn to
4261  *  the block. The <code>String#succ</code> method is used to generate
4262  *  each value.  If optional second argument exclusive is omitted or is false,
4263  *  the last value will be included; otherwise it will be excluded.
4264  *
4265  *  If no block is given, an enumerator is returned instead.
4266  *
4267  *     "a8".upto("b6") {|s| print s, ' ' }
4268  *     for s in "a8".."b6"
4269  *       print s, ' '
4270  *     end
4271  *
4272  *  <em>produces:</em>
4273  *
4274  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
4275  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
4276  *
 *  If <i>str</i> and <i>other_str</i> contain only ASCII numeric characters,
4278  *  both are recognized as decimal numbers. In addition, the width of
4279  *  string (e.g. leading zeros) is handled appropriately.
4280  *
4281  *     "9".upto("11").to_a   #=> ["9", "10", "11"]
4282  *     "25".upto("5").to_a   #=> []
4283  *     "07".upto("11").to_a  #=> ["07", "08", "09", "10", "11"]
4284  */
4285 
4286 static VALUE
rb_str_upto(int argc,VALUE * argv,VALUE beg)4287 rb_str_upto(int argc, VALUE *argv, VALUE beg)
4288 {
4289     VALUE end, exclusive;
4290 
4291     rb_scan_args(argc, argv, "11", &end, &exclusive);
4292     RETURN_ENUMERATOR(beg, argc, argv);
4293     return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
4294 }
4295 
4296 VALUE
rb_str_upto_each(VALUE beg,VALUE end,int excl,int (* each)(VALUE,VALUE),VALUE arg)4297 rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
4298 {
4299     VALUE current, after_end;
4300     ID succ;
4301     int n, ascii;
4302     rb_encoding *enc;
4303 
4304     CONST_ID(succ, "succ");
4305     StringValue(end);
4306     enc = rb_enc_check(beg, end);
4307     ascii = (is_ascii_string(beg) && is_ascii_string(end));
4308     /* single character */
4309     if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
4310 	char c = RSTRING_PTR(beg)[0];
4311 	char e = RSTRING_PTR(end)[0];
4312 
4313 	if (c > e || (excl && c == e)) return beg;
4314 	for (;;) {
4315 	    if ((*each)(rb_enc_str_new(&c, 1, enc), arg)) break;
4316 	    if (!excl && c == e) break;
4317 	    c++;
4318 	    if (excl && c == e) break;
4319 	}
4320 	return beg;
4321     }
4322     /* both edges are all digits */
4323     if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
4324 	all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
4325 	all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
4326 	VALUE b, e;
4327 	int width;
4328 
4329 	width = RSTRING_LENINT(beg);
4330 	b = rb_str_to_inum(beg, 10, FALSE);
4331 	e = rb_str_to_inum(end, 10, FALSE);
4332 	if (FIXNUM_P(b) && FIXNUM_P(e)) {
4333 	    long bi = FIX2LONG(b);
4334 	    long ei = FIX2LONG(e);
4335 	    rb_encoding *usascii = rb_usascii_encoding();
4336 
4337 	    while (bi <= ei) {
4338 		if (excl && bi == ei) break;
4339 		if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
4340 		bi++;
4341 	    }
4342 	}
4343 	else {
4344 	    ID op = excl ? '<' : idLE;
4345 	    VALUE args[2], fmt = rb_fstring_lit("%.*d");
4346 
4347 	    args[0] = INT2FIX(width);
4348 	    while (rb_funcall(b, op, 1, e)) {
4349 		args[1] = b;
4350 		if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
4351 		b = rb_funcallv(b, succ, 0, 0);
4352 	    }
4353 	}
4354 	return beg;
4355     }
4356     /* normal case */
4357     n = rb_str_cmp(beg, end);
4358     if (n > 0 || (excl && n == 0)) return beg;
4359 
4360     after_end = rb_funcallv(end, succ, 0, 0);
4361     current = rb_str_dup(beg);
4362     while (!rb_str_equal(current, after_end)) {
4363 	VALUE next = Qnil;
4364 	if (excl || !rb_str_equal(current, end))
4365 	    next = rb_funcallv(current, succ, 0, 0);
4366 	if ((*each)(current, arg)) break;
4367 	if (NIL_P(next)) break;
4368 	current = next;
4369 	StringValue(current);
4370 	if (excl && rb_str_equal(current, end)) break;
4371 	if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
4372 	    break;
4373     }
4374 
4375     return beg;
4376 }
4377 
4378 VALUE
rb_str_upto_endless_each(VALUE beg,int (* each)(VALUE,VALUE),VALUE arg)4379 rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
4380 {
4381     VALUE current;
4382     ID succ;
4383 
4384     CONST_ID(succ, "succ");
4385     /* both edges are all digits */
4386     if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
4387 	all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
4388 	VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
4389 	int width = RSTRING_LENINT(beg);
4390 	b = rb_str_to_inum(beg, 10, FALSE);
4391 	if (FIXNUM_P(b)) {
4392 	    long bi = FIX2LONG(b);
4393 	    rb_encoding *usascii = rb_usascii_encoding();
4394 
4395 	    while (FIXABLE(bi)) {
4396 		if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
4397 		bi++;
4398 	    }
4399 	    b = LONG2NUM(bi);
4400 	}
4401 	args[0] = INT2FIX(width);
4402 	while (1) {
4403 	    args[1] = b;
4404 	    if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
4405 	    b = rb_funcallv(b, succ, 0, 0);
4406 	}
4407     }
4408     /* normal case */
4409     current = rb_str_dup(beg);
4410     while (1) {
4411 	VALUE next = rb_funcallv(current, succ, 0, 0);
4412 	if ((*each)(current, arg)) break;
4413 	current = next;
4414 	StringValue(current);
4415 	if (RSTRING_LEN(current) == 0)
4416 	    break;
4417     }
4418 
4419     return beg;
4420 }
4421 
4422 static int
include_range_i(VALUE str,VALUE arg)4423 include_range_i(VALUE str, VALUE arg)
4424 {
4425     VALUE *argp = (VALUE *)arg;
4426     if (!rb_equal(str, *argp)) return 0;
4427     *argp = Qnil;
4428     return 1;
4429 }
4430 
4431 VALUE
rb_str_include_range_p(VALUE beg,VALUE end,VALUE val,VALUE exclusive)4432 rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
4433 {
4434     beg = rb_str_new_frozen(beg);
4435     StringValue(end);
4436     end = rb_str_new_frozen(end);
4437     if (NIL_P(val)) return Qfalse;
4438     val = rb_check_string_type(val);
4439     if (NIL_P(val)) return Qfalse;
4440     if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
4441 	rb_enc_asciicompat(STR_ENC_GET(end)) &&
4442 	rb_enc_asciicompat(STR_ENC_GET(val))) {
4443 	const char *bp = RSTRING_PTR(beg);
4444 	const char *ep = RSTRING_PTR(end);
4445 	const char *vp = RSTRING_PTR(val);
4446 	if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
4447 	    if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
4448 		return Qfalse;
4449 	    else {
4450 		char b = *bp;
4451 		char e = *ep;
4452 		char v = *vp;
4453 
4454 		if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
4455 		    if (b <= v && v < e) return Qtrue;
4456 		    if (!RTEST(exclusive) && v == e) return Qtrue;
4457 		    return Qfalse;
4458 		}
4459 	    }
4460 	}
4461 #if 0
4462 	/* both edges are all digits */
4463 	if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
4464 	    all_digits_p(bp, RSTRING_LEN(beg)) &&
4465 	    all_digits_p(ep, RSTRING_LEN(end))) {
4466 	    /* TODO */
4467 	}
4468 #endif
4469     }
4470     rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
4471 
4472     return NIL_P(val) ? Qtrue : Qfalse;
4473 }
4474 
4475 static VALUE
rb_str_subpat(VALUE str,VALUE re,VALUE backref)4476 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
4477 {
4478     if (rb_reg_search(re, str, 0, 0) >= 0) {
4479         VALUE match = rb_backref_get();
4480         int nth = rb_reg_backref_number(match, backref);
4481 	return rb_reg_nth_match(nth, match);
4482     }
4483     return Qnil;
4484 }
4485 
4486 static VALUE
rb_str_aref(VALUE str,VALUE indx)4487 rb_str_aref(VALUE str, VALUE indx)
4488 {
4489     long idx;
4490 
4491     if (FIXNUM_P(indx)) {
4492 	idx = FIX2LONG(indx);
4493     }
4494     else if (RB_TYPE_P(indx, T_REGEXP)) {
4495 	return rb_str_subpat(str, indx, INT2FIX(0));
4496     }
4497     else if (RB_TYPE_P(indx, T_STRING)) {
4498 	if (rb_str_index(str, indx, 0) != -1)
4499 	    return rb_str_dup(indx);
4500 	return Qnil;
4501     }
4502     else {
4503 	/* check if indx is Range */
4504 	long beg, len = str_strlen(str, NULL);
4505 	switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
4506 	  case Qfalse:
4507 	    break;
4508 	  case Qnil:
4509 	    return Qnil;
4510 	  default:
4511 	    return rb_str_substr(str, beg, len);
4512 	}
4513 	idx = NUM2LONG(indx);
4514     }
4515 
4516     return str_substr(str, idx, 1, FALSE);
4517 }
4518 
4519 
4520 /*
4521  *  call-seq:
4522  *     str[index]                 -> new_str or nil
4523  *     str[start, length]         -> new_str or nil
4524  *     str[range]                 -> new_str or nil
4525  *     str[regexp]                -> new_str or nil
4526  *     str[regexp, capture]       -> new_str or nil
4527  *     str[match_str]             -> new_str or nil
4528  *     str.slice(index)           -> new_str or nil
4529  *     str.slice(start, length)   -> new_str or nil
4530  *     str.slice(range)           -> new_str or nil
4531  *     str.slice(regexp)          -> new_str or nil
4532  *     str.slice(regexp, capture) -> new_str or nil
4533  *     str.slice(match_str)       -> new_str or nil
4534  *
4535  *  Element Reference --- If passed a single +index+, returns a substring of
4536  *  one character at that index. If passed a +start+ index and a +length+,
4537  *  returns a substring containing +length+ characters starting at the
4538  *  +start+ index. If passed a +range+, its beginning and end are interpreted as
4539  *  offsets delimiting the substring to be returned.
4540  *
4541  *  In these three cases, if an index is negative, it is counted from the end
4542  *  of the string.  For the +start+ and +range+ cases the starting index
4543  *  is just before a character and an index matching the string's size.
4544  *  Additionally, an empty string is returned when the starting index for a
4545  *  character range is at the end of the string.
4546  *
4547  *  Returns +nil+ if the initial index falls outside the string or the length
4548  *  is negative.
4549  *
 *  If a +Regexp+ is supplied, the matching portion of the string is
 *  returned.  If a +capture+, which may be a capture group index or name,
 *  follows the regular expression, that component of the MatchData is
 *  returned instead.
4554  *
4555  *  If a +match_str+ is given, that string is returned if it occurs in
4556  *  the string.
4557  *
4558  *  Returns +nil+ if the regular expression does not match or the match string
4559  *  cannot be found.
4560  *
4561  *     a = "hello there"
4562  *
4563  *     a[1]                   #=> "e"
4564  *     a[2, 3]                #=> "llo"
4565  *     a[2..3]                #=> "ll"
4566  *
4567  *     a[-3, 2]               #=> "er"
4568  *     a[7..-2]               #=> "her"
4569  *     a[-4..-2]              #=> "her"
4570  *     a[-2..-4]              #=> ""
4571  *
4572  *     a[11, 0]               #=> ""
4573  *     a[11]                  #=> nil
4574  *     a[12, 0]               #=> nil
4575  *     a[12..-1]              #=> nil
4576  *
4577  *     a[/[aeiou](.)\1/]      #=> "ell"
4578  *     a[/[aeiou](.)\1/, 0]   #=> "ell"
4579  *     a[/[aeiou](.)\1/, 1]   #=> "l"
4580  *     a[/[aeiou](.)\1/, 2]   #=> nil
4581  *
4582  *     a[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "non_vowel"] #=> "l"
4583  *     a[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "vowel"]     #=> "e"
4584  *
4585  *     a["lo"]                #=> "lo"
4586  *     a["bye"]               #=> nil
4587  */
4588 
4589 static VALUE
rb_str_aref_m(int argc,VALUE * argv,VALUE str)4590 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
4591 {
4592     if (argc == 2) {
4593 	if (RB_TYPE_P(argv[0], T_REGEXP)) {
4594 	    return rb_str_subpat(str, argv[0], argv[1]);
4595 	}
4596 	else {
4597 	    long beg = NUM2LONG(argv[0]);
4598 	    long len = NUM2LONG(argv[1]);
4599 	    return rb_str_substr(str, beg, len);
4600 	}
4601     }
4602     rb_check_arity(argc, 1, 2);
4603     return rb_str_aref(str, argv[0]);
4604 }
4605 
/*
 * Removes the first +len+ bytes of +str+ in place and returns +str+.
 * +len+ is clamped to the current byte length.  str_modifiable() is
 * called on +str+ before anything is changed, and the cached code range
 * is cleared before returning.
 */
VALUE
rb_str_drop_bytes(VALUE str, long len)
{
    char *ptr = RSTRING_PTR(str);
    long olen = RSTRING_LEN(str), nlen;

    str_modifiable(str);
    if (len > olen) len = olen;
    nlen = olen - len;
    if (STR_EMBEDDABLE_P(nlen, TERM_LEN(str))) {
	/* the remainder fits in the embedded buffer: switch to embedded
	 * representation and copy the tail there */
	char *oldptr = ptr;
	int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
	STR_SET_EMBED(str);
	STR_SET_EMBED_LEN(str, nlen);
	ptr = RSTRING(str)->as.ary;
	memmove(ptr, oldptr + len, nlen);
	/* free the old buffer only if the string was non-embedded with
	 * neither STR_SHARED nor STR_NOFREE set */
	if (fl == STR_NOEMBED) xfree(oldptr);
    }
    else {
	/* return value intentionally unused.
	 * NOTE(review): presumably called for its side effect of turning
	 * str into a shared string so the pointer can be advanced --
	 * confirm against rb_str_new_frozen */
	if (!STR_SHARED_P(str)) rb_str_new_frozen(str);
	/* drop the bytes by advancing the heap pointer, no copy */
	ptr = RSTRING(str)->as.heap.ptr += len;
	RSTRING(str)->as.heap.len = nlen;
    }
    /* a single NUL byte is written after the new end */
    ptr[nlen] = 0;
    ENC_CODERANGE_CLEAR(str);
    return str;
}
4633 
/*
 * Replaces the +len+ bytes of +str+ that start at byte offset +beg+ with
 * the bytes of +val+.  Offsets and lengths are in bytes, and no range
 * checking is done in this function.
 *
 * The resulting code range is that of +val+ when +str+ was 7bit before
 * the change, and unknown otherwise.
 */
static void
rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
{
    char *sptr;
    long slen, vlen = RSTRING_LEN(val);
    int cr;

    /* deleting a prefix: delegate to rb_str_drop_bytes() */
    if (beg == 0 && vlen == 0) {
	rb_str_drop_bytes(str, len);
	OBJ_INFECT(str, val);
	return;
    }

    str_modify_keep_cr(str);
    RSTRING_GETMEM(str, sptr, slen);
    if (len < vlen) {
	/* expand string */
	RESIZE_CAPA(str, slen + vlen - len);
	sptr = RSTRING_PTR(str);	/* re-read the pointer after RESIZE_CAPA */
    }

    if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
	cr = rb_enc_str_coderange(val);
    else
	cr = ENC_CODERANGE_UNKNOWN;

    /* move the tail that follows the replaced region to its new place */
    if (vlen != len) {
	memmove(sptr + beg + vlen,
		sptr + beg + len,
		slen - (beg + len));
    }
    /* NOTE(review): only reachable with a negative len; zero-fills -len
     * bytes starting at the old end of the string -- confirm whether any
     * caller still passes a negative len */
    if (vlen < beg && len < 0) {
	MEMZERO(sptr + slen, char, -len);
    }
    if (vlen > 0) {
	/* NOTE(review): memmove rather than memcpy, presumably because val
	 * may share memory with str -- confirm */
	memmove(sptr + beg, RSTRING_PTR(val), vlen);
    }
    slen += vlen - len;
    STR_SET_LEN(str, slen);
    TERM_FILL(&sptr[slen], TERM_LEN(str));
    OBJ_INFECT(str, val);
    ENC_CODERANGE_SET(str, cr);
}
4677 
4678 void
rb_str_update(VALUE str,long beg,long len,VALUE val)4679 rb_str_update(VALUE str, long beg, long len, VALUE val)
4680 {
4681     long slen;
4682     char *p, *e;
4683     rb_encoding *enc;
4684     int singlebyte = single_byte_optimizable(str);
4685     int cr;
4686 
4687     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
4688 
4689     StringValue(val);
4690     enc = rb_enc_check(str, val);
4691     slen = str_strlen(str, enc); /* rb_enc_check */
4692 
4693     if (slen < beg) {
4694       out_of_range:
4695 	rb_raise(rb_eIndexError, "index %ld out of string", beg);
4696     }
4697     if (beg < 0) {
4698 	if (beg + slen < 0) {
4699 	    goto out_of_range;
4700 	}
4701 	beg += slen;
4702     }
4703     assert(beg >= 0);
4704     assert(beg <= slen);
4705     if (len > slen - beg) {
4706 	len = slen - beg;
4707     }
4708     str_modify_keep_cr(str);
4709     p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
4710     if (!p) p = RSTRING_END(str);
4711     e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
4712     if (!e) e = RSTRING_END(str);
4713     /* error check */
4714     beg = p - RSTRING_PTR(str);	/* physical position */
4715     len = e - p;		/* physical length */
4716     rb_str_splice_0(str, beg, len, val);
4717     rb_enc_associate(str, enc);
4718     cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
4719     if (cr != ENC_CODERANGE_BROKEN)
4720 	ENC_CODERANGE_SET(str, cr);
4721 }
4722 
/* File-local shorthand: rb_str_splice(...) expands to rb_str_update(...)
 * with the same arguments. */
#define rb_str_splice(str, beg, len, val) rb_str_update(str, beg, len, val)
4724 
4725 static void
rb_str_subpat_set(VALUE str,VALUE re,VALUE backref,VALUE val)4726 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
4727 {
4728     int nth;
4729     VALUE match;
4730     long start, end, len;
4731     rb_encoding *enc;
4732     struct re_registers *regs;
4733 
4734     if (rb_reg_search(re, str, 0, 0) < 0) {
4735 	rb_raise(rb_eIndexError, "regexp not matched");
4736     }
4737     match = rb_backref_get();
4738     nth = rb_reg_backref_number(match, backref);
4739     regs = RMATCH_REGS(match);
4740     if (nth >= regs->num_regs) {
4741       out_of_range:
4742 	rb_raise(rb_eIndexError, "index %d out of regexp", nth);
4743     }
4744     if (nth < 0) {
4745 	if (-nth >= regs->num_regs) {
4746 	    goto out_of_range;
4747 	}
4748 	nth += regs->num_regs;
4749     }
4750 
4751     start = BEG(nth);
4752     if (start == -1) {
4753 	rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
4754     }
4755     end = END(nth);
4756     len = end - start;
4757     StringValue(val);
4758     enc = rb_enc_check_str(str, val);
4759     rb_str_splice_0(str, start, len, val);
4760     rb_enc_associate(str, enc);
4761 }
4762 
4763 static VALUE
rb_str_aset(VALUE str,VALUE indx,VALUE val)4764 rb_str_aset(VALUE str, VALUE indx, VALUE val)
4765 {
4766     long idx, beg;
4767 
4768     if (FIXNUM_P(indx)) {
4769 	idx = FIX2LONG(indx);
4770       num_index:
4771 	rb_str_splice(str, idx, 1, val);
4772 	return val;
4773     }
4774 
4775     if (SPECIAL_CONST_P(indx)) goto generic;
4776     switch (BUILTIN_TYPE(indx)) {
4777       case T_REGEXP:
4778 	rb_str_subpat_set(str, indx, INT2FIX(0), val);
4779 	return val;
4780 
4781       case T_STRING:
4782 	beg = rb_str_index(str, indx, 0);
4783 	if (beg < 0) {
4784 	    rb_raise(rb_eIndexError, "string not matched");
4785 	}
4786 	beg = rb_str_sublen(str, beg);
4787 	rb_str_splice(str, beg, str_strlen(indx, NULL), val);
4788 	return val;
4789 
4790       generic:
4791       default:
4792 	/* check if indx is Range */
4793 	{
4794 	    long beg, len;
4795 	    if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
4796 		rb_str_splice(str, beg, len, val);
4797 		return val;
4798 	    }
4799 	}
4800 	idx = NUM2LONG(indx);
4801 	goto num_index;
4802     }
4803 }
4804 
4805 /*
4806  *  call-seq:
4807  *     str[integer] = new_str
4808  *     str[integer, integer] = new_str
4809  *     str[range] = aString
4810  *     str[regexp] = new_str
4811  *     str[regexp, integer] = new_str
4812  *     str[regexp, name] = new_str
4813  *     str[other_str] = new_str
4814  *
4815  *  Element Assignment---Replaces some or all of the content of <i>str</i>. The
4816  *  portion of the string affected is determined using the same criteria as
4817  *  <code>String#[]</code>. If the replacement string is not the same length as
 *  the text it is replacing, the string will be adjusted accordingly. If the
 *  regular expression or string used as the index doesn't match a position
 *  in the string, <code>IndexError</code> is raised. If the regular expression
 *  form is used, the optional second <code>Integer</code> allows you to specify
 *  which portion of the match to replace (effectively using the
 *  <code>MatchData</code> indexing rules). The forms that take an
4824  *  <code>Integer</code> will raise an <code>IndexError</code> if the value is
4825  *  out of range; the <code>Range</code> form will raise a
4826  *  <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code>
4827  *  will raise an <code>IndexError</code> on negative match.
4828  */
4829 
4830 static VALUE
rb_str_aset_m(int argc,VALUE * argv,VALUE str)4831 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
4832 {
4833     if (argc == 3) {
4834 	if (RB_TYPE_P(argv[0], T_REGEXP)) {
4835 	    rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
4836 	}
4837 	else {
4838 	    rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
4839 	}
4840 	return argv[2];
4841     }
4842     rb_check_arity(argc, 2, 3);
4843     return rb_str_aset(str, argv[0], argv[1]);
4844 }
4845 
4846 /*
4847  *  call-seq:
4848  *     str.insert(index, other_str)   -> str
4849  *
4850  *  Inserts <i>other_str</i> before the character at the given
4851  *  <i>index</i>, modifying <i>str</i>. Negative indices count from the
4852  *  end of the string, and insert <em>after</em> the given character.
 *  The intent is to insert <i>other_str</i> so that it starts at the given
 *  <i>index</i>.
4855  *
4856  *     "abcd".insert(0, 'X')    #=> "Xabcd"
4857  *     "abcd".insert(3, 'X')    #=> "abcXd"
4858  *     "abcd".insert(4, 'X')    #=> "abcdX"
4859  *     "abcd".insert(-3, 'X')   #=> "abXcd"
4860  *     "abcd".insert(-1, 'X')   #=> "abcdX"
4861  */
4862 
4863 static VALUE
rb_str_insert(VALUE str,VALUE idx,VALUE str2)4864 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
4865 {
4866     long pos = NUM2LONG(idx);
4867 
4868     if (pos == -1) {
4869 	return rb_str_append(str, str2);
4870     }
4871     else if (pos < 0) {
4872 	pos++;
4873     }
4874     rb_str_splice(str, pos, 0, str2);
4875     return str;
4876 }
4877 
4878 
4879 /*
4880  *  call-seq:
4881  *     str.slice!(integer)           -> new_str or nil
4882  *     str.slice!(integer, integer)   -> new_str or nil
4883  *     str.slice!(range)            -> new_str or nil
4884  *     str.slice!(regexp)           -> new_str or nil
4885  *     str.slice!(other_str)        -> new_str or nil
4886  *
4887  *  Deletes the specified portion from <i>str</i>, and returns the portion
4888  *  deleted.
4889  *
4890  *     string = "this is a string"
4891  *     string.slice!(2)        #=> "i"
4892  *     string.slice!(3..6)     #=> " is "
4893  *     string.slice!(/s.*t/)   #=> "sa st"
4894  *     string.slice!("r")      #=> "r"
4895  *     string                  #=> "thing"
4896  */
4897 
4898 static VALUE
rb_str_slice_bang(int argc,VALUE * argv,VALUE str)4899 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
4900 {
4901     VALUE result;
4902     VALUE buf[3];
4903     int i;
4904 
4905     rb_check_arity(argc, 1, 2);
4906     for (i=0; i<argc; i++) {
4907 	buf[i] = argv[i];
4908     }
4909     str_modify_keep_cr(str);
4910     result = rb_str_aref_m(argc, buf, str);
4911     if (!NIL_P(result)) {
4912 	buf[i] = rb_str_new(0,0);
4913 	rb_str_aset_m(argc+1, buf, str);
4914     }
4915     return result;
4916 }
4917 
4918 static VALUE
get_pat(VALUE pat)4919 get_pat(VALUE pat)
4920 {
4921     VALUE val;
4922 
4923     if (SPECIAL_CONST_P(pat)) goto to_string;
4924     switch (BUILTIN_TYPE(pat)) {
4925       case T_REGEXP:
4926 	return pat;
4927 
4928       case T_STRING:
4929 	break;
4930 
4931       default:
4932       to_string:
4933 	val = rb_check_string_type(pat);
4934 	if (NIL_P(val)) {
4935 	    Check_Type(pat, T_REGEXP);
4936 	}
4937 	pat = val;
4938     }
4939 
4940     return rb_reg_regcomp(pat);
4941 }
4942 
4943 static VALUE
get_pat_quoted(VALUE pat,int check)4944 get_pat_quoted(VALUE pat, int check)
4945 {
4946     VALUE val;
4947 
4948     if (SPECIAL_CONST_P(pat)) goto to_string;
4949     switch (BUILTIN_TYPE(pat)) {
4950       case T_REGEXP:
4951 	return pat;
4952 
4953       case T_STRING:
4954 	break;
4955 
4956       default:
4957       to_string:
4958 	val = rb_check_string_type(pat);
4959 	if (NIL_P(val)) {
4960 	    Check_Type(pat, T_REGEXP);
4961 	}
4962 	pat = val;
4963     }
4964     if (check && is_broken_string(pat)) {
4965 	rb_exc_raise(rb_reg_check_preprocess(pat));
4966     }
4967     return pat;
4968 }
4969 
4970 static long
rb_pat_search(VALUE pat,VALUE str,long pos,int set_backref_str)4971 rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
4972 {
4973     if (BUILTIN_TYPE(pat) == T_STRING) {
4974 	pos = rb_strseq_index(str, pat, pos, 1);
4975 	if (set_backref_str) {
4976 	    if (pos >= 0) {
4977 		VALUE match;
4978 		str = rb_str_new_frozen(str);
4979 		rb_backref_set_string(str, pos, RSTRING_LEN(pat));
4980 		match = rb_backref_get();
4981 		OBJ_INFECT(match, pat);
4982 	    }
4983 	    else {
4984 		rb_backref_set(Qnil);
4985 	    }
4986 	}
4987 	return pos;
4988     }
4989     else {
4990 	return rb_reg_search0(pat, str, pos, 0, set_backref_str);
4991     }
4992 }
4993 
4994 
4995 /*
4996  *  call-seq:
4997  *     str.sub!(pattern, replacement)          -> str or nil
4998  *     str.sub!(pattern) {|match| block }      -> str or nil
4999  *
5000  *  Performs the same substitution as String#sub in-place.
5001  *
5002  *  Returns +str+ if a substitution was performed or +nil+ if no substitution
5003  *  was performed.
5004  */
5005 
static VALUE
rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
{
    /*
     * Replaces the first match of argv[0] in str, in place.
     * The replacement comes from a block (argc == 1), from a Hash lookup
     * keyed by the matched text, or from a replacement String that is
     * passed through rb_reg_regsub().  Returns str when a match was
     * replaced and Qnil when the pattern did not match.
     */
    VALUE pat, repl, hash = Qnil;
    int iter = 0;
    int tainted = 0;
    long plen;
    /* With a block only the pattern is required; otherwise two arguments. */
    int min_arity = rb_block_given_p() ? 1 : 2;
    long beg;

    rb_check_arity(argc, min_arity, 2);
    if (argc == 1) {
	iter = 1;
    }
    else {
	repl = argv[1];
	hash = rb_check_hash_type(argv[1]);
	if (NIL_P(hash)) {
	    /* Not a Hash: the replacement has to be a String. */
	    StringValue(repl);
	}
	tainted = OBJ_TAINTED_RAW(repl);
    }

    pat = get_pat_quoted(argv[0], 1);

    /* NOTE(review): presumably raises if str cannot be modified - confirm. */
    str_modifiable(str);
    /* Search from the start and record the match in the back-reference. */
    beg = rb_pat_search(pat, str, 0, 1);
    if (beg >= 0) {
	rb_encoding *enc;
	int cr = ENC_CODERANGE(str);
	long beg0, end0;
	VALUE match, match0 = Qnil;
	struct re_registers *regs;
	char *p, *rp;
	long len, rlen;

	match = rb_backref_get();
	regs = RMATCH_REGS(match);
	/* [beg0, end0) is the matched region of str. */
	if (RB_TYPE_P(pat, T_STRING)) {
	    beg0 = beg;
	    end0 = beg0 + RSTRING_LEN(pat);
	    match0 = pat;
	}
	else {
	    beg0 = BEG(0);
	    end0 = END(0);
	    if (iter) match0 = rb_reg_nth_match(0, match);
	}

	if (iter || !NIL_P(hash)) {
	    /* Remember the buffer so a change made by the callback is noticed. */
	    p = RSTRING_PTR(str); len = RSTRING_LEN(str);

            if (iter) {
                repl = rb_obj_as_string(rb_yield(match0));
            }
            else {
                repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
                repl = rb_obj_as_string(repl);
            }
	    /*
	     * The block or the hash lookup ran arbitrary code.
	     * NOTE(review): these two checks presumably reject a receiver
	     * that was modified or frozen in the meantime - confirm.
	     */
	    str_mod_check(str, p, len);
	    rb_check_frozen(str);
	}
	else {
	    /* NOTE(review): presumably expands back-references in repl - confirm. */
	    repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
	}

        enc = rb_enc_compatible(str, repl);
        if (!enc) {
            /*
             * No common encoding.  This is tolerated only when the text
             * before and after the match scans as 7bit; the result then
             * takes the encoding of the replacement.
             */
            rb_encoding *str_enc = STR_ENC_GET(str);
	    p = RSTRING_PTR(str); len = RSTRING_LEN(str);
	    if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
		coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
                rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
			 rb_enc_name(str_enc),
			 rb_enc_name(STR_ENC_GET(repl)));
            }
            enc = STR_ENC_GET(repl);
        }
	rb_str_modify(str);
	rb_enc_associate(str, enc);
	tainted |= OBJ_TAINTED_RAW(repl);
	/*
	 * Combine the code ranges, but only when the receiver's range lies
	 * strictly between UNKNOWN and BROKEN.  A broken replacement, or a
	 * 7bit replacement spliced into a VALID receiver, yields UNKNOWN;
	 * otherwise the replacement's range is adopted.
	 */
	if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
	    int cr2 = ENC_CODERANGE(repl);
            if (cr2 == ENC_CODERANGE_BROKEN ||
                (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
                cr = ENC_CODERANGE_UNKNOWN;
            else
                cr = cr2;
	}
	plen = end0 - beg0;
        rlen = RSTRING_LEN(repl);
	len = RSTRING_LEN(str);
	/* Grow the buffer only when the replacement is longer than the match. */
	if (rlen > plen) {
	    RESIZE_CAPA(str, len + rlen - plen);
	}
	/* Re-read the pointer: the resize above may have moved the buffer. */
	p = RSTRING_PTR(str);
	if (rlen != plen) {
	    /* Shift the text after the match to its final position. */
	    memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
	}
	rp = RSTRING_PTR(repl);
	memmove(p + beg0, rp, rlen);
	len += rlen - plen;
	STR_SET_LEN(str, len);
	TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
	ENC_CODERANGE_SET(str, cr);
	FL_SET_RAW(str, tainted);

	return str;
    }
    return Qnil;
}
5117 
5118 
5119 /*
5120  *  call-seq:
5121  *     str.sub(pattern, replacement)         -> new_str
5122  *     str.sub(pattern, hash)                -> new_str
5123  *     str.sub(pattern) {|match| block }     -> new_str
5124  *
5125  *  Returns a copy of +str+ with the _first_ occurrence of +pattern+
5126  *  replaced by the second argument. The +pattern+ is typically a Regexp; if
5127  *  given as a String, any regular expression metacharacters it contains will
5128  *  be interpreted literally, e.g. <code>'\\\d'</code> will match a backslash
5129  *  followed by 'd', instead of a digit.
5130  *
5131  *  If +replacement+ is a String it will be substituted for the matched text.
5132  *  It may contain back-references to the pattern's capture groups of the form
5133  *  <code>"\\d"</code>, where <i>d</i> is a group number, or
5134  *  <code>"\\k<n>"</code>, where <i>n</i> is a group name. If it is a
5135  *  double-quoted string, both back-references must be preceded by an
5136  *  additional backslash. However, within +replacement+ the special match
5137  *  variables, such as <code>$&</code>, will not refer to the current match.
5138  *  If +replacement+ is a String that looks like a pattern's capture group but
5139  *  is actually not a pattern capture group e.g. <code>"\\'"</code>, then it
5140  *  will have to be preceded by two backslashes like so <code>"\\\\'"</code>.
5141  *
5142  *  If the second argument is a Hash, and the matched text is one of its keys,
5143  *  the corresponding value is the replacement string.
5144  *
5145  *  In the block form, the current match string is passed in as a parameter,
5146  *  and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
5147  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
5148  *  returned by the block will be substituted for the match on each call.
5149  *
5150  *  The result inherits any tainting in the original string or any supplied
5151  *  replacement string.
5152  *
5153  *     "hello".sub(/[aeiou]/, '*')                  #=> "h*llo"
5154  *     "hello".sub(/([aeiou])/, '<\1>')             #=> "h<e>llo"
5155  *     "hello".sub(/./) {|s| s.ord.to_s + ' ' }     #=> "104 ello"
5156  *     "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*')  #=> "h*e*llo"
5157  *     'Is SHELL your preferred shell?'.sub(/[[:upper:]]{2,}/, ENV)
5158  *      #=> "Is /bin/bash your preferred shell?"
5159  */
5160 
5161 static VALUE
rb_str_sub(int argc,VALUE * argv,VALUE str)5162 rb_str_sub(int argc, VALUE *argv, VALUE str)
5163 {
5164     str = rb_str_dup(str);
5165     rb_str_sub_bang(argc, argv, str);
5166     return str;
5167 }
5168 
/*
 * Shared implementation of String#gsub and String#gsub!.
 *
 * Every match of argv[0] in +str+ is replaced and the result is built in a
 * fresh buffer (+dest+).  The replacement comes from a block (mode ITER),
 * from a Hash keyed by the matched text (mode MAP), or from a replacement
 * String (mode STR).
 *
 * When +bang+ is non-zero the receiver takes over the contents of +dest+
 * and is returned, or Qnil is returned when nothing matched.  Otherwise
 * +dest+ is returned with the class of +str+, or a duplicate of +str+
 * when nothing matched.
 */
static VALUE
str_gsub(int argc, VALUE *argv, VALUE str, int bang)
{
    VALUE pat, val = Qnil, repl, match, match0 = Qnil, dest, hash = Qnil;
    struct re_registers *regs;
    long beg, beg0, end0;
    long offset, blen, slen, len, last;
    enum {STR, ITER, MAP} mode = STR;
    char *sp, *cp;
    int tainted = 0;
    /*
     * Tri-state flag, also passed to rb_pat_search() as its
     * set_backref_str argument:
     *   -1  not decided yet (still non-zero, so the back-reference is set)
     *    0  the replacement string came back from rb_reg_regsub()
     *       unchanged, so later iterations reuse repl directly
     *    1  rb_reg_regsub() produced a different string, keep calling it
     * It is only ever decided in STR mode.
     */
    int need_backref = -1;
    rb_encoding *str_enc;

    switch (argc) {
      case 1:
	/* NOTE(review): presumably returns an enumerator when no block is given - confirm. */
	RETURN_ENUMERATOR(str, argc, argv);
	mode = ITER;
	break;
      case 2:
	repl = argv[1];
	hash = rb_check_hash_type(argv[1]);
	if (NIL_P(hash)) {
	    StringValue(repl);
	}
	else {
	    mode = MAP;
	}
	tainted = OBJ_TAINTED_RAW(repl);
	break;
      default:
	rb_check_arity(argc, 1, 2);
    }

    pat = get_pat_quoted(argv[0], 1);
    beg = rb_pat_search(pat, str, 0, need_backref);
    if (beg < 0) {
	if (bang) return Qnil;	/* no match, no substitution */
	return rb_str_dup(str);
    }

    offset = 0;
    blen = RSTRING_LEN(str) + 30; /* len + margin */
    dest = rb_str_buf_new(blen);
    /* sp/slen record the receiver's buffer for str_mod_check() below. */
    sp = RSTRING_PTR(str);
    slen = RSTRING_LEN(str);
    cp = sp;
    str_enc = STR_ENC_GET(str);
    rb_enc_associate(dest, str_enc);
    ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);

    do {
	match = rb_backref_get();
	regs = RMATCH_REGS(match);
	/* [beg0, end0) is the region of str covered by the current match. */
	if (RB_TYPE_P(pat, T_STRING)) {
	    beg0 = beg;
	    end0 = beg0 + RSTRING_LEN(pat);
	    match0 = pat;
	}
	else {
	    beg0 = BEG(0);
	    end0 = END(0);
	    if (mode == ITER) match0 = rb_reg_nth_match(0, match);
	}

	/* A non-zero mode is ITER or MAP: the replacement comes from user code. */
	if (mode) {
            if (mode == ITER) {
                val = rb_obj_as_string(rb_yield(match0));
            }
            else {
                val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
                val = rb_obj_as_string(val);
            }
	    /* NOTE(review): presumably raises if the callback modified str - confirm. */
	    str_mod_check(str, sp, slen);
	    if (val == dest) { 	/* paranoid check [ruby-dev:24827] */
		rb_raise(rb_eRuntimeError, "block should not cheat");
	    }
	}
	else if (need_backref) {
	    val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
	    if (need_backref < 0) {
		/* First iteration: decide whether later ones need rb_reg_regsub(). */
		need_backref = val != repl;
	    }
	}
	else {
	    val = repl;
	}

	tainted |= OBJ_TAINTED_RAW(val);

	len = beg0 - offset;	/* copy pre-match substr */
        if (len) {
            rb_enc_str_buf_cat(dest, cp, len, str_enc);
        }

        rb_str_buf_append(dest, val);

	/* Remember where this iteration started; used for the final search. */
	last = offset;
	offset = end0;
	if (beg0 == end0) {
	    /*
	     * Always consume at least one character of the input string
	     * in order to prevent infinite loops.
	     */
	    if (RSTRING_LEN(str) <= end0) break;
	    len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
            rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
	    offset = end0 + len;
	}
	cp = RSTRING_PTR(str) + offset;
	if (offset > RSTRING_LEN(str)) break;
	beg = rb_pat_search(pat, str, offset, need_backref);
    } while (beg >= 0);
    /* Copy whatever follows the last match. */
    if (RSTRING_LEN(str) > offset) {
        rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
    }
    /*
     * Search once more from the start of the last iteration, this time
     * always setting the back-reference.
     * NOTE(review): presumably so that the back-reference describes the
     * last successful match even when need_backref was 0 - confirm.
     */
    rb_pat_search(pat, str, last, 1);
    if (bang) {
        str_shared_replace(str, dest);
    }
    else {
	RBASIC_SET_CLASS(dest, rb_obj_class(str));
	tainted |= OBJ_TAINTED_RAW(str);
	str = dest;
    }

    FL_SET_RAW(str, tainted);
    return str;
}
5297 
5298 
5299 /*
5300  *  call-seq:
5301  *     str.gsub!(pattern, replacement)        -> str or nil
5302  *     str.gsub!(pattern, hash)               -> str or nil
5303  *     str.gsub!(pattern) {|match| block }    -> str or nil
5304  *     str.gsub!(pattern)                     -> an_enumerator
5305  *
5306  *  Performs the substitutions of <code>String#gsub</code> in place, returning
5307  *  <i>str</i>, or <code>nil</code> if no substitutions were performed.
5308  *  If no block and no <i>replacement</i> is given, an enumerator is returned instead.
5309  */
5310 
5311 static VALUE
rb_str_gsub_bang(int argc,VALUE * argv,VALUE str)5312 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
5313 {
5314     str_modify_keep_cr(str);
5315     return str_gsub(argc, argv, str, 1);
5316 }
5317 
5318 
5319 /*
5320  *  call-seq:
5321  *     str.gsub(pattern, replacement)       -> new_str
5322  *     str.gsub(pattern, hash)              -> new_str
5323  *     str.gsub(pattern) {|match| block }   -> new_str
5324  *     str.gsub(pattern)                    -> enumerator
5325  *
5326  *  Returns a copy of <i>str</i> with <em>all</em> occurrences of
5327  *  <i>pattern</i> substituted for the second argument. The <i>pattern</i> is
5328  *  typically a <code>Regexp</code>; if given as a <code>String</code>, any
5329  *  regular expression metacharacters it contains will be interpreted
5330  *  literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd',
5331  *  instead of a digit.
5332  *
5333  *  If <i>replacement</i> is a <code>String</code> it will be substituted for
5334  *  the matched text. It may contain back-references to the pattern's capture
5335  *  groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
5336  *  <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
5337  *  double-quoted string, both back-references must be preceded by an
5338  *  additional backslash. However, within <i>replacement</i> the special match
5339  *  variables, such as <code>$&</code>, will not refer to the current match.
5340  *
5341  *  If the second argument is a <code>Hash</code>, and the matched text is one
5342  *  of its keys, the corresponding value is the replacement string.
5343  *
5344  *  In the block form, the current match string is passed in as a parameter,
5345  *  and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
5346  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
5347  *  returned by the block will be substituted for the match on each call.
5348  *
5349  *  The result inherits any tainting in the original string or any supplied
5350  *  replacement string.
5351  *
5352  *  When neither a block nor a second argument is supplied, an
5353  *  <code>Enumerator</code> is returned.
5354  *
5355  *     "hello".gsub(/[aeiou]/, '*')                  #=> "h*ll*"
5356  *     "hello".gsub(/([aeiou])/, '<\1>')             #=> "h<e>ll<o>"
5357  *     "hello".gsub(/./) {|s| s.ord.to_s + ' '}      #=> "104 101 108 108 111 "
5358  *     "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}')  #=> "h{e}ll{o}"
5359  *     'hello'.gsub(/[eo]/, 'e' => 3, 'o' => '*')    #=> "h3ll*"
5360  */
5361 
5362 static VALUE
rb_str_gsub(int argc,VALUE * argv,VALUE str)5363 rb_str_gsub(int argc, VALUE *argv, VALUE str)
5364 {
5365     return str_gsub(argc, argv, str, 0);
5366 }
5367 
5368 
5369 /*
5370  *  call-seq:
5371  *     str.replace(other_str)   -> str
5372  *
5373  *  Replaces the contents and taintedness of <i>str</i> with the corresponding
5374  *  values in <i>other_str</i>.
5375  *
5376  *     s = "hello"         #=> "hello"
5377  *     s.replace "world"   #=> "world"
5378  */
5379 
5380 VALUE
rb_str_replace(VALUE str,VALUE str2)5381 rb_str_replace(VALUE str, VALUE str2)
5382 {
5383     str_modifiable(str);
5384     if (str == str2) return str;
5385 
5386     StringValue(str2);
5387     str_discard(str);
5388     return str_replace(str, str2);
5389 }
5390 
5391 /*
5392  *  call-seq:
5393  *     string.clear    ->  string
5394  *
5395  *  Makes string empty.
5396  *
5397  *     a = "abcde"
5398  *     a.clear    #=> ""
5399  */
5400 
5401 static VALUE
rb_str_clear(VALUE str)5402 rb_str_clear(VALUE str)
5403 {
5404     str_discard(str);
5405     STR_SET_EMBED(str);
5406     STR_SET_EMBED_LEN(str, 0);
5407     RSTRING_PTR(str)[0] = 0;
5408     if (rb_enc_asciicompat(STR_ENC_GET(str)))
5409 	ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
5410     else
5411 	ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
5412     return str;
5413 }
5414 
5415 /*
5416  *  call-seq:
5417  *     string.chr    ->  string
5418  *
5419  *  Returns a one-character string at the beginning of the string.
5420  *
5421  *     a = "abcde"
5422  *     a.chr    #=> "a"
5423  */
5424 
5425 static VALUE
rb_str_chr(VALUE str)5426 rb_str_chr(VALUE str)
5427 {
5428     return rb_str_substr(str, 0, 1);
5429 }
5430 
5431 /*
5432  *  call-seq:
5433  *     str.getbyte(index)          -> 0 .. 255
5434  *
 *  Returns the <i>index</i>th byte as an integer, or +nil+ if <i>index</i> is out of range.
5436  */
5437 static VALUE
rb_str_getbyte(VALUE str,VALUE index)5438 rb_str_getbyte(VALUE str, VALUE index)
5439 {
5440     long pos = NUM2LONG(index);
5441 
5442     if (pos < 0)
5443         pos += RSTRING_LEN(str);
5444     if (pos < 0 ||  RSTRING_LEN(str) <= pos)
5445         return Qnil;
5446 
5447     return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
5448 }
5449 
5450 /*
5451  *  call-seq:
5452  *     str.setbyte(index, integer) -> integer
5453  *
 *  Sets the <i>index</i>th byte to <i>integer</i> and returns <i>integer</i>.
5455  */
5456 static VALUE
rb_str_setbyte(VALUE str,VALUE index,VALUE value)5457 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
5458 {
5459     long pos = NUM2LONG(index);
5460     long len = RSTRING_LEN(str);
5461     char *head, *left = 0;
5462     unsigned char *ptr;
5463     rb_encoding *enc;
5464     int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
5465 
5466     if (pos < -len || len <= pos)
5467         rb_raise(rb_eIndexError, "index %ld out of string", pos);
5468     if (pos < 0)
5469         pos += len;
5470 
5471     VALUE v = rb_to_int(value);
5472     VALUE w = rb_int_modulo(v, INT2FIX(256));
5473     unsigned char byte = NUM2INT(w) & 0xFF;
5474 
5475     if (!str_independent(str))
5476 	str_make_independent(str);
5477     enc = STR_ENC_GET(str);
5478     head = RSTRING_PTR(str);
5479     ptr = (unsigned char *)&head[pos];
5480     if (!STR_EMBED_P(str)) {
5481 	cr = ENC_CODERANGE(str);
5482 	switch (cr) {
5483 	  case ENC_CODERANGE_7BIT:
5484             left = (char *)ptr;
5485 	    *ptr = byte;
5486 	    if (ISASCII(byte)) goto end;
5487 	    nlen = rb_enc_precise_mbclen(left, head+len, enc);
5488 	    if (!MBCLEN_CHARFOUND_P(nlen))
5489 		ENC_CODERANGE_SET(str, ENC_CODERANGE_BROKEN);
5490 	    else
5491 		ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
5492 	    goto end;
5493 	  case ENC_CODERANGE_VALID:
5494 	    left = rb_enc_left_char_head(head, ptr, head+len, enc);
5495 	    width = rb_enc_precise_mbclen(left, head+len, enc);
5496 	    *ptr = byte;
5497 	    nlen = rb_enc_precise_mbclen(left, head+len, enc);
5498 	    if (!MBCLEN_CHARFOUND_P(nlen))
5499 		ENC_CODERANGE_SET(str, ENC_CODERANGE_BROKEN);
5500 	    else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
5501 		ENC_CODERANGE_CLEAR(str);
5502 	    goto end;
5503 	}
5504     }
5505     ENC_CODERANGE_CLEAR(str);
5506     *ptr = byte;
5507 
5508   end:
5509     return value;
5510 }
5511 
5512 static VALUE
str_byte_substr(VALUE str,long beg,long len,int empty)5513 str_byte_substr(VALUE str, long beg, long len, int empty)
5514 {
5515     char *p, *s = RSTRING_PTR(str);
5516     long n = RSTRING_LEN(str);
5517     VALUE str2;
5518 
5519     if (beg > n || len < 0) return Qnil;
5520     if (beg < 0) {
5521 	beg += n;
5522 	if (beg < 0) return Qnil;
5523     }
5524     if (len > n - beg)
5525 	len = n - beg;
5526     if (len <= 0) {
5527 	if (!empty) return Qnil;
5528 	len = 0;
5529 	p = 0;
5530     }
5531     else
5532 	p = s + beg;
5533 
5534     if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) && SHARABLE_SUBSTRING_P(beg, len, n)) {
5535 	str2 = rb_str_new_frozen(str);
5536 	str2 = str_new_shared(rb_obj_class(str2), str2);
5537 	RSTRING(str2)->as.heap.ptr += beg;
5538 	RSTRING(str2)->as.heap.len = len;
5539     }
5540     else {
5541 	str2 = rb_str_new_with_class(str, p, len);
5542     }
5543 
5544     str_enc_copy(str2, str);
5545 
5546     if (RSTRING_LEN(str2) == 0) {
5547 	if (!rb_enc_asciicompat(STR_ENC_GET(str)))
5548 	    ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID);
5549 	else
5550 	    ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
5551     }
5552     else {
5553 	switch (ENC_CODERANGE(str)) {
5554 	  case ENC_CODERANGE_7BIT:
5555 	    ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
5556 	    break;
5557 	  default:
5558 	    ENC_CODERANGE_SET(str2, ENC_CODERANGE_UNKNOWN);
5559 	    break;
5560 	}
5561     }
5562 
5563     OBJ_INFECT_RAW(str2, str);
5564 
5565     return str2;
5566 }
5567 
5568 static VALUE
str_byte_aref(VALUE str,VALUE indx)5569 str_byte_aref(VALUE str, VALUE indx)
5570 {
5571     long idx;
5572     if (FIXNUM_P(indx)) {
5573 	idx = FIX2LONG(indx);
5574     }
5575     else {
5576 	/* check if indx is Range */
5577 	long beg, len = RSTRING_LEN(str);
5578 
5579 	switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5580 	  case Qfalse:
5581 	    break;
5582 	  case Qnil:
5583 	    return Qnil;
5584 	  default:
5585 	    return str_byte_substr(str, beg, len, TRUE);
5586 	}
5587 
5588 	idx = NUM2LONG(indx);
5589     }
5590     return str_byte_substr(str, idx, 1, FALSE);
5591 }
5592 
5593 /*
5594  *  call-seq:
5595  *     str.byteslice(integer)           -> new_str or nil
5596  *     str.byteslice(integer, integer)   -> new_str or nil
5597  *     str.byteslice(range)            -> new_str or nil
5598  *
5599  *  Byte Reference---If passed a single <code>Integer</code>, returns a
5600  *  substring of one byte at that position. If passed two <code>Integer</code>
5601  *  objects, returns a substring starting at the offset given by the first, and
5602  *  a length given by the second. If given a <code>Range</code>, a substring containing
5603  *  bytes at offsets given by the range is returned. In all three cases, if
5604  *  an offset is negative, it is counted from the end of <i>str</i>. Returns
5605  *  <code>nil</code> if the initial offset falls outside the string, the length
5606  *  is negative, or the beginning of the range is greater than the end.
 *  The resulting string keeps the encoding of the original string.
5608  *
5609  *     "hello".byteslice(1)     #=> "e"
5610  *     "hello".byteslice(-1)    #=> "o"
5611  *     "hello".byteslice(1, 2)  #=> "el"
5612  *     "\x80\u3042".byteslice(1, 3) #=> "\u3042"
5613  *     "\x03\u3042\xff".byteslice(1..3) #=> "\u3042"
5614  */
5615 
5616 static VALUE
rb_str_byteslice(int argc,VALUE * argv,VALUE str)5617 rb_str_byteslice(int argc, VALUE *argv, VALUE str)
5618 {
5619     if (argc == 2) {
5620 	long beg = NUM2LONG(argv[0]);
5621 	long end = NUM2LONG(argv[1]);
5622 	return str_byte_substr(str, beg, end, TRUE);
5623     }
5624     rb_check_arity(argc, 1, 2);
5625     return str_byte_aref(str, argv[0]);
5626 }
5627 
5628 /*
5629  *  call-seq:
5630  *     str.reverse   -> new_str
5631  *
5632  *  Returns a new string with the characters from <i>str</i> in reverse order.
5633  *
5634  *     "stressed".reverse   #=> "desserts"
5635  */
5636 
5637 static VALUE
rb_str_reverse(VALUE str)5638 rb_str_reverse(VALUE str)
5639 {
5640     rb_encoding *enc;
5641     VALUE rev;
5642     char *s, *e, *p;
5643     int cr;
5644 
5645     if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
5646     enc = STR_ENC_GET(str);
5647     rev = rb_str_new_with_class(str, 0, RSTRING_LEN(str));
5648     s = RSTRING_PTR(str); e = RSTRING_END(str);
5649     p = RSTRING_END(rev);
5650     cr = ENC_CODERANGE(str);
5651 
5652     if (RSTRING_LEN(str) > 1) {
5653 	if (single_byte_optimizable(str)) {
5654 	    while (s < e) {
5655 		*--p = *s++;
5656 	    }
5657 	}
5658 	else if (cr == ENC_CODERANGE_VALID) {
5659 	    while (s < e) {
5660 		int clen = rb_enc_fast_mbclen(s, e, enc);
5661 
5662 		p -= clen;
5663 		memcpy(p, s, clen);
5664 		s += clen;
5665 	    }
5666 	}
5667 	else {
5668 	    cr = rb_enc_asciicompat(enc) ?
5669 		ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
5670 	    while (s < e) {
5671 		int clen = rb_enc_mbclen(s, e, enc);
5672 
5673 		if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
5674 		p -= clen;
5675 		memcpy(p, s, clen);
5676 		s += clen;
5677 	    }
5678 	}
5679     }
5680     STR_SET_LEN(rev, RSTRING_LEN(str));
5681     OBJ_INFECT_RAW(rev, str);
5682     str_enc_copy(rev, str);
5683     ENC_CODERANGE_SET(rev, cr);
5684 
5685     return rev;
5686 }
5687 
5688 
5689 /*
5690  *  call-seq:
5691  *     str.reverse!   -> str
5692  *
5693  *  Reverses <i>str</i> in place.
5694  */
5695 
5696 static VALUE
rb_str_reverse_bang(VALUE str)5697 rb_str_reverse_bang(VALUE str)
5698 {
5699     if (RSTRING_LEN(str) > 1) {
5700 	if (single_byte_optimizable(str)) {
5701 	    char *s, *e, c;
5702 
5703 	    str_modify_keep_cr(str);
5704 	    s = RSTRING_PTR(str);
5705 	    e = RSTRING_END(str) - 1;
5706 	    while (s < e) {
5707 		c = *s;
5708 		*s++ = *e;
5709 		*e-- = c;
5710 	    }
5711 	}
5712 	else {
5713 	    str_shared_replace(str, rb_str_reverse(str));
5714 	}
5715     }
5716     else {
5717 	str_modify_keep_cr(str);
5718     }
5719     return str;
5720 }
5721 
5722 
5723 /*
5724  *  call-seq:
5725  *     str.include? other_str   -> true or false
5726  *
5727  *  Returns <code>true</code> if <i>str</i> contains the given string or
5728  *  character.
5729  *
5730  *     "hello".include? "lo"   #=> true
5731  *     "hello".include? "ol"   #=> false
5732  *     "hello".include? ?h     #=> true
5733  */
5734 
5735 static VALUE
rb_str_include(VALUE str,VALUE arg)5736 rb_str_include(VALUE str, VALUE arg)
5737 {
5738     long i;
5739 
5740     StringValue(arg);
5741     i = rb_str_index(str, arg, 0);
5742 
5743     if (i == -1) return Qfalse;
5744     return Qtrue;
5745 }
5746 
5747 
5748 /*
5749  *  call-seq:
5750  *     str.to_i(base=10)   -> integer
5751  *
5752  *  Returns the result of interpreting leading characters in <i>str</i> as an
5753  *  integer base <i>base</i> (between 2 and 36). Extraneous characters past the
5754  *  end of a valid number are ignored. If there is not a valid number at the
5755  *  start of <i>str</i>, <code>0</code> is returned. This method never raises an
5756  *  exception when <i>base</i> is valid.
5757  *
5758  *     "12345".to_i             #=> 12345
5759  *     "99 red balloons".to_i   #=> 99
5760  *     "0a".to_i                #=> 0
5761  *     "0a".to_i(16)            #=> 10
5762  *     "hello".to_i             #=> 0
5763  *     "1100101".to_i(2)        #=> 101
5764  *     "1100101".to_i(8)        #=> 294977
5765  *     "1100101".to_i(10)       #=> 1100101
5766  *     "1100101".to_i(16)       #=> 17826049
5767  */
5768 
5769 static VALUE
rb_str_to_i(int argc,VALUE * argv,VALUE str)5770 rb_str_to_i(int argc, VALUE *argv, VALUE str)
5771 {
5772     int base = 10;
5773 
5774     if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
5775 	rb_raise(rb_eArgError, "invalid radix %d", base);
5776     }
5777     return rb_str_to_inum(str, base, FALSE);
5778 }
5779 
5780 
5781 /*
5782  *  call-seq:
5783  *     str.to_f   -> float
5784  *
5785  *  Returns the result of interpreting leading characters in <i>str</i> as a
5786  *  floating point number. Extraneous characters past the end of a valid number
5787  *  are ignored. If there is not a valid number at the start of <i>str</i>,
5788  *  <code>0.0</code> is returned. This method never raises an exception.
5789  *
5790  *     "123.45e1".to_f        #=> 1234.5
5791  *     "45.67 degrees".to_f   #=> 45.67
5792  *     "thx1138".to_f         #=> 0.0
5793  */
5794 
5795 static VALUE
rb_str_to_f(VALUE str)5796 rb_str_to_f(VALUE str)
5797 {
5798     return DBL2NUM(rb_str_to_dbl(str, FALSE));
5799 }
5800 
5801 
5802 /*
5803  *  call-seq:
5804  *     str.to_s     -> str
5805  *     str.to_str   -> str
5806  *
5807  *  Returns +self+.
5808  *
5809  *  If called on a subclass of String, converts the receiver to a String object.
5810  */
5811 
5812 static VALUE
rb_str_to_s(VALUE str)5813 rb_str_to_s(VALUE str)
5814 {
5815     if (rb_obj_class(str) != rb_cString) {
5816 	return str_duplicate(rb_cString, str);
5817     }
5818     return str;
5819 }
5820 
#if 0
/*
 * Disabled helper: the surrounding "#if 0" keeps it out of the build.
 * Writes the code point +c+ in encoding +enc+ into a local buffer and
 * appends the resulting bytes to +str+.
 */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char s[RUBY_MAX_CHAR_LEN];
    /* NOTE(review): presumably the number of bytes c occupies in enc - confirm. */
    int n = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, s, enc);
    rb_enc_str_buf_cat(str, s, n, enc);
}
#endif
5832 
5833 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
5834 
5835 int
rb_str_buf_cat_escaped_char(VALUE result,unsigned int c,int unicode_p)5836 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
5837 {
5838     char buf[CHAR_ESC_LEN + 1];
5839     int l;
5840 
5841 #if SIZEOF_INT > 4
5842     c &= 0xffffffff;
5843 #endif
5844     if (unicode_p) {
5845 	if (c < 0x7F && ISPRINT(c)) {
5846 	    snprintf(buf, CHAR_ESC_LEN, "%c", c);
5847 	}
5848 	else if (c < 0x10000) {
5849 	    snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
5850 	}
5851 	else {
5852 	    snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
5853 	}
5854     }
5855     else {
5856 	if (c < 0x100) {
5857 	    snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
5858 	}
5859 	else {
5860 	    snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
5861 	}
5862     }
5863     l = (int)strlen(buf);	/* CHAR_ESC_LEN cannot exceed INT_MAX */
5864     rb_str_buf_cat(result, buf, l);
5865     return l;
5866 }
5867 
5868 VALUE
rb_str_escape(VALUE str)5869 rb_str_escape(VALUE str)
5870 {
5871     int encidx = ENCODING_GET(str);
5872     rb_encoding *enc = rb_enc_from_index(encidx);
5873     const char *p = RSTRING_PTR(str);
5874     const char *pend = RSTRING_END(str);
5875     const char *prev = p;
5876     char buf[CHAR_ESC_LEN + 1];
5877     VALUE result = rb_str_buf_new(0);
5878     int unicode_p = rb_enc_unicode_p(enc);
5879     int asciicompat = rb_enc_asciicompat(enc);
5880 
5881     while (p < pend) {
5882 	unsigned int c, cc;
5883 	int n = rb_enc_precise_mbclen(p, pend, enc);
5884         if (!MBCLEN_CHARFOUND_P(n)) {
5885 	    if (p > prev) str_buf_cat(result, prev, p - prev);
5886             n = rb_enc_mbminlen(enc);
5887             if (pend < p + n)
5888                 n = (int)(pend - p);
5889             while (n--) {
5890                 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
5891                 str_buf_cat(result, buf, strlen(buf));
5892                 prev = ++p;
5893             }
5894 	    continue;
5895 	}
5896         n = MBCLEN_CHARFOUND_LEN(n);
5897 	c = rb_enc_mbc_to_codepoint(p, pend, enc);
5898 	p += n;
5899 	switch (c) {
5900 	  case '\n': cc = 'n'; break;
5901 	  case '\r': cc = 'r'; break;
5902 	  case '\t': cc = 't'; break;
5903 	  case '\f': cc = 'f'; break;
5904 	  case '\013': cc = 'v'; break;
5905 	  case '\010': cc = 'b'; break;
5906 	  case '\007': cc = 'a'; break;
5907 	  case 033: cc = 'e'; break;
5908 	  default: cc = 0; break;
5909 	}
5910 	if (cc) {
5911 	    if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
5912 	    buf[0] = '\\';
5913 	    buf[1] = (char)cc;
5914 	    str_buf_cat(result, buf, 2);
5915 	    prev = p;
5916 	}
5917 	else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
5918 	}
5919 	else {
5920 	    if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
5921 	    rb_str_buf_cat_escaped_char(result, c, unicode_p);
5922 	    prev = p;
5923 	}
5924     }
5925     if (p > prev) str_buf_cat(result, prev, p - prev);
5926     ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
5927 
5928     OBJ_INFECT_RAW(result, str);
5929     return result;
5930 }
5931 
5932 /*
5933  * call-seq:
5934  *   str.inspect   -> string
5935  *
5936  * Returns a printable version of _str_, surrounded by quote marks,
5937  * with special characters escaped.
5938  *
5939  *    str = "hello"
5940  *    str[3] = "\b"
5941  *    str.inspect       #=> "\"hel\\bo\""
5942  */
5943 
VALUE
rb_str_inspect(VALUE str)
{
    /*
     * Builds the quoted, escaped form of str.  Runs of characters that
     * need no escaping are copied verbatim: [prev, p) is the run that has
     * been scanned but not yet appended to result.
     */
    int encidx = ENCODING_GET(str);
    rb_encoding *enc = rb_enc_from_index(encidx), *actenc;
    const char *p, *pend, *prev;
    char buf[CHAR_ESC_LEN + 1];
    VALUE result = rb_str_buf_new(0);
    rb_encoding *resenc = rb_default_internal_encoding();
    int unicode_p = rb_enc_unicode_p(enc);
    int asciicompat = rb_enc_asciicompat(enc);

    /*
     * Encoding of the result: the default internal encoding, else the
     * default external one, replaced by US-ASCII when the chosen encoding
     * is not ASCII-compatible.
     */
    if (resenc == NULL) resenc = rb_default_external_encoding();
    if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
    rb_enc_associate(result, resenc);
    str_buf_cat2(result, "\"");

    p = RSTRING_PTR(str); pend = RSTRING_END(str);
    prev = p;
    /* NOTE(review): get_actual_encoding() may pick a different encoding than the declared one - confirm when. */
    actenc = get_actual_encoding(encidx, str);
    if (actenc != enc) {
	enc = actenc;
	if (unicode_p) unicode_p = rb_enc_unicode_p(enc);
    }
    while (p < pend) {
	unsigned int c, cc;
	int n;

        n = rb_enc_precise_mbclen(p, pend, enc);
        if (!MBCLEN_CHARFOUND_P(n)) {
	    /* Invalid sequence: flush the run, then emit raw bytes as \xHH. */
	    if (p > prev) str_buf_cat(result, prev, p - prev);
            n = rb_enc_mbminlen(enc);
            if (pend < p + n)
                n = (int)(pend - p);
            while (n--) {
                snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
                str_buf_cat(result, buf, strlen(buf));
                prev = ++p;
            }
	    continue;
	}
        n = MBCLEN_CHARFOUND_LEN(n);
	c = rb_enc_mbc_to_codepoint(p, pend, enc);
	p += n;
	/*
	 * A double quote, a backslash, or a '#' that is followed by '$',
	 * '@' or '{' gets a backslash in front of it.
	 */
	if ((asciicompat || unicode_p) &&
	  (c == '"'|| c == '\\' ||
	    (c == '#' &&
             p < pend &&
             MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
             (cc = rb_enc_codepoint(p,pend,enc),
              (cc == '$' || cc == '@' || cc == '{'))))) {
	    if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
	    str_buf_cat2(result, "\\");
	    if (asciicompat || enc == resenc) {
		/* Restart the run at this character so it is copied as is. */
		prev = p - n;
		continue;
	    }
	    /* Otherwise fall through: the character itself is handled below. */
	}
	/* cc becomes the letter of a two-character escape, or 0 if none applies. */
	switch (c) {
	  case '\n': cc = 'n'; break;
	  case '\r': cc = 'r'; break;
	  case '\t': cc = 't'; break;
	  case '\f': cc = 'f'; break;
	  case '\013': cc = 'v'; break;
	  case '\010': cc = 'b'; break;
	  case '\007': cc = 'a'; break;
	  case 033: cc = 'e'; break;
	  default: cc = 0; break;
	}
	if (cc) {
	    if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
	    buf[0] = '\\';
	    buf[1] = (char)cc;
	    str_buf_cat(result, buf, 2);
	    prev = p;
	    continue;
	}
	/*
	 * Printable in the result encoding, or printable ASCII in an
	 * ASCII-compatible source: keep it in the pending run.
	 */
	if ((enc == resenc && rb_enc_isprint(c, enc)) ||
	    (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
	    continue;
	}
	else {
	    /* Everything else is appended as a numeric escape. */
	    if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
	    rb_str_buf_cat_escaped_char(result, c, unicode_p);
	    prev = p;
	    continue;
	}
    }
    /* Flush the final run and close the quote. */
    if (p > prev) str_buf_cat(result, prev, p - prev);
    str_buf_cat2(result, "\"");

    OBJ_INFECT_RAW(result, str);
    return result;
}
6038 
/* True when p lies before e and *p is '$', '@' or '{'.  rb_str_dump tests
 * the byte following a '#' with this to decide whether that '#' needs a
 * backslash in front of it. */
#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
6040 
6041 /*
6042  *  call-seq:
6043  *     str.dump   -> new_str
6044  *
 *  Produces a quoted version of +str+ with all non-printing characters
 *  replaced by <code>\xHH</code> notation (<code>\uHHHH</code> for
 *  multibyte characters of UTF-8 strings) and all special characters escaped.
6047  *
6048  *    "hello \n ''".dump  #=> "\"hello \\n ''\""
6049  */
6050 
VALUE
rb_str_dump(VALUE str)
{
    /*
     * Works in two passes over the bytes of str: the first pass only adds up
     * how many bytes the escaped form needs (len), the second pass writes the
     * escaped form into a result string allocated with exactly that length.
     * Both passes must classify every byte identically.
     *
     * When the encoding of str is not ASCII compatible, the quoted text is
     * followed by .dup.force_encoding("<encoding name>") and the result is
     * tagged with the ASCII-8BIT encoding index; otherwise the result keeps
     * the encoding index of str.
     *
     * Raises RuntimeError ("string size too big") if the length would exceed
     * LONG_MAX.
     */
    int encidx = rb_enc_get_index(str);
    rb_encoding *enc = rb_enc_from_index(encidx);
    long len;
    const char *p, *pend;
    char *q, *qend;
    VALUE result;
    int u8 = (encidx == rb_utf8_encindex());
    static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";

    len = 2;			/* "" */
    if (!rb_enc_asciicompat(enc)) {
	/* suffix length with the "%s" placeholder replaced by the name */
	len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
	len += strlen(enc->name);
    }

    /* pass 1: compute the escaped length */
    p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
    while (p < pend) {
	int clen;
	unsigned char c = *p++;

	switch (c) {
	  case '"':  case '\\':
	  case '\n': case '\r':
	  case '\t': case '\f':
	  case '\013': case '\010': case '\007': case '\033':
	    clen = 2;
	    break;

	  case '#':
	    /* '#' only needs a backslash when followed by '$', '@' or '{' */
	    clen = IS_EVSTR(p, pend) ? 2 : 1;
	    break;

	  default:
	    if (ISPRINT(c)) {
		clen = 1;
	    }
	    else {
		if (u8 && c > 0x7F) {	/* \u notation */
		    int n = rb_enc_precise_mbclen(p-1, pend, enc);
		    if (MBCLEN_CHARFOUND_P(n)) {
			unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
			if (cc <= 0xFFFF)
			    clen = 6;  /* \uXXXX */
			else if (cc <= 0xFFFFF)
			    clen = 9;  /* \u{XXXXX} */
			else
			    clen = 10; /* \u{XXXXXX} */
			/* skip the remaining bytes of this character */
			p += MBCLEN_CHARFOUND_LEN(n)-1;
			break;
		    }
		}
		clen = 4;	/* \xNN */
	    }
	    break;
	}

	if (clen > LONG_MAX - len) {
	    rb_raise(rb_eRuntimeError, "string size too big");
	}
	len += clen;
    }

    /* pass 2: write the escaped form */
    result = rb_str_new_with_class(str, 0, len);
    p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
    /* qend is one past len so snprintf has room for the NUL it appends */
    q = RSTRING_PTR(result); qend = q + len + 1;

    *q++ = '"';
    while (p < pend) {
	unsigned char c = *p++;

	if (c == '"' || c == '\\') {
	    *q++ = '\\';
	    *q++ = c;
	}
	else if (c == '#') {
	    if (IS_EVSTR(p, pend)) *q++ = '\\';
	    *q++ = '#';
	}
	else if (c == '\n') {
	    *q++ = '\\';
	    *q++ = 'n';
	}
	else if (c == '\r') {
	    *q++ = '\\';
	    *q++ = 'r';
	}
	else if (c == '\t') {
	    *q++ = '\\';
	    *q++ = 't';
	}
	else if (c == '\f') {
	    *q++ = '\\';
	    *q++ = 'f';
	}
	else if (c == '\013') {
	    *q++ = '\\';
	    *q++ = 'v';
	}
	else if (c == '\010') {
	    *q++ = '\\';
	    *q++ = 'b';
	}
	else if (c == '\007') {
	    *q++ = '\\';
	    *q++ = 'a';
	}
	else if (c == '\033') {
	    *q++ = '\\';
	    *q++ = 'e';
	}
	else if (ISPRINT(c)) {
	    *q++ = c;
	}
	else {
	    *q++ = '\\';
	    if (u8) {
		/* n = bytes of this character still unread (c is already consumed).
		 * NOTE(review): MBCLEN_CHARFOUND_P is applied to the decremented
		 * value; presumably that makes single-byte and invalid results
		 * fall through to the \xNN form below, matching pass 1 -- confirm
		 * against the macro definition. */
		int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
		if (MBCLEN_CHARFOUND_P(n)) {
		    int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
		    p += n;
		    if (cc <= 0xFFFF)
			snprintf(q, qend-q, "u%04X", cc);    /* \uXXXX */
		    else
			snprintf(q, qend-q, "u{%X}", cc);  /* \u{XXXXX} or \u{XXXXXX} */
		    q += strlen(q);
		    continue;
		}
	    }
	    snprintf(q, qend-q, "x%02X", c);
	    q += 3;	/* 'x' plus two hex digits */
	}
    }
    *q++ = '"';
    *q = '\0';
    if (!rb_enc_asciicompat(enc)) {
	snprintf(q, qend-q, nonascii_suffix, enc->name);
	encidx = rb_ascii8bit_encindex();
    }
    OBJ_INFECT_RAW(result, str);
    /* result from dump is ASCII */
    rb_enc_associate_index(result, encidx);
    ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
    return result;
}
6198 
6199 static int
unescape_ascii(unsigned int c)6200 unescape_ascii(unsigned int c)
6201 {
6202     switch (c) {
6203       case 'n':
6204 	return '\n';
6205       case 'r':
6206 	return '\r';
6207       case 't':
6208 	return '\t';
6209       case 'f':
6210 	return '\f';
6211       case 'v':
6212 	return '\13';
6213       case 'b':
6214 	return '\010';
6215       case 'a':
6216 	return '\007';
6217       case 'e':
6218 	return 033;
6219       default:
6220 	UNREACHABLE;
6221     }
6222 }
6223 
6224 static void
undump_after_backslash(VALUE undumped,const char ** ss,const char * s_end,rb_encoding ** penc,bool * utf8,bool * binary)6225 undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
6226 {
6227     const char *s = *ss;
6228     unsigned int c;
6229     int codelen;
6230     size_t hexlen;
6231     unsigned char buf[6];
6232     static rb_encoding *enc_utf8 = NULL;
6233 
6234     switch (*s) {
6235       case '\\':
6236       case '"':
6237       case '#':
6238 	rb_str_cat(undumped, s, 1); /* cat itself */
6239 	s++;
6240 	break;
6241       case 'n':
6242       case 'r':
6243       case 't':
6244       case 'f':
6245       case 'v':
6246       case 'b':
6247       case 'a':
6248       case 'e':
6249         *buf = unescape_ascii(*s);
6250         rb_str_cat(undumped, (char *)buf, 1);
6251 	s++;
6252 	break;
6253       case 'u':
6254 	if (*binary) {
6255 	    rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
6256 	}
6257 	*utf8 = true;
6258 	if (++s >= s_end) {
6259 	    rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6260 	}
6261 	if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
6262 	if (*penc != enc_utf8) {
6263 	    *penc = enc_utf8;
6264 	    rb_enc_associate(undumped, enc_utf8);
6265 	}
6266 	if (*s == '{') { /* handle \u{...} form */
6267 	    s++;
6268 	    for (;;) {
6269 		if (s >= s_end) {
6270 		    rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
6271 		}
6272 		if (*s == '}') {
6273 		    s++;
6274 		    break;
6275 		}
6276 		if (ISSPACE(*s)) {
6277 		    s++;
6278 		    continue;
6279 		}
6280 		c = scan_hex(s, s_end-s, &hexlen);
6281 		if (hexlen == 0 || hexlen > 6) {
6282 		    rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6283 		}
6284 		if (c > 0x10ffff) {
6285 		    rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
6286 		}
6287 		if (0xd800 <= c && c <= 0xdfff) {
6288 		    rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
6289 		}
6290                 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
6291                 rb_str_cat(undumped, (char *)buf, codelen);
6292 		s += hexlen;
6293 	    }
6294 	}
6295 	else { /* handle \uXXXX form */
6296 	    c = scan_hex(s, 4, &hexlen);
6297 	    if (hexlen != 4) {
6298 		rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6299 	    }
6300 	    if (0xd800 <= c && c <= 0xdfff) {
6301 		rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
6302 	    }
6303             codelen = rb_enc_mbcput(c, (char *)buf, *penc);
6304             rb_str_cat(undumped, (char *)buf, codelen);
6305 	    s += hexlen;
6306 	}
6307 	break;
6308       case 'x':
6309 	if (*utf8) {
6310 	    rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
6311 	}
6312 	*binary = true;
6313 	if (++s >= s_end) {
6314 	    rb_raise(rb_eRuntimeError, "invalid hex escape");
6315 	}
6316 	*buf = scan_hex(s, 2, &hexlen);
6317 	if (hexlen != 2) {
6318 	    rb_raise(rb_eRuntimeError, "invalid hex escape");
6319 	}
6320         rb_str_cat(undumped, (char *)buf, 1);
6321 	s += hexlen;
6322 	break;
6323       default:
6324 	rb_str_cat(undumped, s-1, 2);
6325 	s++;
6326     }
6327 
6328     *ss = s;
6329 }
6330 
6331 static VALUE rb_str_is_ascii_only_p(VALUE str);
6332 
6333 /*
6334  *  call-seq:
6335  *     str.undump   -> new_str
6336  *
 *  Produces an unescaped version of +str+.
 *  See also String#dump; String#undump does the inverse of String#dump.
6339  *
6340  *    "\"hello \\n ''\"".undump #=> "hello \n ''"
6341  */
6342 
6343 static VALUE
str_undump(VALUE str)6344 str_undump(VALUE str)
6345 {
6346     const char *s = RSTRING_PTR(str);
6347     const char *s_end = RSTRING_END(str);
6348     rb_encoding *enc = rb_enc_get(str);
6349     VALUE undumped = rb_enc_str_new(s, 0L, enc);
6350     bool utf8 = false;
6351     bool binary = false;
6352     int w;
6353 
6354     rb_must_asciicompat(str);
6355     if (rb_str_is_ascii_only_p(str) == Qfalse) {
6356 	rb_raise(rb_eRuntimeError, "non-ASCII character detected");
6357     }
6358     if (!str_null_check(str, &w)) {
6359        rb_raise(rb_eRuntimeError, "string contains null byte");
6360     }
6361     if (RSTRING_LEN(str) < 2) goto invalid_format;
6362     if (*s != '"') goto invalid_format;
6363 
6364     /* strip '"' at the start */
6365     s++;
6366 
6367     for (;;) {
6368 	if (s >= s_end) {
6369 	    rb_raise(rb_eRuntimeError, "unterminated dumped string");
6370 	}
6371 
6372 	if (*s == '"') {
6373 	    /* epilogue */
6374 	    s++;
6375 	    if (s == s_end) {
6376 		/* ascii compatible dumped string */
6377 		break;
6378 	    }
6379 	    else {
6380 		static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
6381 		static const char dup_suffix[] = ".dup";
6382 		const char *encname;
6383 		int encidx;
6384 		ptrdiff_t size;
6385 
6386 		/* check separately for strings dumped by older versions */
6387 		size = sizeof(dup_suffix) - 1;
6388 		if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
6389 
6390 		size = sizeof(force_encoding_suffix) - 1;
6391 		if (s_end - s <= size) goto invalid_format;
6392 		if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
6393 		s += size;
6394 
6395 		if (utf8) {
6396 		    rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
6397 		}
6398 
6399 		encname = s;
6400 		s = memchr(s, '"', s_end-s);
6401 		size = s - encname;
6402 		if (!s) goto invalid_format;
6403 		if (s_end - s != 2) goto invalid_format;
6404 		if (s[0] != '"' || s[1] != ')') goto invalid_format;
6405 
6406 		encidx = rb_enc_find_index2(encname, (long)size);
6407 		if (encidx < 0) {
6408 		    rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
6409 		}
6410 		rb_enc_associate_index(undumped, encidx);
6411 	    }
6412 	    break;
6413 	}
6414 
6415 	if (*s == '\\') {
6416 	    s++;
6417 	    if (s >= s_end) {
6418 		rb_raise(rb_eRuntimeError, "invalid escape");
6419 	    }
6420 	    undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
6421 	}
6422 	else {
6423 	    rb_str_cat(undumped, s++, 1);
6424 	}
6425     }
6426 
6427     OBJ_INFECT(undumped, str);
6428     return undumped;
6429 invalid_format:
6430     rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
6431 }
6432 
6433 static void
rb_str_check_dummy_enc(rb_encoding * enc)6434 rb_str_check_dummy_enc(rb_encoding *enc)
6435 {
6436     if (rb_enc_dummy_p(enc)) {
6437 	rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
6438 		 rb_enc_name(enc));
6439     }
6440 }
6441 
6442 static OnigCaseFoldType
check_case_options(int argc,VALUE * argv,OnigCaseFoldType flags)6443 check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
6444 {
6445     if (argc==0)
6446         return flags;
6447     if (argc>2)
6448         rb_raise(rb_eArgError, "too many options");
6449     if (argv[0]==sym_turkic) {
6450 	flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
6451 	if (argc==2) {
6452 	    if (argv[1]==sym_lithuanian)
6453 		flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
6454 	    else
6455 		rb_raise(rb_eArgError, "invalid second option");
6456 	}
6457     }
6458     else if (argv[0]==sym_lithuanian) {
6459 	flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
6460 	if (argc==2) {
6461 	    if (argv[1]==sym_turkic)
6462 	        flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
6463 	    else
6464 	        rb_raise(rb_eArgError, "invalid second option");
6465 	}
6466     }
6467     else if (argc>1)
6468 	rb_raise(rb_eArgError, "too many options");
6469     else if (argv[0]==sym_ascii)
6470 	flags |= ONIGENC_CASE_ASCII_ONLY;
6471     else if (argv[0]==sym_fold) {
6472 	if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
6473 	    flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
6474 	else
6475 	    rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
6476     }
6477     else
6478 	rb_raise(rb_eArgError, "invalid option");
6479     return flags;
6480 }
6481 
/* 16 should be long enough to absorb any kind of single character length
 * increase; the value actually used below is 20.  rb_str_casemap adds it to
 * every buffer capacity it computes. */
#define CASE_MAPPING_ADDITIONAL_LENGTH 20
/* When nonzero, the case-mapping helpers print tuning/diagnostic messages to
 * stderr.  Defaults to 0 unless defined beforehand. */
#ifndef CASEMAP_DEBUG
# define CASEMAP_DEBUG 0
#endif
6487 
/* One chunk of case-mapped output.  rb_str_casemap links chunks into a
 * singly linked list through next and concatenates them at the end. */
struct mapping_buffer;
typedef struct mapping_buffer {
    size_t capa;                    /* bytes available in space[] */
    size_t used;                    /* bytes of space[] holding mapped output */
    struct mapping_buffer *next;    /* following chunk, NULL for the last one */
    OnigUChar space[FLEX_ARY_LEN];  /* output area, allocated capa bytes long */
} mapping_buffer;
6495 
6496 static void
mapping_buffer_free(void * p)6497 mapping_buffer_free(void *p)
6498 {
6499     mapping_buffer *previous_buffer;
6500     mapping_buffer *current_buffer = p;
6501     while (current_buffer) {
6502         previous_buffer = current_buffer;
6503         current_buffer  = current_buffer->next;
6504         ruby_sized_xfree(previous_buffer, previous_buffer->capa);
6505     }
6506 }
6507 
/* Typed-data descriptor used by rb_str_casemap for the anchor object that
 * holds the head of a mapping_buffer chain.  The first function slot is left
 * 0 and mapping_buffer_free fills the second (presumably the mark and free
 * callbacks respectively -- confirm against rb_data_type_t). */
static const rb_data_type_t mapping_buffer_type = {
    "mapping_buffer",
    {0, mapping_buffer_free,}
};
6512 
/*
 * Return a new string holding source case-mapped by enc->case_map according
 * to *flags (which the callee may update, e.g. callers test
 * ONIGENC_CASE_MODIFIED afterwards).  An empty source yields a plain dup.
 *
 * Output is produced into a chain of mapping_buffer chunks: each round
 * allocates a chunk and lets case_map consume as much of the remaining
 * source as fits, so the loop ends once the whole source has been consumed.
 * The chain head is stored in a typed-data anchor object
 * (mapping_buffer_type) while the loop runs.
 *
 * Raises ArgumentError ("input string invalid") when case_map reports a
 * negative result; the chunks are freed first.
 */
static VALUE
rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
{
    VALUE target;

    OnigUChar *source_current, *source_end;
    int target_length = 0;
    VALUE buffer_anchor;
    mapping_buffer *current_buffer = 0;
    mapping_buffer **pre_buffer;	/* where to store the next chunk's address */
    size_t buffer_count = 0;
    int buffer_length_or_invalid;

    if (RSTRING_LEN(source) == 0) return rb_str_dup(source);

    source_current = (OnigUChar*)RSTRING_PTR(source);
    source_end = (OnigUChar*)RSTRING_END(source);

    buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
    /* the first chunk is linked directly into the anchor's data pointer */
    pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
    while (source_current < source_end) {
	/* increase multiplier using buffer count to converge quickly */
	size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
	if (CASEMAP_DEBUG) {
	    fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
	}
        current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
        *pre_buffer = current_buffer;
        pre_buffer = &current_buffer->next;
	current_buffer->next = NULL;
	current_buffer->capa = capa;
	/* case_map advances source_current past the input it consumed */
	buffer_length_or_invalid = enc->case_map(flags,
				   (const OnigUChar**)&source_current, source_end,
				   current_buffer->space,
				   current_buffer->space+current_buffer->capa,
				   enc);
	if (buffer_length_or_invalid < 0) {
	    /* detach the chain from the anchor before freeing it by hand */
            current_buffer = DATA_PTR(buffer_anchor);
            DATA_PTR(buffer_anchor) = 0;
	    mapping_buffer_free(current_buffer);
	    rb_raise(rb_eArgError, "input string invalid");
	}
	target_length  += current_buffer->used = buffer_length_or_invalid;
    }
    if (CASEMAP_DEBUG) {
	fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
    }

    if (buffer_count==1) {
	/* single chunk: build the result straight from it */
	target = rb_str_new_with_class(source, (const char*)current_buffer->space, target_length);
    }
    else {
	char *target_current;

	/* several chunks: allocate the full length, then copy them in order */
	target = rb_str_new_with_class(source, 0, target_length);
	target_current = RSTRING_PTR(target);
	current_buffer = DATA_PTR(buffer_anchor);
	while (current_buffer) {
	    memcpy(target_current, current_buffer->space, current_buffer->used);
	    target_current += current_buffer->used;
	    current_buffer  = current_buffer->next;
	}
    }
    /* detach the chain from the anchor and release it now */
    current_buffer = DATA_PTR(buffer_anchor);
    DATA_PTR(buffer_anchor) = 0;
    mapping_buffer_free(current_buffer);

    /* TODO: check about string terminator character */
    OBJ_INFECT_RAW(target, source);
    str_enc_copy(target, source);
    /*ENC_CODERANGE_SET(mapped, cr);*/

    return target;
}
6587 
6588 static void
rb_str_ascii_casemap(VALUE source,OnigCaseFoldType * flags,rb_encoding * enc)6589 rb_str_ascii_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
6590 {
6591     OnigUChar *source_current, *source_end;
6592     long old_length = RSTRING_LEN(source);
6593     int length_or_invalid;
6594 
6595     if (old_length == 0) return;
6596 
6597     source_current = (OnigUChar*)RSTRING_PTR(source);
6598     source_end = (OnigUChar*)RSTRING_END(source);
6599 
6600     length_or_invalid = onigenc_ascii_only_case_map(flags,
6601 			       (const OnigUChar**)&source_current, source_end,
6602 			       source_current, source_end, enc);
6603     if (length_or_invalid < 0)
6604         rb_raise(rb_eArgError, "input string invalid");
6605     if (CASEMAP_DEBUG && length_or_invalid != old_length) {
6606 	fprintf(stderr, "problem with rb_str_ascii_casemap"
6607 		"; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
6608 	rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
6609 		 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
6610     }
6611 }
6612 
6613 /*
6614  *  call-seq:
6615  *     str.upcase!              -> str or nil
6616  *     str.upcase!([options])   -> str or nil
6617  *
6618  *  Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
6619  *  were made.
6620  *
6621  *  See String#downcase for meaning of +options+ and use with different encodings.
6622  */
6623 
6624 static VALUE
rb_str_upcase_bang(int argc,VALUE * argv,VALUE str)6625 rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
6626 {
6627     rb_encoding *enc;
6628     OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
6629 
6630     flags = check_case_options(argc, argv, flags);
6631     str_modify_keep_cr(str);
6632     enc = STR_ENC_GET(str);
6633     rb_str_check_dummy_enc(enc);
6634     if (((flags&ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc)==1))
6635 	|| (!(flags&ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str)==ENC_CODERANGE_7BIT)) {
6636         char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
6637 
6638 	while (s < send) {
6639 	    unsigned int c = *(unsigned char*)s;
6640 
6641 	    if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
6642 		*s = 'A' + (c - 'a');
6643 		flags |= ONIGENC_CASE_MODIFIED;
6644 	    }
6645 	    s++;
6646 	}
6647     }
6648     else if (flags&ONIGENC_CASE_ASCII_ONLY)
6649         rb_str_ascii_casemap(str, &flags, enc);
6650     else
6651 	str_shared_replace(str, rb_str_casemap(str, &flags, enc));
6652 
6653     if (ONIGENC_CASE_MODIFIED&flags) return str;
6654     return Qnil;
6655 }
6656 
6657 
6658 /*
6659  *  call-seq:
6660  *     str.upcase              -> new_str
6661  *     str.upcase([options])   -> new_str
6662  *
6663  *  Returns a copy of <i>str</i> with all lowercase letters replaced with their
6664  *  uppercase counterparts.
6665  *
6666  *  See String#downcase for meaning of +options+ and use with different encodings.
6667  *
6668  *     "hEllO".upcase   #=> "HELLO"
6669  */
6670 
6671 static VALUE
rb_str_upcase(int argc,VALUE * argv,VALUE str)6672 rb_str_upcase(int argc, VALUE *argv, VALUE str)
6673 {
6674     str = rb_str_dup(str);
6675     rb_str_upcase_bang(argc, argv, str);
6676     return str;
6677 }
6678 
6679 /*
6680  *  call-seq:
6681  *     str.downcase!             -> str or nil
6682  *     str.downcase!([options])  -> str or nil
6683  *
6684  *  Downcases the contents of <i>str</i>, returning <code>nil</code> if no
6685  *  changes were made.
6686  *
6687  *  See String#downcase for meaning of +options+ and use with different encodings.
6688  */
6689 
6690 static VALUE
rb_str_downcase_bang(int argc,VALUE * argv,VALUE str)6691 rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
6692 {
6693     rb_encoding *enc;
6694     OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
6695 
6696     flags = check_case_options(argc, argv, flags);
6697     str_modify_keep_cr(str);
6698     enc = STR_ENC_GET(str);
6699     rb_str_check_dummy_enc(enc);
6700     if (((flags&ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc)==1))
6701 	|| (!(flags&ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str)==ENC_CODERANGE_7BIT)) {
6702         char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
6703 
6704 	while (s < send) {
6705 	    unsigned int c = *(unsigned char*)s;
6706 
6707 	    if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
6708 		*s = 'a' + (c - 'A');
6709 		flags |= ONIGENC_CASE_MODIFIED;
6710 	    }
6711 	    s++;
6712 	}
6713     }
6714     else if (flags&ONIGENC_CASE_ASCII_ONLY)
6715         rb_str_ascii_casemap(str, &flags, enc);
6716     else
6717 	str_shared_replace(str, rb_str_casemap(str, &flags, enc));
6718 
6719     if (ONIGENC_CASE_MODIFIED&flags) return str;
6720     return Qnil;
6721 }
6722 
6723 
6724 /*
6725  *  call-seq:
6726  *     str.downcase              -> new_str
6727  *     str.downcase([options])   -> new_str
6728  *
6729  *  Returns a copy of <i>str</i> with all uppercase letters replaced with their
6730  *  lowercase counterparts. Which letters exactly are replaced, and by which
6731  *  other letters, depends on the presence or absence of options, and on the
6732  *  +encoding+ of the string.
6733  *
6734  *  The meaning of the +options+ is as follows:
6735  *
6736  *  No option ::
6737  *    Full Unicode case mapping, suitable for most languages
6738  *    (see :turkic and :lithuanian options below for exceptions).
6739  *    Context-dependent case mapping as described in Table 3-14 of the
6740  *    Unicode standard is currently not supported.
6741  *  :ascii ::
6742  *    Only the ASCII region, i.e. the characters ``A'' to ``Z'' and
6743  *    ``a'' to ``z'', are affected.
6744  *    This option cannot be combined with any other option.
6745  *  :turkic ::
6746  *    Full Unicode case mapping, adapted for Turkic languages
6747  *    (Turkish, Azerbaijani, ...). This means that upper case I is mapped to
6748  *    lower case dotless i, and so on.
6749  *  :lithuanian ::
6750  *    Currently, just full Unicode case mapping. In the future, full Unicode
6751  *    case mapping adapted for Lithuanian (keeping the dot on the lower case
6752  *    i even if there is an accent on top).
6753  *  :fold ::
6754  *    Only available on +downcase+ and +downcase!+. Unicode case <b>folding</b>,
6755  *    which is more far-reaching than Unicode case mapping.
6756  *    This option currently cannot be combined with any other option
6757  *    (i.e. there is currently no variant for turkic languages).
6758  *
6759  *  Please note that several assumptions that are valid for ASCII-only case
6760  *  conversions do not hold for more general case conversions. For example,
6761  *  the length of the result may not be the same as the length of the input
6762  *  (neither in characters nor in bytes), some roundtrip assumptions
6763  *  (e.g. str.downcase == str.upcase.downcase) may not apply, and Unicode
6764  *  normalization (i.e. String#unicode_normalize) is not necessarily maintained
6765  *  by case mapping operations.
6766  *
6767  *  Non-ASCII case mapping/folding is currently supported for UTF-8,
6768  *  UTF-16BE/LE, UTF-32BE/LE, and ISO-8859-1~16 Strings/Symbols.
6769  *  This support will be extended to other encodings.
6770  *
6771  *     "hEllO".downcase   #=> "hello"
6772  */
6773 
6774 static VALUE
rb_str_downcase(int argc,VALUE * argv,VALUE str)6775 rb_str_downcase(int argc, VALUE *argv, VALUE str)
6776 {
6777     str = rb_str_dup(str);
6778     rb_str_downcase_bang(argc, argv, str);
6779     return str;
6780 }
6781 
6782 
6783 /*
6784  *  call-seq:
6785  *     str.capitalize!              -> str or nil
6786  *     str.capitalize!([options])   -> str or nil
6787  *
6788  *  Modifies <i>str</i> by converting the first character to uppercase and the
6789  *  remainder to lowercase. Returns <code>nil</code> if no changes are made.
6790  *  There is an exception for modern Georgian (mkhedruli/MTAVRULI), where
6791  *  the result is the same as for String#downcase, to avoid mixed case.
6792  *
6793  *  See String#downcase for meaning of +options+ and use with different encodings.
6794  *
6795  *     a = "hello"
6796  *     a.capitalize!   #=> "Hello"
6797  *     a               #=> "Hello"
6798  *     a.capitalize!   #=> nil
6799  */
6800 
6801 static VALUE
rb_str_capitalize_bang(int argc,VALUE * argv,VALUE str)6802 rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
6803 {
6804     rb_encoding *enc;
6805     OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
6806 
6807     flags = check_case_options(argc, argv, flags);
6808     str_modify_keep_cr(str);
6809     enc = STR_ENC_GET(str);
6810     rb_str_check_dummy_enc(enc);
6811     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
6812     if (flags&ONIGENC_CASE_ASCII_ONLY)
6813         rb_str_ascii_casemap(str, &flags, enc);
6814     else
6815 	str_shared_replace(str, rb_str_casemap(str, &flags, enc));
6816 
6817     if (ONIGENC_CASE_MODIFIED&flags) return str;
6818     return Qnil;
6819 }
6820 
6821 
6822 /*
6823  *  call-seq:
6824  *     str.capitalize              -> new_str
6825  *     str.capitalize([options])   -> new_str
6826  *
6827  *  Returns a copy of <i>str</i> with the first character converted to uppercase
6828  *  and the remainder to lowercase.
6829  *
6830  *  See String#downcase for meaning of +options+ and use with different encodings.
6831  *
6832  *     "hello".capitalize    #=> "Hello"
6833  *     "HELLO".capitalize    #=> "Hello"
6834  *     "123ABC".capitalize   #=> "123abc"
6835  */
6836 
6837 static VALUE
rb_str_capitalize(int argc,VALUE * argv,VALUE str)6838 rb_str_capitalize(int argc, VALUE *argv, VALUE str)
6839 {
6840     str = rb_str_dup(str);
6841     rb_str_capitalize_bang(argc, argv, str);
6842     return str;
6843 }
6844 
6845 
6846 /*
6847  *  call-seq:
6848  *     str.swapcase!              -> str or nil
6849  *     str.swapcase!([options])   -> str or nil
6850  *
6851  *  Equivalent to <code>String#swapcase</code>, but modifies the receiver in
6852  *  place, returning <i>str</i>, or <code>nil</code> if no changes were made.
6853  *
6854  *  See String#downcase for meaning of +options+ and use with different encodings.
6855  */
6856 
6857 static VALUE
rb_str_swapcase_bang(int argc,VALUE * argv,VALUE str)6858 rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
6859 {
6860     rb_encoding *enc;
6861     OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
6862 
6863     flags = check_case_options(argc, argv, flags);
6864     str_modify_keep_cr(str);
6865     enc = STR_ENC_GET(str);
6866     rb_str_check_dummy_enc(enc);
6867     if (flags&ONIGENC_CASE_ASCII_ONLY)
6868         rb_str_ascii_casemap(str, &flags, enc);
6869     else
6870 	str_shared_replace(str, rb_str_casemap(str, &flags, enc));
6871 
6872     if (ONIGENC_CASE_MODIFIED&flags) return str;
6873     return Qnil;
6874 }
6875 
6876 
6877 /*
6878  *  call-seq:
6879  *     str.swapcase              -> new_str
6880  *     str.swapcase([options])   -> new_str
6881  *
6882  *  Returns a copy of <i>str</i> with uppercase alphabetic characters converted
6883  *  to lowercase and lowercase characters converted to uppercase.
6884  *
6885  *  See String#downcase for meaning of +options+ and use with different encodings.
6886  *
6887  *     "Hello".swapcase          #=> "hELLO"
6888  *     "cYbEr_PuNk11".swapcase   #=> "CyBeR_pUnK11"
6889  */
6890 
6891 static VALUE
rb_str_swapcase(int argc,VALUE * argv,VALUE str)6892 rb_str_swapcase(int argc, VALUE *argv, VALUE str)
6893 {
6894     str = rb_str_dup(str);
6895     rb_str_swapcase_bang(argc, argv, str);
6896     return str;
6897 }
6898 
typedef unsigned char *USTR;

/* Iteration state for trnext(), which walks a tr-style character
 * specification one code point at a time. */
struct tr {
    int gen;			/* nonzero while trnext() is stepping through a range */
    unsigned int now, max;	/* now: code point produced last; max: end of the current range */
    char *p, *pend;		/* unread part of the specification is [p, pend) */
};
6906 
/*
 * Produce the next code point described by the specification in t, or
 * (unsigned int)-1 once the specification is exhausted.
 *
 * Outside a range (t->gen == 0) one item is read from t->p: a backslash that
 * is followed by something is skipped so that the next character is taken
 * as is; a following '-' that is itself followed by a character starts a
 * range now..max.  A range whose start is greater than its end raises
 * ArgumentError.
 *
 * Inside a range (t->gen != 0) t->now is incremented up to t->max, skipping
 * values for which ONIGENC_CODE_TO_MBCLEN reports a non-positive length
 * (presumably code points enc cannot represent -- confirm).
 */
static unsigned int
trnext(struct tr *t, rb_encoding *enc)
{
    int n;

    for (;;) {
	if (!t->gen) {
nextpart:
	    if (t->p == t->pend) return -1;
	    if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
		t->p += n;
	    }
	    t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
	    t->p += n;
	    if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
		t->p += n;
		if (t->p < t->pend) {
		    unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
		    t->p += n;
		    if (t->now > c) {
			/* only show the offending characters when both are ASCII */
			if (t->now < 0x80 && c < 0x80) {
			    rb_raise(rb_eArgError,
				     "invalid range \"%c-%c\" in string transliteration",
				     t->now, c);
			}
			else {
			    rb_raise(rb_eArgError, "invalid range in string transliteration");
			}
			continue; /* not reached */
		    }
		    t->gen = 1;
		    t->max = c;
		}
	    }
	    return t->now;
	}
	else {
	    while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
		/* range ran out while skipping: go read the next item */
		if (t->now == t->max) {
		    t->gen = 0;
		    goto nextpart;
		}
	    }
	    if (t->now < t->max) {
		return t->now;
	    }
	    else {
		/* reached the end of the range: leave range mode */
		t->gen = 0;
		return t->max;
	    }
	}
    }
}
6960 
6961 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
6962 
6963 static VALUE
tr_trans(VALUE str,VALUE src,VALUE repl,int sflag)6964 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
6965 {
6966     const unsigned int errc = -1;
6967     unsigned int trans[256];
6968     rb_encoding *enc, *e1, *e2;
6969     struct tr trsrc, trrepl;
6970     int cflag = 0;
6971     unsigned int c, c0, last = 0;
6972     int modify = 0, i, l;
6973     unsigned char *s, *send;
6974     VALUE hash = 0;
6975     int singlebyte = single_byte_optimizable(str);
6976     int termlen;
6977     int cr;
6978 
6979 #define CHECK_IF_ASCII(c) \
6980     (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
6981 	   (cr = ENC_CODERANGE_VALID) : 0)
6982 
6983     StringValue(src);
6984     StringValue(repl);
6985     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
6986     if (RSTRING_LEN(repl) == 0) {
6987 	return rb_str_delete_bang(1, &src, str);
6988     }
6989 
6990     cr = ENC_CODERANGE(str);
6991     e1 = rb_enc_check(str, src);
6992     e2 = rb_enc_check(str, repl);
6993     if (e1 == e2) {
6994 	enc = e1;
6995     }
6996     else {
6997 	enc = rb_enc_check(src, repl);
6998     }
6999     trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
7000     if (RSTRING_LEN(src) > 1 &&
7001 	rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
7002 	trsrc.p + l < trsrc.pend) {
7003 	cflag = 1;
7004 	trsrc.p += l;
7005     }
7006     trrepl.p = RSTRING_PTR(repl);
7007     trrepl.pend = trrepl.p + RSTRING_LEN(repl);
7008     trsrc.gen = trrepl.gen = 0;
7009     trsrc.now = trrepl.now = 0;
7010     trsrc.max = trrepl.max = 0;
7011 
7012     if (cflag) {
7013 	for (i=0; i<256; i++) {
7014 	    trans[i] = 1;
7015 	}
7016 	while ((c = trnext(&trsrc, enc)) != errc) {
7017 	    if (c < 256) {
7018 		trans[c] = errc;
7019 	    }
7020 	    else {
7021 		if (!hash) hash = rb_hash_new();
7022 		rb_hash_aset(hash, UINT2NUM(c), Qtrue);
7023 	    }
7024 	}
7025 	while ((c = trnext(&trrepl, enc)) != errc)
7026 	    /* retrieve last replacer */;
7027 	last = trrepl.now;
7028 	for (i=0; i<256; i++) {
7029 	    if (trans[i] != errc) {
7030 		trans[i] = last;
7031 	    }
7032 	}
7033     }
7034     else {
7035 	unsigned int r;
7036 
7037 	for (i=0; i<256; i++) {
7038 	    trans[i] = errc;
7039 	}
7040 	while ((c = trnext(&trsrc, enc)) != errc) {
7041 	    r = trnext(&trrepl, enc);
7042 	    if (r == errc) r = trrepl.now;
7043 	    if (c < 256) {
7044 		trans[c] = r;
7045 		if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
7046 	    }
7047 	    else {
7048 		if (!hash) hash = rb_hash_new();
7049 		rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
7050 	    }
7051 	}
7052     }
7053 
7054     if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
7055 	cr = ENC_CODERANGE_7BIT;
7056     str_modify_keep_cr(str);
7057     s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
7058     termlen = rb_enc_mbminlen(enc);
7059     if (sflag) {
7060 	int clen, tlen;
7061 	long offset, max = RSTRING_LEN(str);
7062 	unsigned int save = -1;
7063         unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7064 
7065 	while (s < send) {
7066 	    int may_modify = 0;
7067 
7068             c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7069 	    tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7070 
7071 	    s += clen;
7072 	    if (c < 256) {
7073 		c = trans[c];
7074 	    }
7075 	    else if (hash) {
7076 		VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
7077 		if (NIL_P(tmp)) {
7078 		    if (cflag) c = last;
7079 		    else c = errc;
7080 		}
7081 		else if (cflag) c = errc;
7082 		else c = NUM2INT(tmp);
7083 	    }
7084 	    else {
7085 		c = errc;
7086 	    }
7087 	    if (c != (unsigned int)-1) {
7088 		if (save == c) {
7089 		    CHECK_IF_ASCII(c);
7090 		    continue;
7091 		}
7092 		save = c;
7093 		tlen = rb_enc_codelen(c, enc);
7094 		modify = 1;
7095 	    }
7096 	    else {
7097 		save = -1;
7098 		c = c0;
7099 		if (enc != e1) may_modify = 1;
7100 	    }
7101 	    if ((offset = t - buf) + tlen > max) {
7102 		size_t MAYBE_UNUSED(old) = max + termlen;
7103 		max = offset + tlen + (send - s);
7104                 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
7105 		t = buf + offset;
7106 	    }
7107 	    rb_enc_mbcput(c, t, enc);
7108 	    if (may_modify && memcmp(s, t, tlen) != 0) {
7109 		modify = 1;
7110 	    }
7111 	    CHECK_IF_ASCII(c);
7112 	    t += tlen;
7113 	}
7114 	if (!STR_EMBED_P(str)) {
7115 	    ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
7116 	}
7117         TERM_FILL((char *)t, termlen);
7118         RSTRING(str)->as.heap.ptr = (char *)buf;
7119 	RSTRING(str)->as.heap.len = t - buf;
7120 	STR_SET_NOEMBED(str);
7121 	RSTRING(str)->as.heap.aux.capa = max;
7122     }
7123     else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
7124 	while (s < send) {
7125 	    c = (unsigned char)*s;
7126 	    if (trans[c] != errc) {
7127 		if (!cflag) {
7128 		    c = trans[c];
7129 		    *s = c;
7130 		    modify = 1;
7131 		}
7132 		else {
7133 		    *s = last;
7134 		    modify = 1;
7135 		}
7136 	    }
7137 	    CHECK_IF_ASCII(c);
7138 	    s++;
7139 	}
7140     }
7141     else {
7142 	int clen, tlen;
7143 	long offset, max = (long)((send - s) * 1.2);
7144         unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7145 
7146 	while (s < send) {
7147 	    int may_modify = 0;
7148             c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7149 	    tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7150 
7151 	    if (c < 256) {
7152 		c = trans[c];
7153 	    }
7154 	    else if (hash) {
7155 		VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
7156 		if (NIL_P(tmp)) {
7157 		    if (cflag) c = last;
7158 		    else c = errc;
7159 		}
7160 		else if (cflag) c = errc;
7161 		else c = NUM2INT(tmp);
7162 	    }
7163 	    else {
7164 		c = cflag ? last : errc;
7165 	    }
7166 	    if (c != errc) {
7167 		tlen = rb_enc_codelen(c, enc);
7168 		modify = 1;
7169 	    }
7170 	    else {
7171 		c = c0;
7172 		if (enc != e1) may_modify = 1;
7173 	    }
7174 	    if ((offset = t - buf) + tlen > max) {
7175 		size_t MAYBE_UNUSED(old) = max + termlen;
7176 		max = offset + tlen + (long)((send - s) * 1.2);
7177                 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
7178 		t = buf + offset;
7179 	    }
7180 	    if (s != t) {
7181 		rb_enc_mbcput(c, t, enc);
7182 		if (may_modify && memcmp(s, t, tlen) != 0) {
7183 		    modify = 1;
7184 		}
7185 	    }
7186 	    CHECK_IF_ASCII(c);
7187 	    s += clen;
7188 	    t += tlen;
7189 	}
7190 	if (!STR_EMBED_P(str)) {
7191 	    ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
7192 	}
7193         TERM_FILL((char *)t, termlen);
7194         RSTRING(str)->as.heap.ptr = (char *)buf;
7195 	RSTRING(str)->as.heap.len = t - buf;
7196 	STR_SET_NOEMBED(str);
7197 	RSTRING(str)->as.heap.aux.capa = max;
7198     }
7199 
7200     if (modify) {
7201 	if (cr != ENC_CODERANGE_BROKEN)
7202 	    ENC_CODERANGE_SET(str, cr);
7203 	rb_enc_associate(str, enc);
7204 	return str;
7205     }
7206     return Qnil;
7207 }
7208 
7209 
7210 /*
7211  *  call-seq:
7212  *     str.tr!(from_str, to_str)   -> str or nil
7213  *
7214  *  Translates <i>str</i> in place, using the same rules as
7215  *  <code>String#tr</code>. Returns <i>str</i>, or <code>nil</code> if no
7216  *  changes were made.
7217  */
7218 
7219 static VALUE
rb_str_tr_bang(VALUE str,VALUE src,VALUE repl)7220 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
7221 {
7222     return tr_trans(str, src, repl, 0);
7223 }
7224 
7225 
7226 /*
7227  *  call-seq:
 *     str.tr(from_str, to_str)   -> new_str
7229  *
7230  *  Returns a copy of +str+ with the characters in +from_str+ replaced by the
7231  *  corresponding characters in +to_str+.  If +to_str+ is shorter than
7232  *  +from_str+, it is padded with its last character in order to maintain the
7233  *  correspondence.
7234  *
7235  *     "hello".tr('el', 'ip')      #=> "hippo"
7236  *     "hello".tr('aeiou', '*')    #=> "h*ll*"
7237  *     "hello".tr('aeiou', 'AA*')  #=> "hAll*"
7238  *
7239  *  Both strings may use the <code>c1-c2</code> notation to denote ranges of
7240  *  characters, and +from_str+ may start with a <code>^</code>, which denotes
7241  *  all characters except those listed.
7242  *
7243  *     "hello".tr('a-y', 'b-z')    #=> "ifmmp"
7244  *     "hello".tr('^aeiou', '*')   #=> "*e**o"
7245  *
7246  *  The backslash character <code>\\</code> can be used to escape
7247  *  <code>^</code> or <code>-</code> and is otherwise ignored unless it
7248  *  appears at the end of a range or the end of the +from_str+ or +to_str+:
7249  *
7250  *     "hello^world".tr("\\^aeiou", "*") #=> "h*ll**w*rld"
7251  *     "hello-world".tr("a\\-eo", "*")   #=> "h*ll**w*rld"
7252  *
7253  *     "hello\r\nworld".tr("\r", "")   #=> "hello\nworld"
7254  *     "hello\r\nworld".tr("\\r", "")  #=> "hello\r\nwold"
7255  *     "hello\r\nworld".tr("\\\r", "") #=> "hello\nworld"
7256  *
7257  *     "X['\\b']".tr("X\\", "")   #=> "['b']"
7258  *     "X['\\b']".tr("X-\\]", "") #=> "'b'"
7259  */
7260 
7261 static VALUE
rb_str_tr(VALUE str,VALUE src,VALUE repl)7262 rb_str_tr(VALUE str, VALUE src, VALUE repl)
7263 {
7264     str = rb_str_dup(str);
7265     tr_trans(str, src, repl, 0);
7266     return str;
7267 }
7268 
/* 256 per-code-point flags plus one extra slot (index 256) */
#define TR_TABLE_SIZE 257
/*
 * Folds one character specification (as accepted by trnext) into the
 * lookup tables shared by delete/squeeze/count.
 *
 *   str     - the specification; a leading '^' (when the string is longer
 *             than one byte) negates it.
 *   stable  - flags for code points 0..255; stable[256] stays non-zero only
 *             while every specification seen so far was negated.
 *   first   - non-zero for the first specification, which initialises stable.
 *   tablep  - in/out hash for code points >= 256 from non-negated specs.
 *   ctablep - in/out hash for code points >= 256 from negated specs.
 *   enc     - encoding used to decode str.
 *
 * Successive calls intersect: a flag in stable[0..255] survives only if
 * this specification selects that code point as well.
 */
static void
tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
	       VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
{
    const unsigned int errc = -1;
    char buf[256];
    struct tr tr;
    unsigned int c;
    VALUE table = 0, ptable = 0;
    int i, l, cflag = 0;

    tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
    tr.gen = tr.now = tr.max = 0;

    if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
	cflag = 1;
	tr.p += l;
    }
    if (first) {
	/* start with "everything selected"; narrowed by the AND below */
	for (i=0; i<256; i++) {
	    stable[i] = 1;
	}
	stable[256] = cflag;
    }
    else if (stable[256] && !cflag) {
	/* a non-negated specification clears the "all negated" marker */
	stable[256] = 0;
    }
    /* buf[] is this specification's own selection for code points < 256:
     * starts as all-selected when negated, none-selected otherwise */
    for (i=0; i<256; i++) {
	buf[i] = cflag;
    }

    while ((c = trnext(&tr, enc)) != errc) {
	if (c < 256) {
	    buf[c & 0xff] = !cflag;
	}
	else {
	    VALUE key = UINT2NUM(c);

	    /* pick (once per call) the hash this specification writes to;
	     * ptable remembers the hash that was stored before */
	    if (!table && (first || *tablep || stable[256])) {
		if (cflag) {
		    /* negated: keep adding to the existing complement hash */
		    ptable = *ctablep;
		    table = ptable ? ptable : rb_hash_new();
		    *ctablep = table;
		}
		else {
		    /* non-negated: replace *tablep with a fresh hash */
		    table = rb_hash_new();
		    ptable = *tablep;
		    *tablep = table;
		}
	    }
	    /* record the key when there was no previous hash, or depending on
	     * its presence there: present for a non-negated spec, absent for
	     * a negated one */
	    if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
		rb_hash_aset(table, key, Qtrue);
	    }
	}
    }
    /* intersect this specification with what was accumulated so far */
    for (i=0; i<256; i++) {
	stable[i] = stable[i] && buf[i];
    }
    /* a non-negated specification that stored nothing for code points
     * >= 256 resets *tablep */
    if (!table && !cflag) {
	*tablep = 0;
    }
}
7332 
7333 
7334 static int
tr_find(unsigned int c,const char table[TR_TABLE_SIZE],VALUE del,VALUE nodel)7335 tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
7336 {
7337     if (c < 256) {
7338 	return table[c] != 0;
7339     }
7340     else {
7341 	VALUE v = UINT2NUM(c);
7342 
7343 	if (del) {
7344 	    if (!NIL_P(rb_hash_lookup(del, v)) &&
7345 		    (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
7346 		return TRUE;
7347 	    }
7348 	}
7349 	else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
7350 	    return FALSE;
7351 	}
7352 	return table[256] ? TRUE : FALSE;
7353     }
7354 }
7355 
7356 /*
7357  *  call-seq:
7358  *     str.delete!([other_str]+)   -> str or nil
7359  *
7360  *  Performs a <code>delete</code> operation in place, returning <i>str</i>, or
7361  *  <code>nil</code> if <i>str</i> was not modified.
7362  */
7363 
7364 static VALUE
rb_str_delete_bang(int argc,VALUE * argv,VALUE str)7365 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
7366 {
7367     char squeez[TR_TABLE_SIZE];
7368     rb_encoding *enc = 0;
7369     char *s, *send, *t;
7370     VALUE del = 0, nodel = 0;
7371     int modify = 0;
7372     int i, ascompat, cr;
7373 
7374     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7375     rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
7376     for (i=0; i<argc; i++) {
7377 	VALUE s = argv[i];
7378 
7379 	StringValue(s);
7380 	enc = rb_enc_check(str, s);
7381 	tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
7382     }
7383 
7384     str_modify_keep_cr(str);
7385     ascompat = rb_enc_asciicompat(enc);
7386     s = t = RSTRING_PTR(str);
7387     send = RSTRING_END(str);
7388     cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
7389     while (s < send) {
7390 	unsigned int c;
7391 	int clen;
7392 
7393 	if (ascompat && (c = *(unsigned char*)s) < 0x80) {
7394 	    if (squeez[c]) {
7395 		modify = 1;
7396 	    }
7397 	    else {
7398 		if (t != s) *t = c;
7399 		t++;
7400 	    }
7401 	    s++;
7402 	}
7403 	else {
7404 	    c = rb_enc_codepoint_len(s, send, &clen, enc);
7405 
7406 	    if (tr_find(c, squeez, del, nodel)) {
7407 		modify = 1;
7408 	    }
7409 	    else {
7410 		if (t != s) rb_enc_mbcput(c, t, enc);
7411 		t += clen;
7412 		if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
7413 	    }
7414 	    s += clen;
7415 	}
7416     }
7417     TERM_FILL(t, TERM_LEN(str));
7418     STR_SET_LEN(str, t - RSTRING_PTR(str));
7419     ENC_CODERANGE_SET(str, cr);
7420 
7421     if (modify) return str;
7422     return Qnil;
7423 }
7424 
7425 
7426 /*
7427  *  call-seq:
7428  *     str.delete([other_str]+)   -> new_str
7429  *
7430  *  Returns a copy of <i>str</i> with all characters in the intersection of its
7431  *  arguments deleted. Uses the same rules for building the set of characters as
7432  *  <code>String#count</code>.
7433  *
7434  *     "hello".delete "l","lo"        #=> "heo"
7435  *     "hello".delete "lo"            #=> "he"
7436  *     "hello".delete "aeiou", "^e"   #=> "hell"
7437  *     "hello".delete "ej-m"          #=> "ho"
7438  */
7439 
7440 static VALUE
rb_str_delete(int argc,VALUE * argv,VALUE str)7441 rb_str_delete(int argc, VALUE *argv, VALUE str)
7442 {
7443     str = rb_str_dup(str);
7444     rb_str_delete_bang(argc, argv, str);
7445     return str;
7446 }
7447 
7448 
7449 /*
7450  *  call-seq:
7451  *     str.squeeze!([other_str]*)   -> str or nil
7452  *
7453  *  Squeezes <i>str</i> in place, returning either <i>str</i>, or
7454  *  <code>nil</code> if no changes were made.
7455  */
7456 
7457 static VALUE
rb_str_squeeze_bang(int argc,VALUE * argv,VALUE str)7458 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
7459 {
7460     char squeez[TR_TABLE_SIZE];
7461     rb_encoding *enc = 0;
7462     VALUE del = 0, nodel = 0;
7463     unsigned char *s, *send, *t;
7464     int i, modify = 0;
7465     int ascompat, singlebyte = single_byte_optimizable(str);
7466     unsigned int save;
7467 
7468     if (argc == 0) {
7469 	enc = STR_ENC_GET(str);
7470     }
7471     else {
7472 	for (i=0; i<argc; i++) {
7473 	    VALUE s = argv[i];
7474 
7475 	    StringValue(s);
7476 	    enc = rb_enc_check(str, s);
7477 	    if (singlebyte && !single_byte_optimizable(s))
7478 		singlebyte = 0;
7479 	    tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
7480 	}
7481     }
7482 
7483     str_modify_keep_cr(str);
7484     s = t = (unsigned char *)RSTRING_PTR(str);
7485     if (!s || RSTRING_LEN(str) == 0) return Qnil;
7486     send = (unsigned char *)RSTRING_END(str);
7487     save = -1;
7488     ascompat = rb_enc_asciicompat(enc);
7489 
7490     if (singlebyte) {
7491         while (s < send) {
7492             unsigned int c = *s++;
7493 	    if (c != save || (argc > 0 && !squeez[c])) {
7494 	        *t++ = save = c;
7495 	    }
7496 	}
7497     }
7498     else {
7499 	while (s < send) {
7500 	    unsigned int c;
7501 	    int clen;
7502 
7503             if (ascompat && (c = *s) < 0x80) {
7504 		if (c != save || (argc > 0 && !squeez[c])) {
7505 		    *t++ = save = c;
7506 		}
7507 		s++;
7508 	    }
7509 	    else {
7510                 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
7511 
7512 		if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
7513 		    if (t != s) rb_enc_mbcput(c, t, enc);
7514 		    save = c;
7515 		    t += clen;
7516 		}
7517 		s += clen;
7518 	    }
7519 	}
7520     }
7521 
7522     TERM_FILL((char *)t, TERM_LEN(str));
7523     if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
7524         STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
7525 	modify = 1;
7526     }
7527 
7528     if (modify) return str;
7529     return Qnil;
7530 }
7531 
7532 
7533 /*
7534  *  call-seq:
7535  *     str.squeeze([other_str]*)    -> new_str
7536  *
7537  *  Builds a set of characters from the <i>other_str</i> parameter(s) using the
7538  *  procedure described for <code>String#count</code>. Returns a new string
7539  *  where runs of the same character that occur in this set are replaced by a
7540  *  single character. If no arguments are given, all runs of identical
7541  *  characters are replaced by a single character.
7542  *
7543  *     "yellow moon".squeeze                  #=> "yelow mon"
7544  *     "  now   is  the".squeeze(" ")         #=> " now is the"
7545  *     "putters shoot balls".squeeze("m-z")   #=> "puters shot balls"
7546  */
7547 
7548 static VALUE
rb_str_squeeze(int argc,VALUE * argv,VALUE str)7549 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
7550 {
7551     str = rb_str_dup(str);
7552     rb_str_squeeze_bang(argc, argv, str);
7553     return str;
7554 }
7555 
7556 
7557 /*
7558  *  call-seq:
7559  *     str.tr_s!(from_str, to_str)   -> str or nil
7560  *
7561  *  Performs <code>String#tr_s</code> processing on <i>str</i> in place,
7562  *  returning <i>str</i>, or <code>nil</code> if no changes were made.
7563  */
7564 
7565 static VALUE
rb_str_tr_s_bang(VALUE str,VALUE src,VALUE repl)7566 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
7567 {
7568     return tr_trans(str, src, repl, 1);
7569 }
7570 
7571 
7572 /*
7573  *  call-seq:
7574  *     str.tr_s(from_str, to_str)   -> new_str
7575  *
7576  *  Processes a copy of <i>str</i> as described under <code>String#tr</code>,
7577  *  then removes duplicate characters in regions that were affected by the
7578  *  translation.
7579  *
7580  *     "hello".tr_s('l', 'r')     #=> "hero"
7581  *     "hello".tr_s('el', '*')    #=> "h*o"
7582  *     "hello".tr_s('el', 'hx')   #=> "hhxo"
7583  */
7584 
7585 static VALUE
rb_str_tr_s(VALUE str,VALUE src,VALUE repl)7586 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
7587 {
7588     str = rb_str_dup(str);
7589     tr_trans(str, src, repl, 1);
7590     return str;
7591 }
7592 
7593 
7594 /*
7595  *  call-seq:
7596  *     str.count([other_str]+)   -> integer
7597  *
7598  *  Each +other_str+ parameter defines a set of characters to count.  The
7599  *  intersection of these sets defines the characters to count in +str+.  Any
7600  *  +other_str+ that starts with a caret <code>^</code> is negated.  The
7601  *  sequence <code>c1-c2</code> means all characters between c1 and c2.  The
7602  *  backslash character <code>\\</code> can be used to escape <code>^</code> or
7603  *  <code>-</code> and is otherwise ignored unless it appears at the end of a
 *  sequence or the end of an +other_str+.
7605  *
7606  *     a = "hello world"
7607  *     a.count "lo"                   #=> 5
7608  *     a.count "lo", "o"              #=> 2
7609  *     a.count "hello", "^l"          #=> 4
7610  *     a.count "ej-m"                 #=> 4
7611  *
7612  *     "hello^world".count "\\^aeiou" #=> 4
7613  *     "hello-world".count "a\\-eo"   #=> 4
7614  *
7615  *     c = "hello world\\r\\n"
7616  *     c.count "\\"                   #=> 2
7617  *     c.count "\\A"                  #=> 0
7618  *     c.count "X-\\w"                #=> 3
7619  */
7620 
7621 static VALUE
rb_str_count(int argc,VALUE * argv,VALUE str)7622 rb_str_count(int argc, VALUE *argv, VALUE str)
7623 {
7624     char table[TR_TABLE_SIZE];
7625     rb_encoding *enc = 0;
7626     VALUE del = 0, nodel = 0, tstr;
7627     char *s, *send;
7628     int i;
7629     int ascompat;
7630 
7631     rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
7632 
7633     tstr = argv[0];
7634     StringValue(tstr);
7635     enc = rb_enc_check(str, tstr);
7636     if (argc == 1) {
7637 	const char *ptstr;
7638 	if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
7639 	    (ptstr = RSTRING_PTR(tstr),
7640 	     ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
7641 	    !is_broken_string(str)) {
7642 	    int n = 0;
7643 	    int clen;
7644 	    unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
7645 
7646 	    s = RSTRING_PTR(str);
7647 	    if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
7648 	    send = RSTRING_END(str);
7649 	    while (s < send) {
7650 		if (*(unsigned char*)s++ == c) n++;
7651 	    }
7652 	    return INT2NUM(n);
7653 	}
7654     }
7655 
7656     tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
7657     for (i=1; i<argc; i++) {
7658 	tstr = argv[i];
7659 	StringValue(tstr);
7660 	enc = rb_enc_check(str, tstr);
7661 	tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
7662     }
7663 
7664     s = RSTRING_PTR(str);
7665     if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
7666     send = RSTRING_END(str);
7667     ascompat = rb_enc_asciicompat(enc);
7668     i = 0;
7669     while (s < send) {
7670 	unsigned int c;
7671 
7672 	if (ascompat && (c = *(unsigned char*)s) < 0x80) {
7673 	    if (table[c]) {
7674 		i++;
7675 	    }
7676 	    s++;
7677 	}
7678 	else {
7679 	    int clen;
7680 	    c = rb_enc_codepoint_len(s, send, &clen, enc);
7681 	    if (tr_find(c, table, del, nodel)) {
7682 		i++;
7683 	    }
7684 	    s += clen;
7685 	}
7686     }
7687 
7688     return INT2NUM(i);
7689 }
7690 
7691 static VALUE
rb_fs_check(VALUE val)7692 rb_fs_check(VALUE val)
7693 {
7694     if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
7695 	val = rb_check_string_type(val);
7696 	if (NIL_P(val)) return 0;
7697     }
7698     return val;
7699 }
7700 
/*
 * Byte-indexed whitespace table.  The only non-zero entries are
 * 0x09..0x0d (TAB, LF, VT, FF, CR) and 0x20 (space).
 */
static const char isspacetable[256] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xa0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xb0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xd0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xe0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xf0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

/* table lookup; the cast keeps the index in 0..255 for any char value */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
7721 
7722 static long
split_string(VALUE result,VALUE str,long beg,long len,long empty_count)7723 split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
7724 {
7725     if (empty_count >= 0 && len == 0) {
7726 	return empty_count + 1;
7727     }
7728     if (empty_count > 0) {
7729 	/* make different substrings */
7730 	if (result) {
7731 	    do {
7732 		rb_ary_push(result, str_new_empty(str));
7733 	    } while (--empty_count > 0);
7734 	}
7735 	else {
7736 	    do {
7737 		rb_yield(str_new_empty(str));
7738 	    } while (--empty_count > 0);
7739 	}
7740     }
7741     str = rb_str_subseq(str, beg, len);
7742     if (result) {
7743 	rb_ary_push(result, str);
7744     }
7745     else {
7746 	rb_yield(str);
7747     }
7748     return empty_count;
7749 }
7750 
7751 /*
7752  *  call-seq:
7753  *     str.split(pattern=nil, [limit])                -> an_array
7754  *     str.split(pattern=nil, [limit]) {|sub| block } -> str
7755  *
7756  *  Divides <i>str</i> into substrings based on a delimiter, returning an array
7757  *  of these substrings.
7758  *
7759  *  If <i>pattern</i> is a <code>String</code>, then its contents are used as
7760  *  the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
7761  *  space, <i>str</i> is split on whitespace, with leading and trailing
7762  *  whitespace and runs of contiguous whitespace characters ignored.
7763  *
7764  *  If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
7765  *  pattern matches. Whenever the pattern matches a zero-length string,
7766  *  <i>str</i> is split into individual characters. If <i>pattern</i> contains
7767  *  groups, the respective matches will be returned in the array as well.
7768  *
7769  *  If <i>pattern</i> is <code>nil</code>, the value of <code>$;</code> is used.
7770  *  If <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
7771  *  split on whitespace as if ' ' were specified.
7772  *
7773  *  If the <i>limit</i> parameter is omitted, trailing null fields are
7774  *  suppressed. If <i>limit</i> is a positive number, at most that number
7775  *  of split substrings will be returned (captured groups will be returned
7776  *  as well, but are not counted towards the limit).
7777  *  If <i>limit</i> is <code>1</code>, the entire
7778  *  string is returned as the only entry in an array. If negative, there is no
7779  *  limit to the number of fields returned, and trailing null fields are not
7780  *  suppressed.
7781  *
7782  *  When the input +str+ is empty an empty Array is returned as the string is
7783  *  considered to have no fields to split.
7784  *
7785  *     " now's  the time ".split       #=> ["now's", "the", "time"]
7786  *     " now's  the time ".split(' ')  #=> ["now's", "the", "time"]
7787  *     " now's  the time".split(/ /)   #=> ["", "now's", "", "the", "time"]
7788  *     "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
7789  *     "hello".split(//)               #=> ["h", "e", "l", "l", "o"]
7790  *     "hello".split(//, 3)            #=> ["h", "e", "llo"]
7791  *     "hi mom".split(%r{\s*})         #=> ["h", "i", "m", "o", "m"]
7792  *
7793  *     "mellow yellow".split("ello")   #=> ["m", "w y", "w"]
7794  *     "1,2,,3,4,,".split(',')         #=> ["1", "2", "", "3", "4"]
7795  *     "1,2,,3,4,,".split(',', 4)      #=> ["1", "2", "", "3,4,,"]
7796  *     "1,2,,3,4,,".split(',', -4)     #=> ["1", "2", "", "3", "4", "", ""]
7797  *
7798  *     "1:2:3".split(/(:)()()/, 2)     #=> ["1", ":", "", "", "2:3"]
7799  *
7800  *     "".split(',', -1)               #=> []
7801  *
7802  *  If a block is given, invoke the block with each split substring.
7803  *
7804  */
7805 
static VALUE
rb_str_split_m(int argc, VALUE *argv, VALUE str)
{
    /*
     * Implementation of String#split.  Takes up to two optional arguments
     * (pattern, limit).  Without a block the fields are collected into a
     * new array which is returned; with a block every field is yielded and
     * str itself is returned.
     */
    rb_encoding *enc;
    VALUE spat;
    VALUE limit;
    enum {awk, string, regexp} split_type;
    long beg, end, i = 0, empty_count = -1;
    int lim = 0;
    VALUE result, tmp;

    /* `result' doubles as the mode flag: it tests false in C when a block
     * is given (Qfalse) and true otherwise (relies on Qnil being non-zero) */
    result = rb_block_given_p() ? Qfalse : Qnil;
    if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
	lim = NUM2INT(limit);
	if (lim <= 0) limit = Qnil;
	else if (lim == 1) {
	    /* limit 1: the whole string is the only field */
	    if (RSTRING_LEN(str) == 0)
		return result ? rb_ary_new2(0) : str;
	    tmp = rb_str_dup(str);
	    if (!result) {
		rb_yield(tmp);
		return str;
	    }
	    return rb_ary_new3(1, tmp);
	}
	i = 1;
    }
    /* no limit given (or limit 0): hold back empty fields so that trailing
     * ones are dropped, see split_string() */
    if (NIL_P(limit) && !lim) empty_count = 0;

    enc = STR_ENC_GET(str);
    split_type = regexp;
    /* pick the pattern: explicit argument, else rb_fs, else awk-style
     * whitespace splitting */
    if (!NIL_P(spat)) {
	spat = get_pat_quoted(spat, 0);
    }
    else if (NIL_P(spat = rb_fs)) {
	split_type = awk;
    }
    else if (!(spat = rb_fs_check(spat))) {
	rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
    }
    if (split_type != awk) {
	if (BUILTIN_TYPE(spat) == T_STRING) {
	    rb_encoding *enc2 = STR_ENC_GET(spat);

	    mustnot_broken(spat);
	    split_type = string;
	    if (RSTRING_LEN(spat) == 0) {
		/* Special case - split into chars */
		spat = rb_reg_regcomp(spat);
		split_type = regexp;
	    }
	    else if (rb_enc_asciicompat(enc2) == 1) {
		/* a pattern of exactly one space selects awk mode */
		if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' ') {
		    split_type = awk;
		}
	    }
	    else {
		/* same test for encodings that are not ASCII compatible */
		int l;
		if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
		    RSTRING_LEN(spat) == l) {
		    split_type = awk;
		}
	    }
	}
    }

/* emit one field (byte offset, byte length) and keep empty_count updated */
#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))

    if (result) result = rb_ary_new();
    beg = 0;
    if (split_type == awk) {
	/* awk mode: fields are runs of non-whitespace; `skip' is set while
	 * scanning the whitespace in front of a field */
	char *ptr = RSTRING_PTR(str);
	char *eptr = RSTRING_END(str);
	char *bptr = ptr;
	int skip = 1;
	unsigned int c;

	end = beg;
	if (is_ascii_string(str)) {
	    /* byte-wise scan using the isspacetable lookup */
	    while (ptr < eptr) {
		c = (unsigned char)*ptr++;
		if (skip) {
		    if (ascii_isspace(c)) {
			beg = ptr - bptr;
		    }
		    else {
			end = ptr - bptr;
			skip = 0;
			/* limit reached: the rest of the string becomes the
			 * last field (emitted after the scan) */
			if (!NIL_P(limit) && lim <= i) break;
		    }
		}
		else if (ascii_isspace(c)) {
		    SPLIT_STR(beg, end-beg);
		    skip = 1;
		    beg = ptr - bptr;
		    if (!NIL_P(limit)) ++i;
		}
		else {
		    end = ptr - bptr;
		}
	    }
	}
	else {
	    /* same scan, decoding one character at a time */
	    while (ptr < eptr) {
		int n;

		c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
		ptr += n;
		if (skip) {
		    if (rb_isspace(c)) {
			beg = ptr - bptr;
		    }
		    else {
			end = ptr - bptr;
			skip = 0;
			if (!NIL_P(limit) && lim <= i) break;
		    }
		}
		else if (rb_isspace(c)) {
		    SPLIT_STR(beg, end-beg);
		    skip = 1;
		    beg = ptr - bptr;
		    if (!NIL_P(limit)) ++i;
		}
		else {
		    end = ptr - bptr;
		}
	    }
	}
    }
    else if (split_type == string) {
	/* literal separator: repeated substring search */
	char *ptr = RSTRING_PTR(str);
	char *str_start = ptr;
	char *substr_start = ptr;
	char *eptr = RSTRING_END(str);
	char *sptr = RSTRING_PTR(spat);
	long slen = RSTRING_LEN(spat);

	mustnot_broken(str);
	enc = rb_enc_check(str, spat);
	while (ptr < eptr &&
	       (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
	    /* Check we are at the start of a char */
	    char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
	    if (t != ptr + end) {
		/* hit inside a character: resume at the next char head */
		ptr = t;
		continue;
	    }
	    SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
	    ptr += end + slen;
	    substr_start = ptr;
	    if (!NIL_P(limit) && lim <= ++i) break;
	}
	beg = ptr - str_start;
    }
    else {
	/* regexp separator */
	char *ptr = RSTRING_PTR(str);
	long len = RSTRING_LEN(str);
	long start = beg;
	long idx;
	int last_null = 0;
	struct re_registers *regs;
        VALUE match = 0;

	/* the increment expression releases the match marked busy in the
	 * body and stores it back as the backref before the next search */
        for (; (end = rb_reg_search(spat, str, start, 0)) >= 0;
             (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
            match = rb_backref_get();
            /* in block mode the match is marked busy while fields are yielded */
            if (!result) rb_match_busy(match);
            regs = RMATCH_REGS(match);
	    if (start == end && BEG(0) == END(0)) {
		/* empty match at the current search position */
		if (!ptr) {
		    SPLIT_STR(0, 0);
		    break;
		}
		else if (last_null == 1) {
		    /* second empty match in a row: emit one character */
		    SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, ptr+len, enc));
		    beg = start;
		}
		else {
		    /* first empty match: retry one character further on */
                    if (start == len)
                        start++;
                    else
                        start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
		    last_null = 1;
		    continue;
		}
	    }
	    else {
		SPLIT_STR(beg, end-beg);
		beg = start = END(0);
	    }
	    last_null = 0;

	    /* captured groups are emitted as additional fields; groups that
	     * did not take part in the match are skipped */
	    for (idx=1; idx < regs->num_regs; idx++) {
		if (BEG(idx) == -1) continue;
		SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
	    }
	    if (!NIL_P(limit) && lim <= ++i) break;
	}
        if (match) rb_match_unbusy(match);
    }
    /* emit whatever follows the last separator */
    if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
	SPLIT_STR(beg, RSTRING_LEN(str)-beg);
    }

    return result ? result : str;
}
8013 
8014 VALUE
rb_str_split(VALUE str,const char * sep0)8015 rb_str_split(VALUE str, const char *sep0)
8016 {
8017     VALUE sep;
8018 
8019     StringValue(str);
8020     sep = rb_str_new_cstr(sep0);
8021     return rb_str_split_m(1, &sep, str);
8022 }
8023 
/*
 * Decides whether an array-returning enumeration method (named +method+,
 * used only in the warning text) should collect its elements into an array.
 *
 * Returns 1 when no block is given.  When a block is given, a warning is
 * emitted; the result is then 1 if STRING_ENUMERATORS_WANTARRAY is set
 * (the block is ignored) and 0 otherwise (the block receives the elements).
 */
static int
enumerator_wantarray(const char *method)
{
    if (!rb_block_given_p()) return 1;
#if STRING_ENUMERATORS_WANTARRAY
    rb_warn("given block not used");
    return 1;
#else
    rb_warning("passing a block to String#%s is deprecated", method);
    return 0;
#endif
}
8037 
/*
 * Evaluates to a new array with capacity +size+ when enumerator_wantarray(m)
 * is non-zero, and to 0 otherwise.  +size+ is only evaluated in the former
 * case.
 */
#define WANTARRAY(m, size) \
    (enumerator_wantarray(m) ? rb_ary_new_capa(size) : 0)
8040 
8041 static inline int
enumerator_element(VALUE ary,VALUE e)8042 enumerator_element(VALUE ary, VALUE e)
8043 {
8044     if (ary) {
8045 	rb_ary_push(ary, e);
8046 	return 0;
8047     }
8048     else {
8049 	rb_yield(e);
8050 	return 1;
8051     }
8052 }
8053 
/* Shorthand for enumerator_element(): non-zero result means +e+ was yielded. */
#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
8055 
8056 static const char *
chomp_newline(const char * p,const char * e,rb_encoding * enc)8057 chomp_newline(const char *p, const char *e, rb_encoding *enc)
8058 {
8059     const char *prev = rb_enc_prev_char(p, e, e, enc);
8060     if (rb_enc_is_newline(prev, e, enc)) {
8061 	e = prev;
8062 	prev = rb_enc_prev_char(p, e, e, enc);
8063 	if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
8064 	    e = prev;
8065     }
8066     return e;
8067 }
8068 
/*
 * Shared implementation behind rb_str_each_line and rb_str_lines.
 *
 * The arguments are parsed with rb_scan_args("01:"): an optional record
 * separator and an optional keyword hash in which only +chomp+ is looked up.
 * Without a separator argument rb_rs is used.
 *
 * Every line is delivered through ENUM_ELEM: appended to +ary+ when +ary+
 * is non-zero, yielded to the block otherwise.  The function returns +ary+
 * when it is non-zero and the original receiver otherwise.
 */
static VALUE
rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
{
    rb_encoding *enc;
    VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
    const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
    long pos, len, rslen;
    int rsnewline = 0;

    if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
	rs = rb_rs;
    if (!NIL_P(opts)) {
	static ID keywords[1];
	if (!keywords[0]) {
	    keywords[0] = rb_intern_const("chomp");
	}
	rb_get_kwargs(opts, keywords, 0, 1, &chomp);
	/* from here on chomp holds a C truth value (0 or 1), not a Ruby object */
	chomp = (chomp != Qundef && RTEST(chomp));
    }

    /* nil separator: the whole string is the one and only element */
    if (NIL_P(rs)) {
	if (!ENUM_ELEM(ary, str)) {
	    return ary;
	}
	else {
	    return orig;
	}
    }

    if (!RSTRING_LEN(str)) goto end;
    /* iterate over the string returned by rb_str_new_frozen; orig keeps the
     * receiver for the return value */
    str = rb_str_new_frozen(str);
    ptr = subptr = RSTRING_PTR(str);
    pend = RSTRING_END(str);
    len = RSTRING_LEN(str);
    StringValue(rs);
    rslen = RSTRING_LEN(rs);

    /* the default separator object skips the encoding compatibility check */
    if (rs == rb_default_rs)
	enc = rb_enc_get(str);
    else
	enc = rb_enc_check(str, rs);

    if (rslen == 0) {
	/* paragraph mode */
	int n;
	const char *eol = NULL;
	subend = subptr;
	while (subend < pend) {
	    /* Scan one paragraph.  subptr is the paragraph start (NULL while
	     * leading newlines are still being skipped); eol is the position
	     * just after the most recent newline seen inside the paragraph. */
	    do {
		/* n = byte length of a '\r' directly at subend, else 0 */
		if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
		    n = 0;
		rslen = n + rb_enc_mbclen(subend + n, pend, enc);
		if (rb_enc_is_newline(subend + n, pend, enc)) {
		    /* second newline in a row: paragraph ends here, rslen
		     * keeps the length of this second newline */
		    if (eol == subend) break;
		    subend += rslen;
		    if (subptr) eol = subend;
		}
		else {
		    if (!subptr) subptr = subend;
		    subend += rslen;
		}
		rslen = 0;
	    } while (subend < pend);
	    if (!subptr) break;
	    /* NOTE(review): with chomp set the length is subend - subptr, and
	     * subend has already been advanced past the first newline of the
	     * terminator, so that newline stays in the line -- confirm this
	     * is the intended chomp behaviour for paragraph mode. */
	    line = rb_str_subseq(str, subptr - ptr,
				 subend - subptr + (chomp ? 0 : rslen));
	    /* ENUM_ELEM is non-zero only after a yield; str_mod_check is then
	     * given the pointer and length captured before the loop
	     * (presumably to detect modification by the block -- verify) */
	    if (ENUM_ELEM(ary, line)) {
		str_mod_check(str, ptr, len);
	    }
	    subptr = eol = NULL;
	}
	goto end;
    }
    else {
	rsptr = RSTRING_PTR(rs);
	/* separator is exactly one minimum-width character and a newline */
	if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
	    rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
	    rsnewline = 1;
	}
    }

    /* default separator with a non-ASCII-compatible string: re-encode the
     * separator into the string's encoding before searching */
    if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
	rs = rb_str_new(rsptr, rslen);
	rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
	rsptr = RSTRING_PTR(rs);
	rslen = RSTRING_LEN(rs);
    }

    while (subptr < pend) {
	pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
	if (pos < 0) break;
	hit = subptr + pos;
	/* a hit that does not start on a character boundary is skipped:
	 * resume the search from the next character head */
	adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
	if (hit != adjusted) {
	    subptr = adjusted;
	    continue;
	}
	/* both hit and subend now point just past the separator */
	subend = hit += rslen;
	if (chomp) {
	    if (rsnewline) {
		subend = chomp_newline(subptr, subend, enc);
	    }
	    else {
		subend -= rslen;
	    }
	}
	line = rb_str_subseq(str, subptr - ptr, subend - subptr);
	if (ENUM_ELEM(ary, line)) {
	    str_mod_check(str, ptr, len);
	}
	subptr = hit;
    }

    /* remainder after the last separator, if any */
    if (subptr != pend) {
	if (chomp) {
	    if (rsnewline) {
		pend = chomp_newline(subptr, pend, enc);
	    }
	    else if (pend - subptr >= rslen &&
		     memcmp(pend - rslen, rsptr, rslen) == 0) {
		pend -= rslen;
	    }
	}
	line = rb_str_subseq(str, subptr - ptr, pend - subptr);
	ENUM_ELEM(ary, line);
	RB_GC_GUARD(str);
    }

  end:
    if (ary)
	return ary;
    else
	return orig;
}
8203 
8204 /*
8205  *  call-seq:
8206  *     str.each_line(separator=$/ [, getline_args]) {|substr| block } -> str
8207  *     str.each_line(separator=$/ [, getline_args])                   -> an_enumerator
8208  *
8209  *  Splits <i>str</i> using the supplied parameter as the record
8210  *  separator (<code>$/</code> by default), passing each substring in
8211  *  turn to the supplied block.  If a zero-length record separator is
8212  *  supplied, the string is split into paragraphs delimited by
8213  *  multiple successive newlines.
8214  *
8215  *  See IO.readlines for details about getline_args.
8216  *
8217  *  If no block is given, an enumerator is returned instead.
8218  *
8219  *     print "Example one\n"
8220  *     "hello\nworld".each_line {|s| p s}
8221  *     print "Example two\n"
8222  *     "hello\nworld".each_line('l') {|s| p s}
8223  *     print "Example three\n"
8224  *     "hello\n\n\nworld".each_line('') {|s| p s}
8225  *
8226  *  <em>produces:</em>
8227  *
8228  *     Example one
8229  *     "hello\n"
8230  *     "world"
8231  *     Example two
8232  *     "hel"
8233  *     "l"
8234  *     "o\nworl"
8235  *     "d"
8236  *     Example three
8237  *     "hello\n\n"
8238  *     "world"
8239  */
8240 
8241 static VALUE
rb_str_each_line(int argc,VALUE * argv,VALUE str)8242 rb_str_each_line(int argc, VALUE *argv, VALUE str)
8243 {
8244     RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
8245     return rb_str_enumerate_lines(argc, argv, str, 0);
8246 }
8247 
8248 /*
8249  *  call-seq:
8250  *     str.lines(separator=$/ [, getline_args])  -> an_array
8251  *
8252  *  Returns an array of lines in <i>str</i> split using the supplied
8253  *  record separator (<code>$/</code> by default).  This is a
8254  *  shorthand for <code>str.each_line(separator, getline_args).to_a</code>.
8255  *
8256  *  See IO.readlines for details about getline_args.
8257  *
8258  *     "hello\nworld\n".lines              #=> ["hello\n", "world\n"]
8259  *     "hello  world".lines(' ')           #=> ["hello ", " ", "world"]
8260  *     "hello\nworld\n".lines(chomp: true) #=> ["hello", "world"]
8261  *
8262  *  If a block is given, which is a deprecated form, works the same as
8263  *  <code>each_line</code>.
8264  */
8265 
8266 static VALUE
rb_str_lines(int argc,VALUE * argv,VALUE str)8267 rb_str_lines(int argc, VALUE *argv, VALUE str)
8268 {
8269     VALUE ary = WANTARRAY("lines", 0);
8270     return rb_str_enumerate_lines(argc, argv, str, ary);
8271 }
8272 
8273 static VALUE
rb_str_each_byte_size(VALUE str,VALUE args,VALUE eobj)8274 rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
8275 {
8276     return LONG2FIX(RSTRING_LEN(str));
8277 }
8278 
8279 static VALUE
rb_str_enumerate_bytes(VALUE str,VALUE ary)8280 rb_str_enumerate_bytes(VALUE str, VALUE ary)
8281 {
8282     long i;
8283 
8284     for (i=0; i<RSTRING_LEN(str); i++) {
8285 	ENUM_ELEM(ary, INT2FIX(RSTRING_PTR(str)[i] & 0xff));
8286     }
8287     if (ary)
8288 	return ary;
8289     else
8290 	return str;
8291 }
8292 
8293 /*
8294  *  call-seq:
8295  *     str.each_byte {|integer| block }    -> str
8296  *     str.each_byte                      -> an_enumerator
8297  *
8298  *  Passes each byte in <i>str</i> to the given block, or returns an
8299  *  enumerator if no block is given.
8300  *
8301  *     "hello".each_byte {|c| print c, ' ' }
8302  *
8303  *  <em>produces:</em>
8304  *
8305  *     104 101 108 108 111
8306  */
8307 
8308 static VALUE
rb_str_each_byte(VALUE str)8309 rb_str_each_byte(VALUE str)
8310 {
8311     RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
8312     return rb_str_enumerate_bytes(str, 0);
8313 }
8314 
8315 /*
8316  *  call-seq:
8317  *     str.bytes    -> an_array
8318  *
8319  *  Returns an array of bytes in <i>str</i>.  This is a shorthand for
8320  *  <code>str.each_byte.to_a</code>.
8321  *
8322  *  If a block is given, which is a deprecated form, works the same as
8323  *  <code>each_byte</code>.
8324  */
8325 
8326 static VALUE
rb_str_bytes(VALUE str)8327 rb_str_bytes(VALUE str)
8328 {
8329     VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
8330     return rb_str_enumerate_bytes(str, ary);
8331 }
8332 
8333 static VALUE
rb_str_each_char_size(VALUE str,VALUE args,VALUE eobj)8334 rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
8335 {
8336     return rb_str_length(str);
8337 }
8338 
8339 static VALUE
rb_str_enumerate_chars(VALUE str,VALUE ary)8340 rb_str_enumerate_chars(VALUE str, VALUE ary)
8341 {
8342     VALUE orig = str;
8343     long i, len, n;
8344     const char *ptr;
8345     rb_encoding *enc;
8346 
8347     str = rb_str_new_frozen(str);
8348     ptr = RSTRING_PTR(str);
8349     len = RSTRING_LEN(str);
8350     enc = rb_enc_get(str);
8351 
8352     if (ENC_CODERANGE_CLEAN_P(ENC_CODERANGE(str))) {
8353 	for (i = 0; i < len; i += n) {
8354 	    n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
8355 	    ENUM_ELEM(ary, rb_str_subseq(str, i, n));
8356 	}
8357     }
8358     else {
8359 	for (i = 0; i < len; i += n) {
8360 	    n = rb_enc_mbclen(ptr + i, ptr + len, enc);
8361 	    ENUM_ELEM(ary, rb_str_subseq(str, i, n));
8362 	}
8363     }
8364     RB_GC_GUARD(str);
8365     if (ary)
8366 	return ary;
8367     else
8368 	return orig;
8369 }
8370 
8371 /*
8372  *  call-seq:
8373  *     str.each_char {|cstr| block }    -> str
8374  *     str.each_char                    -> an_enumerator
8375  *
8376  *  Passes each character in <i>str</i> to the given block, or returns
8377  *  an enumerator if no block is given.
8378  *
8379  *     "hello".each_char {|c| print c, ' ' }
8380  *
8381  *  <em>produces:</em>
8382  *
8383  *     h e l l o
8384  */
8385 
8386 static VALUE
rb_str_each_char(VALUE str)8387 rb_str_each_char(VALUE str)
8388 {
8389     RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
8390     return rb_str_enumerate_chars(str, 0);
8391 }
8392 
8393 /*
8394  *  call-seq:
8395  *     str.chars    -> an_array
8396  *
8397  *  Returns an array of characters in <i>str</i>.  This is a shorthand
8398  *  for <code>str.each_char.to_a</code>.
8399  *
8400  *  If a block is given, which is a deprecated form, works the same as
8401  *  <code>each_char</code>.
8402  */
8403 
8404 static VALUE
rb_str_chars(VALUE str)8405 rb_str_chars(VALUE str)
8406 {
8407     VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
8408     return rb_str_enumerate_chars(str, ary);
8409 }
8410 
8411 static VALUE
rb_str_enumerate_codepoints(VALUE str,VALUE ary)8412 rb_str_enumerate_codepoints(VALUE str, VALUE ary)
8413 {
8414     VALUE orig = str;
8415     int n;
8416     unsigned int c;
8417     const char *ptr, *end;
8418     rb_encoding *enc;
8419 
8420     if (single_byte_optimizable(str))
8421 	return rb_str_enumerate_bytes(str, ary);
8422 
8423     str = rb_str_new_frozen(str);
8424     ptr = RSTRING_PTR(str);
8425     end = RSTRING_END(str);
8426     enc = STR_ENC_GET(str);
8427 
8428     while (ptr < end) {
8429 	c = rb_enc_codepoint_len(ptr, end, &n, enc);
8430 	ENUM_ELEM(ary, UINT2NUM(c));
8431 	ptr += n;
8432     }
8433     RB_GC_GUARD(str);
8434     if (ary)
8435 	return ary;
8436     else
8437 	return orig;
8438 }
8439 
8440 /*
8441  *  call-seq:
8442  *     str.each_codepoint {|integer| block }    -> str
8443  *     str.each_codepoint                       -> an_enumerator
8444  *
 *  Passes the <code>Integer</code> ordinal of each character in <i>str</i>
 *  (also known as a <i>codepoint</i> when applied to Unicode strings) to the
 *  given block.  For encodings other than UTF-8/UTF-16(BE|LE)/UTF-32(BE|LE),
 *  values are directly derived from the binary representation
 *  of each character.
8450  *
8451  *  If no block is given, an enumerator is returned instead.
8452  *
8453  *     "hello\u0639".each_codepoint {|c| print c, ' ' }
8454  *
8455  *  <em>produces:</em>
8456  *
8457  *     104 101 108 108 111 1593
8458  */
8459 
8460 static VALUE
rb_str_each_codepoint(VALUE str)8461 rb_str_each_codepoint(VALUE str)
8462 {
8463     RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
8464     return rb_str_enumerate_codepoints(str, 0);
8465 }
8466 
8467 /*
8468  *  call-seq:
8469  *     str.codepoints   -> an_array
8470  *
8471  *  Returns an array of the <code>Integer</code> ordinals of the
8472  *  characters in <i>str</i>.  This is a shorthand for
8473  *  <code>str.each_codepoint.to_a</code>.
8474  *
8475  *  If a block is given, which is a deprecated form, works the same as
8476  *  <code>each_codepoint</code>.
8477  */
8478 
8479 static VALUE
rb_str_codepoints(VALUE str)8480 rb_str_codepoints(VALUE str)
8481 {
8482     VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
8483     return rb_str_enumerate_codepoints(str, ary);
8484 }
8485 
/*
 * Returns a compiled regex for the pattern \X in encoding +enc+.
 *
 * The regex compiled for UTF-8 is kept in a function-level static and
 * reused on later calls.  For every other encoding a new regex is compiled
 * on each call.  A compilation failure ends in rb_fatal.
 *
 * NOTE(review): a regex compiled for a non-UTF-8 encoding is not stored or
 * released here, and the callers visible next to this function do not free
 * it either -- confirm whether that allocation is reclaimed elsewhere.
 */
static regex_t *
get_reg_grapheme_cluster(rb_encoding *enc)
{
    int encidx = rb_enc_to_index(enc);
    regex_t *reg_grapheme_cluster = NULL;
    static regex_t *reg_grapheme_cluster_utf8 = NULL;

    /* synchronize */
    if (encidx == rb_utf8_encindex() && reg_grapheme_cluster_utf8) {
	reg_grapheme_cluster = reg_grapheme_cluster_utf8;
    }
    if (!reg_grapheme_cluster) {
	/* default pattern source: the two bytes '\' and 'X' */
	const OnigUChar source_ascii[] = "\\X";
        OnigErrorInfo einfo;
        const OnigUChar *source = source_ascii;
        size_t source_len = sizeof(source_ascii) - 1;
        /* for UTF-16/UTF-32 the same two characters are spelled out in the
         * byte order and width of the target encoding */
        switch (encidx) {
#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
#define CASE_UTF(e) \
          case ENCINDEX_UTF_##e: { \
            static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
            source = source_UTF_##e; \
            source_len = sizeof(source_UTF_##e); \
            break; \
          }
            CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
#undef CASE_UTF
#undef CHARS_16BE
#undef CHARS_16LE
#undef CHARS_32BE
#undef CHARS_32LE
        }
	int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
                         ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
	if (r) {
            UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
            onig_error_code_to_str(message, r, &einfo);
            rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
	}
	/* only the UTF-8 regex is remembered for later calls */
	if (encidx == rb_utf8_encindex()) {
	    reg_grapheme_cluster_utf8 = reg_grapheme_cluster;
	}
    }
    return reg_grapheme_cluster;
}
8534 
8535 static VALUE
rb_str_each_grapheme_cluster_size(VALUE str,VALUE args,VALUE eobj)8536 rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
8537 {
8538     size_t grapheme_cluster_count = 0;
8539     regex_t *reg_grapheme_cluster = NULL;
8540     rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str));
8541     const char *ptr, *end;
8542 
8543     if (!rb_enc_unicode_p(enc)) {
8544 	return rb_str_length(str);
8545     }
8546 
8547     reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
8548     ptr = RSTRING_PTR(str);
8549     end = RSTRING_END(str);
8550 
8551     while (ptr < end) {
8552 	OnigPosition len = onig_match(reg_grapheme_cluster,
8553 				      (const OnigUChar *)ptr, (const OnigUChar *)end,
8554 				      (const OnigUChar *)ptr, NULL, 0);
8555 	if (len <= 0) break;
8556 	grapheme_cluster_count++;
8557 	ptr += len;
8558     }
8559 
8560     return SIZET2NUM(grapheme_cluster_count);
8561 }
8562 
8563 static VALUE
rb_str_enumerate_grapheme_clusters(VALUE str,VALUE ary)8564 rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
8565 {
8566     VALUE orig = str;
8567     regex_t *reg_grapheme_cluster = NULL;
8568     rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str));
8569     const char *ptr0, *ptr, *end;
8570 
8571     if (!rb_enc_unicode_p(enc)) {
8572 	return rb_str_enumerate_chars(str, ary);
8573     }
8574 
8575     if (!ary) str = rb_str_new_frozen(str);
8576     reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
8577     ptr0 = ptr = RSTRING_PTR(str);
8578     end = RSTRING_END(str);
8579 
8580     while (ptr < end) {
8581 	OnigPosition len = onig_match(reg_grapheme_cluster,
8582 				      (const OnigUChar *)ptr, (const OnigUChar *)end,
8583 				      (const OnigUChar *)ptr, NULL, 0);
8584 	if (len <= 0) break;
8585         ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
8586 	ptr += len;
8587     }
8588     RB_GC_GUARD(str);
8589     if (ary)
8590 	return ary;
8591     else
8592 	return orig;
8593 }
8594 
8595 /*
8596  *  call-seq:
8597  *     str.each_grapheme_cluster {|cstr| block }    -> str
8598  *     str.each_grapheme_cluster                    -> an_enumerator
8599  *
8600  *  Passes each grapheme cluster in <i>str</i> to the given block, or returns
8601  *  an enumerator if no block is given.
8602  *  Unlike String#each_char, this enumerates by grapheme clusters defined by
8603  *  Unicode Standard Annex #29 http://unicode.org/reports/tr29/
8604  *
8605  *     "a\u0300".each_char.to_a.size #=> 2
8606  *     "a\u0300".each_grapheme_cluster.to_a.size #=> 1
8607  *
8608  */
8609 
8610 static VALUE
rb_str_each_grapheme_cluster(VALUE str)8611 rb_str_each_grapheme_cluster(VALUE str)
8612 {
8613     RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
8614     return rb_str_enumerate_grapheme_clusters(str, 0);
8615 }
8616 
8617 /*
8618  *  call-seq:
8619  *     str.grapheme_clusters   -> an_array
8620  *
8621  *  Returns an array of grapheme clusters in <i>str</i>.  This is a shorthand
8622  *  for <code>str.each_grapheme_cluster.to_a</code>.
8623  *
8624  *  If a block is given, which is a deprecated form, works the same as
8625  *  <code>each_grapheme_cluster</code>.
8626  */
8627 
8628 static VALUE
rb_str_grapheme_clusters(VALUE str)8629 rb_str_grapheme_clusters(VALUE str)
8630 {
8631     VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
8632     return rb_str_enumerate_grapheme_clusters(str, ary);
8633 }
8634 
8635 static long
chopped_length(VALUE str)8636 chopped_length(VALUE str)
8637 {
8638     rb_encoding *enc = STR_ENC_GET(str);
8639     const char *p, *p2, *beg, *end;
8640 
8641     beg = RSTRING_PTR(str);
8642     end = beg + RSTRING_LEN(str);
8643     if (beg > end) return 0;
8644     p = rb_enc_prev_char(beg, end, end, enc);
8645     if (!p) return 0;
8646     if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
8647 	p2 = rb_enc_prev_char(beg, p, end, enc);
8648 	if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
8649     }
8650     return p - beg;
8651 }
8652 
8653 /*
8654  *  call-seq:
8655  *     str.chop!   -> str or nil
8656  *
8657  *  Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>,
8658  *  or <code>nil</code> if <i>str</i> is the empty string.  See also
8659  *  <code>String#chomp!</code>.
8660  */
8661 
8662 static VALUE
rb_str_chop_bang(VALUE str)8663 rb_str_chop_bang(VALUE str)
8664 {
8665     str_modify_keep_cr(str);
8666     if (RSTRING_LEN(str) > 0) {
8667 	long len;
8668 	len = chopped_length(str);
8669 	STR_SET_LEN(str, len);
8670 	TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
8671 	if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
8672 	    ENC_CODERANGE_CLEAR(str);
8673 	}
8674 	return str;
8675     }
8676     return Qnil;
8677 }
8678 
8679 
8680 /*
8681  *  call-seq:
8682  *     str.chop   -> new_str
8683  *
8684  *  Returns a new <code>String</code> with the last character removed.  If the
8685  *  string ends with <code>\r\n</code>, both characters are removed. Applying
8686  *  <code>chop</code> to an empty string returns an empty
8687  *  string. <code>String#chomp</code> is often a safer alternative, as it leaves
8688  *  the string unchanged if it doesn't end in a record separator.
8689  *
8690  *     "string\r\n".chop   #=> "string"
8691  *     "string\n\r".chop   #=> "string\n"
8692  *     "string\n".chop     #=> "string"
8693  *     "string".chop       #=> "strin"
8694  *     "x".chop.chop       #=> ""
8695  */
8696 
8697 static VALUE
rb_str_chop(VALUE str)8698 rb_str_chop(VALUE str)
8699 {
8700     return rb_str_subseq(str, 0, chopped_length(str));
8701 }
8702 
8703 
/*
 * Returns the byte length +str+ would have after removing the record
 * separator +rs+ from its end; the string itself is not modified.
 *
 * Cases, in the order they are tested:
 *  - empty +str+: 0.
 *  - +rs+ is rb_default_rs: "smart chomp" (see the smart_chomp label).
 *  - empty +rs+: all trailing newlines are removed, each together with a
 *    '\r' directly in front of it.
 *  - +rs+ longer than +str+: nothing is removed.
 *  - +rs+ is a single minimum-width character that is a newline: smart
 *    chomp as well.
 *  - otherwise: +rs+ is removed only if the tail bytes of +str+ equal it
 *    and the match starts on a character boundary.
 */
static long
chompped_length(VALUE str, VALUE rs)
{
    rb_encoding *enc;
    int newline;
    char *pp, *e, *rsptr;
    long rslen;
    char *const p = RSTRING_PTR(str);
    long len = RSTRING_LEN(str);

    if (len == 0) return 0;
    e = p + len;
    if (rs == rb_default_rs) {
      smart_chomp:
	/* smart chomp: drop one trailing newline and/or one trailing '\r' */
	enc = rb_enc_get(str);
	if (rb_enc_mbminlen(enc) > 1) {
	    /* wide encodings: work on whole characters */
	    pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
	    if (rb_enc_is_newline(pp, e, enc)) {
		e = pp;
	    }
	    pp = e - rb_enc_mbminlen(enc);
	    if (pp >= p) {
		pp = rb_enc_left_char_head(p, pp, e, enc);
		if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
		    e = pp;
		}
	    }
	}
	else {
	    /* byte-wise: "\n", "\r\n" or "\r" at the end */
	    switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
	      case '\n':
		if (--e > p && *(e-1) == '\r') {
		    --e;
		}
		break;
	      case '\r':
		--e;
		break;
	    }
	}
	return e - p;
    }

    enc = rb_enc_get(str);
    RSTRING_GETMEM(rs, rsptr, rslen);
    if (rslen == 0) {
	/* empty separator: strip every trailing newline */
	if (rb_enc_mbminlen(enc) > 1) {
	    while (e > p) {
		pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
		if (!rb_enc_is_newline(pp, e, enc)) break;
		e = pp;
		pp -= rb_enc_mbminlen(enc);
		if (pp >= p) {
		    pp = rb_enc_left_char_head(p, pp, e, enc);
		    if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
			e = pp;
		    }
		}
	    }
	}
	else {
	    while (e > p && *(e-1) == '\n') {
		--e;
		if (e > p && *(e-1) == '\r')
		    --e;
	    }
	}
	return e - p;
    }
    if (rslen > len) return len;

    enc = rb_enc_get(rs);
    /* last byte of the separator, used below as a cheap first comparison */
    newline = rsptr[rslen-1];
    if (rslen == rb_enc_mbminlen(enc)) {
	if (rslen == 1) {
	    if (newline == '\n')
		goto smart_chomp;
	}
	else {
	    if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
		goto smart_chomp;
	}
    }

    enc = rb_enc_check(str, rs);
    if (is_broken_string(rs)) {
	return len;
    }
    /* pp: where the separator would have to start */
    pp = e - rslen;
    if (p[len-1] == newline &&
	(rslen <= 1 ||
	 memcmp(rsptr, pp, rslen) == 0)) {
	/* only accept the match when pp is itself a character head */
	if (rb_enc_left_char_head(p, pp, e, enc) == pp)
	    return len - rslen;
	RB_GC_GUARD(rs);
    }
    return len;
}
8802 
8803 /*!
8804  * Returns the separator for arguments of rb_str_chomp.
8805  *
8806  * @return returns rb_ps ($/) as default, the default value of rb_ps ($/) is "\n".
8807  */
8808 static VALUE
chomp_rs(int argc,const VALUE * argv)8809 chomp_rs(int argc, const VALUE *argv)
8810 {
8811     rb_check_arity(argc, 0, 1);
8812     if (argc > 0) {
8813 	VALUE rs = argv[0];
8814 	if (!NIL_P(rs)) StringValue(rs);
8815 	return rs;
8816     }
8817     else {
8818 	return rb_rs;
8819     }
8820 }
8821 
8822 VALUE
rb_str_chomp_string(VALUE str,VALUE rs)8823 rb_str_chomp_string(VALUE str, VALUE rs)
8824 {
8825     long olen = RSTRING_LEN(str);
8826     long len = chompped_length(str, rs);
8827     if (len >= olen) return Qnil;
8828     str_modify_keep_cr(str);
8829     STR_SET_LEN(str, len);
8830     TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
8831     if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
8832 	ENC_CODERANGE_CLEAR(str);
8833     }
8834     return str;
8835 }
8836 
8837 /*
8838  *  call-seq:
8839  *     str.chomp!(separator=$/)   -> str or nil
8840  *
8841  *  Modifies <i>str</i> in place as described for <code>String#chomp</code>,
8842  *  returning <i>str</i>, or <code>nil</code> if no modifications were made.
8843  */
8844 
8845 static VALUE
rb_str_chomp_bang(int argc,VALUE * argv,VALUE str)8846 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
8847 {
8848     VALUE rs;
8849     str_modifiable(str);
8850     if (RSTRING_LEN(str) == 0) return Qnil;
8851     rs = chomp_rs(argc, argv);
8852     if (NIL_P(rs)) return Qnil;
8853     return rb_str_chomp_string(str, rs);
8854 }
8855 
8856 
8857 /*
8858  *  call-seq:
8859  *     str.chomp(separator=$/)   -> new_str
8860  *
8861  *  Returns a new <code>String</code> with the given record separator removed
8862  *  from the end of <i>str</i> (if present). If <code>$/</code> has not been
8863  *  changed from the default Ruby record separator, then <code>chomp</code> also
 *  removes carriage return characters (that is, it will remove <code>\n</code>,
8865  *  <code>\r</code>, and <code>\r\n</code>). If <code>$/</code> is an empty string,
8866  *  it will remove all trailing newlines from the string.
8867  *
8868  *     "hello".chomp                #=> "hello"
8869  *     "hello\n".chomp              #=> "hello"
8870  *     "hello\r\n".chomp            #=> "hello"
8871  *     "hello\n\r".chomp            #=> "hello\n"
8872  *     "hello\r".chomp              #=> "hello"
8873  *     "hello \n there".chomp       #=> "hello \n there"
8874  *     "hello".chomp("llo")         #=> "he"
8875  *     "hello\r\n\r\n".chomp('')    #=> "hello"
8876  *     "hello\r\n\r\r\n".chomp('')  #=> "hello\r\n\r"
8877  */
8878 
8879 static VALUE
rb_str_chomp(int argc,VALUE * argv,VALUE str)8880 rb_str_chomp(int argc, VALUE *argv, VALUE str)
8881 {
8882     VALUE rs = chomp_rs(argc, argv);
8883     if (NIL_P(rs)) return rb_str_dup(str);
8884     return rb_str_subseq(str, 0, chompped_length(str, rs));
8885 }
8886 
8887 static long
lstrip_offset(VALUE str,const char * s,const char * e,rb_encoding * enc)8888 lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
8889 {
8890     const char *const start = s;
8891 
8892     if (!s || s >= e) return 0;
8893 
8894     /* remove spaces at head */
8895     if (single_byte_optimizable(str)) {
8896 	while (s < e && ascii_isspace(*s)) s++;
8897     }
8898     else {
8899 	while (s < e) {
8900 	    int n;
8901 	    unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
8902 
8903 	    if (!rb_isspace(cc)) break;
8904 	    s += n;
8905 	}
8906     }
8907     return s - start;
8908 }
8909 
8910 /*
8911  *  call-seq:
8912  *     str.lstrip!   -> self or nil
8913  *
8914  *  Removes leading whitespace from the receiver.
8915  *  Returns the altered receiver, or +nil+ if no change was made.
8916  *  See also String#rstrip! and String#strip!.
8917  *
8918  *  Refer to String#strip for the definition of whitespace.
8919  *
8920  *     "  hello  ".lstrip!  #=> "hello  "
8921  *     "hello  ".lstrip!    #=> nil
8922  *     "hello".lstrip!      #=> nil
8923  */
8924 
8925 static VALUE
rb_str_lstrip_bang(VALUE str)8926 rb_str_lstrip_bang(VALUE str)
8927 {
8928     rb_encoding *enc;
8929     char *start, *s;
8930     long olen, loffset;
8931 
8932     str_modify_keep_cr(str);
8933     enc = STR_ENC_GET(str);
8934     RSTRING_GETMEM(str, start, olen);
8935     loffset = lstrip_offset(str, start, start+olen, enc);
8936     if (loffset > 0) {
8937 	long len = olen-loffset;
8938 	s = start + loffset;
8939 	memmove(start, s, len);
8940 	STR_SET_LEN(str, len);
8941 #if !SHARABLE_MIDDLE_SUBSTRING
8942 	TERM_FILL(start+len, rb_enc_mbminlen(enc));
8943 #endif
8944 	return str;
8945     }
8946     return Qnil;
8947 }
8948 
8949 
8950 /*
8951  *  call-seq:
8952  *     str.lstrip   -> new_str
8953  *
8954  *  Returns a copy of the receiver with leading whitespace removed.
8955  *  See also String#rstrip and String#strip.
8956  *
8957  *  Refer to String#strip for the definition of whitespace.
8958  *
8959  *     "  hello  ".lstrip   #=> "hello  "
8960  *     "hello".lstrip       #=> "hello"
8961  */
8962 
8963 static VALUE
rb_str_lstrip(VALUE str)8964 rb_str_lstrip(VALUE str)
8965 {
8966     char *start;
8967     long len, loffset;
8968     RSTRING_GETMEM(str, start, len);
8969     loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
8970     if (loffset <= 0) return rb_str_dup(str);
8971     return rb_str_subseq(str, loffset, len - loffset);
8972 }
8973 
8974 static long
rstrip_offset(VALUE str,const char * s,const char * e,rb_encoding * enc)8975 rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
8976 {
8977     const char *t;
8978 
8979     rb_str_check_dummy_enc(enc);
8980     if (!s || s >= e) return 0;
8981     t = e;
8982 
8983     /* remove trailing spaces or '\0's */
8984     if (single_byte_optimizable(str)) {
8985 	unsigned char c;
8986 	while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
8987     }
8988     else {
8989 	char *tp;
8990 
8991         while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
8992 	    unsigned int c = rb_enc_codepoint(tp, e, enc);
8993 	    if (c && !rb_isspace(c)) break;
8994 	    t = tp;
8995 	}
8996     }
8997     return e - t;
8998 }
8999 
9000 /*
9001  *  call-seq:
9002  *     str.rstrip!   -> self or nil
9003  *
9004  *  Removes trailing whitespace from the receiver.
9005  *  Returns the altered receiver, or +nil+ if no change was made.
9006  *  See also String#lstrip! and String#strip!.
9007  *
9008  *  Refer to String#strip for the definition of whitespace.
9009  *
9010  *     "  hello  ".rstrip!  #=> "  hello"
9011  *     "  hello".rstrip!    #=> nil
9012  *     "hello".rstrip!      #=> nil
9013  */
9014 
9015 static VALUE
rb_str_rstrip_bang(VALUE str)9016 rb_str_rstrip_bang(VALUE str)
9017 {
9018     rb_encoding *enc;
9019     char *start;
9020     long olen, roffset;
9021 
9022     str_modify_keep_cr(str);
9023     enc = STR_ENC_GET(str);
9024     RSTRING_GETMEM(str, start, olen);
9025     roffset = rstrip_offset(str, start, start+olen, enc);
9026     if (roffset > 0) {
9027 	long len = olen - roffset;
9028 
9029 	STR_SET_LEN(str, len);
9030 #if !SHARABLE_MIDDLE_SUBSTRING
9031 	TERM_FILL(start+len, rb_enc_mbminlen(enc));
9032 #endif
9033 	return str;
9034     }
9035     return Qnil;
9036 }
9037 
9038 
9039 /*
9040  *  call-seq:
9041  *     str.rstrip   -> new_str
9042  *
9043  *  Returns a copy of the receiver with trailing whitespace removed.
9044  *  See also String#lstrip and String#strip.
9045  *
9046  *  Refer to String#strip for the definition of whitespace.
9047  *
9048  *     "  hello  ".rstrip   #=> "  hello"
9049  *     "hello".rstrip       #=> "hello"
9050  */
9051 
9052 static VALUE
rb_str_rstrip(VALUE str)9053 rb_str_rstrip(VALUE str)
9054 {
9055     rb_encoding *enc;
9056     char *start;
9057     long olen, roffset;
9058 
9059     enc = STR_ENC_GET(str);
9060     RSTRING_GETMEM(str, start, olen);
9061     roffset = rstrip_offset(str, start, start+olen, enc);
9062 
9063     if (roffset <= 0) return rb_str_dup(str);
9064     return rb_str_subseq(str, 0, olen-roffset);
9065 }
9066 
9067 
9068 /*
9069  *  call-seq:
9070  *     str.strip!   -> self or nil
9071  *
9072  *  Removes leading and trailing whitespace from the receiver.
9073  *  Returns the altered receiver, or +nil+ if there was no change.
9074  *
9075  *  Refer to String#strip for the definition of whitespace.
9076  *
9077  *     "  hello  ".strip!  #=> "hello"
9078  *     "hello".strip!      #=> nil
9079  */
9080 
9081 static VALUE
rb_str_strip_bang(VALUE str)9082 rb_str_strip_bang(VALUE str)
9083 {
9084     char *start;
9085     long olen, loffset, roffset;
9086     rb_encoding *enc;
9087 
9088     str_modify_keep_cr(str);
9089     enc = STR_ENC_GET(str);
9090     RSTRING_GETMEM(str, start, olen);
9091     loffset = lstrip_offset(str, start, start+olen, enc);
9092     roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9093 
9094     if (loffset > 0 || roffset > 0) {
9095 	long len = olen-roffset;
9096 	if (loffset > 0) {
9097 	    len -= loffset;
9098 	    memmove(start, start + loffset, len);
9099 	}
9100 	STR_SET_LEN(str, len);
9101 #if !SHARABLE_MIDDLE_SUBSTRING
9102 	TERM_FILL(start+len, rb_enc_mbminlen(enc));
9103 #endif
9104 	return str;
9105     }
9106     return Qnil;
9107 }
9108 
9109 
9110 /*
9111  *  call-seq:
9112  *     str.strip   -> new_str
9113  *
9114  *  Returns a copy of the receiver with leading and trailing whitespace removed.
9115  *
9116  *  Whitespace is defined as any of the following characters:
9117  *  null, horizontal tab, line feed, vertical tab, form feed, carriage return, space.
9118  *
9119  *     "    hello    ".strip   #=> "hello"
9120  *     "\tgoodbye\r\n".strip   #=> "goodbye"
9121  *     "\x00\t\n\v\f\r ".strip #=> ""
9122  *     "hello".strip           #=> "hello"
9123  */
9124 
9125 static VALUE
rb_str_strip(VALUE str)9126 rb_str_strip(VALUE str)
9127 {
9128     char *start;
9129     long olen, loffset, roffset;
9130     rb_encoding *enc = STR_ENC_GET(str);
9131 
9132     RSTRING_GETMEM(str, start, olen);
9133     loffset = lstrip_offset(str, start, start+olen, enc);
9134     roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9135 
9136     if (loffset <= 0 && roffset <= 0) return rb_str_dup(str);
9137     return rb_str_subseq(str, loffset, olen-loffset-roffset);
9138 }
9139 
9140 static VALUE
scan_once(VALUE str,VALUE pat,long * start,int set_backref_str)9141 scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
9142 {
9143     VALUE result, match;
9144     struct re_registers *regs;
9145     int i;
9146     long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
9147     if (pos >= 0) {
9148 	if (BUILTIN_TYPE(pat) == T_STRING) {
9149 	    regs = NULL;
9150 	    end = pos + RSTRING_LEN(pat);
9151 	}
9152 	else {
9153 	    match = rb_backref_get();
9154 	    regs = RMATCH_REGS(match);
9155 	    pos = BEG(0);
9156 	    end = END(0);
9157 	}
9158 	if (pos == end) {
9159 	    rb_encoding *enc = STR_ENC_GET(str);
9160 	    /*
9161 	     * Always consume at least one character of the input string
9162 	     */
9163 	    if (RSTRING_LEN(str) > end)
9164 		*start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
9165 						  RSTRING_END(str), enc);
9166 	    else
9167 		*start = end + 1;
9168 	}
9169 	else {
9170 	    *start = end;
9171 	}
9172 	if (!regs || regs->num_regs == 1) {
9173 	    result = rb_str_subseq(str, pos, end - pos);
9174 	    OBJ_INFECT(result, pat);
9175 	    return result;
9176 	}
9177 	result = rb_ary_new2(regs->num_regs);
9178 	for (i=1; i < regs->num_regs; i++) {
9179 	    VALUE s = Qnil;
9180 	    if (BEG(i) >= 0) {
9181 		s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
9182 		OBJ_INFECT(s, pat);
9183 	    }
9184 	    rb_ary_push(result, s);
9185 	}
9186 
9187 	return result;
9188     }
9189     return Qnil;
9190 }
9191 
9192 
9193 /*
9194  *  call-seq:
9195  *     str.scan(pattern)                         -> array
9196  *     str.scan(pattern) {|match, ...| block }   -> str
9197  *
9198  *  Both forms iterate through <i>str</i>, matching the pattern (which may be a
9199  *  <code>Regexp</code> or a <code>String</code>). For each match, a result is
9200  *  generated and either added to the result array or passed to the block. If
9201  *  the pattern contains no groups, each individual result consists of the
9202  *  matched string, <code>$&</code>.  If the pattern contains groups, each
9203  *  individual result is itself an array containing one entry per group.
9204  *
9205  *     a = "cruel world"
9206  *     a.scan(/\w+/)        #=> ["cruel", "world"]
9207  *     a.scan(/.../)        #=> ["cru", "el ", "wor"]
9208  *     a.scan(/(...)/)      #=> [["cru"], ["el "], ["wor"]]
9209  *     a.scan(/(..)(..)/)   #=> [["cr", "ue"], ["l ", "wo"]]
9210  *
9211  *  And the block form:
9212  *
9213  *     a.scan(/\w+/) {|w| print "<<#{w}>> " }
9214  *     print "\n"
9215  *     a.scan(/(.)(.)/) {|x,y| print y, x }
9216  *     print "\n"
9217  *
9218  *  <em>produces:</em>
9219  *
9220  *     <<cruel>> <<world>>
9221  *     rceu lowlr
9222  */
9223 
9224 static VALUE
rb_str_scan(VALUE str,VALUE pat)9225 rb_str_scan(VALUE str, VALUE pat)
9226 {
9227     VALUE result;
9228     long start = 0;
9229     long last = -1, prev = 0;
9230     char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
9231 
9232     pat = get_pat_quoted(pat, 1);
9233     mustnot_broken(str);
9234     if (!rb_block_given_p()) {
9235 	VALUE ary = rb_ary_new();
9236 
9237 	while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
9238 	    last = prev;
9239 	    prev = start;
9240 	    rb_ary_push(ary, result);
9241 	}
9242 	if (last >= 0) rb_pat_search(pat, str, last, 1);
9243 	else rb_backref_set(Qnil);
9244 	return ary;
9245     }
9246 
9247     while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
9248 	last = prev;
9249 	prev = start;
9250 	rb_yield(result);
9251 	str_mod_check(str, p, len);
9252     }
9253     if (last >= 0) rb_pat_search(pat, str, last, 1);
9254     return str;
9255 }
9256 
9257 
9258 /*
9259  *  call-seq:
9260  *     str.hex   -> integer
9261  *
9262  *  Treats leading characters from <i>str</i> as a string of hexadecimal digits
9263  *  (with an optional sign and an optional <code>0x</code>) and returns the
9264  *  corresponding number. Zero is returned on error.
9265  *
9266  *     "0x0a".hex     #=> 10
9267  *     "-1234".hex    #=> -4660
9268  *     "0".hex        #=> 0
9269  *     "wombat".hex   #=> 0
9270  */
9271 
9272 static VALUE
rb_str_hex(VALUE str)9273 rb_str_hex(VALUE str)
9274 {
9275     return rb_str_to_inum(str, 16, FALSE);
9276 }
9277 
9278 
9279 /*
9280  *  call-seq:
9281  *     str.oct   -> integer
9282  *
9283  *  Treats leading characters of <i>str</i> as a string of octal digits (with an
9284  *  optional sign) and returns the corresponding number.  Returns 0 if the
9285  *  conversion fails.
9286  *
9287  *     "123".oct       #=> 83
9288  *     "-377".oct      #=> -255
9289  *     "bad".oct       #=> 0
9290  *     "0377bad".oct   #=> 255
9291  *
9292  *  If +str+ starts with <code>0</code>, radix indicators are honored.
9293  *  See Kernel#Integer.
9294  */
9295 
9296 static VALUE
rb_str_oct(VALUE str)9297 rb_str_oct(VALUE str)
9298 {
9299     return rb_str_to_inum(str, -8, FALSE);
9300 }
9301 
9302 
9303 /*
9304  *  call-seq:
9305  *     str.crypt(salt_str)   -> new_str
9306  *
9307  *  Returns the string generated by calling <code>crypt(3)</code>
9308  *  standard library function with <code>str</code> and
9309  *  <code>salt_str</code>, in this order, as its arguments.  Please do
9310  *  not use this method any longer.  It is legacy; provided only for
9311  *  backward compatibility with ruby scripts in earlier days.  It is
9312  *  bad to use in contemporary programs for several reasons:
9313  *
9314  *    * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
9315  *      run.  The generated string lacks data portability.
9316  *
9317  *    * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
9318  *      (i.e. silently ends up in unexpected results).
9319  *
9320  *    * On some OSes such as Mac OS, <code>crypt(3)</code> is not
9321  *      thread safe.
9322  *
9323  *    * So-called "traditional" usage of <code>crypt(3)</code> is very
9324  *      very very weak.  According to its manpage, Linux's traditional
9325  *      <code>crypt(3)</code> output has only 2**56 variations; too
9326  *      easy to brute force today.  And this is the default behaviour.
9327  *
9328  *    * In order to make things robust some OSes implement so-called
9329  *      "modular" usage. To go through, you have to do a complex
9330  *      build-up of the <code>salt_str</code> parameter, by hand.
9331  *      Failure in generation of a proper salt string tends not to
9332  *      yield any errors; typos in parameters are normally not
9333  *      detectable.
9334  *
9335  *        * For instance, in the following example, the second invocation
9336  *          of <code>String#crypt</code> is wrong; it has a typo in
9337  *          "round=" (lacks "s").  However the call does not fail and
9338  *          something unexpected is generated.
9339  *
9340  *             "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
9341  *             "foo".crypt("$5$round=1000$salt$")  # Typo not detected
9342  *
9343  *    * Even in the "modular" mode, some hash functions are considered
9344  *      archaic and no longer recommended at all; for instance module
9345  *      <code>$1$</code> is officially abandoned by its author: see
9346  *      http://phk.freebsd.dk/sagas/md5crypt_eol.html .  For another
9347  *      instance module <code>$3$</code> is considered completely
9348  *      broken: see the manpage of FreeBSD.
9349  *
9350  *    * On some OS such as Mac OS, there is no modular mode. Yet, as
9351  *      written above, <code>crypt(3)</code> on Mac OS never fails.
9352  *      This means even if you build up a proper salt string it
9353  *      generates a traditional DES hash anyways, and there is no way
9354  *      for you to be aware of.
9355  *
9356  *          "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
9357  *
9358  *  If for some reason you cannot migrate to other secure contemporary
9359  *  password hashing algorithms, install the string-crypt gem and
9360  *  <code>require 'string/crypt'</code> to continue using it.
9361  */
9362 
9363 static VALUE
rb_str_crypt(VALUE str,VALUE salt)9364 rb_str_crypt(VALUE str, VALUE salt)
9365 {
9366 #ifdef HAVE_CRYPT_R
9367     VALUE databuf;
9368     struct crypt_data *data;
9369 #   define CRYPT_END() ALLOCV_END(databuf)
9370 #else
9371     extern char *crypt(const char *, const char *);
9372 #   define CRYPT_END() (void)0
9373 #endif
9374     VALUE result;
9375     const char *s, *saltp;
9376     char *res;
9377 #ifdef BROKEN_CRYPT
9378     char salt_8bit_clean[3];
9379 #endif
9380 
9381     StringValue(salt);
9382     mustnot_wchar(str);
9383     mustnot_wchar(salt);
9384     if (RSTRING_LEN(salt) < 2) {
9385       short_salt:
9386 	rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
9387     }
9388 
9389     s = StringValueCStr(str);
9390     saltp = RSTRING_PTR(salt);
9391     if (!saltp[0] || !saltp[1]) goto short_salt;
9392 #ifdef BROKEN_CRYPT
9393     if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
9394 	salt_8bit_clean[0] = saltp[0] & 0x7f;
9395 	salt_8bit_clean[1] = saltp[1] & 0x7f;
9396 	salt_8bit_clean[2] = '\0';
9397 	saltp = salt_8bit_clean;
9398     }
9399 #endif
9400 #ifdef HAVE_CRYPT_R
9401     data = ALLOCV(databuf, sizeof(struct crypt_data));
9402 # ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
9403     data->initialized = 0;
9404 # endif
9405     res = crypt_r(s, saltp, data);
9406 #else
9407     res = crypt(s, saltp);
9408 #endif
9409     if (!res) {
9410 	int err = errno;
9411 	CRYPT_END();
9412 	rb_syserr_fail(err, "crypt");
9413     }
9414     result = rb_str_new_cstr(res);
9415     CRYPT_END();
9416     FL_SET_RAW(result, OBJ_TAINTED_RAW(str) | OBJ_TAINTED_RAW(salt));
9417     return result;
9418 }
9419 
9420 
9421 /*
9422  *  call-seq:
9423  *     str.ord   -> integer
9424  *
9425  *  Returns the <code>Integer</code> ordinal of a one-character string.
9426  *
9427  *     "a".ord         #=> 97
9428  */
9429 
9430 VALUE
rb_str_ord(VALUE s)9431 rb_str_ord(VALUE s)
9432 {
9433     unsigned int c;
9434 
9435     c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
9436     return UINT2NUM(c);
9437 }
/*
 *  call-seq:
 *     str.sum(n=16)   -> integer
 *
 *  Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
 *  where <em>n</em> is the optional <code>Integer</code> parameter, defaulting
 *  to 16. The result is simply the sum of the binary value of each byte in
 *  <i>str</i> modulo <code>2**n</code> (that is, masked with
 *  <code>2**n - 1</code>). If <em>n</em> is zero or negative, the full sum
 *  is returned without masking. This is not a particularly good checksum.
 */
9448 
9449 static VALUE
rb_str_sum(int argc,VALUE * argv,VALUE str)9450 rb_str_sum(int argc, VALUE *argv, VALUE str)
9451 {
9452     int bits = 16;
9453     char *ptr, *p, *pend;
9454     long len;
9455     VALUE sum = INT2FIX(0);
9456     unsigned long sum0 = 0;
9457 
9458     if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
9459         bits = 0;
9460     }
9461     ptr = p = RSTRING_PTR(str);
9462     len = RSTRING_LEN(str);
9463     pend = p + len;
9464 
9465     while (p < pend) {
9466         if (FIXNUM_MAX - UCHAR_MAX < sum0) {
9467             sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
9468             str_mod_check(str, ptr, len);
9469             sum0 = 0;
9470         }
9471         sum0 += (unsigned char)*p;
9472         p++;
9473     }
9474 
9475     if (bits == 0) {
9476         if (sum0) {
9477             sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
9478         }
9479     }
9480     else {
9481         if (sum == INT2FIX(0)) {
9482             if (bits < (int)sizeof(long)*CHAR_BIT) {
9483                 sum0 &= (((unsigned long)1)<<bits)-1;
9484             }
9485             sum = LONG2FIX(sum0);
9486         }
9487         else {
9488             VALUE mod;
9489 
9490             if (sum0) {
9491                 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
9492             }
9493 
9494             mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
9495             mod = rb_funcall(mod, '-', 1, INT2FIX(1));
9496             sum = rb_funcall(sum, '&', 1, mod);
9497         }
9498     }
9499     return sum;
9500 }
9501 
9502 static VALUE
rb_str_justify(int argc,VALUE * argv,VALUE str,char jflag)9503 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
9504 {
9505     rb_encoding *enc;
9506     VALUE w;
9507     long width, len, flen = 1, fclen = 1;
9508     VALUE res;
9509     char *p;
9510     const char *f = " ";
9511     long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
9512     VALUE pad;
9513     int singlebyte = 1, cr;
9514     int termlen;
9515 
9516     rb_scan_args(argc, argv, "11", &w, &pad);
9517     enc = STR_ENC_GET(str);
9518     termlen = rb_enc_mbminlen(enc);
9519     width = NUM2LONG(w);
9520     if (argc == 2) {
9521 	StringValue(pad);
9522 	enc = rb_enc_check(str, pad);
9523 	f = RSTRING_PTR(pad);
9524 	flen = RSTRING_LEN(pad);
9525 	fclen = str_strlen(pad, enc); /* rb_enc_check */
9526 	singlebyte = single_byte_optimizable(pad);
9527 	if (flen == 0 || fclen == 0) {
9528 	    rb_raise(rb_eArgError, "zero width padding");
9529 	}
9530     }
9531     len = str_strlen(str, enc); /* rb_enc_check */
9532     if (width < 0 || len >= width) return rb_str_dup(str);
9533     n = width - len;
9534     llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
9535     rlen = n - llen;
9536     cr = ENC_CODERANGE(str);
9537     if (flen > 1) {
9538        llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
9539        rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
9540     }
9541     size = RSTRING_LEN(str);
9542     if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
9543        (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
9544        (len += llen2 + rlen2) >= LONG_MAX - size) {
9545        rb_raise(rb_eArgError, "argument too big");
9546     }
9547     len += size;
9548     res = str_new0(rb_obj_class(str), 0, len, termlen);
9549     p = RSTRING_PTR(res);
9550     if (flen <= 1) {
9551        memset(p, *f, llen);
9552        p += llen;
9553     }
9554     else {
9555        while (llen >= fclen) {
9556 	    memcpy(p,f,flen);
9557 	    p += flen;
9558 	    llen -= fclen;
9559 	}
9560        if (llen > 0) {
9561            memcpy(p, f, llen2);
9562            p += llen2;
9563 	}
9564     }
9565     memcpy(p, RSTRING_PTR(str), size);
9566     p += size;
9567     if (flen <= 1) {
9568        memset(p, *f, rlen);
9569        p += rlen;
9570     }
9571     else {
9572        while (rlen >= fclen) {
9573 	    memcpy(p,f,flen);
9574 	    p += flen;
9575 	    rlen -= fclen;
9576 	}
9577        if (rlen > 0) {
9578            memcpy(p, f, rlen2);
9579            p += rlen2;
9580 	}
9581     }
9582     TERM_FILL(p, termlen);
9583     STR_SET_LEN(res, p-RSTRING_PTR(res));
9584     OBJ_INFECT_RAW(res, str);
9585     if (!NIL_P(pad)) OBJ_INFECT_RAW(res, pad);
9586     rb_enc_associate(res, enc);
9587     if (argc == 2)
9588 	cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
9589     if (cr != ENC_CODERANGE_BROKEN)
9590 	ENC_CODERANGE_SET(res, cr);
9591 
9592     RB_GC_GUARD(pad);
9593     return res;
9594 }
9595 
9596 
9597 /*
9598  *  call-seq:
9599  *     str.ljust(integer, padstr=' ')   -> new_str
9600  *
9601  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
9602  *  <code>String</code> of length <i>integer</i> with <i>str</i> left justified
9603  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
9604  *
9605  *     "hello".ljust(4)            #=> "hello"
9606  *     "hello".ljust(20)           #=> "hello               "
9607  *     "hello".ljust(20, '1234')   #=> "hello123412341234123"
9608  */
9609 
9610 static VALUE
rb_str_ljust(int argc,VALUE * argv,VALUE str)9611 rb_str_ljust(int argc, VALUE *argv, VALUE str)
9612 {
9613     return rb_str_justify(argc, argv, str, 'l');
9614 }
9615 
9616 
9617 /*
9618  *  call-seq:
9619  *     str.rjust(integer, padstr=' ')   -> new_str
9620  *
9621  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
9622  *  <code>String</code> of length <i>integer</i> with <i>str</i> right justified
9623  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
9624  *
9625  *     "hello".rjust(4)            #=> "hello"
9626  *     "hello".rjust(20)           #=> "               hello"
9627  *     "hello".rjust(20, '1234')   #=> "123412341234123hello"
9628  */
9629 
9630 static VALUE
rb_str_rjust(int argc,VALUE * argv,VALUE str)9631 rb_str_rjust(int argc, VALUE *argv, VALUE str)
9632 {
9633     return rb_str_justify(argc, argv, str, 'r');
9634 }
9635 
9636 
9637 /*
9638  *  call-seq:
9639  *     str.center(width, padstr=' ')   -> new_str
9640  *
9641  *  Centers +str+ in +width+.  If +width+ is greater than the length of +str+,
9642  *  returns a new String of length +width+ with +str+ centered and padded with
9643  *  +padstr+; otherwise, returns +str+.
9644  *
9645  *     "hello".center(4)         #=> "hello"
9646  *     "hello".center(20)        #=> "       hello        "
9647  *     "hello".center(20, '123') #=> "1231231hello12312312"
9648  */
9649 
9650 static VALUE
rb_str_center(int argc,VALUE * argv,VALUE str)9651 rb_str_center(int argc, VALUE *argv, VALUE str)
9652 {
9653     return rb_str_justify(argc, argv, str, 'c');
9654 }
9655 
/*
 *  call-seq:
 *     str.partition(sep)              -> [head, sep, tail]
 *     str.partition(regexp)           -> [head, match, tail]
 *
 *  Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
 *  and returns the part before it, the match, and the part
 *  after it.
 *  If it is not found, returns <i>str</i> and two empty strings.
 *
 *     "hello".partition("l")         #=> ["he", "l", "lo"]
 *     "hello".partition("x")         #=> ["hello", "", ""]
 *     "hello".partition(/.l/)        #=> ["h", "el", "lo"]
 */

9671 static VALUE
rb_str_partition(VALUE str,VALUE sep)9672 rb_str_partition(VALUE str, VALUE sep)
9673 {
9674     long pos;
9675 
9676     sep = get_pat_quoted(sep, 0);
9677     if (RB_TYPE_P(sep, T_REGEXP)) {
9678 	pos = rb_reg_search(sep, str, 0, 0);
9679 	if (pos < 0) {
9680 	  failed:
9681 	    return rb_ary_new3(3, rb_str_dup(str), str_new_empty(str), str_new_empty(str));
9682 	}
9683 	sep = rb_str_subpat(str, sep, INT2FIX(0));
9684 	if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
9685     }
9686     else {
9687 	pos = rb_str_index(str, sep, 0);
9688 	if (pos < 0) goto failed;
9689     }
9690     return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
9691 		          sep,
9692 		          rb_str_subseq(str, pos+RSTRING_LEN(sep),
9693 					     RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
9694 }
9695 
9696 /*
9697  *  call-seq:
9698  *     str.rpartition(sep)             -> [head, sep, tail]
9699  *     str.rpartition(regexp)          -> [head, match, tail]
9700  *
9701  *  Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
9702  *  of the string, and returns the part before it, the match, and the part
9703  *  after it.
9704  *  If it is not found, returns two empty strings and <i>str</i>.
9705  *
9706  *     "hello".rpartition("l")         #=> ["hel", "l", "o"]
9707  *     "hello".rpartition("x")         #=> ["", "", "hello"]
9708  *     "hello".rpartition(/.l/)        #=> ["he", "ll", "o"]
9709  */
9710 
9711 static VALUE
rb_str_rpartition(VALUE str,VALUE sep)9712 rb_str_rpartition(VALUE str, VALUE sep)
9713 {
9714     long pos = RSTRING_LEN(str);
9715     int regex = FALSE;
9716 
9717     if (RB_TYPE_P(sep, T_REGEXP)) {
9718 	pos = rb_reg_search(sep, str, pos, 1);
9719 	regex = TRUE;
9720     }
9721     else {
9722 	VALUE tmp;
9723 
9724 	tmp = rb_check_string_type(sep);
9725 	if (NIL_P(tmp)) {
9726 	    rb_raise(rb_eTypeError, "type mismatch: %s given",
9727 		     rb_obj_classname(sep));
9728 	}
9729 	sep = tmp;
9730 	pos = rb_str_sublen(str, pos);
9731 	pos = rb_str_rindex(str, sep, pos);
9732     }
9733     if (pos < 0) {
9734        return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), rb_str_dup(str));
9735     }
9736     if (regex) {
9737 	sep = rb_reg_nth_match(0, rb_backref_get());
9738     }
9739     else {
9740 	pos = rb_str_offset(str, pos);
9741     }
9742     return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
9743 		          sep,
9744 		          rb_str_subseq(str, pos+RSTRING_LEN(sep),
9745 					RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
9746 }
9747 
9748 /*
9749  *  call-seq:
9750  *     str.start_with?([prefixes]+)   -> true or false
9751  *
9752  *  Returns true if +str+ starts with one of the +prefixes+ given.
9753  *  Each of the +prefixes+ should be a String or a Regexp.
9754  *
9755  *    "hello".start_with?("hell")               #=> true
9756  *    "hello".start_with?(/H/i)                 #=> true
9757  *
9758  *    # returns true if one of the prefixes matches.
9759  *    "hello".start_with?("heaven", "hell")     #=> true
9760  *    "hello".start_with?("heaven", "paradise") #=> false
9761  */
9762 
9763 static VALUE
rb_str_start_with(int argc,VALUE * argv,VALUE str)9764 rb_str_start_with(int argc, VALUE *argv, VALUE str)
9765 {
9766     int i;
9767 
9768     for (i=0; i<argc; i++) {
9769 	VALUE tmp = argv[i];
9770 	if (RB_TYPE_P(tmp, T_REGEXP)) {
9771 	    if (rb_reg_start_with_p(tmp, str))
9772 		return Qtrue;
9773 	}
9774 	else {
9775 	    StringValue(tmp);
9776 	    rb_enc_check(str, tmp);
9777 	    if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
9778 	    if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
9779 		return Qtrue;
9780 	}
9781     }
9782     return Qfalse;
9783 }
9784 
9785 /*
9786  *  call-seq:
9787  *     str.end_with?([suffixes]+)   -> true or false
9788  *
9789  *  Returns true if +str+ ends with one of the +suffixes+ given.
9790  *
9791  *    "hello".end_with?("ello")               #=> true
9792  *
9793  *    # returns true if one of the +suffixes+ matches.
9794  *    "hello".end_with?("heaven", "ello")     #=> true
9795  *    "hello".end_with?("heaven", "paradise") #=> false
9796  */
9797 
9798 static VALUE
rb_str_end_with(int argc,VALUE * argv,VALUE str)9799 rb_str_end_with(int argc, VALUE *argv, VALUE str)
9800 {
9801     int i;
9802     char *p, *s, *e;
9803     rb_encoding *enc;
9804 
9805     for (i=0; i<argc; i++) {
9806 	VALUE tmp = argv[i];
9807 	StringValue(tmp);
9808 	enc = rb_enc_check(str, tmp);
9809 	if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
9810 	p = RSTRING_PTR(str);
9811         e = p + RSTRING_LEN(str);
9812 	s = e - RSTRING_LEN(tmp);
9813 	if (rb_enc_left_char_head(p, s, e, enc) != s)
9814 	    continue;
9815 	if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
9816 	    return Qtrue;
9817     }
9818     return Qfalse;
9819 }
9820 
9821 /*!
9822  * Returns the length of the <i>prefix</i> to be deleted in the given <i>str</i>,
9823  * returning 0 if <i>str</i> does not start with the <i>prefix</i>.
9824  *
9825  * @param str the target
9826  * @param prefix the prefix
9827  * @retval 0 if the given <i>str</i> does not start with the given <i>prefix</i>
9828  * @retval Positive-Integer otherwise
9829  */
9830 static long
deleted_prefix_length(VALUE str,VALUE prefix)9831 deleted_prefix_length(VALUE str, VALUE prefix)
9832 {
9833     char *strptr, *prefixptr;
9834     long olen, prefixlen;
9835 
9836     StringValue(prefix);
9837     if (is_broken_string(prefix)) return 0;
9838     rb_enc_check(str, prefix);
9839 
9840     /* return 0 if not start with prefix */
9841     prefixlen = RSTRING_LEN(prefix);
9842     if (prefixlen <= 0) return 0;
9843     olen = RSTRING_LEN(str);
9844     if (olen < prefixlen) return 0;
9845     strptr = RSTRING_PTR(str);
9846     prefixptr = RSTRING_PTR(prefix);
9847     if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
9848 
9849     return prefixlen;
9850 }
9851 
9852 /*
9853  *  call-seq:
9854  *     str.delete_prefix!(prefix) -> self or nil
9855  *
9856  *  Deletes leading <code>prefix</code> from <i>str</i>, returning
9857  *  <code>nil</code> if no change was made.
9858  *
9859  *     "hello".delete_prefix!("hel") #=> "lo"
9860  *     "hello".delete_prefix!("llo") #=> nil
9861  */
9862 
9863 static VALUE
rb_str_delete_prefix_bang(VALUE str,VALUE prefix)9864 rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
9865 {
9866     long prefixlen;
9867     str_modify_keep_cr(str);
9868 
9869     prefixlen = deleted_prefix_length(str, prefix);
9870     if (prefixlen <= 0) return Qnil;
9871 
9872     return rb_str_drop_bytes(str, prefixlen);
9873 }
9874 
9875 /*
9876  *  call-seq:
9877  *     str.delete_prefix(prefix) -> new_str
9878  *
9879  *  Returns a copy of <i>str</i> with leading <code>prefix</code> deleted.
9880  *
9881  *     "hello".delete_prefix("hel") #=> "lo"
9882  *     "hello".delete_prefix("llo") #=> "hello"
9883  */
9884 
9885 static VALUE
rb_str_delete_prefix(VALUE str,VALUE prefix)9886 rb_str_delete_prefix(VALUE str, VALUE prefix)
9887 {
9888     long prefixlen;
9889 
9890     prefixlen = deleted_prefix_length(str, prefix);
9891     if (prefixlen <= 0) return rb_str_dup(str);
9892 
9893     return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
9894 }
9895 
9896 /*!
9897  * Returns the length of the <i>suffix</i> to be deleted in the given <i>str</i>,
9898  * returning 0 if <i>str</i> does not end with the <i>suffix</i>.
9899  *
9900  * @param str the target
9901  * @param suffix the suffix
9902  * @retval 0 if the given <i>str</i> does not end with the given <i>suffix</i>
9903  * @retval Positive-Integer otherwise
9904  */
9905 static long
deleted_suffix_length(VALUE str,VALUE suffix)9906 deleted_suffix_length(VALUE str, VALUE suffix)
9907 {
9908     char *strptr, *suffixptr, *s;
9909     long olen, suffixlen;
9910     rb_encoding *enc;
9911 
9912     StringValue(suffix);
9913     if (is_broken_string(suffix)) return 0;
9914     enc = rb_enc_check(str, suffix);
9915 
9916     /* return 0 if not start with suffix */
9917     suffixlen = RSTRING_LEN(suffix);
9918     if (suffixlen <= 0) return 0;
9919     olen = RSTRING_LEN(str);
9920     if (olen < suffixlen) return 0;
9921     strptr = RSTRING_PTR(str);
9922     suffixptr = RSTRING_PTR(suffix);
9923     s = strptr + olen - suffixlen;
9924     if (memcmp(s, suffixptr, suffixlen) != 0) return 0;
9925     if (rb_enc_left_char_head(strptr, s, strptr + olen, enc) != s) return 0;
9926 
9927     return suffixlen;
9928 }
9929 
9930 /*
9931  *  call-seq:
9932  *     str.delete_suffix!(suffix) -> self or nil
9933  *
9934  *  Deletes trailing <code>suffix</code> from <i>str</i>, returning
9935  *  <code>nil</code> if no change was made.
9936  *
9937  *     "hello".delete_suffix!("llo") #=> "he"
9938  *     "hello".delete_suffix!("hel") #=> nil
9939  */
9940 
9941 static VALUE
rb_str_delete_suffix_bang(VALUE str,VALUE suffix)9942 rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
9943 {
9944     long olen, suffixlen, len;
9945     str_modifiable(str);
9946 
9947     suffixlen = deleted_suffix_length(str, suffix);
9948     if (suffixlen <= 0) return Qnil;
9949 
9950     olen = RSTRING_LEN(str);
9951     str_modify_keep_cr(str);
9952     len = olen - suffixlen;
9953     STR_SET_LEN(str, len);
9954     TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9955     if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9956 	ENC_CODERANGE_CLEAR(str);
9957     }
9958     return str;
9959 }
9960 
9961 /*
9962  *  call-seq:
9963  *     str.delete_suffix(suffix) -> new_str
9964  *
9965  *  Returns a copy of <i>str</i> with trailing <code>suffix</code> deleted.
9966  *
9967  *     "hello".delete_suffix("llo") #=> "he"
9968  *     "hello".delete_suffix("hel") #=> "hello"
9969  */
9970 
9971 static VALUE
rb_str_delete_suffix(VALUE str,VALUE suffix)9972 rb_str_delete_suffix(VALUE str, VALUE suffix)
9973 {
9974     long suffixlen;
9975 
9976     suffixlen = deleted_suffix_length(str, suffix);
9977     if (suffixlen <= 0) return rb_str_dup(str);
9978 
9979     return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
9980 }
9981 
9982 void
rb_str_setter(VALUE val,ID id,VALUE * var)9983 rb_str_setter(VALUE val, ID id, VALUE *var)
9984 {
9985     if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
9986 	rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
9987     }
9988     *var = val;
9989 }
9990 
9991 static void
rb_fs_setter(VALUE val,ID id,VALUE * var)9992 rb_fs_setter(VALUE val, ID id, VALUE *var)
9993 {
9994     val = rb_fs_check(val);
9995     if (!val) {
9996 	rb_raise(rb_eTypeError,
9997 		 "value of %"PRIsVALUE" must be String or Regexp",
9998 		 rb_id2str(id));
9999     }
10000     *var = val;
10001 }
10002 
10003 
10004 /*
10005  *  call-seq:
10006  *     str.force_encoding(encoding)   -> str
10007  *
10008  *  Changes the encoding to +encoding+ and returns self.
10009  */
10010 
10011 static VALUE
rb_str_force_encoding(VALUE str,VALUE enc)10012 rb_str_force_encoding(VALUE str, VALUE enc)
10013 {
10014     str_modifiable(str);
10015     rb_enc_associate(str, rb_to_encoding(enc));
10016     ENC_CODERANGE_CLEAR(str);
10017     return str;
10018 }
10019 
10020 /*
10021  *  call-seq:
10022  *     str.b   -> str
10023  *
10024  *  Returns a copied string whose encoding is ASCII-8BIT.
10025  */
10026 
10027 static VALUE
rb_str_b(VALUE str)10028 rb_str_b(VALUE str)
10029 {
10030     VALUE str2 = str_alloc(rb_cString);
10031     str_replace_shared_without_enc(str2, str);
10032     OBJ_INFECT_RAW(str2, str);
10033     ENC_CODERANGE_CLEAR(str2);
10034     return str2;
10035 }
10036 
10037 /*
10038  *  call-seq:
10039  *     str.valid_encoding?  -> true or false
10040  *
10041  *  Returns true for a string which is encoded correctly.
10042  *
10043  *    "\xc2\xa1".force_encoding("UTF-8").valid_encoding?  #=> true
10044  *    "\xc2".force_encoding("UTF-8").valid_encoding?      #=> false
10045  *    "\x80".force_encoding("UTF-8").valid_encoding?      #=> false
10046  */
10047 
10048 static VALUE
rb_str_valid_encoding_p(VALUE str)10049 rb_str_valid_encoding_p(VALUE str)
10050 {
10051     int cr = rb_enc_str_coderange(str);
10052 
10053     return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
10054 }
10055 
10056 /*
10057  *  call-seq:
10058  *     str.ascii_only?  -> true or false
10059  *
10060  *  Returns true for a string which has only ASCII characters.
10061  *
10062  *    "abc".force_encoding("UTF-8").ascii_only?          #=> true
10063  *    "abc\u{6666}".force_encoding("UTF-8").ascii_only?  #=> false
10064  */
10065 
10066 static VALUE
rb_str_is_ascii_only_p(VALUE str)10067 rb_str_is_ascii_only_p(VALUE str)
10068 {
10069     int cr = rb_enc_str_coderange(str);
10070 
10071     return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
10072 }
10073 
10074 /**
10075  * Shortens _str_ and adds three dots, an ellipsis, if it is longer
10076  * than _len_ characters.
10077  *
10078  * \param str	the string to ellipsize.
10079  * \param len	the maximum string length.
10080  * \return	the ellipsized string.
10081  * \pre 	_len_ must not be negative.
10082  * \post	the length of the returned string in characters is less than or equal to _len_.
10083  * \post	If the length of _str_ is less than or equal _len_, returns _str_ itself.
10084  * \post	the encoding of returned string is equal to the encoding of _str_.
10085  * \post	the class of returned string is equal to the class of _str_.
10086  * \note	the length is counted in characters.
10087  */
10088 VALUE
rb_str_ellipsize(VALUE str,long len)10089 rb_str_ellipsize(VALUE str, long len)
10090 {
10091     static const char ellipsis[] = "...";
10092     const long ellipsislen = sizeof(ellipsis) - 1;
10093     rb_encoding *const enc = rb_enc_get(str);
10094     const long blen = RSTRING_LEN(str);
10095     const char *const p = RSTRING_PTR(str), *e = p + blen;
10096     VALUE estr, ret = 0;
10097 
10098     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
10099     if (len * rb_enc_mbminlen(enc) >= blen ||
10100 	(e = rb_enc_nth(p, e, len, enc)) - p == blen) {
10101 	ret = str;
10102     }
10103     else if (len <= ellipsislen ||
10104 	     !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
10105 	if (rb_enc_asciicompat(enc)) {
10106 	    ret = rb_str_new_with_class(str, ellipsis, len);
10107 	    rb_enc_associate(ret, enc);
10108 	}
10109 	else {
10110 	    estr = rb_usascii_str_new(ellipsis, len);
10111 	    ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
10112 	}
10113     }
10114     else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
10115 	rb_str_cat(ret, ellipsis, ellipsislen);
10116     }
10117     else {
10118 	estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
10119 			     rb_enc_from_encoding(enc), 0, Qnil);
10120 	rb_str_append(ret, estr);
10121     }
10122     return ret;
10123 }
10124 
10125 static VALUE
str_compat_and_valid(VALUE str,rb_encoding * enc)10126 str_compat_and_valid(VALUE str, rb_encoding *enc)
10127 {
10128     int cr;
10129     str = StringValue(str);
10130     cr = rb_enc_str_coderange(str);
10131     if (cr == ENC_CODERANGE_BROKEN) {
10132 	rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
10133     }
10134     else {
10135 	rb_encoding *e = STR_ENC_GET(str);
10136 	if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
10137 	    rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
10138 		     rb_enc_name(enc), rb_enc_name(e));
10139 	}
10140     }
10141     return str;
10142 }
10143 
10144 static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
10145 
10146 /**
10147  * @param str the string to be scrubbed
10148  * @param repl the replacement character
10149  * @return If given string is invalid, returns a new string. Otherwise, returns Qnil.
10150  */
10151 VALUE
rb_str_scrub(VALUE str,VALUE repl)10152 rb_str_scrub(VALUE str, VALUE repl)
10153 {
10154     rb_encoding *enc = STR_ENC_GET(str);
10155     return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
10156 }
10157 
10158 VALUE
rb_enc_str_scrub(rb_encoding * enc,VALUE str,VALUE repl)10159 rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
10160 {
10161     int cr = ENC_CODERANGE_UNKNOWN;
10162     if (enc == STR_ENC_GET(str)) {
10163 	/* cached coderange makes sense only when enc equals the
10164 	 * actual encoding of str */
10165 	cr = ENC_CODERANGE(str);
10166     }
10167     return enc_str_scrub(enc, str, repl, cr);
10168 }
10169 
/*
 * Builds a scrubbed copy of str, interpreting its bytes in encoding enc.
 *
 * enc:  encoding used to scan str.
 * str:  the string to scan; its bytes are not changed here (its cached
 *       code range may be set when no invalid byte is found).
 * repl: Qnil, or the replacement string; must be Qnil when a block is
 *       given (ArgumentError otherwise).  It is validated with
 *       str_compat_and_valid(), which may raise.
 * cr:   code range to assume for str under enc.
 *
 * Returns Qnil when nothing has to be replaced (cr is already clean, enc
 * is a dummy encoding, or the scan finds no invalid byte); otherwise
 * returns a new string with each invalid sequence replaced by repl, by the
 * block's result, or by a per-encoding default.
 */
static VALUE
enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
{
    int encidx;
    VALUE buf = Qnil;	/* output buffer, allocated on the first invalid byte */
    const char *rep;
    long replen = -1;	/* -1: not chosen yet, 0: replacement comes from the block */
    int tainted = 0;

    if (rb_block_given_p()) {
	if (!NIL_P(repl))
	    rb_raise(rb_eArgError, "both of block and replacement given");
	replen = 0;
    }

    if (ENC_CODERANGE_CLEAN_P(cr))
	return Qnil;

    if (!NIL_P(repl)) {
	repl = str_compat_and_valid(repl, enc);
	tainted = OBJ_TAINTED_RAW(repl);
    }

    if (rb_enc_dummy_p(enc)) {
	return Qnil;
    }
    encidx = rb_enc_to_index(enc);

/* Points rep/replen at a static copy of the literal, without its trailing NUL. */
#define DEFAULT_REPLACE_CHAR(str) do { \
	static const char replace[sizeof(str)-1] = str; \
	rep = replace; replen = (int)sizeof(replace); \
    } while (0)

    if (rb_enc_asciicompat(enc)) {
	const char *p = RSTRING_PTR(str);
	const char *e = RSTRING_END(str);
	const char *p1 = p;	/* start of the pending run not yet copied to buf */
	int rep7bit_p;		/* whether the fixed replacement is 7bit */
	if (!replen) {
	    /* block mode: no fixed replacement bytes */
	    rep = NULL;
	    rep7bit_p = FALSE;
	}
	else if (!NIL_P(repl)) {
	    rep = RSTRING_PTR(repl);
	    replen = RSTRING_LEN(repl);
	    rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
	}
	else if (encidx == rb_utf8_encindex()) {
	    DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
	    rep7bit_p = FALSE;
	}
	else {
	    DEFAULT_REPLACE_CHAR("?");
	    rep7bit_p = TRUE;
	}
	/* start from 7bit; upgraded to VALID once a multibyte char or non-7bit replacement is seen */
	cr = ENC_CODERANGE_7BIT;

	/* NOTE(review): search_nonascii presumably returns the first non-ASCII byte, or NULL if none -- confirm */
	p = search_nonascii(p, e);
	if (!p) {
	    p = e;
	}
	while (p < e) {
	    int ret = rb_enc_precise_mbclen(p, e, enc);
	    if (MBCLEN_NEEDMORE_P(ret)) {
		/* incomplete trailing sequence; handled after the loop (p < e) */
		break;
	    }
	    else if (MBCLEN_CHARFOUND_P(ret)) {
		cr = ENC_CODERANGE_VALID;
		p += MBCLEN_CHARFOUND_LEN(ret);
	    }
	    else if (MBCLEN_INVALID_P(ret)) {
		/*
		 * p1~p: valid ascii/multibyte chars
		 * p ~e: invalid bytes + unknown bytes
		 */
		long clen = rb_enc_mbmaxlen(enc);
		if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
		if (p > p1) {
		    rb_str_buf_cat(buf, p1, p - p1);
		}

		/*
		 * Decide how many bytes (clen) this one replacement covers:
		 * shrink the window until a prefix is reported as "need
		 * more", falling back to a single byte.
		 */
		if (e - p < clen) clen = e - p;
		if (clen <= 2) {
		    clen = 1;
		}
		else {
		    const char *q = p;
		    clen--;
		    for (; clen > 1; clen--) {
			ret = rb_enc_precise_mbclen(q, q + clen, enc);
			if (MBCLEN_NEEDMORE_P(ret)) break;
			if (MBCLEN_INVALID_P(ret)) continue;
			UNREACHABLE;
		    }
		}
		if (rep) {
		    rb_str_buf_cat(buf, rep, replen);
		    if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
		}
		else {
		    /* block mode: yield the invalid bytes, validate and append the result */
		    repl = rb_yield(rb_enc_str_new(p, clen, enc));
		    repl = str_compat_and_valid(repl, enc);
		    tainted |= OBJ_TAINTED_RAW(repl);
		    rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
		    if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
			cr = ENC_CODERANGE_VALID;
		}
		p += clen;
		p1 = p;
		p = search_nonascii(p, e);
		if (!p) {
		    p = e;
		    break;
		}
	    }
	    else {
		UNREACHABLE;
	    }
	}
	if (NIL_P(buf)) {
	    if (p == e) {
		/* nothing was invalid: remember the computed code range on str */
		ENC_CODERANGE_SET(str, cr);
		return Qnil;
	    }
	    buf = rb_str_buf_new(RSTRING_LEN(str));
	}
	/* copy the remaining run that was scanned but not yet appended */
	if (p1 < p) {
	    rb_str_buf_cat(buf, p1, p - p1);
	}
	/* p < e only after the "need more" break: replace the incomplete tail once */
	if (p < e) {
	    if (rep) {
		rb_str_buf_cat(buf, rep, replen);
		if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
	    }
	    else {
		repl = rb_yield(rb_enc_str_new(p, e-p, enc));
		repl = str_compat_and_valid(repl, enc);
		tainted |= OBJ_TAINTED_RAW(repl);
		rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
		if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
		    cr = ENC_CODERANGE_VALID;
	    }
	}
    }
    else {
	/* ASCII incompatible */
	const char *p = RSTRING_PTR(str);
	const char *e = RSTRING_END(str);
	const char *p1 = p;	/* start of the pending run not yet copied to buf */
	long mbminlen = rb_enc_mbminlen(enc);
	if (!replen) {
	    /* block mode: no fixed replacement bytes */
	    rep = NULL;
	}
	else if (!NIL_P(repl)) {
	    rep = RSTRING_PTR(repl);
	    replen = RSTRING_LEN(repl);
	}
	else if (encidx == ENCINDEX_UTF_16BE) {
	    DEFAULT_REPLACE_CHAR("\xFF\xFD");
	}
	else if (encidx == ENCINDEX_UTF_16LE) {
	    DEFAULT_REPLACE_CHAR("\xFD\xFF");
	}
	else if (encidx == ENCINDEX_UTF_32BE) {
	    DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
	}
	else if (encidx == ENCINDEX_UTF_32LE) {
	    DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
	}
	else {
	    DEFAULT_REPLACE_CHAR("?");
	}

	while (p < e) {
	    int ret = rb_enc_precise_mbclen(p, e, enc);
	    if (MBCLEN_NEEDMORE_P(ret)) {
		/* incomplete trailing sequence; handled after the loop (p < e) */
		break;
	    }
	    else if (MBCLEN_CHARFOUND_P(ret)) {
		p += MBCLEN_CHARFOUND_LEN(ret);
	    }
	    else if (MBCLEN_INVALID_P(ret)) {
		const char *q = p;
		long clen = rb_enc_mbmaxlen(enc);
		if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
		if (p > p1) rb_str_buf_cat(buf, p1, p - p1);

		/* same window shrinking as the ASCII-compatible branch, in steps of mbminlen bytes */
		if (e - p < clen) clen = e - p;
		if (clen <= mbminlen * 2) {
		    clen = mbminlen;
		}
		else {
		    clen -= mbminlen;
		    for (; clen > mbminlen; clen-=mbminlen) {
			ret = rb_enc_precise_mbclen(q, q + clen, enc);
			if (MBCLEN_NEEDMORE_P(ret)) break;
			if (MBCLEN_INVALID_P(ret)) continue;
			UNREACHABLE;
		    }
		}
		if (rep) {
		    rb_str_buf_cat(buf, rep, replen);
		}
		else {
		    /* block mode: yield the invalid bytes, validate and append the result */
		    repl = rb_yield(rb_enc_str_new(p, clen, enc));
		    repl = str_compat_and_valid(repl, enc);
		    tainted |= OBJ_TAINTED_RAW(repl);
		    rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
		}
		p += clen;
		p1 = p;
	    }
	    else {
		UNREACHABLE;
	    }
	}
	if (NIL_P(buf)) {
	    if (p == e) {
		/* nothing was invalid: mark str as VALID */
		ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
		return Qnil;
	    }
	    buf = rb_str_buf_new(RSTRING_LEN(str));
	}
	/* copy the remaining run that was scanned but not yet appended */
	if (p1 < p) {
	    rb_str_buf_cat(buf, p1, p - p1);
	}
	/* p < e only after the "need more" break: replace the incomplete tail once */
	if (p < e) {
	    if (rep) {
		rb_str_buf_cat(buf, rep, replen);
	    }
	    else {
		repl = rb_yield(rb_enc_str_new(p, e-p, enc));
		repl = str_compat_and_valid(repl, enc);
		tainted |= OBJ_TAINTED_RAW(repl);
		rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
	    }
	}
	cr = ENC_CODERANGE_VALID;
    }
    /* the result is tainted if str or any replacement used was tainted */
    FL_SET_RAW(buf, tainted|OBJ_TAINTED_RAW(str));
    ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
    return buf;
}
10413 
10414 /*
10415  *  call-seq:
10416  *    str.scrub -> new_str
10417  *    str.scrub(repl) -> new_str
10418  *    str.scrub{|bytes|} -> new_str
10419  *
 *  If the string is an invalid byte sequence, replaces the invalid bytes with the given
 *  replacement character; otherwise returns a copy of the receiver.
10422  *  If block is given, replace invalid bytes with returned value of the block.
10423  *
10424  *     "abc\u3042\x81".scrub #=> "abc\u3042\uFFFD"
10425  *     "abc\u3042\x81".scrub("*") #=> "abc\u3042*"
10426  *     "abc\u3042\xE3\x80".scrub{|bytes| '<'+bytes.unpack('H*')[0]+'>' } #=> "abc\u3042<e380>"
10427  */
10428 static VALUE
str_scrub(int argc,VALUE * argv,VALUE str)10429 str_scrub(int argc, VALUE *argv, VALUE str)
10430 {
10431     VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
10432     VALUE new = rb_str_scrub(str, repl);
10433     return NIL_P(new) ? rb_str_dup(str): new;
10434 }
10435 
10436 /*
10437  *  call-seq:
10438  *    str.scrub! -> str
10439  *    str.scrub!(repl) -> str
10440  *    str.scrub!{|bytes|} -> str
10441  *
 *  If the string is an invalid byte sequence, replaces the invalid bytes with the given
 *  replacement character in place. Always returns self.
10444  *  If block is given, replace invalid bytes with returned value of the block.
10445  *
10446  *     "abc\u3042\x81".scrub! #=> "abc\u3042\uFFFD"
10447  *     "abc\u3042\x81".scrub!("*") #=> "abc\u3042*"
10448  *     "abc\u3042\xE3\x80".scrub!{|bytes| '<'+bytes.unpack('H*')[0]+'>' } #=> "abc\u3042<e380>"
10449  */
10450 static VALUE
str_scrub_bang(int argc,VALUE * argv,VALUE str)10451 str_scrub_bang(int argc, VALUE *argv, VALUE str)
10452 {
10453     VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
10454     VALUE new = rb_str_scrub(str, repl);
10455     if (!NIL_P(new)) rb_str_replace(str, new);
10456     return str;
10457 }
10458 
10459 static ID id_normalize;
10460 static ID id_normalized_p;
10461 static VALUE mUnicodeNormalize;
10462 
10463 static VALUE
unicode_normalize_common(int argc,VALUE * argv,VALUE str,ID id)10464 unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
10465 {
10466     static int UnicodeNormalizeRequired = 0;
10467     VALUE argv2[2];
10468 
10469     if (!UnicodeNormalizeRequired) {
10470 	rb_require("unicode_normalize/normalize.rb");
10471 	UnicodeNormalizeRequired = 1;
10472     }
10473     argv2[0] = str;
10474     if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
10475     return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
10476 }
10477 
10478 /*
10479  *  call-seq:
10480  *    str.unicode_normalize(form=:nfc)
10481  *
10482  *  Unicode Normalization---Returns a normalized form of +str+,
10483  *  using Unicode normalizations NFC, NFD, NFKC, or NFKD.
10484  *  The normalization form used is determined by +form+, which can
10485  *  be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
10486  *  The default is +:nfc+.
10487  *
10488  *  If the string is not in a Unicode Encoding, then an Exception is raised.
10489  *  In this context, 'Unicode Encoding' means any of UTF-8, UTF-16BE/LE,
10490  *  and UTF-32BE/LE, as well as GB18030, UCS_2BE, and UCS_4BE.
10491  *  Anything other than UTF-8 is implemented by converting to UTF-8,
10492  *  which makes it slower than UTF-8.
10493  *
10494  *    "a\u0300".unicode_normalize        #=> "\u00E0"
10495  *    "a\u0300".unicode_normalize(:nfc)  #=> "\u00E0"
10496  *    "\u00E0".unicode_normalize(:nfd)   #=> "a\u0300"
10497  *    "\xE0".force_encoding('ISO-8859-1').unicode_normalize(:nfd)
10498  *                                       #=> Encoding::CompatibilityError raised
10499  */
10500 static VALUE
rb_str_unicode_normalize(int argc,VALUE * argv,VALUE str)10501 rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
10502 {
10503     return unicode_normalize_common(argc, argv, str, id_normalize);
10504 }
10505 
10506 /*
10507  *  call-seq:
10508  *    str.unicode_normalize!(form=:nfc)
10509  *
10510  *  Destructive version of String#unicode_normalize, doing Unicode
10511  *  normalization in place.
10512  */
10513 static VALUE
rb_str_unicode_normalize_bang(int argc,VALUE * argv,VALUE str)10514 rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
10515 {
10516     return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
10517 }
10518 
10519 /*  call-seq:
10520  *    str.unicode_normalized?(form=:nfc)
10521  *
10522  *  Checks whether +str+ is in Unicode normalization form +form+,
10523  *  which can be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
10524  *  The default is +:nfc+.
10525  *
10526  *  If the string is not in a Unicode Encoding, then an Exception is raised.
10527  *  For details, see String#unicode_normalize.
10528  *
10529  *    "a\u0300".unicode_normalized?        #=> false
10530  *    "a\u0300".unicode_normalized?(:nfd)  #=> true
10531  *    "\u00E0".unicode_normalized?         #=> true
10532  *    "\u00E0".unicode_normalized?(:nfd)   #=> false
10533  *    "\xE0".force_encoding('ISO-8859-1').unicode_normalized?
10534  *                                         #=> Encoding::CompatibilityError raised
10535  */
10536 static VALUE
rb_str_unicode_normalized_p(int argc,VALUE * argv,VALUE str)10537 rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
10538 {
10539     return unicode_normalize_common(argc, argv, str, id_normalized_p);
10540 }
10541 
10542 /**********************************************************************
10543  * Document-class: Symbol
10544  *
10545  *  <code>Symbol</code> objects represent names and some strings
10546  *  inside the Ruby
10547  *  interpreter. They are generated using the <code>:name</code> and
10548  *  <code>:"string"</code> literals
10549  *  syntax, and by the various <code>to_sym</code> methods. The same
10550  *  <code>Symbol</code> object will be created for a given name or string
10551  *  for the duration of a program's execution, regardless of the context
10552  *  or meaning of that name. Thus if <code>Fred</code> is a constant in
10553  *  one context, a method in another, and a class in a third, the
10554  *  <code>Symbol</code> <code>:Fred</code> will be the same object in
10555  *  all three contexts.
10556  *
10557  *     module One
10558  *       class Fred
10559  *       end
10560  *       $f1 = :Fred
10561  *     end
10562  *     module Two
10563  *       Fred = 1
10564  *       $f2 = :Fred
10565  *     end
10566  *     def Fred()
10567  *     end
10568  *     $f3 = :Fred
10569  *     $f1.object_id   #=> 2514190
10570  *     $f2.object_id   #=> 2514190
10571  *     $f3.object_id   #=> 2514190
10572  *
10573  */
10574 
10575 
10576 /*
10577  *  call-seq:
10578  *     sym == obj   -> true or false
10579  *
10580  *  Equality---If <i>sym</i> and <i>obj</i> are exactly the same
10581  *  symbol, returns <code>true</code>.
10582  */
10583 
10584 #define sym_equal rb_obj_equal
10585 
10586 static int
sym_printable(const char * s,const char * send,rb_encoding * enc)10587 sym_printable(const char *s, const char *send, rb_encoding *enc)
10588 {
10589     while (s < send) {
10590 	int n;
10591 	int c = rb_enc_precise_mbclen(s, send, enc);
10592 
10593 	if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
10594 	n = MBCLEN_CHARFOUND_LEN(c);
10595 	c = rb_enc_mbc_to_codepoint(s, send, enc);
10596 	if (!rb_enc_isprint(c, enc)) return FALSE;
10597 	s += n;
10598     }
10599     return TRUE;
10600 }
10601 
10602 int
rb_str_symname_p(VALUE sym)10603 rb_str_symname_p(VALUE sym)
10604 {
10605     rb_encoding *enc;
10606     const char *ptr;
10607     long len;
10608     rb_encoding *resenc = rb_default_internal_encoding();
10609 
10610     if (resenc == NULL) resenc = rb_default_external_encoding();
10611     enc = STR_ENC_GET(sym);
10612     ptr = RSTRING_PTR(sym);
10613     len = RSTRING_LEN(sym);
10614     if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
10615 	!rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
10616 	return FALSE;
10617     }
10618     return TRUE;
10619 }
10620 
10621 VALUE
rb_str_quote_unprintable(VALUE str)10622 rb_str_quote_unprintable(VALUE str)
10623 {
10624     rb_encoding *enc;
10625     const char *ptr;
10626     long len;
10627     rb_encoding *resenc;
10628 
10629     Check_Type(str, T_STRING);
10630     resenc = rb_default_internal_encoding();
10631     if (resenc == NULL) resenc = rb_default_external_encoding();
10632     enc = STR_ENC_GET(str);
10633     ptr = RSTRING_PTR(str);
10634     len = RSTRING_LEN(str);
10635     if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
10636 	!sym_printable(ptr, ptr + len, enc)) {
10637 	return rb_str_inspect(str);
10638     }
10639     return str;
10640 }
10641 
10642 MJIT_FUNC_EXPORTED VALUE
rb_id_quote_unprintable(ID id)10643 rb_id_quote_unprintable(ID id)
10644 {
10645     VALUE str = rb_id2str(id);
10646     if (!rb_str_symname_p(str)) {
10647 	return rb_str_inspect(str);
10648     }
10649     return str;
10650 }
10651 
10652 /*
10653  *  call-seq:
10654  *     sym.inspect    -> string
10655  *
10656  *  Returns the representation of <i>sym</i> as a symbol literal.
10657  *
10658  *     :fred.inspect   #=> ":fred"
10659  */
10660 
10661 static VALUE
sym_inspect(VALUE sym)10662 sym_inspect(VALUE sym)
10663 {
10664     VALUE str = rb_sym2str(sym);
10665     const char *ptr;
10666     long len;
10667     char *dest;
10668 
10669     if (!rb_str_symname_p(str)) {
10670 	str = rb_str_inspect(str);
10671 	len = RSTRING_LEN(str);
10672 	rb_str_resize(str, len + 1);
10673 	dest = RSTRING_PTR(str);
10674 	memmove(dest + 1, dest, len);
10675     }
10676     else {
10677 	rb_encoding *enc = STR_ENC_GET(str);
10678 	RSTRING_GETMEM(str, ptr, len);
10679 	str = rb_enc_str_new(0, len + 1, enc);
10680 	dest = RSTRING_PTR(str);
10681 	memcpy(dest + 1, ptr, len);
10682     }
10683     dest[0] = ':';
10684     return str;
10685 }
10686 
10687 
10688 /*
10689  *  call-seq:
10690  *     sym.id2name   -> string
10691  *     sym.to_s      -> string
10692  *
10693  *  Returns the name or string corresponding to <i>sym</i>.
10694  *
10695  *     :fred.id2name   #=> "fred"
10696  *     :ginger.to_s    #=> "ginger"
10697  */
10698 
10699 
10700 VALUE
rb_sym_to_s(VALUE sym)10701 rb_sym_to_s(VALUE sym)
10702 {
10703     return str_new_shared(rb_cString, rb_sym2str(sym));
10704 }
10705 
10706 
10707 /*
10708  * call-seq:
10709  *   sym.to_sym   -> sym
10710  *   sym.intern   -> sym
10711  *
10712  * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding
10713  * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
10714  * in this case.
10715  */
10716 
static VALUE
sym_to_sym(VALUE sym)
{
    /* the receiver is already a symbol, so it is returned unchanged */
    return sym;
}
10722 
10723 MJIT_FUNC_EXPORTED VALUE
rb_sym_proc_call(ID mid,int argc,const VALUE * argv,VALUE passed_proc)10724 rb_sym_proc_call(ID mid, int argc, const VALUE *argv, VALUE passed_proc)
10725 {
10726     VALUE obj;
10727 
10728     if (argc < 1) {
10729 	rb_raise(rb_eArgError, "no receiver given");
10730     }
10731     obj = argv[0];
10732     return rb_funcall_with_block(obj, mid, argc - 1, argv + 1, passed_proc);
10733 }
10734 
#if 0
/*
 * call-seq:
 *   sym.to_proc
 *
 * Returns a _Proc_ object which responds to the given method by _sym_.
 *
 *   (1..3).collect(&:to_s)  #=> ["1", "2", "3"]
 */

VALUE
rb_sym_to_proc(VALUE sym)
{
    /*
     * Intentionally empty: this stub is inside "#if 0" and is never
     * compiled.  NOTE(review): presumably it exists only to carry the
     * documentation comment above, with the real implementation defined
     * elsewhere -- confirm.
     */
}
#endif
10750 
10751 /*
10752  * call-seq:
10753  *
10754  *   sym.succ
10755  *
10756  * Same as <code>sym.to_s.succ.intern</code>.
10757  */
10758 
10759 static VALUE
sym_succ(VALUE sym)10760 sym_succ(VALUE sym)
10761 {
10762     return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
10763 }
10764 
10765 /*
10766  * call-seq:
10767  *
10768  *   symbol <=> other_symbol       -> -1, 0, +1, or nil
10769  *
10770  * Compares +symbol+ with +other_symbol+ after calling #to_s on each of the
10771  * symbols. Returns -1, 0, +1, or +nil+ depending on whether +symbol+ is
10772  * less than, equal to, or greater than +other_symbol+.
10773  *
10774  * +nil+ is returned if the two values are incomparable.
10775  *
10776  * See String#<=> for more information.
10777  */
10778 
10779 static VALUE
sym_cmp(VALUE sym,VALUE other)10780 sym_cmp(VALUE sym, VALUE other)
10781 {
10782     if (!SYMBOL_P(other)) {
10783 	return Qnil;
10784     }
10785     return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
10786 }
10787 
10788 /*
10789  * call-seq:
10790  *   sym.casecmp(other_symbol)   -> -1, 0, +1, or nil
10791  *
10792  * Case-insensitive version of <code>Symbol#<=></code>.
10793  * Currently, case-insensitivity only works on characters A-Z/a-z,
10794  * not all of Unicode. This is different from Symbol#casecmp?.
10795  *
10796  *   :aBcDeF.casecmp(:abcde)     #=> 1
10797  *   :aBcDeF.casecmp(:abcdef)    #=> 0
10798  *   :aBcDeF.casecmp(:abcdefg)   #=> -1
10799  *   :abcdef.casecmp(:ABCDEF)    #=> 0
10800  *
10801  * +nil+ is returned if the two symbols have incompatible encodings,
10802  * or if +other_symbol+ is not a symbol.
10803  *
10804  *   :foo.casecmp(2)   #=> nil
10805  *   "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym.casecmp(:"\u{c4 d6 dc}")   #=> nil
10806  */
10807 
10808 static VALUE
sym_casecmp(VALUE sym,VALUE other)10809 sym_casecmp(VALUE sym, VALUE other)
10810 {
10811     if (!SYMBOL_P(other)) {
10812 	return Qnil;
10813     }
10814     return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
10815 }
10816 
10817 /*
10818  * call-seq:
10819  *   sym.casecmp?(other_symbol)   -> true, false, or nil
10820  *
10821  * Returns +true+ if +sym+ and +other_symbol+ are equal after
10822  * Unicode case folding, +false+ if they are not equal.
10823  *
10824  *   :aBcDeF.casecmp?(:abcde)     #=> false
10825  *   :aBcDeF.casecmp?(:abcdef)    #=> true
10826  *   :aBcDeF.casecmp?(:abcdefg)   #=> false
10827  *   :abcdef.casecmp?(:ABCDEF)    #=> true
10828  *   :"\u{e4 f6 fc}".casecmp?(:"\u{c4 d6 dc}")   #=> true
10829  *
10830  * +nil+ is returned if the two symbols have incompatible encodings,
10831  * or if +other_symbol+ is not a symbol.
10832  *
10833  *   :foo.casecmp?(2)   #=> nil
10834  *   "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym.casecmp?(:"\u{c4 d6 dc}")   #=> nil
10835  */
10836 
10837 static VALUE
sym_casecmp_p(VALUE sym,VALUE other)10838 sym_casecmp_p(VALUE sym, VALUE other)
10839 {
10840     if (!SYMBOL_P(other)) {
10841 	return Qnil;
10842     }
10843     return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
10844 }
10845 
10846 /*
10847  * call-seq:
10848  *   sym =~ obj   -> integer or nil
10849  *
10850  * Returns <code>sym.to_s =~ obj</code>.
10851  */
10852 
10853 static VALUE
sym_match(VALUE sym,VALUE other)10854 sym_match(VALUE sym, VALUE other)
10855 {
10856     return rb_str_match(rb_sym2str(sym), other);
10857 }
10858 
10859 /*
10860  * call-seq:
10861  *   sym.match(pattern)        -> matchdata or nil
10862  *   sym.match(pattern, pos)   -> matchdata or nil
10863  *
10864  * Returns <code>sym.to_s.match</code>.
10865  */
10866 
10867 static VALUE
sym_match_m(int argc,VALUE * argv,VALUE sym)10868 sym_match_m(int argc, VALUE *argv, VALUE sym)
10869 {
10870     return rb_str_match_m(argc, argv, rb_sym2str(sym));
10871 }
10872 
10873 /*
10874  * call-seq:
10875  *   sym.match?(pattern)        -> true or false
10876  *   sym.match?(pattern, pos)   -> true or false
10877  *
10878  * Returns <code>sym.to_s.match?</code>.
10879  */
10880 
10881 static VALUE
sym_match_m_p(int argc,VALUE * argv,VALUE sym)10882 sym_match_m_p(int argc, VALUE *argv, VALUE sym)
10883 {
10884     return rb_str_match_m_p(argc, argv, sym);
10885 }
10886 
10887 /*
10888  * call-seq:
10889  *   sym[idx]      -> char
10890  *   sym[b, n]     -> string
10891  *   sym.slice(idx)      -> char
10892  *   sym.slice(b, n)     -> string
10893  *
10894  * Returns <code>sym.to_s[]</code>.
10895  */
10896 
10897 static VALUE
sym_aref(int argc,VALUE * argv,VALUE sym)10898 sym_aref(int argc, VALUE *argv, VALUE sym)
10899 {
10900     return rb_str_aref_m(argc, argv, rb_sym2str(sym));
10901 }
10902 
10903 /*
10904  * call-seq:
10905  *   sym.length   -> integer
10906  *   sym.size     -> integer
10907  *
10908  * Same as <code>sym.to_s.length</code>.
10909  */
10910 
10911 static VALUE
sym_length(VALUE sym)10912 sym_length(VALUE sym)
10913 {
10914     return rb_str_length(rb_sym2str(sym));
10915 }
10916 
10917 /*
10918  * call-seq:
10919  *   sym.empty?   -> true or false
10920  *
10921  * Returns whether _sym_ is :"" or not.
10922  */
10923 
10924 static VALUE
sym_empty(VALUE sym)10925 sym_empty(VALUE sym)
10926 {
10927     return rb_str_empty(rb_sym2str(sym));
10928 }
10929 
10930 /*
10931  * call-seq:
10932  *   sym.upcase              -> symbol
10933  *   sym.upcase([options])   -> symbol
10934  *
10935  * Same as <code>sym.to_s.upcase.intern</code>.
10936  */
10937 
10938 static VALUE
sym_upcase(int argc,VALUE * argv,VALUE sym)10939 sym_upcase(int argc, VALUE *argv, VALUE sym)
10940 {
10941     return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
10942 }
10943 
10944 /*
10945  * call-seq:
10946  *   sym.downcase              -> symbol
10947  *   sym.downcase([options])   -> symbol
10948  *
10949  * Same as <code>sym.to_s.downcase.intern</code>.
10950  */
10951 
10952 static VALUE
sym_downcase(int argc,VALUE * argv,VALUE sym)10953 sym_downcase(int argc, VALUE *argv, VALUE sym)
10954 {
10955     return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
10956 }
10957 
10958 /*
10959  * call-seq:
10960  *   sym.capitalize              -> symbol
10961  *   sym.capitalize([options])   -> symbol
10962  *
10963  * Same as <code>sym.to_s.capitalize.intern</code>.
10964  */
10965 
10966 static VALUE
sym_capitalize(int argc,VALUE * argv,VALUE sym)10967 sym_capitalize(int argc, VALUE *argv, VALUE sym)
10968 {
10969     return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
10970 }
10971 
10972 /*
10973  * call-seq:
10974  *   sym.swapcase              -> symbol
10975  *   sym.swapcase([options])   -> symbol
10976  *
10977  * Same as <code>sym.to_s.swapcase.intern</code>.
10978  */
10979 
10980 static VALUE
sym_swapcase(int argc,VALUE * argv,VALUE sym)10981 sym_swapcase(int argc, VALUE *argv, VALUE sym)
10982 {
10983     return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
10984 }
10985 
10986 /*
10987  * call-seq:
10988  *   sym.encoding   -> encoding
10989  *
10990  * Returns the Encoding object that represents the encoding of _sym_.
10991  */
10992 
10993 static VALUE
sym_encoding(VALUE sym)10994 sym_encoding(VALUE sym)
10995 {
10996     return rb_obj_encoding(rb_sym2str(sym));
10997 }
10998 
10999 static VALUE
string_for_symbol(VALUE name)11000 string_for_symbol(VALUE name)
11001 {
11002     if (!RB_TYPE_P(name, T_STRING)) {
11003 	VALUE tmp = rb_check_string_type(name);
11004 	if (NIL_P(tmp)) {
11005 	    rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol",
11006 		     name);
11007 	}
11008 	name = tmp;
11009     }
11010     return name;
11011 }
11012 
11013 ID
rb_to_id(VALUE name)11014 rb_to_id(VALUE name)
11015 {
11016     if (SYMBOL_P(name)) {
11017 	return SYM2ID(name);
11018     }
11019     name = string_for_symbol(name);
11020     return rb_intern_str(name);
11021 }
11022 
11023 VALUE
rb_to_symbol(VALUE name)11024 rb_to_symbol(VALUE name)
11025 {
11026     if (SYMBOL_P(name)) {
11027 	return name;
11028     }
11029     name = string_for_symbol(name);
11030     return rb_str_intern(name);
11031 }
11032 
11033 /*
11034  *  A <code>String</code> object holds and manipulates an arbitrary sequence of
11035  *  bytes, typically representing characters. String objects may be created
11036  *  using <code>String::new</code> or as literals.
11037  *
11038  *  Because of aliasing issues, users of strings should be aware of the methods
11039  *  that modify the contents of a <code>String</code> object.  Typically,
11040  *  methods with names ending in ``!'' modify their receiver, while those
11041  *  without a ``!'' return a new <code>String</code>.  However, there are
11042  *  exceptions, such as <code>String#[]=</code>.
11043  *
11044  */
11045 
void
Init_String(void)
{
    /*
     * Creates the String and Symbol classes and registers their methods.
     * Also defines the UnicodeNormalize module and the hooked global
     * variables "$;" and "$-F".
     *
     * The last argument of each rb_define_method() call is the arity the
     * C function is registered with; -1 is used for the functions that
     * take (int argc, VALUE *argv, VALUE self).
     */

    /* Inside this function rb_intern() expands to rb_intern_const(); every
     * use below passes a string literal. */
#undef rb_intern
#define rb_intern(str) rb_intern_const(str)

    rb_cString  = rb_define_class("String", rb_cObject);
    /* The VM's fstring table must already exist.  Each entry is visited with
     * fstring_set_class_i and the new String class -- presumably to give
     * strings created before this point their class; confirm against
     * fstring_set_class_i. */
    assert(rb_vm_fstring_table());
    st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
    rb_include_module(rb_cString, rb_mComparable);
    rb_define_alloc_func(rb_cString, empty_str_alloc);
    rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);

    /* Construction, comparison, operators, element access and basic queries.
     * Several names share one implementation ("=="/"===", "length"/"size",
     * "succ"/"next", "initialize_copy"/"replace"). */
    rb_define_method(rb_cString, "initialize", rb_str_init, -1);
    rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
    rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
    rb_define_method(rb_cString, "==", rb_str_equal, 1);
    rb_define_method(rb_cString, "===", rb_str_equal, 1);
    rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
    rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
    rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
    rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
    rb_define_method(rb_cString, "+", rb_str_plus, 1);
    rb_define_method(rb_cString, "*", rb_str_times, 1);
    rb_define_method(rb_cString, "%", rb_str_format_m, 1);
    rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
    rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
    rb_define_method(rb_cString, "insert", rb_str_insert, 2);
    rb_define_method(rb_cString, "length", rb_str_length, 0);
    rb_define_method(rb_cString, "size", rb_str_length, 0);
    rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
    rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
    rb_define_method(rb_cString, "=~", rb_str_match, 1);
    rb_define_method(rb_cString, "match", rb_str_match_m, -1);
    rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
    rb_define_method(rb_cString, "succ", rb_str_succ, 0);
    rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
    rb_define_method(rb_cString, "next", rb_str_succ, 0);
    rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
    rb_define_method(rb_cString, "upto", rb_str_upto, -1);
    rb_define_method(rb_cString, "index", rb_str_index_m, -1);
    rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
    rb_define_method(rb_cString, "replace", rb_str_replace, 1);
    rb_define_method(rb_cString, "clear", rb_str_clear, 0);
    rb_define_method(rb_cString, "chr", rb_str_chr, 0);
    rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
    rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
    rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
    rb_define_method(rb_cString, "scrub", str_scrub, -1);
    rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
    rb_define_method(rb_cString, "freeze", rb_str_freeze, 0);
    rb_define_method(rb_cString, "+@", str_uplus, 0);
    rb_define_method(rb_cString, "-@", str_uminus, 0);

    /* Conversions and printable representations; "to_s" and "to_str" share
     * rb_str_to_s. */
    rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
    rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
    rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
    rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
    rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
    rb_define_method(rb_cString, "dump", rb_str_dump, 0);
    rb_define_method(rb_cString, "undump", str_undump, 0);

    /* Symbols :ascii, :turkic, :lithuanian and :fold, stored in variables
     * declared outside this function -- presumably the options accepted by
     * the case-mapping methods registered next; confirm at their use sites. */
    sym_ascii      = ID2SYM(rb_intern("ascii"));
    sym_turkic     = ID2SYM(rb_intern("turkic"));
    sym_lithuanian = ID2SYM(rb_intern("lithuanian"));
    sym_fold       = ID2SYM(rb_intern("fold"));

    /* Case mapping, copying variants. */
    rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
    rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
    rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
    rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);

    /* Case mapping, "!" variants. */
    rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
    rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
    rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
    rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);

    /* Number parsing, splitting into pieces, reversal, concatenation and
     * interning ("intern" and "to_sym" share rb_str_intern). */
    rb_define_method(rb_cString, "hex", rb_str_hex, 0);
    rb_define_method(rb_cString, "oct", rb_str_oct, 0);
    rb_define_method(rb_cString, "split", rb_str_split_m, -1);
    rb_define_method(rb_cString, "lines", rb_str_lines, -1);
    rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
    rb_define_method(rb_cString, "chars", rb_str_chars, 0);
    rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
    rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
    rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
    rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
    rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
    rb_define_method(rb_cString, "<<", rb_str_concat, 1);
    rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
    rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
    rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
    rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
    rb_define_method(rb_cString, "ord", rb_str_ord, 0);

    /* Substring predicates. */
    rb_define_method(rb_cString, "include?", rb_str_include, 1);
    rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
    rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);

    rb_define_method(rb_cString, "scan", rb_str_scan, 1);

    /* Padding. */
    rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
    rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
    rb_define_method(rb_cString, "center", rb_str_center, -1);

    /* Substitution and trimming, copying variants. */
    rb_define_method(rb_cString, "sub", rb_str_sub, -1);
    rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
    rb_define_method(rb_cString, "chop", rb_str_chop, 0);
    rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
    rb_define_method(rb_cString, "strip", rb_str_strip, 0);
    rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
    rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
    rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
    rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);

    /* Substitution and trimming, "!" variants. */
    rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
    rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
    rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
    rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
    rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
    rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
    rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
    rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
    rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);

    /* Character-set operations, copying variants ("count" has no "!" form). */
    rb_define_method(rb_cString, "tr", rb_str_tr, 2);
    rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
    rb_define_method(rb_cString, "delete", rb_str_delete, -1);
    rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
    rb_define_method(rb_cString, "count", rb_str_count, -1);

    /* Character-set operations, "!" variants. */
    rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
    rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
    rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
    rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);

    /* Iterators. */
    rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
    rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
    rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
    rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
    rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);

    rb_define_method(rb_cString, "sum", rb_str_sum, -1);

    /* "slice" shares its implementation with "[]" registered above. */
    rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
    rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);

    rb_define_method(rb_cString, "partition", rb_str_partition, 1);
    rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);

    /* Encoding-related methods. */
    rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
    rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
    rb_define_method(rb_cString, "b", rb_str_b, 0);
    rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
    rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);

    /* define UnicodeNormalize module here so that we don't have to look it up */
    mUnicodeNormalize          = rb_define_module("UnicodeNormalize");
    id_normalize               = rb_intern("normalize");
    id_normalized_p            = rb_intern("normalized?");

    rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
    rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
    rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);

    /* "$;" and "$-F" are both backed by rb_fs (initially nil) and share
     * rb_fs_setter as their setter; no getter function is supplied.  The
     * address of rb_fs is also registered with the GC. */
    rb_fs = Qnil;
    rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
    rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
    rb_gc_register_address(&rb_fs);

    rb_cSymbol = rb_define_class("Symbol", rb_cObject);
    /* Symbol gets Comparable, then loses its allocator and the "new" method
     * of its singleton class. */
    rb_include_module(rb_cSymbol, rb_mComparable);
    rb_undef_alloc_func(rb_cSymbol);
    rb_undef_method(CLASS_OF(rb_cSymbol), "new");
    rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in symbol.c */

    /* Symbol instance methods.  As with String, several names share one
     * implementation ("=="/"===", "to_s"/"id2name", "intern"/"to_sym",
     * "succ"/"next", "[]"/"slice", "length"/"size"). */
    rb_define_method(rb_cSymbol, "==", sym_equal, 1);
    rb_define_method(rb_cSymbol, "===", sym_equal, 1);
    rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
    rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
    rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
    rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
    rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
    rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0);
    rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
    rb_define_method(rb_cSymbol, "next", sym_succ, 0);

    rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
    rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
    rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
    rb_define_method(rb_cSymbol, "=~", sym_match, 1);

    rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
    rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
    rb_define_method(rb_cSymbol, "length", sym_length, 0);
    rb_define_method(rb_cSymbol, "size", sym_length, 0);
    rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
    rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
    rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);

    /* Case mapping on symbols. */
    rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
    rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
    rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
    rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);

    rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
}
11252