1 /**********************************************************************
2
3 string.c -
4
5 $Author: nagachika $
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12 **********************************************************************/
13
14 #include "ruby/encoding.h"
15 #include "ruby/re.h"
16 #include "internal.h"
17 #include "encindex.h"
18 #include "probes.h"
19 #include "gc.h"
20 #include "ruby_assert.h"
21 #include "id.h"
22 #include "debug_counter.h"
23 #include "ruby/util.h"
24
25 #define BEG(no) (regs->beg[(no)])
26 #define END(no) (regs->end[(no)])
27
28 #include <errno.h>
29 #include <math.h>
30 #include <ctype.h>
31
32 #ifdef HAVE_UNISTD_H
33 #include <unistd.h>
34 #endif
35
36 #if defined HAVE_CRYPT_R
37 # if defined HAVE_CRYPT_H
38 # include <crypt.h>
39 # endif
40 #elif !defined HAVE_CRYPT
41 # include "missing/crypt.h"
42 # define HAVE_CRYPT_R 1
43 #endif
44
45 #define STRING_ENUMERATORS_WANTARRAY 0 /* next major */
46
47 #undef rb_str_new
48 #undef rb_usascii_str_new
49 #undef rb_utf8_str_new
50 #undef rb_enc_str_new
51 #undef rb_str_new_cstr
52 #undef rb_tainted_str_new_cstr
53 #undef rb_usascii_str_new_cstr
54 #undef rb_utf8_str_new_cstr
55 #undef rb_enc_str_new_cstr
56 #undef rb_external_str_new_cstr
57 #undef rb_locale_str_new_cstr
58 #undef rb_str_dup_frozen
59 #undef rb_str_buf_new_cstr
60 #undef rb_str_buf_cat
61 #undef rb_str_buf_cat2
62 #undef rb_str_cat2
63 #undef rb_str_cat_cstr
64 #undef rb_fstring_cstr
65 #undef rb_fstring_enc_cstr
66
67 static VALUE rb_str_clear(VALUE str);
68
69 VALUE rb_cString;
70 VALUE rb_cSymbol;
71
72 /* FLAGS of RString
73 *
74 * 1: RSTRING_NOEMBED
75 * 2: STR_SHARED (== ELTS_SHARED)
76 * 2-6: RSTRING_EMBED_LEN (5 bits == 32)
77 * 5: STR_SHARED_ROOT (RSTRING_NOEMBED==1 && STR_SHARED == 0, there may be
78 * other strings that rely on this string's buffer)
79 * 6: STR_IS_SHARED_M (shared, when RSTRING_NOEMBED==1 && klass==0)
80 * 7: STR_TMPLOCK
81 * 8-9: ENC_CODERANGE (2 bits)
82 * 10-16: ENCODING (7 bits == 128)
83 * 17: RSTRING_FSTR
84 * 18: STR_NOFREE
85 * 19: STR_FAKESTR
86 */
87
88 #define RUBY_MAX_CHAR_LEN 16
89 #define STR_SHARED_ROOT FL_USER5
90 #define STR_IS_SHARED_M FL_USER6
91 #define STR_TMPLOCK FL_USER7
92 #define STR_NOFREE FL_USER18
93 #define STR_FAKESTR FL_USER19
94
/* Set STR_NOEMBED on str and reset the embedded-length bits to zero. */
#define STR_SET_NOEMBED(str) do {\
    FL_SET((str), STR_NOEMBED);\
    STR_SET_EMBED_LEN((str), 0);\
} while (0)

/* Clear both STR_NOEMBED and STR_NOFREE on str. */
#define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOEMBED|STR_NOFREE))

/* Store n in the embedded-length bit field of str's flags word.
 * n is evaluated exactly once. */
#define STR_SET_EMBED_LEN(str, n) do { \
    long tmp_n = (n);\
    RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
    RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
} while (0)
105
/* Record n as the length of str, in the embedded-length flag bits when
 * str is embedded and in as.heap.len otherwise. */
#define STR_SET_LEN(str, n) do { \
    if (!STR_EMBED_P(str)) {\
        RSTRING(str)->as.heap.len = (n);\
    }\
    else {\
        STR_SET_EMBED_LEN((str), (n));\
    }\
} while (0)

/* Decrease the recorded length of str by one, in whichever place
 * (flag bits or as.heap.len) currently holds it. */
#define STR_DEC_LEN(str) do {\
    if (!STR_EMBED_P(str)) {\
        RSTRING(str)->as.heap.len--;\
    }\
    else {\
        long n = RSTRING_LEN(str);\
        n--;\
        STR_SET_EMBED_LEN((str), n);\
    }\
} while (0)
125
/* Minimum character length of str's encoding; used in this file as the
 * width of the string terminator. */
#define TERM_LEN(str) rb_enc_mbminlen(rb_enc_get(str))

/* Zero the terminator at ptr.  The first byte is always cleared; when
 * termlen is greater than one, termlen bytes starting at ptr are zeroed.
 * Both arguments are evaluated exactly once. */
#define TERM_FILL(ptr, termlen) do {\
    char *const term_fill_ptr = (ptr);\
    const int term_fill_len = (termlen);\
    *term_fill_ptr = '\0';\
    if (UNLIKELY(term_fill_len > 1))\
        memset(term_fill_ptr, 0, term_fill_len);\
} while (0)
134
/* RESIZE_CAPA_TERM() with the terminator width taken from TERM_LEN(str). */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)

/*
 * Give str a buffer of capacity + termlen bytes.
 *
 * - Embedded string that still fits: nothing is changed.
 * - Embedded string that no longer fits: its bytes are copied into a newly
 *   allocated buffer and str is switched to the heap representation.
 * - Heap string: the buffer is reallocated; STR_SHARED must not be set
 *   (asserted).
 */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (!STR_EMBEDDABLE_P(capacity, termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->as.heap.len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        assert(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
158
/*
 * Make str refer to shared_str as its shared buffer owner.
 *
 * Does nothing when str carries STR_FAKESTR.  Otherwise the reference is
 * stored via RB_OBJ_WRITE, STR_SHARED is set on str and STR_SHARED_ROOT on
 * shared_str; a shared_str whose class is 0 additionally gets
 * STR_IS_SHARED_M.
 */
#define STR_SET_SHARED(str, shared_str) do { \
    if (!FL_TEST(str, STR_FAKESTR)) { \
        RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
        FL_SET((str), STR_SHARED); \
        FL_SET((shared_str), STR_SHARED_ROOT); \
        if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
            FL_SET_RAW((shared_str), STR_IS_SHARED_M); \
    } \
} while (0)
168
/* Heap buffer pointer of a non-embedded string. */
#define STR_HEAP_PTR(str)  (RSTRING(str)->as.heap.ptr)
/* Heap capacity plus the terminator width. */
#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))

/* Encoding of str as resolved by get_encoding(). */
#define STR_ENC_GET(str) get_encoding(str)

/* SHARABLE_MIDDLE_SUBSTRING defaults to 0 unless defined by the build. */
#if !defined SHARABLE_MIDDLE_SUBSTRING
# define SHARABLE_MIDDLE_SUBSTRING 0
#endif
#if !SHARABLE_MIDDLE_SUBSTRING
/* Only a substring that reaches `end` qualifies. */
#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
#else
/* Every substring qualifies. */
#define SHARABLE_SUBSTRING_P(beg, len, end) 1
#endif

/* True when len bytes plus a termlen-byte terminator fit in the
 * embedded buffer. */
#define STR_EMBEDDABLE_P(len, termlen) \
    ((len) <= RSTRING_EMBED_LEN_MAX + 1 - (termlen))
186 static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
187 static VALUE str_new_shared(VALUE klass, VALUE str);
188 static VALUE str_new_frozen(VALUE klass, VALUE orig);
189 static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
190 static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
191 static inline void str_modifiable(VALUE str);
192 static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
193
194 static inline void
str_make_independent(VALUE str)195 str_make_independent(VALUE str)
196 {
197 long len = RSTRING_LEN(str);
198 int termlen = TERM_LEN(str);
199 str_make_independent_expand((str), len, 0L, termlen);
200 }
201
202 /* symbols for [up|down|swap]case/capitalize options */
203 static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
204
205 static rb_encoding *
get_actual_encoding(const int encidx,VALUE str)206 get_actual_encoding(const int encidx, VALUE str)
207 {
208 const unsigned char *q;
209
210 switch (encidx) {
211 case ENCINDEX_UTF_16:
212 if (RSTRING_LEN(str) < 2) break;
213 q = (const unsigned char *)RSTRING_PTR(str);
214 if (q[0] == 0xFE && q[1] == 0xFF) {
215 return rb_enc_get_from_index(ENCINDEX_UTF_16BE);
216 }
217 if (q[0] == 0xFF && q[1] == 0xFE) {
218 return rb_enc_get_from_index(ENCINDEX_UTF_16LE);
219 }
220 return rb_ascii8bit_encoding();
221 case ENCINDEX_UTF_32:
222 if (RSTRING_LEN(str) < 4) break;
223 q = (const unsigned char *)RSTRING_PTR(str);
224 if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) {
225 return rb_enc_get_from_index(ENCINDEX_UTF_32BE);
226 }
227 if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) {
228 return rb_enc_get_from_index(ENCINDEX_UTF_32LE);
229 }
230 return rb_ascii8bit_encoding();
231 }
232 return rb_enc_from_index(encidx);
233 }
234
235 static rb_encoding *
get_encoding(VALUE str)236 get_encoding(VALUE str)
237 {
238 return get_actual_encoding(ENCODING_GET(str), str);
239 }
240
241 static void
mustnot_broken(VALUE str)242 mustnot_broken(VALUE str)
243 {
244 if (is_broken_string(str)) {
245 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
246 }
247 }
248
249 static void
mustnot_wchar(VALUE str)250 mustnot_wchar(VALUE str)
251 {
252 rb_encoding *enc = STR_ENC_GET(str);
253 if (rb_enc_mbminlen(enc) > 1) {
254 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
255 }
256 }
257
258 static int fstring_cmp(VALUE a, VALUE b);
259
260 static VALUE register_fstring(VALUE str);
261
262 const struct st_hash_type rb_fstring_hash_type = {
263 fstring_cmp,
264 rb_str_hash,
265 };
266
/* True when str has neither FL_TAINT nor FL_EXIVAR set and its class is
 * exactly rb_cString. */
#define BARE_STRING_P(str) \
    (!FL_ANY_RAW(str, FL_TAINT|FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
268
269 static int
fstr_update_callback(st_data_t * key,st_data_t * value,st_data_t arg,int existing)270 fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t arg, int existing)
271 {
272 VALUE *fstr = (VALUE *)arg;
273 VALUE str = (VALUE)*key;
274
275 if (existing) {
276 /* because of lazy sweep, str may be unmarked already and swept
277 * at next time */
278
279 if (rb_objspace_garbage_object_p(str)) {
280 *fstr = Qundef;
281 return ST_DELETE;
282 }
283
284 *fstr = str;
285 return ST_STOP;
286 }
287 else {
288 if (FL_TEST_RAW(str, STR_FAKESTR)) {
289 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
290 RSTRING(str)->as.heap.len,
291 ENCODING_GET(str));
292 OBJ_FREEZE_RAW(str);
293 }
294 else {
295 str = str_new_frozen(rb_cString, str);
296 if (STR_SHARED_P(str)) { /* str should not be shared */
297 /* shared substring */
298 str_make_independent(str);
299 assert(OBJ_FROZEN(str));
300 }
301 if (!BARE_STRING_P(str)) {
302 str = str_new_frozen(rb_cString, str);
303 }
304 }
305 RBASIC(str)->flags |= RSTRING_FSTR;
306
307 *key = *value = *fstr = str;
308 return ST_CONTINUE;
309 }
310 }
311
312 RUBY_FUNC_EXPORTED
313 VALUE
rb_fstring(VALUE str)314 rb_fstring(VALUE str)
315 {
316 VALUE fstr;
317 int bare;
318
319 Check_Type(str, T_STRING);
320
321 if (FL_TEST(str, RSTRING_FSTR))
322 return str;
323
324 bare = BARE_STRING_P(str);
325 if (!bare) {
326 if (STR_EMBED_P(str)) {
327 OBJ_FREEZE_RAW(str);
328 return str;
329 }
330 if (FL_TEST_RAW(str, STR_NOEMBED|STR_SHARED_ROOT|STR_SHARED) == (STR_NOEMBED|STR_SHARED_ROOT)) {
331 assert(OBJ_FROZEN(str));
332 return str;
333 }
334 }
335
336 fstr = register_fstring(str);
337
338 if (!bare) {
339 str_replace_shared_without_enc(str, fstr);
340 OBJ_FREEZE_RAW(str);
341 return str;
342 }
343 return fstr;
344 }
345
346 static VALUE
register_fstring(VALUE str)347 register_fstring(VALUE str)
348 {
349 VALUE ret;
350 st_table *frozen_strings = rb_vm_fstring_table();
351
352 do {
353 ret = str;
354 st_update(frozen_strings, (st_data_t)str,
355 fstr_update_callback, (st_data_t)&ret);
356 } while (ret == Qundef);
357
358 assert(OBJ_FROZEN(ret));
359 assert(!FL_TEST_RAW(ret, STR_FAKESTR));
360 assert(!FL_TEST_RAW(ret, FL_EXIVAR));
361 assert(!FL_TEST_RAW(ret, FL_TAINT));
362 assert(RBASIC_CLASS(ret) == rb_cString);
363 return ret;
364 }
365
366 static VALUE
setup_fake_str(struct RString * fake_str,const char * name,long len,int encidx)367 setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
368 {
369 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
370 /* SHARED to be allocated by the callback */
371
372 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
373
374 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
375 fake_str->as.heap.len = len;
376 fake_str->as.heap.ptr = (char *)name;
377 fake_str->as.heap.aux.capa = len;
378 return (VALUE)fake_str;
379 }
380
/*
 * Set up a fake string that refers to a static string literal.
 */
384 VALUE
rb_setup_fake_str(struct RString * fake_str,const char * name,long len,rb_encoding * enc)385 rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
386 {
387 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
388 }
389
/*
 * rb_fstring_new and the rb_fstring_cstr family create or look up a frozen
 * shared string that refers to a static string literal.  `ptr` must
 * point to a constant string.
 */
395 MJIT_FUNC_EXPORTED VALUE
rb_fstring_new(const char * ptr,long len)396 rb_fstring_new(const char *ptr, long len)
397 {
398 struct RString fake_str;
399 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII));
400 }
401
402 VALUE
rb_fstring_enc_new(const char * ptr,long len,rb_encoding * enc)403 rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
404 {
405 struct RString fake_str;
406 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc));
407 }
408
409 VALUE
rb_fstring_cstr(const char * ptr)410 rb_fstring_cstr(const char *ptr)
411 {
412 return rb_fstring_new(ptr, strlen(ptr));
413 }
414
415 VALUE
rb_fstring_enc_cstr(const char * ptr,rb_encoding * enc)416 rb_fstring_enc_cstr(const char *ptr, rb_encoding *enc)
417 {
418 return rb_fstring_enc_new(ptr, strlen(ptr), enc);
419 }
420
421 static int
fstring_set_class_i(st_data_t key,st_data_t val,st_data_t arg)422 fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
423 {
424 RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
425 return ST_CONTINUE;
426 }
427
428 static int
fstring_cmp(VALUE a,VALUE b)429 fstring_cmp(VALUE a, VALUE b)
430 {
431 long alen, blen;
432 const char *aptr, *bptr;
433 RSTRING_GETMEM(a, aptr, alen);
434 RSTRING_GETMEM(b, bptr, blen);
435 return (alen != blen ||
436 ENCODING_GET(a) != ENCODING_GET(b) ||
437 memcmp(aptr, bptr, alen) != 0);
438 }
439
440 static inline int
single_byte_optimizable(VALUE str)441 single_byte_optimizable(VALUE str)
442 {
443 rb_encoding *enc;
444
445 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
446 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
447 return 1;
448
449 enc = STR_ENC_GET(str);
450 if (rb_enc_mbmaxlen(enc) == 1)
451 return 1;
452
453 /* Conservative. Possibly single byte.
454 * "\xa1" in Shift_JIS for example. */
455 return 0;
456 }
457
458 VALUE rb_fs;
459
460 static inline const char *
search_nonascii(const char * p,const char * e)461 search_nonascii(const char *p, const char *e)
462 {
463 const uintptr_t *s, *t;
464
465 #if defined(__STDC_VERSION) && (__STDC_VERSION__ >= 199901L)
466 # if SIZEOF_UINTPTR_T == 8
467 # define NONASCII_MASK UINT64_C(0x8080808080808080)
468 # elif SIZEOF_UINTPTR_T == 4
469 # define NONASCII_MASK UINT32_C(0x80808080)
470 # else
471 # error "don't know what to do."
472 # endif
473 #else
474 # if SIZEOF_UINTPTR_T == 8
475 # define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
476 # elif SIZEOF_UINTPTR_T == 4
477 # define NONASCII_MASK 0x80808080UL /* or...? */
478 # else
479 # error "don't know what to do."
480 # endif
481 #endif
482
483 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
484 #if !UNALIGNED_WORD_ACCESS
485 if ((uintptr_t)p % SIZEOF_VOIDP) {
486 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
487 p += l;
488 switch (l) {
489 default: UNREACHABLE;
490 #if SIZEOF_VOIDP > 4
491 case 7: if (p[-7]&0x80) return p-7;
492 case 6: if (p[-6]&0x80) return p-6;
493 case 5: if (p[-5]&0x80) return p-5;
494 case 4: if (p[-4]&0x80) return p-4;
495 #endif
496 case 3: if (p[-3]&0x80) return p-3;
497 case 2: if (p[-2]&0x80) return p-2;
498 case 1: if (p[-1]&0x80) return p-1;
499 case 0: break;
500 }
501 }
502 #endif
503 #if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
504 #define aligned_ptr(value) \
505 __builtin_assume_aligned((value), sizeof(uintptr_t))
506 #else
507 #define aligned_ptr(value) (uintptr_t *)(value)
508 #endif
509 s = aligned_ptr(p);
510 t = aligned_ptr(e - (SIZEOF_VOIDP-1));
511 #undef aligned_ptr
512 for (;s < t; s++) {
513 if (*s & NONASCII_MASK) {
514 #ifdef WORDS_BIGENDIAN
515 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
516 #else
517 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
518 #endif
519 }
520 }
521 p = (const char *)s;
522 }
523
524 switch (e - p) {
525 default: UNREACHABLE;
526 #if SIZEOF_VOIDP > 4
527 case 7: if (e[-7]&0x80) return e-7;
528 case 6: if (e[-6]&0x80) return e-6;
529 case 5: if (e[-5]&0x80) return e-5;
530 case 4: if (e[-4]&0x80) return e-4;
531 #endif
532 case 3: if (e[-3]&0x80) return e-3;
533 case 2: if (e[-2]&0x80) return e-2;
534 case 1: if (e[-1]&0x80) return e-1;
535 case 0: return NULL;
536 }
537 }
538
539 static int
coderange_scan(const char * p,long len,rb_encoding * enc)540 coderange_scan(const char *p, long len, rb_encoding *enc)
541 {
542 const char *e = p + len;
543
544 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
545 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
546 p = search_nonascii(p, e);
547 return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
548 }
549
550 if (rb_enc_asciicompat(enc)) {
551 p = search_nonascii(p, e);
552 if (!p) return ENC_CODERANGE_7BIT;
553 for (;;) {
554 int ret = rb_enc_precise_mbclen(p, e, enc);
555 if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
556 p += MBCLEN_CHARFOUND_LEN(ret);
557 if (p == e) break;
558 p = search_nonascii(p, e);
559 if (!p) break;
560 }
561 }
562 else {
563 while (p < e) {
564 int ret = rb_enc_precise_mbclen(p, e, enc);
565 if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
566 p += MBCLEN_CHARFOUND_LEN(ret);
567 }
568 }
569 return ENC_CODERANGE_VALID;
570 }
571
572 long
rb_str_coderange_scan_restartable(const char * s,const char * e,rb_encoding * enc,int * cr)573 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
574 {
575 const char *p = s;
576
577 if (*cr == ENC_CODERANGE_BROKEN)
578 return e - s;
579
580 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
581 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
582 if (*cr == ENC_CODERANGE_VALID) return e - s;
583 p = search_nonascii(p, e);
584 *cr = p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
585 return e - s;
586 }
587 else if (rb_enc_asciicompat(enc)) {
588 p = search_nonascii(p, e);
589 if (!p) {
590 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
591 return e - s;
592 }
593 for (;;) {
594 int ret = rb_enc_precise_mbclen(p, e, enc);
595 if (!MBCLEN_CHARFOUND_P(ret)) {
596 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
597 return p - s;
598 }
599 p += MBCLEN_CHARFOUND_LEN(ret);
600 if (p == e) break;
601 p = search_nonascii(p, e);
602 if (!p) break;
603 }
604 }
605 else {
606 while (p < e) {
607 int ret = rb_enc_precise_mbclen(p, e, enc);
608 if (!MBCLEN_CHARFOUND_P(ret)) {
609 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
610 return p - s;
611 }
612 p += MBCLEN_CHARFOUND_LEN(ret);
613 }
614 }
615 *cr = ENC_CODERANGE_VALID;
616 return e - s;
617 }
618
619 static inline void
str_enc_copy(VALUE str1,VALUE str2)620 str_enc_copy(VALUE str1, VALUE str2)
621 {
622 rb_enc_set_index(str1, ENCODING_GET(str2));
623 }
624
625 static void
rb_enc_cr_str_copy_for_substr(VALUE dest,VALUE src)626 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
627 {
628 /* this function is designed for copying encoding and coderange
629 * from src to new string "dest" which is made from the part of src.
630 */
631 str_enc_copy(dest, src);
632 if (RSTRING_LEN(dest) == 0) {
633 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
634 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
635 else
636 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
637 return;
638 }
639 switch (ENC_CODERANGE(src)) {
640 case ENC_CODERANGE_7BIT:
641 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
642 break;
643 case ENC_CODERANGE_VALID:
644 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
645 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
646 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
647 else
648 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
649 break;
650 default:
651 break;
652 }
653 }
654
655 static void
rb_enc_cr_str_exact_copy(VALUE dest,VALUE src)656 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
657 {
658 str_enc_copy(dest, src);
659 ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
660 }
661
662 int
rb_enc_str_coderange(VALUE str)663 rb_enc_str_coderange(VALUE str)
664 {
665 int cr = ENC_CODERANGE(str);
666
667 if (cr == ENC_CODERANGE_UNKNOWN) {
668 int encidx = ENCODING_GET(str);
669 rb_encoding *enc = rb_enc_from_index(encidx);
670 if (rb_enc_mbminlen(enc) > 1 && rb_enc_dummy_p(enc) &&
671 rb_enc_mbminlen(enc = get_actual_encoding(encidx, str)) == 1) {
672 cr = ENC_CODERANGE_BROKEN;
673 }
674 else {
675 cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str),
676 enc);
677 }
678 ENC_CODERANGE_SET(str, cr);
679 }
680 return cr;
681 }
682
683 int
rb_enc_str_asciionly_p(VALUE str)684 rb_enc_str_asciionly_p(VALUE str)
685 {
686 rb_encoding *enc = STR_ENC_GET(str);
687
688 if (!rb_enc_asciicompat(enc))
689 return FALSE;
690 else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
691 return TRUE;
692 return FALSE;
693 }
694
695 static inline void
str_mod_check(VALUE s,const char * p,long len)696 str_mod_check(VALUE s, const char *p, long len)
697 {
698 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
699 rb_raise(rb_eRuntimeError, "string modified");
700 }
701 }
702
703 static size_t
str_capacity(VALUE str,const int termlen)704 str_capacity(VALUE str, const int termlen)
705 {
706 if (STR_EMBED_P(str)) {
707 return (RSTRING_EMBED_LEN_MAX + 1 - termlen);
708 }
709 else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
710 return RSTRING(str)->as.heap.len;
711 }
712 else {
713 return RSTRING(str)->as.heap.aux.capa;
714 }
715 }
716
717 size_t
rb_str_capacity(VALUE str)718 rb_str_capacity(VALUE str)
719 {
720 return str_capacity(str, TERM_LEN(str));
721 }
722
723 static inline void
must_not_null(const char * ptr)724 must_not_null(const char *ptr)
725 {
726 if (!ptr) {
727 rb_raise(rb_eArgError, "NULL pointer given");
728 }
729 }
730
731 static inline VALUE
str_alloc(VALUE klass)732 str_alloc(VALUE klass)
733 {
734 NEWOBJ_OF(str, struct RString, klass, T_STRING | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0));
735 return (VALUE)str;
736 }
737
738 static inline VALUE
empty_str_alloc(VALUE klass)739 empty_str_alloc(VALUE klass)
740 {
741 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
742 return str_alloc(klass);
743 }
744
745 static VALUE
str_new0(VALUE klass,const char * ptr,long len,int termlen)746 str_new0(VALUE klass, const char *ptr, long len, int termlen)
747 {
748 VALUE str;
749
750 if (len < 0) {
751 rb_raise(rb_eArgError, "negative string size (or size too big)");
752 }
753
754 RUBY_DTRACE_CREATE_HOOK(STRING, len);
755
756 str = str_alloc(klass);
757 if (!STR_EMBEDDABLE_P(len, termlen)) {
758 RSTRING(str)->as.heap.aux.capa = len;
759 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)len + termlen);
760 STR_SET_NOEMBED(str);
761 }
762 else if (len == 0) {
763 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
764 }
765 if (ptr) {
766 memcpy(RSTRING_PTR(str), ptr, len);
767 }
768 STR_SET_LEN(str, len);
769 TERM_FILL(RSTRING_PTR(str) + len, termlen);
770 return str;
771 }
772
773 static VALUE
str_new(VALUE klass,const char * ptr,long len)774 str_new(VALUE klass, const char *ptr, long len)
775 {
776 return str_new0(klass, ptr, len, 1);
777 }
778
779 VALUE
rb_str_new(const char * ptr,long len)780 rb_str_new(const char *ptr, long len)
781 {
782 return str_new(rb_cString, ptr, len);
783 }
784
785 VALUE
rb_usascii_str_new(const char * ptr,long len)786 rb_usascii_str_new(const char *ptr, long len)
787 {
788 VALUE str = rb_str_new(ptr, len);
789 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
790 return str;
791 }
792
793 VALUE
rb_utf8_str_new(const char * ptr,long len)794 rb_utf8_str_new(const char *ptr, long len)
795 {
796 VALUE str = str_new(rb_cString, ptr, len);
797 rb_enc_associate_index(str, rb_utf8_encindex());
798 return str;
799 }
800
801 VALUE
rb_enc_str_new(const char * ptr,long len,rb_encoding * enc)802 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
803 {
804 VALUE str;
805
806 if (!enc) return rb_str_new(ptr, len);
807
808 str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
809 rb_enc_associate(str, enc);
810 return str;
811 }
812
813 VALUE
rb_str_new_cstr(const char * ptr)814 rb_str_new_cstr(const char *ptr)
815 {
816 must_not_null(ptr);
817 /* rb_str_new_cstr() can take pointer from non-malloc-generated
818 * memory regions, and that cannot be detected by the MSAN. Just
819 * trust the programmer that the argument passed here is a sane C
820 * string. */
821 __msan_unpoison_string(ptr);
822 return rb_str_new(ptr, strlen(ptr));
823 }
824
825 VALUE
rb_usascii_str_new_cstr(const char * ptr)826 rb_usascii_str_new_cstr(const char *ptr)
827 {
828 VALUE str = rb_str_new_cstr(ptr);
829 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
830 return str;
831 }
832
833 VALUE
rb_utf8_str_new_cstr(const char * ptr)834 rb_utf8_str_new_cstr(const char *ptr)
835 {
836 VALUE str = rb_str_new_cstr(ptr);
837 rb_enc_associate_index(str, rb_utf8_encindex());
838 return str;
839 }
840
841 VALUE
rb_enc_str_new_cstr(const char * ptr,rb_encoding * enc)842 rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
843 {
844 must_not_null(ptr);
845 if (rb_enc_mbminlen(enc) != 1) {
846 rb_raise(rb_eArgError, "wchar encoding given");
847 }
848 return rb_enc_str_new(ptr, strlen(ptr), enc);
849 }
850
851 static VALUE
str_new_static(VALUE klass,const char * ptr,long len,int encindex)852 str_new_static(VALUE klass, const char *ptr, long len, int encindex)
853 {
854 VALUE str;
855
856 if (len < 0) {
857 rb_raise(rb_eArgError, "negative string size (or size too big)");
858 }
859
860 if (!ptr) {
861 rb_encoding *enc = rb_enc_get_from_index(encindex);
862 str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
863 }
864 else {
865 RUBY_DTRACE_CREATE_HOOK(STRING, len);
866 str = str_alloc(klass);
867 RSTRING(str)->as.heap.len = len;
868 RSTRING(str)->as.heap.ptr = (char *)ptr;
869 RSTRING(str)->as.heap.aux.capa = len;
870 STR_SET_NOEMBED(str);
871 RBASIC(str)->flags |= STR_NOFREE;
872 }
873 rb_enc_associate_index(str, encindex);
874 return str;
875 }
876
877 VALUE
rb_str_new_static(const char * ptr,long len)878 rb_str_new_static(const char *ptr, long len)
879 {
880 return str_new_static(rb_cString, ptr, len, 0);
881 }
882
883 VALUE
rb_usascii_str_new_static(const char * ptr,long len)884 rb_usascii_str_new_static(const char *ptr, long len)
885 {
886 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
887 }
888
889 VALUE
rb_utf8_str_new_static(const char * ptr,long len)890 rb_utf8_str_new_static(const char *ptr, long len)
891 {
892 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
893 }
894
895 VALUE
rb_enc_str_new_static(const char * ptr,long len,rb_encoding * enc)896 rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
897 {
898 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
899 }
900
901 VALUE
rb_tainted_str_new(const char * ptr,long len)902 rb_tainted_str_new(const char *ptr, long len)
903 {
904 VALUE str = rb_str_new(ptr, len);
905
906 OBJ_TAINT(str);
907 return str;
908 }
909
910 static VALUE
rb_tainted_str_new_with_enc(const char * ptr,long len,rb_encoding * enc)911 rb_tainted_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
912 {
913 VALUE str = rb_enc_str_new(ptr, len, enc);
914
915 OBJ_TAINT(str);
916 return str;
917 }
918
919 VALUE
rb_tainted_str_new_cstr(const char * ptr)920 rb_tainted_str_new_cstr(const char *ptr)
921 {
922 VALUE str = rb_str_new_cstr(ptr);
923
924 OBJ_TAINT(str);
925 return str;
926 }
927
928 static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
929 rb_encoding *from, rb_encoding *to,
930 int ecflags, VALUE ecopts);
931
932 VALUE
rb_str_conv_enc_opts(VALUE str,rb_encoding * from,rb_encoding * to,int ecflags,VALUE ecopts)933 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
934 {
935 long len;
936 const char *ptr;
937 VALUE newstr;
938
939 if (!to) return str;
940 if (!from) from = rb_enc_get(str);
941 if (from == to) return str;
942 if ((rb_enc_asciicompat(to) && is_ascii_string(str)) ||
943 to == rb_ascii8bit_encoding()) {
944 if (STR_ENC_GET(str) != to) {
945 str = rb_str_dup(str);
946 rb_enc_associate(str, to);
947 }
948 return str;
949 }
950
951 RSTRING_GETMEM(str, ptr, len);
952 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
953 from, to, ecflags, ecopts);
954 if (NIL_P(newstr)) {
955 /* some error, return original */
956 return str;
957 }
958 OBJ_INFECT(newstr, str);
959 return newstr;
960 }
961
962 VALUE
rb_str_cat_conv_enc_opts(VALUE newstr,long ofs,const char * ptr,long len,rb_encoding * from,int ecflags,VALUE ecopts)963 rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
964 rb_encoding *from, int ecflags, VALUE ecopts)
965 {
966 long olen;
967
968 olen = RSTRING_LEN(newstr);
969 if (ofs < -olen || olen < ofs)
970 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
971 if (ofs < 0) ofs += olen;
972 if (!from) {
973 STR_SET_LEN(newstr, ofs);
974 return rb_str_cat(newstr, ptr, len);
975 }
976
977 rb_str_modify(newstr);
978 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
979 rb_enc_get(newstr),
980 ecflags, ecopts);
981 }
982
983 VALUE
rb_str_initialize(VALUE str,const char * ptr,long len,rb_encoding * enc)984 rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
985 {
986 STR_SET_LEN(str, 0);
987 rb_enc_associate(str, enc);
988 rb_str_cat(str, ptr, len);
989 return str;
990 }
991
992 static VALUE
str_cat_conv_enc_opts(VALUE newstr,long ofs,const char * ptr,long len,rb_encoding * from,rb_encoding * to,int ecflags,VALUE ecopts)993 str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
994 rb_encoding *from, rb_encoding *to,
995 int ecflags, VALUE ecopts)
996 {
997 rb_econv_t *ec;
998 rb_econv_result_t ret;
999 long olen;
1000 VALUE econv_wrapper;
1001 const unsigned char *start, *sp;
1002 unsigned char *dest, *dp;
1003 size_t converted_output = (size_t)ofs;
1004
1005 olen = rb_str_capacity(newstr);
1006
1007 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1008 RBASIC_CLEAR_CLASS(econv_wrapper);
1009 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1010 if (!ec) return Qnil;
1011 DATA_PTR(econv_wrapper) = ec;
1012
1013 sp = (unsigned char*)ptr;
1014 start = sp;
1015 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1016 (dp = dest + converted_output),
1017 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1018 ret == econv_destination_buffer_full) {
1019 /* destination buffer short */
1020 size_t converted_input = sp - start;
1021 size_t rest = len - converted_input;
1022 converted_output = dp - dest;
1023 rb_str_set_len(newstr, converted_output);
1024 if (converted_input && converted_output &&
1025 rest < (LONG_MAX / converted_output)) {
1026 rest = (rest * converted_output) / converted_input;
1027 }
1028 else {
1029 rest = olen;
1030 }
1031 olen += rest < 2 ? 2 : rest;
1032 rb_str_resize(newstr, olen);
1033 }
1034 DATA_PTR(econv_wrapper) = 0;
1035 rb_econv_close(ec);
1036 rb_gc_force_recycle(econv_wrapper);
1037 switch (ret) {
1038 case econv_finished:
1039 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1040 rb_str_set_len(newstr, len);
1041 rb_enc_associate(newstr, to);
1042 return newstr;
1043
1044 default:
1045 return Qnil;
1046 }
1047 }
1048
1049 VALUE
rb_str_conv_enc(VALUE str,rb_encoding * from,rb_encoding * to)1050 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
1051 {
1052 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1053 }
1054
1055 VALUE
rb_external_str_new_with_enc(const char * ptr,long len,rb_encoding * eenc)1056 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
1057 {
1058 rb_encoding *ienc;
1059 VALUE str;
1060 const int eidx = rb_enc_to_index(eenc);
1061
1062 if (!ptr) {
1063 return rb_tainted_str_new_with_enc(ptr, len, eenc);
1064 }
1065
1066 /* ASCII-8BIT case, no conversion */
1067 if ((eidx == rb_ascii8bit_encindex()) ||
1068 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1069 return rb_tainted_str_new(ptr, len);
1070 }
1071 /* no default_internal or same encoding, no conversion */
1072 ienc = rb_default_internal_encoding();
1073 if (!ienc || eenc == ienc) {
1074 return rb_tainted_str_new_with_enc(ptr, len, eenc);
1075 }
1076 /* ASCII compatible, and ASCII only string, no conversion in
1077 * default_internal */
1078 if ((eidx == rb_ascii8bit_encindex()) ||
1079 (eidx == rb_usascii_encindex()) ||
1080 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1081 return rb_tainted_str_new_with_enc(ptr, len, ienc);
1082 }
1083 /* convert from the given encoding to default_internal */
1084 str = rb_tainted_str_new_with_enc(NULL, 0, ienc);
1085 /* when the conversion failed for some reason, just ignore the
1086 * default_internal and result in the given encoding as-is. */
1087 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1088 rb_str_initialize(str, ptr, len, eenc);
1089 }
1090 return str;
1091 }
1092
1093 VALUE
rb_external_str_with_enc(VALUE str,rb_encoding * eenc)1094 rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1095 {
1096 int eidx = rb_enc_to_index(eenc);
1097 if (eidx == rb_usascii_encindex() &&
1098 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
1099 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1100 return str;
1101 }
1102 rb_enc_associate_index(str, eidx);
1103 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1104 }
1105
1106 VALUE
rb_external_str_new(const char * ptr,long len)1107 rb_external_str_new(const char *ptr, long len)
1108 {
1109 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1110 }
1111
1112 VALUE
rb_external_str_new_cstr(const char * ptr)1113 rb_external_str_new_cstr(const char *ptr)
1114 {
1115 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1116 }
1117
1118 VALUE
rb_locale_str_new(const char * ptr,long len)1119 rb_locale_str_new(const char *ptr, long len)
1120 {
1121 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1122 }
1123
1124 VALUE
rb_locale_str_new_cstr(const char * ptr)1125 rb_locale_str_new_cstr(const char *ptr)
1126 {
1127 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1128 }
1129
1130 VALUE
rb_filesystem_str_new(const char * ptr,long len)1131 rb_filesystem_str_new(const char *ptr, long len)
1132 {
1133 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1134 }
1135
1136 VALUE
rb_filesystem_str_new_cstr(const char * ptr)1137 rb_filesystem_str_new_cstr(const char *ptr)
1138 {
1139 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1140 }
1141
1142 VALUE
rb_str_export(VALUE str)1143 rb_str_export(VALUE str)
1144 {
1145 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_default_external_encoding());
1146 }
1147
1148 VALUE
rb_str_export_locale(VALUE str)1149 rb_str_export_locale(VALUE str)
1150 {
1151 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
1152 }
1153
1154 VALUE
rb_str_export_to_enc(VALUE str,rb_encoding * enc)1155 rb_str_export_to_enc(VALUE str, rb_encoding *enc)
1156 {
1157 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1158 }
1159
1160 static VALUE
str_replace_shared_without_enc(VALUE str2,VALUE str)1161 str_replace_shared_without_enc(VALUE str2, VALUE str)
1162 {
1163 const int termlen = TERM_LEN(str);
1164 char *ptr;
1165 long len;
1166
1167 RSTRING_GETMEM(str, ptr, len);
1168 if (STR_EMBEDDABLE_P(len, termlen)) {
1169 char *ptr2 = RSTRING(str2)->as.ary;
1170 STR_SET_EMBED(str2);
1171 memcpy(ptr2, RSTRING_PTR(str), len);
1172 STR_SET_EMBED_LEN(str2, len);
1173 TERM_FILL(ptr2+len, termlen);
1174 }
1175 else {
1176 VALUE root;
1177 if (STR_SHARED_P(str)) {
1178 root = RSTRING(str)->as.heap.aux.shared;
1179 RSTRING_GETMEM(str, ptr, len);
1180 }
1181 else {
1182 root = rb_str_new_frozen(str);
1183 RSTRING_GETMEM(root, ptr, len);
1184 }
1185 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1186 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1187 rb_fatal("about to free a possible shared root");
1188 }
1189 char *ptr2 = STR_HEAP_PTR(str2);
1190 if (ptr2 != ptr) {
1191 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1192 }
1193 }
1194 FL_SET(str2, STR_NOEMBED);
1195 RSTRING(str2)->as.heap.len = len;
1196 RSTRING(str2)->as.heap.ptr = ptr;
1197 STR_SET_SHARED(str2, root);
1198 }
1199 return str2;
1200 }
1201
1202 static VALUE
str_replace_shared(VALUE str2,VALUE str)1203 str_replace_shared(VALUE str2, VALUE str)
1204 {
1205 str_replace_shared_without_enc(str2, str);
1206 rb_enc_cr_str_exact_copy(str2, str);
1207 return str2;
1208 }
1209
1210 static VALUE
str_new_shared(VALUE klass,VALUE str)1211 str_new_shared(VALUE klass, VALUE str)
1212 {
1213 return str_replace_shared(str_alloc(klass), str);
1214 }
1215
1216 VALUE
rb_str_new_shared(VALUE str)1217 rb_str_new_shared(VALUE str)
1218 {
1219 VALUE str2 = str_new_shared(rb_obj_class(str), str);
1220
1221 OBJ_INFECT(str2, str);
1222 return str2;
1223 }
1224
1225 VALUE
rb_str_new_frozen(VALUE orig)1226 rb_str_new_frozen(VALUE orig)
1227 {
1228 VALUE str;
1229
1230 if (OBJ_FROZEN(orig)) return orig;
1231
1232 str = str_new_frozen(rb_obj_class(orig), orig);
1233 OBJ_INFECT(str, orig);
1234 return str;
1235 }
1236
1237 VALUE
rb_str_tmp_frozen_acquire(VALUE orig)1238 rb_str_tmp_frozen_acquire(VALUE orig)
1239 {
1240 VALUE tmp;
1241
1242 if (OBJ_FROZEN_RAW(orig)) return orig;
1243
1244 tmp = str_new_frozen(0, orig);
1245 OBJ_INFECT(tmp, orig);
1246
1247 return tmp;
1248 }
1249
1250 void
rb_str_tmp_frozen_release(VALUE orig,VALUE tmp)1251 rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1252 {
1253 if (RBASIC_CLASS(tmp) != 0)
1254 return;
1255
1256 if (STR_EMBED_P(tmp)) {
1257 assert(OBJ_FROZEN_RAW(tmp));
1258 rb_gc_force_recycle(tmp);
1259 }
1260 else if (FL_TEST_RAW(orig, STR_SHARED) &&
1261 !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
1262 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1263
1264 if (shared == tmp && !FL_TEST_RAW(tmp, STR_IS_SHARED_M)) {
1265 FL_UNSET_RAW(orig, STR_SHARED);
1266 assert(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1267 assert(RSTRING(orig)->as.heap.len == RSTRING(tmp)->as.heap.len);
1268 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1269 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1270 assert(OBJ_FROZEN_RAW(tmp));
1271 rb_gc_force_recycle(tmp);
1272 }
1273 }
1274 }
1275
1276 static VALUE
str_new_frozen(VALUE klass,VALUE orig)1277 str_new_frozen(VALUE klass, VALUE orig)
1278 {
1279 VALUE str;
1280
1281 if (STR_EMBED_P(orig)) {
1282 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
1283 }
1284 else {
1285 if (FL_TEST_RAW(orig, STR_SHARED)) {
1286 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1287 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING(shared)->as.heap.ptr;
1288 long rest = RSTRING(shared)->as.heap.len - ofs - RSTRING(orig)->as.heap.len;
1289 assert(!STR_EMBED_P(shared));
1290 assert(OBJ_FROZEN(shared));
1291
1292 if ((ofs > 0) || (rest > 0) ||
1293 (klass != RBASIC(shared)->klass) ||
1294 ((RBASIC(shared)->flags ^ RBASIC(orig)->flags) & FL_TAINT) ||
1295 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1296 str = str_new_shared(klass, shared);
1297 RSTRING(str)->as.heap.ptr += ofs;
1298 RSTRING(str)->as.heap.len -= ofs + rest;
1299 }
1300 else {
1301 if (RBASIC_CLASS(shared) == 0)
1302 FL_SET_RAW(shared, STR_IS_SHARED_M);
1303 return shared;
1304 }
1305 }
1306 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1307 str = str_alloc(klass);
1308 STR_SET_EMBED(str);
1309 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1310 STR_SET_EMBED_LEN(str, RSTRING_LEN(orig));
1311 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1312 }
1313 else {
1314 str = str_alloc(klass);
1315 STR_SET_NOEMBED(str);
1316 RSTRING(str)->as.heap.len = RSTRING_LEN(orig);
1317 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1318 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1319 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1320 RBASIC(orig)->flags &= ~STR_NOFREE;
1321 STR_SET_SHARED(orig, str);
1322 if (klass == 0)
1323 FL_UNSET_RAW(str, STR_IS_SHARED_M);
1324 }
1325 }
1326
1327 rb_enc_cr_str_exact_copy(str, orig);
1328 OBJ_FREEZE(str);
1329 return str;
1330 }
1331
1332 VALUE
rb_str_new_with_class(VALUE obj,const char * ptr,long len)1333 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1334 {
1335 return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj));
1336 }
1337
1338 static VALUE
str_new_empty(VALUE str)1339 str_new_empty(VALUE str)
1340 {
1341 VALUE v = rb_str_new_with_class(str, 0, 0);
1342 rb_enc_copy(v, str);
1343 OBJ_INFECT(v, str);
1344 return v;
1345 }
1346
1347 #define STR_BUF_MIN_SIZE 127
1348 STATIC_ASSERT(STR_BUF_MIN_SIZE, STR_BUF_MIN_SIZE > RSTRING_EMBED_LEN_MAX);
1349
1350 VALUE
rb_str_buf_new(long capa)1351 rb_str_buf_new(long capa)
1352 {
1353 VALUE str = str_alloc(rb_cString);
1354
1355 if (capa < STR_BUF_MIN_SIZE) {
1356 capa = STR_BUF_MIN_SIZE;
1357 }
1358 FL_SET(str, STR_NOEMBED);
1359 RSTRING(str)->as.heap.aux.capa = capa;
1360 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1361 RSTRING(str)->as.heap.ptr[0] = '\0';
1362
1363 return str;
1364 }
1365
1366 VALUE
rb_str_buf_new_cstr(const char * ptr)1367 rb_str_buf_new_cstr(const char *ptr)
1368 {
1369 VALUE str;
1370 long len = strlen(ptr);
1371
1372 str = rb_str_buf_new(len);
1373 rb_str_buf_cat(str, ptr, len);
1374
1375 return str;
1376 }
1377
1378 VALUE
rb_str_tmp_new(long len)1379 rb_str_tmp_new(long len)
1380 {
1381 return str_new(0, 0, len);
1382 }
1383
1384 void
rb_str_free(VALUE str)1385 rb_str_free(VALUE str)
1386 {
1387 if (FL_TEST(str, RSTRING_FSTR)) {
1388 st_data_t fstr = (st_data_t)str;
1389 st_delete(rb_vm_fstring_table(), &fstr, NULL);
1390 RB_DEBUG_COUNTER_INC(obj_str_fstr);
1391 }
1392
1393 if (STR_EMBED_P(str)) {
1394 RB_DEBUG_COUNTER_INC(obj_str_embed);
1395 }
1396 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1397 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1398 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1399 }
1400 else {
1401 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1402 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1403 }
1404 }
1405
1406 RUBY_FUNC_EXPORTED size_t
rb_str_memsize(VALUE str)1407 rb_str_memsize(VALUE str)
1408 {
1409 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1410 return STR_HEAP_SIZE(str);
1411 }
1412 else {
1413 return 0;
1414 }
1415 }
1416
1417 VALUE
rb_str_to_str(VALUE str)1418 rb_str_to_str(VALUE str)
1419 {
1420 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1421 }
1422
1423 static inline void str_discard(VALUE str);
1424 static void str_shared_replace(VALUE str, VALUE str2);
1425
1426 void
rb_str_shared_replace(VALUE str,VALUE str2)1427 rb_str_shared_replace(VALUE str, VALUE str2)
1428 {
1429 if (str != str2) str_shared_replace(str, str2);
1430 }
1431
1432 static void
str_shared_replace(VALUE str,VALUE str2)1433 str_shared_replace(VALUE str, VALUE str2)
1434 {
1435 rb_encoding *enc;
1436 int cr;
1437 int termlen;
1438
1439 RUBY_ASSERT(str2 != str);
1440 enc = STR_ENC_GET(str2);
1441 cr = ENC_CODERANGE(str2);
1442 str_discard(str);
1443 OBJ_INFECT(str, str2);
1444 termlen = rb_enc_mbminlen(enc);
1445
1446 if (STR_EMBEDDABLE_P(RSTRING_LEN(str2), termlen)) {
1447 STR_SET_EMBED(str);
1448 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1449 STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
1450 rb_enc_associate(str, enc);
1451 ENC_CODERANGE_SET(str, cr);
1452 }
1453 else {
1454 STR_SET_NOEMBED(str);
1455 FL_UNSET(str, STR_SHARED);
1456 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1457 RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
1458
1459 if (FL_TEST(str2, STR_SHARED)) {
1460 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1461 STR_SET_SHARED(str, shared);
1462 }
1463 else {
1464 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1465 }
1466
1467 /* abandon str2 */
1468 STR_SET_EMBED(str2);
1469 RSTRING_PTR(str2)[0] = 0;
1470 STR_SET_EMBED_LEN(str2, 0);
1471 rb_enc_associate(str, enc);
1472 ENC_CODERANGE_SET(str, cr);
1473 }
1474 }
1475
1476 VALUE
rb_obj_as_string(VALUE obj)1477 rb_obj_as_string(VALUE obj)
1478 {
1479 VALUE str;
1480
1481 if (RB_TYPE_P(obj, T_STRING)) {
1482 return obj;
1483 }
1484 str = rb_funcall(obj, idTo_s, 0);
1485 return rb_obj_as_string_result(str, obj);
1486 }
1487
1488 MJIT_FUNC_EXPORTED VALUE
rb_obj_as_string_result(VALUE str,VALUE obj)1489 rb_obj_as_string_result(VALUE str, VALUE obj)
1490 {
1491 if (!RB_TYPE_P(str, T_STRING))
1492 return rb_any_to_s(obj);
1493 if (!FL_TEST_RAW(str, RSTRING_FSTR) && FL_ABLE(obj))
1494 /* fstring must not be tainted, at least */
1495 OBJ_INFECT_RAW(str, obj);
1496 return str;
1497 }
1498
1499 static VALUE
str_replace(VALUE str,VALUE str2)1500 str_replace(VALUE str, VALUE str2)
1501 {
1502 long len;
1503
1504 len = RSTRING_LEN(str2);
1505 if (STR_SHARED_P(str2)) {
1506 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1507 assert(OBJ_FROZEN(shared));
1508 STR_SET_NOEMBED(str);
1509 RSTRING(str)->as.heap.len = len;
1510 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1511 STR_SET_SHARED(str, shared);
1512 rb_enc_cr_str_exact_copy(str, str2);
1513 }
1514 else {
1515 str_replace_shared(str, str2);
1516 }
1517
1518 OBJ_INFECT(str, str2);
1519 return str;
1520 }
1521
1522 static inline VALUE
str_duplicate(VALUE klass,VALUE str)1523 str_duplicate(VALUE klass, VALUE str)
1524 {
1525 enum {embed_size = RSTRING_EMBED_LEN_MAX + 1};
1526 const VALUE flag_mask =
1527 RSTRING_NOEMBED | RSTRING_EMBED_LEN_MASK |
1528 ENC_CODERANGE_MASK | ENCODING_MASK |
1529 FL_TAINT | FL_FREEZE
1530 ;
1531 VALUE flags = FL_TEST_RAW(str, flag_mask);
1532 VALUE dup = str_alloc(klass);
1533 MEMCPY(RSTRING(dup)->as.ary, RSTRING(str)->as.ary,
1534 char, embed_size);
1535 if (flags & STR_NOEMBED) {
1536 if (FL_TEST_RAW(str, STR_SHARED)) {
1537 str = RSTRING(str)->as.heap.aux.shared;
1538 }
1539 else if (UNLIKELY(!(flags & FL_FREEZE))) {
1540 str = str_new_frozen(klass, str);
1541 FL_SET_RAW(str, flags & FL_TAINT);
1542 flags = FL_TEST_RAW(str, flag_mask);
1543 }
1544 if (flags & STR_NOEMBED) {
1545 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, str);
1546 flags |= STR_SHARED;
1547 }
1548 else {
1549 MEMCPY(RSTRING(dup)->as.ary, RSTRING(str)->as.ary,
1550 char, embed_size);
1551 }
1552 }
1553 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1554 return dup;
1555 }
1556
1557 VALUE
rb_str_dup(VALUE str)1558 rb_str_dup(VALUE str)
1559 {
1560 return str_duplicate(rb_obj_class(str), str);
1561 }
1562
1563 VALUE
rb_str_resurrect(VALUE str)1564 rb_str_resurrect(VALUE str)
1565 {
1566 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1567 return str_duplicate(rb_cString, str);
1568 }
1569
1570 /*
1571 * call-seq:
1572 * String.new(str="") -> new_str
1573 * String.new(str="", encoding: enc) -> new_str
1574 * String.new(str="", capacity: size) -> new_str
1575 *
1576 * Returns a new string object containing a copy of <i>str</i>.
1577 *
1578 * The optional <i>encoding</i> keyword argument specifies the encoding
1579 * of the new string.
1580 * If not specified, the encoding of <i>str</i> is used
1581 * (or ASCII-8BIT, if <i>str</i> is not specified).
1582 *
1583 * The optional <i>capacity</i> keyword argument specifies the size
1584 * of the internal buffer.
1585 * This may improve performance, when the string will be concatenated many
1586 * times (causing many realloc calls).
1587 */
1588
1589 static VALUE
rb_str_init(int argc,VALUE * argv,VALUE str)1590 rb_str_init(int argc, VALUE *argv, VALUE str)
1591 {
1592 static ID keyword_ids[2];
1593 VALUE orig, opt, venc, vcapa;
1594 VALUE kwargs[2];
1595 rb_encoding *enc = 0;
1596 int n;
1597
1598 if (!keyword_ids[0]) {
1599 keyword_ids[0] = rb_id_encoding();
1600 CONST_ID(keyword_ids[1], "capacity");
1601 }
1602
1603 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1604 if (!NIL_P(opt)) {
1605 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
1606 venc = kwargs[0];
1607 vcapa = kwargs[1];
1608 if (venc != Qundef && !NIL_P(venc)) {
1609 enc = rb_to_encoding(venc);
1610 }
1611 if (vcapa != Qundef && !NIL_P(vcapa)) {
1612 long capa = NUM2LONG(vcapa);
1613 long len = 0;
1614 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
1615
1616 if (capa < STR_BUF_MIN_SIZE) {
1617 capa = STR_BUF_MIN_SIZE;
1618 }
1619 if (n == 1) {
1620 StringValue(orig);
1621 len = RSTRING_LEN(orig);
1622 if (capa < len) {
1623 capa = len;
1624 }
1625 if (orig == str) n = 0;
1626 }
1627 str_modifiable(str);
1628 if (STR_EMBED_P(str)) { /* make noembed always */
1629 char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1630 memcpy(new_ptr, RSTRING(str)->as.ary, RSTRING_EMBED_LEN_MAX + 1);
1631 RSTRING(str)->as.heap.ptr = new_ptr;
1632 }
1633 else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
1634 const size_t size = (size_t)capa + termlen;
1635 const char *const old_ptr = RSTRING_PTR(str);
1636 const size_t osize = RSTRING(str)->as.heap.len + TERM_LEN(str);
1637 char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1638 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
1639 FL_UNSET_RAW(str, STR_SHARED);
1640 RSTRING(str)->as.heap.ptr = new_ptr;
1641 }
1642 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
1643 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
1644 (size_t)capa + termlen, STR_HEAP_SIZE(str));
1645 }
1646 RSTRING(str)->as.heap.len = len;
1647 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
1648 if (n == 1) {
1649 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
1650 rb_enc_cr_str_exact_copy(str, orig);
1651 }
1652 FL_SET(str, STR_NOEMBED);
1653 RSTRING(str)->as.heap.aux.capa = capa;
1654 }
1655 else if (n == 1) {
1656 rb_str_replace(str, orig);
1657 }
1658 if (enc) {
1659 rb_enc_associate(str, enc);
1660 ENC_CODERANGE_CLEAR(str);
1661 }
1662 }
1663 else if (n == 1) {
1664 rb_str_replace(str, orig);
1665 }
1666 return str;
1667 }
1668
#ifdef NONASCII_MASK
/* A byte is a UTF-8 lead byte unless its top two bits are 10. */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * UTF-8 lead bytes look like 0xxxxxxx or 11xxxxxx; continuation bytes look
 * like 10xxxxxx (see http://en.wikipedia.org/wiki/UTF-8).  Per byte, the
 * test is therefore "bit6 is set, or bit7 is clear":
 *
 *   if (!(byte & 0x80))
 *       byte |= 0x40;          // turn on bit6
 *   return ((byte>>6) & 1);    // bit6 tells whether this is a lead byte
 *
 * This function applies that test to every byte of the word at *s in
 * parallel and returns how many of them are lead bytes.
 */
static inline uintptr_t
count_utf8_lead_bytes_with_word(const uintptr_t *s)
{
    uintptr_t bits = *s;

    /* After this, bit0 of each byte is 1 exactly for UTF-8 lead bytes. */
    bits = (bits >> 6) | (~bits >> 7);
    bits &= NONASCII_MASK >> 7;

    /* Sum the per-byte flags. */
#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
    /* use only if it can use POPCNT */
    return rb_popcount_intptr(bits);
#else
    bits += (bits >> 8);
    bits += (bits >> 16);
# if SIZEOF_VOIDP == 8
    bits += (bits >> 32);
# endif
    return (bits & 0xF);
#endif
}
#endif
1708
1709 static inline long
enc_strlen(const char * p,const char * e,rb_encoding * enc,int cr)1710 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
1711 {
1712 long c;
1713 const char *q;
1714
1715 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1716 long diff = (long)(e - p);
1717 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
1718 }
1719 #ifdef NONASCII_MASK
1720 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
1721 uintptr_t len = 0;
1722 if ((int)sizeof(uintptr_t) * 2 < e - p) {
1723 const uintptr_t *s, *t;
1724 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
1725 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
1726 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
1727 while (p < (const char *)s) {
1728 if (is_utf8_lead_byte(*p)) len++;
1729 p++;
1730 }
1731 while (s < t) {
1732 len += count_utf8_lead_bytes_with_word(s);
1733 s++;
1734 }
1735 p = (const char *)s;
1736 }
1737 while (p < e) {
1738 if (is_utf8_lead_byte(*p)) len++;
1739 p++;
1740 }
1741 return (long)len;
1742 }
1743 #endif
1744 else if (rb_enc_asciicompat(enc)) {
1745 c = 0;
1746 if (ENC_CODERANGE_CLEAN_P(cr)) {
1747 while (p < e) {
1748 if (ISASCII(*p)) {
1749 q = search_nonascii(p, e);
1750 if (!q)
1751 return c + (e - p);
1752 c += q - p;
1753 p = q;
1754 }
1755 p += rb_enc_fast_mbclen(p, e, enc);
1756 c++;
1757 }
1758 }
1759 else {
1760 while (p < e) {
1761 if (ISASCII(*p)) {
1762 q = search_nonascii(p, e);
1763 if (!q)
1764 return c + (e - p);
1765 c += q - p;
1766 p = q;
1767 }
1768 p += rb_enc_mbclen(p, e, enc);
1769 c++;
1770 }
1771 }
1772 return c;
1773 }
1774
1775 for (c=0; p<e; c++) {
1776 p += rb_enc_mbclen(p, e, enc);
1777 }
1778 return c;
1779 }
1780
1781 long
rb_enc_strlen(const char * p,const char * e,rb_encoding * enc)1782 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
1783 {
1784 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
1785 }
1786
1787 /* To get strlen with cr
1788 * Note that given cr is not used.
1789 */
1790 long
rb_enc_strlen_cr(const char * p,const char * e,rb_encoding * enc,int * cr)1791 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
1792 {
1793 long c;
1794 const char *q;
1795 int ret;
1796
1797 *cr = 0;
1798 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1799 long diff = (long)(e - p);
1800 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
1801 }
1802 else if (rb_enc_asciicompat(enc)) {
1803 c = 0;
1804 while (p < e) {
1805 if (ISASCII(*p)) {
1806 q = search_nonascii(p, e);
1807 if (!q) {
1808 if (!*cr) *cr = ENC_CODERANGE_7BIT;
1809 return c + (e - p);
1810 }
1811 c += q - p;
1812 p = q;
1813 }
1814 ret = rb_enc_precise_mbclen(p, e, enc);
1815 if (MBCLEN_CHARFOUND_P(ret)) {
1816 *cr |= ENC_CODERANGE_VALID;
1817 p += MBCLEN_CHARFOUND_LEN(ret);
1818 }
1819 else {
1820 *cr = ENC_CODERANGE_BROKEN;
1821 p++;
1822 }
1823 c++;
1824 }
1825 if (!*cr) *cr = ENC_CODERANGE_7BIT;
1826 return c;
1827 }
1828
1829 for (c=0; p<e; c++) {
1830 ret = rb_enc_precise_mbclen(p, e, enc);
1831 if (MBCLEN_CHARFOUND_P(ret)) {
1832 *cr |= ENC_CODERANGE_VALID;
1833 p += MBCLEN_CHARFOUND_LEN(ret);
1834 }
1835 else {
1836 *cr = ENC_CODERANGE_BROKEN;
1837 if (p + rb_enc_mbminlen(enc) <= e)
1838 p += rb_enc_mbminlen(enc);
1839 else
1840 p = e;
1841 }
1842 }
1843 if (!*cr) *cr = ENC_CODERANGE_7BIT;
1844 return c;
1845 }
1846
1847 /* enc must be str's enc or rb_enc_check(str, str2) */
1848 static long
str_strlen(VALUE str,rb_encoding * enc)1849 str_strlen(VALUE str, rb_encoding *enc)
1850 {
1851 const char *p, *e;
1852 int cr;
1853
1854 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
1855 if (!enc) enc = STR_ENC_GET(str);
1856 p = RSTRING_PTR(str);
1857 e = RSTRING_END(str);
1858 cr = ENC_CODERANGE(str);
1859
1860 if (cr == ENC_CODERANGE_UNKNOWN) {
1861 long n = rb_enc_strlen_cr(p, e, enc, &cr);
1862 if (cr) ENC_CODERANGE_SET(str, cr);
1863 return n;
1864 }
1865 else {
1866 return enc_strlen(p, e, enc, cr);
1867 }
1868 }
1869
1870 long
rb_str_strlen(VALUE str)1871 rb_str_strlen(VALUE str)
1872 {
1873 return str_strlen(str, NULL);
1874 }
1875
1876 /*
1877 * call-seq:
1878 * str.length -> integer
1879 * str.size -> integer
1880 *
1881 * Returns the character length of <i>str</i>.
1882 */
1883
1884 VALUE
rb_str_length(VALUE str)1885 rb_str_length(VALUE str)
1886 {
1887 return LONG2NUM(str_strlen(str, NULL));
1888 }
1889
1890 /*
1891 * call-seq:
1892 * str.bytesize -> integer
1893 *
1894 * Returns the length of +str+ in bytes.
1895 *
1896 * "\x80\u3042".bytesize #=> 4
1897 * "hello".bytesize #=> 5
1898 */
1899
1900 static VALUE
rb_str_bytesize(VALUE str)1901 rb_str_bytesize(VALUE str)
1902 {
1903 return LONG2NUM(RSTRING_LEN(str));
1904 }
1905
1906 /*
1907 * call-seq:
1908 * str.empty? -> true or false
1909 *
1910 * Returns <code>true</code> if <i>str</i> has a length of zero.
1911 *
1912 * "hello".empty? #=> false
1913 * " ".empty? #=> false
1914 * "".empty? #=> true
1915 */
1916
1917 static VALUE
rb_str_empty(VALUE str)1918 rb_str_empty(VALUE str)
1919 {
1920 if (RSTRING_LEN(str) == 0)
1921 return Qtrue;
1922 return Qfalse;
1923 }
1924
1925 /*
1926 * call-seq:
1927 * str + other_str -> new_str
1928 *
1929 * Concatenation---Returns a new <code>String</code> containing
1930 * <i>other_str</i> concatenated to <i>str</i>.
1931 *
1932 * "Hello from " + self.to_s #=> "Hello from main"
1933 */
1934
1935 VALUE
rb_str_plus(VALUE str1,VALUE str2)1936 rb_str_plus(VALUE str1, VALUE str2)
1937 {
1938 VALUE str3;
1939 rb_encoding *enc;
1940 char *ptr1, *ptr2, *ptr3;
1941 long len1, len2;
1942 int termlen;
1943
1944 StringValue(str2);
1945 enc = rb_enc_check_str(str1, str2);
1946 RSTRING_GETMEM(str1, ptr1, len1);
1947 RSTRING_GETMEM(str2, ptr2, len2);
1948 termlen = rb_enc_mbminlen(enc);
1949 if (len1 > LONG_MAX - len2) {
1950 rb_raise(rb_eArgError, "string size too big");
1951 }
1952 str3 = str_new0(rb_cString, 0, len1+len2, termlen);
1953 ptr3 = RSTRING_PTR(str3);
1954 memcpy(ptr3, ptr1, len1);
1955 memcpy(ptr3+len1, ptr2, len2);
1956 TERM_FILL(&ptr3[len1+len2], termlen);
1957
1958 FL_SET_RAW(str3, OBJ_TAINTED_RAW(str1) | OBJ_TAINTED_RAW(str2));
1959 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
1960 ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
1961 RB_GC_GUARD(str1);
1962 RB_GC_GUARD(str2);
1963 return str3;
1964 }
1965
1966 /*
1967 * call-seq:
1968 * str * integer -> new_str
1969 *
1970 * Copy --- Returns a new String containing +integer+ copies of the receiver.
1971 * +integer+ must be greater than or equal to 0.
1972 *
1973 * "Ho! " * 3 #=> "Ho! Ho! Ho! "
1974 * "Ho! " * 0 #=> ""
1975 */
1976
1977 VALUE
rb_str_times(VALUE str,VALUE times)1978 rb_str_times(VALUE str, VALUE times)
1979 {
1980 VALUE str2;
1981 long n, len;
1982 char *ptr2;
1983 int termlen;
1984
1985 if (times == INT2FIX(1)) {
1986 return rb_str_dup(str);
1987 }
1988 if (times == INT2FIX(0)) {
1989 str2 = str_alloc(rb_obj_class(str));
1990 rb_enc_copy(str2, str);
1991 OBJ_INFECT(str2, str);
1992 return str2;
1993 }
1994 len = NUM2LONG(times);
1995 if (len < 0) {
1996 rb_raise(rb_eArgError, "negative argument");
1997 }
1998 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
1999 str2 = str_alloc(rb_obj_class(str));
2000 if (!STR_EMBEDDABLE_P(len, 1)) {
2001 RSTRING(str2)->as.heap.aux.capa = len;
2002 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2003 STR_SET_NOEMBED(str2);
2004 }
2005 STR_SET_LEN(str2, len);
2006 rb_enc_copy(str2, str);
2007 OBJ_INFECT(str2, str);
2008 return str2;
2009 }
2010 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2011 rb_raise(rb_eArgError, "argument too big");
2012 }
2013
2014 len *= RSTRING_LEN(str);
2015 termlen = TERM_LEN(str);
2016 str2 = str_new0(rb_obj_class(str), 0, len, termlen);
2017 ptr2 = RSTRING_PTR(str2);
2018 if (len) {
2019 n = RSTRING_LEN(str);
2020 memcpy(ptr2, RSTRING_PTR(str), n);
2021 while (n <= len/2) {
2022 memcpy(ptr2 + n, ptr2, n);
2023 n *= 2;
2024 }
2025 memcpy(ptr2 + n, ptr2, len-n);
2026 }
2027 STR_SET_LEN(str2, len);
2028 TERM_FILL(&ptr2[len], termlen);
2029 OBJ_INFECT(str2, str);
2030 rb_enc_cr_str_copy_for_substr(str2, str);
2031
2032 return str2;
2033 }
2034
2035 /*
2036 * call-seq:
2037 * str % arg -> new_str
2038 *
2039 * Format---Uses <i>str</i> as a format specification, and returns the result
2040 * of applying it to <i>arg</i>. If the format specification contains more than
2041 * one substitution, then <i>arg</i> must be an <code>Array</code> or <code>Hash</code>
2042 * containing the values to be substituted. See <code>Kernel::sprintf</code> for
2043 * details of the format string.
2044 *
2045 * "%05d" % 123 #=> "00123"
2046 * "%-5s: %016x" % [ "ID", self.object_id ] #=> "ID : 00002b054ec93168"
2047 * "foo = %{foo}" % { :foo => 'bar' } #=> "foo = bar"
2048 */
2049
2050 static VALUE
rb_str_format_m(VALUE str,VALUE arg)2051 rb_str_format_m(VALUE str, VALUE arg)
2052 {
2053 VALUE tmp = rb_check_array_type(arg);
2054
2055 if (!NIL_P(tmp)) {
2056 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2057 }
2058 return rb_str_format(1, &arg, str);
2059 }
2060
2061 static inline void
rb_check_lockedtmp(VALUE str)2062 rb_check_lockedtmp(VALUE str)
2063 {
2064 if (FL_TEST(str, STR_TMPLOCK)) {
2065 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2066 }
2067 }
2068
2069 static inline void
str_modifiable(VALUE str)2070 str_modifiable(VALUE str)
2071 {
2072 rb_check_lockedtmp(str);
2073 rb_check_frozen(str);
2074 }
2075
2076 static inline int
str_dependent_p(VALUE str)2077 str_dependent_p(VALUE str)
2078 {
2079 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2080 return 0;
2081 }
2082 else {
2083 return 1;
2084 }
2085 }
2086
2087 static inline int
str_independent(VALUE str)2088 str_independent(VALUE str)
2089 {
2090 str_modifiable(str);
2091 return !str_dependent_p(str);
2092 }
2093
2094 static void
str_make_independent_expand(VALUE str,long len,long expand,const int termlen)2095 str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2096 {
2097 char *ptr;
2098 char *oldptr;
2099 long capa = len + expand;
2100
2101 if (len > capa) len = capa;
2102
2103 if (!STR_EMBED_P(str) && STR_EMBEDDABLE_P(capa, termlen)) {
2104 ptr = RSTRING(str)->as.heap.ptr;
2105 STR_SET_EMBED(str);
2106 memcpy(RSTRING(str)->as.ary, ptr, len);
2107 TERM_FILL(RSTRING(str)->as.ary + len, termlen);
2108 STR_SET_EMBED_LEN(str, len);
2109 return;
2110 }
2111
2112 ptr = ALLOC_N(char, (size_t)capa + termlen);
2113 oldptr = RSTRING_PTR(str);
2114 if (oldptr) {
2115 memcpy(ptr, oldptr, len);
2116 }
2117 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2118 xfree(oldptr);
2119 }
2120 STR_SET_NOEMBED(str);
2121 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2122 TERM_FILL(ptr + len, termlen);
2123 RSTRING(str)->as.heap.ptr = ptr;
2124 RSTRING(str)->as.heap.len = len;
2125 RSTRING(str)->as.heap.aux.capa = capa;
2126 }
2127
2128 void
rb_str_modify(VALUE str)2129 rb_str_modify(VALUE str)
2130 {
2131 if (!str_independent(str))
2132 str_make_independent(str);
2133 ENC_CODERANGE_CLEAR(str);
2134 }
2135
2136 void
rb_str_modify_expand(VALUE str,long expand)2137 rb_str_modify_expand(VALUE str, long expand)
2138 {
2139 int termlen = TERM_LEN(str);
2140 long len = RSTRING_LEN(str);
2141
2142 if (expand < 0) {
2143 rb_raise(rb_eArgError, "negative expanding string size");
2144 }
2145 if (expand > LONG_MAX - len) {
2146 rb_raise(rb_eArgError, "string size too big");
2147 }
2148
2149 if (!str_independent(str)) {
2150 str_make_independent_expand(str, len, expand, termlen);
2151 }
2152 else if (expand > 0) {
2153 RESIZE_CAPA_TERM(str, len + expand, termlen);
2154 }
2155 ENC_CODERANGE_CLEAR(str);
2156 }
2157
2158 /* As rb_str_modify(), but don't clear coderange */
2159 static void
str_modify_keep_cr(VALUE str)2160 str_modify_keep_cr(VALUE str)
2161 {
2162 if (!str_independent(str))
2163 str_make_independent(str);
2164 if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
2165 /* Force re-scan later */
2166 ENC_CODERANGE_CLEAR(str);
2167 }
2168
2169 static inline void
str_discard(VALUE str)2170 str_discard(VALUE str)
2171 {
2172 str_modifiable(str);
2173 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2174 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2175 RSTRING(str)->as.heap.ptr = 0;
2176 RSTRING(str)->as.heap.len = 0;
2177 }
2178 }
2179
2180 void
rb_must_asciicompat(VALUE str)2181 rb_must_asciicompat(VALUE str)
2182 {
2183 rb_encoding *enc = rb_enc_get(str);
2184 if (!rb_enc_asciicompat(enc)) {
2185 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2186 }
2187 }
2188
2189 VALUE
rb_string_value(volatile VALUE * ptr)2190 rb_string_value(volatile VALUE *ptr)
2191 {
2192 VALUE s = *ptr;
2193 if (!RB_TYPE_P(s, T_STRING)) {
2194 s = rb_str_to_str(s);
2195 *ptr = s;
2196 }
2197 return s;
2198 }
2199
2200 char *
rb_string_value_ptr(volatile VALUE * ptr)2201 rb_string_value_ptr(volatile VALUE *ptr)
2202 {
2203 VALUE str = rb_string_value(ptr);
2204 return RSTRING_PTR(str);
2205 }
2206
/*
 * Returns 1 when the first n bytes at s are all zero, and 0 as soon as a
 * non-zero byte is found.  A non-positive n yields 1 without reading s.
 */
static int
zero_filled(const char *s, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        if (s[i] != '\0') return 0;
    }
    return 1;
}
2215
2216 static const char *
str_null_char(const char * s,long len,const int minlen,rb_encoding * enc)2217 str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2218 {
2219 const char *e = s + len;
2220
2221 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2222 if (zero_filled(s, minlen)) return s;
2223 }
2224 return 0;
2225 }
2226
2227 static char *
str_fill_term(VALUE str,char * s,long len,int termlen)2228 str_fill_term(VALUE str, char *s, long len, int termlen)
2229 {
2230 /* This function assumes that (capa + termlen) bytes of memory
2231 * is allocated, like many other functions in this file.
2232 */
2233 if (str_dependent_p(str)) {
2234 if (!zero_filled(s + len, termlen))
2235 str_make_independent_expand(str, len, 0L, termlen);
2236 }
2237 else {
2238 TERM_FILL(s + len, termlen);
2239 return s;
2240 }
2241 return RSTRING_PTR(str);
2242 }
2243
2244 void
rb_str_change_terminator_length(VALUE str,const int oldtermlen,const int termlen)2245 rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2246 {
2247 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2248 long len = RSTRING_LEN(str);
2249
2250 assert(capa >= len);
2251 if (capa - len < termlen) {
2252 rb_check_lockedtmp(str);
2253 str_make_independent_expand(str, len, 0L, termlen);
2254 }
2255 else if (str_dependent_p(str)) {
2256 if (termlen > oldtermlen)
2257 str_make_independent_expand(str, len, 0L, termlen);
2258 }
2259 else {
2260 if (!STR_EMBED_P(str)) {
2261 /* modify capa instead of realloc */
2262 assert(!FL_TEST((str), STR_SHARED));
2263 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2264 }
2265 if (termlen > oldtermlen) {
2266 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2267 }
2268 }
2269
2270 return;
2271 }
2272
2273 static char *
str_null_check(VALUE str,int * w)2274 str_null_check(VALUE str, int *w)
2275 {
2276 char *s = RSTRING_PTR(str);
2277 long len = RSTRING_LEN(str);
2278 rb_encoding *enc = rb_enc_get(str);
2279 const int minlen = rb_enc_mbminlen(enc);
2280
2281 if (minlen > 1) {
2282 *w = 1;
2283 if (str_null_char(s, len, minlen, enc)) {
2284 return NULL;
2285 }
2286 return str_fill_term(str, s, len, minlen);
2287 }
2288 *w = 0;
2289 if (!s || memchr(s, 0, len)) {
2290 return NULL;
2291 }
2292 if (s[len]) {
2293 s = str_fill_term(str, s, len, minlen);
2294 }
2295 return s;
2296 }
2297
2298 char *
rb_str_to_cstr(VALUE str)2299 rb_str_to_cstr(VALUE str)
2300 {
2301 int w;
2302 return str_null_check(str, &w);
2303 }
2304
2305 char *
rb_string_value_cstr(volatile VALUE * ptr)2306 rb_string_value_cstr(volatile VALUE *ptr)
2307 {
2308 VALUE str = rb_string_value(ptr);
2309 int w;
2310 char *s = str_null_check(str, &w);
2311 if (!s) {
2312 if (w) {
2313 rb_raise(rb_eArgError, "string contains null char");
2314 }
2315 rb_raise(rb_eArgError, "string contains null byte");
2316 }
2317 return s;
2318 }
2319
2320 char *
rb_str_fill_terminator(VALUE str,const int newminlen)2321 rb_str_fill_terminator(VALUE str, const int newminlen)
2322 {
2323 char *s = RSTRING_PTR(str);
2324 long len = RSTRING_LEN(str);
2325 return str_fill_term(str, s, len, newminlen);
2326 }
2327
2328 VALUE
rb_check_string_type(VALUE str)2329 rb_check_string_type(VALUE str)
2330 {
2331 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2332 return str;
2333 }
2334
2335 /*
2336 * call-seq:
2337 * String.try_convert(obj) -> string or nil
2338 *
2339 * Try to convert <i>obj</i> into a String, using to_str method.
2340 * Returns converted string or nil if <i>obj</i> cannot be converted
2341 * for any reason.
2342 *
2343 * String.try_convert("str") #=> "str"
2344 * String.try_convert(/re/) #=> nil
2345 */
2346 static VALUE
rb_str_s_try_convert(VALUE dummy,VALUE str)2347 rb_str_s_try_convert(VALUE dummy, VALUE str)
2348 {
2349 return rb_check_string_type(str);
2350 }
2351
2352 static char*
str_nth_len(const char * p,const char * e,long * nthp,rb_encoding * enc)2353 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2354 {
2355 long nth = *nthp;
2356 if (rb_enc_mbmaxlen(enc) == 1) {
2357 p += nth;
2358 }
2359 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2360 p += nth * rb_enc_mbmaxlen(enc);
2361 }
2362 else if (rb_enc_asciicompat(enc)) {
2363 const char *p2, *e2;
2364 int n;
2365
2366 while (p < e && 0 < nth) {
2367 e2 = p + nth;
2368 if (e < e2) {
2369 *nthp = nth;
2370 return (char *)e;
2371 }
2372 if (ISASCII(*p)) {
2373 p2 = search_nonascii(p, e2);
2374 if (!p2) {
2375 nth -= e2 - p;
2376 *nthp = nth;
2377 return (char *)e2;
2378 }
2379 nth -= p2 - p;
2380 p = p2;
2381 }
2382 n = rb_enc_mbclen(p, e, enc);
2383 p += n;
2384 nth--;
2385 }
2386 *nthp = nth;
2387 if (nth != 0) {
2388 return (char *)e;
2389 }
2390 return (char *)p;
2391 }
2392 else {
2393 while (p < e && nth--) {
2394 p += rb_enc_mbclen(p, e, enc);
2395 }
2396 }
2397 if (p > e) p = e;
2398 *nthp = nth;
2399 return (char*)p;
2400 }
2401
2402 char*
rb_enc_nth(const char * p,const char * e,long nth,rb_encoding * enc)2403 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2404 {
2405 return str_nth_len(p, e, &nth, enc);
2406 }
2407
2408 static char*
str_nth(const char * p,const char * e,long nth,rb_encoding * enc,int singlebyte)2409 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2410 {
2411 if (singlebyte)
2412 p += nth;
2413 else {
2414 p = str_nth_len(p, e, &nth, enc);
2415 }
2416 if (!p) return 0;
2417 if (p > e) p = e;
2418 return (char *)p;
2419 }
2420
2421 /* char offset to byte offset */
2422 static long
str_offset(const char * p,const char * e,long nth,rb_encoding * enc,int singlebyte)2423 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2424 {
2425 const char *pp = str_nth(p, e, nth, enc, singlebyte);
2426 if (!pp) return e - p;
2427 return pp - p;
2428 }
2429
2430 long
rb_str_offset(VALUE str,long pos)2431 rb_str_offset(VALUE str, long pos)
2432 {
2433 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2434 STR_ENC_GET(str), single_byte_optimizable(str));
2435 }
2436
2437 #ifdef NONASCII_MASK
2438 static char *
str_utf8_nth(const char * p,const char * e,long * nthp)2439 str_utf8_nth(const char *p, const char *e, long *nthp)
2440 {
2441 long nth = *nthp;
2442 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
2443 const uintptr_t *s, *t;
2444 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
2445 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2446 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2447 while (p < (const char *)s) {
2448 if (is_utf8_lead_byte(*p)) nth--;
2449 p++;
2450 }
2451 do {
2452 nth -= count_utf8_lead_bytes_with_word(s);
2453 s++;
2454 } while (s < t && (int)SIZEOF_VOIDP <= nth);
2455 p = (char *)s;
2456 }
2457 while (p < e) {
2458 if (is_utf8_lead_byte(*p)) {
2459 if (nth == 0) break;
2460 nth--;
2461 }
2462 p++;
2463 }
2464 *nthp = nth;
2465 return (char *)p;
2466 }
2467
/*
 * Returns the byte distance from +p+ to the position str_utf8_nth()
 * reaches for +nth+ characters.
 */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *reached = str_utf8_nth(p, e, &remaining);

    return reached - p;
}
2474 #endif
2475
2476 /* byte offset to char offset */
2477 long
rb_str_sublen(VALUE str,long pos)2478 rb_str_sublen(VALUE str, long pos)
2479 {
2480 if (single_byte_optimizable(str) || pos < 0)
2481 return pos;
2482 else {
2483 char *p = RSTRING_PTR(str);
2484 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
2485 }
2486 }
2487
2488 VALUE
rb_str_subseq(VALUE str,long beg,long len)2489 rb_str_subseq(VALUE str, long beg, long len)
2490 {
2491 VALUE str2;
2492
2493 if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) &&
2494 SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
2495 long olen;
2496 str2 = rb_str_new_shared(rb_str_new_frozen(str));
2497 RSTRING(str2)->as.heap.ptr += beg;
2498 olen = RSTRING(str2)->as.heap.len;
2499 if (olen > len) RSTRING(str2)->as.heap.len = len;
2500 }
2501 else {
2502 str2 = rb_str_new_with_class(str, RSTRING_PTR(str)+beg, len);
2503 RB_GC_GUARD(str);
2504 }
2505
2506 rb_enc_cr_str_copy_for_substr(str2, str);
2507 OBJ_INFECT(str2, str);
2508
2509 return str2;
2510 }
2511
2512 char *
rb_str_subpos(VALUE str,long beg,long * lenp)2513 rb_str_subpos(VALUE str, long beg, long *lenp)
2514 {
2515 long len = *lenp;
2516 long slen = -1L;
2517 long blen = RSTRING_LEN(str);
2518 rb_encoding *enc = STR_ENC_GET(str);
2519 char *p, *s = RSTRING_PTR(str), *e = s + blen;
2520
2521 if (len < 0) return 0;
2522 if (!blen) {
2523 len = 0;
2524 }
2525 if (single_byte_optimizable(str)) {
2526 if (beg > blen) return 0;
2527 if (beg < 0) {
2528 beg += blen;
2529 if (beg < 0) return 0;
2530 }
2531 if (len > blen - beg)
2532 len = blen - beg;
2533 if (len < 0) return 0;
2534 p = s + beg;
2535 goto end;
2536 }
2537 if (beg < 0) {
2538 if (len > -beg) len = -beg;
2539 if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
2540 beg = -beg;
2541 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
2542 p = e;
2543 if (!p) return 0;
2544 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
2545 if (!p) return 0;
2546 len = e - p;
2547 goto end;
2548 }
2549 else {
2550 slen = str_strlen(str, enc);
2551 beg += slen;
2552 if (beg < 0) return 0;
2553 p = s + beg;
2554 if (len == 0) goto end;
2555 }
2556 }
2557 else if (beg > 0 && beg > RSTRING_LEN(str)) {
2558 return 0;
2559 }
2560 if (len == 0) {
2561 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
2562 p = s + beg;
2563 }
2564 #ifdef NONASCII_MASK
2565 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
2566 enc == rb_utf8_encoding()) {
2567 p = str_utf8_nth(s, e, &beg);
2568 if (beg > 0) return 0;
2569 len = str_utf8_offset(p, e, len);
2570 }
2571 #endif
2572 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2573 int char_sz = rb_enc_mbmaxlen(enc);
2574
2575 p = s + beg * char_sz;
2576 if (p > e) {
2577 return 0;
2578 }
2579 else if (len * char_sz > e - p)
2580 len = e - p;
2581 else
2582 len *= char_sz;
2583 }
2584 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
2585 if (beg > 0) return 0;
2586 len = 0;
2587 }
2588 else {
2589 len = str_offset(p, e, len, enc, 0);
2590 }
2591 end:
2592 *lenp = len;
2593 RB_GC_GUARD(str);
2594 return p;
2595 }
2596
2597 static VALUE str_substr(VALUE str, long beg, long len, int empty);
2598
2599 VALUE
rb_str_substr(VALUE str,long beg,long len)2600 rb_str_substr(VALUE str, long beg, long len)
2601 {
2602 return str_substr(str, beg, len, TRUE);
2603 }
2604
2605 static VALUE
str_substr(VALUE str,long beg,long len,int empty)2606 str_substr(VALUE str, long beg, long len, int empty)
2607 {
2608 VALUE str2;
2609 char *p = rb_str_subpos(str, beg, &len);
2610
2611 if (!p) return Qnil;
2612 if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) &&
2613 SHARABLE_SUBSTRING_P(p, len, RSTRING_END(str))) {
2614 long ofs = p - RSTRING_PTR(str);
2615 str2 = rb_str_new_frozen(str);
2616 str2 = str_new_shared(rb_obj_class(str2), str2);
2617 RSTRING(str2)->as.heap.ptr += ofs;
2618 RSTRING(str2)->as.heap.len = len;
2619 ENC_CODERANGE_CLEAR(str2);
2620 }
2621 else {
2622 if (!len && !empty) return Qnil;
2623 str2 = rb_str_new_with_class(str, p, len);
2624 OBJ_INFECT(str2, str);
2625 RB_GC_GUARD(str);
2626 }
2627 rb_enc_cr_str_copy_for_substr(str2, str);
2628
2629 return str2;
2630 }
2631
2632 VALUE
rb_str_freeze(VALUE str)2633 rb_str_freeze(VALUE str)
2634 {
2635 if (OBJ_FROZEN(str)) return str;
2636 rb_str_resize(str, RSTRING_LEN(str));
2637 return rb_obj_freeze(str);
2638 }
2639
2640
2641 /*
2642 * call-seq:
2643 * +str -> str (mutable)
2644 *
2645 * If the string is frozen, then return duplicated mutable string.
2646 *
2647 * If the string is not frozen, then return the string itself.
2648 */
2649 static VALUE
str_uplus(VALUE str)2650 str_uplus(VALUE str)
2651 {
2652 if (OBJ_FROZEN(str)) {
2653 return rb_str_dup(str);
2654 }
2655 else {
2656 return str;
2657 }
2658 }
2659
2660 /*
2661 * call-seq:
2662 * -str -> str (frozen)
2663 *
2664 * Returns a frozen, possibly pre-existing copy of the string.
2665 *
2666 * The string will be deduplicated as long as it is not tainted,
2667 * or has any instance variables set on it.
2668 */
2669 static VALUE
str_uminus(VALUE str)2670 str_uminus(VALUE str)
2671 {
2672 return rb_fstring(str);
2673 }
2674
rb_str_dup_frozen(VALUE str)2675 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
2676 #define rb_str_dup_frozen rb_str_new_frozen
2677
2678 VALUE
2679 rb_str_locktmp(VALUE str)
2680 {
2681 if (FL_TEST(str, STR_TMPLOCK)) {
2682 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
2683 }
2684 FL_SET(str, STR_TMPLOCK);
2685 return str;
2686 }
2687
2688 VALUE
rb_str_unlocktmp(VALUE str)2689 rb_str_unlocktmp(VALUE str)
2690 {
2691 if (!FL_TEST(str, STR_TMPLOCK)) {
2692 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
2693 }
2694 FL_UNSET(str, STR_TMPLOCK);
2695 return str;
2696 }
2697
2698 RUBY_FUNC_EXPORTED VALUE
rb_str_locktmp_ensure(VALUE str,VALUE (* func)(VALUE),VALUE arg)2699 rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
2700 {
2701 rb_str_locktmp(str);
2702 return rb_ensure(func, arg, rb_str_unlocktmp, str);
2703 }
2704
2705 void
rb_str_set_len(VALUE str,long len)2706 rb_str_set_len(VALUE str, long len)
2707 {
2708 long capa;
2709 const int termlen = TERM_LEN(str);
2710
2711 str_modifiable(str);
2712 if (STR_SHARED_P(str)) {
2713 rb_raise(rb_eRuntimeError, "can't set length of shared string");
2714 }
2715 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
2716 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
2717 }
2718 STR_SET_LEN(str, len);
2719 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
2720 }
2721
2722 VALUE
rb_str_resize(VALUE str,long len)2723 rb_str_resize(VALUE str, long len)
2724 {
2725 long slen;
2726 int independent;
2727
2728 if (len < 0) {
2729 rb_raise(rb_eArgError, "negative string size (or size too big)");
2730 }
2731
2732 independent = str_independent(str);
2733 ENC_CODERANGE_CLEAR(str);
2734 slen = RSTRING_LEN(str);
2735
2736 {
2737 long capa;
2738 const int termlen = TERM_LEN(str);
2739 if (STR_EMBED_P(str)) {
2740 if (len == slen) return str;
2741 if (STR_EMBEDDABLE_P(len, termlen)) {
2742 STR_SET_EMBED_LEN(str, len);
2743 TERM_FILL(RSTRING(str)->as.ary + len, termlen);
2744 return str;
2745 }
2746 str_make_independent_expand(str, slen, len - slen, termlen);
2747 }
2748 else if (STR_EMBEDDABLE_P(len, termlen)) {
2749 char *ptr = STR_HEAP_PTR(str);
2750 STR_SET_EMBED(str);
2751 if (slen > len) slen = len;
2752 if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
2753 TERM_FILL(RSTRING(str)->as.ary + len, termlen);
2754 STR_SET_EMBED_LEN(str, len);
2755 if (independent) ruby_xfree(ptr);
2756 return str;
2757 }
2758 else if (!independent) {
2759 if (len == slen) return str;
2760 str_make_independent_expand(str, slen, len - slen, termlen);
2761 }
2762 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
2763 (capa - len) > (len < 1024 ? len : 1024)) {
2764 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2765 (size_t)len + termlen, STR_HEAP_SIZE(str));
2766 RSTRING(str)->as.heap.aux.capa = len;
2767 }
2768 else if (len == slen) return str;
2769 RSTRING(str)->as.heap.len = len;
2770 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
2771 }
2772 return str;
2773 }
2774
2775 static VALUE
str_buf_cat(VALUE str,const char * ptr,long len)2776 str_buf_cat(VALUE str, const char *ptr, long len)
2777 {
2778 long capa, total, olen, off = -1;
2779 char *sptr;
2780 const int termlen = TERM_LEN(str);
2781 assert(termlen < RSTRING_EMBED_LEN_MAX + 1); /* < (LONG_MAX/2) */
2782
2783 RSTRING_GETMEM(str, sptr, olen);
2784 if (ptr >= sptr && ptr <= sptr + olen) {
2785 off = ptr - sptr;
2786 }
2787 rb_str_modify(str);
2788 if (len == 0) return 0;
2789 if (STR_EMBED_P(str)) {
2790 capa = RSTRING_EMBED_LEN_MAX + 1 - termlen;
2791 sptr = RSTRING(str)->as.ary;
2792 olen = RSTRING_EMBED_LEN(str);
2793 }
2794 else {
2795 capa = RSTRING(str)->as.heap.aux.capa;
2796 sptr = RSTRING(str)->as.heap.ptr;
2797 olen = RSTRING(str)->as.heap.len;
2798 }
2799 if (olen > LONG_MAX - len) {
2800 rb_raise(rb_eArgError, "string sizes too big");
2801 }
2802 total = olen + len;
2803 if (capa < total) {
2804 if (total >= LONG_MAX / 2) {
2805 capa = total;
2806 }
2807 while (total > capa) {
2808 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
2809 }
2810 RESIZE_CAPA_TERM(str, capa, termlen);
2811 sptr = RSTRING_PTR(str);
2812 }
2813 if (off != -1) {
2814 ptr = sptr + off;
2815 }
2816 memcpy(sptr + olen, ptr, len);
2817 STR_SET_LEN(str, total);
2818 TERM_FILL(sptr + total, termlen); /* sentinel */
2819
2820 return str;
2821 }
2822
2823 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
2824
2825 VALUE
rb_str_cat(VALUE str,const char * ptr,long len)2826 rb_str_cat(VALUE str, const char *ptr, long len)
2827 {
2828 if (len == 0) return str;
2829 if (len < 0) {
2830 rb_raise(rb_eArgError, "negative string size (or size too big)");
2831 }
2832 return str_buf_cat(str, ptr, len);
2833 }
2834
2835 VALUE
rb_str_cat_cstr(VALUE str,const char * ptr)2836 rb_str_cat_cstr(VALUE str, const char *ptr)
2837 {
2838 must_not_null(ptr);
2839 return rb_str_buf_cat(str, ptr, strlen(ptr));
2840 }
2841
rb_str_buf_cat(VALUE str,const char * ptr,long len)2842 RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
2843 RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
2844 RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
2845
2846 static VALUE
2847 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
2848 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
2849 {
2850 int str_encindex = ENCODING_GET(str);
2851 int res_encindex;
2852 int str_cr, res_cr;
2853 rb_encoding *str_enc, *ptr_enc;
2854
2855 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
2856
2857 if (str_encindex == ptr_encindex) {
2858 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
2859 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
2860 }
2861 }
2862 else {
2863 str_enc = rb_enc_from_index(str_encindex);
2864 ptr_enc = rb_enc_from_index(ptr_encindex);
2865 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
2866 if (len == 0)
2867 return str;
2868 if (RSTRING_LEN(str) == 0) {
2869 rb_str_buf_cat(str, ptr, len);
2870 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
2871 return str;
2872 }
2873 goto incompatible;
2874 }
2875 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
2876 ptr_cr = coderange_scan(ptr, len, ptr_enc);
2877 }
2878 if (str_cr == ENC_CODERANGE_UNKNOWN) {
2879 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
2880 str_cr = rb_enc_str_coderange(str);
2881 }
2882 }
2883 }
2884 if (ptr_cr_ret)
2885 *ptr_cr_ret = ptr_cr;
2886
2887 if (str_encindex != ptr_encindex &&
2888 str_cr != ENC_CODERANGE_7BIT &&
2889 ptr_cr != ENC_CODERANGE_7BIT) {
2890 str_enc = rb_enc_from_index(str_encindex);
2891 ptr_enc = rb_enc_from_index(ptr_encindex);
2892 incompatible:
2893 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
2894 rb_enc_name(str_enc), rb_enc_name(ptr_enc));
2895 }
2896
2897 if (str_cr == ENC_CODERANGE_UNKNOWN) {
2898 res_encindex = str_encindex;
2899 res_cr = ENC_CODERANGE_UNKNOWN;
2900 }
2901 else if (str_cr == ENC_CODERANGE_7BIT) {
2902 if (ptr_cr == ENC_CODERANGE_7BIT) {
2903 res_encindex = str_encindex;
2904 res_cr = ENC_CODERANGE_7BIT;
2905 }
2906 else {
2907 res_encindex = ptr_encindex;
2908 res_cr = ptr_cr;
2909 }
2910 }
2911 else if (str_cr == ENC_CODERANGE_VALID) {
2912 res_encindex = str_encindex;
2913 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
2914 res_cr = str_cr;
2915 else
2916 res_cr = ptr_cr;
2917 }
2918 else { /* str_cr == ENC_CODERANGE_BROKEN */
2919 res_encindex = str_encindex;
2920 res_cr = str_cr;
2921 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
2922 }
2923
2924 if (len < 0) {
2925 rb_raise(rb_eArgError, "negative string size (or size too big)");
2926 }
2927 str_buf_cat(str, ptr, len);
2928 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
2929 return str;
2930 }
2931
2932 VALUE
rb_enc_str_buf_cat(VALUE str,const char * ptr,long len,rb_encoding * ptr_enc)2933 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
2934 {
2935 return rb_enc_cr_str_buf_cat(str, ptr, len,
2936 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
2937 }
2938
2939 VALUE
rb_str_buf_cat_ascii(VALUE str,const char * ptr)2940 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
2941 {
2942 /* ptr must reference NUL terminated ASCII string. */
2943 int encindex = ENCODING_GET(str);
2944 rb_encoding *enc = rb_enc_from_index(encindex);
2945 if (rb_enc_asciicompat(enc)) {
2946 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
2947 encindex, ENC_CODERANGE_7BIT, 0);
2948 }
2949 else {
2950 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
2951 while (*ptr) {
2952 unsigned int c = (unsigned char)*ptr;
2953 int len = rb_enc_codelen(c, enc);
2954 rb_enc_mbcput(c, buf, enc);
2955 rb_enc_cr_str_buf_cat(str, buf, len,
2956 encindex, ENC_CODERANGE_VALID, 0);
2957 ptr++;
2958 }
2959 return str;
2960 }
2961 }
2962
2963 VALUE
rb_str_buf_append(VALUE str,VALUE str2)2964 rb_str_buf_append(VALUE str, VALUE str2)
2965 {
2966 int str2_cr;
2967
2968 str2_cr = ENC_CODERANGE(str2);
2969
2970 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
2971 ENCODING_GET(str2), str2_cr, &str2_cr);
2972
2973 OBJ_INFECT(str, str2);
2974 ENC_CODERANGE_SET(str2, str2_cr);
2975
2976 return str;
2977 }
2978
2979 VALUE
rb_str_append(VALUE str,VALUE str2)2980 rb_str_append(VALUE str, VALUE str2)
2981 {
2982 StringValue(str2);
2983 return rb_str_buf_append(str, str2);
2984 }
2985
2986 #define MIN_PRE_ALLOC_SIZE 48
2987
2988 MJIT_FUNC_EXPORTED VALUE
rb_str_concat_literals(size_t num,const VALUE * strary)2989 rb_str_concat_literals(size_t num, const VALUE *strary)
2990 {
2991 VALUE str;
2992 size_t i, s;
2993 long len = 1;
2994
2995 if (UNLIKELY(!num)) return rb_str_new(0, 0);
2996 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
2997
2998 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
2999 if (LIKELY(len < MIN_PRE_ALLOC_SIZE)) {
3000 str = rb_str_resurrect(strary[0]);
3001 s = 1;
3002 }
3003 else {
3004 str = rb_str_buf_new(len);
3005 rb_enc_copy(str, strary[0]);
3006 s = 0;
3007 }
3008
3009 for (i = s; i < num; ++i) {
3010 const VALUE v = strary[i];
3011 int encidx = ENCODING_GET(v);
3012
3013 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(v), RSTRING_LEN(v),
3014 encidx, ENC_CODERANGE(v), NULL);
3015 OBJ_INFECT_RAW(str, v);
3016 if (encidx != ENCINDEX_US_ASCII) {
3017 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3018 rb_enc_set_index(str, encidx);
3019 }
3020 }
3021 return str;
3022 }
3023
3024 /*
3025 * call-seq:
3026 * str.concat(obj1, obj2, ...) -> str
3027 *
3028 * Concatenates the given object(s) to <i>str</i>. If an object is an
3029 * <code>Integer</code>, it is considered a codepoint and converted
3030 * to a character before concatenation.
3031 *
3032 * +concat+ can take multiple arguments, and all the arguments are
3033 * concatenated in order.
3034 *
3035 * a = "hello "
3036 * a.concat("world", 33) #=> "hello world!"
3037 * a #=> "hello world!"
3038 *
3039 * b = "sn"
3040 * b.concat("_", b, "_", b) #=> "sn_sn_sn"
3041 *
3042 * See also String#<<, which takes a single argument.
3043 */
3044 static VALUE
rb_str_concat_multi(int argc,VALUE * argv,VALUE str)3045 rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3046 {
3047 str_modifiable(str);
3048
3049 if (argc == 1) {
3050 return rb_str_concat(str, argv[0]);
3051 }
3052 else if (argc > 1) {
3053 int i;
3054 VALUE arg_str = rb_str_tmp_new(0);
3055 rb_enc_copy(arg_str, str);
3056 for (i = 0; i < argc; i++) {
3057 rb_str_concat(arg_str, argv[i]);
3058 }
3059 rb_str_buf_append(str, arg_str);
3060 }
3061
3062 return str;
3063 }
3064
3065 /*
3066 * call-seq:
3067 * str << obj -> str
3068 * str << integer -> str
3069 *
3070 * Appends the given object to <i>str</i>. If the object is an
3071 * <code>Integer</code>, it is considered a codepoint and converted
3072 * to a character before being appended.
3073 *
3074 * a = "hello "
3075 * a << "world" #=> "hello world"
3076 * a << 33 #=> "hello world!"
3077 *
3078 * See also String#concat, which takes multiple arguments.
3079 */
3080 VALUE
rb_str_concat(VALUE str1,VALUE str2)3081 rb_str_concat(VALUE str1, VALUE str2)
3082 {
3083 unsigned int code;
3084 rb_encoding *enc = STR_ENC_GET(str1);
3085 int encidx;
3086
3087 if (RB_INTEGER_TYPE_P(str2)) {
3088 if (rb_num_to_uint(str2, &code) == 0) {
3089 }
3090 else if (FIXNUM_P(str2)) {
3091 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3092 }
3093 else {
3094 rb_raise(rb_eRangeError, "bignum out of char range");
3095 }
3096 }
3097 else {
3098 return rb_str_append(str1, str2);
3099 }
3100
3101 encidx = rb_enc_to_index(enc);
3102 if (encidx == ENCINDEX_ASCII || encidx == ENCINDEX_US_ASCII) {
3103 /* US-ASCII automatically extended to ASCII-8BIT */
3104 char buf[1];
3105 buf[0] = (char)code;
3106 if (code > 0xFF) {
3107 rb_raise(rb_eRangeError, "%u out of char range", code);
3108 }
3109 rb_str_cat(str1, buf, 1);
3110 if (encidx == ENCINDEX_US_ASCII && code > 127) {
3111 rb_enc_associate_index(str1, ENCINDEX_ASCII);
3112 ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID);
3113 }
3114 }
3115 else {
3116 long pos = RSTRING_LEN(str1);
3117 int cr = ENC_CODERANGE(str1);
3118 int len;
3119 char *buf;
3120
3121 switch (len = rb_enc_codelen(code, enc)) {
3122 case ONIGERR_INVALID_CODE_POINT_VALUE:
3123 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3124 break;
3125 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3126 case 0:
3127 rb_raise(rb_eRangeError, "%u out of char range", code);
3128 break;
3129 }
3130 buf = ALLOCA_N(char, len + 1);
3131 rb_enc_mbcput(code, buf, enc);
3132 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3133 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3134 }
3135 rb_str_resize(str1, pos+len);
3136 memcpy(RSTRING_PTR(str1) + pos, buf, len);
3137 if (cr == ENC_CODERANGE_7BIT && code > 127)
3138 cr = ENC_CODERANGE_VALID;
3139 ENC_CODERANGE_SET(str1, cr);
3140 }
3141 return str1;
3142 }
3143
3144 /*
3145 * call-seq:
3146 * str.prepend(other_str1, other_str2, ...) -> str
3147 *
3148 * Prepend---Prepend the given strings to <i>str</i>.
3149 *
3150 * a = "!"
3151 * a.prepend("hello ", "world") #=> "hello world!"
3152 * a #=> "hello world!"
3153 *
3154 * See also String#concat.
3155 */
3156
3157 static VALUE
rb_str_prepend_multi(int argc,VALUE * argv,VALUE str)3158 rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
3159 {
3160 str_modifiable(str);
3161
3162 if (argc == 1) {
3163 rb_str_update(str, 0L, 0L, argv[0]);
3164 }
3165 else if (argc > 1) {
3166 int i;
3167 VALUE arg_str = rb_str_tmp_new(0);
3168 rb_enc_copy(arg_str, str);
3169 for (i = 0; i < argc; i++) {
3170 rb_str_append(arg_str, argv[i]);
3171 }
3172 rb_str_update(str, 0L, 0L, arg_str);
3173 }
3174
3175 return str;
3176 }
3177
3178 st_index_t
rb_str_hash(VALUE str)3179 rb_str_hash(VALUE str)
3180 {
3181 int e = ENCODING_GET(str);
3182 if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
3183 e = 0;
3184 }
3185 return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
3186 }
3187
3188 int
rb_str_hash_cmp(VALUE str1,VALUE str2)3189 rb_str_hash_cmp(VALUE str1, VALUE str2)
3190 {
3191 long len1, len2;
3192 const char *ptr1, *ptr2;
3193 RSTRING_GETMEM(str1, ptr1, len1);
3194 RSTRING_GETMEM(str2, ptr2, len2);
3195 return (len1 != len2 ||
3196 !rb_str_comparable(str1, str2) ||
3197 memcmp(ptr1, ptr2, len1) != 0);
3198 }
3199
3200 /*
3201 * call-seq:
3202 * str.hash -> integer
3203 *
3204 * Returns a hash based on the string's length, content and encoding.
3205 *
3206 * See also Object#hash.
3207 */
3208
3209 static VALUE
rb_str_hash_m(VALUE str)3210 rb_str_hash_m(VALUE str)
3211 {
3212 st_index_t hval = rb_str_hash(str);
3213 return ST2FIX(hval);
3214 }
3215
/* the smaller of a and b (a when they are equal); arguments may be evaluated twice */
#define lesser(a,b) (((b)<(a))?(b):(a))
3217
3218 int
rb_str_comparable(VALUE str1,VALUE str2)3219 rb_str_comparable(VALUE str1, VALUE str2)
3220 {
3221 int idx1, idx2;
3222 int rc1, rc2;
3223
3224 if (RSTRING_LEN(str1) == 0) return TRUE;
3225 if (RSTRING_LEN(str2) == 0) return TRUE;
3226 idx1 = ENCODING_GET(str1);
3227 idx2 = ENCODING_GET(str2);
3228 if (idx1 == idx2) return TRUE;
3229 rc1 = rb_enc_str_coderange(str1);
3230 rc2 = rb_enc_str_coderange(str2);
3231 if (rc1 == ENC_CODERANGE_7BIT) {
3232 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
3233 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
3234 return TRUE;
3235 }
3236 if (rc2 == ENC_CODERANGE_7BIT) {
3237 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
3238 return TRUE;
3239 }
3240 return FALSE;
3241 }
3242
3243 int
rb_str_cmp(VALUE str1,VALUE str2)3244 rb_str_cmp(VALUE str1, VALUE str2)
3245 {
3246 long len1, len2;
3247 const char *ptr1, *ptr2;
3248 int retval;
3249
3250 if (str1 == str2) return 0;
3251 RSTRING_GETMEM(str1, ptr1, len1);
3252 RSTRING_GETMEM(str2, ptr2, len2);
3253 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
3254 if (len1 == len2) {
3255 if (!rb_str_comparable(str1, str2)) {
3256 if (ENCODING_GET(str1) > ENCODING_GET(str2))
3257 return 1;
3258 return -1;
3259 }
3260 return 0;
3261 }
3262 if (len1 > len2) return 1;
3263 return -1;
3264 }
3265 if (retval > 0) return 1;
3266 return -1;
3267 }
3268
3269 /* expect tail call optimization */
3270 static VALUE
str_eql(const VALUE str1,const VALUE str2)3271 str_eql(const VALUE str1, const VALUE str2)
3272 {
3273 const long len = RSTRING_LEN(str1);
3274 const char *ptr1, *ptr2;
3275
3276 if (len != RSTRING_LEN(str2)) return Qfalse;
3277 if (!rb_str_comparable(str1, str2)) return Qfalse;
3278 if ((ptr1 = RSTRING_PTR(str1)) == (ptr2 = RSTRING_PTR(str2)))
3279 return Qtrue;
3280 if (memcmp(ptr1, ptr2, len) == 0)
3281 return Qtrue;
3282 return Qfalse;
3283 }
3284
3285 /*
3286 * call-seq:
3287 * str == obj -> true or false
3288 * str === obj -> true or false
3289 *
3290 * Equality---Returns whether +str+ == +obj+, similar to Object#==.
3291 *
3292 * If +obj+ is not an instance of String but responds to +to_str+, then the
3293 * two strings are compared using <code>obj.==</code>.
3294 *
3295 * Otherwise, returns similarly to String#eql?, comparing length and content.
3296 */
3297
3298 VALUE
rb_str_equal(VALUE str1,VALUE str2)3299 rb_str_equal(VALUE str1, VALUE str2)
3300 {
3301 if (str1 == str2) return Qtrue;
3302 if (!RB_TYPE_P(str2, T_STRING)) {
3303 if (!rb_respond_to(str2, idTo_str)) {
3304 return Qfalse;
3305 }
3306 return rb_equal(str2, str1);
3307 }
3308 return str_eql(str1, str2);
3309 }
3310
3311 /*
3312 * call-seq:
3313 * str.eql?(other) -> true or false
3314 *
3315 * Two strings are equal if they have the same length and content.
3316 */
3317
3318 MJIT_FUNC_EXPORTED VALUE
rb_str_eql(VALUE str1,VALUE str2)3319 rb_str_eql(VALUE str1, VALUE str2)
3320 {
3321 if (str1 == str2) return Qtrue;
3322 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
3323 return str_eql(str1, str2);
3324 }
3325
3326 /*
3327 * call-seq:
3328 * string <=> other_string -> -1, 0, +1, or nil
3329 *
3330 * Comparison---Returns -1, 0, +1, or +nil+ depending on whether +string+ is
3331 * less than, equal to, or greater than +other_string+.
3332 *
3333 * +nil+ is returned if the two values are incomparable.
3334 *
3335 * If the strings are of different lengths, and the strings are equal when
3336 * compared up to the shortest length, then the longer string is considered
3337 * greater than the shorter one.
3338 *
3339 * <code><=></code> is the basis for the methods <code><</code>,
3340 * <code><=</code>, <code>></code>, <code>>=</code>, and
3341 * <code>between?</code>, included from module Comparable. The method
3342 * String#== does not use Comparable#==.
3343 *
3344 * "abcdef" <=> "abcde" #=> 1
3345 * "abcdef" <=> "abcdef" #=> 0
3346 * "abcdef" <=> "abcdefg" #=> -1
3347 * "abcdef" <=> "ABCDEF" #=> 1
3348 * "abcdef" <=> 1 #=> nil
3349 */
3350
3351 static VALUE
rb_str_cmp_m(VALUE str1,VALUE str2)3352 rb_str_cmp_m(VALUE str1, VALUE str2)
3353 {
3354 int result;
3355 VALUE s = rb_check_string_type(str2);
3356 if (NIL_P(s)) {
3357 return rb_invcmp(str1, str2);
3358 }
3359 result = rb_str_cmp(str1, s);
3360 return INT2FIX(result);
3361 }
3362
3363 static VALUE str_casecmp(VALUE str1, VALUE str2);
3364 static VALUE str_casecmp_p(VALUE str1, VALUE str2);
3365
3366 /*
3367 * call-seq:
3368 * str.casecmp(other_str) -> -1, 0, +1, or nil
3369 *
3370 * Case-insensitive version of <code>String#<=></code>.
3371 * Currently, case-insensitivity only works on characters A-Z/a-z,
3372 * not all of Unicode. This is different from String#casecmp?.
3373 *
3374 * "aBcDeF".casecmp("abcde") #=> 1
3375 * "aBcDeF".casecmp("abcdef") #=> 0
3376 * "aBcDeF".casecmp("abcdefg") #=> -1
3377 * "abcdef".casecmp("ABCDEF") #=> 0
3378 *
3379 * +nil+ is returned if the two strings have incompatible encodings,
3380 * or if +other_str+ is not a string.
3381 *
3382 * "foo".casecmp(2) #=> nil
3383 * "\u{e4 f6 fc}".encode("ISO-8859-1").casecmp("\u{c4 d6 dc}") #=> nil
3384 */
3385
3386 static VALUE
rb_str_casecmp(VALUE str1,VALUE str2)3387 rb_str_casecmp(VALUE str1, VALUE str2)
3388 {
3389 VALUE s = rb_check_string_type(str2);
3390 if (NIL_P(s)) {
3391 return Qnil;
3392 }
3393 return str_casecmp(str1, s);
3394 }
3395
3396 static VALUE
str_casecmp(VALUE str1,VALUE str2)3397 str_casecmp(VALUE str1, VALUE str2)
3398 {
3399 long len;
3400 rb_encoding *enc;
3401 char *p1, *p1end, *p2, *p2end;
3402
3403 enc = rb_enc_compatible(str1, str2);
3404 if (!enc) {
3405 return Qnil;
3406 }
3407
3408 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
3409 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
3410 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
3411 while (p1 < p1end && p2 < p2end) {
3412 if (*p1 != *p2) {
3413 unsigned int c1 = TOUPPER(*p1 & 0xff);
3414 unsigned int c2 = TOUPPER(*p2 & 0xff);
3415 if (c1 != c2)
3416 return INT2FIX(c1 < c2 ? -1 : 1);
3417 }
3418 p1++;
3419 p2++;
3420 }
3421 }
3422 else {
3423 while (p1 < p1end && p2 < p2end) {
3424 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
3425 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
3426
3427 if (0 <= c1 && 0 <= c2) {
3428 c1 = TOUPPER(c1);
3429 c2 = TOUPPER(c2);
3430 if (c1 != c2)
3431 return INT2FIX(c1 < c2 ? -1 : 1);
3432 }
3433 else {
3434 int r;
3435 l1 = rb_enc_mbclen(p1, p1end, enc);
3436 l2 = rb_enc_mbclen(p2, p2end, enc);
3437 len = l1 < l2 ? l1 : l2;
3438 r = memcmp(p1, p2, len);
3439 if (r != 0)
3440 return INT2FIX(r < 0 ? -1 : 1);
3441 if (l1 != l2)
3442 return INT2FIX(l1 < l2 ? -1 : 1);
3443 }
3444 p1 += l1;
3445 p2 += l2;
3446 }
3447 }
3448 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
3449 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
3450 return INT2FIX(-1);
3451 }
3452
3453 /*
3454 * call-seq:
3455 * str.casecmp?(other_str) -> true, false, or nil
3456 *
3457 * Returns +true+ if +str+ and +other_str+ are equal after
3458 * Unicode case folding, +false+ if they are not equal.
3459 *
3460 * "aBcDeF".casecmp?("abcde") #=> false
3461 * "aBcDeF".casecmp?("abcdef") #=> true
3462 * "aBcDeF".casecmp?("abcdefg") #=> false
3463 * "abcdef".casecmp?("ABCDEF") #=> true
3464 * "\u{e4 f6 fc}".casecmp?("\u{c4 d6 dc}") #=> true
3465 *
3466 * +nil+ is returned if the two strings have incompatible encodings,
3467 * or if +other_str+ is not a string.
3468 *
3469 * "foo".casecmp?(2) #=> nil
3470 * "\u{e4 f6 fc}".encode("ISO-8859-1").casecmp?("\u{c4 d6 dc}") #=> nil
3471 */
3472
3473 static VALUE
rb_str_casecmp_p(VALUE str1,VALUE str2)3474 rb_str_casecmp_p(VALUE str1, VALUE str2)
3475 {
3476 VALUE s = rb_check_string_type(str2);
3477 if (NIL_P(s)) {
3478 return Qnil;
3479 }
3480 return str_casecmp_p(str1, s);
3481 }
3482
3483 static VALUE
str_casecmp_p(VALUE str1,VALUE str2)3484 str_casecmp_p(VALUE str1, VALUE str2)
3485 {
3486 rb_encoding *enc;
3487 VALUE folded_str1, folded_str2;
3488 VALUE fold_opt = sym_fold;
3489
3490 enc = rb_enc_compatible(str1, str2);
3491 if (!enc) {
3492 return Qnil;
3493 }
3494
3495 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
3496 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
3497
3498 return rb_str_eql(folded_str1, folded_str2);
3499 }
3500
3501 static long
strseq_core(const char * str_ptr,const char * str_ptr_end,long str_len,const char * sub_ptr,long sub_len,long offset,rb_encoding * enc)3502 strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
3503 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
3504 {
3505 const char *search_start = str_ptr;
3506 long pos, search_len = str_len - offset;
3507
3508 for (;;) {
3509 const char *t;
3510 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3511 if (pos < 0) return pos;
3512 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
3513 if (t == search_start + pos) break;
3514 search_len -= t - search_start;
3515 if (search_len <= 0) return -1;
3516 offset += t - search_start;
3517 search_start = t;
3518 }
3519 return pos + offset;
3520 }
3521
/* Shorthand for rb_strseq_index() with in_byte == 0. */
#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
3523
3524 static long
rb_strseq_index(VALUE str,VALUE sub,long offset,int in_byte)3525 rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
3526 {
3527 const char *str_ptr, *str_ptr_end, *sub_ptr;
3528 long str_len, sub_len;
3529 int single_byte = single_byte_optimizable(str);
3530 rb_encoding *enc;
3531
3532 enc = rb_enc_check(str, sub);
3533 if (is_broken_string(sub)) return -1;
3534
3535 str_ptr = RSTRING_PTR(str);
3536 str_ptr_end = RSTRING_END(str);
3537 str_len = RSTRING_LEN(str);
3538 sub_ptr = RSTRING_PTR(sub);
3539 sub_len = RSTRING_LEN(sub);
3540
3541 if (str_len < sub_len) return -1;
3542
3543 if (offset != 0) {
3544 long str_len_char, sub_len_char;
3545 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
3546 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
3547 if (offset < 0) {
3548 offset += str_len_char;
3549 if (offset < 0) return -1;
3550 }
3551 if (str_len_char - offset < sub_len_char) return -1;
3552 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
3553 str_ptr += offset;
3554 }
3555 if (sub_len == 0) return offset;
3556
3557 /* need proceed one character at a time */
3558 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
3559 }
3560
3561
3562 /*
3563 * call-seq:
3564 * str.index(substring [, offset]) -> integer or nil
3565 * str.index(regexp [, offset]) -> integer or nil
3566 *
3567 * Returns the index of the first occurrence of the given <i>substring</i> or
3568 * pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
3569 * found. If the second parameter is present, it specifies the position in the
3570 * string to begin the search.
3571 *
3572 * "hello".index('e') #=> 1
3573 * "hello".index('lo') #=> 3
3574 * "hello".index('a') #=> nil
3575 * "hello".index(?e) #=> 1
3576 * "hello".index(/[aeiou]/, -3) #=> 4
3577 */
3578
3579 static VALUE
rb_str_index_m(int argc,VALUE * argv,VALUE str)3580 rb_str_index_m(int argc, VALUE *argv, VALUE str)
3581 {
3582 VALUE sub;
3583 VALUE initpos;
3584 long pos;
3585
3586 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
3587 pos = NUM2LONG(initpos);
3588 }
3589 else {
3590 pos = 0;
3591 }
3592 if (pos < 0) {
3593 pos += str_strlen(str, NULL);
3594 if (pos < 0) {
3595 if (RB_TYPE_P(sub, T_REGEXP)) {
3596 rb_backref_set(Qnil);
3597 }
3598 return Qnil;
3599 }
3600 }
3601
3602 if (SPECIAL_CONST_P(sub)) goto generic;
3603 switch (BUILTIN_TYPE(sub)) {
3604 case T_REGEXP:
3605 if (pos > str_strlen(str, NULL))
3606 return Qnil;
3607 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3608 rb_enc_check(str, sub), single_byte_optimizable(str));
3609
3610 pos = rb_reg_search(sub, str, pos, 0);
3611 pos = rb_str_sublen(str, pos);
3612 break;
3613
3614 generic:
3615 default: {
3616 VALUE tmp;
3617
3618 tmp = rb_check_string_type(sub);
3619 if (NIL_P(tmp)) {
3620 rb_raise(rb_eTypeError, "type mismatch: %s given",
3621 rb_obj_classname(sub));
3622 }
3623 sub = tmp;
3624 }
3625 /* fall through */
3626 case T_STRING:
3627 pos = rb_str_index(str, sub, pos);
3628 pos = rb_str_sublen(str, pos);
3629 break;
3630 }
3631
3632 if (pos == -1) return Qnil;
3633 return LONG2NUM(pos);
3634 }
3635
#ifdef HAVE_MEMRCHR
/*
 * Backward substring search used by rb_str_rindex() (memrchr variant).
 *
 *   str - string being searched
 *   sub - substring to look for
 *   s   - pointer into str where the backward search starts
 *   pos - returned unchanged when sub is empty; otherwise unused here
 *   enc - encoding used to find character heads
 *
 * Returns the match offset converted with rb_str_sublen(), or -1 when
 * there is no match.
 */
static long
str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
{
    char *hit, *adjusted;
    int c;
    long slen, searchlen;
    char *sbeg, *e, *t;

    slen = RSTRING_LEN(sub);
    if (slen == 0) return pos;
    sbeg = RSTRING_PTR(str);
    e = RSTRING_END(str);
    t = RSTRING_PTR(sub);
    c = *t & 0xff;              /* first byte of sub: the byte memrchr looks for */
    searchlen = s - sbeg + 1;   /* window is [sbeg, s], including the byte at s */

    do {
        hit = memrchr(sbeg, c, searchlen);
        if (!hit) break;
        adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
        if (hit != adjusted) {
            /* hit is inside a character: shrink the window to its head and retry */
            searchlen = adjusted - sbeg;
            continue;
        }
        if (memcmp(hit, t, slen) == 0)
            return rb_str_sublen(str, hit - sbeg);
        /* first byte matched but the rest did not: keep looking further left */
        searchlen = adjusted - sbeg;
    } while (searchlen > 0);

    return -1;
}
#else
/*
 * Backward substring search used by rb_str_rindex() (portable variant).
 *
 * Compares sub against the bytes at s; on mismatch steps s back one
 * character with rb_enc_prev_char() and decrements pos in lockstep.
 * Returns the pos value at which the bytes matched, or -1 once pos 0 has
 * been tried (or s becomes NULL) without a match.
 */
static long
str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
{
    long slen;
    char *sbeg, *e, *t;

    sbeg = RSTRING_PTR(str);
    e = RSTRING_END(str);
    t = RSTRING_PTR(sub);
    slen = RSTRING_LEN(sub);

    while (s) {
        if (memcmp(s, t, slen) == 0) {
            return pos;
        }
        if (pos == 0) break;
        pos--;
        s = rb_enc_prev_char(sbeg, s, e, enc);
    }

    return -1;
}
#endif
3692
3693 static long
rb_str_rindex(VALUE str,VALUE sub,long pos)3694 rb_str_rindex(VALUE str, VALUE sub, long pos)
3695 {
3696 long len, slen;
3697 char *sbeg, *s;
3698 rb_encoding *enc;
3699 int singlebyte;
3700
3701 enc = rb_enc_check(str, sub);
3702 if (is_broken_string(sub)) return -1;
3703 singlebyte = single_byte_optimizable(str);
3704 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
3705 slen = str_strlen(sub, enc); /* rb_enc_check */
3706
3707 /* substring longer than string */
3708 if (len < slen) return -1;
3709 if (len - pos < slen) pos = len - slen;
3710 if (len == 0) return pos;
3711
3712 sbeg = RSTRING_PTR(str);
3713
3714 if (pos == 0) {
3715 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
3716 return 0;
3717 else
3718 return -1;
3719 }
3720
3721 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
3722 return str_rindex(str, sub, s, pos, enc);
3723 }
3724
3725
3726 /*
3727 * call-seq:
3728 * str.rindex(substring [, integer]) -> integer or nil
3729 * str.rindex(regexp [, integer]) -> integer or nil
3730 *
3731 * Returns the index of the last occurrence of the given <i>substring</i> or
3732 * pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
3733 * found. If the second parameter is present, it specifies the position in the
3734 * string to end the search---characters beyond this point will not be
3735 * considered.
3736 *
3737 * "hello".rindex('e') #=> 1
3738 * "hello".rindex('l') #=> 3
3739 * "hello".rindex('a') #=> nil
3740 * "hello".rindex(?e) #=> 1
3741 * "hello".rindex(/[aeiou]/, -2) #=> 1
3742 */
3743
3744 static VALUE
rb_str_rindex_m(int argc,VALUE * argv,VALUE str)3745 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
3746 {
3747 VALUE sub;
3748 VALUE vpos;
3749 rb_encoding *enc = STR_ENC_GET(str);
3750 long pos, len = str_strlen(str, enc); /* str's enc */
3751
3752 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
3753 pos = NUM2LONG(vpos);
3754 if (pos < 0) {
3755 pos += len;
3756 if (pos < 0) {
3757 if (RB_TYPE_P(sub, T_REGEXP)) {
3758 rb_backref_set(Qnil);
3759 }
3760 return Qnil;
3761 }
3762 }
3763 if (pos > len) pos = len;
3764 }
3765 else {
3766 pos = len;
3767 }
3768
3769 if (SPECIAL_CONST_P(sub)) goto generic;
3770 switch (BUILTIN_TYPE(sub)) {
3771 case T_REGEXP:
3772 /* enc = rb_get_check(str, sub); */
3773 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3774 enc, single_byte_optimizable(str));
3775
3776 pos = rb_reg_search(sub, str, pos, 1);
3777 pos = rb_str_sublen(str, pos);
3778 if (pos >= 0) return LONG2NUM(pos);
3779 break;
3780
3781 generic:
3782 default: {
3783 VALUE tmp;
3784
3785 tmp = rb_check_string_type(sub);
3786 if (NIL_P(tmp)) {
3787 rb_raise(rb_eTypeError, "type mismatch: %s given",
3788 rb_obj_classname(sub));
3789 }
3790 sub = tmp;
3791 }
3792 /* fall through */
3793 case T_STRING:
3794 pos = rb_str_rindex(str, sub, pos);
3795 if (pos >= 0) return LONG2NUM(pos);
3796 break;
3797 }
3798 return Qnil;
3799 }
3800
3801 /*
3802 * call-seq:
3803 * str =~ obj -> integer or nil
3804 *
3805 * Match---If <i>obj</i> is a <code>Regexp</code>, use it as a pattern to match
 * against <i>str</i>, and returns the position the match starts, or
3807 * <code>nil</code> if there is no match. Otherwise, invokes
3808 * <i>obj.=~</i>, passing <i>str</i> as an argument. The default
3809 * <code>=~</code> in <code>Object</code> returns <code>nil</code>.
3810 *
3811 * Note: <code>str =~ regexp</code> is not the same as
3812 * <code>regexp =~ str</code>. Strings captured from named capture groups
3813 * are assigned to local variables only in the second case.
3814 *
3815 * "cat o' 9 tails" =~ /\d/ #=> 7
3816 * "cat o' 9 tails" =~ 9 #=> nil
3817 */
3818
3819 static VALUE
rb_str_match(VALUE x,VALUE y)3820 rb_str_match(VALUE x, VALUE y)
3821 {
3822 if (SPECIAL_CONST_P(y)) goto generic;
3823 switch (BUILTIN_TYPE(y)) {
3824 case T_STRING:
3825 rb_raise(rb_eTypeError, "type mismatch: String given");
3826
3827 case T_REGEXP:
3828 return rb_reg_match(y, x);
3829
3830 generic:
3831 default:
3832 return rb_funcall(y, idEqTilde, 1, x);
3833 }
3834 }
3835
3836
3837 static VALUE get_pat(VALUE);
3838
3839
3840 /*
3841 * call-seq:
3842 * str.match(pattern) -> matchdata or nil
3843 * str.match(pattern, pos) -> matchdata or nil
3844 *
3845 * Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one),
3846 * then invokes its <code>match</code> method on <i>str</i>. If the second
3847 * parameter is present, it specifies the position in the string to begin the
3848 * search.
3849 *
3850 * 'hello'.match('(.)\1') #=> #<MatchData "ll" 1:"l">
3851 * 'hello'.match('(.)\1')[0] #=> "ll"
3852 * 'hello'.match(/(.)\1/)[0] #=> "ll"
3853 * 'hello'.match(/(.)\1/, 3) #=> nil
3854 * 'hello'.match('xx') #=> nil
3855 *
 * If a block is given, invokes the block with the MatchData if the match succeeds, so
3857 * that you can write
3858 *
3859 * str.match(pat) {|m| ...}
3860 *
3861 * instead of
3862 *
3863 * if m = str.match(pat)
3864 * ...
3865 * end
3866 *
3867 * The return value is a value from block execution in this case.
3868 */
3869
3870 static VALUE
rb_str_match_m(int argc,VALUE * argv,VALUE str)3871 rb_str_match_m(int argc, VALUE *argv, VALUE str)
3872 {
3873 VALUE re, result;
3874 if (argc < 1)
3875 rb_check_arity(argc, 1, 2);
3876 re = argv[0];
3877 argv[0] = str;
3878 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
3879 if (!NIL_P(result) && rb_block_given_p()) {
3880 return rb_yield(result);
3881 }
3882 return result;
3883 }
3884
3885 /*
3886 * call-seq:
3887 * str.match?(pattern) -> true or false
3888 * str.match?(pattern, pos) -> true or false
3889 *
 * Converts _pattern_ to a +Regexp+ (if it isn't already one), then
 * returns +true+ or +false+ to indicate whether the regexp matches
 * _str_, without updating <code>$~</code> and other
3893 * related variables. If the second parameter is present, it
3894 * specifies the position in the string to begin the search.
3895 *
3896 * "Ruby".match?(/R.../) #=> true
3897 * "Ruby".match?(/R.../, 1) #=> false
3898 * "Ruby".match?(/P.../) #=> false
3899 * $& #=> nil
3900 */
3901
3902 static VALUE
rb_str_match_m_p(int argc,VALUE * argv,VALUE str)3903 rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
3904 {
3905 VALUE re;
3906 rb_check_arity(argc, 1, 2);
3907 re = get_pat(argv[0]);
3908 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
3909 }
3910
/* Result of stepping a character to its neighbor (enc_succ_char and friends). */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR,  /* input is not a usable character, or no neighbor could be produced */
    NEIGHBOR_FOUND,     /* a neighbor of the same byte length was written in place */
    NEIGHBOR_WRAPPED    /* stepping ran past the last/first character of that length */
};
3916
/*
 * Replaces the +len+ bytes at +p+, in place, with a following character
 * of the same byte length in +enc+.
 *
 * Returns NEIGHBOR_FOUND when such a character was written and
 * NEIGHBOR_WRAPPED when none exists at this length.  For encodings whose
 * minimum character length is greater than one, NEIGHBOR_NOT_CHAR is
 * returned when the bytes are not a valid character or the incremented
 * code point cannot be encoded.
 */
static enum neighbor_char
enc_succ_char(char *p, long len, rb_encoding *enc)
{
    long i;
    int l;

    if (rb_enc_mbminlen(enc) > 1) {
        /* wchar, trivial case */
        int r = rb_enc_precise_mbclen(p, p + len, enc), c;
        if (!MBCLEN_CHARFOUND_P(r)) {
            return NEIGHBOR_NOT_CHAR;
        }
        c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
        l = rb_enc_code_to_mbclen(c, enc);
        if (!l) return NEIGHBOR_NOT_CHAR;
        /* the next code point needs a different number of bytes */
        if (l != len) return NEIGHBOR_WRAPPED;
        rb_enc_mbcput(c, p, enc);
        /* make sure what was written is itself a valid character */
        r = rb_enc_precise_mbclen(p, p + len, enc);
        if (!MBCLEN_CHARFOUND_P(r)) {
            return NEIGHBOR_NOT_CHAR;
        }
        return NEIGHBOR_FOUND;
    }
    /*
     * Otherwise treat the bytes as a big-endian counter and keep
     * incrementing until they form a valid character of length len.
     */
    while (1) {
        /* carry: trailing 0xff bytes roll over to 0 */
        for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
            p[i] = '\0';
        if (i < 0)
            return NEIGHBOR_WRAPPED;
        ++((unsigned char*)p)[i];
        l = rb_enc_precise_mbclen(p, p+len, enc);
        if (MBCLEN_CHARFOUND_P(l)) {
            l = MBCLEN_CHARFOUND_LEN(l);
            if (l == len) {
                return NEIGHBOR_FOUND;
            }
            else {
                /*
                 * a shorter character was found: set the remaining bytes
                 * to 0xff so the next pass carries into that prefix
                 */
                memset(p+l, 0xff, len-l);
            }
        }
        if (MBCLEN_INVALID_P(l) && i < len-1) {
            long len2;
            int l2;
            /* find the longest prefix that is not reported as invalid */
            for (len2 = len-1; 0 < len2; len2--) {
                l2 = rb_enc_precise_mbclen(p, p+len2, enc);
                if (!MBCLEN_INVALID_P(l2))
                    break;
            }
            /* presumably skips the remaining combinations under the offending byte */
            memset(p+len2+1, 0xff, len-(len2+1));
        }
    }
}
3968
/*
 * Replaces the +len+ bytes at +p+, in place, with a preceding character
 * of the same byte length in +enc+.  Mirror image of enc_succ_char().
 *
 * Returns NEIGHBOR_FOUND when such a character was written and
 * NEIGHBOR_WRAPPED when none exists at this length.  For encodings whose
 * minimum character length is greater than one, NEIGHBOR_NOT_CHAR is
 * returned when the bytes are not a valid character, the code point is
 * already 0, or the decremented code point cannot be encoded.
 */
static enum neighbor_char
enc_pred_char(char *p, long len, rb_encoding *enc)
{
    long i;
    int l;
    if (rb_enc_mbminlen(enc) > 1) {
        /* wchar, trivial case */
        int r = rb_enc_precise_mbclen(p, p + len, enc), c;
        if (!MBCLEN_CHARFOUND_P(r)) {
            return NEIGHBOR_NOT_CHAR;
        }
        c = rb_enc_mbc_to_codepoint(p, p + len, enc);
        /* code point 0 has no predecessor */
        if (!c) return NEIGHBOR_NOT_CHAR;
        --c;
        l = rb_enc_code_to_mbclen(c, enc);
        if (!l) return NEIGHBOR_NOT_CHAR;
        /* the previous code point needs a different number of bytes */
        if (l != len) return NEIGHBOR_WRAPPED;
        rb_enc_mbcput(c, p, enc);
        /* make sure what was written is itself a valid character */
        r = rb_enc_precise_mbclen(p, p + len, enc);
        if (!MBCLEN_CHARFOUND_P(r)) {
            return NEIGHBOR_NOT_CHAR;
        }
        return NEIGHBOR_FOUND;
    }
    /*
     * Otherwise treat the bytes as a big-endian counter and keep
     * decrementing until they form a valid character of length len.
     */
    while (1) {
        /* borrow: trailing 0 bytes roll over to 0xff */
        for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
            p[i] = '\xff';
        if (i < 0)
            return NEIGHBOR_WRAPPED;
        --((unsigned char*)p)[i];
        l = rb_enc_precise_mbclen(p, p+len, enc);
        if (MBCLEN_CHARFOUND_P(l)) {
            l = MBCLEN_CHARFOUND_LEN(l);
            if (l == len) {
                return NEIGHBOR_FOUND;
            }
            else {
                /*
                 * a shorter character was found: clear the remaining
                 * bytes so the next pass borrows from that prefix
                 */
                memset(p+l, 0, len-l);
            }
        }
        if (MBCLEN_INVALID_P(l) && i < len-1) {
            long len2;
            int l2;
            /* find the longest prefix that is not reported as invalid */
            for (len2 = len-1; 0 < len2; len2--) {
                l2 = rb_enc_precise_mbclen(p, p+len2, enc);
                if (!MBCLEN_INVALID_P(l2))
                    break;
            }
            /* presumably skips the remaining combinations under the offending byte */
            memset(p+len2+1, 0, len-(len2+1));
        }
    }
}
4021
4022 /*
4023 overwrite +p+ by succeeding letter in +enc+ and returns
4024 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
4025 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
 assuming each range is contiguous, and that mbclen
 never changes within a range.
4028 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
4029 character.
4030 */
static enum neighbor_char
enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
{
    enum neighbor_char ret;
    unsigned int c;
    int ctype;
    int range;
    char save[ONIGENC_CODE_TO_MBC_MAXLEN];

    /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
    int try;
    const int max_gaps = 1;

    /* Only digits and alphabetic characters are handled here. */
    c = rb_enc_mbc_to_codepoint(p, p+len, enc);
    if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
        ctype = ONIGENC_CTYPE_DIGIT;
    else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
        ctype = ONIGENC_CTYPE_ALPHA;
    else
        return NEIGHBOR_NOT_CHAR;

    /*
     * Step forward up to max_gaps + 1 times, looking for a successor of
     * the same class.  p is modified in place, so the original bytes are
     * kept in save.
     */
    MEMCPY(save, p, char, len);
    for (try = 0; try <= max_gaps; ++try) {
        ret = enc_succ_char(p, len, enc);
        if (ret == NEIGHBOR_FOUND) {
            c = rb_enc_mbc_to_codepoint(p, p+len, enc);
            if (rb_enc_isctype(c, ctype, enc))
                return NEIGHBOR_FOUND;
        }
    }
    /*
     * No successor in the same class: restore p and walk backwards to
     * the first character of the run of same-class characters.
     */
    MEMCPY(p, save, char, len);
    range = 1;
    while (1) {
        MEMCPY(save, p, char, len);
        ret = enc_pred_char(p, len, enc);
        if (ret == NEIGHBOR_FOUND) {
            c = rb_enc_mbc_to_codepoint(p, p+len, enc);
            if (!rb_enc_isctype(c, ctype, enc)) {
                /* stepped out of the class: undo the last step */
                MEMCPY(p, save, char, len);
                break;
            }
        }
        else {
            MEMCPY(p, save, char, len);
            break;
        }
        range++;
    }
    /* the run consists of this single character only */
    if (range == 1) {
        return NEIGHBOR_NOT_CHAR;
    }

    /* p now holds the first character of the run (the wrapped value). */
    if (ctype != ONIGENC_CTYPE_DIGIT) {
        /* letters: the carried-out character is the first of the run */
        MEMCPY(carry, p, char, len);
        return NEIGHBOR_WRAPPED;
    }

    /* digits: the carried-out character is the successor of the first digit */
    MEMCPY(carry, p, char, len);
    enc_succ_char(carry, len, enc);
    return NEIGHBOR_WRAPPED;
}
4092
4093
4094 static VALUE str_succ(VALUE str);
4095
4096 /*
4097 * call-seq:
4098 * str.succ -> new_str
4099 * str.next -> new_str
4100 *
4101 * Returns the successor to <i>str</i>. The successor is calculated by
4102 * incrementing characters starting from the rightmost alphanumeric (or
4103 * the rightmost character if there are no alphanumerics) in the
4104 * string. Incrementing a digit always results in another digit, and
4105 * incrementing a letter results in another letter of the same case.
4106 * Incrementing nonalphanumerics uses the underlying character set's
4107 * collating sequence.
4108 *
4109 * If the increment generates a ``carry,'' the character to the left of
4110 * it is incremented. This process repeats until there is no carry,
4111 * adding an additional character if necessary.
4112 *
4113 * "abcd".succ #=> "abce"
4114 * "THX1138".succ #=> "THX1139"
4115 * "<<koala>>".succ #=> "<<koalb>>"
4116 * "1999zzz".succ #=> "2000aaa"
4117 * "ZZZ9999".succ #=> "AAAA0000"
4118 * "***".succ #=> "**+"
4119 */
4120
4121 VALUE
rb_str_succ(VALUE orig)4122 rb_str_succ(VALUE orig)
4123 {
4124 VALUE str;
4125 str = rb_str_new_with_class(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
4126 rb_enc_cr_str_copy_for_substr(str, orig);
4127 OBJ_INFECT(str, orig);
4128 return str_succ(str);
4129 }
4130
/*
 * In-place worker for String#succ / String#succ!.  Modifies the buffer
 * of +str+ directly and returns +str+.
 */
static VALUE
str_succ(VALUE str)
{
    rb_encoding *enc;
    char *sbeg, *s, *e, *last_alnum = 0;
    int c = -1;     /* stays -1 until an alphanumeric character has wrapped */
    long l, slen;
    char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
    long carry_pos = 0, carry_len = 1;
    enum neighbor_char neighbor = NEIGHBOR_FOUND;

    slen = RSTRING_LEN(str);
    if (slen == 0) return str;

    enc = STR_ENC_GET(str);
    sbeg = RSTRING_PTR(str);
    s = e = sbeg + slen;

    /* First pass: walk backwards, incrementing alphanumeric characters. */
    while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
        if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
            /*
             * The previous step hit a non-alphanumeric after an
             * alphanumeric had wrapped.  Stop carrying when the class
             * changes across it (letter vs. digit) and go back to the
             * last wrapped character.
             */
            if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
                ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
                s = last_alnum;
                break;
            }
        }
        l = rb_enc_precise_mbclen(s, e, enc);
        if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
        l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
        neighbor = enc_succ_alnum_char(s, l, enc, carry);
        switch (neighbor) {
          case NEIGHBOR_NOT_CHAR:
            continue;
          case NEIGHBOR_FOUND:
            /* incremented without carry: done */
            return str;
          case NEIGHBOR_WRAPPED:
            last_alnum = s;
            break;
        }
        /* remember where a carried-out character would have to be inserted */
        c = 1;
        carry_pos = s - sbeg;
        carry_len = l;
    }
    if (c == -1) {		/* str contains no alnum */
        /* Second pass: increment arbitrary characters from the right. */
        s = e;
        while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
            enum neighbor_char neighbor;
            char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
            l = rb_enc_precise_mbclen(s, e, enc);
            if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
            l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
            /* increment a scratch copy and copy it back only when usable */
            MEMCPY(tmp, s, char, l);
            neighbor = enc_succ_char(tmp, l, enc);
            switch (neighbor) {
              case NEIGHBOR_FOUND:
                MEMCPY(s, tmp, char, l);
                return str;
                break;
              case NEIGHBOR_WRAPPED:
                MEMCPY(s, tmp, char, l);
                break;
              case NEIGHBOR_NOT_CHAR:
                break;
            }
            if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
                /* wrapped to \0...\0.  search next valid char. */
                enc_succ_char(s, l, enc);
            }
            if (!rb_enc_asciicompat(enc)) {
                /* the default carry "\1" is only used for ASCII-compatible encodings */
                MEMCPY(carry, s, char, l);
                carry_len = l;
            }
            carry_pos = s - sbeg;
        }
        ENC_CODERANGE_SET(str, ENC_CODERANGE_UNKNOWN);
    }
    /* Everything wrapped: grow the string and insert carry at carry_pos. */
    RESIZE_CAPA(str, slen + carry_len);
    sbeg = RSTRING_PTR(str);    /* re-read after RESIZE_CAPA */
    s = sbeg + carry_pos;
    memmove(s + carry_len, s, slen - carry_pos);
    memmove(s, carry, carry_len);
    slen += carry_len;
    STR_SET_LEN(str, slen);
    TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
    rb_enc_str_coderange(str);
    return str;
}
4218
4219
4220 /*
4221 * call-seq:
4222 * str.succ! -> str
4223 * str.next! -> str
4224 *
4225 * Equivalent to <code>String#succ</code>, but modifies the receiver in
4226 * place.
4227 */
4228
4229 static VALUE
rb_str_succ_bang(VALUE str)4230 rb_str_succ_bang(VALUE str)
4231 {
4232 rb_str_modify(str);
4233 str_succ(str);
4234 return str;
4235 }
4236
/*
 * Returns 1 when each of the +len+ bytes starting at +s+ satisfies
 * ISDIGIT, 0 otherwise.  Returns 1 when +len+ is zero or negative.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) return 0;
    }
    return 1;
}
4246
4247 static int
str_upto_i(VALUE str,VALUE arg)4248 str_upto_i(VALUE str, VALUE arg)
4249 {
4250 rb_yield(str);
4251 return 0;
4252 }
4253
4254 /*
4255 * call-seq:
4256 * str.upto(other_str, exclusive=false) {|s| block } -> str
4257 * str.upto(other_str, exclusive=false) -> an_enumerator
4258 *
4259 * Iterates through successive values, starting at <i>str</i> and
4260 * ending at <i>other_str</i> inclusive, passing each value in turn to
4261 * the block. The <code>String#succ</code> method is used to generate
4262 * each value. If optional second argument exclusive is omitted or is false,
4263 * the last value will be included; otherwise it will be excluded.
4264 *
4265 * If no block is given, an enumerator is returned instead.
4266 *
4267 * "a8".upto("b6") {|s| print s, ' ' }
4268 * for s in "a8".."b6"
4269 * print s, ' '
4270 * end
4271 *
4272 * <em>produces:</em>
4273 *
4274 * a8 a9 b0 b1 b2 b3 b4 b5 b6
4275 * a8 a9 b0 b1 b2 b3 b4 b5 b6
4276 *
 * If <i>str</i> and <i>other_str</i> contain only ascii numeric characters,
4278 * both are recognized as decimal numbers. In addition, the width of
4279 * string (e.g. leading zeros) is handled appropriately.
4280 *
4281 * "9".upto("11").to_a #=> ["9", "10", "11"]
4282 * "25".upto("5").to_a #=> []
4283 * "07".upto("11").to_a #=> ["07", "08", "09", "10", "11"]
4284 */
4285
4286 static VALUE
rb_str_upto(int argc,VALUE * argv,VALUE beg)4287 rb_str_upto(int argc, VALUE *argv, VALUE beg)
4288 {
4289 VALUE end, exclusive;
4290
4291 rb_scan_args(argc, argv, "11", &end, &exclusive);
4292 RETURN_ENUMERATOR(beg, argc, argv);
4293 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
4294 }
4295
/*
 * Calls +each+(string, +arg+) for every value from +beg+ up to +end+.
 * +end+ itself is skipped when +excl+ is non-zero.  Iteration stops early
 * as soon as +each+ returns non-zero.  Always returns +beg+.
 *
 * +end+ is converted with StringValue, and rb_enc_check raises when the
 * encodings of +beg+ and +end+ are incompatible.
 */
VALUE
rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
{
    VALUE current, after_end;
    ID succ;
    int n, ascii;
    rb_encoding *enc;

    CONST_ID(succ, "succ");
    StringValue(end);
    enc = rb_enc_check(beg, end);
    ascii = (is_ascii_string(beg) && is_ascii_string(end));
    /* single character */
    if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
        char c = RSTRING_PTR(beg)[0];
        char e = RSTRING_PTR(end)[0];

        if (c > e || (excl && c == e)) return beg;
        /* iterate by incrementing the byte; no String#succ call needed */
        for (;;) {
            if ((*each)(rb_enc_str_new(&c, 1, enc), arg)) break;
            if (!excl && c == e) break;
            c++;
            if (excl && c == e) break;
        }
        return beg;
    }
    /* both edges are all digits */
    if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
        all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
        all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
        VALUE b, e;
        int width;

        /* the length of beg is used as the precision of each formatted number */
        width = RSTRING_LENINT(beg);
        b = rb_str_to_inum(beg, 10, FALSE);
        e = rb_str_to_inum(end, 10, FALSE);
        if (FIXNUM_P(b) && FIXNUM_P(e)) {
            /* both bounds are Fixnums: plain C loop */
            long bi = FIX2LONG(b);
            long ei = FIX2LONG(e);
            rb_encoding *usascii = rb_usascii_encoding();

            while (bi <= ei) {
                if (excl && bi == ei) break;
                if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
                bi++;
            }
        }
        else {
            /* at least one bound is not a Fixnum: compare and step via method calls */
            ID op = excl ? '<' : idLE;
            VALUE args[2], fmt = rb_fstring_lit("%.*d");

            args[0] = INT2FIX(width);
            while (rb_funcall(b, op, 1, e)) {
                args[1] = b;
                if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
                b = rb_funcallv(b, succ, 0, 0);
            }
        }
        return beg;
    }
    /* normal case */
    n = rb_str_cmp(beg, end);
    if (n > 0 || (excl && n == 0)) return beg;

    /* after_end is the value that follows end; reaching it ends the loop */
    after_end = rb_funcallv(end, succ, 0, 0);
    current = rb_str_dup(beg);
    while (!rb_str_equal(current, after_end)) {
        VALUE next = Qnil;
        /* compute the successor before calling each; not needed when current is the inclusive end */
        if (excl || !rb_str_equal(current, end))
            next = rb_funcallv(current, succ, 0, 0);
        if ((*each)(current, arg)) break;
        if (NIL_P(next)) break;
        current = next;
        StringValue(current);
        if (excl && rb_str_equal(current, end)) break;
        /* stop once the value outgrows end or becomes empty */
        if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
            break;
    }

    return beg;
}
4377
4378 VALUE
rb_str_upto_endless_each(VALUE beg,int (* each)(VALUE,VALUE),VALUE arg)4379 rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
4380 {
4381 VALUE current;
4382 ID succ;
4383
4384 CONST_ID(succ, "succ");
4385 /* both edges are all digits */
4386 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
4387 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
4388 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
4389 int width = RSTRING_LENINT(beg);
4390 b = rb_str_to_inum(beg, 10, FALSE);
4391 if (FIXNUM_P(b)) {
4392 long bi = FIX2LONG(b);
4393 rb_encoding *usascii = rb_usascii_encoding();
4394
4395 while (FIXABLE(bi)) {
4396 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
4397 bi++;
4398 }
4399 b = LONG2NUM(bi);
4400 }
4401 args[0] = INT2FIX(width);
4402 while (1) {
4403 args[1] = b;
4404 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
4405 b = rb_funcallv(b, succ, 0, 0);
4406 }
4407 }
4408 /* normal case */
4409 current = rb_str_dup(beg);
4410 while (1) {
4411 VALUE next = rb_funcallv(current, succ, 0, 0);
4412 if ((*each)(current, arg)) break;
4413 current = next;
4414 StringValue(current);
4415 if (RSTRING_LEN(current) == 0)
4416 break;
4417 }
4418
4419 return beg;
4420 }
4421
4422 static int
include_range_i(VALUE str,VALUE arg)4423 include_range_i(VALUE str, VALUE arg)
4424 {
4425 VALUE *argp = (VALUE *)arg;
4426 if (!rb_equal(str, *argp)) return 0;
4427 *argp = Qnil;
4428 return 1;
4429 }
4430
4431 VALUE
rb_str_include_range_p(VALUE beg,VALUE end,VALUE val,VALUE exclusive)4432 rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
4433 {
4434 beg = rb_str_new_frozen(beg);
4435 StringValue(end);
4436 end = rb_str_new_frozen(end);
4437 if (NIL_P(val)) return Qfalse;
4438 val = rb_check_string_type(val);
4439 if (NIL_P(val)) return Qfalse;
4440 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
4441 rb_enc_asciicompat(STR_ENC_GET(end)) &&
4442 rb_enc_asciicompat(STR_ENC_GET(val))) {
4443 const char *bp = RSTRING_PTR(beg);
4444 const char *ep = RSTRING_PTR(end);
4445 const char *vp = RSTRING_PTR(val);
4446 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
4447 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
4448 return Qfalse;
4449 else {
4450 char b = *bp;
4451 char e = *ep;
4452 char v = *vp;
4453
4454 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
4455 if (b <= v && v < e) return Qtrue;
4456 if (!RTEST(exclusive) && v == e) return Qtrue;
4457 return Qfalse;
4458 }
4459 }
4460 }
4461 #if 0
4462 /* both edges are all digits */
4463 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
4464 all_digits_p(bp, RSTRING_LEN(beg)) &&
4465 all_digits_p(ep, RSTRING_LEN(end))) {
4466 /* TODO */
4467 }
4468 #endif
4469 }
4470 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
4471
4472 return NIL_P(val) ? Qtrue : Qfalse;
4473 }
4474
4475 static VALUE
rb_str_subpat(VALUE str,VALUE re,VALUE backref)4476 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
4477 {
4478 if (rb_reg_search(re, str, 0, 0) >= 0) {
4479 VALUE match = rb_backref_get();
4480 int nth = rb_reg_backref_number(match, backref);
4481 return rb_reg_nth_match(nth, match);
4482 }
4483 return Qnil;
4484 }
4485
4486 static VALUE
rb_str_aref(VALUE str,VALUE indx)4487 rb_str_aref(VALUE str, VALUE indx)
4488 {
4489 long idx;
4490
4491 if (FIXNUM_P(indx)) {
4492 idx = FIX2LONG(indx);
4493 }
4494 else if (RB_TYPE_P(indx, T_REGEXP)) {
4495 return rb_str_subpat(str, indx, INT2FIX(0));
4496 }
4497 else if (RB_TYPE_P(indx, T_STRING)) {
4498 if (rb_str_index(str, indx, 0) != -1)
4499 return rb_str_dup(indx);
4500 return Qnil;
4501 }
4502 else {
4503 /* check if indx is Range */
4504 long beg, len = str_strlen(str, NULL);
4505 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
4506 case Qfalse:
4507 break;
4508 case Qnil:
4509 return Qnil;
4510 default:
4511 return rb_str_substr(str, beg, len);
4512 }
4513 idx = NUM2LONG(indx);
4514 }
4515
4516 return str_substr(str, idx, 1, FALSE);
4517 }
4518
4519
4520 /*
4521 * call-seq:
4522 * str[index] -> new_str or nil
4523 * str[start, length] -> new_str or nil
4524 * str[range] -> new_str or nil
4525 * str[regexp] -> new_str or nil
4526 * str[regexp, capture] -> new_str or nil
4527 * str[match_str] -> new_str or nil
4528 * str.slice(index) -> new_str or nil
4529 * str.slice(start, length) -> new_str or nil
4530 * str.slice(range) -> new_str or nil
4531 * str.slice(regexp) -> new_str or nil
4532 * str.slice(regexp, capture) -> new_str or nil
4533 * str.slice(match_str) -> new_str or nil
4534 *
4535 * Element Reference --- If passed a single +index+, returns a substring of
4536 * one character at that index. If passed a +start+ index and a +length+,
4537 * returns a substring containing +length+ characters starting at the
4538 * +start+ index. If passed a +range+, its beginning and end are interpreted as
4539 * offsets delimiting the substring to be returned.
4540 *
4541 * In these three cases, if an index is negative, it is counted from the end
4542 * of the string. For the +start+ and +range+ cases the starting index
4543 * is just before a character and an index matching the string's size.
4544 * Additionally, an empty string is returned when the starting index for a
4545 * character range is at the end of the string.
4546 *
4547 * Returns +nil+ if the initial index falls outside the string or the length
4548 * is negative.
4549 *
 *  If a +Regexp+ is supplied, the matching portion of the string is
 *  returned.  If a +capture+ follows the regular expression, which may be a
 *  capture group index or name, that component of the MatchData is returned
 *  instead.
4554 *
4555 * If a +match_str+ is given, that string is returned if it occurs in
4556 * the string.
4557 *
4558 * Returns +nil+ if the regular expression does not match or the match string
4559 * cannot be found.
4560 *
4561 * a = "hello there"
4562 *
4563 * a[1] #=> "e"
4564 * a[2, 3] #=> "llo"
4565 * a[2..3] #=> "ll"
4566 *
4567 * a[-3, 2] #=> "er"
4568 * a[7..-2] #=> "her"
4569 * a[-4..-2] #=> "her"
4570 * a[-2..-4] #=> ""
4571 *
4572 * a[11, 0] #=> ""
4573 * a[11] #=> nil
4574 * a[12, 0] #=> nil
4575 * a[12..-1] #=> nil
4576 *
4577 * a[/[aeiou](.)\1/] #=> "ell"
4578 * a[/[aeiou](.)\1/, 0] #=> "ell"
4579 * a[/[aeiou](.)\1/, 1] #=> "l"
4580 * a[/[aeiou](.)\1/, 2] #=> nil
4581 *
4582 * a[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "non_vowel"] #=> "l"
4583 * a[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "vowel"] #=> "e"
4584 *
4585 * a["lo"] #=> "lo"
4586 * a["bye"] #=> nil
4587 */
4588
4589 static VALUE
rb_str_aref_m(int argc,VALUE * argv,VALUE str)4590 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
4591 {
4592 if (argc == 2) {
4593 if (RB_TYPE_P(argv[0], T_REGEXP)) {
4594 return rb_str_subpat(str, argv[0], argv[1]);
4595 }
4596 else {
4597 long beg = NUM2LONG(argv[0]);
4598 long len = NUM2LONG(argv[1]);
4599 return rb_str_substr(str, beg, len);
4600 }
4601 }
4602 rb_check_arity(argc, 1, 2);
4603 return rb_str_aref(str, argv[0]);
4604 }
4605
4606 VALUE
rb_str_drop_bytes(VALUE str,long len)4607 rb_str_drop_bytes(VALUE str, long len)
4608 {
4609 char *ptr = RSTRING_PTR(str);
4610 long olen = RSTRING_LEN(str), nlen;
4611
4612 str_modifiable(str);
4613 if (len > olen) len = olen;
4614 nlen = olen - len;
4615 if (STR_EMBEDDABLE_P(nlen, TERM_LEN(str))) {
4616 char *oldptr = ptr;
4617 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
4618 STR_SET_EMBED(str);
4619 STR_SET_EMBED_LEN(str, nlen);
4620 ptr = RSTRING(str)->as.ary;
4621 memmove(ptr, oldptr + len, nlen);
4622 if (fl == STR_NOEMBED) xfree(oldptr);
4623 }
4624 else {
4625 if (!STR_SHARED_P(str)) rb_str_new_frozen(str);
4626 ptr = RSTRING(str)->as.heap.ptr += len;
4627 RSTRING(str)->as.heap.len = nlen;
4628 }
4629 ptr[nlen] = 0;
4630 ENC_CODERANGE_CLEAR(str);
4631 return str;
4632 }
4633
4634 static void
rb_str_splice_0(VALUE str,long beg,long len,VALUE val)4635 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
4636 {
4637 char *sptr;
4638 long slen, vlen = RSTRING_LEN(val);
4639 int cr;
4640
4641 if (beg == 0 && vlen == 0) {
4642 rb_str_drop_bytes(str, len);
4643 OBJ_INFECT(str, val);
4644 return;
4645 }
4646
4647 str_modify_keep_cr(str);
4648 RSTRING_GETMEM(str, sptr, slen);
4649 if (len < vlen) {
4650 /* expand string */
4651 RESIZE_CAPA(str, slen + vlen - len);
4652 sptr = RSTRING_PTR(str);
4653 }
4654
4655 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
4656 cr = rb_enc_str_coderange(val);
4657 else
4658 cr = ENC_CODERANGE_UNKNOWN;
4659
4660 if (vlen != len) {
4661 memmove(sptr + beg + vlen,
4662 sptr + beg + len,
4663 slen - (beg + len));
4664 }
4665 if (vlen < beg && len < 0) {
4666 MEMZERO(sptr + slen, char, -len);
4667 }
4668 if (vlen > 0) {
4669 memmove(sptr + beg, RSTRING_PTR(val), vlen);
4670 }
4671 slen += vlen - len;
4672 STR_SET_LEN(str, slen);
4673 TERM_FILL(&sptr[slen], TERM_LEN(str));
4674 OBJ_INFECT(str, val);
4675 ENC_CODERANGE_SET(str, cr);
4676 }
4677
4678 void
rb_str_update(VALUE str,long beg,long len,VALUE val)4679 rb_str_update(VALUE str, long beg, long len, VALUE val)
4680 {
4681 long slen;
4682 char *p, *e;
4683 rb_encoding *enc;
4684 int singlebyte = single_byte_optimizable(str);
4685 int cr;
4686
4687 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
4688
4689 StringValue(val);
4690 enc = rb_enc_check(str, val);
4691 slen = str_strlen(str, enc); /* rb_enc_check */
4692
4693 if (slen < beg) {
4694 out_of_range:
4695 rb_raise(rb_eIndexError, "index %ld out of string", beg);
4696 }
4697 if (beg < 0) {
4698 if (beg + slen < 0) {
4699 goto out_of_range;
4700 }
4701 beg += slen;
4702 }
4703 assert(beg >= 0);
4704 assert(beg <= slen);
4705 if (len > slen - beg) {
4706 len = slen - beg;
4707 }
4708 str_modify_keep_cr(str);
4709 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
4710 if (!p) p = RSTRING_END(str);
4711 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
4712 if (!e) e = RSTRING_END(str);
4713 /* error check */
4714 beg = p - RSTRING_PTR(str); /* physical position */
4715 len = e - p; /* physical length */
4716 rb_str_splice_0(str, beg, len, val);
4717 rb_enc_associate(str, enc);
4718 cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
4719 if (cr != ENC_CODERANGE_BROKEN)
4720 ENC_CODERANGE_SET(str, cr);
4721 }
4722
4723 #define rb_str_splice(str, beg, len, val) rb_str_update(str, beg, len, val)
4724
4725 static void
rb_str_subpat_set(VALUE str,VALUE re,VALUE backref,VALUE val)4726 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
4727 {
4728 int nth;
4729 VALUE match;
4730 long start, end, len;
4731 rb_encoding *enc;
4732 struct re_registers *regs;
4733
4734 if (rb_reg_search(re, str, 0, 0) < 0) {
4735 rb_raise(rb_eIndexError, "regexp not matched");
4736 }
4737 match = rb_backref_get();
4738 nth = rb_reg_backref_number(match, backref);
4739 regs = RMATCH_REGS(match);
4740 if (nth >= regs->num_regs) {
4741 out_of_range:
4742 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
4743 }
4744 if (nth < 0) {
4745 if (-nth >= regs->num_regs) {
4746 goto out_of_range;
4747 }
4748 nth += regs->num_regs;
4749 }
4750
4751 start = BEG(nth);
4752 if (start == -1) {
4753 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
4754 }
4755 end = END(nth);
4756 len = end - start;
4757 StringValue(val);
4758 enc = rb_enc_check_str(str, val);
4759 rb_str_splice_0(str, start, len, val);
4760 rb_enc_associate(str, enc);
4761 }
4762
4763 static VALUE
rb_str_aset(VALUE str,VALUE indx,VALUE val)4764 rb_str_aset(VALUE str, VALUE indx, VALUE val)
4765 {
4766 long idx, beg;
4767
4768 if (FIXNUM_P(indx)) {
4769 idx = FIX2LONG(indx);
4770 num_index:
4771 rb_str_splice(str, idx, 1, val);
4772 return val;
4773 }
4774
4775 if (SPECIAL_CONST_P(indx)) goto generic;
4776 switch (BUILTIN_TYPE(indx)) {
4777 case T_REGEXP:
4778 rb_str_subpat_set(str, indx, INT2FIX(0), val);
4779 return val;
4780
4781 case T_STRING:
4782 beg = rb_str_index(str, indx, 0);
4783 if (beg < 0) {
4784 rb_raise(rb_eIndexError, "string not matched");
4785 }
4786 beg = rb_str_sublen(str, beg);
4787 rb_str_splice(str, beg, str_strlen(indx, NULL), val);
4788 return val;
4789
4790 generic:
4791 default:
4792 /* check if indx is Range */
4793 {
4794 long beg, len;
4795 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
4796 rb_str_splice(str, beg, len, val);
4797 return val;
4798 }
4799 }
4800 idx = NUM2LONG(indx);
4801 goto num_index;
4802 }
4803 }
4804
4805 /*
4806 * call-seq:
4807 * str[integer] = new_str
4808 * str[integer, integer] = new_str
4809 * str[range] = aString
4810 * str[regexp] = new_str
4811 * str[regexp, integer] = new_str
4812 * str[regexp, name] = new_str
4813 * str[other_str] = new_str
4814 *
 *  Element Assignment---Replaces some or all of the content of <i>str</i>. The
 *  portion of the string affected is determined using the same criteria as
 *  <code>String#[]</code>. If the replacement string is not the same length as
 *  the text it is replacing, the string will be adjusted accordingly. If the
 *  regular expression or string used as the index doesn't match a position
 *  in the string, <code>IndexError</code> is raised. If the regular expression
 *  form is used, the optional second <code>Integer</code> allows you to specify
 *  which portion of the match to replace (effectively using the
 *  <code>MatchData</code> indexing rules). The forms that take an
 *  <code>Integer</code> will raise an <code>IndexError</code> if the value is
 *  out of range; the <code>Range</code> form will raise a
 *  <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code>
 *  forms will raise an <code>IndexError</code> on negative match.
4828 */
4829
4830 static VALUE
rb_str_aset_m(int argc,VALUE * argv,VALUE str)4831 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
4832 {
4833 if (argc == 3) {
4834 if (RB_TYPE_P(argv[0], T_REGEXP)) {
4835 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
4836 }
4837 else {
4838 rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
4839 }
4840 return argv[2];
4841 }
4842 rb_check_arity(argc, 2, 3);
4843 return rb_str_aset(str, argv[0], argv[1]);
4844 }
4845
4846 /*
4847 * call-seq:
4848 * str.insert(index, other_str) -> str
4849 *
4850 * Inserts <i>other_str</i> before the character at the given
4851 * <i>index</i>, modifying <i>str</i>. Negative indices count from the
4852 * end of the string, and insert <em>after</em> the given character.
 *  The intent is to insert <i>other_str</i> so that it starts at the given
4854 * <i>index</i>.
4855 *
4856 * "abcd".insert(0, 'X') #=> "Xabcd"
4857 * "abcd".insert(3, 'X') #=> "abcXd"
4858 * "abcd".insert(4, 'X') #=> "abcdX"
4859 * "abcd".insert(-3, 'X') #=> "abXcd"
4860 * "abcd".insert(-1, 'X') #=> "abcdX"
4861 */
4862
4863 static VALUE
rb_str_insert(VALUE str,VALUE idx,VALUE str2)4864 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
4865 {
4866 long pos = NUM2LONG(idx);
4867
4868 if (pos == -1) {
4869 return rb_str_append(str, str2);
4870 }
4871 else if (pos < 0) {
4872 pos++;
4873 }
4874 rb_str_splice(str, pos, 0, str2);
4875 return str;
4876 }
4877
4878
4879 /*
4880 * call-seq:
4881 * str.slice!(integer) -> new_str or nil
4882 * str.slice!(integer, integer) -> new_str or nil
4883 * str.slice!(range) -> new_str or nil
4884 * str.slice!(regexp) -> new_str or nil
4885 * str.slice!(other_str) -> new_str or nil
4886 *
4887 * Deletes the specified portion from <i>str</i>, and returns the portion
4888 * deleted.
4889 *
4890 * string = "this is a string"
4891 * string.slice!(2) #=> "i"
4892 * string.slice!(3..6) #=> " is "
4893 * string.slice!(/s.*t/) #=> "sa st"
4894 * string.slice!("r") #=> "r"
4895 * string #=> "thing"
4896 */
4897
4898 static VALUE
rb_str_slice_bang(int argc,VALUE * argv,VALUE str)4899 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
4900 {
4901 VALUE result;
4902 VALUE buf[3];
4903 int i;
4904
4905 rb_check_arity(argc, 1, 2);
4906 for (i=0; i<argc; i++) {
4907 buf[i] = argv[i];
4908 }
4909 str_modify_keep_cr(str);
4910 result = rb_str_aref_m(argc, buf, str);
4911 if (!NIL_P(result)) {
4912 buf[i] = rb_str_new(0,0);
4913 rb_str_aset_m(argc+1, buf, str);
4914 }
4915 return result;
4916 }
4917
4918 static VALUE
get_pat(VALUE pat)4919 get_pat(VALUE pat)
4920 {
4921 VALUE val;
4922
4923 if (SPECIAL_CONST_P(pat)) goto to_string;
4924 switch (BUILTIN_TYPE(pat)) {
4925 case T_REGEXP:
4926 return pat;
4927
4928 case T_STRING:
4929 break;
4930
4931 default:
4932 to_string:
4933 val = rb_check_string_type(pat);
4934 if (NIL_P(val)) {
4935 Check_Type(pat, T_REGEXP);
4936 }
4937 pat = val;
4938 }
4939
4940 return rb_reg_regcomp(pat);
4941 }
4942
4943 static VALUE
get_pat_quoted(VALUE pat,int check)4944 get_pat_quoted(VALUE pat, int check)
4945 {
4946 VALUE val;
4947
4948 if (SPECIAL_CONST_P(pat)) goto to_string;
4949 switch (BUILTIN_TYPE(pat)) {
4950 case T_REGEXP:
4951 return pat;
4952
4953 case T_STRING:
4954 break;
4955
4956 default:
4957 to_string:
4958 val = rb_check_string_type(pat);
4959 if (NIL_P(val)) {
4960 Check_Type(pat, T_REGEXP);
4961 }
4962 pat = val;
4963 }
4964 if (check && is_broken_string(pat)) {
4965 rb_exc_raise(rb_reg_check_preprocess(pat));
4966 }
4967 return pat;
4968 }
4969
4970 static long
rb_pat_search(VALUE pat,VALUE str,long pos,int set_backref_str)4971 rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
4972 {
4973 if (BUILTIN_TYPE(pat) == T_STRING) {
4974 pos = rb_strseq_index(str, pat, pos, 1);
4975 if (set_backref_str) {
4976 if (pos >= 0) {
4977 VALUE match;
4978 str = rb_str_new_frozen(str);
4979 rb_backref_set_string(str, pos, RSTRING_LEN(pat));
4980 match = rb_backref_get();
4981 OBJ_INFECT(match, pat);
4982 }
4983 else {
4984 rb_backref_set(Qnil);
4985 }
4986 }
4987 return pos;
4988 }
4989 else {
4990 return rb_reg_search0(pat, str, pos, 0, set_backref_str);
4991 }
4992 }
4993
4994
4995 /*
4996 * call-seq:
4997 * str.sub!(pattern, replacement) -> str or nil
4998 * str.sub!(pattern) {|match| block } -> str or nil
4999 *
5000 * Performs the same substitution as String#sub in-place.
5001 *
5002 * Returns +str+ if a substitution was performed or +nil+ if no substitution
5003 * was performed.
5004 */
5005
5006 static VALUE
rb_str_sub_bang(int argc,VALUE * argv,VALUE str)5007 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
5008 {
5009 VALUE pat, repl, hash = Qnil;
5010 int iter = 0;
5011 int tainted = 0;
5012 long plen;
5013 int min_arity = rb_block_given_p() ? 1 : 2;
5014 long beg;
5015
5016 rb_check_arity(argc, min_arity, 2);
5017 if (argc == 1) {
5018 iter = 1;
5019 }
5020 else {
5021 repl = argv[1];
5022 hash = rb_check_hash_type(argv[1]);
5023 if (NIL_P(hash)) {
5024 StringValue(repl);
5025 }
5026 tainted = OBJ_TAINTED_RAW(repl);
5027 }
5028
5029 pat = get_pat_quoted(argv[0], 1);
5030
5031 str_modifiable(str);
5032 beg = rb_pat_search(pat, str, 0, 1);
5033 if (beg >= 0) {
5034 rb_encoding *enc;
5035 int cr = ENC_CODERANGE(str);
5036 long beg0, end0;
5037 VALUE match, match0 = Qnil;
5038 struct re_registers *regs;
5039 char *p, *rp;
5040 long len, rlen;
5041
5042 match = rb_backref_get();
5043 regs = RMATCH_REGS(match);
5044 if (RB_TYPE_P(pat, T_STRING)) {
5045 beg0 = beg;
5046 end0 = beg0 + RSTRING_LEN(pat);
5047 match0 = pat;
5048 }
5049 else {
5050 beg0 = BEG(0);
5051 end0 = END(0);
5052 if (iter) match0 = rb_reg_nth_match(0, match);
5053 }
5054
5055 if (iter || !NIL_P(hash)) {
5056 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5057
5058 if (iter) {
5059 repl = rb_obj_as_string(rb_yield(match0));
5060 }
5061 else {
5062 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5063 repl = rb_obj_as_string(repl);
5064 }
5065 str_mod_check(str, p, len);
5066 rb_check_frozen(str);
5067 }
5068 else {
5069 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5070 }
5071
5072 enc = rb_enc_compatible(str, repl);
5073 if (!enc) {
5074 rb_encoding *str_enc = STR_ENC_GET(str);
5075 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5076 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
5077 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
5078 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
5079 rb_enc_name(str_enc),
5080 rb_enc_name(STR_ENC_GET(repl)));
5081 }
5082 enc = STR_ENC_GET(repl);
5083 }
5084 rb_str_modify(str);
5085 rb_enc_associate(str, enc);
5086 tainted |= OBJ_TAINTED_RAW(repl);
5087 if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
5088 int cr2 = ENC_CODERANGE(repl);
5089 if (cr2 == ENC_CODERANGE_BROKEN ||
5090 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
5091 cr = ENC_CODERANGE_UNKNOWN;
5092 else
5093 cr = cr2;
5094 }
5095 plen = end0 - beg0;
5096 rlen = RSTRING_LEN(repl);
5097 len = RSTRING_LEN(str);
5098 if (rlen > plen) {
5099 RESIZE_CAPA(str, len + rlen - plen);
5100 }
5101 p = RSTRING_PTR(str);
5102 if (rlen != plen) {
5103 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
5104 }
5105 rp = RSTRING_PTR(repl);
5106 memmove(p + beg0, rp, rlen);
5107 len += rlen - plen;
5108 STR_SET_LEN(str, len);
5109 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
5110 ENC_CODERANGE_SET(str, cr);
5111 FL_SET_RAW(str, tainted);
5112
5113 return str;
5114 }
5115 return Qnil;
5116 }
5117
5118
5119 /*
5120 * call-seq:
5121 * str.sub(pattern, replacement) -> new_str
5122 * str.sub(pattern, hash) -> new_str
5123 * str.sub(pattern) {|match| block } -> new_str
5124 *
5125 * Returns a copy of +str+ with the _first_ occurrence of +pattern+
5126 * replaced by the second argument. The +pattern+ is typically a Regexp; if
5127 * given as a String, any regular expression metacharacters it contains will
5128 * be interpreted literally, e.g. <code>'\\\d'</code> will match a backslash
5129 * followed by 'd', instead of a digit.
5130 *
5131 * If +replacement+ is a String it will be substituted for the matched text.
5132 * It may contain back-references to the pattern's capture groups of the form
5133 * <code>"\\d"</code>, where <i>d</i> is a group number, or
5134 * <code>"\\k<n>"</code>, where <i>n</i> is a group name. If it is a
5135 * double-quoted string, both back-references must be preceded by an
5136 * additional backslash. However, within +replacement+ the special match
5137 * variables, such as <code>$&</code>, will not refer to the current match.
5138 * If +replacement+ is a String that looks like a pattern's capture group but
5139 * is actually not a pattern capture group e.g. <code>"\\'"</code>, then it
5140 * will have to be preceded by two backslashes like so <code>"\\\\'"</code>.
5141 *
5142 * If the second argument is a Hash, and the matched text is one of its keys,
5143 * the corresponding value is the replacement string.
5144 *
5145 * In the block form, the current match string is passed in as a parameter,
5146 * and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
5147 * <code>$&</code>, and <code>$'</code> will be set appropriately. The value
5148 * returned by the block will be substituted for the match on each call.
5149 *
5150 * The result inherits any tainting in the original string or any supplied
5151 * replacement string.
5152 *
5153 * "hello".sub(/[aeiou]/, '*') #=> "h*llo"
5154 * "hello".sub(/([aeiou])/, '<\1>') #=> "h<e>llo"
5155 * "hello".sub(/./) {|s| s.ord.to_s + ' ' } #=> "104 ello"
5156 * "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*') #=> "h*e*llo"
5157 * 'Is SHELL your preferred shell?'.sub(/[[:upper:]]{2,}/, ENV)
5158 * #=> "Is /bin/bash your preferred shell?"
5159 */
5160
5161 static VALUE
rb_str_sub(int argc,VALUE * argv,VALUE str)5162 rb_str_sub(int argc, VALUE *argv, VALUE str)
5163 {
5164 str = rb_str_dup(str);
5165 rb_str_sub_bang(argc, argv, str);
5166 return str;
5167 }
5168
5169 static VALUE
str_gsub(int argc,VALUE * argv,VALUE str,int bang)5170 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
5171 {
5172 VALUE pat, val = Qnil, repl, match, match0 = Qnil, dest, hash = Qnil;
5173 struct re_registers *regs;
5174 long beg, beg0, end0;
5175 long offset, blen, slen, len, last;
5176 enum {STR, ITER, MAP} mode = STR;
5177 char *sp, *cp;
5178 int tainted = 0;
5179 int need_backref = -1;
5180 rb_encoding *str_enc;
5181
5182 switch (argc) {
5183 case 1:
5184 RETURN_ENUMERATOR(str, argc, argv);
5185 mode = ITER;
5186 break;
5187 case 2:
5188 repl = argv[1];
5189 hash = rb_check_hash_type(argv[1]);
5190 if (NIL_P(hash)) {
5191 StringValue(repl);
5192 }
5193 else {
5194 mode = MAP;
5195 }
5196 tainted = OBJ_TAINTED_RAW(repl);
5197 break;
5198 default:
5199 rb_check_arity(argc, 1, 2);
5200 }
5201
5202 pat = get_pat_quoted(argv[0], 1);
5203 beg = rb_pat_search(pat, str, 0, need_backref);
5204 if (beg < 0) {
5205 if (bang) return Qnil; /* no match, no substitution */
5206 return rb_str_dup(str);
5207 }
5208
5209 offset = 0;
5210 blen = RSTRING_LEN(str) + 30; /* len + margin */
5211 dest = rb_str_buf_new(blen);
5212 sp = RSTRING_PTR(str);
5213 slen = RSTRING_LEN(str);
5214 cp = sp;
5215 str_enc = STR_ENC_GET(str);
5216 rb_enc_associate(dest, str_enc);
5217 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
5218
5219 do {
5220 match = rb_backref_get();
5221 regs = RMATCH_REGS(match);
5222 if (RB_TYPE_P(pat, T_STRING)) {
5223 beg0 = beg;
5224 end0 = beg0 + RSTRING_LEN(pat);
5225 match0 = pat;
5226 }
5227 else {
5228 beg0 = BEG(0);
5229 end0 = END(0);
5230 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
5231 }
5232
5233 if (mode) {
5234 if (mode == ITER) {
5235 val = rb_obj_as_string(rb_yield(match0));
5236 }
5237 else {
5238 val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5239 val = rb_obj_as_string(val);
5240 }
5241 str_mod_check(str, sp, slen);
5242 if (val == dest) { /* paranoid check [ruby-dev:24827] */
5243 rb_raise(rb_eRuntimeError, "block should not cheat");
5244 }
5245 }
5246 else if (need_backref) {
5247 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5248 if (need_backref < 0) {
5249 need_backref = val != repl;
5250 }
5251 }
5252 else {
5253 val = repl;
5254 }
5255
5256 tainted |= OBJ_TAINTED_RAW(val);
5257
5258 len = beg0 - offset; /* copy pre-match substr */
5259 if (len) {
5260 rb_enc_str_buf_cat(dest, cp, len, str_enc);
5261 }
5262
5263 rb_str_buf_append(dest, val);
5264
5265 last = offset;
5266 offset = end0;
5267 if (beg0 == end0) {
5268 /*
5269 * Always consume at least one character of the input string
5270 * in order to prevent infinite loops.
5271 */
5272 if (RSTRING_LEN(str) <= end0) break;
5273 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
5274 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
5275 offset = end0 + len;
5276 }
5277 cp = RSTRING_PTR(str) + offset;
5278 if (offset > RSTRING_LEN(str)) break;
5279 beg = rb_pat_search(pat, str, offset, need_backref);
5280 } while (beg >= 0);
5281 if (RSTRING_LEN(str) > offset) {
5282 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
5283 }
5284 rb_pat_search(pat, str, last, 1);
5285 if (bang) {
5286 str_shared_replace(str, dest);
5287 }
5288 else {
5289 RBASIC_SET_CLASS(dest, rb_obj_class(str));
5290 tainted |= OBJ_TAINTED_RAW(str);
5291 str = dest;
5292 }
5293
5294 FL_SET_RAW(str, tainted);
5295 return str;
5296 }
5297
5298
5299 /*
5300 * call-seq:
5301 * str.gsub!(pattern, replacement) -> str or nil
5302 * str.gsub!(pattern, hash) -> str or nil
5303 * str.gsub!(pattern) {|match| block } -> str or nil
5304 * str.gsub!(pattern) -> an_enumerator
5305 *
5306 * Performs the substitutions of <code>String#gsub</code> in place, returning
5307 * <i>str</i>, or <code>nil</code> if no substitutions were performed.
5308 * If no block and no <i>replacement</i> is given, an enumerator is returned instead.
5309 */
5310
5311 static VALUE
rb_str_gsub_bang(int argc,VALUE * argv,VALUE str)5312 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
5313 {
5314 str_modify_keep_cr(str);
5315 return str_gsub(argc, argv, str, 1);
5316 }
5317
5318
5319 /*
5320 * call-seq:
5321 * str.gsub(pattern, replacement) -> new_str
5322 * str.gsub(pattern, hash) -> new_str
5323 * str.gsub(pattern) {|match| block } -> new_str
5324 * str.gsub(pattern) -> enumerator
5325 *
5326 * Returns a copy of <i>str</i> with <em>all</em> occurrences of
5327 * <i>pattern</i> substituted for the second argument. The <i>pattern</i> is
5328 * typically a <code>Regexp</code>; if given as a <code>String</code>, any
5329 * regular expression metacharacters it contains will be interpreted
5330 * literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd',
5331 * instead of a digit.
5332 *
5333 * If <i>replacement</i> is a <code>String</code> it will be substituted for
5334 * the matched text. It may contain back-references to the pattern's capture
5335 * groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
5336 * <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
5337 * double-quoted string, both back-references must be preceded by an
5338 * additional backslash. However, within <i>replacement</i> the special match
5339 * variables, such as <code>$&</code>, will not refer to the current match.
5340 *
5341 * If the second argument is a <code>Hash</code>, and the matched text is one
5342 * of its keys, the corresponding value is the replacement string.
5343 *
5344 * In the block form, the current match string is passed in as a parameter,
5345 * and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
5346 * <code>$&</code>, and <code>$'</code> will be set appropriately. The value
5347 * returned by the block will be substituted for the match on each call.
5348 *
5349 * The result inherits any tainting in the original string or any supplied
5350 * replacement string.
5351 *
5352 * When neither a block nor a second argument is supplied, an
5353 * <code>Enumerator</code> is returned.
5354 *
5355 * "hello".gsub(/[aeiou]/, '*') #=> "h*ll*"
5356 * "hello".gsub(/([aeiou])/, '<\1>') #=> "h<e>ll<o>"
5357 * "hello".gsub(/./) {|s| s.ord.to_s + ' '} #=> "104 101 108 108 111 "
5358 * "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}') #=> "h{e}ll{o}"
5359 * 'hello'.gsub(/[eo]/, 'e' => 3, 'o' => '*') #=> "h3ll*"
5360 */
5361
5362 static VALUE
rb_str_gsub(int argc,VALUE * argv,VALUE str)5363 rb_str_gsub(int argc, VALUE *argv, VALUE str)
5364 {
5365 return str_gsub(argc, argv, str, 0);
5366 }
5367
5368
5369 /*
5370 * call-seq:
5371 * str.replace(other_str) -> str
5372 *
5373 * Replaces the contents and taintedness of <i>str</i> with the corresponding
5374 * values in <i>other_str</i>.
5375 *
5376 * s = "hello" #=> "hello"
5377 * s.replace "world" #=> "world"
5378 */
5379
5380 VALUE
rb_str_replace(VALUE str,VALUE str2)5381 rb_str_replace(VALUE str, VALUE str2)
5382 {
5383 str_modifiable(str);
5384 if (str == str2) return str;
5385
5386 StringValue(str2);
5387 str_discard(str);
5388 return str_replace(str, str2);
5389 }
5390
5391 /*
5392 * call-seq:
5393 * string.clear -> string
5394 *
5395 * Makes string empty.
5396 *
5397 * a = "abcde"
5398 * a.clear #=> ""
5399 */
5400
5401 static VALUE
rb_str_clear(VALUE str)5402 rb_str_clear(VALUE str)
5403 {
5404 str_discard(str);
5405 STR_SET_EMBED(str);
5406 STR_SET_EMBED_LEN(str, 0);
5407 RSTRING_PTR(str)[0] = 0;
5408 if (rb_enc_asciicompat(STR_ENC_GET(str)))
5409 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
5410 else
5411 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
5412 return str;
5413 }
5414
5415 /*
5416 * call-seq:
5417 * string.chr -> string
5418 *
5419 * Returns a one-character string at the beginning of the string.
5420 *
5421 * a = "abcde"
5422 * a.chr #=> "a"
5423 */
5424
5425 static VALUE
rb_str_chr(VALUE str)5426 rb_str_chr(VALUE str)
5427 {
5428 return rb_str_substr(str, 0, 1);
5429 }
5430
5431 /*
5432 * call-seq:
5433 * str.getbyte(index) -> 0 .. 255
5434 *
5435 * returns the <i>index</i>th byte as an integer.
5436 */
5437 static VALUE
rb_str_getbyte(VALUE str,VALUE index)5438 rb_str_getbyte(VALUE str, VALUE index)
5439 {
5440 long pos = NUM2LONG(index);
5441
5442 if (pos < 0)
5443 pos += RSTRING_LEN(str);
5444 if (pos < 0 || RSTRING_LEN(str) <= pos)
5445 return Qnil;
5446
5447 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
5448 }
5449
5450 /*
5451 * call-seq:
5452 * str.setbyte(index, integer) -> integer
5453 *
5454 * modifies the <i>index</i>th byte as <i>integer</i>.
5455 */
5456 static VALUE
rb_str_setbyte(VALUE str,VALUE index,VALUE value)5457 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
5458 {
5459 long pos = NUM2LONG(index);
5460 long len = RSTRING_LEN(str);
5461 char *head, *left = 0;
5462 unsigned char *ptr;
5463 rb_encoding *enc;
5464 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
5465
5466 if (pos < -len || len <= pos)
5467 rb_raise(rb_eIndexError, "index %ld out of string", pos);
5468 if (pos < 0)
5469 pos += len;
5470
5471 VALUE v = rb_to_int(value);
5472 VALUE w = rb_int_modulo(v, INT2FIX(256));
5473 unsigned char byte = NUM2INT(w) & 0xFF;
5474
5475 if (!str_independent(str))
5476 str_make_independent(str);
5477 enc = STR_ENC_GET(str);
5478 head = RSTRING_PTR(str);
5479 ptr = (unsigned char *)&head[pos];
5480 if (!STR_EMBED_P(str)) {
5481 cr = ENC_CODERANGE(str);
5482 switch (cr) {
5483 case ENC_CODERANGE_7BIT:
5484 left = (char *)ptr;
5485 *ptr = byte;
5486 if (ISASCII(byte)) goto end;
5487 nlen = rb_enc_precise_mbclen(left, head+len, enc);
5488 if (!MBCLEN_CHARFOUND_P(nlen))
5489 ENC_CODERANGE_SET(str, ENC_CODERANGE_BROKEN);
5490 else
5491 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
5492 goto end;
5493 case ENC_CODERANGE_VALID:
5494 left = rb_enc_left_char_head(head, ptr, head+len, enc);
5495 width = rb_enc_precise_mbclen(left, head+len, enc);
5496 *ptr = byte;
5497 nlen = rb_enc_precise_mbclen(left, head+len, enc);
5498 if (!MBCLEN_CHARFOUND_P(nlen))
5499 ENC_CODERANGE_SET(str, ENC_CODERANGE_BROKEN);
5500 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
5501 ENC_CODERANGE_CLEAR(str);
5502 goto end;
5503 }
5504 }
5505 ENC_CODERANGE_CLEAR(str);
5506 *ptr = byte;
5507
5508 end:
5509 return value;
5510 }
5511
5512 static VALUE
str_byte_substr(VALUE str,long beg,long len,int empty)5513 str_byte_substr(VALUE str, long beg, long len, int empty)
5514 {
5515 char *p, *s = RSTRING_PTR(str);
5516 long n = RSTRING_LEN(str);
5517 VALUE str2;
5518
5519 if (beg > n || len < 0) return Qnil;
5520 if (beg < 0) {
5521 beg += n;
5522 if (beg < 0) return Qnil;
5523 }
5524 if (len > n - beg)
5525 len = n - beg;
5526 if (len <= 0) {
5527 if (!empty) return Qnil;
5528 len = 0;
5529 p = 0;
5530 }
5531 else
5532 p = s + beg;
5533
5534 if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) && SHARABLE_SUBSTRING_P(beg, len, n)) {
5535 str2 = rb_str_new_frozen(str);
5536 str2 = str_new_shared(rb_obj_class(str2), str2);
5537 RSTRING(str2)->as.heap.ptr += beg;
5538 RSTRING(str2)->as.heap.len = len;
5539 }
5540 else {
5541 str2 = rb_str_new_with_class(str, p, len);
5542 }
5543
5544 str_enc_copy(str2, str);
5545
5546 if (RSTRING_LEN(str2) == 0) {
5547 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
5548 ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID);
5549 else
5550 ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
5551 }
5552 else {
5553 switch (ENC_CODERANGE(str)) {
5554 case ENC_CODERANGE_7BIT:
5555 ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
5556 break;
5557 default:
5558 ENC_CODERANGE_SET(str2, ENC_CODERANGE_UNKNOWN);
5559 break;
5560 }
5561 }
5562
5563 OBJ_INFECT_RAW(str2, str);
5564
5565 return str2;
5566 }
5567
5568 static VALUE
str_byte_aref(VALUE str,VALUE indx)5569 str_byte_aref(VALUE str, VALUE indx)
5570 {
5571 long idx;
5572 if (FIXNUM_P(indx)) {
5573 idx = FIX2LONG(indx);
5574 }
5575 else {
5576 /* check if indx is Range */
5577 long beg, len = RSTRING_LEN(str);
5578
5579 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5580 case Qfalse:
5581 break;
5582 case Qnil:
5583 return Qnil;
5584 default:
5585 return str_byte_substr(str, beg, len, TRUE);
5586 }
5587
5588 idx = NUM2LONG(indx);
5589 }
5590 return str_byte_substr(str, idx, 1, FALSE);
5591 }
5592
5593 /*
5594 * call-seq:
5595 * str.byteslice(integer) -> new_str or nil
5596 * str.byteslice(integer, integer) -> new_str or nil
5597 * str.byteslice(range) -> new_str or nil
5598 *
5599 * Byte Reference---If passed a single <code>Integer</code>, returns a
5600 * substring of one byte at that position. If passed two <code>Integer</code>
5601 * objects, returns a substring starting at the offset given by the first, and
5602 * a length given by the second. If given a <code>Range</code>, a substring containing
5603 * bytes at offsets given by the range is returned. In all three cases, if
5604 * an offset is negative, it is counted from the end of <i>str</i>. Returns
5605 * <code>nil</code> if the initial offset falls outside the string, the length
5606 * is negative, or the beginning of the range is greater than the end.
5607 * The encoding of the resulted string keeps original encoding.
5608 *
5609 * "hello".byteslice(1) #=> "e"
5610 * "hello".byteslice(-1) #=> "o"
5611 * "hello".byteslice(1, 2) #=> "el"
5612 * "\x80\u3042".byteslice(1, 3) #=> "\u3042"
5613 * "\x03\u3042\xff".byteslice(1..3) #=> "\u3042"
5614 */
5615
5616 static VALUE
rb_str_byteslice(int argc,VALUE * argv,VALUE str)5617 rb_str_byteslice(int argc, VALUE *argv, VALUE str)
5618 {
5619 if (argc == 2) {
5620 long beg = NUM2LONG(argv[0]);
5621 long end = NUM2LONG(argv[1]);
5622 return str_byte_substr(str, beg, end, TRUE);
5623 }
5624 rb_check_arity(argc, 1, 2);
5625 return str_byte_aref(str, argv[0]);
5626 }
5627
5628 /*
5629 * call-seq:
5630 * str.reverse -> new_str
5631 *
5632 * Returns a new string with the characters from <i>str</i> in reverse order.
5633 *
5634 * "stressed".reverse #=> "desserts"
5635 */
5636
5637 static VALUE
rb_str_reverse(VALUE str)5638 rb_str_reverse(VALUE str)
5639 {
5640 rb_encoding *enc;
5641 VALUE rev;
5642 char *s, *e, *p;
5643 int cr;
5644
5645 if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
5646 enc = STR_ENC_GET(str);
5647 rev = rb_str_new_with_class(str, 0, RSTRING_LEN(str));
5648 s = RSTRING_PTR(str); e = RSTRING_END(str);
5649 p = RSTRING_END(rev);
5650 cr = ENC_CODERANGE(str);
5651
5652 if (RSTRING_LEN(str) > 1) {
5653 if (single_byte_optimizable(str)) {
5654 while (s < e) {
5655 *--p = *s++;
5656 }
5657 }
5658 else if (cr == ENC_CODERANGE_VALID) {
5659 while (s < e) {
5660 int clen = rb_enc_fast_mbclen(s, e, enc);
5661
5662 p -= clen;
5663 memcpy(p, s, clen);
5664 s += clen;
5665 }
5666 }
5667 else {
5668 cr = rb_enc_asciicompat(enc) ?
5669 ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
5670 while (s < e) {
5671 int clen = rb_enc_mbclen(s, e, enc);
5672
5673 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
5674 p -= clen;
5675 memcpy(p, s, clen);
5676 s += clen;
5677 }
5678 }
5679 }
5680 STR_SET_LEN(rev, RSTRING_LEN(str));
5681 OBJ_INFECT_RAW(rev, str);
5682 str_enc_copy(rev, str);
5683 ENC_CODERANGE_SET(rev, cr);
5684
5685 return rev;
5686 }
5687
5688
5689 /*
5690 * call-seq:
5691 * str.reverse! -> str
5692 *
5693 * Reverses <i>str</i> in place.
5694 */
5695
5696 static VALUE
rb_str_reverse_bang(VALUE str)5697 rb_str_reverse_bang(VALUE str)
5698 {
5699 if (RSTRING_LEN(str) > 1) {
5700 if (single_byte_optimizable(str)) {
5701 char *s, *e, c;
5702
5703 str_modify_keep_cr(str);
5704 s = RSTRING_PTR(str);
5705 e = RSTRING_END(str) - 1;
5706 while (s < e) {
5707 c = *s;
5708 *s++ = *e;
5709 *e-- = c;
5710 }
5711 }
5712 else {
5713 str_shared_replace(str, rb_str_reverse(str));
5714 }
5715 }
5716 else {
5717 str_modify_keep_cr(str);
5718 }
5719 return str;
5720 }
5721
5722
5723 /*
5724 * call-seq:
5725 * str.include? other_str -> true or false
5726 *
5727 * Returns <code>true</code> if <i>str</i> contains the given string or
5728 * character.
5729 *
5730 * "hello".include? "lo" #=> true
5731 * "hello".include? "ol" #=> false
5732 * "hello".include? ?h #=> true
5733 */
5734
5735 static VALUE
rb_str_include(VALUE str,VALUE arg)5736 rb_str_include(VALUE str, VALUE arg)
5737 {
5738 long i;
5739
5740 StringValue(arg);
5741 i = rb_str_index(str, arg, 0);
5742
5743 if (i == -1) return Qfalse;
5744 return Qtrue;
5745 }
5746
5747
5748 /*
5749 * call-seq:
5750 * str.to_i(base=10) -> integer
5751 *
5752 * Returns the result of interpreting leading characters in <i>str</i> as an
5753 * integer base <i>base</i> (between 2 and 36). Extraneous characters past the
5754 * end of a valid number are ignored. If there is not a valid number at the
5755 * start of <i>str</i>, <code>0</code> is returned. This method never raises an
5756 * exception when <i>base</i> is valid.
5757 *
5758 * "12345".to_i #=> 12345
5759 * "99 red balloons".to_i #=> 99
5760 * "0a".to_i #=> 0
5761 * "0a".to_i(16) #=> 10
5762 * "hello".to_i #=> 0
5763 * "1100101".to_i(2) #=> 101
5764 * "1100101".to_i(8) #=> 294977
5765 * "1100101".to_i(10) #=> 1100101
5766 * "1100101".to_i(16) #=> 17826049
5767 */
5768
5769 static VALUE
rb_str_to_i(int argc,VALUE * argv,VALUE str)5770 rb_str_to_i(int argc, VALUE *argv, VALUE str)
5771 {
5772 int base = 10;
5773
5774 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
5775 rb_raise(rb_eArgError, "invalid radix %d", base);
5776 }
5777 return rb_str_to_inum(str, base, FALSE);
5778 }
5779
5780
5781 /*
5782 * call-seq:
5783 * str.to_f -> float
5784 *
5785 * Returns the result of interpreting leading characters in <i>str</i> as a
5786 * floating point number. Extraneous characters past the end of a valid number
5787 * are ignored. If there is not a valid number at the start of <i>str</i>,
5788 * <code>0.0</code> is returned. This method never raises an exception.
5789 *
5790 * "123.45e1".to_f #=> 1234.5
5791 * "45.67 degrees".to_f #=> 45.67
5792 * "thx1138".to_f #=> 0.0
5793 */
5794
5795 static VALUE
rb_str_to_f(VALUE str)5796 rb_str_to_f(VALUE str)
5797 {
5798 return DBL2NUM(rb_str_to_dbl(str, FALSE));
5799 }
5800
5801
5802 /*
5803 * call-seq:
5804 * str.to_s -> str
5805 * str.to_str -> str
5806 *
5807 * Returns +self+.
5808 *
5809 * If called on a subclass of String, converts the receiver to a String object.
5810 */
5811
5812 static VALUE
rb_str_to_s(VALUE str)5813 rb_str_to_s(VALUE str)
5814 {
5815 if (rb_obj_class(str) != rb_cString) {
5816 return str_duplicate(rb_cString, str);
5817 }
5818 return str;
5819 }
5820
#if 0
/*
 * Compiled out by the surrounding "#if 0".  Encodes code point +c+ in
 * +enc+ into a local buffer and appends the resulting bytes to +str+.
 */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char s[RUBY_MAX_CHAR_LEN];
    int n = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, s, enc);
    rb_enc_str_buf_cat(str, s, n, enc);
}
#endif
5832
5833 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
5834
5835 int
rb_str_buf_cat_escaped_char(VALUE result,unsigned int c,int unicode_p)5836 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
5837 {
5838 char buf[CHAR_ESC_LEN + 1];
5839 int l;
5840
5841 #if SIZEOF_INT > 4
5842 c &= 0xffffffff;
5843 #endif
5844 if (unicode_p) {
5845 if (c < 0x7F && ISPRINT(c)) {
5846 snprintf(buf, CHAR_ESC_LEN, "%c", c);
5847 }
5848 else if (c < 0x10000) {
5849 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
5850 }
5851 else {
5852 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
5853 }
5854 }
5855 else {
5856 if (c < 0x100) {
5857 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
5858 }
5859 else {
5860 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
5861 }
5862 }
5863 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
5864 rb_str_buf_cat(result, buf, l);
5865 return l;
5866 }
5867
5868 VALUE
rb_str_escape(VALUE str)5869 rb_str_escape(VALUE str)
5870 {
5871 int encidx = ENCODING_GET(str);
5872 rb_encoding *enc = rb_enc_from_index(encidx);
5873 const char *p = RSTRING_PTR(str);
5874 const char *pend = RSTRING_END(str);
5875 const char *prev = p;
5876 char buf[CHAR_ESC_LEN + 1];
5877 VALUE result = rb_str_buf_new(0);
5878 int unicode_p = rb_enc_unicode_p(enc);
5879 int asciicompat = rb_enc_asciicompat(enc);
5880
5881 while (p < pend) {
5882 unsigned int c, cc;
5883 int n = rb_enc_precise_mbclen(p, pend, enc);
5884 if (!MBCLEN_CHARFOUND_P(n)) {
5885 if (p > prev) str_buf_cat(result, prev, p - prev);
5886 n = rb_enc_mbminlen(enc);
5887 if (pend < p + n)
5888 n = (int)(pend - p);
5889 while (n--) {
5890 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
5891 str_buf_cat(result, buf, strlen(buf));
5892 prev = ++p;
5893 }
5894 continue;
5895 }
5896 n = MBCLEN_CHARFOUND_LEN(n);
5897 c = rb_enc_mbc_to_codepoint(p, pend, enc);
5898 p += n;
5899 switch (c) {
5900 case '\n': cc = 'n'; break;
5901 case '\r': cc = 'r'; break;
5902 case '\t': cc = 't'; break;
5903 case '\f': cc = 'f'; break;
5904 case '\013': cc = 'v'; break;
5905 case '\010': cc = 'b'; break;
5906 case '\007': cc = 'a'; break;
5907 case 033: cc = 'e'; break;
5908 default: cc = 0; break;
5909 }
5910 if (cc) {
5911 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
5912 buf[0] = '\\';
5913 buf[1] = (char)cc;
5914 str_buf_cat(result, buf, 2);
5915 prev = p;
5916 }
5917 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
5918 }
5919 else {
5920 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
5921 rb_str_buf_cat_escaped_char(result, c, unicode_p);
5922 prev = p;
5923 }
5924 }
5925 if (p > prev) str_buf_cat(result, prev, p - prev);
5926 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
5927
5928 OBJ_INFECT_RAW(result, str);
5929 return result;
5930 }
5931
5932 /*
5933 * call-seq:
5934 * str.inspect -> string
5935 *
5936 * Returns a printable version of _str_, surrounded by quote marks,
5937 * with special characters escaped.
5938 *
5939 * str = "hello"
5940 * str[3] = "\b"
5941 * str.inspect #=> "\"hel\\bo\""
5942 */
5943
5944 VALUE
rb_str_inspect(VALUE str)5945 rb_str_inspect(VALUE str)
5946 {
5947 int encidx = ENCODING_GET(str);
5948 rb_encoding *enc = rb_enc_from_index(encidx), *actenc;
5949 const char *p, *pend, *prev;
5950 char buf[CHAR_ESC_LEN + 1];
5951 VALUE result = rb_str_buf_new(0);
5952 rb_encoding *resenc = rb_default_internal_encoding();
5953 int unicode_p = rb_enc_unicode_p(enc);
5954 int asciicompat = rb_enc_asciicompat(enc);
5955
5956 if (resenc == NULL) resenc = rb_default_external_encoding();
5957 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
5958 rb_enc_associate(result, resenc);
5959 str_buf_cat2(result, "\"");
5960
5961 p = RSTRING_PTR(str); pend = RSTRING_END(str);
5962 prev = p;
5963 actenc = get_actual_encoding(encidx, str);
5964 if (actenc != enc) {
5965 enc = actenc;
5966 if (unicode_p) unicode_p = rb_enc_unicode_p(enc);
5967 }
5968 while (p < pend) {
5969 unsigned int c, cc;
5970 int n;
5971
5972 n = rb_enc_precise_mbclen(p, pend, enc);
5973 if (!MBCLEN_CHARFOUND_P(n)) {
5974 if (p > prev) str_buf_cat(result, prev, p - prev);
5975 n = rb_enc_mbminlen(enc);
5976 if (pend < p + n)
5977 n = (int)(pend - p);
5978 while (n--) {
5979 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
5980 str_buf_cat(result, buf, strlen(buf));
5981 prev = ++p;
5982 }
5983 continue;
5984 }
5985 n = MBCLEN_CHARFOUND_LEN(n);
5986 c = rb_enc_mbc_to_codepoint(p, pend, enc);
5987 p += n;
5988 if ((asciicompat || unicode_p) &&
5989 (c == '"'|| c == '\\' ||
5990 (c == '#' &&
5991 p < pend &&
5992 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
5993 (cc = rb_enc_codepoint(p,pend,enc),
5994 (cc == '$' || cc == '@' || cc == '{'))))) {
5995 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
5996 str_buf_cat2(result, "\\");
5997 if (asciicompat || enc == resenc) {
5998 prev = p - n;
5999 continue;
6000 }
6001 }
6002 switch (c) {
6003 case '\n': cc = 'n'; break;
6004 case '\r': cc = 'r'; break;
6005 case '\t': cc = 't'; break;
6006 case '\f': cc = 'f'; break;
6007 case '\013': cc = 'v'; break;
6008 case '\010': cc = 'b'; break;
6009 case '\007': cc = 'a'; break;
6010 case 033: cc = 'e'; break;
6011 default: cc = 0; break;
6012 }
6013 if (cc) {
6014 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6015 buf[0] = '\\';
6016 buf[1] = (char)cc;
6017 str_buf_cat(result, buf, 2);
6018 prev = p;
6019 continue;
6020 }
6021 if ((enc == resenc && rb_enc_isprint(c, enc)) ||
6022 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
6023 continue;
6024 }
6025 else {
6026 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6027 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6028 prev = p;
6029 continue;
6030 }
6031 }
6032 if (p > prev) str_buf_cat(result, prev, p - prev);
6033 str_buf_cat2(result, "\"");
6034
6035 OBJ_INFECT_RAW(result, str);
6036 return result;
6037 }
6038
6039 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
6040
6041 /*
6042 * call-seq:
6043 * str.dump -> new_str
6044 *
6045 * Produces a version of +str+ with all non-printing characters replaced by
6046 * <code>\nnn</code> notation and all special characters escaped.
6047 *
6048 * "hello \n ''".dump #=> "\"hello \\n ''\""
6049 */
6050
6051 VALUE
rb_str_dump(VALUE str)6052 rb_str_dump(VALUE str)
6053 {
6054 int encidx = rb_enc_get_index(str);
6055 rb_encoding *enc = rb_enc_from_index(encidx);
6056 long len;
6057 const char *p, *pend;
6058 char *q, *qend;
6059 VALUE result;
6060 int u8 = (encidx == rb_utf8_encindex());
6061 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
6062
6063 len = 2; /* "" */
6064 if (!rb_enc_asciicompat(enc)) {
6065 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
6066 len += strlen(enc->name);
6067 }
6068
6069 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6070 while (p < pend) {
6071 int clen;
6072 unsigned char c = *p++;
6073
6074 switch (c) {
6075 case '"': case '\\':
6076 case '\n': case '\r':
6077 case '\t': case '\f':
6078 case '\013': case '\010': case '\007': case '\033':
6079 clen = 2;
6080 break;
6081
6082 case '#':
6083 clen = IS_EVSTR(p, pend) ? 2 : 1;
6084 break;
6085
6086 default:
6087 if (ISPRINT(c)) {
6088 clen = 1;
6089 }
6090 else {
6091 if (u8 && c > 0x7F) { /* \u notation */
6092 int n = rb_enc_precise_mbclen(p-1, pend, enc);
6093 if (MBCLEN_CHARFOUND_P(n)) {
6094 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6095 if (cc <= 0xFFFF)
6096 clen = 6; /* \uXXXX */
6097 else if (cc <= 0xFFFFF)
6098 clen = 9; /* \u{XXXXX} */
6099 else
6100 clen = 10; /* \u{XXXXXX} */
6101 p += MBCLEN_CHARFOUND_LEN(n)-1;
6102 break;
6103 }
6104 }
6105 clen = 4; /* \xNN */
6106 }
6107 break;
6108 }
6109
6110 if (clen > LONG_MAX - len) {
6111 rb_raise(rb_eRuntimeError, "string size too big");
6112 }
6113 len += clen;
6114 }
6115
6116 result = rb_str_new_with_class(str, 0, len);
6117 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6118 q = RSTRING_PTR(result); qend = q + len + 1;
6119
6120 *q++ = '"';
6121 while (p < pend) {
6122 unsigned char c = *p++;
6123
6124 if (c == '"' || c == '\\') {
6125 *q++ = '\\';
6126 *q++ = c;
6127 }
6128 else if (c == '#') {
6129 if (IS_EVSTR(p, pend)) *q++ = '\\';
6130 *q++ = '#';
6131 }
6132 else if (c == '\n') {
6133 *q++ = '\\';
6134 *q++ = 'n';
6135 }
6136 else if (c == '\r') {
6137 *q++ = '\\';
6138 *q++ = 'r';
6139 }
6140 else if (c == '\t') {
6141 *q++ = '\\';
6142 *q++ = 't';
6143 }
6144 else if (c == '\f') {
6145 *q++ = '\\';
6146 *q++ = 'f';
6147 }
6148 else if (c == '\013') {
6149 *q++ = '\\';
6150 *q++ = 'v';
6151 }
6152 else if (c == '\010') {
6153 *q++ = '\\';
6154 *q++ = 'b';
6155 }
6156 else if (c == '\007') {
6157 *q++ = '\\';
6158 *q++ = 'a';
6159 }
6160 else if (c == '\033') {
6161 *q++ = '\\';
6162 *q++ = 'e';
6163 }
6164 else if (ISPRINT(c)) {
6165 *q++ = c;
6166 }
6167 else {
6168 *q++ = '\\';
6169 if (u8) {
6170 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
6171 if (MBCLEN_CHARFOUND_P(n)) {
6172 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6173 p += n;
6174 if (cc <= 0xFFFF)
6175 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
6176 else
6177 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
6178 q += strlen(q);
6179 continue;
6180 }
6181 }
6182 snprintf(q, qend-q, "x%02X", c);
6183 q += 3;
6184 }
6185 }
6186 *q++ = '"';
6187 *q = '\0';
6188 if (!rb_enc_asciicompat(enc)) {
6189 snprintf(q, qend-q, nonascii_suffix, enc->name);
6190 encidx = rb_ascii8bit_encindex();
6191 }
6192 OBJ_INFECT_RAW(result, str);
6193 /* result from dump is ASCII */
6194 rb_enc_associate_index(result, encidx);
6195 ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
6196 return result;
6197 }
6198
6199 static int
unescape_ascii(unsigned int c)6200 unescape_ascii(unsigned int c)
6201 {
6202 switch (c) {
6203 case 'n':
6204 return '\n';
6205 case 'r':
6206 return '\r';
6207 case 't':
6208 return '\t';
6209 case 'f':
6210 return '\f';
6211 case 'v':
6212 return '\13';
6213 case 'b':
6214 return '\010';
6215 case 'a':
6216 return '\007';
6217 case 'e':
6218 return 033;
6219 default:
6220 UNREACHABLE;
6221 }
6222 }
6223
6224 static void
undump_after_backslash(VALUE undumped,const char ** ss,const char * s_end,rb_encoding ** penc,bool * utf8,bool * binary)6225 undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
6226 {
6227 const char *s = *ss;
6228 unsigned int c;
6229 int codelen;
6230 size_t hexlen;
6231 unsigned char buf[6];
6232 static rb_encoding *enc_utf8 = NULL;
6233
6234 switch (*s) {
6235 case '\\':
6236 case '"':
6237 case '#':
6238 rb_str_cat(undumped, s, 1); /* cat itself */
6239 s++;
6240 break;
6241 case 'n':
6242 case 'r':
6243 case 't':
6244 case 'f':
6245 case 'v':
6246 case 'b':
6247 case 'a':
6248 case 'e':
6249 *buf = unescape_ascii(*s);
6250 rb_str_cat(undumped, (char *)buf, 1);
6251 s++;
6252 break;
6253 case 'u':
6254 if (*binary) {
6255 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
6256 }
6257 *utf8 = true;
6258 if (++s >= s_end) {
6259 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6260 }
6261 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
6262 if (*penc != enc_utf8) {
6263 *penc = enc_utf8;
6264 rb_enc_associate(undumped, enc_utf8);
6265 }
6266 if (*s == '{') { /* handle \u{...} form */
6267 s++;
6268 for (;;) {
6269 if (s >= s_end) {
6270 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
6271 }
6272 if (*s == '}') {
6273 s++;
6274 break;
6275 }
6276 if (ISSPACE(*s)) {
6277 s++;
6278 continue;
6279 }
6280 c = scan_hex(s, s_end-s, &hexlen);
6281 if (hexlen == 0 || hexlen > 6) {
6282 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6283 }
6284 if (c > 0x10ffff) {
6285 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
6286 }
6287 if (0xd800 <= c && c <= 0xdfff) {
6288 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
6289 }
6290 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
6291 rb_str_cat(undumped, (char *)buf, codelen);
6292 s += hexlen;
6293 }
6294 }
6295 else { /* handle \uXXXX form */
6296 c = scan_hex(s, 4, &hexlen);
6297 if (hexlen != 4) {
6298 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6299 }
6300 if (0xd800 <= c && c <= 0xdfff) {
6301 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
6302 }
6303 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
6304 rb_str_cat(undumped, (char *)buf, codelen);
6305 s += hexlen;
6306 }
6307 break;
6308 case 'x':
6309 if (*utf8) {
6310 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
6311 }
6312 *binary = true;
6313 if (++s >= s_end) {
6314 rb_raise(rb_eRuntimeError, "invalid hex escape");
6315 }
6316 *buf = scan_hex(s, 2, &hexlen);
6317 if (hexlen != 2) {
6318 rb_raise(rb_eRuntimeError, "invalid hex escape");
6319 }
6320 rb_str_cat(undumped, (char *)buf, 1);
6321 s += hexlen;
6322 break;
6323 default:
6324 rb_str_cat(undumped, s-1, 2);
6325 s++;
6326 }
6327
6328 *ss = s;
6329 }
6330
6331 static VALUE rb_str_is_ascii_only_p(VALUE str);
6332
6333 /*
6334 * call-seq:
6335 * str.undump -> new_str
6336 *
6337 * Produces unescaped version of +str+.
6338 * See also String#dump because String#undump does inverse of String#dump.
6339 *
6340 * "\"hello \\n ''\"".undump #=> "hello \n ''"
6341 */
6342
6343 static VALUE
str_undump(VALUE str)6344 str_undump(VALUE str)
6345 {
6346 const char *s = RSTRING_PTR(str);
6347 const char *s_end = RSTRING_END(str);
6348 rb_encoding *enc = rb_enc_get(str);
6349 VALUE undumped = rb_enc_str_new(s, 0L, enc);
6350 bool utf8 = false;
6351 bool binary = false;
6352 int w;
6353
6354 rb_must_asciicompat(str);
6355 if (rb_str_is_ascii_only_p(str) == Qfalse) {
6356 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
6357 }
6358 if (!str_null_check(str, &w)) {
6359 rb_raise(rb_eRuntimeError, "string contains null byte");
6360 }
6361 if (RSTRING_LEN(str) < 2) goto invalid_format;
6362 if (*s != '"') goto invalid_format;
6363
6364 /* strip '"' at the start */
6365 s++;
6366
6367 for (;;) {
6368 if (s >= s_end) {
6369 rb_raise(rb_eRuntimeError, "unterminated dumped string");
6370 }
6371
6372 if (*s == '"') {
6373 /* epilogue */
6374 s++;
6375 if (s == s_end) {
6376 /* ascii compatible dumped string */
6377 break;
6378 }
6379 else {
6380 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
6381 static const char dup_suffix[] = ".dup";
6382 const char *encname;
6383 int encidx;
6384 ptrdiff_t size;
6385
6386 /* check separately for strings dumped by older versions */
6387 size = sizeof(dup_suffix) - 1;
6388 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
6389
6390 size = sizeof(force_encoding_suffix) - 1;
6391 if (s_end - s <= size) goto invalid_format;
6392 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
6393 s += size;
6394
6395 if (utf8) {
6396 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
6397 }
6398
6399 encname = s;
6400 s = memchr(s, '"', s_end-s);
6401 size = s - encname;
6402 if (!s) goto invalid_format;
6403 if (s_end - s != 2) goto invalid_format;
6404 if (s[0] != '"' || s[1] != ')') goto invalid_format;
6405
6406 encidx = rb_enc_find_index2(encname, (long)size);
6407 if (encidx < 0) {
6408 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
6409 }
6410 rb_enc_associate_index(undumped, encidx);
6411 }
6412 break;
6413 }
6414
6415 if (*s == '\\') {
6416 s++;
6417 if (s >= s_end) {
6418 rb_raise(rb_eRuntimeError, "invalid escape");
6419 }
6420 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
6421 }
6422 else {
6423 rb_str_cat(undumped, s++, 1);
6424 }
6425 }
6426
6427 OBJ_INFECT(undumped, str);
6428 return undumped;
6429 invalid_format:
6430 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
6431 }
6432
6433 static void
rb_str_check_dummy_enc(rb_encoding * enc)6434 rb_str_check_dummy_enc(rb_encoding *enc)
6435 {
6436 if (rb_enc_dummy_p(enc)) {
6437 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
6438 rb_enc_name(enc));
6439 }
6440 }
6441
6442 static OnigCaseFoldType
check_case_options(int argc,VALUE * argv,OnigCaseFoldType flags)6443 check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
6444 {
6445 if (argc==0)
6446 return flags;
6447 if (argc>2)
6448 rb_raise(rb_eArgError, "too many options");
6449 if (argv[0]==sym_turkic) {
6450 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
6451 if (argc==2) {
6452 if (argv[1]==sym_lithuanian)
6453 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
6454 else
6455 rb_raise(rb_eArgError, "invalid second option");
6456 }
6457 }
6458 else if (argv[0]==sym_lithuanian) {
6459 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
6460 if (argc==2) {
6461 if (argv[1]==sym_turkic)
6462 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
6463 else
6464 rb_raise(rb_eArgError, "invalid second option");
6465 }
6466 }
6467 else if (argc>1)
6468 rb_raise(rb_eArgError, "too many options");
6469 else if (argv[0]==sym_ascii)
6470 flags |= ONIGENC_CASE_ASCII_ONLY;
6471 else if (argv[0]==sym_fold) {
6472 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
6473 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
6474 else
6475 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
6476 }
6477 else
6478 rb_raise(rb_eArgError, "invalid option");
6479 return flags;
6480 }
6481
/*
 * Extra bytes added to the capacity of every case-mapping buffer.
 * 16 should be long enough to absorb any kind of single character length
 * increase.
 * NOTE(review): the value is 20 although the note above speaks of 16;
 * presumably deliberate headroom -- confirm.
 */
#define CASE_MAPPING_ADDITIONAL_LENGTH 20
/* set CASEMAP_DEBUG to a non-zero value to print buffer tuning data to stderr */
#ifndef CASEMAP_DEBUG
# define CASEMAP_DEBUG 0
#endif
6487
struct mapping_buffer;
/* One chunk of case-mapped output; chunks are chained through +next+. */
typedef struct mapping_buffer {
    size_t capa;                 /* number of bytes allocated for space[] */
    size_t used;                 /* number of bytes of space[] holding mapped output */
    struct mapping_buffer *next; /* following chunk, or NULL for the last one */
    OnigUChar space[FLEX_ARY_LEN]; /* mapped bytes (flexible trailing array) */
} mapping_buffer;
6495
6496 static void
mapping_buffer_free(void * p)6497 mapping_buffer_free(void *p)
6498 {
6499 mapping_buffer *previous_buffer;
6500 mapping_buffer *current_buffer = p;
6501 while (current_buffer) {
6502 previous_buffer = current_buffer;
6503 current_buffer = current_buffer->next;
6504 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
6505 }
6506 }
6507
/*
 * Typed-data descriptor used to wrap a mapping_buffer chain in a Ruby
 * object.  The first function slot is left empty and the second is
 * mapping_buffer_free -- presumably the mark and free callbacks of
 * rb_data_type_t; confirm against its definition.
 */
static const rb_data_type_t mapping_buffer_type = {
    "mapping_buffer",
    {0, mapping_buffer_free,}
};
6512
6513 static VALUE
rb_str_casemap(VALUE source,OnigCaseFoldType * flags,rb_encoding * enc)6514 rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
6515 {
6516 VALUE target;
6517
6518 OnigUChar *source_current, *source_end;
6519 int target_length = 0;
6520 VALUE buffer_anchor;
6521 mapping_buffer *current_buffer = 0;
6522 mapping_buffer **pre_buffer;
6523 size_t buffer_count = 0;
6524 int buffer_length_or_invalid;
6525
6526 if (RSTRING_LEN(source) == 0) return rb_str_dup(source);
6527
6528 source_current = (OnigUChar*)RSTRING_PTR(source);
6529 source_end = (OnigUChar*)RSTRING_END(source);
6530
6531 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
6532 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
6533 while (source_current < source_end) {
6534 /* increase multiplier using buffer count to converge quickly */
6535 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
6536 if (CASEMAP_DEBUG) {
6537 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
6538 }
6539 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
6540 *pre_buffer = current_buffer;
6541 pre_buffer = ¤t_buffer->next;
6542 current_buffer->next = NULL;
6543 current_buffer->capa = capa;
6544 buffer_length_or_invalid = enc->case_map(flags,
6545 (const OnigUChar**)&source_current, source_end,
6546 current_buffer->space,
6547 current_buffer->space+current_buffer->capa,
6548 enc);
6549 if (buffer_length_or_invalid < 0) {
6550 current_buffer = DATA_PTR(buffer_anchor);
6551 DATA_PTR(buffer_anchor) = 0;
6552 mapping_buffer_free(current_buffer);
6553 rb_raise(rb_eArgError, "input string invalid");
6554 }
6555 target_length += current_buffer->used = buffer_length_or_invalid;
6556 }
6557 if (CASEMAP_DEBUG) {
6558 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
6559 }
6560
6561 if (buffer_count==1) {
6562 target = rb_str_new_with_class(source, (const char*)current_buffer->space, target_length);
6563 }
6564 else {
6565 char *target_current;
6566
6567 target = rb_str_new_with_class(source, 0, target_length);
6568 target_current = RSTRING_PTR(target);
6569 current_buffer = DATA_PTR(buffer_anchor);
6570 while (current_buffer) {
6571 memcpy(target_current, current_buffer->space, current_buffer->used);
6572 target_current += current_buffer->used;
6573 current_buffer = current_buffer->next;
6574 }
6575 }
6576 current_buffer = DATA_PTR(buffer_anchor);
6577 DATA_PTR(buffer_anchor) = 0;
6578 mapping_buffer_free(current_buffer);
6579
6580 /* TODO: check about string terminator character */
6581 OBJ_INFECT_RAW(target, source);
6582 str_enc_copy(target, source);
6583 /*ENC_CODERANGE_SET(mapped, cr);*/
6584
6585 return target;
6586 }
6587
6588 static void
rb_str_ascii_casemap(VALUE source,OnigCaseFoldType * flags,rb_encoding * enc)6589 rb_str_ascii_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
6590 {
6591 OnigUChar *source_current, *source_end;
6592 long old_length = RSTRING_LEN(source);
6593 int length_or_invalid;
6594
6595 if (old_length == 0) return;
6596
6597 source_current = (OnigUChar*)RSTRING_PTR(source);
6598 source_end = (OnigUChar*)RSTRING_END(source);
6599
6600 length_or_invalid = onigenc_ascii_only_case_map(flags,
6601 (const OnigUChar**)&source_current, source_end,
6602 source_current, source_end, enc);
6603 if (length_or_invalid < 0)
6604 rb_raise(rb_eArgError, "input string invalid");
6605 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
6606 fprintf(stderr, "problem with rb_str_ascii_casemap"
6607 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
6608 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
6609 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
6610 }
6611 }
6612
6613 /*
6614 * call-seq:
6615 * str.upcase! -> str or nil
6616 * str.upcase!([options]) -> str or nil
6617 *
6618 * Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
6619 * were made.
6620 *
6621 * See String#downcase for meaning of +options+ and use with different encodings.
6622 */
6623
6624 static VALUE
rb_str_upcase_bang(int argc,VALUE * argv,VALUE str)6625 rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
6626 {
6627 rb_encoding *enc;
6628 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
6629
6630 flags = check_case_options(argc, argv, flags);
6631 str_modify_keep_cr(str);
6632 enc = STR_ENC_GET(str);
6633 rb_str_check_dummy_enc(enc);
6634 if (((flags&ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc)==1))
6635 || (!(flags&ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str)==ENC_CODERANGE_7BIT)) {
6636 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
6637
6638 while (s < send) {
6639 unsigned int c = *(unsigned char*)s;
6640
6641 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
6642 *s = 'A' + (c - 'a');
6643 flags |= ONIGENC_CASE_MODIFIED;
6644 }
6645 s++;
6646 }
6647 }
6648 else if (flags&ONIGENC_CASE_ASCII_ONLY)
6649 rb_str_ascii_casemap(str, &flags, enc);
6650 else
6651 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
6652
6653 if (ONIGENC_CASE_MODIFIED&flags) return str;
6654 return Qnil;
6655 }
6656
6657
6658 /*
6659 * call-seq:
6660 * str.upcase -> new_str
6661 * str.upcase([options]) -> new_str
6662 *
6663 * Returns a copy of <i>str</i> with all lowercase letters replaced with their
6664 * uppercase counterparts.
6665 *
6666 * See String#downcase for meaning of +options+ and use with different encodings.
6667 *
6668 * "hEllO".upcase #=> "HELLO"
6669 */
6670
6671 static VALUE
rb_str_upcase(int argc,VALUE * argv,VALUE str)6672 rb_str_upcase(int argc, VALUE *argv, VALUE str)
6673 {
6674 str = rb_str_dup(str);
6675 rb_str_upcase_bang(argc, argv, str);
6676 return str;
6677 }
6678
6679 /*
6680 * call-seq:
6681 * str.downcase! -> str or nil
6682 * str.downcase!([options]) -> str or nil
6683 *
6684 * Downcases the contents of <i>str</i>, returning <code>nil</code> if no
6685 * changes were made.
6686 *
6687 * See String#downcase for meaning of +options+ and use with different encodings.
6688 */
6689
6690 static VALUE
rb_str_downcase_bang(int argc,VALUE * argv,VALUE str)6691 rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
6692 {
6693 rb_encoding *enc;
6694 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
6695
6696 flags = check_case_options(argc, argv, flags);
6697 str_modify_keep_cr(str);
6698 enc = STR_ENC_GET(str);
6699 rb_str_check_dummy_enc(enc);
6700 if (((flags&ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc)==1))
6701 || (!(flags&ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str)==ENC_CODERANGE_7BIT)) {
6702 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
6703
6704 while (s < send) {
6705 unsigned int c = *(unsigned char*)s;
6706
6707 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
6708 *s = 'a' + (c - 'A');
6709 flags |= ONIGENC_CASE_MODIFIED;
6710 }
6711 s++;
6712 }
6713 }
6714 else if (flags&ONIGENC_CASE_ASCII_ONLY)
6715 rb_str_ascii_casemap(str, &flags, enc);
6716 else
6717 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
6718
6719 if (ONIGENC_CASE_MODIFIED&flags) return str;
6720 return Qnil;
6721 }
6722
6723
6724 /*
6725 * call-seq:
6726 * str.downcase -> new_str
6727 * str.downcase([options]) -> new_str
6728 *
6729 * Returns a copy of <i>str</i> with all uppercase letters replaced with their
6730 * lowercase counterparts. Which letters exactly are replaced, and by which
6731 * other letters, depends on the presence or absence of options, and on the
6732 * +encoding+ of the string.
6733 *
6734 * The meaning of the +options+ is as follows:
6735 *
6736 * No option ::
6737 * Full Unicode case mapping, suitable for most languages
6738 * (see :turkic and :lithuanian options below for exceptions).
6739 * Context-dependent case mapping as described in Table 3-14 of the
6740 * Unicode standard is currently not supported.
6741 * :ascii ::
6742 * Only the ASCII region, i.e. the characters ``A'' to ``Z'' and
6743 * ``a'' to ``z'', are affected.
6744 * This option cannot be combined with any other option.
6745 * :turkic ::
6746 * Full Unicode case mapping, adapted for Turkic languages
6747 * (Turkish, Azerbaijani, ...). This means that upper case I is mapped to
6748 * lower case dotless i, and so on.
6749 * :lithuanian ::
6750 * Currently, just full Unicode case mapping. In the future, full Unicode
6751 * case mapping adapted for Lithuanian (keeping the dot on the lower case
6752 * i even if there is an accent on top).
6753 * :fold ::
6754 * Only available on +downcase+ and +downcase!+. Unicode case <b>folding</b>,
6755 * which is more far-reaching than Unicode case mapping.
6756 * This option currently cannot be combined with any other option
6757 * (i.e. there is currently no variant for turkic languages).
6758 *
6759 * Please note that several assumptions that are valid for ASCII-only case
6760 * conversions do not hold for more general case conversions. For example,
6761 * the length of the result may not be the same as the length of the input
6762 * (neither in characters nor in bytes), some roundtrip assumptions
6763 * (e.g. str.downcase == str.upcase.downcase) may not apply, and Unicode
6764 * normalization (i.e. String#unicode_normalize) is not necessarily maintained
6765 * by case mapping operations.
6766 *
6767 * Non-ASCII case mapping/folding is currently supported for UTF-8,
6768 * UTF-16BE/LE, UTF-32BE/LE, and ISO-8859-1~16 Strings/Symbols.
6769 * This support will be extended to other encodings.
6770 *
6771 * "hEllO".downcase #=> "hello"
6772 */
6773
6774 static VALUE
rb_str_downcase(int argc,VALUE * argv,VALUE str)6775 rb_str_downcase(int argc, VALUE *argv, VALUE str)
6776 {
6777 str = rb_str_dup(str);
6778 rb_str_downcase_bang(argc, argv, str);
6779 return str;
6780 }
6781
6782
6783 /*
6784 * call-seq:
6785 * str.capitalize! -> str or nil
6786 * str.capitalize!([options]) -> str or nil
6787 *
6788 * Modifies <i>str</i> by converting the first character to uppercase and the
6789 * remainder to lowercase. Returns <code>nil</code> if no changes are made.
6790 * There is an exception for modern Georgian (mkhedruli/MTAVRULI), where
6791 * the result is the same as for String#downcase, to avoid mixed case.
6792 *
6793 * See String#downcase for meaning of +options+ and use with different encodings.
6794 *
6795 * a = "hello"
6796 * a.capitalize! #=> "Hello"
6797 * a #=> "Hello"
6798 * a.capitalize! #=> nil
6799 */
6800
6801 static VALUE
rb_str_capitalize_bang(int argc,VALUE * argv,VALUE str)6802 rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
6803 {
6804 rb_encoding *enc;
6805 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
6806
6807 flags = check_case_options(argc, argv, flags);
6808 str_modify_keep_cr(str);
6809 enc = STR_ENC_GET(str);
6810 rb_str_check_dummy_enc(enc);
6811 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
6812 if (flags&ONIGENC_CASE_ASCII_ONLY)
6813 rb_str_ascii_casemap(str, &flags, enc);
6814 else
6815 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
6816
6817 if (ONIGENC_CASE_MODIFIED&flags) return str;
6818 return Qnil;
6819 }
6820
6821
6822 /*
6823 * call-seq:
6824 * str.capitalize -> new_str
6825 * str.capitalize([options]) -> new_str
6826 *
6827 * Returns a copy of <i>str</i> with the first character converted to uppercase
6828 * and the remainder to lowercase.
6829 *
6830 * See String#downcase for meaning of +options+ and use with different encodings.
6831 *
6832 * "hello".capitalize #=> "Hello"
6833 * "HELLO".capitalize #=> "Hello"
6834 * "123ABC".capitalize #=> "123abc"
6835 */
6836
6837 static VALUE
rb_str_capitalize(int argc,VALUE * argv,VALUE str)6838 rb_str_capitalize(int argc, VALUE *argv, VALUE str)
6839 {
6840 str = rb_str_dup(str);
6841 rb_str_capitalize_bang(argc, argv, str);
6842 return str;
6843 }
6844
6845
6846 /*
6847 * call-seq:
6848 * str.swapcase! -> str or nil
6849 * str.swapcase!([options]) -> str or nil
6850 *
6851 * Equivalent to <code>String#swapcase</code>, but modifies the receiver in
6852 * place, returning <i>str</i>, or <code>nil</code> if no changes were made.
6853 *
6854 * See String#downcase for meaning of +options+ and use with different encodings.
6855 */
6856
6857 static VALUE
rb_str_swapcase_bang(int argc,VALUE * argv,VALUE str)6858 rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
6859 {
6860 rb_encoding *enc;
6861 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
6862
6863 flags = check_case_options(argc, argv, flags);
6864 str_modify_keep_cr(str);
6865 enc = STR_ENC_GET(str);
6866 rb_str_check_dummy_enc(enc);
6867 if (flags&ONIGENC_CASE_ASCII_ONLY)
6868 rb_str_ascii_casemap(str, &flags, enc);
6869 else
6870 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
6871
6872 if (ONIGENC_CASE_MODIFIED&flags) return str;
6873 return Qnil;
6874 }
6875
6876
6877 /*
6878 * call-seq:
6879 * str.swapcase -> new_str
6880 * str.swapcase([options]) -> new_str
6881 *
6882 * Returns a copy of <i>str</i> with uppercase alphabetic characters converted
6883 * to lowercase and lowercase characters converted to uppercase.
6884 *
6885 * See String#downcase for meaning of +options+ and use with different encodings.
6886 *
6887 * "Hello".swapcase #=> "hELLO"
6888 * "cYbEr_PuNk11".swapcase #=> "CyBeR_pUnK11"
6889 */
6890
6891 static VALUE
rb_str_swapcase(int argc,VALUE * argv,VALUE str)6892 rb_str_swapcase(int argc, VALUE *argv, VALUE str)
6893 {
6894 str = rb_str_dup(str);
6895 rb_str_swapcase_bang(argc, argv, str);
6896 return str;
6897 }
6898
typedef unsigned char *USTR;

/*
 * Cursor over a tr-style character specification (as consumed by trnext()).
 */
struct tr {
    int gen;            /* non-zero while a "c1-c2" range is being expanded */
    unsigned int now;   /* code point most recently produced */
    unsigned int max;   /* upper bound of the range being expanded */
    char *p;            /* next unread byte of the specification */
    char *pend;         /* one past the last byte of the specification */
};
6906
6907 static unsigned int
trnext(struct tr * t,rb_encoding * enc)6908 trnext(struct tr *t, rb_encoding *enc)
6909 {
6910 int n;
6911
6912 for (;;) {
6913 if (!t->gen) {
6914 nextpart:
6915 if (t->p == t->pend) return -1;
6916 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
6917 t->p += n;
6918 }
6919 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
6920 t->p += n;
6921 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
6922 t->p += n;
6923 if (t->p < t->pend) {
6924 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
6925 t->p += n;
6926 if (t->now > c) {
6927 if (t->now < 0x80 && c < 0x80) {
6928 rb_raise(rb_eArgError,
6929 "invalid range \"%c-%c\" in string transliteration",
6930 t->now, c);
6931 }
6932 else {
6933 rb_raise(rb_eArgError, "invalid range in string transliteration");
6934 }
6935 continue; /* not reached */
6936 }
6937 t->gen = 1;
6938 t->max = c;
6939 }
6940 }
6941 return t->now;
6942 }
6943 else {
6944 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
6945 if (t->now == t->max) {
6946 t->gen = 0;
6947 goto nextpart;
6948 }
6949 }
6950 if (t->now < t->max) {
6951 return t->now;
6952 }
6953 else {
6954 t->gen = 0;
6955 return t->max;
6956 }
6957 }
6958 }
6959 }
6960
6961 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
6962
6963 static VALUE
tr_trans(VALUE str,VALUE src,VALUE repl,int sflag)6964 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
6965 {
6966 const unsigned int errc = -1;
6967 unsigned int trans[256];
6968 rb_encoding *enc, *e1, *e2;
6969 struct tr trsrc, trrepl;
6970 int cflag = 0;
6971 unsigned int c, c0, last = 0;
6972 int modify = 0, i, l;
6973 unsigned char *s, *send;
6974 VALUE hash = 0;
6975 int singlebyte = single_byte_optimizable(str);
6976 int termlen;
6977 int cr;
6978
6979 #define CHECK_IF_ASCII(c) \
6980 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
6981 (cr = ENC_CODERANGE_VALID) : 0)
6982
6983 StringValue(src);
6984 StringValue(repl);
6985 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
6986 if (RSTRING_LEN(repl) == 0) {
6987 return rb_str_delete_bang(1, &src, str);
6988 }
6989
6990 cr = ENC_CODERANGE(str);
6991 e1 = rb_enc_check(str, src);
6992 e2 = rb_enc_check(str, repl);
6993 if (e1 == e2) {
6994 enc = e1;
6995 }
6996 else {
6997 enc = rb_enc_check(src, repl);
6998 }
6999 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
7000 if (RSTRING_LEN(src) > 1 &&
7001 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
7002 trsrc.p + l < trsrc.pend) {
7003 cflag = 1;
7004 trsrc.p += l;
7005 }
7006 trrepl.p = RSTRING_PTR(repl);
7007 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
7008 trsrc.gen = trrepl.gen = 0;
7009 trsrc.now = trrepl.now = 0;
7010 trsrc.max = trrepl.max = 0;
7011
7012 if (cflag) {
7013 for (i=0; i<256; i++) {
7014 trans[i] = 1;
7015 }
7016 while ((c = trnext(&trsrc, enc)) != errc) {
7017 if (c < 256) {
7018 trans[c] = errc;
7019 }
7020 else {
7021 if (!hash) hash = rb_hash_new();
7022 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
7023 }
7024 }
7025 while ((c = trnext(&trrepl, enc)) != errc)
7026 /* retrieve last replacer */;
7027 last = trrepl.now;
7028 for (i=0; i<256; i++) {
7029 if (trans[i] != errc) {
7030 trans[i] = last;
7031 }
7032 }
7033 }
7034 else {
7035 unsigned int r;
7036
7037 for (i=0; i<256; i++) {
7038 trans[i] = errc;
7039 }
7040 while ((c = trnext(&trsrc, enc)) != errc) {
7041 r = trnext(&trrepl, enc);
7042 if (r == errc) r = trrepl.now;
7043 if (c < 256) {
7044 trans[c] = r;
7045 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
7046 }
7047 else {
7048 if (!hash) hash = rb_hash_new();
7049 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
7050 }
7051 }
7052 }
7053
7054 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
7055 cr = ENC_CODERANGE_7BIT;
7056 str_modify_keep_cr(str);
7057 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
7058 termlen = rb_enc_mbminlen(enc);
7059 if (sflag) {
7060 int clen, tlen;
7061 long offset, max = RSTRING_LEN(str);
7062 unsigned int save = -1;
7063 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7064
7065 while (s < send) {
7066 int may_modify = 0;
7067
7068 c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7069 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7070
7071 s += clen;
7072 if (c < 256) {
7073 c = trans[c];
7074 }
7075 else if (hash) {
7076 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
7077 if (NIL_P(tmp)) {
7078 if (cflag) c = last;
7079 else c = errc;
7080 }
7081 else if (cflag) c = errc;
7082 else c = NUM2INT(tmp);
7083 }
7084 else {
7085 c = errc;
7086 }
7087 if (c != (unsigned int)-1) {
7088 if (save == c) {
7089 CHECK_IF_ASCII(c);
7090 continue;
7091 }
7092 save = c;
7093 tlen = rb_enc_codelen(c, enc);
7094 modify = 1;
7095 }
7096 else {
7097 save = -1;
7098 c = c0;
7099 if (enc != e1) may_modify = 1;
7100 }
7101 if ((offset = t - buf) + tlen > max) {
7102 size_t MAYBE_UNUSED(old) = max + termlen;
7103 max = offset + tlen + (send - s);
7104 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
7105 t = buf + offset;
7106 }
7107 rb_enc_mbcput(c, t, enc);
7108 if (may_modify && memcmp(s, t, tlen) != 0) {
7109 modify = 1;
7110 }
7111 CHECK_IF_ASCII(c);
7112 t += tlen;
7113 }
7114 if (!STR_EMBED_P(str)) {
7115 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
7116 }
7117 TERM_FILL((char *)t, termlen);
7118 RSTRING(str)->as.heap.ptr = (char *)buf;
7119 RSTRING(str)->as.heap.len = t - buf;
7120 STR_SET_NOEMBED(str);
7121 RSTRING(str)->as.heap.aux.capa = max;
7122 }
7123 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
7124 while (s < send) {
7125 c = (unsigned char)*s;
7126 if (trans[c] != errc) {
7127 if (!cflag) {
7128 c = trans[c];
7129 *s = c;
7130 modify = 1;
7131 }
7132 else {
7133 *s = last;
7134 modify = 1;
7135 }
7136 }
7137 CHECK_IF_ASCII(c);
7138 s++;
7139 }
7140 }
7141 else {
7142 int clen, tlen;
7143 long offset, max = (long)((send - s) * 1.2);
7144 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7145
7146 while (s < send) {
7147 int may_modify = 0;
7148 c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7149 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7150
7151 if (c < 256) {
7152 c = trans[c];
7153 }
7154 else if (hash) {
7155 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
7156 if (NIL_P(tmp)) {
7157 if (cflag) c = last;
7158 else c = errc;
7159 }
7160 else if (cflag) c = errc;
7161 else c = NUM2INT(tmp);
7162 }
7163 else {
7164 c = cflag ? last : errc;
7165 }
7166 if (c != errc) {
7167 tlen = rb_enc_codelen(c, enc);
7168 modify = 1;
7169 }
7170 else {
7171 c = c0;
7172 if (enc != e1) may_modify = 1;
7173 }
7174 if ((offset = t - buf) + tlen > max) {
7175 size_t MAYBE_UNUSED(old) = max + termlen;
7176 max = offset + tlen + (long)((send - s) * 1.2);
7177 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
7178 t = buf + offset;
7179 }
7180 if (s != t) {
7181 rb_enc_mbcput(c, t, enc);
7182 if (may_modify && memcmp(s, t, tlen) != 0) {
7183 modify = 1;
7184 }
7185 }
7186 CHECK_IF_ASCII(c);
7187 s += clen;
7188 t += tlen;
7189 }
7190 if (!STR_EMBED_P(str)) {
7191 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
7192 }
7193 TERM_FILL((char *)t, termlen);
7194 RSTRING(str)->as.heap.ptr = (char *)buf;
7195 RSTRING(str)->as.heap.len = t - buf;
7196 STR_SET_NOEMBED(str);
7197 RSTRING(str)->as.heap.aux.capa = max;
7198 }
7199
7200 if (modify) {
7201 if (cr != ENC_CODERANGE_BROKEN)
7202 ENC_CODERANGE_SET(str, cr);
7203 rb_enc_associate(str, enc);
7204 return str;
7205 }
7206 return Qnil;
7207 }
7208
7209
7210 /*
7211 * call-seq:
7212 * str.tr!(from_str, to_str) -> str or nil
7213 *
7214 * Translates <i>str</i> in place, using the same rules as
7215 * <code>String#tr</code>. Returns <i>str</i>, or <code>nil</code> if no
7216 * changes were made.
7217 */
7218
7219 static VALUE
rb_str_tr_bang(VALUE str,VALUE src,VALUE repl)7220 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
7221 {
7222 return tr_trans(str, src, repl, 0);
7223 }
7224
7225
7226 /*
7227 * call-seq:
7228 * str.tr(from_str, to_str) -> new_str
7229 *
7230 * Returns a copy of +str+ with the characters in +from_str+ replaced by the
7231 * corresponding characters in +to_str+. If +to_str+ is shorter than
7232 * +from_str+, it is padded with its last character in order to maintain the
7233 * correspondence.
7234 *
7235 * "hello".tr('el', 'ip') #=> "hippo"
7236 * "hello".tr('aeiou', '*') #=> "h*ll*"
7237 * "hello".tr('aeiou', 'AA*') #=> "hAll*"
7238 *
7239 * Both strings may use the <code>c1-c2</code> notation to denote ranges of
7240 * characters, and +from_str+ may start with a <code>^</code>, which denotes
7241 * all characters except those listed.
7242 *
7243 * "hello".tr('a-y', 'b-z') #=> "ifmmp"
7244 * "hello".tr('^aeiou', '*') #=> "*e**o"
7245 *
7246 * The backslash character <code>\\</code> can be used to escape
7247 * <code>^</code> or <code>-</code> and is otherwise ignored unless it
7248 * appears at the end of a range or the end of the +from_str+ or +to_str+:
7249 *
7250 * "hello^world".tr("\\^aeiou", "*") #=> "h*ll**w*rld"
7251 * "hello-world".tr("a\\-eo", "*") #=> "h*ll**w*rld"
7252 *
7253 * "hello\r\nworld".tr("\r", "") #=> "hello\nworld"
7254 * "hello\r\nworld".tr("\\r", "") #=> "hello\r\nwold"
7255 * "hello\r\nworld".tr("\\\r", "") #=> "hello\nworld"
7256 *
7257 * "X['\\b']".tr("X\\", "") #=> "['b']"
7258 * "X['\\b']".tr("X-\\]", "") #=> "'b'"
7259 */
7260
7261 static VALUE
rb_str_tr(VALUE str,VALUE src,VALUE repl)7262 rb_str_tr(VALUE str, VALUE src, VALUE repl)
7263 {
7264 str = rb_str_dup(str);
7265 tr_trans(str, src, repl, 0);
7266 return str;
7267 }
7268
7269 #define TR_TABLE_SIZE 257
7270 static void
tr_setup_table(VALUE str,char stable[TR_TABLE_SIZE],int first,VALUE * tablep,VALUE * ctablep,rb_encoding * enc)7271 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
7272 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
7273 {
7274 const unsigned int errc = -1;
7275 char buf[256];
7276 struct tr tr;
7277 unsigned int c;
7278 VALUE table = 0, ptable = 0;
7279 int i, l, cflag = 0;
7280
7281 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
7282 tr.gen = tr.now = tr.max = 0;
7283
7284 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
7285 cflag = 1;
7286 tr.p += l;
7287 }
7288 if (first) {
7289 for (i=0; i<256; i++) {
7290 stable[i] = 1;
7291 }
7292 stable[256] = cflag;
7293 }
7294 else if (stable[256] && !cflag) {
7295 stable[256] = 0;
7296 }
7297 for (i=0; i<256; i++) {
7298 buf[i] = cflag;
7299 }
7300
7301 while ((c = trnext(&tr, enc)) != errc) {
7302 if (c < 256) {
7303 buf[c & 0xff] = !cflag;
7304 }
7305 else {
7306 VALUE key = UINT2NUM(c);
7307
7308 if (!table && (first || *tablep || stable[256])) {
7309 if (cflag) {
7310 ptable = *ctablep;
7311 table = ptable ? ptable : rb_hash_new();
7312 *ctablep = table;
7313 }
7314 else {
7315 table = rb_hash_new();
7316 ptable = *tablep;
7317 *tablep = table;
7318 }
7319 }
7320 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
7321 rb_hash_aset(table, key, Qtrue);
7322 }
7323 }
7324 }
7325 for (i=0; i<256; i++) {
7326 stable[i] = stable[i] && buf[i];
7327 }
7328 if (!table && !cflag) {
7329 *tablep = 0;
7330 }
7331 }
7332
7333
7334 static int
tr_find(unsigned int c,const char table[TR_TABLE_SIZE],VALUE del,VALUE nodel)7335 tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
7336 {
7337 if (c < 256) {
7338 return table[c] != 0;
7339 }
7340 else {
7341 VALUE v = UINT2NUM(c);
7342
7343 if (del) {
7344 if (!NIL_P(rb_hash_lookup(del, v)) &&
7345 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
7346 return TRUE;
7347 }
7348 }
7349 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
7350 return FALSE;
7351 }
7352 return table[256] ? TRUE : FALSE;
7353 }
7354 }
7355
7356 /*
7357 * call-seq:
7358 * str.delete!([other_str]+) -> str or nil
7359 *
7360 * Performs a <code>delete</code> operation in place, returning <i>str</i>, or
7361 * <code>nil</code> if <i>str</i> was not modified.
7362 */
7363
7364 static VALUE
rb_str_delete_bang(int argc,VALUE * argv,VALUE str)7365 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
7366 {
7367 char squeez[TR_TABLE_SIZE];
7368 rb_encoding *enc = 0;
7369 char *s, *send, *t;
7370 VALUE del = 0, nodel = 0;
7371 int modify = 0;
7372 int i, ascompat, cr;
7373
7374 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7375 rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
7376 for (i=0; i<argc; i++) {
7377 VALUE s = argv[i];
7378
7379 StringValue(s);
7380 enc = rb_enc_check(str, s);
7381 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
7382 }
7383
7384 str_modify_keep_cr(str);
7385 ascompat = rb_enc_asciicompat(enc);
7386 s = t = RSTRING_PTR(str);
7387 send = RSTRING_END(str);
7388 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
7389 while (s < send) {
7390 unsigned int c;
7391 int clen;
7392
7393 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
7394 if (squeez[c]) {
7395 modify = 1;
7396 }
7397 else {
7398 if (t != s) *t = c;
7399 t++;
7400 }
7401 s++;
7402 }
7403 else {
7404 c = rb_enc_codepoint_len(s, send, &clen, enc);
7405
7406 if (tr_find(c, squeez, del, nodel)) {
7407 modify = 1;
7408 }
7409 else {
7410 if (t != s) rb_enc_mbcput(c, t, enc);
7411 t += clen;
7412 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
7413 }
7414 s += clen;
7415 }
7416 }
7417 TERM_FILL(t, TERM_LEN(str));
7418 STR_SET_LEN(str, t - RSTRING_PTR(str));
7419 ENC_CODERANGE_SET(str, cr);
7420
7421 if (modify) return str;
7422 return Qnil;
7423 }
7424
7425
7426 /*
7427 * call-seq:
7428 * str.delete([other_str]+) -> new_str
7429 *
7430 * Returns a copy of <i>str</i> with all characters in the intersection of its
7431 * arguments deleted. Uses the same rules for building the set of characters as
7432 * <code>String#count</code>.
7433 *
7434 * "hello".delete "l","lo" #=> "heo"
7435 * "hello".delete "lo" #=> "he"
7436 * "hello".delete "aeiou", "^e" #=> "hell"
7437 * "hello".delete "ej-m" #=> "ho"
7438 */
7439
7440 static VALUE
rb_str_delete(int argc,VALUE * argv,VALUE str)7441 rb_str_delete(int argc, VALUE *argv, VALUE str)
7442 {
7443 str = rb_str_dup(str);
7444 rb_str_delete_bang(argc, argv, str);
7445 return str;
7446 }
7447
7448
7449 /*
7450 * call-seq:
7451 * str.squeeze!([other_str]*) -> str or nil
7452 *
7453 * Squeezes <i>str</i> in place, returning either <i>str</i>, or
7454 * <code>nil</code> if no changes were made.
7455 */
7456
7457 static VALUE
rb_str_squeeze_bang(int argc,VALUE * argv,VALUE str)7458 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
7459 {
7460 char squeez[TR_TABLE_SIZE];
7461 rb_encoding *enc = 0;
7462 VALUE del = 0, nodel = 0;
7463 unsigned char *s, *send, *t;
7464 int i, modify = 0;
7465 int ascompat, singlebyte = single_byte_optimizable(str);
7466 unsigned int save;
7467
7468 if (argc == 0) {
7469 enc = STR_ENC_GET(str);
7470 }
7471 else {
7472 for (i=0; i<argc; i++) {
7473 VALUE s = argv[i];
7474
7475 StringValue(s);
7476 enc = rb_enc_check(str, s);
7477 if (singlebyte && !single_byte_optimizable(s))
7478 singlebyte = 0;
7479 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
7480 }
7481 }
7482
7483 str_modify_keep_cr(str);
7484 s = t = (unsigned char *)RSTRING_PTR(str);
7485 if (!s || RSTRING_LEN(str) == 0) return Qnil;
7486 send = (unsigned char *)RSTRING_END(str);
7487 save = -1;
7488 ascompat = rb_enc_asciicompat(enc);
7489
7490 if (singlebyte) {
7491 while (s < send) {
7492 unsigned int c = *s++;
7493 if (c != save || (argc > 0 && !squeez[c])) {
7494 *t++ = save = c;
7495 }
7496 }
7497 }
7498 else {
7499 while (s < send) {
7500 unsigned int c;
7501 int clen;
7502
7503 if (ascompat && (c = *s) < 0x80) {
7504 if (c != save || (argc > 0 && !squeez[c])) {
7505 *t++ = save = c;
7506 }
7507 s++;
7508 }
7509 else {
7510 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
7511
7512 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
7513 if (t != s) rb_enc_mbcput(c, t, enc);
7514 save = c;
7515 t += clen;
7516 }
7517 s += clen;
7518 }
7519 }
7520 }
7521
7522 TERM_FILL((char *)t, TERM_LEN(str));
7523 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
7524 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
7525 modify = 1;
7526 }
7527
7528 if (modify) return str;
7529 return Qnil;
7530 }
7531
7532
7533 /*
7534 * call-seq:
7535 * str.squeeze([other_str]*) -> new_str
7536 *
7537 * Builds a set of characters from the <i>other_str</i> parameter(s) using the
7538 * procedure described for <code>String#count</code>. Returns a new string
7539 * where runs of the same character that occur in this set are replaced by a
7540 * single character. If no arguments are given, all runs of identical
7541 * characters are replaced by a single character.
7542 *
7543 * "yellow moon".squeeze #=> "yelow mon"
7544 * " now is the".squeeze(" ") #=> " now is the"
7545 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
7546 */
7547
7548 static VALUE
rb_str_squeeze(int argc,VALUE * argv,VALUE str)7549 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
7550 {
7551 str = rb_str_dup(str);
7552 rb_str_squeeze_bang(argc, argv, str);
7553 return str;
7554 }
7555
7556
7557 /*
7558 * call-seq:
7559 * str.tr_s!(from_str, to_str) -> str or nil
7560 *
7561 * Performs <code>String#tr_s</code> processing on <i>str</i> in place,
7562 * returning <i>str</i>, or <code>nil</code> if no changes were made.
7563 */
7564
7565 static VALUE
rb_str_tr_s_bang(VALUE str,VALUE src,VALUE repl)7566 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
7567 {
7568 return tr_trans(str, src, repl, 1);
7569 }
7570
7571
7572 /*
7573 * call-seq:
7574 * str.tr_s(from_str, to_str) -> new_str
7575 *
7576 * Processes a copy of <i>str</i> as described under <code>String#tr</code>,
7577 * then removes duplicate characters in regions that were affected by the
7578 * translation.
7579 *
7580 * "hello".tr_s('l', 'r') #=> "hero"
7581 * "hello".tr_s('el', '*') #=> "h*o"
7582 * "hello".tr_s('el', 'hx') #=> "hhxo"
7583 */
7584
7585 static VALUE
rb_str_tr_s(VALUE str,VALUE src,VALUE repl)7586 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
7587 {
7588 str = rb_str_dup(str);
7589 tr_trans(str, src, repl, 1);
7590 return str;
7591 }
7592
7593
7594 /*
7595 * call-seq:
7596 * str.count([other_str]+) -> integer
7597 *
7598 * Each +other_str+ parameter defines a set of characters to count. The
7599 * intersection of these sets defines the characters to count in +str+. Any
7600 * +other_str+ that starts with a caret <code>^</code> is negated. The
7601 * sequence <code>c1-c2</code> means all characters between c1 and c2. The
7602 * backslash character <code>\\</code> can be used to escape <code>^</code> or
7603 * <code>-</code> and is otherwise ignored unless it appears at the end of a
7604 * sequence or the end of a +other_str+.
7605 *
7606 * a = "hello world"
7607 * a.count "lo" #=> 5
7608 * a.count "lo", "o" #=> 2
7609 * a.count "hello", "^l" #=> 4
7610 * a.count "ej-m" #=> 4
7611 *
7612 * "hello^world".count "\\^aeiou" #=> 4
7613 * "hello-world".count "a\\-eo" #=> 4
7614 *
7615 * c = "hello world\\r\\n"
7616 * c.count "\\" #=> 2
7617 * c.count "\\A" #=> 0
7618 * c.count "X-\\w" #=> 3
7619 */
7620
7621 static VALUE
rb_str_count(int argc,VALUE * argv,VALUE str)7622 rb_str_count(int argc, VALUE *argv, VALUE str)
7623 {
7624 char table[TR_TABLE_SIZE];
7625 rb_encoding *enc = 0;
7626 VALUE del = 0, nodel = 0, tstr;
7627 char *s, *send;
7628 int i;
7629 int ascompat;
7630
7631 rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
7632
7633 tstr = argv[0];
7634 StringValue(tstr);
7635 enc = rb_enc_check(str, tstr);
7636 if (argc == 1) {
7637 const char *ptstr;
7638 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
7639 (ptstr = RSTRING_PTR(tstr),
7640 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
7641 !is_broken_string(str)) {
7642 int n = 0;
7643 int clen;
7644 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
7645
7646 s = RSTRING_PTR(str);
7647 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
7648 send = RSTRING_END(str);
7649 while (s < send) {
7650 if (*(unsigned char*)s++ == c) n++;
7651 }
7652 return INT2NUM(n);
7653 }
7654 }
7655
7656 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
7657 for (i=1; i<argc; i++) {
7658 tstr = argv[i];
7659 StringValue(tstr);
7660 enc = rb_enc_check(str, tstr);
7661 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
7662 }
7663
7664 s = RSTRING_PTR(str);
7665 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
7666 send = RSTRING_END(str);
7667 ascompat = rb_enc_asciicompat(enc);
7668 i = 0;
7669 while (s < send) {
7670 unsigned int c;
7671
7672 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
7673 if (table[c]) {
7674 i++;
7675 }
7676 s++;
7677 }
7678 else {
7679 int clen;
7680 c = rb_enc_codepoint_len(s, send, &clen, enc);
7681 if (tr_find(c, table, del, nodel)) {
7682 i++;
7683 }
7684 s += clen;
7685 }
7686 }
7687
7688 return INT2NUM(i);
7689 }
7690
7691 static VALUE
rb_fs_check(VALUE val)7692 rb_fs_check(VALUE val)
7693 {
7694 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
7695 val = rb_check_string_type(val);
7696 if (NIL_P(val)) return 0;
7697 }
7698 return val;
7699 }
7700
/* Lookup table indexed by byte value: 1 for TAB, LF, VT, FF, CR and SPACE,
 * 0 for every other byte. */
static const char isspacetable[256] = {
    /* 0x00 */ 0,0,0,0,0,0,0,0, 0,1,1,1,1,1,0,0,
    /* 0x10 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x20 */ 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x30 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x40 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x50 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x60 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x70 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x80 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x90 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0xa0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0xb0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0xc0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0xd0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0xe0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0xf0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0
};

/* the cast keeps the index in 0..255 even where plain char is signed */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
7721
7722 static long
split_string(VALUE result,VALUE str,long beg,long len,long empty_count)7723 split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
7724 {
7725 if (empty_count >= 0 && len == 0) {
7726 return empty_count + 1;
7727 }
7728 if (empty_count > 0) {
7729 /* make different substrings */
7730 if (result) {
7731 do {
7732 rb_ary_push(result, str_new_empty(str));
7733 } while (--empty_count > 0);
7734 }
7735 else {
7736 do {
7737 rb_yield(str_new_empty(str));
7738 } while (--empty_count > 0);
7739 }
7740 }
7741 str = rb_str_subseq(str, beg, len);
7742 if (result) {
7743 rb_ary_push(result, str);
7744 }
7745 else {
7746 rb_yield(str);
7747 }
7748 return empty_count;
7749 }
7750
7751 /*
7752 * call-seq:
7753 * str.split(pattern=nil, [limit]) -> an_array
7754 * str.split(pattern=nil, [limit]) {|sub| block } -> str
7755 *
7756 * Divides <i>str</i> into substrings based on a delimiter, returning an array
7757 * of these substrings.
7758 *
7759 * If <i>pattern</i> is a <code>String</code>, then its contents are used as
7760 * the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
7761 * space, <i>str</i> is split on whitespace, with leading and trailing
7762 * whitespace and runs of contiguous whitespace characters ignored.
7763 *
7764 * If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
7765 * pattern matches. Whenever the pattern matches a zero-length string,
7766 * <i>str</i> is split into individual characters. If <i>pattern</i> contains
7767 * groups, the respective matches will be returned in the array as well.
7768 *
7769 * If <i>pattern</i> is <code>nil</code>, the value of <code>$;</code> is used.
7770 * If <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
7771 * split on whitespace as if ' ' were specified.
7772 *
7773 * If the <i>limit</i> parameter is omitted, trailing null fields are
7774 * suppressed. If <i>limit</i> is a positive number, at most that number
7775 * of split substrings will be returned (captured groups will be returned
7776 * as well, but are not counted towards the limit).
7777 * If <i>limit</i> is <code>1</code>, the entire
7778 * string is returned as the only entry in an array. If negative, there is no
7779 * limit to the number of fields returned, and trailing null fields are not
7780 * suppressed.
7781 *
7782 * When the input +str+ is empty an empty Array is returned as the string is
7783 * considered to have no fields to split.
7784 *
7785 * " now's the time ".split #=> ["now's", "the", "time"]
7786 * " now's the time ".split(' ') #=> ["now's", "the", "time"]
7787 * " now's the time".split(/ /) #=> ["", "now's", "", "the", "time"]
7788 * "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
7789 * "hello".split(//) #=> ["h", "e", "l", "l", "o"]
7790 * "hello".split(//, 3) #=> ["h", "e", "llo"]
7791 * "hi mom".split(%r{\s*}) #=> ["h", "i", "m", "o", "m"]
7792 *
7793 * "mellow yellow".split("ello") #=> ["m", "w y", "w"]
7794 * "1,2,,3,4,,".split(',') #=> ["1", "2", "", "3", "4"]
7795 * "1,2,,3,4,,".split(',', 4) #=> ["1", "2", "", "3,4,,"]
7796 * "1,2,,3,4,,".split(',', -4) #=> ["1", "2", "", "3", "4", "", ""]
7797 *
7798 * "1:2:3".split(/(:)()()/, 2) #=> ["1", ":", "", "", "2:3"]
7799 *
7800 * "".split(',', -1) #=> []
7801 *
7802 * If a block is given, invoke the block with each split substring.
7803 *
7804 */
7805
/*
 * Implementation of String#split.
 *
 * argv holds up to two optional arguments: the pattern and the limit.
 * Fields are collected into a new array, or yielded one by one when a block
 * is given (in which case +str+ itself is returned).
 *
 * Three strategies are used, chosen from the pattern:
 *   awk    - split on runs of whitespace (nil pattern with nil $;, or " ")
 *   string - literal substring search with rb_memsearch
 *   regexp - repeated rb_reg_search (also used for an empty string pattern)
 */
static VALUE
rb_str_split_m(int argc, VALUE *argv, VALUE str)
{
    rb_encoding *enc;
    VALUE spat;
    VALUE limit;
    enum {awk, string, regexp} split_type;
    /* i counts fields produced so far when a positive limit is active;
     * empty_count is the deferred-empty-field state threaded through
     * split_string() (-1: never defer). */
    long beg, end, i = 0, empty_count = -1;
    int lim = 0;
    VALUE result, tmp;

    /* result is tested with plain C truthiness below: Qfalse means "yield to
     * the block", Qnil is a placeholder replaced by a real array later. */
    result = rb_block_given_p() ? Qfalse : Qnil;
    if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
        lim = NUM2INT(limit);
        /* Non-positive limit: no field limit; lim keeps the sign for later. */
        if (lim <= 0) limit = Qnil;
        else if (lim == 1) {
            /* Limit 1: the whole string is the only field (none if empty). */
            if (RSTRING_LEN(str) == 0)
                return result ? rb_ary_new2(0) : str;
            tmp = rb_str_dup(str);
            if (!result) {
                rb_yield(tmp);
                return str;
            }
            return rb_ary_new3(1, tmp);
        }
        i = 1;
    }
    /* No limit given, or a limit of 0: enable deferral of empty fields so
     * that trailing empties are never emitted. */
    if (NIL_P(limit) && !lim) empty_count = 0;

    enc = STR_ENC_GET(str);
    split_type = regexp;
    if (!NIL_P(spat)) {
        spat = get_pat_quoted(spat, 0);
    }
    else if (NIL_P(spat = rb_fs)) {
        /* No pattern and $; is nil: whitespace splitting. */
        split_type = awk;
    }
    else if (!(spat = rb_fs_check(spat))) {
        rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
    }
    if (split_type != awk) {
        if (BUILTIN_TYPE(spat) == T_STRING) {
            rb_encoding *enc2 = STR_ENC_GET(spat);

            mustnot_broken(spat);
            split_type = string;
            if (RSTRING_LEN(spat) == 0) {
                /* Special case - split into chars */
                spat = rb_reg_regcomp(spat);
                split_type = regexp;
            }
            else if (rb_enc_asciicompat(enc2) == 1) {
                /* A single ASCII space selects whitespace splitting. */
                if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' ') {
                    split_type = awk;
                }
            }
            else {
                /* Same check for non-ASCII-compatible encodings: the pattern
                 * must be exactly one character and that character is ' '. */
                int l;
                if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
                    RSTRING_LEN(spat) == l) {
                    split_type = awk;
                }
            }
        }
    }

/* Emit str[beg, len] and update the deferred-empty state. */
#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))

    if (result) result = rb_ary_new();
    beg = 0;
    if (split_type == awk) {
        char *ptr = RSTRING_PTR(str);
        char *eptr = RSTRING_END(str);
        char *bptr = ptr;
        /* skip != 0 while scanning whitespace between fields. */
        int skip = 1;
        unsigned int c;

        end = beg;
        if (is_ascii_string(str)) {
            /* Byte-wise scan using the ASCII whitespace table. */
            while (ptr < eptr) {
                c = (unsigned char)*ptr++;
                if (skip) {
                    if (ascii_isspace(c)) {
                        beg = ptr - bptr;
                    }
                    else {
                        end = ptr - bptr;
                        skip = 0;
                        /* Limit reached: the rest becomes the last field. */
                        if (!NIL_P(limit) && lim <= i) break;
                    }
                }
                else if (ascii_isspace(c)) {
                    SPLIT_STR(beg, end-beg);
                    skip = 1;
                    beg = ptr - bptr;
                    if (!NIL_P(limit)) ++i;
                }
                else {
                    end = ptr - bptr;
                }
            }
        }
        else {
            /* Same state machine, advancing one encoded character at a time. */
            while (ptr < eptr) {
                int n;

                c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
                ptr += n;
                if (skip) {
                    if (rb_isspace(c)) {
                        beg = ptr - bptr;
                    }
                    else {
                        end = ptr - bptr;
                        skip = 0;
                        if (!NIL_P(limit) && lim <= i) break;
                    }
                }
                else if (rb_isspace(c)) {
                    SPLIT_STR(beg, end-beg);
                    skip = 1;
                    beg = ptr - bptr;
                    if (!NIL_P(limit)) ++i;
                }
                else {
                    end = ptr - bptr;
                }
            }
        }
    }
    else if (split_type == string) {
        char *ptr = RSTRING_PTR(str);
        char *str_start = ptr;
        char *substr_start = ptr;
        char *eptr = RSTRING_END(str);
        char *sptr = RSTRING_PTR(spat);
        long slen = RSTRING_LEN(spat);

        mustnot_broken(str);
        enc = rb_enc_check(str, spat);
        while (ptr < eptr &&
               (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
            /* Check we are at the start of a char */
            char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
            if (t != ptr + end) {
                /* Hit is inside a character: resume from the next char head. */
                ptr = t;
                continue;
            }
            SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
            ptr += end + slen;
            substr_start = ptr;
            if (!NIL_P(limit) && lim <= ++i) break;
        }
        beg = ptr - str_start;
    }
    else {
        char *ptr = RSTRING_PTR(str);
        long len = RSTRING_LEN(str);
        long start = beg;
        long idx;
        /* Set after a zero-length match at `start`, so the next zero-length
         * match emits a single character instead of looping forever. */
        int last_null = 0;
        struct re_registers *regs;
        VALUE match = 0;

        /* After each iteration the match is un-busied and set as the backref
         * again (presumably because emitting fields may run user code that
         * changes the backref -- TODO confirm). */
        for (; (end = rb_reg_search(spat, str, start, 0)) >= 0;
             (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
            match = rb_backref_get();
            if (!result) rb_match_busy(match);
            regs = RMATCH_REGS(match);
            if (start == end && BEG(0) == END(0)) {
                /* Zero-length match at the search start position. */
                if (!ptr) {
                    SPLIT_STR(0, 0);
                    break;
                }
                else if (last_null == 1) {
                    /* Emit exactly one character starting at beg. */
                    SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, ptr+len, enc));
                    beg = start;
                }
                else {
                    /* Step over one character (or past the end) and retry. */
                    if (start == len)
                        start++;
                    else
                        start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
                    last_null = 1;
                    continue;
                }
            }
            else {
                SPLIT_STR(beg, end-beg);
                beg = start = END(0);
            }
            last_null = 0;

            /* Capture groups that participated in the match are emitted too. */
            for (idx=1; idx < regs->num_regs; idx++) {
                if (BEG(idx) == -1) continue;
                SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
            }
            if (!NIL_P(limit) && lim <= ++i) break;
        }
        if (match) rb_match_unbusy(match);
    }
    /* Emit the remainder: always with a positive or negative limit, otherwise
     * only when something is left after the last separator. */
    if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
        SPLIT_STR(beg, RSTRING_LEN(str)-beg);
    }

    return result ? result : str;
}
8013
8014 VALUE
rb_str_split(VALUE str,const char * sep0)8015 rb_str_split(VALUE str, const char *sep0)
8016 {
8017 VALUE sep;
8018
8019 StringValue(str);
8020 sep = rb_str_new_cstr(sep0);
8021 return rb_str_split_m(1, &sep, str);
8022 }
8023
/*
 * Decide whether an array-returning enumeration method (String#lines,
 * #bytes, ...) should build an array.
 *
 * Returns 1 when no block is given.  When a block is given, the result
 * depends on STRING_ENUMERATORS_WANTARRAY: if set, a "block not used"
 * warning is issued and 1 is returned; otherwise a deprecation warning
 * naming +method+ is issued and 0 is returned (the block will be called).
 */
static int
enumerator_wantarray(const char *method)
{
    if (!rb_block_given_p()) {
        return 1;
    }
#if STRING_ENUMERATORS_WANTARRAY
    rb_warn("given block not used");
    return 1;
#else
    rb_warning("passing a block to String#%s is deprecated", method);
    return 0;
#endif
}

/* A new array with capacity +size+ when an array is wanted, 0 otherwise. */
#define WANTARRAY(m, size) \
    (enumerator_wantarray(m) ? rb_ary_new_capa(size) : 0)
8040
8041 static inline int
enumerator_element(VALUE ary,VALUE e)8042 enumerator_element(VALUE ary, VALUE e)
8043 {
8044 if (ary) {
8045 rb_ary_push(ary, e);
8046 return 0;
8047 }
8048 else {
8049 rb_yield(e);
8050 return 1;
8051 }
8052 }
8053
8054 #define ENUM_ELEM(ary, e) enumerator_element(ary, e)
8055
8056 static const char *
chomp_newline(const char * p,const char * e,rb_encoding * enc)8057 chomp_newline(const char *p, const char *e, rb_encoding *enc)
8058 {
8059 const char *prev = rb_enc_prev_char(p, e, e, enc);
8060 if (rb_enc_is_newline(prev, e, enc)) {
8061 e = prev;
8062 prev = rb_enc_prev_char(p, e, e, enc);
8063 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
8064 e = prev;
8065 }
8066 return e;
8067 }
8068
/*
 * Shared implementation of String#each_line and String#lines.
 *
 * argv holds an optional record separator plus an optional keyword hash
 * (only "chomp" is looked up).  Each line is pushed onto +ary+ when it is
 * non-zero, otherwise yielded to the block.  Returns +ary+ when collecting,
 * or the original receiver when yielding.
 */
static VALUE
rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
{
    rb_encoding *enc;
    VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
    const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
    long pos, len, rslen;
    /* Set when the separator is a single newline character. */
    int rsnewline = 0;

    /* No positional argument: fall back to rb_rs. */
    if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
        rs = rb_rs;
    if (!NIL_P(opts)) {
        static ID keywords[1];
        if (!keywords[0]) {
            keywords[0] = rb_intern_const("chomp");
        }
        rb_get_kwargs(opts, keywords, 0, 1, &chomp);
        /* From here on chomp holds a plain C 0/1, not a Ruby object. */
        chomp = (chomp != Qundef && RTEST(chomp));
    }

    /* nil separator: the whole string is the single "line". */
    if (NIL_P(rs)) {
        if (!ENUM_ELEM(ary, str)) {
            return ary;
        }
        else {
            return orig;
        }
    }

    if (!RSTRING_LEN(str)) goto end;
    /* Work on a frozen copy; ptr/len are remembered for str_mod_check(). */
    str = rb_str_new_frozen(str);
    ptr = subptr = RSTRING_PTR(str);
    pend = RSTRING_END(str);
    len = RSTRING_LEN(str);
    StringValue(rs);
    rslen = RSTRING_LEN(rs);

    if (rs == rb_default_rs)
        enc = rb_enc_get(str);
    else
        enc = rb_enc_check(str, rs);

    if (rslen == 0) {
        /* paragraph mode */
        int n;
        /* End of the first newline run after the current paragraph text. */
        const char *eol = NULL;
        subend = subptr;
        while (subend < pend) {
            do {
                /* n is the width of a leading '\r' (0 if none), so "\r\n"
                 * is examined as one newline unit. */
                if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
                    n = 0;
                rslen = n + rb_enc_mbclen(subend + n, pend, enc);
                if (rb_enc_is_newline(subend + n, pend, enc)) {
                    /* A second consecutive newline ends the paragraph;
                     * rslen keeps this newline's width on break. */
                    if (eol == subend) break;
                    subend += rslen;
                    if (subptr) eol = subend;
                }
                else {
                    if (!subptr) subptr = subend;
                    subend += rslen;
                }
                rslen = 0;
            } while (subend < pend);
            if (!subptr) break;
            line = rb_str_subseq(str, subptr - ptr,
                                 subend - subptr + (chomp ? 0 : rslen));
            /* Only a yield can run user code, so only then re-check str. */
            if (ENUM_ELEM(ary, line)) {
                str_mod_check(str, ptr, len);
            }
            subptr = eol = NULL;
        }
        goto end;
    }
    else {
        rsptr = RSTRING_PTR(rs);
        if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
            rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
            rsnewline = 1;
        }
    }

    /* Default separator with a non-ASCII-compatible string: transcode the
     * separator into the string's encoding before searching. */
    if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
        rs = rb_str_new(rsptr, rslen);
        rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
        rsptr = RSTRING_PTR(rs);
        rslen = RSTRING_LEN(rs);
    }

    while (subptr < pend) {
        pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
        if (pos < 0) break;
        hit = subptr + pos;
        /* Reject hits that start in the middle of a character. */
        adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
        if (hit != adjusted) {
            subptr = adjusted;
            continue;
        }
        /* hit now points just past the separator. */
        subend = hit += rslen;
        if (chomp) {
            if (rsnewline) {
                subend = chomp_newline(subptr, subend, enc);
            }
            else {
                subend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, subend - subptr);
        if (ENUM_ELEM(ary, line)) {
            str_mod_check(str, ptr, len);
        }
        subptr = hit;
    }

    /* Trailing text after the last separator. */
    if (subptr != pend) {
        if (chomp) {
            if (rsnewline) {
                pend = chomp_newline(subptr, pend, enc);
            }
            else if (pend - subptr >= rslen &&
                     memcmp(pend - rslen, rsptr, rslen) == 0) {
                pend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, pend - subptr);
        ENUM_ELEM(ary, line);
        RB_GC_GUARD(str);
    }

  end:
    if (ary)
        return ary;
    else
        return orig;
}
8203
8204 /*
8205 * call-seq:
8206 * str.each_line(separator=$/ [, getline_args]) {|substr| block } -> str
8207 * str.each_line(separator=$/ [, getline_args]) -> an_enumerator
8208 *
8209 * Splits <i>str</i> using the supplied parameter as the record
8210 * separator (<code>$/</code> by default), passing each substring in
8211 * turn to the supplied block. If a zero-length record separator is
8212 * supplied, the string is split into paragraphs delimited by
8213 * multiple successive newlines.
8214 *
8215 * See IO.readlines for details about getline_args.
8216 *
8217 * If no block is given, an enumerator is returned instead.
8218 *
8219 * print "Example one\n"
8220 * "hello\nworld".each_line {|s| p s}
8221 * print "Example two\n"
8222 * "hello\nworld".each_line('l') {|s| p s}
8223 * print "Example three\n"
8224 * "hello\n\n\nworld".each_line('') {|s| p s}
8225 *
8226 * <em>produces:</em>
8227 *
8228 * Example one
8229 * "hello\n"
8230 * "world"
8231 * Example two
8232 * "hel"
8233 * "l"
8234 * "o\nworl"
8235 * "d"
8236 * Example three
8237 * "hello\n\n"
8238 * "world"
8239 */
8240
/* String#each_line entry point: yields each line of +str+ to the block. */
static VALUE
rb_str_each_line(int argc, VALUE *argv, VALUE str)
{
    /* Enumerator path for the block-less call; no size function is supplied. */
    RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
    /* ary == 0 makes the enumeration yield instead of collecting. */
    return rb_str_enumerate_lines(argc, argv, str, 0);
}
8247
8248 /*
8249 * call-seq:
8250 * str.lines(separator=$/ [, getline_args]) -> an_array
8251 *
8252 * Returns an array of lines in <i>str</i> split using the supplied
8253 * record separator (<code>$/</code> by default). This is a
8254 * shorthand for <code>str.each_line(separator, getline_args).to_a</code>.
8255 *
8256 * See IO.readlines for details about getline_args.
8257 *
8258 * "hello\nworld\n".lines #=> ["hello\n", "world\n"]
8259 * "hello world".lines(' ') #=> ["hello ", " ", "world"]
8260 * "hello\nworld\n".lines(chomp: true) #=> ["hello", "world"]
8261 *
8262 * If a block is given, which is a deprecated form, works the same as
8263 * <code>each_line</code>.
8264 */
8265
/* String#lines entry point: collects the lines of +str+ into an array. */
static VALUE
rb_str_lines(int argc, VALUE *argv, VALUE str)
{
    /* ary is 0 when a (deprecated) block is given, so lines get yielded. */
    VALUE ary = WANTARRAY("lines", 0);
    return rb_str_enumerate_lines(argc, argv, str, ary);
}
8272
/* Size function for the each_byte enumerator: the byte length of +str+.
 * +args+ and +eobj+ are unused. */
static VALUE
rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
{
    return LONG2FIX(RSTRING_LEN(str));
}
8278
8279 static VALUE
rb_str_enumerate_bytes(VALUE str,VALUE ary)8280 rb_str_enumerate_bytes(VALUE str, VALUE ary)
8281 {
8282 long i;
8283
8284 for (i=0; i<RSTRING_LEN(str); i++) {
8285 ENUM_ELEM(ary, INT2FIX(RSTRING_PTR(str)[i] & 0xff));
8286 }
8287 if (ary)
8288 return ary;
8289 else
8290 return str;
8291 }
8292
8293 /*
8294 * call-seq:
8295 * str.each_byte {|integer| block } -> str
8296 * str.each_byte -> an_enumerator
8297 *
8298 * Passes each byte in <i>str</i> to the given block, or returns an
8299 * enumerator if no block is given.
8300 *
8301 * "hello".each_byte {|c| print c, ' ' }
8302 *
8303 * <em>produces:</em>
8304 *
8305 * 104 101 108 108 111
8306 */
8307
/* String#each_byte entry point: yields each byte of +str+ to the block. */
static VALUE
rb_str_each_byte(VALUE str)
{
    /* Enumerator path for the block-less call, sized by the byte length. */
    RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
    return rb_str_enumerate_bytes(str, 0);
}
8314
8315 /*
8316 * call-seq:
8317 * str.bytes -> an_array
8318 *
8319 * Returns an array of bytes in <i>str</i>. This is a shorthand for
8320 * <code>str.each_byte.to_a</code>.
8321 *
8322 * If a block is given, which is a deprecated form, works the same as
8323 * <code>each_byte</code>.
8324 */
8325
/* String#bytes entry point: collects the bytes of +str+ into an array. */
static VALUE
rb_str_bytes(VALUE str)
{
    /* Array capacity is the byte length; ary is 0 if a block was given. */
    VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
    return rb_str_enumerate_bytes(str, ary);
}
8332
/* Size function for the each_char (and each_codepoint) enumerators:
 * delegates to rb_str_length().  +args+ and +eobj+ are unused. */
static VALUE
rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
{
    return rb_str_length(str);
}
8338
8339 static VALUE
rb_str_enumerate_chars(VALUE str,VALUE ary)8340 rb_str_enumerate_chars(VALUE str, VALUE ary)
8341 {
8342 VALUE orig = str;
8343 long i, len, n;
8344 const char *ptr;
8345 rb_encoding *enc;
8346
8347 str = rb_str_new_frozen(str);
8348 ptr = RSTRING_PTR(str);
8349 len = RSTRING_LEN(str);
8350 enc = rb_enc_get(str);
8351
8352 if (ENC_CODERANGE_CLEAN_P(ENC_CODERANGE(str))) {
8353 for (i = 0; i < len; i += n) {
8354 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
8355 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
8356 }
8357 }
8358 else {
8359 for (i = 0; i < len; i += n) {
8360 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
8361 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
8362 }
8363 }
8364 RB_GC_GUARD(str);
8365 if (ary)
8366 return ary;
8367 else
8368 return orig;
8369 }
8370
8371 /*
8372 * call-seq:
8373 * str.each_char {|cstr| block } -> str
8374 * str.each_char -> an_enumerator
8375 *
8376 * Passes each character in <i>str</i> to the given block, or returns
8377 * an enumerator if no block is given.
8378 *
8379 * "hello".each_char {|c| print c, ' ' }
8380 *
8381 * <em>produces:</em>
8382 *
8383 * h e l l o
8384 */
8385
/* String#each_char entry point: yields each character of +str+ to the block. */
static VALUE
rb_str_each_char(VALUE str)
{
    /* Enumerator path for the block-less call, sized by rb_str_each_char_size. */
    RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
    return rb_str_enumerate_chars(str, 0);
}
8392
8393 /*
8394 * call-seq:
8395 * str.chars -> an_array
8396 *
8397 * Returns an array of characters in <i>str</i>. This is a shorthand
8398 * for <code>str.each_char.to_a</code>.
8399 *
8400 * If a block is given, which is a deprecated form, works the same as
8401 * <code>each_char</code>.
8402 */
8403
/* String#chars entry point: collects the characters of +str+ into an array. */
static VALUE
rb_str_chars(VALUE str)
{
    /* Capacity comes from rb_str_strlen(); ary is 0 if a block was given. */
    VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
    return rb_str_enumerate_chars(str, ary);
}
8410
8411 static VALUE
rb_str_enumerate_codepoints(VALUE str,VALUE ary)8412 rb_str_enumerate_codepoints(VALUE str, VALUE ary)
8413 {
8414 VALUE orig = str;
8415 int n;
8416 unsigned int c;
8417 const char *ptr, *end;
8418 rb_encoding *enc;
8419
8420 if (single_byte_optimizable(str))
8421 return rb_str_enumerate_bytes(str, ary);
8422
8423 str = rb_str_new_frozen(str);
8424 ptr = RSTRING_PTR(str);
8425 end = RSTRING_END(str);
8426 enc = STR_ENC_GET(str);
8427
8428 while (ptr < end) {
8429 c = rb_enc_codepoint_len(ptr, end, &n, enc);
8430 ENUM_ELEM(ary, UINT2NUM(c));
8431 ptr += n;
8432 }
8433 RB_GC_GUARD(str);
8434 if (ary)
8435 return ary;
8436 else
8437 return orig;
8438 }
8439
8440 /*
8441 * call-seq:
8442 * str.each_codepoint {|integer| block } -> str
8443 * str.each_codepoint -> an_enumerator
8444 *
 *  Passes the <code>Integer</code> ordinal of each character in <i>str</i>
 *  (also known as a <i>codepoint</i> when applied to Unicode strings) to the
 *  given block.  For encodings other than UTF-8/UTF-16(BE|LE)/UTF-32(BE|LE),
8448 * values are directly derived from the binary representation
8449 * of each character.
8450 *
8451 * If no block is given, an enumerator is returned instead.
8452 *
8453 * "hello\u0639".each_codepoint {|c| print c, ' ' }
8454 *
8455 * <em>produces:</em>
8456 *
8457 * 104 101 108 108 111 1593
8458 */
8459
/* String#each_codepoint entry point: yields each codepoint of +str+. */
static VALUE
rb_str_each_codepoint(VALUE str)
{
    /* Enumerator path for the block-less call; it reuses the each_char size
     * function. */
    RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
    return rb_str_enumerate_codepoints(str, 0);
}
8466
8467 /*
8468 * call-seq:
8469 * str.codepoints -> an_array
8470 *
8471 * Returns an array of the <code>Integer</code> ordinals of the
8472 * characters in <i>str</i>. This is a shorthand for
8473 * <code>str.each_codepoint.to_a</code>.
8474 *
8475 * If a block is given, which is a deprecated form, works the same as
8476 * <code>each_codepoint</code>.
8477 */
8478
/* String#codepoints entry point: collects the codepoints into an array. */
static VALUE
rb_str_codepoints(VALUE str)
{
    /* Capacity comes from rb_str_strlen(); ary is 0 if a block was given. */
    VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
    return rb_str_enumerate_codepoints(str, ary);
}
8485
8486 static regex_t *
get_reg_grapheme_cluster(rb_encoding * enc)8487 get_reg_grapheme_cluster(rb_encoding *enc)
8488 {
8489 int encidx = rb_enc_to_index(enc);
8490 regex_t *reg_grapheme_cluster = NULL;
8491 static regex_t *reg_grapheme_cluster_utf8 = NULL;
8492
8493 /* synchronize */
8494 if (encidx == rb_utf8_encindex() && reg_grapheme_cluster_utf8) {
8495 reg_grapheme_cluster = reg_grapheme_cluster_utf8;
8496 }
8497 if (!reg_grapheme_cluster) {
8498 const OnigUChar source_ascii[] = "\\X";
8499 OnigErrorInfo einfo;
8500 const OnigUChar *source = source_ascii;
8501 size_t source_len = sizeof(source_ascii) - 1;
8502 switch (encidx) {
8503 #define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
8504 #define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
8505 #define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
8506 #define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
8507 #define CASE_UTF(e) \
8508 case ENCINDEX_UTF_##e: { \
8509 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
8510 source = source_UTF_##e; \
8511 source_len = sizeof(source_UTF_##e); \
8512 break; \
8513 }
8514 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
8515 #undef CASE_UTF
8516 #undef CHARS_16BE
8517 #undef CHARS_16LE
8518 #undef CHARS_32BE
8519 #undef CHARS_32LE
8520 }
8521 int r = onig_new(®_grapheme_cluster, source, source + source_len,
8522 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
8523 if (r) {
8524 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
8525 onig_error_code_to_str(message, r, &einfo);
8526 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
8527 }
8528 if (encidx == rb_utf8_encindex()) {
8529 reg_grapheme_cluster_utf8 = reg_grapheme_cluster;
8530 }
8531 }
8532 return reg_grapheme_cluster;
8533 }
8534
8535 static VALUE
rb_str_each_grapheme_cluster_size(VALUE str,VALUE args,VALUE eobj)8536 rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
8537 {
8538 size_t grapheme_cluster_count = 0;
8539 regex_t *reg_grapheme_cluster = NULL;
8540 rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str));
8541 const char *ptr, *end;
8542
8543 if (!rb_enc_unicode_p(enc)) {
8544 return rb_str_length(str);
8545 }
8546
8547 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
8548 ptr = RSTRING_PTR(str);
8549 end = RSTRING_END(str);
8550
8551 while (ptr < end) {
8552 OnigPosition len = onig_match(reg_grapheme_cluster,
8553 (const OnigUChar *)ptr, (const OnigUChar *)end,
8554 (const OnigUChar *)ptr, NULL, 0);
8555 if (len <= 0) break;
8556 grapheme_cluster_count++;
8557 ptr += len;
8558 }
8559
8560 return SIZET2NUM(grapheme_cluster_count);
8561 }
8562
8563 static VALUE
rb_str_enumerate_grapheme_clusters(VALUE str,VALUE ary)8564 rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
8565 {
8566 VALUE orig = str;
8567 regex_t *reg_grapheme_cluster = NULL;
8568 rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str));
8569 const char *ptr0, *ptr, *end;
8570
8571 if (!rb_enc_unicode_p(enc)) {
8572 return rb_str_enumerate_chars(str, ary);
8573 }
8574
8575 if (!ary) str = rb_str_new_frozen(str);
8576 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
8577 ptr0 = ptr = RSTRING_PTR(str);
8578 end = RSTRING_END(str);
8579
8580 while (ptr < end) {
8581 OnigPosition len = onig_match(reg_grapheme_cluster,
8582 (const OnigUChar *)ptr, (const OnigUChar *)end,
8583 (const OnigUChar *)ptr, NULL, 0);
8584 if (len <= 0) break;
8585 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
8586 ptr += len;
8587 }
8588 RB_GC_GUARD(str);
8589 if (ary)
8590 return ary;
8591 else
8592 return orig;
8593 }
8594
8595 /*
8596 * call-seq:
8597 * str.each_grapheme_cluster {|cstr| block } -> str
8598 * str.each_grapheme_cluster -> an_enumerator
8599 *
8600 * Passes each grapheme cluster in <i>str</i> to the given block, or returns
8601 * an enumerator if no block is given.
8602 * Unlike String#each_char, this enumerates by grapheme clusters defined by
8603 * Unicode Standard Annex #29 http://unicode.org/reports/tr29/
8604 *
8605 * "a\u0300".each_char.to_a.size #=> 2
8606 * "a\u0300".each_grapheme_cluster.to_a.size #=> 1
8607 *
8608 */
8609
/* String#each_grapheme_cluster entry point: yields each grapheme cluster. */
static VALUE
rb_str_each_grapheme_cluster(VALUE str)
{
    /* Enumerator path for the block-less call, sized by counting clusters. */
    RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
    return rb_str_enumerate_grapheme_clusters(str, 0);
}
8616
8617 /*
8618 * call-seq:
8619 * str.grapheme_clusters -> an_array
8620 *
8621 * Returns an array of grapheme clusters in <i>str</i>. This is a shorthand
8622 * for <code>str.each_grapheme_cluster.to_a</code>.
8623 *
8624 * If a block is given, which is a deprecated form, works the same as
8625 * <code>each_grapheme_cluster</code>.
8626 */
8627
/* String#grapheme_clusters entry point: collects the clusters into an array. */
static VALUE
rb_str_grapheme_clusters(VALUE str)
{
    /* Capacity is the character count from rb_str_strlen(); ary is 0 if a
     * block was given. */
    VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
    return rb_str_enumerate_grapheme_clusters(str, ary);
}
8634
8635 static long
chopped_length(VALUE str)8636 chopped_length(VALUE str)
8637 {
8638 rb_encoding *enc = STR_ENC_GET(str);
8639 const char *p, *p2, *beg, *end;
8640
8641 beg = RSTRING_PTR(str);
8642 end = beg + RSTRING_LEN(str);
8643 if (beg > end) return 0;
8644 p = rb_enc_prev_char(beg, end, end, enc);
8645 if (!p) return 0;
8646 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
8647 p2 = rb_enc_prev_char(beg, p, end, enc);
8648 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
8649 }
8650 return p - beg;
8651 }
8652
8653 /*
8654 * call-seq:
8655 * str.chop! -> str or nil
8656 *
8657 * Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>,
8658 * or <code>nil</code> if <i>str</i> is the empty string. See also
8659 * <code>String#chomp!</code>.
8660 */
8661
8662 static VALUE
rb_str_chop_bang(VALUE str)8663 rb_str_chop_bang(VALUE str)
8664 {
8665 str_modify_keep_cr(str);
8666 if (RSTRING_LEN(str) > 0) {
8667 long len;
8668 len = chopped_length(str);
8669 STR_SET_LEN(str, len);
8670 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
8671 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
8672 ENC_CODERANGE_CLEAR(str);
8673 }
8674 return str;
8675 }
8676 return Qnil;
8677 }
8678
8679
8680 /*
8681 * call-seq:
8682 * str.chop -> new_str
8683 *
8684 * Returns a new <code>String</code> with the last character removed. If the
8685 * string ends with <code>\r\n</code>, both characters are removed. Applying
8686 * <code>chop</code> to an empty string returns an empty
8687 * string. <code>String#chomp</code> is often a safer alternative, as it leaves
8688 * the string unchanged if it doesn't end in a record separator.
8689 *
8690 * "string\r\n".chop #=> "string"
8691 * "string\n\r".chop #=> "string\n"
8692 * "string\n".chop #=> "string"
8693 * "string".chop #=> "strin"
8694 * "x".chop.chop #=> ""
8695 */
8696
/* String#chop: returns the leading chopped_length(str) bytes of +str+ as a
 * new substring; the receiver is not modified. */
static VALUE
rb_str_chop(VALUE str)
{
    return rb_str_subseq(str, 0, chopped_length(str));
}
8702
8703
/*
 * Compute the byte length +str+ would have after chomping with record
 * separator +rs+ (which must be a String here; nil is handled by callers).
 *
 * Three modes:
 *   - default separator, or a separator that is a single newline character:
 *     "smart chomp", removing a trailing "\n", "\r\n" or "\r";
 *   - empty separator: remove all trailing newlines (each optionally
 *     preceded by "\r");
 *   - anything else: remove one trailing occurrence of +rs+ if present and
 *     aligned on a character boundary.
 *
 * Returns the unchanged length when nothing would be removed.
 */
static long
chompped_length(VALUE str, VALUE rs)
{
    rb_encoding *enc;
    int newline;
    char *pp, *e, *rsptr;
    long rslen;
    char *const p = RSTRING_PTR(str);
    long len = RSTRING_LEN(str);

    if (len == 0) return 0;
    e = p + len;
    if (rs == rb_default_rs) {
      smart_chomp:
        enc = rb_enc_get(str);
        if (rb_enc_mbminlen(enc) > 1) {
            /* Wide encodings: step back one minimum-width unit at a time. */
            pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
            if (rb_enc_is_newline(pp, e, enc)) {
                e = pp;
            }
            /* Then drop a "\r" at the (possibly moved) end. */
            pp = e - rb_enc_mbminlen(enc);
            if (pp >= p) {
                pp = rb_enc_left_char_head(p, pp, e, enc);
                if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
                    e = pp;
                }
            }
        }
        else {
            switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
              case '\n':
                if (--e > p && *(e-1) == '\r') {
                    --e;
                }
                break;
              case '\r':
                --e;
                break;
            }
        }
        return e - p;
    }

    enc = rb_enc_get(str);
    RSTRING_GETMEM(rs, rsptr, rslen);
    if (rslen == 0) {
        /* Empty separator: strip every trailing newline. */
        if (rb_enc_mbminlen(enc) > 1) {
            while (e > p) {
                pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
                if (!rb_enc_is_newline(pp, e, enc)) break;
                e = pp;
                pp -= rb_enc_mbminlen(enc);
                if (pp >= p) {
                    pp = rb_enc_left_char_head(p, pp, e, enc);
                    if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
                        e = pp;
                    }
                }
            }
        }
        else {
            while (e > p && *(e-1) == '\n') {
                --e;
                if (e > p && *(e-1) == '\r')
                    --e;
            }
        }
        return e - p;
    }
    /* Separator longer than the string cannot match. */
    if (rslen > len) return len;

    enc = rb_enc_get(rs);
    /* Last byte of the separator, used as a cheap pre-check below. */
    newline = rsptr[rslen-1];
    if (rslen == rb_enc_mbminlen(enc)) {
        /* A separator that is exactly one newline gets smart-chomp rules. */
        if (rslen == 1) {
            if (newline == '\n')
                goto smart_chomp;
        }
        else {
            if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
                goto smart_chomp;
        }
    }

    enc = rb_enc_check(str, rs);
    if (is_broken_string(rs)) {
        return len;
    }
    pp = e - rslen;
    if (p[len-1] == newline &&
        (rslen <= 1 ||
         memcmp(rsptr, pp, rslen) == 0)) {
        /* Only chomp when the match starts on a character boundary. */
        if (rb_enc_left_char_head(p, pp, e, enc) == pp)
            return len - rslen;
        RB_GC_GUARD(rs);
    }
    return len;
}
8802
8803 /*!
8804 * Returns the separator for arguments of rb_str_chomp.
8805 *
8806 * @return returns rb_ps ($/) as default, the default value of rb_ps ($/) is "\n".
8807 */
8808 static VALUE
chomp_rs(int argc,const VALUE * argv)8809 chomp_rs(int argc, const VALUE *argv)
8810 {
8811 rb_check_arity(argc, 0, 1);
8812 if (argc > 0) {
8813 VALUE rs = argv[0];
8814 if (!NIL_P(rs)) StringValue(rs);
8815 return rs;
8816 }
8817 else {
8818 return rb_rs;
8819 }
8820 }
8821
8822 VALUE
rb_str_chomp_string(VALUE str,VALUE rs)8823 rb_str_chomp_string(VALUE str, VALUE rs)
8824 {
8825 long olen = RSTRING_LEN(str);
8826 long len = chompped_length(str, rs);
8827 if (len >= olen) return Qnil;
8828 str_modify_keep_cr(str);
8829 STR_SET_LEN(str, len);
8830 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
8831 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
8832 ENC_CODERANGE_CLEAR(str);
8833 }
8834 return str;
8835 }
8836
8837 /*
8838 * call-seq:
8839 * str.chomp!(separator=$/) -> str or nil
8840 *
8841 * Modifies <i>str</i> in place as described for <code>String#chomp</code>,
8842 * returning <i>str</i>, or <code>nil</code> if no modifications were made.
8843 */
8844
8845 static VALUE
rb_str_chomp_bang(int argc,VALUE * argv,VALUE str)8846 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
8847 {
8848 VALUE rs;
8849 str_modifiable(str);
8850 if (RSTRING_LEN(str) == 0) return Qnil;
8851 rs = chomp_rs(argc, argv);
8852 if (NIL_P(rs)) return Qnil;
8853 return rb_str_chomp_string(str, rs);
8854 }
8855
8856
8857 /*
8858 * call-seq:
8859 * str.chomp(separator=$/) -> new_str
8860 *
8861 * Returns a new <code>String</code> with the given record separator removed
8862 * from the end of <i>str</i> (if present). If <code>$/</code> has not been
8863 * changed from the default Ruby record separator, then <code>chomp</code> also
8864 * removes carriage return characters (that is it will remove <code>\n</code>,
8865 * <code>\r</code>, and <code>\r\n</code>). If <code>$/</code> is an empty string,
8866 * it will remove all trailing newlines from the string.
8867 *
8868 * "hello".chomp #=> "hello"
8869 * "hello\n".chomp #=> "hello"
8870 * "hello\r\n".chomp #=> "hello"
8871 * "hello\n\r".chomp #=> "hello\n"
8872 * "hello\r".chomp #=> "hello"
8873 * "hello \n there".chomp #=> "hello \n there"
8874 * "hello".chomp("llo") #=> "he"
8875 * "hello\r\n\r\n".chomp('') #=> "hello"
8876 * "hello\r\n\r\r\n".chomp('') #=> "hello\r\n\r"
8877 */
8878
8879 static VALUE
rb_str_chomp(int argc,VALUE * argv,VALUE str)8880 rb_str_chomp(int argc, VALUE *argv, VALUE str)
8881 {
8882 VALUE rs = chomp_rs(argc, argv);
8883 if (NIL_P(rs)) return rb_str_dup(str);
8884 return rb_str_subseq(str, 0, chompped_length(str, rs));
8885 }
8886
8887 static long
lstrip_offset(VALUE str,const char * s,const char * e,rb_encoding * enc)8888 lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
8889 {
8890 const char *const start = s;
8891
8892 if (!s || s >= e) return 0;
8893
8894 /* remove spaces at head */
8895 if (single_byte_optimizable(str)) {
8896 while (s < e && ascii_isspace(*s)) s++;
8897 }
8898 else {
8899 while (s < e) {
8900 int n;
8901 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
8902
8903 if (!rb_isspace(cc)) break;
8904 s += n;
8905 }
8906 }
8907 return s - start;
8908 }
8909
8910 /*
8911 * call-seq:
8912 * str.lstrip! -> self or nil
8913 *
8914 * Removes leading whitespace from the receiver.
8915 * Returns the altered receiver, or +nil+ if no change was made.
8916 * See also String#rstrip! and String#strip!.
8917 *
8918 * Refer to String#strip for the definition of whitespace.
8919 *
8920 * " hello ".lstrip! #=> "hello "
8921 * "hello ".lstrip! #=> nil
8922 * "hello".lstrip! #=> nil
8923 */
8924
8925 static VALUE
rb_str_lstrip_bang(VALUE str)8926 rb_str_lstrip_bang(VALUE str)
8927 {
8928 rb_encoding *enc;
8929 char *start, *s;
8930 long olen, loffset;
8931
8932 str_modify_keep_cr(str);
8933 enc = STR_ENC_GET(str);
8934 RSTRING_GETMEM(str, start, olen);
8935 loffset = lstrip_offset(str, start, start+olen, enc);
8936 if (loffset > 0) {
8937 long len = olen-loffset;
8938 s = start + loffset;
8939 memmove(start, s, len);
8940 STR_SET_LEN(str, len);
8941 #if !SHARABLE_MIDDLE_SUBSTRING
8942 TERM_FILL(start+len, rb_enc_mbminlen(enc));
8943 #endif
8944 return str;
8945 }
8946 return Qnil;
8947 }
8948
8949
8950 /*
8951 * call-seq:
8952 * str.lstrip -> new_str
8953 *
8954 * Returns a copy of the receiver with leading whitespace removed.
8955 * See also String#rstrip and String#strip.
8956 *
8957 * Refer to String#strip for the definition of whitespace.
8958 *
8959 * " hello ".lstrip #=> "hello "
8960 * "hello".lstrip #=> "hello"
8961 */
8962
8963 static VALUE
rb_str_lstrip(VALUE str)8964 rb_str_lstrip(VALUE str)
8965 {
8966 char *start;
8967 long len, loffset;
8968 RSTRING_GETMEM(str, start, len);
8969 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
8970 if (loffset <= 0) return rb_str_dup(str);
8971 return rb_str_subseq(str, loffset, len - loffset);
8972 }
8973
8974 static long
rstrip_offset(VALUE str,const char * s,const char * e,rb_encoding * enc)8975 rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
8976 {
8977 const char *t;
8978
8979 rb_str_check_dummy_enc(enc);
8980 if (!s || s >= e) return 0;
8981 t = e;
8982
8983 /* remove trailing spaces or '\0's */
8984 if (single_byte_optimizable(str)) {
8985 unsigned char c;
8986 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
8987 }
8988 else {
8989 char *tp;
8990
8991 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
8992 unsigned int c = rb_enc_codepoint(tp, e, enc);
8993 if (c && !rb_isspace(c)) break;
8994 t = tp;
8995 }
8996 }
8997 return e - t;
8998 }
8999
9000 /*
9001 * call-seq:
9002 * str.rstrip! -> self or nil
9003 *
9004 * Removes trailing whitespace from the receiver.
9005 * Returns the altered receiver, or +nil+ if no change was made.
9006 * See also String#lstrip! and String#strip!.
9007 *
9008 * Refer to String#strip for the definition of whitespace.
9009 *
9010 * " hello ".rstrip! #=> " hello"
9011 * " hello".rstrip! #=> nil
9012 * "hello".rstrip! #=> nil
9013 */
9014
9015 static VALUE
rb_str_rstrip_bang(VALUE str)9016 rb_str_rstrip_bang(VALUE str)
9017 {
9018 rb_encoding *enc;
9019 char *start;
9020 long olen, roffset;
9021
9022 str_modify_keep_cr(str);
9023 enc = STR_ENC_GET(str);
9024 RSTRING_GETMEM(str, start, olen);
9025 roffset = rstrip_offset(str, start, start+olen, enc);
9026 if (roffset > 0) {
9027 long len = olen - roffset;
9028
9029 STR_SET_LEN(str, len);
9030 #if !SHARABLE_MIDDLE_SUBSTRING
9031 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9032 #endif
9033 return str;
9034 }
9035 return Qnil;
9036 }
9037
9038
9039 /*
9040 * call-seq:
9041 * str.rstrip -> new_str
9042 *
9043 * Returns a copy of the receiver with trailing whitespace removed.
9044 * See also String#lstrip and String#strip.
9045 *
9046 * Refer to String#strip for the definition of whitespace.
9047 *
9048 * " hello ".rstrip #=> " hello"
9049 * "hello".rstrip #=> "hello"
9050 */
9051
9052 static VALUE
rb_str_rstrip(VALUE str)9053 rb_str_rstrip(VALUE str)
9054 {
9055 rb_encoding *enc;
9056 char *start;
9057 long olen, roffset;
9058
9059 enc = STR_ENC_GET(str);
9060 RSTRING_GETMEM(str, start, olen);
9061 roffset = rstrip_offset(str, start, start+olen, enc);
9062
9063 if (roffset <= 0) return rb_str_dup(str);
9064 return rb_str_subseq(str, 0, olen-roffset);
9065 }
9066
9067
9068 /*
9069 * call-seq:
9070 * str.strip! -> self or nil
9071 *
9072 * Removes leading and trailing whitespace from the receiver.
9073 * Returns the altered receiver, or +nil+ if there was no change.
9074 *
9075 * Refer to String#strip for the definition of whitespace.
9076 *
9077 * " hello ".strip! #=> "hello"
9078 * "hello".strip! #=> nil
9079 */
9080
9081 static VALUE
rb_str_strip_bang(VALUE str)9082 rb_str_strip_bang(VALUE str)
9083 {
9084 char *start;
9085 long olen, loffset, roffset;
9086 rb_encoding *enc;
9087
9088 str_modify_keep_cr(str);
9089 enc = STR_ENC_GET(str);
9090 RSTRING_GETMEM(str, start, olen);
9091 loffset = lstrip_offset(str, start, start+olen, enc);
9092 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9093
9094 if (loffset > 0 || roffset > 0) {
9095 long len = olen-roffset;
9096 if (loffset > 0) {
9097 len -= loffset;
9098 memmove(start, start + loffset, len);
9099 }
9100 STR_SET_LEN(str, len);
9101 #if !SHARABLE_MIDDLE_SUBSTRING
9102 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9103 #endif
9104 return str;
9105 }
9106 return Qnil;
9107 }
9108
9109
9110 /*
9111 * call-seq:
9112 * str.strip -> new_str
9113 *
9114 * Returns a copy of the receiver with leading and trailing whitespace removed.
9115 *
9116 * Whitespace is defined as any of the following characters:
9117 * null, horizontal tab, line feed, vertical tab, form feed, carriage return, space.
9118 *
9119 * " hello ".strip #=> "hello"
9120 * "\tgoodbye\r\n".strip #=> "goodbye"
9121 * "\x00\t\n\v\f\r ".strip #=> ""
9122 * "hello".strip #=> "hello"
9123 */
9124
9125 static VALUE
rb_str_strip(VALUE str)9126 rb_str_strip(VALUE str)
9127 {
9128 char *start;
9129 long olen, loffset, roffset;
9130 rb_encoding *enc = STR_ENC_GET(str);
9131
9132 RSTRING_GETMEM(str, start, olen);
9133 loffset = lstrip_offset(str, start, start+olen, enc);
9134 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9135
9136 if (loffset <= 0 && roffset <= 0) return rb_str_dup(str);
9137 return rb_str_subseq(str, loffset, olen-loffset-roffset);
9138 }
9139
9140 static VALUE
scan_once(VALUE str,VALUE pat,long * start,int set_backref_str)9141 scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
9142 {
9143 VALUE result, match;
9144 struct re_registers *regs;
9145 int i;
9146 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
9147 if (pos >= 0) {
9148 if (BUILTIN_TYPE(pat) == T_STRING) {
9149 regs = NULL;
9150 end = pos + RSTRING_LEN(pat);
9151 }
9152 else {
9153 match = rb_backref_get();
9154 regs = RMATCH_REGS(match);
9155 pos = BEG(0);
9156 end = END(0);
9157 }
9158 if (pos == end) {
9159 rb_encoding *enc = STR_ENC_GET(str);
9160 /*
9161 * Always consume at least one character of the input string
9162 */
9163 if (RSTRING_LEN(str) > end)
9164 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
9165 RSTRING_END(str), enc);
9166 else
9167 *start = end + 1;
9168 }
9169 else {
9170 *start = end;
9171 }
9172 if (!regs || regs->num_regs == 1) {
9173 result = rb_str_subseq(str, pos, end - pos);
9174 OBJ_INFECT(result, pat);
9175 return result;
9176 }
9177 result = rb_ary_new2(regs->num_regs);
9178 for (i=1; i < regs->num_regs; i++) {
9179 VALUE s = Qnil;
9180 if (BEG(i) >= 0) {
9181 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
9182 OBJ_INFECT(s, pat);
9183 }
9184 rb_ary_push(result, s);
9185 }
9186
9187 return result;
9188 }
9189 return Qnil;
9190 }
9191
9192
9193 /*
9194 * call-seq:
9195 * str.scan(pattern) -> array
9196 * str.scan(pattern) {|match, ...| block } -> str
9197 *
9198 * Both forms iterate through <i>str</i>, matching the pattern (which may be a
9199 * <code>Regexp</code> or a <code>String</code>). For each match, a result is
9200 * generated and either added to the result array or passed to the block. If
9201 * the pattern contains no groups, each individual result consists of the
9202 * matched string, <code>$&</code>. If the pattern contains groups, each
9203 * individual result is itself an array containing one entry per group.
9204 *
9205 * a = "cruel world"
9206 * a.scan(/\w+/) #=> ["cruel", "world"]
9207 * a.scan(/.../) #=> ["cru", "el ", "wor"]
9208 * a.scan(/(...)/) #=> [["cru"], ["el "], ["wor"]]
9209 * a.scan(/(..)(..)/) #=> [["cr", "ue"], ["l ", "wo"]]
9210 *
9211 * And the block form:
9212 *
9213 * a.scan(/\w+/) {|w| print "<<#{w}>> " }
9214 * print "\n"
9215 * a.scan(/(.)(.)/) {|x,y| print y, x }
9216 * print "\n"
9217 *
9218 * <em>produces:</em>
9219 *
9220 * <<cruel>> <<world>>
9221 * rceu lowlr
9222 */
9223
9224 static VALUE
rb_str_scan(VALUE str,VALUE pat)9225 rb_str_scan(VALUE str, VALUE pat)
9226 {
9227 VALUE result;
9228 long start = 0;
9229 long last = -1, prev = 0;
9230 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
9231
9232 pat = get_pat_quoted(pat, 1);
9233 mustnot_broken(str);
9234 if (!rb_block_given_p()) {
9235 VALUE ary = rb_ary_new();
9236
9237 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
9238 last = prev;
9239 prev = start;
9240 rb_ary_push(ary, result);
9241 }
9242 if (last >= 0) rb_pat_search(pat, str, last, 1);
9243 else rb_backref_set(Qnil);
9244 return ary;
9245 }
9246
9247 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
9248 last = prev;
9249 prev = start;
9250 rb_yield(result);
9251 str_mod_check(str, p, len);
9252 }
9253 if (last >= 0) rb_pat_search(pat, str, last, 1);
9254 return str;
9255 }
9256
9257
9258 /*
9259 * call-seq:
9260 * str.hex -> integer
9261 *
9262 * Treats leading characters from <i>str</i> as a string of hexadecimal digits
9263 * (with an optional sign and an optional <code>0x</code>) and returns the
9264 * corresponding number. Zero is returned on error.
9265 *
9266 * "0x0a".hex #=> 10
9267 * "-1234".hex #=> -4660
9268 * "0".hex #=> 0
9269 * "wombat".hex #=> 0
9270 */
9271
9272 static VALUE
rb_str_hex(VALUE str)9273 rb_str_hex(VALUE str)
9274 {
9275 return rb_str_to_inum(str, 16, FALSE);
9276 }
9277
9278
9279 /*
9280 * call-seq:
9281 * str.oct -> integer
9282 *
9283 * Treats leading characters of <i>str</i> as a string of octal digits (with an
9284 * optional sign) and returns the corresponding number. Returns 0 if the
9285 * conversion fails.
9286 *
9287 * "123".oct #=> 83
9288 * "-377".oct #=> -255
9289 * "bad".oct #=> 0
9290 * "0377bad".oct #=> 255
9291 *
9292 * If +str+ starts with <code>0</code>, radix indicators are honored.
9293 * See Kernel#Integer.
9294 */
9295
9296 static VALUE
rb_str_oct(VALUE str)9297 rb_str_oct(VALUE str)
9298 {
9299 return rb_str_to_inum(str, -8, FALSE);
9300 }
9301
9302
9303 /*
9304 * call-seq:
9305 * str.crypt(salt_str) -> new_str
9306 *
9307 * Returns the string generated by calling <code>crypt(3)</code>
9308 * standard library function with <code>str</code> and
9309 * <code>salt_str</code>, in this order, as its arguments. Please do
9310 * not use this method any longer. It is legacy; provided only for
9311 * backward compatibility with ruby scripts in earlier days. It is
9312 * bad to use in contemporary programs for several reasons:
9313 *
9314 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
9315 * run. The generated string lacks data portability.
9316 *
9317 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
9318 * (i.e. silently ends up in unexpected results).
9319 *
9320 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
9321 * thread safe.
9322 *
9323 * * So-called "traditional" usage of <code>crypt(3)</code> is very
9324 * very very weak. According to its manpage, Linux's traditional
9325 * <code>crypt(3)</code> output has only 2**56 variations; too
9326 * easy to brute force today. And this is the default behaviour.
9327 *
9328 * * In order to make things robust some OSes implement so-called
9329 * "modular" usage. To go through, you have to do a complex
9330 * build-up of the <code>salt_str</code> parameter, by hand.
9331 * Failure in generation of a proper salt string tends not to
9332 * yield any errors; typos in parameters are normally not
9333 * detectable.
9334 *
9335 * * For instance, in the following example, the second invocation
9336 * of <code>String#crypt</code> is wrong; it has a typo in
9337 * "round=" (lacks "s"). However the call does not fail and
9338 * something unexpected is generated.
9339 *
9340 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
9341 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
9342 *
9343 * * Even in the "modular" mode, some hash functions are considered
9344 * archaic and no longer recommended at all; for instance module
9345 * <code>$1$</code> is officially abandoned by its author: see
9346 * http://phk.freebsd.dk/sagas/md5crypt_eol.html . For another
9347 * instance module <code>$3$</code> is considered completely
9348 * broken: see the manpage of FreeBSD.
9349 *
9350 * * On some OS such as Mac OS, there is no modular mode. Yet, as
9351 * written above, <code>crypt(3)</code> on Mac OS never fails.
9352 * This means even if you build up a proper salt string it
9353 * generates a traditional DES hash anyways, and there is no way
9354 * for you to be aware of.
9355 *
9356 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
9357 *
9358 * If for some reason you cannot migrate to other secure contemporary
9359 * password hashing algorithms, install the string-crypt gem and
9360 * <code>require 'string/crypt'</code> to continue using it.
9361 */
9362
9363 static VALUE
rb_str_crypt(VALUE str,VALUE salt)9364 rb_str_crypt(VALUE str, VALUE salt)
9365 {
9366 #ifdef HAVE_CRYPT_R
9367 VALUE databuf;
9368 struct crypt_data *data;
9369 # define CRYPT_END() ALLOCV_END(databuf)
9370 #else
9371 extern char *crypt(const char *, const char *);
9372 # define CRYPT_END() (void)0
9373 #endif
9374 VALUE result;
9375 const char *s, *saltp;
9376 char *res;
9377 #ifdef BROKEN_CRYPT
9378 char salt_8bit_clean[3];
9379 #endif
9380
9381 StringValue(salt);
9382 mustnot_wchar(str);
9383 mustnot_wchar(salt);
9384 if (RSTRING_LEN(salt) < 2) {
9385 short_salt:
9386 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
9387 }
9388
9389 s = StringValueCStr(str);
9390 saltp = RSTRING_PTR(salt);
9391 if (!saltp[0] || !saltp[1]) goto short_salt;
9392 #ifdef BROKEN_CRYPT
9393 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
9394 salt_8bit_clean[0] = saltp[0] & 0x7f;
9395 salt_8bit_clean[1] = saltp[1] & 0x7f;
9396 salt_8bit_clean[2] = '\0';
9397 saltp = salt_8bit_clean;
9398 }
9399 #endif
9400 #ifdef HAVE_CRYPT_R
9401 data = ALLOCV(databuf, sizeof(struct crypt_data));
9402 # ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
9403 data->initialized = 0;
9404 # endif
9405 res = crypt_r(s, saltp, data);
9406 #else
9407 res = crypt(s, saltp);
9408 #endif
9409 if (!res) {
9410 int err = errno;
9411 CRYPT_END();
9412 rb_syserr_fail(err, "crypt");
9413 }
9414 result = rb_str_new_cstr(res);
9415 CRYPT_END();
9416 FL_SET_RAW(result, OBJ_TAINTED_RAW(str) | OBJ_TAINTED_RAW(salt));
9417 return result;
9418 }
9419
9420
9421 /*
9422 * call-seq:
9423 * str.ord -> integer
9424 *
9425 * Returns the <code>Integer</code> ordinal of a one-character string.
9426 *
9427 * "a".ord #=> 97
9428 */
9429
9430 VALUE
rb_str_ord(VALUE s)9431 rb_str_ord(VALUE s)
9432 {
9433 unsigned int c;
9434
9435 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
9436 return UINT2NUM(c);
9437 }
9438 /*
9439 * call-seq:
9440 * str.sum(n=16) -> integer
9441 *
9442 * Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
9443 * where <em>n</em> is the optional <code>Integer</code> parameter, defaulting
9444 * to 16. The result is simply the sum of the binary value of each byte in
 *  <i>str</i> modulo <code>2**n</code>. This is not a particularly good
9446 * checksum.
9447 */
9448
9449 static VALUE
rb_str_sum(int argc,VALUE * argv,VALUE str)9450 rb_str_sum(int argc, VALUE *argv, VALUE str)
9451 {
9452 int bits = 16;
9453 char *ptr, *p, *pend;
9454 long len;
9455 VALUE sum = INT2FIX(0);
9456 unsigned long sum0 = 0;
9457
9458 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
9459 bits = 0;
9460 }
9461 ptr = p = RSTRING_PTR(str);
9462 len = RSTRING_LEN(str);
9463 pend = p + len;
9464
9465 while (p < pend) {
9466 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
9467 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
9468 str_mod_check(str, ptr, len);
9469 sum0 = 0;
9470 }
9471 sum0 += (unsigned char)*p;
9472 p++;
9473 }
9474
9475 if (bits == 0) {
9476 if (sum0) {
9477 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
9478 }
9479 }
9480 else {
9481 if (sum == INT2FIX(0)) {
9482 if (bits < (int)sizeof(long)*CHAR_BIT) {
9483 sum0 &= (((unsigned long)1)<<bits)-1;
9484 }
9485 sum = LONG2FIX(sum0);
9486 }
9487 else {
9488 VALUE mod;
9489
9490 if (sum0) {
9491 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
9492 }
9493
9494 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
9495 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
9496 sum = rb_funcall(sum, '&', 1, mod);
9497 }
9498 }
9499 return sum;
9500 }
9501
9502 static VALUE
rb_str_justify(int argc,VALUE * argv,VALUE str,char jflag)9503 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
9504 {
9505 rb_encoding *enc;
9506 VALUE w;
9507 long width, len, flen = 1, fclen = 1;
9508 VALUE res;
9509 char *p;
9510 const char *f = " ";
9511 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
9512 VALUE pad;
9513 int singlebyte = 1, cr;
9514 int termlen;
9515
9516 rb_scan_args(argc, argv, "11", &w, &pad);
9517 enc = STR_ENC_GET(str);
9518 termlen = rb_enc_mbminlen(enc);
9519 width = NUM2LONG(w);
9520 if (argc == 2) {
9521 StringValue(pad);
9522 enc = rb_enc_check(str, pad);
9523 f = RSTRING_PTR(pad);
9524 flen = RSTRING_LEN(pad);
9525 fclen = str_strlen(pad, enc); /* rb_enc_check */
9526 singlebyte = single_byte_optimizable(pad);
9527 if (flen == 0 || fclen == 0) {
9528 rb_raise(rb_eArgError, "zero width padding");
9529 }
9530 }
9531 len = str_strlen(str, enc); /* rb_enc_check */
9532 if (width < 0 || len >= width) return rb_str_dup(str);
9533 n = width - len;
9534 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
9535 rlen = n - llen;
9536 cr = ENC_CODERANGE(str);
9537 if (flen > 1) {
9538 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
9539 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
9540 }
9541 size = RSTRING_LEN(str);
9542 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
9543 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
9544 (len += llen2 + rlen2) >= LONG_MAX - size) {
9545 rb_raise(rb_eArgError, "argument too big");
9546 }
9547 len += size;
9548 res = str_new0(rb_obj_class(str), 0, len, termlen);
9549 p = RSTRING_PTR(res);
9550 if (flen <= 1) {
9551 memset(p, *f, llen);
9552 p += llen;
9553 }
9554 else {
9555 while (llen >= fclen) {
9556 memcpy(p,f,flen);
9557 p += flen;
9558 llen -= fclen;
9559 }
9560 if (llen > 0) {
9561 memcpy(p, f, llen2);
9562 p += llen2;
9563 }
9564 }
9565 memcpy(p, RSTRING_PTR(str), size);
9566 p += size;
9567 if (flen <= 1) {
9568 memset(p, *f, rlen);
9569 p += rlen;
9570 }
9571 else {
9572 while (rlen >= fclen) {
9573 memcpy(p,f,flen);
9574 p += flen;
9575 rlen -= fclen;
9576 }
9577 if (rlen > 0) {
9578 memcpy(p, f, rlen2);
9579 p += rlen2;
9580 }
9581 }
9582 TERM_FILL(p, termlen);
9583 STR_SET_LEN(res, p-RSTRING_PTR(res));
9584 OBJ_INFECT_RAW(res, str);
9585 if (!NIL_P(pad)) OBJ_INFECT_RAW(res, pad);
9586 rb_enc_associate(res, enc);
9587 if (argc == 2)
9588 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
9589 if (cr != ENC_CODERANGE_BROKEN)
9590 ENC_CODERANGE_SET(res, cr);
9591
9592 RB_GC_GUARD(pad);
9593 return res;
9594 }
9595
9596
9597 /*
9598 * call-seq:
9599 * str.ljust(integer, padstr=' ') -> new_str
9600 *
9601 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
9602 * <code>String</code> of length <i>integer</i> with <i>str</i> left justified
9603 * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
9604 *
9605 * "hello".ljust(4) #=> "hello"
9606 * "hello".ljust(20) #=> "hello "
9607 * "hello".ljust(20, '1234') #=> "hello123412341234123"
9608 */
9609
9610 static VALUE
rb_str_ljust(int argc,VALUE * argv,VALUE str)9611 rb_str_ljust(int argc, VALUE *argv, VALUE str)
9612 {
9613 return rb_str_justify(argc, argv, str, 'l');
9614 }
9615
9616
9617 /*
9618 * call-seq:
9619 * str.rjust(integer, padstr=' ') -> new_str
9620 *
9621 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
9622 * <code>String</code> of length <i>integer</i> with <i>str</i> right justified
9623 * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
9624 *
9625 * "hello".rjust(4) #=> "hello"
9626 * "hello".rjust(20) #=> " hello"
9627 * "hello".rjust(20, '1234') #=> "123412341234123hello"
9628 */
9629
9630 static VALUE
rb_str_rjust(int argc,VALUE * argv,VALUE str)9631 rb_str_rjust(int argc, VALUE *argv, VALUE str)
9632 {
9633 return rb_str_justify(argc, argv, str, 'r');
9634 }
9635
9636
9637 /*
9638 * call-seq:
9639 * str.center(width, padstr=' ') -> new_str
9640 *
9641 * Centers +str+ in +width+. If +width+ is greater than the length of +str+,
9642 * returns a new String of length +width+ with +str+ centered and padded with
9643 * +padstr+; otherwise, returns +str+.
9644 *
9645 * "hello".center(4) #=> "hello"
9646 * "hello".center(20) #=> " hello "
9647 * "hello".center(20, '123') #=> "1231231hello12312312"
9648 */
9649
9650 static VALUE
rb_str_center(int argc,VALUE * argv,VALUE str)9651 rb_str_center(int argc, VALUE *argv, VALUE str)
9652 {
9653 return rb_str_justify(argc, argv, str, 'c');
9654 }
9655
9656 /*
9657 * call-seq:
9658 * str.partition(sep) -> [head, sep, tail]
9659 * str.partition(regexp) -> [head, match, tail]
9660 *
9661 * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
9662 * and returns the part before it, the match, and the part
9663 * after it.
 *  If it is not found, returns <i>str</i> and two empty strings.
9665 *
9666 * "hello".partition("l") #=> ["he", "l", "lo"]
9667 * "hello".partition("x") #=> ["hello", "", ""]
9668 * "hello".partition(/.l/) #=> ["h", "el", "lo"]
9669 */
9670
9671 static VALUE
rb_str_partition(VALUE str,VALUE sep)9672 rb_str_partition(VALUE str, VALUE sep)
9673 {
9674 long pos;
9675
9676 sep = get_pat_quoted(sep, 0);
9677 if (RB_TYPE_P(sep, T_REGEXP)) {
9678 pos = rb_reg_search(sep, str, 0, 0);
9679 if (pos < 0) {
9680 failed:
9681 return rb_ary_new3(3, rb_str_dup(str), str_new_empty(str), str_new_empty(str));
9682 }
9683 sep = rb_str_subpat(str, sep, INT2FIX(0));
9684 if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
9685 }
9686 else {
9687 pos = rb_str_index(str, sep, 0);
9688 if (pos < 0) goto failed;
9689 }
9690 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
9691 sep,
9692 rb_str_subseq(str, pos+RSTRING_LEN(sep),
9693 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
9694 }
9695
9696 /*
9697 * call-seq:
9698 * str.rpartition(sep) -> [head, sep, tail]
9699 * str.rpartition(regexp) -> [head, match, tail]
9700 *
9701 * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
9702 * of the string, and returns the part before it, the match, and the part
9703 * after it.
9704 * If it is not found, returns two empty strings and <i>str</i>.
9705 *
9706 * "hello".rpartition("l") #=> ["hel", "l", "o"]
9707 * "hello".rpartition("x") #=> ["", "", "hello"]
9708 * "hello".rpartition(/.l/) #=> ["he", "ll", "o"]
9709 */
9710
9711 static VALUE
rb_str_rpartition(VALUE str,VALUE sep)9712 rb_str_rpartition(VALUE str, VALUE sep)
9713 {
9714 long pos = RSTRING_LEN(str);
9715 int regex = FALSE;
9716
9717 if (RB_TYPE_P(sep, T_REGEXP)) {
9718 pos = rb_reg_search(sep, str, pos, 1);
9719 regex = TRUE;
9720 }
9721 else {
9722 VALUE tmp;
9723
9724 tmp = rb_check_string_type(sep);
9725 if (NIL_P(tmp)) {
9726 rb_raise(rb_eTypeError, "type mismatch: %s given",
9727 rb_obj_classname(sep));
9728 }
9729 sep = tmp;
9730 pos = rb_str_sublen(str, pos);
9731 pos = rb_str_rindex(str, sep, pos);
9732 }
9733 if (pos < 0) {
9734 return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), rb_str_dup(str));
9735 }
9736 if (regex) {
9737 sep = rb_reg_nth_match(0, rb_backref_get());
9738 }
9739 else {
9740 pos = rb_str_offset(str, pos);
9741 }
9742 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
9743 sep,
9744 rb_str_subseq(str, pos+RSTRING_LEN(sep),
9745 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
9746 }
9747
9748 /*
9749 * call-seq:
9750 * str.start_with?([prefixes]+) -> true or false
9751 *
9752 * Returns true if +str+ starts with one of the +prefixes+ given.
9753 * Each of the +prefixes+ should be a String or a Regexp.
9754 *
9755 * "hello".start_with?("hell") #=> true
9756 * "hello".start_with?(/H/i) #=> true
9757 *
9758 * # returns true if one of the prefixes matches.
9759 * "hello".start_with?("heaven", "hell") #=> true
9760 * "hello".start_with?("heaven", "paradise") #=> false
9761 */
9762
9763 static VALUE
rb_str_start_with(int argc,VALUE * argv,VALUE str)9764 rb_str_start_with(int argc, VALUE *argv, VALUE str)
9765 {
9766 int i;
9767
9768 for (i=0; i<argc; i++) {
9769 VALUE tmp = argv[i];
9770 if (RB_TYPE_P(tmp, T_REGEXP)) {
9771 if (rb_reg_start_with_p(tmp, str))
9772 return Qtrue;
9773 }
9774 else {
9775 StringValue(tmp);
9776 rb_enc_check(str, tmp);
9777 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
9778 if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
9779 return Qtrue;
9780 }
9781 }
9782 return Qfalse;
9783 }
9784
9785 /*
9786 * call-seq:
9787 * str.end_with?([suffixes]+) -> true or false
9788 *
9789 * Returns true if +str+ ends with one of the +suffixes+ given.
9790 *
9791 * "hello".end_with?("ello") #=> true
9792 *
9793 * # returns true if one of the +suffixes+ matches.
9794 * "hello".end_with?("heaven", "ello") #=> true
9795 * "hello".end_with?("heaven", "paradise") #=> false
9796 */
9797
9798 static VALUE
rb_str_end_with(int argc,VALUE * argv,VALUE str)9799 rb_str_end_with(int argc, VALUE *argv, VALUE str)
9800 {
9801 int i;
9802 char *p, *s, *e;
9803 rb_encoding *enc;
9804
9805 for (i=0; i<argc; i++) {
9806 VALUE tmp = argv[i];
9807 StringValue(tmp);
9808 enc = rb_enc_check(str, tmp);
9809 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
9810 p = RSTRING_PTR(str);
9811 e = p + RSTRING_LEN(str);
9812 s = e - RSTRING_LEN(tmp);
9813 if (rb_enc_left_char_head(p, s, e, enc) != s)
9814 continue;
9815 if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
9816 return Qtrue;
9817 }
9818 return Qfalse;
9819 }
9820
9821 /*!
9822 * Returns the length of the <i>prefix</i> to be deleted in the given <i>str</i>,
9823 * returning 0 if <i>str</i> does not start with the <i>prefix</i>.
9824 *
9825 * @param str the target
9826 * @param prefix the prefix
9827 * @retval 0 if the given <i>str</i> does not start with the given <i>prefix</i>
9828 * @retval Positive-Integer otherwise
9829 */
9830 static long
deleted_prefix_length(VALUE str,VALUE prefix)9831 deleted_prefix_length(VALUE str, VALUE prefix)
9832 {
9833 char *strptr, *prefixptr;
9834 long olen, prefixlen;
9835
9836 StringValue(prefix);
9837 if (is_broken_string(prefix)) return 0;
9838 rb_enc_check(str, prefix);
9839
9840 /* return 0 if not start with prefix */
9841 prefixlen = RSTRING_LEN(prefix);
9842 if (prefixlen <= 0) return 0;
9843 olen = RSTRING_LEN(str);
9844 if (olen < prefixlen) return 0;
9845 strptr = RSTRING_PTR(str);
9846 prefixptr = RSTRING_PTR(prefix);
9847 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
9848
9849 return prefixlen;
9850 }
9851
9852 /*
9853 * call-seq:
9854 * str.delete_prefix!(prefix) -> self or nil
9855 *
9856 * Deletes leading <code>prefix</code> from <i>str</i>, returning
9857 * <code>nil</code> if no change was made.
9858 *
9859 * "hello".delete_prefix!("hel") #=> "lo"
9860 * "hello".delete_prefix!("llo") #=> nil
9861 */
9862
9863 static VALUE
rb_str_delete_prefix_bang(VALUE str,VALUE prefix)9864 rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
9865 {
9866 long prefixlen;
9867 str_modify_keep_cr(str);
9868
9869 prefixlen = deleted_prefix_length(str, prefix);
9870 if (prefixlen <= 0) return Qnil;
9871
9872 return rb_str_drop_bytes(str, prefixlen);
9873 }
9874
9875 /*
9876 * call-seq:
9877 * str.delete_prefix(prefix) -> new_str
9878 *
9879 * Returns a copy of <i>str</i> with leading <code>prefix</code> deleted.
9880 *
9881 * "hello".delete_prefix("hel") #=> "lo"
9882 * "hello".delete_prefix("llo") #=> "hello"
9883 */
9884
9885 static VALUE
rb_str_delete_prefix(VALUE str,VALUE prefix)9886 rb_str_delete_prefix(VALUE str, VALUE prefix)
9887 {
9888 long prefixlen;
9889
9890 prefixlen = deleted_prefix_length(str, prefix);
9891 if (prefixlen <= 0) return rb_str_dup(str);
9892
9893 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
9894 }
9895
9896 /*!
9897 * Returns the length of the <i>suffix</i> to be deleted in the given <i>str</i>,
9898 * returning 0 if <i>str</i> does not end with the <i>suffix</i>.
9899 *
9900 * @param str the target
9901 * @param suffix the suffix
9902 * @retval 0 if the given <i>str</i> does not end with the given <i>suffix</i>
9903 * @retval Positive-Integer otherwise
9904 */
9905 static long
deleted_suffix_length(VALUE str,VALUE suffix)9906 deleted_suffix_length(VALUE str, VALUE suffix)
9907 {
9908 char *strptr, *suffixptr, *s;
9909 long olen, suffixlen;
9910 rb_encoding *enc;
9911
9912 StringValue(suffix);
9913 if (is_broken_string(suffix)) return 0;
9914 enc = rb_enc_check(str, suffix);
9915
9916 /* return 0 if not start with suffix */
9917 suffixlen = RSTRING_LEN(suffix);
9918 if (suffixlen <= 0) return 0;
9919 olen = RSTRING_LEN(str);
9920 if (olen < suffixlen) return 0;
9921 strptr = RSTRING_PTR(str);
9922 suffixptr = RSTRING_PTR(suffix);
9923 s = strptr + olen - suffixlen;
9924 if (memcmp(s, suffixptr, suffixlen) != 0) return 0;
9925 if (rb_enc_left_char_head(strptr, s, strptr + olen, enc) != s) return 0;
9926
9927 return suffixlen;
9928 }
9929
9930 /*
9931 * call-seq:
9932 * str.delete_suffix!(suffix) -> self or nil
9933 *
9934 * Deletes trailing <code>suffix</code> from <i>str</i>, returning
9935 * <code>nil</code> if no change was made.
9936 *
9937 * "hello".delete_suffix!("llo") #=> "he"
9938 * "hello".delete_suffix!("hel") #=> nil
9939 */
9940
9941 static VALUE
rb_str_delete_suffix_bang(VALUE str,VALUE suffix)9942 rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
9943 {
9944 long olen, suffixlen, len;
9945 str_modifiable(str);
9946
9947 suffixlen = deleted_suffix_length(str, suffix);
9948 if (suffixlen <= 0) return Qnil;
9949
9950 olen = RSTRING_LEN(str);
9951 str_modify_keep_cr(str);
9952 len = olen - suffixlen;
9953 STR_SET_LEN(str, len);
9954 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9955 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9956 ENC_CODERANGE_CLEAR(str);
9957 }
9958 return str;
9959 }
9960
9961 /*
9962 * call-seq:
9963 * str.delete_suffix(suffix) -> new_str
9964 *
9965 * Returns a copy of <i>str</i> with trailing <code>suffix</code> deleted.
9966 *
9967 * "hello".delete_suffix("llo") #=> "he"
9968 * "hello".delete_suffix("hel") #=> "hello"
9969 */
9970
9971 static VALUE
rb_str_delete_suffix(VALUE str,VALUE suffix)9972 rb_str_delete_suffix(VALUE str, VALUE suffix)
9973 {
9974 long suffixlen;
9975
9976 suffixlen = deleted_suffix_length(str, suffix);
9977 if (suffixlen <= 0) return rb_str_dup(str);
9978
9979 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
9980 }
9981
9982 void
rb_str_setter(VALUE val,ID id,VALUE * var)9983 rb_str_setter(VALUE val, ID id, VALUE *var)
9984 {
9985 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
9986 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
9987 }
9988 *var = val;
9989 }
9990
9991 static void
rb_fs_setter(VALUE val,ID id,VALUE * var)9992 rb_fs_setter(VALUE val, ID id, VALUE *var)
9993 {
9994 val = rb_fs_check(val);
9995 if (!val) {
9996 rb_raise(rb_eTypeError,
9997 "value of %"PRIsVALUE" must be String or Regexp",
9998 rb_id2str(id));
9999 }
10000 *var = val;
10001 }
10002
10003
10004 /*
10005 * call-seq:
10006 * str.force_encoding(encoding) -> str
10007 *
10008 * Changes the encoding to +encoding+ and returns self.
10009 */
10010
10011 static VALUE
rb_str_force_encoding(VALUE str,VALUE enc)10012 rb_str_force_encoding(VALUE str, VALUE enc)
10013 {
10014 str_modifiable(str);
10015 rb_enc_associate(str, rb_to_encoding(enc));
10016 ENC_CODERANGE_CLEAR(str);
10017 return str;
10018 }
10019
10020 /*
10021 * call-seq:
10022 * str.b -> str
10023 *
10024 * Returns a copied string whose encoding is ASCII-8BIT.
10025 */
10026
10027 static VALUE
rb_str_b(VALUE str)10028 rb_str_b(VALUE str)
10029 {
10030 VALUE str2 = str_alloc(rb_cString);
10031 str_replace_shared_without_enc(str2, str);
10032 OBJ_INFECT_RAW(str2, str);
10033 ENC_CODERANGE_CLEAR(str2);
10034 return str2;
10035 }
10036
10037 /*
10038 * call-seq:
10039 * str.valid_encoding? -> true or false
10040 *
10041 * Returns true for a string which is encoded correctly.
10042 *
10043 * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? #=> true
10044 * "\xc2".force_encoding("UTF-8").valid_encoding? #=> false
10045 * "\x80".force_encoding("UTF-8").valid_encoding? #=> false
10046 */
10047
10048 static VALUE
rb_str_valid_encoding_p(VALUE str)10049 rb_str_valid_encoding_p(VALUE str)
10050 {
10051 int cr = rb_enc_str_coderange(str);
10052
10053 return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
10054 }
10055
10056 /*
10057 * call-seq:
10058 * str.ascii_only? -> true or false
10059 *
10060 * Returns true for a string which has only ASCII characters.
10061 *
10062 * "abc".force_encoding("UTF-8").ascii_only? #=> true
10063 * "abc\u{6666}".force_encoding("UTF-8").ascii_only? #=> false
10064 */
10065
10066 static VALUE
rb_str_is_ascii_only_p(VALUE str)10067 rb_str_is_ascii_only_p(VALUE str)
10068 {
10069 int cr = rb_enc_str_coderange(str);
10070
10071 return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
10072 }
10073
10074 /**
10075 * Shortens _str_ and adds three dots, an ellipsis, if it is longer
10076 * than _len_ characters.
10077 *
10078 * \param str the string to ellipsize.
10079 * \param len the maximum string length.
10080 * \return the ellipsized string.
10081 * \pre _len_ must not be negative.
10082 * \post the length of the returned string in characters is less than or equal to _len_.
 * \post If the length of _str_ is less than or equal to _len_, returns _str_ itself.
10084 * \post the encoding of returned string is equal to the encoding of _str_.
10085 * \post the class of returned string is equal to the class of _str_.
10086 * \note the length is counted in characters.
10087 */
10088 VALUE
rb_str_ellipsize(VALUE str,long len)10089 rb_str_ellipsize(VALUE str, long len)
10090 {
10091 static const char ellipsis[] = "...";
10092 const long ellipsislen = sizeof(ellipsis) - 1;
10093 rb_encoding *const enc = rb_enc_get(str);
10094 const long blen = RSTRING_LEN(str);
10095 const char *const p = RSTRING_PTR(str), *e = p + blen;
10096 VALUE estr, ret = 0;
10097
10098 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
10099 if (len * rb_enc_mbminlen(enc) >= blen ||
10100 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
10101 ret = str;
10102 }
10103 else if (len <= ellipsislen ||
10104 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
10105 if (rb_enc_asciicompat(enc)) {
10106 ret = rb_str_new_with_class(str, ellipsis, len);
10107 rb_enc_associate(ret, enc);
10108 }
10109 else {
10110 estr = rb_usascii_str_new(ellipsis, len);
10111 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
10112 }
10113 }
10114 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
10115 rb_str_cat(ret, ellipsis, ellipsislen);
10116 }
10117 else {
10118 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
10119 rb_enc_from_encoding(enc), 0, Qnil);
10120 rb_str_append(ret, estr);
10121 }
10122 return ret;
10123 }
10124
10125 static VALUE
str_compat_and_valid(VALUE str,rb_encoding * enc)10126 str_compat_and_valid(VALUE str, rb_encoding *enc)
10127 {
10128 int cr;
10129 str = StringValue(str);
10130 cr = rb_enc_str_coderange(str);
10131 if (cr == ENC_CODERANGE_BROKEN) {
10132 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
10133 }
10134 else {
10135 rb_encoding *e = STR_ENC_GET(str);
10136 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
10137 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
10138 rb_enc_name(enc), rb_enc_name(e));
10139 }
10140 }
10141 return str;
10142 }
10143
10144 static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
10145
10146 /**
10147 * @param str the string to be scrubbed
10148 * @param repl the replacement character
10149 * @return If given string is invalid, returns a new string. Otherwise, returns Qnil.
10150 */
10151 VALUE
rb_str_scrub(VALUE str,VALUE repl)10152 rb_str_scrub(VALUE str, VALUE repl)
10153 {
10154 rb_encoding *enc = STR_ENC_GET(str);
10155 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
10156 }
10157
10158 VALUE
rb_enc_str_scrub(rb_encoding * enc,VALUE str,VALUE repl)10159 rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
10160 {
10161 int cr = ENC_CODERANGE_UNKNOWN;
10162 if (enc == STR_ENC_GET(str)) {
10163 /* cached coderange makes sense only when enc equals the
10164 * actual encoding of str */
10165 cr = ENC_CODERANGE(str);
10166 }
10167 return enc_str_scrub(enc, str, repl, cr);
10168 }
10169
10170 static VALUE
enc_str_scrub(rb_encoding * enc,VALUE str,VALUE repl,int cr)10171 enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
10172 {
10173 int encidx;
10174 VALUE buf = Qnil;
10175 const char *rep;
10176 long replen = -1;
10177 int tainted = 0;
10178
10179 if (rb_block_given_p()) {
10180 if (!NIL_P(repl))
10181 rb_raise(rb_eArgError, "both of block and replacement given");
10182 replen = 0;
10183 }
10184
10185 if (ENC_CODERANGE_CLEAN_P(cr))
10186 return Qnil;
10187
10188 if (!NIL_P(repl)) {
10189 repl = str_compat_and_valid(repl, enc);
10190 tainted = OBJ_TAINTED_RAW(repl);
10191 }
10192
10193 if (rb_enc_dummy_p(enc)) {
10194 return Qnil;
10195 }
10196 encidx = rb_enc_to_index(enc);
10197
10198 #define DEFAULT_REPLACE_CHAR(str) do { \
10199 static const char replace[sizeof(str)-1] = str; \
10200 rep = replace; replen = (int)sizeof(replace); \
10201 } while (0)
10202
10203 if (rb_enc_asciicompat(enc)) {
10204 const char *p = RSTRING_PTR(str);
10205 const char *e = RSTRING_END(str);
10206 const char *p1 = p;
10207 int rep7bit_p;
10208 if (!replen) {
10209 rep = NULL;
10210 rep7bit_p = FALSE;
10211 }
10212 else if (!NIL_P(repl)) {
10213 rep = RSTRING_PTR(repl);
10214 replen = RSTRING_LEN(repl);
10215 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
10216 }
10217 else if (encidx == rb_utf8_encindex()) {
10218 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
10219 rep7bit_p = FALSE;
10220 }
10221 else {
10222 DEFAULT_REPLACE_CHAR("?");
10223 rep7bit_p = TRUE;
10224 }
10225 cr = ENC_CODERANGE_7BIT;
10226
10227 p = search_nonascii(p, e);
10228 if (!p) {
10229 p = e;
10230 }
10231 while (p < e) {
10232 int ret = rb_enc_precise_mbclen(p, e, enc);
10233 if (MBCLEN_NEEDMORE_P(ret)) {
10234 break;
10235 }
10236 else if (MBCLEN_CHARFOUND_P(ret)) {
10237 cr = ENC_CODERANGE_VALID;
10238 p += MBCLEN_CHARFOUND_LEN(ret);
10239 }
10240 else if (MBCLEN_INVALID_P(ret)) {
10241 /*
10242 * p1~p: valid ascii/multibyte chars
10243 * p ~e: invalid bytes + unknown bytes
10244 */
10245 long clen = rb_enc_mbmaxlen(enc);
10246 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
10247 if (p > p1) {
10248 rb_str_buf_cat(buf, p1, p - p1);
10249 }
10250
10251 if (e - p < clen) clen = e - p;
10252 if (clen <= 2) {
10253 clen = 1;
10254 }
10255 else {
10256 const char *q = p;
10257 clen--;
10258 for (; clen > 1; clen--) {
10259 ret = rb_enc_precise_mbclen(q, q + clen, enc);
10260 if (MBCLEN_NEEDMORE_P(ret)) break;
10261 if (MBCLEN_INVALID_P(ret)) continue;
10262 UNREACHABLE;
10263 }
10264 }
10265 if (rep) {
10266 rb_str_buf_cat(buf, rep, replen);
10267 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
10268 }
10269 else {
10270 repl = rb_yield(rb_enc_str_new(p, clen, enc));
10271 repl = str_compat_and_valid(repl, enc);
10272 tainted |= OBJ_TAINTED_RAW(repl);
10273 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
10274 if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
10275 cr = ENC_CODERANGE_VALID;
10276 }
10277 p += clen;
10278 p1 = p;
10279 p = search_nonascii(p, e);
10280 if (!p) {
10281 p = e;
10282 break;
10283 }
10284 }
10285 else {
10286 UNREACHABLE;
10287 }
10288 }
10289 if (NIL_P(buf)) {
10290 if (p == e) {
10291 ENC_CODERANGE_SET(str, cr);
10292 return Qnil;
10293 }
10294 buf = rb_str_buf_new(RSTRING_LEN(str));
10295 }
10296 if (p1 < p) {
10297 rb_str_buf_cat(buf, p1, p - p1);
10298 }
10299 if (p < e) {
10300 if (rep) {
10301 rb_str_buf_cat(buf, rep, replen);
10302 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
10303 }
10304 else {
10305 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
10306 repl = str_compat_and_valid(repl, enc);
10307 tainted |= OBJ_TAINTED_RAW(repl);
10308 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
10309 if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
10310 cr = ENC_CODERANGE_VALID;
10311 }
10312 }
10313 }
10314 else {
10315 /* ASCII incompatible */
10316 const char *p = RSTRING_PTR(str);
10317 const char *e = RSTRING_END(str);
10318 const char *p1 = p;
10319 long mbminlen = rb_enc_mbminlen(enc);
10320 if (!replen) {
10321 rep = NULL;
10322 }
10323 else if (!NIL_P(repl)) {
10324 rep = RSTRING_PTR(repl);
10325 replen = RSTRING_LEN(repl);
10326 }
10327 else if (encidx == ENCINDEX_UTF_16BE) {
10328 DEFAULT_REPLACE_CHAR("\xFF\xFD");
10329 }
10330 else if (encidx == ENCINDEX_UTF_16LE) {
10331 DEFAULT_REPLACE_CHAR("\xFD\xFF");
10332 }
10333 else if (encidx == ENCINDEX_UTF_32BE) {
10334 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
10335 }
10336 else if (encidx == ENCINDEX_UTF_32LE) {
10337 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
10338 }
10339 else {
10340 DEFAULT_REPLACE_CHAR("?");
10341 }
10342
10343 while (p < e) {
10344 int ret = rb_enc_precise_mbclen(p, e, enc);
10345 if (MBCLEN_NEEDMORE_P(ret)) {
10346 break;
10347 }
10348 else if (MBCLEN_CHARFOUND_P(ret)) {
10349 p += MBCLEN_CHARFOUND_LEN(ret);
10350 }
10351 else if (MBCLEN_INVALID_P(ret)) {
10352 const char *q = p;
10353 long clen = rb_enc_mbmaxlen(enc);
10354 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
10355 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
10356
10357 if (e - p < clen) clen = e - p;
10358 if (clen <= mbminlen * 2) {
10359 clen = mbminlen;
10360 }
10361 else {
10362 clen -= mbminlen;
10363 for (; clen > mbminlen; clen-=mbminlen) {
10364 ret = rb_enc_precise_mbclen(q, q + clen, enc);
10365 if (MBCLEN_NEEDMORE_P(ret)) break;
10366 if (MBCLEN_INVALID_P(ret)) continue;
10367 UNREACHABLE;
10368 }
10369 }
10370 if (rep) {
10371 rb_str_buf_cat(buf, rep, replen);
10372 }
10373 else {
10374 repl = rb_yield(rb_enc_str_new(p, clen, enc));
10375 repl = str_compat_and_valid(repl, enc);
10376 tainted |= OBJ_TAINTED_RAW(repl);
10377 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
10378 }
10379 p += clen;
10380 p1 = p;
10381 }
10382 else {
10383 UNREACHABLE;
10384 }
10385 }
10386 if (NIL_P(buf)) {
10387 if (p == e) {
10388 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
10389 return Qnil;
10390 }
10391 buf = rb_str_buf_new(RSTRING_LEN(str));
10392 }
10393 if (p1 < p) {
10394 rb_str_buf_cat(buf, p1, p - p1);
10395 }
10396 if (p < e) {
10397 if (rep) {
10398 rb_str_buf_cat(buf, rep, replen);
10399 }
10400 else {
10401 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
10402 repl = str_compat_and_valid(repl, enc);
10403 tainted |= OBJ_TAINTED_RAW(repl);
10404 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
10405 }
10406 }
10407 cr = ENC_CODERANGE_VALID;
10408 }
10409 FL_SET_RAW(buf, tainted|OBJ_TAINTED_RAW(str));
10410 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
10411 return buf;
10412 }
10413
10414 /*
10415 * call-seq:
10416 * str.scrub -> new_str
10417 * str.scrub(repl) -> new_str
10418 * str.scrub{|bytes|} -> new_str
10419 *
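 *  If the string contains invalid byte sequences, returns a copy in which
 *  each of them is replaced with the given replacement string (or with a
 *  default replacement character when none is given); otherwise returns an
 *  unchanged copy of the string.
 *  If a block is given, invalid bytes are replaced with the value returned
 *  by the block.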
10423 *
10424 * "abc\u3042\x81".scrub #=> "abc\u3042\uFFFD"
10425 * "abc\u3042\x81".scrub("*") #=> "abc\u3042*"
10426 * "abc\u3042\xE3\x80".scrub{|bytes| '<'+bytes.unpack('H*')[0]+'>' } #=> "abc\u3042<e380>"
10427 */
10428 static VALUE
str_scrub(int argc,VALUE * argv,VALUE str)10429 str_scrub(int argc, VALUE *argv, VALUE str)
10430 {
10431 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
10432 VALUE new = rb_str_scrub(str, repl);
10433 return NIL_P(new) ? rb_str_dup(str): new;
10434 }
10435
10436 /*
10437 * call-seq:
10438 * str.scrub! -> str
10439 * str.scrub!(repl) -> str
10440 * str.scrub!{|bytes|} -> str
10441 *
 *  If the string contains invalid byte sequences, replaces each of them in
 *  place with the given replacement string (or with a default replacement
 *  character when none is given). Returns self in either case.
 *  If a block is given, invalid bytes are replaced with the value returned
 *  by the block.
10445 *
10446 * "abc\u3042\x81".scrub! #=> "abc\u3042\uFFFD"
10447 * "abc\u3042\x81".scrub!("*") #=> "abc\u3042*"
10448 * "abc\u3042\xE3\x80".scrub!{|bytes| '<'+bytes.unpack('H*')[0]+'>' } #=> "abc\u3042<e380>"
10449 */
10450 static VALUE
str_scrub_bang(int argc,VALUE * argv,VALUE str)10451 str_scrub_bang(int argc, VALUE *argv, VALUE str)
10452 {
10453 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
10454 VALUE new = rb_str_scrub(str, repl);
10455 if (!NIL_P(new)) rb_str_replace(str, new);
10456 return str;
10457 }
10458
10459 static ID id_normalize;
10460 static ID id_normalized_p;
10461 static VALUE mUnicodeNormalize;
10462
10463 static VALUE
unicode_normalize_common(int argc,VALUE * argv,VALUE str,ID id)10464 unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
10465 {
10466 static int UnicodeNormalizeRequired = 0;
10467 VALUE argv2[2];
10468
10469 if (!UnicodeNormalizeRequired) {
10470 rb_require("unicode_normalize/normalize.rb");
10471 UnicodeNormalizeRequired = 1;
10472 }
10473 argv2[0] = str;
10474 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
10475 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
10476 }
10477
10478 /*
10479 * call-seq:
10480 * str.unicode_normalize(form=:nfc)
10481 *
10482 * Unicode Normalization---Returns a normalized form of +str+,
10483 * using Unicode normalizations NFC, NFD, NFKC, or NFKD.
10484 * The normalization form used is determined by +form+, which can
10485 * be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
10486 * The default is +:nfc+.
10487 *
10488 * If the string is not in a Unicode Encoding, then an Exception is raised.
10489 * In this context, 'Unicode Encoding' means any of UTF-8, UTF-16BE/LE,
10490 * and UTF-32BE/LE, as well as GB18030, UCS_2BE, and UCS_4BE.
10491 * Anything other than UTF-8 is implemented by converting to UTF-8,
10492 * which makes it slower than UTF-8.
10493 *
10494 * "a\u0300".unicode_normalize #=> "\u00E0"
10495 * "a\u0300".unicode_normalize(:nfc) #=> "\u00E0"
10496 * "\u00E0".unicode_normalize(:nfd) #=> "a\u0300"
10497 * "\xE0".force_encoding('ISO-8859-1').unicode_normalize(:nfd)
10498 * #=> Encoding::CompatibilityError raised
10499 */
10500 static VALUE
rb_str_unicode_normalize(int argc,VALUE * argv,VALUE str)10501 rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
10502 {
10503 return unicode_normalize_common(argc, argv, str, id_normalize);
10504 }
10505
10506 /*
10507 * call-seq:
10508 * str.unicode_normalize!(form=:nfc)
10509 *
10510 * Destructive version of String#unicode_normalize, doing Unicode
10511 * normalization in place.
10512 */
10513 static VALUE
rb_str_unicode_normalize_bang(int argc,VALUE * argv,VALUE str)10514 rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
10515 {
10516 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
10517 }
10518
10519 /* call-seq:
10520 * str.unicode_normalized?(form=:nfc)
10521 *
10522 * Checks whether +str+ is in Unicode normalization form +form+,
10523 * which can be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
10524 * The default is +:nfc+.
10525 *
10526 * If the string is not in a Unicode Encoding, then an Exception is raised.
10527 * For details, see String#unicode_normalize.
10528 *
10529 * "a\u0300".unicode_normalized? #=> false
10530 * "a\u0300".unicode_normalized?(:nfd) #=> true
10531 * "\u00E0".unicode_normalized? #=> true
10532 * "\u00E0".unicode_normalized?(:nfd) #=> false
10533 * "\xE0".force_encoding('ISO-8859-1').unicode_normalized?
10534 * #=> Encoding::CompatibilityError raised
10535 */
10536 static VALUE
rb_str_unicode_normalized_p(int argc,VALUE * argv,VALUE str)10537 rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
10538 {
10539 return unicode_normalize_common(argc, argv, str, id_normalized_p);
10540 }
10541
10542 /**********************************************************************
10543 * Document-class: Symbol
10544 *
10545 * <code>Symbol</code> objects represent names and some strings
10546 * inside the Ruby
10547 * interpreter. They are generated using the <code>:name</code> and
10548 * <code>:"string"</code> literals
10549 * syntax, and by the various <code>to_sym</code> methods. The same
10550 * <code>Symbol</code> object will be created for a given name or string
10551 * for the duration of a program's execution, regardless of the context
10552 * or meaning of that name. Thus if <code>Fred</code> is a constant in
10553 * one context, a method in another, and a class in a third, the
10554 * <code>Symbol</code> <code>:Fred</code> will be the same object in
10555 * all three contexts.
10556 *
10557 * module One
10558 * class Fred
10559 * end
10560 * $f1 = :Fred
10561 * end
10562 * module Two
10563 * Fred = 1
10564 * $f2 = :Fred
10565 * end
10566 * def Fred()
10567 * end
10568 * $f3 = :Fred
10569 * $f1.object_id #=> 2514190
10570 * $f2.object_id #=> 2514190
10571 * $f3.object_id #=> 2514190
10572 *
10573 */
10574
10575
10576 /*
10577 * call-seq:
10578 * sym == obj -> true or false
10579 *
10580 * Equality---If <i>sym</i> and <i>obj</i> are exactly the same
10581 * symbol, returns <code>true</code>.
10582 */
10583
#define sym_equal rb_obj_equal /* Symbol#== is delegated unchanged to rb_obj_equal */
10585
10586 static int
sym_printable(const char * s,const char * send,rb_encoding * enc)10587 sym_printable(const char *s, const char *send, rb_encoding *enc)
10588 {
10589 while (s < send) {
10590 int n;
10591 int c = rb_enc_precise_mbclen(s, send, enc);
10592
10593 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
10594 n = MBCLEN_CHARFOUND_LEN(c);
10595 c = rb_enc_mbc_to_codepoint(s, send, enc);
10596 if (!rb_enc_isprint(c, enc)) return FALSE;
10597 s += n;
10598 }
10599 return TRUE;
10600 }
10601
10602 int
rb_str_symname_p(VALUE sym)10603 rb_str_symname_p(VALUE sym)
10604 {
10605 rb_encoding *enc;
10606 const char *ptr;
10607 long len;
10608 rb_encoding *resenc = rb_default_internal_encoding();
10609
10610 if (resenc == NULL) resenc = rb_default_external_encoding();
10611 enc = STR_ENC_GET(sym);
10612 ptr = RSTRING_PTR(sym);
10613 len = RSTRING_LEN(sym);
10614 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
10615 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
10616 return FALSE;
10617 }
10618 return TRUE;
10619 }
10620
10621 VALUE
rb_str_quote_unprintable(VALUE str)10622 rb_str_quote_unprintable(VALUE str)
10623 {
10624 rb_encoding *enc;
10625 const char *ptr;
10626 long len;
10627 rb_encoding *resenc;
10628
10629 Check_Type(str, T_STRING);
10630 resenc = rb_default_internal_encoding();
10631 if (resenc == NULL) resenc = rb_default_external_encoding();
10632 enc = STR_ENC_GET(str);
10633 ptr = RSTRING_PTR(str);
10634 len = RSTRING_LEN(str);
10635 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
10636 !sym_printable(ptr, ptr + len, enc)) {
10637 return rb_str_inspect(str);
10638 }
10639 return str;
10640 }
10641
10642 MJIT_FUNC_EXPORTED VALUE
rb_id_quote_unprintable(ID id)10643 rb_id_quote_unprintable(ID id)
10644 {
10645 VALUE str = rb_id2str(id);
10646 if (!rb_str_symname_p(str)) {
10647 return rb_str_inspect(str);
10648 }
10649 return str;
10650 }
10651
10652 /*
10653 * call-seq:
10654 * sym.inspect -> string
10655 *
10656 * Returns the representation of <i>sym</i> as a symbol literal.
10657 *
10658 * :fred.inspect #=> ":fred"
10659 */
10660
10661 static VALUE
sym_inspect(VALUE sym)10662 sym_inspect(VALUE sym)
10663 {
10664 VALUE str = rb_sym2str(sym);
10665 const char *ptr;
10666 long len;
10667 char *dest;
10668
10669 if (!rb_str_symname_p(str)) {
10670 str = rb_str_inspect(str);
10671 len = RSTRING_LEN(str);
10672 rb_str_resize(str, len + 1);
10673 dest = RSTRING_PTR(str);
10674 memmove(dest + 1, dest, len);
10675 }
10676 else {
10677 rb_encoding *enc = STR_ENC_GET(str);
10678 RSTRING_GETMEM(str, ptr, len);
10679 str = rb_enc_str_new(0, len + 1, enc);
10680 dest = RSTRING_PTR(str);
10681 memcpy(dest + 1, ptr, len);
10682 }
10683 dest[0] = ':';
10684 return str;
10685 }
10686
10687
10688 /*
10689 * call-seq:
10690 * sym.id2name -> string
10691 * sym.to_s -> string
10692 *
10693 * Returns the name or string corresponding to <i>sym</i>.
10694 *
10695 * :fred.id2name #=> "fred"
10696 * :ginger.to_s #=> "ginger"
10697 */
10698
10699
10700 VALUE
rb_sym_to_s(VALUE sym)10701 rb_sym_to_s(VALUE sym)
10702 {
10703 return str_new_shared(rb_cString, rb_sym2str(sym));
10704 }
10705
10706
10707 /*
10708 * call-seq:
10709 * sym.to_sym -> sym
10710 * sym.intern -> sym
10711 *
10712 * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding
10713 * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
10714 * in this case.
10715 */
10716
static VALUE
sym_to_sym(VALUE sym)
{
    /* the receiver is returned untouched */
    return sym;
}
10722
10723 MJIT_FUNC_EXPORTED VALUE
rb_sym_proc_call(ID mid,int argc,const VALUE * argv,VALUE passed_proc)10724 rb_sym_proc_call(ID mid, int argc, const VALUE *argv, VALUE passed_proc)
10725 {
10726 VALUE obj;
10727
10728 if (argc < 1) {
10729 rb_raise(rb_eArgError, "no receiver given");
10730 }
10731 obj = argv[0];
10732 return rb_funcall_with_block(obj, mid, argc - 1, argv + 1, passed_proc);
10733 }
10734
#if 0
/*
 * call-seq:
 *   sym.to_proc
 *
 * Returns a _Proc_ object which responds to the given method by _sym_.
 *
 *   (1..3).collect(&:to_s)  #=> ["1", "2", "3"]
 */

VALUE
rb_sym_to_proc(VALUE sym)
{
    /* Never compiled (#if 0) and intentionally empty.
     * NOTE(review): presumably kept only so the documentation comment
     * above has a definition to attach to; the real implementation is not
     * in this part of the file -- confirm where it lives. */
}
#endif
10750
10751 /*
10752 * call-seq:
10753 *
10754 * sym.succ
10755 *
10756 * Same as <code>sym.to_s.succ.intern</code>.
10757 */
10758
10759 static VALUE
sym_succ(VALUE sym)10760 sym_succ(VALUE sym)
10761 {
10762 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
10763 }
10764
10765 /*
10766 * call-seq:
10767 *
10768 * symbol <=> other_symbol -> -1, 0, +1, or nil
10769 *
10770 * Compares +symbol+ with +other_symbol+ after calling #to_s on each of the
10771 * symbols. Returns -1, 0, +1, or +nil+ depending on whether +symbol+ is
10772 * less than, equal to, or greater than +other_symbol+.
10773 *
10774 * +nil+ is returned if the two values are incomparable.
10775 *
10776 * See String#<=> for more information.
10777 */
10778
10779 static VALUE
sym_cmp(VALUE sym,VALUE other)10780 sym_cmp(VALUE sym, VALUE other)
10781 {
10782 if (!SYMBOL_P(other)) {
10783 return Qnil;
10784 }
10785 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
10786 }
10787
10788 /*
10789 * call-seq:
10790 * sym.casecmp(other_symbol) -> -1, 0, +1, or nil
10791 *
10792 * Case-insensitive version of <code>Symbol#<=></code>.
10793 * Currently, case-insensitivity only works on characters A-Z/a-z,
10794 * not all of Unicode. This is different from Symbol#casecmp?.
10795 *
10796 * :aBcDeF.casecmp(:abcde) #=> 1
10797 * :aBcDeF.casecmp(:abcdef) #=> 0
10798 * :aBcDeF.casecmp(:abcdefg) #=> -1
10799 * :abcdef.casecmp(:ABCDEF) #=> 0
10800 *
10801 * +nil+ is returned if the two symbols have incompatible encodings,
10802 * or if +other_symbol+ is not a symbol.
10803 *
10804 * :foo.casecmp(2) #=> nil
10805 * "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym.casecmp(:"\u{c4 d6 dc}") #=> nil
10806 */
10807
10808 static VALUE
sym_casecmp(VALUE sym,VALUE other)10809 sym_casecmp(VALUE sym, VALUE other)
10810 {
10811 if (!SYMBOL_P(other)) {
10812 return Qnil;
10813 }
10814 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
10815 }
10816
10817 /*
10818 * call-seq:
10819 * sym.casecmp?(other_symbol) -> true, false, or nil
10820 *
10821 * Returns +true+ if +sym+ and +other_symbol+ are equal after
10822 * Unicode case folding, +false+ if they are not equal.
10823 *
10824 * :aBcDeF.casecmp?(:abcde) #=> false
10825 * :aBcDeF.casecmp?(:abcdef) #=> true
10826 * :aBcDeF.casecmp?(:abcdefg) #=> false
10827 * :abcdef.casecmp?(:ABCDEF) #=> true
10828 * :"\u{e4 f6 fc}".casecmp?(:"\u{c4 d6 dc}") #=> true
10829 *
10830 * +nil+ is returned if the two symbols have incompatible encodings,
10831 * or if +other_symbol+ is not a symbol.
10832 *
10833 * :foo.casecmp?(2) #=> nil
10834 * "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym.casecmp?(:"\u{c4 d6 dc}") #=> nil
10835 */
10836
10837 static VALUE
sym_casecmp_p(VALUE sym,VALUE other)10838 sym_casecmp_p(VALUE sym, VALUE other)
10839 {
10840 if (!SYMBOL_P(other)) {
10841 return Qnil;
10842 }
10843 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
10844 }
10845
10846 /*
10847 * call-seq:
10848 * sym =~ obj -> integer or nil
10849 *
10850 * Returns <code>sym.to_s =~ obj</code>.
10851 */
10852
10853 static VALUE
sym_match(VALUE sym,VALUE other)10854 sym_match(VALUE sym, VALUE other)
10855 {
10856 return rb_str_match(rb_sym2str(sym), other);
10857 }
10858
10859 /*
10860 * call-seq:
10861 * sym.match(pattern) -> matchdata or nil
10862 * sym.match(pattern, pos) -> matchdata or nil
10863 *
10864 * Returns <code>sym.to_s.match</code>.
10865 */
10866
10867 static VALUE
sym_match_m(int argc,VALUE * argv,VALUE sym)10868 sym_match_m(int argc, VALUE *argv, VALUE sym)
10869 {
10870 return rb_str_match_m(argc, argv, rb_sym2str(sym));
10871 }
10872
10873 /*
10874 * call-seq:
10875 * sym.match?(pattern) -> true or false
10876 * sym.match?(pattern, pos) -> true or false
10877 *
10878 * Returns <code>sym.to_s.match?</code>.
10879 */
10880
10881 static VALUE
sym_match_m_p(int argc,VALUE * argv,VALUE sym)10882 sym_match_m_p(int argc, VALUE *argv, VALUE sym)
10883 {
10884 return rb_str_match_m_p(argc, argv, sym);
10885 }
10886
10887 /*
10888 * call-seq:
10889 * sym[idx] -> char
10890 * sym[b, n] -> string
10891 * sym.slice(idx) -> char
10892 * sym.slice(b, n) -> string
10893 *
10894 * Returns <code>sym.to_s[]</code>.
10895 */
10896
10897 static VALUE
sym_aref(int argc,VALUE * argv,VALUE sym)10898 sym_aref(int argc, VALUE *argv, VALUE sym)
10899 {
10900 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
10901 }
10902
10903 /*
10904 * call-seq:
10905 * sym.length -> integer
10906 * sym.size -> integer
10907 *
10908 * Same as <code>sym.to_s.length</code>.
10909 */
10910
10911 static VALUE
sym_length(VALUE sym)10912 sym_length(VALUE sym)
10913 {
10914 return rb_str_length(rb_sym2str(sym));
10915 }
10916
10917 /*
10918 * call-seq:
10919 * sym.empty? -> true or false
10920 *
10921 * Returns whether _sym_ is :"" or not.
10922 */
10923
10924 static VALUE
sym_empty(VALUE sym)10925 sym_empty(VALUE sym)
10926 {
10927 return rb_str_empty(rb_sym2str(sym));
10928 }
10929
10930 /*
10931 * call-seq:
10932 * sym.upcase -> symbol
10933 * sym.upcase([options]) -> symbol
10934 *
10935 * Same as <code>sym.to_s.upcase.intern</code>.
10936 */
10937
10938 static VALUE
sym_upcase(int argc,VALUE * argv,VALUE sym)10939 sym_upcase(int argc, VALUE *argv, VALUE sym)
10940 {
10941 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
10942 }
10943
10944 /*
10945 * call-seq:
10946 * sym.downcase -> symbol
10947 * sym.downcase([options]) -> symbol
10948 *
10949 * Same as <code>sym.to_s.downcase.intern</code>.
10950 */
10951
10952 static VALUE
sym_downcase(int argc,VALUE * argv,VALUE sym)10953 sym_downcase(int argc, VALUE *argv, VALUE sym)
10954 {
10955 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
10956 }
10957
10958 /*
10959 * call-seq:
10960 * sym.capitalize -> symbol
10961 * sym.capitalize([options]) -> symbol
10962 *
10963 * Same as <code>sym.to_s.capitalize.intern</code>.
10964 */
10965
10966 static VALUE
sym_capitalize(int argc,VALUE * argv,VALUE sym)10967 sym_capitalize(int argc, VALUE *argv, VALUE sym)
10968 {
10969 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
10970 }
10971
10972 /*
10973 * call-seq:
10974 * sym.swapcase -> symbol
10975 * sym.swapcase([options]) -> symbol
10976 *
10977 * Same as <code>sym.to_s.swapcase.intern</code>.
10978 */
10979
10980 static VALUE
sym_swapcase(int argc,VALUE * argv,VALUE sym)10981 sym_swapcase(int argc, VALUE *argv, VALUE sym)
10982 {
10983 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
10984 }
10985
10986 /*
10987 * call-seq:
10988 * sym.encoding -> encoding
10989 *
10990 * Returns the Encoding object that represents the encoding of _sym_.
10991 */
10992
10993 static VALUE
sym_encoding(VALUE sym)10994 sym_encoding(VALUE sym)
10995 {
10996 return rb_obj_encoding(rb_sym2str(sym));
10997 }
10998
10999 static VALUE
string_for_symbol(VALUE name)11000 string_for_symbol(VALUE name)
11001 {
11002 if (!RB_TYPE_P(name, T_STRING)) {
11003 VALUE tmp = rb_check_string_type(name);
11004 if (NIL_P(tmp)) {
11005 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol",
11006 name);
11007 }
11008 name = tmp;
11009 }
11010 return name;
11011 }
11012
11013 ID
rb_to_id(VALUE name)11014 rb_to_id(VALUE name)
11015 {
11016 if (SYMBOL_P(name)) {
11017 return SYM2ID(name);
11018 }
11019 name = string_for_symbol(name);
11020 return rb_intern_str(name);
11021 }
11022
11023 VALUE
rb_to_symbol(VALUE name)11024 rb_to_symbol(VALUE name)
11025 {
11026 if (SYMBOL_P(name)) {
11027 return name;
11028 }
11029 name = string_for_symbol(name);
11030 return rb_str_intern(name);
11031 }
11032
11033 /*
11034 * A <code>String</code> object holds and manipulates an arbitrary sequence of
11035 * bytes, typically representing characters. String objects may be created
11036 * using <code>String::new</code> or as literals.
11037 *
11038 * Because of aliasing issues, users of strings should be aware of the methods
11039 * that modify the contents of a <code>String</code> object. Typically,
11040 * methods with names ending in ``!'' modify their receiver, while those
11041 * without a ``!'' return a new <code>String</code>. However, there are
11042 * exceptions, such as <code>String#[]=</code>.
11043 *
11044 */
11045
void
Init_String(void)
{
    /*
     * Defines the String and Symbol classes, registers their methods, and
     * binds the $; / $-F global variables to rb_fs.
     *
     * In each rb_define_method() call the last argument is the arity of the
     * C function: functions registered with -1 take (argc, argv, self), as
     * sym_swapcase does, while a non-negative count is the number of
     * arguments besides the receiver, as with sym_encoding (0).
     */

    /* From here on rb_intern() expands to rb_intern_const(); every call
     * below passes a string literal. */
#undef rb_intern
#define rb_intern(str) rb_intern_const(str)

    rb_cString = rb_define_class("String", rb_cObject);
    /* Run fstring_set_class_i over every entry of the VM fstring table,
     * passing the freshly created String class.
     * NOTE(review): presumably this assigns the class to strings created
     * before rb_cString existed -- confirm against fstring_set_class_i. */
    assert(rb_vm_fstring_table());
    st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
    rb_include_module(rb_cString, rb_mComparable);
    rb_define_alloc_func(rb_cString, empty_str_alloc);
    rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
    rb_define_method(rb_cString, "initialize", rb_str_init, -1);
    rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
    rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
    rb_define_method(rb_cString, "==", rb_str_equal, 1);
    rb_define_method(rb_cString, "===", rb_str_equal, 1);
    rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
    rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
    rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
    rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
    rb_define_method(rb_cString, "+", rb_str_plus, 1);
    rb_define_method(rb_cString, "*", rb_str_times, 1);
    rb_define_method(rb_cString, "%", rb_str_format_m, 1);
    rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
    rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
    rb_define_method(rb_cString, "insert", rb_str_insert, 2);
    rb_define_method(rb_cString, "length", rb_str_length, 0);
    rb_define_method(rb_cString, "size", rb_str_length, 0);
    rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
    rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
    rb_define_method(rb_cString, "=~", rb_str_match, 1);
    rb_define_method(rb_cString, "match", rb_str_match_m, -1);
    rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
    rb_define_method(rb_cString, "succ", rb_str_succ, 0);
    rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
    rb_define_method(rb_cString, "next", rb_str_succ, 0);
    rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
    rb_define_method(rb_cString, "upto", rb_str_upto, -1);
    rb_define_method(rb_cString, "index", rb_str_index_m, -1);
    rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
    rb_define_method(rb_cString, "replace", rb_str_replace, 1);
    rb_define_method(rb_cString, "clear", rb_str_clear, 0);
    rb_define_method(rb_cString, "chr", rb_str_chr, 0);
    rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
    rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
    rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
    rb_define_method(rb_cString, "scrub", str_scrub, -1);
    rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
    rb_define_method(rb_cString, "freeze", rb_str_freeze, 0);
    rb_define_method(rb_cString, "+@", str_uplus, 0);
    rb_define_method(rb_cString, "-@", str_uminus, 0);

    /* Conversions and printable representations. */
    rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
    rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
    rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
    rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
    rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
    rb_define_method(rb_cString, "dump", rb_str_dump, 0);
    rb_define_method(rb_cString, "undump", str_undump, 0);

    /* Intern once and keep the resulting symbols in file-level variables.
     * NOTE(review): these look like the option symbols accepted by the
     * case-mapping methods registered just below -- confirm at their use. */
    sym_ascii = ID2SYM(rb_intern("ascii"));
    sym_turkic = ID2SYM(rb_intern("turkic"));
    sym_lithuanian = ID2SYM(rb_intern("lithuanian"));
    sym_fold = ID2SYM(rb_intern("fold"));

    /* Case mapping: non-destructive forms first, then the "!" forms. */
    rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
    rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
    rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
    rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);

    rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
    rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
    rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
    rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);

    rb_define_method(rb_cString, "hex", rb_str_hex, 0);
    rb_define_method(rb_cString, "oct", rb_str_oct, 0);
    rb_define_method(rb_cString, "split", rb_str_split_m, -1);
    rb_define_method(rb_cString, "lines", rb_str_lines, -1);
    rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
    rb_define_method(rb_cString, "chars", rb_str_chars, 0);
    rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
    rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
    rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
    rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
    rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
    rb_define_method(rb_cString, "<<", rb_str_concat, 1);
    rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
    rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
    rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
    rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
    rb_define_method(rb_cString, "ord", rb_str_ord, 0);

    rb_define_method(rb_cString, "include?", rb_str_include, 1);
    rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
    rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);

    rb_define_method(rb_cString, "scan", rb_str_scan, 1);

    rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
    rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
    rb_define_method(rb_cString, "center", rb_str_center, -1);

    /* Substitution and trimming: non-destructive forms, then "!" forms. */
    rb_define_method(rb_cString, "sub", rb_str_sub, -1);
    rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
    rb_define_method(rb_cString, "chop", rb_str_chop, 0);
    rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
    rb_define_method(rb_cString, "strip", rb_str_strip, 0);
    rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
    rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
    rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
    rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);

    rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
    rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
    rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
    rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
    rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
    rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
    rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
    rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
    rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);

    /* Character-set operations: non-destructive forms, then "!" forms. */
    rb_define_method(rb_cString, "tr", rb_str_tr, 2);
    rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
    rb_define_method(rb_cString, "delete", rb_str_delete, -1);
    rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
    rb_define_method(rb_cString, "count", rb_str_count, -1);

    rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
    rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
    rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
    rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);

    /* Iteration. */
    rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
    rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
    rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
    rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
    rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);

    rb_define_method(rb_cString, "sum", rb_str_sum, -1);

    /* "slice" shares its implementation with "[]" registered above. */
    rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
    rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);

    rb_define_method(rb_cString, "partition", rb_str_partition, 1);
    rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);

    /* Encoding-related methods. */
    rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
    rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
    rb_define_method(rb_cString, "b", rb_str_b, 0);
    rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
    rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);

    /* define UnicodeNormalize module here so that we don't have to look it up */
    mUnicodeNormalize = rb_define_module("UnicodeNormalize");
    id_normalize = rb_intern("normalize");
    id_normalized_p = rb_intern("normalized?");

    rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
    rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
    rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);

    /* rb_fs starts out as nil and is exposed under two global names, $; and
     * $-F; both share the same storage, with no custom getter (0) and
     * rb_fs_setter as the setter hook.  Its address is also registered with
     * the GC via rb_gc_register_address(). */
    rb_fs = Qnil;
    rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
    rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
    rb_gc_register_address(&rb_fs);

    rb_cSymbol = rb_define_class("Symbol", rb_cObject);
    rb_include_module(rb_cSymbol, rb_mComparable);
    /* Symbol gets no allocator, and "new" is undefined on
     * CLASS_OF(rb_cSymbol). */
    rb_undef_alloc_func(rb_cSymbol);
    rb_undef_method(CLASS_OF(rb_cSymbol), "new");
    rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in symbol.c */

    rb_define_method(rb_cSymbol, "==", sym_equal, 1);
    rb_define_method(rb_cSymbol, "===", sym_equal, 1);
    rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
    rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
    rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
    rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
    rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
    rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0);
    rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
    rb_define_method(rb_cSymbol, "next", sym_succ, 0);

    rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
    rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
    rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
    rb_define_method(rb_cSymbol, "=~", sym_match, 1);

    rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
    rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
    rb_define_method(rb_cSymbol, "length", sym_length, 0);
    rb_define_method(rb_cSymbol, "size", sym_length, 0);
    rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
    rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
    rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);

    /* Case mapping on symbols, e.g. sym_swapcase, delegates to the String
     * implementation and re-interns the result. */
    rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
    rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
    rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
    rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);

    rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
}
11252