1 /*
2 ** string.c - String class
3 **
4 ** See Copyright Notice in mruby.h
5 */
6
7 #ifdef _MSC_VER
8 # define _CRT_NONSTDC_NO_DEPRECATE
9 #endif
10
11 #ifndef MRB_WITHOUT_FLOAT
12 #include <float.h>
13 #endif
14 #include <limits.h>
15 #include <stddef.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include <mruby.h>
19 #include <mruby/array.h>
20 #include <mruby/class.h>
21 #include <mruby/range.h>
22 #include <mruby/string.h>
23 #include <mruby/numeric.h>
24 #include <mruby/re.h>
25
/*
 * Reference-counted header for a character buffer that is referenced
 * by more than one String object.
 */
typedef struct mrb_shared_string {
  mrb_bool nofree : 1;  /* when set, ptr is not passed to mrb_free() on release */
  int refcnt;           /* number of strings currently referencing this header */
  char *ptr;            /* start of the shared character buffer */
  mrb_int len;          /* length of the buffer recorded when it became shared */
} mrb_shared_string;
32
/*
 * The 36 characters '0'-'9' followed by 'a'-'z'.
 * NOTE(review): presumably indexed by digit value when converting
 * numbers to text in radixes up to 36 -- confirm against its users.
 */
const char mrb_digitmap[] = "0123456789abcdefghijklmnopqrstuvwxyz";
34
/* allocate a new MRB_TT_STRING object whose class is mrb->string_class */
#define mrb_obj_alloc_string(mrb) ((struct RString*)mrb_obj_alloc((mrb), MRB_TT_STRING, (mrb)->string_class))
36
37 static struct RString*
str_new_static(mrb_state * mrb,const char * p,size_t len)38 str_new_static(mrb_state *mrb, const char *p, size_t len)
39 {
40 struct RString *s;
41
42 if (len >= MRB_INT_MAX) {
43 mrb_raise(mrb, E_ARGUMENT_ERROR, "string size too big");
44 }
45 s = mrb_obj_alloc_string(mrb);
46 s->as.heap.len = (mrb_int)len;
47 s->as.heap.aux.capa = 0; /* nofree */
48 s->as.heap.ptr = (char *)p;
49 s->flags = MRB_STR_NOFREE;
50
51 return s;
52 }
53
54 static struct RString*
str_new(mrb_state * mrb,const char * p,size_t len)55 str_new(mrb_state *mrb, const char *p, size_t len)
56 {
57 struct RString *s;
58
59 if (p && mrb_ro_data_p(p)) {
60 return str_new_static(mrb, p, len);
61 }
62 s = mrb_obj_alloc_string(mrb);
63 if (len <= RSTRING_EMBED_LEN_MAX) {
64 RSTR_SET_EMBED_FLAG(s);
65 RSTR_SET_EMBED_LEN(s, len);
66 if (p) {
67 memcpy(s->as.ary, p, len);
68 }
69 }
70 else {
71 if (len >= MRB_INT_MAX) {
72 mrb_raise(mrb, E_ARGUMENT_ERROR, "string size too big");
73 }
74 s->as.heap.ptr = (char *)mrb_malloc(mrb, len+1);
75 s->as.heap.len = (mrb_int)len;
76 s->as.heap.aux.capa = (mrb_int)len;
77 if (p) {
78 memcpy(s->as.heap.ptr, p, len);
79 }
80 }
81 RSTR_PTR(s)[len] = '\0';
82 return s;
83 }
84
85 static inline void
str_with_class(mrb_state * mrb,struct RString * s,mrb_value obj)86 str_with_class(mrb_state *mrb, struct RString *s, mrb_value obj)
87 {
88 s->c = mrb_str_ptr(obj)->c;
89 }
90
91 static mrb_value
mrb_str_new_empty(mrb_state * mrb,mrb_value str)92 mrb_str_new_empty(mrb_state *mrb, mrb_value str)
93 {
94 struct RString *s = str_new(mrb, 0, 0);
95
96 str_with_class(mrb, s, str);
97 return mrb_obj_value(s);
98 }
99
100 MRB_API mrb_value
mrb_str_new_capa(mrb_state * mrb,size_t capa)101 mrb_str_new_capa(mrb_state *mrb, size_t capa)
102 {
103 struct RString *s;
104
105 s = mrb_obj_alloc_string(mrb);
106
107 if (capa >= MRB_INT_MAX) {
108 mrb_raise(mrb, E_ARGUMENT_ERROR, "string capacity size too big");
109 }
110 s->as.heap.len = 0;
111 s->as.heap.aux.capa = (mrb_int)capa;
112 s->as.heap.ptr = (char *)mrb_malloc(mrb, capa+1);
113 RSTR_PTR(s)[0] = '\0';
114
115 return mrb_obj_value(s);
116 }
117
/* smallest capacity mrb_str_buf_new() will request; may be predefined by the build */
#ifndef MRB_STR_BUF_MIN_SIZE
# define MRB_STR_BUF_MIN_SIZE 128
#endif
121
122 MRB_API mrb_value
mrb_str_buf_new(mrb_state * mrb,size_t capa)123 mrb_str_buf_new(mrb_state *mrb, size_t capa)
124 {
125 if (capa < MRB_STR_BUF_MIN_SIZE) {
126 capa = MRB_STR_BUF_MIN_SIZE;
127 }
128 return mrb_str_new_capa(mrb, capa);
129 }
130
131 static void
resize_capa(mrb_state * mrb,struct RString * s,size_t capacity)132 resize_capa(mrb_state *mrb, struct RString *s, size_t capacity)
133 {
134 #if SIZE_MAX > MRB_INT_MAX
135 mrb_assert(capacity < MRB_INT_MAX);
136 #endif
137 if (RSTR_EMBED_P(s)) {
138 if (RSTRING_EMBED_LEN_MAX < capacity) {
139 char *const tmp = (char *)mrb_malloc(mrb, capacity+1);
140 const mrb_int len = RSTR_EMBED_LEN(s);
141 memcpy(tmp, s->as.ary, len);
142 RSTR_UNSET_EMBED_FLAG(s);
143 s->as.heap.ptr = tmp;
144 s->as.heap.len = len;
145 s->as.heap.aux.capa = (mrb_int)capacity;
146 }
147 }
148 else {
149 s->as.heap.ptr = (char*)mrb_realloc(mrb, RSTR_PTR(s), capacity+1);
150 s->as.heap.aux.capa = (mrb_int)capacity;
151 }
152 }
153
154 MRB_API mrb_value
mrb_str_new(mrb_state * mrb,const char * p,size_t len)155 mrb_str_new(mrb_state *mrb, const char *p, size_t len)
156 {
157 return mrb_obj_value(str_new(mrb, p, len));
158 }
159
160 MRB_API mrb_value
mrb_str_new_cstr(mrb_state * mrb,const char * p)161 mrb_str_new_cstr(mrb_state *mrb, const char *p)
162 {
163 struct RString *s;
164 size_t len;
165
166 if (p) {
167 len = strlen(p);
168 }
169 else {
170 len = 0;
171 }
172
173 s = str_new(mrb, p, len);
174
175 return mrb_obj_value(s);
176 }
177
178 MRB_API mrb_value
mrb_str_new_static(mrb_state * mrb,const char * p,size_t len)179 mrb_str_new_static(mrb_state *mrb, const char *p, size_t len)
180 {
181 struct RString *s = str_new_static(mrb, p, len);
182 return mrb_obj_value(s);
183 }
184
185 static void
str_decref(mrb_state * mrb,mrb_shared_string * shared)186 str_decref(mrb_state *mrb, mrb_shared_string *shared)
187 {
188 shared->refcnt--;
189 if (shared->refcnt == 0) {
190 if (!shared->nofree) {
191 mrb_free(mrb, shared->ptr);
192 }
193 mrb_free(mrb, shared);
194 }
195 }
196
197 void
mrb_gc_free_str(mrb_state * mrb,struct RString * str)198 mrb_gc_free_str(mrb_state *mrb, struct RString *str)
199 {
200 if (RSTR_EMBED_P(str))
201 /* no code */;
202 else if (RSTR_SHARED_P(str))
203 str_decref(mrb, str->as.heap.aux.shared);
204 else if (!RSTR_NOFREE_P(str) && !RSTR_FSHARED_P(str))
205 mrb_free(mrb, str->as.heap.ptr);
206 }
207
208 #ifdef MRB_UTF8_STRING
/*
 * Sequence length looked up by the first byte of a character:
 *   0x00-0xBF -> 1, 0xC0-0xDF -> 2, 0xE0-0xEF -> 3, 0xF0-0xF4 -> 4,
 *   0xF5-0xFF -> 1.
 */
static const char utf8len_codepage[256] =
{
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1,
};
220
221 static mrb_int
utf8len(const char * p,const char * e)222 utf8len(const char* p, const char* e)
223 {
224 mrb_int len;
225 mrb_int i;
226
227 len = utf8len_codepage[(unsigned char)*p];
228 if (p + len > e) return 1;
229 for (i = 1; i < len; ++i)
230 if ((p[i] & 0xc0) != 0x80)
231 return 1;
232 return len;
233 }
234
235 mrb_int
mrb_utf8_len(const char * str,mrb_int byte_len)236 mrb_utf8_len(const char *str, mrb_int byte_len)
237 {
238 mrb_int total = 0;
239 const char *p = str;
240 const char *e = p + byte_len;
241
242 while (p < e) {
243 p += utf8len(p, e);
244 total++;
245 }
246 return total;
247 }
248
249 static mrb_int
utf8_strlen(mrb_value str)250 utf8_strlen(mrb_value str)
251 {
252 mrb_int byte_len = RSTRING_LEN(str);
253
254 if (RSTRING(str)->flags & MRB_STR_NO_UTF) {
255 return byte_len;
256 }
257 else {
258 mrb_int utf8_len = mrb_utf8_len(RSTRING_PTR(str), byte_len);
259 if (byte_len == utf8_len) RSTRING(str)->flags |= MRB_STR_NO_UTF;
260 return utf8_len;
261 }
262 }
263
/* length of a string value in characters (UTF-8 build) */
#define RSTRING_CHAR_LEN(s) utf8_strlen(s)
265
266 /* map character index to byte offset index */
267 static mrb_int
chars2bytes(mrb_value s,mrb_int off,mrb_int idx)268 chars2bytes(mrb_value s, mrb_int off, mrb_int idx)
269 {
270 mrb_int i, b, n;
271 const char *p = RSTRING_PTR(s) + off;
272 const char *e = RSTRING_END(s);
273
274 for (b=i=0; p<e && i<idx; i++) {
275 n = utf8len(p, e);
276 b += n;
277 p += n;
278 }
279 return b;
280 }
281
282 /* map byte offset to character index */
283 static mrb_int
bytes2chars(char * p,mrb_int bi)284 bytes2chars(char *p, mrb_int bi)
285 {
286 mrb_int i, b, n;
287
288 for (b=i=0; b<bi; i++) {
289 n = utf8len_codepage[(unsigned char)*p];
290 b += n;
291 p += n;
292 }
293 if (b != bi) return -1;
294 return i;
295 }
296
/*
 * Return nil from the enclosing function when `pos` is negative.
 * NOTE(review): presumably applied to bytes2chars() results, which are
 * -1 for offsets that are not on a character boundary -- confirm at call sites.
 */
#define BYTES_ALIGN_CHECK(pos) if (pos < 0) return mrb_nil_value();
#else
/* without MRB_UTF8_STRING characters and bytes coincide: the helpers become identities */
#define RSTRING_CHAR_LEN(s) RSTRING_LEN(s)
#define chars2bytes(p, off, ci) (ci)
#define bytes2chars(p, bi) (bi)
#define BYTES_ALIGN_CHECK(pos)
#endif
304
305 static inline mrb_int
mrb_memsearch_qs(const unsigned char * xs,mrb_int m,const unsigned char * ys,mrb_int n)306 mrb_memsearch_qs(const unsigned char *xs, mrb_int m, const unsigned char *ys, mrb_int n)
307 {
308 const unsigned char *x = xs, *xe = xs + m;
309 const unsigned char *y = ys;
310 int i;
311 ptrdiff_t qstable[256];
312
313 /* Preprocessing */
314 for (i = 0; i < 256; ++i)
315 qstable[i] = m + 1;
316 for (; x < xe; ++x)
317 qstable[*x] = xe - x;
318 /* Searching */
319 for (; y + m <= ys + n; y += *(qstable + y[m])) {
320 if (*xs == *y && memcmp(xs, y, m) == 0)
321 return (mrb_int)(y - ys);
322 }
323 return -1;
324 }
325
326 static mrb_int
mrb_memsearch(const void * x0,mrb_int m,const void * y0,mrb_int n)327 mrb_memsearch(const void *x0, mrb_int m, const void *y0, mrb_int n)
328 {
329 const unsigned char *x = (const unsigned char *)x0, *y = (const unsigned char *)y0;
330
331 if (m > n) return -1;
332 else if (m == n) {
333 return memcmp(x0, y0, m) == 0 ? 0 : -1;
334 }
335 else if (m < 1) {
336 return 0;
337 }
338 else if (m == 1) {
339 const unsigned char *ys = (const unsigned char *)memchr(y, *x, n);
340
341 if (ys)
342 return (mrb_int)(ys - y);
343 else
344 return -1;
345 }
346 return mrb_memsearch_qs((const unsigned char *)x0, m, (const unsigned char *)y0, n);
347 }
348
349 static void
str_make_shared(mrb_state * mrb,struct RString * orig,struct RString * s)350 str_make_shared(mrb_state *mrb, struct RString *orig, struct RString *s)
351 {
352 mrb_shared_string *shared;
353 mrb_int len = RSTR_LEN(orig);
354
355 mrb_assert(!RSTR_EMBED_P(orig));
356 if (RSTR_SHARED_P(orig)) {
357 shared = orig->as.heap.aux.shared;
358 shared->refcnt++;
359 s->as.heap.ptr = orig->as.heap.ptr;
360 s->as.heap.len = len;
361 s->as.heap.aux.shared = shared;
362 RSTR_SET_SHARED_FLAG(s);
363 RSTR_UNSET_EMBED_FLAG(s);
364 }
365 else if (RSTR_FSHARED_P(orig)) {
366 struct RString *fs;
367
368 fs = orig->as.heap.aux.fshared;
369 s->as.heap.ptr = orig->as.heap.ptr;
370 s->as.heap.len = len;
371 s->as.heap.aux.fshared = fs;
372 RSTR_SET_FSHARED_FLAG(s);
373 RSTR_UNSET_EMBED_FLAG(s);
374 }
375 else if (MRB_FROZEN_P(orig) && !RSTR_POOL_P(orig)) {
376 s->as.heap.ptr = orig->as.heap.ptr;
377 s->as.heap.len = len;
378 s->as.heap.aux.fshared = orig;
379 RSTR_SET_FSHARED_FLAG(s);
380 RSTR_UNSET_EMBED_FLAG(s);
381 }
382 else {
383 shared = (mrb_shared_string *)mrb_malloc(mrb, sizeof(mrb_shared_string));
384 shared->refcnt = 2;
385 shared->nofree = !!RSTR_NOFREE_P(orig);
386 if (!shared->nofree && orig->as.heap.aux.capa > orig->as.heap.len) {
387 shared->ptr = (char *)mrb_realloc(mrb, orig->as.heap.ptr, len+1);
388 orig->as.heap.ptr = shared->ptr;
389 }
390 else {
391 shared->ptr = orig->as.heap.ptr;
392 }
393 orig->as.heap.aux.shared = shared;
394 RSTR_SET_SHARED_FLAG(orig);
395 shared->len = len;
396 s->as.heap.aux.shared = shared;
397 s->as.heap.ptr = shared->ptr;
398 s->as.heap.len = len;
399 RSTR_SET_SHARED_FLAG(s);
400 RSTR_UNSET_EMBED_FLAG(s);
401 }
402 }
403
404 static mrb_value
byte_subseq(mrb_state * mrb,mrb_value str,mrb_int beg,mrb_int len)405 byte_subseq(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len)
406 {
407 struct RString *orig, *s;
408
409 orig = mrb_str_ptr(str);
410 if (RSTR_EMBED_P(orig) || RSTR_LEN(orig) == 0 || len <= RSTRING_EMBED_LEN_MAX) {
411 s = str_new(mrb, RSTR_PTR(orig)+beg, len);
412 }
413 else {
414 s = mrb_obj_alloc_string(mrb);
415 str_make_shared(mrb, orig, s);
416 s->as.heap.ptr += beg;
417 s->as.heap.len = len;
418 }
419 return mrb_obj_value(s);
420 }
#ifdef MRB_UTF8_STRING
/*
 * Character-based variant of byte_subseq(): `beg` and `len` are given
 * in characters and converted to byte units first.
 */
static inline mrb_value
str_subseq(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len)
{
  const mrb_int byte_beg = chars2bytes(str, 0, beg);
  const mrb_int byte_len = chars2bytes(str, byte_beg, len);

  return byte_subseq(mrb, str, byte_beg, byte_len);
}
#else
/* characters are bytes in this configuration */
#define str_subseq(mrb, str, beg, len) byte_subseq(mrb, str, beg, len)
#endif
433
434 static mrb_value
str_substr(mrb_state * mrb,mrb_value str,mrb_int beg,mrb_int len)435 str_substr(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len)
436 {
437 mrb_int clen = RSTRING_CHAR_LEN(str);
438
439 if (len < 0) return mrb_nil_value();
440 if (clen == 0) {
441 len = 0;
442 }
443 else if (beg < 0) {
444 beg = clen + beg;
445 }
446 if (beg > clen) return mrb_nil_value();
447 if (beg < 0) {
448 beg += clen;
449 if (beg < 0) return mrb_nil_value();
450 }
451 if (len > clen - beg)
452 len = clen - beg;
453 if (len <= 0) {
454 len = 0;
455 }
456 return str_subseq(mrb, str, beg, len);
457 }
458
459 MRB_API mrb_int
mrb_str_index(mrb_state * mrb,mrb_value str,const char * sptr,mrb_int slen,mrb_int offset)460 mrb_str_index(mrb_state *mrb, mrb_value str, const char *sptr, mrb_int slen, mrb_int offset)
461 {
462 mrb_int pos;
463 char *s;
464 mrb_int len;
465
466 len = RSTRING_LEN(str);
467 if (offset < 0) {
468 offset += len;
469 if (offset < 0) return -1;
470 }
471 if (len - offset < slen) return -1;
472 s = RSTRING_PTR(str);
473 if (offset) {
474 s += offset;
475 }
476 if (slen == 0) return offset;
477 /* need proceed one character at a time */
478 len = RSTRING_LEN(str) - offset;
479 pos = mrb_memsearch(sptr, slen, s, len);
480 if (pos < 0) return pos;
481 return pos + offset;
482 }
483
484 static mrb_int
str_index_str(mrb_state * mrb,mrb_value str,mrb_value str2,mrb_int offset)485 str_index_str(mrb_state *mrb, mrb_value str, mrb_value str2, mrb_int offset)
486 {
487 const char *ptr;
488 mrb_int len;
489
490 ptr = RSTRING_PTR(str2);
491 len = RSTRING_LEN(str2);
492
493 return mrb_str_index(mrb, str, ptr, len, offset);
494 }
495
496 static void
check_frozen(mrb_state * mrb,struct RString * s)497 check_frozen(mrb_state *mrb, struct RString *s)
498 {
499 if (MRB_FROZEN_P(s)) {
500 mrb_raise(mrb, E_FROZEN_ERROR, "can't modify frozen string");
501 }
502 }
503
504 static mrb_value
str_replace(mrb_state * mrb,struct RString * s1,struct RString * s2)505 str_replace(mrb_state *mrb, struct RString *s1, struct RString *s2)
506 {
507 mrb_int len;
508
509 check_frozen(mrb, s1);
510 if (s1 == s2) return mrb_obj_value(s1);
511 s1->flags &= ~MRB_STR_NO_UTF;
512 s1->flags |= s2->flags&MRB_STR_NO_UTF;
513 len = RSTR_LEN(s2);
514 if (RSTR_SHARED_P(s1)) {
515 str_decref(mrb, s1->as.heap.aux.shared);
516 RSTR_UNSET_SHARED_FLAG(s1);
517 }
518 else if (!RSTR_EMBED_P(s1) && !RSTR_NOFREE_P(s1) && !RSTR_FSHARED_P(s1)
519 && s1->as.heap.ptr) {
520 mrb_free(mrb, s1->as.heap.ptr);
521 }
522
523 RSTR_UNSET_FSHARED_FLAG(s1);
524 RSTR_UNSET_NOFREE_FLAG(s1);
525 if (len <= RSTRING_EMBED_LEN_MAX) {
526 RSTR_UNSET_SHARED_FLAG(s1);
527 RSTR_UNSET_FSHARED_FLAG(s1);
528 RSTR_SET_EMBED_FLAG(s1);
529 memcpy(s1->as.ary, RSTR_PTR(s2), len);
530 RSTR_SET_EMBED_LEN(s1, len);
531 }
532 else {
533 str_make_shared(mrb, s2, s1);
534 }
535
536 return mrb_obj_value(s1);
537 }
538
539 static mrb_int
str_rindex(mrb_state * mrb,mrb_value str,mrb_value sub,mrb_int pos)540 str_rindex(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int pos)
541 {
542 char *s, *sbeg, *t;
543 struct RString *ps = mrb_str_ptr(str);
544 mrb_int len = RSTRING_LEN(sub);
545
546 /* substring longer than string */
547 if (RSTR_LEN(ps) < len) return -1;
548 if (RSTR_LEN(ps) - pos < len) {
549 pos = RSTR_LEN(ps) - len;
550 }
551 sbeg = RSTR_PTR(ps);
552 s = RSTR_PTR(ps) + pos;
553 t = RSTRING_PTR(sub);
554 if (len) {
555 while (sbeg <= s) {
556 if (memcmp(s, t, len) == 0) {
557 return (mrb_int)(s - RSTR_PTR(ps));
558 }
559 s--;
560 }
561 return -1;
562 }
563 else {
564 return pos;
565 }
566 }
567
568 MRB_API mrb_int
mrb_str_strlen(mrb_state * mrb,struct RString * s)569 mrb_str_strlen(mrb_state *mrb, struct RString *s)
570 {
571 mrb_int i, max = RSTR_LEN(s);
572 char *p = RSTR_PTR(s);
573
574 if (!p) return 0;
575 for (i=0; i<max; i++) {
576 if (p[i] == '\0') {
577 mrb_raise(mrb, E_ARGUMENT_ERROR, "string contains null byte");
578 }
579 }
580 return max;
581 }
582
583 #ifdef _WIN32
584 #include <windows.h>
585
586 char*
mrb_utf8_from_locale(const char * str,int len)587 mrb_utf8_from_locale(const char *str, int len)
588 {
589 wchar_t* wcsp;
590 char* mbsp;
591 int mbssize, wcssize;
592
593 if (len == 0)
594 return strdup("");
595 if (len == -1)
596 len = (int)strlen(str);
597 wcssize = MultiByteToWideChar(GetACP(), 0, str, len, NULL, 0);
598 wcsp = (wchar_t*) malloc((wcssize + 1) * sizeof(wchar_t));
599 if (!wcsp)
600 return NULL;
601 wcssize = MultiByteToWideChar(GetACP(), 0, str, len, wcsp, wcssize + 1);
602 wcsp[wcssize] = 0;
603
604 mbssize = WideCharToMultiByte(CP_UTF8, 0, (LPCWSTR) wcsp, -1, NULL, 0, NULL, NULL);
605 mbsp = (char*) malloc((mbssize + 1));
606 if (!mbsp) {
607 free(wcsp);
608 return NULL;
609 }
610 mbssize = WideCharToMultiByte(CP_UTF8, 0, (LPCWSTR) wcsp, -1, mbsp, mbssize, NULL, NULL);
611 mbsp[mbssize] = 0;
612 free(wcsp);
613 return mbsp;
614 }
615
616 char*
mrb_locale_from_utf8(const char * utf8,int len)617 mrb_locale_from_utf8(const char *utf8, int len)
618 {
619 wchar_t* wcsp;
620 char* mbsp;
621 int mbssize, wcssize;
622
623 if (len == 0)
624 return strdup("");
625 if (len == -1)
626 len = (int)strlen(utf8);
627 wcssize = MultiByteToWideChar(CP_UTF8, 0, utf8, len, NULL, 0);
628 wcsp = (wchar_t*) malloc((wcssize + 1) * sizeof(wchar_t));
629 if (!wcsp)
630 return NULL;
631 wcssize = MultiByteToWideChar(CP_UTF8, 0, utf8, len, wcsp, wcssize + 1);
632 wcsp[wcssize] = 0;
633 mbssize = WideCharToMultiByte(GetACP(), 0, (LPCWSTR) wcsp, -1, NULL, 0, NULL, NULL);
634 mbsp = (char*) malloc((mbssize + 1));
635 if (!mbsp) {
636 free(wcsp);
637 return NULL;
638 }
639 mbssize = WideCharToMultiByte(GetACP(), 0, (LPCWSTR) wcsp, -1, mbsp, mbssize, NULL, NULL);
640 mbsp[mbssize] = 0;
641 free(wcsp);
642 return mbsp;
643 }
644 #endif
645
646 MRB_API void
mrb_str_modify(mrb_state * mrb,struct RString * s)647 mrb_str_modify(mrb_state *mrb, struct RString *s)
648 {
649 check_frozen(mrb, s);
650 s->flags &= ~MRB_STR_NO_UTF;
651 if (RSTR_SHARED_P(s)) {
652 mrb_shared_string *shared = s->as.heap.aux.shared;
653
654 if (shared->nofree == 0 && shared->refcnt == 1 && s->as.heap.ptr == shared->ptr) {
655 s->as.heap.ptr = shared->ptr;
656 s->as.heap.aux.capa = shared->len;
657 RSTR_PTR(s)[s->as.heap.len] = '\0';
658 mrb_free(mrb, shared);
659 }
660 else {
661 char *ptr, *p;
662 mrb_int len;
663
664 p = RSTR_PTR(s);
665 len = s->as.heap.len;
666 if (len < RSTRING_EMBED_LEN_MAX) {
667 RSTR_SET_EMBED_FLAG(s);
668 RSTR_SET_EMBED_LEN(s, len);
669 ptr = RSTR_PTR(s);
670 }
671 else {
672 ptr = (char *)mrb_malloc(mrb, (size_t)len + 1);
673 s->as.heap.ptr = ptr;
674 s->as.heap.aux.capa = len;
675 }
676 if (p) {
677 memcpy(ptr, p, len);
678 }
679 ptr[len] = '\0';
680 str_decref(mrb, shared);
681 }
682 RSTR_UNSET_SHARED_FLAG(s);
683 return;
684 }
685 if (RSTR_NOFREE_P(s) || RSTR_FSHARED_P(s)) {
686 char *p = s->as.heap.ptr;
687 mrb_int len = s->as.heap.len;
688
689 RSTR_UNSET_FSHARED_FLAG(s);
690 RSTR_UNSET_NOFREE_FLAG(s);
691 RSTR_UNSET_FSHARED_FLAG(s);
692 if (len < RSTRING_EMBED_LEN_MAX) {
693 RSTR_SET_EMBED_FLAG(s);
694 RSTR_SET_EMBED_LEN(s, len);
695 }
696 else {
697 s->as.heap.ptr = (char *)mrb_malloc(mrb, (size_t)len+1);
698 s->as.heap.aux.capa = len;
699 }
700 if (p) {
701 memcpy(RSTR_PTR(s), p, len);
702 }
703 RSTR_PTR(s)[len] = '\0';
704 return;
705 }
706 }
707
708 MRB_API mrb_value
mrb_str_resize(mrb_state * mrb,mrb_value str,mrb_int len)709 mrb_str_resize(mrb_state *mrb, mrb_value str, mrb_int len)
710 {
711 mrb_int slen;
712 struct RString *s = mrb_str_ptr(str);
713
714 if (len < 0) {
715 mrb_raise(mrb, E_ARGUMENT_ERROR, "negative (or overflowed) string size");
716 }
717 mrb_str_modify(mrb, s);
718 slen = RSTR_LEN(s);
719 if (len != slen) {
720 if (slen < len || slen - len > 256) {
721 resize_capa(mrb, s, len);
722 }
723 RSTR_SET_LEN(s, len);
724 RSTR_PTR(s)[len] = '\0'; /* sentinel */
725 }
726 return str;
727 }
728
729 MRB_API char*
mrb_str_to_cstr(mrb_state * mrb,mrb_value str0)730 mrb_str_to_cstr(mrb_state *mrb, mrb_value str0)
731 {
732 struct RString *s;
733
734 if (!mrb_string_p(str0)) {
735 mrb_raise(mrb, E_TYPE_ERROR, "expected String");
736 }
737
738 s = str_new(mrb, RSTRING_PTR(str0), RSTRING_LEN(str0));
739 if ((strlen(RSTR_PTR(s)) ^ RSTR_LEN(s)) != 0) {
740 mrb_raise(mrb, E_ARGUMENT_ERROR, "string contains null byte");
741 }
742 return RSTR_PTR(s);
743 }
744
745 MRB_API void
mrb_str_concat(mrb_state * mrb,mrb_value self,mrb_value other)746 mrb_str_concat(mrb_state *mrb, mrb_value self, mrb_value other)
747 {
748 other = mrb_str_to_str(mrb, other);
749 mrb_str_cat_str(mrb, self, other);
750 }
751
752 MRB_API mrb_value
mrb_str_plus(mrb_state * mrb,mrb_value a,mrb_value b)753 mrb_str_plus(mrb_state *mrb, mrb_value a, mrb_value b)
754 {
755 struct RString *s = mrb_str_ptr(a);
756 struct RString *s2 = mrb_str_ptr(b);
757 struct RString *t;
758
759 t = str_new(mrb, 0, RSTR_LEN(s) + RSTR_LEN(s2));
760 memcpy(RSTR_PTR(t), RSTR_PTR(s), RSTR_LEN(s));
761 memcpy(RSTR_PTR(t) + RSTR_LEN(s), RSTR_PTR(s2), RSTR_LEN(s2));
762
763 return mrb_obj_value(t);
764 }
765
766 /* 15.2.10.5.2 */
767
768 /*
769 * call-seq:
770 * str + other_str -> new_str
771 *
772 * Concatenation---Returns a new <code>String</code> containing
773 * <i>other_str</i> concatenated to <i>str</i>.
774 *
775 * "Hello from " + self.to_s #=> "Hello from main"
776 */
777 static mrb_value
mrb_str_plus_m(mrb_state * mrb,mrb_value self)778 mrb_str_plus_m(mrb_state *mrb, mrb_value self)
779 {
780 mrb_value str;
781
782 mrb_get_args(mrb, "S", &str);
783 return mrb_str_plus(mrb, self, str);
784 }
785
786 /* 15.2.10.5.26 */
787 /* 15.2.10.5.33 */
788 /*
789 * call-seq:
790 * "abcd".size => int
791 *
792 * Returns the length of string.
793 */
794 static mrb_value
mrb_str_size(mrb_state * mrb,mrb_value self)795 mrb_str_size(mrb_state *mrb, mrb_value self)
796 {
797 mrb_int len = RSTRING_CHAR_LEN(self);
798 return mrb_fixnum_value(len);
799 }
800
801 static mrb_value
mrb_str_bytesize(mrb_state * mrb,mrb_value self)802 mrb_str_bytesize(mrb_state *mrb, mrb_value self)
803 {
804 mrb_int len = RSTRING_LEN(self);
805 return mrb_fixnum_value(len);
806 }
807
808 /* 15.2.10.5.1 */
809 /*
810 * call-seq:
811 * str * integer => new_str
812 *
813 * Copy---Returns a new <code>String</code> containing <i>integer</i> copies of
814 * the receiver.
815 *
816 * "Ho! " * 3 #=> "Ho! Ho! Ho! "
817 */
818 static mrb_value
mrb_str_times(mrb_state * mrb,mrb_value self)819 mrb_str_times(mrb_state *mrb, mrb_value self)
820 {
821 mrb_int n,len,times;
822 struct RString *str2;
823 char *p;
824
825 mrb_get_args(mrb, "i", ×);
826 if (times < 0) {
827 mrb_raise(mrb, E_ARGUMENT_ERROR, "negative argument");
828 }
829 if (times && MRB_INT_MAX / times < RSTRING_LEN(self)) {
830 mrb_raise(mrb, E_ARGUMENT_ERROR, "argument too big");
831 }
832
833 len = RSTRING_LEN(self)*times;
834 str2 = str_new(mrb, 0, len);
835 str_with_class(mrb, str2, self);
836 p = RSTR_PTR(str2);
837 if (len > 0) {
838 n = RSTRING_LEN(self);
839 memcpy(p, RSTRING_PTR(self), n);
840 while (n <= len/2) {
841 memcpy(p + n, p, n);
842 n *= 2;
843 }
844 memcpy(p + n, p, len-n);
845 }
846 p[RSTR_LEN(str2)] = '\0';
847
848 return mrb_obj_value(str2);
849 }
850 /* -------------------------------------------------------------- */
851
/* smaller of two values; each argument may be evaluated twice */
#define lesser(a,b) (((a)>(b))?(b):(a))
853
854 /* ---------------------------*/
855 /*
856 * call-seq:
857 * mrb_value str1 <=> mrb_value str2 => int
858 * > 1
859 * = 0
860 * < -1
861 */
862 MRB_API int
mrb_str_cmp(mrb_state * mrb,mrb_value str1,mrb_value str2)863 mrb_str_cmp(mrb_state *mrb, mrb_value str1, mrb_value str2)
864 {
865 mrb_int len;
866 mrb_int retval;
867 struct RString *s1 = mrb_str_ptr(str1);
868 struct RString *s2 = mrb_str_ptr(str2);
869
870 len = lesser(RSTR_LEN(s1), RSTR_LEN(s2));
871 retval = memcmp(RSTR_PTR(s1), RSTR_PTR(s2), len);
872 if (retval == 0) {
873 if (RSTR_LEN(s1) == RSTR_LEN(s2)) return 0;
874 if (RSTR_LEN(s1) > RSTR_LEN(s2)) return 1;
875 return -1;
876 }
877 if (retval > 0) return 1;
878 return -1;
879 }
880
881 /* 15.2.10.5.3 */
882
883 /*
884 * call-seq:
885 * str <=> other_str => -1, 0, +1
886 *
887 * Comparison---Returns -1 if <i>other_str</i> is less than, 0 if
888 * <i>other_str</i> is equal to, and +1 if <i>other_str</i> is greater than
889 * <i>str</i>. If the strings are of different lengths, and the strings are
890 * equal when compared up to the shortest length, then the longer string is
891 * considered greater than the shorter one. If the variable <code>$=</code> is
892 * <code>false</code>, the comparison is based on comparing the binary values
893 * of each character in the string. In older versions of Ruby, setting
894 * <code>$=</code> allowed case-insensitive comparisons; this is now deprecated
895 * in favor of using <code>String#casecmp</code>.
896 *
897 * <code><=></code> is the basis for the methods <code><</code>,
898 * <code><=</code>, <code>></code>, <code>>=</code>, and <code>between?</code>,
899 * included from module <code>Comparable</code>. The method
900 * <code>String#==</code> does not use <code>Comparable#==</code>.
901 *
902 * "abcdef" <=> "abcde" #=> 1
903 * "abcdef" <=> "abcdef" #=> 0
904 * "abcdef" <=> "abcdefg" #=> -1
905 * "abcdef" <=> "ABCDEF" #=> 1
906 */
907 static mrb_value
mrb_str_cmp_m(mrb_state * mrb,mrb_value str1)908 mrb_str_cmp_m(mrb_state *mrb, mrb_value str1)
909 {
910 mrb_value str2;
911 mrb_int result;
912
913 mrb_get_args(mrb, "o", &str2);
914 if (!mrb_string_p(str2)) {
915 if (!mrb_respond_to(mrb, str2, mrb_intern_lit(mrb, "to_s"))) {
916 return mrb_nil_value();
917 }
918 else if (!mrb_respond_to(mrb, str2, mrb_intern_lit(mrb, "<=>"))) {
919 return mrb_nil_value();
920 }
921 else {
922 mrb_value tmp = mrb_funcall(mrb, str2, "<=>", 1, str1);
923
924 if (mrb_nil_p(tmp)) return mrb_nil_value();
925 if (!mrb_fixnum_p(tmp)) {
926 return mrb_funcall(mrb, mrb_fixnum_value(0), "-", 1, tmp);
927 }
928 result = -mrb_fixnum(tmp);
929 }
930 }
931 else {
932 result = mrb_str_cmp(mrb, str1, str2);
933 }
934 return mrb_fixnum_value(result);
935 }
936
937 static mrb_bool
str_eql(mrb_state * mrb,const mrb_value str1,const mrb_value str2)938 str_eql(mrb_state *mrb, const mrb_value str1, const mrb_value str2)
939 {
940 const mrb_int len = RSTRING_LEN(str1);
941
942 if (len != RSTRING_LEN(str2)) return FALSE;
943 if (memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), (size_t)len) == 0)
944 return TRUE;
945 return FALSE;
946 }
947
948 MRB_API mrb_bool
mrb_str_equal(mrb_state * mrb,mrb_value str1,mrb_value str2)949 mrb_str_equal(mrb_state *mrb, mrb_value str1, mrb_value str2)
950 {
951 if (!mrb_string_p(str2)) return FALSE;
952 return str_eql(mrb, str1, str2);
953 }
954
955 /* 15.2.10.5.4 */
956 /*
957 * call-seq:
958 * str == obj => true or false
959 *
960 * Equality---
961 * If <i>obj</i> is not a <code>String</code>, returns <code>false</code>.
962 * Otherwise, returns <code>false</code> or <code>true</code>
963 *
964 * caution:if <i>str</i> <code><=></code> <i>obj</i> returns zero.
965 */
966 static mrb_value
mrb_str_equal_m(mrb_state * mrb,mrb_value str1)967 mrb_str_equal_m(mrb_state *mrb, mrb_value str1)
968 {
969 mrb_value str2;
970
971 mrb_get_args(mrb, "o", &str2);
972
973 return mrb_bool_value(mrb_str_equal(mrb, str1, str2));
974 }
975 /* ---------------------------------- */
/* defined outside this file; used by mrb_str_to_str() for class and module values */
mrb_value mrb_mod_to_s(mrb_state *mrb, mrb_value klass);
977
978 MRB_API mrb_value
mrb_str_to_str(mrb_state * mrb,mrb_value str)979 mrb_str_to_str(mrb_state *mrb, mrb_value str)
980 {
981 switch (mrb_type(str)) {
982 case MRB_TT_STRING:
983 return str;
984 case MRB_TT_FIXNUM:
985 return mrb_fixnum_to_str(mrb, str, 10);
986 case MRB_TT_CLASS:
987 case MRB_TT_MODULE:
988 return mrb_mod_to_s(mrb, str);
989 default:
990 return mrb_convert_type(mrb, str, MRB_TT_STRING, "String", "to_s");
991 }
992 }
993
994 MRB_API const char*
mrb_string_value_ptr(mrb_state * mrb,mrb_value str)995 mrb_string_value_ptr(mrb_state *mrb, mrb_value str)
996 {
997 str = mrb_str_to_str(mrb, str);
998 return RSTRING_PTR(str);
999 }
1000
1001 MRB_API mrb_int
mrb_string_value_len(mrb_state * mrb,mrb_value ptr)1002 mrb_string_value_len(mrb_state *mrb, mrb_value ptr)
1003 {
1004 mrb_to_str(mrb, ptr);
1005 return RSTRING_LEN(ptr);
1006 }
1007
1008 void
mrb_noregexp(mrb_state * mrb,mrb_value self)1009 mrb_noregexp(mrb_state *mrb, mrb_value self)
1010 {
1011 mrb_raise(mrb, E_NOTIMP_ERROR, "Regexp class not implemented");
1012 }
1013
1014 void
mrb_regexp_check(mrb_state * mrb,mrb_value obj)1015 mrb_regexp_check(mrb_state *mrb, mrb_value obj)
1016 {
1017 if (mrb_regexp_p(mrb, obj)) {
1018 mrb_noregexp(mrb, obj);
1019 }
1020 }
1021
1022 MRB_API mrb_value
mrb_str_dup(mrb_state * mrb,mrb_value str)1023 mrb_str_dup(mrb_state *mrb, mrb_value str)
1024 {
1025 struct RString *s = mrb_str_ptr(str);
1026 struct RString *dup = str_new(mrb, 0, 0);
1027
1028 str_with_class(mrb, dup, str);
1029 return str_replace(mrb, dup, s);
1030 }
1031
1032 static mrb_value
mrb_str_aref(mrb_state * mrb,mrb_value str,mrb_value indx)1033 mrb_str_aref(mrb_state *mrb, mrb_value str, mrb_value indx)
1034 {
1035 mrb_int idx;
1036
1037 mrb_regexp_check(mrb, indx);
1038 switch (mrb_type(indx)) {
1039 case MRB_TT_FIXNUM:
1040 idx = mrb_fixnum(indx);
1041
1042 num_index:
1043 str = str_substr(mrb, str, idx, 1);
1044 if (!mrb_nil_p(str) && RSTRING_LEN(str) == 0) return mrb_nil_value();
1045 return str;
1046
1047 case MRB_TT_STRING:
1048 if (str_index_str(mrb, str, indx, 0) != -1)
1049 return mrb_str_dup(mrb, indx);
1050 return mrb_nil_value();
1051
1052 case MRB_TT_RANGE:
1053 goto range_arg;
1054
1055 default:
1056 indx = mrb_Integer(mrb, indx);
1057 if (mrb_nil_p(indx)) {
1058 range_arg:
1059 {
1060 mrb_int beg, len;
1061
1062 len = RSTRING_CHAR_LEN(str);
1063 switch (mrb_range_beg_len(mrb, indx, &beg, &len, len, TRUE)) {
1064 case 1:
1065 return str_subseq(mrb, str, beg, len);
1066 case 2:
1067 return mrb_nil_value();
1068 default:
1069 break;
1070 }
1071 }
1072 mrb_raise(mrb, E_TYPE_ERROR, "can't convert to Fixnum");
1073 }
1074 idx = mrb_fixnum(indx);
1075 goto num_index;
1076 }
1077 return mrb_nil_value(); /* not reached */
1078 }
1079
1080 /* 15.2.10.5.6 */
1081 /* 15.2.10.5.34 */
1082 /*
1083 * call-seq:
1084 * str[fixnum] => fixnum or nil
1085 * str[fixnum, fixnum] => new_str or nil
1086 * str[range] => new_str or nil
1087 * str[regexp] => new_str or nil
1088 * str[regexp, fixnum] => new_str or nil
1089 * str[other_str] => new_str or nil
1090 * str.slice(fixnum) => fixnum or nil
1091 * str.slice(fixnum, fixnum) => new_str or nil
1092 * str.slice(range) => new_str or nil
1093 * str.slice(other_str) => new_str or nil
1094 *
1095 * Element Reference---If passed a single <code>Fixnum</code>, returns the code
1096 * of the character at that position. If passed two <code>Fixnum</code>
1097 * objects, returns a substring starting at the offset given by the first, and
1098 * a length given by the second. If given a range, a substring containing
1099 * characters at offsets given by the range is returned. In all three cases, if
1100 * an offset is negative, it is counted from the end of <i>str</i>. Returns
1101 * <code>nil</code> if the initial offset falls outside the string, the length
1102 * is negative, or the beginning of the range is greater than the end.
1103 *
1104 * If a <code>String</code> is given, that string is returned if it occurs in
1105 * <i>str</i>. In both cases, <code>nil</code> is returned if there is no
1106 * match.
1107 *
1108 * a = "hello there"
1109 * a[1] #=> 101(1.8.7) "e"(1.9.2)
1110 * a[1.1] #=> "e"(1.9.2)
1111 * a[1,3] #=> "ell"
1112 * a[1..3] #=> "ell"
1113 * a[-3,2] #=> "er"
1114 * a[-4..-2] #=> "her"
1115 * a[12..-1] #=> nil
1116 * a[-2..-4] #=> ""
1117 * a["lo"] #=> "lo"
1118 * a["bye"] #=> nil
1119 */
1120 static mrb_value
mrb_str_aref_m(mrb_state * mrb,mrb_value str)1121 mrb_str_aref_m(mrb_state *mrb, mrb_value str)
1122 {
1123 mrb_value a1, a2;
1124 mrb_int argc;
1125
1126 argc = mrb_get_args(mrb, "o|o", &a1, &a2);
1127 if (argc == 2) {
1128 mrb_int n1, n2;
1129
1130 mrb_regexp_check(mrb, a1);
1131 mrb_get_args(mrb, "ii", &n1, &n2);
1132 return str_substr(mrb, str, n1, n2);
1133 }
1134 if (argc != 1) {
1135 mrb_raisef(mrb, E_ARGUMENT_ERROR, "wrong number of arguments (%S for 1)", mrb_fixnum_value(argc));
1136 }
1137 return mrb_str_aref(mrb, str, a1);
1138 }
1139
1140 /* 15.2.10.5.8 */
1141 /*
1142 * call-seq:
1143 * str.capitalize! => str or nil
1144 *
1145 * Modifies <i>str</i> by converting the first character to uppercase and the
1146 * remainder to lowercase. Returns <code>nil</code> if no changes are made.
1147 *
1148 * a = "hello"
1149 * a.capitalize! #=> "Hello"
1150 * a #=> "Hello"
1151 * a.capitalize! #=> nil
1152 */
1153 static mrb_value
mrb_str_capitalize_bang(mrb_state * mrb,mrb_value str)1154 mrb_str_capitalize_bang(mrb_state *mrb, mrb_value str)
1155 {
1156 char *p, *pend;
1157 mrb_bool modify = FALSE;
1158 struct RString *s = mrb_str_ptr(str);
1159
1160 mrb_str_modify(mrb, s);
1161 if (RSTR_LEN(s) == 0 || !RSTR_PTR(s)) return mrb_nil_value();
1162 p = RSTR_PTR(s); pend = RSTR_PTR(s) + RSTR_LEN(s);
1163 if (ISLOWER(*p)) {
1164 *p = TOUPPER(*p);
1165 modify = TRUE;
1166 }
1167 while (++p < pend) {
1168 if (ISUPPER(*p)) {
1169 *p = TOLOWER(*p);
1170 modify = TRUE;
1171 }
1172 }
1173 if (modify) return str;
1174 return mrb_nil_value();
1175 }
1176
1177 /* 15.2.10.5.7 */
1178 /*
1179 * call-seq:
1180 * str.capitalize => new_str
1181 *
1182 * Returns a copy of <i>str</i> with the first character converted to uppercase
1183 * and the remainder to lowercase.
1184 *
1185 * "hello".capitalize #=> "Hello"
1186 * "HELLO".capitalize #=> "Hello"
1187 * "123ABC".capitalize #=> "123abc"
1188 */
1189 static mrb_value
mrb_str_capitalize(mrb_state * mrb,mrb_value self)1190 mrb_str_capitalize(mrb_state *mrb, mrb_value self)
1191 {
1192 mrb_value str;
1193
1194 str = mrb_str_dup(mrb, self);
1195 mrb_str_capitalize_bang(mrb, str);
1196 return str;
1197 }
1198
1199 /* 15.2.10.5.10 */
1200 /*
1201 * call-seq:
1202 * str.chomp!(separator="\n") => str or nil
1203 *
1204 * Modifies <i>str</i> in place as described for <code>String#chomp</code>,
1205 * returning <i>str</i>, or <code>nil</code> if no modifications were made.
1206 */
1207 static mrb_value
mrb_str_chomp_bang(mrb_state * mrb,mrb_value str)1208 mrb_str_chomp_bang(mrb_state *mrb, mrb_value str)
1209 {
1210 mrb_value rs;
1211 mrb_int newline;
1212 char *p, *pp;
1213 mrb_int rslen;
1214 mrb_int len;
1215 mrb_int argc;
1216 struct RString *s = mrb_str_ptr(str);
1217
1218 argc = mrb_get_args(mrb, "|S", &rs);
1219 mrb_str_modify(mrb, s);
1220 len = RSTR_LEN(s);
1221 if (argc == 0) {
1222 if (len == 0) return mrb_nil_value();
1223 smart_chomp:
1224 if (RSTR_PTR(s)[len-1] == '\n') {
1225 RSTR_SET_LEN(s, RSTR_LEN(s) - 1);
1226 if (RSTR_LEN(s) > 0 &&
1227 RSTR_PTR(s)[RSTR_LEN(s)-1] == '\r') {
1228 RSTR_SET_LEN(s, RSTR_LEN(s) - 1);
1229 }
1230 }
1231 else if (RSTR_PTR(s)[len-1] == '\r') {
1232 RSTR_SET_LEN(s, RSTR_LEN(s) - 1);
1233 }
1234 else {
1235 return mrb_nil_value();
1236 }
1237 RSTR_PTR(s)[RSTR_LEN(s)] = '\0';
1238 return str;
1239 }
1240
1241 if (len == 0 || mrb_nil_p(rs)) return mrb_nil_value();
1242 p = RSTR_PTR(s);
1243 rslen = RSTRING_LEN(rs);
1244 if (rslen == 0) {
1245 while (len>0 && p[len-1] == '\n') {
1246 len--;
1247 if (len>0 && p[len-1] == '\r')
1248 len--;
1249 }
1250 if (len < RSTR_LEN(s)) {
1251 RSTR_SET_LEN(s, len);
1252 p[len] = '\0';
1253 return str;
1254 }
1255 return mrb_nil_value();
1256 }
1257 if (rslen > len) return mrb_nil_value();
1258 newline = RSTRING_PTR(rs)[rslen-1];
1259 if (rslen == 1 && newline == '\n')
1260 newline = RSTRING_PTR(rs)[rslen-1];
1261 if (rslen == 1 && newline == '\n')
1262 goto smart_chomp;
1263
1264 pp = p + len - rslen;
1265 if (p[len-1] == newline &&
1266 (rslen <= 1 ||
1267 memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
1268 RSTR_SET_LEN(s, len - rslen);
1269 p[RSTR_LEN(s)] = '\0';
1270 return str;
1271 }
1272 return mrb_nil_value();
1273 }
1274
1275 /* 15.2.10.5.9 */
1276 /*
1277 * call-seq:
1278 * str.chomp(separator="\n") => new_str
1279 *
1280 * Returns a new <code>String</code> with the given record separator removed
1281 * from the end of <i>str</i> (if present). If <code>$/</code> has not been
1282 * changed from the default Ruby record separator, then <code>chomp</code> also
1283 * removes carriage return characters (that is it will remove <code>\n</code>,
1284 * <code>\r</code>, and <code>\r\n</code>).
1285 *
1286 * "hello".chomp #=> "hello"
1287 * "hello\n".chomp #=> "hello"
1288 * "hello\r\n".chomp #=> "hello"
1289 * "hello\n\r".chomp #=> "hello\n"
1290 * "hello\r".chomp #=> "hello"
1291 * "hello \n there".chomp #=> "hello \n there"
1292 * "hello".chomp("llo") #=> "he"
1293 */
1294 static mrb_value
mrb_str_chomp(mrb_state * mrb,mrb_value self)1295 mrb_str_chomp(mrb_state *mrb, mrb_value self)
1296 {
1297 mrb_value str;
1298
1299 str = mrb_str_dup(mrb, self);
1300 mrb_str_chomp_bang(mrb, str);
1301 return str;
1302 }
1303
1304 /* 15.2.10.5.12 */
1305 /*
1306 * call-seq:
1307 * str.chop! => str or nil
1308 *
1309 * Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>,
1310 * or <code>nil</code> if <i>str</i> is the empty string. See also
1311 * <code>String#chomp!</code>.
1312 */
1313 static mrb_value
mrb_str_chop_bang(mrb_state * mrb,mrb_value str)1314 mrb_str_chop_bang(mrb_state *mrb, mrb_value str)
1315 {
1316 struct RString *s = mrb_str_ptr(str);
1317
1318 mrb_str_modify(mrb, s);
1319 if (RSTR_LEN(s) > 0) {
1320 mrb_int len;
1321 #ifdef MRB_UTF8_STRING
1322 const char* t = RSTR_PTR(s), *p = t;
1323 const char* e = p + RSTR_LEN(s);
1324 while (p<e) {
1325 mrb_int clen = utf8len(p, e);
1326 if (p + clen>=e) break;
1327 p += clen;
1328 }
1329 len = p - t;
1330 #else
1331 len = RSTR_LEN(s) - 1;
1332 #endif
1333 if (RSTR_PTR(s)[len] == '\n') {
1334 if (len > 0 &&
1335 RSTR_PTR(s)[len-1] == '\r') {
1336 len--;
1337 }
1338 }
1339 RSTR_SET_LEN(s, len);
1340 RSTR_PTR(s)[len] = '\0';
1341 return str;
1342 }
1343 return mrb_nil_value();
1344 }
1345
1346 /* 15.2.10.5.11 */
1347 /*
1348 * call-seq:
1349 * str.chop => new_str
1350 *
1351 * Returns a new <code>String</code> with the last character removed. If the
1352 * string ends with <code>\r\n</code>, both characters are removed. Applying
1353 * <code>chop</code> to an empty string returns an empty
1354 * string. <code>String#chomp</code> is often a safer alternative, as it leaves
1355 * the string unchanged if it doesn't end in a record separator.
1356 *
1357 * "string\r\n".chop #=> "string"
1358 * "string\n\r".chop #=> "string\n"
1359 * "string\n".chop #=> "string"
1360 * "string".chop #=> "strin"
1361 * "x".chop #=> ""
1362 */
1363 static mrb_value
mrb_str_chop(mrb_state * mrb,mrb_value self)1364 mrb_str_chop(mrb_state *mrb, mrb_value self)
1365 {
1366 mrb_value str;
1367 str = mrb_str_dup(mrb, self);
1368 mrb_str_chop_bang(mrb, str);
1369 return str;
1370 }
1371
1372 /* 15.2.10.5.14 */
1373 /*
1374 * call-seq:
1375 * str.downcase! => str or nil
1376 *
1377 * Downcases the contents of <i>str</i>, returning <code>nil</code> if no
1378 * changes were made.
1379 */
1380 static mrb_value
mrb_str_downcase_bang(mrb_state * mrb,mrb_value str)1381 mrb_str_downcase_bang(mrb_state *mrb, mrb_value str)
1382 {
1383 char *p, *pend;
1384 mrb_bool modify = FALSE;
1385 struct RString *s = mrb_str_ptr(str);
1386
1387 mrb_str_modify(mrb, s);
1388 p = RSTR_PTR(s);
1389 pend = RSTR_PTR(s) + RSTR_LEN(s);
1390 while (p < pend) {
1391 if (ISUPPER(*p)) {
1392 *p = TOLOWER(*p);
1393 modify = TRUE;
1394 }
1395 p++;
1396 }
1397
1398 if (modify) return str;
1399 return mrb_nil_value();
1400 }
1401
1402 /* 15.2.10.5.13 */
1403 /*
1404 * call-seq:
1405 * str.downcase => new_str
1406 *
1407 * Returns a copy of <i>str</i> with all uppercase letters replaced with their
1408 * lowercase counterparts. The operation is locale insensitive---only
1409 * characters 'A' to 'Z' are affected.
1410 *
1411 * "hEllO".downcase #=> "hello"
1412 */
1413 static mrb_value
mrb_str_downcase(mrb_state * mrb,mrb_value self)1414 mrb_str_downcase(mrb_state *mrb, mrb_value self)
1415 {
1416 mrb_value str;
1417
1418 str = mrb_str_dup(mrb, self);
1419 mrb_str_downcase_bang(mrb, str);
1420 return str;
1421 }
1422
1423 /* 15.2.10.5.16 */
1424 /*
1425 * call-seq:
1426 * str.empty? => true or false
1427 *
1428 * Returns <code>true</code> if <i>str</i> has a length of zero.
1429 *
1430 * "hello".empty? #=> false
1431 * "".empty? #=> true
1432 */
1433 static mrb_value
mrb_str_empty_p(mrb_state * mrb,mrb_value self)1434 mrb_str_empty_p(mrb_state *mrb, mrb_value self)
1435 {
1436 struct RString *s = mrb_str_ptr(self);
1437
1438 return mrb_bool_value(RSTR_LEN(s) == 0);
1439 }
1440
1441 /* 15.2.10.5.17 */
1442 /*
1443 * call-seq:
1444 * str.eql?(other) => true or false
1445 *
 *  Two strings are equal if they have the same length and content.
1447 */
1448 static mrb_value
mrb_str_eql(mrb_state * mrb,mrb_value self)1449 mrb_str_eql(mrb_state *mrb, mrb_value self)
1450 {
1451 mrb_value str2;
1452 mrb_bool eql_p;
1453
1454 mrb_get_args(mrb, "o", &str2);
1455 eql_p = (mrb_type(str2) == MRB_TT_STRING) && str_eql(mrb, self, str2);
1456
1457 return mrb_bool_value(eql_p);
1458 }
1459
1460 MRB_API mrb_value
mrb_str_substr(mrb_state * mrb,mrb_value str,mrb_int beg,mrb_int len)1461 mrb_str_substr(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len)
1462 {
1463 return str_substr(mrb, str, beg, len);
1464 }
1465
1466 uint32_t
mrb_str_hash(mrb_state * mrb,mrb_value str)1467 mrb_str_hash(mrb_state *mrb, mrb_value str)
1468 {
1469 /* 1-8-7 */
1470 struct RString *s = mrb_str_ptr(str);
1471 mrb_int len = RSTR_LEN(s);
1472 char *p = RSTR_PTR(s);
1473 uint64_t key = 0;
1474
1475 while (len--) {
1476 key = key*65599 + *p;
1477 p++;
1478 }
1479 return (uint32_t)(key + (key>>5));
1480 }
1481
1482 /* 15.2.10.5.20 */
1483 /*
1484 * call-seq:
1485 * str.hash => fixnum
1486 *
1487 * Return a hash based on the string's length and content.
1488 */
1489 static mrb_value
mrb_str_hash_m(mrb_state * mrb,mrb_value self)1490 mrb_str_hash_m(mrb_state *mrb, mrb_value self)
1491 {
1492 mrb_int key = mrb_str_hash(mrb, self);
1493 return mrb_fixnum_value(key);
1494 }
1495
1496 /* 15.2.10.5.21 */
1497 /*
1498 * call-seq:
1499 * str.include? other_str => true or false
1500 * str.include? fixnum => true or false
1501 *
1502 * Returns <code>true</code> if <i>str</i> contains the given string or
1503 * character.
1504 *
1505 * "hello".include? "lo" #=> true
1506 * "hello".include? "ol" #=> false
1507 * "hello".include? ?h #=> true
1508 */
1509 static mrb_value
mrb_str_include(mrb_state * mrb,mrb_value self)1510 mrb_str_include(mrb_state *mrb, mrb_value self)
1511 {
1512 mrb_value str2;
1513
1514 mrb_get_args(mrb, "S", &str2);
1515 if (str_index_str(mrb, self, str2, 0) < 0)
1516 return mrb_bool_value(FALSE);
1517 return mrb_bool_value(TRUE);
1518 }
1519
1520 /* 15.2.10.5.22 */
1521 /*
1522 * call-seq:
1523 * str.index(substring [, offset]) => fixnum or nil
1524 * str.index(fixnum [, offset]) => fixnum or nil
1525 * str.index(regexp [, offset]) => fixnum or nil
1526 *
1527 * Returns the index of the first occurrence of the given
1528 * <i>substring</i>,
1529 * character (<i>fixnum</i>), or pattern (<i>regexp</i>) in <i>str</i>.
1530 * Returns
1531 * <code>nil</code> if not found.
1532 * If the second parameter is present, it
1533 * specifies the position in the string to begin the search.
1534 *
1535 * "hello".index('e') #=> 1
1536 * "hello".index('lo') #=> 3
1537 * "hello".index('a') #=> nil
1538 * "hello".index(101) #=> 1(101=0x65='e')
1539 * "hello".index(/[aeiou]/, -3) #=> 4
1540 */
1541 static mrb_value
mrb_str_index_m(mrb_state * mrb,mrb_value str)1542 mrb_str_index_m(mrb_state *mrb, mrb_value str)
1543 {
1544 mrb_value *argv;
1545 mrb_int argc;
1546 mrb_value sub;
1547 mrb_int pos, clen;
1548
1549 mrb_get_args(mrb, "*!", &argv, &argc);
1550 if (argc == 2) {
1551 mrb_get_args(mrb, "oi", &sub, &pos);
1552 }
1553 else {
1554 pos = 0;
1555 if (argc > 0)
1556 sub = argv[0];
1557 else
1558 sub = mrb_nil_value();
1559 }
1560 mrb_regexp_check(mrb, sub);
1561 clen = RSTRING_CHAR_LEN(str);
1562 if (pos < 0) {
1563 pos += clen;
1564 if (pos < 0) {
1565 return mrb_nil_value();
1566 }
1567 }
1568 if (pos > clen) return mrb_nil_value();
1569 pos = chars2bytes(str, 0, pos);
1570
1571 switch (mrb_type(sub)) {
1572 default: {
1573 mrb_value tmp;
1574
1575 tmp = mrb_check_string_type(mrb, sub);
1576 if (mrb_nil_p(tmp)) {
1577 mrb_raisef(mrb, E_TYPE_ERROR, "type mismatch: %S given", sub);
1578 }
1579 sub = tmp;
1580 }
1581 /* fall through */
1582 case MRB_TT_STRING:
1583 pos = str_index_str(mrb, str, sub, pos);
1584 break;
1585 }
1586
1587 if (pos == -1) return mrb_nil_value();
1588 pos = bytes2chars(RSTRING_PTR(str), pos);
1589 BYTES_ALIGN_CHECK(pos);
1590 return mrb_fixnum_value(pos);
1591 }
1592
1593 /* 15.2.10.5.24 */
1594 /* 15.2.10.5.28 */
1595 /*
1596 * call-seq:
1597 * str.replace(other_str) => str
1598 *
1599 * s = "hello" #=> "hello"
1600 * s.replace "world" #=> "world"
1601 */
1602 static mrb_value
mrb_str_replace(mrb_state * mrb,mrb_value str)1603 mrb_str_replace(mrb_state *mrb, mrb_value str)
1604 {
1605 mrb_value str2;
1606
1607 mrb_get_args(mrb, "S", &str2);
1608 return str_replace(mrb, mrb_str_ptr(str), mrb_str_ptr(str2));
1609 }
1610
1611 /* 15.2.10.5.23 */
1612 /*
1613 * call-seq:
1614 * String.new(str="") => new_str
1615 *
1616 * Returns a new string object containing a copy of <i>str</i>.
1617 */
1618 static mrb_value
mrb_str_init(mrb_state * mrb,mrb_value self)1619 mrb_str_init(mrb_state *mrb, mrb_value self)
1620 {
1621 mrb_value str2;
1622
1623 if (mrb_get_args(mrb, "|S", &str2) == 0) {
1624 struct RString *s = str_new(mrb, 0, 0);
1625 str2 = mrb_obj_value(s);
1626 }
1627 str_replace(mrb, mrb_str_ptr(self), mrb_str_ptr(str2));
1628 return self;
1629 }
1630
1631 /* 15.2.10.5.25 */
1632 /* 15.2.10.5.41 */
1633 /*
1634 * call-seq:
1635 * str.intern => symbol
1636 * str.to_sym => symbol
1637 *
1638 * Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the
1639 * symbol if it did not previously exist. See <code>Symbol#id2name</code>.
1640 *
1641 * "Koala".intern #=> :Koala
1642 * s = 'cat'.to_sym #=> :cat
1643 * s == :cat #=> true
1644 * s = '@cat'.to_sym #=> :@cat
1645 * s == :@cat #=> true
1646 *
1647 * This can also be used to create symbols that cannot be represented using the
1648 * <code>:xxx</code> notation.
1649 *
1650 * 'cat and dog'.to_sym #=> :"cat and dog"
1651 */
1652 MRB_API mrb_value
mrb_str_intern(mrb_state * mrb,mrb_value self)1653 mrb_str_intern(mrb_state *mrb, mrb_value self)
1654 {
1655 return mrb_symbol_value(mrb_intern_str(mrb, self));
1656 }
1657 /* ---------------------------------- */
1658 MRB_API mrb_value
mrb_obj_as_string(mrb_state * mrb,mrb_value obj)1659 mrb_obj_as_string(mrb_state *mrb, mrb_value obj)
1660 {
1661 mrb_value str;
1662
1663 if (mrb_string_p(obj)) {
1664 return obj;
1665 }
1666 str = mrb_funcall(mrb, obj, "to_s", 0);
1667 if (!mrb_string_p(str))
1668 return mrb_any_to_s(mrb, obj);
1669 return str;
1670 }
1671
1672 MRB_API mrb_value
mrb_ptr_to_str(mrb_state * mrb,void * p)1673 mrb_ptr_to_str(mrb_state *mrb, void *p)
1674 {
1675 struct RString *p_str;
1676 char *p1;
1677 char *p2;
1678 uintptr_t n = (uintptr_t)p;
1679
1680 p_str = str_new(mrb, NULL, 2 + sizeof(uintptr_t) * CHAR_BIT / 4);
1681 p1 = RSTR_PTR(p_str);
1682 *p1++ = '0';
1683 *p1++ = 'x';
1684 p2 = p1;
1685
1686 do {
1687 *p2++ = mrb_digitmap[n % 16];
1688 n /= 16;
1689 } while (n > 0);
1690 *p2 = '\0';
1691 RSTR_SET_LEN(p_str, (mrb_int)(p2 - RSTR_PTR(p_str)));
1692
1693 while (p1 < p2) {
1694 const char c = *p1;
1695 *p1++ = *--p2;
1696 *p2 = c;
1697 }
1698
1699 return mrb_obj_value(p_str);
1700 }
1701
1702 /* 15.2.10.5.30 */
1703 /*
1704 * call-seq:
1705 * str.reverse! => str
1706 *
1707 * Reverses <i>str</i> in place.
1708 */
1709 static mrb_value
mrb_str_reverse_bang(mrb_state * mrb,mrb_value str)1710 mrb_str_reverse_bang(mrb_state *mrb, mrb_value str)
1711 {
1712 #ifdef MRB_UTF8_STRING
1713 mrb_int utf8_len = RSTRING_CHAR_LEN(str);
1714 mrb_int len = RSTRING_LEN(str);
1715
1716 if (utf8_len == len) goto bytes;
1717 if (utf8_len > 1) {
1718 char *buf;
1719 char *p, *e, *r;
1720
1721 mrb_str_modify(mrb, mrb_str_ptr(str));
1722 len = RSTRING_LEN(str);
1723 buf = (char*)mrb_malloc(mrb, (size_t)len);
1724 p = buf;
1725 e = buf + len;
1726
1727 memcpy(buf, RSTRING_PTR(str), len);
1728 r = RSTRING_PTR(str) + len;
1729
1730 while (p<e) {
1731 mrb_int clen = utf8len(p, e);
1732 r -= clen;
1733 memcpy(r, p, clen);
1734 p += clen;
1735 }
1736 mrb_free(mrb, buf);
1737 }
1738 return str;
1739
1740 bytes:
1741 #endif
1742 {
1743 struct RString *s = mrb_str_ptr(str);
1744 char *p, *e;
1745 char c;
1746
1747 mrb_str_modify(mrb, s);
1748 if (RSTR_LEN(s) > 1) {
1749 p = RSTR_PTR(s);
1750 e = p + RSTR_LEN(s) - 1;
1751 while (p < e) {
1752 c = *p;
1753 *p++ = *e;
1754 *e-- = c;
1755 }
1756 }
1757 return str;
1758 }
1759 }
1760
1761 /* ---------------------------------- */
1762 /* 15.2.10.5.29 */
1763 /*
1764 * call-seq:
1765 * str.reverse => new_str
1766 *
1767 * Returns a new string with the characters from <i>str</i> in reverse order.
1768 *
1769 * "stressed".reverse #=> "desserts"
1770 */
1771 static mrb_value
mrb_str_reverse(mrb_state * mrb,mrb_value str)1772 mrb_str_reverse(mrb_state *mrb, mrb_value str)
1773 {
1774 mrb_value str2 = mrb_str_dup(mrb, str);
1775 mrb_str_reverse_bang(mrb, str2);
1776 return str2;
1777 }
1778
1779 /* 15.2.10.5.31 */
1780 /*
1781 * call-seq:
1782 * str.rindex(substring [, fixnum]) => fixnum or nil
1783 * str.rindex(fixnum [, fixnum]) => fixnum or nil
1784 * str.rindex(regexp [, fixnum]) => fixnum or nil
1785 *
1786 * Returns the index of the last occurrence of the given <i>substring</i>,
1787 * character (<i>fixnum</i>), or pattern (<i>regexp</i>) in <i>str</i>. Returns
1788 * <code>nil</code> if not found. If the second parameter is present, it
1789 * specifies the position in the string to end the search---characters beyond
1790 * this point will not be considered.
1791 *
1792 * "hello".rindex('e') #=> 1
1793 * "hello".rindex('l') #=> 3
1794 * "hello".rindex('a') #=> nil
1795 * "hello".rindex(101) #=> 1
1796 * "hello".rindex(/[aeiou]/, -2) #=> 1
1797 */
1798 static mrb_value
mrb_str_rindex(mrb_state * mrb,mrb_value str)1799 mrb_str_rindex(mrb_state *mrb, mrb_value str)
1800 {
1801 mrb_value *argv;
1802 mrb_int argc;
1803 mrb_value sub;
1804 mrb_int pos, len = RSTRING_CHAR_LEN(str);
1805
1806 mrb_get_args(mrb, "*!", &argv, &argc);
1807 if (argc == 2) {
1808 mrb_get_args(mrb, "oi", &sub, &pos);
1809 if (pos < 0) {
1810 pos += len;
1811 if (pos < 0) {
1812 mrb_regexp_check(mrb, sub);
1813 return mrb_nil_value();
1814 }
1815 }
1816 if (pos > len) pos = len;
1817 }
1818 else {
1819 pos = len;
1820 if (argc > 0)
1821 sub = argv[0];
1822 else
1823 sub = mrb_nil_value();
1824 }
1825 pos = chars2bytes(str, 0, pos);
1826 mrb_regexp_check(mrb, sub);
1827
1828 switch (mrb_type(sub)) {
1829 default: {
1830 mrb_value tmp;
1831
1832 tmp = mrb_check_string_type(mrb, sub);
1833 if (mrb_nil_p(tmp)) {
1834 mrb_raisef(mrb, E_TYPE_ERROR, "type mismatch: %S given", sub);
1835 }
1836 sub = tmp;
1837 }
1838 /* fall through */
1839 case MRB_TT_STRING:
1840 pos = str_rindex(mrb, str, sub, pos);
1841 if (pos >= 0) {
1842 pos = bytes2chars(RSTRING_PTR(str), pos);
1843 BYTES_ALIGN_CHECK(pos);
1844 return mrb_fixnum_value(pos);
1845 }
1846 break;
1847
1848 } /* end of switch (TYPE(sub)) */
1849 return mrb_nil_value();
1850 }
1851
1852 /* 15.2.10.5.35 */
1853
1854 /*
1855 * call-seq:
 *     str.split(pattern=nil, [limit])   => anArray
1857 *
1858 * Divides <i>str</i> into substrings based on a delimiter, returning an array
1859 * of these substrings.
1860 *
1861 * If <i>pattern</i> is a <code>String</code>, then its contents are used as
1862 * the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
1863 * space, <i>str</i> is split on whitespace, with leading whitespace and runs
1864 * of contiguous whitespace characters ignored.
1865 *
1866 * If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
1867 * pattern matches. Whenever the pattern matches a zero-length string,
1868 * <i>str</i> is split into individual characters.
1869 *
1870 * If <i>pattern</i> is omitted, the value of <code>$;</code> is used. If
1871 * <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
1872 * split on whitespace as if ' ' were specified.
1873 *
1874 * If the <i>limit</i> parameter is omitted, trailing null fields are
1875 * suppressed. If <i>limit</i> is a positive number, at most that number of
1876 * fields will be returned (if <i>limit</i> is <code>1</code>, the entire
1877 * string is returned as the only entry in an array). If negative, there is no
1878 * limit to the number of fields returned, and trailing null fields are not
1879 * suppressed.
1880 *
1881 * " now's the time".split #=> ["now's", "the", "time"]
1882 * " now's the time".split(' ') #=> ["now's", "the", "time"]
1883 * " now's the time".split(/ /) #=> ["", "now's", "", "the", "time"]
1884 * "hello".split(//) #=> ["h", "e", "l", "l", "o"]
1885 * "hello".split(//, 3) #=> ["h", "e", "llo"]
1886 *
1887 * "mellow yellow".split("ello") #=> ["m", "w y", "w"]
1888 * "1,2,,3,4,,".split(',') #=> ["1", "2", "", "3", "4"]
1889 * "1,2,,3,4,,".split(',', 4) #=> ["1", "2", "", "3,4,,"]
1890 * "1,2,,3,4,,".split(',', -4) #=> ["1", "2", "", "3", "4", "", ""]
1891 */
1892
static mrb_value
mrb_str_split_m(mrb_state *mrb, mrb_value str)
{
  mrb_int argc;
  mrb_value spat = mrb_nil_value();
  enum {awk, string, regexp} split_type = string;
  mrb_int i = 0;              /* number of fields counted against the limit */
  mrb_int beg;                /* byte offset where the pending field starts */
  mrb_int end;
  mrb_int lim = 0;
  mrb_bool lim_p;             /* TRUE only for an explicit positive limit */
  mrb_value result, tmp;

  argc = mrb_get_args(mrb, "|oi", &spat, &lim);
  lim_p = (lim > 0 && argc == 2);
  if (argc == 2) {
    if (lim == 1) {
      /* limit 1: no splitting; empty receiver gives [], otherwise [str] */
      if (RSTRING_LEN(str) == 0)
        return mrb_ary_new_capa(mrb, 0);
      return mrb_ary_new_from_values(mrb, 1, &str);
    }
    i = 1;
  }

  /* choose the mode: no/nil pattern or a single " " selects whitespace (awk) mode */
  if (argc == 0 || mrb_nil_p(spat)) {
    split_type = awk;
  }
  else {
    if (mrb_string_p(spat)) {
      split_type = string;
      if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' ') {
        split_type = awk;
      }
    }
    else {
      /* non-String pattern is handed to mrb_noregexp() */
      mrb_noregexp(mrb, str);
    }
  }

  result = mrb_ary_new(mrb);
  beg = 0;
  if (split_type == awk) {
    mrb_bool skip = TRUE;     /* TRUE while scanning a run of whitespace */
    mrb_int idx = 0;
    mrb_int str_len = RSTRING_LEN(str);
    unsigned int c;
    int ai = mrb_gc_arena_save(mrb);

    idx = end = beg;
    while (idx < str_len) {
      c = (unsigned char)RSTRING_PTR(str)[idx++];
      if (skip) {
        if (ISSPACE(c)) {
          /* still in whitespace: the next field cannot start before idx */
          beg = idx;
        }
        else {
          end = idx;
          skip = FALSE;
          /* limit reached: the rest of the string becomes the last field */
          if (lim_p && lim <= i) break;
        }
      }
      else if (ISSPACE(c)) {
        /* whitespace ends the current field [beg, end) */
        mrb_ary_push(mrb, result, byte_subseq(mrb, str, beg, end-beg));
        mrb_gc_arena_restore(mrb, ai);
        skip = TRUE;
        beg = idx;
        if (lim_p) ++i;
      }
      else {
        end = idx;
      }
    }
  }
  else if (split_type == string) {
    mrb_int str_len = RSTRING_LEN(str);
    mrb_int pat_len = RSTRING_LEN(spat);
    mrb_int idx = 0;
    int ai = mrb_gc_arena_save(mrb);

    while (idx < str_len) {
      if (pat_len > 0) {
        /* end is relative to idx: distance to the next occurrence of the pattern */
        end = mrb_memsearch(RSTRING_PTR(spat), pat_len, RSTRING_PTR(str)+idx, str_len - idx);
        if (end < 0) break;
      }
      else {
        /* empty pattern: each field is one character */
        end = chars2bytes(str, idx, 1);
      }
      mrb_ary_push(mrb, result, byte_subseq(mrb, str, idx, end));
      mrb_gc_arena_restore(mrb, ai);
      idx += end + pat_len;
      if (lim_p && lim <= ++i) break;
    }
    beg = idx;
  }
  else {
    mrb_noregexp(mrb, str);
  }
  /* append whatever remains after the last delimiter as the final field */
  if (RSTRING_LEN(str) > 0 && (lim_p || RSTRING_LEN(str) > beg || lim < 0)) {
    if (RSTRING_LEN(str) == beg) {
      tmp = mrb_str_new_empty(mrb, str);
    }
    else {
      tmp = byte_subseq(mrb, str, beg, RSTRING_LEN(str)-beg);
    }
    mrb_ary_push(mrb, result, tmp);
  }
  /* limit omitted or zero: drop trailing empty fields */
  if (!lim_p && lim == 0) {
    mrb_int len;
    while ((len = RARRAY_LEN(result)) > 0 &&
           (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
      mrb_ary_pop(mrb, result);
  }

  return result;
}
2008
2009 MRB_API mrb_value
mrb_str_len_to_inum(mrb_state * mrb,const char * str,mrb_int len,mrb_int base,int badcheck)2010 mrb_str_len_to_inum(mrb_state *mrb, const char *str, mrb_int len, mrb_int base, int badcheck)
2011 {
2012 const char *p = str;
2013 const char *pend = str + len;
2014 char sign = 1;
2015 int c;
2016 uint64_t n = 0;
2017 mrb_int val;
2018
2019 #define conv_digit(c) \
2020 (ISDIGIT(c) ? ((c) - '0') : \
2021 ISLOWER(c) ? ((c) - 'a' + 10) : \
2022 ISUPPER(c) ? ((c) - 'A' + 10) : \
2023 -1)
2024
2025 if (!p) {
2026 if (badcheck) goto bad;
2027 return mrb_fixnum_value(0);
2028 }
2029 while (p<pend && ISSPACE(*p))
2030 p++;
2031
2032 if (p[0] == '+') {
2033 p++;
2034 }
2035 else if (p[0] == '-') {
2036 p++;
2037 sign = 0;
2038 }
2039 if (base <= 0) {
2040 if (p[0] == '0') {
2041 switch (p[1]) {
2042 case 'x': case 'X':
2043 base = 16;
2044 break;
2045 case 'b': case 'B':
2046 base = 2;
2047 break;
2048 case 'o': case 'O':
2049 base = 8;
2050 break;
2051 case 'd': case 'D':
2052 base = 10;
2053 break;
2054 default:
2055 base = 8;
2056 break;
2057 }
2058 }
2059 else if (base < -1) {
2060 base = -base;
2061 }
2062 else {
2063 base = 10;
2064 }
2065 }
2066 switch (base) {
2067 case 2:
2068 if (p[0] == '0' && (p[1] == 'b'||p[1] == 'B')) {
2069 p += 2;
2070 }
2071 break;
2072 case 3:
2073 break;
2074 case 8:
2075 if (p[0] == '0' && (p[1] == 'o'||p[1] == 'O')) {
2076 p += 2;
2077 }
2078 case 4: case 5: case 6: case 7:
2079 break;
2080 case 10:
2081 if (p[0] == '0' && (p[1] == 'd'||p[1] == 'D')) {
2082 p += 2;
2083 }
2084 case 9: case 11: case 12: case 13: case 14: case 15:
2085 break;
2086 case 16:
2087 if (p[0] == '0' && (p[1] == 'x'||p[1] == 'X')) {
2088 p += 2;
2089 }
2090 break;
2091 default:
2092 if (base < 2 || 36 < base) {
2093 mrb_raisef(mrb, E_ARGUMENT_ERROR, "illegal radix %S", mrb_fixnum_value(base));
2094 }
2095 break;
2096 } /* end of switch (base) { */
2097 if (p>=pend) {
2098 if (badcheck) goto bad;
2099 return mrb_fixnum_value(0);
2100 }
2101 if (*p == '0') { /* squeeze preceding 0s */
2102 p++;
2103 while (p<pend) {
2104 c = *p++;
2105 if (c == '_') {
2106 if (p<pend && *p == '_') {
2107 if (badcheck) goto bad;
2108 break;
2109 }
2110 continue;
2111 }
2112 if (c != '0') {
2113 p--;
2114 break;
2115 }
2116 }
2117 if (*(p - 1) == '0')
2118 p--;
2119 }
2120 if (p == pend) {
2121 if (badcheck) goto bad;
2122 return mrb_fixnum_value(0);
2123 }
2124 for ( ;p<pend;p++) {
2125 if (*p == '_') {
2126 p++;
2127 if (p==pend) {
2128 if (badcheck) goto bad;
2129 continue;
2130 }
2131 if (*p == '_') {
2132 if (badcheck) goto bad;
2133 break;
2134 }
2135 }
2136 if (badcheck && *p == '\0') {
2137 goto nullbyte;
2138 }
2139 c = conv_digit(*p);
2140 if (c < 0 || c >= base) {
2141 break;
2142 }
2143 n *= base;
2144 n += c;
2145 if (n > (uint64_t)MRB_INT_MAX + (sign ? 0 : 1)) {
2146 #ifndef MRB_WITHOUT_FLOAT
2147 if (base == 10) {
2148 return mrb_float_value(mrb, mrb_str_to_dbl(mrb, mrb_str_new(mrb, str, len), badcheck));
2149 }
2150 else
2151 #endif
2152 {
2153 mrb_raisef(mrb, E_ARGUMENT_ERROR, "string (%S) too big for integer",
2154 mrb_str_new(mrb, str, pend-str));
2155 }
2156 }
2157 }
2158 val = (mrb_int)n;
2159 if (badcheck) {
2160 if (p == str) goto bad; /* no number */
2161 while (p<pend && ISSPACE(*p)) p++;
2162 if (p<pend) goto bad; /* trailing garbage */
2163 }
2164
2165 return mrb_fixnum_value(sign ? val : -val);
2166 nullbyte:
2167 mrb_raise(mrb, E_ARGUMENT_ERROR, "string contains null byte");
2168 /* not reached */
2169 bad:
2170 mrb_raisef(mrb, E_ARGUMENT_ERROR, "invalid string for number(%S)",
2171 mrb_inspect(mrb, mrb_str_new(mrb, str, pend-str)));
2172 /* not reached */
2173 return mrb_fixnum_value(0);
2174 }
2175
2176 MRB_API mrb_value
mrb_cstr_to_inum(mrb_state * mrb,const char * str,int base,int badcheck)2177 mrb_cstr_to_inum(mrb_state *mrb, const char *str, int base, int badcheck)
2178 {
2179 return mrb_str_len_to_inum(mrb, str, strlen(str), base, badcheck);
2180 }
2181
2182 MRB_API const char*
mrb_string_value_cstr(mrb_state * mrb,mrb_value * ptr)2183 mrb_string_value_cstr(mrb_state *mrb, mrb_value *ptr)
2184 {
2185 mrb_value str = mrb_to_str(mrb, *ptr);
2186 struct RString *ps = mrb_str_ptr(str);
2187 mrb_int len = mrb_str_strlen(mrb, ps);
2188 char *p = RSTR_PTR(ps);
2189
2190 if (!p || p[len] != '\0') {
2191 if (MRB_FROZEN_P(ps)) {
2192 *ptr = str = mrb_str_dup(mrb, str);
2193 ps = mrb_str_ptr(str);
2194 }
2195 mrb_str_modify(mrb, ps);
2196 return RSTR_PTR(ps);
2197 }
2198 return p;
2199 }
2200
2201 MRB_API mrb_value
mrb_str_to_inum(mrb_state * mrb,mrb_value str,mrb_int base,mrb_bool badcheck)2202 mrb_str_to_inum(mrb_state *mrb, mrb_value str, mrb_int base, mrb_bool badcheck)
2203 {
2204 const char *s;
2205 mrb_int len;
2206
2207 s = mrb_string_value_ptr(mrb, str);
2208 len = RSTRING_LEN(str);
2209 return mrb_str_len_to_inum(mrb, s, len, base, badcheck);
2210 }
2211
2212 /* 15.2.10.5.38 */
2213 /*
2214 * call-seq:
2215 * str.to_i(base=10) => integer
2216 *
2217 * Returns the result of interpreting leading characters in <i>str</i> as an
2218 * integer base <i>base</i> (between 2 and 36). Extraneous characters past the
2219 * end of a valid number are ignored. If there is not a valid number at the
2220 * start of <i>str</i>, <code>0</code> is returned. This method never raises an
2221 * exception.
2222 *
2223 * "12345".to_i #=> 12345
2224 * "99 red balloons".to_i #=> 99
2225 * "0a".to_i #=> 0
2226 * "0a".to_i(16) #=> 10
2227 * "hello".to_i #=> 0
2228 * "1100101".to_i(2) #=> 101
2229 * "1100101".to_i(8) #=> 294977
2230 * "1100101".to_i(10) #=> 1100101
2231 * "1100101".to_i(16) #=> 17826049
2232 */
2233 static mrb_value
mrb_str_to_i(mrb_state * mrb,mrb_value self)2234 mrb_str_to_i(mrb_state *mrb, mrb_value self)
2235 {
2236 mrb_int base = 10;
2237
2238 mrb_get_args(mrb, "|i", &base);
2239 if (base < 0) {
2240 mrb_raisef(mrb, E_ARGUMENT_ERROR, "illegal radix %S", mrb_fixnum_value(base));
2241 }
2242 return mrb_str_to_inum(mrb, self, base, FALSE);
2243 }
2244
2245 #ifndef MRB_WITHOUT_FLOAT
2246 MRB_API double
mrb_cstr_to_dbl(mrb_state * mrb,const char * p,mrb_bool badcheck)2247 mrb_cstr_to_dbl(mrb_state *mrb, const char * p, mrb_bool badcheck)
2248 {
2249 char *end;
2250 char buf[DBL_DIG * 4 + 10];
2251 double d;
2252
2253 enum {max_width = 20};
2254
2255 if (!p) return 0.0;
2256 while (ISSPACE(*p)) p++;
2257
2258 if (!badcheck && p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {
2259 return 0.0;
2260 }
2261 d = mrb_float_read(p, &end);
2262 if (p == end) {
2263 if (badcheck) {
2264 bad:
2265 mrb_raisef(mrb, E_ARGUMENT_ERROR, "invalid string for float(%S)", mrb_str_new_cstr(mrb, p));
2266 /* not reached */
2267 }
2268 return d;
2269 }
2270 if (*end) {
2271 char *n = buf;
2272 char *e = buf + sizeof(buf) - 1;
2273 char prev = 0;
2274
2275 while (p < end && n < e) prev = *n++ = *p++;
2276 while (*p) {
2277 if (*p == '_') {
2278 /* remove underscores between digits */
2279 if (badcheck) {
2280 if (n == buf || !ISDIGIT(prev)) goto bad;
2281 ++p;
2282 if (!ISDIGIT(*p)) goto bad;
2283 }
2284 else {
2285 while (*++p == '_');
2286 continue;
2287 }
2288 }
2289 prev = *p++;
2290 if (n < e) *n++ = prev;
2291 }
2292 *n = '\0';
2293 p = buf;
2294
2295 if (!badcheck && p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {
2296 return 0.0;
2297 }
2298
2299 d = mrb_float_read(p, &end);
2300 if (badcheck) {
2301 if (!end || p == end) goto bad;
2302 while (*end && ISSPACE(*end)) end++;
2303 if (*end) goto bad;
2304 }
2305 }
2306 return d;
2307 }
2308
2309 MRB_API double
mrb_str_to_dbl(mrb_state * mrb,mrb_value str,mrb_bool badcheck)2310 mrb_str_to_dbl(mrb_state *mrb, mrb_value str, mrb_bool badcheck)
2311 {
2312 char *s;
2313 mrb_int len;
2314
2315 mrb_to_str(mrb, str);
2316 s = RSTRING_PTR(str);
2317 len = RSTRING_LEN(str);
2318 if (s) {
2319 if (badcheck && memchr(s, '\0', len)) {
2320 mrb_raise(mrb, E_ARGUMENT_ERROR, "string for Float contains null byte");
2321 }
2322 if (s[len]) { /* no sentinel somehow */
2323 struct RString *temp_str = str_new(mrb, s, len);
2324 s = RSTR_PTR(temp_str);
2325 }
2326 }
2327 return mrb_cstr_to_dbl(mrb, s, badcheck);
2328 }
2329
2330 /* 15.2.10.5.39 */
2331 /*
2332 * call-seq:
2333 * str.to_f => float
2334 *
2335 * Returns the result of interpreting leading characters in <i>str</i> as a
2336 * floating point number. Extraneous characters past the end of a valid number
2337 * are ignored. If there is not a valid number at the start of <i>str</i>,
2338 * <code>0.0</code> is returned. This method never raises an exception.
2339 *
2340 * "123.45e1".to_f #=> 1234.5
2341 * "45.67 degrees".to_f #=> 45.67
2342 * "thx1138".to_f #=> 0.0
2343 */
2344 static mrb_value
mrb_str_to_f(mrb_state * mrb,mrb_value self)2345 mrb_str_to_f(mrb_state *mrb, mrb_value self)
2346 {
2347 return mrb_float_value(mrb, mrb_str_to_dbl(mrb, self, FALSE));
2348 }
2349 #endif
2350
2351 /* 15.2.10.5.40 */
2352 /*
2353 * call-seq:
2354 * str.to_s => str
2355 *
2356 * Returns the receiver.
2357 */
2358 static mrb_value
mrb_str_to_s(mrb_state * mrb,mrb_value self)2359 mrb_str_to_s(mrb_state *mrb, mrb_value self)
2360 {
2361 if (mrb_obj_class(mrb, self) != mrb->string_class) {
2362 return mrb_str_dup(mrb, self);
2363 }
2364 return self;
2365 }
2366
2367 /* 15.2.10.5.43 */
2368 /*
2369 * call-seq:
2370 * str.upcase! => str or nil
2371 *
2372 * Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
2373 * were made.
2374 */
2375 static mrb_value
mrb_str_upcase_bang(mrb_state * mrb,mrb_value str)2376 mrb_str_upcase_bang(mrb_state *mrb, mrb_value str)
2377 {
2378 struct RString *s = mrb_str_ptr(str);
2379 char *p, *pend;
2380 mrb_bool modify = FALSE;
2381
2382 mrb_str_modify(mrb, s);
2383 p = RSTRING_PTR(str);
2384 pend = RSTRING_END(str);
2385 while (p < pend) {
2386 if (ISLOWER(*p)) {
2387 *p = TOUPPER(*p);
2388 modify = TRUE;
2389 }
2390 p++;
2391 }
2392
2393 if (modify) return str;
2394 return mrb_nil_value();
2395 }
2396
2397 /* 15.2.10.5.42 */
2398 /*
2399 * call-seq:
2400 * str.upcase => new_str
2401 *
2402 * Returns a copy of <i>str</i> with all lowercase letters replaced with their
2403 * uppercase counterparts. The operation is locale insensitive---only
2404 * characters 'a' to 'z' are affected.
2405 *
2406 * "hEllO".upcase #=> "HELLO"
2407 */
2408 static mrb_value
mrb_str_upcase(mrb_state * mrb,mrb_value self)2409 mrb_str_upcase(mrb_state *mrb, mrb_value self)
2410 {
2411 mrb_value str;
2412
2413 str = mrb_str_dup(mrb, self);
2414 mrb_str_upcase_bang(mrb, str);
2415 return str;
2416 }
2417
2418 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
2419
2420 /*
2421 * call-seq:
2422 * str.dump -> new_str
2423 *
2424 * Produces a version of <i>str</i> with all nonprinting characters replaced by
2425 * <code>\nnn</code> notation and all special characters escaped.
2426 */
2427 mrb_value
mrb_str_dump(mrb_state * mrb,mrb_value str)2428 mrb_str_dump(mrb_state *mrb, mrb_value str)
2429 {
2430 mrb_int len;
2431 const char *p, *pend;
2432 char *q;
2433 struct RString *result;
2434
2435 len = 2; /* "" */
2436 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
2437 while (p < pend) {
2438 unsigned char c = *p++;
2439 switch (c) {
2440 case '"': case '\\':
2441 case '\n': case '\r':
2442 case '\t': case '\f':
2443 case '\013': case '\010': case '\007': case '\033':
2444 len += 2;
2445 break;
2446
2447 case '#':
2448 len += IS_EVSTR(p, pend) ? 2 : 1;
2449 break;
2450
2451 default:
2452 if (ISPRINT(c)) {
2453 len++;
2454 }
2455 else {
2456 len += 4; /* \NNN */
2457 }
2458 break;
2459 }
2460 }
2461
2462 result = str_new(mrb, 0, len);
2463 str_with_class(mrb, result, str);
2464 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
2465 q = RSTR_PTR(result);
2466 *q++ = '"';
2467 while (p < pend) {
2468 unsigned char c = *p++;
2469
2470 switch (c) {
2471 case '"':
2472 case '\\':
2473 *q++ = '\\';
2474 *q++ = c;
2475 break;
2476
2477 case '\n':
2478 *q++ = '\\';
2479 *q++ = 'n';
2480 break;
2481
2482 case '\r':
2483 *q++ = '\\';
2484 *q++ = 'r';
2485 break;
2486
2487 case '\t':
2488 *q++ = '\\';
2489 *q++ = 't';
2490 break;
2491
2492 case '\f':
2493 *q++ = '\\';
2494 *q++ = 'f';
2495 break;
2496
2497 case '\013':
2498 *q++ = '\\';
2499 *q++ = 'v';
2500 break;
2501
2502 case '\010':
2503 *q++ = '\\';
2504 *q++ = 'b';
2505 break;
2506
2507 case '\007':
2508 *q++ = '\\';
2509 *q++ = 'a';
2510 break;
2511
2512 case '\033':
2513 *q++ = '\\';
2514 *q++ = 'e';
2515 break;
2516
2517 case '#':
2518 if (IS_EVSTR(p, pend)) *q++ = '\\';
2519 *q++ = '#';
2520 break;
2521
2522 default:
2523 if (ISPRINT(c)) {
2524 *q++ = c;
2525 }
2526 else {
2527 *q++ = '\\';
2528 *q++ = 'x';
2529 q[1] = mrb_digitmap[c % 16]; c /= 16;
2530 q[0] = mrb_digitmap[c % 16];
2531 q += 2;
2532 }
2533 }
2534 }
2535 *q = '"';
2536 return mrb_obj_value(result);
2537 }
2538
2539 MRB_API mrb_value
mrb_str_cat(mrb_state * mrb,mrb_value str,const char * ptr,size_t len)2540 mrb_str_cat(mrb_state *mrb, mrb_value str, const char *ptr, size_t len)
2541 {
2542 struct RString *s = mrb_str_ptr(str);
2543 size_t capa;
2544 size_t total;
2545 ptrdiff_t off = -1;
2546
2547 if (len == 0) return str;
2548 mrb_str_modify(mrb, s);
2549 if (ptr >= RSTR_PTR(s) && ptr <= RSTR_PTR(s) + (size_t)RSTR_LEN(s)) {
2550 off = ptr - RSTR_PTR(s);
2551 }
2552
2553 capa = RSTR_CAPA(s);
2554 total = RSTR_LEN(s)+len;
2555 if (total >= MRB_INT_MAX) {
2556 size_error:
2557 mrb_raise(mrb, E_ARGUMENT_ERROR, "string size too big");
2558 }
2559 if (capa <= total) {
2560 if (capa == 0) capa = 1;
2561 while (capa <= total) {
2562 if (capa <= MRB_INT_MAX / 2) {
2563 capa *= 2;
2564 }
2565 else {
2566 capa = total+1;
2567 }
2568 }
2569 if (capa <= total || capa > MRB_INT_MAX) {
2570 goto size_error;
2571 }
2572 resize_capa(mrb, s, capa);
2573 }
2574 if (off != -1) {
2575 ptr = RSTR_PTR(s) + off;
2576 }
2577 memcpy(RSTR_PTR(s) + RSTR_LEN(s), ptr, len);
2578 mrb_assert_int_fit(size_t, total, mrb_int, MRB_INT_MAX);
2579 RSTR_SET_LEN(s, total);
2580 RSTR_PTR(s)[total] = '\0'; /* sentinel */
2581 return str;
2582 }
2583
2584 MRB_API mrb_value
mrb_str_cat_cstr(mrb_state * mrb,mrb_value str,const char * ptr)2585 mrb_str_cat_cstr(mrb_state *mrb, mrb_value str, const char *ptr)
2586 {
2587 return mrb_str_cat(mrb, str, ptr, strlen(ptr));
2588 }
2589
2590 MRB_API mrb_value
mrb_str_cat_str(mrb_state * mrb,mrb_value str,mrb_value str2)2591 mrb_str_cat_str(mrb_state *mrb, mrb_value str, mrb_value str2)
2592 {
2593 if (mrb_str_ptr(str) == mrb_str_ptr(str2)) {
2594 mrb_str_modify(mrb, mrb_str_ptr(str));
2595 }
2596 return mrb_str_cat(mrb, str, RSTRING_PTR(str2), RSTRING_LEN(str2));
2597 }
2598
2599 MRB_API mrb_value
mrb_str_append(mrb_state * mrb,mrb_value str1,mrb_value str2)2600 mrb_str_append(mrb_state *mrb, mrb_value str1, mrb_value str2)
2601 {
2602 mrb_to_str(mrb, str2);
2603 return mrb_str_cat_str(mrb, str1, str2);
2604 }
2605
2606 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
2607
2608 /*
2609 * call-seq:
2610 * str.inspect -> string
2611 *
2612 * Returns a printable version of _str_, surrounded by quote marks,
2613 * with special characters escaped.
2614 *
2615 * str = "hello"
2616 * str[3] = "\b"
2617 * str.inspect #=> "\"hel\\bo\""
2618 */
2619 mrb_value
mrb_str_inspect(mrb_state * mrb,mrb_value str)2620 mrb_str_inspect(mrb_state *mrb, mrb_value str)
2621 {
2622 const char *p, *pend;
2623 char buf[CHAR_ESC_LEN + 1];
2624 mrb_value result = mrb_str_new_lit(mrb, "\"");
2625
2626 p = RSTRING_PTR(str); pend = RSTRING_END(str);
2627 for (;p < pend; p++) {
2628 unsigned char c, cc;
2629 #ifdef MRB_UTF8_STRING
2630 mrb_int clen;
2631
2632 clen = utf8len(p, pend);
2633 if (clen > 1) {
2634 mrb_int i;
2635
2636 for (i=0; i<clen; i++) {
2637 buf[i] = p[i];
2638 }
2639 mrb_str_cat(mrb, result, buf, clen);
2640 p += clen-1;
2641 continue;
2642 }
2643 #endif
2644 c = *p;
2645 if (c == '"'|| c == '\\' || (c == '#' && IS_EVSTR(p+1, pend))) {
2646 buf[0] = '\\'; buf[1] = c;
2647 mrb_str_cat(mrb, result, buf, 2);
2648 continue;
2649 }
2650 if (ISPRINT(c)) {
2651 buf[0] = c;
2652 mrb_str_cat(mrb, result, buf, 1);
2653 continue;
2654 }
2655 switch (c) {
2656 case '\n': cc = 'n'; break;
2657 case '\r': cc = 'r'; break;
2658 case '\t': cc = 't'; break;
2659 case '\f': cc = 'f'; break;
2660 case '\013': cc = 'v'; break;
2661 case '\010': cc = 'b'; break;
2662 case '\007': cc = 'a'; break;
2663 case 033: cc = 'e'; break;
2664 default: cc = 0; break;
2665 }
2666 if (cc) {
2667 buf[0] = '\\';
2668 buf[1] = (char)cc;
2669 mrb_str_cat(mrb, result, buf, 2);
2670 continue;
2671 }
2672 else {
2673 buf[0] = '\\';
2674 buf[1] = 'x';
2675 buf[3] = mrb_digitmap[c % 16]; c /= 16;
2676 buf[2] = mrb_digitmap[c % 16];
2677 mrb_str_cat(mrb, result, buf, 4);
2678 continue;
2679 }
2680 }
2681 mrb_str_cat_lit(mrb, result, "\"");
2682
2683 return result;
2684 }
2685
2686 /*
2687 * call-seq:
2688 * str.bytes -> array of fixnums
2689 *
2690 * Returns an array of bytes in _str_.
2691 *
2692 * str = "hello"
2693 * str.bytes #=> [104, 101, 108, 108, 111]
2694 */
2695 static mrb_value
mrb_str_bytes(mrb_state * mrb,mrb_value str)2696 mrb_str_bytes(mrb_state *mrb, mrb_value str)
2697 {
2698 struct RString *s = mrb_str_ptr(str);
2699 mrb_value a = mrb_ary_new_capa(mrb, RSTR_LEN(s));
2700 unsigned char *p = (unsigned char *)(RSTR_PTR(s)), *pend = p + RSTR_LEN(s);
2701
2702 while (p < pend) {
2703 mrb_ary_push(mrb, a, mrb_fixnum_value(p[0]));
2704 p++;
2705 }
2706 return a;
2707 }
2708
2709 /* ---------------------------*/
2710 void
mrb_init_string(mrb_state * mrb)2711 mrb_init_string(mrb_state *mrb)
2712 {
2713 struct RClass *s;
2714
2715 mrb_static_assert(RSTRING_EMBED_LEN_MAX < (1 << 5), "pointer size too big for embedded string");
2716
2717 mrb->string_class = s = mrb_define_class(mrb, "String", mrb->object_class); /* 15.2.10 */
2718 MRB_SET_INSTANCE_TT(s, MRB_TT_STRING);
2719
2720 mrb_define_method(mrb, s, "bytesize", mrb_str_bytesize, MRB_ARGS_NONE());
2721
2722 mrb_define_method(mrb, s, "<=>", mrb_str_cmp_m, MRB_ARGS_REQ(1)); /* 15.2.10.5.1 */
2723 mrb_define_method(mrb, s, "==", mrb_str_equal_m, MRB_ARGS_REQ(1)); /* 15.2.10.5.2 */
2724 mrb_define_method(mrb, s, "+", mrb_str_plus_m, MRB_ARGS_REQ(1)); /* 15.2.10.5.4 */
2725 mrb_define_method(mrb, s, "*", mrb_str_times, MRB_ARGS_REQ(1)); /* 15.2.10.5.5 */
2726 mrb_define_method(mrb, s, "[]", mrb_str_aref_m, MRB_ARGS_ANY()); /* 15.2.10.5.6 */
2727 mrb_define_method(mrb, s, "capitalize", mrb_str_capitalize, MRB_ARGS_NONE()); /* 15.2.10.5.7 */
2728 mrb_define_method(mrb, s, "capitalize!", mrb_str_capitalize_bang, MRB_ARGS_NONE()); /* 15.2.10.5.8 */
2729 mrb_define_method(mrb, s, "chomp", mrb_str_chomp, MRB_ARGS_ANY()); /* 15.2.10.5.9 */
2730 mrb_define_method(mrb, s, "chomp!", mrb_str_chomp_bang, MRB_ARGS_ANY()); /* 15.2.10.5.10 */
2731 mrb_define_method(mrb, s, "chop", mrb_str_chop, MRB_ARGS_NONE()); /* 15.2.10.5.11 */
2732 mrb_define_method(mrb, s, "chop!", mrb_str_chop_bang, MRB_ARGS_NONE()); /* 15.2.10.5.12 */
2733 mrb_define_method(mrb, s, "downcase", mrb_str_downcase, MRB_ARGS_NONE()); /* 15.2.10.5.13 */
2734 mrb_define_method(mrb, s, "downcase!", mrb_str_downcase_bang, MRB_ARGS_NONE()); /* 15.2.10.5.14 */
2735 mrb_define_method(mrb, s, "empty?", mrb_str_empty_p, MRB_ARGS_NONE()); /* 15.2.10.5.16 */
2736 mrb_define_method(mrb, s, "eql?", mrb_str_eql, MRB_ARGS_REQ(1)); /* 15.2.10.5.17 */
2737
2738 mrb_define_method(mrb, s, "hash", mrb_str_hash_m, MRB_ARGS_NONE()); /* 15.2.10.5.20 */
2739 mrb_define_method(mrb, s, "include?", mrb_str_include, MRB_ARGS_REQ(1)); /* 15.2.10.5.21 */
2740 mrb_define_method(mrb, s, "index", mrb_str_index_m, MRB_ARGS_ANY()); /* 15.2.10.5.22 */
2741 mrb_define_method(mrb, s, "initialize", mrb_str_init, MRB_ARGS_REQ(1)); /* 15.2.10.5.23 */
2742 mrb_define_method(mrb, s, "initialize_copy", mrb_str_replace, MRB_ARGS_REQ(1)); /* 15.2.10.5.24 */
2743 mrb_define_method(mrb, s, "intern", mrb_str_intern, MRB_ARGS_NONE()); /* 15.2.10.5.25 */
2744 mrb_define_method(mrb, s, "length", mrb_str_size, MRB_ARGS_NONE()); /* 15.2.10.5.26 */
2745 mrb_define_method(mrb, s, "replace", mrb_str_replace, MRB_ARGS_REQ(1)); /* 15.2.10.5.28 */
2746 mrb_define_method(mrb, s, "reverse", mrb_str_reverse, MRB_ARGS_NONE()); /* 15.2.10.5.29 */
2747 mrb_define_method(mrb, s, "reverse!", mrb_str_reverse_bang, MRB_ARGS_NONE()); /* 15.2.10.5.30 */
2748 mrb_define_method(mrb, s, "rindex", mrb_str_rindex, MRB_ARGS_ANY()); /* 15.2.10.5.31 */
2749 mrb_define_method(mrb, s, "size", mrb_str_size, MRB_ARGS_NONE()); /* 15.2.10.5.33 */
2750 mrb_define_method(mrb, s, "slice", mrb_str_aref_m, MRB_ARGS_ANY()); /* 15.2.10.5.34 */
2751 mrb_define_method(mrb, s, "split", mrb_str_split_m, MRB_ARGS_ANY()); /* 15.2.10.5.35 */
2752
2753 #ifndef MRB_WITHOUT_FLOAT
2754 mrb_define_method(mrb, s, "to_f", mrb_str_to_f, MRB_ARGS_NONE()); /* 15.2.10.5.38 */
2755 #endif
2756 mrb_define_method(mrb, s, "to_i", mrb_str_to_i, MRB_ARGS_ANY()); /* 15.2.10.5.39 */
2757 mrb_define_method(mrb, s, "to_s", mrb_str_to_s, MRB_ARGS_NONE()); /* 15.2.10.5.40 */
2758 mrb_define_method(mrb, s, "to_str", mrb_str_to_s, MRB_ARGS_NONE());
2759 mrb_define_method(mrb, s, "to_sym", mrb_str_intern, MRB_ARGS_NONE()); /* 15.2.10.5.41 */
2760 mrb_define_method(mrb, s, "upcase", mrb_str_upcase, MRB_ARGS_NONE()); /* 15.2.10.5.42 */
2761 mrb_define_method(mrb, s, "upcase!", mrb_str_upcase_bang, MRB_ARGS_NONE()); /* 15.2.10.5.43 */
2762 mrb_define_method(mrb, s, "inspect", mrb_str_inspect, MRB_ARGS_NONE()); /* 15.2.10.5.46(x) */
2763 mrb_define_method(mrb, s, "bytes", mrb_str_bytes, MRB_ARGS_NONE());
2764 }
2765
2766 #ifndef MRB_WITHOUT_FLOAT
2767 /*
2768 * Source code for the "strtod" library procedure.
2769 *
2770 * Copyright (c) 1988-1993 The Regents of the University of California.
2771 * Copyright (c) 1994 Sun Microsystems, Inc.
2772 *
2773 * Permission to use, copy, modify, and distribute this
2774 * software and its documentation for any purpose and without
2775 * fee is hereby granted, provided that the above copyright
2776 * notice appear in all copies. The University of California
2777 * makes no representations about the suitability of this
2778 * software for any purpose. It is provided "as is" without
2779 * express or implied warranty.
2780 *
2781 * RCS: @(#) $Id: strtod.c 11708 2007-02-12 23:01:19Z shyouhei $
2782 */
2783
2784 #include <ctype.h>
2785 #include <errno.h>
2786
static const int maxExponent = 511;   /* Largest possible base 10 exponent.  Any
                                       * exponent larger than this will already
                                       * produce underflow or overflow, so there's
                                       * no need to worry about additional digits.
                                       */
static const double powersOf10[] = {  /* Table giving binary powers of 10.  Entry */
  10.,                                /* is 10^2^i.  Used to convert decimal */
  100.,                               /* exponents into floating-point numbers. */
  1.0e4,
  1.0e8,
  1.0e16,
  1.0e32,
  1.0e64,
  1.0e128,
  1.0e256
};

/*
 * Reads a decimal ASCII floating-point number from the start of `string`.
 *
 * string - the text to parse, optionally preceded by white space.  It must
 *          have the form "-I.FE-X", where I is the integer part of the
 *          mantissa, F is the fractional part of the mantissa, and X is the
 *          exponent.  Either of the signs may be "+", "-", or omitted.
 *          Either I or F may be omitted, or both.  The decimal point isn't
 *          necessary unless F is present.  The "E" may actually be an "e".
 *          E and X may both be omitted (but not just one): an "E" that is
 *          not followed by at least one digit is not part of the number and
 *          is left unconsumed.
 * endPtr - if non-NULL, receives the address of the first character that
 *          was not consumed.  It is set to `string` itself when no mantissa
 *          digit was found.
 *
 * Returns the value read; without mantissa digits that is zero, negated
 * when a leading '-' was present.  errno is set to ERANGE when the magnitude
 * of the decimal exponent exceeds maxExponent, in which case the exponent is
 * clamped to maxExponent.
 */
MRB_API double
mrb_float_read(const char *string, char **endPtr)
{
  int sign, expSign = FALSE;
  double fraction, dblExp;
  const double *d;
  const char *p;
  int c;
  int exp = 0;                /* Exponent read from "EX" field. */
  int fracExp = 0;            /* Exponent that derives from the fractional
                               * part.  Under normal circumstances, it is
                               * the negative of the number of digits in F.
                               * However, if I is very long, the last digits
                               * of I get dropped (otherwise a long I with a
                               * large negative exponent could cause an
                               * unnecessary overflow on I alone).  In this
                               * case, fracExp is incremented one for each
                               * dropped digit. */
  int mantSize;               /* Number of digits in mantissa. */
  int decPt;                  /* Number of mantissa digits BEFORE decimal
                               * point. */
  const char *pExp;           /* Temporarily holds location of exponent
                               * in string. */

  /*
   * Strip off leading blanks and check for a sign.  The <ctype.h>
   * classifiers are only defined for unsigned char values (and EOF), so
   * every plain char is converted before it is classified.
   */

  p = string;
  while (isspace((unsigned char)*p)) {
    p += 1;
  }
  if (*p == '-') {
    sign = TRUE;
    p += 1;
  }
  else {
    if (*p == '+') {
      p += 1;
    }
    sign = FALSE;
  }

  /*
   * Count the number of digits in the mantissa (including the decimal
   * point), and also locate the decimal point.
   */

  decPt = -1;
  for (mantSize = 0; ; mantSize += 1) {
    c = (unsigned char)*p;
    if (!isdigit(c)) {
      if ((c != '.') || (decPt >= 0)) {
        break;
      }
      decPt = mantSize;
    }
    p += 1;
  }

  /*
   * Now suck up the digits in the mantissa.  Use two integers to
   * collect 9 digits each (this is faster than using floating-point).
   * If the mantissa has more than 18 digits, ignore the extras, since
   * they can't affect the value anyway.
   */

  pExp = p;
  p -= mantSize;
  if (decPt < 0) {
    decPt = mantSize;
  }
  else {
    mantSize -= 1;            /* One of the digits was the point. */
  }
  if (mantSize > 18) {
    if (decPt - 18 > 29999) {
      fracExp = 29999;
    }
    else {
      fracExp = decPt - 18;
    }
    mantSize = 18;
  }
  else {
    fracExp = decPt - mantSize;
  }
  if (mantSize == 0) {
    /* no digits at all: report that nothing was consumed */
    fraction = 0.0;
    p = string;
    goto done;
  }
  else {
    int frac1, frac2;

    frac1 = 0;
    for ( ; mantSize > 9; mantSize -= 1) {
      c = *p;
      p += 1;
      if (c == '.') {
        c = *p;
        p += 1;
      }
      frac1 = 10*frac1 + (c - '0');
    }
    frac2 = 0;
    for (; mantSize > 0; mantSize -= 1) {
      c = *p;
      p += 1;
      if (c == '.') {
        c = *p;
        p += 1;
      }
      frac2 = 10*frac2 + (c - '0');
    }
    fraction = (1.0e9 * frac1) + frac2;
  }

  /*
   * Skim off the exponent.
   */

  p = pExp;
  if ((*p == 'E') || (*p == 'e')) {
    p += 1;
    if (*p == '-') {
      expSign = TRUE;
      p += 1;
    }
    else {
      if (*p == '+') {
        p += 1;
      }
      expSign = FALSE;
    }
    if (!isdigit((unsigned char)*p)) {
      /*
       * "E" (and an optional sign) without digits is not an exponent:
       * leave it unconsumed so callers see it as trailing text.
       */
      p = pExp;
      expSign = FALSE;
    }
    else {
      while (isdigit((unsigned char)*p)) {
        exp = exp * 10 + (*p - '0');
        if (exp > 19999) {
          exp = 19999;        /* clamp so the int cannot overflow */
        }
        p += 1;
      }
    }
  }
  if (expSign) {
    exp = fracExp - exp;
  }
  else {
    exp = fracExp + exp;
  }

  /*
   * Generate a floating-point number that represents the exponent.
   * Do this by processing the exponent one bit at a time to combine
   * many powers of 2 of 10.  Then combine the exponent with the
   * fraction.
   */

  if (exp < 0) {
    expSign = TRUE;
    exp = -exp;
  }
  else {
    expSign = FALSE;
  }
  if (exp > maxExponent) {
    exp = maxExponent;
    errno = ERANGE;
  }
  dblExp = 1.0;
  for (d = powersOf10; exp != 0; exp >>= 1, d += 1) {
    if (exp & 01) {
      dblExp *= *d;
    }
  }
  if (expSign) {
    fraction /= dblExp;
  }
  else {
    fraction *= dblExp;
  }

done:
  if (endPtr != NULL) {
    *endPtr = (char *) p;
  }

  if (sign) {
    return -fraction;
  }
  return fraction;
}
3011 #endif
3012