1 /*
2 ** string.c - String class
3 **
4 ** See Copyright Notice in mruby.h
5 */
6 
7 #ifdef _MSC_VER
8 # define _CRT_NONSTDC_NO_DEPRECATE
9 #endif
10 
11 #ifndef MRB_WITHOUT_FLOAT
12 #include <float.h>
13 #endif
14 #include <limits.h>
15 #include <stddef.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include <mruby.h>
19 #include <mruby/array.h>
20 #include <mruby/class.h>
21 #include <mruby/range.h>
22 #include <mruby/string.h>
23 #include <mruby/numeric.h>
24 #include <mruby/re.h>
25 
26 typedef struct mrb_shared_string {
27   mrb_bool nofree : 1;
28   int refcnt;
29   char *ptr;
30   mrb_int len;
31 } mrb_shared_string;
32 
/* '0'-'9' followed by 'a'-'z': entry i is the character for digit value i. */
const char mrb_digitmap[] =
  "0123456789"
  "abcdefghijklmnopqrstuvwxyz";

/* Allocate a MRB_TT_STRING object of class mrb->string_class via mrb_obj_alloc(). */
#define mrb_obj_alloc_string(mrb) \
  ((struct RString*)mrb_obj_alloc((mrb), MRB_TT_STRING, (mrb)->string_class))
36 
37 static struct RString*
str_new_static(mrb_state * mrb,const char * p,size_t len)38 str_new_static(mrb_state *mrb, const char *p, size_t len)
39 {
40   struct RString *s;
41 
42   if (len >= MRB_INT_MAX) {
43     mrb_raise(mrb, E_ARGUMENT_ERROR, "string size too big");
44   }
45   s = mrb_obj_alloc_string(mrb);
46   s->as.heap.len = (mrb_int)len;
47   s->as.heap.aux.capa = 0;             /* nofree */
48   s->as.heap.ptr = (char *)p;
49   s->flags = MRB_STR_NOFREE;
50 
51   return s;
52 }
53 
54 static struct RString*
str_new(mrb_state * mrb,const char * p,size_t len)55 str_new(mrb_state *mrb, const char *p, size_t len)
56 {
57   struct RString *s;
58 
59   if (p && mrb_ro_data_p(p)) {
60     return str_new_static(mrb, p, len);
61   }
62   s = mrb_obj_alloc_string(mrb);
63   if (len <= RSTRING_EMBED_LEN_MAX) {
64     RSTR_SET_EMBED_FLAG(s);
65     RSTR_SET_EMBED_LEN(s, len);
66     if (p) {
67       memcpy(s->as.ary, p, len);
68     }
69   }
70   else {
71     if (len >= MRB_INT_MAX) {
72       mrb_raise(mrb, E_ARGUMENT_ERROR, "string size too big");
73     }
74     s->as.heap.ptr = (char *)mrb_malloc(mrb, len+1);
75     s->as.heap.len = (mrb_int)len;
76     s->as.heap.aux.capa = (mrb_int)len;
77     if (p) {
78       memcpy(s->as.heap.ptr, p, len);
79     }
80   }
81   RSTR_PTR(s)[len] = '\0';
82   return s;
83 }
84 
85 static inline void
str_with_class(mrb_state * mrb,struct RString * s,mrb_value obj)86 str_with_class(mrb_state *mrb, struct RString *s, mrb_value obj)
87 {
88   s->c = mrb_str_ptr(obj)->c;
89 }
90 
91 static mrb_value
mrb_str_new_empty(mrb_state * mrb,mrb_value str)92 mrb_str_new_empty(mrb_state *mrb, mrb_value str)
93 {
94   struct RString *s = str_new(mrb, 0, 0);
95 
96   str_with_class(mrb, s, str);
97   return mrb_obj_value(s);
98 }
99 
100 MRB_API mrb_value
mrb_str_new_capa(mrb_state * mrb,size_t capa)101 mrb_str_new_capa(mrb_state *mrb, size_t capa)
102 {
103   struct RString *s;
104 
105   s = mrb_obj_alloc_string(mrb);
106 
107   if (capa >= MRB_INT_MAX) {
108     mrb_raise(mrb, E_ARGUMENT_ERROR, "string capacity size too big");
109   }
110   s->as.heap.len = 0;
111   s->as.heap.aux.capa = (mrb_int)capa;
112   s->as.heap.ptr = (char *)mrb_malloc(mrb, capa+1);
113   RSTR_PTR(s)[0] = '\0';
114 
115   return mrb_obj_value(s);
116 }
117 
118 #ifndef MRB_STR_BUF_MIN_SIZE
119 # define MRB_STR_BUF_MIN_SIZE 128
120 #endif
121 
122 MRB_API mrb_value
mrb_str_buf_new(mrb_state * mrb,size_t capa)123 mrb_str_buf_new(mrb_state *mrb, size_t capa)
124 {
125   if (capa < MRB_STR_BUF_MIN_SIZE) {
126     capa = MRB_STR_BUF_MIN_SIZE;
127   }
128   return mrb_str_new_capa(mrb, capa);
129 }
130 
131 static void
resize_capa(mrb_state * mrb,struct RString * s,size_t capacity)132 resize_capa(mrb_state *mrb, struct RString *s, size_t capacity)
133 {
134 #if SIZE_MAX > MRB_INT_MAX
135     mrb_assert(capacity < MRB_INT_MAX);
136 #endif
137   if (RSTR_EMBED_P(s)) {
138     if (RSTRING_EMBED_LEN_MAX < capacity) {
139       char *const tmp = (char *)mrb_malloc(mrb, capacity+1);
140       const mrb_int len = RSTR_EMBED_LEN(s);
141       memcpy(tmp, s->as.ary, len);
142       RSTR_UNSET_EMBED_FLAG(s);
143       s->as.heap.ptr = tmp;
144       s->as.heap.len = len;
145       s->as.heap.aux.capa = (mrb_int)capacity;
146     }
147   }
148   else {
149     s->as.heap.ptr = (char*)mrb_realloc(mrb, RSTR_PTR(s), capacity+1);
150     s->as.heap.aux.capa = (mrb_int)capacity;
151   }
152 }
153 
154 MRB_API mrb_value
mrb_str_new(mrb_state * mrb,const char * p,size_t len)155 mrb_str_new(mrb_state *mrb, const char *p, size_t len)
156 {
157   return mrb_obj_value(str_new(mrb, p, len));
158 }
159 
160 MRB_API mrb_value
mrb_str_new_cstr(mrb_state * mrb,const char * p)161 mrb_str_new_cstr(mrb_state *mrb, const char *p)
162 {
163   struct RString *s;
164   size_t len;
165 
166   if (p) {
167     len = strlen(p);
168   }
169   else {
170     len = 0;
171   }
172 
173   s = str_new(mrb, p, len);
174 
175   return mrb_obj_value(s);
176 }
177 
178 MRB_API mrb_value
mrb_str_new_static(mrb_state * mrb,const char * p,size_t len)179 mrb_str_new_static(mrb_state *mrb, const char *p, size_t len)
180 {
181   struct RString *s = str_new_static(mrb, p, len);
182   return mrb_obj_value(s);
183 }
184 
185 static void
str_decref(mrb_state * mrb,mrb_shared_string * shared)186 str_decref(mrb_state *mrb, mrb_shared_string *shared)
187 {
188   shared->refcnt--;
189   if (shared->refcnt == 0) {
190     if (!shared->nofree) {
191       mrb_free(mrb, shared->ptr);
192     }
193     mrb_free(mrb, shared);
194   }
195 }
196 
197 void
mrb_gc_free_str(mrb_state * mrb,struct RString * str)198 mrb_gc_free_str(mrb_state *mrb, struct RString *str)
199 {
200   if (RSTR_EMBED_P(str))
201     /* no code */;
202   else if (RSTR_SHARED_P(str))
203     str_decref(mrb, str->as.heap.aux.shared);
204   else if (!RSTR_NOFREE_P(str) && !RSTR_FSHARED_P(str))
205     mrb_free(mrb, str->as.heap.ptr);
206 }
207 
208 #ifdef MRB_UTF8_STRING
/*
 * Sequence length selected by each possible first byte:
 * 1 for 0x00-0xbf, 2 for 0xc0-0xdf, 3 for 0xe0-0xef, 4 for 0xf0-0xf4,
 * and 1 for 0xf5-0xff.
 */
static const char utf8len_codepage[256] =
{
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* 0x00 */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* 0x10 */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* 0x20 */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* 0x30 */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* 0x40 */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* 0x50 */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* 0x60 */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* 0x70 */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* 0x80 */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* 0x90 */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* 0xa0 */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* 0xb0 */
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  /* 0xc0 */
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  /* 0xd0 */
  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,  /* 0xe0 */
  4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1,  /* 0xf0 */
};
220 
221 static mrb_int
utf8len(const char * p,const char * e)222 utf8len(const char* p, const char* e)
223 {
224   mrb_int len;
225   mrb_int i;
226 
227   len = utf8len_codepage[(unsigned char)*p];
228   if (p + len > e) return 1;
229   for (i = 1; i < len; ++i)
230     if ((p[i] & 0xc0) != 0x80)
231       return 1;
232   return len;
233 }
234 
235 mrb_int
mrb_utf8_len(const char * str,mrb_int byte_len)236 mrb_utf8_len(const char *str, mrb_int byte_len)
237 {
238   mrb_int total = 0;
239   const char *p = str;
240   const char *e = p + byte_len;
241 
242   while (p < e) {
243     p += utf8len(p, e);
244     total++;
245   }
246   return total;
247 }
248 
249 static mrb_int
utf8_strlen(mrb_value str)250 utf8_strlen(mrb_value str)
251 {
252   mrb_int byte_len = RSTRING_LEN(str);
253 
254   if (RSTRING(str)->flags & MRB_STR_NO_UTF) {
255     return byte_len;
256   }
257   else {
258     mrb_int utf8_len = mrb_utf8_len(RSTRING_PTR(str), byte_len);
259     if (byte_len == utf8_len) RSTRING(str)->flags |= MRB_STR_NO_UTF;
260     return utf8_len;
261   }
262 }
263 
264 #define RSTRING_CHAR_LEN(s) utf8_strlen(s)
265 
266 /* map character index to byte offset index */
267 static mrb_int
chars2bytes(mrb_value s,mrb_int off,mrb_int idx)268 chars2bytes(mrb_value s, mrb_int off, mrb_int idx)
269 {
270   mrb_int i, b, n;
271   const char *p = RSTRING_PTR(s) + off;
272   const char *e = RSTRING_END(s);
273 
274   for (b=i=0; p<e && i<idx; i++) {
275     n = utf8len(p, e);
276     b += n;
277     p += n;
278   }
279   return b;
280 }
281 
282 /* map byte offset to character index */
283 static mrb_int
bytes2chars(char * p,mrb_int bi)284 bytes2chars(char *p, mrb_int bi)
285 {
286   mrb_int i, b, n;
287 
288   for (b=i=0; b<bi; i++) {
289     n = utf8len_codepage[(unsigned char)*p];
290     b += n;
291     p += n;
292   }
293   if (b != bi) return -1;
294   return i;
295 }
296 
297 #define BYTES_ALIGN_CHECK(pos) if (pos < 0) return mrb_nil_value();
298 #else
/* Without MRB_UTF8_STRING a character is a byte, so the mappings are identities. */
#define RSTRING_CHAR_LEN(s) RSTRING_LEN(s)
#define chars2bytes(p, off, ci) (ci)
#define bytes2chars(p, bi) (bi)
/* No alignment check is needed in this configuration. */
#define BYTES_ALIGN_CHECK(pos)
303 #endif
304 
305 static inline mrb_int
mrb_memsearch_qs(const unsigned char * xs,mrb_int m,const unsigned char * ys,mrb_int n)306 mrb_memsearch_qs(const unsigned char *xs, mrb_int m, const unsigned char *ys, mrb_int n)
307 {
308   const unsigned char *x = xs, *xe = xs + m;
309   const unsigned char *y = ys;
310   int i;
311   ptrdiff_t qstable[256];
312 
313   /* Preprocessing */
314   for (i = 0; i < 256; ++i)
315     qstable[i] = m + 1;
316   for (; x < xe; ++x)
317     qstable[*x] = xe - x;
318   /* Searching */
319   for (; y + m <= ys + n; y += *(qstable + y[m])) {
320     if (*xs == *y && memcmp(xs, y, m) == 0)
321       return (mrb_int)(y - ys);
322   }
323   return -1;
324 }
325 
326 static mrb_int
mrb_memsearch(const void * x0,mrb_int m,const void * y0,mrb_int n)327 mrb_memsearch(const void *x0, mrb_int m, const void *y0, mrb_int n)
328 {
329   const unsigned char *x = (const unsigned char *)x0, *y = (const unsigned char *)y0;
330 
331   if (m > n) return -1;
332   else if (m == n) {
333     return memcmp(x0, y0, m) == 0 ? 0 : -1;
334   }
335   else if (m < 1) {
336     return 0;
337   }
338   else if (m == 1) {
339     const unsigned char *ys = (const unsigned char *)memchr(y, *x, n);
340 
341     if (ys)
342       return (mrb_int)(ys - y);
343     else
344       return -1;
345   }
346   return mrb_memsearch_qs((const unsigned char *)x0, m, (const unsigned char *)y0, n);
347 }
348 
349 static void
str_make_shared(mrb_state * mrb,struct RString * orig,struct RString * s)350 str_make_shared(mrb_state *mrb, struct RString *orig, struct RString *s)
351 {
352   mrb_shared_string *shared;
353   mrb_int len = RSTR_LEN(orig);
354 
355   mrb_assert(!RSTR_EMBED_P(orig));
356   if (RSTR_SHARED_P(orig)) {
357     shared = orig->as.heap.aux.shared;
358     shared->refcnt++;
359     s->as.heap.ptr = orig->as.heap.ptr;
360     s->as.heap.len = len;
361     s->as.heap.aux.shared = shared;
362     RSTR_SET_SHARED_FLAG(s);
363     RSTR_UNSET_EMBED_FLAG(s);
364   }
365   else if (RSTR_FSHARED_P(orig)) {
366     struct RString *fs;
367 
368     fs = orig->as.heap.aux.fshared;
369     s->as.heap.ptr = orig->as.heap.ptr;
370     s->as.heap.len = len;
371     s->as.heap.aux.fshared = fs;
372     RSTR_SET_FSHARED_FLAG(s);
373     RSTR_UNSET_EMBED_FLAG(s);
374   }
375   else if (MRB_FROZEN_P(orig) && !RSTR_POOL_P(orig)) {
376     s->as.heap.ptr = orig->as.heap.ptr;
377     s->as.heap.len = len;
378     s->as.heap.aux.fshared = orig;
379     RSTR_SET_FSHARED_FLAG(s);
380     RSTR_UNSET_EMBED_FLAG(s);
381   }
382   else {
383     shared = (mrb_shared_string *)mrb_malloc(mrb, sizeof(mrb_shared_string));
384     shared->refcnt = 2;
385     shared->nofree = !!RSTR_NOFREE_P(orig);
386     if (!shared->nofree && orig->as.heap.aux.capa > orig->as.heap.len) {
387       shared->ptr = (char *)mrb_realloc(mrb, orig->as.heap.ptr, len+1);
388       orig->as.heap.ptr = shared->ptr;
389     }
390     else {
391       shared->ptr = orig->as.heap.ptr;
392     }
393     orig->as.heap.aux.shared = shared;
394     RSTR_SET_SHARED_FLAG(orig);
395     shared->len = len;
396     s->as.heap.aux.shared = shared;
397     s->as.heap.ptr = shared->ptr;
398     s->as.heap.len = len;
399     RSTR_SET_SHARED_FLAG(s);
400     RSTR_UNSET_EMBED_FLAG(s);
401   }
402 }
403 
404 static mrb_value
byte_subseq(mrb_state * mrb,mrb_value str,mrb_int beg,mrb_int len)405 byte_subseq(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len)
406 {
407   struct RString *orig, *s;
408 
409   orig = mrb_str_ptr(str);
410   if (RSTR_EMBED_P(orig) || RSTR_LEN(orig) == 0 || len <= RSTRING_EMBED_LEN_MAX) {
411     s = str_new(mrb, RSTR_PTR(orig)+beg, len);
412   }
413   else {
414     s = mrb_obj_alloc_string(mrb);
415     str_make_shared(mrb, orig, s);
416     s->as.heap.ptr += beg;
417     s->as.heap.len = len;
418   }
419   return mrb_obj_value(s);
420 }
#ifdef MRB_UTF8_STRING
/* Character-indexed variant of byte_subseq(): converts `beg` and `len` to bytes first. */
static inline mrb_value
str_subseq(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len)
{
  const mrb_int byte_beg = chars2bytes(str, 0, beg);
  const mrb_int byte_len = chars2bytes(str, byte_beg, len);

  return byte_subseq(mrb, str, byte_beg, byte_len);
}
#else
/* Characters are bytes in this configuration. */
#define str_subseq(mrb, str, beg, len) byte_subseq(mrb, str, beg, len)
#endif
433 
434 static mrb_value
str_substr(mrb_state * mrb,mrb_value str,mrb_int beg,mrb_int len)435 str_substr(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len)
436 {
437   mrb_int clen = RSTRING_CHAR_LEN(str);
438 
439   if (len < 0) return mrb_nil_value();
440   if (clen == 0) {
441     len = 0;
442   }
443   else if (beg < 0) {
444     beg = clen + beg;
445   }
446   if (beg > clen) return mrb_nil_value();
447   if (beg < 0) {
448     beg += clen;
449     if (beg < 0) return mrb_nil_value();
450   }
451   if (len > clen - beg)
452     len = clen - beg;
453   if (len <= 0) {
454     len = 0;
455   }
456   return str_subseq(mrb, str, beg, len);
457 }
458 
459 MRB_API mrb_int
mrb_str_index(mrb_state * mrb,mrb_value str,const char * sptr,mrb_int slen,mrb_int offset)460 mrb_str_index(mrb_state *mrb, mrb_value str, const char *sptr, mrb_int slen, mrb_int offset)
461 {
462   mrb_int pos;
463   char *s;
464   mrb_int len;
465 
466   len = RSTRING_LEN(str);
467   if (offset < 0) {
468     offset += len;
469     if (offset < 0) return -1;
470   }
471   if (len - offset < slen) return -1;
472   s = RSTRING_PTR(str);
473   if (offset) {
474     s += offset;
475   }
476   if (slen == 0) return offset;
477   /* need proceed one character at a time */
478   len = RSTRING_LEN(str) - offset;
479   pos = mrb_memsearch(sptr, slen, s, len);
480   if (pos < 0) return pos;
481   return pos + offset;
482 }
483 
484 static mrb_int
str_index_str(mrb_state * mrb,mrb_value str,mrb_value str2,mrb_int offset)485 str_index_str(mrb_state *mrb, mrb_value str, mrb_value str2, mrb_int offset)
486 {
487   const char *ptr;
488   mrb_int len;
489 
490   ptr = RSTRING_PTR(str2);
491   len = RSTRING_LEN(str2);
492 
493   return mrb_str_index(mrb, str, ptr, len, offset);
494 }
495 
496 static void
check_frozen(mrb_state * mrb,struct RString * s)497 check_frozen(mrb_state *mrb, struct RString *s)
498 {
499   if (MRB_FROZEN_P(s)) {
500     mrb_raise(mrb, E_FROZEN_ERROR, "can't modify frozen string");
501   }
502 }
503 
504 static mrb_value
str_replace(mrb_state * mrb,struct RString * s1,struct RString * s2)505 str_replace(mrb_state *mrb, struct RString *s1, struct RString *s2)
506 {
507   mrb_int len;
508 
509   check_frozen(mrb, s1);
510   if (s1 == s2) return mrb_obj_value(s1);
511   s1->flags &= ~MRB_STR_NO_UTF;
512   s1->flags |= s2->flags&MRB_STR_NO_UTF;
513   len = RSTR_LEN(s2);
514   if (RSTR_SHARED_P(s1)) {
515     str_decref(mrb, s1->as.heap.aux.shared);
516     RSTR_UNSET_SHARED_FLAG(s1);
517   }
518   else if (!RSTR_EMBED_P(s1) && !RSTR_NOFREE_P(s1) && !RSTR_FSHARED_P(s1)
519            && s1->as.heap.ptr) {
520     mrb_free(mrb, s1->as.heap.ptr);
521   }
522 
523   RSTR_UNSET_FSHARED_FLAG(s1);
524   RSTR_UNSET_NOFREE_FLAG(s1);
525   if (len <= RSTRING_EMBED_LEN_MAX) {
526     RSTR_UNSET_SHARED_FLAG(s1);
527     RSTR_UNSET_FSHARED_FLAG(s1);
528     RSTR_SET_EMBED_FLAG(s1);
529     memcpy(s1->as.ary, RSTR_PTR(s2), len);
530     RSTR_SET_EMBED_LEN(s1, len);
531   }
532   else {
533     str_make_shared(mrb, s2, s1);
534   }
535 
536   return mrb_obj_value(s1);
537 }
538 
539 static mrb_int
str_rindex(mrb_state * mrb,mrb_value str,mrb_value sub,mrb_int pos)540 str_rindex(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int pos)
541 {
542   char *s, *sbeg, *t;
543   struct RString *ps = mrb_str_ptr(str);
544   mrb_int len = RSTRING_LEN(sub);
545 
546   /* substring longer than string */
547   if (RSTR_LEN(ps) < len) return -1;
548   if (RSTR_LEN(ps) - pos < len) {
549     pos = RSTR_LEN(ps) - len;
550   }
551   sbeg = RSTR_PTR(ps);
552   s = RSTR_PTR(ps) + pos;
553   t = RSTRING_PTR(sub);
554   if (len) {
555     while (sbeg <= s) {
556       if (memcmp(s, t, len) == 0) {
557         return (mrb_int)(s - RSTR_PTR(ps));
558       }
559       s--;
560     }
561     return -1;
562   }
563   else {
564     return pos;
565   }
566 }
567 
568 MRB_API mrb_int
mrb_str_strlen(mrb_state * mrb,struct RString * s)569 mrb_str_strlen(mrb_state *mrb, struct RString *s)
570 {
571   mrb_int i, max = RSTR_LEN(s);
572   char *p = RSTR_PTR(s);
573 
574   if (!p) return 0;
575   for (i=0; i<max; i++) {
576     if (p[i] == '\0') {
577       mrb_raise(mrb, E_ARGUMENT_ERROR, "string contains null byte");
578     }
579   }
580   return max;
581 }
582 
583 #ifdef _WIN32
584 #include <windows.h>
585 
586 char*
mrb_utf8_from_locale(const char * str,int len)587 mrb_utf8_from_locale(const char *str, int len)
588 {
589   wchar_t* wcsp;
590   char* mbsp;
591   int mbssize, wcssize;
592 
593   if (len == 0)
594     return strdup("");
595   if (len == -1)
596     len = (int)strlen(str);
597   wcssize = MultiByteToWideChar(GetACP(), 0, str, len,  NULL, 0);
598   wcsp = (wchar_t*) malloc((wcssize + 1) * sizeof(wchar_t));
599   if (!wcsp)
600     return NULL;
601   wcssize = MultiByteToWideChar(GetACP(), 0, str, len, wcsp, wcssize + 1);
602   wcsp[wcssize] = 0;
603 
604   mbssize = WideCharToMultiByte(CP_UTF8, 0, (LPCWSTR) wcsp, -1, NULL, 0, NULL, NULL);
605   mbsp = (char*) malloc((mbssize + 1));
606   if (!mbsp) {
607     free(wcsp);
608     return NULL;
609   }
610   mbssize = WideCharToMultiByte(CP_UTF8, 0, (LPCWSTR) wcsp, -1, mbsp, mbssize, NULL, NULL);
611   mbsp[mbssize] = 0;
612   free(wcsp);
613   return mbsp;
614 }
615 
616 char*
mrb_locale_from_utf8(const char * utf8,int len)617 mrb_locale_from_utf8(const char *utf8, int len)
618 {
619   wchar_t* wcsp;
620   char* mbsp;
621   int mbssize, wcssize;
622 
623   if (len == 0)
624     return strdup("");
625   if (len == -1)
626     len = (int)strlen(utf8);
627   wcssize = MultiByteToWideChar(CP_UTF8, 0, utf8, len,  NULL, 0);
628   wcsp = (wchar_t*) malloc((wcssize + 1) * sizeof(wchar_t));
629   if (!wcsp)
630     return NULL;
631   wcssize = MultiByteToWideChar(CP_UTF8, 0, utf8, len, wcsp, wcssize + 1);
632   wcsp[wcssize] = 0;
633   mbssize = WideCharToMultiByte(GetACP(), 0, (LPCWSTR) wcsp, -1, NULL, 0, NULL, NULL);
634   mbsp = (char*) malloc((mbssize + 1));
635   if (!mbsp) {
636     free(wcsp);
637     return NULL;
638   }
639   mbssize = WideCharToMultiByte(GetACP(), 0, (LPCWSTR) wcsp, -1, mbsp, mbssize, NULL, NULL);
640   mbsp[mbssize] = 0;
641   free(wcsp);
642   return mbsp;
643 }
644 #endif
645 
646 MRB_API void
mrb_str_modify(mrb_state * mrb,struct RString * s)647 mrb_str_modify(mrb_state *mrb, struct RString *s)
648 {
649   check_frozen(mrb, s);
650   s->flags &= ~MRB_STR_NO_UTF;
651   if (RSTR_SHARED_P(s)) {
652     mrb_shared_string *shared = s->as.heap.aux.shared;
653 
654     if (shared->nofree == 0 && shared->refcnt == 1 && s->as.heap.ptr == shared->ptr) {
655       s->as.heap.ptr = shared->ptr;
656       s->as.heap.aux.capa = shared->len;
657       RSTR_PTR(s)[s->as.heap.len] = '\0';
658       mrb_free(mrb, shared);
659     }
660     else {
661       char *ptr, *p;
662       mrb_int len;
663 
664       p = RSTR_PTR(s);
665       len = s->as.heap.len;
666       if (len < RSTRING_EMBED_LEN_MAX) {
667         RSTR_SET_EMBED_FLAG(s);
668         RSTR_SET_EMBED_LEN(s, len);
669         ptr = RSTR_PTR(s);
670       }
671       else {
672         ptr = (char *)mrb_malloc(mrb, (size_t)len + 1);
673         s->as.heap.ptr = ptr;
674         s->as.heap.aux.capa = len;
675       }
676       if (p) {
677         memcpy(ptr, p, len);
678       }
679       ptr[len] = '\0';
680       str_decref(mrb, shared);
681     }
682     RSTR_UNSET_SHARED_FLAG(s);
683     return;
684   }
685   if (RSTR_NOFREE_P(s) || RSTR_FSHARED_P(s)) {
686     char *p = s->as.heap.ptr;
687     mrb_int len = s->as.heap.len;
688 
689     RSTR_UNSET_FSHARED_FLAG(s);
690     RSTR_UNSET_NOFREE_FLAG(s);
691     RSTR_UNSET_FSHARED_FLAG(s);
692     if (len < RSTRING_EMBED_LEN_MAX) {
693       RSTR_SET_EMBED_FLAG(s);
694       RSTR_SET_EMBED_LEN(s, len);
695     }
696     else {
697       s->as.heap.ptr = (char *)mrb_malloc(mrb, (size_t)len+1);
698       s->as.heap.aux.capa = len;
699     }
700     if (p) {
701       memcpy(RSTR_PTR(s), p, len);
702     }
703     RSTR_PTR(s)[len] = '\0';
704     return;
705   }
706 }
707 
708 MRB_API mrb_value
mrb_str_resize(mrb_state * mrb,mrb_value str,mrb_int len)709 mrb_str_resize(mrb_state *mrb, mrb_value str, mrb_int len)
710 {
711   mrb_int slen;
712   struct RString *s = mrb_str_ptr(str);
713 
714   if (len < 0) {
715     mrb_raise(mrb, E_ARGUMENT_ERROR, "negative (or overflowed) string size");
716   }
717   mrb_str_modify(mrb, s);
718   slen = RSTR_LEN(s);
719   if (len != slen) {
720     if (slen < len || slen - len > 256) {
721       resize_capa(mrb, s, len);
722     }
723     RSTR_SET_LEN(s, len);
724     RSTR_PTR(s)[len] = '\0';   /* sentinel */
725   }
726   return str;
727 }
728 
729 MRB_API char*
mrb_str_to_cstr(mrb_state * mrb,mrb_value str0)730 mrb_str_to_cstr(mrb_state *mrb, mrb_value str0)
731 {
732   struct RString *s;
733 
734   if (!mrb_string_p(str0)) {
735     mrb_raise(mrb, E_TYPE_ERROR, "expected String");
736   }
737 
738   s = str_new(mrb, RSTRING_PTR(str0), RSTRING_LEN(str0));
739   if ((strlen(RSTR_PTR(s)) ^ RSTR_LEN(s)) != 0) {
740     mrb_raise(mrb, E_ARGUMENT_ERROR, "string contains null byte");
741   }
742   return RSTR_PTR(s);
743 }
744 
745 MRB_API void
mrb_str_concat(mrb_state * mrb,mrb_value self,mrb_value other)746 mrb_str_concat(mrb_state *mrb, mrb_value self, mrb_value other)
747 {
748   other = mrb_str_to_str(mrb, other);
749   mrb_str_cat_str(mrb, self, other);
750 }
751 
752 MRB_API mrb_value
mrb_str_plus(mrb_state * mrb,mrb_value a,mrb_value b)753 mrb_str_plus(mrb_state *mrb, mrb_value a, mrb_value b)
754 {
755   struct RString *s = mrb_str_ptr(a);
756   struct RString *s2 = mrb_str_ptr(b);
757   struct RString *t;
758 
759   t = str_new(mrb, 0, RSTR_LEN(s) + RSTR_LEN(s2));
760   memcpy(RSTR_PTR(t), RSTR_PTR(s), RSTR_LEN(s));
761   memcpy(RSTR_PTR(t) + RSTR_LEN(s), RSTR_PTR(s2), RSTR_LEN(s2));
762 
763   return mrb_obj_value(t);
764 }
765 
766 /* 15.2.10.5.2  */
767 
768 /*
769  *  call-seq:
770  *     str + other_str   -> new_str
771  *
772  *  Concatenation---Returns a new <code>String</code> containing
773  *  <i>other_str</i> concatenated to <i>str</i>.
774  *
775  *     "Hello from " + self.to_s   #=> "Hello from main"
776  */
777 static mrb_value
mrb_str_plus_m(mrb_state * mrb,mrb_value self)778 mrb_str_plus_m(mrb_state *mrb, mrb_value self)
779 {
780   mrb_value str;
781 
782   mrb_get_args(mrb, "S", &str);
783   return mrb_str_plus(mrb, self, str);
784 }
785 
786 /* 15.2.10.5.26 */
787 /* 15.2.10.5.33 */
788 /*
789  *  call-seq:
790  *     "abcd".size   => int
791  *
792  *  Returns the length of string.
793  */
794 static mrb_value
mrb_str_size(mrb_state * mrb,mrb_value self)795 mrb_str_size(mrb_state *mrb, mrb_value self)
796 {
797   mrb_int len = RSTRING_CHAR_LEN(self);
798   return mrb_fixnum_value(len);
799 }
800 
801 static mrb_value
mrb_str_bytesize(mrb_state * mrb,mrb_value self)802 mrb_str_bytesize(mrb_state *mrb, mrb_value self)
803 {
804   mrb_int len = RSTRING_LEN(self);
805   return mrb_fixnum_value(len);
806 }
807 
808 /* 15.2.10.5.1  */
809 /*
810  *  call-seq:
811  *     str * integer   => new_str
812  *
813  *  Copy---Returns a new <code>String</code> containing <i>integer</i> copies of
814  *  the receiver.
815  *
816  *     "Ho! " * 3   #=> "Ho! Ho! Ho! "
817  */
818 static mrb_value
mrb_str_times(mrb_state * mrb,mrb_value self)819 mrb_str_times(mrb_state *mrb, mrb_value self)
820 {
821   mrb_int n,len,times;
822   struct RString *str2;
823   char *p;
824 
825   mrb_get_args(mrb, "i", &times);
826   if (times < 0) {
827     mrb_raise(mrb, E_ARGUMENT_ERROR, "negative argument");
828   }
829   if (times && MRB_INT_MAX / times < RSTRING_LEN(self)) {
830     mrb_raise(mrb, E_ARGUMENT_ERROR, "argument too big");
831   }
832 
833   len = RSTRING_LEN(self)*times;
834   str2 = str_new(mrb, 0, len);
835   str_with_class(mrb, str2, self);
836   p = RSTR_PTR(str2);
837   if (len > 0) {
838     n = RSTRING_LEN(self);
839     memcpy(p, RSTRING_PTR(self), n);
840     while (n <= len/2) {
841       memcpy(p + n, p, n);
842       n *= 2;
843     }
844     memcpy(p + n, p, len-n);
845   }
846   p[RSTR_LEN(str2)] = '\0';
847 
848   return mrb_obj_value(str2);
849 }
850 /* -------------------------------------------------------------- */
851 
852 #define lesser(a,b) (((a)>(b))?(b):(a))
853 
854 /* ---------------------------*/
855 /*
856  *  call-seq:
857  *     mrb_value str1 <=> mrb_value str2   => int
858  *                     >  1
859  *                     =  0
860  *                     <  -1
861  */
862 MRB_API int
mrb_str_cmp(mrb_state * mrb,mrb_value str1,mrb_value str2)863 mrb_str_cmp(mrb_state *mrb, mrb_value str1, mrb_value str2)
864 {
865   mrb_int len;
866   mrb_int retval;
867   struct RString *s1 = mrb_str_ptr(str1);
868   struct RString *s2 = mrb_str_ptr(str2);
869 
870   len = lesser(RSTR_LEN(s1), RSTR_LEN(s2));
871   retval = memcmp(RSTR_PTR(s1), RSTR_PTR(s2), len);
872   if (retval == 0) {
873     if (RSTR_LEN(s1) == RSTR_LEN(s2)) return 0;
874     if (RSTR_LEN(s1) > RSTR_LEN(s2))  return 1;
875     return -1;
876   }
877   if (retval > 0) return 1;
878   return -1;
879 }
880 
881 /* 15.2.10.5.3  */
882 
883 /*
884  *  call-seq:
885  *     str <=> other_str   => -1, 0, +1
886  *
 *  Comparison---Returns -1 if <i>str</i> is less than, 0 if
 *  <i>str</i> is equal to, and +1 if <i>str</i> is greater than
 *  <i>other_str</i>. If the strings are of different lengths, and the strings are
890  *  equal when compared up to the shortest length, then the longer string is
891  *  considered greater than the shorter one. If the variable <code>$=</code> is
892  *  <code>false</code>, the comparison is based on comparing the binary values
893  *  of each character in the string. In older versions of Ruby, setting
894  *  <code>$=</code> allowed case-insensitive comparisons; this is now deprecated
895  *  in favor of using <code>String#casecmp</code>.
896  *
897  *  <code><=></code> is the basis for the methods <code><</code>,
898  *  <code><=</code>, <code>></code>, <code>>=</code>, and <code>between?</code>,
899  *  included from module <code>Comparable</code>.  The method
900  *  <code>String#==</code> does not use <code>Comparable#==</code>.
901  *
902  *     "abcdef" <=> "abcde"     #=> 1
903  *     "abcdef" <=> "abcdef"    #=> 0
904  *     "abcdef" <=> "abcdefg"   #=> -1
905  *     "abcdef" <=> "ABCDEF"    #=> 1
906  */
907 static mrb_value
mrb_str_cmp_m(mrb_state * mrb,mrb_value str1)908 mrb_str_cmp_m(mrb_state *mrb, mrb_value str1)
909 {
910   mrb_value str2;
911   mrb_int result;
912 
913   mrb_get_args(mrb, "o", &str2);
914   if (!mrb_string_p(str2)) {
915     if (!mrb_respond_to(mrb, str2, mrb_intern_lit(mrb, "to_s"))) {
916       return mrb_nil_value();
917     }
918     else if (!mrb_respond_to(mrb, str2, mrb_intern_lit(mrb, "<=>"))) {
919       return mrb_nil_value();
920     }
921     else {
922       mrb_value tmp = mrb_funcall(mrb, str2, "<=>", 1, str1);
923 
924       if (mrb_nil_p(tmp)) return mrb_nil_value();
925       if (!mrb_fixnum_p(tmp)) {
926         return mrb_funcall(mrb, mrb_fixnum_value(0), "-", 1, tmp);
927       }
928       result = -mrb_fixnum(tmp);
929     }
930   }
931   else {
932     result = mrb_str_cmp(mrb, str1, str2);
933   }
934   return mrb_fixnum_value(result);
935 }
936 
937 static mrb_bool
str_eql(mrb_state * mrb,const mrb_value str1,const mrb_value str2)938 str_eql(mrb_state *mrb, const mrb_value str1, const mrb_value str2)
939 {
940   const mrb_int len = RSTRING_LEN(str1);
941 
942   if (len != RSTRING_LEN(str2)) return FALSE;
943   if (memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), (size_t)len) == 0)
944     return TRUE;
945   return FALSE;
946 }
947 
948 MRB_API mrb_bool
mrb_str_equal(mrb_state * mrb,mrb_value str1,mrb_value str2)949 mrb_str_equal(mrb_state *mrb, mrb_value str1, mrb_value str2)
950 {
951   if (!mrb_string_p(str2)) return FALSE;
952   return str_eql(mrb, str1, str2);
953 }
954 
955 /* 15.2.10.5.4  */
956 /*
957  *  call-seq:
958  *     str == obj   => true or false
959  *
960  *  Equality---
961  *  If <i>obj</i> is not a <code>String</code>, returns <code>false</code>.
962  *  Otherwise, returns <code>false</code> or <code>true</code>
963  *
964  *   caution:if <i>str</i> <code><=></code> <i>obj</i> returns zero.
965  */
966 static mrb_value
mrb_str_equal_m(mrb_state * mrb,mrb_value str1)967 mrb_str_equal_m(mrb_state *mrb, mrb_value str1)
968 {
969   mrb_value str2;
970 
971   mrb_get_args(mrb, "o", &str2);
972 
973   return mrb_bool_value(mrb_str_equal(mrb, str1, str2));
974 }
975 /* ---------------------------------- */
976 mrb_value mrb_mod_to_s(mrb_state *mrb, mrb_value klass);
977 
978 MRB_API mrb_value
mrb_str_to_str(mrb_state * mrb,mrb_value str)979 mrb_str_to_str(mrb_state *mrb, mrb_value str)
980 {
981   switch (mrb_type(str)) {
982   case MRB_TT_STRING:
983     return str;
984   case MRB_TT_FIXNUM:
985     return mrb_fixnum_to_str(mrb, str, 10);
986   case MRB_TT_CLASS:
987   case MRB_TT_MODULE:
988     return mrb_mod_to_s(mrb, str);
989   default:
990     return mrb_convert_type(mrb, str, MRB_TT_STRING, "String", "to_s");
991   }
992 }
993 
994 MRB_API const char*
mrb_string_value_ptr(mrb_state * mrb,mrb_value str)995 mrb_string_value_ptr(mrb_state *mrb, mrb_value str)
996 {
997   str = mrb_str_to_str(mrb, str);
998   return RSTRING_PTR(str);
999 }
1000 
1001 MRB_API mrb_int
mrb_string_value_len(mrb_state * mrb,mrb_value ptr)1002 mrb_string_value_len(mrb_state *mrb, mrb_value ptr)
1003 {
1004   mrb_to_str(mrb, ptr);
1005   return RSTRING_LEN(ptr);
1006 }
1007 
1008 void
mrb_noregexp(mrb_state * mrb,mrb_value self)1009 mrb_noregexp(mrb_state *mrb, mrb_value self)
1010 {
1011   mrb_raise(mrb, E_NOTIMP_ERROR, "Regexp class not implemented");
1012 }
1013 
1014 void
mrb_regexp_check(mrb_state * mrb,mrb_value obj)1015 mrb_regexp_check(mrb_state *mrb, mrb_value obj)
1016 {
1017   if (mrb_regexp_p(mrb, obj)) {
1018     mrb_noregexp(mrb, obj);
1019   }
1020 }
1021 
1022 MRB_API mrb_value
mrb_str_dup(mrb_state * mrb,mrb_value str)1023 mrb_str_dup(mrb_state *mrb, mrb_value str)
1024 {
1025   struct RString *s = mrb_str_ptr(str);
1026   struct RString *dup = str_new(mrb, 0, 0);
1027 
1028   str_with_class(mrb, dup, str);
1029   return str_replace(mrb, dup, s);
1030 }
1031 
1032 static mrb_value
mrb_str_aref(mrb_state * mrb,mrb_value str,mrb_value indx)1033 mrb_str_aref(mrb_state *mrb, mrb_value str, mrb_value indx)
1034 {
1035   mrb_int idx;
1036 
1037   mrb_regexp_check(mrb, indx);
1038   switch (mrb_type(indx)) {
1039     case MRB_TT_FIXNUM:
1040       idx = mrb_fixnum(indx);
1041 
1042 num_index:
1043       str = str_substr(mrb, str, idx, 1);
1044       if (!mrb_nil_p(str) && RSTRING_LEN(str) == 0) return mrb_nil_value();
1045       return str;
1046 
1047     case MRB_TT_STRING:
1048       if (str_index_str(mrb, str, indx, 0) != -1)
1049         return mrb_str_dup(mrb, indx);
1050       return mrb_nil_value();
1051 
1052     case MRB_TT_RANGE:
1053       goto range_arg;
1054 
1055     default:
1056       indx = mrb_Integer(mrb, indx);
1057       if (mrb_nil_p(indx)) {
1058       range_arg:
1059         {
1060           mrb_int beg, len;
1061 
1062           len = RSTRING_CHAR_LEN(str);
1063           switch (mrb_range_beg_len(mrb, indx, &beg, &len, len, TRUE)) {
1064           case 1:
1065             return str_subseq(mrb, str, beg, len);
1066           case 2:
1067             return mrb_nil_value();
1068           default:
1069             break;
1070           }
1071         }
1072         mrb_raise(mrb, E_TYPE_ERROR, "can't convert to Fixnum");
1073       }
1074       idx = mrb_fixnum(indx);
1075       goto num_index;
1076   }
1077   return mrb_nil_value();    /* not reached */
1078 }
1079 
1080 /* 15.2.10.5.6  */
1081 /* 15.2.10.5.34 */
1082 /*
1083  *  call-seq:
 *     str[fixnum]                 => new_str or nil
1085  *     str[fixnum, fixnum]         => new_str or nil
1086  *     str[range]                  => new_str or nil
1087  *     str[regexp]                 => new_str or nil
1088  *     str[regexp, fixnum]         => new_str or nil
1089  *     str[other_str]              => new_str or nil
 *     str.slice(fixnum)           => new_str or nil
1091  *     str.slice(fixnum, fixnum)   => new_str or nil
1092  *     str.slice(range)            => new_str or nil
1093  *     str.slice(other_str)        => new_str or nil
1094  *
 *  Element Reference---If passed a single <code>Fixnum</code>, returns the
 *  character at that position as a new string. If passed two <code>Fixnum</code>
1097  *  objects, returns a substring starting at the offset given by the first, and
1098  *  a length given by the second. If given a range, a substring containing
1099  *  characters at offsets given by the range is returned. In all three cases, if
1100  *  an offset is negative, it is counted from the end of <i>str</i>. Returns
1101  *  <code>nil</code> if the initial offset falls outside the string, the length
1102  *  is negative, or the beginning of the range is greater than the end.
1103  *
1104  *  If a <code>String</code> is given, that string is returned if it occurs in
1105  *  <i>str</i>. In both cases, <code>nil</code> is returned if there is no
1106  *  match.
1107  *
1108  *     a = "hello there"
1109  *     a[1]                   #=> 101(1.8.7) "e"(1.9.2)
1110  *     a[1.1]                 #=>            "e"(1.9.2)
1111  *     a[1,3]                 #=> "ell"
1112  *     a[1..3]                #=> "ell"
1113  *     a[-3,2]                #=> "er"
1114  *     a[-4..-2]              #=> "her"
1115  *     a[12..-1]              #=> nil
1116  *     a[-2..-4]              #=> ""
1117  *     a["lo"]                #=> "lo"
1118  *     a["bye"]               #=> nil
1119  */
1120 static mrb_value
mrb_str_aref_m(mrb_state * mrb,mrb_value str)1121 mrb_str_aref_m(mrb_state *mrb, mrb_value str)
1122 {
1123   mrb_value a1, a2;
1124   mrb_int argc;
1125 
1126   argc = mrb_get_args(mrb, "o|o", &a1, &a2);
1127   if (argc == 2) {
1128     mrb_int n1, n2;
1129 
1130     mrb_regexp_check(mrb, a1);
1131     mrb_get_args(mrb, "ii", &n1, &n2);
1132     return str_substr(mrb, str, n1, n2);
1133   }
1134   if (argc != 1) {
1135     mrb_raisef(mrb, E_ARGUMENT_ERROR, "wrong number of arguments (%S for 1)", mrb_fixnum_value(argc));
1136   }
1137   return mrb_str_aref(mrb, str, a1);
1138 }
1139 
1140 /* 15.2.10.5.8  */
1141 /*
1142  *  call-seq:
1143  *     str.capitalize!   => str or nil
1144  *
1145  *  Modifies <i>str</i> by converting the first character to uppercase and the
1146  *  remainder to lowercase. Returns <code>nil</code> if no changes are made.
1147  *
1148  *     a = "hello"
1149  *     a.capitalize!   #=> "Hello"
1150  *     a               #=> "Hello"
1151  *     a.capitalize!   #=> nil
1152  */
1153 static mrb_value
mrb_str_capitalize_bang(mrb_state * mrb,mrb_value str)1154 mrb_str_capitalize_bang(mrb_state *mrb, mrb_value str)
1155 {
1156   char *p, *pend;
1157   mrb_bool modify = FALSE;
1158   struct RString *s = mrb_str_ptr(str);
1159 
1160   mrb_str_modify(mrb, s);
1161   if (RSTR_LEN(s) == 0 || !RSTR_PTR(s)) return mrb_nil_value();
1162   p = RSTR_PTR(s); pend = RSTR_PTR(s) + RSTR_LEN(s);
1163   if (ISLOWER(*p)) {
1164     *p = TOUPPER(*p);
1165     modify = TRUE;
1166   }
1167   while (++p < pend) {
1168     if (ISUPPER(*p)) {
1169       *p = TOLOWER(*p);
1170       modify = TRUE;
1171     }
1172   }
1173   if (modify) return str;
1174   return mrb_nil_value();
1175 }
1176 
1177 /* 15.2.10.5.7  */
1178 /*
1179  *  call-seq:
1180  *     str.capitalize   => new_str
1181  *
1182  *  Returns a copy of <i>str</i> with the first character converted to uppercase
1183  *  and the remainder to lowercase.
1184  *
1185  *     "hello".capitalize    #=> "Hello"
1186  *     "HELLO".capitalize    #=> "Hello"
1187  *     "123ABC".capitalize   #=> "123abc"
1188  */
1189 static mrb_value
mrb_str_capitalize(mrb_state * mrb,mrb_value self)1190 mrb_str_capitalize(mrb_state *mrb, mrb_value self)
1191 {
1192   mrb_value str;
1193 
1194   str = mrb_str_dup(mrb, self);
1195   mrb_str_capitalize_bang(mrb, str);
1196   return str;
1197 }
1198 
1199 /* 15.2.10.5.10  */
1200 /*
1201  *  call-seq:
1202  *     str.chomp!(separator="\n")   => str or nil
1203  *
1204  *  Modifies <i>str</i> in place as described for <code>String#chomp</code>,
1205  *  returning <i>str</i>, or <code>nil</code> if no modifications were made.
1206  */
1207 static mrb_value
mrb_str_chomp_bang(mrb_state * mrb,mrb_value str)1208 mrb_str_chomp_bang(mrb_state *mrb, mrb_value str)
1209 {
1210   mrb_value rs;
1211   mrb_int newline;
1212   char *p, *pp;
1213   mrb_int rslen;
1214   mrb_int len;
1215   mrb_int argc;
1216   struct RString *s = mrb_str_ptr(str);
1217 
1218   argc = mrb_get_args(mrb, "|S", &rs);
1219   mrb_str_modify(mrb, s);
1220   len = RSTR_LEN(s);
1221   if (argc == 0) {
1222     if (len == 0) return mrb_nil_value();
1223   smart_chomp:
1224     if (RSTR_PTR(s)[len-1] == '\n') {
1225       RSTR_SET_LEN(s, RSTR_LEN(s) - 1);
1226       if (RSTR_LEN(s) > 0 &&
1227           RSTR_PTR(s)[RSTR_LEN(s)-1] == '\r') {
1228         RSTR_SET_LEN(s, RSTR_LEN(s) - 1);
1229       }
1230     }
1231     else if (RSTR_PTR(s)[len-1] == '\r') {
1232       RSTR_SET_LEN(s, RSTR_LEN(s) - 1);
1233     }
1234     else {
1235       return mrb_nil_value();
1236     }
1237     RSTR_PTR(s)[RSTR_LEN(s)] = '\0';
1238     return str;
1239   }
1240 
1241   if (len == 0 || mrb_nil_p(rs)) return mrb_nil_value();
1242   p = RSTR_PTR(s);
1243   rslen = RSTRING_LEN(rs);
1244   if (rslen == 0) {
1245     while (len>0 && p[len-1] == '\n') {
1246       len--;
1247       if (len>0 && p[len-1] == '\r')
1248         len--;
1249     }
1250     if (len < RSTR_LEN(s)) {
1251       RSTR_SET_LEN(s, len);
1252       p[len] = '\0';
1253       return str;
1254     }
1255     return mrb_nil_value();
1256   }
1257   if (rslen > len) return mrb_nil_value();
1258   newline = RSTRING_PTR(rs)[rslen-1];
1259   if (rslen == 1 && newline == '\n')
1260     newline = RSTRING_PTR(rs)[rslen-1];
1261   if (rslen == 1 && newline == '\n')
1262     goto smart_chomp;
1263 
1264   pp = p + len - rslen;
1265   if (p[len-1] == newline &&
1266      (rslen <= 1 ||
1267      memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
1268     RSTR_SET_LEN(s, len - rslen);
1269     p[RSTR_LEN(s)] = '\0';
1270     return str;
1271   }
1272   return mrb_nil_value();
1273 }
1274 
1275 /* 15.2.10.5.9  */
1276 /*
1277  *  call-seq:
1278  *     str.chomp(separator="\n")   => new_str
1279  *
1280  *  Returns a new <code>String</code> with the given record separator removed
1281  *  from the end of <i>str</i> (if present). If <code>$/</code> has not been
1282  *  changed from the default Ruby record separator, then <code>chomp</code> also
1283  *  removes carriage return characters (that is it will remove <code>\n</code>,
1284  *  <code>\r</code>, and <code>\r\n</code>).
1285  *
1286  *     "hello".chomp            #=> "hello"
1287  *     "hello\n".chomp          #=> "hello"
1288  *     "hello\r\n".chomp        #=> "hello"
1289  *     "hello\n\r".chomp        #=> "hello\n"
1290  *     "hello\r".chomp          #=> "hello"
1291  *     "hello \n there".chomp   #=> "hello \n there"
1292  *     "hello".chomp("llo")     #=> "he"
1293  */
1294 static mrb_value
mrb_str_chomp(mrb_state * mrb,mrb_value self)1295 mrb_str_chomp(mrb_state *mrb, mrb_value self)
1296 {
1297   mrb_value str;
1298 
1299   str = mrb_str_dup(mrb, self);
1300   mrb_str_chomp_bang(mrb, str);
1301   return str;
1302 }
1303 
1304 /* 15.2.10.5.12 */
1305 /*
1306  *  call-seq:
1307  *     str.chop!   => str or nil
1308  *
1309  *  Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>,
1310  *  or <code>nil</code> if <i>str</i> is the empty string.  See also
1311  *  <code>String#chomp!</code>.
1312  */
1313 static mrb_value
mrb_str_chop_bang(mrb_state * mrb,mrb_value str)1314 mrb_str_chop_bang(mrb_state *mrb, mrb_value str)
1315 {
1316   struct RString *s = mrb_str_ptr(str);
1317 
1318   mrb_str_modify(mrb, s);
1319   if (RSTR_LEN(s) > 0) {
1320     mrb_int len;
1321 #ifdef MRB_UTF8_STRING
1322     const char* t = RSTR_PTR(s), *p = t;
1323     const char* e = p + RSTR_LEN(s);
1324     while (p<e) {
1325       mrb_int clen = utf8len(p, e);
1326       if (p + clen>=e) break;
1327       p += clen;
1328     }
1329     len = p - t;
1330 #else
1331     len = RSTR_LEN(s) - 1;
1332 #endif
1333     if (RSTR_PTR(s)[len] == '\n') {
1334       if (len > 0 &&
1335           RSTR_PTR(s)[len-1] == '\r') {
1336         len--;
1337       }
1338     }
1339     RSTR_SET_LEN(s, len);
1340     RSTR_PTR(s)[len] = '\0';
1341     return str;
1342   }
1343   return mrb_nil_value();
1344 }
1345 
1346 /* 15.2.10.5.11 */
1347 /*
1348  *  call-seq:
1349  *     str.chop   => new_str
1350  *
1351  *  Returns a new <code>String</code> with the last character removed.  If the
1352  *  string ends with <code>\r\n</code>, both characters are removed. Applying
1353  *  <code>chop</code> to an empty string returns an empty
1354  *  string. <code>String#chomp</code> is often a safer alternative, as it leaves
1355  *  the string unchanged if it doesn't end in a record separator.
1356  *
1357  *     "string\r\n".chop   #=> "string"
1358  *     "string\n\r".chop   #=> "string\n"
1359  *     "string\n".chop     #=> "string"
1360  *     "string".chop       #=> "strin"
1361  *     "x".chop            #=> ""
1362  */
1363 static mrb_value
mrb_str_chop(mrb_state * mrb,mrb_value self)1364 mrb_str_chop(mrb_state *mrb, mrb_value self)
1365 {
1366   mrb_value str;
1367   str = mrb_str_dup(mrb, self);
1368   mrb_str_chop_bang(mrb, str);
1369   return str;
1370 }
1371 
1372 /* 15.2.10.5.14 */
1373 /*
1374  *  call-seq:
1375  *     str.downcase!   => str or nil
1376  *
1377  *  Downcases the contents of <i>str</i>, returning <code>nil</code> if no
1378  *  changes were made.
1379  */
1380 static mrb_value
mrb_str_downcase_bang(mrb_state * mrb,mrb_value str)1381 mrb_str_downcase_bang(mrb_state *mrb, mrb_value str)
1382 {
1383   char *p, *pend;
1384   mrb_bool modify = FALSE;
1385   struct RString *s = mrb_str_ptr(str);
1386 
1387   mrb_str_modify(mrb, s);
1388   p = RSTR_PTR(s);
1389   pend = RSTR_PTR(s) + RSTR_LEN(s);
1390   while (p < pend) {
1391     if (ISUPPER(*p)) {
1392       *p = TOLOWER(*p);
1393       modify = TRUE;
1394     }
1395     p++;
1396   }
1397 
1398   if (modify) return str;
1399   return mrb_nil_value();
1400 }
1401 
1402 /* 15.2.10.5.13 */
1403 /*
1404  *  call-seq:
1405  *     str.downcase   => new_str
1406  *
1407  *  Returns a copy of <i>str</i> with all uppercase letters replaced with their
1408  *  lowercase counterparts. The operation is locale insensitive---only
1409  *  characters 'A' to 'Z' are affected.
1410  *
1411  *     "hEllO".downcase   #=> "hello"
1412  */
1413 static mrb_value
mrb_str_downcase(mrb_state * mrb,mrb_value self)1414 mrb_str_downcase(mrb_state *mrb, mrb_value self)
1415 {
1416   mrb_value str;
1417 
1418   str = mrb_str_dup(mrb, self);
1419   mrb_str_downcase_bang(mrb, str);
1420   return str;
1421 }
1422 
1423 /* 15.2.10.5.16 */
1424 /*
1425  *  call-seq:
1426  *     str.empty?   => true or false
1427  *
1428  *  Returns <code>true</code> if <i>str</i> has a length of zero.
1429  *
1430  *     "hello".empty?   #=> false
1431  *     "".empty?        #=> true
1432  */
1433 static mrb_value
mrb_str_empty_p(mrb_state * mrb,mrb_value self)1434 mrb_str_empty_p(mrb_state *mrb, mrb_value self)
1435 {
1436   struct RString *s = mrb_str_ptr(self);
1437 
1438   return mrb_bool_value(RSTR_LEN(s) == 0);
1439 }
1440 
1441 /* 15.2.10.5.17 */
1442 /*
1443  * call-seq:
1444  *   str.eql?(other)   => true or false
1445  *
 * Two strings are equal if they have the same length and content.
1447  */
1448 static mrb_value
mrb_str_eql(mrb_state * mrb,mrb_value self)1449 mrb_str_eql(mrb_state *mrb, mrb_value self)
1450 {
1451   mrb_value str2;
1452   mrb_bool eql_p;
1453 
1454   mrb_get_args(mrb, "o", &str2);
1455   eql_p = (mrb_type(str2) == MRB_TT_STRING) && str_eql(mrb, self, str2);
1456 
1457   return mrb_bool_value(eql_p);
1458 }
1459 
1460 MRB_API mrb_value
mrb_str_substr(mrb_state * mrb,mrb_value str,mrb_int beg,mrb_int len)1461 mrb_str_substr(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len)
1462 {
1463   return str_substr(mrb, str, beg, len);
1464 }
1465 
1466 uint32_t
mrb_str_hash(mrb_state * mrb,mrb_value str)1467 mrb_str_hash(mrb_state *mrb, mrb_value str)
1468 {
1469   /* 1-8-7 */
1470   struct RString *s = mrb_str_ptr(str);
1471   mrb_int len = RSTR_LEN(s);
1472   char *p = RSTR_PTR(s);
1473   uint64_t key = 0;
1474 
1475   while (len--) {
1476     key = key*65599 + *p;
1477     p++;
1478   }
1479   return (uint32_t)(key + (key>>5));
1480 }
1481 
1482 /* 15.2.10.5.20 */
1483 /*
1484  * call-seq:
1485  *    str.hash   => fixnum
1486  *
1487  * Return a hash based on the string's length and content.
1488  */
1489 static mrb_value
mrb_str_hash_m(mrb_state * mrb,mrb_value self)1490 mrb_str_hash_m(mrb_state *mrb, mrb_value self)
1491 {
1492   mrb_int key = mrb_str_hash(mrb, self);
1493   return mrb_fixnum_value(key);
1494 }
1495 
1496 /* 15.2.10.5.21 */
1497 /*
1498  *  call-seq:
1499  *     str.include? other_str   => true or false
1500  *     str.include? fixnum      => true or false
1501  *
1502  *  Returns <code>true</code> if <i>str</i> contains the given string or
1503  *  character.
1504  *
1505  *     "hello".include? "lo"   #=> true
1506  *     "hello".include? "ol"   #=> false
1507  *     "hello".include? ?h     #=> true
1508  */
1509 static mrb_value
mrb_str_include(mrb_state * mrb,mrb_value self)1510 mrb_str_include(mrb_state *mrb, mrb_value self)
1511 {
1512   mrb_value str2;
1513 
1514   mrb_get_args(mrb, "S", &str2);
1515   if (str_index_str(mrb, self, str2, 0) < 0)
1516     return mrb_bool_value(FALSE);
1517   return mrb_bool_value(TRUE);
1518 }
1519 
1520 /* 15.2.10.5.22 */
1521 /*
1522  *  call-seq:
1523  *     str.index(substring [, offset])   => fixnum or nil
1524  *     str.index(fixnum [, offset])      => fixnum or nil
1525  *     str.index(regexp [, offset])      => fixnum or nil
1526  *
1527  *  Returns the index of the first occurrence of the given
1528  *  <i>substring</i>,
1529  *  character (<i>fixnum</i>), or pattern (<i>regexp</i>) in <i>str</i>.
1530  *  Returns
1531  *  <code>nil</code> if not found.
1532  *  If the second parameter is present, it
1533  *  specifies the position in the string to begin the search.
1534  *
1535  *     "hello".index('e')             #=> 1
1536  *     "hello".index('lo')            #=> 3
1537  *     "hello".index('a')             #=> nil
1538  *     "hello".index(101)             #=> 1(101=0x65='e')
1539  *     "hello".index(/[aeiou]/, -3)   #=> 4
1540  */
1541 static mrb_value
mrb_str_index_m(mrb_state * mrb,mrb_value str)1542 mrb_str_index_m(mrb_state *mrb, mrb_value str)
1543 {
1544   mrb_value *argv;
1545   mrb_int argc;
1546   mrb_value sub;
1547   mrb_int pos, clen;
1548 
1549   mrb_get_args(mrb, "*!", &argv, &argc);
1550   if (argc == 2) {
1551     mrb_get_args(mrb, "oi", &sub, &pos);
1552   }
1553   else {
1554     pos = 0;
1555     if (argc > 0)
1556       sub = argv[0];
1557     else
1558       sub = mrb_nil_value();
1559   }
1560   mrb_regexp_check(mrb, sub);
1561   clen = RSTRING_CHAR_LEN(str);
1562   if (pos < 0) {
1563     pos += clen;
1564     if (pos < 0) {
1565       return mrb_nil_value();
1566     }
1567   }
1568   if (pos > clen) return mrb_nil_value();
1569   pos = chars2bytes(str, 0, pos);
1570 
1571   switch (mrb_type(sub)) {
1572     default: {
1573       mrb_value tmp;
1574 
1575       tmp = mrb_check_string_type(mrb, sub);
1576       if (mrb_nil_p(tmp)) {
1577         mrb_raisef(mrb, E_TYPE_ERROR, "type mismatch: %S given", sub);
1578       }
1579       sub = tmp;
1580     }
1581     /* fall through */
1582     case MRB_TT_STRING:
1583       pos = str_index_str(mrb, str, sub, pos);
1584       break;
1585   }
1586 
1587   if (pos == -1) return mrb_nil_value();
1588   pos = bytes2chars(RSTRING_PTR(str), pos);
1589   BYTES_ALIGN_CHECK(pos);
1590   return mrb_fixnum_value(pos);
1591 }
1592 
1593 /* 15.2.10.5.24 */
1594 /* 15.2.10.5.28 */
1595 /*
1596  *  call-seq:
1597  *     str.replace(other_str)   => str
1598  *
1599  *     s = "hello"         #=> "hello"
1600  *     s.replace "world"   #=> "world"
1601  */
1602 static mrb_value
mrb_str_replace(mrb_state * mrb,mrb_value str)1603 mrb_str_replace(mrb_state *mrb, mrb_value str)
1604 {
1605   mrb_value str2;
1606 
1607   mrb_get_args(mrb, "S", &str2);
1608   return str_replace(mrb, mrb_str_ptr(str), mrb_str_ptr(str2));
1609 }
1610 
1611 /* 15.2.10.5.23 */
1612 /*
1613  *  call-seq:
1614  *     String.new(str="")   => new_str
1615  *
1616  *  Returns a new string object containing a copy of <i>str</i>.
1617  */
1618 static mrb_value
mrb_str_init(mrb_state * mrb,mrb_value self)1619 mrb_str_init(mrb_state *mrb, mrb_value self)
1620 {
1621   mrb_value str2;
1622 
1623   if (mrb_get_args(mrb, "|S", &str2) == 0) {
1624     struct RString *s = str_new(mrb, 0, 0);
1625     str2 = mrb_obj_value(s);
1626   }
1627   str_replace(mrb, mrb_str_ptr(self), mrb_str_ptr(str2));
1628   return self;
1629 }
1630 
1631 /* 15.2.10.5.25 */
1632 /* 15.2.10.5.41 */
1633 /*
1634  *  call-seq:
1635  *     str.intern   => symbol
1636  *     str.to_sym   => symbol
1637  *
1638  *  Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the
1639  *  symbol if it did not previously exist. See <code>Symbol#id2name</code>.
1640  *
1641  *     "Koala".intern         #=> :Koala
1642  *     s = 'cat'.to_sym       #=> :cat
1643  *     s == :cat              #=> true
1644  *     s = '@cat'.to_sym      #=> :@cat
1645  *     s == :@cat             #=> true
1646  *
1647  *  This can also be used to create symbols that cannot be represented using the
1648  *  <code>:xxx</code> notation.
1649  *
1650  *     'cat and dog'.to_sym   #=> :"cat and dog"
1651  */
MRB_API mrb_value
mrb_str_intern(mrb_state *mrb, mrb_value self)
{
  /* mrb_intern_str() yields the symbol for the string; box it as a value */
  return mrb_symbol_value(mrb_intern_str(mrb, self));
}
1657 /* ---------------------------------- */
1658 MRB_API mrb_value
mrb_obj_as_string(mrb_state * mrb,mrb_value obj)1659 mrb_obj_as_string(mrb_state *mrb, mrb_value obj)
1660 {
1661   mrb_value str;
1662 
1663   if (mrb_string_p(obj)) {
1664     return obj;
1665   }
1666   str = mrb_funcall(mrb, obj, "to_s", 0);
1667   if (!mrb_string_p(str))
1668     return mrb_any_to_s(mrb, obj);
1669   return str;
1670 }
1671 
1672 MRB_API mrb_value
mrb_ptr_to_str(mrb_state * mrb,void * p)1673 mrb_ptr_to_str(mrb_state *mrb, void *p)
1674 {
1675   struct RString *p_str;
1676   char *p1;
1677   char *p2;
1678   uintptr_t n = (uintptr_t)p;
1679 
1680   p_str = str_new(mrb, NULL, 2 + sizeof(uintptr_t) * CHAR_BIT / 4);
1681   p1 = RSTR_PTR(p_str);
1682   *p1++ = '0';
1683   *p1++ = 'x';
1684   p2 = p1;
1685 
1686   do {
1687     *p2++ = mrb_digitmap[n % 16];
1688     n /= 16;
1689   } while (n > 0);
1690   *p2 = '\0';
1691   RSTR_SET_LEN(p_str, (mrb_int)(p2 - RSTR_PTR(p_str)));
1692 
1693   while (p1 < p2) {
1694     const char  c = *p1;
1695     *p1++ = *--p2;
1696     *p2 = c;
1697   }
1698 
1699   return mrb_obj_value(p_str);
1700 }
1701 
1702 /* 15.2.10.5.30 */
1703 /*
1704  *  call-seq:
1705  *     str.reverse!   => str
1706  *
1707  *  Reverses <i>str</i> in place.
1708  */
1709 static mrb_value
mrb_str_reverse_bang(mrb_state * mrb,mrb_value str)1710 mrb_str_reverse_bang(mrb_state *mrb, mrb_value str)
1711 {
1712 #ifdef MRB_UTF8_STRING
1713   mrb_int utf8_len = RSTRING_CHAR_LEN(str);
1714   mrb_int len = RSTRING_LEN(str);
1715 
1716   if (utf8_len == len) goto bytes;
1717   if (utf8_len > 1) {
1718     char *buf;
1719     char *p, *e, *r;
1720 
1721     mrb_str_modify(mrb, mrb_str_ptr(str));
1722     len = RSTRING_LEN(str);
1723     buf = (char*)mrb_malloc(mrb, (size_t)len);
1724     p = buf;
1725     e = buf + len;
1726 
1727     memcpy(buf, RSTRING_PTR(str), len);
1728     r = RSTRING_PTR(str) + len;
1729 
1730     while (p<e) {
1731       mrb_int clen = utf8len(p, e);
1732       r -= clen;
1733       memcpy(r, p, clen);
1734       p += clen;
1735     }
1736     mrb_free(mrb, buf);
1737   }
1738   return str;
1739 
1740  bytes:
1741 #endif
1742   {
1743     struct RString *s = mrb_str_ptr(str);
1744     char *p, *e;
1745     char c;
1746 
1747     mrb_str_modify(mrb, s);
1748     if (RSTR_LEN(s) > 1) {
1749       p = RSTR_PTR(s);
1750       e = p + RSTR_LEN(s) - 1;
1751       while (p < e) {
1752       c = *p;
1753       *p++ = *e;
1754       *e-- = c;
1755       }
1756     }
1757     return str;
1758   }
1759 }
1760 
1761 /* ---------------------------------- */
1762 /* 15.2.10.5.29 */
1763 /*
1764  *  call-seq:
1765  *     str.reverse   => new_str
1766  *
1767  *  Returns a new string with the characters from <i>str</i> in reverse order.
1768  *
1769  *     "stressed".reverse   #=> "desserts"
1770  */
1771 static mrb_value
mrb_str_reverse(mrb_state * mrb,mrb_value str)1772 mrb_str_reverse(mrb_state *mrb, mrb_value str)
1773 {
1774   mrb_value str2 = mrb_str_dup(mrb, str);
1775   mrb_str_reverse_bang(mrb, str2);
1776   return str2;
1777 }
1778 
1779 /* 15.2.10.5.31 */
1780 /*
1781  *  call-seq:
1782  *     str.rindex(substring [, fixnum])   => fixnum or nil
1783  *     str.rindex(fixnum [, fixnum])   => fixnum or nil
1784  *     str.rindex(regexp [, fixnum])   => fixnum or nil
1785  *
1786  *  Returns the index of the last occurrence of the given <i>substring</i>,
1787  *  character (<i>fixnum</i>), or pattern (<i>regexp</i>) in <i>str</i>. Returns
1788  *  <code>nil</code> if not found. If the second parameter is present, it
1789  *  specifies the position in the string to end the search---characters beyond
1790  *  this point will not be considered.
1791  *
1792  *     "hello".rindex('e')             #=> 1
1793  *     "hello".rindex('l')             #=> 3
1794  *     "hello".rindex('a')             #=> nil
1795  *     "hello".rindex(101)             #=> 1
1796  *     "hello".rindex(/[aeiou]/, -2)   #=> 1
1797  */
1798 static mrb_value
mrb_str_rindex(mrb_state * mrb,mrb_value str)1799 mrb_str_rindex(mrb_state *mrb, mrb_value str)
1800 {
1801   mrb_value *argv;
1802   mrb_int argc;
1803   mrb_value sub;
1804   mrb_int pos, len = RSTRING_CHAR_LEN(str);
1805 
1806   mrb_get_args(mrb, "*!", &argv, &argc);
1807   if (argc == 2) {
1808     mrb_get_args(mrb, "oi", &sub, &pos);
1809     if (pos < 0) {
1810       pos += len;
1811       if (pos < 0) {
1812         mrb_regexp_check(mrb, sub);
1813         return mrb_nil_value();
1814       }
1815     }
1816     if (pos > len) pos = len;
1817   }
1818   else {
1819     pos = len;
1820     if (argc > 0)
1821       sub = argv[0];
1822     else
1823       sub = mrb_nil_value();
1824   }
1825   pos = chars2bytes(str, 0, pos);
1826   mrb_regexp_check(mrb, sub);
1827 
1828   switch (mrb_type(sub)) {
1829     default: {
1830       mrb_value tmp;
1831 
1832       tmp = mrb_check_string_type(mrb, sub);
1833       if (mrb_nil_p(tmp)) {
1834         mrb_raisef(mrb, E_TYPE_ERROR, "type mismatch: %S given", sub);
1835       }
1836       sub = tmp;
1837     }
1838      /* fall through */
1839     case MRB_TT_STRING:
1840       pos = str_rindex(mrb, str, sub, pos);
1841       if (pos >= 0) {
1842         pos = bytes2chars(RSTRING_PTR(str), pos);
1843         BYTES_ALIGN_CHECK(pos);
1844         return mrb_fixnum_value(pos);
1845       }
1846       break;
1847 
1848   } /* end of switch (TYPE(sub)) */
1849   return mrb_nil_value();
1850 }
1851 
1852 /* 15.2.10.5.35 */
1853 
1854 /*
1855  *  call-seq:
1856  *     str.split(pattern="\n", [limit])   => anArray
1857  *
1858  *  Divides <i>str</i> into substrings based on a delimiter, returning an array
1859  *  of these substrings.
1860  *
1861  *  If <i>pattern</i> is a <code>String</code>, then its contents are used as
1862  *  the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
1863  *  space, <i>str</i> is split on whitespace, with leading whitespace and runs
1864  *  of contiguous whitespace characters ignored.
1865  *
1866  *  If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
1867  *  pattern matches. Whenever the pattern matches a zero-length string,
1868  *  <i>str</i> is split into individual characters.
1869  *
1870  *  If <i>pattern</i> is omitted, the value of <code>$;</code> is used.  If
1871  *  <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
1872  *  split on whitespace as if ' ' were specified.
1873  *
1874  *  If the <i>limit</i> parameter is omitted, trailing null fields are
1875  *  suppressed. If <i>limit</i> is a positive number, at most that number of
1876  *  fields will be returned (if <i>limit</i> is <code>1</code>, the entire
1877  *  string is returned as the only entry in an array). If negative, there is no
1878  *  limit to the number of fields returned, and trailing null fields are not
1879  *  suppressed.
1880  *
1881  *     " now's  the time".split        #=> ["now's", "the", "time"]
1882  *     " now's  the time".split(' ')   #=> ["now's", "the", "time"]
1883  *     " now's  the time".split(/ /)   #=> ["", "now's", "", "the", "time"]
1884  *     "hello".split(//)               #=> ["h", "e", "l", "l", "o"]
1885  *     "hello".split(//, 3)            #=> ["h", "e", "llo"]
1886  *
1887  *     "mellow yellow".split("ello")   #=> ["m", "w y", "w"]
1888  *     "1,2,,3,4,,".split(',')         #=> ["1", "2", "", "3", "4"]
1889  *     "1,2,,3,4,,".split(',', 4)      #=> ["1", "2", "", "3,4,,"]
1890  *     "1,2,,3,4,,".split(',', -4)     #=> ["1", "2", "", "3", "4", "", ""]
1891  */
1892 
static mrb_value
mrb_str_split_m(mrb_state *mrb, mrb_value str)
{
  mrb_int argc;
  mrb_value spat = mrb_nil_value();
  enum {awk, string, regexp} split_type = string;
  mrb_int i = 0;               /* number of fields counted against the limit */
  mrb_int beg;                 /* byte offset where the pending field starts */
  mrb_int end;
  mrb_int lim = 0;
  mrb_bool lim_p;              /* TRUE only for an explicit, positive limit */
  mrb_value result, tmp;

  argc = mrb_get_args(mrb, "|oi", &spat, &lim);
  lim_p = (lim > 0 && argc == 2);
  if (argc == 2) {
    /* limit 1: no splitting at all; the array holds str itself (not a
       copy), or is empty when str is empty */
    if (lim == 1) {
      if (RSTRING_LEN(str) == 0)
        return mrb_ary_new_capa(mrb, 0);
      return mrb_ary_new_from_values(mrb, 1, &str);
    }
    i = 1;
  }

  /* choose the mode: no/nil pattern or a single space -> awk (whitespace)
     mode; any other String -> literal mode; everything else is handed to
     mrb_noregexp() */
  if (argc == 0 || mrb_nil_p(spat)) {
    split_type = awk;
  }
  else {
    if (mrb_string_p(spat)) {
      split_type = string;
      if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' ') {
          split_type = awk;
      }
    }
    else {
      mrb_noregexp(mrb, str);
    }
  }

  result = mrb_ary_new(mrb);
  beg = 0;
  if (split_type == awk) {
    mrb_bool skip = TRUE;      /* TRUE while between fields (in whitespace) */
    mrb_int idx = 0;
    mrb_int str_len = RSTRING_LEN(str);
    unsigned int c;
    int ai = mrb_gc_arena_save(mrb);

    idx = end = beg;
    while (idx < str_len) {
      c = (unsigned char)RSTRING_PTR(str)[idx++];
      if (skip) {
        if (ISSPACE(c)) {
          /* still in leading/separating whitespace: move the field start */
          beg = idx;
        }
        else {
          /* first byte of a new field */
          end = idx;
          skip = FALSE;
          /* limit reached: stop here so the tail handling below emits the
             rest of the string (from beg) as the final field */
          if (lim_p && lim <= i) break;
        }
      }
      else if (ISSPACE(c)) {
        /* whitespace ends the current field [beg, end) */
        mrb_ary_push(mrb, result, byte_subseq(mrb, str, beg, end-beg));
        mrb_gc_arena_restore(mrb, ai);
        skip = TRUE;
        beg = idx;
        if (lim_p) ++i;
      }
      else {
        end = idx;
      }
    }
  }
  else if (split_type == string) {
    mrb_int str_len = RSTRING_LEN(str);
    mrb_int pat_len = RSTRING_LEN(spat);
    mrb_int idx = 0;
    int ai = mrb_gc_arena_save(mrb);

    while (idx < str_len) {
      if (pat_len > 0) {
        /* end is the match position relative to idx, not to the string start */
        end = mrb_memsearch(RSTRING_PTR(spat), pat_len, RSTRING_PTR(str)+idx, str_len - idx);
        if (end < 0) break;
      }
      else {
        /* empty pattern: advance by the byte width chars2bytes() reports
           for one character at idx */
        end = chars2bytes(str, idx, 1);
      }
      mrb_ary_push(mrb, result, byte_subseq(mrb, str, idx, end));
      mrb_gc_arena_restore(mrb, ai);
      idx += end + pat_len;
      if (lim_p && lim <= ++i) break;
    }
    beg = idx;
  }
  else {
    /* NOTE(review): split_type is only ever set to awk or string above, so
       this branch looks unreachable -- confirm */
    mrb_noregexp(mrb, str);
  }
  /* emit whatever follows the last separator; an empty trailing field is
     only added when a limit was given (positive or negative) */
  if (RSTRING_LEN(str) > 0 && (lim_p || RSTRING_LEN(str) > beg || lim < 0)) {
    if (RSTRING_LEN(str) == beg) {
      tmp = mrb_str_new_empty(mrb, str);
    }
    else {
      tmp = byte_subseq(mrb, str, beg, RSTRING_LEN(str)-beg);
    }
    mrb_ary_push(mrb, result, tmp);
  }
  /* without a limit (lim == 0), trailing empty fields are dropped */
  if (!lim_p && lim == 0) {
    mrb_int len;
    while ((len = RARRAY_LEN(result)) > 0 &&
           (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
      mrb_ary_pop(mrb, result);
  }

  return result;
}
2008 
2009 static mrb_value
mrb_str_len_to_inum(mrb_state * mrb,const char * str,mrb_int len,mrb_int base,int badcheck)2010 mrb_str_len_to_inum(mrb_state *mrb, const char *str, mrb_int len, mrb_int base, int badcheck)
2011 {
2012   const char *p = str;
2013   const char *pend = str + len;
2014   char sign = 1;
2015   int c;
2016   uint64_t n = 0;
2017   mrb_int val;
2018 
2019 #define conv_digit(c) \
2020     (ISDIGIT(c) ? ((c) - '0') : \
2021      ISLOWER(c) ? ((c) - 'a' + 10) : \
2022      ISUPPER(c) ? ((c) - 'A' + 10) : \
2023      -1)
2024 
2025   if (!p) {
2026     if (badcheck) goto bad;
2027     return mrb_fixnum_value(0);
2028   }
2029   while (p<pend && ISSPACE(*p))
2030     p++;
2031 
2032   if (p[0] == '+') {
2033     p++;
2034   }
2035   else if (p[0] == '-') {
2036     p++;
2037     sign = 0;
2038   }
2039   if (base <= 0) {
2040     if (p[0] == '0') {
2041       switch (p[1]) {
2042         case 'x': case 'X':
2043           base = 16;
2044           break;
2045         case 'b': case 'B':
2046           base = 2;
2047           break;
2048         case 'o': case 'O':
2049           base = 8;
2050           break;
2051         case 'd': case 'D':
2052           base = 10;
2053           break;
2054         default:
2055           base = 8;
2056           break;
2057       }
2058     }
2059     else if (base < -1) {
2060       base = -base;
2061     }
2062     else {
2063       base = 10;
2064     }
2065   }
2066   switch (base) {
2067     case 2:
2068       if (p[0] == '0' && (p[1] == 'b'||p[1] == 'B')) {
2069         p += 2;
2070       }
2071       break;
2072     case 3:
2073       break;
2074     case 8:
2075       if (p[0] == '0' && (p[1] == 'o'||p[1] == 'O')) {
2076         p += 2;
2077       }
2078     case 4: case 5: case 6: case 7:
2079       break;
2080     case 10:
2081       if (p[0] == '0' && (p[1] == 'd'||p[1] == 'D')) {
2082         p += 2;
2083       }
2084     case 9: case 11: case 12: case 13: case 14: case 15:
2085       break;
2086     case 16:
2087       if (p[0] == '0' && (p[1] == 'x'||p[1] == 'X')) {
2088         p += 2;
2089       }
2090       break;
2091     default:
2092       if (base < 2 || 36 < base) {
2093         mrb_raisef(mrb, E_ARGUMENT_ERROR, "illegal radix %S", mrb_fixnum_value(base));
2094       }
2095       break;
2096   } /* end of switch (base) { */
2097   if (p>=pend) {
2098     if (badcheck) goto bad;
2099     return mrb_fixnum_value(0);
2100   }
2101   if (*p == '0') {    /* squeeze preceding 0s */
2102     p++;
2103     while (p<pend) {
2104       c = *p++;
2105       if (c == '_') {
2106         if (p<pend && *p == '_') {
2107           if (badcheck) goto bad;
2108           break;
2109         }
2110         continue;
2111       }
2112       if (c != '0') {
2113         p--;
2114         break;
2115       }
2116     }
2117     if (*(p - 1) == '0')
2118       p--;
2119   }
2120   if (p == pend) {
2121     if (badcheck) goto bad;
2122     return mrb_fixnum_value(0);
2123   }
2124   for ( ;p<pend;p++) {
2125     if (*p == '_') {
2126       p++;
2127       if (p==pend) {
2128         if (badcheck) goto bad;
2129         continue;
2130       }
2131       if (*p == '_') {
2132         if (badcheck) goto bad;
2133         break;
2134       }
2135     }
2136     if (badcheck && *p == '\0') {
2137       goto nullbyte;
2138     }
2139     c = conv_digit(*p);
2140     if (c < 0 || c >= base) {
2141       break;
2142     }
2143     n *= base;
2144     n += c;
2145     if (n > (uint64_t)MRB_INT_MAX + (sign ? 0 : 1)) {
2146 #ifndef MRB_WITHOUT_FLOAT
2147       if (base == 10) {
2148         return mrb_float_value(mrb, mrb_str_to_dbl(mrb, mrb_str_new(mrb, str, len), badcheck));
2149       }
2150       else
2151 #endif
2152       {
2153         mrb_raisef(mrb, E_ARGUMENT_ERROR, "string (%S) too big for integer",
2154                    mrb_str_new(mrb, str, pend-str));
2155       }
2156     }
2157   }
2158   val = (mrb_int)n;
2159   if (badcheck) {
2160     if (p == str) goto bad; /* no number */
2161     while (p<pend && ISSPACE(*p)) p++;
2162     if (p<pend) goto bad;       /* trailing garbage */
2163   }
2164 
2165   return mrb_fixnum_value(sign ? val : -val);
2166  nullbyte:
2167   mrb_raise(mrb, E_ARGUMENT_ERROR, "string contains null byte");
2168   /* not reached */
2169  bad:
2170   mrb_raisef(mrb, E_ARGUMENT_ERROR, "invalid string for number(%S)",
2171              mrb_inspect(mrb, mrb_str_new(mrb, str, pend-str)));
2172   /* not reached */
2173   return mrb_fixnum_value(0);
2174 }
2175 
2176 MRB_API mrb_value
mrb_cstr_to_inum(mrb_state * mrb,const char * str,mrb_int base,mrb_bool badcheck)2177 mrb_cstr_to_inum(mrb_state *mrb, const char *str, mrb_int base, mrb_bool badcheck)
2178 {
2179   return mrb_str_len_to_inum(mrb, str, strlen(str), base, badcheck);
2180 }
2181 
2182 MRB_API const char*
mrb_string_value_cstr(mrb_state * mrb,mrb_value * ptr)2183 mrb_string_value_cstr(mrb_state *mrb, mrb_value *ptr)
2184 {
2185   mrb_value str = mrb_to_str(mrb, *ptr);
2186   struct RString *ps = mrb_str_ptr(str);
2187   mrb_int len = mrb_str_strlen(mrb, ps);
2188   char *p = RSTR_PTR(ps);
2189 
2190   if (!p || p[len] != '\0') {
2191     if (MRB_FROZEN_P(ps)) {
2192       *ptr = str = mrb_str_dup(mrb, str);
2193       ps = mrb_str_ptr(str);
2194     }
2195     mrb_str_modify(mrb, ps);
2196     return RSTR_PTR(ps);
2197   }
2198   return p;
2199 }
2200 
2201 MRB_API mrb_value
mrb_str_to_inum(mrb_state * mrb,mrb_value str,mrb_int base,mrb_bool badcheck)2202 mrb_str_to_inum(mrb_state *mrb, mrb_value str, mrb_int base, mrb_bool badcheck)
2203 {
2204   const char *s;
2205   mrb_int len;
2206 
2207   s = mrb_string_value_ptr(mrb, str);
2208   len = RSTRING_LEN(str);
2209   return mrb_str_len_to_inum(mrb, s, len, base, badcheck);
2210 }
2211 
2212 /* 15.2.10.5.38 */
2213 /*
2214  *  call-seq:
2215  *     str.to_i(base=10)   => integer
2216  *
2217  *  Returns the result of interpreting leading characters in <i>str</i> as an
2218  *  integer base <i>base</i> (between 2 and 36). Extraneous characters past the
2219  *  end of a valid number are ignored. If there is not a valid number at the
2220  *  start of <i>str</i>, <code>0</code> is returned. This method never raises an
2221  *  exception.
2222  *
2223  *     "12345".to_i             #=> 12345
2224  *     "99 red balloons".to_i   #=> 99
2225  *     "0a".to_i                #=> 0
2226  *     "0a".to_i(16)            #=> 10
2227  *     "hello".to_i             #=> 0
2228  *     "1100101".to_i(2)        #=> 101
2229  *     "1100101".to_i(8)        #=> 294977
2230  *     "1100101".to_i(10)       #=> 1100101
2231  *     "1100101".to_i(16)       #=> 17826049
2232  */
2233 static mrb_value
mrb_str_to_i(mrb_state * mrb,mrb_value self)2234 mrb_str_to_i(mrb_state *mrb, mrb_value self)
2235 {
2236   mrb_int base = 10;
2237 
2238   mrb_get_args(mrb, "|i", &base);
2239   if (base < 0) {
2240     mrb_raisef(mrb, E_ARGUMENT_ERROR, "illegal radix %S", mrb_fixnum_value(base));
2241   }
2242   return mrb_str_to_inum(mrb, self, base, FALSE);
2243 }
2244 
2245 #ifndef MRB_WITHOUT_FLOAT
2246 MRB_API double
mrb_cstr_to_dbl(mrb_state * mrb,const char * p,mrb_bool badcheck)2247 mrb_cstr_to_dbl(mrb_state *mrb, const char * p, mrb_bool badcheck)
2248 {
2249   char *end;
2250   char buf[DBL_DIG * 4 + 10];
2251   double d;
2252 
2253   enum {max_width = 20};
2254 
2255   if (!p) return 0.0;
2256   while (ISSPACE(*p)) p++;
2257 
2258   if (!badcheck && p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {
2259     return 0.0;
2260   }
2261   d = mrb_float_read(p, &end);
2262   if (p == end) {
2263     if (badcheck) {
2264 bad:
2265       mrb_raisef(mrb, E_ARGUMENT_ERROR, "invalid string for float(%S)", mrb_str_new_cstr(mrb, p));
2266       /* not reached */
2267     }
2268     return d;
2269   }
2270   if (*end) {
2271     char *n = buf;
2272     char *e = buf + sizeof(buf) - 1;
2273     char prev = 0;
2274 
2275     while (p < end && n < e) prev = *n++ = *p++;
2276     while (*p) {
2277       if (*p == '_') {
2278         /* remove underscores between digits */
2279         if (badcheck) {
2280           if (n == buf || !ISDIGIT(prev)) goto bad;
2281           ++p;
2282           if (!ISDIGIT(*p)) goto bad;
2283         }
2284         else {
2285           while (*++p == '_');
2286           continue;
2287         }
2288       }
2289       prev = *p++;
2290       if (n < e) *n++ = prev;
2291     }
2292     *n = '\0';
2293     p = buf;
2294 
2295     if (!badcheck && p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {
2296       return 0.0;
2297     }
2298 
2299     d = mrb_float_read(p, &end);
2300     if (badcheck) {
2301       if (!end || p == end) goto bad;
2302       while (*end && ISSPACE(*end)) end++;
2303       if (*end) goto bad;
2304     }
2305   }
2306   return d;
2307 }
2308 
2309 MRB_API double
mrb_str_to_dbl(mrb_state * mrb,mrb_value str,mrb_bool badcheck)2310 mrb_str_to_dbl(mrb_state *mrb, mrb_value str, mrb_bool badcheck)
2311 {
2312   char *s;
2313   mrb_int len;
2314 
2315   mrb_to_str(mrb, str);
2316   s = RSTRING_PTR(str);
2317   len = RSTRING_LEN(str);
2318   if (s) {
2319     if (badcheck && memchr(s, '\0', len)) {
2320       mrb_raise(mrb, E_ARGUMENT_ERROR, "string for Float contains null byte");
2321     }
2322     if (s[len]) {    /* no sentinel somehow */
2323       struct RString *temp_str = str_new(mrb, s, len);
2324       s = RSTR_PTR(temp_str);
2325     }
2326   }
2327   return mrb_cstr_to_dbl(mrb, s, badcheck);
2328 }
2329 
2330 /* 15.2.10.5.39 */
2331 /*
2332  *  call-seq:
2333  *     str.to_f   => float
2334  *
2335  *  Returns the result of interpreting leading characters in <i>str</i> as a
2336  *  floating point number. Extraneous characters past the end of a valid number
2337  *  are ignored. If there is not a valid number at the start of <i>str</i>,
2338  *  <code>0.0</code> is returned. This method never raises an exception.
2339  *
2340  *     "123.45e1".to_f        #=> 1234.5
2341  *     "45.67 degrees".to_f   #=> 45.67
2342  *     "thx1138".to_f         #=> 0.0
2343  */
2344 static mrb_value
mrb_str_to_f(mrb_state * mrb,mrb_value self)2345 mrb_str_to_f(mrb_state *mrb, mrb_value self)
2346 {
2347   return mrb_float_value(mrb, mrb_str_to_dbl(mrb, self, FALSE));
2348 }
2349 #endif
2350 
2351 /* 15.2.10.5.40 */
2352 /*
2353  *  call-seq:
2354  *     str.to_s     => str
2355  *
2356  *  Returns the receiver.
2357  */
2358 static mrb_value
mrb_str_to_s(mrb_state * mrb,mrb_value self)2359 mrb_str_to_s(mrb_state *mrb, mrb_value self)
2360 {
2361   if (mrb_obj_class(mrb, self) != mrb->string_class) {
2362     return mrb_str_dup(mrb, self);
2363   }
2364   return self;
2365 }
2366 
2367 /* 15.2.10.5.43 */
2368 /*
2369  *  call-seq:
2370  *     str.upcase!   => str or nil
2371  *
2372  *  Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
2373  *  were made.
2374  */
2375 static mrb_value
mrb_str_upcase_bang(mrb_state * mrb,mrb_value str)2376 mrb_str_upcase_bang(mrb_state *mrb, mrb_value str)
2377 {
2378   struct RString *s = mrb_str_ptr(str);
2379   char *p, *pend;
2380   mrb_bool modify = FALSE;
2381 
2382   mrb_str_modify(mrb, s);
2383   p = RSTRING_PTR(str);
2384   pend = RSTRING_END(str);
2385   while (p < pend) {
2386     if (ISLOWER(*p)) {
2387       *p = TOUPPER(*p);
2388       modify = TRUE;
2389     }
2390     p++;
2391   }
2392 
2393   if (modify) return str;
2394   return mrb_nil_value();
2395 }
2396 
2397 /* 15.2.10.5.42 */
2398 /*
2399  *  call-seq:
2400  *     str.upcase   => new_str
2401  *
2402  *  Returns a copy of <i>str</i> with all lowercase letters replaced with their
2403  *  uppercase counterparts. The operation is locale insensitive---only
2404  *  characters 'a' to 'z' are affected.
2405  *
2406  *     "hEllO".upcase   #=> "HELLO"
2407  */
2408 static mrb_value
mrb_str_upcase(mrb_state * mrb,mrb_value self)2409 mrb_str_upcase(mrb_state *mrb, mrb_value self)
2410 {
2411   mrb_value str;
2412 
2413   str = mrb_str_dup(mrb, self);
2414   mrb_str_upcase_bang(mrb, str);
2415   return str;
2416 }
2417 
/* Non-zero when p is still before e and the byte at p is '$', '@' or '{'. */
#define IS_EVSTR(p,e) ((p) < (e) ? (*(p) == '$' || *(p) == '@' || *(p) == '{') : 0)
2419 
2420 /*
2421  *  call-seq:
2422  *     str.dump   -> new_str
2423  *
2424  *  Produces a version of <i>str</i> with all nonprinting characters replaced by
2425  *  <code>\nnn</code> notation and all special characters escaped.
2426  */
2427 mrb_value
mrb_str_dump(mrb_state * mrb,mrb_value str)2428 mrb_str_dump(mrb_state *mrb, mrb_value str)
2429 {
2430   mrb_int len;
2431   const char *p, *pend;
2432   char *q;
2433   struct RString *result;
2434 
2435   len = 2;                  /* "" */
2436   p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
2437   while (p < pend) {
2438     unsigned char c = *p++;
2439     switch (c) {
2440       case '"':  case '\\':
2441       case '\n': case '\r':
2442       case '\t': case '\f':
2443       case '\013': case '\010': case '\007': case '\033':
2444         len += 2;
2445         break;
2446 
2447       case '#':
2448         len += IS_EVSTR(p, pend) ? 2 : 1;
2449         break;
2450 
2451       default:
2452         if (ISPRINT(c)) {
2453           len++;
2454         }
2455         else {
2456           len += 4;                /* \NNN */
2457         }
2458         break;
2459     }
2460   }
2461 
2462   result = str_new(mrb, 0, len);
2463   str_with_class(mrb, result, str);
2464   p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
2465   q = RSTR_PTR(result);
2466   *q++ = '"';
2467   while (p < pend) {
2468     unsigned char c = *p++;
2469 
2470     switch (c) {
2471       case '"':
2472       case '\\':
2473         *q++ = '\\';
2474         *q++ = c;
2475         break;
2476 
2477       case '\n':
2478         *q++ = '\\';
2479         *q++ = 'n';
2480         break;
2481 
2482       case '\r':
2483         *q++ = '\\';
2484         *q++ = 'r';
2485         break;
2486 
2487       case '\t':
2488         *q++ = '\\';
2489         *q++ = 't';
2490         break;
2491 
2492       case '\f':
2493         *q++ = '\\';
2494         *q++ = 'f';
2495         break;
2496 
2497       case '\013':
2498         *q++ = '\\';
2499         *q++ = 'v';
2500         break;
2501 
2502       case '\010':
2503         *q++ = '\\';
2504         *q++ = 'b';
2505         break;
2506 
2507       case '\007':
2508         *q++ = '\\';
2509         *q++ = 'a';
2510         break;
2511 
2512       case '\033':
2513         *q++ = '\\';
2514         *q++ = 'e';
2515         break;
2516 
2517       case '#':
2518         if (IS_EVSTR(p, pend)) *q++ = '\\';
2519         *q++ = '#';
2520         break;
2521 
2522       default:
2523         if (ISPRINT(c)) {
2524           *q++ = c;
2525         }
2526         else {
2527           *q++ = '\\';
2528           *q++ = 'x';
2529           q[1] = mrb_digitmap[c % 16]; c /= 16;
2530           q[0] = mrb_digitmap[c % 16];
2531           q += 2;
2532         }
2533     }
2534   }
2535   *q = '"';
2536   return mrb_obj_value(result);
2537 }
2538 
2539 MRB_API mrb_value
mrb_str_cat(mrb_state * mrb,mrb_value str,const char * ptr,size_t len)2540 mrb_str_cat(mrb_state *mrb, mrb_value str, const char *ptr, size_t len)
2541 {
2542   struct RString *s = mrb_str_ptr(str);
2543   size_t capa;
2544   size_t total;
2545   ptrdiff_t off = -1;
2546 
2547   if (len == 0) return str;
2548   mrb_str_modify(mrb, s);
2549   if (ptr >= RSTR_PTR(s) && ptr <= RSTR_PTR(s) + (size_t)RSTR_LEN(s)) {
2550       off = ptr - RSTR_PTR(s);
2551   }
2552 
2553   capa = RSTR_CAPA(s);
2554   total = RSTR_LEN(s)+len;
2555   if (total >= MRB_INT_MAX) {
2556   size_error:
2557     mrb_raise(mrb, E_ARGUMENT_ERROR, "string size too big");
2558   }
2559   if (capa <= total) {
2560     if (capa == 0) capa = 1;
2561     while (capa <= total) {
2562       if (capa <= MRB_INT_MAX / 2) {
2563         capa *= 2;
2564       }
2565       else {
2566         capa = total+1;
2567       }
2568     }
2569     if (capa <= total || capa > MRB_INT_MAX) {
2570       goto size_error;
2571     }
2572     resize_capa(mrb, s, capa);
2573   }
2574   if (off != -1) {
2575       ptr = RSTR_PTR(s) + off;
2576   }
2577   memcpy(RSTR_PTR(s) + RSTR_LEN(s), ptr, len);
2578   mrb_assert_int_fit(size_t, total, mrb_int, MRB_INT_MAX);
2579   RSTR_SET_LEN(s, total);
2580   RSTR_PTR(s)[total] = '\0';   /* sentinel */
2581   return str;
2582 }
2583 
2584 MRB_API mrb_value
mrb_str_cat_cstr(mrb_state * mrb,mrb_value str,const char * ptr)2585 mrb_str_cat_cstr(mrb_state *mrb, mrb_value str, const char *ptr)
2586 {
2587   return mrb_str_cat(mrb, str, ptr, strlen(ptr));
2588 }
2589 
2590 MRB_API mrb_value
mrb_str_cat_str(mrb_state * mrb,mrb_value str,mrb_value str2)2591 mrb_str_cat_str(mrb_state *mrb, mrb_value str, mrb_value str2)
2592 {
2593   if (mrb_str_ptr(str) == mrb_str_ptr(str2)) {
2594     mrb_str_modify(mrb, mrb_str_ptr(str));
2595   }
2596   return mrb_str_cat(mrb, str, RSTRING_PTR(str2), RSTRING_LEN(str2));
2597 }
2598 
2599 MRB_API mrb_value
mrb_str_append(mrb_state * mrb,mrb_value str1,mrb_value str2)2600 mrb_str_append(mrb_state *mrb, mrb_value str1, mrb_value str2)
2601 {
2602   mrb_to_str(mrb, str2);
2603   return mrb_str_cat_str(mrb, str1, str2);
2604 }
2605 
2606 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
2607 
2608 /*
2609  * call-seq:
2610  *   str.inspect   -> string
2611  *
2612  * Returns a printable version of _str_, surrounded by quote marks,
2613  * with special characters escaped.
2614  *
2615  *    str = "hello"
2616  *    str[3] = "\b"
2617  *    str.inspect       #=> "\"hel\\bo\""
2618  */
2619 mrb_value
mrb_str_inspect(mrb_state * mrb,mrb_value str)2620 mrb_str_inspect(mrb_state *mrb, mrb_value str)
2621 {
2622   const char *p, *pend;
2623   char buf[CHAR_ESC_LEN + 1];
2624   mrb_value result = mrb_str_new_lit(mrb, "\"");
2625 
2626   p = RSTRING_PTR(str); pend = RSTRING_END(str);
2627   for (;p < pend; p++) {
2628     unsigned char c, cc;
2629 #ifdef MRB_UTF8_STRING
2630     mrb_int clen;
2631 
2632     clen = utf8len(p, pend);
2633     if (clen > 1) {
2634       mrb_int i;
2635 
2636       for (i=0; i<clen; i++) {
2637         buf[i] = p[i];
2638       }
2639       mrb_str_cat(mrb, result, buf, clen);
2640       p += clen-1;
2641       continue;
2642     }
2643 #endif
2644     c = *p;
2645     if (c == '"'|| c == '\\' || (c == '#' && IS_EVSTR(p+1, pend))) {
2646       buf[0] = '\\'; buf[1] = c;
2647       mrb_str_cat(mrb, result, buf, 2);
2648       continue;
2649     }
2650     if (ISPRINT(c)) {
2651       buf[0] = c;
2652       mrb_str_cat(mrb, result, buf, 1);
2653       continue;
2654     }
2655     switch (c) {
2656       case '\n': cc = 'n'; break;
2657       case '\r': cc = 'r'; break;
2658       case '\t': cc = 't'; break;
2659       case '\f': cc = 'f'; break;
2660       case '\013': cc = 'v'; break;
2661       case '\010': cc = 'b'; break;
2662       case '\007': cc = 'a'; break;
2663       case 033: cc = 'e'; break;
2664       default: cc = 0; break;
2665     }
2666     if (cc) {
2667       buf[0] = '\\';
2668       buf[1] = (char)cc;
2669       mrb_str_cat(mrb, result, buf, 2);
2670       continue;
2671     }
2672     else {
2673       buf[0] = '\\';
2674       buf[1] = 'x';
2675       buf[3] = mrb_digitmap[c % 16]; c /= 16;
2676       buf[2] = mrb_digitmap[c % 16];
2677       mrb_str_cat(mrb, result, buf, 4);
2678       continue;
2679     }
2680   }
2681   mrb_str_cat_lit(mrb, result, "\"");
2682 
2683   return result;
2684 }
2685 
2686 /*
2687  * call-seq:
2688  *   str.bytes   -> array of fixnums
2689  *
2690  * Returns an array of bytes in _str_.
2691  *
2692  *    str = "hello"
2693  *    str.bytes       #=> [104, 101, 108, 108, 111]
2694  */
2695 static mrb_value
mrb_str_bytes(mrb_state * mrb,mrb_value str)2696 mrb_str_bytes(mrb_state *mrb, mrb_value str)
2697 {
2698   struct RString *s = mrb_str_ptr(str);
2699   mrb_value a = mrb_ary_new_capa(mrb, RSTR_LEN(s));
2700   unsigned char *p = (unsigned char *)(RSTR_PTR(s)), *pend = p + RSTR_LEN(s);
2701 
2702   while (p < pend) {
2703     mrb_ary_push(mrb, a, mrb_fixnum_value(p[0]));
2704     p++;
2705   }
2706   return a;
2707 }
2708 
2709 /* ---------------------------*/
2710 void
mrb_init_string(mrb_state * mrb)2711 mrb_init_string(mrb_state *mrb)
2712 {
2713   struct RClass *s;
2714 
2715   mrb_static_assert(RSTRING_EMBED_LEN_MAX < (1 << 5), "pointer size too big for embedded string");
2716 
2717   mrb->string_class = s = mrb_define_class(mrb, "String", mrb->object_class);             /* 15.2.10 */
2718   MRB_SET_INSTANCE_TT(s, MRB_TT_STRING);
2719 
2720   mrb_define_method(mrb, s, "bytesize",        mrb_str_bytesize,        MRB_ARGS_NONE());
2721 
2722   mrb_define_method(mrb, s, "<=>",             mrb_str_cmp_m,           MRB_ARGS_REQ(1)); /* 15.2.10.5.1  */
2723   mrb_define_method(mrb, s, "==",              mrb_str_equal_m,         MRB_ARGS_REQ(1)); /* 15.2.10.5.2  */
2724   mrb_define_method(mrb, s, "+",               mrb_str_plus_m,          MRB_ARGS_REQ(1)); /* 15.2.10.5.4  */
2725   mrb_define_method(mrb, s, "*",               mrb_str_times,           MRB_ARGS_REQ(1)); /* 15.2.10.5.5  */
2726   mrb_define_method(mrb, s, "[]",              mrb_str_aref_m,          MRB_ARGS_ANY());  /* 15.2.10.5.6  */
2727   mrb_define_method(mrb, s, "capitalize",      mrb_str_capitalize,      MRB_ARGS_NONE()); /* 15.2.10.5.7  */
2728   mrb_define_method(mrb, s, "capitalize!",     mrb_str_capitalize_bang, MRB_ARGS_NONE()); /* 15.2.10.5.8  */
2729   mrb_define_method(mrb, s, "chomp",           mrb_str_chomp,           MRB_ARGS_ANY());  /* 15.2.10.5.9  */
2730   mrb_define_method(mrb, s, "chomp!",          mrb_str_chomp_bang,      MRB_ARGS_ANY());  /* 15.2.10.5.10 */
2731   mrb_define_method(mrb, s, "chop",            mrb_str_chop,            MRB_ARGS_NONE()); /* 15.2.10.5.11 */
2732   mrb_define_method(mrb, s, "chop!",           mrb_str_chop_bang,       MRB_ARGS_NONE()); /* 15.2.10.5.12 */
2733   mrb_define_method(mrb, s, "downcase",        mrb_str_downcase,        MRB_ARGS_NONE()); /* 15.2.10.5.13 */
2734   mrb_define_method(mrb, s, "downcase!",       mrb_str_downcase_bang,   MRB_ARGS_NONE()); /* 15.2.10.5.14 */
2735   mrb_define_method(mrb, s, "empty?",          mrb_str_empty_p,         MRB_ARGS_NONE()); /* 15.2.10.5.16 */
2736   mrb_define_method(mrb, s, "eql?",            mrb_str_eql,             MRB_ARGS_REQ(1)); /* 15.2.10.5.17 */
2737 
2738   mrb_define_method(mrb, s, "hash",            mrb_str_hash_m,          MRB_ARGS_NONE()); /* 15.2.10.5.20 */
2739   mrb_define_method(mrb, s, "include?",        mrb_str_include,         MRB_ARGS_REQ(1)); /* 15.2.10.5.21 */
2740   mrb_define_method(mrb, s, "index",           mrb_str_index_m,         MRB_ARGS_ANY());  /* 15.2.10.5.22 */
2741   mrb_define_method(mrb, s, "initialize",      mrb_str_init,            MRB_ARGS_REQ(1)); /* 15.2.10.5.23 */
2742   mrb_define_method(mrb, s, "initialize_copy", mrb_str_replace,         MRB_ARGS_REQ(1)); /* 15.2.10.5.24 */
2743   mrb_define_method(mrb, s, "intern",          mrb_str_intern,          MRB_ARGS_NONE()); /* 15.2.10.5.25 */
2744   mrb_define_method(mrb, s, "length",          mrb_str_size,            MRB_ARGS_NONE()); /* 15.2.10.5.26 */
2745   mrb_define_method(mrb, s, "replace",         mrb_str_replace,         MRB_ARGS_REQ(1)); /* 15.2.10.5.28 */
2746   mrb_define_method(mrb, s, "reverse",         mrb_str_reverse,         MRB_ARGS_NONE()); /* 15.2.10.5.29 */
2747   mrb_define_method(mrb, s, "reverse!",        mrb_str_reverse_bang,    MRB_ARGS_NONE()); /* 15.2.10.5.30 */
2748   mrb_define_method(mrb, s, "rindex",          mrb_str_rindex,          MRB_ARGS_ANY());  /* 15.2.10.5.31 */
2749   mrb_define_method(mrb, s, "size",            mrb_str_size,            MRB_ARGS_NONE()); /* 15.2.10.5.33 */
2750   mrb_define_method(mrb, s, "slice",           mrb_str_aref_m,          MRB_ARGS_ANY());  /* 15.2.10.5.34 */
2751   mrb_define_method(mrb, s, "split",           mrb_str_split_m,         MRB_ARGS_ANY());  /* 15.2.10.5.35 */
2752 
2753 #ifndef MRB_WITHOUT_FLOAT
2754   mrb_define_method(mrb, s, "to_f",            mrb_str_to_f,            MRB_ARGS_NONE()); /* 15.2.10.5.38 */
2755 #endif
2756   mrb_define_method(mrb, s, "to_i",            mrb_str_to_i,            MRB_ARGS_ANY());  /* 15.2.10.5.39 */
2757   mrb_define_method(mrb, s, "to_s",            mrb_str_to_s,            MRB_ARGS_NONE()); /* 15.2.10.5.40 */
2758   mrb_define_method(mrb, s, "to_str",          mrb_str_to_s,            MRB_ARGS_NONE());
2759   mrb_define_method(mrb, s, "to_sym",          mrb_str_intern,          MRB_ARGS_NONE()); /* 15.2.10.5.41 */
2760   mrb_define_method(mrb, s, "upcase",          mrb_str_upcase,          MRB_ARGS_NONE()); /* 15.2.10.5.42 */
2761   mrb_define_method(mrb, s, "upcase!",         mrb_str_upcase_bang,     MRB_ARGS_NONE()); /* 15.2.10.5.43 */
2762   mrb_define_method(mrb, s, "inspect",         mrb_str_inspect,         MRB_ARGS_NONE()); /* 15.2.10.5.46(x) */
2763   mrb_define_method(mrb, s, "bytes",           mrb_str_bytes,           MRB_ARGS_NONE());
2764 }
2765 
2766 #ifndef MRB_WITHOUT_FLOAT
2767 /*
2768  * Source code for the "strtod" library procedure.
2769  *
2770  * Copyright (c) 1988-1993 The Regents of the University of California.
2771  * Copyright (c) 1994 Sun Microsystems, Inc.
2772  *
2773  * Permission to use, copy, modify, and distribute this
2774  * software and its documentation for any purpose and without
2775  * fee is hereby granted, provided that the above copyright
2776  * notice appear in all copies.  The University of California
2777  * makes no representations about the suitability of this
2778  * software for any purpose.  It is provided "as is" without
2779  * express or implied warranty.
2780  *
2781  * RCS: @(#) $Id: strtod.c 11708 2007-02-12 23:01:19Z shyouhei $
2782  */
2783 
2784 #include <ctype.h>
2785 #include <errno.h>
2786 
/*
 * Largest possible base 10 exponent.  Any exponent larger than this will
 * already produce underflow or overflow, so there's no need to worry about
 * additional digits.
 */
static const int maxExponent = 511;

/*
 * Table giving binary powers of 10.  Entry i is 10^(2^i).  Used to convert
 * decimal exponents into floating-point numbers.
 */
static const double powersOf10[] = {
    1.0e1,
    1.0e2,
    1.0e4,
    1.0e8,
    1.0e16,
    1.0e32,
    1.0e64,
    1.0e128,
    1.0e256
};
2803 
2804 MRB_API double
mrb_float_read(const char * string,char ** endPtr)2805 mrb_float_read(const char *string, char **endPtr)
2806 /*  const char *string;            A decimal ASCII floating-point number,
2807                                  * optionally preceded by white space.
2808                                  * Must have form "-I.FE-X", where I is the
2809                                  * integer part of the mantissa, F is the
2810                                  * fractional part of the mantissa, and X
2811                                  * is the exponent.  Either of the signs
2812                                  * may be "+", "-", or omitted.  Either I
2813                                  * or F may be omitted, or both.  The decimal
2814                                  * point isn't necessary unless F is present.
2815                                  * The "E" may actually be an "e".  E and X
2816                                  * may both be omitted (but not just one).
2817                                  */
2818 /*  char **endPtr;                 If non-NULL, store terminating character's
2819                                  * address here. */
2820 {
2821     int sign, expSign = FALSE;
2822     double fraction, dblExp;
2823     const double *d;
2824     const char *p;
2825     int c;
2826     int exp = 0;                /* Exponent read from "EX" field. */
2827     int fracExp = 0;            /* Exponent that derives from the fractional
2828                                  * part.  Under normal circumstatnces, it is
2829                                  * the negative of the number of digits in F.
2830                                  * However, if I is very long, the last digits
2831                                  * of I get dropped (otherwise a long I with a
2832                                  * large negative exponent could cause an
2833                                  * unnecessary overflow on I alone).  In this
2834                                  * case, fracExp is incremented one for each
2835                                  * dropped digit. */
2836     int mantSize;               /* Number of digits in mantissa. */
2837     int decPt;                  /* Number of mantissa digits BEFORE decimal
2838                                  * point. */
2839     const char *pExp;           /* Temporarily holds location of exponent
2840                                  * in string. */
2841 
2842     /*
2843      * Strip off leading blanks and check for a sign.
2844      */
2845 
2846     p = string;
2847     while (ISSPACE(*p)) {
2848       p += 1;
2849     }
2850     if (*p == '-') {
2851       sign = TRUE;
2852       p += 1;
2853     }
2854     else {
2855       if (*p == '+') {
2856         p += 1;
2857       }
2858       sign = FALSE;
2859     }
2860 
2861     /*
2862      * Count the number of digits in the mantissa (including the decimal
2863      * point), and also locate the decimal point.
2864      */
2865 
2866     decPt = -1;
2867     for (mantSize = 0; ; mantSize += 1)
2868     {
2869       c = *p;
2870       if (!ISDIGIT(c)) {
2871         if ((c != '.') || (decPt >= 0)) {
2872           break;
2873         }
2874         decPt = mantSize;
2875       }
2876       p += 1;
2877     }
2878 
2879     /*
2880      * Now suck up the digits in the mantissa.  Use two integers to
2881      * collect 9 digits each (this is faster than using floating-point).
2882      * If the mantissa has more than 18 digits, ignore the extras, since
2883      * they can't affect the value anyway.
2884      */
2885 
2886     pExp  = p;
2887     p -= mantSize;
2888     if (decPt < 0) {
2889       decPt = mantSize;
2890     }
2891     else {
2892       mantSize -= 1; /* One of the digits was the point. */
2893     }
2894     if (mantSize > 18) {
2895       if (decPt - 18 > 29999) {
2896         fracExp = 29999;
2897       }
2898       else {
2899         fracExp = decPt - 18;
2900       }
2901       mantSize = 18;
2902     }
2903     else {
2904       fracExp = decPt - mantSize;
2905     }
2906     if (mantSize == 0) {
2907       fraction = 0.0;
2908       p = string;
2909       goto done;
2910     }
2911     else {
2912       int frac1, frac2;
2913       frac1 = 0;
2914       for ( ; mantSize > 9; mantSize -= 1)
2915       {
2916         c = *p;
2917         p += 1;
2918         if (c == '.') {
2919           c = *p;
2920           p += 1;
2921         }
2922         frac1 = 10*frac1 + (c - '0');
2923       }
2924       frac2 = 0;
2925       for (; mantSize > 0; mantSize -= 1)
2926       {
2927         c = *p;
2928         p += 1;
2929         if (c == '.') {
2930           c = *p;
2931           p += 1;
2932         }
2933         frac2 = 10*frac2 + (c - '0');
2934       }
2935       fraction = (1.0e9 * frac1) + frac2;
2936     }
2937 
2938     /*
2939      * Skim off the exponent.
2940      */
2941 
2942     p = pExp;
2943     if ((*p == 'E') || (*p == 'e')) {
2944       p += 1;
2945       if (*p == '-') {
2946         expSign = TRUE;
2947         p += 1;
2948       }
2949       else {
2950         if (*p == '+') {
2951           p += 1;
2952         }
2953         expSign = FALSE;
2954       }
2955       while (ISDIGIT(*p)) {
2956         exp = exp * 10 + (*p - '0');
2957         if (exp > 19999) {
2958           exp = 19999;
2959         }
2960         p += 1;
2961       }
2962     }
2963     if (expSign) {
2964       exp = fracExp - exp;
2965     }
2966     else {
2967       exp = fracExp + exp;
2968     }
2969 
2970     /*
2971      * Generate a floating-point number that represents the exponent.
2972      * Do this by processing the exponent one bit at a time to combine
2973      * many powers of 2 of 10. Then combine the exponent with the
2974      * fraction.
2975      */
2976 
2977     if (exp < 0) {
2978       expSign = TRUE;
2979       exp = -exp;
2980     }
2981     else {
2982       expSign = FALSE;
2983     }
2984     if (exp > maxExponent) {
2985       exp = maxExponent;
2986       errno = ERANGE;
2987     }
2988     dblExp = 1.0;
2989     for (d = powersOf10; exp != 0; exp >>= 1, d += 1) {
2990       if (exp & 01) {
2991         dblExp *= *d;
2992       }
2993     }
2994     if (expSign) {
2995       fraction /= dblExp;
2996     }
2997     else {
2998       fraction *= dblExp;
2999     }
3000 
3001 done:
3002     if (endPtr != NULL) {
3003       *endPtr = (char *) p;
3004     }
3005 
3006     if (sign) {
3007       return -fraction;
3008     }
3009     return fraction;
3010 }
3011 #endif
3012