1 /*
2 ** symbol.c - Symbol class
3 **
4 ** See Copyright Notice in mruby.h
5 */
6 
7 #include <limits.h>
8 #include <string.h>
9 #include <mruby.h>
10 #include <mruby/khash.h>
11 #include <mruby/string.h>
12 #include <mruby/dump.h>
13 #include <mruby/class.h>
14 
15 /* ------------------------------------------------------ */
/* One slot of the table of non-inline symbols (mrb->symtbl). */
typedef struct symbol_name {
  /* TRUE when `name` is used as given (not copied) by sym_intern();
     mrb_free_symtbl() frees `name` only when this is FALSE */
  mrb_bool lit : 1;
  /* distance (in table slots) back to the previous entry registered under
     the same symhash bucket; 0 ends the chain, 0xff makes find_symbol()
     fall back to a linear scan of the lower slots */
  uint8_t prev;
  /* length of `name` in bytes */
  uint16_t len;
  /* bytes of the symbol name */
  const char *name;
} symbol_name;
22 
/*
 * Bit layout of an mrb_sym value as used in this file:
 *   SYMBOL_INLINE       (bit 0) - the name is packed into the value itself
 *   SYMBOL_INLINE_LOWER (bit 1) - inline symbol packed with 5 bits per char
 *                                 (otherwise 6 bits per char)
 * Table-backed symbols carry their table index shifted left by
 * SYMBOL_NORMAL_SHIFT; packed characters of inline symbols start at bit
 * SYMBOL_INLINE_SHIFT.
 */
#define SYMBOL_INLINE_BIT_POS       1
#define SYMBOL_INLINE_LOWER_BIT_POS 2
#define SYMBOL_INLINE               (1 << (SYMBOL_INLINE_BIT_POS - 1))
#define SYMBOL_INLINE_LOWER         (1 << (SYMBOL_INLINE_LOWER_BIT_POS - 1))
#define SYMBOL_NORMAL_SHIFT         SYMBOL_INLINE_BIT_POS
#define SYMBOL_INLINE_SHIFT         SYMBOL_INLINE_LOWER_BIT_POS
#ifdef MRB_ENABLE_ALL_SYMBOLS
/* inline packing disabled: packing always yields 0, so every name is
   looked up in / registered to the symbol table */
# define SYMBOL_INLINE_P(sym) FALSE
# define SYMBOL_INLINE_LOWER_P(sym) FALSE
# define sym_inline_pack(name, len) 0
# define sym_inline_unpack(sym, buf, lenp) NULL
#else
/* non-zero when the corresponding flag bit is set in `sym` */
# define SYMBOL_INLINE_P(sym) ((sym) & SYMBOL_INLINE)
# define SYMBOL_INLINE_LOWER_P(sym) ((sym) & SYMBOL_INLINE_LOWER)
#endif
38 
39 static void
sym_validate_len(mrb_state * mrb,size_t len)40 sym_validate_len(mrb_state *mrb, size_t len)
41 {
42   if (len >= RITE_LV_NULL_MARK) {
43     mrb_raise(mrb, E_ARGUMENT_ERROR, "symbol length too long");
44   }
45 }
46 
47 #ifndef MRB_ENABLE_ALL_SYMBOLS
/* Alphabet of inline symbols. A character is stored as its index in this
   table plus one (0 marks the end of the packed name), so '_' and 'a'..'z'
   get the codes 1..27 and the remaining characters get larger codes. */
static const char pack_table[] = "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
49 
50 static mrb_sym
sym_inline_pack(const char * name,size_t len)51 sym_inline_pack(const char *name, size_t len)
52 {
53   const size_t lower_length_max = (MRB_SYMBOL_BIT - 2) / 5;
54   const size_t mix_length_max   = (MRB_SYMBOL_BIT - 2) / 6;
55 
56   char c;
57   const char *p;
58   size_t i;
59   mrb_sym sym = 0;
60   mrb_bool lower = TRUE;
61 
62   if (len > lower_length_max) return 0; /* too long */
63   for (i=0; i<len; i++) {
64     uint32_t bits;
65 
66     c = name[i];
67     if (c == 0) return 0;       /* NUL in name */
68     p = strchr(pack_table, (int)c);
69     if (p == 0) return 0;       /* non alnum char */
70     bits = (uint32_t)(p - pack_table)+1;
71     if (bits > 27) lower = FALSE;
72     if (i >= mix_length_max) break;
73     sym |= bits<<(i*6+SYMBOL_INLINE_SHIFT);
74   }
75   if (lower) {
76     sym = 0;
77     for (i=0; i<len; i++) {
78       uint32_t bits;
79 
80       c = name[i];
81       p = strchr(pack_table, (int)c);
82       bits = (uint32_t)(p - pack_table)+1;
83       sym |= bits<<(i*5+SYMBOL_INLINE_SHIFT);
84     }
85     return sym | SYMBOL_INLINE | SYMBOL_INLINE_LOWER;
86   }
87   if (len > mix_length_max) return 0;
88   return sym | SYMBOL_INLINE;
89 }
90 
91 static const char*
sym_inline_unpack(mrb_sym sym,char * buf,mrb_int * lenp)92 sym_inline_unpack(mrb_sym sym, char *buf, mrb_int *lenp)
93 {
94   int bit_per_char = SYMBOL_INLINE_LOWER_P(sym) ? 5 : 6;
95   int i;
96 
97   mrb_assert(SYMBOL_INLINE_P(sym));
98 
99   for (i=0; i<30/bit_per_char; i++) {
100     uint32_t bits = sym>>(i*bit_per_char+SYMBOL_INLINE_SHIFT) & ((1<<bit_per_char)-1);
101     if (bits == 0) break;
102     buf[i] = pack_table[bits-1];;
103   }
104   buf[i] = '\0';
105   if (lenp) *lenp = i;
106   return buf;
107 }
108 #endif
109 
/*
 * Hash the first `len` bytes of `key` down to 8 bits.
 * The result is used as an index into mrb->symhash.
 */
static uint8_t
symhash(const char *key, size_t len)
{
  uint32_t h = 0;
  uint32_t pos = 0;

  /* mix in one byte at a time */
  while (pos < len) {
    h += key[pos];
    h += (h << 10);
    h ^= (h >> 6);
    ++pos;
  }

  /* final scrambling */
  h += (h << 3);
  h ^= (h >> 11);
  h += (h << 15);

  return h & 0xff;
}
125 
126 static mrb_sym
find_symbol(mrb_state * mrb,const char * name,size_t len,uint8_t * hashp)127 find_symbol(mrb_state *mrb, const char *name, size_t len, uint8_t *hashp)
128 {
129   mrb_sym i;
130   symbol_name *sname;
131   uint8_t hash;
132 
133   /* inline symbol */
134   i = sym_inline_pack(name, len);
135   if (i > 0) return i;
136 
137   hash = symhash(name, len);
138   if (hashp) *hashp = hash;
139 
140   i = mrb->symhash[hash];
141   if (i == 0) return 0;
142   do {
143     sname = &mrb->symtbl[i];
144     if (sname->len == len && memcmp(sname->name, name, len) == 0) {
145       return i<<SYMBOL_NORMAL_SHIFT;
146     }
147     if (sname->prev == 0xff) {
148       i -= 0xff;
149       sname = &mrb->symtbl[i];
150       while (mrb->symtbl < sname) {
151         if (sname->len == len && memcmp(sname->name, name, len) == 0) {
152           return (mrb_sym)(sname - mrb->symtbl)<<SYMBOL_NORMAL_SHIFT;
153         }
154         sname--;
155       }
156       return 0;
157     }
158     i -= sname->prev;
159   } while (sname->prev > 0);
160   return 0;
161 }
162 
163 static mrb_sym
sym_intern(mrb_state * mrb,const char * name,size_t len,mrb_bool lit)164 sym_intern(mrb_state *mrb, const char *name, size_t len, mrb_bool lit)
165 {
166   mrb_sym sym;
167   symbol_name *sname;
168   uint8_t hash;
169 
170   sym_validate_len(mrb, len);
171   sym = find_symbol(mrb, name, len, &hash);
172   if (sym > 0) return sym;
173 
174   /* registering a new symbol */
175   sym = mrb->symidx + 1;
176   if (mrb->symcapa < sym) {
177     size_t symcapa = mrb->symcapa;
178     if (symcapa == 0) symcapa = 100;
179     else symcapa = (size_t)(symcapa * 6 / 5);
180     mrb->symtbl = (symbol_name*)mrb_realloc(mrb, mrb->symtbl, sizeof(symbol_name)*(symcapa+1));
181     mrb->symcapa = symcapa;
182   }
183   sname = &mrb->symtbl[sym];
184   sname->len = (uint16_t)len;
185   if (lit || mrb_ro_data_p(name)) {
186     sname->name = name;
187     sname->lit = TRUE;
188   }
189   else {
190     char *p = (char *)mrb_malloc(mrb, len+1);
191     memcpy(p, name, len);
192     p[len] = 0;
193     sname->name = (const char*)p;
194     sname->lit = FALSE;
195   }
196   if (mrb->symhash[hash]) {
197     mrb_sym i = sym - mrb->symhash[hash];
198     if (i > 0xff)
199       sname->prev = 0xff;
200     else
201       sname->prev = i;
202   }
203   else {
204     sname->prev = 0;
205   }
206   mrb->symhash[hash] = mrb->symidx = sym;
207 
208   return sym<<SYMBOL_NORMAL_SHIFT;
209 }
210 
211 MRB_API mrb_sym
mrb_intern(mrb_state * mrb,const char * name,size_t len)212 mrb_intern(mrb_state *mrb, const char *name, size_t len)
213 {
214   return sym_intern(mrb, name, len, FALSE);
215 }
216 
217 MRB_API mrb_sym
mrb_intern_static(mrb_state * mrb,const char * name,size_t len)218 mrb_intern_static(mrb_state *mrb, const char *name, size_t len)
219 {
220   return sym_intern(mrb, name, len, TRUE);
221 }
222 
223 MRB_API mrb_sym
mrb_intern_cstr(mrb_state * mrb,const char * name)224 mrb_intern_cstr(mrb_state *mrb, const char *name)
225 {
226   return mrb_intern(mrb, name, strlen(name));
227 }
228 
229 MRB_API mrb_sym
mrb_intern_str(mrb_state * mrb,mrb_value str)230 mrb_intern_str(mrb_state *mrb, mrb_value str)
231 {
232   return mrb_intern(mrb, RSTRING_PTR(str), RSTRING_LEN(str));
233 }
234 
235 MRB_API mrb_value
mrb_check_intern(mrb_state * mrb,const char * name,size_t len)236 mrb_check_intern(mrb_state *mrb, const char *name, size_t len)
237 {
238   mrb_sym sym;
239 
240   sym_validate_len(mrb, len);
241   sym = find_symbol(mrb, name, len, NULL);
242   if (sym > 0) return mrb_symbol_value(sym);
243   return mrb_nil_value();
244 }
245 
246 MRB_API mrb_value
mrb_check_intern_cstr(mrb_state * mrb,const char * name)247 mrb_check_intern_cstr(mrb_state *mrb, const char *name)
248 {
249   return mrb_check_intern(mrb, name, strlen(name));
250 }
251 
252 MRB_API mrb_value
mrb_check_intern_str(mrb_state * mrb,mrb_value str)253 mrb_check_intern_str(mrb_state *mrb, mrb_value str)
254 {
255   return mrb_check_intern(mrb, RSTRING_PTR(str), RSTRING_LEN(str));
256 }
257 
258 static const char*
sym2name_len(mrb_state * mrb,mrb_sym sym,char * buf,mrb_int * lenp)259 sym2name_len(mrb_state *mrb, mrb_sym sym, char *buf, mrb_int *lenp)
260 {
261   if (SYMBOL_INLINE_P(sym)) return sym_inline_unpack(sym, buf, lenp);
262 
263   sym >>= SYMBOL_NORMAL_SHIFT;
264   if (sym == 0 || mrb->symidx < sym) {
265     if (lenp) *lenp = 0;
266     return NULL;
267   }
268 
269   if (lenp) *lenp = mrb->symtbl[sym].len;
270   return mrb->symtbl[sym].name;
271 }
272 
273 MRB_API const char*
mrb_sym_name_len(mrb_state * mrb,mrb_sym sym,mrb_int * lenp)274 mrb_sym_name_len(mrb_state *mrb, mrb_sym sym, mrb_int *lenp)
275 {
276   return sym2name_len(mrb, sym, mrb->symbuf, lenp);
277 }
278 
279 void
mrb_free_symtbl(mrb_state * mrb)280 mrb_free_symtbl(mrb_state *mrb)
281 {
282   mrb_sym i, lim;
283 
284   for (i=1, lim=mrb->symidx+1; i<lim; i++) {
285     if (!mrb->symtbl[i].lit) {
286       mrb_free(mrb, (char*)mrb->symtbl[i].name);
287     }
288   }
289   mrb_free(mrb, mrb->symtbl);
290 }
291 
/* Initialization hook for the symbol table; intentionally does nothing
   here (sym_intern() allocates the table when the first entry is added). */
void
mrb_init_symtbl(mrb_state *mrb)
{
}
296 
297 /**********************************************************************
298  * Document-class: Symbol
299  *
300  *  <code>Symbol</code> objects represent names and some strings
301  *  inside the Ruby
 *  interpreter. They are generated using the <code>:name</code> and
 *  <code>:"string"</code> literal
 *  syntax, and by the various <code>to_sym</code> methods. The same
305  *  <code>Symbol</code> object will be created for a given name or string
306  *  for the duration of a program's execution, regardless of the context
307  *  or meaning of that name. Thus if <code>Fred</code> is a constant in
308  *  one context, a method in another, and a class in a third, the
309  *  <code>Symbol</code> <code>:Fred</code> will be the same object in
310  *  all three contexts.
311  *
312  *     module One
313  *       class Fred
314  *       end
315  *       $f1 = :Fred
316  *     end
317  *     module Two
318  *       Fred = 1
319  *       $f2 = :Fred
320  *     end
321  *     def Fred()
322  *     end
323  *     $f3 = :Fred
324  *     $f1.object_id   #=> 2514190
325  *     $f2.object_id   #=> 2514190
326  *     $f3.object_id   #=> 2514190
327  *
328  */
329 
330 /* 15.2.11.3.2  */
331 /* 15.2.11.3.3  */
332 /*
333  *  call-seq:
334  *     sym.id2name   -> string
335  *     sym.to_s      -> string
336  *
337  *  Returns the name or string corresponding to <i>sym</i>.
338  *
339  *     :fred.id2name   #=> "fred"
340  */
341 static mrb_value
sym_to_s(mrb_state * mrb,mrb_value sym)342 sym_to_s(mrb_state *mrb, mrb_value sym)
343 {
344   return mrb_sym_str(mrb, mrb_symbol(sym));
345 }
346 
/* 15.2.11.3.4  */
/*
 * call-seq:
 *   sym.to_sym   -> sym
 *   sym.intern   -> sym
 *
 * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding
 * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
 * in this case.
 */

/* `mrb` is unused; the receiver is returned unchanged. */
static mrb_value
sym_to_sym(mrb_state *mrb, mrb_value sym)
{
  return sym;
}
363 
364 /* 15.2.11.3.5(x)  */
365 /*
366  *  call-seq:
367  *     sym.inspect    -> string
368  *
369  *  Returns the representation of <i>sym</i> as a symbol literal.
370  *
371  *     :fred.inspect   #=> ":fred"
372  */
373 
/* SIGN_EXTEND_CHAR(c): value of `c` interpreted as a signed 8 bit quantity. */
#if __STDC__
# define SIGN_EXTEND_CHAR(c) ((signed char)(c))
#else  /* not __STDC__ */
/* As in Harbison and Steele.  */
# define SIGN_EXTEND_CHAR(c) ((((unsigned char)(c)) ^ 128) - 128)
#endif
/* is_identchar(c): true when `c` does not sign-extend to -1 and is an
   alphanumeric character or '_'. Note that `c` is evaluated more than once. */
#define is_identchar(c) (SIGN_EXTEND_CHAR(c)!=-1&&(ISALNUM(c) || (c) == '_'))
381 
382 static mrb_bool
is_special_global_name(const char * m)383 is_special_global_name(const char* m)
384 {
385   switch (*m) {
386     case '~': case '*': case '$': case '?': case '!': case '@':
387     case '/': case '\\': case ';': case ',': case '.': case '=':
388     case ':': case '<': case '>': case '\"':
389     case '&': case '`': case '\'': case '+':
390     case '0':
391       ++m;
392       break;
393     case '-':
394       ++m;
395       if (is_identchar(*m)) m += 1;
396       break;
397     default:
398       if (!ISDIGIT(*m)) return FALSE;
399       do ++m; while (ISDIGIT(*m));
400       break;
401   }
402   return !*m;
403 }
404 
/*
 * Check whether the NUL terminated string `name` has the form of a plain
 * symbol name. sym_inspect() and sym_name() use the result to decide
 * whether the name can be shown as is.
 *
 * Returns TRUE when the whole string is one of:
 *   - '$' followed by a special global name or an identifier
 *   - '@' or '@@' followed by an identifier
 *   - one of the operator names handled by the cases below
 *   - '[]' or '[]='
 *   - an identifier; when it does not start with an upper case letter it
 *     may end with one of '!', '?' or '='
 * Returns FALSE for NULL, for the empty string and for anything else.
 */
static mrb_bool
symname_p(const char *name)
{
  const char *m = name;
  /* TRUE only for identifiers entered through `default` that do not start
     with an upper case letter; enables the trailing ! ? = check */
  mrb_bool localid = FALSE;

  if (!m) return FALSE;
  switch (*m) {
    case '\0':
      return FALSE;

    case '$':
      /* special global, otherwise an ordinary identifier must follow */
      if (is_special_global_name(++m)) return TRUE;
      goto id;

    case '@':
      /* @ivar or @@cvar */
      if (*++m == '@') ++m;
      goto id;

    case '<':
      /* <  <<  <=  <=> */
      switch (*++m) {
        case '<': ++m; break;
        case '=': if (*++m == '>') ++m; break;
        default: break;
      }
      break;

    case '>':
      /* >  >>  >= */
      switch (*++m) {
        case '>': case '=': ++m; break;
        default: break;
      }
      break;

    case '=':
      /* =~  ==  ===  (a lone '=' is rejected) */
      switch (*++m) {
        case '~': ++m; break;
        case '=': if (*++m == '=') ++m; break;
        default: return FALSE;
      }
      break;

    case '*':
      /* *  ** */
      if (*++m == '*') ++m;
      break;
    case '!':
      /* !  !=  !~ */
      switch (*++m) {
        case '=': case '~': ++m;
      }
      break;
    case '+': case '-':
      /* +  -  +@  -@ */
      if (*++m == '@') ++m;
      break;
    case '|':
      /* |  || */
      if (*++m == '|') ++m;
      break;
    case '&':
      /* &  && */
      if (*++m == '&') ++m;
      break;

    case '^': case '/': case '%': case '~': case '`':
      /* single character operators */
      ++m;
      break;

    case '[':
      /* []  []= */
      if (*++m != ']') return FALSE;
      if (*++m == '=') ++m;
      break;

    default:
      localid = !ISUPPER(*m);
id:
      /* identifier: '_' or a letter, then identifier characters */
      if (*m != '_' && !ISALPHA(*m)) return FALSE;
      while (is_identchar(*m)) m += 1;
      if (localid) {
        switch (*m) {
          /* optional suffix; falls through to `default` on purpose */
          case '!': case '?': case '=': ++m;
          default: break;
        }
      }
      break;
  }
  /* valid only when the whole string has been consumed */
  return *m ? FALSE : TRUE;
}
489 
490 static mrb_value
sym_inspect(mrb_state * mrb,mrb_value sym)491 sym_inspect(mrb_state *mrb, mrb_value sym)
492 {
493   mrb_value str;
494   const char *name;
495   mrb_int len;
496   mrb_sym id = mrb_symbol(sym);
497   char *sp;
498 
499   name = mrb_sym_name_len(mrb, id, &len);
500   str = mrb_str_new(mrb, 0, len+1);
501   sp = RSTRING_PTR(str);
502   sp[0] = ':';
503   memcpy(sp+1, name, len);
504   mrb_assert_int_fit(mrb_int, len, size_t, SIZE_MAX);
505   if (!symname_p(name) || strlen(name) != (size_t)len) {
506     str = mrb_str_inspect(mrb, str);
507     sp = RSTRING_PTR(str);
508     sp[0] = ':';
509     sp[1] = '"';
510   }
511 #ifdef MRB_UTF8_STRING
512   if (SYMBOL_INLINE_P(id)) RSTR_SET_ASCII_FLAG(mrb_str_ptr(str));
513 #endif
514   return str;
515 }
516 
517 MRB_API mrb_value
mrb_sym_str(mrb_state * mrb,mrb_sym sym)518 mrb_sym_str(mrb_state *mrb, mrb_sym sym)
519 {
520   mrb_int len;
521   const char *name = mrb_sym_name_len(mrb, sym, &len);
522 
523   if (!name) return mrb_undef_value(); /* can't happen */
524   if (SYMBOL_INLINE_P(sym)) {
525     mrb_value str = mrb_str_new(mrb, name, len);
526     RSTR_SET_ASCII_FLAG(mrb_str_ptr(str));
527     return str;
528   }
529   return mrb_str_new_static(mrb, name, len);
530 }
531 
532 static const char*
sym_name(mrb_state * mrb,mrb_sym sym,mrb_bool dump)533 sym_name(mrb_state *mrb, mrb_sym sym, mrb_bool dump)
534 {
535   mrb_int len;
536   const char *name = mrb_sym_name_len(mrb, sym, &len);
537 
538   if (!name) return NULL;
539   if (strlen(name) == (size_t)len && (!dump || symname_p(name))) {
540     return name;
541   }
542   else {
543     mrb_value str = SYMBOL_INLINE_P(sym) ?
544       mrb_str_new(mrb, name, len) : mrb_str_new_static(mrb, name, len);
545     str = mrb_str_dump(mrb, str);
546     return RSTRING_PTR(str);
547   }
548 }
549 
550 MRB_API const char*
mrb_sym_name(mrb_state * mrb,mrb_sym sym)551 mrb_sym_name(mrb_state *mrb, mrb_sym sym)
552 {
553   return sym_name(mrb, sym, FALSE);
554 }
555 
556 MRB_API const char*
mrb_sym_dump(mrb_state * mrb,mrb_sym sym)557 mrb_sym_dump(mrb_state *mrb, mrb_sym sym)
558 {
559   return sym_name(mrb, sym, TRUE);
560 }
561 
562 #define lesser(a,b) (((a)>(b))?(b):(a))
563 
564 static mrb_value
sym_cmp(mrb_state * mrb,mrb_value s1)565 sym_cmp(mrb_state *mrb, mrb_value s1)
566 {
567   mrb_value s2 = mrb_get_arg1(mrb);
568   mrb_sym sym1, sym2;
569 
570   if (!mrb_symbol_p(s2)) return mrb_nil_value();
571   sym1 = mrb_symbol(s1);
572   sym2 = mrb_symbol(s2);
573   if (sym1 == sym2) return mrb_fixnum_value(0);
574   else {
575     const char *p1, *p2;
576     int retval;
577     mrb_int len, len1, len2;
578     char buf1[8], buf2[8];
579 
580     p1 = sym2name_len(mrb, sym1, buf1, &len1);
581     p2 = sym2name_len(mrb, sym2, buf2, &len2);
582     len = lesser(len1, len2);
583     retval = memcmp(p1, p2, len);
584     if (retval == 0) {
585       if (len1 == len2) return mrb_fixnum_value(0);
586       if (len1 > len2)  return mrb_fixnum_value(1);
587       return mrb_fixnum_value(-1);
588     }
589     if (retval > 0) return mrb_fixnum_value(1);
590     return mrb_fixnum_value(-1);
591   }
592 }
593 
594 void
mrb_init_symbol(mrb_state * mrb)595 mrb_init_symbol(mrb_state *mrb)
596 {
597   struct RClass *sym;
598 
599   mrb->symbol_class = sym = mrb_define_class(mrb, "Symbol", mrb->object_class);  /* 15.2.11 */
600   MRB_SET_INSTANCE_TT(sym, MRB_TT_SYMBOL);
601   mrb_undef_class_method(mrb,  sym, "new");
602 
603   mrb_define_method(mrb, sym, "id2name", sym_to_s,    MRB_ARGS_NONE());          /* 15.2.11.3.2 */
604   mrb_define_method(mrb, sym, "to_s",    sym_to_s,    MRB_ARGS_NONE());          /* 15.2.11.3.3 */
605   mrb_define_method(mrb, sym, "to_sym",  sym_to_sym,  MRB_ARGS_NONE());          /* 15.2.11.3.4 */
606   mrb_define_method(mrb, sym, "inspect", sym_inspect, MRB_ARGS_NONE());          /* 15.2.11.3.5(x) */
607   mrb_define_method(mrb, sym, "<=>",     sym_cmp,     MRB_ARGS_REQ(1));
608 }
609