1 /*
2 ** symbol.c - Symbol class
3 **
4 ** See Copyright Notice in mruby.h
5 */
6 
7 #include <limits.h>
8 #include <string.h>
9 #include <mruby.h>
10 #include <mruby/khash.h>
11 #include <mruby/string.h>
12 #include <mruby/dump.h>
13 #include <mruby/class.h>
14 
15 /* ------------------------------------------------------ */
/*
 * One entry of the symbol table (mrb->symtbl), addressed by symbol index.
 * Entries that share a hash value are chained backwards through `prev`.
 */
typedef struct symbol_name {
  mrb_bool lit : 1;    /* TRUE: `name` is borrowed and never freed; FALSE: `name` was allocated by sym_intern */
  uint8_t prev;        /* distance back to the previous entry with the same hash; 0 ends the chain, 0xff means "0xff or more" */
  uint16_t len;        /* length of `name` in bytes */
  const char *name;    /* symbol text (`len` bytes) */
} symbol_name;
22 
23 static void
sym_validate_len(mrb_state * mrb,size_t len)24 sym_validate_len(mrb_state *mrb, size_t len)
25 {
26   if (len >= RITE_LV_NULL_MARK) {
27     mrb_raise(mrb, E_ARGUMENT_ERROR, "symbol length too long");
28   }
29 }
30 
31 #ifndef MRB_ENABLE_ALL_SYMBOLS
/*
 * Alphabet of inline (packed) symbols.  A character is encoded as its index
 * in this table plus one, so that code 0 can mean "no more characters":
 * '_' and lowercase letters get codes 1..27, uppercase letters and digits
 * get codes 28..63.
 */
static const char pack_table[] = "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
33 
34 static mrb_sym
sym_inline_pack(const char * name,uint16_t len)35 sym_inline_pack(const char *name, uint16_t len)
36 {
37   char c;
38   const char *p;
39   int i;
40   mrb_sym sym = 0;
41   int lower = 1;
42 
43   if (len > 6) return 0;        /* too long */
44   for (i=0; i<len; i++) {
45     uint32_t bits;
46 
47     c = name[i];
48     if (c == 0) return 0;       /* NUL in name */
49     p = strchr(pack_table, (int)c);
50     if (p == 0) return 0;       /* non alnum char */
51     bits = (uint32_t)(p - pack_table)+1;
52     if (bits > 27) lower = 0;
53     sym |= bits<<(i*6+2);
54   }
55   if (lower) {
56     sym = 0;
57     for (i=0; i<len; i++) {
58       uint32_t bits;
59 
60       c = name[i];
61       p = strchr(pack_table, (int)c);
62       bits = (uint32_t)(p - pack_table)+1;
63       sym |= bits<<(i*5+2);
64     }
65     return sym | 3;
66   }
67   if (len == 6) return 0;
68   return sym | 1;
69 }
70 
71 static const char*
sym_inline_unpack_with_bit(mrb_sym sym,char * buf,int bit_per_char)72 sym_inline_unpack_with_bit(mrb_sym sym, char *buf, int bit_per_char)
73 {
74   int i;
75 
76   for (i=0; i<30/bit_per_char; i++) {
77     uint32_t bits;
78     char c;
79 
80     bits = sym>>(i*bit_per_char+2) & (1<<bit_per_char)-1;
81     if (bits == 0) break;
82     c = pack_table[bits-1];
83     buf[i] = c;
84   }
85   buf[i] = '\0';
86   return buf;
87 }
88 
89 static const char*
sym_inline_unpack(mrb_sym sym,char * buf)90 sym_inline_unpack(mrb_sym sym, char *buf)
91 {
92   mrb_assert(sym&1);
93 
94   if (sym&2) {                  /* all lower case (5bits/char) */
95     return sym_inline_unpack_with_bit(sym, buf, 5);
96   }
97   return sym_inline_unpack_with_bit(sym, buf, 6);
98 }
99 #endif
100 
/*
 * Hash the first `len` bytes of `key` down to an 8-bit bucket number.
 * The mixing steps are those of the one-at-a-time hash; only the low
 * 8 bits of the 32-bit result are returned.
 */
uint8_t
symhash(const char *key, size_t len)
{
  uint32_t acc = 0;
  const char *p = key;
  size_t left = len;

  while (left-- > 0) {
    acc += *p++;
    acc += acc << 10;
    acc ^= acc >> 6;
  }
  /* final avalanche */
  acc += acc << 3;
  acc ^= acc >> 11;
  acc += acc << 15;
  return acc & 0xff;
}
116 
/*
 * Look up an existing symbol for `name` (`len` bytes) without creating one.
 *
 * `hash` must be symhash(name, len).  Returns the symbol value, or 0 when
 * no symbol with that name has been registered.  Unless
 * MRB_ENABLE_ALL_SYMBOLS is defined, names that can be packed inline are
 * answered directly without consulting the table.
 */
static mrb_sym
find_symbol(mrb_state *mrb, const char *name, uint16_t len, uint8_t hash)
{
  mrb_sym i;
  symbol_name *sname;

#ifndef MRB_ENABLE_ALL_SYMBOLS
  /* inline symbol */
  i = sym_inline_pack(name, len);
  if (i > 0) return i;
#endif

  /* symhash[hash] holds the index of the newest entry with this hash */
  i = mrb->symhash[hash];
  if (i == 0) return 0;         /* empty bucket */
  do {
    sname = &mrb->symtbl[i];
    if (sname->len == len && memcmp(sname->name, name, len) == 0) {
      return i<<1;              /* table symbols are (index << 1) */
    }
    if (sname->prev == 0xff) {
      /* The previous entry of this chain is 0xff or more slots back and its
         exact position was not recorded: compare every entry from there
         down to index 1 instead of following the chain. */
      i -= 0xff;
      sname = &mrb->symtbl[i];
      while (mrb->symtbl < sname) {
        if (sname->len == len && memcmp(sname->name, name, len) == 0) {
          return (mrb_sym)(sname - mrb->symtbl)<<1;
        }
        sname--;
      }
      return 0;
    }
    /* step back to the previous entry with the same hash */
    i -= sname->prev;
  } while (sname->prev > 0);    /* prev == 0 marks the end of the chain */
  return 0;
}
151 
152 static mrb_sym
sym_intern(mrb_state * mrb,const char * name,size_t len,mrb_bool lit)153 sym_intern(mrb_state *mrb, const char *name, size_t len, mrb_bool lit)
154 {
155   mrb_sym sym;
156   symbol_name *sname;
157   uint8_t hash;
158 
159   sym_validate_len(mrb, len);
160   hash = symhash(name, len);
161   sym = find_symbol(mrb, name, len, hash);
162   if (sym > 0) return sym;
163 
164   /* registering a new symbol */
165   sym = ++mrb->symidx;
166   if (mrb->symcapa < sym) {
167     if (mrb->symcapa == 0) mrb->symcapa = 100;
168     else mrb->symcapa = (size_t)(mrb->symcapa * 6 / 5);
169     mrb->symtbl = (symbol_name*)mrb_realloc(mrb, mrb->symtbl, sizeof(symbol_name)*(mrb->symcapa+1));
170   }
171   sname = &mrb->symtbl[sym];
172   sname->len = (uint16_t)len;
173   if (lit || mrb_ro_data_p(name)) {
174     sname->name = name;
175     sname->lit = TRUE;
176   }
177   else {
178     char *p = (char *)mrb_malloc(mrb, len+1);
179     memcpy(p, name, len);
180     p[len] = 0;
181     sname->name = (const char*)p;
182     sname->lit = FALSE;
183   }
184   if (mrb->symhash[hash]) {
185     mrb_sym i = sym - mrb->symhash[hash];
186     if (i > 0xff)
187       sname->prev = 0xff;
188     else
189       sname->prev = i;
190   }
191   else {
192     sname->prev = 0;
193   }
194   mrb->symhash[hash] = sym;
195 
196   return sym<<1;
197 }
198 
199 MRB_API mrb_sym
mrb_intern(mrb_state * mrb,const char * name,size_t len)200 mrb_intern(mrb_state *mrb, const char *name, size_t len)
201 {
202   return sym_intern(mrb, name, len, FALSE);
203 }
204 
205 MRB_API mrb_sym
mrb_intern_static(mrb_state * mrb,const char * name,size_t len)206 mrb_intern_static(mrb_state *mrb, const char *name, size_t len)
207 {
208   return sym_intern(mrb, name, len, TRUE);
209 }
210 
211 MRB_API mrb_sym
mrb_intern_cstr(mrb_state * mrb,const char * name)212 mrb_intern_cstr(mrb_state *mrb, const char *name)
213 {
214   return mrb_intern(mrb, name, strlen(name));
215 }
216 
217 MRB_API mrb_sym
mrb_intern_str(mrb_state * mrb,mrb_value str)218 mrb_intern_str(mrb_state *mrb, mrb_value str)
219 {
220   return mrb_intern(mrb, RSTRING_PTR(str), RSTRING_LEN(str));
221 }
222 
223 MRB_API mrb_value
mrb_check_intern(mrb_state * mrb,const char * name,size_t len)224 mrb_check_intern(mrb_state *mrb, const char *name, size_t len)
225 {
226   mrb_sym sym;
227 
228   sym_validate_len(mrb, len);
229   sym = find_symbol(mrb, name, len, symhash(name, len));
230   if (sym > 0) return mrb_symbol_value(sym);
231   return mrb_nil_value();
232 }
233 
234 MRB_API mrb_value
mrb_check_intern_cstr(mrb_state * mrb,const char * name)235 mrb_check_intern_cstr(mrb_state *mrb, const char *name)
236 {
237   return mrb_check_intern(mrb, name, strlen(name));
238 }
239 
240 MRB_API mrb_value
mrb_check_intern_str(mrb_state * mrb,mrb_value str)241 mrb_check_intern_str(mrb_state *mrb, mrb_value str)
242 {
243   return mrb_check_intern(mrb, RSTRING_PTR(str), RSTRING_LEN(str));
244 }
245 
246 MRB_API const char*
mrb_sym2name_len(mrb_state * mrb,mrb_sym sym,mrb_int * lenp)247 mrb_sym2name_len(mrb_state *mrb, mrb_sym sym, mrb_int *lenp)
248 {
249 #ifndef MRB_ENABLE_ALL_SYMBOLS
250   if (sym & 1) {                /* inline packed symbol */
251     sym_inline_unpack(sym, mrb->symbuf);
252     if (lenp) *lenp = strlen(mrb->symbuf);
253     return mrb->symbuf;
254   }
255 #endif
256 
257   sym >>= 1;
258   if (sym == 0 || mrb->symidx < sym) {
259     if (lenp) *lenp = 0;
260     return NULL;
261   }
262 
263   if (lenp) *lenp = mrb->symtbl[sym].len;
264   return mrb->symtbl[sym].name;
265 }
266 
267 void
mrb_free_symtbl(mrb_state * mrb)268 mrb_free_symtbl(mrb_state *mrb)
269 {
270   mrb_sym i, lim;
271 
272   for (i=1, lim=mrb->symidx+1; i<lim; i++) {
273     if (!mrb->symtbl[i].lit) {
274       mrb_free(mrb, (char*)mrb->symtbl[i].name);
275     }
276   }
277   mrb_free(mrb, mrb->symtbl);
278 }
279 
/*
 * Symbol table initialisation hook.  Intentionally empty: sym_intern
 * allocates the table when the first symbol is registered.
 */
void
mrb_init_symtbl(mrb_state *mrb)
{
}
284 
285 /**********************************************************************
286  * Document-class: Symbol
287  *
288  *  <code>Symbol</code> objects represent names and some strings
289  *  inside the Ruby
 *  interpreter. They are generated using the <code>:name</code> and
 *  <code>:"string"</code> literal
 *  syntax, and by the various <code>to_sym</code> methods. The same
293  *  <code>Symbol</code> object will be created for a given name or string
294  *  for the duration of a program's execution, regardless of the context
295  *  or meaning of that name. Thus if <code>Fred</code> is a constant in
296  *  one context, a method in another, and a class in a third, the
297  *  <code>Symbol</code> <code>:Fred</code> will be the same object in
298  *  all three contexts.
299  *
300  *     module One
301  *       class Fred
302  *       end
303  *       $f1 = :Fred
304  *     end
305  *     module Two
306  *       Fred = 1
307  *       $f2 = :Fred
308  *     end
309  *     def Fred()
310  *     end
311  *     $f3 = :Fred
312  *     $f1.object_id   #=> 2514190
313  *     $f2.object_id   #=> 2514190
314  *     $f3.object_id   #=> 2514190
315  *
316  */
317 
318 
319 /* 15.2.11.3.1  */
320 /*
321  *  call-seq:
322  *     sym == obj   -> true or false
323  *
324  *  Equality---If <i>sym</i> and <i>obj</i> are exactly the same
325  *  symbol, returns <code>true</code>.
326  */
327 
328 static mrb_value
sym_equal(mrb_state * mrb,mrb_value sym1)329 sym_equal(mrb_state *mrb, mrb_value sym1)
330 {
331   mrb_value sym2;
332 
333   mrb_get_args(mrb, "o", &sym2);
334 
335   return mrb_bool_value(mrb_obj_equal(mrb, sym1, sym2));
336 }
337 
338 /* 15.2.11.3.2  */
339 /* 15.2.11.3.3  */
340 /*
341  *  call-seq:
342  *     sym.id2name   -> string
343  *     sym.to_s      -> string
344  *
345  *  Returns the name or string corresponding to <i>sym</i>.
346  *
347  *     :fred.id2name   #=> "fred"
348  */
349 static mrb_value
mrb_sym_to_s(mrb_state * mrb,mrb_value sym)350 mrb_sym_to_s(mrb_state *mrb, mrb_value sym)
351 {
352   mrb_sym id = mrb_symbol(sym);
353   const char *p;
354   mrb_int len;
355 
356   p = mrb_sym2name_len(mrb, id, &len);
357   return mrb_str_new_static(mrb, p, len);
358 }
359 
360 /* 15.2.11.3.4  */
361 /*
362  * call-seq:
363  *   sym.to_sym   -> sym
364  *   sym.intern   -> sym
365  *
366  * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding
367  * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
368  * in this case.
369  */
370 
static mrb_value
sym_to_sym(mrb_state *mrb, mrb_value sym)
{
  /* the receiver already is a Symbol: hand it back unchanged */
  return sym;
}
376 
377 /* 15.2.11.3.5(x)  */
378 /*
379  *  call-seq:
380  *     sym.inspect    -> string
381  *
382  *  Returns the representation of <i>sym</i> as a symbol literal.
383  *
384  *     :fred.inspect   #=> ":fred"
385  */
386 
/* SIGN_EXTEND_CHAR(c): `c` viewed as a signed 8-bit value. */
#if __STDC__
# define SIGN_EXTEND_CHAR(c) ((signed char)(c))
#else  /* not __STDC__ */
/* As in Harbison and Steele.  */
# define SIGN_EXTEND_CHAR(c) ((((unsigned char)(c)) ^ 128) - 128)
#endif
/* is_identchar(c): true for alphanumerics and '_'; a byte that sign-extends
   to -1 is always rejected.  NOTE: evaluates `c` more than once. */
#define is_identchar(c) (SIGN_EXTEND_CHAR(c)!=-1&&(ISALNUM(c) || (c) == '_'))
394 
395 static mrb_bool
is_special_global_name(const char * m)396 is_special_global_name(const char* m)
397 {
398   switch (*m) {
399     case '~': case '*': case '$': case '?': case '!': case '@':
400     case '/': case '\\': case ';': case ',': case '.': case '=':
401     case ':': case '<': case '>': case '\"':
402     case '&': case '`': case '\'': case '+':
403     case '0':
404       ++m;
405       break;
406     case '-':
407       ++m;
408       if (is_identchar(*m)) m += 1;
409       break;
410     default:
411       if (!ISDIGIT(*m)) return FALSE;
412       do ++m; while (ISDIGIT(*m));
413       break;
414   }
415   return !*m;
416 }
417 
/*
 * Check whether the NUL-terminated string `name` can be written as a plain
 * symbol literal (:name) without quoting.
 *
 * Accepted forms: global variables ($id and the special globals), instance
 * and class variables (@id, @@id), the operator names handled by the cases
 * below, and identifiers.  Identifiers that do not start with an uppercase
 * letter may end in '!', '?' or '='.  Returns FALSE for NULL and for the
 * empty string.
 */
static mrb_bool
symname_p(const char *name)
{
  const char *m = name;
  mrb_bool localid = FALSE;     /* TRUE when a trailing '!', '?' or '=' is allowed */

  if (!m) return FALSE;
  switch (*m) {
    case '\0':
      return FALSE;

    case '$':
      if (is_special_global_name(++m)) return TRUE;
      goto id;

    case '@':
      if (*++m == '@') ++m;
      goto id;

    case '<':                   /* <  <<  <=  <=> */
      switch (*++m) {
        case '<': ++m; break;
        case '=': if (*++m == '>') ++m; break;
        default: break;
      }
      break;

    case '>':                   /* >  >>  >= */
      switch (*++m) {
        case '>': case '=': ++m; break;
        default: break;
      }
      break;

    case '=':                   /* =~  ==  ===  (a lone '=' is rejected) */
      switch (*++m) {
        case '~': ++m; break;
        case '=': if (*++m == '=') ++m; break;
        default: return FALSE;
      }
      break;

    case '*':                   /* *  ** */
      if (*++m == '*') ++m;
      break;
    case '!':                   /* !  !=  !~ */
      switch (*++m) {
        case '=': case '~': ++m;
      }
      break;
    case '+': case '-':         /* +  -  +@  -@ */
      if (*++m == '@') ++m;
      break;
    case '|':                   /* |  || */
      if (*++m == '|') ++m;
      break;
    case '&':                   /* &  && */
      if (*++m == '&') ++m;
      break;

    case '^': case '/': case '%': case '~': case '`':
      ++m;
      break;

    case '[':                   /* []  []= */
      if (*++m != ']') return FALSE;
      if (*++m == '=') ++m;
      break;

    default:
      localid = !ISUPPER(*m);
id:
      /* also reached from the '$' and '@' cases, with localid still FALSE */
      if (*m != '_' && !ISALPHA(*m)) return FALSE;
      while (is_identchar(*m)) m += 1;
      if (localid) {
        switch (*m) {
          case '!': case '?': case '=': ++m;
          default: break;
        }
      }
      break;
  }
  /* valid only when the whole string has been consumed */
  return *m ? FALSE : TRUE;
}
502 
503 static mrb_value
sym_inspect(mrb_state * mrb,mrb_value sym)504 sym_inspect(mrb_state *mrb, mrb_value sym)
505 {
506   mrb_value str;
507   const char *name;
508   mrb_int len;
509   mrb_sym id = mrb_symbol(sym);
510   char *sp;
511 
512   name = mrb_sym2name_len(mrb, id, &len);
513   str = mrb_str_new(mrb, 0, len+1);
514   sp = RSTRING_PTR(str);
515   RSTRING_PTR(str)[0] = ':';
516   memcpy(sp+1, name, len);
517   mrb_assert_int_fit(mrb_int, len, size_t, SIZE_MAX);
518   if (!symname_p(name) || strlen(name) != (size_t)len) {
519     str = mrb_str_dump(mrb, str);
520     sp = RSTRING_PTR(str);
521     sp[0] = ':';
522     sp[1] = '"';
523   }
524   return str;
525 }
526 
527 MRB_API mrb_value
mrb_sym2str(mrb_state * mrb,mrb_sym sym)528 mrb_sym2str(mrb_state *mrb, mrb_sym sym)
529 {
530   mrb_int len;
531   const char *name = mrb_sym2name_len(mrb, sym, &len);
532 
533   if (!name) return mrb_undef_value(); /* can't happen */
534   if (sym&1) {                         /* inline symbol */
535     return mrb_str_new(mrb, name, len);
536   }
537   return mrb_str_new_static(mrb, name, len);
538 }
539 
540 MRB_API const char*
mrb_sym2name(mrb_state * mrb,mrb_sym sym)541 mrb_sym2name(mrb_state *mrb, mrb_sym sym)
542 {
543   mrb_int len;
544   const char *name = mrb_sym2name_len(mrb, sym, &len);
545 
546   if (!name) return NULL;
547   if (symname_p(name) && strlen(name) == (size_t)len) {
548     return name;
549   }
550   else {
551     mrb_value str = mrb_str_dump(mrb, mrb_str_new_static(mrb, name, len));
552     return RSTRING_PTR(str);
553   }
554 }
555 
556 #define lesser(a,b) (((a)>(b))?(b):(a))
557 
558 static mrb_value
sym_cmp(mrb_state * mrb,mrb_value s1)559 sym_cmp(mrb_state *mrb, mrb_value s1)
560 {
561   mrb_value s2;
562   mrb_sym sym1, sym2;
563 
564   mrb_get_args(mrb, "o", &s2);
565   if (mrb_type(s2) != MRB_TT_SYMBOL) return mrb_nil_value();
566   sym1 = mrb_symbol(s1);
567   sym2 = mrb_symbol(s2);
568   if (sym1 == sym2) return mrb_fixnum_value(0);
569   else {
570     const char *p1, *p2;
571     int retval;
572     mrb_int len, len1, len2;
573 
574     p1 = mrb_sym2name_len(mrb, sym1, &len1);
575     p2 = mrb_sym2name_len(mrb, sym2, &len2);
576     len = lesser(len1, len2);
577     retval = memcmp(p1, p2, len);
578     if (retval == 0) {
579       if (len1 == len2) return mrb_fixnum_value(0);
580       if (len1 > len2)  return mrb_fixnum_value(1);
581       return mrb_fixnum_value(-1);
582     }
583     if (retval > 0) return mrb_fixnum_value(1);
584     return mrb_fixnum_value(-1);
585   }
586 }
587 
588 void
mrb_init_symbol(mrb_state * mrb)589 mrb_init_symbol(mrb_state *mrb)
590 {
591   struct RClass *sym;
592 
593   mrb->symbol_class = sym = mrb_define_class(mrb, "Symbol", mrb->object_class);                 /* 15.2.11 */
594   MRB_SET_INSTANCE_TT(sym, MRB_TT_SYMBOL);
595   mrb_undef_class_method(mrb,  sym, "new");
596 
597   mrb_define_method(mrb, sym, "===",             sym_equal,      MRB_ARGS_REQ(1));              /* 15.2.11.3.1  */
598   mrb_define_method(mrb, sym, "id2name",         mrb_sym_to_s,   MRB_ARGS_NONE());              /* 15.2.11.3.2  */
599   mrb_define_method(mrb, sym, "to_s",            mrb_sym_to_s,   MRB_ARGS_NONE());              /* 15.2.11.3.3  */
600   mrb_define_method(mrb, sym, "to_sym",          sym_to_sym,     MRB_ARGS_NONE());              /* 15.2.11.3.4  */
601   mrb_define_method(mrb, sym, "inspect",         sym_inspect,    MRB_ARGS_NONE());              /* 15.2.11.3.5(x)  */
602   mrb_define_method(mrb, sym, "<=>",             sym_cmp,        MRB_ARGS_REQ(1));
603 }
604