1 /*
2 ** symbol.c - Symbol class
3 **
4 ** See Copyright Notice in mruby.h
5 */
6
7 #include <limits.h>
8 #include <string.h>
9 #include <mruby.h>
10 #include <mruby/khash.h>
11 #include <mruby/string.h>
12 #include <mruby/dump.h>
13 #include <mruby/class.h>
14
15 /* ------------------------------------------------------ */
/*
 * One slot of the symbol table (mrb->symtbl); describes a symbol whose
 * name is not packed inline into the symbol ID.
 */
typedef struct symbol_name {
  mrb_bool lit : 1;   /* TRUE: `name` is stored by pointer (not heap-copied) and is not freed */
  uint8_t prev;       /* distance back to the previous slot with the same hash;
                         0 ends the chain, 0xff means "too far, fall back to a linear scan" */
  uint16_t len;       /* length of `name` in bytes */
  const char *name;   /* symbol name bytes */
} symbol_name;
22
/*
 * Symbol ID layout:
 *   bit 0 (SYMBOL_INLINE)       - set when the name is packed into the ID itself
 *   bit 1 (SYMBOL_INLINE_LOWER) - set for inline symbols packed with 5 bits per character
 * Table-backed symbols hold their table index shifted left by SYMBOL_NORMAL_SHIFT;
 * inline symbols hold their packed characters starting at bit SYMBOL_INLINE_SHIFT.
 */
#define SYMBOL_INLINE_BIT_POS       1
#define SYMBOL_INLINE_LOWER_BIT_POS 2
#define SYMBOL_INLINE               (1 << (SYMBOL_INLINE_BIT_POS - 1))
#define SYMBOL_INLINE_LOWER         (1 << (SYMBOL_INLINE_LOWER_BIT_POS - 1))
#define SYMBOL_NORMAL_SHIFT         SYMBOL_INLINE_BIT_POS
#define SYMBOL_INLINE_SHIFT         SYMBOL_INLINE_LOWER_BIT_POS
#ifdef MRB_ENABLE_ALL_SYMBOLS
/* inline packing is compiled out: the predicates are always FALSE and the
   pack/unpack helpers collapse to constants */
# define SYMBOL_INLINE_P(sym) FALSE
# define SYMBOL_INLINE_LOWER_P(sym) FALSE
# define sym_inline_pack(name, len) 0
# define sym_inline_unpack(sym, buf, lenp) NULL
#else
# define SYMBOL_INLINE_P(sym) ((sym) & SYMBOL_INLINE)
# define SYMBOL_INLINE_LOWER_P(sym) ((sym) & SYMBOL_INLINE_LOWER)
#endif
38
39 static void
sym_validate_len(mrb_state * mrb,size_t len)40 sym_validate_len(mrb_state *mrb, size_t len)
41 {
42 if (len >= RITE_LV_NULL_MARK) {
43 mrb_raise(mrb, E_ARGUMENT_ERROR, "symbol length too long");
44 }
45 }
46
47 #ifndef MRB_ENABLE_ALL_SYMBOLS
/*
 * Alphabet of inline symbols.  A character is encoded as its index in this
 * table plus one (code 0 terminates the packed name).  '_' and 'a'..'z' get
 * codes 1..27, which is what allows the 5-bit "lower" packing.
 */
static const char pack_table[] = "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
49
50 static mrb_sym
sym_inline_pack(const char * name,size_t len)51 sym_inline_pack(const char *name, size_t len)
52 {
53 const size_t lower_length_max = (MRB_SYMBOL_BIT - 2) / 5;
54 const size_t mix_length_max = (MRB_SYMBOL_BIT - 2) / 6;
55
56 char c;
57 const char *p;
58 size_t i;
59 mrb_sym sym = 0;
60 mrb_bool lower = TRUE;
61
62 if (len > lower_length_max) return 0; /* too long */
63 for (i=0; i<len; i++) {
64 uint32_t bits;
65
66 c = name[i];
67 if (c == 0) return 0; /* NUL in name */
68 p = strchr(pack_table, (int)c);
69 if (p == 0) return 0; /* non alnum char */
70 bits = (uint32_t)(p - pack_table)+1;
71 if (bits > 27) lower = FALSE;
72 if (i >= mix_length_max) break;
73 sym |= bits<<(i*6+SYMBOL_INLINE_SHIFT);
74 }
75 if (lower) {
76 sym = 0;
77 for (i=0; i<len; i++) {
78 uint32_t bits;
79
80 c = name[i];
81 p = strchr(pack_table, (int)c);
82 bits = (uint32_t)(p - pack_table)+1;
83 sym |= bits<<(i*5+SYMBOL_INLINE_SHIFT);
84 }
85 return sym | SYMBOL_INLINE | SYMBOL_INLINE_LOWER;
86 }
87 if (len > mix_length_max) return 0;
88 return sym | SYMBOL_INLINE;
89 }
90
91 static const char*
sym_inline_unpack(mrb_sym sym,char * buf,mrb_int * lenp)92 sym_inline_unpack(mrb_sym sym, char *buf, mrb_int *lenp)
93 {
94 int bit_per_char = SYMBOL_INLINE_LOWER_P(sym) ? 5 : 6;
95 int i;
96
97 mrb_assert(SYMBOL_INLINE_P(sym));
98
99 for (i=0; i<30/bit_per_char; i++) {
100 uint32_t bits = sym>>(i*bit_per_char+SYMBOL_INLINE_SHIFT) & ((1<<bit_per_char)-1);
101 if (bits == 0) break;
102 buf[i] = pack_table[bits-1];;
103 }
104 buf[i] = '\0';
105 if (lenp) *lenp = i;
106 return buf;
107 }
108 #endif
109
/*
 * Hash `len` bytes of `key` down to an 8-bit bucket number using the
 * Jenkins one-at-a-time mixing steps.
 *
 * NOTE: bytes are added as plain `char`, so bytes >= 0x80 contribute
 * according to the platform's char signedness.
 */
static uint8_t
symhash(const char *key, size_t len)
{
  uint32_t acc = 0;
  uint32_t pos;

  /* mix in one byte at a time */
  for (pos = 0; pos < len; pos++) {
    acc += key[pos];
    acc += acc << 10;
    acc ^= acc >> 6;
  }

  /* final avalanche */
  acc += acc << 3;
  acc ^= acc >> 11;
  acc += acc << 15;

  return (uint8_t)(acc & 0xff);
}
125
126 static mrb_sym
find_symbol(mrb_state * mrb,const char * name,size_t len,uint8_t * hashp)127 find_symbol(mrb_state *mrb, const char *name, size_t len, uint8_t *hashp)
128 {
129 mrb_sym i;
130 symbol_name *sname;
131 uint8_t hash;
132
133 /* inline symbol */
134 i = sym_inline_pack(name, len);
135 if (i > 0) return i;
136
137 hash = symhash(name, len);
138 if (hashp) *hashp = hash;
139
140 i = mrb->symhash[hash];
141 if (i == 0) return 0;
142 do {
143 sname = &mrb->symtbl[i];
144 if (sname->len == len && memcmp(sname->name, name, len) == 0) {
145 return i<<SYMBOL_NORMAL_SHIFT;
146 }
147 if (sname->prev == 0xff) {
148 i -= 0xff;
149 sname = &mrb->symtbl[i];
150 while (mrb->symtbl < sname) {
151 if (sname->len == len && memcmp(sname->name, name, len) == 0) {
152 return (mrb_sym)(sname - mrb->symtbl)<<SYMBOL_NORMAL_SHIFT;
153 }
154 sname--;
155 }
156 return 0;
157 }
158 i -= sname->prev;
159 } while (sname->prev > 0);
160 return 0;
161 }
162
163 static mrb_sym
sym_intern(mrb_state * mrb,const char * name,size_t len,mrb_bool lit)164 sym_intern(mrb_state *mrb, const char *name, size_t len, mrb_bool lit)
165 {
166 mrb_sym sym;
167 symbol_name *sname;
168 uint8_t hash;
169
170 sym_validate_len(mrb, len);
171 sym = find_symbol(mrb, name, len, &hash);
172 if (sym > 0) return sym;
173
174 /* registering a new symbol */
175 sym = mrb->symidx + 1;
176 if (mrb->symcapa < sym) {
177 size_t symcapa = mrb->symcapa;
178 if (symcapa == 0) symcapa = 100;
179 else symcapa = (size_t)(symcapa * 6 / 5);
180 mrb->symtbl = (symbol_name*)mrb_realloc(mrb, mrb->symtbl, sizeof(symbol_name)*(symcapa+1));
181 mrb->symcapa = symcapa;
182 }
183 sname = &mrb->symtbl[sym];
184 sname->len = (uint16_t)len;
185 if (lit || mrb_ro_data_p(name)) {
186 sname->name = name;
187 sname->lit = TRUE;
188 }
189 else {
190 char *p = (char *)mrb_malloc(mrb, len+1);
191 memcpy(p, name, len);
192 p[len] = 0;
193 sname->name = (const char*)p;
194 sname->lit = FALSE;
195 }
196 if (mrb->symhash[hash]) {
197 mrb_sym i = sym - mrb->symhash[hash];
198 if (i > 0xff)
199 sname->prev = 0xff;
200 else
201 sname->prev = i;
202 }
203 else {
204 sname->prev = 0;
205 }
206 mrb->symhash[hash] = mrb->symidx = sym;
207
208 return sym<<SYMBOL_NORMAL_SHIFT;
209 }
210
211 MRB_API mrb_sym
mrb_intern(mrb_state * mrb,const char * name,size_t len)212 mrb_intern(mrb_state *mrb, const char *name, size_t len)
213 {
214 return sym_intern(mrb, name, len, FALSE);
215 }
216
217 MRB_API mrb_sym
mrb_intern_static(mrb_state * mrb,const char * name,size_t len)218 mrb_intern_static(mrb_state *mrb, const char *name, size_t len)
219 {
220 return sym_intern(mrb, name, len, TRUE);
221 }
222
223 MRB_API mrb_sym
mrb_intern_cstr(mrb_state * mrb,const char * name)224 mrb_intern_cstr(mrb_state *mrb, const char *name)
225 {
226 return mrb_intern(mrb, name, strlen(name));
227 }
228
229 MRB_API mrb_sym
mrb_intern_str(mrb_state * mrb,mrb_value str)230 mrb_intern_str(mrb_state *mrb, mrb_value str)
231 {
232 return mrb_intern(mrb, RSTRING_PTR(str), RSTRING_LEN(str));
233 }
234
235 MRB_API mrb_value
mrb_check_intern(mrb_state * mrb,const char * name,size_t len)236 mrb_check_intern(mrb_state *mrb, const char *name, size_t len)
237 {
238 mrb_sym sym;
239
240 sym_validate_len(mrb, len);
241 sym = find_symbol(mrb, name, len, NULL);
242 if (sym > 0) return mrb_symbol_value(sym);
243 return mrb_nil_value();
244 }
245
246 MRB_API mrb_value
mrb_check_intern_cstr(mrb_state * mrb,const char * name)247 mrb_check_intern_cstr(mrb_state *mrb, const char *name)
248 {
249 return mrb_check_intern(mrb, name, strlen(name));
250 }
251
252 MRB_API mrb_value
mrb_check_intern_str(mrb_state * mrb,mrb_value str)253 mrb_check_intern_str(mrb_state *mrb, mrb_value str)
254 {
255 return mrb_check_intern(mrb, RSTRING_PTR(str), RSTRING_LEN(str));
256 }
257
258 static const char*
sym2name_len(mrb_state * mrb,mrb_sym sym,char * buf,mrb_int * lenp)259 sym2name_len(mrb_state *mrb, mrb_sym sym, char *buf, mrb_int *lenp)
260 {
261 if (SYMBOL_INLINE_P(sym)) return sym_inline_unpack(sym, buf, lenp);
262
263 sym >>= SYMBOL_NORMAL_SHIFT;
264 if (sym == 0 || mrb->symidx < sym) {
265 if (lenp) *lenp = 0;
266 return NULL;
267 }
268
269 if (lenp) *lenp = mrb->symtbl[sym].len;
270 return mrb->symtbl[sym].name;
271 }
272
273 MRB_API const char*
mrb_sym_name_len(mrb_state * mrb,mrb_sym sym,mrb_int * lenp)274 mrb_sym_name_len(mrb_state *mrb, mrb_sym sym, mrb_int *lenp)
275 {
276 return sym2name_len(mrb, sym, mrb->symbuf, lenp);
277 }
278
279 void
mrb_free_symtbl(mrb_state * mrb)280 mrb_free_symtbl(mrb_state *mrb)
281 {
282 mrb_sym i, lim;
283
284 for (i=1, lim=mrb->symidx+1; i<lim; i++) {
285 if (!mrb->symtbl[i].lit) {
286 mrb_free(mrb, (char*)mrb->symtbl[i].name);
287 }
288 }
289 mrb_free(mrb, mrb->symtbl);
290 }
291
292 void
mrb_init_symtbl(mrb_state * mrb)293 mrb_init_symtbl(mrb_state *mrb)
294 {
295 }
296
297 /**********************************************************************
298 * Document-class: Symbol
299 *
300 * <code>Symbol</code> objects represent names and some strings
301 * inside the Ruby
 *  interpreter. They are generated using the <code>:name</code> and
 *  <code>:"string"</code> literal
 *  syntax, and by the various <code>to_sym</code> methods. The same
305 * <code>Symbol</code> object will be created for a given name or string
306 * for the duration of a program's execution, regardless of the context
307 * or meaning of that name. Thus if <code>Fred</code> is a constant in
308 * one context, a method in another, and a class in a third, the
309 * <code>Symbol</code> <code>:Fred</code> will be the same object in
310 * all three contexts.
311 *
312 * module One
313 * class Fred
314 * end
315 * $f1 = :Fred
316 * end
317 * module Two
318 * Fred = 1
319 * $f2 = :Fred
320 * end
321 * def Fred()
322 * end
323 * $f3 = :Fred
324 * $f1.object_id #=> 2514190
325 * $f2.object_id #=> 2514190
326 * $f3.object_id #=> 2514190
327 *
328 */
329
330 /* 15.2.11.3.2 */
331 /* 15.2.11.3.3 */
332 /*
333 * call-seq:
334 * sym.id2name -> string
335 * sym.to_s -> string
336 *
337 * Returns the name or string corresponding to <i>sym</i>.
338 *
339 * :fred.id2name #=> "fred"
340 */
341 static mrb_value
sym_to_s(mrb_state * mrb,mrb_value sym)342 sym_to_s(mrb_state *mrb, mrb_value sym)
343 {
344 return mrb_sym_str(mrb, mrb_symbol(sym));
345 }
346
347 /* 15.2.11.3.4 */
348 /*
349 * call-seq:
350 * sym.to_sym -> sym
351 * sym.intern -> sym
352 *
353 * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding
354 * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
355 * in this case.
356 */
357
358 static mrb_value
sym_to_sym(mrb_state * mrb,mrb_value sym)359 sym_to_sym(mrb_state *mrb, mrb_value sym)
360 {
361 return sym;
362 }
363
364 /* 15.2.11.3.5(x) */
365 /*
366 * call-seq:
367 * sym.inspect -> string
368 *
369 * Returns the representation of <i>sym</i> as a symbol literal.
370 *
371 * :fred.inspect #=> ":fred"
372 */
373
/* SIGN_EXTEND_CHAR(c): the value of `c` reinterpreted as a signed 8-bit char. */
#if __STDC__
# define SIGN_EXTEND_CHAR(c) ((signed char)(c))
#else  /* not __STDC__ */
/* As in Harbison and Steele.  */
# define SIGN_EXTEND_CHAR(c) ((((unsigned char)(c)) ^ 128) - 128)
#endif
/* is_identchar(c): `c` is alphanumeric or '_', and does not sign-extend to -1.
   NOTE: `c` is evaluated more than once. */
#define is_identchar(c) (SIGN_EXTEND_CHAR(c)!=-1&&(ISALNUM(c) || (c) == '_'))
381
382 static mrb_bool
is_special_global_name(const char * m)383 is_special_global_name(const char* m)
384 {
385 switch (*m) {
386 case '~': case '*': case '$': case '?': case '!': case '@':
387 case '/': case '\\': case ';': case ',': case '.': case '=':
388 case ':': case '<': case '>': case '\"':
389 case '&': case '`': case '\'': case '+':
390 case '0':
391 ++m;
392 break;
393 case '-':
394 ++m;
395 if (is_identchar(*m)) m += 1;
396 break;
397 default:
398 if (!ISDIGIT(*m)) return FALSE;
399 do ++m; while (ISDIGIT(*m));
400 break;
401 }
402 return !*m;
403 }
404
/*
 * Returns TRUE when `name` (a NUL-terminated string) has the shape of a bare
 * symbol name - an identifier, an instance/class/global variable name, or one
 * of the operator method names handled below (e.g. foo, @x, $1, <=>, []=).
 * Returns FALSE for NULL, the empty string, and anything else; sym_inspect
 * uses a FALSE result to fall back to the quoted form.
 *
 * Each case advances `m` past the part it recognises; the name is accepted
 * only if that consumes the whole string.
 */
static mrb_bool
symname_p(const char *name)
{
  const char *m = name;
  mrb_bool localid = FALSE;   /* TRUE only for plain identifiers not starting with an uppercase letter */

  if (!m) return FALSE;
  switch (*m) {
    case '\0':
      return FALSE;

    case '$':
      /* special globals are accepted outright; otherwise an identifier must follow */
      if (is_special_global_name(++m)) return TRUE;
      goto id;

    case '@':
      /* one or two '@', then an identifier */
      if (*++m == '@') ++m;
      goto id;

    case '<':
      /* "<", "<<", "<=", "<=>" */
      switch (*++m) {
        case '<': ++m; break;
        case '=': if (*++m == '>') ++m; break;
        default: break;
      }
      break;

    case '>':
      /* ">", ">>", ">=" */
      switch (*++m) {
        case '>': case '=': ++m; break;
        default: break;
      }
      break;

    case '=':
      /* "=~", "==", "==="; a lone "=" is rejected */
      switch (*++m) {
        case '~': ++m; break;
        case '=': if (*++m == '=') ++m; break;
        default: return FALSE;
      }
      break;

    case '*':
      /* one or two stars */
      if (*++m == '*') ++m;
      break;
    case '!':
      /* "!", "!=", "!~" */
      switch (*++m) {
        case '=': case '~': ++m;
      }
      break;
    case '+': case '-':
      /* binary form, or unary form with a trailing '@' */
      if (*++m == '@') ++m;
      break;
    case '|':
      /* "|" or "||" */
      if (*++m == '|') ++m;
      break;
    case '&':
      /* "&" or "&&" */
      if (*++m == '&') ++m;
      break;

    case '^': case '/': case '%': case '~': case '`':
      /* single-character operators */
      ++m;
      break;

    case '[':
      /* "[]" or "[]=" */
      if (*++m != ']') return FALSE;
      if (*++m == '=') ++m;
      break;

    default:
      localid = !ISUPPER(*m);
id:
      /* reached by fall-in from `default` or by goto from '$' / '@'
         (in which case localid is still FALSE) */
      if (*m != '_' && !ISALPHA(*m)) return FALSE;
      while (is_identchar(*m)) m += 1;
      if (localid) {
        /* such identifiers may carry one trailing '!', '?' or '=' */
        switch (*m) {
          case '!': case '?': case '=': ++m;
          default: break;
        }
      }
      break;
  }
  /* any unconsumed character disqualifies the name */
  return *m ? FALSE : TRUE;
}
489
490 static mrb_value
sym_inspect(mrb_state * mrb,mrb_value sym)491 sym_inspect(mrb_state *mrb, mrb_value sym)
492 {
493 mrb_value str;
494 const char *name;
495 mrb_int len;
496 mrb_sym id = mrb_symbol(sym);
497 char *sp;
498
499 name = mrb_sym_name_len(mrb, id, &len);
500 str = mrb_str_new(mrb, 0, len+1);
501 sp = RSTRING_PTR(str);
502 sp[0] = ':';
503 memcpy(sp+1, name, len);
504 mrb_assert_int_fit(mrb_int, len, size_t, SIZE_MAX);
505 if (!symname_p(name) || strlen(name) != (size_t)len) {
506 str = mrb_str_inspect(mrb, str);
507 sp = RSTRING_PTR(str);
508 sp[0] = ':';
509 sp[1] = '"';
510 }
511 #ifdef MRB_UTF8_STRING
512 if (SYMBOL_INLINE_P(id)) RSTR_SET_ASCII_FLAG(mrb_str_ptr(str));
513 #endif
514 return str;
515 }
516
517 MRB_API mrb_value
mrb_sym_str(mrb_state * mrb,mrb_sym sym)518 mrb_sym_str(mrb_state *mrb, mrb_sym sym)
519 {
520 mrb_int len;
521 const char *name = mrb_sym_name_len(mrb, sym, &len);
522
523 if (!name) return mrb_undef_value(); /* can't happen */
524 if (SYMBOL_INLINE_P(sym)) {
525 mrb_value str = mrb_str_new(mrb, name, len);
526 RSTR_SET_ASCII_FLAG(mrb_str_ptr(str));
527 return str;
528 }
529 return mrb_str_new_static(mrb, name, len);
530 }
531
532 static const char*
sym_name(mrb_state * mrb,mrb_sym sym,mrb_bool dump)533 sym_name(mrb_state *mrb, mrb_sym sym, mrb_bool dump)
534 {
535 mrb_int len;
536 const char *name = mrb_sym_name_len(mrb, sym, &len);
537
538 if (!name) return NULL;
539 if (strlen(name) == (size_t)len && (!dump || symname_p(name))) {
540 return name;
541 }
542 else {
543 mrb_value str = SYMBOL_INLINE_P(sym) ?
544 mrb_str_new(mrb, name, len) : mrb_str_new_static(mrb, name, len);
545 str = mrb_str_dump(mrb, str);
546 return RSTRING_PTR(str);
547 }
548 }
549
550 MRB_API const char*
mrb_sym_name(mrb_state * mrb,mrb_sym sym)551 mrb_sym_name(mrb_state *mrb, mrb_sym sym)
552 {
553 return sym_name(mrb, sym, FALSE);
554 }
555
556 MRB_API const char*
mrb_sym_dump(mrb_state * mrb,mrb_sym sym)557 mrb_sym_dump(mrb_state *mrb, mrb_sym sym)
558 {
559 return sym_name(mrb, sym, TRUE);
560 }
561
/* lesser(a,b): the smaller of the two values.  NOTE: arguments are evaluated more than once. */
#define lesser(a,b) (((a)>(b))?(b):(a))
563
564 static mrb_value
sym_cmp(mrb_state * mrb,mrb_value s1)565 sym_cmp(mrb_state *mrb, mrb_value s1)
566 {
567 mrb_value s2 = mrb_get_arg1(mrb);
568 mrb_sym sym1, sym2;
569
570 if (!mrb_symbol_p(s2)) return mrb_nil_value();
571 sym1 = mrb_symbol(s1);
572 sym2 = mrb_symbol(s2);
573 if (sym1 == sym2) return mrb_fixnum_value(0);
574 else {
575 const char *p1, *p2;
576 int retval;
577 mrb_int len, len1, len2;
578 char buf1[8], buf2[8];
579
580 p1 = sym2name_len(mrb, sym1, buf1, &len1);
581 p2 = sym2name_len(mrb, sym2, buf2, &len2);
582 len = lesser(len1, len2);
583 retval = memcmp(p1, p2, len);
584 if (retval == 0) {
585 if (len1 == len2) return mrb_fixnum_value(0);
586 if (len1 > len2) return mrb_fixnum_value(1);
587 return mrb_fixnum_value(-1);
588 }
589 if (retval > 0) return mrb_fixnum_value(1);
590 return mrb_fixnum_value(-1);
591 }
592 }
593
594 void
mrb_init_symbol(mrb_state * mrb)595 mrb_init_symbol(mrb_state *mrb)
596 {
597 struct RClass *sym;
598
599 mrb->symbol_class = sym = mrb_define_class(mrb, "Symbol", mrb->object_class); /* 15.2.11 */
600 MRB_SET_INSTANCE_TT(sym, MRB_TT_SYMBOL);
601 mrb_undef_class_method(mrb, sym, "new");
602
603 mrb_define_method(mrb, sym, "id2name", sym_to_s, MRB_ARGS_NONE()); /* 15.2.11.3.2 */
604 mrb_define_method(mrb, sym, "to_s", sym_to_s, MRB_ARGS_NONE()); /* 15.2.11.3.3 */
605 mrb_define_method(mrb, sym, "to_sym", sym_to_sym, MRB_ARGS_NONE()); /* 15.2.11.3.4 */
606 mrb_define_method(mrb, sym, "inspect", sym_inspect, MRB_ARGS_NONE()); /* 15.2.11.3.5(x) */
607 mrb_define_method(mrb, sym, "<=>", sym_cmp, MRB_ARGS_REQ(1));
608 }
609