1 /*
2 ** symbol.c - Symbol class
3 **
4 ** See Copyright Notice in mruby.h
5 */
6
7 #include <limits.h>
8 #include <string.h>
9 #include <mruby.h>
10 #include <mruby/khash.h>
11 #include <mruby/string.h>
12 #include <mruby/dump.h>
13 #include <mruby/class.h>
14
15 /* ------------------------------------------------------ */
16 typedef struct symbol_name {
17 mrb_bool lit : 1;
18 uint8_t prev;
19 uint16_t len;
20 const char *name;
21 } symbol_name;
22
23 static void
sym_validate_len(mrb_state * mrb,size_t len)24 sym_validate_len(mrb_state *mrb, size_t len)
25 {
26 if (len >= RITE_LV_NULL_MARK) {
27 mrb_raise(mrb, E_ARGUMENT_ERROR, "symbol length too long");
28 }
29 }
30
31 #ifndef MRB_ENABLE_ALL_SYMBOLS
32 static const char pack_table[] = "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
33
34 static mrb_sym
sym_inline_pack(const char * name,uint16_t len)35 sym_inline_pack(const char *name, uint16_t len)
36 {
37 char c;
38 const char *p;
39 int i;
40 mrb_sym sym = 0;
41 int lower = 1;
42
43 if (len > 6) return 0; /* too long */
44 for (i=0; i<len; i++) {
45 uint32_t bits;
46
47 c = name[i];
48 if (c == 0) return 0; /* NUL in name */
49 p = strchr(pack_table, (int)c);
50 if (p == 0) return 0; /* non alnum char */
51 bits = (uint32_t)(p - pack_table)+1;
52 if (bits > 27) lower = 0;
53 sym |= bits<<(i*6+2);
54 }
55 if (lower) {
56 sym = 0;
57 for (i=0; i<len; i++) {
58 uint32_t bits;
59
60 c = name[i];
61 p = strchr(pack_table, (int)c);
62 bits = (uint32_t)(p - pack_table)+1;
63 sym |= bits<<(i*5+2);
64 }
65 return sym | 3;
66 }
67 if (len == 6) return 0;
68 return sym | 1;
69 }
70
71 static const char*
sym_inline_unpack_with_bit(mrb_sym sym,char * buf,int bit_per_char)72 sym_inline_unpack_with_bit(mrb_sym sym, char *buf, int bit_per_char)
73 {
74 int i;
75
76 for (i=0; i<30/bit_per_char; i++) {
77 uint32_t bits;
78 char c;
79
80 bits = sym>>(i*bit_per_char+2) & (1<<bit_per_char)-1;
81 if (bits == 0) break;
82 c = pack_table[bits-1];
83 buf[i] = c;
84 }
85 buf[i] = '\0';
86 return buf;
87 }
88
89 static const char*
sym_inline_unpack(mrb_sym sym,char * buf)90 sym_inline_unpack(mrb_sym sym, char *buf)
91 {
92 mrb_assert(sym&1);
93
94 if (sym&2) { /* all lower case (5bits/char) */
95 return sym_inline_unpack_with_bit(sym, buf, 5);
96 }
97 return sym_inline_unpack_with_bit(sym, buf, 6);
98 }
99 #endif
100
/*
 * Hash the first `len` bytes of `key` down to 8 bits.
 * Each byte is added to a 32-bit accumulator and mixed with shift/add/xor
 * steps (the same steps as the one-at-a-time hash); after a final mixing
 * round only the low byte is returned.
 */
uint8_t
symhash(const char *key, size_t len)
{
  uint32_t h = 0;
  uint32_t pos = 0;

  while (pos < len) {
    h += key[pos];
    h += (h << 10);
    h ^= (h >> 6);
    pos++;
  }
  h += (h << 3);
  h ^= (h >> 11);
  h += (h << 15);
  return h & 0xff;
}
116
117 static mrb_sym
find_symbol(mrb_state * mrb,const char * name,uint16_t len,uint8_t hash)118 find_symbol(mrb_state *mrb, const char *name, uint16_t len, uint8_t hash)
119 {
120 mrb_sym i;
121 symbol_name *sname;
122
123 #ifndef MRB_ENABLE_ALL_SYMBOLS
124 /* inline symbol */
125 i = sym_inline_pack(name, len);
126 if (i > 0) return i;
127 #endif
128
129 i = mrb->symhash[hash];
130 if (i == 0) return 0;
131 do {
132 sname = &mrb->symtbl[i];
133 if (sname->len == len && memcmp(sname->name, name, len) == 0) {
134 return i<<1;
135 }
136 if (sname->prev == 0xff) {
137 i -= 0xff;
138 sname = &mrb->symtbl[i];
139 while (mrb->symtbl < sname) {
140 if (sname->len == len && memcmp(sname->name, name, len) == 0) {
141 return (mrb_sym)(sname - mrb->symtbl)<<1;
142 }
143 sname--;
144 }
145 return 0;
146 }
147 i -= sname->prev;
148 } while (sname->prev > 0);
149 return 0;
150 }
151
152 static mrb_sym
sym_intern(mrb_state * mrb,const char * name,size_t len,mrb_bool lit)153 sym_intern(mrb_state *mrb, const char *name, size_t len, mrb_bool lit)
154 {
155 mrb_sym sym;
156 symbol_name *sname;
157 uint8_t hash;
158
159 sym_validate_len(mrb, len);
160 hash = symhash(name, len);
161 sym = find_symbol(mrb, name, len, hash);
162 if (sym > 0) return sym;
163
164 /* registering a new symbol */
165 sym = ++mrb->symidx;
166 if (mrb->symcapa < sym) {
167 if (mrb->symcapa == 0) mrb->symcapa = 100;
168 else mrb->symcapa = (size_t)(mrb->symcapa * 6 / 5);
169 mrb->symtbl = (symbol_name*)mrb_realloc(mrb, mrb->symtbl, sizeof(symbol_name)*(mrb->symcapa+1));
170 }
171 sname = &mrb->symtbl[sym];
172 sname->len = (uint16_t)len;
173 if (lit || mrb_ro_data_p(name)) {
174 sname->name = name;
175 sname->lit = TRUE;
176 }
177 else {
178 char *p = (char *)mrb_malloc(mrb, len+1);
179 memcpy(p, name, len);
180 p[len] = 0;
181 sname->name = (const char*)p;
182 sname->lit = FALSE;
183 }
184 if (mrb->symhash[hash]) {
185 mrb_sym i = sym - mrb->symhash[hash];
186 if (i > 0xff)
187 sname->prev = 0xff;
188 else
189 sname->prev = i;
190 }
191 else {
192 sname->prev = 0;
193 }
194 mrb->symhash[hash] = sym;
195
196 return sym<<1;
197 }
198
199 MRB_API mrb_sym
mrb_intern(mrb_state * mrb,const char * name,size_t len)200 mrb_intern(mrb_state *mrb, const char *name, size_t len)
201 {
202 return sym_intern(mrb, name, len, FALSE);
203 }
204
205 MRB_API mrb_sym
mrb_intern_static(mrb_state * mrb,const char * name,size_t len)206 mrb_intern_static(mrb_state *mrb, const char *name, size_t len)
207 {
208 return sym_intern(mrb, name, len, TRUE);
209 }
210
211 MRB_API mrb_sym
mrb_intern_cstr(mrb_state * mrb,const char * name)212 mrb_intern_cstr(mrb_state *mrb, const char *name)
213 {
214 return mrb_intern(mrb, name, strlen(name));
215 }
216
217 MRB_API mrb_sym
mrb_intern_str(mrb_state * mrb,mrb_value str)218 mrb_intern_str(mrb_state *mrb, mrb_value str)
219 {
220 return mrb_intern(mrb, RSTRING_PTR(str), RSTRING_LEN(str));
221 }
222
223 MRB_API mrb_value
mrb_check_intern(mrb_state * mrb,const char * name,size_t len)224 mrb_check_intern(mrb_state *mrb, const char *name, size_t len)
225 {
226 mrb_sym sym;
227
228 sym_validate_len(mrb, len);
229 sym = find_symbol(mrb, name, len, symhash(name, len));
230 if (sym > 0) return mrb_symbol_value(sym);
231 return mrb_nil_value();
232 }
233
234 MRB_API mrb_value
mrb_check_intern_cstr(mrb_state * mrb,const char * name)235 mrb_check_intern_cstr(mrb_state *mrb, const char *name)
236 {
237 return mrb_check_intern(mrb, name, strlen(name));
238 }
239
240 MRB_API mrb_value
mrb_check_intern_str(mrb_state * mrb,mrb_value str)241 mrb_check_intern_str(mrb_state *mrb, mrb_value str)
242 {
243 return mrb_check_intern(mrb, RSTRING_PTR(str), RSTRING_LEN(str));
244 }
245
246 MRB_API const char*
mrb_sym2name_len(mrb_state * mrb,mrb_sym sym,mrb_int * lenp)247 mrb_sym2name_len(mrb_state *mrb, mrb_sym sym, mrb_int *lenp)
248 {
249 #ifndef MRB_ENABLE_ALL_SYMBOLS
250 if (sym & 1) { /* inline packed symbol */
251 sym_inline_unpack(sym, mrb->symbuf);
252 if (lenp) *lenp = strlen(mrb->symbuf);
253 return mrb->symbuf;
254 }
255 #endif
256
257 sym >>= 1;
258 if (sym == 0 || mrb->symidx < sym) {
259 if (lenp) *lenp = 0;
260 return NULL;
261 }
262
263 if (lenp) *lenp = mrb->symtbl[sym].len;
264 return mrb->symtbl[sym].name;
265 }
266
267 void
mrb_free_symtbl(mrb_state * mrb)268 mrb_free_symtbl(mrb_state *mrb)
269 {
270 mrb_sym i, lim;
271
272 for (i=1, lim=mrb->symidx+1; i<lim; i++) {
273 if (!mrb->symtbl[i].lit) {
274 mrb_free(mrb, (char*)mrb->symtbl[i].name);
275 }
276 }
277 mrb_free(mrb, mrb->symtbl);
278 }
279
280 void
mrb_init_symtbl(mrb_state * mrb)281 mrb_init_symtbl(mrb_state *mrb)
282 {
283 }
284
285 /**********************************************************************
286 * Document-class: Symbol
287 *
288 * <code>Symbol</code> objects represent names and some strings
289 * inside the Ruby
290 * interpreter. They are generated using the <code>:name</code> and
291 * <code>:"string"</code> literals
292 * syntax, and by the various <code>to_sym</code> methods. The same
293 * <code>Symbol</code> object will be created for a given name or string
294 * for the duration of a program's execution, regardless of the context
295 * or meaning of that name. Thus if <code>Fred</code> is a constant in
296 * one context, a method in another, and a class in a third, the
297 * <code>Symbol</code> <code>:Fred</code> will be the same object in
298 * all three contexts.
299 *
300 * module One
301 * class Fred
302 * end
303 * $f1 = :Fred
304 * end
305 * module Two
306 * Fred = 1
307 * $f2 = :Fred
308 * end
309 * def Fred()
310 * end
311 * $f3 = :Fred
312 * $f1.object_id #=> 2514190
313 * $f2.object_id #=> 2514190
314 * $f3.object_id #=> 2514190
315 *
316 */
317
318
319 /* 15.2.11.3.1 */
320 /*
321 * call-seq:
322 * sym == obj -> true or false
323 *
324 * Equality---If <i>sym</i> and <i>obj</i> are exactly the same
325 * symbol, returns <code>true</code>.
326 */
327
328 static mrb_value
sym_equal(mrb_state * mrb,mrb_value sym1)329 sym_equal(mrb_state *mrb, mrb_value sym1)
330 {
331 mrb_value sym2;
332
333 mrb_get_args(mrb, "o", &sym2);
334
335 return mrb_bool_value(mrb_obj_equal(mrb, sym1, sym2));
336 }
337
338 /* 15.2.11.3.2 */
339 /* 15.2.11.3.3 */
340 /*
341 * call-seq:
342 * sym.id2name -> string
343 * sym.to_s -> string
344 *
345 * Returns the name or string corresponding to <i>sym</i>.
346 *
347 * :fred.id2name #=> "fred"
348 */
349 static mrb_value
mrb_sym_to_s(mrb_state * mrb,mrb_value sym)350 mrb_sym_to_s(mrb_state *mrb, mrb_value sym)
351 {
352 mrb_sym id = mrb_symbol(sym);
353 const char *p;
354 mrb_int len;
355
356 p = mrb_sym2name_len(mrb, id, &len);
357 return mrb_str_new_static(mrb, p, len);
358 }
359
360 /* 15.2.11.3.4 */
361 /*
362 * call-seq:
363 * sym.to_sym -> sym
364 * sym.intern -> sym
365 *
366 * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding
367 * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
368 * in this case.
369 */
370
371 static mrb_value
sym_to_sym(mrb_state * mrb,mrb_value sym)372 sym_to_sym(mrb_state *mrb, mrb_value sym)
373 {
374 return sym;
375 }
376
377 /* 15.2.11.3.5(x) */
378 /*
379 * call-seq:
380 * sym.inspect -> string
381 *
382 * Returns the representation of <i>sym</i> as a symbol literal.
383 *
384 * :fred.inspect #=> ":fred"
385 */
386
/* SIGN_EXTEND_CHAR(c): the value of `c` reinterpreted as a signed 8-bit char. */
#if __STDC__
#  define SIGN_EXTEND_CHAR(c) ((signed char)(c))
#else  /* not __STDC__ */
/* As in Harbison and Steele. */
#  define SIGN_EXTEND_CHAR(c) ((((unsigned char)(c)) ^ 128) - 128)
#endif

/*
 * is_identchar(c): true when `c` may appear inside an identifier, i.e. it is
 * alphanumeric or '_'.  A character whose sign-extended value is -1 is
 * rejected before the class test.  The argument is evaluated more than once.
 */
#define is_identchar(c) (SIGN_EXTEND_CHAR(c)!=-1&&(ISALNUM(c) || (c) == '_'))
394
395 static mrb_bool
is_special_global_name(const char * m)396 is_special_global_name(const char* m)
397 {
398 switch (*m) {
399 case '~': case '*': case '$': case '?': case '!': case '@':
400 case '/': case '\\': case ';': case ',': case '.': case '=':
401 case ':': case '<': case '>': case '\"':
402 case '&': case '`': case '\'': case '+':
403 case '0':
404 ++m;
405 break;
406 case '-':
407 ++m;
408 if (is_identchar(*m)) m += 1;
409 break;
410 default:
411 if (!ISDIGIT(*m)) return FALSE;
412 do ++m; while (ISDIGIT(*m));
413 break;
414 }
415 return !*m;
416 }
417
418 static mrb_bool
symname_p(const char * name)419 symname_p(const char *name)
420 {
421 const char *m = name;
422 mrb_bool localid = FALSE;
423
424 if (!m) return FALSE;
425 switch (*m) {
426 case '\0':
427 return FALSE;
428
429 case '$':
430 if (is_special_global_name(++m)) return TRUE;
431 goto id;
432
433 case '@':
434 if (*++m == '@') ++m;
435 goto id;
436
437 case '<':
438 switch (*++m) {
439 case '<': ++m; break;
440 case '=': if (*++m == '>') ++m; break;
441 default: break;
442 }
443 break;
444
445 case '>':
446 switch (*++m) {
447 case '>': case '=': ++m; break;
448 default: break;
449 }
450 break;
451
452 case '=':
453 switch (*++m) {
454 case '~': ++m; break;
455 case '=': if (*++m == '=') ++m; break;
456 default: return FALSE;
457 }
458 break;
459
460 case '*':
461 if (*++m == '*') ++m;
462 break;
463 case '!':
464 switch (*++m) {
465 case '=': case '~': ++m;
466 }
467 break;
468 case '+': case '-':
469 if (*++m == '@') ++m;
470 break;
471 case '|':
472 if (*++m == '|') ++m;
473 break;
474 case '&':
475 if (*++m == '&') ++m;
476 break;
477
478 case '^': case '/': case '%': case '~': case '`':
479 ++m;
480 break;
481
482 case '[':
483 if (*++m != ']') return FALSE;
484 if (*++m == '=') ++m;
485 break;
486
487 default:
488 localid = !ISUPPER(*m);
489 id:
490 if (*m != '_' && !ISALPHA(*m)) return FALSE;
491 while (is_identchar(*m)) m += 1;
492 if (localid) {
493 switch (*m) {
494 case '!': case '?': case '=': ++m;
495 default: break;
496 }
497 }
498 break;
499 }
500 return *m ? FALSE : TRUE;
501 }
502
503 static mrb_value
sym_inspect(mrb_state * mrb,mrb_value sym)504 sym_inspect(mrb_state *mrb, mrb_value sym)
505 {
506 mrb_value str;
507 const char *name;
508 mrb_int len;
509 mrb_sym id = mrb_symbol(sym);
510 char *sp;
511
512 name = mrb_sym2name_len(mrb, id, &len);
513 str = mrb_str_new(mrb, 0, len+1);
514 sp = RSTRING_PTR(str);
515 RSTRING_PTR(str)[0] = ':';
516 memcpy(sp+1, name, len);
517 mrb_assert_int_fit(mrb_int, len, size_t, SIZE_MAX);
518 if (!symname_p(name) || strlen(name) != (size_t)len) {
519 str = mrb_str_dump(mrb, str);
520 sp = RSTRING_PTR(str);
521 sp[0] = ':';
522 sp[1] = '"';
523 }
524 return str;
525 }
526
527 MRB_API mrb_value
mrb_sym2str(mrb_state * mrb,mrb_sym sym)528 mrb_sym2str(mrb_state *mrb, mrb_sym sym)
529 {
530 mrb_int len;
531 const char *name = mrb_sym2name_len(mrb, sym, &len);
532
533 if (!name) return mrb_undef_value(); /* can't happen */
534 if (sym&1) { /* inline symbol */
535 return mrb_str_new(mrb, name, len);
536 }
537 return mrb_str_new_static(mrb, name, len);
538 }
539
540 MRB_API const char*
mrb_sym2name(mrb_state * mrb,mrb_sym sym)541 mrb_sym2name(mrb_state *mrb, mrb_sym sym)
542 {
543 mrb_int len;
544 const char *name = mrb_sym2name_len(mrb, sym, &len);
545
546 if (!name) return NULL;
547 if (symname_p(name) && strlen(name) == (size_t)len) {
548 return name;
549 }
550 else {
551 mrb_value str = mrb_str_dump(mrb, mrb_str_new_static(mrb, name, len));
552 return RSTRING_PTR(str);
553 }
554 }
555
556 #define lesser(a,b) (((a)>(b))?(b):(a))
557
558 static mrb_value
sym_cmp(mrb_state * mrb,mrb_value s1)559 sym_cmp(mrb_state *mrb, mrb_value s1)
560 {
561 mrb_value s2;
562 mrb_sym sym1, sym2;
563
564 mrb_get_args(mrb, "o", &s2);
565 if (mrb_type(s2) != MRB_TT_SYMBOL) return mrb_nil_value();
566 sym1 = mrb_symbol(s1);
567 sym2 = mrb_symbol(s2);
568 if (sym1 == sym2) return mrb_fixnum_value(0);
569 else {
570 const char *p1, *p2;
571 int retval;
572 mrb_int len, len1, len2;
573
574 p1 = mrb_sym2name_len(mrb, sym1, &len1);
575 p2 = mrb_sym2name_len(mrb, sym2, &len2);
576 len = lesser(len1, len2);
577 retval = memcmp(p1, p2, len);
578 if (retval == 0) {
579 if (len1 == len2) return mrb_fixnum_value(0);
580 if (len1 > len2) return mrb_fixnum_value(1);
581 return mrb_fixnum_value(-1);
582 }
583 if (retval > 0) return mrb_fixnum_value(1);
584 return mrb_fixnum_value(-1);
585 }
586 }
587
588 void
mrb_init_symbol(mrb_state * mrb)589 mrb_init_symbol(mrb_state *mrb)
590 {
591 struct RClass *sym;
592
593 mrb->symbol_class = sym = mrb_define_class(mrb, "Symbol", mrb->object_class); /* 15.2.11 */
594 MRB_SET_INSTANCE_TT(sym, MRB_TT_SYMBOL);
595 mrb_undef_class_method(mrb, sym, "new");
596
597 mrb_define_method(mrb, sym, "===", sym_equal, MRB_ARGS_REQ(1)); /* 15.2.11.3.1 */
598 mrb_define_method(mrb, sym, "id2name", mrb_sym_to_s, MRB_ARGS_NONE()); /* 15.2.11.3.2 */
599 mrb_define_method(mrb, sym, "to_s", mrb_sym_to_s, MRB_ARGS_NONE()); /* 15.2.11.3.3 */
600 mrb_define_method(mrb, sym, "to_sym", sym_to_sym, MRB_ARGS_NONE()); /* 15.2.11.3.4 */
601 mrb_define_method(mrb, sym, "inspect", sym_inspect, MRB_ARGS_NONE()); /* 15.2.11.3.5(x) */
602 mrb_define_method(mrb, sym, "<=>", sym_cmp, MRB_ARGS_REQ(1));
603 }
604