1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2019, Joyent, Inc. 14 * Copyright 2021 Jason King 15 */ 16 17 #include <errno.h> 18 #include <libcustr.h> 19 #include <limits.h> 20 #include <string.h> 21 #include <sys/ctype.h> /* We want the C locale ISXXX() versions */ 22 #include <sys/debug.h> 23 #include <stdio.h> 24 #include <sys/sysmacros.h> 25 26 #include "strview.h" 27 #include "demangle_int.h" 28 29 /* 30 * Unfortunately, there is currently no official specification for the rust 31 * name mangling. This is an attempt to document the understanding of the 32 * mangling used here. It is based off examination of 33 * https://docs.rs/rustc-demangle/0.1.13/rustc_demangle/ 34 * 35 * A mangled rust name is: 36 * <prefix> <name> 37 * 38 * <prefix> ::= _Z 39 * __Z 40 * 41 * <name> ::= N <name-segment>+ [<hash>] E 42 * 43 * <name-segment> ::= <len> <name-chars>{len} 44 * 45 * <len> ::= [1-9][0-9]+ 46 * 47 * <name-chars> ::= <[A-Za-z]> <[A-Za-z0-9]>* 48 * <separator> 49 * <special> 50 * 51 * <separator> ::= '..' # '::' 52 * 53 * <special> ::= $SP$ # ' ' 54 * $BP$ # '*' 55 * $RF$ # '&' 56 * $LT$ # '<' 57 * $GT$ # '>' 58 * $LP$ # '(' 59 * $RP$ # ')' 60 * $C$ # ',' 61 * $u7e$ # '~' 62 * $u20$ # ' ' 63 * $u27$ # '\'' 64 * $u3d$ # '=' 65 * $u5b$ # '[' 66 * $u5d$ # ']' 67 * $u7b$ # '{' 68 * $u7d$ # '}' 69 * $u3b$ # ';' 70 * $u2b$ # '+' 71 * $u22$ # '"' 72 * 73 * <hash> := <len> h <hex-digits>+ 74 * 75 * <hex-digits> := <[0-9a-f]> 76 */ 77 78 typedef struct rustdem_state { 79 const char *rds_str; 80 custr_t *rds_demangled; 81 sysdem_ops_t *rds_ops; 82 int rds_error; 83 } rustdem_state_t; 84 85 static const struct rust_charmap { 86 const char *ruc_seq; 87 char ruc_ch; 88 } rust_charmap[] = { 89 { "$SP$", '@' }, 90 { "$BP$", '*' }, 91 { "$RF$", '&' }, 92 { "$LT$", '<' }, 93 { "$GT$", '>' }, 94 { "$LP$", '(' }, 95 { "$RP$", ')' }, 96 { "$C$", ',' }, 97 { "$u7e$", '~' }, 98 { "$u20$", ' ' }, 99 { "$u27$", '\'' }, 100 { "$u3d$", '=' }, 101 { "$u5b$", '[' }, 102 { "$u5d$", ']' }, 103 { "$u7b$", '{' }, 104 { "$u7d$", '}' }, 105 { "$u3b$", ';' }, 106 { "$u2b$", '+' }, 107 { "$u22$", '"' } 108 }; 109 static const size_t rust_charmap_sz = ARRAY_SIZE(rust_charmap); 110 111 static void *rustdem_alloc(custr_alloc_t *, size_t); 112 static void rustdem_free(custr_alloc_t *, void *, size_t); 113 114 static boolean_t rustdem_append_c(rustdem_state_t *, char); 115 static boolean_t rustdem_all_ascii(const strview_t *); 116 117 static boolean_t rustdem_parse_prefix(rustdem_state_t *, strview_t *); 118 static boolean_t rustdem_parse_name(rustdem_state_t *, strview_t *); 119 static boolean_t rustdem_parse_hash(rustdem_state_t *, strview_t *); 120 static boolean_t rustdem_parse_num(rustdem_state_t *, strview_t *, uint64_t *); 121 static boolean_t rustdem_parse_special(rustdem_state_t *, strview_t *); 122 static boolean_t rustdem_add_sep(rustdem_state_t *); 123 124 char * 125 rust_demangle(const char *s, size_t slen, sysdem_ops_t *ops) 126 { 127 rustdem_state_t st = { 128 .rds_str = s, 129 .rds_ops = ops, 130 }; 131 custr_alloc_ops_t custr_ops = { 132 .custr_ao_alloc = rustdem_alloc, 133 .custr_ao_free = rustdem_free 134 }; 135 custr_alloc_t custr_alloc = { 136 .cua_version = CUSTR_VERSION 137 }; 138 strview_t sv; 139 int ret; 140 141 if (custr_alloc_init(&custr_alloc, &custr_ops) != 0) 142 return (NULL); 143 custr_alloc.cua_arg = &st; 144 145 sv_init_str(&sv, s, s + slen); 146 147 if (sv_remaining(&sv) < 1 || sv_peek(&sv, -1) != 'E') { 148 DEMDEBUG("ERROR: string is either too small or does not end " 149 "with 'E'"); 150 errno = EINVAL; 151 return (NULL); 152 } 153 154 if (!rustdem_parse_prefix(&st, &sv)) { 155 DEMDEBUG("ERROR: could not parse prefix"); 156 errno = EINVAL; 157 return (NULL); 158 } 159 DEMDEBUG("parsed prefix; remaining='%.*s'", SV_PRINT(&sv)); 160 161 if (!rustdem_all_ascii(&sv)) { 162 /* rustdem_all_ascii() provides debug output */ 163 errno = EINVAL; 164 return (NULL); 165 } 166 167 if ((ret = custr_xalloc(&st.rds_demangled, &custr_alloc)) != 0) 168 return (NULL); 169 170 if (!rustdem_parse_name(&st, &sv)) { 171 if (st.rds_error == 0) 172 st.rds_error = EINVAL; 173 goto fail; 174 } 175 176 if (sv_remaining(&sv) > 0) { 177 DEMDEBUG("ERROR: unexpected trailing characters after " 178 "terminating 'E': '%.*s'", SV_PRINT(&sv)); 179 st.rds_error = EINVAL; 180 goto fail; 181 } 182 183 char *res = xstrdup(ops, custr_cstr(st.rds_demangled)); 184 if (res == NULL) { 185 st.rds_error = errno; 186 goto fail; 187 } 188 189 custr_free(st.rds_demangled); 190 DEMDEBUG("result = '%s'", res); 191 return (res); 192 193 fail: 194 custr_free(st.rds_demangled); 195 errno = st.rds_error; 196 return (NULL); 197 } 198 199 static boolean_t 200 rustdem_parse_prefix(rustdem_state_t *st, strview_t *svp) 201 { 202 strview_t pfx; 203 204 sv_init_sv(&pfx, svp); 205 206 DEMDEBUG("checking for '_Z' or '__Z' in '%.*s'", SV_PRINT(&pfx)); 207 208 if (st->rds_error != 0) 209 return (B_FALSE); 210 211 if (!sv_consume_if_c(&pfx, '_')) 212 return (B_FALSE); 213 214 (void) sv_consume_if_c(&pfx, '_'); 215 216 if (!sv_consume_if_c(&pfx, 'Z')) 217 return (B_FALSE); 218 219 /* Update svp with new position */ 220 sv_init_sv(svp, &pfx); 221 return (B_TRUE); 222 } 223 224 static boolean_t 225 rustdem_parse_name_segment(rustdem_state_t *st, strview_t *svp, boolean_t first) 226 { 227 strview_t sv; 228 strview_t name; 229 uint64_t len; 230 size_t rem; 231 boolean_t last = B_FALSE; 232 233 if (st->rds_error != 0 || sv_remaining(svp) == 0) 234 return (B_FALSE); 235 236 sv_init_sv(&sv, svp); 237 238 if (!rustdem_parse_num(st, &sv, &len)) { 239 DEMDEBUG("ERROR: no leading length"); 240 st->rds_error = EINVAL; 241 return (B_FALSE); 242 } 243 244 rem = sv_remaining(&sv); 245 246 if (rem < len) { 247 st->rds_error = EINVAL; 248 return (B_FALSE); 249 } 250 251 /* Is this the last segment before the terminating E? */ 252 if (rem == len + 1) { 253 VERIFY3U(sv_peek(&sv, -1), ==, 'E'); 254 last = B_TRUE; 255 } 256 257 if (!first && !rustdem_add_sep(st)) 258 return (B_FALSE); 259 260 /* Reduce length of seg to the length we parsed */ 261 (void) sv_init_sv_range(&name, &sv, len); 262 263 DEMDEBUG("%s: segment='%.*s'", __func__, SV_PRINT(&name)); 264 265 /* 266 * A rust hash starts with 'h', and is the last component of a name 267 * before the terminating 'E'. It is however not always present 268 * in every mangled symbol, and a last segment that starts with 'h' 269 * could be confused for it, so failing to parse it just means 270 * we don't have a trailing hash. 271 */ 272 if (sv_peek(&name, 0) == 'h' && last) { 273 if (rustdem_parse_hash(st, &name)) 274 goto done; 275 276 /* 277 * However any error other than 'not a hash' (e.g. ENOMEM) 278 * means we should fail. 279 */ 280 if (st->rds_error != 0) 281 goto done; 282 } 283 284 while (sv_remaining(&name) > 0) { 285 switch (sv_peek(&name, 0)) { 286 case '$': 287 if (rustdem_parse_special(st, &name)) 288 continue; 289 break; 290 case '_': 291 if (sv_peek(&name, 1) == '$') { 292 /* 293 * Only consume/ignore '_'. Leave 294 * $ for next round. 295 */ 296 sv_consume_n(&name, 1); 297 continue; 298 } 299 break; 300 case '.': 301 /* Convert '..' to '::' */ 302 if (sv_peek(&name, 1) != '.') 303 break; 304 305 if (!rustdem_add_sep(st)) 306 return (B_FALSE); 307 308 sv_consume_n(&name, 2); 309 continue; 310 default: 311 break; 312 } 313 314 if (custr_appendc(st->rds_demangled, 315 sv_consume_c(&name)) != 0) { 316 st->rds_error = ENOMEM; 317 return (B_FALSE); 318 } 319 } 320 321 done: 322 sv_consume_n(&sv, len); 323 VERIFY3P(svp->sv_first, <=, sv.sv_first); 324 DEMDEBUG("%s: consumed '%.*s'", __func__, 325 (int)(sv.sv_first - svp->sv_first), svp->sv_first); 326 sv_init_sv(svp, &sv); 327 return (B_TRUE); 328 } 329 330 /* 331 * Parse N (<num><name>{num})+[<num>h<hex digits>]E 332 */ 333 static boolean_t 334 rustdem_parse_name(rustdem_state_t *st, strview_t *svp) 335 { 336 strview_t name; 337 boolean_t first = B_TRUE; 338 339 if (st->rds_error != 0) 340 return (B_FALSE); 341 342 sv_init_sv(&name, svp); 343 344 DEMDEBUG("%s: name = '%.*s'", __func__, SV_PRINT(&name)); 345 346 if (sv_remaining(&name) == 0) { 347 DEMDEBUG("%s: empty name", __func__); 348 return (B_FALSE); 349 } 350 351 if (!sv_consume_if_c(&name, 'N')) { 352 DEMDEBUG("%s: does not start with 'N'", __func__); 353 return (B_FALSE); 354 } 355 356 while (sv_remaining(&name) > 0 && sv_peek(&name, 0) != 'E') { 357 if (!rustdem_parse_name_segment(st, &name, first)) 358 return (B_FALSE); 359 first = B_FALSE; 360 } 361 VERIFY(sv_consume_if_c(&name, 'E')); 362 363 VERIFY3P(svp->sv_first, <=, name.sv_first); 364 DEMDEBUG("%s: consumed '%.*s'", __func__, 365 (int)(name.sv_first - svp->sv_first), svp->sv_first); 366 367 sv_init_sv(svp, &name); 368 return (B_TRUE); 369 } 370 371 static boolean_t 372 rustdem_parse_hash(rustdem_state_t *st, strview_t *svp) 373 { 374 strview_t sv; 375 376 sv_init_sv(&sv, svp); 377 378 VERIFY(sv_consume_if_c(&sv, 'h')); 379 if (!rustdem_append_c(st, 'h')) 380 return (B_FALSE); 381 382 while (sv_remaining(&sv) > 0) { 383 char c = sv_consume_c(&sv); 384 385 switch (c) { 386 /* 387 * The upper-case hex digits (A-F) are excluded as valid 388 * hash values for several reasons: 389 * 390 * 1. It would result in two different possible names for 391 * the same function, leading to ambiguity in linking (among 392 * other things). 393 * 394 * 2. It would cause potential ambiguity in parsing -- is a 395 * trailing 'E' part of the hash, or the terminating character 396 * in the mangled name? 397 * 398 * 3. No examples were able to be found in the wild where 399 * uppercase digits are used, and other rust demanglers all 400 * seem to assume the hash must contain lower-case hex digits. 401 */ 402 case '0': case '1': case '2': case '3': 403 case '4': case '5': case '6': case '7': 404 case '8': case '9': case 'a': case 'b': 405 case 'c': case 'd': case 'e': case 'f': 406 if (!rustdem_append_c(st, c)) 407 return (B_FALSE); 408 break; 409 default: 410 return (B_FALSE); 411 } 412 } 413 414 sv_init_sv(svp, &sv); 415 return (B_TRUE); 416 } 417 418 /* 419 * We have to pick an arbitrary limit here; 999,999,999 fits comfortably 420 * within an int32_t, so let's go with that, as it seems unlikely we'd 421 * ever see a larger value in context. 422 */ 423 #define MAX_DIGITS 9 424 425 static boolean_t 426 rustdem_parse_num(rustdem_state_t *restrict st, strview_t *restrict svp, 427 uint64_t *restrict valp) 428 { 429 strview_t snum; 430 uint64_t v = 0; 431 size_t ndigits = 0; 432 char c; 433 434 if (st->rds_error != 0) 435 return (B_FALSE); 436 437 sv_init_sv(&snum, svp); 438 439 DEMDEBUG("%s: str='%.*s'", __func__, SV_PRINT(&snum)); 440 441 c = sv_peek(&snum, 0); 442 if (!ISDIGIT(c)) { 443 DEMDEBUG("%s: ERROR no digits in str\n", __func__); 444 st->rds_error = EINVAL; 445 return (B_FALSE); 446 } 447 448 /* 449 * Since there is currently no official specification on rust name 450 * mangling, only that it has been stated that rust follows what 451 * C++ mangling does. In the Itanium C++ ABI (what practically 452 * every non-Windows C++ implementation uses these days), it 453 * explicitly disallows leading 0s in numeric values (except for 454 * substition and template indexes, which aren't relevant here). 455 * We enforce the same restriction -- if a rust implementation allowed 456 * leading zeros in numbers (basically segment lengths) it'd 457 * cause all sorts of ambiguity problems with names that likely lead 458 * to much bigger problems with linking and such, so this seems 459 * reasonable. 460 */ 461 if (c == '0') { 462 DEMDEBUG("%s: ERROR number starts with leading 0\n", __func__); 463 st->rds_error = EINVAL; 464 return (B_FALSE); 465 } 466 467 while (sv_remaining(&snum) > 0 && ndigits <= MAX_DIGITS) { 468 c = sv_consume_c(&snum); 469 470 if (!ISDIGIT(c)) 471 break; 472 473 v *= 10; 474 v += c - '0'; 475 ndigits++; 476 } 477 478 if (ndigits > MAX_DIGITS) { 479 DEMDEBUG("%s: value %llu is too large\n", __func__, v); 480 st->rds_error = ERANGE; 481 return (B_FALSE); 482 } 483 484 DEMDEBUG("%s: num=%llu", __func__, v); 485 486 *valp = v; 487 sv_consume_n(svp, ndigits); 488 return (B_TRUE); 489 } 490 491 static boolean_t 492 rustdem_parse_special(rustdem_state_t *restrict st, strview_t *restrict svp) 493 { 494 if (st->rds_error != 0) 495 return (B_FALSE); 496 497 if (sv_peek(svp, 0) != '$') 498 return (B_FALSE); 499 500 for (size_t i = 0; i < rust_charmap_sz; i++) { 501 if (sv_consume_if(svp, rust_charmap[i].ruc_seq)) { 502 if (!rustdem_append_c(st, rust_charmap[i].ruc_ch)) 503 return (B_FALSE); 504 return (B_TRUE); 505 } 506 } 507 return (B_FALSE); 508 } 509 510 static boolean_t 511 rustdem_add_sep(rustdem_state_t *st) 512 { 513 if (st->rds_error != 0) 514 return (B_FALSE); 515 516 if (!rustdem_append_c(st, ':') || 517 !rustdem_append_c(st, ':')) 518 return (B_FALSE); 519 520 return (B_TRUE); 521 } 522 523 static boolean_t 524 rustdem_append_c(rustdem_state_t *st, char c) 525 { 526 if (st->rds_error != 0) 527 return (B_FALSE); 528 529 if (custr_appendc(st->rds_demangled, c) == 0) 530 return (B_TRUE); 531 532 st->rds_error = errno; 533 return (B_FALSE); 534 } 535 536 static boolean_t 537 rustdem_all_ascii(const strview_t *svp) 538 { 539 strview_t p; 540 541 sv_init_sv(&p, svp); 542 543 while (sv_remaining(&p) > 0) { 544 char c = sv_consume_c(&p); 545 546 /* 547 * #including <sys/ctype.h> conflicts with <ctype.h>. Since 548 * we want the C locale macros (ISDIGIT, etc), it also means 549 * we can't use isascii(3C). 550 */ 551 if ((c & 0x80) != 0) { 552 DEMDEBUG("%s: found non-ascii character 0x%02hhx at " 553 "offset %tu", __func__, c, 554 (ptrdiff_t)(p.sv_first - svp->sv_first)); 555 return (B_FALSE); 556 } 557 } 558 return (B_TRUE); 559 } 560 561 static void * 562 rustdem_alloc(custr_alloc_t *cao, size_t len) 563 { 564 rustdem_state_t *st = cao->cua_arg; 565 return (zalloc(st->rds_ops, len)); 566 } 567 568 static void 569 rustdem_free(custr_alloc_t *cao, void *p, size_t len) 570 { 571 rustdem_state_t *st = cao->cua_arg; 572 xfree(st->rds_ops, p, len); 573 } 574