1 //===--- RustDemangle.cpp ---------------------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines a demangler for Rust v0 mangled symbols as specified in
10 // https://rust-lang.github.io/rfcs/2603-rust-symbol-name-mangling-v0.html
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/Demangle/Demangle.h"
15 #include "llvm/Demangle/StringView.h"
16 #include "llvm/Demangle/Utility.h"
17 
18 #include <algorithm>
19 #include <cassert>
20 #include <cstdint>
21 #include <cstring>
22 #include <limits>
23 
24 using namespace llvm;
25 
26 using llvm::itanium_demangle::OutputStream;
27 using llvm::itanium_demangle::StringView;
28 using llvm::itanium_demangle::SwapAndRestore;
29 
30 namespace {
31 
32 struct Identifier {
33   StringView Name;
34   bool Punycode;
35 
empty__anonadfcf2c00111::Identifier36   bool empty() const { return Name.empty(); }
37 };
38 
39 enum class BasicType {
40   Bool,
41   Char,
42   I8,
43   I16,
44   I32,
45   I64,
46   I128,
47   ISize,
48   U8,
49   U16,
50   U32,
51   U64,
52   U128,
53   USize,
54   F32,
55   F64,
56   Str,
57   Placeholder,
58   Unit,
59   Variadic,
60   Never,
61 };
62 
63 enum class IsInType {
64   No,
65   Yes,
66 };
67 
68 enum class LeaveGenericsOpen {
69   No,
70   Yes,
71 };
72 
73 class Demangler {
74   // Maximum recursion level. Used to avoid stack overflow.
75   size_t MaxRecursionLevel;
76   // Current recursion level.
77   size_t RecursionLevel;
78   size_t BoundLifetimes;
79   // Input string that is being demangled with "_R" prefix removed.
80   StringView Input;
81   // Position in the input string.
82   size_t Position;
83   // When true, print methods append the output to the stream.
84   // When false, the output is suppressed.
85   bool Print;
86   // True if an error occurred.
87   bool Error;
88 
89 public:
90   // Demangled output.
91   OutputStream Output;
92 
93   Demangler(size_t MaxRecursionLevel = 500);
94 
95   bool demangle(StringView MangledName);
96 
97 private:
98   bool demanglePath(IsInType Type,
99                     LeaveGenericsOpen LeaveOpen = LeaveGenericsOpen::No);
100   void demangleImplPath(IsInType InType);
101   void demangleGenericArg();
102   void demangleType();
103   void demangleFnSig();
104   void demangleDynBounds();
105   void demangleDynTrait();
106   void demangleOptionalBinder();
107   void demangleConst();
108   void demangleConstInt();
109   void demangleConstBool();
110   void demangleConstChar();
111 
demangleBackref(Callable Demangler)112   template <typename Callable> void demangleBackref(Callable Demangler) {
113     uint64_t Backref = parseBase62Number();
114     if (Error || Backref >= Position) {
115       Error = true;
116       return;
117     }
118 
119     if (!Print)
120       return;
121 
122     SwapAndRestore<size_t> SavePosition(Position, Position);
123     Position = Backref;
124     Demangler();
125   }
126 
127   Identifier parseIdentifier();
128   uint64_t parseOptionalBase62Number(char Tag);
129   uint64_t parseBase62Number();
130   uint64_t parseDecimalNumber();
131   uint64_t parseHexNumber(StringView &HexDigits);
132 
133   void print(char C);
134   void print(StringView S);
135   void printDecimalNumber(uint64_t N);
136   void printBasicType(BasicType);
137   void printLifetime(uint64_t Index);
138   void printIdentifier(Identifier Ident);
139 
140   char look() const;
141   char consume();
142   bool consumeIf(char Prefix);
143 
144   bool addAssign(uint64_t &A, uint64_t B);
145   bool mulAssign(uint64_t &A, uint64_t B);
146 };
147 
148 } // namespace
149 
rustDemangle(const char * MangledName,char * Buf,size_t * N,int * Status)150 char *llvm::rustDemangle(const char *MangledName, char *Buf, size_t *N,
151                          int *Status) {
152   if (MangledName == nullptr || (Buf != nullptr && N == nullptr)) {
153     if (Status != nullptr)
154       *Status = demangle_invalid_args;
155     return nullptr;
156   }
157 
158   // Return early if mangled name doesn't look like a Rust symbol.
159   StringView Mangled(MangledName);
160   if (!Mangled.startsWith("_R")) {
161     if (Status != nullptr)
162       *Status = demangle_invalid_mangled_name;
163     return nullptr;
164   }
165 
166   Demangler D;
167   if (!initializeOutputStream(nullptr, nullptr, D.Output, 1024)) {
168     if (Status != nullptr)
169       *Status = demangle_memory_alloc_failure;
170     return nullptr;
171   }
172 
173   if (!D.demangle(Mangled)) {
174     if (Status != nullptr)
175       *Status = demangle_invalid_mangled_name;
176     std::free(D.Output.getBuffer());
177     return nullptr;
178   }
179 
180   D.Output += '\0';
181   char *Demangled = D.Output.getBuffer();
182   size_t DemangledLen = D.Output.getCurrentPosition();
183 
184   if (Buf != nullptr) {
185     if (DemangledLen <= *N) {
186       std::memcpy(Buf, Demangled, DemangledLen);
187       std::free(Demangled);
188       Demangled = Buf;
189     } else {
190       std::free(Buf);
191     }
192   }
193 
194   if (N != nullptr)
195     *N = DemangledLen;
196 
197   if (Status != nullptr)
198     *Status = demangle_success;
199 
200   return Demangled;
201 }
202 
Demangler(size_t MaxRecursionLevel)203 Demangler::Demangler(size_t MaxRecursionLevel)
204     : MaxRecursionLevel(MaxRecursionLevel) {}
205 
isDigit(const char C)206 static inline bool isDigit(const char C) { return '0' <= C && C <= '9'; }
207 
isHexDigit(const char C)208 static inline bool isHexDigit(const char C) {
209   return ('0' <= C && C <= '9') || ('a' <= C && C <= 'f');
210 }
211 
isLower(const char C)212 static inline bool isLower(const char C) { return 'a' <= C && C <= 'z'; }
213 
isUpper(const char C)214 static inline bool isUpper(const char C) { return 'A' <= C && C <= 'Z'; }
215 
216 /// Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
isValid(const char C)217 static inline bool isValid(const char C) {
218   return isDigit(C) || isLower(C) || isUpper(C) || C == '_';
219 }
220 
221 // Demangles Rust v0 mangled symbol. Returns true when successful, and false
222 // otherwise. The demangled symbol is stored in Output field. It is
223 // responsibility of the caller to free the memory behind the output stream.
224 //
225 // <symbol-name> = "_R" <path> [<instantiating-crate>]
demangle(StringView Mangled)226 bool Demangler::demangle(StringView Mangled) {
227   Position = 0;
228   Error = false;
229   Print = true;
230   RecursionLevel = 0;
231   BoundLifetimes = 0;
232 
233   if (!Mangled.consumeFront("_R")) {
234     Error = true;
235     return false;
236   }
237   size_t Dot = Mangled.find('.');
238   Input = Mangled.substr(0, Dot);
239   StringView Suffix = Mangled.dropFront(Dot);
240 
241   demanglePath(IsInType::No);
242 
243   if (Position != Input.size()) {
244     SwapAndRestore<bool> SavePrint(Print, false);
245     demanglePath(IsInType::No);
246   }
247 
248   if (Position != Input.size())
249     Error = true;
250 
251   if (!Suffix.empty()) {
252     print(" (");
253     print(Suffix);
254     print(")");
255   }
256 
257   return !Error;
258 }
259 
260 // Demangles a path. InType indicates whether a path is inside a type. When
261 // LeaveOpen is true, a closing `>` after generic arguments is omitted from the
262 // output. Return value indicates whether generics arguments have been left
263 // open.
264 //
265 // <path> = "C" <identifier>               // crate root
266 //        | "M" <impl-path> <type>         // <T> (inherent impl)
267 //        | "X" <impl-path> <type> <path>  // <T as Trait> (trait impl)
268 //        | "Y" <type> <path>              // <T as Trait> (trait definition)
269 //        | "N" <ns> <path> <identifier>   // ...::ident (nested path)
270 //        | "I" <path> {<generic-arg>} "E" // ...<T, U> (generic args)
271 //        | <backref>
272 // <identifier> = [<disambiguator>] <undisambiguated-identifier>
273 // <ns> = "C"      // closure
274 //      | "S"      // shim
275 //      | <A-Z>    // other special namespaces
276 //      | <a-z>    // internal namespaces
demanglePath(IsInType InType,LeaveGenericsOpen LeaveOpen)277 bool Demangler::demanglePath(IsInType InType, LeaveGenericsOpen LeaveOpen) {
278   if (Error || RecursionLevel >= MaxRecursionLevel) {
279     Error = true;
280     return false;
281   }
282   SwapAndRestore<size_t> SaveRecursionLevel(RecursionLevel, RecursionLevel + 1);
283 
284   switch (consume()) {
285   case 'C': {
286     parseOptionalBase62Number('s');
287     printIdentifier(parseIdentifier());
288     break;
289   }
290   case 'M': {
291     demangleImplPath(InType);
292     print("<");
293     demangleType();
294     print(">");
295     break;
296   }
297   case 'X': {
298     demangleImplPath(InType);
299     print("<");
300     demangleType();
301     print(" as ");
302     demanglePath(IsInType::Yes);
303     print(">");
304     break;
305   }
306   case 'Y': {
307     print("<");
308     demangleType();
309     print(" as ");
310     demanglePath(IsInType::Yes);
311     print(">");
312     break;
313   }
314   case 'N': {
315     char NS = consume();
316     if (!isLower(NS) && !isUpper(NS)) {
317       Error = true;
318       break;
319     }
320     demanglePath(InType);
321 
322     uint64_t Disambiguator = parseOptionalBase62Number('s');
323     Identifier Ident = parseIdentifier();
324 
325     if (isUpper(NS)) {
326       // Special namespaces
327       print("::{");
328       if (NS == 'C')
329         print("closure");
330       else if (NS == 'S')
331         print("shim");
332       else
333         print(NS);
334       if (!Ident.empty()) {
335         print(":");
336         printIdentifier(Ident);
337       }
338       print('#');
339       printDecimalNumber(Disambiguator);
340       print('}');
341     } else {
342       // Implementation internal namespaces.
343       if (!Ident.empty()) {
344         print("::");
345         printIdentifier(Ident);
346       }
347     }
348     break;
349   }
350   case 'I': {
351     demanglePath(InType);
352     // Omit "::" when in a type, where it is optional.
353     if (InType == IsInType::No)
354       print("::");
355     print("<");
356     for (size_t I = 0; !Error && !consumeIf('E'); ++I) {
357       if (I > 0)
358         print(", ");
359       demangleGenericArg();
360     }
361     if (LeaveOpen == LeaveGenericsOpen::Yes)
362       return true;
363     else
364       print(">");
365     break;
366   }
367   case 'B': {
368     bool IsOpen = false;
369     demangleBackref([&] { IsOpen = demanglePath(InType, LeaveOpen); });
370     return IsOpen;
371   }
372   default:
373     Error = true;
374     break;
375   }
376 
377   return false;
378 }
379 
380 // <impl-path> = [<disambiguator>] <path>
381 // <disambiguator> = "s" <base-62-number>
demangleImplPath(IsInType InType)382 void Demangler::demangleImplPath(IsInType InType) {
383   SwapAndRestore<bool> SavePrint(Print, false);
384   parseOptionalBase62Number('s');
385   demanglePath(InType);
386 }
387 
388 // <generic-arg> = <lifetime>
389 //               | <type>
390 //               | "K" <const>
391 // <lifetime> = "L" <base-62-number>
demangleGenericArg()392 void Demangler::demangleGenericArg() {
393   if (consumeIf('L'))
394     printLifetime(parseBase62Number());
395   else if (consumeIf('K'))
396     demangleConst();
397   else
398     demangleType();
399 }
400 
401 // <basic-type> = "a"      // i8
402 //              | "b"      // bool
403 //              | "c"      // char
404 //              | "d"      // f64
405 //              | "e"      // str
406 //              | "f"      // f32
407 //              | "h"      // u8
408 //              | "i"      // isize
409 //              | "j"      // usize
410 //              | "l"      // i32
411 //              | "m"      // u32
412 //              | "n"      // i128
413 //              | "o"      // u128
414 //              | "s"      // i16
415 //              | "t"      // u16
416 //              | "u"      // ()
417 //              | "v"      // ...
418 //              | "x"      // i64
419 //              | "y"      // u64
420 //              | "z"      // !
421 //              | "p"      // placeholder (e.g. for generic params), shown as _
parseBasicType(char C,BasicType & Type)422 static bool parseBasicType(char C, BasicType &Type) {
423   switch (C) {
424   case 'a':
425     Type = BasicType::I8;
426     return true;
427   case 'b':
428     Type = BasicType::Bool;
429     return true;
430   case 'c':
431     Type = BasicType::Char;
432     return true;
433   case 'd':
434     Type = BasicType::F64;
435     return true;
436   case 'e':
437     Type = BasicType::Str;
438     return true;
439   case 'f':
440     Type = BasicType::F32;
441     return true;
442   case 'h':
443     Type = BasicType::U8;
444     return true;
445   case 'i':
446     Type = BasicType::ISize;
447     return true;
448   case 'j':
449     Type = BasicType::USize;
450     return true;
451   case 'l':
452     Type = BasicType::I32;
453     return true;
454   case 'm':
455     Type = BasicType::U32;
456     return true;
457   case 'n':
458     Type = BasicType::I128;
459     return true;
460   case 'o':
461     Type = BasicType::U128;
462     return true;
463   case 'p':
464     Type = BasicType::Placeholder;
465     return true;
466   case 's':
467     Type = BasicType::I16;
468     return true;
469   case 't':
470     Type = BasicType::U16;
471     return true;
472   case 'u':
473     Type = BasicType::Unit;
474     return true;
475   case 'v':
476     Type = BasicType::Variadic;
477     return true;
478   case 'x':
479     Type = BasicType::I64;
480     return true;
481   case 'y':
482     Type = BasicType::U64;
483     return true;
484   case 'z':
485     Type = BasicType::Never;
486     return true;
487   default:
488     return false;
489   }
490 }
491 
printBasicType(BasicType Type)492 void Demangler::printBasicType(BasicType Type) {
493   switch (Type) {
494   case BasicType::Bool:
495     print("bool");
496     break;
497   case BasicType::Char:
498     print("char");
499     break;
500   case BasicType::I8:
501     print("i8");
502     break;
503   case BasicType::I16:
504     print("i16");
505     break;
506   case BasicType::I32:
507     print("i32");
508     break;
509   case BasicType::I64:
510     print("i64");
511     break;
512   case BasicType::I128:
513     print("i128");
514     break;
515   case BasicType::ISize:
516     print("isize");
517     break;
518   case BasicType::U8:
519     print("u8");
520     break;
521   case BasicType::U16:
522     print("u16");
523     break;
524   case BasicType::U32:
525     print("u32");
526     break;
527   case BasicType::U64:
528     print("u64");
529     break;
530   case BasicType::U128:
531     print("u128");
532     break;
533   case BasicType::USize:
534     print("usize");
535     break;
536   case BasicType::F32:
537     print("f32");
538     break;
539   case BasicType::F64:
540     print("f64");
541     break;
542   case BasicType::Str:
543     print("str");
544     break;
545   case BasicType::Placeholder:
546     print("_");
547     break;
548   case BasicType::Unit:
549     print("()");
550     break;
551   case BasicType::Variadic:
552     print("...");
553     break;
554   case BasicType::Never:
555     print("!");
556     break;
557   }
558 }
559 
560 // <type> = | <basic-type>
561 //          | <path>                      // named type
562 //          | "A" <type> <const>          // [T; N]
563 //          | "S" <type>                  // [T]
564 //          | "T" {<type>} "E"            // (T1, T2, T3, ...)
565 //          | "R" [<lifetime>] <type>     // &T
566 //          | "Q" [<lifetime>] <type>     // &mut T
567 //          | "P" <type>                  // *const T
568 //          | "O" <type>                  // *mut T
569 //          | "F" <fn-sig>                // fn(...) -> ...
570 //          | "D" <dyn-bounds> <lifetime> // dyn Trait<Assoc = X> + Send + 'a
571 //          | <backref>                   // backref
demangleType()572 void Demangler::demangleType() {
573   if (Error || RecursionLevel >= MaxRecursionLevel) {
574     Error = true;
575     return;
576   }
577   SwapAndRestore<size_t> SaveRecursionLevel(RecursionLevel, RecursionLevel + 1);
578 
579   size_t Start = Position;
580   char C = consume();
581   BasicType Type;
582   if (parseBasicType(C, Type))
583     return printBasicType(Type);
584 
585   switch (C) {
586   case 'A':
587     print("[");
588     demangleType();
589     print("; ");
590     demangleConst();
591     print("]");
592     break;
593   case 'S':
594     print("[");
595     demangleType();
596     print("]");
597     break;
598   case 'T': {
599     print("(");
600     size_t I = 0;
601     for (; !Error && !consumeIf('E'); ++I) {
602       if (I > 0)
603         print(", ");
604       demangleType();
605     }
606     if (I == 1)
607       print(",");
608     print(")");
609     break;
610   }
611   case 'R':
612   case 'Q':
613     print('&');
614     if (consumeIf('L')) {
615       if (auto Lifetime = parseBase62Number()) {
616         printLifetime(Lifetime);
617         print(' ');
618       }
619     }
620     if (C == 'Q')
621       print("mut ");
622     demangleType();
623     break;
624   case 'P':
625     print("*const ");
626     demangleType();
627     break;
628   case 'O':
629     print("*mut ");
630     demangleType();
631     break;
632   case 'F':
633     demangleFnSig();
634     break;
635   case 'D':
636     demangleDynBounds();
637     if (consumeIf('L')) {
638       if (auto Lifetime = parseBase62Number()) {
639         print(" + ");
640         printLifetime(Lifetime);
641       }
642     } else {
643       Error = true;
644     }
645     break;
646   case 'B':
647     demangleBackref([&] { demangleType(); });
648     break;
649   default:
650     Position = Start;
651     demanglePath(IsInType::Yes);
652     break;
653   }
654 }
655 
656 // <fn-sig> := [<binder>] ["U"] ["K" <abi>] {<type>} "E" <type>
657 // <abi> = "C"
658 //       | <undisambiguated-identifier>
demangleFnSig()659 void Demangler::demangleFnSig() {
660   SwapAndRestore<size_t> SaveBoundLifetimes(BoundLifetimes, BoundLifetimes);
661   demangleOptionalBinder();
662 
663   if (consumeIf('U'))
664     print("unsafe ");
665 
666   if (consumeIf('K')) {
667     print("extern \"");
668     if (consumeIf('C')) {
669       print("C");
670     } else {
671       Identifier Ident = parseIdentifier();
672       if (Ident.Punycode)
673         Error = true;
674       for (char C : Ident.Name) {
675         // When mangling ABI string, the "-" is replaced with "_".
676         if (C == '_')
677           C = '-';
678         print(C);
679       }
680     }
681     print("\" ");
682   }
683 
684   print("fn(");
685   for (size_t I = 0; !Error && !consumeIf('E'); ++I) {
686     if (I > 0)
687       print(", ");
688     demangleType();
689   }
690   print(")");
691 
692   if (consumeIf('u')) {
693     // Skip the unit type from the output.
694   } else {
695     print(" -> ");
696     demangleType();
697   }
698 }
699 
700 // <dyn-bounds> = [<binder>] {<dyn-trait>} "E"
demangleDynBounds()701 void Demangler::demangleDynBounds() {
702   SwapAndRestore<size_t> SaveBoundLifetimes(BoundLifetimes, BoundLifetimes);
703   print("dyn ");
704   demangleOptionalBinder();
705   for (size_t I = 0; !Error && !consumeIf('E'); ++I) {
706     if (I > 0)
707       print(" + ");
708     demangleDynTrait();
709   }
710 }
711 
712 // <dyn-trait> = <path> {<dyn-trait-assoc-binding>}
713 // <dyn-trait-assoc-binding> = "p" <undisambiguated-identifier> <type>
demangleDynTrait()714 void Demangler::demangleDynTrait() {
715   bool IsOpen = demanglePath(IsInType::Yes, LeaveGenericsOpen::Yes);
716   while (!Error && consumeIf('p')) {
717     if (!IsOpen) {
718       IsOpen = true;
719       print('<');
720     } else {
721       print(", ");
722     }
723     print(parseIdentifier().Name);
724     print(" = ");
725     demangleType();
726   }
727   if (IsOpen)
728     print(">");
729 }
730 
731 // Demangles optional binder and updates the number of bound lifetimes.
732 //
733 // <binder> = "G" <base-62-number>
demangleOptionalBinder()734 void Demangler::demangleOptionalBinder() {
735   uint64_t Binder = parseOptionalBase62Number('G');
736   if (Error || Binder == 0)
737     return;
738 
739   // In valid inputs each bound lifetime is referenced later. Referencing a
740   // lifetime requires at least one byte of input. Reject inputs that are too
741   // short to reference all bound lifetimes. Otherwise demangling of invalid
742   // binders could generate excessive amounts of output.
743   if (Binder >= Input.size() - BoundLifetimes) {
744     Error = true;
745     return;
746   }
747 
748   print("for<");
749   for (size_t I = 0; I != Binder; ++I) {
750     BoundLifetimes += 1;
751     if (I > 0)
752       print(", ");
753     printLifetime(1);
754   }
755   print("> ");
756 }
757 
758 // <const> = <basic-type> <const-data>
759 //         | "p"                          // placeholder
760 //         | <backref>
demangleConst()761 void Demangler::demangleConst() {
762   if (Error || RecursionLevel >= MaxRecursionLevel) {
763     Error = true;
764     return;
765   }
766   SwapAndRestore<size_t> SaveRecursionLevel(RecursionLevel, RecursionLevel + 1);
767 
768   char C = consume();
769   BasicType Type;
770   if (parseBasicType(C, Type)) {
771     switch (Type) {
772     case BasicType::I8:
773     case BasicType::I16:
774     case BasicType::I32:
775     case BasicType::I64:
776     case BasicType::I128:
777     case BasicType::ISize:
778     case BasicType::U8:
779     case BasicType::U16:
780     case BasicType::U32:
781     case BasicType::U64:
782     case BasicType::U128:
783     case BasicType::USize:
784       demangleConstInt();
785       break;
786     case BasicType::Bool:
787       demangleConstBool();
788       break;
789     case BasicType::Char:
790       demangleConstChar();
791       break;
792     case BasicType::Placeholder:
793       print('_');
794       break;
795     default:
796       Error = true;
797       break;
798     }
799   } else if (C == 'B') {
800     demangleBackref([&] { demangleConst(); });
801   } else {
802     Error = true;
803   }
804 }
805 
806 // <const-data> = ["n"] <hex-number>
demangleConstInt()807 void Demangler::demangleConstInt() {
808   if (consumeIf('n'))
809     print('-');
810 
811   StringView HexDigits;
812   uint64_t Value = parseHexNumber(HexDigits);
813   if (HexDigits.size() <= 16) {
814     printDecimalNumber(Value);
815   } else {
816     print("0x");
817     print(HexDigits);
818   }
819 }
820 
821 // <const-data> = "0_" // false
822 //              | "1_" // true
demangleConstBool()823 void Demangler::demangleConstBool() {
824   StringView HexDigits;
825   parseHexNumber(HexDigits);
826   if (HexDigits == "0")
827     print("false");
828   else if (HexDigits == "1")
829     print("true");
830   else
831     Error = true;
832 }
833 
834 /// Returns true if CodePoint represents a printable ASCII character.
isAsciiPrintable(uint64_t CodePoint)835 static bool isAsciiPrintable(uint64_t CodePoint) {
836   return 0x20 <= CodePoint && CodePoint <= 0x7e;
837 }
838 
839 // <const-data> = <hex-number>
demangleConstChar()840 void Demangler::demangleConstChar() {
841   StringView HexDigits;
842   uint64_t CodePoint = parseHexNumber(HexDigits);
843   if (Error || HexDigits.size() > 6) {
844     Error = true;
845     return;
846   }
847 
848   print("'");
849   switch (CodePoint) {
850   case '\t':
851     print(R"(\t)");
852     break;
853   case '\r':
854     print(R"(\r)");
855     break;
856   case '\n':
857     print(R"(\n)");
858     break;
859   case '\\':
860     print(R"(\\)");
861     break;
862   case '"':
863     print(R"(")");
864     break;
865   case '\'':
866     print(R"(\')");
867     break;
868   default:
869     if (isAsciiPrintable(CodePoint)) {
870       char C = CodePoint;
871       print(C);
872     } else {
873       print(R"(\u{)");
874       print(HexDigits);
875       print('}');
876     }
877     break;
878   }
879   print('\'');
880 }
881 
882 // <undisambiguated-identifier> = ["u"] <decimal-number> ["_"] <bytes>
parseIdentifier()883 Identifier Demangler::parseIdentifier() {
884   bool Punycode = consumeIf('u');
885   uint64_t Bytes = parseDecimalNumber();
886 
887   // Underscore resolves the ambiguity when identifier starts with a decimal
888   // digit or another underscore.
889   consumeIf('_');
890 
891   if (Error || Bytes > Input.size() - Position) {
892     Error = true;
893     return {};
894   }
895   StringView S = Input.substr(Position, Bytes);
896   Position += Bytes;
897 
898   if (!std::all_of(S.begin(), S.end(), isValid)) {
899     Error = true;
900     return {};
901   }
902 
903   return {S, Punycode};
904 }
905 
906 // Parses optional base 62 number. The presence of a number is determined using
907 // Tag. Returns 0 when tag is absent and parsed value + 1 otherwise
908 //
909 // This function is indended for parsing disambiguators and binders which when
910 // not present have their value interpreted as 0, and otherwise as decoded
911 // value + 1. For example for binders, value for "G_" is 1, for "G0_" value is
912 // 2. When "G" is absent value is 0.
parseOptionalBase62Number(char Tag)913 uint64_t Demangler::parseOptionalBase62Number(char Tag) {
914   if (!consumeIf(Tag))
915     return 0;
916 
917   uint64_t N = parseBase62Number();
918   if (Error || !addAssign(N, 1))
919     return 0;
920 
921   return N;
922 }
923 
924 // Parses base 62 number with <0-9a-zA-Z> as digits. Number is terminated by
925 // "_". All values are offset by 1, so that "_" encodes 0, "0_" encodes 1,
926 // "1_" encodes 2, etc.
927 //
928 // <base-62-number> = {<0-9a-zA-Z>} "_"
parseBase62Number()929 uint64_t Demangler::parseBase62Number() {
930   if (consumeIf('_'))
931     return 0;
932 
933   uint64_t Value = 0;
934 
935   while (true) {
936     uint64_t Digit;
937     char C = consume();
938 
939     if (C == '_') {
940       break;
941     } else if (isDigit(C)) {
942       Digit = C - '0';
943     } else if (isLower(C)) {
944       Digit = 10 + (C - 'a');
945     } else if (isUpper(C)) {
946       Digit = 10 + 26 + (C - 'A');
947     } else {
948       Error = true;
949       return 0;
950     }
951 
952     if (!mulAssign(Value, 62))
953       return 0;
954 
955     if (!addAssign(Value, Digit))
956       return 0;
957   }
958 
959   if (!addAssign(Value, 1))
960     return 0;
961 
962   return Value;
963 }
964 
965 // Parses a decimal number that had been encoded without any leading zeros.
966 //
967 // <decimal-number> = "0"
968 //                  | <1-9> {<0-9>}
parseDecimalNumber()969 uint64_t Demangler::parseDecimalNumber() {
970   char C = look();
971   if (!isDigit(C)) {
972     Error = true;
973     return 0;
974   }
975 
976   if (C == '0') {
977     consume();
978     return 0;
979   }
980 
981   uint64_t Value = 0;
982 
983   while (isDigit(look())) {
984     if (!mulAssign(Value, 10)) {
985       Error = true;
986       return 0;
987     }
988 
989     uint64_t D = consume() - '0';
990     if (!addAssign(Value, D))
991       return 0;
992   }
993 
994   return Value;
995 }
996 
997 // Parses a hexadecimal number with <0-9a-f> as a digits. Returns the parsed
998 // value and stores hex digits in HexDigits. The return value is unspecified if
999 // HexDigits.size() > 16.
1000 //
1001 // <hex-number> = "0_"
1002 //              | <1-9a-f> {<0-9a-f>} "_"
parseHexNumber(StringView & HexDigits)1003 uint64_t Demangler::parseHexNumber(StringView &HexDigits) {
1004   size_t Start = Position;
1005   uint64_t Value = 0;
1006 
1007   if (!isHexDigit(look()))
1008     Error = true;
1009 
1010   if (consumeIf('0')) {
1011     if (!consumeIf('_'))
1012       Error = true;
1013   } else {
1014     while (!Error && !consumeIf('_')) {
1015       char C = consume();
1016       Value *= 16;
1017       if (isDigit(C))
1018         Value += C - '0';
1019       else if ('a' <= C && C <= 'f')
1020         Value += 10 + (C - 'a');
1021       else
1022         Error = true;
1023     }
1024   }
1025 
1026   if (Error) {
1027     HexDigits = StringView();
1028     return 0;
1029   }
1030 
1031   size_t End = Position - 1;
1032   assert(Start < End);
1033   HexDigits = Input.substr(Start, End - Start);
1034   return Value;
1035 }
1036 
print(char C)1037 void Demangler::print(char C) {
1038   if (Error || !Print)
1039     return;
1040 
1041   Output += C;
1042 }
1043 
print(StringView S)1044 void Demangler::print(StringView S) {
1045   if (Error || !Print)
1046     return;
1047 
1048   Output += S;
1049 }
1050 
printDecimalNumber(uint64_t N)1051 void Demangler::printDecimalNumber(uint64_t N) {
1052   if (Error || !Print)
1053     return;
1054 
1055   Output << N;
1056 }
1057 
1058 // Prints a lifetime. An index 0 always represents an erased lifetime. Indices
1059 // starting from 1, are De Bruijn indices, referring to higher-ranked lifetimes
1060 // bound by one of the enclosing binders.
printLifetime(uint64_t Index)1061 void Demangler::printLifetime(uint64_t Index) {
1062   if (Index == 0) {
1063     print("'_");
1064     return;
1065   }
1066 
1067   if (Index - 1 >= BoundLifetimes) {
1068     Error = true;
1069     return;
1070   }
1071 
1072   uint64_t Depth = BoundLifetimes - Index;
1073   print('\'');
1074   if (Depth < 26) {
1075     char C = 'a' + Depth;
1076     print(C);
1077   } else {
1078     print('z');
1079     printDecimalNumber(Depth - 26 + 1);
1080   }
1081 }
1082 
decodePunycodeDigit(char C,size_t & Value)1083 static inline bool decodePunycodeDigit(char C, size_t &Value) {
1084   if (isLower(C)) {
1085     Value = C - 'a';
1086     return true;
1087   }
1088 
1089   if (isDigit(C)) {
1090     Value = 26 + (C - '0');
1091     return true;
1092   }
1093 
1094   return false;
1095 }
1096 
removeNullBytes(OutputStream & Output,size_t StartIdx)1097 static void removeNullBytes(OutputStream &Output, size_t StartIdx) {
1098   char *Buffer = Output.getBuffer();
1099   char *Start = Buffer + StartIdx;
1100   char *End = Buffer + Output.getCurrentPosition();
1101   Output.setCurrentPosition(std::remove(Start, End, '\0') - Buffer);
1102 }
1103 
1104 // Encodes code point as UTF-8 and stores results in Output. Returns false if
1105 // CodePoint is not a valid unicode scalar value.
encodeUTF8(size_t CodePoint,char * Output)1106 static inline bool encodeUTF8(size_t CodePoint, char *Output) {
1107   if (0xD800 <= CodePoint && CodePoint <= 0xDFFF)
1108     return false;
1109 
1110   if (CodePoint <= 0x7F) {
1111     Output[0] = CodePoint;
1112     return true;
1113   }
1114 
1115   if (CodePoint <= 0x7FF) {
1116     Output[0] = 0xC0 | ((CodePoint >> 6) & 0x3F);
1117     Output[1] = 0x80 | (CodePoint & 0x3F);
1118     return true;
1119   }
1120 
1121   if (CodePoint <= 0xFFFF) {
1122     Output[0] = 0xE0 | (CodePoint >> 12);
1123     Output[1] = 0x80 | ((CodePoint >> 6) & 0x3F);
1124     Output[2] = 0x80 | (CodePoint & 0x3F);
1125     return true;
1126   }
1127 
1128   if (CodePoint <= 0x10FFFF) {
1129     Output[0] = 0xF0 | (CodePoint >> 18);
1130     Output[1] = 0x80 | ((CodePoint >> 12) & 0x3F);
1131     Output[2] = 0x80 | ((CodePoint >> 6) & 0x3F);
1132     Output[3] = 0x80 | (CodePoint & 0x3F);
1133     return true;
1134   }
1135 
1136   return false;
1137 }
1138 
1139 // Decodes string encoded using punycode and appends results to Output.
1140 // Returns true if decoding was successful.
decodePunycode(StringView Input,OutputStream & Output)1141 static bool decodePunycode(StringView Input, OutputStream &Output) {
1142   size_t OutputSize = Output.getCurrentPosition();
1143   size_t InputIdx = 0;
1144 
1145   // Rust uses an underscore as a delimiter.
1146   size_t DelimiterPos = StringView::npos;
1147   for (size_t I = 0; I != Input.size(); ++I)
1148     if (Input[I] == '_')
1149       DelimiterPos = I;
1150 
1151   if (DelimiterPos != StringView::npos) {
1152     // Copy basic code points before the last delimiter to the output.
1153     for (; InputIdx != DelimiterPos; ++InputIdx) {
1154       char C = Input[InputIdx];
1155       if (!isValid(C))
1156         return false;
1157       // Code points are padded with zeros while decoding is in progress.
1158       char UTF8[4] = {C};
1159       Output += StringView(UTF8, UTF8 + 4);
1160     }
1161     // Skip over the delimiter.
1162     ++InputIdx;
1163   }
1164 
1165   size_t Base = 36;
1166   size_t Skew = 38;
1167   size_t Bias = 72;
1168   size_t N = 0x80;
1169   size_t TMin = 1;
1170   size_t TMax = 26;
1171   size_t Damp = 700;
1172 
1173   auto Adapt = [&](size_t Delta, size_t NumPoints) {
1174     Delta /= Damp;
1175     Delta += Delta / NumPoints;
1176     Damp = 2;
1177 
1178     size_t K = 0;
1179     while (Delta > (Base - TMin) * TMax / 2) {
1180       Delta /= Base - TMin;
1181       K += Base;
1182     }
1183     return K + (((Base - TMin + 1) * Delta) / (Delta + Skew));
1184   };
1185 
1186   // Main decoding loop.
1187   for (size_t I = 0; InputIdx != Input.size(); I += 1) {
1188     size_t OldI = I;
1189     size_t W = 1;
1190     size_t Max = std::numeric_limits<size_t>::max();
1191     for (size_t K = Base; true; K += Base) {
1192       if (InputIdx == Input.size())
1193         return false;
1194       char C = Input[InputIdx++];
1195       size_t Digit = 0;
1196       if (!decodePunycodeDigit(C, Digit))
1197         return false;
1198 
1199       if (Digit > (Max - I) / W)
1200         return false;
1201       I += Digit * W;
1202 
1203       size_t T;
1204       if (K <= Bias)
1205         T = TMin;
1206       else if (K >= Bias + TMax)
1207         T = TMax;
1208       else
1209         T = K - Bias;
1210 
1211       if (Digit < T)
1212         break;
1213 
1214       if (W > Max / (Base - T))
1215         return false;
1216       W *= (Base - T);
1217     }
1218     size_t NumPoints = (Output.getCurrentPosition() - OutputSize) / 4 + 1;
1219     Bias = Adapt(I - OldI, NumPoints);
1220 
1221     if (I / NumPoints > Max - N)
1222       return false;
1223     N += I / NumPoints;
1224     I = I % NumPoints;
1225 
1226     // Insert N at position I in the output.
1227     char UTF8[4] = {};
1228     if (!encodeUTF8(N, UTF8))
1229       return false;
1230     Output.insert(OutputSize + I * 4, UTF8, 4);
1231   }
1232 
1233   removeNullBytes(Output, OutputSize);
1234   return true;
1235 }
1236 
printIdentifier(Identifier Ident)1237 void Demangler::printIdentifier(Identifier Ident) {
1238   if (Error || !Print)
1239     return;
1240 
1241   if (Ident.Punycode) {
1242     if (!decodePunycode(Ident.Name, Output))
1243       Error = true;
1244   } else {
1245     print(Ident.Name);
1246   }
1247 }
1248 
look() const1249 char Demangler::look() const {
1250   if (Error || Position >= Input.size())
1251     return 0;
1252 
1253   return Input[Position];
1254 }
1255 
consume()1256 char Demangler::consume() {
1257   if (Error || Position >= Input.size()) {
1258     Error = true;
1259     return 0;
1260   }
1261 
1262   return Input[Position++];
1263 }
1264 
consumeIf(char Prefix)1265 bool Demangler::consumeIf(char Prefix) {
1266   if (Error || Position >= Input.size() || Input[Position] != Prefix)
1267     return false;
1268 
1269   Position += 1;
1270   return true;
1271 }
1272 
1273 /// Computes A + B. When computation wraps around sets the error and returns
1274 /// false. Otherwise assigns the result to A and returns true.
addAssign(uint64_t & A,uint64_t B)1275 bool Demangler::addAssign(uint64_t &A, uint64_t B) {
1276   if (A > std::numeric_limits<uint64_t>::max() - B) {
1277     Error = true;
1278     return false;
1279   }
1280 
1281   A += B;
1282   return true;
1283 }
1284 
1285 /// Computes A * B. When computation wraps around sets the error and returns
1286 /// false. Otherwise assigns the result to A and returns true.
mulAssign(uint64_t & A,uint64_t B)1287 bool Demangler::mulAssign(uint64_t &A, uint64_t B) {
1288   if (B != 0 && A > std::numeric_limits<uint64_t>::max() / B) {
1289     Error = true;
1290     return false;
1291   }
1292 
1293   A *= B;
1294   return true;
1295 }
1296