1 //===- llvm/MC/MCDisassembler.h - Disassembler interface --------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef LLVM_MC_MCDISASSEMBLER_MCDISASSEMBLER_H
10 #define LLVM_MC_MCDISASSEMBLER_MCDISASSEMBLER_H
11 
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/XCOFF.h"
#include "llvm/MC/MCDisassembler/MCSymbolizer.h"
#include <cassert>
#include <cstdint>
#include <memory>
#include <tuple>
#include <vector>
19 
20 namespace llvm {
21 
22 struct XCOFFSymbolInfo {
23   Optional<XCOFF::StorageMappingClass> StorageMappingClass;
24   Optional<uint32_t> Index;
25   bool IsLabel;
26   XCOFFSymbolInfo(Optional<XCOFF::StorageMappingClass> Smc,
27                   Optional<uint32_t> Idx, bool Label)
28       : StorageMappingClass(Smc), Index(Idx), IsLabel(Label) {}
29 
30   bool operator<(const XCOFFSymbolInfo &SymInfo) const;
31 };
32 
33 struct SymbolInfoTy {
34   uint64_t Addr;
35   StringRef Name;
36   union {
37     uint8_t Type;
38     XCOFFSymbolInfo XCOFFSymInfo;
39   };
40 
41 private:
42   bool IsXCOFF;
43   bool HasType;
44 
45 public:
46   SymbolInfoTy(uint64_t Addr, StringRef Name,
47                Optional<XCOFF::StorageMappingClass> Smc, Optional<uint32_t> Idx,
48                bool Label)
49       : Addr(Addr), Name(Name), XCOFFSymInfo(Smc, Idx, Label), IsXCOFF(true),
50         HasType(false) {}
51   SymbolInfoTy(uint64_t Addr, StringRef Name, uint8_t Type,
52                bool IsXCOFF = false)
53       : Addr(Addr), Name(Name), Type(Type), IsXCOFF(IsXCOFF), HasType(true) {}
54   bool isXCOFF() const { return IsXCOFF; }
55 
56 private:
57   friend bool operator<(const SymbolInfoTy &P1, const SymbolInfoTy &P2) {
58     assert((P1.IsXCOFF == P2.IsXCOFF && P1.HasType == P2.HasType) &&
59            "The value of IsXCOFF and HasType in P1 and P2 should be the same "
60            "respectively.");
61 
62     if (P1.IsXCOFF && P1.HasType)
63       return std::tie(P1.Addr, P1.Type, P1.Name) <
64              std::tie(P2.Addr, P2.Type, P2.Name);
65 
66     if (P1.IsXCOFF)
67       return std::tie(P1.Addr, P1.XCOFFSymInfo, P1.Name) <
68              std::tie(P2.Addr, P2.XCOFFSymInfo, P2.Name);
69 
70     return std::tie(P1.Addr, P1.Name, P1.Type) <
71            std::tie(P2.Addr, P2.Name, P2.Type);
72   }
73 };
74 
75 using SectionSymbolsTy = std::vector<SymbolInfoTy>;
76 
77 template <typename T> class ArrayRef;
78 class MCContext;
79 class MCInst;
80 class MCSubtargetInfo;
81 class raw_ostream;
82 
83 /// Superclass for all disassemblers. Consumes a memory region and provides an
84 /// array of assembly instructions.
85 class MCDisassembler {
86 public:
87   /// Ternary decode status. Most backends will just use Fail and
88   /// Success, however some have a concept of an instruction with
89   /// understandable semantics but which is architecturally
90   /// incorrect. An example of this is ARM UNPREDICTABLE instructions
91   /// which are disassemblable but cause undefined behaviour.
92   ///
93   /// Because it makes sense to disassemble these instructions, there
94   /// is a "soft fail" failure mode that indicates the MCInst& is
95   /// valid but architecturally incorrect.
96   ///
97   /// The enum numbers are deliberately chosen such that reduction
98   /// from Success->SoftFail ->Fail can be done with a simple
99   /// bitwise-AND:
100   ///
101   ///   LEFT & TOP =  | Success       Unpredictable   Fail
102   ///   --------------+-----------------------------------
103   ///   Success       | Success       Unpredictable   Fail
104   ///   Unpredictable | Unpredictable Unpredictable   Fail
105   ///   Fail          | Fail          Fail            Fail
106   ///
107   /// An easy way of encoding this is as 0b11, 0b01, 0b00 for
108   /// Success, SoftFail, Fail respectively.
109   enum DecodeStatus {
110     Fail = 0,
111     SoftFail = 1,
112     Success = 3
113   };
114 
115   MCDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
116     : Ctx(Ctx), STI(STI) {}
117 
118   virtual ~MCDisassembler();
119 
120   /// Returns the disassembly of a single instruction.
121   ///
122   /// \param Instr    - An MCInst to populate with the contents of the
123   ///                   instruction.
124   /// \param Size     - A value to populate with the size of the instruction, or
125   ///                   the number of bytes consumed while attempting to decode
126   ///                   an invalid instruction.
127   /// \param Address  - The address, in the memory space of region, of the first
128   ///                   byte of the instruction.
129   /// \param Bytes    - A reference to the actual bytes of the instruction.
130   /// \param CStream  - The stream to print comments and annotations on.
131   /// \return         - MCDisassembler::Success if the instruction is valid,
132   ///                   MCDisassembler::SoftFail if the instruction was
133   ///                                            disassemblable but invalid,
134   ///                   MCDisassembler::Fail if the instruction was invalid.
135   virtual DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
136                                       ArrayRef<uint8_t> Bytes, uint64_t Address,
137                                       raw_ostream &CStream) const = 0;
138 
139   /// Used to perform separate target specific disassembly for a particular
140   /// symbol. May parse any prelude that precedes instructions after the
141   /// start of a symbol, or the entire symbol.
142   /// This is used for example by WebAssembly to decode preludes.
143   ///
144   /// Base implementation returns None. So all targets by default ignore to
145   /// treat symbols separately.
146   ///
147   /// \param Symbol   - The symbol.
148   /// \param Size     - The number of bytes consumed.
149   /// \param Address  - The address, in the memory space of region, of the first
150   ///                   byte of the symbol.
151   /// \param Bytes    - A reference to the actual bytes at the symbol location.
152   /// \param CStream  - The stream to print comments and annotations on.
153   /// \return         - MCDisassembler::Success if bytes are decoded
154   ///                   successfully. Size must hold the number of bytes that
155   ///                   were decoded.
156   ///                 - MCDisassembler::Fail if the bytes are invalid. Size
157   ///                   must hold the number of bytes that were decoded before
158   ///                   failing. The target must print nothing. This can be
159   ///                   done by buffering the output if needed.
160   ///                 - None if the target doesn't want to handle the symbol
161   ///                   separately. Value of Size is ignored in this case.
162   virtual Optional<DecodeStatus>
163   onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size, ArrayRef<uint8_t> Bytes,
164                 uint64_t Address, raw_ostream &CStream) const;
165   // TODO:
166   // Implement similar hooks that can be used at other points during
167   // disassembly. Something along the following lines:
168   // - onBeforeInstructionDecode()
169   // - onAfterInstructionDecode()
170   // - onSymbolEnd()
171   // It should help move much of the target specific code from llvm-objdump to
172   // respective target disassemblers.
173 
174   /// Suggest a distance to skip in a buffer of data to find the next
175   /// place to look for the start of an instruction. For example, if
176   /// all instructions have a fixed alignment, this might advance to
177   /// the next multiple of that alignment.
178   ///
179   /// If not overridden, the default is 1.
180   ///
181   /// \param Address  - The address, in the memory space of region, of the
182   ///                   starting point (typically the first byte of something
183   ///                   that did not decode as a valid instruction at all).
184   /// \param Bytes    - A reference to the actual bytes at Address. May be
185   ///                   needed in order to determine the width of an
186   ///                   unrecognized instruction (e.g. in Thumb this is a simple
187   ///                   consistent criterion that doesn't require knowing the
188   ///                   specific instruction). The caller can pass as much data
189   ///                   as they have available, and the function is required to
190   ///                   make a reasonable default choice if not enough data is
191   ///                   available to make a better one.
192   /// \return         - A number of bytes to skip. Must always be greater than
193   ///                   zero. May be greater than the size of Bytes.
194   virtual uint64_t suggestBytesToSkip(ArrayRef<uint8_t> Bytes,
195                                       uint64_t Address) const;
196 
197 private:
198   MCContext &Ctx;
199 
200 protected:
201   // Subtarget information, for instruction decoding predicates if required.
202   const MCSubtargetInfo &STI;
203   std::unique_ptr<MCSymbolizer> Symbolizer;
204 
205 public:
206   // Helpers around MCSymbolizer
207   bool tryAddingSymbolicOperand(MCInst &Inst, int64_t Value, uint64_t Address,
208                                 bool IsBranch, uint64_t Offset, uint64_t OpSize,
209                                 uint64_t InstSize) const;
210 
211   void tryAddingPcLoadReferenceComment(int64_t Value, uint64_t Address) const;
212 
213   /// Set \p Symzer as the current symbolizer.
214   /// This takes ownership of \p Symzer, and deletes the previously set one.
215   void setSymbolizer(std::unique_ptr<MCSymbolizer> Symzer);
216 
217   MCContext& getContext() const { return Ctx; }
218 
219   const MCSubtargetInfo& getSubtargetInfo() const { return STI; }
220 
221   // Marked mutable because we cache it inside the disassembler, rather than
222   // having to pass it around as an argument through all the autogenerated code.
223   mutable raw_ostream *CommentStream = nullptr;
224 };
225 
226 } // end namespace llvm
227 
228 #endif // LLVM_MC_MCDISASSEMBLER_MCDISASSEMBLER_H
229