1 //===-- X86Disassembler.cpp - Disassembler for x86 and x86_64 -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file is part of the X86 Disassembler.
10 // It contains code to translate the data produced by the decoder into
11 // MCInsts.
12 //
13 //
14 // The X86 disassembler is a table-driven disassembler for the 16-, 32-, and
15 // 64-bit X86 instruction sets. The main decode sequence for an assembly
16 // instruction in this disassembler is:
17 //
18 // 1. Read the prefix bytes and determine the attributes of the instruction.
19 // These attributes, recorded in enum attributeBits
20 // (X86DisassemblerDecoderCommon.h), form a bitmask. The table CONTEXTS_SYM
21 // provides a mapping from bitmasks to contexts, which are represented by
22 // enum InstructionContext (ibid.).
23 //
24 // 2. Read the opcode, and determine what kind of opcode it is. The
25 // disassembler distinguishes four kinds of opcodes, which are enumerated in
26 // OpcodeType (X86DisassemblerDecoderCommon.h): one-byte (0xnn), two-byte
27 // (0x0f 0xnn), three-byte-38 (0x0f 0x38 0xnn), or three-byte-3a
28 // (0x0f 0x3a 0xnn). Mandatory prefixes are treated as part of the context.
29 //
30 // 3. Depending on the opcode type, look in one of four ClassDecision structures
31 // (X86DisassemblerDecoderCommon.h). Use the opcode class to determine which
32 // OpcodeDecision (ibid.) to look the opcode up in. Look up the opcode, to get
33 // a ModRMDecision (ibid.).
34 //
35 // 4. Some instructions, such as escape opcodes or extended opcodes, or even
36 // instructions that have ModRM*Reg / ModRM*Mem forms in LLVM, need the
37 // ModR/M byte to complete decode. The ModRMDecision's type is an entry from
38 // ModRMDecisionType (X86DisassemblerDecoderCommon.h) that indicates if the
39 // ModR/M byte is required and how to interpret it.
40 //
41 // 5. After resolving the ModRMDecision, the disassembler has a unique ID
42 // of type InstrUID (X86DisassemblerDecoderCommon.h). Looking this ID up in
43 // INSTRUCTIONS_SYM yields the name of the instruction and the encodings and
44 // meanings of its operands.
45 //
46 // 6. For each operand, its encoding is an entry from OperandEncoding
47 // (X86DisassemblerDecoderCommon.h) and its type is an entry from
48 // OperandType (ibid.). The encoding indicates how to read it from the
49 // instruction; the type indicates how to interpret the value once it has
50 // been read. For example, a register operand could be stored in the R/M
51 // field of the ModR/M byte, the REG field of the ModR/M byte, or added to
52 // the main opcode. This is orthogonal to its meaning (a GPR or an XMM
53 // register, for instance). Given this information, the operands can be
54 // extracted and interpreted.
55 //
56 // 7. As the last step, the disassembler translates the instruction information
57 // and operands into a format understandable by the client - in this case, an
58 // MCInst for use by the MC infrastructure.
59 //
60 // The disassembler is broken broadly into two parts: the table emitter that
61 // emits the instruction decode tables discussed above during compilation, and
62 // the disassembler itself. The table emitter is documented in more detail in
63 // utils/TableGen/X86DisassemblerEmitter.h.
64 //
65 // X86Disassembler.cpp contains the code responsible for step 7, and for
66 // invoking the decoder to execute steps 1-6.
67 // X86DisassemblerDecoderCommon.h contains the definitions needed by both the
68 // table emitter and the disassembler.
69 // X86DisassemblerDecoder.h contains the public interface of the decoder,
70 // factored out into C for possible use by other projects.
71 // X86DisassemblerDecoder.c contains the source code of the decoder, which is
72 // responsible for steps 1-6.
73 //
74 //===----------------------------------------------------------------------===//
75
76 #include "MCTargetDesc/X86BaseInfo.h"
77 #include "MCTargetDesc/X86MCTargetDesc.h"
78 #include "TargetInfo/X86TargetInfo.h"
79 #include "X86DisassemblerDecoder.h"
80 #include "llvm/MC/MCContext.h"
81 #include "llvm/MC/MCDisassembler/MCDisassembler.h"
82 #include "llvm/MC/MCExpr.h"
83 #include "llvm/MC/MCInst.h"
84 #include "llvm/MC/MCInstrInfo.h"
85 #include "llvm/MC/MCSubtargetInfo.h"
86 #include "llvm/MC/TargetRegistry.h"
87 #include "llvm/Support/Debug.h"
88 #include "llvm/Support/Format.h"
89 #include "llvm/Support/raw_ostream.h"
90
91 using namespace llvm;
92 using namespace llvm::X86Disassembler;
93
// Debug category for this file (used by the LLVM_DEBUG macro).
#define DEBUG_TYPE "x86-disassembler"

// Streams a message to dbgs(), prefixed with the source line of the call
// site. Note that the expansion already ends in a semicolon.
#define debug(s)                                                               \
  LLVM_DEBUG(dbgs() << __LINE__ << ": " << s);
97
// Describes how the ModR/M byte participates in selecting an instruction for
// one opcode. After this decision is applied, exactly one instruction remains.
struct ModRMDecision {
  // How the ModR/M byte is used for the lookup (compared against the MODRM_*
  // constants by decode()).
  uint8_t modrm_type;
  // Index of the first candidate for this opcode in modRMTable; decode() adds
  // an offset derived from the ModR/M byte where the type requires one.
  uint16_t instructionIDs;
};
105
106 // Specifies which set of ModR/M->instruction tables to look at
107 // given a particular opcode.
108 struct OpcodeDecision {
109 ModRMDecision modRMDecisions[256];
110 };
111
112 // Specifies which opcode->instruction tables to look at given
113 // a particular context (set of attributes). Since there are many possible
114 // contexts, the decoder first uses CONTEXTS_SYM to determine which context
115 // applies given a specific set of attributes. Hence there are only IC_max
116 // entries in this table, rather than 2^(ATTR_max).
117 struct ContextDecision {
118 OpcodeDecision opcodeDecisions[IC_max];
119 };
120
121 #include "X86GenDisassemblerTables.inc"
122
decode(OpcodeType type,InstructionContext insnContext,uint8_t opcode,uint8_t modRM)123 static InstrUID decode(OpcodeType type, InstructionContext insnContext,
124 uint8_t opcode, uint8_t modRM) {
125 const struct ModRMDecision *dec;
126
127 switch (type) {
128 case ONEBYTE:
129 dec = &ONEBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
130 break;
131 case TWOBYTE:
132 dec = &TWOBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
133 break;
134 case THREEBYTE_38:
135 dec = &THREEBYTE38_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
136 break;
137 case THREEBYTE_3A:
138 dec = &THREEBYTE3A_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
139 break;
140 case XOP8_MAP:
141 dec = &XOP8_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
142 break;
143 case XOP9_MAP:
144 dec = &XOP9_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
145 break;
146 case XOPA_MAP:
147 dec = &XOPA_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
148 break;
149 case THREEDNOW_MAP:
150 dec =
151 &THREEDNOW_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
152 break;
153 case MAP5:
154 dec = &MAP5_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
155 break;
156 case MAP6:
157 dec = &MAP6_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
158 break;
159 }
160
161 switch (dec->modrm_type) {
162 default:
163 llvm_unreachable("Corrupt table! Unknown modrm_type");
164 return 0;
165 case MODRM_ONEENTRY:
166 return modRMTable[dec->instructionIDs];
167 case MODRM_SPLITRM:
168 if (modFromModRM(modRM) == 0x3)
169 return modRMTable[dec->instructionIDs + 1];
170 return modRMTable[dec->instructionIDs];
171 case MODRM_SPLITREG:
172 if (modFromModRM(modRM) == 0x3)
173 return modRMTable[dec->instructionIDs + ((modRM & 0x38) >> 3) + 8];
174 return modRMTable[dec->instructionIDs + ((modRM & 0x38) >> 3)];
175 case MODRM_SPLITMISC:
176 if (modFromModRM(modRM) == 0x3)
177 return modRMTable[dec->instructionIDs + (modRM & 0x3f) + 8];
178 return modRMTable[dec->instructionIDs + ((modRM & 0x38) >> 3)];
179 case MODRM_FULL:
180 return modRMTable[dec->instructionIDs + modRM];
181 }
182 }
183
peek(struct InternalInstruction * insn,uint8_t & byte)184 static bool peek(struct InternalInstruction *insn, uint8_t &byte) {
185 uint64_t offset = insn->readerCursor - insn->startLocation;
186 if (offset >= insn->bytes.size())
187 return true;
188 byte = insn->bytes[offset];
189 return false;
190 }
191
consume(InternalInstruction * insn,T & ptr)192 template <typename T> static bool consume(InternalInstruction *insn, T &ptr) {
193 auto r = insn->bytes;
194 uint64_t offset = insn->readerCursor - insn->startLocation;
195 if (offset + sizeof(T) > r.size())
196 return true;
197 ptr = support::endian::read<T>(&r[offset], support::little);
198 insn->readerCursor += sizeof(T);
199 return false;
200 }
201
isREX(struct InternalInstruction * insn,uint8_t prefix)202 static bool isREX(struct InternalInstruction *insn, uint8_t prefix) {
203 return insn->mode == MODE_64BIT && prefix >= 0x40 && prefix <= 0x4f;
204 }
205
206 // Consumes all of an instruction's prefix bytes, and marks the
207 // instruction as having them. Also sets the instruction's default operand,
208 // address, and other relevant data sizes to report operands correctly.
209 //
210 // insn must not be empty.
readPrefixes(struct InternalInstruction * insn)211 static int readPrefixes(struct InternalInstruction *insn) {
212 bool isPrefix = true;
213 uint8_t byte = 0;
214 uint8_t nextByte;
215
216 LLVM_DEBUG(dbgs() << "readPrefixes()");
217
218 while (isPrefix) {
219 // If we fail reading prefixes, just stop here and let the opcode reader
220 // deal with it.
221 if (consume(insn, byte))
222 break;
223
224 // If the byte is a LOCK/REP/REPNE prefix and not a part of the opcode, then
225 // break and let it be disassembled as a normal "instruction".
226 if (insn->readerCursor - 1 == insn->startLocation && byte == 0xf0) // LOCK
227 break;
228
229 if ((byte == 0xf2 || byte == 0xf3) && !peek(insn, nextByte)) {
230 // If the byte is 0xf2 or 0xf3, and any of the following conditions are
231 // met:
232 // - it is followed by a LOCK (0xf0) prefix
233 // - it is followed by an xchg instruction
234 // then it should be disassembled as a xacquire/xrelease not repne/rep.
235 if (((nextByte == 0xf0) ||
236 ((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90))) {
237 insn->xAcquireRelease = true;
238 if (!(byte == 0xf3 && nextByte == 0x90)) // PAUSE instruction support
239 break;
240 }
241 // Also if the byte is 0xf3, and the following condition is met:
242 // - it is followed by a "mov mem, reg" (opcode 0x88/0x89) or
243 // "mov mem, imm" (opcode 0xc6/0xc7) instructions.
244 // then it should be disassembled as an xrelease not rep.
245 if (byte == 0xf3 && (nextByte == 0x88 || nextByte == 0x89 ||
246 nextByte == 0xc6 || nextByte == 0xc7)) {
247 insn->xAcquireRelease = true;
248 break;
249 }
250 if (isREX(insn, nextByte)) {
251 uint8_t nnextByte;
252 // Go to REX prefix after the current one
253 if (consume(insn, nnextByte))
254 return -1;
255 // We should be able to read next byte after REX prefix
256 if (peek(insn, nnextByte))
257 return -1;
258 --insn->readerCursor;
259 }
260 }
261
262 switch (byte) {
263 case 0xf0: // LOCK
264 insn->hasLockPrefix = true;
265 break;
266 case 0xf2: // REPNE/REPNZ
267 case 0xf3: { // REP or REPE/REPZ
268 uint8_t nextByte;
269 if (peek(insn, nextByte))
270 break;
271 // TODO:
272 // 1. There could be several 0x66
273 // 2. if (nextByte == 0x66) and nextNextByte != 0x0f then
274 // it's not mandatory prefix
275 // 3. if (nextByte >= 0x40 && nextByte <= 0x4f) it's REX and we need
276 // 0x0f exactly after it to be mandatory prefix
277 if (isREX(insn, nextByte) || nextByte == 0x0f || nextByte == 0x66)
278 // The last of 0xf2 /0xf3 is mandatory prefix
279 insn->mandatoryPrefix = byte;
280 insn->repeatPrefix = byte;
281 break;
282 }
283 case 0x2e: // CS segment override -OR- Branch not taken
284 insn->segmentOverride = SEG_OVERRIDE_CS;
285 break;
286 case 0x36: // SS segment override -OR- Branch taken
287 insn->segmentOverride = SEG_OVERRIDE_SS;
288 break;
289 case 0x3e: // DS segment override
290 insn->segmentOverride = SEG_OVERRIDE_DS;
291 break;
292 case 0x26: // ES segment override
293 insn->segmentOverride = SEG_OVERRIDE_ES;
294 break;
295 case 0x64: // FS segment override
296 insn->segmentOverride = SEG_OVERRIDE_FS;
297 break;
298 case 0x65: // GS segment override
299 insn->segmentOverride = SEG_OVERRIDE_GS;
300 break;
301 case 0x66: { // Operand-size override {
302 uint8_t nextByte;
303 insn->hasOpSize = true;
304 if (peek(insn, nextByte))
305 break;
306 // 0x66 can't overwrite existing mandatory prefix and should be ignored
307 if (!insn->mandatoryPrefix && (nextByte == 0x0f || isREX(insn, nextByte)))
308 insn->mandatoryPrefix = byte;
309 break;
310 }
311 case 0x67: // Address-size override
312 insn->hasAdSize = true;
313 break;
314 default: // Not a prefix byte
315 isPrefix = false;
316 break;
317 }
318
319 if (isPrefix)
320 LLVM_DEBUG(dbgs() << format("Found prefix 0x%hhx", byte));
321 }
322
323 insn->vectorExtensionType = TYPE_NO_VEX_XOP;
324
325 if (byte == 0x62) {
326 uint8_t byte1, byte2;
327 if (consume(insn, byte1)) {
328 LLVM_DEBUG(dbgs() << "Couldn't read second byte of EVEX prefix");
329 return -1;
330 }
331
332 if (peek(insn, byte2)) {
333 LLVM_DEBUG(dbgs() << "Couldn't read third byte of EVEX prefix");
334 return -1;
335 }
336
337 if ((insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) &&
338 ((~byte1 & 0x8) == 0x8) && ((byte2 & 0x4) == 0x4)) {
339 insn->vectorExtensionType = TYPE_EVEX;
340 } else {
341 --insn->readerCursor; // unconsume byte1
342 --insn->readerCursor; // unconsume byte
343 }
344
345 if (insn->vectorExtensionType == TYPE_EVEX) {
346 insn->vectorExtensionPrefix[0] = byte;
347 insn->vectorExtensionPrefix[1] = byte1;
348 if (consume(insn, insn->vectorExtensionPrefix[2])) {
349 LLVM_DEBUG(dbgs() << "Couldn't read third byte of EVEX prefix");
350 return -1;
351 }
352 if (consume(insn, insn->vectorExtensionPrefix[3])) {
353 LLVM_DEBUG(dbgs() << "Couldn't read fourth byte of EVEX prefix");
354 return -1;
355 }
356
357 // We simulate the REX prefix for simplicity's sake
358 if (insn->mode == MODE_64BIT) {
359 insn->rexPrefix = 0x40 |
360 (wFromEVEX3of4(insn->vectorExtensionPrefix[2]) << 3) |
361 (rFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 2) |
362 (xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 1) |
363 (bFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 0);
364 }
365
366 LLVM_DEBUG(
367 dbgs() << format(
368 "Found EVEX prefix 0x%hhx 0x%hhx 0x%hhx 0x%hhx",
369 insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1],
370 insn->vectorExtensionPrefix[2], insn->vectorExtensionPrefix[3]));
371 }
372 } else if (byte == 0xc4) {
373 uint8_t byte1;
374 if (peek(insn, byte1)) {
375 LLVM_DEBUG(dbgs() << "Couldn't read second byte of VEX");
376 return -1;
377 }
378
379 if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0)
380 insn->vectorExtensionType = TYPE_VEX_3B;
381 else
382 --insn->readerCursor;
383
384 if (insn->vectorExtensionType == TYPE_VEX_3B) {
385 insn->vectorExtensionPrefix[0] = byte;
386 consume(insn, insn->vectorExtensionPrefix[1]);
387 consume(insn, insn->vectorExtensionPrefix[2]);
388
389 // We simulate the REX prefix for simplicity's sake
390
391 if (insn->mode == MODE_64BIT)
392 insn->rexPrefix = 0x40 |
393 (wFromVEX3of3(insn->vectorExtensionPrefix[2]) << 3) |
394 (rFromVEX2of3(insn->vectorExtensionPrefix[1]) << 2) |
395 (xFromVEX2of3(insn->vectorExtensionPrefix[1]) << 1) |
396 (bFromVEX2of3(insn->vectorExtensionPrefix[1]) << 0);
397
398 LLVM_DEBUG(dbgs() << format("Found VEX prefix 0x%hhx 0x%hhx 0x%hhx",
399 insn->vectorExtensionPrefix[0],
400 insn->vectorExtensionPrefix[1],
401 insn->vectorExtensionPrefix[2]));
402 }
403 } else if (byte == 0xc5) {
404 uint8_t byte1;
405 if (peek(insn, byte1)) {
406 LLVM_DEBUG(dbgs() << "Couldn't read second byte of VEX");
407 return -1;
408 }
409
410 if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0)
411 insn->vectorExtensionType = TYPE_VEX_2B;
412 else
413 --insn->readerCursor;
414
415 if (insn->vectorExtensionType == TYPE_VEX_2B) {
416 insn->vectorExtensionPrefix[0] = byte;
417 consume(insn, insn->vectorExtensionPrefix[1]);
418
419 if (insn->mode == MODE_64BIT)
420 insn->rexPrefix =
421 0x40 | (rFromVEX2of2(insn->vectorExtensionPrefix[1]) << 2);
422
423 switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) {
424 default:
425 break;
426 case VEX_PREFIX_66:
427 insn->hasOpSize = true;
428 break;
429 }
430
431 LLVM_DEBUG(dbgs() << format("Found VEX prefix 0x%hhx 0x%hhx",
432 insn->vectorExtensionPrefix[0],
433 insn->vectorExtensionPrefix[1]));
434 }
435 } else if (byte == 0x8f) {
436 uint8_t byte1;
437 if (peek(insn, byte1)) {
438 LLVM_DEBUG(dbgs() << "Couldn't read second byte of XOP");
439 return -1;
440 }
441
442 if ((byte1 & 0x38) != 0x0) // 0 in these 3 bits is a POP instruction.
443 insn->vectorExtensionType = TYPE_XOP;
444 else
445 --insn->readerCursor;
446
447 if (insn->vectorExtensionType == TYPE_XOP) {
448 insn->vectorExtensionPrefix[0] = byte;
449 consume(insn, insn->vectorExtensionPrefix[1]);
450 consume(insn, insn->vectorExtensionPrefix[2]);
451
452 // We simulate the REX prefix for simplicity's sake
453
454 if (insn->mode == MODE_64BIT)
455 insn->rexPrefix = 0x40 |
456 (wFromXOP3of3(insn->vectorExtensionPrefix[2]) << 3) |
457 (rFromXOP2of3(insn->vectorExtensionPrefix[1]) << 2) |
458 (xFromXOP2of3(insn->vectorExtensionPrefix[1]) << 1) |
459 (bFromXOP2of3(insn->vectorExtensionPrefix[1]) << 0);
460
461 switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) {
462 default:
463 break;
464 case VEX_PREFIX_66:
465 insn->hasOpSize = true;
466 break;
467 }
468
469 LLVM_DEBUG(dbgs() << format("Found XOP prefix 0x%hhx 0x%hhx 0x%hhx",
470 insn->vectorExtensionPrefix[0],
471 insn->vectorExtensionPrefix[1],
472 insn->vectorExtensionPrefix[2]));
473 }
474 } else if (isREX(insn, byte)) {
475 if (peek(insn, nextByte))
476 return -1;
477 insn->rexPrefix = byte;
478 LLVM_DEBUG(dbgs() << format("Found REX prefix 0x%hhx", byte));
479 } else
480 --insn->readerCursor;
481
482 if (insn->mode == MODE_16BIT) {
483 insn->registerSize = (insn->hasOpSize ? 4 : 2);
484 insn->addressSize = (insn->hasAdSize ? 4 : 2);
485 insn->displacementSize = (insn->hasAdSize ? 4 : 2);
486 insn->immediateSize = (insn->hasOpSize ? 4 : 2);
487 } else if (insn->mode == MODE_32BIT) {
488 insn->registerSize = (insn->hasOpSize ? 2 : 4);
489 insn->addressSize = (insn->hasAdSize ? 2 : 4);
490 insn->displacementSize = (insn->hasAdSize ? 2 : 4);
491 insn->immediateSize = (insn->hasOpSize ? 2 : 4);
492 } else if (insn->mode == MODE_64BIT) {
493 insn->displacementSize = 4;
494 if (insn->rexPrefix && wFromREX(insn->rexPrefix)) {
495 insn->registerSize = 8;
496 insn->addressSize = (insn->hasAdSize ? 4 : 8);
497 insn->immediateSize = 4;
498 insn->hasOpSize = false;
499 } else {
500 insn->registerSize = (insn->hasOpSize ? 2 : 4);
501 insn->addressSize = (insn->hasAdSize ? 4 : 8);
502 insn->immediateSize = (insn->hasOpSize ? 2 : 4);
503 }
504 }
505
506 return 0;
507 }
508
509 // Consumes the SIB byte to determine addressing information.
readSIB(struct InternalInstruction * insn)510 static int readSIB(struct InternalInstruction *insn) {
511 SIBBase sibBaseBase = SIB_BASE_NONE;
512 uint8_t index, base;
513
514 LLVM_DEBUG(dbgs() << "readSIB()");
515 switch (insn->addressSize) {
516 case 2:
517 default:
518 llvm_unreachable("SIB-based addressing doesn't work in 16-bit mode");
519 case 4:
520 insn->sibIndexBase = SIB_INDEX_EAX;
521 sibBaseBase = SIB_BASE_EAX;
522 break;
523 case 8:
524 insn->sibIndexBase = SIB_INDEX_RAX;
525 sibBaseBase = SIB_BASE_RAX;
526 break;
527 }
528
529 if (consume(insn, insn->sib))
530 return -1;
531
532 index = indexFromSIB(insn->sib) | (xFromREX(insn->rexPrefix) << 3);
533
534 if (index == 0x4) {
535 insn->sibIndex = SIB_INDEX_NONE;
536 } else {
537 insn->sibIndex = (SIBIndex)(insn->sibIndexBase + index);
538 }
539
540 insn->sibScale = 1 << scaleFromSIB(insn->sib);
541
542 base = baseFromSIB(insn->sib) | (bFromREX(insn->rexPrefix) << 3);
543
544 switch (base) {
545 case 0x5:
546 case 0xd:
547 switch (modFromModRM(insn->modRM)) {
548 case 0x0:
549 insn->eaDisplacement = EA_DISP_32;
550 insn->sibBase = SIB_BASE_NONE;
551 break;
552 case 0x1:
553 insn->eaDisplacement = EA_DISP_8;
554 insn->sibBase = (SIBBase)(sibBaseBase + base);
555 break;
556 case 0x2:
557 insn->eaDisplacement = EA_DISP_32;
558 insn->sibBase = (SIBBase)(sibBaseBase + base);
559 break;
560 default:
561 llvm_unreachable("Cannot have Mod = 0b11 and a SIB byte");
562 }
563 break;
564 default:
565 insn->sibBase = (SIBBase)(sibBaseBase + base);
566 break;
567 }
568
569 return 0;
570 }
571
readDisplacement(struct InternalInstruction * insn)572 static int readDisplacement(struct InternalInstruction *insn) {
573 int8_t d8;
574 int16_t d16;
575 int32_t d32;
576 LLVM_DEBUG(dbgs() << "readDisplacement()");
577
578 insn->displacementOffset = insn->readerCursor - insn->startLocation;
579 switch (insn->eaDisplacement) {
580 case EA_DISP_NONE:
581 break;
582 case EA_DISP_8:
583 if (consume(insn, d8))
584 return -1;
585 insn->displacement = d8;
586 break;
587 case EA_DISP_16:
588 if (consume(insn, d16))
589 return -1;
590 insn->displacement = d16;
591 break;
592 case EA_DISP_32:
593 if (consume(insn, d32))
594 return -1;
595 insn->displacement = d32;
596 break;
597 }
598
599 return 0;
600 }
601
602 // Consumes all addressing information (ModR/M byte, SIB byte, and displacement.
readModRM(struct InternalInstruction * insn)603 static int readModRM(struct InternalInstruction *insn) {
604 uint8_t mod, rm, reg, evexrm;
605 LLVM_DEBUG(dbgs() << "readModRM()");
606
607 if (insn->consumedModRM)
608 return 0;
609
610 if (consume(insn, insn->modRM))
611 return -1;
612 insn->consumedModRM = true;
613
614 mod = modFromModRM(insn->modRM);
615 rm = rmFromModRM(insn->modRM);
616 reg = regFromModRM(insn->modRM);
617
618 // This goes by insn->registerSize to pick the correct register, which messes
619 // up if we're using (say) XMM or 8-bit register operands. That gets fixed in
620 // fixupReg().
621 switch (insn->registerSize) {
622 case 2:
623 insn->regBase = MODRM_REG_AX;
624 insn->eaRegBase = EA_REG_AX;
625 break;
626 case 4:
627 insn->regBase = MODRM_REG_EAX;
628 insn->eaRegBase = EA_REG_EAX;
629 break;
630 case 8:
631 insn->regBase = MODRM_REG_RAX;
632 insn->eaRegBase = EA_REG_RAX;
633 break;
634 }
635
636 reg |= rFromREX(insn->rexPrefix) << 3;
637 rm |= bFromREX(insn->rexPrefix) << 3;
638
639 evexrm = 0;
640 if (insn->vectorExtensionType == TYPE_EVEX && insn->mode == MODE_64BIT) {
641 reg |= r2FromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4;
642 evexrm = xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4;
643 }
644
645 insn->reg = (Reg)(insn->regBase + reg);
646
647 switch (insn->addressSize) {
648 case 2: {
649 EABase eaBaseBase = EA_BASE_BX_SI;
650
651 switch (mod) {
652 case 0x0:
653 if (rm == 0x6) {
654 insn->eaBase = EA_BASE_NONE;
655 insn->eaDisplacement = EA_DISP_16;
656 if (readDisplacement(insn))
657 return -1;
658 } else {
659 insn->eaBase = (EABase)(eaBaseBase + rm);
660 insn->eaDisplacement = EA_DISP_NONE;
661 }
662 break;
663 case 0x1:
664 insn->eaBase = (EABase)(eaBaseBase + rm);
665 insn->eaDisplacement = EA_DISP_8;
666 insn->displacementSize = 1;
667 if (readDisplacement(insn))
668 return -1;
669 break;
670 case 0x2:
671 insn->eaBase = (EABase)(eaBaseBase + rm);
672 insn->eaDisplacement = EA_DISP_16;
673 if (readDisplacement(insn))
674 return -1;
675 break;
676 case 0x3:
677 insn->eaBase = (EABase)(insn->eaRegBase + rm);
678 if (readDisplacement(insn))
679 return -1;
680 break;
681 }
682 break;
683 }
684 case 4:
685 case 8: {
686 EABase eaBaseBase = (insn->addressSize == 4 ? EA_BASE_EAX : EA_BASE_RAX);
687
688 switch (mod) {
689 case 0x0:
690 insn->eaDisplacement = EA_DISP_NONE; // readSIB may override this
691 // In determining whether RIP-relative mode is used (rm=5),
692 // or whether a SIB byte is present (rm=4),
693 // the extension bits (REX.b and EVEX.x) are ignored.
694 switch (rm & 7) {
695 case 0x4: // SIB byte is present
696 insn->eaBase = (insn->addressSize == 4 ? EA_BASE_sib : EA_BASE_sib64);
697 if (readSIB(insn) || readDisplacement(insn))
698 return -1;
699 break;
700 case 0x5: // RIP-relative
701 insn->eaBase = EA_BASE_NONE;
702 insn->eaDisplacement = EA_DISP_32;
703 if (readDisplacement(insn))
704 return -1;
705 break;
706 default:
707 insn->eaBase = (EABase)(eaBaseBase + rm);
708 break;
709 }
710 break;
711 case 0x1:
712 insn->displacementSize = 1;
713 [[fallthrough]];
714 case 0x2:
715 insn->eaDisplacement = (mod == 0x1 ? EA_DISP_8 : EA_DISP_32);
716 switch (rm & 7) {
717 case 0x4: // SIB byte is present
718 insn->eaBase = EA_BASE_sib;
719 if (readSIB(insn) || readDisplacement(insn))
720 return -1;
721 break;
722 default:
723 insn->eaBase = (EABase)(eaBaseBase + rm);
724 if (readDisplacement(insn))
725 return -1;
726 break;
727 }
728 break;
729 case 0x3:
730 insn->eaDisplacement = EA_DISP_NONE;
731 insn->eaBase = (EABase)(insn->eaRegBase + rm + evexrm);
732 break;
733 }
734 break;
735 }
736 } // switch (insn->addressSize)
737
738 return 0;
739 }
740
741 #define GENERIC_FIXUP_FUNC(name, base, prefix, mask) \
742 static uint16_t name(struct InternalInstruction *insn, OperandType type, \
743 uint8_t index, uint8_t *valid) { \
744 *valid = 1; \
745 switch (type) { \
746 default: \
747 debug("Unhandled register type"); \
748 *valid = 0; \
749 return 0; \
750 case TYPE_Rv: \
751 return base + index; \
752 case TYPE_R8: \
753 index &= mask; \
754 if (index > 0xf) \
755 *valid = 0; \
756 if (insn->rexPrefix && index >= 4 && index <= 7) { \
757 return prefix##_SPL + (index - 4); \
758 } else { \
759 return prefix##_AL + index; \
760 } \
761 case TYPE_R16: \
762 index &= mask; \
763 if (index > 0xf) \
764 *valid = 0; \
765 return prefix##_AX + index; \
766 case TYPE_R32: \
767 index &= mask; \
768 if (index > 0xf) \
769 *valid = 0; \
770 return prefix##_EAX + index; \
771 case TYPE_R64: \
772 index &= mask; \
773 if (index > 0xf) \
774 *valid = 0; \
775 return prefix##_RAX + index; \
776 case TYPE_ZMM: \
777 return prefix##_ZMM0 + index; \
778 case TYPE_YMM: \
779 return prefix##_YMM0 + index; \
780 case TYPE_XMM: \
781 return prefix##_XMM0 + index; \
782 case TYPE_TMM: \
783 if (index > 7) \
784 *valid = 0; \
785 return prefix##_TMM0 + index; \
786 case TYPE_VK: \
787 index &= 0xf; \
788 if (index > 7) \
789 *valid = 0; \
790 return prefix##_K0 + index; \
791 case TYPE_VK_PAIR: \
792 if (index > 7) \
793 *valid = 0; \
794 return prefix##_K0_K1 + (index / 2); \
795 case TYPE_MM64: \
796 return prefix##_MM0 + (index & 0x7); \
797 case TYPE_SEGMENTREG: \
798 if ((index & 7) > 5) \
799 *valid = 0; \
800 return prefix##_ES + (index & 7); \
801 case TYPE_DEBUGREG: \
802 return prefix##_DR0 + index; \
803 case TYPE_CONTROLREG: \
804 return prefix##_CR0 + index; \
805 case TYPE_MVSIBX: \
806 return prefix##_XMM0 + index; \
807 case TYPE_MVSIBY: \
808 return prefix##_YMM0 + index; \
809 case TYPE_MVSIBZ: \
810 return prefix##_ZMM0 + index; \
811 } \
812 }
813
814 // Consult an operand type to determine the meaning of the reg or R/M field. If
815 // the operand is an XMM operand, for example, an operand would be XMM0 instead
816 // of AX, which readModRM() would otherwise misinterpret it as.
817 //
818 // @param insn - The instruction containing the operand.
819 // @param type - The operand type.
820 // @param index - The existing value of the field as reported by readModRM().
821 // @param valid - The address of a uint8_t. The target is set to 1 if the
822 // field is valid for the register class; 0 if not.
823 // @return - The proper value.
824 GENERIC_FIXUP_FUNC(fixupRegValue, insn->regBase, MODRM_REG, 0x1f)
825 GENERIC_FIXUP_FUNC(fixupRMValue, insn->eaRegBase, EA_REG, 0xf)
826
827 // Consult an operand specifier to determine which of the fixup*Value functions
828 // to use in correcting readModRM()'ss interpretation.
829 //
830 // @param insn - See fixup*Value().
831 // @param op - The operand specifier.
832 // @return - 0 if fixup was successful; -1 if the register returned was
833 // invalid for its class.
fixupReg(struct InternalInstruction * insn,const struct OperandSpecifier * op)834 static int fixupReg(struct InternalInstruction *insn,
835 const struct OperandSpecifier *op) {
836 uint8_t valid;
837 LLVM_DEBUG(dbgs() << "fixupReg()");
838
839 switch ((OperandEncoding)op->encoding) {
840 default:
841 debug("Expected a REG or R/M encoding in fixupReg");
842 return -1;
843 case ENCODING_VVVV:
844 insn->vvvv =
845 (Reg)fixupRegValue(insn, (OperandType)op->type, insn->vvvv, &valid);
846 if (!valid)
847 return -1;
848 break;
849 case ENCODING_REG:
850 insn->reg = (Reg)fixupRegValue(insn, (OperandType)op->type,
851 insn->reg - insn->regBase, &valid);
852 if (!valid)
853 return -1;
854 break;
855 case ENCODING_SIB:
856 CASE_ENCODING_RM:
857 if (insn->eaBase >= insn->eaRegBase) {
858 insn->eaBase = (EABase)fixupRMValue(
859 insn, (OperandType)op->type, insn->eaBase - insn->eaRegBase, &valid);
860 if (!valid)
861 return -1;
862 }
863 break;
864 }
865
866 return 0;
867 }
868
869 // Read the opcode (except the ModR/M byte in the case of extended or escape
870 // opcodes).
readOpcode(struct InternalInstruction * insn)871 static bool readOpcode(struct InternalInstruction *insn) {
872 uint8_t current;
873 LLVM_DEBUG(dbgs() << "readOpcode()");
874
875 insn->opcodeType = ONEBYTE;
876 if (insn->vectorExtensionType == TYPE_EVEX) {
877 switch (mmmFromEVEX2of4(insn->vectorExtensionPrefix[1])) {
878 default:
879 LLVM_DEBUG(
880 dbgs() << format("Unhandled mmm field for instruction (0x%hhx)",
881 mmmFromEVEX2of4(insn->vectorExtensionPrefix[1])));
882 return true;
883 case VEX_LOB_0F:
884 insn->opcodeType = TWOBYTE;
885 return consume(insn, insn->opcode);
886 case VEX_LOB_0F38:
887 insn->opcodeType = THREEBYTE_38;
888 return consume(insn, insn->opcode);
889 case VEX_LOB_0F3A:
890 insn->opcodeType = THREEBYTE_3A;
891 return consume(insn, insn->opcode);
892 case VEX_LOB_MAP5:
893 insn->opcodeType = MAP5;
894 return consume(insn, insn->opcode);
895 case VEX_LOB_MAP6:
896 insn->opcodeType = MAP6;
897 return consume(insn, insn->opcode);
898 }
899 } else if (insn->vectorExtensionType == TYPE_VEX_3B) {
900 switch (mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1])) {
901 default:
902 LLVM_DEBUG(
903 dbgs() << format("Unhandled m-mmmm field for instruction (0x%hhx)",
904 mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1])));
905 return true;
906 case VEX_LOB_0F:
907 insn->opcodeType = TWOBYTE;
908 return consume(insn, insn->opcode);
909 case VEX_LOB_0F38:
910 insn->opcodeType = THREEBYTE_38;
911 return consume(insn, insn->opcode);
912 case VEX_LOB_0F3A:
913 insn->opcodeType = THREEBYTE_3A;
914 return consume(insn, insn->opcode);
915 case VEX_LOB_MAP5:
916 insn->opcodeType = MAP5;
917 return consume(insn, insn->opcode);
918 case VEX_LOB_MAP6:
919 insn->opcodeType = MAP6;
920 return consume(insn, insn->opcode);
921 }
922 } else if (insn->vectorExtensionType == TYPE_VEX_2B) {
923 insn->opcodeType = TWOBYTE;
924 return consume(insn, insn->opcode);
925 } else if (insn->vectorExtensionType == TYPE_XOP) {
926 switch (mmmmmFromXOP2of3(insn->vectorExtensionPrefix[1])) {
927 default:
928 LLVM_DEBUG(
929 dbgs() << format("Unhandled m-mmmm field for instruction (0x%hhx)",
930 mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1])));
931 return true;
932 case XOP_MAP_SELECT_8:
933 insn->opcodeType = XOP8_MAP;
934 return consume(insn, insn->opcode);
935 case XOP_MAP_SELECT_9:
936 insn->opcodeType = XOP9_MAP;
937 return consume(insn, insn->opcode);
938 case XOP_MAP_SELECT_A:
939 insn->opcodeType = XOPA_MAP;
940 return consume(insn, insn->opcode);
941 }
942 }
943
944 if (consume(insn, current))
945 return true;
946
947 if (current == 0x0f) {
948 LLVM_DEBUG(
949 dbgs() << format("Found a two-byte escape prefix (0x%hhx)", current));
950 if (consume(insn, current))
951 return true;
952
953 if (current == 0x38) {
954 LLVM_DEBUG(dbgs() << format("Found a three-byte escape prefix (0x%hhx)",
955 current));
956 if (consume(insn, current))
957 return true;
958
959 insn->opcodeType = THREEBYTE_38;
960 } else if (current == 0x3a) {
961 LLVM_DEBUG(dbgs() << format("Found a three-byte escape prefix (0x%hhx)",
962 current));
963 if (consume(insn, current))
964 return true;
965
966 insn->opcodeType = THREEBYTE_3A;
967 } else if (current == 0x0f) {
968 LLVM_DEBUG(
969 dbgs() << format("Found a 3dnow escape prefix (0x%hhx)", current));
970
971 // Consume operands before the opcode to comply with the 3DNow encoding
972 if (readModRM(insn))
973 return true;
974
975 if (consume(insn, current))
976 return true;
977
978 insn->opcodeType = THREEDNOW_MAP;
979 } else {
980 LLVM_DEBUG(dbgs() << "Didn't find a three-byte escape prefix");
981 insn->opcodeType = TWOBYTE;
982 }
983 } else if (insn->mandatoryPrefix)
984 // The opcode with mandatory prefix must start with opcode escape.
985 // If not it's legacy repeat prefix
986 insn->mandatoryPrefix = 0;
987
988 // At this point we have consumed the full opcode.
989 // Anything we consume from here on must be unconsumed.
990 insn->opcode = current;
991
992 return false;
993 }
994
// Decide whether the name `equiv` is the 16-bit spelling of the 32-/64-bit
// name `orig`.
//
// The names are compared position by position and must have equal length.
// A position may differ only in one of these ways:
//   - 'Q' or 'L' in orig against 'W' in equiv,
//   - '6' or '3' in orig against '1' in equiv,
//   - '4' or '2' in orig against '6' in equiv.
// Any other mismatch, or a length mismatch, yields false.
static bool is16BitEquivalent(const char *orig, const char *equiv) {
  const char *wide = orig;
  const char *narrow = equiv;

  for (; *wide != '\0' && *narrow != '\0'; ++wide, ++narrow) {
    const char w = *wide;
    const char n = *narrow;
    if (w == n)
      continue;

    const bool suffixSwap = n == 'W' && (w == 'Q' || w == 'L');
    const bool leadDigitSwap = n == '1' && (w == '6' || w == '3');
    const bool tailDigitSwap = n == '6' && (w == '4' || w == '2');
    if (!suffixSwap && !leadDigitSwap && !tailDigitSwap)
      return false;
  }

  // Equivalent only if both names ended at the same position.
  return *wide == '\0' && *narrow == '\0';
}
1013
// Report whether an instruction name marks a 64-bit instruction, i.e. whether
// the character sequence "64" occurs anywhere in `name`.
static bool is64Bit(const char *name) {
  for (const char *cursor = name; *cursor != '\0'; ++cursor)
    if (cursor[0] == '6' && cursor[1] == '4')
      return true;
  return false;
}
1023
1024 // Determine the ID of an instruction, consuming the ModR/M byte as appropriate
1025 // for extended and escape opcodes, and using a supplied attribute mask.
getInstructionIDWithAttrMask(uint16_t * instructionID,struct InternalInstruction * insn,uint16_t attrMask)1026 static int getInstructionIDWithAttrMask(uint16_t *instructionID,
1027 struct InternalInstruction *insn,
1028 uint16_t attrMask) {
1029 auto insnCtx = InstructionContext(x86DisassemblerContexts[attrMask]);
1030 const ContextDecision *decision;
1031 switch (insn->opcodeType) {
1032 case ONEBYTE:
1033 decision = &ONEBYTE_SYM;
1034 break;
1035 case TWOBYTE:
1036 decision = &TWOBYTE_SYM;
1037 break;
1038 case THREEBYTE_38:
1039 decision = &THREEBYTE38_SYM;
1040 break;
1041 case THREEBYTE_3A:
1042 decision = &THREEBYTE3A_SYM;
1043 break;
1044 case XOP8_MAP:
1045 decision = &XOP8_MAP_SYM;
1046 break;
1047 case XOP9_MAP:
1048 decision = &XOP9_MAP_SYM;
1049 break;
1050 case XOPA_MAP:
1051 decision = &XOPA_MAP_SYM;
1052 break;
1053 case THREEDNOW_MAP:
1054 decision = &THREEDNOW_MAP_SYM;
1055 break;
1056 case MAP5:
1057 decision = &MAP5_SYM;
1058 break;
1059 case MAP6:
1060 decision = &MAP6_SYM;
1061 break;
1062 }
1063
1064 if (decision->opcodeDecisions[insnCtx]
1065 .modRMDecisions[insn->opcode]
1066 .modrm_type != MODRM_ONEENTRY) {
1067 if (readModRM(insn))
1068 return -1;
1069 *instructionID =
1070 decode(insn->opcodeType, insnCtx, insn->opcode, insn->modRM);
1071 } else {
1072 *instructionID = decode(insn->opcodeType, insnCtx, insn->opcode, 0);
1073 }
1074
1075 return 0;
1076 }
1077
// Determine the ID of an instruction, consuming the ModR/M byte as appropriate
// for extended and escape opcodes. Determines the attributes and context for
// the instruction before doing so.
//
// The function first builds an attribute mask from the mode, the
// VEX/EVEX/XOP prefix fields or the legacy/mandatory prefixes, and REX.W. It
// then performs the table lookup and finally applies several special cases
// that re-run the lookup with an adjusted mask or opcode.
//
// @param insn - The instruction being decoded. On success its instructionID
//               and spec fields are set.
// @param mii  - Instruction info; used to fetch instruction names when
//               choosing between two candidate table entries.
// @return     - 0 on success; -1 if the vector extension type is not one of
//               the handled kinds or a required table lookup fails.
static int getInstructionID(struct InternalInstruction *insn,
                            const MCInstrInfo *mii) {
  uint16_t attrMask;
  uint16_t instructionID;

  LLVM_DEBUG(dbgs() << "getID()");

  attrMask = ATTR_NONE;

  if (insn->mode == MODE_64BIT)
    attrMask |= ATTR_64BIT;

  // Step 1: derive attributes from the prefix bytes.
  if (insn->vectorExtensionType != TYPE_NO_VEX_XOP) {
    attrMask |= (insn->vectorExtensionType == TYPE_EVEX) ? ATTR_EVEX : ATTR_VEX;

    if (insn->vectorExtensionType == TYPE_EVEX) {
      // The pp field stands in for the 66/F3/F2 prefixes.
      switch (ppFromEVEX3of4(insn->vectorExtensionPrefix[2])) {
      case VEX_PREFIX_66:
        attrMask |= ATTR_OPSIZE;
        break;
      case VEX_PREFIX_F3:
        attrMask |= ATTR_XS;
        break;
      case VEX_PREFIX_F2:
        attrMask |= ATTR_XD;
        break;
      }

      // Remaining attributes come from the fourth EVEX prefix byte.
      if (zFromEVEX4of4(insn->vectorExtensionPrefix[3]))
        attrMask |= ATTR_EVEXKZ;
      if (bFromEVEX4of4(insn->vectorExtensionPrefix[3]))
        attrMask |= ATTR_EVEXB;
      if (aaaFromEVEX4of4(insn->vectorExtensionPrefix[3]))
        attrMask |= ATTR_EVEXK;
      if (lFromEVEX4of4(insn->vectorExtensionPrefix[3]))
        attrMask |= ATTR_VEXL;
      if (l2FromEVEX4of4(insn->vectorExtensionPrefix[3]))
        attrMask |= ATTR_EVEXL2;
    } else if (insn->vectorExtensionType == TYPE_VEX_3B) {
      switch (ppFromVEX3of3(insn->vectorExtensionPrefix[2])) {
      case VEX_PREFIX_66:
        attrMask |= ATTR_OPSIZE;
        break;
      case VEX_PREFIX_F3:
        attrMask |= ATTR_XS;
        break;
      case VEX_PREFIX_F2:
        attrMask |= ATTR_XD;
        break;
      }

      if (lFromVEX3of3(insn->vectorExtensionPrefix[2]))
        attrMask |= ATTR_VEXL;
    } else if (insn->vectorExtensionType == TYPE_VEX_2B) {
      switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) {
      case VEX_PREFIX_66:
        attrMask |= ATTR_OPSIZE;
        // Only this VEX form also honours a legacy address-size prefix.
        if (insn->hasAdSize)
          attrMask |= ATTR_ADSIZE;
        break;
      case VEX_PREFIX_F3:
        attrMask |= ATTR_XS;
        break;
      case VEX_PREFIX_F2:
        attrMask |= ATTR_XD;
        break;
      }

      if (lFromVEX2of2(insn->vectorExtensionPrefix[1]))
        attrMask |= ATTR_VEXL;
    } else if (insn->vectorExtensionType == TYPE_XOP) {
      switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) {
      case VEX_PREFIX_66:
        attrMask |= ATTR_OPSIZE;
        break;
      case VEX_PREFIX_F3:
        attrMask |= ATTR_XS;
        break;
      case VEX_PREFIX_F2:
        attrMask |= ATTR_XD;
        break;
      }

      if (lFromXOP3of3(insn->vectorExtensionPrefix[2]))
        attrMask |= ATTR_VEXL;
    } else {
      // Unknown vector extension type.
      return -1;
    }
  } else if (!insn->mandatoryPrefix) {
    // If we don't have mandatory prefix we should use legacy prefixes here
    if (insn->hasOpSize && (insn->mode != MODE_16BIT))
      attrMask |= ATTR_OPSIZE;
    if (insn->hasAdSize)
      attrMask |= ATTR_ADSIZE;
    if (insn->opcodeType == ONEBYTE) {
      if (insn->repeatPrefix == 0xf3 && (insn->opcode == 0x90))
        // Special support for PAUSE
        attrMask |= ATTR_XS;
    } else {
      if (insn->repeatPrefix == 0xf2)
        attrMask |= ATTR_XD;
      else if (insn->repeatPrefix == 0xf3)
        attrMask |= ATTR_XS;
    }
  } else {
    // A mandatory prefix was recorded; translate it into its attribute.
    switch (insn->mandatoryPrefix) {
    case 0xf2:
      attrMask |= ATTR_XD;
      break;
    case 0xf3:
      attrMask |= ATTR_XS;
      break;
    case 0x66:
      if (insn->mode != MODE_16BIT)
        attrMask |= ATTR_OPSIZE;
      if (insn->hasAdSize)
        attrMask |= ATTR_ADSIZE;
      break;
    case 0x67:
      attrMask |= ATTR_ADSIZE;
      break;
    }
  }

  // REX.W (bit 3 of the REX prefix) sets REXW and drops ADSIZE from the mask.
  if (insn->rexPrefix & 0x08) {
    attrMask |= ATTR_REXW;
    attrMask &= ~ATTR_ADSIZE;
  }

  if (insn->mode == MODE_16BIT) {
    // JCXZ/JECXZ need special handling for 16-bit mode because the meaning
    // of the AdSize prefix is inverted w.r.t. 32-bit mode.
    if (insn->opcodeType == ONEBYTE && insn->opcode == 0xE3)
      attrMask ^= ATTR_ADSIZE;
    // If we're in 16-bit mode and this is one of the relative jumps and opsize
    // prefix isn't present, we need to force the opsize attribute since the
    // prefix is inverted relative to 32-bit mode.
    if (!insn->hasOpSize && insn->opcodeType == ONEBYTE &&
        (insn->opcode == 0xE8 || insn->opcode == 0xE9))
      attrMask |= ATTR_OPSIZE;

    // Same forcing for the two-byte opcodes 0x80-0x8F.
    if (!insn->hasOpSize && insn->opcodeType == TWOBYTE &&
        insn->opcode >= 0x80 && insn->opcode <= 0x8F)
      attrMask |= ATTR_OPSIZE;
  }


  // Step 2: the primary table lookup.
  if (getInstructionIDWithAttrMask(&instructionID, insn, attrMask))
    return -1;

  // The following clauses compensate for limitations of the tables.

  if (insn->mode != MODE_64BIT &&
      insn->vectorExtensionType != TYPE_NO_VEX_XOP) {
    // The tables can't distinguish between cases where the W-bit is used to
    // select register size and cases where it's a required part of the opcode.
    if ((insn->vectorExtensionType == TYPE_EVEX &&
         wFromEVEX3of4(insn->vectorExtensionPrefix[2])) ||
        (insn->vectorExtensionType == TYPE_VEX_3B &&
         wFromVEX3of3(insn->vectorExtensionPrefix[2])) ||
        (insn->vectorExtensionType == TYPE_XOP &&
         wFromXOP3of3(insn->vectorExtensionPrefix[2]))) {

      uint16_t instructionIDWithREXW;
      if (getInstructionIDWithAttrMask(&instructionIDWithREXW, insn,
                                       attrMask | ATTR_REXW)) {
        // The REXW lookup failed; keep the ID found without it.
        insn->instructionID = instructionID;
        insn->spec = &INSTRUCTIONS_SYM[instructionID];
        return 0;
      }

      auto SpecName = mii->getName(instructionIDWithREXW);
      // If not a 64-bit instruction. Switch the opcode.
      if (!is64Bit(SpecName.data())) {
        insn->instructionID = instructionIDWithREXW;
        insn->spec = &INSTRUCTIONS_SYM[instructionIDWithREXW];
        return 0;
      }
    }
  }

  // Absolute moves, umonitor, and movdir64b need special handling.
  // -For 16-bit mode because the meaning of the AdSize and OpSize prefixes are
  //  inverted w.r.t. 32-bit mode.
  // -For 32-bit mode we need to ensure the ADSIZE prefix is observed in
  //  any position.
  if ((insn->opcodeType == ONEBYTE && ((insn->opcode & 0xFC) == 0xA0)) ||
      (insn->opcodeType == TWOBYTE && (insn->opcode == 0xAE)) ||
      (insn->opcodeType == THREEBYTE_38 && insn->opcode == 0xF8)) {
    // Make sure we observed the prefixes in any position.
    if (insn->hasAdSize)
      attrMask |= ATTR_ADSIZE;
    if (insn->hasOpSize)
      attrMask |= ATTR_OPSIZE;

    // In 16-bit, invert the attributes.
    if (insn->mode == MODE_16BIT) {
      attrMask ^= ATTR_ADSIZE;

      // The OpSize attribute is only valid with the absolute moves.
      if (insn->opcodeType == ONEBYTE && ((insn->opcode & 0xFC) == 0xA0))
        attrMask ^= ATTR_OPSIZE;
    }

    // Redo the lookup with the adjusted mask; this result is final.
    if (getInstructionIDWithAttrMask(&instructionID, insn, attrMask))
      return -1;

    insn->instructionID = instructionID;
    insn->spec = &INSTRUCTIONS_SYM[instructionID];
    return 0;
  }

  if ((insn->mode == MODE_16BIT || insn->hasOpSize) &&
      !(attrMask & ATTR_OPSIZE)) {
    // The instruction tables make no distinction between instructions that
    // allow OpSize anywhere (i.e., 16-bit operations) and that need it in a
    // particular spot (i.e., many MMX operations). In general we're
    // conservative, but in the specific case where OpSize is present but not in
    // the right place we check if there's a 16-bit operation.
    const struct InstructionSpecifier *spec;
    uint16_t instructionIDWithOpsize;
    llvm::StringRef specName, specWithOpSizeName;

    spec = &INSTRUCTIONS_SYM[instructionID];

    if (getInstructionIDWithAttrMask(&instructionIDWithOpsize, insn,
                                     attrMask | ATTR_OPSIZE)) {
      // ModRM required with OpSize but not present. Give up and return the
      // version without OpSize set.
      insn->instructionID = instructionID;
      insn->spec = spec;
      return 0;
    }

    specName = mii->getName(instructionID);
    specWithOpSizeName = mii->getName(instructionIDWithOpsize);

    // Take the OpSize variant only if its name is the 16-bit spelling of the
    // original and exactly one of "16-bit mode" / "OpSize prefix" holds.
    if (is16BitEquivalent(specName.data(), specWithOpSizeName.data()) &&
        (insn->mode == MODE_16BIT) ^ insn->hasOpSize) {
      insn->instructionID = instructionIDWithOpsize;
      insn->spec = &INSTRUCTIONS_SYM[instructionIDWithOpsize];
    } else {
      insn->instructionID = instructionID;
      insn->spec = spec;
    }
    return 0;
  }

  if (insn->opcodeType == ONEBYTE && insn->opcode == 0x90 &&
      insn->rexPrefix & 0x01) {
    // NOOP shouldn't decode as NOOP if REX.b is set. Instead it should decode
    // as XCHG %r8, %eax.
    const struct InstructionSpecifier *spec;
    uint16_t instructionIDWithNewOpcode;
    const struct InstructionSpecifier *specWithNewOpcode;

    spec = &INSTRUCTIONS_SYM[instructionID];

    // Borrow opcode from one of the other XCHGar opcodes
    insn->opcode = 0x91;

    if (getInstructionIDWithAttrMask(&instructionIDWithNewOpcode, insn,
                                     attrMask)) {
      // Lookup with the borrowed opcode failed; restore 0x90 and keep the
      // original ID.
      insn->opcode = 0x90;

      insn->instructionID = instructionID;
      insn->spec = spec;
      return 0;
    }

    specWithNewOpcode = &INSTRUCTIONS_SYM[instructionIDWithNewOpcode];

    // Change back
    insn->opcode = 0x90;

    insn->instructionID = instructionIDWithNewOpcode;
    insn->spec = specWithNewOpcode;

    return 0;
  }

  // No special case applied: use the result of the primary lookup.
  insn->instructionID = instructionID;
  insn->spec = &INSTRUCTIONS_SYM[insn->instructionID];

  return 0;
}
1367
1368 // Read an operand from the opcode field of an instruction and interprets it
1369 // appropriately given the operand width. Handles AddRegFrm instructions.
1370 //
1371 // @param insn - the instruction whose opcode field is to be read.
1372 // @param size - The width (in bytes) of the register being specified.
1373 // 1 means AL and friends, 2 means AX, 4 means EAX, and 8 means
1374 // RAX.
1375 // @return - 0 on success; nonzero otherwise.
readOpcodeRegister(struct InternalInstruction * insn,uint8_t size)1376 static int readOpcodeRegister(struct InternalInstruction *insn, uint8_t size) {
1377 LLVM_DEBUG(dbgs() << "readOpcodeRegister()");
1378
1379 if (size == 0)
1380 size = insn->registerSize;
1381
1382 switch (size) {
1383 case 1:
1384 insn->opcodeRegister = (Reg)(
1385 MODRM_REG_AL + ((bFromREX(insn->rexPrefix) << 3) | (insn->opcode & 7)));
1386 if (insn->rexPrefix && insn->opcodeRegister >= MODRM_REG_AL + 0x4 &&
1387 insn->opcodeRegister < MODRM_REG_AL + 0x8) {
1388 insn->opcodeRegister =
1389 (Reg)(MODRM_REG_SPL + (insn->opcodeRegister - MODRM_REG_AL - 4));
1390 }
1391
1392 break;
1393 case 2:
1394 insn->opcodeRegister = (Reg)(
1395 MODRM_REG_AX + ((bFromREX(insn->rexPrefix) << 3) | (insn->opcode & 7)));
1396 break;
1397 case 4:
1398 insn->opcodeRegister =
1399 (Reg)(MODRM_REG_EAX +
1400 ((bFromREX(insn->rexPrefix) << 3) | (insn->opcode & 7)));
1401 break;
1402 case 8:
1403 insn->opcodeRegister =
1404 (Reg)(MODRM_REG_RAX +
1405 ((bFromREX(insn->rexPrefix) << 3) | (insn->opcode & 7)));
1406 break;
1407 }
1408
1409 return 0;
1410 }
1411
1412 // Consume an immediate operand from an instruction, given the desired operand
1413 // size.
1414 //
1415 // @param insn - The instruction whose operand is to be read.
1416 // @param size - The width (in bytes) of the operand.
1417 // @return - 0 if the immediate was successfully consumed; nonzero
1418 // otherwise.
readImmediate(struct InternalInstruction * insn,uint8_t size)1419 static int readImmediate(struct InternalInstruction *insn, uint8_t size) {
1420 uint8_t imm8;
1421 uint16_t imm16;
1422 uint32_t imm32;
1423 uint64_t imm64;
1424
1425 LLVM_DEBUG(dbgs() << "readImmediate()");
1426
1427 assert(insn->numImmediatesConsumed < 2 && "Already consumed two immediates");
1428
1429 insn->immediateSize = size;
1430 insn->immediateOffset = insn->readerCursor - insn->startLocation;
1431
1432 switch (size) {
1433 case 1:
1434 if (consume(insn, imm8))
1435 return -1;
1436 insn->immediates[insn->numImmediatesConsumed] = imm8;
1437 break;
1438 case 2:
1439 if (consume(insn, imm16))
1440 return -1;
1441 insn->immediates[insn->numImmediatesConsumed] = imm16;
1442 break;
1443 case 4:
1444 if (consume(insn, imm32))
1445 return -1;
1446 insn->immediates[insn->numImmediatesConsumed] = imm32;
1447 break;
1448 case 8:
1449 if (consume(insn, imm64))
1450 return -1;
1451 insn->immediates[insn->numImmediatesConsumed] = imm64;
1452 break;
1453 default:
1454 llvm_unreachable("invalid size");
1455 }
1456
1457 insn->numImmediatesConsumed++;
1458
1459 return 0;
1460 }
1461
1462 // Consume vvvv from an instruction if it has a VEX prefix.
readVVVV(struct InternalInstruction * insn)1463 static int readVVVV(struct InternalInstruction *insn) {
1464 LLVM_DEBUG(dbgs() << "readVVVV()");
1465
1466 int vvvv;
1467 if (insn->vectorExtensionType == TYPE_EVEX)
1468 vvvv = (v2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 4 |
1469 vvvvFromEVEX3of4(insn->vectorExtensionPrefix[2]));
1470 else if (insn->vectorExtensionType == TYPE_VEX_3B)
1471 vvvv = vvvvFromVEX3of3(insn->vectorExtensionPrefix[2]);
1472 else if (insn->vectorExtensionType == TYPE_VEX_2B)
1473 vvvv = vvvvFromVEX2of2(insn->vectorExtensionPrefix[1]);
1474 else if (insn->vectorExtensionType == TYPE_XOP)
1475 vvvv = vvvvFromXOP3of3(insn->vectorExtensionPrefix[2]);
1476 else
1477 return -1;
1478
1479 if (insn->mode != MODE_64BIT)
1480 vvvv &= 0xf; // Can only clear bit 4. Bit 3 must be cleared later.
1481
1482 insn->vvvv = static_cast<Reg>(vvvv);
1483 return 0;
1484 }
1485
1486 // Read an mask register from the opcode field of an instruction.
1487 //
1488 // @param insn - The instruction whose opcode field is to be read.
1489 // @return - 0 on success; nonzero otherwise.
readMaskRegister(struct InternalInstruction * insn)1490 static int readMaskRegister(struct InternalInstruction *insn) {
1491 LLVM_DEBUG(dbgs() << "readMaskRegister()");
1492
1493 if (insn->vectorExtensionType != TYPE_EVEX)
1494 return -1;
1495
1496 insn->writemask =
1497 static_cast<Reg>(aaaFromEVEX4of4(insn->vectorExtensionPrefix[3]));
1498 return 0;
1499 }
1500
// Consults the specifier for an instruction and consumes all
// operands for that instruction, interpreting them as it goes.
//
// Walks the operand set selected by insn->spec->operands and, per operand
// encoding, reads the ModR/M byte, immediates, opcode-embedded registers, the
// vvvv field or the write-mask as required.
//
// @param insn - The instruction being decoded; insn->spec must already be set.
// @return     - 0 on success; -1 if an operand cannot be read, an operand has
//               an unknown encoding, or a non-zero vvvv field is present but
//               no operand uses it.
static int readOperands(struct InternalInstruction *insn) {
  // hasVVVV: a vvvv field could be read from the prefix.
  // needVVVV: that field is non-zero and no operand has claimed it yet.
  int hasVVVV, needVVVV;
  // Set once an IB immediate typed as XMM/YMM has been read (see ENCODING_IB).
  int sawRegImm = 0;

  LLVM_DEBUG(dbgs() << "readOperands()");

  // If non-zero vvvv specified, make sure one of the operands uses it.
  hasVVVV = !readVVVV(insn);
  needVVVV = hasVVVV && (insn->vvvv != 0);

  for (const auto &Op : x86OperandSets[insn->spec->operands]) {
    switch (Op.encoding) {
    case ENCODING_NONE:
    case ENCODING_SI:
    case ENCODING_DI:
      // Nothing to consume for these encodings.
      break;
    CASE_ENCODING_VSIB:
      // VSIB can use the V2 bit so check only the other bits.
      if (needVVVV)
        needVVVV = hasVVVV & ((insn->vvvv & 0xf) != 0);
      if (readModRM(insn))
        return -1;

      // Reject if SIB wasn't used.
      if (insn->eaBase != EA_BASE_sib && insn->eaBase != EA_BASE_sib64)
        return -1;

      // If sibIndex was set to SIB_INDEX_NONE, index offset is 4.
      if (insn->sibIndex == SIB_INDEX_NONE)
        insn->sibIndex = (SIBIndex)(insn->sibIndexBase + 4);

      // If EVEX.v2 is set this is one of the 16-31 registers.
      if (insn->vectorExtensionType == TYPE_EVEX && insn->mode == MODE_64BIT &&
          v2FromEVEX4of4(insn->vectorExtensionPrefix[3]))
        insn->sibIndex = (SIBIndex)(insn->sibIndex + 16);

      // Adjust the index register to the correct size.
      switch ((OperandType)Op.type) {
      default:
        debug("Unhandled VSIB index type");
        return -1;
      case TYPE_MVSIBX:
        insn->sibIndex =
            (SIBIndex)(SIB_INDEX_XMM0 + (insn->sibIndex - insn->sibIndexBase));
        break;
      case TYPE_MVSIBY:
        insn->sibIndex =
            (SIBIndex)(SIB_INDEX_YMM0 + (insn->sibIndex - insn->sibIndexBase));
        break;
      case TYPE_MVSIBZ:
        insn->sibIndex =
            (SIBIndex)(SIB_INDEX_ZMM0 + (insn->sibIndex - insn->sibIndexBase));
        break;
      }

      // Apply the AVX512 compressed displacement scaling factor.
      if (Op.encoding != ENCODING_REG && insn->eaDisplacement == EA_DISP_8)
        insn->displacement *= 1 << (Op.encoding - ENCODING_VSIB);
      break;
    case ENCODING_SIB:
      // Reject if SIB wasn't used.
      // NOTE(review): unlike the VSIB case above, eaBase is checked before
      // readModRM() is called here; this presumably relies on the ModR/M byte
      // having been read already - confirm.
      if (insn->eaBase != EA_BASE_sib && insn->eaBase != EA_BASE_sib64)
        return -1;
      if (readModRM(insn))
        return -1;
      if (fixupReg(insn, &Op))
        return -1;
      break;
    case ENCODING_REG:
    CASE_ENCODING_RM:
      if (readModRM(insn))
        return -1;
      if (fixupReg(insn, &Op))
        return -1;
      // Apply the AVX512 compressed displacement scaling factor.
      if (Op.encoding != ENCODING_REG && insn->eaDisplacement == EA_DISP_8)
        insn->displacement *= 1 << (Op.encoding - ENCODING_RM);
      break;
    case ENCODING_IB:
      if (sawRegImm) {
        // Saw a register immediate so don't read again and instead split the
        // previous immediate. FIXME: This is a hack.
        // The new immediate is the low 4 bits of the previous one.
        insn->immediates[insn->numImmediatesConsumed] =
            insn->immediates[insn->numImmediatesConsumed - 1] & 0xf;
        ++insn->numImmediatesConsumed;
        break;
      }
      if (readImmediate(insn, 1))
        return -1;
      if (Op.type == TYPE_XMM || Op.type == TYPE_YMM)
        sawRegImm = 1;
      break;
    case ENCODING_IW:
      if (readImmediate(insn, 2))
        return -1;
      break;
    case ENCODING_ID:
      if (readImmediate(insn, 4))
        return -1;
      break;
    case ENCODING_IO:
      if (readImmediate(insn, 8))
        return -1;
      break;
    case ENCODING_Iv:
      // Width comes from the instruction's current immediateSize.
      if (readImmediate(insn, insn->immediateSize))
        return -1;
      break;
    case ENCODING_Ia:
      // Width comes from the instruction's addressSize.
      if (readImmediate(insn, insn->addressSize))
        return -1;
      break;
    case ENCODING_IRC:
      // Combine the l2 and l bits of the fourth EVEX prefix byte into a
      // two-bit value.
      insn->RC = (l2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 1) |
                 lFromEVEX4of4(insn->vectorExtensionPrefix[3]);
      break;
    case ENCODING_RB:
      if (readOpcodeRegister(insn, 1))
        return -1;
      break;
    case ENCODING_RW:
      if (readOpcodeRegister(insn, 2))
        return -1;
      break;
    case ENCODING_RD:
      if (readOpcodeRegister(insn, 4))
        return -1;
      break;
    case ENCODING_RO:
      if (readOpcodeRegister(insn, 8))
        return -1;
      break;
    case ENCODING_Rv:
      // Size 0 lets readOpcodeRegister() use insn->registerSize.
      if (readOpcodeRegister(insn, 0))
        return -1;
      break;
    case ENCODING_CC:
      // The low 4 bits of the opcode are stored in the second immediate slot
      // (presumably the condition code, going by the encoding name).
      insn->immediates[1] = insn->opcode & 0xf;
      break;
    case ENCODING_FP:
      break;
    case ENCODING_VVVV:
      needVVVV = 0; // Mark that we have found a VVVV operand.
      if (!hasVVVV)
        return -1;
      // Outside 64-bit mode only the low three bits are kept.
      if (insn->mode != MODE_64BIT)
        insn->vvvv = static_cast<Reg>(insn->vvvv & 0x7);
      if (fixupReg(insn, &Op))
        return -1;
      break;
    case ENCODING_WRITEMASK:
      if (readMaskRegister(insn))
        return -1;
      break;
    case ENCODING_DUP:
      break;
    default:
      LLVM_DEBUG(dbgs() << "Encountered an operand with an unknown encoding.");
      return -1;
    }
  }

  // If we didn't find ENCODING_VVVV operand, but non-zero vvvv present, fail
  if (needVVVV)
    return -1;

  return 0;
}
1671
namespace llvm {
namespace X86 {

// Placeholder register names. These constants are never actually assigned;
// they exist only so that an automatically-generated switch statement that
// mentions them compiles. The values run consecutively from 500.
enum {
  BX_SI = 500,
  BX_DI, // 501
  BP_SI, // 502
  BP_DI, // 503
  sib,   // 504
  sib64  // 505
};

} // namespace X86
} // namespace llvm
1689
// Forward declaration; called by X86GenericDisassembler::getInstruction to
// turn a decoded InternalInstruction into an MCInst. The caller treats a
// true return value as failure.
static bool translateInstruction(MCInst &target,
                                 InternalInstruction &source,
                                 const MCDisassembler *Dis);
1693
1694 namespace {
1695
1696 /// Generic disassembler for all X86 platforms. All each platform class should
1697 /// have to do is subclass the constructor, and provide a different
1698 /// disassemblerMode value.
1699 class X86GenericDisassembler : public MCDisassembler {
1700 std::unique_ptr<const MCInstrInfo> MII;
1701 public:
1702 X86GenericDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
1703 std::unique_ptr<const MCInstrInfo> MII);
1704 public:
1705 DecodeStatus getInstruction(MCInst &instr, uint64_t &size,
1706 ArrayRef<uint8_t> Bytes, uint64_t Address,
1707 raw_ostream &cStream) const override;
1708
1709 private:
1710 DisassemblerMode fMode;
1711 };
1712
1713 } // namespace
1714
X86GenericDisassembler(const MCSubtargetInfo & STI,MCContext & Ctx,std::unique_ptr<const MCInstrInfo> MII)1715 X86GenericDisassembler::X86GenericDisassembler(
1716 const MCSubtargetInfo &STI,
1717 MCContext &Ctx,
1718 std::unique_ptr<const MCInstrInfo> MII)
1719 : MCDisassembler(STI, Ctx), MII(std::move(MII)) {
1720 const FeatureBitset &FB = STI.getFeatureBits();
1721 if (FB[X86::Is16Bit]) {
1722 fMode = MODE_16BIT;
1723 return;
1724 } else if (FB[X86::Is32Bit]) {
1725 fMode = MODE_32BIT;
1726 return;
1727 } else if (FB[X86::Is64Bit]) {
1728 fMode = MODE_64BIT;
1729 return;
1730 }
1731
1732 llvm_unreachable("Invalid CPU mode");
1733 }
1734
getInstruction(MCInst & Instr,uint64_t & Size,ArrayRef<uint8_t> Bytes,uint64_t Address,raw_ostream & CStream) const1735 MCDisassembler::DecodeStatus X86GenericDisassembler::getInstruction(
1736 MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address,
1737 raw_ostream &CStream) const {
1738 CommentStream = &CStream;
1739
1740 InternalInstruction Insn;
1741 memset(&Insn, 0, sizeof(InternalInstruction));
1742 Insn.bytes = Bytes;
1743 Insn.startLocation = Address;
1744 Insn.readerCursor = Address;
1745 Insn.mode = fMode;
1746
1747 if (Bytes.empty() || readPrefixes(&Insn) || readOpcode(&Insn) ||
1748 getInstructionID(&Insn, MII.get()) || Insn.instructionID == 0 ||
1749 readOperands(&Insn)) {
1750 Size = Insn.readerCursor - Address;
1751 return Fail;
1752 }
1753
1754 Insn.operands = x86OperandSets[Insn.spec->operands];
1755 Insn.length = Insn.readerCursor - Insn.startLocation;
1756 Size = Insn.length;
1757 if (Size > 15)
1758 LLVM_DEBUG(dbgs() << "Instruction exceeds 15-byte limit");
1759
1760 bool Ret = translateInstruction(Instr, Insn, this);
1761 if (!Ret) {
1762 unsigned Flags = X86::IP_NO_PREFIX;
1763 if (Insn.hasAdSize)
1764 Flags |= X86::IP_HAS_AD_SIZE;
1765 if (!Insn.mandatoryPrefix) {
1766 if (Insn.hasOpSize)
1767 Flags |= X86::IP_HAS_OP_SIZE;
1768 if (Insn.repeatPrefix == 0xf2)
1769 Flags |= X86::IP_HAS_REPEAT_NE;
1770 else if (Insn.repeatPrefix == 0xf3 &&
1771 // It should not be 'pause' f3 90
1772 Insn.opcode != 0x90)
1773 Flags |= X86::IP_HAS_REPEAT;
1774 if (Insn.hasLockPrefix)
1775 Flags |= X86::IP_HAS_LOCK;
1776 }
1777 Instr.setFlags(Flags);
1778 }
1779 return (!Ret) ? Success : Fail;
1780 }
1781
1782 //
1783 // Private code that translates from struct InternalInstructions to MCInsts.
1784 //
1785
1786 /// translateRegister - Translates an internal register to the appropriate LLVM
1787 /// register, and appends it as an operand to an MCInst.
1788 ///
1789 /// @param mcInst - The MCInst to append to.
1790 /// @param reg - The Reg to append.
translateRegister(MCInst & mcInst,Reg reg)1791 static void translateRegister(MCInst &mcInst, Reg reg) {
1792 #define ENTRY(x) X86::x,
1793 static constexpr MCPhysReg llvmRegnums[] = {ALL_REGS};
1794 #undef ENTRY
1795
1796 MCPhysReg llvmRegnum = llvmRegnums[reg];
1797 mcInst.addOperand(MCOperand::createReg(llvmRegnum));
1798 }
1799
// LLVM register number for each segment override value, indexed by
// insn.segmentOverride. Entry 0 (no override) maps to register number 0.
static const uint8_t segmentRegnums[SEG_OVERRIDE_max] = {
  0,        // SEG_OVERRIDE_NONE
  X86::CS,
  X86::SS,
  X86::DS,
  X86::ES,
  X86::FS,
  X86::GS
};
1809
1810 /// translateSrcIndex - Appends a source index operand to an MCInst.
1811 ///
1812 /// @param mcInst - The MCInst to append to.
1813 /// @param insn - The internal instruction.
translateSrcIndex(MCInst & mcInst,InternalInstruction & insn)1814 static bool translateSrcIndex(MCInst &mcInst, InternalInstruction &insn) {
1815 unsigned baseRegNo;
1816
1817 if (insn.mode == MODE_64BIT)
1818 baseRegNo = insn.hasAdSize ? X86::ESI : X86::RSI;
1819 else if (insn.mode == MODE_32BIT)
1820 baseRegNo = insn.hasAdSize ? X86::SI : X86::ESI;
1821 else {
1822 assert(insn.mode == MODE_16BIT);
1823 baseRegNo = insn.hasAdSize ? X86::ESI : X86::SI;
1824 }
1825 MCOperand baseReg = MCOperand::createReg(baseRegNo);
1826 mcInst.addOperand(baseReg);
1827
1828 MCOperand segmentReg;
1829 segmentReg = MCOperand::createReg(segmentRegnums[insn.segmentOverride]);
1830 mcInst.addOperand(segmentReg);
1831 return false;
1832 }
1833
1834 /// translateDstIndex - Appends a destination index operand to an MCInst.
1835 ///
1836 /// @param mcInst - The MCInst to append to.
1837 /// @param insn - The internal instruction.
1838
translateDstIndex(MCInst & mcInst,InternalInstruction & insn)1839 static bool translateDstIndex(MCInst &mcInst, InternalInstruction &insn) {
1840 unsigned baseRegNo;
1841
1842 if (insn.mode == MODE_64BIT)
1843 baseRegNo = insn.hasAdSize ? X86::EDI : X86::RDI;
1844 else if (insn.mode == MODE_32BIT)
1845 baseRegNo = insn.hasAdSize ? X86::DI : X86::EDI;
1846 else {
1847 assert(insn.mode == MODE_16BIT);
1848 baseRegNo = insn.hasAdSize ? X86::EDI : X86::DI;
1849 }
1850 MCOperand baseReg = MCOperand::createReg(baseRegNo);
1851 mcInst.addOperand(baseReg);
1852 return false;
1853 }
1854
1855 /// translateImmediate - Appends an immediate operand to an MCInst.
1856 ///
1857 /// @param mcInst - The MCInst to append to.
1858 /// @param immediate - The immediate value to append.
1859 /// @param operand - The operand, as stored in the descriptor table.
1860 /// @param insn - The internal instruction.
translateImmediate(MCInst & mcInst,uint64_t immediate,const OperandSpecifier & operand,InternalInstruction & insn,const MCDisassembler * Dis)1861 static void translateImmediate(MCInst &mcInst, uint64_t immediate,
1862 const OperandSpecifier &operand,
1863 InternalInstruction &insn,
1864 const MCDisassembler *Dis) {
1865 // Sign-extend the immediate if necessary.
1866
1867 OperandType type = (OperandType)operand.type;
1868
1869 bool isBranch = false;
1870 uint64_t pcrel = 0;
1871 if (type == TYPE_REL) {
1872 isBranch = true;
1873 pcrel = insn.startLocation + insn.length;
1874 switch (operand.encoding) {
1875 default:
1876 break;
1877 case ENCODING_Iv:
1878 switch (insn.displacementSize) {
1879 default:
1880 break;
1881 case 1:
1882 if(immediate & 0x80)
1883 immediate |= ~(0xffull);
1884 break;
1885 case 2:
1886 if(immediate & 0x8000)
1887 immediate |= ~(0xffffull);
1888 break;
1889 case 4:
1890 if(immediate & 0x80000000)
1891 immediate |= ~(0xffffffffull);
1892 break;
1893 case 8:
1894 break;
1895 }
1896 break;
1897 case ENCODING_IB:
1898 if(immediate & 0x80)
1899 immediate |= ~(0xffull);
1900 break;
1901 case ENCODING_IW:
1902 if(immediate & 0x8000)
1903 immediate |= ~(0xffffull);
1904 break;
1905 case ENCODING_ID:
1906 if(immediate & 0x80000000)
1907 immediate |= ~(0xffffffffull);
1908 break;
1909 }
1910 }
1911 // By default sign-extend all X86 immediates based on their encoding.
1912 else if (type == TYPE_IMM) {
1913 switch (operand.encoding) {
1914 default:
1915 break;
1916 case ENCODING_IB:
1917 if(immediate & 0x80)
1918 immediate |= ~(0xffull);
1919 break;
1920 case ENCODING_IW:
1921 if(immediate & 0x8000)
1922 immediate |= ~(0xffffull);
1923 break;
1924 case ENCODING_ID:
1925 if(immediate & 0x80000000)
1926 immediate |= ~(0xffffffffull);
1927 break;
1928 case ENCODING_IO:
1929 break;
1930 }
1931 }
1932
1933 switch (type) {
1934 case TYPE_XMM:
1935 mcInst.addOperand(MCOperand::createReg(X86::XMM0 + (immediate >> 4)));
1936 return;
1937 case TYPE_YMM:
1938 mcInst.addOperand(MCOperand::createReg(X86::YMM0 + (immediate >> 4)));
1939 return;
1940 case TYPE_ZMM:
1941 mcInst.addOperand(MCOperand::createReg(X86::ZMM0 + (immediate >> 4)));
1942 return;
1943 default:
1944 // operand is 64 bits wide. Do nothing.
1945 break;
1946 }
1947
1948 if (!Dis->tryAddingSymbolicOperand(
1949 mcInst, immediate + pcrel, insn.startLocation, isBranch,
1950 insn.immediateOffset, insn.immediateSize, insn.length))
1951 mcInst.addOperand(MCOperand::createImm(immediate));
1952
1953 if (type == TYPE_MOFFS) {
1954 MCOperand segmentReg;
1955 segmentReg = MCOperand::createReg(segmentRegnums[insn.segmentOverride]);
1956 mcInst.addOperand(segmentReg);
1957 }
1958 }
1959
1960 /// translateRMRegister - Translates a register stored in the R/M field of the
1961 /// ModR/M byte to its LLVM equivalent and appends it to an MCInst.
1962 /// @param mcInst - The MCInst to append to.
1963 /// @param insn - The internal instruction to extract the R/M field
1964 /// from.
1965 /// @return - 0 on success; -1 otherwise
translateRMRegister(MCInst & mcInst,InternalInstruction & insn)1966 static bool translateRMRegister(MCInst &mcInst,
1967 InternalInstruction &insn) {
1968 if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) {
1969 debug("A R/M register operand may not have a SIB byte");
1970 return true;
1971 }
1972
1973 switch (insn.eaBase) {
1974 default:
1975 debug("Unexpected EA base register");
1976 return true;
1977 case EA_BASE_NONE:
1978 debug("EA_BASE_NONE for ModR/M base");
1979 return true;
1980 #define ENTRY(x) case EA_BASE_##x:
1981 ALL_EA_BASES
1982 #undef ENTRY
1983 debug("A R/M register operand may not have a base; "
1984 "the operand must be a register.");
1985 return true;
1986 #define ENTRY(x) \
1987 case EA_REG_##x: \
1988 mcInst.addOperand(MCOperand::createReg(X86::x)); break;
1989 ALL_REGS
1990 #undef ENTRY
1991 }
1992
1993 return false;
1994 }
1995
1996 /// translateRMMemory - Translates a memory operand stored in the Mod and R/M
1997 /// fields of an internal instruction (and possibly its SIB byte) to a memory
1998 /// operand in LLVM's format, and appends it to an MCInst.
1999 ///
2000 /// @param mcInst - The MCInst to append to.
2001 /// @param insn - The instruction to extract Mod, R/M, and SIB fields
2002 /// from.
2003 /// @param ForceSIB - The instruction must use SIB.
2004 /// @return - 0 on success; nonzero otherwise
translateRMMemory(MCInst & mcInst,InternalInstruction & insn,const MCDisassembler * Dis,bool ForceSIB=false)2005 static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
2006 const MCDisassembler *Dis,
2007 bool ForceSIB = false) {
2008 // Addresses in an MCInst are represented as five operands:
2009 // 1. basereg (register) The R/M base, or (if there is a SIB) the
2010 // SIB base
2011 // 2. scaleamount (immediate) 1, or (if there is a SIB) the specified
2012 // scale amount
2013 // 3. indexreg (register) x86_registerNONE, or (if there is a SIB)
2014 // the index (which is multiplied by the
2015 // scale amount)
2016 // 4. displacement (immediate) 0, or the displacement if there is one
2017 // 5. segmentreg (register) x86_registerNONE for now, but could be set
2018 // if we have segment overrides
2019
2020 MCOperand baseReg;
2021 MCOperand scaleAmount;
2022 MCOperand indexReg;
2023 MCOperand displacement;
2024 MCOperand segmentReg;
2025 uint64_t pcrel = 0;
2026
2027 if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) {
2028 if (insn.sibBase != SIB_BASE_NONE) {
2029 switch (insn.sibBase) {
2030 default:
2031 debug("Unexpected sibBase");
2032 return true;
2033 #define ENTRY(x) \
2034 case SIB_BASE_##x: \
2035 baseReg = MCOperand::createReg(X86::x); break;
2036 ALL_SIB_BASES
2037 #undef ENTRY
2038 }
2039 } else {
2040 baseReg = MCOperand::createReg(X86::NoRegister);
2041 }
2042
2043 if (insn.sibIndex != SIB_INDEX_NONE) {
2044 switch (insn.sibIndex) {
2045 default:
2046 debug("Unexpected sibIndex");
2047 return true;
2048 #define ENTRY(x) \
2049 case SIB_INDEX_##x: \
2050 indexReg = MCOperand::createReg(X86::x); break;
2051 EA_BASES_32BIT
2052 EA_BASES_64BIT
2053 REGS_XMM
2054 REGS_YMM
2055 REGS_ZMM
2056 #undef ENTRY
2057 }
2058 } else {
2059 // Use EIZ/RIZ for a few ambiguous cases where the SIB byte is present,
2060 // but no index is used and modrm alone should have been enough.
2061 // -No base register in 32-bit mode. In 64-bit mode this is used to
2062 // avoid rip-relative addressing.
2063 // -Any base register used other than ESP/RSP/R12D/R12. Using these as a
2064 // base always requires a SIB byte.
2065 // -A scale other than 1 is used.
2066 if (!ForceSIB &&
2067 (insn.sibScale != 1 ||
2068 (insn.sibBase == SIB_BASE_NONE && insn.mode != MODE_64BIT) ||
2069 (insn.sibBase != SIB_BASE_NONE &&
2070 insn.sibBase != SIB_BASE_ESP && insn.sibBase != SIB_BASE_RSP &&
2071 insn.sibBase != SIB_BASE_R12D && insn.sibBase != SIB_BASE_R12))) {
2072 indexReg = MCOperand::createReg(insn.addressSize == 4 ? X86::EIZ :
2073 X86::RIZ);
2074 } else
2075 indexReg = MCOperand::createReg(X86::NoRegister);
2076 }
2077
2078 scaleAmount = MCOperand::createImm(insn.sibScale);
2079 } else {
2080 switch (insn.eaBase) {
2081 case EA_BASE_NONE:
2082 if (insn.eaDisplacement == EA_DISP_NONE) {
2083 debug("EA_BASE_NONE and EA_DISP_NONE for ModR/M base");
2084 return true;
2085 }
2086 if (insn.mode == MODE_64BIT){
2087 pcrel = insn.startLocation + insn.length;
2088 Dis->tryAddingPcLoadReferenceComment(insn.displacement + pcrel,
2089 insn.startLocation +
2090 insn.displacementOffset);
2091 // Section 2.2.1.6
2092 baseReg = MCOperand::createReg(insn.addressSize == 4 ? X86::EIP :
2093 X86::RIP);
2094 }
2095 else
2096 baseReg = MCOperand::createReg(X86::NoRegister);
2097
2098 indexReg = MCOperand::createReg(X86::NoRegister);
2099 break;
2100 case EA_BASE_BX_SI:
2101 baseReg = MCOperand::createReg(X86::BX);
2102 indexReg = MCOperand::createReg(X86::SI);
2103 break;
2104 case EA_BASE_BX_DI:
2105 baseReg = MCOperand::createReg(X86::BX);
2106 indexReg = MCOperand::createReg(X86::DI);
2107 break;
2108 case EA_BASE_BP_SI:
2109 baseReg = MCOperand::createReg(X86::BP);
2110 indexReg = MCOperand::createReg(X86::SI);
2111 break;
2112 case EA_BASE_BP_DI:
2113 baseReg = MCOperand::createReg(X86::BP);
2114 indexReg = MCOperand::createReg(X86::DI);
2115 break;
2116 default:
2117 indexReg = MCOperand::createReg(X86::NoRegister);
2118 switch (insn.eaBase) {
2119 default:
2120 debug("Unexpected eaBase");
2121 return true;
2122 // Here, we will use the fill-ins defined above. However,
2123 // BX_SI, BX_DI, BP_SI, and BP_DI are all handled above and
2124 // sib and sib64 were handled in the top-level if, so they're only
2125 // placeholders to keep the compiler happy.
2126 #define ENTRY(x) \
2127 case EA_BASE_##x: \
2128 baseReg = MCOperand::createReg(X86::x); break;
2129 ALL_EA_BASES
2130 #undef ENTRY
2131 #define ENTRY(x) case EA_REG_##x:
2132 ALL_REGS
2133 #undef ENTRY
2134 debug("A R/M memory operand may not be a register; "
2135 "the base field must be a base.");
2136 return true;
2137 }
2138 }
2139
2140 scaleAmount = MCOperand::createImm(1);
2141 }
2142
2143 displacement = MCOperand::createImm(insn.displacement);
2144
2145 segmentReg = MCOperand::createReg(segmentRegnums[insn.segmentOverride]);
2146
2147 mcInst.addOperand(baseReg);
2148 mcInst.addOperand(scaleAmount);
2149 mcInst.addOperand(indexReg);
2150
2151 const uint8_t dispSize =
2152 (insn.eaDisplacement == EA_DISP_NONE) ? 0 : insn.displacementSize;
2153
2154 if (!Dis->tryAddingSymbolicOperand(
2155 mcInst, insn.displacement + pcrel, insn.startLocation, false,
2156 insn.displacementOffset, dispSize, insn.length))
2157 mcInst.addOperand(displacement);
2158 mcInst.addOperand(segmentReg);
2159 return false;
2160 }
2161
2162 /// translateRM - Translates an operand stored in the R/M (and possibly SIB)
2163 /// byte of an instruction to LLVM form, and appends it to an MCInst.
2164 ///
2165 /// @param mcInst - The MCInst to append to.
2166 /// @param operand - The operand, as stored in the descriptor table.
2167 /// @param insn - The instruction to extract Mod, R/M, and SIB fields
2168 /// from.
2169 /// @return - 0 on success; nonzero otherwise
translateRM(MCInst & mcInst,const OperandSpecifier & operand,InternalInstruction & insn,const MCDisassembler * Dis)2170 static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
2171 InternalInstruction &insn, const MCDisassembler *Dis) {
2172 switch (operand.type) {
2173 default:
2174 debug("Unexpected type for a R/M operand");
2175 return true;
2176 case TYPE_R8:
2177 case TYPE_R16:
2178 case TYPE_R32:
2179 case TYPE_R64:
2180 case TYPE_Rv:
2181 case TYPE_MM64:
2182 case TYPE_XMM:
2183 case TYPE_YMM:
2184 case TYPE_ZMM:
2185 case TYPE_TMM:
2186 case TYPE_VK_PAIR:
2187 case TYPE_VK:
2188 case TYPE_DEBUGREG:
2189 case TYPE_CONTROLREG:
2190 case TYPE_BNDR:
2191 return translateRMRegister(mcInst, insn);
2192 case TYPE_M:
2193 case TYPE_MVSIBX:
2194 case TYPE_MVSIBY:
2195 case TYPE_MVSIBZ:
2196 return translateRMMemory(mcInst, insn, Dis);
2197 case TYPE_MSIB:
2198 return translateRMMemory(mcInst, insn, Dis, true);
2199 }
2200 }
2201
2202 /// translateFPRegister - Translates a stack position on the FPU stack to its
2203 /// LLVM form, and appends it to an MCInst.
2204 ///
2205 /// @param mcInst - The MCInst to append to.
2206 /// @param stackPos - The stack position to translate.
translateFPRegister(MCInst & mcInst,uint8_t stackPos)2207 static void translateFPRegister(MCInst &mcInst,
2208 uint8_t stackPos) {
2209 mcInst.addOperand(MCOperand::createReg(X86::ST0 + stackPos));
2210 }
2211
2212 /// translateMaskRegister - Translates a 3-bit mask register number to
2213 /// LLVM form, and appends it to an MCInst.
2214 ///
2215 /// @param mcInst - The MCInst to append to.
2216 /// @param maskRegNum - Number of mask register from 0 to 7.
2217 /// @return - false on success; true otherwise.
translateMaskRegister(MCInst & mcInst,uint8_t maskRegNum)2218 static bool translateMaskRegister(MCInst &mcInst,
2219 uint8_t maskRegNum) {
2220 if (maskRegNum >= 8) {
2221 debug("Invalid mask register number");
2222 return true;
2223 }
2224
2225 mcInst.addOperand(MCOperand::createReg(X86::K0 + maskRegNum));
2226 return false;
2227 }
2228
2229 /// translateOperand - Translates an operand stored in an internal instruction
2230 /// to LLVM's format and appends it to an MCInst.
2231 ///
2232 /// @param mcInst - The MCInst to append to.
2233 /// @param operand - The operand, as stored in the descriptor table.
2234 /// @param insn - The internal instruction.
2235 /// @return - false on success; true otherwise.
translateOperand(MCInst & mcInst,const OperandSpecifier & operand,InternalInstruction & insn,const MCDisassembler * Dis)2236 static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand,
2237 InternalInstruction &insn,
2238 const MCDisassembler *Dis) {
2239 switch (operand.encoding) {
2240 default:
2241 debug("Unhandled operand encoding during translation");
2242 return true;
2243 case ENCODING_REG:
2244 translateRegister(mcInst, insn.reg);
2245 return false;
2246 case ENCODING_WRITEMASK:
2247 return translateMaskRegister(mcInst, insn.writemask);
2248 case ENCODING_SIB:
2249 CASE_ENCODING_RM:
2250 CASE_ENCODING_VSIB:
2251 return translateRM(mcInst, operand, insn, Dis);
2252 case ENCODING_IB:
2253 case ENCODING_IW:
2254 case ENCODING_ID:
2255 case ENCODING_IO:
2256 case ENCODING_Iv:
2257 case ENCODING_Ia:
2258 translateImmediate(mcInst,
2259 insn.immediates[insn.numImmediatesTranslated++],
2260 operand,
2261 insn,
2262 Dis);
2263 return false;
2264 case ENCODING_IRC:
2265 mcInst.addOperand(MCOperand::createImm(insn.RC));
2266 return false;
2267 case ENCODING_SI:
2268 return translateSrcIndex(mcInst, insn);
2269 case ENCODING_DI:
2270 return translateDstIndex(mcInst, insn);
2271 case ENCODING_RB:
2272 case ENCODING_RW:
2273 case ENCODING_RD:
2274 case ENCODING_RO:
2275 case ENCODING_Rv:
2276 translateRegister(mcInst, insn.opcodeRegister);
2277 return false;
2278 case ENCODING_CC:
2279 mcInst.addOperand(MCOperand::createImm(insn.immediates[1]));
2280 return false;
2281 case ENCODING_FP:
2282 translateFPRegister(mcInst, insn.modRM & 7);
2283 return false;
2284 case ENCODING_VVVV:
2285 translateRegister(mcInst, insn.vvvv);
2286 return false;
2287 case ENCODING_DUP:
2288 return translateOperand(mcInst, insn.operands[operand.type - TYPE_DUP0],
2289 insn, Dis);
2290 }
2291 }
2292
2293 /// translateInstruction - Translates an internal instruction and all its
2294 /// operands to an MCInst.
2295 ///
2296 /// @param mcInst - The MCInst to populate with the instruction's data.
2297 /// @param insn - The internal instruction.
2298 /// @return - false on success; true otherwise.
translateInstruction(MCInst & mcInst,InternalInstruction & insn,const MCDisassembler * Dis)2299 static bool translateInstruction(MCInst &mcInst,
2300 InternalInstruction &insn,
2301 const MCDisassembler *Dis) {
2302 if (!insn.spec) {
2303 debug("Instruction has no specification");
2304 return true;
2305 }
2306
2307 mcInst.clear();
2308 mcInst.setOpcode(insn.instructionID);
2309 // If when reading the prefix bytes we determined the overlapping 0xf2 or 0xf3
2310 // prefix bytes should be disassembled as xrelease and xacquire then set the
2311 // opcode to those instead of the rep and repne opcodes.
2312 if (insn.xAcquireRelease) {
2313 if(mcInst.getOpcode() == X86::REP_PREFIX)
2314 mcInst.setOpcode(X86::XRELEASE_PREFIX);
2315 else if(mcInst.getOpcode() == X86::REPNE_PREFIX)
2316 mcInst.setOpcode(X86::XACQUIRE_PREFIX);
2317 }
2318
2319 insn.numImmediatesTranslated = 0;
2320
2321 for (const auto &Op : insn.operands) {
2322 if (Op.encoding != ENCODING_NONE) {
2323 if (translateOperand(mcInst, Op, insn, Dis)) {
2324 return true;
2325 }
2326 }
2327 }
2328
2329 return false;
2330 }
2331
createX86Disassembler(const Target & T,const MCSubtargetInfo & STI,MCContext & Ctx)2332 static MCDisassembler *createX86Disassembler(const Target &T,
2333 const MCSubtargetInfo &STI,
2334 MCContext &Ctx) {
2335 std::unique_ptr<const MCInstrInfo> MII(T.createMCInstrInfo());
2336 return new X86GenericDisassembler(STI, Ctx, std::move(MII));
2337 }
2338
LLVMInitializeX86Disassembler()2339 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Disassembler() {
2340 // Register the disassembler.
2341 TargetRegistry::RegisterMCDisassembler(getTheX86_32Target(),
2342 createX86Disassembler);
2343 TargetRegistry::RegisterMCDisassembler(getTheX86_64Target(),
2344 createX86Disassembler);
2345 }
2346