// Copyright 2015 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.

#pragma once

#include <functional>

#include "Common/ArmCommon.h"
#include "Common/BitSet.h"
#include "Common/CodeBlock.h"
#include "Common/Common.h"
#include "Common/Log.h"

#define DYNA_REC JIT

#ifdef FMAX
#undef FMAX
#endif
#ifdef FMIN
#undef FMIN
#endif

namespace Arm64Gen
{

// X30 serves a dual purpose as a link register
// Encoded as <u3:type><u5:reg>
// Types:
// 000 - 32bit GPR
// 001 - 64bit GPR
// 010 - VFP single precision
// 100 - VFP double precision
// 110 - VFP quad precision
enum ARM64Reg
{
	// 32bit registers
	W0 = 0, W1, W2, W3, W4, W5, W6,
	W7, W8, W9, W10, W11, W12, W13, W14,
	W15, W16, W17, W18, W19, W20, W21, W22,
	W23, W24, W25, W26, W27, W28, W29, W30,

	WSP, // 32bit stack pointer

	// 64bit registers
	X0 = 0x20, X1, X2, X3, X4, X5, X6,
	X7, X8, X9, X10, X11, X12, X13, X14,
	X15, X16, X17, X18, X19, X20, X21, X22,
	X23, X24, X25, X26, X27, X28, X29, X30,

	SP, // 64bit stack pointer

	// VFP single precision registers
	S0 = 0x40, S1, S2, S3, S4, S5, S6,
	S7, S8, S9, S10, S11, S12, S13,
	S14, S15, S16, S17, S18, S19, S20,
	S21, S22, S23, S24, S25, S26, S27,
	S28, S29, S30, S31,

	// VFP Double Precision registers
	D0 = 0x80, D1, D2, D3, D4, D5, D6, D7,
	D8, D9, D10, D11, D12, D13, D14, D15,
	D16, D17, D18, D19, D20, D21, D22, D23,
	D24, D25, D26, D27, D28, D29, D30, D31,

	// ASIMD Quad-Word registers
	Q0 = 0xC0, Q1, Q2, Q3, Q4, Q5, Q6, Q7,
	Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15,
	Q16, Q17, Q18, Q19, Q20, Q21, Q22, Q23,
	Q24, Q25, Q26, Q27, Q28, Q29, Q30, Q31,

	// For PRFM(prefetch memory) encoding
	// This is encoded in the Rt register
	// Data preload
	PLDL1KEEP = 0, PLDL1STRM,
	PLDL2KEEP, PLDL2STRM,
	PLDL3KEEP, PLDL3STRM,
	// Instruction preload
	PLIL1KEEP = 8, PLIL1STRM,
	PLIL2KEEP, PLIL2STRM,
	PLIL3KEEP, PLIL3STRM,
	// Prepare for store
	PLTL1KEEP = 16, PLTL1STRM,
	PLTL2KEEP, PLTL2STRM,
	PLTL3KEEP, PLTL3STRM,

	WZR = WSP,
	ZR = SP,
	FP = X29,
	LR = X30,

	INVALID_REG = 0xFFFFFFFF
};

// R19-R28. R29 (FP), R30 (LR) are always saved and FP updated appropriately.
const u32 ALL_CALLEE_SAVED = 0x1FF80000;
const u32 ALL_CALLEE_SAVED_FP = 0x0000FF00;  // d8-d15

inline bool Is64Bit(ARM64Reg reg) { return (reg & 0x20) != 0; }
inline bool IsSingle(ARM64Reg reg) { return (reg & 0xC0) == 0x40; }
inline bool IsDouble(ARM64Reg reg) { return (reg & 0xC0) == 0x80; }
inline bool IsScalar(ARM64Reg reg) { return IsSingle(reg) || IsDouble(reg); }
inline bool IsQuad(ARM64Reg reg) { return (reg & 0xC0) == 0xC0; }
inline bool IsVector(ARM64Reg reg) { return (reg & 0xC0) != 0; }
inline bool IsGPR(ARM64Reg reg) { return (int)reg < 0x40; }

int CountLeadingZeros(uint64_t value, int width);

inline ARM64Reg DecodeReg(ARM64Reg reg) { return (ARM64Reg)(reg & 0x1F); }
inline ARM64Reg EncodeRegTo64(ARM64Reg reg) { return (ARM64Reg)(reg | 0x20); }
inline ARM64Reg EncodeRegToSingle(ARM64Reg reg) { return (ARM64Reg)(DecodeReg(reg) + S0); }
inline ARM64Reg EncodeRegToDouble(ARM64Reg reg) { return (ARM64Reg)((reg & ~0xC0) | 0x80); }
inline ARM64Reg EncodeRegToQuad(ARM64Reg reg) { return (ARM64Reg)(reg | 0xC0); }
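
// Sketch of how the encoding helpers above compose (illustrative only; they just
// rewrite the <u3:type><u5:reg> bits described at the top of this namespace):
//   ARM64Reg x = EncodeRegTo64(W5);       // X5: same index, 64bit GPR type bits
//   ARM64Reg d = EncodeRegToDouble(Q3);   // D3: strips the quad type, keeps the index
//   int idx = DecodeReg(x);               // 5: raw register index without type bits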

// For AND/TST/ORR/EOR etc
bool IsImmLogical(uint64_t value, unsigned int width, unsigned int *n, unsigned int *imm_s, unsigned int *imm_r);
// For ADD/SUB
bool IsImmArithmetic(uint64_t input, u32 *val, bool *shift);

float FPImm8ToFloat(uint8_t bits);
bool FPImm8FromFloat(float value, uint8_t *immOut);

enum OpType
{
	TYPE_IMM = 0,
	TYPE_REG,
	TYPE_IMMSREG,
	TYPE_RSR,
	TYPE_MEM
};

enum ShiftType
{
	ST_LSL = 0,
	ST_LSR = 1,
	ST_ASR = 2,
	ST_ROR = 3,
};

enum IndexType
{
	INDEX_UNSIGNED = 0,
	INDEX_POST = 1,
	INDEX_PRE = 2,
	INDEX_SIGNED = 3, // used in LDP/STP
};

enum ShiftAmount
{
	SHIFT_0 = 0,
	SHIFT_16 = 1,
	SHIFT_32 = 2,
	SHIFT_48 = 3,
};

enum RoundingMode {
	ROUND_A,  // round to nearest, ties to away
	ROUND_M,  // round towards -inf
	ROUND_N,  // round to nearest, ties to even
	ROUND_P,  // round towards +inf
	ROUND_Z,  // round towards zero
};

struct FixupBranch
{
	// Pointer to executable code address.
	const u8 *ptr;
	// Type defines
	// 0 = CBZ (32bit)
	// 1 = CBNZ (32bit)
	// 2 = B (conditional)
	// 3 = TBZ
	// 4 = TBNZ
	// 5 = B (unconditional)
	// 6 = BL (unconditional)
	u32 type;

	// Used with B.cond
	CCFlags cond;

	// Used with TBZ/TBNZ
	u8 bit;

	// Used with Test/Compare and Branch
	ARM64Reg reg;
};

enum PStateField
{
	FIELD_SPSel = 0,
	FIELD_DAIFSet,
	FIELD_DAIFClr,
	FIELD_NZCV,	// The only system registers accessible from EL0 (user space)
	FIELD_FPCR = 0x340,
	FIELD_FPSR = 0x341,
};

enum SystemHint
{
	HINT_NOP = 0,
	HINT_YIELD,
	HINT_WFE,
	HINT_WFI,
	HINT_SEV,
	HINT_SEVL,
};

enum BarrierType
{
	OSHLD = 1,
	OSHST = 2,
	OSH   = 3,
	NSHLD = 5,
	NSHST = 6,
	NSH   = 7,
	ISHLD = 9,
	ISHST = 10,
	ISH   = 11,
	LD    = 13,
	ST    = 14,
	SY    = 15,
};

class ArithOption
{
public:
	enum WidthSpecifier
	{
		WIDTH_DEFAULT,
		WIDTH_32BIT,
		WIDTH_64BIT,
	};

	enum ExtendSpecifier
	{
		EXTEND_UXTB = 0x0,
		EXTEND_UXTH = 0x1,
		EXTEND_UXTW = 0x2, /* Also LSL on 32bit width */
		EXTEND_UXTX = 0x3, /* Also LSL on 64bit width */
		EXTEND_SXTB = 0x4,
		EXTEND_SXTH = 0x5,
		EXTEND_SXTW = 0x6,
		EXTEND_SXTX = 0x7,
	};

	enum TypeSpecifier
	{
		TYPE_EXTENDEDREG,
		TYPE_IMM,
		TYPE_SHIFTEDREG,
	};

private:
	ARM64Reg        m_destReg;
	WidthSpecifier  m_width;
	ExtendSpecifier m_extend;
	TypeSpecifier   m_type;
	ShiftType       m_shifttype;
	u32             m_shift;

public:
	ArithOption(ARM64Reg Rd, bool index = false)
	{
		// Indexed registers are a feature of AArch64 load/store instructions
		// that take a register offset: the offset register can be used as an
		// index, in which case it is shifted left by the access size so that
		// it steps in whole elements:
		// 8-bit:  index LSL 0 (no shift)
		// 16-bit: index LSL 1
		// 32-bit: index LSL 2
		// 64-bit: index LSL 3
		if (index)
			m_shift = 4;
		else
			m_shift = 0;

		m_destReg = Rd;
		m_type = TYPE_EXTENDEDREG;
		if (Is64Bit(Rd))
		{
			m_width = WIDTH_64BIT;
			m_extend = EXTEND_UXTX;
		}
		else
		{
			m_width = WIDTH_32BIT;
			m_extend = EXTEND_UXTW;
		}
		m_shifttype = ST_LSL;
	}
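
	// Usage sketch (illustrative; 'emit' is a hypothetical ARM64XEmitter):
	//   emit.LDR(W2, X0, ArithOption(X1, true));   // load from X0 + (X1 << 2), X1 as an element index
	//   emit.LDR(W2, X0, ArithOption(X1, false));  // load from X0 + X1, no scaling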
	ArithOption(ARM64Reg Rd, ShiftType shift_type, u32 shift)
	{
		m_destReg = Rd;
		m_shift = shift;
		m_shifttype = shift_type;
		m_type = TYPE_SHIFTEDREG;
		if (Is64Bit(Rd))
		{
			m_width = WIDTH_64BIT;
			if (shift == 64)
				m_shift = 0;
		}
		else
		{
			m_width = WIDTH_32BIT;
			if (shift == 32)
				m_shift = 0;
		}
	}
	TypeSpecifier GetType() const
	{
		return m_type;
	}
	ARM64Reg GetReg() const
	{
		return m_destReg;
	}
	u32 GetData() const
	{
		switch (m_type)
		{
			case TYPE_EXTENDEDREG:
				return (m_extend << 13) |
				       (m_shift << 10);
			break;
			case TYPE_SHIFTEDREG:
				return (m_shifttype << 22) |
				       (m_shift << 10);
			break;
			default:
				_dbg_assert_msg_(false, "Invalid type in GetData");
			break;
		}
		return 0;
	}
};

class ARM64XEmitter
{
	friend class ARM64FloatEmitter;
	friend class ARM64CodeBlock;

private:
	const u8 *m_code = nullptr;
	u8 *m_writable = nullptr;
	const u8 *m_lastCacheFlushEnd = nullptr;

	void EncodeCompareBranchInst(u32 op, ARM64Reg Rt, const void* ptr);
	void EncodeTestBranchInst(u32 op, ARM64Reg Rt, u8 bits, const void* ptr);
	void EncodeUnconditionalBranchInst(u32 op, const void* ptr);
	void EncodeUnconditionalBranchInst(u32 opc, u32 op2, u32 op3, u32 op4, ARM64Reg Rn);
	void EncodeExceptionInst(u32 instenc, u32 imm);
	void EncodeSystemInst(u32 op0, u32 op1, u32 CRn, u32 CRm, u32 op2, ARM64Reg Rt);
	void EncodeArithmeticInst(u32 instenc, bool flags, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option);
	void EncodeArithmeticCarryInst(u32 op, bool flags, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void EncodeCondCompareImmInst(u32 op, ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond);
	void EncodeCondCompareRegInst(u32 op, ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond);
	void EncodeCondSelectInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);
	void EncodeData1SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn);
	void EncodeData2SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void EncodeData3SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
	void EncodeLogicalInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
	void EncodeLoadRegisterInst(u32 bitop, ARM64Reg Rt, u32 imm);
	void EncodeLoadStoreExcInst(u32 instenc, ARM64Reg Rs, ARM64Reg Rt2, ARM64Reg Rn, ARM64Reg Rt);
	void EncodeLoadStorePairedInst(u32 op, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm);
	void EncodeLoadStoreIndexedInst(u32 op, u32 op2, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
	void EncodeLoadStoreIndexedInst(u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm, u8 size);
	void EncodeMOVWideInst(u32 op, ARM64Reg Rd, u32 imm, ShiftAmount pos);
	void EncodeBitfieldMOVInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms);
	void EncodeLoadStoreRegisterOffset(u32 size, u32 opc, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
	void EncodeAddSubImmInst(u32 op, bool flags, u32 shift, u32 imm, ARM64Reg Rn, ARM64Reg Rd);
	void EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, int n);
	void EncodeLoadStorePair(u32 op, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
	void EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm);
	void EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm);

protected:
	inline void Write32(u32 value)
	{
		*(u32 *)m_writable = value;
		m_code += 4;
		m_writable += 4;
	}

public:
	ARM64XEmitter()
	{
	}

	ARM64XEmitter(const u8 *codePtr, u8 *writablePtr);

	virtual ~ARM64XEmitter()
	{
	}

	void SetCodePointer(const u8 *ptr, u8 *writePtr);
	const u8* GetCodePointer() const;

	void ReserveCodeSpace(u32 bytes);
	const u8* AlignCode16();
	const u8* AlignCodePage();
	void FlushIcache();
	void FlushIcacheSection(const u8* start, const u8* end);
	u8* GetWritableCodePtr();

	// FixupBranch branching
	void SetJumpTarget(FixupBranch const& branch);
	FixupBranch CBZ(ARM64Reg Rt);
	FixupBranch CBNZ(ARM64Reg Rt);
	FixupBranch B(CCFlags cond);
	FixupBranch TBZ(ARM64Reg Rt, u8 bit);
	FixupBranch TBNZ(ARM64Reg Rt, u8 bit);
	FixupBranch B();
	FixupBranch BL();
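
	// Forward-branch usage sketch (illustrative; assumes the CC_EQ condition code
	// from Common/ArmCommon.h):
	//   FixupBranch skip = B(CC_EQ);  // emit a branch whose target is not yet known
	//   ...                           // code that gets skipped when the condition holds
	//   SetJumpTarget(skip);          // patch the branch to land here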

	// Compare and Branch
	void CBZ(ARM64Reg Rt, const void* ptr);
	void CBNZ(ARM64Reg Rt, const void* ptr);

	// Conditional Branch
	void B(CCFlags cond, const void* ptr);

	// Test and Branch
	void TBZ(ARM64Reg Rt, u8 bits, const void* ptr);
	void TBNZ(ARM64Reg Rt, u8 bits, const void* ptr);

	// Unconditional Branch
	void B(const void* ptr);
	void BL(const void* ptr);

	// Unconditional Branch (register)
	void BR(ARM64Reg Rn);
	void BLR(ARM64Reg Rn);
	void RET(ARM64Reg Rn = X30);
	void ERET();
	void DRPS();

	// Exception generation
	void SVC(u32 imm);
	void HVC(u32 imm);
	void SMC(u32 imm);
	void BRK(u32 imm);
	void HLT(u32 imm);
	void DCPS1(u32 imm);
	void DCPS2(u32 imm);
	void DCPS3(u32 imm);

	// System
	void _MSR(PStateField field, u8 imm);

	void _MSR(PStateField field, ARM64Reg Rt);
	void MRS(ARM64Reg Rt, PStateField field);

	void HINT(SystemHint op);
	void CLREX();
	void DSB(BarrierType type);
	void DMB(BarrierType type);
	void ISB(BarrierType type);

	// Add/Subtract (Extended/Shifted register)
	void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option);
	void ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option);
	void SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option);
	void SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option);
	void CMN(ARM64Reg Rn, ARM64Reg Rm);
	void CMN(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option);
	void CMP(ARM64Reg Rn, ARM64Reg Rm);
	void CMP(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option);

	// Add/Subtract (with carry)
	void ADC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void ADCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void SBC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void SBCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);

	// Conditional Compare (immediate)
	void CCMN(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond);
	void CCMP(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond);

	// Conditional Compare (register)
	void CCMN(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond);
	void CCMP(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond);

	// Conditional Select
	void CSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);
	void CSINC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);
	void CSINV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);
	void CSNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);

	// Aliases
	void CSET(ARM64Reg Rd, CCFlags cond) {
		ARM64Reg zr = Is64Bit(Rd) ? ZR : WZR;
		CSINC(Rd, zr, zr, (CCFlags)((u32)cond ^ 1));
	}
	void NEG(ARM64Reg Rd, ARM64Reg Rs) {
		SUB(Rd, Is64Bit(Rd) ? ZR : WZR, Rs);
	}

	// Data-Processing 1 source
	void RBIT(ARM64Reg Rd, ARM64Reg Rn);
	void REV16(ARM64Reg Rd, ARM64Reg Rn);
	void REV32(ARM64Reg Rd, ARM64Reg Rn);
	void REV64(ARM64Reg Rd, ARM64Reg Rn);
	void CLZ(ARM64Reg Rd, ARM64Reg Rn);
	void CLS(ARM64Reg Rd, ARM64Reg Rn);

	// Data-Processing 2 source
	void UDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void SDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void LSLV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void LSRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void ASRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void RORV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void CRC32B(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void CRC32H(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void CRC32W(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void CRC32CB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void CRC32CH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void CRC32CW(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void CRC32X(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void CRC32CX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);

	// Data-Processing 3 source
	void MADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
	void MSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
	void SMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
	void SMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void SMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
	void SMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void UMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
	void UMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void UMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
	void UMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void MUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void MNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);

	// Logical (shifted register)
	void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
	void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
	void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
	void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
	void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
	void EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
	void ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
	void BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
	void TST(ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);

	// Wrap the above for saner syntax
	void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { AND(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); }
	void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { BIC(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); }
	void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { ORR(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); }
	void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { ORN(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); }
	void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { EOR(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); }
	void EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { EON(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); }
	void ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { ANDS(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); }
	void BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { BICS(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); }
	void TST(ARM64Reg Rn, ARM64Reg Rm) { TST(Rn, Rm, ArithOption(Is64Bit(Rn) ? ZR : WZR, ST_LSL, 0)); }

	// Convenience wrappers around ORR. These match the official convenience syntax.
	void MOV(ARM64Reg Rd, ARM64Reg Rm, ArithOption Shift);
	void MOV(ARM64Reg Rd, ARM64Reg Rm);
	void MVN(ARM64Reg Rd, ARM64Reg Rm);

	// Wrapper around ADD reg, reg, imm.
	void MOVfromSP(ARM64Reg Rd);
	void MOVtoSP(ARM64Reg Rn);

	// TODO: These are "slow" as they use arith+shift, should be replaced with UBFM/EXTR variants.
	void LSR(ARM64Reg Rd, ARM64Reg Rm, int shift);
	void LSL(ARM64Reg Rd, ARM64Reg Rm, int shift);
	void ASR(ARM64Reg Rd, ARM64Reg Rm, int shift);
	void ROR(ARM64Reg Rd, ARM64Reg Rm, int shift);

	// Logical (immediate)
	void AND(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false);
	void ANDS(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false);
	void EOR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false);
	void ORR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false);
	void TST(ARM64Reg Rn, u32 immr, u32 imms, bool invert = false);

	// Add/subtract (immediate)
	void ADD(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false);
	void ADDS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false);
	void SUB(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false);
	void SUBS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false);
	void CMP(ARM64Reg Rn, u32 imm, bool shift = false);
	void CMN(ARM64Reg Rn, u32 imm, bool shift = false);

	// Data Processing (Immediate)
	void MOVZ(ARM64Reg Rd, u32 imm, ShiftAmount pos = SHIFT_0);
	void MOVN(ARM64Reg Rd, u32 imm, ShiftAmount pos = SHIFT_0);
	void MOVK(ARM64Reg Rd, u32 imm, ShiftAmount pos = SHIFT_0);

	// Bitfield move
	void BFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms);
	void SBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms);
	void UBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms);
	void BFI(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width);
	void UBFIZ(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width);

	// Extract register (ROR with two inputs, if same then faster on A67)
	void EXTR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u32 shift);

	// Aliases
	void SXTB(ARM64Reg Rd, ARM64Reg Rn);
	void SXTH(ARM64Reg Rd, ARM64Reg Rn);
	void SXTW(ARM64Reg Rd, ARM64Reg Rn);
	void UXTB(ARM64Reg Rd, ARM64Reg Rn);
	void UXTH(ARM64Reg Rd, ARM64Reg Rn);

	void UBFX(ARM64Reg Rd, ARM64Reg Rn, int lsb, int width) {
		UBFM(Rd, Rn, lsb, lsb + width - 1);
	}

	// Load Register (Literal)
	void LDR(ARM64Reg Rt, u32 imm);
	void LDRSW(ARM64Reg Rt, u32 imm);
	void PRFM(ARM64Reg Rt, u32 imm);

	// Load/Store Exclusive
	void STXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn);
	void STLXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn);
	void LDXRB(ARM64Reg Rt, ARM64Reg Rn);
	void LDAXRB(ARM64Reg Rt, ARM64Reg Rn);
	void STLRB(ARM64Reg Rt, ARM64Reg Rn);
	void LDARB(ARM64Reg Rt, ARM64Reg Rn);
	void STXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn);
	void STLXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn);
	void LDXRH(ARM64Reg Rt, ARM64Reg Rn);
	void LDAXRH(ARM64Reg Rt, ARM64Reg Rn);
	void STLRH(ARM64Reg Rt, ARM64Reg Rn);
	void LDARH(ARM64Reg Rt, ARM64Reg Rn);
	void STXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn);
	void STLXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn);
	void STXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn);
	void STLXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn);
	void LDXR(ARM64Reg Rt, ARM64Reg Rn);
	void LDAXR(ARM64Reg Rt, ARM64Reg Rn);
	void LDXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn);
	void LDAXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn);
	void STLR(ARM64Reg Rt, ARM64Reg Rn);
	void LDAR(ARM64Reg Rt, ARM64Reg Rn);

	// Load/Store no-allocate pair (offset)
	void STNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm);
	void LDNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm);

	// Load/Store register (immediate indexed)
	void STRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
	void LDRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
	void LDRSB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
	void STRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
	void LDRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
	void LDRSH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
	void STR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
	void LDR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
	void LDRSW(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);

	// Load/Store register (register offset)
	void STRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
	void LDRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
	void LDRSB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
	void STRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
	void LDRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
	void LDRSH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
	void STR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
	void LDR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
	void LDRSW(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
	void PRFM(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);

	// Load/Store register (unscaled offset)
	void STURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
	void LDURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
	void LDURSB(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
	void STURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
	void LDURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
	void LDURSH(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
	void STUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
	void LDUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
	void LDURSW(ARM64Reg Rt, ARM64Reg Rn, s32 imm);

	// Load/Store pair
	void LDP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
	void LDPSW(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
	void STP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);

	// Address of label/page PC-relative
	void ADR(ARM64Reg Rd, s32 imm);
	void ADRP(ARM64Reg Rd, s32 imm);

	// Wrapper around MOVZ+MOVK
	void MOVI2R(ARM64Reg Rd, u64 imm, bool optimize = true);
	template <class P>
	void MOVP2R(ARM64Reg Rd, P *ptr) {
		_assert_msg_(Is64Bit(Rd), "Can't store pointers in 32-bit registers");
		MOVI2R(Rd, (uintptr_t)ptr);
	}
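
	// Usage sketch (illustrative): per the comment above, MOVI2R materializes a
	// constant via a MOVZ+MOVK sequence, and MOVP2R is the pointer-sized wrapper.
	//   MOVI2R(X0, 0x12345678DEADBEEFULL);
	//   MOVP2R(X1, &some_object);  // 'some_object' is a hypothetical variable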

	// Wrapper around AND x, y, imm etc. If you are sure the imm will work, no need to pass a scratch register.
	void ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
	void ANDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
	void TSTI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG) { ANDSI2R(Is64Bit(Rn) ? ZR : WZR, Rn, imm, scratch); }
	void ORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
	void EORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
	void CMPI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);

	void ADDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
	void SUBI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
	void SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);

	bool TryADDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm);
	bool TrySUBI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm);
	bool TryCMPI2R(ARM64Reg Rn, u64 imm);

	bool TryANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm);
	bool TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm);
	bool TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm);

	// Pseudo-instruction for convenience. PUSH pushes 16 bytes even though we only push a single register.
	// This is so the stack pointer is always 16-byte aligned, which is checked by hardware!
	void PUSH(ARM64Reg Rd);
	void POP(ARM64Reg Rd);
	void PUSH2(ARM64Reg Rd, ARM64Reg Rn);
	void POP2(ARM64Reg Rd, ARM64Reg Rn);
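
	// Prologue/epilogue sketch (illustrative): PUSH2/POP2 handle two registers at
	// once, presumably in a single 16-byte slot, avoiding the unused half-slot of
	// a lone PUSH while keeping the hardware-checked 16-byte SP alignment.
	//   PUSH2(X19, X20);
	//   PUSH(X21);
	//   ...
	//   POP(X21);
	//   POP2(X19, X20);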

	// Utility to generate a call to a std::function object.
	//
	// Unfortunately, calling operator() directly is undefined behavior in C++
	// (this method might be a thunk in the case of multi-inheritance) so we
	// have to go through a trampoline function.
	template <typename T, typename... Args>
	static void CallLambdaTrampoline(const std::function<T(Args...)>* f,
	                                 Args... args)
	{
		(*f)(args...);
	}

	// This function expects you to have set up the state.
	// Overwrites X0 and X30
	template <typename T, typename... Args>
	ARM64Reg ABI_SetupLambda(const std::function<T(Args...)>* f)
	{
		auto trampoline = &ARM64XEmitter::CallLambdaTrampoline<T, Args...>;
		MOVI2R(X30, (uintptr_t)trampoline);
		MOVI2R(X0, (uintptr_t)const_cast<void*>((const void*)f));
		return X30;
	}
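
	// Call sketch (illustrative): ABI_SetupLambda loads the trampoline address into
	// X30 and the std::function pointer into X0 (the trampoline's first argument),
	// so remaining arguments start at X1/W1:
	//   static std::function<void(int)> g_fn;  // hypothetical; must outlive the emitted code
	//   MOVI2R(W1, 42);                        // the int argument
	//   BLR(ABI_SetupLambda(&g_fn));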

	// Plain function call
	void QuickCallFunction(ARM64Reg scratchreg, const void *func);
	template <typename T> void QuickCallFunction(ARM64Reg scratchreg, T func) {
		QuickCallFunction(scratchreg, (const void *)func);
	}
};

class ARM64FloatEmitter
{
public:
	ARM64FloatEmitter(ARM64XEmitter* emit) : m_emit(emit) {}

	void LDR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
	void STR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);

	// Loadstore unscaled
	void LDUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
	void STUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm);

	// Loadstore single structure
	void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn);
	void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm);
	void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn);
	void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn);
	void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm);
	void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm);
	void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn);
	void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm);

	// Loadstore multiple structure
	void LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn);
	void LD1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm = SP);
	void ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn);
	void ST1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm = SP);

	// Loadstore paired
	void LDP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
	void STP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);

	// Loadstore register offset
	void STR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
	void LDR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);

	// Scalar - 1 Source
	void FABS(ARM64Reg Rd, ARM64Reg Rn);
	void FNEG(ARM64Reg Rd, ARM64Reg Rn);
	void FSQRT(ARM64Reg Rd, ARM64Reg Rn);
	void FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top = false);  // Also generalized move between GPR/FP

	// Scalar - 2 Source
	void FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void FSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void FDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void FMAX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void FMIN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void FMAXNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void FMINNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void FNMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);

	// Scalar - 3 Source. Note - the accumulator is last on ARM!
	void FMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
	void FMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
	void FNMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
	void FNMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
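
	// Operand-order note (illustrative): as the comment above says, the accumulator
	// Ra is the final operand, so FMADD(D0, D1, D2, D3) computes d0 = d3 + d1 * d2.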

	// Scalar floating point immediate
	void FMOV(ARM64Reg Rd, uint8_t imm8);

	// Vector
	void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
	void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn);
	void FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void FMLS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void FMIN(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn);
	void FCVTL2(u8 size, ARM64Reg Rd, ARM64Reg Rn);
	void FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
	void FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn);
	void FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn);
	void FDIV(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void FNEG(u8 size, ARM64Reg Rd, ARM64Reg Rn);
	void FRSQRTE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
	void FSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void NOT(ARM64Reg Rd, ARM64Reg Rn);
	void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void MOV(ARM64Reg Rd, ARM64Reg Rn) {
		ORR(Rd, Rn, Rn);
	}

	void UMIN(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void UMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void SMIN(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void SMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);

	void REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn);
	void REV32(u8 size, ARM64Reg Rd, ARM64Reg Rn);
	void REV64(u8 size, ARM64Reg Rd, ARM64Reg Rn);
	void SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn);
	void UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn);
	void SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale);
	void UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale);
	void SQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
	void SQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
	void UQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
	void UQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
	void XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
	void XTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);

	// Move
	void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn);
	void INS(u8 size, ARM64Reg Rd, u8 index, ARM64Reg Rn);
	void INS(u8 size, ARM64Reg Rd, u8 index1, ARM64Reg Rn, u8 index2);
	void UMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
	void SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);

	// One source
	void FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn);

	// Scalar convert float to int, in a lot of variants.
	// Note that the scalar version of this operation has two encodings, one that goes to an integer register
	// and one that outputs to a scalar fp register.
	void FCVTS(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round);
	void FCVTU(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round);
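
	// Selection sketch (illustrative): which of the two encodings is emitted is
	// presumably chosen by the destination register class, e.g.
	//   FCVTS(W0, S1, ROUND_Z);  // truncate into a GPR
	//   FCVTS(S0, S1, ROUND_Z);  // keep the result in a scalar FP register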

	// Scalar convert int to float. No rounding mode specifier necessary.
	void SCVTF(ARM64Reg Rd, ARM64Reg Rn);
	void UCVTF(ARM64Reg Rd, ARM64Reg Rn);

	// Scalar fixed point to float. scale is the number of fractional bits.
	void SCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale);
	void UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale);

	// Float comparison
	void FCMP(ARM64Reg Rn, ARM64Reg Rm);
	void FCMP(ARM64Reg Rn);
	void FCMPE(ARM64Reg Rn, ARM64Reg Rm);
	void FCMPE(ARM64Reg Rn);
	void FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn);
	void FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
	void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
	void FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
	void FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn);

	// Conditional select
	void FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);

	// Permute
	void UZP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void TRN1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void ZIP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void UZP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void TRN2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);

	// Shift by immediate
	void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
	void SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
	void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
	void USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
	void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
	void SHRN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
	void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);
	void SXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);
	void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);
	void UXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);

	void SHL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
	void USHR(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
	void SSHR(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);

	// vector x indexed element
	void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index);
	void FMLA(u8 esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index);

	void MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG, bool negate = false);
	void MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG);

	// ABI related
	void ABI_PushRegisters(uint32_t gpr_registers, uint32_t fp_registers);
	void ABI_PopRegisters(uint32_t gpr_registers, uint32_t fp_registers);

private:
	ARM64XEmitter* m_emit;
	inline void Write32(u32 value) { m_emit->Write32(value); }

	// Emitting functions
	void EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
	void EmitScalar2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn);
	void Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
	void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, ARM64Reg Rn);
	void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm);
	void Emit1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
	void EmitConversion(bool sf, bool S, u32 type, u32 rmode, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
	void EmitConversion2(bool sf, bool S, bool direction, u32 type, u32 rmode, u32 opcode, int scale, ARM64Reg Rd, ARM64Reg Rn);
	void EmitCompare(bool M, bool S, u32 op, u32 opcode2, ARM64Reg Rn, ARM64Reg Rm);
	void EmitCondSelect(bool M, bool S, CCFlags cond, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm8);
	void EmitShiftImm(bool Q, bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
	void EmitScalarShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
	void EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn);
	void EmitLoadStoreMultipleStructurePost(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm);
	void EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
	void EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, bool H, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
	void EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
	void EmitConvertScalarToInt(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round, bool sign);
	void EmitScalar3Source(bool isDouble, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra, int opcode);
	void EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
	void EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);

	void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
	void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
	void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
	void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper);
	void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper);
};

class ARM64CodeBlock : public CodeBlock<ARM64XEmitter>
{
private:
	void PoisonMemory(int offset) override;
};
}