// Copyright (C) 2003 Dolphin Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/

#pragma once

#include <vector>
#include <cstdint>

#include "Common/Common.h"
#include "Common/Log.h"
#include "Common/ArmCommon.h"
#include "Common/CodeBlock.h"

// VCVT flags
#define TO_FLOAT      0
#define TO_INT        (1 << 0)
#define IS_SIGNED     (1 << 1)
#define ROUND_TO_ZERO (1 << 2)
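
// The flags combine with bitwise OR. A minimal usage sketch (assuming an
// emitter instance is in scope; VCVT is declared further down): converting
// the F32 in S1 to a signed integer in S0, truncating toward zero:
//
//   VCVT(S0, S1, TO_INT | IS_SIGNED | ROUND_TO_ZERO);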

namespace ArmGen
{
enum ARMReg
{
    // GPRs
    R0 = 0, R1, R2, R3, R4, R5,
    R6, R7, R8, R9, R10, R11,

    // SPRs
    // R13 - R15 are SP, LR, and PC.
    // Almost always referred to by name instead of register number.
    R12 = 12, R13 = 13, R14 = 14, R15 = 15,
    R_IP = 12, R_SP = 13, R_LR = 14, R_PC = 15,

    // VFP single precision registers
    S0, S1, S2, S3, S4, S5, S6,
    S7, S8, S9, S10, S11, S12, S13,
    S14, S15, S16, S17, S18, S19, S20,
    S21, S22, S23, S24, S25, S26, S27,
    S28, S29, S30, S31,

    // VFP double precision registers
    D0, D1, D2, D3, D4, D5, D6, D7,
    D8, D9, D10, D11, D12, D13, D14, D15,
    D16, D17, D18, D19, D20, D21, D22, D23,
    D24, D25, D26, D27, D28, D29, D30, D31,

    // ASIMD quad-word registers
    Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7,
    Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15,

    // for NEON VLD/VST instructions
    REG_UPDATE = R13,
    INVALID_REG = 0xFFFFFFFF
};
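
// A note on aliasing (hardware fact, not expressed by the enum values above):
// the register files overlap. Q0 covers D0-D1, and D0-D15 alias S0-S31
// (D0 is S0/S1, and so on). These enum values are plain identifiers;
// EncodeVd/EncodeVn/EncodeVm below map them to the actual instruction bits.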

enum ShiftType
{
    ST_LSL = 0,
    ST_ASL = 0,
    ST_LSR = 1,
    ST_ASR = 2,
    ST_ROR = 3,
    ST_RRX = 4
};

enum IntegerSize
{
    I_I8 = 0,
    I_I16,
    I_I32,
    I_I64
};

enum
{
    NUMGPRs = 13,
};

class ARMXEmitter;

enum OpType
{
    TYPE_IMM = 0,
    TYPE_REG,
    TYPE_IMMSREG,
    TYPE_RSR,
    TYPE_MEM
};

// This is no longer a proper operand2 class. Need to split up.
class Operand2
{
    friend class ARMXEmitter;
protected:
    u32 Value;

private:
    OpType Type;

    // IMM types
    u8 Rotation; // Only for u8 values

    // Register types
    u8 IndexOrShift;
    ShiftType Shift;
public:
    OpType GetType() const
    {
        return Type;
    }
    Operand2() {}
    Operand2(u32 imm, OpType type = TYPE_IMM)
    {
        Type = type;
        Value = imm;
        Rotation = 0;
    }

    Operand2(ARMReg Reg)
    {
        Type = TYPE_REG;
        Value = Reg;
        Rotation = 0;
    }
    Operand2(u8 imm, u8 rotation)
    {
        Type = TYPE_IMM;
        Value = imm;
        Rotation = rotation;
    }
    Operand2(ARMReg base, ShiftType type, ARMReg shift) // RSR
    {
        Type = TYPE_RSR;
        _assert_msg_(type != ST_RRX, "Invalid Operand2: RRX does not take a register shift amount");
        IndexOrShift = shift;
        Shift = type;
        Value = base;
    }

    Operand2(ARMReg base, ShiftType type, u8 shift) // For IMM shifted register
    {
        if (shift == 32) shift = 0;
        switch (type)
        {
        case ST_LSL:
            _assert_msg_(shift < 32, "Invalid Operand2: LSL %u", shift);
            break;
        case ST_LSR:
            _assert_msg_(shift <= 32, "Invalid Operand2: LSR %u", shift);
            if (!shift)
                type = ST_LSL;
            if (shift == 32)
                shift = 0;
            break;
        case ST_ASR:
            _assert_msg_(shift < 32, "Invalid Operand2: ASR %u", shift);
            if (!shift)
                type = ST_LSL;
            if (shift == 32)
                shift = 0;
            break;
        case ST_ROR:
            _assert_msg_(shift < 32, "Invalid Operand2: ROR %u", shift);
            if (!shift)
                type = ST_LSL;
            break;
        case ST_RRX:
            _assert_msg_(shift == 0, "Invalid Operand2: RRX does not take an immediate shift amount");
            type = ST_ROR;
            break;
        }
        IndexOrShift = shift;
        Shift = type;
        Value = base;
        Type = TYPE_IMMSREG;
    }
    u32 GetData()
    {
        switch (Type)
        {
        case TYPE_IMM:
            return Imm12Mod(); // This'll need to be changed later
        case TYPE_REG:
            return Rm();
        case TYPE_IMMSREG:
            return IMMSR();
        case TYPE_RSR:
            return RSR();
        default:
            _assert_msg_(false, "GetData with invalid type");
            return 0;
        }
    }
    u32 IMMSR() // IMM shifted register
    {
        _assert_msg_(Type == TYPE_IMMSREG, "IMMSR must be an imm shifted register");
        return ((IndexOrShift & 0x1f) << 7 | (Shift << 5) | Value);
    }
    u32 RSR() // Register shifted register
    {
        _assert_msg_(Type == TYPE_RSR, "RSR must be a register shifted register");
        return (IndexOrShift << 8) | (Shift << 5) | 0x10 | Value;
    }
    u32 Rm() const
    {
        _assert_msg_(Type == TYPE_REG, "Rm must be a register");
        return Value;
    }

    u32 Imm5() const
    {
        _assert_msg_((Type == TYPE_IMM), "Imm5 not IMM value");
        return ((Value & 0x0000001F) << 7);
    }
    u32 Imm8() const
    {
        _assert_msg_((Type == TYPE_IMM), "Imm8 not IMM value");
        return Value & 0xFF;
    }
    u32 Imm8Rot() const // IMM8 with rotation
    {
        _assert_msg_((Type == TYPE_IMM), "Imm8Rot not IMM value");
        _assert_msg_((Rotation & 0xE1) == 0, "Invalid Operand2: immediate rotation %u", Rotation);
        return (1 << 25) | (Rotation << 7) | (Value & 0x000000FF);
    }
    u32 Imm12() const
    {
        _assert_msg_((Type == TYPE_IMM), "Imm12 not IMM");
        return (Value & 0x00000FFF);
    }

    u32 Imm12Mod() const
    {
        // This is an IMM12 with the top four bits being rotation and the
        // bottom eight being an IMM. This is for instructions that need to
        // expand an 8-bit IMM to a 32-bit value and gives you some rotation
        // as well. Each rotation rotates to the right by 2 bits.
        _assert_msg_((Type == TYPE_IMM), "Imm12Mod not IMM");
        return ((Rotation & 0xF) << 8) | (Value & 0xFF);
    }
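    // (Worked example for Imm12Mod: 0x0000FF00 fits this scheme as imm8 = 0xFF
    // with rotation field 12, since 0xFF rotated right by 2 * 12 = 24 bits -
    // equivalently, left by 8 - gives 0x0000FF00.)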
    u32 Imm16() const
    {
        _assert_msg_((Type == TYPE_IMM), "Imm16 not IMM");
        return ((Value & 0xF000) << 4) | (Value & 0x0FFF);
    }
    u32 Imm16Low() const
    {
        return Imm16();
    }
    u32 Imm16High() const // Returns high 16 bits
    {
        _assert_msg_((Type == TYPE_IMM), "Imm16High not IMM");
        return (((Value >> 16) & 0xF000) << 4) | ((Value >> 16) & 0x0FFF);
    }
    u32 Imm24() const
    {
        _assert_msg_((Type == TYPE_IMM), "Imm24 not IMM");
        return (Value & 0x00FFFFFF);
    }
    // NEON and ASIMD specific
    u32 Imm8ASIMD() const
    {
        _assert_msg_((Type == TYPE_IMM), "Imm8ASIMD not IMM");
        return ((Value & 0x80) << 17) | ((Value & 0x70) << 12) | (Value & 0xF);
    }
    u32 Imm8VFP() const
    {
        _assert_msg_((Type == TYPE_IMM), "Imm8VFP not IMM");
        return ((Value & 0xF0) << 12) | (Value & 0xF);
    }
};

// Use these when you don't know if an imm can be represented as an Operand2.
// This lets you generate both an optimal and a fallback solution by checking
// the return value, which will be false if these fail to find an Operand2
// that represents your 32-bit imm value.
bool TryMakeOperand2(u32 imm, Operand2 &op2);
bool TryMakeOperand2_AllowInverse(u32 imm, Operand2 &op2, bool *inverse);
bool TryMakeOperand2_AllowNegation(s32 imm, Operand2 &op2, bool *negated);

// Use this only when you know imm can be made into an Operand2.
Operand2 AssumeMakeOperand2(u32 imm);
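
// A sketch of the intended pattern (inside an ARMXEmitter-derived class;
// MOVI2R is the constant-loading wrapper declared further down):
//
//   Operand2 op2;
//   if (TryMakeOperand2(imm, op2)) {
//       ADD(R0, R0, op2);   // optimal: a single data-processing instruction
//   } else {
//       MOVI2R(R1, imm);    // fallback: build the constant in a scratch reg
//       ADD(R0, R0, R1);
//   }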

inline Operand2 R(ARMReg Reg) { return Operand2(Reg, TYPE_REG); }
inline Operand2 IMM(u32 Imm) { return Operand2(Imm, TYPE_IMM); }
inline Operand2 Mem(void *ptr) { return Operand2((u32)(uintptr_t)ptr, TYPE_IMM); }
// Usage: struct {int e;} s; STRUCT_OFF(s, e)
#define STRUCT_OFF(str, elem) ((u32)((u32)&(str).elem - (u32)&(str)))


struct FixupBranch
{
    u8 *ptr;
    u32 condition; // Remembers our condition at the time
    int type; // 0 = B, 1 = BL
};

struct LiteralPool
{
    intptr_t loc;
    u8 *ldr_address;
    u32 val;
};

typedef const u8 *JumpTarget;

// XXX: Stop polluting the global namespace
const u32 I_8 = (1 << 0);
const u32 I_16 = (1 << 1);
const u32 I_32 = (1 << 2);
const u32 I_64 = (1 << 3);
const u32 I_SIGNED = (1 << 4);
const u32 I_UNSIGNED = (1 << 5);
const u32 F_32 = (1 << 6);
const u32 I_POLYNOMIAL = (1 << 7); // Only used in VMUL/VMULL

enum VIMMMode {
    VIMM___x___x = 0x0, // 0000 VMOV
    VIMM__x___x_ = 0x2, // 0010
    VIMM_x___x__ = 0x4, // 0100
    VIMMx___x___ = 0x6, // 0110
    VIMM_x_x_x_x = 0x8, // 1000
    VIMMx_x_x_x_ = 0xA, // 1010
    VIMM__x1__x1 = 0xC, // 1100
    VIMM_x11_x11 = 0xD, // 1101
    VIMMxxxxxxxx = 0xE, // 1110 // op == 0
    VIMMf000f000 = 0xF, // 1111 // op == 0 (really aBbbbbbc defgh000 00000000 00000000, where B = NOT b)
    VIMMbits2bytes = 0x1E, // Bit replication into bytes! Easily creates 11111111 00000000 masks!
};
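
// Sketch of how these modes are meant to be used with VMOV_imm (declared in
// ARMXEmitter below). As I read the lane diagrams above - treat this as an
// assumption, not a spec - VIMM___x___x places the 8-bit immediate in the low
// byte of each 32-bit lane, so:
//
//   VMOV_imm(I_32, Q0, VIMM___x___x, 0x4D);
//
// would set every 32-bit lane of Q0 to 0x0000004D.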

u32 EncodeVd(ARMReg Vd);
u32 EncodeVn(ARMReg Vn);
u32 EncodeVm(ARMReg Vm);

u32 encodedSize(u32 value);

// Subtracts the base from the register to give us the real one
ARMReg SubBase(ARMReg Reg);

inline bool IsQ(ARMReg r) {
    return r >= Q0 && r <= Q15;
}

inline bool IsD(ARMReg r) {
    return r >= D0 && r <= D31;
}

// See A.7.1 in the ARMv7-A reference manual.
// VMUL F32 scalars can only go up to D15[0], D15[1] - higher scalars cannot be individually addressed.
ARMReg DScalar(ARMReg dreg, int subScalar);
ARMReg QScalar(ARMReg qreg, int subScalar);
inline ARMReg XScalar(ARMReg reg, int subScalar) {
    if (IsQ(reg))
        return QScalar(reg, subScalar);
    else
        return DScalar(reg, subScalar);
}

const char *ARMRegAsString(ARMReg reg);

// Get the two halves of a Q register.
inline ARMReg D_0(ARMReg q) {
    if (q >= Q0 && q <= Q15) {
        return ARMReg(D0 + (q - Q0) * 2);
    } else if (q >= D0 && q <= D31) {
        return q;
    } else {
        return INVALID_REG;
    }
}
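// Note: unlike D_0, D_1 assumes its argument really is a Q register;
// passing anything else yields a meaningless register value.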
inline ARMReg D_1(ARMReg q) {
    return ARMReg(D0 + (q - Q0) * 2 + 1);
}

enum NEONAlignment {
    ALIGN_NONE = 0,
    ALIGN_64 = 1,
    ALIGN_128 = 2,
    ALIGN_256 = 3
};


class NEONXEmitter;

class ARMXEmitter
{
    friend struct OpArg; // for Write8 etc
    friend class NEONXEmitter;
private:
    u8 *code, *startcode;
    u8 *lastCacheFlushEnd;
    u32 condition;
    std::vector<LiteralPool> currentLitPool;

    void WriteStoreOp(u32 Op, ARMReg Rt, ARMReg Rn, Operand2 op2, bool RegAdd);
    void WriteRegStoreOp(u32 op, ARMReg dest, bool WriteBack, u16 RegList);
    void WriteVRegStoreOp(u32 op, ARMReg dest, bool Double, bool WriteBack, ARMReg firstreg, u8 numregs);
    void WriteShiftedDataOp(u32 op, bool SetFlags, ARMReg dest, ARMReg src, ARMReg op2);
    void WriteShiftedDataOp(u32 op, bool SetFlags, ARMReg dest, ARMReg src, Operand2 op2);
    void WriteSignedMultiply(u32 Op, u32 Op2, u32 Op3, ARMReg dest, ARMReg r1, ARMReg r2);

    void WriteVFPDataOp(u32 Op, ARMReg Vd, ARMReg Vn, ARMReg Vm);

    void Write4OpMultiply(u32 op, ARMReg destLo, ARMReg destHi, ARMReg rn, ARMReg rm);

    // New ops
    void WriteInstruction(u32 op, ARMReg Rd, ARMReg Rn, Operand2 Rm, bool SetFlags = false);

    void WriteVLDST1(bool load, u32 Size, ARMReg Vd, ARMReg Rn, int regCount, NEONAlignment align, ARMReg Rm);
    void WriteVLDST1_lane(bool load, u32 Size, ARMReg Vd, ARMReg Rn, int lane, bool aligned, ARMReg Rm);

    void WriteVimm(ARMReg Vd, int cmode, u8 imm, int op);

    void EncodeShiftByImm(u32 Size, ARMReg Vd, ARMReg Vm, int shiftAmount, u8 opcode, bool quad, bool inverse, bool halve);

protected:
    inline void Write32(u32 value) { *(u32 *)code = value; code += 4; }

public:
    ARMXEmitter() : code(0), startcode(0), lastCacheFlushEnd(0) {
        condition = CC_AL << 28;
    }
    ARMXEmitter(u8 *code_ptr) {
        code = code_ptr;
        lastCacheFlushEnd = code_ptr;
        startcode = code_ptr;
        condition = CC_AL << 28;
    }
    virtual ~ARMXEmitter() {}

    void SetCodePointer(u8 *ptr, u8 *writePtr);
    const u8 *GetCodePointer() const;

    void ReserveCodeSpace(u32 bytes);
    const u8 *AlignCode16();
    const u8 *AlignCodePage();
    void FlushIcache();
    void FlushIcacheSection(u8 *start, u8 *end);
    u8 *GetWritableCodePtr();

    void FlushLitPool();
    void AddNewLit(u32 val);
    bool TrySetValue_TwoOp(ARMReg reg, u32 val);

    CCFlags GetCC() { return CCFlags(condition >> 28); }
    void SetCC(CCFlags cond = CC_AL);
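
    // Sketch of conditional emission (CCFlags values such as CC_EQ presumably
    // come from ArmCommon.h): every instruction emitted after SetCC(cond)
    // carries that condition until it is reset.
    //
    //   SetCC(CC_EQ);
    //   MOV(R0, Operand2(1));  // emitted as MOVEQ
    //   SetCC();               // back to unconditional (CC_AL)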

    // Special purpose instructions

    // Dynamic endian switching
    void SETEND(bool BE);
    // Debug breakpoint
    void BKPT(u16 arg);

    // Hint instruction
    void YIELD();

    // Do nothing
    void NOP(int count = 1); // nop padding

#ifdef CALL
#undef CALL
#endif

    // Branching
    FixupBranch B();
    FixupBranch B_CC(CCFlags Cond);
    void B_CC(CCFlags Cond, const void *fnptr);
    FixupBranch BL();
    FixupBranch BL_CC(CCFlags Cond);
    void SetJumpTarget(FixupBranch const &branch);
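
    // Typical forward-branch pattern: B_CC returns a FixupBranch whose
    // target is patched in later by SetJumpTarget.
    //
    //   FixupBranch skip = B_CC(CC_EQ);  // branch over the block if equal
    //   ...                              // code skipped when the branch is taken
    //   SetJumpTarget(skip);             // resolve the branch to here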

    void B(const void *fnptr);
    void B(ARMReg src);
    void BL(const void *fnptr);
    void BL(ARMReg src);
    bool BLInRange(const void *fnptr) const;

    void PUSH(const int num, ...);
    void POP(const int num, ...);
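
    // These are variadic: the first argument is the register count.
    // For example, saving some callee-saved registers and the link register:
    //
    //   PUSH(5, R4, R5, R6, R7, R_LR);
    //   ...
    //   POP(5, R4, R5, R6, R7, R_LR);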

    // New data ops
    void AND (ARMReg Rd, ARMReg Rn, Operand2 Rm);
    void ANDS(ARMReg Rd, ARMReg Rn, Operand2 Rm);
    void EOR (ARMReg dest, ARMReg src, Operand2 op2);
    void EORS(ARMReg dest, ARMReg src, Operand2 op2);
    void SUB (ARMReg dest, ARMReg src, Operand2 op2);
    void SUBS(ARMReg dest, ARMReg src, Operand2 op2);
    void RSB (ARMReg dest, ARMReg src, Operand2 op2);
    void RSBS(ARMReg dest, ARMReg src, Operand2 op2);
    void ADD (ARMReg dest, ARMReg src, Operand2 op2);
    void ADDS(ARMReg dest, ARMReg src, Operand2 op2);
    void ADC (ARMReg dest, ARMReg src, Operand2 op2);
    void ADCS(ARMReg dest, ARMReg src, Operand2 op2);
    void LSL (ARMReg dest, ARMReg src, Operand2 op2);
    void LSL (ARMReg dest, ARMReg src, ARMReg op2);
    void LSLS(ARMReg dest, ARMReg src, Operand2 op2);
    void LSLS(ARMReg dest, ARMReg src, ARMReg op2);
    void LSR (ARMReg dest, ARMReg src, Operand2 op2);
    void LSRS(ARMReg dest, ARMReg src, Operand2 op2);
    void LSR (ARMReg dest, ARMReg src, ARMReg op2);
    void LSRS(ARMReg dest, ARMReg src, ARMReg op2);
    void ASR (ARMReg dest, ARMReg src, Operand2 op2);
    void ASRS(ARMReg dest, ARMReg src, Operand2 op2);
    void ASR (ARMReg dest, ARMReg src, ARMReg op2);
    void ASRS(ARMReg dest, ARMReg src, ARMReg op2);

    void SBC (ARMReg dest, ARMReg src, Operand2 op2);
    void SBCS(ARMReg dest, ARMReg src, Operand2 op2);
    void RBIT(ARMReg dest, ARMReg src);
    void REV (ARMReg dest, ARMReg src);
    void REV16(ARMReg dest, ARMReg src);
    void RSC (ARMReg dest, ARMReg src, Operand2 op2);
    void RSCS(ARMReg dest, ARMReg src, Operand2 op2);
    void TST (ARMReg src, Operand2 op2);
    void TEQ (ARMReg src, Operand2 op2);
    void CMP (ARMReg src, Operand2 op2);
    void CMN (ARMReg src, Operand2 op2);
    void ORR (ARMReg dest, ARMReg src, Operand2 op2);
    void ORRS(ARMReg dest, ARMReg src, Operand2 op2);
    void MOV (ARMReg dest, Operand2 op2);
    void MOVS(ARMReg dest, Operand2 op2);
    void BIC (ARMReg dest, ARMReg src, Operand2 op2); // BIC = ANDN
    void BICS(ARMReg dest, ARMReg src, Operand2 op2);
    void MVN (ARMReg dest, Operand2 op2);
    void MVNS(ARMReg dest, Operand2 op2);
    void MOVW(ARMReg dest, Operand2 op2);
    void MOVT(ARMReg dest, Operand2 op2, bool TopBits = false);

    // UDIV and SDIV are only available on CPUs that have
    // the IDIVA hardware capability.
    void UDIV(ARMReg dest, ARMReg dividend, ARMReg divisor);
    void SDIV(ARMReg dest, ARMReg dividend, ARMReg divisor);

    void MUL (ARMReg dest, ARMReg src, ARMReg op2);
    void MULS(ARMReg dest, ARMReg src, ARMReg op2);

    void UMULL(ARMReg destLo, ARMReg destHi, ARMReg rn, ARMReg rm);
    void SMULL(ARMReg destLo, ARMReg destHi, ARMReg rn, ARMReg rm);

    void UMLAL(ARMReg destLo, ARMReg destHi, ARMReg rn, ARMReg rm);
    void SMLAL(ARMReg destLo, ARMReg destHi, ARMReg rn, ARMReg rm);

    void SXTB(ARMReg dest, ARMReg op2);
    void SXTH(ARMReg dest, ARMReg op2, u8 rotation = 0);
    void SXTAH(ARMReg dest, ARMReg src, ARMReg op2, u8 rotation = 0);
    void BFI(ARMReg rd, ARMReg rn, u8 lsb, u8 width);
    void BFC(ARMReg rd, u8 lsb, u8 width);
    void UBFX(ARMReg dest, ARMReg op2, u8 lsb, u8 width);
    void SBFX(ARMReg dest, ARMReg op2, u8 lsb, u8 width);
    void CLZ(ARMReg rd, ARMReg rm);
    void PLD(ARMReg rd, int offset, bool forWrite = false);

    // Using just MSR here messed with our defines on the PPC side of things
    // (back when this code was in Dolphin), so these get an underscore
    // prefix. Bit annoying.
    void _MSR(bool nzcvq, bool g, Operand2 op2);
    void _MSR(bool nzcvq, bool g, ARMReg src);
    void MRS(ARMReg dest);

    // Memory load/store operations
    void LDR  (ARMReg dest, ARMReg base, Operand2 op2 = 0, bool RegAdd = true);
    void LDRB (ARMReg dest, ARMReg base, Operand2 op2 = 0, bool RegAdd = true);
    void LDRH (ARMReg dest, ARMReg base, Operand2 op2 = 0, bool RegAdd = true);
    void LDRSB(ARMReg dest, ARMReg base, Operand2 op2 = 0, bool RegAdd = true);
    void LDRSH(ARMReg dest, ARMReg base, Operand2 op2 = 0, bool RegAdd = true);
    void STR  (ARMReg result, ARMReg base, Operand2 op2 = 0, bool RegAdd = true);
    void STRB (ARMReg result, ARMReg base, Operand2 op2 = 0, bool RegAdd = true);
    void STRH (ARMReg result, ARMReg base, Operand2 op2 = 0, bool RegAdd = true);

    void STMFD(ARMReg dest, bool WriteBack, const int Regnum, ...);
    void LDMFD(ARMReg dest, bool WriteBack, const int Regnum, ...);
    void STMIA(ARMReg dest, bool WriteBack, const int Regnum, ...);
    void LDMIA(ARMReg dest, bool WriteBack, const int Regnum, ...);
    void STM(ARMReg dest, bool Add, bool Before, bool WriteBack, const int Regnum, ...);
    void LDM(ARMReg dest, bool Add, bool Before, bool WriteBack, const int Regnum, ...);
    void STMBitmask(ARMReg dest, bool Add, bool Before, bool WriteBack, const u16 RegList);
    void LDMBitmask(ARMReg dest, bool Add, bool Before, bool WriteBack, const u16 RegList);

    // Exclusive access operations
    void LDREX(ARMReg dest, ARMReg base);
    // result is set to 0 if the instruction managed to store the value, 1 otherwise.
    void STREX(ARMReg result, ARMReg base, ARMReg op);
    void DMB();
    void SVC(Operand2 op);
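
    // Sketch of the usual exclusive retry loop: atomically increment the
    // word at [R1], using R0 and R2 as scratch registers.
    //
    //   const u8 *retry = GetCodePointer();
    //   LDREX(R0, R1);
    //   ADD(R0, R0, Operand2(1));
    //   STREX(R2, R1, R0);
    //   CMP(R2, Operand2(0));
    //   B_CC(CC_NE, retry);      // store failed, try again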

    // NEON and ASIMD instructions
    // None of these are emitted with a condition, since ARM has deprecated
    // conditional execution of ASIMD instructions, and ASIMD instructions
    // don't even have a conditional encoding.

    // NEON only
    void VABD(IntegerSize size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VADD(IntegerSize size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VSUB(IntegerSize size, ARMReg Vd, ARMReg Vn, ARMReg Vm);

    // VFP only
    void VLDMIA(ARMReg dest, bool WriteBack, ARMReg firstreg, int numregs);
    void VSTMIA(ARMReg dest, bool WriteBack, ARMReg firstreg, int numregs);
    void VLDMDB(ARMReg dest, bool WriteBack, ARMReg firstreg, int numregs);
    void VSTMDB(ARMReg dest, bool WriteBack, ARMReg firstreg, int numregs);
    void VPUSH(ARMReg firstvreg, int numvregs) {
        VSTMDB(R_SP, true, firstvreg, numvregs);
    }
    void VPOP(ARMReg firstvreg, int numvregs) {
        VLDMIA(R_SP, true, firstvreg, numvregs);
    }
    void VLDR(ARMReg Dest, ARMReg Base, s16 offset);
    void VSTR(ARMReg Src, ARMReg Base, s16 offset);
    void VCMP(ARMReg Vd, ARMReg Vm);
    void VCMPE(ARMReg Vd, ARMReg Vm);
    // Compares against zero
    void VCMP(ARMReg Vd);
    void VCMPE(ARMReg Vd);

    void VNMLA(ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VNMLS(ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VNMUL(ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VDIV(ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VSQRT(ARMReg Vd, ARMReg Vm);

    // NEON and VFP
    void VADD(ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VSUB(ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VABS(ARMReg Vd, ARMReg Vm);
    void VNEG(ARMReg Vd, ARMReg Vm);
    void VMUL(ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VMLA(ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VMLS(ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VMOV(ARMReg Dest, Operand2 op2);
    void VMOV(ARMReg Dest, ARMReg Src, bool high);
    void VMOV(ARMReg Dest, ARMReg Src);
    // Either Vd, Rt, Rt2 or Rt, Rt2, Vd.
    void VMOV(ARMReg Dest, ARMReg Src1, ARMReg Src2);
    void VCVT(ARMReg Dest, ARMReg Src, int flags);

    // NEON; runtime support needs to be checked (available if VFP4 is supported).
    void VCVTF32F16(ARMReg Dest, ARMReg Src);
    void VCVTF16F32(ARMReg Dest, ARMReg Src);

    void VABA(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VABAL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VABD(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VABDL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VABS(u32 Size, ARMReg Vd, ARMReg Vm);
    void VACGE(ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VACGT(ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VACLE(ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VACLT(ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VADD(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VADDHN(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VADDL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VADDW(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VBIF(ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VBIT(ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VBSL(ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VCEQ(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VCEQ(u32 Size, ARMReg Vd, ARMReg Vm);
    void VCGE(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VCGE(u32 Size, ARMReg Vd, ARMReg Vm);
    void VCGT(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VCGT(u32 Size, ARMReg Vd, ARMReg Vm);
    void VCLE(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VCLE(u32 Size, ARMReg Vd, ARMReg Vm);
    void VCLS(u32 Size, ARMReg Vd, ARMReg Vm);
    void VCLT(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VCLT(u32 Size, ARMReg Vd, ARMReg Vm);
    void VCLZ(u32 Size, ARMReg Vd, ARMReg Vm);
    void VCNT(u32 Size, ARMReg Vd, ARMReg Vm);
    void VDUP(u32 Size, ARMReg Vd, ARMReg Vm, u8 index);
    void VDUP(u32 Size, ARMReg Vd, ARMReg Rt);
    void VEXT(ARMReg Vd, ARMReg Vn, ARMReg Vm, u8 index);
    void VFMA(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VFMS(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VHADD(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VHSUB(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VMAX(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VMIN(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);

    // Three registers
    void VMLA(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VMLS(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VMLAL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VMLSL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VMUL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VMULL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQDMLAL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQDMLSL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQDMULH(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQDMULL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQRDMULH(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);

    // Two registers and a scalar
    // These two are super useful for matrix multiplication
    void VMUL_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VMLA_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);

    // TODO:
    /*
    void VMLS_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VMLAL_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VMLSL_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VMULL_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQDMLAL_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQDMLSL_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQDMULH_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQDMULL_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQRDMULH_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    */

    // Vector bitwise. These don't have an element size for obvious reasons.
    void VAND(ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VBIC(ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VEOR(ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VORN(ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VORR(ARMReg Vd, ARMReg Vn, ARMReg Vm);
    inline void VMOV_neon(ARMReg Dest, ARMReg Src) {
        VORR(Dest, Src, Src);
    }
    void VMOV_neon(u32 Size, ARMReg Vd, u32 imm);
    void VMOV_neon(u32 Size, ARMReg Vd, float imm) {
        _dbg_assert_msg_(Size == F_32, "Expecting F_32 immediate for VMOV_neon float arg.");
        union {
            float f;
            u32 u;
        } val;
        val.f = imm;
        VMOV_neon(I_32, Vd, val.u);
    }
    void VMOV_neon(u32 Size, ARMReg Vd, ARMReg Rt, int lane);

    void VNEG(u32 Size, ARMReg Vd, ARMReg Vm);
    void VMVN(ARMReg Vd, ARMReg Vm);
    void VPADAL(u32 Size, ARMReg Vd, ARMReg Vm);
    void VPADD(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VPADDL(u32 Size, ARMReg Vd, ARMReg Vm);
    void VPMAX(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VPMIN(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQABS(u32 Size, ARMReg Vd, ARMReg Vm);
    void VQADD(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQNEG(u32 Size, ARMReg Vd, ARMReg Vm);
    void VQRSHL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQSHL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQSUB(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VRADDHN(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VRECPE(u32 Size, ARMReg Vd, ARMReg Vm);
    void VRECPS(ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VRHADD(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VRSHL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VRSQRTE(u32 Size, ARMReg Vd, ARMReg Vm);
    void VRSQRTS(ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VRSUBHN(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VSHL(u32 Size, ARMReg Vd, ARMReg Vm, ARMReg Vn); // Register shift
    void VSUB(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VSUBHN(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VSUBL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VSUBW(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VSWP(ARMReg Vd, ARMReg Vm);
    void VTRN(u32 Size, ARMReg Vd, ARMReg Vm);
    void VTST(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VUZP(u32 Size, ARMReg Vd, ARMReg Vm);
    void VZIP(u32 Size, ARMReg Vd, ARMReg Vm);
    void VREVX(u32 size, u32 Size, ARMReg Vd, ARMReg Vm);
    void VREV64(u32 Size, ARMReg Vd, ARMReg Vm);
    void VREV32(u32 Size, ARMReg Vd, ARMReg Vm);
    void VREV16(u32 Size, ARMReg Vd, ARMReg Vm);

    // NEON immediate instructions

    void VMOV_imm(u32 Size, ARMReg Vd, VIMMMode type, int imm);
    void VMOV_immf(ARMReg Vd, float value); // This only works with a select few values (1.0f and -1.0f).

    void VORR_imm(u32 Size, ARMReg Vd, VIMMMode type, int imm);
    void VMVN_imm(u32 Size, ARMReg Vd, VIMMMode type, int imm);
    void VBIC_imm(u32 Size, ARMReg Vd, VIMMMode type, int imm);

    // Widening and narrowing moves
    void VMOVL(u32 Size, ARMReg Vd, ARMReg Vm);
    void VMOVN(u32 Size, ARMReg Vd, ARMReg Vm);
    void VQMOVN(u32 Size, ARMReg Vd, ARMReg Vm);
    void VQMOVUN(u32 Size, ARMReg Vd, ARMReg Vm);

    // Shifts by immediate
    void VSHL(u32 Size, ARMReg Vd, ARMReg Vm, int shiftAmount);
    void VSHLL(u32 Size, ARMReg Vd, ARMReg Vm, int shiftAmount); // widening
    void VSHR(u32 Size, ARMReg Vd, ARMReg Vm, int shiftAmount);
    void VSHRN(u32 Size, ARMReg Vd, ARMReg Vm, int shiftAmount); // narrowing

    // Vector VCVT
    void VCVT(u32 DestSize, ARMReg Dest, ARMReg Src);

    // Notes:
    // Rm == R_PC is interpreted as no offset; otherwise the effective address is the sum of Rn and Rm.
    // Rm == R13 is interpreted as writeback: VLD1, ... [Rn]! - the REG_UPDATE pseudo-register was added for this.

    // Load/store multiple registers full of elements (a register is a D register).
    // Specifying alignment when it can be guaranteed is documented to improve load/store performance.
    // For example, when loading a set of four 64-bit registers that we know is 32-byte aligned, we should specify ALIGN_256.
    void VLD1(u32 Size, ARMReg Vd, ARMReg Rn, int regCount, NEONAlignment align = ALIGN_NONE, ARMReg Rm = R_PC);
    void VST1(u32 Size, ARMReg Vd, ARMReg Rn, int regCount, NEONAlignment align = ALIGN_NONE, ARMReg Rm = R_PC);
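
    // Following the notes above, a sketch: load four D registers (32 bytes)
    // from a 32-byte-aligned pointer held in R0, first without and then with
    // post-increment writeback via the REG_UPDATE pseudo-register:
    //
    //   VLD1(F_32, D0, R0, 4, ALIGN_256);              // [R0], R0 unchanged
    //   VLD1(F_32, D0, R0, 4, ALIGN_256, REG_UPDATE);  // [R0]!, R0 += 32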

    // Load/store single lanes of D registers
    void VLD1_lane(u32 Size, ARMReg Vd, ARMReg Rn, int lane, bool aligned, ARMReg Rm = R_PC);
    void VST1_lane(u32 Size, ARMReg Vd, ARMReg Rn, int lane, bool aligned, ARMReg Rm = R_PC);

    // Load one value into all lanes of a D or a Q register (either supported, all formats should work).
    void VLD1_all_lanes(u32 Size, ARMReg Vd, ARMReg Rn, bool aligned, ARMReg Rm = R_PC);

    /*
    // Deinterleave two loads... or something. TODO
    void VLD2(u32 Size, ARMReg Vd, ARMReg Rn, int regCount, NEONAlignment align = ALIGN_NONE, ARMReg Rm = R_PC);
    void VST2(u32 Size, ARMReg Vd, ARMReg Rn, int regCount, NEONAlignment align = ALIGN_NONE, ARMReg Rm = R_PC);

    void VLD2_lane(u32 Size, ARMReg Vd, ARMReg Rn, int lane, ARMReg Rm = R_PC);
    void VST2_lane(u32 Size, ARMReg Vd, ARMReg Rn, int lane, ARMReg Rm = R_PC);

    void VLD3(u32 Size, ARMReg Vd, ARMReg Rn, int regCount, NEONAlignment align = ALIGN_NONE, ARMReg Rm = R_PC);
    void VST3(u32 Size, ARMReg Vd, ARMReg Rn, int regCount, NEONAlignment align = ALIGN_NONE, ARMReg Rm = R_PC);

    void VLD3_lane(u32 Size, ARMReg Vd, ARMReg Rn, int lane, ARMReg Rm = R_PC);
    void VST3_lane(u32 Size, ARMReg Vd, ARMReg Rn, int lane, ARMReg Rm = R_PC);

    void VLD4(u32 Size, ARMReg Vd, ARMReg Rn, int regCount, NEONAlignment align = ALIGN_NONE, ARMReg Rm = R_PC);
    void VST4(u32 Size, ARMReg Vd, ARMReg Rn, int regCount, NEONAlignment align = ALIGN_NONE, ARMReg Rm = R_PC);

    void VLD4_lane(u32 Size, ARMReg Vd, ARMReg Rn, int lane, ARMReg Rm = R_PC);
    void VST4_lane(u32 Size, ARMReg Vd, ARMReg Rn, int lane, ARMReg Rm = R_PC);
    */

    void VMRS_APSR();
    void VMRS(ARMReg Rt);
    void VMSR(ARMReg Rt);

    void QuickCallFunction(ARMReg scratchreg, const void *func);
    template <typename T> void QuickCallFunction(ARMReg scratchreg, T func) {
        QuickCallFunction(scratchreg, (const void *)func);
    }

    // Wrapper around MOVT/MOVW with fallbacks.
    void MOVI2R(ARMReg reg, u32 val, bool optimize = true);
    void MOVI2FR(ARMReg dest, float val, bool negate = false);
    void MOVI2F(ARMReg dest, float val, ARMReg tempReg, bool negate = false);
    void MOVI2F_neon(ARMReg dest, float val, ARMReg tempReg, bool negate = false);

    // Load pointers without casting
    template <class T> void MOVP2R(ARMReg reg, T *val) {
        MOVI2R(reg, (u32)(uintptr_t)(void *)val);
    }

    void MOVIU2F(ARMReg dest, u32 val, ARMReg tempReg, bool negate = false) {
        union {
            u32 u;
            float f;
        } v = {val};
        MOVI2F(dest, v.f, tempReg, negate);
    }

    void ADDI2R(ARMReg rd, ARMReg rs, u32 val, ARMReg scratch);
    bool TryADDI2R(ARMReg rd, ARMReg rs, u32 val);
    void SUBI2R(ARMReg rd, ARMReg rs, u32 val, ARMReg scratch);
    bool TrySUBI2R(ARMReg rd, ARMReg rs, u32 val);
    void ANDI2R(ARMReg rd, ARMReg rs, u32 val, ARMReg scratch);
    bool TryANDI2R(ARMReg rd, ARMReg rs, u32 val);
    void CMPI2R(ARMReg rs, u32 val, ARMReg scratch);
    bool TryCMPI2R(ARMReg rs, u32 val);
    void TSTI2R(ARMReg rs, u32 val, ARMReg scratch);
    bool TryTSTI2R(ARMReg rs, u32 val);
    void ORI2R(ARMReg rd, ARMReg rs, u32 val, ARMReg scratch);
    bool TryORI2R(ARMReg rd, ARMReg rs, u32 val);
    void EORI2R(ARMReg rd, ARMReg rs, u32 val, ARMReg scratch);
    bool TryEORI2R(ARMReg rd, ARMReg rs, u32 val);
}; // class ARMXEmitter


// Everything that needs to generate machine code should inherit from this.
// You get memory management for free, plus you can use all the MOV etc.
// functions without having to prefix them with gen-> or something similar.

class ARMXCodeBlock : public CodeBlock<ARMXEmitter> {
public:
    void PoisonMemory(int offset) override;
};
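
// A minimal usage sketch, assuming CodeBlock (from Common/CodeBlock.h)
// exposes an AllocCodeSpace-style allocator as in the Dolphin-derived code;
// MyJit and CompileStub are hypothetical names:
//
//   class MyJit : public ARMXCodeBlock {
//   public:
//       void Init() { AllocCodeSpace(1024 * 1024); }
//       const u8 *CompileStub() {
//           const u8 *start = GetCodePointer();
//           MOVI2R(R0, 42);
//           B(R_LR);          // return to caller
//           FlushIcache();
//           return start;
//       }
//   };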

// VFP specific
struct VFPEnc {
    s16 opc1;
    s16 opc2;
};
extern const VFPEnc VFPOps[16][2];
extern const char *VFPOpNames[16];

} // namespace ArmGen