/*
** ARM64 instruction emitter.
** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
**
** Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com.
** Sponsored by Cisco Systems, Inc.
*/

/* -- Constant encoding --------------------------------------------------- */

static uint64_t get_k64val(ASMState *as, IRRef ref)
{
  IRIns *ir = IR(ref);
  if (ir->o == IR_KINT64) {
    return ir_kint64(ir)->u64;
  } else if (ir->o == IR_KGC) {
    return (uint64_t)ir_kgc(ir);
  } else if (ir->o == IR_KPTR || ir->o == IR_KKPTR) {
    return (uint64_t)ir_kptr(ir);
  } else {
    lj_assertA(ir->o == IR_KINT || ir->o == IR_KNULL,
	       "bad 64 bit const IR op %d", ir->o);
    return ir->i;  /* Sign-extended. */
  }
}

/* Encode constant in K12 format for data processing instructions. */
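/* A K12 immediate is a 12-bit unsigned value, optionally shifted left by
** 12 (bit 22). Bit 30 is set for negated constants: since the result is
** XORed into the base opcode (see emit_opk() below), this flips ADD to
** SUB and vice versa.
*/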
static uint32_t emit_isk12(int64_t n)
{
  uint64_t k = (n < 0) ? -n : n;
  uint32_t m = (n < 0) ? 0x40000000 : 0;
  if (k < 0x1000) {
    return A64I_K12|m|A64F_U12(k);
  } else if ((k & 0xfff000) == k) {
    return A64I_K12|m|0x400000|A64F_U12(k>>12);
  }
  return 0;
}

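/* Count leading/trailing zeros. Note: these builtins assume a
** GCC-compatible compiler, and their result is undefined for a zero
** argument, which the callers below take care to avoid.
*/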
#define emit_clz64(n)	__builtin_clzll(n)
#define emit_ctz64(n)	__builtin_ctzll(n)

/* Encode constant in K13 format for logical data processing instructions. */
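/* A K13 (logical) immediate encodes a contiguous run of ones within an
** element of 2, 4, 8, 16, 32 or 64 bits, possibly rotated, replicated to
** fill the register. E.g. 0x00ff00ff00ff00ff is encodable (16 bit element
** with 8 contiguous ones), whereas 0x1234 is not (ones not contiguous).
*/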
static uint32_t emit_isk13(uint64_t n, int is64)
{
  int inv = 0, w = 128, lz, tz;
  if (n & 1) { n = ~n; w = 64; inv = 1; }  /* Avoid wrap-around of ones. */
  if (!n) return 0;  /* Neither all-zero nor all-ones are allowed. */
  do {  /* Find the repeat width. */
    if (is64 && (uint32_t)(n^(n>>32))) break;
    n = (uint32_t)n;
    if (!n) return 0;  /* Ditto when passing n=0xffffffff and is64=0. */
    w = 32; if ((n^(n>>16)) & 0xffff) break;
    n = n & 0xffff; w = 16; if ((n^(n>>8)) & 0xff) break;
    n = n & 0xff; w = 8; if ((n^(n>>4)) & 0xf) break;
    n = n & 0xf; w = 4; if ((n^(n>>2)) & 0x3) break;
    n = n & 0x3; w = 2;
  } while (0);
  lz = emit_clz64(n);
  tz = emit_ctz64(n);
  if ((int64_t)(n << lz) >> (lz+tz) != -1ll) return 0; /* Non-contiguous? */
  if (inv)
    return A64I_K13 | (((lz-w) & 127) << 16) | (((lz+tz-w-1) & 63) << 10);
  else
    return A64I_K13 | ((w-tz) << 16) | (((63-lz-tz-w-w) & 63) << 10);
}

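/* Check whether a double fits the 8-bit FMOV immediate: the low 48 bits
** must be zero and the top 9 exponent bits must be 0x100 or 0x0ff, i.e.
** NOT(b6) followed by eight copies of b6 in the imm8 expansion. Returns
** the packed imm8 (sign, 3 exponent bits, 4 mantissa bits) or ~0u.
*/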
static uint32_t emit_isfpk64(uint64_t n)
{
  uint64_t etop9 = ((n >> 54) & 0x1ff);
  if ((n << 16) == 0 && (etop9 == 0x100 || etop9 == 0x0ff)) {
    return (uint32_t)(((n >> 48) & 0x7f) | ((n >> 56) & 0x80));
  }
  return ~0u;
}

/* -- Emit basic instructions --------------------------------------------- */

static void emit_dnma(ASMState *as, A64Ins ai, Reg rd, Reg rn, Reg rm, Reg ra)
{
  *--as->mcp = ai | A64F_D(rd) | A64F_N(rn) | A64F_M(rm) | A64F_A(ra);
}

static void emit_dnm(ASMState *as, A64Ins ai, Reg rd, Reg rn, Reg rm)
{
  *--as->mcp = ai | A64F_D(rd) | A64F_N(rn) | A64F_M(rm);
}

static void emit_dm(ASMState *as, A64Ins ai, Reg rd, Reg rm)
{
  *--as->mcp = ai | A64F_D(rd) | A64F_M(rm);
}

static void emit_dn(ASMState *as, A64Ins ai, Reg rd, Reg rn)
{
  *--as->mcp = ai | A64F_D(rd) | A64F_N(rn);
}

static void emit_nm(ASMState *as, A64Ins ai, Reg rn, Reg rm)
{
  *--as->mcp = ai | A64F_N(rn) | A64F_M(rm);
}

static void emit_d(ASMState *as, A64Ins ai, Reg rd)
{
  *--as->mcp = ai | A64F_D(rd);
}

static void emit_n(ASMState *as, A64Ins ai, Reg rn)
{
  *--as->mcp = ai | A64F_N(rn);
}

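/* Check whether an offset fits the addressing mode of a load/store.
** Returns 1 for the scaled 12-bit unsigned offset form, -1 for the
** unscaled 9-bit signed offset form, 0 if out of range for both.
*/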
static int emit_checkofs(A64Ins ai, int64_t ofs)
{
  int scale = (ai >> 30) & 3;
  if (ofs < 0 || (ofs & ((1<<scale)-1))) {
    return (ofs >= -256 && ofs <= 255) ? -1 : 0;
  } else {
    return (ofs < (4096<<scale)) ? 1 : 0;
  }
}

static void emit_lso(ASMState *as, A64Ins ai, Reg rd, Reg rn, int64_t ofs)
{
  int ot = emit_checkofs(ai, ofs), sc = (ai >> 30) & 3;
  lj_assertA(ot, "load/store offset %d out of range", ofs);
  /* Combine LDR/STR pairs to LDP/STP. */
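  /* LDP/STP take a 7-bit signed, scaled immediate, so the access emitted
  ** just before (i.e. following in program order) must hit ofs-(1<<sc) or
  ** ofs+(1<<sc) off the same base, within [-64<<sc, 63<<sc]. A load that
  ** overwrites its own base register (rd == rn) is never paired, since the
  ** following access needs the updated base; pairing also never crosses
  ** the loop entry (as->mcloop).
  */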
  if ((sc == 2 || sc == 3) &&
      (!(ai & 0x400000) || rd != rn) &&
      as->mcp != as->mcloop) {
    uint32_t prev = *as->mcp & ~A64F_D(31);
    int ofsm = ofs - (1<<sc), ofsp = ofs + (1<<sc);
    A64Ins aip;
    if (prev == (ai | A64F_N(rn) | A64F_U12(ofsm>>sc)) ||
	prev == ((ai^A64I_LS_U) | A64F_N(rn) | A64F_S9(ofsm&0x1ff))) {
      aip = (A64F_A(rd) | A64F_D(*as->mcp & 31));
    } else if (prev == (ai | A64F_N(rn) | A64F_U12(ofsp>>sc)) ||
	       prev == ((ai^A64I_LS_U) | A64F_N(rn) | A64F_S9(ofsp&0x1ff))) {
      aip = (A64F_D(rd) | A64F_A(*as->mcp & 31));
      ofsm = ofs;
    } else {
      goto nopair;
    }
    if (ofsm >= (int)((unsigned int)-64<<sc) && ofsm <= (63<<sc)) {
      *as->mcp = aip | A64F_N(rn) | ((ofsm >> sc) << 15) |
	(ai ^ ((ai == A64I_LDRx || ai == A64I_STRx) ? 0x50000000 : 0x90000000));
      return;
    }
  }
nopair:
  if (ot == 1)
    *--as->mcp = ai | A64F_D(rd) | A64F_N(rn) | A64F_U12(ofs >> sc);
  else
    *--as->mcp = (ai^A64I_LS_U) | A64F_D(rd) | A64F_N(rn) | A64F_S9(ofs & 0x1ff);
}

/* -- Emit loads/stores --------------------------------------------------- */

/* Prefer rematerialization of BASE/L from global_State over spills. */
#define emit_canremat(ref)	((ref) <= ASMREF_L)

/* Try to find an N-step delta relative to other consts with N < lim. */
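/* E.g. two pointer constants into the same GC object typically differ by
** a small delta that fits a single ADD/SUB K12 immediate.
*/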
static int emit_kdelta(ASMState *as, Reg rd, uint64_t k, int lim)
{
  RegSet work = (~as->freeset & RSET_GPR) | RID2RSET(RID_GL);
  if (lim <= 1) return 0;  /* Can't beat that. */
  while (work) {
    Reg r = rset_picktop(work);
    IRRef ref = regcost_ref(as->cost[r]);
    lj_assertA(r != rd, "dest reg %d not free", rd);
    if (ref < REF_TRUE) {
      uint64_t kx = ra_iskref(ref) ? (uint64_t)ra_krefk(as, ref) :
				     get_k64val(as, ref);
      int64_t delta = (int64_t)(k - kx);
      if (delta == 0) {
	emit_dm(as, A64I_MOVx, rd, r);
	return 1;
      } else {
	uint32_t k12 = emit_isk12(delta < 0 ? -delta : delta);
	if (k12) {
	  emit_dn(as, (delta < 0 ? A64I_SUBx : A64I_ADDx)^k12, rd, r);
	  return 1;
	}
	/* Do other ops or multi-step deltas pay off? Probably not.
	** E.g. XOR rarely helps with pointer consts.
	*/
      }
    }
    rset_clear(work, r);
  }
  return 0;  /* Failed. */
}

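/* Load a 32 or 64 bit constant into a GPR. Uses whichever needs the
** fewest instructions: a single ORR of a K13 logical immediate, a
** MOV/ADD/SUB delta from a register already holding a constant, or
** MOVN/MOVZ for the lowest relevant fragment plus one MOVK for each
** remaining 16 bit fragment that differs.
*/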
static void emit_loadk(ASMState *as, Reg rd, uint64_t u64, int is64)
{
  int i, zeros = 0, ones = 0, neg;
  if (!is64) u64 = (int64_t)(int32_t)u64;  /* Sign-extend. */
  /* Count homogeneous 16 bit fragments. */
  for (i = 0; i < 4; i++) {
    uint64_t frag = (u64 >> i*16) & 0xffff;
    zeros += (frag == 0);
    ones += (frag == 0xffff);
  }
  neg = ones > zeros;  /* Use MOVN if it pays off. */
  if ((neg ? ones : zeros) < 3) {  /* Need 2+ ins. Try shorter K13 encoding. */
    uint32_t k13 = emit_isk13(u64, is64);
    if (k13) {
      emit_dn(as, (is64|A64I_ORRw)^k13, rd, RID_ZERO);
      return;
    }
  }
  if (!emit_kdelta(as, rd, u64, 4 - (neg ? ones : zeros))) {
    int shift = 0, lshift = 0;
    uint64_t n64 = neg ? ~u64 : u64;
    if (n64 != 0) {
      /* Find first/last fragment to be filled. */
      shift = (63-emit_clz64(n64)) & ~15;
      lshift = emit_ctz64(n64) & ~15;
    }
    /* MOVK requires the original value (u64). */
    while (shift > lshift) {
      uint32_t u16 = (u64 >> shift) & 0xffff;
      /* Skip fragments that are correctly filled by MOVN/MOVZ. */
      if (u16 != (neg ? 0xffff : 0))
	emit_d(as, is64 | A64I_MOVKw | A64F_U16(u16) | A64F_LSL16(shift), rd);
      shift -= 16;
    }
    /* But MOVN needs an inverted value (n64). */
    emit_d(as, (neg ? A64I_MOVNx : A64I_MOVZx) |
	       A64F_U16((n64 >> lshift) & 0xffff) | A64F_LSL16(lshift), rd);
  }
}

/* Load a 32 bit constant into a GPR. */
#define emit_loadi(as, rd, i)	emit_loadk(as, rd, i, 0)

/* Load a 64 bit constant into a GPR. */
#define emit_loadu64(as, rd, i)	emit_loadk(as, rd, i, A64I_X)

#define emit_loada(as, r, addr)	emit_loadu64(as, (r), (uintptr_t)(addr))

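/* Offsets relative to global_State resp. to the next instruction slot.
** The latter feeds PC-relative LDR (literal), which has a 19-bit signed
** word offset, hence the checkmcpofs() range check.
*/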
#define glofs(as, k) \
  ((intptr_t)((uintptr_t)(k) - (uintptr_t)&J2GG(as->J)->g))
#define mcpofs(as, k) \
  ((intptr_t)((uintptr_t)(k) - (uintptr_t)(as->mcp - 1)))
#define checkmcpofs(as, k) \
  (A64F_S_OK(mcpofs(as, k)>>2, 19))

static Reg ra_allock(ASMState *as, intptr_t k, RegSet allow);

/* Get/set from constant pointer. */
static void emit_lsptr(ASMState *as, A64Ins ai, Reg r, void *p)
{
  /* First, check if ip + offset is in range. */
  if ((ai & 0x00400000) && checkmcpofs(as, p)) {
    emit_d(as, A64I_LDRLx | A64F_S19(mcpofs(as, p)>>2), r);
  } else {
    Reg base = RID_GL;  /* Next, try GL + offset. */
    int64_t ofs = glofs(as, p);
    if (!emit_checkofs(ai, ofs)) {  /* Else split up into base reg + offset. */
      int64_t i64 = i64ptr(p);
      base = ra_allock(as, (i64 & ~0x7fffull), rset_exclude(RSET_GPR, r));
      ofs = i64 & 0x7fffull;
    }
    emit_lso(as, ai, r, base, ofs);
  }
}

/* Load 64 bit IR constant into register. */
static void emit_loadk64(ASMState *as, Reg r, IRIns *ir)
{
  const uint64_t *k = &ir_k64(ir)->u64;
  int64_t ofs;
  if (r >= RID_MAX_GPR) {
    uint32_t fpk = emit_isfpk64(*k);
    if (fpk != ~0u) {
      emit_d(as, A64I_FMOV_DI | A64F_FP8(fpk), (r & 31));
      return;
    }
  }
  ofs = glofs(as, k);
  if (emit_checkofs(A64I_LDRx, ofs)) {
    emit_lso(as, r >= RID_MAX_GPR ? A64I_LDRd : A64I_LDRx,
	     (r & 31), RID_GL, ofs);
  } else {
    if (r >= RID_MAX_GPR) {
      emit_dn(as, A64I_FMOV_D_R, (r & 31), RID_TMP);
      r = RID_TMP;
    }
    if (checkmcpofs(as, k))
      emit_d(as, A64I_LDRLx | A64F_S19(mcpofs(as, k)>>2), r);
    else
      emit_loadu64(as, r, *k);
  }
}

/* Get/set global_State fields. */
#define emit_getgl(as, r, field) \
  emit_lsptr(as, A64I_LDRx, (r), (void *)&J2G(as->J)->field)
#define emit_setgl(as, r, field) \
  emit_lsptr(as, A64I_STRx, (r), (void *)&J2G(as->J)->field)

/* Trace number is determined from pc of exit instruction. */
#define emit_setvmstate(as, i)	UNUSED(i)

/* -- Emit control-flow instructions -------------------------------------- */

/* Label for internal jumps. */
typedef MCode *MCLabel;

/* Return label pointing to current PC. */
#define emit_label(as)		((as)->mcp)

static void emit_cond_branch(ASMState *as, A64CC cond, MCode *target)
{
  MCode *p = --as->mcp;
  ptrdiff_t delta = target - p;
  lj_assertA(A64F_S_OK(delta, 19), "branch target out of range");
  *p = A64I_BCC | A64F_S19(delta) | cond;
}

static void emit_branch(ASMState *as, A64Ins ai, MCode *target)
{
  MCode *p = --as->mcp;
  ptrdiff_t delta = target - p;
  lj_assertA(A64F_S_OK(delta, 26), "branch target out of range");
  *p = ai | A64F_S26(delta);
}

static void emit_tnb(ASMState *as, A64Ins ai, Reg r, uint32_t bit, MCode *target)
{
  MCode *p = --as->mcp;
  ptrdiff_t delta = target - p;
  lj_assertA(bit < 63, "bit number out of range");
  lj_assertA(A64F_S_OK(delta, 14), "branch target out of range");
  if (bit > 31) ai |= A64I_X;
  *p = ai | A64F_BIT(bit & 31) | A64F_S14(delta) | r;
}

static void emit_cnb(ASMState *as, A64Ins ai, Reg r, MCode *target)
{
  MCode *p = --as->mcp;
  ptrdiff_t delta = target - p;
  lj_assertA(A64F_S_OK(delta, 19), "branch target out of range");
  *p = ai | A64F_S19(delta) | r;
}

#define emit_jmp(as, target)	emit_branch(as, A64I_B, (target))

static void emit_call(ASMState *as, void *target)
{
  MCode *p = --as->mcp;
  ptrdiff_t delta = (char *)target - (char *)p;
  if (A64F_S_OK(delta>>2, 26)) {
    *p = A64I_BL | A64F_S26(delta>>2);
  } else {
    /* Target out of range: need an indirect call. But don't use R0-R7,
    ** which may hold the outgoing call arguments.
    */
    Reg r = ra_allock(as, i64ptr(target),
		      RSET_RANGE(RID_X8, RID_MAX_GPR)-RSET_FIXED);
    *p = A64I_BLR | A64F_N(r);
  }
}

/* -- Emit generic operations --------------------------------------------- */

/* Generic move between two regs. */
static void emit_movrr(ASMState *as, IRIns *ir, Reg dst, Reg src)
{
  if (dst >= RID_MAX_GPR) {
    emit_dn(as, irt_isnum(ir->t) ? A64I_FMOV_D : A64I_FMOV_S,
	    (dst & 31), (src & 31));
    return;
  }
  if (as->mcp != as->mcloop) {  /* Swap early registers for loads/stores. */
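    /* The instruction at *as->mcp follows this move in program order. If
    ** it's a 32/64-bit LDR/STR that already uses dst, rename dst to src
    ** in its base register (and in its data register for stores), so the
    ** access no longer depends on the result of the move.
    */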
    MCode ins = *as->mcp, swp = (src^dst);
    if ((ins & 0xbf800000) == 0xb9000000) {
      if (!((ins ^ (dst << 5)) & 0x000003e0))
	*as->mcp = ins ^ (swp << 5);  /* Swap N in load/store. */
      if (!(ins & 0x00400000) && !((ins ^ dst) & 0x0000001f))
	*as->mcp = ins ^ swp;  /* Swap D in store. */
    }
  }
  emit_dm(as, A64I_MOVx, dst, src);
}

/* Generic load of register with base and (small) offset address. */
static void emit_loadofs(ASMState *as, IRIns *ir, Reg r, Reg base, int32_t ofs)
{
  if (r >= RID_MAX_GPR)
    emit_lso(as, irt_isnum(ir->t) ? A64I_LDRd : A64I_LDRs, (r & 31), base, ofs);
  else
    emit_lso(as, irt_is64(ir->t) ? A64I_LDRx : A64I_LDRw, r, base, ofs);
}

/* Generic store of register with base and (small) offset address. */
static void emit_storeofs(ASMState *as, IRIns *ir, Reg r, Reg base, int32_t ofs)
{
  if (r >= RID_MAX_GPR)
    emit_lso(as, irt_isnum(ir->t) ? A64I_STRd : A64I_STRs, (r & 31), base, ofs);
  else
    emit_lso(as, irt_is64(ir->t) ? A64I_STRx : A64I_STRw, r, base, ofs);
}

/* Emit an arithmetic operation with a constant operand. */
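/* The K12 encoding from emit_isk12() is XORed into the base opcode, which
** also flips ADD to SUB (or vice versa) for a negated constant.
*/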
static void emit_opk(ASMState *as, A64Ins ai, Reg dest, Reg src,
		     int32_t i, RegSet allow)
{
  uint32_t k = emit_isk12(i);
  if (k)
    emit_dn(as, ai^k, dest, src);
  else
    emit_dnm(as, ai, dest, src, ra_allock(as, i, allow));
}

/* Add offset to pointer. */
static void emit_addptr(ASMState *as, Reg r, int32_t ofs)
{
  if (ofs)
    emit_opk(as, ofs < 0 ? A64I_SUBx : A64I_ADDx, r, r,
		 ofs < 0 ? -ofs : ofs, rset_exclude(RSET_GPR, r));
}

#define emit_spsub(as, ofs)	emit_addptr(as, RID_SP, -(ofs))