1|// Low-level VM code for x64 CPUs in LJ_GC64 mode.
2|// Bytecode interpreter, fast functions and helper functions.
3|// Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
4|
5|.arch x64
6|.section code_op, code_sub
7|
8|.actionlist build_actionlist
9|.globals GLOB_
10|.globalnames globnames
11|.externnames extnames
12|
13|//-----------------------------------------------------------------------
14|
15|.if WIN	// WIN presumably comes from the build configuration -- TODO confirm.
16|.define X64WIN, 1			// Windows/x64 calling conventions.
17|.endif
18|
19|// Fixed register assignments for the interpreter.
20|// This is very fragile and has many dependencies. Caveat emptor.
21|.define BASE,		rdx		// Not C callee-save, refetched anyway.
22|.if X64WIN	// Windows: rdi/rsi are not C argument registers in this ABI, so they hold VM state.
23|.define KBASE,		rdi		// Must be C callee-save.
24|.define PC,		rsi		// Must be C callee-save.
25|.define DISPATCH,	rbx		// Must be C callee-save.
26|.define KBASEd,	edi	// 32 bit views of KBASE/PC/DISPATCH follow.
27|.define PCd,		esi
28|.define DISPATCHd,	ebx
29|.else	// POSIX: rdi/rsi are CARG1/CARG2 in this file, so r15/rbx/r14 are used instead.
30|.define KBASE,		r15		// Must be C callee-save.
31|.define PC,		rbx		// Must be C callee-save.
32|.define DISPATCH,	r14		// Must be C callee-save.
33|.define KBASEd,	r15d	// 32 bit views of KBASE/PC/DISPATCH follow.
34|.define PCd,		ebx
35|.define DISPATCHd,	r14d
36|.endif
37|
38|.define RA,		rcx	// Operand A. RAd/RAH/RAL are the 32 bit, bits 8-15 and bits 0-7 views of rcx.
39|.define RAd,		ecx
40|.define RAH,		ch
41|.define RAL,		cl
42|.define RB,		rbp		// Must be rbp (C callee-save).
43|.define RBd,		ebp
44|.define RC,		rax		// Must be rax.
45|.define RCd,		eax
46|.define RCW,		ax
47|.define RCH,		ah
48|.define RCL,		al
49|.define OP,		RBd	// The opcode shares its register with RB (see ins_NEXT).
50|.define RD,		RC	// Operand D is an alias of RC (rax); the RD* names below alias the RC* views.
51|.define RDd,		RCd
52|.define RDW,		RCW
53|.define RDL,		RCL
54|.define TMPR,		r10	// Temporary register (e.g. used in vmeta_arith_nv).
55|.define TMPRd,		r10d
56|.define ITYPE,		r11	// Receives the type tag in the checktp*/checknumx macros; clobbered by 2-operand settp.
57|.define ITYPEd,	r11d
58|
59|.if X64WIN	// Note the overlaps with VM registers: here CARG1 == RA (rcx) and CARG2 == BASE (rdx).
60|.define CARG1,		rcx		// x64/WIN64 C call arguments.
61|.define CARG2,		rdx
62|.define CARG3,		r8
63|.define CARG4,		r9
64|.define CARG1d,	ecx	// 32 bit views of CARG1-CARG4 follow.
65|.define CARG2d,	edx
66|.define CARG3d,	r8d
67|.define CARG4d,	r9d
68|.else	// Note the overlaps with VM registers: here CARG3 == BASE (rdx) and CARG4 == RA (rcx).
69|.define CARG1,		rdi		// x64/POSIX C call arguments.
70|.define CARG2,		rsi
71|.define CARG3,		rdx
72|.define CARG4,		rcx
73|.define CARG5,		r8	// CARG5/CARG6 exist only in the POSIX branch; Windows uses the ARG5 stack slot.
74|.define CARG6,		r9
75|.define CARG1d,	edi	// 32 bit views of CARG1-CARG6 follow.
76|.define CARG2d,	esi
77|.define CARG3d,	edx
78|.define CARG4d,	ecx
79|.define CARG5d,	r8d
80|.define CARG6d,	r9d
81|.endif
82|
83|// Type definitions. Some of these are only used for documentation. Usage is NAME:reg or NAME:reg->field.
84|.type L,		lua_State	// e.g. L:RB->base, L:RB->top, L:RB->cframe.
85|.type GL,		global_State	// e.g. GL:RB->vmstate.
86|.type TVALUE,		TValue
87|.type GCOBJ,		GCobj
88|.type STR,		GCstr
89|.type TAB,		GCtab
90|.type LFUNC,		GCfuncL	// e.g. LFUNC:RB->pc.
91|.type CFUNC,		GCfuncC
92|.type PROTO,		GCproto
93|.type UPVAL,		GCupval
94|.type NODE,		Node
95|.type NARGS,		int	// Annotates registers holding nargs+1, e.g. NARGS:RDd.
96|.type TRACE,		GCtrace
97|.type SBUF,		SBuf
98|
99|// Stack layout while in interpreter. Must match with lj_frame.h.
100|//-----------------------------------------------------------------------
101|.if X64WIN		// x64/Windows stack layout
102|
103|.define CFRAME_SPACE,	aword*5			// Delta for rsp (see <--).
104|.macro saveregs_	// Partial prologue: assumes rbp was already pushed by the caller of this macro.
105|  push rdi; push rsi; push rbx	// These end up in the SAVE_R3, SAVE_R2 and SAVE_R1 slots.
106|  sub rsp, CFRAME_SPACE	// Reserve the five slots below the saved registers.
107|.endmacro
108|.macro saveregs	// Full prologue: rbp goes into the SAVE_R4 slot, then saveregs_ does the rest.
109|  push rbp; saveregs_
110|.endmacro
111|.macro restoreregs	// Epilogue: exact mirror of saveregs.
112|  add rsp, CFRAME_SPACE
113|  pop rbx; pop rsi; pop rdi; pop rbp	// Reverse order of the pushes above.
114|.endmacro
115|
116|.define SAVE_CFRAME,	aword [rsp+aword*13]	// Previous C frame pointer; reloaded at vm_leave_cp.
117|.define SAVE_PC,	aword [rsp+aword*12]	// PC is stored here before calls into C helpers.
118|.define SAVE_L,	aword [rsp+aword*11]	// The lua_State pointer passed to the entry point.
119|.define SAVE_ERRF,	dword [rsp+dword*21]	// Upper half of the 8 byte slot at aword*10.
120|.define SAVE_NRES,	dword [rsp+dword*20]	// Lower half of the 8 byte slot at aword*10.
121|//----- 16 byte aligned, ^^^ 32 byte register save area, owned by interpreter
122|.define SAVE_RET,	aword [rsp+aword*9]	//<-- rsp entering interpreter.
123|.define SAVE_R4,	aword [rsp+aword*8]	// rbp (pushed first by saveregs).
124|.define SAVE_R3,	aword [rsp+aword*7]	// rdi.
125|.define SAVE_R2,	aword [rsp+aword*6]	// rsi.
126|.define SAVE_R1,	aword [rsp+aword*5]	//<-- rsp after register saves. Holds rbx.
127|.define ARG5,		aword [rsp+aword*4]	// Stack slot for a fifth C call argument (see vmeta_arith_vv).
128|.define CSAVE_4,	aword [rsp+aword*3]
129|.define CSAVE_3,	aword [rsp+aword*2]
130|.define CSAVE_2,	aword [rsp+aword*1]
131|.define CSAVE_1,	aword [rsp]		//<-- rsp while in interpreter.
132|//----- 16 byte aligned, ^^^ 32 byte register save area, owned by callee
133|
134|.define ARG5d,		dword [rsp+dword*8]	// Same address as ARG5 (dword*8 == aword*4), accessed as 32 bits.
135|.define TMP1,		ARG5			// TMP1 overlaps ARG5
136|.define TMP1d,		ARG5d
137|.define TMP1hi,	dword [rsp+dword*9]	// Upper 32 bits of TMP1.
138|.define MULTRES,	TMP1d			// MULTRES overlaps TMP1d.
139|
140|//-----------------------------------------------------------------------
141|.else			// x64/POSIX stack layout
142|
143|.define CFRAME_SPACE,	aword*5			// Delta for rsp (see <--).
144|.macro saveregs_	// Partial prologue: assumes rbp was already pushed by the caller of this macro.
145|  push rbx; push r15; push r14	// These end up in the SAVE_R3, SAVE_R2 and SAVE_R1 slots.
146|.if NO_UNWIND	// NO_UNWIND builds additionally save r13/r12 (slots SAVE_RU2/SAVE_RU1).
147|  push r13; push r12
148|.endif
149|  sub rsp, CFRAME_SPACE	// Reserve the five slots below the saved registers.
150|.endmacro
151|.macro saveregs	// Full prologue: rbp goes into the SAVE_R4 slot, then saveregs_ does the rest.
152|  push rbp; saveregs_
153|.endmacro
154|.macro restoreregs	// Epilogue: exact mirror of saveregs.
155|  add rsp, CFRAME_SPACE
156|.if NO_UNWIND	// Must match the conditional pushes in saveregs_.
157|  pop r12; pop r13
158|.endif
159|  pop r14; pop r15; pop rbx; pop rbp	// Reverse order of the pushes above.
160|.endmacro
161|
162|//----- 16 byte aligned,
163|.if NO_UNWIND	// Two extra saved registers shift the upper slots by two awords.
164|.define SAVE_RET,	aword [rsp+aword*11]	//<-- rsp entering interpreter.
165|.define SAVE_R4,	aword [rsp+aword*10]	// rbp (pushed first by saveregs).
166|.define SAVE_R3,	aword [rsp+aword*9]	// rbx.
167|.define SAVE_R2,	aword [rsp+aword*8]	// r15.
168|.define SAVE_R1,	aword [rsp+aword*7]	// r14.
169|.define SAVE_RU2,	aword [rsp+aword*6]	// r13.
170|.define SAVE_RU1,	aword [rsp+aword*5]	//<-- rsp after register saves. Holds r12.
171|.else
172|.define SAVE_RET,	aword [rsp+aword*9]	//<-- rsp entering interpreter.
173|.define SAVE_R4,	aword [rsp+aword*8]	// rbp (pushed first by saveregs).
174|.define SAVE_R3,	aword [rsp+aword*7]	// rbx.
175|.define SAVE_R2,	aword [rsp+aword*6]	// r15.
176|.define SAVE_R1,	aword [rsp+aword*5]	//<-- rsp after register saves. Holds r14.
177|.endif
178|.define SAVE_CFRAME,	aword [rsp+aword*4]	// Previous C frame pointer; reloaded at vm_leave_cp.
179|.define SAVE_PC,	aword [rsp+aword*3]	// PC is stored here before calls into C helpers.
180|.define SAVE_L,	aword [rsp+aword*2]	// The lua_State pointer passed to the entry point.
181|.define SAVE_ERRF,	dword [rsp+dword*3]	// Upper half of the 8 byte slot at aword*1.
182|.define SAVE_NRES,	dword [rsp+dword*2]	// Lower half of the 8 byte slot at aword*1.
183|.define TMP1,		aword [rsp]		//<-- rsp while in interpreter.
184|//----- 16 byte aligned
185|
186|.define TMP1d,		dword [rsp]	// Lower 32 bits of TMP1.
187|.define TMP1hi,	dword [rsp+dword*1]	// Upper 32 bits of TMP1.
188|.define MULTRES,	TMP1d			// MULTRES overlaps TMP1d.
189|
190|.endif
191|
192|//-----------------------------------------------------------------------
193|
194|// Instruction headers. On entry RC holds the upper 16 instruction bits as left by ins_NEXT (RA is already decoded).
195|.macro ins_A; .endmacro	// Nothing left to decode.
196|.macro ins_AD; .endmacro	// RD (alias of RC) already holds the 16 bit D operand.
197|.macro ins_AJ; .endmacro	// Same as ins_AD; no extra decoding emitted.
198|.macro ins_ABC; movzx RBd, RCH; movzx RCd, RCL; .endmacro	// Split RC: RB = bits 8-15, RC = bits 0-7.
199|.macro ins_AB_; movzx RBd, RCH; .endmacro	// Only extract RB; RC is left unsplit.
200|.macro ins_A_C; movzx RCd, RCL; .endmacro	// Only extract RC (low byte).
201|.macro ins_AND; not RD; .endmacro	// Bitwise complement of the D operand (full 64 bit register).
202|
203|// Instruction decode+dispatch. Carefully tuned (nope, lodsd is not faster).
204|.macro ins_NEXT	// Out: OP = opcode, RA = operand A, RC/RD = upper 16 bits, PC = next instruction.
205|  mov RCd, [PC]	// Fetch the 32 bit instruction word.
206|  movzx RAd, RCH	// RA = byte 1 of the instruction.
207|  movzx OP, RCL	// OP = byte 0 of the instruction. Note: OP is RBd, so RB is overwritten.
208|  add PC, 4	// PC now points past the fetched instruction.
209|  shr RCd, 16	// RC = bytes 2-3; the ins_* header macros split this further if needed.
210|  jmp aword [DISPATCH+OP*8]	// Indirect jump through the table of 8 byte entries at DISPATCH.
211|.endmacro
212|
213|// Instruction footer. ins_next ends an opcode handler; ins_next_ is the variant that may emit the shared label.
214|.if 1	// Constant condition: the replicated variant is always selected, the .else branch is inactive.
215|  // Replicated dispatch. Less unpredictable branches, but higher I-Cache use.
216|  .define ins_next, ins_NEXT
217|  .define ins_next_, ins_NEXT
218|.else
219|  // Common dispatch. Lower I-Cache use, only one (very) unpredictable branch.
220|  // Affects only certain kinds of benchmarks (and only with -j off).
221|  // Around 10%-30% slower on Core2, a lot slower on P4.
222|  .macro ins_next	// Jump to the single shared dispatch sequence.
223|    jmp ->ins_next
224|  .endmacro
225|  .macro ins_next_	// Emits the shared ->ins_next label together with the dispatch sequence.
226|  ->ins_next:
227|    ins_NEXT
228|  .endmacro
229|.endif
230|
231|// Call decode and dispatch. Unlike ins_NEXT, RC/RD is not touched, so RD = nargs+1 is preserved.
232|.macro ins_callt
233|  // BASE = new base, RB = LFUNC, RD = nargs+1, [BASE-8] = PC
234|  mov PC, LFUNC:RB->pc	// PC = the pc field of the called function.
235|  mov RAd, [PC]	// Fetch the first instruction into RA (not RC).
236|  movzx OP, RAL	// OP = byte 0. OP is RBd, so the LFUNC pointer in RB is overwritten here.
237|  movzx RAd, RAH	// RA = byte 1 (operand A).
238|  add PC, 4	// PC now points past the first instruction.
239|  jmp aword [DISPATCH+OP*8]	// Dispatch through the table at DISPATCH.
240|.endmacro
241|
242|.macro ins_call
243|  // BASE = new base, RB = LFUNC, RD = nargs+1
244|  mov [BASE-8], PC	// Store the caller's PC in the frame slot below BASE, then dispatch like ins_callt.
245|  ins_callt
246|.endmacro
247|
248|//-----------------------------------------------------------------------
249|
250|// Macros to clear or set tags. The tag occupies the top 17 bits (bit 47 and up) of a 64 bit value.
251|.macro cleartp, reg; shl reg, 17; shr reg, 17; .endmacro	// Zero the top 17 bits, keep the low 47 bits.
252|.macro settp, reg, tp	// In-place form: reg |= tp<<47. Clobbers ITYPE.
253|  mov64 ITYPE, ((uint64_t)tp<<47)
254|  or reg, ITYPE
255|.endmacro
256|.macro settp, dst, reg, tp	// Three-operand form: dst = (tp<<47) | reg. Leaves reg and ITYPE untouched.
257|  mov64 dst, ((uint64_t)tp<<47)
258|  or dst, reg
259|.endmacro
260|.macro setint, reg	// settp with the LJ_TISNUM tag, in place (clobbers ITYPE).
261|  settp reg, LJ_TISNUM
262|.endmacro
263|.macro setint, dst, reg	// settp with the LJ_TISNUM tag, three-operand form.
264|  settp dst, reg, LJ_TISNUM
265|.endmacro
266|
267|// Macros to test operand types. All of them clobber ITYPE and the flags.
268|.macro checktp_nc, reg, tp, target	// Branch to target if the tag of reg != tp. reg itself is not modified.
269|  mov ITYPE, reg
270|  sar ITYPE, 47	// Arithmetic shift: ITYPE = sign-extended tag.
271|  cmp ITYPEd, tp
272|  jne target
273|.endmacro
274|.macro checktp, reg, tp, target	// Like checktp_nc, but also strips the tag from reg.
275|  mov ITYPE, reg
276|  cleartp reg	// Done before the branch, so reg is untagged even when the check fails.
277|  sar ITYPE, 47
278|  cmp ITYPEd, tp
279|  jne target
280|.endmacro
281|.macro checktptp, src, tp, target	// Same instruction sequence as checktp_nc; src is only read.
282|  mov ITYPE, src
283|  sar ITYPE, 47
284|  cmp ITYPEd, tp
285|  jne target
286|.endmacro
287|.macro checkstr, reg, target; checktp reg, LJ_TSTR, target; .endmacro	// checktp against LJ_TSTR.
288|.macro checktab, reg, target; checktp reg, LJ_TTAB, target; .endmacro	// checktp against LJ_TTAB.
289|.macro checkfunc, reg, target; checktp reg, LJ_TFUNC, target; .endmacro	// checktp against LJ_TFUNC.
290|
291|.macro checknumx, reg, target, jump	// Compare the tag of reg with LJ_TISNUM, then emit "jump target". Clobbers ITYPE.
292|  mov ITYPE, reg
293|  sar ITYPE, 47	// ITYPE = sign-extended tag; reg itself is not modified.
294|  cmp ITYPEd, LJ_TISNUM
295|  jump target	// The condition code is supplied by the caller of the macro.
296|.endmacro
297|.macro checkint, reg, target; checknumx reg, target, jne; .endmacro	// Branch if tag != LJ_TISNUM.
298|.macro checkinttp, src, target; checknumx src, target, jne; .endmacro	// Same expansion as checkint.
299|.macro checknum, reg, target; checknumx reg, target, jae; .endmacro	// Branch if tag >= LJ_TISNUM (unsigned compare).
300|.macro checknumtp, src, target; checknumx src, target, jae; .endmacro	// Same expansion as checknum.
301|.macro checknumber, src, target; checknumx src, target, ja; .endmacro	// Branch if tag > LJ_TISNUM (unsigned compare).
302|
303|.macro mov_false, reg; mov64 reg, (int64_t)~((uint64_t)1<<47); .endmacro	// Load the constant with all bits set except bit 47.
304|.macro mov_true, reg; mov64 reg, (int64_t)~((uint64_t)2<<47); .endmacro	// Load the constant with all bits set except bit 48.
305|
306|// These operands must be used with movzx. They address fields of the instruction just before PC.
307|.define PC_OP, byte [PC-4]	// Byte 0: opcode.
308|.define PC_RA, byte [PC-3]	// Byte 1: operand A.
309|.define PC_RB, byte [PC-1]	// Byte 3: operand B.
310|.define PC_RC, byte [PC-2]	// Byte 2: operand C.
311|.define PC_RD, word [PC-2]	// Bytes 2-3: operand D (overlaps C and B).
312|
313|.macro branchPC, reg	// PC += (reg - BCBIAS_J) * 4. Uses lea, so the flags are preserved.
314|  lea PC, [PC+reg*4-BCBIAS_J*4]
315|.endmacro
316|
317|// Offsets from DISPATCH to fields of global_State / jit_State. Assumes DISPATCH is relative to GL.
318#define DISPATCH_GL(field)	(GG_DISP2G + (int)offsetof(global_State, field))
319#define DISPATCH_J(field)	(GG_DISP2J + (int)offsetof(jit_State, field))
320|// PC2PROTO: offset of a GCproto field minus sizeof(GCproto); applied to a bytecode pointer such as func->pc or PC-4.
321#define PC2PROTO(field)  ((int)offsetof(GCproto, field)-(int)sizeof(GCproto))
322|
323|// Decrement hashed hotcount and trigger trace recorder when the subtraction borrows (counter underflow).
324|.macro hotloop, reg	// reg must be a 32 bit register (it receives PCd); it is clobbered.
325|  mov reg, PCd
326|  shr reg, 1
327|  and reg, HOTCOUNT_PCMASK	// reg = (PC >> 1) & mask: byte offset of the hashed counter.
328|  sub word [DISPATCH+reg+GG_DISP2HOT], HOTCOUNT_LOOP	// 16 bit counter in the table at DISPATCH+GG_DISP2HOT.
329|  jb ->vm_hotloop	// Taken on carry, i.e. the counter was below HOTCOUNT_LOOP.
330|.endmacro
331|
332|.macro hotcall, reg	// Same as hotloop, but subtracts HOTCOUNT_CALL and branches to vm_hotcall.
333|  mov reg, PCd
334|  shr reg, 1
335|  and reg, HOTCOUNT_PCMASK
336|  sub word [DISPATCH+reg+GG_DISP2HOT], HOTCOUNT_CALL
337|  jb ->vm_hotcall
338|.endmacro
339|
340|// Set current VM state. st is the suffix of an LJ_VMST_* constant, e.g. "set_vmstate INTERP".
341|.macro set_vmstate, st
342|  mov dword [DISPATCH+DISPATCH_GL(vmstate)], ~LJ_VMST_..st	// Stores the complement of LJ_VMST_<st> in g->vmstate.
343|.endmacro
344|
345|.macro fpop1; fstp st1; .endmacro	// Copy st0 into st1 and pop the x87 stack: the old st1 is discarded.
346|
347|// Synthesize SSE FP constants. reg receives the constant; tmp is a scratch register that gets clobbered.
348|.macro sseconst_abs, reg, tmp		// Synthesize abs mask.
349|  mov64 tmp, U64x(7fffffff,ffffffff); movd reg, tmp	// All bits set except the sign bit.
350|.endmacro
351|
352|.macro sseconst_hi, reg, tmp, val	// Synthesize hi-32 bit const.
353|  mov64 tmp, U64x(val,00000000); movd reg, tmp	// val (hex digits) forms the upper 32 bits, the lower 32 are zero.
354|.endmacro
355|
356|.macro sseconst_sign, reg, tmp		// Synthesize sign mask.
357|  sseconst_hi reg, tmp, 80000000
358|.endmacro
359|.macro sseconst_1, reg, tmp		// Synthesize 1.0.
360|  sseconst_hi reg, tmp, 3ff00000
361|.endmacro
362|.macro sseconst_m1, reg, tmp		// Synthesize -1.0.
363|  sseconst_hi reg, tmp, bff00000
364|.endmacro
365|.macro sseconst_2p52, reg, tmp		// Synthesize 2^52.
366|  sseconst_hi reg, tmp, 43300000
367|.endmacro
368|.macro sseconst_tobit, reg, tmp	// Synthesize 2^52 + 2^51.
369|  sseconst_hi reg, tmp, 43380000
370|.endmacro
371|
372|// Move table write barrier back. Overwrites reg. Pushes tab onto the head of the g->gc.grayagain list.
373|.macro barrierback, tab, reg
374|  and byte tab->marked, (uint8_t)~LJ_GC_BLACK	// black2gray(tab)
375|  mov reg, [DISPATCH+DISPATCH_GL(gc.grayagain)]	// reg = old list head.
376|  mov [DISPATCH+DISPATCH_GL(gc.grayagain)], tab	// tab becomes the new list head.
377|  mov tab->gclist, reg	// Link tab to the old head.
378|.endmacro
379|
380|//-----------------------------------------------------------------------
381
382/* Generate subroutines used by opcodes and other parts of the VM. */
383/* The .code_sub section should be last to help static branch prediction. */
384static void build_subroutines(BuildCtx *ctx)
385{
386  |.code_sub
387  |
388  |//-----------------------------------------------------------------------
389  |//-- Return handling ----------------------------------------------------
390  |//-----------------------------------------------------------------------
391  |
392  |->vm_returnp:
393  |  test PCd, FRAME_P
394  |  jz ->cont_dispatch
395  |
396  |  // Return from pcall or xpcall fast func.
397  |  and PC, -8
398  |  sub BASE, PC			// Restore caller base.
399  |  lea RA, [RA+PC-8]			// Rebase RA and prepend one result.
400  |  mov PC, [BASE-8]			// Fetch PC of previous frame.
401  |  // Prepending may overwrite the pcall frame, so do it at the end.
402  |  mov_true ITYPE
403  |  mov aword [BASE+RA], ITYPE		// Prepend true to results.
404  |
405  |->vm_returnc:
406  |  add RDd, 1				// RD = nresults+1
407  |  jz ->vm_unwind_yield
408  |  mov MULTRES, RDd
409  |  test PC, FRAME_TYPE
410  |  jz ->BC_RET_Z			// Handle regular return to Lua.
411  |
412  |->vm_return:
413  |  // BASE = base, RA = resultofs, RD = nresults+1 (= MULTRES), PC = return
414  |  xor PC, FRAME_C
415  |  test PCd, FRAME_TYPE
416  |  jnz ->vm_returnp
417  |
418  |  // Return to C.
419  |  set_vmstate C
420  |  and PC, -8
421  |  sub PC, BASE
422  |  neg PC				// Previous base = BASE - delta.
423  |
424  |  sub RDd, 1
425  |  jz >2
426  |1:  // Move results down.
427  |  mov RB, [BASE+RA]
428  |  mov [BASE-16], RB
429  |  add BASE, 8
430  |  sub RDd, 1
431  |  jnz <1
432  |2:
433  |  mov L:RB, SAVE_L
434  |  mov L:RB->base, PC
435  |3:
436  |  mov RDd, MULTRES
437  |  mov RAd, SAVE_NRES			// RA = wanted nresults+1
438  |4:
439  |  cmp RAd, RDd
440  |  jne >6				// More/less results wanted?
441  |5:
442  |  sub BASE, 16
443  |  mov L:RB->top, BASE
444  |
445  |->vm_leave_cp:
446  |  mov RA, SAVE_CFRAME		// Restore previous C frame.
447  |  mov L:RB->cframe, RA
448  |  xor eax, eax			// Ok return status for vm_pcall.
449  |
450  |->vm_leave_unw:
451  |  restoreregs
452  |  ret
453  |
454  |6:
455  |  jb >7				// Less results wanted?
456  |  // More results wanted. Check stack size and fill up results with nil.
457  |  cmp BASE, L:RB->maxstack
458  |  ja >8
459  |  mov aword [BASE-16], LJ_TNIL
460  |  add BASE, 8
461  |  add RDd, 1
462  |  jmp <4
463  |
464  |7:  // Less results wanted.
465  |  test RAd, RAd
466  |  jz <5				// But check for LUA_MULTRET+1.
467  |  sub RA, RD				// Negative result!
468  |  lea BASE, [BASE+RA*8]		// Correct top.
469  |  jmp <5
470  |
471  |8:  // Corner case: need to grow stack for filling up results.
472  |  // This can happen if:
473  |  // - A C function grows the stack (a lot).
474  |  // - The GC shrinks the stack in between.
475  |  // - A return back from a lua_call() with (high) nresults adjustment.
476  |  mov L:RB->top, BASE		// Save current top held in BASE (yes).
477  |  mov MULTRES, RDd			// Need to fill only remainder with nil.
478  |  mov CARG2d, RAd
479  |  mov CARG1, L:RB
480  |  call extern lj_state_growstack	// (lua_State *L, int n)
481  |  mov BASE, L:RB->top		// Need the (realloced) L->top in BASE.
482  |  jmp <3
483  |
484  |->vm_unwind_yield:
485  |  mov al, LUA_YIELD
486  |  jmp ->vm_unwind_c_eh
487  |
488  |->vm_unwind_c:			// Unwind C stack, return from vm_pcall.
489  |  // (void *cframe, int errcode)
490  |  mov eax, CARG2d			// Error return status for vm_pcall.
491  |  mov rsp, CARG1
492  |->vm_unwind_c_eh:			// Landing pad for external unwinder.
493  |  mov L:RB, SAVE_L
494  |  mov GL:RB, L:RB->glref
495  |  mov dword GL:RB->vmstate, ~LJ_VMST_C
496  |  jmp ->vm_leave_unw
497  |
498  |->vm_unwind_rethrow:
499  |.if not X64WIN
500  |  mov CARG1, SAVE_L
501  |  mov CARG2d, eax
502  |  restoreregs
503  |  jmp extern lj_err_throw		// (lua_State *L, int errcode)
504  |.endif
505  |
506  |->vm_unwind_ff:			// Unwind C stack, return from ff pcall.
507  |  // (void *cframe)
508  |  and CARG1, CFRAME_RAWMASK
509  |  mov rsp, CARG1
510  |->vm_unwind_ff_eh:			// Landing pad for external unwinder.
511  |  mov L:RB, SAVE_L
512  |  mov RDd, 1+1			// Really 1+2 results, incr. later.
513  |  mov BASE, L:RB->base
514  |  mov DISPATCH, L:RB->glref		// Setup pointer to dispatch table.
515  |  add DISPATCH, GG_G2DISP
516  |  mov PC, [BASE-8]			// Fetch PC of previous frame.
517  |  mov_false RA
518  |  mov RB, [BASE]
519  |  mov [BASE-16], RA			// Prepend false to error message.
520  |  mov [BASE-8], RB
521  |  mov RA, -16			// Results start at BASE+RA = BASE-16.
522  |  set_vmstate INTERP
523  |  jmp ->vm_returnc			// Increments RD/MULTRES and returns.
524  |
525  |//-----------------------------------------------------------------------
526  |//-- Grow stack for calls -----------------------------------------------
527  |//-----------------------------------------------------------------------
528  |
529  |->vm_growstack_c:			// Grow stack for C function.
530  |  mov CARG2d, LUA_MINSTACK
531  |  jmp >2
532  |
533  |->vm_growstack_v:			// Grow stack for vararg Lua function.
534  |  sub RD, 16				// LJ_FR2
535  |  jmp >1
536  |
537  |->vm_growstack_f:			// Grow stack for fixarg Lua function.
538  |  // BASE = new base, RD = nargs+1, RB = L, PC = first PC
539  |  lea RD, [BASE+NARGS:RD*8-8]
540  |1:
541  |  movzx RAd, byte [PC-4+PC2PROTO(framesize)]
542  |  add PC, 4				// Must point after first instruction.
543  |  mov L:RB->base, BASE
544  |  mov L:RB->top, RD
545  |  mov SAVE_PC, PC
546  |  mov CARG2, RA
547  |2:
548  |  // RB = L, L->base = new base, L->top = top
549  |  mov CARG1, L:RB
550  |  call extern lj_state_growstack	// (lua_State *L, int n)
551  |  mov BASE, L:RB->base
552  |  mov RD, L:RB->top
553  |  mov LFUNC:RB, [BASE-16]
554  |  cleartp LFUNC:RB
555  |  sub RD, BASE
556  |  shr RDd, 3
557  |  add NARGS:RDd, 1
558  |  // BASE = new base, RB = LFUNC, RD = nargs+1
559  |  ins_callt				// Just retry the call.
560  |
561  |//-----------------------------------------------------------------------
562  |//-- Entry points into the assembler VM ---------------------------------
563  |//-----------------------------------------------------------------------
564  |
565  |->vm_resume:				// Setup C frame and resume thread.
566  |  // (lua_State *L, TValue *base, int nres1 = 0, ptrdiff_t ef = 0)
567  |  saveregs
568  |  mov L:RB, CARG1			// Caveat: CARG1 may be RA.
569  |  mov SAVE_L, CARG1
570  |  mov RA, CARG2
571  |  mov PCd, FRAME_CP
572  |  xor RDd, RDd
573  |  lea KBASE, [esp+CFRAME_RESUME]
574  |  mov DISPATCH, L:RB->glref		// Setup pointer to dispatch table.
575  |  add DISPATCH, GG_G2DISP
576  |  mov SAVE_PC, RD			// Any value outside of bytecode is ok.
577  |  mov SAVE_CFRAME, RD
578  |  mov SAVE_NRES, RDd
579  |  mov SAVE_ERRF, RDd
580  |  mov L:RB->cframe, KBASE
581  |  cmp byte L:RB->status, RDL
582  |  je >2				// Initial resume (like a call).
583  |
584  |  // Resume after yield (like a return).
585  |  mov [DISPATCH+DISPATCH_GL(cur_L)], L:RB
586  |  set_vmstate INTERP
587  |  mov byte L:RB->status, RDL
588  |  mov BASE, L:RB->base
589  |  mov RD, L:RB->top
590  |  sub RD, RA
591  |  shr RDd, 3
592  |  add RDd, 1				// RD = nresults+1
593  |  sub RA, BASE			// RA = resultofs
594  |  mov PC, [BASE-8]
595  |  mov MULTRES, RDd
596  |  test PCd, FRAME_TYPE
597  |  jz ->BC_RET_Z
598  |  jmp ->vm_return
599  |
600  |->vm_pcall:				// Setup protected C frame and enter VM.
601  |  // (lua_State *L, TValue *base, int nres1, ptrdiff_t ef)
602  |  saveregs
603  |  mov PCd, FRAME_CP
604  |  mov SAVE_ERRF, CARG4d
605  |  jmp >1
606  |
607  |->vm_call:				// Setup C frame and enter VM.
608  |  // (lua_State *L, TValue *base, int nres1)
609  |  saveregs
610  |  mov PCd, FRAME_C
611  |
612  |1:  // Entry point for vm_pcall above (PC = ftype).
613  |  mov SAVE_NRES, CARG3d
614  |  mov L:RB, CARG1			// Caveat: CARG1 may be RA.
615  |  mov SAVE_L, CARG1
616  |  mov RA, CARG2
617  |
618  |  mov DISPATCH, L:RB->glref		// Setup pointer to dispatch table.
619  |  mov KBASE, L:RB->cframe		// Add our C frame to cframe chain.
620  |  mov SAVE_CFRAME, KBASE
621  |  mov SAVE_PC, L:RB			// Any value outside of bytecode is ok.
622  |  add DISPATCH, GG_G2DISP
623  |  mov L:RB->cframe, rsp
624  |
625  |2:  // Entry point for vm_resume/vm_cpcall (RA = base, RB = L, PC = ftype).
626  |  mov [DISPATCH+DISPATCH_GL(cur_L)], L:RB
627  |  set_vmstate INTERP
628  |  mov BASE, L:RB->base		// BASE = old base (used in vmeta_call).
629  |  add PC, RA
630  |  sub PC, BASE			// PC = frame delta + frame type
631  |
632  |  mov RD, L:RB->top
633  |  sub RD, RA
634  |  shr NARGS:RDd, 3
635  |  add NARGS:RDd, 1			// RD = nargs+1
636  |
637  |->vm_call_dispatch:
638  |  mov LFUNC:RB, [RA-16]
639  |  checkfunc LFUNC:RB, ->vmeta_call	// Ensure KBASE defined and != BASE.
640  |
641  |->vm_call_dispatch_f:
642  |  mov BASE, RA
643  |  ins_call
644  |  // BASE = new base, RB = func, RD = nargs+1, PC = caller PC
645  |
646  |->vm_cpcall:				// Setup protected C frame, call C.
647  |  // (lua_State *L, lua_CFunction func, void *ud, lua_CPFunction cp)
648  |  saveregs
649  |  mov L:RB, CARG1			// Caveat: CARG1 may be RA.
650  |  mov SAVE_L, CARG1
651  |  mov SAVE_PC, L:RB			// Any value outside of bytecode is ok.
652  |
653  |  mov KBASE, L:RB->stack		// Compute -savestack(L, L->top).
654  |  sub KBASE, L:RB->top
655  |   mov DISPATCH, L:RB->glref		// Setup pointer to dispatch table.
656  |  mov SAVE_ERRF, 0			// No error function.
657  |  mov SAVE_NRES, KBASEd		// Neg. delta means cframe w/o frame.
658  |   add DISPATCH, GG_G2DISP
659  |  // Handler may change cframe_nres(L->cframe) or cframe_errfunc(L->cframe).
660  |
661  |  mov KBASE, L:RB->cframe		// Add our C frame to cframe chain.
662  |  mov SAVE_CFRAME, KBASE
663  |  mov L:RB->cframe, rsp
664  |  mov [DISPATCH+DISPATCH_GL(cur_L)], L:RB
665  |
666  |  call CARG4			// (lua_State *L, lua_CFunction func, void *ud)
667  |  // TValue * (new base) or NULL returned in rax (RC).
668  |  test RC, RC
669  |  jz ->vm_leave_cp			// No base? Just remove C frame.
670  |  mov RA, RC
671  |  mov PCd, FRAME_CP
672  |  jmp <2				// Else continue with the call.
673  |
674  |//-----------------------------------------------------------------------
675  |//-- Metamethod handling ------------------------------------------------
676  |//-----------------------------------------------------------------------
677  |
678  |//-- Continuation dispatch ----------------------------------------------
679  |
680  |->cont_dispatch:
681  |  // BASE = meta base, RA = resultofs, RD = nresults+1 (also in MULTRES)
682  |  add RA, BASE
683  |  and PC, -8
684  |  mov RB, BASE
685  |  sub BASE, PC			// Restore caller BASE.
686  |  mov aword [RA+RD*8-8], LJ_TNIL	// Ensure one valid arg.
687  |  mov RC, RA				// ... in [RC]
688  |  mov PC, [RB-24]			// Restore PC from [cont|PC].
689  |  mov RA, qword [RB-32]		// May be negative on WIN64 with debug.
690  |.if FFI
691  |  cmp RA, 1
692  |  jbe >1
693  |.endif
694  |  mov LFUNC:KBASE, [BASE-16]
695  |  cleartp LFUNC:KBASE
696  |  mov KBASE, LFUNC:KBASE->pc
697  |  mov KBASE, [KBASE+PC2PROTO(k)]
698  |  // BASE = base, RC = result, RB = meta base
699  |  jmp RA				// Jump to continuation.
700  |
701  |.if FFI
702  |1:
703  |  je ->cont_ffi_callback		// cont = 1: return from FFI callback.
704  |  // cont = 0: Tail call from C function.
705  |  sub RB, BASE
706  |  shr RBd, 3
707  |  lea RDd, [RBd-3]
708  |  jmp ->vm_call_tail
709  |.endif
710  |
711  |->cont_cat:				// BASE = base, RC = result, RB = mbase
712  |  movzx RAd, PC_RB
713  |  sub RB, 32
714  |  lea RA, [BASE+RA*8]
715  |  sub RA, RB
716  |  je ->cont_ra
717  |  neg RA
718  |  shr RAd, 3
719  |.if X64WIN
720  |  mov CARG3d, RAd
721  |  mov L:CARG1, SAVE_L
722  |  mov L:CARG1->base, BASE
723  |  mov RC, [RC]
724  |  mov [RB], RC
725  |  mov CARG2, RB
726  |.else
727  |  mov L:CARG1, SAVE_L
728  |  mov L:CARG1->base, BASE
729  |  mov CARG3d, RAd
730  |  mov RA, [RC]
731  |  mov [RB], RA
732  |  mov CARG2, RB
733  |.endif
734  |  jmp ->BC_CAT_Z
735  |
736  |//-- Table indexing metamethods -----------------------------------------
737  |
738  |->vmeta_tgets:
739  |  settp STR:RC, LJ_TSTR		// STR:RC = GCstr *
740  |  mov TMP1, STR:RC
741  |  lea RC, TMP1
742  |  cmp PC_OP, BC_GGET
743  |  jne >1
744  |  settp TAB:RA, TAB:RB, LJ_TTAB	// TAB:RB = GCtab *
745  |  lea RB, [DISPATCH+DISPATCH_GL(tmptv)]  // Store fn->l.env in g->tmptv.
746  |  mov [RB], TAB:RA
747  |  jmp >2
748  |
749  |->vmeta_tgetb:
750  |  movzx RCd, PC_RC
751  |.if DUALNUM
752  |  setint RC
753  |  mov TMP1, RC
754  |.else
755  |  cvtsi2sd xmm0, RCd
756  |  movsd TMP1, xmm0
757  |.endif
758  |  lea RC, TMP1
759  |  jmp >1
760  |
761  |->vmeta_tgetv:
762  |  movzx RCd, PC_RC			// Reload TValue *k from RC.
763  |  lea RC, [BASE+RC*8]
764  |1:
765  |  movzx RBd, PC_RB			// Reload TValue *t from RB.
766  |  lea RB, [BASE+RB*8]
767  |2:
768  |  mov L:CARG1, SAVE_L
769  |  mov L:CARG1->base, BASE		// Caveat: CARG2/CARG3 may be BASE.
770  |  mov CARG2, RB
771  |  mov CARG3, RC
772  |  mov L:RB, L:CARG1
773  |  mov SAVE_PC, PC
774  |  call extern lj_meta_tget		// (lua_State *L, TValue *o, TValue *k)
775  |  // TValue * (finished) or NULL (metamethod) returned in rax (RC).
776  |  mov BASE, L:RB->base
777  |  test RC, RC
778  |  jz >3
779  |->cont_ra:				// BASE = base, RC = result
780  |  movzx RAd, PC_RA
781  |  mov RB, [RC]
782  |  mov [BASE+RA*8], RB
783  |  ins_next
784  |
785  |3:  // Call __index metamethod.
786  |  // BASE = base, L->top = new base, stack = cont/func/t/k
787  |  mov RA, L:RB->top
788  |  mov [RA-24], PC			// [cont|PC]
789  |  lea PC, [RA+FRAME_CONT]
790  |  sub PC, BASE
791  |  mov LFUNC:RB, [RA-16]		// Guaranteed to be a function here.
792  |  mov NARGS:RDd, 2+1			// 2 args for func(t, k).
793  |  cleartp LFUNC:RB
794  |  jmp ->vm_call_dispatch_f
795  |
796  |->vmeta_tgetr:
797  |  mov CARG1, TAB:RB
798  |  mov RB, BASE			// Save BASE.
799  |  mov CARG2d, RCd			// Caveat: CARG2 == BASE
800  |  call extern lj_tab_getinth		// (GCtab *t, int32_t key)
801  |  // cTValue * or NULL returned in rax (RC).
802  |  movzx RAd, PC_RA
803  |  mov BASE, RB			// Restore BASE.
804  |  test RC, RC
805  |  jnz ->BC_TGETR_Z
806  |  mov ITYPE, LJ_TNIL
807  |  jmp ->BC_TGETR2_Z
808  |
809  |//-----------------------------------------------------------------------
810  |
811  |->vmeta_tsets:
812  |  settp STR:RC, LJ_TSTR		// STR:RC = GCstr *
813  |  mov TMP1, STR:RC
814  |  lea RC, TMP1
815  |  cmp PC_OP, BC_GSET
816  |  jne >1
817  |  settp TAB:RA, TAB:RB, LJ_TTAB	// TAB:RB = GCtab *
818  |  lea RB, [DISPATCH+DISPATCH_GL(tmptv)]  // Store fn->l.env in g->tmptv.
819  |  mov [RB], TAB:RA
820  |  jmp >2
821  |
822  |->vmeta_tsetb:
823  |  movzx RCd, PC_RC
824  |.if DUALNUM
825  |  setint RC
826  |  mov TMP1, RC
827  |.else
828  |  cvtsi2sd xmm0, RCd
829  |  movsd TMP1, xmm0
830  |.endif
831  |  lea RC, TMP1
832  |  jmp >1
833  |
834  |->vmeta_tsetv:
835  |  movzx RCd, PC_RC			// Reload TValue *k from RC.
836  |  lea RC, [BASE+RC*8]
837  |1:
838  |  movzx RBd, PC_RB			// Reload TValue *t from RB.
839  |  lea RB, [BASE+RB*8]
840  |2:
841  |  mov L:CARG1, SAVE_L
842  |  mov L:CARG1->base, BASE		// Caveat: CARG2/CARG3 may be BASE.
843  |  mov CARG2, RB
844  |  mov CARG3, RC
845  |  mov L:RB, L:CARG1
846  |  mov SAVE_PC, PC
847  |  call extern lj_meta_tset		// (lua_State *L, TValue *o, TValue *k)
848  |  // TValue * (finished) or NULL (metamethod) returned in rax (RC).
849  |  mov BASE, L:RB->base
850  |  test RC, RC
851  |  jz >3
852  |  // NOBARRIER: lj_meta_tset ensures the table is not black.
853  |  movzx RAd, PC_RA
854  |  mov RB, [BASE+RA*8]
855  |  mov [RC], RB
856  |->cont_nop:				// BASE = base, (RC = result)
857  |  ins_next
858  |
859  |3:  // Call __newindex metamethod.
860  |  // BASE = base, L->top = new base, stack = cont/func/t/k/(v)
861  |  mov RA, L:RB->top
862  |  mov [RA-24], PC			// [cont|PC]
863  |  movzx RCd, PC_RA
864  |  // Copy value to third argument.
865  |  mov RB, [BASE+RC*8]
866  |  mov [RA+16], RB
867  |  lea PC, [RA+FRAME_CONT]
868  |  sub PC, BASE
869  |  mov LFUNC:RB, [RA-16]		// Guaranteed to be a function here.
870  |  mov NARGS:RDd, 3+1			// 3 args for func(t, k, v).
871  |  cleartp LFUNC:RB
872  |  jmp ->vm_call_dispatch_f
873  |
874  |->vmeta_tsetr:
875  |.if X64WIN
876  |  mov L:CARG1, SAVE_L
877  |  mov CARG3d, RCd
878  |  mov L:CARG1->base, BASE
879  |  xchg CARG2, TAB:RB			// Caveat: CARG2 == BASE.
880  |.else
881  |  mov L:CARG1, SAVE_L
882  |  mov CARG2, TAB:RB
883  |  mov L:CARG1->base, BASE
884  |  mov RB, BASE			// Save BASE.
885  |  mov CARG3d, RCd			// Caveat: CARG3 == BASE.
886  |.endif
887  |  mov SAVE_PC, PC
888  |  call extern lj_tab_setinth  // (lua_State *L, GCtab *t, int32_t key)
889  |  // TValue * returned in rax (RC).
890  |  movzx RAd, PC_RA
891  |  mov BASE, RB			// Restore BASE.
892  |  jmp ->BC_TSETR_Z
893  |
894  |//-- Comparison metamethods ---------------------------------------------
895  |
896  |->vmeta_comp:
897  |  movzx RDd, PC_RD
898  |  movzx RAd, PC_RA
899  |  mov L:RB, SAVE_L
900  |  mov L:RB->base, BASE		// Caveat: CARG2/CARG3 == BASE.
901  |.if X64WIN
902  |  lea CARG3, [BASE+RD*8]
903  |  lea CARG2, [BASE+RA*8]
904  |.else
905  |  lea CARG2, [BASE+RA*8]
906  |  lea CARG3, [BASE+RD*8]
907  |.endif
908  |  mov CARG1, L:RB			// Caveat: CARG1/CARG4 == RA.
909  |  movzx CARG4d, PC_OP
910  |  mov SAVE_PC, PC
911  |  call extern lj_meta_comp	// (lua_State *L, TValue *o1, *o2, int op)
912  |  // 0/1 or TValue * (metamethod) returned in rax (RC).
913  |3:
914  |  mov BASE, L:RB->base
915  |  cmp RC, 1
916  |  ja ->vmeta_binop
917  |4:
918  |  lea PC, [PC+4]
919  |  jb >6
920  |5:
921  |  movzx RDd, PC_RD
922  |  branchPC RD
923  |6:
924  |  ins_next
925  |
926  |->cont_condt:			// BASE = base, RC = result
927  |  add PC, 4
928  |  mov ITYPE, [RC]
929  |  sar ITYPE, 47
930  |  cmp ITYPEd, LJ_TISTRUECOND		// Branch if result is true.
931  |  jb <5
932  |  jmp <6
933  |
934  |->cont_condf:			// BASE = base, RC = result
935  |  mov ITYPE, [RC]
936  |  sar ITYPE, 47
937  |  cmp ITYPEd, LJ_TISTRUECOND		// Branch if result is false.
938  |  jmp <4
939  |
940  |->vmeta_equal:
941  |  cleartp TAB:RD
942  |  sub PC, 4
943  |.if X64WIN
944  |  mov CARG3, RD
945  |  mov CARG4d, RBd
946  |  mov L:RB, SAVE_L
947  |  mov L:RB->base, BASE		// Caveat: CARG2 == BASE.
948  |  mov CARG2, RA
949  |  mov CARG1, L:RB			// Caveat: CARG1 == RA.
950  |.else
951  |  mov CARG2, RA
952  |  mov CARG4d, RBd			// Caveat: CARG4 == RA.
953  |  mov L:RB, SAVE_L
954  |  mov L:RB->base, BASE		// Caveat: CARG3 == BASE.
955  |  mov CARG3, RD
956  |  mov CARG1, L:RB
957  |.endif
958  |  mov SAVE_PC, PC
959  |  call extern lj_meta_equal	// (lua_State *L, GCobj *o1, *o2, int ne)
960  |  // 0/1 or TValue * (metamethod) returned in rax (RC).
961  |  jmp <3
962  |
963  |->vmeta_equal_cd:
964  |.if FFI
965  |  sub PC, 4
966  |  mov L:RB, SAVE_L
967  |  mov L:RB->base, BASE
968  |  mov CARG1, L:RB
969  |  mov CARG2d, dword [PC-4]
970  |  mov SAVE_PC, PC
971  |  call extern lj_meta_equal_cd	// (lua_State *L, BCIns ins)
972  |  // 0/1 or TValue * (metamethod) returned in rax (RC).
973  |  jmp <3
974  |.endif
975  |
976  |->vmeta_istype:
977  |  mov L:RB, SAVE_L
978  |  mov L:RB->base, BASE		// Caveat: CARG2/CARG3 may be BASE.
979  |  mov CARG2d, RAd
980  |  mov CARG3d, RDd
981  |  mov L:CARG1, L:RB
982  |  mov SAVE_PC, PC
983  |  call extern lj_meta_istype  // (lua_State *L, BCReg ra, BCReg tp)
984  |  mov BASE, L:RB->base
985  |  jmp <6
986  |
987  |//-- Arithmetic metamethods ---------------------------------------------
988  |
989  |->vmeta_arith_vno:		// Entry stubs: each converts operand indexes into slot addresses in RB/RC, then joins the shared tail at label 1 or 2.
990  |.if DUALNUM
991  |  movzx RBd, PC_RB		// DUALNUM builds reload both operands through PC_RB/PC_RC (macros defined elsewhere).
992  |  movzx RCd, PC_RC
993  |.endif
994  |->vmeta_arith_vn:
995  |  lea RC, [KBASE+RC*8]		// RC = address of 8-byte slot RC relative to KBASE.
996  |  jmp >1		// Label 1 still converts RB relative to BASE.
997  |
998  |->vmeta_arith_nvo:
999  |.if DUALNUM
1000  |  movzx RBd, PC_RB
1001  |  movzx RCd, PC_RC
1002  |.endif
1003  |->vmeta_arith_nv:
1004  |  lea TMPR, [KBASE+RC*8]		// TMPR = KBASE slot address for index RC.
1005  |  lea RC, [BASE+RB*8]		// RC = BASE slot address for index RB.
1006  |  mov RB, TMPR		// Operands end up swapped relative to the vn variant.
1007  |  jmp >2		// Both pointers are final, so label 1 is skipped.
1008  |
1009  |->vmeta_unm:
1010  |  lea RC, [BASE+RD*8]		// Single operand: slot RD relative to BASE.
1011  |  mov RB, RC		// The same slot address is passed as both operands.
1012  |  jmp >2
1013  |
1014  |->vmeta_arith_vvo:		// Both operands are BASE-relative; falls through into the tail shared by all arithmetic stubs.
1015  |.if DUALNUM
1016  |  movzx RBd, PC_RB
1017  |  movzx RCd, PC_RC
1018  |.endif
1019  |->vmeta_arith_vv:
1020  |  lea RC, [BASE+RC*8]
1021  |1:		// Entered with RC already an address; RB is still an index.
1022  |  lea RB, [BASE+RB*8]
1023  |2:		// Entered with RB and RC both addresses; RA is still an index.
1024  |  lea RA, [BASE+RA*8]
1025  |.if X64WIN
1026  |  mov CARG3, RB		// rb
1027  |  mov CARG4, RC		// rc
1028  |  movzx RCd, PC_OP		// RC is free after the copy above; reuse it to fetch the opcode.
1029  |  mov ARG5d, RCd		// op is passed through ARG5d (defined elsewhere).
1030  |  mov L:RB, SAVE_L
1031  |  mov L:RB->base, BASE		// Caveat: CARG2 == BASE.
1032  |  mov CARG2, RA
1033  |  mov CARG1, L:RB			// Caveat: CARG1 == RA.
1034  |.else
1035  |  movzx CARG5d, PC_OP		// op
1036  |  mov CARG2, RA
1037  |  mov CARG4, RC			// Caveat: CARG4 == RA.
1038  |  mov L:CARG1, SAVE_L
1039  |  mov L:CARG1->base, BASE		// Caveat: CARG3 == BASE.
1040  |  mov CARG3, RB
1041  |  mov L:RB, L:CARG1		// Keep L in RB (rbp, callee-saved) for use after the call.
1042  |.endif
1043  |  mov SAVE_PC, PC
1044  |  call extern lj_meta_arith	// (lua_State *L, TValue *ra,*rb,*rc, BCReg op)
1045  |  // NULL (finished) or TValue * (metamethod) returned in eax (RC).
1046  |  mov BASE, L:RB->base
1047  |  test RC, RC
1048  |  jz ->cont_nop		// NULL: nothing left to call; otherwise fall through into vmeta_binop.
1049  |
1050  |  // Call metamethod for binary op.
1051  |->vmeta_binop:
1052  |  // BASE = base, RC = new base, stack = cont/func/o1/o2
1053  |  mov RA, RC		// RA = new base.
1054  |  sub RC, BASE		// RC = byte distance from the old base to the new base.
1055  |  mov [RA-24], PC			// [cont|PC]
1056  |  lea PC, [RC+FRAME_CONT]		// PC now holds that distance plus FRAME_CONT.
1057  |  mov NARGS:RDd, 2+1			// 2 args for func(o1, o2).
1058  |  jmp ->vm_call_dispatch
1059  |
1060  |->vmeta_len:		// Length slow path: calls lj_meta_len on the BASE slot selected by PC_RD.
1061  |  movzx RDd, PC_RD
1062  |  mov L:RB, SAVE_L
1063  |  mov L:RB->base, BASE
1064  |  lea CARG2, [BASE+RD*8]		// Caveat: CARG2 == BASE
1065  |  mov L:CARG1, L:RB
1066  |  mov SAVE_PC, PC
1067  |  call extern lj_meta_len		// (lua_State *L, TValue *o)
1068  |  // NULL (retry) or TValue * (metamethod) returned in eax (RC).
1069  |  mov BASE, L:RB->base
1070#if LJ_52
1071  |  test RC, RC
1072  |  jne ->vmeta_binop			// Binop call for compatibility.
1073  |  movzx RDd, PC_RD		// NULL result: re-read the operand index, since RD (rax) held the return value.
1074  |  mov TAB:CARG1, [BASE+RD*8]
1075  |  cleartp TAB:CARG1
1076  |  jmp ->BC_LEN_Z		// Retry at BC_LEN_Z (defined elsewhere) with the operand in CARG1.
1077#else
1078  |  jmp ->vmeta_binop			// Binop call for compatibility.
1079#endif
1080  |
1081  |//-- Call metamethod ----------------------------------------------------
1082  |
1083  |->vmeta_call_ra:		// Variant entry: derives the new base from slot index RA, then falls through.
1084  |  lea RA, [BASE+RA*8+16]
1085  |->vmeta_call:			// Resolve and call __call metamethod.
1086  |  // BASE = old base, RA = new base, RC = nargs+1, PC = return
1087  |  mov TMP1d, NARGS:RDd		// Save RA, RC for us.
1088  |  mov RB, RA		// RB is rbp (callee-saved), so the new base survives the C call.
1089  |.if X64WIN
1090  |  mov L:TMPR, SAVE_L
1091  |  mov L:TMPR->base, BASE		// Caveat: CARG2 is BASE.
1092  |  lea CARG2, [RA-16]		// func
1093  |  lea CARG3, [RA+NARGS:RD*8-8]		// top
1094  |  mov CARG1, L:TMPR			// Caveat: CARG1 is RA.
1095  |.else
1096  |  mov L:CARG1, SAVE_L
1097  |  mov L:CARG1->base, BASE		// Caveat: CARG3 is BASE.
1098  |  lea CARG2, [RA-16]		// func
1099  |  lea CARG3, [RA+NARGS:RD*8-8]		// top
1100  |.endif
1101  |  mov SAVE_PC, PC
1102  |  call extern lj_meta_call	// (lua_State *L, TValue *func, TValue *top)
1103  |  mov RA, RB		// Restore the new base.
1104  |  mov L:RB, SAVE_L
1105  |  mov BASE, L:RB->base
1106  |  mov NARGS:RDd, TMP1d		// Restore the saved argument count.
1107  |  mov LFUNC:RB, [RA-16]		// Reload the function slot after the call.
1108  |  add NARGS:RDd, 1		// Count one more argument than before. NOTE(review): presumably the original callee, inserted by lj_meta_call — confirm.
1109  |  // This is fragile. L->base must not move, KBASE must always be defined.
1110  |  cmp KBASE, BASE			// Continue with CALLT if flag set.
1111  |  je ->BC_CALLT_Z
1112  |  cleartp LFUNC:RB
1113  |  mov BASE, RA
1114  |  ins_call				// Otherwise call resolved metamethod.
1115  |
1116  |//-- Argument coercion for 'for' statement ------------------------------
1117  |
1118  |->vmeta_for:		// Calls lj_meta_for on the slot address in RA, then re-dispatches the instruction at PC-4.
1119  |  mov L:RB, SAVE_L
1120  |  mov L:RB->base, BASE
1121  |  mov CARG2, RA			// Caveat: CARG2 == BASE
1122  |  mov L:CARG1, L:RB			// Caveat: CARG1 == RA
1123  |  mov SAVE_PC, PC
1124  |  call extern lj_meta_for	// (lua_State *L, TValue *base)
1125  |  mov BASE, L:RB->base
1126  |  mov RCd, [PC-4]		// Re-read the 32-bit instruction word just below PC.
1127  |  movzx RAd, RCH		// RA = second byte of the word.
1128  |  movzx OP, RCL		// OP = low byte of the word.
1129  |  shr RCd, 16		// RC = upper 16 bits of the word.
1130  |  jmp aword [DISPATCH+OP*8+GG_DISP2STATIC]	// Retry FORI or JFORI.
1131  |
1132  |//-----------------------------------------------------------------------
1133  |//-- Fast functions -----------------------------------------------------
1134  |//-----------------------------------------------------------------------
1135  |
1136  |.macro .ffunc, name
1137  |->ff_ .. name:		// Bare entry label ff_<name>; no argument count check.
1138  |.endmacro
1139  |
1140  |.macro .ffunc_1, name
1141  |->ff_ .. name:
1142  |  cmp NARGS:RDd, 1+1;  jb ->fff_fallback		// Fallback with fewer than 1 argument; the cmp flags stay live for the caller of the macro.
1143  |.endmacro
1144  |
1145  |.macro .ffunc_2, name
1146  |->ff_ .. name:
1147  |  cmp NARGS:RDd, 2+1;  jb ->fff_fallback		// Fallback with fewer than 2 arguments.
1148  |.endmacro
1149  |
1150  |.macro .ffunc_n, name, op
1151  |  .ffunc_1 name
1152  |  checknumtp [BASE], ->fff_fallback		// Argument 1 must pass the number check (checknumtp is defined elsewhere).
1153  |  op xmm0, qword [BASE]		// Apply the given SSE op to argument 1, result in xmm0.
1154  |.endmacro
1155  |
1156  |.macro .ffunc_n, name
1157  |  .ffunc_n name, movsd		// One-parameter form: just load argument 1 into xmm0.
1158  |.endmacro
1159  |
1160  |.macro .ffunc_nn, name
1161  |  .ffunc_2 name
1162  |  checknumtp [BASE], ->fff_fallback
1163  |  checknumtp [BASE+8], ->fff_fallback
1164  |  movsd xmm0, qword [BASE]		// xmm0 = argument 1.
1165  |  movsd xmm1, qword [BASE+8]		// xmm1 = argument 2.
1166  |.endmacro
1167  |
1168  |// Inlined GC threshold check. Caveat: uses label 1.
1169  |.macro ffgccheck
1170  |  mov RB, [DISPATCH+DISPATCH_GL(gc.total)]		// Clobbers RB.
1171  |  cmp RB, [DISPATCH+DISPATCH_GL(gc.threshold)]
1172  |  jb >1		// gc.total below gc.threshold (unsigned): skip the step.
1173  |  call ->fff_gcstep
1174  |1:
1175  |.endmacro
1176  |
1177  |//-- Base library: checks -----------------------------------------------
1178  |
1179  |.ffunc_1 assert		// Fast path: argument 1 passes the truth test, so all arguments are returned.
1180  |  mov ITYPE, [BASE]
1181  |  mov RB, ITYPE		// This copy is never read: RB is reloaded from [BASE] before its next use.
1182  |  sar ITYPE, 47
1183  |  cmp ITYPEd, LJ_TISTRUECOND; jae ->fff_fallback		// Tag not below LJ_TISTRUECOND (unsigned): use the fallback.
1184  |  mov PC, [BASE-8]
1185  |  mov MULTRES, RDd		// Stored here because the exit goes to fff_res_, which is past the MULTRES store of fff_res.
1186  |  mov RB, [BASE]
1187  |  mov [BASE-16], RB		// First result goes to BASE-16.
1188  |  sub RDd, 2		// RD = number of remaining arguments to move.
1189  |  jz >2
1190  |  mov RA, BASE
1191  |1:		// Move each remaining argument down by 16 bytes.
1192  |  add RA, 8
1193  |  mov RB, [RA]
1194  |  mov [RA-16], RB
1195  |  sub RDd, 1
1196  |  jnz <1
1197  |2:
1198  |  mov RDd, MULTRES		// Restore the count that the loop consumed.
1199  |  jmp ->fff_res_
1200  |
1201  |.ffunc_1 type		// Returns a string picked from this function's upvalue array, indexed by the argument's type tag.
1202  |  mov RC, [BASE]
1203  |  sar RC, 47
1204  |  mov RBd, LJ_TISNUM
1205  |  cmp RCd, RBd
1206  |  cmovb RCd, RBd		// Tags below LJ_TISNUM (unsigned) are clamped to LJ_TISNUM.
1207  |  not RCd		// Bitwise inverse of the tag is the upvalue index; the 32-bit write zero-extends RC.
1208  |2:
1209  |  mov CFUNC:RB, [BASE-16]		// The function being called sits at BASE-16.
1210  |  cleartp CFUNC:RB
1211  |  mov STR:RC, [CFUNC:RB+RC*8+((char *)(&((GCfuncC *)0)->upvalue))]		// Load upvalue[RC].
1212  |  mov PC, [BASE-8]
1213  |  settp STR:RC, LJ_TSTR
1214  |  mov [BASE-16], STR:RC
1215  |  jmp ->fff_res1
1216  |
1217  |//-- Base library: getters and setters ---------------------------------
1218  |
1219  |.ffunc_1 getmetatable		// Returns nil, the metatable, or the non-nil value stored under the MM_metatable key in it.
1220  |  mov TAB:RB, [BASE]
1221  |  mov PC, [BASE-8]
1222  |  checktab TAB:RB, >6		// Not a table: handled at label 6.
1223  |1:  // Field metatable must be at same offset for GCtab and GCudata!
1224  |  mov TAB:RB, TAB:RB->metatable
1225  |2:
1226  |  test TAB:RB, TAB:RB
1227  |  mov aword [BASE-16], LJ_TNIL		// mov leaves the flags from the test above intact.
1228  |  jz ->fff_res1		// No metatable: return nil.
1229  |  settp TAB:RC, TAB:RB, LJ_TTAB
1230  |  mov [BASE-16], TAB:RC		// Store metatable as default result.
1231  |  mov STR:RC, [DISPATCH+DISPATCH_GL(gcroot)+8*(GCROOT_MMNAME+MM_metatable)]
1232  |  mov RAd, TAB:RB->hmask
1233  |  and RAd, STR:RC->sid		// Node index = hmask & sid of the key string.
1234  |  settp STR:RC, LJ_TSTR		// Tag the key so it can be compared against node keys.
1235  |  imul RAd, #NODE
1236  |  add NODE:RA, TAB:RB->node		// RA = first node of the chain to search.
1237  |3:  // Rearranged logic, because we expect _not_ to find the key.
1238  |  cmp NODE:RA->key, STR:RC
1239  |  je >5
1240  |4:
1241  |  mov NODE:RA, NODE:RA->next
1242  |  test NODE:RA, NODE:RA
1243  |  jnz <3
1244  |  jmp ->fff_res1			// Not found, keep default result.
1245  |5:
1246  |  mov RB, NODE:RA->val
1247  |  cmp RB, LJ_TNIL; je ->fff_res1	// Ditto for nil value.
1248  |  mov [BASE-16], RB			// Return value of mt.__metatable.
1249  |  jmp ->fff_res1
1250  |
1251  |6:		// Non-table argument. Uses ITYPE; NOTE(review): presumably set by checktab — confirm against the macro.
1252  |  cmp ITYPEd, LJ_TUDATA; je <1		// Userdata reads its own metatable field at label 1.
1253  |  cmp ITYPEd, LJ_TISNUM; ja >7
1254  |  mov ITYPEd, LJ_TISNUM		// Tags not above LJ_TISNUM (unsigned) share the LJ_TISNUM entry.
1255  |7:
1256  |  not ITYPEd
1257  |  mov TAB:RB, [DISPATCH+ITYPE*8+DISPATCH_GL(gcroot[GCROOT_BASEMT])]		// Per-type base metatable, indexed by the inverted tag.
1258  |  jmp <2
1259  |
1260  |.ffunc_2 setmetatable		// Inline only when arg 1 is a table without a metatable and arg 2 is a table.
1261  |  mov TAB:RB, [BASE]
1262  |  mov TAB:TMPR, TAB:RB		// Keep the original argument value for the result store below.
1263  |  checktab TAB:RB, ->fff_fallback
1264  |  // Fast path: no mt for table yet and not clearing the mt.
1265  |  cmp aword TAB:RB->metatable, 0; jne ->fff_fallback
1266  |  mov TAB:RA, [BASE+8]
1267  |  checktab TAB:RA, ->fff_fallback
1268  |  mov TAB:RB->metatable, TAB:RA
1269  |  mov PC, [BASE-8]
1270  |  mov [BASE-16], TAB:TMPR			// Return original table.
1271  |  test byte TAB:RB->marked, LJ_GC_BLACK	// isblack(table)
1272  |  jz >1
1273  |  // Possible write barrier. Table is black, but skip iswhite(mt) check.
1274  |  barrierback TAB:RB, RC
1275  |1:
1276  |  jmp ->fff_res1
1277  |
1278  |.ffunc_2 rawget		// Calls lj_tab_get(L, t, &key) and returns a copy of the slot it yields.
1279  |.if X64WIN
1280  |  mov TAB:RA, [BASE]
1281  |  checktab TAB:RA, ->fff_fallback
1282  |  mov RB, BASE			// Save BASE.
1283  |  lea CARG3, [BASE+8]		// key = address of argument 2.
1284  |  mov CARG2, TAB:RA			// Caveat: CARG2 == BASE.
1285  |  mov CARG1, SAVE_L
1286  |.else
1287  |  mov TAB:CARG2, [BASE]
1288  |  checktab TAB:CARG2, ->fff_fallback
1289  |  mov RB, BASE			// Save BASE.
1290  |  lea CARG3, [BASE+8]		// Caveat: CARG3 == BASE.
1291  |  mov CARG1, SAVE_L
1292  |.endif
1293  |  call extern lj_tab_get	// (lua_State *L, GCtab *t, cTValue *key)
1294  |  // cTValue * returned in eax (RD).
1295  |  mov BASE, RB			// Restore BASE.
1296  |  // Copy table slot.
1297  |  mov RB, [RD]
1298  |  mov PC, [BASE-8]
1299  |  mov [BASE-16], RB
1300  |  jmp ->fff_res1
1301  |
1302  |//-- Base library: conversions ------------------------------------------
1303  |
1304  |.ffunc tonumber		// Uses the bare .ffunc entry; the argument count is checked explicitly below.
1305  |  // Only handles the number case inline (without a base argument).
1306  |  cmp NARGS:RDd, 1+1;  jne ->fff_fallback	// Exactly one argument.
1307  |  mov RB, [BASE]
1308  |  checknumber RB, ->fff_fallback
1309  |  mov PC, [BASE-8]
1310  |  mov [BASE-16], RB		// The argument is returned unchanged.
1311  |  jmp ->fff_res1
1312  |
1313  |.ffunc_1 tostring
1314  |  // Only handles the string or number case inline.
1315  |  mov PC, [BASE-8]
1316  |  mov STR:RB, [BASE]
1317  |  checktp_nc STR:RB, LJ_TSTR, >3		// Not a string: try the number path at label 3.
1318  |  // A __tostring method in the string base metatable is ignored.
1319  |2:
1320  |  mov [BASE-16], STR:RB
1321  |  jmp ->fff_res1
1322  |3:  // Handle numbers inline, unless a number base metatable is present.
1323  |  cmp ITYPEd, LJ_TISNUM;  ja ->fff_fallback_1		// fff_fallback_1 resets the argument count to 1 before falling back.
1324  |  cmp aword [DISPATCH+DISPATCH_GL(gcroot[GCROOT_BASEMT_NUM])], 0
1325  |  jne ->fff_fallback
1326  |  ffgccheck				// Caveat: uses label 1.
1327  |  mov L:RB, SAVE_L
1328  |  mov L:RB->base, BASE		// Add frame since C call can throw.
1329  |  mov SAVE_PC, PC			// Redundant (but a defined value).
1330  |.if not X64WIN
1331  |  mov CARG2, BASE			// Otherwise: CARG2 == BASE
1332  |.endif
1333  |  mov L:CARG1, L:RB
1334  |.if DUALNUM
1335  |  call extern lj_strfmt_number	// (lua_State *L, cTValue *o)
1336  |.else
1337  |  call extern lj_strfmt_num		// (lua_State *L, lua_Number *np)
1338  |.endif
1339  |  // GCstr returned in eax (RD).
1340  |  mov BASE, L:RB->base
1341  |  settp STR:RB, RD, LJ_TSTR		// Tag the returned string into RB and share the store at label 2.
1342  |  jmp <2
1343  |
1344  |//-- Base library: iterators -------------------------------------------
1345  |
1346  |.ffunc_1 next
1347  |  je >2				// Missing 2nd arg?
1348  |1:
1349  |  mov CARG1, [BASE]
1350  |  mov PC, [BASE-8]
1351  |  checktab CARG1, ->fff_fallback
1352  |  mov RB, BASE			// Save BASE.
1353  |.if X64WIN
1354  |  lea CARG3, [BASE-16]		// o: lj_tab_next is handed the result area at BASE-16.
1355  |  lea CARG2, [BASE+8]		// Caveat: CARG2 == BASE.
1356  |.else
1357  |  lea CARG2, [BASE+8]		// key = address of argument 2.
1358  |  lea CARG3, [BASE-16]		// Caveat: CARG3 == BASE.
1359  |.endif
1360  |  call extern lj_tab_next		// (GCtab *t, cTValue *key, TValue *o)
1361  |  // 1=found, 0=end, -1=error returned in eax (RD).
1362  |  mov BASE, RB			// Restore BASE.
1363  |  test RDd, RDd;  jg ->fff_res2	// Found key/value.
1364  |  js ->fff_fallback_2		// Invalid key.
1365  |  // End of traversal: return nil.
1366  |  mov aword [BASE-16], LJ_TNIL
1367  |  jmp ->fff_res1
1368  |2:  // Set missing 2nd arg to nil.
1369  |  mov aword [BASE+8], LJ_TNIL
1370  |  jmp <1
1371  |
1372  |.ffunc_1 pairs		// Returns three values: this function's upvalue[0], the table argument, and nil.
1373  |  mov TAB:RB, [BASE]
1374  |  mov TMPR, TAB:RB		// Keep the original argument value for the second result.
1375  |  checktab TAB:RB, ->fff_fallback
1376#if LJ_52
1377  |  cmp aword TAB:RB->metatable, 0; jne ->fff_fallback
1378#endif
1379  |  mov CFUNC:RD, [BASE-16]
1380  |  cleartp CFUNC:RD
1381  |  mov CFUNC:RD, CFUNC:RD->upvalue[0]
1382  |  settp CFUNC:RD, LJ_TFUNC
1383  |  mov PC, [BASE-8]		// Fetch PC before the slot at BASE-8 is overwritten below.
1384  |  mov [BASE-16], CFUNC:RD
1385  |  mov [BASE-8], TMPR
1386  |  mov aword [BASE], LJ_TNIL
1387  |  mov RDd, 1+3
1388  |  jmp ->fff_res
1389  |
1390  |.ffunc_2 ipairs_aux		// Increments the index argument; returns index and value, or nothing when the slot is nil or missing.
1391  |  mov TAB:RB, [BASE]
1392  |  checktab TAB:RB, ->fff_fallback
1393  |.if DUALNUM
1394  |  mov RA, [BASE+8]
1395  |  checkint RA, ->fff_fallback
1396  |.else
1397  |  checknumtp [BASE+8], ->fff_fallback
1398  |  movsd xmm0, qword [BASE+8]
1399  |.endif
1400  |  mov PC, [BASE-8]
1401  |.if DUALNUM
1402  |  add RAd, 1
1403  |  setint ITYPE, RA
1404  |  mov [BASE-16], ITYPE		// First result: the incremented index.
1405  |.else
1406  |  sseconst_1 xmm1, TMPR
1407  |  addsd xmm0, xmm1
1408  |  cvttsd2si RAd, xmm0		// Integer copy of the incremented index, used for the table lookup.
1409  |  movsd qword [BASE-16], xmm0		// First result: the incremented index.
1410  |.endif
1411  |  cmp RAd, TAB:RB->asize;  jae >2	// Not in array part?
1412  |  mov RD, TAB:RB->array
1413  |  lea RD, [RD+RA*8]
1414  |1:		// RD = address of the candidate value slot.
1415  |  cmp aword [RD], LJ_TNIL;  je ->fff_res0
1416  |  // Copy array slot.
1417  |  mov RB, [RD]
1418  |  mov [BASE-8], RB
1419  |->fff_res2:		// Shared exit: return two results.
1420  |  mov RDd, 1+2
1421  |  jmp ->fff_res
1422  |2:  // Check for empty hash part first. Otherwise call C function.
1423  |  cmp dword TAB:RB->hmask, 0; je ->fff_res0
1424  |.if X64WIN
1425  |  mov TMPR, BASE		// BASE is rdx == CARG2, so park it in TMPR first.
1426  |  mov CARG2d, RAd
1427  |  mov CARG1, TAB:RB
1428  |  mov RB, TMPR		// Save BASE in RB across the call.
1429  |.else
1430  |  mov CARG1, TAB:RB
1431  |  mov RB, BASE			// Save BASE.
1432  |  mov CARG2d, RAd			// Caveat: CARG2 == BASE
1433  |.endif
1434  |  call extern lj_tab_getinth		// (GCtab *t, int32_t key)
1435  |  // cTValue * or NULL returned in eax (RD).
1436  |  mov BASE, RB
1437  |  test RD, RD
1438  |  jnz <1		// Non-NULL slot: run the same nil check and copy as the array path.
1439  |->fff_res0:		// Shared exit: return no results.
1440  |  mov RDd, 1+0
1441  |  jmp ->fff_res
1442  |
1443  |.ffunc_1 ipairs		// Returns three values: this function's upvalue[0], the table argument, and an initial zero.
1444  |  mov TAB:RB, [BASE]
1445  |  mov TMPR, TAB:RB		// Keep the original argument value for the second result.
1446  |  checktab TAB:RB, ->fff_fallback
1447#if LJ_52
1448  |  cmp aword TAB:RB->metatable, 0; jne ->fff_fallback
1449#endif
1450  |  mov CFUNC:RD, [BASE-16]
1451  |  cleartp CFUNC:RD
1452  |  mov CFUNC:RD, CFUNC:RD->upvalue[0]
1453  |  settp CFUNC:RD, LJ_TFUNC
1454  |  mov PC, [BASE-8]		// Fetch PC before the slot at BASE-8 is overwritten below.
1455  |  mov [BASE-16], CFUNC:RD
1456  |  mov [BASE-8], TMPR
1457  |.if DUALNUM
1458  |  mov64 RD, ((uint64_t)LJ_TISNUM<<47)		// Tag LJ_TISNUM with a zero payload.
1459  |  mov [BASE], RD
1460  |.else
1461  |  mov qword [BASE], 0		// All-zero bits.
1462  |.endif
1463  |  mov RDd, 1+3
1464  |  jmp ->fff_res
1465  |
1466  |//-- Base library: catch errors ----------------------------------------
1467  |
1468  |.ffunc_1 pcall		// Shifts the frame contents up one slot and dispatches the call with a FRAME_PCALL-tagged PC.
1469  |  lea RA, [BASE+16]		// New base.
1470  |  sub NARGS:RDd, 1		// The protected function itself no longer counts as an argument.
1471  |  mov PCd, 16+FRAME_PCALL
1472  |1:		// Shared with xpcall, which jumps here with its own RA, NARGS and PC.
1473  |  movzx RBd, byte [DISPATCH+DISPATCH_GL(hookmask)]
1474  |  shr RB, HOOK_ACTIVE_SHIFT
1475  |  and RB, 1		// RB = the single bit at HOOK_ACTIVE_SHIFT in hookmask.
1476  |  add PC, RB				// Remember active hook before pcall.
1477  |  // Note: this does a (harmless) copy of the function to the PC slot, too.
1478  |  mov KBASE, RD		// KBASE serves as the loop counter here.
1479  |2:		// Copy each slot up by 8 bytes, highest slot first.
1480  |  mov RB, [RA+KBASE*8-24]
1481  |  mov [RA+KBASE*8-16], RB
1482  |  sub KBASE, 1
1483  |  ja <2		// Repeat while the counter is still above zero.
1484  |  jmp ->vm_call_dispatch
1485  |
1486  |.ffunc_2 xpcall		// Needs a function as argument 2; swaps arguments 1 and 2, then reuses the pcall tail.
1487  |  mov LFUNC:RA, [BASE+8]
1488  |  checktp_nc LFUNC:RA, LJ_TFUNC, ->fff_fallback
1489  |  mov LFUNC:RB, [BASE]		// Swap function and traceback.
1490  |  mov [BASE], LFUNC:RA
1491  |  mov [BASE+8], LFUNC:RB
1492  |  lea RA, [BASE+24]		// New base is one slot higher than in pcall.
1493  |  sub NARGS:RDd, 2
1494  |  mov PCd, 24+FRAME_PCALL
1495  |  jmp <1		// Label 1 lives in the pcall body directly above.
1496  |
1497  |//-- Coroutine library --------------------------------------------------
1498  |
1499  |.macro coroutine_resume_wrap, resume
1500  |.if resume
1501  |.ffunc_1 coroutine_resume		// resume != 0: the coroutine is taken from argument 1.
1502  |  mov L:RB, [BASE]
1503  |  cleartp L:RB
1504  |.else
1505  |.ffunc coroutine_wrap_aux		// resume == 0: the coroutine is taken from upvalue[0] of the called function.
1506  |  mov CFUNC:RB, [BASE-16]
1507  |  cleartp CFUNC:RB
1508  |  mov L:RB, CFUNC:RB->upvalue[0].gcr
1509  |  cleartp L:RB
1510  |.endif
1511  |  mov PC, [BASE-8]
1512  |  mov SAVE_PC, PC
1513  |  mov TMP1, L:RB		// TMP1 keeps the coroutine pointer for the rest of the macro.
1514  |.if resume
1515  |  checktptp [BASE], LJ_TTHREAD, ->fff_fallback
1516  |.endif
1517  |  cmp aword L:RB->cframe, 0; jne ->fff_fallback
1518  |  cmp byte L:RB->status, LUA_YIELD;  ja ->fff_fallback
1519  |  mov RA, L:RB->top		// mov preserves the flags of the status compare for the je below.
1520  |  je >1				// Status != LUA_YIELD (i.e. 0)?
1521  |  cmp RA, L:RB->base			// Check for presence of initial func.
1522  |  je ->fff_fallback
1523  |  mov PC, [RA-8]			// Move initial function up.
1524  |  mov [RA], PC
1525  |  add RA, 8
1526  |1:
1527  |.if resume
1528  |  lea PC, [RA+NARGS:RD*8-16]		// Check stack space (-1-thread).
1529  |.else
1530  |  lea PC, [RA+NARGS:RD*8-8]		// Check stack space (-1).
1531  |.endif
1532  |  cmp PC, L:RB->maxstack; ja ->fff_fallback
1533  |  mov L:RB->top, PC		// New top of the coroutine stack.
1534  |
1535  |  mov L:RB, SAVE_L		// From here on RB is the current (calling) state again.
1536  |  mov L:RB->base, BASE
1537  |.if resume
1538  |  add BASE, 8			// Keep resumed thread in stack for GC.
1539  |.endif
1540  |  mov L:RB->top, BASE
1541  |.if resume
1542  |  lea RB, [BASE+NARGS:RD*8-24]	// RB = end of source for stack move.
1543  |.else
1544  |  lea RB, [BASE+NARGS:RD*8-16]	// RB = end of source for stack move.
1545  |.endif
1546  |  sub RB, PC			// Relative to PC.
1547  |
1548  |  cmp PC, RA
1549  |  je >3		// Nothing to move.
1550  |2:  // Move args to coroutine.
1551  |  mov RC, [PC+RB]
1552  |  mov [PC-8], RC
1553  |  sub PC, 8
1554  |  cmp PC, RA
1555  |  jne <2
1556  |3:
1557  |  mov CARG2, RA
1558  |  mov CARG1, TMP1
1559  |  call ->vm_resume			// (lua_State *L, TValue *base, 0, 0)
1560  |
1561  |  mov L:RB, SAVE_L
1562  |  mov L:PC, TMP1		// PC temporarily holds the coroutine pointer.
1563  |  mov BASE, L:RB->base
1564  |  mov [DISPATCH+DISPATCH_GL(cur_L)], L:RB
1565  |  set_vmstate INTERP
1566  |
1567  |  cmp eax, LUA_YIELD
1568  |  ja >8		// Status above LUA_YIELD is treated as an error at label 8.
1569  |4:
1570  |  mov RA, L:PC->base
1571  |  mov KBASE, L:PC->top
1572  |  mov L:PC->top, RA			// Clear coroutine stack.
1573  |  mov PC, KBASE
1574  |  sub PC, RA		// PC = size of the results in bytes.
1575  |  je >6				// No results?
1576  |  lea RD, [BASE+PC]
1577  |  shr PCd, 3		// PC = number of results (8 bytes each).
1578  |  cmp RD, L:RB->maxstack
1579  |  ja >9				// Need to grow stack?
1580  |
1581  |  mov RB, BASE
1582  |  sub RB, RA		// RB = offset from the coroutine's slots to the caller's slots.
1583  |5:  // Move results from coroutine.
1584  |  mov RD, [RA]
1585  |  mov [RA+RB], RD
1586  |  add RA, 8
1587  |  cmp RA, KBASE
1588  |  jne <5
1589  |6:
1590  |.if resume
1591  |  lea RDd, [PCd+2]			// nresults+1 = 1 + true + results.
1592  |  mov_true ITYPE			// Prepend true to results.
1593  |  mov [BASE-8], ITYPE
1594  |.else
1595  |  lea RDd, [PCd+1]			// nresults+1 = 1 + results.
1596  |.endif
1597  |7:
1598  |  mov PC, SAVE_PC
1599  |  mov MULTRES, RDd
1600  |.if resume
1601  |  mov RA, -8		// Results start at BASE-8 (the prepended boolean).
1602  |.else
1603  |  xor RAd, RAd		// Results start at BASE.
1604  |.endif
1605  |  test PCd, FRAME_TYPE
1606  |  jz ->BC_RET_Z
1607  |  jmp ->vm_return
1608  |
1609  |8:  // Coroutine returned with error (at co->top-1).
1610  |.if resume
1611  |  mov_false ITYPE			// Prepend false to results.
1612  |  mov [BASE-8], ITYPE
1613  |  mov RA, L:PC->top
1614  |  sub RA, 8
1615  |  mov L:PC->top, RA			// Clear error from coroutine stack.
1616  |  // Copy error message.
1617  |  mov RD, [RA]
1618  |  mov [BASE], RD
1619  |  mov RDd, 1+2			// nresults+1 = 1 + false + error.
1620  |  jmp <7
1621  |.else
1622  |  mov CARG2, L:PC
1623  |  mov CARG1, L:RB
1624  |  call extern lj_ffh_coroutine_wrap_err  // (lua_State *L, lua_State *co)
1625  |  // Error function does not return.
1626  |.endif
1627  |
1628  |9:  // Handle stack expansion on return from yield.
1629  |  mov L:RA, TMP1
1630  |  mov L:RA->top, KBASE		// Undo coroutine stack clearing.
1631  |  mov CARG2, PC		// n = result count computed at label 4.
1632  |  mov CARG1, L:RB
1633  |  call extern lj_state_growstack	// (lua_State *L, int n)
1634  |  mov L:PC, TMP1
1635  |  mov BASE, L:RB->base
1636  |  jmp <4				// Retry the stack move.
1637  |.endmacro
1638  |
1639  |  coroutine_resume_wrap 1		// coroutine.resume
1640  |  coroutine_resume_wrap 0		// coroutine.wrap
1641  |
1642  |.ffunc coroutine_yield		// Inline only when CFRAME_RESUME is set in L->cframe; otherwise falls back.
1643  |  mov L:RB, SAVE_L
1644  |  test aword L:RB->cframe, CFRAME_RESUME
1645  |  jz ->fff_fallback
1646  |  mov L:RB->base, BASE
1647  |  lea RD, [BASE+NARGS:RD*8-8]		// Address just past the last argument.
1648  |  mov L:RB->top, RD
1649  |  xor RDd, RDd
1650  |  mov aword L:RB->cframe, RD		// L->cframe = 0.
1651  |  mov al, LUA_YIELD		// al doubles as the value stored to L->status and is still set at the jump below.
1652  |  mov byte L:RB->status, al
1653  |  jmp ->vm_leave_unw
1654  |
1655  |//-- Math library -------------------------------------------------------
1656  |
1657  |  .ffunc_1 math_abs		// Also hosts the shared exits fff_resbit/fff_resi/fff_resRB in DUALNUM builds.
1658  |  mov RB, [BASE]
1659  |.if DUALNUM
1660  |  checkint RB, >3
1661  |  cmp RBd, 0; jns ->fff_resi		// Non-negative integer: return it as is.
1662  |  neg RBd; js >2		// Still negative after negation: handled at label 2.
1663  |->fff_resbit:
1664  |->fff_resi:		// Shared exit: return the integer in RBd.
1665  |  setint RB
1666  |->fff_resRB:		// Shared exit: return the full 64-bit value in RB.
1667  |  mov PC, [BASE-8]
1668  |  mov [BASE-16], RB
1669  |  jmp ->fff_res1
1670  |2:
1671  |  mov64 RB, U64x(41e00000,00000000)  // 2^31.
1672  |  jmp ->fff_resRB
1673  |3:
1674  |  ja ->fff_fallback		// Uses the flags left by checkint; otherwise continue with the number path below.
1675  |.else
1676  |  checknum RB, ->fff_fallback
1677  |.endif
1678  |  shl RB, 1		// Shift left then right by one: clears the top (sign) bit.
1679  |  shr RB, 1
1680  |  mov PC, [BASE-8]
1681  |  mov [BASE-16], RB
1682  |  jmp ->fff_res1
1683  |
1684  |.ffunc_n math_sqrt, sqrtsd		// xmm0 = sqrtsd of argument 1; falls straight into the shared result exits.
1685  |->fff_resxmm0:		// Shared exit: return the double in xmm0.
1686  |  mov PC, [BASE-8]
1687  |  movsd qword [BASE-16], xmm0
1688  |  // fallthrough
1689  |
1690  |->fff_res1:		// Shared exit: one result already stored at BASE-16, PC already loaded.
1691  |  mov RDd, 1+1
1692  |->fff_res:		// Shared exit: RD = nresults+1.
1693  |  mov MULTRES, RDd
1694  |->fff_res_:		// Entry for callers that already stored MULTRES.
1695  |  test PCd, FRAME_TYPE
1696  |  jnz >7
1697  |5:
1698  |  cmp PC_RB, RDL			// More results expected?
1699  |  ja >6
1700  |  // Adjust BASE. KBASE is assumed to be set for the calling frame.
1701  |  movzx RAd, PC_RA
1702  |  neg RA
1703  |  lea BASE, [BASE+RA*8-16]		// base = base - (RA+2)*8
1704  |  ins_next
1705  |
1706  |6:  // Fill up results with nil.
1707  |  mov aword [BASE+RD*8-24], LJ_TNIL
1708  |  add RD, 1
1709  |  jmp <5		// Re-check against the expected count.
1710  |
1711  |7:  // Non-standard return case.
1712  |  mov RA, -16			// Results start at BASE+RA = BASE-16.
1713  |  jmp ->vm_return
1714  |
1715  |.macro math_round, func
1716  |  .ffunc_1 math_ .. func		// Emits ff_math_<func>. At least one argument is required before [BASE] is inspected; with none the fallback handler runs instead of reading a stale slot.
1717  |.if DUALNUM
1718  |  mov RB, [BASE]
1719  |  checknumx RB, ->fff_resRB, je		// Branches to fff_resRB on "equal", returning the argument unchanged.
1720  |  ja ->fff_fallback		// Uses the flags left by checknumx.
1721  |.else
1722  |  checknumtp [BASE], ->fff_fallback
1723  |.endif
1724  |  movsd xmm0, qword [BASE]
1725  |  call ->vm_ .. func .. _sse		// Rounding helper vm_<func>_sse (defined elsewhere); result expected in xmm0.
1726  |.if DUALNUM
1727  |  cvttsd2si RBd, xmm0
1728  |  cmp RBd, 0x80000000		// 0x80000000 is also what cvttsd2si yields when the value does not fit.
1729  |  jne ->fff_resi		// Any other value is returned as an integer.
1730  |  cvtsi2sd xmm1, RBd		// Convert back to check whether the result really is -2^31.
1731  |  ucomisd xmm0, xmm1
1732  |  jp ->fff_resxmm0		// Unordered (NaN): return the double.
1733  |  je ->fff_resi		// Exactly -2^31: return the integer.
1734  |.endif
1735  |  jmp ->fff_resxmm0
1736  |.endmacro
1737  |
1738  |  math_round floor
1739  |  math_round ceil
1740  |
1741  |.ffunc math_log		// Inline only for exactly one number argument; calls the C function log.
1742  |  cmp NARGS:RDd, 1+1; jne ->fff_fallback	// Exactly one argument.
1743  |  checknumtp [BASE], ->fff_fallback
1744  |  movsd xmm0, qword [BASE]
1745  |  mov RB, BASE		// BASE (rdx) is not callee-saved; keep it in RB (rbp) across the call.
1746  |  call extern log
1747  |  mov BASE, RB
1748  |  jmp ->fff_resxmm0
1749  |
1750  |.macro math_extern, func
1751  |  .ffunc_n math_ .. func		// One number argument, loaded into xmm0 by .ffunc_n.
1752  |  mov RB, BASE		// Keep BASE in RB (rbp, callee-saved) across the C call.
1753  |  call extern func
1754  |  mov BASE, RB
1755  |  jmp ->fff_resxmm0
1756  |.endmacro
1757  |
1758  |.macro math_extern2, func
1759  |  .ffunc_nn math_ .. func		// Two number arguments, loaded into xmm0 and xmm1 by .ffunc_nn.
1760  |  mov RB, BASE		// Keep BASE in RB (rbp, callee-saved) across the C call.
1761  |  call extern func
1762  |  mov BASE, RB
1763  |  jmp ->fff_resxmm0
1764  |.endmacro
1765  |
1766  |  math_extern log10
1767  |  math_extern exp
1768  |  math_extern sin
1769  |  math_extern cos
1770  |  math_extern tan
1771  |  math_extern asin
1772  |  math_extern acos
1773  |  math_extern atan
1774  |  math_extern sinh
1775  |  math_extern cosh
1776  |  math_extern tanh
1777  |  math_extern2 pow
1778  |  math_extern2 atan2
1779  |  math_extern2 fmod
1780  |
1781  |.ffunc_2 math_ldexp		// Computed on the x87 stack with fscale; both arguments must pass the number check.
1782  |  checknumtp [BASE], ->fff_fallback
1783  |  checknumtp [BASE+8], ->fff_fallback
1784  |  fld qword [BASE+8]		// Pushed first, so argument 2 ends up in st1.
1785  |  fld qword [BASE]		// Argument 1 in st0.
1786  |  fscale
1787  |  fpop1		// NOTE(review): fpop1 is defined elsewhere; presumably drops st1 — confirm.
1788  |  mov PC, [BASE-8]
1789  |  fstp qword [BASE-16]		// Store the result and pop it.
1790  |  jmp ->fff_res1
1791  |
1792  |.ffunc_n math_frexp		// Calls C frexp; returns the double it leaves in xmm0 plus the integer it writes to TMP1.
1793  |  mov RB, BASE		// Keep BASE in RB across the C call.
1794  |.if X64WIN
1795  |  lea CARG2, TMP1		// Caveat: CARG2 == BASE
1796  |.else
1797  |  lea CARG1, TMP1		// The double goes in xmm0, so the pointer is the first integer argument here.
1798  |.endif
1799  |  call extern frexp
1800  |  mov BASE, RB
1801  |  mov RBd, TMP1d		// Integer written by frexp through the pointer.
1802  |  mov PC, [BASE-8]		// Fetch PC before the slot at BASE-8 is overwritten below.
1803  |  movsd qword [BASE-16], xmm0
1804  |.if DUALNUM
1805  |  setint RB
1806  |  mov [BASE-8], RB
1807  |.else
1808  |  cvtsi2sd xmm1, RBd
1809  |  movsd qword [BASE-8], xmm1
1810  |.endif
1811  |  mov RDd, 1+2
1812  |  jmp ->fff_res
1813  |
1814  |.ffunc_n math_modf		// Calls C modf with the first result slot as its output pointer.
1815  |  mov RB, BASE		// Keep BASE in RB across the C call.
1816  |.if X64WIN
1817  |  lea CARG2, [BASE-16]	// Caveat: CARG2 == BASE
1818  |.else
1819  |  lea CARG1, [BASE-16]		// modf stores through this pointer directly into the first result slot.
1820  |.endif
1821  |  call extern modf
1822  |  mov BASE, RB
1823  |  mov PC, [BASE-8]		// Fetch PC before the slot at BASE-8 is overwritten below.
1824  |  movsd qword [BASE-8], xmm0		// Second result: the value modf returns in xmm0.
1825  |  mov RDd, 1+2
1826  |  jmp ->fff_res
1827  |
1828  |.macro math_minmax, name, cmovop, sseop
1829  |  .ffunc_1 name		// Folds all arguments with cmovop (integers) or sseop (doubles).
1830  |  mov RAd, 2		// RA = nargs+1 style index of the next argument; compared against RD.
1831  |.if DUALNUM
1832  |  mov RB, [BASE]
1833  |  checkint RB, >4
1834  |1:  // Handle integers.
1835  |  cmp RAd, RDd; jae ->fff_resRB		// All arguments consumed: return RB.
1836  |  mov TMPR, [BASE+RA*8-8]
1837  |  checkint TMPR, >3
1838  |  cmp RBd, TMPRd
1839  |  cmovop RB, TMPR		// Replace the running result when the condition of cmovop holds.
1840  |  add RAd, 1
1841  |  jmp <1
1842  |3:
1843  |  ja ->fff_fallback		// Uses the flags left by checkint.
1844  |  // Convert intermediate result to number and continue below.
1845  |  cvtsi2sd xmm0, RBd
1846  |  jmp >6
1847  |4:
1848  |  ja ->fff_fallback		// Uses the flags left by checkint.
1849  |.else
1850  |  checknumtp [BASE], ->fff_fallback
1851  |.endif
1852  |
1853  |  movsd xmm0, qword [BASE]
1854  |5:  // Handle numbers or integers.
1855  |  cmp RAd, RDd; jae ->fff_resxmm0		// All arguments consumed: return xmm0.
1856  |.if DUALNUM
1857  |  mov RB, [BASE+RA*8-8]
1858  |  checknumx RB, >6, jb		// Branches to label 6 on "below", where the slot is loaded as a double.
1859  |  ja ->fff_fallback
1860  |  cvtsi2sd xmm1, RBd		// Otherwise the slot is converted from its 32-bit integer payload.
1861  |  jmp >7
1862  |.else
1863  |  checknumtp [BASE+RA*8-8], ->fff_fallback
1864  |.endif
1865  |6:
1866  |  movsd xmm1, qword [BASE+RA*8-8]
1867  |7:
1868  |  sseop xmm0, xmm1
1869  |  add RAd, 1
1870  |  jmp <5
1871  |.endmacro
1872  |
1873  |  math_minmax math_min, cmovg, minsd
1874  |  math_minmax math_max, cmovl, maxsd
1875  |
1876  |//-- String library -----------------------------------------------------
1877  |
1878  |.ffunc string_byte			// Only handle the 1-arg case here.
1879  |  cmp NARGS:RDd, 1+1;  jne ->fff_fallback
1880  |  mov STR:RB, [BASE]
1881  |  checkstr STR:RB, ->fff_fallback
1882  |  mov PC, [BASE-8]
1883  |  cmp dword STR:RB->len, 1
1884  |  jb ->fff_res0			// Return no results for empty string.
1885  |  movzx RBd, byte STR:RB[1]		// Zero-extended byte located at STR:RB[1].
1886  |.if DUALNUM
1887  |  jmp ->fff_resi
1888  |.else
1889  |  cvtsi2sd xmm0, RBd; jmp ->fff_resxmm0
1890  |.endif
1891  |
1892  |.ffunc string_char			// Only handle the 1-arg case here.
1893  |  ffgccheck
1894  |  cmp NARGS:RDd, 1+1;  jne ->fff_fallback	// *Exactly* 1 arg.
1895  |.if DUALNUM
1896  |  mov RB, [BASE]
1897  |  checkint RB, ->fff_fallback
1898  |.else
1899  |  checknumtp [BASE], ->fff_fallback
1900  |  cvttsd2si RBd, qword [BASE]
1901  |.endif
1902  |  cmp RBd, 255;  ja ->fff_fallback		// Unsigned compare: negative values also take the fallback.
1903  |  mov TMP1d, RBd		// The character value is stored in TMP1 to act as a 1-byte buffer.
1904  |  mov TMPRd, 1		// Length = 1.
1905  |  lea RD, TMP1			// Points to stack. Little-endian.
1906  |->fff_newstr:		// Shared: RD = char pointer, TMPRd = length; creates the string and returns it.
1907  |  mov L:RB, SAVE_L
1908  |  mov L:RB->base, BASE
1909  |  mov CARG3d, TMPRd			// Zero-extended to size_t.
1910  |  mov CARG2, RD
1911  |  mov CARG1, L:RB
1912  |  mov SAVE_PC, PC
1913  |  call extern lj_str_new		// (lua_State *L, char *str, size_t l)
1914  |->fff_resstr:		// Shared: expects L in RB and the string in RD.
1915  |  // GCstr * returned in eax (RD).
1916  |  mov BASE, L:RB->base
1917  |  mov PC, [BASE-8]
1918  |  settp STR:RD, LJ_TSTR
1919  |  mov [BASE-16], STR:RD
1920  |  jmp ->fff_res1
1921  |
1922  |.ffunc string_sub		// Normalizes start (RA) and end (TMPR) against len (RC), then builds the string via fff_newstr.
1923  |  ffgccheck
1924  |  mov TMPRd, -1		// Default end when only two arguments are given.
1925  |  cmp NARGS:RDd, 1+2;  jb ->fff_fallback
1926  |  jna >1		// Exactly two arguments: keep the default end.
1927  |.if DUALNUM
1928  |  mov TMPR, [BASE+16]
1929  |  checkint TMPR, ->fff_fallback
1930  |.else
1931  |  checknumtp [BASE+16], ->fff_fallback
1932  |  cvttsd2si TMPRd, qword [BASE+16]
1933  |.endif
1934  |1:
1935  |  mov STR:RB, [BASE]
1936  |  checkstr STR:RB, ->fff_fallback
1937  |.if DUALNUM
1938  |  mov ITYPE, [BASE+8]
1939  |  mov RAd, ITYPEd			// Must clear hiword for lea below.
1940  |  sar ITYPE, 47
1941  |  cmp ITYPEd, LJ_TISNUM
1942  |  jne ->fff_fallback
1943  |.else
1944  |  checknumtp [BASE+8], ->fff_fallback
1945  |  cvttsd2si RAd, qword [BASE+8]
1946  |.endif
1947  |  mov RCd, STR:RB->len
1948  |  cmp RCd, TMPRd			// len < end? (unsigned compare)
1949  |  jb >5
1950  |2:
1951  |  test RAd, RAd			// start <= 0?
1952  |  jle >7
1953  |3:
1954  |  sub TMPRd, RAd			// start > end?
1955  |  jl ->fff_emptystr
1956  |  lea RD, [STR:RB+RAd+#STR-1]		// RD = address of the first selected character.
1957  |  add TMPRd, 1		// Length = end - start + 1.
1958  |4:
1959  |  jmp ->fff_newstr
1960  |
1961  |5:  // Negative end or overflow.
1962  |  jl >6		// Reuses the flags of the cmp at label 1's tail, now read as a signed compare.
1963  |  lea TMPRd, [TMPRd+RCd+1]		// end = end+(len+1)
1964  |  jmp <2
1965  |6:  // Overflow.
1966  |  mov TMPRd, RCd			// end = len
1967  |  jmp <2
1968  |
1969  |7:  // Negative start or underflow.
1970  |  je >8		// start == 0 (flags from the test at label 2).
1971  |  add RAd, RCd			// start = start+(len+1)
1972  |  add RAd, 1
1973  |  jg <3				// start > 0?
1974  |8:  // Underflow.
1975  |  mov RAd, 1				// start = 1
1976  |  jmp <3
1977  |
1978  |->fff_emptystr:  // Range underflow.
1979  |  xor TMPRd, TMPRd			// Zero length. Any ptr in RD is ok.
1980  |  jmp <4
1981  |
1982  |.macro ffstring_op, name
1983  |  .ffunc_1 string_ .. name		// Emits ff_string_<name>: runs lj_buf_putstr_<name> on the global tmpbuf, then lj_buf_tostr.
1984  |  ffgccheck
1985  |.if X64WIN
1986  |  mov STR:TMPR, [BASE]
1987  |  checkstr STR:TMPR, ->fff_fallback
1988  |.else
1989  |  mov STR:CARG2, [BASE]
1990  |  checkstr STR:CARG2, ->fff_fallback
1991  |.endif
1992  |  mov L:RB, SAVE_L
1993  |   lea SBUF:CARG1, [DISPATCH+DISPATCH_GL(tmpbuf)]
1994  |  mov L:RB->base, BASE
1995  |.if X64WIN
1996  |  mov STR:CARG2, STR:TMPR		// Caveat: CARG2 == BASE
1997  |.endif
1998  |   mov RC, SBUF:CARG1->b
1999  |   mov SBUF:CARG1->L, L:RB
2000  |   mov SBUF:CARG1->w, RC		// w = b: sets the buffer's w field back to its b field.
2001  |  mov SAVE_PC, PC
2002  |  call extern lj_buf_putstr_ .. name
2003  |  mov CARG1, rax		// The return value of the first call is passed on as the argument of lj_buf_tostr.
2004  |  call extern lj_buf_tostr
2005  |  jmp ->fff_resstr		// fff_resstr expects L in RB and the string in RD.
2006  |.endmacro
2007  |
2008  |ffstring_op reverse
2009  |ffstring_op lower
2010  |ffstring_op upper
2011  |
2012  |//-- Bit library --------------------------------------------------------
2013  |
2014  |.macro .ffunc_bit, name, kind, fdef
2015  |  fdef name		// fdef is the entry macro to use (.ffunc_1 or .ffunc_2); leaves argument 1 as a 32-bit value in RBd at label 2.
2016  |.if kind == 2
2017  |  sseconst_tobit xmm1, RB		// kind 2 loads the constant up front, before RB receives the argument.
2018  |.endif
2019  |.if DUALNUM
2020  |  mov RB, [BASE]
2021  |  checkint RB, >1
2022  |.if kind > 0
2023  |  jmp >2		// Integer argument: skip the conversion.
2024  |.else
2025  |  jmp ->fff_resbit		// kind 0: an integer argument is returned directly.
2026  |.endif
2027  |1:
2028  |  ja ->fff_fallback		// Uses the flags left by checkint.
2029  |  movd xmm0, RB
2030  |.else
2031  |  checknumtp [BASE], ->fff_fallback
2032  |  movsd xmm0, qword [BASE]
2033  |.endif
2034  |.if kind < 2
2035  |  sseconst_tobit xmm1, RB
2036  |.endif
2037  |  addsd xmm0, xmm1		// Add the constant loaded by sseconst_tobit (defined elsewhere) ...
2038  |  movd RBd, xmm0		// ... and take the low 32 bits of the sum as the result.
2039  |2:
2040  |.endmacro
2041  |
2042  |.macro .ffunc_bit, name, kind
2043  |  .ffunc_bit name, kind, .ffunc_1		// Two-parameter form defaults the entry macro to .ffunc_1.
2044  |.endmacro
2045  |
2046  |.ffunc_bit bit_tobit, 0
2047  |  jmp ->fff_resbit
2048  |
2049  |.macro .ffunc_bit_op, name, ins
2050  |  .ffunc_bit name, 2		// RBd = converted argument 1; xmm1 keeps the conversion constant for the loop.
2051  |  mov TMPRd, NARGS:RDd		// Save for fallback.
2052  |  lea RD, [BASE+NARGS:RD*8-16]		// RD = address of the last argument.
2053  |1:		// Walk the arguments from last down to the second.
2054  |  cmp RD, BASE
2055  |  jbe ->fff_resbit		// Reached argument 1: done.
2056  |.if DUALNUM
2057  |  mov RA, [RD]
2058  |  checkint RA, >2
2059  |  ins RBd, RAd
2060  |  sub RD, 8
2061  |  jmp <1
2062  |2:
2063  |  ja ->fff_fallback_bit_op		// Uses the flags left by checkint.
2064  |  movd xmm0, RA
2065  |.else
2066  |  checknumtp [RD], ->fff_fallback_bit_op
2067  |  movsd xmm0, qword [RD]
2068  |.endif
2069  |  addsd xmm0, xmm1
2070  |  movd RAd, xmm0
2071  |  ins RBd, RAd		// Combine into the running result.
2072  |  sub RD, 8
2073  |  jmp <1
2074  |.endmacro
2075  |
2076  |.ffunc_bit_op bit_band, and
2077  |.ffunc_bit_op bit_bor, or
2078  |.ffunc_bit_op bit_bxor, xor
2079  |
2080  |.ffunc_bit bit_bswap, 1
2081  |  bswap RBd		// Reverse the byte order of the 32-bit value.
2082  |  jmp ->fff_resbit
2083  |
2084  |.ffunc_bit bit_bnot, 1
2085  |  not RBd
2086  |.if DUALNUM
2087  |  jmp ->fff_resbit
2088  |.else
2089  |->fff_resbit:		// Non-DUALNUM definition of the shared exit: return RBd converted to a double.
2090  |  cvtsi2sd xmm0, RBd
2091  |  jmp ->fff_resxmm0
2092  |.endif
2093  |
2094  |->fff_fallback_bit_op:		// Used by .ffunc_bit_op, which overwrites RD while iterating.
2095  |  mov NARGS:RDd, TMPRd		// Restore for fallback
2096  |  jmp ->fff_fallback
2097  |
2098  |.macro .ffunc_bit_sh, name, ins
2099  |.if DUALNUM
2100  |  .ffunc_bit name, 1, .ffunc_2		// Two arguments required; RBd = converted argument 1.
2101  |  // Note: no inline conversion from number for 2nd argument!
2102  |  mov RA, [BASE+8]
2103  |  checkint RA, ->fff_fallback
2104  |.else
2105  |  .ffunc_nn name
2106  |  sseconst_tobit xmm2, RB
2107  |  addsd xmm0, xmm2		// Convert both arguments with the same constant.
2108  |  addsd xmm1, xmm2
2109  |  movd RBd, xmm0		// Value to shift.
2110  |  movd RAd, xmm1		// Shift count.
2111  |.endif
2112  |  ins RBd, cl			// Assumes RA is ecx.
2113  |  jmp ->fff_resbit
2114  |.endmacro
2115  |
2116  |.ffunc_bit_sh bit_lshift, shl
2117  |.ffunc_bit_sh bit_rshift, shr
2118  |.ffunc_bit_sh bit_arshift, sar
2119  |.ffunc_bit_sh bit_rol, rol
2120  |.ffunc_bit_sh bit_ror, ror
2121  |
2122  |//-----------------------------------------------------------------------
2123  |
2124  |->fff_fallback_2:			// Entry that forces nargs = 2 before falling back.
2125  |  mov NARGS:RDd, 1+2			// Other args are ignored, anyway.
2126  |  jmp ->fff_fallback
2127  |->fff_fallback_1:			// Entry that forces nargs = 1 before falling back.
2128  |  mov NARGS:RDd, 1+1			// Other args are ignored, anyway.
2129  |->fff_fallback:			// Call fast function fallback handler.
2130  |  // BASE = new base, RD = nargs+1
2131  |  mov L:RB, SAVE_L
2132  |  mov PC, [BASE-8]			// Fallback may overwrite PC.
2133  |  mov SAVE_PC, PC			// Redundant (but a defined value).
2134  |  mov L:RB->base, BASE
2135  |  lea RD, [BASE+NARGS:RD*8-8]	// RD = BASE + nargs*8, the slot after the last argument.
2136  |  lea RA, [RD+8*LUA_MINSTACK]	// Ensure enough space for handler.
2137  |  mov L:RB->top, RD
2138  |  mov CFUNC:RD, [BASE-16]		// Tagged function slot of this frame.
2139  |  cleartp CFUNC:RD
2140  |  cmp RA, L:RB->maxstack
2141  |  ja >5				// Need to grow stack.
2142  |  mov CARG1, L:RB
2143  |  call aword CFUNC:RD->f		// (lua_State *L)
2144  |  mov BASE, L:RB->base
2145  |  // Either throws an error, or recovers and returns -1, 0 or nresults+1.
2146  |  test RDd, RDd; jg ->fff_res	// Returned nresults+1?
2147  |1:
2148  |  mov RA, L:RB->top
2149  |  sub RA, BASE
2150  |  shr RAd, 3				// RA = number of slots between base and top.
2151  |  test RDd, RDd			// Sets the flags for the jne below; lea and mov do not change them.
2152  |  lea NARGS:RDd, [RAd+1]
2153  |  mov LFUNC:RB, [BASE-16]
2154  |  jne ->vm_call_tail			// Returned -1?
2155  |  cleartp LFUNC:RB
2156  |  ins_callt				// Returned 0: retry fast path.
2157  |
2158  |// Reconstruct previous base for vmeta_call during tailcall.
2159  |->vm_call_tail:
2160  |  mov RA, BASE
2161  |  test PCd, FRAME_TYPE
2162  |  jnz >3				// Frame type bits set: PC is presumably a frame link, not a bytecode pointer.
2163  |  movzx RBd, PC_RA
2164  |  neg RB
2165  |  lea BASE, [BASE+RB*8-16]		// base = base - (RB+2)*8
2166  |  jmp ->vm_call_dispatch		// Resolve again for tailcall.
2167  |3:
2168  |  mov RB, PC
2169  |  and RB, -8				// Clear the low three bits; the rest is subtracted from BASE as a byte delta.
2170  |  sub BASE, RB
2171  |  jmp ->vm_call_dispatch		// Resolve again for tailcall.
2172  |
2173  |5:  // Grow stack for fallback handler.
2174  |  mov CARG2d, LUA_MINSTACK
2175  |  mov CARG1, L:RB
2176  |  call extern lj_state_growstack	// (lua_State *L, int n)
2177  |  mov BASE, L:RB->base
2178  |  xor RDd, RDd			// Simulate a return 0.
2179  |  jmp <1				// Dumb retry (goes through ff first).
2180  |
2181  |->fff_gcstep:			// Call GC step function.
2182  |  // BASE = new base, RD = nargs+1
2183  |  pop RB				// Must keep stack at same level.
2184  |  mov TMP1, RB			// Save return address
2185  |  mov L:RB, SAVE_L
2186  |  mov SAVE_PC, PC			// Redundant (but a defined value).
2187  |  mov L:RB->base, BASE
2188  |  lea RD, [BASE+NARGS:RD*8-8]	// RD = BASE + nargs*8, stored as L->top below.
2189  |  mov CARG1, L:RB
2190  |  mov L:RB->top, RD
2191  |  call extern lj_gc_step		// (lua_State *L)
2192  |  mov BASE, L:RB->base		// Reload BASE from L after the call.
2193  |  mov RD, L:RB->top
2194  |  sub RD, BASE
2195  |  shr RDd, 3				// Slots between base and top ...
2196  |  add NARGS:RDd, 1			// ... +1 restores the nargs+1 convention in RD.
2197  |  mov RB, TMP1
2198  |  push RB				// Restore return address.
2199  |  ret
2200  |
2201  |//-----------------------------------------------------------------------
2202  |//-- Special dispatch targets -------------------------------------------
2203  |//-----------------------------------------------------------------------
2204  |
2205  |->vm_record:				// Dispatch target for recording phase. Without JIT this falls through to vm_rethook.
2206  |.if JIT
2207  |  movzx RDd, byte [DISPATCH+DISPATCH_GL(hookmask)]
2208  |  test RDL, HOOK_VMEVENT		// No recording while in vmevent.
2209  |  jnz >5				// Skip the call and re-dispatch at 5.
2210  |  // Decrement the hookcount for consistency, but always do the call.
2211  |  test RDL, HOOK_ACTIVE
2212  |  jnz >1
2213  |  test RDL, LUA_MASKLINE|LUA_MASKCOUNT
2214  |  jz >1
2215  |  dec dword [DISPATCH+DISPATCH_GL(hookcount)]
2216  |  jmp >1				// Label 1 is the shared lj_dispatch_ins call in vm_inshook.
2217  |.endif
2218  |
2219  |->vm_rethook:			// Dispatch target for return hooks.
2220  |  movzx RDd, byte [DISPATCH+DISPATCH_GL(hookmask)]
2221  |  test RDL, HOOK_ACTIVE		// Hook already active?
2222  |  jnz >5
2223  |  jmp >1				// Otherwise always call the dispatcher.
2224  |
2225  |->vm_inshook:			// Dispatch target for instr/line hooks.
2226  |  movzx RDd, byte [DISPATCH+DISPATCH_GL(hookmask)]
2227  |  test RDL, HOOK_ACTIVE		// Hook already active?
2228  |  jnz >5
2229  |
2230  |  test RDL, LUA_MASKLINE|LUA_MASKCOUNT
2231  |  jz >5				// Neither line nor count mask set: just re-dispatch.
2232  |  dec dword [DISPATCH+DISPATCH_GL(hookcount)]
2233  |  jz >1				// Hook count reached zero: call the dispatcher.
2234  |  test RDL, LUA_MASKLINE
2235  |  jz >5				// No line mask either: just re-dispatch.
2236  |1:
2237  |  mov L:RB, SAVE_L
2238  |  mov L:RB->base, BASE
2239  |  mov CARG2, PC			// Caveat: CARG2 == BASE
2240  |  mov CARG1, L:RB
2241  |  // SAVE_PC must hold the _previous_ PC. The callee updates it with PC.
2242  |  call extern lj_dispatch_ins	// (lua_State *L, const BCIns *pc)
2243  |3:					// vm_hotloop re-enters here (jmp <3).
2244  |  mov BASE, L:RB->base
2245  |4:					// cont_hook re-enters here (jmp <4).
2246  |  movzx RAd, PC_RA
2247  |5:					// Decode the remaining operands and jump via the static dispatch table.
2248  |  movzx OP, PC_OP
2249  |  movzx RDd, PC_RD
2250  |  jmp aword [DISPATCH+OP*8+GG_DISP2STATIC]	// Re-dispatch to static ins.
2251  |
2252  |->cont_hook:				// Continue from hook yield.
2253  |  add PC, 4
2254  |  mov RA, [RB-40]			// Value stored 40 bytes below RB; its low 32 bits go into MULTRES.
2255  |  mov MULTRES, RAd			// Restore MULTRES for *M ins.
2256  |  jmp <4
2257  |
2258  |->vm_hotloop:			// Hot loop counter underflow.
2259  |.if JIT
2260  |  mov LFUNC:RB, [BASE-16]		// Same as curr_topL(L).
2261  |  cleartp LFUNC:RB
2262  |  mov RB, LFUNC:RB->pc
2263  |  movzx RDd, byte [RB+PC2PROTO(framesize)]
2264  |  lea RD, [BASE+RD*8]		// RD = BASE + framesize slots, stored as L->top below.
2265  |  mov L:RB, SAVE_L
2266  |  mov L:RB->base, BASE
2267  |  mov L:RB->top, RD
2268  |  mov CARG2, PC
2269  |  lea CARG1, [DISPATCH+GG_DISP2J]
2270  |  mov aword [DISPATCH+DISPATCH_J(L)], L:RB
2271  |  mov SAVE_PC, PC
2272  |  call extern lj_trace_hot		// (jit_State *J, const BCIns *pc)
2273  |  jmp <3				// Reload BASE and re-dispatch through vm_inshook's tail.
2274  |.endif
2275  |
2276  |->vm_callhook:			// Dispatch target for call hooks.
2277  |  mov SAVE_PC, PC
2278  |.if JIT
2279  |  jmp >1				// Skip the hot call marker below.
2280  |.endif
2281  |
2282  |->vm_hotcall:			// Hot call counter underflow.
2283  |.if JIT
2284  |  mov SAVE_PC, PC
2285  |  or PC, 1				// Marker for hot call.
2286  |1:
2287  |.endif
2288  |  lea RD, [BASE+NARGS:RD*8-8]	// RD = BASE + nargs*8, stored as L->top below.
2289  |  mov L:RB, SAVE_L
2290  |  mov L:RB->base, BASE
2291  |  mov L:RB->top, RD
2292  |  mov CARG2, PC
2293  |  mov CARG1, L:RB
2294  |  call extern lj_dispatch_call	// (lua_State *L, const BCIns *pc)
2295  |  // ASMFunction returned in eax/rax (RD).
2296  |  mov SAVE_PC, 0			// Invalidate for subsequent line hook.
2297  |.if JIT
2298  |  and PC, -2				// Clear the hot call marker bit again.
2299  |.endif
2300  |  mov BASE, L:RB->base
2301  |  mov RA, RD				// Park the returned function pointer while RD is recomputed.
2302  |  mov RD, L:RB->top
2303  |  sub RD, BASE
2304  |  mov RB, RA				// RB = returned function pointer (jump target).
2305  |  movzx RAd, PC_RA
2306  |  shr RDd, 3
2307  |  add NARGS:RDd, 1			// RD = (top - base)/8 + 1.
2308  |  jmp RB
2309  |
2310  |->cont_stitch:			// Trace stitching.
2311  |.if JIT
2312  |  // BASE = base, RC = result, RB = mbase
2313  |  mov TRACE:ITYPE, [RB-40]		// Save previous trace.
2314  |  cleartp TRACE:ITYPE
2315  |  mov TMPRd, MULTRES
2316  |  movzx RAd, PC_RA
2317  |  lea RA, [BASE+RA*8]		// Call base.
2318  |  sub TMPRd, 1
2319  |  jz >2				// MULTRES was 1: nothing to move.
2320  |1:  // Move results down.
2321  |  mov RB, [RC]
2322  |  mov [RA], RB
2323  |  add RC, 8
2324  |  add RA, 8
2325  |  sub TMPRd, 1
2326  |  jnz <1
2327  |2:
2328  |  movzx RCd, PC_RA
2329  |  movzx RBd, PC_RB
2330  |  add RC, RB
2331  |  lea RC, [BASE+RC*8-8]		// RC = BASE + (A + B - 1)*8.
2332  |3:
2333  |  cmp RC, RA
2334  |  ja >9				// More results wanted?
2335  |
2336  |  test TRACE:ITYPE, TRACE:ITYPE
2337  |  jz ->cont_nop			// No previous trace.
2338  |  movzx RBd, word TRACE:ITYPE->traceno
2339  |  movzx RDd, word TRACE:ITYPE->link
2340  |  cmp RDd, RBd
2341  |  je ->cont_nop			// Blacklisted.
2342  |  test RDd, RDd
2343  |  jne =>BC_JLOOP			// Jump to stitched trace.
2344  |
2345  |  // Stitch a new trace to the previous trace.
2346  |  mov [DISPATCH+DISPATCH_J(exitno)], RB	// RB still holds the trace number loaded above.
2347  |  mov L:RB, SAVE_L
2348  |  mov L:RB->base, BASE
2349  |  mov CARG2, PC
2350  |  lea CARG1, [DISPATCH+GG_DISP2J]
2351  |  mov aword [DISPATCH+DISPATCH_J(L)], L:RB
2352  |  call extern lj_dispatch_stitch	// (jit_State *J, const BCIns *pc)
2353  |  mov BASE, L:RB->base
2354  |  jmp ->cont_nop
2355  |
2356  |9:  // Fill up results with nil.
2357  |  mov aword [RA], LJ_TNIL
2358  |  add RA, 8
2359  |  jmp <3
2360  |.endif
2361  |
2362  |->vm_profhook:			// Dispatch target for profiler hook. Body is only built with LJ_HASPROFILE.
2363#if LJ_HASPROFILE
2364  |  mov L:RB, SAVE_L
2365  |  mov L:RB->base, BASE
2366  |  mov CARG2, PC			// Caveat: CARG2 == BASE
2367  |  mov CARG1, L:RB
2368  |  call extern lj_dispatch_profile	// (lua_State *L, const BCIns *pc)
2369  |  mov BASE, L:RB->base
2370  |  // HOOK_PROFILE is off again, so re-dispatch to dynamic instruction.
2371  |  sub PC, 4				// Step PC back by one 4 byte instruction.
2372  |  jmp ->cont_nop
2373#endif
2374  |
2375  |//-----------------------------------------------------------------------
2376  |//-- Trace exit handler -------------------------------------------------
2377  |//-----------------------------------------------------------------------
2378  |
2379  |// Called from an exit stub with the exit number on the stack.
2380  |// The 16 bit exit number is stored with two (sign-extended) push imm8.
2381  |->vm_exit_handler:
2382  |.if JIT
2383  |  push r13; push r12
2384  |  push r11; push r10; push r9; push r8
2385  |  push rdi; push rsi; push rbp; lea rbp, [rsp+88]; push rbp	// 88 = 9 pushes + 2 exit number slots: rbp = rsp before those, pushed next.
2386  |  push rbx; push rdx; push rcx; push rax
2387  |  movzx RCd, byte [rbp-8]		// Reconstruct exit number.
2388  |  mov RCH, byte [rbp-16]
2389  |  mov [rbp-8], r15; mov [rbp-16], r14	// Reuse the two exit number slots for r15 and r14.
2390  |  // DISPATCH is preserved on-trace in LJ_GC64 mode.
2391  |  mov RAd, [DISPATCH+DISPATCH_GL(vmstate)]	// Get trace number.
2392  |  set_vmstate EXIT
2393  |  mov [DISPATCH+DISPATCH_J(exitno)], RCd
2394  |  mov [DISPATCH+DISPATCH_J(parent)], RAd
2395  |.if X64WIN
2396  |  sub rsp, 16*8+4*8			// Room for SSE regs + save area.
2397  |.else
2398  |  sub rsp, 16*8			// Room for SSE regs.
2399  |.endif
2400  |  add rbp, -128			// 16 slots of 8 bytes below: rbp now addresses the saved rax.
2401  |  movsd qword [rbp-8],   xmm15; movsd qword [rbp-16],  xmm14
2402  |  movsd qword [rbp-24],  xmm13; movsd qword [rbp-32],  xmm12
2403  |  movsd qword [rbp-40],  xmm11; movsd qword [rbp-48],  xmm10
2404  |  movsd qword [rbp-56],  xmm9;  movsd qword [rbp-64],  xmm8
2405  |  movsd qword [rbp-72],  xmm7;  movsd qword [rbp-80],  xmm6
2406  |  movsd qword [rbp-88],  xmm5;  movsd qword [rbp-96],  xmm4
2407  |  movsd qword [rbp-104], xmm3;  movsd qword [rbp-112], xmm2
2408  |  movsd qword [rbp-120], xmm1;  movsd qword [rbp-128], xmm0
2409  |  // Caveat: RB is rbp.
2410  |  mov L:RB, [DISPATCH+DISPATCH_GL(cur_L)]
2411  |  mov BASE, [DISPATCH+DISPATCH_GL(jit_base)]
2412  |  mov aword [DISPATCH+DISPATCH_J(L)], L:RB
2413  |  mov L:RB->base, BASE
2414  |.if X64WIN
2415  |  lea CARG2, [rsp+4*8]		// Skip the 4 slot save area reserved above.
2416  |.else
2417  |  mov CARG2, rsp			// Saved register block starts at rsp (xmm0 slot first).
2418  |.endif
2419  |  lea CARG1, [DISPATCH+GG_DISP2J]
2420  |  mov qword [DISPATCH+DISPATCH_GL(jit_base)], 0
2421  |  call extern lj_trace_exit		// (jit_State *J, ExitState *ex)
2422  |  // MULTRES or negated error code returned in eax (RD).
2423  |  mov RA, L:RB->cframe
2424  |  and RA, CFRAME_RAWMASK
2425  |  mov [RA+CFRAME_OFS_L], L:RB	// Set SAVE_L (on-trace resume/yield).
2426  |  mov BASE, L:RB->base
2427  |  mov PC, [RA+CFRAME_OFS_PC]	// Get SAVE_PC.
2428  |  jmp >1				// Enter vm_exit_interp after its lea: RA already holds the C frame address.
2429  |.endif
2430  |->vm_exit_interp:
2431  |  // RD = MULTRES or negated error code, BASE, PC and DISPATCH set.
2432  |.if JIT
2433  |  // Restore additional callee-save registers only used in compiled code.
2434  |.if X64WIN
2435  |  lea RA, [rsp+10*16+4*8]
2436  |1:
2437  |  movdqa xmm15, [RA-10*16]
2438  |  movdqa xmm14, [RA-9*16]
2439  |  movdqa xmm13, [RA-8*16]
2440  |  movdqa xmm12, [RA-7*16]
2441  |  movdqa xmm11, [RA-6*16]
2442  |  movdqa xmm10, [RA-5*16]
2443  |  movdqa xmm9, [RA-4*16]
2444  |  movdqa xmm8, [RA-3*16]
2445  |  movdqa xmm7, [RA-2*16]
2446  |  mov rsp, RA			// Reposition stack to C frame.
2447  |  movdqa xmm6, [RA-1*16]
2448  |  mov r15, CSAVE_1
2449  |  mov r14, CSAVE_2
2450  |  mov r13, CSAVE_3
2451  |  mov r12, CSAVE_4
2452  |.else
2453  |  lea RA, [rsp+16]
2454  |1:
2455  |  mov r13, [RA-8]
2456  |  mov r12, [RA]
2457  |  mov rsp, RA			// Reposition stack to C frame.
2458#ifdef LUA_USE_TRACE_LOGS
2459  |  mov CARG1, SAVE_L
2460  |  mov L:CARG1->base, BASE
2461  |  mov RB, RD     // Save RD
2462  |  mov TMP1, PC  // Save PC
2463  |  mov CARG3, PC   // CARG3 == BASE
2464  |  mov CARG2d, dword [DISPATCH+DISPATCH_GL(vmstate)]
2465  |  call extern lj_log_trace_direct_exit@8
2466  |  mov PC, TMP1			// Restore PC, RD and BASE after the logging call.
2467  |  mov RD, RB
2468  |  mov RB, SAVE_L
2469  |  mov BASE, L:RB->base
2470#endif
2471  |.endif
2472  |  test RDd, RDd; js >9		// Check for error from exit.
2473  |  mov L:RB, SAVE_L
2474  |  mov MULTRES, RDd
2475  |  mov LFUNC:KBASE, [BASE-16]
2476  |  cleartp LFUNC:KBASE
2477  |  mov KBASE, LFUNC:KBASE->pc
2478  |  mov KBASE, [KBASE+PC2PROTO(k)]	// KBASE = constants of the function in the current frame.
2479  |  mov L:RB->base, BASE
2480  |  mov qword [DISPATCH+DISPATCH_GL(jit_base)], 0
2481  |  set_vmstate INTERP
2482  |  // Modified copy of ins_next which handles function header dispatch, too.
2483  |  mov RCd, [PC]
2484  |  movzx RAd, RCH
2485  |  movzx OP, RCL
2486  |  add PC, 4
2487  |  shr RCd, 16
2488  |  cmp OP, BC_FUNCF			// Function header?
2489  |  jb >3				// Ordinary instruction: dispatch with RC = operand D.
2490  |  cmp OP, BC_FUNCC+2			// Fast function?
2491  |  jae >4
2492  |2:
2493  |  mov RCd, MULTRES			// RC/RD holds nres+1.
2494  |3:
2495  |  jmp aword [DISPATCH+OP*8]
2496  |
2497  |4:  // Check frame below fast function.
2498  |  mov RC, [BASE-8]
2499  |  test RCd, FRAME_TYPE
2500  |  jnz <2				// Trace stitching continuation?
2501  |  // Otherwise set KBASE for Lua function below fast function.
2502  |  movzx RCd, byte [RC-3]
2503  |  neg RC
2504  |  mov LFUNC:KBASE, [BASE+RC*8-32]
2505  |  cleartp LFUNC:KBASE
2506  |  mov KBASE, LFUNC:KBASE->pc
2507  |  mov KBASE, [KBASE+PC2PROTO(k)]
2508  |  jmp <2
2509  |
2510  |9:  // Rethrow error from the right C frame.
2511  |  mov CARG2d, RDd
2512  |  mov CARG1, L:RB
2513  |  neg CARG2d				// RD held the negated error code: make it positive again.
2514  |  call extern lj_err_trace		// (lua_State *L, int errcode)
2515  |.endif
2516  |
2517  |//-----------------------------------------------------------------------
2518  |//-- Math helper functions ----------------------------------------------
2519  |//-----------------------------------------------------------------------
2520  |
2521  |// FP value rounding. Called by math.floor/math.ceil fast functions
2522  |// and from JIT code. arg/ret is xmm0. xmm0-xmm3 and RD (eax) modified.
2523  |.macro vm_round, name, mode, cond	// mode: 0 = floor, 1 = ceil, 2 = trunc. 'cond' is not referenced in the body.
2524  |->name:
2525  |->name .. _sse:
2526  |  sseconst_abs xmm2, RD
2527  |  sseconst_2p52 xmm3, RD
2528  |  movaps xmm1, xmm0
2529  |  andpd xmm1, xmm2			// |x|
2530  |  ucomisd xmm3, xmm1			// No truncation if 2^52 <= |x|.
2531  |  jbe >1				// Also taken for NaN (unordered compare): x is returned unchanged.
2532  |  andnpd xmm2, xmm0			// Isolate sign bit.
2533  |.if mode == 2		// trunc(x)?
2534  |  movaps xmm0, xmm1
2535  |  addsd xmm1, xmm3			// (|x| + 2^52) - 2^52
2536  |  subsd xmm1, xmm3
2537  |  sseconst_1 xmm3, RD
2538  |  cmpsd xmm0, xmm1, 1		// |x| < result?
2539  |  andpd xmm0, xmm3
2540  |  subsd xmm1, xmm0			// If yes, subtract 1 (the compare mask selects the sseconst_1 value or 0).
2541  |  orpd xmm1, xmm2			// Merge sign bit back in.
2542  |.else
2543  |  addsd xmm1, xmm3			// (|x| + 2^52) - 2^52
2544  |  subsd xmm1, xmm3
2545  |  orpd xmm1, xmm2			// Merge sign bit back in.
2546  |  .if mode == 1		// ceil(x)?
2547  |    sseconst_m1 xmm2, RD		// Must subtract -1 to preserve -0.
2548  |    cmpsd xmm0, xmm1, 6		// x > result?
2549  |  .else			// floor(x)?
2550  |    sseconst_1 xmm2, RD
2551  |    cmpsd xmm0, xmm1, 1		// x < result?
2552  |  .endif
2553  |  andpd xmm0, xmm2
2554  |  subsd xmm1, xmm0			// If yes, subtract +-1.
2555  |.endif
2556  |  movaps xmm0, xmm1
2557  |1:
2558  |  ret
2559  |.endmacro
2560  |
2561  |  vm_round vm_floor, 0, 1
2562  |  vm_round vm_ceil,  1, JIT
2563  |  vm_round vm_trunc, 2, JIT
2564  |
2565  |// FP modulo x%y. Called by BC_MOD* and vm_arith.
2566  |->vm_mod:
2567  |// Args in xmm0/xmm1, return value in xmm0.
2568  |// Caveat: xmm0-xmm5 and RC (eax) modified!
2569  |  movaps xmm5, xmm0			// Keep x for the final subtraction.
2570  |  divsd xmm0, xmm1			// xmm0 = x/y
2571  |  sseconst_abs xmm2, RD
2572  |  sseconst_2p52 xmm3, RD
2573  |  movaps xmm4, xmm0
2574  |  andpd xmm4, xmm2			// |x/y|
2575  |  ucomisd xmm3, xmm4			// No truncation if 2^52 <= |x/y|.
2576  |  jbe >1				// Also taken for NaN (unordered compare).
2577  |  andnpd xmm2, xmm0			// Isolate sign bit.
2578  |  addsd xmm4, xmm3			// (|x/y| + 2^52) - 2^52
2579  |  subsd xmm4, xmm3
2580  |  orpd xmm4, xmm2			// Merge sign bit back in.
2581  |  sseconst_1 xmm2, RD
2582  |  cmpsd xmm0, xmm4, 1		// x/y < result?
2583  |  andpd xmm0, xmm2
2584  |  subsd xmm4, xmm0			// If yes, subtract 1.0.
2585  |  movaps xmm0, xmm5
2586  |  mulsd xmm1, xmm4			// y * floor(x/y)
2587  |  subsd xmm0, xmm1			// x - y * floor(x/y)
2588  |  ret
2589  |1:					// Quotient is used as is (no rounding step).
2590  |  mulsd xmm1, xmm0
2591  |  movaps xmm0, xmm5
2592  |  subsd xmm0, xmm1
2593  |  ret
2594  |
2595  |// Args in xmm0/eax. Ret in xmm0. xmm0-xmm1 and eax modified.
2596  |->vm_powi_sse:
2597  |  cmp eax, 1; jle >6			// i<=1?
2598  |  // Now 1 < (unsigned)i <= 0x80000000.
2599  |1:  // Handle leading zeros.
2600  |  test eax, 1; jnz >2
2601  |  mulsd xmm0, xmm0			// Low bit clear: square x and drop the bit.
2602  |  shr eax, 1
2603  |  jmp <1
2604  |2:
2605  |  shr eax, 1; jz >5			// That was the only set bit: xmm0 is the result.
2606  |  movaps xmm1, xmm0			// xmm1 accumulates the product of the selected powers.
2607  |3:  // Handle trailing bits.
2608  |  mulsd xmm0, xmm0
2609  |  shr eax, 1; jz >4			// No bits left: the bit just shifted out was the last set one.
2610  |  jnc <3				// Shifted-out bit was 0: square again.
2611  |  mulsd xmm1, xmm0
2612  |  jmp <3
2613  |4:
2614  |  mulsd xmm0, xmm1
2615  |5:
2616  |  ret
2617  |6:					// Flags are still those of 'cmp eax, 1'.
2618  |  je <5				// x^1 ==> x
2619  |  jb >7				// x^0 ==> 1
2620  |  neg eax				// Negative exponent: compute x^(-i) and take the reciprocal.
2621  |  call <1
2622  |  sseconst_1 xmm1, RD
2623  |  divsd xmm1, xmm0
2624  |  movaps xmm0, xmm1
2625  |  ret
2626  |7:
2627  |  sseconst_1 xmm0, RD
2628  |  ret
2629  |
2630  |//-----------------------------------------------------------------------
2631  |//-- Miscellaneous functions --------------------------------------------
2632  |//-----------------------------------------------------------------------
2633  |
2634  |// int lj_vm_cpuid(uint32_t f, uint32_t res[4])
2635  |->vm_cpuid:
2636  |  mov eax, CARG1d			// eax = f, the requested cpuid function.
2637  |  .if X64WIN; push rsi; mov rsi, CARG2; .endif
2638  |  push rbx				// cpuid overwrites ebx; it is restored below.
2639  |  xor ecx, ecx			// ecx = 0 as cpuid input.
2640  |  cpuid
2641  |  mov [rsi], eax			// Store eax/ebx/ecx/edx into the four result words at rsi.
2642  |  mov [rsi+4], ebx
2643  |  mov [rsi+8], ecx
2644  |  mov [rsi+12], edx
2645  |  pop rbx
2646  |  .if X64WIN; pop rsi; .endif
2647  |  ret					// eax still holds cpuid's eax output.
2648  |
2649  |.define NEXT_TAB,		TAB:CARG1	// 1st C argument: the table.
2650  |.define NEXT_IDX,		CARG2d		// 2nd C argument: the iteration index.
2651  |.define NEXT_IDXa,		CARG2
2652  |.define NEXT_PTR,		RC		// Returned pointer.
2653  |.define NEXT_PTRd,		RCd
2654  |.define NEXT_TMP,		CARG3
2655  |.define NEXT_ASIZE,		CARG4d
2656  |.macro NEXT_RES_IDXL, op2;	lea edx, [NEXT_IDX+op2]; .endmacro
2657  |.if X64WIN
2658  |.define NEXT_RES_PTR,	[rsp+aword*5]
2659  |.macro NEXT_RES_IDX, op2;	add NEXT_IDX, op2; .endmacro
2660  |.else
2661  |.define NEXT_RES_PTR,	[rsp+aword*1]
2662  |.macro NEXT_RES_IDX, op2;	lea edx, [NEXT_IDX+op2]; .endmacro
2663  |.endif
2664  |
2665  |// TValue *lj_vm_next(GCtab *t, uint32_t idx)
2666  |// Next idx returned in edx.
2667  |->vm_next:
2668  |.if JIT
2669  |  mov NEXT_ASIZE, NEXT_TAB->asize
2670  |1:  // Traverse array part.
2671  |  cmp NEXT_IDX, NEXT_ASIZE;  jae >5
2672  |  mov NEXT_TMP, NEXT_TAB->array
2673  |  mov NEXT_TMP, qword [NEXT_TMP+NEXT_IDX*8]
2674  |  cmp NEXT_TMP, LJ_TNIL;  je >2
2675  |  lea NEXT_PTR, NEXT_RES_PTR		// Result is a two slot buffer on the stack.
2676  |  mov qword [NEXT_PTR], NEXT_TMP	// Slot 0 = array value.
2677  |.if DUALNUM
2678  |  setint NEXT_TMP, NEXT_IDXa
2679  |  mov qword [NEXT_PTR+qword*1], NEXT_TMP	// Slot 1 = index as key.
2680  |.else
2681  |  cvtsi2sd xmm0, NEXT_IDX
2682  |  movsd qword [NEXT_PTR+qword*1], xmm0	// Slot 1 = index as key.
2683  |.endif
2684  |  NEXT_RES_IDX 1			// Next idx = idx + 1.
2685  |  ret
2686  |2:  // Skip holes in array part.
2687  |  add NEXT_IDX, 1
2688  |  jmp <1
2689  |
2690  |5:  // Traverse hash part.
2691  |  sub NEXT_IDX, NEXT_ASIZE		// Make the index relative to the hash part.
2692  |6:
2693  |  cmp NEXT_IDX, NEXT_TAB->hmask; ja >9
2694  |  imul NEXT_PTRd, NEXT_IDX, #NODE
2695  |  add NODE:NEXT_PTR, NEXT_TAB->node	// Return value = address of the node itself.
2696  |  cmp qword NODE:NEXT_PTR->val, LJ_TNIL; je >7
2697  |  NEXT_RES_IDXL NEXT_ASIZE+1		// Next idx = hash index + asize + 1.
2698  |  ret
2699  |7:  // Skip holes in hash part.
2700  |  add NEXT_IDX, 1
2701  |  jmp <6
2702  |
2703  |9:  // End of iteration. Set the key to nil (not the value).
2704  |  NEXT_RES_IDX NEXT_ASIZE
2705  |  lea NEXT_PTR, NEXT_RES_PTR
2706  |  mov qword [NEXT_PTR+qword*1], LJ_TNIL
2707  |  ret
2708  |.endif
2709  |
2710  |//-----------------------------------------------------------------------
2711  |//-- Assertions ---------------------------------------------------------
2712  |//-----------------------------------------------------------------------
2713  |
2714  |->assert_bad_for_arg_type:		// Breakpoint trap; one extra int3 is emitted first when LUA_USE_ASSERT is defined.
2715#ifdef LUA_USE_ASSERT
2716  |  int3
2717#endif
2718  |  int3
2719  |
2720  |//-----------------------------------------------------------------------
2721  |//-- FFI helper functions -----------------------------------------------
2722  |//-----------------------------------------------------------------------
2723  |
2724  |// Handler for callback functions. Callback slot number in ah/al.
2725  |->vm_ffi_callback:
2726  |.if FFI
2727  |.type CTSTATE, CTState, PC
2728  |  saveregs_	// ebp/rbp already saved. rbp now holds global_State *.
2729  |  lea DISPATCH, [rbp+GG_G2DISP]	// Use the full 64 bit rbp: a 32 bit base register would truncate the pointer.
2730  |  mov CTSTATE, GL:rbp->ctype_state
2731  |  movzx eax, ax			// Callback slot number arrives in ah/al.
2732  |  mov CTSTATE->cb.slot, eax
2733  |  mov CTSTATE->cb.gpr[0], CARG1	// Save the incoming C argument registers for the callback converter.
2734  |  mov CTSTATE->cb.gpr[1], CARG2
2735  |  mov CTSTATE->cb.gpr[2], CARG3
2736  |  mov CTSTATE->cb.gpr[3], CARG4
2737  |  movsd qword CTSTATE->cb.fpr[0], xmm0
2738  |  movsd qword CTSTATE->cb.fpr[1], xmm1
2739  |  movsd qword CTSTATE->cb.fpr[2], xmm2
2740  |  movsd qword CTSTATE->cb.fpr[3], xmm3
2741  |.if X64WIN
2742  |  lea rax, [rsp+CFRAME_SIZE+4*8]	// Windows: additionally skip 4 slots (32 bytes) above the C frame.
2743  |.else
2744  |  lea rax, [rsp+CFRAME_SIZE]
2745  |  mov CTSTATE->cb.gpr[4], CARG5	// POSIX passes two more integer and four more FP arguments in registers.
2746  |  mov CTSTATE->cb.gpr[5], CARG6
2747  |  movsd qword CTSTATE->cb.fpr[4], xmm4
2748  |  movsd qword CTSTATE->cb.fpr[5], xmm5
2749  |  movsd qword CTSTATE->cb.fpr[6], xmm6
2750  |  movsd qword CTSTATE->cb.fpr[7], xmm7
2751  |.endif
2752  |  mov CTSTATE->cb.stack, rax		// Address just above the C frame, where the caller's stack starts.
2753  |  mov CARG2, rsp
2754  |  mov SAVE_PC, CTSTATE		// Any value outside of bytecode is ok.
2755  |  mov CARG1, CTSTATE
2756  |  call extern lj_ccallback_enter	// (CTState *cts, void *cf)
2757  |  // lua_State * returned in eax (RD).
2758  |  set_vmstate INTERP
2759  |  mov BASE, L:RD->base
2760  |  mov RD, L:RD->top
2761  |  sub RD, BASE
2762  |  mov LFUNC:RB, [BASE-16]
2763  |  cleartp LFUNC:RB
2764  |  shr RD, 3
2765  |  add RD, 1				// RD = (top - base)/8 + 1.
2766  |  ins_callt
2767  |.endif
2768  |
2769  |->cont_ffi_callback:			// Return from FFI callback.
2770  |.if FFI
2771  |  mov L:RA, SAVE_L
2772  |  mov CTSTATE, [DISPATCH+DISPATCH_GL(ctype_state)]
2773  |  mov aword CTSTATE->L, L:RA
2774  |  mov L:RA->base, BASE
2775  |  mov L:RA->top, RB
2776  |  mov CARG1, CTSTATE
2777  |  mov CARG2, RC
2778  |  call extern lj_ccallback_leave	// (CTState *cts, TValue *o)
2779  |  mov rax, CTSTATE->cb.gpr[0]	// Load the C return registers from the callback state.
2780  |  movsd xmm0, qword CTSTATE->cb.fpr[0]
2781  |  jmp ->vm_leave_unw
2782  |.endif
2783  |
2784  |->vm_ffi_call:			// Call C function via FFI.
2785  |  // Caveat: needs special frame unwinding, see below.
2786  |.if FFI
2787  |  .type CCSTATE, CCallState, rbx
2788  |  push rbp; mov rbp, rsp; push rbx; mov CCSTATE, CARG1	// Frame pointer setup; rbx is saved at [rbp-8].
2789  |
2790  |  // Readjust stack.
2791  |  mov eax, CCSTATE->spadj
2792  |  sub rsp, rax
2793  |
2794  |  // Copy stack slots.
2795  |  movzx ecx, byte CCSTATE->nsp
2796  |  sub ecx, 1
2797  |  js >2				// nsp == 0: nothing to copy.
2798  |1:
2799  |  mov rax, [CCSTATE+rcx*8+offsetof(CCallState, stack)]
2800  |  mov [rsp+rcx*8+CCALL_SPS_EXTRA*8], rax
2801  |  sub ecx, 1
2802  |  jns <1				// Copies from the last slot down to slot 0.
2803  |2:
2804  |
2805  |  movzx eax, byte CCSTATE->nfpr
2806  |  mov CARG1, CCSTATE->gpr[0]
2807  |  mov CARG2, CCSTATE->gpr[1]
2808  |  mov CARG3, CCSTATE->gpr[2]
2809  |  mov CARG4, CCSTATE->gpr[3]
2810  |.if not X64WIN
2811  |  mov CARG5, CCSTATE->gpr[4]
2812  |  mov CARG6, CCSTATE->gpr[5]
2813  |.endif
2814  |  test eax, eax; jz >5		// No FP register arguments: skip the loads.
2815  |  movaps xmm0, CCSTATE->fpr[0]
2816  |  movaps xmm1, CCSTATE->fpr[1]
2817  |  movaps xmm2, CCSTATE->fpr[2]
2818  |  movaps xmm3, CCSTATE->fpr[3]
2819  |.if not X64WIN
2820  |  cmp eax, 4; jbe >5			// At most four FP arguments: skip xmm4-xmm7.
2821  |  movaps xmm4, CCSTATE->fpr[4]
2822  |  movaps xmm5, CCSTATE->fpr[5]
2823  |  movaps xmm6, CCSTATE->fpr[6]
2824  |  movaps xmm7, CCSTATE->fpr[7]
2825  |.endif
2826  |5:					// eax still holds nfpr at the call.
2827  |
2828  |  call aword CCSTATE->func
2829  |
2830  |  mov CCSTATE->gpr[0], rax		// Store the return registers back into the call state.
2831  |  movaps CCSTATE->fpr[0], xmm0
2832  |.if not X64WIN
2833  |  mov CCSTATE->gpr[1], rdx
2834  |  movaps CCSTATE->fpr[1], xmm1
2835  |.endif
2836  |
2837  |  mov rbx, [rbp-8]; leave; ret
2838  |.endif
2839  |// Note: vm_ffi_call must be the last function in this object file!
2840  |
2841  |//-----------------------------------------------------------------------
2842}
2843
2844/* Generate the code for a single instruction. */
2845static void build_ins(BuildCtx *ctx, BCOp op, int defop)
2846{
2847  int vk = 0;
2848  |// Note: aligning all instructions does not pay off.
2849  |=>defop:
2850
2851  switch (op) {
2852
2853  /* -- Comparison ops ---------------------------------------------------- */
2854
2855  /* Remember: all ops branch for a true comparison, fall through otherwise. */
2856
2857  |.macro jmp_comp, lt, ge, le, gt, target	// Emits exactly one of the four given jump instructions, chosen by 'op' at build time.
2858  ||switch (op) {
2859  ||case BC_ISLT:
2860  |   lt target
2861  ||break;
2862  ||case BC_ISGE:
2863  |   ge target
2864  ||break;
2865  ||case BC_ISLE:
2866  |   le target
2867  ||break;
2868  ||case BC_ISGT:
2869  |   gt target
2870  ||break;
2871  ||default: break;  /* Shut up GCC. */
2872  ||}
2873  |.endmacro
2874
2875  case BC_ISLT: case BC_ISGE: case BC_ISLE: case BC_ISGT:
2876    |  // RA = src1, RD = src2, JMP with RD = target
2877    |  ins_AD
2878    |  mov ITYPE, [BASE+RA*8]
2879    |  mov RB, [BASE+RD*8]
2880    |  mov RA, ITYPE			// RA/RD keep the raw values ...
2881    |  mov RD, RB
2882    |  sar ITYPE, 47			// ... while ITYPE/RB are reduced to the type tags.
2883    |  sar RB, 47
2884    |.if DUALNUM
2885    |  cmp ITYPEd, LJ_TISNUM; jne >7
2886    |  cmp RBd, LJ_TISNUM; jne >8
2887    |  add PC, 4
2888    |  cmp RAd, RDd			// Both integers: signed 32 bit compare.
2889    |  jmp_comp jge, jl, jg, jle, >9	// Inverted conditions: skip the branch when the comparison is false.
2890    |6:
2891    |  movzx RDd, PC_RD
2892    |  branchPC RD
2893    |9:
2894    |  ins_next
2895    |
2896    |7:  // RA is not an integer.
2897    |  ja ->vmeta_comp
2898    |  // RA is a number.
2899    |  cmp RBd, LJ_TISNUM; jb >1; jne ->vmeta_comp
2900    |  // RA is a number, RD is an integer.
2901    |  cvtsi2sd xmm0, RDd
2902    |  jmp >2
2903    |
2904    |8:  // RA is an integer, RD is not an integer.
2905    |  ja ->vmeta_comp
2906    |  // RA is an integer, RD is a number.
2907    |  cvtsi2sd xmm1, RAd
2908    |  movd xmm0, RD
2909    |  jmp >3
2910    |.else
2911    |  cmp ITYPEd, LJ_TISNUM; jae ->vmeta_comp
2912    |  cmp RBd, LJ_TISNUM; jae ->vmeta_comp
2913    |.endif
2914    |1:
2915    |  movd xmm0, RD			// xmm0 = src2.
2916    |2:
2917    |  movd xmm1, RA			// xmm1 = src1.
2918    |3:
2919    |  add PC, 4
2920    |  ucomisd xmm0, xmm1			// Compares src2 with src1, i.e. operands swapped vs. the integer path.
2921    |  // Unordered: all of ZF CF PF set, ordered: PF clear.
2922    |  // To preserve NaN semantics GE/GT branch on unordered, but LT/LE don't.
2923    |.if DUALNUM
2924    |  jmp_comp jbe, ja, jb, jae, <9
2925    |  jmp <6				// Comparison is true: take the branch code at 6.
2926    |.else
2927    |  jmp_comp jbe, ja, jb, jae, >1
2928    |  movzx RDd, PC_RD
2929    |  branchPC RD
2930    |1:
2931    |  ins_next
2932    |.endif
2933    break;
2934
2935  case BC_ISEQV: case BC_ISNEV:
2936    vk = op == BC_ISEQV;  /* vk: 1 for the "equal" opcode, 0 for "not equal". */
2937    |  ins_AD	// RA = src1, RD = src2, JMP with RD = target
2938    |  mov RB, [BASE+RD*8]
2939    |  mov ITYPE, [BASE+RA*8]
2940    |  add PC, 4
2941    |  mov RD, RB				// RD/RA keep the raw values ...
2942    |  mov RA, ITYPE
2943    |  sar RB, 47				// ... while RB/ITYPE are reduced to the type tags.
2944    |  sar ITYPE, 47
2945    |.if DUALNUM
2946    |  cmp RBd, LJ_TISNUM; jne >7
2947    |  cmp ITYPEd, LJ_TISNUM; jne >8
2948    |  cmp RDd, RAd			// Both integers: compare the 32 bit payloads.
2949    if (vk) {
2950      |  jne >9
2951    } else {
2952      |  je >9
2953    }
2954    |  movzx RDd, PC_RD
2955    |  branchPC RD
2956    |9:
2957    |  ins_next
2958    |
2959    |7:  // RD is not an integer.
2960    |  ja >5
2961    |  // RD is a number.
2962    |  movd xmm1, RD
2963    |  cmp ITYPEd, LJ_TISNUM; jb >1; jne >5
2964    |  // RD is a number, RA is an integer.
2965    |  cvtsi2sd xmm0, RAd
2966    |  jmp >2
2967    |
2968    |8:  // RD is an integer, RA is not an integer.
2969    |  ja >5
2970    |  // RD is an integer, RA is a number.
2971    |  cvtsi2sd xmm1, RDd
2972    |  jmp >1
2973    |
2974    |.else
2975    |  cmp RBd, LJ_TISNUM; jae >5
2976    |  cmp ITYPEd, LJ_TISNUM; jae >5
2977    |  movd xmm1, RD
2978    |.endif
2979    |1:
2980    |  movd xmm0, RA
2981    |2:
2982    |  ucomisd xmm0, xmm1
2983    |4:
2984  iseqne_fp:  /* Shared tail, also reached by goto from BC_ISEQN/BC_ISNEN: branch on FP compare flags. */
2985    if (vk) {
2986      |  jp >2				// Unordered means not equal.
2987      |  jne >2
2988    } else {
2989      |  jp >2				// Unordered means not equal.
2990      |  je >1
2991    }
2992  iseqne_end:  /* Shared tail, also reached by goto from BC_ISEQS/BC_ISNES: emit labels 1/2/3 and the branch. */
2993    if (vk) {
2994      |1:				// EQ: Branch to the target.
2995      |  movzx RDd, PC_RD
2996      |  branchPC RD
2997      |2:				// NE: Fallthrough to next instruction.
2998      |.if not FFI
2999      |3:
3000      |.endif
3001    } else {
3002      |.if not FFI
3003      |3:
3004      |.endif
3005      |2:				// NE: Branch to the target.
3006      |  movzx RDd, PC_RD
3007      |  branchPC RD
3008      |1:				// EQ: Fallthrough to next instruction.
3009    }
3010    if (LJ_DUALNUM && (op == BC_ISEQV || op == BC_ISNEV ||
3011		       op == BC_ISEQN || op == BC_ISNEN)) {
3012      |  jmp <9				// These ops already emitted ins_next at label 9.
3013    } else {
3014      |  ins_next
3015    }
3016    |
3017    if (op == BC_ISEQV || op == BC_ISNEV) {
3018      |5:  // Either or both types are not numbers.
3019      |.if FFI
3020      |  cmp RBd, LJ_TCDATA; je ->vmeta_equal_cd
3021      |  cmp ITYPEd, LJ_TCDATA; je ->vmeta_equal_cd
3022      |.endif
3023      |  cmp RA, RD
3024      |  je <1				// Same GCobjs or pvalues?
3025      |  cmp RBd, ITYPEd
3026      |  jne <2				// Not the same type?
3027      |  cmp RBd, LJ_TISTABUD
3028      |  ja <2				// Different objects and not table/ud?
3029      |
3030      |  // Different tables or userdatas. Need to check __eq metamethod.
3031      |  // Field metatable must be at same offset for GCtab and GCudata!
3032      |  cleartp TAB:RA
3033      |  mov TAB:RB, TAB:RA->metatable
3034      |  test TAB:RB, TAB:RB
3035      |  jz <2				// No metatable?
3036      |  test byte TAB:RB->nomm, 1<<MM_eq
3037      |  jnz <2				// Or 'no __eq' flag set?
3038      if (vk) {
3039	|  xor RBd, RBd			// ne = 0
3040      } else {
3041	|  mov RBd, 1			// ne = 1
3042      }
3043      |  jmp ->vmeta_equal		// Handle __eq metamethod.
3044    } else {
3045      |.if FFI
3046      |3:				// Type check failure target of the S/N variants when FFI is enabled.
3047      |  cmp ITYPEd, LJ_TCDATA
3048      if (LJ_DUALNUM && vk) {
3049	|  jne <9
3050      } else {
3051	|  jne <2
3052      }
3053      |  jmp ->vmeta_equal_cd
3054      |.endif
3055    }
3056    break;
3057  case BC_ISEQS: case BC_ISNES:
3058    vk = op == BC_ISEQS;  /* vk: 1 for the "equal" opcode, 0 for "not equal". */
3059    |  ins_AND	// RA = src, RD = str const, JMP with RD = target
3060    |  mov RB, [BASE+RA*8]
3061    |  add PC, 4
3062    |  checkstr RB, >3			// Not a string: label 3 is emitted by the shared tail.
3063    |  cmp RB, [KBASE+RD*8]		// Compares the slot value with the constant slot as a whole.
3064  iseqne_test:  /* Also reached by goto from BC_ISEQP/BC_ISNEP when FFI is disabled. */
3065    if (vk) {
3066      |  jne >2
3067    } else {
3068      |  je >1
3069    }
3070    goto iseqne_end;
3071  case BC_ISEQN: case BC_ISNEN:
3072    vk = op == BC_ISEQN;  /* vk: 1 for the "equal" opcode, 0 for "not equal". */
3073    |  ins_AD	// RA = src, RD = num const, JMP with RD = target
3074    |  mov RB, [BASE+RA*8]
3075    |  add PC, 4
3076    |.if DUALNUM
3077    |  checkint RB, >7
3078    |  mov RD, [KBASE+RD*8]
3079    |  checkint RD, >8
3080    |  cmp RBd, RDd			// Both integers: compare the 32 bit payloads.
3081    if (vk) {
3082      |  jne >9
3083    } else {
3084      |  je >9
3085    }
3086    |  movzx RDd, PC_RD
3087    |  branchPC RD
3088    |9:
3089    |  ins_next
3090    |
3091    |7:  // RA is not an integer.
3092    |  ja >3
3093    |  // RA is a number.
3094    |  mov RD, [KBASE+RD*8]
3095    |  checkint RD, >1
3096    |  // RA is a number, RD is an integer.
3097    |  cvtsi2sd xmm0, RDd
3098    |  jmp >2
3099    |
3100    |8:  // RA is an integer, RD is a number.
3101    |  cvtsi2sd xmm0, RBd
3102    |  movd xmm1, RD
3103    |  ucomisd xmm0, xmm1
3104    |  jmp >4
3105    |1:
3106    |  movd xmm0, RD
3107    |.else
3108    |  checknum RB, >3
3109    |1:
3110    |  movsd xmm0, qword [KBASE+RD*8]
3111    |.endif
3112    |2:
3113    |  ucomisd xmm0, qword [BASE+RA*8]	// Compare the constant (xmm0) with the slot in memory.
3114    |4:
3115    goto iseqne_fp;  /* Continue with the shared FP branch code emitted in the BC_ISEQV case. */
3116  case BC_ISEQP: case BC_ISNEP:  /* Compare the type tag of slot RA with a primitive type and branch. */
3117    vk = op == BC_ISEQP;  /* Non-zero selects the "equal" flavour. */
3118    |  ins_AND	// RA = src, RD = primitive type (~), JMP with RD = target
3119    |  mov RB, [BASE+RA*8]
3120    |  sar RB, 47	// Reduce the value to its (sign-extended) type tag.
3121    |  add PC, 4
3122    |  cmp RBd, RDd	// Tag against the operand (assumes ins_AND already inverted RD -- TODO confirm).
3123    if (!LJ_HASFFI) goto iseqne_test;  /* Without FFI the generic branch code from ISEQS is reused. */
3124    if (vk) {
3125      |  jne >3	// Different tag: check for cdata before falling through.
3126      |  movzx RDd, PC_RD	// Equal: take the branch.
3127      |  branchPC RD
3128      |2:
3129      |  ins_next
3130      |3:
3131      |  cmp RBd, LJ_TCDATA; jne <2	// Not cdata: plain "not equal", continue with next instruction.
3132      |  jmp ->vmeta_equal_cd	// cdata operand is handed to the out-of-line handler.
3133    } else {
3134      |  je >2	// Equal tag: ISNEP does not branch.
3135      |  cmp RBd, LJ_TCDATA; je ->vmeta_equal_cd	// cdata operand is handed to the out-of-line handler.
3136      |  movzx RDd, PC_RD	// Different: take the branch.
3137      |  branchPC RD
3138      |2:
3139      |  ins_next
3140    }
3141    break;
3142
3143  /* -- Unary test and copy ops ------------------------------------------- */
3144
3145  case BC_ISTC: case BC_ISFC: case BC_IST: case BC_ISF:  /* Truthiness test of slot RD, optional copy to RA, then branch. */
3146    |  ins_AD	// RA = dst or unused, RD = src, JMP with RD = target
3147    |  mov ITYPE, [BASE+RD*8]
3148    |  add PC, 4
3149    if (op == BC_ISTC || op == BC_ISFC) {
3150      |  mov RB, ITYPE	// Keep the full value for the copy; ITYPE is shifted below.
3151    }
3152    |  sar ITYPE, 47	// Reduce the value to its type tag.
3153    |  cmp ITYPEd, LJ_TISTRUECOND
3154    if (op == BC_IST || op == BC_ISTC) {
3155      |  jae >1	// IST/ISTC: tag >= LJ_TISTRUECOND (unsigned) skips copy and branch.
3156    } else {
3157      |  jb >1	// ISF/ISFC: tag < LJ_TISTRUECOND (unsigned) skips copy and branch.
3158    }
3159    if (op == BC_ISTC || op == BC_ISFC) {
3160      |  mov [BASE+RA*8], RB	// The copy happens only on the branching path.
3161    }
3162    |  movzx RDd, PC_RD
3163    |  branchPC RD
3164    |1:					// Fallthrough to the next instruction.
3165    |  ins_next
3166    break;
3167
3168  case BC_ISTYPE:  /* Type assertion: continue if slot RA has the expected tag, else go to vmeta_istype. */
3169    |  ins_AD	// RA = src, RD = -type
3170    |  mov RB, [BASE+RA*8]
3171    |  sar RB, 47	// Reduce the value to its type tag.
3172    |  add RBd, RDd	// tag + (-type) is zero exactly when the tags match.
3173    |  jne ->vmeta_istype
3174    |  ins_next
3175    break;
3176  case BC_ISNUM:  /* Number assertion: continue if slot RA passes checknumtp, else go to vmeta_istype. */
3177    |  ins_AD	// RA = src, RD = -(TISNUM-1)
3178    |  checknumtp [BASE+RA*8], ->vmeta_istype	// RD is not used here; presumably consumed by the handler.
3179    |  ins_next
3180    break;
3181
3182  /* -- Unary ops --------------------------------------------------------- */
3183
3184  case BC_MOV:  /* Copy one stack slot to another as an opaque 64-bit value. */
3185    |  ins_AD	// RA = dst, RD = src
3186    |  mov RB, [BASE+RD*8]
3187    |  mov [BASE+RA*8], RB
3188    |  ins_next_	// Note: the trailing-underscore variant of the dispatch macro (defined elsewhere).
3189    break;
3190  case BC_NOT:  /* Logical not: store a boolean primitive derived from the truthiness of slot RD. */
3191    |  ins_AD	// RA = dst, RD = src
3192    |  mov RB, [BASE+RD*8]
3193    |  sar RB, 47	// Reduce the value to its type tag.
3194    |  mov RCd, 2	// Overwrites RD (RC == RD); the source index is no longer needed.
3195    |  cmp RB, LJ_TISTRUECOND	// Carry is set when the tag is below LJ_TISTRUECOND (unsigned).
3196    |  sbb RCd, 0	// RC = 2 - carry, i.e. 1 or 2 (presumably the false/true type numbers).
3197    |  shl RC, 47	// Build the tagged primitive the same way BC_KPRI does:
3198    |  not RC	// value = ~(type << 47).
3199    |  mov [BASE+RA*8], RC
3200    |  ins_next
3201    break;
3202  case BC_UNM:  /* Unary minus: negate an integer (DUALNUM) or flip the sign bit of a number. */
3203    |  ins_AD	// RA = dst, RD = src
3204    |  mov RB, [BASE+RD*8]
3205    |.if DUALNUM
3206    |  checkint RB, >5	// Not an integer: number or metamethod path at label 5.
3207    |  neg RBd
3208    |  jo >4	// Signed overflow of the 32-bit negation: result needs a double.
3209    |  setint RB	// Re-tag the 32-bit result as an integer value.
3210    |9:
3211    |  mov [BASE+RA*8], RB
3212    |  ins_next
3213    |4:
3214    |  mov64 RB, U64x(41e00000,00000000)  // 2^31.
3215    |  jmp <9
3216    |5:
3217    |  ja ->vmeta_unm	// Uses the flags left by checkint (assumed): not a number at all.
3218    |.else
3219    |  checknum RB, ->vmeta_unm
3220    |.endif
3221    |  mov64 RD, U64x(80000000,00000000)	// Mask with only bit 63 (the IEEE sign bit) set.
3222    |  xor RB, RD	// Flip the sign without touching the rest of the value.
3223    |.if DUALNUM
3224    |  jmp <9	// Share the store + dispatch with the integer path.
3225    |.else
3226    |  mov [BASE+RA*8], RB
3227    |  ins_next
3228    |.endif
3229    break;
3230  case BC_LEN:  /* Length operator: inline for strings, lj_tab_len() for tables, vmeta_len otherwise. */
3231    |  ins_AD	// RA = dst, RD = src
3232    |  mov RD, [BASE+RD*8]
3233    |  checkstr RD, >2	// Not a string: label 2 (assumes the tag is left in ITYPE and RD is untagged).
3234    |.if DUALNUM
3235    |  mov RDd, dword STR:RD->len
3236    |1:
3237    |  setint RD	// Also reached from the table path with the length in RD.
3238    |  mov [BASE+RA*8], RD
3239    |.else
3240    |  xorps xmm0, xmm0	// Clear xmm0 before the partial-register write by cvtsi2sd.
3241    |  cvtsi2sd xmm0, dword STR:RD->len
3242    |1:
3243    |  movsd qword [BASE+RA*8], xmm0	// Also reached from the table path with the length in xmm0.
3244    |.endif
3245    |  ins_next
3246    |2:
3247    |  cmp ITYPEd, LJ_TTAB; jne ->vmeta_len	// Neither string nor table: metamethod handler.
3248    |  mov TAB:CARG1, TAB:RD
3249#if LJ_52
3250    |  mov TAB:RB, TAB:RD->metatable
3251    |  cmp TAB:RB, 0
3252    |  jnz >9	// Table has a metatable: look for __len at label 9.
3253    |3:
3254#endif
3255    |->BC_LEN_Z:
3256    |  mov RB, BASE			// Save BASE.
3257    |  call extern lj_tab_len		// (GCtab *t)
3258    |  // Length of table returned in eax (RD).
3259    |.if DUALNUM
3260    |  // Nothing to do.
3261    |.else
3262    |  cvtsi2sd xmm0, RDd
3263    |.endif
3264    |  mov BASE, RB			// Restore BASE.
3265    |  movzx RAd, PC_RA	// RA (rcx) is not preserved across the C call: re-read it.
3266    |  jmp <1
3267#if LJ_52
3268    |9:  // Check for __len.
3269    |  test byte TAB:RB->nomm, 1<<MM_len
3270    |  jnz <3	// 'no __len' flag set: plain table length.
3271    |  jmp ->vmeta_len			// 'no __len' flag NOT set: check.
3272#endif
3273    break;
3274
3275  /* -- Binary ops -------------------------------------------------------- */
3276
3277    |.macro ins_arithpre, sseins, ssereg	// Type-check both operands, load the first into xmm0, apply sseins to ssereg and the second.
3278    |  ins_ABC
3279    ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN);  /* Operand form: 0 = VN, 1 = NV, otherwise VV. */
3280    ||switch (vk) {
3281    ||case 0:
3282    |   checknumtp [BASE+RB*8], ->vmeta_arith_vn	// RB = stack slot, RC = constant.
3283    |   .if DUALNUM
3284    |     checknumtp [KBASE+RC*8], ->vmeta_arith_vn	// The constant is only checked in DUALNUM builds.
3285    |   .endif
3286    |   movsd xmm0, qword [BASE+RB*8]
3287    |   sseins ssereg, qword [KBASE+RC*8]
3288    ||  break;
3289    ||case 1:
3290    |   checknumtp [BASE+RB*8], ->vmeta_arith_nv	// Same checks, operands used in swapped order below.
3291    |   .if DUALNUM
3292    |     checknumtp [KBASE+RC*8], ->vmeta_arith_nv
3293    |   .endif
3294    |   movsd xmm0, qword [KBASE+RC*8]	// Constant is the left operand in the NV form.
3295    |   sseins ssereg, qword [BASE+RB*8]
3296    ||  break;
3297    ||default:
3298    |   checknumtp [BASE+RB*8], ->vmeta_arith_vv	// Both operands are stack slots.
3299    |   checknumtp [BASE+RC*8], ->vmeta_arith_vv
3300    |   movsd xmm0, qword [BASE+RB*8]
3301    |   sseins ssereg, qword [BASE+RC*8]
3302    ||  break;
3303    ||}
3304    |.endmacro
3305    |
3306    |.macro ins_arithdn, intins	// Integer arithmetic for DUALNUM: any non-integer operand or overflow leaves via vmeta_arith_*o.
3307    |  ins_ABC
3308    ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN);  /* Operand form: 0 = VN, 1 = NV, otherwise VV. */
3309    ||switch (vk) {
3310    ||case 0:
3311    |   mov RB, [BASE+RB*8]
3312    |   mov RC, [KBASE+RC*8]
3313    |   checkint RB, ->vmeta_arith_vno
3314    |   checkint RC, ->vmeta_arith_vno
3315    |   intins RBd, RCd; jo ->vmeta_arith_vno	// Result in RB; signed overflow goes to the fallback.
3316    ||  break;
3317    ||case 1:
3318    |   mov RB, [BASE+RB*8]
3319    |   mov RC, [KBASE+RC*8]
3320    |   checkint RB, ->vmeta_arith_nvo
3321    |   checkint RC, ->vmeta_arith_nvo
3322    |   intins RCd, RBd; jo ->vmeta_arith_nvo	// Constant is the left operand: result in RC.
3323    ||  break;
3324    ||default:
3325    |   mov RB, [BASE+RB*8]
3326    |   mov RC, [BASE+RC*8]
3327    |   checkint RB, ->vmeta_arith_vvo
3328    |   checkint RC, ->vmeta_arith_vvo
3329    |   intins RBd, RCd; jo ->vmeta_arith_vvo	// Result in RB.
3330    ||  break;
3331    ||}
3332    ||if (vk == 1) {  /* Store from whichever register received the result above. */
3333    |   setint RC
3334    |   mov [BASE+RA*8], RC
3335    ||} else {
3336    |   setint RB
3337    |   mov [BASE+RA*8], RB
3338    ||}
3339    |  ins_next
3340    |.endmacro
3341    |
3342    |.macro ins_arithpost	// Store the FP result held in xmm0 into the destination slot RA.
3343    |  movsd qword [BASE+RA*8], xmm0
3344    |.endmacro
3345    |
3346    |.macro ins_arith, sseins	// One-argument form: always the FP path (check, compute in xmm0, store, dispatch).
3347    |  ins_arithpre sseins, xmm0
3348    |  ins_arithpost
3349    |  ins_next
3350    |.endmacro
3351    |
3352    |.macro ins_arith, intins, sseins	// Two-argument form: integer path in DUALNUM builds, FP path otherwise.
3353    |.if DUALNUM
3354    |  ins_arithdn intins
3355    |.else
3356    |  ins_arith, sseins
3357    |.endif
3358    |.endmacro
3359
3360    |  // RA = dst, RB = src1 or num const, RC = src2 or num const
3361  case BC_ADDVN: case BC_ADDNV: case BC_ADDVV:  /* Addition; the VN/NV/VV form is selected inside the macro from op. */
3362    |  ins_arith add, addsd
3363    break;
3364  case BC_SUBVN: case BC_SUBNV: case BC_SUBVV:  /* Subtraction. */
3365    |  ins_arith sub, subsd
3366    break;
3367  case BC_MULVN: case BC_MULNV: case BC_MULVV:  /* Multiplication. */
3368    |  ins_arith imul, mulsd
3369    break;
3370  case BC_DIVVN: case BC_DIVNV: case BC_DIVVV:  /* Division: one-argument macro form, so always done in FP. */
3371    |  ins_arith divsd
3372    break;
3373  case BC_MODVN:  /* Modulo (slot, constant): operands in xmm0/xmm1, computed by the vm_mod helper. */
3374    |  ins_arithpre movsd, xmm1	// With sseins = movsd this just loads the second operand into xmm1.
3375    |->BC_MODVN_Z:	// Shared tail: BC_MODNV/BC_MODVV jump here after loading their operands.
3376    |  call ->vm_mod
3377    |  ins_arithpost	// Stores xmm0, so vm_mod is expected to return its result there.
3378    |  ins_next
3379    break;
3380  case BC_MODNV: case BC_MODVV:  /* Modulo, remaining operand forms: load operands, then reuse the BC_MODVN tail. */
3381    |  ins_arithpre movsd, xmm1	// xmm0 = first operand, xmm1 = second operand.
3382    |  jmp ->BC_MODVN_Z			// Avoid 3 copies. It's slow anyway.
3383    break;
3384  case BC_POW:  /* Exponentiation via the C library pow(); operands are passed in xmm0/xmm1. */
3385    |  ins_arithpre movsd, xmm1	// xmm0 = first operand, xmm1 = second operand.
3386    |  mov RB, BASE	// BASE (rdx) is not C callee-save; RB (rbp) is.
3387    |  call extern pow
3388    |  movzx RAd, PC_RA	// RA (rcx) is not preserved across the C call: re-read it.
3389    |  mov BASE, RB
3390    |  ins_arithpost
3391    |  ins_next
3392    break;
3393
3394  case BC_CAT:  /* Concatenate slots RB..RC through lj_meta_cat() and copy the result to slot RA. */
3395    |  ins_ABC	// RA = dst, RB = src_start, RC = src_end
3396    |  mov L:CARG1, SAVE_L
3397    |  mov L:CARG1->base, BASE	// Publish BASE to the lua_State before calling into C.
3398    |  lea CARG2, [BASE+RC*8]	// top = address of the last source slot.
3399    |  mov CARG3d, RCd
3400    |  sub CARG3d, RBd	// left = src_end - src_start.
3401    |->BC_CAT_Z:	// Global entry point; expects CARG1..CARG3 to be set up as above.
3402    |  mov L:RB, L:CARG1	// Keep L in a callee-save register across the call.
3403    |  mov SAVE_PC, PC
3404    |  call extern lj_meta_cat		// (lua_State *L, TValue *top, int left)
3405    |  // NULL (finished) or TValue * (metamethod) returned in eax (RC).
3406    |  mov BASE, L:RB->base	// Reload BASE from the lua_State after the call.
3407    |  test RC, RC
3408    |  jnz ->vmeta_binop	// Non-NULL result: continue in the metamethod call path.
3409    |  movzx RBd, PC_RB			// Copy result to Stk[RA] from Stk[RB].
3410    |  movzx RAd, PC_RA
3411    |  mov RC, [BASE+RB*8]
3412    |  mov [BASE+RA*8], RC
3413    |  ins_next
3414    break;
3415
3416  /* -- Constant ops ------------------------------------------------------ */
3417
3418  case BC_KSTR:  /* Load a string constant: fetch it from KBASE, tag it as LJ_TSTR, store to slot RA. */
3419    |  ins_AND	// RA = dst, RD = str const (~)
3420    |  mov RD, [KBASE+RD*8]	// RD = constant table entry (presumably an untagged GCstr pointer).
3421    |  settp RD, LJ_TSTR
3422    |  mov [BASE+RA*8], RD
3423    |  ins_next
3424    break;
3425  case BC_KCDATA:  /* Load a cdata constant; no code at all is emitted when FFI is disabled. */
3426    |.if FFI
3427    |  ins_AND	// RA = dst, RD = cdata const (~)
3428    |  mov RD, [KBASE+RD*8]
3429    |  settp RD, LJ_TCDATA
3430    |  mov [BASE+RA*8], RD
3431    |  ins_next
3432    |.endif
3433    break;
3434  case BC_KSHORT:  /* Load a signed 16-bit literal as an integer (DUALNUM) or as a double. */
3435    |  ins_AD	// RA = dst, RD = signed int16 literal
3436    |.if DUALNUM
3437    |  movsx RDd, RDW	// Sign-extend the 16-bit operand to 32 bits.
3438    |  setint RD
3439    |  mov [BASE+RA*8], RD
3440    |.else
3441    |  movsx RDd, RDW			// Sign-extend literal.
3442    |  cvtsi2sd xmm0, RDd
3443    |  movsd qword [BASE+RA*8], xmm0
3444    |.endif
3445    |  ins_next
3446    break;
3447  case BC_KNUM:  /* Load a numeric constant: 8-byte copy from the constant table to slot RA. */
3448    |  ins_AD	// RA = dst, RD = num const
3449    |  movsd xmm0, qword [KBASE+RD*8]
3450    |  movsd qword [BASE+RA*8], xmm0
3451    |  ins_next
3452    break;
3453  case BC_KPRI:  /* Load a primitive (tag-only) value built from the type number in RD. */
3454    |  ins_AD	// RA = dst, RD = primitive type (~)
3455    |  shl RD, 47	// Move the type number into the tag position ...
3456    |  not RD	// ... and invert: value = ~(type << 47).
3457    |  mov [BASE+RA*8], RD
3458    |  ins_next
3459    break;
3460  case BC_KNIL:  /* Store LJ_TNIL into the slot range RA..RD (inclusive). */
3461    |  ins_AD	// RA = dst_start, RD = dst_end
3462    |  lea RA, [BASE+RA*8+8]	// RA = address of the second slot of the range.
3463    |  lea RD, [BASE+RD*8]	// RD = address of the last slot of the range.
3464    |  mov RB, LJ_TNIL
3465    |  mov [RA-8], RB			// Sets minimum 2 slots.
3466    |1:
3467    |  mov [RA], RB
3468    |  add RA, 8
3469    |  cmp RA, RD
3470    |  jbe <1	// Unsigned address compare: loop until past the last slot.
3471    |  ins_next
3472    break;
3473
3474  /* -- Upvalue and function ops ------------------------------------------ */
3475
3476  case BC_UGET:  /* Read upvalue #RD of the current function into slot RA. */
3477    |  ins_AD	// RA = dst, RD = upvalue #
3478    |  mov LFUNC:RB, [BASE-16]	// Tagged function value stored below BASE.
3479    |  cleartp LFUNC:RB	// Strip the tag to get the GCfuncL pointer.
3480    |  mov UPVAL:RB, [LFUNC:RB+RD*8+offsetof(GCfuncL, uvptr)]	// RB = uvptr[RD].
3481    |  mov RB, UPVAL:RB->v	// RB = pointer to the upvalue's current value.
3482    |  mov RD, [RB]
3483    |  mov [BASE+RA*8], RD
3484    |  ins_next
3485    break;
3486  case BC_USETV:  /* Store slot RD into upvalue #RA, with a GC write barrier for closed, black upvalues. */
3487#define TV2MARKOFS \
3488 ((int32_t)offsetof(GCupval, marked)-(int32_t)offsetof(GCupval, tv))
3489    |  ins_AD	// RA = upvalue #, RD = src
3490    |  mov LFUNC:RB, [BASE-16]
3491    |  cleartp LFUNC:RB
3492    |  mov UPVAL:RB, [LFUNC:RB+RA*8+offsetof(GCfuncL, uvptr)]
3493    |  cmp byte UPVAL:RB->closed, 0	// Flags from this compare are consumed by the jz four lines below.
3494    |  mov RB, UPVAL:RB->v	// mov does not modify flags, so the compare result stays live.
3495    |  mov RA, [BASE+RD*8]
3496    |  mov [RB], RA	// Perform the store first; the barrier check follows.
3497    |  jz >1	// Open upvalue (closed == 0): no barrier needed.
3498    |  // Check barrier for closed upvalue.
3499    |  test byte [RB+TV2MARKOFS], LJ_GC_BLACK		// isblack(uv)
3500    |  jnz >2
3501    |1:
3502    |  ins_next
3503    |
3504    |2:  // Upvalue is black. Check if new value is collectable and white.
3505    |  mov RD, RA
3506    |  sar RD, 47
3507    |  sub RDd, LJ_TISGCV
3508    |  cmp RDd, LJ_TNUMX - LJ_TISGCV			// tvisgcv(v)
3509    |  jbe <1	// Not a collectable value: done.
3510    |  cleartp GCOBJ:RA
3511    |  test byte GCOBJ:RA->gch.marked, LJ_GC_WHITES	// iswhite(v)
3512    |  jz <1
3513    |  // Crossed a write barrier. Move the barrier forward.
3514    |.if not X64WIN
3515    |  mov CARG2, RB	// tv argument = pointer to the upvalue's value.
3516    |  mov RB, BASE			// Save BASE.
3517    |.else
3518    |  xchg CARG2, RB			// Save BASE (CARG2 == BASE).
3519    |.endif
3520    |  lea GL:CARG1, [DISPATCH+GG_DISP2G]
3521    |  call extern lj_gc_barrieruv	// (global_State *g, TValue *tv)
3522    |  mov BASE, RB			// Restore BASE.
3523    |  jmp <1
3524    break;
3525#undef TV2MARKOFS
3526  case BC_USETS:  /* Store a string constant into upvalue #RA, with a GC write barrier when required. */
3527    |  ins_AND	// RA = upvalue #, RD = str const (~)
3528    |  mov LFUNC:RB, [BASE-16]
3529    |  cleartp LFUNC:RB
3530    |  mov UPVAL:RB, [LFUNC:RB+RA*8+offsetof(GCfuncL, uvptr)]
3531    |  mov STR:RA, [KBASE+RD*8]	// RA = string constant; kept for the barrier check at label 2.
3532    |  mov RD, UPVAL:RB->v	// RD = pointer to the upvalue's value.
3533    |  settp STR:ITYPE, STR:RA, LJ_TSTR	// ITYPE = tagged copy of the string pointer.
3534    |  mov [RD], STR:ITYPE
3535    |  test byte UPVAL:RB->marked, LJ_GC_BLACK		// isblack(uv)
3536    |  jnz >2
3537    |1:
3538    |  ins_next
3539    |
3540    |2:  // Check if string is white and ensure upvalue is closed.
3541    |  test byte GCOBJ:RA->gch.marked, LJ_GC_WHITES	// iswhite(str)
3542    |  jz <1
3543    |  cmp byte UPVAL:RB->closed, 0
3544    |  jz <1	// Open upvalue: no barrier.
3545    |  // Crossed a write barrier. Move the barrier forward.
3546    |  mov RB, BASE			// Save BASE (CARG2 == BASE).
3547    |  mov CARG2, RD
3548    |  lea GL:CARG1, [DISPATCH+GG_DISP2G]
3549    |  call extern lj_gc_barrieruv	// (global_State *g, TValue *tv)
3550    |  mov BASE, RB			// Restore BASE.
3551    |  jmp <1
3552    break;
3553  case BC_USETN:  /* Store a numeric constant into upvalue #RA; no GC barrier code is emitted. */
3554    |  ins_AD	// RA = upvalue #, RD = num const
3555    |  mov LFUNC:RB, [BASE-16]
3556    |  cleartp LFUNC:RB
3557    |  movsd xmm0, qword [KBASE+RD*8]
3558    |  mov UPVAL:RB, [LFUNC:RB+RA*8+offsetof(GCfuncL, uvptr)]
3559    |  mov RA, UPVAL:RB->v	// RA = pointer to the upvalue's value.
3560    |  movsd qword [RA], xmm0
3561    |  ins_next
3562    break;
3563  case BC_USETP:  /* Store a primitive (tag-only) value into upvalue #RA; no GC barrier code is emitted. */
3564    |  ins_AD	// RA = upvalue #, RD = primitive type (~)
3565    |  mov LFUNC:RB, [BASE-16]
3566    |  cleartp LFUNC:RB
3567    |  mov UPVAL:RB, [LFUNC:RB+RA*8+offsetof(GCfuncL, uvptr)]
3568    |  shl RD, 47	// Same encoding as BC_KPRI: value = ~(type << 47).
3569    |  not RD
3570    |  mov RA, UPVAL:RB->v
3571    |  mov [RA], RD
3572    |  ins_next
3573    break;
3574  case BC_UCLO:  /* Close open upvalues from slot RA upwards (if any exist), then jump to the target. */
3575    |  ins_AD	// RA = level, RD = target
3576    |  branchPC RD			// Do this first to free RD.
3577    |  mov L:RB, SAVE_L
3578    |  cmp aword L:RB->openupval, 0
3579    |  je >1	// No open upvalues at all: skip the C call.
3580    |  mov L:RB->base, BASE	// Publish BASE before it is overwritten as CARG2.
3581    |  lea CARG2, [BASE+RA*8]		// Caveat: CARG2 == BASE
3582    |  mov L:CARG1, L:RB		// Caveat: CARG1 == RA
3583    |  call extern lj_func_closeuv	// (lua_State *L, TValue *level)
3584    |  mov BASE, L:RB->base	// Reload BASE from the lua_State.
3585    |1:
3586    |  ins_next
3587    break;
3588
3589  case BC_FNEW:  /* Create a closure from a prototype constant via lj_func_newL_gc() and store it in slot RA. */
3590    |  ins_AND	// RA = dst, RD = proto const (~) (holding function prototype)
3591    |  mov L:RB, SAVE_L
3592    |  mov L:RB->base, BASE		// Caveat: CARG2/CARG3 may be BASE.
3593    |  mov CARG3, [BASE-16]	// parent = current function (tagged value below BASE) ...
3594    |  cleartp CARG3	// ... with the tag stripped.
3595    |  mov CARG2, [KBASE+RD*8]		// Fetch GCproto *.
3596    |  mov CARG1, L:RB
3597    |  mov SAVE_PC, PC
3598    |  // (lua_State *L, GCproto *pt, GCfuncL *parent)
3599    |  call extern lj_func_newL_gc
3600    |  // GCfuncL * returned in eax (RC).
3601    |  mov BASE, L:RB->base	// Reload BASE from the lua_State.
3602    |  movzx RAd, PC_RA	// RA is not preserved across the C call: re-read it.
3603    |  settp LFUNC:RC, LJ_TFUNC
3604    |  mov [BASE+RA*8], LFUNC:RC
3605    |  ins_next
3606    break;
3607
3608  /* -- Table ops --------------------------------------------------------- */
3609
3610  case BC_TNEW:  /* Create a new table sized from the packed operand RD, running a GC step first if due. */
3611    |  ins_AD	// RA = dst, RD = hbits|asize
3612    |  mov L:RB, SAVE_L
3613    |  mov L:RB->base, BASE
3614    |  mov RA, [DISPATCH+DISPATCH_GL(gc.total)]
3615    |  cmp RA, [DISPATCH+DISPATCH_GL(gc.threshold)]
3616    |  mov SAVE_PC, PC	// mov leaves the flags of the compare intact for the jae below.
3617    |  jae >5	// gc.total >= gc.threshold: run a GC step first.
3618    |1:
3619    |  mov CARG3d, RDd
3620    |  and RDd, 0x7ff	// asize = low 11 bits of the operand.
3621    |  shr CARG3d, 11	// hbits = remaining upper bits.
3622    |  cmp RDd, 0x7ff
3623    |  je >3
3624    |2:
3625    |  mov L:CARG1, L:RB
3626    |  mov CARG2d, RDd
3627    |  call extern lj_tab_new  // (lua_State *L, int32_t asize, uint32_t hbits)
3628    |  // Table * returned in eax (RC).
3629    |  mov BASE, L:RB->base
3630    |  movzx RAd, PC_RA	// RA was used as scratch above and is not preserved by the call.
3631    |  settp TAB:RC, LJ_TTAB
3632    |  mov [BASE+RA*8], TAB:RC
3633    |  ins_next
3634    |3:  // Turn 0x7ff into 0x801.
3635    |  mov RDd, 0x801
3636    |  jmp <2
3637    |5:
3638    |  mov L:CARG1, L:RB
3639    |  call extern lj_gc_step_fixtop	// (lua_State *L)
3640    |  movzx RDd, PC_RD	// RD (rax) is not preserved across the call: re-read the operand.
3641    |  jmp <1
3642    break;
3643  case BC_TDUP:  /* Duplicate a template table constant via lj_tab_dup(), running a GC step first if due. */
3644    |  ins_AND	// RA = dst, RD = table const (~) (holding template table)
3645    |  mov L:RB, SAVE_L
3646    |  mov RA, [DISPATCH+DISPATCH_GL(gc.total)]
3647    |  mov SAVE_PC, PC
3648    |  cmp RA, [DISPATCH+DISPATCH_GL(gc.threshold)]
3649    |  mov L:RB->base, BASE	// mov leaves the flags of the compare intact for the jae below.
3650    |  jae >3	// gc.total >= gc.threshold: run a GC step first.
3651    |2:
3652    |  mov TAB:CARG2, [KBASE+RD*8]	// Caveat: CARG2 == BASE
3653    |  mov L:CARG1, L:RB		// Caveat: CARG1 == RA
3654    |  call extern lj_tab_dup		// (lua_State *L, Table *kt)
3655    |  // Table * returned in eax (RC).
3656    |  mov BASE, L:RB->base
3657    |  movzx RAd, PC_RA
3658    |  settp TAB:RC, LJ_TTAB
3659    |  mov [BASE+RA*8], TAB:RC
3660    |  ins_next
3661    |3:
3662    |  mov L:CARG1, L:RB
3663    |  call extern lj_gc_step_fixtop	// (lua_State *L)
3664    |  movzx RDd, PC_RD			// Need to reload RD.
3665    |  not RD	// Redo the inversion that ins_AND presumably applied to the raw operand.
3666    |  jmp <2
3667    break;
3668
3669  case BC_GGET:  /* Global read: look up a string key in the current function's env table via the TGETS tail. */
3670    |  ins_AND	// RA = dst, RD = str const (~)
3671    |  mov LFUNC:RB, [BASE-16]
3672    |  cleartp LFUNC:RB
3673    |  mov TAB:RB, LFUNC:RB->env	// RB = environment table of the running function.
3674    |  mov STR:RC, [KBASE+RD*8]	// RC = key string, as BC_TGETS_Z expects.
3675    |  jmp ->BC_TGETS_Z
3676    break;
3677  case BC_GSET:  /* Global write: store into the current function's env table via the TSETS tail. */
3678    |  ins_AND	// RA = src, RD = str const (~)
3679    |  mov LFUNC:RB, [BASE-16]
3680    |  cleartp LFUNC:RB
3681    |  mov TAB:RB, LFUNC:RB->env	// RB = environment table of the running function.
3682    |  mov STR:RC, [KBASE+RD*8]	// RC = key string, as BC_TSETS_Z expects.
3683    |  jmp ->BC_TSETS_Z
3684    break;
3685
3686  case BC_TGETV:  /* Table read with a variable key: inline array fast path, string keys reuse TGETS, else vmeta_tgetv. */
3687    |  ins_ABC	// RA = dst, RB = table, RC = key
3688    |  mov TAB:RB, [BASE+RB*8]
3689    |  mov RC, [BASE+RC*8]
3690    |  checktab TAB:RB, ->vmeta_tgetv
3691    |
3692    |  // Integer key?
3693    |.if DUALNUM
3694    |  checkint RC, >5
3695    |.else
3696    |  // Convert number to int and back and compare.
3697    |  checknum RC, >5
3698    |  movd xmm0, RC
3699    |  cvttsd2si RCd, xmm0
3700    |  cvtsi2sd xmm1, RCd
3701    |  ucomisd xmm0, xmm1
3702    |  jne ->vmeta_tgetv		// Generic numeric key? Use fallback.
3703    |.endif
3704    |  cmp RCd, TAB:RB->asize		// Takes care of unordered, too.
3705    |  jae ->vmeta_tgetv		// Not in array part? Use fallback.
3706    |  shl RCd, 3	// Index -> byte offset (8 bytes per slot); 32-bit op zero-extends into RC.
3707    |  add RC, TAB:RB->array
3708    |  // Get array slot.
3709    |  mov ITYPE, [RC]
3710    |  cmp ITYPE, LJ_TNIL		// Avoid overwriting RB in fastpath.
3711    |  je >2
3712    |1:
3713    |  mov [BASE+RA*8], ITYPE
3714    |  ins_next
3715    |
3716    |2:  // Check for __index if table value is nil.
3717    |  mov TAB:TMPR, TAB:RB->metatable
3718    |  test TAB:TMPR, TAB:TMPR
3719    |  jz <1	// No metatable: the nil is the result.
3720    |  test byte TAB:TMPR->nomm, 1<<MM_index
3721    |  jz ->vmeta_tgetv			// 'no __index' flag NOT set: check.
3722    |  jmp <1
3723    |
3724    |5:  // String key?
3725    |  cmp ITYPEd, LJ_TSTR; jne ->vmeta_tgetv	// ITYPE presumably still holds the key's tag from the check above.
3726    |  cleartp STR:RC
3727    |  jmp ->BC_TGETS_Z
3728    break;
3729  case BC_TGETS:  /* Table read with a string constant key: inline walk of the hash chain. */
3730    |  ins_ABC	// RA = dst, RB = table, RC = str const (~)
3731    |  mov TAB:RB, [BASE+RB*8]
3732    |  not RC	// Undo the (~) encoding of the constant index.
3733    |  mov STR:RC, [KBASE+RC*8]
3734    |  checktab TAB:RB, ->vmeta_tgets
3735    |->BC_TGETS_Z:	// RB = GCtab *, RC = GCstr *
3736    |  mov TMPRd, TAB:RB->hmask
3737    |  and TMPRd, STR:RC->sid	// Slot index = sid & hmask.
3738    |  imul TMPRd, #NODE	// Scale by the node size.
3739    |  add NODE:TMPR, TAB:RB->node	// TMPR = first node of the chain.
3740    |  settp ITYPE, STR:RC, LJ_TSTR	// ITYPE = tagged key, for whole-word key compares.
3741    |1:
3742    |  cmp NODE:TMPR->key, ITYPE
3743    |  jne >4
3744    |  // Get node value.
3745    |  mov ITYPE, NODE:TMPR->val
3746    |  cmp ITYPE, LJ_TNIL
3747    |  je >5				// Key found, but nil value?
3748    |2:
3749    |  mov [BASE+RA*8], ITYPE
3750    |  ins_next
3751    |
3752    |4:  // Follow hash chain.
3753    |  mov NODE:TMPR, NODE:TMPR->next
3754    |  test NODE:TMPR, NODE:TMPR
3755    |  jnz <1
3756    |  // End of hash chain: key not found, nil result.
3757    |  mov ITYPE, LJ_TNIL
3758    |
3759    |5:  // Check for __index if table value is nil.
3760    |  mov TAB:TMPR, TAB:RB->metatable
3761    |  test TAB:TMPR, TAB:TMPR
3762    |  jz <2				// No metatable: done.
3763    |  test byte TAB:TMPR->nomm, 1<<MM_index
3764    |  jnz <2				// 'no __index' flag set: done.
3765    |  jmp ->vmeta_tgets		// Caveat: preserve STR:RC.
3766    break;
3767  case BC_TGETB:  /* Table read with a byte literal key: array part only, otherwise vmeta_tgetb. */
3768    |  ins_ABC	// RA = dst, RB = table, RC = byte literal
3769    |  mov TAB:RB, [BASE+RB*8]
3770    |  checktab TAB:RB, ->vmeta_tgetb
3771    |  cmp RCd, TAB:RB->asize
3772    |  jae ->vmeta_tgetb	// Literal index outside the array part: fallback.
3773    |  shl RCd, 3	// Index -> byte offset (8 bytes per slot).
3774    |  add RC, TAB:RB->array
3775    |  // Get array slot.
3776    |  mov ITYPE, [RC]
3777    |  cmp ITYPE, LJ_TNIL
3778    |  je >2
3779    |1:
3780    |  mov [BASE+RA*8], ITYPE
3781    |  ins_next
3782    |
3783    |2:  // Check for __index if table value is nil.
3784    |  mov TAB:TMPR, TAB:RB->metatable
3785    |  test TAB:TMPR, TAB:TMPR
3786    |  jz <1	// No metatable: the nil is the result.
3787    |  test byte TAB:TMPR->nomm, 1<<MM_index
3788    |  jz ->vmeta_tgetb			// 'no __index' flag NOT set: check.
3789    |  jmp <1
3790    break;
3791  case BC_TGETR:  /* Raw array read: no type check on table or key and no metatable lookup is emitted here. */
3792    |  ins_ABC	// RA = dst, RB = table, RC = key
3793    |  mov TAB:RB, [BASE+RB*8]
3794    |  cleartp TAB:RB	// Tag is stripped without being checked.
3795    |.if DUALNUM
3796    |  mov RCd, dword [BASE+RC*8]	// Low 32 bits of the key slot are taken as the index.
3797    |.else
3798    |  cvttsd2si RCd, qword [BASE+RC*8]	// Key slot is converted from double with truncation.
3799    |.endif
3800    |  cmp RCd, TAB:RB->asize
3801    |  jae ->vmeta_tgetr		// Not in array part? Use fallback.
3802    |  shl RCd, 3
3803    |  add RC, TAB:RB->array
3804    |  // Get array slot.
3805    |->BC_TGETR_Z:	// Global entry: expects RC = address of the value to load.
3806    |  mov ITYPE, [RC]
3807    |->BC_TGETR2_Z:	// Global entry: expects ITYPE = value to store into slot RA.
3808    |  mov [BASE+RA*8], ITYPE
3809    |  ins_next
3810    break;
3811
3812  case BC_TSETV:  /* Table write with a variable key: inline array fast path, string keys reuse TSETS, else vmeta_tsetv. */
3813    |  ins_ABC	// RA = src, RB = table, RC = key
3814    |  mov TAB:RB, [BASE+RB*8]
3815    |  mov RC, [BASE+RC*8]
3816    |  checktab TAB:RB, ->vmeta_tsetv
3817    |
3818    |  // Integer key?
3819    |.if DUALNUM
3820    |  checkint RC, >5
3821    |.else
3822    |  // Convert number to int and back and compare.
3823    |  checknum RC, >5
3824    |  movd xmm0, RC
3825    |  cvttsd2si RCd, xmm0
3826    |  cvtsi2sd xmm1, RCd
3827    |  ucomisd xmm0, xmm1
3828    |  jne ->vmeta_tsetv		// Generic numeric key? Use fallback.
3829    |.endif
3830    |  cmp RCd, TAB:RB->asize		// Takes care of unordered, too.
3831    |  jae ->vmeta_tsetv	// Not in array part: fallback.
3832    |  shl RCd, 3	// Index -> byte offset (8 bytes per slot).
3833    |  add RC, TAB:RB->array
3834    |  cmp aword [RC], LJ_TNIL
3835    |  je >3				// Previous value is nil?
3836    |1:
3837    |  test byte TAB:RB->marked, LJ_GC_BLACK	// isblack(table)
3838    |  jnz >7
3839    |2:  // Set array slot.
3840    |  mov RB, [BASE+RA*8]	// RB is reused as scratch; the table pointer is no longer needed.
3841    |  mov [RC], RB
3842    |  ins_next
3843    |
3844    |3:  // Check for __newindex if previous value is nil.
3845    |  mov TAB:TMPR, TAB:RB->metatable
3846    |  test TAB:TMPR, TAB:TMPR
3847    |  jz <1	// No metatable: plain store.
3848    |  test byte TAB:TMPR->nomm, 1<<MM_newindex
3849    |  jz ->vmeta_tsetv			// 'no __newindex' flag NOT set: check.
3850    |  jmp <1
3851    |
3852    |5:  // String key?
3853    |  cmp ITYPEd, LJ_TSTR; jne ->vmeta_tsetv	// ITYPE presumably still holds the key's tag from the check above.
3854    |  cleartp STR:RC
3855    |  jmp ->BC_TSETS_Z
3856    |
3857    |7:  // Possible table write barrier for the value. Skip valiswhite check.
3858    |  barrierback TAB:RB, TMPR	// TMPR is passed as the macro's second operand (presumably scratch).
3859    |  jmp <2
3860    break;
3861  case BC_TSETS:  /* Table write with a string constant key: inline hash chain walk, lj_tab_newkey() for missing keys. */
3862    |  ins_ABC	// RA = src, RB = table, RC = str const (~)
3863    |  mov TAB:RB, [BASE+RB*8]
3864    |  not RC	// Undo the (~) encoding of the constant index.
3865    |  mov STR:RC, [KBASE+RC*8]
3866    |  checktab TAB:RB, ->vmeta_tsets
3867    |->BC_TSETS_Z:	// RB = GCtab *, RC = GCstr *
3868    |  mov TMPRd, TAB:RB->hmask
3869    |  and TMPRd, STR:RC->sid	// Slot index = sid & hmask.
3870    |  imul TMPRd, #NODE	// Scale by the node size.
3871    |  mov byte TAB:RB->nomm, 0		// Clear metamethod cache.
3872    |  add NODE:TMPR, TAB:RB->node	// TMPR = first node of the chain.
3873    |  settp ITYPE, STR:RC, LJ_TSTR	// ITYPE = tagged key; also stored to TMP1 at label 6.
3874    |1:
3875    |  cmp NODE:TMPR->key, ITYPE
3876    |  jne >5
3877    |  // Ok, key found. Assumes: offsetof(Node, val) == 0
3878    |  cmp aword [TMPR], LJ_TNIL
3879    |  je >4				// Previous value is nil?
3880    |2:
3881    |  test byte TAB:RB->marked, LJ_GC_BLACK	// isblack(table)
3882    |  jnz >7
3883    |3:  // Set node value.
3884    |  mov ITYPE, [BASE+RA*8]
3885    |  mov [TMPR], ITYPE
3886    |  ins_next
3887    |
3888    |4:  // Check for __newindex if previous value is nil.
3889    |  mov TAB:ITYPE, TAB:RB->metatable	// ITYPE is free here: the key has already been matched.
3890    |  test TAB:ITYPE, TAB:ITYPE
3891    |  jz <2
3892    |  test byte TAB:ITYPE->nomm, 1<<MM_newindex
3893    |  jz ->vmeta_tsets			// 'no __newindex' flag NOT set: check.
3894    |  jmp <2
3895    |
3896    |5:  // Follow hash chain.
3897    |  mov NODE:TMPR, NODE:TMPR->next
3898    |  test NODE:TMPR, NODE:TMPR
3899    |  jnz <1
3900    |  // End of hash chain: key not found, add a new one.
3901    |
3902    |  // But check for __newindex first.
3903    |  mov TAB:TMPR, TAB:RB->metatable
3904    |  test TAB:TMPR, TAB:TMPR
3905    |  jz >6				// No metatable: continue.
3906    |  test byte TAB:TMPR->nomm, 1<<MM_newindex
3907    |  jz ->vmeta_tsets			// 'no __newindex' flag NOT set: check.
3908    |6:
3909    |  mov TMP1, ITYPE	// Spill the tagged key so its address can be passed as a TValue *.
3910    |  mov L:CARG1, SAVE_L
3911    |  mov L:CARG1->base, BASE
3912    |  lea CARG3, TMP1
3913    |  mov CARG2, TAB:RB
3914    |  mov SAVE_PC, PC
3915    |  call extern lj_tab_newkey	// (lua_State *L, GCtab *t, TValue *k)
3916    |  // Handles write barrier for the new key. TValue * returned in eax (RC).
3917    |  mov L:CARG1, SAVE_L
3918    |  mov BASE, L:CARG1->base
3919    |  mov TMPR, rax	// Labels 2/3 expect the destination address in TMPR.
3920    |  movzx RAd, PC_RA	// RA is not preserved across the C call: re-read it.
3921    |  jmp <2				// Must check write barrier for value.
3922    |
3923    |7:  // Possible table write barrier for the value. Skip valiswhite check.
3924    |  barrierback TAB:RB, ITYPE	// ITYPE is reloaded at label 3, so it can be handed to the macro here.
3925    |  jmp <3
3926    break;
3927  case BC_TSETB:  /* Table write with a byte literal key: array part only, otherwise vmeta_tsetb. */
3928    |  ins_ABC	// RA = src, RB = table, RC = byte literal
3929    |  mov TAB:RB, [BASE+RB*8]
3930    |  checktab TAB:RB, ->vmeta_tsetb
3931    |  cmp RCd, TAB:RB->asize
3932    |  jae ->vmeta_tsetb	// Literal index outside the array part: fallback.
3933    |  shl RCd, 3	// Index -> byte offset (8 bytes per slot).
3934    |  add RC, TAB:RB->array
3935    |  cmp aword [RC], LJ_TNIL
3936    |  je >3				// Previous value is nil?
3937    |1:
3938    |  test byte TAB:RB->marked, LJ_GC_BLACK	// isblack(table)
3939    |  jnz >7
3940    |2:	 // Set array slot.
3941    |  mov ITYPE, [BASE+RA*8]
3942    |  mov [RC], ITYPE
3943    |  ins_next
3944    |
3945    |3:  // Check for __newindex if previous value is nil.
3946    |  mov TAB:TMPR, TAB:RB->metatable
3947    |  test TAB:TMPR, TAB:TMPR
3948    |  jz <1	// No metatable: plain store.
3949    |  test byte TAB:TMPR->nomm, 1<<MM_newindex
3950    |  jz ->vmeta_tsetb			// 'no __newindex' flag NOT set: check.
3951    |  jmp <1
3952    |
3953    |7:  // Possible table write barrier for the value. Skip valiswhite check.
3954    |  barrierback TAB:RB, TMPR
3955    |  jmp <2
3956    break;
3957  case BC_TSETR:  /* Raw array write: no type check on table or key and no metatable lookup is emitted here. */
3958    |  ins_ABC	// RA = src, RB = table, RC = key
3959    |  mov TAB:RB, [BASE+RB*8]
3960    |  cleartp TAB:RB	// Tag is stripped without being checked.
3961    |.if DUALNUM
3962    |  mov RC, [BASE+RC*8]	// Full slot is loaded; only RCd is used below.
3963    |.else
3964    |  cvttsd2si RCd, qword [BASE+RC*8]	// Key slot is converted from double with truncation.
3965    |.endif
3966    |  test byte TAB:RB->marked, LJ_GC_BLACK	// isblack(table)
3967    |  jnz >7
3968    |2:
3969    |  cmp RCd, TAB:RB->asize
3970    |  jae ->vmeta_tsetr	// Not in array part: fallback.
3971    |  shl RCd, 3	// 32-bit shift also zero-extends into RC before the 64-bit add.
3972    |  add RC, TAB:RB->array
3973    |  // Set array slot.
3974    |->BC_TSETR_Z:	// Global entry: expects RC = destination address.
3975    |  mov ITYPE, [BASE+RA*8]
3976    |  mov [RC], ITYPE
3977    |  ins_next
3978    |
3979    |7:  // Possible table write barrier for the value. Skip valiswhite check.
3980    |  barrierback TAB:RB, TMPR
3981    |  jmp <2
3982    break;
3983
3984  case BC_TSETM:  /* Store MULTRES-1 consecutive slots starting at RA into the array part of the table at RA-1. */
3985    |  ins_AD	// RA = base (table at base-1), RD = num const (start index)
3986    |1:
3987    |  mov TMPRd, dword [KBASE+RD*8]	// Integer constant is in lo-word.
3988    |  lea RA, [BASE+RA*8]	// RA = address of the first source slot.
3989    |  mov TAB:RB, [RA-8]		// Guaranteed to be a table.
3990    |  cleartp TAB:RB
3991    |  test byte TAB:RB->marked, LJ_GC_BLACK	// isblack(table)
3992    |  jnz >7
3993    |2:
3994    |  mov RDd, MULTRES
3995    |  sub RDd, 1	// RD = number of values to copy.
3996    |  jz >4				// Nothing to copy?
3997    |  add RDd, TMPRd			// Compute needed size.
3998    |  cmp RDd, TAB:RB->asize
3999    |  ja >5				// Doesn't fit into array part?
4000    |  sub RDd, TMPRd	// Back to the plain element count.
4001    |  shl TMPRd, 3	// Start index -> byte offset.
4002    |  add TMPR, TAB:RB->array	// TMPR = address of the first destination slot.
4003    |3:  // Copy result slots to table.
4004    |  mov RB, [RA]	// RB is reused as scratch; the table pointer is no longer needed.
4005    |  add RA, 8
4006    |  mov [TMPR], RB
4007    |  add TMPR, 8
4008    |  sub RDd, 1
4009    |  jnz <3
4010    |4:
4011    |  ins_next
4012    |
4013    |5:  // Need to resize array part.
4014    |  mov L:CARG1, SAVE_L
4015    |  mov L:CARG1->base, BASE		// Caveat: CARG2/CARG3 may be BASE.
4016    |  mov CARG2, TAB:RB
4017    |  mov CARG3d, RDd	// nasize = needed size computed above.
4018    |  mov L:RB, L:CARG1	// Keep L in a callee-save register across the call.
4019    |  mov SAVE_PC, PC
4020    |  call extern lj_tab_reasize	// (lua_State *L, GCtab *t, int nasize)
4021    |  mov BASE, L:RB->base
4022    |  movzx RAd, PC_RA			// Restore RA.
4023    |  movzx RDd, PC_RD			// Restore RD.
4024    |  jmp <1				// Retry.
4025    |
4026    |7:  // Possible table write barrier for any value. Skip valiswhite check.
4027    |  barrierback TAB:RB, RD	// RD is reloaded from MULTRES at label 2, so it can be handed to the macro here.
4028    |  jmp <2
4029    break;
4030
4031  /* -- Calls and vararg handling ----------------------------------------- */
4032
4033  case BC_CALL: case BC_CALLM:  /* Regular call of the function in slot RA; CALLM adds MULTRES to the argument count. */
4034    |  ins_A_C	// RA = base, (RB = nresults+1,) RC = nargs+1 | extra_nargs
4035    if (op == BC_CALLM) {
4036      |  add NARGS:RDd, MULTRES
4037    }
4038    |  mov LFUNC:RB, [BASE+RA*8]
4039    |  checkfunc LFUNC:RB, ->vmeta_call_ra	// Not a function: handled out of line.
4040    |  lea BASE, [BASE+RA*8+16]	// New BASE = two slots above the function slot.
4041    |  ins_call
4042    break;
4043
4044  case BC_CALLMT:  /* Tail call with extra args: only adjusts the arg count; no dispatch is emitted for this opcode. */
4045    |  ins_AD	// RA = base, RD = extra_nargs
4046    |  add NARGS:RDd, MULTRES
4047    |  // Fall through. Assumes BC_CALLT follows and ins_AD is a no-op.
4048    break;  /* Ends code generation for this opcode only; the emitted code runs on into what follows. */
4049  case BC_CALLT:  /* Tail call: move function and arguments down onto the current frame, then dispatch. */
4050    |  ins_AD	// RA = base, RD = nargs+1
4051    |  lea RA, [BASE+RA*8+16]	// RA = address of the first argument.
4052    |  mov KBASE, BASE			// Use KBASE for move + vmeta_call hint.
4053    |  mov LFUNC:RB, [RA-16]	// Function value, two slots below the first argument.
4054    |  checktp_nc LFUNC:RB, LJ_TFUNC, ->vmeta_call	// RB stays tagged here; the tag is cleared at label 3.
4055    |->BC_CALLT_Z:
4056    |  mov PC, [BASE-8]	// Frame link word of the current frame.
4057    |  test PCd, FRAME_TYPE
4058    |  jnz >7	// Some FRAME_TYPE bit set: handled at label 7.
4059    |1:
4060    |  mov [BASE-16], LFUNC:RB		// Copy func+tag down, reloaded below.
4061    |  mov MULTRES, NARGS:RDd	// Stash nargs+1; RD is consumed as the loop counter.
4062    |  sub NARGS:RDd, 1
4063    |  jz >3	// No arguments to move.
4064    |2:  // Move args down.
4065    |  mov RB, [RA]
4066    |  add RA, 8
4067    |  mov [KBASE], RB
4068    |  add KBASE, 8
4069    |  sub NARGS:RDd, 1
4070    |  jnz <2
4071    |
4072    |  mov LFUNC:RB, [BASE-16]	// RB was clobbered by the copy loop: reload the function.
4073    |3:
4074    |  cleartp LFUNC:RB
4075    |  mov NARGS:RDd, MULTRES	// Restore nargs+1.
4076    |  cmp byte LFUNC:RB->ffid, 1	// (> FF_C) Calling a fast function?
4077    |  ja >5
4078    |4:
4079    |  ins_callt
4080    |
4081    |5:  // Tailcall to a fast function.
4082    |  test PCd, FRAME_TYPE		// Lua frame below?
4083    |  jnz <4
4084    |  movzx RAd, PC_RA	// PC holds the frame link here, so this reads the operand at that location.
4085    |  neg RA
4086    |  mov LFUNC:KBASE, [BASE+RA*8-32]	// Need to prepare KBASE.
4087    |  cleartp LFUNC:KBASE
4088    |  mov KBASE, LFUNC:KBASE->pc
4089    |  mov KBASE, [KBASE+PC2PROTO(k)]
4090    |  jmp <4
4091    |
4092    |7:  // Tailcall from a vararg function.
4093    |  sub PC, FRAME_VARG
4094    |  test PCd, FRAME_TYPEP
4095    |  jnz >8				// Vararg frame below?
4096    |  sub BASE, PC			// Need to relocate BASE/KBASE down.
4097    |  mov KBASE, BASE
4098    |  mov PC, [BASE-8]	// Re-read the frame link of the relocated frame.
4099    |  jmp <1
4100    |8:
4101    |  add PCd, FRAME_VARG	// Undo the subtraction above and continue unchanged.
4102    |  jmp <1
4103    break;
4104
4105  case BC_ITERC:  /* Generic-for iterator call: copy callable, state and control var up, then do a 2-argument call. */
4106    |  ins_A	// RA = base, (RB = nresults+1,) RC = nargs+1 (2+1)
4107    |  lea RA, [BASE+RA*8+16]		// fb = base+2
4108    |  mov RB, [RA-32]			// Copy state. fb[0] = fb[-4].
4109    |  mov RC, [RA-24]			// Copy control var. fb[1] = fb[-3].
4110    |  mov [RA], RB
4111    |  mov [RA+8], RC
4112    |  mov LFUNC:RB, [RA-40]		// Copy callable. fb[-2] = fb[-5]
4113    |  mov [RA-16], LFUNC:RB
4114    |  mov NARGS:RDd, 2+1		// Handle like a regular 2-arg call.
4115    |  checkfunc LFUNC:RB, ->vmeta_call	// Callable is not a function: handled out of line.
4116    |  mov BASE, RA	// New BASE = fb.
4117    |  ins_call
4118    break;
4119
4120  case BC_ITERN:
4121    |.if JIT
4122    |  hotloop RBd
4123    |.endif
4124    |->vm_IITERN:
4125    |  ins_A	// RA = base, (RB = nresults+1, RC = nargs+1 (2+1))
4126    |  mov TAB:RB, [BASE+RA*8-16]
4127    |  cleartp TAB:RB
4128    |  mov RCd, [BASE+RA*8-8]		// Get index from control var.
4129    |  mov TMPRd, TAB:RB->asize
4130    |  add PC, 4
4131    |  mov ITYPE, TAB:RB->array
4132    |1:  // Traverse array part.
4133    |  cmp RCd, TMPRd; jae >5		// Index points after array part?
4134    |  cmp aword [ITYPE+RC*8], LJ_TNIL; je >4
4135    |.if not DUALNUM
4136    |  cvtsi2sd xmm0, RCd
4137    |.endif
4138    |  // Copy array slot to returned value.
4139    |  mov RB, [ITYPE+RC*8]
4140    |  mov [BASE+RA*8+8], RB
4141    |  // Return array index as a numeric key.
4142    |.if DUALNUM
4143    |  setint ITYPE, RC
4144    |  mov [BASE+RA*8], ITYPE
4145    |.else
4146    |  movsd qword [BASE+RA*8], xmm0
4147    |.endif
4148    |  add RCd, 1
4149    |  mov [BASE+RA*8-8], RCd		// Update control var.
4150    |2:
4151    |  movzx RDd, PC_RD			// Get target from ITERL.
4152    |  branchPC RD
4153    |3:
4154    |  ins_next
4155    |
4156    |4:  // Skip holes in array part.
4157    |  add RCd, 1
4158    |  jmp <1
4159    |
4160    |5:  // Traverse hash part.
4161    |  sub RCd, TMPRd
4162    |6:
4163    |  cmp RCd, TAB:RB->hmask; ja <3	// End of iteration? Branch to ITERL+1.
4164    |  imul ITYPEd, RCd, #NODE
4165    |  add NODE:ITYPE, TAB:RB->node
4166    |  cmp aword NODE:ITYPE->val, LJ_TNIL; je >7
4167    |  lea TMPRd, [RCd+TMPRd+1]
4168    |  // Copy key and value from hash slot.
4169    |  mov RB, NODE:ITYPE->key
4170    |  mov RC, NODE:ITYPE->val
4171    |  mov [BASE+RA*8], RB
4172    |  mov [BASE+RA*8+8], RC
4173    |  mov [BASE+RA*8-8], TMPRd
4174    |  jmp <2
4175    |
4176    |7:  // Skip holes in hash part.
4177    |  add RCd, 1
4178    |  jmp <6
4179    break;
4180
4181  case BC_ISNEXT:
4182    |  ins_AD	// RA = base, RD = target (points to ITERN)
4183    |  mov CFUNC:RB, [BASE+RA*8-24]
4184    |  checkfunc CFUNC:RB, >5
4185    |  checktptp [BASE+RA*8-16], LJ_TTAB, >5
4186    |  cmp aword [BASE+RA*8-8], LJ_TNIL; jne >5
4187    |  cmp byte CFUNC:RB->ffid, FF_next_N; jne >5
4188    |  branchPC RD
4189    |  mov64 TMPR, ((uint64_t)LJ_KEYINDEX << 32)
4190    |  mov [BASE+RA*8-8], TMPR		// Initialize control var.
4191    |1:
4192    |  ins_next
4193    |5:  // Despecialize bytecode if any of the checks fail.
4194    |  mov PC_OP, BC_JMP
4195    |  branchPC RD
4196    |.if JIT
4197    |  cmp byte [PC], BC_ITERN
4198    |  jne >6
4199    |.endif
4200    |  mov byte [PC], BC_ITERC
4201    |  jmp <1
4202    |.if JIT
4203    |6:  // Unpatch JLOOP.
4204    |  mov RA, [DISPATCH+DISPATCH_J(trace)]
4205    |  movzx RCd, word [PC+2]
4206    |  mov TRACE:RA, [RA+RC*8]
4207    |  mov eax, TRACE:RA->startins
4208    |  mov al, BC_ITERC
4209    |  mov dword [PC], eax
4210    |  jmp <1
4211    |.endif
4212    break;
4213
4214  case BC_VARG:
4215    |  ins_ABC	// RA = base, RB = nresults+1, RC = numparams
4216    |  lea TMPR, [BASE+RC*8+(16+FRAME_VARG)]
4217    |  lea RA, [BASE+RA*8]
4218    |  sub TMPR, [BASE-8]
4219    |  // Note: TMPR may now be even _above_ BASE if nargs was < numparams.
4220    |  test RB, RB
4221    |  jz >5				// Copy all varargs?
4222    |  lea RB, [RA+RB*8-8]
4223    |  cmp TMPR, BASE			// No vararg slots?
4224    |  jnb >2
4225    |1:  // Copy vararg slots to destination slots.
4226    |  mov RC, [TMPR-16]
4227    |  add TMPR, 8
4228    |  mov [RA], RC
4229    |  add RA, 8
4230    |  cmp RA, RB			// All destination slots filled?
4231    |  jnb >3
4232    |  cmp TMPR, BASE			// No more vararg slots?
4233    |  jb <1
4234    |2:  // Fill up remainder with nil.
4235    |  mov aword [RA], LJ_TNIL
4236    |  add RA, 8
4237    |  cmp RA, RB
4238    |  jb <2
4239    |3:
4240    |  ins_next
4241    |
4242    |5:  // Copy all varargs.
4243    |  mov MULTRES, 1			// MULTRES = 0+1
4244    |  mov RC, BASE
4245    |  sub RC, TMPR
4246    |  jbe <3				// No vararg slots?
4247    |  mov RBd, RCd
4248    |  shr RBd, 3
4249    |  add RBd, 1
4250    |  mov MULTRES, RBd			// MULTRES = #varargs+1
4251    |  mov L:RB, SAVE_L
4252    |  add RC, RA
4253    |  cmp RC, L:RB->maxstack
4254    |  ja >7				// Need to grow stack?
4255    |6:  // Copy all vararg slots.
4256    |  mov RC, [TMPR-16]
4257    |  add TMPR, 8
4258    |  mov [RA], RC
4259    |  add RA, 8
4260    |  cmp TMPR, BASE			// No more vararg slots?
4261    |  jb <6
4262    |  jmp <3
4263    |
4264    |7:  // Grow stack for varargs.
4265    |  mov L:RB->base, BASE
4266    |  mov L:RB->top, RA
4267    |  mov SAVE_PC, PC
4268    |  sub TMPR, BASE			// Need delta, because BASE may change.
4269    |  mov TMP1hi, TMPRd
4270    |  mov CARG2d, MULTRES
4271    |  sub CARG2d, 1
4272    |  mov CARG1, L:RB
4273    |  call extern lj_state_growstack	// (lua_State *L, int n)
4274    |  mov BASE, L:RB->base
4275    |  movsxd TMPR, TMP1hi
4276    |  mov RA, L:RB->top
4277    |  add TMPR, BASE
4278    |  jmp <6
4279    break;
4280
4281  /* -- Returns ----------------------------------------------------------- */
4282
4283  case BC_RETM:
4284    |  ins_AD	// RA = results, RD = extra_nresults
4285    |  add RDd, MULTRES			// MULTRES >=1, so RD >=1.
4286    |  // Fall through. Assumes BC_RET follows and ins_AD is a no-op.
4287    break;
4288
4289  case BC_RET: case BC_RET0: case BC_RET1:
4290    |  ins_AD	// RA = results, RD = nresults+1
4291    if (op != BC_RET0) {
4292      |  shl RAd, 3
4293    }
4294    |1:
4295    |  mov PC, [BASE-8]
4296    |  mov MULTRES, RDd			// Save nresults+1.
4297    |  test PCd, FRAME_TYPE		// Check frame type marker.
4298    |  jnz >7				// Not returning to a fixarg Lua func?
4299    switch (op) {
4300    case BC_RET:
4301      |->BC_RET_Z:
4302      |  mov KBASE, BASE		// Use KBASE for result move.
4303      |  sub RDd, 1
4304      |  jz >3
4305      |2:  // Move results down.
4306      |  mov RB, [KBASE+RA]
4307      |  mov [KBASE-16], RB
4308      |  add KBASE, 8
4309      |  sub RDd, 1
4310      |  jnz <2
4311      |3:
4312      |  mov RDd, MULTRES		// Note: MULTRES may be >255.
4313      |  movzx RBd, PC_RB		// So cannot compare with RDL!
4314      |5:
4315      |  cmp RBd, RDd			// More results expected?
4316      |  ja >6
4317      break;
4318    case BC_RET1:
4319      |  mov RB, [BASE+RA]
4320      |  mov [BASE-16], RB
4321      /* fallthrough */
4322    case BC_RET0:
4323      |5:
4324      |  cmp PC_RB, RDL			// More results expected?
4325      |  ja >6
4326    default:
4327      break;
4328    }
4329    |  movzx RAd, PC_RA
4330    |  neg RA
4331    |  lea BASE, [BASE+RA*8-16]		// base = base - (RA+2)*8
4332    |  mov LFUNC:KBASE, [BASE-16]
4333    |  cleartp LFUNC:KBASE
4334    |  mov KBASE, LFUNC:KBASE->pc
4335    |  mov KBASE, [KBASE+PC2PROTO(k)]
4336    |  ins_next
4337    |
4338    |6:  // Fill up results with nil.
4339    if (op == BC_RET) {
4340      |  mov aword [KBASE-16], LJ_TNIL	// Note: relies on shifted base.
4341      |  add KBASE, 8
4342    } else {
4343      |  mov aword [BASE+RD*8-24], LJ_TNIL
4344    }
4345    |  add RD, 1
4346    |  jmp <5
4347    |
4348    |7:  // Non-standard return case.
4349    |  lea RB, [PC-FRAME_VARG]
4350    |  test RBd, FRAME_TYPEP
4351    |  jnz ->vm_return
4352    |  // Return from vararg function: relocate BASE down and RA up.
4353    |  sub BASE, RB
4354    if (op != BC_RET0) {
4355      |  add RA, RB
4356    }
4357    |  jmp <1
4358    break;
4359
4360  /* -- Loops and branches ------------------------------------------------ */
4361
4362  |.define FOR_IDX,  [RA]
4363  |.define FOR_STOP, [RA+8]
4364  |.define FOR_STEP, [RA+16]
4365  |.define FOR_EXT,  [RA+24]
4366
4367  case BC_FORL:
4368    |.if JIT
4369    |  hotloop RBd
4370    |.endif
4371    | // Fall through. Assumes BC_IFORL follows and ins_AJ is a no-op.
4372    break;
4373
4374  case BC_JFORI:
4375  case BC_JFORL:
4376#if !LJ_HASJIT
4377    break;
4378#endif
4379  case BC_FORI:
4380  case BC_IFORL:
4381    vk = (op == BC_IFORL || op == BC_JFORL);
4382    |  ins_AJ	// RA = base, RD = target (after end of loop or start of loop)
4383    |  lea RA, [BASE+RA*8]
4384    if (LJ_DUALNUM) {
4385      |  mov RB, FOR_IDX
4386      |  checkint RB, >9
4387      |  mov TMPR, FOR_STOP
4388      if (!vk) {
4389	|  checkint TMPR, ->vmeta_for
4390	|  mov ITYPE, FOR_STEP
4391	|  test ITYPEd, ITYPEd; js >5
4392	|  sar ITYPE, 47;
4393	|  cmp ITYPEd, LJ_TISNUM; jne ->vmeta_for
4394      } else {
4395#ifdef LUA_USE_ASSERT
4396	|  checkinttp FOR_STOP, ->assert_bad_for_arg_type
4397	|  checkinttp FOR_STEP, ->assert_bad_for_arg_type
4398#endif
4399	|  mov ITYPE, FOR_STEP
4400	|  test ITYPEd, ITYPEd; js >5
4401	|  add RBd, ITYPEd; jo >1
4402	|  setint RB
4403	|  mov FOR_IDX, RB
4404      }
4405      |  cmp RBd, TMPRd
4406      |  mov FOR_EXT, RB
4407      if (op == BC_FORI) {
4408	|  jle >7
4409	|1:
4410	|6:
4411	|  branchPC RD
4412      } else if (op == BC_JFORI) {
4413	|  branchPC RD
4414	|  movzx RDd, PC_RD
4415	|  jle =>BC_JLOOP
4416	|1:
4417	|6:
4418      } else if (op == BC_IFORL) {
4419	|  jg >7
4420	|6:
4421	|  branchPC RD
4422	|1:
4423      } else {
4424	|  jle =>BC_JLOOP
4425	|1:
4426	|6:
4427      }
4428      |7:
4429      |  ins_next
4430      |
4431      |5:  // Invert check for negative step.
4432      if (!vk) {
4433	|  sar ITYPE, 47;
4434	|  cmp ITYPEd, LJ_TISNUM; jne ->vmeta_for
4435      } else {
4436	|  add RBd, ITYPEd; jo <1
4437	|  setint RB
4438	|  mov FOR_IDX, RB
4439      }
4440      |  cmp RBd, TMPRd
4441      |  mov FOR_EXT, RB
4442      if (op == BC_FORI) {
4443	|  jge <7
4444      } else if (op == BC_JFORI) {
4445	|  branchPC RD
4446	|  movzx RDd, PC_RD
4447	|  jge =>BC_JLOOP
4448      } else if (op == BC_IFORL) {
4449	|  jl <7
4450      } else {
4451	|  jge =>BC_JLOOP
4452      }
4453      |  jmp <6
4454      |9:  // Fallback to FP variant.
4455      if (!vk) {
4456	|  jae ->vmeta_for
4457      }
4458    } else if (!vk) {
4459      |  checknumtp FOR_IDX, ->vmeta_for
4460    }
4461    if (!vk) {
4462      |  checknumtp FOR_STOP, ->vmeta_for
4463    } else {
4464#ifdef LUA_USE_ASSERT
4465      |  checknumtp FOR_STOP, ->assert_bad_for_arg_type
4466      |  checknumtp FOR_STEP, ->assert_bad_for_arg_type
4467#endif
4468    }
4469    |  mov RB, FOR_STEP
4470    if (!vk) {
4471      |  checknum RB, ->vmeta_for
4472    }
4473    |  movsd xmm0, qword FOR_IDX
4474    |  movsd xmm1, qword FOR_STOP
4475    if (vk) {
4476      |  addsd xmm0, qword FOR_STEP
4477      |  movsd qword FOR_IDX, xmm0
4478      |  test RB, RB; js >3
4479    } else {
4480      |  jl >3
4481    }
4482    |  ucomisd xmm1, xmm0
4483    |1:
4484    |  movsd qword FOR_EXT, xmm0
4485    if (op == BC_FORI) {
4486      |.if DUALNUM
4487      |  jnb <7
4488      |.else
4489      |  jnb >2
4490      |  branchPC RD
4491      |.endif
4492    } else if (op == BC_JFORI) {
4493      |  branchPC RD
4494      |  movzx RDd, PC_RD
4495      |  jnb =>BC_JLOOP
4496    } else if (op == BC_IFORL) {
4497      |.if DUALNUM
4498      |  jb <7
4499      |.else
4500      |  jb >2
4501      |  branchPC RD
4502      |.endif
4503    } else {
4504      |  jnb =>BC_JLOOP
4505    }
4506    |.if DUALNUM
4507    |  jmp <6
4508    |.else
4509    |2:
4510    |  ins_next
4511    |.endif
4512    |
4513    |3:  // Invert comparison if step is negative.
4514    |  ucomisd xmm0, xmm1
4515    |  jmp <1
4516    break;
4517
4518  case BC_ITERL:
4519    |.if JIT
4520    |  hotloop RBd
4521    |.endif
4522    | // Fall through. Assumes BC_IITERL follows and ins_AJ is a no-op.
4523    break;
4524
4525  case BC_JITERL:
4526#if !LJ_HASJIT
4527    break;
4528#endif
4529  case BC_IITERL:
4530    |  ins_AJ	// RA = base, RD = target
4531    |  lea RA, [BASE+RA*8]
4532    |  mov RB, [RA]
4533    |  cmp RB, LJ_TNIL; je >1		// Stop if iterator returned nil.
4534    if (op == BC_JITERL) {
4535      |  mov [RA-8], RB
4536      |  jmp =>BC_JLOOP
4537    } else {
4538      |  branchPC RD			// Otherwise save control var + branch.
4539      |  mov [RA-8], RB
4540    }
4541    |1:
4542    |  ins_next
4543    break;
4544
4545  case BC_LOOP:
4546    |  ins_A	// RA = base, RD = target (loop extent)
    |  // Note: RA/RD is only used by trace recorder to determine scope/extent.
    |  // This opcode does NOT jump, its only purpose is to detect a hot loop.
4549    |.if JIT
4550    |  hotloop RBd
4551    |.endif
4552    | // Fall through. Assumes BC_ILOOP follows and ins_A is a no-op.
4553    break;
4554
4555  case BC_ILOOP:
4556    |  ins_A	// RA = base, RD = target (loop extent)
4557    |  ins_next
4558    break;
4559
4560  case BC_JLOOP:
4561    |.if JIT
4562    |  ins_AD	// RA = base (ignored), RD = traceno
4563#ifdef LUA_USE_TRACE_LOGS
4564    |.if not X64WIN
4565    |  mov L:RB, SAVE_L
4566    |  mov L:RB->base, BASE  // Save BASE
4567    |  mov TMP1, RD     // Save RD
4568    |  mov CARG3, PC  // CARG3 == BASE
4569    |  mov CARG2, RD
4570    |  mov CARG1, RB
4571    |  call extern lj_log_trace_entry@8
4572    |  mov RD, TMP1
4573    |  mov BASE, L:RB->base
4574    |.endif
4575#endif
4576    |  mov RA, [DISPATCH+DISPATCH_J(trace)]
4577    |  mov TRACE:RD, [RA+RD*8]
4578    |  mov RD, TRACE:RD->mcode
4579    |  mov L:RB, SAVE_L
4580    |  mov [DISPATCH+DISPATCH_GL(jit_base)], BASE
4581    |  mov [DISPATCH+DISPATCH_GL(tmpbuf.L)], L:RB
4582    |  // Save additional callee-save registers only used in compiled code.
4583    |.if X64WIN
4584    |  mov CSAVE_4, r12
4585    |  mov CSAVE_3, r13
4586    |  mov CSAVE_2, r14
4587    |  mov CSAVE_1, r15
4588    |  mov RA, rsp
4589    |  sub rsp, 10*16+4*8
4590    |  movdqa [RA-1*16], xmm6
4591    |  movdqa [RA-2*16], xmm7
4592    |  movdqa [RA-3*16], xmm8
4593    |  movdqa [RA-4*16], xmm9
4594    |  movdqa [RA-5*16], xmm10
4595    |  movdqa [RA-6*16], xmm11
4596    |  movdqa [RA-7*16], xmm12
4597    |  movdqa [RA-8*16], xmm13
4598    |  movdqa [RA-9*16], xmm14
4599    |  movdqa [RA-10*16], xmm15
4600    |.else
4601    |  sub rsp, 16
4602    |  mov [rsp+16], r12
4603    |  mov [rsp+8], r13
4604    |.endif
4605    |  jmp RD
4606    |.endif
4607    break;
4608
4609  case BC_JMP:
4610    |  ins_AJ	// RA = unused, RD = target
4611    |  branchPC RD
4612    |  ins_next
4613    break;
4614
4615  /* -- Function headers -------------------------------------------------- */
4616
4617   /*
4618   ** Reminder: A function may be called with func/args above L->maxstack,
4619   ** i.e. occupying EXTRA_STACK slots. And vmeta_call may add one extra slot,
4620   ** too. This means all FUNC* ops (including fast functions) must check
4621   ** for stack overflow _before_ adding more slots!
4622   */
4623
4624  case BC_FUNCF:
4625    |.if JIT
4626    |  hotcall RBd
4627    |.endif
4628  case BC_FUNCV:  /* NYI: compiled vararg functions. */
4629    | // Fall through. Assumes BC_IFUNCF/BC_IFUNCV follow and ins_AD is a no-op.
4630    break;
4631
4632  case BC_JFUNCF:
4633#if !LJ_HASJIT
4634    break;
4635#endif
4636  case BC_IFUNCF:
4637    |  ins_AD  // BASE = new base, RA = framesize, RD = nargs+1
4638    |  mov KBASE, [PC-4+PC2PROTO(k)]
4639    |  mov L:RB, SAVE_L
4640    |  lea RA, [BASE+RA*8]		// Top of frame.
4641    |  cmp RA, L:RB->maxstack
4642    |  ja ->vm_growstack_f
4643    |  movzx RAd, byte [PC-4+PC2PROTO(numparams)]
4644    |  cmp NARGS:RDd, RAd		// Check for missing parameters.
4645    |  jbe >3
4646    |2:
4647    if (op == BC_JFUNCF) {
4648      |  movzx RDd, PC_RD
4649      |  jmp =>BC_JLOOP
4650    } else {
4651      |  ins_next
4652    }
4653    |
4654    |3:  // Clear missing parameters.
4655    |  mov aword [BASE+NARGS:RD*8-8], LJ_TNIL
4656    |  add NARGS:RDd, 1
4657    |  cmp NARGS:RDd, RAd
4658    |  jbe <3
4659    |  jmp <2
4660    break;
4661
4662  case BC_JFUNCV:
4663#if !LJ_HASJIT
4664    break;
4665#endif
4666    | int3  // NYI: compiled vararg functions
4667    break;  /* NYI: compiled vararg functions. */
4668
4669  case BC_IFUNCV:
4670    |  ins_AD  // BASE = new base, RA = framesize, RD = nargs+1
4671    |  lea RBd, [NARGS:RD*8+FRAME_VARG+8]
4672    |  lea RD, [BASE+NARGS:RD*8+8]
4673    |  mov LFUNC:KBASE, [BASE-16]
4674    |  mov [RD-8], RB			// Store delta + FRAME_VARG.
4675    |  mov [RD-16], LFUNC:KBASE		// Store copy of LFUNC.
4676    |  mov L:RB, SAVE_L
4677    |  lea RA, [RD+RA*8]
4678    |  cmp RA, L:RB->maxstack
4679    |  ja ->vm_growstack_v		// Need to grow stack.
4680    |  mov RA, BASE
4681    |  mov BASE, RD
4682    |  movzx RBd, byte [PC-4+PC2PROTO(numparams)]
4683    |  test RBd, RBd
4684    |  jz >2
4685    |  add RA, 8
4686    |1:  // Copy fixarg slots up to new frame.
4687    |  add RA, 8
4688    |  cmp RA, BASE
4689    |  jnb >3				// Less args than parameters?
4690    |  mov KBASE, [RA-16]
4691    |  mov [RD], KBASE
4692    |  add RD, 8
4693    |  mov aword [RA-16], LJ_TNIL	// Clear old fixarg slot (help the GC).
4694    |  sub RBd, 1
4695    |  jnz <1
4696    |2:
4697    if (op == BC_JFUNCV) {
4698      |  movzx RDd, PC_RD
4699      |  jmp =>BC_JLOOP
4700    } else {
4701      |  mov KBASE, [PC-4+PC2PROTO(k)]
4702      |  ins_next
4703    }
4704    |
4705    |3:  // Clear missing parameters.
4706    |  mov aword [RD], LJ_TNIL
4707    |  add RD, 8
4708    |  sub RBd, 1
4709    |  jnz <3
4710    |  jmp <2
4711    break;
4712
4713  case BC_FUNCC:
4714  case BC_FUNCCW:
4715    |  ins_AD  // BASE = new base, RA = ins RA|RD (unused), RD = nargs+1
4716    |  mov CFUNC:RB, [BASE-16]
4717    |  cleartp CFUNC:RB
4718    |  mov KBASE, CFUNC:RB->f
4719    |  mov L:RB, SAVE_L
4720    |  lea RD, [BASE+NARGS:RD*8-8]
4721    |  mov L:RB->base, BASE
4722    |  lea RA, [RD+8*LUA_MINSTACK]
4723    |  cmp RA, L:RB->maxstack
4724    |  mov L:RB->top, RD
4725    if (op == BC_FUNCC) {
4726      |  mov CARG1, L:RB		// Caveat: CARG1 may be RA.
4727    } else {
4728      |  mov CARG2, KBASE
4729      |  mov CARG1, L:RB		// Caveat: CARG1 may be RA.
4730    }
4731    |  ja ->vm_growstack_c		// Need to grow stack.
4732    |  set_vmstate C
4733    if (op == BC_FUNCC) {
4734      |  call KBASE			// (lua_State *L)
4735    } else {
4736      |  // (lua_State *L, lua_CFunction f)
4737      |  call aword [DISPATCH+DISPATCH_GL(wrapf)]
4738    }
4739    |  // nresults returned in eax (RD).
4740    |  mov BASE, L:RB->base
4741    |  mov [DISPATCH+DISPATCH_GL(cur_L)], L:RB
4742    |  set_vmstate INTERP
4743    |  lea RA, [BASE+RD*8]
4744    |  neg RA
4745    |  add RA, L:RB->top		// RA = (L->top-(L->base+nresults))*8
4746    |  mov PC, [BASE-8]			// Fetch PC of caller.
4747    |  jmp ->vm_returnc
4748    break;
4749
4750  /* ---------------------------------------------------------------------- */
4751
4752  default:
4753    fprintf(stderr, "Error: undefined opcode BC_%s\n", bc_names[op]);
4754    exit(2);
4755    break;
4756  }
4757}
4758
4759static int build_backend(BuildCtx *ctx)
4760{
4761  int op;
4762  dasm_growpc(Dst, BC__MAX);
4763  build_subroutines(ctx);
4764  |.code_op
4765  for (op = 0; op < BC__MAX; op++)
4766    build_ins(ctx, (BCOp)op, op);
4767  return BC__MAX;
4768}
4769
/*
** Emit pseudo frame-info (DWARF call frame information) for all assembler
** functions. The output is assembler source text written to ctx->fp.
**
** BUILD_elfasm:
**   Always emits a .debug_frame section: one CIE, one FDE starting at .Lbegin
**   with a length of fcofs bytes and, if LJ_HASFFI, one FDE for
**   lj_vm_ffi_call with a length of (codesz - fcofs) bytes.
**   Unless LJ_NO_UNWIND is set, the same ranges are described again in an
**   .eh_frame section. The CIE used for the main code names
**   lj_err_unwind_dwarf as personality routine, the one used for
**   lj_vm_ffi_call has no personality routine.
** BUILD_machasm (only compiled in if !LJ_NO_UNWIND):
**   Emits a Mach-O __eh_frame section with one FDE per non-empty symbol.
**   lj_vm_ffi_call is again handled separately (if LJ_HASFFI).
** Any other build mode: nothing is emitted.
**
** The raw bytes in the strings are DWARF CFA opcodes; the trailing comments
** name the operation each one encodes.
*/
static void emit_asm_debug(BuildCtx *ctx)
{
  /* Offset of the vm_ffi_call global label relative to ctx->code. It is used
  ** below both as the length of the first FDE and as the split point between
  ** the two FDEs.
  */
  int fcofs = (int)((uint8_t *)ctx->glob[GLOB_vm_ffi_call] - ctx->code);
  switch (ctx->mode) {
  case BUILD_elfasm:
    fprintf(ctx->fp, "\t.section .debug_frame,\"\",@progbits\n");
    /* CIE for .debug_frame: id 0xffffffff, version 1, empty augmentation,
    ** code alignment 1, data alignment -8, return address column 0x10.
    ** Initial rules: 0xc = def_cfa (register 7, offset 8) and
    ** 0x80+0x10 = offset (register 0x10 saved at CFA + 1 * -8).
    */
    fprintf(ctx->fp,
	".Lframe0:\n"
	"\t.long .LECIE0-.LSCIE0\n"
	".LSCIE0:\n"
	"\t.long 0xffffffff\n"
	"\t.byte 0x1\n"
	"\t.string \"\"\n"
	"\t.uleb128 0x1\n"
	"\t.sleb128 -8\n"
	"\t.byte 0x10\n"
	"\t.byte 0xc\n\t.uleb128 0x7\n\t.uleb128 8\n"
	"\t.byte 0x80+0x10\n\t.uleb128 0x1\n"
	"\t.align 8\n"
	".LECIE0:\n\n");
    /* FDE 0: starts at .Lbegin, length fcofs bytes, CFA offset CFRAME_SIZE.
    ** The r13/r12 save slots are only described if LJ_NO_UNWIND is set.
    */
    fprintf(ctx->fp,
	".LSFDE0:\n"
	"\t.long .LEFDE0-.LASFDE0\n"
	".LASFDE0:\n"
	"\t.long .Lframe0\n"
	"\t.quad .Lbegin\n"
	"\t.quad %d\n"
	"\t.byte 0xe\n\t.uleb128 %d\n"		/* def_cfa_offset */
	"\t.byte 0x86\n\t.uleb128 0x2\n"	/* offset rbp */
	"\t.byte 0x83\n\t.uleb128 0x3\n"	/* offset rbx */
	"\t.byte 0x8f\n\t.uleb128 0x4\n"	/* offset r15 */
	"\t.byte 0x8e\n\t.uleb128 0x5\n"	/* offset r14 */
#if LJ_NO_UNWIND
	"\t.byte 0x8d\n\t.uleb128 0x6\n"	/* offset r13 */
	"\t.byte 0x8c\n\t.uleb128 0x7\n"	/* offset r12 */
#endif
	"\t.align 8\n"
	".LEFDE0:\n\n", fcofs, CFRAME_SIZE);
#if LJ_HASFFI
    /* FDE 1: lj_vm_ffi_call, length (codesz - fcofs) bytes. Unlike FDE 0 it
    ** switches the CFA register to rbp after a fixed CFA offset of 16.
    */
    fprintf(ctx->fp,
	".LSFDE1:\n"
	"\t.long .LEFDE1-.LASFDE1\n"
	".LASFDE1:\n"
	"\t.long .Lframe0\n"
	"\t.quad lj_vm_ffi_call\n"
	"\t.quad %d\n"
	"\t.byte 0xe\n\t.uleb128 16\n"		/* def_cfa_offset */
	"\t.byte 0x86\n\t.uleb128 0x2\n"	/* offset rbp */
	"\t.byte 0xd\n\t.uleb128 0x6\n"		/* def_cfa_register rbp */
	"\t.byte 0x83\n\t.uleb128 0x3\n"	/* offset rbx */
	"\t.align 8\n"
	".LEFDE1:\n\n", (int)ctx->codesz - fcofs);
#endif
#if !LJ_NO_UNWIND
    /* Second copy of the frame info as .eh_frame. Solaris gets the section
    ** type @unwind, everything else @progbits.
    */
#if LJ_TARGET_SOLARIS
    fprintf(ctx->fp, "\t.section .eh_frame,\"a\",@unwind\n");
#else
    fprintf(ctx->fp, "\t.section .eh_frame,\"a\",@progbits\n");
#endif
    /* CIE 1 for .eh_frame: id 0, augmentation "zPR" with 6 bytes of
    ** augmentation data: personality encoding, a pc-relative reference to
    ** lj_err_unwind_dwarf and the FDE pointer encoding. Same initial rules
    ** as the .debug_frame CIE.
    */
    fprintf(ctx->fp,
	".Lframe1:\n"
	"\t.long .LECIE1-.LSCIE1\n"
	".LSCIE1:\n"
	"\t.long 0\n"
	"\t.byte 0x1\n"
	"\t.string \"zPR\"\n"
	"\t.uleb128 0x1\n"
	"\t.sleb128 -8\n"
	"\t.byte 0x10\n"
	"\t.uleb128 6\n"			/* augmentation length */
	"\t.byte 0x1b\n"			/* pcrel|sdata4 */
	"\t.long lj_err_unwind_dwarf-.\n"
	"\t.byte 0x1b\n"			/* pcrel|sdata4 */
	"\t.byte 0xc\n\t.uleb128 0x7\n\t.uleb128 8\n"
	"\t.byte 0x80+0x10\n\t.uleb128 0x1\n"
	"\t.align 8\n"
	".LECIE1:\n\n");
    /* FDE 2: .eh_frame counterpart of FDE 0. The start address is written
    ** pc-relative (.Lbegin-.) and the length is fcofs bytes.
    */
    fprintf(ctx->fp,
	".LSFDE2:\n"
	"\t.long .LEFDE2-.LASFDE2\n"
	".LASFDE2:\n"
	"\t.long .LASFDE2-.Lframe1\n"
	"\t.long .Lbegin-.\n"
	"\t.long %d\n"
	"\t.uleb128 0\n"			/* augmentation length */
	"\t.byte 0xe\n\t.uleb128 %d\n"		/* def_cfa_offset */
	"\t.byte 0x86\n\t.uleb128 0x2\n"	/* offset rbp */
	"\t.byte 0x83\n\t.uleb128 0x3\n"	/* offset rbx */
	"\t.byte 0x8f\n\t.uleb128 0x4\n"	/* offset r15 */
	"\t.byte 0x8e\n\t.uleb128 0x5\n"	/* offset r14 */
	"\t.align 8\n"
	".LEFDE2:\n\n", fcofs, CFRAME_SIZE);
#if LJ_HASFFI
    /* CIE 2: augmentation "zR" only, i.e. an FDE pointer encoding but no
    ** personality routine. Used solely by the lj_vm_ffi_call FDE below.
    */
    fprintf(ctx->fp,
	".Lframe2:\n"
	"\t.long .LECIE2-.LSCIE2\n"
	".LSCIE2:\n"
	"\t.long 0\n"
	"\t.byte 0x1\n"
	"\t.string \"zR\"\n"
	"\t.uleb128 0x1\n"
	"\t.sleb128 -8\n"
	"\t.byte 0x10\n"
	"\t.uleb128 1\n"			/* augmentation length */
	"\t.byte 0x1b\n"			/* pcrel|sdata4 */
	"\t.byte 0xc\n\t.uleb128 0x7\n\t.uleb128 8\n"
	"\t.byte 0x80+0x10\n\t.uleb128 0x1\n"
	"\t.align 8\n"
	".LECIE2:\n\n");
    /* FDE 3: .eh_frame counterpart of FDE 1 (lj_vm_ffi_call). */
    fprintf(ctx->fp,
	".LSFDE3:\n"
	"\t.long .LEFDE3-.LASFDE3\n"
	".LASFDE3:\n"
	"\t.long .LASFDE3-.Lframe2\n"
	"\t.long lj_vm_ffi_call-.\n"
	"\t.long %d\n"
	"\t.uleb128 0\n"			/* augmentation length */
	"\t.byte 0xe\n\t.uleb128 16\n"		/* def_cfa_offset */
	"\t.byte 0x86\n\t.uleb128 0x2\n"	/* offset rbp */
	"\t.byte 0xd\n\t.uleb128 0x6\n"		/* def_cfa_register rbp */
	"\t.byte 0x83\n\t.uleb128 0x3\n"	/* offset rbx */
	"\t.align 8\n"
	".LEFDE3:\n\n", (int)ctx->codesz - fcofs);
#endif
#endif
    break;
#if !LJ_NO_UNWIND
  /* Mental note: never let Apple design an assembler.
  ** Or a linker. Or a plastic case. But I digress.
  */
  case BUILD_machasm: {
#if LJ_HASFFI
    /* Size of _lj_vm_ffi_call as found in the symbol loop; 0 = not seen. */
    int fcsize = 0;
#endif
    int i;
    fprintf(ctx->fp, "\t.section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support\n");
    /* CIE with augmentation "zPR". Compared to the ELF variant all fields
    ** are written with plain .byte directives (128-8 is the one-byte
    ** sleb128 encoding of -8) and the personality routine is referenced
    ** indirectly through _lj_err_unwind_dwarf+4@GOTPCREL.
    */
    fprintf(ctx->fp,
	"EH_frame1:\n"
	"\t.set L$set$x,LECIEX-LSCIEX\n"
	"\t.long L$set$x\n"
	"LSCIEX:\n"
	"\t.long 0\n"
	"\t.byte 0x1\n"
	"\t.ascii \"zPR\\0\"\n"
	"\t.byte 0x1\n"
	"\t.byte 128-8\n"
	"\t.byte 0x10\n"
	"\t.byte 6\n"				/* augmentation length */
	"\t.byte 0x9b\n"			/* indirect|pcrel|sdata4 */
	"\t.long _lj_err_unwind_dwarf+4@GOTPCREL\n"
	"\t.byte 0x1b\n"			/* pcrel|sdata4 */
	"\t.byte 0xc\n\t.byte 0x7\n\t.byte 8\n"
	"\t.byte 0x80+0x10\n\t.byte 0x1\n"
	"\t.align 3\n"
	"LECIEX:\n\n");
    /* One FDE per symbol. The size of symbol i is the distance to the offset
    ** of symbol i+1, so the last iteration reads ctx->sym[ctx->nsym].
    ** NOTE(review): this assumes a valid end-of-code entry is stored at index
    ** nsym -- confirm where ctx->sym is filled.
    */
    for (i = 0; i < ctx->nsym; i++) {
      const char *name = ctx->sym[i].name;
      int32_t size = ctx->sym[i+1].ofs - ctx->sym[i].ofs;
      /* Zero-sized symbols get no FDE. */
      if (size == 0) continue;
#if LJ_HASFFI
      /* lj_vm_ffi_call needs different CFA rules: only remember its size
      ** here and emit its CIE/FDE pair after the loop.
      */
      if (!strcmp(name, "_lj_vm_ffi_call")) { fcsize = size; continue; }
#endif
      /* The symbol index i makes the local labels of each FDE unique. */
      fprintf(ctx->fp,
	  "%s.eh:\n"
	  "LSFDE%d:\n"
	  "\t.set L$set$%d,LEFDE%d-LASFDE%d\n"
	  "\t.long L$set$%d\n"
	  "LASFDE%d:\n"
	  "\t.long LASFDE%d-EH_frame1\n"
	  "\t.long %s-.\n"
	  "\t.long %d\n"
	  "\t.byte 0\n"				/* augmentation length */
	  "\t.byte 0xe\n\t.byte %d\n"		/* def_cfa_offset */
	  "\t.byte 0x86\n\t.byte 0x2\n"		/* offset rbp */
	  "\t.byte 0x83\n\t.byte 0x3\n"		/* offset rbx */
	  "\t.byte 0x8f\n\t.byte 0x4\n"		/* offset r15 */
	  "\t.byte 0x8e\n\t.byte 0x5\n"		/* offset r14 */
	  "\t.align 3\n"
	  "LEFDE%d:\n\n",
	  name, i, i, i, i, i, i, i, name, size, CFRAME_SIZE, i);
    }
#if LJ_HASFFI
    if (fcsize) {
      /* Separate CIE ("zR", no personality routine) for _lj_vm_ffi_call. */
      fprintf(ctx->fp,
	  "EH_frame2:\n"
	  "\t.set L$set$y,LECIEY-LSCIEY\n"
	  "\t.long L$set$y\n"
	  "LSCIEY:\n"
	  "\t.long 0\n"
	  "\t.byte 0x1\n"
	  "\t.ascii \"zR\\0\"\n"
	  "\t.byte 0x1\n"
	  "\t.byte 128-8\n"
	  "\t.byte 0x10\n"
	  "\t.byte 1\n"				/* augmentation length */
	  "\t.byte 0x1b\n"			/* pcrel|sdata4 */
	  "\t.byte 0xc\n\t.byte 0x7\n\t.byte 8\n"
	  "\t.byte 0x80+0x10\n\t.byte 0x1\n"
	  "\t.align 3\n"
	  "LECIEY:\n\n");
      /* FDE for _lj_vm_ffi_call: fcsize bytes, CFA register switched to rbp. */
      fprintf(ctx->fp,
	  "_lj_vm_ffi_call.eh:\n"
	  "LSFDEY:\n"
	  "\t.set L$set$yy,LEFDEY-LASFDEY\n"
	  "\t.long L$set$yy\n"
	  "LASFDEY:\n"
	  "\t.long LASFDEY-EH_frame2\n"
	  "\t.long _lj_vm_ffi_call-.\n"
	  "\t.long %d\n"
	  "\t.byte 0\n"				/* augmentation length */
	  "\t.byte 0xe\n\t.byte 16\n"		/* def_cfa_offset */
	  "\t.byte 0x86\n\t.byte 0x2\n"		/* offset rbp */
	  "\t.byte 0xd\n\t.byte 0x6\n"		/* def_cfa_register rbp */
	  "\t.byte 0x83\n\t.byte 0x3\n"		/* offset rbx */
	  "\t.align 3\n"
	  "LEFDEY:\n\n", fcsize);
    }
#endif
    fprintf(ctx->fp, ".subsections_via_symbols\n");
    }
    break;
#endif
  default:  /* Difficult for other modes. */
    break;
  }
}
4997
4998