1 /* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 Copyright (c) 2008-2017, Petr Kobalicek
3 
4 This software is provided 'as-is', without any express or implied
5 warranty. In no event will the authors be held liable for any damages
6 arising from the use of this software.
7 
8 Permission is granted to anyone to use this software for any purpose,
9 including commercial applications, and to alter it and redistribute it
10 freely, subject to the following restrictions:
11 
12 1. The origin of this software must not be misrepresented; you must not
13    claim that you wrote the original software. If you use this software
14    in a product, an acknowledgment in the product documentation would be
15    appreciated but is not required.
16 2. Altered source versions must be plainly marked as such, and must not be
17    misrepresented as being the original software.
18 3. This notice may not be removed or altered from any source distribution.
19 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
20 #ifdef __PLUMED_HAS_ASMJIT
21 #pragma GCC diagnostic push
22 #pragma GCC diagnostic ignored "-Wpedantic"
23 // [AsmJit]
24 // Complete x86/x64 JIT and Remote Assembler for C++.
25 //
26 // [License]
27 // Zlib - See LICENSE.md file in the package.
28 
29 // [Export]
30 #define ASMJIT_EXPORTS
31 
32 // [Guard]
33 #include "./asmjit_build.h"
34 #if defined(ASMJIT_BUILD_X86)
35 
36 // [Dependencies]
37 #include "./x86internal_p.h"
38 
39 // [Api-Begin]
40 #include "./asmjit_apibegin.h"
41 
42 namespace PLMD {
43 namespace asmjit {
44 
45 // ============================================================================
46 // [asmjit::X86Internal - Helpers]
47 // ============================================================================
48 
static ASMJIT_INLINE uint32_t x86GetXmmMovInst(const FuncFrameLayout& layout) {
50   bool avx = layout.isAvxEnabled();
51   bool aligned = layout.hasAlignedVecSR();
52 
53   return aligned ? (avx ? X86Inst::kIdVmovaps : X86Inst::kIdMovaps)
54                  : (avx ? X86Inst::kIdVmovups : X86Inst::kIdMovups);
55 }
56 
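// Maps a vector TypeId to the narrowest register type that can hold it: 128-bit
// types (e.g. TypeId::kF32x4) map to XMM, 256-bit types to YMM, wider ones to ZMM.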
static ASMJIT_INLINE uint32_t x86VecTypeIdToRegType(uint32_t typeId) noexcept {
58   return typeId <= TypeId::_kVec128End ? X86Reg::kRegXmm :
59          typeId <= TypeId::_kVec256End ? X86Reg::kRegYmm :
60                                          X86Reg::kRegZmm ;
61 }
62 
63 // ============================================================================
64 // [asmjit::X86FuncArgsContext]
65 // ============================================================================
66 
// Used by both `Utils::argsToFrameInfo()` and `Utils::allocArgs()`.
68 class X86FuncArgsContext {
69 public:
70   typedef FuncDetail::Value SrcArg;
71   typedef FuncArgsMapper::Value DstArg;
72 
73   enum { kMaxVRegKinds = Globals::kMaxVRegKinds };
74 
75   struct WorkData {
76     uint32_t archRegs;                   //!< Architecture provided and allocable regs.
77     uint32_t workRegs;                   //!< Registers that can be used by shuffler.
78     uint32_t usedRegs;                   //!< Only registers used to pass arguments.
79     uint32_t srcRegs;                    //!< Source registers that need shuffling.
80     uint32_t dstRegs;                    //!< Destination registers that need shuffling.
81     uint8_t numOps;                      //!< Number of operations to finish.
82     uint8_t numSwaps;                    //!< Number of register swaps.
83     uint8_t numStackArgs;                //!< Number of stack loads.
84     uint8_t reserved[9];                 //!< Reserved (only used as padding).
    uint8_t argIndex[32];                //!< Only valid if a corresponding bit in `usedRegs` is true.
86   };
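  // For example, if the argument at index 3 is requested in EDX (physical id 2),
  // the GP WorkData gets bit 2 set in `usedRegs` and `dstRegs`, and `argIndex[2]`
  // is set to 3 so the shuffler can later map the register back to its argument.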
87 
88   X86FuncArgsContext() noexcept;
89   Error initWorkData(const FuncArgsMapper& args, const uint32_t* dirtyRegs, bool preservedFP) noexcept;
90 
91   Error markRegsForSwaps(FuncFrameInfo& ffi) noexcept;
92   Error markDstRegsDirty(FuncFrameInfo& ffi) noexcept;
93   Error markStackArgsReg(FuncFrameInfo& ffi) noexcept;
94 
95   // --------------------------------------------------------------------------
96   // [Members]
97   // --------------------------------------------------------------------------
98 
99   WorkData _workData[kMaxVRegKinds];
100   bool _hasStackArgs;
101   bool _hasRegSwaps;
102 };
103 
X86FuncArgsContext::X86FuncArgsContext() noexcept {
105   ::memset(_workData, 0, sizeof(_workData));
106   _hasStackArgs = false;
107   _hasRegSwaps = false;
108 }
109 
ASMJIT_FAVOR_SIZE Error X86FuncArgsContext::initWorkData(const FuncArgsMapper& args, const uint32_t* dirtyRegs, bool preservedFP) noexcept {
111   // This code has to be updated if this changes.
112   ASMJIT_ASSERT(kMaxVRegKinds == 4);
113 
114   uint32_t i;
115   const FuncDetail& func = *args.getFuncDetail();
116 
117   uint32_t archType = func.getCallConv().getArchType();
118   uint32_t count = (archType == ArchInfo::kTypeX86) ? 8 : 16;
119 
120   // Initialize WorkData::archRegs.
121   _workData[X86Reg::kKindGp ].archRegs = Utils::bits(count) & ~Utils::mask(X86Gp::kIdSp);
122   _workData[X86Reg::kKindMm ].archRegs = Utils::bits(8);
123   _workData[X86Reg::kKindK  ].archRegs = Utils::bits(8);
124   _workData[X86Reg::kKindVec].archRegs = Utils::bits(count);
125 
126   if (preservedFP)
127     _workData[X86Reg::kKindGp].archRegs &= ~Utils::mask(X86Gp::kIdBp);
128 
129   // Initialize WorkData::workRegs.
130   for (i = 0; i < kMaxVRegKinds; i++)
131     _workData[i].workRegs = _workData[i].archRegs & (dirtyRegs[i] | ~func.getCallConv().getPreservedRegs(i));
132 
133   // Build WorkData.
134   for (i = 0; i < kFuncArgCountLoHi; i++) {
135     const DstArg& dstArg = args.getArg(i);
136     if (!dstArg.isAssigned()) continue;
137 
138     const SrcArg& srcArg = func.getArg(i);
139     if (ASMJIT_UNLIKELY(!srcArg.isAssigned()))
140       return DebugUtils::errored(kErrorInvalidState);
141 
142     uint32_t dstRegType = dstArg.getRegType();
143     if (ASMJIT_UNLIKELY(dstRegType >= X86Reg::kRegCount))
144       return DebugUtils::errored(kErrorInvalidRegType);
145 
146     uint32_t dstRegKind = X86Reg::kindOf(dstRegType);
147     if (ASMJIT_UNLIKELY(dstRegKind >= kMaxVRegKinds))
148       return DebugUtils::errored(kErrorInvalidState);
149 
150     WorkData& dstData = _workData[dstRegKind];
151     uint32_t dstRegId = dstArg.getRegId();
152     if (ASMJIT_UNLIKELY(dstRegId >= 32 || !(dstData.archRegs & Utils::mask(dstRegId))))
153       return DebugUtils::errored(kErrorInvalidPhysId);
154 
155     uint32_t dstRegMask = Utils::mask(dstRegId);
156     if (ASMJIT_UNLIKELY(dstData.usedRegs & dstRegMask))
157       return DebugUtils::errored(kErrorOverlappedRegs);
158 
159     dstData.usedRegs |= dstRegMask;
160     dstData.argIndex[dstRegId] = static_cast<uint8_t>(i);
161 
162     if (srcArg.byReg()) {
163       uint32_t srcRegKind = X86Reg::kindOf(srcArg.getRegType());
164       uint32_t srcRegId = srcArg.getRegId();
165       uint32_t srcRegMask = Utils::mask(srcRegId);
166 
167       if (dstRegKind == srcRegKind) {
168         // The best case, register is allocated where it is expected to be.
169         if (dstRegId == srcRegId) continue;
170 
171         // Detect a register swap.
172         if (dstData.usedRegs & srcRegMask) {
173           const SrcArg& ref = func.getArg(dstData.argIndex[srcRegId]);
174           if (ref.byReg() && X86Reg::kindOf(ref.getRegType()) == dstRegKind && ref.getRegId() == dstRegId) {
175             dstData.numSwaps++;
176             _hasRegSwaps = true;
177           }
178         }
179         dstData.srcRegs |= srcRegMask;
180       }
181       else {
182         if (ASMJIT_UNLIKELY(srcRegKind >= kMaxVRegKinds))
183           return DebugUtils::errored(kErrorInvalidState);
184 
185         WorkData& srcData = _workData[srcRegKind];
186         srcData.srcRegs |= srcRegMask;
187       }
188     }
189     else {
190       dstData.numStackArgs++;
191       _hasStackArgs = true;
192     }
193 
194     dstData.numOps++;
195     dstData.dstRegs |= dstRegMask;
196   }
197 
198   return kErrorOk;
199 }
200 
ASMJIT_FAVOR_SIZE Error X86FuncArgsContext::markDstRegsDirty(FuncFrameInfo& ffi) noexcept {
202   for (uint32_t i = 0; i < kMaxVRegKinds; i++) {
203     WorkData& wd = _workData[i];
204     uint32_t regs = wd.usedRegs | wd.dstRegs;
205 
206     wd.workRegs |= regs;
207     ffi.addDirtyRegs(i, regs);
208   }
209 
210   return kErrorOk;
211 }
212 
ASMJIT_FAVOR_SIZE Error X86FuncArgsContext::markRegsForSwaps(FuncFrameInfo& ffi) noexcept {
214   if (!_hasRegSwaps)
215     return kErrorOk;
216 
  // If some registers require swapping then select one dirty register that
  // can be used as a temporary. We could also do it without one (by using
  // XORs), but using a temporary register is both safer and faster.
220   for (uint32_t i = 0; i < kMaxVRegKinds; i++) {
221     // Skip all register kinds where swapping is natively supported (GP regs).
222     if (i == X86Reg::kKindGp) continue;
223 
224     // Skip all register kinds that don't require swapping.
225     WorkData& wd = _workData[i];
226     if (!wd.numSwaps) continue;
227 
228     // Initially, pick some clobbered or dirty register.
229     uint32_t workRegs = wd.workRegs;
230     uint32_t regs = workRegs & ~(wd.usedRegs | wd.dstRegs);
231 
232     // If that didn't work out pick some register which is not in 'used'.
233     if (!regs) regs = workRegs & ~wd.usedRegs;
234 
235     // If that didn't work out pick any other register that is allocable.
236     // This last resort case will, however, result in marking one more
237     // register dirty.
238     if (!regs) regs = wd.archRegs & ~workRegs;
239 
240     // If that didn't work out we will have to use xors instead of moves.
241     if (!regs) continue;
242 
243     uint32_t regMask = Utils::mask(Utils::findFirstBit(regs));
244     wd.workRegs |= regMask;
245     ffi.addDirtyRegs(i, regMask);
246   }
247 
248   return kErrorOk;
249 }
250 
ASMJIT_FAVOR_SIZE Error X86FuncArgsContext::markStackArgsReg(FuncFrameInfo& ffi) noexcept {
252   if (!_hasStackArgs)
253     return kErrorOk;
254 
255   // Decide which register to use to hold the stack base address.
256   if (!ffi.hasPreservedFP()) {
257     WorkData& wd = _workData[X86Reg::kKindGp];
258     uint32_t saRegId = ffi.getStackArgsRegId();
259     uint32_t usedRegs = wd.usedRegs;
260 
261     if (saRegId != Globals::kInvalidRegId) {
      // Check that the user-chosen SA register doesn't overlap with other used
      // registers. However, it's fine if it overlaps with some 'dstMove' register.
264       if (usedRegs & Utils::mask(saRegId))
265         return DebugUtils::errored(kErrorOverlappingStackRegWithRegArg);
266     }
267     else {
      // Initially, pick some clobbered or dirty register that is neither
      // in 'used' nor in 'dstMove'. That's the safest bet as the register
      // won't collide with anything right now.
271       uint32_t regs = wd.workRegs & ~(usedRegs | wd.dstRegs);
272 
273       // If that didn't work out pick some register which is not in 'used'.
274       if (!regs) regs = wd.workRegs & ~usedRegs;
275 
276       // If that didn't work out then we have to make one more register dirty.
277       if (!regs) regs = wd.archRegs & ~wd.workRegs;
278 
279       // If that didn't work out we can't continue.
280       if (ASMJIT_UNLIKELY(!regs))
281         return DebugUtils::errored(kErrorNoMorePhysRegs);
282 
283       saRegId = Utils::findFirstBit(regs);
284       ffi.setStackArgsRegId(saRegId);
285     }
286   }
287   else {
288     ffi.setStackArgsRegId(X86Gp::kIdBp);
289   }
290 
291   return kErrorOk;
292 }
293 
294 // ============================================================================
295 // [asmjit::X86Internal - CallConv]
296 // ============================================================================
297 
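// For example, initCallConv(cc, CallConv::kIdX86SysV64) below sets the GP argument
// order to RDI, RSI, RDX, RCX, R8, R9, passes floats/vectors in XMM0-XMM7, marks
// RBX, RSP, RBP and R12-R15 as preserved, and records the 128-byte red zone,
// matching the System V AMD64 ABI.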
ASMJIT_FAVOR_SIZE Error X86Internal::initCallConv(CallConv& cc, uint32_t ccId) noexcept {
299   const uint32_t kKindGp  = X86Reg::kKindGp;
300   const uint32_t kKindVec = X86Reg::kKindVec;
301   const uint32_t kKindMm  = X86Reg::kKindMm;
302   const uint32_t kKindK   = X86Reg::kKindK;
303 
304   const uint32_t kZax = X86Gp::kIdAx;
305   const uint32_t kZbx = X86Gp::kIdBx;
306   const uint32_t kZcx = X86Gp::kIdCx;
307   const uint32_t kZdx = X86Gp::kIdDx;
308   const uint32_t kZsp = X86Gp::kIdSp;
309   const uint32_t kZbp = X86Gp::kIdBp;
310   const uint32_t kZsi = X86Gp::kIdSi;
311   const uint32_t kZdi = X86Gp::kIdDi;
312 
313   switch (ccId) {
314     case CallConv::kIdX86StdCall:
315       cc.setFlags(CallConv::kFlagCalleePopsStack);
316       goto X86CallConv;
317 
318     case CallConv::kIdX86MsThisCall:
319       cc.setFlags(CallConv::kFlagCalleePopsStack);
320       cc.setPassedOrder(kKindGp, kZcx);
321       goto X86CallConv;
322 
323     case CallConv::kIdX86MsFastCall:
324     case CallConv::kIdX86GccFastCall:
325       cc.setFlags(CallConv::kFlagCalleePopsStack);
326       cc.setPassedOrder(kKindGp, kZcx, kZdx);
327       goto X86CallConv;
328 
329     case CallConv::kIdX86GccRegParm1:
330       cc.setPassedOrder(kKindGp, kZax);
331       goto X86CallConv;
332 
333     case CallConv::kIdX86GccRegParm2:
334       cc.setPassedOrder(kKindGp, kZax, kZdx);
335       goto X86CallConv;
336 
337     case CallConv::kIdX86GccRegParm3:
338       cc.setPassedOrder(kKindGp, kZax, kZdx, kZcx);
339       goto X86CallConv;
340 
341     case CallConv::kIdX86CDecl:
342 X86CallConv:
343       cc.setNaturalStackAlignment(4);
344       cc.setArchType(ArchInfo::kTypeX86);
345       cc.setPreservedRegs(kKindGp, Utils::mask(kZbx, kZsp, kZbp, kZsi, kZdi));
346       break;
347 
348     case CallConv::kIdX86Win64:
349       cc.setArchType(ArchInfo::kTypeX64);
350       cc.setAlgorithm(CallConv::kAlgorithmWin64);
351       cc.setFlags(CallConv::kFlagPassFloatsByVec | CallConv::kFlagIndirectVecArgs);
352       cc.setNaturalStackAlignment(16);
353       cc.setSpillZoneSize(32);
354       cc.setPassedOrder(kKindGp, kZcx, kZdx, 8, 9);
355       cc.setPassedOrder(kKindVec, 0, 1, 2, 3);
356       cc.setPreservedRegs(kKindGp, Utils::mask(kZbx, kZsp, kZbp, kZsi, kZdi, 12, 13, 14, 15));
357       cc.setPreservedRegs(kKindVec, Utils::mask(6, 7, 8, 9, 10, 11, 12, 13, 14, 15));
358       break;
359 
360     case CallConv::kIdX86SysV64:
361       cc.setArchType(ArchInfo::kTypeX64);
362       cc.setFlags(CallConv::kFlagPassFloatsByVec);
363       cc.setNaturalStackAlignment(16);
364       cc.setRedZoneSize(128);
365       cc.setPassedOrder(kKindGp, kZdi, kZsi, kZdx, kZcx, 8, 9);
366       cc.setPassedOrder(kKindVec, 0, 1, 2, 3, 4, 5, 6, 7);
367       cc.setPreservedRegs(kKindGp, Utils::mask(kZbx, kZsp, kZbp, 12, 13, 14, 15));
368       break;
369 
370     case CallConv::kIdX86FastEval2:
371     case CallConv::kIdX86FastEval3:
372     case CallConv::kIdX86FastEval4: {
373       uint32_t n = ccId - CallConv::kIdX86FastEval2;
374 
375       cc.setArchType(ArchInfo::kTypeX86);
376       cc.setFlags(CallConv::kFlagPassFloatsByVec);
377       cc.setNaturalStackAlignment(16);
378       cc.setPassedOrder(kKindGp, kZax, kZdx, kZcx, kZsi, kZdi);
379       cc.setPassedOrder(kKindMm, 0, 1, 2, 3, 4, 5, 6, 7);
380       cc.setPassedOrder(kKindVec, 0, 1, 2, 3, 4, 5, 6, 7);
381 
382       cc.setPreservedRegs(kKindGp , Utils::bits(8));
383       cc.setPreservedRegs(kKindVec, Utils::bits(8) & ~Utils::bits(n));
384       cc.setPreservedRegs(kKindMm , Utils::bits(8));
385       cc.setPreservedRegs(kKindK  , Utils::bits(8));
386       break;
387     }
388 
389     case CallConv::kIdX64FastEval2:
390     case CallConv::kIdX64FastEval3:
391     case CallConv::kIdX64FastEval4: {
392       uint32_t n = ccId - CallConv::kIdX64FastEval2;
393 
394       cc.setArchType(ArchInfo::kTypeX64);
395       cc.setFlags(CallConv::kFlagPassFloatsByVec);
396       cc.setNaturalStackAlignment(16);
397       cc.setPassedOrder(kKindGp, kZax, kZdx, kZcx, kZsi, kZdi);
398       cc.setPassedOrder(kKindMm, 0, 1, 2, 3, 4, 5, 6, 7);
399       cc.setPassedOrder(kKindVec, 0, 1, 2, 3, 4, 5, 6, 7);
400 
401       cc.setPreservedRegs(kKindGp , Utils::bits(16));
402       cc.setPreservedRegs(kKindVec,~Utils::bits(n));
403       cc.setPreservedRegs(kKindMm , Utils::bits(8));
404       cc.setPreservedRegs(kKindK  , Utils::bits(8));
405       break;
406     }
407 
408     default:
409       return DebugUtils::errored(kErrorInvalidArgument);
410   }
411 
412   cc.setId(ccId);
413   return kErrorOk;
414 }
415 
416 // ============================================================================
417 // [asmjit::X86Internal - FuncDetail]
418 // ============================================================================
419 
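// As a concrete example of the assignment done below: for int f(int, int, double)
// the SysV64 convention (kAlgorithmDefault) assigns the arguments to EDI, ESI and
// XMM0 using independent GP/VEC counters, while Win64 (kAlgorithmWin64) assigns
// them positionally to ECX, EDX and XMM2.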
ASMJIT_FAVOR_SIZE Error X86Internal::initFuncDetail(FuncDetail& func, const FuncSignature& sign, uint32_t gpSize) noexcept {
421   const CallConv& cc = func.getCallConv();
422   uint32_t archType = cc.getArchType();
423 
424   uint32_t i;
425   uint32_t argCount = func.getArgCount();
426 
427   if (func.getRetCount() != 0) {
428     uint32_t typeId = func._rets[0].getTypeId();
429     switch (typeId) {
430       case TypeId::kI64:
431       case TypeId::kU64: {
432         if (archType == ArchInfo::kTypeX86) {
433           // Convert a 64-bit return to two 32-bit returns.
434           func._retCount = 2;
435           typeId -= 2;
436 
437           // 64-bit value is returned in EDX:EAX on X86.
438           func._rets[0].initReg(typeId, X86Gp::kRegGpd, X86Gp::kIdAx);
439           func._rets[1].initReg(typeId, X86Gp::kRegGpd, X86Gp::kIdDx);
440           break;
441         }
442         else {
443           func._rets[0].initReg(typeId, X86Gp::kRegGpq, X86Gp::kIdAx);
444         }
445         break;
446       }
447 
448       case TypeId::kI8:
449       case TypeId::kU8:
450       case TypeId::kI16:
451       case TypeId::kU16:
452       case TypeId::kI32:
453       case TypeId::kU32: {
454         func._rets[0].assignToReg(X86Gp::kRegGpd, X86Gp::kIdAx);
455         break;
456       }
457 
458       case TypeId::kF32:
459       case TypeId::kF64: {
460         uint32_t regType = (archType == ArchInfo::kTypeX86) ? X86Reg::kRegFp : X86Reg::kRegXmm;
461         func._rets[0].assignToReg(regType, 0);
462         break;
463       }
464 
465       case TypeId::kF80: {
466         // 80-bit floats are always returned by FP0.
467         func._rets[0].assignToReg(X86Reg::kRegFp, 0);
468         break;
469       }
470 
471       case TypeId::kMmx32:
472       case TypeId::kMmx64: {
473         // On X64 MM register(s) are returned through XMM or GPQ (Win64).
474         uint32_t regType = X86Reg::kRegMm;
475         if (archType != ArchInfo::kTypeX86)
476           regType = cc.getAlgorithm() == CallConv::kAlgorithmDefault ? X86Reg::kRegXmm : X86Reg::kRegGpq;
477 
478         func._rets[0].assignToReg(regType, 0);
479         break;
480       }
481 
482       default: {
483         func._rets[0].assignToReg(x86VecTypeIdToRegType(typeId), 0);
484         break;
485       }
486     }
487   }
488 
489   uint32_t stackBase = gpSize;
490   uint32_t stackOffset = stackBase + cc._spillZoneSize;
491 
492   if (cc.getAlgorithm() == CallConv::kAlgorithmDefault) {
493     uint32_t gpzPos = 0;
494     uint32_t vecPos = 0;
495 
496     for (i = 0; i < argCount; i++) {
497       FuncDetail::Value& arg = func._args[i];
498       uint32_t typeId = arg.getTypeId();
499 
500       if (TypeId::isInt(typeId)) {
501         uint32_t regId = gpzPos < CallConv::kNumRegArgsPerKind ? cc._passedOrder[X86Reg::kKindGp].id[gpzPos] : Globals::kInvalidRegId;
502         if (regId != Globals::kInvalidRegId) {
503           uint32_t regType = (typeId <= TypeId::kU32)
504             ? X86Reg::kRegGpd
505             : X86Reg::kRegGpq;
506           arg.assignToReg(regType, regId);
507           func.addUsedRegs(X86Reg::kKindGp, Utils::mask(regId));
508           gpzPos++;
509         }
510         else {
511           uint32_t size = std::max<uint32_t>(TypeId::sizeOf(typeId), gpSize);
512           arg.assignToStack(stackOffset);
513           stackOffset += size;
514         }
515         continue;
516       }
517 
518       if (TypeId::isFloat(typeId) || TypeId::isVec(typeId)) {
519         uint32_t regId = vecPos < CallConv::kNumRegArgsPerKind ? cc._passedOrder[X86Reg::kKindVec].id[vecPos] : Globals::kInvalidRegId;
520 
521         // If this is a float, but `floatByVec` is false, we have to pass by stack.
522         if (TypeId::isFloat(typeId) && !cc.hasFlag(CallConv::kFlagPassFloatsByVec))
523           regId = Globals::kInvalidRegId;
524 
525         if (regId != Globals::kInvalidRegId) {
526           arg.initReg(typeId, x86VecTypeIdToRegType(typeId), regId);
527           func.addUsedRegs(X86Reg::kKindVec, Utils::mask(regId));
528           vecPos++;
529         }
530         else {
531           int32_t size = TypeId::sizeOf(typeId);
532           arg.assignToStack(stackOffset);
533           stackOffset += size;
534         }
535         continue;
536       }
537     }
538   }
539 
540   if (cc.getAlgorithm() == CallConv::kAlgorithmWin64) {
541     for (i = 0; i < argCount; i++) {
542       FuncDetail::Value& arg = func._args[i];
543 
544       uint32_t typeId = arg.getTypeId();
545       uint32_t size = TypeId::sizeOf(typeId);
546 
547       if (TypeId::isInt(typeId) || TypeId::isMmx(typeId)) {
548         uint32_t regId = i < CallConv::kNumRegArgsPerKind ? cc._passedOrder[X86Reg::kKindGp].id[i] : Globals::kInvalidRegId;
549         if (regId != Globals::kInvalidRegId) {
550           uint32_t regType = (size <= 4 && !TypeId::isMmx(typeId))
551             ? X86Reg::kRegGpd
552             : X86Reg::kRegGpq;
553 
554           arg.assignToReg(regType, regId);
555           func.addUsedRegs(X86Reg::kKindGp, Utils::mask(regId));
556         }
557         else {
558           arg.assignToStack(stackOffset);
559           stackOffset += gpSize;
560         }
561         continue;
562       }
563 
564       if (TypeId::isFloat(typeId) || TypeId::isVec(typeId)) {
565         uint32_t regId = i < CallConv::kNumRegArgsPerKind ? cc._passedOrder[X86Reg::kKindVec].id[i] : Globals::kInvalidRegId;
566         if (regId != Globals::kInvalidRegId && (TypeId::isFloat(typeId) || cc.hasFlag(CallConv::kFlagVectorCall))) {
567           uint32_t regType = x86VecTypeIdToRegType(typeId);
568           uint32_t regId = cc._passedOrder[X86Reg::kKindVec].id[i];
569 
570           arg.assignToReg(regType, regId);
571           func.addUsedRegs(X86Reg::kKindVec, Utils::mask(regId));
572         }
573         else {
574           arg.assignToStack(stackOffset);
575           stackOffset += 8; // Always 8 bytes (float/double).
576         }
577         continue;
578       }
579     }
580   }
581 
582   func._argStackSize = stackOffset - stackBase;
583   return kErrorOk;
584 }
585 
586 // ============================================================================
587 // [asmjit::X86Internal - FrameLayout]
588 // ============================================================================
589 
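// The offsets computed below describe a frame that, seen from the final ESP|RSP
// upwards, roughly consists of (unused areas collapse to zero size):
//   call frame (outgoing stack args) -> function's own stack frame ->
//   saved VEC registers -> DSA slot (saved old stack pointer) ->
//   pushed GP registers -> return address -> spill zone -> stack arguments.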
ASMJIT_FAVOR_SIZE Error X86Internal::initFrameLayout(FuncFrameLayout& layout, const FuncDetail& func, const FuncFrameInfo& ffi) noexcept {
591   layout.reset();
592 
593   uint32_t kind;
594   uint32_t gpSize = (func.getCallConv().getArchType() == ArchInfo::kTypeX86) ? 4 : 8;
595 
596   // Calculate a bit-mask of all registers that must be saved & restored.
597   for (kind = 0; kind < Globals::kMaxVRegKinds; kind++)
598     layout._savedRegs[kind] = (ffi.getDirtyRegs(kind) & ~func.getPassedRegs(kind)) & func.getPreservedRegs(kind);
599 
600   // Include EBP|RBP if the function preserves the frame-pointer.
601   if (ffi.hasPreservedFP()) {
602     layout._preservedFP = true;
603     layout._savedRegs[X86Reg::kKindGp] |= Utils::mask(X86Gp::kIdBp);
604   }
605 
606   // Exclude ESP/RSP - this register is never included in saved-regs.
607   layout._savedRegs[X86Reg::kKindGp] &= ~Utils::mask(X86Gp::kIdSp);
608 
609   // Calculate the final stack alignment.
610   uint32_t stackAlignment =
611     std::max<uint32_t>(
612       std::max<uint32_t>(
613         ffi.getStackFrameAlignment(),
614         ffi.getCallFrameAlignment()),
615       func.getCallConv().getNaturalStackAlignment());
616   layout._stackAlignment = static_cast<uint8_t>(stackAlignment);
617 
  // Calculate whether dynamic stack alignment is required. If true, the function
  // has to align its stack dynamically to match `_stackAlignment` and must access
  // its stack-based arguments through `_stackArgsRegId`.
621   bool dsa = stackAlignment > func.getCallConv().getNaturalStackAlignment() && stackAlignment >= 16;
622   layout._dynamicAlignment = dsa;
623 
624   // This flag describes if the prolog inserter must store the previous ESP|RSP
625   // to stack so the epilog inserter can load the stack from it before returning.
626   bool dsaSlotUsed = dsa && !ffi.hasPreservedFP();
627   layout._dsaSlotUsed = dsaSlotUsed;
628 
629   // These two are identical if the function doesn't align its stack dynamically.
630   uint32_t stackArgsRegId = ffi.getStackArgsRegId();
631   if (stackArgsRegId == Globals::kInvalidRegId)
632     stackArgsRegId = X86Gp::kIdSp;
633 
634   // Fix stack arguments base-register from ESP|RSP to EBP|RBP in case it was
635   // not picked before and the function performs dynamic stack alignment.
636   if (dsa && stackArgsRegId == X86Gp::kIdSp)
637     stackArgsRegId = X86Gp::kIdBp;
638 
639   if (stackArgsRegId != X86Gp::kIdSp)
640     layout._savedRegs[X86Reg::kKindGp] |= Utils::mask(stackArgsRegId) & func.getPreservedRegs(X86Gp::kKindGp);
641 
642   layout._stackBaseRegId = X86Gp::kIdSp;
643   layout._stackArgsRegId = static_cast<uint8_t>(stackArgsRegId);
644 
645   // Setup stack size used to save preserved registers.
646   layout._gpStackSize  = Utils::bitCount(layout.getSavedRegs(X86Reg::kKindGp )) * gpSize;
647   layout._vecStackSize = Utils::bitCount(layout.getSavedRegs(X86Reg::kKindVec)) * 16 +
648                          Utils::bitCount(layout.getSavedRegs(X86Reg::kKindMm )) *  8 ;
649 
650   uint32_t v = 0;                        // The beginning of the stack frame, aligned to CallFrame alignment.
651   v += ffi._callFrameSize;               // Count '_callFrameSize'  <- This is used to call functions.
652   v  = Utils::alignTo(v, stackAlignment);// Align to function's SA
653 
654   layout._stackBaseOffset = v;           // Store '_stackBaseOffset'<- Function's own stack starts here..
655   v += ffi._stackFrameSize;              // Count '_stackFrameSize' <- Function's own stack ends here.
656 
  // If the stack frame is sufficiently aligned, calculate the alignment necessary
  // to store vector registers, and set `FuncFrameInfo::kX86FlagAlignedVecSR` to
  // inform the PrologEpilog inserter that it can use aligned stores/loads to
  // save/restore VEC registers.
661   if (stackAlignment >= 16 && layout._vecStackSize) {
662     v = Utils::alignTo(v, 16);           // Align '_vecStackOffset'.
663     layout._alignedVecSR = true;
664   }
665 
666   layout._vecStackOffset = v;            // Store '_vecStackOffset' <- Functions VEC Save|Restore starts here.
667   v += layout._vecStackSize;             // Count '_vecStackSize'   <- Functions VEC Save|Restore ends here.
668 
669   if (dsaSlotUsed) {
670     layout._dsaSlot = v;                 // Store '_dsaSlot'        <- Old stack pointer is stored here.
671     v += gpSize;
672   }
673 
  // The return address is stored after the GP save/restore regs. It has the
  // same size as `gpSize` (basically the native register/pointer size). We
  // don't adjust `v` for it yet, as `v` currently contains the exact size the
  // function requires to adjust (call frame + stack frame + vec stack size).
  // The stack, considering this size, is misaligned now, as it's always
  // aligned before the function call - when `call()` is executed it pushes
  // the current EIP|RIP onto the stack and misaligns it by 12 or 8 bytes
  // (depending on the architecture). So count the number of bytes needed to
  // align it up to the function's CallFrame (the beginning).
683   if (v || ffi.hasCalls())
684     v += Utils::alignDiff(v + layout._gpStackSize + gpSize, stackAlignment);
685 
686   layout._stackAdjustment = v;           // Store '_stackAdjustment'<- SA used by 'add zsp, SA' and 'sub zsp, SA'.
687   layout._gpStackOffset = v;             // Store '_gpStackOffset'  <- Functions GP Save|Restore starts here.
688   v += layout._gpStackSize;              // Count '_gpStackSize'    <- Functions GP Save|Restore ends here.
689 
690   v += gpSize;                           // Count 'ReturnAddress'.
691   v += func.getSpillZoneSize();          // Count 'SpillZoneSize'.
692 
693   // Calculate where function arguments start, relative to the stackArgsRegId.
694   // If the register that will be used to access arguments passed by stack is
695   // ESP|RSP then it's exactly where we are now, otherwise we must calculate
696   // how many 'push regs' we did and adjust it based on that.
697   uint32_t stackArgsOffset = v;
698   if (stackArgsRegId != X86Gp::kIdSp) {
699     if (ffi.hasPreservedFP())
700       stackArgsOffset = gpSize;
701     else
702       stackArgsOffset = layout._gpStackSize;
703   }
704   layout._stackArgsOffset = stackArgsOffset;
705 
706   // If the function does dynamic stack adjustment then the stack-adjustment
707   // must be aligned.
708   if (dsa)
709     layout._stackAdjustment = Utils::alignTo(layout._stackAdjustment, stackAlignment);
710 
711   // Initialize variables based on CallConv flags.
712   if (func.hasFlag(CallConv::kFlagCalleePopsStack))
713     layout._calleeStackCleanup = static_cast<uint16_t>(func.getArgStackSize());
714 
715   // Initialize variables based on FFI flags.
716   layout._mmxCleanup = ffi.hasMmxCleanup();
717   layout._avxEnabled = ffi.isAvxEnabled();
718   layout._avxCleanup = ffi.hasAvxCleanup();
719 
720   return kErrorOk;
721 }
722 
723 // ============================================================================
724 // [asmjit::X86Internal - ArgsToFrameInfo]
725 // ============================================================================
726 
ASMJIT_FAVOR_SIZE Error X86Internal::argsToFrameInfo(const FuncArgsMapper& args, FuncFrameInfo& ffi) noexcept {
728   X86FuncArgsContext ctx;
729   ASMJIT_PROPAGATE(ctx.initWorkData(args, ffi._dirtyRegs, ffi.hasPreservedFP()));
730 
731   ASMJIT_PROPAGATE(ctx.markDstRegsDirty(ffi));
732   ASMJIT_PROPAGATE(ctx.markRegsForSwaps(ffi));
733   ASMJIT_PROPAGATE(ctx.markStackArgsReg(ffi));
734   return kErrorOk;
735 }
736 
737 // ============================================================================
738 // [asmjit::X86Internal - Emit Helpers]
739 // ============================================================================
740 
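// For example (a sketch; `e` and `mem8` are hypothetical), calling
// emitRegMove(e, x86::eax, mem8, TypeId::kU8, false, nullptr) emits a zero-extending
// 'movzx', while a 128-bit TypeId::kF64x2 register-to-register move emits 'movapd'
// (or 'vmovapd' when `avxEnabled` is true).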
ASMJIT_FAVOR_SIZE Error X86Internal::emitRegMove(X86Emitter* emitter,
  const Operand_& dst_,
  const Operand_& src_, uint32_t typeId, bool avxEnabled, const char* comment) {
744 
745   // Invalid or abstract TypeIds are not allowed.
746   ASMJIT_ASSERT(TypeId::isValid(typeId) && !TypeId::isAbstract(typeId));
747 
748   Operand dst(dst_);
749   Operand src(src_);
750 
751   uint32_t instId = Inst::kIdNone;
752   uint32_t memFlags = 0;
753 
754   enum MemFlags {
755     kDstMem = 0x1,
756     kSrcMem = 0x2
757   };
758 
  // Detect memory operands and patch them to have the same size as the register.
  // CodeCompiler always sets the memory size of allocs and spills, so this
  // shouldn't really be necessary; however, after this function was separated
  // from the Compiler it's better to make sure the size is always specified,
  // as we can use 'movzx' and 'movsx', which rely on it.
764   if (dst.isMem()) { memFlags |= kDstMem; dst.as<X86Mem>().setSize(src.getSize()); }
765   if (src.isMem()) { memFlags |= kSrcMem; src.as<X86Mem>().setSize(dst.getSize()); }
766 
767   switch (typeId) {
768     case TypeId::kI8:
769     case TypeId::kU8:
770     case TypeId::kI16:
771     case TypeId::kU16:
772       // Special case - 'movzx' load.
773       if (memFlags & kSrcMem) {
774         instId = X86Inst::kIdMovzx;
775         dst.setSignature(X86RegTraits<X86Reg::kRegGpd>::kSignature);
776       }
777       else if (!memFlags) {
778         // Change both destination and source registers to GPD (safer, no dependencies).
779         dst.setSignature(X86RegTraits<X86Reg::kRegGpd>::kSignature);
780         src.setSignature(X86RegTraits<X86Reg::kRegGpd>::kSignature);
781       }
782       ASMJIT_FALLTHROUGH;
783 
784     case TypeId::kI32:
785     case TypeId::kU32:
786     case TypeId::kI64:
787     case TypeId::kU64:
788       instId = X86Inst::kIdMov;
789       break;
790 
791     case TypeId::kMmx32:
792       instId = X86Inst::kIdMovd;
793       if (memFlags) break;
794       ASMJIT_FALLTHROUGH;
795     case TypeId::kMmx64 : instId = X86Inst::kIdMovq ; break;
796     case TypeId::kMask8 : instId = X86Inst::kIdKmovb; break;
797     case TypeId::kMask16: instId = X86Inst::kIdKmovw; break;
798     case TypeId::kMask32: instId = X86Inst::kIdKmovd; break;
799     case TypeId::kMask64: instId = X86Inst::kIdKmovq; break;
800 
801     default: {
802       uint32_t elementTypeId = TypeId::elementOf(typeId);
803       if (TypeId::isVec32(typeId) && memFlags) {
804         if (elementTypeId == TypeId::kF32)
805           instId = avxEnabled ? X86Inst::kIdVmovss : X86Inst::kIdMovss;
806         else
807           instId = avxEnabled ? X86Inst::kIdVmovd : X86Inst::kIdMovd;
808         break;
809       }
810 
811       if (TypeId::isVec64(typeId) && memFlags) {
812         if (elementTypeId == TypeId::kF64)
813           instId = avxEnabled ? X86Inst::kIdVmovsd : X86Inst::kIdMovsd;
814         else
815           instId = avxEnabled ? X86Inst::kIdVmovq : X86Inst::kIdMovq;
816         break;
817       }
818 
819       if (elementTypeId == TypeId::kF32)
820         instId = avxEnabled ? X86Inst::kIdVmovaps : X86Inst::kIdMovaps;
821       else if (elementTypeId == TypeId::kF64)
822         instId = avxEnabled ? X86Inst::kIdVmovapd : X86Inst::kIdMovapd;
823       else if (typeId <= TypeId::_kVec256End)
824         instId = avxEnabled ? X86Inst::kIdVmovdqa : X86Inst::kIdMovdqa;
825       else if (elementTypeId <= TypeId::kU32)
826         instId = X86Inst::kIdVmovdqa32;
827       else
828         instId = X86Inst::kIdVmovdqa64;
829       break;
830     }
831   }
832 
833   if (!instId)
834     return DebugUtils::errored(kErrorInvalidState);
835 
836   emitter->setInlineComment(comment);
837   return emitter->emit(instId, dst, src);
838 }
839 
ASMJIT_FAVOR_SIZE Error X86Internal::emitArgMove(X86Emitter* emitter,
  const X86Reg& dst_, uint32_t dstTypeId,
  const Operand_& src_, uint32_t srcTypeId, bool avxEnabled, const char* comment) {
843 
844   // Deduce optional `dstTypeId`, which may be `TypeId::kVoid` in some cases.
845   if (!dstTypeId) dstTypeId = x86OpData.archRegs.regTypeToTypeId[dst_.getType()];
846 
847   // Invalid or abstract TypeIds are not allowed.
848   ASMJIT_ASSERT(TypeId::isValid(dstTypeId) && !TypeId::isAbstract(dstTypeId));
849   ASMJIT_ASSERT(TypeId::isValid(srcTypeId) && !TypeId::isAbstract(srcTypeId));
850 
851   X86Reg dst(dst_);
852   Operand src(src_);
853 
854   uint32_t dstSize = TypeId::sizeOf(dstTypeId);
855   uint32_t srcSize = TypeId::sizeOf(srcTypeId);
856 
857   int32_t instId = Inst::kIdNone;
858 
  // Not a real loop; using 'break' here is just nicer than 'goto'.
860   for (;;) {
861     if (TypeId::isInt(dstTypeId)) {
862       if (TypeId::isInt(srcTypeId)) {
863         instId = X86Inst::kIdMovsx;
864         uint32_t typeOp = (dstTypeId << 8) | srcTypeId;
865 
866         // Sign extend by using 'movsx'.
867         if (typeOp == ((TypeId::kI16 << 8) | TypeId::kI8 ) ||
868             typeOp == ((TypeId::kI32 << 8) | TypeId::kI8 ) ||
869             typeOp == ((TypeId::kI32 << 8) | TypeId::kI16) ||
870             typeOp == ((TypeId::kI64 << 8) | TypeId::kI8 ) ||
871             typeOp == ((TypeId::kI64 << 8) | TypeId::kI16)) break;
872 
873         // Sign extend by using 'movsxd'.
874         instId = X86Inst::kIdMovsxd;
875         if (typeOp == ((TypeId::kI64 << 8) | TypeId::kI32)) break;
876       }
877 
878       if (TypeId::isInt(srcTypeId) || src_.isMem()) {
879         // Zero extend by using 'movzx' or 'mov'.
880         if (dstSize <= 4 && srcSize < 4) {
881           instId = X86Inst::kIdMovzx;
882           dst.setSignature(X86Reg::signatureOfT<X86Reg::kRegGpd>());
883         }
884         else {
885           // We should have caught all possibilities where `srcSize` is less
886           // than 4, so we don't have to worry about 'movzx' anymore. Minimum
887           // size is enough to determine if we want 32-bit or 64-bit move.
888           instId = X86Inst::kIdMov;
889           srcSize = std::min(srcSize, dstSize);
890 
891           dst.setSignature(srcSize == 4 ? X86Reg::signatureOfT<X86Reg::kRegGpd>()
892                                         : X86Reg::signatureOfT<X86Reg::kRegGpq>());
893           if (src.isReg()) src.setSignature(dst.getSignature());
894         }
895         break;
896       }
897 
      // NOTE: The previous branch caught all memory sources; from here on it's
      // always a register-to-register conversion, so handle the remaining cases.
900       srcSize = std::min(srcSize, dstSize);
901 
902       if (TypeId::isMmx(srcTypeId)) {
903         // 64-bit move.
904         instId = X86Inst::kIdMovq;
905         if (srcSize == 8) break;
906 
907         // 32-bit move.
908         instId = X86Inst::kIdMovd;
909         dst.setSignature(X86Reg::signatureOfT<X86Reg::kRegGpd>());
910         break;
911       }
912 
913       if (TypeId::isMask(srcTypeId)) {
914         instId = X86Inst::kmovIdFromSize(srcSize);
915         dst.setSignature(srcSize <= 4 ? X86Reg::signatureOfT<X86Reg::kRegGpd>()
916                                       : X86Reg::signatureOfT<X86Reg::kRegGpq>());
917         break;
918       }
919 
920       if (TypeId::isVec(srcTypeId)) {
921         // 64-bit move.
922         instId = avxEnabled ? X86Inst::kIdVmovq : X86Inst::kIdMovq;
923         if (srcSize == 8) break;
924 
925         // 32-bit move.
926         instId = avxEnabled ? X86Inst::kIdVmovd : X86Inst::kIdMovd;
927         dst.setSignature(X86Reg::signatureOfT<X86Reg::kRegGpd>());
928         break;
929       }
930     }
931 
932     if (TypeId::isMmx(dstTypeId)) {
933       instId = X86Inst::kIdMovq;
934       srcSize = std::min(srcSize, dstSize);
935 
936       if (TypeId::isInt(srcTypeId) || src.isMem()) {
937         // 64-bit move.
938         if (srcSize == 8) break;
939 
940         // 32-bit move.
941         instId = X86Inst::kIdMovd;
942         if (src.isReg()) src.setSignature(X86Reg::signatureOfT<X86Reg::kRegGpd>());
943         break;
944       }
945 
946       if (TypeId::isMmx(srcTypeId)) break;
947 
948       // NOTE: This will hurt if `avxEnabled`.
949       instId = X86Inst::kIdMovdq2q;
950       if (TypeId::isVec(srcTypeId)) break;
951     }
952 
953     if (TypeId::isMask(dstTypeId)) {
954       srcSize = std::min(srcSize, dstSize);
955 
956       if (TypeId::isInt(srcTypeId) || TypeId::isMask(srcTypeId) || src.isMem()) {
957         instId = X86Inst::kmovIdFromSize(srcSize);
958         if (X86Reg::isGp(src) && srcSize <= 4) src.setSignature(X86Reg::signatureOfT<X86Reg::kRegGpd>());
959         break;
960       }
961     }
962 
963     if (TypeId::isVec(dstTypeId)) {
964       // By default set destination to XMM, will be set to YMM|ZMM if needed.
965       dst.setSignature(X86Reg::signatureOfT<X86Reg::kRegXmm>());
966 
967       // NOTE: This will hurt if `avxEnabled`.
968       if (X86Reg::isMm(src)) {
969         // 64-bit move.
970         instId = X86Inst::kIdMovq2dq;
971         break;
972       }
973 
974       // Argument conversion.
975       uint32_t dstElement = TypeId::elementOf(dstTypeId);
976       uint32_t srcElement = TypeId::elementOf(srcTypeId);
977 
978       if (dstElement == TypeId::kF32 && srcElement == TypeId::kF64) {
979         srcSize = std::min(dstSize * 2, srcSize);
980         dstSize = srcSize / 2;
981 
982         if (srcSize <= 8)
983           instId = avxEnabled ? X86Inst::kIdVcvtss2sd : X86Inst::kIdCvtss2sd;
984         else
985           instId = avxEnabled ? X86Inst::kIdVcvtps2pd : X86Inst::kIdCvtps2pd;
986 
987         if (dstSize == 32)
988           dst.setSignature(X86Reg::signatureOfT<X86Reg::kRegYmm>());
989         if (src.isReg())
990           src.setSignature(X86Reg::signatureOfVecBySize(srcSize));
991         break;
992       }
993 
994       if (dstElement == TypeId::kF64 && srcElement == TypeId::kF32) {
995         srcSize = std::min(dstSize, srcSize * 2) / 2;
996         dstSize = srcSize * 2;
997 
998         if (srcSize <= 4)
999           instId = avxEnabled ? X86Inst::kIdVcvtsd2ss : X86Inst::kIdCvtsd2ss;
1000         else
1001           instId = avxEnabled ? X86Inst::kIdVcvtpd2ps : X86Inst::kIdCvtpd2ps;
1002 
1003         dst.setSignature(X86Reg::signatureOfVecBySize(dstSize));
1004         if (src.isReg() && srcSize >= 32)
1005           src.setSignature(X86Reg::signatureOfT<X86Reg::kRegYmm>());
1006         break;
1007       }
1008 
1009       srcSize = std::min(srcSize, dstSize);
1010       if (X86Reg::isGp(src) || src.isMem()) {
1011         // 32-bit move.
1012         if (srcSize <= 4) {
1013           instId = avxEnabled ? X86Inst::kIdVmovd : X86Inst::kIdMovd;
1014           if (src.isReg()) src.setSignature(X86Reg::signatureOfT<X86Reg::kRegGpd>());
1015           break;
1016         }
1017 
1018         // 64-bit move.
1019         if (srcSize == 8) {
1020           instId = avxEnabled ? X86Inst::kIdVmovq : X86Inst::kIdMovq;
1021           break;
1022         }
1023       }
1024 
1025       if (X86Reg::isVec(src) || src.isMem()) {
1026         instId = avxEnabled ? X86Inst::kIdVmovaps : X86Inst::kIdMovaps;
1027         uint32_t sign = X86Reg::signatureOfVecBySize(srcSize);
1028 
1029         dst.setSignature(sign);
1030         if (src.isReg()) src.setSignature(sign);
1031         break;
1032       }
1033     }
1034 
1035     return DebugUtils::errored(kErrorInvalidState);
1036   }
1037 
1038   if (src.isMem())
1039     src.as<X86Mem>().setSize(srcSize);
1040 
1041   emitter->setInlineComment(comment);
1042   return emitter->emit(instId, dst, src);
1043 }
1044 
1045 // ============================================================================
1046 // [asmjit::X86Internal - Emit Prolog & Epilog]
1047 // ============================================================================
1048 
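// A typical sequence emitted below (a sketch for a 64-bit function that preserves
// the frame pointer, saves RBX, dynamically aligns its stack to 32 bytes and has
// one XMM register to save):
//
//   push rbp                          ; save frame pointer
//   mov  rbp, rsp
//   push rbx                          ; 'push gp' sequence
//   and  rsp, -32                     ; dynamic stack alignment
//   sub  rsp, StackAdjustment         ; reserve call frame + local stack
//   vmovaps [rsp + VecOffset], xmm6   ; movaps/movups when AVX is not enabled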
ASMJIT_FAVOR_SIZE Error X86Internal::emitProlog(X86Emitter* emitter, const FuncFrameLayout& layout) {
1050   uint32_t gpSaved = layout.getSavedRegs(X86Reg::kKindGp);
1051 
1052   X86Gp zsp = emitter->zsp();   // ESP|RSP register.
1053   X86Gp zbp = emitter->zbp();   // EBP|RBP register.
1054   X86Gp gpReg = emitter->zsp(); // General purpose register (temporary).
1055   X86Gp saReg = emitter->zsp(); // Stack-arguments base register.
1056 
1057   // Emit: 'push zbp'
1058   //       'mov  zbp, zsp'.
1059   if (layout.hasPreservedFP()) {
1060     gpSaved &= ~Utils::mask(X86Gp::kIdBp);
1061     ASMJIT_PROPAGATE(emitter->push(zbp));
1062     ASMJIT_PROPAGATE(emitter->mov(zbp, zsp));
1063   }
1064 
1065   // Emit: 'push gp' sequence.
1066   if (gpSaved) {
1067     for (uint32_t i = gpSaved, regId = 0; i; i >>= 1, regId++) {
1068       if (!(i & 0x1)) continue;
1069       gpReg.setId(regId);
1070       ASMJIT_PROPAGATE(emitter->push(gpReg));
1071     }
1072   }
1073 
1074   // Emit: 'mov saReg, zsp'.
1075   uint32_t stackArgsRegId = layout.getStackArgsRegId();
1076   if (stackArgsRegId != Globals::kInvalidRegId && stackArgsRegId != X86Gp::kIdSp) {
1077     saReg.setId(stackArgsRegId);
1078     if (!(layout.hasPreservedFP() && stackArgsRegId == X86Gp::kIdBp))
1079       ASMJIT_PROPAGATE(emitter->mov(saReg, zsp));
1080   }
1081 
1082   // Emit: 'and zsp, StackAlignment'.
1083   if (layout.hasDynamicAlignment())
1084     ASMJIT_PROPAGATE(emitter->and_(zsp, -static_cast<int32_t>(layout.getStackAlignment())));
1085 
1086   // Emit: 'sub zsp, StackAdjustment'.
1087   if (layout.hasStackAdjustment())
1088     ASMJIT_PROPAGATE(emitter->sub(zsp, layout.getStackAdjustment()));
1089 
1090   // Emit: 'mov [zsp + dsaSlot], saReg'.
1091   if (layout.hasDynamicAlignment() && layout.hasDsaSlotUsed()) {
1092     X86Mem saMem = x86::ptr(zsp, layout._dsaSlot);
1093     ASMJIT_PROPAGATE(emitter->mov(saMem, saReg));
1094   }
1095 
1096   // Emit 'movaps|movups [zsp + X], xmm0..15'.
1097   uint32_t xmmSaved = layout.getSavedRegs(X86Reg::kKindVec);
1098   if (xmmSaved) {
1099     X86Mem vecBase = x86::ptr(zsp, layout.getVecStackOffset());
1100     X86Reg vecReg = x86::xmm(0);
1101 
1102     uint32_t vecInst = x86GetXmmMovInst(layout);
1103     uint32_t vecSize = 16;
1104 
1105     for (uint32_t i = xmmSaved, regId = 0; i; i >>= 1, regId++) {
1106       if (!(i & 0x1)) continue;
1107       vecReg.setId(regId);
1108       ASMJIT_PROPAGATE(emitter->emit(vecInst, vecBase, vecReg));
1109       vecBase.addOffsetLo32(static_cast<int32_t>(vecSize));
1110     }
1111   }
1112 
1113   return kErrorOk;
1114 }
1115 
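// The matching epilog for the prolog sketched above would be roughly:
//
//   vmovaps xmm6, [rsp + VecOffset]   ; restore VEC registers
//   lea  rsp, [rbp - 8]               ; 8 == size of the saved RBX slot
//   pop  rbx
//   pop  rbp
//   ret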
ASMJIT_FAVOR_SIZE Error X86Internal::emitEpilog(X86Emitter* emitter, const FuncFrameLayout& layout) {
1117   uint32_t i;
1118   uint32_t regId;
1119 
1120   uint32_t gpSize = emitter->getGpSize();
1121   uint32_t gpSaved = layout.getSavedRegs(X86Reg::kKindGp);
1122 
1123   X86Gp zsp = emitter->zsp();   // ESP|RSP register.
1124   X86Gp zbp = emitter->zbp();   // EBP|RBP register.
1125   X86Gp gpReg = emitter->zsp(); // General purpose register (temporary).
1126 
1127   // Don't emit 'pop zbp' in the pop sequence, this case is handled separately.
1128   if (layout.hasPreservedFP()) gpSaved &= ~Utils::mask(X86Gp::kIdBp);
1129 
1130   // Emit 'movaps|movups xmm0..15, [zsp + X]'.
1131   uint32_t xmmSaved = layout.getSavedRegs(X86Reg::kKindVec);
1132   if (xmmSaved) {
1133     X86Mem vecBase = x86::ptr(zsp, layout.getVecStackOffset());
1134     X86Reg vecReg = x86::xmm(0);
1135 
1136     uint32_t vecInst = x86GetXmmMovInst(layout);
1137     uint32_t vecSize = 16;
1138 
1139     for (i = xmmSaved, regId = 0; i; i >>= 1, regId++) {
1140       if (!(i & 0x1)) continue;
1141       vecReg.setId(regId);
1142       ASMJIT_PROPAGATE(emitter->emit(vecInst, vecReg, vecBase));
1143       vecBase.addOffsetLo32(static_cast<int32_t>(vecSize));
1144     }
1145   }
1146 
1147   // Emit 'emms' and 'vzeroupper'.
1148   if (layout.hasMmxCleanup()) ASMJIT_PROPAGATE(emitter->emms());
1149   if (layout.hasAvxCleanup()) ASMJIT_PROPAGATE(emitter->vzeroupper());
1150 
1151   if (layout.hasPreservedFP()) {
1152     // Emit 'mov zsp, zbp' or 'lea zsp, [zbp - x]'
1153     int32_t count = static_cast<int32_t>(layout.getGpStackSize() - gpSize);
1154     if (!count)
1155       ASMJIT_PROPAGATE(emitter->mov(zsp, zbp));
1156     else
1157       ASMJIT_PROPAGATE(emitter->lea(zsp, x86::ptr(zbp, -count)));
1158   }
1159   else {
1160     if (layout.hasDynamicAlignment() && layout.hasDsaSlotUsed()) {
1161       // Emit 'mov zsp, [zsp + DsaSlot]'.
1162       X86Mem saMem = x86::ptr(zsp, layout._dsaSlot);
1163       ASMJIT_PROPAGATE(emitter->mov(zsp, saMem));
1164     }
1165     else if (layout.hasStackAdjustment()) {
1166       // Emit 'add zsp, StackAdjustment'.
1167       ASMJIT_PROPAGATE(emitter->add(zsp, static_cast<int32_t>(layout.getStackAdjustment())));
1168     }
1169   }
1170 
1171   // Emit 'pop gp' sequence.
1172   if (gpSaved) {
1173     i = gpSaved;
1174     regId = 16;
1175 
1176     do {
1177       regId--;
1178       if (i & 0x8000) {
1179         gpReg.setId(regId);
1180         ASMJIT_PROPAGATE(emitter->pop(gpReg));
1181       }
1182       i <<= 1;
1183     } while (regId != 0);
1184   }
1185 
1186   // Emit 'pop zbp'.
1187   if (layout.hasPreservedFP()) ASMJIT_PROPAGATE(emitter->pop(zbp));
1188 
1189   // Emit 'ret' or 'ret x'.
1190   if (layout.hasCalleeStackCleanup())
1191     ASMJIT_PROPAGATE(emitter->emit(X86Inst::kIdRet, static_cast<int>(layout.getCalleeStackCleanup())));
1192   else
1193     ASMJIT_PROPAGATE(emitter->emit(X86Inst::kIdRet));
1194 
1195   return kErrorOk;
1196 }
1197 
1198 // ============================================================================
1199 // [asmjit::X86Internal - AllocArgs]
1200 // ============================================================================
1201 
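// For example, if the function receives its first argument in RDI but the mapper
// requests it in RSI, and receives its second argument in RSI but requests it in
// RDI, the register-to-register phase below resolves the cycle with a single
// 'xchg rsi, rdi'; arguments passed on the stack are loaded in a second phase.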
ASMJIT_FAVOR_SIZE Error X86Internal::allocArgs(X86Emitter* emitter, const FuncFrameLayout& layout, const FuncArgsMapper& args) {
1203   typedef X86FuncArgsContext::SrcArg SrcArg;
1204   typedef X86FuncArgsContext::DstArg DstArg;
1205   typedef X86FuncArgsContext::WorkData WorkData;
1206   enum { kMaxVRegKinds = Globals::kMaxVRegKinds };
1207 
1208   uint32_t i;
1209   const FuncDetail& func = *args.getFuncDetail();
1210 
1211   X86FuncArgsContext ctx;
1212   ASMJIT_PROPAGATE(ctx.initWorkData(args, layout._savedRegs, layout.hasPreservedFP()));
1213 
1214   // We must honor AVX if it's enabled.
1215   bool avxEnabled = layout.isAvxEnabled();
1216 
1217   // Free registers that can be used as temporaries and during shuffling.
1218   // We initialize them to match all workRegs (registers that can be used
1219   // by the function) except source regs, which are used to pass arguments.
1220   // Free registers are changed during shuffling - when an argument is moved
1221   // to the final register then the register itself is removed from freeRegs
1222   // (it can't be altered anymore during shuffling).
1223   uint32_t freeRegs[kMaxVRegKinds];
1224   for (i = 0; i < kMaxVRegKinds; i++)
1225     freeRegs[i] = ctx._workData[i].workRegs & ~ctx._workData[i].srcRegs;
1226 
  // This is an iterative process that runs as long as there is work to do. When
  // one register is moved it can create space for another move, and such moves
  // can depend on each other, so the algorithm may run multiple times before all
  // arguments are in place. This part does only register-to-register work;
  // arguments moved from the stack to registers are handled later.
1232   for (;;) {
    bool hasWork = false; // Do we have work to do?
1234     bool didWork = false; // If we did something...
1235 
1236     uint32_t dstRegKind = kMaxVRegKinds;
1237     do {
1238       WorkData& wd = ctx._workData[--dstRegKind];
1239       if (wd.numOps > wd.numStackArgs) {
1240         hasWork = true;
1241 
1242         // Iterate over all destination regs and check if we can do something.
1243         // We always go from destination to source, never the opposite.
1244         uint32_t regsToDo = wd.dstRegs;
1245         do {
          // If there is work to do there has to be at least one dstReg.
1247           ASMJIT_ASSERT(regsToDo != 0);
1248           uint32_t dstRegId = Utils::findFirstBit(regsToDo);
1249           uint32_t dstRegMask = Utils::mask(dstRegId);
1250 
1251           uint32_t argIndex = wd.argIndex[dstRegId];
1252           const DstArg& dstArg = args.getArg(argIndex);
1253           const SrcArg& srcArg = func.getArg(argIndex);
1254 
1255           if (srcArg.byReg()) {
1256             uint32_t srcRegType = srcArg.getRegType();
1257             uint32_t srcRegKind = X86Reg::kindOf(srcRegType);
1258 
1259             if (freeRegs[dstRegKind] & dstRegMask) {
1260               X86Reg dstReg(X86Reg::fromTypeAndId(dstArg.getRegType(), dstRegId));
1261               X86Reg srcReg(X86Reg::fromTypeAndId(srcRegType, srcArg.getRegId()));
1262 
1263               ASMJIT_PROPAGATE(
1264                 emitArgMove(emitter,
1265                   dstReg, dstArg.getTypeId(),
1266                   srcReg, srcArg.getTypeId(), avxEnabled));
1267               freeRegs[dstRegKind] ^= dstRegMask;                     // Make the DST reg occupied.
1268               freeRegs[srcRegKind] |= Utils::mask(srcArg.getRegId()); // Make the SRC reg free.
1269 
1270               ASMJIT_ASSERT(wd.numOps >= 1);
1271               wd.numOps--;
1272               didWork = true;
1273             }
1274             else {
1275               // Check if this is a swap operation.
1276               if (dstRegKind == srcRegKind) {
1277                 uint32_t srcRegId = srcArg.getRegId();
1278 
1279                 uint32_t otherIndex = wd.argIndex[srcRegId];
1280                 const DstArg& otherArg = args.getArg(otherIndex);
1281 
1282                 if (otherArg.getRegId() == srcRegId && X86Reg::kindOf(otherArg.getRegType()) == dstRegKind) {
1283                   // If this is GP reg it can be handled by 'xchg'.
1284                   if (dstRegKind == X86Reg::kKindGp) {
1285                     uint32_t highestType = std::max(dstArg.getRegType(), srcRegType);
1286 
1287                     X86Reg dstReg = x86::gpd(dstRegId);
1288                     X86Reg srcReg = x86::gpd(srcRegId);
1289 
1290                     if (highestType == X86Reg::kRegGpq) {
1291                       dstReg.setSignature(X86RegTraits<X86Reg::kRegGpq>::kSignature);
1292                       srcReg.setSignature(X86RegTraits<X86Reg::kRegGpq>::kSignature);
1293                     }
1294                     ASMJIT_PROPAGATE(emitter->emit(X86Inst::kIdXchg, dstReg, srcReg));
1295                     regsToDo &= ~Utils::mask(srcRegId);
1296                     freeRegs[dstRegKind] &= ~(Utils::mask(srcRegId) | dstRegMask);
1297 
1298                     ASMJIT_ASSERT(wd.numOps >= 2);
1299                     ASMJIT_ASSERT(wd.numSwaps >= 1);
1300                     wd.numOps-=2;
1301                     wd.numSwaps--;
1302                     didWork = true;
1303                   }
1304                 }
1305               }
1306             }
1307           }
1308 
1309           // Clear the reg in `regsToDo` and continue if there are more.
1310           regsToDo ^= dstRegMask;
1311         } while (regsToDo);
1312       }
1313     } while (dstRegKind);
1314 
1315     if (!hasWork)
1316       break;
1317 
1318     if (!didWork)
1319       return DebugUtils::errored(kErrorInvalidState);
1320   }
1321 
1322   // Load arguments passed by stack into registers. This is pretty simple and
1323   // it never requires multiple iterations like the previous phase.
1324   if (ctx._hasStackArgs) {
1325     // Base address of all arguments passed by stack.
1326     X86Mem saBase = x86::ptr(emitter->gpz(layout.getStackArgsRegId()), layout.getStackArgsOffset());
1327 
1328     uint32_t dstRegKind = kMaxVRegKinds;
1329     do {
1330       WorkData& wd = ctx._workData[--dstRegKind];
1331       if (wd.numStackArgs) {
1332         // Iterate over all destination regs and check if we can do something.
1333         // We always go from destination to source, never the opposite.
1334         uint32_t regsToDo = wd.dstRegs;
1335         do {
          // If there is work to do there has to be at least one dstReg.
1337           ASMJIT_ASSERT(regsToDo != 0);
1338           ASMJIT_ASSERT(wd.numOps > 0);
1339 
1340           uint32_t dstRegId = Utils::findFirstBit(regsToDo);
1341           uint32_t dstRegMask = Utils::mask(dstRegId);
1342 
1343           uint32_t argIndex = wd.argIndex[dstRegId];
1344           const DstArg& dstArg = args.getArg(argIndex);
1345           const SrcArg& srcArg = func.getArg(argIndex);
1346 
          // Only arguments passed by stack should remain here, and the destination
          // registers must be free by now (otherwise the first part of the
          // algorithm failed). Ideally this would be an assert, but it's much
          // safer to enforce it in release builds as well.
1351           if (!srcArg.byStack() || !(freeRegs[dstRegKind] & dstRegMask))
1352             return DebugUtils::errored(kErrorInvalidState);
1353 
1354           X86Reg dstReg = X86Reg::fromTypeAndId(dstArg.getRegType(), dstRegId);
1355           X86Mem srcMem = saBase.adjusted(srcArg.getStackOffset());
1356 
1357           ASMJIT_PROPAGATE(
1358             emitArgMove(emitter,
1359               dstReg, dstArg.getTypeId(),
1360               srcMem, srcArg.getTypeId(), avxEnabled));
1361 
1362           freeRegs[dstRegKind] ^= dstRegMask;
1363           regsToDo ^= dstRegMask;
1364           wd.numOps--;
1365         } while (regsToDo);
1366       }
1367     } while (dstRegKind);
1368   }
1369 
1370   return kErrorOk;
1371 }
1372 
1373 } // asmjit namespace
1374 } // namespace PLMD
1375 
1376 // [Api-End]
1377 #include "./asmjit_apiend.h"
1378 
1379 // [Guard]
1380 #endif // ASMJIT_BUILD_X86
1381 #pragma GCC diagnostic pop
1382 #endif // __PLUMED_HAS_ASMJIT
1383