/* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Copyright (c) 2008-2017, Petr Kobalicek

This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.

Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not
   claim that you wrote the original software. If you use this software
   in a product, an acknowledgment in the product documentation would be
   appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
   misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
#ifdef __PLUMED_HAS_ASMJIT
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpedantic"
// [AsmJit]
// Complete x86/x64 JIT and Remote Assembler for C++.
//
// [License]
// Zlib - See LICENSE.md file in the package.

// [Export]
#define ASMJIT_EXPORTS

// [Guard]
#include "./asmjit_build.h"
#if defined(ASMJIT_BUILD_X86)

// [Dependencies]
#include "./x86internal_p.h"

// [Api-Begin]
#include "./asmjit_apibegin.h"

namespace PLMD {
namespace asmjit {

// ============================================================================
// [asmjit::X86Internal - Helpers]
// ============================================================================

static ASMJIT_INLINE uint32_t x86GetXmmMovInst(const FuncFrameLayout& layout) {
  bool avx = layout.isAvxEnabled();
  bool aligned = layout.hasAlignedVecSR();

  return aligned ? (avx ? X86Inst::kIdVmovaps : X86Inst::kIdMovaps)
                 : (avx ? X86Inst::kIdVmovups : X86Inst::kIdMovups);
}
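
// For illustration, the four possible picks of x86GetXmmMovInst() are:
//   avx=0 aligned=0 -> movups    avx=0 aligned=1 -> movaps
//   avx=1 aligned=0 -> vmovups   avx=1 aligned=1 -> vmovaps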

static ASMJIT_INLINE uint32_t x86VecTypeIdToRegType(uint32_t typeId) noexcept {
  return typeId <= TypeId::_kVec128End ? X86Reg::kRegXmm :
         typeId <= TypeId::_kVec256End ? X86Reg::kRegYmm :
                                         X86Reg::kRegZmm ;
}

// ============================================================================
// [asmjit::X86FuncArgsContext]
// ============================================================================

// Used by both `X86Internal::argsToFrameInfo()` and `X86Internal::allocArgs()`.
class X86FuncArgsContext {
public:
  typedef FuncDetail::Value SrcArg;
  typedef FuncArgsMapper::Value DstArg;

  enum { kMaxVRegKinds = Globals::kMaxVRegKinds };

  struct WorkData {
    uint32_t archRegs;                   //!< Architecture provided and allocable regs.
    uint32_t workRegs;                   //!< Registers that can be used by the shuffler.
    uint32_t usedRegs;                   //!< Only registers used to pass arguments.
    uint32_t srcRegs;                    //!< Source registers that need shuffling.
    uint32_t dstRegs;                    //!< Destination registers that need shuffling.
    uint8_t numOps;                      //!< Number of operations to finish.
    uint8_t numSwaps;                    //!< Number of register swaps.
    uint8_t numStackArgs;                //!< Number of stack loads.
    uint8_t reserved[9];                 //!< Reserved (only used as padding).
    uint8_t argIndex[32];                //!< Only valid if a corresponding bit in `usedRegs` is true.
  };

  X86FuncArgsContext() noexcept;
  Error initWorkData(const FuncArgsMapper& args, const uint32_t* dirtyRegs, bool preservedFP) noexcept;

  Error markRegsForSwaps(FuncFrameInfo& ffi) noexcept;
  Error markDstRegsDirty(FuncFrameInfo& ffi) noexcept;
  Error markStackArgsReg(FuncFrameInfo& ffi) noexcept;

  // --------------------------------------------------------------------------
  // [Members]
  // --------------------------------------------------------------------------

  WorkData _workData[kMaxVRegKinds];
  bool _hasStackArgs;
  bool _hasRegSwaps;
};

X86FuncArgsContext::X86FuncArgsContext() noexcept {
  ::memset(_workData, 0, sizeof(_workData));
  _hasStackArgs = false;
  _hasRegSwaps = false;
}

ASMJIT_FAVOR_SIZE Error X86FuncArgsContext::initWorkData(const FuncArgsMapper& args, const uint32_t* dirtyRegs, bool preservedFP) noexcept {
  // This code has to be updated if `kMaxVRegKinds` changes.
  ASMJIT_ASSERT(kMaxVRegKinds == 4);

  uint32_t i;
  const FuncDetail& func = *args.getFuncDetail();

  uint32_t archType = func.getCallConv().getArchType();
  uint32_t count = (archType == ArchInfo::kTypeX86) ? 8 : 16;

  // Initialize WorkData::archRegs.
  _workData[X86Reg::kKindGp ].archRegs = Utils::bits(count) & ~Utils::mask(X86Gp::kIdSp);
  _workData[X86Reg::kKindMm ].archRegs = Utils::bits(8);
  _workData[X86Reg::kKindK  ].archRegs = Utils::bits(8);
  _workData[X86Reg::kKindVec].archRegs = Utils::bits(count);

  if (preservedFP)
    _workData[X86Reg::kKindGp].archRegs &= ~Utils::mask(X86Gp::kIdBp);

  // Initialize WorkData::workRegs.
  for (i = 0; i < kMaxVRegKinds; i++)
    _workData[i].workRegs = _workData[i].archRegs & (dirtyRegs[i] | ~func.getCallConv().getPreservedRegs(i));

  // Build WorkData.
  for (i = 0; i < kFuncArgCountLoHi; i++) {
    const DstArg& dstArg = args.getArg(i);
    if (!dstArg.isAssigned()) continue;

    const SrcArg& srcArg = func.getArg(i);
    if (ASMJIT_UNLIKELY(!srcArg.isAssigned()))
      return DebugUtils::errored(kErrorInvalidState);

    uint32_t dstRegType = dstArg.getRegType();
    if (ASMJIT_UNLIKELY(dstRegType >= X86Reg::kRegCount))
      return DebugUtils::errored(kErrorInvalidRegType);

    uint32_t dstRegKind = X86Reg::kindOf(dstRegType);
    if (ASMJIT_UNLIKELY(dstRegKind >= kMaxVRegKinds))
      return DebugUtils::errored(kErrorInvalidState);

    WorkData& dstData = _workData[dstRegKind];
    uint32_t dstRegId = dstArg.getRegId();
    if (ASMJIT_UNLIKELY(dstRegId >= 32 || !(dstData.archRegs & Utils::mask(dstRegId))))
      return DebugUtils::errored(kErrorInvalidPhysId);

    uint32_t dstRegMask = Utils::mask(dstRegId);
    if (ASMJIT_UNLIKELY(dstData.usedRegs & dstRegMask))
      return DebugUtils::errored(kErrorOverlappedRegs);

    dstData.usedRegs |= dstRegMask;
    dstData.argIndex[dstRegId] = static_cast<uint8_t>(i);

    if (srcArg.byReg()) {
      uint32_t srcRegKind = X86Reg::kindOf(srcArg.getRegType());
      uint32_t srcRegId = srcArg.getRegId();
      uint32_t srcRegMask = Utils::mask(srcRegId);

      if (dstRegKind == srcRegKind) {
        // The best case: the register is already allocated where it is expected to be.
        if (dstRegId == srcRegId) continue;

        // Detect a register swap.
        if (dstData.usedRegs & srcRegMask) {
          const SrcArg& ref = func.getArg(dstData.argIndex[srcRegId]);
          if (ref.byReg() && X86Reg::kindOf(ref.getRegType()) == dstRegKind && ref.getRegId() == dstRegId) {
            dstData.numSwaps++;
            _hasRegSwaps = true;
          }
        }
        dstData.srcRegs |= srcRegMask;
      }
      else {
        if (ASMJIT_UNLIKELY(srcRegKind >= kMaxVRegKinds))
          return DebugUtils::errored(kErrorInvalidState);

        WorkData& srcData = _workData[srcRegKind];
        srcData.srcRegs |= srcRegMask;
      }
    }
    else {
      dstData.numStackArgs++;
      _hasStackArgs = true;
    }

    dstData.numOps++;
    dstData.dstRegs |= dstRegMask;
  }

  return kErrorOk;
}
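
// A worked example (illustrative only): mapping a SysV64 argument passed in
// `rdi` to `rcx` and one passed in `rsi` to `rdx` marks, for the GP kind,
// usedRegs = {rcx, rdx}, srcRegs = {rdi, rsi}, dstRegs = {rcx, rdx} and
// numOps = 2; no swap is detected because no source is also a destination.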

ASMJIT_FAVOR_SIZE Error X86FuncArgsContext::markDstRegsDirty(FuncFrameInfo& ffi) noexcept {
  for (uint32_t i = 0; i < kMaxVRegKinds; i++) {
    WorkData& wd = _workData[i];
    uint32_t regs = wd.usedRegs | wd.dstRegs;

    wd.workRegs |= regs;
    ffi.addDirtyRegs(i, regs);
  }

  return kErrorOk;
}

ASMJIT_FAVOR_SIZE Error X86FuncArgsContext::markRegsForSwaps(FuncFrameInfo& ffi) noexcept {
  if (!_hasRegSwaps)
    return kErrorOk;

  // If some registers require swapping then select one dirty register that
  // can be used as a temporary. We could also do it without one (by using
  // XORs), but using a temporary is always the safer and faster approach.
  for (uint32_t i = 0; i < kMaxVRegKinds; i++) {
    // Skip all register kinds where swapping is natively supported (GP regs).
    if (i == X86Reg::kKindGp) continue;

    // Skip all register kinds that don't require swapping.
    WorkData& wd = _workData[i];
    if (!wd.numSwaps) continue;

    // Initially, pick some clobbered or dirty register.
    uint32_t workRegs = wd.workRegs;
    uint32_t regs = workRegs & ~(wd.usedRegs | wd.dstRegs);

    // If that didn't work out, pick some register which is not in 'used'.
    if (!regs) regs = workRegs & ~wd.usedRegs;

    // If that didn't work out, pick any other register that is allocable.
    // This last-resort case will, however, result in marking one more
    // register dirty.
    if (!regs) regs = wd.archRegs & ~workRegs;

    // If that didn't work out we will have to use XORs instead of moves.
    if (!regs) continue;

    uint32_t regMask = Utils::mask(Utils::findFirstBit(regs));
    wd.workRegs |= regMask;
    ffi.addDirtyRegs(i, regMask);
  }

  return kErrorOk;
}

ASMJIT_FAVOR_SIZE Error X86FuncArgsContext::markStackArgsReg(FuncFrameInfo& ffi) noexcept {
  if (!_hasStackArgs)
    return kErrorOk;

  // Decide which register to use to hold the stack base address.
  if (!ffi.hasPreservedFP()) {
    WorkData& wd = _workData[X86Reg::kKindGp];
    uint32_t saRegId = ffi.getStackArgsRegId();
    uint32_t usedRegs = wd.usedRegs;

    if (saRegId != Globals::kInvalidRegId) {
      // Check that the user-chosen SA register doesn't overlap with others.
      // However, it's fine if it overlaps with some 'dstMove' register.
      if (usedRegs & Utils::mask(saRegId))
        return DebugUtils::errored(kErrorOverlappingStackRegWithRegArg);
    }
    else {
      // Initially, pick some clobbered or dirty register that is neither
      // in 'used' nor in 'dstMove'. That's the safest bet as the register
      // won't collide with anything right now.
      uint32_t regs = wd.workRegs & ~(usedRegs | wd.dstRegs);

      // If that didn't work out, pick some register which is not in 'used'.
      if (!regs) regs = wd.workRegs & ~usedRegs;

      // If that didn't work out, we have to make one more register dirty.
      if (!regs) regs = wd.archRegs & ~wd.workRegs;

      // If that didn't work out we can't continue.
      if (ASMJIT_UNLIKELY(!regs))
        return DebugUtils::errored(kErrorNoMorePhysRegs);

      saRegId = Utils::findFirstBit(regs);
      ffi.setStackArgsRegId(saRegId);
    }
  }
  else {
    ffi.setStackArgsRegId(X86Gp::kIdBp);
  }

  return kErrorOk;
}

// ============================================================================
// [asmjit::X86Internal - CallConv]
// ============================================================================

ASMJIT_FAVOR_SIZE Error X86Internal::initCallConv(CallConv& cc, uint32_t ccId) noexcept {
  const uint32_t kKindGp  = X86Reg::kKindGp;
  const uint32_t kKindVec = X86Reg::kKindVec;
  const uint32_t kKindMm  = X86Reg::kKindMm;
  const uint32_t kKindK   = X86Reg::kKindK;

  const uint32_t kZax = X86Gp::kIdAx;
  const uint32_t kZbx = X86Gp::kIdBx;
  const uint32_t kZcx = X86Gp::kIdCx;
  const uint32_t kZdx = X86Gp::kIdDx;
  const uint32_t kZsp = X86Gp::kIdSp;
  const uint32_t kZbp = X86Gp::kIdBp;
  const uint32_t kZsi = X86Gp::kIdSi;
  const uint32_t kZdi = X86Gp::kIdDi;

  switch (ccId) {
    case CallConv::kIdX86StdCall:
      cc.setFlags(CallConv::kFlagCalleePopsStack);
      goto X86CallConv;

    case CallConv::kIdX86MsThisCall:
      cc.setFlags(CallConv::kFlagCalleePopsStack);
      cc.setPassedOrder(kKindGp, kZcx);
      goto X86CallConv;

    case CallConv::kIdX86MsFastCall:
    case CallConv::kIdX86GccFastCall:
      cc.setFlags(CallConv::kFlagCalleePopsStack);
      cc.setPassedOrder(kKindGp, kZcx, kZdx);
      goto X86CallConv;

    case CallConv::kIdX86GccRegParm1:
      cc.setPassedOrder(kKindGp, kZax);
      goto X86CallConv;

    case CallConv::kIdX86GccRegParm2:
      cc.setPassedOrder(kKindGp, kZax, kZdx);
      goto X86CallConv;

    case CallConv::kIdX86GccRegParm3:
      cc.setPassedOrder(kKindGp, kZax, kZdx, kZcx);
      goto X86CallConv;

    case CallConv::kIdX86CDecl:
X86CallConv:
      cc.setNaturalStackAlignment(4);
      cc.setArchType(ArchInfo::kTypeX86);
      cc.setPreservedRegs(kKindGp, Utils::mask(kZbx, kZsp, kZbp, kZsi, kZdi));
      break;

    case CallConv::kIdX86Win64:
      cc.setArchType(ArchInfo::kTypeX64);
      cc.setAlgorithm(CallConv::kAlgorithmWin64);
      cc.setFlags(CallConv::kFlagPassFloatsByVec | CallConv::kFlagIndirectVecArgs);
      cc.setNaturalStackAlignment(16);
      cc.setSpillZoneSize(32);
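      // The 32-byte spill zone is Win64's "shadow space" - an area the callee
      // may use to spill RCX, RDX, R8 and R9 (the four GP argument registers
      // assigned just below, together with XMM0..XMM3 for vector arguments).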
      cc.setPassedOrder(kKindGp, kZcx, kZdx, 8, 9);
      cc.setPassedOrder(kKindVec, 0, 1, 2, 3);
      cc.setPreservedRegs(kKindGp, Utils::mask(kZbx, kZsp, kZbp, kZsi, kZdi, 12, 13, 14, 15));
      cc.setPreservedRegs(kKindVec, Utils::mask(6, 7, 8, 9, 10, 11, 12, 13, 14, 15));
      break;

    case CallConv::kIdX86SysV64:
      cc.setArchType(ArchInfo::kTypeX64);
      cc.setFlags(CallConv::kFlagPassFloatsByVec);
      cc.setNaturalStackAlignment(16);
      cc.setRedZoneSize(128);
      cc.setPassedOrder(kKindGp, kZdi, kZsi, kZdx, kZcx, 8, 9);
      cc.setPassedOrder(kKindVec, 0, 1, 2, 3, 4, 5, 6, 7);
      cc.setPreservedRegs(kKindGp, Utils::mask(kZbx, kZsp, kZbp, 12, 13, 14, 15));
      break;

    case CallConv::kIdX86FastEval2:
    case CallConv::kIdX86FastEval3:
    case CallConv::kIdX86FastEval4: {
      uint32_t n = ccId - CallConv::kIdX86FastEval2;

      cc.setArchType(ArchInfo::kTypeX86);
      cc.setFlags(CallConv::kFlagPassFloatsByVec);
      cc.setNaturalStackAlignment(16);
      cc.setPassedOrder(kKindGp, kZax, kZdx, kZcx, kZsi, kZdi);
      cc.setPassedOrder(kKindMm, 0, 1, 2, 3, 4, 5, 6, 7);
      cc.setPassedOrder(kKindVec, 0, 1, 2, 3, 4, 5, 6, 7);

      cc.setPreservedRegs(kKindGp , Utils::bits(8));
      cc.setPreservedRegs(kKindVec, Utils::bits(8) & ~Utils::bits(n));
      cc.setPreservedRegs(kKindMm , Utils::bits(8));
      cc.setPreservedRegs(kKindK  , Utils::bits(8));
      break;
    }

    case CallConv::kIdX64FastEval2:
    case CallConv::kIdX64FastEval3:
    case CallConv::kIdX64FastEval4: {
      uint32_t n = ccId - CallConv::kIdX64FastEval2;

      cc.setArchType(ArchInfo::kTypeX64);
      cc.setFlags(CallConv::kFlagPassFloatsByVec);
      cc.setNaturalStackAlignment(16);
      cc.setPassedOrder(kKindGp, kZax, kZdx, kZcx, kZsi, kZdi);
      cc.setPassedOrder(kKindMm, 0, 1, 2, 3, 4, 5, 6, 7);
      cc.setPassedOrder(kKindVec, 0, 1, 2, 3, 4, 5, 6, 7);

      cc.setPreservedRegs(kKindGp , Utils::bits(16));
      cc.setPreservedRegs(kKindVec, ~Utils::bits(n));
      cc.setPreservedRegs(kKindMm , Utils::bits(8));
      cc.setPreservedRegs(kKindK  , Utils::bits(8));
      break;
    }

    default:
      return DebugUtils::errored(kErrorInvalidArgument);
  }

  cc.setId(ccId);
  return kErrorOk;
}

// ============================================================================
// [asmjit::X86Internal - FuncDetail]
// ============================================================================

ASMJIT_FAVOR_SIZE Error X86Internal::initFuncDetail(FuncDetail& func, const FuncSignature& sign, uint32_t gpSize) noexcept {
  const CallConv& cc = func.getCallConv();
  uint32_t archType = cc.getArchType();

  uint32_t i;
  uint32_t argCount = func.getArgCount();

  if (func.getRetCount() != 0) {
    uint32_t typeId = func._rets[0].getTypeId();
    switch (typeId) {
      case TypeId::kI64:
      case TypeId::kU64: {
        if (archType == ArchInfo::kTypeX86) {
          // Convert a 64-bit return to two 32-bit returns.
          func._retCount = 2;
          typeId -= 2;

          // A 64-bit value is returned in EDX:EAX on X86.
          func._rets[0].initReg(typeId, X86Gp::kRegGpd, X86Gp::kIdAx);
          func._rets[1].initReg(typeId, X86Gp::kRegGpd, X86Gp::kIdDx);
          break;
        }
        else {
          func._rets[0].initReg(typeId, X86Gp::kRegGpq, X86Gp::kIdAx);
        }
        break;
      }

      case TypeId::kI8:
      case TypeId::kU8:
      case TypeId::kI16:
      case TypeId::kU16:
      case TypeId::kI32:
      case TypeId::kU32: {
        func._rets[0].assignToReg(X86Gp::kRegGpd, X86Gp::kIdAx);
        break;
      }

      case TypeId::kF32:
      case TypeId::kF64: {
        uint32_t regType = (archType == ArchInfo::kTypeX86) ? X86Reg::kRegFp : X86Reg::kRegXmm;
        func._rets[0].assignToReg(regType, 0);
        break;
      }

      case TypeId::kF80: {
        // 80-bit floats are always returned in FP0.
        func._rets[0].assignToReg(X86Reg::kRegFp, 0);
        break;
      }

      case TypeId::kMmx32:
      case TypeId::kMmx64: {
        // On X64, MM registers are returned through XMM (default) or GPQ (Win64).
        uint32_t regType = X86Reg::kRegMm;
        if (archType != ArchInfo::kTypeX86)
          regType = cc.getAlgorithm() == CallConv::kAlgorithmDefault ? X86Reg::kRegXmm : X86Reg::kRegGpq;

        func._rets[0].assignToReg(regType, 0);
        break;
      }

      default: {
        func._rets[0].assignToReg(x86VecTypeIdToRegType(typeId), 0);
        break;
      }
    }
  }

  uint32_t stackBase = gpSize;
  uint32_t stackOffset = stackBase + cc._spillZoneSize;

  if (cc.getAlgorithm() == CallConv::kAlgorithmDefault) {
    uint32_t gpzPos = 0;
    uint32_t vecPos = 0;

    for (i = 0; i < argCount; i++) {
      FuncDetail::Value& arg = func._args[i];
      uint32_t typeId = arg.getTypeId();

      if (TypeId::isInt(typeId)) {
        uint32_t regId = gpzPos < CallConv::kNumRegArgsPerKind ? cc._passedOrder[X86Reg::kKindGp].id[gpzPos] : Globals::kInvalidRegId;
        if (regId != Globals::kInvalidRegId) {
          uint32_t regType = (typeId <= TypeId::kU32)
            ? X86Reg::kRegGpd
            : X86Reg::kRegGpq;
          arg.assignToReg(regType, regId);
          func.addUsedRegs(X86Reg::kKindGp, Utils::mask(regId));
          gpzPos++;
        }
        else {
          uint32_t size = std::max<uint32_t>(TypeId::sizeOf(typeId), gpSize);
          arg.assignToStack(stackOffset);
          stackOffset += size;
        }
        continue;
      }

      if (TypeId::isFloat(typeId) || TypeId::isVec(typeId)) {
        uint32_t regId = vecPos < CallConv::kNumRegArgsPerKind ? cc._passedOrder[X86Reg::kKindVec].id[vecPos] : Globals::kInvalidRegId;

        // If this is a float, but `kFlagPassFloatsByVec` is not set, we have to pass by stack.
        if (TypeId::isFloat(typeId) && !cc.hasFlag(CallConv::kFlagPassFloatsByVec))
          regId = Globals::kInvalidRegId;

        if (regId != Globals::kInvalidRegId) {
          arg.initReg(typeId, x86VecTypeIdToRegType(typeId), regId);
          func.addUsedRegs(X86Reg::kKindVec, Utils::mask(regId));
          vecPos++;
        }
        else {
          int32_t size = TypeId::sizeOf(typeId);
          arg.assignToStack(stackOffset);
          stackOffset += size;
        }
        continue;
      }
    }
  }

  if (cc.getAlgorithm() == CallConv::kAlgorithmWin64) {
    for (i = 0; i < argCount; i++) {
      FuncDetail::Value& arg = func._args[i];

      uint32_t typeId = arg.getTypeId();
      uint32_t size = TypeId::sizeOf(typeId);

      if (TypeId::isInt(typeId) || TypeId::isMmx(typeId)) {
        uint32_t regId = i < CallConv::kNumRegArgsPerKind ? cc._passedOrder[X86Reg::kKindGp].id[i] : Globals::kInvalidRegId;
        if (regId != Globals::kInvalidRegId) {
          uint32_t regType = (size <= 4 && !TypeId::isMmx(typeId))
            ? X86Reg::kRegGpd
            : X86Reg::kRegGpq;

          arg.assignToReg(regType, regId);
          func.addUsedRegs(X86Reg::kKindGp, Utils::mask(regId));
        }
        else {
          arg.assignToStack(stackOffset);
          stackOffset += gpSize;
        }
        continue;
      }

      if (TypeId::isFloat(typeId) || TypeId::isVec(typeId)) {
        uint32_t regId = i < CallConv::kNumRegArgsPerKind ? cc._passedOrder[X86Reg::kKindVec].id[i] : Globals::kInvalidRegId;
        if (regId != Globals::kInvalidRegId && (TypeId::isFloat(typeId) || cc.hasFlag(CallConv::kFlagVectorCall))) {
          uint32_t regType = x86VecTypeIdToRegType(typeId);
          uint32_t regId = cc._passedOrder[X86Reg::kKindVec].id[i];

          arg.assignToReg(regType, regId);
          func.addUsedRegs(X86Reg::kKindVec, Utils::mask(regId));
        }
        else {
          arg.assignToStack(stackOffset);
          stackOffset += 8; // Always 8 bytes (float/double).
        }
        continue;
      }
    }
  }

  func._argStackSize = stackOffset - stackBase;
  return kErrorOk;
}
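
// Illustrative example (assuming the SysV64 convention): for a signature like
// `int f(int, double)` the code above assigns the first argument to EDI (the
// first GP id in `_passedOrder`) and the second to XMM0, so `_argStackSize`
// stays 0 because nothing spills to the stack.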

// ============================================================================
// [asmjit::X86Internal - FrameLayout]
// ============================================================================

ASMJIT_FAVOR_SIZE Error X86Internal::initFrameLayout(FuncFrameLayout& layout, const FuncDetail& func, const FuncFrameInfo& ffi) noexcept {
  layout.reset();

  uint32_t kind;
  uint32_t gpSize = (func.getCallConv().getArchType() == ArchInfo::kTypeX86) ? 4 : 8;

  // Calculate a bit-mask of all registers that must be saved & restored.
  for (kind = 0; kind < Globals::kMaxVRegKinds; kind++)
    layout._savedRegs[kind] = (ffi.getDirtyRegs(kind) & ~func.getPassedRegs(kind)) & func.getPreservedRegs(kind);

  // Include EBP|RBP if the function preserves the frame-pointer.
  if (ffi.hasPreservedFP()) {
    layout._preservedFP = true;
    layout._savedRegs[X86Reg::kKindGp] |= Utils::mask(X86Gp::kIdBp);
  }

  // Exclude ESP|RSP - this register is never included in saved-regs.
  layout._savedRegs[X86Reg::kKindGp] &= ~Utils::mask(X86Gp::kIdSp);

  // Calculate the final stack alignment.
  uint32_t stackAlignment =
    std::max<uint32_t>(
      std::max<uint32_t>(
        ffi.getStackFrameAlignment(),
        ffi.getCallFrameAlignment()),
      func.getCallConv().getNaturalStackAlignment());
  layout._stackAlignment = static_cast<uint8_t>(stackAlignment);

  // Calculate if dynamic stack alignment is required. If true the function
  // has to align the stack dynamically to match `_stackAlignment` and will
  // access its stack-based arguments through `_stackArgsRegId`.
  bool dsa = stackAlignment > func.getCallConv().getNaturalStackAlignment() && stackAlignment >= 16;
  layout._dynamicAlignment = dsa;

  // This flag describes if the prolog inserter must store the previous ESP|RSP
  // to stack so the epilog inserter can load the stack from it before returning.
  bool dsaSlotUsed = dsa && !ffi.hasPreservedFP();
  layout._dsaSlotUsed = dsaSlotUsed;

  // These two are identical if the function doesn't align its stack dynamically.
  uint32_t stackArgsRegId = ffi.getStackArgsRegId();
  if (stackArgsRegId == Globals::kInvalidRegId)
    stackArgsRegId = X86Gp::kIdSp;

  // Fix the stack arguments base-register from ESP|RSP to EBP|RBP in case it
  // was not picked before and the function performs dynamic stack alignment.
  if (dsa && stackArgsRegId == X86Gp::kIdSp)
    stackArgsRegId = X86Gp::kIdBp;

  if (stackArgsRegId != X86Gp::kIdSp)
    layout._savedRegs[X86Reg::kKindGp] |= Utils::mask(stackArgsRegId) & func.getPreservedRegs(X86Gp::kKindGp);

  layout._stackBaseRegId = X86Gp::kIdSp;
  layout._stackArgsRegId = static_cast<uint8_t>(stackArgsRegId);

  // Setup the stack size used to save preserved registers.
  layout._gpStackSize  = Utils::bitCount(layout.getSavedRegs(X86Reg::kKindGp )) * gpSize;
  layout._vecStackSize = Utils::bitCount(layout.getSavedRegs(X86Reg::kKindVec)) * 16 +
                         Utils::bitCount(layout.getSavedRegs(X86Reg::kKindMm )) * 8 ;

  uint32_t v = 0;                         // The beginning of the stack frame, aligned to CallFrame alignment.
  v += ffi._callFrameSize;                // Count '_callFrameSize'  <- This is used to call functions.
  v  = Utils::alignTo(v, stackAlignment); // Align to the function's SA.

  layout._stackBaseOffset = v;            // Store '_stackBaseOffset'<- The function's own stack starts here.
  v += ffi._stackFrameSize;               // Count '_stackFrameSize' <- The function's own stack ends here.

  // If the function's stack is aligned, calculate the alignment necessary to
  // store vector registers, and set `FuncFrameInfo::kX86FlagAlignedVecSR` to
  // inform the PrologEpilog inserter that it can use instructions that perform
  // aligned stores/loads to save/restore VEC registers.
  if (stackAlignment >= 16 && layout._vecStackSize) {
    v = Utils::alignTo(v, 16);            // Align '_vecStackOffset'.
    layout._alignedVecSR = true;
  }

  layout._vecStackOffset = v;             // Store '_vecStackOffset' <- The function's VEC Save|Restore starts here.
  v += layout._vecStackSize;              // Count '_vecStackSize'   <- The function's VEC Save|Restore ends here.

  if (dsaSlotUsed) {
    layout._dsaSlot = v;                  // Store '_dsaSlot'        <- The old stack pointer is stored here.
    v += gpSize;
  }

  // The return address should be stored after GP save/restore regs. It has
  // the same size as `gpSize` (basically the native register/pointer size).
  // We don't adjust it now as `v` now contains the exact size that the
  // function requires to adjust (call frame + stack frame, vec stack size).
  // The stack (if we consider this size) is misaligned now, as it's always
  // aligned before the function call - when `call()` is executed it pushes
  // the current EIP|RIP onto the stack, and misaligns it by 12 or 8 bytes
  // (depending on the architecture). So count the number of bytes needed to
  // align it up to the function's CallFrame (the beginning).
  if (v || ffi.hasCalls())
    v += Utils::alignDiff(v + layout._gpStackSize + gpSize, stackAlignment);

  layout._stackAdjustment = v;            // Store '_stackAdjustment'<- SA used by 'add zsp, SA' and 'sub zsp, SA'.
  layout._gpStackOffset = v;              // Store '_gpStackOffset'  <- The function's GP Save|Restore starts here.
  v += layout._gpStackSize;               // Count '_gpStackSize'    <- The function's GP Save|Restore ends here.

  v += gpSize;                            // Count 'ReturnAddress'.
  v += func.getSpillZoneSize();           // Count 'SpillZoneSize'.

  // Calculate where the function arguments start, relative to `stackArgsRegId`.
  // If the register that will be used to access arguments passed by stack is
  // ESP|RSP then it's exactly where we are now, otherwise we must calculate
  // how many 'push regs' we did and adjust the offset based on that.
  uint32_t stackArgsOffset = v;
  if (stackArgsRegId != X86Gp::kIdSp) {
    if (ffi.hasPreservedFP())
      stackArgsOffset = gpSize;
    else
      stackArgsOffset = layout._gpStackSize;
  }
  layout._stackArgsOffset = stackArgsOffset;

  // If the function does dynamic stack adjustment then the stack-adjustment
  // must be aligned.
  if (dsa)
    layout._stackAdjustment = Utils::alignTo(layout._stackAdjustment, stackAlignment);

  // Initialize variables based on CallConv flags.
  if (func.hasFlag(CallConv::kFlagCalleePopsStack))
    layout._calleeStackCleanup = static_cast<uint16_t>(func.getArgStackSize());

  // Initialize variables based on FFI flags.
  layout._mmxCleanup = ffi.hasMmxCleanup();
  layout._avxEnabled = ffi.isAvxEnabled();
  layout._avxCleanup = ffi.hasAvxCleanup();

  return kErrorOk;
}
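
// A sketch of the frame this computes, growing from low to high addresses
// (the exact offsets depend on the layout computed above):
//
//   [zsp + 0 .................] call frame (outgoing arguments)
//   [zsp + _stackBaseOffset ...] function's own stack frame
//   [zsp + _vecStackOffset ....] saved VEC (XMM/MM) registers
//   [zsp + _dsaSlot ...........] saved old stack pointer (only if used)
//   [zsp + _gpStackOffset .....] saved GP registers (pushed by the prolog)
//   [..........................] return address, spill zone, stack arguments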

// ============================================================================
// [asmjit::X86Internal - ArgsToFrameInfo]
// ============================================================================

ASMJIT_FAVOR_SIZE Error X86Internal::argsToFrameInfo(const FuncArgsMapper& args, FuncFrameInfo& ffi) noexcept {
  X86FuncArgsContext ctx;
  ASMJIT_PROPAGATE(ctx.initWorkData(args, ffi._dirtyRegs, ffi.hasPreservedFP()));

  ASMJIT_PROPAGATE(ctx.markDstRegsDirty(ffi));
  ASMJIT_PROPAGATE(ctx.markRegsForSwaps(ffi));
  ASMJIT_PROPAGATE(ctx.markStackArgsReg(ffi));
  return kErrorOk;
}

// ============================================================================
// [asmjit::X86Internal - Emit Helpers]
// ============================================================================

ASMJIT_FAVOR_SIZE Error X86Internal::emitRegMove(X86Emitter* emitter,
  const Operand_& dst_,
  const Operand_& src_, uint32_t typeId, bool avxEnabled, const char* comment) {

  // Invalid or abstract TypeIds are not allowed.
  ASMJIT_ASSERT(TypeId::isValid(typeId) && !TypeId::isAbstract(typeId));

  Operand dst(dst_);
  Operand src(src_);

  uint32_t instId = Inst::kIdNone;
  uint32_t memFlags = 0;

  enum MemFlags {
    kDstMem = 0x1,
    kSrcMem = 0x2
  };

  // Detect memory operands and patch them to have the same size as the register.
  // CodeCompiler always sets the memory size of allocs and spills, so this
  // shouldn't really be necessary; however, after this function was separated
  // from the Compiler it's better to make sure that the size is always
  // specified, as 'movzx' and 'movsx' rely on it.
  if (dst.isMem()) { memFlags |= kDstMem; dst.as<X86Mem>().setSize(src.getSize()); }
  if (src.isMem()) { memFlags |= kSrcMem; src.as<X86Mem>().setSize(dst.getSize()); }

  switch (typeId) {
    case TypeId::kI8:
    case TypeId::kU8:
    case TypeId::kI16:
    case TypeId::kU16:
      // Special case - 'movzx' load.
      if (memFlags & kSrcMem) {
        instId = X86Inst::kIdMovzx;
        dst.setSignature(X86RegTraits<X86Reg::kRegGpd>::kSignature);
      }
      else if (!memFlags) {
        // Change both destination and source registers to GPD (safer, no dependencies).
        dst.setSignature(X86RegTraits<X86Reg::kRegGpd>::kSignature);
        src.setSignature(X86RegTraits<X86Reg::kRegGpd>::kSignature);
      }
      ASMJIT_FALLTHROUGH;

    case TypeId::kI32:
    case TypeId::kU32:
    case TypeId::kI64:
    case TypeId::kU64:
      instId = X86Inst::kIdMov;
      break;

    case TypeId::kMmx32:
      instId = X86Inst::kIdMovd;
      if (memFlags) break;
      ASMJIT_FALLTHROUGH;
    case TypeId::kMmx64 : instId = X86Inst::kIdMovq ; break;
    case TypeId::kMask8 : instId = X86Inst::kIdKmovb; break;
    case TypeId::kMask16: instId = X86Inst::kIdKmovw; break;
    case TypeId::kMask32: instId = X86Inst::kIdKmovd; break;
    case TypeId::kMask64: instId = X86Inst::kIdKmovq; break;

    default: {
      uint32_t elementTypeId = TypeId::elementOf(typeId);
      if (TypeId::isVec32(typeId) && memFlags) {
        if (elementTypeId == TypeId::kF32)
          instId = avxEnabled ? X86Inst::kIdVmovss : X86Inst::kIdMovss;
        else
          instId = avxEnabled ? X86Inst::kIdVmovd : X86Inst::kIdMovd;
        break;
      }

      if (TypeId::isVec64(typeId) && memFlags) {
        if (elementTypeId == TypeId::kF64)
          instId = avxEnabled ? X86Inst::kIdVmovsd : X86Inst::kIdMovsd;
        else
          instId = avxEnabled ? X86Inst::kIdVmovq : X86Inst::kIdMovq;
        break;
      }

      if (elementTypeId == TypeId::kF32)
        instId = avxEnabled ? X86Inst::kIdVmovaps : X86Inst::kIdMovaps;
      else if (elementTypeId == TypeId::kF64)
        instId = avxEnabled ? X86Inst::kIdVmovapd : X86Inst::kIdMovapd;
      else if (typeId <= TypeId::_kVec256End)
        instId = avxEnabled ? X86Inst::kIdVmovdqa : X86Inst::kIdMovdqa;
      else if (elementTypeId <= TypeId::kU32)
        instId = X86Inst::kIdVmovdqa32;
      else
        instId = X86Inst::kIdVmovdqa64;
      break;
    }
  }

  if (!instId)
    return DebugUtils::errored(kErrorInvalidState);

  emitter->setInlineComment(comment);
  return emitter->emit(instId, dst, src);
}
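
// For example (illustrative only), a 32-bit integer spill such as
//   emitRegMove(e, x86::dword_ptr(x86::rsp, 16), x86::eax, TypeId::kU32, false, nullptr);
// resolves to 'mov dword [rsp+16], eax', while the same call with
// TypeId::kF32x4 and two XMM operands would resolve to 'movaps'.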

ASMJIT_FAVOR_SIZE Error X86Internal::emitArgMove(X86Emitter* emitter,
  const X86Reg& dst_, uint32_t dstTypeId,
  const Operand_& src_, uint32_t srcTypeId, bool avxEnabled, const char* comment) {

  // Deduce optional `dstTypeId`, which may be `TypeId::kVoid` in some cases.
  if (!dstTypeId) dstTypeId = x86OpData.archRegs.regTypeToTypeId[dst_.getType()];

  // Invalid or abstract TypeIds are not allowed.
  ASMJIT_ASSERT(TypeId::isValid(dstTypeId) && !TypeId::isAbstract(dstTypeId));
  ASMJIT_ASSERT(TypeId::isValid(srcTypeId) && !TypeId::isAbstract(srcTypeId));

  X86Reg dst(dst_);
  Operand src(src_);

  uint32_t dstSize = TypeId::sizeOf(dstTypeId);
  uint32_t srcSize = TypeId::sizeOf(srcTypeId);

  int32_t instId = Inst::kIdNone;

  // Not a real loop, just 'break' is nicer than 'goto'.
  for (;;) {
    if (TypeId::isInt(dstTypeId)) {
      if (TypeId::isInt(srcTypeId)) {
        instId = X86Inst::kIdMovsx;
        uint32_t typeOp = (dstTypeId << 8) | srcTypeId;

        // Sign extend by using 'movsx'.
        if (typeOp == ((TypeId::kI16 << 8) | TypeId::kI8 ) ||
            typeOp == ((TypeId::kI32 << 8) | TypeId::kI8 ) ||
            typeOp == ((TypeId::kI32 << 8) | TypeId::kI16) ||
            typeOp == ((TypeId::kI64 << 8) | TypeId::kI8 ) ||
            typeOp == ((TypeId::kI64 << 8) | TypeId::kI16)) break;

        // Sign extend by using 'movsxd'.
        instId = X86Inst::kIdMovsxd;
        if (typeOp == ((TypeId::kI64 << 8) | TypeId::kI32)) break;
      }

      if (TypeId::isInt(srcTypeId) || src_.isMem()) {
        // Zero extend by using 'movzx' or 'mov'.
        if (dstSize <= 4 && srcSize < 4) {
          instId = X86Inst::kIdMovzx;
          dst.setSignature(X86Reg::signatureOfT<X86Reg::kRegGpd>());
        }
        else {
          // We should have caught all possibilities where `srcSize` is less
          // than 4, so we don't have to worry about 'movzx' anymore. The
          // minimum size is enough to determine whether we want a 32-bit or
          // a 64-bit move.
          instId = X86Inst::kIdMov;
          srcSize = std::min(srcSize, dstSize);

          dst.setSignature(srcSize == 4 ? X86Reg::signatureOfT<X86Reg::kRegGpd>()
                                        : X86Reg::signatureOfT<X86Reg::kRegGpq>());
          if (src.isReg()) src.setSignature(dst.getSignature());
        }
        break;
      }

      // NOTE: The previous branch caught all memory sources, so from here on
      // it's always a register-to-register conversion; catch the remaining cases.
      srcSize = std::min(srcSize, dstSize);

      if (TypeId::isMmx(srcTypeId)) {
        // 64-bit move.
        instId = X86Inst::kIdMovq;
        if (srcSize == 8) break;

        // 32-bit move.
        instId = X86Inst::kIdMovd;
        dst.setSignature(X86Reg::signatureOfT<X86Reg::kRegGpd>());
        break;
      }

      if (TypeId::isMask(srcTypeId)) {
        instId = X86Inst::kmovIdFromSize(srcSize);
        dst.setSignature(srcSize <= 4 ? X86Reg::signatureOfT<X86Reg::kRegGpd>()
                                      : X86Reg::signatureOfT<X86Reg::kRegGpq>());
        break;
      }

      if (TypeId::isVec(srcTypeId)) {
        // 64-bit move.
        instId = avxEnabled ? X86Inst::kIdVmovq : X86Inst::kIdMovq;
        if (srcSize == 8) break;

        // 32-bit move.
        instId = avxEnabled ? X86Inst::kIdVmovd : X86Inst::kIdMovd;
        dst.setSignature(X86Reg::signatureOfT<X86Reg::kRegGpd>());
        break;
      }
    }

    if (TypeId::isMmx(dstTypeId)) {
      instId = X86Inst::kIdMovq;
      srcSize = std::min(srcSize, dstSize);

      if (TypeId::isInt(srcTypeId) || src.isMem()) {
        // 64-bit move.
        if (srcSize == 8) break;

        // 32-bit move.
        instId = X86Inst::kIdMovd;
        if (src.isReg()) src.setSignature(X86Reg::signatureOfT<X86Reg::kRegGpd>());
        break;
      }

      if (TypeId::isMmx(srcTypeId)) break;

      // NOTE: This will hurt if `avxEnabled`.
      instId = X86Inst::kIdMovdq2q;
      if (TypeId::isVec(srcTypeId)) break;
    }

    if (TypeId::isMask(dstTypeId)) {
      srcSize = std::min(srcSize, dstSize);

      if (TypeId::isInt(srcTypeId) || TypeId::isMask(srcTypeId) || src.isMem()) {
        instId = X86Inst::kmovIdFromSize(srcSize);
        if (X86Reg::isGp(src) && srcSize <= 4) src.setSignature(X86Reg::signatureOfT<X86Reg::kRegGpd>());
        break;
      }
    }

    if (TypeId::isVec(dstTypeId)) {
      // By default set destination to XMM, will be set to YMM|ZMM if needed.
      dst.setSignature(X86Reg::signatureOfT<X86Reg::kRegXmm>());

      // NOTE: This will hurt if `avxEnabled`.
      if (X86Reg::isMm(src)) {
        // 64-bit move.
        instId = X86Inst::kIdMovq2dq;
        break;
      }

      // Argument conversion.
      uint32_t dstElement = TypeId::elementOf(dstTypeId);
      uint32_t srcElement = TypeId::elementOf(srcTypeId);

      if (dstElement == TypeId::kF32 && srcElement == TypeId::kF64) {
        srcSize = std::min(dstSize * 2, srcSize);
        dstSize = srcSize / 2;

        if (srcSize <= 8)
          instId = avxEnabled ? X86Inst::kIdVcvtsd2ss : X86Inst::kIdCvtsd2ss;
        else
          instId = avxEnabled ? X86Inst::kIdVcvtpd2ps : X86Inst::kIdCvtpd2ps;

        if (dstSize == 32)
          dst.setSignature(X86Reg::signatureOfT<X86Reg::kRegYmm>());
        if (src.isReg())
          src.setSignature(X86Reg::signatureOfVecBySize(srcSize));
        break;
      }

      if (dstElement == TypeId::kF64 && srcElement == TypeId::kF32) {
        srcSize = std::min(dstSize, srcSize * 2) / 2;
        dstSize = srcSize * 2;

        if (srcSize <= 4)
          instId = avxEnabled ? X86Inst::kIdVcvtss2sd : X86Inst::kIdCvtss2sd;
        else
          instId = avxEnabled ? X86Inst::kIdVcvtps2pd : X86Inst::kIdCvtps2pd;

        dst.setSignature(X86Reg::signatureOfVecBySize(dstSize));
        if (src.isReg() && srcSize >= 32)
          src.setSignature(X86Reg::signatureOfT<X86Reg::kRegYmm>());
        break;
      }

      srcSize = std::min(srcSize, dstSize);
      if (X86Reg::isGp(src) || src.isMem()) {
        // 32-bit move.
        if (srcSize <= 4) {
          instId = avxEnabled ? X86Inst::kIdVmovd : X86Inst::kIdMovd;
          if (src.isReg()) src.setSignature(X86Reg::signatureOfT<X86Reg::kRegGpd>());
          break;
        }

        // 64-bit move.
        if (srcSize == 8) {
          instId = avxEnabled ? X86Inst::kIdVmovq : X86Inst::kIdMovq;
          break;
        }
      }

      if (X86Reg::isVec(src) || src.isMem()) {
        instId = avxEnabled ? X86Inst::kIdVmovaps : X86Inst::kIdMovaps;
        uint32_t sign = X86Reg::signatureOfVecBySize(srcSize);

        dst.setSignature(sign);
        if (src.isReg()) src.setSignature(sign);
        break;
      }
    }

    return DebugUtils::errored(kErrorInvalidState);
  }

  if (src.isMem())
    src.as<X86Mem>().setSize(srcSize);

  emitter->setInlineComment(comment);
  return emitter->emit(instId, dst, src);
}
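
// For example (illustrative only), widening a signed 8-bit argument into a
// 32-bit register, as in
//   emitArgMove(e, x86::edx, TypeId::kI32, x86::al, TypeId::kI8, false, nullptr);
// takes the 'movsx' path above and emits 'movsx edx, al'.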

// ============================================================================
// [asmjit::X86Internal - Emit Prolog & Epilog]
// ============================================================================

ASMJIT_FAVOR_SIZE Error X86Internal::emitProlog(X86Emitter* emitter, const FuncFrameLayout& layout) {
  uint32_t gpSaved = layout.getSavedRegs(X86Reg::kKindGp);

  X86Gp zsp = emitter->zsp();   // ESP|RSP register.
  X86Gp zbp = emitter->zbp();   // EBP|RBP register.
  X86Gp gpReg = emitter->zsp(); // General purpose register (temporary).
  X86Gp saReg = emitter->zsp(); // Stack-arguments base register.

  // Emit: 'push zbp'
  //       'mov  zbp, zsp'.
  if (layout.hasPreservedFP()) {
    gpSaved &= ~Utils::mask(X86Gp::kIdBp);
    ASMJIT_PROPAGATE(emitter->push(zbp));
    ASMJIT_PROPAGATE(emitter->mov(zbp, zsp));
  }

  // Emit: 'push gp' sequence.
  if (gpSaved) {
    for (uint32_t i = gpSaved, regId = 0; i; i >>= 1, regId++) {
      if (!(i & 0x1)) continue;
      gpReg.setId(regId);
      ASMJIT_PROPAGATE(emitter->push(gpReg));
    }
  }

  // Emit: 'mov saReg, zsp'.
  uint32_t stackArgsRegId = layout.getStackArgsRegId();
  if (stackArgsRegId != Globals::kInvalidRegId && stackArgsRegId != X86Gp::kIdSp) {
    saReg.setId(stackArgsRegId);
    if (!(layout.hasPreservedFP() && stackArgsRegId == X86Gp::kIdBp))
      ASMJIT_PROPAGATE(emitter->mov(saReg, zsp));
  }

  // Emit: 'and zsp, -StackAlignment'.
  if (layout.hasDynamicAlignment())
    ASMJIT_PROPAGATE(emitter->and_(zsp, -static_cast<int32_t>(layout.getStackAlignment())));

  // Emit: 'sub zsp, StackAdjustment'.
  if (layout.hasStackAdjustment())
    ASMJIT_PROPAGATE(emitter->sub(zsp, layout.getStackAdjustment()));

  // Emit: 'mov [zsp + dsaSlot], saReg'.
  if (layout.hasDynamicAlignment() && layout.hasDsaSlotUsed()) {
    X86Mem saMem = x86::ptr(zsp, layout._dsaSlot);
    ASMJIT_PROPAGATE(emitter->mov(saMem, saReg));
  }

  // Emit: 'movaps|movups [zsp + X], xmm0..15'.
  uint32_t xmmSaved = layout.getSavedRegs(X86Reg::kKindVec);
  if (xmmSaved) {
    X86Mem vecBase = x86::ptr(zsp, layout.getVecStackOffset());
    X86Reg vecReg = x86::xmm(0);

    uint32_t vecInst = x86GetXmmMovInst(layout);
    uint32_t vecSize = 16;

    for (uint32_t i = xmmSaved, regId = 0; i; i >>= 1, regId++) {
      if (!(i & 0x1)) continue;
      vecReg.setId(regId);
      ASMJIT_PROPAGATE(emitter->emit(vecInst, vecBase, vecReg));
      vecBase.addOffsetLo32(static_cast<int32_t>(vecSize));
    }
  }

  return kErrorOk;
}
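
// As an illustrative example, a 64-bit function that saves RBX and R12 and
// reserves 32 bytes of local stack would get a prolog along these lines:
//
//   push rbx
//   push r12
//   sub  rsp, 40        ; stack adjustment computed by initFrameLayout()
//
// (two 8-byte pushes plus the 8-byte return address make 24 bytes, so an
// adjustment of 40 keeps RSP 16-byte aligned at the end of the prolog).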

ASMJIT_FAVOR_SIZE Error X86Internal::emitEpilog(X86Emitter* emitter, const FuncFrameLayout& layout) {
  uint32_t i;
  uint32_t regId;

  uint32_t gpSize = emitter->getGpSize();
  uint32_t gpSaved = layout.getSavedRegs(X86Reg::kKindGp);

  X86Gp zsp = emitter->zsp();   // ESP|RSP register.
  X86Gp zbp = emitter->zbp();   // EBP|RBP register.
  X86Gp gpReg = emitter->zsp(); // General purpose register (temporary).

  // Don't emit 'pop zbp' in the pop sequence, this case is handled separately.
  if (layout.hasPreservedFP()) gpSaved &= ~Utils::mask(X86Gp::kIdBp);

  // Emit 'movaps|movups xmm0..15, [zsp + X]'.
  uint32_t xmmSaved = layout.getSavedRegs(X86Reg::kKindVec);
  if (xmmSaved) {
    X86Mem vecBase = x86::ptr(zsp, layout.getVecStackOffset());
    X86Reg vecReg = x86::xmm(0);

    uint32_t vecInst = x86GetXmmMovInst(layout);
    uint32_t vecSize = 16;

    for (i = xmmSaved, regId = 0; i; i >>= 1, regId++) {
      if (!(i & 0x1)) continue;
      vecReg.setId(regId);
      ASMJIT_PROPAGATE(emitter->emit(vecInst, vecReg, vecBase));
      vecBase.addOffsetLo32(static_cast<int32_t>(vecSize));
    }
  }

  // Emit 'emms' and 'vzeroupper'.
  if (layout.hasMmxCleanup()) ASMJIT_PROPAGATE(emitter->emms());
  if (layout.hasAvxCleanup()) ASMJIT_PROPAGATE(emitter->vzeroupper());

  if (layout.hasPreservedFP()) {
    // Emit 'mov zsp, zbp' or 'lea zsp, [zbp - x]'.
    int32_t count = static_cast<int32_t>(layout.getGpStackSize() - gpSize);
    if (!count)
      ASMJIT_PROPAGATE(emitter->mov(zsp, zbp));
    else
      ASMJIT_PROPAGATE(emitter->lea(zsp, x86::ptr(zbp, -count)));
  }
  else {
    if (layout.hasDynamicAlignment() && layout.hasDsaSlotUsed()) {
      // Emit 'mov zsp, [zsp + DsaSlot]'.
      X86Mem saMem = x86::ptr(zsp, layout._dsaSlot);
      ASMJIT_PROPAGATE(emitter->mov(zsp, saMem));
    }
    else if (layout.hasStackAdjustment()) {
      // Emit 'add zsp, StackAdjustment'.
      ASMJIT_PROPAGATE(emitter->add(zsp, static_cast<int32_t>(layout.getStackAdjustment())));
    }
  }

  // Emit 'pop gp' sequence.
  if (gpSaved) {
    i = gpSaved;
    regId = 16;

    do {
      regId--;
      if (i & 0x8000) {
        gpReg.setId(regId);
        ASMJIT_PROPAGATE(emitter->pop(gpReg));
      }
      i <<= 1;
    } while (regId != 0);
  }

  // Emit 'pop zbp'.
  if (layout.hasPreservedFP()) ASMJIT_PROPAGATE(emitter->pop(zbp));

  // Emit 'ret' or 'ret x'.
  if (layout.hasCalleeStackCleanup())
    ASMJIT_PROPAGATE(emitter->emit(X86Inst::kIdRet, static_cast<int>(layout.getCalleeStackCleanup())));
  else
    ASMJIT_PROPAGATE(emitter->emit(X86Inst::kIdRet));

  return kErrorOk;
}
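
// Continuing the prolog example above, the matching epilog would be:
//
//   add  rsp, 40
//   pop  r12
//   pop  rbx
//   ret
//
// Note that the 'pop gp' loop above walks the saved-register mask from the
// highest id down, which is why the pops come out in reverse push order.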

// ============================================================================
// [asmjit::X86Internal - AllocArgs]
// ============================================================================

ASMJIT_FAVOR_SIZE Error X86Internal::allocArgs(X86Emitter* emitter, const FuncFrameLayout& layout, const FuncArgsMapper& args) {
  typedef X86FuncArgsContext::SrcArg SrcArg;
  typedef X86FuncArgsContext::DstArg DstArg;
  typedef X86FuncArgsContext::WorkData WorkData;
  enum { kMaxVRegKinds = Globals::kMaxVRegKinds };

  uint32_t i;
  const FuncDetail& func = *args.getFuncDetail();

  X86FuncArgsContext ctx;
  ASMJIT_PROPAGATE(ctx.initWorkData(args, layout._savedRegs, layout.hasPreservedFP()));

  // We must honor AVX if it's enabled.
  bool avxEnabled = layout.isAvxEnabled();

  // Free registers that can be used as temporaries and during shuffling.
  // We initialize them to match all workRegs (registers that can be used
  // by the function) except source regs, which are used to pass arguments.
  // Free registers change during shuffling - when an argument is moved to
  // its final register then that register is removed from freeRegs (it
  // can't be altered anymore during shuffling).
  uint32_t freeRegs[kMaxVRegKinds];
  for (i = 0; i < kMaxVRegKinds; i++)
    freeRegs[i] = ctx._workData[i].workRegs & ~ctx._workData[i].srcRegs;

  // This is an iterative process that runs while there is work to do. When
  // one register is moved it can create space for another move. Such moves
  // can depend on each other, so the algorithm may run multiple times before
  // all arguments are in place. This part does only register-to-register
  // work; arguments moved from stack to register are handled later.
  for (;;) {
    bool hasWork = false; // Do we have work to do?
    bool didWork = false; // If we did something...

    uint32_t dstRegKind = kMaxVRegKinds;
    do {
      WorkData& wd = ctx._workData[--dstRegKind];
      if (wd.numOps > wd.numStackArgs) {
        hasWork = true;

        // Iterate over all destination regs and check if we can do something.
        // We always go from destination to source, never the opposite.
        uint32_t regsToDo = wd.dstRegs;
        do {
          // If there is work to do there has to be at least one dstReg.
          ASMJIT_ASSERT(regsToDo != 0);
          uint32_t dstRegId = Utils::findFirstBit(regsToDo);
          uint32_t dstRegMask = Utils::mask(dstRegId);

          uint32_t argIndex = wd.argIndex[dstRegId];
          const DstArg& dstArg = args.getArg(argIndex);
          const SrcArg& srcArg = func.getArg(argIndex);

          if (srcArg.byReg()) {
            uint32_t srcRegType = srcArg.getRegType();
            uint32_t srcRegKind = X86Reg::kindOf(srcRegType);

            if (freeRegs[dstRegKind] & dstRegMask) {
              X86Reg dstReg(X86Reg::fromTypeAndId(dstArg.getRegType(), dstRegId));
              X86Reg srcReg(X86Reg::fromTypeAndId(srcRegType, srcArg.getRegId()));

              ASMJIT_PROPAGATE(
                emitArgMove(emitter,
                  dstReg, dstArg.getTypeId(),
                  srcReg, srcArg.getTypeId(), avxEnabled));
              freeRegs[dstRegKind] ^= dstRegMask;                     // Make the DST reg occupied.
              freeRegs[srcRegKind] |= Utils::mask(srcArg.getRegId()); // Make the SRC reg free.

              ASMJIT_ASSERT(wd.numOps >= 1);
              wd.numOps--;
              didWork = true;
            }
            else {
              // Check if this is a swap operation.
              if (dstRegKind == srcRegKind) {
                uint32_t srcRegId = srcArg.getRegId();

                uint32_t otherIndex = wd.argIndex[srcRegId];
                const DstArg& otherArg = args.getArg(otherIndex);

                if (otherArg.getRegId() == srcRegId && X86Reg::kindOf(otherArg.getRegType()) == dstRegKind) {
                  // If this is a GP reg the swap can be handled by 'xchg'.
                  if (dstRegKind == X86Reg::kKindGp) {
                    uint32_t highestType = std::max(dstArg.getRegType(), srcRegType);

                    X86Reg dstReg = x86::gpd(dstRegId);
                    X86Reg srcReg = x86::gpd(srcRegId);

                    if (highestType == X86Reg::kRegGpq) {
                      dstReg.setSignature(X86RegTraits<X86Reg::kRegGpq>::kSignature);
                      srcReg.setSignature(X86RegTraits<X86Reg::kRegGpq>::kSignature);
                    }
                    ASMJIT_PROPAGATE(emitter->emit(X86Inst::kIdXchg, dstReg, srcReg));
                    regsToDo &= ~Utils::mask(srcRegId);
                    freeRegs[dstRegKind] &= ~(Utils::mask(srcRegId) | dstRegMask);

                    ASMJIT_ASSERT(wd.numOps >= 2);
                    ASMJIT_ASSERT(wd.numSwaps >= 1);
                    wd.numOps -= 2;
                    wd.numSwaps--;
                    didWork = true;
                  }
                }
              }
            }
          }

          // Clear the reg in `regsToDo` and continue if there are more.
          regsToDo ^= dstRegMask;
        } while (regsToDo);
      }
    } while (dstRegKind);

    if (!hasWork)
      break;

    if (!didWork)
      return DebugUtils::errored(kErrorInvalidState);
  }

  // Load arguments passed by stack into registers. This is pretty simple and
  // it never requires multiple iterations like the previous phase.
  if (ctx._hasStackArgs) {
    // Base address of all arguments passed by stack.
    X86Mem saBase = x86::ptr(emitter->gpz(layout.getStackArgsRegId()), layout.getStackArgsOffset());

    uint32_t dstRegKind = kMaxVRegKinds;
    do {
      WorkData& wd = ctx._workData[--dstRegKind];
      if (wd.numStackArgs) {
        // Iterate over all destination regs and check if we can do something.
        // We always go from destination to source, never the opposite.
        uint32_t regsToDo = wd.dstRegs;
        do {
          // If there is work to do there has to be at least one dstReg.
          ASMJIT_ASSERT(regsToDo != 0);
          ASMJIT_ASSERT(wd.numOps > 0);

          uint32_t dstRegId = Utils::findFirstBit(regsToDo);
          uint32_t dstRegMask = Utils::mask(dstRegId);

          uint32_t argIndex = wd.argIndex[dstRegId];
          const DstArg& dstArg = args.getArg(argIndex);
          const SrcArg& srcArg = func.getArg(argIndex);

          // Only arguments passed by stack should remain, and the destination
          // registers must be free now (otherwise the first part of the
          // algorithm failed). Ideally this should be an assert, but it's much
          // safer to enforce this in release builds as well.
          if (!srcArg.byStack() || !(freeRegs[dstRegKind] & dstRegMask))
            return DebugUtils::errored(kErrorInvalidState);

          X86Reg dstReg = X86Reg::fromTypeAndId(dstArg.getRegType(), dstRegId);
          X86Mem srcMem = saBase.adjusted(srcArg.getStackOffset());

          ASMJIT_PROPAGATE(
            emitArgMove(emitter,
              dstReg, dstArg.getTypeId(),
              srcMem, srcArg.getTypeId(), avxEnabled));

          freeRegs[dstRegKind] ^= dstRegMask;
          regsToDo ^= dstRegMask;
          wd.numOps--;
        } while (regsToDo);
      }
    } while (dstRegKind);
  }

  return kErrorOk;
}
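
// Illustrative example: remapping SysV64 args so that arg0 (RDI) should land
// in RSI and arg1 (RSI) in RDI. Neither destination is free, so the first
// pass detects the cycle as a swap and resolves both moves with a single
// 'xchg rsi, rdi', decrementing numOps by two.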

} // asmjit namespace
} // namespace PLMD

// [Api-End]
#include "./asmjit_apiend.h"

// [Guard]
#endif // ASMJIT_BUILD_X86
#pragma GCC diagnostic pop
#endif // __PLUMED_HAS_ASMJIT