1 /****************************************************************************
2  * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * @file builder_misc.cpp
24  *
25  * @brief Implementation for miscellaneous builder functions
26  *
27  * Notes:
28  *
29  ******************************************************************************/
#include "jit_pch.hpp"
#include "builder.h"
#include "common/rdtsc_buckets.h"

#include <cstdarg>
#include <cstdint>
#include <cstring>
35 
36 extern "C" void CallPrint(const char* fmt, ...);
37 
38 namespace SwrJit
39 {
40     //////////////////////////////////////////////////////////////////////////
41     /// @brief Convert an IEEE 754 32-bit single precision float to an
42     ///        16 bit float with 5 exponent bits and a variable
43     ///        number of mantissa bits.
44     /// @param val - 32-bit float
45     /// @todo Maybe move this outside of this file into a header?
ConvertFloat32ToFloat16(float val)46     static uint16_t ConvertFloat32ToFloat16(float val)
47     {
48         uint32_t sign, exp, mant;
49         uint32_t roundBits;
50 
51         // Extract the sign, exponent, and mantissa
52         uint32_t uf = *(uint32_t*)&val;
53         sign        = (uf & 0x80000000) >> 31;
54         exp         = (uf & 0x7F800000) >> 23;
55         mant        = uf & 0x007FFFFF;
56 
57         // Check for out of range
58         if (std::isnan(val))
59         {
60             exp  = 0x1F;
61             mant = 0x200;
62             sign = 1; // set the sign bit for NANs
63         }
64         else if (std::isinf(val))
65         {
66             exp  = 0x1f;
67             mant = 0x0;
68         }
69         else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
70         {
71             exp  = 0x1E;
72             mant = 0x3FF;
73         }
74         else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
75         {
76             mant |= 0x00800000;
77             for (; exp <= 0x70; mant >>= 1, exp++)
78                 ;
79             exp  = 0;
80             mant = mant >> 13;
81         }
82         else if (exp < 0x66) // Too small to represent -> Zero
83         {
84             exp  = 0;
85             mant = 0;
86         }
87         else
88         {
89             // Saves bits that will be shifted off for rounding
90             roundBits = mant & 0x1FFFu;
91             // convert exponent and mantissa to 16 bit format
92             exp  = exp - 0x70;
93             mant = mant >> 13;
94 
95             // Essentially RTZ, but round up if off by only 1 lsb
96             if (roundBits == 0x1FFFu)
97             {
98                 mant++;
99                 // check for overflow
100                 if ((mant & 0xC00u) != 0)
101                     exp++;
102                 // make sure only the needed bits are used
103                 mant &= 0x3FF;
104             }
105         }
106 
107         uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
108         return (uint16_t)tmpVal;
109     }
110 
C(bool i)111     Constant* Builder::C(bool i) { return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0)); }
112 
C(char i)113     Constant* Builder::C(char i) { return ConstantInt::get(IRB()->getInt8Ty(), i); }
114 
C(uint8_t i)115     Constant* Builder::C(uint8_t i) { return ConstantInt::get(IRB()->getInt8Ty(), i); }
116 
C(int i)117     Constant* Builder::C(int i) { return ConstantInt::get(IRB()->getInt32Ty(), i); }
118 
C(int64_t i)119     Constant* Builder::C(int64_t i) { return ConstantInt::get(IRB()->getInt64Ty(), i); }
120 
C(uint16_t i)121     Constant* Builder::C(uint16_t i) { return ConstantInt::get(mInt16Ty, i); }
122 
C(uint32_t i)123     Constant* Builder::C(uint32_t i) { return ConstantInt::get(IRB()->getInt32Ty(), i); }
124 
C(uint64_t i)125     Constant* Builder::C(uint64_t i) { return ConstantInt::get(IRB()->getInt64Ty(), i); }
126 
C(float i)127     Constant* Builder::C(float i) { return ConstantFP::get(IRB()->getFloatTy(), i); }
128 
PRED(bool pred)129     Constant* Builder::PRED(bool pred)
130     {
131         return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
132     }
133 
VIMMED1(uint64_t i)134     Value* Builder::VIMMED1(uint64_t i)
135     {
136 #if LLVM_VERSION_MAJOR <= 10
137         return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
138 #elif LLVM_VERSION_MAJOR == 11
139         return ConstantVector::getSplat(ElementCount(mVWidth, false), cast<ConstantInt>(C(i)));
140 #else
141         return ConstantVector::getSplat(ElementCount::get(mVWidth, false), cast<ConstantInt>(C(i)));
142 #endif
143     }
144 
VIMMED1_16(uint64_t i)145     Value* Builder::VIMMED1_16(uint64_t i)
146     {
147 #if LLVM_VERSION_MAJOR <= 10
148         return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
149 #elif LLVM_VERSION_MAJOR == 11
150         return ConstantVector::getSplat(ElementCount(mVWidth16, false), cast<ConstantInt>(C(i)));
151 #else
152         return ConstantVector::getSplat(ElementCount::get(mVWidth16, false), cast<ConstantInt>(C(i)));
153 #endif
154     }
155 
VIMMED1(int i)156     Value* Builder::VIMMED1(int i)
157     {
158 #if LLVM_VERSION_MAJOR <= 10
159         return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
160 #elif LLVM_VERSION_MAJOR == 11
161         return ConstantVector::getSplat(ElementCount(mVWidth, false), cast<ConstantInt>(C(i)));
162 #else
163         return ConstantVector::getSplat(ElementCount::get(mVWidth, false), cast<ConstantInt>(C(i)));
164 #endif
165     }
166 
VIMMED1_16(int i)167     Value* Builder::VIMMED1_16(int i)
168     {
169 #if LLVM_VERSION_MAJOR <= 10
170         return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
171 #elif LLVM_VERSION_MAJOR == 11
172         return ConstantVector::getSplat(ElementCount(mVWidth16, false), cast<ConstantInt>(C(i)));
173 #else
174         return ConstantVector::getSplat(ElementCount::get(mVWidth16, false), cast<ConstantInt>(C(i)));
175 #endif
176     }
177 
VIMMED1(uint32_t i)178     Value* Builder::VIMMED1(uint32_t i)
179     {
180 #if LLVM_VERSION_MAJOR <= 10
181         return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
182 #elif LLVM_VERSION_MAJOR == 11
183         return ConstantVector::getSplat(ElementCount(mVWidth, false), cast<ConstantInt>(C(i)));
184 #else
185         return ConstantVector::getSplat(ElementCount::get(mVWidth, false), cast<ConstantInt>(C(i)));
186 #endif
187     }
188 
VIMMED1_16(uint32_t i)189     Value* Builder::VIMMED1_16(uint32_t i)
190     {
191 #if LLVM_VERSION_MAJOR <= 10
192         return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
193 #elif LLVM_VERSION_MAJOR == 11
194         return ConstantVector::getSplat(ElementCount(mVWidth16, false), cast<ConstantInt>(C(i)));
195 #else
196         return ConstantVector::getSplat(ElementCount::get(mVWidth16, false), cast<ConstantInt>(C(i)));
197 #endif
198     }
199 
VIMMED1(float i)200     Value* Builder::VIMMED1(float i)
201     {
202 #if LLVM_VERSION_MAJOR <= 10
203         return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
204 #elif LLVM_VERSION_MAJOR == 11
205         return ConstantVector::getSplat(ElementCount(mVWidth, false), cast<ConstantFP>(C(i)));
206 #else
207         return ConstantVector::getSplat(ElementCount::get(mVWidth, false), cast<ConstantFP>(C(i)));
208 #endif
209     }
210 
VIMMED1_16(float i)211     Value* Builder::VIMMED1_16(float i)
212     {
213 #if LLVM_VERSION_MAJOR <= 10
214         return ConstantVector::getSplat(mVWidth16, cast<ConstantFP>(C(i)));
215 #elif LLVM_VERSION_MAJOR == 11
216         return ConstantVector::getSplat(ElementCount(mVWidth16, false), cast<ConstantFP>(C(i)));
217 #else
218         return ConstantVector::getSplat(ElementCount::get(mVWidth16, false), cast<ConstantFP>(C(i)));
219 #endif
220     }
221 
VIMMED1(bool i)222     Value* Builder::VIMMED1(bool i)
223     {
224 #if LLVM_VERSION_MAJOR <= 10
225         return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
226 #elif LLVM_VERSION_MAJOR == 11
227         return ConstantVector::getSplat(ElementCount(mVWidth, false), cast<ConstantInt>(C(i)));
228 #else
229         return ConstantVector::getSplat(ElementCount::get(mVWidth, false), cast<ConstantInt>(C(i)));
230 #endif
231     }
232 
VIMMED1_16(bool i)233     Value* Builder::VIMMED1_16(bool i)
234     {
235 #if LLVM_VERSION_MAJOR <= 10
236         return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
237 #elif LLVM_VERSION_MAJOR == 11
238         return ConstantVector::getSplat(ElementCount(mVWidth16, false), cast<ConstantInt>(C(i)));
239 #else
240         return ConstantVector::getSplat(ElementCount::get(mVWidth16, false), cast<ConstantInt>(C(i)));
241 #endif
242     }
243 
VUNDEF_IPTR()244     Value* Builder::VUNDEF_IPTR() { return UndefValue::get(getVectorType(mInt32PtrTy, mVWidth)); }
245 
VUNDEF(Type * t)246     Value* Builder::VUNDEF(Type* t) { return UndefValue::get(getVectorType(t, mVWidth)); }
247 
VUNDEF_I()248     Value* Builder::VUNDEF_I() { return UndefValue::get(getVectorType(mInt32Ty, mVWidth)); }
249 
VUNDEF_I_16()250     Value* Builder::VUNDEF_I_16() { return UndefValue::get(getVectorType(mInt32Ty, mVWidth16)); }
251 
VUNDEF_F()252     Value* Builder::VUNDEF_F() { return UndefValue::get(getVectorType(mFP32Ty, mVWidth)); }
253 
VUNDEF_F_16()254     Value* Builder::VUNDEF_F_16() { return UndefValue::get(getVectorType(mFP32Ty, mVWidth16)); }
255 
VUNDEF(Type * ty,uint32_t size)256     Value* Builder::VUNDEF(Type* ty, uint32_t size)
257     {
258         return UndefValue::get(getVectorType(ty, size));
259     }
260 
VBROADCAST(Value * src,const llvm::Twine & name)261     Value* Builder::VBROADCAST(Value* src, const llvm::Twine& name)
262     {
263         // check if src is already a vector
264         if (src->getType()->isVectorTy())
265         {
266             return src;
267         }
268 
269         return VECTOR_SPLAT(mVWidth, src, name);
270     }
271 
VBROADCAST_16(Value * src)272     Value* Builder::VBROADCAST_16(Value* src)
273     {
274         // check if src is already a vector
275         if (src->getType()->isVectorTy())
276         {
277             return src;
278         }
279 
280         return VECTOR_SPLAT(mVWidth16, src);
281     }
282 
IMMED(Value * v)283     uint32_t Builder::IMMED(Value* v)
284     {
285         SWR_ASSERT(isa<ConstantInt>(v));
286         ConstantInt* pValConst = cast<ConstantInt>(v);
287         return pValConst->getZExtValue();
288     }
289 
S_IMMED(Value * v)290     int32_t Builder::S_IMMED(Value* v)
291     {
292         SWR_ASSERT(isa<ConstantInt>(v));
293         ConstantInt* pValConst = cast<ConstantInt>(v);
294         return pValConst->getSExtValue();
295     }
296 
CALL(Value * Callee,const std::initializer_list<Value * > & argsList,const llvm::Twine & name)297     CallInst* Builder::CALL(Value*                               Callee,
298                             const std::initializer_list<Value*>& argsList,
299                             const llvm::Twine&                   name)
300     {
301         std::vector<Value*> args;
302         for (auto arg : argsList)
303             args.push_back(arg);
304 #if LLVM_VERSION_MAJOR >= 11
305         // see comment to CALLA(Callee) function in the header
306         return CALLA(FunctionCallee(cast<Function>(Callee)), args, name);
307 #else
308         return CALLA(Callee, args, name);
309 #endif
310     }
311 
CALL(Value * Callee,Value * arg)312     CallInst* Builder::CALL(Value* Callee, Value* arg)
313     {
314         std::vector<Value*> args;
315         args.push_back(arg);
316 #if LLVM_VERSION_MAJOR >= 11
317         // see comment to CALLA(Callee) function in the header
318         return CALLA(FunctionCallee(cast<Function>(Callee)), args);
319 #else
320         return CALLA(Callee, args);
321 #endif
322     }
323 
CALL2(Value * Callee,Value * arg1,Value * arg2)324     CallInst* Builder::CALL2(Value* Callee, Value* arg1, Value* arg2)
325     {
326         std::vector<Value*> args;
327         args.push_back(arg1);
328         args.push_back(arg2);
329 #if LLVM_VERSION_MAJOR >= 11
330         // see comment to CALLA(Callee) function in the header
331         return CALLA(FunctionCallee(cast<Function>(Callee)), args);
332 #else
333         return CALLA(Callee, args);
334 #endif
335     }
336 
CALL3(Value * Callee,Value * arg1,Value * arg2,Value * arg3)337     CallInst* Builder::CALL3(Value* Callee, Value* arg1, Value* arg2, Value* arg3)
338     {
339         std::vector<Value*> args;
340         args.push_back(arg1);
341         args.push_back(arg2);
342         args.push_back(arg3);
343 #if LLVM_VERSION_MAJOR >= 11
344         // see comment to CALLA(Callee) function in the header
345         return CALLA(FunctionCallee(cast<Function>(Callee)), args);
346 #else
347         return CALLA(Callee, args);
348 #endif
349     }
350 
VRCP(Value * va,const llvm::Twine & name)351     Value* Builder::VRCP(Value* va, const llvm::Twine& name)
352     {
353         return FDIV(VIMMED1(1.0f), va, name); // 1 / a
354     }
355 
VPLANEPS(Value * vA,Value * vB,Value * vC,Value * & vX,Value * & vY)356     Value* Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value*& vX, Value*& vY)
357     {
358         Value* vOut = FMADDPS(vA, vX, vC);
359         vOut        = FMADDPS(vB, vY, vOut);
360         return vOut;
361     }
362 
363     //////////////////////////////////////////////////////////////////////////
364     /// @brief insert a JIT call to CallPrint
365     /// - outputs formatted string to both stdout and VS output window
366     /// - DEBUG builds only
367     /// Usage example:
368     ///   PRINT("index %d = 0x%p\n",{C(lane), pIndex});
369     ///   where C(lane) creates a constant value to print, and pIndex is the Value*
370     ///   result from a GEP, printing out the pointer to memory
371     /// @param printStr - constant string to print, which includes format specifiers
372     /// @param printArgs - initializer list of Value*'s to print to std out
PRINT(const std::string & printStr,const std::initializer_list<Value * > & printArgs)373     CallInst* Builder::PRINT(const std::string&                   printStr,
374                              const std::initializer_list<Value*>& printArgs)
375     {
376         // push the arguments to CallPrint into a vector
377         std::vector<Value*> printCallArgs;
378         // save room for the format string.  we still need to modify it for vectors
379         printCallArgs.resize(1);
380 
381         // search through the format string for special processing
382         size_t      pos = 0;
383         std::string tempStr(printStr);
384         pos    = tempStr.find('%', pos);
385         auto v = printArgs.begin();
386 
387         while ((pos != std::string::npos) && (v != printArgs.end()))
388         {
389             Value* pArg  = *v;
390             Type*  pType = pArg->getType();
391 
392             if (pType->isVectorTy())
393             {
394                 Type* pContainedType = pType->getContainedType(0);
395 #if LLVM_VERSION_MAJOR >= 12
396                 FixedVectorType* pVectorType = cast<FixedVectorType>(pType);
397 #elif LLVM_VERSION_MAJOR >= 11
398                 VectorType* pVectorType = cast<VectorType>(pType);
399 #endif
400                 if (toupper(tempStr[pos + 1]) == 'X')
401                 {
402                     tempStr[pos]     = '0';
403                     tempStr[pos + 1] = 'x';
404                     tempStr.insert(pos + 2, "%08X ");
405                     pos += 7;
406 
407                     printCallArgs.push_back(VEXTRACT(pArg, C(0)));
408 
409                     std::string vectorFormatStr;
410 #if LLVM_VERSION_MAJOR >= 11
411                     for (uint32_t i = 1; i < pVectorType->getNumElements(); ++i)
412 #else
413                     for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
414 #endif
415                     {
416                         vectorFormatStr += "0x%08X ";
417                         printCallArgs.push_back(VEXTRACT(pArg, C(i)));
418                     }
419 
420                     tempStr.insert(pos, vectorFormatStr);
421                     pos += vectorFormatStr.size();
422                 }
423                 else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
424                 {
425                     uint32_t i = 0;
426 #if LLVM_VERSION_MAJOR >= 11
427                     for (; i < pVectorType->getNumElements() - 1; i++)
428 #else
429                     for (; i < pType->getVectorNumElements() - 1; i++)
430 #endif
431                     {
432                         tempStr.insert(pos, std::string("%f "));
433                         pos += 3;
434                         printCallArgs.push_back(
435                             FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
436                     }
437                     printCallArgs.push_back(
438                         FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
439                 }
440                 else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
441                 {
442                     uint32_t i = 0;
443 #if LLVM_VERSION_MAJOR >= 11
444                     for (; i < pVectorType->getNumElements() - 1; i++)
445 #else
446                     for (; i < pType->getVectorNumElements() - 1; i++)
447 #endif
448                     {
449                         tempStr.insert(pos, std::string("%d "));
450                         pos += 3;
451                         printCallArgs.push_back(
452                             S_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext)));
453                     }
454                     printCallArgs.push_back(
455                         S_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext)));
456                 }
457                 else if ((tempStr[pos + 1] == 'u') && (pContainedType->isIntegerTy()))
458                 {
459                     uint32_t i = 0;
460 #if LLVM_VERSION_MAJOR >= 11
461                     for (; i < pVectorType->getNumElements() - 1; i++)
462 #else
463                     for (; i < pType->getVectorNumElements() - 1; i++)
464 #endif
465                     {
466                         tempStr.insert(pos, std::string("%d "));
467                         pos += 3;
468                         printCallArgs.push_back(
469                             Z_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext)));
470                     }
471                     printCallArgs.push_back(
472                         Z_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext)));
473                 }
474             }
475             else
476             {
477                 if (toupper(tempStr[pos + 1]) == 'X')
478                 {
479                     tempStr[pos] = '0';
480                     tempStr.insert(pos + 1, "x%08");
481                     printCallArgs.push_back(pArg);
482                     pos += 3;
483                 }
484                 // for %f we need to cast float Values to doubles so that they print out correctly
485                 else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
486                 {
487                     printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
488                     pos++;
489                 }
490                 else
491                 {
492                     printCallArgs.push_back(pArg);
493                 }
494             }
495 
496             // advance to the next argument
497             v++;
498             pos = tempStr.find('%', ++pos);
499         }
500 
501         // create global variable constant string
502         Constant*       constString = ConstantDataArray::getString(JM()->mContext, tempStr, true);
503         GlobalVariable* gvPtr       = new GlobalVariable(
504             constString->getType(), true, GlobalValue::InternalLinkage, constString, "printStr");
505         JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
506 
507         // get a pointer to the first character in the constant string array
508         std::vector<Constant*> geplist{C(0), C(0)};
509         Constant* strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr, geplist, false);
510 
511         // insert the pointer to the format string in the argument vector
512         printCallArgs[0] = strGEP;
513 
514         // get pointer to CallPrint function and insert decl into the module if needed
515         std::vector<Type*> args;
516         args.push_back(PointerType::get(mInt8Ty, 0));
517         FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, true);
518         Function*     callPrintFn =
519 #if LLVM_VERSION_MAJOR >= 9
520             cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy).getCallee());
521 #else
522             cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
523 #endif
524 
525         // if we haven't yet added the symbol to the symbol table
526         if ((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
527         {
528             sys::DynamicLibrary::AddSymbol("CallPrint", (void*)&CallPrint);
529         }
530 
531         // insert a call to CallPrint
532         return CALLA(callPrintFn, printCallArgs);
533     }
534 
535     //////////////////////////////////////////////////////////////////////////
536     /// @brief Wrapper around PRINT with initializer list.
PRINT(const std::string & printStr)537     CallInst* Builder::PRINT(const std::string& printStr) { return PRINT(printStr, {}); }
538 
EXTRACT_16(Value * x,uint32_t imm)539     Value* Builder::EXTRACT_16(Value* x, uint32_t imm)
540     {
541         if (imm == 0)
542         {
543             return VSHUFFLE(x, UndefValue::get(x->getType()), {0, 1, 2, 3, 4, 5, 6, 7});
544         }
545         else
546         {
547             return VSHUFFLE(x, UndefValue::get(x->getType()), {8, 9, 10, 11, 12, 13, 14, 15});
548         }
549     }
550 
JOIN_16(Value * a,Value * b)551     Value* Builder::JOIN_16(Value* a, Value* b)
552     {
553         return VSHUFFLE(a, b, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
554     }
555 
556     //////////////////////////////////////////////////////////////////////////
557     /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
MASK(Value * vmask)558     Value* Builder::MASK(Value* vmask)
559     {
560         Value* src = BITCAST(vmask, mSimdInt32Ty);
561         return ICMP_SLT(src, VIMMED1(0));
562     }
563 
MASK_16(Value * vmask)564     Value* Builder::MASK_16(Value* vmask)
565     {
566         Value* src = BITCAST(vmask, mSimd16Int32Ty);
567         return ICMP_SLT(src, VIMMED1_16(0));
568     }
569 
570     //////////////////////////////////////////////////////////////////////////
571     /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
VMASK(Value * mask)572     Value* Builder::VMASK(Value* mask) { return S_EXT(mask, mSimdInt32Ty); }
573 
VMASK_16(Value * mask)574     Value* Builder::VMASK_16(Value* mask) { return S_EXT(mask, mSimd16Int32Ty); }
575 
576     /// @brief Convert <Nxi1> llvm mask to integer
VMOVMSK(Value * mask)577     Value* Builder::VMOVMSK(Value* mask)
578     {
579 #if LLVM_VERSION_MAJOR >= 11
580 #if LLVM_VERSION_MAJOR >= 12
581         FixedVectorType* pVectorType = cast<FixedVectorType>(mask->getType());
582 #else
583         VectorType* pVectorType = cast<VectorType>(mask->getType());
584 #endif
585         SWR_ASSERT(pVectorType->getElementType() == mInt1Ty);
586         uint32_t numLanes = pVectorType->getNumElements();
587 #else
588         SWR_ASSERT(mask->getType()->getVectorElementType() == mInt1Ty);
589         uint32_t numLanes = mask->getType()->getVectorNumElements();
590 #endif
591         Value*   i32Result;
592         if (numLanes == 8)
593         {
594             i32Result = BITCAST(mask, mInt8Ty);
595         }
596         else if (numLanes == 16)
597         {
598             i32Result = BITCAST(mask, mInt16Ty);
599         }
600         else
601         {
602             SWR_ASSERT("Unsupported vector width");
603             i32Result = BITCAST(mask, mInt8Ty);
604         }
605         return Z_EXT(i32Result, mInt32Ty);
606     }
607 
608     //////////////////////////////////////////////////////////////////////////
609     /// @brief Generate a VPSHUFB operation in LLVM IR.  If not
610     /// supported on the underlying platform, emulate it
611     /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
612     /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
613     /// Byte masks in lower 128 lane of b selects 8 bit values from lower
614     /// 128bits of a, and vice versa for the upper lanes.  If the mask
615     /// value is negative, '0' is inserted.
PSHUFB(Value * a,Value * b)616     Value* Builder::PSHUFB(Value* a, Value* b)
617     {
618         Value* res;
619         // use avx2 pshufb instruction if available
620         if (JM()->mArch.AVX2())
621         {
622             res = VPSHUFB(a, b);
623         }
624         else
625         {
626             Constant* cB = dyn_cast<Constant>(b);
627             assert(cB != nullptr);
628             // number of 8 bit elements in b
629 #if LLVM_VERSION_MAJOR >= 12
630             uint32_t numElms = cast<FixedVectorType>(cB->getType())->getNumElements();
631 #else
632             uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
633 #endif
634             // output vector
635             Value* vShuf = UndefValue::get(getVectorType(mInt8Ty, numElms));
636 
637             // insert an 8 bit value from the high and low lanes of a per loop iteration
638             numElms /= 2;
639             for (uint32_t i = 0; i < numElms; i++)
640             {
641                 ConstantInt* cLow128b  = cast<ConstantInt>(cB->getAggregateElement(i));
642                 ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
643 
644                 // extract values from constant mask
645                 char valLow128bLane  = (char)(cLow128b->getSExtValue());
646                 char valHigh128bLane = (char)(cHigh128b->getSExtValue());
647 
648                 Value* insertValLow128b;
649                 Value* insertValHigh128b;
650 
651                 // if the mask value is negative, insert a '0' in the respective output position
652                 // otherwise, lookup the value at mask position (bits 3..0 of the respective mask
653                 // byte) in a and insert in output vector
654                 insertValLow128b =
655                     (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
656                 insertValHigh128b = (valHigh128bLane < 0)
657                                         ? C((char)0)
658                                         : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
659 
660                 vShuf = VINSERT(vShuf, insertValLow128b, i);
661                 vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
662             }
663             res = vShuf;
664         }
665         return res;
666     }
667 
668     //////////////////////////////////////////////////////////////////////////
669     /// @brief Generate a VPSHUFB operation (sign extend 8 8bit values to 32
670     /// bits)in LLVM IR.  If not supported on the underlying platform, emulate it
671     /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values.  Only
672     /// lower 8 values are used.
PMOVSXBD(Value * a)673     Value* Builder::PMOVSXBD(Value* a)
674     {
675         // VPMOVSXBD output type
676         Type* v8x32Ty = getVectorType(mInt32Ty, 8);
677         // Extract 8 values from 128bit lane and sign extend
678         return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
679     }
680 
681     //////////////////////////////////////////////////////////////////////////
682     /// @brief Generate a VPSHUFB operation (sign extend 8 16bit values to 32
683     /// bits)in LLVM IR.  If not supported on the underlying platform, emulate it
684     /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
PMOVSXWD(Value * a)685     Value* Builder::PMOVSXWD(Value* a)
686     {
687         // VPMOVSXWD output type
688         Type* v8x32Ty = getVectorType(mInt32Ty, 8);
689         // Extract 8 values from 128bit lane and sign extend
690         return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
691     }
692 
693     //////////////////////////////////////////////////////////////////////////
694     /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
695     /// in LLVM IR.  If not supported on the underlying platform, emulate it
696     /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
CVTPH2PS(Value * a,const llvm::Twine & name)697     Value* Builder::CVTPH2PS(Value* a, const llvm::Twine& name)
698     {
699         // Bitcast Nxint16 to Nxhalf
700 #if LLVM_VERSION_MAJOR >= 12
701         uint32_t numElems = cast<FixedVectorType>(a->getType())->getNumElements();
702 #elif LLVM_VERSION_MAJOR >= 11
703         uint32_t numElems = cast<VectorType>(a->getType())->getNumElements();
704 #else
705         uint32_t numElems = a->getType()->getVectorNumElements();
706 #endif
707         Value*   input    = BITCAST(a, getVectorType(mFP16Ty, numElems));
708 
709         return FP_EXT(input, getVectorType(mFP32Ty, numElems), name);
710     }
711 
712     //////////////////////////////////////////////////////////////////////////
713     /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
714     /// in LLVM IR.  If not supported on the underlying platform, emulate it
715     /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
CVTPS2PH(Value * a,Value * rounding)716     Value* Builder::CVTPS2PH(Value* a, Value* rounding)
717     {
718         if (JM()->mArch.F16C())
719         {
720             return VCVTPS2PH(a, rounding);
721         }
722         else
723         {
724             // call scalar C function for now
725             FunctionType* pFuncTy   = FunctionType::get(mInt16Ty, mFP32Ty);
726             Function*     pCvtPs2Ph = cast<Function>(
727 #if LLVM_VERSION_MAJOR >= 9
728                 JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy).getCallee());
729 #else
730                 JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy));
731 #endif
732 
733             if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat32ToFloat16") == nullptr)
734             {
735                 sys::DynamicLibrary::AddSymbol("ConvertFloat32ToFloat16",
736                                                (void*)&ConvertFloat32ToFloat16);
737             }
738 
739             Value* pResult = UndefValue::get(mSimdInt16Ty);
740             for (uint32_t i = 0; i < mVWidth; ++i)
741             {
742                 Value* pSrc  = VEXTRACT(a, C(i));
743                 Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
744                 pResult      = VINSERT(pResult, pConv, C(i));
745             }
746 
747             return pResult;
748         }
749     }
750 
PMAXSD(Value * a,Value * b)751     Value* Builder::PMAXSD(Value* a, Value* b)
752     {
753         Value* cmp = ICMP_SGT(a, b);
754         return SELECT(cmp, a, b);
755     }
756 
PMINSD(Value * a,Value * b)757     Value* Builder::PMINSD(Value* a, Value* b)
758     {
759         Value* cmp = ICMP_SLT(a, b);
760         return SELECT(cmp, a, b);
761     }
762 
PMAXUD(Value * a,Value * b)763     Value* Builder::PMAXUD(Value* a, Value* b)
764     {
765         Value* cmp = ICMP_UGT(a, b);
766         return SELECT(cmp, a, b);
767     }
768 
PMINUD(Value * a,Value * b)769     Value* Builder::PMINUD(Value* a, Value* b)
770     {
771         Value* cmp = ICMP_ULT(a, b);
772         return SELECT(cmp, a, b);
773     }
774 
775     // Helper function to create alloca in entry block of function
CreateEntryAlloca(Function * pFunc,Type * pType)776     Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
777     {
778         auto saveIP = IRB()->saveIP();
779         IRB()->SetInsertPoint(&pFunc->getEntryBlock(), pFunc->getEntryBlock().begin());
780         Value* pAlloca = ALLOCA(pType);
781         if (saveIP.isSet())
782             IRB()->restoreIP(saveIP);
783         return pAlloca;
784     }
785 
CreateEntryAlloca(Function * pFunc,Type * pType,Value * pArraySize)786     Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize)
787     {
788         auto saveIP = IRB()->saveIP();
789         IRB()->SetInsertPoint(&pFunc->getEntryBlock(), pFunc->getEntryBlock().begin());
790         Value* pAlloca = ALLOCA(pType, pArraySize);
791         if (saveIP.isSet())
792             IRB()->restoreIP(saveIP);
793         return pAlloca;
794     }
795 
VABSPS(Value * a)796     Value* Builder::VABSPS(Value* a)
797     {
798         Value* asInt  = BITCAST(a, mSimdInt32Ty);
799         Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
800         return result;
801     }
802 
ICLAMP(Value * src,Value * low,Value * high,const llvm::Twine & name)803     Value* Builder::ICLAMP(Value* src, Value* low, Value* high, const llvm::Twine& name)
804     {
805         Value* lowCmp = ICMP_SLT(src, low);
806         Value* ret    = SELECT(lowCmp, low, src);
807 
808         Value* highCmp = ICMP_SGT(ret, high);
809         ret            = SELECT(highCmp, high, ret, name);
810 
811         return ret;
812     }
813 
FCLAMP(Value * src,Value * low,Value * high)814     Value* Builder::FCLAMP(Value* src, Value* low, Value* high)
815     {
816         Value* lowCmp = FCMP_OLT(src, low);
817         Value* ret    = SELECT(lowCmp, low, src);
818 
819         Value* highCmp = FCMP_OGT(ret, high);
820         ret            = SELECT(highCmp, high, ret);
821 
822         return ret;
823     }
824 
FCLAMP(Value * src,float low,float high)825     Value* Builder::FCLAMP(Value* src, float low, float high)
826     {
827         Value* result = VMAXPS(src, VIMMED1(low));
828         result        = VMINPS(result, VIMMED1(high));
829 
830         return result;
831     }
832 
FMADDPS(Value * a,Value * b,Value * c)833     Value* Builder::FMADDPS(Value* a, Value* b, Value* c)
834     {
835         Value* vOut;
836         // This maps to LLVM fmuladd intrinsic
837         vOut = VFMADDPS(a, b, c);
838         return vOut;
839     }
840 
841     //////////////////////////////////////////////////////////////////////////
842     /// @brief pop count on vector mask (e.g. <8 x i1>)
VPOPCNT(Value * a)843     Value* Builder::VPOPCNT(Value* a) { return POPCNT(VMOVMSK(a)); }
844 
845     //////////////////////////////////////////////////////////////////////////
846     /// @brief Float / Fixed-point conversions
847     //////////////////////////////////////////////////////////////////////////
VCVT_F32_FIXED_SI(Value * vFloat,uint32_t numIntBits,uint32_t numFracBits,const llvm::Twine & name)848     Value* Builder::VCVT_F32_FIXED_SI(Value*             vFloat,
849                                       uint32_t           numIntBits,
850                                       uint32_t           numFracBits,
851                                       const llvm::Twine& name)
852     {
853         SWR_ASSERT((numIntBits + numFracBits) <= 32, "Can only handle 32-bit fixed-point values");
854         Value* fixed = nullptr;
855 
856 #if 0   // This doesn't work for negative numbers!!
857         {
858             fixed = FP_TO_SI(VROUND(FMUL(vFloat, VIMMED1(float(1 << numFracBits))),
859                                     C(_MM_FROUND_TO_NEAREST_INT)),
860                              mSimdInt32Ty);
861         }
862         else
863 #endif
864         {
865             // Do round to nearest int on fractional bits first
866             // Not entirely perfect for negative numbers, but close enough
867             vFloat = VROUND(FMUL(vFloat, VIMMED1(float(1 << numFracBits))),
868                             C(_MM_FROUND_TO_NEAREST_INT));
869             vFloat = FMUL(vFloat, VIMMED1(1.0f / float(1 << numFracBits)));
870 
871             // TODO: Handle INF, NAN, overflow / underflow, etc.
872 
873             Value* vSgn      = FCMP_OLT(vFloat, VIMMED1(0.0f));
874             Value* vFloatInt = BITCAST(vFloat, mSimdInt32Ty);
875             Value* vFixed    = AND(vFloatInt, VIMMED1((1 << 23) - 1));
876             vFixed           = OR(vFixed, VIMMED1(1 << 23));
877             vFixed           = SELECT(vSgn, NEG(vFixed), vFixed);
878 
879             Value* vExp = LSHR(SHL(vFloatInt, VIMMED1(1)), VIMMED1(24));
880             vExp        = SUB(vExp, VIMMED1(127));
881 
882             Value* vExtraBits = SUB(VIMMED1(23 - numFracBits), vExp);
883 
884             fixed = ASHR(vFixed, vExtraBits, name);
885         }
886 
887         return fixed;
888     }
889 
VCVT_FIXED_SI_F32(Value * vFixed,uint32_t numIntBits,uint32_t numFracBits,const llvm::Twine & name)890     Value* Builder::VCVT_FIXED_SI_F32(Value*             vFixed,
891                                       uint32_t           numIntBits,
892                                       uint32_t           numFracBits,
893                                       const llvm::Twine& name)
894     {
895         SWR_ASSERT((numIntBits + numFracBits) <= 32, "Can only handle 32-bit fixed-point values");
896         uint32_t extraBits = 32 - numIntBits - numFracBits;
897         if (numIntBits && extraBits)
898         {
899             // Sign extend
900             Value* shftAmt = VIMMED1(extraBits);
901             vFixed         = ASHR(SHL(vFixed, shftAmt), shftAmt);
902         }
903 
904         Value* fVal  = VIMMED1(0.0f);
905         Value* fFrac = VIMMED1(0.0f);
906         if (numIntBits)
907         {
908             fVal = SI_TO_FP(ASHR(vFixed, VIMMED1(numFracBits)), mSimdFP32Ty, name);
909         }
910 
911         if (numFracBits)
912         {
913             fFrac = UI_TO_FP(AND(vFixed, VIMMED1((1 << numFracBits) - 1)), mSimdFP32Ty);
914             fFrac = FDIV(fFrac, VIMMED1(float(1 << numFracBits)), name);
915         }
916 
917         return FADD(fVal, fFrac, name);
918     }
919 
VCVT_F32_FIXED_UI(Value * vFloat,uint32_t numIntBits,uint32_t numFracBits,const llvm::Twine & name)920     Value* Builder::VCVT_F32_FIXED_UI(Value*             vFloat,
921                                       uint32_t           numIntBits,
922                                       uint32_t           numFracBits,
923                                       const llvm::Twine& name)
924     {
925         SWR_ASSERT((numIntBits + numFracBits) <= 32, "Can only handle 32-bit fixed-point values");
926         Value* fixed = nullptr;
927 #if 1   // KNOB_SIM_FAST_MATH?  Below works correctly from a precision
928         // standpoint...
929         {
930             fixed = FP_TO_UI(VROUND(FMUL(vFloat, VIMMED1(float(1 << numFracBits))),
931                                     C(_MM_FROUND_TO_NEAREST_INT)),
932                              mSimdInt32Ty);
933         }
934 #else
935         {
936             // Do round to nearest int on fractional bits first
937             vFloat = VROUND(FMUL(vFloat, VIMMED1(float(1 << numFracBits))),
938                             C(_MM_FROUND_TO_NEAREST_INT));
939             vFloat = FMUL(vFloat, VIMMED1(1.0f / float(1 << numFracBits)));
940 
941             // TODO: Handle INF, NAN, overflow / underflow, etc.
942 
943             Value* vSgn      = FCMP_OLT(vFloat, VIMMED1(0.0f));
944             Value* vFloatInt = BITCAST(vFloat, mSimdInt32Ty);
945             Value* vFixed    = AND(vFloatInt, VIMMED1((1 << 23) - 1));
946             vFixed           = OR(vFixed, VIMMED1(1 << 23));
947 
948             Value* vExp = LSHR(SHL(vFloatInt, VIMMED1(1)), VIMMED1(24));
949             vExp        = SUB(vExp, VIMMED1(127));
950 
951             Value* vExtraBits = SUB(VIMMED1(23 - numFracBits), vExp);
952 
953             fixed = LSHR(vFixed, vExtraBits, name);
954         }
955 #endif
956         return fixed;
957     }
958 
VCVT_FIXED_UI_F32(Value * vFixed,uint32_t numIntBits,uint32_t numFracBits,const llvm::Twine & name)959     Value* Builder::VCVT_FIXED_UI_F32(Value*             vFixed,
960                                       uint32_t           numIntBits,
961                                       uint32_t           numFracBits,
962                                       const llvm::Twine& name)
963     {
964         SWR_ASSERT((numIntBits + numFracBits) <= 32, "Can only handle 32-bit fixed-point values");
965         uint32_t extraBits = 32 - numIntBits - numFracBits;
966         if (numIntBits && extraBits)
967         {
968             // Sign extend
969             Value* shftAmt = VIMMED1(extraBits);
970             vFixed         = ASHR(SHL(vFixed, shftAmt), shftAmt);
971         }
972 
973         Value* fVal  = VIMMED1(0.0f);
974         Value* fFrac = VIMMED1(0.0f);
975         if (numIntBits)
976         {
977             fVal = UI_TO_FP(LSHR(vFixed, VIMMED1(numFracBits)), mSimdFP32Ty, name);
978         }
979 
980         if (numFracBits)
981         {
982             fFrac = UI_TO_FP(AND(vFixed, VIMMED1((1 << numFracBits) - 1)), mSimdFP32Ty);
983             fFrac = FDIV(fFrac, VIMMED1(float(1 << numFracBits)), name);
984         }
985 
986         return FADD(fVal, fFrac, name);
987     }
988 
989     //////////////////////////////////////////////////////////////////////////
990     /// @brief C functions called by LLVM IR
991     //////////////////////////////////////////////////////////////////////////
992 
VEXTRACTI128(Value * a,Constant * imm8)993     Value* Builder::VEXTRACTI128(Value* a, Constant* imm8)
994     {
995         bool                      flag = !imm8->isZeroValue();
996         SmallVector<Constant*, 8> idx;
997         for (unsigned i = 0; i < mVWidth / 2; i++)
998         {
999             idx.push_back(C(flag ? i + mVWidth / 2 : i));
1000         }
1001         return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
1002     }
1003 
VINSERTI128(Value * a,Value * b,Constant * imm8)1004     Value* Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
1005     {
1006         bool                      flag = !imm8->isZeroValue();
1007         SmallVector<Constant*, 8> idx;
1008         for (unsigned i = 0; i < mVWidth; i++)
1009         {
1010             idx.push_back(C(i));
1011         }
1012         Value* inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
1013 
1014         SmallVector<Constant*, 8> idx2;
1015         for (unsigned i = 0; i < mVWidth / 2; i++)
1016         {
1017             idx2.push_back(C(flag ? i : i + mVWidth));
1018         }
1019         for (unsigned i = mVWidth / 2; i < mVWidth; i++)
1020         {
1021             idx2.push_back(C(flag ? i + mVWidth / 2 : i));
1022         }
1023         return VSHUFFLE(a, inter, ConstantVector::get(idx2));
1024     }
1025 
1026     // rdtsc buckets macros
RDTSC_START(Value * pBucketMgr,Value * pId)1027     void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
1028     {
1029         // @todo due to an issue with thread local storage propagation in llvm, we can only safely
1030         // call into buckets framework when single threaded
1031         if (KNOB_SINGLE_THREADED)
1032         {
1033             std::vector<Type*> args{
1034                 PointerType::get(mInt32Ty, 0), // pBucketMgr
1035                 mInt32Ty                       // id
1036             };
1037 
1038             FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1039             Function*     pFunc   = cast<Function>(
1040 #if LLVM_VERSION_MAJOR >= 9
1041                 JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy).getCallee());
1042 #else
1043                 JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
1044 #endif
1045             if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") ==
1046                 nullptr)
1047             {
1048                 sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket",
1049                                                (void*)&BucketManager_StartBucket);
1050             }
1051 
1052             CALL(pFunc, {pBucketMgr, pId});
1053         }
1054     }
1055 
RDTSC_STOP(Value * pBucketMgr,Value * pId)1056     void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
1057     {
1058         // @todo due to an issue with thread local storage propagation in llvm, we can only safely
1059         // call into buckets framework when single threaded
1060         if (KNOB_SINGLE_THREADED)
1061         {
1062             std::vector<Type*> args{
1063                 PointerType::get(mInt32Ty, 0), // pBucketMgr
1064                 mInt32Ty                       // id
1065             };
1066 
1067             FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1068             Function*     pFunc   = cast<Function>(
1069 #if LLVM_VERSION_MAJOR >= 9
1070                 JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy).getCallee());
1071 #else
1072                 JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
1073 #endif
1074             if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") ==
1075                 nullptr)
1076             {
1077                 sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket",
1078                                                (void*)&BucketManager_StopBucket);
1079             }
1080 
1081             CALL(pFunc, {pBucketMgr, pId});
1082         }
1083     }
1084 
GetTypeSize(Type * pType)1085     uint32_t Builder::GetTypeSize(Type* pType)
1086     {
1087         if (pType->isStructTy())
1088         {
1089             uint32_t numElems = pType->getStructNumElements();
1090             Type*    pElemTy  = pType->getStructElementType(0);
1091             return numElems * GetTypeSize(pElemTy);
1092         }
1093 
1094         if (pType->isArrayTy())
1095         {
1096             uint32_t numElems = pType->getArrayNumElements();
1097             Type*    pElemTy  = pType->getArrayElementType();
1098             return numElems * GetTypeSize(pElemTy);
1099         }
1100 
1101         if (pType->isIntegerTy())
1102         {
1103             uint32_t bitSize = pType->getIntegerBitWidth();
1104             return bitSize / 8;
1105         }
1106 
1107         if (pType->isFloatTy())
1108         {
1109             return 4;
1110         }
1111 
1112         if (pType->isHalfTy())
1113         {
1114             return 2;
1115         }
1116 
1117         if (pType->isDoubleTy())
1118         {
1119             return 8;
1120         }
1121 
1122         SWR_ASSERT(false, "Unimplemented type.");
1123         return 0;
1124     }
1125 } // namespace SwrJit
1126