1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "LLVMReactor.hpp"
16
17 #include "CPUID.hpp"
18 #include "Debug.hpp"
19 #include "EmulatedReactor.hpp"
20 #include "LLVMReactorDebugInfo.hpp"
21 #include "Print.hpp"
22 #include "Reactor.hpp"
23 #include "x86.hpp"
24
25 #include "llvm/IR/Intrinsics.h"
26 #if LLVM_VERSION_MAJOR >= 9
27 # include "llvm/IR/IntrinsicsX86.h"
28 #endif
29 #include "llvm/IR/LegacyPassManager.h"
30 #include "llvm/IR/Verifier.h"
31 #include "llvm/Transforms/Coroutines.h"
32 #include "llvm/Transforms/IPO.h"
33 #include "llvm/Transforms/Scalar.h"
34
// Wraps a comma-separated argument list in braces so it can be handed to a
// builder call as a braced initializer list.
#define ARGS(...)   \
	{               \
		__VA_ARGS__ \
	}
// Both names simply expand to CreateCall.
#define CreateCall2 CreateCall
#define CreateCall3 CreateCall
41
42 #include <fstream>
43 #include <iostream>
44 #include <mutex>
45 #include <numeric>
46 #include <thread>
47 #include <unordered_map>
48
49 #if defined(__i386__) || defined(__x86_64__)
50 # include <xmmintrin.h>
51 #endif
52
53 #include <math.h>
54
55 #if defined(__x86_64__) && defined(_WIN32)
X86CompilationCallback()56 extern "C" void X86CompilationCallback()
57 {
58 UNIMPLEMENTED_NO_BUG("X86CompilationCallback");
59 }
60 #endif
61
62 namespace {
63
64 std::unique_ptr<rr::JITBuilder> jit;
65 std::mutex codegenMutex;
66
67 // Default configuration settings. Must be accessed under mutex lock.
68 std::mutex defaultConfigLock;
defaultConfig()69 rr::Config &defaultConfig()
70 {
71 // This uses a static in a function to avoid the cost of a global static
72 // initializer. See http://neugierig.org/software/chromium/notes/2011/08/static-initializers.html
73 static rr::Config config = rr::Config::Edit()
74 .add(rr::Optimization::Pass::ScalarReplAggregates)
75 .add(rr::Optimization::Pass::InstructionCombining)
76 .apply({});
77 return config;
78 }
79
lowerPAVG(llvm::Value * x,llvm::Value * y)80 llvm::Value *lowerPAVG(llvm::Value *x, llvm::Value *y)
81 {
82 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
83
84 llvm::VectorType *extTy =
85 llvm::VectorType::getExtendedElementVectorType(ty);
86 x = jit->builder->CreateZExt(x, extTy);
87 y = jit->builder->CreateZExt(y, extTy);
88
89 // (x + y + 1) >> 1
90 llvm::Constant *one = llvm::ConstantInt::get(extTy, 1);
91 llvm::Value *res = jit->builder->CreateAdd(x, y);
92 res = jit->builder->CreateAdd(res, one);
93 res = jit->builder->CreateLShr(res, one);
94 return jit->builder->CreateTrunc(res, ty);
95 }
96
lowerPMINMAX(llvm::Value * x,llvm::Value * y,llvm::ICmpInst::Predicate pred)97 llvm::Value *lowerPMINMAX(llvm::Value *x, llvm::Value *y,
98 llvm::ICmpInst::Predicate pred)
99 {
100 return jit->builder->CreateSelect(jit->builder->CreateICmp(pred, x, y), x, y);
101 }
102
lowerPCMP(llvm::ICmpInst::Predicate pred,llvm::Value * x,llvm::Value * y,llvm::Type * dstTy)103 llvm::Value *lowerPCMP(llvm::ICmpInst::Predicate pred, llvm::Value *x,
104 llvm::Value *y, llvm::Type *dstTy)
105 {
106 return jit->builder->CreateSExt(jit->builder->CreateICmp(pred, x, y), dstTy, "");
107 }
108
109 #if defined(__i386__) || defined(__x86_64__)
lowerPMOV(llvm::Value * op,llvm::Type * dstType,bool sext)110 llvm::Value *lowerPMOV(llvm::Value *op, llvm::Type *dstType, bool sext)
111 {
112 llvm::VectorType *srcTy = llvm::cast<llvm::VectorType>(op->getType());
113 llvm::VectorType *dstTy = llvm::cast<llvm::VectorType>(dstType);
114
115 llvm::Value *undef = llvm::UndefValue::get(srcTy);
116 llvm::SmallVector<uint32_t, 16> mask(dstTy->getNumElements());
117 std::iota(mask.begin(), mask.end(), 0);
118 llvm::Value *v = jit->builder->CreateShuffleVector(op, undef, mask);
119
120 return sext ? jit->builder->CreateSExt(v, dstTy)
121 : jit->builder->CreateZExt(v, dstTy);
122 }
123
lowerPABS(llvm::Value * v)124 llvm::Value *lowerPABS(llvm::Value *v)
125 {
126 llvm::Value *zero = llvm::Constant::getNullValue(v->getType());
127 llvm::Value *cmp = jit->builder->CreateICmp(llvm::ICmpInst::ICMP_SGT, v, zero);
128 llvm::Value *neg = jit->builder->CreateNeg(v);
129 return jit->builder->CreateSelect(cmp, v, neg);
130 }
131 #endif // defined(__i386__) || defined(__x86_64__)
132
133 #if !defined(__i386__) && !defined(__x86_64__)
lowerPFMINMAX(llvm::Value * x,llvm::Value * y,llvm::FCmpInst::Predicate pred)134 llvm::Value *lowerPFMINMAX(llvm::Value *x, llvm::Value *y,
135 llvm::FCmpInst::Predicate pred)
136 {
137 return jit->builder->CreateSelect(jit->builder->CreateFCmp(pred, x, y), x, y);
138 }
139
lowerRound(llvm::Value * x)140 llvm::Value *lowerRound(llvm::Value *x)
141 {
142 llvm::Function *nearbyint = llvm::Intrinsic::getDeclaration(
143 jit->module.get(), llvm::Intrinsic::nearbyint, { x->getType() });
144 return jit->builder->CreateCall(nearbyint, ARGS(x));
145 }
146
lowerRoundInt(llvm::Value * x,llvm::Type * ty)147 llvm::Value *lowerRoundInt(llvm::Value *x, llvm::Type *ty)
148 {
149 return jit->builder->CreateFPToSI(lowerRound(x), ty);
150 }
151
lowerFloor(llvm::Value * x)152 llvm::Value *lowerFloor(llvm::Value *x)
153 {
154 llvm::Function *floor = llvm::Intrinsic::getDeclaration(
155 jit->module.get(), llvm::Intrinsic::floor, { x->getType() });
156 return jit->builder->CreateCall(floor, ARGS(x));
157 }
158
lowerTrunc(llvm::Value * x)159 llvm::Value *lowerTrunc(llvm::Value *x)
160 {
161 llvm::Function *trunc = llvm::Intrinsic::getDeclaration(
162 jit->module.get(), llvm::Intrinsic::trunc, { x->getType() });
163 return jit->builder->CreateCall(trunc, ARGS(x));
164 }
165
166 // Packed add/sub with saturation
lowerPSAT(llvm::Value * x,llvm::Value * y,bool isAdd,bool isSigned)167 llvm::Value *lowerPSAT(llvm::Value *x, llvm::Value *y, bool isAdd, bool isSigned)
168 {
169 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
170 llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
171
172 unsigned numBits = ty->getScalarSizeInBits();
173
174 llvm::Value *max, *min, *extX, *extY;
175 if(isSigned)
176 {
177 max = llvm::ConstantInt::get(extTy, (1LL << (numBits - 1)) - 1, true);
178 min = llvm::ConstantInt::get(extTy, (-1LL << (numBits - 1)), true);
179 extX = jit->builder->CreateSExt(x, extTy);
180 extY = jit->builder->CreateSExt(y, extTy);
181 }
182 else
183 {
184 ASSERT_MSG(numBits <= 64, "numBits: %d", int(numBits));
185 uint64_t maxVal = (numBits == 64) ? ~0ULL : (1ULL << numBits) - 1;
186 max = llvm::ConstantInt::get(extTy, maxVal, false);
187 min = llvm::ConstantInt::get(extTy, 0, false);
188 extX = jit->builder->CreateZExt(x, extTy);
189 extY = jit->builder->CreateZExt(y, extTy);
190 }
191
192 llvm::Value *res = isAdd ? jit->builder->CreateAdd(extX, extY)
193 : jit->builder->CreateSub(extX, extY);
194
195 res = lowerPMINMAX(res, min, llvm::ICmpInst::ICMP_SGT);
196 res = lowerPMINMAX(res, max, llvm::ICmpInst::ICMP_SLT);
197
198 return jit->builder->CreateTrunc(res, ty);
199 }
200
lowerSQRT(llvm::Value * x)201 llvm::Value *lowerSQRT(llvm::Value *x)
202 {
203 llvm::Function *sqrt = llvm::Intrinsic::getDeclaration(
204 jit->module.get(), llvm::Intrinsic::sqrt, { x->getType() });
205 return jit->builder->CreateCall(sqrt, ARGS(x));
206 }
207
lowerRCP(llvm::Value * x)208 llvm::Value *lowerRCP(llvm::Value *x)
209 {
210 llvm::Type *ty = x->getType();
211 llvm::Constant *one;
212 if(llvm::VectorType *vectorTy = llvm::dyn_cast<llvm::VectorType>(ty))
213 {
214 one = llvm::ConstantVector::getSplat(
215 vectorTy->getNumElements(),
216 llvm::ConstantFP::get(vectorTy->getElementType(), 1));
217 }
218 else
219 {
220 one = llvm::ConstantFP::get(ty, 1);
221 }
222 return jit->builder->CreateFDiv(one, x);
223 }
224
lowerRSQRT(llvm::Value * x)225 llvm::Value *lowerRSQRT(llvm::Value *x)
226 {
227 return lowerRCP(lowerSQRT(x));
228 }
229
lowerVectorShl(llvm::Value * x,uint64_t scalarY)230 llvm::Value *lowerVectorShl(llvm::Value *x, uint64_t scalarY)
231 {
232 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
233 llvm::Value *y = llvm::ConstantVector::getSplat(
234 ty->getNumElements(),
235 llvm::ConstantInt::get(ty->getElementType(), scalarY));
236 return jit->builder->CreateShl(x, y);
237 }
238
lowerVectorAShr(llvm::Value * x,uint64_t scalarY)239 llvm::Value *lowerVectorAShr(llvm::Value *x, uint64_t scalarY)
240 {
241 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
242 llvm::Value *y = llvm::ConstantVector::getSplat(
243 ty->getNumElements(),
244 llvm::ConstantInt::get(ty->getElementType(), scalarY));
245 return jit->builder->CreateAShr(x, y);
246 }
247
lowerVectorLShr(llvm::Value * x,uint64_t scalarY)248 llvm::Value *lowerVectorLShr(llvm::Value *x, uint64_t scalarY)
249 {
250 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
251 llvm::Value *y = llvm::ConstantVector::getSplat(
252 ty->getNumElements(),
253 llvm::ConstantInt::get(ty->getElementType(), scalarY));
254 return jit->builder->CreateLShr(x, y);
255 }
256
lowerMulAdd(llvm::Value * x,llvm::Value * y)257 llvm::Value *lowerMulAdd(llvm::Value *x, llvm::Value *y)
258 {
259 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
260 llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
261
262 llvm::Value *extX = jit->builder->CreateSExt(x, extTy);
263 llvm::Value *extY = jit->builder->CreateSExt(y, extTy);
264 llvm::Value *mult = jit->builder->CreateMul(extX, extY);
265
266 llvm::Value *undef = llvm::UndefValue::get(extTy);
267
268 llvm::SmallVector<uint32_t, 16> evenIdx;
269 llvm::SmallVector<uint32_t, 16> oddIdx;
270 for(uint64_t i = 0, n = ty->getNumElements(); i < n; i += 2)
271 {
272 evenIdx.push_back(i);
273 oddIdx.push_back(i + 1);
274 }
275
276 llvm::Value *lhs = jit->builder->CreateShuffleVector(mult, undef, evenIdx);
277 llvm::Value *rhs = jit->builder->CreateShuffleVector(mult, undef, oddIdx);
278 return jit->builder->CreateAdd(lhs, rhs);
279 }
280
lowerPack(llvm::Value * x,llvm::Value * y,bool isSigned)281 llvm::Value *lowerPack(llvm::Value *x, llvm::Value *y, bool isSigned)
282 {
283 llvm::VectorType *srcTy = llvm::cast<llvm::VectorType>(x->getType());
284 llvm::VectorType *dstTy = llvm::VectorType::getTruncatedElementVectorType(srcTy);
285
286 llvm::IntegerType *dstElemTy =
287 llvm::cast<llvm::IntegerType>(dstTy->getElementType());
288
289 uint64_t truncNumBits = dstElemTy->getIntegerBitWidth();
290 ASSERT_MSG(truncNumBits < 64, "shift 64 must be handled separately. truncNumBits: %d", int(truncNumBits));
291 llvm::Constant *max, *min;
292 if(isSigned)
293 {
294 max = llvm::ConstantInt::get(srcTy, (1LL << (truncNumBits - 1)) - 1, true);
295 min = llvm::ConstantInt::get(srcTy, (-1LL << (truncNumBits - 1)), true);
296 }
297 else
298 {
299 max = llvm::ConstantInt::get(srcTy, (1ULL << truncNumBits) - 1, false);
300 min = llvm::ConstantInt::get(srcTy, 0, false);
301 }
302
303 x = lowerPMINMAX(x, min, llvm::ICmpInst::ICMP_SGT);
304 x = lowerPMINMAX(x, max, llvm::ICmpInst::ICMP_SLT);
305 y = lowerPMINMAX(y, min, llvm::ICmpInst::ICMP_SGT);
306 y = lowerPMINMAX(y, max, llvm::ICmpInst::ICMP_SLT);
307
308 x = jit->builder->CreateTrunc(x, dstTy);
309 y = jit->builder->CreateTrunc(y, dstTy);
310
311 llvm::SmallVector<uint32_t, 16> index(srcTy->getNumElements() * 2);
312 std::iota(index.begin(), index.end(), 0);
313
314 return jit->builder->CreateShuffleVector(x, y, index);
315 }
316
lowerSignMask(llvm::Value * x,llvm::Type * retTy)317 llvm::Value *lowerSignMask(llvm::Value *x, llvm::Type *retTy)
318 {
319 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
320 llvm::Constant *zero = llvm::ConstantInt::get(ty, 0);
321 llvm::Value *cmp = jit->builder->CreateICmpSLT(x, zero);
322
323 llvm::Value *ret = jit->builder->CreateZExt(
324 jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
325 for(uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
326 {
327 llvm::Value *elem = jit->builder->CreateZExt(
328 jit->builder->CreateExtractElement(cmp, i), retTy);
329 ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
330 }
331 return ret;
332 }
333
lowerFPSignMask(llvm::Value * x,llvm::Type * retTy)334 llvm::Value *lowerFPSignMask(llvm::Value *x, llvm::Type *retTy)
335 {
336 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
337 llvm::Constant *zero = llvm::ConstantFP::get(ty, 0);
338 llvm::Value *cmp = jit->builder->CreateFCmpULT(x, zero);
339
340 llvm::Value *ret = jit->builder->CreateZExt(
341 jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
342 for(uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
343 {
344 llvm::Value *elem = jit->builder->CreateZExt(
345 jit->builder->CreateExtractElement(cmp, i), retTy);
346 ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
347 }
348 return ret;
349 }
350 #endif // !defined(__i386__) && !defined(__x86_64__)
351
352 #if(LLVM_VERSION_MAJOR >= 8) || (!defined(__i386__) && !defined(__x86_64__))
lowerPUADDSAT(llvm::Value * x,llvm::Value * y)353 llvm::Value *lowerPUADDSAT(llvm::Value *x, llvm::Value *y)
354 {
355 # if LLVM_VERSION_MAJOR >= 8
356 return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::uadd_sat, x, y);
357 # else
358 return lowerPSAT(x, y, true, false);
359 # endif
360 }
361
lowerPSADDSAT(llvm::Value * x,llvm::Value * y)362 llvm::Value *lowerPSADDSAT(llvm::Value *x, llvm::Value *y)
363 {
364 # if LLVM_VERSION_MAJOR >= 8
365 return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::sadd_sat, x, y);
366 # else
367 return lowerPSAT(x, y, true, true);
368 # endif
369 }
370
lowerPUSUBSAT(llvm::Value * x,llvm::Value * y)371 llvm::Value *lowerPUSUBSAT(llvm::Value *x, llvm::Value *y)
372 {
373 # if LLVM_VERSION_MAJOR >= 8
374 return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::usub_sat, x, y);
375 # else
376 return lowerPSAT(x, y, false, false);
377 # endif
378 }
379
lowerPSSUBSAT(llvm::Value * x,llvm::Value * y)380 llvm::Value *lowerPSSUBSAT(llvm::Value *x, llvm::Value *y)
381 {
382 # if LLVM_VERSION_MAJOR >= 8
383 return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::ssub_sat, x, y);
384 # else
385 return lowerPSAT(x, y, false, true);
386 # endif
387 }
388 #endif // (LLVM_VERSION_MAJOR >= 8) || (!defined(__i386__) && !defined(__x86_64__))
389
lowerMulHigh(llvm::Value * x,llvm::Value * y,bool sext)390 llvm::Value *lowerMulHigh(llvm::Value *x, llvm::Value *y, bool sext)
391 {
392 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
393 llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
394
395 llvm::Value *extX, *extY;
396 if(sext)
397 {
398 extX = jit->builder->CreateSExt(x, extTy);
399 extY = jit->builder->CreateSExt(y, extTy);
400 }
401 else
402 {
403 extX = jit->builder->CreateZExt(x, extTy);
404 extY = jit->builder->CreateZExt(y, extTy);
405 }
406
407 llvm::Value *mult = jit->builder->CreateMul(extX, extY);
408
409 llvm::IntegerType *intTy = llvm::cast<llvm::IntegerType>(ty->getElementType());
410 llvm::Value *mulh = jit->builder->CreateAShr(mult, intTy->getBitWidth());
411 return jit->builder->CreateTrunc(mulh, ty);
412 }
413
createGather(llvm::Value * base,llvm::Type * elTy,llvm::Value * offsets,llvm::Value * mask,unsigned int alignment,bool zeroMaskedLanes)414 llvm::Value *createGather(llvm::Value *base, llvm::Type *elTy, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment, bool zeroMaskedLanes)
415 {
416 ASSERT(base->getType()->isPointerTy());
417 ASSERT(offsets->getType()->isVectorTy());
418 ASSERT(mask->getType()->isVectorTy());
419
420 auto numEls = mask->getType()->getVectorNumElements();
421 auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
422 auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
423 auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
424 auto i8PtrTy = i8Ty->getPointerTo();
425 auto elPtrTy = elTy->getPointerTo();
426 auto elVecTy = ::llvm::VectorType::get(elTy, numEls);
427 auto elPtrVecTy = ::llvm::VectorType::get(elPtrTy, numEls);
428 auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy);
429 auto i8Ptrs = jit->builder->CreateGEP(i8Base, offsets);
430 auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
431 auto i8Mask = jit->builder->CreateIntCast(mask, ::llvm::VectorType::get(i1Ty, numEls), false); // vec<int, int, ...> -> vec<bool, bool, ...>
432 auto passthrough = zeroMaskedLanes ? ::llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
433 auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
434 auto func = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_gather, { elVecTy, elPtrVecTy });
435 return jit->builder->CreateCall(func, { elPtrs, align, i8Mask, passthrough });
436 }
437
createScatter(llvm::Value * base,llvm::Value * val,llvm::Value * offsets,llvm::Value * mask,unsigned int alignment)438 void createScatter(llvm::Value *base, llvm::Value *val, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment)
439 {
440 ASSERT(base->getType()->isPointerTy());
441 ASSERT(val->getType()->isVectorTy());
442 ASSERT(offsets->getType()->isVectorTy());
443 ASSERT(mask->getType()->isVectorTy());
444
445 auto numEls = mask->getType()->getVectorNumElements();
446 auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
447 auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
448 auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
449 auto i8PtrTy = i8Ty->getPointerTo();
450 auto elVecTy = val->getType();
451 auto elTy = elVecTy->getVectorElementType();
452 auto elPtrTy = elTy->getPointerTo();
453 auto elPtrVecTy = ::llvm::VectorType::get(elPtrTy, numEls);
454 auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy);
455 auto i8Ptrs = jit->builder->CreateGEP(i8Base, offsets);
456 auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
457 auto i8Mask = jit->builder->CreateIntCast(mask, ::llvm::VectorType::get(i1Ty, numEls), false); // vec<int, int, ...> -> vec<bool, bool, ...>
458 auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
459 auto func = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_scatter, { elVecTy, elPtrVecTy });
460 jit->builder->CreateCall(func, { val, elPtrs, align, i8Mask });
461 }
462 } // namespace
463
464 namespace rr {
465
BackendName()466 std::string BackendName()
467 {
468 return std::string("LLVM ") + LLVM_VERSION_STRING;
469 }
470
471 const Capabilities Caps = {
472 true, // CoroutinesSupported
473 };
474
// The abstract Type* types are implemented as LLVM types, with one exception:
// 64-bit vectors are emulated using 128-bit ones. This avoids the use of MMX
// on x86 and VFP on ARM, and eliminates the overhead of converting them to
// explicit 128-bit vectors. Because LLVM types are pointers, an emulated type
// can be represented as an abstract pointer holding a small enum value.
enum InternalType : uintptr_t
{
	// Emulated types:
	Type_v2i32,
	Type_v4i16,
	Type_v2i16,
	Type_v8i8,
	Type_v4i8,
	Type_v2f32,
	EmulatedTypeCount,
	// Returned by asInternalType() to signal that the abstract Type* is to
	// be interpreted as an LLVM type pointer:
	Type_LLVM
};
494
asInternalType(Type * type)495 inline InternalType asInternalType(Type *type)
496 {
497 InternalType t = static_cast<InternalType>(reinterpret_cast<uintptr_t>(type));
498 return (t < EmulatedTypeCount) ? t : Type_LLVM;
499 }
500
T(Type * t)501 llvm::Type *T(Type *t)
502 {
503 // Use 128-bit vectors to implement logically shorter ones.
504 switch(asInternalType(t))
505 {
506 case Type_v2i32: return T(Int4::getType());
507 case Type_v4i16: return T(Short8::getType());
508 case Type_v2i16: return T(Short8::getType());
509 case Type_v8i8: return T(Byte16::getType());
510 case Type_v4i8: return T(Byte16::getType());
511 case Type_v2f32: return T(Float4::getType());
512 case Type_LLVM: return reinterpret_cast<llvm::Type *>(t);
513 default:
514 UNREACHABLE("asInternalType(t): %d", int(asInternalType(t)));
515 return nullptr;
516 }
517 }
518
T(InternalType t)519 Type *T(InternalType t)
520 {
521 return reinterpret_cast<Type *>(t);
522 }
523
T(const std::vector<Type * > & t)524 inline const std::vector<llvm::Type *> &T(const std::vector<Type *> &t)
525 {
526 return reinterpret_cast<const std::vector<llvm::Type *> &>(t);
527 }
528
B(BasicBlock * t)529 inline llvm::BasicBlock *B(BasicBlock *t)
530 {
531 return reinterpret_cast<llvm::BasicBlock *>(t);
532 }
533
B(llvm::BasicBlock * t)534 inline BasicBlock *B(llvm::BasicBlock *t)
535 {
536 return reinterpret_cast<BasicBlock *>(t);
537 }
538
typeSize(Type * type)539 static size_t typeSize(Type *type)
540 {
541 switch(asInternalType(type))
542 {
543 case Type_v2i32: return 8;
544 case Type_v4i16: return 8;
545 case Type_v2i16: return 4;
546 case Type_v8i8: return 8;
547 case Type_v4i8: return 4;
548 case Type_v2f32: return 8;
549 case Type_LLVM:
550 {
551 llvm::Type *t = T(type);
552
553 if(t->isPointerTy())
554 {
555 return sizeof(void *);
556 }
557
558 // At this point we should only have LLVM 'primitive' types.
559 unsigned int bits = t->getPrimitiveSizeInBits();
560 ASSERT_MSG(bits != 0, "bits: %d", int(bits));
561
562 // TODO(capn): Booleans are 1 bit integers in LLVM's SSA type system,
563 // but are typically stored as one byte. The DataLayout structure should
564 // be used here and many other places if this assumption fails.
565 return (bits + 7) / 8;
566 }
567 break;
568 default:
569 UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
570 return 0;
571 }
572 }
573
elementCount(Type * type)574 static unsigned int elementCount(Type *type)
575 {
576 switch(asInternalType(type))
577 {
578 case Type_v2i32: return 2;
579 case Type_v4i16: return 4;
580 case Type_v2i16: return 2;
581 case Type_v8i8: return 8;
582 case Type_v4i8: return 4;
583 case Type_v2f32: return 2;
584 case Type_LLVM: return llvm::cast<llvm::VectorType>(T(type))->getNumElements();
585 default:
586 UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
587 return 0;
588 }
589 }
590
createFunction(const char * name,::llvm::Type * retTy,const std::vector<::llvm::Type * > & params)591 static ::llvm::Function *createFunction(const char *name, ::llvm::Type *retTy, const std::vector<::llvm::Type *> ¶ms)
592 {
593 llvm::FunctionType *functionType = llvm::FunctionType::get(retTy, params, false);
594 auto func = llvm::Function::Create(functionType, llvm::GlobalValue::InternalLinkage, name, jit->module.get());
595 func->setDoesNotThrow();
596 func->setCallingConv(llvm::CallingConv::C);
597 return func;
598 }
599
Nucleus()600 Nucleus::Nucleus()
601 {
602 ::codegenMutex.lock(); // Reactor and LLVM are currently not thread safe
603
604 ASSERT(jit == nullptr);
605 jit.reset(new JITBuilder(Nucleus::getDefaultConfig()));
606 }
607
~Nucleus()608 Nucleus::~Nucleus()
609 {
610 jit.reset();
611 ::codegenMutex.unlock();
612 }
613
setDefaultConfig(const Config & cfg)614 void Nucleus::setDefaultConfig(const Config &cfg)
615 {
616 std::unique_lock<std::mutex> lock(::defaultConfigLock);
617 ::defaultConfig() = cfg;
618 }
619
adjustDefaultConfig(const Config::Edit & cfgEdit)620 void Nucleus::adjustDefaultConfig(const Config::Edit &cfgEdit)
621 {
622 std::unique_lock<std::mutex> lock(::defaultConfigLock);
623 auto &config = ::defaultConfig();
624 config = cfgEdit.apply(config);
625 }
626
getDefaultConfig()627 Config Nucleus::getDefaultConfig()
628 {
629 std::unique_lock<std::mutex> lock(::defaultConfigLock);
630 return ::defaultConfig();
631 }
632
acquireRoutine(const char * name,const Config::Edit & cfgEdit)633 std::shared_ptr<Routine> Nucleus::acquireRoutine(const char *name, const Config::Edit &cfgEdit /* = Config::Edit::None */)
634 {
635 std::shared_ptr<Routine> routine;
636
637 auto acquire = [&]() {
638 auto cfg = cfgEdit.apply(jit->config);
639
640 if(jit->builder->GetInsertBlock()->empty() || !jit->builder->GetInsertBlock()->back().isTerminator())
641 {
642 llvm::Type *type = jit->function->getReturnType();
643
644 if(type->isVoidTy())
645 {
646 createRetVoid();
647 }
648 else
649 {
650 createRet(V(llvm::UndefValue::get(type)));
651 }
652 }
653
654 #ifdef ENABLE_RR_DEBUG_INFO
655 if(jit->debugInfo != nullptr)
656 {
657 jit->debugInfo->Finalize();
658 }
659 #endif // ENABLE_RR_DEBUG_INFO
660
661 if(false)
662 {
663 std::error_code error;
664 llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-unopt.txt", error);
665 jit->module->print(file, 0);
666 }
667
668 #if defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
669 {
670 llvm::legacy::PassManager pm;
671 pm.add(llvm::createVerifierPass());
672 pm.run(*jit->module);
673 }
674 #endif // defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
675
676 jit->optimize(cfg);
677
678 if(false)
679 {
680 std::error_code error;
681 llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-opt.txt", error);
682 jit->module->print(file, 0);
683 }
684
685 routine = jit->acquireRoutine(&jit->function, 1, cfg);
686 jit.reset();
687 };
688
689 #ifdef JIT_IN_SEPARATE_THREAD
690 // Perform optimizations and codegen in a separate thread to avoid stack overflow.
691 // FIXME(b/149829034): This is not a long-term solution. Reactor has no control
692 // over the threading and stack sizes of its users, so this should be addressed
693 // at a higher level instead.
694 std::thread thread(acquire);
695 thread.join();
696 #else
697 acquire();
698 #endif
699
700 return routine;
701 }
702
allocateStackVariable(Type * type,int arraySize)703 Value *Nucleus::allocateStackVariable(Type *type, int arraySize)
704 {
705 // Need to allocate it in the entry block for mem2reg to work
706 llvm::BasicBlock &entryBlock = jit->function->getEntryBlock();
707
708 llvm::Instruction *declaration;
709
710 if(arraySize)
711 {
712 declaration = new llvm::AllocaInst(T(type), 0, V(Nucleus::createConstantInt(arraySize)));
713 }
714 else
715 {
716 declaration = new llvm::AllocaInst(T(type), 0, (llvm::Value *)nullptr);
717 }
718
719 entryBlock.getInstList().push_front(declaration);
720
721 return V(declaration);
722 }
723
createBasicBlock()724 BasicBlock *Nucleus::createBasicBlock()
725 {
726 return B(llvm::BasicBlock::Create(jit->context, "", jit->function));
727 }
728
getInsertBlock()729 BasicBlock *Nucleus::getInsertBlock()
730 {
731 return B(jit->builder->GetInsertBlock());
732 }
733
setInsertBlock(BasicBlock * basicBlock)734 void Nucleus::setInsertBlock(BasicBlock *basicBlock)
735 {
736 // assert(jit->builder->GetInsertBlock()->back().isTerminator());
737
738 Variable::materializeAll();
739
740 jit->builder->SetInsertPoint(B(basicBlock));
741 }
742
createFunction(Type * ReturnType,const std::vector<Type * > & Params)743 void Nucleus::createFunction(Type *ReturnType, const std::vector<Type *> &Params)
744 {
745 jit->function = rr::createFunction("", T(ReturnType), T(Params));
746
747 #ifdef ENABLE_RR_DEBUG_INFO
748 jit->debugInfo = std::make_unique<DebugInfo>(jit->builder.get(), &jit->context, jit->module.get(), jit->function);
749 #endif // ENABLE_RR_DEBUG_INFO
750
751 jit->builder->SetInsertPoint(llvm::BasicBlock::Create(jit->context, "", jit->function));
752 }
753
getArgument(unsigned int index)754 Value *Nucleus::getArgument(unsigned int index)
755 {
756 llvm::Function::arg_iterator args = jit->function->arg_begin();
757
758 while(index)
759 {
760 args++;
761 index--;
762 }
763
764 return V(&*args);
765 }
766
createRetVoid()767 void Nucleus::createRetVoid()
768 {
769 RR_DEBUG_INFO_UPDATE_LOC();
770
771 ASSERT_MSG(jit->function->getReturnType() == T(Void::getType()), "Return type mismatch");
772
773 // Code generated after this point is unreachable, so any variables
774 // being read can safely return an undefined value. We have to avoid
775 // materializing variables after the terminator ret instruction.
776 Variable::killUnmaterialized();
777
778 jit->builder->CreateRetVoid();
779 }
780
createRet(Value * v)781 void Nucleus::createRet(Value *v)
782 {
783 RR_DEBUG_INFO_UPDATE_LOC();
784
785 ASSERT_MSG(jit->function->getReturnType() == V(v)->getType(), "Return type mismatch");
786
787 // Code generated after this point is unreachable, so any variables
788 // being read can safely return an undefined value. We have to avoid
789 // materializing variables after the terminator ret instruction.
790 Variable::killUnmaterialized();
791
792 jit->builder->CreateRet(V(v));
793 }
794
createBr(BasicBlock * dest)795 void Nucleus::createBr(BasicBlock *dest)
796 {
797 RR_DEBUG_INFO_UPDATE_LOC();
798 Variable::materializeAll();
799
800 jit->builder->CreateBr(B(dest));
801 }
802
createCondBr(Value * cond,BasicBlock * ifTrue,BasicBlock * ifFalse)803 void Nucleus::createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse)
804 {
805 RR_DEBUG_INFO_UPDATE_LOC();
806 Variable::materializeAll();
807 jit->builder->CreateCondBr(V(cond), B(ifTrue), B(ifFalse));
808 }
809
createAdd(Value * lhs,Value * rhs)810 Value *Nucleus::createAdd(Value *lhs, Value *rhs)
811 {
812 RR_DEBUG_INFO_UPDATE_LOC();
813 return V(jit->builder->CreateAdd(V(lhs), V(rhs)));
814 }
815
createSub(Value * lhs,Value * rhs)816 Value *Nucleus::createSub(Value *lhs, Value *rhs)
817 {
818 RR_DEBUG_INFO_UPDATE_LOC();
819 return V(jit->builder->CreateSub(V(lhs), V(rhs)));
820 }
821
createMul(Value * lhs,Value * rhs)822 Value *Nucleus::createMul(Value *lhs, Value *rhs)
823 {
824 RR_DEBUG_INFO_UPDATE_LOC();
825 return V(jit->builder->CreateMul(V(lhs), V(rhs)));
826 }
827
createUDiv(Value * lhs,Value * rhs)828 Value *Nucleus::createUDiv(Value *lhs, Value *rhs)
829 {
830 RR_DEBUG_INFO_UPDATE_LOC();
831 return V(jit->builder->CreateUDiv(V(lhs), V(rhs)));
832 }
833
createSDiv(Value * lhs,Value * rhs)834 Value *Nucleus::createSDiv(Value *lhs, Value *rhs)
835 {
836 RR_DEBUG_INFO_UPDATE_LOC();
837 return V(jit->builder->CreateSDiv(V(lhs), V(rhs)));
838 }
839
createFAdd(Value * lhs,Value * rhs)840 Value *Nucleus::createFAdd(Value *lhs, Value *rhs)
841 {
842 RR_DEBUG_INFO_UPDATE_LOC();
843 return V(jit->builder->CreateFAdd(V(lhs), V(rhs)));
844 }
845
createFSub(Value * lhs,Value * rhs)846 Value *Nucleus::createFSub(Value *lhs, Value *rhs)
847 {
848 RR_DEBUG_INFO_UPDATE_LOC();
849 return V(jit->builder->CreateFSub(V(lhs), V(rhs)));
850 }
851
createFMul(Value * lhs,Value * rhs)852 Value *Nucleus::createFMul(Value *lhs, Value *rhs)
853 {
854 RR_DEBUG_INFO_UPDATE_LOC();
855 return V(jit->builder->CreateFMul(V(lhs), V(rhs)));
856 }
857
createFDiv(Value * lhs,Value * rhs)858 Value *Nucleus::createFDiv(Value *lhs, Value *rhs)
859 {
860 RR_DEBUG_INFO_UPDATE_LOC();
861 return V(jit->builder->CreateFDiv(V(lhs), V(rhs)));
862 }
863
createURem(Value * lhs,Value * rhs)864 Value *Nucleus::createURem(Value *lhs, Value *rhs)
865 {
866 RR_DEBUG_INFO_UPDATE_LOC();
867 return V(jit->builder->CreateURem(V(lhs), V(rhs)));
868 }
869
createSRem(Value * lhs,Value * rhs)870 Value *Nucleus::createSRem(Value *lhs, Value *rhs)
871 {
872 RR_DEBUG_INFO_UPDATE_LOC();
873 return V(jit->builder->CreateSRem(V(lhs), V(rhs)));
874 }
875
createFRem(Value * lhs,Value * rhs)876 Value *Nucleus::createFRem(Value *lhs, Value *rhs)
877 {
878 RR_DEBUG_INFO_UPDATE_LOC();
879 return V(jit->builder->CreateFRem(V(lhs), V(rhs)));
880 }
881
operator %(RValue<Float4> lhs,RValue<Float4> rhs)882 RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs)
883 {
884 return RValue<Float4>(Nucleus::createFRem(lhs.value, rhs.value));
885 }
886
createShl(Value * lhs,Value * rhs)887 Value *Nucleus::createShl(Value *lhs, Value *rhs)
888 {
889 RR_DEBUG_INFO_UPDATE_LOC();
890 return V(jit->builder->CreateShl(V(lhs), V(rhs)));
891 }
892
createLShr(Value * lhs,Value * rhs)893 Value *Nucleus::createLShr(Value *lhs, Value *rhs)
894 {
895 RR_DEBUG_INFO_UPDATE_LOC();
896 return V(jit->builder->CreateLShr(V(lhs), V(rhs)));
897 }
898
createAShr(Value * lhs,Value * rhs)899 Value *Nucleus::createAShr(Value *lhs, Value *rhs)
900 {
901 RR_DEBUG_INFO_UPDATE_LOC();
902 return V(jit->builder->CreateAShr(V(lhs), V(rhs)));
903 }
904
createAnd(Value * lhs,Value * rhs)905 Value *Nucleus::createAnd(Value *lhs, Value *rhs)
906 {
907 RR_DEBUG_INFO_UPDATE_LOC();
908 return V(jit->builder->CreateAnd(V(lhs), V(rhs)));
909 }
910
createOr(Value * lhs,Value * rhs)911 Value *Nucleus::createOr(Value *lhs, Value *rhs)
912 {
913 RR_DEBUG_INFO_UPDATE_LOC();
914 return V(jit->builder->CreateOr(V(lhs), V(rhs)));
915 }
916
createXor(Value * lhs,Value * rhs)917 Value *Nucleus::createXor(Value *lhs, Value *rhs)
918 {
919 RR_DEBUG_INFO_UPDATE_LOC();
920 return V(jit->builder->CreateXor(V(lhs), V(rhs)));
921 }
922
createNeg(Value * v)923 Value *Nucleus::createNeg(Value *v)
924 {
925 RR_DEBUG_INFO_UPDATE_LOC();
926 return V(jit->builder->CreateNeg(V(v)));
927 }
928
createFNeg(Value * v)929 Value *Nucleus::createFNeg(Value *v)
930 {
931 RR_DEBUG_INFO_UPDATE_LOC();
932 return V(jit->builder->CreateFNeg(V(v)));
933 }
934
createNot(Value * v)935 Value *Nucleus::createNot(Value *v)
936 {
937 RR_DEBUG_INFO_UPDATE_LOC();
938 return V(jit->builder->CreateNot(V(v)));
939 }
940
createLoad(Value * ptr,Type * type,bool isVolatile,unsigned int alignment,bool atomic,std::memory_order memoryOrder)941 Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
942 {
943 RR_DEBUG_INFO_UPDATE_LOC();
944 switch(asInternalType(type))
945 {
946 case Type_v2i32:
947 case Type_v4i16:
948 case Type_v8i8:
949 case Type_v2f32:
950 return createBitCast(
951 createInsertElement(
952 V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::getType()), 2))),
953 createLoad(createBitCast(ptr, Pointer<Long>::getType()), Long::getType(), isVolatile, alignment, atomic, memoryOrder),
954 0),
955 type);
956 case Type_v2i16:
957 case Type_v4i8:
958 if(alignment != 0) // Not a local variable (all vectors are 128-bit).
959 {
960 Value *u = V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::getType()), 2)));
961 Value *i = createLoad(createBitCast(ptr, Pointer<Int>::getType()), Int::getType(), isVolatile, alignment, atomic, memoryOrder);
962 i = createZExt(i, Long::getType());
963 Value *v = createInsertElement(u, i, 0);
964 return createBitCast(v, type);
965 }
966 // Fallthrough to non-emulated case.
967 case Type_LLVM:
968 {
969 auto elTy = T(type);
970 ASSERT(V(ptr)->getType()->getContainedType(0) == elTy);
971
972 if(!atomic)
973 {
974 return V(jit->builder->CreateAlignedLoad(V(ptr), alignment, isVolatile));
975 }
976 else if(elTy->isIntegerTy() || elTy->isPointerTy())
977 {
978 // Integers and pointers can be atomically loaded by setting
979 // the ordering constraint on the load instruction.
980 auto load = jit->builder->CreateAlignedLoad(V(ptr), alignment, isVolatile);
981 load->setAtomic(atomicOrdering(atomic, memoryOrder));
982 return V(load);
983 }
984 else if(elTy->isFloatTy() || elTy->isDoubleTy())
985 {
986 // LLVM claims to support atomic loads of float types as
987 // above, but certain backends cannot deal with this.
988 // Load as an integer and bitcast. See b/136037244.
989 auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
990 auto elAsIntTy = ::llvm::IntegerType::get(jit->context, size * 8);
991 auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
992 auto load = jit->builder->CreateAlignedLoad(ptrCast, alignment, isVolatile);
993 load->setAtomic(atomicOrdering(atomic, memoryOrder));
994 auto loadCast = jit->builder->CreateBitCast(load, elTy);
995 return V(loadCast);
996 }
997 else
998 {
999 // More exotic types require falling back to the extern:
1000 // void __atomic_load(size_t size, void *ptr, void *ret, int ordering)
1001 auto sizetTy = ::llvm::IntegerType::get(jit->context, sizeof(size_t) * 8);
1002 auto intTy = ::llvm::IntegerType::get(jit->context, sizeof(int) * 8);
1003 auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
1004 auto i8PtrTy = i8Ty->getPointerTo();
1005 auto voidTy = ::llvm::Type::getVoidTy(jit->context);
1006 auto funcTy = ::llvm::FunctionType::get(voidTy, { sizetTy, i8PtrTy, i8PtrTy, intTy }, false);
1007 auto func = jit->module->getOrInsertFunction("__atomic_load", funcTy);
1008 auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
1009 auto out = allocateStackVariable(type);
1010 jit->builder->CreateCall(func, {
1011 ::llvm::ConstantInt::get(sizetTy, size),
1012 jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
1013 jit->builder->CreatePointerCast(V(out), i8PtrTy),
1014 ::llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
1015 });
1016 return V(jit->builder->CreateLoad(V(out)));
1017 }
1018 }
1019 default:
1020 UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
1021 return nullptr;
1022 }
1023 }
1024
createStore(Value * value,Value * ptr,Type * type,bool isVolatile,unsigned int alignment,bool atomic,std::memory_order memoryOrder)1025 Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
1026 {
1027 RR_DEBUG_INFO_UPDATE_LOC();
1028 switch(asInternalType(type))
1029 {
1030 case Type_v2i32:
1031 case Type_v4i16:
1032 case Type_v8i8:
1033 case Type_v2f32:
1034 createStore(
1035 createExtractElement(
1036 createBitCast(value, T(llvm::VectorType::get(T(Long::getType()), 2))), Long::getType(), 0),
1037 createBitCast(ptr, Pointer<Long>::getType()),
1038 Long::getType(), isVolatile, alignment, atomic, memoryOrder);
1039 return value;
1040 case Type_v2i16:
1041 case Type_v4i8:
1042 if(alignment != 0) // Not a local variable (all vectors are 128-bit).
1043 {
1044 createStore(
1045 createExtractElement(createBitCast(value, Int4::getType()), Int::getType(), 0),
1046 createBitCast(ptr, Pointer<Int>::getType()),
1047 Int::getType(), isVolatile, alignment, atomic, memoryOrder);
1048 return value;
1049 }
1050 // Fallthrough to non-emulated case.
1051 case Type_LLVM:
1052 {
1053 auto elTy = T(type);
1054 ASSERT(V(ptr)->getType()->getContainedType(0) == elTy);
1055
1056 #if __has_feature(memory_sanitizer)
1057 // Mark all memory writes as initialized by calling __msan_unpoison
1058 {
1059 // void __msan_unpoison(const volatile void *a, size_t size)
1060 auto voidTy = ::llvm::Type::getVoidTy(jit->context);
1061 auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
1062 auto voidPtrTy = i8Ty->getPointerTo();
1063 auto sizetTy = ::llvm::IntegerType::get(jit->context, sizeof(size_t) * 8);
1064 auto funcTy = ::llvm::FunctionType::get(voidTy, { voidPtrTy, sizetTy }, false);
1065 auto func = jit->module->getOrInsertFunction("__msan_unpoison", funcTy);
1066 auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
1067 jit->builder->CreateCall(func, { jit->builder->CreatePointerCast(V(ptr), voidPtrTy),
1068 ::llvm::ConstantInt::get(sizetTy, size) });
1069 }
1070 #endif
1071
1072 if(!atomic)
1073 {
1074 jit->builder->CreateAlignedStore(V(value), V(ptr), alignment, isVolatile);
1075 }
1076 else if(elTy->isIntegerTy() || elTy->isPointerTy())
1077 {
1078 // Integers and pointers can be atomically stored by setting
1079 // the ordering constraint on the store instruction.
1080 auto store = jit->builder->CreateAlignedStore(V(value), V(ptr), alignment, isVolatile);
1081 store->setAtomic(atomicOrdering(atomic, memoryOrder));
1082 }
1083 else if(elTy->isFloatTy() || elTy->isDoubleTy())
1084 {
1085 // LLVM claims to support atomic stores of float types as
1086 // above, but certain backends cannot deal with this.
1087 // Store as an bitcast integer. See b/136037244.
1088 auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
1089 auto elAsIntTy = ::llvm::IntegerType::get(jit->context, size * 8);
1090 auto valCast = jit->builder->CreateBitCast(V(value), elAsIntTy);
1091 auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
1092 auto store = jit->builder->CreateAlignedStore(valCast, ptrCast, alignment, isVolatile);
1093 store->setAtomic(atomicOrdering(atomic, memoryOrder));
1094 }
1095 else
1096 {
1097 // More exotic types require falling back to the extern:
1098 // void __atomic_store(size_t size, void *ptr, void *val, int ordering)
1099 auto sizetTy = ::llvm::IntegerType::get(jit->context, sizeof(size_t) * 8);
1100 auto intTy = ::llvm::IntegerType::get(jit->context, sizeof(int) * 8);
1101 auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
1102 auto i8PtrTy = i8Ty->getPointerTo();
1103 auto voidTy = ::llvm::Type::getVoidTy(jit->context);
1104 auto funcTy = ::llvm::FunctionType::get(voidTy, { sizetTy, i8PtrTy, i8PtrTy, intTy }, false);
1105 auto func = jit->module->getOrInsertFunction("__atomic_store", funcTy);
1106 auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
1107 auto copy = allocateStackVariable(type);
1108 jit->builder->CreateStore(V(value), V(copy));
1109 jit->builder->CreateCall(func, {
1110 ::llvm::ConstantInt::get(sizetTy, size),
1111 jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
1112 jit->builder->CreatePointerCast(V(copy), i8PtrTy),
1113 ::llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
1114 });
1115 }
1116
1117 return value;
1118 }
1119 default:
1120 UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
1121 return nullptr;
1122 }
1123 }
1124
createMaskedLoad(Value * ptr,Type * elTy,Value * mask,unsigned int alignment,bool zeroMaskedLanes)1125 Value *Nucleus::createMaskedLoad(Value *ptr, Type *elTy, Value *mask, unsigned int alignment, bool zeroMaskedLanes)
1126 {
1127 RR_DEBUG_INFO_UPDATE_LOC();
1128
1129 ASSERT(V(ptr)->getType()->isPointerTy());
1130 ASSERT(V(mask)->getType()->isVectorTy());
1131
1132 auto numEls = V(mask)->getType()->getVectorNumElements();
1133 auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
1134 auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
1135 auto elVecTy = ::llvm::VectorType::get(T(elTy), numEls);
1136 auto elVecPtrTy = elVecTy->getPointerTo();
1137 auto i8Mask = jit->builder->CreateIntCast(V(mask), ::llvm::VectorType::get(i1Ty, numEls), false); // vec<int, int, ...> -> vec<bool, bool, ...>
1138 auto passthrough = zeroMaskedLanes ? ::llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
1139 auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
1140 auto func = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_load, { elVecTy, elVecPtrTy });
1141 return V(jit->builder->CreateCall(func, { V(ptr), align, i8Mask, passthrough }));
1142 }
1143
createMaskedStore(Value * ptr,Value * val,Value * mask,unsigned int alignment)1144 void Nucleus::createMaskedStore(Value *ptr, Value *val, Value *mask, unsigned int alignment)
1145 {
1146 RR_DEBUG_INFO_UPDATE_LOC();
1147
1148 ASSERT(V(ptr)->getType()->isPointerTy());
1149 ASSERT(V(val)->getType()->isVectorTy());
1150 ASSERT(V(mask)->getType()->isVectorTy());
1151
1152 auto numEls = V(mask)->getType()->getVectorNumElements();
1153 auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
1154 auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
1155 auto elVecTy = V(val)->getType();
1156 auto elVecPtrTy = elVecTy->getPointerTo();
1157 auto i8Mask = jit->builder->CreateIntCast(V(mask), ::llvm::VectorType::get(i1Ty, numEls), false); // vec<int, int, ...> -> vec<bool, bool, ...>
1158 auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
1159 auto func = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_store, { elVecTy, elVecPtrTy });
1160 jit->builder->CreateCall(func, { V(val), V(ptr), align, i8Mask });
1161 }
1162
Gather(RValue<Pointer<Float>> base,RValue<Int4> offsets,RValue<Int4> mask,unsigned int alignment,bool zeroMaskedLanes)1163 RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
1164 {
1165 return As<Float4>(V(createGather(V(base.value), T(Float::getType()), V(offsets.value), V(mask.value), alignment, zeroMaskedLanes)));
1166 }
1167
Gather(RValue<Pointer<Int>> base,RValue<Int4> offsets,RValue<Int4> mask,unsigned int alignment,bool zeroMaskedLanes)1168 RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
1169 {
1170 return As<Int4>(V(createGather(V(base.value), T(Float::getType()), V(offsets.value), V(mask.value), alignment, zeroMaskedLanes)));
1171 }
1172
Scatter(RValue<Pointer<Float>> base,RValue<Float4> val,RValue<Int4> offsets,RValue<Int4> mask,unsigned int alignment)1173 void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
1174 {
1175 return createScatter(V(base.value), V(val.value), V(offsets.value), V(mask.value), alignment);
1176 }
1177
Scatter(RValue<Pointer<Int>> base,RValue<Int4> val,RValue<Int4> offsets,RValue<Int4> mask,unsigned int alignment)1178 void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
1179 {
1180 return createScatter(V(base.value), V(val.value), V(offsets.value), V(mask.value), alignment);
1181 }
1182
createFence(std::memory_order memoryOrder)1183 void Nucleus::createFence(std::memory_order memoryOrder)
1184 {
1185 RR_DEBUG_INFO_UPDATE_LOC();
1186 jit->builder->CreateFence(atomicOrdering(true, memoryOrder));
1187 }
1188
createGEP(Value * ptr,Type * type,Value * index,bool unsignedIndex)1189 Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex)
1190 {
1191 RR_DEBUG_INFO_UPDATE_LOC();
1192 ASSERT(V(ptr)->getType()->getContainedType(0) == T(type));
1193 if(sizeof(void *) == 8)
1194 {
1195 // LLVM manual: "When indexing into an array, pointer or vector,
1196 // integers of any width are allowed, and they are not required to
1197 // be constant. These integers are treated as signed values where
1198 // relevant."
1199 //
1200 // Thus if we want indexes to be treated as unsigned we have to
1201 // zero-extend them ourselves.
1202 //
1203 // Note that this is not because we want to address anywhere near
1204 // 4 GB of data. Instead this is important for performance because
1205 // x86 supports automatic zero-extending of 32-bit registers to
1206 // 64-bit. Thus when indexing into an array using a uint32 is
1207 // actually faster than an int32.
1208 index = unsignedIndex ? createZExt(index, Long::getType()) : createSExt(index, Long::getType());
1209 }
1210
1211 // For non-emulated types we can rely on LLVM's GEP to calculate the
1212 // effective address correctly.
1213 if(asInternalType(type) == Type_LLVM)
1214 {
1215 return V(jit->builder->CreateGEP(V(ptr), V(index)));
1216 }
1217
1218 // For emulated types we have to multiply the index by the intended
1219 // type size ourselves to obain the byte offset.
1220 index = (sizeof(void *) == 8) ? createMul(index, createConstantLong((int64_t)typeSize(type))) : createMul(index, createConstantInt((int)typeSize(type)));
1221
1222 // Cast to a byte pointer, apply the byte offset, and cast back to the
1223 // original pointer type.
1224 return createBitCast(
1225 V(jit->builder->CreateGEP(V(createBitCast(ptr, T(llvm::PointerType::get(T(Byte::getType()), 0)))), V(index))),
1226 T(llvm::PointerType::get(T(type), 0)));
1227 }
1228
createAtomicAdd(Value * ptr,Value * value,std::memory_order memoryOrder)1229 Value *Nucleus::createAtomicAdd(Value *ptr, Value *value, std::memory_order memoryOrder)
1230 {
1231 RR_DEBUG_INFO_UPDATE_LOC();
1232 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Add, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
1233 }
1234
createAtomicSub(Value * ptr,Value * value,std::memory_order memoryOrder)1235 Value *Nucleus::createAtomicSub(Value *ptr, Value *value, std::memory_order memoryOrder)
1236 {
1237 RR_DEBUG_INFO_UPDATE_LOC();
1238 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Sub, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
1239 }
1240
createAtomicAnd(Value * ptr,Value * value,std::memory_order memoryOrder)1241 Value *Nucleus::createAtomicAnd(Value *ptr, Value *value, std::memory_order memoryOrder)
1242 {
1243 RR_DEBUG_INFO_UPDATE_LOC();
1244 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::And, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
1245 }
1246
createAtomicOr(Value * ptr,Value * value,std::memory_order memoryOrder)1247 Value *Nucleus::createAtomicOr(Value *ptr, Value *value, std::memory_order memoryOrder)
1248 {
1249 RR_DEBUG_INFO_UPDATE_LOC();
1250 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Or, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
1251 }
1252
createAtomicXor(Value * ptr,Value * value,std::memory_order memoryOrder)1253 Value *Nucleus::createAtomicXor(Value *ptr, Value *value, std::memory_order memoryOrder)
1254 {
1255 RR_DEBUG_INFO_UPDATE_LOC();
1256 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xor, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
1257 }
1258
createAtomicMin(Value * ptr,Value * value,std::memory_order memoryOrder)1259 Value *Nucleus::createAtomicMin(Value *ptr, Value *value, std::memory_order memoryOrder)
1260 {
1261 RR_DEBUG_INFO_UPDATE_LOC();
1262 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Min, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
1263 }
1264
createAtomicMax(Value * ptr,Value * value,std::memory_order memoryOrder)1265 Value *Nucleus::createAtomicMax(Value *ptr, Value *value, std::memory_order memoryOrder)
1266 {
1267 RR_DEBUG_INFO_UPDATE_LOC();
1268 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Max, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
1269 }
1270
createAtomicUMin(Value * ptr,Value * value,std::memory_order memoryOrder)1271 Value *Nucleus::createAtomicUMin(Value *ptr, Value *value, std::memory_order memoryOrder)
1272 {
1273 RR_DEBUG_INFO_UPDATE_LOC();
1274 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMin, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
1275 }
1276
createAtomicUMax(Value * ptr,Value * value,std::memory_order memoryOrder)1277 Value *Nucleus::createAtomicUMax(Value *ptr, Value *value, std::memory_order memoryOrder)
1278 {
1279 RR_DEBUG_INFO_UPDATE_LOC();
1280 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMax, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
1281 }
1282
createAtomicExchange(Value * ptr,Value * value,std::memory_order memoryOrder)1283 Value *Nucleus::createAtomicExchange(Value *ptr, Value *value, std::memory_order memoryOrder)
1284 {
1285 RR_DEBUG_INFO_UPDATE_LOC();
1286 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
1287 }
1288
createAtomicCompareExchange(Value * ptr,Value * value,Value * compare,std::memory_order memoryOrderEqual,std::memory_order memoryOrderUnequal)1289 Value *Nucleus::createAtomicCompareExchange(Value *ptr, Value *value, Value *compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal)
1290 {
1291 RR_DEBUG_INFO_UPDATE_LOC();
1292 // Note: AtomicCmpXchgInstruction returns a 2-member struct containing {result, success-flag}, not the result directly.
1293 return V(jit->builder->CreateExtractValue(
1294 jit->builder->CreateAtomicCmpXchg(V(ptr), V(compare), V(value), atomicOrdering(true, memoryOrderEqual), atomicOrdering(true, memoryOrderUnequal)),
1295 llvm::ArrayRef<unsigned>(0u)));
1296 }
1297
createTrunc(Value * v,Type * destType)1298 Value *Nucleus::createTrunc(Value *v, Type *destType)
1299 {
1300 RR_DEBUG_INFO_UPDATE_LOC();
1301 return V(jit->builder->CreateTrunc(V(v), T(destType)));
1302 }
1303
createZExt(Value * v,Type * destType)1304 Value *Nucleus::createZExt(Value *v, Type *destType)
1305 {
1306 RR_DEBUG_INFO_UPDATE_LOC();
1307 return V(jit->builder->CreateZExt(V(v), T(destType)));
1308 }
1309
createSExt(Value * v,Type * destType)1310 Value *Nucleus::createSExt(Value *v, Type *destType)
1311 {
1312 RR_DEBUG_INFO_UPDATE_LOC();
1313 return V(jit->builder->CreateSExt(V(v), T(destType)));
1314 }
1315
createFPToUI(Value * v,Type * destType)1316 Value *Nucleus::createFPToUI(Value *v, Type *destType)
1317 {
1318 RR_DEBUG_INFO_UPDATE_LOC();
1319 return V(jit->builder->CreateFPToUI(V(v), T(destType)));
1320 }
1321
createFPToSI(Value * v,Type * destType)1322 Value *Nucleus::createFPToSI(Value *v, Type *destType)
1323 {
1324 RR_DEBUG_INFO_UPDATE_LOC();
1325 return V(jit->builder->CreateFPToSI(V(v), T(destType)));
1326 }
1327
createSIToFP(Value * v,Type * destType)1328 Value *Nucleus::createSIToFP(Value *v, Type *destType)
1329 {
1330 RR_DEBUG_INFO_UPDATE_LOC();
1331 return V(jit->builder->CreateSIToFP(V(v), T(destType)));
1332 }
1333
createFPTrunc(Value * v,Type * destType)1334 Value *Nucleus::createFPTrunc(Value *v, Type *destType)
1335 {
1336 RR_DEBUG_INFO_UPDATE_LOC();
1337 return V(jit->builder->CreateFPTrunc(V(v), T(destType)));
1338 }
1339
createFPExt(Value * v,Type * destType)1340 Value *Nucleus::createFPExt(Value *v, Type *destType)
1341 {
1342 RR_DEBUG_INFO_UPDATE_LOC();
1343 return V(jit->builder->CreateFPExt(V(v), T(destType)));
1344 }
1345
createBitCast(Value * v,Type * destType)1346 Value *Nucleus::createBitCast(Value *v, Type *destType)
1347 {
1348 RR_DEBUG_INFO_UPDATE_LOC();
1349 // Bitcasts must be between types of the same logical size. But with emulated narrow vectors we need
1350 // support for casting between scalars and wide vectors. Emulate them by writing to the stack and
1351 // reading back as the destination type.
1352 if(!V(v)->getType()->isVectorTy() && T(destType)->isVectorTy())
1353 {
1354 Value *readAddress = allocateStackVariable(destType);
1355 Value *writeAddress = createBitCast(readAddress, T(llvm::PointerType::get(V(v)->getType(), 0)));
1356 createStore(v, writeAddress, T(V(v)->getType()));
1357 return createLoad(readAddress, destType);
1358 }
1359 else if(V(v)->getType()->isVectorTy() && !T(destType)->isVectorTy())
1360 {
1361 Value *writeAddress = allocateStackVariable(T(V(v)->getType()));
1362 createStore(v, writeAddress, T(V(v)->getType()));
1363 Value *readAddress = createBitCast(writeAddress, T(llvm::PointerType::get(T(destType), 0)));
1364 return createLoad(readAddress, destType);
1365 }
1366
1367 return V(jit->builder->CreateBitCast(V(v), T(destType)));
1368 }
1369
createPtrEQ(Value * lhs,Value * rhs)1370 Value *Nucleus::createPtrEQ(Value *lhs, Value *rhs)
1371 {
1372 RR_DEBUG_INFO_UPDATE_LOC();
1373 return V(jit->builder->CreateICmpEQ(V(lhs), V(rhs)));
1374 }
1375
createICmpEQ(Value * lhs,Value * rhs)1376 Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
1377 {
1378 RR_DEBUG_INFO_UPDATE_LOC();
1379 return V(jit->builder->CreateICmpEQ(V(lhs), V(rhs)));
1380 }
1381
createICmpNE(Value * lhs,Value * rhs)1382 Value *Nucleus::createICmpNE(Value *lhs, Value *rhs)
1383 {
1384 RR_DEBUG_INFO_UPDATE_LOC();
1385 return V(jit->builder->CreateICmpNE(V(lhs), V(rhs)));
1386 }
1387
createICmpUGT(Value * lhs,Value * rhs)1388 Value *Nucleus::createICmpUGT(Value *lhs, Value *rhs)
1389 {
1390 RR_DEBUG_INFO_UPDATE_LOC();
1391 return V(jit->builder->CreateICmpUGT(V(lhs), V(rhs)));
1392 }
1393
createICmpUGE(Value * lhs,Value * rhs)1394 Value *Nucleus::createICmpUGE(Value *lhs, Value *rhs)
1395 {
1396 RR_DEBUG_INFO_UPDATE_LOC();
1397 return V(jit->builder->CreateICmpUGE(V(lhs), V(rhs)));
1398 }
1399
createICmpULT(Value * lhs,Value * rhs)1400 Value *Nucleus::createICmpULT(Value *lhs, Value *rhs)
1401 {
1402 RR_DEBUG_INFO_UPDATE_LOC();
1403 return V(jit->builder->CreateICmpULT(V(lhs), V(rhs)));
1404 }
1405
createICmpULE(Value * lhs,Value * rhs)1406 Value *Nucleus::createICmpULE(Value *lhs, Value *rhs)
1407 {
1408 RR_DEBUG_INFO_UPDATE_LOC();
1409 return V(jit->builder->CreateICmpULE(V(lhs), V(rhs)));
1410 }
1411
createICmpSGT(Value * lhs,Value * rhs)1412 Value *Nucleus::createICmpSGT(Value *lhs, Value *rhs)
1413 {
1414 RR_DEBUG_INFO_UPDATE_LOC();
1415 return V(jit->builder->CreateICmpSGT(V(lhs), V(rhs)));
1416 }
1417
createICmpSGE(Value * lhs,Value * rhs)1418 Value *Nucleus::createICmpSGE(Value *lhs, Value *rhs)
1419 {
1420 RR_DEBUG_INFO_UPDATE_LOC();
1421 return V(jit->builder->CreateICmpSGE(V(lhs), V(rhs)));
1422 }
1423
createICmpSLT(Value * lhs,Value * rhs)1424 Value *Nucleus::createICmpSLT(Value *lhs, Value *rhs)
1425 {
1426 RR_DEBUG_INFO_UPDATE_LOC();
1427 return V(jit->builder->CreateICmpSLT(V(lhs), V(rhs)));
1428 }
1429
createICmpSLE(Value * lhs,Value * rhs)1430 Value *Nucleus::createICmpSLE(Value *lhs, Value *rhs)
1431 {
1432 RR_DEBUG_INFO_UPDATE_LOC();
1433 return V(jit->builder->CreateICmpSLE(V(lhs), V(rhs)));
1434 }
1435
createFCmpOEQ(Value * lhs,Value * rhs)1436 Value *Nucleus::createFCmpOEQ(Value *lhs, Value *rhs)
1437 {
1438 RR_DEBUG_INFO_UPDATE_LOC();
1439 return V(jit->builder->CreateFCmpOEQ(V(lhs), V(rhs)));
1440 }
1441
createFCmpOGT(Value * lhs,Value * rhs)1442 Value *Nucleus::createFCmpOGT(Value *lhs, Value *rhs)
1443 {
1444 RR_DEBUG_INFO_UPDATE_LOC();
1445 return V(jit->builder->CreateFCmpOGT(V(lhs), V(rhs)));
1446 }
1447
createFCmpOGE(Value * lhs,Value * rhs)1448 Value *Nucleus::createFCmpOGE(Value *lhs, Value *rhs)
1449 {
1450 RR_DEBUG_INFO_UPDATE_LOC();
1451 return V(jit->builder->CreateFCmpOGE(V(lhs), V(rhs)));
1452 }
1453
createFCmpOLT(Value * lhs,Value * rhs)1454 Value *Nucleus::createFCmpOLT(Value *lhs, Value *rhs)
1455 {
1456 RR_DEBUG_INFO_UPDATE_LOC();
1457 return V(jit->builder->CreateFCmpOLT(V(lhs), V(rhs)));
1458 }
1459
createFCmpOLE(Value * lhs,Value * rhs)1460 Value *Nucleus::createFCmpOLE(Value *lhs, Value *rhs)
1461 {
1462 RR_DEBUG_INFO_UPDATE_LOC();
1463 return V(jit->builder->CreateFCmpOLE(V(lhs), V(rhs)));
1464 }
1465
createFCmpONE(Value * lhs,Value * rhs)1466 Value *Nucleus::createFCmpONE(Value *lhs, Value *rhs)
1467 {
1468 RR_DEBUG_INFO_UPDATE_LOC();
1469 return V(jit->builder->CreateFCmpONE(V(lhs), V(rhs)));
1470 }
1471
createFCmpORD(Value * lhs,Value * rhs)1472 Value *Nucleus::createFCmpORD(Value *lhs, Value *rhs)
1473 {
1474 RR_DEBUG_INFO_UPDATE_LOC();
1475 return V(jit->builder->CreateFCmpORD(V(lhs), V(rhs)));
1476 }
1477
createFCmpUNO(Value * lhs,Value * rhs)1478 Value *Nucleus::createFCmpUNO(Value *lhs, Value *rhs)
1479 {
1480 RR_DEBUG_INFO_UPDATE_LOC();
1481 return V(jit->builder->CreateFCmpUNO(V(lhs), V(rhs)));
1482 }
1483
createFCmpUEQ(Value * lhs,Value * rhs)1484 Value *Nucleus::createFCmpUEQ(Value *lhs, Value *rhs)
1485 {
1486 RR_DEBUG_INFO_UPDATE_LOC();
1487 return V(jit->builder->CreateFCmpUEQ(V(lhs), V(rhs)));
1488 }
1489
createFCmpUGT(Value * lhs,Value * rhs)1490 Value *Nucleus::createFCmpUGT(Value *lhs, Value *rhs)
1491 {
1492 RR_DEBUG_INFO_UPDATE_LOC();
1493 return V(jit->builder->CreateFCmpUGT(V(lhs), V(rhs)));
1494 }
1495
createFCmpUGE(Value * lhs,Value * rhs)1496 Value *Nucleus::createFCmpUGE(Value *lhs, Value *rhs)
1497 {
1498 RR_DEBUG_INFO_UPDATE_LOC();
1499 return V(jit->builder->CreateFCmpUGE(V(lhs), V(rhs)));
1500 }
1501
createFCmpULT(Value * lhs,Value * rhs)1502 Value *Nucleus::createFCmpULT(Value *lhs, Value *rhs)
1503 {
1504 RR_DEBUG_INFO_UPDATE_LOC();
1505 return V(jit->builder->CreateFCmpULT(V(lhs), V(rhs)));
1506 }
1507
createFCmpULE(Value * lhs,Value * rhs)1508 Value *Nucleus::createFCmpULE(Value *lhs, Value *rhs)
1509 {
1510 RR_DEBUG_INFO_UPDATE_LOC();
1511 return V(jit->builder->CreateFCmpULE(V(lhs), V(rhs)));
1512 }
1513
createFCmpUNE(Value * lhs,Value * rhs)1514 Value *Nucleus::createFCmpUNE(Value *lhs, Value *rhs)
1515 {
1516 RR_DEBUG_INFO_UPDATE_LOC();
1517 return V(jit->builder->CreateFCmpUNE(V(lhs), V(rhs)));
1518 }
1519
createExtractElement(Value * vector,Type * type,int index)1520 Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)
1521 {
1522 RR_DEBUG_INFO_UPDATE_LOC();
1523 ASSERT(V(vector)->getType()->getContainedType(0) == T(type));
1524 return V(jit->builder->CreateExtractElement(V(vector), V(createConstantInt(index))));
1525 }
1526
createInsertElement(Value * vector,Value * element,int index)1527 Value *Nucleus::createInsertElement(Value *vector, Value *element, int index)
1528 {
1529 RR_DEBUG_INFO_UPDATE_LOC();
1530 return V(jit->builder->CreateInsertElement(V(vector), V(element), V(createConstantInt(index))));
1531 }
1532
createShuffleVector(Value * v1,Value * v2,const int * select)1533 Value *Nucleus::createShuffleVector(Value *v1, Value *v2, const int *select)
1534 {
1535 RR_DEBUG_INFO_UPDATE_LOC();
1536
1537 int size = llvm::cast<llvm::VectorType>(V(v1)->getType())->getNumElements();
1538 const int maxSize = 16;
1539 llvm::Constant *swizzle[maxSize];
1540 ASSERT(size <= maxSize);
1541
1542 for(int i = 0; i < size; i++)
1543 {
1544 swizzle[i] = llvm::ConstantInt::get(llvm::Type::getInt32Ty(jit->context), select[i]);
1545 }
1546
1547 llvm::Value *shuffle = llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(swizzle, size));
1548
1549 return V(jit->builder->CreateShuffleVector(V(v1), V(v2), shuffle));
1550 }
1551
createSelect(Value * c,Value * ifTrue,Value * ifFalse)1552 Value *Nucleus::createSelect(Value *c, Value *ifTrue, Value *ifFalse)
1553 {
1554 RR_DEBUG_INFO_UPDATE_LOC();
1555 return V(jit->builder->CreateSelect(V(c), V(ifTrue), V(ifFalse)));
1556 }
1557
createSwitch(Value * control,BasicBlock * defaultBranch,unsigned numCases)1558 SwitchCases *Nucleus::createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases)
1559 {
1560 RR_DEBUG_INFO_UPDATE_LOC();
1561 return reinterpret_cast<SwitchCases *>(jit->builder->CreateSwitch(V(control), B(defaultBranch), numCases));
1562 }
1563
addSwitchCase(SwitchCases * switchCases,int label,BasicBlock * branch)1564 void Nucleus::addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch)
1565 {
1566 RR_DEBUG_INFO_UPDATE_LOC();
1567 llvm::SwitchInst *sw = reinterpret_cast<llvm::SwitchInst *>(switchCases);
1568 sw->addCase(llvm::ConstantInt::get(llvm::Type::getInt32Ty(jit->context), label, true), B(branch));
1569 }
1570
createUnreachable()1571 void Nucleus::createUnreachable()
1572 {
1573 RR_DEBUG_INFO_UPDATE_LOC();
1574 jit->builder->CreateUnreachable();
1575 }
1576
getType(Value * value)1577 Type *Nucleus::getType(Value *value)
1578 {
1579 return T(V(value)->getType());
1580 }
1581
getContainedType(Type * vectorType)1582 Type *Nucleus::getContainedType(Type *vectorType)
1583 {
1584 return T(T(vectorType)->getContainedType(0));
1585 }
1586
getPointerType(Type * ElementType)1587 Type *Nucleus::getPointerType(Type *ElementType)
1588 {
1589 return T(llvm::PointerType::get(T(ElementType), 0));
1590 }
1591
getNaturalIntType()1592 static ::llvm::Type *getNaturalIntType()
1593 {
1594 return ::llvm::Type::getIntNTy(jit->context, sizeof(int) * 8);
1595 }
1596
getPrintfStorageType(Type * valueType)1597 Type *Nucleus::getPrintfStorageType(Type *valueType)
1598 {
1599 llvm::Type *valueTy = T(valueType);
1600 if(valueTy->isIntegerTy())
1601 {
1602 return T(getNaturalIntType());
1603 }
1604 if(valueTy->isFloatTy())
1605 {
1606 return T(llvm::Type::getDoubleTy(jit->context));
1607 }
1608
1609 UNIMPLEMENTED_NO_BUG("getPrintfStorageType: add more cases as needed");
1610 return {};
1611 }
1612
createNullValue(Type * Ty)1613 Value *Nucleus::createNullValue(Type *Ty)
1614 {
1615 RR_DEBUG_INFO_UPDATE_LOC();
1616 return V(llvm::Constant::getNullValue(T(Ty)));
1617 }
1618
createConstantLong(int64_t i)1619 Value *Nucleus::createConstantLong(int64_t i)
1620 {
1621 RR_DEBUG_INFO_UPDATE_LOC();
1622 return V(llvm::ConstantInt::get(llvm::Type::getInt64Ty(jit->context), i, true));
1623 }
1624
createConstantInt(int i)1625 Value *Nucleus::createConstantInt(int i)
1626 {
1627 RR_DEBUG_INFO_UPDATE_LOC();
1628 return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(jit->context), i, true));
1629 }
1630
createConstantInt(unsigned int i)1631 Value *Nucleus::createConstantInt(unsigned int i)
1632 {
1633 RR_DEBUG_INFO_UPDATE_LOC();
1634 return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(jit->context), i, false));
1635 }
1636
createConstantBool(bool b)1637 Value *Nucleus::createConstantBool(bool b)
1638 {
1639 RR_DEBUG_INFO_UPDATE_LOC();
1640 return V(llvm::ConstantInt::get(llvm::Type::getInt1Ty(jit->context), b));
1641 }
1642
createConstantByte(signed char i)1643 Value *Nucleus::createConstantByte(signed char i)
1644 {
1645 RR_DEBUG_INFO_UPDATE_LOC();
1646 return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(jit->context), i, true));
1647 }
1648
createConstantByte(unsigned char i)1649 Value *Nucleus::createConstantByte(unsigned char i)
1650 {
1651 RR_DEBUG_INFO_UPDATE_LOC();
1652 return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(jit->context), i, false));
1653 }
1654
createConstantShort(short i)1655 Value *Nucleus::createConstantShort(short i)
1656 {
1657 RR_DEBUG_INFO_UPDATE_LOC();
1658 return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(jit->context), i, true));
1659 }
1660
createConstantShort(unsigned short i)1661 Value *Nucleus::createConstantShort(unsigned short i)
1662 {
1663 RR_DEBUG_INFO_UPDATE_LOC();
1664 return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(jit->context), i, false));
1665 }
1666
createConstantFloat(float x)1667 Value *Nucleus::createConstantFloat(float x)
1668 {
1669 RR_DEBUG_INFO_UPDATE_LOC();
1670 return V(llvm::ConstantFP::get(T(Float::getType()), x));
1671 }
1672
createNullPointer(Type * Ty)1673 Value *Nucleus::createNullPointer(Type *Ty)
1674 {
1675 RR_DEBUG_INFO_UPDATE_LOC();
1676 return V(llvm::ConstantPointerNull::get(llvm::PointerType::get(T(Ty), 0)));
1677 }
1678
createConstantVector(const int64_t * constants,Type * type)1679 Value *Nucleus::createConstantVector(const int64_t *constants, Type *type)
1680 {
1681 RR_DEBUG_INFO_UPDATE_LOC();
1682 ASSERT(llvm::isa<llvm::VectorType>(T(type)));
1683 const int numConstants = elementCount(type); // Number of provided constants for the (emulated) type.
1684 const int numElements = llvm::cast<llvm::VectorType>(T(type))->getNumElements(); // Number of elements of the underlying vector type.
1685 ASSERT(numElements <= 16 && numConstants <= numElements);
1686 llvm::Constant *constantVector[16];
1687
1688 for(int i = 0; i < numElements; i++)
1689 {
1690 constantVector[i] = llvm::ConstantInt::get(T(type)->getContainedType(0), constants[i % numConstants]);
1691 }
1692
1693 return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(constantVector, numElements)));
1694 }
1695
createConstantVector(const double * constants,Type * type)1696 Value *Nucleus::createConstantVector(const double *constants, Type *type)
1697 {
1698 RR_DEBUG_INFO_UPDATE_LOC();
1699 ASSERT(llvm::isa<llvm::VectorType>(T(type)));
1700 const int numConstants = elementCount(type); // Number of provided constants for the (emulated) type.
1701 const int numElements = llvm::cast<llvm::VectorType>(T(type))->getNumElements(); // Number of elements of the underlying vector type.
1702 ASSERT(numElements <= 8 && numConstants <= numElements);
1703 llvm::Constant *constantVector[8];
1704
1705 for(int i = 0; i < numElements; i++)
1706 {
1707 constantVector[i] = llvm::ConstantFP::get(T(type)->getContainedType(0), constants[i % numConstants]);
1708 }
1709
1710 return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(constantVector, numElements)));
1711 }
1712
createConstantString(const char * v)1713 Value *Nucleus::createConstantString(const char *v)
1714 {
1715 // NOTE: Do not call RR_DEBUG_INFO_UPDATE_LOC() here to avoid recursion when called from rr::Printv
1716 auto ptr = jit->builder->CreateGlobalStringPtr(v);
1717 return V(ptr);
1718 }
1719
getType()1720 Type *Void::getType()
1721 {
1722 return T(llvm::Type::getVoidTy(jit->context));
1723 }
1724
getType()1725 Type *Bool::getType()
1726 {
1727 return T(llvm::Type::getInt1Ty(jit->context));
1728 }
1729
getType()1730 Type *Byte::getType()
1731 {
1732 return T(llvm::Type::getInt8Ty(jit->context));
1733 }
1734
getType()1735 Type *SByte::getType()
1736 {
1737 return T(llvm::Type::getInt8Ty(jit->context));
1738 }
1739
getType()1740 Type *Short::getType()
1741 {
1742 return T(llvm::Type::getInt16Ty(jit->context));
1743 }
1744
getType()1745 Type *UShort::getType()
1746 {
1747 return T(llvm::Type::getInt16Ty(jit->context));
1748 }
1749
getType()1750 Type *Byte4::getType()
1751 {
1752 return T(Type_v4i8);
1753 }
1754
getType()1755 Type *SByte4::getType()
1756 {
1757 return T(Type_v4i8);
1758 }
1759
AddSat(RValue<Byte8> x,RValue<Byte8> y)1760 RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
1761 {
1762 RR_DEBUG_INFO_UPDATE_LOC();
1763 #if defined(__i386__) || defined(__x86_64__)
1764 return x86::paddusb(x, y);
1765 #else
1766 return As<Byte8>(V(lowerPUADDSAT(V(x.value), V(y.value))));
1767 #endif
1768 }
1769
SubSat(RValue<Byte8> x,RValue<Byte8> y)1770 RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
1771 {
1772 RR_DEBUG_INFO_UPDATE_LOC();
1773 #if defined(__i386__) || defined(__x86_64__)
1774 return x86::psubusb(x, y);
1775 #else
1776 return As<Byte8>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
1777 #endif
1778 }
1779
SignMask(RValue<Byte8> x)1780 RValue<Int> SignMask(RValue<Byte8> x)
1781 {
1782 RR_DEBUG_INFO_UPDATE_LOC();
1783 #if defined(__i386__) || defined(__x86_64__)
1784 return x86::pmovmskb(x);
1785 #else
1786 return As<Int>(V(lowerSignMask(V(x.value), T(Int::getType()))));
1787 #endif
1788 }
1789
1790 // RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y)
1791 // {
1792 //#if defined(__i386__) || defined(__x86_64__)
1793 // return x86::pcmpgtb(x, y); // FIXME: Signedness
1794 //#else
1795 // return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Byte8::getType()))));
1796 //#endif
1797 // }
1798
CmpEQ(RValue<Byte8> x,RValue<Byte8> y)1799 RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
1800 {
1801 RR_DEBUG_INFO_UPDATE_LOC();
1802 #if defined(__i386__) || defined(__x86_64__)
1803 return x86::pcmpeqb(x, y);
1804 #else
1805 return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Byte8::getType()))));
1806 #endif
1807 }
1808
getType()1809 Type *Byte8::getType()
1810 {
1811 return T(Type_v8i8);
1812 }
1813
AddSat(RValue<SByte8> x,RValue<SByte8> y)1814 RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
1815 {
1816 RR_DEBUG_INFO_UPDATE_LOC();
1817 #if defined(__i386__) || defined(__x86_64__)
1818 return x86::paddsb(x, y);
1819 #else
1820 return As<SByte8>(V(lowerPSADDSAT(V(x.value), V(y.value))));
1821 #endif
1822 }
1823
SubSat(RValue<SByte8> x,RValue<SByte8> y)1824 RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
1825 {
1826 RR_DEBUG_INFO_UPDATE_LOC();
1827 #if defined(__i386__) || defined(__x86_64__)
1828 return x86::psubsb(x, y);
1829 #else
1830 return As<SByte8>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
1831 #endif
1832 }
1833
SignMask(RValue<SByte8> x)1834 RValue<Int> SignMask(RValue<SByte8> x)
1835 {
1836 RR_DEBUG_INFO_UPDATE_LOC();
1837 #if defined(__i386__) || defined(__x86_64__)
1838 return x86::pmovmskb(As<Byte8>(x));
1839 #else
1840 return As<Int>(V(lowerSignMask(V(x.value), T(Int::getType()))));
1841 #endif
1842 }
1843
CmpGT(RValue<SByte8> x,RValue<SByte8> y)1844 RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
1845 {
1846 RR_DEBUG_INFO_UPDATE_LOC();
1847 #if defined(__i386__) || defined(__x86_64__)
1848 return x86::pcmpgtb(x, y);
1849 #else
1850 return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Byte8::getType()))));
1851 #endif
1852 }
1853
CmpEQ(RValue<SByte8> x,RValue<SByte8> y)1854 RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
1855 {
1856 RR_DEBUG_INFO_UPDATE_LOC();
1857 #if defined(__i386__) || defined(__x86_64__)
1858 return x86::pcmpeqb(As<Byte8>(x), As<Byte8>(y));
1859 #else
1860 return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Byte8::getType()))));
1861 #endif
1862 }
1863
getType()1864 Type *SByte8::getType()
1865 {
1866 return T(Type_v8i8);
1867 }
1868
getType()1869 Type *Byte16::getType()
1870 {
1871 return T(llvm::VectorType::get(T(Byte::getType()), 16));
1872 }
1873
getType()1874 Type *SByte16::getType()
1875 {
1876 return T(llvm::VectorType::get(T(SByte::getType()), 16));
1877 }
1878
getType()1879 Type *Short2::getType()
1880 {
1881 return T(Type_v2i16);
1882 }
1883
getType()1884 Type *UShort2::getType()
1885 {
1886 return T(Type_v2i16);
1887 }
1888
Short4(RValue<Int4> cast)1889 Short4::Short4(RValue<Int4> cast)
1890 {
1891 RR_DEBUG_INFO_UPDATE_LOC();
1892 int select[8] = { 0, 2, 4, 6, 0, 2, 4, 6 };
1893 Value *short8 = Nucleus::createBitCast(cast.value, Short8::getType());
1894
1895 Value *packed = Nucleus::createShuffleVector(short8, short8, select);
1896 Value *short4 = As<Short4>(Int2(As<Int4>(packed))).value;
1897
1898 storeValue(short4);
1899 }
1900
1901 // Short4::Short4(RValue<Float> cast)
1902 // {
1903 // }
1904
Short4(RValue<Float4> cast)1905 Short4::Short4(RValue<Float4> cast)
1906 {
1907 RR_DEBUG_INFO_UPDATE_LOC();
1908 Int4 v4i32 = Int4(cast);
1909 #if defined(__i386__) || defined(__x86_64__)
1910 v4i32 = As<Int4>(x86::packssdw(v4i32, v4i32));
1911 #else
1912 Value *v = v4i32.loadValue();
1913 v4i32 = As<Int4>(V(lowerPack(V(v), V(v), true)));
1914 #endif
1915
1916 storeValue(As<Short4>(Int2(v4i32)).value);
1917 }
1918
operator <<(RValue<Short4> lhs,unsigned char rhs)1919 RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
1920 {
1921 RR_DEBUG_INFO_UPDATE_LOC();
1922 #if defined(__i386__) || defined(__x86_64__)
1923 // return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
1924
1925 return x86::psllw(lhs, rhs);
1926 #else
1927 return As<Short4>(V(lowerVectorShl(V(lhs.value), rhs)));
1928 #endif
1929 }
1930
operator >>(RValue<Short4> lhs,unsigned char rhs)1931 RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
1932 {
1933 RR_DEBUG_INFO_UPDATE_LOC();
1934 #if defined(__i386__) || defined(__x86_64__)
1935 return x86::psraw(lhs, rhs);
1936 #else
1937 return As<Short4>(V(lowerVectorAShr(V(lhs.value), rhs)));
1938 #endif
1939 }
1940
Max(RValue<Short4> x,RValue<Short4> y)1941 RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
1942 {
1943 RR_DEBUG_INFO_UPDATE_LOC();
1944 #if defined(__i386__) || defined(__x86_64__)
1945 return x86::pmaxsw(x, y);
1946 #else
1947 return RValue<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SGT)));
1948 #endif
1949 }
1950
Min(RValue<Short4> x,RValue<Short4> y)1951 RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
1952 {
1953 RR_DEBUG_INFO_UPDATE_LOC();
1954 #if defined(__i386__) || defined(__x86_64__)
1955 return x86::pminsw(x, y);
1956 #else
1957 return RValue<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SLT)));
1958 #endif
1959 }
1960
AddSat(RValue<Short4> x,RValue<Short4> y)1961 RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
1962 {
1963 RR_DEBUG_INFO_UPDATE_LOC();
1964 #if defined(__i386__) || defined(__x86_64__)
1965 return x86::paddsw(x, y);
1966 #else
1967 return As<Short4>(V(lowerPSADDSAT(V(x.value), V(y.value))));
1968 #endif
1969 }
1970
SubSat(RValue<Short4> x,RValue<Short4> y)1971 RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
1972 {
1973 RR_DEBUG_INFO_UPDATE_LOC();
1974 #if defined(__i386__) || defined(__x86_64__)
1975 return x86::psubsw(x, y);
1976 #else
1977 return As<Short4>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
1978 #endif
1979 }
1980
MulHigh(RValue<Short4> x,RValue<Short4> y)1981 RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
1982 {
1983 RR_DEBUG_INFO_UPDATE_LOC();
1984 #if defined(__i386__) || defined(__x86_64__)
1985 return x86::pmulhw(x, y);
1986 #else
1987 return As<Short4>(V(lowerMulHigh(V(x.value), V(y.value), true)));
1988 #endif
1989 }
1990
MulAdd(RValue<Short4> x,RValue<Short4> y)1991 RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
1992 {
1993 RR_DEBUG_INFO_UPDATE_LOC();
1994 #if defined(__i386__) || defined(__x86_64__)
1995 return x86::pmaddwd(x, y);
1996 #else
1997 return As<Int2>(V(lowerMulAdd(V(x.value), V(y.value))));
1998 #endif
1999 }
2000
PackSigned(RValue<Short4> x,RValue<Short4> y)2001 RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
2002 {
2003 RR_DEBUG_INFO_UPDATE_LOC();
2004 #if defined(__i386__) || defined(__x86_64__)
2005 auto result = x86::packsswb(x, y);
2006 #else
2007 auto result = V(lowerPack(V(x.value), V(y.value), true));
2008 #endif
2009 return As<SByte8>(Swizzle(As<Int4>(result), 0x0202));
2010 }
2011
PackUnsigned(RValue<Short4> x,RValue<Short4> y)2012 RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
2013 {
2014 RR_DEBUG_INFO_UPDATE_LOC();
2015 #if defined(__i386__) || defined(__x86_64__)
2016 auto result = x86::packuswb(x, y);
2017 #else
2018 auto result = V(lowerPack(V(x.value), V(y.value), false));
2019 #endif
2020 return As<Byte8>(Swizzle(As<Int4>(result), 0x0202));
2021 }
2022
CmpGT(RValue<Short4> x,RValue<Short4> y)2023 RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
2024 {
2025 RR_DEBUG_INFO_UPDATE_LOC();
2026 #if defined(__i386__) || defined(__x86_64__)
2027 return x86::pcmpgtw(x, y);
2028 #else
2029 return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Short4::getType()))));
2030 #endif
2031 }
2032
CmpEQ(RValue<Short4> x,RValue<Short4> y)2033 RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
2034 {
2035 RR_DEBUG_INFO_UPDATE_LOC();
2036 #if defined(__i386__) || defined(__x86_64__)
2037 return x86::pcmpeqw(x, y);
2038 #else
2039 return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Short4::getType()))));
2040 #endif
2041 }
2042
getType()2043 Type *Short4::getType()
2044 {
2045 return T(Type_v4i16);
2046 }
2047
UShort4(RValue<Float4> cast,bool saturate)2048 UShort4::UShort4(RValue<Float4> cast, bool saturate)
2049 {
2050 RR_DEBUG_INFO_UPDATE_LOC();
2051 if(saturate)
2052 {
2053 #if defined(__i386__) || defined(__x86_64__)
2054 if(CPUID::supportsSSE4_1())
2055 {
2056 Int4 int4(Min(cast, Float4(0xFFFF))); // packusdw takes care of 0x0000 saturation
2057 *this = As<Short4>(PackUnsigned(int4, int4));
2058 }
2059 else
2060 #endif
2061 {
2062 *this = Short4(Int4(Max(Min(cast, Float4(0xFFFF)), Float4(0x0000))));
2063 }
2064 }
2065 else
2066 {
2067 *this = Short4(Int4(cast));
2068 }
2069 }
2070
operator <<(RValue<UShort4> lhs,unsigned char rhs)2071 RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
2072 {
2073 RR_DEBUG_INFO_UPDATE_LOC();
2074 #if defined(__i386__) || defined(__x86_64__)
2075 // return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
2076
2077 return As<UShort4>(x86::psllw(As<Short4>(lhs), rhs));
2078 #else
2079 return As<UShort4>(V(lowerVectorShl(V(lhs.value), rhs)));
2080 #endif
2081 }
2082
operator >>(RValue<UShort4> lhs,unsigned char rhs)2083 RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
2084 {
2085 RR_DEBUG_INFO_UPDATE_LOC();
2086 #if defined(__i386__) || defined(__x86_64__)
2087 // return RValue<Short4>(Nucleus::createLShr(lhs.value, rhs.value));
2088
2089 return x86::psrlw(lhs, rhs);
2090 #else
2091 return As<UShort4>(V(lowerVectorLShr(V(lhs.value), rhs)));
2092 #endif
2093 }
2094
// Per-element maximum of two UShort4 vectors, implemented on top of the
// Short4 Max(): both operands are reinterpreted as Short4 and have 0x8000
// subtracted from every element before the Short4 comparison, and 0x8000 is
// added back to every element of the result.
RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<UShort4>(Max(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
}
2100
// Per-element minimum of two UShort4 vectors, implemented on top of the
// Short4 Min(): both operands are reinterpreted as Short4 and have 0x8000
// subtracted from every element before the Short4 comparison, and 0x8000 is
// added back to every element of the result.
RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<UShort4>(Min(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
}
2106
AddSat(RValue<UShort4> x,RValue<UShort4> y)2107 RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
2108 {
2109 RR_DEBUG_INFO_UPDATE_LOC();
2110 #if defined(__i386__) || defined(__x86_64__)
2111 return x86::paddusw(x, y);
2112 #else
2113 return As<UShort4>(V(lowerPUADDSAT(V(x.value), V(y.value))));
2114 #endif
2115 }
2116
SubSat(RValue<UShort4> x,RValue<UShort4> y)2117 RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
2118 {
2119 RR_DEBUG_INFO_UPDATE_LOC();
2120 #if defined(__i386__) || defined(__x86_64__)
2121 return x86::psubusw(x, y);
2122 #else
2123 return As<UShort4>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
2124 #endif
2125 }
2126
MulHigh(RValue<UShort4> x,RValue<UShort4> y)2127 RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
2128 {
2129 RR_DEBUG_INFO_UPDATE_LOC();
2130 #if defined(__i386__) || defined(__x86_64__)
2131 return x86::pmulhuw(x, y);
2132 #else
2133 return As<UShort4>(V(lowerMulHigh(V(x.value), V(y.value), false)));
2134 #endif
2135 }
2136
Average(RValue<UShort4> x,RValue<UShort4> y)2137 RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
2138 {
2139 RR_DEBUG_INFO_UPDATE_LOC();
2140 #if defined(__i386__) || defined(__x86_64__)
2141 return x86::pavgw(x, y);
2142 #else
2143 return As<UShort4>(V(lowerPAVG(V(x.value), V(y.value))));
2144 #endif
2145 }
2146
getType()2147 Type *UShort4::getType()
2148 {
2149 return T(Type_v4i16);
2150 }
2151
operator <<(RValue<Short8> lhs,unsigned char rhs)2152 RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
2153 {
2154 RR_DEBUG_INFO_UPDATE_LOC();
2155 #if defined(__i386__) || defined(__x86_64__)
2156 return x86::psllw(lhs, rhs);
2157 #else
2158 return As<Short8>(V(lowerVectorShl(V(lhs.value), rhs)));
2159 #endif
2160 }
2161
operator >>(RValue<Short8> lhs,unsigned char rhs)2162 RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
2163 {
2164 RR_DEBUG_INFO_UPDATE_LOC();
2165 #if defined(__i386__) || defined(__x86_64__)
2166 return x86::psraw(lhs, rhs);
2167 #else
2168 return As<Short8>(V(lowerVectorAShr(V(lhs.value), rhs)));
2169 #endif
2170 }
2171
MulAdd(RValue<Short8> x,RValue<Short8> y)2172 RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
2173 {
2174 RR_DEBUG_INFO_UPDATE_LOC();
2175 #if defined(__i386__) || defined(__x86_64__)
2176 return x86::pmaddwd(x, y);
2177 #else
2178 return As<Int4>(V(lowerMulAdd(V(x.value), V(y.value))));
2179 #endif
2180 }
2181
MulHigh(RValue<Short8> x,RValue<Short8> y)2182 RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
2183 {
2184 RR_DEBUG_INFO_UPDATE_LOC();
2185 #if defined(__i386__) || defined(__x86_64__)
2186 return x86::pmulhw(x, y);
2187 #else
2188 return As<Short8>(V(lowerMulHigh(V(x.value), V(y.value), true)));
2189 #endif
2190 }
2191
getType()2192 Type *Short8::getType()
2193 {
2194 return T(llvm::VectorType::get(T(Short::getType()), 8));
2195 }
2196
operator <<(RValue<UShort8> lhs,unsigned char rhs)2197 RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
2198 {
2199 RR_DEBUG_INFO_UPDATE_LOC();
2200 #if defined(__i386__) || defined(__x86_64__)
2201 return As<UShort8>(x86::psllw(As<Short8>(lhs), rhs));
2202 #else
2203 return As<UShort8>(V(lowerVectorShl(V(lhs.value), rhs)));
2204 #endif
2205 }
2206
operator >>(RValue<UShort8> lhs,unsigned char rhs)2207 RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
2208 {
2209 RR_DEBUG_INFO_UPDATE_LOC();
2210 #if defined(__i386__) || defined(__x86_64__)
2211 return x86::psrlw(lhs, rhs); // FIXME: Fallback required
2212 #else
2213 return As<UShort8>(V(lowerVectorLShr(V(lhs.value), rhs)));
2214 #endif
2215 }
2216
MulHigh(RValue<UShort8> x,RValue<UShort8> y)2217 RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
2218 {
2219 RR_DEBUG_INFO_UPDATE_LOC();
2220 #if defined(__i386__) || defined(__x86_64__)
2221 return x86::pmulhuw(x, y);
2222 #else
2223 return As<UShort8>(V(lowerMulHigh(V(x.value), V(y.value), false)));
2224 #endif
2225 }
2226
getType()2227 Type *UShort8::getType()
2228 {
2229 return T(llvm::VectorType::get(T(UShort::getType()), 8));
2230 }
2231
operator ++(Int & val,int)2232 RValue<Int> operator++(Int &val, int) // Post-increment
2233 {
2234 RR_DEBUG_INFO_UPDATE_LOC();
2235 RValue<Int> res = val;
2236
2237 Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantInt(1));
2238 val.storeValue(inc);
2239
2240 return res;
2241 }
2242
operator ++(Int & val)2243 const Int &operator++(Int &val) // Pre-increment
2244 {
2245 RR_DEBUG_INFO_UPDATE_LOC();
2246 Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
2247 val.storeValue(inc);
2248
2249 return val;
2250 }
2251
operator --(Int & val,int)2252 RValue<Int> operator--(Int &val, int) // Post-decrement
2253 {
2254 RR_DEBUG_INFO_UPDATE_LOC();
2255 RValue<Int> res = val;
2256
2257 Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantInt(1));
2258 val.storeValue(inc);
2259
2260 return res;
2261 }
2262
operator --(Int & val)2263 const Int &operator--(Int &val) // Pre-decrement
2264 {
2265 RR_DEBUG_INFO_UPDATE_LOC();
2266 Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
2267 val.storeValue(inc);
2268
2269 return val;
2270 }
2271
RoundInt(RValue<Float> cast)2272 RValue<Int> RoundInt(RValue<Float> cast)
2273 {
2274 RR_DEBUG_INFO_UPDATE_LOC();
2275 #if defined(__i386__) || defined(__x86_64__)
2276 return x86::cvtss2si(cast);
2277 #else
2278 return RValue<Int>(V(lowerRoundInt(V(cast.value), T(Int::getType()))));
2279 #endif
2280 }
2281
getType()2282 Type *Int::getType()
2283 {
2284 return T(llvm::Type::getInt32Ty(jit->context));
2285 }
2286
getType()2287 Type *Long::getType()
2288 {
2289 return T(llvm::Type::getInt64Ty(jit->context));
2290 }
2291
UInt(RValue<Float> cast)2292 UInt::UInt(RValue<Float> cast)
2293 {
2294 RR_DEBUG_INFO_UPDATE_LOC();
2295 Value *integer = Nucleus::createFPToUI(cast.value, UInt::getType());
2296 storeValue(integer);
2297 }
2298
operator ++(UInt & val,int)2299 RValue<UInt> operator++(UInt &val, int) // Post-increment
2300 {
2301 RR_DEBUG_INFO_UPDATE_LOC();
2302 RValue<UInt> res = val;
2303
2304 Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantInt(1));
2305 val.storeValue(inc);
2306
2307 return res;
2308 }
2309
operator ++(UInt & val)2310 const UInt &operator++(UInt &val) // Pre-increment
2311 {
2312 RR_DEBUG_INFO_UPDATE_LOC();
2313 Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
2314 val.storeValue(inc);
2315
2316 return val;
2317 }
2318
operator --(UInt & val,int)2319 RValue<UInt> operator--(UInt &val, int) // Post-decrement
2320 {
2321 RR_DEBUG_INFO_UPDATE_LOC();
2322 RValue<UInt> res = val;
2323
2324 Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantInt(1));
2325 val.storeValue(inc);
2326
2327 return res;
2328 }
2329
operator --(UInt & val)2330 const UInt &operator--(UInt &val) // Pre-decrement
2331 {
2332 RR_DEBUG_INFO_UPDATE_LOC();
2333 Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
2334 val.storeValue(inc);
2335
2336 return val;
2337 }
2338
2339 // RValue<UInt> RoundUInt(RValue<Float> cast)
2340 // {
2341 //#if defined(__i386__) || defined(__x86_64__)
2342 // return x86::cvtss2si(val); // FIXME: Unsigned
2343 //#else
2344 // return IfThenElse(cast > 0.0f, Int(cast + 0.5f), Int(cast - 0.5f));
2345 //#endif
2346 // }
2347
getType()2348 Type *UInt::getType()
2349 {
2350 return T(llvm::Type::getInt32Ty(jit->context));
2351 }
2352
2353 // Int2::Int2(RValue<Int> cast)
2354 // {
2355 // Value *extend = Nucleus::createZExt(cast.value, Long::getType());
2356 // Value *vector = Nucleus::createBitCast(extend, Int2::getType());
2357 //
2358 // int shuffle[2] = {0, 0};
2359 // Value *replicate = Nucleus::createShuffleVector(vector, vector, shuffle);
2360 //
2361 // storeValue(replicate);
2362 // }
2363
operator <<(RValue<Int2> lhs,unsigned char rhs)2364 RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
2365 {
2366 RR_DEBUG_INFO_UPDATE_LOC();
2367 #if defined(__i386__) || defined(__x86_64__)
2368 // return RValue<Int2>(Nucleus::createShl(lhs.value, rhs.value));
2369
2370 return x86::pslld(lhs, rhs);
2371 #else
2372 return As<Int2>(V(lowerVectorShl(V(lhs.value), rhs)));
2373 #endif
2374 }
2375
operator >>(RValue<Int2> lhs,unsigned char rhs)2376 RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
2377 {
2378 RR_DEBUG_INFO_UPDATE_LOC();
2379 #if defined(__i386__) || defined(__x86_64__)
2380 // return RValue<Int2>(Nucleus::createAShr(lhs.value, rhs.value));
2381
2382 return x86::psrad(lhs, rhs);
2383 #else
2384 return As<Int2>(V(lowerVectorAShr(V(lhs.value), rhs)));
2385 #endif
2386 }
2387
getType()2388 Type *Int2::getType()
2389 {
2390 return T(Type_v2i32);
2391 }
2392
operator <<(RValue<UInt2> lhs,unsigned char rhs)2393 RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
2394 {
2395 RR_DEBUG_INFO_UPDATE_LOC();
2396 #if defined(__i386__) || defined(__x86_64__)
2397 // return RValue<UInt2>(Nucleus::createShl(lhs.value, rhs.value));
2398
2399 return As<UInt2>(x86::pslld(As<Int2>(lhs), rhs));
2400 #else
2401 return As<UInt2>(V(lowerVectorShl(V(lhs.value), rhs)));
2402 #endif
2403 }
2404
operator >>(RValue<UInt2> lhs,unsigned char rhs)2405 RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
2406 {
2407 RR_DEBUG_INFO_UPDATE_LOC();
2408 #if defined(__i386__) || defined(__x86_64__)
2409 // return RValue<UInt2>(Nucleus::createLShr(lhs.value, rhs.value));
2410
2411 return x86::psrld(lhs, rhs);
2412 #else
2413 return As<UInt2>(V(lowerVectorLShr(V(lhs.value), rhs)));
2414 #endif
2415 }
2416
getType()2417 Type *UInt2::getType()
2418 {
2419 return T(Type_v2i32);
2420 }
2421
Int4(RValue<Byte4> cast)2422 Int4::Int4(RValue<Byte4> cast)
2423 : XYZW(this)
2424 {
2425 RR_DEBUG_INFO_UPDATE_LOC();
2426 #if defined(__i386__) || defined(__x86_64__)
2427 if(CPUID::supportsSSE4_1())
2428 {
2429 *this = x86::pmovzxbd(As<Byte16>(cast));
2430 }
2431 else
2432 #endif
2433 {
2434 int swizzle[16] = { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 };
2435 Value *a = Nucleus::createBitCast(cast.value, Byte16::getType());
2436 Value *b = Nucleus::createShuffleVector(a, Nucleus::createNullValue(Byte16::getType()), swizzle);
2437
2438 int swizzle2[8] = { 0, 8, 1, 9, 2, 10, 3, 11 };
2439 Value *c = Nucleus::createBitCast(b, Short8::getType());
2440 Value *d = Nucleus::createShuffleVector(c, Nucleus::createNullValue(Short8::getType()), swizzle2);
2441
2442 *this = As<Int4>(d);
2443 }
2444 }
2445
Int4(RValue<SByte4> cast)2446 Int4::Int4(RValue<SByte4> cast)
2447 : XYZW(this)
2448 {
2449 RR_DEBUG_INFO_UPDATE_LOC();
2450 #if defined(__i386__) || defined(__x86_64__)
2451 if(CPUID::supportsSSE4_1())
2452 {
2453 *this = x86::pmovsxbd(As<SByte16>(cast));
2454 }
2455 else
2456 #endif
2457 {
2458 int swizzle[16] = { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 };
2459 Value *a = Nucleus::createBitCast(cast.value, Byte16::getType());
2460 Value *b = Nucleus::createShuffleVector(a, a, swizzle);
2461
2462 int swizzle2[8] = { 0, 0, 1, 1, 2, 2, 3, 3 };
2463 Value *c = Nucleus::createBitCast(b, Short8::getType());
2464 Value *d = Nucleus::createShuffleVector(c, c, swizzle2);
2465
2466 *this = As<Int4>(d) >> 24;
2467 }
2468 }
2469
Int4(RValue<Short4> cast)2470 Int4::Int4(RValue<Short4> cast)
2471 : XYZW(this)
2472 {
2473 RR_DEBUG_INFO_UPDATE_LOC();
2474 #if defined(__i386__) || defined(__x86_64__)
2475 if(CPUID::supportsSSE4_1())
2476 {
2477 *this = x86::pmovsxwd(As<Short8>(cast));
2478 }
2479 else
2480 #endif
2481 {
2482 int swizzle[8] = { 0, 0, 1, 1, 2, 2, 3, 3 };
2483 Value *c = Nucleus::createShuffleVector(cast.value, cast.value, swizzle);
2484 *this = As<Int4>(c) >> 16;
2485 }
2486 }
2487
Int4(RValue<UShort4> cast)2488 Int4::Int4(RValue<UShort4> cast)
2489 : XYZW(this)
2490 {
2491 RR_DEBUG_INFO_UPDATE_LOC();
2492 #if defined(__i386__) || defined(__x86_64__)
2493 if(CPUID::supportsSSE4_1())
2494 {
2495 *this = x86::pmovzxwd(As<UShort8>(cast));
2496 }
2497 else
2498 #endif
2499 {
2500 int swizzle[8] = { 0, 8, 1, 9, 2, 10, 3, 11 };
2501 Value *c = Nucleus::createShuffleVector(cast.value, Short8(0, 0, 0, 0, 0, 0, 0, 0).loadValue(), swizzle);
2502 *this = As<Int4>(c);
2503 }
2504 }
2505
Int4(RValue<Int> rhs)2506 Int4::Int4(RValue<Int> rhs)
2507 : XYZW(this)
2508 {
2509 RR_DEBUG_INFO_UPDATE_LOC();
2510 Value *vector = loadValue();
2511 Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
2512
2513 int swizzle[4] = { 0, 0, 0, 0 };
2514 Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
2515
2516 storeValue(replicate);
2517 }
2518
operator <<(RValue<Int4> lhs,unsigned char rhs)2519 RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
2520 {
2521 RR_DEBUG_INFO_UPDATE_LOC();
2522 #if defined(__i386__) || defined(__x86_64__)
2523 return x86::pslld(lhs, rhs);
2524 #else
2525 return As<Int4>(V(lowerVectorShl(V(lhs.value), rhs)));
2526 #endif
2527 }
2528
operator >>(RValue<Int4> lhs,unsigned char rhs)2529 RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
2530 {
2531 RR_DEBUG_INFO_UPDATE_LOC();
2532 #if defined(__i386__) || defined(__x86_64__)
2533 return x86::psrad(lhs, rhs);
2534 #else
2535 return As<Int4>(V(lowerVectorAShr(V(lhs.value), rhs)));
2536 #endif
2537 }
2538
CmpEQ(RValue<Int4> x,RValue<Int4> y)2539 RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y)
2540 {
2541 RR_DEBUG_INFO_UPDATE_LOC();
2542 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value, y.value), Int4::getType()));
2543 }
2544
CmpLT(RValue<Int4> x,RValue<Int4> y)2545 RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y)
2546 {
2547 RR_DEBUG_INFO_UPDATE_LOC();
2548 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value, y.value), Int4::getType()));
2549 }
2550
CmpLE(RValue<Int4> x,RValue<Int4> y)2551 RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y)
2552 {
2553 RR_DEBUG_INFO_UPDATE_LOC();
2554 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLE(x.value, y.value), Int4::getType()));
2555 }
2556
CmpNEQ(RValue<Int4> x,RValue<Int4> y)2557 RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y)
2558 {
2559 RR_DEBUG_INFO_UPDATE_LOC();
2560 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType()));
2561 }
2562
CmpNLT(RValue<Int4> x,RValue<Int4> y)2563 RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y)
2564 {
2565 RR_DEBUG_INFO_UPDATE_LOC();
2566 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGE(x.value, y.value), Int4::getType()));
2567 }
2568
CmpNLE(RValue<Int4> x,RValue<Int4> y)2569 RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y)
2570 {
2571 RR_DEBUG_INFO_UPDATE_LOC();
2572 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value, y.value), Int4::getType()));
2573 }
2574
Max(RValue<Int4> x,RValue<Int4> y)2575 RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)
2576 {
2577 RR_DEBUG_INFO_UPDATE_LOC();
2578 #if defined(__i386__) || defined(__x86_64__)
2579 if(CPUID::supportsSSE4_1())
2580 {
2581 return x86::pmaxsd(x, y);
2582 }
2583 else
2584 #endif
2585 {
2586 RValue<Int4> greater = CmpNLE(x, y);
2587 return (x & greater) | (y & ~greater);
2588 }
2589 }
2590
Min(RValue<Int4> x,RValue<Int4> y)2591 RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)
2592 {
2593 RR_DEBUG_INFO_UPDATE_LOC();
2594 #if defined(__i386__) || defined(__x86_64__)
2595 if(CPUID::supportsSSE4_1())
2596 {
2597 return x86::pminsd(x, y);
2598 }
2599 else
2600 #endif
2601 {
2602 RValue<Int4> less = CmpLT(x, y);
2603 return (x & less) | (y & ~less);
2604 }
2605 }
2606
RoundInt(RValue<Float4> cast)2607 RValue<Int4> RoundInt(RValue<Float4> cast)
2608 {
2609 RR_DEBUG_INFO_UPDATE_LOC();
2610 #if defined(__i386__) || defined(__x86_64__)
2611 return x86::cvtps2dq(cast);
2612 #else
2613 return As<Int4>(V(lowerRoundInt(V(cast.value), T(Int4::getType()))));
2614 #endif
2615 }
2616
MulHigh(RValue<Int4> x,RValue<Int4> y)2617 RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y)
2618 {
2619 RR_DEBUG_INFO_UPDATE_LOC();
2620 // TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
2621 return As<Int4>(V(lowerMulHigh(V(x.value), V(y.value), true)));
2622 }
2623
MulHigh(RValue<UInt4> x,RValue<UInt4> y)2624 RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y)
2625 {
2626 RR_DEBUG_INFO_UPDATE_LOC();
2627 // TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
2628 return As<UInt4>(V(lowerMulHigh(V(x.value), V(y.value), false)));
2629 }
2630
PackSigned(RValue<Int4> x,RValue<Int4> y)2631 RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
2632 {
2633 RR_DEBUG_INFO_UPDATE_LOC();
2634 #if defined(__i386__) || defined(__x86_64__)
2635 return x86::packssdw(x, y);
2636 #else
2637 return As<Short8>(V(lowerPack(V(x.value), V(y.value), true)));
2638 #endif
2639 }
2640
PackUnsigned(RValue<Int4> x,RValue<Int4> y)2641 RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)
2642 {
2643 RR_DEBUG_INFO_UPDATE_LOC();
2644 #if defined(__i386__) || defined(__x86_64__)
2645 return x86::packusdw(x, y);
2646 #else
2647 return As<UShort8>(V(lowerPack(V(x.value), V(y.value), false)));
2648 #endif
2649 }
2650
SignMask(RValue<Int4> x)2651 RValue<Int> SignMask(RValue<Int4> x)
2652 {
2653 RR_DEBUG_INFO_UPDATE_LOC();
2654 #if defined(__i386__) || defined(__x86_64__)
2655 return x86::movmskps(As<Float4>(x));
2656 #else
2657 return As<Int>(V(lowerSignMask(V(x.value), T(Int::getType()))));
2658 #endif
2659 }
2660
getType()2661 Type *Int4::getType()
2662 {
2663 return T(llvm::VectorType::get(T(Int::getType()), 4));
2664 }
2665
UInt4(RValue<Float4> cast)2666 UInt4::UInt4(RValue<Float4> cast)
2667 : XYZW(this)
2668 {
2669 RR_DEBUG_INFO_UPDATE_LOC();
2670 Value *xyzw = Nucleus::createFPToUI(cast.value, UInt4::getType());
2671 storeValue(xyzw);
2672 }
2673
UInt4(RValue<UInt> rhs)2674 UInt4::UInt4(RValue<UInt> rhs)
2675 : XYZW(this)
2676 {
2677 RR_DEBUG_INFO_UPDATE_LOC();
2678 Value *vector = loadValue();
2679 Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
2680
2681 int swizzle[4] = { 0, 0, 0, 0 };
2682 Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
2683
2684 storeValue(replicate);
2685 }
2686
operator <<(RValue<UInt4> lhs,unsigned char rhs)2687 RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
2688 {
2689 RR_DEBUG_INFO_UPDATE_LOC();
2690 #if defined(__i386__) || defined(__x86_64__)
2691 return As<UInt4>(x86::pslld(As<Int4>(lhs), rhs));
2692 #else
2693 return As<UInt4>(V(lowerVectorShl(V(lhs.value), rhs)));
2694 #endif
2695 }
2696
operator >>(RValue<UInt4> lhs,unsigned char rhs)2697 RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
2698 {
2699 RR_DEBUG_INFO_UPDATE_LOC();
2700 #if defined(__i386__) || defined(__x86_64__)
2701 return x86::psrld(lhs, rhs);
2702 #else
2703 return As<UInt4>(V(lowerVectorLShr(V(lhs.value), rhs)));
2704 #endif
2705 }
2706
CmpEQ(RValue<UInt4> x,RValue<UInt4> y)2707 RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y)
2708 {
2709 RR_DEBUG_INFO_UPDATE_LOC();
2710 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value, y.value), Int4::getType()));
2711 }
2712
CmpLT(RValue<UInt4> x,RValue<UInt4> y)2713 RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y)
2714 {
2715 RR_DEBUG_INFO_UPDATE_LOC();
2716 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULT(x.value, y.value), Int4::getType()));
2717 }
2718
CmpLE(RValue<UInt4> x,RValue<UInt4> y)2719 RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y)
2720 {
2721 RR_DEBUG_INFO_UPDATE_LOC();
2722 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULE(x.value, y.value), Int4::getType()));
2723 }
2724
CmpNEQ(RValue<UInt4> x,RValue<UInt4> y)2725 RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y)
2726 {
2727 RR_DEBUG_INFO_UPDATE_LOC();
2728 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType()));
2729 }
2730
CmpNLT(RValue<UInt4> x,RValue<UInt4> y)2731 RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y)
2732 {
2733 RR_DEBUG_INFO_UPDATE_LOC();
2734 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGE(x.value, y.value), Int4::getType()));
2735 }
2736
CmpNLE(RValue<UInt4> x,RValue<UInt4> y)2737 RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y)
2738 {
2739 RR_DEBUG_INFO_UPDATE_LOC();
2740 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value, y.value), Int4::getType()));
2741 }
2742
Max(RValue<UInt4> x,RValue<UInt4> y)2743 RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
2744 {
2745 RR_DEBUG_INFO_UPDATE_LOC();
2746 #if defined(__i386__) || defined(__x86_64__)
2747 if(CPUID::supportsSSE4_1())
2748 {
2749 return x86::pmaxud(x, y);
2750 }
2751 else
2752 #endif
2753 {
2754 RValue<UInt4> greater = CmpNLE(x, y);
2755 return (x & greater) | (y & ~greater);
2756 }
2757 }
2758
Min(RValue<UInt4> x,RValue<UInt4> y)2759 RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
2760 {
2761 RR_DEBUG_INFO_UPDATE_LOC();
2762 #if defined(__i386__) || defined(__x86_64__)
2763 if(CPUID::supportsSSE4_1())
2764 {
2765 return x86::pminud(x, y);
2766 }
2767 else
2768 #endif
2769 {
2770 RValue<UInt4> less = CmpLT(x, y);
2771 return (x & less) | (y & ~less);
2772 }
2773 }
2774
getType()2775 Type *UInt4::getType()
2776 {
2777 return T(llvm::VectorType::get(T(UInt::getType()), 4));
2778 }
2779
getType()2780 Type *Half::getType()
2781 {
2782 return T(llvm::Type::getInt16Ty(jit->context));
2783 }
2784
Rcp_pp(RValue<Float> x,bool exactAtPow2)2785 RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)
2786 {
2787 RR_DEBUG_INFO_UPDATE_LOC();
2788 #if defined(__i386__) || defined(__x86_64__)
2789 if(exactAtPow2)
2790 {
2791 // rcpss uses a piecewise-linear approximation which minimizes the relative error
2792 // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
2793 return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2794 }
2795 return x86::rcpss(x);
2796 #else
2797 return As<Float>(V(lowerRCP(V(x.value))));
2798 #endif
2799 }
2800
RcpSqrt_pp(RValue<Float> x)2801 RValue<Float> RcpSqrt_pp(RValue<Float> x)
2802 {
2803 RR_DEBUG_INFO_UPDATE_LOC();
2804 #if defined(__i386__) || defined(__x86_64__)
2805 return x86::rsqrtss(x);
2806 #else
2807 return As<Float>(V(lowerRSQRT(V(x.value))));
2808 #endif
2809 }
2810
Sqrt(RValue<Float> x)2811 RValue<Float> Sqrt(RValue<Float> x)
2812 {
2813 RR_DEBUG_INFO_UPDATE_LOC();
2814 #if defined(__i386__) || defined(__x86_64__)
2815 return x86::sqrtss(x);
2816 #else
2817 return As<Float>(V(lowerSQRT(V(x.value))));
2818 #endif
2819 }
2820
Round(RValue<Float> x)2821 RValue<Float> Round(RValue<Float> x)
2822 {
2823 RR_DEBUG_INFO_UPDATE_LOC();
2824 #if defined(__i386__) || defined(__x86_64__)
2825 if(CPUID::supportsSSE4_1())
2826 {
2827 return x86::roundss(x, 0);
2828 }
2829 else
2830 {
2831 return Float4(Round(Float4(x))).x;
2832 }
2833 #else
2834 return RValue<Float>(V(lowerRound(V(x.value))));
2835 #endif
2836 }
2837
Trunc(RValue<Float> x)2838 RValue<Float> Trunc(RValue<Float> x)
2839 {
2840 RR_DEBUG_INFO_UPDATE_LOC();
2841 #if defined(__i386__) || defined(__x86_64__)
2842 if(CPUID::supportsSSE4_1())
2843 {
2844 return x86::roundss(x, 3);
2845 }
2846 else
2847 {
2848 return Float(Int(x)); // Rounded toward zero
2849 }
2850 #else
2851 return RValue<Float>(V(lowerTrunc(V(x.value))));
2852 #endif
2853 }
2854
Frac(RValue<Float> x)2855 RValue<Float> Frac(RValue<Float> x)
2856 {
2857 RR_DEBUG_INFO_UPDATE_LOC();
2858 #if defined(__i386__) || defined(__x86_64__)
2859 if(CPUID::supportsSSE4_1())
2860 {
2861 return x - x86::floorss(x);
2862 }
2863 else
2864 {
2865 return Float4(Frac(Float4(x))).x;
2866 }
2867 #else
2868 // x - floor(x) can be 1.0 for very small negative x.
2869 // Clamp against the value just below 1.0.
2870 return Min(x - Floor(x), As<Float>(Int(0x3F7FFFFF)));
2871 #endif
2872 }
2873
Floor(RValue<Float> x)2874 RValue<Float> Floor(RValue<Float> x)
2875 {
2876 RR_DEBUG_INFO_UPDATE_LOC();
2877 #if defined(__i386__) || defined(__x86_64__)
2878 if(CPUID::supportsSSE4_1())
2879 {
2880 return x86::floorss(x);
2881 }
2882 else
2883 {
2884 return Float4(Floor(Float4(x))).x;
2885 }
2886 #else
2887 return RValue<Float>(V(lowerFloor(V(x.value))));
2888 #endif
2889 }
2890
Ceil(RValue<Float> x)2891 RValue<Float> Ceil(RValue<Float> x)
2892 {
2893 RR_DEBUG_INFO_UPDATE_LOC();
2894 #if defined(__i386__) || defined(__x86_64__)
2895 if(CPUID::supportsSSE4_1())
2896 {
2897 return x86::ceilss(x);
2898 }
2899 else
2900 #endif
2901 {
2902 return Float4(Ceil(Float4(x))).x;
2903 }
2904 }
2905
getType()2906 Type *Float::getType()
2907 {
2908 return T(llvm::Type::getFloatTy(jit->context));
2909 }
2910
getType()2911 Type *Float2::getType()
2912 {
2913 return T(Type_v2f32);
2914 }
2915
Exp2(RValue<Float> v)2916 RValue<Float> Exp2(RValue<Float> v)
2917 {
2918 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp2, { T(Float::getType()) });
2919 return RValue<Float>(V(jit->builder->CreateCall(func, V(v.value))));
2920 }
2921
Log2(RValue<Float> v)2922 RValue<Float> Log2(RValue<Float> v)
2923 {
2924 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log2, { T(Float::getType()) });
2925 return RValue<Float>(V(jit->builder->CreateCall(func, V(v.value))));
2926 }
2927
Float4(RValue<Float> rhs)2928 Float4::Float4(RValue<Float> rhs)
2929 : XYZW(this)
2930 {
2931 RR_DEBUG_INFO_UPDATE_LOC();
2932 Value *vector = loadValue();
2933 Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
2934
2935 int swizzle[4] = { 0, 0, 0, 0 };
2936 Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
2937
2938 storeValue(replicate);
2939 }
2940
Max(RValue<Float4> x,RValue<Float4> y)2941 RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
2942 {
2943 RR_DEBUG_INFO_UPDATE_LOC();
2944 #if defined(__i386__) || defined(__x86_64__)
2945 return x86::maxps(x, y);
2946 #else
2947 return As<Float4>(V(lowerPFMINMAX(V(x.value), V(y.value), llvm::FCmpInst::FCMP_OGT)));
2948 #endif
2949 }
2950
Min(RValue<Float4> x,RValue<Float4> y)2951 RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
2952 {
2953 RR_DEBUG_INFO_UPDATE_LOC();
2954 #if defined(__i386__) || defined(__x86_64__)
2955 return x86::minps(x, y);
2956 #else
2957 return As<Float4>(V(lowerPFMINMAX(V(x.value), V(y.value), llvm::FCmpInst::FCMP_OLT)));
2958 #endif
2959 }
2960
Rcp_pp(RValue<Float4> x,bool exactAtPow2)2961 RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)
2962 {
2963 RR_DEBUG_INFO_UPDATE_LOC();
2964 #if defined(__i386__) || defined(__x86_64__)
2965 if(exactAtPow2)
2966 {
2967 // rcpps uses a piecewise-linear approximation which minimizes the relative error
2968 // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
2969 return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2970 }
2971 return x86::rcpps(x);
2972 #else
2973 return As<Float4>(V(lowerRCP(V(x.value))));
2974 #endif
2975 }
2976
RcpSqrt_pp(RValue<Float4> x)2977 RValue<Float4> RcpSqrt_pp(RValue<Float4> x)
2978 {
2979 RR_DEBUG_INFO_UPDATE_LOC();
2980 #if defined(__i386__) || defined(__x86_64__)
2981 return x86::rsqrtps(x);
2982 #else
2983 return As<Float4>(V(lowerRSQRT(V(x.value))));
2984 #endif
2985 }
2986
Sqrt(RValue<Float4> x)2987 RValue<Float4> Sqrt(RValue<Float4> x)
2988 {
2989 RR_DEBUG_INFO_UPDATE_LOC();
2990 #if defined(__i386__) || defined(__x86_64__)
2991 return x86::sqrtps(x);
2992 #else
2993 return As<Float4>(V(lowerSQRT(V(x.value))));
2994 #endif
2995 }
2996
SignMask(RValue<Float4> x)2997 RValue<Int> SignMask(RValue<Float4> x)
2998 {
2999 RR_DEBUG_INFO_UPDATE_LOC();
3000 #if defined(__i386__) || defined(__x86_64__)
3001 return x86::movmskps(x);
3002 #else
3003 return As<Int>(V(lowerFPSignMask(V(x.value), T(Int::getType()))));
3004 #endif
3005 }
3006
CmpEQ(RValue<Float4> x,RValue<Float4> y)3007 RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
3008 {
3009 RR_DEBUG_INFO_UPDATE_LOC();
3010 // return As<Int4>(x86::cmpeqps(x, y));
3011 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOEQ(x.value, y.value), Int4::getType()));
3012 }
3013
CmpLT(RValue<Float4> x,RValue<Float4> y)3014 RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y)
3015 {
3016 RR_DEBUG_INFO_UPDATE_LOC();
3017 // return As<Int4>(x86::cmpltps(x, y));
3018 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLT(x.value, y.value), Int4::getType()));
3019 }
3020
CmpLE(RValue<Float4> x,RValue<Float4> y)3021 RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y)
3022 {
3023 RR_DEBUG_INFO_UPDATE_LOC();
3024 // return As<Int4>(x86::cmpleps(x, y));
3025 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLE(x.value, y.value), Int4::getType()));
3026 }
3027
CmpNEQ(RValue<Float4> x,RValue<Float4> y)3028 RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y)
3029 {
3030 RR_DEBUG_INFO_UPDATE_LOC();
3031 // return As<Int4>(x86::cmpneqps(x, y));
3032 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpONE(x.value, y.value), Int4::getType()));
3033 }
3034
CmpNLT(RValue<Float4> x,RValue<Float4> y)3035 RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y)
3036 {
3037 RR_DEBUG_INFO_UPDATE_LOC();
3038 // return As<Int4>(x86::cmpnltps(x, y));
3039 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGE(x.value, y.value), Int4::getType()));
3040 }
3041
CmpNLE(RValue<Float4> x,RValue<Float4> y)3042 RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y)
3043 {
3044 RR_DEBUG_INFO_UPDATE_LOC();
3045 // return As<Int4>(x86::cmpnleps(x, y));
3046 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGT(x.value, y.value), Int4::getType()));
3047 }
3048
CmpUEQ(RValue<Float4> x,RValue<Float4> y)3049 RValue<Int4> CmpUEQ(RValue<Float4> x, RValue<Float4> y)
3050 {
3051 RR_DEBUG_INFO_UPDATE_LOC();
3052 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUEQ(x.value, y.value), Int4::getType()));
3053 }
3054
CmpULT(RValue<Float4> x,RValue<Float4> y)3055 RValue<Int4> CmpULT(RValue<Float4> x, RValue<Float4> y)
3056 {
3057 RR_DEBUG_INFO_UPDATE_LOC();
3058 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULT(x.value, y.value), Int4::getType()));
3059 }
3060
CmpULE(RValue<Float4> x,RValue<Float4> y)3061 RValue<Int4> CmpULE(RValue<Float4> x, RValue<Float4> y)
3062 {
3063 RR_DEBUG_INFO_UPDATE_LOC();
3064 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULE(x.value, y.value), Int4::getType()));
3065 }
3066
CmpUNEQ(RValue<Float4> x,RValue<Float4> y)3067 RValue<Int4> CmpUNEQ(RValue<Float4> x, RValue<Float4> y)
3068 {
3069 RR_DEBUG_INFO_UPDATE_LOC();
3070 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUNE(x.value, y.value), Int4::getType()));
3071 }
3072
CmpUNLT(RValue<Float4> x,RValue<Float4> y)3073 RValue<Int4> CmpUNLT(RValue<Float4> x, RValue<Float4> y)
3074 {
3075 RR_DEBUG_INFO_UPDATE_LOC();
3076 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGE(x.value, y.value), Int4::getType()));
3077 }
3078
CmpUNLE(RValue<Float4> x,RValue<Float4> y)3079 RValue<Int4> CmpUNLE(RValue<Float4> x, RValue<Float4> y)
3080 {
3081 RR_DEBUG_INFO_UPDATE_LOC();
3082 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGT(x.value, y.value), Int4::getType()));
3083 }
3084
Round(RValue<Float4> x)3085 RValue<Float4> Round(RValue<Float4> x)
3086 {
3087 RR_DEBUG_INFO_UPDATE_LOC();
3088 #if defined(__i386__) || defined(__x86_64__)
3089 if(CPUID::supportsSSE4_1())
3090 {
3091 return x86::roundps(x, 0);
3092 }
3093 else
3094 {
3095 return Float4(RoundInt(x));
3096 }
3097 #else
3098 return RValue<Float4>(V(lowerRound(V(x.value))));
3099 #endif
3100 }
3101
Trunc(RValue<Float4> x)3102 RValue<Float4> Trunc(RValue<Float4> x)
3103 {
3104 RR_DEBUG_INFO_UPDATE_LOC();
3105 #if defined(__i386__) || defined(__x86_64__)
3106 if(CPUID::supportsSSE4_1())
3107 {
3108 return x86::roundps(x, 3);
3109 }
3110 else
3111 {
3112 return Float4(Int4(x));
3113 }
3114 #else
3115 return RValue<Float4>(V(lowerTrunc(V(x.value))));
3116 #endif
3117 }
3118
Frac(RValue<Float4> x)3119 RValue<Float4> Frac(RValue<Float4> x)
3120 {
3121 RR_DEBUG_INFO_UPDATE_LOC();
3122 Float4 frc;
3123
3124 #if defined(__i386__) || defined(__x86_64__)
3125 if(CPUID::supportsSSE4_1())
3126 {
3127 frc = x - Floor(x);
3128 }
3129 else
3130 {
3131 frc = x - Float4(Int4(x)); // Signed fractional part.
3132
3133 frc += As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1.0f))); // Add 1.0 if negative.
3134 }
3135 #else
3136 frc = x - Floor(x);
3137 #endif
3138
3139 // x - floor(x) can be 1.0 for very small negative x.
3140 // Clamp against the value just below 1.0.
3141 return Min(frc, As<Float4>(Int4(0x3F7FFFFF)));
3142 }
3143
Floor(RValue<Float4> x)3144 RValue<Float4> Floor(RValue<Float4> x)
3145 {
3146 RR_DEBUG_INFO_UPDATE_LOC();
3147 #if defined(__i386__) || defined(__x86_64__)
3148 if(CPUID::supportsSSE4_1())
3149 {
3150 return x86::floorps(x);
3151 }
3152 else
3153 {
3154 return x - Frac(x);
3155 }
3156 #else
3157 return RValue<Float4>(V(lowerFloor(V(x.value))));
3158 #endif
3159 }
3160
Ceil(RValue<Float4> x)3161 RValue<Float4> Ceil(RValue<Float4> x)
3162 {
3163 RR_DEBUG_INFO_UPDATE_LOC();
3164 #if defined(__i386__) || defined(__x86_64__)
3165 if(CPUID::supportsSSE4_1())
3166 {
3167 return x86::ceilps(x);
3168 }
3169 else
3170 #endif
3171 {
3172 return -Floor(-x);
3173 }
3174 }
3175
Sin(RValue<Float4> v)3176 RValue<Float4> Sin(RValue<Float4> v)
3177 {
3178 RR_DEBUG_INFO_UPDATE_LOC();
3179 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::sin, { V(v.value)->getType() });
3180 return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value))));
3181 }
3182
Cos(RValue<Float4> v)3183 RValue<Float4> Cos(RValue<Float4> v)
3184 {
3185 RR_DEBUG_INFO_UPDATE_LOC();
3186 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cos, { V(v.value)->getType() });
3187 return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value))));
3188 }
3189
Tan(RValue<Float4> v)3190 RValue<Float4> Tan(RValue<Float4> v)
3191 {
3192 RR_DEBUG_INFO_UPDATE_LOC();
3193 return Sin(v) / Cos(v);
3194 }
3195
TransformFloat4PerElement(RValue<Float4> v,const char * name)3196 static RValue<Float4> TransformFloat4PerElement(RValue<Float4> v, const char *name)
3197 {
3198 auto funcTy = ::llvm::FunctionType::get(T(Float::getType()), ::llvm::ArrayRef<llvm::Type *>(T(Float::getType())), false);
3199 auto func = jit->module->getOrInsertFunction(name, funcTy);
3200 llvm::Value *out = ::llvm::UndefValue::get(T(Float4::getType()));
3201 for(uint64_t i = 0; i < 4; i++)
3202 {
3203 auto el = jit->builder->CreateCall(func, V(Nucleus::createExtractElement(v.value, Float::getType(), i)));
3204 out = V(Nucleus::createInsertElement(V(out), V(el), i));
3205 }
3206 return RValue<Float4>(V(out));
3207 }
3208
Asin(RValue<Float4> v)3209 RValue<Float4> Asin(RValue<Float4> v)
3210 {
3211 RR_DEBUG_INFO_UPDATE_LOC();
3212 return TransformFloat4PerElement(v, "asinf");
3213 }
3214
Acos(RValue<Float4> v)3215 RValue<Float4> Acos(RValue<Float4> v)
3216 {
3217 RR_DEBUG_INFO_UPDATE_LOC();
3218 return TransformFloat4PerElement(v, "acosf");
3219 }
3220
Atan(RValue<Float4> v)3221 RValue<Float4> Atan(RValue<Float4> v)
3222 {
3223 RR_DEBUG_INFO_UPDATE_LOC();
3224 return TransformFloat4PerElement(v, "atanf");
3225 }
3226
Sinh(RValue<Float4> v)3227 RValue<Float4> Sinh(RValue<Float4> v)
3228 {
3229 RR_DEBUG_INFO_UPDATE_LOC();
3230 return emulated::Sinh(v);
3231 }
3232
Cosh(RValue<Float4> v)3233 RValue<Float4> Cosh(RValue<Float4> v)
3234 {
3235 RR_DEBUG_INFO_UPDATE_LOC();
3236 return emulated::Cosh(v);
3237 }
3238
Tanh(RValue<Float4> v)3239 RValue<Float4> Tanh(RValue<Float4> v)
3240 {
3241 RR_DEBUG_INFO_UPDATE_LOC();
3242 return TransformFloat4PerElement(v, "tanhf");
3243 }
3244
Asinh(RValue<Float4> v)3245 RValue<Float4> Asinh(RValue<Float4> v)
3246 {
3247 RR_DEBUG_INFO_UPDATE_LOC();
3248 return TransformFloat4PerElement(v, "asinhf");
3249 }
3250
Acosh(RValue<Float4> v)3251 RValue<Float4> Acosh(RValue<Float4> v)
3252 {
3253 RR_DEBUG_INFO_UPDATE_LOC();
3254 return TransformFloat4PerElement(v, "acoshf");
3255 }
3256
Atanh(RValue<Float4> v)3257 RValue<Float4> Atanh(RValue<Float4> v)
3258 {
3259 RR_DEBUG_INFO_UPDATE_LOC();
3260 return TransformFloat4PerElement(v, "atanhf");
3261 }
3262
Atan2(RValue<Float4> x,RValue<Float4> y)3263 RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y)
3264 {
3265 RR_DEBUG_INFO_UPDATE_LOC();
3266 ::llvm::SmallVector<::llvm::Type *, 2> paramTys;
3267 paramTys.push_back(T(Float::getType()));
3268 paramTys.push_back(T(Float::getType()));
3269 auto funcTy = ::llvm::FunctionType::get(T(Float::getType()), paramTys, false);
3270 auto func = jit->module->getOrInsertFunction("atan2f", funcTy);
3271 llvm::Value *out = ::llvm::UndefValue::get(T(Float4::getType()));
3272 for(uint64_t i = 0; i < 4; i++)
3273 {
3274 auto el = jit->builder->CreateCall2(func, ARGS(
3275 V(Nucleus::createExtractElement(x.value, Float::getType(), i)),
3276 V(Nucleus::createExtractElement(y.value, Float::getType(), i))));
3277 out = V(Nucleus::createInsertElement(V(out), V(el), i));
3278 }
3279 return RValue<Float4>(V(out));
3280 }
3281
Pow(RValue<Float4> x,RValue<Float4> y)3282 RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y)
3283 {
3284 RR_DEBUG_INFO_UPDATE_LOC();
3285 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::pow, { T(Float4::getType()) });
3286 return RValue<Float4>(V(jit->builder->CreateCall2(func, ARGS(V(x.value), V(y.value)))));
3287 }
3288
Exp(RValue<Float4> v)3289 RValue<Float4> Exp(RValue<Float4> v)
3290 {
3291 RR_DEBUG_INFO_UPDATE_LOC();
3292 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp, { T(Float4::getType()) });
3293 return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value))));
3294 }
3295
Log(RValue<Float4> v)3296 RValue<Float4> Log(RValue<Float4> v)
3297 {
3298 RR_DEBUG_INFO_UPDATE_LOC();
3299 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log, { T(Float4::getType()) });
3300 return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value))));
3301 }
3302
Exp2(RValue<Float4> v)3303 RValue<Float4> Exp2(RValue<Float4> v)
3304 {
3305 RR_DEBUG_INFO_UPDATE_LOC();
3306 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp2, { T(Float4::getType()) });
3307 return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value))));
3308 }
3309
Log2(RValue<Float4> v)3310 RValue<Float4> Log2(RValue<Float4> v)
3311 {
3312 RR_DEBUG_INFO_UPDATE_LOC();
3313 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log2, { T(Float4::getType()) });
3314 return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value))));
3315 }
3316
Ctlz(RValue<UInt> v,bool isZeroUndef)3317 RValue<UInt> Ctlz(RValue<UInt> v, bool isZeroUndef)
3318 {
3319 RR_DEBUG_INFO_UPDATE_LOC();
3320 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt::getType()) });
3321 return RValue<UInt>(V(jit->builder->CreateCall2(func, ARGS(
3322 V(v.value),
3323 isZeroUndef ? ::llvm::ConstantInt::getTrue(jit->context) : ::llvm::ConstantInt::getFalse(jit->context)))));
3324 }
3325
Ctlz(RValue<UInt4> v,bool isZeroUndef)3326 RValue<UInt4> Ctlz(RValue<UInt4> v, bool isZeroUndef)
3327 {
3328 RR_DEBUG_INFO_UPDATE_LOC();
3329 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt4::getType()) });
3330 return RValue<UInt4>(V(jit->builder->CreateCall2(func, ARGS(
3331 V(v.value),
3332 isZeroUndef ? ::llvm::ConstantInt::getTrue(jit->context) : ::llvm::ConstantInt::getFalse(jit->context)))));
3333 }
3334
Cttz(RValue<UInt> v,bool isZeroUndef)3335 RValue<UInt> Cttz(RValue<UInt> v, bool isZeroUndef)
3336 {
3337 RR_DEBUG_INFO_UPDATE_LOC();
3338 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt::getType()) });
3339 return RValue<UInt>(V(jit->builder->CreateCall2(func, ARGS(
3340 V(v.value),
3341 isZeroUndef ? ::llvm::ConstantInt::getTrue(jit->context) : ::llvm::ConstantInt::getFalse(jit->context)))));
3342 }
3343
Cttz(RValue<UInt4> v,bool isZeroUndef)3344 RValue<UInt4> Cttz(RValue<UInt4> v, bool isZeroUndef)
3345 {
3346 RR_DEBUG_INFO_UPDATE_LOC();
3347 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt4::getType()) });
3348 return RValue<UInt4>(V(jit->builder->CreateCall2(func, ARGS(
3349 V(v.value),
3350 isZeroUndef ? ::llvm::ConstantInt::getTrue(jit->context) : ::llvm::ConstantInt::getFalse(jit->context)))));
3351 }
3352
MinAtomic(RValue<Pointer<Int>> x,RValue<Int> y,std::memory_order memoryOrder)3353 RValue<Int> MinAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
3354 {
3355 return RValue<Int>(Nucleus::createAtomicMin(x.value, y.value, memoryOrder));
3356 }
3357
MinAtomic(RValue<Pointer<UInt>> x,RValue<UInt> y,std::memory_order memoryOrder)3358 RValue<UInt> MinAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
3359 {
3360 return RValue<UInt>(Nucleus::createAtomicUMin(x.value, y.value, memoryOrder));
3361 }
3362
MaxAtomic(RValue<Pointer<Int>> x,RValue<Int> y,std::memory_order memoryOrder)3363 RValue<Int> MaxAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
3364 {
3365 return RValue<Int>(Nucleus::createAtomicMax(x.value, y.value, memoryOrder));
3366 }
3367
MaxAtomic(RValue<Pointer<UInt>> x,RValue<UInt> y,std::memory_order memoryOrder)3368 RValue<UInt> MaxAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
3369 {
3370 return RValue<UInt>(Nucleus::createAtomicUMax(x.value, y.value, memoryOrder));
3371 }
3372
getType()3373 Type *Float4::getType()
3374 {
3375 return T(llvm::VectorType::get(T(Float::getType()), 4));
3376 }
3377
Ticks()3378 RValue<Long> Ticks()
3379 {
3380 RR_DEBUG_INFO_UPDATE_LOC();
3381 llvm::Function *rdtsc = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::readcyclecounter);
3382
3383 return RValue<Long>(V(jit->builder->CreateCall(rdtsc)));
3384 }
3385
ConstantPointer(void const * ptr)3386 RValue<Pointer<Byte>> ConstantPointer(void const *ptr)
3387 {
3388 RR_DEBUG_INFO_UPDATE_LOC();
3389 // Note: this should work for 32-bit pointers as well because 'inttoptr'
3390 // is defined to truncate (and zero extend) if necessary.
3391 auto ptrAsInt = ::llvm::ConstantInt::get(::llvm::Type::getInt64Ty(jit->context), reinterpret_cast<uintptr_t>(ptr));
3392 return RValue<Pointer<Byte>>(V(jit->builder->CreateIntToPtr(ptrAsInt, T(Pointer<Byte>::getType()))));
3393 }
3394
ConstantData(void const * data,size_t size)3395 RValue<Pointer<Byte>> ConstantData(void const *data, size_t size)
3396 {
3397 RR_DEBUG_INFO_UPDATE_LOC();
3398 auto str = ::std::string(reinterpret_cast<const char *>(data), size);
3399 auto ptr = jit->builder->CreateGlobalStringPtr(str);
3400 return RValue<Pointer<Byte>>(V(ptr));
3401 }
3402
Call(RValue<Pointer<Byte>> fptr,Type * retTy,std::initializer_list<Value * > args,std::initializer_list<Type * > argTys)3403 Value *Call(RValue<Pointer<Byte>> fptr, Type *retTy, std::initializer_list<Value *> args, std::initializer_list<Type *> argTys)
3404 {
3405 RR_DEBUG_INFO_UPDATE_LOC();
3406 ::llvm::SmallVector<::llvm::Type *, 8> paramTys;
3407 for(auto ty : argTys) { paramTys.push_back(T(ty)); }
3408 auto funcTy = ::llvm::FunctionType::get(T(retTy), paramTys, false);
3409
3410 auto funcPtrTy = funcTy->getPointerTo();
3411 auto funcPtr = jit->builder->CreatePointerCast(V(fptr.value), funcPtrTy);
3412
3413 ::llvm::SmallVector<::llvm::Value *, 8> arguments;
3414 for(auto arg : args) { arguments.push_back(V(arg)); }
3415 return V(jit->builder->CreateCall(funcPtr, arguments));
3416 }
3417
Breakpoint()3418 void Breakpoint()
3419 {
3420 RR_DEBUG_INFO_UPDATE_LOC();
3421 llvm::Function *debugtrap = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::debugtrap);
3422
3423 jit->builder->CreateCall(debugtrap);
3424 }
3425
3426 } // namespace rr
3427
3428 namespace rr {
3429
3430 #if defined(__i386__) || defined(__x86_64__)
3431 namespace x86 {
3432
cvtss2si(RValue<Float> val)3433 RValue<Int> cvtss2si(RValue<Float> val)
3434 {
3435 llvm::Function *cvtss2si = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_cvtss2si);
3436
3437 Float4 vector;
3438 vector.x = val;
3439
3440 return RValue<Int>(V(jit->builder->CreateCall(cvtss2si, ARGS(V(RValue<Float4>(vector).value)))));
3441 }
3442
cvtps2dq(RValue<Float4> val)3443 RValue<Int4> cvtps2dq(RValue<Float4> val)
3444 {
3445 llvm::Function *cvtps2dq = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_cvtps2dq);
3446
3447 return RValue<Int4>(V(jit->builder->CreateCall(cvtps2dq, ARGS(V(val.value)))));
3448 }
3449
rcpss(RValue<Float> val)3450 RValue<Float> rcpss(RValue<Float> val)
3451 {
3452 llvm::Function *rcpss = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_rcp_ss);
3453
3454 Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::getType()))), val.value, 0);
3455
3456 return RValue<Float>(Nucleus::createExtractElement(V(jit->builder->CreateCall(rcpss, ARGS(V(vector)))), Float::getType(), 0));
3457 }
3458
sqrtss(RValue<Float> val)3459 RValue<Float> sqrtss(RValue<Float> val)
3460 {
3461 llvm::Function *sqrt = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::sqrt, { V(val.value)->getType() });
3462 return RValue<Float>(V(jit->builder->CreateCall(sqrt, ARGS(V(val.value)))));
3463 }
3464
rsqrtss(RValue<Float> val)3465 RValue<Float> rsqrtss(RValue<Float> val)
3466 {
3467 llvm::Function *rsqrtss = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_rsqrt_ss);
3468
3469 Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::getType()))), val.value, 0);
3470
3471 return RValue<Float>(Nucleus::createExtractElement(V(jit->builder->CreateCall(rsqrtss, ARGS(V(vector)))), Float::getType(), 0));
3472 }
3473
rcpps(RValue<Float4> val)3474 RValue<Float4> rcpps(RValue<Float4> val)
3475 {
3476 llvm::Function *rcpps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_rcp_ps);
3477
3478 return RValue<Float4>(V(jit->builder->CreateCall(rcpps, ARGS(V(val.value)))));
3479 }
3480
sqrtps(RValue<Float4> val)3481 RValue<Float4> sqrtps(RValue<Float4> val)
3482 {
3483 llvm::Function *sqrtps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::sqrt, { V(val.value)->getType() });
3484
3485 return RValue<Float4>(V(jit->builder->CreateCall(sqrtps, ARGS(V(val.value)))));
3486 }
3487
rsqrtps(RValue<Float4> val)3488 RValue<Float4> rsqrtps(RValue<Float4> val)
3489 {
3490 llvm::Function *rsqrtps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_rsqrt_ps);
3491
3492 return RValue<Float4>(V(jit->builder->CreateCall(rsqrtps, ARGS(V(val.value)))));
3493 }
3494
maxps(RValue<Float4> x,RValue<Float4> y)3495 RValue<Float4> maxps(RValue<Float4> x, RValue<Float4> y)
3496 {
3497 llvm::Function *maxps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_max_ps);
3498
3499 return RValue<Float4>(V(jit->builder->CreateCall2(maxps, ARGS(V(x.value), V(y.value)))));
3500 }
3501
minps(RValue<Float4> x,RValue<Float4> y)3502 RValue<Float4> minps(RValue<Float4> x, RValue<Float4> y)
3503 {
3504 llvm::Function *minps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_min_ps);
3505
3506 return RValue<Float4>(V(jit->builder->CreateCall2(minps, ARGS(V(x.value), V(y.value)))));
3507 }
3508
roundss(RValue<Float> val,unsigned char imm)3509 RValue<Float> roundss(RValue<Float> val, unsigned char imm)
3510 {
3511 llvm::Function *roundss = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse41_round_ss);
3512
3513 Value *undef = V(llvm::UndefValue::get(T(Float4::getType())));
3514 Value *vector = Nucleus::createInsertElement(undef, val.value, 0);
3515
3516 return RValue<Float>(Nucleus::createExtractElement(V(jit->builder->CreateCall3(roundss, ARGS(V(undef), V(vector), V(Nucleus::createConstantInt(imm))))), Float::getType(), 0));
3517 }
3518
floorss(RValue<Float> val)3519 RValue<Float> floorss(RValue<Float> val)
3520 {
3521 return roundss(val, 1);
3522 }
3523
ceilss(RValue<Float> val)3524 RValue<Float> ceilss(RValue<Float> val)
3525 {
3526 return roundss(val, 2);
3527 }
3528
roundps(RValue<Float4> val,unsigned char imm)3529 RValue<Float4> roundps(RValue<Float4> val, unsigned char imm)
3530 {
3531 llvm::Function *roundps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse41_round_ps);
3532
3533 return RValue<Float4>(V(jit->builder->CreateCall2(roundps, ARGS(V(val.value), V(Nucleus::createConstantInt(imm))))));
3534 }
3535
floorps(RValue<Float4> val)3536 RValue<Float4> floorps(RValue<Float4> val)
3537 {
3538 return roundps(val, 1);
3539 }
3540
ceilps(RValue<Float4> val)3541 RValue<Float4> ceilps(RValue<Float4> val)
3542 {
3543 return roundps(val, 2);
3544 }
3545
pabsd(RValue<Int4> x)3546 RValue<Int4> pabsd(RValue<Int4> x)
3547 {
3548 return RValue<Int4>(V(lowerPABS(V(x.value))));
3549 }
3550
paddsw(RValue<Short4> x,RValue<Short4> y)3551 RValue<Short4> paddsw(RValue<Short4> x, RValue<Short4> y)
3552 {
3553 # if LLVM_VERSION_MAJOR >= 8
3554 return As<Short4>(V(lowerPSADDSAT(V(x.value), V(y.value))));
3555 # else
3556 llvm::Function *paddsw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_padds_w);
3557
3558 return As<Short4>(V(jit->builder->CreateCall2(paddsw, ARGS(V(x.value), V(y.value)))));
3559 # endif
3560 }
3561
psubsw(RValue<Short4> x,RValue<Short4> y)3562 RValue<Short4> psubsw(RValue<Short4> x, RValue<Short4> y)
3563 {
3564 # if LLVM_VERSION_MAJOR >= 8
3565 return As<Short4>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
3566 # else
3567 llvm::Function *psubsw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psubs_w);
3568
3569 return As<Short4>(V(jit->builder->CreateCall2(psubsw, ARGS(V(x.value), V(y.value)))));
3570 # endif
3571 }
3572
paddusw(RValue<UShort4> x,RValue<UShort4> y)3573 RValue<UShort4> paddusw(RValue<UShort4> x, RValue<UShort4> y)
3574 {
3575 # if LLVM_VERSION_MAJOR >= 8
3576 return As<UShort4>(V(lowerPUADDSAT(V(x.value), V(y.value))));
3577 # else
3578 llvm::Function *paddusw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_paddus_w);
3579
3580 return As<UShort4>(V(jit->builder->CreateCall2(paddusw, ARGS(V(x.value), V(y.value)))));
3581 # endif
3582 }
3583
psubusw(RValue<UShort4> x,RValue<UShort4> y)3584 RValue<UShort4> psubusw(RValue<UShort4> x, RValue<UShort4> y)
3585 {
3586 # if LLVM_VERSION_MAJOR >= 8
3587 return As<UShort4>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
3588 # else
3589 llvm::Function *psubusw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psubus_w);
3590
3591 return As<UShort4>(V(jit->builder->CreateCall2(psubusw, ARGS(V(x.value), V(y.value)))));
3592 # endif
3593 }
3594
paddsb(RValue<SByte8> x,RValue<SByte8> y)3595 RValue<SByte8> paddsb(RValue<SByte8> x, RValue<SByte8> y)
3596 {
3597 # if LLVM_VERSION_MAJOR >= 8
3598 return As<SByte8>(V(lowerPSADDSAT(V(x.value), V(y.value))));
3599 # else
3600 llvm::Function *paddsb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_padds_b);
3601
3602 return As<SByte8>(V(jit->builder->CreateCall2(paddsb, ARGS(V(x.value), V(y.value)))));
3603 # endif
3604 }
3605
psubsb(RValue<SByte8> x,RValue<SByte8> y)3606 RValue<SByte8> psubsb(RValue<SByte8> x, RValue<SByte8> y)
3607 {
3608 # if LLVM_VERSION_MAJOR >= 8
3609 return As<SByte8>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
3610 # else
3611 llvm::Function *psubsb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psubs_b);
3612
3613 return As<SByte8>(V(jit->builder->CreateCall2(psubsb, ARGS(V(x.value), V(y.value)))));
3614 # endif
3615 }
3616
paddusb(RValue<Byte8> x,RValue<Byte8> y)3617 RValue<Byte8> paddusb(RValue<Byte8> x, RValue<Byte8> y)
3618 {
3619 # if LLVM_VERSION_MAJOR >= 8
3620 return As<Byte8>(V(lowerPUADDSAT(V(x.value), V(y.value))));
3621 # else
3622 llvm::Function *paddusb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_paddus_b);
3623
3624 return As<Byte8>(V(jit->builder->CreateCall2(paddusb, ARGS(V(x.value), V(y.value)))));
3625 # endif
3626 }
3627
psubusb(RValue<Byte8> x,RValue<Byte8> y)3628 RValue<Byte8> psubusb(RValue<Byte8> x, RValue<Byte8> y)
3629 {
3630 # if LLVM_VERSION_MAJOR >= 8
3631 return As<Byte8>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
3632 # else
3633 llvm::Function *psubusb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psubus_b);
3634
3635 return As<Byte8>(V(jit->builder->CreateCall2(psubusb, ARGS(V(x.value), V(y.value)))));
3636 # endif
3637 }
3638
pavgw(RValue<UShort4> x,RValue<UShort4> y)3639 RValue<UShort4> pavgw(RValue<UShort4> x, RValue<UShort4> y)
3640 {
3641 return As<UShort4>(V(lowerPAVG(V(x.value), V(y.value))));
3642 }
3643
pmaxsw(RValue<Short4> x,RValue<Short4> y)3644 RValue<Short4> pmaxsw(RValue<Short4> x, RValue<Short4> y)
3645 {
3646 return As<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SGT)));
3647 }
3648
pminsw(RValue<Short4> x,RValue<Short4> y)3649 RValue<Short4> pminsw(RValue<Short4> x, RValue<Short4> y)
3650 {
3651 return As<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SLT)));
3652 }
3653
pcmpgtw(RValue<Short4> x,RValue<Short4> y)3654 RValue<Short4> pcmpgtw(RValue<Short4> x, RValue<Short4> y)
3655 {
3656 return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Short4::getType()))));
3657 }
3658
pcmpeqw(RValue<Short4> x,RValue<Short4> y)3659 RValue<Short4> pcmpeqw(RValue<Short4> x, RValue<Short4> y)
3660 {
3661 return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Short4::getType()))));
3662 }
3663
pcmpgtb(RValue<SByte8> x,RValue<SByte8> y)3664 RValue<Byte8> pcmpgtb(RValue<SByte8> x, RValue<SByte8> y)
3665 {
3666 return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Byte8::getType()))));
3667 }
3668
pcmpeqb(RValue<Byte8> x,RValue<Byte8> y)3669 RValue<Byte8> pcmpeqb(RValue<Byte8> x, RValue<Byte8> y)
3670 {
3671 return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Byte8::getType()))));
3672 }
3673
packssdw(RValue<Int2> x,RValue<Int2> y)3674 RValue<Short4> packssdw(RValue<Int2> x, RValue<Int2> y)
3675 {
3676 llvm::Function *packssdw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_packssdw_128);
3677
3678 return As<Short4>(V(jit->builder->CreateCall2(packssdw, ARGS(V(x.value), V(y.value)))));
3679 }
3680
packssdw(RValue<Int4> x,RValue<Int4> y)3681 RValue<Short8> packssdw(RValue<Int4> x, RValue<Int4> y)
3682 {
3683 llvm::Function *packssdw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_packssdw_128);
3684
3685 return RValue<Short8>(V(jit->builder->CreateCall2(packssdw, ARGS(V(x.value), V(y.value)))));
3686 }
3687
packsswb(RValue<Short4> x,RValue<Short4> y)3688 RValue<SByte8> packsswb(RValue<Short4> x, RValue<Short4> y)
3689 {
3690 llvm::Function *packsswb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_packsswb_128);
3691
3692 return As<SByte8>(V(jit->builder->CreateCall2(packsswb, ARGS(V(x.value), V(y.value)))));
3693 }
3694
packuswb(RValue<Short4> x,RValue<Short4> y)3695 RValue<Byte8> packuswb(RValue<Short4> x, RValue<Short4> y)
3696 {
3697 llvm::Function *packuswb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_packuswb_128);
3698
3699 return As<Byte8>(V(jit->builder->CreateCall2(packuswb, ARGS(V(x.value), V(y.value)))));
3700 }
3701
packusdw(RValue<Int4> x,RValue<Int4> y)3702 RValue<UShort8> packusdw(RValue<Int4> x, RValue<Int4> y)
3703 {
3704 if(CPUID::supportsSSE4_1())
3705 {
3706 llvm::Function *packusdw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse41_packusdw);
3707
3708 return RValue<UShort8>(V(jit->builder->CreateCall2(packusdw, ARGS(V(x.value), V(y.value)))));
3709 }
3710 else
3711 {
3712 RValue<Int4> bx = (x & ~(x >> 31)) - Int4(0x8000);
3713 RValue<Int4> by = (y & ~(y >> 31)) - Int4(0x8000);
3714
3715 return As<UShort8>(packssdw(bx, by) + Short8(0x8000u));
3716 }
3717 }
3718
psrlw(RValue<UShort4> x,unsigned char y)3719 RValue<UShort4> psrlw(RValue<UShort4> x, unsigned char y)
3720 {
3721 llvm::Function *psrlw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrli_w);
3722
3723 return As<UShort4>(V(jit->builder->CreateCall2(psrlw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3724 }
3725
psrlw(RValue<UShort8> x,unsigned char y)3726 RValue<UShort8> psrlw(RValue<UShort8> x, unsigned char y)
3727 {
3728 llvm::Function *psrlw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrli_w);
3729
3730 return RValue<UShort8>(V(jit->builder->CreateCall2(psrlw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3731 }
3732
psraw(RValue<Short4> x,unsigned char y)3733 RValue<Short4> psraw(RValue<Short4> x, unsigned char y)
3734 {
3735 llvm::Function *psraw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrai_w);
3736
3737 return As<Short4>(V(jit->builder->CreateCall2(psraw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3738 }
3739
psraw(RValue<Short8> x,unsigned char y)3740 RValue<Short8> psraw(RValue<Short8> x, unsigned char y)
3741 {
3742 llvm::Function *psraw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrai_w);
3743
3744 return RValue<Short8>(V(jit->builder->CreateCall2(psraw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3745 }
3746
psllw(RValue<Short4> x,unsigned char y)3747 RValue<Short4> psllw(RValue<Short4> x, unsigned char y)
3748 {
3749 llvm::Function *psllw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pslli_w);
3750
3751 return As<Short4>(V(jit->builder->CreateCall2(psllw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3752 }
3753
psllw(RValue<Short8> x,unsigned char y)3754 RValue<Short8> psllw(RValue<Short8> x, unsigned char y)
3755 {
3756 llvm::Function *psllw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pslli_w);
3757
3758 return RValue<Short8>(V(jit->builder->CreateCall2(psllw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3759 }
3760
pslld(RValue<Int2> x,unsigned char y)3761 RValue<Int2> pslld(RValue<Int2> x, unsigned char y)
3762 {
3763 llvm::Function *pslld = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pslli_d);
3764
3765 return As<Int2>(V(jit->builder->CreateCall2(pslld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3766 }
3767
pslld(RValue<Int4> x,unsigned char y)3768 RValue<Int4> pslld(RValue<Int4> x, unsigned char y)
3769 {
3770 llvm::Function *pslld = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pslli_d);
3771
3772 return RValue<Int4>(V(jit->builder->CreateCall2(pslld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3773 }
3774
psrad(RValue<Int2> x,unsigned char y)3775 RValue<Int2> psrad(RValue<Int2> x, unsigned char y)
3776 {
3777 llvm::Function *psrad = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrai_d);
3778
3779 return As<Int2>(V(jit->builder->CreateCall2(psrad, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3780 }
3781
psrad(RValue<Int4> x,unsigned char y)3782 RValue<Int4> psrad(RValue<Int4> x, unsigned char y)
3783 {
3784 llvm::Function *psrad = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrai_d);
3785
3786 return RValue<Int4>(V(jit->builder->CreateCall2(psrad, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3787 }
3788
psrld(RValue<UInt2> x,unsigned char y)3789 RValue<UInt2> psrld(RValue<UInt2> x, unsigned char y)
3790 {
3791 llvm::Function *psrld = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrli_d);
3792
3793 return As<UInt2>(V(jit->builder->CreateCall2(psrld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3794 }
3795
psrld(RValue<UInt4> x,unsigned char y)3796 RValue<UInt4> psrld(RValue<UInt4> x, unsigned char y)
3797 {
3798 llvm::Function *psrld = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrli_d);
3799
3800 return RValue<UInt4>(V(jit->builder->CreateCall2(psrld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3801 }
3802
pmaxsd(RValue<Int4> x,RValue<Int4> y)3803 RValue<Int4> pmaxsd(RValue<Int4> x, RValue<Int4> y)
3804 {
3805 return RValue<Int4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SGT)));
3806 }
3807
pminsd(RValue<Int4> x,RValue<Int4> y)3808 RValue<Int4> pminsd(RValue<Int4> x, RValue<Int4> y)
3809 {
3810 return RValue<Int4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SLT)));
3811 }
3812
pmaxud(RValue<UInt4> x,RValue<UInt4> y)3813 RValue<UInt4> pmaxud(RValue<UInt4> x, RValue<UInt4> y)
3814 {
3815 return RValue<UInt4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_UGT)));
3816 }
3817
pminud(RValue<UInt4> x,RValue<UInt4> y)3818 RValue<UInt4> pminud(RValue<UInt4> x, RValue<UInt4> y)
3819 {
3820 return RValue<UInt4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_ULT)));
3821 }
3822
pmulhw(RValue<Short4> x,RValue<Short4> y)3823 RValue<Short4> pmulhw(RValue<Short4> x, RValue<Short4> y)
3824 {
3825 llvm::Function *pmulhw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmulh_w);
3826
3827 return As<Short4>(V(jit->builder->CreateCall2(pmulhw, ARGS(V(x.value), V(y.value)))));
3828 }
3829
pmulhuw(RValue<UShort4> x,RValue<UShort4> y)3830 RValue<UShort4> pmulhuw(RValue<UShort4> x, RValue<UShort4> y)
3831 {
3832 llvm::Function *pmulhuw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmulhu_w);
3833
3834 return As<UShort4>(V(jit->builder->CreateCall2(pmulhuw, ARGS(V(x.value), V(y.value)))));
3835 }
3836
pmaddwd(RValue<Short4> x,RValue<Short4> y)3837 RValue<Int2> pmaddwd(RValue<Short4> x, RValue<Short4> y)
3838 {
3839 llvm::Function *pmaddwd = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmadd_wd);
3840
3841 return As<Int2>(V(jit->builder->CreateCall2(pmaddwd, ARGS(V(x.value), V(y.value)))));
3842 }
3843
pmulhw(RValue<Short8> x,RValue<Short8> y)3844 RValue<Short8> pmulhw(RValue<Short8> x, RValue<Short8> y)
3845 {
3846 llvm::Function *pmulhw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmulh_w);
3847
3848 return RValue<Short8>(V(jit->builder->CreateCall2(pmulhw, ARGS(V(x.value), V(y.value)))));
3849 }
3850
pmulhuw(RValue<UShort8> x,RValue<UShort8> y)3851 RValue<UShort8> pmulhuw(RValue<UShort8> x, RValue<UShort8> y)
3852 {
3853 llvm::Function *pmulhuw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmulhu_w);
3854
3855 return RValue<UShort8>(V(jit->builder->CreateCall2(pmulhuw, ARGS(V(x.value), V(y.value)))));
3856 }
3857
pmaddwd(RValue<Short8> x,RValue<Short8> y)3858 RValue<Int4> pmaddwd(RValue<Short8> x, RValue<Short8> y)
3859 {
3860 llvm::Function *pmaddwd = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmadd_wd);
3861
3862 return RValue<Int4>(V(jit->builder->CreateCall2(pmaddwd, ARGS(V(x.value), V(y.value)))));
3863 }
3864
movmskps(RValue<Float4> x)3865 RValue<Int> movmskps(RValue<Float4> x)
3866 {
3867 llvm::Function *movmskps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_movmsk_ps);
3868
3869 return RValue<Int>(V(jit->builder->CreateCall(movmskps, ARGS(V(x.value)))));
3870 }
3871
pmovmskb(RValue<Byte8> x)3872 RValue<Int> pmovmskb(RValue<Byte8> x)
3873 {
3874 llvm::Function *pmovmskb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmovmskb_128);
3875
3876 return RValue<Int>(V(jit->builder->CreateCall(pmovmskb, ARGS(V(x.value))))) & 0xFF;
3877 }
3878
pmovzxbd(RValue<Byte16> x)3879 RValue<Int4> pmovzxbd(RValue<Byte16> x)
3880 {
3881 return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), false)));
3882 }
3883
pmovsxbd(RValue<SByte16> x)3884 RValue<Int4> pmovsxbd(RValue<SByte16> x)
3885 {
3886 return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), true)));
3887 }
3888
pmovzxwd(RValue<UShort8> x)3889 RValue<Int4> pmovzxwd(RValue<UShort8> x)
3890 {
3891 return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), false)));
3892 }
3893
pmovsxwd(RValue<Short8> x)3894 RValue<Int4> pmovsxwd(RValue<Short8> x)
3895 {
3896 return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), true)));
3897 }
3898
3899 } // namespace x86
3900 #endif // defined(__i386__) || defined(__x86_64__)
3901
3902 #ifdef ENABLE_RR_PRINT
VPrintf(const std::vector<Value * > & vals)3903 void VPrintf(const std::vector<Value *> &vals)
3904 {
3905 auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
3906 auto i8PtrTy = ::llvm::Type::getInt8PtrTy(jit->context);
3907 auto funcTy = ::llvm::FunctionType::get(i32Ty, { i8PtrTy }, true);
3908 auto func = jit->module->getOrInsertFunction("printf", funcTy);
3909 jit->builder->CreateCall(func, V(vals));
3910 }
3911 #endif // ENABLE_RR_PRINT
3912
Nop()3913 void Nop()
3914 {
3915 auto voidTy = ::llvm::Type::getVoidTy(jit->context);
3916 auto funcTy = ::llvm::FunctionType::get(voidTy, {}, false);
3917 auto func = jit->module->getOrInsertFunction("nop", funcTy);
3918 jit->builder->CreateCall(func);
3919 }
3920
// Forwards to DebugInfo::EmitLocation() when a debug-info object exists.
// Compiles to a no-op unless ENABLE_RR_DEBUG_INFO is defined.
void EmitDebugLocation()
{
#ifdef ENABLE_RR_DEBUG_INFO
	if(jit->debugInfo == nullptr)
	{
		return;
	}

	jit->debugInfo->EmitLocation();
#endif  // ENABLE_RR_DEBUG_INFO
}
3930
EmitDebugVariable(Value * value)3931 void EmitDebugVariable(Value *value)
3932 {
3933 #ifdef ENABLE_RR_DEBUG_INFO
3934 if(jit->debugInfo != nullptr)
3935 {
3936 jit->debugInfo->EmitVariable(value);
3937 }
3938 #endif // ENABLE_RR_DEBUG_INFO
3939 }
3940
// Forwards to DebugInfo::Flush() when a debug-info object exists.
// Compiles to a no-op unless ENABLE_RR_DEBUG_INFO is defined.
void FlushDebug()
{
#ifdef ENABLE_RR_DEBUG_INFO
	if(jit->debugInfo == nullptr)
	{
		return;
	}

	jit->debugInfo->Flush();
#endif  // ENABLE_RR_DEBUG_INFO
}
3950
3951 } // namespace rr
3952
3953 // ------------------------------ Coroutines ------------------------------
3954
3955 namespace {
3956
// Magic values returned by llvm.coro.suspend.
// See: https://llvm.org/docs/Coroutines.html#llvm-coro-suspend-intrinsic
enum SuspendAction
{
	SuspendActionSuspend = -1,  // Default switch target: jump to the suspend block.
	SuspendActionResume = 0,    // The coroutine was resumed.
	SuspendActionDestroy = 1    // The coroutine is being destroyed: jump to the destroy block.
};
3965
promoteFunctionToCoroutine()3966 void promoteFunctionToCoroutine()
3967 {
3968 ASSERT(jit->coroutine.id == nullptr);
3969
3970 // Types
3971 auto voidTy = ::llvm::Type::getVoidTy(jit->context);
3972 auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
3973 auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
3974 auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
3975 auto i8PtrTy = ::llvm::Type::getInt8PtrTy(jit->context);
3976 auto promiseTy = jit->coroutine.yieldType;
3977 auto promisePtrTy = promiseTy->getPointerTo();
3978
3979 // LLVM intrinsics
3980 auto coro_id = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_id);
3981 auto coro_size = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_size, { i32Ty });
3982 auto coro_begin = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_begin);
3983 auto coro_resume = ::llvm::Intrinsic::getDeclaration(jit->module.get(), ::llvm::Intrinsic::coro_resume);
3984 auto coro_end = ::llvm::Intrinsic::getDeclaration(jit->module.get(), ::llvm::Intrinsic::coro_end);
3985 auto coro_free = ::llvm::Intrinsic::getDeclaration(jit->module.get(), ::llvm::Intrinsic::coro_free);
3986 auto coro_destroy = ::llvm::Intrinsic::getDeclaration(jit->module.get(), ::llvm::Intrinsic::coro_destroy);
3987 auto coro_promise = ::llvm::Intrinsic::getDeclaration(jit->module.get(), ::llvm::Intrinsic::coro_promise);
3988 auto coro_done = ::llvm::Intrinsic::getDeclaration(jit->module.get(), ::llvm::Intrinsic::coro_done);
3989 auto coro_suspend = ::llvm::Intrinsic::getDeclaration(jit->module.get(), ::llvm::Intrinsic::coro_suspend);
3990
3991 auto allocFrameTy = ::llvm::FunctionType::get(i8PtrTy, { i32Ty }, false);
3992 auto allocFrame = jit->module->getOrInsertFunction("coroutine_alloc_frame", allocFrameTy);
3993 auto freeFrameTy = ::llvm::FunctionType::get(voidTy, { i8PtrTy }, false);
3994 auto freeFrame = jit->module->getOrInsertFunction("coroutine_free_frame", freeFrameTy);
3995
3996 auto oldInsertionPoint = jit->builder->saveIP();
3997
3998 // Build the coroutine_await() function:
3999 //
4000 // bool coroutine_await(CoroutineHandle* handle, YieldType* out)
4001 // {
4002 // if(llvm.coro.done(handle))
4003 // {
4004 // return false;
4005 // }
4006 // else
4007 // {
4008 // *value = (T*)llvm.coro.promise(handle);
4009 // llvm.coro.resume(handle);
4010 // return true;
4011 // }
4012 // }
4013 //
4014 {
4015 auto args = jit->coroutine.await->arg_begin();
4016 auto handle = args++;
4017 auto outPtr = args++;
4018 jit->builder->SetInsertPoint(llvm::BasicBlock::Create(jit->context, "co_await", jit->coroutine.await));
4019 auto doneBlock = llvm::BasicBlock::Create(jit->context, "done", jit->coroutine.await);
4020 auto resumeBlock = llvm::BasicBlock::Create(jit->context, "resume", jit->coroutine.await);
4021
4022 auto done = jit->builder->CreateCall(coro_done, { handle }, "done");
4023 jit->builder->CreateCondBr(done, doneBlock, resumeBlock);
4024
4025 jit->builder->SetInsertPoint(doneBlock);
4026 jit->builder->CreateRet(::llvm::ConstantInt::getFalse(i1Ty));
4027
4028 jit->builder->SetInsertPoint(resumeBlock);
4029 auto promiseAlignment = ::llvm::ConstantInt::get(i32Ty, 4); // TODO: Get correct alignment.
4030 auto promisePtr = jit->builder->CreateCall(coro_promise, { handle, promiseAlignment, ::llvm::ConstantInt::get(i1Ty, 0) });
4031 auto promise = jit->builder->CreateLoad(jit->builder->CreatePointerCast(promisePtr, promisePtrTy));
4032 jit->builder->CreateStore(promise, outPtr);
4033 jit->builder->CreateCall(coro_resume, { handle });
4034 jit->builder->CreateRet(::llvm::ConstantInt::getTrue(i1Ty));
4035 }
4036
4037 // Build the coroutine_destroy() function:
4038 //
4039 // void coroutine_destroy(CoroutineHandle* handle)
4040 // {
4041 // llvm.coro.destroy(handle);
4042 // }
4043 //
4044 {
4045 auto handle = jit->coroutine.destroy->arg_begin();
4046 jit->builder->SetInsertPoint(llvm::BasicBlock::Create(jit->context, "", jit->coroutine.destroy));
4047 jit->builder->CreateCall(coro_destroy, { handle });
4048 jit->builder->CreateRetVoid();
4049 }
4050
4051 // Begin building the main coroutine_begin() function.
4052 //
4053 // CoroutineHandle* coroutine_begin(<Arguments>)
4054 // {
4055 // YieldType promise;
4056 // auto id = llvm.coro.id(0, &promise, nullptr, nullptr);
4057 // void* frame = coroutine_alloc_frame(llvm.coro.size.i32());
4058 // CoroutineHandle *handle = llvm.coro.begin(id, frame);
4059 //
4060 // ... <REACTOR CODE> ...
4061 //
4062 // end:
4063 // SuspendAction action = llvm.coro.suspend(none, true /* final */); // <-- RESUME POINT
4064 // switch(action)
4065 // {
4066 // case SuspendActionResume:
4067 // UNREACHABLE(); // Illegal to resume after final suspend.
4068 // case SuspendActionDestroy:
4069 // goto destroy;
4070 // default: // (SuspendActionSuspend)
4071 // goto suspend;
4072 // }
4073 //
4074 // destroy:
4075 // coroutine_free_frame(llvm.coro.free(id, handle));
4076 // goto suspend;
4077 //
4078 // suspend:
4079 // llvm.coro.end(handle, false);
4080 // return handle;
4081 // }
4082 //
4083
4084 #ifdef ENABLE_RR_DEBUG_INFO
4085 jit->debugInfo = std::make_unique<rr::DebugInfo>(jit->builder.get(), &jit->context, jit->module.get(), jit->function);
4086 #endif // ENABLE_RR_DEBUG_INFO
4087
4088 jit->coroutine.suspendBlock = llvm::BasicBlock::Create(jit->context, "suspend", jit->function);
4089 jit->coroutine.endBlock = llvm::BasicBlock::Create(jit->context, "end", jit->function);
4090 jit->coroutine.destroyBlock = llvm::BasicBlock::Create(jit->context, "destroy", jit->function);
4091
4092 jit->builder->SetInsertPoint(jit->coroutine.entryBlock, jit->coroutine.entryBlock->begin());
4093 jit->coroutine.promise = jit->builder->CreateAlloca(promiseTy, nullptr, "promise");
4094 jit->coroutine.id = jit->builder->CreateCall(coro_id, {
4095 ::llvm::ConstantInt::get(i32Ty, 0),
4096 jit->builder->CreatePointerCast(jit->coroutine.promise, i8PtrTy),
4097 ::llvm::ConstantPointerNull::get(i8PtrTy),
4098 ::llvm::ConstantPointerNull::get(i8PtrTy),
4099 });
4100 auto size = jit->builder->CreateCall(coro_size, {});
4101 auto frame = jit->builder->CreateCall(allocFrame, { size });
4102 jit->coroutine.handle = jit->builder->CreateCall(coro_begin, { jit->coroutine.id, frame });
4103
4104 // Build the suspend block
4105 jit->builder->SetInsertPoint(jit->coroutine.suspendBlock);
4106 jit->builder->CreateCall(coro_end, { jit->coroutine.handle, ::llvm::ConstantInt::get(i1Ty, 0) });
4107 jit->builder->CreateRet(jit->coroutine.handle);
4108
4109 // Build the end block
4110 jit->builder->SetInsertPoint(jit->coroutine.endBlock);
4111 auto action = jit->builder->CreateCall(coro_suspend, {
4112 ::llvm::ConstantTokenNone::get(jit->context),
4113 ::llvm::ConstantInt::get(i1Ty, 1), // final: true
4114 });
4115 auto switch_ = jit->builder->CreateSwitch(action, jit->coroutine.suspendBlock, 3);
4116 // switch_->addCase(::llvm::ConstantInt::get(i8Ty, SuspendActionResume), trapBlock); // TODO: Trap attempting to resume after final suspend
4117 switch_->addCase(::llvm::ConstantInt::get(i8Ty, SuspendActionDestroy), jit->coroutine.destroyBlock);
4118
4119 // Build the destroy block
4120 jit->builder->SetInsertPoint(jit->coroutine.destroyBlock);
4121 auto memory = jit->builder->CreateCall(coro_free, { jit->coroutine.id, jit->coroutine.handle });
4122 jit->builder->CreateCall(freeFrame, { memory });
4123 jit->builder->CreateBr(jit->coroutine.suspendBlock);
4124
4125 // Switch back to original insert point to continue building the coroutine.
4126 jit->builder->restoreIP(oldInsertionPoint);
4127 }
4128
4129 } // anonymous namespace
4130
4131 namespace rr {
4132
createCoroutine(Type * YieldType,const std::vector<Type * > & Params)4133 void Nucleus::createCoroutine(Type *YieldType, const std::vector<Type *> &Params)
4134 {
4135 // Coroutines are initially created as a regular function.
4136 // Upon the first call to Yield(), the function is promoted to a true
4137 // coroutine.
4138 auto voidTy = ::llvm::Type::getVoidTy(jit->context);
4139 auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
4140 auto i8PtrTy = ::llvm::Type::getInt8PtrTy(jit->context);
4141 auto handleTy = i8PtrTy;
4142 auto boolTy = i1Ty;
4143 auto promiseTy = T(YieldType);
4144 auto promisePtrTy = promiseTy->getPointerTo();
4145
4146 jit->function = rr::createFunction("coroutine_begin", handleTy, T(Params));
4147 jit->coroutine.await = rr::createFunction("coroutine_await", boolTy, { handleTy, promisePtrTy });
4148 jit->coroutine.destroy = rr::createFunction("coroutine_destroy", voidTy, { handleTy });
4149 jit->coroutine.yieldType = promiseTy;
4150 jit->coroutine.entryBlock = llvm::BasicBlock::Create(jit->context, "function", jit->function);
4151
4152 jit->builder->SetInsertPoint(jit->coroutine.entryBlock);
4153 }
4154
yield(Value * val)4155 void Nucleus::yield(Value *val)
4156 {
4157 if(jit->coroutine.id == nullptr)
4158 {
4159 // First call to yield().
4160 // Promote the function to a full coroutine.
4161 promoteFunctionToCoroutine();
4162 ASSERT(jit->coroutine.id != nullptr);
4163 }
4164
4165 // promise = val;
4166 //
4167 // auto action = llvm.coro.suspend(none, false /* final */); // <-- RESUME POINT
4168 // switch(action)
4169 // {
4170 // case SuspendActionResume:
4171 // goto resume;
4172 // case SuspendActionDestroy:
4173 // goto destroy;
4174 // default: // (SuspendActionSuspend)
4175 // goto suspend;
4176 // }
4177 // resume:
4178 //
4179
4180 RR_DEBUG_INFO_UPDATE_LOC();
4181 Variable::materializeAll();
4182
4183 // Types
4184 auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
4185 auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
4186
4187 // Intrinsics
4188 auto coro_suspend = ::llvm::Intrinsic::getDeclaration(jit->module.get(), ::llvm::Intrinsic::coro_suspend);
4189
4190 // Create a block to resume execution.
4191 auto resumeBlock = llvm::BasicBlock::Create(jit->context, "resume", jit->function);
4192
4193 // Store the promise (yield value)
4194 jit->builder->CreateStore(V(val), jit->coroutine.promise);
4195 auto action = jit->builder->CreateCall(coro_suspend, {
4196 ::llvm::ConstantTokenNone::get(jit->context),
4197 ::llvm::ConstantInt::get(i1Ty, 0), // final: true
4198 });
4199 auto switch_ = jit->builder->CreateSwitch(action, jit->coroutine.suspendBlock, 3);
4200 switch_->addCase(::llvm::ConstantInt::get(i8Ty, SuspendActionResume), resumeBlock);
4201 switch_->addCase(::llvm::ConstantInt::get(i8Ty, SuspendActionDestroy), jit->coroutine.destroyBlock);
4202
4203 // Continue building in the resume block.
4204 jit->builder->SetInsertPoint(resumeBlock);
4205 }
4206
acquireCoroutine(const char * name,const Config::Edit & cfgEdit)4207 std::shared_ptr<Routine> Nucleus::acquireCoroutine(const char *name, const Config::Edit &cfgEdit /* = Config::Edit::None */)
4208 {
4209 bool isCoroutine = jit->coroutine.id != nullptr;
4210 if(isCoroutine)
4211 {
4212 jit->builder->CreateBr(jit->coroutine.endBlock);
4213 }
4214 else
4215 {
4216 // Coroutine without a Yield acts as a regular function.
4217 // The 'coroutine_begin' function returns a nullptr for the coroutine
4218 // handle.
4219 jit->builder->CreateRet(llvm::Constant::getNullValue(jit->function->getReturnType()));
4220 // The 'coroutine_await' function always returns false (coroutine done).
4221 jit->builder->SetInsertPoint(llvm::BasicBlock::Create(jit->context, "", jit->coroutine.await));
4222 jit->builder->CreateRet(llvm::Constant::getNullValue(jit->coroutine.await->getReturnType()));
4223 // The 'coroutine_destroy' does nothing, returns void.
4224 jit->builder->SetInsertPoint(llvm::BasicBlock::Create(jit->context, "", jit->coroutine.destroy));
4225 jit->builder->CreateRetVoid();
4226 }
4227
4228 #ifdef ENABLE_RR_DEBUG_INFO
4229 if(jit->debugInfo != nullptr)
4230 {
4231 jit->debugInfo->Finalize();
4232 }
4233 #endif // ENABLE_RR_DEBUG_INFO
4234
4235 if(false)
4236 {
4237 std::error_code error;
4238 llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-unopt.txt", error);
4239 jit->module->print(file, 0);
4240 }
4241
4242 if(isCoroutine)
4243 {
4244 // Run manadory coroutine transforms.
4245 llvm::legacy::PassManager pm;
4246
4247 #if LLVM_VERSION_MAJOR >= 9
4248 pm.add(llvm::createCoroEarlyLegacyPass());
4249 pm.add(llvm::createCoroSplitLegacyPass());
4250 pm.add(llvm::createCoroElideLegacyPass());
4251 pm.add(llvm::createBarrierNoopPass());
4252 pm.add(llvm::createCoroCleanupLegacyPass());
4253 #else
4254 pm.add(llvm::createCoroEarlyPass());
4255 pm.add(llvm::createCoroSplitPass());
4256 pm.add(llvm::createCoroElidePass());
4257 pm.add(llvm::createBarrierNoopPass());
4258 pm.add(llvm::createCoroCleanupPass());
4259 #endif
4260
4261 pm.run(*jit->module);
4262 }
4263
4264 #if defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
4265 {
4266 llvm::legacy::PassManager pm;
4267 pm.add(llvm::createVerifierPass());
4268 pm.run(*jit->module);
4269 }
4270 #endif // defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
4271
4272 auto cfg = cfgEdit.apply(jit->config);
4273 jit->optimize(cfg);
4274
4275 if(false)
4276 {
4277 std::error_code error;
4278 llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-opt.txt", error);
4279 jit->module->print(file, 0);
4280 }
4281
4282 llvm::Function *funcs[Nucleus::CoroutineEntryCount];
4283 funcs[Nucleus::CoroutineEntryBegin] = jit->function;
4284 funcs[Nucleus::CoroutineEntryAwait] = jit->coroutine.await;
4285 funcs[Nucleus::CoroutineEntryDestroy] = jit->coroutine.destroy;
4286 auto routine = jit->acquireRoutine(funcs, Nucleus::CoroutineEntryCount, cfg);
4287 jit.reset();
4288
4289 return routine;
4290 }
4291
invokeCoroutineBegin(Routine & routine,std::function<Nucleus::CoroutineHandle ()> func)4292 Nucleus::CoroutineHandle Nucleus::invokeCoroutineBegin(Routine &routine, std::function<Nucleus::CoroutineHandle()> func)
4293 {
4294 return func();
4295 }
4296
4297 } // namespace rr
4298