1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "LLVMReactor.hpp"
16 
17 #include "CPUID.hpp"
18 #include "Debug.hpp"
19 #include "EmulatedReactor.hpp"
20 #include "LLVMReactorDebugInfo.hpp"
21 #include "Print.hpp"
22 #include "Reactor.hpp"
23 #include "x86.hpp"
24 
25 #include "llvm/IR/Intrinsics.h"
26 #if LLVM_VERSION_MAJOR >= 9
27 #	include "llvm/IR/IntrinsicsX86.h"
28 #endif
29 #include "llvm/IR/LegacyPassManager.h"
30 #include "llvm/IR/Verifier.h"
31 #include "llvm/Transforms/Coroutines.h"
32 #include "llvm/Transforms/IPO.h"
33 #include "llvm/Transforms/Scalar.h"
34 
35 #define ARGS(...)   \
36 	{               \
37 		__VA_ARGS__ \
38 	}
39 #define CreateCall2 CreateCall
40 #define CreateCall3 CreateCall
41 
42 #include <fstream>
43 #include <iostream>
44 #include <mutex>
45 #include <numeric>
46 #include <thread>
47 #include <unordered_map>
48 
49 #if defined(__i386__) || defined(__x86_64__)
50 #	include <xmmintrin.h>
51 #endif
52 
53 #include <math.h>
54 
55 #if defined(__x86_64__) && defined(_WIN32)
// Stub definition with C linkage, only compiled for 64-bit x86 Windows builds.
// Calling it reports an unimplemented feature via UNIMPLEMENTED_NO_BUG.
// NOTE(review): presumably exists to satisfy a symbol referenced by LLVM's
// x86 JIT on this platform - confirm.
extern "C" void X86CompilationCallback()
{
	UNIMPLEMENTED_NO_BUG("X86CompilationCallback");
}
60 #endif
61 
62 namespace {
63 
// JIT builder state for the routine currently being constructed. It is created
// by the Nucleus constructor and released by Nucleus::acquireRoutine() and the
// Nucleus destructor.
std::unique_ptr<rr::JITBuilder> jit;
// Locked by the Nucleus constructor and unlocked by its destructor, so that
// only one Nucleus (and hence one 'jit') exists at a time.
std::mutex codegenMutex;

// Default configuration settings. Must be accessed under mutex lock.
std::mutex defaultConfigLock;
defaultConfig()69 rr::Config &defaultConfig()
70 {
71 	// This uses a static in a function to avoid the cost of a global static
72 	// initializer. See http://neugierig.org/software/chromium/notes/2011/08/static-initializers.html
73 	static rr::Config config = rr::Config::Edit()
74 	                               .add(rr::Optimization::Pass::ScalarReplAggregates)
75 	                               .add(rr::Optimization::Pass::InstructionCombining)
76 	                               .apply({});
77 	return config;
78 }
79 
lowerPAVG(llvm::Value * x,llvm::Value * y)80 llvm::Value *lowerPAVG(llvm::Value *x, llvm::Value *y)
81 {
82 	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
83 
84 	llvm::VectorType *extTy =
85 	    llvm::VectorType::getExtendedElementVectorType(ty);
86 	x = jit->builder->CreateZExt(x, extTy);
87 	y = jit->builder->CreateZExt(y, extTy);
88 
89 	// (x + y + 1) >> 1
90 	llvm::Constant *one = llvm::ConstantInt::get(extTy, 1);
91 	llvm::Value *res = jit->builder->CreateAdd(x, y);
92 	res = jit->builder->CreateAdd(res, one);
93 	res = jit->builder->CreateLShr(res, one);
94 	return jit->builder->CreateTrunc(res, ty);
95 }
96 
lowerPMINMAX(llvm::Value * x,llvm::Value * y,llvm::ICmpInst::Predicate pred)97 llvm::Value *lowerPMINMAX(llvm::Value *x, llvm::Value *y,
98                           llvm::ICmpInst::Predicate pred)
99 {
100 	return jit->builder->CreateSelect(jit->builder->CreateICmp(pred, x, y), x, y);
101 }
102 
lowerPCMP(llvm::ICmpInst::Predicate pred,llvm::Value * x,llvm::Value * y,llvm::Type * dstTy)103 llvm::Value *lowerPCMP(llvm::ICmpInst::Predicate pred, llvm::Value *x,
104                        llvm::Value *y, llvm::Type *dstTy)
105 {
106 	return jit->builder->CreateSExt(jit->builder->CreateICmp(pred, x, y), dstTy, "");
107 }
108 
109 #if defined(__i386__) || defined(__x86_64__)
lowerPMOV(llvm::Value * op,llvm::Type * dstType,bool sext)110 llvm::Value *lowerPMOV(llvm::Value *op, llvm::Type *dstType, bool sext)
111 {
112 	llvm::VectorType *srcTy = llvm::cast<llvm::VectorType>(op->getType());
113 	llvm::VectorType *dstTy = llvm::cast<llvm::VectorType>(dstType);
114 
115 	llvm::Value *undef = llvm::UndefValue::get(srcTy);
116 	llvm::SmallVector<uint32_t, 16> mask(dstTy->getNumElements());
117 	std::iota(mask.begin(), mask.end(), 0);
118 	llvm::Value *v = jit->builder->CreateShuffleVector(op, undef, mask);
119 
120 	return sext ? jit->builder->CreateSExt(v, dstTy)
121 	            : jit->builder->CreateZExt(v, dstTy);
122 }
123 
lowerPABS(llvm::Value * v)124 llvm::Value *lowerPABS(llvm::Value *v)
125 {
126 	llvm::Value *zero = llvm::Constant::getNullValue(v->getType());
127 	llvm::Value *cmp = jit->builder->CreateICmp(llvm::ICmpInst::ICMP_SGT, v, zero);
128 	llvm::Value *neg = jit->builder->CreateNeg(v);
129 	return jit->builder->CreateSelect(cmp, v, neg);
130 }
131 #endif  // defined(__i386__) || defined(__x86_64__)
132 
133 #if !defined(__i386__) && !defined(__x86_64__)
lowerPFMINMAX(llvm::Value * x,llvm::Value * y,llvm::FCmpInst::Predicate pred)134 llvm::Value *lowerPFMINMAX(llvm::Value *x, llvm::Value *y,
135                            llvm::FCmpInst::Predicate pred)
136 {
137 	return jit->builder->CreateSelect(jit->builder->CreateFCmp(pred, x, y), x, y);
138 }
139 
lowerRound(llvm::Value * x)140 llvm::Value *lowerRound(llvm::Value *x)
141 {
142 	llvm::Function *nearbyint = llvm::Intrinsic::getDeclaration(
143 	    jit->module.get(), llvm::Intrinsic::nearbyint, { x->getType() });
144 	return jit->builder->CreateCall(nearbyint, ARGS(x));
145 }
146 
lowerRoundInt(llvm::Value * x,llvm::Type * ty)147 llvm::Value *lowerRoundInt(llvm::Value *x, llvm::Type *ty)
148 {
149 	return jit->builder->CreateFPToSI(lowerRound(x), ty);
150 }
151 
lowerFloor(llvm::Value * x)152 llvm::Value *lowerFloor(llvm::Value *x)
153 {
154 	llvm::Function *floor = llvm::Intrinsic::getDeclaration(
155 	    jit->module.get(), llvm::Intrinsic::floor, { x->getType() });
156 	return jit->builder->CreateCall(floor, ARGS(x));
157 }
158 
lowerTrunc(llvm::Value * x)159 llvm::Value *lowerTrunc(llvm::Value *x)
160 {
161 	llvm::Function *trunc = llvm::Intrinsic::getDeclaration(
162 	    jit->module.get(), llvm::Intrinsic::trunc, { x->getType() });
163 	return jit->builder->CreateCall(trunc, ARGS(x));
164 }
165 
166 // Packed add/sub with saturation
lowerPSAT(llvm::Value * x,llvm::Value * y,bool isAdd,bool isSigned)167 llvm::Value *lowerPSAT(llvm::Value *x, llvm::Value *y, bool isAdd, bool isSigned)
168 {
169 	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
170 	llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
171 
172 	unsigned numBits = ty->getScalarSizeInBits();
173 
174 	llvm::Value *max, *min, *extX, *extY;
175 	if(isSigned)
176 	{
177 		max = llvm::ConstantInt::get(extTy, (1LL << (numBits - 1)) - 1, true);
178 		min = llvm::ConstantInt::get(extTy, (-1LL << (numBits - 1)), true);
179 		extX = jit->builder->CreateSExt(x, extTy);
180 		extY = jit->builder->CreateSExt(y, extTy);
181 	}
182 	else
183 	{
184 		ASSERT_MSG(numBits <= 64, "numBits: %d", int(numBits));
185 		uint64_t maxVal = (numBits == 64) ? ~0ULL : (1ULL << numBits) - 1;
186 		max = llvm::ConstantInt::get(extTy, maxVal, false);
187 		min = llvm::ConstantInt::get(extTy, 0, false);
188 		extX = jit->builder->CreateZExt(x, extTy);
189 		extY = jit->builder->CreateZExt(y, extTy);
190 	}
191 
192 	llvm::Value *res = isAdd ? jit->builder->CreateAdd(extX, extY)
193 	                         : jit->builder->CreateSub(extX, extY);
194 
195 	res = lowerPMINMAX(res, min, llvm::ICmpInst::ICMP_SGT);
196 	res = lowerPMINMAX(res, max, llvm::ICmpInst::ICMP_SLT);
197 
198 	return jit->builder->CreateTrunc(res, ty);
199 }
200 
lowerSQRT(llvm::Value * x)201 llvm::Value *lowerSQRT(llvm::Value *x)
202 {
203 	llvm::Function *sqrt = llvm::Intrinsic::getDeclaration(
204 	    jit->module.get(), llvm::Intrinsic::sqrt, { x->getType() });
205 	return jit->builder->CreateCall(sqrt, ARGS(x));
206 }
207 
lowerRCP(llvm::Value * x)208 llvm::Value *lowerRCP(llvm::Value *x)
209 {
210 	llvm::Type *ty = x->getType();
211 	llvm::Constant *one;
212 	if(llvm::VectorType *vectorTy = llvm::dyn_cast<llvm::VectorType>(ty))
213 	{
214 		one = llvm::ConstantVector::getSplat(
215 		    vectorTy->getNumElements(),
216 		    llvm::ConstantFP::get(vectorTy->getElementType(), 1));
217 	}
218 	else
219 	{
220 		one = llvm::ConstantFP::get(ty, 1);
221 	}
222 	return jit->builder->CreateFDiv(one, x);
223 }
224 
lowerRSQRT(llvm::Value * x)225 llvm::Value *lowerRSQRT(llvm::Value *x)
226 {
227 	return lowerRCP(lowerSQRT(x));
228 }
229 
lowerVectorShl(llvm::Value * x,uint64_t scalarY)230 llvm::Value *lowerVectorShl(llvm::Value *x, uint64_t scalarY)
231 {
232 	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
233 	llvm::Value *y = llvm::ConstantVector::getSplat(
234 	    ty->getNumElements(),
235 	    llvm::ConstantInt::get(ty->getElementType(), scalarY));
236 	return jit->builder->CreateShl(x, y);
237 }
238 
lowerVectorAShr(llvm::Value * x,uint64_t scalarY)239 llvm::Value *lowerVectorAShr(llvm::Value *x, uint64_t scalarY)
240 {
241 	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
242 	llvm::Value *y = llvm::ConstantVector::getSplat(
243 	    ty->getNumElements(),
244 	    llvm::ConstantInt::get(ty->getElementType(), scalarY));
245 	return jit->builder->CreateAShr(x, y);
246 }
247 
lowerVectorLShr(llvm::Value * x,uint64_t scalarY)248 llvm::Value *lowerVectorLShr(llvm::Value *x, uint64_t scalarY)
249 {
250 	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
251 	llvm::Value *y = llvm::ConstantVector::getSplat(
252 	    ty->getNumElements(),
253 	    llvm::ConstantInt::get(ty->getElementType(), scalarY));
254 	return jit->builder->CreateLShr(x, y);
255 }
256 
lowerMulAdd(llvm::Value * x,llvm::Value * y)257 llvm::Value *lowerMulAdd(llvm::Value *x, llvm::Value *y)
258 {
259 	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
260 	llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
261 
262 	llvm::Value *extX = jit->builder->CreateSExt(x, extTy);
263 	llvm::Value *extY = jit->builder->CreateSExt(y, extTy);
264 	llvm::Value *mult = jit->builder->CreateMul(extX, extY);
265 
266 	llvm::Value *undef = llvm::UndefValue::get(extTy);
267 
268 	llvm::SmallVector<uint32_t, 16> evenIdx;
269 	llvm::SmallVector<uint32_t, 16> oddIdx;
270 	for(uint64_t i = 0, n = ty->getNumElements(); i < n; i += 2)
271 	{
272 		evenIdx.push_back(i);
273 		oddIdx.push_back(i + 1);
274 	}
275 
276 	llvm::Value *lhs = jit->builder->CreateShuffleVector(mult, undef, evenIdx);
277 	llvm::Value *rhs = jit->builder->CreateShuffleVector(mult, undef, oddIdx);
278 	return jit->builder->CreateAdd(lhs, rhs);
279 }
280 
lowerPack(llvm::Value * x,llvm::Value * y,bool isSigned)281 llvm::Value *lowerPack(llvm::Value *x, llvm::Value *y, bool isSigned)
282 {
283 	llvm::VectorType *srcTy = llvm::cast<llvm::VectorType>(x->getType());
284 	llvm::VectorType *dstTy = llvm::VectorType::getTruncatedElementVectorType(srcTy);
285 
286 	llvm::IntegerType *dstElemTy =
287 	    llvm::cast<llvm::IntegerType>(dstTy->getElementType());
288 
289 	uint64_t truncNumBits = dstElemTy->getIntegerBitWidth();
290 	ASSERT_MSG(truncNumBits < 64, "shift 64 must be handled separately. truncNumBits: %d", int(truncNumBits));
291 	llvm::Constant *max, *min;
292 	if(isSigned)
293 	{
294 		max = llvm::ConstantInt::get(srcTy, (1LL << (truncNumBits - 1)) - 1, true);
295 		min = llvm::ConstantInt::get(srcTy, (-1LL << (truncNumBits - 1)), true);
296 	}
297 	else
298 	{
299 		max = llvm::ConstantInt::get(srcTy, (1ULL << truncNumBits) - 1, false);
300 		min = llvm::ConstantInt::get(srcTy, 0, false);
301 	}
302 
303 	x = lowerPMINMAX(x, min, llvm::ICmpInst::ICMP_SGT);
304 	x = lowerPMINMAX(x, max, llvm::ICmpInst::ICMP_SLT);
305 	y = lowerPMINMAX(y, min, llvm::ICmpInst::ICMP_SGT);
306 	y = lowerPMINMAX(y, max, llvm::ICmpInst::ICMP_SLT);
307 
308 	x = jit->builder->CreateTrunc(x, dstTy);
309 	y = jit->builder->CreateTrunc(y, dstTy);
310 
311 	llvm::SmallVector<uint32_t, 16> index(srcTy->getNumElements() * 2);
312 	std::iota(index.begin(), index.end(), 0);
313 
314 	return jit->builder->CreateShuffleVector(x, y, index);
315 }
316 
lowerSignMask(llvm::Value * x,llvm::Type * retTy)317 llvm::Value *lowerSignMask(llvm::Value *x, llvm::Type *retTy)
318 {
319 	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
320 	llvm::Constant *zero = llvm::ConstantInt::get(ty, 0);
321 	llvm::Value *cmp = jit->builder->CreateICmpSLT(x, zero);
322 
323 	llvm::Value *ret = jit->builder->CreateZExt(
324 	    jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
325 	for(uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
326 	{
327 		llvm::Value *elem = jit->builder->CreateZExt(
328 		    jit->builder->CreateExtractElement(cmp, i), retTy);
329 		ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
330 	}
331 	return ret;
332 }
333 
lowerFPSignMask(llvm::Value * x,llvm::Type * retTy)334 llvm::Value *lowerFPSignMask(llvm::Value *x, llvm::Type *retTy)
335 {
336 	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
337 	llvm::Constant *zero = llvm::ConstantFP::get(ty, 0);
338 	llvm::Value *cmp = jit->builder->CreateFCmpULT(x, zero);
339 
340 	llvm::Value *ret = jit->builder->CreateZExt(
341 	    jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
342 	for(uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
343 	{
344 		llvm::Value *elem = jit->builder->CreateZExt(
345 		    jit->builder->CreateExtractElement(cmp, i), retTy);
346 		ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
347 	}
348 	return ret;
349 }
350 #endif  // !defined(__i386__) && !defined(__x86_64__)
351 
352 #if(LLVM_VERSION_MAJOR >= 8) || (!defined(__i386__) && !defined(__x86_64__))
lowerPUADDSAT(llvm::Value * x,llvm::Value * y)353 llvm::Value *lowerPUADDSAT(llvm::Value *x, llvm::Value *y)
354 {
355 #	if LLVM_VERSION_MAJOR >= 8
356 	return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::uadd_sat, x, y);
357 #	else
358 	return lowerPSAT(x, y, true, false);
359 #	endif
360 }
361 
lowerPSADDSAT(llvm::Value * x,llvm::Value * y)362 llvm::Value *lowerPSADDSAT(llvm::Value *x, llvm::Value *y)
363 {
364 #	if LLVM_VERSION_MAJOR >= 8
365 	return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::sadd_sat, x, y);
366 #	else
367 	return lowerPSAT(x, y, true, true);
368 #	endif
369 }
370 
lowerPUSUBSAT(llvm::Value * x,llvm::Value * y)371 llvm::Value *lowerPUSUBSAT(llvm::Value *x, llvm::Value *y)
372 {
373 #	if LLVM_VERSION_MAJOR >= 8
374 	return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::usub_sat, x, y);
375 #	else
376 	return lowerPSAT(x, y, false, false);
377 #	endif
378 }
379 
lowerPSSUBSAT(llvm::Value * x,llvm::Value * y)380 llvm::Value *lowerPSSUBSAT(llvm::Value *x, llvm::Value *y)
381 {
382 #	if LLVM_VERSION_MAJOR >= 8
383 	return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::ssub_sat, x, y);
384 #	else
385 	return lowerPSAT(x, y, false, true);
386 #	endif
387 }
388 #endif  // (LLVM_VERSION_MAJOR >= 8) || (!defined(__i386__) && !defined(__x86_64__))
389 
lowerMulHigh(llvm::Value * x,llvm::Value * y,bool sext)390 llvm::Value *lowerMulHigh(llvm::Value *x, llvm::Value *y, bool sext)
391 {
392 	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
393 	llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
394 
395 	llvm::Value *extX, *extY;
396 	if(sext)
397 	{
398 		extX = jit->builder->CreateSExt(x, extTy);
399 		extY = jit->builder->CreateSExt(y, extTy);
400 	}
401 	else
402 	{
403 		extX = jit->builder->CreateZExt(x, extTy);
404 		extY = jit->builder->CreateZExt(y, extTy);
405 	}
406 
407 	llvm::Value *mult = jit->builder->CreateMul(extX, extY);
408 
409 	llvm::IntegerType *intTy = llvm::cast<llvm::IntegerType>(ty->getElementType());
410 	llvm::Value *mulh = jit->builder->CreateAShr(mult, intTy->getBitWidth());
411 	return jit->builder->CreateTrunc(mulh, ty);
412 }
413 
createGather(llvm::Value * base,llvm::Type * elTy,llvm::Value * offsets,llvm::Value * mask,unsigned int alignment,bool zeroMaskedLanes)414 llvm::Value *createGather(llvm::Value *base, llvm::Type *elTy, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment, bool zeroMaskedLanes)
415 {
416 	ASSERT(base->getType()->isPointerTy());
417 	ASSERT(offsets->getType()->isVectorTy());
418 	ASSERT(mask->getType()->isVectorTy());
419 
420 	auto numEls = mask->getType()->getVectorNumElements();
421 	auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
422 	auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
423 	auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
424 	auto i8PtrTy = i8Ty->getPointerTo();
425 	auto elPtrTy = elTy->getPointerTo();
426 	auto elVecTy = ::llvm::VectorType::get(elTy, numEls);
427 	auto elPtrVecTy = ::llvm::VectorType::get(elPtrTy, numEls);
428 	auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy);
429 	auto i8Ptrs = jit->builder->CreateGEP(i8Base, offsets);
430 	auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
431 	auto i8Mask = jit->builder->CreateIntCast(mask, ::llvm::VectorType::get(i1Ty, numEls), false);  // vec<int, int, ...> -> vec<bool, bool, ...>
432 	auto passthrough = zeroMaskedLanes ? ::llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
433 	auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
434 	auto func = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_gather, { elVecTy, elPtrVecTy });
435 	return jit->builder->CreateCall(func, { elPtrs, align, i8Mask, passthrough });
436 }
437 
createScatter(llvm::Value * base,llvm::Value * val,llvm::Value * offsets,llvm::Value * mask,unsigned int alignment)438 void createScatter(llvm::Value *base, llvm::Value *val, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment)
439 {
440 	ASSERT(base->getType()->isPointerTy());
441 	ASSERT(val->getType()->isVectorTy());
442 	ASSERT(offsets->getType()->isVectorTy());
443 	ASSERT(mask->getType()->isVectorTy());
444 
445 	auto numEls = mask->getType()->getVectorNumElements();
446 	auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
447 	auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
448 	auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
449 	auto i8PtrTy = i8Ty->getPointerTo();
450 	auto elVecTy = val->getType();
451 	auto elTy = elVecTy->getVectorElementType();
452 	auto elPtrTy = elTy->getPointerTo();
453 	auto elPtrVecTy = ::llvm::VectorType::get(elPtrTy, numEls);
454 	auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy);
455 	auto i8Ptrs = jit->builder->CreateGEP(i8Base, offsets);
456 	auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
457 	auto i8Mask = jit->builder->CreateIntCast(mask, ::llvm::VectorType::get(i1Ty, numEls), false);  // vec<int, int, ...> -> vec<bool, bool, ...>
458 	auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
459 	auto func = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_scatter, { elVecTy, elPtrVecTy });
460 	jit->builder->CreateCall(func, { val, elPtrs, align, i8Mask });
461 }
462 }  // namespace
463 
464 namespace rr {
465 
BackendName()466 std::string BackendName()
467 {
468 	return std::string("LLVM ") + LLVM_VERSION_STRING;
469 }
470 
// Capabilities reported by this backend.
const Capabilities Caps = {
	true,  // CoroutinesSupported
};
474 
// The abstract Type* types are implemented as LLVM types, except that
// 64-bit vectors are emulated using 128-bit ones to avoid use of MMX in x86
// and VFP in ARM, and eliminate the overhead of converting them to explicit
// 128-bit ones. LLVM types are pointers, so we can represent emulated types
// as abstract pointers with small enum values.
//
// asInternalType() treats any value below EmulatedTypeCount as one of the
// emulated types, so the emulated enumerators must stay ahead of
// EmulatedTypeCount.
enum InternalType : uintptr_t
{
	// Emulated types:
	Type_v2i32,
	Type_v4i16,
	Type_v2i16,
	Type_v8i8,
	Type_v4i8,
	Type_v2f32,
	EmulatedTypeCount,
	// Returned by asInternalType() to indicate that the abstract Type*
	// should be interpreted as LLVM type pointer:
	Type_LLVM
};
494 
asInternalType(Type * type)495 inline InternalType asInternalType(Type *type)
496 {
497 	InternalType t = static_cast<InternalType>(reinterpret_cast<uintptr_t>(type));
498 	return (t < EmulatedTypeCount) ? t : Type_LLVM;
499 }
500 
T(Type * t)501 llvm::Type *T(Type *t)
502 {
503 	// Use 128-bit vectors to implement logically shorter ones.
504 	switch(asInternalType(t))
505 	{
506 		case Type_v2i32: return T(Int4::getType());
507 		case Type_v4i16: return T(Short8::getType());
508 		case Type_v2i16: return T(Short8::getType());
509 		case Type_v8i8: return T(Byte16::getType());
510 		case Type_v4i8: return T(Byte16::getType());
511 		case Type_v2f32: return T(Float4::getType());
512 		case Type_LLVM: return reinterpret_cast<llvm::Type *>(t);
513 		default:
514 			UNREACHABLE("asInternalType(t): %d", int(asInternalType(t)));
515 			return nullptr;
516 	}
517 }
518 
T(InternalType t)519 Type *T(InternalType t)
520 {
521 	return reinterpret_cast<Type *>(t);
522 }
523 
T(const std::vector<Type * > & t)524 inline const std::vector<llvm::Type *> &T(const std::vector<Type *> &t)
525 {
526 	return reinterpret_cast<const std::vector<llvm::Type *> &>(t);
527 }
528 
B(BasicBlock * t)529 inline llvm::BasicBlock *B(BasicBlock *t)
530 {
531 	return reinterpret_cast<llvm::BasicBlock *>(t);
532 }
533 
B(llvm::BasicBlock * t)534 inline BasicBlock *B(llvm::BasicBlock *t)
535 {
536 	return reinterpret_cast<BasicBlock *>(t);
537 }
538 
typeSize(Type * type)539 static size_t typeSize(Type *type)
540 {
541 	switch(asInternalType(type))
542 	{
543 		case Type_v2i32: return 8;
544 		case Type_v4i16: return 8;
545 		case Type_v2i16: return 4;
546 		case Type_v8i8: return 8;
547 		case Type_v4i8: return 4;
548 		case Type_v2f32: return 8;
549 		case Type_LLVM:
550 		{
551 			llvm::Type *t = T(type);
552 
553 			if(t->isPointerTy())
554 			{
555 				return sizeof(void *);
556 			}
557 
558 			// At this point we should only have LLVM 'primitive' types.
559 			unsigned int bits = t->getPrimitiveSizeInBits();
560 			ASSERT_MSG(bits != 0, "bits: %d", int(bits));
561 
562 			// TODO(capn): Booleans are 1 bit integers in LLVM's SSA type system,
563 			// but are typically stored as one byte. The DataLayout structure should
564 			// be used here and many other places if this assumption fails.
565 			return (bits + 7) / 8;
566 		}
567 		break;
568 		default:
569 			UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
570 			return 0;
571 	}
572 }
573 
elementCount(Type * type)574 static unsigned int elementCount(Type *type)
575 {
576 	switch(asInternalType(type))
577 	{
578 		case Type_v2i32: return 2;
579 		case Type_v4i16: return 4;
580 		case Type_v2i16: return 2;
581 		case Type_v8i8: return 8;
582 		case Type_v4i8: return 4;
583 		case Type_v2f32: return 2;
584 		case Type_LLVM: return llvm::cast<llvm::VectorType>(T(type))->getNumElements();
585 		default:
586 			UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
587 			return 0;
588 	}
589 }
590 
createFunction(const char * name,::llvm::Type * retTy,const std::vector<::llvm::Type * > & params)591 static ::llvm::Function *createFunction(const char *name, ::llvm::Type *retTy, const std::vector<::llvm::Type *> &params)
592 {
593 	llvm::FunctionType *functionType = llvm::FunctionType::get(retTy, params, false);
594 	auto func = llvm::Function::Create(functionType, llvm::GlobalValue::InternalLinkage, name, jit->module.get());
595 	func->setDoesNotThrow();
596 	func->setCallingConv(llvm::CallingConv::C);
597 	return func;
598 }
599 
// Begins building a new routine. Acquires the codegen mutex (released again
// by the destructor) and creates the file-level JIT builder from the current
// default configuration. No other JIT builder may exist at this point.
Nucleus::Nucleus()
{
	::codegenMutex.lock();  // Reactor and LLVM are currently not thread safe

	ASSERT(jit == nullptr);
	jit.reset(new JITBuilder(Nucleus::getDefaultConfig()));
}
607 
// Destroys the JIT builder (if acquireRoutine() has not already done so) and
// then releases the codegen mutex taken by the constructor.
Nucleus::~Nucleus()
{
	jit.reset();
	::codegenMutex.unlock();
}
613 
setDefaultConfig(const Config & cfg)614 void Nucleus::setDefaultConfig(const Config &cfg)
615 {
616 	std::unique_lock<std::mutex> lock(::defaultConfigLock);
617 	::defaultConfig() = cfg;
618 }
619 
adjustDefaultConfig(const Config::Edit & cfgEdit)620 void Nucleus::adjustDefaultConfig(const Config::Edit &cfgEdit)
621 {
622 	std::unique_lock<std::mutex> lock(::defaultConfigLock);
623 	auto &config = ::defaultConfig();
624 	config = cfgEdit.apply(config);
625 }
626 
getDefaultConfig()627 Config Nucleus::getDefaultConfig()
628 {
629 	std::unique_lock<std::mutex> lock(::defaultConfigLock);
630 	return ::defaultConfig();
631 }
632 
acquireRoutine(const char * name,const Config::Edit & cfgEdit)633 std::shared_ptr<Routine> Nucleus::acquireRoutine(const char *name, const Config::Edit &cfgEdit /* = Config::Edit::None */)
634 {
635 	std::shared_ptr<Routine> routine;
636 
637 	auto acquire = [&]() {
638 		auto cfg = cfgEdit.apply(jit->config);
639 
640 		if(jit->builder->GetInsertBlock()->empty() || !jit->builder->GetInsertBlock()->back().isTerminator())
641 		{
642 			llvm::Type *type = jit->function->getReturnType();
643 
644 			if(type->isVoidTy())
645 			{
646 				createRetVoid();
647 			}
648 			else
649 			{
650 				createRet(V(llvm::UndefValue::get(type)));
651 			}
652 		}
653 
654 #ifdef ENABLE_RR_DEBUG_INFO
655 		if(jit->debugInfo != nullptr)
656 		{
657 			jit->debugInfo->Finalize();
658 		}
659 #endif  // ENABLE_RR_DEBUG_INFO
660 
661 		if(false)
662 		{
663 			std::error_code error;
664 			llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-unopt.txt", error);
665 			jit->module->print(file, 0);
666 		}
667 
668 #if defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
669 		{
670 			llvm::legacy::PassManager pm;
671 			pm.add(llvm::createVerifierPass());
672 			pm.run(*jit->module);
673 		}
674 #endif  // defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
675 
676 		jit->optimize(cfg);
677 
678 		if(false)
679 		{
680 			std::error_code error;
681 			llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-opt.txt", error);
682 			jit->module->print(file, 0);
683 		}
684 
685 		routine = jit->acquireRoutine(&jit->function, 1, cfg);
686 		jit.reset();
687 	};
688 
689 #ifdef JIT_IN_SEPARATE_THREAD
690 	// Perform optimizations and codegen in a separate thread to avoid stack overflow.
691 	// FIXME(b/149829034): This is not a long-term solution. Reactor has no control
692 	// over the threading and stack sizes of its users, so this should be addressed
693 	// at a higher level instead.
694 	std::thread thread(acquire);
695 	thread.join();
696 #else
697 	acquire();
698 #endif
699 
700 	return routine;
701 }
702 
allocateStackVariable(Type * type,int arraySize)703 Value *Nucleus::allocateStackVariable(Type *type, int arraySize)
704 {
705 	// Need to allocate it in the entry block for mem2reg to work
706 	llvm::BasicBlock &entryBlock = jit->function->getEntryBlock();
707 
708 	llvm::Instruction *declaration;
709 
710 	if(arraySize)
711 	{
712 		declaration = new llvm::AllocaInst(T(type), 0, V(Nucleus::createConstantInt(arraySize)));
713 	}
714 	else
715 	{
716 		declaration = new llvm::AllocaInst(T(type), 0, (llvm::Value *)nullptr);
717 	}
718 
719 	entryBlock.getInstList().push_front(declaration);
720 
721 	return V(declaration);
722 }
723 
createBasicBlock()724 BasicBlock *Nucleus::createBasicBlock()
725 {
726 	return B(llvm::BasicBlock::Create(jit->context, "", jit->function));
727 }
728 
getInsertBlock()729 BasicBlock *Nucleus::getInsertBlock()
730 {
731 	return B(jit->builder->GetInsertBlock());
732 }
733 
setInsertBlock(BasicBlock * basicBlock)734 void Nucleus::setInsertBlock(BasicBlock *basicBlock)
735 {
736 	//	assert(jit->builder->GetInsertBlock()->back().isTerminator());
737 
738 	Variable::materializeAll();
739 
740 	jit->builder->SetInsertPoint(B(basicBlock));
741 }
742 
createFunction(Type * ReturnType,const std::vector<Type * > & Params)743 void Nucleus::createFunction(Type *ReturnType, const std::vector<Type *> &Params)
744 {
745 	jit->function = rr::createFunction("", T(ReturnType), T(Params));
746 
747 #ifdef ENABLE_RR_DEBUG_INFO
748 	jit->debugInfo = std::make_unique<DebugInfo>(jit->builder.get(), &jit->context, jit->module.get(), jit->function);
749 #endif  // ENABLE_RR_DEBUG_INFO
750 
751 	jit->builder->SetInsertPoint(llvm::BasicBlock::Create(jit->context, "", jit->function));
752 }
753 
getArgument(unsigned int index)754 Value *Nucleus::getArgument(unsigned int index)
755 {
756 	llvm::Function::arg_iterator args = jit->function->arg_begin();
757 
758 	while(index)
759 	{
760 		args++;
761 		index--;
762 	}
763 
764 	return V(&*args);
765 }
766 
// Emits a 'ret void' terminator. The function being built must have a void
// return type.
void Nucleus::createRetVoid()
{
	RR_DEBUG_INFO_UPDATE_LOC();

	ASSERT_MSG(jit->function->getReturnType() == T(Void::getType()), "Return type mismatch");

	// Code generated after this point is unreachable, so any variables
	// being read can safely return an undefined value. We have to avoid
	// materializing variables after the terminator ret instruction.
	Variable::killUnmaterialized();

	jit->builder->CreateRetVoid();
}
780 
createRet(Value * v)781 void Nucleus::createRet(Value *v)
782 {
783 	RR_DEBUG_INFO_UPDATE_LOC();
784 
785 	ASSERT_MSG(jit->function->getReturnType() == V(v)->getType(), "Return type mismatch");
786 
787 	// Code generated after this point is unreachable, so any variables
788 	// being read can safely return an undefined value. We have to avoid
789 	// materializing variables after the terminator ret instruction.
790 	Variable::killUnmaterialized();
791 
792 	jit->builder->CreateRet(V(v));
793 }
794 
createBr(BasicBlock * dest)795 void Nucleus::createBr(BasicBlock *dest)
796 {
797 	RR_DEBUG_INFO_UPDATE_LOC();
798 	Variable::materializeAll();
799 
800 	jit->builder->CreateBr(B(dest));
801 }
802 
createCondBr(Value * cond,BasicBlock * ifTrue,BasicBlock * ifFalse)803 void Nucleus::createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse)
804 {
805 	RR_DEBUG_INFO_UPDATE_LOC();
806 	Variable::materializeAll();
807 	jit->builder->CreateCondBr(V(cond), B(ifTrue), B(ifFalse));
808 }
809 
createAdd(Value * lhs,Value * rhs)810 Value *Nucleus::createAdd(Value *lhs, Value *rhs)
811 {
812 	RR_DEBUG_INFO_UPDATE_LOC();
813 	return V(jit->builder->CreateAdd(V(lhs), V(rhs)));
814 }
815 
createSub(Value * lhs,Value * rhs)816 Value *Nucleus::createSub(Value *lhs, Value *rhs)
817 {
818 	RR_DEBUG_INFO_UPDATE_LOC();
819 	return V(jit->builder->CreateSub(V(lhs), V(rhs)));
820 }
821 
createMul(Value * lhs,Value * rhs)822 Value *Nucleus::createMul(Value *lhs, Value *rhs)
823 {
824 	RR_DEBUG_INFO_UPDATE_LOC();
825 	return V(jit->builder->CreateMul(V(lhs), V(rhs)));
826 }
827 
createUDiv(Value * lhs,Value * rhs)828 Value *Nucleus::createUDiv(Value *lhs, Value *rhs)
829 {
830 	RR_DEBUG_INFO_UPDATE_LOC();
831 	return V(jit->builder->CreateUDiv(V(lhs), V(rhs)));
832 }
833 
createSDiv(Value * lhs,Value * rhs)834 Value *Nucleus::createSDiv(Value *lhs, Value *rhs)
835 {
836 	RR_DEBUG_INFO_UPDATE_LOC();
837 	return V(jit->builder->CreateSDiv(V(lhs), V(rhs)));
838 }
839 
createFAdd(Value * lhs,Value * rhs)840 Value *Nucleus::createFAdd(Value *lhs, Value *rhs)
841 {
842 	RR_DEBUG_INFO_UPDATE_LOC();
843 	return V(jit->builder->CreateFAdd(V(lhs), V(rhs)));
844 }
845 
createFSub(Value * lhs,Value * rhs)846 Value *Nucleus::createFSub(Value *lhs, Value *rhs)
847 {
848 	RR_DEBUG_INFO_UPDATE_LOC();
849 	return V(jit->builder->CreateFSub(V(lhs), V(rhs)));
850 }
851 
createFMul(Value * lhs,Value * rhs)852 Value *Nucleus::createFMul(Value *lhs, Value *rhs)
853 {
854 	RR_DEBUG_INFO_UPDATE_LOC();
855 	return V(jit->builder->CreateFMul(V(lhs), V(rhs)));
856 }
857 
createFDiv(Value * lhs,Value * rhs)858 Value *Nucleus::createFDiv(Value *lhs, Value *rhs)
859 {
860 	RR_DEBUG_INFO_UPDATE_LOC();
861 	return V(jit->builder->CreateFDiv(V(lhs), V(rhs)));
862 }
863 
createURem(Value * lhs,Value * rhs)864 Value *Nucleus::createURem(Value *lhs, Value *rhs)
865 {
866 	RR_DEBUG_INFO_UPDATE_LOC();
867 	return V(jit->builder->CreateURem(V(lhs), V(rhs)));
868 }
869 
createSRem(Value * lhs,Value * rhs)870 Value *Nucleus::createSRem(Value *lhs, Value *rhs)
871 {
872 	RR_DEBUG_INFO_UPDATE_LOC();
873 	return V(jit->builder->CreateSRem(V(lhs), V(rhs)));
874 }
875 
createFRem(Value * lhs,Value * rhs)876 Value *Nucleus::createFRem(Value *lhs, Value *rhs)
877 {
878 	RR_DEBUG_INFO_UPDATE_LOC();
879 	return V(jit->builder->CreateFRem(V(lhs), V(rhs)));
880 }
881 
operator %(RValue<Float4> lhs,RValue<Float4> rhs)882 RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs)
883 {
884 	return RValue<Float4>(Nucleus::createFRem(lhs.value, rhs.value));
885 }
886 
createShl(Value * lhs,Value * rhs)887 Value *Nucleus::createShl(Value *lhs, Value *rhs)
888 {
889 	RR_DEBUG_INFO_UPDATE_LOC();
890 	return V(jit->builder->CreateShl(V(lhs), V(rhs)));
891 }
892 
createLShr(Value * lhs,Value * rhs)893 Value *Nucleus::createLShr(Value *lhs, Value *rhs)
894 {
895 	RR_DEBUG_INFO_UPDATE_LOC();
896 	return V(jit->builder->CreateLShr(V(lhs), V(rhs)));
897 }
898 
createAShr(Value * lhs,Value * rhs)899 Value *Nucleus::createAShr(Value *lhs, Value *rhs)
900 {
901 	RR_DEBUG_INFO_UPDATE_LOC();
902 	return V(jit->builder->CreateAShr(V(lhs), V(rhs)));
903 }
904 
createAnd(Value * lhs,Value * rhs)905 Value *Nucleus::createAnd(Value *lhs, Value *rhs)
906 {
907 	RR_DEBUG_INFO_UPDATE_LOC();
908 	return V(jit->builder->CreateAnd(V(lhs), V(rhs)));
909 }
910 
createOr(Value * lhs,Value * rhs)911 Value *Nucleus::createOr(Value *lhs, Value *rhs)
912 {
913 	RR_DEBUG_INFO_UPDATE_LOC();
914 	return V(jit->builder->CreateOr(V(lhs), V(rhs)));
915 }
916 
createXor(Value * lhs,Value * rhs)917 Value *Nucleus::createXor(Value *lhs, Value *rhs)
918 {
919 	RR_DEBUG_INFO_UPDATE_LOC();
920 	return V(jit->builder->CreateXor(V(lhs), V(rhs)));
921 }
922 
createNeg(Value * v)923 Value *Nucleus::createNeg(Value *v)
924 {
925 	RR_DEBUG_INFO_UPDATE_LOC();
926 	return V(jit->builder->CreateNeg(V(v)));
927 }
928 
createFNeg(Value * v)929 Value *Nucleus::createFNeg(Value *v)
930 {
931 	RR_DEBUG_INFO_UPDATE_LOC();
932 	return V(jit->builder->CreateFNeg(V(v)));
933 }
934 
createNot(Value * v)935 Value *Nucleus::createNot(Value *v)
936 {
937 	RR_DEBUG_INFO_UPDATE_LOC();
938 	return V(jit->builder->CreateNot(V(v)));
939 }
940 
createLoad(Value * ptr,Type * type,bool isVolatile,unsigned int alignment,bool atomic,std::memory_order memoryOrder)941 Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
942 {
943 	RR_DEBUG_INFO_UPDATE_LOC();
944 	switch(asInternalType(type))
945 	{
946 		case Type_v2i32:
947 		case Type_v4i16:
948 		case Type_v8i8:
949 		case Type_v2f32:
950 			return createBitCast(
951 			    createInsertElement(
952 			        V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::getType()), 2))),
953 			        createLoad(createBitCast(ptr, Pointer<Long>::getType()), Long::getType(), isVolatile, alignment, atomic, memoryOrder),
954 			        0),
955 			    type);
956 		case Type_v2i16:
957 		case Type_v4i8:
958 			if(alignment != 0)  // Not a local variable (all vectors are 128-bit).
959 			{
960 				Value *u = V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::getType()), 2)));
961 				Value *i = createLoad(createBitCast(ptr, Pointer<Int>::getType()), Int::getType(), isVolatile, alignment, atomic, memoryOrder);
962 				i = createZExt(i, Long::getType());
963 				Value *v = createInsertElement(u, i, 0);
964 				return createBitCast(v, type);
965 			}
966 			// Fallthrough to non-emulated case.
967 		case Type_LLVM:
968 		{
969 			auto elTy = T(type);
970 			ASSERT(V(ptr)->getType()->getContainedType(0) == elTy);
971 
972 			if(!atomic)
973 			{
974 				return V(jit->builder->CreateAlignedLoad(V(ptr), alignment, isVolatile));
975 			}
976 			else if(elTy->isIntegerTy() || elTy->isPointerTy())
977 			{
978 				// Integers and pointers can be atomically loaded by setting
979 				// the ordering constraint on the load instruction.
980 				auto load = jit->builder->CreateAlignedLoad(V(ptr), alignment, isVolatile);
981 				load->setAtomic(atomicOrdering(atomic, memoryOrder));
982 				return V(load);
983 			}
984 			else if(elTy->isFloatTy() || elTy->isDoubleTy())
985 			{
986 				// LLVM claims to support atomic loads of float types as
987 				// above, but certain backends cannot deal with this.
988 				// Load as an integer and bitcast. See b/136037244.
989 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
990 				auto elAsIntTy = ::llvm::IntegerType::get(jit->context, size * 8);
991 				auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
992 				auto load = jit->builder->CreateAlignedLoad(ptrCast, alignment, isVolatile);
993 				load->setAtomic(atomicOrdering(atomic, memoryOrder));
994 				auto loadCast = jit->builder->CreateBitCast(load, elTy);
995 				return V(loadCast);
996 			}
997 			else
998 			{
999 				// More exotic types require falling back to the extern:
1000 				// void __atomic_load(size_t size, void *ptr, void *ret, int ordering)
1001 				auto sizetTy = ::llvm::IntegerType::get(jit->context, sizeof(size_t) * 8);
1002 				auto intTy = ::llvm::IntegerType::get(jit->context, sizeof(int) * 8);
1003 				auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
1004 				auto i8PtrTy = i8Ty->getPointerTo();
1005 				auto voidTy = ::llvm::Type::getVoidTy(jit->context);
1006 				auto funcTy = ::llvm::FunctionType::get(voidTy, { sizetTy, i8PtrTy, i8PtrTy, intTy }, false);
1007 				auto func = jit->module->getOrInsertFunction("__atomic_load", funcTy);
1008 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
1009 				auto out = allocateStackVariable(type);
1010 				jit->builder->CreateCall(func, {
1011 				                                   ::llvm::ConstantInt::get(sizetTy, size),
1012 				                                   jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
1013 				                                   jit->builder->CreatePointerCast(V(out), i8PtrTy),
1014 				                                   ::llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
1015 				                               });
1016 				return V(jit->builder->CreateLoad(V(out)));
1017 			}
1018 		}
1019 		default:
1020 			UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
1021 			return nullptr;
1022 	}
1023 }
1024 
createStore(Value * value,Value * ptr,Type * type,bool isVolatile,unsigned int alignment,bool atomic,std::memory_order memoryOrder)1025 Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
1026 {
1027 	RR_DEBUG_INFO_UPDATE_LOC();
1028 	switch(asInternalType(type))
1029 	{
1030 		case Type_v2i32:
1031 		case Type_v4i16:
1032 		case Type_v8i8:
1033 		case Type_v2f32:
1034 			createStore(
1035 			    createExtractElement(
1036 			        createBitCast(value, T(llvm::VectorType::get(T(Long::getType()), 2))), Long::getType(), 0),
1037 			    createBitCast(ptr, Pointer<Long>::getType()),
1038 			    Long::getType(), isVolatile, alignment, atomic, memoryOrder);
1039 			return value;
1040 		case Type_v2i16:
1041 		case Type_v4i8:
1042 			if(alignment != 0)  // Not a local variable (all vectors are 128-bit).
1043 			{
1044 				createStore(
1045 				    createExtractElement(createBitCast(value, Int4::getType()), Int::getType(), 0),
1046 				    createBitCast(ptr, Pointer<Int>::getType()),
1047 				    Int::getType(), isVolatile, alignment, atomic, memoryOrder);
1048 				return value;
1049 			}
1050 			// Fallthrough to non-emulated case.
1051 		case Type_LLVM:
1052 		{
1053 			auto elTy = T(type);
1054 			ASSERT(V(ptr)->getType()->getContainedType(0) == elTy);
1055 
1056 #if __has_feature(memory_sanitizer)
1057 			// Mark all memory writes as initialized by calling __msan_unpoison
1058 			{
1059 				// void __msan_unpoison(const volatile void *a, size_t size)
1060 				auto voidTy = ::llvm::Type::getVoidTy(jit->context);
1061 				auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
1062 				auto voidPtrTy = i8Ty->getPointerTo();
1063 				auto sizetTy = ::llvm::IntegerType::get(jit->context, sizeof(size_t) * 8);
1064 				auto funcTy = ::llvm::FunctionType::get(voidTy, { voidPtrTy, sizetTy }, false);
1065 				auto func = jit->module->getOrInsertFunction("__msan_unpoison", funcTy);
1066 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
1067 				jit->builder->CreateCall(func, { jit->builder->CreatePointerCast(V(ptr), voidPtrTy),
1068 				                                 ::llvm::ConstantInt::get(sizetTy, size) });
1069 			}
1070 #endif
1071 
1072 			if(!atomic)
1073 			{
1074 				jit->builder->CreateAlignedStore(V(value), V(ptr), alignment, isVolatile);
1075 			}
1076 			else if(elTy->isIntegerTy() || elTy->isPointerTy())
1077 			{
1078 				// Integers and pointers can be atomically stored by setting
1079 				// the ordering constraint on the store instruction.
1080 				auto store = jit->builder->CreateAlignedStore(V(value), V(ptr), alignment, isVolatile);
1081 				store->setAtomic(atomicOrdering(atomic, memoryOrder));
1082 			}
1083 			else if(elTy->isFloatTy() || elTy->isDoubleTy())
1084 			{
1085 				// LLVM claims to support atomic stores of float types as
1086 				// above, but certain backends cannot deal with this.
1087 				// Store as an bitcast integer. See b/136037244.
1088 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
1089 				auto elAsIntTy = ::llvm::IntegerType::get(jit->context, size * 8);
1090 				auto valCast = jit->builder->CreateBitCast(V(value), elAsIntTy);
1091 				auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
1092 				auto store = jit->builder->CreateAlignedStore(valCast, ptrCast, alignment, isVolatile);
1093 				store->setAtomic(atomicOrdering(atomic, memoryOrder));
1094 			}
1095 			else
1096 			{
1097 				// More exotic types require falling back to the extern:
1098 				// void __atomic_store(size_t size, void *ptr, void *val, int ordering)
1099 				auto sizetTy = ::llvm::IntegerType::get(jit->context, sizeof(size_t) * 8);
1100 				auto intTy = ::llvm::IntegerType::get(jit->context, sizeof(int) * 8);
1101 				auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
1102 				auto i8PtrTy = i8Ty->getPointerTo();
1103 				auto voidTy = ::llvm::Type::getVoidTy(jit->context);
1104 				auto funcTy = ::llvm::FunctionType::get(voidTy, { sizetTy, i8PtrTy, i8PtrTy, intTy }, false);
1105 				auto func = jit->module->getOrInsertFunction("__atomic_store", funcTy);
1106 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
1107 				auto copy = allocateStackVariable(type);
1108 				jit->builder->CreateStore(V(value), V(copy));
1109 				jit->builder->CreateCall(func, {
1110 				                                   ::llvm::ConstantInt::get(sizetTy, size),
1111 				                                   jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
1112 				                                   jit->builder->CreatePointerCast(V(copy), i8PtrTy),
1113 				                                   ::llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
1114 				                               });
1115 			}
1116 
1117 			return value;
1118 		}
1119 		default:
1120 			UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
1121 			return nullptr;
1122 	}
1123 }
1124 
createMaskedLoad(Value * ptr,Type * elTy,Value * mask,unsigned int alignment,bool zeroMaskedLanes)1125 Value *Nucleus::createMaskedLoad(Value *ptr, Type *elTy, Value *mask, unsigned int alignment, bool zeroMaskedLanes)
1126 {
1127 	RR_DEBUG_INFO_UPDATE_LOC();
1128 
1129 	ASSERT(V(ptr)->getType()->isPointerTy());
1130 	ASSERT(V(mask)->getType()->isVectorTy());
1131 
1132 	auto numEls = V(mask)->getType()->getVectorNumElements();
1133 	auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
1134 	auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
1135 	auto elVecTy = ::llvm::VectorType::get(T(elTy), numEls);
1136 	auto elVecPtrTy = elVecTy->getPointerTo();
1137 	auto i8Mask = jit->builder->CreateIntCast(V(mask), ::llvm::VectorType::get(i1Ty, numEls), false);  // vec<int, int, ...> -> vec<bool, bool, ...>
1138 	auto passthrough = zeroMaskedLanes ? ::llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
1139 	auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
1140 	auto func = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_load, { elVecTy, elVecPtrTy });
1141 	return V(jit->builder->CreateCall(func, { V(ptr), align, i8Mask, passthrough }));
1142 }
1143 
createMaskedStore(Value * ptr,Value * val,Value * mask,unsigned int alignment)1144 void Nucleus::createMaskedStore(Value *ptr, Value *val, Value *mask, unsigned int alignment)
1145 {
1146 	RR_DEBUG_INFO_UPDATE_LOC();
1147 
1148 	ASSERT(V(ptr)->getType()->isPointerTy());
1149 	ASSERT(V(val)->getType()->isVectorTy());
1150 	ASSERT(V(mask)->getType()->isVectorTy());
1151 
1152 	auto numEls = V(mask)->getType()->getVectorNumElements();
1153 	auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
1154 	auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
1155 	auto elVecTy = V(val)->getType();
1156 	auto elVecPtrTy = elVecTy->getPointerTo();
1157 	auto i8Mask = jit->builder->CreateIntCast(V(mask), ::llvm::VectorType::get(i1Ty, numEls), false);  // vec<int, int, ...> -> vec<bool, bool, ...>
1158 	auto align = ::llvm::ConstantInt::get(i32Ty, alignment);
1159 	auto func = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_store, { elVecTy, elVecPtrTy });
1160 	jit->builder->CreateCall(func, { V(val), V(ptr), align, i8Mask });
1161 }
1162 
Gather(RValue<Pointer<Float>> base,RValue<Int4> offsets,RValue<Int4> mask,unsigned int alignment,bool zeroMaskedLanes)1163 RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
1164 {
1165 	return As<Float4>(V(createGather(V(base.value), T(Float::getType()), V(offsets.value), V(mask.value), alignment, zeroMaskedLanes)));
1166 }
1167 
Gather(RValue<Pointer<Int>> base,RValue<Int4> offsets,RValue<Int4> mask,unsigned int alignment,bool zeroMaskedLanes)1168 RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
1169 {
1170 	return As<Int4>(V(createGather(V(base.value), T(Float::getType()), V(offsets.value), V(mask.value), alignment, zeroMaskedLanes)));
1171 }
1172 
Scatter(RValue<Pointer<Float>> base,RValue<Float4> val,RValue<Int4> offsets,RValue<Int4> mask,unsigned int alignment)1173 void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
1174 {
1175 	return createScatter(V(base.value), V(val.value), V(offsets.value), V(mask.value), alignment);
1176 }
1177 
Scatter(RValue<Pointer<Int>> base,RValue<Int4> val,RValue<Int4> offsets,RValue<Int4> mask,unsigned int alignment)1178 void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
1179 {
1180 	return createScatter(V(base.value), V(val.value), V(offsets.value), V(mask.value), alignment);
1181 }
1182 
createFence(std::memory_order memoryOrder)1183 void Nucleus::createFence(std::memory_order memoryOrder)
1184 {
1185 	RR_DEBUG_INFO_UPDATE_LOC();
1186 	jit->builder->CreateFence(atomicOrdering(true, memoryOrder));
1187 }
1188 
createGEP(Value * ptr,Type * type,Value * index,bool unsignedIndex)1189 Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex)
1190 {
1191 	RR_DEBUG_INFO_UPDATE_LOC();
1192 	ASSERT(V(ptr)->getType()->getContainedType(0) == T(type));
1193 	if(sizeof(void *) == 8)
1194 	{
1195 		// LLVM manual: "When indexing into an array, pointer or vector,
1196 		// integers of any width are allowed, and they are not required to
1197 		// be constant. These integers are treated as signed values where
1198 		// relevant."
1199 		//
1200 		// Thus if we want indexes to be treated as unsigned we have to
1201 		// zero-extend them ourselves.
1202 		//
1203 		// Note that this is not because we want to address anywhere near
1204 		// 4 GB of data. Instead this is important for performance because
1205 		// x86 supports automatic zero-extending of 32-bit registers to
1206 		// 64-bit. Thus when indexing into an array using a uint32 is
1207 		// actually faster than an int32.
1208 		index = unsignedIndex ? createZExt(index, Long::getType()) : createSExt(index, Long::getType());
1209 	}
1210 
1211 	// For non-emulated types we can rely on LLVM's GEP to calculate the
1212 	// effective address correctly.
1213 	if(asInternalType(type) == Type_LLVM)
1214 	{
1215 		return V(jit->builder->CreateGEP(V(ptr), V(index)));
1216 	}
1217 
1218 	// For emulated types we have to multiply the index by the intended
1219 	// type size ourselves to obain the byte offset.
1220 	index = (sizeof(void *) == 8) ? createMul(index, createConstantLong((int64_t)typeSize(type))) : createMul(index, createConstantInt((int)typeSize(type)));
1221 
1222 	// Cast to a byte pointer, apply the byte offset, and cast back to the
1223 	// original pointer type.
1224 	return createBitCast(
1225 	    V(jit->builder->CreateGEP(V(createBitCast(ptr, T(llvm::PointerType::get(T(Byte::getType()), 0)))), V(index))),
1226 	    T(llvm::PointerType::get(T(type), 0)));
1227 }
1228 
createAtomicAdd(Value * ptr,Value * value,std::memory_order memoryOrder)1229 Value *Nucleus::createAtomicAdd(Value *ptr, Value *value, std::memory_order memoryOrder)
1230 {
1231 	RR_DEBUG_INFO_UPDATE_LOC();
1232 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Add, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
1233 }
1234 
createAtomicSub(Value * ptr,Value * value,std::memory_order memoryOrder)1235 Value *Nucleus::createAtomicSub(Value *ptr, Value *value, std::memory_order memoryOrder)
1236 {
1237 	RR_DEBUG_INFO_UPDATE_LOC();
1238 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Sub, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
1239 }
1240 
createAtomicAnd(Value * ptr,Value * value,std::memory_order memoryOrder)1241 Value *Nucleus::createAtomicAnd(Value *ptr, Value *value, std::memory_order memoryOrder)
1242 {
1243 	RR_DEBUG_INFO_UPDATE_LOC();
1244 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::And, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
1245 }
1246 
createAtomicOr(Value * ptr,Value * value,std::memory_order memoryOrder)1247 Value *Nucleus::createAtomicOr(Value *ptr, Value *value, std::memory_order memoryOrder)
1248 {
1249 	RR_DEBUG_INFO_UPDATE_LOC();
1250 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Or, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
1251 }
1252 
createAtomicXor(Value * ptr,Value * value,std::memory_order memoryOrder)1253 Value *Nucleus::createAtomicXor(Value *ptr, Value *value, std::memory_order memoryOrder)
1254 {
1255 	RR_DEBUG_INFO_UPDATE_LOC();
1256 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xor, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
1257 }
1258 
createAtomicMin(Value * ptr,Value * value,std::memory_order memoryOrder)1259 Value *Nucleus::createAtomicMin(Value *ptr, Value *value, std::memory_order memoryOrder)
1260 {
1261 	RR_DEBUG_INFO_UPDATE_LOC();
1262 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Min, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
1263 }
1264 
createAtomicMax(Value * ptr,Value * value,std::memory_order memoryOrder)1265 Value *Nucleus::createAtomicMax(Value *ptr, Value *value, std::memory_order memoryOrder)
1266 {
1267 	RR_DEBUG_INFO_UPDATE_LOC();
1268 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Max, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
1269 }
1270 
createAtomicUMin(Value * ptr,Value * value,std::memory_order memoryOrder)1271 Value *Nucleus::createAtomicUMin(Value *ptr, Value *value, std::memory_order memoryOrder)
1272 {
1273 	RR_DEBUG_INFO_UPDATE_LOC();
1274 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMin, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
1275 }
1276 
createAtomicUMax(Value * ptr,Value * value,std::memory_order memoryOrder)1277 Value *Nucleus::createAtomicUMax(Value *ptr, Value *value, std::memory_order memoryOrder)
1278 {
1279 	RR_DEBUG_INFO_UPDATE_LOC();
1280 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMax, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
1281 }
1282 
createAtomicExchange(Value * ptr,Value * value,std::memory_order memoryOrder)1283 Value *Nucleus::createAtomicExchange(Value *ptr, Value *value, std::memory_order memoryOrder)
1284 {
1285 	RR_DEBUG_INFO_UPDATE_LOC();
1286 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, V(ptr), V(value), atomicOrdering(true, memoryOrder)));
1287 }
1288 
createAtomicCompareExchange(Value * ptr,Value * value,Value * compare,std::memory_order memoryOrderEqual,std::memory_order memoryOrderUnequal)1289 Value *Nucleus::createAtomicCompareExchange(Value *ptr, Value *value, Value *compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal)
1290 {
1291 	RR_DEBUG_INFO_UPDATE_LOC();
1292 	// Note: AtomicCmpXchgInstruction returns a 2-member struct containing {result, success-flag}, not the result directly.
1293 	return V(jit->builder->CreateExtractValue(
1294 	    jit->builder->CreateAtomicCmpXchg(V(ptr), V(compare), V(value), atomicOrdering(true, memoryOrderEqual), atomicOrdering(true, memoryOrderUnequal)),
1295 	    llvm::ArrayRef<unsigned>(0u)));
1296 }
1297 
createTrunc(Value * v,Type * destType)1298 Value *Nucleus::createTrunc(Value *v, Type *destType)
1299 {
1300 	RR_DEBUG_INFO_UPDATE_LOC();
1301 	return V(jit->builder->CreateTrunc(V(v), T(destType)));
1302 }
1303 
createZExt(Value * v,Type * destType)1304 Value *Nucleus::createZExt(Value *v, Type *destType)
1305 {
1306 	RR_DEBUG_INFO_UPDATE_LOC();
1307 	return V(jit->builder->CreateZExt(V(v), T(destType)));
1308 }
1309 
createSExt(Value * v,Type * destType)1310 Value *Nucleus::createSExt(Value *v, Type *destType)
1311 {
1312 	RR_DEBUG_INFO_UPDATE_LOC();
1313 	return V(jit->builder->CreateSExt(V(v), T(destType)));
1314 }
1315 
createFPToUI(Value * v,Type * destType)1316 Value *Nucleus::createFPToUI(Value *v, Type *destType)
1317 {
1318 	RR_DEBUG_INFO_UPDATE_LOC();
1319 	return V(jit->builder->CreateFPToUI(V(v), T(destType)));
1320 }
1321 
createFPToSI(Value * v,Type * destType)1322 Value *Nucleus::createFPToSI(Value *v, Type *destType)
1323 {
1324 	RR_DEBUG_INFO_UPDATE_LOC();
1325 	return V(jit->builder->CreateFPToSI(V(v), T(destType)));
1326 }
1327 
createSIToFP(Value * v,Type * destType)1328 Value *Nucleus::createSIToFP(Value *v, Type *destType)
1329 {
1330 	RR_DEBUG_INFO_UPDATE_LOC();
1331 	return V(jit->builder->CreateSIToFP(V(v), T(destType)));
1332 }
1333 
createFPTrunc(Value * v,Type * destType)1334 Value *Nucleus::createFPTrunc(Value *v, Type *destType)
1335 {
1336 	RR_DEBUG_INFO_UPDATE_LOC();
1337 	return V(jit->builder->CreateFPTrunc(V(v), T(destType)));
1338 }
1339 
createFPExt(Value * v,Type * destType)1340 Value *Nucleus::createFPExt(Value *v, Type *destType)
1341 {
1342 	RR_DEBUG_INFO_UPDATE_LOC();
1343 	return V(jit->builder->CreateFPExt(V(v), T(destType)));
1344 }
1345 
createBitCast(Value * v,Type * destType)1346 Value *Nucleus::createBitCast(Value *v, Type *destType)
1347 {
1348 	RR_DEBUG_INFO_UPDATE_LOC();
1349 	// Bitcasts must be between types of the same logical size. But with emulated narrow vectors we need
1350 	// support for casting between scalars and wide vectors. Emulate them by writing to the stack and
1351 	// reading back as the destination type.
1352 	if(!V(v)->getType()->isVectorTy() && T(destType)->isVectorTy())
1353 	{
1354 		Value *readAddress = allocateStackVariable(destType);
1355 		Value *writeAddress = createBitCast(readAddress, T(llvm::PointerType::get(V(v)->getType(), 0)));
1356 		createStore(v, writeAddress, T(V(v)->getType()));
1357 		return createLoad(readAddress, destType);
1358 	}
1359 	else if(V(v)->getType()->isVectorTy() && !T(destType)->isVectorTy())
1360 	{
1361 		Value *writeAddress = allocateStackVariable(T(V(v)->getType()));
1362 		createStore(v, writeAddress, T(V(v)->getType()));
1363 		Value *readAddress = createBitCast(writeAddress, T(llvm::PointerType::get(T(destType), 0)));
1364 		return createLoad(readAddress, destType);
1365 	}
1366 
1367 	return V(jit->builder->CreateBitCast(V(v), T(destType)));
1368 }
1369 
createPtrEQ(Value * lhs,Value * rhs)1370 Value *Nucleus::createPtrEQ(Value *lhs, Value *rhs)
1371 {
1372 	RR_DEBUG_INFO_UPDATE_LOC();
1373 	return V(jit->builder->CreateICmpEQ(V(lhs), V(rhs)));
1374 }
1375 
createICmpEQ(Value * lhs,Value * rhs)1376 Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
1377 {
1378 	RR_DEBUG_INFO_UPDATE_LOC();
1379 	return V(jit->builder->CreateICmpEQ(V(lhs), V(rhs)));
1380 }
1381 
createICmpNE(Value * lhs,Value * rhs)1382 Value *Nucleus::createICmpNE(Value *lhs, Value *rhs)
1383 {
1384 	RR_DEBUG_INFO_UPDATE_LOC();
1385 	return V(jit->builder->CreateICmpNE(V(lhs), V(rhs)));
1386 }
1387 
createICmpUGT(Value * lhs,Value * rhs)1388 Value *Nucleus::createICmpUGT(Value *lhs, Value *rhs)
1389 {
1390 	RR_DEBUG_INFO_UPDATE_LOC();
1391 	return V(jit->builder->CreateICmpUGT(V(lhs), V(rhs)));
1392 }
1393 
createICmpUGE(Value * lhs,Value * rhs)1394 Value *Nucleus::createICmpUGE(Value *lhs, Value *rhs)
1395 {
1396 	RR_DEBUG_INFO_UPDATE_LOC();
1397 	return V(jit->builder->CreateICmpUGE(V(lhs), V(rhs)));
1398 }
1399 
createICmpULT(Value * lhs,Value * rhs)1400 Value *Nucleus::createICmpULT(Value *lhs, Value *rhs)
1401 {
1402 	RR_DEBUG_INFO_UPDATE_LOC();
1403 	return V(jit->builder->CreateICmpULT(V(lhs), V(rhs)));
1404 }
1405 
createICmpULE(Value * lhs,Value * rhs)1406 Value *Nucleus::createICmpULE(Value *lhs, Value *rhs)
1407 {
1408 	RR_DEBUG_INFO_UPDATE_LOC();
1409 	return V(jit->builder->CreateICmpULE(V(lhs), V(rhs)));
1410 }
1411 
createICmpSGT(Value * lhs,Value * rhs)1412 Value *Nucleus::createICmpSGT(Value *lhs, Value *rhs)
1413 {
1414 	RR_DEBUG_INFO_UPDATE_LOC();
1415 	return V(jit->builder->CreateICmpSGT(V(lhs), V(rhs)));
1416 }
1417 
createICmpSGE(Value * lhs,Value * rhs)1418 Value *Nucleus::createICmpSGE(Value *lhs, Value *rhs)
1419 {
1420 	RR_DEBUG_INFO_UPDATE_LOC();
1421 	return V(jit->builder->CreateICmpSGE(V(lhs), V(rhs)));
1422 }
1423 
createICmpSLT(Value * lhs,Value * rhs)1424 Value *Nucleus::createICmpSLT(Value *lhs, Value *rhs)
1425 {
1426 	RR_DEBUG_INFO_UPDATE_LOC();
1427 	return V(jit->builder->CreateICmpSLT(V(lhs), V(rhs)));
1428 }
1429 
createICmpSLE(Value * lhs,Value * rhs)1430 Value *Nucleus::createICmpSLE(Value *lhs, Value *rhs)
1431 {
1432 	RR_DEBUG_INFO_UPDATE_LOC();
1433 	return V(jit->builder->CreateICmpSLE(V(lhs), V(rhs)));
1434 }
1435 
createFCmpOEQ(Value * lhs,Value * rhs)1436 Value *Nucleus::createFCmpOEQ(Value *lhs, Value *rhs)
1437 {
1438 	RR_DEBUG_INFO_UPDATE_LOC();
1439 	return V(jit->builder->CreateFCmpOEQ(V(lhs), V(rhs)));
1440 }
1441 
createFCmpOGT(Value * lhs,Value * rhs)1442 Value *Nucleus::createFCmpOGT(Value *lhs, Value *rhs)
1443 {
1444 	RR_DEBUG_INFO_UPDATE_LOC();
1445 	return V(jit->builder->CreateFCmpOGT(V(lhs), V(rhs)));
1446 }
1447 
createFCmpOGE(Value * lhs,Value * rhs)1448 Value *Nucleus::createFCmpOGE(Value *lhs, Value *rhs)
1449 {
1450 	RR_DEBUG_INFO_UPDATE_LOC();
1451 	return V(jit->builder->CreateFCmpOGE(V(lhs), V(rhs)));
1452 }
1453 
createFCmpOLT(Value * lhs,Value * rhs)1454 Value *Nucleus::createFCmpOLT(Value *lhs, Value *rhs)
1455 {
1456 	RR_DEBUG_INFO_UPDATE_LOC();
1457 	return V(jit->builder->CreateFCmpOLT(V(lhs), V(rhs)));
1458 }
1459 
createFCmpOLE(Value * lhs,Value * rhs)1460 Value *Nucleus::createFCmpOLE(Value *lhs, Value *rhs)
1461 {
1462 	RR_DEBUG_INFO_UPDATE_LOC();
1463 	return V(jit->builder->CreateFCmpOLE(V(lhs), V(rhs)));
1464 }
1465 
createFCmpONE(Value * lhs,Value * rhs)1466 Value *Nucleus::createFCmpONE(Value *lhs, Value *rhs)
1467 {
1468 	RR_DEBUG_INFO_UPDATE_LOC();
1469 	return V(jit->builder->CreateFCmpONE(V(lhs), V(rhs)));
1470 }
1471 
createFCmpORD(Value * lhs,Value * rhs)1472 Value *Nucleus::createFCmpORD(Value *lhs, Value *rhs)
1473 {
1474 	RR_DEBUG_INFO_UPDATE_LOC();
1475 	return V(jit->builder->CreateFCmpORD(V(lhs), V(rhs)));
1476 }
1477 
createFCmpUNO(Value * lhs,Value * rhs)1478 Value *Nucleus::createFCmpUNO(Value *lhs, Value *rhs)
1479 {
1480 	RR_DEBUG_INFO_UPDATE_LOC();
1481 	return V(jit->builder->CreateFCmpUNO(V(lhs), V(rhs)));
1482 }
1483 
createFCmpUEQ(Value * lhs,Value * rhs)1484 Value *Nucleus::createFCmpUEQ(Value *lhs, Value *rhs)
1485 {
1486 	RR_DEBUG_INFO_UPDATE_LOC();
1487 	return V(jit->builder->CreateFCmpUEQ(V(lhs), V(rhs)));
1488 }
1489 
createFCmpUGT(Value * lhs,Value * rhs)1490 Value *Nucleus::createFCmpUGT(Value *lhs, Value *rhs)
1491 {
1492 	RR_DEBUG_INFO_UPDATE_LOC();
1493 	return V(jit->builder->CreateFCmpUGT(V(lhs), V(rhs)));
1494 }
1495 
createFCmpUGE(Value * lhs,Value * rhs)1496 Value *Nucleus::createFCmpUGE(Value *lhs, Value *rhs)
1497 {
1498 	RR_DEBUG_INFO_UPDATE_LOC();
1499 	return V(jit->builder->CreateFCmpUGE(V(lhs), V(rhs)));
1500 }
1501 
createFCmpULT(Value * lhs,Value * rhs)1502 Value *Nucleus::createFCmpULT(Value *lhs, Value *rhs)
1503 {
1504 	RR_DEBUG_INFO_UPDATE_LOC();
1505 	return V(jit->builder->CreateFCmpULT(V(lhs), V(rhs)));
1506 }
1507 
createFCmpULE(Value * lhs,Value * rhs)1508 Value *Nucleus::createFCmpULE(Value *lhs, Value *rhs)
1509 {
1510 	RR_DEBUG_INFO_UPDATE_LOC();
1511 	return V(jit->builder->CreateFCmpULE(V(lhs), V(rhs)));
1512 }
1513 
createFCmpUNE(Value * lhs,Value * rhs)1514 Value *Nucleus::createFCmpUNE(Value *lhs, Value *rhs)
1515 {
1516 	RR_DEBUG_INFO_UPDATE_LOC();
1517 	return V(jit->builder->CreateFCmpUNE(V(lhs), V(rhs)));
1518 }
1519 
createExtractElement(Value * vector,Type * type,int index)1520 Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)
1521 {
1522 	RR_DEBUG_INFO_UPDATE_LOC();
1523 	ASSERT(V(vector)->getType()->getContainedType(0) == T(type));
1524 	return V(jit->builder->CreateExtractElement(V(vector), V(createConstantInt(index))));
1525 }
1526 
createInsertElement(Value * vector,Value * element,int index)1527 Value *Nucleus::createInsertElement(Value *vector, Value *element, int index)
1528 {
1529 	RR_DEBUG_INFO_UPDATE_LOC();
1530 	return V(jit->builder->CreateInsertElement(V(vector), V(element), V(createConstantInt(index))));
1531 }
1532 
createShuffleVector(Value * v1,Value * v2,const int * select)1533 Value *Nucleus::createShuffleVector(Value *v1, Value *v2, const int *select)
1534 {
1535 	RR_DEBUG_INFO_UPDATE_LOC();
1536 
1537 	int size = llvm::cast<llvm::VectorType>(V(v1)->getType())->getNumElements();
1538 	const int maxSize = 16;
1539 	llvm::Constant *swizzle[maxSize];
1540 	ASSERT(size <= maxSize);
1541 
1542 	for(int i = 0; i < size; i++)
1543 	{
1544 		swizzle[i] = llvm::ConstantInt::get(llvm::Type::getInt32Ty(jit->context), select[i]);
1545 	}
1546 
1547 	llvm::Value *shuffle = llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(swizzle, size));
1548 
1549 	return V(jit->builder->CreateShuffleVector(V(v1), V(v2), shuffle));
1550 }
1551 
createSelect(Value * c,Value * ifTrue,Value * ifFalse)1552 Value *Nucleus::createSelect(Value *c, Value *ifTrue, Value *ifFalse)
1553 {
1554 	RR_DEBUG_INFO_UPDATE_LOC();
1555 	return V(jit->builder->CreateSelect(V(c), V(ifTrue), V(ifFalse)));
1556 }
1557 
createSwitch(Value * control,BasicBlock * defaultBranch,unsigned numCases)1558 SwitchCases *Nucleus::createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases)
1559 {
1560 	RR_DEBUG_INFO_UPDATE_LOC();
1561 	return reinterpret_cast<SwitchCases *>(jit->builder->CreateSwitch(V(control), B(defaultBranch), numCases));
1562 }
1563 
addSwitchCase(SwitchCases * switchCases,int label,BasicBlock * branch)1564 void Nucleus::addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch)
1565 {
1566 	RR_DEBUG_INFO_UPDATE_LOC();
1567 	llvm::SwitchInst *sw = reinterpret_cast<llvm::SwitchInst *>(switchCases);
1568 	sw->addCase(llvm::ConstantInt::get(llvm::Type::getInt32Ty(jit->context), label, true), B(branch));
1569 }
1570 
// Emits an 'unreachable' instruction at the builder's current insertion point.
void Nucleus::createUnreachable()
{
	RR_DEBUG_INFO_UPDATE_LOC();
	jit->builder->CreateUnreachable();
}
1576 
getType(Value * value)1577 Type *Nucleus::getType(Value *value)
1578 {
1579 	return T(V(value)->getType());
1580 }
1581 
getContainedType(Type * vectorType)1582 Type *Nucleus::getContainedType(Type *vectorType)
1583 {
1584 	return T(T(vectorType)->getContainedType(0));
1585 }
1586 
getPointerType(Type * ElementType)1587 Type *Nucleus::getPointerType(Type *ElementType)
1588 {
1589 	return T(llvm::PointerType::get(T(ElementType), 0));
1590 }
1591 
getNaturalIntType()1592 static ::llvm::Type *getNaturalIntType()
1593 {
1594 	return ::llvm::Type::getIntNTy(jit->context, sizeof(int) * 8);
1595 }
1596 
getPrintfStorageType(Type * valueType)1597 Type *Nucleus::getPrintfStorageType(Type *valueType)
1598 {
1599 	llvm::Type *valueTy = T(valueType);
1600 	if(valueTy->isIntegerTy())
1601 	{
1602 		return T(getNaturalIntType());
1603 	}
1604 	if(valueTy->isFloatTy())
1605 	{
1606 		return T(llvm::Type::getDoubleTy(jit->context));
1607 	}
1608 
1609 	UNIMPLEMENTED_NO_BUG("getPrintfStorageType: add more cases as needed");
1610 	return {};
1611 }
1612 
createNullValue(Type * Ty)1613 Value *Nucleus::createNullValue(Type *Ty)
1614 {
1615 	RR_DEBUG_INFO_UPDATE_LOC();
1616 	return V(llvm::Constant::getNullValue(T(Ty)));
1617 }
1618 
createConstantLong(int64_t i)1619 Value *Nucleus::createConstantLong(int64_t i)
1620 {
1621 	RR_DEBUG_INFO_UPDATE_LOC();
1622 	return V(llvm::ConstantInt::get(llvm::Type::getInt64Ty(jit->context), i, true));
1623 }
1624 
createConstantInt(int i)1625 Value *Nucleus::createConstantInt(int i)
1626 {
1627 	RR_DEBUG_INFO_UPDATE_LOC();
1628 	return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(jit->context), i, true));
1629 }
1630 
createConstantInt(unsigned int i)1631 Value *Nucleus::createConstantInt(unsigned int i)
1632 {
1633 	RR_DEBUG_INFO_UPDATE_LOC();
1634 	return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(jit->context), i, false));
1635 }
1636 
createConstantBool(bool b)1637 Value *Nucleus::createConstantBool(bool b)
1638 {
1639 	RR_DEBUG_INFO_UPDATE_LOC();
1640 	return V(llvm::ConstantInt::get(llvm::Type::getInt1Ty(jit->context), b));
1641 }
1642 
createConstantByte(signed char i)1643 Value *Nucleus::createConstantByte(signed char i)
1644 {
1645 	RR_DEBUG_INFO_UPDATE_LOC();
1646 	return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(jit->context), i, true));
1647 }
1648 
createConstantByte(unsigned char i)1649 Value *Nucleus::createConstantByte(unsigned char i)
1650 {
1651 	RR_DEBUG_INFO_UPDATE_LOC();
1652 	return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(jit->context), i, false));
1653 }
1654 
createConstantShort(short i)1655 Value *Nucleus::createConstantShort(short i)
1656 {
1657 	RR_DEBUG_INFO_UPDATE_LOC();
1658 	return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(jit->context), i, true));
1659 }
1660 
createConstantShort(unsigned short i)1661 Value *Nucleus::createConstantShort(unsigned short i)
1662 {
1663 	RR_DEBUG_INFO_UPDATE_LOC();
1664 	return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(jit->context), i, false));
1665 }
1666 
createConstantFloat(float x)1667 Value *Nucleus::createConstantFloat(float x)
1668 {
1669 	RR_DEBUG_INFO_UPDATE_LOC();
1670 	return V(llvm::ConstantFP::get(T(Float::getType()), x));
1671 }
1672 
createNullPointer(Type * Ty)1673 Value *Nucleus::createNullPointer(Type *Ty)
1674 {
1675 	RR_DEBUG_INFO_UPDATE_LOC();
1676 	return V(llvm::ConstantPointerNull::get(llvm::PointerType::get(T(Ty), 0)));
1677 }
1678 
createConstantVector(const int64_t * constants,Type * type)1679 Value *Nucleus::createConstantVector(const int64_t *constants, Type *type)
1680 {
1681 	RR_DEBUG_INFO_UPDATE_LOC();
1682 	ASSERT(llvm::isa<llvm::VectorType>(T(type)));
1683 	const int numConstants = elementCount(type);                                      // Number of provided constants for the (emulated) type.
1684 	const int numElements = llvm::cast<llvm::VectorType>(T(type))->getNumElements();  // Number of elements of the underlying vector type.
1685 	ASSERT(numElements <= 16 && numConstants <= numElements);
1686 	llvm::Constant *constantVector[16];
1687 
1688 	for(int i = 0; i < numElements; i++)
1689 	{
1690 		constantVector[i] = llvm::ConstantInt::get(T(type)->getContainedType(0), constants[i % numConstants]);
1691 	}
1692 
1693 	return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(constantVector, numElements)));
1694 }
1695 
createConstantVector(const double * constants,Type * type)1696 Value *Nucleus::createConstantVector(const double *constants, Type *type)
1697 {
1698 	RR_DEBUG_INFO_UPDATE_LOC();
1699 	ASSERT(llvm::isa<llvm::VectorType>(T(type)));
1700 	const int numConstants = elementCount(type);                                      // Number of provided constants for the (emulated) type.
1701 	const int numElements = llvm::cast<llvm::VectorType>(T(type))->getNumElements();  // Number of elements of the underlying vector type.
1702 	ASSERT(numElements <= 8 && numConstants <= numElements);
1703 	llvm::Constant *constantVector[8];
1704 
1705 	for(int i = 0; i < numElements; i++)
1706 	{
1707 		constantVector[i] = llvm::ConstantFP::get(T(type)->getContainedType(0), constants[i % numConstants]);
1708 	}
1709 
1710 	return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(constantVector, numElements)));
1711 }
1712 
createConstantString(const char * v)1713 Value *Nucleus::createConstantString(const char *v)
1714 {
1715 	// NOTE: Do not call RR_DEBUG_INFO_UPDATE_LOC() here to avoid recursion when called from rr::Printv
1716 	auto ptr = jit->builder->CreateGlobalStringPtr(v);
1717 	return V(ptr);
1718 }
1719 
getType()1720 Type *Void::getType()
1721 {
1722 	return T(llvm::Type::getVoidTy(jit->context));
1723 }
1724 
getType()1725 Type *Bool::getType()
1726 {
1727 	return T(llvm::Type::getInt1Ty(jit->context));
1728 }
1729 
getType()1730 Type *Byte::getType()
1731 {
1732 	return T(llvm::Type::getInt8Ty(jit->context));
1733 }
1734 
getType()1735 Type *SByte::getType()
1736 {
1737 	return T(llvm::Type::getInt8Ty(jit->context));
1738 }
1739 
getType()1740 Type *Short::getType()
1741 {
1742 	return T(llvm::Type::getInt16Ty(jit->context));
1743 }
1744 
getType()1745 Type *UShort::getType()
1746 {
1747 	return T(llvm::Type::getInt16Ty(jit->context));
1748 }
1749 
// Byte4 maps to the internal Type_v4i8 type identifier.
Type *Byte4::getType()
{
	return T(Type_v4i8);
}
1754 
// SByte4 maps to the internal Type_v4i8 type identifier (same as Byte4).
Type *SByte4::getType()
{
	return T(Type_v4i8);
}
1759 
AddSat(RValue<Byte8> x,RValue<Byte8> y)1760 RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
1761 {
1762 	RR_DEBUG_INFO_UPDATE_LOC();
1763 #if defined(__i386__) || defined(__x86_64__)
1764 	return x86::paddusb(x, y);
1765 #else
1766 	return As<Byte8>(V(lowerPUADDSAT(V(x.value), V(y.value))));
1767 #endif
1768 }
1769 
SubSat(RValue<Byte8> x,RValue<Byte8> y)1770 RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
1771 {
1772 	RR_DEBUG_INFO_UPDATE_LOC();
1773 #if defined(__i386__) || defined(__x86_64__)
1774 	return x86::psubusb(x, y);
1775 #else
1776 	return As<Byte8>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
1777 #endif
1778 }
1779 
SignMask(RValue<Byte8> x)1780 RValue<Int> SignMask(RValue<Byte8> x)
1781 {
1782 	RR_DEBUG_INFO_UPDATE_LOC();
1783 #if defined(__i386__) || defined(__x86_64__)
1784 	return x86::pmovmskb(x);
1785 #else
1786 	return As<Int>(V(lowerSignMask(V(x.value), T(Int::getType()))));
1787 #endif
1788 }
1789 
1790 //	RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y)
1791 //	{
1792 //#if defined(__i386__) || defined(__x86_64__)
1793 //		return x86::pcmpgtb(x, y);   // FIXME: Signedness
1794 //#else
1795 //		return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Byte8::getType()))));
1796 //#endif
1797 //	}
1798 
CmpEQ(RValue<Byte8> x,RValue<Byte8> y)1799 RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
1800 {
1801 	RR_DEBUG_INFO_UPDATE_LOC();
1802 #if defined(__i386__) || defined(__x86_64__)
1803 	return x86::pcmpeqb(x, y);
1804 #else
1805 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Byte8::getType()))));
1806 #endif
1807 }
1808 
// Byte8 maps to the internal Type_v8i8 type identifier.
Type *Byte8::getType()
{
	return T(Type_v8i8);
}
1813 
AddSat(RValue<SByte8> x,RValue<SByte8> y)1814 RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
1815 {
1816 	RR_DEBUG_INFO_UPDATE_LOC();
1817 #if defined(__i386__) || defined(__x86_64__)
1818 	return x86::paddsb(x, y);
1819 #else
1820 	return As<SByte8>(V(lowerPSADDSAT(V(x.value), V(y.value))));
1821 #endif
1822 }
1823 
SubSat(RValue<SByte8> x,RValue<SByte8> y)1824 RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
1825 {
1826 	RR_DEBUG_INFO_UPDATE_LOC();
1827 #if defined(__i386__) || defined(__x86_64__)
1828 	return x86::psubsb(x, y);
1829 #else
1830 	return As<SByte8>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
1831 #endif
1832 }
1833 
SignMask(RValue<SByte8> x)1834 RValue<Int> SignMask(RValue<SByte8> x)
1835 {
1836 	RR_DEBUG_INFO_UPDATE_LOC();
1837 #if defined(__i386__) || defined(__x86_64__)
1838 	return x86::pmovmskb(As<Byte8>(x));
1839 #else
1840 	return As<Int>(V(lowerSignMask(V(x.value), T(Int::getType()))));
1841 #endif
1842 }
1843 
CmpGT(RValue<SByte8> x,RValue<SByte8> y)1844 RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
1845 {
1846 	RR_DEBUG_INFO_UPDATE_LOC();
1847 #if defined(__i386__) || defined(__x86_64__)
1848 	return x86::pcmpgtb(x, y);
1849 #else
1850 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Byte8::getType()))));
1851 #endif
1852 }
1853 
CmpEQ(RValue<SByte8> x,RValue<SByte8> y)1854 RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
1855 {
1856 	RR_DEBUG_INFO_UPDATE_LOC();
1857 #if defined(__i386__) || defined(__x86_64__)
1858 	return x86::pcmpeqb(As<Byte8>(x), As<Byte8>(y));
1859 #else
1860 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Byte8::getType()))));
1861 #endif
1862 }
1863 
// SByte8 maps to the internal Type_v8i8 type identifier (same as Byte8).
Type *SByte8::getType()
{
	return T(Type_v8i8);
}
1868 
getType()1869 Type *Byte16::getType()
1870 {
1871 	return T(llvm::VectorType::get(T(Byte::getType()), 16));
1872 }
1873 
getType()1874 Type *SByte16::getType()
1875 {
1876 	return T(llvm::VectorType::get(T(SByte::getType()), 16));
1877 }
1878 
// Short2 maps to the internal Type_v2i16 type identifier.
Type *Short2::getType()
{
	return T(Type_v2i16);
}
1883 
// UShort2 maps to the internal Type_v2i16 type identifier (same as Short2).
Type *UShort2::getType()
{
	return T(Type_v2i16);
}
1888 
Short4(RValue<Int4> cast)1889 Short4::Short4(RValue<Int4> cast)
1890 {
1891 	RR_DEBUG_INFO_UPDATE_LOC();
1892 	int select[8] = { 0, 2, 4, 6, 0, 2, 4, 6 };
1893 	Value *short8 = Nucleus::createBitCast(cast.value, Short8::getType());
1894 
1895 	Value *packed = Nucleus::createShuffleVector(short8, short8, select);
1896 	Value *short4 = As<Short4>(Int2(As<Int4>(packed))).value;
1897 
1898 	storeValue(short4);
1899 }
1900 
1901 //	Short4::Short4(RValue<Float> cast)
1902 //	{
1903 //	}
1904 
Short4(RValue<Float4> cast)1905 Short4::Short4(RValue<Float4> cast)
1906 {
1907 	RR_DEBUG_INFO_UPDATE_LOC();
1908 	Int4 v4i32 = Int4(cast);
1909 #if defined(__i386__) || defined(__x86_64__)
1910 	v4i32 = As<Int4>(x86::packssdw(v4i32, v4i32));
1911 #else
1912 	Value *v = v4i32.loadValue();
1913 	v4i32 = As<Int4>(V(lowerPack(V(v), V(v), true)));
1914 #endif
1915 
1916 	storeValue(As<Short4>(Int2(v4i32)).value);
1917 }
1918 
operator <<(RValue<Short4> lhs,unsigned char rhs)1919 RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
1920 {
1921 	RR_DEBUG_INFO_UPDATE_LOC();
1922 #if defined(__i386__) || defined(__x86_64__)
1923 	//	return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
1924 
1925 	return x86::psllw(lhs, rhs);
1926 #else
1927 	return As<Short4>(V(lowerVectorShl(V(lhs.value), rhs)));
1928 #endif
1929 }
1930 
operator >>(RValue<Short4> lhs,unsigned char rhs)1931 RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
1932 {
1933 	RR_DEBUG_INFO_UPDATE_LOC();
1934 #if defined(__i386__) || defined(__x86_64__)
1935 	return x86::psraw(lhs, rhs);
1936 #else
1937 	return As<Short4>(V(lowerVectorAShr(V(lhs.value), rhs)));
1938 #endif
1939 }
1940 
Max(RValue<Short4> x,RValue<Short4> y)1941 RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
1942 {
1943 	RR_DEBUG_INFO_UPDATE_LOC();
1944 #if defined(__i386__) || defined(__x86_64__)
1945 	return x86::pmaxsw(x, y);
1946 #else
1947 	return RValue<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SGT)));
1948 #endif
1949 }
1950 
Min(RValue<Short4> x,RValue<Short4> y)1951 RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
1952 {
1953 	RR_DEBUG_INFO_UPDATE_LOC();
1954 #if defined(__i386__) || defined(__x86_64__)
1955 	return x86::pminsw(x, y);
1956 #else
1957 	return RValue<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SLT)));
1958 #endif
1959 }
1960 
AddSat(RValue<Short4> x,RValue<Short4> y)1961 RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
1962 {
1963 	RR_DEBUG_INFO_UPDATE_LOC();
1964 #if defined(__i386__) || defined(__x86_64__)
1965 	return x86::paddsw(x, y);
1966 #else
1967 	return As<Short4>(V(lowerPSADDSAT(V(x.value), V(y.value))));
1968 #endif
1969 }
1970 
SubSat(RValue<Short4> x,RValue<Short4> y)1971 RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
1972 {
1973 	RR_DEBUG_INFO_UPDATE_LOC();
1974 #if defined(__i386__) || defined(__x86_64__)
1975 	return x86::psubsw(x, y);
1976 #else
1977 	return As<Short4>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
1978 #endif
1979 }
1980 
MulHigh(RValue<Short4> x,RValue<Short4> y)1981 RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
1982 {
1983 	RR_DEBUG_INFO_UPDATE_LOC();
1984 #if defined(__i386__) || defined(__x86_64__)
1985 	return x86::pmulhw(x, y);
1986 #else
1987 	return As<Short4>(V(lowerMulHigh(V(x.value), V(y.value), true)));
1988 #endif
1989 }
1990 
MulAdd(RValue<Short4> x,RValue<Short4> y)1991 RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
1992 {
1993 	RR_DEBUG_INFO_UPDATE_LOC();
1994 #if defined(__i386__) || defined(__x86_64__)
1995 	return x86::pmaddwd(x, y);
1996 #else
1997 	return As<Int2>(V(lowerMulAdd(V(x.value), V(y.value))));
1998 #endif
1999 }
2000 
PackSigned(RValue<Short4> x,RValue<Short4> y)2001 RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
2002 {
2003 	RR_DEBUG_INFO_UPDATE_LOC();
2004 #if defined(__i386__) || defined(__x86_64__)
2005 	auto result = x86::packsswb(x, y);
2006 #else
2007 	auto result = V(lowerPack(V(x.value), V(y.value), true));
2008 #endif
2009 	return As<SByte8>(Swizzle(As<Int4>(result), 0x0202));
2010 }
2011 
PackUnsigned(RValue<Short4> x,RValue<Short4> y)2012 RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
2013 {
2014 	RR_DEBUG_INFO_UPDATE_LOC();
2015 #if defined(__i386__) || defined(__x86_64__)
2016 	auto result = x86::packuswb(x, y);
2017 #else
2018 	auto result = V(lowerPack(V(x.value), V(y.value), false));
2019 #endif
2020 	return As<Byte8>(Swizzle(As<Int4>(result), 0x0202));
2021 }
2022 
CmpGT(RValue<Short4> x,RValue<Short4> y)2023 RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
2024 {
2025 	RR_DEBUG_INFO_UPDATE_LOC();
2026 #if defined(__i386__) || defined(__x86_64__)
2027 	return x86::pcmpgtw(x, y);
2028 #else
2029 	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Short4::getType()))));
2030 #endif
2031 }
2032 
CmpEQ(RValue<Short4> x,RValue<Short4> y)2033 RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
2034 {
2035 	RR_DEBUG_INFO_UPDATE_LOC();
2036 #if defined(__i386__) || defined(__x86_64__)
2037 	return x86::pcmpeqw(x, y);
2038 #else
2039 	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Short4::getType()))));
2040 #endif
2041 }
2042 
// Short4 maps to the internal Type_v4i16 type identifier.
Type *Short4::getType()
{
	return T(Type_v4i16);
}
2047 
UShort4(RValue<Float4> cast,bool saturate)2048 UShort4::UShort4(RValue<Float4> cast, bool saturate)
2049 {
2050 	RR_DEBUG_INFO_UPDATE_LOC();
2051 	if(saturate)
2052 	{
2053 #if defined(__i386__) || defined(__x86_64__)
2054 		if(CPUID::supportsSSE4_1())
2055 		{
2056 			Int4 int4(Min(cast, Float4(0xFFFF)));  // packusdw takes care of 0x0000 saturation
2057 			*this = As<Short4>(PackUnsigned(int4, int4));
2058 		}
2059 		else
2060 #endif
2061 		{
2062 			*this = Short4(Int4(Max(Min(cast, Float4(0xFFFF)), Float4(0x0000))));
2063 		}
2064 	}
2065 	else
2066 	{
2067 		*this = Short4(Int4(cast));
2068 	}
2069 }
2070 
operator <<(RValue<UShort4> lhs,unsigned char rhs)2071 RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
2072 {
2073 	RR_DEBUG_INFO_UPDATE_LOC();
2074 #if defined(__i386__) || defined(__x86_64__)
2075 	//	return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
2076 
2077 	return As<UShort4>(x86::psllw(As<Short4>(lhs), rhs));
2078 #else
2079 	return As<UShort4>(V(lowerVectorShl(V(lhs.value), rhs)));
2080 #endif
2081 }
2082 
operator >>(RValue<UShort4> lhs,unsigned char rhs)2083 RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
2084 {
2085 	RR_DEBUG_INFO_UPDATE_LOC();
2086 #if defined(__i386__) || defined(__x86_64__)
2087 	//	return RValue<Short4>(Nucleus::createLShr(lhs.value, rhs.value));
2088 
2089 	return x86::psrlw(lhs, rhs);
2090 #else
2091 	return As<UShort4>(V(lowerVectorLShr(V(lhs.value), rhs)));
2092 #endif
2093 }
2094 
// Lane-wise maximum of two UShort4 values, built on the Short4 Max:
// both operands are reinterpreted as Short4 and have 0x8000 subtracted from
// every lane, the Short4 Max is taken, and 0x8000 is added back to the result.
RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<UShort4>(Max(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
}
2100 
// Lane-wise minimum of two UShort4 values, built on the Short4 Min:
// both operands are reinterpreted as Short4 and have 0x8000 subtracted from
// every lane, the Short4 Min is taken, and 0x8000 is added back to the result.
RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<UShort4>(Min(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
}
2106 
AddSat(RValue<UShort4> x,RValue<UShort4> y)2107 RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
2108 {
2109 	RR_DEBUG_INFO_UPDATE_LOC();
2110 #if defined(__i386__) || defined(__x86_64__)
2111 	return x86::paddusw(x, y);
2112 #else
2113 	return As<UShort4>(V(lowerPUADDSAT(V(x.value), V(y.value))));
2114 #endif
2115 }
2116 
SubSat(RValue<UShort4> x,RValue<UShort4> y)2117 RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
2118 {
2119 	RR_DEBUG_INFO_UPDATE_LOC();
2120 #if defined(__i386__) || defined(__x86_64__)
2121 	return x86::psubusw(x, y);
2122 #else
2123 	return As<UShort4>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
2124 #endif
2125 }
2126 
MulHigh(RValue<UShort4> x,RValue<UShort4> y)2127 RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
2128 {
2129 	RR_DEBUG_INFO_UPDATE_LOC();
2130 #if defined(__i386__) || defined(__x86_64__)
2131 	return x86::pmulhuw(x, y);
2132 #else
2133 	return As<UShort4>(V(lowerMulHigh(V(x.value), V(y.value), false)));
2134 #endif
2135 }
2136 
Average(RValue<UShort4> x,RValue<UShort4> y)2137 RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
2138 {
2139 	RR_DEBUG_INFO_UPDATE_LOC();
2140 #if defined(__i386__) || defined(__x86_64__)
2141 	return x86::pavgw(x, y);
2142 #else
2143 	return As<UShort4>(V(lowerPAVG(V(x.value), V(y.value))));
2144 #endif
2145 }
2146 
// UShort4 maps to the internal Type_v4i16 type identifier (same as Short4).
Type *UShort4::getType()
{
	return T(Type_v4i16);
}
2151 
operator <<(RValue<Short8> lhs,unsigned char rhs)2152 RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
2153 {
2154 	RR_DEBUG_INFO_UPDATE_LOC();
2155 #if defined(__i386__) || defined(__x86_64__)
2156 	return x86::psllw(lhs, rhs);
2157 #else
2158 	return As<Short8>(V(lowerVectorShl(V(lhs.value), rhs)));
2159 #endif
2160 }
2161 
operator >>(RValue<Short8> lhs,unsigned char rhs)2162 RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
2163 {
2164 	RR_DEBUG_INFO_UPDATE_LOC();
2165 #if defined(__i386__) || defined(__x86_64__)
2166 	return x86::psraw(lhs, rhs);
2167 #else
2168 	return As<Short8>(V(lowerVectorAShr(V(lhs.value), rhs)));
2169 #endif
2170 }
2171 
MulAdd(RValue<Short8> x,RValue<Short8> y)2172 RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
2173 {
2174 	RR_DEBUG_INFO_UPDATE_LOC();
2175 #if defined(__i386__) || defined(__x86_64__)
2176 	return x86::pmaddwd(x, y);
2177 #else
2178 	return As<Int4>(V(lowerMulAdd(V(x.value), V(y.value))));
2179 #endif
2180 }
2181 
MulHigh(RValue<Short8> x,RValue<Short8> y)2182 RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
2183 {
2184 	RR_DEBUG_INFO_UPDATE_LOC();
2185 #if defined(__i386__) || defined(__x86_64__)
2186 	return x86::pmulhw(x, y);
2187 #else
2188 	return As<Short8>(V(lowerMulHigh(V(x.value), V(y.value), true)));
2189 #endif
2190 }
2191 
getType()2192 Type *Short8::getType()
2193 {
2194 	return T(llvm::VectorType::get(T(Short::getType()), 8));
2195 }
2196 
operator <<(RValue<UShort8> lhs,unsigned char rhs)2197 RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
2198 {
2199 	RR_DEBUG_INFO_UPDATE_LOC();
2200 #if defined(__i386__) || defined(__x86_64__)
2201 	return As<UShort8>(x86::psllw(As<Short8>(lhs), rhs));
2202 #else
2203 	return As<UShort8>(V(lowerVectorShl(V(lhs.value), rhs)));
2204 #endif
2205 }
2206 
operator >>(RValue<UShort8> lhs,unsigned char rhs)2207 RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
2208 {
2209 	RR_DEBUG_INFO_UPDATE_LOC();
2210 #if defined(__i386__) || defined(__x86_64__)
2211 	return x86::psrlw(lhs, rhs);  // FIXME: Fallback required
2212 #else
2213 	return As<UShort8>(V(lowerVectorLShr(V(lhs.value), rhs)));
2214 #endif
2215 }
2216 
MulHigh(RValue<UShort8> x,RValue<UShort8> y)2217 RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
2218 {
2219 	RR_DEBUG_INFO_UPDATE_LOC();
2220 #if defined(__i386__) || defined(__x86_64__)
2221 	return x86::pmulhuw(x, y);
2222 #else
2223 	return As<UShort8>(V(lowerMulHigh(V(x.value), V(y.value), false)));
2224 #endif
2225 }
2226 
getType()2227 Type *UShort8::getType()
2228 {
2229 	return T(llvm::VectorType::get(T(UShort::getType()), 8));
2230 }
2231 
operator ++(Int & val,int)2232 RValue<Int> operator++(Int &val, int)  // Post-increment
2233 {
2234 	RR_DEBUG_INFO_UPDATE_LOC();
2235 	RValue<Int> res = val;
2236 
2237 	Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantInt(1));
2238 	val.storeValue(inc);
2239 
2240 	return res;
2241 }
2242 
operator ++(Int & val)2243 const Int &operator++(Int &val)  // Pre-increment
2244 {
2245 	RR_DEBUG_INFO_UPDATE_LOC();
2246 	Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
2247 	val.storeValue(inc);
2248 
2249 	return val;
2250 }
2251 
operator --(Int & val,int)2252 RValue<Int> operator--(Int &val, int)  // Post-decrement
2253 {
2254 	RR_DEBUG_INFO_UPDATE_LOC();
2255 	RValue<Int> res = val;
2256 
2257 	Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantInt(1));
2258 	val.storeValue(inc);
2259 
2260 	return res;
2261 }
2262 
operator --(Int & val)2263 const Int &operator--(Int &val)  // Pre-decrement
2264 {
2265 	RR_DEBUG_INFO_UPDATE_LOC();
2266 	Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
2267 	val.storeValue(inc);
2268 
2269 	return val;
2270 }
2271 
RoundInt(RValue<Float> cast)2272 RValue<Int> RoundInt(RValue<Float> cast)
2273 {
2274 	RR_DEBUG_INFO_UPDATE_LOC();
2275 #if defined(__i386__) || defined(__x86_64__)
2276 	return x86::cvtss2si(cast);
2277 #else
2278 	return RValue<Int>(V(lowerRoundInt(V(cast.value), T(Int::getType()))));
2279 #endif
2280 }
2281 
getType()2282 Type *Int::getType()
2283 {
2284 	return T(llvm::Type::getInt32Ty(jit->context));
2285 }
2286 
getType()2287 Type *Long::getType()
2288 {
2289 	return T(llvm::Type::getInt64Ty(jit->context));
2290 }
2291 
UInt(RValue<Float> cast)2292 UInt::UInt(RValue<Float> cast)
2293 {
2294 	RR_DEBUG_INFO_UPDATE_LOC();
2295 	Value *integer = Nucleus::createFPToUI(cast.value, UInt::getType());
2296 	storeValue(integer);
2297 }
2298 
operator ++(UInt & val,int)2299 RValue<UInt> operator++(UInt &val, int)  // Post-increment
2300 {
2301 	RR_DEBUG_INFO_UPDATE_LOC();
2302 	RValue<UInt> res = val;
2303 
2304 	Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantInt(1));
2305 	val.storeValue(inc);
2306 
2307 	return res;
2308 }
2309 
operator ++(UInt & val)2310 const UInt &operator++(UInt &val)  // Pre-increment
2311 {
2312 	RR_DEBUG_INFO_UPDATE_LOC();
2313 	Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
2314 	val.storeValue(inc);
2315 
2316 	return val;
2317 }
2318 
operator --(UInt & val,int)2319 RValue<UInt> operator--(UInt &val, int)  // Post-decrement
2320 {
2321 	RR_DEBUG_INFO_UPDATE_LOC();
2322 	RValue<UInt> res = val;
2323 
2324 	Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantInt(1));
2325 	val.storeValue(inc);
2326 
2327 	return res;
2328 }
2329 
operator --(UInt & val)2330 const UInt &operator--(UInt &val)  // Pre-decrement
2331 {
2332 	RR_DEBUG_INFO_UPDATE_LOC();
2333 	Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
2334 	val.storeValue(inc);
2335 
2336 	return val;
2337 }
2338 
2339 //	RValue<UInt> RoundUInt(RValue<Float> cast)
2340 //	{
2341 //#if defined(__i386__) || defined(__x86_64__)
2342 //		return x86::cvtss2si(val);   // FIXME: Unsigned
2343 //#else
2344 //		return IfThenElse(cast > 0.0f, Int(cast + 0.5f), Int(cast - 0.5f));
2345 //#endif
2346 //	}
2347 
getType()2348 Type *UInt::getType()
2349 {
2350 	return T(llvm::Type::getInt32Ty(jit->context));
2351 }
2352 
2353 //	Int2::Int2(RValue<Int> cast)
2354 //	{
2355 //		Value *extend = Nucleus::createZExt(cast.value, Long::getType());
2356 //		Value *vector = Nucleus::createBitCast(extend, Int2::getType());
2357 //
2358 //		int shuffle[2] = {0, 0};
2359 //		Value *replicate = Nucleus::createShuffleVector(vector, vector, shuffle);
2360 //
2361 //		storeValue(replicate);
2362 //	}
2363 
operator <<(RValue<Int2> lhs,unsigned char rhs)2364 RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
2365 {
2366 	RR_DEBUG_INFO_UPDATE_LOC();
2367 #if defined(__i386__) || defined(__x86_64__)
2368 	//	return RValue<Int2>(Nucleus::createShl(lhs.value, rhs.value));
2369 
2370 	return x86::pslld(lhs, rhs);
2371 #else
2372 	return As<Int2>(V(lowerVectorShl(V(lhs.value), rhs)));
2373 #endif
2374 }
2375 
operator >>(RValue<Int2> lhs,unsigned char rhs)2376 RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
2377 {
2378 	RR_DEBUG_INFO_UPDATE_LOC();
2379 #if defined(__i386__) || defined(__x86_64__)
2380 	//	return RValue<Int2>(Nucleus::createAShr(lhs.value, rhs.value));
2381 
2382 	return x86::psrad(lhs, rhs);
2383 #else
2384 	return As<Int2>(V(lowerVectorAShr(V(lhs.value), rhs)));
2385 #endif
2386 }
2387 
// Int2 maps to the internal Type_v2i32 type identifier.
Type *Int2::getType()
{
	return T(Type_v2i32);
}
2392 
operator <<(RValue<UInt2> lhs,unsigned char rhs)2393 RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
2394 {
2395 	RR_DEBUG_INFO_UPDATE_LOC();
2396 #if defined(__i386__) || defined(__x86_64__)
2397 	//	return RValue<UInt2>(Nucleus::createShl(lhs.value, rhs.value));
2398 
2399 	return As<UInt2>(x86::pslld(As<Int2>(lhs), rhs));
2400 #else
2401 	return As<UInt2>(V(lowerVectorShl(V(lhs.value), rhs)));
2402 #endif
2403 }
2404 
operator >>(RValue<UInt2> lhs,unsigned char rhs)2405 RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
2406 {
2407 	RR_DEBUG_INFO_UPDATE_LOC();
2408 #if defined(__i386__) || defined(__x86_64__)
2409 	//	return RValue<UInt2>(Nucleus::createLShr(lhs.value, rhs.value));
2410 
2411 	return x86::psrld(lhs, rhs);
2412 #else
2413 	return As<UInt2>(V(lowerVectorLShr(V(lhs.value), rhs)));
2414 #endif
2415 }
2416 
// UInt2 maps to the internal Type_v2i32 type identifier (same as Int2).
Type *UInt2::getType()
{
	return T(Type_v2i32);
}
2421 
Int4(RValue<Byte4> cast)2422 Int4::Int4(RValue<Byte4> cast)
2423     : XYZW(this)
2424 {
2425 	RR_DEBUG_INFO_UPDATE_LOC();
2426 #if defined(__i386__) || defined(__x86_64__)
2427 	if(CPUID::supportsSSE4_1())
2428 	{
2429 		*this = x86::pmovzxbd(As<Byte16>(cast));
2430 	}
2431 	else
2432 #endif
2433 	{
2434 		int swizzle[16] = { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 };
2435 		Value *a = Nucleus::createBitCast(cast.value, Byte16::getType());
2436 		Value *b = Nucleus::createShuffleVector(a, Nucleus::createNullValue(Byte16::getType()), swizzle);
2437 
2438 		int swizzle2[8] = { 0, 8, 1, 9, 2, 10, 3, 11 };
2439 		Value *c = Nucleus::createBitCast(b, Short8::getType());
2440 		Value *d = Nucleus::createShuffleVector(c, Nucleus::createNullValue(Short8::getType()), swizzle2);
2441 
2442 		*this = As<Int4>(d);
2443 	}
2444 }
2445 
Int4(RValue<SByte4> cast)2446 Int4::Int4(RValue<SByte4> cast)
2447     : XYZW(this)
2448 {
2449 	RR_DEBUG_INFO_UPDATE_LOC();
2450 #if defined(__i386__) || defined(__x86_64__)
2451 	if(CPUID::supportsSSE4_1())
2452 	{
2453 		*this = x86::pmovsxbd(As<SByte16>(cast));
2454 	}
2455 	else
2456 #endif
2457 	{
2458 		int swizzle[16] = { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 };
2459 		Value *a = Nucleus::createBitCast(cast.value, Byte16::getType());
2460 		Value *b = Nucleus::createShuffleVector(a, a, swizzle);
2461 
2462 		int swizzle2[8] = { 0, 0, 1, 1, 2, 2, 3, 3 };
2463 		Value *c = Nucleus::createBitCast(b, Short8::getType());
2464 		Value *d = Nucleus::createShuffleVector(c, c, swizzle2);
2465 
2466 		*this = As<Int4>(d) >> 24;
2467 	}
2468 }
2469 
Int4(RValue<Short4> cast)2470 Int4::Int4(RValue<Short4> cast)
2471     : XYZW(this)
2472 {
2473 	RR_DEBUG_INFO_UPDATE_LOC();
2474 #if defined(__i386__) || defined(__x86_64__)
2475 	if(CPUID::supportsSSE4_1())
2476 	{
2477 		*this = x86::pmovsxwd(As<Short8>(cast));
2478 	}
2479 	else
2480 #endif
2481 	{
2482 		int swizzle[8] = { 0, 0, 1, 1, 2, 2, 3, 3 };
2483 		Value *c = Nucleus::createShuffleVector(cast.value, cast.value, swizzle);
2484 		*this = As<Int4>(c) >> 16;
2485 	}
2486 }
2487 
Int4(RValue<UShort4> cast)2488 Int4::Int4(RValue<UShort4> cast)
2489     : XYZW(this)
2490 {
2491 	RR_DEBUG_INFO_UPDATE_LOC();
2492 #if defined(__i386__) || defined(__x86_64__)
2493 	if(CPUID::supportsSSE4_1())
2494 	{
2495 		*this = x86::pmovzxwd(As<UShort8>(cast));
2496 	}
2497 	else
2498 #endif
2499 	{
2500 		int swizzle[8] = { 0, 8, 1, 9, 2, 10, 3, 11 };
2501 		Value *c = Nucleus::createShuffleVector(cast.value, Short8(0, 0, 0, 0, 0, 0, 0, 0).loadValue(), swizzle);
2502 		*this = As<Int4>(c);
2503 	}
2504 }
2505 
Int4(RValue<Int> rhs)2506 Int4::Int4(RValue<Int> rhs)
2507     : XYZW(this)
2508 {
2509 	RR_DEBUG_INFO_UPDATE_LOC();
2510 	Value *vector = loadValue();
2511 	Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
2512 
2513 	int swizzle[4] = { 0, 0, 0, 0 };
2514 	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
2515 
2516 	storeValue(replicate);
2517 }
2518 
operator <<(RValue<Int4> lhs,unsigned char rhs)2519 RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
2520 {
2521 	RR_DEBUG_INFO_UPDATE_LOC();
2522 #if defined(__i386__) || defined(__x86_64__)
2523 	return x86::pslld(lhs, rhs);
2524 #else
2525 	return As<Int4>(V(lowerVectorShl(V(lhs.value), rhs)));
2526 #endif
2527 }
2528 
operator >>(RValue<Int4> lhs,unsigned char rhs)2529 RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
2530 {
2531 	RR_DEBUG_INFO_UPDATE_LOC();
2532 #if defined(__i386__) || defined(__x86_64__)
2533 	return x86::psrad(lhs, rhs);
2534 #else
2535 	return As<Int4>(V(lowerVectorAShr(V(lhs.value), rhs)));
2536 #endif
2537 }
2538 
CmpEQ(RValue<Int4> x,RValue<Int4> y)2539 RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y)
2540 {
2541 	RR_DEBUG_INFO_UPDATE_LOC();
2542 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value, y.value), Int4::getType()));
2543 }
2544 
CmpLT(RValue<Int4> x,RValue<Int4> y)2545 RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y)
2546 {
2547 	RR_DEBUG_INFO_UPDATE_LOC();
2548 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value, y.value), Int4::getType()));
2549 }
2550 
CmpLE(RValue<Int4> x,RValue<Int4> y)2551 RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y)
2552 {
2553 	RR_DEBUG_INFO_UPDATE_LOC();
2554 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLE(x.value, y.value), Int4::getType()));
2555 }
2556 
CmpNEQ(RValue<Int4> x,RValue<Int4> y)2557 RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y)
2558 {
2559 	RR_DEBUG_INFO_UPDATE_LOC();
2560 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType()));
2561 }
2562 
CmpNLT(RValue<Int4> x,RValue<Int4> y)2563 RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y)
2564 {
2565 	RR_DEBUG_INFO_UPDATE_LOC();
2566 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGE(x.value, y.value), Int4::getType()));
2567 }
2568 
CmpNLE(RValue<Int4> x,RValue<Int4> y)2569 RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y)
2570 {
2571 	RR_DEBUG_INFO_UPDATE_LOC();
2572 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value, y.value), Int4::getType()));
2573 }
2574 
Max(RValue<Int4> x,RValue<Int4> y)2575 RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)
2576 {
2577 	RR_DEBUG_INFO_UPDATE_LOC();
2578 #if defined(__i386__) || defined(__x86_64__)
2579 	if(CPUID::supportsSSE4_1())
2580 	{
2581 		return x86::pmaxsd(x, y);
2582 	}
2583 	else
2584 #endif
2585 	{
2586 		RValue<Int4> greater = CmpNLE(x, y);
2587 		return (x & greater) | (y & ~greater);
2588 	}
2589 }
2590 
Min(RValue<Int4> x,RValue<Int4> y)2591 RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)
2592 {
2593 	RR_DEBUG_INFO_UPDATE_LOC();
2594 #if defined(__i386__) || defined(__x86_64__)
2595 	if(CPUID::supportsSSE4_1())
2596 	{
2597 		return x86::pminsd(x, y);
2598 	}
2599 	else
2600 #endif
2601 	{
2602 		RValue<Int4> less = CmpLT(x, y);
2603 		return (x & less) | (y & ~less);
2604 	}
2605 }
2606 
RoundInt(RValue<Float4> cast)2607 RValue<Int4> RoundInt(RValue<Float4> cast)
2608 {
2609 	RR_DEBUG_INFO_UPDATE_LOC();
2610 #if defined(__i386__) || defined(__x86_64__)
2611 	return x86::cvtps2dq(cast);
2612 #else
2613 	return As<Int4>(V(lowerRoundInt(V(cast.value), T(Int4::getType()))));
2614 #endif
2615 }
2616 
MulHigh(RValue<Int4> x,RValue<Int4> y)2617 RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y)
2618 {
2619 	RR_DEBUG_INFO_UPDATE_LOC();
2620 	// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
2621 	return As<Int4>(V(lowerMulHigh(V(x.value), V(y.value), true)));
2622 }
2623 
MulHigh(RValue<UInt4> x,RValue<UInt4> y)2624 RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y)
2625 {
2626 	RR_DEBUG_INFO_UPDATE_LOC();
2627 	// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
2628 	return As<UInt4>(V(lowerMulHigh(V(x.value), V(y.value), false)));
2629 }
2630 
PackSigned(RValue<Int4> x,RValue<Int4> y)2631 RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
2632 {
2633 	RR_DEBUG_INFO_UPDATE_LOC();
2634 #if defined(__i386__) || defined(__x86_64__)
2635 	return x86::packssdw(x, y);
2636 #else
2637 	return As<Short8>(V(lowerPack(V(x.value), V(y.value), true)));
2638 #endif
2639 }
2640 
PackUnsigned(RValue<Int4> x,RValue<Int4> y)2641 RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)
2642 {
2643 	RR_DEBUG_INFO_UPDATE_LOC();
2644 #if defined(__i386__) || defined(__x86_64__)
2645 	return x86::packusdw(x, y);
2646 #else
2647 	return As<UShort8>(V(lowerPack(V(x.value), V(y.value), false)));
2648 #endif
2649 }
2650 
SignMask(RValue<Int4> x)2651 RValue<Int> SignMask(RValue<Int4> x)
2652 {
2653 	RR_DEBUG_INFO_UPDATE_LOC();
2654 #if defined(__i386__) || defined(__x86_64__)
2655 	return x86::movmskps(As<Float4>(x));
2656 #else
2657 	return As<Int>(V(lowerSignMask(V(x.value), T(Int::getType()))));
2658 #endif
2659 }
2660 
getType()2661 Type *Int4::getType()
2662 {
2663 	return T(llvm::VectorType::get(T(Int::getType()), 4));
2664 }
2665 
UInt4(RValue<Float4> cast)2666 UInt4::UInt4(RValue<Float4> cast)
2667     : XYZW(this)
2668 {
2669 	RR_DEBUG_INFO_UPDATE_LOC();
2670 	Value *xyzw = Nucleus::createFPToUI(cast.value, UInt4::getType());
2671 	storeValue(xyzw);
2672 }
2673 
UInt4(RValue<UInt> rhs)2674 UInt4::UInt4(RValue<UInt> rhs)
2675     : XYZW(this)
2676 {
2677 	RR_DEBUG_INFO_UPDATE_LOC();
2678 	Value *vector = loadValue();
2679 	Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
2680 
2681 	int swizzle[4] = { 0, 0, 0, 0 };
2682 	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
2683 
2684 	storeValue(replicate);
2685 }
2686 
operator <<(RValue<UInt4> lhs,unsigned char rhs)2687 RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
2688 {
2689 	RR_DEBUG_INFO_UPDATE_LOC();
2690 #if defined(__i386__) || defined(__x86_64__)
2691 	return As<UInt4>(x86::pslld(As<Int4>(lhs), rhs));
2692 #else
2693 	return As<UInt4>(V(lowerVectorShl(V(lhs.value), rhs)));
2694 #endif
2695 }
2696 
operator >>(RValue<UInt4> lhs,unsigned char rhs)2697 RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
2698 {
2699 	RR_DEBUG_INFO_UPDATE_LOC();
2700 #if defined(__i386__) || defined(__x86_64__)
2701 	return x86::psrld(lhs, rhs);
2702 #else
2703 	return As<UInt4>(V(lowerVectorLShr(V(lhs.value), rhs)));
2704 #endif
2705 }
2706 
CmpEQ(RValue<UInt4> x,RValue<UInt4> y)2707 RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y)
2708 {
2709 	RR_DEBUG_INFO_UPDATE_LOC();
2710 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value, y.value), Int4::getType()));
2711 }
2712 
CmpLT(RValue<UInt4> x,RValue<UInt4> y)2713 RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y)
2714 {
2715 	RR_DEBUG_INFO_UPDATE_LOC();
2716 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULT(x.value, y.value), Int4::getType()));
2717 }
2718 
CmpLE(RValue<UInt4> x,RValue<UInt4> y)2719 RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y)
2720 {
2721 	RR_DEBUG_INFO_UPDATE_LOC();
2722 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULE(x.value, y.value), Int4::getType()));
2723 }
2724 
CmpNEQ(RValue<UInt4> x,RValue<UInt4> y)2725 RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y)
2726 {
2727 	RR_DEBUG_INFO_UPDATE_LOC();
2728 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType()));
2729 }
2730 
CmpNLT(RValue<UInt4> x,RValue<UInt4> y)2731 RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y)
2732 {
2733 	RR_DEBUG_INFO_UPDATE_LOC();
2734 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGE(x.value, y.value), Int4::getType()));
2735 }
2736 
CmpNLE(RValue<UInt4> x,RValue<UInt4> y)2737 RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y)
2738 {
2739 	RR_DEBUG_INFO_UPDATE_LOC();
2740 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value, y.value), Int4::getType()));
2741 }
2742 
Max(RValue<UInt4> x,RValue<UInt4> y)2743 RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
2744 {
2745 	RR_DEBUG_INFO_UPDATE_LOC();
2746 #if defined(__i386__) || defined(__x86_64__)
2747 	if(CPUID::supportsSSE4_1())
2748 	{
2749 		return x86::pmaxud(x, y);
2750 	}
2751 	else
2752 #endif
2753 	{
2754 		RValue<UInt4> greater = CmpNLE(x, y);
2755 		return (x & greater) | (y & ~greater);
2756 	}
2757 }
2758 
Min(RValue<UInt4> x,RValue<UInt4> y)2759 RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
2760 {
2761 	RR_DEBUG_INFO_UPDATE_LOC();
2762 #if defined(__i386__) || defined(__x86_64__)
2763 	if(CPUID::supportsSSE4_1())
2764 	{
2765 		return x86::pminud(x, y);
2766 	}
2767 	else
2768 #endif
2769 	{
2770 		RValue<UInt4> less = CmpLT(x, y);
2771 		return (x & less) | (y & ~less);
2772 	}
2773 }
2774 
getType()2775 Type *UInt4::getType()
2776 {
2777 	return T(llvm::VectorType::get(T(UInt::getType()), 4));
2778 }
2779 
getType()2780 Type *Half::getType()
2781 {
2782 	return T(llvm::Type::getInt16Ty(jit->context));
2783 }
2784 
Rcp_pp(RValue<Float> x,bool exactAtPow2)2785 RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)
2786 {
2787 	RR_DEBUG_INFO_UPDATE_LOC();
2788 #if defined(__i386__) || defined(__x86_64__)
2789 	if(exactAtPow2)
2790 	{
2791 		// rcpss uses a piecewise-linear approximation which minimizes the relative error
2792 		// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
2793 		return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2794 	}
2795 	return x86::rcpss(x);
2796 #else
2797 	return As<Float>(V(lowerRCP(V(x.value))));
2798 #endif
2799 }
2800 
RcpSqrt_pp(RValue<Float> x)2801 RValue<Float> RcpSqrt_pp(RValue<Float> x)
2802 {
2803 	RR_DEBUG_INFO_UPDATE_LOC();
2804 #if defined(__i386__) || defined(__x86_64__)
2805 	return x86::rsqrtss(x);
2806 #else
2807 	return As<Float>(V(lowerRSQRT(V(x.value))));
2808 #endif
2809 }
2810 
Sqrt(RValue<Float> x)2811 RValue<Float> Sqrt(RValue<Float> x)
2812 {
2813 	RR_DEBUG_INFO_UPDATE_LOC();
2814 #if defined(__i386__) || defined(__x86_64__)
2815 	return x86::sqrtss(x);
2816 #else
2817 	return As<Float>(V(lowerSQRT(V(x.value))));
2818 #endif
2819 }
2820 
Round(RValue<Float> x)2821 RValue<Float> Round(RValue<Float> x)
2822 {
2823 	RR_DEBUG_INFO_UPDATE_LOC();
2824 #if defined(__i386__) || defined(__x86_64__)
2825 	if(CPUID::supportsSSE4_1())
2826 	{
2827 		return x86::roundss(x, 0);
2828 	}
2829 	else
2830 	{
2831 		return Float4(Round(Float4(x))).x;
2832 	}
2833 #else
2834 	return RValue<Float>(V(lowerRound(V(x.value))));
2835 #endif
2836 }
2837 
Trunc(RValue<Float> x)2838 RValue<Float> Trunc(RValue<Float> x)
2839 {
2840 	RR_DEBUG_INFO_UPDATE_LOC();
2841 #if defined(__i386__) || defined(__x86_64__)
2842 	if(CPUID::supportsSSE4_1())
2843 	{
2844 		return x86::roundss(x, 3);
2845 	}
2846 	else
2847 	{
2848 		return Float(Int(x));  // Rounded toward zero
2849 	}
2850 #else
2851 	return RValue<Float>(V(lowerTrunc(V(x.value))));
2852 #endif
2853 }
2854 
Frac(RValue<Float> x)2855 RValue<Float> Frac(RValue<Float> x)
2856 {
2857 	RR_DEBUG_INFO_UPDATE_LOC();
2858 #if defined(__i386__) || defined(__x86_64__)
2859 	if(CPUID::supportsSSE4_1())
2860 	{
2861 		return x - x86::floorss(x);
2862 	}
2863 	else
2864 	{
2865 		return Float4(Frac(Float4(x))).x;
2866 	}
2867 #else
2868 	// x - floor(x) can be 1.0 for very small negative x.
2869 	// Clamp against the value just below 1.0.
2870 	return Min(x - Floor(x), As<Float>(Int(0x3F7FFFFF)));
2871 #endif
2872 }
2873 
Floor(RValue<Float> x)2874 RValue<Float> Floor(RValue<Float> x)
2875 {
2876 	RR_DEBUG_INFO_UPDATE_LOC();
2877 #if defined(__i386__) || defined(__x86_64__)
2878 	if(CPUID::supportsSSE4_1())
2879 	{
2880 		return x86::floorss(x);
2881 	}
2882 	else
2883 	{
2884 		return Float4(Floor(Float4(x))).x;
2885 	}
2886 #else
2887 	return RValue<Float>(V(lowerFloor(V(x.value))));
2888 #endif
2889 }
2890 
Ceil(RValue<Float> x)2891 RValue<Float> Ceil(RValue<Float> x)
2892 {
2893 	RR_DEBUG_INFO_UPDATE_LOC();
2894 #if defined(__i386__) || defined(__x86_64__)
2895 	if(CPUID::supportsSSE4_1())
2896 	{
2897 		return x86::ceilss(x);
2898 	}
2899 	else
2900 #endif
2901 	{
2902 		return Float4(Ceil(Float4(x))).x;
2903 	}
2904 }
2905 
getType()2906 Type *Float::getType()
2907 {
2908 	return T(llvm::Type::getFloatTy(jit->context));
2909 }
2910 
// Returns the Reactor type for Float2, identified by the Type_v2f32 tag.
// NOTE(review): Type_v2f32 is defined outside this section; presumably it
// denotes a two-lane float vector - confirm against its declaration.
Type *Float2::getType()
{
	return T(Type_v2f32);
}
2915 
Exp2(RValue<Float> v)2916 RValue<Float> Exp2(RValue<Float> v)
2917 {
2918 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp2, { T(Float::getType()) });
2919 	return RValue<Float>(V(jit->builder->CreateCall(func, V(v.value))));
2920 }
2921 
Log2(RValue<Float> v)2922 RValue<Float> Log2(RValue<Float> v)
2923 {
2924 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log2, { T(Float::getType()) });
2925 	return RValue<Float>(V(jit->builder->CreateCall(func, V(v.value))));
2926 }
2927 
Float4(RValue<Float> rhs)2928 Float4::Float4(RValue<Float> rhs)
2929     : XYZW(this)
2930 {
2931 	RR_DEBUG_INFO_UPDATE_LOC();
2932 	Value *vector = loadValue();
2933 	Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
2934 
2935 	int swizzle[4] = { 0, 0, 0, 0 };
2936 	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
2937 
2938 	storeValue(replicate);
2939 }
2940 
Max(RValue<Float4> x,RValue<Float4> y)2941 RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
2942 {
2943 	RR_DEBUG_INFO_UPDATE_LOC();
2944 #if defined(__i386__) || defined(__x86_64__)
2945 	return x86::maxps(x, y);
2946 #else
2947 	return As<Float4>(V(lowerPFMINMAX(V(x.value), V(y.value), llvm::FCmpInst::FCMP_OGT)));
2948 #endif
2949 }
2950 
Min(RValue<Float4> x,RValue<Float4> y)2951 RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
2952 {
2953 	RR_DEBUG_INFO_UPDATE_LOC();
2954 #if defined(__i386__) || defined(__x86_64__)
2955 	return x86::minps(x, y);
2956 #else
2957 	return As<Float4>(V(lowerPFMINMAX(V(x.value), V(y.value), llvm::FCmpInst::FCMP_OLT)));
2958 #endif
2959 }
2960 
Rcp_pp(RValue<Float4> x,bool exactAtPow2)2961 RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)
2962 {
2963 	RR_DEBUG_INFO_UPDATE_LOC();
2964 #if defined(__i386__) || defined(__x86_64__)
2965 	if(exactAtPow2)
2966 	{
2967 		// rcpps uses a piecewise-linear approximation which minimizes the relative error
2968 		// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
2969 		return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2970 	}
2971 	return x86::rcpps(x);
2972 #else
2973 	return As<Float4>(V(lowerRCP(V(x.value))));
2974 #endif
2975 }
2976 
RcpSqrt_pp(RValue<Float4> x)2977 RValue<Float4> RcpSqrt_pp(RValue<Float4> x)
2978 {
2979 	RR_DEBUG_INFO_UPDATE_LOC();
2980 #if defined(__i386__) || defined(__x86_64__)
2981 	return x86::rsqrtps(x);
2982 #else
2983 	return As<Float4>(V(lowerRSQRT(V(x.value))));
2984 #endif
2985 }
2986 
Sqrt(RValue<Float4> x)2987 RValue<Float4> Sqrt(RValue<Float4> x)
2988 {
2989 	RR_DEBUG_INFO_UPDATE_LOC();
2990 #if defined(__i386__) || defined(__x86_64__)
2991 	return x86::sqrtps(x);
2992 #else
2993 	return As<Float4>(V(lowerSQRT(V(x.value))));
2994 #endif
2995 }
2996 
SignMask(RValue<Float4> x)2997 RValue<Int> SignMask(RValue<Float4> x)
2998 {
2999 	RR_DEBUG_INFO_UPDATE_LOC();
3000 #if defined(__i386__) || defined(__x86_64__)
3001 	return x86::movmskps(x);
3002 #else
3003 	return As<Int>(V(lowerFPSignMask(V(x.value), T(Int::getType()))));
3004 #endif
3005 }
3006 
CmpEQ(RValue<Float4> x,RValue<Float4> y)3007 RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
3008 {
3009 	RR_DEBUG_INFO_UPDATE_LOC();
3010 	//	return As<Int4>(x86::cmpeqps(x, y));
3011 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOEQ(x.value, y.value), Int4::getType()));
3012 }
3013 
CmpLT(RValue<Float4> x,RValue<Float4> y)3014 RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y)
3015 {
3016 	RR_DEBUG_INFO_UPDATE_LOC();
3017 	//	return As<Int4>(x86::cmpltps(x, y));
3018 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLT(x.value, y.value), Int4::getType()));
3019 }
3020 
CmpLE(RValue<Float4> x,RValue<Float4> y)3021 RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y)
3022 {
3023 	RR_DEBUG_INFO_UPDATE_LOC();
3024 	//	return As<Int4>(x86::cmpleps(x, y));
3025 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLE(x.value, y.value), Int4::getType()));
3026 }
3027 
CmpNEQ(RValue<Float4> x,RValue<Float4> y)3028 RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y)
3029 {
3030 	RR_DEBUG_INFO_UPDATE_LOC();
3031 	//	return As<Int4>(x86::cmpneqps(x, y));
3032 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpONE(x.value, y.value), Int4::getType()));
3033 }
3034 
CmpNLT(RValue<Float4> x,RValue<Float4> y)3035 RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y)
3036 {
3037 	RR_DEBUG_INFO_UPDATE_LOC();
3038 	//	return As<Int4>(x86::cmpnltps(x, y));
3039 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGE(x.value, y.value), Int4::getType()));
3040 }
3041 
CmpNLE(RValue<Float4> x,RValue<Float4> y)3042 RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y)
3043 {
3044 	RR_DEBUG_INFO_UPDATE_LOC();
3045 	//	return As<Int4>(x86::cmpnleps(x, y));
3046 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGT(x.value, y.value), Int4::getType()));
3047 }
3048 
CmpUEQ(RValue<Float4> x,RValue<Float4> y)3049 RValue<Int4> CmpUEQ(RValue<Float4> x, RValue<Float4> y)
3050 {
3051 	RR_DEBUG_INFO_UPDATE_LOC();
3052 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUEQ(x.value, y.value), Int4::getType()));
3053 }
3054 
CmpULT(RValue<Float4> x,RValue<Float4> y)3055 RValue<Int4> CmpULT(RValue<Float4> x, RValue<Float4> y)
3056 {
3057 	RR_DEBUG_INFO_UPDATE_LOC();
3058 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULT(x.value, y.value), Int4::getType()));
3059 }
3060 
CmpULE(RValue<Float4> x,RValue<Float4> y)3061 RValue<Int4> CmpULE(RValue<Float4> x, RValue<Float4> y)
3062 {
3063 	RR_DEBUG_INFO_UPDATE_LOC();
3064 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULE(x.value, y.value), Int4::getType()));
3065 }
3066 
CmpUNEQ(RValue<Float4> x,RValue<Float4> y)3067 RValue<Int4> CmpUNEQ(RValue<Float4> x, RValue<Float4> y)
3068 {
3069 	RR_DEBUG_INFO_UPDATE_LOC();
3070 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUNE(x.value, y.value), Int4::getType()));
3071 }
3072 
CmpUNLT(RValue<Float4> x,RValue<Float4> y)3073 RValue<Int4> CmpUNLT(RValue<Float4> x, RValue<Float4> y)
3074 {
3075 	RR_DEBUG_INFO_UPDATE_LOC();
3076 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGE(x.value, y.value), Int4::getType()));
3077 }
3078 
CmpUNLE(RValue<Float4> x,RValue<Float4> y)3079 RValue<Int4> CmpUNLE(RValue<Float4> x, RValue<Float4> y)
3080 {
3081 	RR_DEBUG_INFO_UPDATE_LOC();
3082 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGT(x.value, y.value), Int4::getType()));
3083 }
3084 
Round(RValue<Float4> x)3085 RValue<Float4> Round(RValue<Float4> x)
3086 {
3087 	RR_DEBUG_INFO_UPDATE_LOC();
3088 #if defined(__i386__) || defined(__x86_64__)
3089 	if(CPUID::supportsSSE4_1())
3090 	{
3091 		return x86::roundps(x, 0);
3092 	}
3093 	else
3094 	{
3095 		return Float4(RoundInt(x));
3096 	}
3097 #else
3098 	return RValue<Float4>(V(lowerRound(V(x.value))));
3099 #endif
3100 }
3101 
Trunc(RValue<Float4> x)3102 RValue<Float4> Trunc(RValue<Float4> x)
3103 {
3104 	RR_DEBUG_INFO_UPDATE_LOC();
3105 #if defined(__i386__) || defined(__x86_64__)
3106 	if(CPUID::supportsSSE4_1())
3107 	{
3108 		return x86::roundps(x, 3);
3109 	}
3110 	else
3111 	{
3112 		return Float4(Int4(x));
3113 	}
3114 #else
3115 	return RValue<Float4>(V(lowerTrunc(V(x.value))));
3116 #endif
3117 }
3118 
Frac(RValue<Float4> x)3119 RValue<Float4> Frac(RValue<Float4> x)
3120 {
3121 	RR_DEBUG_INFO_UPDATE_LOC();
3122 	Float4 frc;
3123 
3124 #if defined(__i386__) || defined(__x86_64__)
3125 	if(CPUID::supportsSSE4_1())
3126 	{
3127 		frc = x - Floor(x);
3128 	}
3129 	else
3130 	{
3131 		frc = x - Float4(Int4(x));  // Signed fractional part.
3132 
3133 		frc += As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1.0f)));  // Add 1.0 if negative.
3134 	}
3135 #else
3136 	frc = x - Floor(x);
3137 #endif
3138 
3139 	// x - floor(x) can be 1.0 for very small negative x.
3140 	// Clamp against the value just below 1.0.
3141 	return Min(frc, As<Float4>(Int4(0x3F7FFFFF)));
3142 }
3143 
Floor(RValue<Float4> x)3144 RValue<Float4> Floor(RValue<Float4> x)
3145 {
3146 	RR_DEBUG_INFO_UPDATE_LOC();
3147 #if defined(__i386__) || defined(__x86_64__)
3148 	if(CPUID::supportsSSE4_1())
3149 	{
3150 		return x86::floorps(x);
3151 	}
3152 	else
3153 	{
3154 		return x - Frac(x);
3155 	}
3156 #else
3157 	return RValue<Float4>(V(lowerFloor(V(x.value))));
3158 #endif
3159 }
3160 
Ceil(RValue<Float4> x)3161 RValue<Float4> Ceil(RValue<Float4> x)
3162 {
3163 	RR_DEBUG_INFO_UPDATE_LOC();
3164 #if defined(__i386__) || defined(__x86_64__)
3165 	if(CPUID::supportsSSE4_1())
3166 	{
3167 		return x86::ceilps(x);
3168 	}
3169 	else
3170 #endif
3171 	{
3172 		return -Floor(-x);
3173 	}
3174 }
3175 
Sin(RValue<Float4> v)3176 RValue<Float4> Sin(RValue<Float4> v)
3177 {
3178 	RR_DEBUG_INFO_UPDATE_LOC();
3179 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::sin, { V(v.value)->getType() });
3180 	return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value))));
3181 }
3182 
Cos(RValue<Float4> v)3183 RValue<Float4> Cos(RValue<Float4> v)
3184 {
3185 	RR_DEBUG_INFO_UPDATE_LOC();
3186 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cos, { V(v.value)->getType() });
3187 	return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value))));
3188 }
3189 
// Per-lane tangent, computed as the quotient Sin(v) / Cos(v).
RValue<Float4> Tan(RValue<Float4> v)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return Sin(v) / Cos(v);
}
3195 
TransformFloat4PerElement(RValue<Float4> v,const char * name)3196 static RValue<Float4> TransformFloat4PerElement(RValue<Float4> v, const char *name)
3197 {
3198 	auto funcTy = ::llvm::FunctionType::get(T(Float::getType()), ::llvm::ArrayRef<llvm::Type *>(T(Float::getType())), false);
3199 	auto func = jit->module->getOrInsertFunction(name, funcTy);
3200 	llvm::Value *out = ::llvm::UndefValue::get(T(Float4::getType()));
3201 	for(uint64_t i = 0; i < 4; i++)
3202 	{
3203 		auto el = jit->builder->CreateCall(func, V(Nucleus::createExtractElement(v.value, Float::getType(), i)));
3204 		out = V(Nucleus::createInsertElement(V(out), V(el), i));
3205 	}
3206 	return RValue<Float4>(V(out));
3207 }
3208 
// Per-lane arcsine: calls the external "asinf" on each lane via
// TransformFloat4PerElement.
RValue<Float4> Asin(RValue<Float4> v)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return TransformFloat4PerElement(v, "asinf");
}
3214 
// Per-lane arccosine: calls the external "acosf" on each lane via
// TransformFloat4PerElement.
RValue<Float4> Acos(RValue<Float4> v)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return TransformFloat4PerElement(v, "acosf");
}
3220 
// Per-lane arctangent: calls the external "atanf" on each lane via
// TransformFloat4PerElement.
RValue<Float4> Atan(RValue<Float4> v)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return TransformFloat4PerElement(v, "atanf");
}
3226 
// Hyperbolic sine; forwards to the implementation in the 'emulated' namespace.
RValue<Float4> Sinh(RValue<Float4> v)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return emulated::Sinh(v);
}
3232 
// Hyperbolic cosine; forwards to the implementation in the 'emulated' namespace.
RValue<Float4> Cosh(RValue<Float4> v)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return emulated::Cosh(v);
}
3238 
// Per-lane hyperbolic tangent: calls the external "tanhf" on each lane via
// TransformFloat4PerElement.
RValue<Float4> Tanh(RValue<Float4> v)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return TransformFloat4PerElement(v, "tanhf");
}
3244 
// Per-lane inverse hyperbolic sine: calls the external "asinhf" on each lane
// via TransformFloat4PerElement.
RValue<Float4> Asinh(RValue<Float4> v)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return TransformFloat4PerElement(v, "asinhf");
}
3250 
// Per-lane inverse hyperbolic cosine: calls the external "acoshf" on each lane
// via TransformFloat4PerElement.
RValue<Float4> Acosh(RValue<Float4> v)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return TransformFloat4PerElement(v, "acoshf");
}
3256 
// Per-lane inverse hyperbolic tangent: calls the external "atanhf" on each
// lane via TransformFloat4PerElement.
RValue<Float4> Atanh(RValue<Float4> v)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return TransformFloat4PerElement(v, "atanhf");
}
3262 
Atan2(RValue<Float4> x,RValue<Float4> y)3263 RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y)
3264 {
3265 	RR_DEBUG_INFO_UPDATE_LOC();
3266 	::llvm::SmallVector<::llvm::Type *, 2> paramTys;
3267 	paramTys.push_back(T(Float::getType()));
3268 	paramTys.push_back(T(Float::getType()));
3269 	auto funcTy = ::llvm::FunctionType::get(T(Float::getType()), paramTys, false);
3270 	auto func = jit->module->getOrInsertFunction("atan2f", funcTy);
3271 	llvm::Value *out = ::llvm::UndefValue::get(T(Float4::getType()));
3272 	for(uint64_t i = 0; i < 4; i++)
3273 	{
3274 		auto el = jit->builder->CreateCall2(func, ARGS(
3275 		                                              V(Nucleus::createExtractElement(x.value, Float::getType(), i)),
3276 		                                              V(Nucleus::createExtractElement(y.value, Float::getType(), i))));
3277 		out = V(Nucleus::createInsertElement(V(out), V(el), i));
3278 	}
3279 	return RValue<Float4>(V(out));
3280 }
3281 
Pow(RValue<Float4> x,RValue<Float4> y)3282 RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y)
3283 {
3284 	RR_DEBUG_INFO_UPDATE_LOC();
3285 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::pow, { T(Float4::getType()) });
3286 	return RValue<Float4>(V(jit->builder->CreateCall2(func, ARGS(V(x.value), V(y.value)))));
3287 }
3288 
Exp(RValue<Float4> v)3289 RValue<Float4> Exp(RValue<Float4> v)
3290 {
3291 	RR_DEBUG_INFO_UPDATE_LOC();
3292 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp, { T(Float4::getType()) });
3293 	return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value))));
3294 }
3295 
Log(RValue<Float4> v)3296 RValue<Float4> Log(RValue<Float4> v)
3297 {
3298 	RR_DEBUG_INFO_UPDATE_LOC();
3299 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log, { T(Float4::getType()) });
3300 	return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value))));
3301 }
3302 
Exp2(RValue<Float4> v)3303 RValue<Float4> Exp2(RValue<Float4> v)
3304 {
3305 	RR_DEBUG_INFO_UPDATE_LOC();
3306 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp2, { T(Float4::getType()) });
3307 	return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value))));
3308 }
3309 
Log2(RValue<Float4> v)3310 RValue<Float4> Log2(RValue<Float4> v)
3311 {
3312 	RR_DEBUG_INFO_UPDATE_LOC();
3313 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log2, { T(Float4::getType()) });
3314 	return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value))));
3315 }
3316 
Ctlz(RValue<UInt> v,bool isZeroUndef)3317 RValue<UInt> Ctlz(RValue<UInt> v, bool isZeroUndef)
3318 {
3319 	RR_DEBUG_INFO_UPDATE_LOC();
3320 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt::getType()) });
3321 	return RValue<UInt>(V(jit->builder->CreateCall2(func, ARGS(
3322 	                                                          V(v.value),
3323 	                                                          isZeroUndef ? ::llvm::ConstantInt::getTrue(jit->context) : ::llvm::ConstantInt::getFalse(jit->context)))));
3324 }
3325 
Ctlz(RValue<UInt4> v,bool isZeroUndef)3326 RValue<UInt4> Ctlz(RValue<UInt4> v, bool isZeroUndef)
3327 {
3328 	RR_DEBUG_INFO_UPDATE_LOC();
3329 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt4::getType()) });
3330 	return RValue<UInt4>(V(jit->builder->CreateCall2(func, ARGS(
3331 	                                                           V(v.value),
3332 	                                                           isZeroUndef ? ::llvm::ConstantInt::getTrue(jit->context) : ::llvm::ConstantInt::getFalse(jit->context)))));
3333 }
3334 
Cttz(RValue<UInt> v,bool isZeroUndef)3335 RValue<UInt> Cttz(RValue<UInt> v, bool isZeroUndef)
3336 {
3337 	RR_DEBUG_INFO_UPDATE_LOC();
3338 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt::getType()) });
3339 	return RValue<UInt>(V(jit->builder->CreateCall2(func, ARGS(
3340 	                                                          V(v.value),
3341 	                                                          isZeroUndef ? ::llvm::ConstantInt::getTrue(jit->context) : ::llvm::ConstantInt::getFalse(jit->context)))));
3342 }
3343 
Cttz(RValue<UInt4> v,bool isZeroUndef)3344 RValue<UInt4> Cttz(RValue<UInt4> v, bool isZeroUndef)
3345 {
3346 	RR_DEBUG_INFO_UPDATE_LOC();
3347 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt4::getType()) });
3348 	return RValue<UInt4>(V(jit->builder->CreateCall2(func, ARGS(
3349 	                                                           V(v.value),
3350 	                                                           isZeroUndef ? ::llvm::ConstantInt::getTrue(jit->context) : ::llvm::ConstantInt::getFalse(jit->context)))));
3351 }
3352 
MinAtomic(RValue<Pointer<Int>> x,RValue<Int> y,std::memory_order memoryOrder)3353 RValue<Int> MinAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
3354 {
3355 	return RValue<Int>(Nucleus::createAtomicMin(x.value, y.value, memoryOrder));
3356 }
3357 
MinAtomic(RValue<Pointer<UInt>> x,RValue<UInt> y,std::memory_order memoryOrder)3358 RValue<UInt> MinAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
3359 {
3360 	return RValue<UInt>(Nucleus::createAtomicUMin(x.value, y.value, memoryOrder));
3361 }
3362 
MaxAtomic(RValue<Pointer<Int>> x,RValue<Int> y,std::memory_order memoryOrder)3363 RValue<Int> MaxAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
3364 {
3365 	return RValue<Int>(Nucleus::createAtomicMax(x.value, y.value, memoryOrder));
3366 }
3367 
MaxAtomic(RValue<Pointer<UInt>> x,RValue<UInt> y,std::memory_order memoryOrder)3368 RValue<UInt> MaxAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
3369 {
3370 	return RValue<UInt>(Nucleus::createAtomicUMax(x.value, y.value, memoryOrder));
3371 }
3372 
getType()3373 Type *Float4::getType()
3374 {
3375 	return T(llvm::VectorType::get(T(Float::getType()), 4));
3376 }
3377 
Ticks()3378 RValue<Long> Ticks()
3379 {
3380 	RR_DEBUG_INFO_UPDATE_LOC();
3381 	llvm::Function *rdtsc = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::readcyclecounter);
3382 
3383 	return RValue<Long>(V(jit->builder->CreateCall(rdtsc)));
3384 }
3385 
ConstantPointer(void const * ptr)3386 RValue<Pointer<Byte>> ConstantPointer(void const *ptr)
3387 {
3388 	RR_DEBUG_INFO_UPDATE_LOC();
3389 	// Note: this should work for 32-bit pointers as well because 'inttoptr'
3390 	// is defined to truncate (and zero extend) if necessary.
3391 	auto ptrAsInt = ::llvm::ConstantInt::get(::llvm::Type::getInt64Ty(jit->context), reinterpret_cast<uintptr_t>(ptr));
3392 	return RValue<Pointer<Byte>>(V(jit->builder->CreateIntToPtr(ptrAsInt, T(Pointer<Byte>::getType()))));
3393 }
3394 
ConstantData(void const * data,size_t size)3395 RValue<Pointer<Byte>> ConstantData(void const *data, size_t size)
3396 {
3397 	RR_DEBUG_INFO_UPDATE_LOC();
3398 	auto str = ::std::string(reinterpret_cast<const char *>(data), size);
3399 	auto ptr = jit->builder->CreateGlobalStringPtr(str);
3400 	return RValue<Pointer<Byte>>(V(ptr));
3401 }
3402 
Call(RValue<Pointer<Byte>> fptr,Type * retTy,std::initializer_list<Value * > args,std::initializer_list<Type * > argTys)3403 Value *Call(RValue<Pointer<Byte>> fptr, Type *retTy, std::initializer_list<Value *> args, std::initializer_list<Type *> argTys)
3404 {
3405 	RR_DEBUG_INFO_UPDATE_LOC();
3406 	::llvm::SmallVector<::llvm::Type *, 8> paramTys;
3407 	for(auto ty : argTys) { paramTys.push_back(T(ty)); }
3408 	auto funcTy = ::llvm::FunctionType::get(T(retTy), paramTys, false);
3409 
3410 	auto funcPtrTy = funcTy->getPointerTo();
3411 	auto funcPtr = jit->builder->CreatePointerCast(V(fptr.value), funcPtrTy);
3412 
3413 	::llvm::SmallVector<::llvm::Value *, 8> arguments;
3414 	for(auto arg : args) { arguments.push_back(V(arg)); }
3415 	return V(jit->builder->CreateCall(funcPtr, arguments));
3416 }
3417 
Breakpoint()3418 void Breakpoint()
3419 {
3420 	RR_DEBUG_INFO_UPDATE_LOC();
3421 	llvm::Function *debugtrap = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::debugtrap);
3422 
3423 	jit->builder->CreateCall(debugtrap);
3424 }
3425 
3426 }  // namespace rr
3427 
3428 namespace rr {
3429 
3430 #if defined(__i386__) || defined(__x86_64__)
3431 namespace x86 {
3432 
cvtss2si(RValue<Float> val)3433 RValue<Int> cvtss2si(RValue<Float> val)
3434 {
3435 	llvm::Function *cvtss2si = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_cvtss2si);
3436 
3437 	Float4 vector;
3438 	vector.x = val;
3439 
3440 	return RValue<Int>(V(jit->builder->CreateCall(cvtss2si, ARGS(V(RValue<Float4>(vector).value)))));
3441 }
3442 
cvtps2dq(RValue<Float4> val)3443 RValue<Int4> cvtps2dq(RValue<Float4> val)
3444 {
3445 	llvm::Function *cvtps2dq = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_cvtps2dq);
3446 
3447 	return RValue<Int4>(V(jit->builder->CreateCall(cvtps2dq, ARGS(V(val.value)))));
3448 }
3449 
rcpss(RValue<Float> val)3450 RValue<Float> rcpss(RValue<Float> val)
3451 {
3452 	llvm::Function *rcpss = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_rcp_ss);
3453 
3454 	Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::getType()))), val.value, 0);
3455 
3456 	return RValue<Float>(Nucleus::createExtractElement(V(jit->builder->CreateCall(rcpss, ARGS(V(vector)))), Float::getType(), 0));
3457 }
3458 
sqrtss(RValue<Float> val)3459 RValue<Float> sqrtss(RValue<Float> val)
3460 {
3461 	llvm::Function *sqrt = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::sqrt, { V(val.value)->getType() });
3462 	return RValue<Float>(V(jit->builder->CreateCall(sqrt, ARGS(V(val.value)))));
3463 }
3464 
rsqrtss(RValue<Float> val)3465 RValue<Float> rsqrtss(RValue<Float> val)
3466 {
3467 	llvm::Function *rsqrtss = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_rsqrt_ss);
3468 
3469 	Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::getType()))), val.value, 0);
3470 
3471 	return RValue<Float>(Nucleus::createExtractElement(V(jit->builder->CreateCall(rsqrtss, ARGS(V(vector)))), Float::getType(), 0));
3472 }
3473 
rcpps(RValue<Float4> val)3474 RValue<Float4> rcpps(RValue<Float4> val)
3475 {
3476 	llvm::Function *rcpps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_rcp_ps);
3477 
3478 	return RValue<Float4>(V(jit->builder->CreateCall(rcpps, ARGS(V(val.value)))));
3479 }
3480 
sqrtps(RValue<Float4> val)3481 RValue<Float4> sqrtps(RValue<Float4> val)
3482 {
3483 	llvm::Function *sqrtps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::sqrt, { V(val.value)->getType() });
3484 
3485 	return RValue<Float4>(V(jit->builder->CreateCall(sqrtps, ARGS(V(val.value)))));
3486 }
3487 
rsqrtps(RValue<Float4> val)3488 RValue<Float4> rsqrtps(RValue<Float4> val)
3489 {
3490 	llvm::Function *rsqrtps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_rsqrt_ps);
3491 
3492 	return RValue<Float4>(V(jit->builder->CreateCall(rsqrtps, ARGS(V(val.value)))));
3493 }
3494 
maxps(RValue<Float4> x,RValue<Float4> y)3495 RValue<Float4> maxps(RValue<Float4> x, RValue<Float4> y)
3496 {
3497 	llvm::Function *maxps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_max_ps);
3498 
3499 	return RValue<Float4>(V(jit->builder->CreateCall2(maxps, ARGS(V(x.value), V(y.value)))));
3500 }
3501 
minps(RValue<Float4> x,RValue<Float4> y)3502 RValue<Float4> minps(RValue<Float4> x, RValue<Float4> y)
3503 {
3504 	llvm::Function *minps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_min_ps);
3505 
3506 	return RValue<Float4>(V(jit->builder->CreateCall2(minps, ARGS(V(x.value), V(y.value)))));
3507 }
3508 
roundss(RValue<Float> val,unsigned char imm)3509 RValue<Float> roundss(RValue<Float> val, unsigned char imm)
3510 {
3511 	llvm::Function *roundss = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse41_round_ss);
3512 
3513 	Value *undef = V(llvm::UndefValue::get(T(Float4::getType())));
3514 	Value *vector = Nucleus::createInsertElement(undef, val.value, 0);
3515 
3516 	return RValue<Float>(Nucleus::createExtractElement(V(jit->builder->CreateCall3(roundss, ARGS(V(undef), V(vector), V(Nucleus::createConstantInt(imm))))), Float::getType(), 0));
3517 }
3518 
floorss(RValue<Float> val)3519 RValue<Float> floorss(RValue<Float> val)
3520 {
3521 	return roundss(val, 1);
3522 }
3523 
ceilss(RValue<Float> val)3524 RValue<Float> ceilss(RValue<Float> val)
3525 {
3526 	return roundss(val, 2);
3527 }
3528 
roundps(RValue<Float4> val,unsigned char imm)3529 RValue<Float4> roundps(RValue<Float4> val, unsigned char imm)
3530 {
3531 	llvm::Function *roundps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse41_round_ps);
3532 
3533 	return RValue<Float4>(V(jit->builder->CreateCall2(roundps, ARGS(V(val.value), V(Nucleus::createConstantInt(imm))))));
3534 }
3535 
floorps(RValue<Float4> val)3536 RValue<Float4> floorps(RValue<Float4> val)
3537 {
3538 	return roundps(val, 1);
3539 }
3540 
ceilps(RValue<Float4> val)3541 RValue<Float4> ceilps(RValue<Float4> val)
3542 {
3543 	return roundps(val, 2);
3544 }
3545 
pabsd(RValue<Int4> x)3546 RValue<Int4> pabsd(RValue<Int4> x)
3547 {
3548 	return RValue<Int4>(V(lowerPABS(V(x.value))));
3549 }
3550 
paddsw(RValue<Short4> x,RValue<Short4> y)3551 RValue<Short4> paddsw(RValue<Short4> x, RValue<Short4> y)
3552 {
3553 #	if LLVM_VERSION_MAJOR >= 8
3554 	return As<Short4>(V(lowerPSADDSAT(V(x.value), V(y.value))));
3555 #	else
3556 	llvm::Function *paddsw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_padds_w);
3557 
3558 	return As<Short4>(V(jit->builder->CreateCall2(paddsw, ARGS(V(x.value), V(y.value)))));
3559 #	endif
3560 }
3561 
psubsw(RValue<Short4> x,RValue<Short4> y)3562 RValue<Short4> psubsw(RValue<Short4> x, RValue<Short4> y)
3563 {
3564 #	if LLVM_VERSION_MAJOR >= 8
3565 	return As<Short4>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
3566 #	else
3567 	llvm::Function *psubsw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psubs_w);
3568 
3569 	return As<Short4>(V(jit->builder->CreateCall2(psubsw, ARGS(V(x.value), V(y.value)))));
3570 #	endif
3571 }
3572 
paddusw(RValue<UShort4> x,RValue<UShort4> y)3573 RValue<UShort4> paddusw(RValue<UShort4> x, RValue<UShort4> y)
3574 {
3575 #	if LLVM_VERSION_MAJOR >= 8
3576 	return As<UShort4>(V(lowerPUADDSAT(V(x.value), V(y.value))));
3577 #	else
3578 	llvm::Function *paddusw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_paddus_w);
3579 
3580 	return As<UShort4>(V(jit->builder->CreateCall2(paddusw, ARGS(V(x.value), V(y.value)))));
3581 #	endif
3582 }
3583 
psubusw(RValue<UShort4> x,RValue<UShort4> y)3584 RValue<UShort4> psubusw(RValue<UShort4> x, RValue<UShort4> y)
3585 {
3586 #	if LLVM_VERSION_MAJOR >= 8
3587 	return As<UShort4>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
3588 #	else
3589 	llvm::Function *psubusw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psubus_w);
3590 
3591 	return As<UShort4>(V(jit->builder->CreateCall2(psubusw, ARGS(V(x.value), V(y.value)))));
3592 #	endif
3593 }
3594 
paddsb(RValue<SByte8> x,RValue<SByte8> y)3595 RValue<SByte8> paddsb(RValue<SByte8> x, RValue<SByte8> y)
3596 {
3597 #	if LLVM_VERSION_MAJOR >= 8
3598 	return As<SByte8>(V(lowerPSADDSAT(V(x.value), V(y.value))));
3599 #	else
3600 	llvm::Function *paddsb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_padds_b);
3601 
3602 	return As<SByte8>(V(jit->builder->CreateCall2(paddsb, ARGS(V(x.value), V(y.value)))));
3603 #	endif
3604 }
3605 
psubsb(RValue<SByte8> x,RValue<SByte8> y)3606 RValue<SByte8> psubsb(RValue<SByte8> x, RValue<SByte8> y)
3607 {
3608 #	if LLVM_VERSION_MAJOR >= 8
3609 	return As<SByte8>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
3610 #	else
3611 	llvm::Function *psubsb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psubs_b);
3612 
3613 	return As<SByte8>(V(jit->builder->CreateCall2(psubsb, ARGS(V(x.value), V(y.value)))));
3614 #	endif
3615 }
3616 
paddusb(RValue<Byte8> x,RValue<Byte8> y)3617 RValue<Byte8> paddusb(RValue<Byte8> x, RValue<Byte8> y)
3618 {
3619 #	if LLVM_VERSION_MAJOR >= 8
3620 	return As<Byte8>(V(lowerPUADDSAT(V(x.value), V(y.value))));
3621 #	else
3622 	llvm::Function *paddusb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_paddus_b);
3623 
3624 	return As<Byte8>(V(jit->builder->CreateCall2(paddusb, ARGS(V(x.value), V(y.value)))));
3625 #	endif
3626 }
3627 
psubusb(RValue<Byte8> x,RValue<Byte8> y)3628 RValue<Byte8> psubusb(RValue<Byte8> x, RValue<Byte8> y)
3629 {
3630 #	if LLVM_VERSION_MAJOR >= 8
3631 	return As<Byte8>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
3632 #	else
3633 	llvm::Function *psubusb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psubus_b);
3634 
3635 	return As<Byte8>(V(jit->builder->CreateCall2(psubusb, ARGS(V(x.value), V(y.value)))));
3636 #	endif
3637 }
3638 
pavgw(RValue<UShort4> x,RValue<UShort4> y)3639 RValue<UShort4> pavgw(RValue<UShort4> x, RValue<UShort4> y)
3640 {
3641 	return As<UShort4>(V(lowerPAVG(V(x.value), V(y.value))));
3642 }
3643 
pmaxsw(RValue<Short4> x,RValue<Short4> y)3644 RValue<Short4> pmaxsw(RValue<Short4> x, RValue<Short4> y)
3645 {
3646 	return As<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SGT)));
3647 }
3648 
pminsw(RValue<Short4> x,RValue<Short4> y)3649 RValue<Short4> pminsw(RValue<Short4> x, RValue<Short4> y)
3650 {
3651 	return As<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SLT)));
3652 }
3653 
pcmpgtw(RValue<Short4> x,RValue<Short4> y)3654 RValue<Short4> pcmpgtw(RValue<Short4> x, RValue<Short4> y)
3655 {
3656 	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Short4::getType()))));
3657 }
3658 
pcmpeqw(RValue<Short4> x,RValue<Short4> y)3659 RValue<Short4> pcmpeqw(RValue<Short4> x, RValue<Short4> y)
3660 {
3661 	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Short4::getType()))));
3662 }
3663 
pcmpgtb(RValue<SByte8> x,RValue<SByte8> y)3664 RValue<Byte8> pcmpgtb(RValue<SByte8> x, RValue<SByte8> y)
3665 {
3666 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Byte8::getType()))));
3667 }
3668 
pcmpeqb(RValue<Byte8> x,RValue<Byte8> y)3669 RValue<Byte8> pcmpeqb(RValue<Byte8> x, RValue<Byte8> y)
3670 {
3671 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Byte8::getType()))));
3672 }
3673 
packssdw(RValue<Int2> x,RValue<Int2> y)3674 RValue<Short4> packssdw(RValue<Int2> x, RValue<Int2> y)
3675 {
3676 	llvm::Function *packssdw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_packssdw_128);
3677 
3678 	return As<Short4>(V(jit->builder->CreateCall2(packssdw, ARGS(V(x.value), V(y.value)))));
3679 }
3680 
packssdw(RValue<Int4> x,RValue<Int4> y)3681 RValue<Short8> packssdw(RValue<Int4> x, RValue<Int4> y)
3682 {
3683 	llvm::Function *packssdw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_packssdw_128);
3684 
3685 	return RValue<Short8>(V(jit->builder->CreateCall2(packssdw, ARGS(V(x.value), V(y.value)))));
3686 }
3687 
packsswb(RValue<Short4> x,RValue<Short4> y)3688 RValue<SByte8> packsswb(RValue<Short4> x, RValue<Short4> y)
3689 {
3690 	llvm::Function *packsswb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_packsswb_128);
3691 
3692 	return As<SByte8>(V(jit->builder->CreateCall2(packsswb, ARGS(V(x.value), V(y.value)))));
3693 }
3694 
packuswb(RValue<Short4> x,RValue<Short4> y)3695 RValue<Byte8> packuswb(RValue<Short4> x, RValue<Short4> y)
3696 {
3697 	llvm::Function *packuswb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_packuswb_128);
3698 
3699 	return As<Byte8>(V(jit->builder->CreateCall2(packuswb, ARGS(V(x.value), V(y.value)))));
3700 }
3701 
packusdw(RValue<Int4> x,RValue<Int4> y)3702 RValue<UShort8> packusdw(RValue<Int4> x, RValue<Int4> y)
3703 {
3704 	if(CPUID::supportsSSE4_1())
3705 	{
3706 		llvm::Function *packusdw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse41_packusdw);
3707 
3708 		return RValue<UShort8>(V(jit->builder->CreateCall2(packusdw, ARGS(V(x.value), V(y.value)))));
3709 	}
3710 	else
3711 	{
3712 		RValue<Int4> bx = (x & ~(x >> 31)) - Int4(0x8000);
3713 		RValue<Int4> by = (y & ~(y >> 31)) - Int4(0x8000);
3714 
3715 		return As<UShort8>(packssdw(bx, by) + Short8(0x8000u));
3716 	}
3717 }
3718 
psrlw(RValue<UShort4> x,unsigned char y)3719 RValue<UShort4> psrlw(RValue<UShort4> x, unsigned char y)
3720 {
3721 	llvm::Function *psrlw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrli_w);
3722 
3723 	return As<UShort4>(V(jit->builder->CreateCall2(psrlw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3724 }
3725 
psrlw(RValue<UShort8> x,unsigned char y)3726 RValue<UShort8> psrlw(RValue<UShort8> x, unsigned char y)
3727 {
3728 	llvm::Function *psrlw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrli_w);
3729 
3730 	return RValue<UShort8>(V(jit->builder->CreateCall2(psrlw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3731 }
3732 
psraw(RValue<Short4> x,unsigned char y)3733 RValue<Short4> psraw(RValue<Short4> x, unsigned char y)
3734 {
3735 	llvm::Function *psraw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrai_w);
3736 
3737 	return As<Short4>(V(jit->builder->CreateCall2(psraw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3738 }
3739 
psraw(RValue<Short8> x,unsigned char y)3740 RValue<Short8> psraw(RValue<Short8> x, unsigned char y)
3741 {
3742 	llvm::Function *psraw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrai_w);
3743 
3744 	return RValue<Short8>(V(jit->builder->CreateCall2(psraw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3745 }
3746 
psllw(RValue<Short4> x,unsigned char y)3747 RValue<Short4> psllw(RValue<Short4> x, unsigned char y)
3748 {
3749 	llvm::Function *psllw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pslli_w);
3750 
3751 	return As<Short4>(V(jit->builder->CreateCall2(psllw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3752 }
3753 
psllw(RValue<Short8> x,unsigned char y)3754 RValue<Short8> psllw(RValue<Short8> x, unsigned char y)
3755 {
3756 	llvm::Function *psllw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pslli_w);
3757 
3758 	return RValue<Short8>(V(jit->builder->CreateCall2(psllw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3759 }
3760 
pslld(RValue<Int2> x,unsigned char y)3761 RValue<Int2> pslld(RValue<Int2> x, unsigned char y)
3762 {
3763 	llvm::Function *pslld = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pslli_d);
3764 
3765 	return As<Int2>(V(jit->builder->CreateCall2(pslld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3766 }
3767 
pslld(RValue<Int4> x,unsigned char y)3768 RValue<Int4> pslld(RValue<Int4> x, unsigned char y)
3769 {
3770 	llvm::Function *pslld = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pslli_d);
3771 
3772 	return RValue<Int4>(V(jit->builder->CreateCall2(pslld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3773 }
3774 
psrad(RValue<Int2> x,unsigned char y)3775 RValue<Int2> psrad(RValue<Int2> x, unsigned char y)
3776 {
3777 	llvm::Function *psrad = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrai_d);
3778 
3779 	return As<Int2>(V(jit->builder->CreateCall2(psrad, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3780 }
3781 
psrad(RValue<Int4> x,unsigned char y)3782 RValue<Int4> psrad(RValue<Int4> x, unsigned char y)
3783 {
3784 	llvm::Function *psrad = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrai_d);
3785 
3786 	return RValue<Int4>(V(jit->builder->CreateCall2(psrad, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3787 }
3788 
psrld(RValue<UInt2> x,unsigned char y)3789 RValue<UInt2> psrld(RValue<UInt2> x, unsigned char y)
3790 {
3791 	llvm::Function *psrld = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrli_d);
3792 
3793 	return As<UInt2>(V(jit->builder->CreateCall2(psrld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3794 }
3795 
psrld(RValue<UInt4> x,unsigned char y)3796 RValue<UInt4> psrld(RValue<UInt4> x, unsigned char y)
3797 {
3798 	llvm::Function *psrld = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_psrli_d);
3799 
3800 	return RValue<UInt4>(V(jit->builder->CreateCall2(psrld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3801 }
3802 
pmaxsd(RValue<Int4> x,RValue<Int4> y)3803 RValue<Int4> pmaxsd(RValue<Int4> x, RValue<Int4> y)
3804 {
3805 	return RValue<Int4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SGT)));
3806 }
3807 
pminsd(RValue<Int4> x,RValue<Int4> y)3808 RValue<Int4> pminsd(RValue<Int4> x, RValue<Int4> y)
3809 {
3810 	return RValue<Int4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SLT)));
3811 }
3812 
pmaxud(RValue<UInt4> x,RValue<UInt4> y)3813 RValue<UInt4> pmaxud(RValue<UInt4> x, RValue<UInt4> y)
3814 {
3815 	return RValue<UInt4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_UGT)));
3816 }
3817 
pminud(RValue<UInt4> x,RValue<UInt4> y)3818 RValue<UInt4> pminud(RValue<UInt4> x, RValue<UInt4> y)
3819 {
3820 	return RValue<UInt4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_ULT)));
3821 }
3822 
pmulhw(RValue<Short4> x,RValue<Short4> y)3823 RValue<Short4> pmulhw(RValue<Short4> x, RValue<Short4> y)
3824 {
3825 	llvm::Function *pmulhw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmulh_w);
3826 
3827 	return As<Short4>(V(jit->builder->CreateCall2(pmulhw, ARGS(V(x.value), V(y.value)))));
3828 }
3829 
pmulhuw(RValue<UShort4> x,RValue<UShort4> y)3830 RValue<UShort4> pmulhuw(RValue<UShort4> x, RValue<UShort4> y)
3831 {
3832 	llvm::Function *pmulhuw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmulhu_w);
3833 
3834 	return As<UShort4>(V(jit->builder->CreateCall2(pmulhuw, ARGS(V(x.value), V(y.value)))));
3835 }
3836 
pmaddwd(RValue<Short4> x,RValue<Short4> y)3837 RValue<Int2> pmaddwd(RValue<Short4> x, RValue<Short4> y)
3838 {
3839 	llvm::Function *pmaddwd = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmadd_wd);
3840 
3841 	return As<Int2>(V(jit->builder->CreateCall2(pmaddwd, ARGS(V(x.value), V(y.value)))));
3842 }
3843 
pmulhw(RValue<Short8> x,RValue<Short8> y)3844 RValue<Short8> pmulhw(RValue<Short8> x, RValue<Short8> y)
3845 {
3846 	llvm::Function *pmulhw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmulh_w);
3847 
3848 	return RValue<Short8>(V(jit->builder->CreateCall2(pmulhw, ARGS(V(x.value), V(y.value)))));
3849 }
3850 
pmulhuw(RValue<UShort8> x,RValue<UShort8> y)3851 RValue<UShort8> pmulhuw(RValue<UShort8> x, RValue<UShort8> y)
3852 {
3853 	llvm::Function *pmulhuw = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmulhu_w);
3854 
3855 	return RValue<UShort8>(V(jit->builder->CreateCall2(pmulhuw, ARGS(V(x.value), V(y.value)))));
3856 }
3857 
pmaddwd(RValue<Short8> x,RValue<Short8> y)3858 RValue<Int4> pmaddwd(RValue<Short8> x, RValue<Short8> y)
3859 {
3860 	llvm::Function *pmaddwd = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmadd_wd);
3861 
3862 	return RValue<Int4>(V(jit->builder->CreateCall2(pmaddwd, ARGS(V(x.value), V(y.value)))));
3863 }
3864 
movmskps(RValue<Float4> x)3865 RValue<Int> movmskps(RValue<Float4> x)
3866 {
3867 	llvm::Function *movmskps = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse_movmsk_ps);
3868 
3869 	return RValue<Int>(V(jit->builder->CreateCall(movmskps, ARGS(V(x.value)))));
3870 }
3871 
pmovmskb(RValue<Byte8> x)3872 RValue<Int> pmovmskb(RValue<Byte8> x)
3873 {
3874 	llvm::Function *pmovmskb = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse2_pmovmskb_128);
3875 
3876 	return RValue<Int>(V(jit->builder->CreateCall(pmovmskb, ARGS(V(x.value))))) & 0xFF;
3877 }
3878 
pmovzxbd(RValue<Byte16> x)3879 RValue<Int4> pmovzxbd(RValue<Byte16> x)
3880 {
3881 	return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), false)));
3882 }
3883 
pmovsxbd(RValue<SByte16> x)3884 RValue<Int4> pmovsxbd(RValue<SByte16> x)
3885 {
3886 	return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), true)));
3887 }
3888 
pmovzxwd(RValue<UShort8> x)3889 RValue<Int4> pmovzxwd(RValue<UShort8> x)
3890 {
3891 	return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), false)));
3892 }
3893 
pmovsxwd(RValue<Short8> x)3894 RValue<Int4> pmovsxwd(RValue<Short8> x)
3895 {
3896 	return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), true)));
3897 }
3898 
3899 }  // namespace x86
3900 #endif  // defined(__i386__) || defined(__x86_64__)
3901 
3902 #ifdef ENABLE_RR_PRINT
VPrintf(const std::vector<Value * > & vals)3903 void VPrintf(const std::vector<Value *> &vals)
3904 {
3905 	auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
3906 	auto i8PtrTy = ::llvm::Type::getInt8PtrTy(jit->context);
3907 	auto funcTy = ::llvm::FunctionType::get(i32Ty, { i8PtrTy }, true);
3908 	auto func = jit->module->getOrInsertFunction("printf", funcTy);
3909 	jit->builder->CreateCall(func, V(vals));
3910 }
3911 #endif  // ENABLE_RR_PRINT
3912 
Nop()3913 void Nop()
3914 {
3915 	auto voidTy = ::llvm::Type::getVoidTy(jit->context);
3916 	auto funcTy = ::llvm::FunctionType::get(voidTy, {}, false);
3917 	auto func = jit->module->getOrInsertFunction("nop", funcTy);
3918 	jit->builder->CreateCall(func);
3919 }
3920 
// Forwards to DebugInfo::EmitLocation() when debug info is compiled in and a
// DebugInfo instance exists; otherwise does nothing.
void EmitDebugLocation()
{
#ifdef ENABLE_RR_DEBUG_INFO
	if(jit->debugInfo == nullptr)
	{
		return;
	}

	jit->debugInfo->EmitLocation();
#endif  // ENABLE_RR_DEBUG_INFO
}
3930 
EmitDebugVariable(Value * value)3931 void EmitDebugVariable(Value *value)
3932 {
3933 #ifdef ENABLE_RR_DEBUG_INFO
3934 	if(jit->debugInfo != nullptr)
3935 	{
3936 		jit->debugInfo->EmitVariable(value);
3937 	}
3938 #endif  // ENABLE_RR_DEBUG_INFO
3939 }
3940 
// Forwards to DebugInfo::Flush() when debug info is compiled in and a
// DebugInfo instance exists; otherwise does nothing.
void FlushDebug()
{
#ifdef ENABLE_RR_DEBUG_INFO
	if(jit->debugInfo == nullptr)
	{
		return;
	}

	jit->debugInfo->Flush();
#endif  // ENABLE_RR_DEBUG_INFO
}
3950 
3951 }  // namespace rr
3952 
3953 // ------------------------------  Coroutines ------------------------------
3954 
3955 namespace {
3956 
// Magic values returned by llvm.coro.suspend.
// See: https://llvm.org/docs/Coroutines.html#llvm-coro-suspend-intrinsic
// The generated code switches on the returned value to pick the next block.
enum SuspendAction
{
	SuspendActionSuspend = -1,  // Handled by the switch default: branch to the suspend block.
	SuspendActionResume = 0,    // The coroutine was resumed.
	SuspendActionDestroy = 1    // The coroutine is being destroyed: branch to the destroy block.
};
3965 
// Turns the function currently being built (jit->function) into an LLVM
// coroutine. Must only be called while jit->coroutine.id is still null.
//
// This:
//  - emits the bodies of the coroutine_await() and coroutine_destroy() helper
//    functions (jit->coroutine.await / jit->coroutine.destroy),
//  - inserts the coroutine prologue (promise alloca, llvm.coro.id, frame
//    allocation, llvm.coro.begin) at the start of the entry block,
//  - creates and fills the "suspend", "end" and "destroy" blocks,
//  - records the promise, id, handle and blocks in jit->coroutine.
//
// The builder's insertion point is saved on entry and restored before
// returning, so the caller continues emitting code where it left off.
void promoteFunctionToCoroutine()
{
	ASSERT(jit->coroutine.id == nullptr);

	// Types
	auto voidTy = ::llvm::Type::getVoidTy(jit->context);
	auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
	auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
	auto i32Ty = ::llvm::Type::getInt32Ty(jit->context);
	auto i8PtrTy = ::llvm::Type::getInt8PtrTy(jit->context);
	auto promiseTy = jit->coroutine.yieldType;
	auto promisePtrTy = promiseTy->getPointerTo();

	// LLVM intrinsics
	auto coro_id = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_id);
	auto coro_size = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_size, { i32Ty });
	auto coro_begin = ::llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_begin);
	auto coro_resume = ::llvm::Intrinsic::getDeclaration(jit->module.get(), ::llvm::Intrinsic::coro_resume);
	auto coro_end = ::llvm::Intrinsic::getDeclaration(jit->module.get(), ::llvm::Intrinsic::coro_end);
	auto coro_free = ::llvm::Intrinsic::getDeclaration(jit->module.get(), ::llvm::Intrinsic::coro_free);
	auto coro_destroy = ::llvm::Intrinsic::getDeclaration(jit->module.get(), ::llvm::Intrinsic::coro_destroy);
	auto coro_promise = ::llvm::Intrinsic::getDeclaration(jit->module.get(), ::llvm::Intrinsic::coro_promise);
	auto coro_done = ::llvm::Intrinsic::getDeclaration(jit->module.get(), ::llvm::Intrinsic::coro_done);
	auto coro_suspend = ::llvm::Intrinsic::getDeclaration(jit->module.get(), ::llvm::Intrinsic::coro_suspend);

	// External frame allocation functions, referenced by name:
	//    i8* coroutine_alloc_frame(i32)
	//    void coroutine_free_frame(i8*)
	auto allocFrameTy = ::llvm::FunctionType::get(i8PtrTy, { i32Ty }, false);
	auto allocFrame = jit->module->getOrInsertFunction("coroutine_alloc_frame", allocFrameTy);
	auto freeFrameTy = ::llvm::FunctionType::get(voidTy, { i8PtrTy }, false);
	auto freeFrame = jit->module->getOrInsertFunction("coroutine_free_frame", freeFrameTy);

	// Remember where the caller was emitting code; restored at the end.
	auto oldInsertionPoint = jit->builder->saveIP();

	// Build the coroutine_await() function:
	//
	//    bool coroutine_await(CoroutineHandle* handle, YieldType* out)
	//    {
	//        if(llvm.coro.done(handle))
	//        {
	//            return false;
	//        }
	//        else
	//        {
	//            *out = *(YieldType*)llvm.coro.promise(handle);
	//            llvm.coro.resume(handle);
	//            return true;
	//        }
	//    }
	//
	{
		auto args = jit->coroutine.await->arg_begin();
		auto handle = args++;
		auto outPtr = args++;
		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(jit->context, "co_await", jit->coroutine.await));
		auto doneBlock = llvm::BasicBlock::Create(jit->context, "done", jit->coroutine.await);
		auto resumeBlock = llvm::BasicBlock::Create(jit->context, "resume", jit->coroutine.await);

		auto done = jit->builder->CreateCall(coro_done, { handle }, "done");
		jit->builder->CreateCondBr(done, doneBlock, resumeBlock);

		jit->builder->SetInsertPoint(doneBlock);
		jit->builder->CreateRet(::llvm::ConstantInt::getFalse(i1Ty));

		jit->builder->SetInsertPoint(resumeBlock);
		auto promiseAlignment = ::llvm::ConstantInt::get(i32Ty, 4);  // TODO: Get correct alignment.
		auto promisePtr = jit->builder->CreateCall(coro_promise, { handle, promiseAlignment, ::llvm::ConstantInt::get(i1Ty, 0) });
		// Copy the promise out before resuming the coroutine.
		auto promise = jit->builder->CreateLoad(jit->builder->CreatePointerCast(promisePtr, promisePtrTy));
		jit->builder->CreateStore(promise, outPtr);
		jit->builder->CreateCall(coro_resume, { handle });
		jit->builder->CreateRet(::llvm::ConstantInt::getTrue(i1Ty));
	}

	// Build the coroutine_destroy() function:
	//
	//    void coroutine_destroy(CoroutineHandle* handle)
	//    {
	//        llvm.coro.destroy(handle);
	//    }
	//
	{
		auto handle = jit->coroutine.destroy->arg_begin();
		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(jit->context, "", jit->coroutine.destroy));
		jit->builder->CreateCall(coro_destroy, { handle });
		jit->builder->CreateRetVoid();
	}

	// Begin building the main coroutine_begin() function.
	//
	//    CoroutineHandle* coroutine_begin(<Arguments>)
	//    {
	//        YieldType promise;
	//        auto id = llvm.coro.id(0, &promise, nullptr, nullptr);
	//        void* frame = coroutine_alloc_frame(llvm.coro.size.i32());
	//        CoroutineHandle *handle = llvm.coro.begin(id, frame);
	//
	//        ... <REACTOR CODE> ...
	//
	//    end:
	//        SuspendAction action = llvm.coro.suspend(none, true /* final */);  // <-- RESUME POINT
	//        switch(action)
	//        {
	//        case SuspendActionResume:
	//            UNREACHABLE(); // Illegal to resume after final suspend.
	//        case SuspendActionDestroy:
	//            goto destroy;
	//        default: // (SuspendActionSuspend)
	//            goto suspend;
	//        }
	//
	//    destroy:
	//        coroutine_free_frame(llvm.coro.free(id, handle));
	//        goto suspend;
	//
	//    suspend:
	//        llvm.coro.end(handle, false);
	//        return handle;
	//    }
	//

#ifdef ENABLE_RR_DEBUG_INFO
	jit->debugInfo = std::make_unique<rr::DebugInfo>(jit->builder.get(), &jit->context, jit->module.get(), jit->function);
#endif  // ENABLE_RR_DEBUG_INFO

	jit->coroutine.suspendBlock = llvm::BasicBlock::Create(jit->context, "suspend", jit->function);
	jit->coroutine.endBlock = llvm::BasicBlock::Create(jit->context, "end", jit->function);
	jit->coroutine.destroyBlock = llvm::BasicBlock::Create(jit->context, "destroy", jit->function);

	// Emit the coroutine prologue at the very start of the entry block, ahead
	// of any code already generated there.
	jit->builder->SetInsertPoint(jit->coroutine.entryBlock, jit->coroutine.entryBlock->begin());
	jit->coroutine.promise = jit->builder->CreateAlloca(promiseTy, nullptr, "promise");
	jit->coroutine.id = jit->builder->CreateCall(coro_id, {
	                                                          ::llvm::ConstantInt::get(i32Ty, 0),
	                                                          jit->builder->CreatePointerCast(jit->coroutine.promise, i8PtrTy),
	                                                          ::llvm::ConstantPointerNull::get(i8PtrTy),
	                                                          ::llvm::ConstantPointerNull::get(i8PtrTy),
	                                                      });
	auto size = jit->builder->CreateCall(coro_size, {});
	auto frame = jit->builder->CreateCall(allocFrame, { size });
	jit->coroutine.handle = jit->builder->CreateCall(coro_begin, { jit->coroutine.id, frame });

	// Build the suspend block
	jit->builder->SetInsertPoint(jit->coroutine.suspendBlock);
	jit->builder->CreateCall(coro_end, { jit->coroutine.handle, ::llvm::ConstantInt::get(i1Ty, 0) });
	jit->builder->CreateRet(jit->coroutine.handle);

	// Build the end block
	jit->builder->SetInsertPoint(jit->coroutine.endBlock);
	auto action = jit->builder->CreateCall(coro_suspend, {
	                                                         ::llvm::ConstantTokenNone::get(jit->context),
	                                                         ::llvm::ConstantInt::get(i1Ty, 1),  // final: true
	                                                     });
	// The switch defaults to the suspend block; only the destroy case is added explicitly.
	auto switch_ = jit->builder->CreateSwitch(action, jit->coroutine.suspendBlock, 3);
	// switch_->addCase(::llvm::ConstantInt::get(i8Ty, SuspendActionResume), trapBlock); // TODO: Trap attempting to resume after final suspend
	switch_->addCase(::llvm::ConstantInt::get(i8Ty, SuspendActionDestroy), jit->coroutine.destroyBlock);

	// Build the destroy block
	jit->builder->SetInsertPoint(jit->coroutine.destroyBlock);
	auto memory = jit->builder->CreateCall(coro_free, { jit->coroutine.id, jit->coroutine.handle });
	jit->builder->CreateCall(freeFrame, { memory });
	jit->builder->CreateBr(jit->coroutine.suspendBlock);

	// Switch back to original insert point to continue building the coroutine.
	jit->builder->restoreIP(oldInsertionPoint);
}
4128 
4129 }  // anonymous namespace
4130 
4131 namespace rr {
4132 
createCoroutine(Type * YieldType,const std::vector<Type * > & Params)4133 void Nucleus::createCoroutine(Type *YieldType, const std::vector<Type *> &Params)
4134 {
4135 	// Coroutines are initially created as a regular function.
4136 	// Upon the first call to Yield(), the function is promoted to a true
4137 	// coroutine.
4138 	auto voidTy = ::llvm::Type::getVoidTy(jit->context);
4139 	auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
4140 	auto i8PtrTy = ::llvm::Type::getInt8PtrTy(jit->context);
4141 	auto handleTy = i8PtrTy;
4142 	auto boolTy = i1Ty;
4143 	auto promiseTy = T(YieldType);
4144 	auto promisePtrTy = promiseTy->getPointerTo();
4145 
4146 	jit->function = rr::createFunction("coroutine_begin", handleTy, T(Params));
4147 	jit->coroutine.await = rr::createFunction("coroutine_await", boolTy, { handleTy, promisePtrTy });
4148 	jit->coroutine.destroy = rr::createFunction("coroutine_destroy", voidTy, { handleTy });
4149 	jit->coroutine.yieldType = promiseTy;
4150 	jit->coroutine.entryBlock = llvm::BasicBlock::Create(jit->context, "function", jit->function);
4151 
4152 	jit->builder->SetInsertPoint(jit->coroutine.entryBlock);
4153 }
4154 
yield(Value * val)4155 void Nucleus::yield(Value *val)
4156 {
4157 	if(jit->coroutine.id == nullptr)
4158 	{
4159 		// First call to yield().
4160 		// Promote the function to a full coroutine.
4161 		promoteFunctionToCoroutine();
4162 		ASSERT(jit->coroutine.id != nullptr);
4163 	}
4164 
4165 	//      promise = val;
4166 	//
4167 	//      auto action = llvm.coro.suspend(none, false /* final */); // <-- RESUME POINT
4168 	//      switch(action)
4169 	//      {
4170 	//      case SuspendActionResume:
4171 	//          goto resume;
4172 	//      case SuspendActionDestroy:
4173 	//          goto destroy;
4174 	//      default: // (SuspendActionSuspend)
4175 	//          goto suspend;
4176 	//      }
4177 	//  resume:
4178 	//
4179 
4180 	RR_DEBUG_INFO_UPDATE_LOC();
4181 	Variable::materializeAll();
4182 
4183 	// Types
4184 	auto i1Ty = ::llvm::Type::getInt1Ty(jit->context);
4185 	auto i8Ty = ::llvm::Type::getInt8Ty(jit->context);
4186 
4187 	// Intrinsics
4188 	auto coro_suspend = ::llvm::Intrinsic::getDeclaration(jit->module.get(), ::llvm::Intrinsic::coro_suspend);
4189 
4190 	// Create a block to resume execution.
4191 	auto resumeBlock = llvm::BasicBlock::Create(jit->context, "resume", jit->function);
4192 
4193 	// Store the promise (yield value)
4194 	jit->builder->CreateStore(V(val), jit->coroutine.promise);
4195 	auto action = jit->builder->CreateCall(coro_suspend, {
4196 	                                                         ::llvm::ConstantTokenNone::get(jit->context),
4197 	                                                         ::llvm::ConstantInt::get(i1Ty, 0),  // final: true
4198 	                                                     });
4199 	auto switch_ = jit->builder->CreateSwitch(action, jit->coroutine.suspendBlock, 3);
4200 	switch_->addCase(::llvm::ConstantInt::get(i8Ty, SuspendActionResume), resumeBlock);
4201 	switch_->addCase(::llvm::ConstantInt::get(i8Ty, SuspendActionDestroy), jit->coroutine.destroyBlock);
4202 
4203 	// Continue building in the resume block.
4204 	jit->builder->SetInsertPoint(resumeBlock);
4205 }
4206 
acquireCoroutine(const char * name,const Config::Edit & cfgEdit)4207 std::shared_ptr<Routine> Nucleus::acquireCoroutine(const char *name, const Config::Edit &cfgEdit /* = Config::Edit::None */)
4208 {
4209 	bool isCoroutine = jit->coroutine.id != nullptr;
4210 	if(isCoroutine)
4211 	{
4212 		jit->builder->CreateBr(jit->coroutine.endBlock);
4213 	}
4214 	else
4215 	{
4216 		// Coroutine without a Yield acts as a regular function.
4217 		// The 'coroutine_begin' function returns a nullptr for the coroutine
4218 		// handle.
4219 		jit->builder->CreateRet(llvm::Constant::getNullValue(jit->function->getReturnType()));
4220 		// The 'coroutine_await' function always returns false (coroutine done).
4221 		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(jit->context, "", jit->coroutine.await));
4222 		jit->builder->CreateRet(llvm::Constant::getNullValue(jit->coroutine.await->getReturnType()));
4223 		// The 'coroutine_destroy' does nothing, returns void.
4224 		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(jit->context, "", jit->coroutine.destroy));
4225 		jit->builder->CreateRetVoid();
4226 	}
4227 
4228 #ifdef ENABLE_RR_DEBUG_INFO
4229 	if(jit->debugInfo != nullptr)
4230 	{
4231 		jit->debugInfo->Finalize();
4232 	}
4233 #endif  // ENABLE_RR_DEBUG_INFO
4234 
4235 	if(false)
4236 	{
4237 		std::error_code error;
4238 		llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-unopt.txt", error);
4239 		jit->module->print(file, 0);
4240 	}
4241 
4242 	if(isCoroutine)
4243 	{
4244 		// Run manadory coroutine transforms.
4245 		llvm::legacy::PassManager pm;
4246 
4247 #if LLVM_VERSION_MAJOR >= 9
4248 		pm.add(llvm::createCoroEarlyLegacyPass());
4249 		pm.add(llvm::createCoroSplitLegacyPass());
4250 		pm.add(llvm::createCoroElideLegacyPass());
4251 		pm.add(llvm::createBarrierNoopPass());
4252 		pm.add(llvm::createCoroCleanupLegacyPass());
4253 #else
4254 		pm.add(llvm::createCoroEarlyPass());
4255 		pm.add(llvm::createCoroSplitPass());
4256 		pm.add(llvm::createCoroElidePass());
4257 		pm.add(llvm::createBarrierNoopPass());
4258 		pm.add(llvm::createCoroCleanupPass());
4259 #endif
4260 
4261 		pm.run(*jit->module);
4262 	}
4263 
4264 #if defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
4265 	{
4266 		llvm::legacy::PassManager pm;
4267 		pm.add(llvm::createVerifierPass());
4268 		pm.run(*jit->module);
4269 	}
4270 #endif  // defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
4271 
4272 	auto cfg = cfgEdit.apply(jit->config);
4273 	jit->optimize(cfg);
4274 
4275 	if(false)
4276 	{
4277 		std::error_code error;
4278 		llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-opt.txt", error);
4279 		jit->module->print(file, 0);
4280 	}
4281 
4282 	llvm::Function *funcs[Nucleus::CoroutineEntryCount];
4283 	funcs[Nucleus::CoroutineEntryBegin] = jit->function;
4284 	funcs[Nucleus::CoroutineEntryAwait] = jit->coroutine.await;
4285 	funcs[Nucleus::CoroutineEntryDestroy] = jit->coroutine.destroy;
4286 	auto routine = jit->acquireRoutine(funcs, Nucleus::CoroutineEntryCount, cfg);
4287 	jit.reset();
4288 
4289 	return routine;
4290 }
4291 
invokeCoroutineBegin(Routine & routine,std::function<Nucleus::CoroutineHandle ()> func)4292 Nucleus::CoroutineHandle Nucleus::invokeCoroutineBegin(Routine &routine, std::function<Nucleus::CoroutineHandle()> func)
4293 {
4294 	return func();
4295 }
4296 
4297 }  // namespace rr
4298