1 /*========================== begin_copyright_notice ============================
2
3 Copyright (C) 2021 Intel Corporation
4
5 SPDX-License-Identifier: MIT
6
7 ============================= end_copyright_notice ===========================*/
8
9 #include <cm-cl/math.h>
10 #include <cm-cl/vector.h>
11 using namespace cm;
12
13 namespace details {
14
15 template <unsigned N>
16 static CM_NODEBUG CM_INLINE vector<uint64_t, N>
__impl_combineLoHi(vector<uint32_t,N> Lo,vector<uint32_t,N> Hi)17 __impl_combineLoHi(vector<uint32_t, N> Lo, vector<uint32_t, N> Hi) {
18 vector<uint32_t, 2 * N> Res;
19 Res.template select<N, 2>(1) = Hi;
20 Res.template select<N, 2>(0) = Lo;
21 return Res.template format<uint64_t>();
22 }
23
24 template <unsigned N>
25 CM_NODEBUG CM_INLINE vector<double, N>
__impl_ui2fp__double__(vector<uint64_t,N> a)26 __impl_ui2fp__double__(vector<uint64_t, N> a) {
27 const vector<uint32_t, N> Zero(0);
28 const vector<uint32_t, N> Ones(0xffffffff);
29 const vector<uint32_t, N> One(1);
30
31 vector<uint32_t, 2 *N> LoHi = a.template format<uint32_t>();
32 vector<uint32_t, N> Lo = LoHi.template select<N, 2>(0);
33 vector<uint32_t, N> Hi = LoHi.template select<N, 2>(1);
34 // TODO : replace with clz for 64 bit
35 vector<uint32_t, N> LZHi = cm::math::count_leading_zeros(Hi);
36 vector<uint32_t, N> LZLo = cm::math::count_leading_zeros(Lo);
37 auto ZeroHi = Hi == Zero;
38 vector<uint32_t, N> LZ = LZHi;
39 LZ.merge(LZ + LZLo, ZeroHi);
40
41 // we need to get that nice first set bit into bit position 51.
42 // thus we shift our nice pair of values by 63 - 51 - clz,
43 // uint8_t shift = 12 - lz;
44 // shift hidden bit too (+1)
45
46 // 64bit Shift - we rely on compiler emulation there
47 vector<int32_t, N> Shift = LZ - vector<int32_t, N>(11);
48 vector<uint64_t, N> ToShift = __impl_combineLoHi<N>(Lo, Hi);
49 vector<uint64_t, N> Shifted64 = ToShift << Shift;
50 auto IsRightShift = Shift < vector<int32_t, N>(0);
51 Shifted64.merge(ToShift >> -Shift, IsRightShift);
52
53 vector<uint32_t, 2 *N> Shifted = Shifted64.template format<uint32_t>();
54 vector<uint32_t, N> LoMant = Shifted.template select<N, 2>(0);
55 vector<uint32_t, N> HiMant = Shifted.template select<N, 2>(1);
56 // delete hidden bit
57 HiMant = HiMant & ~(1u << 20);
58
59 // calculate RS
60 vector<uint32_t, N> RMask = (One << (10 - LZ));
61 vector<uint32_t, N> R = (RMask & Lo) >> (10 - LZ);
62 auto NoR = LZ > vector<uint32_t, N>(10);
63 R.merge(Zero, NoR);
64
65 vector<uint32_t, N> SMask = RMask - 1;
66 vector<uint32_t, N> S = Zero;
67 vector<uint32_t, N> AfterR = Lo & SMask;
68 auto ZeroRem = AfterR == Zero;
69 S.merge(One, ~ZeroRem);
70 auto NoS = LZ > vector<uint32_t, N>(9);
71 S.merge(Zero, NoS);
72
73 // R is set but no S, round to even.
74 // Mant + R
75 auto AddC = cm::math::add_with_carry(LoMant, R);
76 LoMant = AddC.first;
77 vector<uint32_t, N> CB = AddC.second;
78 HiMant = HiMant + CB;
79
80 LoMant &= ~(~S & R);
81
82 vector<uint32_t, N> Exp = vector<uint32_t, N>(1086) - LZ;
83 Exp.merge(Zero, LZ == vector<uint32_t, N>(64));
84 vector<uint32_t, N> HiRes = Exp << vector<uint32_t, N>(20);
85 HiRes += HiMant;
86 vector<uint32_t, N> LoRes = LoMant;
87
88 vector<int64_t, N> Result = __impl_combineLoHi<N>(LoRes, HiRes);
89
90 return Result.template format<double>();
91 };
92
93 template <unsigned N>
__impl_ui2fp__(vector<uint64_t,N> a)94 CM_NODEBUG CM_INLINE vector<float, N> __impl_ui2fp__(vector<uint64_t, N> a) {
95 const vector<uint32_t, N> Zero(0);
96 const vector<uint32_t, N> Ones(0xffffffff);
97 const vector<uint32_t, N> One(1);
98
99 vector<uint32_t, 2 *N> LoHi = a.template format<uint32_t>();
100
101 vector<uint32_t, N> Lo = LoHi.template select<N, 2>(0);
102 vector<uint32_t, N> Hi = LoHi.template select<N, 2>(1);
103 vector<uint32_t, N> LZ = cm::math::count_leading_zeros(Hi);
104
105 // we need to get that nice first set bit into bit position 23.
106 // thus we shift our nice pair of values by 63 - 23 - clz,
107 // some bits will be dropped by shift thus we'll add 1 bits as R bit.
108 // uint8_t shift = 39 - lz;
109
110 vector<uint32_t, N> DroppedBits = vector<uint32_t, N>(39) - LZ;
111 // SI
112 vector<uint32_t, N> Sha = DroppedBits & vector<uint32_t, N>(0x3f);
113 vector<uint32_t, N> Vec32 = vector<int32_t, N>(32);
114 vector<uint32_t, N> Sh32 = Vec32 - Sha;
115 auto Flag_large_shift = (Sha >= Vec32);
116 auto Flag_zero_shift = (Sha == Zero);
117 vector<uint32_t, N> Mask1 = Ones;
118 Mask1.merge(Zero, Flag_large_shift);
119 vector<uint32_t, N> Mask0 = Ones;
120 Mask0.merge(Zero, Flag_zero_shift);
121
122 // partial shift
123 vector<uint32_t, N> TmpH1 = ((Hi & Mask0) << Sh32) & Mask1;
124 vector<uint32_t, N> TmpH2 = (Hi >> (Sha - Vec32)) & ~Mask1;
125 vector<uint32_t, N> TmpL = (Lo >> Sha) & Mask1;
126 vector<uint32_t, N> Mant = TmpL | TmpH1 | TmpH2;
127
128 vector<uint32_t, N> TmpSha = One << (-Sh32);
129 vector<uint32_t, N> TmpMask = TmpSha - One;
130 vector<uint32_t, N> StickyH = Hi & ~Mask1;
131 StickyH = StickyH & TmpMask;
132
133 // calculate RS
134 vector<uint32_t, N> L1 = Lo & ~Mask1;
135 vector<uint32_t, N> L2 = Lo & (Mask1 >> Sh32);
136 vector<uint32_t, N> StickyL = L1 | L2;
137 vector<uint32_t, N> S1 = StickyH | StickyL;
138 auto S = S1 == Zero;
139 vector<uint32_t, N> NotS = Zero;
140 NotS.merge(Ones, S);
141
142 // R is set but no S, round to even.
143 vector<uint32_t, N> R = Mant & One;
144 Mant = (Mant + One) >> One;
145 Mant &= ~(NotS & R);
146
147 vector<uint32_t, N> Exp = vector<uint32_t, N>(0xbd) - LZ;
148 vector<uint32_t, N> ResL = Exp << vector<uint32_t, N>(23);
149 ResL += Mant;
150
151 vector<float, N> ResultLarge = ResL.template format<float>();
152 vector<float, N> ResultSmall = Lo;
153
154 auto IsSmallPred = Hi == Zero;
155
156 vector<float, N> Result = ResultLarge;
157 Result.merge(ResultSmall, IsSmallPred);
158
159 return Result;
160 }
161
162 template <unsigned N>
163 CM_NODEBUG CM_INLINE vector<half, N>
__impl_ui2fp__half__(vector<uint64_t,N> a)164 __impl_ui2fp__half__(vector<uint64_t, N> a) {
165 const vector<uint32_t, N> Zero(0);
166 const vector<uint32_t, N> Ones(0xffffffff);
167
168 vector<uint32_t, 2 *N> LoHi = a.template format<uint32_t>();
169 vector<uint32_t, N> Lo = LoHi.template select<N, 2>(0);
170 vector<uint32_t, N> Hi = LoHi.template select<N, 2>(1);
171 // max half value is 65504 (0xffe0)
172 // so we can use only Low Part
173 vector<half, N> Res = Lo;
174 // NoZeroHi - should be overflow
175 auto NoZeroHi = Hi != Zero;
176 Res.merge(vector<half, N>(Ones), NoZeroHi);
177 return Res;
178 };
179
180 template <typename T, unsigned N> class __impl_ui2fp_runner {};
181
182 template <unsigned N> class __impl_ui2fp_runner<float, N> {
183 public:
run(vector<uint64_t,N> arg)184 vector<float, N> run(vector<uint64_t, N> arg) {
185 return __impl_ui2fp__<N>(arg);
186 }
187 };
188 template <unsigned N> class __impl_ui2fp_runner<double, N> {
189 public:
run(vector<uint64_t,N> arg)190 vector<double, N> run(vector<uint64_t, N> arg) {
191 return __impl_ui2fp__double__<N>(arg);
192 }
193 };
194 template <unsigned N> class __impl_ui2fp_runner<half, N> {
195 public:
run(vector<uint64_t,N> arg)196 vector<half, N> run(vector<uint64_t, N> arg) {
197 return __impl_ui2fp__half__<N>(arg);
198 }
199 };
200
201 template <typename T, unsigned N>
__impl_si2fp__(vector<uint64_t,N> a)202 CM_NODEBUG CM_INLINE vector<float, N> __impl_si2fp__(vector<uint64_t, N> a) {
203 const vector<uint32_t, N> Zero(0);
204
205 // NOTE: SIToFP is special, since it does not do the convert by itself,
206 // Instead it just creates a sequence of 64.bit operations which
207 // are then expanded. As such some type convertion trickery is involved.
208 vector<uint32_t, 2 *N> LoHi = a.template format<uint32_t>();
209 vector<uint32_t, N> Lo = LoHi.template select<N, 2>(0);
210 vector<uint32_t, N> Hi = LoHi.template select<N, 2>(1);
211 vector<uint32_t, N> SB = Hi & vector<uint32_t, N>(1u << 31);
212 auto IsSignZero = SB == Zero;
213 vector<uint64_t, N> b = -a;
214 b.merge(a, IsSignZero);
215 auto Res = __impl_ui2fp_runner<T, N>().run(b);
216 Res.merge(-Res, ~IsSignZero);
217 return Res;
218 }
219
220 template <unsigned N, bool isSigned>
221 CM_NODEBUG CM_INLINE vector<uint64_t, N>
__impl_fp2ui__double__(vector<double,N> a)222 __impl_fp2ui__double__(vector<double, N> a) {
223 // vector of floats -> vector of ints
224 vector<uint32_t, 2 *N> LoHi = a.template format<uint32_t>();
225 const vector<uint32_t, N> MantissaMask((1u << 20) - 1);
226 const vector<uint32_t, N> ExpMask(0x7ff);
227 const vector<uint32_t, N> Zero(0);
228 const vector<uint32_t, N> Ones(0xffffffff);
229 const vector<uint32_t, N> One(1);
230 vector<uint32_t, N> Lo = LoHi.template select<N, 2>(0);
231 vector<uint32_t, N> Hi = LoHi.template select<N, 2>(1);
232 vector<uint32_t, N> Exp = (Hi >> 20) & ExpMask;
233 // mantissa without hidden bit
234 vector<uint32_t, N> LoMant = Lo;
235 vector<uint32_t, N> HiMant = Hi & MantissaMask;
236 // for normalized numbers (1 + mant/2^52) * 2 ^ (mant-1023)
237 vector<int32_t, N> MantShift = Exp - 1023 - 52;
238 vector<int32_t, N> OneShift = Exp - 1023;
239 auto IsNormalized = Exp == Zero;
240 // for denormalized numbers (0 + mant/2^52) * 2 ^ (1-1023)
241 MantShift.merge(One - 1023 + 1, IsNormalized);
242 OneShift.merge(Ones, IsNormalized);
243
244 // 64bit Shift - we rely on compiler emulation there
245 vector<uint64_t, N> ToShift = __impl_combineLoHi<N>(LoMant, HiMant);
246 vector<uint64_t, N> Shifted64 = ToShift << MantShift;
247 auto IsRightShift = MantShift < vector<int32_t, N>(0);
248 Shifted64.merge(ToShift >> -MantShift, IsRightShift);
249 auto IsShiftGT64 = (MantShift >= vector<int32_t, N>(64)) |
250 (MantShift <= vector<int32_t, N>(-64));
251 Shifted64.merge(vector<uint64_t, N>(0), IsShiftGT64);
252 vector<uint32_t, 2 *N> Shifted = Shifted64.template format<uint32_t>();
253 vector<uint32_t, N> LoRes = Shifted.template select<N, 2>(0);
254 vector<uint32_t, N> HiRes = Shifted.template select<N, 2>(1);
255
256 // add hidden One
257 vector<uint32_t, N> OneInLo = One << OneShift;
258 auto IsOneInLo =
259 (OneShift < vector<int32_t, N>(32)) & (OneShift >= vector<int32_t, N>(0));
260 LoRes.merge(LoRes + OneInLo, IsOneInLo);
261
262 vector<uint32_t, N> OneInHi = One << (OneShift - 32);
263 auto IsOneInHi = (OneShift >= vector<int32_t, N>(32)) &
264 (OneShift < vector<int32_t, N>(64));
265 HiRes.merge(HiRes + OneInHi, IsOneInHi);
266
267 vector<uint32_t, N> SignedBitMask(1u << 31);
268 vector<uint32_t, N> SignedBit = Hi & SignedBitMask;
269
270 auto FlagSignSet = (SignedBit != Zero);
271 auto FlagNoSignSet = (SignedBit == Zero);
272 // check for Exponent overflow (when sign bit set)
273 auto FlagExpO = (Exp > vector<uint32_t, N>(1089));
274 auto FlagExpUO = FlagNoSignSet & FlagExpO;
275 auto IsNaN = (Exp == ExpMask) & ((LoMant != Zero) | (HiMant != Zero));
276 if constexpr (isSigned) {
277 // calculate (NOT[Lo, Hi] + 1) (integer sign negation)
278 vector<uint32_t, N> NegLo = ~LoRes;
279 vector<uint32_t, N> NegHi = ~HiRes;
280
281 auto AddC = cm::math::add_with_carry(NegLo, One);
282 auto AddcRes = AddC.first;
283 auto AddcResCB = AddC.second;
284 NegHi = NegHi + AddcResCB;
285
286 // if sign bit is set, alter the result with negated value
287 // if (FlagSignSet)
288 LoRes.merge(AddcRes, FlagSignSet);
289 HiRes.merge(NegHi, FlagSignSet);
290
291 // Here we process overflows
292 vector<uint32_t, N> LoOrHi = LoRes | HiRes;
293 auto NZ = (LoOrHi != Zero);
294 vector<uint32_t, N> HiHBit = HiRes & SignedBitMask;
295 auto NZ2 = SignedBit != HiHBit;
296 auto Ovrfl1 = NZ2 & NZ;
297
298 // In case of overflow, HW response is : 7fffffffffffffff
299 // if (Ovrfl1)
300 LoRes.merge(Ones, Ovrfl1);
301 HiRes.merge(vector<uint32_t, N>((1u << 31) - 1), Ovrfl1);
302
303 // if (FlagExpO)
304 LoRes.merge(Zero, FlagExpO);
305 HiRes.merge(vector<uint32_t, N>(1u << 31), FlagExpO);
306
307 // if (FlagExpUO)
308 LoRes.merge(Ones, FlagExpUO);
309 HiRes.merge(vector<uint32_t, N>((1u << 31) - 1), FlagExpUO);
310
311 // if (IsNaN)
312 LoRes.merge(Zero, IsNaN);
313 HiRes.merge(Zero, IsNaN);
314
315 } else {
316 // if (FlagSignSet)
317 LoRes.merge(Zero, FlagSignSet);
318 HiRes.merge(Zero, FlagSignSet);
319
320 // if (FlagExpUO)
321 LoRes.merge(Ones, FlagExpUO);
322 HiRes.merge(Ones, FlagExpUO);
323
324 // if (IsNaN)
325 LoRes.merge(Zero, IsNaN);
326 HiRes.merge(Zero, IsNaN);
327 }
328 return __impl_combineLoHi<N>(LoRes, HiRes);
329 }
330 template <unsigned N, bool isSigned>
__impl_fp2ui__(vector<float,N> a)331 CM_NODEBUG CM_INLINE vector<uint64_t, N> __impl_fp2ui__(vector<float, N> a) {
332 // vector of floats -> vector of ints
333 vector<uint32_t, N> Uifl = a.template format<uint32_t>();
334 const vector<uint32_t, N> ExpMask(0xff);
335 const vector<uint32_t, N> MantissaMask((1u << 23) - 1);
336 const vector<uint32_t, N> Zero(0);
337 const vector<uint32_t, N> Ones(0xffffffff);
338 const vector<uint32_t, N> One(1);
339
340 vector<uint32_t, N> Exp = (Uifl >> 23) & ExpMask;
341 // mantissa without hidden bit
342 vector<uint32_t, N> Pmantissa = Uifl & MantissaMask;
343 // take hidden bit into account
344 vector<uint32_t, N> Mantissa = Pmantissa | vector<uint32_t, N>(1 << 23);
345 vector<uint32_t, N> Data_h = Mantissa << 8;
346 vector<uint32_t, N> Data_l = Zero;
347
348 // this block do Logical Shift Right
349 vector<uint32_t, N> Shift = vector<uint32_t, N>(0xbe) - Exp;
350 vector<uint32_t, N> Sha = Shift & vector<uint32_t, N>(0x3f);
351 vector<uint32_t, N> Vec32 = vector<uint32_t, N>(32);
352 vector<uint32_t, N> Sh32 = Vec32 - Sha;
353 auto Flag_large_shift = (Sha >= Vec32);
354 auto Flag_zero_shift = (Sha == Zero);
355 vector<uint32_t, N> Mask1 = Ones;
356 Mask1.merge(Zero, Flag_large_shift);
357 vector<uint32_t, N> Mask0 = Ones;
358 Mask0.merge(Zero, Flag_zero_shift);
359 vector<uint32_t, N> TmpH1 = ((Data_h & Mask0) << Sh32) & Mask1;
360 vector<uint32_t, N> TmpH2 = (Data_h >> (Sha - Vec32)) & ~Mask1;
361 vector<uint32_t, N> TmpL = (Data_l >> Sha) & Mask1;
362 vector<uint32_t, N> Lo = TmpL | TmpH1 | TmpH2;
363 vector<uint32_t, N> Hi = (Data_h >> Sha) & Mask1;
364
365 // Discard results if shift is greater than 63
366 vector<uint32_t, N> Mask = Ones;
367 auto FlagDiscard = (Shift > vector<uint32_t, N>(63));
368 Mask.merge(Zero, FlagDiscard);
369 Lo = Lo & Mask;
370 Hi = Hi & Mask;
371 vector<uint32_t, N> SignedBitMask(1u << 31);
372 vector<uint32_t, N> SignedBit = Uifl & SignedBitMask;
373 auto FlagSignSet = (SignedBit != Zero);
374 auto FlagNoSignSet = (SignedBit == Zero);
375 // check for Exponent overflow (when sign bit set)
376 auto FlagExpO = (Exp > vector<uint32_t, N>(0xbe));
377 auto FlagExpUO = FlagNoSignSet & FlagExpO;
378 auto IsNaN = (Exp == ExpMask) & (Pmantissa != Zero);
379 if constexpr (isSigned) {
380 // calculate (NOT[Lo, Hi] + 1) (integer sign negation)
381 vector<uint32_t, N> NegLo = ~Lo;
382 vector<uint32_t, N> NegHi = ~Hi;
383
384 auto AddC = cm::math::add_with_carry(NegLo, One);
385 auto AddcResVal = AddC.first;
386 vector<uint32_t, N> AddcResCB = AddC.second;
387
388 NegHi = NegHi + AddcResCB;
389
390 // if sign bit is set, alter the result with negated value
391 // if (FlagSignSet)
392 Lo.merge(AddcResVal, FlagSignSet);
393 Hi.merge(NegHi, FlagSignSet);
394
395 // Here we process overflows
396 vector<uint32_t, N> LoOrHi = Lo | Hi;
397 auto NZ = (LoOrHi != Zero);
398 vector<uint32_t, N> HiHBit = Hi & SignedBitMask;
399 auto NZ2 = SignedBit != HiHBit;
400 auto Ovrfl1 = NZ2 & NZ;
401
402 // In case of overflow, HW response is : 7fffffffffffffff
403 // if (Ovrfl1)
404 Lo.merge(Ones, Ovrfl1);
405 Hi.merge(vector<uint32_t, N>((1u << 31) - 1), Ovrfl1);
406
407 // if (FlagExpO)
408 Lo.merge(Zero, FlagExpO);
409 Hi.merge(vector<uint32_t, N>(1u << 31), FlagExpO);
410
411 // if (FlagExpUO)
412 Lo.merge(Ones, FlagExpUO);
413 Hi.merge(vector<uint32_t, N>((1u << 31) - 1), FlagExpUO);
414
415 // if (IsNaN)
416 Lo.merge(Zero, IsNaN);
417 Hi.merge(Zero, IsNaN);
418 } else {
419 // if (FlagSignSet)
420 Lo.merge(Zero, FlagSignSet);
421 Hi.merge(Zero, FlagSignSet);
422
423 // if (FlagExpUO)
424 Lo.merge(Ones, FlagExpUO);
425 Hi.merge(Ones, FlagExpUO);
426
427 // if (IsNaN)
428 Lo.merge(Zero, IsNaN);
429 Hi.merge(Zero, IsNaN);
430 }
431 return __impl_combineLoHi<N>(Lo, Hi);
432 }
433
434 template <unsigned N, bool isSigned>
435 CM_NODEBUG CM_INLINE vector<uint64_t, N>
__impl_fp2ui__half__(vector<half,N> a)436 __impl_fp2ui__half__(vector<half, N> a) {
437 vector<uint16_t, N> Val16 = a.template format<uint16_t>();
438 vector<uint32_t, N> Val = Val16;
439 const vector<uint32_t, N> Zero(0);
440 const vector<uint32_t, N> Ones(0xffffffff);
441 const vector<uint32_t, N> One(1);
442 const vector<uint32_t, N> ExpMask = vector<uint32_t, N>(0x1f);
443 const vector<uint32_t, N> MantissaMask = vector<uint32_t, N>(0x3ff);
444
445 vector<uint32_t, N> SignedBitMask(1u << 15);
446 vector<uint32_t, N> SignedBit = Val & SignedBitMask;
447 vector<uint32_t, N> Exp = (Val >> 10) & ExpMask;
448 vector<uint32_t, N> Mant = Val & MantissaMask;
449 auto FlagSignSet = (SignedBit != Zero);
450 auto FlagNoSignSet = (SignedBit == Zero);
451
452 // check for Exponent overflow (when sign bit set)
453 auto FlagExpO = (Exp == vector<uint32_t, N>(0x1f));
454 auto FlagExpUO = FlagNoSignSet & FlagExpO;
455 auto IsNaN = FlagExpO & (Mant != Zero);
456 vector<uint32_t, N> LoRes = a;
457 vector<uint32_t, N> HiRes = Zero;
458 if constexpr (isSigned) {
459 vector<uint32_t, N> IntNegA = -a;
460 LoRes.merge(IntNegA, FlagSignSet);
461 // calculate (NOT[Lo, Hi] + 1) (integer sign negation)
462 vector<uint32_t, N> NegLo = ~LoRes;
463 vector<uint32_t, N> NegHi = ~HiRes;
464
465 auto AddC = cm::math::add_with_carry(NegLo, One);
466 auto AddcRes = AddC.first;
467 auto AddcResCB = AddC.second;
468 NegHi = NegHi + AddcResCB;
469
470 // if sign bit is set, alter the result with negated value
471 // if (FlagSignSet)
472 LoRes.merge(AddcRes, FlagSignSet);
473 HiRes.merge(NegHi, FlagSignSet);
474
475 // if (FlagExpO)
476 LoRes.merge(Zero, FlagExpO);
477 HiRes.merge(vector<uint32_t, N>(1u << 31), FlagExpO);
478
479 // if (FlagExpUO)
480 LoRes.merge(Ones, FlagExpUO);
481 HiRes.merge(vector<uint32_t, N>((1u << 31) - 1), FlagExpUO);
482
483 // if (IsNaN)
484 LoRes.merge(Zero, IsNaN);
485 HiRes.merge(Zero, IsNaN);
486
487 } else {
488 LoRes.merge(Zero, FlagSignSet);
489 HiRes.merge(Zero, FlagSignSet);
490
491 // if (FlagExpUO)
492 LoRes.merge(Ones, FlagExpUO);
493 HiRes.merge(Ones, FlagExpUO);
494
495 // if (IsNaN)
496 LoRes.merge(Zero, IsNaN);
497 HiRes.merge(Zero, IsNaN);
498 }
499
500 return __impl_combineLoHi<N>(LoRes, HiRes);
501 }
502 } // namespace details
503
// Defines the extern "C" entry point that converts an N-wide double vector to
// unsigned 64-bit integers.
#define __FP2UI_D_VECTOR_IMPL(N)                                               \
  CM_NODEBUG CM_NOINLINE extern "C" cl_vector<uint64_t, N>                     \
      __cm_intrinsic_impl_fp2ui__double_##N##_(cl_vector<double, N> a) {       \
    vector<uint64_t, N> Res = details::__impl_fp2ui__double__<N, false>(a);    \
    return Res.cl_vector();                                                    \
  }
510
// Defines the extern "C" entry point that converts an N-wide float vector to
// unsigned 64-bit integers.
#define __FP2UI_VECTOR_IMPL(N)                                                 \
  CM_NODEBUG CM_NOINLINE extern "C" cl_vector<uint64_t, N>                     \
      __cm_intrinsic_impl_fp2ui_##N##_(cl_vector<float, N> a) {                \
    vector<uint64_t, N> Res = details::__impl_fp2ui__<N, false>(a);            \
    return Res.cl_vector();                                                    \
  }
517
// Defines the extern "C" entry point that converts an N-wide half vector to
// unsigned 64-bit integers.
#define __FP2UI_H_VECTOR_IMPL(N)                                               \
  CM_NODEBUG CM_NOINLINE extern "C" cl_vector<uint64_t, N>                     \
      __cm_intrinsic_impl_fp2ui__half_##N##_(cl_vector<half, N> a) {           \
    vector<uint64_t, N> Res = details::__impl_fp2ui__half__<N, false>(a);      \
    return Res.cl_vector();                                                    \
  }
524
// Defines the extern "C" entry point that converts an N-wide double vector to
// signed 64-bit integers (the shared fp2ui routine run in signed mode).
#define __FP2SI_D_VECTOR_IMPL(N)                                               \
  CM_NODEBUG CM_NOINLINE extern "C" cl_vector<int64_t, N>                      \
      __cm_intrinsic_impl_fp2si__double_##N##_(cl_vector<double, N> a) {       \
    vector<int64_t, N> Res = details::__impl_fp2ui__double__<N, true>(a);      \
    return Res.cl_vector();                                                    \
  }
531
// Defines the extern "C" entry point that converts an N-wide float vector to
// signed 64-bit integers (the shared fp2ui routine run in signed mode).
#define __FP2SI_VECTOR_IMPL(N)                                                 \
  CM_NODEBUG CM_NOINLINE extern "C" cl_vector<int64_t, N>                      \
      __cm_intrinsic_impl_fp2si_##N##_(cl_vector<float, N> a) {                \
    vector<int64_t, N> Res = details::__impl_fp2ui__<N, true>(a);              \
    return Res.cl_vector();                                                    \
  }
538
// Defines the extern "C" entry point that converts an N-wide half vector to
// signed 64-bit integers (the shared fp2ui routine run in signed mode).
#define __FP2SI_H_VECTOR_IMPL(N)                                               \
  CM_NODEBUG CM_NOINLINE extern "C" cl_vector<int64_t, N>                      \
      __cm_intrinsic_impl_fp2si__half_##N##_(cl_vector<half, N> a) {           \
    vector<int64_t, N> Res = details::__impl_fp2ui__half__<N, true>(a);        \
    return Res.cl_vector();                                                    \
  }
545
// Defines the extern "C" entry point that converts an N-wide vector of
// unsigned 64-bit integers to double.
#define __UI2FP_D_VECTOR_IMPL(N)                                               \
  CM_NODEBUG CM_NOINLINE extern "C" cl_vector<double, N>                       \
      __cm_intrinsic_impl_ui2fp__double_##N##_(cl_vector<uint64_t, N> a) {     \
    vector<double, N> Res = details::__impl_ui2fp__double__<N>(a);             \
    return Res.cl_vector();                                                    \
  }
552
// Defines the extern "C" entry point that converts an N-wide vector of
// unsigned 64-bit integers to float.
#define __UI2FP_VECTOR_IMPL(N)                                                 \
  CM_NODEBUG CM_NOINLINE extern "C" cl_vector<float, N>                        \
      __cm_intrinsic_impl_ui2fp_##N##_(cl_vector<uint64_t, N> a) {             \
    vector<float, N> Res = details::__impl_ui2fp__<N>(a);                      \
    return Res.cl_vector();                                                    \
  }
559
// Defines the extern "C" entry point that converts an N-wide vector of
// unsigned 64-bit integers to half.
#define __UI2FP_H_VECTOR_IMPL(N)                                               \
  CM_NODEBUG CM_NOINLINE extern "C" cl_vector<half, N>                         \
      __cm_intrinsic_impl_ui2fp__half_##N##_(cl_vector<uint64_t, N> a) {       \
    vector<half, N> Res = details::__impl_ui2fp__half__<N>(a);                 \
    return Res.cl_vector();                                                    \
  }
566
// Defines the extern "C" entry point that converts an N-wide vector of signed
// 64-bit integers (received as raw 64-bit lanes) to double.
#define __SI2FP_D_VECTOR_IMPL(N)                                               \
  CM_NODEBUG CM_NOINLINE extern "C" cl_vector<double, N>                       \
      __cm_intrinsic_impl_si2fp__double_##N##_(cl_vector<uint64_t, N> a) {     \
    vector<double, N> Res = details::__impl_si2fp__<double, N>(a);             \
    return Res.cl_vector();                                                    \
  }
573
// Defines the extern "C" entry point that converts an N-wide vector of signed
// 64-bit integers (received as raw 64-bit lanes) to float.
#define __SI2FP_VECTOR_IMPL(N)                                                 \
  CM_NODEBUG CM_NOINLINE extern "C" cl_vector<float, N>                        \
      __cm_intrinsic_impl_si2fp_##N##_(cl_vector<uint64_t, N> a) {             \
    vector<float, N> Res = details::__impl_si2fp__<float, N>(a);               \
    return Res.cl_vector();                                                    \
  }
580
// Defines the extern "C" entry point that converts an N-wide vector of signed
// 64-bit integers (received as raw 64-bit lanes) to half.
#define __SI2FP_H_VECTOR_IMPL(N)                                               \
  CM_NODEBUG CM_NOINLINE extern "C" cl_vector<half, N>                         \
      __cm_intrinsic_impl_si2fp__half_##N##_(cl_vector<uint64_t, N> a) {       \
    vector<half, N> Res = details::__impl_si2fp__<half, N>(a);                 \
    return Res.cl_vector();                                                    \
  }
587
588 // FP2UI
589 // special case - input not a vector
590 CM_NODEBUG CM_NOINLINE extern "C" uint64_t
__cm_intrinsic_impl_fp2ui_1_double_base__(double a)591 __cm_intrinsic_impl_fp2ui_1_double_base__(double a) {
592 vector<uint64_t, 1> b =
593 details::__impl_fp2ui__double__<1, false>(vector<double, 1>(a));
594 return b[0];
595 }
596
597 // special case - input not a vector
598 CM_NODEBUG CM_NOINLINE extern "C" uint64_t
__cm_intrinsic_impl_fp2ui_1_base__(float a)599 __cm_intrinsic_impl_fp2ui_1_base__(float a) {
600 vector<uint64_t, 1> b =
601 details::__impl_fp2ui__<1, false>(vector<float, 1>(a));
602 return b[0];
603 }
604
605 // special case - input not a vector
606 CM_NODEBUG CM_NOINLINE extern "C" uint64_t
__cm_intrinsic_impl_fp2ui_1_half_base__(half a)607 __cm_intrinsic_impl_fp2ui_1_half_base__(half a) {
608 vector<uint64_t, 1> b =
609 details::__impl_fp2ui__half__<1, false>(vector<half, 1>(a));
610 return b[0];
611 }
612
613 // FP2SI
614 // special case - input not a vector
615 CM_NODEBUG CM_NOINLINE extern "C" int64_t
__cm_intrinsic_impl_fp2si_1_double_base__(double a)616 __cm_intrinsic_impl_fp2si_1_double_base__(double a) {
617 vector<int64_t, 1> b =
618 details::__impl_fp2ui__double__<1, true>(vector<double, 1>(a));
619 return b[0];
620 }
621
622 // special case - input not a vector
623 CM_NODEBUG CM_NOINLINE extern "C" int64_t
__cm_intrinsic_impl_fp2si_1_base__(float a)624 __cm_intrinsic_impl_fp2si_1_base__(float a) {
625 vector<int64_t, 1> b = details::__impl_fp2ui__<1, true>(vector<float, 1>(a));
626 return b[0];
627 }
628
629 // special case - input not a vector
630 CM_NODEBUG CM_NOINLINE extern "C" int64_t
__cm_intrinsic_impl_fp2si_1_half_base__(half a)631 __cm_intrinsic_impl_fp2si_1_half_base__(half a) {
632 vector<int64_t, 1> b =
633 details::__impl_fp2ui__half__<1, true>(vector<half, 1>(a));
634 return b[0];
635 }
636
637 // UI2FP
638 // special case - input not a vector
639 CM_NODEBUG CM_NOINLINE extern "C" double
__cm_intrinsic_impl_ui2fp_1_double_base__(uint64_t a)640 __cm_intrinsic_impl_ui2fp_1_double_base__(uint64_t a) {
641 vector<double, 1> b =
642 details::__impl_ui2fp__double__<1>(vector<uint64_t, 1>(a));
643 return b[0];
644 }
645
646 // special case - input not a vector
647 CM_NODEBUG CM_NOINLINE extern "C" float
__cm_intrinsic_impl_ui2fp_1_base__(uint64_t a)648 __cm_intrinsic_impl_ui2fp_1_base__(uint64_t a) {
649 vector<float, 1> b = details::__impl_ui2fp__<1>(vector<uint64_t, 1>(a));
650 return b[0];
651 }
652
653 // special case - input not a vector
654 CM_NODEBUG CM_NOINLINE extern "C" half
__cm_intrinsic_impl_ui2fp_1_half_base__(uint64_t a)655 __cm_intrinsic_impl_ui2fp_1_half_base__(uint64_t a) {
656 vector<half, 1> b = details::__impl_ui2fp__half__<1>(vector<uint64_t, 1>(a));
657 return b[0];
658 }
659
660 // SI2FP
661 // special case - input not a vector
662 CM_NODEBUG CM_NOINLINE extern "C" double
__cm_intrinsic_impl_si2fp_1_double_base__(int64_t a)663 __cm_intrinsic_impl_si2fp_1_double_base__(int64_t a) {
664 vector<double, 1> b =
665 details::__impl_si2fp__<double, 1>(vector<int64_t, 1>(a));
666 return b[0];
667 }
668
669 // special case - input not a vector
670 CM_NODEBUG CM_NOINLINE extern "C" float
__cm_intrinsic_impl_si2fp_1_base__(int64_t a)671 __cm_intrinsic_impl_si2fp_1_base__(int64_t a) {
672 vector<float, 1> b = details::__impl_si2fp__<float, 1>(vector<int64_t, 1>(a));
673 return b[0];
674 }
675
676 // special case - input not a vector
677 CM_NODEBUG CM_NOINLINE extern "C" half
__cm_intrinsic_impl_si2fp_1_half_base__(int64_t a)678 __cm_intrinsic_impl_si2fp_1_half_base__(int64_t a) {
679 vector<half, 1> b = details::__impl_si2fp__<half, 1>(vector<int64_t, 1>(a));
680 return b[0];
681 }
682
683 #define __DEFINE_FP2UI_FUN(N) \
684 __FP2UI_D_VECTOR_IMPL(N); \
685 __FP2UI_VECTOR_IMPL(N); \
686 __FP2UI_H_VECTOR_IMPL(N); \
687 __FP2SI_D_VECTOR_IMPL(N); \
688 __FP2SI_VECTOR_IMPL(N); \
689 __FP2SI_H_VECTOR_IMPL(N); \
690 __UI2FP_D_VECTOR_IMPL(N); \
691 __UI2FP_VECTOR_IMPL(N); \
692 __UI2FP_H_VECTOR_IMPL(N); \
693 __SI2FP_D_VECTOR_IMPL(N); \
694 __SI2FP_VECTOR_IMPL(N); \
695 __SI2FP_H_VECTOR_IMPL(N);
696
697 __DEFINE_FP2UI_FUN(1);
698 __DEFINE_FP2UI_FUN(2);
699 __DEFINE_FP2UI_FUN(4);
700 __DEFINE_FP2UI_FUN(8);
701 __DEFINE_FP2UI_FUN(16);
702 __DEFINE_FP2UI_FUN(32);
703