1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include <cm-cl/math.h>
10 #include <cm-cl/vector.h>
11 using namespace cm;
12 
13 namespace details {
14 
15 template <unsigned N>
16 static CM_NODEBUG CM_INLINE vector<uint64_t, N>
__impl_combineLoHi(vector<uint32_t,N> Lo,vector<uint32_t,N> Hi)17 __impl_combineLoHi(vector<uint32_t, N> Lo, vector<uint32_t, N> Hi) {
18   vector<uint32_t, 2 * N> Res;
19   Res.template select<N, 2>(1) = Hi;
20   Res.template select<N, 2>(0) = Lo;
21   return Res.template format<uint64_t>();
22 }
23 
24 template <unsigned N>
25 CM_NODEBUG CM_INLINE vector<double, N>
__impl_ui2fp__double__(vector<uint64_t,N> a)26 __impl_ui2fp__double__(vector<uint64_t, N> a) {
27   const vector<uint32_t, N> Zero(0);
28   const vector<uint32_t, N> Ones(0xffffffff);
29   const vector<uint32_t, N> One(1);
30 
31   vector<uint32_t, 2 *N> LoHi = a.template format<uint32_t>();
32   vector<uint32_t, N> Lo = LoHi.template select<N, 2>(0);
33   vector<uint32_t, N> Hi = LoHi.template select<N, 2>(1);
34   // TODO : replace with clz for 64 bit
35   vector<uint32_t, N> LZHi = cm::math::count_leading_zeros(Hi);
36   vector<uint32_t, N> LZLo = cm::math::count_leading_zeros(Lo);
37   auto ZeroHi = Hi == Zero;
38   vector<uint32_t, N> LZ = LZHi;
39   LZ.merge(LZ + LZLo, ZeroHi);
40 
41   // we need to get that nice first set bit into bit position 51.
42   // thus we shift our nice pair of values by 63 - 51 - clz,
43   // uint8_t shift = 12 - lz;
44   // shift hidden bit too (+1)
45 
46   // 64bit Shift - we rely on compiler emulation there
47   vector<int32_t, N> Shift = LZ - vector<int32_t, N>(11);
48   vector<uint64_t, N> ToShift = __impl_combineLoHi<N>(Lo, Hi);
49   vector<uint64_t, N> Shifted64 = ToShift << Shift;
50   auto IsRightShift = Shift < vector<int32_t, N>(0);
51   Shifted64.merge(ToShift >> -Shift, IsRightShift);
52 
53   vector<uint32_t, 2 *N> Shifted = Shifted64.template format<uint32_t>();
54   vector<uint32_t, N> LoMant = Shifted.template select<N, 2>(0);
55   vector<uint32_t, N> HiMant = Shifted.template select<N, 2>(1);
56   // delete hidden bit
57   HiMant = HiMant & ~(1u << 20);
58 
59   // calculate RS
60   vector<uint32_t, N> RMask = (One << (10 - LZ));
61   vector<uint32_t, N> R = (RMask & Lo) >> (10 - LZ);
62   auto NoR = LZ > vector<uint32_t, N>(10);
63   R.merge(Zero, NoR);
64 
65   vector<uint32_t, N> SMask = RMask - 1;
66   vector<uint32_t, N> S = Zero;
67   vector<uint32_t, N> AfterR = Lo & SMask;
68   auto ZeroRem = AfterR == Zero;
69   S.merge(One, ~ZeroRem);
70   auto NoS = LZ > vector<uint32_t, N>(9);
71   S.merge(Zero, NoS);
72 
73   // R is set but no S, round to even.
74   // Mant + R
75   auto AddC = cm::math::add_with_carry(LoMant, R);
76   LoMant = AddC.first;
77   vector<uint32_t, N> CB = AddC.second;
78   HiMant = HiMant + CB;
79 
80   LoMant &= ~(~S & R);
81 
82   vector<uint32_t, N> Exp = vector<uint32_t, N>(1086) - LZ;
83   Exp.merge(Zero, LZ == vector<uint32_t, N>(64));
84   vector<uint32_t, N> HiRes = Exp << vector<uint32_t, N>(20);
85   HiRes += HiMant;
86   vector<uint32_t, N> LoRes = LoMant;
87 
88   vector<int64_t, N> Result = __impl_combineLoHi<N>(LoRes, HiRes);
89 
90   return Result.template format<double>();
91 };
92 
93 template <unsigned N>
__impl_ui2fp__(vector<uint64_t,N> a)94 CM_NODEBUG CM_INLINE vector<float, N> __impl_ui2fp__(vector<uint64_t, N> a) {
95   const vector<uint32_t, N> Zero(0);
96   const vector<uint32_t, N> Ones(0xffffffff);
97   const vector<uint32_t, N> One(1);
98 
99   vector<uint32_t, 2 *N> LoHi = a.template format<uint32_t>();
100 
101   vector<uint32_t, N> Lo = LoHi.template select<N, 2>(0);
102   vector<uint32_t, N> Hi = LoHi.template select<N, 2>(1);
103   vector<uint32_t, N> LZ = cm::math::count_leading_zeros(Hi);
104 
105   // we need to get that nice first set bit into bit position 23.
106   // thus we shift our nice pair of values by 63 - 23 - clz,
107   // some bits will be dropped by shift thus we'll add 1 bits as R bit.
108   // uint8_t shift = 39 - lz;
109 
110   vector<uint32_t, N> DroppedBits = vector<uint32_t, N>(39) - LZ;
111   // SI
112   vector<uint32_t, N> Sha = DroppedBits & vector<uint32_t, N>(0x3f);
113   vector<uint32_t, N> Vec32 = vector<int32_t, N>(32);
114   vector<uint32_t, N> Sh32 = Vec32 - Sha;
115   auto Flag_large_shift = (Sha >= Vec32);
116   auto Flag_zero_shift = (Sha == Zero);
117   vector<uint32_t, N> Mask1 = Ones;
118   Mask1.merge(Zero, Flag_large_shift);
119   vector<uint32_t, N> Mask0 = Ones;
120   Mask0.merge(Zero, Flag_zero_shift);
121 
122   // partial shift
123   vector<uint32_t, N> TmpH1 = ((Hi & Mask0) << Sh32) & Mask1;
124   vector<uint32_t, N> TmpH2 = (Hi >> (Sha - Vec32)) & ~Mask1;
125   vector<uint32_t, N> TmpL = (Lo >> Sha) & Mask1;
126   vector<uint32_t, N> Mant = TmpL | TmpH1 | TmpH2;
127 
128   vector<uint32_t, N> TmpSha = One << (-Sh32);
129   vector<uint32_t, N> TmpMask = TmpSha - One;
130   vector<uint32_t, N> StickyH = Hi & ~Mask1;
131   StickyH = StickyH & TmpMask;
132 
133   // calculate RS
134   vector<uint32_t, N> L1 = Lo & ~Mask1;
135   vector<uint32_t, N> L2 = Lo & (Mask1 >> Sh32);
136   vector<uint32_t, N> StickyL = L1 | L2;
137   vector<uint32_t, N> S1 = StickyH | StickyL;
138   auto S = S1 == Zero;
139   vector<uint32_t, N> NotS = Zero;
140   NotS.merge(Ones, S);
141 
142   // R is set but no S, round to even.
143   vector<uint32_t, N> R = Mant & One;
144   Mant = (Mant + One) >> One;
145   Mant &= ~(NotS & R);
146 
147   vector<uint32_t, N> Exp = vector<uint32_t, N>(0xbd) - LZ;
148   vector<uint32_t, N> ResL = Exp << vector<uint32_t, N>(23);
149   ResL += Mant;
150 
151   vector<float, N> ResultLarge = ResL.template format<float>();
152   vector<float, N> ResultSmall = Lo;
153 
154   auto IsSmallPred = Hi == Zero;
155 
156   vector<float, N> Result = ResultLarge;
157   Result.merge(ResultSmall, IsSmallPred);
158 
159   return Result;
160 }
161 
162 template <unsigned N>
163 CM_NODEBUG CM_INLINE vector<half, N>
__impl_ui2fp__half__(vector<uint64_t,N> a)164 __impl_ui2fp__half__(vector<uint64_t, N> a) {
165   const vector<uint32_t, N> Zero(0);
166   const vector<uint32_t, N> Ones(0xffffffff);
167 
168   vector<uint32_t, 2 *N> LoHi = a.template format<uint32_t>();
169   vector<uint32_t, N> Lo = LoHi.template select<N, 2>(0);
170   vector<uint32_t, N> Hi = LoHi.template select<N, 2>(1);
171   // max half value is 65504 (0xffe0)
172   // so we can use only Low Part
173   vector<half, N> Res = Lo;
174   // NoZeroHi - should be overflow
175   auto NoZeroHi = Hi != Zero;
176   Res.merge(vector<half, N>(Ones), NoZeroHi);
177   return Res;
178 };
179 
/// Selects the uint64 -> floating-point routine by destination element type
/// T. The primary template is empty; run() is provided only by the float,
/// double and half specializations.
template <typename T, unsigned N>
class __impl_ui2fp_runner {
};
181 
182 template <unsigned N> class __impl_ui2fp_runner<float, N> {
183 public:
run(vector<uint64_t,N> arg)184   vector<float, N> run(vector<uint64_t, N> arg) {
185     return __impl_ui2fp__<N>(arg);
186   }
187 };
188 template <unsigned N> class __impl_ui2fp_runner<double, N> {
189 public:
run(vector<uint64_t,N> arg)190   vector<double, N> run(vector<uint64_t, N> arg) {
191     return __impl_ui2fp__double__<N>(arg);
192   }
193 };
194 template <unsigned N> class __impl_ui2fp_runner<half, N> {
195 public:
run(vector<uint64_t,N> arg)196   vector<half, N> run(vector<uint64_t, N> arg) {
197     return __impl_ui2fp__half__<N>(arg);
198   }
199 };
200 
201 template <typename T, unsigned N>
__impl_si2fp__(vector<uint64_t,N> a)202 CM_NODEBUG CM_INLINE vector<float, N> __impl_si2fp__(vector<uint64_t, N> a) {
203   const vector<uint32_t, N> Zero(0);
204 
205   // NOTE: SIToFP is special, since it does not do the convert by itself,
206   // Instead it just creates a sequence of 64.bit operations which
207   // are then expanded. As such some type convertion trickery is involved.
208   vector<uint32_t, 2 *N> LoHi = a.template format<uint32_t>();
209   vector<uint32_t, N> Lo = LoHi.template select<N, 2>(0);
210   vector<uint32_t, N> Hi = LoHi.template select<N, 2>(1);
211   vector<uint32_t, N> SB = Hi & vector<uint32_t, N>(1u << 31);
212   auto IsSignZero = SB == Zero;
213   vector<uint64_t, N> b = -a;
214   b.merge(a, IsSignZero);
215   auto Res = __impl_ui2fp_runner<T, N>().run(b);
216   Res.merge(-Res, ~IsSignZero);
217   return Res;
218 }
219 
220 template <unsigned N, bool isSigned>
221 CM_NODEBUG CM_INLINE vector<uint64_t, N>
__impl_fp2ui__double__(vector<double,N> a)222 __impl_fp2ui__double__(vector<double, N> a) {
223   // vector of floats -> vector of ints
224   vector<uint32_t, 2 *N> LoHi = a.template format<uint32_t>();
225   const vector<uint32_t, N> MantissaMask((1u << 20) - 1);
226   const vector<uint32_t, N> ExpMask(0x7ff);
227   const vector<uint32_t, N> Zero(0);
228   const vector<uint32_t, N> Ones(0xffffffff);
229   const vector<uint32_t, N> One(1);
230   vector<uint32_t, N> Lo = LoHi.template select<N, 2>(0);
231   vector<uint32_t, N> Hi = LoHi.template select<N, 2>(1);
232   vector<uint32_t, N> Exp = (Hi >> 20) & ExpMask;
233   // mantissa without hidden bit
234   vector<uint32_t, N> LoMant = Lo;
235   vector<uint32_t, N> HiMant = Hi & MantissaMask;
236   // for normalized numbers (1 + mant/2^52) * 2 ^ (mant-1023)
237   vector<int32_t, N> MantShift = Exp - 1023 - 52;
238   vector<int32_t, N> OneShift = Exp - 1023;
239   auto IsNormalized = Exp == Zero;
240   // for denormalized numbers (0 + mant/2^52) * 2 ^ (1-1023)
241   MantShift.merge(One - 1023 + 1, IsNormalized);
242   OneShift.merge(Ones, IsNormalized);
243 
244   // 64bit Shift - we rely on compiler emulation there
245   vector<uint64_t, N> ToShift = __impl_combineLoHi<N>(LoMant, HiMant);
246   vector<uint64_t, N> Shifted64 = ToShift << MantShift;
247   auto IsRightShift = MantShift < vector<int32_t, N>(0);
248   Shifted64.merge(ToShift >> -MantShift, IsRightShift);
249   auto IsShiftGT64 = (MantShift >= vector<int32_t, N>(64)) |
250                      (MantShift <= vector<int32_t, N>(-64));
251   Shifted64.merge(vector<uint64_t, N>(0), IsShiftGT64);
252   vector<uint32_t, 2 *N> Shifted = Shifted64.template format<uint32_t>();
253   vector<uint32_t, N> LoRes = Shifted.template select<N, 2>(0);
254   vector<uint32_t, N> HiRes = Shifted.template select<N, 2>(1);
255 
256   // add hidden One
257   vector<uint32_t, N> OneInLo = One << OneShift;
258   auto IsOneInLo =
259       (OneShift < vector<int32_t, N>(32)) & (OneShift >= vector<int32_t, N>(0));
260   LoRes.merge(LoRes + OneInLo, IsOneInLo);
261 
262   vector<uint32_t, N> OneInHi = One << (OneShift - 32);
263   auto IsOneInHi = (OneShift >= vector<int32_t, N>(32)) &
264                    (OneShift < vector<int32_t, N>(64));
265   HiRes.merge(HiRes + OneInHi, IsOneInHi);
266 
267   vector<uint32_t, N> SignedBitMask(1u << 31);
268   vector<uint32_t, N> SignedBit = Hi & SignedBitMask;
269 
270   auto FlagSignSet = (SignedBit != Zero);
271   auto FlagNoSignSet = (SignedBit == Zero);
272   // check for Exponent overflow (when sign bit set)
273   auto FlagExpO = (Exp > vector<uint32_t, N>(1089));
274   auto FlagExpUO = FlagNoSignSet & FlagExpO;
275   auto IsNaN = (Exp == ExpMask) & ((LoMant != Zero) | (HiMant != Zero));
276   if constexpr (isSigned) {
277     // calculate (NOT[Lo, Hi] + 1) (integer sign negation)
278     vector<uint32_t, N> NegLo = ~LoRes;
279     vector<uint32_t, N> NegHi = ~HiRes;
280 
281     auto AddC = cm::math::add_with_carry(NegLo, One);
282     auto AddcRes = AddC.first;
283     auto AddcResCB = AddC.second;
284     NegHi = NegHi + AddcResCB;
285 
286     // if sign bit is set, alter the result with negated value
287     // if (FlagSignSet)
288     LoRes.merge(AddcRes, FlagSignSet);
289     HiRes.merge(NegHi, FlagSignSet);
290 
291     // Here we process overflows
292     vector<uint32_t, N> LoOrHi = LoRes | HiRes;
293     auto NZ = (LoOrHi != Zero);
294     vector<uint32_t, N> HiHBit = HiRes & SignedBitMask;
295     auto NZ2 = SignedBit != HiHBit;
296     auto Ovrfl1 = NZ2 & NZ;
297 
298     // In case of overflow, HW response is : 7fffffffffffffff
299     // if (Ovrfl1)
300     LoRes.merge(Ones, Ovrfl1);
301     HiRes.merge(vector<uint32_t, N>((1u << 31) - 1), Ovrfl1);
302 
303     // if (FlagExpO)
304     LoRes.merge(Zero, FlagExpO);
305     HiRes.merge(vector<uint32_t, N>(1u << 31), FlagExpO);
306 
307     // if (FlagExpUO)
308     LoRes.merge(Ones, FlagExpUO);
309     HiRes.merge(vector<uint32_t, N>((1u << 31) - 1), FlagExpUO);
310 
311     // if (IsNaN)
312     LoRes.merge(Zero, IsNaN);
313     HiRes.merge(Zero, IsNaN);
314 
315   } else {
316     // if (FlagSignSet)
317     LoRes.merge(Zero, FlagSignSet);
318     HiRes.merge(Zero, FlagSignSet);
319 
320     // if (FlagExpUO)
321     LoRes.merge(Ones, FlagExpUO);
322     HiRes.merge(Ones, FlagExpUO);
323 
324     // if (IsNaN)
325     LoRes.merge(Zero, IsNaN);
326     HiRes.merge(Zero, IsNaN);
327   }
328   return __impl_combineLoHi<N>(LoRes, HiRes);
329 }
330 template <unsigned N, bool isSigned>
__impl_fp2ui__(vector<float,N> a)331 CM_NODEBUG CM_INLINE vector<uint64_t, N> __impl_fp2ui__(vector<float, N> a) {
332   // vector of floats -> vector of ints
333   vector<uint32_t, N> Uifl = a.template format<uint32_t>();
334   const vector<uint32_t, N> ExpMask(0xff);
335   const vector<uint32_t, N> MantissaMask((1u << 23) - 1);
336   const vector<uint32_t, N> Zero(0);
337   const vector<uint32_t, N> Ones(0xffffffff);
338   const vector<uint32_t, N> One(1);
339 
340   vector<uint32_t, N> Exp = (Uifl >> 23) & ExpMask;
341   // mantissa without hidden bit
342   vector<uint32_t, N> Pmantissa = Uifl & MantissaMask;
343   // take hidden bit into account
344   vector<uint32_t, N> Mantissa = Pmantissa | vector<uint32_t, N>(1 << 23);
345   vector<uint32_t, N> Data_h = Mantissa << 8;
346   vector<uint32_t, N> Data_l = Zero;
347 
348   // this block do Logical Shift Right
349   vector<uint32_t, N> Shift = vector<uint32_t, N>(0xbe) - Exp;
350   vector<uint32_t, N> Sha = Shift & vector<uint32_t, N>(0x3f);
351   vector<uint32_t, N> Vec32 = vector<uint32_t, N>(32);
352   vector<uint32_t, N> Sh32 = Vec32 - Sha;
353   auto Flag_large_shift = (Sha >= Vec32);
354   auto Flag_zero_shift = (Sha == Zero);
355   vector<uint32_t, N> Mask1 = Ones;
356   Mask1.merge(Zero, Flag_large_shift);
357   vector<uint32_t, N> Mask0 = Ones;
358   Mask0.merge(Zero, Flag_zero_shift);
359   vector<uint32_t, N> TmpH1 = ((Data_h & Mask0) << Sh32) & Mask1;
360   vector<uint32_t, N> TmpH2 = (Data_h >> (Sha - Vec32)) & ~Mask1;
361   vector<uint32_t, N> TmpL = (Data_l >> Sha) & Mask1;
362   vector<uint32_t, N> Lo = TmpL | TmpH1 | TmpH2;
363   vector<uint32_t, N> Hi = (Data_h >> Sha) & Mask1;
364 
365   // Discard results if shift is greater than 63
366   vector<uint32_t, N> Mask = Ones;
367   auto FlagDiscard = (Shift > vector<uint32_t, N>(63));
368   Mask.merge(Zero, FlagDiscard);
369   Lo = Lo & Mask;
370   Hi = Hi & Mask;
371   vector<uint32_t, N> SignedBitMask(1u << 31);
372   vector<uint32_t, N> SignedBit = Uifl & SignedBitMask;
373   auto FlagSignSet = (SignedBit != Zero);
374   auto FlagNoSignSet = (SignedBit == Zero);
375   // check for Exponent overflow (when sign bit set)
376   auto FlagExpO = (Exp > vector<uint32_t, N>(0xbe));
377   auto FlagExpUO = FlagNoSignSet & FlagExpO;
378   auto IsNaN = (Exp == ExpMask) & (Pmantissa != Zero);
379   if constexpr (isSigned) {
380     // calculate (NOT[Lo, Hi] + 1) (integer sign negation)
381     vector<uint32_t, N> NegLo = ~Lo;
382     vector<uint32_t, N> NegHi = ~Hi;
383 
384     auto AddC = cm::math::add_with_carry(NegLo, One);
385     auto AddcResVal = AddC.first;
386     vector<uint32_t, N> AddcResCB = AddC.second;
387 
388     NegHi = NegHi + AddcResCB;
389 
390     // if sign bit is set, alter the result with negated value
391     // if (FlagSignSet)
392     Lo.merge(AddcResVal, FlagSignSet);
393     Hi.merge(NegHi, FlagSignSet);
394 
395     // Here we process overflows
396     vector<uint32_t, N> LoOrHi = Lo | Hi;
397     auto NZ = (LoOrHi != Zero);
398     vector<uint32_t, N> HiHBit = Hi & SignedBitMask;
399     auto NZ2 = SignedBit != HiHBit;
400     auto Ovrfl1 = NZ2 & NZ;
401 
402     // In case of overflow, HW response is : 7fffffffffffffff
403     // if (Ovrfl1)
404     Lo.merge(Ones, Ovrfl1);
405     Hi.merge(vector<uint32_t, N>((1u << 31) - 1), Ovrfl1);
406 
407     // if (FlagExpO)
408     Lo.merge(Zero, FlagExpO);
409     Hi.merge(vector<uint32_t, N>(1u << 31), FlagExpO);
410 
411     // if (FlagExpUO)
412     Lo.merge(Ones, FlagExpUO);
413     Hi.merge(vector<uint32_t, N>((1u << 31) - 1), FlagExpUO);
414 
415     // if (IsNaN)
416     Lo.merge(Zero, IsNaN);
417     Hi.merge(Zero, IsNaN);
418   } else {
419     // if (FlagSignSet)
420     Lo.merge(Zero, FlagSignSet);
421     Hi.merge(Zero, FlagSignSet);
422 
423     // if (FlagExpUO)
424     Lo.merge(Ones, FlagExpUO);
425     Hi.merge(Ones, FlagExpUO);
426 
427     // if (IsNaN)
428     Lo.merge(Zero, IsNaN);
429     Hi.merge(Zero, IsNaN);
430   }
431   return __impl_combineLoHi<N>(Lo, Hi);
432 }
433 
434 template <unsigned N, bool isSigned>
435 CM_NODEBUG CM_INLINE vector<uint64_t, N>
__impl_fp2ui__half__(vector<half,N> a)436 __impl_fp2ui__half__(vector<half, N> a) {
437   vector<uint16_t, N> Val16 = a.template format<uint16_t>();
438   vector<uint32_t, N> Val = Val16;
439   const vector<uint32_t, N> Zero(0);
440   const vector<uint32_t, N> Ones(0xffffffff);
441   const vector<uint32_t, N> One(1);
442   const vector<uint32_t, N> ExpMask = vector<uint32_t, N>(0x1f);
443   const vector<uint32_t, N> MantissaMask = vector<uint32_t, N>(0x3ff);
444 
445   vector<uint32_t, N> SignedBitMask(1u << 15);
446   vector<uint32_t, N> SignedBit = Val & SignedBitMask;
447   vector<uint32_t, N> Exp = (Val >> 10) & ExpMask;
448   vector<uint32_t, N> Mant = Val & MantissaMask;
449   auto FlagSignSet = (SignedBit != Zero);
450   auto FlagNoSignSet = (SignedBit == Zero);
451 
452   // check for Exponent overflow (when sign bit set)
453   auto FlagExpO = (Exp == vector<uint32_t, N>(0x1f));
454   auto FlagExpUO = FlagNoSignSet & FlagExpO;
455   auto IsNaN = FlagExpO & (Mant != Zero);
456   vector<uint32_t, N> LoRes = a;
457   vector<uint32_t, N> HiRes = Zero;
458   if constexpr (isSigned) {
459     vector<uint32_t, N> IntNegA = -a;
460     LoRes.merge(IntNegA, FlagSignSet);
461     // calculate (NOT[Lo, Hi] + 1) (integer sign negation)
462     vector<uint32_t, N> NegLo = ~LoRes;
463     vector<uint32_t, N> NegHi = ~HiRes;
464 
465     auto AddC = cm::math::add_with_carry(NegLo, One);
466     auto AddcRes = AddC.first;
467     auto AddcResCB = AddC.second;
468     NegHi = NegHi + AddcResCB;
469 
470     // if sign bit is set, alter the result with negated value
471     // if (FlagSignSet)
472     LoRes.merge(AddcRes, FlagSignSet);
473     HiRes.merge(NegHi, FlagSignSet);
474 
475     // if (FlagExpO)
476     LoRes.merge(Zero, FlagExpO);
477     HiRes.merge(vector<uint32_t, N>(1u << 31), FlagExpO);
478 
479     // if (FlagExpUO)
480     LoRes.merge(Ones, FlagExpUO);
481     HiRes.merge(vector<uint32_t, N>((1u << 31) - 1), FlagExpUO);
482 
483     // if (IsNaN)
484     LoRes.merge(Zero, IsNaN);
485     HiRes.merge(Zero, IsNaN);
486 
487   } else {
488     LoRes.merge(Zero, FlagSignSet);
489     HiRes.merge(Zero, FlagSignSet);
490 
491     // if (FlagExpUO)
492     LoRes.merge(Ones, FlagExpUO);
493     HiRes.merge(Ones, FlagExpUO);
494 
495     // if (IsNaN)
496     LoRes.merge(Zero, IsNaN);
497     HiRes.merge(Zero, IsNaN);
498   }
499 
500   return __impl_combineLoHi<N>(LoRes, HiRes);
501 }
502 } // namespace details
503 
// Emits the exported N-wide double -> uint64_t entry point
// __cm_intrinsic_impl_fp2ui__double_<N>_, which forwards to the unsigned
// (isSigned = false) instantiation of details::__impl_fp2ui__double__.
#define __FP2UI_D_VECTOR_IMPL(N)                                               \
  CM_NODEBUG CM_NOINLINE extern "C" cl_vector<uint64_t, N>                     \
      __cm_intrinsic_impl_fp2ui__double_##N##_(cl_vector<double, N> a) {       \
    vector<uint64_t, N> Converted =                                            \
        details::__impl_fp2ui__double__<N, false>(a);                          \
    return Converted.cl_vector();                                              \
  }
510 
// Emits the exported N-wide float -> uint64_t entry point
// __cm_intrinsic_impl_fp2ui_<N>_, which forwards to the unsigned
// (isSigned = false) instantiation of details::__impl_fp2ui__.
#define __FP2UI_VECTOR_IMPL(N)                                                 \
  CM_NODEBUG CM_NOINLINE extern "C" cl_vector<uint64_t, N>                     \
      __cm_intrinsic_impl_fp2ui_##N##_(cl_vector<float, N> a) {                \
    vector<uint64_t, N> Converted = details::__impl_fp2ui__<N, false>(a);      \
    return Converted.cl_vector();                                              \
  }
517 
// Emits the exported N-wide half -> uint64_t entry point
// __cm_intrinsic_impl_fp2ui__half_<N>_, which forwards to the unsigned
// (isSigned = false) instantiation of details::__impl_fp2ui__half__.
#define __FP2UI_H_VECTOR_IMPL(N)                                               \
  CM_NODEBUG CM_NOINLINE extern "C" cl_vector<uint64_t, N>                     \
      __cm_intrinsic_impl_fp2ui__half_##N##_(cl_vector<half, N> a) {           \
    vector<uint64_t, N> Converted =                                            \
        details::__impl_fp2ui__half__<N, false>(a);                            \
    return Converted.cl_vector();                                              \
  }
524 
// Emits the exported N-wide double -> int64_t entry point
// __cm_intrinsic_impl_fp2si__double_<N>_, which forwards to the signed
// (isSigned = true) instantiation of details::__impl_fp2ui__double__.
#define __FP2SI_D_VECTOR_IMPL(N)                                               \
  CM_NODEBUG CM_NOINLINE extern "C" cl_vector<int64_t, N>                      \
      __cm_intrinsic_impl_fp2si__double_##N##_(cl_vector<double, N> a) {       \
    vector<int64_t, N> Converted =                                             \
        details::__impl_fp2ui__double__<N, true>(a);                           \
    return Converted.cl_vector();                                              \
  }
531 
// Emits the exported N-wide float -> int64_t entry point
// __cm_intrinsic_impl_fp2si_<N>_, which forwards to the signed
// (isSigned = true) instantiation of details::__impl_fp2ui__.
#define __FP2SI_VECTOR_IMPL(N)                                                 \
  CM_NODEBUG CM_NOINLINE extern "C" cl_vector<int64_t, N>                      \
      __cm_intrinsic_impl_fp2si_##N##_(cl_vector<float, N> a) {                \
    vector<int64_t, N> Converted = details::__impl_fp2ui__<N, true>(a);        \
    return Converted.cl_vector();                                              \
  }
538 
// Emits the exported N-wide half -> int64_t entry point
// __cm_intrinsic_impl_fp2si__half_<N>_, which forwards to the signed
// (isSigned = true) instantiation of details::__impl_fp2ui__half__.
#define __FP2SI_H_VECTOR_IMPL(N)                                               \
  CM_NODEBUG CM_NOINLINE extern "C" cl_vector<int64_t, N>                      \
      __cm_intrinsic_impl_fp2si__half_##N##_(cl_vector<half, N> a) {           \
    vector<int64_t, N> Converted =                                             \
        details::__impl_fp2ui__half__<N, true>(a);                             \
    return Converted.cl_vector();                                              \
  }
545 
// Emits the exported N-wide uint64_t -> double entry point
// __cm_intrinsic_impl_ui2fp__double_<N>_, which forwards to
// details::__impl_ui2fp__double__.
#define __UI2FP_D_VECTOR_IMPL(N)                                               \
  CM_NODEBUG CM_NOINLINE extern "C" cl_vector<double, N>                       \
      __cm_intrinsic_impl_ui2fp__double_##N##_(cl_vector<uint64_t, N> a) {     \
    vector<double, N> Converted = details::__impl_ui2fp__double__<N>(a);       \
    return Converted.cl_vector();                                              \
  }
552 
// Emits the exported N-wide uint64_t -> float entry point
// __cm_intrinsic_impl_ui2fp_<N>_, which forwards to details::__impl_ui2fp__.
#define __UI2FP_VECTOR_IMPL(N)                                                 \
  CM_NODEBUG CM_NOINLINE extern "C" cl_vector<float, N>                        \
      __cm_intrinsic_impl_ui2fp_##N##_(cl_vector<uint64_t, N> a) {             \
    vector<float, N> Converted = details::__impl_ui2fp__<N>(a);                \
    return Converted.cl_vector();                                              \
  }
559 
// Emits the exported N-wide uint64_t -> half entry point
// __cm_intrinsic_impl_ui2fp__half_<N>_, which forwards to
// details::__impl_ui2fp__half__.
#define __UI2FP_H_VECTOR_IMPL(N)                                               \
  CM_NODEBUG CM_NOINLINE extern "C" cl_vector<half, N>                         \
      __cm_intrinsic_impl_ui2fp__half_##N##_(cl_vector<uint64_t, N> a) {       \
    vector<half, N> Converted = details::__impl_ui2fp__half__<N>(a);           \
    return Converted.cl_vector();                                              \
  }
566 
// Emits the exported N-wide signed 64-bit -> double entry point
// __cm_intrinsic_impl_si2fp__double_<N>_. The argument is declared as
// cl_vector<uint64_t, N>; details::__impl_si2fp__ inspects its sign bit.
#define __SI2FP_D_VECTOR_IMPL(N)                                               \
  CM_NODEBUG CM_NOINLINE extern "C" cl_vector<double, N>                       \
      __cm_intrinsic_impl_si2fp__double_##N##_(cl_vector<uint64_t, N> a) {     \
    vector<double, N> Converted = details::__impl_si2fp__<double, N>(a);       \
    return Converted.cl_vector();                                              \
  }
573 
// Emits the exported N-wide signed 64-bit -> float entry point
// __cm_intrinsic_impl_si2fp_<N>_. The argument is declared as
// cl_vector<uint64_t, N>; details::__impl_si2fp__ inspects its sign bit.
#define __SI2FP_VECTOR_IMPL(N)                                                 \
  CM_NODEBUG CM_NOINLINE extern "C" cl_vector<float, N>                        \
      __cm_intrinsic_impl_si2fp_##N##_(cl_vector<uint64_t, N> a) {             \
    vector<float, N> Converted = details::__impl_si2fp__<float, N>(a);         \
    return Converted.cl_vector();                                              \
  }
580 
// Emits the exported N-wide signed 64-bit -> half entry point
// __cm_intrinsic_impl_si2fp__half_<N>_. The argument is declared as
// cl_vector<uint64_t, N>; details::__impl_si2fp__ inspects its sign bit.
#define __SI2FP_H_VECTOR_IMPL(N)                                               \
  CM_NODEBUG CM_NOINLINE extern "C" cl_vector<half, N>                         \
      __cm_intrinsic_impl_si2fp__half_##N##_(cl_vector<uint64_t, N> a) {       \
    vector<half, N> Converted = details::__impl_si2fp__<half, N>(a);           \
    return Converted.cl_vector();                                              \
  }
587 
588 // FP2UI
589 // special case - input not a vector
590 CM_NODEBUG CM_NOINLINE extern "C" uint64_t
__cm_intrinsic_impl_fp2ui_1_double_base__(double a)591 __cm_intrinsic_impl_fp2ui_1_double_base__(double a) {
592   vector<uint64_t, 1> b =
593       details::__impl_fp2ui__double__<1, false>(vector<double, 1>(a));
594   return b[0];
595 }
596 
597 // special case - input not a vector
598 CM_NODEBUG CM_NOINLINE extern "C" uint64_t
__cm_intrinsic_impl_fp2ui_1_base__(float a)599 __cm_intrinsic_impl_fp2ui_1_base__(float a) {
600   vector<uint64_t, 1> b =
601       details::__impl_fp2ui__<1, false>(vector<float, 1>(a));
602   return b[0];
603 }
604 
605 // special case - input not a vector
606 CM_NODEBUG CM_NOINLINE extern "C" uint64_t
__cm_intrinsic_impl_fp2ui_1_half_base__(half a)607 __cm_intrinsic_impl_fp2ui_1_half_base__(half a) {
608   vector<uint64_t, 1> b =
609       details::__impl_fp2ui__half__<1, false>(vector<half, 1>(a));
610   return b[0];
611 }
612 
613 // FP2SI
614 // special case - input not a vector
615 CM_NODEBUG CM_NOINLINE extern "C" int64_t
__cm_intrinsic_impl_fp2si_1_double_base__(double a)616 __cm_intrinsic_impl_fp2si_1_double_base__(double a) {
617   vector<int64_t, 1> b =
618       details::__impl_fp2ui__double__<1, true>(vector<double, 1>(a));
619   return b[0];
620 }
621 
622 // special case - input not a vector
623 CM_NODEBUG CM_NOINLINE extern "C" int64_t
__cm_intrinsic_impl_fp2si_1_base__(float a)624 __cm_intrinsic_impl_fp2si_1_base__(float a) {
625   vector<int64_t, 1> b = details::__impl_fp2ui__<1, true>(vector<float, 1>(a));
626   return b[0];
627 }
628 
629 // special case - input not a vector
630 CM_NODEBUG CM_NOINLINE extern "C" int64_t
__cm_intrinsic_impl_fp2si_1_half_base__(half a)631 __cm_intrinsic_impl_fp2si_1_half_base__(half a) {
632   vector<int64_t, 1> b =
633       details::__impl_fp2ui__half__<1, true>(vector<half, 1>(a));
634   return b[0];
635 }
636 
637 // UI2FP
638 // special case - input not a vector
639 CM_NODEBUG CM_NOINLINE extern "C" double
__cm_intrinsic_impl_ui2fp_1_double_base__(uint64_t a)640 __cm_intrinsic_impl_ui2fp_1_double_base__(uint64_t a) {
641   vector<double, 1> b =
642       details::__impl_ui2fp__double__<1>(vector<uint64_t, 1>(a));
643   return b[0];
644 }
645 
646 // special case - input not a vector
647 CM_NODEBUG CM_NOINLINE extern "C" float
__cm_intrinsic_impl_ui2fp_1_base__(uint64_t a)648 __cm_intrinsic_impl_ui2fp_1_base__(uint64_t a) {
649   vector<float, 1> b = details::__impl_ui2fp__<1>(vector<uint64_t, 1>(a));
650   return b[0];
651 }
652 
653 // special case - input not a vector
654 CM_NODEBUG CM_NOINLINE extern "C" half
__cm_intrinsic_impl_ui2fp_1_half_base__(uint64_t a)655 __cm_intrinsic_impl_ui2fp_1_half_base__(uint64_t a) {
656   vector<half, 1> b = details::__impl_ui2fp__half__<1>(vector<uint64_t, 1>(a));
657   return b[0];
658 }
659 
660 // SI2FP
661 // special case - input not a vector
662 CM_NODEBUG CM_NOINLINE extern "C" double
__cm_intrinsic_impl_si2fp_1_double_base__(int64_t a)663 __cm_intrinsic_impl_si2fp_1_double_base__(int64_t a) {
664   vector<double, 1> b =
665       details::__impl_si2fp__<double, 1>(vector<int64_t, 1>(a));
666   return b[0];
667 }
668 
669 // special case - input not a vector
670 CM_NODEBUG CM_NOINLINE extern "C" float
__cm_intrinsic_impl_si2fp_1_base__(int64_t a)671 __cm_intrinsic_impl_si2fp_1_base__(int64_t a) {
672   vector<float, 1> b = details::__impl_si2fp__<float, 1>(vector<int64_t, 1>(a));
673   return b[0];
674 }
675 
676 // special case - input not a vector
677 CM_NODEBUG CM_NOINLINE extern "C" half
__cm_intrinsic_impl_si2fp_1_half_base__(int64_t a)678 __cm_intrinsic_impl_si2fp_1_half_base__(int64_t a) {
679   vector<half, 1> b = details::__impl_si2fp__<half, 1>(vector<int64_t, 1>(a));
680   return b[0];
681 }
682 
683 #define __DEFINE_FP2UI_FUN(N)                                                  \
684   __FP2UI_D_VECTOR_IMPL(N);                                                    \
685   __FP2UI_VECTOR_IMPL(N);                                                      \
686   __FP2UI_H_VECTOR_IMPL(N);                                                    \
687   __FP2SI_D_VECTOR_IMPL(N);                                                    \
688   __FP2SI_VECTOR_IMPL(N);                                                      \
689   __FP2SI_H_VECTOR_IMPL(N);                                                    \
690   __UI2FP_D_VECTOR_IMPL(N);                                                    \
691   __UI2FP_VECTOR_IMPL(N);                                                      \
692   __UI2FP_H_VECTOR_IMPL(N);                                                    \
693   __SI2FP_D_VECTOR_IMPL(N);                                                    \
694   __SI2FP_VECTOR_IMPL(N);                                                      \
695   __SI2FP_H_VECTOR_IMPL(N);
696 
697 __DEFINE_FP2UI_FUN(1);
698 __DEFINE_FP2UI_FUN(2);
699 __DEFINE_FP2UI_FUN(4);
700 __DEFINE_FP2UI_FUN(8);
701 __DEFINE_FP2UI_FUN(16);
702 __DEFINE_FP2UI_FUN(32);
703