1/* Safe-but-slow arithmetic that can handle larger numbers without
2   overflowing. */
3#define DEFINE_SAFE_1(STYPE)                                                  \
4                                                                              \
5  STYPE##2 _CL_OVERLOADABLE safe_normalize (STYPE##2 const a)                 \
6  {                                                                           \
7    STYPE const halfbits = 4 * sizeof (STYPE);                                \
8    STYPE const halfmax = (STYPE)1 << halfbits;                               \
9    STYPE const halfmask = halfmax - (STYPE)1;                                \
10    STYPE##2 b;                                                               \
11    b.s0 = a.s0 & halfmask;                                                   \
12    b.s1 = a.s1 + (STYPE) (a.s0 >> halfbits);                                 \
13    return b;                                                                 \
14  }                                                                           \
15                                                                              \
16  STYPE##4 _CL_OVERLOADABLE safe_normalize (STYPE##4 const a)                 \
17  {                                                                           \
18    STYPE const halfbits = 4 * sizeof (STYPE);                                \
19    STYPE const halfmax = (STYPE)1 << halfbits;                               \
20    STYPE const halfmask = halfmax - (STYPE)1;                                \
21    STYPE tmp;                                                                \
22    STYPE##4 b;                                                               \
23    tmp = a.s0;                                                               \
24    b.s0 = tmp & halfmask;                                                    \
25    tmp = (STYPE) (tmp >> halfbits) + a.s1;                                   \
26    b.s1 = tmp & halfmask;                                                    \
27    tmp = (STYPE) (tmp >> halfbits) + a.s2;                                   \
28    b.s2 = tmp & halfmask;                                                    \
29    tmp = (STYPE) (tmp >> halfbits) + a.s3;                                   \
30    b.s3 = tmp;                                                               \
31    return b;                                                                 \
32  }                                                                           \
33                                                                              \
34  STYPE _CL_OVERLOADABLE safe_extract (STYPE##2 const a)                      \
35  {                                                                           \
36    STYPE const halfbits = 4 * sizeof (STYPE);                                \
37    STYPE const halfmax = (STYPE)1 << halfbits;                               \
38    STYPE const halfmask = halfmax - (STYPE)1;                                \
39    STYPE b;                                                                  \
40    b = a.s0 | (STYPE) (a.s1 << halfbits);                                    \
41    return b;                                                                 \
42  }                                                                           \
43                                                                              \
44  STYPE _CL_OVERLOADABLE safe_extract (STYPE##4 const a)                      \
45  {                                                                           \
46    STYPE const halfbits = 4 * sizeof (STYPE);                                \
47    STYPE const halfmax = (STYPE)1 << halfbits;                               \
48    STYPE const halfmask = halfmax - (STYPE)1;                                \
49    STYPE b;                                                                  \
50    if (safe_extract (a.hi) != 0 && safe_extract (a.hi) != -1)                \
51      {                                                                       \
52        printf ("FAIL: safe_extract [%d,%d,%d,%d]\n", (int)a.s0, (int)a.s1,   \
53                (int)a.s2, (int)a.s3);                                        \
54      }                                                                       \
55    return safe_extract (a.lo);                                               \
56  }                                                                           \
57                                                                              \
58  bool _CL_OVERLOADABLE safe_isneg (STYPE##2 a) { return a.s1 < (STYPE)0; }   \
59                                                                              \
60  bool _CL_OVERLOADABLE safe_isneg (STYPE##4 a) { return a.s3 < (STYPE)0; }   \
61                                                                              \
62  STYPE##2 _CL_OVERLOADABLE safe_neg (STYPE##2 a)                             \
63  {                                                                           \
64    STYPE##2 b;                                                               \
65    b.s0 = -a.s0;                                                             \
66    b.s1 = -a.s1;                                                             \
67    return safe_normalize (b);                                                \
68  }                                                                           \
69                                                                              \
70  STYPE##4 _CL_OVERLOADABLE safe_neg (STYPE##4 a)                             \
71  {                                                                           \
72    STYPE##4 b;                                                               \
73    b.s0 = -a.s0;                                                             \
74    b.s1 = -a.s1;                                                             \
75    b.s2 = -a.s2;                                                             \
76    b.s3 = -a.s3;                                                             \
77    return safe_normalize (b);                                                \
78  }                                                                           \
79                                                                              \
80  STYPE##2 _CL_OVERLOADABLE safe_abs (STYPE##2 const a)                       \
81  {                                                                           \
82    STYPE##2 b;                                                               \
83    b = a;                                                                    \
84    if (safe_isneg (b))                                                       \
85      {                                                                       \
86        b = safe_neg (b);                                                     \
87      }                                                                       \
88    return b;                                                                 \
89  }                                                                           \
90                                                                              \
91  STYPE##4 _CL_OVERLOADABLE safe_abs (STYPE##4 const a)                       \
92  {                                                                           \
93    STYPE##4 b;                                                               \
94    b = a;                                                                    \
95    if (safe_isneg (b))                                                       \
96      {                                                                       \
97        b = safe_neg (b);                                                     \
98      }                                                                       \
99    return b;                                                                 \
100  }                                                                           \
101                                                                              \
102  STYPE##2 _CL_OVERLOADABLE safe_add (STYPE##2 const a, STYPE##2 const b)     \
103  {                                                                           \
104    STYPE##2 c;                                                               \
105    c.s0 = a.s0 + b.s0;                                                       \
106    c.s1 = a.s1 + b.s1;                                                       \
107    return safe_normalize (c);                                                \
108  }                                                                           \
109                                                                              \
110  STYPE##4 _CL_OVERLOADABLE safe_add (STYPE##4 const a, STYPE##4 const b)     \
111  {                                                                           \
112    STYPE##4 c;                                                               \
113    c.s0 = a.s0 + b.s0;                                                       \
114    c.s1 = a.s1 + b.s1;                                                       \
115    c.s2 = a.s2 + b.s2;                                                       \
116    c.s3 = a.s3 + b.s3;                                                       \
117    return safe_normalize (c);                                                \
118  }                                                                           \
119                                                                              \
120  STYPE##2 _CL_OVERLOADABLE safe_sub (STYPE##2 const a, STYPE##2 const b)     \
121  {                                                                           \
122    STYPE##2 c;                                                               \
123    c.s0 = a.s0 - b.s0;                                                       \
124    c.s1 = a.s1 - b.s1;                                                       \
125    return safe_normalize (c);                                                \
126  }                                                                           \
127                                                                              \
128  STYPE##4 _CL_OVERLOADABLE safe_sub (STYPE##4 const a, STYPE##4 const b)     \
129  {                                                                           \
130    STYPE##4 c;                                                               \
131    c.s0 = a.s0 - b.s0;                                                       \
132    c.s1 = a.s1 - b.s1;                                                       \
133    c.s2 = a.s2 - b.s2;                                                       \
134    c.s3 = a.s3 - b.s3;                                                       \
135    return safe_normalize (c);                                                \
136  }                                                                           \
137                                                                              \
138  STYPE##2 _CL_OVERLOADABLE safe_create (STYPE const a);                      \
139  STYPE##2 _CL_OVERLOADABLE safe_minimul (STYPE const a, STYPE const b)       \
140  {                                                                           \
141    STYPE##2 tmp1 = safe_create ((STYPE) (a * (STYPE) (b & (STYPE)1)));       \
142    STYPE##2 tmp2 = safe_create ((STYPE) (a * (STYPE) (b >> (STYPE)1)));      \
143    STYPE##2 res;                                                             \
144    res = safe_add (tmp1, safe_add (tmp2, tmp2));                             \
145    return res;                                                               \
146  }                                                                           \
147                                                                              \
148  STYPE##4 _CL_OVERLOADABLE safe_mul (STYPE##2 a, STYPE##2 b)                 \
149  {                                                                           \
150    bool a_neg = safe_isneg (a);                                              \
151    bool b_neg = safe_isneg (b);                                              \
152    a = safe_abs (a);                                                         \
153    b = safe_abs (b);                                                         \
154    STYPE##4 c00, c01, c10, c11;                                              \
155    c00 = 0;                                                                  \
156    c00.s01 = safe_minimul (a.s0, b.s0);                                      \
157    c00 = safe_normalize (c00);                                               \
158    c01 = 0;                                                                  \
159    c01.s12 = safe_minimul (a.s0, b.s1);                                      \
160    c01 = safe_normalize (c01);                                               \
161    c10 = 0;                                                                  \
162    c10.s12 = safe_minimul (a.s1, b.s0);                                      \
163    c10 = safe_normalize (c10);                                               \
164    c11 = 0;                                                                  \
165    c11.s23 = safe_minimul (a.s1, b.s1);                                      \
166    c11 = safe_normalize (c11);                                               \
167    STYPE##4 c;                                                               \
168    c = safe_add (safe_add (c00, c01), safe_add (c10, c11));                  \
169    if (a_neg ^ b_neg)                                                        \
170      c = safe_neg (c);                                                       \
171    return c;                                                                 \
172  }                                                                           \
173                                                                              \
174  STYPE##2 _CL_OVERLOADABLE safe_max (STYPE##2 const a, STYPE##2 const b)     \
175  {                                                                           \
176    STYPE##2 c;                                                               \
177    if (safe_isneg (safe_sub (a, b)))                                         \
178      {                                                                       \
179        c = b;                                                                \
180      }                                                                       \
181    else                                                                      \
182      {                                                                       \
183        c = a;                                                                \
184      }                                                                       \
185    return c;                                                                 \
186  }                                                                           \
187                                                                              \
188  STYPE##4 _CL_OVERLOADABLE safe_max (STYPE##4 const a, STYPE##4 const b)     \
189  {                                                                           \
190    STYPE##4 c;                                                               \
191    if (safe_isneg (safe_sub (a, b)))                                         \
192      {                                                                       \
193        c = b;                                                                \
194      }                                                                       \
195    else                                                                      \
196      {                                                                       \
197        c = a;                                                                \
198      }                                                                       \
199    return c;                                                                 \
200  }                                                                           \
201                                                                              \
202  STYPE##2 _CL_OVERLOADABLE safe_min (STYPE##2 const a, STYPE##2 const b)     \
203  {                                                                           \
204    STYPE##2 c;                                                               \
205    if (safe_isneg (safe_sub (a, b)))                                         \
206      {                                                                       \
207        c = a;                                                                \
208      }                                                                       \
209    else                                                                      \
210      {                                                                       \
211        c = b;                                                                \
212      }                                                                       \
213    return c;                                                                 \
214  }                                                                           \
215                                                                              \
216  STYPE##4 _CL_OVERLOADABLE safe_min (STYPE##4 const a, STYPE##4 const b)     \
217  {                                                                           \
218    STYPE##4 c;                                                               \
219    if (safe_isneg (safe_sub (a, b)))                                         \
220      {                                                                       \
221        c = a;                                                                \
222      }                                                                       \
223    else                                                                      \
224      {                                                                       \
225        c = b;                                                                \
226      }                                                                       \
227    return c;                                                                 \
228  }                                                                           \
229                                                                              \
230  STYPE##2 _CL_OVERLOADABLE safe_clamp (STYPE##2 const a, STYPE##2 const alo, \
231                                        STYPE##2 const ahi)                   \
232  {                                                                           \
233    return safe_max (alo, safe_min (ahi, a));                                 \
234  }                                                                           \
235                                                                              \
236  STYPE##4 _CL_OVERLOADABLE safe_clamp (STYPE##4 const a, STYPE##4 const alo, \
237                                        STYPE##4 const ahi)                   \
238  {                                                                           \
239    return safe_max (alo, safe_min (ahi, a));                                 \
240  }                                                                           \
241                                                                              \
242  STYPE##2 _CL_OVERLOADABLE safe_rshift (STYPE##2 a)                          \
243  {                                                                           \
244    STYPE const halfbits = 4 * sizeof (STYPE);                                \
245    STYPE const halfmax = (STYPE)1 << halfbits;                               \
246    STYPE const halfmask = halfmax - (STYPE)1;                                \
247    STYPE##2 b;                                                               \
248    b.s0 = a.s0 | ((a.s1 & (STYPE)1) << halfbits);                            \
249    b.s1 = a.s1 & ~(STYPE)1;                                                  \
250    b.s0 >>= (STYPE)1;                                                        \
251    b.s1 >>= (STYPE)1;                                                        \
252    return safe_normalize (b);                                                \
253  }                                                                           \
254                                                                              \
255  STYPE##2 _CL_OVERLOADABLE safe_lo (STYPE##2 a)                              \
256  {                                                                           \
257    STYPE const halfbits = 4 * sizeof (STYPE);                                \
258    STYPE const halfmax = (STYPE)1 << halfbits;                               \
259    STYPE const halfmask = halfmax - (STYPE)1;                                \
260    bool a_neg = a.s1 < (STYPE)0;                                             \
261    a = safe_abs (a);                                                         \
262    if (a.s1 >= halfmax)                                                      \
263      a.s1 &= halfmask;                                                       \
264    if (a_neg)                                                                \
265      a = safe_neg (a);                                                       \
266    return a;                                                                 \
267  }                                                                           \
268                                                                              \
269  STYPE##2 _CL_OVERLOADABLE safe_lo (STYPE##4 a)                              \
270  {                                                                           \
271    bool a_neg = a.s3 < (STYPE)0;                                             \
272    a = safe_abs (a);                                                         \
273    STYPE##2 res = safe_normalize (a.lo);                                     \
274    if (a_neg)                                                                \
275      res = safe_neg (res);                                                   \
276    return res;                                                               \
277  }                                                                           \
278                                                                              \
279  STYPE##2 _CL_OVERLOADABLE safe_hi (STYPE##4 a)                              \
280  {                                                                           \
281    return safe_normalize (a.hi);                                             \
282  }
283
284#define DEFINE_SAFE_2(TYPE, STYPE)                                            \
285                                                                              \
286  STYPE##2 _CL_OVERLOADABLE safe_create (TYPE const a)                        \
287  {                                                                           \
288    STYPE const halfbits = 4 * sizeof (STYPE);                                \
289    STYPE const halfmax = (STYPE)1 << halfbits;                               \
290    STYPE const halfmask = halfmax - (STYPE)1;                                \
291    STYPE##2 b;                                                               \
292    /* input may be unsigned */                                               \
293    b.s0 = a & (TYPE)halfmask;                                                \
294    b.s1 = a >> (TYPE)halfbits;                                               \
295    b = safe_normalize (b);                                                   \
296    if ((TYPE)safe_extract (b) != a)                                          \
297      {                                                                       \
298        printf ("FAIL: safe_create %d (got %d)\n", (int)a,                    \
299                (int)(TYPE)safe_extract (b));                                 \
300      }                                                                       \
301    return b;                                                                 \
302  }                                                                           \
303                                                                              \
304  STYPE##4 _CL_OVERLOADABLE safe_create4 (TYPE const a)                       \
305  {                                                                           \
306    STYPE const halfbits = 4 * sizeof (STYPE);                                \
307    STYPE const halfmax = (STYPE)1 << halfbits;                               \
308    STYPE const halfmask = halfmax - (STYPE)1;                                \
309    STYPE##4 b;                                                               \
310    /* input may be unsigned */                                               \
311    TYPE tmp = a;                                                             \
312    b.s0 = tmp & (TYPE)halfmask;                                              \
313    tmp >>= halfbits;                                                         \
314    b.s1 = tmp & (TYPE)halfmask;                                              \
315    tmp >>= halfbits;                                                         \
316    b.s2 = tmp & (TYPE)halfmask;                                              \
317    tmp >>= halfbits;                                                         \
318    b.s3 = tmp;                                                               \
319    b = safe_normalize (b);                                                   \
320    if ((TYPE)safe_extract (b) != a)                                          \
321      {                                                                       \
322        printf ("FAIL: safe_create4 sz=%d sg=%d %d (got %d) [%d,%d,%d,%d]\n", \
323                (int)sizeof (TYPE), (int)((TYPE)-1 < (TYPE)0), (int)a,        \
324                (int)(TYPE)safe_extract (b), (int)b.s0, (int)b.s1, (int)b.s2, \
325                (int)b.s3);                                                   \
326      }                                                                       \
327    return b;                                                                 \
328  }
329
330
331
332DEFINE_SAFE_1 (char)
333DEFINE_SAFE_1 (short)
334DEFINE_SAFE_1 (int)
335__IF_INT64 (DEFINE_SAFE_1 (long))
336
337DEFINE_SAFE_2 (char, char)
338DEFINE_SAFE_2 (uchar, char)
339DEFINE_SAFE_2 (short, short)
340DEFINE_SAFE_2 (ushort, short)
341DEFINE_SAFE_2 (int, int)
342DEFINE_SAFE_2 (uint, int)
343__IF_INT64 (DEFINE_SAFE_2 (long, long))
344__IF_INT64 (DEFINE_SAFE_2 (ulong, long))
345
346
347
348#define IMPLEMENT_BODY_G_HADD(NAME, BODY, SIZE, GTYPE, SGTYPE, UGTYPE,        \
349                              SUGTYPE)                                        \
350  void NAME##_##GTYPE ()                                                      \
351  {                                                                           \
352    typedef GTYPE gtype;                                                      \
353    typedef SGTYPE sgtype;                                                    \
354    typedef UGTYPE ugtype;                                                    \
355    typedef SUGTYPE sugtype;                                                  \
356    string const typename = #GTYPE;                                           \
357    const int vecsize = SIZE;                                                 \
358    int const bits = count_bits (sgtype);                                     \
359    sgtype const tmin = is_signed (sgtype)                                    \
360                            ? (sgtype) ((sugtype)1 << (sugtype) (bits - 1))   \
361                            : (sgtype)0;                                      \
362    sgtype const tmax = (sgtype) ((sugtype)tmin - (sugtype)1);                \
363    for (int iter = 0; iter < nrandoms; ++iter)                               \
364      {                                                                       \
365        typedef union                                                         \
366        {                                                                     \
367          gtype v;                                                            \
368          ugtype u;                                                           \
369          sgtype s[16];                                                       \
370        } Tvec;                                                               \
371        Tvec x, y, z;                                                         \
372        Tvec good_abs;                                                        \
373        Tvec good_abs_diff, good_add_sat, good_mad_sat, good_sub_sat;         \
374        Tvec good_hadd, good_mad_hi, good_mul_hi, good_rhadd;                 \
375        for (int n = 0; n < vecsize; ++n)                                     \
376          {                                                                   \
377            x.s[n] = randoms[(iter + n) % nrandoms];                          \
378            y.s[n] = randoms[(iter + n + 20) % nrandoms];                     \
379            z.s[n] = randoms[(iter + n + 40) % nrandoms];                     \
380            if (bits > 32)                                                    \
381              {                                                               \
382                x.s[n] = (x.s[n] << (bits / 2))                               \
383                         | randoms[(iter + n + 100) % nrandoms];              \
384                y.s[n] = (y.s[n] << (bits / 2))                               \
385                         | randoms[(iter + n + 120) % nrandoms];              \
386                z.s[n] = (z.s[n] << (bits / 2))                               \
387                         | randoms[(iter + n + 140) % nrandoms];              \
388              }                                                               \
389            good_abs.s[n] = safe_extract (safe_abs (safe_create (x.s[n])));   \
390            good_abs_diff.s[n] = safe_extract (safe_abs (                     \
391                safe_sub (safe_create (x.s[n]), safe_create (y.s[n]))));      \
392            good_add_sat.s[n] = safe_extract (safe_clamp (                    \
393                safe_add (safe_create (x.s[n]), safe_create (y.s[n])),        \
394                safe_create (tmin), safe_create (tmax)));                     \
395            good_mad_sat.s[n] = safe_extract (                                \
396                safe_clamp (safe_add (safe_mul (safe_create (x.s[n]),         \
397                                                safe_create (y.s[n])),        \
398                                      safe_create4 (z.s[n])),                 \
399                            safe_create4 (tmin), safe_create4 (tmax)));       \
400            good_sub_sat.s[n] = safe_extract (safe_clamp (                    \
401                safe_sub (safe_create (x.s[n]), safe_create (y.s[n])),        \
402                safe_create (tmin), safe_create (tmax)));                     \
403            good_hadd.s[n] = safe_extract (safe_rshift (                      \
404                safe_add (safe_create (x.s[n]), safe_create (y.s[n]))));      \
405            good_mad_hi.s[n] = safe_extract (                                 \
406                safe_lo (safe_add (safe_hi (safe_mul (safe_create (x.s[n]),   \
407                                                      safe_create (y.s[n]))), \
408                                   safe_create (z.s[n]))));                   \
409            good_mul_hi.s[n] = safe_extract (safe_hi (                        \
410                safe_mul (safe_create (x.s[n]), safe_create (y.s[n]))));      \
411            good_rhadd.s[n] = safe_extract (safe_rshift (safe_add (           \
412                safe_add (safe_create (x.s[n]), safe_create (y.s[n])),        \
413                safe_create ((sgtype)1))));                                   \
414          }                                                                   \
415        Tvec res_abs;                                                         \
416        Tvec res_abs_diff, res_add_sat, res_mad_sat, res_sub_sat;             \
417        Tvec res_hadd, res_mad_hi, res_mul_hi, res_rhadd;                     \
418        res_abs.u = abs (x.v);                                                \
419        res_abs_diff.u = abs_diff (x.v, y.v);                                 \
420        res_add_sat.v = add_sat (x.v, y.v);                                   \
421        res_mad_sat.v = mad_sat (x.v, y.v, z.v);                              \
422        res_sub_sat.v = sub_sat (x.v, y.v);                                   \
423        res_hadd.v = hadd (x.v, y.v);                                         \
424        res_mad_hi.v = mad_hi (x.v, y.v, z.v);                                \
425        res_mul_hi.v = mul_hi (x.v, y.v);                                     \
426        res_rhadd.v = rhadd (x.v, y.v);                                       \
427        bool error = false;                                                   \
428        bool equal;                                                           \
429        BODY;                                                                 \
430      }                                                                       \
431  }
432
433
/* Private helper for DEFINE_BODY_G_HADD: expands IMPLEMENT_BODY_G_HADD six
   times for one element type, in the order scalar, 2, 3, 4, 8, 16.

   S is the scalar type and U the unsigned scalar type paired with it.  The
   vector type names are built by pasting the width onto the scalar name
   (S##2 -> char2, U##16 -> uchar16, ...), so S and U must be plain type
   names.  NAME and EXPR are forwarded untouched.  */
#define DEFINE_BODY_G_HADD_WIDTHS(NAME, EXPR, S, U)                           \
  IMPLEMENT_BODY_G_HADD (NAME, EXPR, 1, S, S, U, U)                           \
  IMPLEMENT_BODY_G_HADD (NAME, EXPR, 2, S##2, S, U##2, U)                     \
  IMPLEMENT_BODY_G_HADD (NAME, EXPR, 3, S##3, S, U##3, U)                     \
  IMPLEMENT_BODY_G_HADD (NAME, EXPR, 4, S##4, S, U##4, U)                     \
  IMPLEMENT_BODY_G_HADD (NAME, EXPR, 8, S##8, S, U##8, U)                     \
  IMPLEMENT_BODY_G_HADD (NAME, EXPR, 16, S##16, S, U##16, U)

/* Instantiate IMPLEMENT_BODY_G_HADD (NAME, EXPR, ...) for every integer
   element type and every width.

   Element types are visited in the order char, uchar, short, ushort, int,
   uint, long, ulong; within each type the widths run 1, 2, 3, 4, 8, 16.
   Each instantiation receives the width, the (vector) type, its scalar
   element type, and the unsigned (vector) type with its scalar element
   type; for the unsigned element types both pairs are the same.

   The long/ulong instantiations are passed through __IF_INT64
   (presumably dropped when 64-bit integers are unavailable -- confirm
   against its definition).  */
#define DEFINE_BODY_G_HADD(NAME, EXPR)                                        \
  DEFINE_BODY_G_HADD_WIDTHS (NAME, EXPR, char, uchar)                         \
  DEFINE_BODY_G_HADD_WIDTHS (NAME, EXPR, uchar, uchar)                        \
  DEFINE_BODY_G_HADD_WIDTHS (NAME, EXPR, short, ushort)                       \
  DEFINE_BODY_G_HADD_WIDTHS (NAME, EXPR, ushort, ushort)                      \
  DEFINE_BODY_G_HADD_WIDTHS (NAME, EXPR, int, uint)                           \
  DEFINE_BODY_G_HADD_WIDTHS (NAME, EXPR, uint, uint)                          \
  __IF_INT64 (                                                                \
  DEFINE_BODY_G_HADD_WIDTHS (NAME, EXPR, long, ulong)                         \
  DEFINE_BODY_G_HADD_WIDTHS (NAME, EXPR, ulong, ulong))
484