1/* Safe-but-slow arithmetic that can handle larger numbers without 2 overflowing. */ 3#define DEFINE_SAFE_1(STYPE) \ 4 \ 5 STYPE##2 _CL_OVERLOADABLE safe_normalize (STYPE##2 const a) \ 6 { \ 7 STYPE const halfbits = 4 * sizeof (STYPE); \ 8 STYPE const halfmax = (STYPE)1 << halfbits; \ 9 STYPE const halfmask = halfmax - (STYPE)1; \ 10 STYPE##2 b; \ 11 b.s0 = a.s0 & halfmask; \ 12 b.s1 = a.s1 + (STYPE) (a.s0 >> halfbits); \ 13 return b; \ 14 } \ 15 \ 16 STYPE##4 _CL_OVERLOADABLE safe_normalize (STYPE##4 const a) \ 17 { \ 18 STYPE const halfbits = 4 * sizeof (STYPE); \ 19 STYPE const halfmax = (STYPE)1 << halfbits; \ 20 STYPE const halfmask = halfmax - (STYPE)1; \ 21 STYPE tmp; \ 22 STYPE##4 b; \ 23 tmp = a.s0; \ 24 b.s0 = tmp & halfmask; \ 25 tmp = (STYPE) (tmp >> halfbits) + a.s1; \ 26 b.s1 = tmp & halfmask; \ 27 tmp = (STYPE) (tmp >> halfbits) + a.s2; \ 28 b.s2 = tmp & halfmask; \ 29 tmp = (STYPE) (tmp >> halfbits) + a.s3; \ 30 b.s3 = tmp; \ 31 return b; \ 32 } \ 33 \ 34 STYPE _CL_OVERLOADABLE safe_extract (STYPE##2 const a) \ 35 { \ 36 STYPE const halfbits = 4 * sizeof (STYPE); \ 37 STYPE const halfmax = (STYPE)1 << halfbits; \ 38 STYPE const halfmask = halfmax - (STYPE)1; \ 39 STYPE b; \ 40 b = a.s0 | (STYPE) (a.s1 << halfbits); \ 41 return b; \ 42 } \ 43 \ 44 STYPE _CL_OVERLOADABLE safe_extract (STYPE##4 const a) \ 45 { \ 46 STYPE const halfbits = 4 * sizeof (STYPE); \ 47 STYPE const halfmax = (STYPE)1 << halfbits; \ 48 STYPE const halfmask = halfmax - (STYPE)1; \ 49 STYPE b; \ 50 if (safe_extract (a.hi) != 0 && safe_extract (a.hi) != -1) \ 51 { \ 52 printf ("FAIL: safe_extract [%d,%d,%d,%d]\n", (int)a.s0, (int)a.s1, \ 53 (int)a.s2, (int)a.s3); \ 54 } \ 55 return safe_extract (a.lo); \ 56 } \ 57 \ 58 bool _CL_OVERLOADABLE safe_isneg (STYPE##2 a) { return a.s1 < (STYPE)0; } \ 59 \ 60 bool _CL_OVERLOADABLE safe_isneg (STYPE##4 a) { return a.s3 < (STYPE)0; } \ 61 \ 62 STYPE##2 _CL_OVERLOADABLE safe_neg (STYPE##2 a) \ 63 { \ 64 STYPE##2 b; \ 65 b.s0 = -a.s0; \ 66 b.s1 = -a.s1; \ 67 return safe_normalize (b); \ 68 } \ 69 \ 70 STYPE##4 _CL_OVERLOADABLE safe_neg (STYPE##4 a) \ 71 { \ 72 STYPE##4 b; \ 73 b.s0 = -a.s0; \ 74 b.s1 = -a.s1; \ 75 b.s2 = -a.s2; \ 76 b.s3 = -a.s3; \ 77 return safe_normalize (b); \ 78 } \ 79 \ 80 STYPE##2 _CL_OVERLOADABLE safe_abs (STYPE##2 const a) \ 81 { \ 82 STYPE##2 b; \ 83 b = a; \ 84 if (safe_isneg (b)) \ 85 { \ 86 b = safe_neg (b); \ 87 } \ 88 return b; \ 89 } \ 90 \ 91 STYPE##4 _CL_OVERLOADABLE safe_abs (STYPE##4 const a) \ 92 { \ 93 STYPE##4 b; \ 94 b = a; \ 95 if (safe_isneg (b)) \ 96 { \ 97 b = safe_neg (b); \ 98 } \ 99 return b; \ 100 } \ 101 \ 102 STYPE##2 _CL_OVERLOADABLE safe_add (STYPE##2 const a, STYPE##2 const b) \ 103 { \ 104 STYPE##2 c; \ 105 c.s0 = a.s0 + b.s0; \ 106 c.s1 = a.s1 + b.s1; \ 107 return safe_normalize (c); \ 108 } \ 109 \ 110 STYPE##4 _CL_OVERLOADABLE safe_add (STYPE##4 const a, STYPE##4 const b) \ 111 { \ 112 STYPE##4 c; \ 113 c.s0 = a.s0 + b.s0; \ 114 c.s1 = a.s1 + b.s1; \ 115 c.s2 = a.s2 + b.s2; \ 116 c.s3 = a.s3 + b.s3; \ 117 return safe_normalize (c); \ 118 } \ 119 \ 120 STYPE##2 _CL_OVERLOADABLE safe_sub (STYPE##2 const a, STYPE##2 const b) \ 121 { \ 122 STYPE##2 c; \ 123 c.s0 = a.s0 - b.s0; \ 124 c.s1 = a.s1 - b.s1; \ 125 return safe_normalize (c); \ 126 } \ 127 \ 128 STYPE##4 _CL_OVERLOADABLE safe_sub (STYPE##4 const a, STYPE##4 const b) \ 129 { \ 130 STYPE##4 c; \ 131 c.s0 = a.s0 - b.s0; \ 132 c.s1 = a.s1 - b.s1; \ 133 c.s2 = a.s2 - b.s2; \ 134 c.s3 = a.s3 - b.s3; \ 135 return safe_normalize (c); \ 136 } \ 137 \ 138 STYPE##2 _CL_OVERLOADABLE safe_create (STYPE const a); \ 139 STYPE##2 _CL_OVERLOADABLE safe_minimul (STYPE const a, STYPE const b) \ 140 { \ 141 STYPE##2 tmp1 = safe_create ((STYPE) (a * (STYPE) (b & (STYPE)1))); \ 142 STYPE##2 tmp2 = safe_create ((STYPE) (a * (STYPE) (b >> (STYPE)1))); \ 143 STYPE##2 res; \ 144 res = safe_add (tmp1, safe_add (tmp2, tmp2)); \ 145 return res; \ 146 } \ 147 \ 148 STYPE##4 _CL_OVERLOADABLE safe_mul (STYPE##2 a, STYPE##2 b) \ 149 { \ 150 bool a_neg = safe_isneg (a); \ 151 bool b_neg = safe_isneg (b); \ 152 a = safe_abs (a); \ 153 b = safe_abs (b); \ 154 STYPE##4 c00, c01, c10, c11; \ 155 c00 = 0; \ 156 c00.s01 = safe_minimul (a.s0, b.s0); \ 157 c00 = safe_normalize (c00); \ 158 c01 = 0; \ 159 c01.s12 = safe_minimul (a.s0, b.s1); \ 160 c01 = safe_normalize (c01); \ 161 c10 = 0; \ 162 c10.s12 = safe_minimul (a.s1, b.s0); \ 163 c10 = safe_normalize (c10); \ 164 c11 = 0; \ 165 c11.s23 = safe_minimul (a.s1, b.s1); \ 166 c11 = safe_normalize (c11); \ 167 STYPE##4 c; \ 168 c = safe_add (safe_add (c00, c01), safe_add (c10, c11)); \ 169 if (a_neg ^ b_neg) \ 170 c = safe_neg (c); \ 171 return c; \ 172 } \ 173 \ 174 STYPE##2 _CL_OVERLOADABLE safe_max (STYPE##2 const a, STYPE##2 const b) \ 175 { \ 176 STYPE##2 c; \ 177 if (safe_isneg (safe_sub (a, b))) \ 178 { \ 179 c = b; \ 180 } \ 181 else \ 182 { \ 183 c = a; \ 184 } \ 185 return c; \ 186 } \ 187 \ 188 STYPE##4 _CL_OVERLOADABLE safe_max (STYPE##4 const a, STYPE##4 const b) \ 189 { \ 190 STYPE##4 c; \ 191 if (safe_isneg (safe_sub (a, b))) \ 192 { \ 193 c = b; \ 194 } \ 195 else \ 196 { \ 197 c = a; \ 198 } \ 199 return c; \ 200 } \ 201 \ 202 STYPE##2 _CL_OVERLOADABLE safe_min (STYPE##2 const a, STYPE##2 const b) \ 203 { \ 204 STYPE##2 c; \ 205 if (safe_isneg (safe_sub (a, b))) \ 206 { \ 207 c = a; \ 208 } \ 209 else \ 210 { \ 211 c = b; \ 212 } \ 213 return c; \ 214 } \ 215 \ 216 STYPE##4 _CL_OVERLOADABLE safe_min (STYPE##4 const a, STYPE##4 const b) \ 217 { \ 218 STYPE##4 c; \ 219 if (safe_isneg (safe_sub (a, b))) \ 220 { \ 221 c = a; \ 222 } \ 223 else \ 224 { \ 225 c = b; \ 226 } \ 227 return c; \ 228 } \ 229 \ 230 STYPE##2 _CL_OVERLOADABLE safe_clamp (STYPE##2 const a, STYPE##2 const alo, \ 231 STYPE##2 const ahi) \ 232 { \ 233 return safe_max (alo, safe_min (ahi, a)); \ 234 } \ 235 \ 236 STYPE##4 _CL_OVERLOADABLE safe_clamp (STYPE##4 const a, STYPE##4 const alo, \ 237 STYPE##4 const ahi) \ 238 { \ 239 return safe_max (alo, safe_min (ahi, a)); \ 240 } \ 241 \ 242 STYPE##2 _CL_OVERLOADABLE safe_rshift (STYPE##2 a) \ 243 { \ 244 STYPE const halfbits = 4 * sizeof (STYPE); \ 245 STYPE const halfmax = (STYPE)1 << halfbits; \ 246 STYPE const halfmask = halfmax - (STYPE)1; \ 247 STYPE##2 b; \ 248 b.s0 = a.s0 | ((a.s1 & (STYPE)1) << halfbits); \ 249 b.s1 = a.s1 & ~(STYPE)1; \ 250 b.s0 >>= (STYPE)1; \ 251 b.s1 >>= (STYPE)1; \ 252 return safe_normalize (b); \ 253 } \ 254 \ 255 STYPE##2 _CL_OVERLOADABLE safe_lo (STYPE##2 a) \ 256 { \ 257 STYPE const halfbits = 4 * sizeof (STYPE); \ 258 STYPE const halfmax = (STYPE)1 << halfbits; \ 259 STYPE const halfmask = halfmax - (STYPE)1; \ 260 bool a_neg = a.s1 < (STYPE)0; \ 261 a = safe_abs (a); \ 262 if (a.s1 >= halfmax) \ 263 a.s1 &= halfmask; \ 264 if (a_neg) \ 265 a = safe_neg (a); \ 266 return a; \ 267 } \ 268 \ 269 STYPE##2 _CL_OVERLOADABLE safe_lo (STYPE##4 a) \ 270 { \ 271 bool a_neg = a.s3 < (STYPE)0; \ 272 a = safe_abs (a); \ 273 STYPE##2 res = safe_normalize (a.lo); \ 274 if (a_neg) \ 275 res = safe_neg (res); \ 276 return res; \ 277 } \ 278 \ 279 STYPE##2 _CL_OVERLOADABLE safe_hi (STYPE##4 a) \ 280 { \ 281 return safe_normalize (a.hi); \ 282 } 283 284#define DEFINE_SAFE_2(TYPE, STYPE) \ 285 \ 286 STYPE##2 _CL_OVERLOADABLE safe_create (TYPE const a) \ 287 { \ 288 STYPE const halfbits = 4 * sizeof (STYPE); \ 289 STYPE const halfmax = (STYPE)1 << halfbits; \ 290 STYPE const halfmask = halfmax - (STYPE)1; \ 291 STYPE##2 b; \ 292 /* input may be unsigned */ \ 293 b.s0 = a & (TYPE)halfmask; \ 294 b.s1 = a >> (TYPE)halfbits; \ 295 b = safe_normalize (b); \ 296 if ((TYPE)safe_extract (b) != a) \ 297 { \ 298 printf ("FAIL: safe_create %d (got %d)\n", (int)a, \ 299 (int)(TYPE)safe_extract (b)); \ 300 } \ 301 return b; \ 302 } \ 303 \ 304 STYPE##4 _CL_OVERLOADABLE safe_create4 (TYPE const a) \ 305 { \ 306 STYPE const halfbits = 4 * sizeof (STYPE); \ 307 STYPE const halfmax = (STYPE)1 << halfbits; \ 308 STYPE const halfmask = halfmax - (STYPE)1; \ 309 STYPE##4 b; \ 310 /* input may be unsigned */ \ 311 TYPE tmp = a; \ 312 b.s0 = tmp & (TYPE)halfmask; \ 313 tmp >>= halfbits; \ 314 b.s1 = tmp & (TYPE)halfmask; \ 315 tmp >>= halfbits; \ 316 b.s2 = tmp & (TYPE)halfmask; \ 317 tmp >>= halfbits; \ 318 b.s3 = tmp; \ 319 b = safe_normalize (b); \ 320 if ((TYPE)safe_extract (b) != a) \ 321 { \ 322 printf ("FAIL: safe_create4 sz=%d sg=%d %d (got %d) [%d,%d,%d,%d]\n", \ 323 (int)sizeof (TYPE), (int)((TYPE)-1 < (TYPE)0), (int)a, \ 324 (int)(TYPE)safe_extract (b), (int)b.s0, (int)b.s1, (int)b.s2, \ 325 (int)b.s3); \ 326 } \ 327 return b; \ 328 } 329 330 331 332DEFINE_SAFE_1 (char) 333DEFINE_SAFE_1 (short) 334DEFINE_SAFE_1 (int) 335__IF_INT64 (DEFINE_SAFE_1 (long)) 336 337DEFINE_SAFE_2 (char, char) 338DEFINE_SAFE_2 (uchar, char) 339DEFINE_SAFE_2 (short, short) 340DEFINE_SAFE_2 (ushort, short) 341DEFINE_SAFE_2 (int, int) 342DEFINE_SAFE_2 (uint, int) 343__IF_INT64 (DEFINE_SAFE_2 (long, long)) 344__IF_INT64 (DEFINE_SAFE_2 (ulong, long)) 345 346 347 348#define IMPLEMENT_BODY_G_HADD(NAME, BODY, SIZE, GTYPE, SGTYPE, UGTYPE, \ 349 SUGTYPE) \ 350 void NAME##_##GTYPE () \ 351 { \ 352 typedef GTYPE gtype; \ 353 typedef SGTYPE sgtype; \ 354 typedef UGTYPE ugtype; \ 355 typedef SUGTYPE sugtype; \ 356 string const typename = #GTYPE; \ 357 const int vecsize = SIZE; \ 358 int const bits = count_bits (sgtype); \ 359 sgtype const tmin = is_signed (sgtype) \ 360 ? (sgtype) ((sugtype)1 << (sugtype) (bits - 1)) \ 361 : (sgtype)0; \ 362 sgtype const tmax = (sgtype) ((sugtype)tmin - (sugtype)1); \ 363 for (int iter = 0; iter < nrandoms; ++iter) \ 364 { \ 365 typedef union \ 366 { \ 367 gtype v; \ 368 ugtype u; \ 369 sgtype s[16]; \ 370 } Tvec; \ 371 Tvec x, y, z; \ 372 Tvec good_abs; \ 373 Tvec good_abs_diff, good_add_sat, good_mad_sat, good_sub_sat; \ 374 Tvec good_hadd, good_mad_hi, good_mul_hi, good_rhadd; \ 375 for (int n = 0; n < vecsize; ++n) \ 376 { \ 377 x.s[n] = randoms[(iter + n) % nrandoms]; \ 378 y.s[n] = randoms[(iter + n + 20) % nrandoms]; \ 379 z.s[n] = randoms[(iter + n + 40) % nrandoms]; \ 380 if (bits > 32) \ 381 { \ 382 x.s[n] = (x.s[n] << (bits / 2)) \ 383 | randoms[(iter + n + 100) % nrandoms]; \ 384 y.s[n] = (y.s[n] << (bits / 2)) \ 385 | randoms[(iter + n + 120) % nrandoms]; \ 386 z.s[n] = (z.s[n] << (bits / 2)) \ 387 | randoms[(iter + n + 140) % nrandoms]; \ 388 } \ 389 good_abs.s[n] = safe_extract (safe_abs (safe_create (x.s[n]))); \ 390 good_abs_diff.s[n] = safe_extract (safe_abs ( \ 391 safe_sub (safe_create (x.s[n]), safe_create (y.s[n])))); \ 392 good_add_sat.s[n] = safe_extract (safe_clamp ( \ 393 safe_add (safe_create (x.s[n]), safe_create (y.s[n])), \ 394 safe_create (tmin), safe_create (tmax))); \ 395 good_mad_sat.s[n] = safe_extract ( \ 396 safe_clamp (safe_add (safe_mul (safe_create (x.s[n]), \ 397 safe_create (y.s[n])), \ 398 safe_create4 (z.s[n])), \ 399 safe_create4 (tmin), safe_create4 (tmax))); \ 400 good_sub_sat.s[n] = safe_extract (safe_clamp ( \ 401 safe_sub (safe_create (x.s[n]), safe_create (y.s[n])), \ 402 safe_create (tmin), safe_create (tmax))); \ 403 good_hadd.s[n] = safe_extract (safe_rshift ( \ 404 safe_add (safe_create (x.s[n]), safe_create (y.s[n])))); \ 405 good_mad_hi.s[n] = safe_extract ( \ 406 safe_lo (safe_add (safe_hi (safe_mul (safe_create (x.s[n]), \ 407 safe_create (y.s[n]))), \ 408 safe_create (z.s[n])))); \ 409 good_mul_hi.s[n] = safe_extract (safe_hi ( \ 410 safe_mul (safe_create (x.s[n]), safe_create (y.s[n])))); \ 411 good_rhadd.s[n] = safe_extract (safe_rshift (safe_add ( \ 412 safe_add (safe_create (x.s[n]), safe_create (y.s[n])), \ 413 safe_create ((sgtype)1)))); \ 414 } \ 415 Tvec res_abs; \ 416 Tvec res_abs_diff, res_add_sat, res_mad_sat, res_sub_sat; \ 417 Tvec res_hadd, res_mad_hi, res_mul_hi, res_rhadd; \ 418 res_abs.u = abs (x.v); \ 419 res_abs_diff.u = abs_diff (x.v, y.v); \ 420 res_add_sat.v = add_sat (x.v, y.v); \ 421 res_mad_sat.v = mad_sat (x.v, y.v, z.v); \ 422 res_sub_sat.v = sub_sat (x.v, y.v); \ 423 res_hadd.v = hadd (x.v, y.v); \ 424 res_mad_hi.v = mad_hi (x.v, y.v, z.v); \ 425 res_mul_hi.v = mul_hi (x.v, y.v); \ 426 res_rhadd.v = rhadd (x.v, y.v); \ 427 bool error = false; \ 428 bool equal; \ 429 BODY; \ 430 } \ 431 } 432 433 434#define DEFINE_BODY_G_HADD(NAME, EXPR) \ 435 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 1, char, char, uchar, uchar) \ 436 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 2, char2, char, uchar2, uchar) \ 437 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 3, char3, char, uchar3, uchar) \ 438 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 4, char4, char, uchar4, uchar) \ 439 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 8, char8, char, uchar8, uchar) \ 440 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 16, char16, char, uchar16, uchar) \ 441 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 1, uchar, uchar, uchar, uchar) \ 442 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 2, uchar2, uchar, uchar2, uchar) \ 443 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 3, uchar3, uchar, uchar3, uchar) \ 444 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 4, uchar4, uchar, uchar4, uchar) \ 445 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 8, uchar8, uchar, uchar8, uchar) \ 446 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 16, uchar16, uchar, uchar16, uchar) \ 447 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 1, short, short, ushort, ushort) \ 448 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 2, short2, short, ushort2, ushort) \ 449 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 3, short3, short, ushort3, ushort) \ 450 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 4, short4, short, ushort4, ushort) \ 451 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 8, short8, short, ushort8, ushort) \ 452 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 16, short16, short, ushort16, ushort) \ 453 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 1, ushort, ushort, ushort, ushort) \ 454 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 2, ushort2, ushort, ushort2, ushort) \ 455 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 3, ushort3, ushort, ushort3, ushort) \ 456 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 4, ushort4, ushort, ushort4, ushort) \ 457 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 8, ushort8, ushort, ushort8, ushort) \ 458 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 16, ushort16, ushort, ushort16, ushort) \ 459 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 1, int, int, uint, uint) \ 460 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 2, int2, int, uint2, uint) \ 461 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 3, int3, int, uint3, uint) \ 462 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 4, int4, int, uint4, uint) \ 463 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 8, int8, int, uint8, uint) \ 464 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 16, int16, int, uint16, uint) \ 465 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 1, uint, uint, uint, uint) \ 466 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 2, uint2, uint, uint2, uint) \ 467 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 3, uint3, uint, uint3, uint) \ 468 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 4, uint4, uint, uint4, uint) \ 469 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 8, uint8, uint, uint8, uint) \ 470 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 16, uint16, uint, uint16, uint) \ 471 __IF_INT64 ( \ 472 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 1, long, long, ulong, ulong) \ 473 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 2, long2, long, ulong2, ulong) \ 474 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 3, long3, long, ulong3, ulong) \ 475 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 4, long4, long, ulong4, ulong) \ 476 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 8, long8, long, ulong8, ulong) \ 477 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 16, long16, long, ulong16, ulong) \ 478 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 1, ulong, ulong, ulong, ulong) \ 479 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 2, ulong2, ulong, ulong2, ulong) \ 480 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 3, ulong3, ulong, ulong3, ulong) \ 481 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 4, ulong4, ulong, ulong4, ulong) \ 482 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 8, ulong8, ulong, ulong8, ulong) \ 483 IMPLEMENT_BODY_G_HADD (NAME, EXPR, 16, ulong16, ulong, ulong16, ulong)) 484