/*
 *  ARM AdvSIMD / SVE Vector Operations
 *
 *  Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "crypto/clmul.h"
#include "vec_internal.h"

/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 * Entry i holds a 64-bit mask in which byte j is 0xff when bit j of i
 * is set and 0x00 otherwise, so the table is indexed by 8 predicate bits
 * and yields the matching 8-byte lane mask.  It was generated with:
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};

/*
 * Similarly for half-word elements.
133 * for (i = 0; i < 256; ++i) { 134 * unsigned long m = 0; 135 * if (i & 0xaa) { 136 * continue; 137 * } 138 * for (j = 0; j < 8; j += 2) { 139 * if ((i >> j) & 1) { 140 * m |= 0xfffful << (j << 3); 141 * } 142 * } 143 * printf("[0x%x] = 0x%016lx,\n", i, m); 144 * } 145 */ 146 const uint64_t expand_pred_h_data[0x55 + 1] = { 147 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000, 148 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000, 149 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000, 150 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000, 151 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000, 152 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000, 153 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000, 154 [0x55] = 0xffffffffffffffff, 155 }; 156 157 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */ 158 int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3, 159 bool neg, bool round) 160 { 161 /* 162 * Simplify: 163 * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8 164 * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7 165 */ 166 int32_t ret = (int32_t)src1 * src2; 167 if (neg) { 168 ret = -ret; 169 } 170 ret += ((int32_t)src3 << 7) + (round << 6); 171 ret >>= 7; 172 173 if (ret != (int8_t)ret) { 174 ret = (ret < 0 ? 
INT8_MIN : INT8_MAX); 175 } 176 return ret; 177 } 178 179 void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm, 180 void *va, uint32_t desc) 181 { 182 intptr_t i, opr_sz = simd_oprsz(desc); 183 int8_t *d = vd, *n = vn, *m = vm, *a = va; 184 185 for (i = 0; i < opr_sz; ++i) { 186 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true); 187 } 188 } 189 190 void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm, 191 void *va, uint32_t desc) 192 { 193 intptr_t i, opr_sz = simd_oprsz(desc); 194 int8_t *d = vd, *n = vn, *m = vm, *a = va; 195 196 for (i = 0; i < opr_sz; ++i) { 197 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true); 198 } 199 } 200 201 void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 202 { 203 intptr_t i, opr_sz = simd_oprsz(desc); 204 int8_t *d = vd, *n = vn, *m = vm; 205 206 for (i = 0; i < opr_sz; ++i) { 207 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false); 208 } 209 } 210 211 void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 212 { 213 intptr_t i, opr_sz = simd_oprsz(desc); 214 int8_t *d = vd, *n = vn, *m = vm; 215 216 for (i = 0; i < opr_sz; ++i) { 217 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true); 218 } 219 } 220 221 /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */ 222 int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3, 223 bool neg, bool round, uint32_t *sat) 224 { 225 /* Simplify similarly to do_sqrdmlah_b above. */ 226 int32_t ret = (int32_t)src1 * src2; 227 if (neg) { 228 ret = -ret; 229 } 230 ret += ((int32_t)src3 << 15) + (round << 14); 231 ret >>= 15; 232 233 if (ret != (int16_t)ret) { 234 *sat = 1; 235 ret = (ret < 0 ? 
INT16_MIN : INT16_MAX); 236 } 237 return ret; 238 } 239 240 uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1, 241 uint32_t src2, uint32_t src3) 242 { 243 uint32_t *sat = &env->vfp.qc[0]; 244 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat); 245 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16, 246 false, true, sat); 247 return deposit32(e1, 16, 16, e2); 248 } 249 250 void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm, 251 void *vq, uint32_t desc) 252 { 253 uintptr_t opr_sz = simd_oprsz(desc); 254 int16_t *d = vd; 255 int16_t *n = vn; 256 int16_t *m = vm; 257 uintptr_t i; 258 259 for (i = 0; i < opr_sz / 2; ++i) { 260 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq); 261 } 262 clear_tail(d, opr_sz, simd_maxsz(desc)); 263 } 264 265 uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1, 266 uint32_t src2, uint32_t src3) 267 { 268 uint32_t *sat = &env->vfp.qc[0]; 269 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat); 270 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16, 271 true, true, sat); 272 return deposit32(e1, 16, 16, e2); 273 } 274 275 void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm, 276 void *vq, uint32_t desc) 277 { 278 uintptr_t opr_sz = simd_oprsz(desc); 279 int16_t *d = vd; 280 int16_t *n = vn; 281 int16_t *m = vm; 282 uintptr_t i; 283 284 for (i = 0; i < opr_sz / 2; ++i) { 285 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq); 286 } 287 clear_tail(d, opr_sz, simd_maxsz(desc)); 288 } 289 290 void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm, 291 void *vq, uint32_t desc) 292 { 293 intptr_t i, opr_sz = simd_oprsz(desc); 294 int16_t *d = vd, *n = vn, *m = vm; 295 296 for (i = 0; i < opr_sz / 2; ++i) { 297 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq); 298 } 299 clear_tail(d, opr_sz, simd_maxsz(desc)); 300 } 301 302 void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm, 303 void *vq, uint32_t desc) 304 { 305 intptr_t i, opr_sz = 
simd_oprsz(desc); 306 int16_t *d = vd, *n = vn, *m = vm; 307 308 for (i = 0; i < opr_sz / 2; ++i) { 309 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq); 310 } 311 clear_tail(d, opr_sz, simd_maxsz(desc)); 312 } 313 314 void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm, 315 void *va, uint32_t desc) 316 { 317 intptr_t i, opr_sz = simd_oprsz(desc); 318 int16_t *d = vd, *n = vn, *m = vm, *a = va; 319 uint32_t discard; 320 321 for (i = 0; i < opr_sz / 2; ++i) { 322 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard); 323 } 324 } 325 326 void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm, 327 void *va, uint32_t desc) 328 { 329 intptr_t i, opr_sz = simd_oprsz(desc); 330 int16_t *d = vd, *n = vn, *m = vm, *a = va; 331 uint32_t discard; 332 333 for (i = 0; i < opr_sz / 2; ++i) { 334 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard); 335 } 336 } 337 338 void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 339 { 340 intptr_t i, opr_sz = simd_oprsz(desc); 341 int16_t *d = vd, *n = vn, *m = vm; 342 uint32_t discard; 343 344 for (i = 0; i < opr_sz / 2; ++i) { 345 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard); 346 } 347 } 348 349 void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 350 { 351 intptr_t i, opr_sz = simd_oprsz(desc); 352 int16_t *d = vd, *n = vn, *m = vm; 353 uint32_t discard; 354 355 for (i = 0; i < opr_sz / 2; ++i) { 356 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard); 357 } 358 } 359 360 void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) 361 { 362 intptr_t i, j, opr_sz = simd_oprsz(desc); 363 int idx = simd_data(desc); 364 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 365 uint32_t discard; 366 367 for (i = 0; i < opr_sz / 2; i += 16 / 2) { 368 int16_t mm = m[i]; 369 for (j = 0; j < 16 / 2; ++j) { 370 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard); 371 } 372 } 373 } 374 375 void 
HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) 376 { 377 intptr_t i, j, opr_sz = simd_oprsz(desc); 378 int idx = simd_data(desc); 379 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 380 uint32_t discard; 381 382 for (i = 0; i < opr_sz / 2; i += 16 / 2) { 383 int16_t mm = m[i]; 384 for (j = 0; j < 16 / 2; ++j) { 385 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard); 386 } 387 } 388 } 389 390 /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */ 391 int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3, 392 bool neg, bool round, uint32_t *sat) 393 { 394 /* Simplify similarly to do_sqrdmlah_b above. */ 395 int64_t ret = (int64_t)src1 * src2; 396 if (neg) { 397 ret = -ret; 398 } 399 ret += ((int64_t)src3 << 31) + (round << 30); 400 ret >>= 31; 401 402 if (ret != (int32_t)ret) { 403 *sat = 1; 404 ret = (ret < 0 ? INT32_MIN : INT32_MAX); 405 } 406 return ret; 407 } 408 409 uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1, 410 int32_t src2, int32_t src3) 411 { 412 uint32_t *sat = &env->vfp.qc[0]; 413 return do_sqrdmlah_s(src1, src2, src3, false, true, sat); 414 } 415 416 void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm, 417 void *vq, uint32_t desc) 418 { 419 uintptr_t opr_sz = simd_oprsz(desc); 420 int32_t *d = vd; 421 int32_t *n = vn; 422 int32_t *m = vm; 423 uintptr_t i; 424 425 for (i = 0; i < opr_sz / 4; ++i) { 426 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq); 427 } 428 clear_tail(d, opr_sz, simd_maxsz(desc)); 429 } 430 431 uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1, 432 int32_t src2, int32_t src3) 433 { 434 uint32_t *sat = &env->vfp.qc[0]; 435 return do_sqrdmlah_s(src1, src2, src3, true, true, sat); 436 } 437 438 void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm, 439 void *vq, uint32_t desc) 440 { 441 uintptr_t opr_sz = simd_oprsz(desc); 442 int32_t *d = vd; 443 int32_t *n = vn; 444 int32_t *m = vm; 445 uintptr_t i; 
446 447 for (i = 0; i < opr_sz / 4; ++i) { 448 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq); 449 } 450 clear_tail(d, opr_sz, simd_maxsz(desc)); 451 } 452 453 void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm, 454 void *vq, uint32_t desc) 455 { 456 intptr_t i, opr_sz = simd_oprsz(desc); 457 int32_t *d = vd, *n = vn, *m = vm; 458 459 for (i = 0; i < opr_sz / 4; ++i) { 460 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq); 461 } 462 clear_tail(d, opr_sz, simd_maxsz(desc)); 463 } 464 465 void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm, 466 void *vq, uint32_t desc) 467 { 468 intptr_t i, opr_sz = simd_oprsz(desc); 469 int32_t *d = vd, *n = vn, *m = vm; 470 471 for (i = 0; i < opr_sz / 4; ++i) { 472 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq); 473 } 474 clear_tail(d, opr_sz, simd_maxsz(desc)); 475 } 476 477 void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm, 478 void *va, uint32_t desc) 479 { 480 intptr_t i, opr_sz = simd_oprsz(desc); 481 int32_t *d = vd, *n = vn, *m = vm, *a = va; 482 uint32_t discard; 483 484 for (i = 0; i < opr_sz / 4; ++i) { 485 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard); 486 } 487 } 488 489 void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm, 490 void *va, uint32_t desc) 491 { 492 intptr_t i, opr_sz = simd_oprsz(desc); 493 int32_t *d = vd, *n = vn, *m = vm, *a = va; 494 uint32_t discard; 495 496 for (i = 0; i < opr_sz / 4; ++i) { 497 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard); 498 } 499 } 500 501 void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 502 { 503 intptr_t i, opr_sz = simd_oprsz(desc); 504 int32_t *d = vd, *n = vn, *m = vm; 505 uint32_t discard; 506 507 for (i = 0; i < opr_sz / 4; ++i) { 508 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard); 509 } 510 } 511 512 void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 513 { 514 intptr_t i, opr_sz = simd_oprsz(desc); 515 int32_t *d = vd, *n = vn, *m = vm; 
516 uint32_t discard; 517 518 for (i = 0; i < opr_sz / 4; ++i) { 519 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard); 520 } 521 } 522 523 void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc) 524 { 525 intptr_t i, j, opr_sz = simd_oprsz(desc); 526 int idx = simd_data(desc); 527 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 528 uint32_t discard; 529 530 for (i = 0; i < opr_sz / 4; i += 16 / 4) { 531 int32_t mm = m[i]; 532 for (j = 0; j < 16 / 4; ++j) { 533 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard); 534 } 535 } 536 } 537 538 void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc) 539 { 540 intptr_t i, j, opr_sz = simd_oprsz(desc); 541 int idx = simd_data(desc); 542 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 543 uint32_t discard; 544 545 for (i = 0; i < opr_sz / 4; i += 16 / 4) { 546 int32_t mm = m[i]; 547 for (j = 0; j < 16 / 4; ++j) { 548 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard); 549 } 550 } 551 } 552 553 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */ 554 static int64_t do_sat128_d(Int128 r) 555 { 556 int64_t ls = int128_getlo(r); 557 int64_t hs = int128_gethi(r); 558 559 if (unlikely(hs != (ls >> 63))) { 560 return hs < 0 ? INT64_MIN : INT64_MAX; 561 } 562 return ls; 563 } 564 565 int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round) 566 { 567 uint64_t l, h; 568 Int128 r, t; 569 570 /* As in do_sqrdmlah_b, but with 128-bit arithmetic. 
*/ 571 muls64(&l, &h, m, n); 572 r = int128_make128(l, h); 573 if (neg) { 574 r = int128_neg(r); 575 } 576 if (a) { 577 t = int128_exts64(a); 578 t = int128_lshift(t, 63); 579 r = int128_add(r, t); 580 } 581 if (round) { 582 t = int128_exts64(1ll << 62); 583 r = int128_add(r, t); 584 } 585 r = int128_rshift(r, 63); 586 587 return do_sat128_d(r); 588 } 589 590 void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm, 591 void *va, uint32_t desc) 592 { 593 intptr_t i, opr_sz = simd_oprsz(desc); 594 int64_t *d = vd, *n = vn, *m = vm, *a = va; 595 596 for (i = 0; i < opr_sz / 8; ++i) { 597 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true); 598 } 599 } 600 601 void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm, 602 void *va, uint32_t desc) 603 { 604 intptr_t i, opr_sz = simd_oprsz(desc); 605 int64_t *d = vd, *n = vn, *m = vm, *a = va; 606 607 for (i = 0; i < opr_sz / 8; ++i) { 608 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true); 609 } 610 } 611 612 void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 613 { 614 intptr_t i, opr_sz = simd_oprsz(desc); 615 int64_t *d = vd, *n = vn, *m = vm; 616 617 for (i = 0; i < opr_sz / 8; ++i) { 618 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false); 619 } 620 } 621 622 void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 623 { 624 intptr_t i, opr_sz = simd_oprsz(desc); 625 int64_t *d = vd, *n = vn, *m = vm; 626 627 for (i = 0; i < opr_sz / 8; ++i) { 628 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true); 629 } 630 } 631 632 void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc) 633 { 634 intptr_t i, j, opr_sz = simd_oprsz(desc); 635 int idx = simd_data(desc); 636 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx; 637 638 for (i = 0; i < opr_sz / 8; i += 16 / 8) { 639 int64_t mm = m[i]; 640 for (j = 0; j < 16 / 8; ++j) { 641 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false); 642 } 643 } 644 } 645 646 void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void 
*vm, uint32_t desc) 647 { 648 intptr_t i, j, opr_sz = simd_oprsz(desc); 649 int idx = simd_data(desc); 650 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx; 651 652 for (i = 0; i < opr_sz / 8; i += 16 / 8) { 653 int64_t mm = m[i]; 654 for (j = 0; j < 16 / 8; ++j) { 655 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true); 656 } 657 } 658 } 659 660 /* Integer 8 and 16-bit dot-product. 661 * 662 * Note that for the loops herein, host endianness does not matter 663 * with respect to the ordering of data within the quad-width lanes. 664 * All elements are treated equally, no matter where they are. 665 */ 666 667 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \ 668 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 669 { \ 670 intptr_t i, opr_sz = simd_oprsz(desc); \ 671 TYPED *d = vd, *a = va; \ 672 TYPEN *n = vn; \ 673 TYPEM *m = vm; \ 674 for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \ 675 d[i] = (a[i] + \ 676 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \ 677 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \ 678 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \ 679 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \ 680 } \ 681 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 682 } 683 684 DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t) 685 DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t) 686 DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t) 687 DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t) 688 DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t) 689 690 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \ 691 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 692 { \ 693 intptr_t i = 0, opr_sz = simd_oprsz(desc); \ 694 intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \ 695 intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \ 696 intptr_t index = simd_data(desc); \ 697 TYPED *d = vd, *a = va; \ 698 TYPEN *n = vn; \ 699 TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \ 700 do { \ 701 TYPED m0 = m_indexed[i * 4 + 0]; \ 702 TYPED m1 = m_indexed[i * 4 + 1]; \ 703 TYPED m2 = 
m_indexed[i * 4 + 2]; \ 704 TYPED m3 = m_indexed[i * 4 + 3]; \ 705 do { \ 706 d[i] = (a[i] + \ 707 n[i * 4 + 0] * m0 + \ 708 n[i * 4 + 1] * m1 + \ 709 n[i * 4 + 2] * m2 + \ 710 n[i * 4 + 3] * m3); \ 711 } while (++i < segend); \ 712 segend = i + 4; \ 713 } while (i < opr_sz_n); \ 714 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 715 } 716 717 DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4) 718 DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4) 719 DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4) 720 DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4) 721 DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8) 722 DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8) 723 724 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm, 725 void *vfpst, uint32_t desc) 726 { 727 uintptr_t opr_sz = simd_oprsz(desc); 728 float16 *d = vd; 729 float16 *n = vn; 730 float16 *m = vm; 731 float_status *fpst = vfpst; 732 uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1); 733 uint32_t neg_imag = neg_real ^ 1; 734 uintptr_t i; 735 736 /* Shift boolean to the sign bit so we can xor to negate. */ 737 neg_real <<= 15; 738 neg_imag <<= 15; 739 740 for (i = 0; i < opr_sz / 2; i += 2) { 741 float16 e0 = n[H2(i)]; 742 float16 e1 = m[H2(i + 1)] ^ neg_imag; 743 float16 e2 = n[H2(i + 1)]; 744 float16 e3 = m[H2(i)] ^ neg_real; 745 746 d[H2(i)] = float16_add(e0, e1, fpst); 747 d[H2(i + 1)] = float16_add(e2, e3, fpst); 748 } 749 clear_tail(d, opr_sz, simd_maxsz(desc)); 750 } 751 752 void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm, 753 void *vfpst, uint32_t desc) 754 { 755 uintptr_t opr_sz = simd_oprsz(desc); 756 float32 *d = vd; 757 float32 *n = vn; 758 float32 *m = vm; 759 float_status *fpst = vfpst; 760 uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1); 761 uint32_t neg_imag = neg_real ^ 1; 762 uintptr_t i; 763 764 /* Shift boolean to the sign bit so we can xor to negate. 
*/ 765 neg_real <<= 31; 766 neg_imag <<= 31; 767 768 for (i = 0; i < opr_sz / 4; i += 2) { 769 float32 e0 = n[H4(i)]; 770 float32 e1 = m[H4(i + 1)] ^ neg_imag; 771 float32 e2 = n[H4(i + 1)]; 772 float32 e3 = m[H4(i)] ^ neg_real; 773 774 d[H4(i)] = float32_add(e0, e1, fpst); 775 d[H4(i + 1)] = float32_add(e2, e3, fpst); 776 } 777 clear_tail(d, opr_sz, simd_maxsz(desc)); 778 } 779 780 void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm, 781 void *vfpst, uint32_t desc) 782 { 783 uintptr_t opr_sz = simd_oprsz(desc); 784 float64 *d = vd; 785 float64 *n = vn; 786 float64 *m = vm; 787 float_status *fpst = vfpst; 788 uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1); 789 uint64_t neg_imag = neg_real ^ 1; 790 uintptr_t i; 791 792 /* Shift boolean to the sign bit so we can xor to negate. */ 793 neg_real <<= 63; 794 neg_imag <<= 63; 795 796 for (i = 0; i < opr_sz / 8; i += 2) { 797 float64 e0 = n[i]; 798 float64 e1 = m[i + 1] ^ neg_imag; 799 float64 e2 = n[i + 1]; 800 float64 e3 = m[i] ^ neg_real; 801 802 d[i] = float64_add(e0, e1, fpst); 803 d[i + 1] = float64_add(e2, e3, fpst); 804 } 805 clear_tail(d, opr_sz, simd_maxsz(desc)); 806 } 807 808 void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va, 809 void *vfpst, uint32_t desc) 810 { 811 uintptr_t opr_sz = simd_oprsz(desc); 812 float16 *d = vd, *n = vn, *m = vm, *a = va; 813 float_status *fpst = vfpst; 814 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 815 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 816 uint32_t neg_real = flip ^ neg_imag; 817 uintptr_t i; 818 819 /* Shift boolean to the sign bit so we can xor to negate. 
*/ 820 neg_real <<= 15; 821 neg_imag <<= 15; 822 823 for (i = 0; i < opr_sz / 2; i += 2) { 824 float16 e2 = n[H2(i + flip)]; 825 float16 e1 = m[H2(i + flip)] ^ neg_real; 826 float16 e4 = e2; 827 float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag; 828 829 d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst); 830 d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst); 831 } 832 clear_tail(d, opr_sz, simd_maxsz(desc)); 833 } 834 835 void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va, 836 void *vfpst, uint32_t desc) 837 { 838 uintptr_t opr_sz = simd_oprsz(desc); 839 float16 *d = vd, *n = vn, *m = vm, *a = va; 840 float_status *fpst = vfpst; 841 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 842 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 843 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); 844 uint32_t neg_real = flip ^ neg_imag; 845 intptr_t elements = opr_sz / sizeof(float16); 846 intptr_t eltspersegment = 16 / sizeof(float16); 847 intptr_t i, j; 848 849 /* Shift boolean to the sign bit so we can xor to negate. */ 850 neg_real <<= 15; 851 neg_imag <<= 15; 852 853 for (i = 0; i < elements; i += eltspersegment) { 854 float16 mr = m[H2(i + 2 * index + 0)]; 855 float16 mi = m[H2(i + 2 * index + 1)]; 856 float16 e1 = neg_real ^ (flip ? mi : mr); 857 float16 e3 = neg_imag ^ (flip ? 
mr : mi); 858 859 for (j = i; j < i + eltspersegment; j += 2) { 860 float16 e2 = n[H2(j + flip)]; 861 float16 e4 = e2; 862 863 d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst); 864 d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst); 865 } 866 } 867 clear_tail(d, opr_sz, simd_maxsz(desc)); 868 } 869 870 void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va, 871 void *vfpst, uint32_t desc) 872 { 873 uintptr_t opr_sz = simd_oprsz(desc); 874 float32 *d = vd, *n = vn, *m = vm, *a = va; 875 float_status *fpst = vfpst; 876 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 877 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 878 uint32_t neg_real = flip ^ neg_imag; 879 uintptr_t i; 880 881 /* Shift boolean to the sign bit so we can xor to negate. */ 882 neg_real <<= 31; 883 neg_imag <<= 31; 884 885 for (i = 0; i < opr_sz / 4; i += 2) { 886 float32 e2 = n[H4(i + flip)]; 887 float32 e1 = m[H4(i + flip)] ^ neg_real; 888 float32 e4 = e2; 889 float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag; 890 891 d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst); 892 d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst); 893 } 894 clear_tail(d, opr_sz, simd_maxsz(desc)); 895 } 896 897 void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va, 898 void *vfpst, uint32_t desc) 899 { 900 uintptr_t opr_sz = simd_oprsz(desc); 901 float32 *d = vd, *n = vn, *m = vm, *a = va; 902 float_status *fpst = vfpst; 903 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 904 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 905 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); 906 uint32_t neg_real = flip ^ neg_imag; 907 intptr_t elements = opr_sz / sizeof(float32); 908 intptr_t eltspersegment = 16 / sizeof(float32); 909 intptr_t i, j; 910 911 /* Shift boolean to the sign bit so we can xor to negate. 
*/ 912 neg_real <<= 31; 913 neg_imag <<= 31; 914 915 for (i = 0; i < elements; i += eltspersegment) { 916 float32 mr = m[H4(i + 2 * index + 0)]; 917 float32 mi = m[H4(i + 2 * index + 1)]; 918 float32 e1 = neg_real ^ (flip ? mi : mr); 919 float32 e3 = neg_imag ^ (flip ? mr : mi); 920 921 for (j = i; j < i + eltspersegment; j += 2) { 922 float32 e2 = n[H4(j + flip)]; 923 float32 e4 = e2; 924 925 d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst); 926 d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst); 927 } 928 } 929 clear_tail(d, opr_sz, simd_maxsz(desc)); 930 } 931 932 void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va, 933 void *vfpst, uint32_t desc) 934 { 935 uintptr_t opr_sz = simd_oprsz(desc); 936 float64 *d = vd, *n = vn, *m = vm, *a = va; 937 float_status *fpst = vfpst; 938 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 939 uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 940 uint64_t neg_real = flip ^ neg_imag; 941 uintptr_t i; 942 943 /* Shift boolean to the sign bit so we can xor to negate. */ 944 neg_real <<= 63; 945 neg_imag <<= 63; 946 947 for (i = 0; i < opr_sz / 8; i += 2) { 948 float64 e2 = n[i + flip]; 949 float64 e1 = m[i + flip] ^ neg_real; 950 float64 e4 = e2; 951 float64 e3 = m[i + 1 - flip] ^ neg_imag; 952 953 d[i] = float64_muladd(e2, e1, a[i], 0, fpst); 954 d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst); 955 } 956 clear_tail(d, opr_sz, simd_maxsz(desc)); 957 } 958 959 /* 960 * Floating point comparisons producing an integer result (all 1s or all 0s). 961 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do. 962 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires. 
963 */ 964 static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat) 965 { 966 return -float16_eq_quiet(op1, op2, stat); 967 } 968 969 static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat) 970 { 971 return -float32_eq_quiet(op1, op2, stat); 972 } 973 974 static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat) 975 { 976 return -float16_le(op2, op1, stat); 977 } 978 979 static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat) 980 { 981 return -float32_le(op2, op1, stat); 982 } 983 984 static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat) 985 { 986 return -float16_lt(op2, op1, stat); 987 } 988 989 static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat) 990 { 991 return -float32_lt(op2, op1, stat); 992 } 993 994 static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat) 995 { 996 return -float16_le(float16_abs(op2), float16_abs(op1), stat); 997 } 998 999 static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat) 1000 { 1001 return -float32_le(float32_abs(op2), float32_abs(op1), stat); 1002 } 1003 1004 static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat) 1005 { 1006 return -float16_lt(float16_abs(op2), float16_abs(op1), stat); 1007 } 1008 1009 static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat) 1010 { 1011 return -float32_lt(float32_abs(op2), float32_abs(op1), stat); 1012 } 1013 1014 static int16_t vfp_tosszh(float16 x, void *fpstp) 1015 { 1016 float_status *fpst = fpstp; 1017 if (float16_is_any_nan(x)) { 1018 float_raise(float_flag_invalid, fpst); 1019 return 0; 1020 } 1021 return float16_to_int16_round_to_zero(x, fpst); 1022 } 1023 1024 static uint16_t vfp_touszh(float16 x, void *fpstp) 1025 { 1026 float_status *fpst = fpstp; 1027 if (float16_is_any_nan(x)) { 1028 float_raise(float_flag_invalid, fpst); 1029 return 0; 1030 } 1031 return float16_to_uint16_round_to_zero(x, fpst); 1032 } 
1033 1034 #define DO_2OP(NAME, FUNC, TYPE) \ 1035 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 1036 { \ 1037 intptr_t i, oprsz = simd_oprsz(desc); \ 1038 TYPE *d = vd, *n = vn; \ 1039 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1040 d[i] = FUNC(n[i], stat); \ 1041 } \ 1042 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1043 } 1044 1045 DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16) 1046 DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32) 1047 DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64) 1048 1049 DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16) 1050 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32) 1051 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64) 1052 1053 DO_2OP(gvec_vrintx_h, float16_round_to_int, float16) 1054 DO_2OP(gvec_vrintx_s, float32_round_to_int, float32) 1055 1056 DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t) 1057 DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t) 1058 DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32) 1059 DO_2OP(gvec_touizs, helper_vfp_touizs, float32) 1060 DO_2OP(gvec_sstoh, int16_to_float16, int16_t) 1061 DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t) 1062 DO_2OP(gvec_tosszh, vfp_tosszh, float16) 1063 DO_2OP(gvec_touszh, vfp_touszh, float16) 1064 1065 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \ 1066 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \ 1067 { \ 1068 return TYPE##_##CMPOP(op, TYPE##_zero, stat); \ 1069 } 1070 1071 #define WRAP_CMP0_REV(FN, CMPOP, TYPE) \ 1072 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \ 1073 { \ 1074 return TYPE##_##CMPOP(TYPE##_zero, op, stat); \ 1075 } 1076 1077 #define DO_2OP_CMP0(FN, CMPOP, DIRN) \ 1078 WRAP_CMP0_##DIRN(FN, CMPOP, float16) \ 1079 WRAP_CMP0_##DIRN(FN, CMPOP, float32) \ 1080 DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \ 1081 DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32) 1082 1083 DO_2OP_CMP0(cgt, cgt, FWD) 1084 DO_2OP_CMP0(cge, cge, FWD) 1085 DO_2OP_CMP0(ceq, ceq, FWD) 1086 DO_2OP_CMP0(clt, cgt, REV) 1087 DO_2OP_CMP0(cle, cge, REV) 
1088 1089 #undef DO_2OP 1090 #undef DO_2OP_CMP0 1091 1092 /* Floating-point trigonometric starting value. 1093 * See the ARM ARM pseudocode function FPTrigSMul. 1094 */ 1095 static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat) 1096 { 1097 float16 result = float16_mul(op1, op1, stat); 1098 if (!float16_is_any_nan(result)) { 1099 result = float16_set_sign(result, op2 & 1); 1100 } 1101 return result; 1102 } 1103 1104 static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat) 1105 { 1106 float32 result = float32_mul(op1, op1, stat); 1107 if (!float32_is_any_nan(result)) { 1108 result = float32_set_sign(result, op2 & 1); 1109 } 1110 return result; 1111 } 1112 1113 static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat) 1114 { 1115 float64 result = float64_mul(op1, op1, stat); 1116 if (!float64_is_any_nan(result)) { 1117 result = float64_set_sign(result, op2 & 1); 1118 } 1119 return result; 1120 } 1121 1122 static float16 float16_abd(float16 op1, float16 op2, float_status *stat) 1123 { 1124 return float16_abs(float16_sub(op1, op2, stat)); 1125 } 1126 1127 static float32 float32_abd(float32 op1, float32 op2, float_status *stat) 1128 { 1129 return float32_abs(float32_sub(op1, op2, stat)); 1130 } 1131 1132 /* 1133 * Reciprocal step. These are the AArch32 version which uses a 1134 * non-fused multiply-and-subtract. 
1135 */ 1136 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat) 1137 { 1138 op1 = float16_squash_input_denormal(op1, stat); 1139 op2 = float16_squash_input_denormal(op2, stat); 1140 1141 if ((float16_is_infinity(op1) && float16_is_zero(op2)) || 1142 (float16_is_infinity(op2) && float16_is_zero(op1))) { 1143 return float16_two; 1144 } 1145 return float16_sub(float16_two, float16_mul(op1, op2, stat), stat); 1146 } 1147 1148 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat) 1149 { 1150 op1 = float32_squash_input_denormal(op1, stat); 1151 op2 = float32_squash_input_denormal(op2, stat); 1152 1153 if ((float32_is_infinity(op1) && float32_is_zero(op2)) || 1154 (float32_is_infinity(op2) && float32_is_zero(op1))) { 1155 return float32_two; 1156 } 1157 return float32_sub(float32_two, float32_mul(op1, op2, stat), stat); 1158 } 1159 1160 /* Reciprocal square-root step. AArch32 non-fused semantics. */ 1161 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat) 1162 { 1163 op1 = float16_squash_input_denormal(op1, stat); 1164 op2 = float16_squash_input_denormal(op2, stat); 1165 1166 if ((float16_is_infinity(op1) && float16_is_zero(op2)) || 1167 (float16_is_infinity(op2) && float16_is_zero(op1))) { 1168 return float16_one_point_five; 1169 } 1170 op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat); 1171 return float16_div(op1, float16_two, stat); 1172 } 1173 1174 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat) 1175 { 1176 op1 = float32_squash_input_denormal(op1, stat); 1177 op2 = float32_squash_input_denormal(op2, stat); 1178 1179 if ((float32_is_infinity(op1) && float32_is_zero(op2)) || 1180 (float32_is_infinity(op2) && float32_is_zero(op1))) { 1181 return float32_one_point_five; 1182 } 1183 op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat); 1184 return float32_div(op1, float32_two, stat); 1185 } 1186 1187 #define DO_3OP(NAME, FUNC, TYPE) 
\ 1188 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ 1189 { \ 1190 intptr_t i, oprsz = simd_oprsz(desc); \ 1191 TYPE *d = vd, *n = vn, *m = vm; \ 1192 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1193 d[i] = FUNC(n[i], m[i], stat); \ 1194 } \ 1195 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1196 } 1197 1198 DO_3OP(gvec_fadd_h, float16_add, float16) 1199 DO_3OP(gvec_fadd_s, float32_add, float32) 1200 DO_3OP(gvec_fadd_d, float64_add, float64) 1201 1202 DO_3OP(gvec_fsub_h, float16_sub, float16) 1203 DO_3OP(gvec_fsub_s, float32_sub, float32) 1204 DO_3OP(gvec_fsub_d, float64_sub, float64) 1205 1206 DO_3OP(gvec_fmul_h, float16_mul, float16) 1207 DO_3OP(gvec_fmul_s, float32_mul, float32) 1208 DO_3OP(gvec_fmul_d, float64_mul, float64) 1209 1210 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16) 1211 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32) 1212 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64) 1213 1214 DO_3OP(gvec_fabd_h, float16_abd, float16) 1215 DO_3OP(gvec_fabd_s, float32_abd, float32) 1216 1217 DO_3OP(gvec_fceq_h, float16_ceq, float16) 1218 DO_3OP(gvec_fceq_s, float32_ceq, float32) 1219 1220 DO_3OP(gvec_fcge_h, float16_cge, float16) 1221 DO_3OP(gvec_fcge_s, float32_cge, float32) 1222 1223 DO_3OP(gvec_fcgt_h, float16_cgt, float16) 1224 DO_3OP(gvec_fcgt_s, float32_cgt, float32) 1225 1226 DO_3OP(gvec_facge_h, float16_acge, float16) 1227 DO_3OP(gvec_facge_s, float32_acge, float32) 1228 1229 DO_3OP(gvec_facgt_h, float16_acgt, float16) 1230 DO_3OP(gvec_facgt_s, float32_acgt, float32) 1231 1232 DO_3OP(gvec_fmax_h, float16_max, float16) 1233 DO_3OP(gvec_fmax_s, float32_max, float32) 1234 1235 DO_3OP(gvec_fmin_h, float16_min, float16) 1236 DO_3OP(gvec_fmin_s, float32_min, float32) 1237 1238 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16) 1239 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32) 1240 1241 DO_3OP(gvec_fminnum_h, float16_minnum, float16) 1242 DO_3OP(gvec_fminnum_s, float32_minnum, float32) 1243 1244 DO_3OP(gvec_recps_nf_h, 
float16_recps_nf, float16) 1245 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32) 1246 1247 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16) 1248 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32) 1249 1250 #ifdef TARGET_AARCH64 1251 1252 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16) 1253 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32) 1254 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64) 1255 1256 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16) 1257 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32) 1258 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64) 1259 1260 #endif 1261 #undef DO_3OP 1262 1263 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */ 1264 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2, 1265 float_status *stat) 1266 { 1267 return float16_add(dest, float16_mul(op1, op2, stat), stat); 1268 } 1269 1270 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2, 1271 float_status *stat) 1272 { 1273 return float32_add(dest, float32_mul(op1, op2, stat), stat); 1274 } 1275 1276 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2, 1277 float_status *stat) 1278 { 1279 return float16_sub(dest, float16_mul(op1, op2, stat), stat); 1280 } 1281 1282 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2, 1283 float_status *stat) 1284 { 1285 return float32_sub(dest, float32_mul(op1, op2, stat), stat); 1286 } 1287 1288 /* Fused versions; these have the semantics Neon VFMA/VFMS want */ 1289 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2, 1290 float_status *stat) 1291 { 1292 return float16_muladd(op1, op2, dest, 0, stat); 1293 } 1294 1295 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2, 1296 float_status *stat) 1297 { 1298 return float32_muladd(op1, op2, dest, 0, stat); 1299 } 1300 1301 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2, 1302 float_status *stat) 1303 { 1304 return 
float16_muladd(float16_chs(op1), op2, dest, 0, stat); 1305 } 1306 1307 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2, 1308 float_status *stat) 1309 { 1310 return float32_muladd(float32_chs(op1), op2, dest, 0, stat); 1311 } 1312 1313 #define DO_MULADD(NAME, FUNC, TYPE) \ 1314 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ 1315 { \ 1316 intptr_t i, oprsz = simd_oprsz(desc); \ 1317 TYPE *d = vd, *n = vn, *m = vm; \ 1318 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1319 d[i] = FUNC(d[i], n[i], m[i], stat); \ 1320 } \ 1321 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1322 } 1323 1324 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16) 1325 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32) 1326 1327 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16) 1328 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32) 1329 1330 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16) 1331 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32) 1332 1333 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16) 1334 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32) 1335 1336 /* For the indexed ops, SVE applies the index per 128-bit vector segment. 1337 * For AdvSIMD, there is of course only one such vector segment. 
1338 */ 1339 1340 #define DO_MUL_IDX(NAME, TYPE, H) \ 1341 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1342 { \ 1343 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1344 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1345 intptr_t idx = simd_data(desc); \ 1346 TYPE *d = vd, *n = vn, *m = vm; \ 1347 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1348 TYPE mm = m[H(i + idx)]; \ 1349 for (j = 0; j < segment; j++) { \ 1350 d[i + j] = n[i + j] * mm; \ 1351 } \ 1352 } \ 1353 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1354 } 1355 1356 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2) 1357 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4) 1358 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8) 1359 1360 #undef DO_MUL_IDX 1361 1362 #define DO_MLA_IDX(NAME, TYPE, OP, H) \ 1363 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1364 { \ 1365 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1366 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1367 intptr_t idx = simd_data(desc); \ 1368 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1369 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1370 TYPE mm = m[H(i + idx)]; \ 1371 for (j = 0; j < segment; j++) { \ 1372 d[i + j] = a[i + j] OP n[i + j] * mm; \ 1373 } \ 1374 } \ 1375 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1376 } 1377 1378 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2) 1379 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4) 1380 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8) 1381 1382 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2) 1383 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4) 1384 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8) 1385 1386 #undef DO_MLA_IDX 1387 1388 #define DO_FMUL_IDX(NAME, ADD, TYPE, H) \ 1389 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ 1390 { \ 1391 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1392 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1393 intptr_t idx = simd_data(desc); \ 1394 TYPE *d = vd, *n = vn, *m = vm; \ 1395 for (i = 0; i < oprsz / 
sizeof(TYPE); i += segment) { \ 1396 TYPE mm = m[H(i + idx)]; \ 1397 for (j = 0; j < segment; j++) { \ 1398 d[i + j] = TYPE##_##ADD(d[i + j], \ 1399 TYPE##_mul(n[i + j], mm, stat), stat); \ 1400 } \ 1401 } \ 1402 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1403 } 1404 1405 #define float16_nop(N, M, S) (M) 1406 #define float32_nop(N, M, S) (M) 1407 #define float64_nop(N, M, S) (M) 1408 1409 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16, H2) 1410 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32, H4) 1411 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, H8) 1412 1413 /* 1414 * Non-fused multiply-accumulate operations, for Neon. NB that unlike 1415 * the fused ops below they assume accumulate both from and into Vd. 1416 */ 1417 DO_FMUL_IDX(gvec_fmla_nf_idx_h, add, float16, H2) 1418 DO_FMUL_IDX(gvec_fmla_nf_idx_s, add, float32, H4) 1419 DO_FMUL_IDX(gvec_fmls_nf_idx_h, sub, float16, H2) 1420 DO_FMUL_IDX(gvec_fmls_nf_idx_s, sub, float32, H4) 1421 1422 #undef float16_nop 1423 #undef float32_nop 1424 #undef float64_nop 1425 #undef DO_FMUL_IDX 1426 1427 #define DO_FMLA_IDX(NAME, TYPE, H) \ 1428 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \ 1429 void *stat, uint32_t desc) \ 1430 { \ 1431 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1432 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1433 TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \ 1434 intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \ 1435 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1436 op1_neg <<= (8 * sizeof(TYPE) - 1); \ 1437 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1438 TYPE mm = m[H(i + idx)]; \ 1439 for (j = 0; j < segment; j++) { \ 1440 d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \ 1441 mm, a[i + j], 0, stat); \ 1442 } \ 1443 } \ 1444 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1445 } 1446 1447 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2) 1448 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4) 1449 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8) 1450 1451 #undef DO_FMLA_IDX 1452 1453 #define DO_SAT(NAME, WTYPE, 
TYPEN, TYPEM, OP, MIN, MAX) \ 1454 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \ 1455 { \ 1456 intptr_t i, oprsz = simd_oprsz(desc); \ 1457 TYPEN *d = vd, *n = vn; TYPEM *m = vm; \ 1458 bool q = false; \ 1459 for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \ 1460 WTYPE dd = (WTYPE)n[i] OP m[i]; \ 1461 if (dd < MIN) { \ 1462 dd = MIN; \ 1463 q = true; \ 1464 } else if (dd > MAX) { \ 1465 dd = MAX; \ 1466 q = true; \ 1467 } \ 1468 d[i] = dd; \ 1469 } \ 1470 if (q) { \ 1471 uint32_t *qc = vq; \ 1472 qc[0] = 1; \ 1473 } \ 1474 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1475 } 1476 1477 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX) 1478 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX) 1479 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX) 1480 1481 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX) 1482 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX) 1483 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX) 1484 1485 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX) 1486 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX) 1487 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX) 1488 1489 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX) 1490 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX) 1491 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX) 1492 1493 #undef DO_SAT 1494 1495 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn, 1496 void *vm, uint32_t desc) 1497 { 1498 intptr_t i, oprsz = simd_oprsz(desc); 1499 uint64_t *d = vd, *n = vn, *m = vm; 1500 bool q = false; 1501 1502 for (i = 0; i < oprsz / 8; i++) { 1503 uint64_t nn = n[i], mm = m[i], dd = nn + mm; 1504 if (dd < nn) { 1505 dd = UINT64_MAX; 1506 q = true; 1507 } 1508 d[i] = dd; 1509 } 1510 if (q) { 1511 uint32_t *qc = vq; 1512 qc[0] = 1; 1513 } 1514 clear_tail(d, oprsz, 
simd_maxsz(desc)); 1515 } 1516 1517 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn, 1518 void *vm, uint32_t desc) 1519 { 1520 intptr_t i, oprsz = simd_oprsz(desc); 1521 uint64_t *d = vd, *n = vn, *m = vm; 1522 bool q = false; 1523 1524 for (i = 0; i < oprsz / 8; i++) { 1525 uint64_t nn = n[i], mm = m[i], dd = nn - mm; 1526 if (nn < mm) { 1527 dd = 0; 1528 q = true; 1529 } 1530 d[i] = dd; 1531 } 1532 if (q) { 1533 uint32_t *qc = vq; 1534 qc[0] = 1; 1535 } 1536 clear_tail(d, oprsz, simd_maxsz(desc)); 1537 } 1538 1539 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn, 1540 void *vm, uint32_t desc) 1541 { 1542 intptr_t i, oprsz = simd_oprsz(desc); 1543 int64_t *d = vd, *n = vn, *m = vm; 1544 bool q = false; 1545 1546 for (i = 0; i < oprsz / 8; i++) { 1547 int64_t nn = n[i], mm = m[i], dd = nn + mm; 1548 if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) { 1549 dd = (nn >> 63) ^ ~INT64_MIN; 1550 q = true; 1551 } 1552 d[i] = dd; 1553 } 1554 if (q) { 1555 uint32_t *qc = vq; 1556 qc[0] = 1; 1557 } 1558 clear_tail(d, oprsz, simd_maxsz(desc)); 1559 } 1560 1561 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn, 1562 void *vm, uint32_t desc) 1563 { 1564 intptr_t i, oprsz = simd_oprsz(desc); 1565 int64_t *d = vd, *n = vn, *m = vm; 1566 bool q = false; 1567 1568 for (i = 0; i < oprsz / 8; i++) { 1569 int64_t nn = n[i], mm = m[i], dd = nn - mm; 1570 if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) { 1571 dd = (nn >> 63) ^ ~INT64_MIN; 1572 q = true; 1573 } 1574 d[i] = dd; 1575 } 1576 if (q) { 1577 uint32_t *qc = vq; 1578 qc[0] = 1; 1579 } 1580 clear_tail(d, oprsz, simd_maxsz(desc)); 1581 } 1582 1583 1584 #define DO_SRA(NAME, TYPE) \ 1585 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1586 { \ 1587 intptr_t i, oprsz = simd_oprsz(desc); \ 1588 int shift = simd_data(desc); \ 1589 TYPE *d = vd, *n = vn; \ 1590 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1591 d[i] += n[i] >> shift; \ 1592 } \ 1593 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1594 } 1595 1596 
DO_SRA(gvec_ssra_b, int8_t) 1597 DO_SRA(gvec_ssra_h, int16_t) 1598 DO_SRA(gvec_ssra_s, int32_t) 1599 DO_SRA(gvec_ssra_d, int64_t) 1600 1601 DO_SRA(gvec_usra_b, uint8_t) 1602 DO_SRA(gvec_usra_h, uint16_t) 1603 DO_SRA(gvec_usra_s, uint32_t) 1604 DO_SRA(gvec_usra_d, uint64_t) 1605 1606 #undef DO_SRA 1607 1608 #define DO_RSHR(NAME, TYPE) \ 1609 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1610 { \ 1611 intptr_t i, oprsz = simd_oprsz(desc); \ 1612 int shift = simd_data(desc); \ 1613 TYPE *d = vd, *n = vn; \ 1614 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1615 TYPE tmp = n[i] >> (shift - 1); \ 1616 d[i] = (tmp >> 1) + (tmp & 1); \ 1617 } \ 1618 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1619 } 1620 1621 DO_RSHR(gvec_srshr_b, int8_t) 1622 DO_RSHR(gvec_srshr_h, int16_t) 1623 DO_RSHR(gvec_srshr_s, int32_t) 1624 DO_RSHR(gvec_srshr_d, int64_t) 1625 1626 DO_RSHR(gvec_urshr_b, uint8_t) 1627 DO_RSHR(gvec_urshr_h, uint16_t) 1628 DO_RSHR(gvec_urshr_s, uint32_t) 1629 DO_RSHR(gvec_urshr_d, uint64_t) 1630 1631 #undef DO_RSHR 1632 1633 #define DO_RSRA(NAME, TYPE) \ 1634 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1635 { \ 1636 intptr_t i, oprsz = simd_oprsz(desc); \ 1637 int shift = simd_data(desc); \ 1638 TYPE *d = vd, *n = vn; \ 1639 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1640 TYPE tmp = n[i] >> (shift - 1); \ 1641 d[i] += (tmp >> 1) + (tmp & 1); \ 1642 } \ 1643 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1644 } 1645 1646 DO_RSRA(gvec_srsra_b, int8_t) 1647 DO_RSRA(gvec_srsra_h, int16_t) 1648 DO_RSRA(gvec_srsra_s, int32_t) 1649 DO_RSRA(gvec_srsra_d, int64_t) 1650 1651 DO_RSRA(gvec_ursra_b, uint8_t) 1652 DO_RSRA(gvec_ursra_h, uint16_t) 1653 DO_RSRA(gvec_ursra_s, uint32_t) 1654 DO_RSRA(gvec_ursra_d, uint64_t) 1655 1656 #undef DO_RSRA 1657 1658 #define DO_SRI(NAME, TYPE) \ 1659 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1660 { \ 1661 intptr_t i, oprsz = simd_oprsz(desc); \ 1662 int shift = simd_data(desc); \ 1663 TYPE *d = vd, *n = vn; \ 1664 
for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1665 d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \ 1666 } \ 1667 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1668 } 1669 1670 DO_SRI(gvec_sri_b, uint8_t) 1671 DO_SRI(gvec_sri_h, uint16_t) 1672 DO_SRI(gvec_sri_s, uint32_t) 1673 DO_SRI(gvec_sri_d, uint64_t) 1674 1675 #undef DO_SRI 1676 1677 #define DO_SLI(NAME, TYPE) \ 1678 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1679 { \ 1680 intptr_t i, oprsz = simd_oprsz(desc); \ 1681 int shift = simd_data(desc); \ 1682 TYPE *d = vd, *n = vn; \ 1683 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1684 d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \ 1685 } \ 1686 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1687 } 1688 1689 DO_SLI(gvec_sli_b, uint8_t) 1690 DO_SLI(gvec_sli_h, uint16_t) 1691 DO_SLI(gvec_sli_s, uint32_t) 1692 DO_SLI(gvec_sli_d, uint64_t) 1693 1694 #undef DO_SLI 1695 1696 /* 1697 * Convert float16 to float32, raising no exceptions and 1698 * preserving exceptional values, including SNaN. 1699 * This is effectively an unpack+repack operation. 1700 */ 1701 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16) 1702 { 1703 const int f16_bias = 15; 1704 const int f32_bias = 127; 1705 uint32_t sign = extract32(f16, 15, 1); 1706 uint32_t exp = extract32(f16, 10, 5); 1707 uint32_t frac = extract32(f16, 0, 10); 1708 1709 if (exp == 0x1f) { 1710 /* Inf or NaN */ 1711 exp = 0xff; 1712 } else if (exp == 0) { 1713 /* Zero or denormal. */ 1714 if (frac != 0) { 1715 if (fz16) { 1716 frac = 0; 1717 } else { 1718 /* 1719 * Denormal; these are all normal float32. 1720 * Shift the fraction so that the msb is at bit 11, 1721 * then remove bit 11 as the implicit bit of the 1722 * normalized float32. Note that we still go through 1723 * the shift for normal numbers below, to put the 1724 * float32 fraction at the right place. 
1725 */ 1726 int shift = clz32(frac) - 21; 1727 frac = (frac << shift) & 0x3ff; 1728 exp = f32_bias - f16_bias - shift + 1; 1729 } 1730 } 1731 } else { 1732 /* Normal number; adjust the bias. */ 1733 exp += f32_bias - f16_bias; 1734 } 1735 sign <<= 31; 1736 exp <<= 23; 1737 frac <<= 23 - 10; 1738 1739 return sign | exp | frac; 1740 } 1741 1742 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2) 1743 { 1744 /* 1745 * Branchless load of u32[0], u64[0], u32[1], or u64[1]. 1746 * Load the 2nd qword iff is_q & is_2. 1747 * Shift to the 2nd dword iff !is_q & is_2. 1748 * For !is_q & !is_2, the upper bits of the result are garbage. 1749 */ 1750 return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5); 1751 } 1752 1753 /* 1754 * Note that FMLAL requires oprsz == 8 or oprsz == 16, 1755 * as there is not yet SVE versions that might use blocking. 1756 */ 1757 1758 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst, 1759 uint32_t desc, bool fz16) 1760 { 1761 intptr_t i, oprsz = simd_oprsz(desc); 1762 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 1763 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1764 int is_q = oprsz == 16; 1765 uint64_t n_4, m_4; 1766 1767 /* Pre-load all of the f16 data, avoiding overlap issues. */ 1768 n_4 = load4_f16(vn, is_q, is_2); 1769 m_4 = load4_f16(vm, is_q, is_2); 1770 1771 /* Negate all inputs for FMLSL at once. 
*/ 1772 if (is_s) { 1773 n_4 ^= 0x8000800080008000ull; 1774 } 1775 1776 for (i = 0; i < oprsz / 4; i++) { 1777 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 1778 float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16); 1779 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); 1780 } 1781 clear_tail(d, oprsz, simd_maxsz(desc)); 1782 } 1783 1784 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm, 1785 void *venv, uint32_t desc) 1786 { 1787 CPUARMState *env = venv; 1788 do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc, 1789 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 1790 } 1791 1792 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm, 1793 void *venv, uint32_t desc) 1794 { 1795 CPUARMState *env = venv; 1796 do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc, 1797 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 1798 } 1799 1800 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va, 1801 void *venv, uint32_t desc) 1802 { 1803 intptr_t i, oprsz = simd_oprsz(desc); 1804 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; 1805 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 1806 CPUARMState *env = venv; 1807 float_status *status = &env->vfp.fp_status; 1808 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16); 1809 1810 for (i = 0; i < oprsz; i += sizeof(float32)) { 1811 float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn; 1812 float16 mm_16 = *(float16 *)(vm + H1_2(i + sel)); 1813 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 1814 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 1815 float32 aa = *(float32 *)(va + H1_4(i)); 1816 1817 *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status); 1818 } 1819 } 1820 1821 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst, 1822 uint32_t desc, bool fz16) 1823 { 1824 intptr_t i, oprsz = simd_oprsz(desc); 1825 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 1826 int 
is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1827 int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3); 1828 int is_q = oprsz == 16; 1829 uint64_t n_4; 1830 float32 m_1; 1831 1832 /* Pre-load all of the f16 data, avoiding overlap issues. */ 1833 n_4 = load4_f16(vn, is_q, is_2); 1834 1835 /* Negate all inputs for FMLSL at once. */ 1836 if (is_s) { 1837 n_4 ^= 0x8000800080008000ull; 1838 } 1839 1840 m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16); 1841 1842 for (i = 0; i < oprsz / 4; i++) { 1843 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 1844 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); 1845 } 1846 clear_tail(d, oprsz, simd_maxsz(desc)); 1847 } 1848 1849 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm, 1850 void *venv, uint32_t desc) 1851 { 1852 CPUARMState *env = venv; 1853 do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc, 1854 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 1855 } 1856 1857 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm, 1858 void *venv, uint32_t desc) 1859 { 1860 CPUARMState *env = venv; 1861 do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc, 1862 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 1863 } 1864 1865 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va, 1866 void *venv, uint32_t desc) 1867 { 1868 intptr_t i, j, oprsz = simd_oprsz(desc); 1869 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; 1870 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 1871 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16); 1872 CPUARMState *env = venv; 1873 float_status *status = &env->vfp.fp_status; 1874 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16); 1875 1876 for (i = 0; i < oprsz; i += 16) { 1877 float16 mm_16 = *(float16 *)(vm + i + idx); 1878 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 1879 1880 for (j = 0; j < 16; j += sizeof(float32)) { 1881 float16 nn_16 = 
*(float16 *)(vn + H1_2(i + j + sel)) ^ negn; 1882 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 1883 float32 aa = *(float32 *)(va + H1_4(i + j)); 1884 1885 *(float32 *)(vd + H1_4(i + j)) = 1886 float32_muladd(nn, mm, aa, 0, status); 1887 } 1888 } 1889 } 1890 1891 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc) 1892 { 1893 intptr_t i, opr_sz = simd_oprsz(desc); 1894 int8_t *d = vd, *n = vn, *m = vm; 1895 1896 for (i = 0; i < opr_sz; ++i) { 1897 int8_t mm = m[i]; 1898 int8_t nn = n[i]; 1899 int8_t res = 0; 1900 if (mm >= 0) { 1901 if (mm < 8) { 1902 res = nn << mm; 1903 } 1904 } else { 1905 res = nn >> (mm > -8 ? -mm : 7); 1906 } 1907 d[i] = res; 1908 } 1909 clear_tail(d, opr_sz, simd_maxsz(desc)); 1910 } 1911 1912 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc) 1913 { 1914 intptr_t i, opr_sz = simd_oprsz(desc); 1915 int16_t *d = vd, *n = vn, *m = vm; 1916 1917 for (i = 0; i < opr_sz / 2; ++i) { 1918 int8_t mm = m[i]; /* only 8 bits of shift are significant */ 1919 int16_t nn = n[i]; 1920 int16_t res = 0; 1921 if (mm >= 0) { 1922 if (mm < 16) { 1923 res = nn << mm; 1924 } 1925 } else { 1926 res = nn >> (mm > -16 ? 
-mm : 15); 1927 } 1928 d[i] = res; 1929 } 1930 clear_tail(d, opr_sz, simd_maxsz(desc)); 1931 } 1932 1933 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc) 1934 { 1935 intptr_t i, opr_sz = simd_oprsz(desc); 1936 uint8_t *d = vd, *n = vn, *m = vm; 1937 1938 for (i = 0; i < opr_sz; ++i) { 1939 int8_t mm = m[i]; 1940 uint8_t nn = n[i]; 1941 uint8_t res = 0; 1942 if (mm >= 0) { 1943 if (mm < 8) { 1944 res = nn << mm; 1945 } 1946 } else { 1947 if (mm > -8) { 1948 res = nn >> -mm; 1949 } 1950 } 1951 d[i] = res; 1952 } 1953 clear_tail(d, opr_sz, simd_maxsz(desc)); 1954 } 1955 1956 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc) 1957 { 1958 intptr_t i, opr_sz = simd_oprsz(desc); 1959 uint16_t *d = vd, *n = vn, *m = vm; 1960 1961 for (i = 0; i < opr_sz / 2; ++i) { 1962 int8_t mm = m[i]; /* only 8 bits of shift are significant */ 1963 uint16_t nn = n[i]; 1964 uint16_t res = 0; 1965 if (mm >= 0) { 1966 if (mm < 16) { 1967 res = nn << mm; 1968 } 1969 } else { 1970 if (mm > -16) { 1971 res = nn >> -mm; 1972 } 1973 } 1974 d[i] = res; 1975 } 1976 clear_tail(d, opr_sz, simd_maxsz(desc)); 1977 } 1978 1979 /* 1980 * 8x8->8 polynomial multiply. 1981 * 1982 * Polynomial multiplication is like integer multiplication except the 1983 * partial products are XORed, not added. 1984 * 1985 * TODO: expose this as a generic vector operation, as it is a common 1986 * crypto building block. 1987 */ 1988 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc) 1989 { 1990 intptr_t i, opr_sz = simd_oprsz(desc); 1991 uint64_t *d = vd, *n = vn, *m = vm; 1992 1993 for (i = 0; i < opr_sz / 8; ++i) { 1994 d[i] = clmul_8x8_low(n[i], m[i]); 1995 } 1996 clear_tail(d, opr_sz, simd_maxsz(desc)); 1997 } 1998 1999 /* 2000 * 64x64->128 polynomial multiply. 2001 * Because of the lanes are not accessed in strict columns, 2002 * this probably cannot be turned into a generic helper. 
2003 */ 2004 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) 2005 { 2006 intptr_t i, j, opr_sz = simd_oprsz(desc); 2007 intptr_t hi = simd_data(desc); 2008 uint64_t *d = vd, *n = vn, *m = vm; 2009 2010 for (i = 0; i < opr_sz / 8; i += 2) { 2011 uint64_t nn = n[i + hi]; 2012 uint64_t mm = m[i + hi]; 2013 uint64_t rhi = 0; 2014 uint64_t rlo = 0; 2015 2016 /* Bit 0 can only influence the low 64-bit result. */ 2017 if (nn & 1) { 2018 rlo = mm; 2019 } 2020 2021 for (j = 1; j < 64; ++j) { 2022 uint64_t mask = -((nn >> j) & 1); 2023 rlo ^= (mm << j) & mask; 2024 rhi ^= (mm >> (64 - j)) & mask; 2025 } 2026 d[i] = rlo; 2027 d[i + 1] = rhi; 2028 } 2029 clear_tail(d, opr_sz, simd_maxsz(desc)); 2030 } 2031 2032 uint64_t pmull_w(uint64_t op1, uint64_t op2) 2033 { 2034 uint64_t result = 0; 2035 int i; 2036 for (i = 0; i < 16; ++i) { 2037 uint64_t mask = (op1 & 0x0000000100000001ull) * 0xffffffff; 2038 result ^= op2 & mask; 2039 op1 >>= 1; 2040 op2 <<= 1; 2041 } 2042 return result; 2043 } 2044 2045 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2046 { 2047 int hi = simd_data(desc); 2048 uint64_t *d = vd, *n = vn, *m = vm; 2049 uint64_t nn = n[hi], mm = m[hi]; 2050 2051 d[0] = clmul_8x4_packed(nn, mm); 2052 nn >>= 32; 2053 mm >>= 32; 2054 d[1] = clmul_8x4_packed(nn, mm); 2055 2056 clear_tail(d, 16, simd_maxsz(desc)); 2057 } 2058 2059 #ifdef TARGET_AARCH64 2060 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2061 { 2062 int shift = simd_data(desc) * 8; 2063 intptr_t i, opr_sz = simd_oprsz(desc); 2064 uint64_t *d = vd, *n = vn, *m = vm; 2065 2066 for (i = 0; i < opr_sz / 8; ++i) { 2067 d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift); 2068 } 2069 } 2070 2071 static uint64_t pmull_d(uint64_t op1, uint64_t op2) 2072 { 2073 uint64_t result = 0; 2074 int i; 2075 2076 for (i = 0; i < 32; ++i) { 2077 uint64_t mask = -((op1 >> i) & 1); 2078 result ^= (op2 << i) & mask; 2079 } 2080 return result; 2081 } 2082 2083 void 
HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc) 2084 { 2085 intptr_t sel = H4(simd_data(desc)); 2086 intptr_t i, opr_sz = simd_oprsz(desc); 2087 uint32_t *n = vn, *m = vm; 2088 uint64_t *d = vd; 2089 2090 for (i = 0; i < opr_sz / 8; ++i) { 2091 d[i] = pmull_d(n[2 * i + sel], m[2 * i + sel]); 2092 } 2093 } 2094 #endif 2095 2096 #define DO_CMP0(NAME, TYPE, OP) \ 2097 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2098 { \ 2099 intptr_t i, opr_sz = simd_oprsz(desc); \ 2100 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 2101 TYPE nn = *(TYPE *)(vn + i); \ 2102 *(TYPE *)(vd + i) = -(nn OP 0); \ 2103 } \ 2104 clear_tail(vd, opr_sz, simd_maxsz(desc)); \ 2105 } 2106 2107 DO_CMP0(gvec_ceq0_b, int8_t, ==) 2108 DO_CMP0(gvec_clt0_b, int8_t, <) 2109 DO_CMP0(gvec_cle0_b, int8_t, <=) 2110 DO_CMP0(gvec_cgt0_b, int8_t, >) 2111 DO_CMP0(gvec_cge0_b, int8_t, >=) 2112 2113 DO_CMP0(gvec_ceq0_h, int16_t, ==) 2114 DO_CMP0(gvec_clt0_h, int16_t, <) 2115 DO_CMP0(gvec_cle0_h, int16_t, <=) 2116 DO_CMP0(gvec_cgt0_h, int16_t, >) 2117 DO_CMP0(gvec_cge0_h, int16_t, >=) 2118 2119 #undef DO_CMP0 2120 2121 #define DO_ABD(NAME, TYPE) \ 2122 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2123 { \ 2124 intptr_t i, opr_sz = simd_oprsz(desc); \ 2125 TYPE *d = vd, *n = vn, *m = vm; \ 2126 \ 2127 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2128 d[i] = n[i] < m[i] ? 
m[i] - n[i] : n[i] - m[i]; \ 2129 } \ 2130 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2131 } 2132 2133 DO_ABD(gvec_sabd_b, int8_t) 2134 DO_ABD(gvec_sabd_h, int16_t) 2135 DO_ABD(gvec_sabd_s, int32_t) 2136 DO_ABD(gvec_sabd_d, int64_t) 2137 2138 DO_ABD(gvec_uabd_b, uint8_t) 2139 DO_ABD(gvec_uabd_h, uint16_t) 2140 DO_ABD(gvec_uabd_s, uint32_t) 2141 DO_ABD(gvec_uabd_d, uint64_t) 2142 2143 #undef DO_ABD 2144 2145 #define DO_ABA(NAME, TYPE) \ 2146 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2147 { \ 2148 intptr_t i, opr_sz = simd_oprsz(desc); \ 2149 TYPE *d = vd, *n = vn, *m = vm; \ 2150 \ 2151 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2152 d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \ 2153 } \ 2154 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2155 } 2156 2157 DO_ABA(gvec_saba_b, int8_t) 2158 DO_ABA(gvec_saba_h, int16_t) 2159 DO_ABA(gvec_saba_s, int32_t) 2160 DO_ABA(gvec_saba_d, int64_t) 2161 2162 DO_ABA(gvec_uaba_b, uint8_t) 2163 DO_ABA(gvec_uaba_h, uint16_t) 2164 DO_ABA(gvec_uaba_s, uint32_t) 2165 DO_ABA(gvec_uaba_d, uint64_t) 2166 2167 #undef DO_ABA 2168 2169 #define DO_NEON_PAIRWISE(NAME, OP) \ 2170 void HELPER(NAME##s)(void *vd, void *vn, void *vm, \ 2171 void *stat, uint32_t oprsz) \ 2172 { \ 2173 float_status *fpst = stat; \ 2174 float32 *d = vd; \ 2175 float32 *n = vn; \ 2176 float32 *m = vm; \ 2177 float32 r0, r1; \ 2178 \ 2179 /* Read all inputs before writing outputs in case vm == vd */ \ 2180 r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst); \ 2181 r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst); \ 2182 \ 2183 d[H4(0)] = r0; \ 2184 d[H4(1)] = r1; \ 2185 } \ 2186 \ 2187 void HELPER(NAME##h)(void *vd, void *vn, void *vm, \ 2188 void *stat, uint32_t oprsz) \ 2189 { \ 2190 float_status *fpst = stat; \ 2191 float16 *d = vd; \ 2192 float16 *n = vn; \ 2193 float16 *m = vm; \ 2194 float16 r0, r1, r2, r3; \ 2195 \ 2196 /* Read all inputs before writing outputs in case vm == vd */ \ 2197 r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst); \ 2198 r1 = 
float16_##OP(n[H2(2)], n[H2(3)], fpst); \ 2199 r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst); \ 2200 r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst); \ 2201 \ 2202 d[H2(0)] = r0; \ 2203 d[H2(1)] = r1; \ 2204 d[H2(2)] = r2; \ 2205 d[H2(3)] = r3; \ 2206 } 2207 2208 DO_NEON_PAIRWISE(neon_padd, add) 2209 DO_NEON_PAIRWISE(neon_pmax, max) 2210 DO_NEON_PAIRWISE(neon_pmin, min) 2211 2212 #undef DO_NEON_PAIRWISE 2213 2214 #define DO_VCVT_FIXED(NAME, FUNC, TYPE) \ 2215 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2216 { \ 2217 intptr_t i, oprsz = simd_oprsz(desc); \ 2218 int shift = simd_data(desc); \ 2219 TYPE *d = vd, *n = vn; \ 2220 float_status *fpst = stat; \ 2221 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2222 d[i] = FUNC(n[i], shift, fpst); \ 2223 } \ 2224 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2225 } 2226 2227 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t) 2228 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t) 2229 DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t) 2230 DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t) 2231 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t) 2232 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t) 2233 DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t) 2234 DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t) 2235 2236 #undef DO_VCVT_FIXED 2237 2238 #define DO_VCVT_RMODE(NAME, FUNC, TYPE) \ 2239 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2240 { \ 2241 float_status *fpst = stat; \ 2242 intptr_t i, oprsz = simd_oprsz(desc); \ 2243 uint32_t rmode = simd_data(desc); \ 2244 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2245 TYPE *d = vd, *n = vn; \ 2246 set_float_rounding_mode(rmode, fpst); \ 2247 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2248 d[i] = FUNC(n[i], 0, fpst); \ 2249 } \ 2250 set_float_rounding_mode(prev_rmode, fpst); \ 2251 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2252 
} 2253 2254 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t) 2255 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t) 2256 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t) 2257 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t) 2258 2259 #undef DO_VCVT_RMODE 2260 2261 #define DO_VRINT_RMODE(NAME, FUNC, TYPE) \ 2262 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2263 { \ 2264 float_status *fpst = stat; \ 2265 intptr_t i, oprsz = simd_oprsz(desc); \ 2266 uint32_t rmode = simd_data(desc); \ 2267 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2268 TYPE *d = vd, *n = vn; \ 2269 set_float_rounding_mode(rmode, fpst); \ 2270 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2271 d[i] = FUNC(n[i], fpst); \ 2272 } \ 2273 set_float_rounding_mode(prev_rmode, fpst); \ 2274 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2275 } 2276 2277 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t) 2278 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t) 2279 2280 #undef DO_VRINT_RMODE 2281 2282 #ifdef TARGET_AARCH64 2283 void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc) 2284 { 2285 const uint8_t *indices = vm; 2286 CPUARMState *env = venv; 2287 size_t oprsz = simd_oprsz(desc); 2288 uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5); 2289 bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1); 2290 uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6); 2291 union { 2292 uint8_t b[16]; 2293 uint64_t d[2]; 2294 } result; 2295 2296 /* 2297 * We must construct the final result in a temp, lest the output 2298 * overlaps the input table. For TBL, begin with zero; for TBX, 2299 * begin with the original register contents. Note that we always 2300 * copy 16 bytes here to avoid an extra branch; clearing the high 2301 * bits of the register for oprsz == 8 is handled below. 
2302 */ 2303 if (is_tbx) { 2304 memcpy(&result, vd, 16); 2305 } else { 2306 memset(&result, 0, 16); 2307 } 2308 2309 for (size_t i = 0; i < oprsz; ++i) { 2310 uint32_t index = indices[H1(i)]; 2311 2312 if (index < table_len) { 2313 /* 2314 * Convert index (a byte offset into the virtual table 2315 * which is a series of 128-bit vectors concatenated) 2316 * into the correct register element, bearing in mind 2317 * that the table can wrap around from V31 to V0. 2318 */ 2319 const uint8_t *table = (const uint8_t *) 2320 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32); 2321 result.b[H1(i)] = table[H1(index % 16)]; 2322 } 2323 } 2324 2325 memcpy(vd, &result, 16); 2326 clear_tail(vd, oprsz, simd_maxsz(desc)); 2327 } 2328 #endif 2329 2330 /* 2331 * NxN -> N highpart multiply 2332 * 2333 * TODO: expose this as a generic vector operation. 2334 */ 2335 2336 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2337 { 2338 intptr_t i, opr_sz = simd_oprsz(desc); 2339 int8_t *d = vd, *n = vn, *m = vm; 2340 2341 for (i = 0; i < opr_sz; ++i) { 2342 d[i] = ((int32_t)n[i] * m[i]) >> 8; 2343 } 2344 clear_tail(d, opr_sz, simd_maxsz(desc)); 2345 } 2346 2347 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2348 { 2349 intptr_t i, opr_sz = simd_oprsz(desc); 2350 int16_t *d = vd, *n = vn, *m = vm; 2351 2352 for (i = 0; i < opr_sz / 2; ++i) { 2353 d[i] = ((int32_t)n[i] * m[i]) >> 16; 2354 } 2355 clear_tail(d, opr_sz, simd_maxsz(desc)); 2356 } 2357 2358 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2359 { 2360 intptr_t i, opr_sz = simd_oprsz(desc); 2361 int32_t *d = vd, *n = vn, *m = vm; 2362 2363 for (i = 0; i < opr_sz / 4; ++i) { 2364 d[i] = ((int64_t)n[i] * m[i]) >> 32; 2365 } 2366 clear_tail(d, opr_sz, simd_maxsz(desc)); 2367 } 2368 2369 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2370 { 2371 intptr_t i, opr_sz = simd_oprsz(desc); 2372 uint64_t *d = vd, *n = vn, *m = vm; 2373 uint64_t discard; 2374 
2375 for (i = 0; i < opr_sz / 8; ++i) { 2376 muls64(&discard, &d[i], n[i], m[i]); 2377 } 2378 clear_tail(d, opr_sz, simd_maxsz(desc)); 2379 } 2380 2381 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2382 { 2383 intptr_t i, opr_sz = simd_oprsz(desc); 2384 uint8_t *d = vd, *n = vn, *m = vm; 2385 2386 for (i = 0; i < opr_sz; ++i) { 2387 d[i] = ((uint32_t)n[i] * m[i]) >> 8; 2388 } 2389 clear_tail(d, opr_sz, simd_maxsz(desc)); 2390 } 2391 2392 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2393 { 2394 intptr_t i, opr_sz = simd_oprsz(desc); 2395 uint16_t *d = vd, *n = vn, *m = vm; 2396 2397 for (i = 0; i < opr_sz / 2; ++i) { 2398 d[i] = ((uint32_t)n[i] * m[i]) >> 16; 2399 } 2400 clear_tail(d, opr_sz, simd_maxsz(desc)); 2401 } 2402 2403 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2404 { 2405 intptr_t i, opr_sz = simd_oprsz(desc); 2406 uint32_t *d = vd, *n = vn, *m = vm; 2407 2408 for (i = 0; i < opr_sz / 4; ++i) { 2409 d[i] = ((uint64_t)n[i] * m[i]) >> 32; 2410 } 2411 clear_tail(d, opr_sz, simd_maxsz(desc)); 2412 } 2413 2414 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2415 { 2416 intptr_t i, opr_sz = simd_oprsz(desc); 2417 uint64_t *d = vd, *n = vn, *m = vm; 2418 uint64_t discard; 2419 2420 for (i = 0; i < opr_sz / 8; ++i) { 2421 mulu64(&discard, &d[i], n[i], m[i]); 2422 } 2423 clear_tail(d, opr_sz, simd_maxsz(desc)); 2424 } 2425 2426 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc) 2427 { 2428 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2429 int shr = simd_data(desc); 2430 uint64_t *d = vd, *n = vn, *m = vm; 2431 2432 for (i = 0; i < opr_sz; ++i) { 2433 d[i] = ror64(n[i] ^ m[i], shr); 2434 } 2435 clear_tail(d, opr_sz * 8, simd_maxsz(desc)); 2436 } 2437 2438 /* 2439 * Integer matrix-multiply accumulate 2440 */ 2441 2442 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm) 2443 { 2444 int8_t *n = vn, *m = vm; 2445 2446 for (intptr_t k = 0; k < 8; 
++k) { 2447 sum += n[H1(k)] * m[H1(k)]; 2448 } 2449 return sum; 2450 } 2451 2452 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm) 2453 { 2454 uint8_t *n = vn, *m = vm; 2455 2456 for (intptr_t k = 0; k < 8; ++k) { 2457 sum += n[H1(k)] * m[H1(k)]; 2458 } 2459 return sum; 2460 } 2461 2462 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm) 2463 { 2464 uint8_t *n = vn; 2465 int8_t *m = vm; 2466 2467 for (intptr_t k = 0; k < 8; ++k) { 2468 sum += n[H1(k)] * m[H1(k)]; 2469 } 2470 return sum; 2471 } 2472 2473 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc, 2474 uint32_t (*inner_loop)(uint32_t, void *, void *)) 2475 { 2476 intptr_t seg, opr_sz = simd_oprsz(desc); 2477 2478 for (seg = 0; seg < opr_sz; seg += 16) { 2479 uint32_t *d = vd + seg; 2480 uint32_t *a = va + seg; 2481 uint32_t sum0, sum1, sum2, sum3; 2482 2483 /* 2484 * Process the entire segment at once, writing back the 2485 * results only after we've consumed all of the inputs. 2486 * 2487 * Key to indices by column: 2488 * i j i j 2489 */ 2490 sum0 = a[H4(0 + 0)]; 2491 sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0); 2492 sum1 = a[H4(0 + 1)]; 2493 sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8); 2494 sum2 = a[H4(2 + 0)]; 2495 sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0); 2496 sum3 = a[H4(2 + 1)]; 2497 sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8); 2498 2499 d[H4(0)] = sum0; 2500 d[H4(1)] = sum1; 2501 d[H4(2)] = sum2; 2502 d[H4(3)] = sum3; 2503 } 2504 clear_tail(vd, opr_sz, simd_maxsz(desc)); 2505 } 2506 2507 #define DO_MMLA_B(NAME, INNER) \ 2508 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 2509 { do_mmla_b(vd, vn, vm, va, desc, INNER); } 2510 2511 DO_MMLA_B(gvec_smmla_b, do_smmla_b) 2512 DO_MMLA_B(gvec_ummla_b, do_ummla_b) 2513 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b) 2514 2515 /* 2516 * BFloat16 Dot Product 2517 */ 2518 2519 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2) 2520 { 2521 /* FPCR is ignored for 
BFDOT and BFMMLA. */ 2522 float_status bf_status = { 2523 .tininess_before_rounding = float_tininess_before_rounding, 2524 .float_rounding_mode = float_round_to_odd_inf, 2525 .flush_to_zero = true, 2526 .flush_inputs_to_zero = true, 2527 .default_nan_mode = true, 2528 }; 2529 float32 t1, t2; 2530 2531 /* 2532 * Extract each BFloat16 from the element pair, and shift 2533 * them such that they become float32. 2534 */ 2535 t1 = float32_mul(e1 << 16, e2 << 16, &bf_status); 2536 t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status); 2537 t1 = float32_add(t1, t2, &bf_status); 2538 t1 = float32_add(sum, t1, &bf_status); 2539 2540 return t1; 2541 } 2542 2543 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 2544 { 2545 intptr_t i, opr_sz = simd_oprsz(desc); 2546 float32 *d = vd, *a = va; 2547 uint32_t *n = vn, *m = vm; 2548 2549 for (i = 0; i < opr_sz / 4; ++i) { 2550 d[i] = bfdotadd(a[i], n[i], m[i]); 2551 } 2552 clear_tail(d, opr_sz, simd_maxsz(desc)); 2553 } 2554 2555 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm, 2556 void *va, uint32_t desc) 2557 { 2558 intptr_t i, j, opr_sz = simd_oprsz(desc); 2559 intptr_t index = simd_data(desc); 2560 intptr_t elements = opr_sz / 4; 2561 intptr_t eltspersegment = MIN(16 / 4, elements); 2562 float32 *d = vd, *a = va; 2563 uint32_t *n = vn, *m = vm; 2564 2565 for (i = 0; i < elements; i += eltspersegment) { 2566 uint32_t m_idx = m[i + H4(index)]; 2567 2568 for (j = i; j < i + eltspersegment; j++) { 2569 d[j] = bfdotadd(a[j], n[j], m_idx); 2570 } 2571 } 2572 clear_tail(d, opr_sz, simd_maxsz(desc)); 2573 } 2574 2575 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 2576 { 2577 intptr_t s, opr_sz = simd_oprsz(desc); 2578 float32 *d = vd, *a = va; 2579 uint32_t *n = vn, *m = vm; 2580 2581 for (s = 0; s < opr_sz / 4; s += 4) { 2582 float32 sum00, sum01, sum10, sum11; 2583 2584 /* 2585 * Process the entire segment at once, writing back the 2586 * results only 
after we've consumed all of the inputs. 2587 * 2588 * Key to indices by column: 2589 * i j i k j k 2590 */ 2591 sum00 = a[s + H4(0 + 0)]; 2592 sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]); 2593 sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]); 2594 2595 sum01 = a[s + H4(0 + 1)]; 2596 sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]); 2597 sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]); 2598 2599 sum10 = a[s + H4(2 + 0)]; 2600 sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]); 2601 sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]); 2602 2603 sum11 = a[s + H4(2 + 1)]; 2604 sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]); 2605 sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]); 2606 2607 d[s + H4(0 + 0)] = sum00; 2608 d[s + H4(0 + 1)] = sum01; 2609 d[s + H4(2 + 0)] = sum10; 2610 d[s + H4(2 + 1)] = sum11; 2611 } 2612 clear_tail(d, opr_sz, simd_maxsz(desc)); 2613 } 2614 2615 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va, 2616 void *stat, uint32_t desc) 2617 { 2618 intptr_t i, opr_sz = simd_oprsz(desc); 2619 intptr_t sel = simd_data(desc); 2620 float32 *d = vd, *a = va; 2621 bfloat16 *n = vn, *m = vm; 2622 2623 for (i = 0; i < opr_sz / 4; ++i) { 2624 float32 nn = n[H2(i * 2 + sel)] << 16; 2625 float32 mm = m[H2(i * 2 + sel)] << 16; 2626 d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat); 2627 } 2628 clear_tail(d, opr_sz, simd_maxsz(desc)); 2629 } 2630 2631 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, 2632 void *va, void *stat, uint32_t desc) 2633 { 2634 intptr_t i, j, opr_sz = simd_oprsz(desc); 2635 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1); 2636 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3); 2637 intptr_t elements = opr_sz / 4; 2638 intptr_t eltspersegment = MIN(16 / 4, elements); 2639 float32 *d = vd, *a = va; 2640 bfloat16 *n = vn, *m = vm; 2641 2642 for (i = 0; i < elements; i += eltspersegment) { 2643 float32 m_idx = m[H2(2 
* i + index)] << 16; 2644 2645 for (j = i; j < i + eltspersegment; j++) { 2646 float32 n_j = n[H2(2 * j + sel)] << 16; 2647 d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat); 2648 } 2649 } 2650 clear_tail(d, opr_sz, simd_maxsz(desc)); 2651 } 2652 2653 #define DO_CLAMP(NAME, TYPE) \ 2654 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \ 2655 { \ 2656 intptr_t i, opr_sz = simd_oprsz(desc); \ 2657 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 2658 TYPE aa = *(TYPE *)(a + i); \ 2659 TYPE nn = *(TYPE *)(n + i); \ 2660 TYPE mm = *(TYPE *)(m + i); \ 2661 TYPE dd = MIN(MAX(aa, nn), mm); \ 2662 *(TYPE *)(d + i) = dd; \ 2663 } \ 2664 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2665 } 2666 2667 DO_CLAMP(gvec_sclamp_b, int8_t) 2668 DO_CLAMP(gvec_sclamp_h, int16_t) 2669 DO_CLAMP(gvec_sclamp_s, int32_t) 2670 DO_CLAMP(gvec_sclamp_d, int64_t) 2671 2672 DO_CLAMP(gvec_uclamp_b, uint8_t) 2673 DO_CLAMP(gvec_uclamp_h, uint16_t) 2674 DO_CLAMP(gvec_uclamp_s, uint32_t) 2675 DO_CLAMP(gvec_uclamp_d, uint64_t) 2676