1 /*
2 * ARM AdvSIMD / SVE Vector Operations
3 *
4 * Copyright (c) 2018 Linaro
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/helper-proto.h"
23 #include "tcg/tcg-gvec-desc.h"
24 #include "fpu/softfloat.h"
25 #include "qemu/int128.h"
26 #include "crypto/clmul.h"
27 #include "vec_internal.h"
28
29 /*
30 * Data for expanding active predicate bits to bytes, for byte elements.
31 *
32 * for (i = 0; i < 256; ++i) {
33 * unsigned long m = 0;
34 * for (j = 0; j < 8; j++) {
35 * if ((i >> j) & 1) {
36 * m |= 0xfful << (j << 3);
37 * }
38 * }
39 * printf("0x%016lx,\n", m);
40 * }
41 */
42 const uint64_t expand_pred_b_data[256] = {
43 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
44 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
45 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
46 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
47 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
48 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
49 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
50 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
51 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
52 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
53 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
54 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
55 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
56 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
57 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
58 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
59 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
60 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
61 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
62 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
63 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
64 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
65 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
66 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
67 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
68 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
69 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
70 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
71 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
72 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
73 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
74 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
75 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
76 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
77 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
78 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
79 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
80 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
81 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
82 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
83 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
84 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
85 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
86 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
87 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
88 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
89 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
90 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
91 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
92 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
93 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
94 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
95 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
96 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
97 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
98 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
99 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
100 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
101 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
102 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
103 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
104 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
105 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
106 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
107 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
108 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
109 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
110 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
111 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
112 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
113 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
114 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
115 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
116 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
117 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
118 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
119 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
120 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
121 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
122 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
123 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
124 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
125 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
126 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
127 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
128 0xffffffffffffffff,
129 };
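/*
 * For example, a predicate byte of 0x05 has bits 0 and 2 set, so it
 * expands to a mask covering bytes 0 and 2 of the 64-bit lane:
 * expand_pred_b_data[0x05] == 0x0000000000ff00ff.
 */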
130
131 /*
132 * Similarly for half-word elements.
133 * for (i = 0; i < 256; ++i) {
134 * unsigned long m = 0;
135 * if (i & 0xaa) {
136 * continue;
137 * }
138 * for (j = 0; j < 8; j += 2) {
139 * if ((i >> j) & 1) {
140 * m |= 0xfffful << (j << 3);
141 * }
142 * }
143 * printf("[0x%x] = 0x%016lx,\n", i, m);
144 * }
145 */
146 const uint64_t expand_pred_h_data[0x55 + 1] = {
147 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
148 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
149 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
150 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
151 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
152 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
153 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
154 [0x55] = 0xffffffffffffffff,
155 };
156
157 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
158 int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
159 bool neg, bool round)
160 {
161 /*
162 * Simplify:
163 * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
164 * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
165 */
166 int32_t ret = (int32_t)src1 * src2;
167 if (neg) {
168 ret = -ret;
169 }
170 ret += ((int32_t)src3 << 7) + (round << 6);
171 ret >>= 7;
172
173 if (ret != (int8_t)ret) {
174 ret = (ret < 0 ? INT8_MIN : INT8_MAX);
175 }
176 return ret;
177 }
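/*
 * Worked examples of the arithmetic above:
 *   do_sqrdmlah_b(0x40, 0x40, 0, false, true)
 *     = (64 * 64 + (1 << 6)) >> 7 = 4160 >> 7 = 32, no saturation.
 *   do_sqrdmlah_b(-128, -128, 0, false, true)
 *     = (16384 + 64) >> 7 = 128, which does not fit in int8_t and
 *     therefore saturates to INT8_MAX.
 */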
178
179 void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
180 void *va, uint32_t desc)
181 {
182 intptr_t i, opr_sz = simd_oprsz(desc);
183 int8_t *d = vd, *n = vn, *m = vm, *a = va;
184
185 for (i = 0; i < opr_sz; ++i) {
186 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
187 }
188 }
189
190 void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
191 void *va, uint32_t desc)
192 {
193 intptr_t i, opr_sz = simd_oprsz(desc);
194 int8_t *d = vd, *n = vn, *m = vm, *a = va;
195
196 for (i = 0; i < opr_sz; ++i) {
197 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
198 }
199 }
200
201 void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
202 {
203 intptr_t i, opr_sz = simd_oprsz(desc);
204 int8_t *d = vd, *n = vn, *m = vm;
205
206 for (i = 0; i < opr_sz; ++i) {
207 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
208 }
209 }
210
211 void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
212 {
213 intptr_t i, opr_sz = simd_oprsz(desc);
214 int8_t *d = vd, *n = vn, *m = vm;
215
216 for (i = 0; i < opr_sz; ++i) {
217 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
218 }
219 }
220
221 /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
222 int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
223 bool neg, bool round, uint32_t *sat)
224 {
225 /* Simplify similarly to do_sqrdmlah_b above. */
226 int32_t ret = (int32_t)src1 * src2;
227 if (neg) {
228 ret = -ret;
229 }
230 ret += ((int32_t)src3 << 15) + (round << 14);
231 ret >>= 15;
232
233 if (ret != (int16_t)ret) {
234 *sat = 1;
235 ret = (ret < 0 ? INT16_MIN : INT16_MAX);
236 }
237 return ret;
238 }
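/*
 * Note that *sat is only ever set, never cleared, so a caller may point
 * it at the cumulative FPSCR.QC word (as the Neon helpers below do via
 * &env->vfp.qc[0]) or at a dummy variable when saturation need not be
 * reported (as the SVE2 helpers do with a local "discard").
 */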
239
240 uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
241 uint32_t src2, uint32_t src3)
242 {
243 uint32_t *sat = &env->vfp.qc[0];
244 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
245 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
246 false, true, sat);
247 return deposit32(e1, 16, 16, e2);
248 }
249
250 void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
251 void *vq, uint32_t desc)
252 {
253 uintptr_t opr_sz = simd_oprsz(desc);
254 int16_t *d = vd;
255 int16_t *n = vn;
256 int16_t *m = vm;
257 uintptr_t i;
258
259 for (i = 0; i < opr_sz / 2; ++i) {
260 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
261 }
262 clear_tail(d, opr_sz, simd_maxsz(desc));
263 }
264
265 uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
266 uint32_t src2, uint32_t src3)
267 {
268 uint32_t *sat = &env->vfp.qc[0];
269 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
270 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
271 true, true, sat);
272 return deposit32(e1, 16, 16, e2);
273 }
274
275 void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
276 void *vq, uint32_t desc)
277 {
278 uintptr_t opr_sz = simd_oprsz(desc);
279 int16_t *d = vd;
280 int16_t *n = vn;
281 int16_t *m = vm;
282 uintptr_t i;
283
284 for (i = 0; i < opr_sz / 2; ++i) {
285 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
286 }
287 clear_tail(d, opr_sz, simd_maxsz(desc));
288 }
289
290 void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
291 void *vq, uint32_t desc)
292 {
293 intptr_t i, opr_sz = simd_oprsz(desc);
294 int16_t *d = vd, *n = vn, *m = vm;
295
296 for (i = 0; i < opr_sz / 2; ++i) {
297 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
298 }
299 clear_tail(d, opr_sz, simd_maxsz(desc));
300 }
301
302 void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
303 void *vq, uint32_t desc)
304 {
305 intptr_t i, opr_sz = simd_oprsz(desc);
306 int16_t *d = vd, *n = vn, *m = vm;
307
308 for (i = 0; i < opr_sz / 2; ++i) {
309 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
310 }
311 clear_tail(d, opr_sz, simd_maxsz(desc));
312 }
313
314 void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
315 void *vq, uint32_t desc)
316 {
317 intptr_t i, j, opr_sz = simd_oprsz(desc);
318 int idx = simd_data(desc);
319 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
320 intptr_t elements = opr_sz / 2;
321 intptr_t eltspersegment = MIN(16 / 2, elements);
322
323 for (i = 0; i < elements; i += 16 / 2) {
324 int16_t mm = m[i];
325 for (j = 0; j < eltspersegment; ++j) {
326 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
327 }
328 }
329 clear_tail(d, opr_sz, simd_maxsz(desc));
330 }
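/*
 * Note that eltspersegment is clamped with MIN() so that a 64-bit
 * AdvSIMD operation (opr_sz == 8) only touches the four 16-bit elements
 * that exist, while full 128-bit segments still advance 8 elements at
 * a time.
 */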
331
332 void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
333 void *vq, uint32_t desc)
334 {
335 intptr_t i, j, opr_sz = simd_oprsz(desc);
336 int idx = simd_data(desc);
337 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
338 intptr_t elements = opr_sz / 2;
339 intptr_t eltspersegment = MIN(16 / 2, elements);
340
341 for (i = 0; i < elements; i += 16 / 2) {
342 int16_t mm = m[i];
343 for (j = 0; j < eltspersegment; ++j) {
344 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
345 }
346 }
347 clear_tail(d, opr_sz, simd_maxsz(desc));
348 }
349
350 void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
351 void *vq, uint32_t desc)
352 {
353 intptr_t i, j, opr_sz = simd_oprsz(desc);
354 int idx = simd_data(desc);
355 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
356 intptr_t elements = opr_sz / 2;
357 intptr_t eltspersegment = MIN(16 / 2, elements);
358
359 for (i = 0; i < elements; i += 16 / 2) {
360 int16_t mm = m[i];
361 for (j = 0; j < eltspersegment; ++j) {
362 d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
363 }
364 }
365 clear_tail(d, opr_sz, simd_maxsz(desc));
366 }
367
368 void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
369 void *vq, uint32_t desc)
370 {
371 intptr_t i, j, opr_sz = simd_oprsz(desc);
372 int idx = simd_data(desc);
373 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
374 intptr_t elements = opr_sz / 2;
375 intptr_t eltspersegment = MIN(16 / 2, elements);
376
377 for (i = 0; i < elements; i += 16 / 2) {
378 int16_t mm = m[i];
379 for (j = 0; j < eltspersegment; ++j) {
380 d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
381 }
382 }
383 clear_tail(d, opr_sz, simd_maxsz(desc));
384 }
385
386 void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
387 void *va, uint32_t desc)
388 {
389 intptr_t i, opr_sz = simd_oprsz(desc);
390 int16_t *d = vd, *n = vn, *m = vm, *a = va;
391 uint32_t discard;
392
393 for (i = 0; i < opr_sz / 2; ++i) {
394 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
395 }
396 }
397
398 void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
399 void *va, uint32_t desc)
400 {
401 intptr_t i, opr_sz = simd_oprsz(desc);
402 int16_t *d = vd, *n = vn, *m = vm, *a = va;
403 uint32_t discard;
404
405 for (i = 0; i < opr_sz / 2; ++i) {
406 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
407 }
408 }
409
410 void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
411 {
412 intptr_t i, opr_sz = simd_oprsz(desc);
413 int16_t *d = vd, *n = vn, *m = vm;
414 uint32_t discard;
415
416 for (i = 0; i < opr_sz / 2; ++i) {
417 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
418 }
419 }
420
421 void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
422 {
423 intptr_t i, opr_sz = simd_oprsz(desc);
424 int16_t *d = vd, *n = vn, *m = vm;
425 uint32_t discard;
426
427 for (i = 0; i < opr_sz / 2; ++i) {
428 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
429 }
430 }
431
432 void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
433 {
434 intptr_t i, j, opr_sz = simd_oprsz(desc);
435 int idx = simd_data(desc);
436 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
437 uint32_t discard;
438
439 for (i = 0; i < opr_sz / 2; i += 16 / 2) {
440 int16_t mm = m[i];
441 for (j = 0; j < 16 / 2; ++j) {
442 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
443 }
444 }
445 }
446
447 void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
448 {
449 intptr_t i, j, opr_sz = simd_oprsz(desc);
450 int idx = simd_data(desc);
451 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
452 uint32_t discard;
453
454 for (i = 0; i < opr_sz / 2; i += 16 / 2) {
455 int16_t mm = m[i];
456 for (j = 0; j < 16 / 2; ++j) {
457 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
458 }
459 }
460 }
461
462 /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
463 int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
464 bool neg, bool round, uint32_t *sat)
465 {
466 /* Simplify similarly to do_sqrdmlah_b above. */
467 int64_t ret = (int64_t)src1 * src2;
468 if (neg) {
469 ret = -ret;
470 }
471 ret += ((int64_t)src3 << 31) + (round << 30);
472 ret >>= 31;
473
474 if (ret != (int32_t)ret) {
475 *sat = 1;
476 ret = (ret < 0 ? INT32_MIN : INT32_MAX);
477 }
478 return ret;
479 }
480
481 uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
482 int32_t src2, int32_t src3)
483 {
484 uint32_t *sat = &env->vfp.qc[0];
485 return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
486 }
487
488 void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
489 void *vq, uint32_t desc)
490 {
491 uintptr_t opr_sz = simd_oprsz(desc);
492 int32_t *d = vd;
493 int32_t *n = vn;
494 int32_t *m = vm;
495 uintptr_t i;
496
497 for (i = 0; i < opr_sz / 4; ++i) {
498 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
499 }
500 clear_tail(d, opr_sz, simd_maxsz(desc));
501 }
502
503 uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
504 int32_t src2, int32_t src3)
505 {
506 uint32_t *sat = &env->vfp.qc[0];
507 return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
508 }
509
510 void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
511 void *vq, uint32_t desc)
512 {
513 uintptr_t opr_sz = simd_oprsz(desc);
514 int32_t *d = vd;
515 int32_t *n = vn;
516 int32_t *m = vm;
517 uintptr_t i;
518
519 for (i = 0; i < opr_sz / 4; ++i) {
520 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
521 }
522 clear_tail(d, opr_sz, simd_maxsz(desc));
523 }
524
525 void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
526 void *vq, uint32_t desc)
527 {
528 intptr_t i, opr_sz = simd_oprsz(desc);
529 int32_t *d = vd, *n = vn, *m = vm;
530
531 for (i = 0; i < opr_sz / 4; ++i) {
532 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
533 }
534 clear_tail(d, opr_sz, simd_maxsz(desc));
535 }
536
537 void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
538 void *vq, uint32_t desc)
539 {
540 intptr_t i, opr_sz = simd_oprsz(desc);
541 int32_t *d = vd, *n = vn, *m = vm;
542
543 for (i = 0; i < opr_sz / 4; ++i) {
544 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
545 }
546 clear_tail(d, opr_sz, simd_maxsz(desc));
547 }
548
549 void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
550 void *vq, uint32_t desc)
551 {
552 intptr_t i, j, opr_sz = simd_oprsz(desc);
553 int idx = simd_data(desc);
554 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
555 intptr_t elements = opr_sz / 4;
556 intptr_t eltspersegment = MIN(16 / 4, elements);
557
558 for (i = 0; i < elements; i += 16 / 4) {
559 int32_t mm = m[i];
560 for (j = 0; j < eltspersegment; ++j) {
561 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
562 }
563 }
564 clear_tail(d, opr_sz, simd_maxsz(desc));
565 }
566
567 void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
568 void *vq, uint32_t desc)
569 {
570 intptr_t i, j, opr_sz = simd_oprsz(desc);
571 int idx = simd_data(desc);
572 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
573 intptr_t elements = opr_sz / 4;
574 intptr_t eltspersegment = MIN(16 / 4, elements);
575
576 for (i = 0; i < elements; i += 16 / 4) {
577 int32_t mm = m[i];
578 for (j = 0; j < eltspersegment; ++j) {
579 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
580 }
581 }
582 clear_tail(d, opr_sz, simd_maxsz(desc));
583 }
584
585 void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
586 void *vq, uint32_t desc)
587 {
588 intptr_t i, j, opr_sz = simd_oprsz(desc);
589 int idx = simd_data(desc);
590 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
591 intptr_t elements = opr_sz / 4;
592 intptr_t eltspersegment = MIN(16 / 4, elements);
593
594 for (i = 0; i < elements; i += 16 / 4) {
595 int32_t mm = m[i];
596 for (j = 0; j < eltspersegment; ++j) {
597 d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
598 }
599 }
600 clear_tail(d, opr_sz, simd_maxsz(desc));
601 }
602
603 void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
604 void *vq, uint32_t desc)
605 {
606 intptr_t i, j, opr_sz = simd_oprsz(desc);
607 int idx = simd_data(desc);
608 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
609 intptr_t elements = opr_sz / 4;
610 intptr_t eltspersegment = MIN(16 / 4, elements);
611
612 for (i = 0; i < elements; i += 16 / 4) {
613 int32_t mm = m[i];
614 for (j = 0; j < eltspersegment; ++j) {
615 d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
616 }
617 }
618 clear_tail(d, opr_sz, simd_maxsz(desc));
619 }
620
621 void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
622 void *va, uint32_t desc)
623 {
624 intptr_t i, opr_sz = simd_oprsz(desc);
625 int32_t *d = vd, *n = vn, *m = vm, *a = va;
626 uint32_t discard;
627
628 for (i = 0; i < opr_sz / 4; ++i) {
629 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
630 }
631 }
632
633 void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
634 void *va, uint32_t desc)
635 {
636 intptr_t i, opr_sz = simd_oprsz(desc);
637 int32_t *d = vd, *n = vn, *m = vm, *a = va;
638 uint32_t discard;
639
640 for (i = 0; i < opr_sz / 4; ++i) {
641 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
642 }
643 }
644
645 void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
646 {
647 intptr_t i, opr_sz = simd_oprsz(desc);
648 int32_t *d = vd, *n = vn, *m = vm;
649 uint32_t discard;
650
651 for (i = 0; i < opr_sz / 4; ++i) {
652 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
653 }
654 }
655
656 void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
657 {
658 intptr_t i, opr_sz = simd_oprsz(desc);
659 int32_t *d = vd, *n = vn, *m = vm;
660 uint32_t discard;
661
662 for (i = 0; i < opr_sz / 4; ++i) {
663 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
664 }
665 }
666
667 void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
668 {
669 intptr_t i, j, opr_sz = simd_oprsz(desc);
670 int idx = simd_data(desc);
671 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
672 uint32_t discard;
673
674 for (i = 0; i < opr_sz / 4; i += 16 / 4) {
675 int32_t mm = m[i];
676 for (j = 0; j < 16 / 4; ++j) {
677 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
678 }
679 }
680 }
681
682 void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
683 {
684 intptr_t i, j, opr_sz = simd_oprsz(desc);
685 int idx = simd_data(desc);
686 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
687 uint32_t discard;
688
689 for (i = 0; i < opr_sz / 4; i += 16 / 4) {
690 int32_t mm = m[i];
691 for (j = 0; j < 16 / 4; ++j) {
692 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
693 }
694 }
695 }
696
697 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
698 static int64_t do_sat128_d(Int128 r)
699 {
700 int64_t ls = int128_getlo(r);
701 int64_t hs = int128_gethi(r);
702
703 if (unlikely(hs != (ls >> 63))) {
704 return hs < 0 ? INT64_MIN : INT64_MAX;
705 }
706 return ls;
707 }
708
709 int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
710 {
711 uint64_t l, h;
712 Int128 r, t;
713
714 /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
715 muls64(&l, &h, m, n);
716 r = int128_make128(l, h);
717 if (neg) {
718 r = int128_neg(r);
719 }
720 if (a) {
721 t = int128_exts64(a);
722 t = int128_lshift(t, 63);
723 r = int128_add(r, t);
724 }
725 if (round) {
726 t = int128_exts64(1ll << 62);
727 r = int128_add(r, t);
728 }
729 r = int128_rshift(r, 63);
730
731 return do_sat128_d(r);
732 }
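/*
 * The 64-bit form has no wider native integer type to fall back on, so
 * the doubled product is kept as an Int128: e.g. INT64_MIN * INT64_MIN
 * is 2^126, which cannot be represented in 64 bits, and do_sat128_d()
 * then checks that the high half is just the sign extension of the low
 * half before narrowing.
 */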
733
734 void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
735 void *va, uint32_t desc)
736 {
737 intptr_t i, opr_sz = simd_oprsz(desc);
738 int64_t *d = vd, *n = vn, *m = vm, *a = va;
739
740 for (i = 0; i < opr_sz / 8; ++i) {
741 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
742 }
743 }
744
745 void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
746 void *va, uint32_t desc)
747 {
748 intptr_t i, opr_sz = simd_oprsz(desc);
749 int64_t *d = vd, *n = vn, *m = vm, *a = va;
750
751 for (i = 0; i < opr_sz / 8; ++i) {
752 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
753 }
754 }
755
756 void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
757 {
758 intptr_t i, opr_sz = simd_oprsz(desc);
759 int64_t *d = vd, *n = vn, *m = vm;
760
761 for (i = 0; i < opr_sz / 8; ++i) {
762 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
763 }
764 }
765
766 void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
767 {
768 intptr_t i, opr_sz = simd_oprsz(desc);
769 int64_t *d = vd, *n = vn, *m = vm;
770
771 for (i = 0; i < opr_sz / 8; ++i) {
772 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
773 }
774 }
775
776 void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
777 {
778 intptr_t i, j, opr_sz = simd_oprsz(desc);
779 int idx = simd_data(desc);
780 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
781
782 for (i = 0; i < opr_sz / 8; i += 16 / 8) {
783 int64_t mm = m[i];
784 for (j = 0; j < 16 / 8; ++j) {
785 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
786 }
787 }
788 }
789
790 void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
791 {
792 intptr_t i, j, opr_sz = simd_oprsz(desc);
793 int idx = simd_data(desc);
794 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
795
796 for (i = 0; i < opr_sz / 8; i += 16 / 8) {
797 int64_t mm = m[i];
798 for (j = 0; j < 16 / 8; ++j) {
799 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
800 }
801 }
802 }
803
804 /* Integer 8 and 16-bit dot-product.
805 *
806 * Note that for the loops herein, host endianness does not matter
807 * with respect to the ordering of data within the quad-width lanes.
808 * All elements are treated equally, no matter where they are.
809 */
810
811 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
812 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
813 { \
814 intptr_t i, opr_sz = simd_oprsz(desc); \
815 TYPED *d = vd, *a = va; \
816 TYPEN *n = vn; \
817 TYPEM *m = vm; \
818 for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \
819 d[i] = (a[i] + \
820 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \
821 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \
822 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \
823 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \
824 } \
825 clear_tail(d, opr_sz, simd_maxsz(desc)); \
826 }
827
828 DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
829 DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
830 DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
831 DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
832 DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
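/*
 * For example, gvec_sdot_b computes, for each 32-bit destination lane i:
 *     d[i] = a[i] + n[4i]*m[4i] + n[4i+1]*m[4i+1]
 *                 + n[4i+2]*m[4i+2] + n[4i+3]*m[4i+3]
 * with each byte product sign-extended before accumulation.
 */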
833
834 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
835 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
836 { \
837 intptr_t i = 0, opr_sz = simd_oprsz(desc); \
838 intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \
839 /* \
840 * Special case: opr_sz == 8 from AA64/AA32 advsimd means the \
841 * first iteration might not be a full 16 byte segment. But \
842 * for vector lengths beyond that this must be SVE and we know \
843 * opr_sz is a multiple of 16, so we need not clamp segend \
844 * to opr_sz_n when we advance it at the end of the loop. \
845 */ \
846 intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \
847 intptr_t index = simd_data(desc); \
848 TYPED *d = vd, *a = va; \
849 TYPEN *n = vn; \
850 TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \
851 do { \
852 TYPED m0 = m_indexed[i * 4 + 0]; \
853 TYPED m1 = m_indexed[i * 4 + 1]; \
854 TYPED m2 = m_indexed[i * 4 + 2]; \
855 TYPED m3 = m_indexed[i * 4 + 3]; \
856 do { \
857 d[i] = (a[i] + \
858 n[i * 4 + 0] * m0 + \
859 n[i * 4 + 1] * m1 + \
860 n[i * 4 + 2] * m2 + \
861 n[i * 4 + 3] * m3); \
862 } while (++i < segend); \
863 segend = i + (16 / sizeof(TYPED)); \
864 } while (i < opr_sz_n); \
865 clear_tail(d, opr_sz, simd_maxsz(desc)); \
866 }
867
868 DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
869 DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
870 DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
871 DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
872 DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
873 DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
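/*
 * For the indexed forms, "index" selects one group of four TYPEM
 * elements within each 128-bit segment of vm; e.g. gvec_sdot_idx_b with
 * index 2 dots every 32-bit lane of a segment against bytes 8..11 of
 * the corresponding segment of vm.
 */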
874
875 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
876 void *vfpst, uint32_t desc)
877 {
878 uintptr_t opr_sz = simd_oprsz(desc);
879 float16 *d = vd;
880 float16 *n = vn;
881 float16 *m = vm;
882 float_status *fpst = vfpst;
883 uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
884 uint32_t neg_imag = neg_real ^ 1;
885 uintptr_t i;
886
887 /* Shift boolean to the sign bit so we can xor to negate. */
888 neg_real <<= 15;
889 neg_imag <<= 15;
890
891 for (i = 0; i < opr_sz / 2; i += 2) {
892 float16 e0 = n[H2(i)];
893 float16 e1 = m[H2(i + 1)] ^ neg_imag;
894 float16 e2 = n[H2(i + 1)];
895 float16 e3 = m[H2(i)] ^ neg_real;
896
897 d[H2(i)] = float16_add(e0, e1, fpst);
898 d[H2(i + 1)] = float16_add(e2, e3, fpst);
899 }
900 clear_tail(d, opr_sz, simd_maxsz(desc));
901 }
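/*
 * The single desc bit selects which of the two FCADD rotations is
 * applied: the real lane accumulates the (possibly negated) imaginary
 * element of m and the imaginary lane the (possibly negated) real
 * element of m, with the bit choosing which of the two is negated.
 * The same pattern repeats below for single and double precision.
 */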
902
903 void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
904 void *vfpst, uint32_t desc)
905 {
906 uintptr_t opr_sz = simd_oprsz(desc);
907 float32 *d = vd;
908 float32 *n = vn;
909 float32 *m = vm;
910 float_status *fpst = vfpst;
911 uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
912 uint32_t neg_imag = neg_real ^ 1;
913 uintptr_t i;
914
915 /* Shift boolean to the sign bit so we can xor to negate. */
916 neg_real <<= 31;
917 neg_imag <<= 31;
918
919 for (i = 0; i < opr_sz / 4; i += 2) {
920 float32 e0 = n[H4(i)];
921 float32 e1 = m[H4(i + 1)] ^ neg_imag;
922 float32 e2 = n[H4(i + 1)];
923 float32 e3 = m[H4(i)] ^ neg_real;
924
925 d[H4(i)] = float32_add(e0, e1, fpst);
926 d[H4(i + 1)] = float32_add(e2, e3, fpst);
927 }
928 clear_tail(d, opr_sz, simd_maxsz(desc));
929 }
930
931 void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
932 void *vfpst, uint32_t desc)
933 {
934 uintptr_t opr_sz = simd_oprsz(desc);
935 float64 *d = vd;
936 float64 *n = vn;
937 float64 *m = vm;
938 float_status *fpst = vfpst;
939 uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
940 uint64_t neg_imag = neg_real ^ 1;
941 uintptr_t i;
942
943 /* Shift boolean to the sign bit so we can xor to negate. */
944 neg_real <<= 63;
945 neg_imag <<= 63;
946
947 for (i = 0; i < opr_sz / 8; i += 2) {
948 float64 e0 = n[i];
949 float64 e1 = m[i + 1] ^ neg_imag;
950 float64 e2 = n[i + 1];
951 float64 e3 = m[i] ^ neg_real;
952
953 d[i] = float64_add(e0, e1, fpst);
954 d[i + 1] = float64_add(e2, e3, fpst);
955 }
956 clear_tail(d, opr_sz, simd_maxsz(desc));
957 }
958
959 void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
960 void *vfpst, uint32_t desc)
961 {
962 uintptr_t opr_sz = simd_oprsz(desc);
963 float16 *d = vd, *n = vn, *m = vm, *a = va;
964 float_status *fpst = vfpst;
965 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
966 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
967 uint32_t neg_real = flip ^ neg_imag;
968 uintptr_t i;
969
970 /* Shift boolean to the sign bit so we can xor to negate. */
971 neg_real <<= 15;
972 neg_imag <<= 15;
973
974 for (i = 0; i < opr_sz / 2; i += 2) {
975 float16 e2 = n[H2(i + flip)];
976 float16 e1 = m[H2(i + flip)] ^ neg_real;
977 float16 e4 = e2;
978 float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;
979
980 d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
981 d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
982 }
983 clear_tail(d, opr_sz, simd_maxsz(desc));
984 }
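/*
 * For FCMLA, "flip" selects whether the real or imaginary elements feed
 * each half of the result and neg_real/neg_imag supply the signs for
 * the rotation: e1/e3 are the (possibly negated) multiplicands taken
 * from m, e2/e4 the element taken from n, accumulated into a with
 * fused multiply-adds.
 */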
985
986 void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
987 void *vfpst, uint32_t desc)
988 {
989 uintptr_t opr_sz = simd_oprsz(desc);
990 float16 *d = vd, *n = vn, *m = vm, *a = va;
991 float_status *fpst = vfpst;
992 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
993 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
994 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
995 uint32_t neg_real = flip ^ neg_imag;
996 intptr_t elements = opr_sz / sizeof(float16);
997 intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
998 intptr_t i, j;
999
1000 /* Shift boolean to the sign bit so we can xor to negate. */
1001 neg_real <<= 15;
1002 neg_imag <<= 15;
1003
1004 for (i = 0; i < elements; i += eltspersegment) {
1005 float16 mr = m[H2(i + 2 * index + 0)];
1006 float16 mi = m[H2(i + 2 * index + 1)];
1007 float16 e1 = neg_real ^ (flip ? mi : mr);
1008 float16 e3 = neg_imag ^ (flip ? mr : mi);
1009
1010 for (j = i; j < i + eltspersegment; j += 2) {
1011 float16 e2 = n[H2(j + flip)];
1012 float16 e4 = e2;
1013
1014 d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
1015 d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
1016 }
1017 }
1018 clear_tail(d, opr_sz, simd_maxsz(desc));
1019 }
1020
1021 void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
1022 void *vfpst, uint32_t desc)
1023 {
1024 uintptr_t opr_sz = simd_oprsz(desc);
1025 float32 *d = vd, *n = vn, *m = vm, *a = va;
1026 float_status *fpst = vfpst;
1027 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1028 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1029 uint32_t neg_real = flip ^ neg_imag;
1030 uintptr_t i;
1031
1032 /* Shift boolean to the sign bit so we can xor to negate. */
1033 neg_real <<= 31;
1034 neg_imag <<= 31;
1035
1036 for (i = 0; i < opr_sz / 4; i += 2) {
1037 float32 e2 = n[H4(i + flip)];
1038 float32 e1 = m[H4(i + flip)] ^ neg_real;
1039 float32 e4 = e2;
1040 float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;
1041
1042 d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
1043 d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
1044 }
1045 clear_tail(d, opr_sz, simd_maxsz(desc));
1046 }
1047
1048 void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
1049 void *vfpst, uint32_t desc)
1050 {
1051 uintptr_t opr_sz = simd_oprsz(desc);
1052 float32 *d = vd, *n = vn, *m = vm, *a = va;
1053 float_status *fpst = vfpst;
1054 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1055 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1056 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1057 uint32_t neg_real = flip ^ neg_imag;
1058 intptr_t elements = opr_sz / sizeof(float32);
1059 intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
1060 intptr_t i, j;
1061
1062 /* Shift boolean to the sign bit so we can xor to negate. */
1063 neg_real <<= 31;
1064 neg_imag <<= 31;
1065
1066 for (i = 0; i < elements; i += eltspersegment) {
1067 float32 mr = m[H4(i + 2 * index + 0)];
1068 float32 mi = m[H4(i + 2 * index + 1)];
1069 float32 e1 = neg_real ^ (flip ? mi : mr);
1070 float32 e3 = neg_imag ^ (flip ? mr : mi);
1071
1072 for (j = i; j < i + eltspersegment; j += 2) {
1073 float32 e2 = n[H4(j + flip)];
1074 float32 e4 = e2;
1075
1076 d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
1077 d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
1078 }
1079 }
1080 clear_tail(d, opr_sz, simd_maxsz(desc));
1081 }
1082
1083 void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
1084 void *vfpst, uint32_t desc)
1085 {
1086 uintptr_t opr_sz = simd_oprsz(desc);
1087 float64 *d = vd, *n = vn, *m = vm, *a = va;
1088 float_status *fpst = vfpst;
1089 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1090 uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1091 uint64_t neg_real = flip ^ neg_imag;
1092 uintptr_t i;
1093
1094 /* Shift boolean to the sign bit so we can xor to negate. */
1095 neg_real <<= 63;
1096 neg_imag <<= 63;
1097
1098 for (i = 0; i < opr_sz / 8; i += 2) {
1099 float64 e2 = n[i + flip];
1100 float64 e1 = m[i + flip] ^ neg_real;
1101 float64 e4 = e2;
1102 float64 e3 = m[i + 1 - flip] ^ neg_imag;
1103
1104 d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
1105 d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
1106 }
1107 clear_tail(d, opr_sz, simd_maxsz(desc));
1108 }
1109
1110 /*
1111 * Floating point comparisons producing an integer result (all 1s or all 0s).
1112 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1113 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1114 */
1115 static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
1116 {
1117 return -float16_eq_quiet(op1, op2, stat);
1118 }
1119
1120 static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
1121 {
1122 return -float32_eq_quiet(op1, op2, stat);
1123 }
1124
1125 static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
1126 {
1127 return -float64_eq_quiet(op1, op2, stat);
1128 }
1129
1130 static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
1131 {
1132 return -float16_le(op2, op1, stat);
1133 }
1134
1135 static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
1136 {
1137 return -float32_le(op2, op1, stat);
1138 }
1139
1140 static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
1141 {
1142 return -float64_le(op2, op1, stat);
1143 }
1144
1145 static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
1146 {
1147 return -float16_lt(op2, op1, stat);
1148 }
1149
1150 static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
1151 {
1152 return -float32_lt(op2, op1, stat);
1153 }
1154
1155 static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
1156 {
1157 return -float64_lt(op2, op1, stat);
1158 }
1159
1160 static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
1161 {
1162 return -float16_le(float16_abs(op2), float16_abs(op1), stat);
1163 }
1164
1165 static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
1166 {
1167 return -float32_le(float32_abs(op2), float32_abs(op1), stat);
1168 }
1169
1170 static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
1171 {
1172 return -float64_le(float64_abs(op2), float64_abs(op1), stat);
1173 }
1174
1175 static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
1176 {
1177 return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
1178 }
1179
1180 static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
1181 {
1182 return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
1183 }
1184
1185 static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
1186 {
1187 return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
1188 }
1189
1190 static int16_t vfp_tosszh(float16 x, void *fpstp)
1191 {
1192 float_status *fpst = fpstp;
1193 if (float16_is_any_nan(x)) {
1194 float_raise(float_flag_invalid, fpst);
1195 return 0;
1196 }
1197 return float16_to_int16_round_to_zero(x, fpst);
1198 }
1199
1200 static uint16_t vfp_touszh(float16 x, void *fpstp)
1201 {
1202 float_status *fpst = fpstp;
1203 if (float16_is_any_nan(x)) {
1204 float_raise(float_flag_invalid, fpst);
1205 return 0;
1206 }
1207 return float16_to_uint16_round_to_zero(x, fpst);
1208 }
1209
1210 #define DO_2OP(NAME, FUNC, TYPE) \
1211 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
1212 { \
1213 intptr_t i, oprsz = simd_oprsz(desc); \
1214 TYPE *d = vd, *n = vn; \
1215 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1216 d[i] = FUNC(n[i], stat); \
1217 } \
1218 clear_tail(d, oprsz, simd_maxsz(desc)); \
1219 }
1220
1221 DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
1222 DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
1223 DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
1224
1225 DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
1226 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
1227 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
1228
1229 DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
1230 DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
1231
1232 DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
1233 DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
1234 DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
1235 DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
1236 DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
1237 DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
1238 DO_2OP(gvec_tosszh, vfp_tosszh, float16)
1239 DO_2OP(gvec_touszh, vfp_touszh, float16)
1240
1241 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \
1242 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
1243 { \
1244 return TYPE##_##CMPOP(op, TYPE##_zero, stat); \
1245 }
1246
1247 #define WRAP_CMP0_REV(FN, CMPOP, TYPE) \
1248 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
1249 { \
1250 return TYPE##_##CMPOP(TYPE##_zero, op, stat); \
1251 }
1252
1253 #define DO_2OP_CMP0(FN, CMPOP, DIRN) \
1254 WRAP_CMP0_##DIRN(FN, CMPOP, float16) \
1255 WRAP_CMP0_##DIRN(FN, CMPOP, float32) \
1256 DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \
1257 DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)
1258
1259 DO_2OP_CMP0(cgt, cgt, FWD)
1260 DO_2OP_CMP0(cge, cge, FWD)
1261 DO_2OP_CMP0(ceq, ceq, FWD)
1262 DO_2OP_CMP0(clt, cgt, REV)
1263 DO_2OP_CMP0(cle, cge, REV)
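/*
 * The REV wrappers swap the operand order so that the compare-against-
 * zero forms of "less than" and "less or equal" can reuse cgt/cge:
 * e.g. float32_clt0(op, stat) expands to float32_cgt(float32_zero, op,
 * stat), i.e. -float32_lt(op, float32_zero, stat).
 */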
1264
1265 #undef DO_2OP
1266 #undef DO_2OP_CMP0
1267
1268 /* Floating-point trigonometric starting value.
1269 * See the ARM ARM pseudocode function FPTrigSMul.
1270 */
1271 static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
1272 {
1273 float16 result = float16_mul(op1, op1, stat);
1274 if (!float16_is_any_nan(result)) {
1275 result = float16_set_sign(result, op2 & 1);
1276 }
1277 return result;
1278 }
1279
1280 static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
1281 {
1282 float32 result = float32_mul(op1, op1, stat);
1283 if (!float32_is_any_nan(result)) {
1284 result = float32_set_sign(result, op2 & 1);
1285 }
1286 return result;
1287 }
1288
1289 static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
1290 {
1291 float64 result = float64_mul(op1, op1, stat);
1292 if (!float64_is_any_nan(result)) {
1293 result = float64_set_sign(result, op2 & 1);
1294 }
1295 return result;
1296 }
1297
1298 static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
1299 {
1300 return float16_abs(float16_sub(op1, op2, stat));
1301 }
1302
1303 static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
1304 {
1305 return float32_abs(float32_sub(op1, op2, stat));
1306 }
1307
1308 static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
1309 {
1310 return float64_abs(float64_sub(op1, op2, stat));
1311 }
1312
1313 /*
1314 * Reciprocal step. These are the AArch32 versions, which use a
1315 * non-fused multiply-and-subtract.
1316 */
1317 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
1318 {
1319 op1 = float16_squash_input_denormal(op1, stat);
1320 op2 = float16_squash_input_denormal(op2, stat);
1321
1322 if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1323 (float16_is_infinity(op2) && float16_is_zero(op1))) {
1324 return float16_two;
1325 }
1326 return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
1327 }
1328
1329 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
1330 {
1331 op1 = float32_squash_input_denormal(op1, stat);
1332 op2 = float32_squash_input_denormal(op2, stat);
1333
1334 if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1335 (float32_is_infinity(op2) && float32_is_zero(op1))) {
1336 return float32_two;
1337 }
1338 return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
1339 }
1340
1341 /* Reciprocal square-root step. AArch32 non-fused semantics. */
1342 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
1343 {
1344 op1 = float16_squash_input_denormal(op1, stat);
1345 op2 = float16_squash_input_denormal(op2, stat);
1346
1347 if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1348 (float16_is_infinity(op2) && float16_is_zero(op1))) {
1349 return float16_one_point_five;
1350 }
1351 op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
1352 return float16_div(op1, float16_two, stat);
1353 }
1354
1355 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
1356 {
1357 op1 = float32_squash_input_denormal(op1, stat);
1358 op2 = float32_squash_input_denormal(op2, stat);
1359
1360 if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1361 (float32_is_infinity(op2) && float32_is_zero(op1))) {
1362 return float32_one_point_five;
1363 }
1364 op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
1365 return float32_div(op1, float32_two, stat);
1366 }
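/*
 * In formula terms, recps computes 2 - a*b and rsqrts computes
 * (3 - a*b) / 2, with the infinity * zero case short-circuited to the
 * exact constant the architecture requires.
 */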
1367
1368 #define DO_3OP(NAME, FUNC, TYPE) \
1369 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1370 { \
1371 intptr_t i, oprsz = simd_oprsz(desc); \
1372 TYPE *d = vd, *n = vn, *m = vm; \
1373 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1374 d[i] = FUNC(n[i], m[i], stat); \
1375 } \
1376 clear_tail(d, oprsz, simd_maxsz(desc)); \
1377 }
1378
1379 DO_3OP(gvec_fadd_h, float16_add, float16)
1380 DO_3OP(gvec_fadd_s, float32_add, float32)
1381 DO_3OP(gvec_fadd_d, float64_add, float64)
1382
1383 DO_3OP(gvec_fsub_h, float16_sub, float16)
1384 DO_3OP(gvec_fsub_s, float32_sub, float32)
1385 DO_3OP(gvec_fsub_d, float64_sub, float64)
1386
1387 DO_3OP(gvec_fmul_h, float16_mul, float16)
1388 DO_3OP(gvec_fmul_s, float32_mul, float32)
1389 DO_3OP(gvec_fmul_d, float64_mul, float64)
1390
1391 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
1392 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
1393 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
1394
1395 DO_3OP(gvec_fabd_h, float16_abd, float16)
1396 DO_3OP(gvec_fabd_s, float32_abd, float32)
1397 DO_3OP(gvec_fabd_d, float64_abd, float64)
1398
1399 DO_3OP(gvec_fceq_h, float16_ceq, float16)
1400 DO_3OP(gvec_fceq_s, float32_ceq, float32)
1401 DO_3OP(gvec_fceq_d, float64_ceq, float64)
1402
1403 DO_3OP(gvec_fcge_h, float16_cge, float16)
1404 DO_3OP(gvec_fcge_s, float32_cge, float32)
1405 DO_3OP(gvec_fcge_d, float64_cge, float64)
1406
1407 DO_3OP(gvec_fcgt_h, float16_cgt, float16)
1408 DO_3OP(gvec_fcgt_s, float32_cgt, float32)
1409 DO_3OP(gvec_fcgt_d, float64_cgt, float64)
1410
1411 DO_3OP(gvec_facge_h, float16_acge, float16)
1412 DO_3OP(gvec_facge_s, float32_acge, float32)
1413 DO_3OP(gvec_facge_d, float64_acge, float64)
1414
1415 DO_3OP(gvec_facgt_h, float16_acgt, float16)
1416 DO_3OP(gvec_facgt_s, float32_acgt, float32)
1417 DO_3OP(gvec_facgt_d, float64_acgt, float64)
1418
1419 DO_3OP(gvec_fmax_h, float16_max, float16)
1420 DO_3OP(gvec_fmax_s, float32_max, float32)
1421 DO_3OP(gvec_fmax_d, float64_max, float64)
1422
1423 DO_3OP(gvec_fmin_h, float16_min, float16)
1424 DO_3OP(gvec_fmin_s, float32_min, float32)
1425 DO_3OP(gvec_fmin_d, float64_min, float64)
1426
1427 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
1428 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
1429 DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)
1430
1431 DO_3OP(gvec_fminnum_h, float16_minnum, float16)
1432 DO_3OP(gvec_fminnum_s, float32_minnum, float32)
1433 DO_3OP(gvec_fminnum_d, float64_minnum, float64)
1434
1435 DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
1436 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
1437
1438 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
1439 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
1440
1441 #ifdef TARGET_AARCH64
1442 DO_3OP(gvec_fdiv_h, float16_div, float16)
1443 DO_3OP(gvec_fdiv_s, float32_div, float32)
1444 DO_3OP(gvec_fdiv_d, float64_div, float64)
1445
1446 DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
1447 DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
1448 DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)
1449
1450 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
1451 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
1452 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
1453
1454 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
1455 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
1456 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
1457
1458 #endif
1459 #undef DO_3OP
1460
1461 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
1462 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1463 float_status *stat)
1464 {
1465 return float16_add(dest, float16_mul(op1, op2, stat), stat);
1466 }
1467
1468 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1469 float_status *stat)
1470 {
1471 return float32_add(dest, float32_mul(op1, op2, stat), stat);
1472 }
1473
1474 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1475 float_status *stat)
1476 {
1477 return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1478 }
1479
1480 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1481 float_status *stat)
1482 {
1483 return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1484 }
1485
1486 /* Fused versions; these have the semantics Neon VFMA/VFMS want */
1487 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1488 float_status *stat)
1489 {
1490 return float16_muladd(op1, op2, dest, 0, stat);
1491 }
1492
1493 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1494 float_status *stat)
1495 {
1496 return float32_muladd(op1, op2, dest, 0, stat);
1497 }
1498
1499 static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
1500 float_status *stat)
1501 {
1502 return float64_muladd(op1, op2, dest, 0, stat);
1503 }
1504
1505 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1506 float_status *stat)
1507 {
1508 return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1509 }
1510
1511 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1512 float_status *stat)
1513 {
1514 return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1515 }
1516
1517 static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
1518 float_status *stat)
1519 {
1520 return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
1521 }
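/*
 * In all of the above, dest is the addend: the fused forms compute
 * fma(+/-op1, op2, dest) with a single rounding, while the _nf forms
 * round the product and the sum separately, matching the older Neon
 * VMLA/VMLS behaviour.
 */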
1522
1523 #define DO_MULADD(NAME, FUNC, TYPE) \
1524 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1525 { \
1526 intptr_t i, oprsz = simd_oprsz(desc); \
1527 TYPE *d = vd, *n = vn, *m = vm; \
1528 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1529 d[i] = FUNC(d[i], n[i], m[i], stat); \
1530 } \
1531 clear_tail(d, oprsz, simd_maxsz(desc)); \
1532 }
1533
1534 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1535 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1536
1537 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1538 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1539
1540 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1541 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1542 DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)
1543
1544 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1545 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1546 DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)
1547
1548 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
1549 * For AdvSIMD, there is of course only one such vector segment.
1550 */
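/*
 * For example, a 256-bit SVE multiply by element on float32 with
 * idx == 1 multiplies lanes 0..3 by m[1] (element 1 of the first
 * 128-bit segment) and lanes 4..7 by m[5] (element 1 of the second
 * segment).
 */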
1551
1552 #define DO_MUL_IDX(NAME, TYPE, H) \
1553 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1554 { \
1555 intptr_t i, j, oprsz = simd_oprsz(desc); \
1556 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1557 intptr_t idx = simd_data(desc); \
1558 TYPE *d = vd, *n = vn, *m = vm; \
1559 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1560 TYPE mm = m[H(i + idx)]; \
1561 for (j = 0; j < segment; j++) { \
1562 d[i + j] = n[i + j] * mm; \
1563 } \
1564 } \
1565 clear_tail(d, oprsz, simd_maxsz(desc)); \
1566 }
1567
1568 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1569 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1570 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1571
1572 #undef DO_MUL_IDX
1573
1574 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1575 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1576 { \
1577 intptr_t i, j, oprsz = simd_oprsz(desc); \
1578 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1579 intptr_t idx = simd_data(desc); \
1580 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1581 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1582 TYPE mm = m[H(i + idx)]; \
1583 for (j = 0; j < segment; j++) { \
1584 d[i + j] = a[i + j] OP n[i + j] * mm; \
1585 } \
1586 } \
1587 clear_tail(d, oprsz, simd_maxsz(desc)); \
1588 }
1589
1590 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1591 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1592 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1593
1594 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1595 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1596 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1597
1598 #undef DO_MLA_IDX
1599
1600 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H) \
1601 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1602 { \
1603 intptr_t i, j, oprsz = simd_oprsz(desc); \
1604 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1605 intptr_t idx = simd_data(desc); \
1606 TYPE *d = vd, *n = vn, *m = vm; \
1607 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1608 TYPE mm = m[H(i + idx)]; \
1609 for (j = 0; j < segment; j++) { \
1610 d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat); \
1611 } \
1612 } \
1613 clear_tail(d, oprsz, simd_maxsz(desc)); \
1614 }
1615
1616 #define nop(N, M, S) (M)
1617
1618 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
1619 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
1620 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)
1621
1622 #ifdef TARGET_AARCH64
1623
1624 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
1625 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
1626 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)
1627
1628 #endif
1629
1630 #undef nop
1631
1632 /*
1633 * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1634 * the fused ops below they accumulate both from and into Vd.
1635 */
1636 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
1637 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
1638 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
1639 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)
1640
1641 #undef DO_FMUL_IDX
1642
1643 #define DO_FMLA_IDX(NAME, TYPE, H) \
1644 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \
1645 void *stat, uint32_t desc) \
1646 { \
1647 intptr_t i, j, oprsz = simd_oprsz(desc); \
1648 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1649 TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \
1650 intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \
1651 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1652 op1_neg <<= (8 * sizeof(TYPE) - 1); \
1653 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1654 TYPE mm = m[H(i + idx)]; \
1655 for (j = 0; j < segment; j++) { \
1656 d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \
1657 mm, a[i + j], 0, stat); \
1658 } \
1659 } \
1660 clear_tail(d, oprsz, simd_maxsz(desc)); \
1661 }
1662
1663 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
1664 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
1665 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
1666
1667 #undef DO_FMLA_IDX
1668
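/*
 * Saturating add/subtract: compute in the wider WTYPE, clamp the result
 * to [MIN, MAX], and set the sticky QC flag (via vq) if any lane saturated.
 */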
1669 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1670 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \
1671 { \
1672 intptr_t i, oprsz = simd_oprsz(desc); \
1673 TYPEN *d = vd, *n = vn; TYPEM *m = vm; \
1674 bool q = false; \
1675 for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \
1676 WTYPE dd = (WTYPE)n[i] OP m[i]; \
1677 if (dd < MIN) { \
1678 dd = MIN; \
1679 q = true; \
1680 } else if (dd > MAX) { \
1681 dd = MAX; \
1682 q = true; \
1683 } \
1684 d[i] = dd; \
1685 } \
1686 if (q) { \
1687 uint32_t *qc = vq; \
1688 qc[0] = 1; \
1689 } \
1690 clear_tail(d, oprsz, simd_maxsz(desc)); \
1691 }
1692
1693 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1694 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1695 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1696
1697 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1698 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1699 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1700
1701 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1702 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1703 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1704
1705 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1706 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1707 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1708
1709 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
1710 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
1711 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)
1712
1713 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
1714 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
1715 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)
1716
1717 #undef DO_SAT
1718
1719 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1720 void *vm, uint32_t desc)
1721 {
1722 intptr_t i, oprsz = simd_oprsz(desc);
1723 uint64_t *d = vd, *n = vn, *m = vm;
1724 bool q = false;
1725
1726 for (i = 0; i < oprsz / 8; i++) {
1727 uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1728 if (dd < nn) {
1729 dd = UINT64_MAX;
1730 q = true;
1731 }
1732 d[i] = dd;
1733 }
1734 if (q) {
1735 uint32_t *qc = vq;
1736 qc[0] = 1;
1737 }
1738 clear_tail(d, oprsz, simd_maxsz(desc));
1739 }
1740
1741 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1742 void *vm, uint32_t desc)
1743 {
1744 intptr_t i, oprsz = simd_oprsz(desc);
1745 uint64_t *d = vd, *n = vn, *m = vm;
1746 bool q = false;
1747
1748 for (i = 0; i < oprsz / 8; i++) {
1749 uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1750 if (nn < mm) {
1751 dd = 0;
1752 q = true;
1753 }
1754 d[i] = dd;
1755 }
1756 if (q) {
1757 uint32_t *qc = vq;
1758 qc[0] = 1;
1759 }
1760 clear_tail(d, oprsz, simd_maxsz(desc));
1761 }
1762
1763 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1764 void *vm, uint32_t desc)
1765 {
1766 intptr_t i, oprsz = simd_oprsz(desc);
1767 int64_t *d = vd, *n = vn, *m = vm;
1768 bool q = false;
1769
1770 for (i = 0; i < oprsz / 8; i++) {
1771 int64_t nn = n[i], mm = m[i], dd = nn + mm;
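/*
 * Signed overflow: the operands have the same sign but the result's
 * sign differs; saturate towards the sign of the addends.
 */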
1772 if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1773 dd = (nn >> 63) ^ ~INT64_MIN;
1774 q = true;
1775 }
1776 d[i] = dd;
1777 }
1778 if (q) {
1779 uint32_t *qc = vq;
1780 qc[0] = 1;
1781 }
1782 clear_tail(d, oprsz, simd_maxsz(desc));
1783 }
1784
1785 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1786 void *vm, uint32_t desc)
1787 {
1788 intptr_t i, oprsz = simd_oprsz(desc);
1789 int64_t *d = vd, *n = vn, *m = vm;
1790 bool q = false;
1791
1792 for (i = 0; i < oprsz / 8; i++) {
1793 int64_t nn = n[i], mm = m[i], dd = nn - mm;
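/*
 * Signed overflow on subtraction: the operands have opposite signs and
 * the result's sign differs from nn; saturate towards the sign of nn.
 */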
1794 if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1795 dd = (nn >> 63) ^ ~INT64_MIN;
1796 q = true;
1797 }
1798 d[i] = dd;
1799 }
1800 if (q) {
1801 uint32_t *qc = vq;
1802 qc[0] = 1;
1803 }
1804 clear_tail(d, oprsz, simd_maxsz(desc));
1805 }
1806
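/* USQADD: add a signed operand to an unsigned accumulator, saturating to [0, UINT64_MAX]. */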
1807 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn,
1808 void *vm, uint32_t desc)
1809 {
1810 intptr_t i, oprsz = simd_oprsz(desc);
1811 uint64_t *d = vd, *n = vn, *m = vm;
1812 bool q = false;
1813
1814 for (i = 0; i < oprsz / 8; i++) {
1815 uint64_t nn = n[i];
1816 int64_t mm = m[i];
1817 uint64_t dd = nn + mm;
1818
1819 if (mm < 0) {
1820 if (nn < (uint64_t)-mm) {
1821 dd = 0;
1822 q = true;
1823 }
1824 } else {
1825 if (dd < nn) {
1826 dd = UINT64_MAX;
1827 q = true;
1828 }
1829 }
1830 d[i] = dd;
1831 }
1832 if (q) {
1833 uint32_t *qc = vq;
1834 qc[0] = 1;
1835 }
1836 clear_tail(d, oprsz, simd_maxsz(desc));
1837 }
1838
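/* SUQADD: add an unsigned operand to a signed accumulator, saturating at INT64_MAX. */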
1839 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn,
1840 void *vm, uint32_t desc)
1841 {
1842 intptr_t i, oprsz = simd_oprsz(desc);
1843 uint64_t *d = vd, *n = vn, *m = vm;
1844 bool q = false;
1845
1846 for (i = 0; i < oprsz / 8; i++) {
1847 int64_t nn = n[i];
1848 uint64_t mm = m[i];
1849 int64_t dd = nn + mm;
1850
1851 if (mm > (uint64_t)(INT64_MAX - nn)) {
1852 dd = INT64_MAX;
1853 q = true;
1854 }
1855 d[i] = dd;
1856 }
1857 if (q) {
1858 uint32_t *qc = vq;
1859 qc[0] = 1;
1860 }
1861 clear_tail(d, oprsz, simd_maxsz(desc));
1862 }
1863
1864 #define DO_SRA(NAME, TYPE) \
1865 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1866 { \
1867 intptr_t i, oprsz = simd_oprsz(desc); \
1868 int shift = simd_data(desc); \
1869 TYPE *d = vd, *n = vn; \
1870 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1871 d[i] += n[i] >> shift; \
1872 } \
1873 clear_tail(d, oprsz, simd_maxsz(desc)); \
1874 }
1875
1876 DO_SRA(gvec_ssra_b, int8_t)
1877 DO_SRA(gvec_ssra_h, int16_t)
1878 DO_SRA(gvec_ssra_s, int32_t)
1879 DO_SRA(gvec_ssra_d, int64_t)
1880
1881 DO_SRA(gvec_usra_b, uint8_t)
1882 DO_SRA(gvec_usra_h, uint16_t)
1883 DO_SRA(gvec_usra_s, uint32_t)
1884 DO_SRA(gvec_usra_d, uint64_t)
1885
1886 #undef DO_SRA
1887
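/*
 * Rounding shift right: shift by (shift - 1), then use the low bit of that
 * intermediate value as the rounding increment.  For example, with n = 7 and
 * shift = 2: tmp = 3, result = (3 >> 1) + (3 & 1) = 2, i.e. 7/4 rounded.
 */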
1888 #define DO_RSHR(NAME, TYPE) \
1889 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1890 { \
1891 intptr_t i, oprsz = simd_oprsz(desc); \
1892 int shift = simd_data(desc); \
1893 TYPE *d = vd, *n = vn; \
1894 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1895 TYPE tmp = n[i] >> (shift - 1); \
1896 d[i] = (tmp >> 1) + (tmp & 1); \
1897 } \
1898 clear_tail(d, oprsz, simd_maxsz(desc)); \
1899 }
1900
1901 DO_RSHR(gvec_srshr_b, int8_t)
1902 DO_RSHR(gvec_srshr_h, int16_t)
1903 DO_RSHR(gvec_srshr_s, int32_t)
1904 DO_RSHR(gvec_srshr_d, int64_t)
1905
1906 DO_RSHR(gvec_urshr_b, uint8_t)
1907 DO_RSHR(gvec_urshr_h, uint16_t)
1908 DO_RSHR(gvec_urshr_s, uint32_t)
1909 DO_RSHR(gvec_urshr_d, uint64_t)
1910
1911 #undef DO_RSHR
1912
1913 #define DO_RSRA(NAME, TYPE) \
1914 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1915 { \
1916 intptr_t i, oprsz = simd_oprsz(desc); \
1917 int shift = simd_data(desc); \
1918 TYPE *d = vd, *n = vn; \
1919 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1920 TYPE tmp = n[i] >> (shift - 1); \
1921 d[i] += (tmp >> 1) + (tmp & 1); \
1922 } \
1923 clear_tail(d, oprsz, simd_maxsz(desc)); \
1924 }
1925
1926 DO_RSRA(gvec_srsra_b, int8_t)
1927 DO_RSRA(gvec_srsra_h, int16_t)
1928 DO_RSRA(gvec_srsra_s, int32_t)
1929 DO_RSRA(gvec_srsra_d, int64_t)
1930
1931 DO_RSRA(gvec_ursra_b, uint8_t)
1932 DO_RSRA(gvec_ursra_h, uint16_t)
1933 DO_RSRA(gvec_ursra_s, uint32_t)
1934 DO_RSRA(gvec_ursra_d, uint64_t)
1935
1936 #undef DO_RSRA
1937
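/*
 * Shift Right and Insert: the top 'shift' bits of each destination element
 * are preserved, and the shifted source is deposited into the low bits.
 */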
1938 #define DO_SRI(NAME, TYPE) \
1939 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1940 { \
1941 intptr_t i, oprsz = simd_oprsz(desc); \
1942 int shift = simd_data(desc); \
1943 TYPE *d = vd, *n = vn; \
1944 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1945 d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
1946 } \
1947 clear_tail(d, oprsz, simd_maxsz(desc)); \
1948 }
1949
1950 DO_SRI(gvec_sri_b, uint8_t)
1951 DO_SRI(gvec_sri_h, uint16_t)
1952 DO_SRI(gvec_sri_s, uint32_t)
1953 DO_SRI(gvec_sri_d, uint64_t)
1954
1955 #undef DO_SRI
1956
1957 #define DO_SLI(NAME, TYPE) \
1958 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1959 { \
1960 intptr_t i, oprsz = simd_oprsz(desc); \
1961 int shift = simd_data(desc); \
1962 TYPE *d = vd, *n = vn; \
1963 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1964 d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
1965 } \
1966 clear_tail(d, oprsz, simd_maxsz(desc)); \
1967 }
1968
1969 DO_SLI(gvec_sli_b, uint8_t)
1970 DO_SLI(gvec_sli_h, uint16_t)
1971 DO_SLI(gvec_sli_s, uint32_t)
1972 DO_SLI(gvec_sli_d, uint64_t)
1973
1974 #undef DO_SLI
1975
1976 /*
1977 * Convert float16 to float32, raising no exceptions and
1978 * preserving exceptional values, including SNaN.
1979 * This is effectively an unpack+repack operation.
1980 */
1981 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
1982 {
1983 const int f16_bias = 15;
1984 const int f32_bias = 127;
1985 uint32_t sign = extract32(f16, 15, 1);
1986 uint32_t exp = extract32(f16, 10, 5);
1987 uint32_t frac = extract32(f16, 0, 10);
1988
1989 if (exp == 0x1f) {
1990 /* Inf or NaN */
1991 exp = 0xff;
1992 } else if (exp == 0) {
1993 /* Zero or denormal. */
1994 if (frac != 0) {
1995 if (fz16) {
1996 frac = 0;
1997 } else {
1998 /*
1999 * Denormal; these are all normal float32.
2000 * Shift the fraction so that the msb is at bit 11,
2001 * then remove bit 11 as the implicit bit of the
2002 * normalized float32. Note that we still go through
2003 * the shift for normal numbers below, to put the
2004 * float32 fraction at the right place.
2005 */
2006 int shift = clz32(frac) - 21;
2007 frac = (frac << shift) & 0x3ff;
2008 exp = f32_bias - f16_bias - shift + 1;
2009 }
2010 }
2011 } else {
2012 /* Normal number; adjust the bias. */
2013 exp += f32_bias - f16_bias;
2014 }
2015 sign <<= 31;
2016 exp <<= 23;
2017 frac <<= 23 - 10;
2018
2019 return sign | exp | frac;
2020 }
2021
2022 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
2023 {
2024 /*
2025 * Branchless load of u32[0], u64[0], u32[1], or u64[1].
2026 * Load the 2nd qword iff is_q & is_2.
2027 * Shift to the 2nd dword iff !is_q & is_2.
2028 * For !is_q & !is_2, the upper bits of the result are garbage.
2029 */
2030 return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
2031 }
2032
2033 /*
2034 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
2035 * as there are not yet SVE versions that might use blocking.
2036 */
2037
2038 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
2039 uint32_t desc, bool fz16)
2040 {
2041 intptr_t i, oprsz = simd_oprsz(desc);
2042 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2043 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2044 int is_q = oprsz == 16;
2045 uint64_t n_4, m_4;
2046
2047 /* Pre-load all of the f16 data, avoiding overlap issues. */
2048 n_4 = load4_f16(vn, is_q, is_2);
2049 m_4 = load4_f16(vm, is_q, is_2);
2050
2051 /* Negate all inputs for FMLSL at once. */
2052 if (is_s) {
2053 n_4 ^= 0x8000800080008000ull;
2054 }
2055
2056 for (i = 0; i < oprsz / 4; i++) {
2057 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2058 float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
2059 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2060 }
2061 clear_tail(d, oprsz, simd_maxsz(desc));
2062 }
2063
2064 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
2065 void *venv, uint32_t desc)
2066 {
2067 CPUARMState *env = venv;
2068 do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2069 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2070 }
2071
2072 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
2073 void *venv, uint32_t desc)
2074 {
2075 CPUARMState *env = venv;
2076 do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
2077 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2078 }
2079
2080 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
2081 void *venv, uint32_t desc)
2082 {
2083 intptr_t i, oprsz = simd_oprsz(desc);
2084 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2085 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2086 CPUARMState *env = venv;
2087 float_status *status = &env->vfp.fp_status;
2088 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
2089
2090 for (i = 0; i < oprsz; i += sizeof(float32)) {
2091 float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
2092 float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
2093 float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2094 float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2095 float32 aa = *(float32 *)(va + H1_4(i));
2096
2097 *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
2098 }
2099 }
2100
2101 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
2102 uint32_t desc, bool fz16)
2103 {
2104 intptr_t i, oprsz = simd_oprsz(desc);
2105 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2106 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2107 int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
2108 int is_q = oprsz == 16;
2109 uint64_t n_4;
2110 float32 m_1;
2111
2112 /* Pre-load all of the f16 data, avoiding overlap issues. */
2113 n_4 = load4_f16(vn, is_q, is_2);
2114
2115 /* Negate all inputs for FMLSL at once. */
2116 if (is_s) {
2117 n_4 ^= 0x8000800080008000ull;
2118 }
2119
2120 m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
2121
2122 for (i = 0; i < oprsz / 4; i++) {
2123 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2124 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2125 }
2126 clear_tail(d, oprsz, simd_maxsz(desc));
2127 }
2128
2129 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
2130 void *venv, uint32_t desc)
2131 {
2132 CPUARMState *env = venv;
2133 do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2134 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2135 }
2136
2137 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
2138 void *venv, uint32_t desc)
2139 {
2140 CPUARMState *env = venv;
2141 do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
2142 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2143 }
2144
2145 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
2146 void *venv, uint32_t desc)
2147 {
2148 intptr_t i, j, oprsz = simd_oprsz(desc);
2149 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2150 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2151 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
2152 CPUARMState *env = venv;
2153 float_status *status = &env->vfp.fp_status;
2154 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
2155
2156 for (i = 0; i < oprsz; i += 16) {
2157 float16 mm_16 = *(float16 *)(vm + i + idx);
2158 float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2159
2160 for (j = 0; j < 16; j += sizeof(float32)) {
2161 float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
2162 float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2163 float32 aa = *(float32 *)(va + H1_4(i + j));
2164
2165 *(float32 *)(vd + H1_4(i + j)) =
2166 float32_muladd(nn, mm, aa, 0, status);
2167 }
2168 }
2169 }
2170
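/*
 * SSHL/USHL: the shift count is a signed byte taken from the corresponding
 * element of vm; positive counts shift left, negative counts shift right.
 * Left shifts of esize or more produce zero.  Right shifts of esize or more
 * produce zero for the unsigned forms and the sign fill (a shift clamped to
 * esize - 1) for the signed forms.
 */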
2171 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2172 {
2173 intptr_t i, opr_sz = simd_oprsz(desc);
2174 int8_t *d = vd, *n = vn, *m = vm;
2175
2176 for (i = 0; i < opr_sz; ++i) {
2177 int8_t mm = m[i];
2178 int8_t nn = n[i];
2179 int8_t res = 0;
2180 if (mm >= 0) {
2181 if (mm < 8) {
2182 res = nn << mm;
2183 }
2184 } else {
2185 res = nn >> (mm > -8 ? -mm : 7);
2186 }
2187 d[i] = res;
2188 }
2189 clear_tail(d, opr_sz, simd_maxsz(desc));
2190 }
2191
2192 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2193 {
2194 intptr_t i, opr_sz = simd_oprsz(desc);
2195 int16_t *d = vd, *n = vn, *m = vm;
2196
2197 for (i = 0; i < opr_sz / 2; ++i) {
2198 int8_t mm = m[i]; /* only 8 bits of shift are significant */
2199 int16_t nn = n[i];
2200 int16_t res = 0;
2201 if (mm >= 0) {
2202 if (mm < 16) {
2203 res = nn << mm;
2204 }
2205 } else {
2206 res = nn >> (mm > -16 ? -mm : 15);
2207 }
2208 d[i] = res;
2209 }
2210 clear_tail(d, opr_sz, simd_maxsz(desc));
2211 }
2212
2213 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2214 {
2215 intptr_t i, opr_sz = simd_oprsz(desc);
2216 uint8_t *d = vd, *n = vn, *m = vm;
2217
2218 for (i = 0; i < opr_sz; ++i) {
2219 int8_t mm = m[i];
2220 uint8_t nn = n[i];
2221 uint8_t res = 0;
2222 if (mm >= 0) {
2223 if (mm < 8) {
2224 res = nn << mm;
2225 }
2226 } else {
2227 if (mm > -8) {
2228 res = nn >> -mm;
2229 }
2230 }
2231 d[i] = res;
2232 }
2233 clear_tail(d, opr_sz, simd_maxsz(desc));
2234 }
2235
2236 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2237 {
2238 intptr_t i, opr_sz = simd_oprsz(desc);
2239 uint16_t *d = vd, *n = vn, *m = vm;
2240
2241 for (i = 0; i < opr_sz / 2; ++i) {
2242 int8_t mm = m[i]; /* only 8 bits of shift are significant */
2243 uint16_t nn = n[i];
2244 uint16_t res = 0;
2245 if (mm >= 0) {
2246 if (mm < 16) {
2247 res = nn << mm;
2248 }
2249 } else {
2250 if (mm > -16) {
2251 res = nn >> -mm;
2252 }
2253 }
2254 d[i] = res;
2255 }
2256 clear_tail(d, opr_sz, simd_maxsz(desc));
2257 }
2258
2259 /*
2260 * 8x8->8 polynomial multiply.
2261 *
2262 * Polynomial multiplication is like integer multiplication except the
2263 * partial products are XORed, not added.
2264 *
2265 * TODO: expose this as a generic vector operation, as it is a common
2266 * crypto building block.
2267 */
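/* For example, 0x3 * 0x3 = 0x5 here, since (x + 1)(x + 1) = x^2 + 1 over GF(2). */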
2268 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
2269 {
2270 intptr_t i, opr_sz = simd_oprsz(desc);
2271 uint64_t *d = vd, *n = vn, *m = vm;
2272
2273 for (i = 0; i < opr_sz / 8; ++i) {
2274 d[i] = clmul_8x8_low(n[i], m[i]);
2275 }
2276 clear_tail(d, opr_sz, simd_maxsz(desc));
2277 }
2278
2279 /*
2280 * 64x64->128 polynomial multiply.
2281 * Because the lanes are not accessed in strict columns,
2282 * this probably cannot be turned into a generic helper.
2283 */
2284 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2285 {
2286 intptr_t i, opr_sz = simd_oprsz(desc);
2287 intptr_t hi = simd_data(desc);
2288 uint64_t *d = vd, *n = vn, *m = vm;
2289
2290 for (i = 0; i < opr_sz / 8; i += 2) {
2291 Int128 r = clmul_64(n[i + hi], m[i + hi]);
2292 d[i] = int128_getlo(r);
2293 d[i + 1] = int128_gethi(r);
2294 }
2295 clear_tail(d, opr_sz, simd_maxsz(desc));
2296 }
2297
2298 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2299 {
2300 int hi = simd_data(desc);
2301 uint64_t *d = vd, *n = vn, *m = vm;
2302 uint64_t nn = n[hi], mm = m[hi];
2303
2304 d[0] = clmul_8x4_packed(nn, mm);
2305 nn >>= 32;
2306 mm >>= 32;
2307 d[1] = clmul_8x4_packed(nn, mm);
2308
2309 clear_tail(d, 16, simd_maxsz(desc));
2310 }
2311
2312 #ifdef TARGET_AARCH64
2313 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2314 {
2315 int shift = simd_data(desc) * 8;
2316 intptr_t i, opr_sz = simd_oprsz(desc);
2317 uint64_t *d = vd, *n = vn, *m = vm;
2318
2319 for (i = 0; i < opr_sz / 8; ++i) {
2320 d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2321 }
2322 }
2323
2324 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2325 {
2326 intptr_t sel = H4(simd_data(desc));
2327 intptr_t i, opr_sz = simd_oprsz(desc);
2328 uint32_t *n = vn, *m = vm;
2329 uint64_t *d = vd;
2330
2331 for (i = 0; i < opr_sz / 8; ++i) {
2332 d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
2333 }
2334 }
2335 #endif
2336
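/*
 * Compare each element against zero: the 0/1 result of the comparison is
 * negated to produce an all-ones (true) or all-zeros (false) element mask.
 */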
2337 #define DO_CMP0(NAME, TYPE, OP) \
2338 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2339 { \
2340 intptr_t i, opr_sz = simd_oprsz(desc); \
2341 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
2342 TYPE nn = *(TYPE *)(vn + i); \
2343 *(TYPE *)(vd + i) = -(nn OP 0); \
2344 } \
2345 clear_tail(vd, opr_sz, simd_maxsz(desc)); \
2346 }
2347
2348 DO_CMP0(gvec_ceq0_b, int8_t, ==)
2349 DO_CMP0(gvec_clt0_b, int8_t, <)
2350 DO_CMP0(gvec_cle0_b, int8_t, <=)
2351 DO_CMP0(gvec_cgt0_b, int8_t, >)
2352 DO_CMP0(gvec_cge0_b, int8_t, >=)
2353
2354 DO_CMP0(gvec_ceq0_h, int16_t, ==)
2355 DO_CMP0(gvec_clt0_h, int16_t, <)
2356 DO_CMP0(gvec_cle0_h, int16_t, <=)
2357 DO_CMP0(gvec_cgt0_h, int16_t, >)
2358 DO_CMP0(gvec_cge0_h, int16_t, >=)
2359
2360 #undef DO_CMP0
2361
2362 #define DO_ABD(NAME, TYPE) \
2363 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2364 { \
2365 intptr_t i, opr_sz = simd_oprsz(desc); \
2366 TYPE *d = vd, *n = vn, *m = vm; \
2367 \
2368 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \
2369 d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \
2370 } \
2371 clear_tail(d, opr_sz, simd_maxsz(desc)); \
2372 }
2373
2374 DO_ABD(gvec_sabd_b, int8_t)
2375 DO_ABD(gvec_sabd_h, int16_t)
2376 DO_ABD(gvec_sabd_s, int32_t)
2377 DO_ABD(gvec_sabd_d, int64_t)
2378
2379 DO_ABD(gvec_uabd_b, uint8_t)
2380 DO_ABD(gvec_uabd_h, uint16_t)
2381 DO_ABD(gvec_uabd_s, uint32_t)
2382 DO_ABD(gvec_uabd_d, uint64_t)
2383
2384 #undef DO_ABD
2385
2386 #define DO_ABA(NAME, TYPE) \
2387 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2388 { \
2389 intptr_t i, opr_sz = simd_oprsz(desc); \
2390 TYPE *d = vd, *n = vn, *m = vm; \
2391 \
2392 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \
2393 d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \
2394 } \
2395 clear_tail(d, opr_sz, simd_maxsz(desc)); \
2396 }
2397
2398 DO_ABA(gvec_saba_b, int8_t)
2399 DO_ABA(gvec_saba_h, int16_t)
2400 DO_ABA(gvec_saba_s, int32_t)
2401 DO_ABA(gvec_saba_d, int64_t)
2402
2403 DO_ABA(gvec_uaba_b, uint8_t)
2404 DO_ABA(gvec_uaba_h, uint16_t)
2405 DO_ABA(gvec_uaba_s, uint32_t)
2406 DO_ABA(gvec_uaba_d, uint64_t)
2407
2408 #undef DO_ABA
2409
2410 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2411 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
2412 { \
2413 ARMVectorReg scratch; \
2414 intptr_t oprsz = simd_oprsz(desc); \
2415 intptr_t half = oprsz / sizeof(TYPE) / 2; \
2416 TYPE *d = vd, *n = vn, *m = vm; \
2417 if (unlikely(d == m)) { \
2418 m = memcpy(&scratch, m, oprsz); \
2419 } \
2420 for (intptr_t i = 0; i < half; ++i) { \
2421 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat); \
2422 } \
2423 for (intptr_t i = 0; i < half; ++i) { \
2424 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat); \
2425 } \
2426 clear_tail(d, oprsz, simd_maxsz(desc)); \
2427 }
2428
2429 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2)
2430 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4)
2431 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, )
2432
2433 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2)
2434 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4)
2435 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, )
2436
2437 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2)
2438 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4)
2439 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, )
2440
2441 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2)
2442 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4)
2443 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, )
2444
2445 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2)
2446 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4)
2447 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, )
2448
2449 #undef DO_3OP_PAIR
2450
2451 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2452 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2453 { \
2454 ARMVectorReg scratch; \
2455 intptr_t oprsz = simd_oprsz(desc); \
2456 intptr_t half = oprsz / sizeof(TYPE) / 2; \
2457 TYPE *d = vd, *n = vn, *m = vm; \
2458 if (unlikely(d == m)) { \
2459 m = memcpy(&scratch, m, oprsz); \
2460 } \
2461 for (intptr_t i = 0; i < half; ++i) { \
2462 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]); \
2463 } \
2464 for (intptr_t i = 0; i < half; ++i) { \
2465 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]); \
2466 } \
2467 clear_tail(d, oprsz, simd_maxsz(desc)); \
2468 }
2469
2470 #define ADD(A, B) (A + B)
2471 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1)
2472 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2)
2473 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4)
2474 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, )
2475 #undef ADD
2476
2477 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1)
2478 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2)
2479 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4)
2480
2481 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1)
2482 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2)
2483 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4)
2484
2485 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1)
2486 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2)
2487 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4)
2488
2489 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1)
2490 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2)
2491 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4)
2492
2493 #undef DO_3OP_PAIR
2494
2495 #define DO_VCVT_FIXED(NAME, FUNC, TYPE) \
2496 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
2497 { \
2498 intptr_t i, oprsz = simd_oprsz(desc); \
2499 int shift = simd_data(desc); \
2500 TYPE *d = vd, *n = vn; \
2501 float_status *fpst = stat; \
2502 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
2503 d[i] = FUNC(n[i], shift, fpst); \
2504 } \
2505 clear_tail(d, oprsz, simd_maxsz(desc)); \
2506 }
2507
2508 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2509 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2510 DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2511 DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
2512 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2513 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2514 DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2515 DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2516
2517 #undef DO_VCVT_FIXED
2518
2519 #define DO_VCVT_RMODE(NAME, FUNC, TYPE) \
2520 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
2521 { \
2522 float_status *fpst = stat; \
2523 intptr_t i, oprsz = simd_oprsz(desc); \
2524 uint32_t rmode = simd_data(desc); \
2525 uint32_t prev_rmode = get_float_rounding_mode(fpst); \
2526 TYPE *d = vd, *n = vn; \
2527 set_float_rounding_mode(rmode, fpst); \
2528 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
2529 d[i] = FUNC(n[i], 0, fpst); \
2530 } \
2531 set_float_rounding_mode(prev_rmode, fpst); \
2532 clear_tail(d, oprsz, simd_maxsz(desc)); \
2533 }
2534
2535 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2536 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2537 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2538 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2539
2540 #undef DO_VCVT_RMODE
2541
2542 #define DO_VRINT_RMODE(NAME, FUNC, TYPE) \
2543 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
2544 { \
2545 float_status *fpst = stat; \
2546 intptr_t i, oprsz = simd_oprsz(desc); \
2547 uint32_t rmode = simd_data(desc); \
2548 uint32_t prev_rmode = get_float_rounding_mode(fpst); \
2549 TYPE *d = vd, *n = vn; \
2550 set_float_rounding_mode(rmode, fpst); \
2551 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
2552 d[i] = FUNC(n[i], fpst); \
2553 } \
2554 set_float_rounding_mode(prev_rmode, fpst); \
2555 clear_tail(d, oprsz, simd_maxsz(desc)); \
2556 }
2557
2558 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2559 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2560
2561 #undef DO_VRINT_RMODE
2562
2563 #ifdef TARGET_AARCH64
2564 void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
2565 {
2566 const uint8_t *indices = vm;
2567 CPUARMState *env = venv;
2568 size_t oprsz = simd_oprsz(desc);
2569 uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2570 bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2571 uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2572 union {
2573 uint8_t b[16];
2574 uint64_t d[2];
2575 } result;
2576
2577 /*
2578 * We must construct the final result in a temp, lest the output
2579 * overlaps the input table. For TBL, begin with zero; for TBX,
2580 * begin with the original register contents. Note that we always
2581 * copy 16 bytes here to avoid an extra branch; clearing the high
2582 * bits of the register for oprsz == 8 is handled below.
2583 */
2584 if (is_tbx) {
2585 memcpy(&result, vd, 16);
2586 } else {
2587 memset(&result, 0, 16);
2588 }
2589
2590 for (size_t i = 0; i < oprsz; ++i) {
2591 uint32_t index = indices[H1(i)];
2592
2593 if (index < table_len) {
2594 /*
2595 * Convert index (a byte offset into the virtual table
2596 * which is a series of 128-bit vectors concatenated)
2597 * into the correct register element, bearing in mind
2598 * that the table can wrap around from V31 to V0.
2599 */
2600 const uint8_t *table = (const uint8_t *)
2601 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2602 result.b[H1(i)] = table[H1(index % 16)];
2603 }
2604 }
2605
2606 memcpy(vd, &result, 16);
2607 clear_tail(vd, oprsz, simd_maxsz(desc));
2608 }
2609 #endif
2610
2611 /*
2612 * NxN -> N highpart multiply
2613 *
2614 * TODO: expose this as a generic vector operation.
2615 */
2616
2617 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2618 {
2619 intptr_t i, opr_sz = simd_oprsz(desc);
2620 int8_t *d = vd, *n = vn, *m = vm;
2621
2622 for (i = 0; i < opr_sz; ++i) {
2623 d[i] = ((int32_t)n[i] * m[i]) >> 8;
2624 }
2625 clear_tail(d, opr_sz, simd_maxsz(desc));
2626 }
2627
2628 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2629 {
2630 intptr_t i, opr_sz = simd_oprsz(desc);
2631 int16_t *d = vd, *n = vn, *m = vm;
2632
2633 for (i = 0; i < opr_sz / 2; ++i) {
2634 d[i] = ((int32_t)n[i] * m[i]) >> 16;
2635 }
2636 clear_tail(d, opr_sz, simd_maxsz(desc));
2637 }
2638
2639 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2640 {
2641 intptr_t i, opr_sz = simd_oprsz(desc);
2642 int32_t *d = vd, *n = vn, *m = vm;
2643
2644 for (i = 0; i < opr_sz / 4; ++i) {
2645 d[i] = ((int64_t)n[i] * m[i]) >> 32;
2646 }
2647 clear_tail(d, opr_sz, simd_maxsz(desc));
2648 }
2649
2650 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2651 {
2652 intptr_t i, opr_sz = simd_oprsz(desc);
2653 uint64_t *d = vd, *n = vn, *m = vm;
2654 uint64_t discard;
2655
2656 for (i = 0; i < opr_sz / 8; ++i) {
2657 muls64(&discard, &d[i], n[i], m[i]);
2658 }
2659 clear_tail(d, opr_sz, simd_maxsz(desc));
2660 }
2661
2662 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2663 {
2664 intptr_t i, opr_sz = simd_oprsz(desc);
2665 uint8_t *d = vd, *n = vn, *m = vm;
2666
2667 for (i = 0; i < opr_sz; ++i) {
2668 d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2669 }
2670 clear_tail(d, opr_sz, simd_maxsz(desc));
2671 }
2672
2673 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2674 {
2675 intptr_t i, opr_sz = simd_oprsz(desc);
2676 uint16_t *d = vd, *n = vn, *m = vm;
2677
2678 for (i = 0; i < opr_sz / 2; ++i) {
2679 d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2680 }
2681 clear_tail(d, opr_sz, simd_maxsz(desc));
2682 }
2683
2684 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2685 {
2686 intptr_t i, opr_sz = simd_oprsz(desc);
2687 uint32_t *d = vd, *n = vn, *m = vm;
2688
2689 for (i = 0; i < opr_sz / 4; ++i) {
2690 d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2691 }
2692 clear_tail(d, opr_sz, simd_maxsz(desc));
2693 }
2694
2695 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2696 {
2697 intptr_t i, opr_sz = simd_oprsz(desc);
2698 uint64_t *d = vd, *n = vn, *m = vm;
2699 uint64_t discard;
2700
2701 for (i = 0; i < opr_sz / 8; ++i) {
2702 mulu64(&discard, &d[i], n[i], m[i]);
2703 }
2704 clear_tail(d, opr_sz, simd_maxsz(desc));
2705 }
2706
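/* XAR: exclusive-OR the operands, then rotate each 64-bit lane right by shr. */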
2707 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2708 {
2709 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2710 int shr = simd_data(desc);
2711 uint64_t *d = vd, *n = vn, *m = vm;
2712
2713 for (i = 0; i < opr_sz; ++i) {
2714 d[i] = ror64(n[i] ^ m[i], shr);
2715 }
2716 clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2717 }
2718
2719 /*
2720 * Integer matrix-multiply accumulate
2721 */
2722
2723 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2724 {
2725 int8_t *n = vn, *m = vm;
2726
2727 for (intptr_t k = 0; k < 8; ++k) {
2728 sum += n[H1(k)] * m[H1(k)];
2729 }
2730 return sum;
2731 }
2732
2733 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2734 {
2735 uint8_t *n = vn, *m = vm;
2736
2737 for (intptr_t k = 0; k < 8; ++k) {
2738 sum += n[H1(k)] * m[H1(k)];
2739 }
2740 return sum;
2741 }
2742
2743 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2744 {
2745 uint8_t *n = vn;
2746 int8_t *m = vm;
2747
2748 for (intptr_t k = 0; k < 8; ++k) {
2749 sum += n[H1(k)] * m[H1(k)];
2750 }
2751 return sum;
2752 }
2753
2754 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2755 uint32_t (*inner_loop)(uint32_t, void *, void *))
2756 {
2757 intptr_t seg, opr_sz = simd_oprsz(desc);
2758
2759 for (seg = 0; seg < opr_sz; seg += 16) {
2760 uint32_t *d = vd + seg;
2761 uint32_t *a = va + seg;
2762 uint32_t sum0, sum1, sum2, sum3;
2763
2764 /*
2765 * Process the entire segment at once, writing back the
2766 * results only after we've consumed all of the inputs.
2767 *
2768 * Key to indices by column:
2769 * i j i j
2770 */
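/* 2x2 result tile: each sum accumulates a row of n dotted with a row of m. */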
2771 sum0 = a[H4(0 + 0)];
2772 sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
2773 sum1 = a[H4(0 + 1)];
2774 sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
2775 sum2 = a[H4(2 + 0)];
2776 sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
2777 sum3 = a[H4(2 + 1)];
2778 sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
2779
2780 d[H4(0)] = sum0;
2781 d[H4(1)] = sum1;
2782 d[H4(2)] = sum2;
2783 d[H4(3)] = sum3;
2784 }
2785 clear_tail(vd, opr_sz, simd_maxsz(desc));
2786 }
2787
2788 #define DO_MMLA_B(NAME, INNER) \
2789 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2790 { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2791
2792 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2793 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2794 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2795
2796 /*
2797 * BFloat16 Dot Product
2798 */
2799
2800 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp)
2801 {
2802 /*
2803 * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF.
2804 * For EBF = 0, we ignore the FPCR bits which determine rounding
2805 * mode and denormal-flushing, and we do unfused multiplies and
2806 * additions with intermediate rounding of all products and sums.
2807 * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits,
2808 * and we perform a fused two-way sum-of-products without intermediate
2809 * rounding of the products.
2810 * In either case, we don't set fp exception flags.
2811 *
2812 * EBF is AArch64 only, so even if it's set in the FPCR it has
2813 * no effect on AArch32 instructions.
2814 */
2815 bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF;
2816 *statusp = (float_status){
2817 .tininess_before_rounding = float_tininess_before_rounding,
2818 .float_rounding_mode = float_round_to_odd_inf,
2819 .flush_to_zero = true,
2820 .flush_inputs_to_zero = true,
2821 .default_nan_mode = true,
2822 };
2823
2824 if (ebf) {
2825 float_status *fpst = &env->vfp.fp_status;
2826 set_flush_to_zero(get_flush_to_zero(fpst), statusp);
2827 set_flush_inputs_to_zero(get_flush_inputs_to_zero(fpst), statusp);
2828 set_float_rounding_mode(get_float_rounding_mode(fpst), statusp);
2829
2830 /* EBF=1 needs to do a step with round-to-odd semantics */
2831 *oddstatusp = *statusp;
2832 set_float_rounding_mode(float_round_to_odd, oddstatusp);
2833 }
2834
2835 return ebf;
2836 }
2837
2838 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst)
2839 {
2840 float32 t1, t2;
2841
2842 /*
2843 * Extract each BFloat16 from the element pair, and shift
2844 * them such that they become float32.
2845 */
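/* EBF == 0: unfused two-way dot product with intermediate rounding. */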
2846 t1 = float32_mul(e1 << 16, e2 << 16, fpst);
2847 t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst);
2848 t1 = float32_add(t1, t2, fpst);
2849 t1 = float32_add(sum, t1, fpst);
2850
2851 return t1;
2852 }
2853
2854 float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2,
2855 float_status *fpst, float_status *fpst_odd)
2856 {
2857 /*
2858 * Compare f16_dotadd() in sme_helper.c, but here we have
2859 * bfloat16 inputs. In particular that means that we do not
2860 * want the FPCR.FZ16 flush semantics, so we use the normal
2861 * float_status for the input handling here.
2862 */
2863 float64 e1r = float32_to_float64(e1 << 16, fpst);
2864 float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst);
2865 float64 e2r = float32_to_float64(e2 << 16, fpst);
2866 float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst);
2867 float64 t64;
2868 float32 t32;
2869
2870 /*
2871 * The ARM pseudocode function FPDot performs both multiplies
2872 * and the add with a single rounding operation. Emulate this
2873 * by performing the first multiply in round-to-odd, then doing
2874 * the second multiply as fused multiply-add, and rounding to
2875 * float32 all in one step.
2876 */
2877 t64 = float64_mul(e1r, e2r, fpst_odd);
2878 t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst);
2879
2880 /* This conversion is exact, because we've already rounded. */
2881 t32 = float64_to_float32(t64, fpst);
2882
2883 /* The final accumulation step is not fused. */
2884 return float32_add(sum, t32, fpst);
2885 }
2886
2887 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va,
2888 CPUARMState *env, uint32_t desc)
2889 {
2890 intptr_t i, opr_sz = simd_oprsz(desc);
2891 float32 *d = vd, *a = va;
2892 uint32_t *n = vn, *m = vm;
2893 float_status fpst, fpst_odd;
2894
2895 if (is_ebf(env, &fpst, &fpst_odd)) {
2896 for (i = 0; i < opr_sz / 4; ++i) {
2897 d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd);
2898 }
2899 } else {
2900 for (i = 0; i < opr_sz / 4; ++i) {
2901 d[i] = bfdotadd(a[i], n[i], m[i], &fpst);
2902 }
2903 }
2904 clear_tail(d, opr_sz, simd_maxsz(desc));
2905 }
2906
2907 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
2908 void *va, CPUARMState *env, uint32_t desc)
2909 {
2910 intptr_t i, j, opr_sz = simd_oprsz(desc);
2911 intptr_t index = simd_data(desc);
2912 intptr_t elements = opr_sz / 4;
2913 intptr_t eltspersegment = MIN(16 / 4, elements);
2914 float32 *d = vd, *a = va;
2915 uint32_t *n = vn, *m = vm;
2916 float_status fpst, fpst_odd;
2917
2918 if (is_ebf(env, &fpst, &fpst_odd)) {
2919 for (i = 0; i < elements; i += eltspersegment) {
2920 uint32_t m_idx = m[i + H4(index)];
2921
2922 for (j = i; j < i + eltspersegment; j++) {
2923 d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd);
2924 }
2925 }
2926 } else {
2927 for (i = 0; i < elements; i += eltspersegment) {
2928 uint32_t m_idx = m[i + H4(index)];
2929
2930 for (j = i; j < i + eltspersegment; j++) {
2931 d[j] = bfdotadd(a[j], n[j], m_idx, &fpst);
2932 }
2933 }
2934 }
2935 clear_tail(d, opr_sz, simd_maxsz(desc));
2936 }
2937
2938 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va,
2939 CPUARMState *env, uint32_t desc)
2940 {
2941 intptr_t s, opr_sz = simd_oprsz(desc);
2942 float32 *d = vd, *a = va;
2943 uint32_t *n = vn, *m = vm;
2944 float_status fpst, fpst_odd;
2945
2946 if (is_ebf(env, &fpst, &fpst_odd)) {
2947 for (s = 0; s < opr_sz / 4; s += 4) {
2948 float32 sum00, sum01, sum10, sum11;
2949
2950 /*
2951 * Process the entire segment at once, writing back the
2952 * results only after we've consumed all of the inputs.
2953 *
2954 * Key to indices by column:
2955 * i j i k j k
2956 */
2957 sum00 = a[s + H4(0 + 0)];
2958 sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
2959 sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
2960
2961 sum01 = a[s + H4(0 + 1)];
2962 sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
2963 sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
2964
2965 sum10 = a[s + H4(2 + 0)];
2966 sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
2967 sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
2968
2969 sum11 = a[s + H4(2 + 1)];
2970 sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
2971 sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
2972
2973 d[s + H4(0 + 0)] = sum00;
2974 d[s + H4(0 + 1)] = sum01;
2975 d[s + H4(2 + 0)] = sum10;
2976 d[s + H4(2 + 1)] = sum11;
2977 }
2978 } else {
2979 for (s = 0; s < opr_sz / 4; s += 4) {
2980 float32 sum00, sum01, sum10, sum11;
2981
2982 /*
2983 * Process the entire segment at once, writing back the
2984 * results only after we've consumed all of the inputs.
2985 *
2986 * Key to indices by column:
2987 * i j i k j k
2988 */
2989 sum00 = a[s + H4(0 + 0)];
2990 sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst);
2991 sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst);
2992
2993 sum01 = a[s + H4(0 + 1)];
2994 sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst);
2995 sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst);
2996
2997 sum10 = a[s + H4(2 + 0)];
2998 sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst);
2999 sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst);
3000
3001 sum11 = a[s + H4(2 + 1)];
3002 sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst);
3003 sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst);
3004
3005 d[s + H4(0 + 0)] = sum00;
3006 d[s + H4(0 + 1)] = sum01;
3007 d[s + H4(2 + 0)] = sum10;
3008 d[s + H4(2 + 1)] = sum11;
3009 }
3010 }
3011 clear_tail(d, opr_sz, simd_maxsz(desc));
3012 }
3013
3014 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
3015 void *stat, uint32_t desc)
3016 {
3017 intptr_t i, opr_sz = simd_oprsz(desc);
3018 intptr_t sel = simd_data(desc);
3019 float32 *d = vd, *a = va;
3020 bfloat16 *n = vn, *m = vm;
3021
3022 for (i = 0; i < opr_sz / 4; ++i) {
3023 float32 nn = n[H2(i * 2 + sel)] << 16;
3024 float32 mm = m[H2(i * 2 + sel)] << 16;
3025 d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
3026 }
3027 clear_tail(d, opr_sz, simd_maxsz(desc));
3028 }
3029
3030 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
3031 void *va, void *stat, uint32_t desc)
3032 {
3033 intptr_t i, j, opr_sz = simd_oprsz(desc);
3034 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
3035 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
3036 intptr_t elements = opr_sz / 4;
3037 intptr_t eltspersegment = MIN(16 / 4, elements);
3038 float32 *d = vd, *a = va;
3039 bfloat16 *n = vn, *m = vm;
3040
3041 for (i = 0; i < elements; i += eltspersegment) {
3042 float32 m_idx = m[H2(2 * i + index)] << 16;
3043
3044 for (j = i; j < i + eltspersegment; j++) {
3045 float32 n_j = n[H2(2 * j + sel)] << 16;
3046 d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
3047 }
3048 }
3049 clear_tail(d, opr_sz, simd_maxsz(desc));
3050 }
3051
3052 #define DO_CLAMP(NAME, TYPE) \
3053 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \
3054 { \
3055 intptr_t i, opr_sz = simd_oprsz(desc); \
3056 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
3057 TYPE aa = *(TYPE *)(a + i); \
3058 TYPE nn = *(TYPE *)(n + i); \
3059 TYPE mm = *(TYPE *)(m + i); \
3060 TYPE dd = MIN(MAX(aa, nn), mm); \
3061 *(TYPE *)(d + i) = dd; \
3062 } \
3063 clear_tail(d, opr_sz, simd_maxsz(desc)); \
3064 }
3065
3066 DO_CLAMP(gvec_sclamp_b, int8_t)
3067 DO_CLAMP(gvec_sclamp_h, int16_t)
3068 DO_CLAMP(gvec_sclamp_s, int32_t)
3069 DO_CLAMP(gvec_sclamp_d, int64_t)
3070
3071 DO_CLAMP(gvec_uclamp_b, uint8_t)
3072 DO_CLAMP(gvec_uclamp_h, uint16_t)
3073 DO_CLAMP(gvec_uclamp_s, uint32_t)
3074 DO_CLAMP(gvec_uclamp_d, uint64_t)
3075