1 /*
2 * ARM AdvSIMD / SVE Vector Operations
3 *
4 * Copyright (c) 2018 Linaro
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/helper-proto.h"
23 #include "tcg/tcg-gvec-desc.h"
24 #include "fpu/softfloat.h"
25 #include "qemu/int128.h"
26 #include "crypto/clmul.h"
27 #include "vec_internal.h"
28
29 /*
30 * Data for expanding active predicate bits to bytes, for byte elements.
31 *
32 * for (i = 0; i < 256; ++i) {
33 * unsigned long m = 0;
34 * for (j = 0; j < 8; j++) {
35 * if ((i >> j) & 1) {
36 * m |= 0xfful << (j << 3);
37 * }
38 * }
39 * printf("0x%016lx,\n", m);
40 * }
41 */
/* Entry i expands the 8 predicate bits of i into 8 mask bytes (LSB first). */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};
130
131 /*
132 * Similarly for half-word elements.
133 * for (i = 0; i < 256; ++i) {
134 * unsigned long m = 0;
135 * if (i & 0xaa) {
136 * continue;
137 * }
138 * for (j = 0; j < 8; j += 2) {
139 * if ((i >> j) & 1) {
140 * m |= 0xfffful << (j << 3);
141 * }
142 * }
143 * printf("[0x%x] = 0x%016lx,\n", i, m);
144 * }
145 */
/*
 * Entry i expands four even-numbered predicate bits to four 16-bit lanes.
 * Indices with any odd bit set are skipped by the generator above and
 * therefore remain zero-initialized here.
 */
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};
156
157 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Nominally ((src3 << 8) + 2 * src1 * src2 + (round << 7)) >> 8,
     * but working at half scale (shift of 7) produces the identical
     * value while needing one fewer bit of intermediate precision.
     */
    int32_t prod = src1 * (int32_t)src2;

    if (neg) {
        prod = -prod;
    }
    prod += ((int32_t)src3 << 7) + (round << 6);
    prod >>= 7;

    if (prod == (int8_t)prod) {
        return prod;
    }
    /* Out of range: saturate toward the sign of the true result. */
    return prod < 0 ? INT8_MIN : INT8_MAX;
}
178
/*
 * SVE2 8-bit saturating doubling multiply-high helpers.  Unlike the
 * Neon/gvec versions below, these do not call clear_tail(): the whole
 * vector length given by 'desc' is processed.
 */

/* d[i] = sat-round-doubling-mul-high(n[i], m[i]) accumulated into a[i]. */
void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
    }
}

/* As sqrdmlah, but the product is negated before accumulation. */
void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
    }
}

/* Doubling multiply-high only: no accumulation, no rounding. */
void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
    }
}

/* Doubling multiply-high with rounding, no accumulation. */
void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
    }
}
220
221 /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /*
     * Same half-scale simplification as do_sqrdmlah_b, with the shift
     * widened to 15.  On saturation, *sat is set (never cleared), which
     * lets callers accumulate the QC-style sticky flag across lanes.
     */
    int32_t sum = src1 * (int32_t)src2;

    if (neg) {
        sum = -sum;
    }
    sum += ((int32_t)src3 << 15) + (round << 14);
    sum >>= 15;

    if (sum == (int16_t)sum) {
        return sum;
    }
    *sat = 1;
    return sum < 0 ? INT16_MIN : INT16_MAX;
}
239
/*
 * AArch32 Neon SQRDMLAH on a pair of packed 16-bit elements.
 * Saturation is recorded in the QC flag (env->vfp.qc[0]).
 */
uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    /* Low and high 16-bit halves are processed independently. */
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                false, true, sat);
    return deposit32(e1, 16, 16, e2);
}

/* Vector SQRDMLAH, 16-bit lanes; *vq accumulates the saturation flag. */
void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* AArch32 Neon SQRDMLSH (negated product) on packed 16-bit elements. */
uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                true, true, sat);
    return deposit32(e1, 16, 16, e2);
}

/* Vector SQRDMLSH, 16-bit lanes. */
void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Vector SQDMULH (no rounding), 16-bit lanes. */
void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Vector SQRDMULH (with rounding), 16-bit lanes. */
void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * Indexed SQDMULH: the multiplier is the single element 'idx' within
 * each 128-bit (16 / 2 element) segment of vm.
 */
void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Indexed SQRDMULH (with rounding), 16-bit lanes. */
void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
345
/*
 * SVE2 16-bit variants.  The saturation indication computed by
 * do_sqrdmlah_h is written to a local and discarded, and no tail
 * clearing is performed.
 */

/* SQRDMLAH: accumulate sat-round-doubling-mul-high into a[i]. */
void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
    }
}

/* SQRDMLSH: as above with the product negated. */
void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
    }
}

/* SQDMULH: no accumulation, no rounding. */
void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
    }
}

/* SQRDMULH: no accumulation, with rounding. */
void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
    }
}

/* Indexed SQDMULH: one multiplier per 128-bit segment of vm. */
void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

/* Indexed SQRDMULH: as above with rounding. */
void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
        }
    }
}
421
422 /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /*
     * Same half-scale simplification as do_sqrdmlah_b, carried out in
     * 64-bit arithmetic with a shift of 31.  *sat is set (sticky) when
     * the result overflows the 32-bit range.
     */
    int64_t sum = src1 * (int64_t)src2;

    if (neg) {
        sum = -sum;
    }
    sum += ((int64_t)src3 << 31) + (round << 30);
    sum >>= 31;

    if (sum == (int32_t)sum) {
        return sum;
    }
    *sat = 1;
    return sum < 0 ? INT32_MIN : INT32_MAX;
}
440
/* AArch32 Neon SQRDMLAH, one 32-bit element; QC flag in env->vfp.qc[0]. */
uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
}

/* Vector SQRDMLAH, 32-bit lanes; *vq accumulates the saturation flag. */
void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* AArch32 Neon SQRDMLSH (negated product), one 32-bit element. */
uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
}

/* Vector SQRDMLSH, 32-bit lanes. */
void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Vector SQDMULH (no rounding), 32-bit lanes. */
void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Vector SQRDMULH (with rounding), 32-bit lanes. */
void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * Indexed SQDMULH: the multiplier is the single element 'idx' within
 * each 128-bit (16 / 4 element) segment of vm.
 */
void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Indexed SQRDMULH (with rounding), 32-bit lanes. */
void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
540
/*
 * SVE2 32-bit variants; the saturation indication is discarded and
 * no tail clearing is performed.
 */

/* SQRDMLAH: accumulate sat-round-doubling-mul-high into a[i]. */
void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
    }
}

/* SQRDMLSH: as above with the product negated. */
void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
    }
}

/* SQDMULH: no accumulation, no rounding. */
void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
    }
}

/* SQRDMULH: no accumulation, with rounding. */
void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
    }
}

/* Indexed SQDMULH: one multiplier per 128-bit segment of vm. */
void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

/* Indexed SQRDMULH: as above with rounding. */
void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
        }
    }
}
616
617 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
/* Saturate a 128-bit signed value to the int64_t range. */
static int64_t do_sat128_d(Int128 r)
{
    int64_t ls = int128_getlo(r);
    int64_t hs = int128_gethi(r);

    /* In range iff the high half is the sign extension of the low half. */
    if (unlikely(hs != (ls >> 63))) {
        return hs < 0 ? INT64_MIN : INT64_MAX;
    }
    return ls;
}

/*
 * 64-bit saturating rounding doubling multiply-accumulate high half.
 * The intermediate cannot fit in 64 bits, so the whole computation is
 * carried out in Int128 before saturating back down.
 */
int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
{
    uint64_t l, h;
    Int128 r, t;

    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
    muls64(&l, &h, m, n);
    r = int128_make128(l, h);
    if (neg) {
        r = int128_neg(r);
    }
    /* Skip the shifted addend entirely when the accumulator is zero. */
    if (a) {
        t = int128_exts64(a);
        t = int128_lshift(t, 63);
        r = int128_add(r, t);
    }
    if (round) {
        t = int128_exts64(1ll << 62);
        r = int128_add(r, t);
    }
    r = int128_rshift(r, 63);

    return do_sat128_d(r);
}
653
/* SVE2 64-bit SQRDMLAH: accumulate sat-round-doubling-mul-high into a[i]. */
void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
    }
}

/* SVE2 64-bit SQRDMLSH: as above with the product negated. */
void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
    }
}

/* SVE2 64-bit SQDMULH: no accumulation, no rounding. */
void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
    }
}

/* SVE2 64-bit SQRDMULH: no accumulation, with rounding. */
void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
    }
}

/*
 * Indexed 64-bit SQDMULH: one multiplier per 128-bit (16 / 8 element)
 * segment of vm; no H() adjustment is applied to 'idx' here.
 */
void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
        }
    }
}

/* Indexed 64-bit SQRDMULH: as above with rounding. */
void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
        }
    }
}
723
724 /* Integer 8 and 16-bit dot-product.
725 *
726 * Note that for the loops herein, host endianness does not matter
727 * with respect to the ordering of data within the quad-width lanes.
728 * All elements are treated equally, no matter where they are.
729 */
730
/*
 * DO_DOT: dot product of four narrow lanes into one wide lane, with
 * accumulation:  d[i] = a[i] + sum_{k=0..3} n[4i+k] * m[4i+k].
 * TYPED is the wide accumulator type; TYPEN/TYPEM the narrow operands.
 */
#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m = vm;                                                        \
    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
        d[i] = (a[i] +                                                    \
                (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
                (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
                (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
                (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
    }                                                                     \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

/* SDOT / UDOT / USDOT, 8-bit and 16-bit inputs. */
DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
753
/*
 * DO_DOT_IDX: as DO_DOT, but the four multiplier lanes come from the
 * single indexed group within each 128-bit segment of vm, re-read at
 * the start of each segment.
 *
 * NOTE(review): after the first segment, 'segend' advances by a fixed
 * 4 elements.  For the 64-bit TYPED instantiations a 128-bit segment
 * holds only 16 / sizeof(TYPED) == 2 elements, so segments past the
 * first appear to span 4 elements — confirm intended behavior for
 * vector lengths beyond 256 bits.
 */
#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
    intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
    intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
    intptr_t index = simd_data(desc);                                     \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
    do {                                                                  \
        TYPED m0 = m_indexed[i * 4 + 0];                                  \
        TYPED m1 = m_indexed[i * 4 + 1];                                  \
        TYPED m2 = m_indexed[i * 4 + 2];                                  \
        TYPED m3 = m_indexed[i * 4 + 3];                                  \
        do {                                                              \
            d[i] = (a[i] +                                                \
                    n[i * 4 + 0] * m0 +                                   \
                    n[i * 4 + 1] * m1 +                                   \
                    n[i * 4 + 2] * m2 +                                   \
                    n[i * 4 + 3] * m3);                                   \
        } while (++i < segend);                                           \
        segend = i + 4;                                                   \
    } while (i < opr_sz_n);                                               \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

/* Indexed SDOT / UDOT / SUDOT / USDOT, 8-bit and 16-bit inputs. */
DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
787
/*
 * Floating-point complex add (FCADD-style), half precision.
 * Elements are (real, imag) pairs.  The desc bit at SIMD_DATA_SHIFT
 * selects which of the two m inputs has its sign flipped before the
 * pairwise add, i.e. the rotation.
 */
void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        /* d.real = n.real + (+/-)m.imag; d.imag = n.imag + (+/-)m.real. */
        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* As gvec_fcaddh, for single precision. */
void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* As gvec_fcaddh, for double precision; 64-bit lanes need no H() swizzle. */
void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
871
HELPER(gvec_fcmlah)872 void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
873 void *vfpst, uint32_t desc)
874 {
875 uintptr_t opr_sz = simd_oprsz(desc);
876 float16 *d = vd, *n = vn, *m = vm, *a = va;
877 float_status *fpst = vfpst;
878 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
879 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
880 uint32_t neg_real = flip ^ neg_imag;
881 uintptr_t i;
882
883 /* Shift boolean to the sign bit so we can xor to negate. */
884 neg_real <<= 15;
885 neg_imag <<= 15;
886
887 for (i = 0; i < opr_sz / 2; i += 2) {
888 float16 e2 = n[H2(i + flip)];
889 float16 e1 = m[H2(i + flip)] ^ neg_real;
890 float16 e4 = e2;
891 float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;
892
893 d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
894 d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
895 }
896 clear_tail(d, opr_sz, simd_maxsz(desc));
897 }
898
HELPER(gvec_fcmlah_idx)899 void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
900 void *vfpst, uint32_t desc)
901 {
902 uintptr_t opr_sz = simd_oprsz(desc);
903 float16 *d = vd, *n = vn, *m = vm, *a = va;
904 float_status *fpst = vfpst;
905 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
906 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
907 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
908 uint32_t neg_real = flip ^ neg_imag;
909 intptr_t elements = opr_sz / sizeof(float16);
910 intptr_t eltspersegment = 16 / sizeof(float16);
911 intptr_t i, j;
912
913 /* Shift boolean to the sign bit so we can xor to negate. */
914 neg_real <<= 15;
915 neg_imag <<= 15;
916
917 for (i = 0; i < elements; i += eltspersegment) {
918 float16 mr = m[H2(i + 2 * index + 0)];
919 float16 mi = m[H2(i + 2 * index + 1)];
920 float16 e1 = neg_real ^ (flip ? mi : mr);
921 float16 e3 = neg_imag ^ (flip ? mr : mi);
922
923 for (j = i; j < i + eltspersegment; j += 2) {
924 float16 e2 = n[H2(j + flip)];
925 float16 e4 = e2;
926
927 d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
928 d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
929 }
930 }
931 clear_tail(d, opr_sz, simd_maxsz(desc));
932 }
933
/*
 * FCMLA (vector), single-precision: for each complex pair of lanes
 * (even = real, odd = imaginary), accumulate a rotated complex product
 * of 'vn' and 'vm' into 'va'.  desc bit 0 is 'flip' (selects which of
 * the pair is used as multiplicand/multiplier) and bit 1 is 'neg_imag';
 * these encode the instruction's rotation.
 */
void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    /* Process one complex pair (two float32 lanes) per iteration. */
    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        /* Fused multiply-add into the even (real) and odd (imag) lanes. */
        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
960
/*
 * FCMLA (indexed), single-precision: as gvec_fcmlas, but the complex
 * multiplier is the single element pair selected by 'index' within each
 * 128-bit segment of 'vm'.  desc fields: bit 0 flip, bit 1 neg_imag,
 * bits 2-3 index.
 */
void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = 16 / sizeof(float32);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    /* The multiplier pair (e1, e3) is constant for a whole segment. */
    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = neg_real ^ (flip ? mi : mr);
        float32 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            /* Fused multiply-add: d = n-element * m-element + a.  */
            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
995
/*
 * FCMLA (vector), double-precision.  Identical structure to the
 * single-precision version; no H-macro byte-swizzle is needed because
 * float64 elements are stored at their natural host offsets.
 */
void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 63;
    neg_imag <<= 63;

    /* Process one complex pair (two float64 lanes) per iteration. */
    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        /* Fused multiply-add into the even (real) and odd (imag) lanes. */
        d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
1022
1023 /*
1024 * Floating point comparisons producing an integer result (all 1s or all 0s).
1025 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1026 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1027 */
float16_ceq(float16 op1,float16 op2,float_status * stat)1028 static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
1029 {
1030 return -float16_eq_quiet(op1, op2, stat);
1031 }
1032
float32_ceq(float32 op1,float32 op2,float_status * stat)1033 static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
1034 {
1035 return -float32_eq_quiet(op1, op2, stat);
1036 }
1037
float64_ceq(float64 op1,float64 op2,float_status * stat)1038 static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
1039 {
1040 return -float64_eq_quiet(op1, op2, stat);
1041 }
1042
float16_cge(float16 op1,float16 op2,float_status * stat)1043 static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
1044 {
1045 return -float16_le(op2, op1, stat);
1046 }
1047
float32_cge(float32 op1,float32 op2,float_status * stat)1048 static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
1049 {
1050 return -float32_le(op2, op1, stat);
1051 }
1052
float64_cge(float64 op1,float64 op2,float_status * stat)1053 static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
1054 {
1055 return -float64_le(op2, op1, stat);
1056 }
1057
float16_cgt(float16 op1,float16 op2,float_status * stat)1058 static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
1059 {
1060 return -float16_lt(op2, op1, stat);
1061 }
1062
float32_cgt(float32 op1,float32 op2,float_status * stat)1063 static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
1064 {
1065 return -float32_lt(op2, op1, stat);
1066 }
1067
float64_cgt(float64 op1,float64 op2,float_status * stat)1068 static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
1069 {
1070 return -float64_lt(op2, op1, stat);
1071 }
1072
float16_acge(float16 op1,float16 op2,float_status * stat)1073 static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
1074 {
1075 return -float16_le(float16_abs(op2), float16_abs(op1), stat);
1076 }
1077
float32_acge(float32 op1,float32 op2,float_status * stat)1078 static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
1079 {
1080 return -float32_le(float32_abs(op2), float32_abs(op1), stat);
1081 }
1082
float64_acge(float64 op1,float64 op2,float_status * stat)1083 static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
1084 {
1085 return -float64_le(float64_abs(op2), float64_abs(op1), stat);
1086 }
1087
float16_acgt(float16 op1,float16 op2,float_status * stat)1088 static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
1089 {
1090 return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
1091 }
1092
float32_acgt(float32 op1,float32 op2,float_status * stat)1093 static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
1094 {
1095 return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
1096 }
1097
float64_acgt(float64 op1,float64 op2,float_status * stat)1098 static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
1099 {
1100 return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
1101 }
1102
/*
 * Convert float16 to signed int16, rounding toward zero.
 * A NaN input yields 0 and raises Invalid Operation.
 */
static int16_t vfp_tosszh(float16 x, void *fpstp)
{
    float_status *fpst = fpstp;

    if (!float16_is_any_nan(x)) {
        return float16_to_int16_round_to_zero(x, fpst);
    }
    float_raise(float_flag_invalid, fpst);
    return 0;
}
1112
/*
 * Convert float16 to unsigned int16, rounding toward zero.
 * A NaN input yields 0 and raises Invalid Operation.
 */
static uint16_t vfp_touszh(float16 x, void *fpstp)
{
    float_status *fpst = fpstp;

    if (!float16_is_any_nan(x)) {
        return float16_to_uint16_round_to_zero(x, fpst);
    }
    float_raise(float_flag_invalid, fpst);
    return 0;
}
1122
/*
 * Expand a unary per-element helper FUNC over a whole vector of TYPE,
 * passing the float_status pointer through, then zero the tail.
 */
#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
{                                                                 \
    intptr_t i, oprsz = simd_oprsz(desc);                         \
    TYPE *d = vd, *n = vn;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
        d[i] = FUNC(n[i], stat);                                  \
    }                                                             \
    clear_tail(d, oprsz, simd_maxsz(desc));                       \
}

/* Reciprocal and reciprocal-square-root estimate. */
DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

/* Round to integral. */
DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)

/* Integer <-> float conversions (round-to-zero for the float->int cases). */
DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
DO_2OP(gvec_tosszh, vfp_tosszh, float16)
DO_2OP(gvec_touszh, vfp_touszh, float16)
1153
/*
 * Compare-against-zero helpers.  FWD compares (op, 0); REV compares
 * (0, op), so that e.g. "clt" is implemented as cgt with the operands
 * reversed.  DO_2OP_CMP0 generates the float16/float32 wrappers and the
 * corresponding gvec helpers in one go.
 */
#define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \
static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)  \
{                                                        \
    return TYPE##_##CMPOP(op, TYPE##_zero, stat);        \
}

#define WRAP_CMP0_REV(FN, CMPOP, TYPE) \
static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)  \
{                                                        \
    return TYPE##_##CMPOP(TYPE##_zero, op, stat);        \
}

#define DO_2OP_CMP0(FN, CMPOP, DIRN) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float16)  \
    WRAP_CMP0_##DIRN(FN, CMPOP, float32)  \
    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \
    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)

DO_2OP_CMP0(cgt, cgt, FWD)
DO_2OP_CMP0(cge, cge, FWD)
DO_2OP_CMP0(ceq, ceq, FWD)
DO_2OP_CMP0(clt, cgt, REV)
DO_2OP_CMP0(cle, cge, REV)

#undef DO_2OP
#undef DO_2OP_CMP0
1180
1181 /* Floating-point trigonometric starting value.
1182 * See the ARM ARM pseudocode function FPTrigSMul.
1183 */
1184 static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
1185 {
1186 float16 result = float16_mul(op1, op1, stat);
1187 if (!float16_is_any_nan(result)) {
1188 result = float16_set_sign(result, op2 & 1);
1189 }
1190 return result;
1191 }
1192
float32_ftsmul(float32 op1,uint32_t op2,float_status * stat)1193 static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
1194 {
1195 float32 result = float32_mul(op1, op1, stat);
1196 if (!float32_is_any_nan(result)) {
1197 result = float32_set_sign(result, op2 & 1);
1198 }
1199 return result;
1200 }
1201
float64_ftsmul(float64 op1,uint64_t op2,float_status * stat)1202 static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
1203 {
1204 float64 result = float64_mul(op1, op1, stat);
1205 if (!float64_is_any_nan(result)) {
1206 result = float64_set_sign(result, op2 & 1);
1207 }
1208 return result;
1209 }
1210
float16_abd(float16 op1,float16 op2,float_status * stat)1211 static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
1212 {
1213 return float16_abs(float16_sub(op1, op2, stat));
1214 }
1215
float32_abd(float32 op1,float32 op2,float_status * stat)1216 static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
1217 {
1218 return float32_abs(float32_sub(op1, op2, stat));
1219 }
1220
float64_abd(float64 op1,float64 op2,float_status * stat)1221 static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
1222 {
1223 return float64_abs(float64_sub(op1, op2, stat));
1224 }
1225
1226 /*
1227 * Reciprocal step. These are the AArch32 version which uses a
1228 * non-fused multiply-and-subtract.
1229 */
float16_recps_nf(float16 op1,float16 op2,float_status * stat)1230 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
1231 {
1232 op1 = float16_squash_input_denormal(op1, stat);
1233 op2 = float16_squash_input_denormal(op2, stat);
1234
1235 if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1236 (float16_is_infinity(op2) && float16_is_zero(op1))) {
1237 return float16_two;
1238 }
1239 return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
1240 }
1241
float32_recps_nf(float32 op1,float32 op2,float_status * stat)1242 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
1243 {
1244 op1 = float32_squash_input_denormal(op1, stat);
1245 op2 = float32_squash_input_denormal(op2, stat);
1246
1247 if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1248 (float32_is_infinity(op2) && float32_is_zero(op1))) {
1249 return float32_two;
1250 }
1251 return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
1252 }
1253
1254 /* Reciprocal square-root step. AArch32 non-fused semantics. */
float16_rsqrts_nf(float16 op1,float16 op2,float_status * stat)1255 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
1256 {
1257 op1 = float16_squash_input_denormal(op1, stat);
1258 op2 = float16_squash_input_denormal(op2, stat);
1259
1260 if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1261 (float16_is_infinity(op2) && float16_is_zero(op1))) {
1262 return float16_one_point_five;
1263 }
1264 op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
1265 return float16_div(op1, float16_two, stat);
1266 }
1267
float32_rsqrts_nf(float32 op1,float32 op2,float_status * stat)1268 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
1269 {
1270 op1 = float32_squash_input_denormal(op1, stat);
1271 op2 = float32_squash_input_denormal(op2, stat);
1272
1273 if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1274 (float32_is_infinity(op2) && float32_is_zero(op1))) {
1275 return float32_one_point_five;
1276 }
1277 op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
1278 return float32_div(op1, float32_two, stat);
1279 }
1280
/*
 * Expand a binary per-element helper FUNC over two whole vectors of
 * TYPE, passing the float_status pointer through, then zero the tail.
 */
#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(n[i], m[i], stat);                                     \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}
1291
/* Basic arithmetic. */
DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

/* Trig starting-value multiply and absolute difference (helpers above). */
DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

DO_3OP(gvec_fabd_h, float16_abd, float16)
DO_3OP(gvec_fabd_s, float32_abd, float32)
DO_3OP(gvec_fabd_d, float64_abd, float64)

/* Comparisons producing 0/-1 masks. */
DO_3OP(gvec_fceq_h, float16_ceq, float16)
DO_3OP(gvec_fceq_s, float32_ceq, float32)
DO_3OP(gvec_fceq_d, float64_ceq, float64)

DO_3OP(gvec_fcge_h, float16_cge, float16)
DO_3OP(gvec_fcge_s, float32_cge, float32)
DO_3OP(gvec_fcge_d, float64_cge, float64)

DO_3OP(gvec_fcgt_h, float16_cgt, float16)
DO_3OP(gvec_fcgt_s, float32_cgt, float32)
DO_3OP(gvec_fcgt_d, float64_cgt, float64)

DO_3OP(gvec_facge_h, float16_acge, float16)
DO_3OP(gvec_facge_s, float32_acge, float32)
DO_3OP(gvec_facge_d, float64_acge, float64)

DO_3OP(gvec_facgt_h, float16_acgt, float16)
DO_3OP(gvec_facgt_s, float32_acgt, float32)
DO_3OP(gvec_facgt_d, float64_acgt, float64)

/* Min/max, plus the NaN-propagation-avoiding maxnum/minnum variants. */
DO_3OP(gvec_fmax_h, float16_max, float16)
DO_3OP(gvec_fmax_s, float32_max, float32)
DO_3OP(gvec_fmax_d, float64_max, float64)

DO_3OP(gvec_fmin_h, float16_min, float16)
DO_3OP(gvec_fmin_s, float32_min, float32)
DO_3OP(gvec_fmin_d, float64_min, float64)

DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)

DO_3OP(gvec_fminnum_h, float16_minnum, float16)
DO_3OP(gvec_fminnum_s, float32_minnum, float32)
DO_3OP(gvec_fminnum_d, float64_minnum, float64)

/* AArch32 non-fused reciprocal / rsqrt steps. */
DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)

DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)

#ifdef TARGET_AARCH64
/* AArch64-only helpers: divide, mulx, and the fused step functions. */
DO_3OP(gvec_fdiv_h, float16_div, float16)
DO_3OP(gvec_fdiv_s, float32_div, float32)
DO_3OP(gvec_fdiv_d, float64_div, float64)

DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

#endif
#undef DO_3OP
1373
1374 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
1375 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1376 float_status *stat)
1377 {
1378 return float16_add(dest, float16_mul(op1, op2, stat), stat);
1379 }
1380
float32_muladd_nf(float32 dest,float32 op1,float32 op2,float_status * stat)1381 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1382 float_status *stat)
1383 {
1384 return float32_add(dest, float32_mul(op1, op2, stat), stat);
1385 }
1386
float16_mulsub_nf(float16 dest,float16 op1,float16 op2,float_status * stat)1387 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1388 float_status *stat)
1389 {
1390 return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1391 }
1392
float32_mulsub_nf(float32 dest,float32 op1,float32 op2,float_status * stat)1393 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1394 float_status *stat)
1395 {
1396 return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1397 }
1398
/* Fused versions; these have the semantics Neon VFMA/VFMS want */
static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(op1, op2, dest, 0, stat);
}

static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(op1, op2, dest, 0, stat);
}

static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(op1, op2, dest, 0, stat);
}

/* VFMS negates op1 (not the product) before the fused multiply-add. */
static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
}

static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
}

static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
}
1435
/*
 * Expand a three-operand accumulate helper FUNC elementwise; note that
 * the destination is also the accumulator input (d = FUNC(d, n, m)).
 */
#define DO_MULADD(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(d[i], n[i], m[i], stat);                               \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

/* Neon VMLA/VMLS: non-fused multiply then add/subtract. */
DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)

DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)

/* Neon VFMA/VFMS: fused multiply-add. */
DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)

DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)
1460
/* For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
 */

/*
 * Integer multiply-by-indexed-element: within each 128-bit segment,
 * every element of 'vn' is multiplied by the single element 'idx' of
 * that segment of 'vm'.
 */
#define DO_MUL_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)   \
{                                                                \
    intptr_t i, j, oprsz = simd_oprsz(desc);                     \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);            \
    intptr_t idx = simd_data(desc);                              \
    TYPE *d = vd, *n = vn, *m = vm;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {        \
        TYPE mm = m[H(i + idx)];                                 \
        for (j = 0; j < segment; j++) {                          \
            d[i + j] = n[i + j] * mm;                            \
        }                                                        \
    }                                                            \
    clear_tail(d, oprsz, simd_maxsz(desc));                      \
}

DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)

#undef DO_MUL_IDX
1486
/*
 * Integer multiply-accumulate by indexed element: d = a OP n * m[idx],
 * with OP either + (mla) or - (mls), applied per 128-bit segment.
 */
#define DO_MLA_IDX(NAME, TYPE, OP, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
    intptr_t idx = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = a[i + j] OP n[i + j] * mm;                          \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)

DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)

#undef DO_MLA_IDX
1512
/*
 * FP multiply (and optionally accumulate into d) by indexed element:
 * d = ADD(d, MUL(n, m[idx])), per 128-bit segment.  ADD may be 'nop'
 * (below), which discards d and yields a plain indexed multiply.
 */
#define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
    intptr_t idx = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat);       \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

/* 'nop' as ADD simply passes the product through. */
#define nop(N, M, S) (M)

DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)

#ifdef TARGET_AARCH64

DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)

#endif

#undef nop

/*
 * Non-fused multiply-accumulate operations, for Neon. NB that unlike
 * the fused ops below they assume accumulate both from and into Vd.
 */
DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)

#undef DO_FMUL_IDX
1555
/*
 * Fused multiply-add by indexed element: d = fma(n ^ op1_neg, m[idx], a)
 * per 128-bit segment.  desc bit 0 selects negation of op1 (for fmls);
 * the remaining desc bits hold the element index.
 */
#define DO_FMLA_IDX(NAME, TYPE, H)                                         \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
                  void *stat, uint32_t desc)                               \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
    TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
    intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
    op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
                                     mm, a[i + j], 0, stat);               \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)

#undef DO_FMLA_IDX
1581
/*
 * Saturating integer add/subtract for 8/16/32-bit elements.  The
 * arithmetic is done in WTYPE, chosen wide enough to hold any
 * TYPEN OP TYPEM result without overflow (int for the 8/16-bit cases,
 * int64_t for 32-bit), then clamped to [MIN, MAX].  Any saturation
 * sets the low word of the QC flag vector 'vq'.
 */
#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
    bool q = false;                                                        \
    for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
        WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
        if (dd < MIN) {                                                    \
            dd = MIN;                                                      \
            q = true;                                                      \
        } else if (dd > MAX) {                                             \
            dd = MAX;                                                      \
            q = true;                                                      \
        }                                                                  \
        d[i] = dd;                                                         \
    }                                                                      \
    if (q) {                                                               \
        uint32_t *qc = vq;                                                 \
        qc[0] = 1;                                                         \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)

DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)

DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)

DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)

/* USQADD: unsigned + signed operand; SUQADD: signed + unsigned operand. */
DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)

DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)

#undef DO_SAT
1631
1632 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1633 void *vm, uint32_t desc)
1634 {
1635 intptr_t i, oprsz = simd_oprsz(desc);
1636 uint64_t *d = vd, *n = vn, *m = vm;
1637 bool q = false;
1638
1639 for (i = 0; i < oprsz / 8; i++) {
1640 uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1641 if (dd < nn) {
1642 dd = UINT64_MAX;
1643 q = true;
1644 }
1645 d[i] = dd;
1646 }
1647 if (q) {
1648 uint32_t *qc = vq;
1649 qc[0] = 1;
1650 }
1651 clear_tail(d, oprsz, simd_maxsz(desc));
1652 }
1653
HELPER(gvec_uqsub_d)1654 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1655 void *vm, uint32_t desc)
1656 {
1657 intptr_t i, oprsz = simd_oprsz(desc);
1658 uint64_t *d = vd, *n = vn, *m = vm;
1659 bool q = false;
1660
1661 for (i = 0; i < oprsz / 8; i++) {
1662 uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1663 if (nn < mm) {
1664 dd = 0;
1665 q = true;
1666 }
1667 d[i] = dd;
1668 }
1669 if (q) {
1670 uint32_t *qc = vq;
1671 qc[0] = 1;
1672 }
1673 clear_tail(d, oprsz, simd_maxsz(desc));
1674 }
1675
/*
 * SQADD, 64-bit: signed saturating add.
 * NOTE(review): the addition and the detection below rely on signed
 * wrap-around; presumably the build uses -fwrapv (QEMU convention) —
 * confirm for any other build environment.
 */
void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        int64_t nn = n[i], mm = m[i], dd = nn + mm;
        /*
         * Overflow iff the operands have the same sign (~(nn ^ mm) has
         * the sign bit set) and the result's sign differs from nn's.
         */
        if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
            /* Saturate toward the sign of nn: MAX if nn >= 0, else MIN. */
            dd = (nn >> 63) ^ ~INT64_MIN;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}
1697
/*
 * SQSUB, 64-bit: signed saturating subtract.
 * NOTE(review): as with gvec_sqadd_d, this relies on signed wrap-around
 * (QEMU builds with -fwrapv — confirm elsewhere).
 */
void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        int64_t nn = n[i], mm = m[i], dd = nn - mm;
        /*
         * Overflow iff the operands differ in sign ((nn ^ mm) has the
         * sign bit set) and the result's sign differs from nn's.
         */
        if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
            /* Saturate toward the sign of nn: MAX if nn >= 0, else MIN. */
            dd = (nn >> 63) ^ ~INT64_MIN;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}
1719
HELPER(gvec_usqadd_d)1720 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn,
1721 void *vm, uint32_t desc)
1722 {
1723 intptr_t i, oprsz = simd_oprsz(desc);
1724 uint64_t *d = vd, *n = vn, *m = vm;
1725 bool q = false;
1726
1727 for (i = 0; i < oprsz / 8; i++) {
1728 uint64_t nn = n[i];
1729 int64_t mm = m[i];
1730 uint64_t dd = nn + mm;
1731
1732 if (mm < 0) {
1733 if (nn < (uint64_t)-mm) {
1734 dd = 0;
1735 q = true;
1736 }
1737 } else {
1738 if (dd < nn) {
1739 dd = UINT64_MAX;
1740 q = true;
1741 }
1742 }
1743 d[i] = dd;
1744 }
1745 if (q) {
1746 uint32_t *qc = vq;
1747 qc[0] = 1;
1748 }
1749 clear_tail(d, oprsz, simd_maxsz(desc));
1750 }
1751
HELPER(gvec_suqadd_d)1752 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn,
1753 void *vm, uint32_t desc)
1754 {
1755 intptr_t i, oprsz = simd_oprsz(desc);
1756 uint64_t *d = vd, *n = vn, *m = vm;
1757 bool q = false;
1758
1759 for (i = 0; i < oprsz / 8; i++) {
1760 int64_t nn = n[i];
1761 uint64_t mm = m[i];
1762 int64_t dd = nn + mm;
1763
1764 if (mm > (uint64_t)(INT64_MAX - nn)) {
1765 dd = INT64_MAX;
1766 q = true;
1767 }
1768 d[i] = dd;
1769 }
1770 if (q) {
1771 uint32_t *qc = vq;
1772 qc[0] = 1;
1773 }
1774 clear_tail(d, oprsz, simd_maxsz(desc));
1775 }
1776
/*
 * SSRA/USRA: shift right (signed = arithmetic, unsigned = logical) and
 * accumulate into the destination.
 * NOTE(review): assumes 1 <= shift < element width here; a shift equal
 * to the element width appears to be handled by the translator — confirm.
 */
#define DO_SRA(NAME, TYPE)                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        d[i] += n[i] >> shift;                          \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_SRA(gvec_ssra_b, int8_t)
DO_SRA(gvec_ssra_h, int16_t)
DO_SRA(gvec_ssra_s, int32_t)
DO_SRA(gvec_ssra_d, int64_t)

DO_SRA(gvec_usra_b, uint8_t)
DO_SRA(gvec_usra_h, uint16_t)
DO_SRA(gvec_usra_s, uint32_t)
DO_SRA(gvec_usra_d, uint64_t)

#undef DO_SRA
1800
/*
 * SRSHR/URSHR: rounding shift right.  Shift by (shift - 1) first so the
 * last bit shifted out ("tmp & 1") can be added back as the rounding
 * increment; valid for 1 <= shift <= element width.
 */
#define DO_RSHR(NAME, TYPE)                             \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        TYPE tmp = n[i] >> (shift - 1);                 \
        d[i] = (tmp >> 1) + (tmp & 1);                  \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_RSHR(gvec_srshr_b, int8_t)
DO_RSHR(gvec_srshr_h, int16_t)
DO_RSHR(gvec_srshr_s, int32_t)
DO_RSHR(gvec_srshr_d, int64_t)

DO_RSHR(gvec_urshr_b, uint8_t)
DO_RSHR(gvec_urshr_h, uint16_t)
DO_RSHR(gvec_urshr_s, uint32_t)
DO_RSHR(gvec_urshr_d, uint64_t)

#undef DO_RSHR
1825
/*
 * SRSRA/URSRA: rounding shift right and accumulate.  Same rounding
 * scheme as DO_RSHR, but the result is added into the destination.
 */
#define DO_RSRA(NAME, TYPE)                             \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        TYPE tmp = n[i] >> (shift - 1);                 \
        d[i] += (tmp >> 1) + (tmp & 1);                 \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_RSRA(gvec_srsra_b, int8_t)
DO_RSRA(gvec_srsra_h, int16_t)
DO_RSRA(gvec_srsra_s, int32_t)
DO_RSRA(gvec_srsra_d, int64_t)

DO_RSRA(gvec_ursra_b, uint8_t)
DO_RSRA(gvec_ursra_h, uint16_t)
DO_RSRA(gvec_ursra_s, uint32_t)
DO_RSRA(gvec_ursra_d, uint64_t)

#undef DO_RSRA
1850
/*
 * SRI: shift right and insert, keeping the top 'shift' bits of the
 * destination and depositing the shifted source into the low bits.
 * NOTE(review): assumes 1 <= shift < element width here; shift equal to
 * the element width (which leaves d unchanged) appears to be handled by
 * the translator — confirm.
 */
#define DO_SRI(NAME, TYPE)                                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
{                                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);                               \
    int shift = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn;                                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                        \
        d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
    }                                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));                             \
}

DO_SRI(gvec_sri_b, uint8_t)
DO_SRI(gvec_sri_h, uint16_t)
DO_SRI(gvec_sri_s, uint32_t)
DO_SRI(gvec_sri_d, uint64_t)

#undef DO_SRI
1869
/*
 * Shift left and insert: replace the high (bits - shift) bits of each
 * destination element with n[i] << shift, leaving the low shift bits
 * of d untouched.  shift is in [0, element bits - 1].
 */
#define DO_SLI(NAME, TYPE)                                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
{                                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);                               \
    int shift = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn;                                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                        \
        d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]);  \
    }                                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));                             \
}

DO_SLI(gvec_sli_b, uint8_t)
DO_SLI(gvec_sli_h, uint16_t)
DO_SLI(gvec_sli_s, uint32_t)
DO_SLI(gvec_sli_d, uint64_t)

#undef DO_SLI
1888
1889 /*
1890 * Convert float16 to float32, raising no exceptions and
1891 * preserving exceptional values, including SNaN.
1892 * This is effectively an unpack+repack operation.
1893 */
/*
 * Widen an IEEE half-precision value to single precision purely by
 * bit manipulation: no FP flags are raised and NaN payloads (including
 * the SNaN/QNaN distinction) are preserved.
 *
 * @f16: raw float16 bits (only the low 16 bits are significant)
 * @fz16: when true, flush denormal f16 inputs to (signed) zero,
 *        mirroring the FZ16 input-flush behavior.
 */
static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
{
    const int f16_bias = 15;
    const int f32_bias = 127;
    uint32_t sign = extract32(f16, 15, 1);
    uint32_t exp = extract32(f16, 10, 5);
    uint32_t frac = extract32(f16, 0, 10);

    if (exp == 0x1f) {
        /* Inf or NaN: widen to the maximum f32 exponent; the fraction
           (and thus the NaN payload/signaling bit) passes through. */
        exp = 0xff;
    } else if (exp == 0) {
        /* Zero or denormal. */
        if (frac != 0) {
            if (fz16) {
                /* Flush denormal input to zero, keeping the sign. */
                frac = 0;
            } else {
                /*
                 * Denormal; these are all normal float32.
                 * Shift the fraction so that the msb is at bit 11,
                 * then remove bit 11 as the implicit bit of the
                 * normalized float32. Note that we still go through
                 * the shift for normal numbers below, to put the
                 * float32 fraction at the right place.
                 */
                int shift = clz32(frac) - 21;
                frac = (frac << shift) & 0x3ff;
                exp = f32_bias - f16_bias - shift + 1;
            }
        }
    } else {
        /* Normal number; adjust the bias. */
        exp += f32_bias - f16_bias;
    }
    /* Reassemble sign/exponent/fraction in float32 field positions. */
    sign <<= 31;
    exp <<= 23;
    frac <<= 23 - 10;

    return sign | exp | frac;
}
1934
/*
 * Fetch the 4 (or 2) float16 inputs for an FMLAL-style operation as a
 * single 64-bit load, selected by the q-size and element-half flags.
 */
static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
{
    /*
     * Branchless load of u32[0], u64[0], u32[1], or u64[1].
     * Load the 2nd qword iff is_q & is_2.
     * Shift to the 2nd dword iff !is_q & is_2.
     * For !is_q & !is_2, the upper bits of the result are garbage.
     */
    return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
}
1945
1946 /*
1947 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
1948 * as there is not yet SVE versions that might use blocking.
1949 */
1950
/*
 * Common body for FMLAL/FMLSL (vector): widen float16 elements from one
 * half of n and m to float32 and fused-multiply-accumulate into d.
 * desc bit 0 selects subtract (FMLSL), bit 1 selects the upper half.
 */
static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
                     uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int is_q = oprsz == 16;
    uint64_t n_4, m_4;

    /* Pre-load all of the f16 data, avoiding overlap issues. */
    n_4 = load4_f16(vn, is_q, is_2);
    m_4 = load4_f16(vm, is_q, is_2);

    /* Negate all inputs for FMLSL at once (flip each f16 sign bit). */
    if (is_s) {
        n_4 ^= 0x8000800080008000ull;
    }

    for (i = 0; i < oprsz / 4; i++) {
        /* Widen lane i of each packed f16 quad, then fma into d. */
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}
1976
/* AArch32 FMLAL/FMLSL: uses the Neon "standard" FP status. */
void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
                            void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}
1984
/* AArch64 FMLAL/FMLSL: uses the normal FPCR-controlled FP status. */
void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
                            void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}
1992
/* SVE2 FMLALB/FMLALT/FMLSLB/FMLSLT (vectors): widening f16 fma into f32. */
void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
                               void *venv, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    /* When set, negn flips the sign bit of each f16 from n (FMLSL). */
    uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
    /* sel is a byte offset choosing the bottom (0) or top f16 of each pair. */
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
    CPUARMState *env = venv;
    float_status *status = &env->vfp.fp_status;
    bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);

    for (i = 0; i < oprsz; i += sizeof(float32)) {
        float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
        float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
        float32 nn = float16_to_float32_by_bits(nn_16, fz16);
        float32 mm = float16_to_float32_by_bits(mm_16, fz16);
        float32 aa = *(float32 *)(va + H1_4(i));

        /* No tail clearing: SVE helpers cover the full vector length. */
        *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
    }
}
2013
/*
 * Common body for FMLAL/FMLSL (by element): like do_fmlal but the
 * multiplier is a single f16 element of m selected by the index in desc.
 */
static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
                         uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
    int is_q = oprsz == 16;
    uint64_t n_4;
    float32 m_1;

    /* Pre-load all of the f16 data, avoiding overlap issues. */
    n_4 = load4_f16(vn, is_q, is_2);

    /* Negate all inputs for FMLSL at once (flip each f16 sign bit). */
    if (is_s) {
        n_4 ^= 0x8000800080008000ull;
    }

    /* Widen the single selected multiplier element once, outside the loop. */
    m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);

    for (i = 0; i < oprsz / 4; i++) {
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}
2041
/* AArch32 FMLAL/FMLSL (by element): Neon "standard" FP status. */
void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
                                void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}
2049
/* AArch64 FMLAL/FMLSL (by element): normal FPCR-controlled FP status. */
void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
                                void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}
2057
/* SVE2 FMLALB/FMLALT/FMLSLB/FMLSLT (indexed): m element per 128-bit segment. */
void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
                               void *venv, uint32_t desc)
{
    intptr_t i, j, oprsz = simd_oprsz(desc);
    /* When set, negn flips the sign bit of each f16 from n (FMLSL). */
    uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
    /* sel: bottom (0) or top f16 of each pair; idx: element within segment. */
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
    CPUARMState *env = venv;
    float_status *status = &env->vfp.fp_status;
    bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);

    /* Outer loop walks 128-bit segments; the multiplier is per-segment. */
    for (i = 0; i < oprsz; i += 16) {
        float16 mm_16 = *(float16 *)(vm + i + idx);
        float32 mm = float16_to_float32_by_bits(mm_16, fz16);

        for (j = 0; j < 16; j += sizeof(float32)) {
            float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
            float32 nn = float16_to_float32_by_bits(nn_16, fz16);
            float32 aa = *(float32 *)(va + H1_4(i + j));

            *(float32 *)(vd + H1_4(i + j)) =
                float32_muladd(nn, mm, aa, 0, status);
        }
    }
}
2083
HELPER(gvec_sshl_b)2084 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2085 {
2086 intptr_t i, opr_sz = simd_oprsz(desc);
2087 int8_t *d = vd, *n = vn, *m = vm;
2088
2089 for (i = 0; i < opr_sz; ++i) {
2090 int8_t mm = m[i];
2091 int8_t nn = n[i];
2092 int8_t res = 0;
2093 if (mm >= 0) {
2094 if (mm < 8) {
2095 res = nn << mm;
2096 }
2097 } else {
2098 res = nn >> (mm > -8 ? -mm : 7);
2099 }
2100 d[i] = res;
2101 }
2102 clear_tail(d, opr_sz, simd_maxsz(desc));
2103 }
2104
HELPER(gvec_sshl_h)2105 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2106 {
2107 intptr_t i, opr_sz = simd_oprsz(desc);
2108 int16_t *d = vd, *n = vn, *m = vm;
2109
2110 for (i = 0; i < opr_sz / 2; ++i) {
2111 int8_t mm = m[i]; /* only 8 bits of shift are significant */
2112 int16_t nn = n[i];
2113 int16_t res = 0;
2114 if (mm >= 0) {
2115 if (mm < 16) {
2116 res = nn << mm;
2117 }
2118 } else {
2119 res = nn >> (mm > -16 ? -mm : 15);
2120 }
2121 d[i] = res;
2122 }
2123 clear_tail(d, opr_sz, simd_maxsz(desc));
2124 }
2125
HELPER(gvec_ushl_b)2126 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2127 {
2128 intptr_t i, opr_sz = simd_oprsz(desc);
2129 uint8_t *d = vd, *n = vn, *m = vm;
2130
2131 for (i = 0; i < opr_sz; ++i) {
2132 int8_t mm = m[i];
2133 uint8_t nn = n[i];
2134 uint8_t res = 0;
2135 if (mm >= 0) {
2136 if (mm < 8) {
2137 res = nn << mm;
2138 }
2139 } else {
2140 if (mm > -8) {
2141 res = nn >> -mm;
2142 }
2143 }
2144 d[i] = res;
2145 }
2146 clear_tail(d, opr_sz, simd_maxsz(desc));
2147 }
2148
HELPER(gvec_ushl_h)2149 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2150 {
2151 intptr_t i, opr_sz = simd_oprsz(desc);
2152 uint16_t *d = vd, *n = vn, *m = vm;
2153
2154 for (i = 0; i < opr_sz / 2; ++i) {
2155 int8_t mm = m[i]; /* only 8 bits of shift are significant */
2156 uint16_t nn = n[i];
2157 uint16_t res = 0;
2158 if (mm >= 0) {
2159 if (mm < 16) {
2160 res = nn << mm;
2161 }
2162 } else {
2163 if (mm > -16) {
2164 res = nn >> -mm;
2165 }
2166 }
2167 d[i] = res;
2168 }
2169 clear_tail(d, opr_sz, simd_maxsz(desc));
2170 }
2171
2172 /*
2173 * 8x8->8 polynomial multiply.
2174 *
2175 * Polynomial multiplication is like integer multiplication except the
2176 * partial products are XORed, not added.
2177 *
2178 * TODO: expose this as a generic vector operation, as it is a common
2179 * crypto building block.
2180 */
HELPER(gvec_pmul_b)2181 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
2182 {
2183 intptr_t i, opr_sz = simd_oprsz(desc);
2184 uint64_t *d = vd, *n = vn, *m = vm;
2185
2186 for (i = 0; i < opr_sz / 8; ++i) {
2187 d[i] = clmul_8x8_low(n[i], m[i]);
2188 }
2189 clear_tail(d, opr_sz, simd_maxsz(desc));
2190 }
2191
2192 /*
2193 * 64x64->128 polynomial multiply.
2194 * Because of the lanes are not accessed in strict columns,
2195 * this probably cannot be turned into a generic helper.
2196 */
HELPER(gvec_pmull_q)2197 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2198 {
2199 intptr_t i, opr_sz = simd_oprsz(desc);
2200 intptr_t hi = simd_data(desc);
2201 uint64_t *d = vd, *n = vn, *m = vm;
2202
2203 for (i = 0; i < opr_sz / 8; i += 2) {
2204 Int128 r = clmul_64(n[i + hi], m[i + hi]);
2205 d[i] = int128_getlo(r);
2206 d[i + 1] = int128_gethi(r);
2207 }
2208 clear_tail(d, opr_sz, simd_maxsz(desc));
2209 }
2210
HELPER(neon_pmull_h)2211 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2212 {
2213 int hi = simd_data(desc);
2214 uint64_t *d = vd, *n = vn, *m = vm;
2215 uint64_t nn = n[hi], mm = m[hi];
2216
2217 d[0] = clmul_8x4_packed(nn, mm);
2218 nn >>= 32;
2219 mm >>= 32;
2220 d[1] = clmul_8x4_packed(nn, mm);
2221
2222 clear_tail(d, 16, simd_maxsz(desc));
2223 }
2224
2225 #ifdef TARGET_AARCH64
HELPER(sve2_pmull_h)2226 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2227 {
2228 int shift = simd_data(desc) * 8;
2229 intptr_t i, opr_sz = simd_oprsz(desc);
2230 uint64_t *d = vd, *n = vn, *m = vm;
2231
2232 for (i = 0; i < opr_sz / 8; ++i) {
2233 d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2234 }
2235 }
2236
HELPER(sve2_pmull_d)2237 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2238 {
2239 intptr_t sel = H4(simd_data(desc));
2240 intptr_t i, opr_sz = simd_oprsz(desc);
2241 uint32_t *n = vn, *m = vm;
2242 uint64_t *d = vd;
2243
2244 for (i = 0; i < opr_sz / 8; ++i) {
2245 d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
2246 }
2247 }
2248 #endif
2249
/*
 * Compare each element against zero, producing all-ones (true) or
 * all-zeros (false) per element: the 0/1 comparison result is negated
 * to form the mask.
 */
#define DO_CMP0(NAME, TYPE, OP)                         \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);              \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
        TYPE nn = *(TYPE *)(vn + i);                    \
        *(TYPE *)(vd + i) = -(nn OP 0);                 \
    }                                                   \
    clear_tail(vd, opr_sz, simd_maxsz(desc));           \
}

DO_CMP0(gvec_ceq0_b, int8_t, ==)
DO_CMP0(gvec_clt0_b, int8_t, <)
DO_CMP0(gvec_cle0_b, int8_t, <=)
DO_CMP0(gvec_cgt0_b, int8_t, >)
DO_CMP0(gvec_cge0_b, int8_t, >=)

DO_CMP0(gvec_ceq0_h, int16_t, ==)
DO_CMP0(gvec_clt0_h, int16_t, <)
DO_CMP0(gvec_cle0_h, int16_t, <=)
DO_CMP0(gvec_cgt0_h, int16_t, >)
DO_CMP0(gvec_cge0_h, int16_t, >=)

#undef DO_CMP0
2274
/*
 * Absolute difference: d[i] = |n[i] - m[i]|, computed as a conditional
 * subtraction so no wider intermediate type is needed.
 */
#define DO_ABD(NAME, TYPE)                                      \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    TYPE *d = vd, *n = vn, *m = vm;                             \
                                                                \
    for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
        d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}

DO_ABD(gvec_sabd_b, int8_t)
DO_ABD(gvec_sabd_h, int16_t)
DO_ABD(gvec_sabd_s, int32_t)
DO_ABD(gvec_sabd_d, int64_t)

DO_ABD(gvec_uabd_b, uint8_t)
DO_ABD(gvec_uabd_h, uint16_t)
DO_ABD(gvec_uabd_s, uint32_t)
DO_ABD(gvec_uabd_d, uint64_t)

#undef DO_ABD
2298
/*
 * Absolute difference and accumulate: d[i] += |n[i] - m[i]|.
 * Same conditional-subtract form as DO_ABD.
 */
#define DO_ABA(NAME, TYPE)                                      \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    TYPE *d = vd, *n = vn, *m = vm;                             \
                                                                \
    for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
        d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}

DO_ABA(gvec_saba_b, int8_t)
DO_ABA(gvec_saba_h, int16_t)
DO_ABA(gvec_saba_s, int32_t)
DO_ABA(gvec_saba_d, int64_t)

DO_ABA(gvec_uaba_b, uint8_t)
DO_ABA(gvec_uaba_h, uint16_t)
DO_ABA(gvec_uaba_s, uint32_t)
DO_ABA(gvec_uaba_d, uint64_t)

#undef DO_ABA
2322
/*
 * FP pairwise operations: the low half of d is FUNC over adjacent
 * pairs of n, the high half over adjacent pairs of m.  Only d == m
 * needs the scratch copy: the n pass reads index 2i, which stays at
 * or ahead of the write index i, so d == n is safe without one.
 */
#define DO_3OP_PAIR(NAME, FUNC, TYPE, H)                                \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                       \
    ARMVectorReg scratch;                                               \
    intptr_t oprsz = simd_oprsz(desc);                                  \
    intptr_t half = oprsz / sizeof(TYPE) / 2;                           \
    TYPE *d = vd, *n = vn, *m = vm;                                     \
    if (unlikely(d == m)) {                                             \
        m = memcpy(&scratch, m, oprsz);                                 \
    }                                                                   \
    for (intptr_t i = 0; i < half; ++i) {                               \
        d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat);             \
    }                                                                   \
    for (intptr_t i = 0; i < half; ++i) {                               \
        d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat);      \
    }                                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));                             \
}

DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2)
DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4)
DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, )

DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2)
DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4)
DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, )

DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2)
DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4)
DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, )

DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2)
DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4)
DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, )

DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2)
DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4)
DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, )

#undef DO_3OP_PAIR
2363
/*
 * Integer pairwise operations: identical structure to the FP variant
 * above, minus the float_status argument.  MAX/MIN come from the
 * common QEMU headers.  Only d == m needs the scratch copy; the
 * n pass reads index 2i >= write index i, so d == n is safe.
 */
#define DO_3OP_PAIR(NAME, FUNC, TYPE, H)                        \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    ARMVectorReg scratch;                                       \
    intptr_t oprsz = simd_oprsz(desc);                          \
    intptr_t half = oprsz / sizeof(TYPE) / 2;                   \
    TYPE *d = vd, *n = vn, *m = vm;                             \
    if (unlikely(d == m)) {                                     \
        m = memcpy(&scratch, m, oprsz);                         \
    }                                                           \
    for (intptr_t i = 0; i < half; ++i) {                       \
        d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]);           \
    }                                                           \
    for (intptr_t i = 0; i < half; ++i) {                       \
        d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]);    \
    }                                                           \
    clear_tail(d, oprsz, simd_maxsz(desc));                     \
}

#define ADD(A, B) (A + B)
DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1)
DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2)
DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4)
DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, )
#undef ADD

DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1)
DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2)
DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4)

DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1)
DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2)
DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4)

DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1)
DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2)
DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4)

DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1)
DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2)
DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4)

#undef DO_3OP_PAIR
2407
/*
 * Fixed-point <-> floating-point conversions: the fixed-point fraction
 * width comes from simd_data, and FUNC is one of the vfp conversion
 * helpers (float->fixed variants truncate toward zero).
 */
#define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
    {                                                                   \
        intptr_t i, oprsz = simd_oprsz(desc);                           \
        int shift = simd_data(desc);                                    \
        TYPE *d = vd, *n = vn;                                          \
        float_status *fpst = stat;                                      \
        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
            d[i] = FUNC(n[i], shift, fpst);                             \
        }                                                               \
        clear_tail(d, oprsz, simd_maxsz(desc));                         \
    }

DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)

#undef DO_VCVT_FIXED
2431
/*
 * Float-to-integer conversions with an explicit rounding mode taken
 * from simd_data.  The rounding mode is installed in the float_status
 * for the duration of the loop and restored afterwards.
 */
#define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
    {                                                                   \
        float_status *fpst = stat;                                      \
        intptr_t i, oprsz = simd_oprsz(desc);                           \
        uint32_t rmode = simd_data(desc);                               \
        uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
        TYPE *d = vd, *n = vn;                                          \
        set_float_rounding_mode(rmode, fpst);                           \
        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
            d[i] = FUNC(n[i], 0, fpst);                                 \
        }                                                               \
        set_float_rounding_mode(prev_rmode, fpst);                      \
        clear_tail(d, oprsz, simd_maxsz(desc));                         \
    }

DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)

#undef DO_VCVT_RMODE
2454
/*
 * Round-to-integral (VRINT*) with an explicit rounding mode taken from
 * simd_data; the float_status rounding mode is saved and restored
 * around the loop.
 */
#define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
    {                                                                   \
        float_status *fpst = stat;                                      \
        intptr_t i, oprsz = simd_oprsz(desc);                           \
        uint32_t rmode = simd_data(desc);                               \
        uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
        TYPE *d = vd, *n = vn;                                          \
        set_float_rounding_mode(rmode, fpst);                           \
        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
            d[i] = FUNC(n[i], fpst);                                    \
        }                                                               \
        set_float_rounding_mode(prev_rmode, fpst);                      \
        clear_tail(d, oprsz, simd_maxsz(desc));                         \
    }

DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)

#undef DO_VRINT_RMODE
2475
2476 #ifdef TARGET_AARCH64
/*
 * AArch64 TBL/TBX: per-byte table lookup through a "virtual table" of
 * 1-4 consecutive vector registers starting at Vrn (wrapping V31->V0).
 * Out-of-range indices give 0 (TBL) or leave the destination byte
 * unchanged (TBX).
 */
void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
{
    const uint8_t *indices = vm;
    CPUARMState *env = venv;
    size_t oprsz = simd_oprsz(desc);
    uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
    bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
    uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
    union {
        uint8_t b[16];
        uint64_t d[2];
    } result;

    /*
     * We must construct the final result in a temp, lest the output
     * overlaps the input table.  For TBL, begin with zero; for TBX,
     * begin with the original register contents.  Note that we always
     * copy 16 bytes here to avoid an extra branch; clearing the high
     * bits of the register for oprsz == 8 is handled below.
     */
    if (is_tbx) {
        memcpy(&result, vd, 16);
    } else {
        memset(&result, 0, 16);
    }

    for (size_t i = 0; i < oprsz; ++i) {
        uint32_t index = indices[H1(i)];

        if (index < table_len) {
            /*
             * Convert index (a byte offset into the virtual table
             * which is a series of 128-bit vectors concatenated)
             * into the correct register element, bearing in mind
             * that the table can wrap around from V31 to V0.
             */
            const uint8_t *table = (const uint8_t *)
                aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
            result.b[H1(i)] = table[H1(index % 16)];
        }
    }

    memcpy(vd, &result, 16);
    clear_tail(vd, oprsz, simd_maxsz(desc));
}
2522 #endif
2523
2524 /*
2525 * NxN -> N highpart multiply
2526 *
2527 * TODO: expose this as a generic vector operation.
2528 */
2529
HELPER(gvec_smulh_b)2530 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2531 {
2532 intptr_t i, opr_sz = simd_oprsz(desc);
2533 int8_t *d = vd, *n = vn, *m = vm;
2534
2535 for (i = 0; i < opr_sz; ++i) {
2536 d[i] = ((int32_t)n[i] * m[i]) >> 8;
2537 }
2538 clear_tail(d, opr_sz, simd_maxsz(desc));
2539 }
2540
HELPER(gvec_smulh_h)2541 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2542 {
2543 intptr_t i, opr_sz = simd_oprsz(desc);
2544 int16_t *d = vd, *n = vn, *m = vm;
2545
2546 for (i = 0; i < opr_sz / 2; ++i) {
2547 d[i] = ((int32_t)n[i] * m[i]) >> 16;
2548 }
2549 clear_tail(d, opr_sz, simd_maxsz(desc));
2550 }
2551
HELPER(gvec_smulh_s)2552 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2553 {
2554 intptr_t i, opr_sz = simd_oprsz(desc);
2555 int32_t *d = vd, *n = vn, *m = vm;
2556
2557 for (i = 0; i < opr_sz / 4; ++i) {
2558 d[i] = ((int64_t)n[i] * m[i]) >> 32;
2559 }
2560 clear_tail(d, opr_sz, simd_maxsz(desc));
2561 }
2562
HELPER(gvec_smulh_d)2563 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2564 {
2565 intptr_t i, opr_sz = simd_oprsz(desc);
2566 uint64_t *d = vd, *n = vn, *m = vm;
2567 uint64_t discard;
2568
2569 for (i = 0; i < opr_sz / 8; ++i) {
2570 muls64(&discard, &d[i], n[i], m[i]);
2571 }
2572 clear_tail(d, opr_sz, simd_maxsz(desc));
2573 }
2574
HELPER(gvec_umulh_b)2575 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2576 {
2577 intptr_t i, opr_sz = simd_oprsz(desc);
2578 uint8_t *d = vd, *n = vn, *m = vm;
2579
2580 for (i = 0; i < opr_sz; ++i) {
2581 d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2582 }
2583 clear_tail(d, opr_sz, simd_maxsz(desc));
2584 }
2585
HELPER(gvec_umulh_h)2586 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2587 {
2588 intptr_t i, opr_sz = simd_oprsz(desc);
2589 uint16_t *d = vd, *n = vn, *m = vm;
2590
2591 for (i = 0; i < opr_sz / 2; ++i) {
2592 d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2593 }
2594 clear_tail(d, opr_sz, simd_maxsz(desc));
2595 }
2596
HELPER(gvec_umulh_s)2597 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2598 {
2599 intptr_t i, opr_sz = simd_oprsz(desc);
2600 uint32_t *d = vd, *n = vn, *m = vm;
2601
2602 for (i = 0; i < opr_sz / 4; ++i) {
2603 d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2604 }
2605 clear_tail(d, opr_sz, simd_maxsz(desc));
2606 }
2607
HELPER(gvec_umulh_d)2608 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2609 {
2610 intptr_t i, opr_sz = simd_oprsz(desc);
2611 uint64_t *d = vd, *n = vn, *m = vm;
2612 uint64_t discard;
2613
2614 for (i = 0; i < opr_sz / 8; ++i) {
2615 mulu64(&discard, &d[i], n[i], m[i]);
2616 }
2617 clear_tail(d, opr_sz, simd_maxsz(desc));
2618 }
2619
HELPER(gvec_xar_d)2620 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2621 {
2622 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2623 int shr = simd_data(desc);
2624 uint64_t *d = vd, *n = vn, *m = vm;
2625
2626 for (i = 0; i < opr_sz; ++i) {
2627 d[i] = ror64(n[i] ^ m[i], shr);
2628 }
2629 clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2630 }
2631
2632 /*
2633 * Integer matrix-multiply accumulate
2634 */
2635
/* Signed 8-bit dot product of one 8-byte row/column pair, added to sum. */
static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
{
    int8_t *n = vn, *m = vm;
    int32_t acc = 0;

    for (intptr_t k = 0; k < 8; ++k) {
        acc += n[H1(k)] * m[H1(k)];
    }
    return sum + (uint32_t)acc;
}
2645
/* Unsigned 8-bit dot product of one 8-byte row/column pair, added to sum. */
static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
{
    uint8_t *n = vn, *m = vm;
    uint32_t acc = 0;

    for (intptr_t k = 0; k < 8; ++k) {
        acc += n[H1(k)] * m[H1(k)];
    }
    return sum + acc;
}
2655
/* Mixed-sign (unsigned n, signed m) 8-bit dot product, added to sum. */
static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
{
    uint8_t *n = vn;
    int8_t *m = vm;
    int32_t acc = 0;

    for (intptr_t k = 0; k < 8; ++k) {
        acc += n[H1(k)] * m[H1(k)];
    }
    return sum + (uint32_t)acc;
}
2666
/*
 * Common body for SMMLA/UMMLA/USMMLA: per 128-bit segment, treat n and
 * m as 2x8 matrices of bytes and accumulate the 2x2 matrix product
 * into the four 32-bit elements of d, starting from a.  inner_loop
 * performs one 8-element dot product with the appropriate signedness.
 */
static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
                      uint32_t (*inner_loop)(uint32_t, void *, void *))
{
    intptr_t seg, opr_sz = simd_oprsz(desc);

    for (seg = 0; seg < opr_sz; seg += 16) {
        uint32_t *d = vd + seg;
        uint32_t *a = va + seg;
        uint32_t sum0, sum1, sum2, sum3;

        /*
         * Process the entire segment at once, writing back the
         * results only after we've consumed all of the inputs.
         *
         * Key to indices by column:
         *               i   j                  i             j
         */
        sum0 = a[H4(0 + 0)];
        sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
        sum1 = a[H4(0 + 1)];
        sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
        sum2 = a[H4(2 + 0)];
        sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
        sum3 = a[H4(2 + 1)];
        sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);

        d[H4(0)] = sum0;
        d[H4(1)] = sum1;
        d[H4(2)] = sum2;
        d[H4(3)] = sum3;
    }
    clear_tail(vd, opr_sz, simd_maxsz(desc));
}
2700
/* Instantiate the three matrix-multiply helpers over do_mmla_b. */
#define DO_MMLA_B(NAME, INNER) \
    void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
    { do_mmla_b(vd, vn, vm, va, desc, INNER); }

DO_MMLA_B(gvec_smmla_b, do_smmla_b)
DO_MMLA_B(gvec_ummla_b, do_ummla_b)
DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2708
2709 /*
2710 * BFloat16 Dot Product
2711 */
2712
/*
 * sum += e1 . e2, where e1 and e2 each hold a pair of bfloat16 values.
 *
 * The computation uses a fixed local float_status rather than the
 * guest's: FPCR is ignored for BFDOT and BFMMLA.  Round-to-odd with
 * flush-to-zero and default-NaN matches the architected non-widening
 * BF16 dot-product behavior (round-to-odd keeps the sticky information
 * across the intermediate additions).
 */
float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2)
{
    /* FPCR is ignored for BFDOT and BFMMLA. */
    float_status bf_status = {
        .tininess_before_rounding = float_tininess_before_rounding,
        .float_rounding_mode = float_round_to_odd_inf,
        .flush_to_zero = true,
        .flush_inputs_to_zero = true,
        .default_nan_mode = true,
    };
    float32 t1, t2;

    /*
     * Extract each BFloat16 from the element pair, and shift
     * them such that they become float32.
     */
    t1 = float32_mul(e1 << 16, e2 << 16, &bf_status);
    t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status);
    t1 = float32_add(t1, t2, &bf_status);
    t1 = float32_add(sum, t1, &bf_status);

    return t1;
}
2736
HELPER(gvec_bfdot)2737 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2738 {
2739 intptr_t i, opr_sz = simd_oprsz(desc);
2740 float32 *d = vd, *a = va;
2741 uint32_t *n = vn, *m = vm;
2742
2743 for (i = 0; i < opr_sz / 4; ++i) {
2744 d[i] = bfdotadd(a[i], n[i], m[i]);
2745 }
2746 clear_tail(d, opr_sz, simd_maxsz(desc));
2747 }
2748
/*
 * BFDOT (by element): as gvec_bfdot, but the m operand is a single
 * bfloat16 pair per 128-bit segment, selected by the index in desc.
 */
void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
                            void *va, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    intptr_t index = simd_data(desc);
    intptr_t elements = opr_sz / 4;
    /* 4 float32 elements per 128-bit segment (fewer for 64-bit ops). */
    intptr_t eltspersegment = MIN(16 / 4, elements);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;

    for (i = 0; i < elements; i += eltspersegment) {
        /* One multiplier pair per segment. */
        uint32_t m_idx = m[i + H4(index)];

        for (j = i; j < i + eltspersegment; j++) {
            d[j] = bfdotadd(a[j], n[j], m_idx);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
2768
/*
 * BFMMLA: per 128-bit segment, treat n and m as 2x4 matrices of
 * bfloat16 and accumulate the 2x2 matrix product into the four
 * float32 elements of d, starting from a.  Each bfdotadd consumes
 * one bfloat16 pair (one 32-bit lane).
 */
void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;

    for (s = 0; s < opr_sz / 4; s += 4) {
        float32 sum00, sum01, sum10, sum11;

        /*
         * Process the entire segment at once, writing back the
         * results only after we've consumed all of the inputs.
         *
         * Key to indices by column:
         *          i   j           i   k             j   k
         */
        sum00 = a[s + H4(0 + 0)];
        sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]);
        sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]);

        sum01 = a[s + H4(0 + 1)];
        sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]);
        sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]);

        sum10 = a[s + H4(2 + 0)];
        sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]);
        sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]);

        sum11 = a[s + H4(2 + 1)];
        sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]);
        sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]);

        d[s + H4(0 + 0)] = sum00;
        d[s + H4(0 + 1)] = sum01;
        d[s + H4(2 + 0)] = sum10;
        d[s + H4(2 + 1)] = sum11;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
2808
/*
 * BFMLALB/BFMLALT (vectors): widen the bottom (sel == 0) or top
 * (sel == 1) bfloat16 of each pair to float32 by shifting into the
 * high half, then fused-multiply-accumulate into d.  Unlike BFDOT,
 * this uses the guest float_status passed in stat.
 */
void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
                         void *stat, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    intptr_t sel = simd_data(desc);
    float32 *d = vd, *a = va;
    bfloat16 *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        /* bf16 -> f32 widening is just a 16-bit left shift. */
        float32 nn = n[H2(i * 2 + sel)] << 16;
        float32 mm = m[H2(i * 2 + sel)] << 16;
        d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
2824
/*
 * BFMLALB/BFMLALT (by element): as gvec_bfmlal, but the multiplier is
 * a single bfloat16 element of m per 128-bit segment, selected by the
 * index in desc.
 */
void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
                             void *va, void *stat, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
    intptr_t elements = opr_sz / 4;
    /* 4 float32 elements per 128-bit segment. */
    intptr_t eltspersegment = MIN(16 / 4, elements);
    float32 *d = vd, *a = va;
    bfloat16 *n = vn, *m = vm;

    for (i = 0; i < elements; i += eltspersegment) {
        /* Widen the per-segment multiplier once (bf16 -> f32 shift). */
        float32 m_idx = m[H2(2 * i + index)] << 16;

        for (j = i; j < i + eltspersegment; j++) {
            float32 n_j = n[H2(2 * j + sel)] << 16;
            d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
2846
/*
 * SME2/SVE2p1 CLAMP: d[i] = MIN(MAX(a[i], n[i]), m[i]), i.e. clamp a
 * between a lower bound from n and an upper bound from m.
 */
#define DO_CLAMP(NAME, TYPE) \
void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
        TYPE aa = *(TYPE *)(a + i);                                     \
        TYPE nn = *(TYPE *)(n + i);                                     \
        TYPE mm = *(TYPE *)(m + i);                                     \
        TYPE dd = MIN(MAX(aa, nn), mm);                                 \
        *(TYPE *)(d + i) = dd;                                          \
    }                                                                   \
    clear_tail(d, opr_sz, simd_maxsz(desc));                            \
}

DO_CLAMP(gvec_sclamp_b, int8_t)
DO_CLAMP(gvec_sclamp_h, int16_t)
DO_CLAMP(gvec_sclamp_s, int32_t)
DO_CLAMP(gvec_sclamp_d, int64_t)

DO_CLAMP(gvec_uclamp_b, uint8_t)
DO_CLAMP(gvec_uclamp_h, uint16_t)
DO_CLAMP(gvec_uclamp_s, uint32_t)
DO_CLAMP(gvec_uclamp_d, uint64_t)
2870