xref: /qemu/target/arm/tcg/vec_helper.c (revision cf1b2cab)
1 /*
2  * ARM AdvSIMD / SVE Vector Operations
3  *
4  * Copyright (c) 2018 Linaro
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/helper-proto.h"
23 #include "tcg/tcg-gvec-desc.h"
24 #include "fpu/softfloat.h"
25 #include "qemu/int128.h"
26 #include "crypto/clmul.h"
27 #include "vec_internal.h"
28 
29 /*
30  * Data for expanding active predicate bits to bytes, for byte elements.
31  *
32  *  for (i = 0; i < 256; ++i) {
33  *      unsigned long m = 0;
34  *      for (j = 0; j < 8; j++) {
35  *          if ((i >> j) & 1) {
36  *              m |= 0xfful << (j << 3);
37  *          }
38  *      }
39  *      printf("0x%016lx,\n", m);
40  *  }
41  */
/*
 * Lookup table indexed by an 8-bit predicate value: byte j of entry i is
 * 0xff when bit j of i is set and 0x00 otherwise (see the generator loop
 * in the comment above this table).
 */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};
130 
131 /*
132  * Similarly for half-word elements.
133  *  for (i = 0; i < 256; ++i) {
134  *      unsigned long m = 0;
135  *      if (i & 0xaa) {
136  *          continue;
137  *      }
138  *      for (j = 0; j < 8; j += 2) {
139  *          if ((i >> j) & 1) {
140  *              m |= 0xfffful << (j << 3);
141  *          }
142  *      }
143  *      printf("[0x%x] = 0x%016lx,\n", i, m);
144  *  }
145  */
/*
 * Sparse lookup table for 16-bit elements: only the even bits (0, 2, 4, 6)
 * of the index are meaningful, each one expanding to a 0xffff half-word.
 * Indices with any odd bit set have no initializer and therefore read as
 * zero; the largest valid index is 0x55, hence the 0x55 + 1 array size.
 */
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};
156 
157 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
/*
 * Core of the 8-bit saturating doubling multiply(-accumulate) helpers.
 *
 * The architectural expression
 *   ((src3 << 8) + ((src1 * src2) << 1) + (round << 7)) >> 8
 * is evaluated with one fewer bit of scaling:
 *   ((src3 << 7) + (src1 * src2) + (round << 6)) >> 7
 * The product is negated first when @neg is set.  The shifted sum is
 * clamped to [INT8_MIN, INT8_MAX].
 */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    int32_t product = (int32_t)src1 * src2;
    int32_t addend = ((int32_t)src3 << 7) + (round << 6);
    int32_t scaled;

    /* Every term is far below 2^31, so the 32-bit sum cannot overflow. */
    scaled = ((neg ? -product : product) + addend) >> 7;

    if (scaled > INT8_MAX) {
        return INT8_MAX;
    }
    if (scaled < INT8_MIN) {
        return INT8_MIN;
    }
    return scaled;
}
178 
179 void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
180                              void *va, uint32_t desc)
181 {
182     intptr_t i, opr_sz = simd_oprsz(desc);
183     int8_t *d = vd, *n = vn, *m = vm, *a = va;
184 
185     for (i = 0; i < opr_sz; ++i) {
186         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
187     }
188 }
189 
190 void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
191                              void *va, uint32_t desc)
192 {
193     intptr_t i, opr_sz = simd_oprsz(desc);
194     int8_t *d = vd, *n = vn, *m = vm, *a = va;
195 
196     for (i = 0; i < opr_sz; ++i) {
197         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
198     }
199 }
200 
201 void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
202 {
203     intptr_t i, opr_sz = simd_oprsz(desc);
204     int8_t *d = vd, *n = vn, *m = vm;
205 
206     for (i = 0; i < opr_sz; ++i) {
207         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
208     }
209 }
210 
211 void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
212 {
213     intptr_t i, opr_sz = simd_oprsz(desc);
214     int8_t *d = vd, *n = vn, *m = vm;
215 
216     for (i = 0; i < opr_sz; ++i) {
217         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
218     }
219 }
220 
221 /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
/*
 * 16-bit counterpart of do_sqrdmlah_b:
 *   ((src3 << 15) + (+/-)(src1 * src2) + (round << 14)) >> 15
 * clamped to [INT16_MIN, INT16_MAX].  *@sat is set to 1 when clamping
 * occurs and is left untouched otherwise.
 */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    int32_t product = (int32_t)src1 * src2;
    int32_t addend = ((int32_t)src3 << 15) + (round << 14);
    int32_t scaled = ((neg ? -product : product) + addend) >> 15;

    if (scaled == (int16_t)scaled) {
        return scaled;
    }

    /* Out of range for 16 bits: flag it and clamp towards the sign. */
    *sat = 1;
    return scaled < 0 ? INT16_MIN : INT16_MAX;
}
239 
240 uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
241                                   uint32_t src2, uint32_t src3)
242 {
243     uint32_t *sat = &env->vfp.qc[0];
244     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
245     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
246                                 false, true, sat);
247     return deposit32(e1, 16, 16, e2);
248 }
249 
250 void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
251                               void *vq, uint32_t desc)
252 {
253     uintptr_t opr_sz = simd_oprsz(desc);
254     int16_t *d = vd;
255     int16_t *n = vn;
256     int16_t *m = vm;
257     uintptr_t i;
258 
259     for (i = 0; i < opr_sz / 2; ++i) {
260         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
261     }
262     clear_tail(d, opr_sz, simd_maxsz(desc));
263 }
264 
265 uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
266                                   uint32_t src2, uint32_t src3)
267 {
268     uint32_t *sat = &env->vfp.qc[0];
269     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
270     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
271                                 true, true, sat);
272     return deposit32(e1, 16, 16, e2);
273 }
274 
275 void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
276                               void *vq, uint32_t desc)
277 {
278     uintptr_t opr_sz = simd_oprsz(desc);
279     int16_t *d = vd;
280     int16_t *n = vn;
281     int16_t *m = vm;
282     uintptr_t i;
283 
284     for (i = 0; i < opr_sz / 2; ++i) {
285         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
286     }
287     clear_tail(d, opr_sz, simd_maxsz(desc));
288 }
289 
290 void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
291                             void *vq, uint32_t desc)
292 {
293     intptr_t i, opr_sz = simd_oprsz(desc);
294     int16_t *d = vd, *n = vn, *m = vm;
295 
296     for (i = 0; i < opr_sz / 2; ++i) {
297         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
298     }
299     clear_tail(d, opr_sz, simd_maxsz(desc));
300 }
301 
302 void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
303                              void *vq, uint32_t desc)
304 {
305     intptr_t i, opr_sz = simd_oprsz(desc);
306     int16_t *d = vd, *n = vn, *m = vm;
307 
308     for (i = 0; i < opr_sz / 2; ++i) {
309         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
310     }
311     clear_tail(d, opr_sz, simd_maxsz(desc));
312 }
313 
314 void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
315                              void *va, uint32_t desc)
316 {
317     intptr_t i, opr_sz = simd_oprsz(desc);
318     int16_t *d = vd, *n = vn, *m = vm, *a = va;
319     uint32_t discard;
320 
321     for (i = 0; i < opr_sz / 2; ++i) {
322         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
323     }
324 }
325 
326 void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
327                              void *va, uint32_t desc)
328 {
329     intptr_t i, opr_sz = simd_oprsz(desc);
330     int16_t *d = vd, *n = vn, *m = vm, *a = va;
331     uint32_t discard;
332 
333     for (i = 0; i < opr_sz / 2; ++i) {
334         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
335     }
336 }
337 
338 void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
339 {
340     intptr_t i, opr_sz = simd_oprsz(desc);
341     int16_t *d = vd, *n = vn, *m = vm;
342     uint32_t discard;
343 
344     for (i = 0; i < opr_sz / 2; ++i) {
345         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
346     }
347 }
348 
349 void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
350 {
351     intptr_t i, opr_sz = simd_oprsz(desc);
352     int16_t *d = vd, *n = vn, *m = vm;
353     uint32_t discard;
354 
355     for (i = 0; i < opr_sz / 2; ++i) {
356         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
357     }
358 }
359 
360 void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
361 {
362     intptr_t i, j, opr_sz = simd_oprsz(desc);
363     int idx = simd_data(desc);
364     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
365     uint32_t discard;
366 
367     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
368         int16_t mm = m[i];
369         for (j = 0; j < 16 / 2; ++j) {
370             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
371         }
372     }
373 }
374 
375 void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
376 {
377     intptr_t i, j, opr_sz = simd_oprsz(desc);
378     int idx = simd_data(desc);
379     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
380     uint32_t discard;
381 
382     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
383         int16_t mm = m[i];
384         for (j = 0; j < 16 / 2; ++j) {
385             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
386         }
387     }
388 }
389 
390 /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
/*
 * 32-bit counterpart of do_sqrdmlah_b, evaluated in 64-bit arithmetic:
 *   ((src3 << 31) + (+/-)(src1 * src2) + (round << 30)) >> 31
 * clamped to [INT32_MIN, INT32_MAX].  *@sat is set to 1 when clamping
 * occurs and is left untouched otherwise.
 */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    int64_t product = (int64_t)src1 * src2;
    int64_t addend = ((int64_t)src3 << 31) + (round << 30);
    int64_t scaled = ((neg ? -product : product) + addend) >> 31;

    if (scaled == (int32_t)scaled) {
        return scaled;
    }

    /* Out of range for 32 bits: flag it and clamp towards the sign. */
    *sat = 1;
    return scaled < 0 ? INT32_MIN : INT32_MAX;
}
408 
409 uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
410                                   int32_t src2, int32_t src3)
411 {
412     uint32_t *sat = &env->vfp.qc[0];
413     return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
414 }
415 
416 void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
417                               void *vq, uint32_t desc)
418 {
419     uintptr_t opr_sz = simd_oprsz(desc);
420     int32_t *d = vd;
421     int32_t *n = vn;
422     int32_t *m = vm;
423     uintptr_t i;
424 
425     for (i = 0; i < opr_sz / 4; ++i) {
426         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
427     }
428     clear_tail(d, opr_sz, simd_maxsz(desc));
429 }
430 
431 uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
432                                   int32_t src2, int32_t src3)
433 {
434     uint32_t *sat = &env->vfp.qc[0];
435     return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
436 }
437 
438 void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
439                               void *vq, uint32_t desc)
440 {
441     uintptr_t opr_sz = simd_oprsz(desc);
442     int32_t *d = vd;
443     int32_t *n = vn;
444     int32_t *m = vm;
445     uintptr_t i;
446 
447     for (i = 0; i < opr_sz / 4; ++i) {
448         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
449     }
450     clear_tail(d, opr_sz, simd_maxsz(desc));
451 }
452 
453 void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
454                             void *vq, uint32_t desc)
455 {
456     intptr_t i, opr_sz = simd_oprsz(desc);
457     int32_t *d = vd, *n = vn, *m = vm;
458 
459     for (i = 0; i < opr_sz / 4; ++i) {
460         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
461     }
462     clear_tail(d, opr_sz, simd_maxsz(desc));
463 }
464 
465 void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
466                              void *vq, uint32_t desc)
467 {
468     intptr_t i, opr_sz = simd_oprsz(desc);
469     int32_t *d = vd, *n = vn, *m = vm;
470 
471     for (i = 0; i < opr_sz / 4; ++i) {
472         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
473     }
474     clear_tail(d, opr_sz, simd_maxsz(desc));
475 }
476 
477 void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
478                              void *va, uint32_t desc)
479 {
480     intptr_t i, opr_sz = simd_oprsz(desc);
481     int32_t *d = vd, *n = vn, *m = vm, *a = va;
482     uint32_t discard;
483 
484     for (i = 0; i < opr_sz / 4; ++i) {
485         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
486     }
487 }
488 
489 void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
490                              void *va, uint32_t desc)
491 {
492     intptr_t i, opr_sz = simd_oprsz(desc);
493     int32_t *d = vd, *n = vn, *m = vm, *a = va;
494     uint32_t discard;
495 
496     for (i = 0; i < opr_sz / 4; ++i) {
497         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
498     }
499 }
500 
501 void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
502 {
503     intptr_t i, opr_sz = simd_oprsz(desc);
504     int32_t *d = vd, *n = vn, *m = vm;
505     uint32_t discard;
506 
507     for (i = 0; i < opr_sz / 4; ++i) {
508         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
509     }
510 }
511 
512 void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
513 {
514     intptr_t i, opr_sz = simd_oprsz(desc);
515     int32_t *d = vd, *n = vn, *m = vm;
516     uint32_t discard;
517 
518     for (i = 0; i < opr_sz / 4; ++i) {
519         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
520     }
521 }
522 
523 void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
524 {
525     intptr_t i, j, opr_sz = simd_oprsz(desc);
526     int idx = simd_data(desc);
527     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
528     uint32_t discard;
529 
530     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
531         int32_t mm = m[i];
532         for (j = 0; j < 16 / 4; ++j) {
533             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
534         }
535     }
536 }
537 
538 void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
539 {
540     intptr_t i, j, opr_sz = simd_oprsz(desc);
541     int idx = simd_data(desc);
542     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
543     uint32_t discard;
544 
545     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
546         int32_t mm = m[i];
547         for (j = 0; j < 16 / 4; ++j) {
548             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
549         }
550     }
551 }
552 
553 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
554 static int64_t do_sat128_d(Int128 r)
555 {
556     int64_t ls = int128_getlo(r);
557     int64_t hs = int128_gethi(r);
558 
559     if (unlikely(hs != (ls >> 63))) {
560         return hs < 0 ? INT64_MIN : INT64_MAX;
561     }
562     return ls;
563 }
564 
565 int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
566 {
567     uint64_t l, h;
568     Int128 r, t;
569 
570     /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
571     muls64(&l, &h, m, n);
572     r = int128_make128(l, h);
573     if (neg) {
574         r = int128_neg(r);
575     }
576     if (a) {
577         t = int128_exts64(a);
578         t = int128_lshift(t, 63);
579         r = int128_add(r, t);
580     }
581     if (round) {
582         t = int128_exts64(1ll << 62);
583         r = int128_add(r, t);
584     }
585     r = int128_rshift(r, 63);
586 
587     return do_sat128_d(r);
588 }
589 
590 void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
591                              void *va, uint32_t desc)
592 {
593     intptr_t i, opr_sz = simd_oprsz(desc);
594     int64_t *d = vd, *n = vn, *m = vm, *a = va;
595 
596     for (i = 0; i < opr_sz / 8; ++i) {
597         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
598     }
599 }
600 
601 void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
602                              void *va, uint32_t desc)
603 {
604     intptr_t i, opr_sz = simd_oprsz(desc);
605     int64_t *d = vd, *n = vn, *m = vm, *a = va;
606 
607     for (i = 0; i < opr_sz / 8; ++i) {
608         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
609     }
610 }
611 
612 void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
613 {
614     intptr_t i, opr_sz = simd_oprsz(desc);
615     int64_t *d = vd, *n = vn, *m = vm;
616 
617     for (i = 0; i < opr_sz / 8; ++i) {
618         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
619     }
620 }
621 
622 void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
623 {
624     intptr_t i, opr_sz = simd_oprsz(desc);
625     int64_t *d = vd, *n = vn, *m = vm;
626 
627     for (i = 0; i < opr_sz / 8; ++i) {
628         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
629     }
630 }
631 
632 void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
633 {
634     intptr_t i, j, opr_sz = simd_oprsz(desc);
635     int idx = simd_data(desc);
636     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
637 
638     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
639         int64_t mm = m[i];
640         for (j = 0; j < 16 / 8; ++j) {
641             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
642         }
643     }
644 }
645 
646 void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
647 {
648     intptr_t i, j, opr_sz = simd_oprsz(desc);
649     int idx = simd_data(desc);
650     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
651 
652     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
653         int64_t mm = m[i];
654         for (j = 0; j < 16 / 8; ++j) {
655             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
656         }
657     }
658 }
659 
660 /* Integer 8 and 16-bit dot-product.
661  *
662  * Note that for the loops herein, host endianness does not matter
663  * with respect to the ordering of data within the quad-width lanes.
664  * All elements are treated equally, no matter where they are.
665  */
666 
667 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
668 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
669 {                                                                         \
670     intptr_t i, opr_sz = simd_oprsz(desc);                                \
671     TYPED *d = vd, *a = va;                                               \
672     TYPEN *n = vn;                                                        \
673     TYPEM *m = vm;                                                        \
674     for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
675         d[i] = (a[i] +                                                    \
676                 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
677                 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
678                 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
679                 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
680     }                                                                     \
681     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
682 }
683 
684 DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
685 DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
686 DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
687 DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
688 DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
689 
690 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
691 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
692 {                                                                         \
693     intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
694     intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
695     intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
696     intptr_t index = simd_data(desc);                                     \
697     TYPED *d = vd, *a = va;                                               \
698     TYPEN *n = vn;                                                        \
699     TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
700     do {                                                                  \
701         TYPED m0 = m_indexed[i * 4 + 0];                                  \
702         TYPED m1 = m_indexed[i * 4 + 1];                                  \
703         TYPED m2 = m_indexed[i * 4 + 2];                                  \
704         TYPED m3 = m_indexed[i * 4 + 3];                                  \
705         do {                                                              \
706             d[i] = (a[i] +                                                \
707                     n[i * 4 + 0] * m0 +                                   \
708                     n[i * 4 + 1] * m1 +                                   \
709                     n[i * 4 + 2] * m2 +                                   \
710                     n[i * 4 + 3] * m3);                                   \
711         } while (++i < segend);                                           \
712         segend = i + 4;                                                   \
713     } while (i < opr_sz_n);                                               \
714     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
715 }
716 
717 DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
718 DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
719 DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
720 DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
721 DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
722 DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
723 
724 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
725                          void *vfpst, uint32_t desc)
726 {
727     uintptr_t opr_sz = simd_oprsz(desc);
728     float16 *d = vd;
729     float16 *n = vn;
730     float16 *m = vm;
731     float_status *fpst = vfpst;
732     uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
733     uint32_t neg_imag = neg_real ^ 1;
734     uintptr_t i;
735 
736     /* Shift boolean to the sign bit so we can xor to negate.  */
737     neg_real <<= 15;
738     neg_imag <<= 15;
739 
740     for (i = 0; i < opr_sz / 2; i += 2) {
741         float16 e0 = n[H2(i)];
742         float16 e1 = m[H2(i + 1)] ^ neg_imag;
743         float16 e2 = n[H2(i + 1)];
744         float16 e3 = m[H2(i)] ^ neg_real;
745 
746         d[H2(i)] = float16_add(e0, e1, fpst);
747         d[H2(i + 1)] = float16_add(e2, e3, fpst);
748     }
749     clear_tail(d, opr_sz, simd_maxsz(desc));
750 }
751 
752 void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
753                          void *vfpst, uint32_t desc)
754 {
755     uintptr_t opr_sz = simd_oprsz(desc);
756     float32 *d = vd;
757     float32 *n = vn;
758     float32 *m = vm;
759     float_status *fpst = vfpst;
760     uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
761     uint32_t neg_imag = neg_real ^ 1;
762     uintptr_t i;
763 
764     /* Shift boolean to the sign bit so we can xor to negate.  */
765     neg_real <<= 31;
766     neg_imag <<= 31;
767 
768     for (i = 0; i < opr_sz / 4; i += 2) {
769         float32 e0 = n[H4(i)];
770         float32 e1 = m[H4(i + 1)] ^ neg_imag;
771         float32 e2 = n[H4(i + 1)];
772         float32 e3 = m[H4(i)] ^ neg_real;
773 
774         d[H4(i)] = float32_add(e0, e1, fpst);
775         d[H4(i + 1)] = float32_add(e2, e3, fpst);
776     }
777     clear_tail(d, opr_sz, simd_maxsz(desc));
778 }
779 
780 void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
781                          void *vfpst, uint32_t desc)
782 {
783     uintptr_t opr_sz = simd_oprsz(desc);
784     float64 *d = vd;
785     float64 *n = vn;
786     float64 *m = vm;
787     float_status *fpst = vfpst;
788     uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
789     uint64_t neg_imag = neg_real ^ 1;
790     uintptr_t i;
791 
792     /* Shift boolean to the sign bit so we can xor to negate.  */
793     neg_real <<= 63;
794     neg_imag <<= 63;
795 
796     for (i = 0; i < opr_sz / 8; i += 2) {
797         float64 e0 = n[i];
798         float64 e1 = m[i + 1] ^ neg_imag;
799         float64 e2 = n[i + 1];
800         float64 e3 = m[i] ^ neg_real;
801 
802         d[i] = float64_add(e0, e1, fpst);
803         d[i + 1] = float64_add(e2, e3, fpst);
804     }
805     clear_tail(d, opr_sz, simd_maxsz(desc));
806 }
807 
808 void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
809                          void *vfpst, uint32_t desc)
810 {
811     uintptr_t opr_sz = simd_oprsz(desc);
812     float16 *d = vd, *n = vn, *m = vm, *a = va;
813     float_status *fpst = vfpst;
814     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
815     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
816     uint32_t neg_real = flip ^ neg_imag;
817     uintptr_t i;
818 
819     /* Shift boolean to the sign bit so we can xor to negate.  */
820     neg_real <<= 15;
821     neg_imag <<= 15;
822 
823     for (i = 0; i < opr_sz / 2; i += 2) {
824         float16 e2 = n[H2(i + flip)];
825         float16 e1 = m[H2(i + flip)] ^ neg_real;
826         float16 e4 = e2;
827         float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;
828 
829         d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
830         d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
831     }
832     clear_tail(d, opr_sz, simd_maxsz(desc));
833 }
834 
835 void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
836                              void *vfpst, uint32_t desc)
837 {
838     uintptr_t opr_sz = simd_oprsz(desc);
839     float16 *d = vd, *n = vn, *m = vm, *a = va;
840     float_status *fpst = vfpst;
841     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
842     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
843     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
844     uint32_t neg_real = flip ^ neg_imag;
845     intptr_t elements = opr_sz / sizeof(float16);
846     intptr_t eltspersegment = 16 / sizeof(float16);
847     intptr_t i, j;
848 
849     /* Shift boolean to the sign bit so we can xor to negate.  */
850     neg_real <<= 15;
851     neg_imag <<= 15;
852 
853     for (i = 0; i < elements; i += eltspersegment) {
854         float16 mr = m[H2(i + 2 * index + 0)];
855         float16 mi = m[H2(i + 2 * index + 1)];
856         float16 e1 = neg_real ^ (flip ? mi : mr);
857         float16 e3 = neg_imag ^ (flip ? mr : mi);
858 
859         for (j = i; j < i + eltspersegment; j += 2) {
860             float16 e2 = n[H2(j + flip)];
861             float16 e4 = e2;
862 
863             d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
864             d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
865         }
866     }
867     clear_tail(d, opr_sz, simd_maxsz(desc));
868 }
869 
870 void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
871                          void *vfpst, uint32_t desc)
872 {
873     uintptr_t opr_sz = simd_oprsz(desc);
874     float32 *d = vd, *n = vn, *m = vm, *a = va;
875     float_status *fpst = vfpst;
876     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
877     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
878     uint32_t neg_real = flip ^ neg_imag;
879     uintptr_t i;
880 
881     /* Shift boolean to the sign bit so we can xor to negate.  */
882     neg_real <<= 31;
883     neg_imag <<= 31;
884 
885     for (i = 0; i < opr_sz / 4; i += 2) {
886         float32 e2 = n[H4(i + flip)];
887         float32 e1 = m[H4(i + flip)] ^ neg_real;
888         float32 e4 = e2;
889         float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;
890 
891         d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
892         d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
893     }
894     clear_tail(d, opr_sz, simd_maxsz(desc));
895 }
896 
897 void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
898                              void *vfpst, uint32_t desc)
899 {
900     uintptr_t opr_sz = simd_oprsz(desc);
901     float32 *d = vd, *n = vn, *m = vm, *a = va;
902     float_status *fpst = vfpst;
903     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
904     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
905     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
906     uint32_t neg_real = flip ^ neg_imag;
907     intptr_t elements = opr_sz / sizeof(float32);
908     intptr_t eltspersegment = 16 / sizeof(float32);
909     intptr_t i, j;
910 
911     /* Shift boolean to the sign bit so we can xor to negate.  */
912     neg_real <<= 31;
913     neg_imag <<= 31;
914 
915     for (i = 0; i < elements; i += eltspersegment) {
916         float32 mr = m[H4(i + 2 * index + 0)];
917         float32 mi = m[H4(i + 2 * index + 1)];
918         float32 e1 = neg_real ^ (flip ? mi : mr);
919         float32 e3 = neg_imag ^ (flip ? mr : mi);
920 
921         for (j = i; j < i + eltspersegment; j += 2) {
922             float32 e2 = n[H4(j + flip)];
923             float32 e4 = e2;
924 
925             d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
926             d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
927         }
928     }
929     clear_tail(d, opr_sz, simd_maxsz(desc));
930 }
931 
932 void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
933                          void *vfpst, uint32_t desc)
934 {
935     uintptr_t opr_sz = simd_oprsz(desc);
936     float64 *d = vd, *n = vn, *m = vm, *a = va;
937     float_status *fpst = vfpst;
938     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
939     uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
940     uint64_t neg_real = flip ^ neg_imag;
941     uintptr_t i;
942 
943     /* Shift boolean to the sign bit so we can xor to negate.  */
944     neg_real <<= 63;
945     neg_imag <<= 63;
946 
947     for (i = 0; i < opr_sz / 8; i += 2) {
948         float64 e2 = n[i + flip];
949         float64 e1 = m[i + flip] ^ neg_real;
950         float64 e4 = e2;
951         float64 e3 = m[i + 1 - flip] ^ neg_imag;
952 
953         d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
954         d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
955     }
956     clear_tail(d, opr_sz, simd_maxsz(desc));
957 }
958 
959 /*
960  * Floating point comparisons producing an integer result (all 1s or all 0s).
961  * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
962  * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
963  */
964 static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
965 {
966     return -float16_eq_quiet(op1, op2, stat);
967 }
968 
969 static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
970 {
971     return -float32_eq_quiet(op1, op2, stat);
972 }
973 
974 static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
975 {
976     return -float16_le(op2, op1, stat);
977 }
978 
979 static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
980 {
981     return -float32_le(op2, op1, stat);
982 }
983 
984 static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
985 {
986     return -float16_lt(op2, op1, stat);
987 }
988 
989 static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
990 {
991     return -float32_lt(op2, op1, stat);
992 }
993 
994 static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
995 {
996     return -float16_le(float16_abs(op2), float16_abs(op1), stat);
997 }
998 
999 static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
1000 {
1001     return -float32_le(float32_abs(op2), float32_abs(op1), stat);
1002 }
1003 
1004 static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
1005 {
1006     return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
1007 }
1008 
1009 static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
1010 {
1011     return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
1012 }
1013 
1014 static int16_t vfp_tosszh(float16 x, void *fpstp)
1015 {
1016     float_status *fpst = fpstp;
1017     if (float16_is_any_nan(x)) {
1018         float_raise(float_flag_invalid, fpst);
1019         return 0;
1020     }
1021     return float16_to_int16_round_to_zero(x, fpst);
1022 }
1023 
1024 static uint16_t vfp_touszh(float16 x, void *fpstp)
1025 {
1026     float_status *fpst = fpstp;
1027     if (float16_is_any_nan(x)) {
1028         float_raise(float_flag_invalid, fpst);
1029         return 0;
1030     }
1031     return float16_to_uint16_round_to_zero(x, fpst);
1032 }
1033 
1034 #define DO_2OP(NAME, FUNC, TYPE) \
1035 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
1036 {                                                                 \
1037     intptr_t i, oprsz = simd_oprsz(desc);                         \
1038     TYPE *d = vd, *n = vn;                                        \
1039     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
1040         d[i] = FUNC(n[i], stat);                                  \
1041     }                                                             \
1042     clear_tail(d, oprsz, simd_maxsz(desc));                       \
1043 }
1044 
1045 DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
1046 DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
1047 DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
1048 
1049 DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
1050 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
1051 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
1052 
1053 DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
1054 DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
1055 
1056 DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
1057 DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
1058 DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
1059 DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
1060 DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
1061 DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
1062 DO_2OP(gvec_tosszh, vfp_tosszh, float16)
1063 DO_2OP(gvec_touszh, vfp_touszh, float16)
1064 
1065 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
1066     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
1067     {                                                           \
1068         return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
1069     }
1070 
1071 #define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
1072     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)    \
1073     {                                                           \
1074         return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
1075     }
1076 
1077 #define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
1078     WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
1079     WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
1080     DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
1081     DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)
1082 
1083 DO_2OP_CMP0(cgt, cgt, FWD)
1084 DO_2OP_CMP0(cge, cge, FWD)
1085 DO_2OP_CMP0(ceq, ceq, FWD)
1086 DO_2OP_CMP0(clt, cgt, REV)
1087 DO_2OP_CMP0(cle, cge, REV)
1088 
1089 #undef DO_2OP
1090 #undef DO_2OP_CMP0
1091 
1092 /* Floating-point trigonometric starting value.
1093  * See the ARM ARM pseudocode function FPTrigSMul.
1094  */
1095 static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
1096 {
1097     float16 result = float16_mul(op1, op1, stat);
1098     if (!float16_is_any_nan(result)) {
1099         result = float16_set_sign(result, op2 & 1);
1100     }
1101     return result;
1102 }
1103 
1104 static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
1105 {
1106     float32 result = float32_mul(op1, op1, stat);
1107     if (!float32_is_any_nan(result)) {
1108         result = float32_set_sign(result, op2 & 1);
1109     }
1110     return result;
1111 }
1112 
1113 static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
1114 {
1115     float64 result = float64_mul(op1, op1, stat);
1116     if (!float64_is_any_nan(result)) {
1117         result = float64_set_sign(result, op2 & 1);
1118     }
1119     return result;
1120 }
1121 
1122 static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
1123 {
1124     return float16_abs(float16_sub(op1, op2, stat));
1125 }
1126 
1127 static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
1128 {
1129     return float32_abs(float32_sub(op1, op2, stat));
1130 }
1131 
1132 /*
1133  * Reciprocal step. These are the AArch32 version which uses a
1134  * non-fused multiply-and-subtract.
1135  */
1136 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
1137 {
1138     op1 = float16_squash_input_denormal(op1, stat);
1139     op2 = float16_squash_input_denormal(op2, stat);
1140 
1141     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1142         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1143         return float16_two;
1144     }
1145     return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
1146 }
1147 
1148 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
1149 {
1150     op1 = float32_squash_input_denormal(op1, stat);
1151     op2 = float32_squash_input_denormal(op2, stat);
1152 
1153     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1154         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1155         return float32_two;
1156     }
1157     return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
1158 }
1159 
1160 /* Reciprocal square-root step. AArch32 non-fused semantics. */
1161 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
1162 {
1163     op1 = float16_squash_input_denormal(op1, stat);
1164     op2 = float16_squash_input_denormal(op2, stat);
1165 
1166     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1167         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1168         return float16_one_point_five;
1169     }
1170     op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
1171     return float16_div(op1, float16_two, stat);
1172 }
1173 
1174 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
1175 {
1176     op1 = float32_squash_input_denormal(op1, stat);
1177     op2 = float32_squash_input_denormal(op2, stat);
1178 
1179     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1180         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1181         return float32_one_point_five;
1182     }
1183     op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
1184     return float32_div(op1, float32_two, stat);
1185 }
1186 
1187 #define DO_3OP(NAME, FUNC, TYPE) \
1188 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1189 {                                                                          \
1190     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1191     TYPE *d = vd, *n = vn, *m = vm;                                        \
1192     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1193         d[i] = FUNC(n[i], m[i], stat);                                     \
1194     }                                                                      \
1195     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1196 }
1197 
1198 DO_3OP(gvec_fadd_h, float16_add, float16)
1199 DO_3OP(gvec_fadd_s, float32_add, float32)
1200 DO_3OP(gvec_fadd_d, float64_add, float64)
1201 
1202 DO_3OP(gvec_fsub_h, float16_sub, float16)
1203 DO_3OP(gvec_fsub_s, float32_sub, float32)
1204 DO_3OP(gvec_fsub_d, float64_sub, float64)
1205 
1206 DO_3OP(gvec_fmul_h, float16_mul, float16)
1207 DO_3OP(gvec_fmul_s, float32_mul, float32)
1208 DO_3OP(gvec_fmul_d, float64_mul, float64)
1209 
1210 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
1211 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
1212 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
1213 
1214 DO_3OP(gvec_fabd_h, float16_abd, float16)
1215 DO_3OP(gvec_fabd_s, float32_abd, float32)
1216 
1217 DO_3OP(gvec_fceq_h, float16_ceq, float16)
1218 DO_3OP(gvec_fceq_s, float32_ceq, float32)
1219 
1220 DO_3OP(gvec_fcge_h, float16_cge, float16)
1221 DO_3OP(gvec_fcge_s, float32_cge, float32)
1222 
1223 DO_3OP(gvec_fcgt_h, float16_cgt, float16)
1224 DO_3OP(gvec_fcgt_s, float32_cgt, float32)
1225 
1226 DO_3OP(gvec_facge_h, float16_acge, float16)
1227 DO_3OP(gvec_facge_s, float32_acge, float32)
1228 
1229 DO_3OP(gvec_facgt_h, float16_acgt, float16)
1230 DO_3OP(gvec_facgt_s, float32_acgt, float32)
1231 
1232 DO_3OP(gvec_fmax_h, float16_max, float16)
1233 DO_3OP(gvec_fmax_s, float32_max, float32)
1234 
1235 DO_3OP(gvec_fmin_h, float16_min, float16)
1236 DO_3OP(gvec_fmin_s, float32_min, float32)
1237 
1238 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
1239 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
1240 
1241 DO_3OP(gvec_fminnum_h, float16_minnum, float16)
1242 DO_3OP(gvec_fminnum_s, float32_minnum, float32)
1243 
1244 DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
1245 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
1246 
1247 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
1248 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
1249 
1250 #ifdef TARGET_AARCH64
1251 
1252 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
1253 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
1254 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
1255 
1256 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
1257 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
1258 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
1259 
1260 #endif
1261 #undef DO_3OP
1262 
1263 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
1264 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1265                                  float_status *stat)
1266 {
1267     return float16_add(dest, float16_mul(op1, op2, stat), stat);
1268 }
1269 
1270 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1271                                  float_status *stat)
1272 {
1273     return float32_add(dest, float32_mul(op1, op2, stat), stat);
1274 }
1275 
1276 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1277                                  float_status *stat)
1278 {
1279     return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1280 }
1281 
1282 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1283                                  float_status *stat)
1284 {
1285     return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1286 }
1287 
1288 /* Fused versions; these have the semantics Neon VFMA/VFMS want */
1289 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1290                                 float_status *stat)
1291 {
1292     return float16_muladd(op1, op2, dest, 0, stat);
1293 }
1294 
1295 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1296                                  float_status *stat)
1297 {
1298     return float32_muladd(op1, op2, dest, 0, stat);
1299 }
1300 
1301 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1302                                  float_status *stat)
1303 {
1304     return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1305 }
1306 
1307 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1308                                  float_status *stat)
1309 {
1310     return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1311 }
1312 
1313 #define DO_MULADD(NAME, FUNC, TYPE)                                     \
1314 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1315 {                                                                          \
1316     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1317     TYPE *d = vd, *n = vn, *m = vm;                                        \
1318     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1319         d[i] = FUNC(d[i], n[i], m[i], stat);                               \
1320     }                                                                      \
1321     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1322 }
1323 
1324 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1325 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1326 
1327 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1328 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1329 
1330 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1331 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1332 
1333 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1334 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1335 
1336 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
1337  * For AdvSIMD, there is of course only one such vector segment.
1338  */
1339 
1340 #define DO_MUL_IDX(NAME, TYPE, H) \
1341 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1342 {                                                                          \
1343     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1344     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1345     intptr_t idx = simd_data(desc);                                        \
1346     TYPE *d = vd, *n = vn, *m = vm;                                        \
1347     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1348         TYPE mm = m[H(i + idx)];                                           \
1349         for (j = 0; j < segment; j++) {                                    \
1350             d[i + j] = n[i + j] * mm;                                      \
1351         }                                                                  \
1352     }                                                                      \
1353     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1354 }
1355 
1356 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1357 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1358 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1359 
1360 #undef DO_MUL_IDX
1361 
1362 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1363 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
1364 {                                                                          \
1365     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1366     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1367     intptr_t idx = simd_data(desc);                                        \
1368     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1369     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1370         TYPE mm = m[H(i + idx)];                                           \
1371         for (j = 0; j < segment; j++) {                                    \
1372             d[i + j] = a[i + j] OP n[i + j] * mm;                          \
1373         }                                                                  \
1374     }                                                                      \
1375     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1376 }
1377 
1378 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1379 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1380 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1381 
1382 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1383 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1384 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1385 
1386 #undef DO_MLA_IDX
1387 
1388 #define DO_FMUL_IDX(NAME, ADD, TYPE, H)                                    \
1389 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1390 {                                                                          \
1391     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1392     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1393     intptr_t idx = simd_data(desc);                                        \
1394     TYPE *d = vd, *n = vn, *m = vm;                                        \
1395     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1396         TYPE mm = m[H(i + idx)];                                           \
1397         for (j = 0; j < segment; j++) {                                    \
1398             d[i + j] = TYPE##_##ADD(d[i + j],                              \
1399                                     TYPE##_mul(n[i + j], mm, stat), stat); \
1400         }                                                                  \
1401     }                                                                      \
1402     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1403 }
1404 
1405 #define float16_nop(N, M, S) (M)
1406 #define float32_nop(N, M, S) (M)
1407 #define float64_nop(N, M, S) (M)
1408 
1409 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16, H2)
1410 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32, H4)
1411 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, H8)
1412 
1413 /*
1414  * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1415  * the fused ops below they assume accumulate both from and into Vd.
1416  */
1417 DO_FMUL_IDX(gvec_fmla_nf_idx_h, add, float16, H2)
1418 DO_FMUL_IDX(gvec_fmla_nf_idx_s, add, float32, H4)
1419 DO_FMUL_IDX(gvec_fmls_nf_idx_h, sub, float16, H2)
1420 DO_FMUL_IDX(gvec_fmls_nf_idx_s, sub, float32, H4)
1421 
1422 #undef float16_nop
1423 #undef float32_nop
1424 #undef float64_nop
1425 #undef DO_FMUL_IDX
1426 
1427 #define DO_FMLA_IDX(NAME, TYPE, H)                                         \
1428 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1429                   void *stat, uint32_t desc)                               \
1430 {                                                                          \
1431     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1432     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1433     TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
1434     intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
1435     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1436     op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
1437     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1438         TYPE mm = m[H(i + idx)];                                           \
1439         for (j = 0; j < segment; j++) {                                    \
1440             d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
1441                                      mm, a[i + j], 0, stat);               \
1442         }                                                                  \
1443     }                                                                      \
1444     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1445 }
1446 
1447 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
1448 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
1449 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
1450 
1451 #undef DO_FMLA_IDX
1452 
1453 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1454 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1455 {                                                                          \
1456     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1457     TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1458     bool q = false;                                                        \
1459     for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1460         WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1461         if (dd < MIN) {                                                    \
1462             dd = MIN;                                                      \
1463             q = true;                                                      \
1464         } else if (dd > MAX) {                                             \
1465             dd = MAX;                                                      \
1466             q = true;                                                      \
1467         }                                                                  \
1468         d[i] = dd;                                                         \
1469     }                                                                      \
1470     if (q) {                                                               \
1471         uint32_t *qc = vq;                                                 \
1472         qc[0] = 1;                                                         \
1473     }                                                                      \
1474     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1475 }
1476 
1477 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1478 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1479 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1480 
1481 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1482 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1483 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1484 
1485 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1486 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1487 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1488 
1489 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1490 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1491 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1492 
1493 #undef DO_SAT
1494 
1495 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1496                           void *vm, uint32_t desc)
1497 {
1498     intptr_t i, oprsz = simd_oprsz(desc);
1499     uint64_t *d = vd, *n = vn, *m = vm;
1500     bool q = false;
1501 
1502     for (i = 0; i < oprsz / 8; i++) {
1503         uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1504         if (dd < nn) {
1505             dd = UINT64_MAX;
1506             q = true;
1507         }
1508         d[i] = dd;
1509     }
1510     if (q) {
1511         uint32_t *qc = vq;
1512         qc[0] = 1;
1513     }
1514     clear_tail(d, oprsz, simd_maxsz(desc));
1515 }
1516 
1517 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1518                           void *vm, uint32_t desc)
1519 {
1520     intptr_t i, oprsz = simd_oprsz(desc);
1521     uint64_t *d = vd, *n = vn, *m = vm;
1522     bool q = false;
1523 
1524     for (i = 0; i < oprsz / 8; i++) {
1525         uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1526         if (nn < mm) {
1527             dd = 0;
1528             q = true;
1529         }
1530         d[i] = dd;
1531     }
1532     if (q) {
1533         uint32_t *qc = vq;
1534         qc[0] = 1;
1535     }
1536     clear_tail(d, oprsz, simd_maxsz(desc));
1537 }
1538 
1539 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1540                           void *vm, uint32_t desc)
1541 {
1542     intptr_t i, oprsz = simd_oprsz(desc);
1543     int64_t *d = vd, *n = vn, *m = vm;
1544     bool q = false;
1545 
1546     for (i = 0; i < oprsz / 8; i++) {
1547         int64_t nn = n[i], mm = m[i], dd = nn + mm;
1548         if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1549             dd = (nn >> 63) ^ ~INT64_MIN;
1550             q = true;
1551         }
1552         d[i] = dd;
1553     }
1554     if (q) {
1555         uint32_t *qc = vq;
1556         qc[0] = 1;
1557     }
1558     clear_tail(d, oprsz, simd_maxsz(desc));
1559 }
1560 
1561 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1562                           void *vm, uint32_t desc)
1563 {
1564     intptr_t i, oprsz = simd_oprsz(desc);
1565     int64_t *d = vd, *n = vn, *m = vm;
1566     bool q = false;
1567 
1568     for (i = 0; i < oprsz / 8; i++) {
1569         int64_t nn = n[i], mm = m[i], dd = nn - mm;
1570         if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1571             dd = (nn >> 63) ^ ~INT64_MIN;
1572             q = true;
1573         }
1574         d[i] = dd;
1575     }
1576     if (q) {
1577         uint32_t *qc = vq;
1578         qc[0] = 1;
1579     }
1580     clear_tail(d, oprsz, simd_maxsz(desc));
1581 }
1582 
1583 
1584 #define DO_SRA(NAME, TYPE)                              \
1585 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1586 {                                                       \
1587     intptr_t i, oprsz = simd_oprsz(desc);               \
1588     int shift = simd_data(desc);                        \
1589     TYPE *d = vd, *n = vn;                              \
1590     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1591         d[i] += n[i] >> shift;                          \
1592     }                                                   \
1593     clear_tail(d, oprsz, simd_maxsz(desc));             \
1594 }
1595 
1596 DO_SRA(gvec_ssra_b, int8_t)
1597 DO_SRA(gvec_ssra_h, int16_t)
1598 DO_SRA(gvec_ssra_s, int32_t)
1599 DO_SRA(gvec_ssra_d, int64_t)
1600 
1601 DO_SRA(gvec_usra_b, uint8_t)
1602 DO_SRA(gvec_usra_h, uint16_t)
1603 DO_SRA(gvec_usra_s, uint32_t)
1604 DO_SRA(gvec_usra_d, uint64_t)
1605 
1606 #undef DO_SRA
1607 
1608 #define DO_RSHR(NAME, TYPE)                             \
1609 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1610 {                                                       \
1611     intptr_t i, oprsz = simd_oprsz(desc);               \
1612     int shift = simd_data(desc);                        \
1613     TYPE *d = vd, *n = vn;                              \
1614     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1615         TYPE tmp = n[i] >> (shift - 1);                 \
1616         d[i] = (tmp >> 1) + (tmp & 1);                  \
1617     }                                                   \
1618     clear_tail(d, oprsz, simd_maxsz(desc));             \
1619 }
1620 
1621 DO_RSHR(gvec_srshr_b, int8_t)
1622 DO_RSHR(gvec_srshr_h, int16_t)
1623 DO_RSHR(gvec_srshr_s, int32_t)
1624 DO_RSHR(gvec_srshr_d, int64_t)
1625 
1626 DO_RSHR(gvec_urshr_b, uint8_t)
1627 DO_RSHR(gvec_urshr_h, uint16_t)
1628 DO_RSHR(gvec_urshr_s, uint32_t)
1629 DO_RSHR(gvec_urshr_d, uint64_t)
1630 
1631 #undef DO_RSHR
1632 
1633 #define DO_RSRA(NAME, TYPE)                             \
1634 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1635 {                                                       \
1636     intptr_t i, oprsz = simd_oprsz(desc);               \
1637     int shift = simd_data(desc);                        \
1638     TYPE *d = vd, *n = vn;                              \
1639     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1640         TYPE tmp = n[i] >> (shift - 1);                 \
1641         d[i] += (tmp >> 1) + (tmp & 1);                 \
1642     }                                                   \
1643     clear_tail(d, oprsz, simd_maxsz(desc));             \
1644 }
1645 
1646 DO_RSRA(gvec_srsra_b, int8_t)
1647 DO_RSRA(gvec_srsra_h, int16_t)
1648 DO_RSRA(gvec_srsra_s, int32_t)
1649 DO_RSRA(gvec_srsra_d, int64_t)
1650 
1651 DO_RSRA(gvec_ursra_b, uint8_t)
1652 DO_RSRA(gvec_ursra_h, uint16_t)
1653 DO_RSRA(gvec_ursra_s, uint32_t)
1654 DO_RSRA(gvec_ursra_d, uint64_t)
1655 
1656 #undef DO_RSRA
1657 
1658 #define DO_SRI(NAME, TYPE)                              \
1659 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1660 {                                                       \
1661     intptr_t i, oprsz = simd_oprsz(desc);               \
1662     int shift = simd_data(desc);                        \
1663     TYPE *d = vd, *n = vn;                              \
1664     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1665         d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
1666     }                                                   \
1667     clear_tail(d, oprsz, simd_maxsz(desc));             \
1668 }
1669 
1670 DO_SRI(gvec_sri_b, uint8_t)
1671 DO_SRI(gvec_sri_h, uint16_t)
1672 DO_SRI(gvec_sri_s, uint32_t)
1673 DO_SRI(gvec_sri_d, uint64_t)
1674 
1675 #undef DO_SRI
1676 
1677 #define DO_SLI(NAME, TYPE)                              \
1678 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1679 {                                                       \
1680     intptr_t i, oprsz = simd_oprsz(desc);               \
1681     int shift = simd_data(desc);                        \
1682     TYPE *d = vd, *n = vn;                              \
1683     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1684         d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
1685     }                                                   \
1686     clear_tail(d, oprsz, simd_maxsz(desc));             \
1687 }
1688 
1689 DO_SLI(gvec_sli_b, uint8_t)
1690 DO_SLI(gvec_sli_h, uint16_t)
1691 DO_SLI(gvec_sli_s, uint32_t)
1692 DO_SLI(gvec_sli_d, uint64_t)
1693 
1694 #undef DO_SLI
1695 
1696 /*
1697  * Convert float16 to float32, raising no exceptions and
1698  * preserving exceptional values, including SNaN.
1699  * This is effectively an unpack+repack operation.
1700  */
1701 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
1702 {
1703     const int f16_bias = 15;
1704     const int f32_bias = 127;
1705     uint32_t sign = extract32(f16, 15, 1);
1706     uint32_t exp = extract32(f16, 10, 5);
1707     uint32_t frac = extract32(f16, 0, 10);
1708 
1709     if (exp == 0x1f) {
1710         /* Inf or NaN */
1711         exp = 0xff;
1712     } else if (exp == 0) {
1713         /* Zero or denormal.  */
1714         if (frac != 0) {
1715             if (fz16) {
1716                 frac = 0;
1717             } else {
1718                 /*
1719                  * Denormal; these are all normal float32.
1720                  * Shift the fraction so that the msb is at bit 11,
1721                  * then remove bit 11 as the implicit bit of the
1722                  * normalized float32.  Note that we still go through
1723                  * the shift for normal numbers below, to put the
1724                  * float32 fraction at the right place.
1725                  */
1726                 int shift = clz32(frac) - 21;
1727                 frac = (frac << shift) & 0x3ff;
1728                 exp = f32_bias - f16_bias - shift + 1;
1729             }
1730         }
1731     } else {
1732         /* Normal number; adjust the bias.  */
1733         exp += f32_bias - f16_bias;
1734     }
1735     sign <<= 31;
1736     exp <<= 23;
1737     frac <<= 23 - 10;
1738 
1739     return sign | exp | frac;
1740 }
1741 
/*
 * Branchless fetch of the four packed float16 inputs for FMLAL.
 *
 *   is_q  is_2   result
 *    0     0     ptr[0]          (only the low 32 bits are meaningful)
 *    0     1     ptr[0] >> 32
 *    1     0     ptr[0]
 *    1     1     ptr[1]
 *
 * Both flags are expected to be 0 or 1.
 */
static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
{
    /* Second qword only when both flags are set. */
    int qword = is_q & is_2;
    /* Upper dword of the first qword only for !is_q && is_2. */
    int shift = (is_2 & ~is_q) << 5;

    return ptr[qword] >> shift;
}
1752 
1753 /*
1754  * Note that FMLAL requires oprsz == 8 or oprsz == 16,
1755  * as there is not yet SVE versions that might use blocking.
1756  */
1757 
1758 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
1759                      uint32_t desc, bool fz16)
1760 {
1761     intptr_t i, oprsz = simd_oprsz(desc);
1762     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1763     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1764     int is_q = oprsz == 16;
1765     uint64_t n_4, m_4;
1766 
1767     /* Pre-load all of the f16 data, avoiding overlap issues.  */
1768     n_4 = load4_f16(vn, is_q, is_2);
1769     m_4 = load4_f16(vm, is_q, is_2);
1770 
1771     /* Negate all inputs for FMLSL at once.  */
1772     if (is_s) {
1773         n_4 ^= 0x8000800080008000ull;
1774     }
1775 
1776     for (i = 0; i < oprsz / 4; i++) {
1777         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1778         float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
1779         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1780     }
1781     clear_tail(d, oprsz, simd_maxsz(desc));
1782 }
1783 
1784 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
1785                             void *venv, uint32_t desc)
1786 {
1787     CPUARMState *env = venv;
1788     do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1789              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1790 }
1791 
1792 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
1793                             void *venv, uint32_t desc)
1794 {
1795     CPUARMState *env = venv;
1796     do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
1797              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1798 }
1799 
1800 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
1801                                void *venv, uint32_t desc)
1802 {
1803     intptr_t i, oprsz = simd_oprsz(desc);
1804     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
1805     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
1806     CPUARMState *env = venv;
1807     float_status *status = &env->vfp.fp_status;
1808     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
1809 
1810     for (i = 0; i < oprsz; i += sizeof(float32)) {
1811         float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
1812         float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
1813         float32 nn = float16_to_float32_by_bits(nn_16, fz16);
1814         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
1815         float32 aa = *(float32 *)(va + H1_4(i));
1816 
1817         *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
1818     }
1819 }
1820 
1821 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
1822                          uint32_t desc, bool fz16)
1823 {
1824     intptr_t i, oprsz = simd_oprsz(desc);
1825     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1826     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1827     int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
1828     int is_q = oprsz == 16;
1829     uint64_t n_4;
1830     float32 m_1;
1831 
1832     /* Pre-load all of the f16 data, avoiding overlap issues.  */
1833     n_4 = load4_f16(vn, is_q, is_2);
1834 
1835     /* Negate all inputs for FMLSL at once.  */
1836     if (is_s) {
1837         n_4 ^= 0x8000800080008000ull;
1838     }
1839 
1840     m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
1841 
1842     for (i = 0; i < oprsz / 4; i++) {
1843         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1844         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1845     }
1846     clear_tail(d, oprsz, simd_maxsz(desc));
1847 }
1848 
1849 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
1850                                 void *venv, uint32_t desc)
1851 {
1852     CPUARMState *env = venv;
1853     do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1854                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1855 }
1856 
1857 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
1858                                 void *venv, uint32_t desc)
1859 {
1860     CPUARMState *env = venv;
1861     do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
1862                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1863 }
1864 
1865 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
1866                                void *venv, uint32_t desc)
1867 {
1868     intptr_t i, j, oprsz = simd_oprsz(desc);
1869     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
1870     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
1871     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
1872     CPUARMState *env = venv;
1873     float_status *status = &env->vfp.fp_status;
1874     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
1875 
1876     for (i = 0; i < oprsz; i += 16) {
1877         float16 mm_16 = *(float16 *)(vm + i + idx);
1878         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
1879 
1880         for (j = 0; j < 16; j += sizeof(float32)) {
1881             float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
1882             float32 nn = float16_to_float32_by_bits(nn_16, fz16);
1883             float32 aa = *(float32 *)(va + H1_4(i + j));
1884 
1885             *(float32 *)(vd + H1_4(i + j)) =
1886                 float32_muladd(nn, mm, aa, 0, status);
1887         }
1888     }
1889 }
1890 
1891 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
1892 {
1893     intptr_t i, opr_sz = simd_oprsz(desc);
1894     int8_t *d = vd, *n = vn, *m = vm;
1895 
1896     for (i = 0; i < opr_sz; ++i) {
1897         int8_t mm = m[i];
1898         int8_t nn = n[i];
1899         int8_t res = 0;
1900         if (mm >= 0) {
1901             if (mm < 8) {
1902                 res = nn << mm;
1903             }
1904         } else {
1905             res = nn >> (mm > -8 ? -mm : 7);
1906         }
1907         d[i] = res;
1908     }
1909     clear_tail(d, opr_sz, simd_maxsz(desc));
1910 }
1911 
1912 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
1913 {
1914     intptr_t i, opr_sz = simd_oprsz(desc);
1915     int16_t *d = vd, *n = vn, *m = vm;
1916 
1917     for (i = 0; i < opr_sz / 2; ++i) {
1918         int8_t mm = m[i];   /* only 8 bits of shift are significant */
1919         int16_t nn = n[i];
1920         int16_t res = 0;
1921         if (mm >= 0) {
1922             if (mm < 16) {
1923                 res = nn << mm;
1924             }
1925         } else {
1926             res = nn >> (mm > -16 ? -mm : 15);
1927         }
1928         d[i] = res;
1929     }
1930     clear_tail(d, opr_sz, simd_maxsz(desc));
1931 }
1932 
1933 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
1934 {
1935     intptr_t i, opr_sz = simd_oprsz(desc);
1936     uint8_t *d = vd, *n = vn, *m = vm;
1937 
1938     for (i = 0; i < opr_sz; ++i) {
1939         int8_t mm = m[i];
1940         uint8_t nn = n[i];
1941         uint8_t res = 0;
1942         if (mm >= 0) {
1943             if (mm < 8) {
1944                 res = nn << mm;
1945             }
1946         } else {
1947             if (mm > -8) {
1948                 res = nn >> -mm;
1949             }
1950         }
1951         d[i] = res;
1952     }
1953     clear_tail(d, opr_sz, simd_maxsz(desc));
1954 }
1955 
1956 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
1957 {
1958     intptr_t i, opr_sz = simd_oprsz(desc);
1959     uint16_t *d = vd, *n = vn, *m = vm;
1960 
1961     for (i = 0; i < opr_sz / 2; ++i) {
1962         int8_t mm = m[i];   /* only 8 bits of shift are significant */
1963         uint16_t nn = n[i];
1964         uint16_t res = 0;
1965         if (mm >= 0) {
1966             if (mm < 16) {
1967                 res = nn << mm;
1968             }
1969         } else {
1970             if (mm > -16) {
1971                 res = nn >> -mm;
1972             }
1973         }
1974         d[i] = res;
1975     }
1976     clear_tail(d, opr_sz, simd_maxsz(desc));
1977 }
1978 
1979 /*
1980  * 8x8->8 polynomial multiply.
1981  *
1982  * Polynomial multiplication is like integer multiplication except the
1983  * partial products are XORed, not added.
1984  *
1985  * TODO: expose this as a generic vector operation, as it is a common
1986  * crypto building block.
1987  */
1988 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
1989 {
1990     intptr_t i, opr_sz = simd_oprsz(desc);
1991     uint64_t *d = vd, *n = vn, *m = vm;
1992 
1993     for (i = 0; i < opr_sz / 8; ++i) {
1994         d[i] = clmul_8x8_low(n[i], m[i]);
1995     }
1996     clear_tail(d, opr_sz, simd_maxsz(desc));
1997 }
1998 
1999 /*
2000  * 64x64->128 polynomial multiply.
2001  * Because of the lanes are not accessed in strict columns,
2002  * this probably cannot be turned into a generic helper.
2003  */
2004 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2005 {
2006     intptr_t i, j, opr_sz = simd_oprsz(desc);
2007     intptr_t hi = simd_data(desc);
2008     uint64_t *d = vd, *n = vn, *m = vm;
2009 
2010     for (i = 0; i < opr_sz / 8; i += 2) {
2011         uint64_t nn = n[i + hi];
2012         uint64_t mm = m[i + hi];
2013         uint64_t rhi = 0;
2014         uint64_t rlo = 0;
2015 
2016         /* Bit 0 can only influence the low 64-bit result.  */
2017         if (nn & 1) {
2018             rlo = mm;
2019         }
2020 
2021         for (j = 1; j < 64; ++j) {
2022             uint64_t mask = -((nn >> j) & 1);
2023             rlo ^= (mm << j) & mask;
2024             rhi ^= (mm >> (64 - j)) & mask;
2025         }
2026         d[i] = rlo;
2027         d[i + 1] = rhi;
2028     }
2029     clear_tail(d, opr_sz, simd_maxsz(desc));
2030 }
2031 
/*
 * Two parallel 16x16->32 polynomial multiplies.
 *
 * Each 32-bit lane of op1 and op2 is expected to hold a 16-bit input
 * in its low half; the corresponding 32-bit lane of the result holds
 * the carry-less product.
 */
uint64_t pmull_w(uint64_t op1, uint64_t op2)
{
    /* Bit 0 of each 32-bit lane. */
    const uint64_t lane_lsb = 0x0000000100000001ull;
    uint64_t acc = 0;

    for (int bit = 0; bit < 16; bit++) {
        /* Expand the current multiplier bit of each lane to a lane mask. */
        uint64_t lane_mask = ((op1 >> bit) & lane_lsb) * 0xffffffff;

        acc ^= (op2 << bit) & lane_mask;
    }
    return acc;
}
2044 
2045 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2046 {
2047     int hi = simd_data(desc);
2048     uint64_t *d = vd, *n = vn, *m = vm;
2049     uint64_t nn = n[hi], mm = m[hi];
2050 
2051     d[0] = clmul_8x4_packed(nn, mm);
2052     nn >>= 32;
2053     mm >>= 32;
2054     d[1] = clmul_8x4_packed(nn, mm);
2055 
2056     clear_tail(d, 16, simd_maxsz(desc));
2057 }
2058 
2059 #ifdef TARGET_AARCH64
2060 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2061 {
2062     int shift = simd_data(desc) * 8;
2063     intptr_t i, opr_sz = simd_oprsz(desc);
2064     uint64_t *d = vd, *n = vn, *m = vm;
2065 
2066     for (i = 0; i < opr_sz / 8; ++i) {
2067         d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2068     }
2069 }
2070 
/*
 * 32x32->64 polynomial (carry-less) multiply.
 * Only the low 32 bits of op1 act as multiplier bits; op2 is shifted
 * left by each set bit position and the partial products are XORed.
 */
static uint64_t pmull_d(uint64_t op1, uint64_t op2)
{
    uint64_t acc = 0;

    /* XOR is commutative, so the order of partial products is free. */
    for (int bit = 31; bit >= 0; bit--) {
        if ((op1 >> bit) & 1) {
            acc ^= op2 << bit;
        }
    }
    return acc;
}
2082 
2083 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2084 {
2085     intptr_t sel = H4(simd_data(desc));
2086     intptr_t i, opr_sz = simd_oprsz(desc);
2087     uint32_t *n = vn, *m = vm;
2088     uint64_t *d = vd;
2089 
2090     for (i = 0; i < opr_sz / 8; ++i) {
2091         d[i] = pmull_d(n[2 * i + sel], m[2 * i + sel]);
2092     }
2093 }
2094 #endif
2095 
2096 #define DO_CMP0(NAME, TYPE, OP)                         \
2097 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2098 {                                                       \
2099     intptr_t i, opr_sz = simd_oprsz(desc);              \
2100     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2101         TYPE nn = *(TYPE *)(vn + i);                    \
2102         *(TYPE *)(vd + i) = -(nn OP 0);                 \
2103     }                                                   \
2104     clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2105 }
2106 
2107 DO_CMP0(gvec_ceq0_b, int8_t, ==)
2108 DO_CMP0(gvec_clt0_b, int8_t, <)
2109 DO_CMP0(gvec_cle0_b, int8_t, <=)
2110 DO_CMP0(gvec_cgt0_b, int8_t, >)
2111 DO_CMP0(gvec_cge0_b, int8_t, >=)
2112 
2113 DO_CMP0(gvec_ceq0_h, int16_t, ==)
2114 DO_CMP0(gvec_clt0_h, int16_t, <)
2115 DO_CMP0(gvec_cle0_h, int16_t, <=)
2116 DO_CMP0(gvec_cgt0_h, int16_t, >)
2117 DO_CMP0(gvec_cge0_h, int16_t, >=)
2118 
2119 #undef DO_CMP0
2120 
2121 #define DO_ABD(NAME, TYPE)                                      \
2122 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2123 {                                                               \
2124     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2125     TYPE *d = vd, *n = vn, *m = vm;                             \
2126                                                                 \
2127     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2128         d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2129     }                                                           \
2130     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2131 }
2132 
2133 DO_ABD(gvec_sabd_b, int8_t)
2134 DO_ABD(gvec_sabd_h, int16_t)
2135 DO_ABD(gvec_sabd_s, int32_t)
2136 DO_ABD(gvec_sabd_d, int64_t)
2137 
2138 DO_ABD(gvec_uabd_b, uint8_t)
2139 DO_ABD(gvec_uabd_h, uint16_t)
2140 DO_ABD(gvec_uabd_s, uint32_t)
2141 DO_ABD(gvec_uabd_d, uint64_t)
2142 
2143 #undef DO_ABD
2144 
2145 #define DO_ABA(NAME, TYPE)                                      \
2146 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2147 {                                                               \
2148     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2149     TYPE *d = vd, *n = vn, *m = vm;                             \
2150                                                                 \
2151     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2152         d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2153     }                                                           \
2154     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2155 }
2156 
2157 DO_ABA(gvec_saba_b, int8_t)
2158 DO_ABA(gvec_saba_h, int16_t)
2159 DO_ABA(gvec_saba_s, int32_t)
2160 DO_ABA(gvec_saba_d, int64_t)
2161 
2162 DO_ABA(gvec_uaba_b, uint8_t)
2163 DO_ABA(gvec_uaba_h, uint16_t)
2164 DO_ABA(gvec_uaba_s, uint32_t)
2165 DO_ABA(gvec_uaba_d, uint64_t)
2166 
2167 #undef DO_ABA
2168 
2169 #define DO_NEON_PAIRWISE(NAME, OP)                                      \
2170     void HELPER(NAME##s)(void *vd, void *vn, void *vm,                  \
2171                          void *stat, uint32_t oprsz)                    \
2172     {                                                                   \
2173         float_status *fpst = stat;                                      \
2174         float32 *d = vd;                                                \
2175         float32 *n = vn;                                                \
2176         float32 *m = vm;                                                \
2177         float32 r0, r1;                                                 \
2178                                                                         \
2179         /* Read all inputs before writing outputs in case vm == vd */   \
2180         r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst);                    \
2181         r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst);                    \
2182                                                                         \
2183         d[H4(0)] = r0;                                                  \
2184         d[H4(1)] = r1;                                                  \
2185     }                                                                   \
2186                                                                         \
2187     void HELPER(NAME##h)(void *vd, void *vn, void *vm,                  \
2188                          void *stat, uint32_t oprsz)                    \
2189     {                                                                   \
2190         float_status *fpst = stat;                                      \
2191         float16 *d = vd;                                                \
2192         float16 *n = vn;                                                \
2193         float16 *m = vm;                                                \
2194         float16 r0, r1, r2, r3;                                         \
2195                                                                         \
2196         /* Read all inputs before writing outputs in case vm == vd */   \
2197         r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst);                    \
2198         r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst);                    \
2199         r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst);                    \
2200         r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst);                    \
2201                                                                         \
2202         d[H2(0)] = r0;                                                  \
2203         d[H2(1)] = r1;                                                  \
2204         d[H2(2)] = r2;                                                  \
2205         d[H2(3)] = r3;                                                  \
2206     }
2207 
2208 DO_NEON_PAIRWISE(neon_padd, add)
2209 DO_NEON_PAIRWISE(neon_pmax, max)
2210 DO_NEON_PAIRWISE(neon_pmin, min)
2211 
2212 #undef DO_NEON_PAIRWISE
2213 
2214 #define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2215     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2216     {                                                                   \
2217         intptr_t i, oprsz = simd_oprsz(desc);                           \
2218         int shift = simd_data(desc);                                    \
2219         TYPE *d = vd, *n = vn;                                          \
2220         float_status *fpst = stat;                                      \
2221         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2222             d[i] = FUNC(n[i], shift, fpst);                             \
2223         }                                                               \
2224         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2225     }
2226 
2227 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2228 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2229 DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2230 DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
2231 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2232 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2233 DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2234 DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2235 
2236 #undef DO_VCVT_FIXED
2237 
2238 #define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
2239     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2240     {                                                                   \
2241         float_status *fpst = stat;                                      \
2242         intptr_t i, oprsz = simd_oprsz(desc);                           \
2243         uint32_t rmode = simd_data(desc);                               \
2244         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2245         TYPE *d = vd, *n = vn;                                          \
2246         set_float_rounding_mode(rmode, fpst);                           \
2247         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2248             d[i] = FUNC(n[i], 0, fpst);                                 \
2249         }                                                               \
2250         set_float_rounding_mode(prev_rmode, fpst);                      \
2251         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2252     }
2253 
2254 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2255 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2256 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2257 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2258 
2259 #undef DO_VCVT_RMODE
2260 
2261 #define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
2262     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2263     {                                                                   \
2264         float_status *fpst = stat;                                      \
2265         intptr_t i, oprsz = simd_oprsz(desc);                           \
2266         uint32_t rmode = simd_data(desc);                               \
2267         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2268         TYPE *d = vd, *n = vn;                                          \
2269         set_float_rounding_mode(rmode, fpst);                           \
2270         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2271             d[i] = FUNC(n[i], fpst);                                    \
2272         }                                                               \
2273         set_float_rounding_mode(prev_rmode, fpst);                      \
2274         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2275     }
2276 
2277 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2278 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2279 
2280 #undef DO_VRINT_RMODE
2281 
2282 #ifdef TARGET_AARCH64
2283 void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
2284 {
2285     const uint8_t *indices = vm;
2286     CPUARMState *env = venv;
2287     size_t oprsz = simd_oprsz(desc);
2288     uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2289     bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2290     uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2291     union {
2292         uint8_t b[16];
2293         uint64_t d[2];
2294     } result;
2295 
2296     /*
2297      * We must construct the final result in a temp, lest the output
2298      * overlaps the input table.  For TBL, begin with zero; for TBX,
2299      * begin with the original register contents.  Note that we always
2300      * copy 16 bytes here to avoid an extra branch; clearing the high
2301      * bits of the register for oprsz == 8 is handled below.
2302      */
2303     if (is_tbx) {
2304         memcpy(&result, vd, 16);
2305     } else {
2306         memset(&result, 0, 16);
2307     }
2308 
2309     for (size_t i = 0; i < oprsz; ++i) {
2310         uint32_t index = indices[H1(i)];
2311 
2312         if (index < table_len) {
2313             /*
2314              * Convert index (a byte offset into the virtual table
2315              * which is a series of 128-bit vectors concatenated)
2316              * into the correct register element, bearing in mind
2317              * that the table can wrap around from V31 to V0.
2318              */
2319             const uint8_t *table = (const uint8_t *)
2320                 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2321             result.b[H1(i)] = table[H1(index % 16)];
2322         }
2323     }
2324 
2325     memcpy(vd, &result, 16);
2326     clear_tail(vd, oprsz, simd_maxsz(desc));
2327 }
2328 #endif
2329 
2330 /*
2331  * NxN -> N highpart multiply
2332  *
2333  * TODO: expose this as a generic vector operation.
2334  */
2335 
2336 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2337 {
2338     intptr_t i, opr_sz = simd_oprsz(desc);
2339     int8_t *d = vd, *n = vn, *m = vm;
2340 
2341     for (i = 0; i < opr_sz; ++i) {
2342         d[i] = ((int32_t)n[i] * m[i]) >> 8;
2343     }
2344     clear_tail(d, opr_sz, simd_maxsz(desc));
2345 }
2346 
2347 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2348 {
2349     intptr_t i, opr_sz = simd_oprsz(desc);
2350     int16_t *d = vd, *n = vn, *m = vm;
2351 
2352     for (i = 0; i < opr_sz / 2; ++i) {
2353         d[i] = ((int32_t)n[i] * m[i]) >> 16;
2354     }
2355     clear_tail(d, opr_sz, simd_maxsz(desc));
2356 }
2357 
2358 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2359 {
2360     intptr_t i, opr_sz = simd_oprsz(desc);
2361     int32_t *d = vd, *n = vn, *m = vm;
2362 
2363     for (i = 0; i < opr_sz / 4; ++i) {
2364         d[i] = ((int64_t)n[i] * m[i]) >> 32;
2365     }
2366     clear_tail(d, opr_sz, simd_maxsz(desc));
2367 }
2368 
2369 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2370 {
2371     intptr_t i, opr_sz = simd_oprsz(desc);
2372     uint64_t *d = vd, *n = vn, *m = vm;
2373     uint64_t discard;
2374 
2375     for (i = 0; i < opr_sz / 8; ++i) {
2376         muls64(&discard, &d[i], n[i], m[i]);
2377     }
2378     clear_tail(d, opr_sz, simd_maxsz(desc));
2379 }
2380 
2381 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2382 {
2383     intptr_t i, opr_sz = simd_oprsz(desc);
2384     uint8_t *d = vd, *n = vn, *m = vm;
2385 
2386     for (i = 0; i < opr_sz; ++i) {
2387         d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2388     }
2389     clear_tail(d, opr_sz, simd_maxsz(desc));
2390 }
2391 
2392 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2393 {
2394     intptr_t i, opr_sz = simd_oprsz(desc);
2395     uint16_t *d = vd, *n = vn, *m = vm;
2396 
2397     for (i = 0; i < opr_sz / 2; ++i) {
2398         d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2399     }
2400     clear_tail(d, opr_sz, simd_maxsz(desc));
2401 }
2402 
2403 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2404 {
2405     intptr_t i, opr_sz = simd_oprsz(desc);
2406     uint32_t *d = vd, *n = vn, *m = vm;
2407 
2408     for (i = 0; i < opr_sz / 4; ++i) {
2409         d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2410     }
2411     clear_tail(d, opr_sz, simd_maxsz(desc));
2412 }
2413 
2414 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2415 {
2416     intptr_t i, opr_sz = simd_oprsz(desc);
2417     uint64_t *d = vd, *n = vn, *m = vm;
2418     uint64_t discard;
2419 
2420     for (i = 0; i < opr_sz / 8; ++i) {
2421         mulu64(&discard, &d[i], n[i], m[i]);
2422     }
2423     clear_tail(d, opr_sz, simd_maxsz(desc));
2424 }
2425 
2426 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2427 {
2428     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2429     int shr = simd_data(desc);
2430     uint64_t *d = vd, *n = vn, *m = vm;
2431 
2432     for (i = 0; i < opr_sz; ++i) {
2433         d[i] = ror64(n[i] ^ m[i], shr);
2434     }
2435     clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2436 }
2437 
2438 /*
2439  * Integer matrix-multiply accumulate
2440  */
2441 
/*
 * Inner product step for the signed x signed byte matrix multiply.
 *
 * Adds to @sum the eight products of corresponding signed bytes read
 * from @vn and @vm (indexed through H1()) and returns the new total.
 * The accumulation is done in uint32_t, so it wraps modulo 2^32.
 */
static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
{
    const int8_t *lhs = vn;
    const int8_t *rhs = vm;
    intptr_t k = 0;

    while (k < 8) {
        int product = lhs[H1(k)] * rhs[H1(k)];

        sum += product;
        k++;
    }
    return sum;
}
2451 
/*
 * Inner product step for the unsigned x unsigned byte matrix multiply.
 *
 * Adds to @sum the eight products of corresponding unsigned bytes read
 * from @vn and @vm (indexed through H1()) and returns the new total.
 * The accumulation is done in uint32_t, so it wraps modulo 2^32.
 */
static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
{
    const uint8_t *lhs = vn;
    const uint8_t *rhs = vm;
    intptr_t k = 0;

    while (k < 8) {
        int product = lhs[H1(k)] * rhs[H1(k)];

        sum += product;
        k++;
    }
    return sum;
}
2461 
/*
 * Inner product step for the mixed-sign byte matrix multiply.
 *
 * Adds to @sum the eight products of an unsigned byte from @vn and the
 * corresponding signed byte from @vm (both indexed through H1()) and
 * returns the new total.  The accumulation is done in uint32_t, so it
 * wraps modulo 2^32.
 */
static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
{
    const uint8_t *lhs = vn;
    const int8_t *rhs = vm;
    intptr_t k = 0;

    while (k < 8) {
        int product = lhs[H1(k)] * rhs[H1(k)];

        sum += product;
        k++;
    }
    return sum;
}
2472 
/*
 * Common driver for the byte matrix multiply-accumulate helpers.
 *
 * The operands are walked in 16-byte segments.  Within a segment, vn and
 * vm are each treated as two 8-byte groups, and va/vd as four 32-bit
 * words.  Output word (2 * i + j) is the accumulator word at the same
 * position plus inner_loop() applied to group i of vn and group j of vm.
 *
 * All four results of a segment are computed before any of them is
 * stored, so the destination may overlap the inputs.
 */
static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
                      uint32_t (*inner_loop)(uint32_t, void *, void *))
{
    const intptr_t oprsz = simd_oprsz(desc);
    uint8_t *d_bytes = vd;
    uint8_t *n_bytes = vn;
    uint8_t *m_bytes = vm;
    uint8_t *a_bytes = va;

    for (intptr_t off = 0; off < oprsz; off += 16) {
        uint32_t *dst = (uint32_t *)(d_bytes + off);
        uint32_t *acc = (uint32_t *)(a_bytes + off);
        void *n_grp0 = n_bytes + off;
        void *n_grp1 = n_bytes + off + 8;
        void *m_grp0 = m_bytes + off;
        void *m_grp1 = m_bytes + off + 8;
        uint32_t out[4];

        /* Consume every input of the segment first ... */
        out[0] = inner_loop(acc[H4(0)], n_grp0, m_grp0);
        out[1] = inner_loop(acc[H4(1)], n_grp0, m_grp1);
        out[2] = inner_loop(acc[H4(2)], n_grp1, m_grp0);
        out[3] = inner_loop(acc[H4(3)], n_grp1, m_grp1);

        /* ... and only then write the four results back. */
        for (int w = 0; w < 4; w++) {
            dst[H4(w)] = out[w];
        }
    }
    clear_tail(vd, oprsz, simd_maxsz(desc));
}
2506 
2507 #define DO_MMLA_B(NAME, INNER) \
2508     void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2509     { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2510 
2511 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2512 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2513 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2514 
2515 /*
2516  * BFloat16 Dot Product
2517  */
2518 
2519 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2)
2520 {
2521     /* FPCR is ignored for BFDOT and BFMMLA. */
2522     float_status bf_status = {
2523         .tininess_before_rounding = float_tininess_before_rounding,
2524         .float_rounding_mode = float_round_to_odd_inf,
2525         .flush_to_zero = true,
2526         .flush_inputs_to_zero = true,
2527         .default_nan_mode = true,
2528     };
2529     float32 t1, t2;
2530 
2531     /*
2532      * Extract each BFloat16 from the element pair, and shift
2533      * them such that they become float32.
2534      */
2535     t1 = float32_mul(e1 << 16, e2 << 16, &bf_status);
2536     t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status);
2537     t1 = float32_add(t1, t2, &bf_status);
2538     t1 = float32_add(sum, t1, &bf_status);
2539 
2540     return t1;
2541 }
2542 
2543 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2544 {
2545     intptr_t i, opr_sz = simd_oprsz(desc);
2546     float32 *d = vd, *a = va;
2547     uint32_t *n = vn, *m = vm;
2548 
2549     for (i = 0; i < opr_sz / 4; ++i) {
2550         d[i] = bfdotadd(a[i], n[i], m[i]);
2551     }
2552     clear_tail(d, opr_sz, simd_maxsz(desc));
2553 }
2554 
2555 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
2556                             void *va, uint32_t desc)
2557 {
2558     intptr_t i, j, opr_sz = simd_oprsz(desc);
2559     intptr_t index = simd_data(desc);
2560     intptr_t elements = opr_sz / 4;
2561     intptr_t eltspersegment = MIN(16 / 4, elements);
2562     float32 *d = vd, *a = va;
2563     uint32_t *n = vn, *m = vm;
2564 
2565     for (i = 0; i < elements; i += eltspersegment) {
2566         uint32_t m_idx = m[i + H4(index)];
2567 
2568         for (j = i; j < i + eltspersegment; j++) {
2569             d[j] = bfdotadd(a[j], n[j], m_idx);
2570         }
2571     }
2572     clear_tail(d, opr_sz, simd_maxsz(desc));
2573 }
2574 
2575 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2576 {
2577     intptr_t s, opr_sz = simd_oprsz(desc);
2578     float32 *d = vd, *a = va;
2579     uint32_t *n = vn, *m = vm;
2580 
2581     for (s = 0; s < opr_sz / 4; s += 4) {
2582         float32 sum00, sum01, sum10, sum11;
2583 
2584         /*
2585          * Process the entire segment at once, writing back the
2586          * results only after we've consumed all of the inputs.
2587          *
2588          * Key to indices by column:
2589          *               i   j           i   k             j   k
2590          */
2591         sum00 = a[s + H4(0 + 0)];
2592         sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]);
2593         sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]);
2594 
2595         sum01 = a[s + H4(0 + 1)];
2596         sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]);
2597         sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]);
2598 
2599         sum10 = a[s + H4(2 + 0)];
2600         sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]);
2601         sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]);
2602 
2603         sum11 = a[s + H4(2 + 1)];
2604         sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]);
2605         sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]);
2606 
2607         d[s + H4(0 + 0)] = sum00;
2608         d[s + H4(0 + 1)] = sum01;
2609         d[s + H4(2 + 0)] = sum10;
2610         d[s + H4(2 + 1)] = sum11;
2611     }
2612     clear_tail(d, opr_sz, simd_maxsz(desc));
2613 }
2614 
2615 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
2616                          void *stat, uint32_t desc)
2617 {
2618     intptr_t i, opr_sz = simd_oprsz(desc);
2619     intptr_t sel = simd_data(desc);
2620     float32 *d = vd, *a = va;
2621     bfloat16 *n = vn, *m = vm;
2622 
2623     for (i = 0; i < opr_sz / 4; ++i) {
2624         float32 nn = n[H2(i * 2 + sel)] << 16;
2625         float32 mm = m[H2(i * 2 + sel)] << 16;
2626         d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
2627     }
2628     clear_tail(d, opr_sz, simd_maxsz(desc));
2629 }
2630 
2631 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
2632                              void *va, void *stat, uint32_t desc)
2633 {
2634     intptr_t i, j, opr_sz = simd_oprsz(desc);
2635     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
2636     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
2637     intptr_t elements = opr_sz / 4;
2638     intptr_t eltspersegment = MIN(16 / 4, elements);
2639     float32 *d = vd, *a = va;
2640     bfloat16 *n = vn, *m = vm;
2641 
2642     for (i = 0; i < elements; i += eltspersegment) {
2643         float32 m_idx = m[H2(2 * i + index)] << 16;
2644 
2645         for (j = i; j < i + eltspersegment; j++) {
2646             float32 n_j = n[H2(2 * j + sel)] << 16;
2647             d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
2648         }
2649     }
2650     clear_tail(d, opr_sz, simd_maxsz(desc));
2651 }
2652 
2653 #define DO_CLAMP(NAME, TYPE) \
2654 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
2655 {                                                                       \
2656     intptr_t i, opr_sz = simd_oprsz(desc);                              \
2657     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
2658         TYPE aa = *(TYPE *)(a + i);                                     \
2659         TYPE nn = *(TYPE *)(n + i);                                     \
2660         TYPE mm = *(TYPE *)(m + i);                                     \
2661         TYPE dd = MIN(MAX(aa, nn), mm);                                 \
2662         *(TYPE *)(d + i) = dd;                                          \
2663     }                                                                   \
2664     clear_tail(d, opr_sz, simd_maxsz(desc));                            \
2665 }
2666 
2667 DO_CLAMP(gvec_sclamp_b, int8_t)
2668 DO_CLAMP(gvec_sclamp_h, int16_t)
2669 DO_CLAMP(gvec_sclamp_s, int32_t)
2670 DO_CLAMP(gvec_sclamp_d, int64_t)
2671 
2672 DO_CLAMP(gvec_uclamp_b, uint8_t)
2673 DO_CLAMP(gvec_uclamp_h, uint16_t)
2674 DO_CLAMP(gvec_uclamp_s, uint32_t)
2675 DO_CLAMP(gvec_uclamp_d, uint64_t)
2676