xref: /qemu/target/arm/tcg/sve_helper.c (revision 5ac034b1)
1 /*
2  * ARM SVE Operations
3  *
4  * Copyright (c) 2018 Linaro, Ltd.
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "internals.h"
23 #include "exec/exec-all.h"
24 #include "exec/helper-proto.h"
25 #include "tcg/tcg-gvec-desc.h"
26 #include "fpu/softfloat.h"
27 #include "tcg/tcg.h"
28 #include "vec_internal.h"
29 #include "sve_ldst_internal.h"
30 
31 
32 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
33  *
34  * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
35  * and bit 0 set if C is set.  Compare the definitions of these variables
36  * within CPUARMState.
37  */
38 
39 /* For no G bits set, NZCV = C.  */
40 #define PREDTEST_INIT  1
41 
42 /* This is an iterative function, called for each Pd and Pg word
43  * moving forward.
44  */
45 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
46 {
47     if (likely(g)) {
48         /* Compute N from first D & G.
49            Use bit 2 to signal first G bit seen.  */
50         if (!(flags & 4)) {
51             flags |= ((d & (g & -g)) != 0) << 31;
52             flags |= 4;
53         }
54 
55         /* Accumulate Z from each D & G.  */
56         flags |= ((d & g) != 0) << 1;
57 
58         /* Compute C from last !(D & G).  Replace previous.  */
59         flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
60     }
61     return flags;
62 }
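/*
 * Worked example (illustrative): with d = 0x01 and g = 0x11, starting
 * from PREDTEST_INIT:
 *   - the first active bit is g & -g = 0x01; d has it set, so bit 31 (N)
 *     is set and bit 2 records that a G bit has been seen;
 *   - d & g != 0, so bit 1 is set, meaning Z will be clear;
 *   - the last active bit is pow2floor(g) = 0x10; d has it clear, so
 *     bit 0 (C) remains 1.
 * The result 0x80000007 thus encodes N=1, Z=0, C=1 for a predicate whose
 * first governed element is true and whose last is false.
 */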
63 
64 /* This is an iterative function, called for each Pd and Pg word
65  * moving backward.
66  */
67 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
68 {
69     if (likely(g)) {
70         /* Compute C from first (i.e. last) !(D & G).
71            Use bit 2 to signal first G bit seen.  */
72         if (!(flags & 4)) {
73             flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
74             flags |= (d & pow2floor(g)) == 0;
75         }
76 
77         /* Accumulate Z from each D & G.  */
78         flags |= ((d & g) != 0) << 1;
79 
80         /* Compute N from last (i.e first) D & G.  Replace previous.  */
81         /* Compute N from last (i.e. first) D & G.  Replace previous.  */
82     }
83     return flags;
84 }
85 
86 /* The same for a single word predicate.  */
87 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
88 {
89     return iter_predtest_fwd(d, g, PREDTEST_INIT);
90 }
91 
92 /* The same for a multi-word predicate.  */
93 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
94 {
95     uint32_t flags = PREDTEST_INIT;
96     uint64_t *d = vd, *g = vg;
97     uintptr_t i = 0;
98 
99     do {
100         flags = iter_predtest_fwd(d[i], g[i], flags);
101     } while (++i < words);
102 
103     return flags;
104 }
105 
106 /* Similarly for single word elements.  */
107 static inline uint64_t expand_pred_s(uint8_t byte)
108 {
109     static const uint64_t word[] = {
110         [0x01] = 0x00000000ffffffffull,
111         [0x10] = 0xffffffff00000000ull,
112         [0x11] = 0xffffffffffffffffull,
113     };
114     return word[byte & 0x11];
115 }
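/*
 * Illustrative note: predicates hold one bit per byte of vector data, so
 * a 32-bit element is governed by bit 0 of its 4-bit predicate group and
 * the mask 0x11 keeps exactly the two governing bits of a predicate byte.
 * For example, expand_pred_s(0x10) == 0xffffffff00000000 (only the upper
 * word of the 64-bit chunk is active); the unlisted index 0x00 falls
 * through to the zero-initialized table entry.
 */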
116 
117 #define LOGICAL_PPPP(NAME, FUNC) \
118 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
119 {                                                                         \
120     uintptr_t opr_sz = simd_oprsz(desc);                                  \
121     uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                          \
122     uintptr_t i;                                                          \
123     for (i = 0; i < opr_sz / 8; ++i) {                                    \
124         d[i] = FUNC(n[i], m[i], g[i]);                                    \
125     }                                                                     \
126 }
127 
128 #define DO_AND(N, M, G)  (((N) & (M)) & (G))
129 #define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
130 #define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
131 #define DO_ORR(N, M, G)  (((N) | (M)) & (G))
132 #define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
133 #define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
134 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
135 #define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))
136 
137 LOGICAL_PPPP(sve_and_pppp, DO_AND)
138 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
139 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
140 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
141 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
142 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
143 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
144 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
145 
146 #undef DO_AND
147 #undef DO_BIC
148 #undef DO_EOR
149 #undef DO_ORR
150 #undef DO_ORN
151 #undef DO_NOR
152 #undef DO_NAND
153 #undef DO_SEL
154 #undef LOGICAL_PPPP
155 
156 /* Fully general three-operand expander, controlled by a predicate.
157  * This is complicated by the host-endian storage of the register file.
158  */
159 /* ??? I don't expect the compiler could ever vectorize this itself.
160  * With some tables we can convert bit masks to byte masks, and with
161  * extra care wrt byte/word ordering we could use gcc generic vectors
162  * and do 16 bytes at a time.
163  */
164 #define DO_ZPZZ(NAME, TYPE, H, OP)                                       \
165 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
166 {                                                                       \
167     intptr_t i, opr_sz = simd_oprsz(desc);                              \
168     for (i = 0; i < opr_sz; ) {                                         \
169         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
170         do {                                                            \
171             if (pg & 1) {                                               \
172                 TYPE nn = *(TYPE *)(vn + H(i));                         \
173                 TYPE mm = *(TYPE *)(vm + H(i));                         \
174                 *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
175             }                                                           \
176             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
177         } while (i & 15);                                               \
178     }                                                                   \
179 }
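/*
 * Illustrative note on the expander above: i counts bytes of vector data
 * and the governing predicate holds one bit per byte, so a 16-bit chunk
 * of predicate covers 16 bytes of data -- hence the reload of pg on each
 * outer iteration and the "while (i & 15)" inner loop.  Shifting pg by
 * sizeof(TYPE) per element means only bit 0 of each element's group is
 * tested; for 4-byte elements the bits consulted in a chunk are 0, 4, 8
 * and 12.
 */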
180 
181 /* Similarly, specialized for 64-bit operands.  */
182 #define DO_ZPZZ_D(NAME, TYPE, OP)                                \
183 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
184 {                                                               \
185     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
186     TYPE *d = vd, *n = vn, *m = vm;                             \
187     uint8_t *pg = vg;                                           \
188     for (i = 0; i < opr_sz; i += 1) {                           \
189         if (pg[H1(i)] & 1) {                                    \
190             TYPE nn = n[i], mm = m[i];                          \
191             d[i] = OP(nn, mm);                                  \
192         }                                                       \
193     }                                                           \
194 }
195 
196 #define DO_AND(N, M)  (N & M)
197 #define DO_EOR(N, M)  (N ^ M)
198 #define DO_ORR(N, M)  (N | M)
199 #define DO_BIC(N, M)  (N & ~M)
200 #define DO_ADD(N, M)  (N + M)
201 #define DO_SUB(N, M)  (N - M)
202 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
203 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
204 #define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
205 #define DO_MUL(N, M)  (N * M)
206 
207 
208 /*
209  * We must avoid the C undefined behaviour cases: division by
210  * zero and signed division of INT_MIN by -1. Both of these
211  * have architecturally defined required results for Arm.
212  * We special case all signed divisions by -1 to avoid having
213  * to deduce the minimum integer for the type involved.
214  */
215 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
216 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
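/*
 * For example, DO_SDIV(INT32_MIN, -1) takes the M == -1 arm and returns
 * -N, which wraps to INT32_MIN in two's-complement arithmetic -- the
 * architecturally required SDIV result for this overflow case -- while
 * any division by zero returns 0.
 */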
217 
218 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
219 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
220 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
221 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
222 
223 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
224 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
225 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
226 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
227 
228 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
229 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
230 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
231 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
232 
233 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
234 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
235 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
236 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
237 
238 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
239 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
240 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
241 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
242 
243 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
244 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
245 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
246 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
247 
248 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
249 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
250 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
251 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
252 
253 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
254 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
255 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
256 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
257 
258 DO_ZPZZ(sve_smin_zpzz_b, int8_t,  H1, DO_MIN)
259 DO_ZPZZ(sve_smin_zpzz_h, int16_t,  H1_2, DO_MIN)
260 DO_ZPZZ(sve_smin_zpzz_s, int32_t,  H1_4, DO_MIN)
261 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t,  DO_MIN)
262 
263 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
264 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
265 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
266 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
267 
268 DO_ZPZZ(sve_sabd_zpzz_b, int8_t,  H1, DO_ABD)
269 DO_ZPZZ(sve_sabd_zpzz_h, int16_t,  H1_2, DO_ABD)
270 DO_ZPZZ(sve_sabd_zpzz_s, int32_t,  H1_4, DO_ABD)
271 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t,  DO_ABD)
272 
273 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
274 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
275 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
276 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
277 
278 /* Because the computation type is at least twice as large as required,
279    these work for both signed and unsigned source types.  */
280 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
281 {
282     return (n * m) >> 8;
283 }
284 
285 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
286 {
287     return (n * m) >> 16;
288 }
289 
290 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
291 {
292     return (n * m) >> 32;
293 }
294 
295 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
296 {
297     uint64_t lo, hi;
298     muls64(&lo, &hi, n, m);
299     return hi;
300 }
301 
302 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
303 {
304     uint64_t lo, hi;
305     mulu64(&lo, &hi, n, m);
306     return hi;
307 }
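/*
 * For example, for byte elements: unsigned inputs 0xff and 0xff arrive as
 * 255 each, the product is 0xfe01 and do_mulh_b returns 0xfe; the same
 * bytes as signed inputs arrive as -1 each, the product is 1 and the high
 * byte returned is 0x00.  Both are correct because the int32_t arguments
 * hold either extension of the operands exactly.
 */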
308 
309 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
310 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
311 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
312 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
313 
314 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
315 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
316 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
317 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
318 
319 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
320 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
321 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
322 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
323 
324 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
325 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
326 
327 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
328 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
329 
330 /* Note that all bits of the shift count are significant;
331    the count is not taken modulo the element size.  */
332 #define DO_ASR(N, M)  (N >> MIN(M, sizeof(N) * 8 - 1))
333 #define DO_LSR(N, M)  (M < sizeof(N) * 8 ? N >> M : 0)
334 #define DO_LSL(N, M)  (M < sizeof(N) * 8 ? N << M : 0)
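/*
 * For example, with byte elements DO_LSR(0x80, 8) and DO_LSL(0x01, 8)
 * are both 0; taking the count modulo the element size, as some ISAs do,
 * would instead return 0x80 and 0x01.
 */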
335 
336 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
337 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
338 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)
339 
340 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
341 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
342 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)
343 
344 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
345 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
346 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
347 
348 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
349 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
350 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
351 
352 static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
353 {
354     int8_t n1 = n, n2 = n >> 8;
355     return m + n1 + n2;
356 }
357 
358 static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
359 {
360     int16_t n1 = n, n2 = n >> 16;
361     return m + n1 + n2;
362 }
363 
364 static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
365 {
366     int32_t n1 = n, n2 = n >> 32;
367     return m + n1 + n2;
368 }
369 
370 DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
371 DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
372 DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
373 
374 static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
375 {
376     uint8_t n1 = n, n2 = n >> 8;
377     return m + n1 + n2;
378 }
379 
380 static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
381 {
382     uint16_t n1 = n, n2 = n >> 16;
383     return m + n1 + n2;
384 }
385 
386 static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
387 {
388     uint32_t n1 = n, n2 = n >> 32;
389     return m + n1 + n2;
390 }
391 
392 DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
393 DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
394 DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
395 
396 #define do_srshl_b(n, m)  do_sqrshl_bhs(n, m, 8, true, NULL)
397 #define do_srshl_h(n, m)  do_sqrshl_bhs(n, m, 16, true, NULL)
398 #define do_srshl_s(n, m)  do_sqrshl_bhs(n, m, 32, true, NULL)
399 #define do_srshl_d(n, m)  do_sqrshl_d(n, m, true, NULL)
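/*
 * Summary of the helpers used here (see vec_internal.h): the shift count
 * is signed, with negative counts shifting right; the boolean selects
 * rounding; and the final argument, when non-NULL, receives a saturation
 * flag.  Passing NULL means SRSHL/URSHL neither saturate nor record it.
 */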
400 
401 DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
402 DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
403 DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
404 DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)
405 
406 #define do_urshl_b(n, m)  do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
407 #define do_urshl_h(n, m)  do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
408 #define do_urshl_s(n, m)  do_uqrshl_bhs(n, m, 32, true, NULL)
409 #define do_urshl_d(n, m)  do_uqrshl_d(n, m, true, NULL)
410 
411 DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
412 DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
413 DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
414 DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
415 
416 /*
417  * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
418  * We pass in a pointer to a dummy saturation field to trigger
419  * the saturating arithmetic but discard the information about
420  * whether it has occurred.
421  */
422 #define do_sqshl_b(n, m) \
423    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
424 #define do_sqshl_h(n, m) \
425    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
426 #define do_sqshl_s(n, m) \
427    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
428 #define do_sqshl_d(n, m) \
429    ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
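/*
 * For example, do_sqshl_b(100, 2) computes 400, which does not fit in
 * 8 bits, so the saturating helper returns INT8_MAX (127) and sets the
 * discarded flag; SVE keeps only the saturated value.
 */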
430 
431 DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
432 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
433 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
434 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
435 
436 #define do_uqshl_b(n, m) \
437    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
438 #define do_uqshl_h(n, m) \
439    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
440 #define do_uqshl_s(n, m) \
441    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
442 #define do_uqshl_d(n, m) \
443    ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
444 
445 DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
446 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
447 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
448 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
449 
450 #define do_sqrshl_b(n, m) \
451    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
452 #define do_sqrshl_h(n, m) \
453    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
454 #define do_sqrshl_s(n, m) \
455    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
456 #define do_sqrshl_d(n, m) \
457    ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
458 
459 DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
460 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
461 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
462 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
463 
464 #undef do_sqrshl_d
465 
466 #define do_uqrshl_b(n, m) \
467    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
468 #define do_uqrshl_h(n, m) \
469    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
470 #define do_uqrshl_s(n, m) \
471    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
472 #define do_uqrshl_d(n, m) \
473    ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
474 
475 DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
476 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
477 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
478 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
479 
480 #undef do_uqrshl_d
481 
482 #define DO_HADD_BHS(n, m)  (((int64_t)n + m) >> 1)
483 #define DO_HADD_D(n, m)    ((n >> 1) + (m >> 1) + (n & m & 1))
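/*
 * The BHS form widens to int64_t, so n + m cannot overflow before the
 * shift.  The D form halves each operand and adds back the carry of the
 * two dropped low bits, which is 1 only when both are set: e.g. n = 5,
 * m = 7 gives (5 >> 1) + (7 >> 1) + (5 & 7 & 1) = 2 + 3 + 1 = 6, equal
 * to (5 + 7) >> 1, without needing a 65-bit intermediate.
 */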
484 
485 DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
486 DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
487 DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
488 DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)
489 
490 DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
491 DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
492 DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
493 DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)
494 
495 #define DO_RHADD_BHS(n, m)  (((int64_t)n + m + 1) >> 1)
496 #define DO_RHADD_D(n, m)    ((n >> 1) + (m >> 1) + ((n | m) & 1))
497 
498 DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
499 DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
500 DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
501 DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)
502 
503 DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
504 DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
505 DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
506 DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)
507 
508 #define DO_HSUB_BHS(n, m)  (((int64_t)n - m) >> 1)
509 #define DO_HSUB_D(n, m)    ((n >> 1) - (m >> 1) - (~n & m & 1))
510 
511 DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
512 DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
513 DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
514 DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)
515 
516 DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
517 DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
518 DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
519 DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
520 
521 static inline int64_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
522 {
523     return val >= max ? max : val <= min ? min : val;
524 }
525 
526 #define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
527 #define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
528 #define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
529 
530 static inline int64_t do_sqadd_d(int64_t n, int64_t m)
531 {
532     int64_t r = n + m;
533     if (((r ^ n) & ~(n ^ m)) < 0) {
534         /* Signed overflow.  */
535         return r < 0 ? INT64_MAX : INT64_MIN;
536     }
537     return r;
538 }
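/*
 * The test above flags overflow exactly when n and m have the same sign
 * (~(n ^ m) has its sign bit set) but r has the opposite sign (r ^ n has
 * its sign bit set).  For example, n = m = 0x4000000000000000 wraps to
 * r = INT64_MIN; r is negative, so the saturated result is INT64_MAX.
 */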
539 
540 DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
541 DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
542 DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
543 DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
544 
545 #define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
546 #define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
547 #define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
548 
549 static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
550 {
551     uint64_t r = n + m;
552     return r < n ? UINT64_MAX : r;
553 }
554 
555 DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
556 DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
557 DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
558 DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
559 
560 #define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
561 #define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
562 #define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
563 
564 static inline int64_t do_sqsub_d(int64_t n, int64_t m)
565 {
566     int64_t r = n - m;
567     if (((r ^ n) & (n ^ m)) < 0) {
568         /* Signed overflow.  */
569         return r < 0 ? INT64_MAX : INT64_MIN;
570     }
571     return r;
572 }
573 
574 DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
575 DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
576 DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
577 DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
578 
579 #define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
580 #define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
581 #define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
582 
583 static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
584 {
585     return n > m ? n - m : 0;
586 }
587 
588 DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
589 DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
590 DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
591 DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
592 
593 #define DO_SUQADD_B(n, m) \
594     do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
595 #define DO_SUQADD_H(n, m) \
596     do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
597 #define DO_SUQADD_S(n, m) \
598     do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
599 
600 static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
601 {
602     uint64_t r = n + m;
603 
604     if (n < 0) {
605         /* Note that m - abs(n) cannot underflow. */
606         if (r > INT64_MAX) {
607             /* Result is either very large positive or negative. */
608             if (m > -n) {
609                 /* m > abs(n), so r is a very large positive. */
610                 return INT64_MAX;
611             }
612             /* Result is negative. */
613         }
614     } else {
615         /* Both inputs are positive: check for overflow.  */
616         if (r < m || r > INT64_MAX) {
617             return INT64_MAX;
618         }
619     }
620     return r;
621 }
622 
623 DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
624 DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
625 DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
626 DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
627 
628 #define DO_USQADD_B(n, m) \
629     do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
630 #define DO_USQADD_H(n, m) \
631     do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
632 #define DO_USQADD_S(n, m) \
633     do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
634 
635 static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
636 {
637     uint64_t r = n + m;
638 
639     if (m < 0) {
640         return n < -m ? 0 : r;
641     }
642     return r < n ? UINT64_MAX : r;
643 }
644 
645 DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
646 DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
647 DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
648 DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
649 
650 #undef DO_ZPZZ
651 #undef DO_ZPZZ_D
652 
653 /*
654  * Three operand expander, operating on element pairs.
655  * If the slot I is even, the elements come from VN {I, I+1}.
656  * If the slot I is odd, the elements come from VM {I-1, I}.
657  * Load all of the input elements in each pair before overwriting output.
658  */
659 #define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
660 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
661 {                                                               \
662     intptr_t i, opr_sz = simd_oprsz(desc);                      \
663     for (i = 0; i < opr_sz; ) {                                 \
664         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
665         do {                                                    \
666             TYPE n0 = *(TYPE *)(vn + H(i));                     \
667             TYPE m0 = *(TYPE *)(vm + H(i));                     \
668             TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));      \
669             TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));      \
670             if (pg & 1) {                                       \
671                 *(TYPE *)(vd + H(i)) = OP(n0, n1);              \
672             }                                                   \
673             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
674             if (pg & 1) {                                       \
675                 *(TYPE *)(vd + H(i)) = OP(m0, m1);              \
676             }                                                   \
677             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
678         } while (i & 15);                                       \
679     }                                                           \
680 }
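/*
 * For example, ADDP on byte elements with n = {1, 2, 3, 4, ...} and
 * m = {10, 20, 30, 40, ...} interleaves the pairwise sums:
 * d = {1+2, 10+20, 3+4, 30+40, ...}.  Loading n0/n1/m0/m1 before either
 * store keeps this correct when vd aliases vn or vm.
 */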
681 
682 /* Similarly, specialized for 64-bit operands.  */
683 #define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
684 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
685 {                                                               \
686     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
687     TYPE *d = vd, *n = vn, *m = vm;                             \
688     uint8_t *pg = vg;                                           \
689     for (i = 0; i < opr_sz; i += 2) {                           \
690         TYPE n0 = n[i], n1 = n[i + 1];                          \
691         TYPE m0 = m[i], m1 = m[i + 1];                          \
692         if (pg[H1(i)] & 1) {                                    \
693             d[i] = OP(n0, n1);                                  \
694         }                                                       \
695         if (pg[H1(i + 1)] & 1) {                                \
696             d[i + 1] = OP(m0, m1);                              \
697         }                                                       \
698     }                                                           \
699 }
700 
701 DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
702 DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
703 DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
704 DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)
705 
706 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
707 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
708 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
709 DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)
710 
711 DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
712 DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
713 DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
714 DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)
715 
716 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
717 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
718 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
719 DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)
720 
721 DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
722 DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
723 DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
724 DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)
725 
726 #undef DO_ZPZZ_PAIR
727 #undef DO_ZPZZ_PAIR_D
728 
729 #define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP)                              \
730 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
731                   void *status, uint32_t desc)                          \
732 {                                                                       \
733     intptr_t i, opr_sz = simd_oprsz(desc);                              \
734     for (i = 0; i < opr_sz; ) {                                         \
735         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
736         do {                                                            \
737             TYPE n0 = *(TYPE *)(vn + H(i));                             \
738             TYPE m0 = *(TYPE *)(vm + H(i));                             \
739             TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));              \
740             TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));              \
741             if (pg & 1) {                                               \
742                 *(TYPE *)(vd + H(i)) = OP(n0, n1, status);              \
743             }                                                           \
744             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
745             if (pg & 1) {                                               \
746                 *(TYPE *)(vd + H(i)) = OP(m0, m1, status);              \
747             }                                                           \
748             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
749         } while (i & 15);                                               \
750     }                                                                   \
751 }
752 
753 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
754 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
755 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)
756 
757 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
758 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
759 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)
760 
761 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
762 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
763 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)
764 
765 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
766 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
767 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)
768 
769 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
770 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
771 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)
772 
773 #undef DO_ZPZZ_PAIR_FP
774 
775 /* Three-operand expander, controlled by a predicate, in which the
776  * third operand is "wide".  That is, for D = N op M, the same 64-bit
777  * value of M is used with all of the narrower values of N.
778  */
779 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)                               \
780 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
781 {                                                                       \
782     intptr_t i, opr_sz = simd_oprsz(desc);                              \
783     for (i = 0; i < opr_sz; ) {                                         \
784         uint8_t pg = *(uint8_t *)(vg + H1(i >> 3));                     \
785         TYPEW mm = *(TYPEW *)(vm + i);                                  \
786         do {                                                            \
787             if (pg & 1) {                                               \
788                 TYPE nn = *(TYPE *)(vn + H(i));                         \
789                 *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
790             }                                                           \
791             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
792         } while (i & 7);                                                \
793     }                                                                   \
794 }
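/*
 * For example, with 32-bit elements the inner loop spans one 64-bit
 * column ("while (i & 7)"), so elements 0 and 1 are both shifted by the
 * first 64-bit element of M and elements 2 and 3 by the second; as in
 * DO_ZPZZ, only bit 0 of each element's predicate group is consulted.
 */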
795 
796 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
797 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
798 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
799 
800 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
801 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
802 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
803 
804 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
805 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
806 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
807 
808 #undef DO_ZPZW
809 
810 /* Fully general two-operand expander, controlled by a predicate.
811  */
812 #define DO_ZPZ(NAME, TYPE, H, OP)                               \
813 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
814 {                                                               \
815     intptr_t i, opr_sz = simd_oprsz(desc);                      \
816     for (i = 0; i < opr_sz; ) {                                 \
817         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
818         do {                                                    \
819             if (pg & 1) {                                       \
820                 TYPE nn = *(TYPE *)(vn + H(i));                 \
821                 *(TYPE *)(vd + H(i)) = OP(nn);                  \
822             }                                                   \
823             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
824         } while (i & 15);                                       \
825     }                                                           \
826 }
827 
828 /* Similarly, specialized for 64-bit operands.  */
829 #define DO_ZPZ_D(NAME, TYPE, OP)                                \
830 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
831 {                                                               \
832     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
833     TYPE *d = vd, *n = vn;                                      \
834     uint8_t *pg = vg;                                           \
835     for (i = 0; i < opr_sz; i += 1) {                           \
836         if (pg[H1(i)] & 1) {                                    \
837             TYPE nn = n[i];                                     \
838             d[i] = OP(nn);                                      \
839         }                                                       \
840     }                                                           \
841 }
842 
843 #define DO_CLS_B(N)   (clrsb32(N) - 24)
844 #define DO_CLS_H(N)   (clrsb32(N) - 16)
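/*
 * The byte and halfword inputs are sign-extended to 32 bits, adding 24
 * or 16 redundant copies of the sign bit, which the subtraction removes.
 * For example, DO_CLS_B(0x03) = clrsb32(3) - 24 = 29 - 24 = 5.
 */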
845 
846 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
847 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
848 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
849 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
850 
851 #define DO_CLZ_B(N)   (clz32(N) - 24)
852 #define DO_CLZ_H(N)   (clz32(N) - 16)
853 
854 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
855 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
856 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
857 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
858 
859 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
860 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
861 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
862 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
863 
864 #define DO_CNOT(N)    (N == 0)
865 
866 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
867 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
868 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
869 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
870 
871 #define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))
872 
873 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
874 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
875 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
876 
877 #define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))
878 
879 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
880 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
881 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
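/*
 * In both macros, ((__typeof(N))-1 >> 1) is the all-ones value of the
 * unsigned element type with its sign bit cleared (0x7fff for the 16-bit
 * case): DO_FABS ANDs with it to clear the FP sign bit, and DO_FNEG XORs
 * with its complement to flip it, with no FP arithmetic involved.
 */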
882 
883 #define DO_NOT(N)    (~N)
884 
885 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
886 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
887 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
888 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
889 
890 #define DO_SXTB(N)    ((int8_t)N)
891 #define DO_SXTH(N)    ((int16_t)N)
892 #define DO_SXTS(N)    ((int32_t)N)
893 #define DO_UXTB(N)    ((uint8_t)N)
894 #define DO_UXTH(N)    ((uint16_t)N)
895 #define DO_UXTS(N)    ((uint32_t)N)
896 
897 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
898 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
899 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
900 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
901 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
902 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
903 
904 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
905 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
906 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
907 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
908 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
909 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
910 
911 #define DO_ABS(N)    (N < 0 ? -N : N)
912 
913 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
914 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
915 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
916 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
917 
918 #define DO_NEG(N)    (-N)
919 
920 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
921 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
922 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
923 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
924 
925 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
926 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
927 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
928 
929 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
930 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
931 
932 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
933 
934 void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc)
935 {
936     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
937     uint64_t *d = vd, *n = vn;
938     uint8_t *pg = vg;
939 
940     for (i = 0; i < opr_sz; i += 2) {
941         if (pg[H1(i)] & 1) {
942             uint64_t n0 = n[i + 0];
943             uint64_t n1 = n[i + 1];
944             d[i + 0] = n1;
945             d[i + 1] = n0;
946         }
947     }
948 }
949 
950 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
951 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
952 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
953 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
954 
955 #define DO_SQABS(X) \
956     ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
957        x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
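/*
 * min_ is the minimum value of the element type (e.g. (int8_t)0x80) and
 * -min_ - 1 is then its maximum, so the one value whose negation would
 * overflow saturates instead: DO_SQABS((int8_t)-128) == 127.
 */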
958 
959 DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
960 DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
961 DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
962 DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
963 
964 #define DO_SQNEG(X) \
965     ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
966        x_ == min_ ? -min_ - 1 : -x_; })
967 
968 DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
969 DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
970 DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
971 DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
972 
973 DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
974 DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
975 
976 /* Three-operand expander, unpredicated, in which the third operand is "wide".
977  */
978 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP)                       \
979 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
980 {                                                              \
981     intptr_t i, opr_sz = simd_oprsz(desc);                     \
982     for (i = 0; i < opr_sz; ) {                                \
983         TYPEW mm = *(TYPEW *)(vm + i);                         \
984         do {                                                   \
985             TYPE nn = *(TYPE *)(vn + H(i));                    \
986             *(TYPE *)(vd + H(i)) = OP(nn, mm);                 \
987             i += sizeof(TYPE);                                 \
988         } while (i & 7);                                       \
989     }                                                          \
990 }
991 
992 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
993 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
994 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
995 
996 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
997 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
998 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
999 
1000 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
1001 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
1002 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
1003 
1004 #undef DO_ZZW
1005 
1006 #undef DO_CLS_B
1007 #undef DO_CLS_H
1008 #undef DO_CLZ_B
1009 #undef DO_CLZ_H
1010 #undef DO_CNOT
1011 #undef DO_FABS
1012 #undef DO_FNEG
1013 #undef DO_ABS
1014 #undef DO_NEG
1015 #undef DO_ZPZ
1016 #undef DO_ZPZ_D
1017 
1018 /*
1019  * Three-operand expander, unpredicated, in which the two inputs are
1020  * selected from the top or bottom half of the wide column.
1021  */
1022 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1023 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
1024 {                                                                       \
1025     intptr_t i, opr_sz = simd_oprsz(desc);                              \
1026     int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
1027     int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1028     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
1029         TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
1030         TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
1031         *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                            \
1032     }                                                                   \
1033 }
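/*
 * The two desc bits independently select the bottom (0) or top (1) half
 * of each wide column for N and M; scaling by sizeof(TYPEN) turns the
 * bit into a byte offset, so sel = 0 reads the even-numbered narrow
 * elements and sel = sizeof(TYPEN) the odd-numbered ones.  This lets one
 * helper serve the bottom, top and mixed forms of these instructions.
 */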
1034 
1035 DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1036 DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1037 DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1038 
1039 DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1040 DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1041 DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1042 
1043 DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1044 DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1045 DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1046 
1047 DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1048 DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1049 DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1050 
1051 DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1052 DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1053 DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1054 
1055 DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1056 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1057 DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1058 
1059 DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1060 DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1061 DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1062 
1063 DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1064 DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1065 DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1066 
1067 /* Note that the multiply cannot overflow, but the doubling can. */
1068 static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
1069 {
1070     int16_t val = n * m;
1071     return DO_SQADD_H(val, val);
1072 }
1073 
1074 static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
1075 {
1076     int32_t val = n * m;
1077     return DO_SQADD_S(val, val);
1078 }
1079 
1080 static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
1081 {
1082     int64_t val = n * m;
1083     return do_sqadd_d(val, val);
1084 }
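/*
 * For example, in do_sqdmull_h the widened int8_t inputs -128 and -128
 * give val = 16384, which fits in int16_t, but doubling would produce
 * 32768, so DO_SQADD_H saturates the result to INT16_MAX as SQDMULL
 * requires.
 */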
1085 
1086 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
1087 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1088 DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1089 
1090 #undef DO_ZZZ_TB
1091 
1092 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1093 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1094 {                                                              \
1095     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1096     int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1097     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {              \
1098         TYPEW nn = *(TYPEW *)(vn + HW(i));                     \
1099         TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));              \
1100         *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                   \
1101     }                                                          \
1102 }
1103 
1104 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1105 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1106 DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1107 
1108 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1109 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1110 DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1111 
1112 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1113 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1114 DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1115 
1116 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1117 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1118 DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1119 
1120 #undef DO_ZZZ_WTB
1121 
1122 #define DO_ZZZ_NTB(NAME, TYPE, H, OP)                                   \
1123 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
1124 {                                                                       \
1125     intptr_t i, opr_sz = simd_oprsz(desc);                              \
1126     intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
1127     intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
1128     for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {                    \
1129         TYPE nn = *(TYPE *)(vn + H(i + sel1));                          \
1130         TYPE mm = *(TYPE *)(vm + H(i + sel2));                          \
1131         *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm);                       \
1132     }                                                                   \
1133 }
1134 
1135 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
1136 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
1137 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
1138 DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)
1139 
1140 #undef DO_ZZZ_NTB
1141 
1142 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
1143 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1144 {                                                               \
1145     intptr_t i, opr_sz = simd_oprsz(desc);                      \
1146     intptr_t sel1 = simd_data(desc) * sizeof(TYPEN);            \
1147     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {               \
1148         TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));               \
1149         TYPEW mm = *(TYPEN *)(vm + HN(i + sel1));               \
1150         TYPEW aa = *(TYPEW *)(va + HW(i));                      \
1151         *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa;               \
1152     }                                                           \
1153 }
1154 
1155 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1156 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1157 DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1158 
1159 DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1160 DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1161 DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1162 
1163 DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1164 DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1165 DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1166 
1167 DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1168 DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1169 DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1170 
1171 #define DO_NMUL(N, M)  -(N * M)
1172 
1173 DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
1174 DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
1175 DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)
1176 
1177 DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
1178 DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
1179 DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)
1180 
1181 #undef DO_ZZZW_ACC
1182 
1183 #define DO_XTNB(NAME, TYPE, OP) \
1184 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
1185 {                                                            \
1186     intptr_t i, opr_sz = simd_oprsz(desc);                   \
1187     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {             \
1188         TYPE nn = *(TYPE *)(vn + i);                         \
1189         nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4);  \
1190         *(TYPE *)(vd + i) = nn;                              \
1191     }                                                        \
1192 }
1193 
1194 #define DO_XTNT(NAME, TYPE, TYPEN, H, OP)                               \
1195 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
1196 {                                                                       \
1197     intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN));      \
1198     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
1199         TYPE nn = *(TYPE *)(vn + i);                                    \
1200         *(TYPEN *)(vd + i + odd) = OP(nn);                              \
1201     }                                                                   \
1202 }
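/*
 * DO_XTNB narrows in place: the result is masked to the low half of each
 * wide element, so the odd (top) narrow lanes of the destination become
 * zero.  DO_XTNT stores only into the odd narrow lane of each wide
 * element (offset H(sizeof(TYPEN))), leaving the even lanes unchanged.
 */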
1203 
1204 #define DO_SQXTN_H(n)  do_sat_bhs(n, INT8_MIN, INT8_MAX)
1205 #define DO_SQXTN_S(n)  do_sat_bhs(n, INT16_MIN, INT16_MAX)
1206 #define DO_SQXTN_D(n)  do_sat_bhs(n, INT32_MIN, INT32_MAX)
1207 
1208 DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
1209 DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
1210 DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)
1211 
1212 DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
1213 DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
1214 DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)
1215 
1216 #define DO_UQXTN_H(n)  do_sat_bhs(n, 0, UINT8_MAX)
1217 #define DO_UQXTN_S(n)  do_sat_bhs(n, 0, UINT16_MAX)
1218 #define DO_UQXTN_D(n)  do_sat_bhs(n, 0, UINT32_MAX)
1219 
1220 DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
1221 DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
1222 DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)
1223 
1224 DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
1225 DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
1226 DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)
1227 
1228 DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
1229 DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
1230 DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)
1231 
1232 DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
1233 DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
1234 DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
1235 
1236 #undef DO_XTNB
1237 #undef DO_XTNT
1238 
1239 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1240 {
1241     intptr_t i, opr_sz = simd_oprsz(desc);
1242     int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
1243     uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1244     uint32_t *a = va, *n = vn;
1245     uint64_t *d = vd, *m = vm;
1246 
1247     for (i = 0; i < opr_sz / 8; ++i) {
1248         uint32_t e1 = a[2 * i + H4(0)];
1249         uint32_t e2 = n[2 * i + sel] ^ inv;
1250         uint64_t c = extract64(m[i], 32, 1);
1251         /* Compute and store the entire 33-bit result at once. */
1252         d[i] = c + e1 + e2;
1253     }
1254 }
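/*
 * Illustrative note: each 64-bit lane of the result holds a 32-bit sum
 * in its even half and the carry out in bit 32 (bit 0 of the odd half).
 * e1 is the even element of the accumulator, e2 is the bottom or top
 * element of N selected by desc (XORed with all-ones for the subtracting
 * forms), and c is the carry propagated from the previous result in M.
 * Since c + e1 + e2 fits in 33 bits, one 64-bit store writes both the
 * sum and the new carry.
 */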
1255 
1256 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1257 {
1258     intptr_t i, opr_sz = simd_oprsz(desc);
1259     int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
1260     uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1261     uint64_t *d = vd, *a = va, *n = vn, *m = vm;
1262 
1263     for (i = 0; i < opr_sz / 8; i += 2) {
1264         Int128 e1 = int128_make64(a[i]);
1265         Int128 e2 = int128_make64(n[i + sel] ^ inv);
1266         Int128 c = int128_make64(m[i + 1] & 1);
1267         Int128 r = int128_add(int128_add(e1, e2), c);
1268         d[i + 0] = int128_getlo(r);
1269         d[i + 1] = int128_gethi(r);
1270     }
1271 }
1272 
1273 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
1274 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1275 {                                                                       \
1276     intptr_t i, opr_sz = simd_oprsz(desc);                              \
1277     int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
1278     int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1279     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
1280         TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
1281         TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
1282         TYPEW aa = *(TYPEW *)(va + HW(i));                              \
1283         *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm));           \
1284     }                                                                   \
1285 }
1286 
1287 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
1288            do_sqdmull_h, DO_SQADD_H)
1289 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1290            do_sqdmull_s, DO_SQADD_S)
1291 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1292            do_sqdmull_d, do_sqadd_d)
1293 
1294 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
1295            do_sqdmull_h, DO_SQSUB_H)
1296 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1297            do_sqdmull_s, DO_SQSUB_S)
1298 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1299            do_sqdmull_d, do_sqsub_d)
1300 
1301 #undef DO_SQDMLAL
1302 
1303 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
1304 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1305 {                                                               \
1306     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);       \
1307     int rot = simd_data(desc);                                  \
1308     int sel_a = rot & 1, sel_b = sel_a ^ 1;                     \
1309     bool sub_r = rot == 1 || rot == 2;                          \
1310     bool sub_i = rot >= 2;                                      \
1311     TYPE *d = vd, *n = vn, *m = vm, *a = va;                    \
1312     for (i = 0; i < opr_sz; i += 2) {                           \
1313         TYPE elt1_a = n[H(i + sel_a)];                          \
1314         TYPE elt2_a = m[H(i + sel_a)];                          \
1315         TYPE elt2_b = m[H(i + sel_b)];                          \
1316         d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r);           \
1317         d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i);   \
1318     }                                                           \
1319 }
1320 
1321 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
1322 
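/*
 * For the plain DO_CMLA operation the rotate immediate decodes as
 * (even lane = real, odd lane = imaginary):
 *   rot=0: d_r = a_r + n_r*m_r,  d_i = a_i + n_r*m_i
 *   rot=1: d_r = a_r - n_i*m_i,  d_i = a_i + n_i*m_r
 *   rot=2: d_r = a_r - n_r*m_r,  d_i = a_i - n_r*m_i
 *   rot=3: d_r = a_r + n_i*m_i,  d_i = a_i - n_i*m_r
 * so rot=0 followed by rot=1 accumulates a full complex multiply.
 * The saturating DO_SQRDMLAH_* variants below use the same selection but
 * with the rounding, doubling, saturating multiply-accumulate primitive.
 */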
1323 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
1324 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
1325 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
1326 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)
1327 
1328 #define DO_SQRDMLAH_B(N, M, A, S) \
1329     do_sqrdmlah_b(N, M, A, S, true)
1330 #define DO_SQRDMLAH_H(N, M, A, S) \
1331     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
1332 #define DO_SQRDMLAH_S(N, M, A, S) \
1333     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
1334 #define DO_SQRDMLAH_D(N, M, A, S) \
1335     do_sqrdmlah_d(N, M, A, S, true)
1336 
1337 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
1338 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
1339 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
1340 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
1341 
1342 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
1343 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)    \
1344 {                                                                           \
1345     intptr_t i, j, oprsz = simd_oprsz(desc);                                \
1346     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);                          \
1347     int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2;                  \
1348     int sel_a = rot & 1, sel_b = sel_a ^ 1;                                 \
1349     bool sub_r = rot == 1 || rot == 2;                                      \
1350     bool sub_i = rot >= 2;                                                  \
1351     TYPE *d = vd, *n = vn, *m = vm, *a = va;                                \
1352     for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) {         \
1353         TYPE elt2_a = m[H(i + idx + sel_a)];                                \
1354         TYPE elt2_b = m[H(i + idx + sel_b)];                                \
1355         for (j = 0; j < 16 / sizeof(TYPE); j += 2) {                        \
1356             TYPE elt1_a = n[H(i + j + sel_a)];                              \
1357             d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r);          \
1358             d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i);  \
1359         }                                                                   \
1360     }                                                                       \
1361 }
1362 
1363 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
1364 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)
1365 
1366 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1367 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1368 
1369 #undef DO_CMLA
1370 #undef DO_CMLA_FUNC
1371 #undef DO_CMLA_IDX_FUNC
1372 #undef DO_SQRDMLAH_B
1373 #undef DO_SQRDMLAH_H
1374 #undef DO_SQRDMLAH_S
1375 #undef DO_SQRDMLAH_D
1376 
1377 /* Note N and M are 4 elements bundled into one unit. */
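/*
 * Each 32-bit unit is two int8 complex values, real in the even byte and
 * imaginary in the odd byte (int16 halves for do_cdot_d).  The callers
 * below derive sel_a/sel_b/sub_i from the rotate immediate, so that per
 * complex pair the accumulation is
 *     a += elt1_r * m[sel_a] + elt1_i * m[sel_b] * sub_i
 * e.g. rot=0 (sel_a=0, sub_i=-1) accumulates n_r*m_r - n_i*m_i.
 */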
1378 static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
1379                          int sel_a, int sel_b, int sub_i)
1380 {
1381     for (int i = 0; i <= 1; i++) {
1382         int32_t elt1_r = (int8_t)(n >> (16 * i));
1383         int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
1384         int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
1385         int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));
1386 
1387         a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1388     }
1389     return a;
1390 }
1391 
1392 static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
1393                          int sel_a, int sel_b, int sub_i)
1394 {
1395     for (int i = 0; i <= 1; i++) {
1396         int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
1397         int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
1398         int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
1399         int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));
1400 
1401         a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1402     }
1403     return a;
1404 }
1405 
1406 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
1407                               void *va, uint32_t desc)
1408 {
1409     int opr_sz = simd_oprsz(desc);
1410     int rot = simd_data(desc);
1411     int sel_a = rot & 1;
1412     int sel_b = sel_a ^ 1;
1413     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1414     uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1415 
1416     for (int e = 0; e < opr_sz / 4; e++) {
1417         d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1418     }
1419 }
1420 
1421 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
1422                               void *va, uint32_t desc)
1423 {
1424     int opr_sz = simd_oprsz(desc);
1425     int rot = simd_data(desc);
1426     int sel_a = rot & 1;
1427     int sel_b = sel_a ^ 1;
1428     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1429     uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1430 
1431     for (int e = 0; e < opr_sz / 8; e++) {
1432         d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1433     }
1434 }
1435 
1436 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
1437                              void *va, uint32_t desc)
1438 {
1439     int opr_sz = simd_oprsz(desc);
1440     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1441     int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
1442     int sel_a = rot & 1;
1443     int sel_b = sel_a ^ 1;
1444     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1445     uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1446 
1447     for (int seg = 0; seg < opr_sz / 4; seg += 4) {
1448         uint32_t seg_m = m[seg + idx];
1449         for (int e = 0; e < 4; e++) {
1450             d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
1451                                    sel_a, sel_b, sub_i);
1452         }
1453     }
1454 }
1455 
1456 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
1457                              void *va, uint32_t desc)
1458 {
1459     int seg, opr_sz = simd_oprsz(desc);
1460     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1461     int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1462     int sel_a = rot & 1;
1463     int sel_b = sel_a ^ 1;
1464     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1465     uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1466 
1467     for (seg = 0; seg < opr_sz / 8; seg += 2) {
1468         uint64_t seg_m = m[seg + idx];
1469         for (int e = 0; e < 2; e++) {
1470             d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
1471                                    sel_a, sel_b, sub_i);
1472         }
1473     }
1474 }
1475 
1476 #define DO_ZZXZ(NAME, TYPE, H, OP) \
1477 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1478 {                                                                       \
1479     intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);     \
1480     intptr_t i, j, idx = simd_data(desc);                               \
1481     TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx);           \
1482     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {               \
1483         TYPE mm = m[i];                                                 \
1484         for (j = 0; j < segment; j++) {                                 \
1485             d[i + j] = OP(n[i + j], mm, a[i + j]);                      \
1486         }                                                               \
1487     }                                                                   \
1488 }
1489 
1490 #define DO_SQRDMLAH_H(N, M, A) \
1491     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
1492 #define DO_SQRDMLAH_S(N, M, A) \
1493     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
1494 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)
1495 
1496 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1497 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1498 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)
1499 
1500 #define DO_SQRDMLSH_H(N, M, A) \
1501     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
1502 #define DO_SQRDMLSH_S(N, M, A) \
1503     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
1504 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)
1505 
1506 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
1507 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
1508 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)
1509 
1510 #undef DO_ZZXZ
1511 
1512 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
1513 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
1514 {                                                                         \
1515     intptr_t i, j, oprsz = simd_oprsz(desc);                              \
1516     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);   \
1517     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1518     for (i = 0; i < oprsz; i += 16) {                                     \
1519         TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                          \
1520         for (j = 0; j < 16; j += sizeof(TYPEW)) {                         \
1521             TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                  \
1522             TYPEW aa = *(TYPEW *)(va + HW(i + j));                        \
1523             *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa);                  \
1524         }                                                                 \
1525     }                                                                     \
1526 }
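/*
 * Indexed widening multiply-accumulate: idx selects one narrow element
 * within each 128-bit segment of vm, which is reused for every product
 * in that segment, while sel chooses the even (bottom) or odd (top)
 * narrow elements of vn.
 */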
1527 
1528 #define DO_MLA(N, M, A)  (A + N * M)
1529 
1530 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
1531 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
1532 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
1533 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)
1534 
1535 #define DO_MLS(N, M, A)  (A - N * M)
1536 
1537 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
1538 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
1539 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
1540 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)
1541 
1542 #define DO_SQDMLAL_S(N, M, A)  DO_SQADD_S(A, do_sqdmull_s(N, M))
1543 #define DO_SQDMLAL_D(N, M, A)  do_sqadd_d(A, do_sqdmull_d(N, M))
1544 
1545 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
1546 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)
1547 
1548 #define DO_SQDMLSL_S(N, M, A)  DO_SQSUB_S(A, do_sqdmull_s(N, M))
1549 #define DO_SQDMLSL_D(N, M, A)  do_sqsub_d(A, do_sqdmull_d(N, M))
1550 
1551 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
1552 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)
1553 
1554 #undef DO_MLA
1555 #undef DO_MLS
1556 #undef DO_ZZXW
1557 
1558 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
1559 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)            \
1560 {                                                                         \
1561     intptr_t i, j, oprsz = simd_oprsz(desc);                              \
1562     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);   \
1563     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1564     for (i = 0; i < oprsz; i += 16) {                                     \
1565         TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                          \
1566         for (j = 0; j < 16; j += sizeof(TYPEW)) {                         \
1567             TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                  \
1568             *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm);                      \
1569         }                                                                 \
1570     }                                                                     \
1571 }
1572 
1573 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1574 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1575 
1576 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1577 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1578 
1579 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1580 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1581 
1582 #undef DO_ZZX
1583 
1584 #define DO_BITPERM(NAME, TYPE, OP) \
1585 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1586 {                                                              \
1587     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1588     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {               \
1589         TYPE nn = *(TYPE *)(vn + i);                           \
1590         TYPE mm = *(TYPE *)(vm + i);                           \
1591         *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8);      \
1592     }                                                          \
1593 }
1594 
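/*
 * BEXT: gather the data bits selected by the mask into a contiguous
 * low-order field.  For example, with n = 8, data = 0xb6 and mask = 0xf0,
 * the four selected bits (bits 4..7) are packed into bits 0..3,
 * giving 0x0b.
 */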
1595 static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
1596 {
1597     uint64_t res = 0;
1598     int db, rb = 0;
1599 
1600     for (db = 0; db < n; ++db) {
1601         if ((mask >> db) & 1) {
1602             res |= ((data >> db) & 1) << rb;
1603             ++rb;
1604         }
1605     }
1606     return res;
1607 }
1608 
1609 DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
1610 DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
1611 DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
1612 DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1613 
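/*
 * BDEP: scatter the low-order data bits into the bit positions selected
 * by the mask.  For example, with n = 8, data = 0x0b and mask = 0xf0,
 * bits 0..3 of data are deposited into bits 4..7, giving 0xb0.
 */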
1614 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
1615 {
1616     uint64_t res = 0;
1617     int rb, db = 0;
1618 
1619     for (rb = 0; rb < n; ++rb) {
1620         if ((mask >> rb) & 1) {
1621             res |= ((data >> db) & 1) << rb;
1622             ++db;
1623         }
1624     }
1625     return res;
1626 }
1627 
1628 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
1629 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
1630 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
1631 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1632 
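/*
 * BGRP: pack the data bits selected by the mask at the bottom of the
 * result and the remaining bits above them.  For example, with n = 8,
 * data = 0xb6 and mask = 0xf0, the selected bits form 0x0b, the
 * unselected bits form 0x06, and the result is 0x6b.
 */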
1633 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
1634 {
1635     uint64_t resm = 0, resu = 0;
1636     int db, rbm = 0, rbu = 0;
1637 
1638     for (db = 0; db < n; ++db) {
1639         uint64_t val = (data >> db) & 1;
1640         if ((mask >> db) & 1) {
1641             resm |= val << rbm++;
1642         } else {
1643             resu |= val << rbu++;
1644         }
1645     }
1646 
1647     return resm | (resu << rbm);
1648 }
1649 
1650 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
1651 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
1652 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
1653 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
1654 
1655 #undef DO_BITPERM
1656 
1657 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP)                  \
1658 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
1659 {                                                               \
1660     intptr_t i, opr_sz = simd_oprsz(desc);                      \
1661     int sub_r = simd_data(desc);                                \
1662     if (sub_r) {                                                \
1663         for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
1664             TYPE acc_r = *(TYPE *)(vn + H(i));                  \
1665             TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
1666             TYPE el2_r = *(TYPE *)(vm + H(i));                  \
1667             TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
1668             acc_r = ADD_OP(acc_r, el2_i);                       \
1669             acc_i = SUB_OP(acc_i, el2_r);                       \
1670             *(TYPE *)(vd + H(i)) = acc_r;                       \
1671             *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
1672         }                                                       \
1673     } else {                                                    \
1674         for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
1675             TYPE acc_r = *(TYPE *)(vn + H(i));                  \
1676             TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
1677             TYPE el2_r = *(TYPE *)(vm + H(i));                  \
1678             TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
1679             acc_r = SUB_OP(acc_r, el2_i);                       \
1680             acc_i = ADD_OP(acc_i, el2_r);                       \
1681             *(TYPE *)(vd + H(i)) = acc_r;                       \
1682             *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
1683         }                                                       \
1684     }                                                           \
1685 }
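/*
 * The descriptor bit selects the rotation applied to the vm operand
 * before the add: clear rotates by 90 degrees (real -= m_i, imag += m_r),
 * set rotates by 270 degrees (real += m_i, imag -= m_r).
 */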
1686 
1687 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
1688 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
1689 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
1690 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)
1691 
1692 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
1693 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
1694 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
1695 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)
1696 
1697 #undef DO_CADD
1698 
1699 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
1700 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
1701 {                                                              \
1702     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1703     intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN);      \
1704     int shift = simd_data(desc) >> 1;                          \
1705     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {              \
1706         TYPEW nn = *(TYPEN *)(vn + HN(i + sel));               \
1707         *(TYPEW *)(vd + HW(i)) = nn << shift;                  \
1708     }                                                          \
1709 }
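/*
 * Bit 0 of the descriptor data selects the even (bottom, SSHLLB/USHLLB)
 * or odd (top, SSHLLT/USHLLT) narrow source elements; the remaining bits
 * give the left shift applied after widening.
 */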
1710 
1711 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
1712 DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
1713 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)
1714 
1715 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
1716 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
1717 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)
1718 
1719 #undef DO_ZZI_SHLL
1720 
1721 /* Two-operand reduction expander, controlled by a predicate.
1722  * The difference between TYPERED and TYPERET has to do with
1723  * sign-extension.  E.g. for SMAX, TYPERED must be signed,
1724  * but TYPERET must be unsigned so that e.g. a 32-bit value
1725  * is not sign-extended to the ABI uint64_t return type.
1726  */
1727 /* ??? If we were to vectorize this by hand the reduction ordering
1728  * would change.  For integer operands, this is perfectly fine.
1729  */
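/*
 * For example, sve_smaxv_s below uses TYPEELT = TYPERED = int32_t for the
 * comparison but TYPERET = uint32_t, so the result is zero-extended rather
 * than sign-extended into the uint64_t return value.  The predicate is
 * consumed 16 bits (one 16-byte vector chunk) at a time and shifted by
 * sizeof(TYPEELT) per element, since predicates have one bit per byte.
 */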
1730 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1731 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
1732 {                                                          \
1733     intptr_t i, opr_sz = simd_oprsz(desc);                 \
1734     TYPERED ret = INIT;                                    \
1735     for (i = 0; i < opr_sz; ) {                            \
1736         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
1737         do {                                               \
1738             if (pg & 1) {                                  \
1739                 TYPEELT nn = *(TYPEELT *)(vn + H(i));      \
1740                 ret = OP(ret, nn);                         \
1741             }                                              \
1742             i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT);  \
1743         } while (i & 15);                                  \
1744     }                                                      \
1745     return (TYPERET)ret;                                   \
1746 }
1747 
1748 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)             \
1749 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
1750 {                                                          \
1751     intptr_t i, opr_sz = simd_oprsz(desc) / 8;             \
1752     TYPEE *n = vn;                                         \
1753     uint8_t *pg = vg;                                      \
1754     TYPER ret = INIT;                                      \
1755     for (i = 0; i < opr_sz; i += 1) {                      \
1756         if (pg[H1(i)] & 1) {                               \
1757             TYPEE nn = n[i];                               \
1758             ret = OP(ret, nn);                             \
1759         }                                                  \
1760     }                                                      \
1761     return ret;                                            \
1762 }
1763 
1764 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
1765 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
1766 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
1767 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
1768 
1769 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
1770 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
1771 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
1772 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
1773 
1774 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1775 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1776 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1777 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1778 
1779 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1780 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1781 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1782 
1783 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1784 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1785 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1786 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
1787 
1788 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
1789 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
1790 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
1791 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
1792 
1793 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
1794 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
1795 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
1796 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
1797 
1798 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
1799 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
1800 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
1801 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
1802 
1803 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1804 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1805 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1806 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
1807 
1808 #undef DO_VPZ
1809 #undef DO_VPZ_D
1810 
1811 /* Two vector operand, one scalar operand, unpredicated.  */
1812 #define DO_ZZI(NAME, TYPE, OP)                                       \
1813 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc)   \
1814 {                                                                    \
1815     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);            \
1816     TYPE s = s64, *d = vd, *n = vn;                                  \
1817     for (i = 0; i < opr_sz; ++i) {                                   \
1818         d[i] = OP(n[i], s);                                          \
1819     }                                                                \
1820 }
1821 
1822 #define DO_SUBR(X, Y)   (Y - X)
1823 
1824 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
1825 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
1826 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
1827 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
1828 
1829 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
1830 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
1831 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
1832 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
1833 
1834 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
1835 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
1836 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
1837 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
1838 
1839 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
1840 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
1841 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
1842 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
1843 
1844 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
1845 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
1846 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
1847 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
1848 
1849 #undef DO_ZZI
1850 
1851 #undef DO_AND
1852 #undef DO_ORR
1853 #undef DO_EOR
1854 #undef DO_BIC
1855 #undef DO_ADD
1856 #undef DO_SUB
1857 #undef DO_MAX
1858 #undef DO_MIN
1859 #undef DO_ABD
1860 #undef DO_MUL
1861 #undef DO_DIV
1862 #undef DO_ASR
1863 #undef DO_LSR
1864 #undef DO_LSL
1865 #undef DO_SUBR
1866 
1867 /* Similar to the ARM LastActiveElement pseudocode function, except the
1868    result is multiplied by the element size.  This includes the not found
1869    indication; e.g. not found for esz=3 is -8.  */
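/*
 * For example, with esz=2 (.S elements) and element 3 the last active
 * element, the highest set predicate bit is bit 12 and the return value
 * is 12, the byte offset of that element; with no bits set it is -4.
 */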
1870 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1871 {
1872     uint64_t mask = pred_esz_masks[esz];
1873     intptr_t i = words;
1874 
1875     do {
1876         uint64_t this_g = g[--i] & mask;
1877         if (this_g) {
1878             return i * 64 + (63 - clz64(this_g));
1879         }
1880     } while (i > 0);
1881     return (intptr_t)-1 << esz;
1882 }
1883 
1884 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
1885 {
1886     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1887     uint32_t flags = PREDTEST_INIT;
1888     uint64_t *d = vd, *g = vg;
1889     intptr_t i = 0;
1890 
1891     do {
1892         uint64_t this_d = d[i];
1893         uint64_t this_g = g[i];
1894 
1895         if (this_g) {
1896             if (!(flags & 4)) {
1897                 /* Set in D the first bit of G.  */
1898                 this_d |= this_g & -this_g;
1899                 d[i] = this_d;
1900             }
1901             flags = iter_predtest_fwd(this_d, this_g, flags);
1902         }
1903     } while (++i < words);
1904 
1905     return flags;
1906 }
1907 
1908 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
1909 {
1910     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1911     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1912     uint32_t flags = PREDTEST_INIT;
1913     uint64_t *d = vd, *g = vg, esz_mask;
1914     intptr_t i, next;
1915 
1916     next = last_active_element(vd, words, esz) + (1 << esz);
1917     esz_mask = pred_esz_masks[esz];
1918 
1919     /* Similar to the pseudocode for pnext, but scaled by ESZ
1920        so that we find the correct bit.  */
1921     if (next < words * 64) {
1922         uint64_t mask = -1;
1923 
1924         if (next & 63) {
1925             mask = ~((1ull << (next & 63)) - 1);
1926             next &= -64;
1927         }
1928         do {
1929             uint64_t this_g = g[next / 64] & esz_mask & mask;
1930             if (this_g != 0) {
1931                 next = (next & -64) + ctz64(this_g);
1932                 break;
1933             }
1934             next += 64;
1935             mask = -1;
1936         } while (next < words * 64);
1937     }
1938 
1939     i = 0;
1940     do {
1941         uint64_t this_d = 0;
1942         if (i == next / 64) {
1943             this_d = 1ull << (next & 63);
1944         }
1945         d[i] = this_d;
1946         flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
1947     } while (++i < words);
1948 
1949     return flags;
1950 }
1951 
1952 /*
1953  * Copy Zn into Zd, zeroing the inactive elements.
1954  * If inv, zero the active elements instead.
1955  */
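/*
 * For example, for .B elements with pg = 0x05, expand_pred_b(0x05) is
 * 0x0000000000ff00ff, so bytes 0 and 2 of that 64-bit chunk are kept;
 * with inv set the mask is complemented and those bytes are zeroed instead.
 */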
1956 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1957 {
1958     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1959     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1960     uint64_t *d = vd, *n = vn;
1961     uint8_t *pg = vg;
1962 
1963     for (i = 0; i < opr_sz; i += 1) {
1964         d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
1965     }
1966 }
1967 
1968 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1969 {
1970     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1971     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1972     uint64_t *d = vd, *n = vn;
1973     uint8_t *pg = vg;
1974 
1975     for (i = 0; i < opr_sz; i += 1) {
1976         d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
1977     }
1978 }
1979 
1980 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1981 {
1982     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1983     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1984     uint64_t *d = vd, *n = vn;
1985     uint8_t *pg = vg;
1986 
1987     for (i = 0; i < opr_sz; i += 1) {
1988         d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
1989     }
1990 }
1991 
1992 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1993 {
1994     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1995     uint64_t *d = vd, *n = vn;
1996     uint8_t *pg = vg;
1997     uint8_t inv = simd_data(desc);
1998 
1999     for (i = 0; i < opr_sz; i += 1) {
2000         d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
2001     }
2002 }
2003 
2004 /* Three-operand expander, immediate operand, controlled by a predicate.
2005  */
2006 #define DO_ZPZI(NAME, TYPE, H, OP)                              \
2007 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
2008 {                                                               \
2009     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2010     TYPE imm = simd_data(desc);                                 \
2011     for (i = 0; i < opr_sz; ) {                                 \
2012         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
2013         do {                                                    \
2014             if (pg & 1) {                                       \
2015                 TYPE nn = *(TYPE *)(vn + H(i));                 \
2016                 *(TYPE *)(vd + H(i)) = OP(nn, imm);             \
2017             }                                                   \
2018             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
2019         } while (i & 15);                                       \
2020     }                                                           \
2021 }
2022 
2023 /* Similarly, specialized for 64-bit operands.  */
2024 #define DO_ZPZI_D(NAME, TYPE, OP)                               \
2025 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
2026 {                                                               \
2027     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
2028     TYPE *d = vd, *n = vn;                                      \
2029     TYPE imm = simd_data(desc);                                 \
2030     uint8_t *pg = vg;                                           \
2031     for (i = 0; i < opr_sz; i += 1) {                           \
2032         if (pg[H1(i)] & 1) {                                    \
2033             TYPE nn = n[i];                                     \
2034             d[i] = OP(nn, imm);                                 \
2035         }                                                       \
2036     }                                                           \
2037 }
2038 
2039 #define DO_SHR(N, M)  (N >> M)
2040 #define DO_SHL(N, M)  (N << M)
2041 
2042 /* Arithmetic shift right for division.  This rounds negative numbers
2043    toward zero as per signed division.  Therefore before shifting,
2044    when N is negative, add 2**M-1.  */
2045 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
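/*
 * For example, with N = -7 and M = 2, a plain arithmetic shift gives -2
 * (rounding toward minus infinity), whereas (-7 + 3) >> 2 = -1, matching
 * the C signed division -7 / 4.
 */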
2046 
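/*
 * Rounding (half-up) right shifts: the last bit shifted out is added back.
 * For example, do_urshr(23, 3) = (23 >> 3) + ((23 >> 2) & 1) = 2 + 1 = 3,
 * i.e. 23/8 = 2.875 rounded to 3.  do_srshr is the signed equivalent.
 */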
2047 static inline uint64_t do_urshr(uint64_t x, unsigned sh)
2048 {
2049     if (likely(sh < 64)) {
2050         return (x >> sh) + ((x >> (sh - 1)) & 1);
2051     } else if (sh == 64) {
2052         return x >> 63;
2053     } else {
2054         return 0;
2055     }
2056 }
2057 
2058 static inline int64_t do_srshr(int64_t x, unsigned sh)
2059 {
2060     if (likely(sh < 64)) {
2061         return (x >> sh) + ((x >> (sh - 1)) & 1);
2062     } else {
2063         /* Rounding the sign bit always produces 0. */
2064         return 0;
2065     }
2066 }
2067 
2068 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
2069 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
2070 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
2071 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
2072 
2073 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
2074 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
2075 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
2076 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
2077 
2078 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
2079 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
2080 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
2081 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
2082 
2083 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
2084 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
2085 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
2086 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
2087 
2088 /* SVE2 bitwise shift by immediate */
2089 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
2090 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
2091 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
2092 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)
2093 
2094 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
2095 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
2096 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
2097 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)
2098 
2099 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
2100 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
2101 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
2102 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)
2103 
2104 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
2105 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
2106 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
2107 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)
2108 
2109 #define do_suqrshl_b(n, m) \
2110    ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
2111 #define do_suqrshl_h(n, m) \
2112    ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
2113 #define do_suqrshl_s(n, m) \
2114    ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
2115 #define do_suqrshl_d(n, m) \
2116    ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })
2117 
2118 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
2119 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
2120 DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
2121 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)
2122 
2123 #undef DO_ASRD
2124 #undef DO_ZPZI
2125 #undef DO_ZPZI_D
2126 
2127 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
2128 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
2129 {                                                            \
2130     intptr_t i, opr_sz = simd_oprsz(desc);                   \
2131     int shift = simd_data(desc);                             \
2132     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {            \
2133         TYPEW nn = *(TYPEW *)(vn + i);                       \
2134         *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift);           \
2135     }                                                        \
2136 }
2137 
2138 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP)                  \
2139 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)              \
2140 {                                                                 \
2141     intptr_t i, opr_sz = simd_oprsz(desc);                        \
2142     int shift = simd_data(desc);                                  \
2143     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                 \
2144         TYPEW nn = *(TYPEW *)(vn + HW(i));                        \
2145         *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift);   \
2146     }                                                             \
2147 }
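/*
 * The *NB forms store the narrowed result as a whole TYPEW: the (TYPEN)
 * cast zero-extends, so the odd (top) narrow element of each pair is
 * cleared.  The *NT forms store only the odd narrow element, preserving
 * the even (bottom) half already present in the destination.
 */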
2148 
2149 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
2150 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
2151 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
2152 
2153 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
2154 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
2155 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)
2156 
2157 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
2158 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
2159 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
2160 
2161 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
2162 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
2163 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)
2164 
2165 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
2166 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
2167 #define DO_SQSHRUN_D(x, sh) \
2168     do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
2169 
2170 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
2171 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
2172 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
2173 
2174 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
2175 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
2176 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)
2177 
2178 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
2179 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
2180 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
2181 
2182 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
2183 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
2184 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
2185 
2186 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
2187 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
2188 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)
2189 
2190 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
2191 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
2192 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
2193 
2194 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
2195 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
2196 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
2197 
2198 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
2199 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
2200 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)
2201 
2202 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
2203 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
2204 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
2205 
2206 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
2207 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
2208 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
2209 
2210 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
2211 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
2212 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)
2213 
2214 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2215 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2216 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2217 
2218 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
2219 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
2220 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
2221 
2222 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
2223 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
2224 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)
2225 
2226 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2227 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2228 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2229 
2230 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
2231 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
2232 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
2233 
2234 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
2235 DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
2236 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)
2237 
2238 #undef DO_SHRNB
2239 #undef DO_SHRNT
2240 
2241 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP)                           \
2242 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
2243 {                                                                           \
2244     intptr_t i, opr_sz = simd_oprsz(desc);                                  \
2245     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                           \
2246         TYPEW nn = *(TYPEW *)(vn + i);                                      \
2247         TYPEW mm = *(TYPEW *)(vm + i);                                      \
2248         *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT);                      \
2249     }                                                                       \
2250 }
2251 
2252 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP)                   \
2253 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
2254 {                                                                           \
2255     intptr_t i, opr_sz = simd_oprsz(desc);                                  \
2256     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                           \
2257         TYPEW nn = *(TYPEW *)(vn + HW(i));                                  \
2258         TYPEW mm = *(TYPEW *)(vm + HW(i));                                  \
2259         *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT);         \
2260     }                                                                       \
2261 }
2262 
2263 #define DO_ADDHN(N, M, SH)  ((N + M) >> SH)
2264 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
2265 #define DO_SUBHN(N, M, SH)  ((N - M) >> SH)
2266 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
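/*
 * Narrowing high-half ops: the sum or difference is shifted right by half
 * the wide element width, and the R forms add 1 << (SH - 1) first, i.e.
 * round to nearest.  For example, for .H elements 0x1234 + 0x1070 = 0x22a4,
 * so ADDHNB yields 0x22 while RADDHNB yields (0x22a4 + 0x80) >> 8 = 0x23.
 */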
2267 
2268 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
2269 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
2270 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
2271 
2272 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
2273 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
2274 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)
2275 
2276 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
2277 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
2278 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
2279 
2280 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
2281 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
2282 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)
2283 
2284 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
2285 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
2286 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
2287 
2288 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
2289 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
2290 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)
2291 
2292 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
2293 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
2294 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
2295 
2296 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
2297 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
2298 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
2299 
2300 #undef DO_RSUBHN
2301 #undef DO_SUBHN
2302 #undef DO_RADDHN
2303 #undef DO_ADDHN
2304 
2305 #undef DO_BINOPNB
#undef DO_BINOPNT
2306 
2307 /* Fully general four-operand expander, controlled by a predicate.
2308  */
2309 #define DO_ZPZZZ(NAME, TYPE, H, OP)                           \
2310 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
2311                   void *vg, uint32_t desc)                    \
2312 {                                                             \
2313     intptr_t i, opr_sz = simd_oprsz(desc);                    \
2314     for (i = 0; i < opr_sz; ) {                               \
2315         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));       \
2316         do {                                                  \
2317             if (pg & 1) {                                     \
2318                 TYPE nn = *(TYPE *)(vn + H(i));               \
2319                 TYPE mm = *(TYPE *)(vm + H(i));               \
2320                 TYPE aa = *(TYPE *)(va + H(i));               \
2321                 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm);        \
2322             }                                                 \
2323             i += sizeof(TYPE), pg >>= sizeof(TYPE);           \
2324         } while (i & 15);                                     \
2325     }                                                         \
2326 }
2327 
2328 /* Similarly, specialized for 64-bit operands.  */
2329 #define DO_ZPZZZ_D(NAME, TYPE, OP)                            \
2330 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
2331                   void *vg, uint32_t desc)                    \
2332 {                                                             \
2333     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                \
2334     TYPE *d = vd, *a = va, *n = vn, *m = vm;                  \
2335     uint8_t *pg = vg;                                         \
2336     for (i = 0; i < opr_sz; i += 1) {                         \
2337         if (pg[H1(i)] & 1) {                                  \
2338             TYPE aa = a[i], nn = n[i], mm = m[i];             \
2339             d[i] = OP(aa, nn, mm);                            \
2340         }                                                     \
2341     }                                                         \
2342 }
2343 
2344 #define DO_MLA(A, N, M)  (A + N * M)
2345 #define DO_MLS(A, N, M)  (A - N * M)
2346 
2347 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
2348 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
2349 
2350 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
2351 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
2352 
2353 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
2354 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
2355 
2356 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
2357 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
2358 
2359 #undef DO_MLA
2360 #undef DO_MLS
2361 #undef DO_ZPZZZ
2362 #undef DO_ZPZZZ_D
2363 
2364 void HELPER(sve_index_b)(void *vd, uint32_t start,
2365                          uint32_t incr, uint32_t desc)
2366 {
2367     intptr_t i, opr_sz = simd_oprsz(desc);
2368     uint8_t *d = vd;
2369     for (i = 0; i < opr_sz; i += 1) {
2370         d[H1(i)] = start + i * incr;
2371     }
2372 }
2373 
2374 void HELPER(sve_index_h)(void *vd, uint32_t start,
2375                          uint32_t incr, uint32_t desc)
2376 {
2377     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2378     uint16_t *d = vd;
2379     for (i = 0; i < opr_sz; i += 1) {
2380         d[H2(i)] = start + i * incr;
2381     }
2382 }
2383 
2384 void HELPER(sve_index_s)(void *vd, uint32_t start,
2385                          uint32_t incr, uint32_t desc)
2386 {
2387     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2388     uint32_t *d = vd;
2389     for (i = 0; i < opr_sz; i += 1) {
2390         d[H4(i)] = start + i * incr;
2391     }
2392 }
2393 
2394 void HELPER(sve_index_d)(void *vd, uint64_t start,
2395                          uint64_t incr, uint32_t desc)
2396 {
2397     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2398     uint64_t *d = vd;
2399     for (i = 0; i < opr_sz; i += 1) {
2400         d[i] = start + i * incr;
2401     }
2402 }
2403 
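/*
 * ADR helpers compute Zn + (Zm << sh).  The _p32/_p64 forms use the packed
 * 32-/64-bit elements directly; the _s32/_u32 forms operate on 64-bit
 * elements, treating the low 32 bits of each Zm element as a sign- or
 * zero-extended offset.
 */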
2404 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2405 {
2406     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2407     uint32_t sh = simd_data(desc);
2408     uint32_t *d = vd, *n = vn, *m = vm;
2409     for (i = 0; i < opr_sz; i += 1) {
2410         d[i] = n[i] + (m[i] << sh);
2411     }
2412 }
2413 
2414 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2415 {
2416     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2417     uint64_t sh = simd_data(desc);
2418     uint64_t *d = vd, *n = vn, *m = vm;
2419     for (i = 0; i < opr_sz; i += 1) {
2420         d[i] = n[i] + (m[i] << sh);
2421     }
2422 }
2423 
2424 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2425 {
2426     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2427     uint64_t sh = simd_data(desc);
2428     uint64_t *d = vd, *n = vn, *m = vm;
2429     for (i = 0; i < opr_sz; i += 1) {
2430         d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2431     }
2432 }
2433 
2434 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2435 {
2436     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2437     uint64_t sh = simd_data(desc);
2438     uint64_t *d = vd, *n = vn, *m = vm;
2439     for (i = 0; i < opr_sz; i += 1) {
2440         d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2441     }
2442 }
2443 
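/*
 * FEXPA: the low bits of each element index a table holding the fraction
 * field of 2^(idx/32) for half precision, or 2^(idx/64) for single and
 * double, and the next bits are placed directly in the exponent field.
 * For example, for fp16, idx = 0 with exp = 15 produces 0x3c00, i.e. 1.0.
 */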
2444 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
2445 {
2446     /* These constants are copied directly from the ARM pseudocode.  */
2447     static const uint16_t coeff[] = {
2448         0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2449         0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2450         0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2451         0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2452     };
2453     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2454     uint16_t *d = vd, *n = vn;
2455 
2456     for (i = 0; i < opr_sz; i++) {
2457         uint16_t nn = n[i];
2458         intptr_t idx = extract32(nn, 0, 5);
2459         uint16_t exp = extract32(nn, 5, 5);
2460         d[i] = coeff[idx] | (exp << 10);
2461     }
2462 }
2463 
2464 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
2465 {
2466     /* These constants are copied directly from the ARM pseudocode.  */
2467     static const uint32_t coeff[] = {
2468         0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2469         0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2470         0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2471         0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2472         0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2473         0x1ef532, 0x20b051, 0x227043, 0x243516,
2474         0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2475         0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2476         0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2477         0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2478         0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2479         0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2480         0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2481         0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2482         0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2483         0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2484     };
2485     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2486     uint32_t *d = vd, *n = vn;
2487 
2488     for (i = 0; i < opr_sz; i++) {
2489         uint32_t nn = n[i];
2490         intptr_t idx = extract32(nn, 0, 6);
2491         uint32_t exp = extract32(nn, 6, 8);
2492         d[i] = coeff[idx] | (exp << 23);
2493     }
2494 }
2495 
2496 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
2497 {
2498     /* These constants are copied directly from the ARM pseudocode.  */
2499     static const uint64_t coeff[] = {
2500         0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
2501         0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
2502         0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
2503         0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
2504         0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
2505         0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
2506         0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
2507         0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
2508         0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
2509         0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
2510         0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
2511         0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
2512         0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
2513         0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
2514         0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
2515         0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
2516         0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
2517         0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
2518         0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
2519         0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
2520         0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
2521         0xFA7C1819E90D8ull,
2522     };
2523     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2524     uint64_t *d = vd, *n = vn;
2525 
2526     for (i = 0; i < opr_sz; i++) {
2527         uint64_t nn = n[i];
2528         intptr_t idx = extract32(nn, 0, 6);
2529         uint64_t exp = extract32(nn, 6, 11);
2530         d[i] = coeff[idx] | (exp << 52);
2531     }
2532 }
2533 
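/*
 * FTSSEL: bit 0 of each Zm element selects between the Zn element and the
 * constant 1.0, and bit 1 of Zm flips the sign bit of the result (shifted
 * up to bit 15/31/63 for the _h/_s/_d variants respectively).
 */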
2534 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2535 {
2536     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2537     uint16_t *d = vd, *n = vn, *m = vm;
2538     for (i = 0; i < opr_sz; i += 1) {
2539         uint16_t nn = n[i];
2540         uint16_t mm = m[i];
2541         if (mm & 1) {
2542             nn = float16_one;
2543         }
2544         d[i] = nn ^ (mm & 2) << 14;
2545     }
2546 }
2547 
2548 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2549 {
2550     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2551     uint32_t *d = vd, *n = vn, *m = vm;
2552     for (i = 0; i < opr_sz; i += 1) {
2553         uint32_t nn = n[i];
2554         uint32_t mm = m[i];
2555         if (mm & 1) {
2556             nn = float32_one;
2557         }
2558         d[i] = nn ^ (mm & 2) << 30;
2559     }
2560 }
2561 
2562 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2563 {
2564     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2565     uint64_t *d = vd, *n = vn, *m = vm;
2566     for (i = 0; i < opr_sz; i += 1) {
2567         uint64_t nn = n[i];
2568         uint64_t mm = m[i];
2569         if (mm & 1) {
2570             nn = float64_one;
2571         }
2572         d[i] = nn ^ (mm & 2) << 62;
2573     }
2574 }
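
/*
 * In the three FTSSEL helpers above, bit 0 of the control element selects
 * the constant +1.0 in place of the input, and bit 1 toggles the sign bit.
 * For example, in the double-precision form mm = 3 gives
 *     float64_one ^ (2ull << 62) = 0xbff0000000000000, i.e. -1.0.
 */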
2575 
2576 /*
2577  * Signed saturating addition with scalar operand.
2578  */
2579 
2580 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2581 {
2582     intptr_t i, oprsz = simd_oprsz(desc);
2583 
2584     for (i = 0; i < oprsz; i += sizeof(int8_t)) {
2585         *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
2586     }
2587 }
2588 
2589 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2590 {
2591     intptr_t i, oprsz = simd_oprsz(desc);
2592 
2593     for (i = 0; i < oprsz; i += sizeof(int16_t)) {
2594         *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
2595     }
2596 }
2597 
2598 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2599 {
2600     intptr_t i, oprsz = simd_oprsz(desc);
2601 
2602     for (i = 0; i < oprsz; i += sizeof(int32_t)) {
2603         *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
2604     }
2605 }
2606 
2607 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2608 {
2609     intptr_t i, oprsz = simd_oprsz(desc);
2610 
2611     for (i = 0; i < oprsz; i += sizeof(int64_t)) {
2612         *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2613     }
2614 }
2615 
2616 /*
2617  * Unsigned saturating addition with scalar operand.
2618  */
2619 
2620 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2621 {
2622     intptr_t i, oprsz = simd_oprsz(desc);
2623 
2624     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
2625         *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2626     }
2627 }
2628 
2629 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2630 {
2631     intptr_t i, oprsz = simd_oprsz(desc);
2632 
2633     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
2634         *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2635     }
2636 }
2637 
2638 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2639 {
2640     intptr_t i, oprsz = simd_oprsz(desc);
2641 
2642     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
2643         *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2644     }
2645 }
2646 
2647 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2648 {
2649     intptr_t i, oprsz = simd_oprsz(desc);
2650 
2651     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2652         *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2653     }
2654 }
2655 
2656 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2657 {
2658     intptr_t i, oprsz = simd_oprsz(desc);
2659 
2660     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2661         *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2662     }
2663 }
2664 
2665 /* Two operand predicated copy immediate with merge.  All valid immediates
2666  * can fit within 17 signed bits in the simd_data field.
2667  */
2668 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2669                          uint64_t mm, uint32_t desc)
2670 {
2671     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2672     uint64_t *d = vd, *n = vn;
2673     uint8_t *pg = vg;
2674 
2675     mm = dup_const(MO_8, mm);
2676     for (i = 0; i < opr_sz; i += 1) {
2677         uint64_t nn = n[i];
2678         uint64_t pp = expand_pred_b(pg[H1(i)]);
2679         d[i] = (mm & pp) | (nn & ~pp);
2680     }
2681 }
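
/*
 * The expand_pred_* helpers used here turn one predicate bit per element
 * into an all-ones element mask, so the merge is a plain bit-select.  For
 * example, expand_pred_b(0x05) == 0x0000000000ff00ff, and
 * (mm & pp) | (nn & ~pp) then takes bytes 0 and 2 from the immediate and
 * the remaining bytes from N.
 */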
2682 
2683 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2684                          uint64_t mm, uint32_t desc)
2685 {
2686     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2687     uint64_t *d = vd, *n = vn;
2688     uint8_t *pg = vg;
2689 
2690     mm = dup_const(MO_16, mm);
2691     for (i = 0; i < opr_sz; i += 1) {
2692         uint64_t nn = n[i];
2693         uint64_t pp = expand_pred_h(pg[H1(i)]);
2694         d[i] = (mm & pp) | (nn & ~pp);
2695     }
2696 }
2697 
2698 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2699                          uint64_t mm, uint32_t desc)
2700 {
2701     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2702     uint64_t *d = vd, *n = vn;
2703     uint8_t *pg = vg;
2704 
2705     mm = dup_const(MO_32, mm);
2706     for (i = 0; i < opr_sz; i += 1) {
2707         uint64_t nn = n[i];
2708         uint64_t pp = expand_pred_s(pg[H1(i)]);
2709         d[i] = (mm & pp) | (nn & ~pp);
2710     }
2711 }
2712 
2713 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2714                          uint64_t mm, uint32_t desc)
2715 {
2716     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2717     uint64_t *d = vd, *n = vn;
2718     uint8_t *pg = vg;
2719 
2720     for (i = 0; i < opr_sz; i += 1) {
2721         uint64_t nn = n[i];
2722         d[i] = (pg[H1(i)] & 1 ? mm : nn);
2723     }
2724 }
2725 
2726 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2727 {
2728     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2729     uint64_t *d = vd;
2730     uint8_t *pg = vg;
2731 
2732     val = dup_const(MO_8, val);
2733     for (i = 0; i < opr_sz; i += 1) {
2734         d[i] = val & expand_pred_b(pg[H1(i)]);
2735     }
2736 }
2737 
2738 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2739 {
2740     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2741     uint64_t *d = vd;
2742     uint8_t *pg = vg;
2743 
2744     val = dup_const(MO_16, val);
2745     for (i = 0; i < opr_sz; i += 1) {
2746         d[i] = val & expand_pred_h(pg[H1(i)]);
2747     }
2748 }
2749 
2750 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2751 {
2752     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2753     uint64_t *d = vd;
2754     uint8_t *pg = vg;
2755 
2756     val = dup_const(MO_32, val);
2757     for (i = 0; i < opr_sz; i += 1) {
2758         d[i] = val & expand_pred_s(pg[H1(i)]);
2759     }
2760 }
2761 
2762 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2763 {
2764     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2765     uint64_t *d = vd;
2766     uint8_t *pg = vg;
2767 
2768     for (i = 0; i < opr_sz; i += 1) {
2769         d[i] = (pg[H1(i)] & 1 ? val : 0);
2770     }
2771 }
2772 
2773 /* Big-endian hosts need to frob the byte indices.  If the copy
2774  * happens to be 8-byte aligned, then no frobbing necessary.
2775  */
2776 static void swap_memmove(void *vd, void *vs, size_t n)
2777 {
2778     uintptr_t d = (uintptr_t)vd;
2779     uintptr_t s = (uintptr_t)vs;
2780     uintptr_t o = (d | s | n) & 7;
2781     size_t i;
2782 
2783 #if !HOST_BIG_ENDIAN
2784     o = 0;
2785 #endif
2786     switch (o) {
2787     case 0:
2788         memmove(vd, vs, n);
2789         break;
2790 
2791     case 4:
2792         if (d < s || d >= s + n) {
2793             for (i = 0; i < n; i += 4) {
2794                 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2795             }
2796         } else {
2797             for (i = n; i > 0; ) {
2798                 i -= 4;
2799                 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2800             }
2801         }
2802         break;
2803 
2804     case 2:
2805     case 6:
2806         if (d < s || d >= s + n) {
2807             for (i = 0; i < n; i += 2) {
2808                 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2809             }
2810         } else {
2811             for (i = n; i > 0; ) {
2812                 i -= 2;
2813                 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2814             }
2815         }
2816         break;
2817 
2818     default:
2819         if (d < s || d >= s + n) {
2820             for (i = 0; i < n; i++) {
2821                 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2822             }
2823         } else {
2824             for (i = n; i > 0; ) {
2825                 i -= 1;
2826                 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2827             }
2828         }
2829         break;
2830     }
2831 }
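
/*
 * Note that the 2- and 4-byte copy paths above go through the H1_2()/H1_4()
 * index-adjustment macros: the vector registers are stored as host-endian
 * 64-bit lanes, so on big-endian hosts a plain sub-8-byte memmove would
 * scramble the logical byte order that those macros preserve.
 */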
2832 
2833 /* Similarly for memset of 0.  */
2834 static void swap_memzero(void *vd, size_t n)
2835 {
2836     uintptr_t d = (uintptr_t)vd;
2837     uintptr_t o = (d | n) & 7;
2838     size_t i;
2839 
2840     /* Usually, the first bit of a predicate is set, so N is 0.  */
2841     if (likely(n == 0)) {
2842         return;
2843     }
2844 
2845 #if !HOST_BIG_ENDIAN
2846     o = 0;
2847 #endif
2848     switch (o) {
2849     case 0:
2850         memset(vd, 0, n);
2851         break;
2852 
2853     case 4:
2854         for (i = 0; i < n; i += 4) {
2855             *(uint32_t *)H1_4(d + i) = 0;
2856         }
2857         break;
2858 
2859     case 2:
2860     case 6:
2861         for (i = 0; i < n; i += 2) {
2862             *(uint16_t *)H1_2(d + i) = 0;
2863         }
2864         break;
2865 
2866     default:
2867         for (i = 0; i < n; i++) {
2868             *(uint8_t *)H1(d + i) = 0;
2869         }
2870         break;
2871     }
2872 }
2873 
2874 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
2875 {
2876     intptr_t opr_sz = simd_oprsz(desc);
2877     size_t n_ofs = simd_data(desc);
2878     size_t n_siz = opr_sz - n_ofs;
2879 
2880     if (vd != vm) {
2881         swap_memmove(vd, vn + n_ofs, n_siz);
2882         swap_memmove(vd + n_siz, vm, n_ofs);
2883     } else if (vd != vn) {
2884         swap_memmove(vd + n_siz, vd, n_ofs);
2885         swap_memmove(vd, vn + n_ofs, n_siz);
2886     } else {
2887         /* vd == vn == vm.  Need temp space.  */
2888         ARMVectorReg tmp;
2889         swap_memmove(&tmp, vm, n_ofs);
2890         swap_memmove(vd, vd + n_ofs, n_siz);
2891         memcpy(vd + n_siz, &tmp, n_ofs);
2892     }
2893 }
2894 
2895 #define DO_INSR(NAME, TYPE, H) \
2896 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2897 {                                                                  \
2898     intptr_t opr_sz = simd_oprsz(desc);                            \
2899     swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE));    \
2900     *(TYPE *)(vd + H(0)) = val;                                    \
2901 }
2902 
2903 DO_INSR(sve_insr_b, uint8_t, H1)
2904 DO_INSR(sve_insr_h, uint16_t, H1_2)
2905 DO_INSR(sve_insr_s, uint32_t, H1_4)
2906 DO_INSR(sve_insr_d, uint64_t, H1_8)
2907 
2908 #undef DO_INSR
2909 
2910 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2911 {
2912     intptr_t i, j, opr_sz = simd_oprsz(desc);
2913     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2914         uint64_t f = *(uint64_t *)(vn + i);
2915         uint64_t b = *(uint64_t *)(vn + j);
2916         *(uint64_t *)(vd + i) = bswap64(b);
2917         *(uint64_t *)(vd + j) = bswap64(f);
2918     }
2919 }
2920 
2921 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
2922 {
2923     intptr_t i, j, opr_sz = simd_oprsz(desc);
2924     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2925         uint64_t f = *(uint64_t *)(vn + i);
2926         uint64_t b = *(uint64_t *)(vn + j);
2927         *(uint64_t *)(vd + i) = hswap64(b);
2928         *(uint64_t *)(vd + j) = hswap64(f);
2929     }
2930 }
2931 
2932 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
2933 {
2934     intptr_t i, j, opr_sz = simd_oprsz(desc);
2935     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2936         uint64_t f = *(uint64_t *)(vn + i);
2937         uint64_t b = *(uint64_t *)(vn + j);
2938         *(uint64_t *)(vd + i) = rol64(b, 32);
2939         *(uint64_t *)(vd + j) = rol64(f, 32);
2940     }
2941 }
2942 
2943 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
2944 {
2945     intptr_t i, j, opr_sz = simd_oprsz(desc);
2946     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2947         uint64_t f = *(uint64_t *)(vn + i);
2948         uint64_t b = *(uint64_t *)(vn + j);
2949         *(uint64_t *)(vd + i) = b;
2950         *(uint64_t *)(vd + j) = f;
2951     }
2952 }
2953 
2954 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
2955 
2956 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
2957                            bool is_tbx, tb_impl_fn *fn)
2958 {
2959     ARMVectorReg scratch;
2960     uintptr_t oprsz = simd_oprsz(desc);
2961 
2962     if (unlikely(vd == vn)) {
2963         vn = memcpy(&scratch, vn, oprsz);
2964     }
2965 
2966     fn(vd, vn, NULL, vm, oprsz, is_tbx);
2967 }
2968 
2969 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
2970                            uint32_t desc, bool is_tbx, tb_impl_fn *fn)
2971 {
2972     ARMVectorReg scratch;
2973     uintptr_t oprsz = simd_oprsz(desc);
2974 
2975     if (unlikely(vd == vn0)) {
2976         vn0 = memcpy(&scratch, vn0, oprsz);
2977         if (vd == vn1) {
2978             vn1 = vn0;
2979         }
2980     } else if (unlikely(vd == vn1)) {
2981         vn1 = memcpy(&scratch, vn1, oprsz);
2982     }
2983 
2984     fn(vd, vn0, vn1, vm, oprsz, is_tbx);
2985 }
2986 
2987 #define DO_TB(SUFF, TYPE, H)                                            \
2988 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1,         \
2989                                 void *vm, uintptr_t oprsz, bool is_tbx) \
2990 {                                                                       \
2991     TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm;              \
2992     uintptr_t i, nelem = oprsz / sizeof(TYPE);                          \
2993     for (i = 0; i < nelem; ++i) {                                       \
2994         TYPE index = indexes[H(i)], val = 0;                            \
2995         if (index < nelem) {                                            \
2996             val = tbl0[H(index)];                                       \
2997         } else {                                                        \
2998             index -= nelem;                                             \
2999             if (tbl1 && index < nelem) {                                \
3000                 val = tbl1[H(index)];                                   \
3001             } else if (is_tbx) {                                        \
3002                 continue;                                               \
3003             }                                                           \
3004         }                                                               \
3005         d[H(i)] = val;                                                  \
3006     }                                                                   \
3007 }                                                                       \
3008 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3009 {                                                                       \
3010     do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF);                     \
3011 }                                                                       \
3012 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1,            \
3013                              void *vm, uint32_t desc)                   \
3014 {                                                                       \
3015     do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF);               \
3016 }                                                                       \
3017 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3018 {                                                                       \
3019     do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF);                      \
3020 }
3021 
3022 DO_TB(b, uint8_t, H1)
3023 DO_TB(h, uint16_t, H2)
3024 DO_TB(s, uint32_t, H4)
3025 DO_TB(d, uint64_t, H8)
3026 
3027 #undef DO_TB
3028 
3029 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
3030 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
3031 {                                                              \
3032     intptr_t i, opr_sz = simd_oprsz(desc);                     \
3033     TYPED *d = vd;                                             \
3034     TYPES *n = vn;                                             \
3035     ARMVectorReg tmp;                                          \
3036     if (unlikely(vn - vd < opr_sz)) {                          \
3037         n = memcpy(&tmp, n, opr_sz / 2);                       \
3038     }                                                          \
3039     for (i = 0; i < opr_sz / sizeof(TYPED); i++) {             \
3040         d[HD(i)] = n[HS(i)];                                   \
3041     }                                                          \
3042 }
3043 
3044 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
3045 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
3046 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)
3047 
3048 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
3049 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
3050 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)
3051 
3052 #undef DO_UNPK
3053 
3054 /* Mask of bits included in the even numbered predicates of width esz.
3055  * We also use this for expand_bits/compress_bits, and so extend the
3056  * same pattern out to 16-bit units.
3057  */
3058 static const uint64_t even_bit_esz_masks[5] = {
3059     0x5555555555555555ull,
3060     0x3333333333333333ull,
3061     0x0f0f0f0f0f0f0f0full,
3062     0x00ff00ff00ff00ffull,
3063     0x0000ffff0000ffffull,
3064 };
3065 
3066 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
3067  * For N==0, this corresponds to the operation that in qemu/bitops.h
3068  * we call half_shuffle64; this algorithm is from Hacker's Delight,
3069  * section 7-2 Shuffling Bits.
3070  */
3071 static uint64_t expand_bits(uint64_t x, int n)
3072 {
3073     int i;
3074 
3075     x &= 0xffffffffu;
3076     for (i = 4; i >= n; i--) {
3077         int sh = 1 << i;
3078         x = ((x << sh) | x) & even_bit_esz_masks[i];
3079     }
3080     return x;
3081 }
3082 
3083 /* Compress units of 2**(N+1) bits to units of 2**N bits.
3084  * For N==0, this corresponds to the operation that in qemu/bitops.h
3085  * we call half_unshuffle64; this algorithm is from Hacker's Delight,
3086  * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
3087  */
3088 static uint64_t compress_bits(uint64_t x, int n)
3089 {
3090     int i;
3091 
3092     for (i = n; i <= 4; i++) {
3093         int sh = 1 << i;
3094         x &= even_bit_esz_masks[i];
3095         x = (x >> sh) | x;
3096     }
3097     return x & 0xffffffffu;
3098 }
3099 
3100 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3101 {
3102     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3103     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3104     intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3105     int esize = 1 << esz;
3106     uint64_t *d = vd;
3107     intptr_t i;
3108 
3109     if (oprsz <= 8) {
3110         uint64_t nn = *(uint64_t *)vn;
3111         uint64_t mm = *(uint64_t *)vm;
3112         int half = 4 * oprsz;
3113 
3114         nn = extract64(nn, high * half, half);
3115         mm = extract64(mm, high * half, half);
3116         nn = expand_bits(nn, esz);
3117         mm = expand_bits(mm, esz);
3118         d[0] = nn | (mm << esize);
3119     } else {
3120         ARMPredicateReg tmp;
3121 
3122         /* We produce output faster than we consume input.
3123            Therefore we must be mindful of possible overlap.  */
3124         if (vd == vn) {
3125             vn = memcpy(&tmp, vn, oprsz);
3126             if (vd == vm) {
3127                 vm = vn;
3128             }
3129         } else if (vd == vm) {
3130             vm = memcpy(&tmp, vm, oprsz);
3131         }
3132         if (high) {
3133             high = oprsz >> 1;
3134         }
3135 
3136         if ((oprsz & 7) == 0) {
3137             uint32_t *n = vn, *m = vm;
3138             high >>= 2;
3139 
3140             for (i = 0; i < oprsz / 8; i++) {
3141                 uint64_t nn = n[H4(high + i)];
3142                 uint64_t mm = m[H4(high + i)];
3143 
3144                 nn = expand_bits(nn, esz);
3145                 mm = expand_bits(mm, esz);
3146                 d[i] = nn | (mm << esize);
3147             }
3148         } else {
3149             uint8_t *n = vn, *m = vm;
3150             uint16_t *d16 = vd;
3151 
3152             for (i = 0; i < oprsz / 2; i++) {
3153                 uint16_t nn = n[H1(high + i)];
3154                 uint16_t mm = m[H1(high + i)];
3155 
3156                 nn = expand_bits(nn, esz);
3157                 mm = expand_bits(mm, esz);
3158                 d16[H2(i)] = nn | (mm << esize);
3159             }
3160         }
3161     }
3162 }
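
/*
 * A small example of the short path above: with oprsz == 1, esz == 0 and
 * high == 0, nn = 0b0011 and mm = 0b0101 expand to 0x05 and 0x11, and
 * d[0] = 0x05 | (0x11 << 1) = 0x27 = 0b00100111, i.e. the predicate bits
 * of N and M interleaved as n0 m0 n1 m1 n2 m2 n3 m3.
 */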
3163 
3164 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3165 {
3166     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3167     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3168     int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
3169     uint64_t *d = vd, *n = vn, *m = vm;
3170     uint64_t l, h;
3171     intptr_t i;
3172 
3173     if (oprsz <= 8) {
3174         l = compress_bits(n[0] >> odd, esz);
3175         h = compress_bits(m[0] >> odd, esz);
3176         d[0] = l | (h << (4 * oprsz));
3177     } else {
3178         ARMPredicateReg tmp_m;
3179         intptr_t oprsz_16 = oprsz / 16;
3180 
3181         if ((vm - vd) < (uintptr_t)oprsz) {
3182             m = memcpy(&tmp_m, vm, oprsz);
3183         }
3184 
3185         for (i = 0; i < oprsz_16; i++) {
3186             l = n[2 * i + 0];
3187             h = n[2 * i + 1];
3188             l = compress_bits(l >> odd, esz);
3189             h = compress_bits(h >> odd, esz);
3190             d[i] = l | (h << 32);
3191         }
3192 
3193         /*
3194          * For VL which is not a multiple of 512, the results from M do not
3195          * align nicely with the uint64_t for D.  Put the aligned results
3196          * from M into TMP_M and then copy it into place afterward.
3197          */
3198         if (oprsz & 15) {
3199             int final_shift = (oprsz & 15) * 2;
3200 
3201             l = n[2 * i + 0];
3202             h = n[2 * i + 1];
3203             l = compress_bits(l >> odd, esz);
3204             h = compress_bits(h >> odd, esz);
3205             d[i] = l | (h << final_shift);
3206 
3207             for (i = 0; i < oprsz_16; i++) {
3208                 l = m[2 * i + 0];
3209                 h = m[2 * i + 1];
3210                 l = compress_bits(l >> odd, esz);
3211                 h = compress_bits(h >> odd, esz);
3212                 tmp_m.p[i] = l | (h << 32);
3213             }
3214             l = m[2 * i + 0];
3215             h = m[2 * i + 1];
3216             l = compress_bits(l >> odd, esz);
3217             h = compress_bits(h >> odd, esz);
3218             tmp_m.p[i] = l | (h << final_shift);
3219 
3220             swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
3221         } else {
3222             for (i = 0; i < oprsz_16; i++) {
3223                 l = m[2 * i + 0];
3224                 h = m[2 * i + 1];
3225                 l = compress_bits(l >> odd, esz);
3226                 h = compress_bits(h >> odd, esz);
3227                 d[oprsz_16 + i] = l | (h << 32);
3228             }
3229         }
3230     }
3231 }
3232 
3233 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3234 {
3235     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3236     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3237     int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
3238     uint64_t *d = vd, *n = vn, *m = vm;
3239     uint64_t mask;
3240     int shr, shl;
3241     intptr_t i;
3242 
3243     shl = 1 << esz;
3244     shr = 0;
3245     mask = even_bit_esz_masks[esz];
3246     if (odd) {
3247         mask <<= shl;
3248         shr = shl;
3249         shl = 0;
3250     }
3251 
3252     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
3253         uint64_t nn = (n[i] & mask) >> shr;
3254         uint64_t mm = (m[i] & mask) << shl;
3255         d[i] = nn + mm;
3256     }
3257 }
3258 
3259 /* Reverse units of 2**N bits.  */
3260 static uint64_t reverse_bits_64(uint64_t x, int n)
3261 {
3262     int i, sh;
3263 
3264     x = bswap64(x);
3265     for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3266         uint64_t mask = even_bit_esz_masks[i];
3267         x = ((x & mask) << sh) | ((x >> sh) & mask);
3268     }
3269     return x;
3270 }
3271 
3272 static uint8_t reverse_bits_8(uint8_t x, int n)
3273 {
3274     static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
3275     int i, sh;
3276 
3277     for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3278         x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
3279     }
3280     return x;
3281 }
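
/*
 * These helpers reverse the order of the 2**N-bit groups while leaving the
 * bits inside each group in place.  For example,
 * reverse_bits_8(0b00000011, 1) == 0b11000000: the four 2-bit groups are
 * reversed as units.
 */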
3282 
3283 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
3284 {
3285     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3286     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3287     intptr_t i, oprsz_2 = oprsz / 2;
3288 
3289     if (oprsz <= 8) {
3290         uint64_t l = *(uint64_t *)vn;
3291         l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
3292         *(uint64_t *)vd = l;
3293     } else if ((oprsz & 15) == 0) {
3294         for (i = 0; i < oprsz_2; i += 8) {
3295             intptr_t ih = oprsz - 8 - i;
3296             uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
3297             uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
3298             *(uint64_t *)(vd + i) = h;
3299             *(uint64_t *)(vd + ih) = l;
3300         }
3301     } else {
3302         for (i = 0; i < oprsz_2; i += 1) {
3303             intptr_t il = H1(i);
3304             intptr_t ih = H1(oprsz - 1 - i);
3305             uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
3306             uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
3307             *(uint8_t *)(vd + il) = h;
3308             *(uint8_t *)(vd + ih) = l;
3309         }
3310     }
3311 }
3312 
3313 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
3314 {
3315     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3316     intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3317     uint64_t *d = vd;
3318     intptr_t i;
3319 
3320     if (oprsz <= 8) {
3321         uint64_t nn = *(uint64_t *)vn;
3322         int half = 4 * oprsz;
3323 
3324         nn = extract64(nn, high * half, half);
3325         nn = expand_bits(nn, 0);
3326         d[0] = nn;
3327     } else {
3328         ARMPredicateReg tmp_n;
3329 
3330         /* We produce output faster than we consume input.
3331            Therefore we must be mindful of possible overlap.  */
3332         if ((vn - vd) < (uintptr_t)oprsz) {
3333             vn = memcpy(&tmp_n, vn, oprsz);
3334         }
3335         if (high) {
3336             high = oprsz >> 1;
3337         }
3338 
3339         if ((oprsz & 7) == 0) {
3340             uint32_t *n = vn;
3341             high >>= 2;
3342 
3343             for (i = 0; i < oprsz / 8; i++) {
3344                 uint64_t nn = n[H4(high + i)];
3345                 d[i] = expand_bits(nn, 0);
3346             }
3347         } else {
3348             uint16_t *d16 = vd;
3349             uint8_t *n = vn;
3350 
3351             for (i = 0; i < oprsz / 2; i++) {
3352                 uint16_t nn = n[H1(high + i)];
3353                 d16[H2(i)] = expand_bits(nn, 0);
3354             }
3355         }
3356     }
3357 }
3358 
3359 #define DO_ZIP(NAME, TYPE, H) \
3360 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
3361 {                                                                    \
3362     intptr_t oprsz = simd_oprsz(desc);                               \
3363     intptr_t odd_ofs = simd_data(desc);                              \
3364     intptr_t i, oprsz_2 = oprsz / 2;                                 \
3365     ARMVectorReg tmp_n, tmp_m;                                       \
3366     /* We produce output faster than we consume input.               \
3367        Therefore we must be mindful of possible overlap.  */         \
3368     if (unlikely((vn - vd) < (uintptr_t)oprsz)) {                    \
3369         vn = memcpy(&tmp_n, vn, oprsz);                              \
3370     }                                                                \
3371     if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                    \
3372         vm = memcpy(&tmp_m, vm, oprsz);                              \
3373     }                                                                \
3374     for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                    \
3375         *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
3376         *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) =                    \
3377             *(TYPE *)(vm + odd_ofs + H(i));                          \
3378     }                                                                \
3379     if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                \
3380         memset(vd + oprsz - 16, 0, 16);                              \
3381     }                                                                \
3382 }
3383 
3384 DO_ZIP(sve_zip_b, uint8_t, H1)
3385 DO_ZIP(sve_zip_h, uint16_t, H1_2)
3386 DO_ZIP(sve_zip_s, uint32_t, H1_4)
3387 DO_ZIP(sve_zip_d, uint64_t, H1_8)
3388 DO_ZIP(sve2_zip_q, Int128, )
3389 
3390 #define DO_UZP(NAME, TYPE, H) \
3391 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
3392 {                                                                      \
3393     intptr_t oprsz = simd_oprsz(desc);                                 \
3394     intptr_t odd_ofs = simd_data(desc);                                \
3395     intptr_t i, p;                                                     \
3396     ARMVectorReg tmp_m;                                                \
3397     if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                      \
3398         vm = memcpy(&tmp_m, vm, oprsz);                                \
3399     }                                                                  \
3400     i = 0, p = odd_ofs;                                                \
3401     do {                                                               \
3402         *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p));                   \
3403         i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
3404     } while (p < oprsz);                                               \
3405     p -= oprsz;                                                        \
3406     do {                                                               \
3407         *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p));                   \
3408         i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
3409     } while (p < oprsz);                                               \
3410     tcg_debug_assert(i == oprsz);                                      \
3411 }
3412 
3413 DO_UZP(sve_uzp_b, uint8_t, H1)
3414 DO_UZP(sve_uzp_h, uint16_t, H1_2)
3415 DO_UZP(sve_uzp_s, uint32_t, H1_4)
3416 DO_UZP(sve_uzp_d, uint64_t, H1_8)
3417 DO_UZP(sve2_uzp_q, Int128, )
3418 
3419 #define DO_TRN(NAME, TYPE, H) \
3420 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
3421 {                                                                      \
3422     intptr_t oprsz = simd_oprsz(desc);                                 \
3423     intptr_t odd_ofs = simd_data(desc);                                \
3424     intptr_t i;                                                        \
3425     for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) {                    \
3426         TYPE ae = *(TYPE *)(vn + H(i + odd_ofs));                      \
3427         TYPE be = *(TYPE *)(vm + H(i + odd_ofs));                      \
3428         *(TYPE *)(vd + H(i + 0)) = ae;                                 \
3429         *(TYPE *)(vd + H(i + sizeof(TYPE))) = be;                      \
3430     }                                                                  \
3431     if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                  \
3432         memset(vd + oprsz - 16, 0, 16);                                \
3433     }                                                                  \
3434 }
3435 
3436 DO_TRN(sve_trn_b, uint8_t, H1)
3437 DO_TRN(sve_trn_h, uint16_t, H1_2)
3438 DO_TRN(sve_trn_s, uint32_t, H1_4)
3439 DO_TRN(sve_trn_d, uint64_t, H1_8)
3440 DO_TRN(sve2_trn_q, Int128, )
3441 
3442 #undef DO_ZIP
3443 #undef DO_UZP
3444 #undef DO_TRN
3445 
3446 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3447 {
3448     intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3449     uint32_t *d = vd, *n = vn;
3450     uint8_t *pg = vg;
3451 
3452     for (i = j = 0; i < opr_sz; i++) {
3453         if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3454             d[H4(j)] = n[H4(i)];
3455             j++;
3456         }
3457     }
3458     for (; j < opr_sz; j++) {
3459         d[H4(j)] = 0;
3460     }
3461 }
3462 
3463 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3464 {
3465     intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3466     uint64_t *d = vd, *n = vn;
3467     uint8_t *pg = vg;
3468 
3469     for (i = j = 0; i < opr_sz; i++) {
3470         if (pg[H1(i)] & 1) {
3471             d[j] = n[i];
3472             j++;
3473         }
3474     }
3475     for (; j < opr_sz; j++) {
3476         d[j] = 0;
3477     }
3478 }
3479 
3480 /* Similar to the ARM LastActiveElement pseudocode function, except the
3481  * result is multiplied by the element size.  This includes the not found
3482  * indication; e.g. not found for esz=3 is -8.
3483  */
3484 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3485 {
3486     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3487     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3488 
3489     return last_active_element(vg, words, esz);
3490 }
3491 
3492 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
3493 {
3494     intptr_t opr_sz = simd_oprsz(desc) / 8;
3495     int esz = simd_data(desc);
3496     uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
3497     intptr_t i, first_i, last_i;
3498     ARMVectorReg tmp;
3499 
3500     first_i = last_i = 0;
3501     first_g = last_g = 0;
3502 
3503     /* Find the extent of the active elements within VG.  */
3504     for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
3505         pg = *(uint64_t *)(vg + i) & mask;
3506         if (pg) {
3507             if (last_g == 0) {
3508                 last_g = pg;
3509                 last_i = i;
3510             }
3511             first_g = pg;
3512             first_i = i;
3513         }
3514     }
3515 
3516     len = 0;
3517     if (first_g != 0) {
3518         first_i = first_i * 8 + ctz64(first_g);
3519         last_i = last_i * 8 + 63 - clz64(last_g);
3520         len = last_i - first_i + (1 << esz);
3521         if (vd == vm) {
3522             vm = memcpy(&tmp, vm, opr_sz * 8);
3523         }
3524         swap_memmove(vd, vn + first_i, len);
3525     }
3526     swap_memmove(vd + len, vm, opr_sz * 8 - len);
3527 }
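
/*
 * SPLICE above copies the portion of N running from the first active
 * element through the last active element under G to the low end of D,
 * and fills whatever remains with elements taken from the start of M.
 */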
3528 
3529 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3530                             void *vg, uint32_t desc)
3531 {
3532     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3533     uint64_t *d = vd, *n = vn, *m = vm;
3534     uint8_t *pg = vg;
3535 
3536     for (i = 0; i < opr_sz; i += 1) {
3537         uint64_t nn = n[i], mm = m[i];
3538         uint64_t pp = expand_pred_b(pg[H1(i)]);
3539         d[i] = (nn & pp) | (mm & ~pp);
3540     }
3541 }
3542 
3543 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3544                             void *vg, uint32_t desc)
3545 {
3546     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3547     uint64_t *d = vd, *n = vn, *m = vm;
3548     uint8_t *pg = vg;
3549 
3550     for (i = 0; i < opr_sz; i += 1) {
3551         uint64_t nn = n[i], mm = m[i];
3552         uint64_t pp = expand_pred_h(pg[H1(i)]);
3553         d[i] = (nn & pp) | (mm & ~pp);
3554     }
3555 }
3556 
3557 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3558                             void *vg, uint32_t desc)
3559 {
3560     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3561     uint64_t *d = vd, *n = vn, *m = vm;
3562     uint8_t *pg = vg;
3563 
3564     for (i = 0; i < opr_sz; i += 1) {
3565         uint64_t nn = n[i], mm = m[i];
3566         uint64_t pp = expand_pred_s(pg[H1(i)]);
3567         d[i] = (nn & pp) | (mm & ~pp);
3568     }
3569 }
3570 
3571 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3572                             void *vg, uint32_t desc)
3573 {
3574     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3575     uint64_t *d = vd, *n = vn, *m = vm;
3576     uint8_t *pg = vg;
3577 
3578     for (i = 0; i < opr_sz; i += 1) {
3579         uint64_t nn = n[i], mm = m[i];
3580         d[i] = (pg[H1(i)] & 1 ? nn : mm);
3581     }
3582 }
3583 
3584 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm,
3585                             void *vg, uint32_t desc)
3586 {
3587     intptr_t i, opr_sz = simd_oprsz(desc) / 16;
3588     Int128 *d = vd, *n = vn, *m = vm;
3589     uint16_t *pg = vg;
3590 
3591     for (i = 0; i < opr_sz; i += 1) {
3592         d[i] = (pg[H2(i)] & 1 ? n : m)[i];
3593     }
3594 }
3595 
3596 /* Two operand comparison controlled by a predicate.
3597  * ??? It is very tempting to want to be able to expand this inline
3598  * with x86 instructions, e.g.
3599  *
3600  *    vcmpeqw    zm, zn, %ymm0
3601  *    vpmovmskb  %ymm0, %eax
3602  *    and        $0x5555, %eax
3603  *    and        pg, %eax
3604  *
3605  * or even aarch64, e.g.
3606  *
3607  *    // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3608  *    cmeq       v0.8h, zn, zm
3609  *    and        v0.8h, v0.8h, mask
3610  *    addv       h0, v0.8h
3611  *    and        v0.8b, pg
3612  *
3613  * However, coming up with an abstraction that allows vector inputs and
3614  * a scalar output, and also handles the byte-ordering of sub-uint64_t
3615  * scalar outputs, is tricky.
3616  */
3617 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK)                                 \
3618 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3619 {                                                                            \
3620     intptr_t opr_sz = simd_oprsz(desc);                                      \
3621     uint32_t flags = PREDTEST_INIT;                                          \
3622     intptr_t i = opr_sz;                                                     \
3623     do {                                                                     \
3624         uint64_t out = 0, pg;                                                \
3625         do {                                                                 \
3626             i -= sizeof(TYPE), out <<= sizeof(TYPE);                         \
3627             TYPE nn = *(TYPE *)(vn + H(i));                                  \
3628             TYPE mm = *(TYPE *)(vm + H(i));                                  \
3629             out |= nn OP mm;                                                 \
3630         } while (i & 63);                                                    \
3631         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
3632         out &= pg;                                                           \
3633         *(uint64_t *)(vd + (i >> 3)) = out;                                  \
3634         flags = iter_predtest_bwd(out, pg, flags);                           \
3635     } while (i > 0);                                                         \
3636     return flags;                                                            \
3637 }
3638 
3639 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3640     DO_CMP_PPZZ(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
3641 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3642     DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3643 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3644     DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3645 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3646     DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
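
/*
 * The masks above select the significant predicate bit of each element
 * (one predicate bit per byte of vector data), so "out &= pg" both applies
 * the governing predicate and clears the don't-care bits between elements.
 */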
3647 
3648 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t,  ==)
3649 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
3650 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
3651 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
3652 
3653 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t,  !=)
3654 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
3655 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
3656 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
3657 
3658 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t,  >)
3659 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
3660 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
3661 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
3662 
3663 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t,  >=)
3664 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
3665 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
3666 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
3667 
3668 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t,  >)
3669 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
3670 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
3671 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
3672 
3673 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t,  >=)
3674 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
3675 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
3676 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
3677 
3678 #undef DO_CMP_PPZZ_B
3679 #undef DO_CMP_PPZZ_H
3680 #undef DO_CMP_PPZZ_S
3681 #undef DO_CMP_PPZZ_D
3682 #undef DO_CMP_PPZZ
3683 
3684 /* Similar, but the second source is "wide".  */
3685 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK)                     \
3686 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3687 {                                                                            \
3688     intptr_t opr_sz = simd_oprsz(desc);                                      \
3689     uint32_t flags = PREDTEST_INIT;                                          \
3690     intptr_t i = opr_sz;                                                     \
3691     do {                                                                     \
3692         uint64_t out = 0, pg;                                                \
3693         do {                                                                 \
3694             TYPEW mm = *(TYPEW *)(vm + i - 8);                               \
3695             do {                                                             \
3696                 i -= sizeof(TYPE), out <<= sizeof(TYPE);                     \
3697                 TYPE nn = *(TYPE *)(vn + H(i));                              \
3698                 out |= nn OP mm;                                             \
3699             } while (i & 7);                                                 \
3700         } while (i & 63);                                                    \
3701         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
3702         out &= pg;                                                           \
3703         *(uint64_t *)(vd + (i >> 3)) = out;                                  \
3704         flags = iter_predtest_bwd(out, pg, flags);                           \
3705     } while (i > 0);                                                         \
3706     return flags;                                                            \
3707 }
3708 
3709 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3710     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1,   0xffffffffffffffffull)
3711 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3712     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3713 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3714     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
3715 
3716 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t,  uint64_t, ==)
3717 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
3718 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
3719 
3720 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t,  uint64_t, !=)
3721 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
3722 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
3723 
3724 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t,   int64_t, >)
3725 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t,  int64_t, >)
3726 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t,  int64_t, >)
3727 
3728 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t,   int64_t, >=)
3729 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t,  int64_t, >=)
3730 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t,  int64_t, >=)
3731 
3732 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t,  uint64_t, >)
3733 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
3734 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
3735 
3736 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t,  uint64_t, >=)
3737 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
3738 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
3739 
3740 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t,   int64_t, <)
3741 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t,  int64_t, <)
3742 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t,  int64_t, <)
3743 
3744 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t,   int64_t, <=)
3745 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t,  int64_t, <=)
3746 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t,  int64_t, <=)
3747 
3748 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t,  uint64_t, <)
3749 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
3750 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
3751 
3752 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t,  uint64_t, <=)
3753 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
3754 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
3755 
3756 #undef DO_CMP_PPZW_B
3757 #undef DO_CMP_PPZW_H
3758 #undef DO_CMP_PPZW_S
3759 #undef DO_CMP_PPZW
3760 
3761 /* Similar, but the second source is immediate.  */
3762 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK)                         \
3763 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)   \
3764 {                                                                    \
3765     intptr_t opr_sz = simd_oprsz(desc);                              \
3766     uint32_t flags = PREDTEST_INIT;                                  \
3767     TYPE mm = simd_data(desc);                                       \
3768     intptr_t i = opr_sz;                                             \
3769     do {                                                             \
3770         uint64_t out = 0, pg;                                        \
3771         do {                                                         \
3772             i -= sizeof(TYPE), out <<= sizeof(TYPE);                 \
3773             TYPE nn = *(TYPE *)(vn + H(i));                          \
3774             out |= nn OP mm;                                         \
3775         } while (i & 63);                                            \
3776         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                    \
3777         out &= pg;                                                   \
3778         *(uint64_t *)(vd + (i >> 3)) = out;                          \
3779         flags = iter_predtest_bwd(out, pg, flags);                   \
3780     } while (i > 0);                                                 \
3781     return flags;                                                    \
3782 }
3783 
3784 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3785     DO_CMP_PPZI(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
3786 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3787     DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3788 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3789     DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3790 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3791     DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3792 
3793 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t,  ==)
3794 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
3795 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
3796 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
3797 
3798 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t,  !=)
3799 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
3800 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
3801 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
3802 
3803 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t,  >)
3804 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
3805 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
3806 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
3807 
3808 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t,  >=)
3809 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
3810 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
3811 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
3812 
3813 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t,  >)
3814 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
3815 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
3816 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
3817 
3818 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t,  >=)
3819 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
3820 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
3821 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
3822 
3823 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t,  <)
3824 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
3825 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
3826 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
3827 
3828 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t,  <=)
3829 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
3830 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
3831 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
3832 
3833 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t,  <)
3834 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
3835 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
3836 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
3837 
3838 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t,  <=)
3839 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
3840 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
3841 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
3842 
3843 #undef DO_CMP_PPZI_B
3844 #undef DO_CMP_PPZI_H
3845 #undef DO_CMP_PPZI_S
3846 #undef DO_CMP_PPZI_D
3847 #undef DO_CMP_PPZI
3848 
3849 /* Similar to the ARM LastActive pseudocode function.  */
3850 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
3851 {
3852     intptr_t i;
3853 
3854     for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
3855         uint64_t pg = *(uint64_t *)(vg + i);
3856         if (pg) {
3857             return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
3858         }
3859     }
3860     return 0;
3861 }
3862 
3863 /* Compute a mask into RETB that is true for all G, up to and including
3864  * (if after) or excluding (if !after) the first G & N.
3865  * Return true if BRK found.
3866  */
3867 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
3868                         bool brk, bool after)
3869 {
3870     uint64_t b;
3871 
3872     if (brk) {
3873         b = 0;
3874     } else if ((g & n) == 0) {
3875         /* For all G, no N are set; break not found.  */
3876         b = g;
3877     } else {
3878         /* Break somewhere in N.  Locate it.  */
3879         b = g & n;            /* guard true, pred true */
3880         b = b & -b;           /* first such */
3881         if (after) {
3882             b = b | (b - 1);  /* break after same */
3883         } else {
3884             b = b - 1;        /* break before same */
3885         }
3886         brk = true;
3887     }
3888 
3889     *retb = b;
3890     return brk;
3891 }
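
/*
 * For example, with g == 0xff and n == 0x10 (first active N at bit 4),
 * the break-after form returns b == 0x1f and the break-before form
 * returns b == 0x0f; in both cases brk becomes true for later words.
 */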
3892 
3893 /* Compute a zeroing BRK.  */
3894 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
3895                           intptr_t oprsz, bool after)
3896 {
3897     bool brk = false;
3898     intptr_t i;
3899 
3900     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3901         uint64_t this_b, this_g = g[i];
3902 
3903         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3904         d[i] = this_b & this_g;
3905     }
3906 }
3907 
3908 /* Likewise, but also compute flags.  */
3909 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
3910                                intptr_t oprsz, bool after)
3911 {
3912     uint32_t flags = PREDTEST_INIT;
3913     bool brk = false;
3914     intptr_t i;
3915 
3916     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3917         uint64_t this_b, this_d, this_g = g[i];
3918 
3919         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3920         d[i] = this_d = this_b & this_g;
3921         flags = iter_predtest_fwd(this_d, this_g, flags);
3922     }
3923     return flags;
3924 }
3925 
3926 /* Compute a merging BRK.  */
3927 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
3928                           intptr_t oprsz, bool after)
3929 {
3930     bool brk = false;
3931     intptr_t i;
3932 
3933     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3934         uint64_t this_b, this_g = g[i];
3935 
3936         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3937         d[i] = (this_b & this_g) | (d[i] & ~this_g);
3938     }
3939 }
3940 
3941 /* Likewise, but also compute flags.  */
3942 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3943                                intptr_t oprsz, bool after)
3944 {
3945     uint32_t flags = PREDTEST_INIT;
3946     bool brk = false;
3947     intptr_t i;
3948 
3949     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3950         uint64_t this_b, this_d = d[i], this_g = g[i];
3951 
3952         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3953         d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3954         flags = iter_predtest_fwd(this_d, this_g, flags);
3955     }
3956     return flags;
3957 }
3958 
3959 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
3960 {
3961     /* It is quicker to zero the whole predicate than loop on OPRSZ.
3962      * The compiler should turn this into 4 64-bit integer stores.
3963      */
3964     memset(d, 0, sizeof(ARMPredicateReg));
3965     return PREDTEST_INIT;
3966 }
3967 
3968 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
3969                        uint32_t pred_desc)
3970 {
3971     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3972     if (last_active_pred(vn, vg, oprsz)) {
3973         compute_brk_z(vd, vm, vg, oprsz, true);
3974     } else {
3975         do_zero(vd, oprsz);
3976     }
3977 }
3978 
3979 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
3980                             uint32_t pred_desc)
3981 {
3982     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3983     if (last_active_pred(vn, vg, oprsz)) {
3984         return compute_brks_z(vd, vm, vg, oprsz, true);
3985     } else {
3986         return do_zero(vd, oprsz);
3987     }
3988 }
3989 
3990 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
3991                        uint32_t pred_desc)
3992 {
3993     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3994     if (last_active_pred(vn, vg, oprsz)) {
3995         compute_brk_z(vd, vm, vg, oprsz, false);
3996     } else {
3997         do_zero(vd, oprsz);
3998     }
3999 }
4000 
4001 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
4002                             uint32_t pred_desc)
4003 {
4004     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4005     if (last_active_pred(vn, vg, oprsz)) {
4006         return compute_brks_z(vd, vm, vg, oprsz, false);
4007     } else {
4008         return do_zero(vd, oprsz);
4009     }
4010 }
4011 
4012 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4013 {
4014     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4015     compute_brk_z(vd, vn, vg, oprsz, true);
4016 }
4017 
4018 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4019 {
4020     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4021     return compute_brks_z(vd, vn, vg, oprsz, true);
4022 }
4023 
4024 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4025 {
4026     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4027     compute_brk_z(vd, vn, vg, oprsz, false);
4028 }
4029 
4030 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4031 {
4032     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4033     return compute_brks_z(vd, vn, vg, oprsz, false);
4034 }
4035 
4036 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4037 {
4038     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4039     compute_brk_m(vd, vn, vg, oprsz, true);
4040 }
4041 
4042 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4043 {
4044     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4045     return compute_brks_m(vd, vn, vg, oprsz, true);
4046 }
4047 
4048 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4049 {
4050     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4051     compute_brk_m(vd, vn, vg, oprsz, false);
4052 }
4053 
4054 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4055 {
4056     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4057     return compute_brks_m(vd, vn, vg, oprsz, false);
4058 }
4059 
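/*
 * BRKN: if the last active element of N (under G) is true, the
 * destination (which for this destructive form already holds the
 * second source predicate) is left untouched; otherwise the whole
 * destination predicate is zeroed.
 */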
4060 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4061 {
4062     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4063     if (!last_active_pred(vn, vg, oprsz)) {
4064         do_zero(vd, oprsz);
4065     }
4066 }
4067 
4068 /* As if PredTest(Ones(PL), D, esz).  */
4069 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
4070                               uint64_t esz_mask)
4071 {
4072     uint32_t flags = PREDTEST_INIT;
4073     intptr_t i;
4074 
4075     for (i = 0; i < oprsz / 8; i++) {
4076         flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
4077     }
4078     if (oprsz & 7) {
4079         uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
4080         flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
4081     }
4082     return flags;
4083 }
4084 
4085 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4086 {
4087     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4088     if (last_active_pred(vn, vg, oprsz)) {
4089         return predtest_ones(vd, oprsz, -1);
4090     } else {
4091         return do_zero(vd, oprsz);
4092     }
4093 }
4094 
4095 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
4096 {
4097     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
4098     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4099     uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
4100     intptr_t i;
4101 
4102     for (i = 0; i < words; ++i) {
4103         uint64_t t = n[i] & g[i] & mask;
4104         sum += ctpop64(t);
4105     }
4106     return sum;
4107 }
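/*
 * Worked example for CNTP: with ESZ == 2, pred_esz_masks[2] keeps one
 * predicate bit per 32-bit element (0x1111111111111111), so only every
 * fourth bit of N & G contributes to the population count.
 */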
4108 
4109 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
4110 {
4111     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4112     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4113     uint64_t esz_mask = pred_esz_masks[esz];
4114     ARMPredicateReg *d = vd;
4115     uint32_t flags;
4116     intptr_t i;
4117 
4118     /* Begin with a zero predicate register.  */
4119     flags = do_zero(d, oprsz);
4120     if (count == 0) {
4121         return flags;
4122     }
4123 
4124     /* Set all of the requested bits.  */
4125     for (i = 0; i < count / 64; ++i) {
4126         d->p[i] = esz_mask;
4127     }
4128     if (count & 63) {
4129         d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
4130     }
4131 
4132     return predtest_ones(d, oprsz, esz_mask);
4133 }
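/*
 * The count argument above is in predicate bits: e.g. with ESZ == 2
 * and count == 20, p[0] becomes 0x11111, i.e. the low five 32-bit
 * elements are active and everything above them is false.
 */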
4134 
4135 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
4136 {
4137     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4138     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4139     uint64_t esz_mask = pred_esz_masks[esz];
4140     ARMPredicateReg *d = vd;
4141     intptr_t i, invcount, oprbits;
4142     uint64_t bits;
4143 
4144     if (count == 0) {
4145         return do_zero(d, oprsz);
4146     }
4147 
4148     oprbits = oprsz * 8;
4149     tcg_debug_assert(count <= oprbits);
4150 
4151     bits = esz_mask;
4152     if (oprbits & 63) {
4153         bits &= MAKE_64BIT_MASK(0, oprbits & 63);
4154     }
4155 
4156     invcount = oprbits - count;
4157     for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
4158         d->p[i] = bits;
4159         bits = esz_mask;
4160     }
4161 
4162     d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
4163 
4164     while (--i >= 0) {
4165         d->p[i] = 0;
4166     }
4167 
4168     return predtest_ones(d, oprsz, esz_mask);
4169 }
4170 
4171 /* Recursive reduction using a given two-operand function;
4172  * cf. the ARM ARM function ReducePredicated.
4173  *
4174  * While it would be possible to write this without the DATA temporary,
4175  * it is much simpler to process the predicate register this way.
4176  * The recursion is bounded to depth 7 (128 fp16 elements), so there's
4177  * little to gain with a more complex non-recursive form.
4178  */
4179 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT)                         \
4180 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
4181 {                                                                     \
4182     if (n == 1) {                                                     \
4183         return *data;                                                 \
4184     } else {                                                          \
4185         uintptr_t half = n / 2;                                       \
4186         TYPE lo = NAME##_reduce(data, status, half);                  \
4187         TYPE hi = NAME##_reduce(data + half, status, half);           \
4188         return TYPE##_##FUNC(lo, hi, status);                         \
4189     }                                                                 \
4190 }                                                                     \
4191 uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc)    \
4192 {                                                                     \
4193     uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc);   \
4194     TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];                   \
4195     for (i = 0; i < oprsz; ) {                                        \
4196         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));               \
4197         do {                                                          \
4198             TYPE nn = *(TYPE *)(vn + H(i));                           \
4199             *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT);      \
4200             i += sizeof(TYPE), pg >>= sizeof(TYPE);                   \
4201         } while (i & 15);                                             \
4202     }                                                                 \
4203     for (; i < maxsz; i += sizeof(TYPE)) {                            \
4204         *(TYPE *)((void *)data + i) = IDENT;                          \
4205     }                                                                 \
4206     return NAME##_reduce(data, vs, maxsz / sizeof(TYPE));             \
4207 }
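/*
 * Illustrative expansion of one instantiation (a sketch, with the
 * macro plumbing removed):
 *
 *   static float32 sve_faddv_s_reduce(float32 *data, float_status *st,
 *                                     uintptr_t n)
 *   {
 *       if (n == 1) {
 *           return *data;
 *       } else {
 *           uintptr_t half = n / 2;
 *           float32 lo = sve_faddv_s_reduce(data, st, half);
 *           float32 hi = sve_faddv_s_reduce(data + half, st, half);
 *           return float32_add(lo, hi, st);
 *       }
 *   }
 *
 * i.e. a balanced tree of float32_add calls over a power-of-two sized
 * buffer that HELPER(sve_faddv_s) first fills with the active elements
 * (or float32_zero for the inactive ones).
 */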
4208 
4209 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
4210 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
4211 DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero)
4212 
4213 /* Identity is floatN_default_nan, without the function call.  */
4214 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
4215 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
4216 DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL)
4217 
4218 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
4219 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
4220 DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL)
4221 
4222 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
4223 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
4224 DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity)
4225 
4226 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
4227 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
4228 DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity))
4229 
4230 #undef DO_REDUCE
4231 
4232 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
4233                              void *status, uint32_t desc)
4234 {
4235     intptr_t i = 0, opr_sz = simd_oprsz(desc);
4236     float16 result = nn;
4237 
4238     do {
4239         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4240         do {
4241             if (pg & 1) {
4242                 float16 mm = *(float16 *)(vm + H1_2(i));
4243                 result = float16_add(result, mm, status);
4244             }
4245             i += sizeof(float16), pg >>= sizeof(float16);
4246         } while (i & 15);
4247     } while (i < opr_sz);
4248 
4249     return result;
4250 }
4251 
4252 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4253                              void *status, uint32_t desc)
4254 {
4255     intptr_t i = 0, opr_sz = simd_oprsz(desc);
4256     float32 result = nn;
4257 
4258     do {
4259         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4260         do {
4261             if (pg & 1) {
4262                 float32 mm = *(float32 *)(vm + H1_2(i));
4263                 result = float32_add(result, mm, status);
4264             }
4265             i += sizeof(float32), pg >>= sizeof(float32);
4266         } while (i & 15);
4267     } while (i < opr_sz);
4268 
4269     return result;
4270 }
4271 
4272 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4273                              void *status, uint32_t desc)
4274 {
4275     intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4276     uint64_t *m = vm;
4277     uint8_t *pg = vg;
4278 
4279     for (i = 0; i < opr_sz; i++) {
4280         if (pg[H1(i)] & 1) {
4281             nn = float64_add(nn, m[i], status);
4282         }
4283     }
4284 
4285     return nn;
4286 }
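/*
 * Unlike the FADDV tree reduction above, FADDA is architecturally an
 * ordered accumulation: each active element is added to the running
 * total in element order, so these loops must not be re-associated.
 */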
4287 
4288 /* Fully general three-operand expander, controlled by a predicate,
4289  * with the extra float_status parameter.
4290  */
4291 #define DO_ZPZZ_FP(NAME, TYPE, H, OP)                           \
4292 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,       \
4293                   void *status, uint32_t desc)                  \
4294 {                                                               \
4295     intptr_t i = simd_oprsz(desc);                              \
4296     uint64_t *g = vg;                                           \
4297     do {                                                        \
4298         uint64_t pg = g[(i - 1) >> 6];                          \
4299         do {                                                    \
4300             i -= sizeof(TYPE);                                  \
4301             if (likely((pg >> (i & 63)) & 1)) {                 \
4302                 TYPE nn = *(TYPE *)(vn + H(i));                 \
4303                 TYPE mm = *(TYPE *)(vm + H(i));                 \
4304                 *(TYPE *)(vd + H(i)) = OP(nn, mm, status);      \
4305             }                                                   \
4306         } while (i & 63);                                       \
4307     } while (i != 0);                                           \
4308 }
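/*
 * The expansion walks the vector from the top down, one 64-bit
 * predicate word at a time: i starts at oprsz, is decremented by
 * sizeof(TYPE) per element, and the next (lower) word of g[] is
 * fetched each time i crosses a 64-byte chunk boundary.  Inactive
 * elements are simply skipped, leaving the destination untouched
 * (merging predication).
 */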
4309 
4310 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
4311 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
4312 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)
4313 
4314 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
4315 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
4316 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)
4317 
4318 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
4319 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
4320 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)
4321 
4322 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
4323 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
4324 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)
4325 
4326 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
4327 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
4328 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)
4329 
4330 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
4331 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
4332 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)
4333 
4334 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
4335 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
4336 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)
4337 
4338 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
4339 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
4340 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
4341 
4342 static inline float16 abd_h(float16 a, float16 b, float_status *s)
4343 {
4344     return float16_abs(float16_sub(a, b, s));
4345 }
4346 
4347 static inline float32 abd_s(float32 a, float32 b, float_status *s)
4348 {
4349     return float32_abs(float32_sub(a, b, s));
4350 }
4351 
4352 static inline float64 abd_d(float64 a, float64 b, float_status *s)
4353 {
4354     return float64_abs(float64_sub(a, b, s));
4355 }
4356 
4357 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
4358 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
4359 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
4360 
4361 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4362 {
4363     int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4364     return float64_scalbn(a, b_int, s);
4365 }
4366 
4367 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
4368 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
4369 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)
4370 
4371 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
4372 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
4373 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)
4374 
4375 #undef DO_ZPZZ_FP
4376 
4377 /* Three-operand expander, with one scalar operand, controlled by
4378  * a predicate, with the extra float_status parameter.
4379  */
4380 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
4381 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,  \
4382                   void *status, uint32_t desc)                    \
4383 {                                                                 \
4384     intptr_t i = simd_oprsz(desc);                                \
4385     uint64_t *g = vg;                                             \
4386     TYPE mm = scalar;                                             \
4387     do {                                                          \
4388         uint64_t pg = g[(i - 1) >> 6];                            \
4389         do {                                                      \
4390             i -= sizeof(TYPE);                                    \
4391             if (likely((pg >> (i & 63)) & 1)) {                   \
4392                 TYPE nn = *(TYPE *)(vn + H(i));                   \
4393                 *(TYPE *)(vd + H(i)) = OP(nn, mm, status);        \
4394             }                                                     \
4395         } while (i & 63);                                         \
4396     } while (i != 0);                                             \
4397 }
4398 
4399 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
4400 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
4401 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)
4402 
4403 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
4404 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
4405 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)
4406 
4407 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
4408 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
4409 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
4410 
4411 static inline float16 subr_h(float16 a, float16 b, float_status *s)
4412 {
4413     return float16_sub(b, a, s);
4414 }
4415 
4416 static inline float32 subr_s(float32 a, float32 b, float_status *s)
4417 {
4418     return float32_sub(b, a, s);
4419 }
4420 
4421 static inline float64 subr_d(float64 a, float64 b, float_status *s)
4422 {
4423     return float64_sub(b, a, s);
4424 }
4425 
4426 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
4427 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
4428 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)
4429 
4430 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
4431 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
4432 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)
4433 
4434 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
4435 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
4436 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)
4437 
4438 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
4439 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
4440 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)
4441 
4442 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
4443 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
4444 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
4445 
4446 /* Fully general two-operand expander, controlled by a predicate,
4447  * with the extra float_status parameter.
4448  */
4449 #define DO_ZPZ_FP(NAME, TYPE, H, OP)                                  \
4450 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
4451 {                                                                     \
4452     intptr_t i = simd_oprsz(desc);                                    \
4453     uint64_t *g = vg;                                                 \
4454     do {                                                              \
4455         uint64_t pg = g[(i - 1) >> 6];                                \
4456         do {                                                          \
4457             i -= sizeof(TYPE);                                        \
4458             if (likely((pg >> (i & 63)) & 1)) {                       \
4459                 TYPE nn = *(TYPE *)(vn + H(i));                       \
4460                 *(TYPE *)(vd + H(i)) = OP(nn, status);                \
4461             }                                                         \
4462         } while (i & 63);                                             \
4463     } while (i != 0);                                                 \
4464 }
4465 
4466 /* SVE fp16 conversions always use IEEE mode.  Like AdvSIMD, they ignore
4467  * FZ16.  When converting from fp16, this affects flushing input denormals;
4468  * when converting to fp16, this affects flushing output denormals.
4469  */
4470 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
4471 {
4472     bool save = get_flush_inputs_to_zero(fpst);
4473     float32 ret;
4474 
4475     set_flush_inputs_to_zero(false, fpst);
4476     ret = float16_to_float32(f, true, fpst);
4477     set_flush_inputs_to_zero(save, fpst);
4478     return ret;
4479 }
4480 
4481 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4482 {
4483     bool save = get_flush_inputs_to_zero(fpst);
4484     float64 ret;
4485 
4486     set_flush_inputs_to_zero(false, fpst);
4487     ret = float16_to_float64(f, true, fpst);
4488     set_flush_inputs_to_zero(save, fpst);
4489     return ret;
4490 }
4491 
4492 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
4493 {
4494     bool save = get_flush_to_zero(fpst);
4495     float16 ret;
4496 
4497     set_flush_to_zero(false, fpst);
4498     ret = float32_to_float16(f, true, fpst);
4499     set_flush_to_zero(save, fpst);
4500     return ret;
4501 }
4502 
4503 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4504 {
4505     bool save = get_flush_to_zero(fpst);
4506     float16 ret;
4507 
4508     set_flush_to_zero(false, fpst);
4509     ret = float64_to_float16(f, true, fpst);
4510     set_flush_to_zero(save, fpst);
4511     return ret;
4512 }
4513 
4514 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4515 {
4516     if (float16_is_any_nan(f)) {
4517         float_raise(float_flag_invalid, s);
4518         return 0;
4519     }
4520     return float16_to_int16_round_to_zero(f, s);
4521 }
4522 
4523 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4524 {
4525     if (float16_is_any_nan(f)) {
4526         float_raise(float_flag_invalid, s);
4527         return 0;
4528     }
4529     return float16_to_int64_round_to_zero(f, s);
4530 }
4531 
4532 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4533 {
4534     if (float32_is_any_nan(f)) {
4535         float_raise(float_flag_invalid, s);
4536         return 0;
4537     }
4538     return float32_to_int64_round_to_zero(f, s);
4539 }
4540 
4541 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4542 {
4543     if (float64_is_any_nan(f)) {
4544         float_raise(float_flag_invalid, s);
4545         return 0;
4546     }
4547     return float64_to_int64_round_to_zero(f, s);
4548 }
4549 
4550 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4551 {
4552     if (float16_is_any_nan(f)) {
4553         float_raise(float_flag_invalid, s);
4554         return 0;
4555     }
4556     return float16_to_uint16_round_to_zero(f, s);
4557 }
4558 
4559 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4560 {
4561     if (float16_is_any_nan(f)) {
4562         float_raise(float_flag_invalid, s);
4563         return 0;
4564     }
4565     return float16_to_uint64_round_to_zero(f, s);
4566 }
4567 
4568 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4569 {
4570     if (float32_is_any_nan(f)) {
4571         float_raise(float_flag_invalid, s);
4572         return 0;
4573     }
4574     return float32_to_uint64_round_to_zero(f, s);
4575 }
4576 
4577 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4578 {
4579     if (float64_is_any_nan(f)) {
4580         float_raise(float_flag_invalid, s);
4581         return 0;
4582     }
4583     return float64_to_uint64_round_to_zero(f, s);
4584 }
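/*
 * The explicit NaN checks above exist because these conversions must
 * return 0 for a NaN input (while still raising Invalid), and the
 * generic softfloat round-to-zero routines would not return zero in
 * that case.
 */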
4585 
4586 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
4587 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
4588 DO_ZPZ_FP(sve_bfcvt,   uint32_t, H1_4, float32_to_bfloat16)
4589 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
4590 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
4591 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
4592 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)
4593 
4594 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
4595 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
4596 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
4597 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
4598 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
4599 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
4600 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)
4601 
4602 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
4603 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
4604 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
4605 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
4606 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
4607 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
4608 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)
4609 
4610 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
4611 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
4612 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)
4613 
4614 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
4615 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
4616 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)
4617 
4618 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
4619 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
4620 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)
4621 
4622 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
4623 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
4624 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)
4625 
4626 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
4627 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
4628 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
4629 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
4630 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
4631 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
4632 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)
4633 
4634 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
4635 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
4636 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
4637 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
4638 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
4639 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
4640 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
4641 
4642 static int16_t do_float16_logb_as_int(float16 a, float_status *s)
4643 {
4644     /* Extract frac to the top of the uint32_t. */
4645     uint32_t frac = (uint32_t)a << (16 + 6);
4646     int16_t exp = extract32(a, 10, 5);
4647 
4648     if (unlikely(exp == 0)) {
4649         if (frac != 0) {
4650             if (!get_flush_inputs_to_zero(s)) {
4651                 /* denormal: bias - fractional_zeros */
4652                 return -15 - clz32(frac);
4653             }
4654             /* flush to zero */
4655             float_raise(float_flag_input_denormal, s);
4656         }
4657     } else if (unlikely(exp == 0x1f)) {
4658         if (frac == 0) {
4659             return INT16_MAX; /* infinity */
4660         }
4661     } else {
4662         /* normal: exp - bias */
4663         return exp - 15;
4664     }
4665     /* nan or zero */
4666     float_raise(float_flag_invalid, s);
4667     return INT16_MIN;
4668 }
4669 
4670 static int32_t do_float32_logb_as_int(float32 a, float_status *s)
4671 {
4672     /* Extract frac to the top of the uint32_t. */
4673     uint32_t frac = a << 9;
4674     int32_t exp = extract32(a, 23, 8);
4675 
4676     if (unlikely(exp == 0)) {
4677         if (frac != 0) {
4678             if (!get_flush_inputs_to_zero(s)) {
4679                 /* denormal: bias - fractional_zeros */
4680                 return -127 - clz32(frac);
4681             }
4682             /* flush to zero */
4683             float_raise(float_flag_input_denormal, s);
4684         }
4685     } else if (unlikely(exp == 0xff)) {
4686         if (frac == 0) {
4687             return INT32_MAX; /* infinity */
4688         }
4689     } else {
4690         /* normal: exp - bias */
4691         return exp - 127;
4692     }
4693     /* nan or zero */
4694     float_raise(float_flag_invalid, s);
4695     return INT32_MIN;
4696 }
4697 
4698 static int64_t do_float64_logb_as_int(float64 a, float_status *s)
4699 {
4700     /* Extract frac to the top of the uint64_t. */
4701     uint64_t frac = a << 12;
4702     int64_t exp = extract64(a, 52, 11);
4703 
4704     if (unlikely(exp == 0)) {
4705         if (frac != 0) {
4706             if (!get_flush_inputs_to_zero(s)) {
4707                 /* denormal: bias - fractional_zeros */
4708                 return -1023 - clz64(frac);
4709             }
4710             /* flush to zero */
4711             float_raise(float_flag_input_denormal, s);
4712         }
4713     } else if (unlikely(exp == 0x7ff)) {
4714         if (frac == 0) {
4715             return INT64_MAX; /* infinity */
4716         }
4717     } else {
4718         /* normal: exp - bias */
4719         return exp - 1023;
4720     }
4721     /* nan or zero */
4722     float_raise(float_flag_invalid, s);
4723     return INT64_MIN;
4724 }
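/*
 * Worked example: FLOGB of the single-precision value 12.0
 * (0x41400000) has exp == 0x82, so do_float32_logb_as_int returns
 * 0x82 - 127 = 3, i.e. floor(log2(12)).
 */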
4725 
4726 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
4727 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
4728 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)
4729 
4730 #undef DO_ZPZ_FP
4731 
4732 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
4733                             float_status *status, uint32_t desc,
4734                             uint16_t neg1, uint16_t neg3)
4735 {
4736     intptr_t i = simd_oprsz(desc);
4737     uint64_t *g = vg;
4738 
4739     do {
4740         uint64_t pg = g[(i - 1) >> 6];
4741         do {
4742             i -= 2;
4743             if (likely((pg >> (i & 63)) & 1)) {
4744                 float16 e1, e2, e3, r;
4745 
4746                 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
4747                 e2 = *(uint16_t *)(vm + H1_2(i));
4748                 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
4749                 r = float16_muladd(e1, e2, e3, 0, status);
4750                 *(uint16_t *)(vd + H1_2(i)) = r;
4751             }
4752         } while (i & 63);
4753     } while (i != 0);
4754 }
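/*
 * The neg1/neg3 arguments are sign-bit masks XORed into the first and
 * third operands, which lets the four helpers below share one loop:
 *   FMLA:   a + n * m   (0, 0)
 *   FMLS:   a - n * m   (0x8000, 0)
 *   FNMLA: -a - n * m   (0x8000, 0x8000)
 *   FNMLS: -a + n * m   (0, 0x8000)
 */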
4755 
4756 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4757                               void *vg, void *status, uint32_t desc)
4758 {
4759     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
4760 }
4761 
4762 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4763                               void *vg, void *status, uint32_t desc)
4764 {
4765     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
4766 }
4767 
4768 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4769                                void *vg, void *status, uint32_t desc)
4770 {
4771     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
4772 }
4773 
4774 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4775                                void *vg, void *status, uint32_t desc)
4776 {
4777     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
4778 }
4779 
4780 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
4781                             float_status *status, uint32_t desc,
4782                             uint32_t neg1, uint32_t neg3)
4783 {
4784     intptr_t i = simd_oprsz(desc);
4785     uint64_t *g = vg;
4786 
4787     do {
4788         uint64_t pg = g[(i - 1) >> 6];
4789         do {
4790             i -= 4;
4791             if (likely((pg >> (i & 63)) & 1)) {
4792                 float32 e1, e2, e3, r;
4793 
4794                 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
4795                 e2 = *(uint32_t *)(vm + H1_4(i));
4796                 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
4797                 r = float32_muladd(e1, e2, e3, 0, status);
4798                 *(uint32_t *)(vd + H1_4(i)) = r;
4799             }
4800         } while (i & 63);
4801     } while (i != 0);
4802 }
4803 
4804 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4805                               void *vg, void *status, uint32_t desc)
4806 {
4807     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
4808 }
4809 
4810 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4811                               void *vg, void *status, uint32_t desc)
4812 {
4813     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
4814 }
4815 
4816 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4817                                void *vg, void *status, uint32_t desc)
4818 {
4819     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
4820 }
4821 
4822 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4823                                void *vg, void *status, uint32_t desc)
4824 {
4825     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
4826 }
4827 
4828 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
4829                             float_status *status, uint32_t desc,
4830                             uint64_t neg1, uint64_t neg3)
4831 {
4832     intptr_t i = simd_oprsz(desc);
4833     uint64_t *g = vg;
4834 
4835     do {
4836         uint64_t pg = g[(i - 1) >> 6];
4837         do {
4838             i -= 8;
4839             if (likely((pg >> (i & 63)) & 1)) {
4840                 float64 e1, e2, e3, r;
4841 
4842                 e1 = *(uint64_t *)(vn + i) ^ neg1;
4843                 e2 = *(uint64_t *)(vm + i);
4844                 e3 = *(uint64_t *)(va + i) ^ neg3;
4845                 r = float64_muladd(e1, e2, e3, 0, status);
4846                 *(uint64_t *)(vd + i) = r;
4847             }
4848         } while (i & 63);
4849     } while (i != 0);
4850 }
4851 
4852 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4853                               void *vg, void *status, uint32_t desc)
4854 {
4855     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
4856 }
4857 
4858 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4859                               void *vg, void *status, uint32_t desc)
4860 {
4861     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
4862 }
4863 
4864 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4865                                void *vg, void *status, uint32_t desc)
4866 {
4867     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
4868 }
4869 
4870 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4871                                void *vg, void *status, uint32_t desc)
4872 {
4873     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
4874 }
4875 
4876 /* Two-operand floating-point comparison controlled by a predicate.
4877  * Unlike the integer version, we are not allowed to optimistically
4878  * compare operands, since the comparison may have side effects wrt
4879  * the FPSR.
4880  */
4881 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)                                \
4882 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
4883                   void *status, uint32_t desc)                          \
4884 {                                                                       \
4885     intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                    \
4886     uint64_t *d = vd, *g = vg;                                          \
4887     do {                                                                \
4888         uint64_t out = 0, pg = g[j];                                    \
4889         do {                                                            \
4890             i -= sizeof(TYPE), out <<= sizeof(TYPE);                    \
4891             if (likely((pg >> (i & 63)) & 1)) {                         \
4892                 TYPE nn = *(TYPE *)(vn + H(i));                         \
4893                 TYPE mm = *(TYPE *)(vm + H(i));                         \
4894                 out |= OP(TYPE, nn, mm, status);                        \
4895             }                                                           \
4896         } while (i & 63);                                               \
4897         d[j--] = out;                                                   \
4898     } while (i > 0);                                                    \
4899 }
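/*
 * Within each 64-byte chunk the comparison results are packed into a
 * single predicate word: OUT is shifted left by sizeof(TYPE) before
 * each element deposits its result in bit 0, so the element at byte
 * offset k within the chunk ends up owning bit k, matching the usual
 * one-bit-per-byte predicate layout.
 */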
4900 
4901 #define DO_FPCMP_PPZZ_H(NAME, OP) \
4902     DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
4903 #define DO_FPCMP_PPZZ_S(NAME, OP) \
4904     DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
4905 #define DO_FPCMP_PPZZ_D(NAME, OP) \
4906     DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)
4907 
4908 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
4909     DO_FPCMP_PPZZ_H(NAME, OP)   \
4910     DO_FPCMP_PPZZ_S(NAME, OP)   \
4911     DO_FPCMP_PPZZ_D(NAME, OP)
4912 
4913 #define DO_FCMGE(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) <= 0
4914 #define DO_FCMGT(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) < 0
4915 #define DO_FCMLE(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) <= 0
4916 #define DO_FCMLT(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) < 0
4917 #define DO_FCMEQ(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) == 0
4918 #define DO_FCMNE(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) != 0
4919 #define DO_FCMUO(TYPE, X, Y, ST)  \
4920     TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
4921 #define DO_FACGE(TYPE, X, Y, ST)  \
4922     TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
4923 #define DO_FACGT(TYPE, X, Y, ST)  \
4924     TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
4925 
4926 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
4927 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
4928 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
4929 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
4930 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
4931 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
4932 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
4933 
4934 #undef DO_FPCMP_PPZZ_ALL
4935 #undef DO_FPCMP_PPZZ_D
4936 #undef DO_FPCMP_PPZZ_S
4937 #undef DO_FPCMP_PPZZ_H
4938 #undef DO_FPCMP_PPZZ
4939 
4940 /* One-operand floating-point comparison against zero, controlled
4941  * by a predicate.
4942  */
4943 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)                   \
4944 void HELPER(NAME)(void *vd, void *vn, void *vg,            \
4945                   void *status, uint32_t desc)             \
4946 {                                                          \
4947     intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;       \
4948     uint64_t *d = vd, *g = vg;                             \
4949     do {                                                   \
4950         uint64_t out = 0, pg = g[j];                       \
4951         do {                                               \
4952             i -= sizeof(TYPE), out <<= sizeof(TYPE);       \
4953             if ((pg >> (i & 63)) & 1) {                    \
4954                 TYPE nn = *(TYPE *)(vn + H(i));            \
4955                 out |= OP(TYPE, nn, 0, status);            \
4956             }                                              \
4957         } while (i & 63);                                  \
4958         d[j--] = out;                                      \
4959     } while (i > 0);                                       \
4960 }
4961 
4962 #define DO_FPCMP_PPZ0_H(NAME, OP) \
4963     DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
4964 #define DO_FPCMP_PPZ0_S(NAME, OP) \
4965     DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
4966 #define DO_FPCMP_PPZ0_D(NAME, OP) \
4967     DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)
4968 
4969 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
4970     DO_FPCMP_PPZ0_H(NAME, OP)   \
4971     DO_FPCMP_PPZ0_S(NAME, OP)   \
4972     DO_FPCMP_PPZ0_D(NAME, OP)
4973 
4974 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
4975 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
4976 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
4977 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
4978 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
4979 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
4980 
4981 /* FP Trig Multiply-Add. */
4982 
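/*
 * The coeff[] tables below hold the FTMAD polynomial coefficients:
 * entries 0-7 are used when the corresponding element of the second
 * operand is non-negative (in effect the alternating-sign sine series
 * terms 1/(2k+1)!) and entries 8-15 when it is negative (the cosine
 * series terms 1/(2k)!), which is why the immediate index is biased
 * by 8 in that case.
 */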
4983 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4984 {
4985     static const float16 coeff[16] = {
4986         0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4987         0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4988     };
4989     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
4990     intptr_t x = simd_data(desc);
4991     float16 *d = vd, *n = vn, *m = vm;
4992     for (i = 0; i < opr_sz; i++) {
4993         float16 mm = m[i];
4994         intptr_t xx = x;
4995         if (float16_is_neg(mm)) {
4996             mm = float16_abs(mm);
4997             xx += 8;
4998         }
4999         d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
5000     }
5001 }
5002 
5003 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
5004 {
5005     static const float32 coeff[16] = {
5006         0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
5007         0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
5008         0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
5009         0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
5010     };
5011     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
5012     intptr_t x = simd_data(desc);
5013     float32 *d = vd, *n = vn, *m = vm;
5014     for (i = 0; i < opr_sz; i++) {
5015         float32 mm = m[i];
5016         intptr_t xx = x;
5017         if (float32_is_neg(mm)) {
5018             mm = float32_abs(mm);
5019             xx += 8;
5020         }
5021         d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
5022     }
5023 }
5024 
5025 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
5026 {
5027     static const float64 coeff[16] = {
5028         0x3ff0000000000000ull, 0xbfc5555555555543ull,
5029         0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
5030         0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
5031         0x3de5d8408868552full, 0x0000000000000000ull,
5032         0x3ff0000000000000ull, 0xbfe0000000000000ull,
5033         0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
5034         0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
5035         0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
5036     };
5037     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
5038     intptr_t x = simd_data(desc);
5039     float64 *d = vd, *n = vn, *m = vm;
5040     for (i = 0; i < opr_sz; i++) {
5041         float64 mm = m[i];
5042         intptr_t xx = x;
5043         if (float64_is_neg(mm)) {
5044             mm = float64_abs(mm);
5045             xx += 8;
5046         }
5047         d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
5048     }
5049 }
5050 
5051 /*
5052  * FP Complex Add
5053  */
5054 
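/*
 * simd_data(desc) selects the rotation.  The helpers implement it
 * purely with sign flips: for data == 0 each pair computes
 * d.real = n.real - m.imag and d.imag = n.imag + m.real (a +90 degree
 * rotation of the second operand), and for data == 1 the signs swap,
 * giving the +270 degree form.  No extra multiplications are needed.
 */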
5055 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
5056                          void *vs, uint32_t desc)
5057 {
5058     intptr_t j, i = simd_oprsz(desc);
5059     uint64_t *g = vg;
5060     float16 neg_imag = float16_set_sign(0, simd_data(desc));
5061     float16 neg_real = float16_chs(neg_imag);
5062 
5063     do {
5064         uint64_t pg = g[(i - 1) >> 6];
5065         do {
5066             float16 e0, e1, e2, e3;
5067 
5068             /* I holds the real index; J holds the imag index.  */
5069             j = i - sizeof(float16);
5070             i -= 2 * sizeof(float16);
5071 
5072             e0 = *(float16 *)(vn + H1_2(i));
5073             e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
5074             e2 = *(float16 *)(vn + H1_2(j));
5075             e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
5076 
5077             if (likely((pg >> (i & 63)) & 1)) {
5078                 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
5079             }
5080             if (likely((pg >> (j & 63)) & 1)) {
5081                 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
5082             }
5083         } while (i & 63);
5084     } while (i != 0);
5085 }
5086 
5087 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
5088                          void *vs, uint32_t desc)
5089 {
5090     intptr_t j, i = simd_oprsz(desc);
5091     uint64_t *g = vg;
5092     float32 neg_imag = float32_set_sign(0, simd_data(desc));
5093     float32 neg_real = float32_chs(neg_imag);
5094 
5095     do {
5096         uint64_t pg = g[(i - 1) >> 6];
5097         do {
5098             float32 e0, e1, e2, e3;
5099 
5100             /* I holds the real index; J holds the imag index.  */
5101             j = i - sizeof(float32);
5102             i -= 2 * sizeof(float32);
5103 
5104             e0 = *(float32 *)(vn + H1_2(i));
5105             e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
5106             e2 = *(float32 *)(vn + H1_2(j));
5107             e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
5108 
5109             if (likely((pg >> (i & 63)) & 1)) {
5110                 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
5111             }
5112             if (likely((pg >> (j & 63)) & 1)) {
5113                 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
5114             }
5115         } while (i & 63);
5116     } while (i != 0);
5117 }
5118 
5119 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
5120                          void *vs, uint32_t desc)
5121 {
5122     intptr_t j, i = simd_oprsz(desc);
5123     uint64_t *g = vg;
5124     float64 neg_imag = float64_set_sign(0, simd_data(desc));
5125     float64 neg_real = float64_chs(neg_imag);
5126 
5127     do {
5128         uint64_t pg = g[(i - 1) >> 6];
5129         do {
5130             float64 e0, e1, e2, e3;
5131 
5132             /* I holds the real index; J holds the imag index.  */
5133             j = i - sizeof(float64);
5134             i -= 2 * sizeof(float64);
5135 
5136             e0 = *(float64 *)(vn + H1_2(i));
5137             e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
5138             e2 = *(float64 *)(vn + H1_2(j));
5139             e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
5140 
5141             if (likely((pg >> (i & 63)) & 1)) {
5142                 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
5143             }
5144             if (likely((pg >> (j & 63)) & 1)) {
5145                 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
5146             }
5147         } while (i & 63);
5148     } while (i != 0);
5149 }
5150 
5151 /*
5152  * FP Complex Multiply
5153  */
5154 
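/*
 * For FCMLA the two-bit rotation in simd_data(desc) is decomposed into
 * "flip" (which of the real/imaginary parts of the sources feed each
 * multiply) plus two sign-bit masks, so each of the 0/90/180/270
 * degree rotations costs exactly one fused multiply-add per real lane
 * and one per imaginary lane.
 */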
5155 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5156                                void *vg, void *status, uint32_t desc)
5157 {
5158     intptr_t j, i = simd_oprsz(desc);
5159     unsigned rot = simd_data(desc);
5160     bool flip = rot & 1;
5161     float16 neg_imag, neg_real;
5162     uint64_t *g = vg;
5163 
5164     neg_imag = float16_set_sign(0, (rot & 2) != 0);
5165     neg_real = float16_set_sign(0, rot == 1 || rot == 2);
5166 
5167     do {
5168         uint64_t pg = g[(i - 1) >> 6];
5169         do {
5170             float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
5171 
5172             /* I holds the real index; J holds the imag index.  */
5173             j = i - sizeof(float16);
5174             i -= 2 * sizeof(float16);
5175 
5176             nr = *(float16 *)(vn + H1_2(i));
5177             ni = *(float16 *)(vn + H1_2(j));
5178             mr = *(float16 *)(vm + H1_2(i));
5179             mi = *(float16 *)(vm + H1_2(j));
5180 
5181             e2 = (flip ? ni : nr);
5182             e1 = (flip ? mi : mr) ^ neg_real;
5183             e4 = e2;
5184             e3 = (flip ? mr : mi) ^ neg_imag;
5185 
5186             if (likely((pg >> (i & 63)) & 1)) {
5187                 d = *(float16 *)(va + H1_2(i));
5188                 d = float16_muladd(e2, e1, d, 0, status);
5189                 *(float16 *)(vd + H1_2(i)) = d;
5190             }
5191             if (likely((pg >> (j & 63)) & 1)) {
5192                 d = *(float16 *)(va + H1_2(j));
5193                 d = float16_muladd(e4, e3, d, 0, status);
5194                 *(float16 *)(vd + H1_2(j)) = d;
5195             }
5196         } while (i & 63);
5197     } while (i != 0);
5198 }
5199 
5200 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5201                                void *vg, void *status, uint32_t desc)
5202 {
5203     intptr_t j, i = simd_oprsz(desc);
5204     unsigned rot = simd_data(desc);
5205     bool flip = rot & 1;
5206     float32 neg_imag, neg_real;
5207     uint64_t *g = vg;
5208 
5209     neg_imag = float32_set_sign(0, (rot & 2) != 0);
5210     neg_real = float32_set_sign(0, rot == 1 || rot == 2);
5211 
5212     do {
5213         uint64_t pg = g[(i - 1) >> 6];
5214         do {
5215             float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
5216 
5217             /* I holds the real index; J holds the imag index.  */
5218             j = i - sizeof(float32);
5219             i -= 2 * sizeof(float32);
5220 
5221             nr = *(float32 *)(vn + H1_2(i));
5222             ni = *(float32 *)(vn + H1_2(j));
5223             mr = *(float32 *)(vm + H1_2(i));
5224             mi = *(float32 *)(vm + H1_2(j));
5225 
5226             e2 = (flip ? ni : nr);
5227             e1 = (flip ? mi : mr) ^ neg_real;
5228             e4 = e2;
5229             e3 = (flip ? mr : mi) ^ neg_imag;
5230 
5231             if (likely((pg >> (i & 63)) & 1)) {
5232                 d = *(float32 *)(va + H1_2(i));
5233                 d = float32_muladd(e2, e1, d, 0, status);
5234                 *(float32 *)(vd + H1_2(i)) = d;
5235             }
5236             if (likely((pg >> (j & 63)) & 1)) {
5237                 d = *(float32 *)(va + H1_2(j));
5238                 d = float32_muladd(e4, e3, d, 0, status);
5239                 *(float32 *)(vd + H1_2(j)) = d;
5240             }
5241         } while (i & 63);
5242     } while (i != 0);
5243 }
5244 
5245 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5246                                void *vg, void *status, uint32_t desc)
5247 {
5248     intptr_t j, i = simd_oprsz(desc);
5249     unsigned rot = simd_data(desc);
5250     bool flip = rot & 1;
5251     float64 neg_imag, neg_real;
5252     uint64_t *g = vg;
5253 
5254     neg_imag = float64_set_sign(0, (rot & 2) != 0);
5255     neg_real = float64_set_sign(0, rot == 1 || rot == 2);
5256 
5257     do {
5258         uint64_t pg = g[(i - 1) >> 6];
5259         do {
5260             float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
5261 
5262             /* I holds the real index; J holds the imag index.  */
5263             j = i - sizeof(float64);
5264             i -= 2 * sizeof(float64);
5265 
5266             nr = *(float64 *)(vn + H1_2(i));
5267             ni = *(float64 *)(vn + H1_2(j));
5268             mr = *(float64 *)(vm + H1_2(i));
5269             mi = *(float64 *)(vm + H1_2(j));
5270 
5271             e2 = (flip ? ni : nr);
5272             e1 = (flip ? mi : mr) ^ neg_real;
5273             e4 = e2;
5274             e3 = (flip ? mr : mi) ^ neg_imag;
5275 
5276             if (likely((pg >> (i & 63)) & 1)) {
5277                 d = *(float64 *)(va + H1_2(i));
5278                 d = float64_muladd(e2, e1, d, 0, status);
5279                 *(float64 *)(vd + H1_2(i)) = d;
5280             }
5281             if (likely((pg >> (j & 63)) & 1)) {
5282                 d = *(float64 *)(va + H1_2(j));
5283                 d = float64_muladd(e4, e3, d, 0, status);
5284                 *(float64 *)(vd + H1_2(j)) = d;
5285             }
5286         } while (i & 63);
5287     } while (i != 0);
5288 }
5289 
5290 /*
5291  * Load contiguous data, protected by a governing predicate.
5292  */
5293 
5294 /*
5295  * Skip through a sequence of inactive elements in the guarding predicate @vg,
5296  * beginning at @reg_off bounded by @reg_max.  Return the offset of the first
5297  * active element >= @reg_off, or @reg_max if there were no active elements at all.
5298  */
5299 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
5300                                  intptr_t reg_max, int esz)
5301 {
5302     uint64_t pg_mask = pred_esz_masks[esz];
5303     uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
5304 
5305     /* In normal usage, the first element is active.  */
5306     if (likely(pg & 1)) {
5307         return reg_off;
5308     }
5309 
5310     if (pg == 0) {
5311         reg_off &= -64;
5312         do {
5313             reg_off += 64;
5314             if (unlikely(reg_off >= reg_max)) {
5315                 /* The entire predicate was false.  */
5316                 return reg_max;
5317             }
5318             pg = vg[reg_off >> 6] & pg_mask;
5319         } while (pg == 0);
5320     }
5321     reg_off += ctz64(pg);
5322 
5323     /* We should never see an out of range predicate bit set.  */
5324     tcg_debug_assert(reg_off < reg_max);
5325     return reg_off;
5326 }
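/*
 * E.g. with esz == 3 (doublewords) and a predicate word of 0x0100,
 * starting from reg_off == 0: bit 0 is clear but the masked word is
 * non-zero, so the ctz64 step advances straight to bit 8 and the
 * function returns 8, the byte offset of the first active doubleword.
 */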
5327 
5328 /*
5329  * Resolve the guest virtual address to info->host and info->flags.
5330  * If @nofault, return false if the page is invalid, otherwise
5331  * exit via page fault exception.
5332  */
5333 
5334 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env,
5335                     target_ulong addr, int mem_off, MMUAccessType access_type,
5336                     int mmu_idx, uintptr_t retaddr)
5337 {
5338     int flags;
5339 
5340     addr += mem_off;
5341 
5342     /*
5343      * User-only emulation currently always runs with TBI enabled.  See the comment
5344      * above useronly_clean_ptr.  Usually we clean this top byte away
5345      * during translation, but we can't do that for e.g. vector + imm
5346      * addressing modes.
5347      *
5348      * We currently always enable TBI for user-only, and do not provide
5349      * a way to turn it off.  So clean the pointer unconditionally here,
5350      * rather than look it up here, or pass it down from above.
5351      */
5352     addr = useronly_clean_ptr(addr);
5353 
5354 #ifdef CONFIG_USER_ONLY
5355     flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
5356                                &info->host, retaddr);
5357 #else
5358     CPUTLBEntryFull *full;
5359     flags = probe_access_full(env, addr, access_type, mmu_idx, nofault,
5360                               &info->host, &full, retaddr);
5361 #endif
5362     info->flags = flags;
5363 
5364     if (flags & TLB_INVALID_MASK) {
5365         g_assert(nofault);
5366         return false;
5367     }
5368 
5369 #ifdef CONFIG_USER_ONLY
5370     memset(&info->attrs, 0, sizeof(info->attrs));
5371     /* Require both ANON and MTE; see allocation_tag_mem(). */
5372     info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE);
5373 #else
5374     info->attrs = full->attrs;
5375     info->tagged = full->pte_attrs == 0xf0;
5376 #endif
5377 
5378     /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
5379     info->host -= mem_off;
5380     return true;
5381 }
5382 
5383 /*
5384  * Find first active element on each page, and a loose bound for the
5385  * final element on each page.  Identify any single element that spans
5386  * the page boundary.  Return true if there are any active elements.
5387  */
5388 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg,
5389                             intptr_t reg_max, int esz, int msize)
5390 {
5391     const int esize = 1 << esz;
5392     const uint64_t pg_mask = pred_esz_masks[esz];
5393     intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
5394     intptr_t mem_off_last, mem_off_split;
5395     intptr_t page_split, elt_split;
5396     intptr_t i;
5397 
5398     /* Set all of the element indices to -1, and the TLB data to 0. */
5399     memset(info, -1, offsetof(SVEContLdSt, page));
5400     memset(info->page, 0, sizeof(info->page));
5401 
5402     /* Gross scan over the entire predicate to find bounds. */
5403     i = 0;
5404     do {
5405         uint64_t pg = vg[i] & pg_mask;
5406         if (pg) {
5407             reg_off_last = i * 64 + 63 - clz64(pg);
5408             if (reg_off_first < 0) {
5409                 reg_off_first = i * 64 + ctz64(pg);
5410             }
5411         }
5412     } while (++i * 64 < reg_max);
5413 
5414     if (unlikely(reg_off_first < 0)) {
5415         /* No active elements, no pages touched. */
5416         return false;
5417     }
5418     tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
5419 
5420     info->reg_off_first[0] = reg_off_first;
5421     info->mem_off_first[0] = (reg_off_first >> esz) * msize;
5422     mem_off_last = (reg_off_last >> esz) * msize;
5423 
5424     page_split = -(addr | TARGET_PAGE_MASK);
5425     if (likely(mem_off_last + msize <= page_split)) {
5426         /* The entire operation fits within a single page. */
5427         info->reg_off_last[0] = reg_off_last;
5428         return true;
5429     }
5430 
5431     info->page_split = page_split;
5432     elt_split = page_split / msize;
5433     reg_off_split = elt_split << esz;
5434     mem_off_split = elt_split * msize;
5435 
5436     /*
5437      * This is the last full element on the first page, but it is not
5438      * necessarily active.  If there is no full element, i.e. the first
5439      * active element is the one that's split, this value remains -1.
5440      * It is useful as an iteration bound.
5441      */
5442     if (elt_split != 0) {
5443         info->reg_off_last[0] = reg_off_split - esize;
5444     }
5445 
5446     /* Determine if an unaligned element spans the pages.  */
5447     if (page_split % msize != 0) {
5448         /* It is helpful to know if the split element is active. */
5449         if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
5450             info->reg_off_split = reg_off_split;
5451             info->mem_off_split = mem_off_split;
5452 
5453             if (reg_off_split == reg_off_last) {
5454                 /* The page crossing element is last. */
5455                 return true;
5456             }
5457         }
5458         reg_off_split += esize;
5459         mem_off_split += msize;
5460     }
5461 
5462     /*
5463      * We do want the first active element on the second page, because
5464      * this may affect the address reported in an exception.
5465      */
5466     reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
5467     tcg_debug_assert(reg_off_split <= reg_off_last);
5468     info->reg_off_first[1] = reg_off_split;
5469     info->mem_off_first[1] = (reg_off_split >> esz) * msize;
5470     info->reg_off_last[1] = reg_off_last;
5471     return true;
5472 }
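
/*
 * Worked example (illustrative numbers, assuming a 4KiB target page):
 * with esz == MO_64 and msize == 8, a 256-bit vector (reg_max == 32),
 * every element active, and addr == page_base + 0xff8, we get
 * page_split == 8, so exactly one element fits on the first page.
 * The function then records reg_off_first[0] == 0, reg_off_last[0] == 0,
 * leaves reg_off_split == -1 (the boundary falls between elements),
 * and sets reg_off_first[1] == 8, reg_off_last[1] == 24.
 */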
5473 
5474 /*
5475  * Resolve the guest virtual addresses to info->page[].
5476  * Control the generation of page faults with @fault.  Return false if
5477  * there is no work to do, which can only happen with @fault == FAULT_NO.
5478  */
5479 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
5480                          CPUARMState *env, target_ulong addr,
5481                          MMUAccessType access_type, uintptr_t retaddr)
5482 {
5483     int mmu_idx = cpu_mmu_index(env, false);
5484     int mem_off = info->mem_off_first[0];
5485     bool nofault = fault == FAULT_NO;
5486     bool have_work = true;
5487 
5488     if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
5489                         access_type, mmu_idx, retaddr)) {
5490         /* No work to be done. */
5491         return false;
5492     }
5493 
5494     if (likely(info->page_split < 0)) {
5495         /* The entire operation was on the one page. */
5496         return true;
5497     }
5498 
5499     /*
5500      * If the second page is invalid, then we want the fault address to be
5501      * the first byte on that page which is accessed.
5502      */
5503     if (info->mem_off_split >= 0) {
5504         /*
5505          * There is an element split across the pages.  The fault address
5506          * should be the first byte of the second page.
5507          */
5508         mem_off = info->page_split;
5509         /*
5510          * If the split element is also the first active element
5511          * of the vector, then:  For first-fault we should continue
5512          * to generate faults for the second page.  For no-fault,
5513          * we have work only if the second page is valid.
5514          */
5515         if (info->mem_off_first[0] < info->mem_off_split) {
5516             nofault = FAULT_FIRST;
5517             have_work = false;
5518         }
5519     } else {
5520         /*
5521          * There is no element split across the pages.  The fault address
5522          * should be the first active element on the second page.
5523          */
5524         mem_off = info->mem_off_first[1];
5525         /*
5526          * There must have been one active element on the first page,
5527          * so we're out of first-fault territory.
5528          */
5529         nofault = fault != FAULT_ALL;
5530     }
5531 
5532     have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
5533                                 access_type, mmu_idx, retaddr);
5534     return have_work;
5535 }
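
/*
 * Usage sketch: the contiguous LD/ST helpers below probe with
 *     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
 * and can ignore the result, because FAULT_ALL has already raised for
 * any invalid page.  The first-fault/no-fault path passes FAULT_FIRST
 * or FAULT_NO instead and, when the call returns false, records a
 * fault rather than loading anything.
 */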
5536 
5537 #ifndef CONFIG_USER_ONLY
5538 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
5539                                uint64_t *vg, target_ulong addr,
5540                                int esize, int msize, int wp_access,
5541                                uintptr_t retaddr)
5542 {
5543     intptr_t mem_off, reg_off, reg_last;
5544     int flags0 = info->page[0].flags;
5545     int flags1 = info->page[1].flags;
5546 
5547     if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
5548         return;
5549     }
5550 
5551     /* Indicate that watchpoints are handled. */
5552     info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
5553     info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
5554 
5555     if (flags0 & TLB_WATCHPOINT) {
5556         mem_off = info->mem_off_first[0];
5557         reg_off = info->reg_off_first[0];
5558         reg_last = info->reg_off_last[0];
5559 
5560         while (reg_off <= reg_last) {
5561             uint64_t pg = vg[reg_off >> 6];
5562             do {
5563                 if ((pg >> (reg_off & 63)) & 1) {
5564                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5565                                          msize, info->page[0].attrs,
5566                                          wp_access, retaddr);
5567                 }
5568                 reg_off += esize;
5569                 mem_off += msize;
5570             } while (reg_off <= reg_last && (reg_off & 63));
5571         }
5572     }
5573 
5574     mem_off = info->mem_off_split;
5575     if (mem_off >= 0) {
5576         cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
5577                              info->page[0].attrs, wp_access, retaddr);
5578     }
5579 
5580     mem_off = info->mem_off_first[1];
5581     if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
5582         reg_off = info->reg_off_first[1];
5583         reg_last = info->reg_off_last[1];
5584 
5585         do {
5586             uint64_t pg = vg[reg_off >> 6];
5587             do {
5588                 if ((pg >> (reg_off & 63)) & 1) {
5589                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5590                                          msize, info->page[1].attrs,
5591                                          wp_access, retaddr);
5592                 }
5593                 reg_off += esize;
5594                 mem_off += msize;
5595             } while (reg_off & 63);
5596         } while (reg_off <= reg_last);
5597     }
5598 }
5599 #endif
5600 
5601 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5602                              uint64_t *vg, target_ulong addr, int esize,
5603                              int msize, uint32_t mtedesc, uintptr_t ra)
5604 {
5605     intptr_t mem_off, reg_off, reg_last;
5606 
5607     /* Process the page only if MemAttr == Tagged. */
5608     if (info->page[0].tagged) {
5609         mem_off = info->mem_off_first[0];
5610         reg_off = info->reg_off_first[0];
5611         reg_last = info->reg_off_split;
5612         if (reg_last < 0) {
5613             reg_last = info->reg_off_last[0];
5614         }
5615 
5616         do {
5617             uint64_t pg = vg[reg_off >> 6];
5618             do {
5619                 if ((pg >> (reg_off & 63)) & 1) {
5620                     mte_check(env, mtedesc, addr, ra);
5621                 }
5622                 reg_off += esize;
5623                 mem_off += msize;
5624             } while (reg_off <= reg_last && (reg_off & 63));
5625         } while (reg_off <= reg_last);
5626     }
5627 
5628     mem_off = info->mem_off_first[1];
5629     if (mem_off >= 0 && info->page[1].tagged) {
5630         reg_off = info->reg_off_first[1];
5631         reg_last = info->reg_off_last[1];
5632 
5633         do {
5634             uint64_t pg = vg[reg_off >> 6];
5635             do {
5636                 if ((pg >> (reg_off & 63)) & 1) {
5637                     mte_check(env, mtedesc, addr, ra);
5638                 }
5639                 reg_off += esize;
5640                 mem_off += msize;
5641             } while (reg_off & 63);
5642         } while (reg_off <= reg_last);
5643     }
5644 }
5645 
5646 /*
5647  * Common helper for all contiguous 1,2,3,4-register predicated loads.
5648  */
5649 static inline QEMU_ALWAYS_INLINE
5650 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
5651                uint32_t desc, const uintptr_t retaddr,
5652                const int esz, const int msz, const int N, uint32_t mtedesc,
5653                sve_ldst1_host_fn *host_fn,
5654                sve_ldst1_tlb_fn *tlb_fn)
5655 {
5656     const unsigned rd = simd_data(desc);
5657     const intptr_t reg_max = simd_oprsz(desc);
5658     intptr_t reg_off, reg_last, mem_off;
5659     SVEContLdSt info;
5660     void *host;
5661     int flags, i;
5662 
5663     /* Find the active elements.  */
5664     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5665         /* The entire predicate was false; no load occurs.  */
5666         for (i = 0; i < N; ++i) {
5667             memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5668         }
5669         return;
5670     }
5671 
5672     /* Probe the page(s).  Exit with exception for any invalid page. */
5673     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
5674 
5675     /* Handle watchpoints for all active elements. */
5676     sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5677                               BP_MEM_READ, retaddr);
5678 
5679     /*
5680      * Handle mte checks for all active elements.
5681      * Since TBI must be set for MTE, !mtedesc => !mte_active.
5682      */
5683     if (mtedesc) {
5684         sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5685                                 mtedesc, retaddr);
5686     }
5687 
5688     flags = info.page[0].flags | info.page[1].flags;
5689     if (unlikely(flags != 0)) {
5690 #ifdef CONFIG_USER_ONLY
5691         g_assert_not_reached();
5692 #else
5693         /*
5694          * At least one page includes MMIO.
5695          * Any bus operation can fail with cpu_transaction_failed,
5696          * which for ARM will raise SyncExternal.  Perform the load
5697          * into scratch memory to preserve register state until the end.
5698          */
5699         ARMVectorReg scratch[4] = { };
5700 
5701         mem_off = info.mem_off_first[0];
5702         reg_off = info.reg_off_first[0];
5703         reg_last = info.reg_off_last[1];
5704         if (reg_last < 0) {
5705             reg_last = info.reg_off_split;
5706             if (reg_last < 0) {
5707                 reg_last = info.reg_off_last[0];
5708             }
5709         }
5710 
5711         do {
5712             uint64_t pg = vg[reg_off >> 6];
5713             do {
5714                 if ((pg >> (reg_off & 63)) & 1) {
5715                     for (i = 0; i < N; ++i) {
5716                         tlb_fn(env, &scratch[i], reg_off,
5717                                addr + mem_off + (i << msz), retaddr);
5718                     }
5719                 }
5720                 reg_off += 1 << esz;
5721                 mem_off += N << msz;
5722             } while (reg_off & 63);
5723         } while (reg_off <= reg_last);
5724 
5725         for (i = 0; i < N; ++i) {
5726             memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
5727         }
5728         return;
5729 #endif
5730     }
5731 
5732     /* The entire operation is in RAM, on valid pages. */
5733 
5734     for (i = 0; i < N; ++i) {
5735         memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5736     }
5737 
5738     mem_off = info.mem_off_first[0];
5739     reg_off = info.reg_off_first[0];
5740     reg_last = info.reg_off_last[0];
5741     host = info.page[0].host;
5742 
5743     while (reg_off <= reg_last) {
5744         uint64_t pg = vg[reg_off >> 6];
5745         do {
5746             if ((pg >> (reg_off & 63)) & 1) {
5747                 for (i = 0; i < N; ++i) {
5748                     host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5749                             host + mem_off + (i << msz));
5750                 }
5751             }
5752             reg_off += 1 << esz;
5753             mem_off += N << msz;
5754         } while (reg_off <= reg_last && (reg_off & 63));
5755     }
5756 
5757     /*
5758      * Use the slow path to manage the cross-page misalignment.
5759      * But we know this is RAM and cannot trap.
5760      */
5761     mem_off = info.mem_off_split;
5762     if (unlikely(mem_off >= 0)) {
5763         reg_off = info.reg_off_split;
5764         for (i = 0; i < N; ++i) {
5765             tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5766                    addr + mem_off + (i << msz), retaddr);
5767         }
5768     }
5769 
5770     mem_off = info.mem_off_first[1];
5771     if (unlikely(mem_off >= 0)) {
5772         reg_off = info.reg_off_first[1];
5773         reg_last = info.reg_off_last[1];
5774         host = info.page[1].host;
5775 
5776         do {
5777             uint64_t pg = vg[reg_off >> 6];
5778             do {
5779                 if ((pg >> (reg_off & 63)) & 1) {
5780                     for (i = 0; i < N; ++i) {
5781                         host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5782                                 host + mem_off + (i << msz));
5783                     }
5784                 }
5785                 reg_off += 1 << esz;
5786                 mem_off += N << msz;
5787             } while (reg_off & 63);
5788         } while (reg_off <= reg_last);
5789     }
5790 }
5791 
5792 static inline QEMU_ALWAYS_INLINE
5793 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5794                    uint32_t desc, const uintptr_t ra,
5795                    const int esz, const int msz, const int N,
5796                    sve_ldst1_host_fn *host_fn,
5797                    sve_ldst1_tlb_fn *tlb_fn)
5798 {
5799     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5800     int bit55 = extract64(addr, 55, 1);
5801 
5802     /* Remove mtedesc from the normal sve descriptor. */
5803     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5804 
5805     /* Perform gross MTE suppression early. */
5806     if (!tbi_check(desc, bit55) ||
5807         tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
5808         mtedesc = 0;
5809     }
5810 
5811     sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
5812 }
5813 
5814 #define DO_LD1_1(NAME, ESZ)                                             \
5815 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg,                 \
5816                             target_ulong addr, uint32_t desc)           \
5817 {                                                                       \
5818     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0,            \
5819               sve_##NAME##_host, sve_##NAME##_tlb);                     \
5820 }                                                                       \
5821 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg,             \
5822                                 target_ulong addr, uint32_t desc)       \
5823 {                                                                       \
5824     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1,           \
5825                   sve_##NAME##_host, sve_##NAME##_tlb);                 \
5826 }
5827 
5828 #define DO_LD1_2(NAME, ESZ, MSZ)                                        \
5829 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg,              \
5830                                target_ulong addr, uint32_t desc)        \
5831 {                                                                       \
5832     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
5833               sve_##NAME##_le_host, sve_##NAME##_le_tlb);               \
5834 }                                                                       \
5835 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg,              \
5836                                target_ulong addr, uint32_t desc)        \
5837 {                                                                       \
5838     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
5839               sve_##NAME##_be_host, sve_##NAME##_be_tlb);               \
5840 }                                                                       \
5841 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg,          \
5842                                    target_ulong addr, uint32_t desc)    \
5843 {                                                                       \
5844     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
5845                   sve_##NAME##_le_host, sve_##NAME##_le_tlb);           \
5846 }                                                                       \
5847 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg,          \
5848                                    target_ulong addr, uint32_t desc)    \
5849 {                                                                       \
5850     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
5851                   sve_##NAME##_be_host, sve_##NAME##_be_tlb);           \
5852 }
5853 
5854 DO_LD1_1(ld1bb,  MO_8)
5855 DO_LD1_1(ld1bhu, MO_16)
5856 DO_LD1_1(ld1bhs, MO_16)
5857 DO_LD1_1(ld1bsu, MO_32)
5858 DO_LD1_1(ld1bss, MO_32)
5859 DO_LD1_1(ld1bdu, MO_64)
5860 DO_LD1_1(ld1bds, MO_64)
5861 
5862 DO_LD1_2(ld1hh,  MO_16, MO_16)
5863 DO_LD1_2(ld1hsu, MO_32, MO_16)
5864 DO_LD1_2(ld1hss, MO_32, MO_16)
5865 DO_LD1_2(ld1hdu, MO_64, MO_16)
5866 DO_LD1_2(ld1hds, MO_64, MO_16)
5867 
5868 DO_LD1_2(ld1ss,  MO_32, MO_32)
5869 DO_LD1_2(ld1sdu, MO_64, MO_32)
5870 DO_LD1_2(ld1sds, MO_64, MO_32)
5871 
5872 DO_LD1_2(ld1dd,  MO_64, MO_64)
5873 
5874 #undef DO_LD1_1
5875 #undef DO_LD1_2
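
/*
 * For illustration, DO_LD1_1(ld1bb, MO_8) above expands (roughly) to:
 *
 *     void HELPER(sve_ld1bb_r)(CPUARMState *env, void *vg,
 *                              target_ulong addr, uint32_t desc)
 *     {
 *         sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, 1, 0,
 *                   sve_ld1bb_host, sve_ld1bb_tlb);
 *     }
 *
 * plus the matching sve_ld1bb_r_mte helper; the always-inline sve_ldN_r
 * is then specialized on the constant esz, msz and N arguments.
 */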
5876 
5877 #define DO_LDN_1(N)                                                     \
5878 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg,                \
5879                              target_ulong addr, uint32_t desc)          \
5880 {                                                                       \
5881     sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0,           \
5882               sve_ld1bb_host, sve_ld1bb_tlb);                           \
5883 }                                                                       \
5884 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg,            \
5885                                  target_ulong addr, uint32_t desc)      \
5886 {                                                                       \
5887     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N,          \
5888                   sve_ld1bb_host, sve_ld1bb_tlb);                       \
5889 }
5890 
5891 #define DO_LDN_2(N, SUFF, ESZ)                                          \
5892 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg,         \
5893                                     target_ulong addr, uint32_t desc)   \
5894 {                                                                       \
5895     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
5896               sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);         \
5897 }                                                                       \
5898 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg,         \
5899                                     target_ulong addr, uint32_t desc)   \
5900 {                                                                       \
5901     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
5902               sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);         \
5903 }                                                                       \
5904 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg,     \
5905                                         target_ulong addr, uint32_t desc) \
5906 {                                                                       \
5907     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
5908                   sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);     \
5909 }                                                                       \
5910 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg,     \
5911                                         target_ulong addr, uint32_t desc) \
5912 {                                                                       \
5913     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
5914                   sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);     \
5915 }
5916 
5917 DO_LDN_1(2)
5918 DO_LDN_1(3)
5919 DO_LDN_1(4)
5920 
5921 DO_LDN_2(2, hh, MO_16)
5922 DO_LDN_2(3, hh, MO_16)
5923 DO_LDN_2(4, hh, MO_16)
5924 
5925 DO_LDN_2(2, ss, MO_32)
5926 DO_LDN_2(3, ss, MO_32)
5927 DO_LDN_2(4, ss, MO_32)
5928 
5929 DO_LDN_2(2, dd, MO_64)
5930 DO_LDN_2(3, dd, MO_64)
5931 DO_LDN_2(4, dd, MO_64)
5932 
5933 #undef DO_LDN_1
5934 #undef DO_LDN_2
5935 
5936 /*
5937  * Load contiguous data, first-fault and no-fault.
5938  *
5939  * For user-only, one could argue that we should hold the mmap_lock during
5940  * the operation so that there is no race between page_check_range and the
5941  * load operation.  However, unmapping pages out from under a running thread
5942  * is extraordinarily unlikely.  This theoretical race condition also affects
5943  * linux-user/ in its get_user/put_user macros.
5944  *
5945  * TODO: Construct some helpers, written in assembly, that interact with
5946  * host_signal_handler to produce memory ops which can properly report errors
5947  * without racing.
5948  */
5949 
5950 /* Fault on byte I.  All bits in FFR from I are cleared.  The vector
5951  * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
5952  * option, which leaves subsequent data unchanged.
5953  */
5954 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
5955 {
5956     uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
5957 
5958     if (i & 63) {
5959         ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
5960         i = ROUND_UP(i, 64);
5961     }
5962     for (; i < oprsz; i += 64) {
5963         ffr[i / 64] = 0;
5964     }
5965 }
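
/*
 * For example (illustrative values): record_fault(env, 24, 64) keeps
 * FFR bits [23:0] via ffr[0] &= MAKE_64BIT_MASK(0, 24) and clears every
 * bit from 24 upward, so the faulting element and all later elements
 * read as inactive in FFR.
 */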
5966 
5967 /*
5968  * Common helper for all contiguous no-fault and first-fault loads.
5969  */
5970 static inline QEMU_ALWAYS_INLINE
5971 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
5972                    uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
5973                    const int esz, const int msz, const SVEContFault fault,
5974                    sve_ldst1_host_fn *host_fn,
5975                    sve_ldst1_tlb_fn *tlb_fn)
5976 {
5977     const unsigned rd = simd_data(desc);
5978     void *vd = &env->vfp.zregs[rd];
5979     const intptr_t reg_max = simd_oprsz(desc);
5980     intptr_t reg_off, mem_off, reg_last;
5981     SVEContLdSt info;
5982     int flags;
5983     void *host;
5984 
5985     /* Find the active elements.  */
5986     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
5987         /* The entire predicate was false; no load occurs.  */
5988         memset(vd, 0, reg_max);
5989         return;
5990     }
5991     reg_off = info.reg_off_first[0];
5992 
5993     /* Probe the page(s). */
5994     if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
5995         /* Fault on first element. */
5996         tcg_debug_assert(fault == FAULT_NO);
5997         memset(vd, 0, reg_max);
5998         goto do_fault;
5999     }
6000 
6001     mem_off = info.mem_off_first[0];
6002     flags = info.page[0].flags;
6003 
6004     /*
6005      * Disable MTE checking if the Tagged bit is not set.  Since TBI must
6006      * be set within MTEDESC for MTE, !mtedesc => !mte_active.
6007      */
6008     if (!info.page[0].tagged) {
6009         mtedesc = 0;
6010     }
6011 
6012     if (fault == FAULT_FIRST) {
6013         /* Trapping mte check for the first-fault element.  */
6014         if (mtedesc) {
6015             mte_check(env, mtedesc, addr + mem_off, retaddr);
6016         }
6017 
6018         /*
6019          * Special handling of the first active element,
6020          * if it crosses a page boundary or is MMIO.
6021          */
6022         bool is_split = mem_off == info.mem_off_split;
6023         if (unlikely(flags != 0) || unlikely(is_split)) {
6024             /*
6025              * Use the slow path for cross-page handling.
6026              * Might trap for MMIO or watchpoints.
6027              */
6028             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6029 
6030             /* After any fault, zero the other elements. */
6031             swap_memzero(vd, reg_off);
6032             reg_off += 1 << esz;
6033             mem_off += 1 << msz;
6034             swap_memzero(vd + reg_off, reg_max - reg_off);
6035 
6036             if (is_split) {
6037                 goto second_page;
6038             }
6039         } else {
6040             memset(vd, 0, reg_max);
6041         }
6042     } else {
6043         memset(vd, 0, reg_max);
6044         if (unlikely(mem_off == info.mem_off_split)) {
6045             /* The first active element crosses a page boundary. */
6046             flags |= info.page[1].flags;
6047             if (unlikely(flags & TLB_MMIO)) {
6048                 /* Some page is MMIO, see below. */
6049                 goto do_fault;
6050             }
6051             if (unlikely(flags & TLB_WATCHPOINT) &&
6052                 (cpu_watchpoint_address_matches
6053                  (env_cpu(env), addr + mem_off, 1 << msz)
6054                  & BP_MEM_READ)) {
6055                 /* Watchpoint hit, see below. */
6056                 goto do_fault;
6057             }
6058             if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6059                 goto do_fault;
6060             }
6061             /*
6062              * Use the slow path for cross-page handling.
6063              * This is RAM, without a watchpoint, and will not trap.
6064              */
6065             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6066             goto second_page;
6067         }
6068     }
6069 
6070     /*
6071      * From this point on, all memory operations are MemSingleNF.
6072      *
6073      * Per the MemSingleNF pseudocode, a no-fault load from Device memory
6074      * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
6075      *
6076      * Unfortunately we do not have access to the memory attributes from the
6077      * PTE to tell Device memory from Normal memory.  So we make a mostly
6078      * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
6079      * This gives the right answer for the common cases of "Normal memory,
6080      * backed by host RAM" and "Device memory, backed by MMIO".
6081      * The architecture allows us to suppress an NF load and return
6082      * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
6083      * case of "Normal memory, backed by MMIO" is permitted.  The case we
6084      * get wrong is "Device memory, backed by host RAM", for which we
6085      * should return (UNKNOWN, FAULT) but do not.
6086      *
6087      * Similarly, CPU_BP breakpoints would raise exceptions, and so
6088      * return (UNKNOWN, FAULT).  For simplicity, we consider gdb and
6089      * architectural breakpoints the same.
6090      */
6091     if (unlikely(flags & TLB_MMIO)) {
6092         goto do_fault;
6093     }
6094 
6095     reg_last = info.reg_off_last[0];
6096     host = info.page[0].host;
6097 
6098     do {
6099         uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
6100         do {
6101             if ((pg >> (reg_off & 63)) & 1) {
6102                 if (unlikely(flags & TLB_WATCHPOINT) &&
6103                     (cpu_watchpoint_address_matches
6104                      (env_cpu(env), addr + mem_off, 1 << msz)
6105                      & BP_MEM_READ)) {
6106                     goto do_fault;
6107                 }
6108                 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6109                     goto do_fault;
6110                 }
6111                 host_fn(vd, reg_off, host + mem_off);
6112             }
6113             reg_off += 1 << esz;
6114             mem_off += 1 << msz;
6115         } while (reg_off <= reg_last && (reg_off & 63));
6116     } while (reg_off <= reg_last);
6117 
6118     /*
6119      * MemSingleNF is allowed to fail for any reason.  We have special
6120      * code above to handle the first element crossing a page boundary.
6121      * As an implementation choice, decline to handle a cross-page element
6122      * in any other position.
6123      */
6124     reg_off = info.reg_off_split;
6125     if (reg_off >= 0) {
6126         goto do_fault;
6127     }
6128 
6129  second_page:
6130     reg_off = info.reg_off_first[1];
6131     if (likely(reg_off < 0)) {
6132         /* No active elements on the second page.  All done. */
6133         return;
6134     }
6135 
6136     /*
6137      * MemSingleNF is allowed to fail for any reason.  As an implementation
6138      * choice, decline to handle elements on the second page.  This should
6139      * be low frequency as the guest walks through memory -- the next
6140      * iteration of the guest's loop should be aligned on the page boundary,
6141      * and then all following iterations will stay aligned.
6142      */
6143 
6144  do_fault:
6145     record_fault(env, reg_off, reg_max);
6146 }
6147 
6148 static inline QEMU_ALWAYS_INLINE
6149 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6150                        uint32_t desc, const uintptr_t retaddr,
6151                        const int esz, const int msz, const SVEContFault fault,
6152                        sve_ldst1_host_fn *host_fn,
6153                        sve_ldst1_tlb_fn *tlb_fn)
6154 {
6155     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6156     int bit55 = extract64(addr, 55, 1);
6157 
6158     /* Remove mtedesc from the normal sve descriptor. */
6159     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6160 
6161     /* Perform gross MTE suppression early. */
6162     if (!tbi_check(desc, bit55) ||
6163         tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6164         mtedesc = 0;
6165     }
6166 
6167     sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6168                   esz, msz, fault, host_fn, tlb_fn);
6169 }
6170 
6171 #define DO_LDFF1_LDNF1_1(PART, ESZ)                                     \
6172 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg,            \
6173                                  target_ulong addr, uint32_t desc)      \
6174 {                                                                       \
6175     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
6176                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6177 }                                                                       \
6178 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg,            \
6179                                  target_ulong addr, uint32_t desc)      \
6180 {                                                                       \
6181     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
6182                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6183 }                                                                       \
6184 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg,        \
6185                                      target_ulong addr, uint32_t desc)  \
6186 {                                                                       \
6187     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
6188                       sve_ld1##PART##_host, sve_ld1##PART##_tlb);       \
6189 }                                                                       \
6190 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg,        \
6191                                      target_ulong addr, uint32_t desc)  \
6192 {                                                                       \
6193     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
6194                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6195 }
6196 
6197 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ)                                \
6198 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg,         \
6199                                     target_ulong addr, uint32_t desc)   \
6200 {                                                                       \
6201     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6202                   sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
6203 }                                                                       \
6204 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg,         \
6205                                     target_ulong addr, uint32_t desc)   \
6206 {                                                                       \
6207     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
6208                   sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
6209 }                                                                       \
6210 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg,         \
6211                                     target_ulong addr, uint32_t desc)   \
6212 {                                                                       \
6213     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6214                   sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
6215 }                                                                       \
6216 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg,         \
6217                                     target_ulong addr, uint32_t desc)   \
6218 {                                                                       \
6219     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
6220                   sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
6221 }                                                                       \
6222 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
6223                                         target_ulong addr, uint32_t desc) \
6224 {                                                                       \
6225     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6226                       sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6227 }                                                                       \
6228 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
6229                                         target_ulong addr, uint32_t desc) \
6230 {                                                                       \
6231     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6232                       sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6233 }                                                                       \
6234 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
6235                                         target_ulong addr, uint32_t desc) \
6236 {                                                                       \
6237     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6238                       sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6239 }                                                                       \
6240 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
6241                                         target_ulong addr, uint32_t desc) \
6242 {                                                                       \
6243     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6244                       sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6245 }
6246 
6247 DO_LDFF1_LDNF1_1(bb,  MO_8)
6248 DO_LDFF1_LDNF1_1(bhu, MO_16)
6249 DO_LDFF1_LDNF1_1(bhs, MO_16)
6250 DO_LDFF1_LDNF1_1(bsu, MO_32)
6251 DO_LDFF1_LDNF1_1(bss, MO_32)
6252 DO_LDFF1_LDNF1_1(bdu, MO_64)
6253 DO_LDFF1_LDNF1_1(bds, MO_64)
6254 
6255 DO_LDFF1_LDNF1_2(hh,  MO_16, MO_16)
6256 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
6257 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
6258 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
6259 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
6260 
6261 DO_LDFF1_LDNF1_2(ss,  MO_32, MO_32)
6262 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
6263 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
6264 
6265 DO_LDFF1_LDNF1_2(dd,  MO_64, MO_64)
6266 
6267 #undef DO_LDFF1_LDNF1_1
6268 #undef DO_LDFF1_LDNF1_2
6269 
6270 /*
6271  * Common helper for all contiguous 1,2,3,4-register predicated stores.
6272  */
6273 
6274 static inline QEMU_ALWAYS_INLINE
6275 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
6276                uint32_t desc, const uintptr_t retaddr,
6277                const int esz, const int msz, const int N, uint32_t mtedesc,
6278                sve_ldst1_host_fn *host_fn,
6279                sve_ldst1_tlb_fn *tlb_fn)
6280 {
6281     const unsigned rd = simd_data(desc);
6282     const intptr_t reg_max = simd_oprsz(desc);
6283     intptr_t reg_off, reg_last, mem_off;
6284     SVEContLdSt info;
6285     void *host;
6286     int i, flags;
6287 
6288     /* Find the active elements.  */
6289     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
6290         /* The entire predicate was false; no store occurs.  */
6291         return;
6292     }
6293 
6294     /* Probe the page(s).  Exit with exception for any invalid page. */
6295     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
6296 
6297     /* Handle watchpoints for all active elements. */
6298     sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
6299                               BP_MEM_WRITE, retaddr);
6300 
6301     /*
6302      * Handle mte checks for all active elements.
6303      * Since TBI must be set for MTE, !mtedesc => !mte_active.
6304      */
6305     if (mtedesc) {
6306         sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
6307                                 mtedesc, retaddr);
6308     }
6309 
6310     flags = info.page[0].flags | info.page[1].flags;
6311     if (unlikely(flags != 0)) {
6312 #ifdef CONFIG_USER_ONLY
6313         g_assert_not_reached();
6314 #else
6315         /*
6316          * At least one page includes MMIO.
6317          * Any bus operation can fail with cpu_transaction_failed,
6318          * which for ARM will raise SyncExternal.  We cannot avoid
6319          * this fault and will leave with the store incomplete.
6320          */
6321         mem_off = info.mem_off_first[0];
6322         reg_off = info.reg_off_first[0];
6323         reg_last = info.reg_off_last[1];
6324         if (reg_last < 0) {
6325             reg_last = info.reg_off_split;
6326             if (reg_last < 0) {
6327                 reg_last = info.reg_off_last[0];
6328             }
6329         }
6330 
6331         do {
6332             uint64_t pg = vg[reg_off >> 6];
6333             do {
6334                 if ((pg >> (reg_off & 63)) & 1) {
6335                     for (i = 0; i < N; ++i) {
6336                         tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6337                                addr + mem_off + (i << msz), retaddr);
6338                     }
6339                 }
6340                 reg_off += 1 << esz;
6341                 mem_off += N << msz;
6342             } while (reg_off & 63);
6343         } while (reg_off <= reg_last);
6344         return;
6345 #endif
6346     }
6347 
6348     mem_off = info.mem_off_first[0];
6349     reg_off = info.reg_off_first[0];
6350     reg_last = info.reg_off_last[0];
6351     host = info.page[0].host;
6352 
6353     while (reg_off <= reg_last) {
6354         uint64_t pg = vg[reg_off >> 6];
6355         do {
6356             if ((pg >> (reg_off & 63)) & 1) {
6357                 for (i = 0; i < N; ++i) {
6358                     host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6359                             host + mem_off + (i << msz));
6360                 }
6361             }
6362             reg_off += 1 << esz;
6363             mem_off += N << msz;
6364         } while (reg_off <= reg_last && (reg_off & 63));
6365     }
6366 
6367     /*
6368      * Use the slow path to manage the cross-page misalignment.
6369      * But we know this is RAM and cannot trap.
6370      */
6371     mem_off = info.mem_off_split;
6372     if (unlikely(mem_off >= 0)) {
6373         reg_off = info.reg_off_split;
6374         for (i = 0; i < N; ++i) {
6375             tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6376                    addr + mem_off + (i << msz), retaddr);
6377         }
6378     }
6379 
6380     mem_off = info.mem_off_first[1];
6381     if (unlikely(mem_off >= 0)) {
6382         reg_off = info.reg_off_first[1];
6383         reg_last = info.reg_off_last[1];
6384         host = info.page[1].host;
6385 
6386         do {
6387             uint64_t pg = vg[reg_off >> 6];
6388             do {
6389                 if ((pg >> (reg_off & 63)) & 1) {
6390                     for (i = 0; i < N; ++i) {
6391                         host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6392                                 host + mem_off + (i << msz));
6393                     }
6394                 }
6395                 reg_off += 1 << esz;
6396                 mem_off += N << msz;
6397             } while (reg_off & 63);
6398         } while (reg_off <= reg_last);
6399     }
6400 }
6401 
6402 static inline QEMU_ALWAYS_INLINE
6403 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6404                    uint32_t desc, const uintptr_t ra,
6405                    const int esz, const int msz, const int N,
6406                    sve_ldst1_host_fn *host_fn,
6407                    sve_ldst1_tlb_fn *tlb_fn)
6408 {
6409     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6410     int bit55 = extract64(addr, 55, 1);
6411 
6412     /* Remove mtedesc from the normal sve descriptor. */
6413     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6414 
6415     /* Perform gross MTE suppression early. */
6416     if (!tbi_check(desc, bit55) ||
6417         tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6418         mtedesc = 0;
6419     }
6420 
6421     sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6422 }
6423 
6424 #define DO_STN_1(N, NAME, ESZ)                                          \
6425 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg,            \
6426                                  target_ulong addr, uint32_t desc)      \
6427 {                                                                       \
6428     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0,            \
6429               sve_st1##NAME##_host, sve_st1##NAME##_tlb);               \
6430 }                                                                       \
6431 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg,        \
6432                                      target_ulong addr, uint32_t desc)  \
6433 {                                                                       \
6434     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N,           \
6435                   sve_st1##NAME##_host, sve_st1##NAME##_tlb);           \
6436 }
6437 
6438 #define DO_STN_2(N, NAME, ESZ, MSZ)                                     \
6439 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg,         \
6440                                     target_ulong addr, uint32_t desc)   \
6441 {                                                                       \
6442     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
6443               sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);         \
6444 }                                                                       \
6445 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg,         \
6446                                     target_ulong addr, uint32_t desc)   \
6447 {                                                                       \
6448     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
6449               sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);         \
6450 }                                                                       \
6451 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg,     \
6452                                         target_ulong addr, uint32_t desc) \
6453 {                                                                       \
6454     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
6455                   sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);     \
6456 }                                                                       \
6457 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg,     \
6458                                         target_ulong addr, uint32_t desc) \
6459 {                                                                       \
6460     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
6461                   sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);     \
6462 }
6463 
6464 DO_STN_1(1, bb, MO_8)
6465 DO_STN_1(1, bh, MO_16)
6466 DO_STN_1(1, bs, MO_32)
6467 DO_STN_1(1, bd, MO_64)
6468 DO_STN_1(2, bb, MO_8)
6469 DO_STN_1(3, bb, MO_8)
6470 DO_STN_1(4, bb, MO_8)
6471 
6472 DO_STN_2(1, hh, MO_16, MO_16)
6473 DO_STN_2(1, hs, MO_32, MO_16)
6474 DO_STN_2(1, hd, MO_64, MO_16)
6475 DO_STN_2(2, hh, MO_16, MO_16)
6476 DO_STN_2(3, hh, MO_16, MO_16)
6477 DO_STN_2(4, hh, MO_16, MO_16)
6478 
6479 DO_STN_2(1, ss, MO_32, MO_32)
6480 DO_STN_2(1, sd, MO_64, MO_32)
6481 DO_STN_2(2, ss, MO_32, MO_32)
6482 DO_STN_2(3, ss, MO_32, MO_32)
6483 DO_STN_2(4, ss, MO_32, MO_32)
6484 
6485 DO_STN_2(1, dd, MO_64, MO_64)
6486 DO_STN_2(2, dd, MO_64, MO_64)
6487 DO_STN_2(3, dd, MO_64, MO_64)
6488 DO_STN_2(4, dd, MO_64, MO_64)
6489 
6490 #undef DO_STN_1
6491 #undef DO_STN_2
6492 
6493 /*
6494  * Loads with a vector index.
6495  */
6496 
6497 /*
6498  * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
6499  */
6500 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
6501 
6502 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
6503 {
6504     return *(uint32_t *)(reg + H1_4(reg_ofs));
6505 }
6506 
6507 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
6508 {
6509     return *(int32_t *)(reg + H1_4(reg_ofs));
6510 }
6511 
6512 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
6513 {
6514     return (uint32_t)*(uint64_t *)(reg + reg_ofs);
6515 }
6516 
6517 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
6518 {
6519     return (int32_t)*(uint64_t *)(reg + reg_ofs);
6520 }
6521 
6522 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
6523 {
6524     return *(uint64_t *)(reg + reg_ofs);
6525 }
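
/*
 * For example (illustrative values): given 0xfffffffc as the 32-bit
 * index element, off_zss_s sign-extends and returns (target_ulong)-4,
 * while off_zsu_s zero-extends and returns 0xfffffffc; the gather
 * helpers below then compute base + (off_fn(vm, reg_off) << scale).
 */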
6526 
6527 static inline QEMU_ALWAYS_INLINE
6528 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6529                target_ulong base, uint32_t desc, uintptr_t retaddr,
6530                uint32_t mtedesc, int esize, int msize,
6531                zreg_off_fn *off_fn,
6532                sve_ldst1_host_fn *host_fn,
6533                sve_ldst1_tlb_fn *tlb_fn)
6534 {
6535     const int mmu_idx = cpu_mmu_index(env, false);
6536     const intptr_t reg_max = simd_oprsz(desc);
6537     const int scale = simd_data(desc);
6538     ARMVectorReg scratch;
6539     intptr_t reg_off;
6540     SVEHostPage info, info2;
6541 
6542     memset(&scratch, 0, reg_max);
6543     reg_off = 0;
6544     do {
6545         uint64_t pg = vg[reg_off >> 6];
6546         do {
6547             if (likely(pg & 1)) {
6548                 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6549                 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6550 
6551                 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
6552                                mmu_idx, retaddr);
6553 
6554                 if (likely(in_page >= msize)) {
6555                     if (unlikely(info.flags & TLB_WATCHPOINT)) {
6556                         cpu_check_watchpoint(env_cpu(env), addr, msize,
6557                                              info.attrs, BP_MEM_READ, retaddr);
6558                     }
6559                     if (mtedesc && info.tagged) {
6560                         mte_check(env, mtedesc, addr, retaddr);
6561                     }
6562                     if (unlikely(info.flags & TLB_MMIO)) {
6563                         tlb_fn(env, &scratch, reg_off, addr, retaddr);
6564                     } else {
6565                         host_fn(&scratch, reg_off, info.host);
6566                     }
6567                 } else {
6568                     /* Element crosses the page boundary. */
6569                     sve_probe_page(&info2, false, env, addr + in_page, 0,
6570                                    MMU_DATA_LOAD, mmu_idx, retaddr);
6571                     if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
6572                         cpu_check_watchpoint(env_cpu(env), addr,
6573                                              msize, info.attrs,
6574                                              BP_MEM_READ, retaddr);
6575                     }
6576                     if (mtedesc && info.tagged) {
6577                         mte_check(env, mtedesc, addr, retaddr);
6578                     }
6579                     tlb_fn(env, &scratch, reg_off, addr, retaddr);
6580                 }
6581             }
6582             reg_off += esize;
6583             pg >>= esize;
6584         } while (reg_off & 63);
6585     } while (reg_off < reg_max);
6586 
6587     /* Wait until all exceptions have been raised to write back.  */
6588     memcpy(vd, &scratch, reg_max);
6589 }
6590 
6591 static inline QEMU_ALWAYS_INLINE
6592 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6593                    target_ulong base, uint32_t desc, uintptr_t retaddr,
6594                    int esize, int msize, zreg_off_fn *off_fn,
6595                    sve_ldst1_host_fn *host_fn,
6596                    sve_ldst1_tlb_fn *tlb_fn)
6597 {
6598     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6599     /* Remove mtedesc from the normal sve descriptor. */
6600     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6601 
6602     /*
6603      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6604      * offset base entirely over the address space hole to change the
6605      * pointer tag, or change the bit55 selector.  So we could examine
6606      * TBI + TCMA here like we do for sve_ldN_r_mte().
6607      */
6608     sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6609               esize, msize, off_fn, host_fn, tlb_fn);
6610 }
6611 
6612 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
6613 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
6614                                  void *vm, target_ulong base, uint32_t desc) \
6615 {                                                                            \
6616     sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,          \
6617               off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
6618 }                                                                            \
6619 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6620      void *vm, target_ulong base, uint32_t desc)                             \
6621 {                                                                            \
6622     sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,         \
6623                   off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
6624 }
6625 
6626 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
6627 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
6628                                  void *vm, target_ulong base, uint32_t desc) \
6629 {                                                                            \
6630     sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,          \
6631               off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
6632 }                                                                            \
6633 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6634     void *vm, target_ulong base, uint32_t desc)                              \
6635 {                                                                            \
6636     sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,         \
6637                   off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
6638 }
6639 
6640 DO_LD1_ZPZ_S(bsu, zsu, MO_8)
6641 DO_LD1_ZPZ_S(bsu, zss, MO_8)
6642 DO_LD1_ZPZ_D(bdu, zsu, MO_8)
6643 DO_LD1_ZPZ_D(bdu, zss, MO_8)
6644 DO_LD1_ZPZ_D(bdu, zd, MO_8)
6645 
6646 DO_LD1_ZPZ_S(bss, zsu, MO_8)
6647 DO_LD1_ZPZ_S(bss, zss, MO_8)
6648 DO_LD1_ZPZ_D(bds, zsu, MO_8)
6649 DO_LD1_ZPZ_D(bds, zss, MO_8)
6650 DO_LD1_ZPZ_D(bds, zd, MO_8)
6651 
6652 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
6653 DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
6654 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
6655 DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
6656 DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
6657 
6658 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
6659 DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
6660 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
6661 DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
6662 DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
6663 
6664 DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
6665 DO_LD1_ZPZ_S(hss_le, zss, MO_16)
6666 DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
6667 DO_LD1_ZPZ_D(hds_le, zss, MO_16)
6668 DO_LD1_ZPZ_D(hds_le, zd, MO_16)
6669 
6670 DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
6671 DO_LD1_ZPZ_S(hss_be, zss, MO_16)
6672 DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
6673 DO_LD1_ZPZ_D(hds_be, zss, MO_16)
6674 DO_LD1_ZPZ_D(hds_be, zd, MO_16)
6675 
6676 DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
6677 DO_LD1_ZPZ_S(ss_le, zss, MO_32)
6678 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
6679 DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
6680 DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
6681 
6682 DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
6683 DO_LD1_ZPZ_S(ss_be, zss, MO_32)
6684 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
6685 DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
6686 DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
6687 
6688 DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
6689 DO_LD1_ZPZ_D(sds_le, zss, MO_32)
6690 DO_LD1_ZPZ_D(sds_le, zd, MO_32)
6691 
6692 DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
6693 DO_LD1_ZPZ_D(sds_be, zss, MO_32)
6694 DO_LD1_ZPZ_D(sds_be, zd, MO_32)
6695 
6696 DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
6697 DO_LD1_ZPZ_D(dd_le, zss, MO_64)
6698 DO_LD1_ZPZ_D(dd_le, zd, MO_64)
6699 
6700 DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
6701 DO_LD1_ZPZ_D(dd_be, zss, MO_64)
6702 DO_LD1_ZPZ_D(dd_be, zd, MO_64)
6703 
6704 #undef DO_LD1_ZPZ_S
6705 #undef DO_LD1_ZPZ_D
6706 
6707 /* First-fault loads with a vector index.  */
6708 
6709 /*
6710  * Common helpers for all gather first-faulting loads.
6711  */
6712 
6713 static inline QEMU_ALWAYS_INLINE
6714 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6715                  target_ulong base, uint32_t desc, uintptr_t retaddr,
6716                  uint32_t mtedesc, const int esz, const int msz,
6717                  zreg_off_fn *off_fn,
6718                  sve_ldst1_host_fn *host_fn,
6719                  sve_ldst1_tlb_fn *tlb_fn)
6720 {
6721     const int mmu_idx = cpu_mmu_index(env, false);
6722     const intptr_t reg_max = simd_oprsz(desc);
6723     const int scale = simd_data(desc);
6724     const int esize = 1 << esz;
6725     const int msize = 1 << msz;
6726     intptr_t reg_off;
6727     SVEHostPage info;
6728     target_ulong addr, in_page;
6729 
6730     /* Skip to the first true predicate.  */
6731     reg_off = find_next_active(vg, 0, reg_max, esz);
6732     if (unlikely(reg_off >= reg_max)) {
6733         /* The entire predicate was false; no load occurs.  */
6734         memset(vd, 0, reg_max);
6735         return;
6736     }
6737 
6738     /*
6739      * Probe the first element, allowing faults.
6740      */
6741     addr = base + (off_fn(vm, reg_off) << scale);
6742     if (mtedesc) {
6743         mte_check(env, mtedesc, addr, retaddr);
6744     }
6745     tlb_fn(env, vd, reg_off, addr, retaddr);
6746 
6747     /* After any fault, zero the other elements. */
6748     swap_memzero(vd, reg_off);
6749     reg_off += esize;
6750     swap_memzero(vd + reg_off, reg_max - reg_off);
6751 
6752     /*
6753      * Probe the remaining elements, not allowing faults.
6754      */
6755     while (reg_off < reg_max) {
6756         uint64_t pg = vg[reg_off >> 6];
6757         do {
6758             if (likely((pg >> (reg_off & 63)) & 1)) {
6759                 addr = base + (off_fn(vm, reg_off) << scale);
6760                 in_page = -(addr | TARGET_PAGE_MASK);
6761 
6762                 if (unlikely(in_page < msize)) {
6763                     /* Stop if the element crosses a page boundary. */
6764                     goto fault;
6765                 }
6766 
6767                 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
6768                                mmu_idx, retaddr);
6769                 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
6770                     goto fault;
6771                 }
6772                 if (unlikely(info.flags & TLB_WATCHPOINT) &&
6773                     (cpu_watchpoint_address_matches
6774                      (env_cpu(env), addr, msize) & BP_MEM_READ)) {
6775                     goto fault;
6776                 }
6777                 if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) {
6778                     goto fault;
6779                 }
6780 
6781                 host_fn(vd, reg_off, info.host);
6782             }
6783             reg_off += esize;
6784         } while (reg_off & 63);
6785     }
6786     return;
6787 
6788  fault:
6789     record_fault(env, reg_off, reg_max);
6790 }
6791 
6792 static inline QEMU_ALWAYS_INLINE
6793 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6794                      target_ulong base, uint32_t desc, uintptr_t retaddr,
6795                      const int esz, const int msz,
6796                      zreg_off_fn *off_fn,
6797                      sve_ldst1_host_fn *host_fn,
6798                      sve_ldst1_tlb_fn *tlb_fn)
6799 {
6800     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6801     /* Remove mtedesc from the normal sve descriptor. */
6802     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6803 
6804     /*
6805      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6806      * move base across the address space hole, and so cannot change
6807      * the pointer tag or the bit-55 selector.  We could therefore
6808      * examine TBI + TCMA here, as we do for sve_ldN_r_mte().
6809      */
6810     sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6811                 esz, msz, off_fn, host_fn, tlb_fn);
6812 }
6813 
6814 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ)                                   \
6815 void HELPER(sve_ldff##MEM##_##OFS)                                      \
6816     (CPUARMState *env, void *vd, void *vg,                              \
6817      void *vm, target_ulong base, uint32_t desc)                        \
6818 {                                                                       \
6819     sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ,    \
6820                 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6821 }                                                                       \
6822 void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
6823     (CPUARMState *env, void *vd, void *vg,                              \
6824      void *vm, target_ulong base, uint32_t desc)                        \
6825 {                                                                       \
6826     sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ,   \
6827                     off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6828 }
6829 
6830 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ)                                   \
6831 void HELPER(sve_ldff##MEM##_##OFS)                                      \
6832     (CPUARMState *env, void *vd, void *vg,                              \
6833      void *vm, target_ulong base, uint32_t desc)                        \
6834 {                                                                       \
6835     sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ,    \
6836                 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6837 }                                                                       \
6838 void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
6839     (CPUARMState *env, void *vd, void *vg,                              \
6840      void *vm, target_ulong base, uint32_t desc)                        \
6841 {                                                                       \
6842     sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ,   \
6843                     off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6844 }
6845 
6846 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
6847 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
6848 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
6849 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
6850 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
6851 
6852 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
6853 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
6854 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
6855 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
6856 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
6857 
6858 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
6859 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
6860 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
6861 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
6862 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
6863 
6864 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
6865 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
6866 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
6867 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
6868 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
6869 
6870 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
6871 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
6872 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
6873 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
6874 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
6875 
6876 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
6877 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
6878 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
6879 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
6880 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
6881 
6882 DO_LDFF1_ZPZ_S(ss_le,  zsu, MO_32)
6883 DO_LDFF1_ZPZ_S(ss_le,  zss, MO_32)
6884 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
6885 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
6886 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
6887 
6888 DO_LDFF1_ZPZ_S(ss_be,  zsu, MO_32)
6889 DO_LDFF1_ZPZ_S(ss_be,  zss, MO_32)
6890 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
6891 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
6892 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
6893 
6894 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
6895 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
6896 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
6897 
6898 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
6899 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
6900 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
6901 
6902 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
6903 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
6904 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
6905 
6906 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
6907 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
6908 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
6909 
6910 /* Stores with a vector index.  */
6911 
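/*
 * Scatter stores are done in two passes: first probe every active element,
 * collecting host addresses and raising any exception, and only then
 * perform the stores.  Apart from MMIO accesses, no memory is modified if
 * any element faults.
 */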
6912 static inline QEMU_ALWAYS_INLINE
6913 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6914                target_ulong base, uint32_t desc, uintptr_t retaddr,
6915                uint32_t mtedesc, int esize, int msize,
6916                zreg_off_fn *off_fn,
6917                sve_ldst1_host_fn *host_fn,
6918                sve_ldst1_tlb_fn *tlb_fn)
6919 {
6920     const int mmu_idx = cpu_mmu_index(env, false);
6921     const intptr_t reg_max = simd_oprsz(desc);
6922     const int scale = simd_data(desc);
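    /*
     * One slot per element: the smallest element size used by these
     * helpers is 4 bytes, so ARM_MAX_VQ * 4 bounds the element count.
     */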
6923     void *host[ARM_MAX_VQ * 4];
6924     intptr_t reg_off, i;
6925     SVEHostPage info, info2;
6926 
6927     /*
6928      * Probe all of the elements for host addresses and flags.
6929      */
6930     i = reg_off = 0;
6931     do {
6932         uint64_t pg = vg[reg_off >> 6];
6933         do {
6934             target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6935             target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6936 
6937             host[i] = NULL;
6938             if (likely((pg >> (reg_off & 63)) & 1)) {
6939                 if (likely(in_page >= msize)) {
6940                     sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
6941                                    mmu_idx, retaddr);
6942                     if (!(info.flags & TLB_MMIO)) {
6943                         host[i] = info.host;
6944                     }
6945                 } else {
6946                     /*
6947                      * Element crosses the page boundary.
6948                      * Probe both pages, but do not record the host address,
6949                      * so that we use the slow path.
6950                      */
6951                     sve_probe_page(&info, false, env, addr, 0,
6952                                    MMU_DATA_STORE, mmu_idx, retaddr);
6953                     sve_probe_page(&info2, false, env, addr + in_page, 0,
6954                                    MMU_DATA_STORE, mmu_idx, retaddr);
6955                     info.flags |= info2.flags;
6956                 }
6957 
6958                 if (unlikely(info.flags & TLB_WATCHPOINT)) {
6959                     cpu_check_watchpoint(env_cpu(env), addr, msize,
6960                                          info.attrs, BP_MEM_WRITE, retaddr);
6961                 }
6962 
6963                 if (mtedesc && info.tagged) {
6964                     mte_check(env, mtedesc, addr, retaddr);
6965                 }
6966             }
6967             i += 1;
6968             reg_off += esize;
6969         } while (reg_off & 63);
6970     } while (reg_off < reg_max);
6971 
6972     /*
6973      * Now that we have recognized all exceptions except SyncExternal
6974      * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
6975      *
6976      * Note for the common case of an element in RAM, not crossing a page
6977      * boundary, we have stored the host address in host[].  This doubles
6978      * as a first-level check against the predicate, since only enabled
6979      * elements have non-null host addresses.
6980      */
6981     i = reg_off = 0;
6982     do {
6983         void *h = host[i];
6984         if (likely(h != NULL)) {
6985             host_fn(vd, reg_off, h);
6986         } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
6987             target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6988             tlb_fn(env, vd, reg_off, addr, retaddr);
6989         }
6990         i += 1;
6991         reg_off += esize;
6992     } while (reg_off < reg_max);
6993 }
6994 
6995 static inline QEMU_ALWAYS_INLINE
6996 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6997                    target_ulong base, uint32_t desc, uintptr_t retaddr,
6998                    int esize, int msize, zreg_off_fn *off_fn,
6999                    sve_ldst1_host_fn *host_fn,
7000                    sve_ldst1_tlb_fn *tlb_fn)
7001 {
7002     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7003     /* Remove mtedesc from the normal sve descriptor. */
7004     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7005 
7006     /*
7007      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7008      * move base across the address space hole, and so cannot change
7009      * the pointer tag or the bit-55 selector.  We could therefore
7010      * examine TBI + TCMA here, as we do for sve_ldN_r_mte().
7011      */
7012     sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7013               esize, msize, off_fn, host_fn, tlb_fn);
7014 }
7015 
7016 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ)                                     \
7017 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
7018                                  void *vm, target_ulong base, uint32_t desc) \
7019 {                                                                       \
7020     sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,     \
7021               off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
7022 }                                                                       \
7023 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7024     void *vm, target_ulong base, uint32_t desc)                         \
7025 {                                                                       \
7026     sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,    \
7027                   off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7028 }
7029 
7030 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ)                                     \
7031 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
7032                                  void *vm, target_ulong base, uint32_t desc) \
7033 {                                                                       \
7034     sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,     \
7035               off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
7036 }                                                                       \
7037 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7038     void *vm, target_ulong base, uint32_t desc)                         \
7039 {                                                                       \
7040     sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,    \
7041                   off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7042 }
7043 
7044 DO_ST1_ZPZ_S(bs, zsu, MO_8)
7045 DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
7046 DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
7047 DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
7048 DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
7049 
7050 DO_ST1_ZPZ_S(bs, zss, MO_8)
7051 DO_ST1_ZPZ_S(hs_le, zss, MO_16)
7052 DO_ST1_ZPZ_S(hs_be, zss, MO_16)
7053 DO_ST1_ZPZ_S(ss_le, zss, MO_32)
7054 DO_ST1_ZPZ_S(ss_be, zss, MO_32)
7055 
7056 DO_ST1_ZPZ_D(bd, zsu, MO_8)
7057 DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
7058 DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
7059 DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
7060 DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
7061 DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
7062 DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
7063 
7064 DO_ST1_ZPZ_D(bd, zss, MO_8)
7065 DO_ST1_ZPZ_D(hd_le, zss, MO_16)
7066 DO_ST1_ZPZ_D(hd_be, zss, MO_16)
7067 DO_ST1_ZPZ_D(sd_le, zss, MO_32)
7068 DO_ST1_ZPZ_D(sd_be, zss, MO_32)
7069 DO_ST1_ZPZ_D(dd_le, zss, MO_64)
7070 DO_ST1_ZPZ_D(dd_be, zss, MO_64)
7071 
7072 DO_ST1_ZPZ_D(bd, zd, MO_8)
7073 DO_ST1_ZPZ_D(hd_le, zd, MO_16)
7074 DO_ST1_ZPZ_D(hd_be, zd, MO_16)
7075 DO_ST1_ZPZ_D(sd_le, zd, MO_32)
7076 DO_ST1_ZPZ_D(sd_be, zd, MO_32)
7077 DO_ST1_ZPZ_D(dd_le, zd, MO_64)
7078 DO_ST1_ZPZ_D(dd_be, zd, MO_64)
7079 
7080 #undef DO_ST1_ZPZ_S
7081 #undef DO_ST1_ZPZ_D
7082 
7083 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7084 {
7085     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7086     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7087 
7088     for (i = 0; i < opr_sz; ++i) {
7089         d[i] = n[i] ^ m[i] ^ k[i];
7090     }
7091 }
7092 
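/* BCAX: bitwise clear and exclusive-OR, d = n ^ (m & ~k). */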
7093 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7094 {
7095     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7096     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7097 
7098     for (i = 0; i < opr_sz; ++i) {
7099         d[i] = n[i] ^ (m[i] & ~k[i]);
7100     }
7101 }
7102 
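/* BSL1N: bitwise select with first source inverted: where k is set take ~n, else m. */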
7103 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7104 {
7105     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7106     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7107 
7108     for (i = 0; i < opr_sz; ++i) {
7109         d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
7110     }
7111 }
7112 
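/* BSL2N: bitwise select with second source inverted: where k is set take n, else ~m. */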
7113 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7114 {
7115     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7116     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7117 
7118     for (i = 0; i < opr_sz; ++i) {
7119         d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
7120     }
7121 }
7122 
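/* NBSL: bitwise select with inverted result: d = ~((n & k) | (m & ~k)). */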
7123 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7124 {
7125     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7126     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7127 
7128     for (i = 0; i < opr_sz; ++i) {
7129         d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
7130     }
7131 }
7132 
7133 /*
7134  * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
7135  * See hasless(v,1) from
7136  *   https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
7137  */
7138 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
7139 {
7140     int bits = 8 << esz;
7141     uint64_t ones = dup_const(esz, 1);
7142     uint64_t signs = ones << (bits - 1);
7143     uint64_t cmp0, cmp1;
7144 
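    /*
     * XOR leaves a zero element wherever m0/m1 equals n; the classic
     * (x - ones) & ~x step then yields at least one set sign bit exactly
     * when some element became zero, i.e. when some element matched.
     */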
7145     cmp1 = dup_const(esz, n);
7146     cmp0 = cmp1 ^ m0;
7147     cmp1 = cmp1 ^ m1;
7148     cmp0 = (cmp0 - ones) & ~cmp0;
7149     cmp1 = (cmp1 - ones) & ~cmp1;
7150     return (cmp0 | cmp1) & signs;
7151 }
7152 
7153 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
7154                                 uint32_t desc, int esz, bool nmatch)
7155 {
7156     uint16_t esz_mask = pred_esz_masks[esz];
7157     intptr_t opr_sz = simd_oprsz(desc);
7158     uint32_t flags = PREDTEST_INIT;
7159     intptr_t i, j, k;
7160 
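    /*
     * Process 16 bytes of data, governed by 16 predicate bits, per outer
     * iteration; j walks the two 8-byte data words and k the elements
     * within each word.
     */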
7161     for (i = 0; i < opr_sz; i += 16) {
7162         uint64_t m0 = *(uint64_t *)(vm + i);
7163         uint64_t m1 = *(uint64_t *)(vm + i + 8);
7164         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
7165         uint16_t out = 0;
7166 
7167         for (j = 0; j < 16; j += 8) {
7168             uint64_t n = *(uint64_t *)(vn + i + j);
7169 
7170             for (k = 0; k < 8; k += 1 << esz) {
7171                 if (pg & (1 << (j + k))) {
7172                     bool o = do_match2(n >> (k * 8), m0, m1, esz);
7173                     out |= (o ^ nmatch) << (j + k);
7174                 }
7175             }
7176         }
7177         *(uint16_t *)(vd + H1_2(i >> 3)) = out;
7178         flags = iter_predtest_fwd(out, pg, flags);
7179     }
7180     return flags;
7181 }
7182 
7183 #define DO_PPZZ_MATCH(NAME, ESZ, INV)                                         \
7184 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
7185 {                                                                             \
7186     return do_match(vd, vn, vm, vg, desc, ESZ, INV);                          \
7187 }
7188 
7189 DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
7190 DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)
7191 
7192 DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
7193 DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
7194 
7195 #undef DO_PPZZ_MATCH
7196 
7197 void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
7198                             uint32_t desc)
7199 {
7200     ARMVectorReg scratch;
7201     intptr_t i, j;
7202     intptr_t opr_sz = simd_oprsz(desc);
7203     uint32_t *d = vd, *n = vn, *m = vm;
7204     uint8_t *pg = vg;
7205 
7206     if (d == n) {
7207         n = memcpy(&scratch, n, opr_sz);
7208         if (d == m) {
7209             m = n;
7210         }
7211     } else if (d == m) {
7212         m = memcpy(&scratch, m, opr_sz);
7213     }
7214 
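    /*
     * For each active element i, count the active elements j <= i for
     * which m[j] == n[i]; inactive elements yield a count of 0.
     */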
7215     for (i = 0; i < opr_sz; i += 4) {
7216         uint64_t count = 0;
7217         uint8_t pred;
7218 
7219         pred = pg[H1(i >> 3)] >> (i & 7);
7220         if (pred & 1) {
7221             uint32_t nn = n[H4(i >> 2)];
7222 
7223             for (j = 0; j <= i; j += 4) {
7224                 pred = pg[H1(j >> 3)] >> (j & 7);
7225                 if ((pred & 1) && nn == m[H4(j >> 2)]) {
7226                     ++count;
7227                 }
7228             }
7229         }
7230         d[H4(i >> 2)] = count;
7231     }
7232 }
7233 
7234 void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
7235                             uint32_t desc)
7236 {
7237     ARMVectorReg scratch;
7238     intptr_t i, j;
7239     intptr_t opr_sz = simd_oprsz(desc);
7240     uint64_t *d = vd, *n = vn, *m = vm;
7241     uint8_t *pg = vg;
7242 
7243     if (d == n) {
7244         n = memcpy(&scratch, n, opr_sz);
7245         if (d == m) {
7246             m = n;
7247         }
7248     } else if (d == m) {
7249         m = memcpy(&scratch, m, opr_sz);
7250     }
7251 
7252     for (i = 0; i < opr_sz / 8; ++i) {
7253         uint64_t count = 0;
7254         if (pg[H1(i)] & 1) {
7255             uint64_t nn = n[i];
7256             for (j = 0; j <= i; ++j) {
7257                 if ((pg[H1(j)] & 1) && nn == m[j]) {
7258                     ++count;
7259                 }
7260             }
7261         }
7262         d[i] = count;
7263     }
7264 }
7265 
7266 /*
7267  * Returns the number of bytes in m0 and m1 that match n.
7268  * Unlike do_match2 we don't just need true/false, we need an exact count.
7269  * This requires two extra logical operations.
7270  */
7271 static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
7272 {
7273     const uint64_t mask = dup_const(MO_8, 0x7f);
7274     uint64_t cmp0, cmp1;
7275 
7276     cmp1 = dup_const(MO_8, n);
7277     cmp0 = cmp1 ^ m0;
7278     cmp1 = cmp1 ^ m1;
7279 
7280     /*
7281      * 1: clear msb of each byte to avoid carry to next byte (& mask)
7282      * 2: carry in to msb if byte != 0 (+ mask)
7283      * 3: set msb if cmp has msb set (| cmp)
7284      * 4: set ~msb to ignore them (| mask)
7285      * We now have 0xff for byte != 0 or 0x7f for byte == 0.
7286      * 5: invert, resulting in 0x80 if and only if byte == 0.
7287      */
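    /*
     * Per-byte example: 0x00 -> ((0x00 + 0x7f) | 0x00 | 0x7f) = 0x7f,
     * inverted 0x80; 0x05 -> ((0x05 + 0x7f) | 0x05 | 0x7f) = 0xff,
     * inverted 0x00.
     */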
7288     cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
7289     cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);
7290 
7291     /*
7292      * Combine the two comparisons so that their bits do not
7293      * overlap, which preserves the count of set bits.
7294      * If the host has an efficient instruction for ctpop,
7295      * then ctpop(x) + ctpop(y) has the same number of
7296      * operations as ctpop(x | (y >> 1)).  If the host does
7297      * not have an efficient ctpop, then we only want to
7298      * use it once.
7299      */
7300     return ctpop64(cmp0 | (cmp1 >> 1));
7301 }
7302 
7303 void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
7304 {
7305     intptr_t i, j;
7306     intptr_t opr_sz = simd_oprsz(desc);
7307 
7308     for (i = 0; i < opr_sz; i += 16) {
7309         uint64_t n0 = *(uint64_t *)(vn + i);
7310         uint64_t m0 = *(uint64_t *)(vm + i);
7311         uint64_t n1 = *(uint64_t *)(vn + i + 8);
7312         uint64_t m1 = *(uint64_t *)(vm + i + 8);
7313         uint64_t out0 = 0;
7314         uint64_t out1 = 0;
7315 
7316         for (j = 0; j < 64; j += 8) {
7317             uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
7318             uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
7319             out0 |= cnt0 << j;
7320             out1 |= cnt1 << j;
7321         }
7322 
7323         *(uint64_t *)(vd + i) = out0;
7324         *(uint64_t *)(vd + i + 8) = out1;
7325     }
7326 }
7327 
7328 void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
7329 {
7330     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7331     int shr = simd_data(desc);
7332     int shl = 8 - shr;
7333     uint64_t mask = dup_const(MO_8, 0xff >> shr);
7334     uint64_t *d = vd, *n = vn, *m = vm;
7335 
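    /*
     * Rotate each byte lane right by shr using two masked shifts of the
     * 64-bit word: mask keeps only bits that stay within their own lane
     * after the right shift; the left shift supplies the wrapped bits.
     */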
7336     for (i = 0; i < opr_sz; ++i) {
7337         uint64_t t = n[i] ^ m[i];
7338         d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7339     }
7340 }
7341 
7342 void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
7343 {
7344     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7345     int shr = simd_data(desc);
7346     int shl = 16 - shr;
7347     uint64_t mask = dup_const(MO_16, 0xffff >> shr);
7348     uint64_t *d = vd, *n = vn, *m = vm;
7349 
7350     for (i = 0; i < opr_sz; ++i) {
7351         uint64_t t = n[i] ^ m[i];
7352         d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7353     }
7354 }
7355 
7356 void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
7357 {
7358     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
7359     int shr = simd_data(desc);
7360     uint32_t *d = vd, *n = vn, *m = vm;
7361 
7362     for (i = 0; i < opr_sz; ++i) {
7363         d[i] = ror32(n[i] ^ m[i], shr);
7364     }
7365 }
7366 
7367 void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
7368                      void *status, uint32_t desc)
7369 {
7370     intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);
7371 
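    /*
     * Each 128-bit segment holds a row-major 2x2 matrix; compute
     * D = A + N * M^T, i.e.
     * d[i][j] = a[i][j] + n[i][0] * m[j][0] + n[i][1] * m[j][1].
     */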
7372     for (s = 0; s < opr_sz; ++s) {
7373         float32 *n = vn + s * sizeof(float32) * 4;
7374         float32 *m = vm + s * sizeof(float32) * 4;
7375         float32 *a = va + s * sizeof(float32) * 4;
7376         float32 *d = vd + s * sizeof(float32) * 4;
7377         float32 n00 = n[H4(0)], n01 = n[H4(1)];
7378         float32 n10 = n[H4(2)], n11 = n[H4(3)];
7379         float32 m00 = m[H4(0)], m01 = m[H4(1)];
7380         float32 m10 = m[H4(2)], m11 = m[H4(3)];
7381         float32 p0, p1;
7382 
7383         /* i = 0, j = 0 */
7384         p0 = float32_mul(n00, m00, status);
7385         p1 = float32_mul(n01, m01, status);
7386         d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);
7387 
7388         /* i = 0, j = 1 */
7389         p0 = float32_mul(n00, m10, status);
7390         p1 = float32_mul(n01, m11, status);
7391         d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);
7392 
7393         /* i = 1, j = 0 */
7394         p0 = float32_mul(n10, m00, status);
7395         p1 = float32_mul(n11, m01, status);
7396         d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);
7397 
7398         /* i = 1, j = 1 */
7399         p0 = float32_mul(n10, m10, status);
7400         p1 = float32_mul(n11, m11, status);
7401         d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
7402     }
7403 }
7404 
7405 void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
7406                      void *status, uint32_t desc)
7407 {
7408     intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);
7409 
7410     for (s = 0; s < opr_sz; ++s) {
7411         float64 *n = vn + s * sizeof(float64) * 4;
7412         float64 *m = vm + s * sizeof(float64) * 4;
7413         float64 *a = va + s * sizeof(float64) * 4;
7414         float64 *d = vd + s * sizeof(float64) * 4;
7415         float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
7416         float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
7417         float64 p0, p1;
7418 
7419         /* i = 0, j = 0 */
7420         p0 = float64_mul(n00, m00, status);
7421         p1 = float64_mul(n01, m01, status);
7422         d[0] = float64_add(a[0], float64_add(p0, p1, status), status);
7423 
7424         /* i = 0, j = 1 */
7425         p0 = float64_mul(n00, m10, status);
7426         p1 = float64_mul(n01, m11, status);
7427         d[1] = float64_add(a[1], float64_add(p0, p1, status), status);
7428 
7429         /* i = 1, j = 0 */
7430         p0 = float64_mul(n10, m00, status);
7431         p1 = float64_mul(n11, m01, status);
7432         d[2] = float64_add(a[2], float64_add(p0, p1, status), status);
7433 
7434         /* i = 1, j = 1 */
7435         p0 = float64_mul(n10, m10, status);
7436         p1 = float64_mul(n11, m11, status);
7437         d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
7438     }
7439 }
7440 
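/*
 * Narrowing converts: each active wide element is converted and stored
 * into the top half of its slot (the odd-numbered narrow elements);
 * the bottom halves of the destination are left unchanged.
 */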
7441 #define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
7442 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc)  \
7443 {                                                                             \
7444     intptr_t i = simd_oprsz(desc);                                            \
7445     uint64_t *g = vg;                                                         \
7446     do {                                                                      \
7447         uint64_t pg = g[(i - 1) >> 6];                                        \
7448         do {                                                                  \
7449             i -= sizeof(TYPEW);                                               \
7450             if (likely((pg >> (i & 63)) & 1)) {                               \
7451                 TYPEW nn = *(TYPEW *)(vn + HW(i));                            \
7452                 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status);      \
7453             }                                                                 \
7454         } while (i & 63);                                                     \
7455     } while (i != 0);                                                         \
7456 }
7457 
7458 DO_FCVTNT(sve_bfcvtnt,    uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
7459 DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
7460 DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
7461 
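/*
 * Widening converts: read the top (odd-numbered) narrow element of each
 * active slot and write the full-width result.
 */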
7462 #define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
7463 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc)  \
7464 {                                                                             \
7465     intptr_t i = simd_oprsz(desc);                                            \
7466     uint64_t *g = vg;                                                         \
7467     do {                                                                      \
7468         uint64_t pg = g[(i - 1) >> 6];                                        \
7469         do {                                                                  \
7470             i -= sizeof(TYPEW);                                               \
7471             if (likely((pg >> (i & 63)) & 1)) {                               \
7472                 TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN)));            \
7473                 *(TYPEW *)(vd + HW(i)) = OP(nn, status);                      \
7474             }                                                                 \
7475         } while (i & 63);                                                     \
7476     } while (i != 0);                                                         \
7477 }
7478 
7479 DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
7480 DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)
7481 
7482 #undef DO_FCVTLT
7483 #undef DO_FCVTNT
7484