xref: /qemu/target/riscv/vector_helper.c (revision de6cd759)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/helper-proto.h"
26 #include "fpu/softfloat.h"
27 #include "tcg/tcg-gvec-desc.h"
28 #include "internals.h"
29 #include <math.h>
30 
31 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
32                             target_ulong s2)
33 {
34     int vlmax, vl;
35     RISCVCPU *cpu = env_archcpu(env);
36     uint64_t lmul = FIELD_EX64(s2, VTYPE, VLMUL);
37     uint16_t sew = 8 << FIELD_EX64(s2, VTYPE, VSEW);
38     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
39     int xlen = riscv_cpu_xlen(env);
40     bool vill = (s2 >> (xlen - 1)) & 0x1;
41     target_ulong reserved = s2 &
42                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
43                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
44 
45     if (lmul & 4) {
46         /* Fractional LMUL. */
47         if (lmul == 4 ||
48             cpu->cfg.elen >> (8 - lmul) < sew) {
49             vill = true;
50         }
51     }
52 
53     if ((sew > cpu->cfg.elen) || vill || (ediv != 0) || (reserved != 0)) {
54         /* only set vill bit. */
55         env->vill = 1;
56         env->vtype = 0;
57         env->vl = 0;
58         env->vstart = 0;
59         return 0;
60     }
61 
62     vlmax = vext_get_vlmax(cpu, s2);
63     if (s1 <= vlmax) {
64         vl = s1;
65     } else {
66         vl = vlmax;
67     }
68     env->vl = vl;
69     env->vtype = s2;
70     env->vstart = 0;
71     env->vill = 0;
72     return vl;
73 }
74 
75 /*
76  * Note that vector data is stored in host-endian 64-bit chunks,
77  * so addressing units smaller than that needs a host-endian fixup.
78  */
79 #if HOST_BIG_ENDIAN
80 #define H1(x)   ((x) ^ 7)
81 #define H1_2(x) ((x) ^ 6)
82 #define H1_4(x) ((x) ^ 4)
83 #define H2(x)   ((x) ^ 3)
84 #define H4(x)   ((x) ^ 1)
85 #define H8(x)   ((x))
86 #else
87 #define H1(x)   (x)
88 #define H1_2(x) (x)
89 #define H1_4(x) (x)
90 #define H2(x)   (x)
91 #define H4(x)   (x)
92 #define H8(x)   (x)
93 #endif
94 
95 static inline uint32_t vext_nf(uint32_t desc)
96 {
97     return FIELD_EX32(simd_data(desc), VDATA, NF);
98 }
99 
100 static inline uint32_t vext_vm(uint32_t desc)
101 {
102     return FIELD_EX32(simd_data(desc), VDATA, VM);
103 }
104 
105 /*
106  * Encode LMUL to lmul as following:
107  *     LMUL    vlmul    lmul
108  *      1       000       0
109  *      2       001       1
110  *      4       010       2
111  *      8       011       3
112  *      -       100       -
113  *     1/8      101      -3
114  *     1/4      110      -2
115  *     1/2      111      -1
116  */
117 static inline int32_t vext_lmul(uint32_t desc)
118 {
119     return sextract32(FIELD_EX32(simd_data(desc), VDATA, LMUL), 0, 3);
120 }
121 
122 static inline uint32_t vext_vta(uint32_t desc)
123 {
124     return FIELD_EX32(simd_data(desc), VDATA, VTA);
125 }
126 
127 static inline uint32_t vext_vma(uint32_t desc)
128 {
129     return FIELD_EX32(simd_data(desc), VDATA, VMA);
130 }
131 
132 static inline uint32_t vext_vta_all_1s(uint32_t desc)
133 {
134     return FIELD_EX32(simd_data(desc), VDATA, VTA_ALL_1S);
135 }
136 
137 /*
138  * Get the maximum number of elements can be operated.
139  *
140  * log2_esz: log2 of element size in bytes.
141  */
142 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
143 {
144     /*
145      * As simd_desc support at most 2048 bytes, the max vlen is 1024 bits.
146      * so vlen in bytes (vlenb) is encoded as maxsz.
147      */
148     uint32_t vlenb = simd_maxsz(desc);
149 
150     /* Return VLMAX */
151     int scale = vext_lmul(desc) - log2_esz;
152     return scale < 0 ? vlenb >> -scale : vlenb << scale;
153 }
154 
155 /*
156  * Get number of total elements, including prestart, body and tail elements.
157  * Note that when LMUL < 1, the tail includes the elements past VLMAX that
158  * are held in the same vector register.
159  */
160 static inline uint32_t vext_get_total_elems(CPURISCVState *env, uint32_t desc,
161                                             uint32_t esz)
162 {
163     uint32_t vlenb = simd_maxsz(desc);
164     uint32_t sew = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
165     int8_t emul = ctzl(esz) - ctzl(sew) + vext_lmul(desc) < 0 ? 0 :
166                   ctzl(esz) - ctzl(sew) + vext_lmul(desc);
167     return (vlenb << emul) / esz;
168 }
169 
170 static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
171 {
172     return (addr & ~env->cur_pmmask) | env->cur_pmbase;
173 }
174 
175 /*
176  * This function checks watchpoint before real load operation.
177  *
178  * In softmmu mode, the TLB API probe_access is enough for watchpoint check.
179  * In user mode, there is no watchpoint support now.
180  *
181  * It will trigger an exception if there is no mapping in TLB
182  * and page table walk can't fill the TLB entry. Then the guest
183  * software can return here after process the exception or never return.
184  */
185 static void probe_pages(CPURISCVState *env, target_ulong addr,
186                         target_ulong len, uintptr_t ra,
187                         MMUAccessType access_type)
188 {
189     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
190     target_ulong curlen = MIN(pagelen, len);
191 
192     probe_access(env, adjust_addr(env, addr), curlen, access_type,
193                  cpu_mmu_index(env, false), ra);
194     if (len > curlen) {
195         addr += curlen;
196         curlen = len - curlen;
197         probe_access(env, adjust_addr(env, addr), curlen, access_type,
198                      cpu_mmu_index(env, false), ra);
199     }
200 }
201 
202 /* set agnostic elements to 1s */
203 static void vext_set_elems_1s(void *base, uint32_t is_agnostic, uint32_t cnt,
204                               uint32_t tot)
205 {
206     if (is_agnostic == 0) {
207         /* policy undisturbed */
208         return;
209     }
210     if (tot - cnt == 0) {
211         return;
212     }
213     memset(base + cnt, -1, tot - cnt);
214 }
215 
216 static inline void vext_set_elem_mask(void *v0, int index,
217                                       uint8_t value)
218 {
219     int idx = index / 64;
220     int pos = index % 64;
221     uint64_t old = ((uint64_t *)v0)[idx];
222     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
223 }
224 
225 /*
226  * Earlier designs (pre-0.9) had a varying number of bits
227  * per mask value (MLEN). In the 0.9 design, MLEN=1.
228  * (Section 4.5)
229  */
230 static inline int vext_elem_mask(void *v0, int index)
231 {
232     int idx = index / 64;
233     int pos = index  % 64;
234     return (((uint64_t *)v0)[idx] >> pos) & 1;
235 }
236 
237 /* elements operations for load and store */
238 typedef void vext_ldst_elem_fn(CPURISCVState *env, target_ulong addr,
239                                uint32_t idx, void *vd, uintptr_t retaddr);
240 
241 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
242 static void NAME(CPURISCVState *env, abi_ptr addr,         \
243                  uint32_t idx, void *vd, uintptr_t retaddr)\
244 {                                                          \
245     ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
246     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
247 }                                                          \
248 
249 GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
250 GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
251 GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
252 GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
253 
254 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
255 static void NAME(CPURISCVState *env, abi_ptr addr,         \
256                  uint32_t idx, void *vd, uintptr_t retaddr)\
257 {                                                          \
258     ETYPE data = *((ETYPE *)vd + H(idx));                  \
259     cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
260 }
261 
262 GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
263 GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
264 GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
265 GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
266 
267 static void vext_set_tail_elems_1s(target_ulong vl, void *vd,
268                                    uint32_t desc, uint32_t nf,
269                                    uint32_t esz, uint32_t max_elems)
270 {
271     uint32_t vta = vext_vta(desc);
272     int k;
273 
274     if (vta == 0) {
275         return;
276     }
277 
278     for (k = 0; k < nf; ++k) {
279         vext_set_elems_1s(vd, vta, (k * max_elems + vl) * esz,
280                           (k * max_elems + max_elems) * esz);
281     }
282 }
283 
284 /*
285  * stride: access vector element from strided memory
286  */
287 static void
288 vext_ldst_stride(void *vd, void *v0, target_ulong base,
289                  target_ulong stride, CPURISCVState *env,
290                  uint32_t desc, uint32_t vm,
291                  vext_ldst_elem_fn *ldst_elem,
292                  uint32_t log2_esz, uintptr_t ra)
293 {
294     uint32_t i, k;
295     uint32_t nf = vext_nf(desc);
296     uint32_t max_elems = vext_max_elems(desc, log2_esz);
297     uint32_t esz = 1 << log2_esz;
298     uint32_t vma = vext_vma(desc);
299 
300     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
301         k = 0;
302         while (k < nf) {
303             if (!vm && !vext_elem_mask(v0, i)) {
304                 /* set masked-off elements to 1s */
305                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
306                                   (i + k * max_elems + 1) * esz);
307                 k++;
308                 continue;
309             }
310             target_ulong addr = base + stride * i + (k << log2_esz);
311             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
312             k++;
313         }
314     }
315     env->vstart = 0;
316 
317     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
318 }
319 
320 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
321 void HELPER(NAME)(void *vd, void * v0, target_ulong base,               \
322                   target_ulong stride, CPURISCVState *env,              \
323                   uint32_t desc)                                        \
324 {                                                                       \
325     uint32_t vm = vext_vm(desc);                                        \
326     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
327                      ctzl(sizeof(ETYPE)), GETPC());                     \
328 }
329 
330 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
331 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
332 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
333 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
334 
335 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
336 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
337                   target_ulong stride, CPURISCVState *env,              \
338                   uint32_t desc)                                        \
339 {                                                                       \
340     uint32_t vm = vext_vm(desc);                                        \
341     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
342                      ctzl(sizeof(ETYPE)), GETPC());                     \
343 }
344 
345 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b)
346 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
347 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
348 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
349 
350 /*
351  * unit-stride: access elements stored contiguously in memory
352  */
353 
354 /* unmasked unit-stride load and store operation */
355 static void
356 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
357              vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
358              uintptr_t ra)
359 {
360     uint32_t i, k;
361     uint32_t nf = vext_nf(desc);
362     uint32_t max_elems = vext_max_elems(desc, log2_esz);
363     uint32_t esz = 1 << log2_esz;
364 
365     /* load bytes from guest memory */
366     for (i = env->vstart; i < evl; i++, env->vstart++) {
367         k = 0;
368         while (k < nf) {
369             target_ulong addr = base + ((i * nf + k) << log2_esz);
370             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
371             k++;
372         }
373     }
374     env->vstart = 0;
375 
376     vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
377 }
378 
379 /*
380  * masked unit-stride load and store operation will be a special case of
381  * stride, stride = NF * sizeof (ETYPE)
382  */
383 
384 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN)                            \
385 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
386                          CPURISCVState *env, uint32_t desc)             \
387 {                                                                       \
388     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
389     vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
390                      ctzl(sizeof(ETYPE)), GETPC());                     \
391 }                                                                       \
392                                                                         \
393 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
394                   CPURISCVState *env, uint32_t desc)                    \
395 {                                                                       \
396     vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
397                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
398 }
399 
400 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b)
401 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
402 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
403 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
404 
405 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN)                            \
406 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
407                          CPURISCVState *env, uint32_t desc)              \
408 {                                                                        \
409     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
410     vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,   \
411                      ctzl(sizeof(ETYPE)), GETPC());                      \
412 }                                                                        \
413                                                                          \
414 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
415                   CPURISCVState *env, uint32_t desc)                     \
416 {                                                                        \
417     vext_ldst_us(vd, base, env, desc, STORE_FN,                          \
418                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                 \
419 }
420 
421 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b)
422 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
423 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
424 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
425 
426 /*
427  * unit stride mask load and store, EEW = 1
428  */
429 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
430                     CPURISCVState *env, uint32_t desc)
431 {
432     /* evl = ceil(vl/8) */
433     uint8_t evl = (env->vl + 7) >> 3;
434     vext_ldst_us(vd, base, env, desc, lde_b,
435                  0, evl, GETPC());
436 }
437 
438 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
439                     CPURISCVState *env, uint32_t desc)
440 {
441     /* evl = ceil(vl/8) */
442     uint8_t evl = (env->vl + 7) >> 3;
443     vext_ldst_us(vd, base, env, desc, ste_b,
444                  0, evl, GETPC());
445 }
446 
447 /*
448  * index: access vector element from indexed memory
449  */
450 typedef target_ulong vext_get_index_addr(target_ulong base,
451         uint32_t idx, void *vs2);
452 
453 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
454 static target_ulong NAME(target_ulong base,            \
455                          uint32_t idx, void *vs2)      \
456 {                                                      \
457     return (base + *((ETYPE *)vs2 + H(idx)));          \
458 }
459 
460 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
461 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
462 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
463 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
464 
465 static inline void
466 vext_ldst_index(void *vd, void *v0, target_ulong base,
467                 void *vs2, CPURISCVState *env, uint32_t desc,
468                 vext_get_index_addr get_index_addr,
469                 vext_ldst_elem_fn *ldst_elem,
470                 uint32_t log2_esz, uintptr_t ra)
471 {
472     uint32_t i, k;
473     uint32_t nf = vext_nf(desc);
474     uint32_t vm = vext_vm(desc);
475     uint32_t max_elems = vext_max_elems(desc, log2_esz);
476     uint32_t esz = 1 << log2_esz;
477     uint32_t vma = vext_vma(desc);
478 
479     /* load bytes from guest memory */
480     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
481         k = 0;
482         while (k < nf) {
483             if (!vm && !vext_elem_mask(v0, i)) {
484                 /* set masked-off elements to 1s */
485                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
486                                   (i + k * max_elems + 1) * esz);
487                 k++;
488                 continue;
489             }
490             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
491             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
492             k++;
493         }
494     }
495     env->vstart = 0;
496 
497     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
498 }
499 
500 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
501 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
502                   void *vs2, CPURISCVState *env, uint32_t desc)            \
503 {                                                                          \
504     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
505                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
506 }
507 
508 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b)
509 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h)
510 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w)
511 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d)
512 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b)
513 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
514 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
515 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
516 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b)
517 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
518 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
519 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
520 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b)
521 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
522 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
523 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
524 
525 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
526 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
527                   void *vs2, CPURISCVState *env, uint32_t desc)  \
528 {                                                                \
529     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
530                     STORE_FN, ctzl(sizeof(ETYPE)),               \
531                     GETPC());                                    \
532 }
533 
534 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b)
535 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h)
536 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w)
537 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d)
538 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b)
539 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
540 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
541 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
542 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b)
543 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
544 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
545 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
546 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b)
547 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
548 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
549 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
550 
551 /*
552  * unit-stride fault-only-fisrt load instructions
553  */
554 static inline void
555 vext_ldff(void *vd, void *v0, target_ulong base,
556           CPURISCVState *env, uint32_t desc,
557           vext_ldst_elem_fn *ldst_elem,
558           uint32_t log2_esz, uintptr_t ra)
559 {
560     void *host;
561     uint32_t i, k, vl = 0;
562     uint32_t nf = vext_nf(desc);
563     uint32_t vm = vext_vm(desc);
564     uint32_t max_elems = vext_max_elems(desc, log2_esz);
565     uint32_t esz = 1 << log2_esz;
566     uint32_t vma = vext_vma(desc);
567     target_ulong addr, offset, remain;
568 
569     /* probe every access */
570     for (i = env->vstart; i < env->vl; i++) {
571         if (!vm && !vext_elem_mask(v0, i)) {
572             continue;
573         }
574         addr = adjust_addr(env, base + i * (nf << log2_esz));
575         if (i == 0) {
576             probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
577         } else {
578             /* if it triggers an exception, no need to check watchpoint */
579             remain = nf << log2_esz;
580             while (remain > 0) {
581                 offset = -(addr | TARGET_PAGE_MASK);
582                 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD,
583                                          cpu_mmu_index(env, false));
584                 if (host) {
585 #ifdef CONFIG_USER_ONLY
586                     if (page_check_range(addr, offset, PAGE_READ) < 0) {
587                         vl = i;
588                         goto ProbeSuccess;
589                     }
590 #else
591                     probe_pages(env, addr, offset, ra, MMU_DATA_LOAD);
592 #endif
593                 } else {
594                     vl = i;
595                     goto ProbeSuccess;
596                 }
597                 if (remain <=  offset) {
598                     break;
599                 }
600                 remain -= offset;
601                 addr = adjust_addr(env, addr + offset);
602             }
603         }
604     }
605 ProbeSuccess:
606     /* load bytes from guest memory */
607     if (vl != 0) {
608         env->vl = vl;
609     }
610     for (i = env->vstart; i < env->vl; i++) {
611         k = 0;
612         while (k < nf) {
613             if (!vm && !vext_elem_mask(v0, i)) {
614                 /* set masked-off elements to 1s */
615                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
616                                   (i + k * max_elems + 1) * esz);
617                 k++;
618                 continue;
619             }
620             target_ulong addr = base + ((i * nf + k) << log2_esz);
621             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
622             k++;
623         }
624     }
625     env->vstart = 0;
626 
627     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
628 }
629 
630 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
631 void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
632                   CPURISCVState *env, uint32_t desc)      \
633 {                                                         \
634     vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
635               ctzl(sizeof(ETYPE)), GETPC());              \
636 }
637 
638 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b)
639 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
640 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
641 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
642 
643 #define DO_SWAP(N, M) (M)
644 #define DO_AND(N, M)  (N & M)
645 #define DO_XOR(N, M)  (N ^ M)
646 #define DO_OR(N, M)   (N | M)
647 #define DO_ADD(N, M)  (N + M)
648 
649 /* Signed min/max */
650 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
651 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
652 
653 /*
654  * load and store whole register instructions
655  */
656 static void
657 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
658                 vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
659 {
660     uint32_t i, k, off, pos;
661     uint32_t nf = vext_nf(desc);
662     uint32_t vlenb = riscv_cpu_cfg(env)->vlen >> 3;
663     uint32_t max_elems = vlenb >> log2_esz;
664 
665     k = env->vstart / max_elems;
666     off = env->vstart % max_elems;
667 
668     if (off) {
669         /* load/store rest of elements of current segment pointed by vstart */
670         for (pos = off; pos < max_elems; pos++, env->vstart++) {
671             target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
672             ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd,
673                       ra);
674         }
675         k++;
676     }
677 
678     /* load/store elements for rest of segments */
679     for (; k < nf; k++) {
680         for (i = 0; i < max_elems; i++, env->vstart++) {
681             target_ulong addr = base + ((i + k * max_elems) << log2_esz);
682             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
683         }
684     }
685 
686     env->vstart = 0;
687 }
688 
689 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
690 void HELPER(NAME)(void *vd, target_ulong base,       \
691                   CPURISCVState *env, uint32_t desc) \
692 {                                                    \
693     vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
694                     ctzl(sizeof(ETYPE)), GETPC());   \
695 }
696 
697 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b)
698 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
699 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
700 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
701 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b)
702 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
703 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
704 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
705 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b)
706 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
707 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
708 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
709 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b)
710 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
711 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
712 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
713 
714 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
715 void HELPER(NAME)(void *vd, target_ulong base,       \
716                   CPURISCVState *env, uint32_t desc) \
717 {                                                    \
718     vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
719                     ctzl(sizeof(ETYPE)), GETPC());   \
720 }
721 
722 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
723 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
724 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
725 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
726 
727 /*
728  * Vector Integer Arithmetic Instructions
729  */
730 
731 /* expand macro args before macro */
732 #define RVVCALL(macro, ...)  macro(__VA_ARGS__)
733 
734 /* (TD, T1, T2, TX1, TX2) */
735 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
736 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
737 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
738 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
739 #define OP_UUU_B uint8_t, uint8_t, uint8_t, uint8_t, uint8_t
740 #define OP_UUU_H uint16_t, uint16_t, uint16_t, uint16_t, uint16_t
741 #define OP_UUU_W uint32_t, uint32_t, uint32_t, uint32_t, uint32_t
742 #define OP_UUU_D uint64_t, uint64_t, uint64_t, uint64_t, uint64_t
743 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
744 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
745 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
746 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
747 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
748 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
749 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
750 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
751 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
752 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
753 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
754 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
755 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
756 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
757 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
758 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
759 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
760 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
761 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
762 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
763 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
764 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
765 
766 /* operation of two vector elements */
767 typedef void opivv2_fn(void *vd, void *vs1, void *vs2, int i);
768 
769 #define OPIVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)    \
770 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)    \
771 {                                                               \
772     TX1 s1 = *((T1 *)vs1 + HS1(i));                             \
773     TX2 s2 = *((T2 *)vs2 + HS2(i));                             \
774     *((TD *)vd + HD(i)) = OP(s2, s1);                           \
775 }
776 #define DO_SUB(N, M) (N - M)
777 #define DO_RSUB(N, M) (M - N)
778 
779 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
780 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
781 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
782 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
783 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
784 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
785 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
786 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
787 
788 static void do_vext_vv(void *vd, void *v0, void *vs1, void *vs2,
789                        CPURISCVState *env, uint32_t desc,
790                        opivv2_fn *fn, uint32_t esz)
791 {
792     uint32_t vm = vext_vm(desc);
793     uint32_t vl = env->vl;
794     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
795     uint32_t vta = vext_vta(desc);
796     uint32_t vma = vext_vma(desc);
797     uint32_t i;
798 
799     for (i = env->vstart; i < vl; i++) {
800         if (!vm && !vext_elem_mask(v0, i)) {
801             /* set masked-off elements to 1s */
802             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
803             continue;
804         }
805         fn(vd, vs1, vs2, i);
806     }
807     env->vstart = 0;
808     /* set tail elements to 1s */
809     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
810 }
811 
812 /* generate the helpers for OPIVV */
813 #define GEN_VEXT_VV(NAME, ESZ)                            \
814 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
815                   void *vs2, CPURISCVState *env,          \
816                   uint32_t desc)                          \
817 {                                                         \
818     do_vext_vv(vd, v0, vs1, vs2, env, desc,               \
819                do_##NAME, ESZ);                           \
820 }
821 
822 GEN_VEXT_VV(vadd_vv_b, 1)
823 GEN_VEXT_VV(vadd_vv_h, 2)
824 GEN_VEXT_VV(vadd_vv_w, 4)
825 GEN_VEXT_VV(vadd_vv_d, 8)
826 GEN_VEXT_VV(vsub_vv_b, 1)
827 GEN_VEXT_VV(vsub_vv_h, 2)
828 GEN_VEXT_VV(vsub_vv_w, 4)
829 GEN_VEXT_VV(vsub_vv_d, 8)
830 
831 typedef void opivx2_fn(void *vd, target_long s1, void *vs2, int i);
832 
833 /*
834  * (T1)s1 gives the real operator type.
835  * (TX1)(T1)s1 expands the operator type of widen or narrow operations.
836  */
837 #define OPIVX2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
838 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
839 {                                                                   \
840     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
841     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1);                      \
842 }
843 
844 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
845 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
846 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
847 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
848 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
849 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
850 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
851 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
852 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
853 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
854 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
855 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
856 
857 static void do_vext_vx(void *vd, void *v0, target_long s1, void *vs2,
858                        CPURISCVState *env, uint32_t desc,
859                        opivx2_fn fn, uint32_t esz)
860 {
861     uint32_t vm = vext_vm(desc);
862     uint32_t vl = env->vl;
863     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
864     uint32_t vta = vext_vta(desc);
865     uint32_t vma = vext_vma(desc);
866     uint32_t i;
867 
868     for (i = env->vstart; i < vl; i++) {
869         if (!vm && !vext_elem_mask(v0, i)) {
870             /* set masked-off elements to 1s */
871             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
872             continue;
873         }
874         fn(vd, s1, vs2, i);
875     }
876     env->vstart = 0;
877     /* set tail elements to 1s */
878     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
879 }
880 
881 /* generate the helpers for OPIVX */
882 #define GEN_VEXT_VX(NAME, ESZ)                            \
883 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
884                   void *vs2, CPURISCVState *env,          \
885                   uint32_t desc)                          \
886 {                                                         \
887     do_vext_vx(vd, v0, s1, vs2, env, desc,                \
888                do_##NAME, ESZ);                           \
889 }
890 
891 GEN_VEXT_VX(vadd_vx_b, 1)
892 GEN_VEXT_VX(vadd_vx_h, 2)
893 GEN_VEXT_VX(vadd_vx_w, 4)
894 GEN_VEXT_VX(vadd_vx_d, 8)
895 GEN_VEXT_VX(vsub_vx_b, 1)
896 GEN_VEXT_VX(vsub_vx_h, 2)
897 GEN_VEXT_VX(vsub_vx_w, 4)
898 GEN_VEXT_VX(vsub_vx_d, 8)
899 GEN_VEXT_VX(vrsub_vx_b, 1)
900 GEN_VEXT_VX(vrsub_vx_h, 2)
901 GEN_VEXT_VX(vrsub_vx_w, 4)
902 GEN_VEXT_VX(vrsub_vx_d, 8)
903 
904 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
905 {
906     intptr_t oprsz = simd_oprsz(desc);
907     intptr_t i;
908 
909     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
910         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
911     }
912 }
913 
914 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
915 {
916     intptr_t oprsz = simd_oprsz(desc);
917     intptr_t i;
918 
919     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
920         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
921     }
922 }
923 
924 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
925 {
926     intptr_t oprsz = simd_oprsz(desc);
927     intptr_t i;
928 
929     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
930         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
931     }
932 }
933 
934 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
935 {
936     intptr_t oprsz = simd_oprsz(desc);
937     intptr_t i;
938 
939     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
940         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
941     }
942 }
943 
944 /* Vector Widening Integer Add/Subtract */
945 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
946 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
947 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
948 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
949 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
950 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
951 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
952 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
953 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
954 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
955 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
956 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
957 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
958 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
959 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
960 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
961 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
962 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
963 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
964 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
965 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
966 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
967 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
968 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
969 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
970 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
971 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
972 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
973 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
974 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
975 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
976 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
977 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
978 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
979 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
980 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
981 GEN_VEXT_VV(vwaddu_vv_b, 2)
982 GEN_VEXT_VV(vwaddu_vv_h, 4)
983 GEN_VEXT_VV(vwaddu_vv_w, 8)
984 GEN_VEXT_VV(vwsubu_vv_b, 2)
985 GEN_VEXT_VV(vwsubu_vv_h, 4)
986 GEN_VEXT_VV(vwsubu_vv_w, 8)
987 GEN_VEXT_VV(vwadd_vv_b, 2)
988 GEN_VEXT_VV(vwadd_vv_h, 4)
989 GEN_VEXT_VV(vwadd_vv_w, 8)
990 GEN_VEXT_VV(vwsub_vv_b, 2)
991 GEN_VEXT_VV(vwsub_vv_h, 4)
992 GEN_VEXT_VV(vwsub_vv_w, 8)
993 GEN_VEXT_VV(vwaddu_wv_b, 2)
994 GEN_VEXT_VV(vwaddu_wv_h, 4)
995 GEN_VEXT_VV(vwaddu_wv_w, 8)
996 GEN_VEXT_VV(vwsubu_wv_b, 2)
997 GEN_VEXT_VV(vwsubu_wv_h, 4)
998 GEN_VEXT_VV(vwsubu_wv_w, 8)
999 GEN_VEXT_VV(vwadd_wv_b, 2)
1000 GEN_VEXT_VV(vwadd_wv_h, 4)
1001 GEN_VEXT_VV(vwadd_wv_w, 8)
1002 GEN_VEXT_VV(vwsub_wv_b, 2)
1003 GEN_VEXT_VV(vwsub_wv_h, 4)
1004 GEN_VEXT_VV(vwsub_wv_w, 8)
1005 
1006 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1007 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1008 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1009 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1010 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1011 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1012 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1013 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1014 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1015 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1016 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1017 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1018 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1019 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1020 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1021 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1022 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1023 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1024 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1025 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1026 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1027 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1028 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1029 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1030 GEN_VEXT_VX(vwaddu_vx_b, 2)
1031 GEN_VEXT_VX(vwaddu_vx_h, 4)
1032 GEN_VEXT_VX(vwaddu_vx_w, 8)
1033 GEN_VEXT_VX(vwsubu_vx_b, 2)
1034 GEN_VEXT_VX(vwsubu_vx_h, 4)
1035 GEN_VEXT_VX(vwsubu_vx_w, 8)
1036 GEN_VEXT_VX(vwadd_vx_b, 2)
1037 GEN_VEXT_VX(vwadd_vx_h, 4)
1038 GEN_VEXT_VX(vwadd_vx_w, 8)
1039 GEN_VEXT_VX(vwsub_vx_b, 2)
1040 GEN_VEXT_VX(vwsub_vx_h, 4)
1041 GEN_VEXT_VX(vwsub_vx_w, 8)
1042 GEN_VEXT_VX(vwaddu_wx_b, 2)
1043 GEN_VEXT_VX(vwaddu_wx_h, 4)
1044 GEN_VEXT_VX(vwaddu_wx_w, 8)
1045 GEN_VEXT_VX(vwsubu_wx_b, 2)
1046 GEN_VEXT_VX(vwsubu_wx_h, 4)
1047 GEN_VEXT_VX(vwsubu_wx_w, 8)
1048 GEN_VEXT_VX(vwadd_wx_b, 2)
1049 GEN_VEXT_VX(vwadd_wx_h, 4)
1050 GEN_VEXT_VX(vwadd_wx_w, 8)
1051 GEN_VEXT_VX(vwsub_wx_b, 2)
1052 GEN_VEXT_VX(vwsub_wx_h, 4)
1053 GEN_VEXT_VX(vwsub_wx_w, 8)
1054 
1055 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
1056 #define DO_VADC(N, M, C) (N + M + C)
1057 #define DO_VSBC(N, M, C) (N - M - C)
1058 
1059 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1060 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1061                   CPURISCVState *env, uint32_t desc)          \
1062 {                                                             \
1063     uint32_t vl = env->vl;                                    \
1064     uint32_t esz = sizeof(ETYPE);                             \
1065     uint32_t total_elems =                                    \
1066         vext_get_total_elems(env, desc, esz);                 \
1067     uint32_t vta = vext_vta(desc);                            \
1068     uint32_t i;                                               \
1069                                                               \
1070     for (i = env->vstart; i < vl; i++) {                      \
1071         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1072         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1073         ETYPE carry = vext_elem_mask(v0, i);                  \
1074                                                               \
1075         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1076     }                                                         \
1077     env->vstart = 0;                                          \
1078     /* set tail elements to 1s */                             \
1079     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1080 }
1081 
1082 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1083 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1084 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1085 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1086 
1087 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1088 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1089 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1090 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1091 
1092 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1093 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1094                   CPURISCVState *env, uint32_t desc)                     \
1095 {                                                                        \
1096     uint32_t vl = env->vl;                                               \
1097     uint32_t esz = sizeof(ETYPE);                                        \
1098     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1099     uint32_t vta = vext_vta(desc);                                       \
1100     uint32_t i;                                                          \
1101                                                                          \
1102     for (i = env->vstart; i < vl; i++) {                                 \
1103         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1104         ETYPE carry = vext_elem_mask(v0, i);                             \
1105                                                                          \
1106         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1107     }                                                                    \
1108     env->vstart = 0;                                                     \
1109     /* set tail elements to 1s */                                        \
1110     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1111 }
1112 
1113 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1114 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1115 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1116 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1117 
1118 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1119 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1120 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1121 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1122 
1123 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
1124                           (__typeof(N))(N + M) < N)
1125 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
1126 
1127 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1128 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1129                   CPURISCVState *env, uint32_t desc)          \
1130 {                                                             \
1131     uint32_t vl = env->vl;                                    \
1132     uint32_t vm = vext_vm(desc);                              \
1133     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
1134     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1135     uint32_t i;                                               \
1136                                                               \
1137     for (i = env->vstart; i < vl; i++) {                      \
1138         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1139         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1140         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1141         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1142     }                                                         \
1143     env->vstart = 0;                                          \
1144     /*
1145      * mask destination register are always tail-agnostic
1146      * set tail elements to 1s
1147      */                                                       \
1148     if (vta_all_1s) {                                         \
1149         for (; i < total_elems; i++) {                        \
1150             vext_set_elem_mask(vd, i, 1);                     \
1151         }                                                     \
1152     }                                                         \
1153 }
1154 
1155 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1156 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1157 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1158 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1159 
1160 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1161 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1162 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1163 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1164 
1165 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1166 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1167                   void *vs2, CPURISCVState *env, uint32_t desc) \
1168 {                                                               \
1169     uint32_t vl = env->vl;                                      \
1170     uint32_t vm = vext_vm(desc);                                \
1171     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;            \
1172     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1173     uint32_t i;                                                 \
1174                                                                 \
1175     for (i = env->vstart; i < vl; i++) {                        \
1176         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1177         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1178         vext_set_elem_mask(vd, i,                               \
1179                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1180     }                                                           \
1181     env->vstart = 0;                                            \
1182     /*
1183      * mask destination register are always tail-agnostic
1184      * set tail elements to 1s
1185      */                                                         \
1186     if (vta_all_1s) {                                           \
1187         for (; i < total_elems; i++) {                          \
1188             vext_set_elem_mask(vd, i, 1);                       \
1189         }                                                       \
1190     }                                                           \
1191 }
1192 
1193 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1194 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1195 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1196 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1197 
1198 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1199 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1200 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1201 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1202 
1203 /* Vector Bitwise Logical Instructions */
1204 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1205 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1206 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1207 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1208 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1209 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1210 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1211 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1212 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1213 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1214 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1215 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1216 GEN_VEXT_VV(vand_vv_b, 1)
1217 GEN_VEXT_VV(vand_vv_h, 2)
1218 GEN_VEXT_VV(vand_vv_w, 4)
1219 GEN_VEXT_VV(vand_vv_d, 8)
1220 GEN_VEXT_VV(vor_vv_b, 1)
1221 GEN_VEXT_VV(vor_vv_h, 2)
1222 GEN_VEXT_VV(vor_vv_w, 4)
1223 GEN_VEXT_VV(vor_vv_d, 8)
1224 GEN_VEXT_VV(vxor_vv_b, 1)
1225 GEN_VEXT_VV(vxor_vv_h, 2)
1226 GEN_VEXT_VV(vxor_vv_w, 4)
1227 GEN_VEXT_VV(vxor_vv_d, 8)
1228 
1229 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1230 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1231 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1232 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1233 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1234 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1235 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1236 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1237 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1238 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1239 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1240 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1241 GEN_VEXT_VX(vand_vx_b, 1)
1242 GEN_VEXT_VX(vand_vx_h, 2)
1243 GEN_VEXT_VX(vand_vx_w, 4)
1244 GEN_VEXT_VX(vand_vx_d, 8)
1245 GEN_VEXT_VX(vor_vx_b, 1)
1246 GEN_VEXT_VX(vor_vx_h, 2)
1247 GEN_VEXT_VX(vor_vx_w, 4)
1248 GEN_VEXT_VX(vor_vx_d, 8)
1249 GEN_VEXT_VX(vxor_vx_b, 1)
1250 GEN_VEXT_VX(vxor_vx_h, 2)
1251 GEN_VEXT_VX(vxor_vx_w, 4)
1252 GEN_VEXT_VX(vxor_vx_d, 8)
1253 
1254 /* Vector Single-Width Bit Shift Instructions */
1255 #define DO_SLL(N, M)  (N << (M))
1256 #define DO_SRL(N, M)  (N >> (M))
1257 
1258 /* generate the helpers for shift instructions with two vector operators */
1259 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1260 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1261                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1262 {                                                                         \
1263     uint32_t vm = vext_vm(desc);                                          \
1264     uint32_t vl = env->vl;                                                \
1265     uint32_t esz = sizeof(TS1);                                           \
1266     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1267     uint32_t vta = vext_vta(desc);                                        \
1268     uint32_t vma = vext_vma(desc);                                        \
1269     uint32_t i;                                                           \
1270                                                                           \
1271     for (i = env->vstart; i < vl; i++) {                                  \
1272         if (!vm && !vext_elem_mask(v0, i)) {                              \
1273             /* set masked-off elements to 1s */                           \
1274             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
1275             continue;                                                     \
1276         }                                                                 \
1277         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1278         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1279         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1280     }                                                                     \
1281     env->vstart = 0;                                                      \
1282     /* set tail elements to 1s */                                         \
1283     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1284 }
1285 
1286 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1287 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1288 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1289 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1290 
1291 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1292 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1293 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1294 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1295 
1296 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1297 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1298 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1299 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1300 
1301 /*
1302  * generate the helpers for shift instructions with one vector and one scalar
1303  */
1304 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1305 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1306                   void *vs2, CPURISCVState *env,            \
1307                   uint32_t desc)                            \
1308 {                                                           \
1309     uint32_t vm = vext_vm(desc);                            \
1310     uint32_t vl = env->vl;                                  \
1311     uint32_t esz = sizeof(TD);                              \
1312     uint32_t total_elems =                                  \
1313         vext_get_total_elems(env, desc, esz);               \
1314     uint32_t vta = vext_vta(desc);                          \
1315     uint32_t vma = vext_vma(desc);                          \
1316     uint32_t i;                                             \
1317                                                             \
1318     for (i = env->vstart; i < vl; i++) {                    \
1319         if (!vm && !vext_elem_mask(v0, i)) {                \
1320             /* set masked-off elements to 1s */             \
1321             vext_set_elems_1s(vd, vma, i * esz,             \
1322                               (i + 1) * esz);               \
1323             continue;                                       \
1324         }                                                   \
1325         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1326         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1327     }                                                       \
1328     env->vstart = 0;                                        \
1329     /* set tail elements to 1s */                           \
1330     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1331 }
1332 
1333 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1334 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1335 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1336 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1337 
1338 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1339 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1340 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1341 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1342 
1343 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1344 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1345 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1346 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1347 
1348 /* Vector Narrowing Integer Right Shift Instructions */
1349 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1350 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1351 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1352 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1353 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1354 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1355 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1356 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1357 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1358 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1359 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1360 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1361 
1362 /* Vector Integer Comparison Instructions */
1363 #define DO_MSEQ(N, M) (N == M)
1364 #define DO_MSNE(N, M) (N != M)
1365 #define DO_MSLT(N, M) (N < M)
1366 #define DO_MSLE(N, M) (N <= M)
1367 #define DO_MSGT(N, M) (N > M)
1368 
1369 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1370 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1371                   CPURISCVState *env, uint32_t desc)          \
1372 {                                                             \
1373     uint32_t vm = vext_vm(desc);                              \
1374     uint32_t vl = env->vl;                                    \
1375     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
1376     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1377     uint32_t vma = vext_vma(desc);                            \
1378     uint32_t i;                                               \
1379                                                               \
1380     for (i = env->vstart; i < vl; i++) {                      \
1381         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1382         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1383         if (!vm && !vext_elem_mask(v0, i)) {                  \
1384             /* set masked-off elements to 1s */               \
1385             if (vma) {                                        \
1386                 vext_set_elem_mask(vd, i, 1);                 \
1387             }                                                 \
1388             continue;                                         \
1389         }                                                     \
1390         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1391     }                                                         \
1392     env->vstart = 0;                                          \
1393     /*
1394      * mask destination register are always tail-agnostic
1395      * set tail elements to 1s
1396      */                                                       \
1397     if (vta_all_1s) {                                         \
1398         for (; i < total_elems; i++) {                        \
1399             vext_set_elem_mask(vd, i, 1);                     \
1400         }                                                     \
1401     }                                                         \
1402 }
1403 
1404 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1405 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1406 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1407 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1408 
1409 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1410 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1411 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1412 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1413 
1414 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1415 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1416 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1417 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1418 
1419 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1420 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1421 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1422 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1423 
1424 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1425 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1426 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1427 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1428 
1429 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1430 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1431 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1432 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1433 
1434 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1435 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1436                   CPURISCVState *env, uint32_t desc)                \
1437 {                                                                   \
1438     uint32_t vm = vext_vm(desc);                                    \
1439     uint32_t vl = env->vl;                                          \
1440     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;                \
1441     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1442     uint32_t vma = vext_vma(desc);                                  \
1443     uint32_t i;                                                     \
1444                                                                     \
1445     for (i = env->vstart; i < vl; i++) {                            \
1446         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1447         if (!vm && !vext_elem_mask(v0, i)) {                        \
1448             /* set masked-off elements to 1s */                     \
1449             if (vma) {                                              \
1450                 vext_set_elem_mask(vd, i, 1);                       \
1451             }                                                       \
1452             continue;                                               \
1453         }                                                           \
1454         vext_set_elem_mask(vd, i,                                   \
1455                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1456     }                                                               \
1457     env->vstart = 0;                                                \
1458     /*
1459      * mask destination register are always tail-agnostic
1460      * set tail elements to 1s
1461      */                                                             \
1462     if (vta_all_1s) {                                               \
1463         for (; i < total_elems; i++) {                              \
1464             vext_set_elem_mask(vd, i, 1);                           \
1465         }                                                           \
1466     }                                                               \
1467 }
1468 
1469 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1470 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1471 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1472 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1473 
1474 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1475 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1476 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1477 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1478 
1479 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1480 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1481 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1482 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1483 
1484 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1485 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1486 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1487 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1488 
1489 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1490 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1491 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1492 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1493 
1494 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1495 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1496 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1497 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1498 
1499 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1500 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1501 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1502 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1503 
1504 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1505 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1506 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1507 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1508 
1509 /* Vector Integer Min/Max Instructions */
1510 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1511 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1512 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1513 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1514 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1515 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1516 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1517 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1518 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1519 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1520 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1521 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1522 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1523 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1524 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1525 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1526 GEN_VEXT_VV(vminu_vv_b, 1)
1527 GEN_VEXT_VV(vminu_vv_h, 2)
1528 GEN_VEXT_VV(vminu_vv_w, 4)
1529 GEN_VEXT_VV(vminu_vv_d, 8)
1530 GEN_VEXT_VV(vmin_vv_b, 1)
1531 GEN_VEXT_VV(vmin_vv_h, 2)
1532 GEN_VEXT_VV(vmin_vv_w, 4)
1533 GEN_VEXT_VV(vmin_vv_d, 8)
1534 GEN_VEXT_VV(vmaxu_vv_b, 1)
1535 GEN_VEXT_VV(vmaxu_vv_h, 2)
1536 GEN_VEXT_VV(vmaxu_vv_w, 4)
1537 GEN_VEXT_VV(vmaxu_vv_d, 8)
1538 GEN_VEXT_VV(vmax_vv_b, 1)
1539 GEN_VEXT_VV(vmax_vv_h, 2)
1540 GEN_VEXT_VV(vmax_vv_w, 4)
1541 GEN_VEXT_VV(vmax_vv_d, 8)
1542 
1543 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1544 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1545 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1546 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1547 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1548 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1549 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1550 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1551 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1552 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1553 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1554 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1555 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1556 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1557 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1558 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1559 GEN_VEXT_VX(vminu_vx_b, 1)
1560 GEN_VEXT_VX(vminu_vx_h, 2)
1561 GEN_VEXT_VX(vminu_vx_w, 4)
1562 GEN_VEXT_VX(vminu_vx_d, 8)
1563 GEN_VEXT_VX(vmin_vx_b, 1)
1564 GEN_VEXT_VX(vmin_vx_h, 2)
1565 GEN_VEXT_VX(vmin_vx_w, 4)
1566 GEN_VEXT_VX(vmin_vx_d, 8)
1567 GEN_VEXT_VX(vmaxu_vx_b, 1)
1568 GEN_VEXT_VX(vmaxu_vx_h, 2)
1569 GEN_VEXT_VX(vmaxu_vx_w, 4)
1570 GEN_VEXT_VX(vmaxu_vx_d, 8)
1571 GEN_VEXT_VX(vmax_vx_b, 1)
1572 GEN_VEXT_VX(vmax_vx_h, 2)
1573 GEN_VEXT_VX(vmax_vx_w, 4)
1574 GEN_VEXT_VX(vmax_vx_d, 8)
1575 
1576 /* Vector Single-Width Integer Multiply Instructions */
1577 #define DO_MUL(N, M) (N * M)
1578 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1579 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1580 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1581 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1582 GEN_VEXT_VV(vmul_vv_b, 1)
1583 GEN_VEXT_VV(vmul_vv_h, 2)
1584 GEN_VEXT_VV(vmul_vv_w, 4)
1585 GEN_VEXT_VV(vmul_vv_d, 8)
1586 
1587 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1588 {
1589     return (int16_t)s2 * (int16_t)s1 >> 8;
1590 }
1591 
1592 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1593 {
1594     return (int32_t)s2 * (int32_t)s1 >> 16;
1595 }
1596 
1597 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1598 {
1599     return (int64_t)s2 * (int64_t)s1 >> 32;
1600 }
1601 
1602 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1603 {
1604     uint64_t hi_64, lo_64;
1605 
1606     muls64(&lo_64, &hi_64, s1, s2);
1607     return hi_64;
1608 }
1609 
1610 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1611 {
1612     return (uint16_t)s2 * (uint16_t)s1 >> 8;
1613 }
1614 
1615 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1616 {
1617     return (uint32_t)s2 * (uint32_t)s1 >> 16;
1618 }
1619 
1620 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1621 {
1622     return (uint64_t)s2 * (uint64_t)s1 >> 32;
1623 }
1624 
1625 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1626 {
1627     uint64_t hi_64, lo_64;
1628 
1629     mulu64(&lo_64, &hi_64, s2, s1);
1630     return hi_64;
1631 }
1632 
1633 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1634 {
1635     return (int16_t)s2 * (uint16_t)s1 >> 8;
1636 }
1637 
1638 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1639 {
1640     return (int32_t)s2 * (uint32_t)s1 >> 16;
1641 }
1642 
1643 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1644 {
1645     return (int64_t)s2 * (uint64_t)s1 >> 32;
1646 }
1647 
1648 /*
1649  * Let  A = signed operand,
1650  *      B = unsigned operand
1651  *      P = mulu64(A, B), unsigned product
1652  *
1653  * LET  X = 2 ** 64  - A, 2's complement of A
1654  *      SP = signed product
1655  * THEN
1656  *      IF A < 0
1657  *          SP = -X * B
1658  *             = -(2 ** 64 - A) * B
1659  *             = A * B - 2 ** 64 * B
1660  *             = P - 2 ** 64 * B
1661  *      ELSE
1662  *          SP = P
1663  * THEN
1664  *      HI_P -= (A < 0 ? B : 0)
1665  */
1666 
1667 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1668 {
1669     uint64_t hi_64, lo_64;
1670 
1671     mulu64(&lo_64, &hi_64, s2, s1);
1672 
1673     hi_64 -= s2 < 0 ? s1 : 0;
1674     return hi_64;
1675 }
1676 
1677 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1678 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1679 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1680 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1681 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1682 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1683 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1684 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1685 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1686 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1687 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1688 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1689 GEN_VEXT_VV(vmulh_vv_b, 1)
1690 GEN_VEXT_VV(vmulh_vv_h, 2)
1691 GEN_VEXT_VV(vmulh_vv_w, 4)
1692 GEN_VEXT_VV(vmulh_vv_d, 8)
1693 GEN_VEXT_VV(vmulhu_vv_b, 1)
1694 GEN_VEXT_VV(vmulhu_vv_h, 2)
1695 GEN_VEXT_VV(vmulhu_vv_w, 4)
1696 GEN_VEXT_VV(vmulhu_vv_d, 8)
1697 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1698 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1699 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1700 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1701 
1702 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1703 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1704 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1705 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1706 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1707 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1708 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1709 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1710 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1711 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1712 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1713 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1714 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1715 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1716 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1717 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1718 GEN_VEXT_VX(vmul_vx_b, 1)
1719 GEN_VEXT_VX(vmul_vx_h, 2)
1720 GEN_VEXT_VX(vmul_vx_w, 4)
1721 GEN_VEXT_VX(vmul_vx_d, 8)
1722 GEN_VEXT_VX(vmulh_vx_b, 1)
1723 GEN_VEXT_VX(vmulh_vx_h, 2)
1724 GEN_VEXT_VX(vmulh_vx_w, 4)
1725 GEN_VEXT_VX(vmulh_vx_d, 8)
1726 GEN_VEXT_VX(vmulhu_vx_b, 1)
1727 GEN_VEXT_VX(vmulhu_vx_h, 2)
1728 GEN_VEXT_VX(vmulhu_vx_w, 4)
1729 GEN_VEXT_VX(vmulhu_vx_d, 8)
1730 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1731 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1732 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1733 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1734 
1735 /* Vector Integer Divide Instructions */
1736 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1737 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1738 #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) : \
1739         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1740 #define DO_REM(N, M)  (unlikely(M == 0) ? N : \
1741         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
1742 
1743 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1744 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1745 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1746 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1747 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1748 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1749 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1750 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1751 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1752 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1753 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1754 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1755 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1756 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1757 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1758 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1759 GEN_VEXT_VV(vdivu_vv_b, 1)
1760 GEN_VEXT_VV(vdivu_vv_h, 2)
1761 GEN_VEXT_VV(vdivu_vv_w, 4)
1762 GEN_VEXT_VV(vdivu_vv_d, 8)
1763 GEN_VEXT_VV(vdiv_vv_b, 1)
1764 GEN_VEXT_VV(vdiv_vv_h, 2)
1765 GEN_VEXT_VV(vdiv_vv_w, 4)
1766 GEN_VEXT_VV(vdiv_vv_d, 8)
1767 GEN_VEXT_VV(vremu_vv_b, 1)
1768 GEN_VEXT_VV(vremu_vv_h, 2)
1769 GEN_VEXT_VV(vremu_vv_w, 4)
1770 GEN_VEXT_VV(vremu_vv_d, 8)
1771 GEN_VEXT_VV(vrem_vv_b, 1)
1772 GEN_VEXT_VV(vrem_vv_h, 2)
1773 GEN_VEXT_VV(vrem_vv_w, 4)
1774 GEN_VEXT_VV(vrem_vv_d, 8)
1775 
1776 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1777 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1778 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1779 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1780 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1781 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1782 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1783 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1784 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1785 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1786 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1787 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1788 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1789 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1790 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1791 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1792 GEN_VEXT_VX(vdivu_vx_b, 1)
1793 GEN_VEXT_VX(vdivu_vx_h, 2)
1794 GEN_VEXT_VX(vdivu_vx_w, 4)
1795 GEN_VEXT_VX(vdivu_vx_d, 8)
1796 GEN_VEXT_VX(vdiv_vx_b, 1)
1797 GEN_VEXT_VX(vdiv_vx_h, 2)
1798 GEN_VEXT_VX(vdiv_vx_w, 4)
1799 GEN_VEXT_VX(vdiv_vx_d, 8)
1800 GEN_VEXT_VX(vremu_vx_b, 1)
1801 GEN_VEXT_VX(vremu_vx_h, 2)
1802 GEN_VEXT_VX(vremu_vx_w, 4)
1803 GEN_VEXT_VX(vremu_vx_d, 8)
1804 GEN_VEXT_VX(vrem_vx_b, 1)
1805 GEN_VEXT_VX(vrem_vx_h, 2)
1806 GEN_VEXT_VX(vrem_vx_w, 4)
1807 GEN_VEXT_VX(vrem_vx_d, 8)
1808 
1809 /* Vector Widening Integer Multiply Instructions */
1810 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1811 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1812 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1813 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1814 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1815 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1816 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1817 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1818 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1819 GEN_VEXT_VV(vwmul_vv_b, 2)
1820 GEN_VEXT_VV(vwmul_vv_h, 4)
1821 GEN_VEXT_VV(vwmul_vv_w, 8)
1822 GEN_VEXT_VV(vwmulu_vv_b, 2)
1823 GEN_VEXT_VV(vwmulu_vv_h, 4)
1824 GEN_VEXT_VV(vwmulu_vv_w, 8)
1825 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1826 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1827 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1828 
1829 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1830 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1831 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1832 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1833 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1834 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1835 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1836 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1837 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1838 GEN_VEXT_VX(vwmul_vx_b, 2)
1839 GEN_VEXT_VX(vwmul_vx_h, 4)
1840 GEN_VEXT_VX(vwmul_vx_w, 8)
1841 GEN_VEXT_VX(vwmulu_vx_b, 2)
1842 GEN_VEXT_VX(vwmulu_vx_h, 4)
1843 GEN_VEXT_VX(vwmulu_vx_w, 8)
1844 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1845 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1846 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1847 
1848 /* Vector Single-Width Integer Multiply-Add Instructions */
1849 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
1850 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1851 {                                                                  \
1852     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1853     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1854     TD d = *((TD *)vd + HD(i));                                    \
1855     *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1856 }
1857 
1858 #define DO_MACC(N, M, D) (M * N + D)
1859 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1860 #define DO_MADD(N, M, D) (M * D + N)
1861 #define DO_NMSUB(N, M, D) (-(M * D) + N)
1862 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1863 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1864 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1865 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1866 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1867 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1868 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1869 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1870 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1871 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1872 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1873 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1874 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1875 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1876 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1877 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1878 GEN_VEXT_VV(vmacc_vv_b, 1)
1879 GEN_VEXT_VV(vmacc_vv_h, 2)
1880 GEN_VEXT_VV(vmacc_vv_w, 4)
1881 GEN_VEXT_VV(vmacc_vv_d, 8)
1882 GEN_VEXT_VV(vnmsac_vv_b, 1)
1883 GEN_VEXT_VV(vnmsac_vv_h, 2)
1884 GEN_VEXT_VV(vnmsac_vv_w, 4)
1885 GEN_VEXT_VV(vnmsac_vv_d, 8)
1886 GEN_VEXT_VV(vmadd_vv_b, 1)
1887 GEN_VEXT_VV(vmadd_vv_h, 2)
1888 GEN_VEXT_VV(vmadd_vv_w, 4)
1889 GEN_VEXT_VV(vmadd_vv_d, 8)
1890 GEN_VEXT_VV(vnmsub_vv_b, 1)
1891 GEN_VEXT_VV(vnmsub_vv_h, 2)
1892 GEN_VEXT_VV(vnmsub_vv_w, 4)
1893 GEN_VEXT_VV(vnmsub_vv_d, 8)
1894 
1895 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1896 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1897 {                                                                   \
1898     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1899     TD d = *((TD *)vd + HD(i));                                     \
1900     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1901 }
1902 
1903 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1904 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1905 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1906 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1907 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1908 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1909 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1910 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1911 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1912 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1913 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1914 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1915 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1916 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1917 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1918 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1919 GEN_VEXT_VX(vmacc_vx_b, 1)
1920 GEN_VEXT_VX(vmacc_vx_h, 2)
1921 GEN_VEXT_VX(vmacc_vx_w, 4)
1922 GEN_VEXT_VX(vmacc_vx_d, 8)
1923 GEN_VEXT_VX(vnmsac_vx_b, 1)
1924 GEN_VEXT_VX(vnmsac_vx_h, 2)
1925 GEN_VEXT_VX(vnmsac_vx_w, 4)
1926 GEN_VEXT_VX(vnmsac_vx_d, 8)
1927 GEN_VEXT_VX(vmadd_vx_b, 1)
1928 GEN_VEXT_VX(vmadd_vx_h, 2)
1929 GEN_VEXT_VX(vmadd_vx_w, 4)
1930 GEN_VEXT_VX(vmadd_vx_d, 8)
1931 GEN_VEXT_VX(vnmsub_vx_b, 1)
1932 GEN_VEXT_VX(vnmsub_vx_h, 2)
1933 GEN_VEXT_VX(vnmsub_vx_w, 4)
1934 GEN_VEXT_VX(vnmsub_vx_d, 8)
1935 
1936 /* Vector Widening Integer Multiply-Add Instructions */
1937 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
1938 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
1939 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
1940 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
1941 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
1942 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
1943 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
1944 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
1945 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
1946 GEN_VEXT_VV(vwmaccu_vv_b, 2)
1947 GEN_VEXT_VV(vwmaccu_vv_h, 4)
1948 GEN_VEXT_VV(vwmaccu_vv_w, 8)
1949 GEN_VEXT_VV(vwmacc_vv_b, 2)
1950 GEN_VEXT_VV(vwmacc_vv_h, 4)
1951 GEN_VEXT_VV(vwmacc_vv_w, 8)
1952 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
1953 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
1954 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
1955 
1956 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
1957 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
1958 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
1959 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
1960 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
1961 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
1962 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
1963 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
1964 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
1965 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
1966 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
1967 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
1968 GEN_VEXT_VX(vwmaccu_vx_b, 2)
1969 GEN_VEXT_VX(vwmaccu_vx_h, 4)
1970 GEN_VEXT_VX(vwmaccu_vx_w, 8)
1971 GEN_VEXT_VX(vwmacc_vx_b, 2)
1972 GEN_VEXT_VX(vwmacc_vx_h, 4)
1973 GEN_VEXT_VX(vwmacc_vx_w, 8)
1974 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
1975 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
1976 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
1977 GEN_VEXT_VX(vwmaccus_vx_b, 2)
1978 GEN_VEXT_VX(vwmaccus_vx_h, 4)
1979 GEN_VEXT_VX(vwmaccus_vx_w, 8)
1980 
1981 /* Vector Integer Merge and Move Instructions */
1982 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
1983 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
1984                   uint32_t desc)                                     \
1985 {                                                                    \
1986     uint32_t vl = env->vl;                                           \
1987     uint32_t esz = sizeof(ETYPE);                                    \
1988     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1989     uint32_t vta = vext_vta(desc);                                   \
1990     uint32_t i;                                                      \
1991                                                                      \
1992     for (i = env->vstart; i < vl; i++) {                             \
1993         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
1994         *((ETYPE *)vd + H(i)) = s1;                                  \
1995     }                                                                \
1996     env->vstart = 0;                                                 \
1997     /* set tail elements to 1s */                                    \
1998     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
1999 }
2000 
2001 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
2002 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
2003 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
2004 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
2005 
2006 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
2007 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
2008                   uint32_t desc)                                     \
2009 {                                                                    \
2010     uint32_t vl = env->vl;                                           \
2011     uint32_t esz = sizeof(ETYPE);                                    \
2012     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2013     uint32_t vta = vext_vta(desc);                                   \
2014     uint32_t i;                                                      \
2015                                                                      \
2016     for (i = env->vstart; i < vl; i++) {                             \
2017         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
2018     }                                                                \
2019     env->vstart = 0;                                                 \
2020     /* set tail elements to 1s */                                    \
2021     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2022 }
2023 
2024 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
2025 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2026 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2027 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2028 
2029 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
2030 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
2031                   CPURISCVState *env, uint32_t desc)                 \
2032 {                                                                    \
2033     uint32_t vl = env->vl;                                           \
2034     uint32_t esz = sizeof(ETYPE);                                    \
2035     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2036     uint32_t vta = vext_vta(desc);                                   \
2037     uint32_t i;                                                      \
2038                                                                      \
2039     for (i = env->vstart; i < vl; i++) {                             \
2040         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
2041         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
2042     }                                                                \
2043     env->vstart = 0;                                                 \
2044     /* set tail elements to 1s */                                    \
2045     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2046 }
2047 
2048 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
2049 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2050 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2051 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2052 
2053 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
2054 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
2055                   void *vs2, CPURISCVState *env, uint32_t desc)      \
2056 {                                                                    \
2057     uint32_t vl = env->vl;                                           \
2058     uint32_t esz = sizeof(ETYPE);                                    \
2059     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2060     uint32_t vta = vext_vta(desc);                                   \
2061     uint32_t i;                                                      \
2062                                                                      \
2063     for (i = env->vstart; i < vl; i++) {                             \
2064         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
2065         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
2066                    (ETYPE)(target_long)s1);                          \
2067         *((ETYPE *)vd + H(i)) = d;                                   \
2068     }                                                                \
2069     env->vstart = 0;                                                 \
2070     /* set tail elements to 1s */                                    \
2071     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2072 }
2073 
2074 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
2075 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2076 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2077 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2078 
2079 /*
2080  * Vector Fixed-Point Arithmetic Instructions
2081  */
2082 
2083 /* Vector Single-Width Saturating Add and Subtract */
2084 
2085 /*
2086  * As fixed point instructions probably have round mode and saturation,
2087  * define common macros for fixed point here.
2088  */
2089 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2090                           CPURISCVState *env, int vxrm);
2091 
2092 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
2093 static inline void                                                  \
2094 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
2095           CPURISCVState *env, int vxrm)                             \
2096 {                                                                   \
2097     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
2098     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2099     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
2100 }
2101 
2102 static inline void
2103 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2104              CPURISCVState *env,
2105              uint32_t vl, uint32_t vm, int vxrm,
2106              opivv2_rm_fn *fn, uint32_t vma, uint32_t esz)
2107 {
2108     for (uint32_t i = env->vstart; i < vl; i++) {
2109         if (!vm && !vext_elem_mask(v0, i)) {
2110             /* set masked-off elements to 1s */
2111             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2112             continue;
2113         }
2114         fn(vd, vs1, vs2, i, env, vxrm);
2115     }
2116     env->vstart = 0;
2117 }
2118 
2119 static inline void
2120 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2121              CPURISCVState *env,
2122              uint32_t desc,
2123              opivv2_rm_fn *fn, uint32_t esz)
2124 {
2125     uint32_t vm = vext_vm(desc);
2126     uint32_t vl = env->vl;
2127     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2128     uint32_t vta = vext_vta(desc);
2129     uint32_t vma = vext_vma(desc);
2130 
2131     switch (env->vxrm) {
2132     case 0: /* rnu */
2133         vext_vv_rm_1(vd, v0, vs1, vs2,
2134                      env, vl, vm, 0, fn, vma, esz);
2135         break;
2136     case 1: /* rne */
2137         vext_vv_rm_1(vd, v0, vs1, vs2,
2138                      env, vl, vm, 1, fn, vma, esz);
2139         break;
2140     case 2: /* rdn */
2141         vext_vv_rm_1(vd, v0, vs1, vs2,
2142                      env, vl, vm, 2, fn, vma, esz);
2143         break;
2144     default: /* rod */
2145         vext_vv_rm_1(vd, v0, vs1, vs2,
2146                      env, vl, vm, 3, fn, vma, esz);
2147         break;
2148     }
2149     /* set tail elements to 1s */
2150     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2151 }
2152 
2153 /* generate helpers for fixed point instructions with OPIVV format */
2154 #define GEN_VEXT_VV_RM(NAME, ESZ)                               \
2155 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
2156                   CPURISCVState *env, uint32_t desc)            \
2157 {                                                               \
2158     vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
2159                  do_##NAME, ESZ);                               \
2160 }
2161 
2162 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a,
2163                              uint8_t b)
2164 {
2165     uint8_t res = a + b;
2166     if (res < a) {
2167         res = UINT8_MAX;
2168         env->vxsat = 0x1;
2169     }
2170     return res;
2171 }
2172 
2173 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2174                                uint16_t b)
2175 {
2176     uint16_t res = a + b;
2177     if (res < a) {
2178         res = UINT16_MAX;
2179         env->vxsat = 0x1;
2180     }
2181     return res;
2182 }
2183 
2184 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2185                                uint32_t b)
2186 {
2187     uint32_t res = a + b;
2188     if (res < a) {
2189         res = UINT32_MAX;
2190         env->vxsat = 0x1;
2191     }
2192     return res;
2193 }
2194 
2195 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2196                                uint64_t b)
2197 {
2198     uint64_t res = a + b;
2199     if (res < a) {
2200         res = UINT64_MAX;
2201         env->vxsat = 0x1;
2202     }
2203     return res;
2204 }
2205 
2206 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2207 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2208 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2209 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2210 GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2211 GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2212 GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2213 GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2214 
2215 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2216                           CPURISCVState *env, int vxrm);
2217 
2218 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2219 static inline void                                                  \
2220 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2221           CPURISCVState *env, int vxrm)                             \
2222 {                                                                   \
2223     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2224     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2225 }
2226 
2227 static inline void
2228 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2229              CPURISCVState *env,
2230              uint32_t vl, uint32_t vm, int vxrm,
2231              opivx2_rm_fn *fn, uint32_t vma, uint32_t esz)
2232 {
2233     for (uint32_t i = env->vstart; i < vl; i++) {
2234         if (!vm && !vext_elem_mask(v0, i)) {
2235             /* set masked-off elements to 1s */
2236             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2237             continue;
2238         }
2239         fn(vd, s1, vs2, i, env, vxrm);
2240     }
2241     env->vstart = 0;
2242 }
2243 
2244 static inline void
2245 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2246              CPURISCVState *env,
2247              uint32_t desc,
2248              opivx2_rm_fn *fn, uint32_t esz)
2249 {
2250     uint32_t vm = vext_vm(desc);
2251     uint32_t vl = env->vl;
2252     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2253     uint32_t vta = vext_vta(desc);
2254     uint32_t vma = vext_vma(desc);
2255 
2256     switch (env->vxrm) {
2257     case 0: /* rnu */
2258         vext_vx_rm_1(vd, v0, s1, vs2,
2259                      env, vl, vm, 0, fn, vma, esz);
2260         break;
2261     case 1: /* rne */
2262         vext_vx_rm_1(vd, v0, s1, vs2,
2263                      env, vl, vm, 1, fn, vma, esz);
2264         break;
2265     case 2: /* rdn */
2266         vext_vx_rm_1(vd, v0, s1, vs2,
2267                      env, vl, vm, 2, fn, vma, esz);
2268         break;
2269     default: /* rod */
2270         vext_vx_rm_1(vd, v0, s1, vs2,
2271                      env, vl, vm, 3, fn, vma, esz);
2272         break;
2273     }
2274     /* set tail elements to 1s */
2275     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2276 }
2277 
2278 /* generate helpers for fixed point instructions with OPIVX format */
2279 #define GEN_VEXT_VX_RM(NAME, ESZ)                         \
2280 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2281                   void *vs2, CPURISCVState *env,          \
2282                   uint32_t desc)                          \
2283 {                                                         \
2284     vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2285                  do_##NAME, ESZ);                         \
2286 }
2287 
2288 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2289 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2290 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2291 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2292 GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
2293 GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
2294 GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
2295 GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2296 
2297 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2298 {
2299     int8_t res = a + b;
2300     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2301         res = a > 0 ? INT8_MAX : INT8_MIN;
2302         env->vxsat = 0x1;
2303     }
2304     return res;
2305 }
2306 
2307 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a,
2308                              int16_t b)
2309 {
2310     int16_t res = a + b;
2311     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2312         res = a > 0 ? INT16_MAX : INT16_MIN;
2313         env->vxsat = 0x1;
2314     }
2315     return res;
2316 }
2317 
2318 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a,
2319                              int32_t b)
2320 {
2321     int32_t res = a + b;
2322     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2323         res = a > 0 ? INT32_MAX : INT32_MIN;
2324         env->vxsat = 0x1;
2325     }
2326     return res;
2327 }
2328 
2329 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a,
2330                              int64_t b)
2331 {
2332     int64_t res = a + b;
2333     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2334         res = a > 0 ? INT64_MAX : INT64_MIN;
2335         env->vxsat = 0x1;
2336     }
2337     return res;
2338 }
2339 
2340 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2341 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2342 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2343 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2344 GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2345 GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2346 GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2347 GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2348 
2349 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2350 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2351 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2352 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2353 GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2354 GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2355 GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2356 GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2357 
2358 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a,
2359                              uint8_t b)
2360 {
2361     uint8_t res = a - b;
2362     if (res > a) {
2363         res = 0;
2364         env->vxsat = 0x1;
2365     }
2366     return res;
2367 }
2368 
2369 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2370                                uint16_t b)
2371 {
2372     uint16_t res = a - b;
2373     if (res > a) {
2374         res = 0;
2375         env->vxsat = 0x1;
2376     }
2377     return res;
2378 }
2379 
2380 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2381                                uint32_t b)
2382 {
2383     uint32_t res = a - b;
2384     if (res > a) {
2385         res = 0;
2386         env->vxsat = 0x1;
2387     }
2388     return res;
2389 }
2390 
2391 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2392                                uint64_t b)
2393 {
2394     uint64_t res = a - b;
2395     if (res > a) {
2396         res = 0;
2397         env->vxsat = 0x1;
2398     }
2399     return res;
2400 }
2401 
2402 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2403 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2404 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2405 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2406 GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2407 GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2408 GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2409 GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2410 
2411 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2412 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2413 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2414 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2415 GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2416 GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2417 GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2418 GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2419 
2420 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2421 {
2422     int8_t res = a - b;
2423     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2424         res = a >= 0 ? INT8_MAX : INT8_MIN;
2425         env->vxsat = 0x1;
2426     }
2427     return res;
2428 }
2429 
2430 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a,
2431                              int16_t b)
2432 {
2433     int16_t res = a - b;
2434     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2435         res = a >= 0 ? INT16_MAX : INT16_MIN;
2436         env->vxsat = 0x1;
2437     }
2438     return res;
2439 }
2440 
2441 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a,
2442                              int32_t b)
2443 {
2444     int32_t res = a - b;
2445     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2446         res = a >= 0 ? INT32_MAX : INT32_MIN;
2447         env->vxsat = 0x1;
2448     }
2449     return res;
2450 }
2451 
2452 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a,
2453                              int64_t b)
2454 {
2455     int64_t res = a - b;
2456     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2457         res = a >= 0 ? INT64_MAX : INT64_MIN;
2458         env->vxsat = 0x1;
2459     }
2460     return res;
2461 }
2462 
2463 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2464 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2465 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2466 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2467 GEN_VEXT_VV_RM(vssub_vv_b, 1)
2468 GEN_VEXT_VV_RM(vssub_vv_h, 2)
2469 GEN_VEXT_VV_RM(vssub_vv_w, 4)
2470 GEN_VEXT_VV_RM(vssub_vv_d, 8)
2471 
2472 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2473 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2474 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2475 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2476 GEN_VEXT_VX_RM(vssub_vx_b, 1)
2477 GEN_VEXT_VX_RM(vssub_vx_h, 2)
2478 GEN_VEXT_VX_RM(vssub_vx_w, 4)
2479 GEN_VEXT_VX_RM(vssub_vx_d, 8)
2480 
2481 /* Vector Single-Width Averaging Add and Subtract */
2482 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2483 {
2484     uint8_t d = extract64(v, shift, 1);
2485     uint8_t d1;
2486     uint64_t D1, D2;
2487 
2488     if (shift == 0 || shift > 64) {
2489         return 0;
2490     }
2491 
2492     d1 = extract64(v, shift - 1, 1);
2493     D1 = extract64(v, 0, shift);
2494     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2495         return d1;
2496     } else if (vxrm == 1) { /* round-to-nearest-even */
2497         if (shift > 1) {
2498             D2 = extract64(v, 0, shift - 1);
2499             return d1 & ((D2 != 0) | d);
2500         } else {
2501             return d1 & d;
2502         }
2503     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2504         return !d & (D1 != 0);
2505     }
2506     return 0; /* round-down (truncate) */
2507 }
2508 
2509 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a,
2510                              int32_t b)
2511 {
2512     int64_t res = (int64_t)a + b;
2513     uint8_t round = get_round(vxrm, res, 1);
2514 
2515     return (res >> 1) + round;
2516 }
2517 
2518 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a,
2519                              int64_t b)
2520 {
2521     int64_t res = a + b;
2522     uint8_t round = get_round(vxrm, res, 1);
2523     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2524 
2525     /* With signed overflow, bit 64 is inverse of bit 63. */
2526     return ((res >> 1) ^ over) + round;
2527 }
2528 
2529 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2530 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2531 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2532 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2533 GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2534 GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2535 GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2536 GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2537 
2538 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2539 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2540 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2541 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2542 GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2543 GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2544 GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2545 GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2546 
2547 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2548                                uint32_t a, uint32_t b)
2549 {
2550     uint64_t res = (uint64_t)a + b;
2551     uint8_t round = get_round(vxrm, res, 1);
2552 
2553     return (res >> 1) + round;
2554 }
2555 
2556 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2557                                uint64_t a, uint64_t b)
2558 {
2559     uint64_t res = a + b;
2560     uint8_t round = get_round(vxrm, res, 1);
2561     uint64_t over = (uint64_t)(res < a) << 63;
2562 
2563     return ((res >> 1) | over) + round;
2564 }
2565 
2566 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2567 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2568 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2569 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2570 GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2571 GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2572 GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2573 GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2574 
2575 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2576 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2577 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2578 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2579 GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2580 GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2581 GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2582 GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2583 
2584 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a,
2585                              int32_t b)
2586 {
2587     int64_t res = (int64_t)a - b;
2588     uint8_t round = get_round(vxrm, res, 1);
2589 
2590     return (res >> 1) + round;
2591 }
2592 
2593 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a,
2594                              int64_t b)
2595 {
2596     int64_t res = (int64_t)a - b;
2597     uint8_t round = get_round(vxrm, res, 1);
2598     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2599 
2600     /* With signed overflow, bit 64 is inverse of bit 63. */
2601     return ((res >> 1) ^ over) + round;
2602 }
2603 
2604 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2605 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2606 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2607 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2608 GEN_VEXT_VV_RM(vasub_vv_b, 1)
2609 GEN_VEXT_VV_RM(vasub_vv_h, 2)
2610 GEN_VEXT_VV_RM(vasub_vv_w, 4)
2611 GEN_VEXT_VV_RM(vasub_vv_d, 8)
2612 
2613 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2614 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2615 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2616 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2617 GEN_VEXT_VX_RM(vasub_vx_b, 1)
2618 GEN_VEXT_VX_RM(vasub_vx_h, 2)
2619 GEN_VEXT_VX_RM(vasub_vx_w, 4)
2620 GEN_VEXT_VX_RM(vasub_vx_d, 8)
2621 
2622 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2623                                uint32_t a, uint32_t b)
2624 {
2625     int64_t res = (int64_t)a - b;
2626     uint8_t round = get_round(vxrm, res, 1);
2627 
2628     return (res >> 1) + round;
2629 }
2630 
2631 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2632                                uint64_t a, uint64_t b)
2633 {
2634     uint64_t res = (uint64_t)a - b;
2635     uint8_t round = get_round(vxrm, res, 1);
2636     uint64_t over = (uint64_t)(res > a) << 63;
2637 
2638     return ((res >> 1) | over) + round;
2639 }
2640 
2641 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2642 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2643 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2644 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2645 GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2646 GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2647 GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2648 GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2649 
2650 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2651 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2652 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2653 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2654 GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2655 GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2656 GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2657 GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2658 
2659 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2660 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2661 {
2662     uint8_t round;
2663     int16_t res;
2664 
2665     res = (int16_t)a * (int16_t)b;
2666     round = get_round(vxrm, res, 7);
2667     res = (res >> 7) + round;
2668 
2669     if (res > INT8_MAX) {
2670         env->vxsat = 0x1;
2671         return INT8_MAX;
2672     } else if (res < INT8_MIN) {
2673         env->vxsat = 0x1;
2674         return INT8_MIN;
2675     } else {
2676         return res;
2677     }
2678 }
2679 
2680 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2681 {
2682     uint8_t round;
2683     int32_t res;
2684 
2685     res = (int32_t)a * (int32_t)b;
2686     round = get_round(vxrm, res, 15);
2687     res = (res >> 15) + round;
2688 
2689     if (res > INT16_MAX) {
2690         env->vxsat = 0x1;
2691         return INT16_MAX;
2692     } else if (res < INT16_MIN) {
2693         env->vxsat = 0x1;
2694         return INT16_MIN;
2695     } else {
2696         return res;
2697     }
2698 }
2699 
2700 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2701 {
2702     uint8_t round;
2703     int64_t res;
2704 
2705     res = (int64_t)a * (int64_t)b;
2706     round = get_round(vxrm, res, 31);
2707     res = (res >> 31) + round;
2708 
2709     if (res > INT32_MAX) {
2710         env->vxsat = 0x1;
2711         return INT32_MAX;
2712     } else if (res < INT32_MIN) {
2713         env->vxsat = 0x1;
2714         return INT32_MIN;
2715     } else {
2716         return res;
2717     }
2718 }
2719 
2720 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2721 {
2722     uint8_t round;
2723     uint64_t hi_64, lo_64;
2724     int64_t res;
2725 
2726     if (a == INT64_MIN && b == INT64_MIN) {
2727         env->vxsat = 1;
2728         return INT64_MAX;
2729     }
2730 
2731     muls64(&lo_64, &hi_64, a, b);
2732     round = get_round(vxrm, lo_64, 63);
2733     /*
2734      * Cannot overflow, as there are always
2735      * 2 sign bits after multiply.
2736      */
2737     res = (hi_64 << 1) | (lo_64 >> 63);
2738     if (round) {
2739         if (res == INT64_MAX) {
2740             env->vxsat = 1;
2741         } else {
2742             res += 1;
2743         }
2744     }
2745     return res;
2746 }
2747 
2748 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2749 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2750 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2751 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2752 GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2753 GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2754 GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2755 GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2756 
2757 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2758 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2759 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2760 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2761 GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2762 GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2763 GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2764 GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2765 
2766 /* Vector Single-Width Scaling Shift Instructions */
2767 static inline uint8_t
2768 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2769 {
2770     uint8_t round, shift = b & 0x7;
2771     uint8_t res;
2772 
2773     round = get_round(vxrm, a, shift);
2774     res = (a >> shift) + round;
2775     return res;
2776 }
2777 static inline uint16_t
2778 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2779 {
2780     uint8_t round, shift = b & 0xf;
2781 
2782     round = get_round(vxrm, a, shift);
2783     return (a >> shift) + round;
2784 }
2785 static inline uint32_t
2786 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2787 {
2788     uint8_t round, shift = b & 0x1f;
2789 
2790     round = get_round(vxrm, a, shift);
2791     return (a >> shift) + round;
2792 }
2793 static inline uint64_t
2794 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2795 {
2796     uint8_t round, shift = b & 0x3f;
2797 
2798     round = get_round(vxrm, a, shift);
2799     return (a >> shift) + round;
2800 }
2801 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2802 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2803 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2804 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2805 GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2806 GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2807 GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2808 GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2809 
2810 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2811 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2812 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2813 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2814 GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2815 GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2816 GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2817 GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2818 
2819 static inline int8_t
2820 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2821 {
2822     uint8_t round, shift = b & 0x7;
2823 
2824     round = get_round(vxrm, a, shift);
2825     return (a >> shift) + round;
2826 }
2827 static inline int16_t
2828 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2829 {
2830     uint8_t round, shift = b & 0xf;
2831 
2832     round = get_round(vxrm, a, shift);
2833     return (a >> shift) + round;
2834 }
2835 static inline int32_t
2836 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2837 {
2838     uint8_t round, shift = b & 0x1f;
2839 
2840     round = get_round(vxrm, a, shift);
2841     return (a >> shift) + round;
2842 }
2843 static inline int64_t
2844 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2845 {
2846     uint8_t round, shift = b & 0x3f;
2847 
2848     round = get_round(vxrm, a, shift);
2849     return (a >> shift) + round;
2850 }
2851 
2852 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2853 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2854 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2855 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2856 GEN_VEXT_VV_RM(vssra_vv_b, 1)
2857 GEN_VEXT_VV_RM(vssra_vv_h, 2)
2858 GEN_VEXT_VV_RM(vssra_vv_w, 4)
2859 GEN_VEXT_VV_RM(vssra_vv_d, 8)
2860 
2861 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2862 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2863 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2864 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2865 GEN_VEXT_VX_RM(vssra_vx_b, 1)
2866 GEN_VEXT_VX_RM(vssra_vx_h, 2)
2867 GEN_VEXT_VX_RM(vssra_vx_w, 4)
2868 GEN_VEXT_VX_RM(vssra_vx_d, 8)
2869 
2870 /* Vector Narrowing Fixed-Point Clip Instructions */
2871 static inline int8_t
2872 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2873 {
2874     uint8_t round, shift = b & 0xf;
2875     int16_t res;
2876 
2877     round = get_round(vxrm, a, shift);
2878     res = (a >> shift) + round;
2879     if (res > INT8_MAX) {
2880         env->vxsat = 0x1;
2881         return INT8_MAX;
2882     } else if (res < INT8_MIN) {
2883         env->vxsat = 0x1;
2884         return INT8_MIN;
2885     } else {
2886         return res;
2887     }
2888 }
2889 
2890 static inline int16_t
2891 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2892 {
2893     uint8_t round, shift = b & 0x1f;
2894     int32_t res;
2895 
2896     round = get_round(vxrm, a, shift);
2897     res = (a >> shift) + round;
2898     if (res > INT16_MAX) {
2899         env->vxsat = 0x1;
2900         return INT16_MAX;
2901     } else if (res < INT16_MIN) {
2902         env->vxsat = 0x1;
2903         return INT16_MIN;
2904     } else {
2905         return res;
2906     }
2907 }
2908 
2909 static inline int32_t
2910 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2911 {
2912     uint8_t round, shift = b & 0x3f;
2913     int64_t res;
2914 
2915     round = get_round(vxrm, a, shift);
2916     res = (a >> shift) + round;
2917     if (res > INT32_MAX) {
2918         env->vxsat = 0x1;
2919         return INT32_MAX;
2920     } else if (res < INT32_MIN) {
2921         env->vxsat = 0x1;
2922         return INT32_MIN;
2923     } else {
2924         return res;
2925     }
2926 }
2927 
2928 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
2929 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
2930 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
2931 GEN_VEXT_VV_RM(vnclip_wv_b, 1)
2932 GEN_VEXT_VV_RM(vnclip_wv_h, 2)
2933 GEN_VEXT_VV_RM(vnclip_wv_w, 4)
2934 
2935 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
2936 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
2937 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
2938 GEN_VEXT_VX_RM(vnclip_wx_b, 1)
2939 GEN_VEXT_VX_RM(vnclip_wx_h, 2)
2940 GEN_VEXT_VX_RM(vnclip_wx_w, 4)
2941 
2942 static inline uint8_t
2943 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2944 {
2945     uint8_t round, shift = b & 0xf;
2946     uint16_t res;
2947 
2948     round = get_round(vxrm, a, shift);
2949     res = (a >> shift) + round;
2950     if (res > UINT8_MAX) {
2951         env->vxsat = 0x1;
2952         return UINT8_MAX;
2953     } else {
2954         return res;
2955     }
2956 }
2957 
2958 static inline uint16_t
2959 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
2960 {
2961     uint8_t round, shift = b & 0x1f;
2962     uint32_t res;
2963 
2964     round = get_round(vxrm, a, shift);
2965     res = (a >> shift) + round;
2966     if (res > UINT16_MAX) {
2967         env->vxsat = 0x1;
2968         return UINT16_MAX;
2969     } else {
2970         return res;
2971     }
2972 }
2973 
2974 static inline uint32_t
2975 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
2976 {
2977     uint8_t round, shift = b & 0x3f;
2978     uint64_t res;
2979 
2980     round = get_round(vxrm, a, shift);
2981     res = (a >> shift) + round;
2982     if (res > UINT32_MAX) {
2983         env->vxsat = 0x1;
2984         return UINT32_MAX;
2985     } else {
2986         return res;
2987     }
2988 }
2989 
2990 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
2991 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
2992 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
2993 GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
2994 GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
2995 GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
2996 
2997 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
2998 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
2999 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
3000 GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
3001 GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
3002 GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
3003 
3004 /*
3005  * Vector Float Point Arithmetic Instructions
3006  */
3007 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
3008 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
3009 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
3010                       CPURISCVState *env)                      \
3011 {                                                              \
3012     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
3013     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3014     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
3015 }
3016 
3017 #define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
3018 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
3019                   void *vs2, CPURISCVState *env,          \
3020                   uint32_t desc)                          \
3021 {                                                         \
3022     uint32_t vm = vext_vm(desc);                          \
3023     uint32_t vl = env->vl;                                \
3024     uint32_t total_elems =                                \
3025         vext_get_total_elems(env, desc, ESZ);             \
3026     uint32_t vta = vext_vta(desc);                        \
3027     uint32_t vma = vext_vma(desc);                        \
3028     uint32_t i;                                           \
3029                                                           \
3030     for (i = env->vstart; i < vl; i++) {                  \
3031         if (!vm && !vext_elem_mask(v0, i)) {              \
3032             /* set masked-off elements to 1s */           \
3033             vext_set_elems_1s(vd, vma, i * ESZ,           \
3034                               (i + 1) * ESZ);             \
3035             continue;                                     \
3036         }                                                 \
3037         do_##NAME(vd, vs1, vs2, i, env);                  \
3038     }                                                     \
3039     env->vstart = 0;                                      \
3040     /* set tail elements to 1s */                         \
3041     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3042                       total_elems * ESZ);                 \
3043 }
3044 
3045 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
3046 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
3047 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
3048 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
3049 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
3050 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
3051 
3052 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
3053 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
3054                       CPURISCVState *env)                      \
3055 {                                                              \
3056     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3057     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
3058 }
3059 
3060 #define GEN_VEXT_VF(NAME, ESZ)                            \
3061 void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
3062                   void *vs2, CPURISCVState *env,          \
3063                   uint32_t desc)                          \
3064 {                                                         \
3065     uint32_t vm = vext_vm(desc);                          \
3066     uint32_t vl = env->vl;                                \
3067     uint32_t total_elems =                                \
3068         vext_get_total_elems(env, desc, ESZ);             \
3069     uint32_t vta = vext_vta(desc);                        \
3070     uint32_t vma = vext_vma(desc);                        \
3071     uint32_t i;                                           \
3072                                                           \
3073     for (i = env->vstart; i < vl; i++) {                  \
3074         if (!vm && !vext_elem_mask(v0, i)) {              \
3075             /* set masked-off elements to 1s */           \
3076             vext_set_elems_1s(vd, vma, i * ESZ,           \
3077                               (i + 1) * ESZ);             \
3078             continue;                                     \
3079         }                                                 \
3080         do_##NAME(vd, s1, vs2, i, env);                   \
3081     }                                                     \
3082     env->vstart = 0;                                      \
3083     /* set tail elements to 1s */                         \
3084     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3085                       total_elems * ESZ);                 \
3086 }
3087 
3088 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3089 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3090 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3091 GEN_VEXT_VF(vfadd_vf_h, 2)
3092 GEN_VEXT_VF(vfadd_vf_w, 4)
3093 GEN_VEXT_VF(vfadd_vf_d, 8)
3094 
3095 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3096 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3097 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3098 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
3099 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
3100 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
3101 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3102 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3103 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3104 GEN_VEXT_VF(vfsub_vf_h, 2)
3105 GEN_VEXT_VF(vfsub_vf_w, 4)
3106 GEN_VEXT_VF(vfsub_vf_d, 8)
3107 
3108 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3109 {
3110     return float16_sub(b, a, s);
3111 }
3112 
3113 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3114 {
3115     return float32_sub(b, a, s);
3116 }
3117 
3118 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3119 {
3120     return float64_sub(b, a, s);
3121 }
3122 
3123 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3124 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3125 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3126 GEN_VEXT_VF(vfrsub_vf_h, 2)
3127 GEN_VEXT_VF(vfrsub_vf_w, 4)
3128 GEN_VEXT_VF(vfrsub_vf_d, 8)
3129 
3130 /* Vector Widening Floating-Point Add/Subtract Instructions */
3131 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3132 {
3133     return float32_add(float16_to_float32(a, true, s),
3134                        float16_to_float32(b, true, s), s);
3135 }
3136 
3137 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3138 {
3139     return float64_add(float32_to_float64(a, s),
3140                        float32_to_float64(b, s), s);
3141 
3142 }
3143 
3144 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3145 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3146 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3147 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3148 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3149 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3150 GEN_VEXT_VF(vfwadd_vf_h, 4)
3151 GEN_VEXT_VF(vfwadd_vf_w, 8)
3152 
3153 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3154 {
3155     return float32_sub(float16_to_float32(a, true, s),
3156                        float16_to_float32(b, true, s), s);
3157 }
3158 
3159 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3160 {
3161     return float64_sub(float32_to_float64(a, s),
3162                        float32_to_float64(b, s), s);
3163 
3164 }
3165 
3166 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3167 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3168 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3169 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3170 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3171 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3172 GEN_VEXT_VF(vfwsub_vf_h, 4)
3173 GEN_VEXT_VF(vfwsub_vf_w, 8)
3174 
3175 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3176 {
3177     return float32_add(a, float16_to_float32(b, true, s), s);
3178 }
3179 
3180 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3181 {
3182     return float64_add(a, float32_to_float64(b, s), s);
3183 }
3184 
3185 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3186 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3187 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3188 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3189 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3190 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3191 GEN_VEXT_VF(vfwadd_wf_h, 4)
3192 GEN_VEXT_VF(vfwadd_wf_w, 8)
3193 
3194 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3195 {
3196     return float32_sub(a, float16_to_float32(b, true, s), s);
3197 }
3198 
3199 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3200 {
3201     return float64_sub(a, float32_to_float64(b, s), s);
3202 }
3203 
3204 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3205 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3206 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3207 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3208 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3209 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3210 GEN_VEXT_VF(vfwsub_wf_h, 4)
3211 GEN_VEXT_VF(vfwsub_wf_w, 8)
3212 
3213 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3214 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3215 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3216 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3217 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3218 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3219 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3220 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3221 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3222 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3223 GEN_VEXT_VF(vfmul_vf_h, 2)
3224 GEN_VEXT_VF(vfmul_vf_w, 4)
3225 GEN_VEXT_VF(vfmul_vf_d, 8)
3226 
3227 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3228 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3229 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3230 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3231 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3232 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3233 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3234 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3235 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3236 GEN_VEXT_VF(vfdiv_vf_h, 2)
3237 GEN_VEXT_VF(vfdiv_vf_w, 4)
3238 GEN_VEXT_VF(vfdiv_vf_d, 8)
3239 
3240 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3241 {
3242     return float16_div(b, a, s);
3243 }
3244 
3245 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3246 {
3247     return float32_div(b, a, s);
3248 }
3249 
3250 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3251 {
3252     return float64_div(b, a, s);
3253 }
3254 
3255 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3256 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3257 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3258 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3259 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3260 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3261 
3262 /* Vector Widening Floating-Point Multiply */
3263 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3264 {
3265     return float32_mul(float16_to_float32(a, true, s),
3266                        float16_to_float32(b, true, s), s);
3267 }
3268 
3269 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3270 {
3271     return float64_mul(float32_to_float64(a, s),
3272                        float32_to_float64(b, s), s);
3273 
3274 }
3275 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3276 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3277 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3278 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3279 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3280 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3281 GEN_VEXT_VF(vfwmul_vf_h, 4)
3282 GEN_VEXT_VF(vfwmul_vf_w, 8)
3283 
3284 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
3285 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3286 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3287                       CPURISCVState *env)                          \
3288 {                                                                  \
3289     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3290     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3291     TD d = *((TD *)vd + HD(i));                                    \
3292     *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3293 }
3294 
3295 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3296 {
3297     return float16_muladd(a, b, d, 0, s);
3298 }
3299 
3300 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3301 {
3302     return float32_muladd(a, b, d, 0, s);
3303 }
3304 
3305 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3306 {
3307     return float64_muladd(a, b, d, 0, s);
3308 }
3309 
3310 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3311 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3312 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3313 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3314 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3315 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3316 
3317 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3318 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3319                       CPURISCVState *env)                         \
3320 {                                                                 \
3321     TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3322     TD d = *((TD *)vd + HD(i));                                   \
3323     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3324 }
3325 
3326 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3327 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3328 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3329 GEN_VEXT_VF(vfmacc_vf_h, 2)
3330 GEN_VEXT_VF(vfmacc_vf_w, 4)
3331 GEN_VEXT_VF(vfmacc_vf_d, 8)
3332 
3333 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3334 {
3335     return float16_muladd(a, b, d, float_muladd_negate_c |
3336                                    float_muladd_negate_product, s);
3337 }
3338 
3339 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3340 {
3341     return float32_muladd(a, b, d, float_muladd_negate_c |
3342                                    float_muladd_negate_product, s);
3343 }
3344 
3345 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3346 {
3347     return float64_muladd(a, b, d, float_muladd_negate_c |
3348                                    float_muladd_negate_product, s);
3349 }
3350 
3351 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3352 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3353 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3354 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3355 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3356 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3357 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3358 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3359 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3360 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3361 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3362 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3363 
3364 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3365 {
3366     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3367 }
3368 
3369 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3370 {
3371     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3372 }
3373 
3374 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3375 {
3376     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3377 }
3378 
3379 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3380 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3381 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3382 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3383 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3384 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3385 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3386 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3387 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3388 GEN_VEXT_VF(vfmsac_vf_h, 2)
3389 GEN_VEXT_VF(vfmsac_vf_w, 4)
3390 GEN_VEXT_VF(vfmsac_vf_d, 8)
3391 
3392 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3393 {
3394     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3395 }
3396 
3397 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3398 {
3399     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3400 }
3401 
3402 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3403 {
3404     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3405 }
3406 
3407 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3408 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3409 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3410 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3411 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3412 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3413 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3414 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3415 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3416 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3417 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3418 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3419 
3420 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3421 {
3422     return float16_muladd(d, b, a, 0, s);
3423 }
3424 
3425 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3426 {
3427     return float32_muladd(d, b, a, 0, s);
3428 }
3429 
3430 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3431 {
3432     return float64_muladd(d, b, a, 0, s);
3433 }
3434 
3435 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3436 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3437 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3438 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3439 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3440 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3441 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3442 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3443 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3444 GEN_VEXT_VF(vfmadd_vf_h, 2)
3445 GEN_VEXT_VF(vfmadd_vf_w, 4)
3446 GEN_VEXT_VF(vfmadd_vf_d, 8)
3447 
3448 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3449 {
3450     return float16_muladd(d, b, a, float_muladd_negate_c |
3451                                    float_muladd_negate_product, s);
3452 }
3453 
3454 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3455 {
3456     return float32_muladd(d, b, a, float_muladd_negate_c |
3457                                    float_muladd_negate_product, s);
3458 }
3459 
3460 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3461 {
3462     return float64_muladd(d, b, a, float_muladd_negate_c |
3463                                    float_muladd_negate_product, s);
3464 }
3465 
3466 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3467 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3468 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3469 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3470 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3471 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3472 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3473 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3474 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3475 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3476 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3477 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3478 
3479 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3480 {
3481     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3482 }
3483 
3484 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3485 {
3486     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3487 }
3488 
3489 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3490 {
3491     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3492 }
3493 
3494 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3495 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3496 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3497 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3498 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3499 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3500 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3501 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3502 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3503 GEN_VEXT_VF(vfmsub_vf_h, 2)
3504 GEN_VEXT_VF(vfmsub_vf_w, 4)
3505 GEN_VEXT_VF(vfmsub_vf_d, 8)
3506 
3507 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3508 {
3509     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3510 }
3511 
3512 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3513 {
3514     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3515 }
3516 
3517 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3518 {
3519     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3520 }
3521 
3522 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3523 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3524 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3525 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3526 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3527 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3528 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3529 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3530 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3531 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3532 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3533 GEN_VEXT_VF(vfnmsub_vf_d, 8)
3534 
3535 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3536 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3537 {
3538     return float32_muladd(float16_to_float32(a, true, s),
3539                           float16_to_float32(b, true, s), d, 0, s);
3540 }
3541 
3542 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3543 {
3544     return float64_muladd(float32_to_float64(a, s),
3545                           float32_to_float64(b, s), d, 0, s);
3546 }
3547 
3548 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3549 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3550 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3551 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3552 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3553 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3554 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3555 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3556 
3557 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3558 {
3559     return float32_muladd(float16_to_float32(a, true, s),
3560                           float16_to_float32(b, true, s), d,
3561                           float_muladd_negate_c | float_muladd_negate_product,
3562                           s);
3563 }
3564 
3565 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3566 {
3567     return float64_muladd(float32_to_float64(a, s), float32_to_float64(b, s),
3568                           d, float_muladd_negate_c |
3569                              float_muladd_negate_product, s);
3570 }
3571 
3572 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3573 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3574 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3575 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3576 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3577 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3578 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3579 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3580 
3581 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3582 {
3583     return float32_muladd(float16_to_float32(a, true, s),
3584                           float16_to_float32(b, true, s), d,
3585                           float_muladd_negate_c, s);
3586 }
3587 
3588 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3589 {
3590     return float64_muladd(float32_to_float64(a, s),
3591                           float32_to_float64(b, s), d,
3592                           float_muladd_negate_c, s);
3593 }
3594 
3595 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3596 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3597 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3598 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3599 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3600 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3601 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3602 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3603 
3604 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3605 {
3606     return float32_muladd(float16_to_float32(a, true, s),
3607                           float16_to_float32(b, true, s), d,
3608                           float_muladd_negate_product, s);
3609 }
3610 
3611 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3612 {
3613     return float64_muladd(float32_to_float64(a, s),
3614                           float32_to_float64(b, s), d,
3615                           float_muladd_negate_product, s);
3616 }
3617 
3618 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3619 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3620 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3621 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3622 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3623 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3624 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3625 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3626 
3627 /* Vector Floating-Point Square-Root Instruction */
3628 /* (TD, T2, TX2) */
3629 #define OP_UU_H uint16_t, uint16_t, uint16_t
3630 #define OP_UU_W uint32_t, uint32_t, uint32_t
3631 #define OP_UU_D uint64_t, uint64_t, uint64_t
3632 
3633 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
3634 static void do_##NAME(void *vd, void *vs2, int i,      \
3635                       CPURISCVState *env)              \
3636 {                                                      \
3637     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3638     *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3639 }
3640 
3641 #define GEN_VEXT_V_ENV(NAME, ESZ)                      \
3642 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3643                   CPURISCVState *env, uint32_t desc)   \
3644 {                                                      \
3645     uint32_t vm = vext_vm(desc);                       \
3646     uint32_t vl = env->vl;                             \
3647     uint32_t total_elems =                             \
3648         vext_get_total_elems(env, desc, ESZ);          \
3649     uint32_t vta = vext_vta(desc);                     \
3650     uint32_t vma = vext_vma(desc);                     \
3651     uint32_t i;                                        \
3652                                                        \
3653     if (vl == 0) {                                     \
3654         return;                                        \
3655     }                                                  \
3656     for (i = env->vstart; i < vl; i++) {               \
3657         if (!vm && !vext_elem_mask(v0, i)) {           \
3658             /* set masked-off elements to 1s */        \
3659             vext_set_elems_1s(vd, vma, i * ESZ,        \
3660                               (i + 1) * ESZ);          \
3661             continue;                                  \
3662         }                                              \
3663         do_##NAME(vd, vs2, i, env);                    \
3664     }                                                  \
3665     env->vstart = 0;                                   \
3666     vext_set_elems_1s(vd, vta, vl * ESZ,               \
3667                       total_elems * ESZ);              \
3668 }
3669 
3670 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3671 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3672 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3673 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3674 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3675 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3676 
3677 /*
3678  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3679  *
3680  * Adapted from riscv-v-spec recip.c:
3681  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3682  */
3683 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3684 {
3685     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3686     uint64_t exp = extract64(f, frac_size, exp_size);
3687     uint64_t frac = extract64(f, 0, frac_size);
3688 
3689     const uint8_t lookup_table[] = {
3690         52, 51, 50, 48, 47, 46, 44, 43,
3691         42, 41, 40, 39, 38, 36, 35, 34,
3692         33, 32, 31, 30, 30, 29, 28, 27,
3693         26, 25, 24, 23, 23, 22, 21, 20,
3694         19, 19, 18, 17, 16, 16, 15, 14,
3695         14, 13, 12, 12, 11, 10, 10, 9,
3696         9, 8, 7, 7, 6, 6, 5, 4,
3697         4, 3, 3, 2, 2, 1, 1, 0,
3698         127, 125, 123, 121, 119, 118, 116, 114,
3699         113, 111, 109, 108, 106, 105, 103, 102,
3700         100, 99, 97, 96, 95, 93, 92, 91,
3701         90, 88, 87, 86, 85, 84, 83, 82,
3702         80, 79, 78, 77, 76, 75, 74, 73,
3703         72, 71, 70, 70, 69, 68, 67, 66,
3704         65, 64, 63, 63, 62, 61, 60, 59,
3705         59, 58, 57, 56, 56, 55, 54, 53
3706     };
3707     const int precision = 7;
3708 
3709     if (exp == 0 && frac != 0) { /* subnormal */
3710         /* Normalize the subnormal. */
3711         while (extract64(frac, frac_size - 1, 1) == 0) {
3712             exp--;
3713             frac <<= 1;
3714         }
3715 
3716         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3717     }
3718 
3719     int idx = ((exp & 1) << (precision - 1)) |
3720               (frac >> (frac_size - precision + 1));
3721     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3722                         (frac_size - precision);
3723     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3724 
3725     uint64_t val = 0;
3726     val = deposit64(val, 0, frac_size, out_frac);
3727     val = deposit64(val, frac_size, exp_size, out_exp);
3728     val = deposit64(val, frac_size + exp_size, 1, sign);
3729     return val;
3730 }
3731 
3732 static float16 frsqrt7_h(float16 f, float_status *s)
3733 {
3734     int exp_size = 5, frac_size = 10;
3735     bool sign = float16_is_neg(f);
3736 
3737     /*
3738      * frsqrt7(sNaN) = canonical NaN
3739      * frsqrt7(-inf) = canonical NaN
3740      * frsqrt7(-normal) = canonical NaN
3741      * frsqrt7(-subnormal) = canonical NaN
3742      */
3743     if (float16_is_signaling_nan(f, s) ||
3744         (float16_is_infinity(f) && sign) ||
3745         (float16_is_normal(f) && sign) ||
3746         (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3747         s->float_exception_flags |= float_flag_invalid;
3748         return float16_default_nan(s);
3749     }
3750 
3751     /* frsqrt7(qNaN) = canonical NaN */
3752     if (float16_is_quiet_nan(f, s)) {
3753         return float16_default_nan(s);
3754     }
3755 
3756     /* frsqrt7(+-0) = +-inf */
3757     if (float16_is_zero(f)) {
3758         s->float_exception_flags |= float_flag_divbyzero;
3759         return float16_set_sign(float16_infinity, sign);
3760     }
3761 
3762     /* frsqrt7(+inf) = +0 */
3763     if (float16_is_infinity(f) && !sign) {
3764         return float16_set_sign(float16_zero, sign);
3765     }
3766 
3767     /* +normal, +subnormal */
3768     uint64_t val = frsqrt7(f, exp_size, frac_size);
3769     return make_float16(val);
3770 }
3771 
3772 static float32 frsqrt7_s(float32 f, float_status *s)
3773 {
3774     int exp_size = 8, frac_size = 23;
3775     bool sign = float32_is_neg(f);
3776 
3777     /*
3778      * frsqrt7(sNaN) = canonical NaN
3779      * frsqrt7(-inf) = canonical NaN
3780      * frsqrt7(-normal) = canonical NaN
3781      * frsqrt7(-subnormal) = canonical NaN
3782      */
3783     if (float32_is_signaling_nan(f, s) ||
3784         (float32_is_infinity(f) && sign) ||
3785         (float32_is_normal(f) && sign) ||
3786         (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3787         s->float_exception_flags |= float_flag_invalid;
3788         return float32_default_nan(s);
3789     }
3790 
3791     /* frsqrt7(qNaN) = canonical NaN */
3792     if (float32_is_quiet_nan(f, s)) {
3793         return float32_default_nan(s);
3794     }
3795 
3796     /* frsqrt7(+-0) = +-inf */
3797     if (float32_is_zero(f)) {
3798         s->float_exception_flags |= float_flag_divbyzero;
3799         return float32_set_sign(float32_infinity, sign);
3800     }
3801 
3802     /* frsqrt7(+inf) = +0 */
3803     if (float32_is_infinity(f) && !sign) {
3804         return float32_set_sign(float32_zero, sign);
3805     }
3806 
3807     /* +normal, +subnormal */
3808     uint64_t val = frsqrt7(f, exp_size, frac_size);
3809     return make_float32(val);
3810 }
3811 
3812 static float64 frsqrt7_d(float64 f, float_status *s)
3813 {
3814     int exp_size = 11, frac_size = 52;
3815     bool sign = float64_is_neg(f);
3816 
3817     /*
3818      * frsqrt7(sNaN) = canonical NaN
3819      * frsqrt7(-inf) = canonical NaN
3820      * frsqrt7(-normal) = canonical NaN
3821      * frsqrt7(-subnormal) = canonical NaN
3822      */
3823     if (float64_is_signaling_nan(f, s) ||
3824         (float64_is_infinity(f) && sign) ||
3825         (float64_is_normal(f) && sign) ||
3826         (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3827         s->float_exception_flags |= float_flag_invalid;
3828         return float64_default_nan(s);
3829     }
3830 
3831     /* frsqrt7(qNaN) = canonical NaN */
3832     if (float64_is_quiet_nan(f, s)) {
3833         return float64_default_nan(s);
3834     }
3835 
3836     /* frsqrt7(+-0) = +-inf */
3837     if (float64_is_zero(f)) {
3838         s->float_exception_flags |= float_flag_divbyzero;
3839         return float64_set_sign(float64_infinity, sign);
3840     }
3841 
3842     /* frsqrt7(+inf) = +0 */
3843     if (float64_is_infinity(f) && !sign) {
3844         return float64_set_sign(float64_zero, sign);
3845     }
3846 
3847     /* +normal, +subnormal */
3848     uint64_t val = frsqrt7(f, exp_size, frac_size);
3849     return make_float64(val);
3850 }
3851 
3852 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3853 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3854 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3855 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3856 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3857 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3858 
3859 /*
3860  * Vector Floating-Point Reciprocal Estimate Instruction
3861  *
3862  * Adapted from riscv-v-spec recip.c:
3863  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3864  */
3865 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3866                       float_status *s)
3867 {
3868     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3869     uint64_t exp = extract64(f, frac_size, exp_size);
3870     uint64_t frac = extract64(f, 0, frac_size);
3871 
3872     const uint8_t lookup_table[] = {
3873         127, 125, 123, 121, 119, 117, 116, 114,
3874         112, 110, 109, 107, 105, 104, 102, 100,
3875         99, 97, 96, 94, 93, 91, 90, 88,
3876         87, 85, 84, 83, 81, 80, 79, 77,
3877         76, 75, 74, 72, 71, 70, 69, 68,
3878         66, 65, 64, 63, 62, 61, 60, 59,
3879         58, 57, 56, 55, 54, 53, 52, 51,
3880         50, 49, 48, 47, 46, 45, 44, 43,
3881         42, 41, 40, 40, 39, 38, 37, 36,
3882         35, 35, 34, 33, 32, 31, 31, 30,
3883         29, 28, 28, 27, 26, 25, 25, 24,
3884         23, 23, 22, 21, 21, 20, 19, 19,
3885         18, 17, 17, 16, 15, 15, 14, 14,
3886         13, 12, 12, 11, 11, 10, 9, 9,
3887         8, 8, 7, 7, 6, 5, 5, 4,
3888         4, 3, 3, 2, 2, 1, 1, 0
3889     };
3890     const int precision = 7;
3891 
3892     if (exp == 0 && frac != 0) { /* subnormal */
3893         /* Normalize the subnormal. */
3894         while (extract64(frac, frac_size - 1, 1) == 0) {
3895             exp--;
3896             frac <<= 1;
3897         }
3898 
3899         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3900 
3901         if (exp != 0 && exp != UINT64_MAX) {
3902             /*
3903              * Overflow to inf or max value of same sign,
3904              * depending on sign and rounding mode.
3905              */
3906             s->float_exception_flags |= (float_flag_inexact |
3907                                          float_flag_overflow);
3908 
3909             if ((s->float_rounding_mode == float_round_to_zero) ||
3910                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3911                 ((s->float_rounding_mode == float_round_up) && sign)) {
3912                 /* Return greatest/negative finite value. */
3913                 return (sign << (exp_size + frac_size)) |
3914                        (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3915             } else {
3916                 /* Return +-inf. */
3917                 return (sign << (exp_size + frac_size)) |
3918                        MAKE_64BIT_MASK(frac_size, exp_size);
3919             }
3920         }
3921     }
3922 
3923     int idx = frac >> (frac_size - precision);
3924     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3925                         (frac_size - precision);
3926     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3927 
3928     if (out_exp == 0 || out_exp == UINT64_MAX) {
3929         /*
3930          * The result is subnormal, but don't raise the underflow exception,
3931          * because there's no additional loss of precision.
3932          */
3933         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3934         if (out_exp == UINT64_MAX) {
3935             out_frac >>= 1;
3936             out_exp = 0;
3937         }
3938     }
3939 
3940     uint64_t val = 0;
3941     val = deposit64(val, 0, frac_size, out_frac);
3942     val = deposit64(val, frac_size, exp_size, out_exp);
3943     val = deposit64(val, frac_size + exp_size, 1, sign);
3944     return val;
3945 }
3946 
3947 static float16 frec7_h(float16 f, float_status *s)
3948 {
3949     int exp_size = 5, frac_size = 10;
3950     bool sign = float16_is_neg(f);
3951 
3952     /* frec7(+-inf) = +-0 */
3953     if (float16_is_infinity(f)) {
3954         return float16_set_sign(float16_zero, sign);
3955     }
3956 
3957     /* frec7(+-0) = +-inf */
3958     if (float16_is_zero(f)) {
3959         s->float_exception_flags |= float_flag_divbyzero;
3960         return float16_set_sign(float16_infinity, sign);
3961     }
3962 
3963     /* frec7(sNaN) = canonical NaN */
3964     if (float16_is_signaling_nan(f, s)) {
3965         s->float_exception_flags |= float_flag_invalid;
3966         return float16_default_nan(s);
3967     }
3968 
3969     /* frec7(qNaN) = canonical NaN */
3970     if (float16_is_quiet_nan(f, s)) {
3971         return float16_default_nan(s);
3972     }
3973 
3974     /* +-normal, +-subnormal */
3975     uint64_t val = frec7(f, exp_size, frac_size, s);
3976     return make_float16(val);
3977 }
3978 
3979 static float32 frec7_s(float32 f, float_status *s)
3980 {
3981     int exp_size = 8, frac_size = 23;
3982     bool sign = float32_is_neg(f);
3983 
3984     /* frec7(+-inf) = +-0 */
3985     if (float32_is_infinity(f)) {
3986         return float32_set_sign(float32_zero, sign);
3987     }
3988 
3989     /* frec7(+-0) = +-inf */
3990     if (float32_is_zero(f)) {
3991         s->float_exception_flags |= float_flag_divbyzero;
3992         return float32_set_sign(float32_infinity, sign);
3993     }
3994 
3995     /* frec7(sNaN) = canonical NaN */
3996     if (float32_is_signaling_nan(f, s)) {
3997         s->float_exception_flags |= float_flag_invalid;
3998         return float32_default_nan(s);
3999     }
4000 
4001     /* frec7(qNaN) = canonical NaN */
4002     if (float32_is_quiet_nan(f, s)) {
4003         return float32_default_nan(s);
4004     }
4005 
4006     /* +-normal, +-subnormal */
4007     uint64_t val = frec7(f, exp_size, frac_size, s);
4008     return make_float32(val);
4009 }
4010 
4011 static float64 frec7_d(float64 f, float_status *s)
4012 {
4013     int exp_size = 11, frac_size = 52;
4014     bool sign = float64_is_neg(f);
4015 
4016     /* frec7(+-inf) = +-0 */
4017     if (float64_is_infinity(f)) {
4018         return float64_set_sign(float64_zero, sign);
4019     }
4020 
4021     /* frec7(+-0) = +-inf */
4022     if (float64_is_zero(f)) {
4023         s->float_exception_flags |= float_flag_divbyzero;
4024         return float64_set_sign(float64_infinity, sign);
4025     }
4026 
4027     /* frec7(sNaN) = canonical NaN */
4028     if (float64_is_signaling_nan(f, s)) {
4029         s->float_exception_flags |= float_flag_invalid;
4030         return float64_default_nan(s);
4031     }
4032 
4033     /* frec7(qNaN) = canonical NaN */
4034     if (float64_is_quiet_nan(f, s)) {
4035         return float64_default_nan(s);
4036     }
4037 
4038     /* +-normal, +-subnormal */
4039     uint64_t val = frec7(f, exp_size, frac_size, s);
4040     return make_float64(val);
4041 }
4042 
4043 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
4044 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
4045 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
4046 GEN_VEXT_V_ENV(vfrec7_v_h, 2)
4047 GEN_VEXT_V_ENV(vfrec7_v_w, 4)
4048 GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4049 
4050 /* Vector Floating-Point MIN/MAX Instructions */
4051 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
4052 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
4053 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
4054 GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
4055 GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
4056 GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
4057 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
4058 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
4059 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
4060 GEN_VEXT_VF(vfmin_vf_h, 2)
4061 GEN_VEXT_VF(vfmin_vf_w, 4)
4062 GEN_VEXT_VF(vfmin_vf_d, 8)
4063 
4064 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
4065 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
4066 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
4067 GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
4068 GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
4069 GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
4070 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
4071 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
4072 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
4073 GEN_VEXT_VF(vfmax_vf_h, 2)
4074 GEN_VEXT_VF(vfmax_vf_w, 4)
4075 GEN_VEXT_VF(vfmax_vf_d, 8)
4076 
4077 /* Vector Floating-Point Sign-Injection Instructions */
4078 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4079 {
4080     return deposit64(b, 0, 15, a);
4081 }
4082 
4083 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4084 {
4085     return deposit64(b, 0, 31, a);
4086 }
4087 
4088 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4089 {
4090     return deposit64(b, 0, 63, a);
4091 }
4092 
4093 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
4094 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
4095 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4096 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
4097 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
4098 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
4099 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4100 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4101 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4102 GEN_VEXT_VF(vfsgnj_vf_h, 2)
4103 GEN_VEXT_VF(vfsgnj_vf_w, 4)
4104 GEN_VEXT_VF(vfsgnj_vf_d, 8)
4105 
4106 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4107 {
4108     return deposit64(~b, 0, 15, a);
4109 }
4110 
4111 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4112 {
4113     return deposit64(~b, 0, 31, a);
4114 }
4115 
4116 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4117 {
4118     return deposit64(~b, 0, 63, a);
4119 }
4120 
4121 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4122 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4123 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4124 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
4125 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
4126 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
4127 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4128 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4129 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4130 GEN_VEXT_VF(vfsgnjn_vf_h, 2)
4131 GEN_VEXT_VF(vfsgnjn_vf_w, 4)
4132 GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4133 
4134 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4135 {
4136     return deposit64(b ^ a, 0, 15, a);
4137 }
4138 
4139 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4140 {
4141     return deposit64(b ^ a, 0, 31, a);
4142 }
4143 
4144 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4145 {
4146     return deposit64(b ^ a, 0, 63, a);
4147 }
4148 
4149 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4150 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4151 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4152 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
4153 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
4154 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
4155 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4156 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4157 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4158 GEN_VEXT_VF(vfsgnjx_vf_h, 2)
4159 GEN_VEXT_VF(vfsgnjx_vf_w, 4)
4160 GEN_VEXT_VF(vfsgnjx_vf_d, 8)
4161 
4162 /* Vector Floating-Point Compare Instructions */
4163 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4164 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4165                   CPURISCVState *env, uint32_t desc)          \
4166 {                                                             \
4167     uint32_t vm = vext_vm(desc);                              \
4168     uint32_t vl = env->vl;                                    \
4169     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
4170     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
4171     uint32_t vma = vext_vma(desc);                            \
4172     uint32_t i;                                               \
4173                                                               \
4174     for (i = env->vstart; i < vl; i++) {                      \
4175         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4176         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4177         if (!vm && !vext_elem_mask(v0, i)) {                  \
4178             /* set masked-off elements to 1s */               \
4179             if (vma) {                                        \
4180                 vext_set_elem_mask(vd, i, 1);                 \
4181             }                                                 \
4182             continue;                                         \
4183         }                                                     \
4184         vext_set_elem_mask(vd, i,                             \
4185                            DO_OP(s2, s1, &env->fp_status));   \
4186     }                                                         \
4187     env->vstart = 0;                                          \
4188     /*
4189      * mask destination register are always tail-agnostic
4190      * set tail elements to 1s
4191      */                                                       \
4192     if (vta_all_1s) {                                         \
4193         for (; i < total_elems; i++) {                        \
4194             vext_set_elem_mask(vd, i, 1);                     \
4195         }                                                     \
4196     }                                                         \
4197 }
4198 
4199 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4200 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4201 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4202 
4203 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4204 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4205                   CPURISCVState *env, uint32_t desc)                \
4206 {                                                                   \
4207     uint32_t vm = vext_vm(desc);                                    \
4208     uint32_t vl = env->vl;                                          \
4209     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;                \
4210     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
4211     uint32_t vma = vext_vma(desc);                                  \
4212     uint32_t i;                                                     \
4213                                                                     \
4214     for (i = env->vstart; i < vl; i++) {                            \
4215         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4216         if (!vm && !vext_elem_mask(v0, i)) {                        \
4217             /* set masked-off elements to 1s */                     \
4218             if (vma) {                                              \
4219                 vext_set_elem_mask(vd, i, 1);                       \
4220             }                                                       \
4221             continue;                                               \
4222         }                                                           \
4223         vext_set_elem_mask(vd, i,                                   \
4224                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4225     }                                                               \
4226     env->vstart = 0;                                                \
4227     /*
4228      * mask destination register are always tail-agnostic
4229      * set tail elements to 1s
4230      */                                                             \
4231     if (vta_all_1s) {                                               \
4232         for (; i < total_elems; i++) {                              \
4233             vext_set_elem_mask(vd, i, 1);                           \
4234         }                                                           \
4235     }                                                               \
4236 }
4237 
4238 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4239 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4240 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4241 
4242 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4243 {
4244     FloatRelation compare = float16_compare_quiet(a, b, s);
4245     return compare != float_relation_equal;
4246 }
4247 
4248 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4249 {
4250     FloatRelation compare = float32_compare_quiet(a, b, s);
4251     return compare != float_relation_equal;
4252 }
4253 
4254 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4255 {
4256     FloatRelation compare = float64_compare_quiet(a, b, s);
4257     return compare != float_relation_equal;
4258 }
4259 
4260 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4261 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4262 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4263 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4264 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4265 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4266 
4267 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4268 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4269 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4270 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4271 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4272 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4273 
4274 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4275 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4276 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4277 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4278 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4279 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4280 
4281 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4282 {
4283     FloatRelation compare = float16_compare(a, b, s);
4284     return compare == float_relation_greater;
4285 }
4286 
4287 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4288 {
4289     FloatRelation compare = float32_compare(a, b, s);
4290     return compare == float_relation_greater;
4291 }
4292 
4293 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4294 {
4295     FloatRelation compare = float64_compare(a, b, s);
4296     return compare == float_relation_greater;
4297 }
4298 
4299 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4300 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4301 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4302 
4303 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4304 {
4305     FloatRelation compare = float16_compare(a, b, s);
4306     return compare == float_relation_greater ||
4307            compare == float_relation_equal;
4308 }
4309 
4310 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4311 {
4312     FloatRelation compare = float32_compare(a, b, s);
4313     return compare == float_relation_greater ||
4314            compare == float_relation_equal;
4315 }
4316 
4317 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4318 {
4319     FloatRelation compare = float64_compare(a, b, s);
4320     return compare == float_relation_greater ||
4321            compare == float_relation_equal;
4322 }
4323 
4324 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4325 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4326 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4327 
4328 /* Vector Floating-Point Classify Instruction */
4329 #define OPIVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
4330 static void do_##NAME(void *vd, void *vs2, int i)      \
4331 {                                                      \
4332     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
4333     *((TD *)vd + HD(i)) = OP(s2);                      \
4334 }
4335 
4336 #define GEN_VEXT_V(NAME, ESZ)                          \
4337 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
4338                   CPURISCVState *env, uint32_t desc)   \
4339 {                                                      \
4340     uint32_t vm = vext_vm(desc);                       \
4341     uint32_t vl = env->vl;                             \
4342     uint32_t total_elems =                             \
4343         vext_get_total_elems(env, desc, ESZ);          \
4344     uint32_t vta = vext_vta(desc);                     \
4345     uint32_t vma = vext_vma(desc);                     \
4346     uint32_t i;                                        \
4347                                                        \
4348     for (i = env->vstart; i < vl; i++) {               \
4349         if (!vm && !vext_elem_mask(v0, i)) {           \
4350             /* set masked-off elements to 1s */        \
4351             vext_set_elems_1s(vd, vma, i * ESZ,        \
4352                               (i + 1) * ESZ);          \
4353             continue;                                  \
4354         }                                              \
4355         do_##NAME(vd, vs2, i);                         \
4356     }                                                  \
4357     env->vstart = 0;                                   \
4358     /* set tail elements to 1s */                      \
4359     vext_set_elems_1s(vd, vta, vl * ESZ,               \
4360                       total_elems * ESZ);              \
4361 }
4362 
4363 target_ulong fclass_h(uint64_t frs1)
4364 {
4365     float16 f = frs1;
4366     bool sign = float16_is_neg(f);
4367 
4368     if (float16_is_infinity(f)) {
4369         return sign ? 1 << 0 : 1 << 7;
4370     } else if (float16_is_zero(f)) {
4371         return sign ? 1 << 3 : 1 << 4;
4372     } else if (float16_is_zero_or_denormal(f)) {
4373         return sign ? 1 << 2 : 1 << 5;
4374     } else if (float16_is_any_nan(f)) {
4375         float_status s = { }; /* for snan_bit_is_one */
4376         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4377     } else {
4378         return sign ? 1 << 1 : 1 << 6;
4379     }
4380 }
4381 
4382 target_ulong fclass_s(uint64_t frs1)
4383 {
4384     float32 f = frs1;
4385     bool sign = float32_is_neg(f);
4386 
4387     if (float32_is_infinity(f)) {
4388         return sign ? 1 << 0 : 1 << 7;
4389     } else if (float32_is_zero(f)) {
4390         return sign ? 1 << 3 : 1 << 4;
4391     } else if (float32_is_zero_or_denormal(f)) {
4392         return sign ? 1 << 2 : 1 << 5;
4393     } else if (float32_is_any_nan(f)) {
4394         float_status s = { }; /* for snan_bit_is_one */
4395         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4396     } else {
4397         return sign ? 1 << 1 : 1 << 6;
4398     }
4399 }
4400 
4401 target_ulong fclass_d(uint64_t frs1)
4402 {
4403     float64 f = frs1;
4404     bool sign = float64_is_neg(f);
4405 
4406     if (float64_is_infinity(f)) {
4407         return sign ? 1 << 0 : 1 << 7;
4408     } else if (float64_is_zero(f)) {
4409         return sign ? 1 << 3 : 1 << 4;
4410     } else if (float64_is_zero_or_denormal(f)) {
4411         return sign ? 1 << 2 : 1 << 5;
4412     } else if (float64_is_any_nan(f)) {
4413         float_status s = { }; /* for snan_bit_is_one */
4414         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4415     } else {
4416         return sign ? 1 << 1 : 1 << 6;
4417     }
4418 }
4419 
4420 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4421 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4422 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4423 GEN_VEXT_V(vfclass_v_h, 2)
4424 GEN_VEXT_V(vfclass_v_w, 4)
4425 GEN_VEXT_V(vfclass_v_d, 8)
4426 
4427 /* Vector Floating-Point Merge Instruction */
4428 
4429 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4430 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4431                   CPURISCVState *env, uint32_t desc)          \
4432 {                                                             \
4433     uint32_t vm = vext_vm(desc);                              \
4434     uint32_t vl = env->vl;                                    \
4435     uint32_t esz = sizeof(ETYPE);                             \
4436     uint32_t total_elems =                                    \
4437         vext_get_total_elems(env, desc, esz);                 \
4438     uint32_t vta = vext_vta(desc);                            \
4439     uint32_t i;                                               \
4440                                                               \
4441     for (i = env->vstart; i < vl; i++) {                      \
4442         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4443         *((ETYPE *)vd + H(i)) =                               \
4444             (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4445     }                                                         \
4446     env->vstart = 0;                                          \
4447     /* set tail elements to 1s */                             \
4448     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
4449 }
4450 
4451 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4452 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4453 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4454 
4455 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4456 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4457 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4458 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4459 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4460 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4461 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4462 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4463 
4464 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4465 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4466 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4467 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4468 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4469 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4470 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4471 
4472 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4473 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4474 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4475 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4476 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4477 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4478 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4479 
4480 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4481 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4482 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4483 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4484 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4485 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4486 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4487 
4488 /* Widening Floating-Point/Integer Type-Convert Instructions */
4489 /* (TD, T2, TX2) */
4490 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4491 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4492 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4493 /*
4494  * vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.
4495  */
4496 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4497 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4498 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4499 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4500 
4501 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4502 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4503 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4504 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4505 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4506 
4507 /*
4508  * vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float.
4509  */
4510 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4511 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4512 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4513 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4514 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4515 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4516 
4517 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4518 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4519 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4520 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4521 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4522 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4523 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4524 
4525 /*
4526  * vfwcvt.f.f.v vd, vs2, vm # Convert single-width float to double-width float.
4527  */
4528 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4529 {
4530     return float16_to_float32(a, true, s);
4531 }
4532 
4533 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4534 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4535 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4536 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4537 
4538 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4539 /* (TD, T2, TX2) */
4540 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4541 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4542 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4543 /* vfncvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4544 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4545 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4546 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4547 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4548 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4549 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4550 
4551 /* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */
4552 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4553 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4554 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4555 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4556 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4557 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4558 
4559 /*
4560  * vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float.
4561  */
4562 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4563 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4564 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4565 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4566 
4567 /* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */
4568 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4569 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4570 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4571 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4572 
4573 /* vfncvt.f.f.v vd, vs2, vm # Convert double float to single-width float. */
4574 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4575 {
4576     return float32_to_float16(a, true, s);
4577 }
4578 
4579 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4580 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4581 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4582 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4583 
4584 /*
4585  * Vector Reduction Operations
4586  */
4587 /* Vector Single-Width Integer Reduction Instructions */
4588 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4589 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4590                   void *vs2, CPURISCVState *env,          \
4591                   uint32_t desc)                          \
4592 {                                                         \
4593     uint32_t vm = vext_vm(desc);                          \
4594     uint32_t vl = env->vl;                                \
4595     uint32_t esz = sizeof(TD);                            \
4596     uint32_t vlenb = simd_maxsz(desc);                    \
4597     uint32_t vta = vext_vta(desc);                        \
4598     uint32_t i;                                           \
4599     TD s1 =  *((TD *)vs1 + HD(0));                        \
4600                                                           \
4601     for (i = env->vstart; i < vl; i++) {                  \
4602         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4603         if (!vm && !vext_elem_mask(v0, i)) {              \
4604             continue;                                     \
4605         }                                                 \
4606         s1 = OP(s1, (TD)s2);                              \
4607     }                                                     \
4608     *((TD *)vd + HD(0)) = s1;                             \
4609     env->vstart = 0;                                      \
4610     /* set tail elements to 1s */                         \
4611     vext_set_elems_1s(vd, vta, esz, vlenb);               \
4612 }
4613 
4614 /* vd[0] = sum(vs1[0], vs2[*]) */
4615 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4616 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4617 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4618 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4619 
4620 /* vd[0] = maxu(vs1[0], vs2[*]) */
4621 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4622 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4623 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4624 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4625 
4626 /* vd[0] = max(vs1[0], vs2[*]) */
4627 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4628 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4629 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4630 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4631 
4632 /* vd[0] = minu(vs1[0], vs2[*]) */
4633 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4634 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4635 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4636 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4637 
4638 /* vd[0] = min(vs1[0], vs2[*]) */
4639 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4640 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4641 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4642 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4643 
4644 /* vd[0] = and(vs1[0], vs2[*]) */
4645 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4646 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4647 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4648 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4649 
4650 /* vd[0] = or(vs1[0], vs2[*]) */
4651 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4652 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4653 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4654 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4655 
4656 /* vd[0] = xor(vs1[0], vs2[*]) */
4657 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4658 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4659 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4660 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4661 
4662 /* Vector Widening Integer Reduction Instructions */
4663 /* signed sum reduction into double-width accumulator */
4664 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4665 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4666 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4667 
4668 /* Unsigned sum reduction into double-width accumulator */
4669 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4670 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4671 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4672 
4673 /* Vector Single-Width Floating-Point Reduction Instructions */
4674 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4675 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4676                   void *vs2, CPURISCVState *env,           \
4677                   uint32_t desc)                           \
4678 {                                                          \
4679     uint32_t vm = vext_vm(desc);                           \
4680     uint32_t vl = env->vl;                                 \
4681     uint32_t esz = sizeof(TD);                             \
4682     uint32_t vlenb = simd_maxsz(desc);                     \
4683     uint32_t vta = vext_vta(desc);                         \
4684     uint32_t i;                                            \
4685     TD s1 =  *((TD *)vs1 + HD(0));                         \
4686                                                            \
4687     for (i = env->vstart; i < vl; i++) {                   \
4688         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4689         if (!vm && !vext_elem_mask(v0, i)) {               \
4690             continue;                                      \
4691         }                                                  \
4692         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4693     }                                                      \
4694     *((TD *)vd + HD(0)) = s1;                              \
4695     env->vstart = 0;                                       \
4696     /* set tail elements to 1s */                          \
4697     vext_set_elems_1s(vd, vta, esz, vlenb);                \
4698 }
4699 
4700 /* Unordered sum */
4701 GEN_VEXT_FRED(vfredusum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4702 GEN_VEXT_FRED(vfredusum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4703 GEN_VEXT_FRED(vfredusum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4704 
4705 /* Ordered sum */
4706 GEN_VEXT_FRED(vfredosum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4707 GEN_VEXT_FRED(vfredosum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4708 GEN_VEXT_FRED(vfredosum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4709 
4710 /* Maximum value */
4711 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2,
4712               float16_maximum_number)
4713 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4,
4714               float32_maximum_number)
4715 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8,
4716               float64_maximum_number)
4717 
4718 /* Minimum value */
4719 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2,
4720               float16_minimum_number)
4721 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4,
4722               float32_minimum_number)
4723 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8,
4724               float64_minimum_number)
4725 
4726 /* Vector Widening Floating-Point Add Instructions */
4727 static uint32_t fwadd16(uint32_t a, uint16_t b, float_status *s)
4728 {
4729     return float32_add(a, float16_to_float32(b, true, s), s);
4730 }
4731 
4732 static uint64_t fwadd32(uint64_t a, uint32_t b, float_status *s)
4733 {
4734     return float64_add(a, float32_to_float64(b, s), s);
4735 }
4736 
4737 /* Vector Widening Floating-Point Reduction Instructions */
4738 /* Ordered/unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4739 GEN_VEXT_FRED(vfwredusum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4740 GEN_VEXT_FRED(vfwredusum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4741 GEN_VEXT_FRED(vfwredosum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4742 GEN_VEXT_FRED(vfwredosum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4743 
4744 /*
4745  * Vector Mask Operations
4746  */
4747 /* Vector Mask-Register Logical Instructions */
4748 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4749 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4750                   void *vs2, CPURISCVState *env,          \
4751                   uint32_t desc)                          \
4752 {                                                         \
4753     uint32_t vl = env->vl;                                \
4754     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;      \
4755     uint32_t vta_all_1s = vext_vta_all_1s(desc);          \
4756     uint32_t i;                                           \
4757     int a, b;                                             \
4758                                                           \
4759     for (i = env->vstart; i < vl; i++) {                  \
4760         a = vext_elem_mask(vs1, i);                       \
4761         b = vext_elem_mask(vs2, i);                       \
4762         vext_set_elem_mask(vd, i, OP(b, a));              \
4763     }                                                     \
4764     env->vstart = 0;                                      \
4765     /*
4766      * mask destination register are always tail-agnostic
4767      * set tail elements to 1s
4768      */                                                   \
4769     if (vta_all_1s) {                                     \
4770         for (; i < total_elems; i++) {                    \
4771             vext_set_elem_mask(vd, i, 1);                 \
4772         }                                                 \
4773     }                                                     \
4774 }
4775 
4776 #define DO_NAND(N, M)  (!(N & M))
4777 #define DO_ANDNOT(N, M)  (N & !M)
4778 #define DO_NOR(N, M)  (!(N | M))
4779 #define DO_ORNOT(N, M)  (N | !M)
4780 #define DO_XNOR(N, M)  (!(N ^ M))
4781 
4782 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4783 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4784 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4785 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4786 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4787 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4788 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4789 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4790 
4791 /* Vector count population in mask vcpop */
4792 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4793                              uint32_t desc)
4794 {
4795     target_ulong cnt = 0;
4796     uint32_t vm = vext_vm(desc);
4797     uint32_t vl = env->vl;
4798     int i;
4799 
4800     for (i = env->vstart; i < vl; i++) {
4801         if (vm || vext_elem_mask(v0, i)) {
4802             if (vext_elem_mask(vs2, i)) {
4803                 cnt++;
4804             }
4805         }
4806     }
4807     env->vstart = 0;
4808     return cnt;
4809 }
4810 
4811 /* vfirst find-first-set mask bit */
4812 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4813                               uint32_t desc)
4814 {
4815     uint32_t vm = vext_vm(desc);
4816     uint32_t vl = env->vl;
4817     int i;
4818 
4819     for (i = env->vstart; i < vl; i++) {
4820         if (vm || vext_elem_mask(v0, i)) {
4821             if (vext_elem_mask(vs2, i)) {
4822                 return i;
4823             }
4824         }
4825     }
4826     env->vstart = 0;
4827     return -1LL;
4828 }
4829 
4830 enum set_mask_type {
4831     ONLY_FIRST = 1,
4832     INCLUDE_FIRST,
4833     BEFORE_FIRST,
4834 };
4835 
4836 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4837                    uint32_t desc, enum set_mask_type type)
4838 {
4839     uint32_t vm = vext_vm(desc);
4840     uint32_t vl = env->vl;
4841     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;
4842     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4843     uint32_t vma = vext_vma(desc);
4844     int i;
4845     bool first_mask_bit = false;
4846 
4847     for (i = env->vstart; i < vl; i++) {
4848         if (!vm && !vext_elem_mask(v0, i)) {
4849             /* set masked-off elements to 1s */
4850             if (vma) {
4851                 vext_set_elem_mask(vd, i, 1);
4852             }
4853             continue;
4854         }
4855         /* write a zero to all following active elements */
4856         if (first_mask_bit) {
4857             vext_set_elem_mask(vd, i, 0);
4858             continue;
4859         }
4860         if (vext_elem_mask(vs2, i)) {
4861             first_mask_bit = true;
4862             if (type == BEFORE_FIRST) {
4863                 vext_set_elem_mask(vd, i, 0);
4864             } else {
4865                 vext_set_elem_mask(vd, i, 1);
4866             }
4867         } else {
4868             if (type == ONLY_FIRST) {
4869                 vext_set_elem_mask(vd, i, 0);
4870             } else {
4871                 vext_set_elem_mask(vd, i, 1);
4872             }
4873         }
4874     }
4875     env->vstart = 0;
4876     /*
4877      * mask destination register are always tail-agnostic
4878      * set tail elements to 1s
4879      */
4880     if (vta_all_1s) {
4881         for (; i < total_elems; i++) {
4882             vext_set_elem_mask(vd, i, 1);
4883         }
4884     }
4885 }
4886 
4887 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4888                      uint32_t desc)
4889 {
4890     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4891 }
4892 
4893 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4894                      uint32_t desc)
4895 {
4896     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4897 }
4898 
4899 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4900                      uint32_t desc)
4901 {
4902     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4903 }
4904 
4905 /* Vector Iota Instruction */
4906 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4907 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4908                   uint32_t desc)                                          \
4909 {                                                                         \
4910     uint32_t vm = vext_vm(desc);                                          \
4911     uint32_t vl = env->vl;                                                \
4912     uint32_t esz = sizeof(ETYPE);                                         \
4913     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4914     uint32_t vta = vext_vta(desc);                                        \
4915     uint32_t vma = vext_vma(desc);                                        \
4916     uint32_t sum = 0;                                                     \
4917     int i;                                                                \
4918                                                                           \
4919     for (i = env->vstart; i < vl; i++) {                                  \
4920         if (!vm && !vext_elem_mask(v0, i)) {                              \
4921             /* set masked-off elements to 1s */                           \
4922             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4923             continue;                                                     \
4924         }                                                                 \
4925         *((ETYPE *)vd + H(i)) = sum;                                      \
4926         if (vext_elem_mask(vs2, i)) {                                     \
4927             sum++;                                                        \
4928         }                                                                 \
4929     }                                                                     \
4930     env->vstart = 0;                                                      \
4931     /* set tail elements to 1s */                                         \
4932     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4933 }
4934 
4935 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
4936 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
4937 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
4938 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
4939 
4940 /* Vector Element Index Instruction */
4941 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
4942 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
4943 {                                                                         \
4944     uint32_t vm = vext_vm(desc);                                          \
4945     uint32_t vl = env->vl;                                                \
4946     uint32_t esz = sizeof(ETYPE);                                         \
4947     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4948     uint32_t vta = vext_vta(desc);                                        \
4949     uint32_t vma = vext_vma(desc);                                        \
4950     int i;                                                                \
4951                                                                           \
4952     for (i = env->vstart; i < vl; i++) {                                  \
4953         if (!vm && !vext_elem_mask(v0, i)) {                              \
4954             /* set masked-off elements to 1s */                           \
4955             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4956             continue;                                                     \
4957         }                                                                 \
4958         *((ETYPE *)vd + H(i)) = i;                                        \
4959     }                                                                     \
4960     env->vstart = 0;                                                      \
4961     /* set tail elements to 1s */                                         \
4962     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4963 }
4964 
4965 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
4966 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
4967 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
4968 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
4969 
4970 /*
4971  * Vector Permutation Instructions
4972  */
4973 
4974 /* Vector Slide Instructions */
4975 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
4976 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4977                   CPURISCVState *env, uint32_t desc)                      \
4978 {                                                                         \
4979     uint32_t vm = vext_vm(desc);                                          \
4980     uint32_t vl = env->vl;                                                \
4981     uint32_t esz = sizeof(ETYPE);                                         \
4982     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4983     uint32_t vta = vext_vta(desc);                                        \
4984     uint32_t vma = vext_vma(desc);                                        \
4985     target_ulong offset = s1, i_min, i;                                   \
4986                                                                           \
4987     i_min = MAX(env->vstart, offset);                                     \
4988     for (i = i_min; i < vl; i++) {                                        \
4989         if (!vm && !vext_elem_mask(v0, i)) {                              \
4990             /* set masked-off elements to 1s */                           \
4991             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4992             continue;                                                     \
4993         }                                                                 \
4994         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
4995     }                                                                     \
4996     /* set tail elements to 1s */                                         \
4997     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4998 }
4999 
5000 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
5001 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
5002 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
5003 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
5004 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
5005 
5006 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
5007 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5008                   CPURISCVState *env, uint32_t desc)                      \
5009 {                                                                         \
5010     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5011     uint32_t vm = vext_vm(desc);                                          \
5012     uint32_t vl = env->vl;                                                \
5013     uint32_t esz = sizeof(ETYPE);                                         \
5014     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5015     uint32_t vta = vext_vta(desc);                                        \
5016     uint32_t vma = vext_vma(desc);                                        \
5017     target_ulong i_max, i;                                                \
5018                                                                           \
5019     i_max = MAX(MIN(s1 < vlmax ? vlmax - s1 : 0, vl), env->vstart);       \
5020     for (i = env->vstart; i < i_max; ++i) {                               \
5021         if (!vm && !vext_elem_mask(v0, i)) {                              \
5022             /* set masked-off elements to 1s */                           \
5023             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5024             continue;                                                     \
5025         }                                                                 \
5026         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));              \
5027     }                                                                     \
5028                                                                           \
5029     for (i = i_max; i < vl; ++i) {                                        \
5030         if (vm || vext_elem_mask(v0, i)) {                                \
5031             *((ETYPE *)vd + H(i)) = 0;                                    \
5032         }                                                                 \
5033     }                                                                     \
5034                                                                           \
5035     env->vstart = 0;                                                      \
5036     /* set tail elements to 1s */                                         \
5037     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5038 }
5039 
5040 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
5041 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
5042 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
5043 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
5044 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
5045 
5046 #define GEN_VEXT_VSLIE1UP(BITWIDTH, H)                                      \
5047 static void vslide1up_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5048                                  void *vs2, CPURISCVState *env,             \
5049                                  uint32_t desc)                             \
5050 {                                                                           \
5051     typedef uint##BITWIDTH##_t ETYPE;                                       \
5052     uint32_t vm = vext_vm(desc);                                            \
5053     uint32_t vl = env->vl;                                                  \
5054     uint32_t esz = sizeof(ETYPE);                                           \
5055     uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
5056     uint32_t vta = vext_vta(desc);                                          \
5057     uint32_t vma = vext_vma(desc);                                          \
5058     uint32_t i;                                                             \
5059                                                                             \
5060     for (i = env->vstart; i < vl; i++) {                                    \
5061         if (!vm && !vext_elem_mask(v0, i)) {                                \
5062             /* set masked-off elements to 1s */                             \
5063             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);             \
5064             continue;                                                       \
5065         }                                                                   \
5066         if (i == 0) {                                                       \
5067             *((ETYPE *)vd + H(i)) = s1;                                     \
5068         } else {                                                            \
5069             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
5070         }                                                                   \
5071     }                                                                       \
5072     env->vstart = 0;                                                        \
5073     /* set tail elements to 1s */                                           \
5074     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
5075 }
5076 
5077 GEN_VEXT_VSLIE1UP(8,  H1)
5078 GEN_VEXT_VSLIE1UP(16, H2)
5079 GEN_VEXT_VSLIE1UP(32, H4)
5080 GEN_VEXT_VSLIE1UP(64, H8)
5081 
5082 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
5083 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5084                   CPURISCVState *env, uint32_t desc)              \
5085 {                                                                 \
5086     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
5087 }
5088 
5089 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
5090 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
5091 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
5092 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
5093 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
5094 
5095 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
5096 static void vslide1down_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5097                                    void *vs2, CPURISCVState *env,             \
5098                                    uint32_t desc)                             \
5099 {                                                                             \
5100     typedef uint##BITWIDTH##_t ETYPE;                                         \
5101     uint32_t vm = vext_vm(desc);                                              \
5102     uint32_t vl = env->vl;                                                    \
5103     uint32_t esz = sizeof(ETYPE);                                             \
5104     uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
5105     uint32_t vta = vext_vta(desc);                                            \
5106     uint32_t vma = vext_vma(desc);                                            \
5107     uint32_t i;                                                               \
5108                                                                               \
5109     for (i = env->vstart; i < vl; i++) {                                      \
5110         if (!vm && !vext_elem_mask(v0, i)) {                                  \
5111             /* set masked-off elements to 1s */                               \
5112             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);               \
5113             continue;                                                         \
5114         }                                                                     \
5115         if (i == vl - 1) {                                                    \
5116             *((ETYPE *)vd + H(i)) = s1;                                       \
5117         } else {                                                              \
5118             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
5119         }                                                                     \
5120     }                                                                         \
5121     env->vstart = 0;                                                          \
5122     /* set tail elements to 1s */                                             \
5123     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
5124 }
5125 
5126 GEN_VEXT_VSLIDE1DOWN(8,  H1)
5127 GEN_VEXT_VSLIDE1DOWN(16, H2)
5128 GEN_VEXT_VSLIDE1DOWN(32, H4)
5129 GEN_VEXT_VSLIDE1DOWN(64, H8)
5130 
5131 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
5132 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5133                   CPURISCVState *env, uint32_t desc)              \
5134 {                                                                 \
5135     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
5136 }
5137 
5138 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
5139 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
5140 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
5141 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
5142 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
5143 
5144 /* Vector Floating-Point Slide Instructions */
5145 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
5146 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5147                   CPURISCVState *env, uint32_t desc)          \
5148 {                                                             \
5149     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
5150 }
5151 
5152 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
5153 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
5154 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
5155 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5156 
5157 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
5158 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5159                   CPURISCVState *env, uint32_t desc)          \
5160 {                                                             \
5161     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
5162 }
5163 
5164 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
5165 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
5166 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
5167 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
5168 
5169 /* Vector Register Gather Instruction */
5170 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
5171 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5172                   CPURISCVState *env, uint32_t desc)                      \
5173 {                                                                         \
5174     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
5175     uint32_t vm = vext_vm(desc);                                          \
5176     uint32_t vl = env->vl;                                                \
5177     uint32_t esz = sizeof(TS2);                                           \
5178     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5179     uint32_t vta = vext_vta(desc);                                        \
5180     uint32_t vma = vext_vma(desc);                                        \
5181     uint64_t index;                                                       \
5182     uint32_t i;                                                           \
5183                                                                           \
5184     for (i = env->vstart; i < vl; i++) {                                  \
5185         if (!vm && !vext_elem_mask(v0, i)) {                              \
5186             /* set masked-off elements to 1s */                           \
5187             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5188             continue;                                                     \
5189         }                                                                 \
5190         index = *((TS1 *)vs1 + HS1(i));                                   \
5191         if (index >= vlmax) {                                             \
5192             *((TS2 *)vd + HS2(i)) = 0;                                    \
5193         } else {                                                          \
5194             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5195         }                                                                 \
5196     }                                                                     \
5197     env->vstart = 0;                                                      \
5198     /* set tail elements to 1s */                                         \
5199     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5200 }
5201 
5202 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5203 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5204 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5205 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5206 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5207 
5208 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5209 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5210 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5211 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
5212 
5213 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5214 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5215                   CPURISCVState *env, uint32_t desc)                      \
5216 {                                                                         \
5217     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5218     uint32_t vm = vext_vm(desc);                                          \
5219     uint32_t vl = env->vl;                                                \
5220     uint32_t esz = sizeof(ETYPE);                                         \
5221     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5222     uint32_t vta = vext_vta(desc);                                        \
5223     uint32_t vma = vext_vma(desc);                                        \
5224     uint64_t index = s1;                                                  \
5225     uint32_t i;                                                           \
5226                                                                           \
5227     for (i = env->vstart; i < vl; i++) {                                  \
5228         if (!vm && !vext_elem_mask(v0, i)) {                              \
5229             /* set masked-off elements to 1s */                           \
5230             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5231             continue;                                                     \
5232         }                                                                 \
5233         if (index >= vlmax) {                                             \
5234             *((ETYPE *)vd + H(i)) = 0;                                    \
5235         } else {                                                          \
5236             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5237         }                                                                 \
5238     }                                                                     \
5239     env->vstart = 0;                                                      \
5240     /* set tail elements to 1s */                                         \
5241     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5242 }
5243 
5244 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[rs1] */
5245 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5246 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5247 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5248 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
5249 
5250 /* Vector Compress Instruction */
5251 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5252 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5253                   CPURISCVState *env, uint32_t desc)                      \
5254 {                                                                         \
5255     uint32_t vl = env->vl;                                                \
5256     uint32_t esz = sizeof(ETYPE);                                         \
5257     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5258     uint32_t vta = vext_vta(desc);                                        \
5259     uint32_t num = 0, i;                                                  \
5260                                                                           \
5261     for (i = env->vstart; i < vl; i++) {                                  \
5262         if (!vext_elem_mask(vs1, i)) {                                    \
5263             continue;                                                     \
5264         }                                                                 \
5265         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5266         num++;                                                            \
5267     }                                                                     \
5268     env->vstart = 0;                                                      \
5269     /* set tail elements to 1s */                                         \
5270     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5271 }
5272 
5273 /* Compress into vd elements of vs2 where vs1 is enabled */
5274 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5275 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5276 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5277 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
5278 
5279 /* Vector Whole Register Move */
5280 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5281 {
5282     /* EEW = SEW */
5283     uint32_t maxsz = simd_maxsz(desc);
5284     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5285     uint32_t startb = env->vstart * sewb;
5286     uint32_t i = startb;
5287 
5288     memcpy((uint8_t *)vd + H1(i),
5289            (uint8_t *)vs2 + H1(i),
5290            maxsz - startb);
5291 
5292     env->vstart = 0;
5293 }
5294 
5295 /* Vector Integer Extension */
5296 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5297 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5298                   CPURISCVState *env, uint32_t desc)             \
5299 {                                                                \
5300     uint32_t vl = env->vl;                                       \
5301     uint32_t vm = vext_vm(desc);                                 \
5302     uint32_t esz = sizeof(ETYPE);                                \
5303     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5304     uint32_t vta = vext_vta(desc);                               \
5305     uint32_t vma = vext_vma(desc);                               \
5306     uint32_t i;                                                  \
5307                                                                  \
5308     for (i = env->vstart; i < vl; i++) {                         \
5309         if (!vm && !vext_elem_mask(v0, i)) {                     \
5310             /* set masked-off elements to 1s */                  \
5311             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);  \
5312             continue;                                            \
5313         }                                                        \
5314         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5315     }                                                            \
5316     env->vstart = 0;                                             \
5317     /* set tail elements to 1s */                                \
5318     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5319 }
5320 
5321 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5322 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5323 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5324 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5325 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5326 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5327 
5328 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5329 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5330 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5331 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5332 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5333 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
5334