xref: /qemu/target/riscv/vector_helper.c (revision a0e93dd8)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/cpu_ldst.h"
26 #include "exec/helper-proto.h"
27 #include "fpu/softfloat.h"
28 #include "tcg/tcg-gvec-desc.h"
29 #include "internals.h"
30 #include "vector_internals.h"
31 #include <math.h>
32 
33 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
34                             target_ulong s2)
35 {
36     int vlmax, vl;
37     RISCVCPU *cpu = env_archcpu(env);
38     uint64_t vlmul = FIELD_EX64(s2, VTYPE, VLMUL);
39     uint8_t vsew = FIELD_EX64(s2, VTYPE, VSEW);
40     uint16_t sew = 8 << vsew;
41     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
42     int xlen = riscv_cpu_xlen(env);
43     bool vill = (s2 >> (xlen - 1)) & 0x1;
44     target_ulong reserved = s2 &
45                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
46                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
47     int8_t lmul;
48 
49     if (vlmul & 4) {
50         /*
51          * Fractional LMUL, check:
52          *
53          * VLEN * LMUL >= SEW
54          * VLEN >> (8 - lmul) >= sew
55          * (vlenb << 3) >> (8 - lmul) >= sew
56          * vlenb >> (8 - 3 - lmul) >= sew
57          */
58         if (vlmul == 4 ||
59             cpu->cfg.vlenb >> (8 - 3 - vlmul) < sew) {
60             vill = true;
61         }
62     }
63 
64     if ((sew > cpu->cfg.elen) || vill || (ediv != 0) || (reserved != 0)) {
65         /* only set vill bit. */
66         env->vill = 1;
67         env->vtype = 0;
68         env->vl = 0;
69         env->vstart = 0;
70         return 0;
71     }
72 
73     /* lmul encoded as in DisasContext::lmul */
74     lmul = sextract32(FIELD_EX64(s2, VTYPE, VLMUL), 0, 3);
75     vlmax = vext_get_vlmax(cpu->cfg.vlenb, vsew, lmul);
76     if (s1 <= vlmax) {
77         vl = s1;
78     } else {
79         vl = vlmax;
80     }
81     env->vl = vl;
82     env->vtype = s2;
83     env->vstart = 0;
84     env->vill = 0;
85     return vl;
86 }
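/*
 * Worked example: with vlenb = 16 (VLEN = 128 bits), vsew = 2 (SEW = 32)
 * and LMUL = 1, vlmax = (VLEN / SEW) * LMUL = (128 / 32) * 1 = 4, so a
 * requested AVL of s1 = 10 is clamped to vl = 4, while s1 = 3 gives vl = 3.
 */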
87 
88 /*
89  * Get the maximum number of elements that can be operated on.
90  *
91  * log2_esz: log2 of element size in bytes.
92  */
93 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
94 {
95     /*
96      * As simd_desc supports at most 2048 bytes, the max vlen is 1024 bits,
97      * so vlen in bytes (vlenb) is encoded as maxsz.
98      */
99     uint32_t vlenb = simd_maxsz(desc);
100 
101     /* Return VLMAX */
102     int scale = vext_lmul(desc) - log2_esz;
103     return scale < 0 ? vlenb >> -scale : vlenb << scale;
104 }
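/*
 * Example: for vlenb = 16 (VLEN = 128 bits), LMUL = 2 (vext_lmul() == 1)
 * and 16-bit elements (log2_esz = 1), scale = 1 - 1 = 0 and the result is
 * 16 elements, matching VLEN * LMUL / SEW = 128 * 2 / 16.
 */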
105 
106 static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
107 {
108     return (addr & ~env->cur_pmmask) | env->cur_pmbase;
109 }
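/*
 * cur_pmmask/cur_pmbase hold the currently active address-masking state
 * (pointer masking / address truncation): the masked-out top bits of the
 * effective address are replaced by the configured base.  With masking
 * disabled (mask all ones, base zero) adjust_addr() is the identity.
 */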
110 
111 /*
112  * This function checks the watchpoint before the real load operation.
113  *
114  * In system mode, the TLB API probe_access is enough for the watchpoint
115  * check.  In user mode, there is no watchpoint support for now.
116  *
117  * It will trigger an exception if there is no mapping in the TLB and the
118  * page table walk can't fill the TLB entry.  The guest software can then
119  * return here after processing the exception, or never return.
120  */
121 static void probe_pages(CPURISCVState *env, target_ulong addr,
122                         target_ulong len, uintptr_t ra,
123                         MMUAccessType access_type)
124 {
125     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
126     target_ulong curlen = MIN(pagelen, len);
127     int mmu_index = riscv_env_mmu_index(env, false);
128 
129     probe_access(env, adjust_addr(env, addr), curlen, access_type,
130                  mmu_index, ra);
131     if (len > curlen) {
132         addr += curlen;
133         curlen = len - curlen;
134         probe_access(env, adjust_addr(env, addr), curlen, access_type,
135                      mmu_index, ra);
136     }
137 }
138 
139 static inline void vext_set_elem_mask(void *v0, int index,
140                                       uint8_t value)
141 {
142     int idx = index / 64;
143     int pos = index % 64;
144     uint64_t old = ((uint64_t *)v0)[idx];
145     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
146 }
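/*
 * Example: vext_set_elem_mask(v0, 70, 1) updates bit 6 of the second
 * uint64_t word of the mask register (idx = 1, pos = 6); deposit64()
 * rewrites only that 1-bit field and leaves the other bits untouched.
 */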
147 
148 /* element operations for load and store */
149 typedef void vext_ldst_elem_fn(CPURISCVState *env, abi_ptr addr,
150                                uint32_t idx, void *vd, uintptr_t retaddr);
151 
152 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
153 static void NAME(CPURISCVState *env, abi_ptr addr,         \
154                  uint32_t idx, void *vd, uintptr_t retaddr)\
155 {                                                          \
156     ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
157     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
158 }                                                          \
159 
160 GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
161 GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
162 GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
163 GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
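/*
 * For reference, GEN_VEXT_LD_ELEM(lde_b, int8_t, H1, ldsb) expands to
 * roughly:
 *
 *   static void lde_b(CPURISCVState *env, abi_ptr addr,
 *                     uint32_t idx, void *vd, uintptr_t retaddr)
 *   {
 *       int8_t *cur = ((int8_t *)vd + H1(idx));
 *       *cur = cpu_ldsb_data_ra(env, addr, retaddr);
 *   }
 */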
164 
165 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
166 static void NAME(CPURISCVState *env, abi_ptr addr,         \
167                  uint32_t idx, void *vd, uintptr_t retaddr)\
168 {                                                          \
169     ETYPE data = *((ETYPE *)vd + H(idx));                  \
170     cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
171 }
172 
173 GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
174 GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
175 GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
176 GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
177 
178 static void vext_set_tail_elems_1s(target_ulong vl, void *vd,
179                                    uint32_t desc, uint32_t nf,
180                                    uint32_t esz, uint32_t max_elems)
181 {
182     uint32_t vta = vext_vta(desc);
183     int k;
184 
185     if (vta == 0) {
186         return;
187     }
188 
189     for (k = 0; k < nf; ++k) {
190         vext_set_elems_1s(vd, vta, (k * max_elems + vl) * esz,
191                           (k * max_elems + max_elems) * esz);
192     }
193 }
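/*
 * Tail policy: with vta == 0 (tail-undisturbed) the tail elements are left
 * as they are; with vta != 0 (tail-agnostic) this implementation fills the
 * tail of each of the nf register fields with all 1s, which is one of the
 * behaviours the specification permits for agnostic elements.
 */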
194 
195 /*
196  * stride: access vector elements from strided memory
197  */
198 static void
199 vext_ldst_stride(void *vd, void *v0, target_ulong base,
200                  target_ulong stride, CPURISCVState *env,
201                  uint32_t desc, uint32_t vm,
202                  vext_ldst_elem_fn *ldst_elem,
203                  uint32_t log2_esz, uintptr_t ra)
204 {
205     uint32_t i, k;
206     uint32_t nf = vext_nf(desc);
207     uint32_t max_elems = vext_max_elems(desc, log2_esz);
208     uint32_t esz = 1 << log2_esz;
209     uint32_t vma = vext_vma(desc);
210 
211     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
212         k = 0;
213         while (k < nf) {
214             if (!vm && !vext_elem_mask(v0, i)) {
215                 /* set masked-off elements to 1s */
216                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
217                                   (i + k * max_elems + 1) * esz);
218                 k++;
219                 continue;
220             }
221             target_ulong addr = base + stride * i + (k << log2_esz);
222             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
223             k++;
224         }
225     }
226     env->vstart = 0;
227 
228     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
229 }
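/*
 * Example: for vlse32.v (nf = 1, esz = 4) element i is loaded from
 * base + i * stride, where stride is the byte stride taken from rs2;
 * masked-off elements are skipped apart from the optional agnostic
 * overwrite above.
 */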
230 
231 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
232 void HELPER(NAME)(void *vd, void * v0, target_ulong base,               \
233                   target_ulong stride, CPURISCVState *env,              \
234                   uint32_t desc)                                        \
235 {                                                                       \
236     uint32_t vm = vext_vm(desc);                                        \
237     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
238                      ctzl(sizeof(ETYPE)), GETPC());                     \
239 }
240 
241 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
242 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
243 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
244 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
245 
246 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
247 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
248                   target_ulong stride, CPURISCVState *env,              \
249                   uint32_t desc)                                        \
250 {                                                                       \
251     uint32_t vm = vext_vm(desc);                                        \
252     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
253                      ctzl(sizeof(ETYPE)), GETPC());                     \
254 }
255 
256 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b)
257 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
258 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
259 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
260 
261 /*
262  * unit-stride: access elements stored contiguously in memory
263  */
264 
265 /* unmasked unit-stride load and store operation */
266 static void
267 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
268              vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
269              uintptr_t ra)
270 {
271     uint32_t i, k;
272     uint32_t nf = vext_nf(desc);
273     uint32_t max_elems = vext_max_elems(desc, log2_esz);
274     uint32_t esz = 1 << log2_esz;
275 
276     /* load/store elements from/to guest memory */
277     for (i = env->vstart; i < evl; i++, env->vstart++) {
278         k = 0;
279         while (k < nf) {
280             target_ulong addr = base + ((i * nf + k) << log2_esz);
281             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
282             k++;
283         }
284     }
285     env->vstart = 0;
286 
287     vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
288 }
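/*
 * Unit-stride layout: field k of element i is accessed at
 * base + (i * nf + k) * esz, i.e. the nf fields of a segment are contiguous
 * in memory and segments follow one another, while in the destination each
 * field k lives in its own register group (offset k * max_elems).
 */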
289 
290 /*
291  * A masked unit-stride load or store operation is handled as a special
292  * case of the strided form, with stride = NF * sizeof(ETYPE).
293  */
294 
295 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN)                            \
296 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
297                          CPURISCVState *env, uint32_t desc)             \
298 {                                                                       \
299     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
300     vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
301                      ctzl(sizeof(ETYPE)), GETPC());                     \
302 }                                                                       \
303                                                                         \
304 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
305                   CPURISCVState *env, uint32_t desc)                    \
306 {                                                                       \
307     vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
308                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
309 }
310 
311 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b)
312 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
313 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
314 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
315 
316 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN)                            \
317 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
318                          CPURISCVState *env, uint32_t desc)              \
319 {                                                                        \
320     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
321     vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,   \
322                      ctzl(sizeof(ETYPE)), GETPC());                      \
323 }                                                                        \
324                                                                          \
325 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
326                   CPURISCVState *env, uint32_t desc)                     \
327 {                                                                        \
328     vext_ldst_us(vd, base, env, desc, STORE_FN,                          \
329                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                 \
330 }
331 
332 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b)
333 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
334 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
335 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
336 
337 /*
338  * unit stride mask load and store, EEW = 1
339  */
340 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
341                     CPURISCVState *env, uint32_t desc)
342 {
343     /* evl = ceil(vl/8) */
344     uint8_t evl = (env->vl + 7) >> 3;
345     vext_ldst_us(vd, base, env, desc, lde_b,
346                  0, evl, GETPC());
347 }
348 
349 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
350                     CPURISCVState *env, uint32_t desc)
351 {
352     /* evl = ceil(vl/8) */
353     uint8_t evl = (env->vl + 7) >> 3;
354     vext_ldst_us(vd, base, env, desc, ste_b,
355                  0, evl, GETPC());
356 }
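/*
 * Example: with vl = 17, evl = (17 + 7) >> 3 = 3, so three bytes of the
 * mask register are transferred regardless of the current SEW/LMUL.
 */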
357 
358 /*
359  * index: access vector elements from indexed memory
360  */
361 typedef target_ulong vext_get_index_addr(target_ulong base,
362         uint32_t idx, void *vs2);
363 
364 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
365 static target_ulong NAME(target_ulong base,            \
366                          uint32_t idx, void *vs2)      \
367 {                                                      \
368     return (base + *((ETYPE *)vs2 + H(idx)));          \
369 }
370 
371 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
372 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
373 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
374 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
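/*
 * Example: idx_h() fetches a uint16_t offset from vs2, so for the
 * vlxei16_* helpers below the effective address of element i is
 * base + zero_extend(vs2[i], 16).  The index EEW is independent of the
 * data EEW, e.g. vlxei16_32_v loads 32-bit data using 16-bit indices.
 */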
375 
376 static inline void
377 vext_ldst_index(void *vd, void *v0, target_ulong base,
378                 void *vs2, CPURISCVState *env, uint32_t desc,
379                 vext_get_index_addr get_index_addr,
380                 vext_ldst_elem_fn *ldst_elem,
381                 uint32_t log2_esz, uintptr_t ra)
382 {
383     uint32_t i, k;
384     uint32_t nf = vext_nf(desc);
385     uint32_t vm = vext_vm(desc);
386     uint32_t max_elems = vext_max_elems(desc, log2_esz);
387     uint32_t esz = 1 << log2_esz;
388     uint32_t vma = vext_vma(desc);
389 
390     /* load/store elements from/to guest memory */
391     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
392         k = 0;
393         while (k < nf) {
394             if (!vm && !vext_elem_mask(v0, i)) {
395                 /* set masked-off elements to 1s */
396                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
397                                   (i + k * max_elems + 1) * esz);
398                 k++;
399                 continue;
400             }
401             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
402             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
403             k++;
404         }
405     }
406     env->vstart = 0;
407 
408     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
409 }
410 
411 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
412 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
413                   void *vs2, CPURISCVState *env, uint32_t desc)            \
414 {                                                                          \
415     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
416                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
417 }
418 
419 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b)
420 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h)
421 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w)
422 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d)
423 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b)
424 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
425 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
426 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
427 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b)
428 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
429 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
430 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
431 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b)
432 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
433 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
434 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
435 
436 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
437 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
438                   void *vs2, CPURISCVState *env, uint32_t desc)  \
439 {                                                                \
440     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
441                     STORE_FN, ctzl(sizeof(ETYPE)),               \
442                     GETPC());                                    \
443 }
444 
445 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b)
446 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h)
447 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w)
448 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d)
449 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b)
450 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
451 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
452 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
453 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b)
454 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
455 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
456 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
457 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b)
458 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
459 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
460 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
461 
462 /*
463  * unit-stride fault-only-first load instructions
464  */
465 static inline void
466 vext_ldff(void *vd, void *v0, target_ulong base,
467           CPURISCVState *env, uint32_t desc,
468           vext_ldst_elem_fn *ldst_elem,
469           uint32_t log2_esz, uintptr_t ra)
470 {
471     void *host;
472     uint32_t i, k, vl = 0;
473     uint32_t nf = vext_nf(desc);
474     uint32_t vm = vext_vm(desc);
475     uint32_t max_elems = vext_max_elems(desc, log2_esz);
476     uint32_t esz = 1 << log2_esz;
477     uint32_t vma = vext_vma(desc);
478     target_ulong addr, offset, remain;
479     int mmu_index = riscv_env_mmu_index(env, false);
480 
481     /* probe every access */
482     for (i = env->vstart; i < env->vl; i++) {
483         if (!vm && !vext_elem_mask(v0, i)) {
484             continue;
485         }
486         addr = adjust_addr(env, base + i * (nf << log2_esz));
487         if (i == 0) {
488             probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
489         } else {
490             /* if it triggers an exception, no need to check watchpoint */
491             remain = nf << log2_esz;
492             while (remain > 0) {
493                 offset = -(addr | TARGET_PAGE_MASK);
494                 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD, mmu_index);
495                 if (host) {
496 #ifdef CONFIG_USER_ONLY
497                     if (!page_check_range(addr, offset, PAGE_READ)) {
498                         vl = i;
499                         goto ProbeSuccess;
500                     }
501 #else
502                     probe_pages(env, addr, offset, ra, MMU_DATA_LOAD);
503 #endif
504                 } else {
505                     vl = i;
506                     goto ProbeSuccess;
507                 }
508                 if (remain <= offset) {
509                     break;
510                 }
511                 remain -= offset;
512                 addr = adjust_addr(env, addr + offset);
513             }
514         }
515     }
516 ProbeSuccess:
517     /* load bytes from guest memory */
518     if (vl != 0) {
519         env->vl = vl;
520     }
521     for (i = env->vstart; i < env->vl; i++) {
522         k = 0;
523         while (k < nf) {
524             if (!vm && !vext_elem_mask(v0, i)) {
525                 /* set masked-off elements to 1s */
526                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
527                                   (i + k * max_elems + 1) * esz);
528                 k++;
529                 continue;
530             }
531             addr = base + ((i * nf + k) << log2_esz);
532             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
533             k++;
534         }
535     }
536     env->vstart = 0;
537 
538     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
539 }
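/*
 * Fault-only-first semantics: element 0 is probed and may trap as usual,
 * while a fault on any later element only truncates vl to the index of the
 * offending element (the "vl = i; goto ProbeSuccess" paths above); the load
 * then completes for the surviving elements below the new vl.
 */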
540 
541 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
542 void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
543                   CPURISCVState *env, uint32_t desc)      \
544 {                                                         \
545     vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
546               ctzl(sizeof(ETYPE)), GETPC());              \
547 }
548 
549 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b)
550 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
551 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
552 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
553 
554 #define DO_SWAP(N, M) (M)
555 #define DO_AND(N, M)  (N & M)
556 #define DO_XOR(N, M)  (N ^ M)
557 #define DO_OR(N, M)   (N | M)
558 #define DO_ADD(N, M)  (N + M)
559 
560 /* Signed min/max */
561 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
562 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
563 
564 /*
565  * load and store whole register instructions
566  */
567 static void
568 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
569                 vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
570 {
571     uint32_t i, k, off, pos;
572     uint32_t nf = vext_nf(desc);
573     uint32_t vlenb = riscv_cpu_cfg(env)->vlenb;
574     uint32_t max_elems = vlenb >> log2_esz;
575 
576     k = env->vstart / max_elems;
577     off = env->vstart % max_elems;
578 
579     if (off) {
580         /* load/store the remaining elements of the segment pointed to by vstart */
581         for (pos = off; pos < max_elems; pos++, env->vstart++) {
582             target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
583             ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd,
584                       ra);
585         }
586         k++;
587     }
588 
589     /* load/store elements for the remaining segments */
590     for (; k < nf; k++) {
591         for (i = 0; i < max_elems; i++, env->vstart++) {
592             target_ulong addr = base + ((i + k * max_elems) << log2_esz);
593             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
594         }
595     }
596 
597     env->vstart = 0;
598 }
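/*
 * Whole-register accesses ignore vl and vtype: the transfer size is
 * nf * vlenb bytes (max_elems elements per register), so e.g. vl2re32_v
 * always moves 2 * vlenb / 4 elements, resuming from env->vstart if a
 * previous attempt was interrupted part-way through.
 */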
599 
600 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
601 void HELPER(NAME)(void *vd, target_ulong base,       \
602                   CPURISCVState *env, uint32_t desc) \
603 {                                                    \
604     vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
605                     ctzl(sizeof(ETYPE)), GETPC());   \
606 }
607 
608 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b)
609 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
610 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
611 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
612 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b)
613 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
614 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
615 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
616 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b)
617 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
618 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
619 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
620 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b)
621 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
622 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
623 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
624 
625 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
626 void HELPER(NAME)(void *vd, target_ulong base,       \
627                   CPURISCVState *env, uint32_t desc) \
628 {                                                    \
629     vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
630                     ctzl(sizeof(ETYPE)), GETPC());   \
631 }
632 
633 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
634 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
635 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
636 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
637 
638 /*
639  * Vector Integer Arithmetic Instructions
640  */
641 
642 /* (TD, T1, T2, TX1, TX2) */
643 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
644 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
645 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
646 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
647 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
648 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
649 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
650 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
651 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
652 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
653 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
654 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
655 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
656 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
657 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
658 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
659 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
660 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
661 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
662 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
663 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
664 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
665 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
666 
667 #define DO_SUB(N, M) (N - M)
668 #define DO_RSUB(N, M) (M - N)
669 
670 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
671 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
672 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
673 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
674 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
675 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
676 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
677 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
678 
679 GEN_VEXT_VV(vadd_vv_b, 1)
680 GEN_VEXT_VV(vadd_vv_h, 2)
681 GEN_VEXT_VV(vadd_vv_w, 4)
682 GEN_VEXT_VV(vadd_vv_d, 8)
683 GEN_VEXT_VV(vsub_vv_b, 1)
684 GEN_VEXT_VV(vsub_vv_h, 2)
685 GEN_VEXT_VV(vsub_vv_w, 4)
686 GEN_VEXT_VV(vsub_vv_d, 8)
687 
688 
689 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
690 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
691 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
692 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
693 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
694 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
695 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
696 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
697 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
698 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
699 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
700 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
701 
702 GEN_VEXT_VX(vadd_vx_b, 1)
703 GEN_VEXT_VX(vadd_vx_h, 2)
704 GEN_VEXT_VX(vadd_vx_w, 4)
705 GEN_VEXT_VX(vadd_vx_d, 8)
706 GEN_VEXT_VX(vsub_vx_b, 1)
707 GEN_VEXT_VX(vsub_vx_h, 2)
708 GEN_VEXT_VX(vsub_vx_w, 4)
709 GEN_VEXT_VX(vsub_vx_d, 8)
710 GEN_VEXT_VX(vrsub_vx_b, 1)
711 GEN_VEXT_VX(vrsub_vx_h, 2)
712 GEN_VEXT_VX(vrsub_vx_w, 4)
713 GEN_VEXT_VX(vrsub_vx_d, 8)
714 
715 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
716 {
717     intptr_t oprsz = simd_oprsz(desc);
718     intptr_t i;
719 
720     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
721         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
722     }
723 }
724 
725 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
726 {
727     intptr_t oprsz = simd_oprsz(desc);
728     intptr_t i;
729 
730     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
731         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
732     }
733 }
734 
735 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
736 {
737     intptr_t oprsz = simd_oprsz(desc);
738     intptr_t i;
739 
740     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
741         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
742     }
743 }
744 
745 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
746 {
747     intptr_t oprsz = simd_oprsz(desc);
748     intptr_t i;
749 
750     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
751         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
752     }
753 }
754 
755 /* Vector Widening Integer Add/Subtract */
756 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
757 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
758 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
759 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
760 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
761 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
762 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
763 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
764 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
765 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
766 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
767 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
768 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
769 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
770 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
771 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
772 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
773 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
774 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
775 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
776 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
777 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
778 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
779 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
780 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
781 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
782 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
783 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
784 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
785 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
786 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
787 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
788 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
789 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
790 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
791 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
792 GEN_VEXT_VV(vwaddu_vv_b, 2)
793 GEN_VEXT_VV(vwaddu_vv_h, 4)
794 GEN_VEXT_VV(vwaddu_vv_w, 8)
795 GEN_VEXT_VV(vwsubu_vv_b, 2)
796 GEN_VEXT_VV(vwsubu_vv_h, 4)
797 GEN_VEXT_VV(vwsubu_vv_w, 8)
798 GEN_VEXT_VV(vwadd_vv_b, 2)
799 GEN_VEXT_VV(vwadd_vv_h, 4)
800 GEN_VEXT_VV(vwadd_vv_w, 8)
801 GEN_VEXT_VV(vwsub_vv_b, 2)
802 GEN_VEXT_VV(vwsub_vv_h, 4)
803 GEN_VEXT_VV(vwsub_vv_w, 8)
804 GEN_VEXT_VV(vwaddu_wv_b, 2)
805 GEN_VEXT_VV(vwaddu_wv_h, 4)
806 GEN_VEXT_VV(vwaddu_wv_w, 8)
807 GEN_VEXT_VV(vwsubu_wv_b, 2)
808 GEN_VEXT_VV(vwsubu_wv_h, 4)
809 GEN_VEXT_VV(vwsubu_wv_w, 8)
810 GEN_VEXT_VV(vwadd_wv_b, 2)
811 GEN_VEXT_VV(vwadd_wv_h, 4)
812 GEN_VEXT_VV(vwadd_wv_w, 8)
813 GEN_VEXT_VV(vwsub_wv_b, 2)
814 GEN_VEXT_VV(vwsub_wv_h, 4)
815 GEN_VEXT_VV(vwsub_wv_w, 8)
816 
817 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
818 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
819 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
820 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
821 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
822 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
823 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
824 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
825 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
826 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
827 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
828 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
829 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
830 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
831 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
832 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
833 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
834 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
835 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
836 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
837 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
838 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
839 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
840 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
841 GEN_VEXT_VX(vwaddu_vx_b, 2)
842 GEN_VEXT_VX(vwaddu_vx_h, 4)
843 GEN_VEXT_VX(vwaddu_vx_w, 8)
844 GEN_VEXT_VX(vwsubu_vx_b, 2)
845 GEN_VEXT_VX(vwsubu_vx_h, 4)
846 GEN_VEXT_VX(vwsubu_vx_w, 8)
847 GEN_VEXT_VX(vwadd_vx_b, 2)
848 GEN_VEXT_VX(vwadd_vx_h, 4)
849 GEN_VEXT_VX(vwadd_vx_w, 8)
850 GEN_VEXT_VX(vwsub_vx_b, 2)
851 GEN_VEXT_VX(vwsub_vx_h, 4)
852 GEN_VEXT_VX(vwsub_vx_w, 8)
853 GEN_VEXT_VX(vwaddu_wx_b, 2)
854 GEN_VEXT_VX(vwaddu_wx_h, 4)
855 GEN_VEXT_VX(vwaddu_wx_w, 8)
856 GEN_VEXT_VX(vwsubu_wx_b, 2)
857 GEN_VEXT_VX(vwsubu_wx_h, 4)
858 GEN_VEXT_VX(vwsubu_wx_w, 8)
859 GEN_VEXT_VX(vwadd_wx_b, 2)
860 GEN_VEXT_VX(vwadd_wx_h, 4)
861 GEN_VEXT_VX(vwadd_wx_w, 8)
862 GEN_VEXT_VX(vwsub_wx_b, 2)
863 GEN_VEXT_VX(vwsub_wx_h, 4)
864 GEN_VEXT_VX(vwsub_wx_w, 8)
865 
866 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
867 #define DO_VADC(N, M, C) (N + M + C)
868 #define DO_VSBC(N, M, C) (N - M - C)
869 
870 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
871 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
872                   CPURISCVState *env, uint32_t desc)          \
873 {                                                             \
874     uint32_t vl = env->vl;                                    \
875     uint32_t esz = sizeof(ETYPE);                             \
876     uint32_t total_elems =                                    \
877         vext_get_total_elems(env, desc, esz);                 \
878     uint32_t vta = vext_vta(desc);                            \
879     uint32_t i;                                               \
880                                                               \
881     for (i = env->vstart; i < vl; i++) {                      \
882         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
883         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
884         ETYPE carry = vext_elem_mask(v0, i);                  \
885                                                               \
886         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
887     }                                                         \
888     env->vstart = 0;                                          \
889     /* set tail elements to 1s */                             \
890     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
891 }
892 
893 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
894 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
895 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
896 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
897 
898 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
899 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
900 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
901 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
902 
903 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
904 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
905                   CPURISCVState *env, uint32_t desc)                     \
906 {                                                                        \
907     uint32_t vl = env->vl;                                               \
908     uint32_t esz = sizeof(ETYPE);                                        \
909     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
910     uint32_t vta = vext_vta(desc);                                       \
911     uint32_t i;                                                          \
912                                                                          \
913     for (i = env->vstart; i < vl; i++) {                                 \
914         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
915         ETYPE carry = vext_elem_mask(v0, i);                             \
916                                                                          \
917         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
918     }                                                                    \
919     env->vstart = 0;                                                     \
920     /* set tail elements to 1s */                                        \
921     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
922 }
923 
924 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
925 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
926 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
927 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
928 
929 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
930 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
931 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
932 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
933 
934 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
935                           (__typeof(N))(N + M) < N)
936 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
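/*
 * The carry/borrow-out is derived from unsigned wrap-around.  Example with
 * uint8_t operands: DO_MADC(200, 100, 0) is (uint8_t)(200 + 100) == 44,
 * and 44 < 200, so the carry-out is 1; DO_MSBC(5, 9, 0) is 5 < 9, i.e. a
 * borrow-out of 1.
 */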
937 
938 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
939 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
940                   CPURISCVState *env, uint32_t desc)          \
941 {                                                             \
942     uint32_t vl = env->vl;                                    \
943     uint32_t vm = vext_vm(desc);                              \
944     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
945     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
946     uint32_t i;                                               \
947                                                               \
948     for (i = env->vstart; i < vl; i++) {                      \
949         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
950         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
951         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
952         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
953     }                                                         \
954     env->vstart = 0;                                          \
955     /*
956      * mask destination register is always tail-agnostic
957      * set tail elements to 1s
958      */                                                       \
959     if (vta_all_1s) {                                         \
960         for (; i < total_elems; i++) {                        \
961             vext_set_elem_mask(vd, i, 1);                     \
962         }                                                     \
963     }                                                         \
964 }
965 
966 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
967 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
968 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
969 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
970 
971 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
972 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
973 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
974 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
975 
976 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
977 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
978                   void *vs2, CPURISCVState *env, uint32_t desc) \
979 {                                                               \
980     uint32_t vl = env->vl;                                      \
981     uint32_t vm = vext_vm(desc);                                \
982     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;      \
983     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
984     uint32_t i;                                                 \
985                                                                 \
986     for (i = env->vstart; i < vl; i++) {                        \
987         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
988         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
989         vext_set_elem_mask(vd, i,                               \
990                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
991     }                                                           \
992     env->vstart = 0;                                            \
993     /*
994      * mask destination register is always tail-agnostic
995      * set tail elements to 1s
996      */                                                         \
997     if (vta_all_1s) {                                           \
998         for (; i < total_elems; i++) {                          \
999             vext_set_elem_mask(vd, i, 1);                       \
1000         }                                                       \
1001     }                                                           \
1002 }
1003 
1004 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1005 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1006 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1007 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1008 
1009 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1010 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1011 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1012 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1013 
1014 /* Vector Bitwise Logical Instructions */
1015 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1016 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1017 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1018 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1019 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1020 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1021 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1022 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1023 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1024 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1025 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1026 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1027 GEN_VEXT_VV(vand_vv_b, 1)
1028 GEN_VEXT_VV(vand_vv_h, 2)
1029 GEN_VEXT_VV(vand_vv_w, 4)
1030 GEN_VEXT_VV(vand_vv_d, 8)
1031 GEN_VEXT_VV(vor_vv_b, 1)
1032 GEN_VEXT_VV(vor_vv_h, 2)
1033 GEN_VEXT_VV(vor_vv_w, 4)
1034 GEN_VEXT_VV(vor_vv_d, 8)
1035 GEN_VEXT_VV(vxor_vv_b, 1)
1036 GEN_VEXT_VV(vxor_vv_h, 2)
1037 GEN_VEXT_VV(vxor_vv_w, 4)
1038 GEN_VEXT_VV(vxor_vv_d, 8)
1039 
1040 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1041 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1042 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1043 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1044 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1045 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1046 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1047 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1048 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1049 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1050 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1051 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1052 GEN_VEXT_VX(vand_vx_b, 1)
1053 GEN_VEXT_VX(vand_vx_h, 2)
1054 GEN_VEXT_VX(vand_vx_w, 4)
1055 GEN_VEXT_VX(vand_vx_d, 8)
1056 GEN_VEXT_VX(vor_vx_b, 1)
1057 GEN_VEXT_VX(vor_vx_h, 2)
1058 GEN_VEXT_VX(vor_vx_w, 4)
1059 GEN_VEXT_VX(vor_vx_d, 8)
1060 GEN_VEXT_VX(vxor_vx_b, 1)
1061 GEN_VEXT_VX(vxor_vx_h, 2)
1062 GEN_VEXT_VX(vxor_vx_w, 4)
1063 GEN_VEXT_VX(vxor_vx_d, 8)
1064 
1065 /* Vector Single-Width Bit Shift Instructions */
1066 #define DO_SLL(N, M)  (N << (M))
1067 #define DO_SRL(N, M)  (N >> (M))
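/*
 * Note: the arithmetic right shifts (vsra*) below reuse DO_SRL but
 * instantiate it with a signed source type, relying on '>>' of a negative
 * signed value being an arithmetic shift on the hosts QEMU supports.
 */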
1068 
1069 /* generate the helpers for shift instructions with two vector operands */
1070 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1071 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1072                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1073 {                                                                         \
1074     uint32_t vm = vext_vm(desc);                                          \
1075     uint32_t vl = env->vl;                                                \
1076     uint32_t esz = sizeof(TS1);                                           \
1077     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1078     uint32_t vta = vext_vta(desc);                                        \
1079     uint32_t vma = vext_vma(desc);                                        \
1080     uint32_t i;                                                           \
1081                                                                           \
1082     for (i = env->vstart; i < vl; i++) {                                  \
1083         if (!vm && !vext_elem_mask(v0, i)) {                              \
1084             /* set masked-off elements to 1s */                           \
1085             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
1086             continue;                                                     \
1087         }                                                                 \
1088         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1089         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1090         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1091     }                                                                     \
1092     env->vstart = 0;                                                      \
1093     /* set tail elements to 1s */                                         \
1094     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1095 }
1096 
1097 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1098 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1099 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1100 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1101 
1102 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1103 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1104 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1105 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1106 
1107 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1108 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1109 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1110 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1111 
1112 /*
1113  * generate the helpers for shift instructions with one vector and one scalar
1114  */
1115 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1116 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1117                   void *vs2, CPURISCVState *env,            \
1118                   uint32_t desc)                            \
1119 {                                                           \
1120     uint32_t vm = vext_vm(desc);                            \
1121     uint32_t vl = env->vl;                                  \
1122     uint32_t esz = sizeof(TD);                              \
1123     uint32_t total_elems =                                  \
1124         vext_get_total_elems(env, desc, esz);               \
1125     uint32_t vta = vext_vta(desc);                          \
1126     uint32_t vma = vext_vma(desc);                          \
1127     uint32_t i;                                             \
1128                                                             \
1129     for (i = env->vstart; i < vl; i++) {                    \
1130         if (!vm && !vext_elem_mask(v0, i)) {                \
1131             /* set masked-off elements to 1s */             \
1132             vext_set_elems_1s(vd, vma, i * esz,             \
1133                               (i + 1) * esz);               \
1134             continue;                                       \
1135         }                                                   \
1136         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1137         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1138     }                                                       \
1139     env->vstart = 0;                                        \
1140     /* set tail elements to 1s */                           \
1141     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1142 }
1143 
1144 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1145 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1146 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1147 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1148 
1149 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1150 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1151 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1152 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1153 
1154 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1155 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1156 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1157 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1158 
1159 /* Vector Narrowing Integer Right Shift Instructions */
1160 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1161 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1162 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1163 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1164 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1165 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1166 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1167 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1168 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1169 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1170 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1171 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1172 
1173 /* Vector Integer Comparison Instructions */
1174 #define DO_MSEQ(N, M) (N == M)
1175 #define DO_MSNE(N, M) (N != M)
1176 #define DO_MSLT(N, M) (N < M)
1177 #define DO_MSLE(N, M) (N <= M)
1178 #define DO_MSGT(N, M) (N > M)
1179 
1180 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1181 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1182                   CPURISCVState *env, uint32_t desc)          \
1183 {                                                             \
1184     uint32_t vm = vext_vm(desc);                              \
1185     uint32_t vl = env->vl;                                    \
1186     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
1187     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1188     uint32_t vma = vext_vma(desc);                            \
1189     uint32_t i;                                               \
1190                                                               \
1191     for (i = env->vstart; i < vl; i++) {                      \
1192         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1193         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1194         if (!vm && !vext_elem_mask(v0, i)) {                  \
1195             /* set masked-off elements to 1s */               \
1196             if (vma) {                                        \
1197                 vext_set_elem_mask(vd, i, 1);                 \
1198             }                                                 \
1199             continue;                                         \
1200         }                                                     \
1201         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1202     }                                                         \
1203     env->vstart = 0;                                          \
1204     /*
1205      * mask destination register is always tail-agnostic
1206      * set tail elements to 1s
1207      */                                                       \
1208     if (vta_all_1s) {                                         \
1209         for (; i < total_elems; i++) {                        \
1210             vext_set_elem_mask(vd, i, 1);                     \
1211         }                                                     \
1212     }                                                         \
1213 }
1214 
1215 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1216 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1217 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1218 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1219 
1220 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1221 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1222 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1223 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1224 
1225 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1226 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1227 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1228 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1229 
1230 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1231 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1232 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1233 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1234 
1235 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1236 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1237 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1238 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1239 
1240 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1241 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1242 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1243 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1244 
1245 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1246 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1247                   CPURISCVState *env, uint32_t desc)                \
1248 {                                                                   \
1249     uint32_t vm = vext_vm(desc);                                    \
1250     uint32_t vl = env->vl;                                          \
1251     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
1252     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1253     uint32_t vma = vext_vma(desc);                                  \
1254     uint32_t i;                                                     \
1255                                                                     \
1256     for (i = env->vstart; i < vl; i++) {                            \
1257         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1258         if (!vm && !vext_elem_mask(v0, i)) {                        \
1259             /* set masked-off elements to 1s */                     \
1260             if (vma) {                                              \
1261                 vext_set_elem_mask(vd, i, 1);                       \
1262             }                                                       \
1263             continue;                                               \
1264         }                                                           \
1265         vext_set_elem_mask(vd, i,                                   \
1266                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1267     }                                                               \
1268     env->vstart = 0;                                                \
1269     /*
1270      * mask destination registers are always tail-agnostic;
1271      * set tail elements to 1s
1272      */                                                             \
1273     if (vta_all_1s) {                                               \
1274         for (; i < total_elems; i++) {                              \
1275             vext_set_elem_mask(vd, i, 1);                           \
1276         }                                                           \
1277     }                                                               \
1278 }
1279 
1280 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1281 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1282 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1283 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1284 
1285 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1286 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1287 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1288 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1289 
1290 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1291 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1292 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1293 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1294 
1295 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1296 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1297 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1298 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1299 
1300 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1301 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1302 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1303 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1304 
1305 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1306 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1307 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1308 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1309 
1310 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1311 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1312 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1313 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1314 
1315 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1316 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1317 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1318 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1319 
1320 /* Vector Integer Min/Max Instructions */
1321 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1322 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1323 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1324 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1325 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1326 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1327 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1328 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1329 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1330 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1331 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1332 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1333 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1334 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1335 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1336 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1337 GEN_VEXT_VV(vminu_vv_b, 1)
1338 GEN_VEXT_VV(vminu_vv_h, 2)
1339 GEN_VEXT_VV(vminu_vv_w, 4)
1340 GEN_VEXT_VV(vminu_vv_d, 8)
1341 GEN_VEXT_VV(vmin_vv_b, 1)
1342 GEN_VEXT_VV(vmin_vv_h, 2)
1343 GEN_VEXT_VV(vmin_vv_w, 4)
1344 GEN_VEXT_VV(vmin_vv_d, 8)
1345 GEN_VEXT_VV(vmaxu_vv_b, 1)
1346 GEN_VEXT_VV(vmaxu_vv_h, 2)
1347 GEN_VEXT_VV(vmaxu_vv_w, 4)
1348 GEN_VEXT_VV(vmaxu_vv_d, 8)
1349 GEN_VEXT_VV(vmax_vv_b, 1)
1350 GEN_VEXT_VV(vmax_vv_h, 2)
1351 GEN_VEXT_VV(vmax_vv_w, 4)
1352 GEN_VEXT_VV(vmax_vv_d, 8)
1353 
1354 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1355 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1356 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1357 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1358 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1359 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1360 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1361 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1362 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1363 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1364 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1365 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1366 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1367 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1368 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1369 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1370 GEN_VEXT_VX(vminu_vx_b, 1)
1371 GEN_VEXT_VX(vminu_vx_h, 2)
1372 GEN_VEXT_VX(vminu_vx_w, 4)
1373 GEN_VEXT_VX(vminu_vx_d, 8)
1374 GEN_VEXT_VX(vmin_vx_b, 1)
1375 GEN_VEXT_VX(vmin_vx_h, 2)
1376 GEN_VEXT_VX(vmin_vx_w, 4)
1377 GEN_VEXT_VX(vmin_vx_d, 8)
1378 GEN_VEXT_VX(vmaxu_vx_b, 1)
1379 GEN_VEXT_VX(vmaxu_vx_h, 2)
1380 GEN_VEXT_VX(vmaxu_vx_w, 4)
1381 GEN_VEXT_VX(vmaxu_vx_d, 8)
1382 GEN_VEXT_VX(vmax_vx_b, 1)
1383 GEN_VEXT_VX(vmax_vx_h, 2)
1384 GEN_VEXT_VX(vmax_vx_w, 4)
1385 GEN_VEXT_VX(vmax_vx_d, 8)
1386 
1387 /* Vector Single-Width Integer Multiply Instructions */
1388 #define DO_MUL(N, M) (N * M)
1389 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1390 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1391 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1392 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1393 GEN_VEXT_VV(vmul_vv_b, 1)
1394 GEN_VEXT_VV(vmul_vv_h, 2)
1395 GEN_VEXT_VV(vmul_vv_w, 4)
1396 GEN_VEXT_VV(vmul_vv_d, 8)
1397 
1398 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1399 {
1400     return (int16_t)s2 * (int16_t)s1 >> 8;
1401 }
1402 
1403 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1404 {
1405     return (int32_t)s2 * (int32_t)s1 >> 16;
1406 }
1407 
1408 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1409 {
1410     return (int64_t)s2 * (int64_t)s1 >> 32;
1411 }
1412 
1413 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1414 {
1415     uint64_t hi_64, lo_64;
1416 
1417     muls64(&lo_64, &hi_64, s1, s2);
1418     return hi_64;
1419 }
1420 
1421 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1422 {
1423     return (uint16_t)s2 * (uint16_t)s1 >> 8;
1424 }
1425 
1426 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1427 {
1428     return (uint32_t)s2 * (uint32_t)s1 >> 16;
1429 }
1430 
1431 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1432 {
1433     return (uint64_t)s2 * (uint64_t)s1 >> 32;
1434 }
1435 
1436 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1437 {
1438     uint64_t hi_64, lo_64;
1439 
1440     mulu64(&lo_64, &hi_64, s2, s1);
1441     return hi_64;
1442 }
1443 
1444 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1445 {
1446     return (int16_t)s2 * (uint16_t)s1 >> 8;
1447 }
1448 
1449 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1450 {
1451     return (int32_t)s2 * (uint32_t)s1 >> 16;
1452 }
1453 
1454 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1455 {
1456     return (int64_t)s2 * (uint64_t)s1 >> 32;
1457 }
1458 
1459 /*
1460  * Let  A = signed operand,
1461  *      B = unsigned operand,
1462  *      P = mulu64(A, B), the full 128-bit unsigned product,
1463  *      SP = the full 128-bit signed product we actually want.
1464  *
1465  * mulu64 interprets the bit pattern of A as the unsigned value
1466  * A + 2 ** 64 when A < 0, so:
1467  *
1468  *      IF A < 0
1469  *          P  = (A + 2 ** 64) * B = A * B + 2 ** 64 * B
1470  *          SP = A * B = P - 2 ** 64 * B
1471  *      ELSE
1472  *          SP = P
1473  *
1474  * Subtracting 2 ** 64 * B only touches the high half, so the fixup is
1475  *      HI_P -= (A < 0 ? B : 0)
1476  */
1477 
1478 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1479 {
1480     uint64_t hi_64, lo_64;
1481 
1482     mulu64(&lo_64, &hi_64, s2, s1);
1483 
1484     hi_64 -= s2 < 0 ? s1 : 0;
1485     return hi_64;
1486 }
1487 
1488 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1489 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1490 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1491 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1492 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1493 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1494 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1495 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1496 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1497 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1498 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1499 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1500 GEN_VEXT_VV(vmulh_vv_b, 1)
1501 GEN_VEXT_VV(vmulh_vv_h, 2)
1502 GEN_VEXT_VV(vmulh_vv_w, 4)
1503 GEN_VEXT_VV(vmulh_vv_d, 8)
1504 GEN_VEXT_VV(vmulhu_vv_b, 1)
1505 GEN_VEXT_VV(vmulhu_vv_h, 2)
1506 GEN_VEXT_VV(vmulhu_vv_w, 4)
1507 GEN_VEXT_VV(vmulhu_vv_d, 8)
1508 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1509 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1510 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1511 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1512 
1513 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1514 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1515 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1516 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1517 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1518 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1519 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1520 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1521 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1522 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1523 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1524 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1525 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1526 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1527 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1528 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1529 GEN_VEXT_VX(vmul_vx_b, 1)
1530 GEN_VEXT_VX(vmul_vx_h, 2)
1531 GEN_VEXT_VX(vmul_vx_w, 4)
1532 GEN_VEXT_VX(vmul_vx_d, 8)
1533 GEN_VEXT_VX(vmulh_vx_b, 1)
1534 GEN_VEXT_VX(vmulh_vx_h, 2)
1535 GEN_VEXT_VX(vmulh_vx_w, 4)
1536 GEN_VEXT_VX(vmulh_vx_d, 8)
1537 GEN_VEXT_VX(vmulhu_vx_b, 1)
1538 GEN_VEXT_VX(vmulhu_vx_h, 2)
1539 GEN_VEXT_VX(vmulhu_vx_w, 4)
1540 GEN_VEXT_VX(vmulhu_vx_d, 8)
1541 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1542 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1543 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1544 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1545 
1546 /* Vector Integer Divide Instructions */
1547 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1548 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1549 #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) : \
1550         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1551 #define DO_REM(N, M)  (unlikely(M == 0) ? N : \
1552         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
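/*
 * For a two's-complement signed N, N == -N holds only for 0 and the type's
 * minimum value, so together with M == -1 it flags the overflowing
 * MIN / -1 case.  The N == 0 case takes the same early result, which is
 * still correct (0 / -1 == 0 and 0 % -1 == 0).
 */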
1553 
1554 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1555 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1556 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1557 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1558 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1559 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1560 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1561 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1562 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1563 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1564 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1565 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1566 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1567 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1568 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1569 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1570 GEN_VEXT_VV(vdivu_vv_b, 1)
1571 GEN_VEXT_VV(vdivu_vv_h, 2)
1572 GEN_VEXT_VV(vdivu_vv_w, 4)
1573 GEN_VEXT_VV(vdivu_vv_d, 8)
1574 GEN_VEXT_VV(vdiv_vv_b, 1)
1575 GEN_VEXT_VV(vdiv_vv_h, 2)
1576 GEN_VEXT_VV(vdiv_vv_w, 4)
1577 GEN_VEXT_VV(vdiv_vv_d, 8)
1578 GEN_VEXT_VV(vremu_vv_b, 1)
1579 GEN_VEXT_VV(vremu_vv_h, 2)
1580 GEN_VEXT_VV(vremu_vv_w, 4)
1581 GEN_VEXT_VV(vremu_vv_d, 8)
1582 GEN_VEXT_VV(vrem_vv_b, 1)
1583 GEN_VEXT_VV(vrem_vv_h, 2)
1584 GEN_VEXT_VV(vrem_vv_w, 4)
1585 GEN_VEXT_VV(vrem_vv_d, 8)
1586 
1587 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1588 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1589 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1590 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1591 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1592 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1593 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1594 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1595 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1596 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1597 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1598 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1599 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1600 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1601 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1602 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1603 GEN_VEXT_VX(vdivu_vx_b, 1)
1604 GEN_VEXT_VX(vdivu_vx_h, 2)
1605 GEN_VEXT_VX(vdivu_vx_w, 4)
1606 GEN_VEXT_VX(vdivu_vx_d, 8)
1607 GEN_VEXT_VX(vdiv_vx_b, 1)
1608 GEN_VEXT_VX(vdiv_vx_h, 2)
1609 GEN_VEXT_VX(vdiv_vx_w, 4)
1610 GEN_VEXT_VX(vdiv_vx_d, 8)
1611 GEN_VEXT_VX(vremu_vx_b, 1)
1612 GEN_VEXT_VX(vremu_vx_h, 2)
1613 GEN_VEXT_VX(vremu_vx_w, 4)
1614 GEN_VEXT_VX(vremu_vx_d, 8)
1615 GEN_VEXT_VX(vrem_vx_b, 1)
1616 GEN_VEXT_VX(vrem_vx_h, 2)
1617 GEN_VEXT_VX(vrem_vx_w, 4)
1618 GEN_VEXT_VX(vrem_vx_d, 8)
1619 
1620 /* Vector Widening Integer Multiply Instructions */
1621 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1622 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1623 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1624 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1625 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1626 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1627 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1628 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1629 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1630 GEN_VEXT_VV(vwmul_vv_b, 2)
1631 GEN_VEXT_VV(vwmul_vv_h, 4)
1632 GEN_VEXT_VV(vwmul_vv_w, 8)
1633 GEN_VEXT_VV(vwmulu_vv_b, 2)
1634 GEN_VEXT_VV(vwmulu_vv_h, 4)
1635 GEN_VEXT_VV(vwmulu_vv_w, 8)
1636 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1637 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1638 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1639 
1640 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1641 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1642 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1643 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1644 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1645 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1646 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1647 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1648 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1649 GEN_VEXT_VX(vwmul_vx_b, 2)
1650 GEN_VEXT_VX(vwmul_vx_h, 4)
1651 GEN_VEXT_VX(vwmul_vx_w, 8)
1652 GEN_VEXT_VX(vwmulu_vx_b, 2)
1653 GEN_VEXT_VX(vwmulu_vx_h, 4)
1654 GEN_VEXT_VX(vwmulu_vx_w, 8)
1655 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1656 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1657 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1658 
1659 /* Vector Single-Width Integer Multiply-Add Instructions */
1660 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
1661 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1662 {                                                                  \
1663     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1664     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1665     TD d = *((TD *)vd + HD(i));                                    \
1666     *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1667 }
1668 
1669 #define DO_MACC(N, M, D) (M * N + D)
1670 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1671 #define DO_MADD(N, M, D) (M * D + N)
1672 #define DO_NMSUB(N, M, D) (-(M * D) + N)
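/*
 * OP is invoked as OP(s2, s1, d), so with these macros:
 *   vmacc/vnmsac:  vd[i] = +/-(vs1[i] * vs2[i]) + vd[i]
 *   vmadd/vnmsub:  vd[i] = +/-(vs1[i] * vd[i])  + vs2[i]
 */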
1673 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1674 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1675 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1676 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1677 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1678 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1679 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1680 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1681 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1682 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1683 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1684 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1685 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1686 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1687 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1688 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1689 GEN_VEXT_VV(vmacc_vv_b, 1)
1690 GEN_VEXT_VV(vmacc_vv_h, 2)
1691 GEN_VEXT_VV(vmacc_vv_w, 4)
1692 GEN_VEXT_VV(vmacc_vv_d, 8)
1693 GEN_VEXT_VV(vnmsac_vv_b, 1)
1694 GEN_VEXT_VV(vnmsac_vv_h, 2)
1695 GEN_VEXT_VV(vnmsac_vv_w, 4)
1696 GEN_VEXT_VV(vnmsac_vv_d, 8)
1697 GEN_VEXT_VV(vmadd_vv_b, 1)
1698 GEN_VEXT_VV(vmadd_vv_h, 2)
1699 GEN_VEXT_VV(vmadd_vv_w, 4)
1700 GEN_VEXT_VV(vmadd_vv_d, 8)
1701 GEN_VEXT_VV(vnmsub_vv_b, 1)
1702 GEN_VEXT_VV(vnmsub_vv_h, 2)
1703 GEN_VEXT_VV(vnmsub_vv_w, 4)
1704 GEN_VEXT_VV(vnmsub_vv_d, 8)
1705 
1706 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1707 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1708 {                                                                   \
1709     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1710     TD d = *((TD *)vd + HD(i));                                     \
1711     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1712 }
1713 
1714 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1715 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1716 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1717 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1718 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1719 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1720 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1721 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1722 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1723 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1724 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1725 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1726 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1727 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1728 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1729 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1730 GEN_VEXT_VX(vmacc_vx_b, 1)
1731 GEN_VEXT_VX(vmacc_vx_h, 2)
1732 GEN_VEXT_VX(vmacc_vx_w, 4)
1733 GEN_VEXT_VX(vmacc_vx_d, 8)
1734 GEN_VEXT_VX(vnmsac_vx_b, 1)
1735 GEN_VEXT_VX(vnmsac_vx_h, 2)
1736 GEN_VEXT_VX(vnmsac_vx_w, 4)
1737 GEN_VEXT_VX(vnmsac_vx_d, 8)
1738 GEN_VEXT_VX(vmadd_vx_b, 1)
1739 GEN_VEXT_VX(vmadd_vx_h, 2)
1740 GEN_VEXT_VX(vmadd_vx_w, 4)
1741 GEN_VEXT_VX(vmadd_vx_d, 8)
1742 GEN_VEXT_VX(vnmsub_vx_b, 1)
1743 GEN_VEXT_VX(vnmsub_vx_h, 2)
1744 GEN_VEXT_VX(vnmsub_vx_w, 4)
1745 GEN_VEXT_VX(vnmsub_vx_d, 8)
1746 
1747 /* Vector Widening Integer Multiply-Add Instructions */
1748 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
1749 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
1750 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
1751 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
1752 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
1753 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
1754 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
1755 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
1756 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
1757 GEN_VEXT_VV(vwmaccu_vv_b, 2)
1758 GEN_VEXT_VV(vwmaccu_vv_h, 4)
1759 GEN_VEXT_VV(vwmaccu_vv_w, 8)
1760 GEN_VEXT_VV(vwmacc_vv_b, 2)
1761 GEN_VEXT_VV(vwmacc_vv_h, 4)
1762 GEN_VEXT_VV(vwmacc_vv_w, 8)
1763 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
1764 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
1765 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
1766 
1767 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
1768 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
1769 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
1770 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
1771 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
1772 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
1773 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
1774 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
1775 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
1776 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
1777 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
1778 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
1779 GEN_VEXT_VX(vwmaccu_vx_b, 2)
1780 GEN_VEXT_VX(vwmaccu_vx_h, 4)
1781 GEN_VEXT_VX(vwmaccu_vx_w, 8)
1782 GEN_VEXT_VX(vwmacc_vx_b, 2)
1783 GEN_VEXT_VX(vwmacc_vx_h, 4)
1784 GEN_VEXT_VX(vwmacc_vx_w, 8)
1785 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
1786 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
1787 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
1788 GEN_VEXT_VX(vwmaccus_vx_b, 2)
1789 GEN_VEXT_VX(vwmaccus_vx_h, 4)
1790 GEN_VEXT_VX(vwmaccus_vx_w, 8)
1791 
1792 /* Vector Integer Merge and Move Instructions */
1793 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
1794 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
1795                   uint32_t desc)                                     \
1796 {                                                                    \
1797     uint32_t vl = env->vl;                                           \
1798     uint32_t esz = sizeof(ETYPE);                                    \
1799     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1800     uint32_t vta = vext_vta(desc);                                   \
1801     uint32_t i;                                                      \
1802                                                                      \
1803     for (i = env->vstart; i < vl; i++) {                             \
1804         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
1805         *((ETYPE *)vd + H(i)) = s1;                                  \
1806     }                                                                \
1807     env->vstart = 0;                                                 \
1808     /* set tail elements to 1s */                                    \
1809     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
1810 }
1811 
1812 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
1813 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
1814 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
1815 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
1816 
1817 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
1818 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
1819                   uint32_t desc)                                     \
1820 {                                                                    \
1821     uint32_t vl = env->vl;                                           \
1822     uint32_t esz = sizeof(ETYPE);                                    \
1823     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1824     uint32_t vta = vext_vta(desc);                                   \
1825     uint32_t i;                                                      \
1826                                                                      \
1827     for (i = env->vstart; i < vl; i++) {                             \
1828         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
1829     }                                                                \
1830     env->vstart = 0;                                                 \
1831     /* set tail elements to 1s */                                    \
1832     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
1833 }
1834 
1835 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
1836 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
1837 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
1838 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
1839 
1840 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
1841 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
1842                   CPURISCVState *env, uint32_t desc)                 \
1843 {                                                                    \
1844     uint32_t vl = env->vl;                                           \
1845     uint32_t esz = sizeof(ETYPE);                                    \
1846     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1847     uint32_t vta = vext_vta(desc);                                   \
1848     uint32_t i;                                                      \
1849                                                                      \
1850     for (i = env->vstart; i < vl; i++) {                             \
1851         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
1852         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
1853     }                                                                \
1854     env->vstart = 0;                                                 \
1855     /* set tail elements to 1s */                                    \
1856     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
1857 }
1858 
1859 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
1860 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
1861 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
1862 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
1863 
1864 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
1865 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
1866                   void *vs2, CPURISCVState *env, uint32_t desc)      \
1867 {                                                                    \
1868     uint32_t vl = env->vl;                                           \
1869     uint32_t esz = sizeof(ETYPE);                                    \
1870     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1871     uint32_t vta = vext_vta(desc);                                   \
1872     uint32_t i;                                                      \
1873                                                                      \
1874     for (i = env->vstart; i < vl; i++) {                             \
1875         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
1876         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
1877                    (ETYPE)(target_long)s1);                          \
1878         *((ETYPE *)vd + H(i)) = d;                                   \
1879     }                                                                \
1880     env->vstart = 0;                                                 \
1881     /* set tail elements to 1s */                                    \
1882     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
1883 }
1884 
1885 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
1886 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
1887 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
1888 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
1889 
1890 /*
1891  * Vector Fixed-Point Arithmetic Instructions
1892  */
1893 
1894 /* Vector Single-Width Saturating Add and Subtract */
1895 
1896 /*
1897  * As fixed-point instructions typically have a rounding mode and
1898  * saturation, define common macros for fixed point here.
1899  */
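/*
 * The rounding mode is taken from env->vxrm (0 = rnu, 1 = rne, 2 = rdn,
 * 3 = rod) by the vext_*_rm_2 dispatch functions below, and the saturating
 * helpers report overflow by setting the sticky env->vxsat flag.
 */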
1900 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
1901                           CPURISCVState *env, int vxrm);
1902 
1903 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
1904 static inline void                                                  \
1905 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
1906           CPURISCVState *env, int vxrm)                             \
1907 {                                                                   \
1908     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
1909     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1910     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
1911 }
1912 
1913 static inline void
1914 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
1915              CPURISCVState *env,
1916              uint32_t vl, uint32_t vm, int vxrm,
1917              opivv2_rm_fn *fn, uint32_t vma, uint32_t esz)
1918 {
1919     for (uint32_t i = env->vstart; i < vl; i++) {
1920         if (!vm && !vext_elem_mask(v0, i)) {
1921             /* set masked-off elements to 1s */
1922             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
1923             continue;
1924         }
1925         fn(vd, vs1, vs2, i, env, vxrm);
1926     }
1927     env->vstart = 0;
1928 }
1929 
1930 static inline void
1931 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
1932              CPURISCVState *env,
1933              uint32_t desc,
1934              opivv2_rm_fn *fn, uint32_t esz)
1935 {
1936     uint32_t vm = vext_vm(desc);
1937     uint32_t vl = env->vl;
1938     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
1939     uint32_t vta = vext_vta(desc);
1940     uint32_t vma = vext_vma(desc);
1941 
1942     switch (env->vxrm) {
1943     case 0: /* rnu */
1944         vext_vv_rm_1(vd, v0, vs1, vs2,
1945                      env, vl, vm, 0, fn, vma, esz);
1946         break;
1947     case 1: /* rne */
1948         vext_vv_rm_1(vd, v0, vs1, vs2,
1949                      env, vl, vm, 1, fn, vma, esz);
1950         break;
1951     case 2: /* rdn */
1952         vext_vv_rm_1(vd, v0, vs1, vs2,
1953                      env, vl, vm, 2, fn, vma, esz);
1954         break;
1955     default: /* rod */
1956         vext_vv_rm_1(vd, v0, vs1, vs2,
1957                      env, vl, vm, 3, fn, vma, esz);
1958         break;
1959     }
1960     /* set tail elements to 1s */
1961     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
1962 }
1963 
1964 /* generate helpers for fixed point instructions with OPIVV format */
1965 #define GEN_VEXT_VV_RM(NAME, ESZ)                               \
1966 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
1967                   CPURISCVState *env, uint32_t desc)            \
1968 {                                                               \
1969     vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
1970                  do_##NAME, ESZ);                               \
1971 }
1972 
1973 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a,
1974                              uint8_t b)
1975 {
1976     uint8_t res = a + b;
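    /* unsigned overflow iff the wrapped sum is smaller than an operand */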
1977     if (res < a) {
1978         res = UINT8_MAX;
1979         env->vxsat = 0x1;
1980     }
1981     return res;
1982 }
1983 
1984 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
1985                                uint16_t b)
1986 {
1987     uint16_t res = a + b;
1988     if (res < a) {
1989         res = UINT16_MAX;
1990         env->vxsat = 0x1;
1991     }
1992     return res;
1993 }
1994 
1995 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
1996                                uint32_t b)
1997 {
1998     uint32_t res = a + b;
1999     if (res < a) {
2000         res = UINT32_MAX;
2001         env->vxsat = 0x1;
2002     }
2003     return res;
2004 }
2005 
2006 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2007                                uint64_t b)
2008 {
2009     uint64_t res = a + b;
2010     if (res < a) {
2011         res = UINT64_MAX;
2012         env->vxsat = 0x1;
2013     }
2014     return res;
2015 }
2016 
2017 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2018 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2019 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2020 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2021 GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2022 GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2023 GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2024 GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2025 
2026 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2027                           CPURISCVState *env, int vxrm);
2028 
2029 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2030 static inline void                                                  \
2031 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2032           CPURISCVState *env, int vxrm)                             \
2033 {                                                                   \
2034     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2035     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2036 }
2037 
2038 static inline void
2039 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2040              CPURISCVState *env,
2041              uint32_t vl, uint32_t vm, int vxrm,
2042              opivx2_rm_fn *fn, uint32_t vma, uint32_t esz)
2043 {
2044     for (uint32_t i = env->vstart; i < vl; i++) {
2045         if (!vm && !vext_elem_mask(v0, i)) {
2046             /* set masked-off elements to 1s */
2047             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2048             continue;
2049         }
2050         fn(vd, s1, vs2, i, env, vxrm);
2051     }
2052     env->vstart = 0;
2053 }
2054 
2055 static inline void
2056 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2057              CPURISCVState *env,
2058              uint32_t desc,
2059              opivx2_rm_fn *fn, uint32_t esz)
2060 {
2061     uint32_t vm = vext_vm(desc);
2062     uint32_t vl = env->vl;
2063     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2064     uint32_t vta = vext_vta(desc);
2065     uint32_t vma = vext_vma(desc);
2066 
2067     switch (env->vxrm) {
2068     case 0: /* rnu */
2069         vext_vx_rm_1(vd, v0, s1, vs2,
2070                      env, vl, vm, 0, fn, vma, esz);
2071         break;
2072     case 1: /* rne */
2073         vext_vx_rm_1(vd, v0, s1, vs2,
2074                      env, vl, vm, 1, fn, vma, esz);
2075         break;
2076     case 2: /* rdn */
2077         vext_vx_rm_1(vd, v0, s1, vs2,
2078                      env, vl, vm, 2, fn, vma, esz);
2079         break;
2080     default: /* rod */
2081         vext_vx_rm_1(vd, v0, s1, vs2,
2082                      env, vl, vm, 3, fn, vma, esz);
2083         break;
2084     }
2085     /* set tail elements to 1s */
2086     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2087 }
2088 
2089 /* generate helpers for fixed point instructions with OPIVX format */
2090 #define GEN_VEXT_VX_RM(NAME, ESZ)                         \
2091 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2092                   void *vs2, CPURISCVState *env,          \
2093                   uint32_t desc)                          \
2094 {                                                         \
2095     vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2096                  do_##NAME, ESZ);                         \
2097 }
2098 
2099 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2100 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2101 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2102 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2103 GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
2104 GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
2105 GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
2106 GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2107 
2108 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2109 {
2110     int8_t res = a + b;
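    /* signed overflow iff a and b have the same sign and res differs */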
2111     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2112         res = a > 0 ? INT8_MAX : INT8_MIN;
2113         env->vxsat = 0x1;
2114     }
2115     return res;
2116 }
2117 
2118 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a,
2119                              int16_t b)
2120 {
2121     int16_t res = a + b;
2122     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2123         res = a > 0 ? INT16_MAX : INT16_MIN;
2124         env->vxsat = 0x1;
2125     }
2126     return res;
2127 }
2128 
2129 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a,
2130                              int32_t b)
2131 {
2132     int32_t res = a + b;
2133     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2134         res = a > 0 ? INT32_MAX : INT32_MIN;
2135         env->vxsat = 0x1;
2136     }
2137     return res;
2138 }
2139 
2140 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a,
2141                              int64_t b)
2142 {
2143     int64_t res = a + b;
2144     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2145         res = a > 0 ? INT64_MAX : INT64_MIN;
2146         env->vxsat = 0x1;
2147     }
2148     return res;
2149 }
2150 
2151 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2152 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2153 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2154 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2155 GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2156 GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2157 GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2158 GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2159 
2160 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2161 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2162 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2163 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2164 GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2165 GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2166 GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2167 GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2168 
2169 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a,
2170                              uint8_t b)
2171 {
2172     uint8_t res = a - b;
2173     if (res > a) {
2174         res = 0;
2175         env->vxsat = 0x1;
2176     }
2177     return res;
2178 }
2179 
2180 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2181                                uint16_t b)
2182 {
2183     uint16_t res = a - b;
2184     if (res > a) {
2185         res = 0;
2186         env->vxsat = 0x1;
2187     }
2188     return res;
2189 }
2190 
2191 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2192                                uint32_t b)
2193 {
2194     uint32_t res = a - b;
2195     if (res > a) {
2196         res = 0;
2197         env->vxsat = 0x1;
2198     }
2199     return res;
2200 }
2201 
2202 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2203                                uint64_t b)
2204 {
2205     uint64_t res = a - b;
2206     if (res > a) {
2207         res = 0;
2208         env->vxsat = 0x1;
2209     }
2210     return res;
2211 }
2212 
2213 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2214 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2215 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2216 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2217 GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2218 GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2219 GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2220 GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2221 
2222 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2223 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2224 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2225 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2226 GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2227 GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2228 GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2229 GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2230 
2231 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2232 {
2233     int8_t res = a - b;
2234     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2235         res = a >= 0 ? INT8_MAX : INT8_MIN;
2236         env->vxsat = 0x1;
2237     }
2238     return res;
2239 }
2240 
2241 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a,
2242                              int16_t b)
2243 {
2244     int16_t res = a - b;
2245     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2246         res = a >= 0 ? INT16_MAX : INT16_MIN;
2247         env->vxsat = 0x1;
2248     }
2249     return res;
2250 }
2251 
2252 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a,
2253                              int32_t b)
2254 {
2255     int32_t res = a - b;
2256     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2257         res = a >= 0 ? INT32_MAX : INT32_MIN;
2258         env->vxsat = 0x1;
2259     }
2260     return res;
2261 }
2262 
2263 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a,
2264                              int64_t b)
2265 {
2266     int64_t res = a - b;
2267     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2268         res = a >= 0 ? INT64_MAX : INT64_MIN;
2269         env->vxsat = 0x1;
2270     }
2271     return res;
2272 }
2273 
2274 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2275 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2276 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2277 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2278 GEN_VEXT_VV_RM(vssub_vv_b, 1)
2279 GEN_VEXT_VV_RM(vssub_vv_h, 2)
2280 GEN_VEXT_VV_RM(vssub_vv_w, 4)
2281 GEN_VEXT_VV_RM(vssub_vv_d, 8)
2282 
2283 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2284 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2285 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2286 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2287 GEN_VEXT_VX_RM(vssub_vx_b, 1)
2288 GEN_VEXT_VX_RM(vssub_vx_h, 2)
2289 GEN_VEXT_VX_RM(vssub_vx_w, 4)
2290 GEN_VEXT_VX_RM(vssub_vx_d, 8)
2291 
2292 /* Vector Single-Width Averaging Add and Subtract */
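/*
 * get_round() returns the rounding increment (0 or 1) to add after
 * shifting v right by 'shift' bits, according to the fixed-point
 * rounding mode vxrm.
 *
 * e.g. rnu, v = 0b1011, shift = 2: the most-significant discarded bit
 * d1 = 1, so the increment is 1 and (v >> 2) + 1 == 3
 * (11 / 4 = 2.75 rounds up to 3).
 */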
2293 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2294 {
2295     uint8_t d = extract64(v, shift, 1);
2296     uint8_t d1;
2297     uint64_t D1, D2;
2298 
2299     if (shift == 0 || shift > 64) {
2300         return 0;
2301     }
2302 
2303     d1 = extract64(v, shift - 1, 1);
2304     D1 = extract64(v, 0, shift);
2305     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2306         return d1;
2307     } else if (vxrm == 1) { /* round-to-nearest-even */
2308         if (shift > 1) {
2309             D2 = extract64(v, 0, shift - 1);
2310             return d1 & ((D2 != 0) | d);
2311         } else {
2312             return d1 & d;
2313         }
2314     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2315         return !d & (D1 != 0);
2316     }
2317     return 0; /* round-down (truncate) */
2318 }
2319 
2320 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a,
2321                              int32_t b)
2322 {
2323     int64_t res = (int64_t)a + b;
2324     uint8_t round = get_round(vxrm, res, 1);
2325 
2326     return (res >> 1) + round;
2327 }
2328 
2329 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a,
2330                              int64_t b)
2331 {
2332     int64_t res = a + b;
2333     uint8_t round = get_round(vxrm, res, 1);
2334     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2335 
2336     /* With signed overflow, bit 64 is inverse of bit 63. */
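    /* 'over' is 0 or INT64_MIN, so the XOR only corrects bit 63. */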
2337     return ((res >> 1) ^ over) + round;
2338 }
2339 
2340 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2341 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2342 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2343 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2344 GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2345 GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2346 GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2347 GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2348 
2349 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2350 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2351 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2352 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2353 GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2354 GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2355 GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2356 GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2357 
2358 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2359                                uint32_t a, uint32_t b)
2360 {
2361     uint64_t res = (uint64_t)a + b;
2362     uint8_t round = get_round(vxrm, res, 1);
2363 
2364     return (res >> 1) + round;
2365 }
2366 
2367 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2368                                uint64_t a, uint64_t b)
2369 {
2370     uint64_t res = a + b;
2371     uint8_t round = get_round(vxrm, res, 1);
2372     uint64_t over = (uint64_t)(res < a) << 63;
2373 
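    /* 'over' is the carry out of the add, i.e. bit 64 of the true sum. */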
2374     return ((res >> 1) | over) + round;
2375 }
2376 
2377 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2378 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2379 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2380 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2381 GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2382 GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2383 GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2384 GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2385 
2386 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2387 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2388 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2389 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2390 GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2391 GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2392 GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2393 GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2394 
2395 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a,
2396                              int32_t b)
2397 {
2398     int64_t res = (int64_t)a - b;
2399     uint8_t round = get_round(vxrm, res, 1);
2400 
2401     return (res >> 1) + round;
2402 }
2403 
2404 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a,
2405                              int64_t b)
2406 {
2407     int64_t res = (int64_t)a - b;
2408     uint8_t round = get_round(vxrm, res, 1);
2409     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2410 
2411     /* With signed overflow, bit 64 is inverse of bit 63. */
2412     return ((res >> 1) ^ over) + round;
2413 }
2414 
2415 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2416 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2417 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2418 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2419 GEN_VEXT_VV_RM(vasub_vv_b, 1)
2420 GEN_VEXT_VV_RM(vasub_vv_h, 2)
2421 GEN_VEXT_VV_RM(vasub_vv_w, 4)
2422 GEN_VEXT_VV_RM(vasub_vv_d, 8)
2423 
2424 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2425 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2426 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2427 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2428 GEN_VEXT_VX_RM(vasub_vx_b, 1)
2429 GEN_VEXT_VX_RM(vasub_vx_h, 2)
2430 GEN_VEXT_VX_RM(vasub_vx_w, 4)
2431 GEN_VEXT_VX_RM(vasub_vx_d, 8)
2432 
2433 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2434                                uint32_t a, uint32_t b)
2435 {
2436     int64_t res = (int64_t)a - b;
2437     uint8_t round = get_round(vxrm, res, 1);
2438 
2439     return (res >> 1) + round;
2440 }
2441 
2442 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2443                                uint64_t a, uint64_t b)
2444 {
2445     uint64_t res = (uint64_t)a - b;
2446     uint8_t round = get_round(vxrm, res, 1);
2447     uint64_t over = (uint64_t)(res > a) << 63;
2448 
2449     return ((res >> 1) | over) + round;
2450 }
2451 
2452 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2453 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2454 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2455 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2456 GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2457 GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2458 GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2459 GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2460 
2461 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2462 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2463 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2464 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2465 GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2466 GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2467 GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2468 GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2469 
2470 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
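/*
 * vsmul computes (a * b) >> (SEW - 1) with rounding per vxrm, i.e. a
 * fractional multiply of two Q(SEW-1) fixed-point values.  The only
 * unrepresentable result is MIN * MIN (+1.0), which saturates to MAX
 * and sets vxsat.
 */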
2471 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2472 {
2473     uint8_t round;
2474     int16_t res;
2475 
2476     res = (int16_t)a * (int16_t)b;
2477     round = get_round(vxrm, res, 7);
2478     res = (res >> 7) + round;
2479 
2480     if (res > INT8_MAX) {
2481         env->vxsat = 0x1;
2482         return INT8_MAX;
2483     } else if (res < INT8_MIN) {
2484         env->vxsat = 0x1;
2485         return INT8_MIN;
2486     } else {
2487         return res;
2488     }
2489 }
2490 
2491 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2492 {
2493     uint8_t round;
2494     int32_t res;
2495 
2496     res = (int32_t)a * (int32_t)b;
2497     round = get_round(vxrm, res, 15);
2498     res = (res >> 15) + round;
2499 
2500     if (res > INT16_MAX) {
2501         env->vxsat = 0x1;
2502         return INT16_MAX;
2503     } else if (res < INT16_MIN) {
2504         env->vxsat = 0x1;
2505         return INT16_MIN;
2506     } else {
2507         return res;
2508     }
2509 }
2510 
2511 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2512 {
2513     uint8_t round;
2514     int64_t res;
2515 
2516     res = (int64_t)a * (int64_t)b;
2517     round = get_round(vxrm, res, 31);
2518     res = (res >> 31) + round;
2519 
2520     if (res > INT32_MAX) {
2521         env->vxsat = 0x1;
2522         return INT32_MAX;
2523     } else if (res < INT32_MIN) {
2524         env->vxsat = 0x1;
2525         return INT32_MIN;
2526     } else {
2527         return res;
2528     }
2529 }
2530 
2531 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2532 {
2533     uint8_t round;
2534     uint64_t hi_64, lo_64;
2535     int64_t res;
2536 
2537     if (a == INT64_MIN && b == INT64_MIN) {
2538         env->vxsat = 1;
2539         return INT64_MAX;
2540     }
2541 
2542     muls64(&lo_64, &hi_64, a, b);
2543     round = get_round(vxrm, lo_64, 63);
2544     /*
2545      * Cannot overflow, as there are always
2546      * 2 sign bits after multiply.
2547      */
2548     res = (hi_64 << 1) | (lo_64 >> 63);
2549     if (round) {
2550         if (res == INT64_MAX) {
2551             env->vxsat = 1;
2552         } else {
2553             res += 1;
2554         }
2555     }
2556     return res;
2557 }
2558 
2559 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2560 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2561 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2562 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2563 GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2564 GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2565 GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2566 GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2567 
2568 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2569 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2570 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2571 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2572 GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2573 GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2574 GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2575 GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2576 
2577 /* Vector Single-Width Scaling Shift Instructions */
2578 static inline uint8_t
2579 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2580 {
2581     uint8_t round, shift = b & 0x7;
2582     uint8_t res;
2583 
2584     round = get_round(vxrm, a, shift);
2585     res = (a >> shift) + round;
2586     return res;
2587 }
2588 static inline uint16_t
2589 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2590 {
2591     uint8_t round, shift = b & 0xf;
2592 
2593     round = get_round(vxrm, a, shift);
2594     return (a >> shift) + round;
2595 }
2596 static inline uint32_t
2597 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2598 {
2599     uint8_t round, shift = b & 0x1f;
2600 
2601     round = get_round(vxrm, a, shift);
2602     return (a >> shift) + round;
2603 }
2604 static inline uint64_t
2605 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2606 {
2607     uint8_t round, shift = b & 0x3f;
2608 
2609     round = get_round(vxrm, a, shift);
2610     return (a >> shift) + round;
2611 }
2612 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2613 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2614 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2615 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2616 GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2617 GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2618 GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2619 GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2620 
2621 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2622 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2623 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2624 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2625 GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2626 GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2627 GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2628 GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2629 
2630 static inline int8_t
2631 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2632 {
2633     uint8_t round, shift = b & 0x7;
2634 
2635     round = get_round(vxrm, a, shift);
2636     return (a >> shift) + round;
2637 }
2638 static inline int16_t
2639 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2640 {
2641     uint8_t round, shift = b & 0xf;
2642 
2643     round = get_round(vxrm, a, shift);
2644     return (a >> shift) + round;
2645 }
2646 static inline int32_t
2647 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2648 {
2649     uint8_t round, shift = b & 0x1f;
2650 
2651     round = get_round(vxrm, a, shift);
2652     return (a >> shift) + round;
2653 }
2654 static inline int64_t
2655 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2656 {
2657     uint8_t round, shift = b & 0x3f;
2658 
2659     round = get_round(vxrm, a, shift);
2660     return (a >> shift) + round;
2661 }
2662 
2663 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2664 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2665 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2666 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2667 GEN_VEXT_VV_RM(vssra_vv_b, 1)
2668 GEN_VEXT_VV_RM(vssra_vv_h, 2)
2669 GEN_VEXT_VV_RM(vssra_vv_w, 4)
2670 GEN_VEXT_VV_RM(vssra_vv_d, 8)
2671 
2672 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2673 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2674 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2675 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2676 GEN_VEXT_VX_RM(vssra_vx_b, 1)
2677 GEN_VEXT_VX_RM(vssra_vx_h, 2)
2678 GEN_VEXT_VX_RM(vssra_vx_w, 4)
2679 GEN_VEXT_VX_RM(vssra_vx_d, 8)
2680 
2681 /* Vector Narrowing Fixed-Point Clip Instructions */
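/*
 * Each clip helper shifts a 2*SEW-wide source right by the shift amount
 * taken from the low bits of b, rounds the shifted-out bits according
 * to vxrm, and saturates the result to the SEW-wide destination range,
 * setting vxsat whenever it clips.
 */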
2682 static inline int8_t
2683 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2684 {
2685     uint8_t round, shift = b & 0xf;
2686     int16_t res;
2687 
2688     round = get_round(vxrm, a, shift);
2689     res = (a >> shift) + round;
2690     if (res > INT8_MAX) {
2691         env->vxsat = 0x1;
2692         return INT8_MAX;
2693     } else if (res < INT8_MIN) {
2694         env->vxsat = 0x1;
2695         return INT8_MIN;
2696     } else {
2697         return res;
2698     }
2699 }
2700 
2701 static inline int16_t
2702 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2703 {
2704     uint8_t round, shift = b & 0x1f;
2705     int32_t res;
2706 
2707     round = get_round(vxrm, a, shift);
2708     res = (a >> shift) + round;
2709     if (res > INT16_MAX) {
2710         env->vxsat = 0x1;
2711         return INT16_MAX;
2712     } else if (res < INT16_MIN) {
2713         env->vxsat = 0x1;
2714         return INT16_MIN;
2715     } else {
2716         return res;
2717     }
2718 }
2719 
2720 static inline int32_t
2721 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2722 {
2723     uint8_t round, shift = b & 0x3f;
2724     int64_t res;
2725 
2726     round = get_round(vxrm, a, shift);
2727     res = (a >> shift) + round;
2728     if (res > INT32_MAX) {
2729         env->vxsat = 0x1;
2730         return INT32_MAX;
2731     } else if (res < INT32_MIN) {
2732         env->vxsat = 0x1;
2733         return INT32_MIN;
2734     } else {
2735         return res;
2736     }
2737 }
2738 
2739 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
2740 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
2741 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
2742 GEN_VEXT_VV_RM(vnclip_wv_b, 1)
2743 GEN_VEXT_VV_RM(vnclip_wv_h, 2)
2744 GEN_VEXT_VV_RM(vnclip_wv_w, 4)
2745 
2746 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
2747 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
2748 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
2749 GEN_VEXT_VX_RM(vnclip_wx_b, 1)
2750 GEN_VEXT_VX_RM(vnclip_wx_h, 2)
2751 GEN_VEXT_VX_RM(vnclip_wx_w, 4)
2752 
2753 static inline uint8_t
2754 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2755 {
2756     uint8_t round, shift = b & 0xf;
2757     uint16_t res;
2758 
2759     round = get_round(vxrm, a, shift);
2760     res = (a >> shift) + round;
2761     if (res > UINT8_MAX) {
2762         env->vxsat = 0x1;
2763         return UINT8_MAX;
2764     } else {
2765         return res;
2766     }
2767 }
2768 
2769 static inline uint16_t
2770 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
2771 {
2772     uint8_t round, shift = b & 0x1f;
2773     uint32_t res;
2774 
2775     round = get_round(vxrm, a, shift);
2776     res = (a >> shift) + round;
2777     if (res > UINT16_MAX) {
2778         env->vxsat = 0x1;
2779         return UINT16_MAX;
2780     } else {
2781         return res;
2782     }
2783 }
2784 
2785 static inline uint32_t
2786 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
2787 {
2788     uint8_t round, shift = b & 0x3f;
2789     uint64_t res;
2790 
2791     round = get_round(vxrm, a, shift);
2792     res = (a >> shift) + round;
2793     if (res > UINT32_MAX) {
2794         env->vxsat = 0x1;
2795         return UINT32_MAX;
2796     } else {
2797         return res;
2798     }
2799 }
2800 
2801 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
2802 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
2803 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
2804 GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
2805 GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
2806 GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
2807 
2808 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
2809 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
2810 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
2811 GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
2812 GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
2813 GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
2814 
2815 /*
2816  * Vector Floating-Point Arithmetic Instructions
2817  */
2818 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
2819 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
2820 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
2821                       CPURISCVState *env)                      \
2822 {                                                              \
2823     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
2824     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
2825     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
2826 }
2827 
2828 #define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
2829 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
2830                   void *vs2, CPURISCVState *env,          \
2831                   uint32_t desc)                          \
2832 {                                                         \
2833     uint32_t vm = vext_vm(desc);                          \
2834     uint32_t vl = env->vl;                                \
2835     uint32_t total_elems =                                \
2836         vext_get_total_elems(env, desc, ESZ);             \
2837     uint32_t vta = vext_vta(desc);                        \
2838     uint32_t vma = vext_vma(desc);                        \
2839     uint32_t i;                                           \
2840                                                           \
2841     for (i = env->vstart; i < vl; i++) {                  \
2842         if (!vm && !vext_elem_mask(v0, i)) {              \
2843             /* set masked-off elements to 1s */           \
2844             vext_set_elems_1s(vd, vma, i * ESZ,           \
2845                               (i + 1) * ESZ);             \
2846             continue;                                     \
2847         }                                                 \
2848         do_##NAME(vd, vs1, vs2, i, env);                  \
2849     }                                                     \
2850     env->vstart = 0;                                      \
2851     /* set tail elements to 1s */                         \
2852     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
2853                       total_elems * ESZ);                 \
2854 }
2855 
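/*
 * Sketch of one expansion: HELPER(vfadd_vv_h) walks elements from
 * vstart to vl, computing float16_add(vs2[i], vs1[i]) with the guest
 * fp_status for active elements; masked-off elements are expected to
 * be overwritten with all-1s only when the mask-agnostic policy (vma)
 * is selected, and the tail up to total_elems is handled the same way
 * under vta.
 */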
2856 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
2857 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
2858 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
2859 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
2860 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
2861 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
2862 
2863 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
2864 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
2865                       CPURISCVState *env)                      \
2866 {                                                              \
2867     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
2868     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
2869 }
2870 
2871 #define GEN_VEXT_VF(NAME, ESZ)                            \
2872 void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
2873                   void *vs2, CPURISCVState *env,          \
2874                   uint32_t desc)                          \
2875 {                                                         \
2876     uint32_t vm = vext_vm(desc);                          \
2877     uint32_t vl = env->vl;                                \
2878     uint32_t total_elems =                                \
2879         vext_get_total_elems(env, desc, ESZ);             \
2880     uint32_t vta = vext_vta(desc);                        \
2881     uint32_t vma = vext_vma(desc);                        \
2882     uint32_t i;                                           \
2883                                                           \
2884     for (i = env->vstart; i < vl; i++) {                  \
2885         if (!vm && !vext_elem_mask(v0, i)) {              \
2886             /* set masked-off elements to 1s */           \
2887             vext_set_elems_1s(vd, vma, i * ESZ,           \
2888                               (i + 1) * ESZ);             \
2889             continue;                                     \
2890         }                                                 \
2891         do_##NAME(vd, s1, vs2, i, env);                   \
2892     }                                                     \
2893     env->vstart = 0;                                      \
2894     /* set tail elements to 1s */                         \
2895     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
2896                       total_elems * ESZ);                 \
2897 }
2898 
2899 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
2900 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
2901 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
2902 GEN_VEXT_VF(vfadd_vf_h, 2)
2903 GEN_VEXT_VF(vfadd_vf_w, 4)
2904 GEN_VEXT_VF(vfadd_vf_d, 8)
2905 
2906 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
2907 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
2908 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
2909 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
2910 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
2911 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
2912 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
2913 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
2914 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
2915 GEN_VEXT_VF(vfsub_vf_h, 2)
2916 GEN_VEXT_VF(vfsub_vf_w, 4)
2917 GEN_VEXT_VF(vfsub_vf_d, 8)
2918 
2919 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
2920 {
2921     return float16_sub(b, a, s);
2922 }
2923 
2924 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
2925 {
2926     return float32_sub(b, a, s);
2927 }
2928 
2929 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
2930 {
2931     return float64_sub(b, a, s);
2932 }
2933 
2934 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
2935 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
2936 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
2937 GEN_VEXT_VF(vfrsub_vf_h, 2)
2938 GEN_VEXT_VF(vfrsub_vf_w, 4)
2939 GEN_VEXT_VF(vfrsub_vf_d, 8)
2940 
2941 /* Vector Widening Floating-Point Add/Subtract Instructions */
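/*
 * The widening helpers first convert each SEW operand up to 2*SEW (the
 * 'true' argument to float16_to_float32 selects IEEE-format half
 * precision); those conversions are exact, so the operation rounds
 * only once, at the wider precision.
 */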
2942 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
2943 {
2944     return float32_add(float16_to_float32(a, true, s),
2945                        float16_to_float32(b, true, s), s);
2946 }
2947 
2948 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
2949 {
2950     return float64_add(float32_to_float64(a, s),
2951                        float32_to_float64(b, s), s);
2952 
2953 }
2954 
2955 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
2956 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
2957 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
2958 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
2959 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
2960 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
2961 GEN_VEXT_VF(vfwadd_vf_h, 4)
2962 GEN_VEXT_VF(vfwadd_vf_w, 8)
2963 
2964 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
2965 {
2966     return float32_sub(float16_to_float32(a, true, s),
2967                        float16_to_float32(b, true, s), s);
2968 }
2969 
2970 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
2971 {
2972     return float64_sub(float32_to_float64(a, s),
2973                        float32_to_float64(b, s), s);
2974 
2975 }
2976 
2977 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
2978 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
2979 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
2980 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
2981 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
2982 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
2983 GEN_VEXT_VF(vfwsub_vf_h, 4)
2984 GEN_VEXT_VF(vfwsub_vf_w, 8)
2985 
2986 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
2987 {
2988     return float32_add(a, float16_to_float32(b, true, s), s);
2989 }
2990 
2991 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
2992 {
2993     return float64_add(a, float32_to_float64(b, s), s);
2994 }
2995 
2996 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
2997 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
2998 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
2999 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3000 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3001 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3002 GEN_VEXT_VF(vfwadd_wf_h, 4)
3003 GEN_VEXT_VF(vfwadd_wf_w, 8)
3004 
3005 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3006 {
3007     return float32_sub(a, float16_to_float32(b, true, s), s);
3008 }
3009 
3010 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3011 {
3012     return float64_sub(a, float32_to_float64(b, s), s);
3013 }
3014 
3015 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3016 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3017 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3018 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3019 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3020 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3021 GEN_VEXT_VF(vfwsub_wf_h, 4)
3022 GEN_VEXT_VF(vfwsub_wf_w, 8)
3023 
3024 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3025 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3026 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3027 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3028 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3029 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3030 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3031 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3032 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3033 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3034 GEN_VEXT_VF(vfmul_vf_h, 2)
3035 GEN_VEXT_VF(vfmul_vf_w, 4)
3036 GEN_VEXT_VF(vfmul_vf_d, 8)
3037 
3038 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3039 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3040 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3041 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3042 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3043 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3044 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3045 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3046 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3047 GEN_VEXT_VF(vfdiv_vf_h, 2)
3048 GEN_VEXT_VF(vfdiv_vf_w, 4)
3049 GEN_VEXT_VF(vfdiv_vf_d, 8)
3050 
3051 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3052 {
3053     return float16_div(b, a, s);
3054 }
3055 
3056 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3057 {
3058     return float32_div(b, a, s);
3059 }
3060 
3061 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3062 {
3063     return float64_div(b, a, s);
3064 }
3065 
3066 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3067 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3068 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3069 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3070 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3071 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3072 
3073 /* Vector Widening Floating-Point Multiply */
3074 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3075 {
3076     return float32_mul(float16_to_float32(a, true, s),
3077                        float16_to_float32(b, true, s), s);
3078 }
3079 
3080 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3081 {
3082     return float64_mul(float32_to_float64(a, s),
3083                        float32_to_float64(b, s), s);
3084 
3085 }
3086 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3087 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3088 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3089 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3090 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3091 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3092 GEN_VEXT_VF(vfwmul_vf_h, 4)
3093 GEN_VEXT_VF(vfwmul_vf_w, 8)
3094 
3095 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
3096 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3097 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3098                       CPURISCVState *env)                          \
3099 {                                                                  \
3100     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3101     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3102     TD d = *((TD *)vd + HD(i));                                    \
3103     *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3104 }
3105 
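/*
 * Note the operand order: OPFVV3 invokes OP(s2, s1, d), so with
 * muladd(a, b, d) = a * b + d the vfmacc group accumulates into the
 * destination, vd[i] = vs1[i] * vs2[i] + vd[i], while the vfmadd group
 * further below multiplies by the destination instead,
 * vd[i] = vd[i] * vs1[i] + vs2[i].
 */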
3106 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3107 {
3108     return float16_muladd(a, b, d, 0, s);
3109 }
3110 
3111 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3112 {
3113     return float32_muladd(a, b, d, 0, s);
3114 }
3115 
3116 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3117 {
3118     return float64_muladd(a, b, d, 0, s);
3119 }
3120 
3121 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3122 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3123 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3124 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3125 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3126 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3127 
3128 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3129 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3130                       CPURISCVState *env)                         \
3131 {                                                                 \
3132     TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3133     TD d = *((TD *)vd + HD(i));                                   \
3134     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3135 }
3136 
3137 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3138 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3139 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3140 GEN_VEXT_VF(vfmacc_vf_h, 2)
3141 GEN_VEXT_VF(vfmacc_vf_w, 4)
3142 GEN_VEXT_VF(vfmacc_vf_d, 8)
3143 
3144 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3145 {
3146     return float16_muladd(a, b, d, float_muladd_negate_c |
3147                                    float_muladd_negate_product, s);
3148 }
3149 
3150 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3151 {
3152     return float32_muladd(a, b, d, float_muladd_negate_c |
3153                                    float_muladd_negate_product, s);
3154 }
3155 
3156 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3157 {
3158     return float64_muladd(a, b, d, float_muladd_negate_c |
3159                                    float_muladd_negate_product, s);
3160 }
3161 
3162 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3163 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3164 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3165 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3166 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3167 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3168 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3169 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3170 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3171 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3172 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3173 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3174 
3175 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3176 {
3177     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3178 }
3179 
3180 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3181 {
3182     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3183 }
3184 
3185 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3186 {
3187     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3188 }
3189 
3190 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3191 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3192 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3193 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3194 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3195 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3196 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3197 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3198 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3199 GEN_VEXT_VF(vfmsac_vf_h, 2)
3200 GEN_VEXT_VF(vfmsac_vf_w, 4)
3201 GEN_VEXT_VF(vfmsac_vf_d, 8)
3202 
3203 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3204 {
3205     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3206 }
3207 
3208 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3209 {
3210     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3211 }
3212 
3213 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3214 {
3215     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3216 }
3217 
3218 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3219 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3220 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3221 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3222 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3223 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3224 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3225 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3226 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3227 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3228 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3229 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3230 
3231 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3232 {
3233     return float16_muladd(d, b, a, 0, s);
3234 }
3235 
3236 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3237 {
3238     return float32_muladd(d, b, a, 0, s);
3239 }
3240 
3241 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3242 {
3243     return float64_muladd(d, b, a, 0, s);
3244 }
3245 
3246 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3247 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3248 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3249 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3250 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3251 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3252 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3253 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3254 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3255 GEN_VEXT_VF(vfmadd_vf_h, 2)
3256 GEN_VEXT_VF(vfmadd_vf_w, 4)
3257 GEN_VEXT_VF(vfmadd_vf_d, 8)
3258 
3259 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3260 {
3261     return float16_muladd(d, b, a, float_muladd_negate_c |
3262                                    float_muladd_negate_product, s);
3263 }
3264 
3265 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3266 {
3267     return float32_muladd(d, b, a, float_muladd_negate_c |
3268                                    float_muladd_negate_product, s);
3269 }
3270 
3271 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3272 {
3273     return float64_muladd(d, b, a, float_muladd_negate_c |
3274                                    float_muladd_negate_product, s);
3275 }
3276 
3277 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3278 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3279 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3280 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3281 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3282 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3283 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3284 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3285 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3286 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3287 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3288 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3289 
3290 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3291 {
3292     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3293 }
3294 
3295 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3296 {
3297     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3298 }
3299 
3300 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3301 {
3302     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3303 }
3304 
3305 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3306 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3307 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3308 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3309 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3310 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3311 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3312 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3313 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3314 GEN_VEXT_VF(vfmsub_vf_h, 2)
3315 GEN_VEXT_VF(vfmsub_vf_w, 4)
3316 GEN_VEXT_VF(vfmsub_vf_d, 8)
3317 
3318 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3319 {
3320     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3321 }
3322 
3323 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3324 {
3325     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3326 }
3327 
3328 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3329 {
3330     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3331 }
3332 
3333 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3334 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3335 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3336 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3337 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3338 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3339 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3340 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3341 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3342 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3343 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3344 GEN_VEXT_VF(vfnmsub_vf_d, 8)
3345 
3346 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
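/*
 * As with the widening add/sub helpers, the operands are converted up
 * to 2*SEW exactly and a single fused muladd is issued at the wider
 * precision, so the whole widening FMA rounds only once.
 */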
3347 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3348 {
3349     return float32_muladd(float16_to_float32(a, true, s),
3350                           float16_to_float32(b, true, s), d, 0, s);
3351 }
3352 
3353 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3354 {
3355     return float64_muladd(float32_to_float64(a, s),
3356                           float32_to_float64(b, s), d, 0, s);
3357 }
3358 
3359 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3360 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3361 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3362 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3363 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3364 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3365 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3366 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3367 
3368 static uint32_t fwmaccbf16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3369 {
3370     return float32_muladd(bfloat16_to_float32(a, s),
3371                           bfloat16_to_float32(b, s), d, 0, s);
3372 }
3373 
3374 RVVCALL(OPFVV3, vfwmaccbf16_vv, WOP_UUU_H, H4, H2, H2, fwmaccbf16)
3375 GEN_VEXT_VV_ENV(vfwmaccbf16_vv, 4)
3376 RVVCALL(OPFVF3, vfwmaccbf16_vf, WOP_UUU_H, H4, H2, fwmaccbf16)
3377 GEN_VEXT_VF(vfwmaccbf16_vf, 4)
3378 
3379 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3380 {
3381     return float32_muladd(float16_to_float32(a, true, s),
3382                           float16_to_float32(b, true, s), d,
3383                           float_muladd_negate_c | float_muladd_negate_product,
3384                           s);
3385 }
3386 
3387 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3388 {
3389     return float64_muladd(float32_to_float64(a, s), float32_to_float64(b, s),
3390                           d, float_muladd_negate_c |
3391                              float_muladd_negate_product, s);
3392 }
3393 
3394 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3395 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3396 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3397 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3398 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3399 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3400 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3401 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3402 
3403 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3404 {
3405     return float32_muladd(float16_to_float32(a, true, s),
3406                           float16_to_float32(b, true, s), d,
3407                           float_muladd_negate_c, s);
3408 }
3409 
3410 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3411 {
3412     return float64_muladd(float32_to_float64(a, s),
3413                           float32_to_float64(b, s), d,
3414                           float_muladd_negate_c, s);
3415 }
3416 
3417 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3418 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3419 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3420 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3421 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3422 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3423 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3424 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3425 
3426 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3427 {
3428     return float32_muladd(float16_to_float32(a, true, s),
3429                           float16_to_float32(b, true, s), d,
3430                           float_muladd_negate_product, s);
3431 }
3432 
3433 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3434 {
3435     return float64_muladd(float32_to_float64(a, s),
3436                           float32_to_float64(b, s), d,
3437                           float_muladd_negate_product, s);
3438 }
3439 
3440 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3441 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3442 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3443 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3444 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3445 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3446 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3447 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3448 
3449 /* Vector Floating-Point Square-Root Instruction */
3450 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
3451 static void do_##NAME(void *vd, void *vs2, int i,      \
3452                       CPURISCVState *env)              \
3453 {                                                      \
3454     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3455     *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3456 }
3457 
3458 #define GEN_VEXT_V_ENV(NAME, ESZ)                      \
3459 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3460                   CPURISCVState *env, uint32_t desc)   \
3461 {                                                      \
3462     uint32_t vm = vext_vm(desc);                       \
3463     uint32_t vl = env->vl;                             \
3464     uint32_t total_elems =                             \
3465         vext_get_total_elems(env, desc, ESZ);          \
3466     uint32_t vta = vext_vta(desc);                     \
3467     uint32_t vma = vext_vma(desc);                     \
3468     uint32_t i;                                        \
3469                                                        \
3470     if (vl == 0) {                                     \
3471         return;                                        \
3472     }                                                  \
3473     for (i = env->vstart; i < vl; i++) {               \
3474         if (!vm && !vext_elem_mask(v0, i)) {           \
3475             /* set masked-off elements to 1s */        \
3476             vext_set_elems_1s(vd, vma, i * ESZ,        \
3477                               (i + 1) * ESZ);          \
3478             continue;                                  \
3479         }                                              \
3480         do_##NAME(vd, vs2, i, env);                    \
3481     }                                                  \
3482     env->vstart = 0;                                   \
3483     vext_set_elems_1s(vd, vta, vl * ESZ,               \
3484                       total_elems * ESZ);              \
3485 }
3486 
3487 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3488 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3489 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3490 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3491 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3492 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3493 
3494 /*
3495  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3496  *
3497  * Adapted from riscv-v-spec recip.c:
3498  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3499  */
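/*
 * Roughly: the 7-bit table index is the exponent LSB concatenated with
 * the top six fraction bits, the table supplies a 7-bit significand
 * estimate, and the result exponent is (3 * bias - 1 - exp) / 2 (for
 * float32, bias = 127, i.e. (380 - exp) / 2).
 */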
3500 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3501 {
3502     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3503     uint64_t exp = extract64(f, frac_size, exp_size);
3504     uint64_t frac = extract64(f, 0, frac_size);
3505 
3506     const uint8_t lookup_table[] = {
3507         52, 51, 50, 48, 47, 46, 44, 43,
3508         42, 41, 40, 39, 38, 36, 35, 34,
3509         33, 32, 31, 30, 30, 29, 28, 27,
3510         26, 25, 24, 23, 23, 22, 21, 20,
3511         19, 19, 18, 17, 16, 16, 15, 14,
3512         14, 13, 12, 12, 11, 10, 10, 9,
3513         9, 8, 7, 7, 6, 6, 5, 4,
3514         4, 3, 3, 2, 2, 1, 1, 0,
3515         127, 125, 123, 121, 119, 118, 116, 114,
3516         113, 111, 109, 108, 106, 105, 103, 102,
3517         100, 99, 97, 96, 95, 93, 92, 91,
3518         90, 88, 87, 86, 85, 84, 83, 82,
3519         80, 79, 78, 77, 76, 75, 74, 73,
3520         72, 71, 70, 70, 69, 68, 67, 66,
3521         65, 64, 63, 63, 62, 61, 60, 59,
3522         59, 58, 57, 56, 56, 55, 54, 53
3523     };
3524     const int precision = 7;
3525 
3526     if (exp == 0 && frac != 0) { /* subnormal */
3527         /* Normalize the subnormal. */
3528         while (extract64(frac, frac_size - 1, 1) == 0) {
3529             exp--;
3530             frac <<= 1;
3531         }
3532 
3533         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3534     }
3535 
3536     int idx = ((exp & 1) << (precision - 1)) |
3537               (frac >> (frac_size - precision + 1));
3538     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3539                         (frac_size - precision);
3540     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3541 
3542     uint64_t val = 0;
3543     val = deposit64(val, 0, frac_size, out_frac);
3544     val = deposit64(val, frac_size, exp_size, out_exp);
3545     val = deposit64(val, frac_size + exp_size, 1, sign);
3546     return val;
3547 }
3548 
3549 static float16 frsqrt7_h(float16 f, float_status *s)
3550 {
3551     int exp_size = 5, frac_size = 10;
3552     bool sign = float16_is_neg(f);
3553 
3554     /*
3555      * frsqrt7(sNaN) = canonical NaN
3556      * frsqrt7(-inf) = canonical NaN
3557      * frsqrt7(-normal) = canonical NaN
3558      * frsqrt7(-subnormal) = canonical NaN
3559      */
3560     if (float16_is_signaling_nan(f, s) ||
3561         (float16_is_infinity(f) && sign) ||
3562         (float16_is_normal(f) && sign) ||
3563         (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3564         s->float_exception_flags |= float_flag_invalid;
3565         return float16_default_nan(s);
3566     }
3567 
3568     /* frsqrt7(qNaN) = canonical NaN */
3569     if (float16_is_quiet_nan(f, s)) {
3570         return float16_default_nan(s);
3571     }
3572 
3573     /* frsqrt7(+-0) = +-inf */
3574     if (float16_is_zero(f)) {
3575         s->float_exception_flags |= float_flag_divbyzero;
3576         return float16_set_sign(float16_infinity, sign);
3577     }
3578 
3579     /* frsqrt7(+inf) = +0 */
3580     if (float16_is_infinity(f) && !sign) {
3581         return float16_set_sign(float16_zero, sign);
3582     }
3583 
3584     /* +normal, +subnormal */
3585     uint64_t val = frsqrt7(f, exp_size, frac_size);
3586     return make_float16(val);
3587 }
3588 
3589 static float32 frsqrt7_s(float32 f, float_status *s)
3590 {
3591     int exp_size = 8, frac_size = 23;
3592     bool sign = float32_is_neg(f);
3593 
3594     /*
3595      * frsqrt7(sNaN) = canonical NaN
3596      * frsqrt7(-inf) = canonical NaN
3597      * frsqrt7(-normal) = canonical NaN
3598      * frsqrt7(-subnormal) = canonical NaN
3599      */
3600     if (float32_is_signaling_nan(f, s) ||
3601         (float32_is_infinity(f) && sign) ||
3602         (float32_is_normal(f) && sign) ||
3603         (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3604         s->float_exception_flags |= float_flag_invalid;
3605         return float32_default_nan(s);
3606     }
3607 
3608     /* frsqrt7(qNaN) = canonical NaN */
3609     if (float32_is_quiet_nan(f, s)) {
3610         return float32_default_nan(s);
3611     }
3612 
3613     /* frsqrt7(+-0) = +-inf */
3614     if (float32_is_zero(f)) {
3615         s->float_exception_flags |= float_flag_divbyzero;
3616         return float32_set_sign(float32_infinity, sign);
3617     }
3618 
3619     /* frsqrt7(+inf) = +0 */
3620     if (float32_is_infinity(f) && !sign) {
3621         return float32_set_sign(float32_zero, sign);
3622     }
3623 
3624     /* +normal, +subnormal */
3625     uint64_t val = frsqrt7(f, exp_size, frac_size);
3626     return make_float32(val);
3627 }
3628 
3629 static float64 frsqrt7_d(float64 f, float_status *s)
3630 {
3631     int exp_size = 11, frac_size = 52;
3632     bool sign = float64_is_neg(f);
3633 
3634     /*
3635      * frsqrt7(sNaN) = canonical NaN
3636      * frsqrt7(-inf) = canonical NaN
3637      * frsqrt7(-normal) = canonical NaN
3638      * frsqrt7(-subnormal) = canonical NaN
3639      */
3640     if (float64_is_signaling_nan(f, s) ||
3641         (float64_is_infinity(f) && sign) ||
3642         (float64_is_normal(f) && sign) ||
3643         (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3644         s->float_exception_flags |= float_flag_invalid;
3645         return float64_default_nan(s);
3646     }
3647 
3648     /* frsqrt7(qNaN) = canonical NaN */
3649     if (float64_is_quiet_nan(f, s)) {
3650         return float64_default_nan(s);
3651     }
3652 
3653     /* frsqrt7(+-0) = +-inf */
3654     if (float64_is_zero(f)) {
3655         s->float_exception_flags |= float_flag_divbyzero;
3656         return float64_set_sign(float64_infinity, sign);
3657     }
3658 
3659     /* frsqrt7(+inf) = +0 */
3660     if (float64_is_infinity(f) && !sign) {
3661         return float64_set_sign(float64_zero, sign);
3662     }
3663 
3664     /* +normal, +subnormal */
3665     uint64_t val = frsqrt7(f, exp_size, frac_size);
3666     return make_float64(val);
3667 }
3668 
3669 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3670 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3671 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3672 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3673 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3674 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3675 
3676 /*
3677  * Vector Floating-Point Reciprocal Estimate Instruction
3678  *
3679  * Adapted from riscv-v-spec recip.c:
3680  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3681  */
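/*
 * Roughly: the 7-bit table index is the top seven fraction bits and the
 * result exponent is 2 * bias - 1 - exp.  Subnormal inputs whose
 * reciprocal would overflow return +-inf or the largest finite value of
 * that sign depending on the rounding mode, and a result exponent of
 * 0 or -1 is denormalized by shifting the fraction right.
 */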
3682 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3683                       float_status *s)
3684 {
3685     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3686     uint64_t exp = extract64(f, frac_size, exp_size);
3687     uint64_t frac = extract64(f, 0, frac_size);
3688 
3689     const uint8_t lookup_table[] = {
3690         127, 125, 123, 121, 119, 117, 116, 114,
3691         112, 110, 109, 107, 105, 104, 102, 100,
3692         99, 97, 96, 94, 93, 91, 90, 88,
3693         87, 85, 84, 83, 81, 80, 79, 77,
3694         76, 75, 74, 72, 71, 70, 69, 68,
3695         66, 65, 64, 63, 62, 61, 60, 59,
3696         58, 57, 56, 55, 54, 53, 52, 51,
3697         50, 49, 48, 47, 46, 45, 44, 43,
3698         42, 41, 40, 40, 39, 38, 37, 36,
3699         35, 35, 34, 33, 32, 31, 31, 30,
3700         29, 28, 28, 27, 26, 25, 25, 24,
3701         23, 23, 22, 21, 21, 20, 19, 19,
3702         18, 17, 17, 16, 15, 15, 14, 14,
3703         13, 12, 12, 11, 11, 10, 9, 9,
3704         8, 8, 7, 7, 6, 5, 5, 4,
3705         4, 3, 3, 2, 2, 1, 1, 0
3706     };
3707     const int precision = 7;
3708 
3709     if (exp == 0 && frac != 0) { /* subnormal */
3710         /* Normalize the subnormal. */
3711         while (extract64(frac, frac_size - 1, 1) == 0) {
3712             exp--;
3713             frac <<= 1;
3714         }
3715 
3716         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3717 
3718         if (exp != 0 && exp != UINT64_MAX) {
3719             /*
3720              * Overflow to inf or max value of same sign,
3721              * depending on sign and rounding mode.
3722              */
3723             s->float_exception_flags |= (float_flag_inexact |
3724                                          float_flag_overflow);
3725 
3726             if ((s->float_rounding_mode == float_round_to_zero) ||
3727                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3728                 ((s->float_rounding_mode == float_round_up) && sign)) {
3729                 /* Return the largest finite value of the same sign. */
3730                 return (sign << (exp_size + frac_size)) |
3731                        (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3732             } else {
3733                 /* Return +-inf. */
3734                 return (sign << (exp_size + frac_size)) |
3735                        MAKE_64BIT_MASK(frac_size, exp_size);
3736             }
3737         }
3738     }
3739 
3740     int idx = frac >> (frac_size - precision);
3741     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3742                         (frac_size - precision);
3743     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3744 
3745     if (out_exp == 0 || out_exp == UINT64_MAX) {
3746         /*
3747          * The result is subnormal, but don't raise the underflow exception,
3748          * because there's no additional loss of precision.
3749          */
3750         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3751         if (out_exp == UINT64_MAX) {
3752             out_frac >>= 1;
3753             out_exp = 0;
3754         }
3755     }
3756 
3757     uint64_t val = 0;
3758     val = deposit64(val, 0, frac_size, out_frac);
3759     val = deposit64(val, frac_size, exp_size, out_exp);
3760     val = deposit64(val, frac_size + exp_size, 1, sign);
3761     return val;
3762 }
3763 
3764 static float16 frec7_h(float16 f, float_status *s)
3765 {
3766     int exp_size = 5, frac_size = 10;
3767     bool sign = float16_is_neg(f);
3768 
3769     /* frec7(+-inf) = +-0 */
3770     if (float16_is_infinity(f)) {
3771         return float16_set_sign(float16_zero, sign);
3772     }
3773 
3774     /* frec7(+-0) = +-inf */
3775     if (float16_is_zero(f)) {
3776         s->float_exception_flags |= float_flag_divbyzero;
3777         return float16_set_sign(float16_infinity, sign);
3778     }
3779 
3780     /* frec7(sNaN) = canonical NaN */
3781     if (float16_is_signaling_nan(f, s)) {
3782         s->float_exception_flags |= float_flag_invalid;
3783         return float16_default_nan(s);
3784     }
3785 
3786     /* frec7(qNaN) = canonical NaN */
3787     if (float16_is_quiet_nan(f, s)) {
3788         return float16_default_nan(s);
3789     }
3790 
3791     /* +-normal, +-subnormal */
3792     uint64_t val = frec7(f, exp_size, frac_size, s);
3793     return make_float16(val);
3794 }
3795 
3796 static float32 frec7_s(float32 f, float_status *s)
3797 {
3798     int exp_size = 8, frac_size = 23;
3799     bool sign = float32_is_neg(f);
3800 
3801     /* frec7(+-inf) = +-0 */
3802     if (float32_is_infinity(f)) {
3803         return float32_set_sign(float32_zero, sign);
3804     }
3805 
3806     /* frec7(+-0) = +-inf */
3807     if (float32_is_zero(f)) {
3808         s->float_exception_flags |= float_flag_divbyzero;
3809         return float32_set_sign(float32_infinity, sign);
3810     }
3811 
3812     /* frec7(sNaN) = canonical NaN */
3813     if (float32_is_signaling_nan(f, s)) {
3814         s->float_exception_flags |= float_flag_invalid;
3815         return float32_default_nan(s);
3816     }
3817 
3818     /* frec7(qNaN) = canonical NaN */
3819     if (float32_is_quiet_nan(f, s)) {
3820         return float32_default_nan(s);
3821     }
3822 
3823     /* +-normal, +-subnormal */
3824     uint64_t val = frec7(f, exp_size, frac_size, s);
3825     return make_float32(val);
3826 }
3827 
3828 static float64 frec7_d(float64 f, float_status *s)
3829 {
3830     int exp_size = 11, frac_size = 52;
3831     bool sign = float64_is_neg(f);
3832 
3833     /* frec7(+-inf) = +-0 */
3834     if (float64_is_infinity(f)) {
3835         return float64_set_sign(float64_zero, sign);
3836     }
3837 
3838     /* frec7(+-0) = +-inf */
3839     if (float64_is_zero(f)) {
3840         s->float_exception_flags |= float_flag_divbyzero;
3841         return float64_set_sign(float64_infinity, sign);
3842     }
3843 
3844     /* frec7(sNaN) = canonical NaN */
3845     if (float64_is_signaling_nan(f, s)) {
3846         s->float_exception_flags |= float_flag_invalid;
3847         return float64_default_nan(s);
3848     }
3849 
3850     /* frec7(qNaN) = canonical NaN */
3851     if (float64_is_quiet_nan(f, s)) {
3852         return float64_default_nan(s);
3853     }
3854 
3855     /* +-normal, +-subnormal */
3856     uint64_t val = frec7(f, exp_size, frac_size, s);
3857     return make_float64(val);
3858 }
3859 
3860 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
3861 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
3862 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
3863 GEN_VEXT_V_ENV(vfrec7_v_h, 2)
3864 GEN_VEXT_V_ENV(vfrec7_v_w, 4)
3865 GEN_VEXT_V_ENV(vfrec7_v_d, 8)
3866 
3867 /* Vector Floating-Point MIN/MAX Instructions */
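/*
 * The minimum_number/maximum_number softfloat ops are assumed to follow
 * IEEE 754-2019 minimumNumber/maximumNumber, as vfmin/vfmax require:
 * when exactly one operand is a NaN, the other (numerical) operand is
 * returned.
 */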
3868 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
3869 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
3870 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
3871 GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
3872 GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
3873 GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
3874 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
3875 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
3876 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
3877 GEN_VEXT_VF(vfmin_vf_h, 2)
3878 GEN_VEXT_VF(vfmin_vf_w, 4)
3879 GEN_VEXT_VF(vfmin_vf_d, 8)
3880 
3881 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
3882 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
3883 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
3884 GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
3885 GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
3886 GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
3887 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
3888 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
3889 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
3890 GEN_VEXT_VF(vfmax_vf_h, 2)
3891 GEN_VEXT_VF(vfmax_vf_w, 4)
3892 GEN_VEXT_VF(vfmax_vf_d, 8)
3893 
3894 /* Vector Floating-Point Sign-Injection Instructions */
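/*
 * deposit64(x, 0, SEW - 1, a) keeps the magnitude bits of a and takes
 * the sign bit from x: x = b for vfsgnj, x = ~b for vfsgnjn (inverted
 * sign) and x = b ^ a for vfsgnjx (XOR of the two sign bits).
 */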
3895 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
3896 {
3897     return deposit64(b, 0, 15, a);
3898 }
3899 
3900 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
3901 {
3902     return deposit64(b, 0, 31, a);
3903 }
3904 
3905 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
3906 {
3907     return deposit64(b, 0, 63, a);
3908 }
3909 
3910 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
3911 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
3912 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
3913 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
3914 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
3915 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
3916 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
3917 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
3918 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
3919 GEN_VEXT_VF(vfsgnj_vf_h, 2)
3920 GEN_VEXT_VF(vfsgnj_vf_w, 4)
3921 GEN_VEXT_VF(vfsgnj_vf_d, 8)
3922 
3923 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
3924 {
3925     return deposit64(~b, 0, 15, a);
3926 }
3927 
3928 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
3929 {
3930     return deposit64(~b, 0, 31, a);
3931 }
3932 
3933 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
3934 {
3935     return deposit64(~b, 0, 63, a);
3936 }
3937 
3938 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
3939 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
3940 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
3941 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
3942 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
3943 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
3944 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
3945 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
3946 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
3947 GEN_VEXT_VF(vfsgnjn_vf_h, 2)
3948 GEN_VEXT_VF(vfsgnjn_vf_w, 4)
3949 GEN_VEXT_VF(vfsgnjn_vf_d, 8)
3950 
3951 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
3952 {
3953     return deposit64(b ^ a, 0, 15, a);
3954 }
3955 
3956 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
3957 {
3958     return deposit64(b ^ a, 0, 31, a);
3959 }
3960 
3961 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
3962 {
3963     return deposit64(b ^ a, 0, 63, a);
3964 }
3965 
3966 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
3967 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
3968 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
3969 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
3970 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
3971 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
3972 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
3973 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
3974 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
3975 GEN_VEXT_VF(vfsgnjx_vf_h, 2)
3976 GEN_VEXT_VF(vfsgnjx_vf_w, 4)
3977 GEN_VEXT_VF(vfsgnjx_vf_d, 8)
3978 
3979 /* Vector Floating-Point Compare Instructions */
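/*
 * The compare helpers produce a mask register, writing one bit per
 * element with vext_set_elem_mask.  vmfeq/vmfne use the quiet compare
 * predicates, while the ordered compares such as vmflt use the
 * signalling variants, which also raise the invalid flag for quiet NaN
 * operands, as the RVV spec requires.
 */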
3980 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
3981 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
3982                   CPURISCVState *env, uint32_t desc)          \
3983 {                                                             \
3984     uint32_t vm = vext_vm(desc);                              \
3985     uint32_t vl = env->vl;                                    \
3986     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
3987     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
3988     uint32_t vma = vext_vma(desc);                            \
3989     uint32_t i;                                               \
3990                                                               \
3991     for (i = env->vstart; i < vl; i++) {                      \
3992         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
3993         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
3994         if (!vm && !vext_elem_mask(v0, i)) {                  \
3995             /* set masked-off elements to 1s */               \
3996             if (vma) {                                        \
3997                 vext_set_elem_mask(vd, i, 1);                 \
3998             }                                                 \
3999             continue;                                         \
4000         }                                                     \
4001         vext_set_elem_mask(vd, i,                             \
4002                            DO_OP(s2, s1, &env->fp_status));   \
4003     }                                                         \
4004     env->vstart = 0;                                          \
4005     /*
4006      * mask destination registers are always tail-agnostic
4007      * set tail elements to 1s
4008      */                                                       \
4009     if (vta_all_1s) {                                         \
4010         for (; i < total_elems; i++) {                        \
4011             vext_set_elem_mask(vd, i, 1);                     \
4012         }                                                     \
4013     }                                                         \
4014 }
4015 
4016 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4017 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4018 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4019 
4020 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4021 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4022                   CPURISCVState *env, uint32_t desc)                \
4023 {                                                                   \
4024     uint32_t vm = vext_vm(desc);                                    \
4025     uint32_t vl = env->vl;                                          \
4026     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
4027     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
4028     uint32_t vma = vext_vma(desc);                                  \
4029     uint32_t i;                                                     \
4030                                                                     \
4031     for (i = env->vstart; i < vl; i++) {                            \
4032         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4033         if (!vm && !vext_elem_mask(v0, i)) {                        \
4034             /* set masked-off elements to 1s */                     \
4035             if (vma) {                                              \
4036                 vext_set_elem_mask(vd, i, 1);                       \
4037             }                                                       \
4038             continue;                                               \
4039         }                                                           \
4040         vext_set_elem_mask(vd, i,                                   \
4041                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4042     }                                                               \
4043     env->vstart = 0;                                                \
4044     /*
4045      * mask destination register is always tail-agnostic
4046      * set tail elements to 1s
4047      */                                                             \
4048     if (vta_all_1s) {                                               \
4049         for (; i < total_elems; i++) {                              \
4050             vext_set_elem_mask(vd, i, 1);                           \
4051         }                                                           \
4052     }                                                               \
4053 }
4054 
4055 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4056 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4057 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4058 
4059 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4060 {
4061     FloatRelation compare = float16_compare_quiet(a, b, s);
4062     return compare != float_relation_equal;
4063 }
4064 
4065 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4066 {
4067     FloatRelation compare = float32_compare_quiet(a, b, s);
4068     return compare != float_relation_equal;
4069 }
4070 
4071 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4072 {
4073     FloatRelation compare = float64_compare_quiet(a, b, s);
4074     return compare != float_relation_equal;
4075 }
4076 
4077 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4078 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4079 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4080 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4081 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4082 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4083 
4084 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4085 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4086 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4087 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4088 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4089 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4090 
4091 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4092 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4093 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4094 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4095 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4096 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4097 
4098 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4099 {
4100     FloatRelation compare = float16_compare(a, b, s);
4101     return compare == float_relation_greater;
4102 }
4103 
4104 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4105 {
4106     FloatRelation compare = float32_compare(a, b, s);
4107     return compare == float_relation_greater;
4108 }
4109 
4110 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4111 {
4112     FloatRelation compare = float64_compare(a, b, s);
4113     return compare == float_relation_greater;
4114 }
4115 
4116 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4117 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4118 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4119 
4120 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4121 {
4122     FloatRelation compare = float16_compare(a, b, s);
4123     return compare == float_relation_greater ||
4124            compare == float_relation_equal;
4125 }
4126 
4127 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4128 {
4129     FloatRelation compare = float32_compare(a, b, s);
4130     return compare == float_relation_greater ||
4131            compare == float_relation_equal;
4132 }
4133 
4134 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4135 {
4136     FloatRelation compare = float64_compare(a, b, s);
4137     return compare == float_relation_greater ||
4138            compare == float_relation_equal;
4139 }
4140 
4141 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4142 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4143 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
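/*
 * Note on NaN behaviour: vmfeq/vmfne use the quiet compare primitives
 * (float*_eq_quiet, float*_compare_quiet), so they raise the invalid flag
 * only for signaling NaNs, while vmflt/vmfle/vmfgt/vmfge use the signaling
 * variants (float*_lt, float*_le, float*_compare) and raise invalid for any
 * NaN operand, matching the IEEE 754 / RVV comparison rules.
 */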
4144 
4145 /* Vector Floating-Point Classify Instruction */
4146 target_ulong fclass_h(uint64_t frs1)
4147 {
4148     float16 f = frs1;
4149     bool sign = float16_is_neg(f);
4150 
4151     if (float16_is_infinity(f)) {
4152         return sign ? 1 << 0 : 1 << 7;
4153     } else if (float16_is_zero(f)) {
4154         return sign ? 1 << 3 : 1 << 4;
4155     } else if (float16_is_zero_or_denormal(f)) {
4156         return sign ? 1 << 2 : 1 << 5;
4157     } else if (float16_is_any_nan(f)) {
4158         float_status s = { }; /* for snan_bit_is_one */
4159         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4160     } else {
4161         return sign ? 1 << 1 : 1 << 6;
4162     }
4163 }
4164 
4165 target_ulong fclass_s(uint64_t frs1)
4166 {
4167     float32 f = frs1;
4168     bool sign = float32_is_neg(f);
4169 
4170     if (float32_is_infinity(f)) {
4171         return sign ? 1 << 0 : 1 << 7;
4172     } else if (float32_is_zero(f)) {
4173         return sign ? 1 << 3 : 1 << 4;
4174     } else if (float32_is_zero_or_denormal(f)) {
4175         return sign ? 1 << 2 : 1 << 5;
4176     } else if (float32_is_any_nan(f)) {
4177         float_status s = { }; /* for snan_bit_is_one */
4178         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4179     } else {
4180         return sign ? 1 << 1 : 1 << 6;
4181     }
4182 }
4183 
4184 target_ulong fclass_d(uint64_t frs1)
4185 {
4186     float64 f = frs1;
4187     bool sign = float64_is_neg(f);
4188 
4189     if (float64_is_infinity(f)) {
4190         return sign ? 1 << 0 : 1 << 7;
4191     } else if (float64_is_zero(f)) {
4192         return sign ? 1 << 3 : 1 << 4;
4193     } else if (float64_is_zero_or_denormal(f)) {
4194         return sign ? 1 << 2 : 1 << 5;
4195     } else if (float64_is_any_nan(f)) {
4196         float_status s = { }; /* for snan_bit_is_one */
4197         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4198     } else {
4199         return sign ? 1 << 1 : 1 << 6;
4200     }
4201 }
4202 
4203 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4204 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4205 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4206 GEN_VEXT_V(vfclass_v_h, 2)
4207 GEN_VEXT_V(vfclass_v_w, 4)
4208 GEN_VEXT_V(vfclass_v_d, 8)
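/*
 * The fclass_* helpers above return the standard RISC-V 10-bit class mask:
 * bit 0 = -inf, 1 = negative normal, 2 = negative subnormal, 3 = -0,
 * bit 4 = +0, 5 = positive subnormal, 6 = positive normal, 7 = +inf,
 * bit 8 = signaling NaN, 9 = quiet NaN.  For example, fclass_s(0xBF800000),
 * i.e. -1.0f, returns 1 << 1 = 0x002, and fclass_s(0x7FC00000), a quiet
 * NaN, returns 1 << 9 = 0x200.
 */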
4209 
4210 /* Vector Floating-Point Merge Instruction */
4211 
4212 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4213 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4214                   CPURISCVState *env, uint32_t desc)          \
4215 {                                                             \
4216     uint32_t vm = vext_vm(desc);                              \
4217     uint32_t vl = env->vl;                                    \
4218     uint32_t esz = sizeof(ETYPE);                             \
4219     uint32_t total_elems =                                    \
4220         vext_get_total_elems(env, desc, esz);                 \
4221     uint32_t vta = vext_vta(desc);                            \
4222     uint32_t i;                                               \
4223                                                               \
4224     for (i = env->vstart; i < vl; i++) {                      \
4225         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4226         *((ETYPE *)vd + H(i)) =                               \
4227             (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4228     }                                                         \
4229     env->vstart = 0;                                          \
4230     /* set tail elements to 1s */                             \
4231     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
4232 }
4233 
4234 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4235 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4236 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
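/*
 * For illustration: with v0 mask bits {1, 0, 1, 0}, scalar s1 = 2.0 and
 * vs2 = {a, b, c, d}, the loop above produces vd = {2.0, b, 2.0, d}:
 * active elements take the scalar value, masked-off elements keep the
 * corresponding vs2 value.
 */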
4237 
4238 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4239 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4240 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4241 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4242 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4243 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4244 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4245 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4246 
4247 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4248 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4249 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4250 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4251 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4252 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4253 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4254 
4255 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4256 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4257 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4258 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4259 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4260 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4261 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4262 
4263 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4264 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4265 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4266 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4267 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4268 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4269 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4270 
4271 /* Widening Floating-Point/Integer Type-Convert Instructions */
4272 /* (TD, T2, TX2) */
4273 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4274 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4275 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4276 /*
4277  * vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.
4278  */
4279 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4280 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4281 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4282 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4283 
4284 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4285 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4286 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4287 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4288 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4289 
4290 /*
4291  * vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float.
4292  */
4293 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4294 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4295 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4296 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4297 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4298 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4299 
4300 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4301 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4302 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4303 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4304 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4305 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4306 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4307 
4308 /*
4309  * vfwcvt.f.f.v vd, vs2, vm # Convert single-width float to double-width float.
4310  */
4311 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4312 {
4313     return float16_to_float32(a, true, s);
4314 }
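/*
 * The 'true' argument to float16_to_float32() selects IEEE half-precision
 * (rather than the ARM alternative half format); the narrowing counterpart
 * vfncvtffv16() below passes the same flag to float32_to_float16().
 */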
4315 
4316 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4317 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4318 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4319 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4320 
4321 RVVCALL(OPFVV1, vfwcvtbf16_f_f_v, WOP_UU_H, H4, H2, bfloat16_to_float32)
4322 GEN_VEXT_V_ENV(vfwcvtbf16_f_f_v, 4)
4323 
4324 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4325 /* (TD, T2, TX2) */
4326 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4327 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4328 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4329 /* vfncvt.xu.f.w vd, vs2, vm # Convert double-width float to unsigned integer. */
4330 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4331 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4332 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4333 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4334 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4335 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4336 
4337 /* vfncvt.x.f.w vd, vs2, vm # Convert double-width float to signed integer. */
4338 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4339 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4340 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4341 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4342 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4343 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4344 
4345 /*
4346  * vfncvt.f.xu.w vd, vs2, vm # Convert double-width unsigned integer to float.
4347  */
4348 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4349 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4350 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4351 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4352 
4353 /* vfncvt.f.x.w vd, vs2, vm # Convert double-width integer to float. */
4354 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4355 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4356 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4357 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4358 
4359 /* vfncvt.f.f.w vd, vs2, vm # Convert double-width float to single-width float. */
4360 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4361 {
4362     return float32_to_float16(a, true, s);
4363 }
4364 
4365 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4366 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4367 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4368 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4369 
4370 RVVCALL(OPFVV1, vfncvtbf16_f_f_w, NOP_UU_H, H2, H4, float32_to_bfloat16)
4371 GEN_VEXT_V_ENV(vfncvtbf16_f_f_w, 2)
4372 
4373 /*
4374  * Vector Reduction Operations
4375  */
4376 /* Vector Single-Width Integer Reduction Instructions */
4377 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4378 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4379                   void *vs2, CPURISCVState *env,          \
4380                   uint32_t desc)                          \
4381 {                                                         \
4382     uint32_t vm = vext_vm(desc);                          \
4383     uint32_t vl = env->vl;                                \
4384     uint32_t esz = sizeof(TD);                            \
4385     uint32_t vlenb = simd_maxsz(desc);                    \
4386     uint32_t vta = vext_vta(desc);                        \
4387     uint32_t i;                                           \
4388     TD s1 = *((TD *)vs1 + HD(0));                         \
4389                                                           \
4390     for (i = env->vstart; i < vl; i++) {                  \
4391         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4392         if (!vm && !vext_elem_mask(v0, i)) {              \
4393             continue;                                     \
4394         }                                                 \
4395         s1 = OP(s1, (TD)s2);                              \
4396     }                                                     \
4397     *((TD *)vd + HD(0)) = s1;                             \
4398     env->vstart = 0;                                      \
4399     /* set tail elements to 1s */                         \
4400     vext_set_elems_1s(vd, vta, esz, vlenb);               \
4401 }
4402 
4403 /* vd[0] = sum(vs1[0], vs2[*]) */
4404 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4405 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4406 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4407 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
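/*
 * Worked example for vredsum.vs: with vs1[0] = 100, vs2 = {1, 2, 3, 4},
 * vl = 4 and mask bits {1, 0, 1, 1}, the accumulator visits only the
 * active elements, so vd[0] = 100 + 1 + 3 + 4 = 108; elements of vd past
 * element 0 are tail and are handled by vext_set_elems_1s() per vta.
 */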
4408 
4409 /* vd[0] = maxu(vs1[0], vs2[*]) */
4410 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4411 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4412 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4413 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4414 
4415 /* vd[0] = max(vs1[0], vs2[*]) */
4416 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4417 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4418 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4419 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4420 
4421 /* vd[0] = minu(vs1[0], vs2[*]) */
4422 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4423 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4424 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4425 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4426 
4427 /* vd[0] = min(vs1[0], vs2[*]) */
4428 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4429 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4430 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4431 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4432 
4433 /* vd[0] = and(vs1[0], vs2[*]) */
4434 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4435 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4436 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4437 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4438 
4439 /* vd[0] = or(vs1[0], vs2[*]) */
4440 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4441 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4442 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4443 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4444 
4445 /* vd[0] = xor(vs1[0], vs2[*]) */
4446 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4447 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4448 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4449 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4450 
4451 /* Vector Widening Integer Reduction Instructions */
4452 /* signed sum reduction into double-width accumulator */
4453 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4454 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4455 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4456 
4457 /* Unsigned sum reduction into double-width accumulator */
4458 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4459 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4460 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4461 
4462 /* Vector Single-Width Floating-Point Reduction Instructions */
4463 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4464 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4465                   void *vs2, CPURISCVState *env,           \
4466                   uint32_t desc)                           \
4467 {                                                          \
4468     uint32_t vm = vext_vm(desc);                           \
4469     uint32_t vl = env->vl;                                 \
4470     uint32_t esz = sizeof(TD);                             \
4471     uint32_t vlenb = simd_maxsz(desc);                     \
4472     uint32_t vta = vext_vta(desc);                         \
4473     uint32_t i;                                            \
4474     TD s1 = *((TD *)vs1 + HD(0));                          \
4475                                                            \
4476     for (i = env->vstart; i < vl; i++) {                   \
4477         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4478         if (!vm && !vext_elem_mask(v0, i)) {               \
4479             continue;                                      \
4480         }                                                  \
4481         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4482     }                                                      \
4483     *((TD *)vd + HD(0)) = s1;                              \
4484     env->vstart = 0;                                       \
4485     /* set tail elements to 1s */                          \
4486     vext_set_elems_1s(vd, vta, esz, vlenb);                \
4487 }
4488 
4489 /* Unordered sum */
4490 GEN_VEXT_FRED(vfredusum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4491 GEN_VEXT_FRED(vfredusum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4492 GEN_VEXT_FRED(vfredusum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4493 
4494 /* Ordered sum */
4495 GEN_VEXT_FRED(vfredosum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4496 GEN_VEXT_FRED(vfredosum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4497 GEN_VEXT_FRED(vfredosum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
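/*
 * Both the unordered (vfredusum) and ordered (vfredosum) sums expand to the
 * same strictly sequential loop here; that is a valid implementation of the
 * unordered form, which merely allows (but does not require) reassociation.
 */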
4498 
4499 /* Maximum value */
4500 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2,
4501               float16_maximum_number)
4502 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4,
4503               float32_maximum_number)
4504 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8,
4505               float64_maximum_number)
4506 
4507 /* Minimum value */
4508 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2,
4509               float16_minimum_number)
4510 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4,
4511               float32_minimum_number)
4512 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8,
4513               float64_minimum_number)
4514 
4515 /* Vector Widening Floating-Point Add Instructions */
4516 static uint32_t fwadd16(uint32_t a, uint16_t b, float_status *s)
4517 {
4518     return float32_add(a, float16_to_float32(b, true, s), s);
4519 }
4520 
4521 static uint64_t fwadd32(uint64_t a, uint32_t b, float_status *s)
4522 {
4523     return float64_add(a, float32_to_float64(b, s), s);
4524 }
4525 
4526 /* Vector Widening Floating-Point Reduction Instructions */
4527 /* Ordered/unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4528 GEN_VEXT_FRED(vfwredusum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4529 GEN_VEXT_FRED(vfwredusum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4530 GEN_VEXT_FRED(vfwredosum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4531 GEN_VEXT_FRED(vfwredosum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4532 
4533 /*
4534  * Vector Mask Operations
4535  */
4536 /* Vector Mask-Register Logical Instructions */
4537 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4538 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4539                   void *vs2, CPURISCVState *env,          \
4540                   uint32_t desc)                          \
4541 {                                                         \
4542     uint32_t vl = env->vl;                                \
4543     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;\
4544     uint32_t vta_all_1s = vext_vta_all_1s(desc);          \
4545     uint32_t i;                                           \
4546     int a, b;                                             \
4547                                                           \
4548     for (i = env->vstart; i < vl; i++) {                  \
4549         a = vext_elem_mask(vs1, i);                       \
4550         b = vext_elem_mask(vs2, i);                       \
4551         vext_set_elem_mask(vd, i, OP(b, a));              \
4552     }                                                     \
4553     env->vstart = 0;                                      \
4554     /*
4555      * mask destination register is always tail-agnostic
4556      * set tail elements to 1s
4557      */                                                   \
4558     if (vta_all_1s) {                                     \
4559         for (; i < total_elems; i++) {                    \
4560             vext_set_elem_mask(vd, i, 1);                 \
4561         }                                                 \
4562     }                                                     \
4563 }
4564 
4565 #define DO_NAND(N, M)  (!(N & M))
4566 #define DO_ANDNOT(N, M)  (N & !M)
4567 #define DO_NOR(N, M)  (!(N | M))
4568 #define DO_ORNOT(N, M)  (N | !M)
4569 #define DO_XNOR(N, M)  (!(N ^ M))
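/*
 * N and M are single mask bits (0 or 1) as returned by vext_elem_mask(),
 * so logical negation acts as a one-bit complement here, e.g.
 * DO_ANDNOT(1, 0) = (1 & !0) = 1 and DO_XNOR(1, 1) = !(1 ^ 1) = 1.
 */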
4570 
4571 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4572 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4573 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4574 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4575 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4576 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4577 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4578 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4579 
4580 /* Vector count population in mask vcpop */
4581 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4582                              uint32_t desc)
4583 {
4584     target_ulong cnt = 0;
4585     uint32_t vm = vext_vm(desc);
4586     uint32_t vl = env->vl;
4587     int i;
4588 
4589     for (i = env->vstart; i < vl; i++) {
4590         if (vm || vext_elem_mask(v0, i)) {
4591             if (vext_elem_mask(vs2, i)) {
4592                 cnt++;
4593             }
4594         }
4595     }
4596     env->vstart = 0;
4597     return cnt;
4598 }
4599 
4600 /* vfirst find-first-set mask bit */
4601 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4602                               uint32_t desc)
4603 {
4604     uint32_t vm = vext_vm(desc);
4605     uint32_t vl = env->vl;
4606     int i;
4607 
4608     for (i = env->vstart; i < vl; i++) {
4609         if (vm || vext_elem_mask(v0, i)) {
4610             if (vext_elem_mask(vs2, i)) {
4611                 return i;
4612             }
4613         }
4614     }
4615     env->vstart = 0;
4616     return -1LL;
4617 }
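/*
 * For illustration: with vl = 5, all elements active and vs2 mask bits
 * {0, 0, 1, 0, 1}, vcpop.m returns 2 and vfirst.m returns 2 (the index of
 * the first set bit); vfirst.m returns -1 when no active bit is set.
 */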
4618 
4619 enum set_mask_type {
4620     ONLY_FIRST = 1,
4621     INCLUDE_FIRST,
4622     BEFORE_FIRST,
4623 };
4624 
4625 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4626                    uint32_t desc, enum set_mask_type type)
4627 {
4628     uint32_t vm = vext_vm(desc);
4629     uint32_t vl = env->vl;
4630     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;
4631     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4632     uint32_t vma = vext_vma(desc);
4633     int i;
4634     bool first_mask_bit = false;
4635 
4636     for (i = env->vstart; i < vl; i++) {
4637         if (!vm && !vext_elem_mask(v0, i)) {
4638             /* set masked-off elements to 1s */
4639             if (vma) {
4640                 vext_set_elem_mask(vd, i, 1);
4641             }
4642             continue;
4643         }
4644         /* write a zero to all following active elements */
4645         if (first_mask_bit) {
4646             vext_set_elem_mask(vd, i, 0);
4647             continue;
4648         }
4649         if (vext_elem_mask(vs2, i)) {
4650             first_mask_bit = true;
4651             if (type == BEFORE_FIRST) {
4652                 vext_set_elem_mask(vd, i, 0);
4653             } else {
4654                 vext_set_elem_mask(vd, i, 1);
4655             }
4656         } else {
4657             if (type == ONLY_FIRST) {
4658                 vext_set_elem_mask(vd, i, 0);
4659             } else {
4660                 vext_set_elem_mask(vd, i, 1);
4661             }
4662         }
4663     }
4664     env->vstart = 0;
4665     /*
4666      * mask destination register is always tail-agnostic
4667      * set tail elements to 1s
4668      */
4669     if (vta_all_1s) {
4670         for (; i < total_elems; i++) {
4671             vext_set_elem_mask(vd, i, 1);
4672         }
4673     }
4674 }
4675 
4676 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4677                      uint32_t desc)
4678 {
4679     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4680 }
4681 
4682 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4683                      uint32_t desc)
4684 {
4685     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4686 }
4687 
4688 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4689                      uint32_t desc)
4690 {
4691     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4692 }
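/*
 * Worked example, all elements active, vs2 mask bits {0, 0, 1, 0, 1}:
 *   vmsbf.m (BEFORE_FIRST)  -> {1, 1, 0, 0, 0}
 *   vmsif.m (INCLUDE_FIRST) -> {1, 1, 1, 0, 0}
 *   vmsof.m (ONLY_FIRST)    -> {0, 0, 1, 0, 0}
 */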
4693 
4694 /* Vector Iota Instruction */
4695 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4696 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4697                   uint32_t desc)                                          \
4698 {                                                                         \
4699     uint32_t vm = vext_vm(desc);                                          \
4700     uint32_t vl = env->vl;                                                \
4701     uint32_t esz = sizeof(ETYPE);                                         \
4702     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4703     uint32_t vta = vext_vta(desc);                                        \
4704     uint32_t vma = vext_vma(desc);                                        \
4705     uint32_t sum = 0;                                                     \
4706     int i;                                                                \
4707                                                                           \
4708     for (i = env->vstart; i < vl; i++) {                                  \
4709         if (!vm && !vext_elem_mask(v0, i)) {                              \
4710             /* set masked-off elements to 1s */                           \
4711             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4712             continue;                                                     \
4713         }                                                                 \
4714         *((ETYPE *)vd + H(i)) = sum;                                      \
4715         if (vext_elem_mask(vs2, i)) {                                     \
4716             sum++;                                                        \
4717         }                                                                 \
4718     }                                                                     \
4719     env->vstart = 0;                                                      \
4720     /* set tail elements to 1s */                                         \
4721     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4722 }
4723 
4724 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
4725 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
4726 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
4727 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
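/*
 * For illustration: with all elements active and vs2 mask bits
 * {1, 0, 1, 1}, viota.m writes the exclusive prefix sum of the mask,
 * i.e. vd = {0, 1, 1, 2}.
 */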
4728 
4729 /* Vector Element Index Instruction */
4730 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
4731 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
4732 {                                                                         \
4733     uint32_t vm = vext_vm(desc);                                          \
4734     uint32_t vl = env->vl;                                                \
4735     uint32_t esz = sizeof(ETYPE);                                         \
4736     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4737     uint32_t vta = vext_vta(desc);                                        \
4738     uint32_t vma = vext_vma(desc);                                        \
4739     int i;                                                                \
4740                                                                           \
4741     for (i = env->vstart; i < vl; i++) {                                  \
4742         if (!vm && !vext_elem_mask(v0, i)) {                              \
4743             /* set masked-off elements to 1s */                           \
4744             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4745             continue;                                                     \
4746         }                                                                 \
4747         *((ETYPE *)vd + H(i)) = i;                                        \
4748     }                                                                     \
4749     env->vstart = 0;                                                      \
4750     /* set tail elements to 1s */                                         \
4751     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4752 }
4753 
4754 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
4755 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
4756 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
4757 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
4758 
4759 /*
4760  * Vector Permutation Instructions
4761  */
4762 
4763 /* Vector Slide Instructions */
4764 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
4765 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4766                   CPURISCVState *env, uint32_t desc)                      \
4767 {                                                                         \
4768     uint32_t vm = vext_vm(desc);                                          \
4769     uint32_t vl = env->vl;                                                \
4770     uint32_t esz = sizeof(ETYPE);                                         \
4771     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4772     uint32_t vta = vext_vta(desc);                                        \
4773     uint32_t vma = vext_vma(desc);                                        \
4774     target_ulong offset = s1, i_min, i;                                   \
4775                                                                           \
4776     i_min = MAX(env->vstart, offset);                                     \
4777     for (i = i_min; i < vl; i++) {                                        \
4778         if (!vm && !vext_elem_mask(v0, i)) {                              \
4779             /* set masked-off elements to 1s */                           \
4780             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4781             continue;                                                     \
4782         }                                                                 \
4783         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
4784     }                                                                     \
4785     /* set tail elements to 1s */                                         \
4786     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4787 }
4788 
4789 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
4790 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
4791 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
4792 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
4793 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
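/*
 * For illustration: with offset = 2, vl = 4 and vs2 = {a, b, c, d},
 * vslideup.vx writes vd[2] = a and vd[3] = b; vd[0] and vd[1] lie below
 * the offset and are left undisturbed, which is why the loop starts at
 * i_min = MAX(vstart, offset).
 */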
4794 
4795 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
4796 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4797                   CPURISCVState *env, uint32_t desc)                      \
4798 {                                                                         \
4799     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
4800     uint32_t vm = vext_vm(desc);                                          \
4801     uint32_t vl = env->vl;                                                \
4802     uint32_t esz = sizeof(ETYPE);                                         \
4803     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4804     uint32_t vta = vext_vta(desc);                                        \
4805     uint32_t vma = vext_vma(desc);                                        \
4806     target_ulong i_max, i_min, i;                                         \
4807                                                                           \
4808     i_min = MIN(s1 < vlmax ? vlmax - s1 : 0, vl);                         \
4809     i_max = MAX(i_min, env->vstart);                                      \
4810     for (i = env->vstart; i < i_max; ++i) {                               \
4811         if (!vm && !vext_elem_mask(v0, i)) {                              \
4812             /* set masked-off elements to 1s */                           \
4813             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4814             continue;                                                     \
4815         }                                                                 \
4816         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));              \
4817     }                                                                     \
4818                                                                           \
4819     for (i = i_max; i < vl; ++i) {                                        \
4820         if (vm || vext_elem_mask(v0, i)) {                                \
4821             *((ETYPE *)vd + H(i)) = 0;                                    \
4822         }                                                                 \
4823     }                                                                     \
4824                                                                           \
4825     env->vstart = 0;                                                      \
4826     /* set tail elements to 1s */                                         \
4827     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4828 }
4829 
4830 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
4831 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
4832 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
4833 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
4834 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
4835 
4836 #define GEN_VEXT_VSLIDE1UP(BITWIDTH, H)                                     \
4837 static void vslide1up_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
4838                                  void *vs2, CPURISCVState *env,             \
4839                                  uint32_t desc)                             \
4840 {                                                                           \
4841     typedef uint##BITWIDTH##_t ETYPE;                                       \
4842     uint32_t vm = vext_vm(desc);                                            \
4843     uint32_t vl = env->vl;                                                  \
4844     uint32_t esz = sizeof(ETYPE);                                           \
4845     uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
4846     uint32_t vta = vext_vta(desc);                                          \
4847     uint32_t vma = vext_vma(desc);                                          \
4848     uint32_t i;                                                             \
4849                                                                             \
4850     for (i = env->vstart; i < vl; i++) {                                    \
4851         if (!vm && !vext_elem_mask(v0, i)) {                                \
4852             /* set masked-off elements to 1s */                             \
4853             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);             \
4854             continue;                                                       \
4855         }                                                                   \
4856         if (i == 0) {                                                       \
4857             *((ETYPE *)vd + H(i)) = s1;                                     \
4858         } else {                                                            \
4859             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
4860         }                                                                   \
4861     }                                                                       \
4862     env->vstart = 0;                                                        \
4863     /* set tail elements to 1s */                                           \
4864     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
4865 }
4866 
4867 GEN_VEXT_VSLIDE1UP(8,  H1)
4868 GEN_VEXT_VSLIDE1UP(16, H2)
4869 GEN_VEXT_VSLIDE1UP(32, H4)
4870 GEN_VEXT_VSLIDE1UP(64, H8)
4871 
4872 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
4873 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
4874                   CPURISCVState *env, uint32_t desc)              \
4875 {                                                                 \
4876     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
4877 }
4878 
4879 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
4880 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
4881 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
4882 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
4883 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
4884 
4885 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
4886 static void vslide1down_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
4887                                    void *vs2, CPURISCVState *env,             \
4888                                    uint32_t desc)                             \
4889 {                                                                             \
4890     typedef uint##BITWIDTH##_t ETYPE;                                         \
4891     uint32_t vm = vext_vm(desc);                                              \
4892     uint32_t vl = env->vl;                                                    \
4893     uint32_t esz = sizeof(ETYPE);                                             \
4894     uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
4895     uint32_t vta = vext_vta(desc);                                            \
4896     uint32_t vma = vext_vma(desc);                                            \
4897     uint32_t i;                                                               \
4898                                                                               \
4899     for (i = env->vstart; i < vl; i++) {                                      \
4900         if (!vm && !vext_elem_mask(v0, i)) {                                  \
4901             /* set masked-off elements to 1s */                               \
4902             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);               \
4903             continue;                                                         \
4904         }                                                                     \
4905         if (i == vl - 1) {                                                    \
4906             *((ETYPE *)vd + H(i)) = s1;                                       \
4907         } else {                                                              \
4908             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
4909         }                                                                     \
4910     }                                                                         \
4911     env->vstart = 0;                                                          \
4912     /* set tail elements to 1s */                                             \
4913     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
4914 }
4915 
4916 GEN_VEXT_VSLIDE1DOWN(8,  H1)
4917 GEN_VEXT_VSLIDE1DOWN(16, H2)
4918 GEN_VEXT_VSLIDE1DOWN(32, H4)
4919 GEN_VEXT_VSLIDE1DOWN(64, H8)
4920 
4921 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
4922 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
4923                   CPURISCVState *env, uint32_t desc)              \
4924 {                                                                 \
4925     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
4926 }
4927 
4928 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
4929 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
4930 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
4931 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
4932 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
4933 
4934 /* Vector Floating-Point Slide Instructions */
4935 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
4936 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4937                   CPURISCVState *env, uint32_t desc)          \
4938 {                                                             \
4939     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
4940 }
4941 
4942 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
4943 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
4944 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
4945 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
4946 
4947 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
4948 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4949                   CPURISCVState *env, uint32_t desc)          \
4950 {                                                             \
4951     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
4952 }
4953 
4954 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
4955 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
4956 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
4957 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
4958 
4959 /* Vector Register Gather Instruction */
4960 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
4961 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
4962                   CPURISCVState *env, uint32_t desc)                      \
4963 {                                                                         \
4964     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
4965     uint32_t vm = vext_vm(desc);                                          \
4966     uint32_t vl = env->vl;                                                \
4967     uint32_t esz = sizeof(TS2);                                           \
4968     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4969     uint32_t vta = vext_vta(desc);                                        \
4970     uint32_t vma = vext_vma(desc);                                        \
4971     uint64_t index;                                                       \
4972     uint32_t i;                                                           \
4973                                                                           \
4974     for (i = env->vstart; i < vl; i++) {                                  \
4975         if (!vm && !vext_elem_mask(v0, i)) {                              \
4976             /* set masked-off elements to 1s */                           \
4977             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4978             continue;                                                     \
4979         }                                                                 \
4980         index = *((TS1 *)vs1 + HS1(i));                                   \
4981         if (index >= vlmax) {                                             \
4982             *((TS2 *)vd + HS2(i)) = 0;                                    \
4983         } else {                                                          \
4984             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
4985         }                                                                 \
4986     }                                                                     \
4987     env->vstart = 0;                                                      \
4988     /* set tail elements to 1s */                                         \
4989     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4990 }
4991 
4992 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
4993 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
4994 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
4995 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
4996 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
4997 
4998 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
4999 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5000 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5001 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
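/*
 * Worked example for vrgather.vv with vlmax = 4, vs1 = {3, 0, 2, 9} and
 * vs2 = {10, 20, 30, 40}: vd = {vs2[3], vs2[0], vs2[2], 0}
 * = {40, 10, 30, 0}, the last index being out of range.
 */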
5002 
5003 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5004 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5005                   CPURISCVState *env, uint32_t desc)                      \
5006 {                                                                         \
5007     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5008     uint32_t vm = vext_vm(desc);                                          \
5009     uint32_t vl = env->vl;                                                \
5010     uint32_t esz = sizeof(ETYPE);                                         \
5011     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5012     uint32_t vta = vext_vta(desc);                                        \
5013     uint32_t vma = vext_vma(desc);                                        \
5014     uint64_t index = s1;                                                  \
5015     uint32_t i;                                                           \
5016                                                                           \
5017     for (i = env->vstart; i < vl; i++) {                                  \
5018         if (!vm && !vext_elem_mask(v0, i)) {                              \
5019             /* set masked-off elements to 1s */                           \
5020             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5021             continue;                                                     \
5022         }                                                                 \
5023         if (index >= vlmax) {                                             \
5024             *((ETYPE *)vd + H(i)) = 0;                                    \
5025         } else {                                                          \
5026             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5027         }                                                                 \
5028     }                                                                     \
5029     env->vstart = 0;                                                      \
5030     /* set tail elements to 1s */                                         \
5031     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5032 }
5033 
5034 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
5035 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5036 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5037 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5038 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
5039 
5040 /* Vector Compress Instruction */
5041 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5042 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5043                   CPURISCVState *env, uint32_t desc)                      \
5044 {                                                                         \
5045     uint32_t vl = env->vl;                                                \
5046     uint32_t esz = sizeof(ETYPE);                                         \
5047     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5048     uint32_t vta = vext_vta(desc);                                        \
5049     uint32_t num = 0, i;                                                  \
5050                                                                           \
5051     for (i = env->vstart; i < vl; i++) {                                  \
5052         if (!vext_elem_mask(vs1, i)) {                                    \
5053             continue;                                                     \
5054         }                                                                 \
5055         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5056         num++;                                                            \
5057     }                                                                     \
5058     env->vstart = 0;                                                      \
5059     /* set tail elements to 1s */                                         \
5060     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5061 }
5062 
5063 /* Compress into vd elements of vs2 where vs1 is enabled */
5064 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5065 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5066 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5067 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
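/*
 * For illustration: with vl = 4, vs1 mask bits {1, 0, 1, 1} and
 * vs2 = {a, b, c, d}, the loop above packs vd[0..2] = {a, c, d} and leaves
 * vd[3] untouched; tail elements past vl are then set per vta.
 */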
5068 
5069 /* Vector Whole Register Move */
5070 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5071 {
5072     /* EEW = SEW */
5073     uint32_t maxsz = simd_maxsz(desc);
5074     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5075     uint32_t startb = env->vstart * sewb;
5076     uint32_t i = startb;
5077 
5078     memcpy((uint8_t *)vd + H1(i),
5079            (uint8_t *)vs2 + H1(i),
5080            maxsz - startb);
5081 
5082     env->vstart = 0;
5083 }
5084 
5085 /* Vector Integer Extension */
5086 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5087 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5088                   CPURISCVState *env, uint32_t desc)             \
5089 {                                                                \
5090     uint32_t vl = env->vl;                                       \
5091     uint32_t vm = vext_vm(desc);                                 \
5092     uint32_t esz = sizeof(ETYPE);                                \
5093     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5094     uint32_t vta = vext_vta(desc);                               \
5095     uint32_t vma = vext_vma(desc);                               \
5096     uint32_t i;                                                  \
5097                                                                  \
5098     for (i = env->vstart; i < vl; i++) {                         \
5099         if (!vm && !vext_elem_mask(v0, i)) {                     \
5100             /* set masked-off elements to 1s */                  \
5101             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);  \
5102             continue;                                            \
5103         }                                                        \
5104         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5105     }                                                            \
5106     env->vstart = 0;                                             \
5107     /* set tail elements to 1s */                                \
5108     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5109 }
5110 
5111 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5112 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5113 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5114 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5115 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5116 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5117 
5118 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5119 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5120 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5121 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5122 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5123 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
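/*
 * For illustration: applied to the byte 0x80, vzext.vf2 produces the
 * halfword 0x0080 (128) while vsext.vf2 produces 0xFF80 (-128); the
 * vf4/vf8 variants extend from SEW/4 and SEW/8 sources in the same way.
 */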
5124