xref: /qemu/target/riscv/vector_helper.c (revision 7653b1ea)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/cpu_ldst.h"
26 #include "exec/helper-proto.h"
27 #include "fpu/softfloat.h"
28 #include "tcg/tcg-gvec-desc.h"
29 #include "internals.h"
30 #include "vector_internals.h"
31 #include <math.h>
32 
/*
 * vsetvl/vsetvli: validate the requested vtype (s2) and AVL (s1), then
 * update vl, vtype, vstart and vill in the CPU state.
 *
 * Returns the new vl, or 0 (with only vill set) when vtype is illegal.
 */
target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
                            target_ulong s2)
{
    int vlmax, vl;
    RISCVCPU *cpu = env_archcpu(env);
    uint64_t vlmul = FIELD_EX64(s2, VTYPE, VLMUL);
    uint8_t vsew = FIELD_EX64(s2, VTYPE, VSEW);
    uint16_t sew = 8 << vsew;
    uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
    int xlen = riscv_cpu_xlen(env);
    /* vill is the most-significant bit of vtype, whatever XLEN is */
    bool vill = (s2 >> (xlen - 1)) & 0x1;
    /* any set bit between the defined vtype fields and vill is reserved */
    target_ulong reserved = s2 &
                            MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
                                            xlen - 1 - R_VTYPE_RESERVED_SHIFT);
    /* NOTE(review): assumes VLEN fits in 16 bits — confirm for large vlenb */
    uint16_t vlen = cpu->cfg.vlenb << 3;
    int8_t lmul;

    if (vlmul & 4) {
        /*
         * Fractional LMUL, check:
         *
         * VLEN * LMUL >= SEW
         * VLEN >> (8 - lmul) >= sew
         * (vlenb << 3) >> (8 - lmul) >= sew
         */
        if (vlmul == 4 || (vlen >> (8 - vlmul)) < sew) {
            /* vlmul == 4 is the reserved LMUL encoding */
            vill = true;
        }
    }

    if ((sew > cpu->cfg.elen) || vill || (ediv != 0) || (reserved != 0)) {
        /* only set vill bit. */
        env->vill = 1;
        env->vtype = 0;
        env->vl = 0;
        env->vstart = 0;
        return 0;
    }

    /* lmul encoded as in DisasContext::lmul */
    lmul = sextract32(FIELD_EX64(s2, VTYPE, VLMUL), 0, 3);
    vlmax = vext_get_vlmax(cpu->cfg.vlenb, vsew, lmul);
    /* clamp the requested AVL to VLMAX */
    if (s1 <= vlmax) {
        vl = s1;
    } else {
        vl = vlmax;
    }
    env->vl = vl;
    env->vtype = s2;
    env->vstart = 0;
    env->vill = 0;
    return vl;
}
86 
87 /*
88  * Get the maximum number of elements can be operated.
89  *
90  * log2_esz: log2 of element size in bytes.
91  */
92 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
93 {
94     /*
95      * As simd_desc support at most 2048 bytes, the max vlen is 1024 bits.
96      * so vlen in bytes (vlenb) is encoded as maxsz.
97      */
98     uint32_t vlenb = simd_maxsz(desc);
99 
100     /* Return VLMAX */
101     int scale = vext_lmul(desc) - log2_esz;
102     return scale < 0 ? vlenb >> -scale : vlenb << scale;
103 }
104 
/* Apply pointer masking (cur_pmmask/cur_pmbase) to a guest address. */
static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
{
    return (addr & ~env->cur_pmmask) | env->cur_pmbase;
}
109 
110 /*
111  * This function checks watchpoint before real load operation.
112  *
113  * In system mode, the TLB API probe_access is enough for watchpoint check.
114  * In user mode, there is no watchpoint support now.
115  *
116  * It will trigger an exception if there is no mapping in TLB
117  * and page table walk can't fill the TLB entry. Then the guest
118  * software can return here after process the exception or never return.
119  */
120 static void probe_pages(CPURISCVState *env, target_ulong addr,
121                         target_ulong len, uintptr_t ra,
122                         MMUAccessType access_type)
123 {
124     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
125     target_ulong curlen = MIN(pagelen, len);
126     int mmu_index = riscv_env_mmu_index(env, false);
127 
128     probe_access(env, adjust_addr(env, addr), curlen, access_type,
129                  mmu_index, ra);
130     if (len > curlen) {
131         addr += curlen;
132         curlen = len - curlen;
133         probe_access(env, adjust_addr(env, addr), curlen, access_type,
134                      mmu_index, ra);
135     }
136 }
137 
138 static inline void vext_set_elem_mask(void *v0, int index,
139                                       uint8_t value)
140 {
141     int idx = index / 64;
142     int pos = index % 64;
143     uint64_t old = ((uint64_t *)v0)[idx];
144     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
145 }
146 
147 /* elements operations for load and store */
148 typedef void vext_ldst_elem_fn(CPURISCVState *env, abi_ptr addr,
149                                uint32_t idx, void *vd, uintptr_t retaddr);
150 
151 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
152 static void NAME(CPURISCVState *env, abi_ptr addr,         \
153                  uint32_t idx, void *vd, uintptr_t retaddr)\
154 {                                                          \
155     ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
156     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
157 }                                                          \
158 
159 GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
160 GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
161 GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
162 GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
163 
/* Store one element from vd[idx] to guest memory. */
#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
static void NAME(CPURISCVState *env, abi_ptr addr,         \
                 uint32_t idx, void *vd, uintptr_t retaddr)\
{                                                          \
    ETYPE data = *((ETYPE *)vd + H(idx));                  \
    cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
}

GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
176 
177 static void vext_set_tail_elems_1s(target_ulong vl, void *vd,
178                                    uint32_t desc, uint32_t nf,
179                                    uint32_t esz, uint32_t max_elems)
180 {
181     uint32_t vta = vext_vta(desc);
182     int k;
183 
184     if (vta == 0) {
185         return;
186     }
187 
188     for (k = 0; k < nf; ++k) {
189         vext_set_elems_1s(vd, vta, (k * max_elems + vl) * esz,
190                           (k * max_elems + max_elems) * esz);
191     }
192 }
193 
/*
 * stride: access vector elements from strided memory.  One element-group
 * (nf fields) is transferred per iteration; vstart is advanced per element
 * so a faulting access can be restarted mid-vector.
 */
static void
vext_ldst_stride(void *vd, void *v0, target_ulong base,
                 target_ulong stride, CPURISCVState *env,
                 uint32_t desc, uint32_t vm,
                 vext_ldst_elem_fn *ldst_elem,
                 uint32_t log2_esz, uintptr_t ra)
{
    uint32_t i, k;
    uint32_t nf = vext_nf(desc);
    uint32_t max_elems = vext_max_elems(desc, log2_esz);
    uint32_t esz = 1 << log2_esz;
    uint32_t vma = vext_vma(desc);

    for (i = env->vstart; i < env->vl; i++, env->vstart++) {
        k = 0;
        while (k < nf) {
            if (!vm && !vext_elem_mask(v0, i)) {
                /* set masked-off elements to 1s */
                vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
                                  (i + k * max_elems + 1) * esz);
                k++;
                continue;
            }
            target_ulong addr = base + stride * i + (k << log2_esz);
            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
            k++;
        }
    }
    env->vstart = 0;

    /* apply the tail-agnostic policy to elements past vl */
    vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
}
229 
230 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
231 void HELPER(NAME)(void *vd, void * v0, target_ulong base,               \
232                   target_ulong stride, CPURISCVState *env,              \
233                   uint32_t desc)                                        \
234 {                                                                       \
235     uint32_t vm = vext_vm(desc);                                        \
236     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
237                      ctzl(sizeof(ETYPE)), GETPC());                     \
238 }
239 
240 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
241 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
242 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
243 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
244 
/* Generate a strided-store helper entry point for element type ETYPE. */
#define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
                  target_ulong stride, CPURISCVState *env,              \
                  uint32_t desc)                                        \
{                                                                       \
    uint32_t vm = vext_vm(desc);                                        \
    vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
                     ctzl(sizeof(ETYPE)), GETPC());                     \
}

GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b)
GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
259 
/*
 * unit-stride: access elements stored contiguously in memory
 */

/*
 * Unmasked unit-stride load/store: transfers evl element-groups of nf
 * fields each, starting at vstart.  evl is passed explicitly so the
 * mask-register forms (vlm/vsm) can use a byte-granular length.
 */
static void
vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
             vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
             uintptr_t ra)
{
    uint32_t i, k;
    uint32_t nf = vext_nf(desc);
    uint32_t max_elems = vext_max_elems(desc, log2_esz);
    uint32_t esz = 1 << log2_esz;

    /* load bytes from guest memory */
    for (i = env->vstart; i < evl; i++, env->vstart++) {
        k = 0;
        while (k < nf) {
            target_ulong addr = base + ((i * nf + k) << log2_esz);
            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
            k++;
        }
    }
    env->vstart = 0;

    vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
}
288 
/*
 * The masked unit-stride load/store is handled as a special case of the
 * strided form, with stride = NF * sizeof(ETYPE).
 */

#define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN)                            \
void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
                         CPURISCVState *env, uint32_t desc)             \
{                                                                       \
    uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
    vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
                     ctzl(sizeof(ETYPE)), GETPC());                     \
}                                                                       \
                                                                        \
void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
                  CPURISCVState *env, uint32_t desc)                    \
{                                                                       \
    vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
                 ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
}

GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b)
GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
314 
/* Unit-stride store; masked variant goes through the strided path. */
#define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN)                            \
void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
                         CPURISCVState *env, uint32_t desc)              \
{                                                                        \
    uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
    vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,   \
                     ctzl(sizeof(ETYPE)), GETPC());                      \
}                                                                        \
                                                                         \
void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
                  CPURISCVState *env, uint32_t desc)                     \
{                                                                        \
    vext_ldst_us(vd, base, env, desc, STORE_FN,                          \
                 ctzl(sizeof(ETYPE)), env->vl, GETPC());                 \
}

GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b)
GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
335 
336 /*
337  * unit stride mask load and store, EEW = 1
338  */
339 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
340                     CPURISCVState *env, uint32_t desc)
341 {
342     /* evl = ceil(vl/8) */
343     uint8_t evl = (env->vl + 7) >> 3;
344     vext_ldst_us(vd, base, env, desc, lde_b,
345                  0, evl, GETPC());
346 }
347 
348 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
349                     CPURISCVState *env, uint32_t desc)
350 {
351     /* evl = ceil(vl/8) */
352     uint8_t evl = (env->vl + 7) >> 3;
353     vext_ldst_us(vd, base, env, desc, ste_b,
354                  0, evl, GETPC());
355 }
356 
/*
 * index: access vector element from indexed memory
 */
typedef target_ulong vext_get_index_addr(target_ulong base,
        uint32_t idx, void *vs2);

/* Compute base + zero-extended index element vs2[idx]. */
#define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
static target_ulong NAME(target_ulong base,            \
                         uint32_t idx, void *vs2)      \
{                                                      \
    return (base + *((ETYPE *)vs2 + H(idx)));          \
}

GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
374 
/*
 * Indexed (gather/scatter) load/store: the effective address of element i
 * is base + vs2[i] (+ field offset); vstart advances per element so a
 * faulting access can be restarted.
 */
static inline void
vext_ldst_index(void *vd, void *v0, target_ulong base,
                void *vs2, CPURISCVState *env, uint32_t desc,
                vext_get_index_addr get_index_addr,
                vext_ldst_elem_fn *ldst_elem,
                uint32_t log2_esz, uintptr_t ra)
{
    uint32_t i, k;
    uint32_t nf = vext_nf(desc);
    uint32_t vm = vext_vm(desc);
    uint32_t max_elems = vext_max_elems(desc, log2_esz);
    uint32_t esz = 1 << log2_esz;
    uint32_t vma = vext_vma(desc);

    /* load bytes from guest memory */
    for (i = env->vstart; i < env->vl; i++, env->vstart++) {
        k = 0;
        while (k < nf) {
            if (!vm && !vext_elem_mask(v0, i)) {
                /* set masked-off elements to 1s */
                vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
                                  (i + k * max_elems + 1) * esz);
                k++;
                continue;
            }
            abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
            k++;
        }
    }
    env->vstart = 0;

    vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
}
409 
/*
 * Indexed-load helpers: one per (index EEW, data EEW) combination;
 * INDEX_FN selects the index width, ETYPE/LOAD_FN the data width.
 */
#define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
                  void *vs2, CPURISCVState *env, uint32_t desc)            \
{                                                                          \
    vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
                    LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
}

GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b)
GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h)
GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w)
GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d)
GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b)
GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b)
GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b)
GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
434 
/* Indexed-store helpers, mirroring GEN_VEXT_LD_INDEX above. */
#define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
                  void *vs2, CPURISCVState *env, uint32_t desc)  \
{                                                                \
    vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
                    STORE_FN, ctzl(sizeof(ETYPE)),               \
                    GETPC());                                    \
}

GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b)
GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h)
GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w)
GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d)
GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b)
GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b)
GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b)
GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
460 
/*
 * unit-stride fault-only-first load instructions
 *
 * A fault on element 0 traps normally; a fault on any element i > 0
 * instead shrinks vl to i and completes without trapping.
 */
static inline void
vext_ldff(void *vd, void *v0, target_ulong base,
          CPURISCVState *env, uint32_t desc,
          vext_ldst_elem_fn *ldst_elem,
          uint32_t log2_esz, uintptr_t ra)
{
    void *host;
    uint32_t i, k, vl = 0;
    uint32_t nf = vext_nf(desc);
    uint32_t vm = vext_vm(desc);
    uint32_t max_elems = vext_max_elems(desc, log2_esz);
    uint32_t esz = 1 << log2_esz;
    uint32_t vma = vext_vma(desc);
    target_ulong addr, offset, remain;
    int mmu_index = riscv_env_mmu_index(env, false);

    /* probe every access */
    for (i = env->vstart; i < env->vl; i++) {
        if (!vm && !vext_elem_mask(v0, i)) {
            continue;
        }
        addr = adjust_addr(env, base + i * (nf << log2_esz));
        if (i == 0) {
            /* element 0 must fault normally, so probe may raise */
            probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
        } else {
            /* if it triggers an exception, no need to check watchpoint */
            remain = nf << log2_esz;
            while (remain > 0) {
                /* bytes left until the end of the current page */
                offset = -(addr | TARGET_PAGE_MASK);
                host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD, mmu_index);
                if (host) {
#ifdef CONFIG_USER_ONLY
                    if (!page_check_range(addr, offset, PAGE_READ)) {
                        /* element i would fault: trim vl instead */
                        vl = i;
                        goto ProbeSuccess;
                    }
#else
                    probe_pages(env, addr, offset, ra, MMU_DATA_LOAD);
#endif
                } else {
                    /* not mapped: trim vl to the faulting element */
                    vl = i;
                    goto ProbeSuccess;
                }
                if (remain <=  offset) {
                    break;
                }
                remain -= offset;
                addr = adjust_addr(env, addr + offset);
            }
        }
    }
ProbeSuccess:
    /* load bytes from guest memory */
    if (vl != 0) {
        /* vl was trimmed by a fault on some element i > 0 */
        env->vl = vl;
    }
    for (i = env->vstart; i < env->vl; i++) {
        k = 0;
        while (k < nf) {
            if (!vm && !vext_elem_mask(v0, i)) {
                /* set masked-off elements to 1s */
                vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
                                  (i + k * max_elems + 1) * esz);
                k++;
                continue;
            }
            addr = base + ((i * nf + k) << log2_esz);
            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
            k++;
        }
    }
    env->vstart = 0;

    vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
}
539 
/* Generate a fault-only-first load helper for element type ETYPE. */
#define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
                  CPURISCVState *env, uint32_t desc)      \
{                                                         \
    vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
              ctzl(sizeof(ETYPE)), GETPC());              \
}

GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b)
GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
552 
/*
 * Primitive scalar operations used by the RVVCALL/GEN_VEXT_* expanders.
 * Arguments are fully parenthesized so the macros remain correct if an
 * expander ever passes a compound expression.
 */
#define DO_SWAP(N, M) (M)
#define DO_AND(N, M)  ((N) & (M))
#define DO_XOR(N, M)  ((N) ^ (M))
#define DO_OR(N, M)   ((N) | (M))
#define DO_ADD(N, M)  ((N) + (M))

/* Signed min/max */
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
562 
/*
 * load and store whole register instructions
 */
static void
vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
                vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
{
    uint32_t i, k, off, pos;
    uint32_t nf = vext_nf(desc);
    uint32_t vlenb = riscv_cpu_cfg(env)->vlenb;
    /* number of elements in one whole vector register */
    uint32_t max_elems = vlenb >> log2_esz;

    /* map vstart to a resume point: register k, element off within it */
    k = env->vstart / max_elems;
    off = env->vstart % max_elems;

    if (off) {
        /* load/store rest of elements of current segment pointed by vstart */
        for (pos = off; pos < max_elems; pos++, env->vstart++) {
            target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
            ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd,
                      ra);
        }
        k++;
    }

    /* load/store elements for rest of segments */
    for (; k < nf; k++) {
        for (i = 0; i < max_elems; i++, env->vstart++) {
            target_ulong addr = base + ((i + k * max_elems) << log2_esz);
            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
        }
    }

    env->vstart = 0;
}
598 
/* Whole-register loads: vl<N>re<EEW>.v for N in {1,2,4,8}. */
#define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
void HELPER(NAME)(void *vd, target_ulong base,       \
                  CPURISCVState *env, uint32_t desc) \
{                                                    \
    vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
                    ctzl(sizeof(ETYPE)), GETPC());   \
}

GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b)
GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b)
GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b)
GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b)
GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
623 
/* Whole-register stores; EEW is always 8 for vs<N>r.v. */
#define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
void HELPER(NAME)(void *vd, target_ulong base,       \
                  CPURISCVState *env, uint32_t desc) \
{                                                    \
    vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
                    ctzl(sizeof(ETYPE)), GETPC());   \
}

GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
636 
/*
 * Vector Integer Arithmetic Instructions
 */

/*
 * Operand-type tuples for RVVCALL: (TD, T1, T2, TX1, TX2).
 * S = signed, U = unsigned operand; the WOP_ variants have a double-width
 * destination (widening), the NOP_ variants a double-width source
 * (narrowing); _B/_H/_W/_D select the element size.
 */
#define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
#define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
#define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
#define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
#define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
#define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
#define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
#define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
#define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
#define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
#define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
#define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
#define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
#define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
#define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
#define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
#define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
#define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
#define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
#define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
#define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
#define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
#define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
665 
/* Subtraction helpers; arguments parenthesized for macro hygiene. */
#define DO_SUB(N, M) ((N) - (M))
#define DO_RSUB(N, M) ((M) - (N))
668 
/* vadd.vv / vsub.vv per-element ops, expanded for every SEW */
RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)

/* helper entry points (second argument is the element size in bytes) */
GEN_VEXT_VV(vadd_vv_b, 1)
GEN_VEXT_VV(vadd_vv_h, 2)
GEN_VEXT_VV(vadd_vv_w, 4)
GEN_VEXT_VV(vadd_vv_d, 8)
GEN_VEXT_VV(vsub_vv_b, 1)
GEN_VEXT_VV(vsub_vv_h, 2)
GEN_VEXT_VV(vsub_vv_w, 4)
GEN_VEXT_VV(vsub_vv_d, 8)
686 
687 
/* vector-scalar forms: vadd.vx / vsub.vx / vrsub.vx for every SEW */
RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)

GEN_VEXT_VX(vadd_vx_b, 1)
GEN_VEXT_VX(vadd_vx_h, 2)
GEN_VEXT_VX(vadd_vx_w, 4)
GEN_VEXT_VX(vadd_vx_d, 8)
GEN_VEXT_VX(vsub_vx_b, 1)
GEN_VEXT_VX(vsub_vx_h, 2)
GEN_VEXT_VX(vsub_vx_w, 4)
GEN_VEXT_VX(vsub_vx_d, 8)
GEN_VEXT_VX(vrsub_vx_b, 1)
GEN_VEXT_VX(vrsub_vx_h, 2)
GEN_VEXT_VX(vrsub_vx_w, 4)
GEN_VEXT_VX(vrsub_vx_d, 8)
713 
714 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
715 {
716     intptr_t oprsz = simd_oprsz(desc);
717     intptr_t i;
718 
719     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
720         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
721     }
722 }
723 
724 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
725 {
726     intptr_t oprsz = simd_oprsz(desc);
727     intptr_t i;
728 
729     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
730         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
731     }
732 }
733 
734 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
735 {
736     intptr_t oprsz = simd_oprsz(desc);
737     intptr_t i;
738 
739     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
740         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
741     }
742 }
743 
744 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
745 {
746     intptr_t oprsz = simd_oprsz(desc);
747     intptr_t i;
748 
749     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
750         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
751     }
752 }
753 
/* Vector Widening Integer Add/Subtract */

/*
 * Widening tuples: WOP_*U*/WOP_*S* take single-width sources and a
 * double-width destination; WOP_W* take one already-widened (.wv) source.
 * NOTE: WOP_SSS_B/H/W repeat the identical definitions given earlier in
 * this file; the redefinition is benign (identical macro redefinition is
 * legal C) but could be dropped.
 */
#define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
#define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
#define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
#define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
#define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
#define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
#define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
#define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
#define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
#define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
#define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
#define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
GEN_VEXT_VV(vwaddu_vv_b, 2)
GEN_VEXT_VV(vwaddu_vv_h, 4)
GEN_VEXT_VV(vwaddu_vv_w, 8)
GEN_VEXT_VV(vwsubu_vv_b, 2)
GEN_VEXT_VV(vwsubu_vv_h, 4)
GEN_VEXT_VV(vwsubu_vv_w, 8)
GEN_VEXT_VV(vwadd_vv_b, 2)
GEN_VEXT_VV(vwadd_vv_h, 4)
GEN_VEXT_VV(vwadd_vv_w, 8)
GEN_VEXT_VV(vwsub_vv_b, 2)
GEN_VEXT_VV(vwsub_vv_h, 4)
GEN_VEXT_VV(vwsub_vv_w, 8)
GEN_VEXT_VV(vwaddu_wv_b, 2)
GEN_VEXT_VV(vwaddu_wv_h, 4)
GEN_VEXT_VV(vwaddu_wv_w, 8)
GEN_VEXT_VV(vwsubu_wv_b, 2)
GEN_VEXT_VV(vwsubu_wv_h, 4)
GEN_VEXT_VV(vwsubu_wv_w, 8)
GEN_VEXT_VV(vwadd_wv_b, 2)
GEN_VEXT_VV(vwadd_wv_h, 4)
GEN_VEXT_VV(vwadd_wv_w, 8)
GEN_VEXT_VV(vwsub_wv_b, 2)
GEN_VEXT_VV(vwsub_wv_h, 4)
GEN_VEXT_VV(vwsub_wv_w, 8)
815 
/*
 * Vector-scalar (vx/wx) widening add/subtract: OPIVX2 takes the rs1
 * scalar as a target_ulong instead of a second vector register; the
 * WOP_* type tuples are shared with the vv/wv forms above.
 */
RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
GEN_VEXT_VX(vwaddu_vx_b, 2)
GEN_VEXT_VX(vwaddu_vx_h, 4)
GEN_VEXT_VX(vwaddu_vx_w, 8)
GEN_VEXT_VX(vwsubu_vx_b, 2)
GEN_VEXT_VX(vwsubu_vx_h, 4)
GEN_VEXT_VX(vwsubu_vx_w, 8)
GEN_VEXT_VX(vwadd_vx_b, 2)
GEN_VEXT_VX(vwadd_vx_h, 4)
GEN_VEXT_VX(vwadd_vx_w, 8)
GEN_VEXT_VX(vwsub_vx_b, 2)
GEN_VEXT_VX(vwsub_vx_h, 4)
GEN_VEXT_VX(vwsub_vx_w, 8)
GEN_VEXT_VX(vwaddu_wx_b, 2)
GEN_VEXT_VX(vwaddu_wx_h, 4)
GEN_VEXT_VX(vwaddu_wx_w, 8)
GEN_VEXT_VX(vwsubu_wx_b, 2)
GEN_VEXT_VX(vwsubu_wx_h, 4)
GEN_VEXT_VX(vwsubu_wx_w, 8)
GEN_VEXT_VX(vwadd_wx_b, 2)
GEN_VEXT_VX(vwadd_wx_h, 4)
GEN_VEXT_VX(vwadd_wx_w, 8)
GEN_VEXT_VX(vwsub_wx_b, 2)
GEN_VEXT_VX(vwsub_wx_h, 4)
GEN_VEXT_VX(vwsub_wx_w, 8)
864 
/* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
/* C is the per-element carry/borrow-in bit (0 or 1) */
#define DO_VADC(N, M, C) (N + M + C)
#define DO_VSBC(N, M, C) (N - M - C)

/*
 * Generate a vadc/vsbc helper: vd[i] = DO_OP(vs2[i], vs1[i], carry-in),
 * with the carry-in for element i read from mask register v0.  Note
 * there is no mask predication here: every body element in
 * [vstart, vl) is written, and tail elements are filled with 1s
 * according to the vta (tail-agnostic) policy.
 */
#define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    uint32_t vl = env->vl;                                    \
    uint32_t esz = sizeof(ETYPE);                             \
    uint32_t total_elems =                                    \
        vext_get_total_elems(env, desc, esz);                 \
    uint32_t vta = vext_vta(desc);                            \
    uint32_t i;                                               \
                                                              \
    for (i = env->vstart; i < vl; i++) {                      \
        ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
        ETYPE carry = vext_elem_mask(v0, i);                  \
                                                              \
        *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
    }                                                         \
    env->vstart = 0;                                          \
    /* set tail elements to 1s */                             \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
}

GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)

GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
901 
/*
 * Scalar variant: vd[i] = DO_OP(vs2[i], rs1, carry-in).  The
 * (ETYPE)(target_long)s1 cast first sign-extends the scalar to the
 * target register width, then truncates it to the element width.
 */
#define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
                  CPURISCVState *env, uint32_t desc)                     \
{                                                                        \
    uint32_t vl = env->vl;                                               \
    uint32_t esz = sizeof(ETYPE);                                        \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
    uint32_t vta = vext_vta(desc);                                       \
    uint32_t i;                                                          \
                                                                         \
    for (i = env->vstart; i < vl; i++) {                                 \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
        ETYPE carry = vext_elem_mask(v0, i);                             \
                                                                         \
        *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
    }                                                                    \
    env->vstart = 0;                                                     \
    /* set tail elements to 1s */                                        \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
}

GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)

GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
932 
/*
 * Carry-out / borrow-out predicates on unsigned ETYPE arithmetic:
 * DO_MADC is true iff N + M + C wraps around (unsigned overflow),
 * DO_MSBC is true iff N - M - C underflows.  The __typeof casts force
 * the wrap test back to the element width after integer promotion.
 */
#define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
                          (__typeof(N))(N + M) < N)
#define DO_MSBC(N, M, C) (C ? N <= M : N < M)

/*
 * Generate a vmadc/vmsbc helper writing each element's carry/borrow-out
 * into mask destination vd.  The carry-in comes from v0 only when the
 * masked (vm == 0) form is used.  total_elems is VLEN bits (vlenb * 8),
 * i.e. one mask bit per possible element.
 */
#define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    uint32_t vl = env->vl;                                    \
    uint32_t vm = vext_vm(desc);                              \
    uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
    uint32_t i;                                               \
                                                              \
    for (i = env->vstart; i < vl; i++) {                      \
        ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
        ETYPE carry = !vm && vext_elem_mask(v0, i);           \
        vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
    }                                                         \
    env->vstart = 0;                                          \
    /*
     * mask destination register are always tail-agnostic
     * set tail elements to 1s
     */                                                       \
    if (vta_all_1s) {                                         \
        for (; i < total_elems; i++) {                        \
            vext_set_elem_mask(vd, i, 1);                     \
        }                                                     \
    }                                                         \
}

GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)

GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
974 
/*
 * Scalar variant of the carry/borrow-out helpers: the rs1 scalar is
 * sign-extended and truncated to the element width, exactly as in
 * GEN_VEXT_VADC_VXM above.
 */
#define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
                  void *vs2, CPURISCVState *env, uint32_t desc) \
{                                                               \
    uint32_t vl = env->vl;                                      \
    uint32_t vm = vext_vm(desc);                                \
    uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;      \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
    uint32_t i;                                                 \
                                                                \
    for (i = env->vstart; i < vl; i++) {                        \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
        ETYPE carry = !vm && vext_elem_mask(v0, i);             \
        vext_set_elem_mask(vd, i,                               \
                DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
    }                                                           \
    env->vstart = 0;                                            \
    /*
     * mask destination register are always tail-agnostic
     * set tail elements to 1s
     */                                                         \
    if (vta_all_1s) {                                           \
        for (; i < total_elems; i++) {                          \
            vext_set_elem_mask(vd, i, 1);                       \
        }                                                       \
    }                                                           \
}

GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)

GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1012 
/* Vector Bitwise Logical Instructions */
/* vv forms: elementwise AND/OR/XOR of vs2 and vs1 */
RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
GEN_VEXT_VV(vand_vv_b, 1)
GEN_VEXT_VV(vand_vv_h, 2)
GEN_VEXT_VV(vand_vv_w, 4)
GEN_VEXT_VV(vand_vv_d, 8)
GEN_VEXT_VV(vor_vv_b, 1)
GEN_VEXT_VV(vor_vv_h, 2)
GEN_VEXT_VV(vor_vv_w, 4)
GEN_VEXT_VV(vor_vv_d, 8)
GEN_VEXT_VV(vxor_vv_b, 1)
GEN_VEXT_VV(vxor_vv_h, 2)
GEN_VEXT_VV(vxor_vv_w, 4)
GEN_VEXT_VV(vxor_vv_d, 8)

/* vx forms: the rs1 scalar takes the place of vs1 */
RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
GEN_VEXT_VX(vand_vx_b, 1)
GEN_VEXT_VX(vand_vx_h, 2)
GEN_VEXT_VX(vand_vx_w, 4)
GEN_VEXT_VX(vand_vx_d, 8)
GEN_VEXT_VX(vor_vx_b, 1)
GEN_VEXT_VX(vor_vx_h, 2)
GEN_VEXT_VX(vor_vx_w, 4)
GEN_VEXT_VX(vor_vx_d, 8)
GEN_VEXT_VX(vxor_vx_b, 1)
GEN_VEXT_VX(vxor_vx_h, 2)
GEN_VEXT_VX(vxor_vx_w, 4)
GEN_VEXT_VX(vxor_vx_d, 8)
1063 
/* Vector Single-Width Bit Shift Instructions */
#define DO_SLL(N, M)  (N << (M))
#define DO_SRL(N, M)  (N >> (M))

/* generate the helpers for shift instructions with two vector operators */
/*
 * TS1 is the destination/shift-amount type, TS2 the type of the value
 * being shifted; vd[i] = OP(vs2[i], vs1[i] & MASK), where MASK keeps
 * only the low log2(SEW) bits of the shift amount.  vsra reuses DO_SRL
 * with a signed TS2, relying on >> being an arithmetic shift for
 * signed operands.  Masked-off elements are filled with 1s per vma,
 * tail elements per vta.
 */
#define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
                  void *vs2, CPURISCVState *env, uint32_t desc)           \
{                                                                         \
    uint32_t vm = vext_vm(desc);                                          \
    uint32_t vl = env->vl;                                                \
    uint32_t esz = sizeof(TS1);                                           \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
    uint32_t vta = vext_vta(desc);                                        \
    uint32_t vma = vext_vma(desc);                                        \
    uint32_t i;                                                           \
                                                                          \
    for (i = env->vstart; i < vl; i++) {                                  \
        if (!vm && !vext_elem_mask(v0, i)) {                              \
            /* set masked-off elements to 1s */                           \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
            continue;                                                     \
        }                                                                 \
        TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
        TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
        *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
}

GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)

GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)

GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1110 
1111 /*
1112  * generate the helpers for shift instructions with one vector and one scalar
1113  */
1114 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1115 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1116                   void *vs2, CPURISCVState *env,            \
1117                   uint32_t desc)                            \
1118 {                                                           \
1119     uint32_t vm = vext_vm(desc);                            \
1120     uint32_t vl = env->vl;                                  \
1121     uint32_t esz = sizeof(TD);                              \
1122     uint32_t total_elems =                                  \
1123         vext_get_total_elems(env, desc, esz);               \
1124     uint32_t vta = vext_vta(desc);                          \
1125     uint32_t vma = vext_vma(desc);                          \
1126     uint32_t i;                                             \
1127                                                             \
1128     for (i = env->vstart; i < vl; i++) {                    \
1129         if (!vm && !vext_elem_mask(v0, i)) {                \
1130             /* set masked-off elements to 1s */             \
1131             vext_set_elems_1s(vd, vma, i * esz,             \
1132                               (i + 1) * esz);               \
1133             continue;                                       \
1134         }                                                   \
1135         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1136         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1137     }                                                       \
1138     env->vstart = 0;                                        \
1139     /* set tail elements to 1s */                           \
1140     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1141 }
1142 
1143 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1144 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1145 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1146 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1147 
1148 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1149 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1150 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1151 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1152 
1153 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1154 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1155 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1156 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1157 
1158 /* Vector Narrowing Integer Right Shift Instructions */
1159 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1160 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1161 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1162 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1163 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1164 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1165 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1166 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1167 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1168 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1169 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1170 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1171 
/* Vector Integer Comparison Instructions */
/* comparison predicates; signedness comes from the ETYPE instantiation */
#define DO_MSEQ(N, M) (N == M)
#define DO_MSNE(N, M) (N != M)
#define DO_MSLT(N, M) (N < M)
#define DO_MSLE(N, M) (N <= M)
#define DO_MSGT(N, M) (N > M)

/*
 * Generate a vector-vector compare helper writing one result bit per
 * element into mask destination vd: vd.mask[i] = DO_OP(vs2[i], vs1[i]).
 * Masked-off bits are set to 1 when vma is set; total_elems is VLEN
 * bits (vlenb * 8).
 */
#define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    uint32_t vm = vext_vm(desc);                              \
    uint32_t vl = env->vl;                                    \
    uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
    uint32_t vma = vext_vma(desc);                            \
    uint32_t i;                                               \
                                                              \
    for (i = env->vstart; i < vl; i++) {                      \
        ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
        if (!vm && !vext_elem_mask(v0, i)) {                  \
            /* set masked-off elements to 1s */               \
            if (vma) {                                        \
                vext_set_elem_mask(vd, i, 1);                 \
            }                                                 \
            continue;                                         \
        }                                                     \
        vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
    }                                                         \
    env->vstart = 0;                                          \
    /*
     * mask destination register are always tail-agnostic
     * set tail elements to 1s
     */                                                       \
    if (vta_all_1s) {                                         \
        for (; i < total_elems; i++) {                        \
            vext_set_elem_mask(vd, i, 1);                     \
        }                                                     \
    }                                                         \
}

GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)

GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)

GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)

GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)

GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)

GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1243 
/*
 * Vector-scalar compares: vd.mask[i] = DO_OP(vs2[i], rs1), with rs1
 * sign-extended then truncated to the element width.  Note vmsgt[u]
 * exists only in this scalar form (DO_MSGT is instantiated only here).
 */
#define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
                  CPURISCVState *env, uint32_t desc)                \
{                                                                   \
    uint32_t vm = vext_vm(desc);                                    \
    uint32_t vl = env->vl;                                          \
    uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
    uint32_t vma = vext_vma(desc);                                  \
    uint32_t i;                                                     \
                                                                    \
    for (i = env->vstart; i < vl; i++) {                            \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
        if (!vm && !vext_elem_mask(v0, i)) {                        \
            /* set masked-off elements to 1s */                     \
            if (vma) {                                              \
                vext_set_elem_mask(vd, i, 1);                       \
            }                                                       \
            continue;                                               \
        }                                                           \
        vext_set_elem_mask(vd, i,                                   \
                DO_OP(s2, (ETYPE)(target_long)s1));                 \
    }                                                               \
    env->vstart = 0;                                                \
    /*
     * mask destination register are always tail-agnostic
     * set tail elements to 1s
     */                                                             \
    if (vta_all_1s) {                                               \
        for (; i < total_elems; i++) {                              \
            vext_set_elem_mask(vd, i, 1);                           \
        }                                                           \
    }                                                               \
}

GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)

GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)

GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)

GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)

GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)

GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)

GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)

GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1318 
/* Vector Integer Min/Max Instructions */
/* unsigned forms (vminu/vmaxu) use OP_UUU_*, signed forms OP_SSS_* */
RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
GEN_VEXT_VV(vminu_vv_b, 1)
GEN_VEXT_VV(vminu_vv_h, 2)
GEN_VEXT_VV(vminu_vv_w, 4)
GEN_VEXT_VV(vminu_vv_d, 8)
GEN_VEXT_VV(vmin_vv_b, 1)
GEN_VEXT_VV(vmin_vv_h, 2)
GEN_VEXT_VV(vmin_vv_w, 4)
GEN_VEXT_VV(vmin_vv_d, 8)
GEN_VEXT_VV(vmaxu_vv_b, 1)
GEN_VEXT_VV(vmaxu_vv_h, 2)
GEN_VEXT_VV(vmaxu_vv_w, 4)
GEN_VEXT_VV(vmaxu_vv_d, 8)
GEN_VEXT_VV(vmax_vv_b, 1)
GEN_VEXT_VV(vmax_vv_h, 2)
GEN_VEXT_VV(vmax_vv_w, 4)
GEN_VEXT_VV(vmax_vv_d, 8)

/* vector-scalar forms */
RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
GEN_VEXT_VX(vminu_vx_b, 1)
GEN_VEXT_VX(vminu_vx_h, 2)
GEN_VEXT_VX(vminu_vx_w, 4)
GEN_VEXT_VX(vminu_vx_d, 8)
GEN_VEXT_VX(vmin_vx_b, 1)
GEN_VEXT_VX(vmin_vx_h, 2)
GEN_VEXT_VX(vmin_vx_w, 4)
GEN_VEXT_VX(vmin_vx_d, 8)
GEN_VEXT_VX(vmaxu_vx_b, 1)
GEN_VEXT_VX(vmaxu_vx_h, 2)
GEN_VEXT_VX(vmaxu_vx_w, 4)
GEN_VEXT_VX(vmaxu_vx_d, 8)
GEN_VEXT_VX(vmax_vx_b, 1)
GEN_VEXT_VX(vmax_vx_h, 2)
GEN_VEXT_VX(vmax_vx_w, 4)
GEN_VEXT_VX(vmax_vx_d, 8)
1385 
/* Vector Single-Width Integer Multiply Instructions */
/*
 * Low-half multiply; wraps modulo 2^SEW.  NOTE(review): for the signed
 * cases this presumably relies on wrapping arithmetic (-fwrapv) —
 * confirm against the build flags.
 */
#define DO_MUL(N, M) (N * M)
RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
GEN_VEXT_VV(vmul_vv_b, 1)
GEN_VEXT_VV(vmul_vv_h, 2)
GEN_VEXT_VV(vmul_vv_w, 4)
GEN_VEXT_VV(vmul_vv_d, 8)
1396 
/* High 8 bits of the signed 8x8 -> 16 bit product. */
static int8_t do_mulh_b(int8_t s2, int8_t s1)
{
    /* Operands promote to int, so the full product is exact. */
    int16_t prod = s2 * s1;

    return prod >> 8;
}
1401 
/* High 16 bits of the signed 16x16 -> 32 bit product. */
static int16_t do_mulh_h(int16_t s2, int16_t s1)
{
    int32_t prod = (int32_t)s2 * s1;

    return prod >> 16;
}
1406 
/* High 32 bits of the signed 32x32 -> 64 bit product. */
static int32_t do_mulh_w(int32_t s2, int32_t s1)
{
    int64_t prod = (int64_t)s2 * s1;

    return prod >> 32;
}
1411 
/* High 64 bits of the signed 64x64 -> 128 bit product (via muls64). */
static int64_t do_mulh_d(int64_t s2, int64_t s1)
{
    uint64_t hi, discard;

    muls64(&discard, &hi, s2, s1);
    return hi;
}
1419 
/* High 8 bits of the unsigned 8x8 -> 16 bit product. */
static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
{
    /* Operands promote to int; the 16-bit product is exact. */
    uint16_t prod = s2 * s1;

    return prod >> 8;
}
1424 
/* High 16 bits of the unsigned 16x16 -> 32 bit product. */
static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
{
    uint32_t prod = (uint32_t)s2 * s1;

    return prod >> 16;
}
1429 
/* High 32 bits of the unsigned 32x32 -> 64 bit product. */
static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
{
    uint64_t prod = (uint64_t)s2 * s1;

    return prod >> 32;
}
1434 
/* High 64 bits of the unsigned 64x64 -> 128 bit product (via mulu64). */
static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
{
    uint64_t hi, discard;

    mulu64(&discard, &hi, s2, s1);
    return hi;
}
1442 
/* High 8 bits of the signed x unsigned 8x8 -> 16 bit product. */
static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
{
    /* Both operands promote to int, so this is an exact signed product. */
    int prod = s2 * s1;

    return prod >> 8;
}
1447 
/*
 * High 16 bits of the signed x unsigned 16x16 -> 32 bit product:
 * sign-extend s2, do the multiply modulo 2^32 and take the top half
 * of the two's-complement bit pattern.
 */
static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
{
    uint32_t prod = (uint32_t)(int32_t)s2 * s1;

    return (int16_t)(prod >> 16);
}
1452 
/*
 * High 32 bits of the signed x unsigned 32x32 -> 64 bit product:
 * sign-extend s2, multiply modulo 2^64 and take the top half of the
 * two's-complement bit pattern.
 */
static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
{
    uint64_t prod = (uint64_t)(int64_t)s2 * s1;

    return (int32_t)(prod >> 32);
}
1457 
/*
 * Signed-by-unsigned high multiply, derived from the unsigned product.
 *
 * Let  A = signed 64-bit operand (s2),
 *      B = unsigned 64-bit operand (s1),
 *      P = mulu64(A, B), the 128-bit product of the raw bit patterns,
 *      SP = the true signed product A * B.
 *
 * If A >= 0, the raw value of A equals A, so SP = P.
 *
 * If A < 0, the raw (unsigned) value of A is A + 2^64, so
 *      P  = (A + 2^64) * B = A * B + 2^64 * B
 *      SP = A * B = P - 2^64 * B
 * i.e. correct the high half of the unsigned product:
 *      HI_P -= (A < 0 ? B : 0)
 */
1476 
/* See the derivation above: unsigned high multiply, then correct for
 * a negative s2 by subtracting s1 from the high half. */
static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
{
    uint64_t hi, lo;

    mulu64(&lo, &hi, s2, s1);
    if (s2 < 0) {
        hi -= s1;
    }
    return hi;
}
1486 
/* High-half multiply, vector-vector forms. */
RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
GEN_VEXT_VV(vmulh_vv_b, 1)
GEN_VEXT_VV(vmulh_vv_h, 2)
GEN_VEXT_VV(vmulh_vv_w, 4)
GEN_VEXT_VV(vmulh_vv_d, 8)
GEN_VEXT_VV(vmulhu_vv_b, 1)
GEN_VEXT_VV(vmulhu_vv_h, 2)
GEN_VEXT_VV(vmulhu_vv_w, 4)
GEN_VEXT_VV(vmulhu_vv_d, 8)
GEN_VEXT_VV(vmulhsu_vv_b, 1)
GEN_VEXT_VV(vmulhsu_vv_h, 2)
GEN_VEXT_VV(vmulhsu_vv_w, 4)
GEN_VEXT_VV(vmulhsu_vv_d, 8)

/* Low-half and high-half multiply, vector-scalar forms. */
RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
GEN_VEXT_VX(vmul_vx_b, 1)
GEN_VEXT_VX(vmul_vx_h, 2)
GEN_VEXT_VX(vmul_vx_w, 4)
GEN_VEXT_VX(vmul_vx_d, 8)
GEN_VEXT_VX(vmulh_vx_b, 1)
GEN_VEXT_VX(vmulh_vx_h, 2)
GEN_VEXT_VX(vmulh_vx_w, 4)
GEN_VEXT_VX(vmulh_vx_d, 8)
GEN_VEXT_VX(vmulhu_vx_b, 1)
GEN_VEXT_VX(vmulhu_vx_h, 2)
GEN_VEXT_VX(vmulhu_vx_w, 4)
GEN_VEXT_VX(vmulhu_vx_d, 8)
GEN_VEXT_VX(vmulhsu_vx_b, 1)
GEN_VEXT_VX(vmulhsu_vx_h, 2)
GEN_VEXT_VX(vmulhsu_vx_w, 4)
GEN_VEXT_VX(vmulhsu_vx_d, 8)
1544 
/* Vector Integer Divide Instructions */
/* Unsigned divide by zero yields all-ones. */
#define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
/* Remainder by zero yields the dividend unchanged. */
#define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
/*
 * Signed divide: divide by zero yields -1; the overflow case
 * TYPE_MIN / -1 yields TYPE_MIN.  (N == -N) identifies TYPE_MIN (it is
 * also true for 0, where N and N / M agree anyway).  NOTE(review):
 * computing -N on TYPE_MIN presumably relies on wrapping arithmetic
 * (-fwrapv) — confirm against the build flags.
 */
#define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) : \
        unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
/* Signed remainder: the TYPE_MIN % -1 overflow case yields 0. */
#define DO_REM(N, M)  (unlikely(M == 0) ? N : \
        unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
1552 
/* Divide/remainder, vector-vector forms. */
RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
GEN_VEXT_VV(vdivu_vv_b, 1)
GEN_VEXT_VV(vdivu_vv_h, 2)
GEN_VEXT_VV(vdivu_vv_w, 4)
GEN_VEXT_VV(vdivu_vv_d, 8)
GEN_VEXT_VV(vdiv_vv_b, 1)
GEN_VEXT_VV(vdiv_vv_h, 2)
GEN_VEXT_VV(vdiv_vv_w, 4)
GEN_VEXT_VV(vdiv_vv_d, 8)
GEN_VEXT_VV(vremu_vv_b, 1)
GEN_VEXT_VV(vremu_vv_h, 2)
GEN_VEXT_VV(vremu_vv_w, 4)
GEN_VEXT_VV(vremu_vv_d, 8)
GEN_VEXT_VV(vrem_vv_b, 1)
GEN_VEXT_VV(vrem_vv_h, 2)
GEN_VEXT_VV(vrem_vv_w, 4)
GEN_VEXT_VV(vrem_vv_d, 8)

/* Divide/remainder, vector-scalar forms. */
RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
GEN_VEXT_VX(vdivu_vx_b, 1)
GEN_VEXT_VX(vdivu_vx_h, 2)
GEN_VEXT_VX(vdivu_vx_w, 4)
GEN_VEXT_VX(vdivu_vx_d, 8)
GEN_VEXT_VX(vdiv_vx_b, 1)
GEN_VEXT_VX(vdiv_vx_h, 2)
GEN_VEXT_VX(vdiv_vx_w, 4)
GEN_VEXT_VX(vdiv_vx_d, 8)
GEN_VEXT_VX(vremu_vx_b, 1)
GEN_VEXT_VX(vremu_vx_h, 2)
GEN_VEXT_VX(vremu_vx_w, 4)
GEN_VEXT_VX(vremu_vx_d, 8)
GEN_VEXT_VX(vrem_vx_b, 1)
GEN_VEXT_VX(vrem_vx_h, 2)
GEN_VEXT_VX(vrem_vx_w, 4)
GEN_VEXT_VX(vrem_vx_d, 8)
1618 
/* Vector Widening Integer Multiply Instructions */
/*
 * WOP_* type lists widen the destination to 2*SEW, so DO_MUL here is
 * an exact widening product (no _d variants: 64-bit SEW cannot widen).
 */
RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
GEN_VEXT_VV(vwmul_vv_b, 2)
GEN_VEXT_VV(vwmul_vv_h, 4)
GEN_VEXT_VV(vwmul_vv_w, 8)
GEN_VEXT_VV(vwmulu_vv_b, 2)
GEN_VEXT_VV(vwmulu_vv_h, 4)
GEN_VEXT_VV(vwmulu_vv_w, 8)
GEN_VEXT_VV(vwmulsu_vv_b, 2)
GEN_VEXT_VV(vwmulsu_vv_h, 4)
GEN_VEXT_VV(vwmulsu_vv_w, 8)

RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
GEN_VEXT_VX(vwmul_vx_b, 2)
GEN_VEXT_VX(vwmul_vx_h, 4)
GEN_VEXT_VX(vwmul_vx_w, 8)
GEN_VEXT_VX(vwmulu_vx_b, 2)
GEN_VEXT_VX(vwmulu_vx_h, 4)
GEN_VEXT_VX(vwmulu_vx_w, 8)
GEN_VEXT_VX(vwmulsu_vx_b, 2)
GEN_VEXT_VX(vwmulsu_vx_h, 4)
GEN_VEXT_VX(vwmulsu_vx_w, 8)
1657 
/* Vector Single-Width Integer Multiply-Add Instructions */
/*
 * Expand one ternary vector-vector element op: OP receives the vs2
 * element, the vs1 element, and the current destination element
 * (the accumulator) and its result overwrites vd[i].
 */
#define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
{                                                                  \
    TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
    TD d = *((TD *)vd + HD(i));                                    \
    *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
}
1667 
/*
 * Multiply-add element ops.  Operand roles per OPIVV3/OPIVX3:
 * N = vs2 element, M = vs1 element (or scalar rs1), D = accumulator
 * (the current vd element).  vmacc/vnmsac accumulate into D;
 * vmadd/vnmsub multiply the accumulator and add N.
 */
#define DO_MACC(N, M, D) (M * N + D)
#define DO_NMSAC(N, M, D) (-(M * N) + D)
#define DO_MADD(N, M, D) (M * D + N)
#define DO_NMSUB(N, M, D) (-(M * D) + N)
RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
GEN_VEXT_VV(vmacc_vv_b, 1)
GEN_VEXT_VV(vmacc_vv_h, 2)
GEN_VEXT_VV(vmacc_vv_w, 4)
GEN_VEXT_VV(vmacc_vv_d, 8)
GEN_VEXT_VV(vnmsac_vv_b, 1)
GEN_VEXT_VV(vnmsac_vv_h, 2)
GEN_VEXT_VV(vnmsac_vv_w, 4)
GEN_VEXT_VV(vnmsac_vv_d, 8)
GEN_VEXT_VV(vmadd_vv_b, 1)
GEN_VEXT_VV(vmadd_vv_h, 2)
GEN_VEXT_VV(vmadd_vv_w, 4)
GEN_VEXT_VV(vmadd_vv_d, 8)
GEN_VEXT_VV(vnmsub_vv_b, 1)
GEN_VEXT_VV(vnmsub_vv_h, 2)
GEN_VEXT_VV(vnmsub_vv_w, 4)
GEN_VEXT_VV(vnmsub_vv_d, 8)
1704 
/*
 * Expand one ternary vector-scalar element op: like OPIVV3 but the
 * second operand is the scalar s1, truncated/extended via (TX1)(T1).
 */
#define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
{                                                                   \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
    TD d = *((TD *)vd + HD(i));                                     \
    *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
}
1712 
/* Multiply-add, vector-scalar forms. */
RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
GEN_VEXT_VX(vmacc_vx_b, 1)
GEN_VEXT_VX(vmacc_vx_h, 2)
GEN_VEXT_VX(vmacc_vx_w, 4)
GEN_VEXT_VX(vmacc_vx_d, 8)
GEN_VEXT_VX(vnmsac_vx_b, 1)
GEN_VEXT_VX(vnmsac_vx_h, 2)
GEN_VEXT_VX(vnmsac_vx_w, 4)
GEN_VEXT_VX(vnmsac_vx_d, 8)
GEN_VEXT_VX(vmadd_vx_b, 1)
GEN_VEXT_VX(vmadd_vx_h, 2)
GEN_VEXT_VX(vmadd_vx_w, 4)
GEN_VEXT_VX(vmadd_vx_d, 8)
GEN_VEXT_VX(vnmsub_vx_b, 1)
GEN_VEXT_VX(vnmsub_vx_h, 2)
GEN_VEXT_VX(vnmsub_vx_w, 4)
GEN_VEXT_VX(vnmsub_vx_d, 8)
1745 
/* Vector Widening Integer Multiply-Add Instructions */
/*
 * WOP_* lists widen the accumulator to 2*SEW.  vwmaccus (unsigned
 * vector x signed scalar) only exists in the vector-scalar form.
 */
RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
GEN_VEXT_VV(vwmaccu_vv_b, 2)
GEN_VEXT_VV(vwmaccu_vv_h, 4)
GEN_VEXT_VV(vwmaccu_vv_w, 8)
GEN_VEXT_VV(vwmacc_vv_b, 2)
GEN_VEXT_VV(vwmacc_vv_h, 4)
GEN_VEXT_VV(vwmacc_vv_w, 8)
GEN_VEXT_VV(vwmaccsu_vv_b, 2)
GEN_VEXT_VV(vwmaccsu_vv_h, 4)
GEN_VEXT_VV(vwmaccsu_vv_w, 8)

RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
GEN_VEXT_VX(vwmaccu_vx_b, 2)
GEN_VEXT_VX(vwmaccu_vx_h, 4)
GEN_VEXT_VX(vwmaccu_vx_w, 8)
GEN_VEXT_VX(vwmacc_vx_b, 2)
GEN_VEXT_VX(vwmacc_vx_h, 4)
GEN_VEXT_VX(vwmacc_vx_w, 8)
GEN_VEXT_VX(vwmaccsu_vx_b, 2)
GEN_VEXT_VX(vwmaccsu_vx_h, 4)
GEN_VEXT_VX(vwmaccsu_vx_w, 8)
GEN_VEXT_VX(vwmaccus_vx_b, 2)
GEN_VEXT_VX(vwmaccus_vx_h, 4)
GEN_VEXT_VX(vwmaccus_vx_w, 8)
1790 
/* Vector Integer Merge and Move Instructions */
/*
 * vmv.v.v helper: unmasked copy of elements env->vstart..vl-1 from
 * vs1 to vd; vstart is reset to 0 afterwards and tail elements are
 * handled by vext_set_elems_1s according to the vta policy.
 */
#define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
                  uint32_t desc)                                     \
{                                                                    \
    uint32_t vl = env->vl;                                           \
    uint32_t esz = sizeof(ETYPE);                                    \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
    uint32_t vta = vext_vta(desc);                                   \
    uint32_t i;                                                      \
                                                                     \
    for (i = env->vstart; i < vl; i++) {                             \
        ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
        *((ETYPE *)vd + H(i)) = s1;                                  \
    }                                                                \
    env->vstart = 0;                                                 \
    /* set tail elements to 1s */                                    \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
}

GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
1815 
/*
 * vmv.v.x / vmv.v.i helper: splat the scalar s1 (truncated to ETYPE)
 * into elements env->vstart..vl-1 of vd; vstart is reset and tail
 * elements are handled per vta.
 */
#define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
                  uint32_t desc)                                     \
{                                                                    \
    uint32_t vl = env->vl;                                           \
    uint32_t esz = sizeof(ETYPE);                                    \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
    uint32_t vta = vext_vta(desc);                                   \
    uint32_t i;                                                      \
                                                                     \
    for (i = env->vstart; i < vl; i++) {                             \
        *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
    }                                                                \
    env->vstart = 0;                                                 \
    /* set tail elements to 1s */                                    \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
}

GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
1838 
/*
 * vmerge.vvm helper: per element, select vs1 where the v0 mask bit is
 * set, otherwise vs2; tail elements are handled per vta.
 */
#define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
                  CPURISCVState *env, uint32_t desc)                 \
{                                                                    \
    uint32_t vl = env->vl;                                           \
    uint32_t esz = sizeof(ETYPE);                                    \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
    uint32_t vta = vext_vta(desc);                                   \
    uint32_t i;                                                      \
                                                                     \
    for (i = env->vstart; i < vl; i++) {                             \
        ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
        *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
    }                                                                \
    env->vstart = 0;                                                 \
    /* set tail elements to 1s */                                    \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
}

GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
1862 
/*
 * vmerge.vxm / vmerge.vim helper: per element, select the scalar s1
 * (sign-extended through target_long, then truncated to ETYPE) where
 * the v0 mask bit is set, otherwise vs2; tail handled per vta.
 */
#define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
                  void *vs2, CPURISCVState *env, uint32_t desc)      \
{                                                                    \
    uint32_t vl = env->vl;                                           \
    uint32_t esz = sizeof(ETYPE);                                    \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
    uint32_t vta = vext_vta(desc);                                   \
    uint32_t i;                                                      \
                                                                     \
    for (i = env->vstart; i < vl; i++) {                             \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
        ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
                   (ETYPE)(target_long)s1);                          \
        *((ETYPE *)vd + H(i)) = d;                                   \
    }                                                                \
    env->vstart = 0;                                                 \
    /* set tail elements to 1s */                                    \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
}

GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
1888 
1889 /*
1890  * Vector Fixed-Point Arithmetic Instructions
1891  */
1892 
1893 /* Vector Single-Width Saturating Add and Subtract */
1894 
1895 /*
1896  * As fixed point instructions probably have round mode and saturation,
1897  * define common macros for fixed point here.
1898  */
/*
 * Per-element worker for fixed-point vector-vector ops; vxrm is the
 * rounding mode the element op should apply.
 */
typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
                          CPURISCVState *env, int vxrm);

/*
 * Expand one fixed-point vector-vector element op: load the i'th
 * elements of vs1/vs2, apply OP with the rounding mode, store to vd.
 */
#define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
static inline void                                                  \
do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
          CPURISCVState *env, int vxrm)                             \
{                                                                   \
    TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
    *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
}
1911 
1912 static inline void
1913 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
1914              CPURISCVState *env,
1915              uint32_t vl, uint32_t vm, int vxrm,
1916              opivv2_rm_fn *fn, uint32_t vma, uint32_t esz)
1917 {
1918     for (uint32_t i = env->vstart; i < vl; i++) {
1919         if (!vm && !vext_elem_mask(v0, i)) {
1920             /* set masked-off elements to 1s */
1921             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
1922             continue;
1923         }
1924         fn(vd, vs1, vs2, i, env, vxrm);
1925     }
1926     env->vstart = 0;
1927 }
1928 
/*
 * Dispatch a fixed-point vector-vector op on the current rounding
 * mode (env->vxrm: 0=rnu, 1=rne, 2=rdn, anything else treated as
 * 3=rod), then set tail elements per vta.  The switch passes the
 * rounding mode as a literal — presumably so it constant-folds when
 * vext_vv_rm_1 and fn are inlined; confirm against codegen.
 */
static inline void
vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
             CPURISCVState *env,
             uint32_t desc,
             opivv2_rm_fn *fn, uint32_t esz)
{
    uint32_t vm = vext_vm(desc);
    uint32_t vl = env->vl;
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
    uint32_t vta = vext_vta(desc);
    uint32_t vma = vext_vma(desc);

    switch (env->vxrm) {
    case 0: /* rnu */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     env, vl, vm, 0, fn, vma, esz);
        break;
    case 1: /* rne */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     env, vl, vm, 1, fn, vma, esz);
        break;
    case 2: /* rdn */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     env, vl, vm, 2, fn, vma, esz);
        break;
    default: /* rod */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     env, vl, vm, 3, fn, vma, esz);
        break;
    }
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
}
1962 
/* generate helpers for fixed point instructions with OPIVV format */
/* ESZ is the element size in bytes of the destination. */
#define GEN_VEXT_VV_RM(NAME, ESZ)                               \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
                  CPURISCVState *env, uint32_t desc)            \
{                                                               \
    vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
                 do_##NAME, ESZ);                               \
}
1971 
1972 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a,
1973                              uint8_t b)
1974 {
1975     uint8_t res = a + b;
1976     if (res < a) {
1977         res = UINT8_MAX;
1978         env->vxsat = 0x1;
1979     }
1980     return res;
1981 }
1982 
1983 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
1984                                uint16_t b)
1985 {
1986     uint16_t res = a + b;
1987     if (res < a) {
1988         res = UINT16_MAX;
1989         env->vxsat = 0x1;
1990     }
1991     return res;
1992 }
1993 
1994 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
1995                                uint32_t b)
1996 {
1997     uint32_t res = a + b;
1998     if (res < a) {
1999         res = UINT32_MAX;
2000         env->vxsat = 0x1;
2001     }
2002     return res;
2003 }
2004 
2005 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2006                                uint64_t b)
2007 {
2008     uint64_t res = a + b;
2009     if (res < a) {
2010         res = UINT64_MAX;
2011         env->vxsat = 0x1;
2012     }
2013     return res;
2014 }
2015 
/* vsaddu.vv: unsigned saturating add, vector-vector forms. */
RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2024 
/*
 * Per-element worker for fixed-point vector-scalar ops; vxrm is the
 * rounding mode the element op should apply.
 */
typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
                          CPURISCVState *env, int vxrm);

/*
 * Expand one fixed-point vector-scalar element op: the scalar s1 is
 * truncated/extended via (TX1)(T1) before being passed to OP.
 */
#define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
static inline void                                                  \
do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
          CPURISCVState *env, int vxrm)                             \
{                                                                   \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
    *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
}
2036 
2037 static inline void
2038 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2039              CPURISCVState *env,
2040              uint32_t vl, uint32_t vm, int vxrm,
2041              opivx2_rm_fn *fn, uint32_t vma, uint32_t esz)
2042 {
2043     for (uint32_t i = env->vstart; i < vl; i++) {
2044         if (!vm && !vext_elem_mask(v0, i)) {
2045             /* set masked-off elements to 1s */
2046             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2047             continue;
2048         }
2049         fn(vd, s1, vs2, i, env, vxrm);
2050     }
2051     env->vstart = 0;
2052 }
2053 
/*
 * Dispatch a fixed-point vector-scalar op on the current rounding
 * mode (env->vxrm: 0=rnu, 1=rne, 2=rdn, anything else treated as
 * 3=rod), then set tail elements per vta.  Mirrors vext_vv_rm_2.
 */
static inline void
vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
             CPURISCVState *env,
             uint32_t desc,
             opivx2_rm_fn *fn, uint32_t esz)
{
    uint32_t vm = vext_vm(desc);
    uint32_t vl = env->vl;
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
    uint32_t vta = vext_vta(desc);
    uint32_t vma = vext_vma(desc);

    switch (env->vxrm) {
    case 0: /* rnu */
        vext_vx_rm_1(vd, v0, s1, vs2,
                     env, vl, vm, 0, fn, vma, esz);
        break;
    case 1: /* rne */
        vext_vx_rm_1(vd, v0, s1, vs2,
                     env, vl, vm, 1, fn, vma, esz);
        break;
    case 2: /* rdn */
        vext_vx_rm_1(vd, v0, s1, vs2,
                     env, vl, vm, 2, fn, vma, esz);
        break;
    default: /* rod */
        vext_vx_rm_1(vd, v0, s1, vs2,
                     env, vl, vm, 3, fn, vma, esz);
        break;
    }
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
}
2087 
/*
 * Generate the HELPER() entry point for a fixed-point instruction with
 * OPIVX format: forwards to vext_vx_rm_2 with the per-element function
 * do_<NAME> and element size ESZ (in bytes).
 */
#define GEN_VEXT_VX_RM(NAME, ESZ)                         \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
                 do_##NAME, ESZ);                         \
}
2097 
/* vsaddu.vx: saturating unsigned add, vector-scalar, SEW = 8/16/32/64. */
RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2106 
2107 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2108 {
2109     int8_t res = a + b;
2110     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2111         res = a > 0 ? INT8_MAX : INT8_MIN;
2112         env->vxsat = 0x1;
2113     }
2114     return res;
2115 }
2116 
2117 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a,
2118                              int16_t b)
2119 {
2120     int16_t res = a + b;
2121     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2122         res = a > 0 ? INT16_MAX : INT16_MIN;
2123         env->vxsat = 0x1;
2124     }
2125     return res;
2126 }
2127 
2128 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a,
2129                              int32_t b)
2130 {
2131     int32_t res = a + b;
2132     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2133         res = a > 0 ? INT32_MAX : INT32_MIN;
2134         env->vxsat = 0x1;
2135     }
2136     return res;
2137 }
2138 
2139 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a,
2140                              int64_t b)
2141 {
2142     int64_t res = a + b;
2143     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2144         res = a > 0 ? INT64_MAX : INT64_MIN;
2145         env->vxsat = 0x1;
2146     }
2147     return res;
2148 }
2149 
/* vsadd.vv / vsadd.vx: saturating signed add, SEW = 8/16/32/64. */
RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
GEN_VEXT_VV_RM(vsadd_vv_b, 1)
GEN_VEXT_VV_RM(vsadd_vv_h, 2)
GEN_VEXT_VV_RM(vsadd_vv_w, 4)
GEN_VEXT_VV_RM(vsadd_vv_d, 8)

RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
GEN_VEXT_VX_RM(vsadd_vx_b, 1)
GEN_VEXT_VX_RM(vsadd_vx_h, 2)
GEN_VEXT_VX_RM(vsadd_vx_w, 4)
GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2167 
2168 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a,
2169                              uint8_t b)
2170 {
2171     uint8_t res = a - b;
2172     if (res > a) {
2173         res = 0;
2174         env->vxsat = 0x1;
2175     }
2176     return res;
2177 }
2178 
2179 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2180                                uint16_t b)
2181 {
2182     uint16_t res = a - b;
2183     if (res > a) {
2184         res = 0;
2185         env->vxsat = 0x1;
2186     }
2187     return res;
2188 }
2189 
2190 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2191                                uint32_t b)
2192 {
2193     uint32_t res = a - b;
2194     if (res > a) {
2195         res = 0;
2196         env->vxsat = 0x1;
2197     }
2198     return res;
2199 }
2200 
2201 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2202                                uint64_t b)
2203 {
2204     uint64_t res = a - b;
2205     if (res > a) {
2206         res = 0;
2207         env->vxsat = 0x1;
2208     }
2209     return res;
2210 }
2211 
/* vssubu.vv / vssubu.vx: saturating unsigned subtract, SEW = 8/16/32/64. */
RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
GEN_VEXT_VV_RM(vssubu_vv_b, 1)
GEN_VEXT_VV_RM(vssubu_vv_h, 2)
GEN_VEXT_VV_RM(vssubu_vv_w, 4)
GEN_VEXT_VV_RM(vssubu_vv_d, 8)

RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
GEN_VEXT_VX_RM(vssubu_vx_b, 1)
GEN_VEXT_VX_RM(vssubu_vx_h, 2)
GEN_VEXT_VX_RM(vssubu_vx_w, 4)
GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2229 
2230 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2231 {
2232     int8_t res = a - b;
2233     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2234         res = a >= 0 ? INT8_MAX : INT8_MIN;
2235         env->vxsat = 0x1;
2236     }
2237     return res;
2238 }
2239 
2240 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a,
2241                              int16_t b)
2242 {
2243     int16_t res = a - b;
2244     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2245         res = a >= 0 ? INT16_MAX : INT16_MIN;
2246         env->vxsat = 0x1;
2247     }
2248     return res;
2249 }
2250 
2251 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a,
2252                              int32_t b)
2253 {
2254     int32_t res = a - b;
2255     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2256         res = a >= 0 ? INT32_MAX : INT32_MIN;
2257         env->vxsat = 0x1;
2258     }
2259     return res;
2260 }
2261 
2262 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a,
2263                              int64_t b)
2264 {
2265     int64_t res = a - b;
2266     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2267         res = a >= 0 ? INT64_MAX : INT64_MIN;
2268         env->vxsat = 0x1;
2269     }
2270     return res;
2271 }
2272 
/* vssub.vv / vssub.vx: saturating signed subtract, SEW = 8/16/32/64. */
RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
GEN_VEXT_VV_RM(vssub_vv_b, 1)
GEN_VEXT_VV_RM(vssub_vv_h, 2)
GEN_VEXT_VV_RM(vssub_vv_w, 4)
GEN_VEXT_VV_RM(vssub_vv_d, 8)

RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
GEN_VEXT_VX_RM(vssub_vx_b, 1)
GEN_VEXT_VX_RM(vssub_vx_h, 2)
GEN_VEXT_VX_RM(vssub_vx_w, 4)
GEN_VEXT_VX_RM(vssub_vx_d, 8)
2290 
2291 /* Vector Single-Width Averaging Add and Subtract */
/*
 * Compute the rounding increment (0 or 1) for a right shift of @v by
 * @shift bits under fixed-point rounding mode @vxrm (RVV encoding):
 *   0 (rnu): round-to-nearest-up -- add the most significant discarded bit.
 *   1 (rne): round-to-nearest-even -- as rnu, breaking ties toward even.
 *   2 (rdn): round-down (truncate) -- add nothing.
 *   3 (rod): round-to-odd ("jam") -- set the LSB if any bit was discarded
 *            and the result's LSB is clear.
 * The caller adds the returned value to (v >> shift).
 *
 * Bug fix: the original extracted bit @shift from @v *before* validating
 * @shift.  extract64() requires start + length <= 64, so shift == 64
 * (which the guard admits) or shift > 64 violated its precondition.
 * Validate first, and treat the surviving bit as 0 when shift == 64
 * (everything is shifted out).
 */
static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
{
    uint8_t d, d1;
    uint64_t D1, D2;

    if (shift == 0 || shift > 64) {
        return 0;
    }

    /* Bit that survives the shift (the would-be LSB of the result). */
    d = shift < 64 ? extract64(v, shift, 1) : 0;
    d1 = extract64(v, shift - 1, 1);  /* most significant discarded bit */
    D1 = extract64(v, 0, shift);      /* all discarded bits */
    if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
        return d1;
    } else if (vxrm == 1) { /* round-to-nearest-even */
        if (shift > 1) {
            D2 = extract64(v, 0, shift - 1);
            return d1 & ((D2 != 0) | d);
        } else {
            return d1 & d;
        }
    } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
        return !d & (D1 != 0);
    }
    return 0; /* round-down (truncate) */
}
2318 
/*
 * Averaging signed add, (a + b + rounding) >> 1, computed in 64 bits so
 * the intermediate sum cannot overflow.  Shared by the 8/16/32-bit
 * vaadd variants (narrower inputs arrive sign-extended).
 */
static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a,
                             int32_t b)
{
    int64_t res = (int64_t)a + b;
    uint8_t round = get_round(vxrm, res, 1);

    return (res >> 1) + round;
}
2327 
/*
 * 64-bit averaging signed add.  The sum may wrap; the XOR with @over
 * repairs bit 63 afterwards.  NOTE(review): relies on wrapping signed
 * addition being well-defined for this build -- confirm build flags.
 */
static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a,
                             int64_t b)
{
    int64_t res = a + b;
    uint8_t round = get_round(vxrm, res, 1);
    /* Overflow occurred iff both operands have the sign res lacks. */
    int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;

    /* With signed overflow, bit 64 is inverse of bit 63. */
    return ((res >> 1) ^ over) + round;
}
2338 
/* vaadd.vv / vaadd.vx: averaging signed add, SEW = 8/16/32/64. */
RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
GEN_VEXT_VV_RM(vaadd_vv_b, 1)
GEN_VEXT_VV_RM(vaadd_vv_h, 2)
GEN_VEXT_VV_RM(vaadd_vv_w, 4)
GEN_VEXT_VV_RM(vaadd_vv_d, 8)

RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
GEN_VEXT_VX_RM(vaadd_vx_b, 1)
GEN_VEXT_VX_RM(vaadd_vx_h, 2)
GEN_VEXT_VX_RM(vaadd_vx_w, 4)
GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2356 
/*
 * Averaging unsigned add, computed in 64 bits so the sum cannot wrap.
 * Shared by the 8/16/32-bit vaaddu variants (inputs zero-extended).
 */
static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
                               uint32_t a, uint32_t b)
{
    uint64_t res = (uint64_t)a + b;
    uint8_t round = get_round(vxrm, res, 1);

    return (res >> 1) + round;
}
2365 
/* 64-bit averaging unsigned add; recover the carry out of bit 63. */
static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
                               uint64_t a, uint64_t b)
{
    uint64_t res = a + b;
    uint8_t round = get_round(vxrm, res, 1);
    /* res < a iff the unsigned addition wrapped (carry into bit 64). */
    uint64_t over = (uint64_t)(res < a) << 63;

    return ((res >> 1) | over) + round;
}
2375 
/* vaaddu.vv / vaaddu.vx: averaging unsigned add, SEW = 8/16/32/64. */
RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
GEN_VEXT_VV_RM(vaaddu_vv_d, 8)

RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2393 
/*
 * Averaging signed subtract, (a - b + rounding) >> 1, computed in 64 bits
 * so the difference cannot overflow.  Shared by the 8/16/32-bit variants.
 */
static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a,
                             int32_t b)
{
    int64_t res = (int64_t)a - b;
    uint8_t round = get_round(vxrm, res, 1);

    return (res >> 1) + round;
}
2402 
/*
 * 64-bit averaging signed subtract.  The difference may wrap; the XOR
 * with @over repairs bit 63.  NOTE(review): relies on wrapping signed
 * subtraction being well-defined for this build -- confirm build flags.
 */
static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a,
                             int64_t b)
{
    int64_t res = (int64_t)a - b;
    uint8_t round = get_round(vxrm, res, 1);
    /* Overflow iff the operands differ in sign and res's sign != a's. */
    int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;

    /* With signed overflow, bit 64 is inverse of bit 63. */
    return ((res >> 1) ^ over) + round;
}
2413 
/* vasub.vv / vasub.vx: averaging signed subtract, SEW = 8/16/32/64. */
RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
GEN_VEXT_VV_RM(vasub_vv_b, 1)
GEN_VEXT_VV_RM(vasub_vv_h, 2)
GEN_VEXT_VV_RM(vasub_vv_w, 4)
GEN_VEXT_VV_RM(vasub_vv_d, 8)

RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
GEN_VEXT_VX_RM(vasub_vx_b, 1)
GEN_VEXT_VX_RM(vasub_vx_h, 2)
GEN_VEXT_VX_RM(vasub_vx_w, 4)
GEN_VEXT_VX_RM(vasub_vx_d, 8)
2431 
/*
 * Averaging unsigned subtract, computed as a signed 64-bit difference so
 * a borrow is representable.  Shared by the 8/16/32-bit variants.
 */
static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
                               uint32_t a, uint32_t b)
{
    int64_t res = (int64_t)a - b;
    uint8_t round = get_round(vxrm, res, 1);

    return (res >> 1) + round;
}
2440 
/* 64-bit averaging unsigned subtract; recover the borrow into bit 63. */
static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
                               uint64_t a, uint64_t b)
{
    uint64_t res = (uint64_t)a - b;
    uint8_t round = get_round(vxrm, res, 1);
    /* res > a iff the unsigned subtraction borrowed. */
    uint64_t over = (uint64_t)(res > a) << 63;

    return ((res >> 1) | over) + round;
}
2450 
/* vasubu.vv / vasubu.vx: averaging unsigned subtract, SEW = 8/16/32/64. */
RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
GEN_VEXT_VV_RM(vasubu_vv_b, 1)
GEN_VEXT_VV_RM(vasubu_vv_h, 2)
GEN_VEXT_VV_RM(vasubu_vv_w, 4)
GEN_VEXT_VV_RM(vasubu_vv_d, 8)

RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
GEN_VEXT_VX_RM(vasubu_vx_b, 1)
GEN_VEXT_VX_RM(vasubu_vx_h, 2)
GEN_VEXT_VX_RM(vasubu_vx_w, 4)
GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2468 
2469 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
/*
 * vsmul helper: (a * b) >> 7 with vxrm rounding, saturated to int8_t.
 * The product is formed in 16 bits so it cannot overflow before the
 * shift; sets vxsat when the result clips.
 */
static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
{
    uint8_t round;
    int16_t res;

    res = (int16_t)a * (int16_t)b;
    round = get_round(vxrm, res, 7);
    res = (res >> 7) + round;

    if (res > INT8_MAX) {
        env->vxsat = 0x1;
        return INT8_MAX;
    } else if (res < INT8_MIN) {
        env->vxsat = 0x1;
        return INT8_MIN;
    } else {
        return res;
    }
}
2489 
/* vsmul helper: (a * b) >> 15 with vxrm rounding, saturated to int16_t. */
static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
{
    uint8_t round;
    int32_t res;

    res = (int32_t)a * (int32_t)b;
    round = get_round(vxrm, res, 15);
    res = (res >> 15) + round;

    if (res > INT16_MAX) {
        env->vxsat = 0x1;
        return INT16_MAX;
    } else if (res < INT16_MIN) {
        env->vxsat = 0x1;
        return INT16_MIN;
    } else {
        return res;
    }
}
2509 
/* vsmul helper: (a * b) >> 31 with vxrm rounding, saturated to int32_t. */
static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
{
    uint8_t round;
    int64_t res;

    res = (int64_t)a * (int64_t)b;
    round = get_round(vxrm, res, 31);
    res = (res >> 31) + round;

    if (res > INT32_MAX) {
        env->vxsat = 0x1;
        return INT32_MAX;
    } else if (res < INT32_MIN) {
        env->vxsat = 0x1;
        return INT32_MIN;
    } else {
        return res;
    }
}
2529 
/*
 * vsmul helper: (a * b) >> 63 with vxrm rounding, saturated to int64_t.
 * Uses muls64() for the full 128-bit product and reassembles the shifted
 * result from the high/low halves.
 */
static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
{
    uint8_t round;
    uint64_t hi_64, lo_64;
    int64_t res;

    /*
     * INT64_MIN * INT64_MIN is the only product that would overflow
     * after the >> 63 shift; saturate it up front.
     */
    if (a == INT64_MIN && b == INT64_MIN) {
        env->vxsat = 1;
        return INT64_MAX;
    }

    muls64(&lo_64, &hi_64, a, b);
    round = get_round(vxrm, lo_64, 63);
    /*
     * Cannot overflow, as there are always
     * 2 sign bits after multiply.
     */
    res = (hi_64 << 1) | (lo_64 >> 63);
    if (round) {
        /* Rounding up from INT64_MAX would overflow: saturate instead. */
        if (res == INT64_MAX) {
            env->vxsat = 1;
        } else {
            res += 1;
        }
    }
    return res;
}
2557 
/* vsmul.vv / vsmul.vx: fractional multiply with rounding/saturation. */
RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
GEN_VEXT_VV_RM(vsmul_vv_b, 1)
GEN_VEXT_VV_RM(vsmul_vv_h, 2)
GEN_VEXT_VV_RM(vsmul_vv_w, 4)
GEN_VEXT_VV_RM(vsmul_vv_d, 8)

RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
GEN_VEXT_VX_RM(vsmul_vx_b, 1)
GEN_VEXT_VX_RM(vsmul_vx_h, 2)
GEN_VEXT_VX_RM(vsmul_vx_w, 4)
GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2575 
2576 /* Vector Single-Width Scaling Shift Instructions */
2577 static inline uint8_t
2578 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2579 {
2580     uint8_t round, shift = b & 0x7;
2581     uint8_t res;
2582 
2583     round = get_round(vxrm, a, shift);
2584     res = (a >> shift) + round;
2585     return res;
2586 }
/* Scaling logical right shift with vxrm rounding, 16-bit element. */
static inline uint16_t
vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
{
    uint8_t round, shift = b & 0xf;

    round = get_round(vxrm, a, shift);
    return (a >> shift) + round;
}
/* Scaling logical right shift with vxrm rounding, 32-bit element. */
static inline uint32_t
vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
{
    uint8_t round, shift = b & 0x1f;

    round = get_round(vxrm, a, shift);
    return (a >> shift) + round;
}
/* Scaling logical right shift with vxrm rounding, 64-bit element. */
static inline uint64_t
vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
{
    uint8_t round, shift = b & 0x3f;

    round = get_round(vxrm, a, shift);
    return (a >> shift) + round;
}
/* vssrl.vv / vssrl.vx: scaling shift right logical, SEW = 8/16/32/64. */
RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
GEN_VEXT_VV_RM(vssrl_vv_b, 1)
GEN_VEXT_VV_RM(vssrl_vv_h, 2)
GEN_VEXT_VV_RM(vssrl_vv_w, 4)
GEN_VEXT_VV_RM(vssrl_vv_d, 8)

RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
GEN_VEXT_VX_RM(vssrl_vx_b, 1)
GEN_VEXT_VX_RM(vssrl_vx_h, 2)
GEN_VEXT_VX_RM(vssrl_vx_w, 4)
GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2628 
/*
 * Scaling arithmetic right shift with vxrm rounding, 8-bit element.
 * NOTE(review): >> on a negative value is implementation-defined
 * arithmetic shift -- confirm against the project's supported compilers.
 */
static inline int8_t
vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
{
    uint8_t round, shift = b & 0x7;

    round = get_round(vxrm, a, shift);
    return (a >> shift) + round;
}
/* Scaling arithmetic right shift with vxrm rounding, 16-bit element. */
static inline int16_t
vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
{
    uint8_t round, shift = b & 0xf;

    round = get_round(vxrm, a, shift);
    return (a >> shift) + round;
}
/* Scaling arithmetic right shift with vxrm rounding, 32-bit element. */
static inline int32_t
vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
{
    uint8_t round, shift = b & 0x1f;

    round = get_round(vxrm, a, shift);
    return (a >> shift) + round;
}
/* Scaling arithmetic right shift with vxrm rounding, 64-bit element. */
static inline int64_t
vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
{
    uint8_t round, shift = b & 0x3f;

    round = get_round(vxrm, a, shift);
    return (a >> shift) + round;
}
2661 
/* vssra.vv / vssra.vx: scaling shift right arithmetic, SEW = 8/16/32/64. */
RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
GEN_VEXT_VV_RM(vssra_vv_b, 1)
GEN_VEXT_VV_RM(vssra_vv_h, 2)
GEN_VEXT_VV_RM(vssra_vv_w, 4)
GEN_VEXT_VV_RM(vssra_vv_d, 8)

RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
GEN_VEXT_VX_RM(vssra_vx_b, 1)
GEN_VEXT_VX_RM(vssra_vx_h, 2)
GEN_VEXT_VX_RM(vssra_vx_w, 4)
GEN_VEXT_VX_RM(vssra_vx_d, 8)
2679 
2680 /* Vector Narrowing Fixed-Point Clip Instructions */
/*
 * Narrowing clip: shift a 16-bit source right (with vxrm rounding) and
 * saturate to int8_t; sets vxsat when the result clips.  A shift of 0
 * yields round == 0 (see get_round), so the rounded add cannot overflow
 * the 16-bit intermediate.
 */
static inline int8_t
vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
{
    uint8_t round, shift = b & 0xf;
    int16_t res;

    round = get_round(vxrm, a, shift);
    res = (a >> shift) + round;
    if (res > INT8_MAX) {
        env->vxsat = 0x1;
        return INT8_MAX;
    } else if (res < INT8_MIN) {
        env->vxsat = 0x1;
        return INT8_MIN;
    } else {
        return res;
    }
}
2699 
/* Narrowing clip: 32-bit source -> int16_t with rounding and saturation. */
static inline int16_t
vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
{
    uint8_t round, shift = b & 0x1f;
    int32_t res;

    round = get_round(vxrm, a, shift);
    res = (a >> shift) + round;
    if (res > INT16_MAX) {
        env->vxsat = 0x1;
        return INT16_MAX;
    } else if (res < INT16_MIN) {
        env->vxsat = 0x1;
        return INT16_MIN;
    } else {
        return res;
    }
}
2718 
/* Narrowing clip: 64-bit source -> int32_t with rounding and saturation. */
static inline int32_t
vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
{
    uint8_t round, shift = b & 0x3f;
    int64_t res;

    round = get_round(vxrm, a, shift);
    res = (a >> shift) + round;
    if (res > INT32_MAX) {
        env->vxsat = 0x1;
        return INT32_MAX;
    } else if (res < INT32_MIN) {
        env->vxsat = 0x1;
        return INT32_MIN;
    } else {
        return res;
    }
}
2737 
/* vnclip.wv / vnclip.wx: signed narrowing clip, 2*SEW -> SEW. */
RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
GEN_VEXT_VV_RM(vnclip_wv_b, 1)
GEN_VEXT_VV_RM(vnclip_wv_h, 2)
GEN_VEXT_VV_RM(vnclip_wv_w, 4)

RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
GEN_VEXT_VX_RM(vnclip_wx_b, 1)
GEN_VEXT_VX_RM(vnclip_wx_h, 2)
GEN_VEXT_VX_RM(vnclip_wx_w, 4)
2751 
/* Unsigned narrowing clip: 16-bit source -> uint8_t, saturate high only. */
static inline uint8_t
vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
{
    uint8_t round, shift = b & 0xf;
    uint16_t res;

    round = get_round(vxrm, a, shift);
    res = (a >> shift) + round;
    if (res > UINT8_MAX) {
        env->vxsat = 0x1;
        return UINT8_MAX;
    } else {
        return res;
    }
}
2767 
/* Unsigned narrowing clip: 32-bit source -> uint16_t, saturate high only. */
static inline uint16_t
vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
{
    uint8_t round, shift = b & 0x1f;
    uint32_t res;

    round = get_round(vxrm, a, shift);
    res = (a >> shift) + round;
    if (res > UINT16_MAX) {
        env->vxsat = 0x1;
        return UINT16_MAX;
    } else {
        return res;
    }
}
2783 
/* Unsigned narrowing clip: 64-bit source -> uint32_t, saturate high only. */
static inline uint32_t
vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
{
    uint8_t round, shift = b & 0x3f;
    uint64_t res;

    round = get_round(vxrm, a, shift);
    res = (a >> shift) + round;
    if (res > UINT32_MAX) {
        env->vxsat = 0x1;
        return UINT32_MAX;
    } else {
        return res;
    }
}
2799 
/* vnclipu.wv / vnclipu.wx: unsigned narrowing clip, 2*SEW -> SEW. */
RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
GEN_VEXT_VV_RM(vnclipu_wv_w, 4)

RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
2813 
2814 /*
2815  * Vector Float Point Arithmetic Instructions
2816  */
2817 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
/*
 * Per-element body for a two-source floating-point vector-vector op:
 * vd[i] = OP(vs2[i], vs1[i]) using the softfloat status in env.
 */
#define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
                      CPURISCVState *env)                      \
{                                                              \
    TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
    *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
}
2826 
/*
 * Generate a HELPER() entry point for a masked FP vector-vector op:
 * loop over active elements from vstart to vl, write 1s to masked-off
 * elements (mask-agnostic) and to the tail (tail-agnostic).
 */
#define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    uint32_t vm = vext_vm(desc);                          \
    uint32_t vl = env->vl;                                \
    uint32_t total_elems =                                \
        vext_get_total_elems(env, desc, ESZ);             \
    uint32_t vta = vext_vta(desc);                        \
    uint32_t vma = vext_vma(desc);                        \
    uint32_t i;                                           \
                                                          \
    for (i = env->vstart; i < vl; i++) {                  \
        if (!vm && !vext_elem_mask(v0, i)) {              \
            /* set masked-off elements to 1s */           \
            vext_set_elems_1s(vd, vma, i * ESZ,           \
                              (i + 1) * ESZ);             \
            continue;                                     \
        }                                                 \
        do_##NAME(vd, vs1, vs2, i, env);                  \
    }                                                     \
    env->vstart = 0;                                      \
    /* set tail elements to 1s */                         \
    vext_set_elems_1s(vd, vta, vl * ESZ,                  \
                      total_elems * ESZ);                 \
}
2854 
/* vfadd.vv: FP add, vector-vector, SEW = 16/32/64. */
RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
2861 
/*
 * Per-element body for a two-source FP vector-scalar op:
 * vd[i] = OP(vs2[i], s1) with s1 narrowed from the 64-bit helper arg.
 */
#define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
                      CPURISCVState *env)                      \
{                                                              \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
    *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
}
2869 
/*
 * Generate a HELPER() entry point for a masked FP vector-scalar op;
 * same masking/tail policy as GEN_VEXT_VV_ENV, with a scalar operand.
 */
#define GEN_VEXT_VF(NAME, ESZ)                            \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    uint32_t vm = vext_vm(desc);                          \
    uint32_t vl = env->vl;                                \
    uint32_t total_elems =                                \
        vext_get_total_elems(env, desc, ESZ);             \
    uint32_t vta = vext_vta(desc);                        \
    uint32_t vma = vext_vma(desc);                        \
    uint32_t i;                                           \
                                                          \
    for (i = env->vstart; i < vl; i++) {                  \
        if (!vm && !vext_elem_mask(v0, i)) {              \
            /* set masked-off elements to 1s */           \
            vext_set_elems_1s(vd, vma, i * ESZ,           \
                              (i + 1) * ESZ);             \
            continue;                                     \
        }                                                 \
        do_##NAME(vd, s1, vs2, i, env);                   \
    }                                                     \
    env->vstart = 0;                                      \
    /* set tail elements to 1s */                         \
    vext_set_elems_1s(vd, vta, vl * ESZ,                  \
                      total_elems * ESZ);                 \
}
2897 
/* vfadd.vf: FP add, vector-scalar, SEW = 16/32/64. */
RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
GEN_VEXT_VF(vfadd_vf_h, 2)
GEN_VEXT_VF(vfadd_vf_w, 4)
GEN_VEXT_VF(vfadd_vf_d, 8)
2904 
/* vfsub.vv / vfsub.vf: FP subtract, SEW = 16/32/64. */
RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
GEN_VEXT_VF(vfsub_vf_h, 2)
GEN_VEXT_VF(vfsub_vf_w, 4)
GEN_VEXT_VF(vfsub_vf_d, 8)
2917 
/* Reverse subtract (b - a) for vfrsub.vf; operand order swapped. */
static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
{
    return float16_sub(b, a, s);
}
2922 
/* Reverse subtract (b - a) for vfrsub.vf; operand order swapped. */
static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
{
    return float32_sub(b, a, s);
}
2927 
/* Reverse subtract (b - a) for vfrsub.vf; operand order swapped. */
static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
{
    return float64_sub(b, a, s);
}
2932 
/* vfrsub.vf: FP reverse subtract, vector-scalar, SEW = 16/32/64. */
RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
GEN_VEXT_VF(vfrsub_vf_h, 2)
GEN_VEXT_VF(vfrsub_vf_w, 4)
GEN_VEXT_VF(vfrsub_vf_d, 8)
2939 
2940 /* Vector Widening Floating-Point Add/Subtract Instructions */
/* Widening FP add: promote both f16 operands to f32, then add. */
static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
{
    return float32_add(float16_to_float32(a, true, s),
                       float16_to_float32(b, true, s), s);
}
2946 
/* Widening FP add: promote both f32 operands to f64, then add. */
static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
{
    return float64_add(float32_to_float64(a, s),
                       float32_to_float64(b, s), s);

}
2953 
/* vfwadd.vv / vfwadd.vf: widening FP add, SEW -> 2*SEW. */
RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
GEN_VEXT_VF(vfwadd_vf_h, 4)
GEN_VEXT_VF(vfwadd_vf_w, 8)
2962 
/* Widening FP subtract: promote both f16 operands to f32, then subtract. */
static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
{
    return float32_sub(float16_to_float32(a, true, s),
                       float16_to_float32(b, true, s), s);
}
2968 
/* Widening FP subtract: promote both f32 operands to f64, then subtract. */
static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
{
    return float64_sub(float32_to_float64(a, s),
                       float32_to_float64(b, s), s);

}
2975 
/* vfwsub.vv / vfwsub.vf: 2*SEW result = SEW - SEW */
RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
GEN_VEXT_VF(vfwsub_vf_h, 4)
GEN_VEXT_VF(vfwsub_vf_w, 8)
2984 
2985 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
2986 {
2987     return float32_add(a, float16_to_float32(b, true, s), s);
2988 }
2989 
2990 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
2991 {
2992     return float64_add(a, float32_to_float64(b, s), s);
2993 }
2994 
/* vfwadd.wv / vfwadd.wf: 2*SEW result = 2*SEW + SEW */
RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
GEN_VEXT_VF(vfwadd_wf_h, 4)
GEN_VEXT_VF(vfwadd_wf_w, 8)
3003 
3004 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3005 {
3006     return float32_sub(a, float16_to_float32(b, true, s), s);
3007 }
3008 
3009 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3010 {
3011     return float64_sub(a, float32_to_float64(b, s), s);
3012 }
3013 
/* vfwsub.wv / vfwsub.wf: 2*SEW result = 2*SEW - SEW */
RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
GEN_VEXT_VF(vfwsub_wf_h, 4)
GEN_VEXT_VF(vfwsub_wf_w, 8)
3022 
/* Vector Single-Width Floating-Point Multiply/Divide Instructions */
/* vfmul.vv / vfmul.vf: vd[i] = vs2[i] * (vs1[i] | f[rs1]) */
RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
GEN_VEXT_VF(vfmul_vf_h, 2)
GEN_VEXT_VF(vfmul_vf_w, 4)
GEN_VEXT_VF(vfmul_vf_d, 8)

/* vfdiv.vv / vfdiv.vf: vd[i] = vs2[i] / (vs1[i] | f[rs1]) */
RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
GEN_VEXT_VF(vfdiv_vf_h, 2)
GEN_VEXT_VF(vfdiv_vf_w, 4)
GEN_VEXT_VF(vfdiv_vf_d, 8)
3049 
/*
 * Reversed-operand divide helpers: compute b / a, so that the generated
 * vfrdiv.vf handlers evaluate f[rs1] / vs2[i].
 */
static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
{
    return float16_div(b, a, s);
}

static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
{
    return float32_div(b, a, s);
}

static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
{
    return float64_div(b, a, s);
}

/* vfrdiv.vf: vd[i] = f[rs1] / vs2[i] */
RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
GEN_VEXT_VF(vfrdiv_vf_h, 2)
GEN_VEXT_VF(vfrdiv_vf_w, 4)
GEN_VEXT_VF(vfrdiv_vf_d, 8)
3071 
3072 /* Vector Widening Floating-Point Multiply */
3073 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3074 {
3075     return float32_mul(float16_to_float32(a, true, s),
3076                        float16_to_float32(b, true, s), s);
3077 }
3078 
3079 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3080 {
3081     return float64_mul(float32_to_float64(a, s),
3082                        float32_to_float64(b, s), s);
3083 
3084 }
/* vfwmul.vv / vfwmul.vf: 2*SEW result = SEW * SEW */
RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
GEN_VEXT_VF(vfwmul_vf_h, 4)
GEN_VEXT_VF(vfwmul_vf_w, 8)
3093 
/* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
/*
 * OPFVV3: per-element ternary op for .vv forms.  Note the operand order
 * handed to OP: (vs2, vs1, old vd) -- the previous destination element d
 * is the accumulator.
 */
#define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
                      CPURISCVState *env)                          \
{                                                                  \
    TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
    TD d = *((TD *)vd + HD(i));                                    \
    *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
}
3104 
/* fmacc: vd[i] = +(vs1[i] * vs2[i]) + vd[i] */
static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
{
    return float16_muladd(a, b, d, 0, s);
}

static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
{
    return float32_muladd(a, b, d, 0, s);
}

static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
{
    return float64_muladd(a, b, d, 0, s);
}

RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3126 
/*
 * OPFVF3: per-element ternary op for .vf forms; the 64-bit scalar s1 is
 * truncated to the element type T1 before use.  OP operand order is
 * (vs2, rs1, old vd).
 */
#define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
                      CPURISCVState *env)                         \
{                                                                 \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
    TD d = *((TD *)vd + HD(i));                                   \
    *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
}

RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
GEN_VEXT_VF(vfmacc_vf_h, 2)
GEN_VEXT_VF(vfmacc_vf_w, 4)
GEN_VEXT_VF(vfmacc_vf_d, 8)
3142 
3143 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3144 {
3145     return float16_muladd(a, b, d, float_muladd_negate_c |
3146                                    float_muladd_negate_product, s);
3147 }
3148 
3149 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3150 {
3151     return float32_muladd(a, b, d, float_muladd_negate_c |
3152                                    float_muladd_negate_product, s);
3153 }
3154 
3155 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3156 {
3157     return float64_muladd(a, b, d, float_muladd_negate_c |
3158                                    float_muladd_negate_product, s);
3159 }
3160 
/* vfnmacc.vv / vfnmacc.vf */
RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
GEN_VEXT_VF(vfnmacc_vf_h, 2)
GEN_VEXT_VF(vfnmacc_vf_w, 4)
GEN_VEXT_VF(vfnmacc_vf_d, 8)
3173 
/* fmsac: vd[i] = +(vs1[i] * vs2[i]) - vd[i] */
static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
{
    return float16_muladd(a, b, d, float_muladd_negate_c, s);
}

static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
{
    return float32_muladd(a, b, d, float_muladd_negate_c, s);
}

static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
{
    return float64_muladd(a, b, d, float_muladd_negate_c, s);
}

RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
GEN_VEXT_VF(vfmsac_vf_h, 2)
GEN_VEXT_VF(vfmsac_vf_w, 4)
GEN_VEXT_VF(vfmsac_vf_d, 8)
3201 
/* fnmsac: vd[i] = -(vs1[i] * vs2[i]) + vd[i] */
static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
{
    return float16_muladd(a, b, d, float_muladd_negate_product, s);
}

static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
{
    return float32_muladd(a, b, d, float_muladd_negate_product, s);
}

static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
{
    return float64_muladd(a, b, d, float_muladd_negate_product, s);
}

RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
GEN_VEXT_VF(vfnmsac_vf_h, 2)
GEN_VEXT_VF(vfnmsac_vf_w, 4)
GEN_VEXT_VF(vfnmsac_vf_d, 8)
3229 
/*
 * fmadd: vd[i] = +(vd[i] * vs1[i]) + vs2[i].
 * Here the accumulator d (old vd) is a multiplicand, hence the (d, b, a)
 * operand order passed to the softfloat fused multiply-add.
 */
static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
{
    return float16_muladd(d, b, a, 0, s);
}

static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
{
    return float32_muladd(d, b, a, 0, s);
}

static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
{
    return float64_muladd(d, b, a, 0, s);
}

RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
GEN_VEXT_VF(vfmadd_vf_h, 2)
GEN_VEXT_VF(vfmadd_vf_w, 4)
GEN_VEXT_VF(vfmadd_vf_d, 8)
3257 
/* fnmadd: vd[i] = -(vd[i] * vs1[i]) - vs2[i] */
static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
{
    return float16_muladd(d, b, a, float_muladd_negate_c |
                                   float_muladd_negate_product, s);
}

static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
{
    return float32_muladd(d, b, a, float_muladd_negate_c |
                                   float_muladd_negate_product, s);
}

static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
{
    return float64_muladd(d, b, a, float_muladd_negate_c |
                                   float_muladd_negate_product, s);
}

RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
GEN_VEXT_VF(vfnmadd_vf_h, 2)
GEN_VEXT_VF(vfnmadd_vf_w, 4)
GEN_VEXT_VF(vfnmadd_vf_d, 8)
3288 
/* fmsub: vd[i] = +(vd[i] * vs1[i]) - vs2[i] */
static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
{
    return float16_muladd(d, b, a, float_muladd_negate_c, s);
}

static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
{
    return float32_muladd(d, b, a, float_muladd_negate_c, s);
}

static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
{
    return float64_muladd(d, b, a, float_muladd_negate_c, s);
}

RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
GEN_VEXT_VF(vfmsub_vf_h, 2)
GEN_VEXT_VF(vfmsub_vf_w, 4)
GEN_VEXT_VF(vfmsub_vf_d, 8)
3316 
/* fnmsub: vd[i] = -(vd[i] * vs1[i]) + vs2[i] */
static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
{
    return float16_muladd(d, b, a, float_muladd_negate_product, s);
}

static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
{
    return float32_muladd(d, b, a, float_muladd_negate_product, s);
}

static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
{
    return float64_muladd(d, b, a, float_muladd_negate_product, s);
}

RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
GEN_VEXT_VF(vfnmsub_vf_h, 2)
GEN_VEXT_VF(vfnmsub_vf_w, 4)
GEN_VEXT_VF(vfnmsub_vf_d, 8)
3344 
3345 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3346 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3347 {
3348     return float32_muladd(float16_to_float32(a, true, s),
3349                           float16_to_float32(b, true, s), d, 0, s);
3350 }
3351 
3352 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3353 {
3354     return float64_muladd(float32_to_float64(a, s),
3355                           float32_to_float64(b, s), d, 0, s);
3356 }
3357 
/* vfwmacc.vv / vfwmacc.vf: vd[i] = +(vs1[i] * vs2[i]) + vd[i], 2*SEW vd */
RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
GEN_VEXT_VF(vfwmacc_vf_h, 4)
GEN_VEXT_VF(vfwmacc_vf_w, 8)
3366 
/* fwmaccbf16: widening bfloat16 multiply-accumulate into a float32 vd */
static uint32_t fwmaccbf16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
{
    return float32_muladd(bfloat16_to_float32(a, s),
                          bfloat16_to_float32(b, s), d, 0, s);
}

RVVCALL(OPFVV3, vfwmaccbf16_vv, WOP_UUU_H, H4, H2, H2, fwmaccbf16)
GEN_VEXT_VV_ENV(vfwmaccbf16_vv, 4)
RVVCALL(OPFVF3, vfwmaccbf16_vf, WOP_UUU_H, H4, H2, fwmaccbf16)
GEN_VEXT_VF(vfwmaccbf16_vf, 4)
3377 
3378 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3379 {
3380     return float32_muladd(float16_to_float32(a, true, s),
3381                           float16_to_float32(b, true, s), d,
3382                           float_muladd_negate_c | float_muladd_negate_product,
3383                           s);
3384 }
3385 
3386 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3387 {
3388     return float64_muladd(float32_to_float64(a, s), float32_to_float64(b, s),
3389                           d, float_muladd_negate_c |
3390                              float_muladd_negate_product, s);
3391 }
3392 
/* vfwnmacc.vv / vfwnmacc.vf */
RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
GEN_VEXT_VF(vfwnmacc_vf_h, 4)
GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3401 
3402 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3403 {
3404     return float32_muladd(float16_to_float32(a, true, s),
3405                           float16_to_float32(b, true, s), d,
3406                           float_muladd_negate_c, s);
3407 }
3408 
3409 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3410 {
3411     return float64_muladd(float32_to_float64(a, s),
3412                           float32_to_float64(b, s), d,
3413                           float_muladd_negate_c, s);
3414 }
3415 
/* vfwmsac.vv / vfwmsac.vf */
RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
GEN_VEXT_VF(vfwmsac_vf_h, 4)
GEN_VEXT_VF(vfwmsac_vf_w, 8)
3424 
3425 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3426 {
3427     return float32_muladd(float16_to_float32(a, true, s),
3428                           float16_to_float32(b, true, s), d,
3429                           float_muladd_negate_product, s);
3430 }
3431 
3432 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3433 {
3434     return float64_muladd(float32_to_float64(a, s),
3435                           float32_to_float64(b, s), d,
3436                           float_muladd_negate_product, s);
3437 }
3438 
/* vfwnmsac.vv / vfwnmsac.vf */
RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
GEN_VEXT_VF(vfwnmsac_vf_h, 4)
GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3447 
/* Vector Floating-Point Square-Root Instruction */
/* OPFVV1: per-element unary op: vd[i] = OP(vs2[i]) */
#define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
static void do_##NAME(void *vd, void *vs2, int i,      \
                      CPURISCVState *env)              \
{                                                      \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
    *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
}
3456 
/*
 * Generate a masked unary FP helper for NAME with ESZ-byte elements:
 * applies do_NAME to each active element in [vstart, vl), writes all-1s
 * to masked-off elements (mask-agnostic, vma) and to the tail elements
 * past vl (tail-agnostic, vta), then clears vstart.
 */
#define GEN_VEXT_V_ENV(NAME, ESZ)                      \
void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
                  CPURISCVState *env, uint32_t desc)   \
{                                                      \
    uint32_t vm = vext_vm(desc);                       \
    uint32_t vl = env->vl;                             \
    uint32_t total_elems =                             \
        vext_get_total_elems(env, desc, ESZ);          \
    uint32_t vta = vext_vta(desc);                     \
    uint32_t vma = vext_vma(desc);                     \
    uint32_t i;                                        \
                                                       \
    if (vl == 0) {                                     \
        return;                                        \
    }                                                  \
    for (i = env->vstart; i < vl; i++) {               \
        if (!vm && !vext_elem_mask(v0, i)) {           \
            /* set masked-off elements to 1s */        \
            vext_set_elems_1s(vd, vma, i * ESZ,        \
                              (i + 1) * ESZ);          \
            continue;                                  \
        }                                              \
        do_##NAME(vd, vs2, i, env);                    \
    }                                                  \
    env->vstart = 0;                                   \
    vext_set_elems_1s(vd, vta, vl * ESZ,               \
                      total_elems * ESZ);              \
}
3485 
/* vfsqrt.v: vd[i] = sqrt(vs2[i]) */
RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3492 
3493 /*
3494  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3495  *
3496  * Adapted from riscv-v-spec recip.c:
3497  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3498  */
3499 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3500 {
3501     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3502     uint64_t exp = extract64(f, frac_size, exp_size);
3503     uint64_t frac = extract64(f, 0, frac_size);
3504 
3505     const uint8_t lookup_table[] = {
3506         52, 51, 50, 48, 47, 46, 44, 43,
3507         42, 41, 40, 39, 38, 36, 35, 34,
3508         33, 32, 31, 30, 30, 29, 28, 27,
3509         26, 25, 24, 23, 23, 22, 21, 20,
3510         19, 19, 18, 17, 16, 16, 15, 14,
3511         14, 13, 12, 12, 11, 10, 10, 9,
3512         9, 8, 7, 7, 6, 6, 5, 4,
3513         4, 3, 3, 2, 2, 1, 1, 0,
3514         127, 125, 123, 121, 119, 118, 116, 114,
3515         113, 111, 109, 108, 106, 105, 103, 102,
3516         100, 99, 97, 96, 95, 93, 92, 91,
3517         90, 88, 87, 86, 85, 84, 83, 82,
3518         80, 79, 78, 77, 76, 75, 74, 73,
3519         72, 71, 70, 70, 69, 68, 67, 66,
3520         65, 64, 63, 63, 62, 61, 60, 59,
3521         59, 58, 57, 56, 56, 55, 54, 53
3522     };
3523     const int precision = 7;
3524 
3525     if (exp == 0 && frac != 0) { /* subnormal */
3526         /* Normalize the subnormal. */
3527         while (extract64(frac, frac_size - 1, 1) == 0) {
3528             exp--;
3529             frac <<= 1;
3530         }
3531 
3532         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3533     }
3534 
3535     int idx = ((exp & 1) << (precision - 1)) |
3536               (frac >> (frac_size - precision + 1));
3537     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3538                         (frac_size - precision);
3539     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3540 
3541     uint64_t val = 0;
3542     val = deposit64(val, 0, frac_size, out_frac);
3543     val = deposit64(val, frac_size, exp_size, out_exp);
3544     val = deposit64(val, frac_size + exp_size, 1, sign);
3545     return val;
3546 }
3547 
/* 7-bit-precision reciprocal square-root estimate for float16 inputs. */
static float16 frsqrt7_h(float16 f, float_status *s)
{
    int exp_size = 5, frac_size = 10;
    bool sign = float16_is_neg(f);

    /*
     * frsqrt7(sNaN) = canonical NaN
     * frsqrt7(-inf) = canonical NaN
     * frsqrt7(-normal) = canonical NaN
     * frsqrt7(-subnormal) = canonical NaN
     */
    if (float16_is_signaling_nan(f, s) ||
        (float16_is_infinity(f) && sign) ||
        (float16_is_normal(f) && sign) ||
        (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
        s->float_exception_flags |= float_flag_invalid;
        return float16_default_nan(s);
    }

    /* frsqrt7(qNaN) = canonical NaN */
    if (float16_is_quiet_nan(f, s)) {
        return float16_default_nan(s);
    }

    /* frsqrt7(+-0) = +-inf */
    if (float16_is_zero(f)) {
        s->float_exception_flags |= float_flag_divbyzero;
        return float16_set_sign(float16_infinity, sign);
    }

    /* frsqrt7(+inf) = +0 */
    if (float16_is_infinity(f) && !sign) {
        return float16_set_sign(float16_zero, sign);
    }

    /* +normal, +subnormal */
    uint64_t val = frsqrt7(f, exp_size, frac_size);
    return make_float16(val);
}
3587 
/* 7-bit-precision reciprocal square-root estimate for float32 inputs. */
static float32 frsqrt7_s(float32 f, float_status *s)
{
    int exp_size = 8, frac_size = 23;
    bool sign = float32_is_neg(f);

    /*
     * frsqrt7(sNaN) = canonical NaN
     * frsqrt7(-inf) = canonical NaN
     * frsqrt7(-normal) = canonical NaN
     * frsqrt7(-subnormal) = canonical NaN
     */
    if (float32_is_signaling_nan(f, s) ||
        (float32_is_infinity(f) && sign) ||
        (float32_is_normal(f) && sign) ||
        (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
        s->float_exception_flags |= float_flag_invalid;
        return float32_default_nan(s);
    }

    /* frsqrt7(qNaN) = canonical NaN */
    if (float32_is_quiet_nan(f, s)) {
        return float32_default_nan(s);
    }

    /* frsqrt7(+-0) = +-inf */
    if (float32_is_zero(f)) {
        s->float_exception_flags |= float_flag_divbyzero;
        return float32_set_sign(float32_infinity, sign);
    }

    /* frsqrt7(+inf) = +0 */
    if (float32_is_infinity(f) && !sign) {
        return float32_set_sign(float32_zero, sign);
    }

    /* +normal, +subnormal */
    uint64_t val = frsqrt7(f, exp_size, frac_size);
    return make_float32(val);
}
3627 
/* 7-bit-precision reciprocal square-root estimate for float64 inputs. */
static float64 frsqrt7_d(float64 f, float_status *s)
{
    int exp_size = 11, frac_size = 52;
    bool sign = float64_is_neg(f);

    /*
     * frsqrt7(sNaN) = canonical NaN
     * frsqrt7(-inf) = canonical NaN
     * frsqrt7(-normal) = canonical NaN
     * frsqrt7(-subnormal) = canonical NaN
     */
    if (float64_is_signaling_nan(f, s) ||
        (float64_is_infinity(f) && sign) ||
        (float64_is_normal(f) && sign) ||
        (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
        s->float_exception_flags |= float_flag_invalid;
        return float64_default_nan(s);
    }

    /* frsqrt7(qNaN) = canonical NaN */
    if (float64_is_quiet_nan(f, s)) {
        return float64_default_nan(s);
    }

    /* frsqrt7(+-0) = +-inf */
    if (float64_is_zero(f)) {
        s->float_exception_flags |= float_flag_divbyzero;
        return float64_set_sign(float64_infinity, sign);
    }

    /* frsqrt7(+inf) = +0 */
    if (float64_is_infinity(f) && !sign) {
        return float64_set_sign(float64_zero, sign);
    }

    /* +normal, +subnormal */
    uint64_t val = frsqrt7(f, exp_size, frac_size);
    return make_float64(val);
}
3667 
/* vfrsqrt7.v: vd[i] = estimate of 1 / sqrt(vs2[i]) */
RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3674 
3675 /*
3676  * Vector Floating-Point Reciprocal Estimate Instruction
3677  *
3678  * Adapted from riscv-v-spec recip.c:
3679  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3680  */
3681 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3682                       float_status *s)
3683 {
3684     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3685     uint64_t exp = extract64(f, frac_size, exp_size);
3686     uint64_t frac = extract64(f, 0, frac_size);
3687 
3688     const uint8_t lookup_table[] = {
3689         127, 125, 123, 121, 119, 117, 116, 114,
3690         112, 110, 109, 107, 105, 104, 102, 100,
3691         99, 97, 96, 94, 93, 91, 90, 88,
3692         87, 85, 84, 83, 81, 80, 79, 77,
3693         76, 75, 74, 72, 71, 70, 69, 68,
3694         66, 65, 64, 63, 62, 61, 60, 59,
3695         58, 57, 56, 55, 54, 53, 52, 51,
3696         50, 49, 48, 47, 46, 45, 44, 43,
3697         42, 41, 40, 40, 39, 38, 37, 36,
3698         35, 35, 34, 33, 32, 31, 31, 30,
3699         29, 28, 28, 27, 26, 25, 25, 24,
3700         23, 23, 22, 21, 21, 20, 19, 19,
3701         18, 17, 17, 16, 15, 15, 14, 14,
3702         13, 12, 12, 11, 11, 10, 9, 9,
3703         8, 8, 7, 7, 6, 5, 5, 4,
3704         4, 3, 3, 2, 2, 1, 1, 0
3705     };
3706     const int precision = 7;
3707 
3708     if (exp == 0 && frac != 0) { /* subnormal */
3709         /* Normalize the subnormal. */
3710         while (extract64(frac, frac_size - 1, 1) == 0) {
3711             exp--;
3712             frac <<= 1;
3713         }
3714 
3715         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3716 
3717         if (exp != 0 && exp != UINT64_MAX) {
3718             /*
3719              * Overflow to inf or max value of same sign,
3720              * depending on sign and rounding mode.
3721              */
3722             s->float_exception_flags |= (float_flag_inexact |
3723                                          float_flag_overflow);
3724 
3725             if ((s->float_rounding_mode == float_round_to_zero) ||
3726                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3727                 ((s->float_rounding_mode == float_round_up) && sign)) {
3728                 /* Return greatest/negative finite value. */
3729                 return (sign << (exp_size + frac_size)) |
3730                        (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3731             } else {
3732                 /* Return +-inf. */
3733                 return (sign << (exp_size + frac_size)) |
3734                        MAKE_64BIT_MASK(frac_size, exp_size);
3735             }
3736         }
3737     }
3738 
3739     int idx = frac >> (frac_size - precision);
3740     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3741                         (frac_size - precision);
3742     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3743 
3744     if (out_exp == 0 || out_exp == UINT64_MAX) {
3745         /*
3746          * The result is subnormal, but don't raise the underflow exception,
3747          * because there's no additional loss of precision.
3748          */
3749         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3750         if (out_exp == UINT64_MAX) {
3751             out_frac >>= 1;
3752             out_exp = 0;
3753         }
3754     }
3755 
3756     uint64_t val = 0;
3757     val = deposit64(val, 0, frac_size, out_frac);
3758     val = deposit64(val, frac_size, exp_size, out_exp);
3759     val = deposit64(val, frac_size + exp_size, 1, sign);
3760     return val;
3761 }
3762 
3763 static float16 frec7_h(float16 f, float_status *s)
3764 {
3765     int exp_size = 5, frac_size = 10;
3766     bool sign = float16_is_neg(f);
3767 
3768     /* frec7(+-inf) = +-0 */
3769     if (float16_is_infinity(f)) {
3770         return float16_set_sign(float16_zero, sign);
3771     }
3772 
3773     /* frec7(+-0) = +-inf */
3774     if (float16_is_zero(f)) {
3775         s->float_exception_flags |= float_flag_divbyzero;
3776         return float16_set_sign(float16_infinity, sign);
3777     }
3778 
3779     /* frec7(sNaN) = canonical NaN */
3780     if (float16_is_signaling_nan(f, s)) {
3781         s->float_exception_flags |= float_flag_invalid;
3782         return float16_default_nan(s);
3783     }
3784 
3785     /* frec7(qNaN) = canonical NaN */
3786     if (float16_is_quiet_nan(f, s)) {
3787         return float16_default_nan(s);
3788     }
3789 
3790     /* +-normal, +-subnormal */
3791     uint64_t val = frec7(f, exp_size, frac_size, s);
3792     return make_float16(val);
3793 }
3794 
3795 static float32 frec7_s(float32 f, float_status *s)
3796 {
3797     int exp_size = 8, frac_size = 23;
3798     bool sign = float32_is_neg(f);
3799 
3800     /* frec7(+-inf) = +-0 */
3801     if (float32_is_infinity(f)) {
3802         return float32_set_sign(float32_zero, sign);
3803     }
3804 
3805     /* frec7(+-0) = +-inf */
3806     if (float32_is_zero(f)) {
3807         s->float_exception_flags |= float_flag_divbyzero;
3808         return float32_set_sign(float32_infinity, sign);
3809     }
3810 
3811     /* frec7(sNaN) = canonical NaN */
3812     if (float32_is_signaling_nan(f, s)) {
3813         s->float_exception_flags |= float_flag_invalid;
3814         return float32_default_nan(s);
3815     }
3816 
3817     /* frec7(qNaN) = canonical NaN */
3818     if (float32_is_quiet_nan(f, s)) {
3819         return float32_default_nan(s);
3820     }
3821 
3822     /* +-normal, +-subnormal */
3823     uint64_t val = frec7(f, exp_size, frac_size, s);
3824     return make_float32(val);
3825 }
3826 
3827 static float64 frec7_d(float64 f, float_status *s)
3828 {
3829     int exp_size = 11, frac_size = 52;
3830     bool sign = float64_is_neg(f);
3831 
3832     /* frec7(+-inf) = +-0 */
3833     if (float64_is_infinity(f)) {
3834         return float64_set_sign(float64_zero, sign);
3835     }
3836 
3837     /* frec7(+-0) = +-inf */
3838     if (float64_is_zero(f)) {
3839         s->float_exception_flags |= float_flag_divbyzero;
3840         return float64_set_sign(float64_infinity, sign);
3841     }
3842 
3843     /* frec7(sNaN) = canonical NaN */
3844     if (float64_is_signaling_nan(f, s)) {
3845         s->float_exception_flags |= float_flag_invalid;
3846         return float64_default_nan(s);
3847     }
3848 
3849     /* frec7(qNaN) = canonical NaN */
3850     if (float64_is_quiet_nan(f, s)) {
3851         return float64_default_nan(s);
3852     }
3853 
3854     /* +-normal, +-subnormal */
3855     uint64_t val = frec7(f, exp_size, frac_size, s);
3856     return make_float64(val);
3857 }
3858 
/*
 * vfrec7.v: per-element reciprocal estimate.  RVVCALL binds the scalar
 * op; GEN_VEXT_V_ENV emits the vector helper (numeric arg = SEW in bytes).
 */
RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
GEN_VEXT_V_ENV(vfrec7_v_h, 2)
GEN_VEXT_V_ENV(vfrec7_v_w, 4)
GEN_VEXT_V_ENV(vfrec7_v_d, 8)

/* Vector Floating-Point MIN/MAX Instructions */
/* Uses the softfloat *_minimum_number/*_maximum_number ops (NaN-discarding) */
RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
GEN_VEXT_VF(vfmin_vf_h, 2)
GEN_VEXT_VF(vfmin_vf_w, 4)
GEN_VEXT_VF(vfmin_vf_d, 8)

RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
GEN_VEXT_VF(vfmax_vf_h, 2)
GEN_VEXT_VF(vfmax_vf_w, 4)
GEN_VEXT_VF(vfmax_vf_d, 8)
3892 
3893 /* Vector Floating-Point Sign-Injection Instructions */
3894 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
3895 {
3896     return deposit64(b, 0, 15, a);
3897 }
3898 
3899 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
3900 {
3901     return deposit64(b, 0, 31, a);
3902 }
3903 
3904 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
3905 {
3906     return deposit64(b, 0, 63, a);
3907 }
3908 
/* vfsgnj: vector-vector and vector-scalar forms at SEW = 16/32/64 */
RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
GEN_VEXT_VF(vfsgnj_vf_h, 2)
GEN_VEXT_VF(vfsgnj_vf_w, 4)
GEN_VEXT_VF(vfsgnj_vf_d, 8)
3921 
3922 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
3923 {
3924     return deposit64(~b, 0, 15, a);
3925 }
3926 
3927 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
3928 {
3929     return deposit64(~b, 0, 31, a);
3930 }
3931 
3932 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
3933 {
3934     return deposit64(~b, 0, 63, a);
3935 }
3936 
/* vfsgnjn: vector-vector and vector-scalar forms at SEW = 16/32/64 */
RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
GEN_VEXT_VF(vfsgnjn_vf_h, 2)
GEN_VEXT_VF(vfsgnjn_vf_w, 4)
GEN_VEXT_VF(vfsgnjn_vf_d, 8)
3949 
3950 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
3951 {
3952     return deposit64(b ^ a, 0, 15, a);
3953 }
3954 
3955 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
3956 {
3957     return deposit64(b ^ a, 0, 31, a);
3958 }
3959 
3960 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
3961 {
3962     return deposit64(b ^ a, 0, 63, a);
3963 }
3964 
/* vfsgnjx: vector-vector and vector-scalar forms at SEW = 16/32/64 */
RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
GEN_VEXT_VF(vfsgnjx_vf_h, 2)
GEN_VEXT_VF(vfsgnjx_vf_w, 4)
GEN_VEXT_VF(vfsgnjx_vf_d, 8)
3977 
3978 /* Vector Floating-Point Compare Instructions */
/*
 * Vector-vector float compare producing a mask register:
 * vd.mask[i] = DO_OP(vs2[i], vs1[i]).  Masked-off elements are set to
 * 1s when the mask-agnostic policy (vma) is in effect; tail elements
 * of a mask destination are always tail-agnostic.
 *
 * Note: every line of the macro body, including comment lines, carries
 * an explicit '\' continuation.  Relying on comment removal happening
 * before directive parsing works, but is fragile under edits and tools.
 */
#define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    uint32_t vm = vext_vm(desc);                              \
    uint32_t vl = env->vl;                                    \
    uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
    uint32_t vma = vext_vma(desc);                            \
    uint32_t i;                                               \
                                                              \
    for (i = env->vstart; i < vl; i++) {                      \
        ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
        if (!vm && !vext_elem_mask(v0, i)) {                  \
            /* set masked-off elements to 1s */               \
            if (vma) {                                        \
                vext_set_elem_mask(vd, i, 1);                 \
            }                                                 \
            continue;                                         \
        }                                                     \
        vext_set_elem_mask(vd, i,                             \
                           DO_OP(s2, s1, &env->fp_status));   \
    }                                                         \
    env->vstart = 0;                                          \
    /*                                                        \
     * mask destination registers are always tail-agnostic;   \
     * set tail elements to 1s                                \
     */                                                       \
    if (vta_all_1s) {                                         \
        for (; i < total_elems; i++) {                        \
            vext_set_elem_mask(vd, i, 1);                     \
        }                                                     \
    }                                                         \
}
4014 
/* vmfeq.vv: quiet equality compare (eq_quiet variants) */
GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4018 
/*
 * Vector-scalar float compare producing a mask register:
 * vd.mask[i] = DO_OP(vs2[i], f[rs1]).  Same masking/tail policy as
 * GEN_VEXT_CMP_VV_ENV above.
 *
 * Note: every line of the macro body, including comment lines, carries
 * an explicit '\' continuation (see GEN_VEXT_CMP_VV_ENV rationale).
 */
#define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
                  CPURISCVState *env, uint32_t desc)                \
{                                                                   \
    uint32_t vm = vext_vm(desc);                                    \
    uint32_t vl = env->vl;                                          \
    uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
    uint32_t vma = vext_vma(desc);                                  \
    uint32_t i;                                                     \
                                                                    \
    for (i = env->vstart; i < vl; i++) {                            \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
        if (!vm && !vext_elem_mask(v0, i)) {                        \
            /* set masked-off elements to 1s */                     \
            if (vma) {                                              \
                vext_set_elem_mask(vd, i, 1);                       \
            }                                                       \
            continue;                                               \
        }                                                           \
        vext_set_elem_mask(vd, i,                                   \
                           DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
    }                                                               \
    env->vstart = 0;                                                \
    /*                                                              \
     * mask destination registers are always tail-agnostic;         \
     * set tail elements to 1s                                      \
     */                                                             \
    if (vta_all_1s) {                                               \
        for (; i < total_elems; i++) {                              \
            vext_set_elem_mask(vd, i, 1);                           \
        }                                                           \
    }                                                               \
}
4053 
/* vmfeq.vf: quiet equality compare against the scalar operand */
GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4057 
4058 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4059 {
4060     FloatRelation compare = float16_compare_quiet(a, b, s);
4061     return compare != float_relation_equal;
4062 }
4063 
4064 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4065 {
4066     FloatRelation compare = float32_compare_quiet(a, b, s);
4067     return compare != float_relation_equal;
4068 }
4069 
4070 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4071 {
4072     FloatRelation compare = float64_compare_quiet(a, b, s);
4073     return compare != float_relation_equal;
4074 }
4075 
/* vmfne uses quiet compares; vmflt/vmfle use the non-quiet variants */
GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)

GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)

GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4096 
4097 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4098 {
4099     FloatRelation compare = float16_compare(a, b, s);
4100     return compare == float_relation_greater;
4101 }
4102 
4103 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4104 {
4105     FloatRelation compare = float32_compare(a, b, s);
4106     return compare == float_relation_greater;
4107 }
4108 
4109 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4110 {
4111     FloatRelation compare = float64_compare(a, b, s);
4112     return compare == float_relation_greater;
4113 }
4114 
/* vmfgt: only the vector-scalar form is instantiated here */
GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4118 
4119 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4120 {
4121     FloatRelation compare = float16_compare(a, b, s);
4122     return compare == float_relation_greater ||
4123            compare == float_relation_equal;
4124 }
4125 
4126 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4127 {
4128     FloatRelation compare = float32_compare(a, b, s);
4129     return compare == float_relation_greater ||
4130            compare == float_relation_equal;
4131 }
4132 
4133 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4134 {
4135     FloatRelation compare = float64_compare(a, b, s);
4136     return compare == float_relation_greater ||
4137            compare == float_relation_equal;
4138 }
4139 
/* vmfge: only the vector-scalar form is instantiated here */
GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4143 
4144 /* Vector Floating-Point Classify Instruction */
4145 target_ulong fclass_h(uint64_t frs1)
4146 {
4147     float16 f = frs1;
4148     bool sign = float16_is_neg(f);
4149 
4150     if (float16_is_infinity(f)) {
4151         return sign ? 1 << 0 : 1 << 7;
4152     } else if (float16_is_zero(f)) {
4153         return sign ? 1 << 3 : 1 << 4;
4154     } else if (float16_is_zero_or_denormal(f)) {
4155         return sign ? 1 << 2 : 1 << 5;
4156     } else if (float16_is_any_nan(f)) {
4157         float_status s = { }; /* for snan_bit_is_one */
4158         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4159     } else {
4160         return sign ? 1 << 1 : 1 << 6;
4161     }
4162 }
4163 
4164 target_ulong fclass_s(uint64_t frs1)
4165 {
4166     float32 f = frs1;
4167     bool sign = float32_is_neg(f);
4168 
4169     if (float32_is_infinity(f)) {
4170         return sign ? 1 << 0 : 1 << 7;
4171     } else if (float32_is_zero(f)) {
4172         return sign ? 1 << 3 : 1 << 4;
4173     } else if (float32_is_zero_or_denormal(f)) {
4174         return sign ? 1 << 2 : 1 << 5;
4175     } else if (float32_is_any_nan(f)) {
4176         float_status s = { }; /* for snan_bit_is_one */
4177         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4178     } else {
4179         return sign ? 1 << 1 : 1 << 6;
4180     }
4181 }
4182 
4183 target_ulong fclass_d(uint64_t frs1)
4184 {
4185     float64 f = frs1;
4186     bool sign = float64_is_neg(f);
4187 
4188     if (float64_is_infinity(f)) {
4189         return sign ? 1 << 0 : 1 << 7;
4190     } else if (float64_is_zero(f)) {
4191         return sign ? 1 << 3 : 1 << 4;
4192     } else if (float64_is_zero_or_denormal(f)) {
4193         return sign ? 1 << 2 : 1 << 5;
4194     } else if (float64_is_any_nan(f)) {
4195         float_status s = { }; /* for snan_bit_is_one */
4196         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4197     } else {
4198         return sign ? 1 << 1 : 1 << 6;
4199     }
4200 }
4201 
/* vfclass: note GEN_VEXT_V (no fp_status), since fclass_* never raise flags */
RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
GEN_VEXT_V(vfclass_v_h, 2)
GEN_VEXT_V(vfclass_v_w, 4)
GEN_VEXT_V(vfclass_v_d, 8)
4208 
4209 /* Vector Floating-Point Merge Instruction */
4210 
/*
 * vfmerge.vfm: vd[i] = v0.mask[i] ? f[rs1] : vs2[i].
 * The scalar s1 is splatted into every active element; inactive
 * elements keep the vs2 value.  Tail elements follow the vta policy.
 */
#define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    uint32_t vm = vext_vm(desc);                              \
    uint32_t vl = env->vl;                                    \
    uint32_t esz = sizeof(ETYPE);                             \
    uint32_t total_elems =                                    \
        vext_get_total_elems(env, desc, esz);                 \
    uint32_t vta = vext_vta(desc);                            \
    uint32_t i;                                               \
                                                              \
    for (i = env->vstart; i < vl; i++) {                      \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
        *((ETYPE *)vd + H(i)) =                               \
            (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
    }                                                         \
    env->vstart = 0;                                          \
    /* set tail elements to 1s */                             \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
}
4232 
/* vfmerge.vfm instantiations at SEW = 16/32/64 */
GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4236 
/* Single-Width Floating-Point/Integer Type-Convert Instructions */
/* Same-width conversions; one scalar softfloat op per element size. */
/* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)

/* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)

/* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)

/* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4269 
/* Widening Floating-Point/Integer Type-Convert Instructions */
/* (TD, T2, TX2) -- destination element is double the source width */
#define WOP_UU_B uint16_t, uint8_t,  uint8_t
#define WOP_UU_H uint32_t, uint16_t, uint16_t
#define WOP_UU_W uint64_t, uint32_t, uint32_t
/*
 * vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.
 */
RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)

/* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)

/*
 * vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float.
 */
RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)

/* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4306 
4307 /*
4308  * vfwcvt.f.f.v vd, vs2, vm # Convert single-width float to double-width float.
4309  */
4310 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4311 {
4312     return float16_to_float32(a, true, s);
4313 }
4314 
/* float->wider-float, plus the Zvfbfwma-style bf16 -> f32 widening */
RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)

RVVCALL(OPFVV1, vfwcvtbf16_f_f_v, WOP_UU_H, H4, H2, bfloat16_to_float32)
GEN_VEXT_V_ENV(vfwcvtbf16_f_f_v, 4)
4322 
/* Narrowing Floating-Point/Integer Type-Convert Instructions */
/* (TD, T2, TX2) -- destination element is half the source width */
#define NOP_UU_B uint8_t,  uint16_t, uint32_t
#define NOP_UU_H uint16_t, uint32_t, uint32_t
#define NOP_UU_W uint32_t, uint64_t, uint64_t
/* vfncvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)

/* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */
RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)

/*
 * vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float.
 */
RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)

/* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */
RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4357 
/* vfncvt.f.f.v vd, vs2, vm # Convert double float to single-width float. */
static uint16_t vfncvtffv16(uint32_t a, float_status *s)
{
    /* ieee=true: output is IEEE half precision, not ARM alternative half */
    return float32_to_float16(a, true, s);
}
4363 
/* float->narrower-float, plus the f32 -> bf16 narrowing */
RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)

RVVCALL(OPFVV1, vfncvtbf16_f_f_w, NOP_UU_H, H2, H4, float32_to_bfloat16)
GEN_VEXT_V_ENV(vfncvtbf16_f_f_w, 2)
4371 
4372 /*
4373  * Vector Reduction Operations
4374  */
4375 /* Vector Single-Width Integer Reduction Instructions */
/*
 * Integer reduction: vd[0] = OP(vs1[0], OP over all active vs2[i]).
 * TD/TS2 are destination/source element types (TD may be wider, for the
 * widening forms); HD/HS2 are the host-endian index-adjust macros.
 * Masked-off elements are skipped; only element 0 of vd is written.
 */
#define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    uint32_t vm = vext_vm(desc);                          \
    uint32_t vl = env->vl;                                \
    uint32_t esz = sizeof(TD);                            \
    uint32_t vlenb = simd_maxsz(desc);                    \
    uint32_t vta = vext_vta(desc);                        \
    uint32_t i;                                           \
    TD s1 =  *((TD *)vs1 + HD(0));                        \
                                                          \
    for (i = env->vstart; i < vl; i++) {                  \
        TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
        if (!vm && !vext_elem_mask(v0, i)) {              \
            continue;                                     \
        }                                                 \
        s1 = OP(s1, (TD)s2);                              \
    }                                                     \
    *((TD *)vd + HD(0)) = s1;                             \
    env->vstart = 0;                                      \
    /* set tail elements to 1s */                         \
    vext_set_elems_1s(vd, vta, esz, vlenb);               \
}
4401 
/* Integer reduction instantiations: one per op and SEW */
/* vd[0] = sum(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)

/* vd[0] = maxu(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)

/* vd[0] = max(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)

/* vd[0] = minu(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)

/* vd[0] = min(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)

/* vd[0] = and(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)

/* vd[0] = or(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)

/* vd[0] = xor(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)

/* Vector Widening Integer Reduction Instructions */
/* signed sum reduction into double-width accumulator */
GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)

/* Unsigned sum reduction into double-width accumulator */
GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4460 
4461 /* Vector Single-Width Floating-Point Reduction Instructions */
/*
 * Floating-point reduction: like GEN_VEXT_RED, but OP also receives
 * &env->fp_status so flags/rounding follow the current FP environment.
 * Elements are folded in strictly increasing index order.
 */
#define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
                  void *vs2, CPURISCVState *env,           \
                  uint32_t desc)                           \
{                                                          \
    uint32_t vm = vext_vm(desc);                           \
    uint32_t vl = env->vl;                                 \
    uint32_t esz = sizeof(TD);                             \
    uint32_t vlenb = simd_maxsz(desc);                     \
    uint32_t vta = vext_vta(desc);                         \
    uint32_t i;                                            \
    TD s1 =  *((TD *)vs1 + HD(0));                         \
                                                           \
    for (i = env->vstart; i < vl; i++) {                   \
        TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
        if (!vm && !vext_elem_mask(v0, i)) {               \
            continue;                                      \
        }                                                  \
        s1 = OP(s1, (TD)s2, &env->fp_status);              \
    }                                                      \
    *((TD *)vd + HD(0)) = s1;                              \
    env->vstart = 0;                                       \
    /* set tail elements to 1s */                          \
    vext_set_elems_1s(vd, vta, esz, vlenb);                \
}
4487 
/* Unordered sum */
GEN_VEXT_FRED(vfredusum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
GEN_VEXT_FRED(vfredusum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
GEN_VEXT_FRED(vfredusum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)

/* Ordered sum */
/*
 * The unordered and ordered sums share the same helper: the loop in
 * GEN_VEXT_FRED always folds elements in increasing index order,
 * which is a valid evaluation order for the unordered form too.
 */
GEN_VEXT_FRED(vfredosum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
GEN_VEXT_FRED(vfredosum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
GEN_VEXT_FRED(vfredosum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)

/*
 * Maximum value.
 * float*_maximum_number implement IEEE 754-2019 maximumNumber
 * semantics (a single quiet-NaN operand yields the other, numeric,
 * operand) -- see softfloat for details.
 */
GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2,
              float16_maximum_number)
GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4,
              float32_maximum_number)
GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8,
              float64_maximum_number)

/* Minimum value */
GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2,
              float16_minimum_number)
GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4,
              float32_minimum_number)
GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8,
              float64_minimum_number)
4513 
4514 /* Vector Widening Floating-Point Add Instructions */
4515 static uint32_t fwadd16(uint32_t a, uint16_t b, float_status *s)
4516 {
4517     return float32_add(a, float16_to_float32(b, true, s), s);
4518 }
4519 
4520 static uint64_t fwadd32(uint64_t a, uint32_t b, float_status *s)
4521 {
4522     return float64_add(a, float32_to_float64(b, s), s);
4523 }
4524 
/* Vector Widening Floating-Point Reduction Instructions */
/* Ordered/unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
/*
 * fwadd16/fwadd32 promote each SEW source element before the 2*SEW
 * add, so the accumulator type (TD) is twice the source width (TS2).
 */
GEN_VEXT_FRED(vfwredusum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
GEN_VEXT_FRED(vfwredusum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
GEN_VEXT_FRED(vfwredosum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
GEN_VEXT_FRED(vfwredosum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4531 
/*
 * Vector Mask Operations
 */
/* Vector Mask-Register Logical Instructions */
/*
 * GEN_VEXT_MASK_VV(NAME, OP): emit a helper combining two mask
 * registers bit-by-bit.  OP receives (vs2_bit, vs1_bit) in that
 * order.  Mask destinations are always tail-agnostic; when the
 * all-1s tail policy is selected, bits from vl up to VLEN
 * (vlenb << 3) are written with 1s.
 * Note: the embedded multi-line comment below needs no trailing
 * backslashes -- comments are removed before the preprocessor
 * delimits the directive, so it does not end the macro definition.
 */
#define GEN_VEXT_MASK_VV(NAME, OP)                        \
void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    uint32_t vl = env->vl;                                \
    uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;\
    uint32_t vta_all_1s = vext_vta_all_1s(desc);          \
    uint32_t i;                                           \
    int a, b;                                             \
                                                          \
    for (i = env->vstart; i < vl; i++) {                  \
        a = vext_elem_mask(vs1, i);                       \
        b = vext_elem_mask(vs2, i);                       \
        vext_set_elem_mask(vd, i, OP(b, a));              \
    }                                                     \
    env->vstart = 0;                                      \
    /*
     * mask destination register are always tail-agnostic
     * set tail elements to 1s
     */                                                   \
    if (vta_all_1s) {                                     \
        for (; i < total_elems; i++) {                    \
            vext_set_elem_mask(vd, i, 1);                 \
        }                                                 \
    }                                                     \
}
4563 
/*
 * Single-bit boolean combiners for the mask-logical helpers.  N and M
 * are always 0 or 1 here (mask bits extracted by vext_elem_mask), so
 * logical '!' acts as a one-bit complement.  GEN_VEXT_MASK_VV passes
 * (vs2_bit, vs1_bit), i.e. N is the vs2 bit and M the vs1 bit --
 * giving e.g. vmandn: vd = vs2 & ~vs1, vmorn: vd = vs2 | ~vs1.
 */
#define DO_NAND(N, M)  (!(N & M))
#define DO_ANDNOT(N, M)  (N & !M)
#define DO_NOR(N, M)  (!(N | M))
#define DO_ORNOT(N, M)  (N | !M)
#define DO_XNOR(N, M)  (!(N ^ M))

GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4578 
4579 /* Vector count population in mask vcpop */
4580 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4581                              uint32_t desc)
4582 {
4583     target_ulong cnt = 0;
4584     uint32_t vm = vext_vm(desc);
4585     uint32_t vl = env->vl;
4586     int i;
4587 
4588     for (i = env->vstart; i < vl; i++) {
4589         if (vm || vext_elem_mask(v0, i)) {
4590             if (vext_elem_mask(vs2, i)) {
4591                 cnt++;
4592             }
4593         }
4594     }
4595     env->vstart = 0;
4596     return cnt;
4597 }
4598 
4599 /* vfirst find-first-set mask bit */
4600 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4601                               uint32_t desc)
4602 {
4603     uint32_t vm = vext_vm(desc);
4604     uint32_t vl = env->vl;
4605     int i;
4606 
4607     for (i = env->vstart; i < vl; i++) {
4608         if (vm || vext_elem_mask(v0, i)) {
4609             if (vext_elem_mask(vs2, i)) {
4610                 return i;
4611             }
4612         }
4613     }
4614     env->vstart = 0;
4615     return -1LL;
4616 }
4617 
/* Which vmsbf/vmsif/vmsof variant vmsetm() should implement. */
enum set_mask_type {
    ONLY_FIRST = 1,     /* vmsof: set only the first found mask bit */
    INCLUDE_FIRST,      /* vmsif: set bits up to and including the first */
    BEFORE_FIRST,       /* vmsbf: set bits strictly before the first */
};
4623 
/*
 * Common implementation of vmsbf.m / vmsif.m / vmsof.m.
 *
 * Scans the active elements of mask vs2 for the first set bit and
 * fills vd according to 'type' (see enum set_mask_type).  Inactive
 * elements are written with 1s when the mask-agnostic policy (vma)
 * is in effect, otherwise left unchanged.
 */
static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
                   uint32_t desc, enum set_mask_type type)
{
    uint32_t vm = vext_vm(desc);
    uint32_t vl = env->vl;
    uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;  /* VLEN bits */
    uint32_t vta_all_1s = vext_vta_all_1s(desc);
    uint32_t vma = vext_vma(desc);
    int i;
    bool first_mask_bit = false;

    for (i = env->vstart; i < vl; i++) {
        if (!vm && !vext_elem_mask(v0, i)) {
            /* set masked-off elements to 1s */
            if (vma) {
                vext_set_elem_mask(vd, i, 1);
            }
            continue;
        }
        /* write a zero to all following active elements */
        if (first_mask_bit) {
            vext_set_elem_mask(vd, i, 0);
            continue;
        }
        if (vext_elem_mask(vs2, i)) {
            /* This is the first set bit of vs2 among active elements. */
            first_mask_bit = true;
            if (type == BEFORE_FIRST) {
                vext_set_elem_mask(vd, i, 0);
            } else {
                vext_set_elem_mask(vd, i, 1);
            }
        } else {
            /* Still before the first set bit: only vmsof writes 0s here. */
            if (type == ONLY_FIRST) {
                vext_set_elem_mask(vd, i, 0);
            } else {
                vext_set_elem_mask(vd, i, 1);
            }
        }
    }
    env->vstart = 0;
    /*
     * mask destination register are always tail-agnostic
     * set tail elements to 1s
     */
    if (vta_all_1s) {
        for (; i < total_elems; i++) {
            vext_set_elem_mask(vd, i, 1);
        }
    }
}
4674 
/* vmsbf.m: set mask bits strictly Before the First set bit of vs2 */
void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
                     uint32_t desc)
{
    vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
}
4680 
/* vmsif.m: set mask bits up to and Including the First set bit of vs2 */
void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
                     uint32_t desc)
{
    vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
}
4686 
/* vmsof.m: set Only the First set bit of vs2 in the destination mask */
void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
                     uint32_t desc)
{
    vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
}
4692 
/* Vector Iota Instruction */
/*
 * GEN_VEXT_VIOTA_M(NAME, ETYPE, H): viota.m writes to each active
 * element vd[i] the running count of set bits of mask vs2 among the
 * preceding active elements (a prefix popcount; masked-off elements
 * contribute nothing).  Masked-off destination elements are filled
 * with 1s under the mask-agnostic policy (vma); the tail is handled
 * per vta.
 */
#define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
                  uint32_t desc)                                          \
{                                                                         \
    uint32_t vm = vext_vm(desc);                                          \
    uint32_t vl = env->vl;                                                \
    uint32_t esz = sizeof(ETYPE);                                         \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
    uint32_t vta = vext_vta(desc);                                        \
    uint32_t vma = vext_vma(desc);                                        \
    uint32_t sum = 0;                                                     \
    int i;                                                                \
                                                                          \
    for (i = env->vstart; i < vl; i++) {                                  \
        if (!vm && !vext_elem_mask(v0, i)) {                              \
            /* set masked-off elements to 1s */                           \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
            continue;                                                     \
        }                                                                 \
        *((ETYPE *)vd + H(i)) = sum;                                      \
        if (vext_elem_mask(vs2, i)) {                                     \
            sum++;                                                        \
        }                                                                 \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
}

GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
4727 
/* Vector Element Index Instruction */
/*
 * GEN_VEXT_VID_V(NAME, ETYPE, H): vid.v writes its own element index
 * to each active element (vd[i] = i).  Masked-off elements are filled
 * with 1s under the mask-agnostic policy (vma); the tail is handled
 * per vta.
 */
#define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
{                                                                         \
    uint32_t vm = vext_vm(desc);                                          \
    uint32_t vl = env->vl;                                                \
    uint32_t esz = sizeof(ETYPE);                                         \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
    uint32_t vta = vext_vta(desc);                                        \
    uint32_t vma = vext_vma(desc);                                        \
    int i;                                                                \
                                                                          \
    for (i = env->vstart; i < vl; i++) {                                  \
        if (!vm && !vext_elem_mask(v0, i)) {                              \
            /* set masked-off elements to 1s */                           \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
            continue;                                                     \
        }                                                                 \
        *((ETYPE *)vd + H(i)) = i;                                        \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
}

GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
4757 
4758 /*
4759  * Vector Permutation Instructions
4760  */
4761 
4762 /* Vector Slide Instructions */
4763 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
4764 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4765                   CPURISCVState *env, uint32_t desc)                      \
4766 {                                                                         \
4767     uint32_t vm = vext_vm(desc);                                          \
4768     uint32_t vl = env->vl;                                                \
4769     uint32_t esz = sizeof(ETYPE);                                         \
4770     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4771     uint32_t vta = vext_vta(desc);                                        \
4772     uint32_t vma = vext_vma(desc);                                        \
4773     target_ulong offset = s1, i_min, i;                                   \
4774                                                                           \
4775     i_min = MAX(env->vstart, offset);                                     \
4776     for (i = i_min; i < vl; i++) {                                        \
4777         if (!vm && !vext_elem_mask(v0, i)) {                              \
4778             /* set masked-off elements to 1s */                           \
4779             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4780             continue;                                                     \
4781         }                                                                 \
4782         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
4783     }                                                                     \
4784     /* set tail elements to 1s */                                         \
4785     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4786 }
4787 
4788 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
4789 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
4790 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
4791 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
4792 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
4793 
/*
 * GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H): vd[i] = vs2[i + s1].
 * i_min is the first destination index whose source index i + s1
 * falls at or past vlmax (clamped to vl); active elements from there
 * on read past the source group and are written with 0.  The first
 * loop copies valid-source elements starting at vstart; the second
 * zeroes the out-of-range active elements.
 */
#define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
                  CPURISCVState *env, uint32_t desc)                      \
{                                                                         \
    uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
    uint32_t vm = vext_vm(desc);                                          \
    uint32_t vl = env->vl;                                                \
    uint32_t esz = sizeof(ETYPE);                                         \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
    uint32_t vta = vext_vta(desc);                                        \
    uint32_t vma = vext_vma(desc);                                        \
    target_ulong i_max, i_min, i;                                         \
                                                                          \
    i_min = MIN(s1 < vlmax ? vlmax - s1 : 0, vl);                         \
    i_max = MAX(i_min, env->vstart);                                      \
    for (i = env->vstart; i < i_max; ++i) {                               \
        if (!vm && !vext_elem_mask(v0, i)) {                              \
            /* set masked-off elements to 1s */                           \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
            continue;                                                     \
        }                                                                 \
        *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));              \
    }                                                                     \
                                                                          \
    for (i = i_max; i < vl; ++i) {                                        \
        if (vm || vext_elem_mask(v0, i)) {                                \
            *((ETYPE *)vd + H(i)) = 0;                                    \
        }                                                                 \
    }                                                                     \
                                                                          \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
}

/* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
4834 
/*
 * GEN_VEXT_VSLIE1UP(BITWIDTH, H): core of vslide1up/vfslide1up --
 * vd[0] = s1 (the scalar), vd[i] = vs2[i - 1] for the other active
 * elements.  Masked-off elements are filled with 1s under the
 * mask-agnostic policy (vma); the tail is handled per vta.
 * ("VSLIE1UP" is a historical misspelling of VSLIDE1UP, kept so the
 * existing instantiations below keep working.)
 */
#define GEN_VEXT_VSLIE1UP(BITWIDTH, H)                                      \
static void vslide1up_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
                                 void *vs2, CPURISCVState *env,             \
                                 uint32_t desc)                             \
{                                                                           \
    typedef uint##BITWIDTH##_t ETYPE;                                       \
    uint32_t vm = vext_vm(desc);                                            \
    uint32_t vl = env->vl;                                                  \
    uint32_t esz = sizeof(ETYPE);                                           \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
    uint32_t vta = vext_vta(desc);                                          \
    uint32_t vma = vext_vma(desc);                                          \
    uint32_t i;                                                             \
                                                                            \
    for (i = env->vstart; i < vl; i++) {                                    \
        if (!vm && !vext_elem_mask(v0, i)) {                                \
            /* set masked-off elements to 1s */                             \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);             \
            continue;                                                       \
        }                                                                   \
        if (i == 0) {                                                       \
            *((ETYPE *)vd + H(i)) = s1;                                     \
        } else {                                                            \
            *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
        }                                                                   \
    }                                                                       \
    env->vstart = 0;                                                        \
    /* set tail elements to 1s */                                           \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
}

GEN_VEXT_VSLIE1UP(8,  H1)
GEN_VEXT_VSLIE1UP(16, H2)
GEN_VEXT_VSLIE1UP(32, H4)
GEN_VEXT_VSLIE1UP(64, H8)
4870 
/*
 * Thin helper-entry wrappers around vslide1up_<BITWIDTH>; the scalar
 * arrives as target_ulong from the integer register file.
 */
#define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc)              \
{                                                                 \
    vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
}

/* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
4883 
/*
 * GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H): core of vslide1down/vfslide1down
 * -- vd[i] = vs2[i + 1], with the scalar s1 written into the last
 * element (vd[vl - 1]).  Masked-off elements are filled with 1s under
 * the mask-agnostic policy (vma); the tail is handled per vta.
 */
#define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
static void vslide1down_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
                                   void *vs2, CPURISCVState *env,             \
                                   uint32_t desc)                             \
{                                                                             \
    typedef uint##BITWIDTH##_t ETYPE;                                         \
    uint32_t vm = vext_vm(desc);                                              \
    uint32_t vl = env->vl;                                                    \
    uint32_t esz = sizeof(ETYPE);                                             \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
    uint32_t vta = vext_vta(desc);                                            \
    uint32_t vma = vext_vma(desc);                                            \
    uint32_t i;                                                               \
                                                                              \
    for (i = env->vstart; i < vl; i++) {                                      \
        if (!vm && !vext_elem_mask(v0, i)) {                                  \
            /* set masked-off elements to 1s */                               \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);               \
            continue;                                                         \
        }                                                                     \
        if (i == vl - 1) {                                                    \
            *((ETYPE *)vd + H(i)) = s1;                                       \
        } else {                                                              \
            *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
        }                                                                     \
    }                                                                         \
    env->vstart = 0;                                                          \
    /* set tail elements to 1s */                                             \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
}

GEN_VEXT_VSLIDE1DOWN(8,  H1)
GEN_VEXT_VSLIDE1DOWN(16, H2)
GEN_VEXT_VSLIDE1DOWN(32, H4)
GEN_VEXT_VSLIDE1DOWN(64, H8)
4919 
/*
 * Thin helper-entry wrappers around vslide1down_<BITWIDTH>; the scalar
 * arrives as target_ulong from the integer register file.
 */
#define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc)              \
{                                                                 \
    vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
}

/* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
4932 
/* Vector Floating-Point Slide Instructions */
/*
 * Same cores as the integer slide1up; only the scalar source differs
 * (uint64_t raw FP bits from the FP register file instead of
 * target_ulong from the integer one).
 */
#define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
}

/* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
4945 
/*
 * FP flavour of slide1down; the scalar is the raw FP bit pattern
 * (uint64_t), delegating to the shared vslide1down_<BITWIDTH> core.
 */
#define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
}

/* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
4957 
/* Vector Register Gather Instruction */
/*
 * GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2): vd[i] = vs2[vs1[i]],
 * with indices at or past vlmax producing 0.  TS1 is the index element
 * type and TS2 the data element type -- they differ for the
 * vrgatherei16 variants, which always read 16-bit indices.  Masked-off
 * elements are filled with 1s under the mask-agnostic policy (vma);
 * the tail is handled per vta.
 */
#define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
                  CPURISCVState *env, uint32_t desc)                      \
{                                                                         \
    uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
    uint32_t vm = vext_vm(desc);                                          \
    uint32_t vl = env->vl;                                                \
    uint32_t esz = sizeof(TS2);                                           \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
    uint32_t vta = vext_vta(desc);                                        \
    uint32_t vma = vext_vma(desc);                                        \
    uint64_t index;                                                       \
    uint32_t i;                                                           \
                                                                          \
    for (i = env->vstart; i < vl; i++) {                                  \
        if (!vm && !vext_elem_mask(v0, i)) {                              \
            /* set masked-off elements to 1s */                           \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
            continue;                                                     \
        }                                                                 \
        index = *((TS1 *)vs1 + HS1(i));                                   \
        if (index >= vlmax) {                                             \
            *((TS2 *)vd + HS2(i)) = 0;                                    \
        } else {                                                          \
            *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
        }                                                                 \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
}

/* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)

/* vrgatherei16: 16-bit indices regardless of the data element width */
GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
5001 
/*
 * GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H): scalar-index gather.  The
 * index is the same for every destination element (loop-invariant),
 * so each active vd[i] is either vs2[s1] or 0 when s1 >= vlmax.
 * Masked-off elements are filled with 1s under the mask-agnostic
 * policy (vma); the tail is handled per vta.
 */
#define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
                  CPURISCVState *env, uint32_t desc)                      \
{                                                                         \
    uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
    uint32_t vm = vext_vm(desc);                                          \
    uint32_t vl = env->vl;                                                \
    uint32_t esz = sizeof(ETYPE);                                         \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
    uint32_t vta = vext_vta(desc);                                        \
    uint32_t vma = vext_vma(desc);                                        \
    uint64_t index = s1;                                                  \
    uint32_t i;                                                           \
                                                                          \
    for (i = env->vstart; i < vl; i++) {                                  \
        if (!vm && !vext_elem_mask(v0, i)) {                              \
            /* set masked-off elements to 1s */                           \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
            continue;                                                     \
        }                                                                 \
        if (index >= vlmax) {                                             \
            *((ETYPE *)vd + H(i)) = 0;                                    \
        } else {                                                          \
            *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
        }                                                                 \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
}

/* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[rs1] */
GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
5038 
/* Vector Compress Instruction */
#define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
                  CPURISCVState *env, uint32_t desc)                      \
{                                                                         \
    uint32_t vl = env->vl;                                                \
    uint32_t esz = sizeof(ETYPE);                                         \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
    uint32_t vta = vext_vta(desc);                                        \
    uint32_t dst_idx = 0;                                                 \
    uint32_t src_idx;                                                     \
                                                                          \
    /*                                                                    \
     * Walk the body elements in order: each element of vs2 whose bit     \
     * in mask register vs1 is set is packed down into the next free      \
     * slot of vd.                                                        \
     */                                                                   \
    for (src_idx = env->vstart; src_idx < vl; src_idx++) {                \
        if (vext_elem_mask(vs1, src_idx)) {                               \
            *((ETYPE *)vd + H(dst_idx)) = *((ETYPE *)vs2 + H(src_idx));   \
            dst_idx++;                                                    \
        }                                                                 \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
}
5061 
/*
 * vcompress.vm: pack the elements of vs2 whose corresponding bit in
 * mask register vs1 is set into the low-numbered elements of vd,
 * preserving their order.
 */
GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
5067 
5068 /* Vector Whole Register Move */
5069 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5070 {
5071     /* EEW = SEW */
5072     uint32_t maxsz = simd_maxsz(desc);
5073     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5074     uint32_t startb = env->vstart * sewb;
5075     uint32_t i = startb;
5076 
5077     memcpy((uint8_t *)vd + H1(i),
5078            (uint8_t *)vs2 + H1(i),
5079            maxsz - startb);
5080 
5081     env->vstart = 0;
5082 }
5083 
/* Vector Integer Extension */
/*
 * GEN_VEXT_INT_EXT generates the helpers for vzext.vf{2,4,8} and
 * vsext.vf{2,4,8}: each destination element of type ETYPE receives the
 * value of the narrower source element of type DTYPE; the zero- or
 * sign-extension happens implicitly in the C assignment according to the
 * signedness of the instantiating types.  HD/HS1 are the host-endian
 * index adjusters for the destination and source element sizes.
 * Masked-off body elements and tail elements are set to all-1s per the
 * vma/vta agnostic policy.
 */
#define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
                  CPURISCVState *env, uint32_t desc)             \
{                                                                \
    uint32_t vl = env->vl;                                       \
    uint32_t vm = vext_vm(desc);                                 \
    uint32_t esz = sizeof(ETYPE);                                \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
    uint32_t vta = vext_vta(desc);                               \
    uint32_t vma = vext_vma(desc);                               \
    uint32_t i;                                                  \
                                                                 \
    for (i = env->vstart; i < vl; i++) {                         \
        if (!vm && !vext_elem_mask(v0, i)) {                     \
            /* set masked-off elements to 1s */                  \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);  \
            continue;                                            \
        }                                                        \
        /* widening assignment performs the extension */         \
        *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
    }                                                            \
    env->vstart = 0;                                             \
    /* set tail elements to 1s */                                \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
}
5109 
/* Zero-extend: unsigned source element 1/2, 1/4 or 1/8 the destination width */
GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)

/* Sign-extend: signed source element 1/2, 1/4 or 1/8 the destination width */
GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
5123