xref: /qemu/target/riscv/vector_helper.c (revision 14f5a7ba)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/cpu_ldst.h"
26 #include "exec/helper-proto.h"
27 #include "fpu/softfloat.h"
28 #include "tcg/tcg-gvec-desc.h"
29 #include "internals.h"
30 #include "vector_internals.h"
31 #include <math.h>
32 
/*
 * vsetvl/vsetvli helper: validate the requested vtype (s2), clamp the
 * requested AVL (s1) to VLMAX, and update the vector CSR state
 * (vl, vtype, vstart, vill).
 *
 * Returns the new vl, or 0 if the requested vtype is illegal.
 */
target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
                            target_ulong s2)
{
    int vlmax, vl;
    RISCVCPU *cpu = env_archcpu(env);
    uint64_t lmul = FIELD_EX64(s2, VTYPE, VLMUL);
    uint16_t sew = 8 << FIELD_EX64(s2, VTYPE, VSEW);    /* SEW in bits */
    uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
    int xlen = riscv_cpu_xlen(env);
    /* vill is the most-significant bit of vtype (position depends on xlen) */
    bool vill = (s2 >> (xlen - 1)) & 0x1;
    /* all bits between the defined vtype fields and vill must be zero */
    target_ulong reserved = s2 &
                            MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
                                            xlen - 1 - R_VTYPE_RESERVED_SHIFT);

    if (lmul & 4) {
        /* Fractional LMUL - check LMUL * VLEN >= SEW */
        /* lmul == 4 is the reserved VLMUL encoding */
        if (lmul == 4 ||
            cpu->cfg.vlen >> (8 - lmul) < sew) {
            vill = true;
        }
    }

    if ((sew > cpu->cfg.elen) || vill || (ediv != 0) || (reserved != 0)) {
        /* only set vill bit. */
        env->vill = 1;
        env->vtype = 0;
        env->vl = 0;
        env->vstart = 0;
        return 0;
    }

    /* clamp the requested AVL to VLMAX */
    vlmax = vext_get_vlmax(cpu, s2);
    if (s1 <= vlmax) {
        vl = s1;
    } else {
        vl = vlmax;
    }
    env->vl = vl;
    env->vtype = s2;
    env->vstart = 0;
    env->vill = 0;
    return vl;
}
76 
77 /*
78  * Get the maximum number of elements can be operated.
79  *
80  * log2_esz: log2 of element size in bytes.
81  */
82 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
83 {
84     /*
85      * As simd_desc support at most 2048 bytes, the max vlen is 1024 bits.
86      * so vlen in bytes (vlenb) is encoded as maxsz.
87      */
88     uint32_t vlenb = simd_maxsz(desc);
89 
90     /* Return VLMAX */
91     int scale = vext_lmul(desc) - log2_esz;
92     return scale < 0 ? vlenb >> -scale : vlenb << scale;
93 }
94 
95 static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
96 {
97     return (addr & ~env->cur_pmmask) | env->cur_pmbase;
98 }
99 
100 /*
101  * This function checks watchpoint before real load operation.
102  *
103  * In softmmu mode, the TLB API probe_access is enough for watchpoint check.
104  * In user mode, there is no watchpoint support now.
105  *
106  * It will trigger an exception if there is no mapping in TLB
107  * and page table walk can't fill the TLB entry. Then the guest
108  * software can return here after process the exception or never return.
109  */
static void probe_pages(CPURISCVState *env, target_ulong addr,
                        target_ulong len, uintptr_t ra,
                        MMUAccessType access_type)
{
    /* number of bytes from addr to the end of its page */
    target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
    target_ulong curlen = MIN(pagelen, len);

    /* probe the part of the access that lies in the first page */
    probe_access(env, adjust_addr(env, addr), curlen, access_type,
                 cpu_mmu_index(env, false), ra);
    if (len > curlen) {
        /* the access crosses a page boundary; probe the remainder too */
        addr += curlen;
        curlen = len - curlen;
        probe_access(env, adjust_addr(env, addr), curlen, access_type,
                     cpu_mmu_index(env, false), ra);
    }
}
126 
/*
 * Set bit 'index' of the mask register held in v0 to the low bit of
 * 'value', leaving every other mask bit unchanged.
 */
static inline void vext_set_elem_mask(void *v0, int index,
                                      uint8_t value)
{
    uint64_t *words = v0;
    int word = index / 64;
    int bit = index % 64;
    uint64_t mask = UINT64_C(1) << bit;

    words[word] = (words[word] & ~mask) | ((uint64_t)(value & 1) << bit);
}
135 
/* elements operations for load and store */
/*
 * Load one guest-memory element of type ETYPE from addr into element
 * idx of vd; H maps the element index for host byte order.  retaddr
 * lets a faulting access unwind to the right TB.
 */
typedef void vext_ldst_elem_fn(CPURISCVState *env, abi_ptr addr,
                               uint32_t idx, void *vd, uintptr_t retaddr);

#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
static void NAME(CPURISCVState *env, abi_ptr addr,         \
                 uint32_t idx, void *vd, uintptr_t retaddr)\
{                                                          \
    ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
    *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
}                                                          \

GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
152 
/* Store element idx of vd (type ETYPE) to guest memory at addr. */
#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
static void NAME(CPURISCVState *env, abi_ptr addr,         \
                 uint32_t idx, void *vd, uintptr_t retaddr)\
{                                                          \
    ETYPE data = *((ETYPE *)vd + H(idx));                  \
    cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
}

GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
165 
166 static void vext_set_tail_elems_1s(target_ulong vl, void *vd,
167                                    uint32_t desc, uint32_t nf,
168                                    uint32_t esz, uint32_t max_elems)
169 {
170     uint32_t vta = vext_vta(desc);
171     int k;
172 
173     if (vta == 0) {
174         return;
175     }
176 
177     for (k = 0; k < nf; ++k) {
178         vext_set_elems_1s(vd, vta, (k * max_elems + vl) * esz,
179                           (k * max_elems + max_elems) * esz);
180     }
181 }
182 
/*
 * stride: access vector element from strided memory
 *
 * Common implementation for strided loads and stores: element i of
 * each of the nf segment fields is accessed at base + stride * i plus
 * the field offset.  Masked-off elements are skipped (and overwritten
 * with 1s when the mask-agnostic policy requests it).  env->vstart is
 * advanced per element so a trapping access can be restarted.
 */
static void
vext_ldst_stride(void *vd, void *v0, target_ulong base,
                 target_ulong stride, CPURISCVState *env,
                 uint32_t desc, uint32_t vm,
                 vext_ldst_elem_fn *ldst_elem,
                 uint32_t log2_esz, uintptr_t ra)
{
    uint32_t i, k;
    uint32_t nf = vext_nf(desc);
    uint32_t max_elems = vext_max_elems(desc, log2_esz);
    uint32_t esz = 1 << log2_esz;
    uint32_t vma = vext_vma(desc);

    for (i = env->vstart; i < env->vl; i++, env->vstart++) {
        k = 0;
        while (k < nf) {
            if (!vm && !vext_elem_mask(v0, i)) {
                /* set masked-off elements to 1s */
                vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
                                  (i + k * max_elems + 1) * esz);
                k++;
                continue;
            }
            target_ulong addr = base + stride * i + (k << log2_esz);
            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
            k++;
        }
    }
    env->vstart = 0;

    /* apply the tail-agnostic policy to the elements past vl */
    vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
}
218 
/* Expand one strided-load helper per data element width. */
#define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
void HELPER(NAME)(void *vd, void * v0, target_ulong base,               \
                  target_ulong stride, CPURISCVState *env,              \
                  uint32_t desc)                                        \
{                                                                       \
    uint32_t vm = vext_vm(desc);                                        \
    vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
                     ctzl(sizeof(ETYPE)), GETPC());                     \
}

GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
233 
/* Expand one strided-store helper per data element width. */
#define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
                  target_ulong stride, CPURISCVState *env,              \
                  uint32_t desc)                                        \
{                                                                       \
    uint32_t vm = vext_vm(desc);                                        \
    vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
                     ctzl(sizeof(ETYPE)), GETPC());                     \
}

GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b)
GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
248 
249 /*
250  * unit-stride: access elements stored contiguously in memory
251  */
252 
/* unmasked unit-stride load and store operation */
/*
 * Access elements [vstart, evl) of each of the nf fields; the data for
 * element i is contiguous at base + (i * nf + k) * esz.  No mask is
 * consulted here (masked unit-stride goes through the strided path).
 * vstart is advanced per element so a trapping access can restart.
 */
static void
vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
             vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
             uintptr_t ra)
{
    uint32_t i, k;
    uint32_t nf = vext_nf(desc);
    uint32_t max_elems = vext_max_elems(desc, log2_esz);
    uint32_t esz = 1 << log2_esz;

    /* load bytes from guest memory */
    for (i = env->vstart; i < evl; i++, env->vstart++) {
        k = 0;
        while (k < nf) {
            target_ulong addr = base + ((i * nf + k) << log2_esz);
            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
            k++;
        }
    }
    env->vstart = 0;

    /* apply the tail-agnostic policy to the elements past evl */
    vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
}
277 
278 /*
279  * masked unit-stride load and store operation will be a special case of
280  * stride, stride = NF * sizeof (ETYPE)
281  */
282 
/*
 * Expand the masked form via the strided path (stride = NF * esz) and
 * the unmasked form via the contiguous vext_ldst_us fast path.
 */
#define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN)                            \
void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
                         CPURISCVState *env, uint32_t desc)             \
{                                                                       \
    uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
    vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
                     ctzl(sizeof(ETYPE)), GETPC());                     \
}                                                                       \
                                                                        \
void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
                  CPURISCVState *env, uint32_t desc)                    \
{                                                                       \
    vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
                 ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
}

GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b)
GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
303 
/*
 * Unit-stride stores: masked form via the strided path
 * (stride = NF * esz), unmasked form via vext_ldst_us.
 */
#define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN)                            \
void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
                         CPURISCVState *env, uint32_t desc)              \
{                                                                        \
    uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
    vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,   \
                     ctzl(sizeof(ETYPE)), GETPC());                      \
}                                                                        \
                                                                         \
void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
                  CPURISCVState *env, uint32_t desc)                     \
{                                                                        \
    vext_ldst_us(vd, base, env, desc, STORE_FN,                          \
                 ctzl(sizeof(ETYPE)), env->vl, GETPC());                 \
}

GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b)
GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
324 
325 /*
326  * unit stride mask load and store, EEW = 1
327  */
328 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
329                     CPURISCVState *env, uint32_t desc)
330 {
331     /* evl = ceil(vl/8) */
332     uint8_t evl = (env->vl + 7) >> 3;
333     vext_ldst_us(vd, base, env, desc, lde_b,
334                  0, evl, GETPC());
335 }
336 
337 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
338                     CPURISCVState *env, uint32_t desc)
339 {
340     /* evl = ceil(vl/8) */
341     uint8_t evl = (env->vl + 7) >> 3;
342     vext_ldst_us(vd, base, env, desc, ste_b,
343                  0, evl, GETPC());
344 }
345 
/*
 * index: access vector element from indexed memory
 */
/* Compute the guest address of indexed element idx: base + vs2[idx]. */
typedef target_ulong vext_get_index_addr(target_ulong base,
        uint32_t idx, void *vs2);

#define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
static target_ulong NAME(target_ulong base,            \
                         uint32_t idx, void *vs2)      \
{                                                      \
    return (base + *((ETYPE *)vs2 + H(idx)));          \
}

/* one accessor per index EEW (unsigned 8/16/32/64-bit offsets) */
GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
363 
/*
 * Common implementation of indexed (gather/scatter) loads and stores:
 * element i is accessed at get_index_addr(base, i, vs2) plus the
 * segment-field offset.  Masked-off elements are skipped (and filled
 * with 1s when the mask-agnostic policy requests it); env->vstart is
 * advanced per element so a trapping access can be restarted.
 */
static inline void
vext_ldst_index(void *vd, void *v0, target_ulong base,
                void *vs2, CPURISCVState *env, uint32_t desc,
                vext_get_index_addr get_index_addr,
                vext_ldst_elem_fn *ldst_elem,
                uint32_t log2_esz, uintptr_t ra)
{
    uint32_t i, k;
    uint32_t nf = vext_nf(desc);
    uint32_t vm = vext_vm(desc);
    uint32_t max_elems = vext_max_elems(desc, log2_esz);
    uint32_t esz = 1 << log2_esz;
    uint32_t vma = vext_vma(desc);

    /* load bytes from guest memory */
    for (i = env->vstart; i < env->vl; i++, env->vstart++) {
        k = 0;
        while (k < nf) {
            if (!vm && !vext_elem_mask(v0, i)) {
                /* set masked-off elements to 1s */
                vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
                                  (i + k * max_elems + 1) * esz);
                k++;
                continue;
            }
            abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
            k++;
        }
    }
    env->vstart = 0;

    /* apply the tail-agnostic policy to the elements past vl */
    vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
}
398 
/* Expand indexed-load helpers for every (index EEW, data EEW) pair. */
#define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
                  void *vs2, CPURISCVState *env, uint32_t desc)            \
{                                                                          \
    vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
                    LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
}

GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b)
GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h)
GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w)
GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d)
GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b)
GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b)
GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b)
GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
423 
/* Expand indexed-store helpers for every (index EEW, data EEW) pair. */
#define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
                  void *vs2, CPURISCVState *env, uint32_t desc)  \
{                                                                \
    vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
                    STORE_FN, ctzl(sizeof(ETYPE)),               \
                    GETPC());                                    \
}

GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b)
GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h)
GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w)
GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d)
GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b)
GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b)
GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b)
GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
449 
450 /*
 * unit-stride fault-only-first load instructions
452  */
static inline void
vext_ldff(void *vd, void *v0, target_ulong base,
          CPURISCVState *env, uint32_t desc,
          vext_ldst_elem_fn *ldst_elem,
          uint32_t log2_esz, uintptr_t ra)
{
    void *host;
    uint32_t i, k, vl = 0;
    uint32_t nf = vext_nf(desc);
    uint32_t vm = vext_vm(desc);
    uint32_t max_elems = vext_max_elems(desc, log2_esz);
    uint32_t esz = 1 << log2_esz;
    uint32_t vma = vext_vma(desc);
    target_ulong addr, offset, remain;

    /*
     * probe every access
     *
     * Fault-only-first semantics: element 0 traps normally, but for any
     * later element that cannot be accessed, vl is shrunk to that
     * element index instead of raising an exception.
     */
    for (i = env->vstart; i < env->vl; i++) {
        if (!vm && !vext_elem_mask(v0, i)) {
            continue;
        }
        addr = adjust_addr(env, base + i * (nf << log2_esz));
        if (i == 0) {
            /* element 0 must fault as a normal load would */
            probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
        } else {
            /* if it triggers an exception, no need to check watchpoint */
            remain = nf << log2_esz;
            while (remain > 0) {
                /* bytes left on the current guest page */
                offset = -(addr | TARGET_PAGE_MASK);
                host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD,
                                         cpu_mmu_index(env, false));
                if (host) {
#ifdef CONFIG_USER_ONLY
                    if (!page_check_range(addr, offset, PAGE_READ)) {
                        /* not readable: stop the load at this element */
                        vl = i;
                        goto ProbeSuccess;
                    }
#else
                    probe_pages(env, addr, offset, ra, MMU_DATA_LOAD);
#endif
                } else {
                    /* no TLB mapping: stop the load at this element */
                    vl = i;
                    goto ProbeSuccess;
                }
                if (remain <=  offset) {
                    break;
                }
                remain -= offset;
                addr = adjust_addr(env, addr + offset);
            }
        }
    }
ProbeSuccess:
    /* load bytes from guest memory */
    if (vl != 0) {
        /* a later element would have faulted: truncate vl */
        env->vl = vl;
    }
    for (i = env->vstart; i < env->vl; i++) {
        k = 0;
        while (k < nf) {
            if (!vm && !vext_elem_mask(v0, i)) {
                /* set masked-off elements to 1s */
                vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
                                  (i + k * max_elems + 1) * esz);
                k++;
                continue;
            }
            addr = base + ((i * nf + k) << log2_esz);
            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
            k++;
        }
    }
    env->vstart = 0;

    vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
}
528 
/* Expand a fault-only-first load helper per data element width. */
#define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
                  CPURISCVState *env, uint32_t desc)      \
{                                                         \
    vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
              ctzl(sizeof(ETYPE)), GETPC());              \
}

GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b)
GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
541 
/*
 * Scalar operand helpers used by the RVVCALL expanders.  Operands are
 * parenthesized so macro expansion cannot change operator precedence
 * (matching DO_MAX/DO_MIN).  Note that arguments may be evaluated more
 * than once (DO_MAX/DO_MIN), so do not pass expressions with side
 * effects.
 */
#define DO_SWAP(N, M) (M)
#define DO_AND(N, M)  ((N) & (M))
#define DO_XOR(N, M)  ((N) ^ (M))
#define DO_OR(N, M)   ((N) | (M))
#define DO_ADD(N, M)  ((N) + (M))

/* Signed min/max */
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
551 
/*
 * load and store whole register instructions
 */
static void
vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
                vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
{
    uint32_t i, k, off, pos;
    uint32_t nf = vext_nf(desc);
    uint32_t vlenb = riscv_cpu_cfg(env)->vlen >> 3;  /* VLEN in bytes */
    uint32_t max_elems = vlenb >> log2_esz;          /* elements per register */

    /* resume point after a trap: register k, element off within it */
    k = env->vstart / max_elems;
    off = env->vstart % max_elems;

    if (off) {
        /* load/store rest of elements of current segment pointed by vstart */
        for (pos = off; pos < max_elems; pos++, env->vstart++) {
            target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
            ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd,
                      ra);
        }
        k++;
    }

    /* load/store elements for rest of segments */
    for (; k < nf; k++) {
        for (i = 0; i < max_elems; i++, env->vstart++) {
            target_ulong addr = base + ((i + k * max_elems) << log2_esz);
            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
        }
    }

    env->vstart = 0;
}
587 
/*
 * Expand whole-register load helpers; the register count (1/2/4/8) is
 * carried in desc as nf and consumed by vext_ldst_whole.
 */
#define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
void HELPER(NAME)(void *vd, target_ulong base,       \
                  CPURISCVState *env, uint32_t desc) \
{                                                    \
    vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
                    ctzl(sizeof(ETYPE)), GETPC());   \
}

GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b)
GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b)
GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b)
GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b)
GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
612 
/* Whole-register stores: all four variants move data as bytes. */
#define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
void HELPER(NAME)(void *vd, target_ulong base,       \
                  CPURISCVState *env, uint32_t desc) \
{                                                    \
    vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
                    ctzl(sizeof(ETYPE)), GETPC());   \
}

GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
625 
626 /*
627  * Vector Integer Arithmetic Instructions
628  */
629 
/* (TD, T1, T2, TX1, TX2) */
#define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
#define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
#define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
#define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
#define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
#define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
#define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
#define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
/* widening ops: destination is twice the source width */
#define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
#define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
#define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
#define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
#define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
#define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
#define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
#define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
#define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
/* narrowing ops: the T2 operand is twice the destination width */
#define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
#define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
#define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
#define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
#define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
#define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t

/*
 * Operands parenthesized so macro expansion cannot change precedence
 * (matches DO_MAX/DO_MIN above).
 */
#define DO_SUB(N, M) ((N) - (M))
#define DO_RSUB(N, M) ((M) - (N))
657 
/* single-width integer add/subtract, vector-vector (vadd.vv / vsub.vv) */
RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)

GEN_VEXT_VV(vadd_vv_b, 1)
GEN_VEXT_VV(vadd_vv_h, 2)
GEN_VEXT_VV(vadd_vv_w, 4)
GEN_VEXT_VV(vadd_vv_d, 8)
GEN_VEXT_VV(vsub_vv_b, 1)
GEN_VEXT_VV(vsub_vv_h, 2)
GEN_VEXT_VV(vsub_vv_w, 4)
GEN_VEXT_VV(vsub_vv_d, 8)


/* vector-scalar forms (vadd.vx / vsub.vx / vrsub.vx) */
RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)

GEN_VEXT_VX(vadd_vx_b, 1)
GEN_VEXT_VX(vadd_vx_h, 2)
GEN_VEXT_VX(vadd_vx_w, 4)
GEN_VEXT_VX(vadd_vx_d, 8)
GEN_VEXT_VX(vsub_vx_b, 1)
GEN_VEXT_VX(vsub_vx_h, 2)
GEN_VEXT_VX(vsub_vx_w, 4)
GEN_VEXT_VX(vsub_vx_d, 8)
GEN_VEXT_VX(vrsub_vx_b, 1)
GEN_VEXT_VX(vrsub_vx_h, 2)
GEN_VEXT_VX(vrsub_vx_w, 4)
GEN_VEXT_VX(vrsub_vx_d, 8)
702 
703 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
704 {
705     intptr_t oprsz = simd_oprsz(desc);
706     intptr_t i;
707 
708     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
709         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
710     }
711 }
712 
713 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
714 {
715     intptr_t oprsz = simd_oprsz(desc);
716     intptr_t i;
717 
718     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
719         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
720     }
721 }
722 
723 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
724 {
725     intptr_t oprsz = simd_oprsz(desc);
726     intptr_t i;
727 
728     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
729         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
730     }
731 }
732 
733 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
734 {
735     intptr_t oprsz = simd_oprsz(desc);
736     intptr_t i;
737 
738     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
739         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
740     }
741 }
742 
/* Vector Widening Integer Add/Subtract */
/*
 * (TD, T1, T2, TX1, TX2) tuples for widening ops: the destination is
 * twice the source width.  The WOP_SSS_* names below re-define macros
 * with bodies identical to the earlier definitions, which is a benign
 * redefinition in C.
 */
#define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
#define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
#define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
#define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
#define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
#define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
/* "wide" forms: the vs2 operand (T2) is already the 2*SEW width */
#define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
#define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
#define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
#define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
#define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
#define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
/* vector-vector widening add/sub: 2*SEW result from SEW operands */
RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
/* "wv" forms: the vs2 operand is already 2*SEW wide */
RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
GEN_VEXT_VV(vwaddu_vv_b, 2)
GEN_VEXT_VV(vwaddu_vv_h, 4)
GEN_VEXT_VV(vwaddu_vv_w, 8)
GEN_VEXT_VV(vwsubu_vv_b, 2)
GEN_VEXT_VV(vwsubu_vv_h, 4)
GEN_VEXT_VV(vwsubu_vv_w, 8)
GEN_VEXT_VV(vwadd_vv_b, 2)
GEN_VEXT_VV(vwadd_vv_h, 4)
GEN_VEXT_VV(vwadd_vv_w, 8)
GEN_VEXT_VV(vwsub_vv_b, 2)
GEN_VEXT_VV(vwsub_vv_h, 4)
GEN_VEXT_VV(vwsub_vv_w, 8)
GEN_VEXT_VV(vwaddu_wv_b, 2)
GEN_VEXT_VV(vwaddu_wv_h, 4)
GEN_VEXT_VV(vwaddu_wv_w, 8)
GEN_VEXT_VV(vwsubu_wv_b, 2)
GEN_VEXT_VV(vwsubu_wv_h, 4)
GEN_VEXT_VV(vwsubu_wv_w, 8)
GEN_VEXT_VV(vwadd_wv_b, 2)
GEN_VEXT_VV(vwadd_wv_h, 4)
GEN_VEXT_VV(vwadd_wv_w, 8)
GEN_VEXT_VV(vwsub_wv_b, 2)
GEN_VEXT_VV(vwsub_wv_h, 4)
GEN_VEXT_VV(vwsub_wv_w, 8)

/* vector-scalar widening forms (list continues below) */
RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
813 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
814 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
815 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
816 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
817 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
818 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
819 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
820 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
821 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
822 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
823 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
824 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
825 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
826 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
827 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
828 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
829 GEN_VEXT_VX(vwaddu_vx_b, 2)
830 GEN_VEXT_VX(vwaddu_vx_h, 4)
831 GEN_VEXT_VX(vwaddu_vx_w, 8)
832 GEN_VEXT_VX(vwsubu_vx_b, 2)
833 GEN_VEXT_VX(vwsubu_vx_h, 4)
834 GEN_VEXT_VX(vwsubu_vx_w, 8)
835 GEN_VEXT_VX(vwadd_vx_b, 2)
836 GEN_VEXT_VX(vwadd_vx_h, 4)
837 GEN_VEXT_VX(vwadd_vx_w, 8)
838 GEN_VEXT_VX(vwsub_vx_b, 2)
839 GEN_VEXT_VX(vwsub_vx_h, 4)
840 GEN_VEXT_VX(vwsub_vx_w, 8)
841 GEN_VEXT_VX(vwaddu_wx_b, 2)
842 GEN_VEXT_VX(vwaddu_wx_h, 4)
843 GEN_VEXT_VX(vwaddu_wx_w, 8)
844 GEN_VEXT_VX(vwsubu_wx_b, 2)
845 GEN_VEXT_VX(vwsubu_wx_h, 4)
846 GEN_VEXT_VX(vwsubu_wx_w, 8)
847 GEN_VEXT_VX(vwadd_wx_b, 2)
848 GEN_VEXT_VX(vwadd_wx_h, 4)
849 GEN_VEXT_VX(vwadd_wx_w, 8)
850 GEN_VEXT_VX(vwsub_wx_b, 2)
851 GEN_VEXT_VX(vwsub_wx_h, 4)
852 GEN_VEXT_VX(vwsub_wx_w, 8)
853 
/* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
/* N op M with carry/borrow-in C (C is 0 or 1, taken from the v0 mask) */
#define DO_VADC(N, M, C) (N + M + C)
#define DO_VSBC(N, M, C) (N - M - C)

/*
 * Generate a vadc/vsbc helper: vd[i] = vs2[i] op vs1[i] op v0.mask[i].
 * These instructions are always unmasked (v0 supplies the carry input,
 * not an execution mask), so every element in [vstart, vl) is written.
 * Tail elements are set to all-1s when the tail-agnostic policy asks for it.
 */
#define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    uint32_t vl = env->vl;                                    \
    uint32_t esz = sizeof(ETYPE);                             \
    uint32_t total_elems =                                    \
        vext_get_total_elems(env, desc, esz);                 \
    uint32_t vta = vext_vta(desc);                            \
    uint32_t i;                                               \
                                                              \
    for (i = env->vstart; i < vl; i++) {                      \
        ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
        ETYPE carry = vext_elem_mask(v0, i);                  \
                                                              \
        *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
    }                                                         \
    env->vstart = 0;                                          \
    /* set tail elements to 1s */                             \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
}

GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)

GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)

/*
 * Scalar variant: vd[i] = vs2[i] op sign-extended(rs1) op v0.mask[i].
 * The (target_long) cast sign-extends s1 before truncation to ETYPE.
 */
#define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
                  CPURISCVState *env, uint32_t desc)                     \
{                                                                        \
    uint32_t vl = env->vl;                                               \
    uint32_t esz = sizeof(ETYPE);                                        \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
    uint32_t vta = vext_vta(desc);                                       \
    uint32_t i;                                                          \
                                                                         \
    for (i = env->vstart; i < vl; i++) {                                 \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
        ETYPE carry = vext_elem_mask(v0, i);                             \
                                                                         \
        *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
    }                                                                    \
    env->vstart = 0;                                                     \
    /* set tail elements to 1s */                                        \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
}

GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)

GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
921 
/*
 * Carry-out of N + M + C in the (unsigned) width of N: the sum wrapped
 * iff the truncated result is <= N (with carry-in) or < N (without).
 */
#define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
                          (__typeof(N))(N + M) < N)
/* Borrow-out of N - M - C: borrow iff N <= M (with C) or N < M (without) */
#define DO_MSBC(N, M, C) (C ? N <= M : N < M)

/*
 * Generate a vmadc/vmsbc helper writing a carry/borrow-out mask:
 * vd.mask[i] = carry-out(vs2[i], vs1[i], carry-in).  The carry-in is
 * taken from v0 only when the instruction is masked (!vm).
 */
#define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    uint32_t vl = env->vl;                                    \
    uint32_t vm = vext_vm(desc);                              \
    uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
    uint32_t i;                                               \
                                                              \
    for (i = env->vstart; i < vl; i++) {                      \
        ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
        ETYPE carry = !vm && vext_elem_mask(v0, i);           \
        vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
    }                                                         \
    env->vstart = 0;                                          \
    /*
     * mask destination register are always tail-agnostic
     * set tail elements to 1s
     */                                                       \
    if (vta_all_1s) {                                         \
        for (; i < total_elems; i++) {                        \
            vext_set_elem_mask(vd, i, 1);                     \
        }                                                     \
    }                                                         \
}

GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)

GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)

/* Scalar variant: carry/borrow-out of vs2[i] op sign-extended(rs1) */
#define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
                  void *vs2, CPURISCVState *env, uint32_t desc) \
{                                                               \
    uint32_t vl = env->vl;                                      \
    uint32_t vm = vext_vm(desc);                                \
    uint32_t total_elems = riscv_cpu_cfg(env)->vlen;            \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
    uint32_t i;                                                 \
                                                                \
    for (i = env->vstart; i < vl; i++) {                        \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
        ETYPE carry = !vm && vext_elem_mask(v0, i);             \
        vext_set_elem_mask(vd, i,                               \
                DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
    }                                                           \
    env->vstart = 0;                                            \
    /*
     * mask destination register are always tail-agnostic
     * set tail elements to 1s
     */                                                         \
    if (vta_all_1s) {                                           \
        for (; i < total_elems; i++) {                          \
            vext_set_elem_mask(vd, i, 1);                       \
        }                                                       \
    }                                                           \
}

GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)

GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1001 
/* Vector Bitwise Logical Instructions */
/*
 * Bitwise ops are signedness-independent, so the signed OP_SSS_* type
 * triples are used for all of them.  The second group handles the
 * scalar (rs1) operand forms.
 */
RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
GEN_VEXT_VV(vand_vv_b, 1)
GEN_VEXT_VV(vand_vv_h, 2)
GEN_VEXT_VV(vand_vv_w, 4)
GEN_VEXT_VV(vand_vv_d, 8)
GEN_VEXT_VV(vor_vv_b, 1)
GEN_VEXT_VV(vor_vv_h, 2)
GEN_VEXT_VV(vor_vv_w, 4)
GEN_VEXT_VV(vor_vv_d, 8)
GEN_VEXT_VV(vxor_vv_b, 1)
GEN_VEXT_VV(vxor_vv_h, 2)
GEN_VEXT_VV(vxor_vv_w, 4)
GEN_VEXT_VV(vxor_vv_d, 8)

RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
GEN_VEXT_VX(vand_vx_b, 1)
GEN_VEXT_VX(vand_vx_h, 2)
GEN_VEXT_VX(vand_vx_w, 4)
GEN_VEXT_VX(vand_vx_d, 8)
GEN_VEXT_VX(vor_vx_b, 1)
GEN_VEXT_VX(vor_vx_h, 2)
GEN_VEXT_VX(vor_vx_w, 4)
GEN_VEXT_VX(vor_vx_d, 8)
GEN_VEXT_VX(vxor_vx_b, 1)
GEN_VEXT_VX(vxor_vx_h, 2)
GEN_VEXT_VX(vxor_vx_w, 4)
GEN_VEXT_VX(vxor_vx_d, 8)
1052 
/* Vector Single-Width Bit Shift Instructions */
#define DO_SLL(N, M)  (N << (M))
#define DO_SRL(N, M)  (N >> (M))

/* generate the helpers for shift instructions with two vector operators */
/*
 * TS1 is the destination/shift-amount type, TS2 the shifted-value type.
 * MASK truncates the shift amount to log2(SEW of TS2) bits, so the C
 * shift is never >= the operand width (which would be undefined
 * behavior).  Arithmetic vs logical right shift is selected by the
 * signedness of TS2: vsra instantiates DO_SRL with a signed TS2.
 * Masked-off elements follow the mask-agnostic policy (set to 1s),
 * tail elements follow the tail-agnostic policy.
 */
#define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
                  void *vs2, CPURISCVState *env, uint32_t desc)           \
{                                                                         \
    uint32_t vm = vext_vm(desc);                                          \
    uint32_t vl = env->vl;                                                \
    uint32_t esz = sizeof(TS1);                                           \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
    uint32_t vta = vext_vta(desc);                                        \
    uint32_t vma = vext_vma(desc);                                        \
    uint32_t i;                                                           \
                                                                          \
    for (i = env->vstart; i < vl; i++) {                                  \
        if (!vm && !vext_elem_mask(v0, i)) {                              \
            /* set masked-off elements to 1s */                           \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
            continue;                                                     \
        }                                                                 \
        TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
        TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
        *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
}

GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)

GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)

GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)

/*
 * generate the helpers for shift instructions with one vector and one scalar
 */
/* TD is the destination type, TS2 the shifted-value type (may be wider
 * than TD for the narrowing shifts below). */
#define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
                  void *vs2, CPURISCVState *env,            \
                  uint32_t desc)                            \
{                                                           \
    uint32_t vm = vext_vm(desc);                            \
    uint32_t vl = env->vl;                                  \
    uint32_t esz = sizeof(TD);                              \
    uint32_t total_elems =                                  \
        vext_get_total_elems(env, desc, esz);               \
    uint32_t vta = vext_vta(desc);                          \
    uint32_t vma = vext_vma(desc);                          \
    uint32_t i;                                             \
                                                            \
    for (i = env->vstart; i < vl; i++) {                    \
        if (!vm && !vext_elem_mask(v0, i)) {                \
            /* set masked-off elements to 1s */             \
            vext_set_elems_1s(vd, vma, i * esz,             \
                              (i + 1) * esz);               \
            continue;                                       \
        }                                                   \
        TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
        *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
    }                                                       \
    env->vstart = 0;                                        \
    /* set tail elements to 1s */                           \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
}

GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)

GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)

GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)

/* Vector Narrowing Integer Right Shift Instructions */
/* 2*SEW source shifted by up to 2*SEW-1, result truncated to SEW */
GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1160 
/* Vector Integer Comparison Instructions */
#define DO_MSEQ(N, M) (N == M)
#define DO_MSNE(N, M) (N != M)
#define DO_MSLT(N, M) (N < M)
#define DO_MSLE(N, M) (N <= M)
#define DO_MSGT(N, M) (N > M)

/*
 * Generate a compare helper writing a mask: vd.mask[i] = vs2[i] op vs1[i].
 * Signed vs unsigned compare is selected by ETYPE's signedness.
 * Masked-off mask bits are set to 1 only under the mask-agnostic policy;
 * tail mask bits are always tail-agnostic (set to 1s when requested).
 */
#define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    uint32_t vm = vext_vm(desc);                              \
    uint32_t vl = env->vl;                                    \
    uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
    uint32_t vma = vext_vma(desc);                            \
    uint32_t i;                                               \
                                                              \
    for (i = env->vstart; i < vl; i++) {                      \
        ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
        if (!vm && !vext_elem_mask(v0, i)) {                  \
            /* set masked-off elements to 1s */               \
            if (vma) {                                        \
                vext_set_elem_mask(vd, i, 1);                 \
            }                                                 \
            continue;                                         \
        }                                                     \
        vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
    }                                                         \
    env->vstart = 0;                                          \
    /*
     * mask destination register are always tail-agnostic
     * set tail elements to 1s
     */                                                       \
    if (vta_all_1s) {                                         \
        for (; i < total_elems; i++) {                        \
            vext_set_elem_mask(vd, i, 1);                     \
        }                                                     \
    }                                                         \
}

GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)

GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)

GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)

GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)

GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)

GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)

/* Scalar variant: vd.mask[i] = vs2[i] op sign-extended(rs1) */
#define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
                  CPURISCVState *env, uint32_t desc)                \
{                                                                   \
    uint32_t vm = vext_vm(desc);                                    \
    uint32_t vl = env->vl;                                          \
    uint32_t total_elems = riscv_cpu_cfg(env)->vlen;                \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
    uint32_t vma = vext_vma(desc);                                  \
    uint32_t i;                                                     \
                                                                    \
    for (i = env->vstart; i < vl; i++) {                            \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
        if (!vm && !vext_elem_mask(v0, i)) {                        \
            /* set masked-off elements to 1s */                     \
            if (vma) {                                              \
                vext_set_elem_mask(vd, i, 1);                       \
            }                                                       \
            continue;                                               \
        }                                                           \
        vext_set_elem_mask(vd, i,                                   \
                DO_OP(s2, (ETYPE)(target_long)s1));                 \
    }                                                               \
    env->vstart = 0;                                                \
    /*
     * mask destination register are always tail-agnostic
     * set tail elements to 1s
     */                                                             \
    if (vta_all_1s) {                                               \
        for (; i < total_elems; i++) {                              \
            vext_set_elem_mask(vd, i, 1);                           \
        }                                                           \
    }                                                               \
}

GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)

GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)

GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)

GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)

GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)

GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)

/* vmsgt/vmsgtu exist only in scalar and immediate forms */
GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)

GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1307 
/* Vector Integer Min/Max Instructions */
/* Signed (OP_SSS_*) vs unsigned (OP_UUU_*) selects the compare semantics */
RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
GEN_VEXT_VV(vminu_vv_b, 1)
GEN_VEXT_VV(vminu_vv_h, 2)
GEN_VEXT_VV(vminu_vv_w, 4)
GEN_VEXT_VV(vminu_vv_d, 8)
GEN_VEXT_VV(vmin_vv_b, 1)
GEN_VEXT_VV(vmin_vv_h, 2)
GEN_VEXT_VV(vmin_vv_w, 4)
GEN_VEXT_VV(vmin_vv_d, 8)
GEN_VEXT_VV(vmaxu_vv_b, 1)
GEN_VEXT_VV(vmaxu_vv_h, 2)
GEN_VEXT_VV(vmaxu_vv_w, 4)
GEN_VEXT_VV(vmaxu_vv_d, 8)
GEN_VEXT_VV(vmax_vv_b, 1)
GEN_VEXT_VV(vmax_vv_h, 2)
GEN_VEXT_VV(vmax_vv_w, 4)
GEN_VEXT_VV(vmax_vv_d, 8)

RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
GEN_VEXT_VX(vminu_vx_b, 1)
GEN_VEXT_VX(vminu_vx_h, 2)
GEN_VEXT_VX(vminu_vx_w, 4)
GEN_VEXT_VX(vminu_vx_d, 8)
GEN_VEXT_VX(vmin_vx_b, 1)
GEN_VEXT_VX(vmin_vx_h, 2)
GEN_VEXT_VX(vmin_vx_w, 4)
GEN_VEXT_VX(vmin_vx_d, 8)
GEN_VEXT_VX(vmaxu_vx_b, 1)
GEN_VEXT_VX(vmaxu_vx_h, 2)
GEN_VEXT_VX(vmaxu_vx_w, 4)
GEN_VEXT_VX(vmaxu_vx_d, 8)
GEN_VEXT_VX(vmax_vx_b, 1)
GEN_VEXT_VX(vmax_vx_h, 2)
GEN_VEXT_VX(vmax_vx_w, 4)
GEN_VEXT_VX(vmax_vx_d, 8)
1374 
/* Vector Single-Width Integer Multiply Instructions */
/* vmul keeps the low half of the product (result truncated to SEW). */
#define DO_MUL(N, M) (N * M)
RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
GEN_VEXT_VV(vmul_vv_b, 1)
GEN_VEXT_VV(vmul_vv_h, 2)
GEN_VEXT_VV(vmul_vv_w, 4)
GEN_VEXT_VV(vmul_vv_d, 8)
1385 
/*
 * Signed high-half multiply (vmulh): widen both operands to twice the
 * element width, multiply, and return the upper half of the product.
 */
static int8_t do_mulh_b(int8_t s2, int8_t s1)
{
    int16_t prod = (int16_t)s2 * (int16_t)s1;

    return prod >> 8;
}

static int16_t do_mulh_h(int16_t s2, int16_t s1)
{
    int32_t prod = (int32_t)s2 * (int32_t)s1;

    return prod >> 16;
}

static int32_t do_mulh_w(int32_t s2, int32_t s1)
{
    int64_t prod = (int64_t)s2 * (int64_t)s1;

    return prod >> 32;
}
1400 
/*
 * 64x64 signed high multiply; uses the host-utils muls64() helper so no
 * native 128-bit type is required.
 */
static int64_t do_mulh_d(int64_t s2, int64_t s1)
{
    uint64_t hi, lo;

    muls64(&lo, &hi, s2, s1);
    return hi;
}
1408 
/* Unsigned high-half multiply (vmulhu): widen, multiply, keep upper half. */
static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
{
    uint16_t prod = (uint16_t)s2 * (uint16_t)s1;

    return prod >> 8;
}

static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
{
    uint32_t prod = (uint32_t)s2 * (uint32_t)s1;

    return prod >> 16;
}

static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
{
    uint64_t prod = (uint64_t)s2 * (uint64_t)s1;

    return prod >> 32;
}
1423 
/* 64x64 unsigned high multiply via the host-utils mulu64() helper. */
static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
{
    uint64_t hi, lo;

    mulu64(&lo, &hi, s2, s1);
    return hi;
}
1431 
/*
 * Signed x unsigned high-half multiply (vmulhsu).  For the _h and _w
 * widths the usual arithmetic conversions make the widened product
 * unsigned; the wrap-around result still has the correct truncated
 * high half, which is all we return.
 */
static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
{
    /* both operands promote to int, so this product is signed */
    int32_t prod = (int16_t)s2 * (uint16_t)s1;

    return prod >> 8;
}

static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
{
    uint32_t prod = (int32_t)s2 * (uint32_t)s1;

    return prod >> 16;
}

static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
{
    uint64_t prod = (int64_t)s2 * (uint64_t)s1;

    return prod >> 32;
}
1446 
1447 /*
1448  * Let  A = signed operand,
1449  *      B = unsigned operand
1450  *      P = mulu64(A, B), unsigned product
1451  *
1452  * LET  X = 2 ** 64  - A, 2's complement of A
1453  *      SP = signed product
1454  * THEN
1455  *      IF A < 0
1456  *          SP = -X * B
1457  *             = -(2 ** 64 - A) * B
1458  *             = A * B - 2 ** 64 * B
1459  *             = P - 2 ** 64 * B
1460  *      ELSE
1461  *          SP = P
1462  * THEN
1463  *      HI_P -= (A < 0 ? B : 0)
1464  */
1465 
/*
 * 64-bit signed x unsigned high multiply.  Compute the unsigned
 * 128-bit product with mulu64(), then apply the correction derived in
 * the comment above: when the signed operand is negative the true
 * signed product is P - 2**64 * B, i.e. subtract B from the high word.
 */
static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
{
    uint64_t hi, lo;

    mulu64(&lo, &hi, s2, s1);
    if (s2 < 0) {
        hi -= s1;
    }
    return hi;
}
1475 
/* High-half multiply, vector-vector forms. */
RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
GEN_VEXT_VV(vmulh_vv_b, 1)
GEN_VEXT_VV(vmulh_vv_h, 2)
GEN_VEXT_VV(vmulh_vv_w, 4)
GEN_VEXT_VV(vmulh_vv_d, 8)
GEN_VEXT_VV(vmulhu_vv_b, 1)
GEN_VEXT_VV(vmulhu_vv_h, 2)
GEN_VEXT_VV(vmulhu_vv_w, 4)
GEN_VEXT_VV(vmulhu_vv_d, 8)
GEN_VEXT_VV(vmulhsu_vv_b, 1)
GEN_VEXT_VV(vmulhsu_vv_h, 2)
GEN_VEXT_VV(vmulhsu_vv_w, 4)
GEN_VEXT_VV(vmulhsu_vv_d, 8)

/* Low- and high-half multiply, vector-scalar forms. */
RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
GEN_VEXT_VX(vmul_vx_b, 1)
GEN_VEXT_VX(vmul_vx_h, 2)
GEN_VEXT_VX(vmul_vx_w, 4)
GEN_VEXT_VX(vmul_vx_d, 8)
GEN_VEXT_VX(vmulh_vx_b, 1)
GEN_VEXT_VX(vmulh_vx_h, 2)
GEN_VEXT_VX(vmulh_vx_w, 4)
GEN_VEXT_VX(vmulh_vx_d, 8)
GEN_VEXT_VX(vmulhu_vx_b, 1)
GEN_VEXT_VX(vmulhu_vx_h, 2)
GEN_VEXT_VX(vmulhu_vx_w, 4)
GEN_VEXT_VX(vmulhu_vx_d, 8)
GEN_VEXT_VX(vmulhsu_vx_b, 1)
GEN_VEXT_VX(vmulhsu_vx_h, 2)
GEN_VEXT_VX(vmulhsu_vx_w, 4)
GEN_VEXT_VX(vmulhsu_vx_d, 8)
1533 
/* Vector Integer Divide Instructions */
/*
 * Division corner cases return in-band sentinels instead of trapping:
 * divide-by-zero yields all-ones (-1) for quotients and the unchanged
 * dividend for remainders; signed overflow (most-negative dividend,
 * divisor -1) yields the dividend for DIV and 0 for REM.
 * "N == -N" is true only for 0 and the most-negative value of N's type,
 * so combined with "M == -1" it detects the overflow case.
 * NOTE(review): negating the most-negative value relies on wrapping
 * signed arithmetic (QEMU builds with -fwrapv) — plain ISO C would make
 * it undefined.
 */
#define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
#define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
#define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) : \
        unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
#define DO_REM(N, M)  (unlikely(M == 0) ? N : \
        unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
1541 
/* Divide/remainder, vector-vector forms. */
RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
GEN_VEXT_VV(vdivu_vv_b, 1)
GEN_VEXT_VV(vdivu_vv_h, 2)
GEN_VEXT_VV(vdivu_vv_w, 4)
GEN_VEXT_VV(vdivu_vv_d, 8)
GEN_VEXT_VV(vdiv_vv_b, 1)
GEN_VEXT_VV(vdiv_vv_h, 2)
GEN_VEXT_VV(vdiv_vv_w, 4)
GEN_VEXT_VV(vdiv_vv_d, 8)
GEN_VEXT_VV(vremu_vv_b, 1)
GEN_VEXT_VV(vremu_vv_h, 2)
GEN_VEXT_VV(vremu_vv_w, 4)
GEN_VEXT_VV(vremu_vv_d, 8)
GEN_VEXT_VV(vrem_vv_b, 1)
GEN_VEXT_VV(vrem_vv_h, 2)
GEN_VEXT_VV(vrem_vv_w, 4)
GEN_VEXT_VV(vrem_vv_d, 8)

/* Divide/remainder, vector-scalar forms. */
RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
GEN_VEXT_VX(vdivu_vx_b, 1)
GEN_VEXT_VX(vdivu_vx_h, 2)
GEN_VEXT_VX(vdivu_vx_w, 4)
GEN_VEXT_VX(vdivu_vx_d, 8)
GEN_VEXT_VX(vdiv_vx_b, 1)
GEN_VEXT_VX(vdiv_vx_h, 2)
GEN_VEXT_VX(vdiv_vx_w, 4)
GEN_VEXT_VX(vdiv_vx_d, 8)
GEN_VEXT_VX(vremu_vx_b, 1)
GEN_VEXT_VX(vremu_vx_h, 2)
GEN_VEXT_VX(vremu_vx_w, 4)
GEN_VEXT_VX(vremu_vx_d, 8)
GEN_VEXT_VX(vrem_vx_b, 1)
GEN_VEXT_VX(vrem_vx_h, 2)
GEN_VEXT_VX(vrem_vx_w, 4)
GEN_VEXT_VX(vrem_vx_d, 8)
1607 
/* Vector Widening Integer Multiply Instructions */
/*
 * WOP_* type triples give a destination of twice the source width
 * (note the doubled destination H index, e.g. H2 dest for H1 sources),
 * so DO_MUL here produces the full 2*SEW product.  Only b/h/w source
 * widths exist: the widest destination is 64 bits.
 */
RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
GEN_VEXT_VV(vwmul_vv_b, 2)
GEN_VEXT_VV(vwmul_vv_h, 4)
GEN_VEXT_VV(vwmul_vv_w, 8)
GEN_VEXT_VV(vwmulu_vv_b, 2)
GEN_VEXT_VV(vwmulu_vv_h, 4)
GEN_VEXT_VV(vwmulu_vv_w, 8)
GEN_VEXT_VV(vwmulsu_vv_b, 2)
GEN_VEXT_VV(vwmulsu_vv_h, 4)
GEN_VEXT_VV(vwmulsu_vv_w, 8)

RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
GEN_VEXT_VX(vwmul_vx_b, 2)
GEN_VEXT_VX(vwmul_vx_h, 4)
GEN_VEXT_VX(vwmul_vx_w, 8)
GEN_VEXT_VX(vwmulu_vx_b, 2)
GEN_VEXT_VX(vwmulu_vx_h, 4)
GEN_VEXT_VX(vwmulu_vx_w, 8)
GEN_VEXT_VX(vwmulsu_vx_b, 2)
GEN_VEXT_VX(vwmulsu_vx_h, 4)
GEN_VEXT_VX(vwmulsu_vx_w, 8)
1646 
/* Vector Single-Width Integer Multiply-Add Instructions */
/*
 * OPIVV3: per-element routine for three-operand (destructive) ops.
 * Unlike OPIVV2, the current destination element is also read and
 * passed to OP as the third argument.
 */
#define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
{                                                                  \
    TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
    TD d = *((TD *)vd + HD(i));                                    \
    *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
}

/* OP is called as OP(s2, s1, d), so N = vs2 elem, M = s1, D = vd elem. */
#define DO_MACC(N, M, D) (M * N + D)
#define DO_NMSAC(N, M, D) (-(M * N) + D)
#define DO_MADD(N, M, D) (M * D + N)
#define DO_NMSUB(N, M, D) (-(M * D) + N)
/* Multiply-add, vector-vector forms. */
RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
GEN_VEXT_VV(vmacc_vv_b, 1)
GEN_VEXT_VV(vmacc_vv_h, 2)
GEN_VEXT_VV(vmacc_vv_w, 4)
GEN_VEXT_VV(vmacc_vv_d, 8)
GEN_VEXT_VV(vnmsac_vv_b, 1)
GEN_VEXT_VV(vnmsac_vv_h, 2)
GEN_VEXT_VV(vnmsac_vv_w, 4)
GEN_VEXT_VV(vnmsac_vv_d, 8)
GEN_VEXT_VV(vmadd_vv_b, 1)
GEN_VEXT_VV(vmadd_vv_h, 2)
GEN_VEXT_VV(vmadd_vv_w, 4)
GEN_VEXT_VV(vmadd_vv_d, 8)
GEN_VEXT_VV(vnmsub_vv_b, 1)
GEN_VEXT_VV(vnmsub_vv_h, 2)
GEN_VEXT_VV(vnmsub_vv_w, 4)
GEN_VEXT_VV(vnmsub_vv_d, 8)
1693 
/*
 * OPIVX3: vector-scalar counterpart of OPIVV3 — the scalar rs1 value
 * replaces the vs1 element; the destination element is read and passed
 * to OP as well.
 */
#define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
{                                                                   \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
    TD d = *((TD *)vd + HD(i));                                     \
    *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
}
1701 
/* Multiply-add, vector-scalar forms. */
RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
GEN_VEXT_VX(vmacc_vx_b, 1)
GEN_VEXT_VX(vmacc_vx_h, 2)
GEN_VEXT_VX(vmacc_vx_w, 4)
GEN_VEXT_VX(vmacc_vx_d, 8)
GEN_VEXT_VX(vnmsac_vx_b, 1)
GEN_VEXT_VX(vnmsac_vx_h, 2)
GEN_VEXT_VX(vnmsac_vx_w, 4)
GEN_VEXT_VX(vnmsac_vx_d, 8)
GEN_VEXT_VX(vmadd_vx_b, 1)
GEN_VEXT_VX(vmadd_vx_h, 2)
GEN_VEXT_VX(vmadd_vx_w, 4)
GEN_VEXT_VX(vmadd_vx_d, 8)
GEN_VEXT_VX(vnmsub_vx_b, 1)
GEN_VEXT_VX(vnmsub_vx_h, 2)
GEN_VEXT_VX(vnmsub_vx_w, 4)
GEN_VEXT_VX(vnmsub_vx_d, 8)
1734 
/* Vector Widening Integer Multiply-Add Instructions */
/*
 * Widening multiply-accumulate into a 2*SEW destination (WOP_* triples,
 * doubled destination H index).  Note vwmaccus (unsigned vector *
 * signed scalar) exists only in the vector-scalar form, per the spec.
 */
RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
GEN_VEXT_VV(vwmaccu_vv_b, 2)
GEN_VEXT_VV(vwmaccu_vv_h, 4)
GEN_VEXT_VV(vwmaccu_vv_w, 8)
GEN_VEXT_VV(vwmacc_vv_b, 2)
GEN_VEXT_VV(vwmacc_vv_h, 4)
GEN_VEXT_VV(vwmacc_vv_w, 8)
GEN_VEXT_VV(vwmaccsu_vv_b, 2)
GEN_VEXT_VV(vwmaccsu_vv_h, 4)
GEN_VEXT_VV(vwmaccsu_vv_w, 8)

RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
GEN_VEXT_VX(vwmaccu_vx_b, 2)
GEN_VEXT_VX(vwmaccu_vx_h, 4)
GEN_VEXT_VX(vwmaccu_vx_w, 8)
GEN_VEXT_VX(vwmacc_vx_b, 2)
GEN_VEXT_VX(vwmacc_vx_h, 4)
GEN_VEXT_VX(vwmacc_vx_w, 8)
GEN_VEXT_VX(vwmaccsu_vx_b, 2)
GEN_VEXT_VX(vwmaccsu_vx_h, 4)
GEN_VEXT_VX(vwmaccsu_vx_w, 8)
GEN_VEXT_VX(vwmaccus_vx_b, 2)
GEN_VEXT_VX(vwmaccus_vx_h, 4)
GEN_VEXT_VX(vwmaccus_vx_w, 8)
1779 
/* Vector Integer Merge and Move Instructions */
/*
 * vmv.v.v: unconditionally copy elements vstart..vl-1 of vs1 to vd
 * (no mask operand).  Tail elements are handled by vext_set_elems_1s()
 * according to vta, and vstart is reset to 0 on completion.
 */
#define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
                  uint32_t desc)                                     \
{                                                                    \
    uint32_t vl = env->vl;                                           \
    uint32_t esz = sizeof(ETYPE);                                    \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
    uint32_t vta = vext_vta(desc);                                   \
    uint32_t i;                                                      \
                                                                     \
    for (i = env->vstart; i < vl; i++) {                             \
        ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
        *((ETYPE *)vd + H(i)) = s1;                                  \
    }                                                                \
    env->vstart = 0;                                                 \
    /* set tail elements to 1s */                                    \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
}

GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
1804 
/*
 * vmv.v.x / vmv.v.i: splat the (already widened/sign-extended) scalar
 * s1 into elements vstart..vl-1 of vd; the cast to ETYPE truncates to
 * the element width.  Tail handling as in GEN_VEXT_VMV_VV.
 */
#define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
                  uint32_t desc)                                     \
{                                                                    \
    uint32_t vl = env->vl;                                           \
    uint32_t esz = sizeof(ETYPE);                                    \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
    uint32_t vta = vext_vta(desc);                                   \
    uint32_t i;                                                      \
                                                                     \
    for (i = env->vstart; i < vl; i++) {                             \
        *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
    }                                                                \
    env->vstart = 0;                                                 \
    /* set tail elements to 1s */                                    \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
}

GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
1827 
/*
 * vmerge.vvm: per-element select — where the v0 mask bit is set take
 * the vs1 element, otherwise the vs2 element.  Tail handling as above.
 */
#define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
                  CPURISCVState *env, uint32_t desc)                 \
{                                                                    \
    uint32_t vl = env->vl;                                           \
    uint32_t esz = sizeof(ETYPE);                                    \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
    uint32_t vta = vext_vta(desc);                                   \
    uint32_t i;                                                      \
                                                                     \
    for (i = env->vstart; i < vl; i++) {                             \
        ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
        *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
    }                                                                \
    env->vstart = 0;                                                 \
    /* set tail elements to 1s */                                    \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
}

GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
1851 
/*
 * vmerge.vxm / vmerge.vim: where the v0 mask bit is set take the scalar
 * s1 (sign-extended via target_long, then truncated to ETYPE),
 * otherwise the vs2 element.  Tail handling as above.
 */
#define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
                  void *vs2, CPURISCVState *env, uint32_t desc)      \
{                                                                    \
    uint32_t vl = env->vl;                                           \
    uint32_t esz = sizeof(ETYPE);                                    \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
    uint32_t vta = vext_vta(desc);                                   \
    uint32_t i;                                                      \
                                                                     \
    for (i = env->vstart; i < vl; i++) {                             \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
        ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
                   (ETYPE)(target_long)s1);                          \
        *((ETYPE *)vd + H(i)) = d;                                   \
    }                                                                \
    env->vstart = 0;                                                 \
    /* set tail elements to 1s */                                    \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
}

GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
1877 
/*
 * Vector Fixed-Point Arithmetic Instructions
 */

/* Vector Single-Width Saturating Add and Subtract */

/*
 * As fixed point instructions probably have round mode and saturation,
 * define common macros for fixed point here.
 */
typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
                          CPURISCVState *env, int vxrm);

/*
 * OPIVV2_RM: like OPIVV2, but the per-element OP also receives env
 * (so it can set vxsat on saturation) and the rounding mode vxrm.
 */
#define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
static inline void                                                  \
do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
          CPURISCVState *env, int vxrm)                             \
{                                                                   \
    TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
    *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
}
1900 
1901 static inline void
1902 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
1903              CPURISCVState *env,
1904              uint32_t vl, uint32_t vm, int vxrm,
1905              opivv2_rm_fn *fn, uint32_t vma, uint32_t esz)
1906 {
1907     for (uint32_t i = env->vstart; i < vl; i++) {
1908         if (!vm && !vext_elem_mask(v0, i)) {
1909             /* set masked-off elements to 1s */
1910             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
1911             continue;
1912         }
1913         fn(vd, vs1, vs2, i, env, vxrm);
1914     }
1915     env->vstart = 0;
1916 }
1917 
/*
 * Outer helper for fixed-point vector-vector ops: unpack the descriptor,
 * dispatch on the current rounding mode, then apply the tail policy.
 * Each switch arm passes the rounding mode as a literal constant —
 * NOTE(review): presumably so it can be constant-folded into inlined
 * copies of vext_vv_rm_1; confirm before restructuring.
 */
static inline void
vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
             CPURISCVState *env,
             uint32_t desc,
             opivv2_rm_fn *fn, uint32_t esz)
{
    uint32_t vm = vext_vm(desc);
    uint32_t vl = env->vl;
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
    uint32_t vta = vext_vta(desc);
    uint32_t vma = vext_vma(desc);

    switch (env->vxrm) {
    case 0: /* rnu */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     env, vl, vm, 0, fn, vma, esz);
        break;
    case 1: /* rne */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     env, vl, vm, 1, fn, vma, esz);
        break;
    case 2: /* rdn */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     env, vl, vm, 2, fn, vma, esz);
        break;
    default: /* rod */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     env, vl, vm, 3, fn, vma, esz);
        break;
    }
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
}
1951 
/* generate helpers for fixed point instructions with OPIVV format */
#define GEN_VEXT_VV_RM(NAME, ESZ)                               \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
                  CPURISCVState *env, uint32_t desc)            \
{                                                               \
    vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
                 do_##NAME, ESZ);                               \
}
1960 
1961 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a,
1962                              uint8_t b)
1963 {
1964     uint8_t res = a + b;
1965     if (res < a) {
1966         res = UINT8_MAX;
1967         env->vxsat = 0x1;
1968     }
1969     return res;
1970 }
1971 
1972 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
1973                                uint16_t b)
1974 {
1975     uint16_t res = a + b;
1976     if (res < a) {
1977         res = UINT16_MAX;
1978         env->vxsat = 0x1;
1979     }
1980     return res;
1981 }
1982 
1983 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
1984                                uint32_t b)
1985 {
1986     uint32_t res = a + b;
1987     if (res < a) {
1988         res = UINT32_MAX;
1989         env->vxsat = 0x1;
1990     }
1991     return res;
1992 }
1993 
1994 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
1995                                uint64_t b)
1996 {
1997     uint64_t res = a + b;
1998     if (res < a) {
1999         res = UINT64_MAX;
2000         env->vxsat = 0x1;
2001     }
2002     return res;
2003 }
2004 
/* Saturating unsigned add, vector-vector forms. */
RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2013 
typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
                          CPURISCVState *env, int vxrm);

/*
 * OPIVX2_RM: vector-scalar counterpart of OPIVV2_RM — the scalar s1
 * (truncated to T1 then widened to TX1) replaces the vs1 element.
 */
#define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
static inline void                                                  \
do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
          CPURISCVState *env, int vxrm)                             \
{                                                                   \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
    *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
}
2025 
2026 static inline void
2027 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2028              CPURISCVState *env,
2029              uint32_t vl, uint32_t vm, int vxrm,
2030              opivx2_rm_fn *fn, uint32_t vma, uint32_t esz)
2031 {
2032     for (uint32_t i = env->vstart; i < vl; i++) {
2033         if (!vm && !vext_elem_mask(v0, i)) {
2034             /* set masked-off elements to 1s */
2035             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2036             continue;
2037         }
2038         fn(vd, s1, vs2, i, env, vxrm);
2039     }
2040     env->vstart = 0;
2041 }
2042 
/*
 * Outer helper for fixed-point vector-scalar ops; mirrors vext_vv_rm_2.
 * Each switch arm passes the rounding mode as a literal constant —
 * NOTE(review): presumably for constant folding into inlined copies of
 * vext_vx_rm_1; confirm before restructuring.
 */
static inline void
vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
             CPURISCVState *env,
             uint32_t desc,
             opivx2_rm_fn *fn, uint32_t esz)
{
    uint32_t vm = vext_vm(desc);
    uint32_t vl = env->vl;
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
    uint32_t vta = vext_vta(desc);
    uint32_t vma = vext_vma(desc);

    switch (env->vxrm) {
    case 0: /* rnu */
        vext_vx_rm_1(vd, v0, s1, vs2,
                     env, vl, vm, 0, fn, vma, esz);
        break;
    case 1: /* rne */
        vext_vx_rm_1(vd, v0, s1, vs2,
                     env, vl, vm, 1, fn, vma, esz);
        break;
    case 2: /* rdn */
        vext_vx_rm_1(vd, v0, s1, vs2,
                     env, vl, vm, 2, fn, vma, esz);
        break;
    default: /* rod */
        vext_vx_rm_1(vd, v0, s1, vs2,
                     env, vl, vm, 3, fn, vma, esz);
        break;
    }
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
}
2076 
/* generate helpers for fixed point instructions with OPIVX format */
#define GEN_VEXT_VX_RM(NAME, ESZ)                         \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
                 do_##NAME, ESZ);                         \
}

/* Saturating unsigned add, vector-scalar forms. */
RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2095 
2096 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2097 {
2098     int8_t res = a + b;
2099     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2100         res = a > 0 ? INT8_MAX : INT8_MIN;
2101         env->vxsat = 0x1;
2102     }
2103     return res;
2104 }
2105 
2106 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a,
2107                              int16_t b)
2108 {
2109     int16_t res = a + b;
2110     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2111         res = a > 0 ? INT16_MAX : INT16_MIN;
2112         env->vxsat = 0x1;
2113     }
2114     return res;
2115 }
2116 
2117 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a,
2118                              int32_t b)
2119 {
2120     int32_t res = a + b;
2121     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2122         res = a > 0 ? INT32_MAX : INT32_MIN;
2123         env->vxsat = 0x1;
2124     }
2125     return res;
2126 }
2127 
2128 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a,
2129                              int64_t b)
2130 {
2131     int64_t res = a + b;
2132     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2133         res = a > 0 ? INT64_MAX : INT64_MIN;
2134         env->vxsat = 0x1;
2135     }
2136     return res;
2137 }
2138 
2139 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2140 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2141 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2142 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2143 GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2144 GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2145 GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2146 GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2147 
2148 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2149 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2150 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2151 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2152 GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2153 GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2154 GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2155 GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2156 
static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a,
                             uint8_t b)
{
    uint8_t res = a - b;
    /* unsigned subtract wrapped around iff the result exceeds the minuend */
    if (res > a) {
        res = 0;
        env->vxsat = 0x1;
    }
    return res;
}

static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
                               uint16_t b)
{
    uint16_t res = a - b;
    if (res > a) {
        res = 0;
        env->vxsat = 0x1;
    }
    return res;
}

static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
                               uint32_t b)
{
    uint32_t res = a - b;
    if (res > a) {
        res = 0;
        env->vxsat = 0x1;
    }
    return res;
}

static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
                               uint64_t b)
{
    uint64_t res = a - b;
    if (res > a) {
        res = 0;
        env->vxsat = 0x1;
    }
    return res;
}

/* vssubu.vv / vssubu.vx: unsigned saturating subtract (clamp to 0) */
RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
GEN_VEXT_VV_RM(vssubu_vv_b, 1)
GEN_VEXT_VV_RM(vssubu_vv_h, 2)
GEN_VEXT_VV_RM(vssubu_vv_w, 4)
GEN_VEXT_VV_RM(vssubu_vv_d, 8)

RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
GEN_VEXT_VX_RM(vssubu_vx_b, 1)
GEN_VEXT_VX_RM(vssubu_vx_h, 2)
GEN_VEXT_VX_RM(vssubu_vx_w, 4)
GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2218 
2219 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2220 {
2221     int8_t res = a - b;
2222     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2223         res = a >= 0 ? INT8_MAX : INT8_MIN;
2224         env->vxsat = 0x1;
2225     }
2226     return res;
2227 }
2228 
2229 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a,
2230                              int16_t b)
2231 {
2232     int16_t res = a - b;
2233     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2234         res = a >= 0 ? INT16_MAX : INT16_MIN;
2235         env->vxsat = 0x1;
2236     }
2237     return res;
2238 }
2239 
2240 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a,
2241                              int32_t b)
2242 {
2243     int32_t res = a - b;
2244     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2245         res = a >= 0 ? INT32_MAX : INT32_MIN;
2246         env->vxsat = 0x1;
2247     }
2248     return res;
2249 }
2250 
2251 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a,
2252                              int64_t b)
2253 {
2254     int64_t res = a - b;
2255     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2256         res = a >= 0 ? INT64_MAX : INT64_MIN;
2257         env->vxsat = 0x1;
2258     }
2259     return res;
2260 }
2261 
2262 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2263 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2264 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2265 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2266 GEN_VEXT_VV_RM(vssub_vv_b, 1)
2267 GEN_VEXT_VV_RM(vssub_vv_h, 2)
2268 GEN_VEXT_VV_RM(vssub_vv_w, 4)
2269 GEN_VEXT_VV_RM(vssub_vv_d, 8)
2270 
2271 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2272 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2273 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2274 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2275 GEN_VEXT_VX_RM(vssub_vx_b, 1)
2276 GEN_VEXT_VX_RM(vssub_vx_h, 2)
2277 GEN_VEXT_VX_RM(vssub_vx_w, 4)
2278 GEN_VEXT_VX_RM(vssub_vx_d, 8)
2279 
/* Vector Single-Width Averaging Add and Subtract */

/*
 * get_round - rounding increment (0 or 1) to add after shifting 'v'
 * right by 'shift' bits, according to the fixed-point rounding mode
 * 'vxrm' (0 = rnu, 1 = rne, 2 = rdn, 3 = rod).
 *
 *   d  - the bit of 'v' that becomes the LSB after the shift
 *   d1 - the most significant discarded bit
 *   D1 - all discarded bits; D2 - discarded bits below d1
 *
 * NOTE(review): extract64(v, shift, 1) above the guard requires
 * shift <= 63; visible callers mask the shift amount accordingly.
 */
static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
{
    uint8_t d = extract64(v, shift, 1);
    uint8_t d1;
    uint64_t D1, D2;

    /* No bits discarded: nothing to round. */
    if (shift == 0 || shift > 64) {
        return 0;
    }

    d1 = extract64(v, shift - 1, 1);
    D1 = extract64(v, 0, shift);
    if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
        return d1;
    } else if (vxrm == 1) { /* round-to-nearest-even */
        if (shift > 1) {
            D2 = extract64(v, 0, shift - 1);
            return d1 & ((D2 != 0) | d);
        } else {
            return d1 & d;
        }
    } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
        return !d & (D1 != 0);
    }
    return 0; /* round-down (truncate) */
}
2307 
2308 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a,
2309                              int32_t b)
2310 {
2311     int64_t res = (int64_t)a + b;
2312     uint8_t round = get_round(vxrm, res, 1);
2313 
2314     return (res >> 1) + round;
2315 }
2316 
2317 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a,
2318                              int64_t b)
2319 {
2320     int64_t res = a + b;
2321     uint8_t round = get_round(vxrm, res, 1);
2322     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2323 
2324     /* With signed overflow, bit 64 is inverse of bit 63. */
2325     return ((res >> 1) ^ over) + round;
2326 }
2327 
2328 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2329 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2330 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2331 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2332 GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2333 GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2334 GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2335 GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2336 
2337 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2338 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2339 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2340 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2341 GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2342 GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2343 GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2344 GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2345 
2346 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2347                                uint32_t a, uint32_t b)
2348 {
2349     uint64_t res = (uint64_t)a + b;
2350     uint8_t round = get_round(vxrm, res, 1);
2351 
2352     return (res >> 1) + round;
2353 }
2354 
2355 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2356                                uint64_t a, uint64_t b)
2357 {
2358     uint64_t res = a + b;
2359     uint8_t round = get_round(vxrm, res, 1);
2360     uint64_t over = (uint64_t)(res < a) << 63;
2361 
2362     return ((res >> 1) | over) + round;
2363 }
2364 
2365 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2366 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2367 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2368 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2369 GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2370 GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2371 GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2372 GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2373 
2374 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2375 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2376 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2377 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2378 GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2379 GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2380 GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2381 GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2382 
2383 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a,
2384                              int32_t b)
2385 {
2386     int64_t res = (int64_t)a - b;
2387     uint8_t round = get_round(vxrm, res, 1);
2388 
2389     return (res >> 1) + round;
2390 }
2391 
2392 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a,
2393                              int64_t b)
2394 {
2395     int64_t res = (int64_t)a - b;
2396     uint8_t round = get_round(vxrm, res, 1);
2397     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2398 
2399     /* With signed overflow, bit 64 is inverse of bit 63. */
2400     return ((res >> 1) ^ over) + round;
2401 }
2402 
2403 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2404 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2405 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2406 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2407 GEN_VEXT_VV_RM(vasub_vv_b, 1)
2408 GEN_VEXT_VV_RM(vasub_vv_h, 2)
2409 GEN_VEXT_VV_RM(vasub_vv_w, 4)
2410 GEN_VEXT_VV_RM(vasub_vv_d, 8)
2411 
2412 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2413 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2414 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2415 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2416 GEN_VEXT_VX_RM(vasub_vx_b, 1)
2417 GEN_VEXT_VX_RM(vasub_vx_h, 2)
2418 GEN_VEXT_VX_RM(vasub_vx_w, 4)
2419 GEN_VEXT_VX_RM(vasub_vx_d, 8)
2420 
2421 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2422                                uint32_t a, uint32_t b)
2423 {
2424     int64_t res = (int64_t)a - b;
2425     uint8_t round = get_round(vxrm, res, 1);
2426 
2427     return (res >> 1) + round;
2428 }
2429 
2430 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2431                                uint64_t a, uint64_t b)
2432 {
2433     uint64_t res = (uint64_t)a - b;
2434     uint8_t round = get_round(vxrm, res, 1);
2435     uint64_t over = (uint64_t)(res > a) << 63;
2436 
2437     return ((res >> 1) | over) + round;
2438 }
2439 
2440 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2441 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2442 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2443 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2444 GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2445 GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2446 GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2447 GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2448 
2449 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2450 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2451 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2452 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2453 GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2454 GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2455 GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2456 GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2457 
2458 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2459 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2460 {
2461     uint8_t round;
2462     int16_t res;
2463 
2464     res = (int16_t)a * (int16_t)b;
2465     round = get_round(vxrm, res, 7);
2466     res = (res >> 7) + round;
2467 
2468     if (res > INT8_MAX) {
2469         env->vxsat = 0x1;
2470         return INT8_MAX;
2471     } else if (res < INT8_MIN) {
2472         env->vxsat = 0x1;
2473         return INT8_MIN;
2474     } else {
2475         return res;
2476     }
2477 }
2478 
2479 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2480 {
2481     uint8_t round;
2482     int32_t res;
2483 
2484     res = (int32_t)a * (int32_t)b;
2485     round = get_round(vxrm, res, 15);
2486     res = (res >> 15) + round;
2487 
2488     if (res > INT16_MAX) {
2489         env->vxsat = 0x1;
2490         return INT16_MAX;
2491     } else if (res < INT16_MIN) {
2492         env->vxsat = 0x1;
2493         return INT16_MIN;
2494     } else {
2495         return res;
2496     }
2497 }
2498 
2499 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2500 {
2501     uint8_t round;
2502     int64_t res;
2503 
2504     res = (int64_t)a * (int64_t)b;
2505     round = get_round(vxrm, res, 31);
2506     res = (res >> 31) + round;
2507 
2508     if (res > INT32_MAX) {
2509         env->vxsat = 0x1;
2510         return INT32_MAX;
2511     } else if (res < INT32_MIN) {
2512         env->vxsat = 0x1;
2513         return INT32_MIN;
2514     } else {
2515         return res;
2516     }
2517 }
2518 
2519 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2520 {
2521     uint8_t round;
2522     uint64_t hi_64, lo_64;
2523     int64_t res;
2524 
2525     if (a == INT64_MIN && b == INT64_MIN) {
2526         env->vxsat = 1;
2527         return INT64_MAX;
2528     }
2529 
2530     muls64(&lo_64, &hi_64, a, b);
2531     round = get_round(vxrm, lo_64, 63);
2532     /*
2533      * Cannot overflow, as there are always
2534      * 2 sign bits after multiply.
2535      */
2536     res = (hi_64 << 1) | (lo_64 >> 63);
2537     if (round) {
2538         if (res == INT64_MAX) {
2539             env->vxsat = 1;
2540         } else {
2541             res += 1;
2542         }
2543     }
2544     return res;
2545 }
2546 
2547 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2548 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2549 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2550 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2551 GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2552 GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2553 GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2554 GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2555 
2556 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2557 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2558 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2559 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2560 GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2561 GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2562 GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2563 GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2564 
2565 /* Vector Single-Width Scaling Shift Instructions */
2566 static inline uint8_t
2567 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2568 {
2569     uint8_t round, shift = b & 0x7;
2570     uint8_t res;
2571 
2572     round = get_round(vxrm, a, shift);
2573     res = (a >> shift) + round;
2574     return res;
2575 }
2576 static inline uint16_t
2577 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2578 {
2579     uint8_t round, shift = b & 0xf;
2580 
2581     round = get_round(vxrm, a, shift);
2582     return (a >> shift) + round;
2583 }
2584 static inline uint32_t
2585 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2586 {
2587     uint8_t round, shift = b & 0x1f;
2588 
2589     round = get_round(vxrm, a, shift);
2590     return (a >> shift) + round;
2591 }
2592 static inline uint64_t
2593 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2594 {
2595     uint8_t round, shift = b & 0x3f;
2596 
2597     round = get_round(vxrm, a, shift);
2598     return (a >> shift) + round;
2599 }
2600 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2601 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2602 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2603 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2604 GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2605 GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2606 GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2607 GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2608 
2609 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2610 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2611 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2612 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2613 GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2614 GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2615 GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2616 GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2617 
2618 static inline int8_t
2619 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2620 {
2621     uint8_t round, shift = b & 0x7;
2622 
2623     round = get_round(vxrm, a, shift);
2624     return (a >> shift) + round;
2625 }
2626 static inline int16_t
2627 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2628 {
2629     uint8_t round, shift = b & 0xf;
2630 
2631     round = get_round(vxrm, a, shift);
2632     return (a >> shift) + round;
2633 }
2634 static inline int32_t
2635 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2636 {
2637     uint8_t round, shift = b & 0x1f;
2638 
2639     round = get_round(vxrm, a, shift);
2640     return (a >> shift) + round;
2641 }
2642 static inline int64_t
2643 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2644 {
2645     uint8_t round, shift = b & 0x3f;
2646 
2647     round = get_round(vxrm, a, shift);
2648     return (a >> shift) + round;
2649 }
2650 
2651 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2652 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2653 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2654 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2655 GEN_VEXT_VV_RM(vssra_vv_b, 1)
2656 GEN_VEXT_VV_RM(vssra_vv_h, 2)
2657 GEN_VEXT_VV_RM(vssra_vv_w, 4)
2658 GEN_VEXT_VV_RM(vssra_vv_d, 8)
2659 
2660 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2661 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2662 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2663 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2664 GEN_VEXT_VX_RM(vssra_vx_b, 1)
2665 GEN_VEXT_VX_RM(vssra_vx_h, 2)
2666 GEN_VEXT_VX_RM(vssra_vx_w, 4)
2667 GEN_VEXT_VX_RM(vssra_vx_d, 8)
2668 
2669 /* Vector Narrowing Fixed-Point Clip Instructions */
2670 static inline int8_t
2671 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2672 {
2673     uint8_t round, shift = b & 0xf;
2674     int16_t res;
2675 
2676     round = get_round(vxrm, a, shift);
2677     res = (a >> shift) + round;
2678     if (res > INT8_MAX) {
2679         env->vxsat = 0x1;
2680         return INT8_MAX;
2681     } else if (res < INT8_MIN) {
2682         env->vxsat = 0x1;
2683         return INT8_MIN;
2684     } else {
2685         return res;
2686     }
2687 }
2688 
2689 static inline int16_t
2690 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2691 {
2692     uint8_t round, shift = b & 0x1f;
2693     int32_t res;
2694 
2695     round = get_round(vxrm, a, shift);
2696     res = (a >> shift) + round;
2697     if (res > INT16_MAX) {
2698         env->vxsat = 0x1;
2699         return INT16_MAX;
2700     } else if (res < INT16_MIN) {
2701         env->vxsat = 0x1;
2702         return INT16_MIN;
2703     } else {
2704         return res;
2705     }
2706 }
2707 
2708 static inline int32_t
2709 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2710 {
2711     uint8_t round, shift = b & 0x3f;
2712     int64_t res;
2713 
2714     round = get_round(vxrm, a, shift);
2715     res = (a >> shift) + round;
2716     if (res > INT32_MAX) {
2717         env->vxsat = 0x1;
2718         return INT32_MAX;
2719     } else if (res < INT32_MIN) {
2720         env->vxsat = 0x1;
2721         return INT32_MIN;
2722     } else {
2723         return res;
2724     }
2725 }
2726 
2727 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
2728 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
2729 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
2730 GEN_VEXT_VV_RM(vnclip_wv_b, 1)
2731 GEN_VEXT_VV_RM(vnclip_wv_h, 2)
2732 GEN_VEXT_VV_RM(vnclip_wv_w, 4)
2733 
2734 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
2735 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
2736 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
2737 GEN_VEXT_VX_RM(vnclip_wx_b, 1)
2738 GEN_VEXT_VX_RM(vnclip_wx_h, 2)
2739 GEN_VEXT_VX_RM(vnclip_wx_w, 4)
2740 
2741 static inline uint8_t
2742 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2743 {
2744     uint8_t round, shift = b & 0xf;
2745     uint16_t res;
2746 
2747     round = get_round(vxrm, a, shift);
2748     res = (a >> shift) + round;
2749     if (res > UINT8_MAX) {
2750         env->vxsat = 0x1;
2751         return UINT8_MAX;
2752     } else {
2753         return res;
2754     }
2755 }
2756 
2757 static inline uint16_t
2758 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
2759 {
2760     uint8_t round, shift = b & 0x1f;
2761     uint32_t res;
2762 
2763     round = get_round(vxrm, a, shift);
2764     res = (a >> shift) + round;
2765     if (res > UINT16_MAX) {
2766         env->vxsat = 0x1;
2767         return UINT16_MAX;
2768     } else {
2769         return res;
2770     }
2771 }
2772 
2773 static inline uint32_t
2774 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
2775 {
2776     uint8_t round, shift = b & 0x3f;
2777     uint64_t res;
2778 
2779     round = get_round(vxrm, a, shift);
2780     res = (a >> shift) + round;
2781     if (res > UINT32_MAX) {
2782         env->vxsat = 0x1;
2783         return UINT32_MAX;
2784     } else {
2785         return res;
2786     }
2787 }
2788 
2789 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
2790 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
2791 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
2792 GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
2793 GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
2794 GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
2795 
2796 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
2797 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
2798 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
2799 GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
2800 GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
2801 GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
2802 
2803 /*
2804  * Vector Float Point Arithmetic Instructions
2805  */
2806 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
2807 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
2808 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
2809                       CPURISCVState *env)                      \
2810 {                                                              \
2811     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
2812     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
2813     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
2814 }
2815 
2816 #define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
2817 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
2818                   void *vs2, CPURISCVState *env,          \
2819                   uint32_t desc)                          \
2820 {                                                         \
2821     uint32_t vm = vext_vm(desc);                          \
2822     uint32_t vl = env->vl;                                \
2823     uint32_t total_elems =                                \
2824         vext_get_total_elems(env, desc, ESZ);             \
2825     uint32_t vta = vext_vta(desc);                        \
2826     uint32_t vma = vext_vma(desc);                        \
2827     uint32_t i;                                           \
2828                                                           \
2829     for (i = env->vstart; i < vl; i++) {                  \
2830         if (!vm && !vext_elem_mask(v0, i)) {              \
2831             /* set masked-off elements to 1s */           \
2832             vext_set_elems_1s(vd, vma, i * ESZ,           \
2833                               (i + 1) * ESZ);             \
2834             continue;                                     \
2835         }                                                 \
2836         do_##NAME(vd, vs1, vs2, i, env);                  \
2837     }                                                     \
2838     env->vstart = 0;                                      \
2839     /* set tail elements to 1s */                         \
2840     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
2841                       total_elems * ESZ);                 \
2842 }
2843 
2844 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
2845 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
2846 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
2847 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
2848 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
2849 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
2850 
/*
 * OPFVF2: generate the per-element worker do_##NAME for a vector-scalar
 * floating-point op; the scalar s1 is truncated to the element type and
 * OP receives (vs2 element, scalar) plus the softfloat status.
 */
#define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
                      CPURISCVState *env)                      \
{                                                              \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
    *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
}

/*
 * GEN_VEXT_VF: out-of-line helper looping do_##NAME over the active
 * elements, with the same mask/tail agnostic fill as GEN_VEXT_VV_ENV.
 */
#define GEN_VEXT_VF(NAME, ESZ)                            \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    uint32_t vm = vext_vm(desc);                          \
    uint32_t vl = env->vl;                                \
    uint32_t total_elems =                                \
        vext_get_total_elems(env, desc, ESZ);             \
    uint32_t vta = vext_vta(desc);                        \
    uint32_t vma = vext_vma(desc);                        \
    uint32_t i;                                           \
                                                          \
    for (i = env->vstart; i < vl; i++) {                  \
        if (!vm && !vext_elem_mask(v0, i)) {              \
            /* set masked-off elements to 1s */           \
            vext_set_elems_1s(vd, vma, i * ESZ,           \
                              (i + 1) * ESZ);             \
            continue;                                     \
        }                                                 \
        do_##NAME(vd, s1, vs2, i, env);                   \
    }                                                     \
    env->vstart = 0;                                      \
    /* set tail elements to 1s */                         \
    vext_set_elems_1s(vd, vta, vl * ESZ,                  \
                      total_elems * ESZ);                 \
}

/* vfadd.vf */
RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
GEN_VEXT_VF(vfadd_vf_h, 2)
GEN_VEXT_VF(vfadd_vf_w, 4)
GEN_VEXT_VF(vfadd_vf_d, 8)

/* vfsub.vv / vfsub.vf */
RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
GEN_VEXT_VF(vfsub_vf_h, 2)
GEN_VEXT_VF(vfsub_vf_w, 4)
GEN_VEXT_VF(vfsub_vf_d, 8)
2906 
2907 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
2908 {
2909     return float16_sub(b, a, s);
2910 }
2911 
2912 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
2913 {
2914     return float32_sub(b, a, s);
2915 }
2916 
2917 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
2918 {
2919     return float64_sub(b, a, s);
2920 }
2921 
2922 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
2923 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
2924 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
2925 GEN_VEXT_VF(vfrsub_vf_h, 2)
2926 GEN_VEXT_VF(vfrsub_vf_w, 4)
2927 GEN_VEXT_VF(vfrsub_vf_d, 8)
2928 
2929 /* Vector Widening Floating-Point Add/Subtract Instructions */
2930 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
2931 {
2932     return float32_add(float16_to_float32(a, true, s),
2933                        float16_to_float32(b, true, s), s);
2934 }
2935 
2936 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
2937 {
2938     return float64_add(float32_to_float64(a, s),
2939                        float32_to_float64(b, s), s);
2940 
2941 }
2942 
2943 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
2944 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
2945 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
2946 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
2947 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
2948 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
2949 GEN_VEXT_VF(vfwadd_vf_h, 4)
2950 GEN_VEXT_VF(vfwadd_vf_w, 8)
2951 
2952 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
2953 {
2954     return float32_sub(float16_to_float32(a, true, s),
2955                        float16_to_float32(b, true, s), s);
2956 }
2957 
2958 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
2959 {
2960     return float64_sub(float32_to_float64(a, s),
2961                        float32_to_float64(b, s), s);
2962 
2963 }
2964 
/* vfwsub.vv / vfwsub.vf expansions (16->32 and 32->64 bit widening). */
RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
GEN_VEXT_VF(vfwsub_vf_h, 4)
GEN_VEXT_VF(vfwsub_vf_w, 8)
2973 
2974 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
2975 {
2976     return float32_add(a, float16_to_float32(b, true, s), s);
2977 }
2978 
2979 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
2980 {
2981     return float64_add(a, float32_to_float64(b, s), s);
2982 }
2983 
/* vfwadd.wv / vfwadd.wf expansions (wide first operand). */
RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
GEN_VEXT_VF(vfwadd_wf_h, 4)
GEN_VEXT_VF(vfwadd_wf_w, 8)
2992 
2993 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
2994 {
2995     return float32_sub(a, float16_to_float32(b, true, s), s);
2996 }
2997 
2998 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
2999 {
3000     return float64_sub(a, float32_to_float64(b, s), s);
3001 }
3002 
/* vfwsub.wv / vfwsub.wf expansions (wide first operand). */
RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
GEN_VEXT_VF(vfwsub_wf_h, 4)
GEN_VEXT_VF(vfwsub_wf_w, 8)
3011 
/* Vector Single-Width Floating-Point Multiply/Divide Instructions */
/* vfmul expansions: bind the softfloat multiply directly per SEW. */
RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
GEN_VEXT_VF(vfmul_vf_h, 2)
GEN_VEXT_VF(vfmul_vf_w, 4)
GEN_VEXT_VF(vfmul_vf_d, 8)

/* vfdiv expansions: bind the softfloat divide directly per SEW. */
RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
GEN_VEXT_VF(vfdiv_vf_h, 2)
GEN_VEXT_VF(vfdiv_vf_w, 4)
GEN_VEXT_VF(vfdiv_vf_d, 8)
3038 
3039 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3040 {
3041     return float16_div(b, a, s);
3042 }
3043 
3044 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3045 {
3046     return float32_div(b, a, s);
3047 }
3048 
3049 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3050 {
3051     return float64_div(b, a, s);
3052 }
3053 
/* vfrdiv.vf expansions using the reversed-operand divide helpers. */
RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
GEN_VEXT_VF(vfrdiv_vf_h, 2)
GEN_VEXT_VF(vfrdiv_vf_w, 4)
GEN_VEXT_VF(vfrdiv_vf_d, 8)
3060 
3061 /* Vector Widening Floating-Point Multiply */
3062 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3063 {
3064     return float32_mul(float16_to_float32(a, true, s),
3065                        float16_to_float32(b, true, s), s);
3066 }
3067 
3068 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3069 {
3070     return float64_mul(float32_to_float64(a, s),
3071                        float32_to_float64(b, s), s);
3072 
3073 }
/* vfwmul.vv / vfwmul.vf expansions (16->32 and 32->64 bit widening). */
RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
GEN_VEXT_VF(vfwmul_vf_h, 4)
GEN_VEXT_VF(vfwmul_vf_w, 8)
3082 
/* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
/*
 * OPFVV3: generate do_NAME for a three-operand (fused) element op.
 * OP is called as OP(vs2 element, vs1 element, current vd element),
 * so the destination element acts as the third (accumulator) input.
 */
#define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
                      CPURISCVState *env)                          \
{                                                                  \
    TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
    TD d = *((TD *)vd + HD(i));                                    \
    *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
}
3093 
3094 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3095 {
3096     return float16_muladd(a, b, d, 0, s);
3097 }
3098 
3099 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3100 {
3101     return float32_muladd(a, b, d, 0, s);
3102 }
3103 
3104 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3105 {
3106     return float64_muladd(a, b, d, 0, s);
3107 }
3108 
/* vfmacc.vv expansions per SEW. */
RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3115 
/*
 * OPFVF3: vector-scalar variant of OPFVV3.  OP is called as
 * OP(vs2 element, truncated scalar s1, current vd element).
 */
#define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
                      CPURISCVState *env)                         \
{                                                                 \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
    TD d = *((TD *)vd + HD(i));                                   \
    *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
}
3124 
/* vfmacc.vf expansions per SEW. */
RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
GEN_VEXT_VF(vfmacc_vf_h, 2)
GEN_VEXT_VF(vfmacc_vf_w, 4)
GEN_VEXT_VF(vfmacc_vf_d, 8)
3131 
3132 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3133 {
3134     return float16_muladd(a, b, d, float_muladd_negate_c |
3135                                    float_muladd_negate_product, s);
3136 }
3137 
3138 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3139 {
3140     return float32_muladd(a, b, d, float_muladd_negate_c |
3141                                    float_muladd_negate_product, s);
3142 }
3143 
3144 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3145 {
3146     return float64_muladd(a, b, d, float_muladd_negate_c |
3147                                    float_muladd_negate_product, s);
3148 }
3149 
/* vfnmacc.vv / vfnmacc.vf expansions per SEW. */
RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
GEN_VEXT_VF(vfnmacc_vf_h, 2)
GEN_VEXT_VF(vfnmacc_vf_w, 4)
GEN_VEXT_VF(vfnmacc_vf_d, 8)
3162 
3163 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3164 {
3165     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3166 }
3167 
3168 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3169 {
3170     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3171 }
3172 
3173 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3174 {
3175     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3176 }
3177 
/* vfmsac.vv / vfmsac.vf expansions per SEW. */
RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
GEN_VEXT_VF(vfmsac_vf_h, 2)
GEN_VEXT_VF(vfmsac_vf_w, 4)
GEN_VEXT_VF(vfmsac_vf_d, 8)
3190 
3191 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3192 {
3193     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3194 }
3195 
3196 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3197 {
3198     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3199 }
3200 
3201 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3202 {
3203     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3204 }
3205 
/* vfnmsac.vv / vfnmsac.vf expansions per SEW. */
RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
GEN_VEXT_VF(vfnmsac_vf_h, 2)
GEN_VEXT_VF(vfnmsac_vf_w, 4)
GEN_VEXT_VF(vfnmsac_vf_d, 8)
3218 
3219 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3220 {
3221     return float16_muladd(d, b, a, 0, s);
3222 }
3223 
3224 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3225 {
3226     return float32_muladd(d, b, a, 0, s);
3227 }
3228 
3229 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3230 {
3231     return float64_muladd(d, b, a, 0, s);
3232 }
3233 
/* vfmadd.vv / vfmadd.vf expansions per SEW. */
RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
GEN_VEXT_VF(vfmadd_vf_h, 2)
GEN_VEXT_VF(vfmadd_vf_w, 4)
GEN_VEXT_VF(vfmadd_vf_d, 8)
3246 
3247 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3248 {
3249     return float16_muladd(d, b, a, float_muladd_negate_c |
3250                                    float_muladd_negate_product, s);
3251 }
3252 
3253 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3254 {
3255     return float32_muladd(d, b, a, float_muladd_negate_c |
3256                                    float_muladd_negate_product, s);
3257 }
3258 
3259 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3260 {
3261     return float64_muladd(d, b, a, float_muladd_negate_c |
3262                                    float_muladd_negate_product, s);
3263 }
3264 
/* vfnmadd.vv / vfnmadd.vf expansions per SEW. */
RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
GEN_VEXT_VF(vfnmadd_vf_h, 2)
GEN_VEXT_VF(vfnmadd_vf_w, 4)
GEN_VEXT_VF(vfnmadd_vf_d, 8)
3277 
3278 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3279 {
3280     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3281 }
3282 
3283 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3284 {
3285     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3286 }
3287 
3288 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3289 {
3290     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3291 }
3292 
/* vfmsub.vv / vfmsub.vf expansions per SEW. */
RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
GEN_VEXT_VF(vfmsub_vf_h, 2)
GEN_VEXT_VF(vfmsub_vf_w, 4)
GEN_VEXT_VF(vfmsub_vf_d, 8)
3305 
3306 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3307 {
3308     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3309 }
3310 
3311 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3312 {
3313     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3314 }
3315 
3316 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3317 {
3318     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3319 }
3320 
/* vfnmsub.vv / vfnmsub.vf expansions per SEW. */
RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
GEN_VEXT_VF(vfnmsub_vf_h, 2)
GEN_VEXT_VF(vfnmsub_vf_w, 4)
GEN_VEXT_VF(vfnmsub_vf_d, 8)
3333 
3334 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3335 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3336 {
3337     return float32_muladd(float16_to_float32(a, true, s),
3338                           float16_to_float32(b, true, s), d, 0, s);
3339 }
3340 
3341 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3342 {
3343     return float64_muladd(float32_to_float64(a, s),
3344                           float32_to_float64(b, s), d, 0, s);
3345 }
3346 
/* vfwmacc.vv / vfwmacc.vf expansions (widening fused multiply-add). */
RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
GEN_VEXT_VF(vfwmacc_vf_h, 4)
GEN_VEXT_VF(vfwmacc_vf_w, 8)
3355 
3356 static uint32_t fwmaccbf16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3357 {
3358     return float32_muladd(bfloat16_to_float32(a, s),
3359                           bfloat16_to_float32(b, s), d, 0, s);
3360 }
3361 
3362 RVVCALL(OPFVV3, vfwmaccbf16_vv, WOP_UUU_H, H4, H2, H2, fwmaccbf16)
3363 GEN_VEXT_VV_ENV(vfwmaccbf16_vv, 4)
3364 RVVCALL(OPFVF3, vfwmaccbf16_vf, WOP_UUU_H, H4, H2, fwmacc16)
3365 GEN_VEXT_VF(vfwmaccbf16_vf, 4)
3366 
3367 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3368 {
3369     return float32_muladd(float16_to_float32(a, true, s),
3370                           float16_to_float32(b, true, s), d,
3371                           float_muladd_negate_c | float_muladd_negate_product,
3372                           s);
3373 }
3374 
3375 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3376 {
3377     return float64_muladd(float32_to_float64(a, s), float32_to_float64(b, s),
3378                           d, float_muladd_negate_c |
3379                              float_muladd_negate_product, s);
3380 }
3381 
/* vfwnmacc.vv / vfwnmacc.vf expansions. */
RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
GEN_VEXT_VF(vfwnmacc_vf_h, 4)
GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3390 
3391 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3392 {
3393     return float32_muladd(float16_to_float32(a, true, s),
3394                           float16_to_float32(b, true, s), d,
3395                           float_muladd_negate_c, s);
3396 }
3397 
3398 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3399 {
3400     return float64_muladd(float32_to_float64(a, s),
3401                           float32_to_float64(b, s), d,
3402                           float_muladd_negate_c, s);
3403 }
3404 
/* vfwmsac.vv / vfwmsac.vf expansions. */
RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
GEN_VEXT_VF(vfwmsac_vf_h, 4)
GEN_VEXT_VF(vfwmsac_vf_w, 8)
3413 
3414 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3415 {
3416     return float32_muladd(float16_to_float32(a, true, s),
3417                           float16_to_float32(b, true, s), d,
3418                           float_muladd_negate_product, s);
3419 }
3420 
3421 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3422 {
3423     return float64_muladd(float32_to_float64(a, s),
3424                           float32_to_float64(b, s), d,
3425                           float_muladd_negate_product, s);
3426 }
3427 
/* vfwnmsac.vv / vfwnmsac.vf expansions. */
RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
GEN_VEXT_VF(vfwnmsac_vf_h, 4)
GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3436 
/* Vector Floating-Point Square-Root Instruction */
/*
 * OPFVV1: generate do_NAME for a unary element operation; OP receives
 * only the vs2 element and the FP status.
 */
#define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
static void do_##NAME(void *vd, void *vs2, int i,      \
                      CPURISCVState *env)              \
{                                                      \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
    *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
}
3445 
/*
 * GEN_VEXT_V_ENV: emit the helper function for a unary FP vector op.
 * Elements in [vstart, vl) are processed with do_NAME when active;
 * masked-off elements are filled with 1s via vext_set_elems_1s (which
 * takes the vma setting), and the tail [vl, total_elems) is filled
 * likewise using vta.  NOTE(review): when vl == 0 the helper returns
 * early without touching vstart or the tail — presumably intentional;
 * confirm against the other GEN_VEXT_* generators.
 */
#define GEN_VEXT_V_ENV(NAME, ESZ)                      \
void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
                  CPURISCVState *env, uint32_t desc)   \
{                                                      \
    uint32_t vm = vext_vm(desc);                       \
    uint32_t vl = env->vl;                             \
    uint32_t total_elems =                             \
        vext_get_total_elems(env, desc, ESZ);          \
    uint32_t vta = vext_vta(desc);                     \
    uint32_t vma = vext_vma(desc);                     \
    uint32_t i;                                        \
                                                       \
    if (vl == 0) {                                     \
        return;                                        \
    }                                                  \
    for (i = env->vstart; i < vl; i++) {               \
        if (!vm && !vext_elem_mask(v0, i)) {           \
            /* set masked-off elements to 1s */        \
            vext_set_elems_1s(vd, vma, i * ESZ,        \
                              (i + 1) * ESZ);          \
            continue;                                  \
        }                                              \
        do_##NAME(vd, vs2, i, env);                    \
    }                                                  \
    env->vstart = 0;                                   \
    vext_set_elems_1s(vd, vta, vl * ESZ,               \
                      total_elems * ESZ);              \
}
3474 
/* vfsqrt.v expansions per SEW, bound to the softfloat square roots. */
RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3481 
3482 /*
3483  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3484  *
3485  * Adapted from riscv-v-spec recip.c:
3486  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3487  */
3488 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3489 {
3490     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3491     uint64_t exp = extract64(f, frac_size, exp_size);
3492     uint64_t frac = extract64(f, 0, frac_size);
3493 
3494     const uint8_t lookup_table[] = {
3495         52, 51, 50, 48, 47, 46, 44, 43,
3496         42, 41, 40, 39, 38, 36, 35, 34,
3497         33, 32, 31, 30, 30, 29, 28, 27,
3498         26, 25, 24, 23, 23, 22, 21, 20,
3499         19, 19, 18, 17, 16, 16, 15, 14,
3500         14, 13, 12, 12, 11, 10, 10, 9,
3501         9, 8, 7, 7, 6, 6, 5, 4,
3502         4, 3, 3, 2, 2, 1, 1, 0,
3503         127, 125, 123, 121, 119, 118, 116, 114,
3504         113, 111, 109, 108, 106, 105, 103, 102,
3505         100, 99, 97, 96, 95, 93, 92, 91,
3506         90, 88, 87, 86, 85, 84, 83, 82,
3507         80, 79, 78, 77, 76, 75, 74, 73,
3508         72, 71, 70, 70, 69, 68, 67, 66,
3509         65, 64, 63, 63, 62, 61, 60, 59,
3510         59, 58, 57, 56, 56, 55, 54, 53
3511     };
3512     const int precision = 7;
3513 
3514     if (exp == 0 && frac != 0) { /* subnormal */
3515         /* Normalize the subnormal. */
3516         while (extract64(frac, frac_size - 1, 1) == 0) {
3517             exp--;
3518             frac <<= 1;
3519         }
3520 
3521         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3522     }
3523 
3524     int idx = ((exp & 1) << (precision - 1)) |
3525               (frac >> (frac_size - precision + 1));
3526     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3527                         (frac_size - precision);
3528     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3529 
3530     uint64_t val = 0;
3531     val = deposit64(val, 0, frac_size, out_frac);
3532     val = deposit64(val, frac_size, exp_size, out_exp);
3533     val = deposit64(val, frac_size + exp_size, 1, sign);
3534     return val;
3535 }
3536 
3537 static float16 frsqrt7_h(float16 f, float_status *s)
3538 {
3539     int exp_size = 5, frac_size = 10;
3540     bool sign = float16_is_neg(f);
3541 
3542     /*
3543      * frsqrt7(sNaN) = canonical NaN
3544      * frsqrt7(-inf) = canonical NaN
3545      * frsqrt7(-normal) = canonical NaN
3546      * frsqrt7(-subnormal) = canonical NaN
3547      */
3548     if (float16_is_signaling_nan(f, s) ||
3549         (float16_is_infinity(f) && sign) ||
3550         (float16_is_normal(f) && sign) ||
3551         (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3552         s->float_exception_flags |= float_flag_invalid;
3553         return float16_default_nan(s);
3554     }
3555 
3556     /* frsqrt7(qNaN) = canonical NaN */
3557     if (float16_is_quiet_nan(f, s)) {
3558         return float16_default_nan(s);
3559     }
3560 
3561     /* frsqrt7(+-0) = +-inf */
3562     if (float16_is_zero(f)) {
3563         s->float_exception_flags |= float_flag_divbyzero;
3564         return float16_set_sign(float16_infinity, sign);
3565     }
3566 
3567     /* frsqrt7(+inf) = +0 */
3568     if (float16_is_infinity(f) && !sign) {
3569         return float16_set_sign(float16_zero, sign);
3570     }
3571 
3572     /* +normal, +subnormal */
3573     uint64_t val = frsqrt7(f, exp_size, frac_size);
3574     return make_float16(val);
3575 }
3576 
3577 static float32 frsqrt7_s(float32 f, float_status *s)
3578 {
3579     int exp_size = 8, frac_size = 23;
3580     bool sign = float32_is_neg(f);
3581 
3582     /*
3583      * frsqrt7(sNaN) = canonical NaN
3584      * frsqrt7(-inf) = canonical NaN
3585      * frsqrt7(-normal) = canonical NaN
3586      * frsqrt7(-subnormal) = canonical NaN
3587      */
3588     if (float32_is_signaling_nan(f, s) ||
3589         (float32_is_infinity(f) && sign) ||
3590         (float32_is_normal(f) && sign) ||
3591         (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3592         s->float_exception_flags |= float_flag_invalid;
3593         return float32_default_nan(s);
3594     }
3595 
3596     /* frsqrt7(qNaN) = canonical NaN */
3597     if (float32_is_quiet_nan(f, s)) {
3598         return float32_default_nan(s);
3599     }
3600 
3601     /* frsqrt7(+-0) = +-inf */
3602     if (float32_is_zero(f)) {
3603         s->float_exception_flags |= float_flag_divbyzero;
3604         return float32_set_sign(float32_infinity, sign);
3605     }
3606 
3607     /* frsqrt7(+inf) = +0 */
3608     if (float32_is_infinity(f) && !sign) {
3609         return float32_set_sign(float32_zero, sign);
3610     }
3611 
3612     /* +normal, +subnormal */
3613     uint64_t val = frsqrt7(f, exp_size, frac_size);
3614     return make_float32(val);
3615 }
3616 
3617 static float64 frsqrt7_d(float64 f, float_status *s)
3618 {
3619     int exp_size = 11, frac_size = 52;
3620     bool sign = float64_is_neg(f);
3621 
3622     /*
3623      * frsqrt7(sNaN) = canonical NaN
3624      * frsqrt7(-inf) = canonical NaN
3625      * frsqrt7(-normal) = canonical NaN
3626      * frsqrt7(-subnormal) = canonical NaN
3627      */
3628     if (float64_is_signaling_nan(f, s) ||
3629         (float64_is_infinity(f) && sign) ||
3630         (float64_is_normal(f) && sign) ||
3631         (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3632         s->float_exception_flags |= float_flag_invalid;
3633         return float64_default_nan(s);
3634     }
3635 
3636     /* frsqrt7(qNaN) = canonical NaN */
3637     if (float64_is_quiet_nan(f, s)) {
3638         return float64_default_nan(s);
3639     }
3640 
3641     /* frsqrt7(+-0) = +-inf */
3642     if (float64_is_zero(f)) {
3643         s->float_exception_flags |= float_flag_divbyzero;
3644         return float64_set_sign(float64_infinity, sign);
3645     }
3646 
3647     /* frsqrt7(+inf) = +0 */
3648     if (float64_is_infinity(f) && !sign) {
3649         return float64_set_sign(float64_zero, sign);
3650     }
3651 
3652     /* +normal, +subnormal */
3653     uint64_t val = frsqrt7(f, exp_size, frac_size);
3654     return make_float64(val);
3655 }
3656 
/* vfrsqrt7.v expansions per SEW. */
RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3663 
3664 /*
3665  * Vector Floating-Point Reciprocal Estimate Instruction
3666  *
3667  * Adapted from riscv-v-spec recip.c:
3668  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3669  */
3670 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3671                       float_status *s)
3672 {
3673     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3674     uint64_t exp = extract64(f, frac_size, exp_size);
3675     uint64_t frac = extract64(f, 0, frac_size);
3676 
3677     const uint8_t lookup_table[] = {
3678         127, 125, 123, 121, 119, 117, 116, 114,
3679         112, 110, 109, 107, 105, 104, 102, 100,
3680         99, 97, 96, 94, 93, 91, 90, 88,
3681         87, 85, 84, 83, 81, 80, 79, 77,
3682         76, 75, 74, 72, 71, 70, 69, 68,
3683         66, 65, 64, 63, 62, 61, 60, 59,
3684         58, 57, 56, 55, 54, 53, 52, 51,
3685         50, 49, 48, 47, 46, 45, 44, 43,
3686         42, 41, 40, 40, 39, 38, 37, 36,
3687         35, 35, 34, 33, 32, 31, 31, 30,
3688         29, 28, 28, 27, 26, 25, 25, 24,
3689         23, 23, 22, 21, 21, 20, 19, 19,
3690         18, 17, 17, 16, 15, 15, 14, 14,
3691         13, 12, 12, 11, 11, 10, 9, 9,
3692         8, 8, 7, 7, 6, 5, 5, 4,
3693         4, 3, 3, 2, 2, 1, 1, 0
3694     };
3695     const int precision = 7;
3696 
3697     if (exp == 0 && frac != 0) { /* subnormal */
3698         /* Normalize the subnormal. */
3699         while (extract64(frac, frac_size - 1, 1) == 0) {
3700             exp--;
3701             frac <<= 1;
3702         }
3703 
3704         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3705 
3706         if (exp != 0 && exp != UINT64_MAX) {
3707             /*
3708              * Overflow to inf or max value of same sign,
3709              * depending on sign and rounding mode.
3710              */
3711             s->float_exception_flags |= (float_flag_inexact |
3712                                          float_flag_overflow);
3713 
3714             if ((s->float_rounding_mode == float_round_to_zero) ||
3715                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3716                 ((s->float_rounding_mode == float_round_up) && sign)) {
3717                 /* Return greatest/negative finite value. */
3718                 return (sign << (exp_size + frac_size)) |
3719                        (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3720             } else {
3721                 /* Return +-inf. */
3722                 return (sign << (exp_size + frac_size)) |
3723                        MAKE_64BIT_MASK(frac_size, exp_size);
3724             }
3725         }
3726     }
3727 
3728     int idx = frac >> (frac_size - precision);
3729     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3730                         (frac_size - precision);
3731     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3732 
3733     if (out_exp == 0 || out_exp == UINT64_MAX) {
3734         /*
3735          * The result is subnormal, but don't raise the underflow exception,
3736          * because there's no additional loss of precision.
3737          */
3738         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3739         if (out_exp == UINT64_MAX) {
3740             out_frac >>= 1;
3741             out_exp = 0;
3742         }
3743     }
3744 
3745     uint64_t val = 0;
3746     val = deposit64(val, 0, frac_size, out_frac);
3747     val = deposit64(val, frac_size, exp_size, out_exp);
3748     val = deposit64(val, frac_size + exp_size, 1, sign);
3749     return val;
3750 }
3751 
3752 static float16 frec7_h(float16 f, float_status *s)
3753 {
3754     int exp_size = 5, frac_size = 10;
3755     bool sign = float16_is_neg(f);
3756 
3757     /* frec7(+-inf) = +-0 */
3758     if (float16_is_infinity(f)) {
3759         return float16_set_sign(float16_zero, sign);
3760     }
3761 
3762     /* frec7(+-0) = +-inf */
3763     if (float16_is_zero(f)) {
3764         s->float_exception_flags |= float_flag_divbyzero;
3765         return float16_set_sign(float16_infinity, sign);
3766     }
3767 
3768     /* frec7(sNaN) = canonical NaN */
3769     if (float16_is_signaling_nan(f, s)) {
3770         s->float_exception_flags |= float_flag_invalid;
3771         return float16_default_nan(s);
3772     }
3773 
3774     /* frec7(qNaN) = canonical NaN */
3775     if (float16_is_quiet_nan(f, s)) {
3776         return float16_default_nan(s);
3777     }
3778 
3779     /* +-normal, +-subnormal */
3780     uint64_t val = frec7(f, exp_size, frac_size, s);
3781     return make_float16(val);
3782 }
3783 
3784 static float32 frec7_s(float32 f, float_status *s)
3785 {
3786     int exp_size = 8, frac_size = 23;
3787     bool sign = float32_is_neg(f);
3788 
3789     /* frec7(+-inf) = +-0 */
3790     if (float32_is_infinity(f)) {
3791         return float32_set_sign(float32_zero, sign);
3792     }
3793 
3794     /* frec7(+-0) = +-inf */
3795     if (float32_is_zero(f)) {
3796         s->float_exception_flags |= float_flag_divbyzero;
3797         return float32_set_sign(float32_infinity, sign);
3798     }
3799 
3800     /* frec7(sNaN) = canonical NaN */
3801     if (float32_is_signaling_nan(f, s)) {
3802         s->float_exception_flags |= float_flag_invalid;
3803         return float32_default_nan(s);
3804     }
3805 
3806     /* frec7(qNaN) = canonical NaN */
3807     if (float32_is_quiet_nan(f, s)) {
3808         return float32_default_nan(s);
3809     }
3810 
3811     /* +-normal, +-subnormal */
3812     uint64_t val = frec7(f, exp_size, frac_size, s);
3813     return make_float32(val);
3814 }
3815 
3816 static float64 frec7_d(float64 f, float_status *s)
3817 {
3818     int exp_size = 11, frac_size = 52;
3819     bool sign = float64_is_neg(f);
3820 
3821     /* frec7(+-inf) = +-0 */
3822     if (float64_is_infinity(f)) {
3823         return float64_set_sign(float64_zero, sign);
3824     }
3825 
3826     /* frec7(+-0) = +-inf */
3827     if (float64_is_zero(f)) {
3828         s->float_exception_flags |= float_flag_divbyzero;
3829         return float64_set_sign(float64_infinity, sign);
3830     }
3831 
3832     /* frec7(sNaN) = canonical NaN */
3833     if (float64_is_signaling_nan(f, s)) {
3834         s->float_exception_flags |= float_flag_invalid;
3835         return float64_default_nan(s);
3836     }
3837 
3838     /* frec7(qNaN) = canonical NaN */
3839     if (float64_is_quiet_nan(f, s)) {
3840         return float64_default_nan(s);
3841     }
3842 
3843     /* +-normal, +-subnormal */
3844     uint64_t val = frec7(f, exp_size, frac_size, s);
3845     return make_float64(val);
3846 }
3847 
/* vfrec7.v: per-element reciprocal estimate via the frec7_* helpers above */
RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
GEN_VEXT_V_ENV(vfrec7_v_h, 2)
GEN_VEXT_V_ENV(vfrec7_v_w, 4)
GEN_VEXT_V_ENV(vfrec7_v_d, 8)

/* Vector Floating-Point MIN/MAX Instructions */
/* vfmin: implemented with softfloat's *_minimum_number operations */
RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
GEN_VEXT_VF(vfmin_vf_h, 2)
GEN_VEXT_VF(vfmin_vf_w, 4)
GEN_VEXT_VF(vfmin_vf_d, 8)

/* vfmax: implemented with softfloat's *_maximum_number operations */
RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
GEN_VEXT_VF(vfmax_vf_h, 2)
GEN_VEXT_VF(vfmax_vf_w, 4)
GEN_VEXT_VF(vfmax_vf_d, 8)
3881 
3882 /* Vector Floating-Point Sign-Injection Instructions */
3883 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
3884 {
3885     return deposit64(b, 0, 15, a);
3886 }
3887 
3888 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
3889 {
3890     return deposit64(b, 0, 31, a);
3891 }
3892 
3893 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
3894 {
3895     return deposit64(b, 0, 63, a);
3896 }
3897 
/* vfsgnj.vv / vfsgnj.vf: sign-injection via the fsgnj* helpers above */
RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
GEN_VEXT_VF(vfsgnj_vf_h, 2)
GEN_VEXT_VF(vfsgnj_vf_w, 4)
GEN_VEXT_VF(vfsgnj_vf_d, 8)
3910 
3911 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
3912 {
3913     return deposit64(~b, 0, 15, a);
3914 }
3915 
3916 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
3917 {
3918     return deposit64(~b, 0, 31, a);
3919 }
3920 
3921 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
3922 {
3923     return deposit64(~b, 0, 63, a);
3924 }
3925 
/* vfsgnjn.vv / vfsgnjn.vf: inject the inverted sign (fsgnjn* above) */
RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
GEN_VEXT_VF(vfsgnjn_vf_h, 2)
GEN_VEXT_VF(vfsgnjn_vf_w, 4)
GEN_VEXT_VF(vfsgnjn_vf_d, 8)
3938 
3939 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
3940 {
3941     return deposit64(b ^ a, 0, 15, a);
3942 }
3943 
3944 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
3945 {
3946     return deposit64(b ^ a, 0, 31, a);
3947 }
3948 
3949 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
3950 {
3951     return deposit64(b ^ a, 0, 63, a);
3952 }
3953 
/* vfsgnjx.vv / vfsgnjx.vf: inject the XOR of both signs (fsgnjx* above) */
RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
GEN_VEXT_VF(vfsgnjx_vf_h, 2)
GEN_VEXT_VF(vfsgnjx_vf_w, 4)
GEN_VEXT_VF(vfsgnjx_vf_d, 8)
3966 
3967 /* Vector Floating-Point Compare Instructions */
/*
 * Generate a vector-vector FP compare helper: one mask bit per element,
 * vd[i] = DO_OP(vs2[i], vs1[i], fp_status) for active elements.
 */
#define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    uint32_t vm = vext_vm(desc);                              \
    uint32_t vl = env->vl;                                    \
    uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
    uint32_t vma = vext_vma(desc);                            \
    uint32_t i;                                               \
                                                              \
    for (i = env->vstart; i < vl; i++) {                      \
        ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
        if (!vm && !vext_elem_mask(v0, i)) {                  \
            /* set masked-off elements to 1s */               \
            if (vma) {                                        \
                vext_set_elem_mask(vd, i, 1);                 \
            }                                                 \
            continue;                                         \
        }                                                     \
        vext_set_elem_mask(vd, i,                             \
                           DO_OP(s2, s1, &env->fp_status));   \
    }                                                         \
    env->vstart = 0;                                          \
    /*
     * mask destination registers are always tail-agnostic;
     * set tail elements to 1s
     */                                                       \
    if (vta_all_1s) {                                         \
        for (; i < total_elems; i++) {                        \
            vext_set_elem_mask(vd, i, 1);                     \
        }                                                     \
    }                                                         \
}
4003 
/* vmfeq.vv: quiet equality compare, per SEW */
GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4007 
/*
 * Generate a vector-scalar FP compare helper: one mask bit per element,
 * vd[i] = DO_OP(vs2[i], s1, fp_status) for active elements.
 */
#define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
                  CPURISCVState *env, uint32_t desc)                \
{                                                                   \
    uint32_t vm = vext_vm(desc);                                    \
    uint32_t vl = env->vl;                                          \
    uint32_t total_elems = riscv_cpu_cfg(env)->vlen;                \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
    uint32_t vma = vext_vma(desc);                                  \
    uint32_t i;                                                     \
                                                                    \
    for (i = env->vstart; i < vl; i++) {                            \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
        if (!vm && !vext_elem_mask(v0, i)) {                        \
            /* set masked-off elements to 1s */                     \
            if (vma) {                                              \
                vext_set_elem_mask(vd, i, 1);                       \
            }                                                       \
            continue;                                               \
        }                                                           \
        vext_set_elem_mask(vd, i,                                   \
                           DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
    }                                                               \
    env->vstart = 0;                                                \
    /*
     * mask destination registers are always tail-agnostic;
     * set tail elements to 1s
     */                                                             \
    if (vta_all_1s) {                                               \
        for (; i < total_elems; i++) {                              \
            vext_set_elem_mask(vd, i, 1);                           \
        }                                                           \
    }                                                               \
}
4042 
/* vmfeq.vf: quiet equality compare against the scalar operand */
GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4046 
4047 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4048 {
4049     FloatRelation compare = float16_compare_quiet(a, b, s);
4050     return compare != float_relation_equal;
4051 }
4052 
4053 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4054 {
4055     FloatRelation compare = float32_compare_quiet(a, b, s);
4056     return compare != float_relation_equal;
4057 }
4058 
4059 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4060 {
4061     FloatRelation compare = float64_compare_quiet(a, b, s);
4062     return compare != float_relation_equal;
4063 }
4064 
/* vmfne: not-equal, via the vmfne* helpers above */
GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)

/* vmflt: less-than, via softfloat *_lt */
GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)

/* vmfle: less-than-or-equal, via softfloat *_le */
GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4085 
4086 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4087 {
4088     FloatRelation compare = float16_compare(a, b, s);
4089     return compare == float_relation_greater;
4090 }
4091 
4092 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4093 {
4094     FloatRelation compare = float32_compare(a, b, s);
4095     return compare == float_relation_greater;
4096 }
4097 
4098 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4099 {
4100     FloatRelation compare = float64_compare(a, b, s);
4101     return compare == float_relation_greater;
4102 }
4103 
/* vmfgt.vf: greater-than against the scalar operand */
GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4107 
4108 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4109 {
4110     FloatRelation compare = float16_compare(a, b, s);
4111     return compare == float_relation_greater ||
4112            compare == float_relation_equal;
4113 }
4114 
4115 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4116 {
4117     FloatRelation compare = float32_compare(a, b, s);
4118     return compare == float_relation_greater ||
4119            compare == float_relation_equal;
4120 }
4121 
4122 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4123 {
4124     FloatRelation compare = float64_compare(a, b, s);
4125     return compare == float_relation_greater ||
4126            compare == float_relation_equal;
4127 }
4128 
/* vmfge.vf: greater-than-or-equal against the scalar operand */
GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4132 
4133 /* Vector Floating-Point Classify Instruction */
4134 target_ulong fclass_h(uint64_t frs1)
4135 {
4136     float16 f = frs1;
4137     bool sign = float16_is_neg(f);
4138 
4139     if (float16_is_infinity(f)) {
4140         return sign ? 1 << 0 : 1 << 7;
4141     } else if (float16_is_zero(f)) {
4142         return sign ? 1 << 3 : 1 << 4;
4143     } else if (float16_is_zero_or_denormal(f)) {
4144         return sign ? 1 << 2 : 1 << 5;
4145     } else if (float16_is_any_nan(f)) {
4146         float_status s = { }; /* for snan_bit_is_one */
4147         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4148     } else {
4149         return sign ? 1 << 1 : 1 << 6;
4150     }
4151 }
4152 
4153 target_ulong fclass_s(uint64_t frs1)
4154 {
4155     float32 f = frs1;
4156     bool sign = float32_is_neg(f);
4157 
4158     if (float32_is_infinity(f)) {
4159         return sign ? 1 << 0 : 1 << 7;
4160     } else if (float32_is_zero(f)) {
4161         return sign ? 1 << 3 : 1 << 4;
4162     } else if (float32_is_zero_or_denormal(f)) {
4163         return sign ? 1 << 2 : 1 << 5;
4164     } else if (float32_is_any_nan(f)) {
4165         float_status s = { }; /* for snan_bit_is_one */
4166         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4167     } else {
4168         return sign ? 1 << 1 : 1 << 6;
4169     }
4170 }
4171 
4172 target_ulong fclass_d(uint64_t frs1)
4173 {
4174     float64 f = frs1;
4175     bool sign = float64_is_neg(f);
4176 
4177     if (float64_is_infinity(f)) {
4178         return sign ? 1 << 0 : 1 << 7;
4179     } else if (float64_is_zero(f)) {
4180         return sign ? 1 << 3 : 1 << 4;
4181     } else if (float64_is_zero_or_denormal(f)) {
4182         return sign ? 1 << 2 : 1 << 5;
4183     } else if (float64_is_any_nan(f)) {
4184         float_status s = { }; /* for snan_bit_is_one */
4185         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4186     } else {
4187         return sign ? 1 << 1 : 1 << 6;
4188     }
4189 }
4190 
/* vfclass.v: classify each element via the fclass_* helpers above */
RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
GEN_VEXT_V(vfclass_v_h, 2)
GEN_VEXT_V(vfclass_v_w, 4)
GEN_VEXT_V(vfclass_v_d, 8)
4197 
4198 /* Vector Floating-Point Merge Instruction */
4199 
/*
 * vfmerge.vfm: vd[i] = mask[i] ? scalar s1 : vs2[i].
 * Values are copied bitwise; no FP arithmetic or exception flags.
 * The unmasked (vm=1) form copies the scalar to every body element.
 */
#define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    uint32_t vm = vext_vm(desc);                              \
    uint32_t vl = env->vl;                                    \
    uint32_t esz = sizeof(ETYPE);                             \
    uint32_t total_elems =                                    \
        vext_get_total_elems(env, desc, esz);                 \
    uint32_t vta = vext_vta(desc);                            \
    uint32_t i;                                               \
                                                              \
    for (i = env->vstart; i < vl; i++) {                      \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
        *((ETYPE *)vd + H(i)) =                               \
            (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
    }                                                         \
    env->vstart = 0;                                          \
    /* set tail elements to 1s */                             \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
}
4221 
/* vfmerge.vfm for 16/32/64-bit elements */
GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4225 
/* Single-Width Floating-Point/Integer Type-Convert Instructions */
/* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)

/* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)

/* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)

/* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)

/* Widening Floating-Point/Integer Type-Convert Instructions */
/* (TD, T2, TX2): double-width dest type, source type, extended source type */
#define WOP_UU_B uint16_t, uint8_t,  uint8_t
#define WOP_UU_H uint32_t, uint16_t, uint16_t
#define WOP_UU_W uint64_t, uint32_t, uint32_t
/*
 * vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.
 */
RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)

/* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)

/*
 * vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float.
 */
RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)

/* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4295 
4296 /*
4297  * vfwcvt.f.f.v vd, vs2, vm # Convert single-width float to double-width float.
4298  */
4299 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4300 {
4301     return float16_to_float32(a, true, s);
4302 }
4303 
RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)

/* vfwcvtbf16.f.f.v: widen bfloat16 elements to float32 */
RVVCALL(OPFVV1, vfwcvtbf16_f_f_v, WOP_UU_H, H4, H2, bfloat16_to_float32)
GEN_VEXT_V_ENV(vfwcvtbf16_f_f_v, 4)
4311 
/* Narrowing Floating-Point/Integer Type-Convert Instructions */
/* (TD, T2, TX2): narrow dest type, double-width source type, extended source type */
#define NOP_UU_B uint8_t,  uint16_t, uint32_t
#define NOP_UU_H uint16_t, uint32_t, uint32_t
#define NOP_UU_W uint32_t, uint64_t, uint64_t
/* vfncvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)

/* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */
RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)

/*
 * vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float.
 */
RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)

/* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */
RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4346 
4347 /* vfncvt.f.f.v vd, vs2, vm # Convert double float to single-width float. */
4348 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4349 {
4350     return float32_to_float16(a, true, s);
4351 }
4352 
RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)

/* vfncvtbf16.f.f.w: narrow float32 elements to bfloat16 */
RVVCALL(OPFVV1, vfncvtbf16_f_f_w, NOP_UU_H, H2, H4, float32_to_bfloat16)
GEN_VEXT_V_ENV(vfncvtbf16_f_f_w, 2)
4360 
4361 /*
4362  * Vector Reduction Operations
4363  */
4364 /* Vector Single-Width Integer Reduction Instructions */
/*
 * Generate an integer reduction helper: starting from the scalar in
 * vs1[0], fold OP over the active elements of vs2 and write the single
 * result to vd[0]. Masked-off elements are skipped without affecting
 * the accumulator.
 */
#define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    uint32_t vm = vext_vm(desc);                          \
    uint32_t vl = env->vl;                                \
    uint32_t esz = sizeof(TD);                            \
    uint32_t vlenb = simd_maxsz(desc);                    \
    uint32_t vta = vext_vta(desc);                        \
    uint32_t i;                                           \
    TD s1 =  *((TD *)vs1 + HD(0));                        \
                                                          \
    for (i = env->vstart; i < vl; i++) {                  \
        TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
        if (!vm && !vext_elem_mask(v0, i)) {              \
            continue;                                     \
        }                                                 \
        s1 = OP(s1, (TD)s2);                              \
    }                                                     \
    *((TD *)vd + HD(0)) = s1;                             \
    env->vstart = 0;                                      \
    /* set tail elements to 1s */                         \
    vext_set_elems_1s(vd, vta, esz, vlenb);               \
}
4390 
/* vd[0] = sum(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)

/* vd[0] = maxu(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)

/* vd[0] = max(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)

/* vd[0] = minu(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)

/* vd[0] = min(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)

/* vd[0] = and(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)

/* vd[0] = or(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)

/* vd[0] = xor(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)

/* Vector Widening Integer Reduction Instructions */
/* Signed sum reduction into double-width accumulator */
GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)

/* Unsigned sum reduction into double-width accumulator */
GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4449 
4450 /* Vector Single-Width Floating-Point Reduction Instructions */
4451 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4452 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4453                   void *vs2, CPURISCVState *env,           \
4454                   uint32_t desc)                           \
4455 {                                                          \
4456     uint32_t vm = vext_vm(desc);                           \
4457     uint32_t vl = env->vl;                                 \
4458     uint32_t esz = sizeof(TD);                             \
4459     uint32_t vlenb = simd_maxsz(desc);                     \
4460     uint32_t vta = vext_vta(desc);                         \
4461     uint32_t i;                                            \
4462     TD s1 =  *((TD *)vs1 + HD(0));                         \
4463                                                            \
4464     for (i = env->vstart; i < vl; i++) {                   \
4465         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4466         if (!vm && !vext_elem_mask(v0, i)) {               \
4467             continue;                                      \
4468         }                                                  \
4469         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4470     }                                                      \
4471     *((TD *)vd + HD(0)) = s1;                              \
4472     env->vstart = 0;                                       \
4473     /* set tail elements to 1s */                          \
4474     vext_set_elems_1s(vd, vta, esz, vlenb);                \
4475 }
4476 
/* Unordered sum (QEMU uses the same sequential fold as the ordered form) */
GEN_VEXT_FRED(vfredusum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
GEN_VEXT_FRED(vfredusum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
GEN_VEXT_FRED(vfredusum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)

/* Ordered sum */
GEN_VEXT_FRED(vfredosum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
GEN_VEXT_FRED(vfredosum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
GEN_VEXT_FRED(vfredosum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)

/* Maximum value (softfloat floatN_maximum_number semantics) */
GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2,
              float16_maximum_number)
GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4,
              float32_maximum_number)
GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8,
              float64_maximum_number)

/* Minimum value (softfloat floatN_minimum_number semantics) */
GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2,
              float16_minimum_number)
GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4,
              float32_minimum_number)
GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8,
              float64_minimum_number)
4502 
4503 /* Vector Widening Floating-Point Add Instructions */
4504 static uint32_t fwadd16(uint32_t a, uint16_t b, float_status *s)
4505 {
4506     return float32_add(a, float16_to_float32(b, true, s), s);
4507 }
4508 
4509 static uint64_t fwadd32(uint64_t a, uint32_t b, float_status *s)
4510 {
4511     return float64_add(a, float32_to_float64(b, s), s);
4512 }
4513 
/* Vector Widening Floating-Point Reduction Instructions */
/*
 * Ordered/unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)):
 * the accumulator TD is double-width; fwadd16/fwadd32 promote each
 * source element before adding it in.
 */
GEN_VEXT_FRED(vfwredusum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
GEN_VEXT_FRED(vfwredusum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
GEN_VEXT_FRED(vfwredosum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
GEN_VEXT_FRED(vfwredosum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4520 
4521 /*
4522  * Vector Mask Operations
4523  */
4524 /* Vector Mask-Register Logical Instructions */
/*
 * Generate a mask-register logical helper: for each element index i in
 * [vstart, vl), vd.mask[i] = OP(vs2.mask[i], vs1.mask[i]).  Note the
 * operand order: OP receives the vs2 bit first.
 *
 * Fix: the trailing comment previously spanned several macro lines
 * without '\' line continuations — valid only because a comment is
 * removed (as one space) before the directive is parsed, but fragile
 * and inconsistent with every other macro in this file.  Each line now
 * carries an explicit continuation.
 */
#define GEN_VEXT_MASK_VV(NAME, OP)                        \
void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    uint32_t vl = env->vl;                                \
    uint32_t total_elems = riscv_cpu_cfg(env)->vlen;      \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);          \
    uint32_t i;                                           \
    int a, b;                                             \
                                                          \
    for (i = env->vstart; i < vl; i++) {                  \
        a = vext_elem_mask(vs1, i);                       \
        b = vext_elem_mask(vs2, i);                       \
        vext_set_elem_mask(vd, i, OP(b, a));              \
    }                                                     \
    env->vstart = 0;                                      \
    /*                                                    \
     * The mask destination register is always            \
     * tail-agnostic: set tail elements to 1s.            \
     */                                                   \
    if (vta_all_1s) {                                     \
        for (; i < total_elems; i++) {                    \
            vext_set_elem_mask(vd, i, 1);                 \
        }                                                 \
    }                                                     \
}
4552 
/*
 * Single-bit logical combiners for the mask helpers.  Operands are
 * always 0 or 1 (results of vext_elem_mask()), so logical '!' mixes
 * safely with bitwise '&'/'|'.  Arguments are parenthesized so the
 * macros remain correct for arbitrary expression arguments
 * (macro-hygiene fix; previously bare N/M would mis-bind for
 * lower-precedence argument expressions).
 */
#define DO_NAND(N, M)  (!((N) & (M)))
#define DO_ANDNOT(N, M)  ((N) & !(M))
#define DO_NOR(N, M)  (!((N) | (M)))
#define DO_ORNOT(N, M)  ((N) | !(M))
#define DO_XNOR(N, M)  (!((N) ^ (M)))
4558 
/*
 * GEN_VEXT_MASK_VV expands to OP(vs2_bit, vs1_bit), so the "n"
 * (complemented) operand of vmandn/vmorn is vs1:
 *   vmandn: vd = vs2 & ~vs1,  vmorn: vd = vs2 | ~vs1
 */
GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4567 
4568 /* Vector count population in mask vcpop */
4569 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4570                              uint32_t desc)
4571 {
4572     target_ulong cnt = 0;
4573     uint32_t vm = vext_vm(desc);
4574     uint32_t vl = env->vl;
4575     int i;
4576 
4577     for (i = env->vstart; i < vl; i++) {
4578         if (vm || vext_elem_mask(v0, i)) {
4579             if (vext_elem_mask(vs2, i)) {
4580                 cnt++;
4581             }
4582         }
4583     }
4584     env->vstart = 0;
4585     return cnt;
4586 }
4587 
4588 /* vfirst find-first-set mask bit */
4589 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4590                               uint32_t desc)
4591 {
4592     uint32_t vm = vext_vm(desc);
4593     uint32_t vl = env->vl;
4594     int i;
4595 
4596     for (i = env->vstart; i < vl; i++) {
4597         if (vm || vext_elem_mask(v0, i)) {
4598             if (vext_elem_mask(vs2, i)) {
4599                 return i;
4600             }
4601         }
4602     }
4603     env->vstart = 0;
4604     return -1LL;
4605 }
4606 
/*
 * Variant selector for the vmsetm() worker below:
 *   ONLY_FIRST    - vmsof.m: set only the first active set bit
 *   INCLUDE_FIRST - vmsif.m: set bits up to and including the first
 *   BEFORE_FIRST  - vmsbf.m: set bits strictly before the first
 */
enum set_mask_type {
    ONLY_FIRST = 1,
    INCLUDE_FIRST,
    BEFORE_FIRST,
};
4612 
/*
 * Common worker for vmsbf.m / vmsif.m / vmsof.m: scan the active bits
 * of vs2's mask for the first set bit and fill vd's mask according to
 * 'type'.  Masked-off elements are set to 1 when vma (agnostic),
 * otherwise left undisturbed; the mask destination tail is set to 1s
 * when vta_all_1s.
 */
static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
                   uint32_t desc, enum set_mask_type type)
{
    uint32_t vm = vext_vm(desc);
    uint32_t vl = env->vl;
    uint32_t total_elems = riscv_cpu_cfg(env)->vlen;
    uint32_t vta_all_1s = vext_vta_all_1s(desc);
    uint32_t vma = vext_vma(desc);
    int i;
    bool first_mask_bit = false;

    for (i = env->vstart; i < vl; i++) {
        if (!vm && !vext_elem_mask(v0, i)) {
            /* set masked-off elements to 1s */
            if (vma) {
                vext_set_elem_mask(vd, i, 1);
            }
            continue;
        }
        /* write a zero to all following active elements */
        if (first_mask_bit) {
            vext_set_elem_mask(vd, i, 0);
            continue;
        }
        if (vext_elem_mask(vs2, i)) {
            /* first set source bit: only vmsbf writes 0 here */
            first_mask_bit = true;
            if (type == BEFORE_FIRST) {
                vext_set_elem_mask(vd, i, 0);
            } else {
                vext_set_elem_mask(vd, i, 1);
            }
        } else {
            /* before the first set bit: only vmsof writes 0 here */
            if (type == ONLY_FIRST) {
                vext_set_elem_mask(vd, i, 0);
            } else {
                vext_set_elem_mask(vd, i, 1);
            }
        }
    }
    env->vstart = 0;
    /*
     * mask destination register are always tail-agnostic
     * set tail elements to 1s
     */
    if (vta_all_1s) {
        for (; i < total_elems; i++) {
            vext_set_elem_mask(vd, i, 1);
        }
    }
}
4663 
/* vmsbf.m: set-before-first -- 1s strictly before the first set bit */
void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
                     uint32_t desc)
{
    vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
}
4669 
/* vmsif.m: set-including-first -- 1s up to and including the first set bit */
void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
                     uint32_t desc)
{
    vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
}
4675 
/* vmsof.m: set-only-first -- 1 only at the first set bit */
void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
                     uint32_t desc)
{
    vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
}
4681 
/*
 * Vector Iota Instruction
 * vd[i] = count of set bits of vs2's mask among the preceding *active*
 * elements (a running prefix population count; masked-off elements are
 * neither written nor counted).
 */
#define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
                  uint32_t desc)                                          \
{                                                                         \
    uint32_t vm = vext_vm(desc);                                          \
    uint32_t vl = env->vl;                                                \
    uint32_t esz = sizeof(ETYPE);                                         \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
    uint32_t vta = vext_vta(desc);                                        \
    uint32_t vma = vext_vma(desc);                                        \
    uint32_t sum = 0;                                                     \
    int i;                                                                \
                                                                          \
    for (i = env->vstart; i < vl; i++) {                                  \
        if (!vm && !vext_elem_mask(v0, i)) {                              \
            /* set masked-off elements to 1s */                           \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
            continue;                                                     \
        }                                                                 \
        *((ETYPE *)vd + H(i)) = sum;                                      \
        if (vext_elem_mask(vs2, i)) {                                     \
            sum++;                                                        \
        }                                                                 \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
}

GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
4716 
/*
 * Vector Element Index Instruction
 * vd[i] = i for every active element (no source vector operand).
 */
#define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
{                                                                         \
    uint32_t vm = vext_vm(desc);                                          \
    uint32_t vl = env->vl;                                                \
    uint32_t esz = sizeof(ETYPE);                                         \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
    uint32_t vta = vext_vta(desc);                                        \
    uint32_t vma = vext_vma(desc);                                        \
    int i;                                                                \
                                                                          \
    for (i = env->vstart; i < vl; i++) {                                  \
        if (!vm && !vext_elem_mask(v0, i)) {                              \
            /* set masked-off elements to 1s */                           \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
            continue;                                                     \
        }                                                                 \
        *((ETYPE *)vd + H(i)) = i;                                        \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
}

GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
4746 
4747 /*
4748  * Vector Permutation Instructions
4749  */
4750 
4751 /* Vector Slide Instructions */
4752 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
4753 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4754                   CPURISCVState *env, uint32_t desc)                      \
4755 {                                                                         \
4756     uint32_t vm = vext_vm(desc);                                          \
4757     uint32_t vl = env->vl;                                                \
4758     uint32_t esz = sizeof(ETYPE);                                         \
4759     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4760     uint32_t vta = vext_vta(desc);                                        \
4761     uint32_t vma = vext_vma(desc);                                        \
4762     target_ulong offset = s1, i_min, i;                                   \
4763                                                                           \
4764     i_min = MAX(env->vstart, offset);                                     \
4765     for (i = i_min; i < vl; i++) {                                        \
4766         if (!vm && !vext_elem_mask(v0, i)) {                              \
4767             /* set masked-off elements to 1s */                           \
4768             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4769             continue;                                                     \
4770         }                                                                 \
4771         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
4772     }                                                                     \
4773     /* set tail elements to 1s */                                         \
4774     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4775 }
4776 
4777 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
4778 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
4779 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
4780 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
4781 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
4782 
/*
 * vslidedown.vx: vd[i] = vs2[i + rs1] while i + rs1 < vlmax, else 0,
 * for active elements.  The range is split at i_max, the first index
 * whose source element would fall past vlmax: the first loop gathers,
 * the second zeroes.  (The zeroing loop leaves masked-off elements
 * undisturbed, which mask-agnostic policy permits.)
 */
#define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
                  CPURISCVState *env, uint32_t desc)                      \
{                                                                         \
    uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
    uint32_t vm = vext_vm(desc);                                          \
    uint32_t vl = env->vl;                                                \
    uint32_t esz = sizeof(ETYPE);                                         \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
    uint32_t vta = vext_vta(desc);                                        \
    uint32_t vma = vext_vma(desc);                                        \
    target_ulong i_max, i_min, i;                                         \
                                                                          \
    i_min = MIN(s1 < vlmax ? vlmax - s1 : 0, vl);                         \
    i_max = MAX(i_min, env->vstart);                                      \
    for (i = env->vstart; i < i_max; ++i) {                               \
        if (!vm && !vext_elem_mask(v0, i)) {                              \
            /* set masked-off elements to 1s */                           \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
            continue;                                                     \
        }                                                                 \
        *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));              \
    }                                                                     \
                                                                          \
    for (i = i_max; i < vl; ++i) {                                        \
        if (vm || vext_elem_mask(v0, i)) {                                \
            *((ETYPE *)vd + H(i)) = 0;                                    \
        }                                                                 \
    }                                                                     \
                                                                          \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
}

/* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
4823 
/*
 * Generate the vslide1up worker for one element width:
 * vd[0] = s1 (the scalar), vd[i] = vs2[i - 1] for the other active
 * elements.  (The macro name "VSLIE1UP" is a long-standing typo for
 * "VSLIDE1UP"; kept as-is since the invocations below use it.)
 */
#define GEN_VEXT_VSLIE1UP(BITWIDTH, H)                                      \
static void vslide1up_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
                                 void *vs2, CPURISCVState *env,             \
                                 uint32_t desc)                             \
{                                                                           \
    typedef uint##BITWIDTH##_t ETYPE;                                       \
    uint32_t vm = vext_vm(desc);                                            \
    uint32_t vl = env->vl;                                                  \
    uint32_t esz = sizeof(ETYPE);                                           \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
    uint32_t vta = vext_vta(desc);                                          \
    uint32_t vma = vext_vma(desc);                                          \
    uint32_t i;                                                             \
                                                                            \
    for (i = env->vstart; i < vl; i++) {                                    \
        if (!vm && !vext_elem_mask(v0, i)) {                                \
            /* set masked-off elements to 1s */                             \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);             \
            continue;                                                       \
        }                                                                   \
        if (i == 0) {                                                       \
            *((ETYPE *)vd + H(i)) = s1;                                     \
        } else {                                                            \
            *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
        }                                                                   \
    }                                                                       \
    env->vstart = 0;                                                        \
    /* set tail elements to 1s */                                           \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
}

GEN_VEXT_VSLIE1UP(8,  H1)
GEN_VEXT_VSLIE1UP(16, H2)
GEN_VEXT_VSLIE1UP(32, H4)
GEN_VEXT_VSLIE1UP(64, H8)
4859 
/* Expand an integer vslide1up.vx entry point delegating to the worker. */
#define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc)              \
{                                                                 \
    vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
}

/* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
4872 
/*
 * Generate the vslide1down worker for one element width:
 * vd[i] = vs2[i + 1] for active elements, with the scalar s1 written
 * into the last body element vd[vl - 1].
 */
#define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
static void vslide1down_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
                                   void *vs2, CPURISCVState *env,             \
                                   uint32_t desc)                             \
{                                                                             \
    typedef uint##BITWIDTH##_t ETYPE;                                         \
    uint32_t vm = vext_vm(desc);                                              \
    uint32_t vl = env->vl;                                                    \
    uint32_t esz = sizeof(ETYPE);                                             \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
    uint32_t vta = vext_vta(desc);                                            \
    uint32_t vma = vext_vma(desc);                                            \
    uint32_t i;                                                               \
                                                                              \
    for (i = env->vstart; i < vl; i++) {                                      \
        if (!vm && !vext_elem_mask(v0, i)) {                                  \
            /* set masked-off elements to 1s */                               \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);               \
            continue;                                                         \
        }                                                                     \
        if (i == vl - 1) {                                                    \
            *((ETYPE *)vd + H(i)) = s1;                                       \
        } else {                                                              \
            *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
        }                                                                     \
    }                                                                         \
    env->vstart = 0;                                                          \
    /* set tail elements to 1s */                                             \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
}

GEN_VEXT_VSLIDE1DOWN(8,  H1)
GEN_VEXT_VSLIDE1DOWN(16, H2)
GEN_VEXT_VSLIDE1DOWN(32, H4)
GEN_VEXT_VSLIDE1DOWN(64, H8)
4908 
/* Expand an integer vslide1down.vx entry point delegating to the worker. */
#define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc)              \
{                                                                 \
    vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
}

/* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
4921 
/* Vector Floating-Point Slide Instructions */
/*
 * The FP slide1 variants reuse the integer slide1 workers: the scalar
 * is only moved into the vector, never operated on arithmetically.
 * (s1 is the raw FP register bits; any NaN-boxing is assumed to have
 * been handled by the caller -- TODO(review): confirm in trans_rvv.)
 */
#define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
}

/* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)

#define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
}

/* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
4946 
/* Vector Register Gather Instruction */
/*
 * vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]] for active elements.
 * TS1 is the index element type and TS2 the data element type; they
 * differ for vrgatherei16, which always uses 16-bit indices regardless
 * of the data SEW.
 */
#define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
                  CPURISCVState *env, uint32_t desc)                      \
{                                                                         \
    uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
    uint32_t vm = vext_vm(desc);                                          \
    uint32_t vl = env->vl;                                                \
    uint32_t esz = sizeof(TS2);                                           \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
    uint32_t vta = vext_vta(desc);                                        \
    uint32_t vma = vext_vma(desc);                                        \
    uint64_t index;                                                       \
    uint32_t i;                                                           \
                                                                          \
    for (i = env->vstart; i < vl; i++) {                                  \
        if (!vm && !vext_elem_mask(v0, i)) {                              \
            /* set masked-off elements to 1s */                           \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
            continue;                                                     \
        }                                                                 \
        index = *((TS1 *)vs1 + HS1(i));                                   \
        if (index >= vlmax) {                                             \
            *((TS2 *)vd + HS2(i)) = 0;                                    \
        } else {                                                          \
            *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
        }                                                                 \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
}

/* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)

GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
4990 
/*
 * vrgather.vx: a single scalar index for all elements, i.e. a splat of
 * vs2[rs1] (or of 0 when rs1 >= vlmax) into the active elements of vd.
 */
#define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
                  CPURISCVState *env, uint32_t desc)                      \
{                                                                         \
    uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
    uint32_t vm = vext_vm(desc);                                          \
    uint32_t vl = env->vl;                                                \
    uint32_t esz = sizeof(ETYPE);                                         \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
    uint32_t vta = vext_vta(desc);                                        \
    uint32_t vma = vext_vma(desc);                                        \
    uint64_t index = s1;                                                  \
    uint32_t i;                                                           \
                                                                          \
    for (i = env->vstart; i < vl; i++) {                                  \
        if (!vm && !vext_elem_mask(v0, i)) {                              \
            /* set masked-off elements to 1s */                           \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
            continue;                                                     \
        }                                                                 \
        if (index >= vlmax) {                                             \
            *((ETYPE *)vd + H(i)) = 0;                                    \
        } else {                                                          \
            *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
        }                                                                 \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
}

/* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[rs1] */
GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
5027 
/*
 * Vector Compress Instruction
 *
 * Pack the elements of vs2 whose corresponding bit in the vs1 mask
 * register is set into the lowest-numbered elements of vd, in order.
 */
#define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
                  CPURISCVState *env, uint32_t desc)                      \
{                                                                         \
    uint32_t vl = env->vl;                                                \
    uint32_t esz = sizeof(ETYPE);                                         \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
    uint32_t vta = vext_vta(desc);                                        \
    uint32_t idx, cnt = 0;                                                \
                                                                          \
    for (idx = env->vstart; idx < vl; idx++) {                            \
        /* only elements selected by the vs1 mask are copied */           \
        if (vext_elem_mask(vs1, idx)) {                                   \
            *((ETYPE *)vd + H(cnt)) = *((ETYPE *)vs2 + H(idx));           \
            cnt++;                                                        \
        }                                                                 \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
}
5050 
/*
 * Compress into vd those elements of vs2 whose bit in vs1 is enabled;
 * one instantiation per element width (8/16/32/64-bit).
 */
GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
5056 
5057 /* Vector Whole Register Move */
5058 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5059 {
5060     /* EEW = SEW */
5061     uint32_t maxsz = simd_maxsz(desc);
5062     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5063     uint32_t startb = env->vstart * sewb;
5064     uint32_t i = startb;
5065 
5066     memcpy((uint8_t *)vd + H1(i),
5067            (uint8_t *)vs2 + H1(i),
5068            maxsz - startb);
5069 
5070     env->vstart = 0;
5071 }
5072 
/*
 * Vector Integer Extension
 *
 * Widen each active source element of type DTYPE into a destination
 * element of type ETYPE; the C assignment performs the zero- or
 * sign-extension according to the signedness of the instantiated types.
 */
#define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
                  CPURISCVState *env, uint32_t desc)             \
{                                                                \
    uint32_t vl = env->vl;                                       \
    uint32_t vm = vext_vm(desc);                                 \
    uint32_t esz = sizeof(ETYPE);                                \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
    uint32_t vta = vext_vta(desc);                               \
    uint32_t vma = vext_vma(desc);                               \
    uint32_t idx;                                                \
                                                                 \
    for (idx = env->vstart; idx < vl; idx++) {                   \
        if (vm || vext_elem_mask(v0, idx)) {                     \
            /* widening conversion happens on assignment */      \
            *((ETYPE *)vd + HD(idx)) =                           \
                *((DTYPE *)vs2 + HS1(idx));                      \
        } else {                                                 \
            /* set masked-off elements to 1s */                  \
            vext_set_elems_1s(vd, vma, idx * esz,                \
                              (idx + 1) * esz);                  \
        }                                                        \
    }                                                            \
    env->vstart = 0;                                             \
    /* set tail elements to 1s */                                \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
}
5098 
/* vzext.vf2/vf4/vf8: zero-extend, destination EEW = 2/4/8 x source EEW */
GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)

/* vsext.vf2/vf4/vf8: sign-extend, destination EEW = 2/4/8 x source EEW */
GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
5112