1 // See LICENSE for license details.
2 
3 #ifndef _RISCV_DECODE_H
4 #define _RISCV_DECODE_H
5 
6 #if (-1 != ~0) || ((-1 >> 1) != -1)
7 # error spike requires a two''s-complement c++ implementation
8 #endif
9 
10 #include <algorithm>
11 #include <cstdint>
12 #include <string.h>
13 #include <strings.h>
14 #include "encoding.h"
15 #include "config.h"
16 #include "common.h"
17 #include "softfloat_types.h"
18 #include "specialize.h"
19 #include <cinttypes>
20 
21 typedef int64_t sreg_t;
22 typedef uint64_t reg_t;
23 
24 #ifdef __SIZEOF_INT128__
25 typedef __int128 int128_t;
26 typedef unsigned __int128 uint128_t;
27 #endif
28 
29 const int NXPR = 32;
30 const int NFPR = 32;
31 const int NVPR = 32;
32 const int NCSR = 4096;
33 
34 #define X_RA 1
35 #define X_SP 2
36 
37 #define VCSR_VXRM_SHIFT 1
38 #define VCSR_VXRM  (0x3 << VCSR_VXRM_SHIFT)
39 
40 #define VCSR_VXSAT_SHIFT 0
41 #define VCSR_VXSAT  (0x1 << VCSR_VXSAT_SHIFT)
42 
43 #define FP_RD_NE  0
44 #define FP_RD_0   1
45 #define FP_RD_DN  2
46 #define FP_RD_UP  3
47 #define FP_RD_NMM 4
48 
49 #define FSR_RD_SHIFT 5
50 #define FSR_RD   (0x7 << FSR_RD_SHIFT)
51 
52 #define FPEXC_NX 0x01
53 #define FPEXC_UF 0x02
54 #define FPEXC_OF 0x04
55 #define FPEXC_DZ 0x08
56 #define FPEXC_NV 0x10
57 
58 #define FSR_AEXC_SHIFT 0
59 #define FSR_NVA  (FPEXC_NV << FSR_AEXC_SHIFT)
60 #define FSR_OFA  (FPEXC_OF << FSR_AEXC_SHIFT)
61 #define FSR_UFA  (FPEXC_UF << FSR_AEXC_SHIFT)
62 #define FSR_DZA  (FPEXC_DZ << FSR_AEXC_SHIFT)
63 #define FSR_NXA  (FPEXC_NX << FSR_AEXC_SHIFT)
64 #define FSR_AEXC (FSR_NVA | FSR_OFA | FSR_UFA | FSR_DZA | FSR_NXA)
65 
66 #define insn_length(x) \
67   (((x) & 0x03) < 0x03 ? 2 : \
68    ((x) & 0x1f) < 0x1f ? 4 : \
69    ((x) & 0x3f) < 0x3f ? 6 : \
70    ((x) & 0x7f) == 0x7f ? 4 : \
71    8)
72 #define MAX_INSN_LENGTH 8
73 #define PC_ALIGN 2
74 
75 typedef uint64_t insn_bits_t;
76 class insn_t
77 {
78 public:
79   insn_t() = default;
insn_t(insn_bits_t bits)80   insn_t(insn_bits_t bits) : b(bits) {}
bits()81   insn_bits_t bits() { return b & ~((UINT64_MAX) << (length() * 8)); }
length()82   int length() { return insn_length(b); }
i_imm()83   int64_t i_imm() { return int64_t(b) >> 20; }
shamt()84   int64_t shamt() { return x(20, 6); }
s_imm()85   int64_t s_imm() { return x(7, 5) + (xs(25, 7) << 5); }
sb_imm()86   int64_t sb_imm() { return (x(8, 4) << 1) + (x(25,6) << 5) + (x(7,1) << 11) + (imm_sign() << 12); }
u_imm()87   int64_t u_imm() { return int64_t(b) >> 12 << 12; }
uj_imm()88   int64_t uj_imm() { return (x(21, 10) << 1) + (x(20, 1) << 11) + (x(12, 8) << 12) + (imm_sign() << 20); }
rd()89   uint64_t rd() { return x(7, 5); }
rs1()90   uint64_t rs1() { return x(15, 5); }
rs2()91   uint64_t rs2() { return x(20, 5); }
rs3()92   uint64_t rs3() { return x(27, 5); }
rm()93   uint64_t rm() { return x(12, 3); }
csr()94   uint64_t csr() { return x(20, 12); }
iorw()95   uint64_t iorw() { return x(20, 8); }
bs()96   uint64_t bs  () {return x(30,2);} // Crypto ISE - SM4/AES32 byte select.
rcon()97   uint64_t rcon() {return x(20,4);} // Crypto ISE - AES64 round const.
98 
rvc_imm()99   int64_t rvc_imm() { return x(2, 5) + (xs(12, 1) << 5); }
rvc_zimm()100   int64_t rvc_zimm() { return x(2, 5) + (x(12, 1) << 5); }
rvc_addi4spn_imm()101   int64_t rvc_addi4spn_imm() { return (x(6, 1) << 2) + (x(5, 1) << 3) + (x(11, 2) << 4) + (x(7, 4) << 6); }
rvc_addi16sp_imm()102   int64_t rvc_addi16sp_imm() { return (x(6, 1) << 4) + (x(2, 1) << 5) + (x(5, 1) << 6) + (x(3, 2) << 7) + (xs(12, 1) << 9); }
rvc_lwsp_imm()103   int64_t rvc_lwsp_imm() { return (x(4, 3) << 2) + (x(12, 1) << 5) + (x(2, 2) << 6); }
rvc_ldsp_imm()104   int64_t rvc_ldsp_imm() { return (x(5, 2) << 3) + (x(12, 1) << 5) + (x(2, 3) << 6); }
rvc_swsp_imm()105   int64_t rvc_swsp_imm() { return (x(9, 4) << 2) + (x(7, 2) << 6); }
rvc_sdsp_imm()106   int64_t rvc_sdsp_imm() { return (x(10, 3) << 3) + (x(7, 3) << 6); }
rvc_lw_imm()107   int64_t rvc_lw_imm() { return (x(6, 1) << 2) + (x(10, 3) << 3) + (x(5, 1) << 6); }
rvc_ld_imm()108   int64_t rvc_ld_imm() { return (x(10, 3) << 3) + (x(5, 2) << 6); }
rvc_j_imm()109   int64_t rvc_j_imm() { return (x(3, 3) << 1) + (x(11, 1) << 4) + (x(2, 1) << 5) + (x(7, 1) << 6) + (x(6, 1) << 7) + (x(9, 2) << 8) + (x(8, 1) << 10) + (xs(12, 1) << 11); }
rvc_b_imm()110   int64_t rvc_b_imm() { return (x(3, 2) << 1) + (x(10, 2) << 3) + (x(2, 1) << 5) + (x(5, 2) << 6) + (xs(12, 1) << 8); }
rvc_simm3()111   int64_t rvc_simm3() { return x(10, 3); }
rvc_rd()112   uint64_t rvc_rd() { return rd(); }
rvc_rs1()113   uint64_t rvc_rs1() { return rd(); }
rvc_rs2()114   uint64_t rvc_rs2() { return x(2, 5); }
rvc_rs1s()115   uint64_t rvc_rs1s() { return 8 + x(7, 3); }
rvc_rs2s()116   uint64_t rvc_rs2s() { return 8 + x(2, 3); }
117 
v_vm()118   uint64_t v_vm() { return x(25, 1); }
v_wd()119   uint64_t v_wd() { return x(26, 1); }
v_nf()120   uint64_t v_nf() { return x(29, 3); }
v_simm5()121   uint64_t v_simm5() { return xs(15, 5); }
v_zimm5()122   uint64_t v_zimm5() { return x(15, 5); }
v_zimm10()123   uint64_t v_zimm10() { return x(20, 10); }
v_zimm11()124   uint64_t v_zimm11() { return x(20, 11); }
v_lmul()125   uint64_t v_lmul() { return x(20, 2); }
v_frac_lmul()126   uint64_t v_frac_lmul() { return x(22, 1); }
v_sew()127   uint64_t v_sew() { return 1 << (x(23, 3) + 3); }
v_width()128   uint64_t v_width() { return x(12, 3); }
v_mop()129   uint64_t v_mop() { return x(26, 2); }
v_lumop()130   uint64_t v_lumop() { return x(20, 5); }
v_sumop()131   uint64_t v_sumop() { return x(20, 5); }
v_vta()132   uint64_t v_vta() { return x(26, 1); }
v_vma()133   uint64_t v_vma() { return x(27, 1); }
v_mew()134   uint64_t v_mew() { return x(28, 1); }
135 
p_imm2()136   uint64_t p_imm2() { return x(20, 2); }
p_imm3()137   uint64_t p_imm3() { return x(20, 3); }
p_imm4()138   uint64_t p_imm4() { return x(20, 4); }
p_imm5()139   uint64_t p_imm5() { return x(20, 5); }
p_imm6()140   uint64_t p_imm6() { return x(20, 6); }
141 
142 private:
143   insn_bits_t b;
x(int lo,int len)144   uint64_t x(int lo, int len) { return (b >> lo) & ((insn_bits_t(1) << len)-1); }
xs(int lo,int len)145   uint64_t xs(int lo, int len) { return int64_t(b) << (64-lo-len) >> (64-len); }
imm_sign()146   uint64_t imm_sign() { return xs(63, 1); }
147 };
148 
149 template <class T, size_t N, bool zero_reg>
150 class regfile_t
151 {
152 public:
write(size_t i,T value)153   void write(size_t i, T value)
154   {
155     if (!zero_reg || i != 0)
156       data[i] = value;
157   }
158   const T& operator [] (size_t i) const
159   {
160     return data[i];
161   }
regfile_t()162   regfile_t()
163   {
164     reset();
165   }
reset()166   void reset()
167   {
168     memset(data, 0, sizeof(data));
169   }
170 private:
171   T data[N];
172 };
173 
174 // helpful macros, etc
175 #define MMU (*p->get_mmu())
176 #define STATE (*p->get_state())
177 #define P (*p)
178 #define FLEN (p->get_flen())
179 #define READ_REG(reg) STATE.XPR[reg]
180 #define READ_FREG(reg) STATE.FPR[reg]
181 #define RD READ_REG(insn.rd())
182 #define RS1 READ_REG(insn.rs1())
183 #define RS2 READ_REG(insn.rs2())
184 #define RS3 READ_REG(insn.rs3())
185 #define WRITE_RD(value) WRITE_REG(insn.rd(), value)
186 
187 #ifndef RISCV_ENABLE_COMMITLOG
188 # define WRITE_REG(reg, value) STATE.XPR.write(reg, value)
189 # define WRITE_FREG(reg, value) DO_WRITE_FREG(reg, freg(value))
190 # define WRITE_VSTATUS
191 #else
192    /* 0 : int
193     * 1 : floating
194     * 2 : vector reg
195     * 3 : vector hint
196     * 4 : csr
197     */
198 # define WRITE_REG(reg, value) ({ \
199     reg_t wdata = (value); /* value may have side effects */ \
200     STATE.log_reg_write[(reg) << 4] = {wdata, 0}; \
201     STATE.XPR.write(reg, wdata); \
202   })
203 # define WRITE_FREG(reg, value) ({ \
204     freg_t wdata = freg(value); /* value may have side effects */ \
205     STATE.log_reg_write[((reg) << 4) | 1] = wdata; \
206     DO_WRITE_FREG(reg, wdata); \
207   })
208 # define WRITE_VSTATUS STATE.log_reg_write[3] = {0, 0};
209 #endif
210 
211 // RVC macros
212 #define WRITE_RVC_RS1S(value) WRITE_REG(insn.rvc_rs1s(), value)
213 #define WRITE_RVC_RS2S(value) WRITE_REG(insn.rvc_rs2s(), value)
214 #define WRITE_RVC_FRS2S(value) WRITE_FREG(insn.rvc_rs2s(), value)
215 #define RVC_RS1 READ_REG(insn.rvc_rs1())
216 #define RVC_RS2 READ_REG(insn.rvc_rs2())
217 #define RVC_RS1S READ_REG(insn.rvc_rs1s())
218 #define RVC_RS2S READ_REG(insn.rvc_rs2s())
219 #define RVC_FRS2 READ_FREG(insn.rvc_rs2())
220 #define RVC_FRS2S READ_FREG(insn.rvc_rs2s())
221 #define RVC_SP READ_REG(X_SP)
222 
223 // FPU macros
224 #define FRS1 READ_FREG(insn.rs1())
225 #define FRS2 READ_FREG(insn.rs2())
226 #define FRS3 READ_FREG(insn.rs3())
227 #define dirty_fp_state  STATE.sstatus->dirty(SSTATUS_FS)
228 #define dirty_ext_state STATE.sstatus->dirty(SSTATUS_XS)
229 #define dirty_vs_state  STATE.sstatus->dirty(SSTATUS_VS)
230 #define DO_WRITE_FREG(reg, value) (STATE.FPR.write(reg, value), dirty_fp_state)
231 #define WRITE_FRD(value) WRITE_FREG(insn.rd(), value)
232 
233 #define SHAMT (insn.i_imm() & 0x3F)
234 #define BRANCH_TARGET (pc + insn.sb_imm())
235 #define JUMP_TARGET (pc + insn.uj_imm())
236 #define RM ({ int rm = insn.rm(); \
237               if(rm == 7) rm = STATE.frm->read(); \
238               if(rm > 4) throw trap_illegal_instruction(insn.bits()); \
239               rm; })
240 
241 #define get_field(reg, mask) (((reg) & (decltype(reg))(mask)) / ((mask) & ~((mask) << 1)))
242 #define set_field(reg, mask, val) (((reg) & ~(decltype(reg))(mask)) | (((decltype(reg))(val) * ((mask) & ~((mask) << 1))) & (decltype(reg))(mask)))
243 
244 #define require(x) do { if (unlikely(!(x))) throw trap_illegal_instruction(insn.bits()); } while (0)
245 #define require_privilege(p) require(STATE.prv >= (p))
246 #define require_novirt() if (unlikely(STATE.v)) throw trap_virtual_instruction(insn.bits())
247 #define require_rv64 require(xlen == 64)
248 #define require_rv32 require(xlen == 32)
249 #define require_extension(s) require(p->extension_enabled(s))
250 #define require_either_extension(A,B) require(p->extension_enabled(A) || p->extension_enabled(B));
251 #define require_impl(s) require(p->supports_impl(s))
252 #define require_fp          require(STATE.sstatus->enabled(SSTATUS_FS))
253 #define require_accelerator require(STATE.sstatus->enabled(SSTATUS_XS))
254 #define require_vector_vs   require(STATE.sstatus->enabled(SSTATUS_VS))
255 #define require_vector(alu) \
256   do { \
257     require_vector_vs; \
258     require_extension('V'); \
259     require(!P.VU.vill); \
260     if (alu && !P.VU.vstart_alu) \
261       require(P.VU.vstart->read() == 0);        \
262     WRITE_VSTATUS; \
263     dirty_vs_state; \
264   } while (0);
265 #define require_vector_novtype(is_log, alu) \
266   do {  \
267     require_vector_vs; \
268     require_extension('V'); \
269     if (alu && !P.VU.vstart_alu) \
270       require(P.VU.vstart->read() == 0);        \
271     if (is_log) \
272       WRITE_VSTATUS; \
273     dirty_vs_state; \
274   } while (0);
275 #define require_align(val, pos) require(is_aligned(val, pos))
276 #define require_noover(astart, asize, bstart, bsize) \
277   require(!is_overlapped(astart, asize, bstart, bsize))
278 #define require_noover_widen(astart, asize, bstart, bsize) \
279   require(!is_overlapped_widen(astart, asize, bstart, bsize))
280 #define require_vm do { if (insn.v_vm() == 0) require(insn.rd() != 0);} while(0);
281 
282 #define set_fp_exceptions ({ if (softfloat_exceptionFlags) { \
283                                STATE.fflags->write(STATE.fflags->read() | softfloat_exceptionFlags); \
284                              } \
285                              softfloat_exceptionFlags = 0; })
286 
287 #define sext32(x) ((sreg_t)(int32_t)(x))
288 #define zext32(x) ((reg_t)(uint32_t)(x))
289 #define sext_xlen(x) (((sreg_t)(x) << (64-xlen)) >> (64-xlen))
290 #define zext(x, pos) (((reg_t)(x) << (64-(pos))) >> (64-(pos)))
291 #define zext_xlen(x) zext(x, xlen)
292 
293 #define set_pc(x) \
294   do { p->check_pc_alignment(x); \
295        npc = sext_xlen(x); \
296      } while(0)
297 
298 #define set_pc_and_serialize(x) \
299   do { reg_t __npc = (x) & p->pc_alignment_mask(); \
300        npc = PC_SERIALIZE_AFTER; \
301        STATE.pc = __npc; \
302      } while(0)
303 
304 class wait_for_interrupt_t {};
305 
306 #define wfi() \
307   do { set_pc_and_serialize(npc); \
308        npc = PC_SERIALIZE_WFI; \
309        throw wait_for_interrupt_t(); \
310      } while(0)
311 
312 #define serialize() set_pc_and_serialize(npc)
313 
314 /* Sentinel PC values to serialize simulator pipeline */
315 #define PC_SERIALIZE_BEFORE 3
316 #define PC_SERIALIZE_AFTER 5
317 #define PC_SERIALIZE_WFI 7
318 #define invalid_pc(pc) ((pc) & 1)
319 
320 /* Convenience wrappers to simplify softfloat code sequences */
321 #define isBoxedF16(r) (isBoxedF32(r) && ((uint64_t)((r.v[0] >> 16) + 1) == ((uint64_t)1 << 48)))
322 #define unboxF16(r) (isBoxedF16(r) ? (uint16_t)r.v[0] : defaultNaNF16UI)
323 #define isBoxedF32(r) (isBoxedF64(r) && ((uint32_t)((r.v[0] >> 32) + 1) == 0))
324 #define unboxF32(r) (isBoxedF32(r) ? (uint32_t)r.v[0] : defaultNaNF32UI)
325 #define isBoxedF64(r) ((r.v[1] + 1) == 0)
326 #define unboxF64(r) (isBoxedF64(r) ? r.v[0] : defaultNaNF64UI)
327 typedef float128_t freg_t;
f16(uint16_t v)328 inline float16_t f16(uint16_t v) { return { v }; }
f32(uint32_t v)329 inline float32_t f32(uint32_t v) { return { v }; }
f64(uint64_t v)330 inline float64_t f64(uint64_t v) { return { v }; }
f16(freg_t r)331 inline float16_t f16(freg_t r) { return f16(unboxF16(r)); }
f32(freg_t r)332 inline float32_t f32(freg_t r) { return f32(unboxF32(r)); }
f64(freg_t r)333 inline float64_t f64(freg_t r) { return f64(unboxF64(r)); }
f128(freg_t r)334 inline float128_t f128(freg_t r) { return r; }
freg(float16_t f)335 inline freg_t freg(float16_t f) { return { ((uint64_t)-1 << 16) | f.v, (uint64_t)-1 }; }
freg(float32_t f)336 inline freg_t freg(float32_t f) { return { ((uint64_t)-1 << 32) | f.v, (uint64_t)-1 }; }
freg(float64_t f)337 inline freg_t freg(float64_t f) { return { f.v, (uint64_t)-1 }; }
freg(float128_t f)338 inline freg_t freg(float128_t f) { return f; }
339 #define F16_SIGN ((uint16_t)1 << 15)
340 #define F32_SIGN ((uint32_t)1 << 31)
341 #define F64_SIGN ((uint64_t)1 << 63)
342 #define fsgnj16(a, b, n, x) \
343   f16((f16(a).v & ~F16_SIGN) | ((((x) ? f16(a).v : (n) ? F16_SIGN : 0) ^ f16(b).v) & F16_SIGN))
344 #define fsgnj32(a, b, n, x) \
345   f32((f32(a).v & ~F32_SIGN) | ((((x) ? f32(a).v : (n) ? F32_SIGN : 0) ^ f32(b).v) & F32_SIGN))
346 #define fsgnj64(a, b, n, x) \
347   f64((f64(a).v & ~F64_SIGN) | ((((x) ? f64(a).v : (n) ? F64_SIGN : 0) ^ f64(b).v) & F64_SIGN))
348 
349 #define isNaNF128(x) isNaNF128UI(x.v[1], x.v[0])
defaultNaNF128()350 inline float128_t defaultNaNF128()
351 {
352   float128_t nan;
353   nan.v[1] = defaultNaNF128UI64;
354   nan.v[0] = defaultNaNF128UI0;
355   return nan;
356 }
fsgnj128(freg_t a,freg_t b,bool n,bool x)357 inline freg_t fsgnj128(freg_t a, freg_t b, bool n, bool x)
358 {
359   a.v[1] = (a.v[1] & ~F64_SIGN) | (((x ? a.v[1] : n ? F64_SIGN : 0) ^ b.v[1]) & F64_SIGN);
360   return a;
361 }
f128_negate(freg_t a)362 inline freg_t f128_negate(freg_t a)
363 {
364   a.v[1] ^= F64_SIGN;
365   return a;
366 }
367 
368 #define validate_csr(which, write) ({ \
369   if (!STATE.serialized) return PC_SERIALIZE_BEFORE; \
370   STATE.serialized = false; \
371   /* permissions check occurs in get_csr */ \
372   (which); })
373 
374 /* For debug only. This will fail if the native machine's float types are not IEEE */
to_f(float32_t f)375 inline float to_f(float32_t f){float r; memcpy(&r, &f, sizeof(r)); return r;}
to_f(float64_t f)376 inline double to_f(float64_t f){double r; memcpy(&r, &f, sizeof(r)); return r;}
to_f(float128_t f)377 inline long double to_f(float128_t f){long double r; memcpy(&r, &f, sizeof(r)); return r;}
378 
379 // Vector macros
380 #define e8 8      // 8b elements
381 #define e16 16    // 16b elements
382 #define e32 32    // 32b elements
383 #define e64 64    // 64b elements
384 #define e128 128  // 128b elements
385 #define e256 256  // 256b elements
386 #define e512 512  // 512b elements
387 #define e1024 1024  // 1024b elements
388 
389 #define vsext(x, sew) (((sreg_t)(x) << (64-sew)) >> (64-sew))
390 #define vzext(x, sew) (((reg_t)(x) << (64-sew)) >> (64-sew))
391 
392 #define DEBUG_RVV 0
393 
394 #if DEBUG_RVV
395 #define DEBUG_RVV_FP_VV \
396   printf("vfp(%lu) vd=%f vs1=%f vs2=%f\n", i, to_f(vd), to_f(vs1), to_f(vs2));
397 #define DEBUG_RVV_FP_VF \
398   printf("vfp(%lu) vd=%f vs1=%f vs2=%f\n", i, to_f(vd), to_f(rs1), to_f(vs2));
399 #define DEBUG_RVV_FMA_VV \
400   printf("vfma(%lu) vd=%f vs1=%f vs2=%f vd_old=%f\n", i, to_f(vd), to_f(vs1), to_f(vs2), to_f(vd_old));
401 #define DEBUG_RVV_FMA_VF \
402   printf("vfma(%lu) vd=%f vs1=%f vs2=%f vd_old=%f\n", i, to_f(vd), to_f(rs1), to_f(vs2), to_f(vd_old));
403 #else
404 #define DEBUG_RVV_FP_VV 0
405 #define DEBUG_RVV_FP_VF 0
406 #define DEBUG_RVV_FMA_VV 0
407 #define DEBUG_RVV_FMA_VF 0
408 #endif
409 
410 //
411 // vector: masking skip helper
412 //
413 #define VI_MASK_VARS \
414   const int midx = i / 64; \
415   const int mpos = i % 64;
416 
417 #define VI_LOOP_ELEMENT_SKIP(BODY) \
418   VI_MASK_VARS \
419   if (insn.v_vm() == 0) { \
420     BODY; \
421     bool skip = ((P.VU.elt<uint64_t>(0, midx) >> mpos) & 0x1) == 0; \
422     if (skip) {\
423         continue; \
424     }\
425   }
426 
427 #define VI_ELEMENT_SKIP(inx) \
428   if (inx >= vl) { \
429     continue; \
430   } else if (inx < P.VU.vstart->read()) {       \
431     continue; \
432   } else { \
433     VI_LOOP_ELEMENT_SKIP(); \
434   }
435 
436 //
437 // vector: operation and register acccess check helper
438 //
is_overlapped(const int astart,int asize,const int bstart,int bsize)439 static inline bool is_overlapped(const int astart, int asize,
440                                 const int bstart, int bsize)
441 {
442   asize = asize == 0 ? 1 : asize;
443   bsize = bsize == 0 ? 1 : bsize;
444 
445   const int aend = astart + asize;
446   const int bend = bstart + bsize;
447 
448   return std::max(aend, bend) - std::min(astart, bstart) < asize + bsize;
449 }
450 
is_overlapped_widen(const int astart,int asize,const int bstart,int bsize)451 static inline bool is_overlapped_widen(const int astart, int asize,
452                                        const int bstart, int bsize)
453 {
454   asize = asize == 0 ? 1 : asize;
455   bsize = bsize == 0 ? 1 : bsize;
456 
457   const int aend = astart + asize;
458   const int bend = bstart + bsize;
459 
460   if (astart < bstart &&
461       is_overlapped(astart, asize, bstart, bsize) &&
462       !is_overlapped(astart, asize, bstart + bsize, bsize)) {
463       return false;
464   } else  {
465     return std::max(aend, bend) - std::min(astart, bstart) < asize + bsize;
466   }
467 }
468 
is_aligned(const unsigned val,const unsigned pos)469 static inline bool is_aligned(const unsigned val, const unsigned pos)
470 {
471   return pos ? (val & (pos - 1)) == 0 : true;
472 }
473 
474 #define VI_NARROW_CHECK_COMMON \
475   require_vector(true);\
476   require(P.VU.vflmul <= 4); \
477   require(P.VU.vsew * 2 <= P.VU.ELEN); \
478   require_align(insn.rs2(), P.VU.vflmul * 2); \
479   require_align(insn.rd(), P.VU.vflmul); \
480   require_vm; \
481 
482 #define VI_WIDE_CHECK_COMMON \
483   require_vector(true);\
484   require(P.VU.vflmul <= 4); \
485   require(P.VU.vsew * 2 <= P.VU.ELEN); \
486   require_align(insn.rd(), P.VU.vflmul * 2); \
487   require_vm; \
488 
489 #define VI_CHECK_ST_INDEX(elt_width) \
490   require_vector(false); \
491   float vemul = ((float)elt_width / P.VU.vsew * P.VU.vflmul); \
492   require(vemul >= 0.125 && vemul <= 8); \
493   reg_t emul = vemul < 1 ? 1 : vemul; \
494   reg_t flmul = P.VU.vflmul < 1 ? 1 : P.VU.vflmul; \
495   require_align(insn.rd(), P.VU.vflmul); \
496   require_align(insn.rs2(), vemul); \
497   require((nf * flmul) <= (NVPR / 4) && \
498           (insn.rd() + nf * flmul) <= NVPR); \
499 
500 #define VI_CHECK_LD_INDEX(elt_width) \
501   VI_CHECK_ST_INDEX(elt_width); \
502   for (reg_t idx = 0; idx < nf; ++idx) { \
503     reg_t flmul = P.VU.vflmul < 1 ? 1 : P.VU.vflmul; \
504     reg_t seg_vd = insn.rd() + flmul * idx;  \
505     if (elt_width > P.VU.vsew) { \
506       if (seg_vd != insn.rs2()) \
507         require_noover(seg_vd, P.VU.vflmul, insn.rs2(), vemul); \
508     } else if (elt_width < P.VU.vsew) { \
509       if (vemul < 1) {\
510         require_noover(seg_vd, P.VU.vflmul, insn.rs2(), vemul); \
511       } else {\
512         require_noover_widen(seg_vd, P.VU.vflmul, insn.rs2(), vemul); \
513       } \
514     } \
515     if (nf >= 2) { \
516       require_noover(seg_vd, P.VU.vflmul, insn.rs2(), vemul); \
517     } \
518   } \
519   require_vm; \
520 
521 #define VI_CHECK_MSS(is_vs1) \
522   if (insn.rd() != insn.rs2()) \
523     require_noover(insn.rd(), 1, insn.rs2(), P.VU.vflmul); \
524   require_align(insn.rs2(), P.VU.vflmul); \
525   if (is_vs1) {\
526     if (insn.rd() != insn.rs1()) \
527       require_noover(insn.rd(), 1, insn.rs1(), P.VU.vflmul); \
528     require_align(insn.rs1(), P.VU.vflmul); \
529   } \
530 
531 #define VI_CHECK_SSS(is_vs1) \
532   require_vm; \
533   if (P.VU.vflmul > 1) { \
534     require_align(insn.rd(), P.VU.vflmul); \
535     require_align(insn.rs2(), P.VU.vflmul); \
536     if (is_vs1) { \
537       require_align(insn.rs1(), P.VU.vflmul); \
538     } \
539   }
540 
541 #define VI_CHECK_STORE(elt_width, is_mask_ldst) \
542   require_vector(false); \
543   reg_t veew = is_mask_ldst ? 1 : sizeof(elt_width##_t) * 8; \
544   float vemul = is_mask_ldst ? 1 : ((float)veew / P.VU.vsew * P.VU.vflmul); \
545   reg_t emul = vemul < 1 ? 1 : vemul; \
546   require(vemul >= 0.125 && vemul <= 8); \
547   require_align(insn.rd(), vemul); \
548   require((nf * emul) <= (NVPR / 4) && \
549           (insn.rd() + nf * emul) <= NVPR); \
550 
551 #define VI_CHECK_LOAD(elt_width, is_mask_ldst) \
552   VI_CHECK_STORE(elt_width, is_mask_ldst); \
553   require_vm; \
554 
555 #define VI_CHECK_DSS(is_vs1) \
556   VI_WIDE_CHECK_COMMON; \
557   require_align(insn.rs2(), P.VU.vflmul); \
558   if (P.VU.vflmul < 1) {\
559     require_noover(insn.rd(), P.VU.vflmul * 2, insn.rs2(), P.VU.vflmul); \
560   } else {\
561     require_noover_widen(insn.rd(), P.VU.vflmul * 2, insn.rs2(), P.VU.vflmul); \
562   } \
563   if (is_vs1) {\
564     require_align(insn.rs1(), P.VU.vflmul); \
565     if (P.VU.vflmul < 1) {\
566       require_noover(insn.rd(), P.VU.vflmul * 2, insn.rs1(), P.VU.vflmul); \
567     } else {\
568       require_noover_widen(insn.rd(), P.VU.vflmul * 2, insn.rs1(), P.VU.vflmul); \
569     } \
570   }
571 
572 #define VI_CHECK_DDS(is_rs) \
573   VI_WIDE_CHECK_COMMON; \
574   require_align(insn.rs2(), P.VU.vflmul * 2); \
575   if (is_rs) { \
576      require_align(insn.rs1(), P.VU.vflmul); \
577     if (P.VU.vflmul < 1) {\
578       require_noover(insn.rd(), P.VU.vflmul * 2, insn.rs1(), P.VU.vflmul); \
579     } else {\
580       require_noover_widen(insn.rd(), P.VU.vflmul * 2, insn.rs1(), P.VU.vflmul); \
581     } \
582   }
583 
584 #define VI_CHECK_SDS(is_vs1) \
585   VI_NARROW_CHECK_COMMON; \
586   if (insn.rd() != insn.rs2()) \
587     require_noover(insn.rd(), P.VU.vflmul, insn.rs2(), P.VU.vflmul * 2); \
588   if (is_vs1) \
589     require_align(insn.rs1(), P.VU.vflmul); \
590 
591 #define VI_CHECK_REDUCTION(is_wide) \
592   require_vector(true);\
593   if (is_wide) {\
594     require(P.VU.vsew * 2 <= P.VU.ELEN); \
595   } \
596   require_align(insn.rs2(), P.VU.vflmul); \
597   require(P.VU.vstart->read() == 0);      \
598 
599 #define VI_CHECK_SLIDE(is_over) \
600   require_align(insn.rs2(), P.VU.vflmul); \
601   require_align(insn.rd(), P.VU.vflmul); \
602   require_vm; \
603   if (is_over) \
604     require(insn.rd() != insn.rs2()); \
605 
606 
607 //
608 // vector: loop header and end helper
609 //
610 #define VI_GENERAL_LOOP_BASE \
611   require(P.VU.vsew >= e8 && P.VU.vsew <= e64); \
612   require_vector(true);\
613   reg_t vl = P.VU.vl->read();                   \
614   reg_t sew = P.VU.vsew; \
615   reg_t rd_num = insn.rd(); \
616   reg_t rs1_num = insn.rs1(); \
617   reg_t rs2_num = insn.rs2(); \
618   for (reg_t i=P.VU.vstart->read(); i<vl; ++i){
619 
620 #define VI_LOOP_BASE \
621     VI_GENERAL_LOOP_BASE \
622     VI_LOOP_ELEMENT_SKIP();
623 
624 #define VI_LOOP_END \
625   } \
626   P.VU.vstart->write(0);
627 
628 #define VI_LOOP_REDUCTION_END(x) \
629   } \
630   if (vl > 0) { \
631     vd_0_des = vd_0_res; \
632   } \
633   P.VU.vstart->write(0);
634 
635 #define VI_LOOP_CMP_BASE \
636   require(P.VU.vsew >= e8 && P.VU.vsew <= e64); \
637   require_vector(true);\
638   reg_t vl = P.VU.vl->read();                   \
639   reg_t sew = P.VU.vsew; \
640   reg_t rd_num = insn.rd(); \
641   reg_t rs1_num = insn.rs1(); \
642   reg_t rs2_num = insn.rs2(); \
643   for (reg_t i=P.VU.vstart->read(); i<vl; ++i){ \
644     VI_LOOP_ELEMENT_SKIP(); \
645     uint64_t mmask = UINT64_C(1) << mpos; \
646     uint64_t &vdi = P.VU.elt<uint64_t>(insn.rd(), midx, true); \
647     uint64_t res = 0;
648 
649 #define VI_LOOP_CMP_END \
650     vdi = (vdi & ~mmask) | (((res) << mpos) & mmask); \
651   } \
652   P.VU.vstart->write(0);
653 
654 #define VI_LOOP_MASK(op) \
655   require(P.VU.vsew <= e64); \
656   require_vector(true);\
657   reg_t vl = P.VU.vl->read();                        \
658   for (reg_t i = P.VU.vstart->read(); i < vl; ++i) { \
659     int midx = i / 64; \
660     int mpos = i % 64; \
661     uint64_t mmask = UINT64_C(1) << mpos; \
662     uint64_t vs2 = P.VU.elt<uint64_t>(insn.rs2(), midx); \
663     uint64_t vs1 = P.VU.elt<uint64_t>(insn.rs1(), midx); \
664     uint64_t &res = P.VU.elt<uint64_t>(insn.rd(), midx, true); \
665     res = (res & ~mmask) | ((op) & (1ULL << mpos)); \
666   } \
667   P.VU.vstart->write(0);
668 
669 #define VI_LOOP_NSHIFT_BASE \
670   VI_GENERAL_LOOP_BASE; \
671   VI_LOOP_ELEMENT_SKIP({\
672     require(!(insn.rd() == 0 && P.VU.vflmul > 1));\
673   });
674 
675 
676 #define INT_ROUNDING(result, xrm, gb) \
677   do { \
678     const uint64_t lsb = 1UL << (gb); \
679     const uint64_t lsb_half = lsb >> 1; \
680     switch (xrm) {\
681       case VRM::RNU:\
682         result += lsb_half; \
683         break;\
684       case VRM::RNE:\
685         if ((result & lsb_half) && ((result & (lsb_half - 1)) || (result & lsb))) \
686           result += lsb; \
687         break;\
688       case VRM::RDN:\
689         break;\
690       case VRM::ROD:\
691         if (result & (lsb - 1)) \
692           result |= lsb; \
693         break;\
694       case VRM::INVALID_RM:\
695         assert(true);\
696     } \
697   } while (0)
698 
699 //
700 // vector: integer and masking operand access helper
701 //
702 #define VXI_PARAMS(x) \
703   type_sew_t<x>::type &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i, true); \
704   type_sew_t<x>::type vs1 = P.VU.elt<type_sew_t<x>::type>(rs1_num, i); \
705   type_sew_t<x>::type vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i); \
706   type_sew_t<x>::type rs1 = (type_sew_t<x>::type)RS1; \
707   type_sew_t<x>::type simm5 = (type_sew_t<x>::type)insn.v_simm5();
708 
709 #define VV_U_PARAMS(x) \
710   type_usew_t<x>::type &vd = P.VU.elt<type_usew_t<x>::type>(rd_num, i, true); \
711   type_usew_t<x>::type vs1 = P.VU.elt<type_usew_t<x>::type>(rs1_num, i); \
712   type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
713 
714 #define VX_U_PARAMS(x) \
715   type_usew_t<x>::type &vd = P.VU.elt<type_usew_t<x>::type>(rd_num, i, true); \
716   type_usew_t<x>::type rs1 = (type_usew_t<x>::type)RS1; \
717   type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
718 
719 #define VI_U_PARAMS(x) \
720   type_usew_t<x>::type &vd = P.VU.elt<type_usew_t<x>::type>(rd_num, i, true); \
721   type_usew_t<x>::type zimm5 = (type_usew_t<x>::type)insn.v_zimm5(); \
722   type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
723 
724 #define VV_PARAMS(x) \
725   type_sew_t<x>::type &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i, true); \
726   type_sew_t<x>::type vs1 = P.VU.elt<type_sew_t<x>::type>(rs1_num, i); \
727   type_sew_t<x>::type vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i);
728 
729 #define VX_PARAMS(x) \
730   type_sew_t<x>::type &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i, true); \
731   type_sew_t<x>::type rs1 = (type_sew_t<x>::type)RS1; \
732   type_sew_t<x>::type vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i);
733 
734 #define VI_PARAMS(x) \
735   type_sew_t<x>::type &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i, true); \
736   type_sew_t<x>::type simm5 = (type_sew_t<x>::type)insn.v_simm5(); \
737   type_sew_t<x>::type vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i);
738 
739 #define XV_PARAMS(x) \
740   type_sew_t<x>::type &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i, true); \
741   type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, RS1);
742 
743 #define VV_UCMP_PARAMS(x) \
744   type_usew_t<x>::type vs1 = P.VU.elt<type_usew_t<x>::type>(rs1_num, i); \
745   type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
746 
747 #define VX_UCMP_PARAMS(x) \
748   type_usew_t<x>::type rs1 = (type_usew_t<x>::type)RS1; \
749   type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
750 
751 #define VI_UCMP_PARAMS(x) \
752   type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
753 
754 #define VV_CMP_PARAMS(x) \
755   type_sew_t<x>::type vs1 = P.VU.elt<type_sew_t<x>::type>(rs1_num, i); \
756   type_sew_t<x>::type vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i);
757 
758 #define VX_CMP_PARAMS(x) \
759   type_sew_t<x>::type rs1 = (type_sew_t<x>::type)RS1; \
760   type_sew_t<x>::type vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i);
761 
762 #define VI_CMP_PARAMS(x) \
763   type_sew_t<x>::type simm5 = (type_sew_t<x>::type)insn.v_simm5(); \
764   type_sew_t<x>::type vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i);
765 
766 #define VI_XI_SLIDEDOWN_PARAMS(x, off) \
767   auto &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i, true); \
768   auto vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i + off);
769 
770 #define VI_XI_SLIDEUP_PARAMS(x, offset) \
771   auto &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i, true); \
772   auto vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i - offset);
773 
774 #define VI_NSHIFT_PARAMS(sew1, sew2) \
775   auto &vd = P.VU.elt<type_usew_t<sew1>::type>(rd_num, i, true); \
776   auto vs2_u = P.VU.elt<type_usew_t<sew2>::type>(rs2_num, i); \
777   auto vs2 = P.VU.elt<type_sew_t<sew2>::type>(rs2_num, i); \
778   auto zimm5 = (type_usew_t<sew1>::type)insn.v_zimm5();
779 
780 #define VX_NSHIFT_PARAMS(sew1, sew2) \
781   auto &vd = P.VU.elt<type_usew_t<sew1>::type>(rd_num, i, true); \
782   auto vs2_u = P.VU.elt<type_usew_t<sew2>::type>(rs2_num, i); \
783   auto vs2 = P.VU.elt<type_sew_t<sew2>::type>(rs2_num, i); \
784   auto rs1 = (type_sew_t<sew1>::type)RS1;
785 
786 #define VV_NSHIFT_PARAMS(sew1, sew2) \
787   auto &vd = P.VU.elt<type_usew_t<sew1>::type>(rd_num, i, true); \
788   auto vs2_u = P.VU.elt<type_usew_t<sew2>::type>(rs2_num, i); \
789   auto vs2 = P.VU.elt<type_sew_t<sew2>::type>(rs2_num, i); \
790   auto vs1 = P.VU.elt<type_sew_t<sew1>::type>(rs1_num, i);
791 
792 #define XI_CARRY_PARAMS(x) \
793   auto vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i); \
794   auto rs1 = (type_sew_t<x>::type)RS1; \
795   auto simm5 = (type_sew_t<x>::type)insn.v_simm5(); \
796   auto &vd = P.VU.elt<uint64_t>(rd_num, midx, true);
797 
798 #define VV_CARRY_PARAMS(x) \
799   auto vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i); \
800   auto vs1 = P.VU.elt<type_sew_t<x>::type>(rs1_num, i); \
801   auto &vd = P.VU.elt<uint64_t>(rd_num, midx, true);
802 
803 #define XI_WITH_CARRY_PARAMS(x) \
804   auto vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i); \
805   auto rs1 = (type_sew_t<x>::type)RS1; \
806   auto simm5 = (type_sew_t<x>::type)insn.v_simm5(); \
807   auto &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i, true);
808 
809 #define VV_WITH_CARRY_PARAMS(x) \
810   auto vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i); \
811   auto vs1 = P.VU.elt<type_sew_t<x>::type>(rs1_num, i); \
812   auto &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i, true);
813 
814 //
815 // vector: integer and masking operation loop
816 //
817 
818 // comparision result to masking register
819 #define VI_VV_LOOP_CMP(BODY) \
820   VI_CHECK_MSS(true); \
821   VI_LOOP_CMP_BASE \
822   if (sew == e8){ \
823     VV_CMP_PARAMS(e8); \
824     BODY; \
825   }else if(sew == e16){ \
826     VV_CMP_PARAMS(e16); \
827     BODY; \
828   }else if(sew == e32){ \
829     VV_CMP_PARAMS(e32); \
830     BODY; \
831   }else if(sew == e64){ \
832     VV_CMP_PARAMS(e64); \
833     BODY; \
834   } \
835   VI_LOOP_CMP_END
836 
837 #define VI_VX_LOOP_CMP(BODY) \
838   VI_CHECK_MSS(false); \
839   VI_LOOP_CMP_BASE \
840   if (sew == e8){ \
841     VX_CMP_PARAMS(e8); \
842     BODY; \
843   }else if(sew == e16){ \
844     VX_CMP_PARAMS(e16); \
845     BODY; \
846   }else if(sew == e32){ \
847     VX_CMP_PARAMS(e32); \
848     BODY; \
849   }else if(sew == e64){ \
850     VX_CMP_PARAMS(e64); \
851     BODY; \
852   } \
853   VI_LOOP_CMP_END
854 
855 #define VI_VI_LOOP_CMP(BODY) \
856   VI_CHECK_MSS(false); \
857   VI_LOOP_CMP_BASE \
858   if (sew == e8){ \
859     VI_CMP_PARAMS(e8); \
860     BODY; \
861   }else if(sew == e16){ \
862     VI_CMP_PARAMS(e16); \
863     BODY; \
864   }else if(sew == e32){ \
865     VI_CMP_PARAMS(e32); \
866     BODY; \
867   }else if(sew == e64){ \
868     VI_CMP_PARAMS(e64); \
869     BODY; \
870   } \
871   VI_LOOP_CMP_END
872 
873 #define VI_VV_ULOOP_CMP(BODY) \
874   VI_CHECK_MSS(true); \
875   VI_LOOP_CMP_BASE \
876   if (sew == e8){ \
877     VV_UCMP_PARAMS(e8); \
878     BODY; \
879   }else if(sew == e16){ \
880     VV_UCMP_PARAMS(e16); \
881     BODY; \
882   }else if(sew == e32){ \
883     VV_UCMP_PARAMS(e32); \
884     BODY; \
885   }else if(sew == e64){ \
886     VV_UCMP_PARAMS(e64); \
887     BODY; \
888   } \
889   VI_LOOP_CMP_END
890 
891 #define VI_VX_ULOOP_CMP(BODY) \
892   VI_CHECK_MSS(false); \
893   VI_LOOP_CMP_BASE \
894   if (sew == e8){ \
895     VX_UCMP_PARAMS(e8); \
896     BODY; \
897   }else if(sew == e16){ \
898     VX_UCMP_PARAMS(e16); \
899     BODY; \
900   }else if(sew == e32){ \
901     VX_UCMP_PARAMS(e32); \
902     BODY; \
903   }else if(sew == e64){ \
904     VX_UCMP_PARAMS(e64); \
905     BODY; \
906   } \
907   VI_LOOP_CMP_END
908 
909 #define VI_VI_ULOOP_CMP(BODY) \
910   VI_CHECK_MSS(false); \
911   VI_LOOP_CMP_BASE \
912   if (sew == e8){ \
913     VI_UCMP_PARAMS(e8); \
914     BODY; \
915   }else if(sew == e16){ \
916     VI_UCMP_PARAMS(e16); \
917     BODY; \
918   }else if(sew == e32){ \
919     VI_UCMP_PARAMS(e32); \
920     BODY; \
921   }else if(sew == e64){ \
922     VI_UCMP_PARAMS(e64); \
923     BODY; \
924   } \
925   VI_LOOP_CMP_END
926 
927 // merge and copy loop
928 #define VI_VVXI_MERGE_LOOP(BODY) \
929   VI_GENERAL_LOOP_BASE \
930   if (sew == e8){ \
931     VXI_PARAMS(e8); \
932     BODY; \
933   }else if(sew == e16){ \
934     VXI_PARAMS(e16); \
935     BODY; \
936   }else if(sew == e32){ \
937     VXI_PARAMS(e32); \
938     BODY; \
939   }else if(sew == e64){ \
940     VXI_PARAMS(e64); \
941     BODY; \
942   } \
943   VI_LOOP_END
944 
945 // reduction loop - signed
946 #define VI_LOOP_REDUCTION_BASE(x) \
947   require(x >= e8 && x <= e64); \
948   reg_t vl = P.VU.vl->read();   \
949   reg_t rd_num = insn.rd(); \
950   reg_t rs1_num = insn.rs1(); \
951   reg_t rs2_num = insn.rs2(); \
952   auto &vd_0_des = P.VU.elt<type_sew_t<x>::type>(rd_num, 0, true); \
953   auto vd_0_res = P.VU.elt<type_sew_t<x>::type>(rs1_num, 0); \
954   for (reg_t i=P.VU.vstart->read(); i<vl; ++i){ \
955     VI_LOOP_ELEMENT_SKIP(); \
956     auto vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i); \
957 
958 #define REDUCTION_LOOP(x, BODY) \
959   VI_LOOP_REDUCTION_BASE(x) \
960   BODY; \
961   VI_LOOP_REDUCTION_END(x)
962 
963 #define VI_VV_LOOP_REDUCTION(BODY) \
964   VI_CHECK_REDUCTION(false); \
965   reg_t sew = P.VU.vsew; \
966   if (sew == e8) { \
967     REDUCTION_LOOP(e8, BODY) \
968   } else if(sew == e16) { \
969     REDUCTION_LOOP(e16, BODY) \
970   } else if(sew == e32) { \
971     REDUCTION_LOOP(e32, BODY) \
972   } else if(sew == e64) { \
973     REDUCTION_LOOP(e64, BODY) \
974   }
975 
976 // reduction loop - unsgied
977 #define VI_ULOOP_REDUCTION_BASE(x) \
978   require(x >= e8 && x <= e64); \
979   reg_t vl = P.VU.vl->read();   \
980   reg_t rd_num = insn.rd(); \
981   reg_t rs1_num = insn.rs1(); \
982   reg_t rs2_num = insn.rs2(); \
983   auto &vd_0_des = P.VU.elt<type_usew_t<x>::type>(rd_num, 0, true); \
984   auto vd_0_res = P.VU.elt<type_usew_t<x>::type>(rs1_num, 0); \
985   for (reg_t i=P.VU.vstart->read(); i<vl; ++i){ \
986     VI_LOOP_ELEMENT_SKIP(); \
987     auto vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
988 
989 #define REDUCTION_ULOOP(x, BODY) \
990   VI_ULOOP_REDUCTION_BASE(x) \
991   BODY; \
992   VI_LOOP_REDUCTION_END(x)
993 
994 #define VI_VV_ULOOP_REDUCTION(BODY) \
995   VI_CHECK_REDUCTION(false); \
996   reg_t sew = P.VU.vsew; \
997   if (sew == e8){ \
998     REDUCTION_ULOOP(e8, BODY) \
999   } else if(sew == e16) { \
1000     REDUCTION_ULOOP(e16, BODY) \
1001   } else if(sew == e32) { \
1002     REDUCTION_ULOOP(e32, BODY) \
1003   } else if(sew == e64) { \
1004     REDUCTION_ULOOP(e64, BODY) \
1005   }
1006 
1007 
1008 // genearl VXI signed/unsgied loop
1009 #define VI_VV_ULOOP(BODY) \
1010   VI_CHECK_SSS(true) \
1011   VI_LOOP_BASE \
1012   if (sew == e8){ \
1013     VV_U_PARAMS(e8); \
1014     BODY; \
1015   }else if(sew == e16){ \
1016     VV_U_PARAMS(e16); \
1017     BODY; \
1018   }else if(sew == e32){ \
1019     VV_U_PARAMS(e32); \
1020     BODY; \
1021   }else if(sew == e64){ \
1022     VV_U_PARAMS(e64); \
1023     BODY; \
1024   } \
1025   VI_LOOP_END
1026 
1027 #define VI_VV_LOOP(BODY) \
1028   VI_CHECK_SSS(true) \
1029   VI_LOOP_BASE \
1030   if (sew == e8){ \
1031     VV_PARAMS(e8); \
1032     BODY; \
1033   }else if(sew == e16){ \
1034     VV_PARAMS(e16); \
1035     BODY; \
1036   }else if(sew == e32){ \
1037     VV_PARAMS(e32); \
1038     BODY; \
1039   }else if(sew == e64){ \
1040     VV_PARAMS(e64); \
1041     BODY; \
1042   } \
1043   VI_LOOP_END
1044 
1045 #define VI_VX_ULOOP(BODY) \
1046   VI_CHECK_SSS(false) \
1047   VI_LOOP_BASE \
1048   if (sew == e8){ \
1049     VX_U_PARAMS(e8); \
1050     BODY; \
1051   }else if(sew == e16){ \
1052     VX_U_PARAMS(e16); \
1053     BODY; \
1054   }else if(sew == e32){ \
1055     VX_U_PARAMS(e32); \
1056     BODY; \
1057   }else if(sew == e64){ \
1058     VX_U_PARAMS(e64); \
1059     BODY; \
1060   } \
1061   VI_LOOP_END
1062 
1063 #define VI_VX_LOOP(BODY) \
1064   VI_CHECK_SSS(false) \
1065   VI_LOOP_BASE \
1066   if (sew == e8){ \
1067     VX_PARAMS(e8); \
1068     BODY; \
1069   }else if(sew == e16){ \
1070     VX_PARAMS(e16); \
1071     BODY; \
1072   }else if(sew == e32){ \
1073     VX_PARAMS(e32); \
1074     BODY; \
1075   }else if(sew == e64){ \
1076     VX_PARAMS(e64); \
1077     BODY; \
1078   } \
1079   VI_LOOP_END
1080 
1081 #define VI_VI_ULOOP(BODY) \
1082   VI_CHECK_SSS(false) \
1083   VI_LOOP_BASE \
1084   if (sew == e8){ \
1085     VI_U_PARAMS(e8); \
1086     BODY; \
1087   }else if(sew == e16){ \
1088     VI_U_PARAMS(e16); \
1089     BODY; \
1090   }else if(sew == e32){ \
1091     VI_U_PARAMS(e32); \
1092     BODY; \
1093   }else if(sew == e64){ \
1094     VI_U_PARAMS(e64); \
1095     BODY; \
1096   } \
1097   VI_LOOP_END
1098 
1099 #define VI_VI_LOOP(BODY) \
1100   VI_CHECK_SSS(false) \
1101   VI_LOOP_BASE \
1102   if (sew == e8){ \
1103     VI_PARAMS(e8); \
1104     BODY; \
1105   }else if(sew == e16){ \
1106     VI_PARAMS(e16); \
1107     BODY; \
1108   }else if(sew == e32){ \
1109     VI_PARAMS(e32); \
1110     BODY; \
1111   }else if(sew == e64){ \
1112     VI_PARAMS(e64); \
1113     BODY; \
1114   } \
1115   VI_LOOP_END
1116 
1117 // narrow operation loop
1118 #define VI_VV_LOOP_NARROW(BODY) \
1119 VI_NARROW_CHECK_COMMON; \
1120 VI_LOOP_BASE \
1121 if (sew == e8){ \
1122   VI_NARROW_SHIFT(e8, e16) \
1123   BODY; \
1124 }else if(sew == e16){ \
1125   VI_NARROW_SHIFT(e16, e32) \
1126   BODY; \
1127 }else if(sew == e32){ \
1128   VI_NARROW_SHIFT(e32, e64) \
1129   BODY; \
1130 } \
1131 VI_LOOP_END
1132 
1133 #define VI_NARROW_SHIFT(sew1, sew2) \
1134   type_usew_t<sew1>::type &vd = P.VU.elt<type_usew_t<sew1>::type>(rd_num, i, true); \
1135   type_usew_t<sew2>::type vs2_u = P.VU.elt<type_usew_t<sew2>::type>(rs2_num, i); \
1136   type_usew_t<sew1>::type zimm5 = (type_usew_t<sew1>::type)insn.v_zimm5(); \
1137   type_sew_t<sew2>::type vs2 = P.VU.elt<type_sew_t<sew2>::type>(rs2_num, i); \
1138   type_sew_t<sew1>::type vs1 = P.VU.elt<type_sew_t<sew1>::type>(rs1_num, i); \
1139   type_sew_t<sew1>::type rs1 = (type_sew_t<sew1>::type)RS1;
1140 
1141 #define VI_VVXI_LOOP_NARROW(BODY, is_vs1) \
1142   VI_CHECK_SDS(is_vs1); \
1143   VI_LOOP_BASE \
1144   if (sew == e8){ \
1145     VI_NARROW_SHIFT(e8, e16) \
1146     BODY; \
1147   } else if (sew == e16) { \
1148     VI_NARROW_SHIFT(e16, e32) \
1149     BODY; \
1150   } else if (sew == e32) { \
1151     VI_NARROW_SHIFT(e32, e64) \
1152     BODY; \
1153   } \
1154   VI_LOOP_END
1155 
1156 #define VI_VI_LOOP_NSHIFT(BODY, is_vs1) \
1157   VI_CHECK_SDS(is_vs1); \
1158   VI_LOOP_NSHIFT_BASE \
1159   if (sew == e8){ \
1160     VI_NSHIFT_PARAMS(e8, e16) \
1161     BODY; \
1162   } else if (sew == e16) { \
1163     VI_NSHIFT_PARAMS(e16, e32) \
1164     BODY; \
1165   } else if (sew == e32) { \
1166     VI_NSHIFT_PARAMS(e32, e64) \
1167     BODY; \
1168   } \
1169   VI_LOOP_END
1170 
1171 #define VI_VX_LOOP_NSHIFT(BODY, is_vs1) \
1172   VI_CHECK_SDS(is_vs1); \
1173   VI_LOOP_NSHIFT_BASE \
1174   if (sew == e8){ \
1175     VX_NSHIFT_PARAMS(e8, e16) \
1176     BODY; \
1177   } else if (sew == e16) { \
1178     VX_NSHIFT_PARAMS(e16, e32) \
1179     BODY; \
1180   } else if (sew == e32) { \
1181     VX_NSHIFT_PARAMS(e32, e64) \
1182     BODY; \
1183   } \
1184   VI_LOOP_END
1185 
1186 #define VI_VV_LOOP_NSHIFT(BODY, is_vs1) \
1187   VI_CHECK_SDS(is_vs1); \
1188   VI_LOOP_NSHIFT_BASE \
1189   if (sew == e8){ \
1190     VV_NSHIFT_PARAMS(e8, e16) \
1191     BODY; \
1192   } else if (sew == e16) { \
1193     VV_NSHIFT_PARAMS(e16, e32) \
1194     BODY; \
1195   } else if (sew == e32) { \
1196     VV_NSHIFT_PARAMS(e32, e64) \
1197     BODY; \
1198   } \
1199   VI_LOOP_END
1200 
1201 // widen operation loop
1202 #define VI_VV_LOOP_WIDEN(BODY) \
1203   VI_LOOP_BASE \
1204   if (sew == e8){ \
1205     VV_PARAMS(e8); \
1206     BODY; \
1207   }else if(sew == e16){ \
1208     VV_PARAMS(e16); \
1209     BODY; \
1210   }else if(sew == e32){ \
1211     VV_PARAMS(e32); \
1212     BODY; \
1213   } \
1214   VI_LOOP_END
1215 
1216 #define VI_VX_LOOP_WIDEN(BODY) \
1217   VI_LOOP_BASE \
1218   if (sew == e8){ \
1219     VX_PARAMS(e8); \
1220     BODY; \
1221   }else if(sew == e16){ \
1222     VX_PARAMS(e16); \
1223     BODY; \
1224   }else if(sew == e32){ \
1225     VX_PARAMS(e32); \
1226     BODY; \
1227   } \
1228   VI_LOOP_END
1229 
1230 #define VI_WIDE_OP_AND_ASSIGN(var0, var1, var2, op0, op1, sign) \
1231   switch(P.VU.vsew) { \
1232   case e8: { \
1233     sign##16_t vd_w = P.VU.elt<sign##16_t>(rd_num, i); \
1234     P.VU.elt<uint16_t>(rd_num, i, true) = \
1235       op1((sign##16_t)(sign##8_t)var0 op0 (sign##16_t)(sign##8_t)var1) + var2; \
1236     } \
1237     break; \
1238   case e16: { \
1239     sign##32_t vd_w = P.VU.elt<sign##32_t>(rd_num, i); \
1240     P.VU.elt<uint32_t>(rd_num, i, true) = \
1241       op1((sign##32_t)(sign##16_t)var0 op0 (sign##32_t)(sign##16_t)var1) + var2; \
1242     } \
1243     break; \
1244   default: { \
1245     sign##64_t vd_w = P.VU.elt<sign##64_t>(rd_num, i); \
1246     P.VU.elt<uint64_t>(rd_num, i, true) = \
1247       op1((sign##64_t)(sign##32_t)var0 op0 (sign##64_t)(sign##32_t)var1) + var2; \
1248     } \
1249     break; \
1250   }
1251 
1252 #define VI_WIDE_OP_AND_ASSIGN_MIX(var0, var1, var2, op0, op1, sign_d, sign_1, sign_2) \
1253   switch(P.VU.vsew) { \
1254   case e8: { \
1255     sign_d##16_t vd_w = P.VU.elt<sign_d##16_t>(rd_num, i); \
1256     P.VU.elt<uint16_t>(rd_num, i, true) = \
1257       op1((sign_1##16_t)(sign_1##8_t)var0 op0 (sign_2##16_t)(sign_2##8_t)var1) + var2; \
1258     } \
1259     break; \
1260   case e16: { \
1261     sign_d##32_t vd_w = P.VU.elt<sign_d##32_t>(rd_num, i); \
1262     P.VU.elt<uint32_t>(rd_num, i, true) = \
1263       op1((sign_1##32_t)(sign_1##16_t)var0 op0 (sign_2##32_t)(sign_2##16_t)var1) + var2; \
1264     } \
1265     break; \
1266   default: { \
1267     sign_d##64_t vd_w = P.VU.elt<sign_d##64_t>(rd_num, i); \
1268     P.VU.elt<uint64_t>(rd_num, i, true) = \
1269       op1((sign_1##64_t)(sign_1##32_t)var0 op0 (sign_2##64_t)(sign_2##32_t)var1) + var2; \
1270     } \
1271     break; \
1272   }
1273 
1274 #define VI_WIDE_WVX_OP(var0, op0, sign) \
1275   switch(P.VU.vsew) { \
1276   case e8: { \
1277     sign##16_t &vd_w = P.VU.elt<sign##16_t>(rd_num, i, true); \
1278     sign##16_t vs2_w = P.VU.elt<sign##16_t>(rs2_num, i); \
1279     vd_w = vs2_w op0 (sign##16_t)(sign##8_t)var0; \
1280     } \
1281     break; \
1282   case e16: { \
1283     sign##32_t &vd_w = P.VU.elt<sign##32_t>(rd_num, i, true); \
1284     sign##32_t vs2_w = P.VU.elt<sign##32_t>(rs2_num, i); \
1285     vd_w = vs2_w op0 (sign##32_t)(sign##16_t)var0; \
1286     } \
1287     break; \
1288   default: { \
1289     sign##64_t &vd_w = P.VU.elt<sign##64_t>(rd_num, i, true); \
1290     sign##64_t vs2_w = P.VU.elt<sign##64_t>(rs2_num, i); \
1291     vd_w = vs2_w op0 (sign##64_t)(sign##32_t)var0; \
1292     } \
1293     break; \
1294   }
1295 
1296 // wide reduction loop - signed
1297 #define VI_LOOP_WIDE_REDUCTION_BASE(sew1, sew2) \
1298   reg_t vl = P.VU.vl->read();                   \
1299   reg_t rd_num = insn.rd(); \
1300   reg_t rs1_num = insn.rs1(); \
1301   reg_t rs2_num = insn.rs2(); \
1302   auto &vd_0_des = P.VU.elt<type_sew_t<sew2>::type>(rd_num, 0, true); \
1303   auto vd_0_res = P.VU.elt<type_sew_t<sew2>::type>(rs1_num, 0); \
1304   for (reg_t i=P.VU.vstart->read(); i<vl; ++i){ \
1305     VI_LOOP_ELEMENT_SKIP(); \
1306     auto vs2 = P.VU.elt<type_sew_t<sew1>::type>(rs2_num, i);
1307 
1308 #define WIDE_REDUCTION_LOOP(sew1, sew2, BODY) \
1309   VI_LOOP_WIDE_REDUCTION_BASE(sew1, sew2) \
1310   BODY; \
1311   VI_LOOP_REDUCTION_END(sew2)
1312 
1313 #define VI_VV_LOOP_WIDE_REDUCTION(BODY) \
1314   VI_CHECK_REDUCTION(true); \
1315   reg_t sew = P.VU.vsew; \
1316   if (sew == e8){ \
1317     WIDE_REDUCTION_LOOP(e8, e16, BODY) \
1318   } else if(sew == e16){ \
1319     WIDE_REDUCTION_LOOP(e16, e32, BODY) \
1320   } else if(sew == e32){ \
1321     WIDE_REDUCTION_LOOP(e32, e64, BODY) \
1322   }
1323 
1324 // wide reduction loop - unsigned
1325 #define VI_ULOOP_WIDE_REDUCTION_BASE(sew1, sew2) \
1326   reg_t vl = P.VU.vl->read();                    \
1327   reg_t rd_num = insn.rd(); \
1328   reg_t rs1_num = insn.rs1(); \
1329   reg_t rs2_num = insn.rs2(); \
1330   auto &vd_0_des = P.VU.elt<type_usew_t<sew2>::type>(rd_num, 0, true); \
1331   auto vd_0_res = P.VU.elt<type_usew_t<sew2>::type>(rs1_num, 0); \
1332   for (reg_t i=P.VU.vstart->read(); i<vl; ++i) { \
1333     VI_LOOP_ELEMENT_SKIP(); \
1334     auto vs2 = P.VU.elt<type_usew_t<sew1>::type>(rs2_num, i);
1335 
1336 #define WIDE_REDUCTION_ULOOP(sew1, sew2, BODY) \
1337   VI_ULOOP_WIDE_REDUCTION_BASE(sew1, sew2) \
1338   BODY; \
1339   VI_LOOP_REDUCTION_END(sew2)
1340 
1341 #define VI_VV_ULOOP_WIDE_REDUCTION(BODY) \
1342   VI_CHECK_REDUCTION(true); \
1343   reg_t sew = P.VU.vsew; \
1344   if (sew == e8){ \
1345     WIDE_REDUCTION_ULOOP(e8, e16, BODY) \
1346   } else if(sew == e16){ \
1347     WIDE_REDUCTION_ULOOP(e16, e32, BODY) \
1348   } else if(sew == e32){ \
1349     WIDE_REDUCTION_ULOOP(e32, e64, BODY) \
1350   }
1351 
1352 // carry/borrow bit loop
1353 #define VI_VV_LOOP_CARRY(BODY) \
1354   VI_CHECK_MSS(true); \
1355   VI_GENERAL_LOOP_BASE \
1356   VI_MASK_VARS \
1357     if (sew == e8){ \
1358       VV_CARRY_PARAMS(e8) \
1359       BODY; \
1360     } else if (sew == e16) { \
1361       VV_CARRY_PARAMS(e16) \
1362       BODY; \
1363     } else if (sew == e32) { \
1364       VV_CARRY_PARAMS(e32) \
1365       BODY; \
1366     } else if (sew == e64) { \
1367       VV_CARRY_PARAMS(e64) \
1368       BODY; \
1369     } \
1370   VI_LOOP_END
1371 
1372 #define VI_XI_LOOP_CARRY(BODY) \
1373   VI_CHECK_MSS(false); \
1374   VI_GENERAL_LOOP_BASE \
1375   VI_MASK_VARS \
1376     if (sew == e8){ \
1377       XI_CARRY_PARAMS(e8) \
1378       BODY; \
1379     } else if (sew == e16) { \
1380       XI_CARRY_PARAMS(e16) \
1381       BODY; \
1382     } else if (sew == e32) { \
1383       XI_CARRY_PARAMS(e32) \
1384       BODY; \
1385     } else if (sew == e64) { \
1386       XI_CARRY_PARAMS(e64) \
1387       BODY; \
1388     } \
1389   VI_LOOP_END
1390 
1391 #define VI_VV_LOOP_WITH_CARRY(BODY) \
1392   require(insn.rd() != 0); \
1393   VI_CHECK_SSS(true); \
1394   VI_GENERAL_LOOP_BASE \
1395   VI_MASK_VARS \
1396     if (sew == e8){ \
1397       VV_WITH_CARRY_PARAMS(e8) \
1398       BODY; \
1399     } else if (sew == e16) { \
1400       VV_WITH_CARRY_PARAMS(e16) \
1401       BODY; \
1402     } else if (sew == e32) { \
1403       VV_WITH_CARRY_PARAMS(e32) \
1404       BODY; \
1405     } else if (sew == e64) { \
1406       VV_WITH_CARRY_PARAMS(e64) \
1407       BODY; \
1408     } \
1409   VI_LOOP_END
1410 
1411 #define VI_XI_LOOP_WITH_CARRY(BODY) \
1412   require(insn.rd() != 0); \
1413   VI_CHECK_SSS(false); \
1414   VI_GENERAL_LOOP_BASE \
1415   VI_MASK_VARS \
1416     if (sew == e8){ \
1417       XI_WITH_CARRY_PARAMS(e8) \
1418       BODY; \
1419     } else if (sew == e16) { \
1420       XI_WITH_CARRY_PARAMS(e16) \
1421       BODY; \
1422     } else if (sew == e32) { \
1423       XI_WITH_CARRY_PARAMS(e32) \
1424       BODY; \
1425     } else if (sew == e64) { \
1426       XI_WITH_CARRY_PARAMS(e64) \
1427       BODY; \
1428     } \
1429   VI_LOOP_END
1430 
1431 // average loop
1432 #define VI_VVX_LOOP_AVG(opd, op, is_vs1) \
1433 VI_CHECK_SSS(is_vs1); \
1434 VRM xrm = p->VU.get_vround_mode(); \
1435 VI_LOOP_BASE \
1436   switch(sew) { \
1437     case e8: { \
1438      VV_PARAMS(e8); \
1439      type_sew_t<e8>::type rs1 = RS1; \
1440      auto res = (int32_t)vs2 op opd; \
1441      INT_ROUNDING(res, xrm, 1); \
1442      vd = res >> 1; \
1443      break; \
1444     } \
1445     case e16: { \
1446      VV_PARAMS(e16); \
1447      type_sew_t<e16>::type rs1 = RS1; \
1448      auto res = (int32_t)vs2 op opd; \
1449      INT_ROUNDING(res, xrm, 1); \
1450      vd = res >> 1; \
1451      break; \
1452     } \
1453     case e32: { \
1454      VV_PARAMS(e32); \
1455      type_sew_t<e32>::type rs1 = RS1; \
1456      auto res = (int64_t)vs2 op opd; \
1457      INT_ROUNDING(res, xrm, 1); \
1458      vd = res >> 1; \
1459      break; \
1460     } \
1461     default: { \
1462      VV_PARAMS(e64); \
1463      type_sew_t<e64>::type rs1 = RS1; \
1464      auto res = (int128_t)vs2 op opd; \
1465      INT_ROUNDING(res, xrm, 1); \
1466      vd = res >> 1; \
1467      break; \
1468     } \
1469   } \
1470 VI_LOOP_END
1471 
1472 #define VI_VVX_ULOOP_AVG(opd, op, is_vs1) \
1473 VI_CHECK_SSS(is_vs1); \
1474 VRM xrm = p->VU.get_vround_mode(); \
1475 VI_LOOP_BASE \
1476   switch(sew) { \
1477     case e8: { \
1478      VV_U_PARAMS(e8); \
1479      type_usew_t<e8>::type rs1 = RS1; \
1480      auto res = (uint16_t)vs2 op opd; \
1481      INT_ROUNDING(res, xrm, 1); \
1482      vd = res >> 1; \
1483      break; \
1484     } \
1485     case e16: { \
1486      VV_U_PARAMS(e16); \
1487      type_usew_t<e16>::type rs1 = RS1; \
1488      auto res = (uint32_t)vs2 op opd; \
1489      INT_ROUNDING(res, xrm, 1); \
1490      vd = res >> 1; \
1491      break; \
1492     } \
1493     case e32: { \
1494      VV_U_PARAMS(e32); \
1495      type_usew_t<e32>::type rs1 = RS1; \
1496      auto res = (uint64_t)vs2 op opd; \
1497      INT_ROUNDING(res, xrm, 1); \
1498      vd = res >> 1; \
1499      break; \
1500     } \
1501     default: { \
1502      VV_U_PARAMS(e64); \
1503      type_usew_t<e64>::type rs1 = RS1; \
1504      auto res = (uint128_t)vs2 op opd; \
1505      INT_ROUNDING(res, xrm, 1); \
1506      vd = res >> 1; \
1507      break; \
1508     } \
1509   } \
1510 VI_LOOP_END
1511 
1512 //
1513 // vector: load/store helper
1514 //
1515 #define VI_STRIP(inx) \
1516   reg_t vreg_inx = inx;
1517 
1518 #define VI_DUPLICATE_VREG(reg_num, idx_sew) \
1519 reg_t index[P.VU.vlmax]; \
1520  for (reg_t i = 0; i < P.VU.vlmax && P.VU.vl->read() != 0; ++i) {       \
1521   switch(idx_sew) { \
1522     case e8: \
1523       index[i] = P.VU.elt<uint8_t>(reg_num, i); \
1524       break; \
1525     case e16: \
1526       index[i] = P.VU.elt<uint16_t>(reg_num, i); \
1527       break; \
1528     case e32: \
1529       index[i] = P.VU.elt<uint32_t>(reg_num, i); \
1530       break; \
1531     case e64: \
1532       index[i] = P.VU.elt<uint64_t>(reg_num, i); \
1533       break; \
1534   } \
1535 }
1536 
1537 #define VI_LD(stride, offset, elt_width, is_mask_ldst) \
1538   const reg_t nf = insn.v_nf() + 1; \
1539   const reg_t vl = is_mask_ldst ? ((P.VU.vl->read() + 7) / 8) : P.VU.vl->read(); \
1540   const reg_t baseAddr = RS1; \
1541   const reg_t vd = insn.rd(); \
1542   VI_CHECK_LOAD(elt_width, is_mask_ldst); \
1543   for (reg_t i = 0; i < vl; ++i) { \
1544     VI_ELEMENT_SKIP(i); \
1545     VI_STRIP(i); \
1546     P.VU.vstart->write(i); \
1547     for (reg_t fn = 0; fn < nf; ++fn) { \
1548       elt_width##_t val = MMU.load_##elt_width( \
1549         baseAddr + (stride) + (offset) * sizeof(elt_width##_t)); \
1550       P.VU.elt<elt_width##_t>(vd + fn * emul, vreg_inx, true) = val; \
1551     } \
1552   } \
1553   P.VU.vstart->write(0);
1554 
1555 #define VI_LD_INDEX(elt_width, is_seg) \
1556   const reg_t nf = insn.v_nf() + 1; \
1557   const reg_t vl = P.VU.vl->read(); \
1558   const reg_t baseAddr = RS1; \
1559   const reg_t vd = insn.rd(); \
1560   if (!is_seg) \
1561     require(nf == 1); \
1562   VI_CHECK_LD_INDEX(elt_width); \
1563   VI_DUPLICATE_VREG(insn.rs2(), elt_width); \
1564   for (reg_t i = 0; i < vl; ++i) { \
1565     VI_ELEMENT_SKIP(i); \
1566     VI_STRIP(i); \
1567     P.VU.vstart->write(i); \
1568     for (reg_t fn = 0; fn < nf; ++fn) { \
1569       switch(P.VU.vsew){ \
1570         case e8: \
1571           P.VU.elt<uint8_t>(vd + fn * flmul, vreg_inx, true) = \
1572             MMU.load_uint8(baseAddr + index[i] + fn * 1); \
1573           break; \
1574         case e16: \
1575           P.VU.elt<uint16_t>(vd + fn * flmul, vreg_inx, true) = \
1576             MMU.load_uint16(baseAddr + index[i] + fn * 2); \
1577           break; \
1578         case e32: \
1579           P.VU.elt<uint32_t>(vd + fn * flmul, vreg_inx, true) = \
1580             MMU.load_uint32(baseAddr + index[i] + fn * 4); \
1581           break; \
1582         default: \
1583           P.VU.elt<uint64_t>(vd + fn * flmul, vreg_inx, true) = \
1584             MMU.load_uint64(baseAddr + index[i] + fn * 8); \
1585           break; \
1586       } \
1587     } \
1588   } \
1589   P.VU.vstart->write(0);
1590 
1591 #define VI_ST(stride, offset, elt_width, is_mask_ldst) \
1592   const reg_t nf = insn.v_nf() + 1; \
1593   const reg_t vl = is_mask_ldst ? ((P.VU.vl->read() + 7) / 8) : P.VU.vl->read(); \
1594   const reg_t baseAddr = RS1; \
1595   const reg_t vs3 = insn.rd(); \
1596   VI_CHECK_STORE(elt_width, is_mask_ldst); \
1597   for (reg_t i = 0; i < vl; ++i) { \
1598     VI_STRIP(i) \
1599     VI_ELEMENT_SKIP(i); \
1600     P.VU.vstart->write(i); \
1601     for (reg_t fn = 0; fn < nf; ++fn) { \
1602       elt_width##_t val = P.VU.elt<elt_width##_t>(vs3 + fn * emul, vreg_inx); \
1603       MMU.store_##elt_width( \
1604         baseAddr + (stride) + (offset) * sizeof(elt_width##_t), val); \
1605     } \
1606   } \
1607   P.VU.vstart->write(0);
1608 
1609 #define VI_ST_INDEX(elt_width, is_seg) \
1610   const reg_t nf = insn.v_nf() + 1; \
1611   const reg_t vl = P.VU.vl->read(); \
1612   const reg_t baseAddr = RS1; \
1613   const reg_t vs3 = insn.rd(); \
1614   if (!is_seg) \
1615     require(nf == 1); \
1616   VI_CHECK_ST_INDEX(elt_width); \
1617   VI_DUPLICATE_VREG(insn.rs2(), elt_width);   \
1618   for (reg_t i = 0; i < vl; ++i) { \
1619     VI_STRIP(i) \
1620     VI_ELEMENT_SKIP(i); \
1621     P.VU.vstart->write(i); \
1622     for (reg_t fn = 0; fn < nf; ++fn) { \
1623       switch (P.VU.vsew) { \
1624       case e8: \
1625         MMU.store_uint8(baseAddr + index[i] + fn * 1, \
1626           P.VU.elt<uint8_t>(vs3 + fn * flmul, vreg_inx)); \
1627         break; \
1628       case e16: \
1629         MMU.store_uint16(baseAddr + index[i] + fn * 2, \
1630           P.VU.elt<uint16_t>(vs3 + fn * flmul, vreg_inx)); \
1631         break; \
1632       case e32: \
1633         MMU.store_uint32(baseAddr + index[i] + fn * 4, \
1634           P.VU.elt<uint32_t>(vs3 + fn * flmul, vreg_inx)); \
1635         break; \
1636       default: \
1637         MMU.store_uint64(baseAddr + index[i] + fn * 8, \
1638           P.VU.elt<uint64_t>(vs3 + fn * flmul, vreg_inx)); \
1639         break; \
1640       } \
1641     } \
1642   } \
1643   P.VU.vstart->write(0);
1644 
1645 #define VI_LDST_FF(elt_width) \
1646   const reg_t nf = insn.v_nf() + 1; \
1647   const reg_t sew = p->VU.vsew; \
1648   const reg_t vl = p->VU.vl->read(); \
1649   const reg_t baseAddr = RS1; \
1650   const reg_t rd_num = insn.rd(); \
1651   VI_CHECK_LOAD(elt_width, false); \
1652   bool early_stop = false; \
1653   for (reg_t i = p->VU.vstart->read(); i < vl; ++i) { \
1654     VI_STRIP(i); \
1655     VI_ELEMENT_SKIP(i); \
1656     \
1657     for (reg_t fn = 0; fn < nf; ++fn) { \
1658       uint64_t val; \
1659       try { \
1660         val = MMU.load_##elt_width( \
1661           baseAddr + (i * nf + fn) * sizeof(elt_width##_t)); \
1662       } catch (trap_t& t) { \
1663         if (i == 0) \
1664           throw; /* Only take exception on zeroth element */ \
1665         /* Reduce VL if an exception occurs on a later element */ \
1666         early_stop = true; \
1667         P.VU.vl->write_raw(i);                  \
1668         break; \
1669       } \
1670       p->VU.elt<elt_width##_t>(rd_num + fn * emul, vreg_inx, true) = val; \
1671     } \
1672     \
1673     if (early_stop) { \
1674       break; \
1675     } \
1676   } \
1677   p->VU.vstart->write(0);
1678 
1679 #define VI_LD_WHOLE(elt_width) \
1680   require_vector_novtype(true, false); \
1681   const reg_t baseAddr = RS1; \
1682   const reg_t vd = insn.rd(); \
1683   const reg_t len = insn.v_nf() + 1; \
1684   require_align(vd, len); \
1685   const reg_t elt_per_reg = P.VU.vlenb / sizeof(elt_width ## _t); \
1686   const reg_t size = len * elt_per_reg; \
1687   if (P.VU.vstart->read() < size) { \
1688     reg_t i = P.VU.vstart->read() / elt_per_reg; \
1689     reg_t off = P.VU.vstart->read() % elt_per_reg; \
1690     if (off) { \
1691       for (reg_t pos = off; pos < elt_per_reg; ++pos) { \
1692         auto val = MMU.load_## elt_width(baseAddr + \
1693           P.VU.vstart->read() * sizeof(elt_width ## _t)); \
1694         P.VU.elt<elt_width ## _t>(vd + i, pos, true) = val; \
1695         P.VU.vstart->write(P.VU.vstart->read() + 1); \
1696       } \
1697       ++i; \
1698     } \
1699     for (; i < len; ++i) { \
1700       for (reg_t pos = 0; pos < elt_per_reg; ++pos) { \
1701         auto val = MMU.load_## elt_width(baseAddr + \
1702           P.VU.vstart->read() * sizeof(elt_width ## _t)); \
1703         P.VU.elt<elt_width ## _t>(vd + i, pos, true) = val; \
1704         P.VU.vstart->write(P.VU.vstart->read() + 1); \
1705       } \
1706     } \
1707   } \
1708   P.VU.vstart = 0; \
1709 
1710 #define VI_ST_WHOLE \
1711   require_vector_novtype(true, false); \
1712   const reg_t baseAddr = RS1; \
1713   const reg_t vs3 = insn.rd(); \
1714   const reg_t len = insn.v_nf() + 1; \
1715   require_align(vs3, len); \
1716   const reg_t size = len * P.VU.vlenb; \
1717    \
1718   if (P.VU.vstart->read() < size) { \
1719     reg_t i = P.VU.vstart->read() / P.VU.vlenb; \
1720     reg_t off = P.VU.vstart->read() % P.VU.vlenb; \
1721     if (off) { \
1722       for (reg_t pos = off; pos < P.VU.vlenb; ++pos) { \
1723         auto val = P.VU.elt<uint8_t>(vs3 + i, pos); \
1724         MMU.store_uint8(baseAddr + P.VU.vstart->read(), val); \
1725         P.VU.vstart->write(P.VU.vstart->read() + 1); \
1726       } \
1727       i++; \
1728     } \
1729     for (; i < len; ++i) { \
1730       for (reg_t pos = 0; pos < P.VU.vlenb; ++pos) { \
1731         auto val = P.VU.elt<uint8_t>(vs3 + i, pos); \
1732         MMU.store_uint8(baseAddr + P.VU.vstart->read(), val); \
1733         P.VU.vstart->write(P.VU.vstart->read() + 1); \
1734       } \
1735     } \
1736   } \
1737   P.VU.vstart->write(0);
1738 
1739 //
1740 // vector: amo
1741 //
1742 #define VI_AMO(op, type, idx_type) \
1743   require_vector(false); \
1744   require_align(insn.rd(), P.VU.vflmul); \
1745   require(P.VU.vsew <= P.get_xlen() && P.VU.vsew >= 32); \
1746   require_align(insn.rd(), P.VU.vflmul); \
1747   float vemul = ((float)idx_type / P.VU.vsew * P.VU.vflmul); \
1748   require(vemul >= 0.125 && vemul <= 8); \
1749   require_align(insn.rs2(), vemul); \
1750   if (insn.v_wd()) {\
1751     require_vm; \
1752     if (idx_type > P.VU.vsew) { \
1753       if (insn.rd() != insn.rs2()) \
1754         require_noover(insn.rd(), P.VU.vflmul, insn.rs2(), vemul); \
1755     } else if (idx_type < P.VU.vsew) { \
1756       if (vemul < 1) {\
1757         require_noover(insn.rd(), P.VU.vflmul, insn.rs2(), vemul); \
1758       } else {\
1759         require_noover_widen(insn.rd(), P.VU.vflmul, insn.rs2(), vemul); \
1760       } \
1761     } \
1762   } \
1763   VI_DUPLICATE_VREG(insn.rs2(), idx_type); \
1764   const reg_t vl = P.VU.vl->read(); \
1765   const reg_t baseAddr = RS1; \
1766   const reg_t vd = insn.rd(); \
1767   for (reg_t i = P.VU.vstart->read(); i < vl; ++i) { \
1768     VI_ELEMENT_SKIP(i); \
1769     VI_STRIP(i); \
1770     P.VU.vstart->write(i); \
1771     switch (P.VU.vsew) { \
1772     case e32: {\
1773       auto vs3 = P.VU.elt< type ## 32_t>(vd, vreg_inx); \
1774       auto val = MMU.amo_uint32(baseAddr + index[i], [&]( type ## 32_t lhs) { op }); \
1775       if (insn.v_wd()) \
1776         P.VU.elt< type ## 32_t>(vd, vreg_inx, true) = val; \
1777       } \
1778       break; \
1779     case e64: {\
1780       auto vs3 = P.VU.elt< type ## 64_t>(vd, vreg_inx); \
1781       auto val = MMU.amo_uint64(baseAddr + index[i], [&]( type ## 64_t lhs) { op }); \
1782       if (insn.v_wd()) \
1783         P.VU.elt< type ## 64_t>(vd, vreg_inx, true) = val; \
1784       } \
1785       break; \
1786     default: \
1787       require(0); \
1788       break; \
1789     } \
1790   } \
1791   P.VU.vstart->write(0);
1792 
1793 // vector: sign/unsiged extension
1794 #define VI_VV_EXT(div, type) \
1795   require(insn.rd() != insn.rs2()); \
1796   require_vm; \
1797   reg_t from = P.VU.vsew / div; \
1798   require(from >= e8 && from <= e64); \
1799   require(((float)P.VU.vflmul / div) >= 0.125 && ((float)P.VU.vflmul / div) <= 8 ); \
1800   require_align(insn.rd(), P.VU.vflmul); \
1801   require_align(insn.rs2(), P.VU.vflmul / div); \
1802   if ((P.VU.vflmul / div) < 1) { \
1803     require_noover(insn.rd(), P.VU.vflmul, insn.rs2(), P.VU.vflmul / div); \
1804   } else {\
1805     require_noover_widen(insn.rd(), P.VU.vflmul, insn.rs2(), P.VU.vflmul / div); \
1806   } \
1807   reg_t pat = (((P.VU.vsew >> 3) << 4) | from >> 3); \
1808   VI_GENERAL_LOOP_BASE \
1809   VI_LOOP_ELEMENT_SKIP(); \
1810     switch (pat) { \
1811       case 0x21: \
1812         P.VU.elt<type##16_t>(rd_num, i, true) = P.VU.elt<type##8_t>(rs2_num, i); \
1813         break; \
1814       case 0x41: \
1815         P.VU.elt<type##32_t>(rd_num, i, true) = P.VU.elt<type##8_t>(rs2_num, i); \
1816         break; \
1817       case 0x81: \
1818         P.VU.elt<type##64_t>(rd_num, i, true) = P.VU.elt<type##8_t>(rs2_num, i); \
1819         break; \
1820       case 0x42: \
1821         P.VU.elt<type##32_t>(rd_num, i, true) = P.VU.elt<type##16_t>(rs2_num, i); \
1822         break; \
1823       case 0x82: \
1824         P.VU.elt<type##64_t>(rd_num, i, true) = P.VU.elt<type##16_t>(rs2_num, i); \
1825         break; \
1826       case 0x84: \
1827         P.VU.elt<type##64_t>(rd_num, i, true) = P.VU.elt<type##32_t>(rs2_num, i); \
1828         break; \
1829       case 0x88: \
1830         P.VU.elt<type##64_t>(rd_num, i, true) = P.VU.elt<type##32_t>(rs2_num, i); \
1831         break; \
1832       default: \
1833         break; \
1834     } \
1835   VI_LOOP_END
1836 
1837 //
1838 // vector: vfp helper
1839 //
1840 #define VI_VFP_COMMON \
1841   require_fp; \
1842   require((P.VU.vsew == e16 && p->extension_enabled(EXT_ZFH)) || \
1843           (P.VU.vsew == e32 && p->extension_enabled('F')) || \
1844           (P.VU.vsew == e64 && p->extension_enabled('D'))); \
1845   require_vector(true);\
1846   require(STATE.frm->read() < 0x5);\
1847   reg_t vl = P.VU.vl->read(); \
1848   reg_t rd_num = insn.rd(); \
1849   reg_t rs1_num = insn.rs1(); \
1850   reg_t rs2_num = insn.rs2(); \
1851   softfloat_roundingMode = STATE.frm->read();
1852 
1853 #define VI_VFP_LOOP_BASE \
1854   VI_VFP_COMMON \
1855   for (reg_t i=P.VU.vstart->read(); i<vl; ++i){ \
1856     VI_LOOP_ELEMENT_SKIP();
1857 
1858 #define VI_VFP_LOOP_CMP_BASE \
1859   VI_VFP_COMMON \
1860   for (reg_t i = P.VU.vstart->read(); i < vl; ++i) { \
1861     VI_LOOP_ELEMENT_SKIP(); \
1862     uint64_t mmask = UINT64_C(1) << mpos; \
1863     uint64_t &vdi = P.VU.elt<uint64_t>(rd_num, midx, true); \
1864     uint64_t res = 0;
1865 
1866 #define VI_VFP_LOOP_REDUCTION_BASE(width) \
1867   float##width##_t vd_0 = P.VU.elt<float##width##_t>(rd_num, 0); \
1868   float##width##_t vs1_0 = P.VU.elt<float##width##_t>(rs1_num, 0); \
1869   vd_0 = vs1_0; \
1870   bool is_active = false; \
1871   for (reg_t i=P.VU.vstart->read(); i<vl; ++i){ \
1872     VI_LOOP_ELEMENT_SKIP(); \
1873     float##width##_t vs2 = P.VU.elt<float##width##_t>(rs2_num, i); \
1874     is_active = true; \
1875 
1876 #define VI_VFP_LOOP_WIDE_REDUCTION_BASE \
1877   VI_VFP_COMMON \
1878   float64_t vd_0 = f64(P.VU.elt<float64_t>(rs1_num, 0).v); \
1879   for (reg_t i=P.VU.vstart->read(); i<vl; ++i) { \
1880     VI_LOOP_ELEMENT_SKIP();
1881 
1882 #define VI_VFP_LOOP_END \
1883   } \
1884   P.VU.vstart->write(0); \
1885 
1886 #define VI_VFP_LOOP_REDUCTION_END(x) \
1887   } \
1888   P.VU.vstart->write(0); \
1889   if (vl > 0) { \
1890     if (is_propagate && !is_active) { \
1891       switch (x) { \
1892         case e16: {\
1893             auto ret = f16_classify(f16(vd_0.v)); \
1894             if (ret & 0x300) { \
1895               if (ret & 0x100) { \
1896                 softfloat_exceptionFlags |= softfloat_flag_invalid; \
1897                 set_fp_exceptions; \
1898               } \
1899               P.VU.elt<uint16_t>(rd_num, 0, true) = defaultNaNF16UI; \
1900             } else { \
1901               P.VU.elt<uint16_t>(rd_num, 0, true) = vd_0.v; \
1902             } \
1903           } \
1904           break; \
1905         case e32: { \
1906             auto ret = f32_classify(f32(vd_0.v)); \
1907             if (ret & 0x300) { \
1908               if (ret & 0x100) { \
1909                 softfloat_exceptionFlags |= softfloat_flag_invalid; \
1910                 set_fp_exceptions; \
1911               } \
1912               P.VU.elt<uint32_t>(rd_num, 0, true) = defaultNaNF32UI; \
1913             } else { \
1914               P.VU.elt<uint32_t>(rd_num, 0, true) = vd_0.v; \
1915             } \
1916           } \
1917           break; \
1918         case e64: {\
1919             auto ret = f64_classify(f64(vd_0.v)); \
1920             if (ret & 0x300) { \
1921               if (ret & 0x100) { \
1922                 softfloat_exceptionFlags |= softfloat_flag_invalid; \
1923                 set_fp_exceptions; \
1924               } \
1925               P.VU.elt<uint64_t>(rd_num, 0, true) = defaultNaNF64UI; \
1926             } else { \
1927               P.VU.elt<uint64_t>(rd_num, 0, true) = vd_0.v; \
1928             } \
1929           } \
1930           break; \
1931       } \
1932     } else { \
1933       P.VU.elt<type_sew_t<x>::type>(rd_num, 0, true) = vd_0.v; \
1934     } \
1935   }
1936 
1937 #define VI_VFP_LOOP_CMP_END \
1938   switch(P.VU.vsew) { \
1939     case e16: \
1940     case e32: \
1941     case e64: { \
1942       vdi = (vdi & ~mmask) | (((res) << mpos) & mmask); \
1943       break; \
1944     } \
1945     default: \
1946       require(0); \
1947       break; \
1948     }; \
1949   } \
1950   P.VU.vstart->write(0);
1951 
1952 #define VI_VFP_VV_LOOP(BODY16, BODY32, BODY64) \
1953   VI_CHECK_SSS(true); \
1954   VI_VFP_LOOP_BASE \
1955   switch(P.VU.vsew) { \
1956     case e16: {\
1957       float16_t &vd = P.VU.elt<float16_t>(rd_num, i, true); \
1958       float16_t vs1 = P.VU.elt<float16_t>(rs1_num, i); \
1959       float16_t vs2 = P.VU.elt<float16_t>(rs2_num, i); \
1960       BODY16; \
1961       set_fp_exceptions; \
1962       break; \
1963     }\
1964     case e32: {\
1965       float32_t &vd = P.VU.elt<float32_t>(rd_num, i, true); \
1966       float32_t vs1 = P.VU.elt<float32_t>(rs1_num, i); \
1967       float32_t vs2 = P.VU.elt<float32_t>(rs2_num, i); \
1968       BODY32; \
1969       set_fp_exceptions; \
1970       break; \
1971     }\
1972     case e64: {\
1973       float64_t &vd = P.VU.elt<float64_t>(rd_num, i, true); \
1974       float64_t vs1 = P.VU.elt<float64_t>(rs1_num, i); \
1975       float64_t vs2 = P.VU.elt<float64_t>(rs2_num, i); \
1976       BODY64; \
1977       set_fp_exceptions; \
1978       break; \
1979     }\
1980     default: \
1981       require(0); \
1982       break; \
1983   }; \
1984   DEBUG_RVV_FP_VV; \
1985   VI_VFP_LOOP_END
1986 
1987 #define VI_VFP_V_LOOP(BODY16, BODY32, BODY64) \
1988   VI_CHECK_SSS(false); \
1989   VI_VFP_LOOP_BASE \
1990   switch(P.VU.vsew) { \
1991     case e16: {\
1992       float16_t &vd = P.VU.elt<float16_t>(rd_num, i, true); \
1993       float16_t vs2 = P.VU.elt<float16_t>(rs2_num, i); \
1994       BODY16; \
1995       break; \
1996     }\
1997     case e32: {\
1998       float32_t &vd = P.VU.elt<float32_t>(rd_num, i, true); \
1999       float32_t vs2 = P.VU.elt<float32_t>(rs2_num, i); \
2000       BODY32; \
2001       break; \
2002     }\
2003     case e64: {\
2004       float64_t &vd = P.VU.elt<float64_t>(rd_num, i, true); \
2005       float64_t vs2 = P.VU.elt<float64_t>(rs2_num, i); \
2006       BODY64; \
2007       break; \
2008     }\
2009     default: \
2010       require(0); \
2011       break; \
2012   }; \
2013   set_fp_exceptions; \
2014   VI_VFP_LOOP_END
2015 
2016 #define VI_VFP_VV_LOOP_REDUCTION(BODY16, BODY32, BODY64) \
2017   VI_CHECK_REDUCTION(false) \
2018   VI_VFP_COMMON \
2019   switch(P.VU.vsew) { \
2020     case e16: {\
2021       VI_VFP_LOOP_REDUCTION_BASE(16) \
2022         BODY16; \
2023         set_fp_exceptions; \
2024       VI_VFP_LOOP_REDUCTION_END(e16) \
2025       break; \
2026     }\
2027     case e32: {\
2028       VI_VFP_LOOP_REDUCTION_BASE(32) \
2029         BODY32; \
2030         set_fp_exceptions; \
2031       VI_VFP_LOOP_REDUCTION_END(e32) \
2032       break; \
2033     }\
2034     case e64: {\
2035       VI_VFP_LOOP_REDUCTION_BASE(64) \
2036         BODY64; \
2037         set_fp_exceptions; \
2038       VI_VFP_LOOP_REDUCTION_END(e64) \
2039       break; \
2040     }\
2041     default: \
2042       require(0); \
2043       break; \
2044   }; \
2045 
2046 #define VI_VFP_VV_LOOP_WIDE_REDUCTION(BODY16, BODY32) \
2047   VI_CHECK_REDUCTION(true) \
2048   VI_VFP_COMMON \
2049   require((P.VU.vsew == e16 && p->extension_enabled('F')) || \
2050           (P.VU.vsew == e32 && p->extension_enabled('D'))); \
2051   bool is_active = false; \
2052   switch(P.VU.vsew) { \
2053     case e16: {\
2054       float32_t vd_0 = P.VU.elt<float32_t>(rs1_num, 0); \
2055       for (reg_t i=P.VU.vstart->read(); i<vl; ++i) { \
2056         VI_LOOP_ELEMENT_SKIP(); \
2057         is_active = true; \
2058         float32_t vs2 = f16_to_f32(P.VU.elt<float16_t>(rs2_num, i)); \
2059         BODY16; \
2060         set_fp_exceptions; \
2061       VI_VFP_LOOP_REDUCTION_END(e32) \
2062       break; \
2063     }\
2064     case e32: {\
2065       float64_t vd_0 = P.VU.elt<float64_t>(rs1_num, 0); \
2066       for (reg_t i=P.VU.vstart->read(); i<vl; ++i) { \
2067         VI_LOOP_ELEMENT_SKIP(); \
2068         is_active = true; \
2069         float64_t vs2 = f32_to_f64(P.VU.elt<float32_t>(rs2_num, i)); \
2070         BODY32; \
2071         set_fp_exceptions; \
2072       VI_VFP_LOOP_REDUCTION_END(e64) \
2073       break; \
2074     }\
2075     default: \
2076       require(0); \
2077       break; \
2078   }; \
2079 
2080 #define VI_VFP_VF_LOOP(BODY16, BODY32, BODY64) \
2081   VI_CHECK_SSS(false); \
2082   VI_VFP_LOOP_BASE \
2083   switch(P.VU.vsew) { \
2084     case e16: {\
2085       float16_t &vd = P.VU.elt<float16_t>(rd_num, i, true); \
2086       float16_t rs1 = f16(READ_FREG(rs1_num)); \
2087       float16_t vs2 = P.VU.elt<float16_t>(rs2_num, i); \
2088       BODY16; \
2089       set_fp_exceptions; \
2090       break; \
2091     }\
2092     case e32: {\
2093       float32_t &vd = P.VU.elt<float32_t>(rd_num, i, true); \
2094       float32_t rs1 = f32(READ_FREG(rs1_num)); \
2095       float32_t vs2 = P.VU.elt<float32_t>(rs2_num, i); \
2096       BODY32; \
2097       set_fp_exceptions; \
2098       break; \
2099     }\
2100     case e64: {\
2101       float64_t &vd = P.VU.elt<float64_t>(rd_num, i, true); \
2102       float64_t rs1 = f64(READ_FREG(rs1_num)); \
2103       float64_t vs2 = P.VU.elt<float64_t>(rs2_num, i); \
2104       BODY64; \
2105       set_fp_exceptions; \
2106       break; \
2107     }\
2108     default: \
2109       require(0); \
2110       break; \
2111   }; \
2112   DEBUG_RVV_FP_VF; \
2113   VI_VFP_LOOP_END
2114 
2115 #define VI_VFP_LOOP_CMP(BODY16, BODY32, BODY64, is_vs1) \
2116   VI_CHECK_MSS(is_vs1); \
2117   VI_VFP_LOOP_CMP_BASE \
2118   switch(P.VU.vsew) { \
2119     case e16: {\
2120       float16_t vs2 = P.VU.elt<float16_t>(rs2_num, i); \
2121       float16_t vs1 = P.VU.elt<float16_t>(rs1_num, i); \
2122       float16_t rs1 = f16(READ_FREG(rs1_num)); \
2123       BODY16; \
2124       set_fp_exceptions; \
2125       break; \
2126     }\
2127     case e32: {\
2128       float32_t vs2 = P.VU.elt<float32_t>(rs2_num, i); \
2129       float32_t vs1 = P.VU.elt<float32_t>(rs1_num, i); \
2130       float32_t rs1 = f32(READ_FREG(rs1_num)); \
2131       BODY32; \
2132       set_fp_exceptions; \
2133       break; \
2134     }\
2135     case e64: {\
2136       float64_t vs2 = P.VU.elt<float64_t>(rs2_num, i); \
2137       float64_t vs1 = P.VU.elt<float64_t>(rs1_num, i); \
2138       float64_t rs1 = f64(READ_FREG(rs1_num)); \
2139       BODY64; \
2140       set_fp_exceptions; \
2141       break; \
2142     }\
2143     default: \
2144       require(0); \
2145       break; \
2146   }; \
2147   VI_VFP_LOOP_CMP_END \
2148 
2149 #define VI_VFP_VF_LOOP_WIDE(BODY16, BODY32) \
2150   VI_CHECK_DSS(false); \
2151   VI_VFP_LOOP_BASE \
2152   switch(P.VU.vsew) { \
2153     case e16: { \
2154       float32_t &vd = P.VU.elt<float32_t>(rd_num, i, true); \
2155       float32_t vs2 = f16_to_f32(P.VU.elt<float16_t>(rs2_num, i)); \
2156       float32_t rs1 = f16_to_f32(f16(READ_FREG(rs1_num))); \
2157       BODY16; \
2158       set_fp_exceptions; \
2159       break; \
2160     } \
2161     case e32: {\
2162       float64_t &vd = P.VU.elt<float64_t>(rd_num, i, true); \
2163       float64_t vs2 = f32_to_f64(P.VU.elt<float32_t>(rs2_num, i)); \
2164       float64_t rs1 = f32_to_f64(f32(READ_FREG(rs1_num))); \
2165       BODY32; \
2166       set_fp_exceptions; \
2167       break; \
2168     }\
2169     default: \
2170       require(0); \
2171       break; \
2172   }; \
2173   DEBUG_RVV_FP_VV; \
2174   VI_VFP_LOOP_END
2175 
2176 
2177 #define VI_VFP_VV_LOOP_WIDE(BODY16, BODY32) \
2178   VI_CHECK_DSS(true); \
2179   VI_VFP_LOOP_BASE \
2180   switch(P.VU.vsew) { \
2181     case e16: {\
2182       float32_t &vd = P.VU.elt<float32_t>(rd_num, i, true); \
2183       float32_t vs2 = f16_to_f32(P.VU.elt<float16_t>(rs2_num, i)); \
2184       float32_t vs1 = f16_to_f32(P.VU.elt<float16_t>(rs1_num, i)); \
2185       BODY16; \
2186       set_fp_exceptions; \
2187       break; \
2188     }\
2189     case e32: {\
2190       float64_t &vd = P.VU.elt<float64_t>(rd_num, i, true); \
2191       float64_t vs2 = f32_to_f64(P.VU.elt<float32_t>(rs2_num, i)); \
2192       float64_t vs1 = f32_to_f64(P.VU.elt<float32_t>(rs1_num, i)); \
2193       BODY32; \
2194       set_fp_exceptions; \
2195       break; \
2196     }\
2197     default: \
2198       require(0); \
2199       break; \
2200   }; \
2201   DEBUG_RVV_FP_VV; \
2202   VI_VFP_LOOP_END
2203 
2204 #define VI_VFP_WF_LOOP_WIDE(BODY16, BODY32) \
2205   VI_CHECK_DDS(false); \
2206   VI_VFP_LOOP_BASE \
2207   switch(P.VU.vsew) { \
2208     case e16: {\
2209       float32_t &vd = P.VU.elt<float32_t>(rd_num, i, true); \
2210       float32_t vs2 = P.VU.elt<float32_t>(rs2_num, i); \
2211       float32_t rs1 = f16_to_f32(f16(READ_FREG(rs1_num))); \
2212       BODY16; \
2213       set_fp_exceptions; \
2214       break; \
2215     }\
2216     case e32: {\
2217       float64_t &vd = P.VU.elt<float64_t>(rd_num, i, true); \
2218       float64_t vs2 = P.VU.elt<float64_t>(rs2_num, i); \
2219       float64_t rs1 = f32_to_f64(f32(READ_FREG(rs1_num))); \
2220       BODY32; \
2221       set_fp_exceptions; \
2222       break; \
2223     }\
2224     default: \
2225       require(0); \
2226   }; \
2227   DEBUG_RVV_FP_VV; \
2228   VI_VFP_LOOP_END
2229 
2230 #define VI_VFP_WV_LOOP_WIDE(BODY16, BODY32) \
2231   VI_CHECK_DDS(true); \
2232   VI_VFP_LOOP_BASE \
2233   switch(P.VU.vsew) { \
2234     case e16: {\
2235       float32_t &vd = P.VU.elt<float32_t>(rd_num, i, true); \
2236       float32_t vs2 = P.VU.elt<float32_t>(rs2_num, i); \
2237       float32_t vs1 = f16_to_f32(P.VU.elt<float16_t>(rs1_num, i)); \
2238       BODY16; \
2239       set_fp_exceptions; \
2240       break; \
2241     }\
2242     case e32: {\
2243       float64_t &vd = P.VU.elt<float64_t>(rd_num, i, true); \
2244       float64_t vs2 = P.VU.elt<float64_t>(rs2_num, i); \
2245       float64_t vs1 = f32_to_f64(P.VU.elt<float32_t>(rs1_num, i)); \
2246       BODY32; \
2247       set_fp_exceptions; \
2248       break; \
2249     }\
2250     default: \
2251       require(0); \
2252   }; \
2253   DEBUG_RVV_FP_VV; \
2254   VI_VFP_LOOP_END
2255 
2256 #define VI_VFP_LOOP_SCALE_BASE \
2257   require_fp; \
2258   require_vector(true);\
2259   require((P.VU.vsew == e8 && p->extension_enabled(EXT_ZFH)) || \
2260           (P.VU.vsew == e16 && p->extension_enabled('F')) || \
2261           (P.VU.vsew == e32 && p->extension_enabled('D'))); \
2262   require(STATE.frm->read() < 0x5);\
2263   reg_t vl = P.VU.vl->read(); \
2264   reg_t rd_num = insn.rd(); \
2265   reg_t rs1_num = insn.rs1(); \
2266   reg_t rs2_num = insn.rs2(); \
2267   softfloat_roundingMode = STATE.frm->read(); \
2268   for (reg_t i=P.VU.vstart->read(); i<vl; ++i){ \
2269     VI_LOOP_ELEMENT_SKIP();
2270 
2271 #define VI_VFP_CVT_SCALE(BODY8, BODY16, BODY32, \
2272                          CHECK8, CHECK16, CHECK32, \
2273                          is_widen, eew_check) \
2274   if (is_widen) { \
2275     VI_CHECK_DSS(false);\
2276   } else { \
2277     VI_CHECK_SDS(false); \
2278   } \
2279   require(eew_check); \
2280   switch(P.VU.vsew) { \
2281     case e8: {\
2282       CHECK8 \
2283       VI_VFP_LOOP_SCALE_BASE \
2284         BODY8 \
2285         set_fp_exceptions; \
2286       VI_VFP_LOOP_END \
2287       } \
2288       break; \
2289     case e16: {\
2290       CHECK16 \
2291       VI_VFP_LOOP_SCALE_BASE \
2292         BODY16 \
2293         set_fp_exceptions; \
2294       VI_VFP_LOOP_END \
2295       } \
2296       break; \
2297     case e32: {\
2298       CHECK32 \
2299       VI_VFP_LOOP_SCALE_BASE \
2300         BODY32 \
2301         set_fp_exceptions; \
2302       VI_VFP_LOOP_END \
2303       } \
2304       break; \
2305     default: \
2306       require(0); \
2307       break; \
2308   }
2309 
2310 // The p-extension support is contributed by
2311 // Programming Langauge Lab, Department of Computer Science, National Tsing-Hua University, Taiwan
2312 
2313 #define P_FIELD(R, INDEX, SIZE) \
2314   (type_sew_t<SIZE>::type)get_field(R, make_mask64(((INDEX) * SIZE), SIZE))
2315 
2316 #define P_UFIELD(R, INDEX, SIZE) \
2317   (type_usew_t<SIZE>::type)get_field(R, make_mask64(((INDEX) * SIZE), SIZE))
2318 
2319 #define P_B(R, INDEX) P_UFIELD(R, INDEX, 8)
2320 #define P_H(R, INDEX) P_UFIELD(R, INDEX, 16)
2321 #define P_W(R, INDEX) P_UFIELD(R, INDEX, 32)
2322 #define P_SB(R, INDEX) P_FIELD(R, INDEX, 8)
2323 #define P_SH(R, INDEX) P_FIELD(R, INDEX, 16)
2324 #define P_SW(R, INDEX) P_FIELD(R, INDEX, 32)
2325 
2326 #define READ_REG_PAIR(reg) ({ \
2327   require((reg) % 2 == 0); \
2328   (reg) == 0 ? reg_t(0) : \
2329   (READ_REG((reg) + 1) << 32) + zext32(READ_REG(reg)); })
2330 
2331 #define RS1_PAIR READ_REG_PAIR(insn.rs1())
2332 #define RS2_PAIR READ_REG_PAIR(insn.rs2())
2333 #define RD_PAIR READ_REG_PAIR(insn.rd())
2334 
2335 #define WRITE_PD() \
2336   rd_tmp = set_field(rd_tmp, make_mask64((i * sizeof(pd) * 8), sizeof(pd) * 8), pd);
2337 
2338 #define WRITE_RD_PAIR(value) \
2339   if (insn.rd() != 0) { \
2340     require(insn.rd() % 2 == 0); \
2341     WRITE_REG(insn.rd(), sext32(value)); \
2342     WRITE_REG(insn.rd() + 1, (sreg_t(value)) >> 32); \
2343   }
2344 
2345 #define P_SET_OV(ov) \
2346   if (ov) P.VU.vxsat->write(1);
2347 
2348 #define P_SAT(R, BIT) \
2349   if (R > INT##BIT##_MAX) { \
2350     R = INT##BIT##_MAX; \
2351     P_SET_OV(1); \
2352   } else if (R < INT##BIT##_MIN) { \
2353     R = INT##BIT##_MIN; \
2354     P_SET_OV(1); \
2355   }
2356 
2357 #define P_SATU(R, BIT) \
2358   if (R > UINT##BIT##_MAX) { \
2359     R = UINT##BIT##_MAX; \
2360     P_SET_OV(1); \
2361   } else if (R < 0) { \
2362     P_SET_OV(1); \
2363     R = 0; \
2364   }
2365 
2366 #define P_LOOP_BASE(BIT) \
2367   require_extension(EXT_ZPN); \
2368   require(BIT == e8 || BIT == e16 || BIT == e32); \
2369   reg_t rd_tmp = RD; \
2370   reg_t rs1 = RS1; \
2371   reg_t rs2 = RS2; \
2372   sreg_t len = xlen / BIT; \
2373   for (sreg_t i = len - 1; i >= 0; --i) {
2374 
2375 #define P_ONE_LOOP_BASE(BIT) \
2376   require_extension(EXT_ZPN); \
2377   require(BIT == e8 || BIT == e16 || BIT == e32); \
2378   reg_t rd_tmp = RD; \
2379   reg_t rs1 = RS1; \
2380   sreg_t len = xlen / BIT; \
2381   for (sreg_t i = len - 1; i >= 0; --i) {
2382 
2383 #define P_I_LOOP_BASE(BIT, IMMBIT) \
2384   require_extension(EXT_ZPN); \
2385   require(BIT == e8 || BIT == e16 || BIT == e32); \
2386   reg_t rd_tmp = RD; \
2387   reg_t rs1 = RS1; \
2388   type_usew_t<BIT>::type imm##IMMBIT##u = insn.p_imm##IMMBIT(); \
2389   sreg_t len = xlen / BIT; \
2390   for (sreg_t i = len - 1; i >= 0; --i) {
2391 
2392 #define P_X_LOOP_BASE(BIT, LOWBIT) \
2393   require_extension(EXT_ZPN); \
2394   require(BIT == e8 || BIT == e16 || BIT == e32); \
2395   reg_t rd_tmp = RD; \
2396   reg_t rs1 = RS1; \
2397   type_usew_t<BIT>::type sa = RS2 & ((uint64_t(1) << LOWBIT) - 1); \
2398   type_sew_t<BIT>::type ssa = int64_t(RS2) << (64 - LOWBIT) >> (64 - LOWBIT); \
2399   sreg_t len = xlen / BIT; \
2400   for (sreg_t i = len - 1; i >= 0; --i) {
2401 
2402 #define P_MUL_LOOP_BASE(BIT) \
2403   require_extension(EXT_ZPN); \
2404   require(BIT == e8 || BIT == e16 || BIT == e32); \
2405   reg_t rd_tmp = RD; \
2406   reg_t rs1 = RS1; \
2407   reg_t rs2 = RS2; \
2408   sreg_t len = 32 / BIT; \
2409   for (sreg_t i = len - 1; i >= 0; --i) {
2410 
2411 #define P_REDUCTION_LOOP_BASE(BIT, BIT_INNER, USE_RD) \
2412   require_extension(EXT_ZPN); \
2413   require(BIT == e16 || BIT == e32 || BIT == e64); \
2414   reg_t rd_tmp = USE_RD ? zext_xlen(RD) : 0; \
2415   reg_t rs1 = zext_xlen(RS1); \
2416   reg_t rs2 = zext_xlen(RS2); \
2417   sreg_t len = 64 / BIT; \
2418   sreg_t len_inner = BIT / BIT_INNER; \
2419   for (sreg_t i = len - 1; i >= 0; --i) { \
2420     sreg_t pd_res = P_FIELD(rd_tmp, i, BIT); \
2421     for (sreg_t j = i * len_inner; j < (i + 1) * len_inner; ++j) {
2422 
2423 #define P_REDUCTION_ULOOP_BASE(BIT, BIT_INNER, USE_RD) \
2424   require_extension(EXT_ZPN); \
2425   require(BIT == e16 || BIT == e32 || BIT == e64); \
2426   reg_t rd_tmp = USE_RD ? zext_xlen(RD) : 0; \
2427   reg_t rs1 = zext_xlen(RS1); \
2428   reg_t rs2 = zext_xlen(RS2); \
2429   sreg_t len = 64 / BIT; \
2430   sreg_t len_inner = BIT / BIT_INNER; \
2431   for (sreg_t i = len - 1; i >=0; --i) { \
2432     reg_t pd_res = P_UFIELD(rd_tmp, i, BIT); \
2433     for (sreg_t j = i * len_inner; j < (i + 1) * len_inner; ++j) {
2434 
2435 #define P_PARAMS(BIT) \
2436   auto pd = P_FIELD(rd_tmp, i, BIT); \
2437   auto ps1 = P_FIELD(rs1, i, BIT); \
2438   auto ps2 = P_FIELD(rs2, i, BIT);
2439 
2440 #define P_UPARAMS(BIT) \
2441   auto pd = P_UFIELD(rd_tmp, i, BIT); \
2442   auto ps1 = P_UFIELD(rs1, i, BIT); \
2443   auto ps2 = P_UFIELD(rs2, i, BIT);
2444 
2445 #define P_CORSS_PARAMS(BIT) \
2446   auto pd = P_FIELD(rd_tmp, i, BIT); \
2447   auto ps1 = P_FIELD(rs1, i, BIT); \
2448   auto ps2 = P_FIELD(rs2, (i ^ 1), BIT);
2449 
2450 #define P_CORSS_UPARAMS(BIT) \
2451   auto pd = P_UFIELD(rd_tmp, i, BIT); \
2452   auto ps1 = P_UFIELD(rs1, i, BIT); \
2453   auto ps2 = P_UFIELD(rs2, (i ^ 1), BIT);
2454 
2455 #define P_ONE_PARAMS(BIT) \
2456   auto pd = P_FIELD(rd_tmp, i, BIT); \
2457   auto ps1 = P_FIELD(rs1, i, BIT);
2458 
2459 #define P_ONE_UPARAMS(BIT) \
2460   auto pd = P_UFIELD(rd_tmp, i, BIT); \
2461   auto ps1 = P_UFIELD(rs1, i, BIT);
2462 
2463 #define P_ONE_SUPARAMS(BIT) \
2464   auto pd = P_UFIELD(rd_tmp, i, BIT); \
2465   auto ps1 = P_FIELD(rs1, i, BIT);
2466 
2467 #define P_MUL_PARAMS(BIT) \
2468   auto pd = P_FIELD(rd_tmp, i, BIT * 2); \
2469   auto ps1 = P_FIELD(rs1, i, BIT); \
2470   auto ps2 = P_FIELD(rs2, i, BIT);
2471 
2472 #define P_MUL_UPARAMS(BIT) \
2473   auto pd = P_UFIELD(rd_tmp, i, BIT * 2); \
2474   auto ps1 = P_UFIELD(rs1, i, BIT); \
2475   auto ps2 = P_UFIELD(rs2, i, BIT);
2476 
2477 #define P_MUL_CROSS_PARAMS(BIT) \
2478   auto pd = P_FIELD(rd_tmp, i, BIT * 2); \
2479   auto ps1 = P_FIELD(rs1, i, BIT); \
2480   auto ps2 = P_FIELD(rs2, (i ^ 1), BIT);
2481 
2482 #define P_MUL_CROSS_UPARAMS(BIT) \
2483   auto pd = P_UFIELD(rd_tmp, i, BIT*2); \
2484   auto ps1 = P_UFIELD(rs1, i, BIT); \
2485   auto ps2 = P_UFIELD(rs2, (i ^ 1), BIT);
2486 
2487 #define P_REDUCTION_PARAMS(BIT_INNER) \
2488   auto ps1 = P_FIELD(rs1, j, BIT_INNER); \
2489   auto ps2 = P_FIELD(rs2, j, BIT_INNER);
2490 
2491 #define P_REDUCTION_UPARAMS(BIT_INNER) \
2492   auto ps1 = P_UFIELD(rs1, j, BIT_INNER); \
2493   auto ps2 = P_UFIELD(rs2, j, BIT_INNER);
2494 
2495 #define P_REDUCTION_SUPARAMS(BIT_INNER) \
2496   auto ps1 = P_FIELD(rs1, j, BIT_INNER); \
2497   auto ps2 = P_UFIELD(rs2, j, BIT_INNER);
2498 
2499 #define P_REDUCTION_CROSS_PARAMS(BIT_INNER) \
2500   auto ps1 = P_FIELD(rs1, j, BIT_INNER); \
2501   auto ps2 = P_FIELD(rs2, (j ^ 1), BIT_INNER);
2502 
2503 #define P_LOOP_BODY(BIT, BODY) { \
2504   P_PARAMS(BIT) \
2505   BODY \
2506   WRITE_PD(); \
2507 }
2508 
2509 #define P_ULOOP_BODY(BIT, BODY) { \
2510   P_UPARAMS(BIT) \
2511   BODY \
2512   WRITE_PD(); \
2513 }
2514 
2515 #define P_ONE_LOOP_BODY(BIT, BODY) { \
2516   P_ONE_PARAMS(BIT) \
2517   BODY \
2518   WRITE_PD(); \
2519 }
2520 
2521 #define P_CROSS_LOOP_BODY(BIT, BODY) { \
2522   P_CORSS_PARAMS(BIT) \
2523   BODY \
2524   WRITE_PD(); \
2525 }
2526 
2527 #define P_CROSS_ULOOP_BODY(BIT, BODY) { \
2528   P_CORSS_UPARAMS(BIT) \
2529   BODY \
2530   WRITE_PD(); \
2531 }
2532 
2533 #define P_ONE_ULOOP_BODY(BIT, BODY) { \
2534   P_ONE_UPARAMS(BIT) \
2535   BODY \
2536   WRITE_PD(); \
2537 }
2538 
2539 #define P_MUL_LOOP_BODY(BIT, BODY) { \
2540   P_MUL_PARAMS(BIT) \
2541   BODY \
2542   WRITE_PD(); \
2543 }
2544 
2545 #define P_MUL_ULOOP_BODY(BIT, BODY) { \
2546   P_MUL_UPARAMS(BIT) \
2547   BODY \
2548   WRITE_PD(); \
2549 }
2550 
2551 #define P_MUL_CROSS_LOOP_BODY(BIT, BODY) { \
2552   P_MUL_CROSS_PARAMS(BIT) \
2553   BODY \
2554   WRITE_PD(); \
2555 }
2556 
2557 #define P_MUL_CROSS_ULOOP_BODY(BIT, BODY) { \
2558   P_MUL_CROSS_UPARAMS(BIT) \
2559   BODY \
2560   WRITE_PD(); \
2561 }
2562 
2563 #define P_LOOP(BIT, BODY) \
2564   P_LOOP_BASE(BIT) \
2565   P_LOOP_BODY(BIT, BODY) \
2566   P_LOOP_END()
2567 
2568 #define P_ONE_LOOP(BIT, BODY) \
2569   P_ONE_LOOP_BASE(BIT) \
2570   P_ONE_LOOP_BODY(BIT, BODY) \
2571   P_LOOP_END()
2572 
2573 #define P_ULOOP(BIT, BODY) \
2574   P_LOOP_BASE(BIT) \
2575   P_ULOOP_BODY(BIT, BODY) \
2576   P_LOOP_END()
2577 
2578 #define P_CROSS_LOOP(BIT, BODY1, BODY2) \
2579   P_LOOP_BASE(BIT) \
2580   P_CROSS_LOOP_BODY(BIT, BODY1) \
2581   --i; \
2582   if (sizeof(#BODY2) == 1) { \
2583     P_CROSS_LOOP_BODY(BIT, BODY1) \
2584   } \
2585   else { \
2586     P_CROSS_LOOP_BODY(BIT, BODY2) \
2587   } \
2588   P_LOOP_END()
2589 
2590 #define P_CROSS_ULOOP(BIT, BODY1, BODY2) \
2591   P_LOOP_BASE(BIT) \
2592   P_CROSS_ULOOP_BODY(BIT, BODY1) \
2593   --i; \
2594   P_CROSS_ULOOP_BODY(BIT, BODY2) \
2595   P_LOOP_END()
2596 
2597 #define P_STRAIGHT_LOOP(BIT, BODY1, BODY2) \
2598   P_LOOP_BASE(BIT) \
2599   P_LOOP_BODY(BIT, BODY1) \
2600   --i; \
2601   P_LOOP_BODY(BIT, BODY2) \
2602   P_LOOP_END()
2603 
2604 #define P_STRAIGHT_ULOOP(BIT, BODY1, BODY2) \
2605   P_LOOP_BASE(BIT) \
2606   P_ULOOP_BODY(BIT, BODY1) \
2607   --i; \
2608   P_ULOOP_BODY(BIT, BODY2) \
2609   P_LOOP_END()
2610 
2611 #define P_X_LOOP(BIT, RS2_LOW_BIT, BODY) \
2612   P_X_LOOP_BASE(BIT, RS2_LOW_BIT) \
2613   P_ONE_LOOP_BODY(BIT, BODY) \
2614   P_LOOP_END()
2615 
2616 #define P_X_ULOOP(BIT, RS2_LOW_BIT, BODY) \
2617   P_X_LOOP_BASE(BIT, RS2_LOW_BIT) \
2618   P_ONE_ULOOP_BODY(BIT, BODY) \
2619   P_LOOP_END()
2620 
2621 #define P_I_LOOP(BIT, IMMBIT, BODY) \
2622   P_I_LOOP_BASE(BIT, IMMBIT) \
2623   P_ONE_LOOP_BODY(BIT, BODY) \
2624   P_LOOP_END()
2625 
2626 #define P_I_ULOOP(BIT, IMMBIT, BODY) \
2627   P_I_LOOP_BASE(BIT, IMMBIT) \
2628   P_ONE_ULOOP_BODY(BIT, BODY) \
2629   P_LOOP_END()
2630 
2631 #define P_MUL_LOOP(BIT, BODY) \
2632   P_MUL_LOOP_BASE(BIT) \
2633   P_MUL_LOOP_BODY(BIT, BODY) \
2634   P_PAIR_LOOP_END()
2635 
2636 #define P_MUL_ULOOP(BIT, BODY) \
2637   P_MUL_LOOP_BASE(BIT) \
2638   P_MUL_ULOOP_BODY(BIT, BODY) \
2639   P_PAIR_LOOP_END()
2640 
2641 #define P_MUL_CROSS_LOOP(BIT, BODY) \
2642   P_MUL_LOOP_BASE(BIT) \
2643   P_MUL_CROSS_LOOP_BODY(BIT, BODY) \
2644   P_PAIR_LOOP_END()
2645 
2646 #define P_MUL_CROSS_ULOOP(BIT, BODY) \
2647   P_MUL_LOOP_BASE(BIT) \
2648   P_MUL_CROSS_ULOOP_BODY(BIT, BODY) \
2649   P_PAIR_LOOP_END()
2650 
2651 #define P_REDUCTION_LOOP(BIT, BIT_INNER, USE_RD, IS_SAT, BODY) \
2652   P_REDUCTION_LOOP_BASE(BIT, BIT_INNER, USE_RD) \
2653   P_REDUCTION_PARAMS(BIT_INNER) \
2654   BODY \
2655   P_REDUCTION_LOOP_END(BIT, IS_SAT)
2656 
2657 #define P_REDUCTION_ULOOP(BIT, BIT_INNER, USE_RD, IS_SAT, BODY) \
2658   P_REDUCTION_ULOOP_BASE(BIT, BIT_INNER, USE_RD) \
2659   P_REDUCTION_UPARAMS(BIT_INNER) \
2660   BODY \
2661   P_REDUCTION_ULOOP_END(BIT, IS_SAT)
2662 
2663 #define P_REDUCTION_SULOOP(BIT, BIT_INNER, USE_RD, IS_SAT, BODY) \
2664   P_REDUCTION_LOOP_BASE(BIT, BIT_INNER, USE_RD) \
2665   P_REDUCTION_SUPARAMS(BIT_INNER) \
2666   BODY \
2667   P_REDUCTION_LOOP_END(BIT, IS_SAT)
2668 
2669 #define P_REDUCTION_CROSS_LOOP(BIT, BIT_INNER, USE_RD, IS_SAT, BODY) \
2670   P_REDUCTION_LOOP_BASE(BIT, BIT_INNER, USE_RD) \
2671   P_REDUCTION_CROSS_PARAMS(BIT_INNER) \
2672   BODY \
2673   P_REDUCTION_LOOP_END(BIT, IS_SAT)
2674 
2675 #define P_LOOP_END() \
2676   } \
2677   WRITE_RD(sext_xlen(rd_tmp));
2678 
2679 #define P_PAIR_LOOP_END() \
2680   } \
2681   if (xlen == 32) { \
2682     WRITE_RD_PAIR(rd_tmp); \
2683   } \
2684   else { \
2685     WRITE_RD(sext_xlen(rd_tmp)); \
2686   }
2687 
2688 #define P_REDUCTION_LOOP_END(BIT, IS_SAT) \
2689     } \
2690     if (IS_SAT) { \
2691       P_SAT(pd_res, BIT); \
2692     } \
2693     type_usew_t<BIT>::type pd = pd_res; \
2694     WRITE_PD(); \
2695   } \
2696   WRITE_RD(sext_xlen(rd_tmp));
2697 
2698 #define P_REDUCTION_ULOOP_END(BIT, IS_SAT) \
2699     } \
2700     if (IS_SAT) { \
2701       P_SATU(pd_res, BIT); \
2702     } \
2703     type_usew_t<BIT>::type pd = pd_res; \
2704     WRITE_PD(); \
2705   } \
2706   WRITE_RD(sext_xlen(rd_tmp));
2707 
2708 #define P_SUNPKD8(X, Y) \
2709   require_extension(EXT_ZPN); \
2710   reg_t rd_tmp = 0; \
2711   int16_t pd[4] = { \
2712     P_SB(RS1, Y), \
2713     P_SB(RS1, X), \
2714     P_SB(RS1, Y + 4), \
2715     P_SB(RS1, X + 4), \
2716   }; \
2717   if (xlen == 64) { \
2718     memcpy(&rd_tmp, pd, 8); \
2719   } else { \
2720     memcpy(&rd_tmp, pd, 4); \
2721   } \
2722   WRITE_RD(sext_xlen(rd_tmp));
2723 
2724 #define P_ZUNPKD8(X, Y) \
2725   require_extension(EXT_ZPN); \
2726   reg_t rd_tmp = 0; \
2727   uint16_t pd[4] = { \
2728     P_B(RS1, Y), \
2729     P_B(RS1, X), \
2730     P_B(RS1, Y + 4), \
2731     P_B(RS1, X + 4), \
2732   }; \
2733   if (xlen == 64) { \
2734     memcpy(&rd_tmp, pd, 8); \
2735   } else { \
2736     memcpy(&rd_tmp, pd, 4); \
2737   } \
2738   WRITE_RD(sext_xlen(rd_tmp));
2739 
2740 #define P_PK(BIT, X, Y) \
2741   require_extension(EXT_ZPN); \
2742   require(BIT == e16 || BIT == e32); \
2743   reg_t rd_tmp = 0, rs1 = RS1, rs2 = RS2; \
2744   for (sreg_t i = 0; i < xlen / BIT / 2; i++) { \
2745     rd_tmp = set_field(rd_tmp, make_mask64(i * 2 * BIT, BIT), \
2746       P_UFIELD(RS2, i * 2 + Y, BIT)); \
2747     rd_tmp = set_field(rd_tmp, make_mask64((i * 2 + 1) * BIT, BIT), \
2748       P_UFIELD(RS1, i * 2 + X, BIT)); \
2749   } \
2750   WRITE_RD(sext_xlen(rd_tmp));
2751 
2752 #define P_64_PROFILE_BASE() \
2753   require_extension(EXT_ZPSFOPERAND); \
2754   sreg_t rd, rs1, rs2;
2755 
2756 #define P_64_UPROFILE_BASE() \
2757   require_extension(EXT_ZPSFOPERAND); \
2758   reg_t rd, rs1, rs2;
2759 
2760 #define P_64_PROFILE_PARAM(USE_RD, INPUT_PAIR) \
2761   if (xlen == 32) { \
2762     rs1 = INPUT_PAIR ? RS1_PAIR : RS1; \
2763     rs2 = INPUT_PAIR ? RS2_PAIR : RS2; \
2764     rd = USE_RD ? RD_PAIR : 0; \
2765   } else { \
2766     rs1 = RS1; \
2767     rs2 = RS2; \
2768     rd = USE_RD ? RD : 0; \
2769   }
2770 
2771 #define P_64_PROFILE(BODY) \
2772   P_64_PROFILE_BASE() \
2773   P_64_PROFILE_PARAM(false, true) \
2774   BODY \
2775   P_64_PROFILE_END() \
2776 
2777 #define P_64_UPROFILE(BODY) \
2778   P_64_UPROFILE_BASE() \
2779   P_64_PROFILE_PARAM(false, true) \
2780   BODY \
2781   P_64_PROFILE_END() \
2782 
2783 #define P_64_PROFILE_REDUCTION(BIT, BODY) \
2784   P_64_PROFILE_BASE() \
2785   P_64_PROFILE_PARAM(true, false) \
2786   for (sreg_t i = 0; i < xlen / BIT; i++) { \
2787     sreg_t ps1 = P_FIELD(rs1, i, BIT); \
2788     sreg_t ps2 = P_FIELD(rs2, i, BIT); \
2789     BODY \
2790   } \
2791   P_64_PROFILE_END() \
2792 
2793 #define P_64_UPROFILE_REDUCTION(BIT, BODY) \
2794   P_64_UPROFILE_BASE() \
2795   P_64_PROFILE_PARAM(true, false) \
2796   for (sreg_t i = 0; i < xlen / BIT; i++) { \
2797     reg_t ps1 = P_UFIELD(rs1, i, BIT); \
2798     reg_t ps2 = P_UFIELD(rs2, i, BIT); \
2799     BODY \
2800   } \
2801   P_64_PROFILE_END() \
2802 
2803 #define P_64_PROFILE_END() \
2804   if (xlen == 32) { \
2805     WRITE_RD_PAIR(rd); \
2806   } else { \
2807     WRITE_RD(sext_xlen(rd)); \
2808   }
2809 
2810 #define DEBUG_START             0x0
2811 #define DEBUG_END               (0x1000 - 1)
2812 
2813 #endif
2814