1 // See LICENSE for license details.
2
3 #ifndef _RISCV_DECODE_H
4 #define _RISCV_DECODE_H
5
6 #if (-1 != ~0) || ((-1 >> 1) != -1)
7 # error spike requires a two''s-complement c++ implementation
8 #endif
9
10 #include <algorithm>
11 #include <cstdint>
12 #include <string.h>
13 #include <strings.h>
14 #include "encoding.h"
15 #include "config.h"
16 #include "common.h"
17 #include "softfloat_types.h"
18 #include "specialize.h"
19 #include <cinttypes>
20
21 typedef int64_t sreg_t;
22 typedef uint64_t reg_t;
23
24 #ifdef __SIZEOF_INT128__
25 typedef __int128 int128_t;
26 typedef unsigned __int128 uint128_t;
27 #endif
28
29 const int NXPR = 32;
30 const int NFPR = 32;
31 const int NVPR = 32;
32 const int NCSR = 4096;
33
34 #define X_RA 1
35 #define X_SP 2
36
37 #define VCSR_VXRM_SHIFT 1
38 #define VCSR_VXRM (0x3 << VCSR_VXRM_SHIFT)
39
40 #define VCSR_VXSAT_SHIFT 0
41 #define VCSR_VXSAT (0x1 << VCSR_VXSAT_SHIFT)
42
43 #define FP_RD_NE 0
44 #define FP_RD_0 1
45 #define FP_RD_DN 2
46 #define FP_RD_UP 3
47 #define FP_RD_NMM 4
48
49 #define FSR_RD_SHIFT 5
50 #define FSR_RD (0x7 << FSR_RD_SHIFT)
51
52 #define FPEXC_NX 0x01
53 #define FPEXC_UF 0x02
54 #define FPEXC_OF 0x04
55 #define FPEXC_DZ 0x08
56 #define FPEXC_NV 0x10
57
58 #define FSR_AEXC_SHIFT 0
59 #define FSR_NVA (FPEXC_NV << FSR_AEXC_SHIFT)
60 #define FSR_OFA (FPEXC_OF << FSR_AEXC_SHIFT)
61 #define FSR_UFA (FPEXC_UF << FSR_AEXC_SHIFT)
62 #define FSR_DZA (FPEXC_DZ << FSR_AEXC_SHIFT)
63 #define FSR_NXA (FPEXC_NX << FSR_AEXC_SHIFT)
64 #define FSR_AEXC (FSR_NVA | FSR_OFA | FSR_UFA | FSR_DZA | FSR_NXA)
65
66 #define insn_length(x) \
67 (((x) & 0x03) < 0x03 ? 2 : \
68 ((x) & 0x1f) < 0x1f ? 4 : \
69 ((x) & 0x3f) < 0x3f ? 6 : \
70 ((x) & 0x7f) == 0x7f ? 4 : \
71 8)
72 #define MAX_INSN_LENGTH 8
73 #define PC_ALIGN 2
74
75 typedef uint64_t insn_bits_t;
76 class insn_t
77 {
78 public:
79 insn_t() = default;
insn_t(insn_bits_t bits)80 insn_t(insn_bits_t bits) : b(bits) {}
bits()81 insn_bits_t bits() { return b & ~((UINT64_MAX) << (length() * 8)); }
length()82 int length() { return insn_length(b); }
i_imm()83 int64_t i_imm() { return int64_t(b) >> 20; }
shamt()84 int64_t shamt() { return x(20, 6); }
s_imm()85 int64_t s_imm() { return x(7, 5) + (xs(25, 7) << 5); }
sb_imm()86 int64_t sb_imm() { return (x(8, 4) << 1) + (x(25,6) << 5) + (x(7,1) << 11) + (imm_sign() << 12); }
u_imm()87 int64_t u_imm() { return int64_t(b) >> 12 << 12; }
uj_imm()88 int64_t uj_imm() { return (x(21, 10) << 1) + (x(20, 1) << 11) + (x(12, 8) << 12) + (imm_sign() << 20); }
rd()89 uint64_t rd() { return x(7, 5); }
rs1()90 uint64_t rs1() { return x(15, 5); }
rs2()91 uint64_t rs2() { return x(20, 5); }
rs3()92 uint64_t rs3() { return x(27, 5); }
rm()93 uint64_t rm() { return x(12, 3); }
csr()94 uint64_t csr() { return x(20, 12); }
iorw()95 uint64_t iorw() { return x(20, 8); }
bs()96 uint64_t bs () {return x(30,2);} // Crypto ISE - SM4/AES32 byte select.
rcon()97 uint64_t rcon() {return x(20,4);} // Crypto ISE - AES64 round const.
98
rvc_imm()99 int64_t rvc_imm() { return x(2, 5) + (xs(12, 1) << 5); }
rvc_zimm()100 int64_t rvc_zimm() { return x(2, 5) + (x(12, 1) << 5); }
rvc_addi4spn_imm()101 int64_t rvc_addi4spn_imm() { return (x(6, 1) << 2) + (x(5, 1) << 3) + (x(11, 2) << 4) + (x(7, 4) << 6); }
rvc_addi16sp_imm()102 int64_t rvc_addi16sp_imm() { return (x(6, 1) << 4) + (x(2, 1) << 5) + (x(5, 1) << 6) + (x(3, 2) << 7) + (xs(12, 1) << 9); }
rvc_lwsp_imm()103 int64_t rvc_lwsp_imm() { return (x(4, 3) << 2) + (x(12, 1) << 5) + (x(2, 2) << 6); }
rvc_ldsp_imm()104 int64_t rvc_ldsp_imm() { return (x(5, 2) << 3) + (x(12, 1) << 5) + (x(2, 3) << 6); }
rvc_swsp_imm()105 int64_t rvc_swsp_imm() { return (x(9, 4) << 2) + (x(7, 2) << 6); }
rvc_sdsp_imm()106 int64_t rvc_sdsp_imm() { return (x(10, 3) << 3) + (x(7, 3) << 6); }
rvc_lw_imm()107 int64_t rvc_lw_imm() { return (x(6, 1) << 2) + (x(10, 3) << 3) + (x(5, 1) << 6); }
rvc_ld_imm()108 int64_t rvc_ld_imm() { return (x(10, 3) << 3) + (x(5, 2) << 6); }
rvc_j_imm()109 int64_t rvc_j_imm() { return (x(3, 3) << 1) + (x(11, 1) << 4) + (x(2, 1) << 5) + (x(7, 1) << 6) + (x(6, 1) << 7) + (x(9, 2) << 8) + (x(8, 1) << 10) + (xs(12, 1) << 11); }
rvc_b_imm()110 int64_t rvc_b_imm() { return (x(3, 2) << 1) + (x(10, 2) << 3) + (x(2, 1) << 5) + (x(5, 2) << 6) + (xs(12, 1) << 8); }
rvc_simm3()111 int64_t rvc_simm3() { return x(10, 3); }
rvc_rd()112 uint64_t rvc_rd() { return rd(); }
rvc_rs1()113 uint64_t rvc_rs1() { return rd(); }
rvc_rs2()114 uint64_t rvc_rs2() { return x(2, 5); }
rvc_rs1s()115 uint64_t rvc_rs1s() { return 8 + x(7, 3); }
rvc_rs2s()116 uint64_t rvc_rs2s() { return 8 + x(2, 3); }
117
v_vm()118 uint64_t v_vm() { return x(25, 1); }
v_wd()119 uint64_t v_wd() { return x(26, 1); }
v_nf()120 uint64_t v_nf() { return x(29, 3); }
v_simm5()121 uint64_t v_simm5() { return xs(15, 5); }
v_zimm5()122 uint64_t v_zimm5() { return x(15, 5); }
v_zimm10()123 uint64_t v_zimm10() { return x(20, 10); }
v_zimm11()124 uint64_t v_zimm11() { return x(20, 11); }
v_lmul()125 uint64_t v_lmul() { return x(20, 2); }
v_frac_lmul()126 uint64_t v_frac_lmul() { return x(22, 1); }
v_sew()127 uint64_t v_sew() { return 1 << (x(23, 3) + 3); }
v_width()128 uint64_t v_width() { return x(12, 3); }
v_mop()129 uint64_t v_mop() { return x(26, 2); }
v_lumop()130 uint64_t v_lumop() { return x(20, 5); }
v_sumop()131 uint64_t v_sumop() { return x(20, 5); }
v_vta()132 uint64_t v_vta() { return x(26, 1); }
v_vma()133 uint64_t v_vma() { return x(27, 1); }
v_mew()134 uint64_t v_mew() { return x(28, 1); }
135
p_imm2()136 uint64_t p_imm2() { return x(20, 2); }
p_imm3()137 uint64_t p_imm3() { return x(20, 3); }
p_imm4()138 uint64_t p_imm4() { return x(20, 4); }
p_imm5()139 uint64_t p_imm5() { return x(20, 5); }
p_imm6()140 uint64_t p_imm6() { return x(20, 6); }
141
142 private:
143 insn_bits_t b;
x(int lo,int len)144 uint64_t x(int lo, int len) { return (b >> lo) & ((insn_bits_t(1) << len)-1); }
xs(int lo,int len)145 uint64_t xs(int lo, int len) { return int64_t(b) << (64-lo-len) >> (64-len); }
imm_sign()146 uint64_t imm_sign() { return xs(63, 1); }
147 };
148
149 template <class T, size_t N, bool zero_reg>
150 class regfile_t
151 {
152 public:
write(size_t i,T value)153 void write(size_t i, T value)
154 {
155 if (!zero_reg || i != 0)
156 data[i] = value;
157 }
158 const T& operator [] (size_t i) const
159 {
160 return data[i];
161 }
regfile_t()162 regfile_t()
163 {
164 reset();
165 }
reset()166 void reset()
167 {
168 memset(data, 0, sizeof(data));
169 }
170 private:
171 T data[N];
172 };
173
174 // helpful macros, etc
175 #define MMU (*p->get_mmu())
176 #define STATE (*p->get_state())
177 #define P (*p)
178 #define FLEN (p->get_flen())
179 #define READ_REG(reg) STATE.XPR[reg]
180 #define READ_FREG(reg) STATE.FPR[reg]
181 #define RD READ_REG(insn.rd())
182 #define RS1 READ_REG(insn.rs1())
183 #define RS2 READ_REG(insn.rs2())
184 #define RS3 READ_REG(insn.rs3())
185 #define WRITE_RD(value) WRITE_REG(insn.rd(), value)
186
187 #ifndef RISCV_ENABLE_COMMITLOG
188 # define WRITE_REG(reg, value) STATE.XPR.write(reg, value)
189 # define WRITE_FREG(reg, value) DO_WRITE_FREG(reg, freg(value))
190 # define WRITE_VSTATUS
191 #else
192 /* 0 : int
193 * 1 : floating
194 * 2 : vector reg
195 * 3 : vector hint
196 * 4 : csr
197 */
198 # define WRITE_REG(reg, value) ({ \
199 reg_t wdata = (value); /* value may have side effects */ \
200 STATE.log_reg_write[(reg) << 4] = {wdata, 0}; \
201 STATE.XPR.write(reg, wdata); \
202 })
203 # define WRITE_FREG(reg, value) ({ \
204 freg_t wdata = freg(value); /* value may have side effects */ \
205 STATE.log_reg_write[((reg) << 4) | 1] = wdata; \
206 DO_WRITE_FREG(reg, wdata); \
207 })
208 # define WRITE_VSTATUS STATE.log_reg_write[3] = {0, 0};
209 #endif
210
211 // RVC macros
212 #define WRITE_RVC_RS1S(value) WRITE_REG(insn.rvc_rs1s(), value)
213 #define WRITE_RVC_RS2S(value) WRITE_REG(insn.rvc_rs2s(), value)
214 #define WRITE_RVC_FRS2S(value) WRITE_FREG(insn.rvc_rs2s(), value)
215 #define RVC_RS1 READ_REG(insn.rvc_rs1())
216 #define RVC_RS2 READ_REG(insn.rvc_rs2())
217 #define RVC_RS1S READ_REG(insn.rvc_rs1s())
218 #define RVC_RS2S READ_REG(insn.rvc_rs2s())
219 #define RVC_FRS2 READ_FREG(insn.rvc_rs2())
220 #define RVC_FRS2S READ_FREG(insn.rvc_rs2s())
221 #define RVC_SP READ_REG(X_SP)
222
223 // FPU macros
224 #define FRS1 READ_FREG(insn.rs1())
225 #define FRS2 READ_FREG(insn.rs2())
226 #define FRS3 READ_FREG(insn.rs3())
227 #define dirty_fp_state STATE.sstatus->dirty(SSTATUS_FS)
228 #define dirty_ext_state STATE.sstatus->dirty(SSTATUS_XS)
229 #define dirty_vs_state STATE.sstatus->dirty(SSTATUS_VS)
230 #define DO_WRITE_FREG(reg, value) (STATE.FPR.write(reg, value), dirty_fp_state)
231 #define WRITE_FRD(value) WRITE_FREG(insn.rd(), value)
232
233 #define SHAMT (insn.i_imm() & 0x3F)
234 #define BRANCH_TARGET (pc + insn.sb_imm())
235 #define JUMP_TARGET (pc + insn.uj_imm())
236 #define RM ({ int rm = insn.rm(); \
237 if(rm == 7) rm = STATE.frm->read(); \
238 if(rm > 4) throw trap_illegal_instruction(insn.bits()); \
239 rm; })
240
241 #define get_field(reg, mask) (((reg) & (decltype(reg))(mask)) / ((mask) & ~((mask) << 1)))
242 #define set_field(reg, mask, val) (((reg) & ~(decltype(reg))(mask)) | (((decltype(reg))(val) * ((mask) & ~((mask) << 1))) & (decltype(reg))(mask)))
243
244 #define require(x) do { if (unlikely(!(x))) throw trap_illegal_instruction(insn.bits()); } while (0)
245 #define require_privilege(p) require(STATE.prv >= (p))
246 #define require_novirt() if (unlikely(STATE.v)) throw trap_virtual_instruction(insn.bits())
247 #define require_rv64 require(xlen == 64)
248 #define require_rv32 require(xlen == 32)
249 #define require_extension(s) require(p->extension_enabled(s))
250 #define require_either_extension(A,B) require(p->extension_enabled(A) || p->extension_enabled(B));
251 #define require_impl(s) require(p->supports_impl(s))
252 #define require_fp require(STATE.sstatus->enabled(SSTATUS_FS))
253 #define require_accelerator require(STATE.sstatus->enabled(SSTATUS_XS))
254 #define require_vector_vs require(STATE.sstatus->enabled(SSTATUS_VS))
255 #define require_vector(alu) \
256 do { \
257 require_vector_vs; \
258 require_extension('V'); \
259 require(!P.VU.vill); \
260 if (alu && !P.VU.vstart_alu) \
261 require(P.VU.vstart->read() == 0); \
262 WRITE_VSTATUS; \
263 dirty_vs_state; \
264 } while (0);
265 #define require_vector_novtype(is_log, alu) \
266 do { \
267 require_vector_vs; \
268 require_extension('V'); \
269 if (alu && !P.VU.vstart_alu) \
270 require(P.VU.vstart->read() == 0); \
271 if (is_log) \
272 WRITE_VSTATUS; \
273 dirty_vs_state; \
274 } while (0);
275 #define require_align(val, pos) require(is_aligned(val, pos))
276 #define require_noover(astart, asize, bstart, bsize) \
277 require(!is_overlapped(astart, asize, bstart, bsize))
278 #define require_noover_widen(astart, asize, bstart, bsize) \
279 require(!is_overlapped_widen(astart, asize, bstart, bsize))
280 #define require_vm do { if (insn.v_vm() == 0) require(insn.rd() != 0);} while(0);
281
282 #define set_fp_exceptions ({ if (softfloat_exceptionFlags) { \
283 STATE.fflags->write(STATE.fflags->read() | softfloat_exceptionFlags); \
284 } \
285 softfloat_exceptionFlags = 0; })
286
287 #define sext32(x) ((sreg_t)(int32_t)(x))
288 #define zext32(x) ((reg_t)(uint32_t)(x))
289 #define sext_xlen(x) (((sreg_t)(x) << (64-xlen)) >> (64-xlen))
290 #define zext(x, pos) (((reg_t)(x) << (64-(pos))) >> (64-(pos)))
291 #define zext_xlen(x) zext(x, xlen)
292
293 #define set_pc(x) \
294 do { p->check_pc_alignment(x); \
295 npc = sext_xlen(x); \
296 } while(0)
297
298 #define set_pc_and_serialize(x) \
299 do { reg_t __npc = (x) & p->pc_alignment_mask(); \
300 npc = PC_SERIALIZE_AFTER; \
301 STATE.pc = __npc; \
302 } while(0)
303
304 class wait_for_interrupt_t {};
305
306 #define wfi() \
307 do { set_pc_and_serialize(npc); \
308 npc = PC_SERIALIZE_WFI; \
309 throw wait_for_interrupt_t(); \
310 } while(0)
311
312 #define serialize() set_pc_and_serialize(npc)
313
314 /* Sentinel PC values to serialize simulator pipeline */
315 #define PC_SERIALIZE_BEFORE 3
316 #define PC_SERIALIZE_AFTER 5
317 #define PC_SERIALIZE_WFI 7
318 #define invalid_pc(pc) ((pc) & 1)
319
320 /* Convenience wrappers to simplify softfloat code sequences */
321 #define isBoxedF16(r) (isBoxedF32(r) && ((uint64_t)((r.v[0] >> 16) + 1) == ((uint64_t)1 << 48)))
322 #define unboxF16(r) (isBoxedF16(r) ? (uint16_t)r.v[0] : defaultNaNF16UI)
323 #define isBoxedF32(r) (isBoxedF64(r) && ((uint32_t)((r.v[0] >> 32) + 1) == 0))
324 #define unboxF32(r) (isBoxedF32(r) ? (uint32_t)r.v[0] : defaultNaNF32UI)
325 #define isBoxedF64(r) ((r.v[1] + 1) == 0)
326 #define unboxF64(r) (isBoxedF64(r) ? r.v[0] : defaultNaNF64UI)
327 typedef float128_t freg_t;
f16(uint16_t v)328 inline float16_t f16(uint16_t v) { return { v }; }
f32(uint32_t v)329 inline float32_t f32(uint32_t v) { return { v }; }
f64(uint64_t v)330 inline float64_t f64(uint64_t v) { return { v }; }
f16(freg_t r)331 inline float16_t f16(freg_t r) { return f16(unboxF16(r)); }
f32(freg_t r)332 inline float32_t f32(freg_t r) { return f32(unboxF32(r)); }
f64(freg_t r)333 inline float64_t f64(freg_t r) { return f64(unboxF64(r)); }
f128(freg_t r)334 inline float128_t f128(freg_t r) { return r; }
freg(float16_t f)335 inline freg_t freg(float16_t f) { return { ((uint64_t)-1 << 16) | f.v, (uint64_t)-1 }; }
freg(float32_t f)336 inline freg_t freg(float32_t f) { return { ((uint64_t)-1 << 32) | f.v, (uint64_t)-1 }; }
freg(float64_t f)337 inline freg_t freg(float64_t f) { return { f.v, (uint64_t)-1 }; }
freg(float128_t f)338 inline freg_t freg(float128_t f) { return f; }
339 #define F16_SIGN ((uint16_t)1 << 15)
340 #define F32_SIGN ((uint32_t)1 << 31)
341 #define F64_SIGN ((uint64_t)1 << 63)
342 #define fsgnj16(a, b, n, x) \
343 f16((f16(a).v & ~F16_SIGN) | ((((x) ? f16(a).v : (n) ? F16_SIGN : 0) ^ f16(b).v) & F16_SIGN))
344 #define fsgnj32(a, b, n, x) \
345 f32((f32(a).v & ~F32_SIGN) | ((((x) ? f32(a).v : (n) ? F32_SIGN : 0) ^ f32(b).v) & F32_SIGN))
346 #define fsgnj64(a, b, n, x) \
347 f64((f64(a).v & ~F64_SIGN) | ((((x) ? f64(a).v : (n) ? F64_SIGN : 0) ^ f64(b).v) & F64_SIGN))
348
349 #define isNaNF128(x) isNaNF128UI(x.v[1], x.v[0])
defaultNaNF128()350 inline float128_t defaultNaNF128()
351 {
352 float128_t nan;
353 nan.v[1] = defaultNaNF128UI64;
354 nan.v[0] = defaultNaNF128UI0;
355 return nan;
356 }
fsgnj128(freg_t a,freg_t b,bool n,bool x)357 inline freg_t fsgnj128(freg_t a, freg_t b, bool n, bool x)
358 {
359 a.v[1] = (a.v[1] & ~F64_SIGN) | (((x ? a.v[1] : n ? F64_SIGN : 0) ^ b.v[1]) & F64_SIGN);
360 return a;
361 }
f128_negate(freg_t a)362 inline freg_t f128_negate(freg_t a)
363 {
364 a.v[1] ^= F64_SIGN;
365 return a;
366 }
367
368 #define validate_csr(which, write) ({ \
369 if (!STATE.serialized) return PC_SERIALIZE_BEFORE; \
370 STATE.serialized = false; \
371 /* permissions check occurs in get_csr */ \
372 (which); })
373
374 /* For debug only. This will fail if the native machine's float types are not IEEE */
to_f(float32_t f)375 inline float to_f(float32_t f){float r; memcpy(&r, &f, sizeof(r)); return r;}
to_f(float64_t f)376 inline double to_f(float64_t f){double r; memcpy(&r, &f, sizeof(r)); return r;}
to_f(float128_t f)377 inline long double to_f(float128_t f){long double r; memcpy(&r, &f, sizeof(r)); return r;}
378
379 // Vector macros
380 #define e8 8 // 8b elements
381 #define e16 16 // 16b elements
382 #define e32 32 // 32b elements
383 #define e64 64 // 64b elements
384 #define e128 128 // 128b elements
385 #define e256 256 // 256b elements
386 #define e512 512 // 512b elements
387 #define e1024 1024 // 1024b elements
388
389 #define vsext(x, sew) (((sreg_t)(x) << (64-sew)) >> (64-sew))
390 #define vzext(x, sew) (((reg_t)(x) << (64-sew)) >> (64-sew))
391
392 #define DEBUG_RVV 0
393
394 #if DEBUG_RVV
395 #define DEBUG_RVV_FP_VV \
396 printf("vfp(%lu) vd=%f vs1=%f vs2=%f\n", i, to_f(vd), to_f(vs1), to_f(vs2));
397 #define DEBUG_RVV_FP_VF \
398 printf("vfp(%lu) vd=%f vs1=%f vs2=%f\n", i, to_f(vd), to_f(rs1), to_f(vs2));
399 #define DEBUG_RVV_FMA_VV \
400 printf("vfma(%lu) vd=%f vs1=%f vs2=%f vd_old=%f\n", i, to_f(vd), to_f(vs1), to_f(vs2), to_f(vd_old));
401 #define DEBUG_RVV_FMA_VF \
402 printf("vfma(%lu) vd=%f vs1=%f vs2=%f vd_old=%f\n", i, to_f(vd), to_f(rs1), to_f(vs2), to_f(vd_old));
403 #else
404 #define DEBUG_RVV_FP_VV 0
405 #define DEBUG_RVV_FP_VF 0
406 #define DEBUG_RVV_FMA_VV 0
407 #define DEBUG_RVV_FMA_VF 0
408 #endif
409
410 //
411 // vector: masking skip helper
412 //
413 #define VI_MASK_VARS \
414 const int midx = i / 64; \
415 const int mpos = i % 64;
416
417 #define VI_LOOP_ELEMENT_SKIP(BODY) \
418 VI_MASK_VARS \
419 if (insn.v_vm() == 0) { \
420 BODY; \
421 bool skip = ((P.VU.elt<uint64_t>(0, midx) >> mpos) & 0x1) == 0; \
422 if (skip) {\
423 continue; \
424 }\
425 }
426
427 #define VI_ELEMENT_SKIP(inx) \
428 if (inx >= vl) { \
429 continue; \
430 } else if (inx < P.VU.vstart->read()) { \
431 continue; \
432 } else { \
433 VI_LOOP_ELEMENT_SKIP(); \
434 }
435
436 //
437 // vector: operation and register acccess check helper
438 //
is_overlapped(const int astart,int asize,const int bstart,int bsize)439 static inline bool is_overlapped(const int astart, int asize,
440 const int bstart, int bsize)
441 {
442 asize = asize == 0 ? 1 : asize;
443 bsize = bsize == 0 ? 1 : bsize;
444
445 const int aend = astart + asize;
446 const int bend = bstart + bsize;
447
448 return std::max(aend, bend) - std::min(astart, bstart) < asize + bsize;
449 }
450
is_overlapped_widen(const int astart,int asize,const int bstart,int bsize)451 static inline bool is_overlapped_widen(const int astart, int asize,
452 const int bstart, int bsize)
453 {
454 asize = asize == 0 ? 1 : asize;
455 bsize = bsize == 0 ? 1 : bsize;
456
457 const int aend = astart + asize;
458 const int bend = bstart + bsize;
459
460 if (astart < bstart &&
461 is_overlapped(astart, asize, bstart, bsize) &&
462 !is_overlapped(astart, asize, bstart + bsize, bsize)) {
463 return false;
464 } else {
465 return std::max(aend, bend) - std::min(astart, bstart) < asize + bsize;
466 }
467 }
468
is_aligned(const unsigned val,const unsigned pos)469 static inline bool is_aligned(const unsigned val, const unsigned pos)
470 {
471 return pos ? (val & (pos - 1)) == 0 : true;
472 }
473
474 #define VI_NARROW_CHECK_COMMON \
475 require_vector(true);\
476 require(P.VU.vflmul <= 4); \
477 require(P.VU.vsew * 2 <= P.VU.ELEN); \
478 require_align(insn.rs2(), P.VU.vflmul * 2); \
479 require_align(insn.rd(), P.VU.vflmul); \
480 require_vm; \
481
482 #define VI_WIDE_CHECK_COMMON \
483 require_vector(true);\
484 require(P.VU.vflmul <= 4); \
485 require(P.VU.vsew * 2 <= P.VU.ELEN); \
486 require_align(insn.rd(), P.VU.vflmul * 2); \
487 require_vm; \
488
489 #define VI_CHECK_ST_INDEX(elt_width) \
490 require_vector(false); \
491 float vemul = ((float)elt_width / P.VU.vsew * P.VU.vflmul); \
492 require(vemul >= 0.125 && vemul <= 8); \
493 reg_t emul = vemul < 1 ? 1 : vemul; \
494 reg_t flmul = P.VU.vflmul < 1 ? 1 : P.VU.vflmul; \
495 require_align(insn.rd(), P.VU.vflmul); \
496 require_align(insn.rs2(), vemul); \
497 require((nf * flmul) <= (NVPR / 4) && \
498 (insn.rd() + nf * flmul) <= NVPR); \
499
500 #define VI_CHECK_LD_INDEX(elt_width) \
501 VI_CHECK_ST_INDEX(elt_width); \
502 for (reg_t idx = 0; idx < nf; ++idx) { \
503 reg_t flmul = P.VU.vflmul < 1 ? 1 : P.VU.vflmul; \
504 reg_t seg_vd = insn.rd() + flmul * idx; \
505 if (elt_width > P.VU.vsew) { \
506 if (seg_vd != insn.rs2()) \
507 require_noover(seg_vd, P.VU.vflmul, insn.rs2(), vemul); \
508 } else if (elt_width < P.VU.vsew) { \
509 if (vemul < 1) {\
510 require_noover(seg_vd, P.VU.vflmul, insn.rs2(), vemul); \
511 } else {\
512 require_noover_widen(seg_vd, P.VU.vflmul, insn.rs2(), vemul); \
513 } \
514 } \
515 if (nf >= 2) { \
516 require_noover(seg_vd, P.VU.vflmul, insn.rs2(), vemul); \
517 } \
518 } \
519 require_vm; \
520
521 #define VI_CHECK_MSS(is_vs1) \
522 if (insn.rd() != insn.rs2()) \
523 require_noover(insn.rd(), 1, insn.rs2(), P.VU.vflmul); \
524 require_align(insn.rs2(), P.VU.vflmul); \
525 if (is_vs1) {\
526 if (insn.rd() != insn.rs1()) \
527 require_noover(insn.rd(), 1, insn.rs1(), P.VU.vflmul); \
528 require_align(insn.rs1(), P.VU.vflmul); \
529 } \
530
531 #define VI_CHECK_SSS(is_vs1) \
532 require_vm; \
533 if (P.VU.vflmul > 1) { \
534 require_align(insn.rd(), P.VU.vflmul); \
535 require_align(insn.rs2(), P.VU.vflmul); \
536 if (is_vs1) { \
537 require_align(insn.rs1(), P.VU.vflmul); \
538 } \
539 }
540
541 #define VI_CHECK_STORE(elt_width, is_mask_ldst) \
542 require_vector(false); \
543 reg_t veew = is_mask_ldst ? 1 : sizeof(elt_width##_t) * 8; \
544 float vemul = is_mask_ldst ? 1 : ((float)veew / P.VU.vsew * P.VU.vflmul); \
545 reg_t emul = vemul < 1 ? 1 : vemul; \
546 require(vemul >= 0.125 && vemul <= 8); \
547 require_align(insn.rd(), vemul); \
548 require((nf * emul) <= (NVPR / 4) && \
549 (insn.rd() + nf * emul) <= NVPR); \
550
551 #define VI_CHECK_LOAD(elt_width, is_mask_ldst) \
552 VI_CHECK_STORE(elt_width, is_mask_ldst); \
553 require_vm; \
554
555 #define VI_CHECK_DSS(is_vs1) \
556 VI_WIDE_CHECK_COMMON; \
557 require_align(insn.rs2(), P.VU.vflmul); \
558 if (P.VU.vflmul < 1) {\
559 require_noover(insn.rd(), P.VU.vflmul * 2, insn.rs2(), P.VU.vflmul); \
560 } else {\
561 require_noover_widen(insn.rd(), P.VU.vflmul * 2, insn.rs2(), P.VU.vflmul); \
562 } \
563 if (is_vs1) {\
564 require_align(insn.rs1(), P.VU.vflmul); \
565 if (P.VU.vflmul < 1) {\
566 require_noover(insn.rd(), P.VU.vflmul * 2, insn.rs1(), P.VU.vflmul); \
567 } else {\
568 require_noover_widen(insn.rd(), P.VU.vflmul * 2, insn.rs1(), P.VU.vflmul); \
569 } \
570 }
571
572 #define VI_CHECK_DDS(is_rs) \
573 VI_WIDE_CHECK_COMMON; \
574 require_align(insn.rs2(), P.VU.vflmul * 2); \
575 if (is_rs) { \
576 require_align(insn.rs1(), P.VU.vflmul); \
577 if (P.VU.vflmul < 1) {\
578 require_noover(insn.rd(), P.VU.vflmul * 2, insn.rs1(), P.VU.vflmul); \
579 } else {\
580 require_noover_widen(insn.rd(), P.VU.vflmul * 2, insn.rs1(), P.VU.vflmul); \
581 } \
582 }
583
584 #define VI_CHECK_SDS(is_vs1) \
585 VI_NARROW_CHECK_COMMON; \
586 if (insn.rd() != insn.rs2()) \
587 require_noover(insn.rd(), P.VU.vflmul, insn.rs2(), P.VU.vflmul * 2); \
588 if (is_vs1) \
589 require_align(insn.rs1(), P.VU.vflmul); \
590
591 #define VI_CHECK_REDUCTION(is_wide) \
592 require_vector(true);\
593 if (is_wide) {\
594 require(P.VU.vsew * 2 <= P.VU.ELEN); \
595 } \
596 require_align(insn.rs2(), P.VU.vflmul); \
597 require(P.VU.vstart->read() == 0); \
598
599 #define VI_CHECK_SLIDE(is_over) \
600 require_align(insn.rs2(), P.VU.vflmul); \
601 require_align(insn.rd(), P.VU.vflmul); \
602 require_vm; \
603 if (is_over) \
604 require(insn.rd() != insn.rs2()); \
605
606
607 //
608 // vector: loop header and end helper
609 //
610 #define VI_GENERAL_LOOP_BASE \
611 require(P.VU.vsew >= e8 && P.VU.vsew <= e64); \
612 require_vector(true);\
613 reg_t vl = P.VU.vl->read(); \
614 reg_t sew = P.VU.vsew; \
615 reg_t rd_num = insn.rd(); \
616 reg_t rs1_num = insn.rs1(); \
617 reg_t rs2_num = insn.rs2(); \
618 for (reg_t i=P.VU.vstart->read(); i<vl; ++i){
619
620 #define VI_LOOP_BASE \
621 VI_GENERAL_LOOP_BASE \
622 VI_LOOP_ELEMENT_SKIP();
623
624 #define VI_LOOP_END \
625 } \
626 P.VU.vstart->write(0);
627
628 #define VI_LOOP_REDUCTION_END(x) \
629 } \
630 if (vl > 0) { \
631 vd_0_des = vd_0_res; \
632 } \
633 P.VU.vstart->write(0);
634
635 #define VI_LOOP_CMP_BASE \
636 require(P.VU.vsew >= e8 && P.VU.vsew <= e64); \
637 require_vector(true);\
638 reg_t vl = P.VU.vl->read(); \
639 reg_t sew = P.VU.vsew; \
640 reg_t rd_num = insn.rd(); \
641 reg_t rs1_num = insn.rs1(); \
642 reg_t rs2_num = insn.rs2(); \
643 for (reg_t i=P.VU.vstart->read(); i<vl; ++i){ \
644 VI_LOOP_ELEMENT_SKIP(); \
645 uint64_t mmask = UINT64_C(1) << mpos; \
646 uint64_t &vdi = P.VU.elt<uint64_t>(insn.rd(), midx, true); \
647 uint64_t res = 0;
648
649 #define VI_LOOP_CMP_END \
650 vdi = (vdi & ~mmask) | (((res) << mpos) & mmask); \
651 } \
652 P.VU.vstart->write(0);
653
654 #define VI_LOOP_MASK(op) \
655 require(P.VU.vsew <= e64); \
656 require_vector(true);\
657 reg_t vl = P.VU.vl->read(); \
658 for (reg_t i = P.VU.vstart->read(); i < vl; ++i) { \
659 int midx = i / 64; \
660 int mpos = i % 64; \
661 uint64_t mmask = UINT64_C(1) << mpos; \
662 uint64_t vs2 = P.VU.elt<uint64_t>(insn.rs2(), midx); \
663 uint64_t vs1 = P.VU.elt<uint64_t>(insn.rs1(), midx); \
664 uint64_t &res = P.VU.elt<uint64_t>(insn.rd(), midx, true); \
665 res = (res & ~mmask) | ((op) & (1ULL << mpos)); \
666 } \
667 P.VU.vstart->write(0);
668
669 #define VI_LOOP_NSHIFT_BASE \
670 VI_GENERAL_LOOP_BASE; \
671 VI_LOOP_ELEMENT_SKIP({\
672 require(!(insn.rd() == 0 && P.VU.vflmul > 1));\
673 });
674
675
676 #define INT_ROUNDING(result, xrm, gb) \
677 do { \
678 const uint64_t lsb = 1UL << (gb); \
679 const uint64_t lsb_half = lsb >> 1; \
680 switch (xrm) {\
681 case VRM::RNU:\
682 result += lsb_half; \
683 break;\
684 case VRM::RNE:\
685 if ((result & lsb_half) && ((result & (lsb_half - 1)) || (result & lsb))) \
686 result += lsb; \
687 break;\
688 case VRM::RDN:\
689 break;\
690 case VRM::ROD:\
691 if (result & (lsb - 1)) \
692 result |= lsb; \
693 break;\
694 case VRM::INVALID_RM:\
695 assert(true);\
696 } \
697 } while (0)
698
699 //
700 // vector: integer and masking operand access helper
701 //
702 #define VXI_PARAMS(x) \
703 type_sew_t<x>::type &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i, true); \
704 type_sew_t<x>::type vs1 = P.VU.elt<type_sew_t<x>::type>(rs1_num, i); \
705 type_sew_t<x>::type vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i); \
706 type_sew_t<x>::type rs1 = (type_sew_t<x>::type)RS1; \
707 type_sew_t<x>::type simm5 = (type_sew_t<x>::type)insn.v_simm5();
708
709 #define VV_U_PARAMS(x) \
710 type_usew_t<x>::type &vd = P.VU.elt<type_usew_t<x>::type>(rd_num, i, true); \
711 type_usew_t<x>::type vs1 = P.VU.elt<type_usew_t<x>::type>(rs1_num, i); \
712 type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
713
714 #define VX_U_PARAMS(x) \
715 type_usew_t<x>::type &vd = P.VU.elt<type_usew_t<x>::type>(rd_num, i, true); \
716 type_usew_t<x>::type rs1 = (type_usew_t<x>::type)RS1; \
717 type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
718
719 #define VI_U_PARAMS(x) \
720 type_usew_t<x>::type &vd = P.VU.elt<type_usew_t<x>::type>(rd_num, i, true); \
721 type_usew_t<x>::type zimm5 = (type_usew_t<x>::type)insn.v_zimm5(); \
722 type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
723
724 #define VV_PARAMS(x) \
725 type_sew_t<x>::type &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i, true); \
726 type_sew_t<x>::type vs1 = P.VU.elt<type_sew_t<x>::type>(rs1_num, i); \
727 type_sew_t<x>::type vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i);
728
729 #define VX_PARAMS(x) \
730 type_sew_t<x>::type &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i, true); \
731 type_sew_t<x>::type rs1 = (type_sew_t<x>::type)RS1; \
732 type_sew_t<x>::type vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i);
733
734 #define VI_PARAMS(x) \
735 type_sew_t<x>::type &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i, true); \
736 type_sew_t<x>::type simm5 = (type_sew_t<x>::type)insn.v_simm5(); \
737 type_sew_t<x>::type vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i);
738
739 #define XV_PARAMS(x) \
740 type_sew_t<x>::type &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i, true); \
741 type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, RS1);
742
743 #define VV_UCMP_PARAMS(x) \
744 type_usew_t<x>::type vs1 = P.VU.elt<type_usew_t<x>::type>(rs1_num, i); \
745 type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
746
747 #define VX_UCMP_PARAMS(x) \
748 type_usew_t<x>::type rs1 = (type_usew_t<x>::type)RS1; \
749 type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
750
751 #define VI_UCMP_PARAMS(x) \
752 type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
753
754 #define VV_CMP_PARAMS(x) \
755 type_sew_t<x>::type vs1 = P.VU.elt<type_sew_t<x>::type>(rs1_num, i); \
756 type_sew_t<x>::type vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i);
757
758 #define VX_CMP_PARAMS(x) \
759 type_sew_t<x>::type rs1 = (type_sew_t<x>::type)RS1; \
760 type_sew_t<x>::type vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i);
761
762 #define VI_CMP_PARAMS(x) \
763 type_sew_t<x>::type simm5 = (type_sew_t<x>::type)insn.v_simm5(); \
764 type_sew_t<x>::type vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i);
765
766 #define VI_XI_SLIDEDOWN_PARAMS(x, off) \
767 auto &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i, true); \
768 auto vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i + off);
769
770 #define VI_XI_SLIDEUP_PARAMS(x, offset) \
771 auto &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i, true); \
772 auto vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i - offset);
773
774 #define VI_NSHIFT_PARAMS(sew1, sew2) \
775 auto &vd = P.VU.elt<type_usew_t<sew1>::type>(rd_num, i, true); \
776 auto vs2_u = P.VU.elt<type_usew_t<sew2>::type>(rs2_num, i); \
777 auto vs2 = P.VU.elt<type_sew_t<sew2>::type>(rs2_num, i); \
778 auto zimm5 = (type_usew_t<sew1>::type)insn.v_zimm5();
779
780 #define VX_NSHIFT_PARAMS(sew1, sew2) \
781 auto &vd = P.VU.elt<type_usew_t<sew1>::type>(rd_num, i, true); \
782 auto vs2_u = P.VU.elt<type_usew_t<sew2>::type>(rs2_num, i); \
783 auto vs2 = P.VU.elt<type_sew_t<sew2>::type>(rs2_num, i); \
784 auto rs1 = (type_sew_t<sew1>::type)RS1;
785
786 #define VV_NSHIFT_PARAMS(sew1, sew2) \
787 auto &vd = P.VU.elt<type_usew_t<sew1>::type>(rd_num, i, true); \
788 auto vs2_u = P.VU.elt<type_usew_t<sew2>::type>(rs2_num, i); \
789 auto vs2 = P.VU.elt<type_sew_t<sew2>::type>(rs2_num, i); \
790 auto vs1 = P.VU.elt<type_sew_t<sew1>::type>(rs1_num, i);
791
792 #define XI_CARRY_PARAMS(x) \
793 auto vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i); \
794 auto rs1 = (type_sew_t<x>::type)RS1; \
795 auto simm5 = (type_sew_t<x>::type)insn.v_simm5(); \
796 auto &vd = P.VU.elt<uint64_t>(rd_num, midx, true);
797
798 #define VV_CARRY_PARAMS(x) \
799 auto vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i); \
800 auto vs1 = P.VU.elt<type_sew_t<x>::type>(rs1_num, i); \
801 auto &vd = P.VU.elt<uint64_t>(rd_num, midx, true);
802
803 #define XI_WITH_CARRY_PARAMS(x) \
804 auto vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i); \
805 auto rs1 = (type_sew_t<x>::type)RS1; \
806 auto simm5 = (type_sew_t<x>::type)insn.v_simm5(); \
807 auto &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i, true);
808
809 #define VV_WITH_CARRY_PARAMS(x) \
810 auto vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i); \
811 auto vs1 = P.VU.elt<type_sew_t<x>::type>(rs1_num, i); \
812 auto &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i, true);
813
814 //
815 // vector: integer and masking operation loop
816 //
817
818 // comparision result to masking register
819 #define VI_VV_LOOP_CMP(BODY) \
820 VI_CHECK_MSS(true); \
821 VI_LOOP_CMP_BASE \
822 if (sew == e8){ \
823 VV_CMP_PARAMS(e8); \
824 BODY; \
825 }else if(sew == e16){ \
826 VV_CMP_PARAMS(e16); \
827 BODY; \
828 }else if(sew == e32){ \
829 VV_CMP_PARAMS(e32); \
830 BODY; \
831 }else if(sew == e64){ \
832 VV_CMP_PARAMS(e64); \
833 BODY; \
834 } \
835 VI_LOOP_CMP_END
836
837 #define VI_VX_LOOP_CMP(BODY) \
838 VI_CHECK_MSS(false); \
839 VI_LOOP_CMP_BASE \
840 if (sew == e8){ \
841 VX_CMP_PARAMS(e8); \
842 BODY; \
843 }else if(sew == e16){ \
844 VX_CMP_PARAMS(e16); \
845 BODY; \
846 }else if(sew == e32){ \
847 VX_CMP_PARAMS(e32); \
848 BODY; \
849 }else if(sew == e64){ \
850 VX_CMP_PARAMS(e64); \
851 BODY; \
852 } \
853 VI_LOOP_CMP_END
854
855 #define VI_VI_LOOP_CMP(BODY) \
856 VI_CHECK_MSS(false); \
857 VI_LOOP_CMP_BASE \
858 if (sew == e8){ \
859 VI_CMP_PARAMS(e8); \
860 BODY; \
861 }else if(sew == e16){ \
862 VI_CMP_PARAMS(e16); \
863 BODY; \
864 }else if(sew == e32){ \
865 VI_CMP_PARAMS(e32); \
866 BODY; \
867 }else if(sew == e64){ \
868 VI_CMP_PARAMS(e64); \
869 BODY; \
870 } \
871 VI_LOOP_CMP_END
872
873 #define VI_VV_ULOOP_CMP(BODY) \
874 VI_CHECK_MSS(true); \
875 VI_LOOP_CMP_BASE \
876 if (sew == e8){ \
877 VV_UCMP_PARAMS(e8); \
878 BODY; \
879 }else if(sew == e16){ \
880 VV_UCMP_PARAMS(e16); \
881 BODY; \
882 }else if(sew == e32){ \
883 VV_UCMP_PARAMS(e32); \
884 BODY; \
885 }else if(sew == e64){ \
886 VV_UCMP_PARAMS(e64); \
887 BODY; \
888 } \
889 VI_LOOP_CMP_END
890
891 #define VI_VX_ULOOP_CMP(BODY) \
892 VI_CHECK_MSS(false); \
893 VI_LOOP_CMP_BASE \
894 if (sew == e8){ \
895 VX_UCMP_PARAMS(e8); \
896 BODY; \
897 }else if(sew == e16){ \
898 VX_UCMP_PARAMS(e16); \
899 BODY; \
900 }else if(sew == e32){ \
901 VX_UCMP_PARAMS(e32); \
902 BODY; \
903 }else if(sew == e64){ \
904 VX_UCMP_PARAMS(e64); \
905 BODY; \
906 } \
907 VI_LOOP_CMP_END
908
909 #define VI_VI_ULOOP_CMP(BODY) \
910 VI_CHECK_MSS(false); \
911 VI_LOOP_CMP_BASE \
912 if (sew == e8){ \
913 VI_UCMP_PARAMS(e8); \
914 BODY; \
915 }else if(sew == e16){ \
916 VI_UCMP_PARAMS(e16); \
917 BODY; \
918 }else if(sew == e32){ \
919 VI_UCMP_PARAMS(e32); \
920 BODY; \
921 }else if(sew == e64){ \
922 VI_UCMP_PARAMS(e64); \
923 BODY; \
924 } \
925 VI_LOOP_CMP_END
926
927 // merge and copy loop
928 #define VI_VVXI_MERGE_LOOP(BODY) \
929 VI_GENERAL_LOOP_BASE \
930 if (sew == e8){ \
931 VXI_PARAMS(e8); \
932 BODY; \
933 }else if(sew == e16){ \
934 VXI_PARAMS(e16); \
935 BODY; \
936 }else if(sew == e32){ \
937 VXI_PARAMS(e32); \
938 BODY; \
939 }else if(sew == e64){ \
940 VXI_PARAMS(e64); \
941 BODY; \
942 } \
943 VI_LOOP_END
944
945 // reduction loop - signed
946 #define VI_LOOP_REDUCTION_BASE(x) \
947 require(x >= e8 && x <= e64); \
948 reg_t vl = P.VU.vl->read(); \
949 reg_t rd_num = insn.rd(); \
950 reg_t rs1_num = insn.rs1(); \
951 reg_t rs2_num = insn.rs2(); \
952 auto &vd_0_des = P.VU.elt<type_sew_t<x>::type>(rd_num, 0, true); \
953 auto vd_0_res = P.VU.elt<type_sew_t<x>::type>(rs1_num, 0); \
954 for (reg_t i=P.VU.vstart->read(); i<vl; ++i){ \
955 VI_LOOP_ELEMENT_SKIP(); \
956 auto vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i); \
957
958 #define REDUCTION_LOOP(x, BODY) \
959 VI_LOOP_REDUCTION_BASE(x) \
960 BODY; \
961 VI_LOOP_REDUCTION_END(x)
962
963 #define VI_VV_LOOP_REDUCTION(BODY) \
964 VI_CHECK_REDUCTION(false); \
965 reg_t sew = P.VU.vsew; \
966 if (sew == e8) { \
967 REDUCTION_LOOP(e8, BODY) \
968 } else if(sew == e16) { \
969 REDUCTION_LOOP(e16, BODY) \
970 } else if(sew == e32) { \
971 REDUCTION_LOOP(e32, BODY) \
972 } else if(sew == e64) { \
973 REDUCTION_LOOP(e64, BODY) \
974 }
975
976 // reduction loop - unsgied
977 #define VI_ULOOP_REDUCTION_BASE(x) \
978 require(x >= e8 && x <= e64); \
979 reg_t vl = P.VU.vl->read(); \
980 reg_t rd_num = insn.rd(); \
981 reg_t rs1_num = insn.rs1(); \
982 reg_t rs2_num = insn.rs2(); \
983 auto &vd_0_des = P.VU.elt<type_usew_t<x>::type>(rd_num, 0, true); \
984 auto vd_0_res = P.VU.elt<type_usew_t<x>::type>(rs1_num, 0); \
985 for (reg_t i=P.VU.vstart->read(); i<vl; ++i){ \
986 VI_LOOP_ELEMENT_SKIP(); \
987 auto vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
988
989 #define REDUCTION_ULOOP(x, BODY) \
990 VI_ULOOP_REDUCTION_BASE(x) \
991 BODY; \
992 VI_LOOP_REDUCTION_END(x)
993
994 #define VI_VV_ULOOP_REDUCTION(BODY) \
995 VI_CHECK_REDUCTION(false); \
996 reg_t sew = P.VU.vsew; \
997 if (sew == e8){ \
998 REDUCTION_ULOOP(e8, BODY) \
999 } else if(sew == e16) { \
1000 REDUCTION_ULOOP(e16, BODY) \
1001 } else if(sew == e32) { \
1002 REDUCTION_ULOOP(e32, BODY) \
1003 } else if(sew == e64) { \
1004 REDUCTION_ULOOP(e64, BODY) \
1005 }
1006
1007
1008 // genearl VXI signed/unsgied loop
1009 #define VI_VV_ULOOP(BODY) \
1010 VI_CHECK_SSS(true) \
1011 VI_LOOP_BASE \
1012 if (sew == e8){ \
1013 VV_U_PARAMS(e8); \
1014 BODY; \
1015 }else if(sew == e16){ \
1016 VV_U_PARAMS(e16); \
1017 BODY; \
1018 }else if(sew == e32){ \
1019 VV_U_PARAMS(e32); \
1020 BODY; \
1021 }else if(sew == e64){ \
1022 VV_U_PARAMS(e64); \
1023 BODY; \
1024 } \
1025 VI_LOOP_END
1026
1027 #define VI_VV_LOOP(BODY) \
1028 VI_CHECK_SSS(true) \
1029 VI_LOOP_BASE \
1030 if (sew == e8){ \
1031 VV_PARAMS(e8); \
1032 BODY; \
1033 }else if(sew == e16){ \
1034 VV_PARAMS(e16); \
1035 BODY; \
1036 }else if(sew == e32){ \
1037 VV_PARAMS(e32); \
1038 BODY; \
1039 }else if(sew == e64){ \
1040 VV_PARAMS(e64); \
1041 BODY; \
1042 } \
1043 VI_LOOP_END
1044
1045 #define VI_VX_ULOOP(BODY) \
1046 VI_CHECK_SSS(false) \
1047 VI_LOOP_BASE \
1048 if (sew == e8){ \
1049 VX_U_PARAMS(e8); \
1050 BODY; \
1051 }else if(sew == e16){ \
1052 VX_U_PARAMS(e16); \
1053 BODY; \
1054 }else if(sew == e32){ \
1055 VX_U_PARAMS(e32); \
1056 BODY; \
1057 }else if(sew == e64){ \
1058 VX_U_PARAMS(e64); \
1059 BODY; \
1060 } \
1061 VI_LOOP_END
1062
1063 #define VI_VX_LOOP(BODY) \
1064 VI_CHECK_SSS(false) \
1065 VI_LOOP_BASE \
1066 if (sew == e8){ \
1067 VX_PARAMS(e8); \
1068 BODY; \
1069 }else if(sew == e16){ \
1070 VX_PARAMS(e16); \
1071 BODY; \
1072 }else if(sew == e32){ \
1073 VX_PARAMS(e32); \
1074 BODY; \
1075 }else if(sew == e64){ \
1076 VX_PARAMS(e64); \
1077 BODY; \
1078 } \
1079 VI_LOOP_END
1080
1081 #define VI_VI_ULOOP(BODY) \
1082 VI_CHECK_SSS(false) \
1083 VI_LOOP_BASE \
1084 if (sew == e8){ \
1085 VI_U_PARAMS(e8); \
1086 BODY; \
1087 }else if(sew == e16){ \
1088 VI_U_PARAMS(e16); \
1089 BODY; \
1090 }else if(sew == e32){ \
1091 VI_U_PARAMS(e32); \
1092 BODY; \
1093 }else if(sew == e64){ \
1094 VI_U_PARAMS(e64); \
1095 BODY; \
1096 } \
1097 VI_LOOP_END
1098
1099 #define VI_VI_LOOP(BODY) \
1100 VI_CHECK_SSS(false) \
1101 VI_LOOP_BASE \
1102 if (sew == e8){ \
1103 VI_PARAMS(e8); \
1104 BODY; \
1105 }else if(sew == e16){ \
1106 VI_PARAMS(e16); \
1107 BODY; \
1108 }else if(sew == e32){ \
1109 VI_PARAMS(e32); \
1110 BODY; \
1111 }else if(sew == e64){ \
1112 VI_PARAMS(e64); \
1113 BODY; \
1114 } \
1115 VI_LOOP_END
1116
1117 // narrow operation loop
1118 #define VI_VV_LOOP_NARROW(BODY) \
1119 VI_NARROW_CHECK_COMMON; \
1120 VI_LOOP_BASE \
1121 if (sew == e8){ \
1122 VI_NARROW_SHIFT(e8, e16) \
1123 BODY; \
1124 }else if(sew == e16){ \
1125 VI_NARROW_SHIFT(e16, e32) \
1126 BODY; \
1127 }else if(sew == e32){ \
1128 VI_NARROW_SHIFT(e32, e64) \
1129 BODY; \
1130 } \
1131 VI_LOOP_END
1132
1133 #define VI_NARROW_SHIFT(sew1, sew2) \
1134 type_usew_t<sew1>::type &vd = P.VU.elt<type_usew_t<sew1>::type>(rd_num, i, true); \
1135 type_usew_t<sew2>::type vs2_u = P.VU.elt<type_usew_t<sew2>::type>(rs2_num, i); \
1136 type_usew_t<sew1>::type zimm5 = (type_usew_t<sew1>::type)insn.v_zimm5(); \
1137 type_sew_t<sew2>::type vs2 = P.VU.elt<type_sew_t<sew2>::type>(rs2_num, i); \
1138 type_sew_t<sew1>::type vs1 = P.VU.elt<type_sew_t<sew1>::type>(rs1_num, i); \
1139 type_sew_t<sew1>::type rs1 = (type_sew_t<sew1>::type)RS1;
1140
1141 #define VI_VVXI_LOOP_NARROW(BODY, is_vs1) \
1142 VI_CHECK_SDS(is_vs1); \
1143 VI_LOOP_BASE \
1144 if (sew == e8){ \
1145 VI_NARROW_SHIFT(e8, e16) \
1146 BODY; \
1147 } else if (sew == e16) { \
1148 VI_NARROW_SHIFT(e16, e32) \
1149 BODY; \
1150 } else if (sew == e32) { \
1151 VI_NARROW_SHIFT(e32, e64) \
1152 BODY; \
1153 } \
1154 VI_LOOP_END
1155
1156 #define VI_VI_LOOP_NSHIFT(BODY, is_vs1) \
1157 VI_CHECK_SDS(is_vs1); \
1158 VI_LOOP_NSHIFT_BASE \
1159 if (sew == e8){ \
1160 VI_NSHIFT_PARAMS(e8, e16) \
1161 BODY; \
1162 } else if (sew == e16) { \
1163 VI_NSHIFT_PARAMS(e16, e32) \
1164 BODY; \
1165 } else if (sew == e32) { \
1166 VI_NSHIFT_PARAMS(e32, e64) \
1167 BODY; \
1168 } \
1169 VI_LOOP_END
1170
1171 #define VI_VX_LOOP_NSHIFT(BODY, is_vs1) \
1172 VI_CHECK_SDS(is_vs1); \
1173 VI_LOOP_NSHIFT_BASE \
1174 if (sew == e8){ \
1175 VX_NSHIFT_PARAMS(e8, e16) \
1176 BODY; \
1177 } else if (sew == e16) { \
1178 VX_NSHIFT_PARAMS(e16, e32) \
1179 BODY; \
1180 } else if (sew == e32) { \
1181 VX_NSHIFT_PARAMS(e32, e64) \
1182 BODY; \
1183 } \
1184 VI_LOOP_END
1185
1186 #define VI_VV_LOOP_NSHIFT(BODY, is_vs1) \
1187 VI_CHECK_SDS(is_vs1); \
1188 VI_LOOP_NSHIFT_BASE \
1189 if (sew == e8){ \
1190 VV_NSHIFT_PARAMS(e8, e16) \
1191 BODY; \
1192 } else if (sew == e16) { \
1193 VV_NSHIFT_PARAMS(e16, e32) \
1194 BODY; \
1195 } else if (sew == e32) { \
1196 VV_NSHIFT_PARAMS(e32, e64) \
1197 BODY; \
1198 } \
1199 VI_LOOP_END
1200
1201 // widen operation loop
1202 #define VI_VV_LOOP_WIDEN(BODY) \
1203 VI_LOOP_BASE \
1204 if (sew == e8){ \
1205 VV_PARAMS(e8); \
1206 BODY; \
1207 }else if(sew == e16){ \
1208 VV_PARAMS(e16); \
1209 BODY; \
1210 }else if(sew == e32){ \
1211 VV_PARAMS(e32); \
1212 BODY; \
1213 } \
1214 VI_LOOP_END
1215
1216 #define VI_VX_LOOP_WIDEN(BODY) \
1217 VI_LOOP_BASE \
1218 if (sew == e8){ \
1219 VX_PARAMS(e8); \
1220 BODY; \
1221 }else if(sew == e16){ \
1222 VX_PARAMS(e16); \
1223 BODY; \
1224 }else if(sew == e32){ \
1225 VX_PARAMS(e32); \
1226 BODY; \
1227 } \
1228 VI_LOOP_END
1229
1230 #define VI_WIDE_OP_AND_ASSIGN(var0, var1, var2, op0, op1, sign) \
1231 switch(P.VU.vsew) { \
1232 case e8: { \
1233 sign##16_t vd_w = P.VU.elt<sign##16_t>(rd_num, i); \
1234 P.VU.elt<uint16_t>(rd_num, i, true) = \
1235 op1((sign##16_t)(sign##8_t)var0 op0 (sign##16_t)(sign##8_t)var1) + var2; \
1236 } \
1237 break; \
1238 case e16: { \
1239 sign##32_t vd_w = P.VU.elt<sign##32_t>(rd_num, i); \
1240 P.VU.elt<uint32_t>(rd_num, i, true) = \
1241 op1((sign##32_t)(sign##16_t)var0 op0 (sign##32_t)(sign##16_t)var1) + var2; \
1242 } \
1243 break; \
1244 default: { \
1245 sign##64_t vd_w = P.VU.elt<sign##64_t>(rd_num, i); \
1246 P.VU.elt<uint64_t>(rd_num, i, true) = \
1247 op1((sign##64_t)(sign##32_t)var0 op0 (sign##64_t)(sign##32_t)var1) + var2; \
1248 } \
1249 break; \
1250 }
1251
1252 #define VI_WIDE_OP_AND_ASSIGN_MIX(var0, var1, var2, op0, op1, sign_d, sign_1, sign_2) \
1253 switch(P.VU.vsew) { \
1254 case e8: { \
1255 sign_d##16_t vd_w = P.VU.elt<sign_d##16_t>(rd_num, i); \
1256 P.VU.elt<uint16_t>(rd_num, i, true) = \
1257 op1((sign_1##16_t)(sign_1##8_t)var0 op0 (sign_2##16_t)(sign_2##8_t)var1) + var2; \
1258 } \
1259 break; \
1260 case e16: { \
1261 sign_d##32_t vd_w = P.VU.elt<sign_d##32_t>(rd_num, i); \
1262 P.VU.elt<uint32_t>(rd_num, i, true) = \
1263 op1((sign_1##32_t)(sign_1##16_t)var0 op0 (sign_2##32_t)(sign_2##16_t)var1) + var2; \
1264 } \
1265 break; \
1266 default: { \
1267 sign_d##64_t vd_w = P.VU.elt<sign_d##64_t>(rd_num, i); \
1268 P.VU.elt<uint64_t>(rd_num, i, true) = \
1269 op1((sign_1##64_t)(sign_1##32_t)var0 op0 (sign_2##64_t)(sign_2##32_t)var1) + var2; \
1270 } \
1271 break; \
1272 }
1273
1274 #define VI_WIDE_WVX_OP(var0, op0, sign) \
1275 switch(P.VU.vsew) { \
1276 case e8: { \
1277 sign##16_t &vd_w = P.VU.elt<sign##16_t>(rd_num, i, true); \
1278 sign##16_t vs2_w = P.VU.elt<sign##16_t>(rs2_num, i); \
1279 vd_w = vs2_w op0 (sign##16_t)(sign##8_t)var0; \
1280 } \
1281 break; \
1282 case e16: { \
1283 sign##32_t &vd_w = P.VU.elt<sign##32_t>(rd_num, i, true); \
1284 sign##32_t vs2_w = P.VU.elt<sign##32_t>(rs2_num, i); \
1285 vd_w = vs2_w op0 (sign##32_t)(sign##16_t)var0; \
1286 } \
1287 break; \
1288 default: { \
1289 sign##64_t &vd_w = P.VU.elt<sign##64_t>(rd_num, i, true); \
1290 sign##64_t vs2_w = P.VU.elt<sign##64_t>(rs2_num, i); \
1291 vd_w = vs2_w op0 (sign##64_t)(sign##32_t)var0; \
1292 } \
1293 break; \
1294 }
1295
1296 // wide reduction loop - signed
1297 #define VI_LOOP_WIDE_REDUCTION_BASE(sew1, sew2) \
1298 reg_t vl = P.VU.vl->read(); \
1299 reg_t rd_num = insn.rd(); \
1300 reg_t rs1_num = insn.rs1(); \
1301 reg_t rs2_num = insn.rs2(); \
1302 auto &vd_0_des = P.VU.elt<type_sew_t<sew2>::type>(rd_num, 0, true); \
1303 auto vd_0_res = P.VU.elt<type_sew_t<sew2>::type>(rs1_num, 0); \
1304 for (reg_t i=P.VU.vstart->read(); i<vl; ++i){ \
1305 VI_LOOP_ELEMENT_SKIP(); \
1306 auto vs2 = P.VU.elt<type_sew_t<sew1>::type>(rs2_num, i);
1307
1308 #define WIDE_REDUCTION_LOOP(sew1, sew2, BODY) \
1309 VI_LOOP_WIDE_REDUCTION_BASE(sew1, sew2) \
1310 BODY; \
1311 VI_LOOP_REDUCTION_END(sew2)
1312
1313 #define VI_VV_LOOP_WIDE_REDUCTION(BODY) \
1314 VI_CHECK_REDUCTION(true); \
1315 reg_t sew = P.VU.vsew; \
1316 if (sew == e8){ \
1317 WIDE_REDUCTION_LOOP(e8, e16, BODY) \
1318 } else if(sew == e16){ \
1319 WIDE_REDUCTION_LOOP(e16, e32, BODY) \
1320 } else if(sew == e32){ \
1321 WIDE_REDUCTION_LOOP(e32, e64, BODY) \
1322 }
1323
1324 // wide reduction loop - unsigned
1325 #define VI_ULOOP_WIDE_REDUCTION_BASE(sew1, sew2) \
1326 reg_t vl = P.VU.vl->read(); \
1327 reg_t rd_num = insn.rd(); \
1328 reg_t rs1_num = insn.rs1(); \
1329 reg_t rs2_num = insn.rs2(); \
1330 auto &vd_0_des = P.VU.elt<type_usew_t<sew2>::type>(rd_num, 0, true); \
1331 auto vd_0_res = P.VU.elt<type_usew_t<sew2>::type>(rs1_num, 0); \
1332 for (reg_t i=P.VU.vstart->read(); i<vl; ++i) { \
1333 VI_LOOP_ELEMENT_SKIP(); \
1334 auto vs2 = P.VU.elt<type_usew_t<sew1>::type>(rs2_num, i);
1335
1336 #define WIDE_REDUCTION_ULOOP(sew1, sew2, BODY) \
1337 VI_ULOOP_WIDE_REDUCTION_BASE(sew1, sew2) \
1338 BODY; \
1339 VI_LOOP_REDUCTION_END(sew2)
1340
1341 #define VI_VV_ULOOP_WIDE_REDUCTION(BODY) \
1342 VI_CHECK_REDUCTION(true); \
1343 reg_t sew = P.VU.vsew; \
1344 if (sew == e8){ \
1345 WIDE_REDUCTION_ULOOP(e8, e16, BODY) \
1346 } else if(sew == e16){ \
1347 WIDE_REDUCTION_ULOOP(e16, e32, BODY) \
1348 } else if(sew == e32){ \
1349 WIDE_REDUCTION_ULOOP(e32, e64, BODY) \
1350 }
1351
1352 // carry/borrow bit loop
1353 #define VI_VV_LOOP_CARRY(BODY) \
1354 VI_CHECK_MSS(true); \
1355 VI_GENERAL_LOOP_BASE \
1356 VI_MASK_VARS \
1357 if (sew == e8){ \
1358 VV_CARRY_PARAMS(e8) \
1359 BODY; \
1360 } else if (sew == e16) { \
1361 VV_CARRY_PARAMS(e16) \
1362 BODY; \
1363 } else if (sew == e32) { \
1364 VV_CARRY_PARAMS(e32) \
1365 BODY; \
1366 } else if (sew == e64) { \
1367 VV_CARRY_PARAMS(e64) \
1368 BODY; \
1369 } \
1370 VI_LOOP_END
1371
1372 #define VI_XI_LOOP_CARRY(BODY) \
1373 VI_CHECK_MSS(false); \
1374 VI_GENERAL_LOOP_BASE \
1375 VI_MASK_VARS \
1376 if (sew == e8){ \
1377 XI_CARRY_PARAMS(e8) \
1378 BODY; \
1379 } else if (sew == e16) { \
1380 XI_CARRY_PARAMS(e16) \
1381 BODY; \
1382 } else if (sew == e32) { \
1383 XI_CARRY_PARAMS(e32) \
1384 BODY; \
1385 } else if (sew == e64) { \
1386 XI_CARRY_PARAMS(e64) \
1387 BODY; \
1388 } \
1389 VI_LOOP_END
1390
1391 #define VI_VV_LOOP_WITH_CARRY(BODY) \
1392 require(insn.rd() != 0); \
1393 VI_CHECK_SSS(true); \
1394 VI_GENERAL_LOOP_BASE \
1395 VI_MASK_VARS \
1396 if (sew == e8){ \
1397 VV_WITH_CARRY_PARAMS(e8) \
1398 BODY; \
1399 } else if (sew == e16) { \
1400 VV_WITH_CARRY_PARAMS(e16) \
1401 BODY; \
1402 } else if (sew == e32) { \
1403 VV_WITH_CARRY_PARAMS(e32) \
1404 BODY; \
1405 } else if (sew == e64) { \
1406 VV_WITH_CARRY_PARAMS(e64) \
1407 BODY; \
1408 } \
1409 VI_LOOP_END
1410
1411 #define VI_XI_LOOP_WITH_CARRY(BODY) \
1412 require(insn.rd() != 0); \
1413 VI_CHECK_SSS(false); \
1414 VI_GENERAL_LOOP_BASE \
1415 VI_MASK_VARS \
1416 if (sew == e8){ \
1417 XI_WITH_CARRY_PARAMS(e8) \
1418 BODY; \
1419 } else if (sew == e16) { \
1420 XI_WITH_CARRY_PARAMS(e16) \
1421 BODY; \
1422 } else if (sew == e32) { \
1423 XI_WITH_CARRY_PARAMS(e32) \
1424 BODY; \
1425 } else if (sew == e64) { \
1426 XI_WITH_CARRY_PARAMS(e64) \
1427 BODY; \
1428 } \
1429 VI_LOOP_END
1430
1431 // average loop
1432 #define VI_VVX_LOOP_AVG(opd, op, is_vs1) \
1433 VI_CHECK_SSS(is_vs1); \
1434 VRM xrm = p->VU.get_vround_mode(); \
1435 VI_LOOP_BASE \
1436 switch(sew) { \
1437 case e8: { \
1438 VV_PARAMS(e8); \
1439 type_sew_t<e8>::type rs1 = RS1; \
1440 auto res = (int32_t)vs2 op opd; \
1441 INT_ROUNDING(res, xrm, 1); \
1442 vd = res >> 1; \
1443 break; \
1444 } \
1445 case e16: { \
1446 VV_PARAMS(e16); \
1447 type_sew_t<e16>::type rs1 = RS1; \
1448 auto res = (int32_t)vs2 op opd; \
1449 INT_ROUNDING(res, xrm, 1); \
1450 vd = res >> 1; \
1451 break; \
1452 } \
1453 case e32: { \
1454 VV_PARAMS(e32); \
1455 type_sew_t<e32>::type rs1 = RS1; \
1456 auto res = (int64_t)vs2 op opd; \
1457 INT_ROUNDING(res, xrm, 1); \
1458 vd = res >> 1; \
1459 break; \
1460 } \
1461 default: { \
1462 VV_PARAMS(e64); \
1463 type_sew_t<e64>::type rs1 = RS1; \
1464 auto res = (int128_t)vs2 op opd; \
1465 INT_ROUNDING(res, xrm, 1); \
1466 vd = res >> 1; \
1467 break; \
1468 } \
1469 } \
1470 VI_LOOP_END
1471
1472 #define VI_VVX_ULOOP_AVG(opd, op, is_vs1) \
1473 VI_CHECK_SSS(is_vs1); \
1474 VRM xrm = p->VU.get_vround_mode(); \
1475 VI_LOOP_BASE \
1476 switch(sew) { \
1477 case e8: { \
1478 VV_U_PARAMS(e8); \
1479 type_usew_t<e8>::type rs1 = RS1; \
1480 auto res = (uint16_t)vs2 op opd; \
1481 INT_ROUNDING(res, xrm, 1); \
1482 vd = res >> 1; \
1483 break; \
1484 } \
1485 case e16: { \
1486 VV_U_PARAMS(e16); \
1487 type_usew_t<e16>::type rs1 = RS1; \
1488 auto res = (uint32_t)vs2 op opd; \
1489 INT_ROUNDING(res, xrm, 1); \
1490 vd = res >> 1; \
1491 break; \
1492 } \
1493 case e32: { \
1494 VV_U_PARAMS(e32); \
1495 type_usew_t<e32>::type rs1 = RS1; \
1496 auto res = (uint64_t)vs2 op opd; \
1497 INT_ROUNDING(res, xrm, 1); \
1498 vd = res >> 1; \
1499 break; \
1500 } \
1501 default: { \
1502 VV_U_PARAMS(e64); \
1503 type_usew_t<e64>::type rs1 = RS1; \
1504 auto res = (uint128_t)vs2 op opd; \
1505 INT_ROUNDING(res, xrm, 1); \
1506 vd = res >> 1; \
1507 break; \
1508 } \
1509 } \
1510 VI_LOOP_END
1511
1512 //
1513 // vector: load/store helper
1514 //
1515 #define VI_STRIP(inx) \
1516 reg_t vreg_inx = inx;
1517
1518 #define VI_DUPLICATE_VREG(reg_num, idx_sew) \
1519 reg_t index[P.VU.vlmax]; \
1520 for (reg_t i = 0; i < P.VU.vlmax && P.VU.vl->read() != 0; ++i) { \
1521 switch(idx_sew) { \
1522 case e8: \
1523 index[i] = P.VU.elt<uint8_t>(reg_num, i); \
1524 break; \
1525 case e16: \
1526 index[i] = P.VU.elt<uint16_t>(reg_num, i); \
1527 break; \
1528 case e32: \
1529 index[i] = P.VU.elt<uint32_t>(reg_num, i); \
1530 break; \
1531 case e64: \
1532 index[i] = P.VU.elt<uint64_t>(reg_num, i); \
1533 break; \
1534 } \
1535 }
1536
1537 #define VI_LD(stride, offset, elt_width, is_mask_ldst) \
1538 const reg_t nf = insn.v_nf() + 1; \
1539 const reg_t vl = is_mask_ldst ? ((P.VU.vl->read() + 7) / 8) : P.VU.vl->read(); \
1540 const reg_t baseAddr = RS1; \
1541 const reg_t vd = insn.rd(); \
1542 VI_CHECK_LOAD(elt_width, is_mask_ldst); \
1543 for (reg_t i = 0; i < vl; ++i) { \
1544 VI_ELEMENT_SKIP(i); \
1545 VI_STRIP(i); \
1546 P.VU.vstart->write(i); \
1547 for (reg_t fn = 0; fn < nf; ++fn) { \
1548 elt_width##_t val = MMU.load_##elt_width( \
1549 baseAddr + (stride) + (offset) * sizeof(elt_width##_t)); \
1550 P.VU.elt<elt_width##_t>(vd + fn * emul, vreg_inx, true) = val; \
1551 } \
1552 } \
1553 P.VU.vstart->write(0);
1554
1555 #define VI_LD_INDEX(elt_width, is_seg) \
1556 const reg_t nf = insn.v_nf() + 1; \
1557 const reg_t vl = P.VU.vl->read(); \
1558 const reg_t baseAddr = RS1; \
1559 const reg_t vd = insn.rd(); \
1560 if (!is_seg) \
1561 require(nf == 1); \
1562 VI_CHECK_LD_INDEX(elt_width); \
1563 VI_DUPLICATE_VREG(insn.rs2(), elt_width); \
1564 for (reg_t i = 0; i < vl; ++i) { \
1565 VI_ELEMENT_SKIP(i); \
1566 VI_STRIP(i); \
1567 P.VU.vstart->write(i); \
1568 for (reg_t fn = 0; fn < nf; ++fn) { \
1569 switch(P.VU.vsew){ \
1570 case e8: \
1571 P.VU.elt<uint8_t>(vd + fn * flmul, vreg_inx, true) = \
1572 MMU.load_uint8(baseAddr + index[i] + fn * 1); \
1573 break; \
1574 case e16: \
1575 P.VU.elt<uint16_t>(vd + fn * flmul, vreg_inx, true) = \
1576 MMU.load_uint16(baseAddr + index[i] + fn * 2); \
1577 break; \
1578 case e32: \
1579 P.VU.elt<uint32_t>(vd + fn * flmul, vreg_inx, true) = \
1580 MMU.load_uint32(baseAddr + index[i] + fn * 4); \
1581 break; \
1582 default: \
1583 P.VU.elt<uint64_t>(vd + fn * flmul, vreg_inx, true) = \
1584 MMU.load_uint64(baseAddr + index[i] + fn * 8); \
1585 break; \
1586 } \
1587 } \
1588 } \
1589 P.VU.vstart->write(0);
1590
1591 #define VI_ST(stride, offset, elt_width, is_mask_ldst) \
1592 const reg_t nf = insn.v_nf() + 1; \
1593 const reg_t vl = is_mask_ldst ? ((P.VU.vl->read() + 7) / 8) : P.VU.vl->read(); \
1594 const reg_t baseAddr = RS1; \
1595 const reg_t vs3 = insn.rd(); \
1596 VI_CHECK_STORE(elt_width, is_mask_ldst); \
1597 for (reg_t i = 0; i < vl; ++i) { \
1598 VI_STRIP(i) \
1599 VI_ELEMENT_SKIP(i); \
1600 P.VU.vstart->write(i); \
1601 for (reg_t fn = 0; fn < nf; ++fn) { \
1602 elt_width##_t val = P.VU.elt<elt_width##_t>(vs3 + fn * emul, vreg_inx); \
1603 MMU.store_##elt_width( \
1604 baseAddr + (stride) + (offset) * sizeof(elt_width##_t), val); \
1605 } \
1606 } \
1607 P.VU.vstart->write(0);
1608
1609 #define VI_ST_INDEX(elt_width, is_seg) \
1610 const reg_t nf = insn.v_nf() + 1; \
1611 const reg_t vl = P.VU.vl->read(); \
1612 const reg_t baseAddr = RS1; \
1613 const reg_t vs3 = insn.rd(); \
1614 if (!is_seg) \
1615 require(nf == 1); \
1616 VI_CHECK_ST_INDEX(elt_width); \
1617 VI_DUPLICATE_VREG(insn.rs2(), elt_width); \
1618 for (reg_t i = 0; i < vl; ++i) { \
1619 VI_STRIP(i) \
1620 VI_ELEMENT_SKIP(i); \
1621 P.VU.vstart->write(i); \
1622 for (reg_t fn = 0; fn < nf; ++fn) { \
1623 switch (P.VU.vsew) { \
1624 case e8: \
1625 MMU.store_uint8(baseAddr + index[i] + fn * 1, \
1626 P.VU.elt<uint8_t>(vs3 + fn * flmul, vreg_inx)); \
1627 break; \
1628 case e16: \
1629 MMU.store_uint16(baseAddr + index[i] + fn * 2, \
1630 P.VU.elt<uint16_t>(vs3 + fn * flmul, vreg_inx)); \
1631 break; \
1632 case e32: \
1633 MMU.store_uint32(baseAddr + index[i] + fn * 4, \
1634 P.VU.elt<uint32_t>(vs3 + fn * flmul, vreg_inx)); \
1635 break; \
1636 default: \
1637 MMU.store_uint64(baseAddr + index[i] + fn * 8, \
1638 P.VU.elt<uint64_t>(vs3 + fn * flmul, vreg_inx)); \
1639 break; \
1640 } \
1641 } \
1642 } \
1643 P.VU.vstart->write(0);
1644
1645 #define VI_LDST_FF(elt_width) \
1646 const reg_t nf = insn.v_nf() + 1; \
1647 const reg_t sew = p->VU.vsew; \
1648 const reg_t vl = p->VU.vl->read(); \
1649 const reg_t baseAddr = RS1; \
1650 const reg_t rd_num = insn.rd(); \
1651 VI_CHECK_LOAD(elt_width, false); \
1652 bool early_stop = false; \
1653 for (reg_t i = p->VU.vstart->read(); i < vl; ++i) { \
1654 VI_STRIP(i); \
1655 VI_ELEMENT_SKIP(i); \
1656 \
1657 for (reg_t fn = 0; fn < nf; ++fn) { \
1658 uint64_t val; \
1659 try { \
1660 val = MMU.load_##elt_width( \
1661 baseAddr + (i * nf + fn) * sizeof(elt_width##_t)); \
1662 } catch (trap_t& t) { \
1663 if (i == 0) \
1664 throw; /* Only take exception on zeroth element */ \
1665 /* Reduce VL if an exception occurs on a later element */ \
1666 early_stop = true; \
1667 P.VU.vl->write_raw(i); \
1668 break; \
1669 } \
1670 p->VU.elt<elt_width##_t>(rd_num + fn * emul, vreg_inx, true) = val; \
1671 } \
1672 \
1673 if (early_stop) { \
1674 break; \
1675 } \
1676 } \
1677 p->VU.vstart->write(0);
1678
1679 #define VI_LD_WHOLE(elt_width) \
1680 require_vector_novtype(true, false); \
1681 const reg_t baseAddr = RS1; \
1682 const reg_t vd = insn.rd(); \
1683 const reg_t len = insn.v_nf() + 1; \
1684 require_align(vd, len); \
1685 const reg_t elt_per_reg = P.VU.vlenb / sizeof(elt_width ## _t); \
1686 const reg_t size = len * elt_per_reg; \
1687 if (P.VU.vstart->read() < size) { \
1688 reg_t i = P.VU.vstart->read() / elt_per_reg; \
1689 reg_t off = P.VU.vstart->read() % elt_per_reg; \
1690 if (off) { \
1691 for (reg_t pos = off; pos < elt_per_reg; ++pos) { \
1692 auto val = MMU.load_## elt_width(baseAddr + \
1693 P.VU.vstart->read() * sizeof(elt_width ## _t)); \
1694 P.VU.elt<elt_width ## _t>(vd + i, pos, true) = val; \
1695 P.VU.vstart->write(P.VU.vstart->read() + 1); \
1696 } \
1697 ++i; \
1698 } \
1699 for (; i < len; ++i) { \
1700 for (reg_t pos = 0; pos < elt_per_reg; ++pos) { \
1701 auto val = MMU.load_## elt_width(baseAddr + \
1702 P.VU.vstart->read() * sizeof(elt_width ## _t)); \
1703 P.VU.elt<elt_width ## _t>(vd + i, pos, true) = val; \
1704 P.VU.vstart->write(P.VU.vstart->read() + 1); \
1705 } \
1706 } \
1707 } \
1708 P.VU.vstart = 0; \
1709
1710 #define VI_ST_WHOLE \
1711 require_vector_novtype(true, false); \
1712 const reg_t baseAddr = RS1; \
1713 const reg_t vs3 = insn.rd(); \
1714 const reg_t len = insn.v_nf() + 1; \
1715 require_align(vs3, len); \
1716 const reg_t size = len * P.VU.vlenb; \
1717 \
1718 if (P.VU.vstart->read() < size) { \
1719 reg_t i = P.VU.vstart->read() / P.VU.vlenb; \
1720 reg_t off = P.VU.vstart->read() % P.VU.vlenb; \
1721 if (off) { \
1722 for (reg_t pos = off; pos < P.VU.vlenb; ++pos) { \
1723 auto val = P.VU.elt<uint8_t>(vs3 + i, pos); \
1724 MMU.store_uint8(baseAddr + P.VU.vstart->read(), val); \
1725 P.VU.vstart->write(P.VU.vstart->read() + 1); \
1726 } \
1727 i++; \
1728 } \
1729 for (; i < len; ++i) { \
1730 for (reg_t pos = 0; pos < P.VU.vlenb; ++pos) { \
1731 auto val = P.VU.elt<uint8_t>(vs3 + i, pos); \
1732 MMU.store_uint8(baseAddr + P.VU.vstart->read(), val); \
1733 P.VU.vstart->write(P.VU.vstart->read() + 1); \
1734 } \
1735 } \
1736 } \
1737 P.VU.vstart->write(0);
1738
1739 //
1740 // vector: amo
1741 //
1742 #define VI_AMO(op, type, idx_type) \
1743 require_vector(false); \
1744 require_align(insn.rd(), P.VU.vflmul); \
1745 require(P.VU.vsew <= P.get_xlen() && P.VU.vsew >= 32); \
1746 require_align(insn.rd(), P.VU.vflmul); \
1747 float vemul = ((float)idx_type / P.VU.vsew * P.VU.vflmul); \
1748 require(vemul >= 0.125 && vemul <= 8); \
1749 require_align(insn.rs2(), vemul); \
1750 if (insn.v_wd()) {\
1751 require_vm; \
1752 if (idx_type > P.VU.vsew) { \
1753 if (insn.rd() != insn.rs2()) \
1754 require_noover(insn.rd(), P.VU.vflmul, insn.rs2(), vemul); \
1755 } else if (idx_type < P.VU.vsew) { \
1756 if (vemul < 1) {\
1757 require_noover(insn.rd(), P.VU.vflmul, insn.rs2(), vemul); \
1758 } else {\
1759 require_noover_widen(insn.rd(), P.VU.vflmul, insn.rs2(), vemul); \
1760 } \
1761 } \
1762 } \
1763 VI_DUPLICATE_VREG(insn.rs2(), idx_type); \
1764 const reg_t vl = P.VU.vl->read(); \
1765 const reg_t baseAddr = RS1; \
1766 const reg_t vd = insn.rd(); \
1767 for (reg_t i = P.VU.vstart->read(); i < vl; ++i) { \
1768 VI_ELEMENT_SKIP(i); \
1769 VI_STRIP(i); \
1770 P.VU.vstart->write(i); \
1771 switch (P.VU.vsew) { \
1772 case e32: {\
1773 auto vs3 = P.VU.elt< type ## 32_t>(vd, vreg_inx); \
1774 auto val = MMU.amo_uint32(baseAddr + index[i], [&]( type ## 32_t lhs) { op }); \
1775 if (insn.v_wd()) \
1776 P.VU.elt< type ## 32_t>(vd, vreg_inx, true) = val; \
1777 } \
1778 break; \
1779 case e64: {\
1780 auto vs3 = P.VU.elt< type ## 64_t>(vd, vreg_inx); \
1781 auto val = MMU.amo_uint64(baseAddr + index[i], [&]( type ## 64_t lhs) { op }); \
1782 if (insn.v_wd()) \
1783 P.VU.elt< type ## 64_t>(vd, vreg_inx, true) = val; \
1784 } \
1785 break; \
1786 default: \
1787 require(0); \
1788 break; \
1789 } \
1790 } \
1791 P.VU.vstart->write(0);
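// Usage sketch (illustrative, not the exact upstream bodies): an indexed vector AMO such as
// vamoaddei32.v would expand along the lines of
//   VI_AMO({ return lhs + vs3; }, uint, e32);
// where the body returns the value written back to memory and the old memory value lands in vd
// when insn.v_wd() is set.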
1792
1793 // vector: sign/unsigned extension
1794 #define VI_VV_EXT(div, type) \
1795 require(insn.rd() != insn.rs2()); \
1796 require_vm; \
1797 reg_t from = P.VU.vsew / div; \
1798 require(from >= e8 && from <= e64); \
1799 require(((float)P.VU.vflmul / div) >= 0.125 && ((float)P.VU.vflmul / div) <= 8 ); \
1800 require_align(insn.rd(), P.VU.vflmul); \
1801 require_align(insn.rs2(), P.VU.vflmul / div); \
1802 if ((P.VU.vflmul / div) < 1) { \
1803 require_noover(insn.rd(), P.VU.vflmul, insn.rs2(), P.VU.vflmul / div); \
1804 } else {\
1805 require_noover_widen(insn.rd(), P.VU.vflmul, insn.rs2(), P.VU.vflmul / div); \
1806 } \
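  /* pat packs both element widths in bytes: (destination bytes << 4) | source bytes */ \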
1807 reg_t pat = (((P.VU.vsew >> 3) << 4) | from >> 3); \
1808 VI_GENERAL_LOOP_BASE \
1809 VI_LOOP_ELEMENT_SKIP(); \
1810 switch (pat) { \
1811 case 0x21: \
1812 P.VU.elt<type##16_t>(rd_num, i, true) = P.VU.elt<type##8_t>(rs2_num, i); \
1813 break; \
1814 case 0x41: \
1815 P.VU.elt<type##32_t>(rd_num, i, true) = P.VU.elt<type##8_t>(rs2_num, i); \
1816 break; \
1817 case 0x81: \
1818 P.VU.elt<type##64_t>(rd_num, i, true) = P.VU.elt<type##8_t>(rs2_num, i); \
1819 break; \
1820 case 0x42: \
1821 P.VU.elt<type##32_t>(rd_num, i, true) = P.VU.elt<type##16_t>(rs2_num, i); \
1822 break; \
1823 case 0x82: \
1824 P.VU.elt<type##64_t>(rd_num, i, true) = P.VU.elt<type##16_t>(rs2_num, i); \
1825 break; \
1826 case 0x84: \
1827 P.VU.elt<type##64_t>(rd_num, i, true) = P.VU.elt<type##32_t>(rs2_num, i); \
1828 break; \
1829 case 0x88: \
1830 P.VU.elt<type##64_t>(rd_num, i, true) = P.VU.elt<type##32_t>(rs2_num, i); \
1831 break; \
1832 default: \
1833 break; \
1834 } \
1835 VI_LOOP_END
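// Usage sketch (illustrative): the integer extension instructions pass the width divisor and
// signedness, e.g. vzext.vf2 -> VI_VV_EXT(2, uint) and vsext.vf4 -> VI_VV_EXT(4, int).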
1836
1837 //
1838 // vector: vfp helper
1839 //
1840 #define VI_VFP_COMMON \
1841 require_fp; \
1842 require((P.VU.vsew == e16 && p->extension_enabled(EXT_ZFH)) || \
1843 (P.VU.vsew == e32 && p->extension_enabled('F')) || \
1844 (P.VU.vsew == e64 && p->extension_enabled('D'))); \
1845 require_vector(true);\
1846 require(STATE.frm->read() < 0x5);\
1847 reg_t vl = P.VU.vl->read(); \
1848 reg_t rd_num = insn.rd(); \
1849 reg_t rs1_num = insn.rs1(); \
1850 reg_t rs2_num = insn.rs2(); \
1851 softfloat_roundingMode = STATE.frm->read();
1852
1853 #define VI_VFP_LOOP_BASE \
1854 VI_VFP_COMMON \
1855 for (reg_t i=P.VU.vstart->read(); i<vl; ++i){ \
1856 VI_LOOP_ELEMENT_SKIP();
1857
1858 #define VI_VFP_LOOP_CMP_BASE \
1859 VI_VFP_COMMON \
1860 for (reg_t i = P.VU.vstart->read(); i < vl; ++i) { \
1861 VI_LOOP_ELEMENT_SKIP(); \
1862 uint64_t mmask = UINT64_C(1) << mpos; \
1863 uint64_t &vdi = P.VU.elt<uint64_t>(rd_num, midx, true); \
1864 uint64_t res = 0;
1865
1866 #define VI_VFP_LOOP_REDUCTION_BASE(width) \
1867 float##width##_t vd_0 = P.VU.elt<float##width##_t>(rd_num, 0); \
1868 float##width##_t vs1_0 = P.VU.elt<float##width##_t>(rs1_num, 0); \
1869 vd_0 = vs1_0; \
1870 bool is_active = false; \
1871 for (reg_t i=P.VU.vstart->read(); i<vl; ++i){ \
1872 VI_LOOP_ELEMENT_SKIP(); \
1873 float##width##_t vs2 = P.VU.elt<float##width##_t>(rs2_num, i); \
1874 is_active = true; \
1875
1876 #define VI_VFP_LOOP_WIDE_REDUCTION_BASE \
1877 VI_VFP_COMMON \
1878 float64_t vd_0 = f64(P.VU.elt<float64_t>(rs1_num, 0).v); \
1879 for (reg_t i=P.VU.vstart->read(); i<vl; ++i) { \
1880 VI_LOOP_ELEMENT_SKIP();
1881
1882 #define VI_VFP_LOOP_END \
1883 } \
1884 P.VU.vstart->write(0); \
1885
1886 #define VI_VFP_LOOP_REDUCTION_END(x) \
1887 } \
1888 P.VU.vstart->write(0); \
1889 if (vl > 0) { \
1890 if (is_propagate && !is_active) { \
1891 switch (x) { \
1892 case e16: {\
1893 auto ret = f16_classify(f16(vd_0.v)); \
1894 if (ret & 0x300) { \
1895 if (ret & 0x100) { \
1896 softfloat_exceptionFlags |= softfloat_flag_invalid; \
1897 set_fp_exceptions; \
1898 } \
1899 P.VU.elt<uint16_t>(rd_num, 0, true) = defaultNaNF16UI; \
1900 } else { \
1901 P.VU.elt<uint16_t>(rd_num, 0, true) = vd_0.v; \
1902 } \
1903 } \
1904 break; \
1905 case e32: { \
1906 auto ret = f32_classify(f32(vd_0.v)); \
1907 if (ret & 0x300) { \
1908 if (ret & 0x100) { \
1909 softfloat_exceptionFlags |= softfloat_flag_invalid; \
1910 set_fp_exceptions; \
1911 } \
1912 P.VU.elt<uint32_t>(rd_num, 0, true) = defaultNaNF32UI; \
1913 } else { \
1914 P.VU.elt<uint32_t>(rd_num, 0, true) = vd_0.v; \
1915 } \
1916 } \
1917 break; \
1918 case e64: {\
1919 auto ret = f64_classify(f64(vd_0.v)); \
1920 if (ret & 0x300) { \
1921 if (ret & 0x100) { \
1922 softfloat_exceptionFlags |= softfloat_flag_invalid; \
1923 set_fp_exceptions; \
1924 } \
1925 P.VU.elt<uint64_t>(rd_num, 0, true) = defaultNaNF64UI; \
1926 } else { \
1927 P.VU.elt<uint64_t>(rd_num, 0, true) = vd_0.v; \
1928 } \
1929 } \
1930 break; \
1931 } \
1932 } else { \
1933 P.VU.elt<type_sew_t<x>::type>(rd_num, 0, true) = vd_0.v; \
1934 } \
1935 }
1936
1937 #define VI_VFP_LOOP_CMP_END \
1938 switch(P.VU.vsew) { \
1939 case e16: \
1940 case e32: \
1941 case e64: { \
1942 vdi = (vdi & ~mmask) | (((res) << mpos) & mmask); \
1943 break; \
1944 } \
1945 default: \
1946 require(0); \
1947 break; \
1948 }; \
1949 } \
1950 P.VU.vstart->write(0);
1951
1952 #define VI_VFP_VV_LOOP(BODY16, BODY32, BODY64) \
1953 VI_CHECK_SSS(true); \
1954 VI_VFP_LOOP_BASE \
1955 switch(P.VU.vsew) { \
1956 case e16: {\
1957 float16_t &vd = P.VU.elt<float16_t>(rd_num, i, true); \
1958 float16_t vs1 = P.VU.elt<float16_t>(rs1_num, i); \
1959 float16_t vs2 = P.VU.elt<float16_t>(rs2_num, i); \
1960 BODY16; \
1961 set_fp_exceptions; \
1962 break; \
1963 }\
1964 case e32: {\
1965 float32_t &vd = P.VU.elt<float32_t>(rd_num, i, true); \
1966 float32_t vs1 = P.VU.elt<float32_t>(rs1_num, i); \
1967 float32_t vs2 = P.VU.elt<float32_t>(rs2_num, i); \
1968 BODY32; \
1969 set_fp_exceptions; \
1970 break; \
1971 }\
1972 case e64: {\
1973 float64_t &vd = P.VU.elt<float64_t>(rd_num, i, true); \
1974 float64_t vs1 = P.VU.elt<float64_t>(rs1_num, i); \
1975 float64_t vs2 = P.VU.elt<float64_t>(rs2_num, i); \
1976 BODY64; \
1977 set_fp_exceptions; \
1978 break; \
1979 }\
1980 default: \
1981 require(0); \
1982 break; \
1983 }; \
1984 DEBUG_RVV_FP_VV; \
1985 VI_VFP_LOOP_END
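// Usage sketch (illustrative): a vector-vector FP op such as vfadd.vv is expected to expand
// roughly as
//   VI_VFP_VV_LOOP({ vd = f16_add(vs1, vs2); },
//                  { vd = f32_add(vs1, vs2); },
//                  { vd = f64_add(vs1, vs2); })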
1986
1987 #define VI_VFP_V_LOOP(BODY16, BODY32, BODY64) \
1988 VI_CHECK_SSS(false); \
1989 VI_VFP_LOOP_BASE \
1990 switch(P.VU.vsew) { \
1991 case e16: {\
1992 float16_t &vd = P.VU.elt<float16_t>(rd_num, i, true); \
1993 float16_t vs2 = P.VU.elt<float16_t>(rs2_num, i); \
1994 BODY16; \
1995 break; \
1996 }\
1997 case e32: {\
1998 float32_t &vd = P.VU.elt<float32_t>(rd_num, i, true); \
1999 float32_t vs2 = P.VU.elt<float32_t>(rs2_num, i); \
2000 BODY32; \
2001 break; \
2002 }\
2003 case e64: {\
2004 float64_t &vd = P.VU.elt<float64_t>(rd_num, i, true); \
2005 float64_t vs2 = P.VU.elt<float64_t>(rs2_num, i); \
2006 BODY64; \
2007 break; \
2008 }\
2009 default: \
2010 require(0); \
2011 break; \
2012 }; \
2013 set_fp_exceptions; \
2014 VI_VFP_LOOP_END
2015
2016 #define VI_VFP_VV_LOOP_REDUCTION(BODY16, BODY32, BODY64) \
2017 VI_CHECK_REDUCTION(false) \
2018 VI_VFP_COMMON \
2019 switch(P.VU.vsew) { \
2020 case e16: {\
2021 VI_VFP_LOOP_REDUCTION_BASE(16) \
2022 BODY16; \
2023 set_fp_exceptions; \
2024 VI_VFP_LOOP_REDUCTION_END(e16) \
2025 break; \
2026 }\
2027 case e32: {\
2028 VI_VFP_LOOP_REDUCTION_BASE(32) \
2029 BODY32; \
2030 set_fp_exceptions; \
2031 VI_VFP_LOOP_REDUCTION_END(e32) \
2032 break; \
2033 }\
2034 case e64: {\
2035 VI_VFP_LOOP_REDUCTION_BASE(64) \
2036 BODY64; \
2037 set_fp_exceptions; \
2038 VI_VFP_LOOP_REDUCTION_END(e64) \
2039 break; \
2040 }\
2041 default: \
2042 require(0); \
2043 break; \
2044 }; \
2045
2046 #define VI_VFP_VV_LOOP_WIDE_REDUCTION(BODY16, BODY32) \
2047 VI_CHECK_REDUCTION(true) \
2048 VI_VFP_COMMON \
2049 require((P.VU.vsew == e16 && p->extension_enabled('F')) || \
2050 (P.VU.vsew == e32 && p->extension_enabled('D'))); \
2051 bool is_active = false; \
2052 switch(P.VU.vsew) { \
2053 case e16: {\
2054 float32_t vd_0 = P.VU.elt<float32_t>(rs1_num, 0); \
2055 for (reg_t i=P.VU.vstart->read(); i<vl; ++i) { \
2056 VI_LOOP_ELEMENT_SKIP(); \
2057 is_active = true; \
2058 float32_t vs2 = f16_to_f32(P.VU.elt<float16_t>(rs2_num, i)); \
2059 BODY16; \
2060 set_fp_exceptions; \
2061 VI_VFP_LOOP_REDUCTION_END(e32) \
2062 break; \
2063 }\
2064 case e32: {\
2065 float64_t vd_0 = P.VU.elt<float64_t>(rs1_num, 0); \
2066 for (reg_t i=P.VU.vstart->read(); i<vl; ++i) { \
2067 VI_LOOP_ELEMENT_SKIP(); \
2068 is_active = true; \
2069 float64_t vs2 = f32_to_f64(P.VU.elt<float32_t>(rs2_num, i)); \
2070 BODY32; \
2071 set_fp_exceptions; \
2072 VI_VFP_LOOP_REDUCTION_END(e64) \
2073 break; \
2074 }\
2075 default: \
2076 require(0); \
2077 break; \
2078 }; \
2079
2080 #define VI_VFP_VF_LOOP(BODY16, BODY32, BODY64) \
2081 VI_CHECK_SSS(false); \
2082 VI_VFP_LOOP_BASE \
2083 switch(P.VU.vsew) { \
2084 case e16: {\
2085 float16_t &vd = P.VU.elt<float16_t>(rd_num, i, true); \
2086 float16_t rs1 = f16(READ_FREG(rs1_num)); \
2087 float16_t vs2 = P.VU.elt<float16_t>(rs2_num, i); \
2088 BODY16; \
2089 set_fp_exceptions; \
2090 break; \
2091 }\
2092 case e32: {\
2093 float32_t &vd = P.VU.elt<float32_t>(rd_num, i, true); \
2094 float32_t rs1 = f32(READ_FREG(rs1_num)); \
2095 float32_t vs2 = P.VU.elt<float32_t>(rs2_num, i); \
2096 BODY32; \
2097 set_fp_exceptions; \
2098 break; \
2099 }\
2100 case e64: {\
2101 float64_t &vd = P.VU.elt<float64_t>(rd_num, i, true); \
2102 float64_t rs1 = f64(READ_FREG(rs1_num)); \
2103 float64_t vs2 = P.VU.elt<float64_t>(rs2_num, i); \
2104 BODY64; \
2105 set_fp_exceptions; \
2106 break; \
2107 }\
2108 default: \
2109 require(0); \
2110 break; \
2111 }; \
2112 DEBUG_RVV_FP_VF; \
2113 VI_VFP_LOOP_END
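// Usage sketch (illustrative): the vector-scalar form, e.g. vfadd.vf, reads the scalar FP
// operand as rs1:
//   VI_VFP_VF_LOOP({ vd = f16_add(rs1, vs2); },
//                  { vd = f32_add(rs1, vs2); },
//                  { vd = f64_add(rs1, vs2); })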
2114
2115 #define VI_VFP_LOOP_CMP(BODY16, BODY32, BODY64, is_vs1) \
2116 VI_CHECK_MSS(is_vs1); \
2117 VI_VFP_LOOP_CMP_BASE \
2118 switch(P.VU.vsew) { \
2119 case e16: {\
2120 float16_t vs2 = P.VU.elt<float16_t>(rs2_num, i); \
2121 float16_t vs1 = P.VU.elt<float16_t>(rs1_num, i); \
2122 float16_t rs1 = f16(READ_FREG(rs1_num)); \
2123 BODY16; \
2124 set_fp_exceptions; \
2125 break; \
2126 }\
2127 case e32: {\
2128 float32_t vs2 = P.VU.elt<float32_t>(rs2_num, i); \
2129 float32_t vs1 = P.VU.elt<float32_t>(rs1_num, i); \
2130 float32_t rs1 = f32(READ_FREG(rs1_num)); \
2131 BODY32; \
2132 set_fp_exceptions; \
2133 break; \
2134 }\
2135 case e64: {\
2136 float64_t vs2 = P.VU.elt<float64_t>(rs2_num, i); \
2137 float64_t vs1 = P.VU.elt<float64_t>(rs1_num, i); \
2138 float64_t rs1 = f64(READ_FREG(rs1_num)); \
2139 BODY64; \
2140 set_fp_exceptions; \
2141 break; \
2142 }\
2143 default: \
2144 require(0); \
2145 break; \
2146 }; \
2147 VI_VFP_LOOP_CMP_END \
2148
2149 #define VI_VFP_VF_LOOP_WIDE(BODY16, BODY32) \
2150 VI_CHECK_DSS(false); \
2151 VI_VFP_LOOP_BASE \
2152 switch(P.VU.vsew) { \
2153 case e16: { \
2154 float32_t &vd = P.VU.elt<float32_t>(rd_num, i, true); \
2155 float32_t vs2 = f16_to_f32(P.VU.elt<float16_t>(rs2_num, i)); \
2156 float32_t rs1 = f16_to_f32(f16(READ_FREG(rs1_num))); \
2157 BODY16; \
2158 set_fp_exceptions; \
2159 break; \
2160 } \
2161 case e32: {\
2162 float64_t &vd = P.VU.elt<float64_t>(rd_num, i, true); \
2163 float64_t vs2 = f32_to_f64(P.VU.elt<float32_t>(rs2_num, i)); \
2164 float64_t rs1 = f32_to_f64(f32(READ_FREG(rs1_num))); \
2165 BODY32; \
2166 set_fp_exceptions; \
2167 break; \
2168 }\
2169 default: \
2170 require(0); \
2171 break; \
2172 }; \
2173 DEBUG_RVV_FP_VV; \
2174 VI_VFP_LOOP_END
2175
2176
2177 #define VI_VFP_VV_LOOP_WIDE(BODY16, BODY32) \
2178 VI_CHECK_DSS(true); \
2179 VI_VFP_LOOP_BASE \
2180 switch(P.VU.vsew) { \
2181 case e16: {\
2182 float32_t &vd = P.VU.elt<float32_t>(rd_num, i, true); \
2183 float32_t vs2 = f16_to_f32(P.VU.elt<float16_t>(rs2_num, i)); \
2184 float32_t vs1 = f16_to_f32(P.VU.elt<float16_t>(rs1_num, i)); \
2185 BODY16; \
2186 set_fp_exceptions; \
2187 break; \
2188 }\
2189 case e32: {\
2190 float64_t &vd = P.VU.elt<float64_t>(rd_num, i, true); \
2191 float64_t vs2 = f32_to_f64(P.VU.elt<float32_t>(rs2_num, i)); \
2192 float64_t vs1 = f32_to_f64(P.VU.elt<float32_t>(rs1_num, i)); \
2193 BODY32; \
2194 set_fp_exceptions; \
2195 break; \
2196 }\
2197 default: \
2198 require(0); \
2199 break; \
2200 }; \
2201 DEBUG_RVV_FP_VV; \
2202 VI_VFP_LOOP_END
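// Usage sketch (illustrative): widening ops such as vfwadd.vv compute in the doubled SEW, e.g.
//   VI_VFP_VV_LOOP_WIDE({ vd = f32_add(vs1, vs2); }, { vd = f64_add(vs1, vs2); })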
2203
2204 #define VI_VFP_WF_LOOP_WIDE(BODY16, BODY32) \
2205 VI_CHECK_DDS(false); \
2206 VI_VFP_LOOP_BASE \
2207 switch(P.VU.vsew) { \
2208 case e16: {\
2209 float32_t &vd = P.VU.elt<float32_t>(rd_num, i, true); \
2210 float32_t vs2 = P.VU.elt<float32_t>(rs2_num, i); \
2211 float32_t rs1 = f16_to_f32(f16(READ_FREG(rs1_num))); \
2212 BODY16; \
2213 set_fp_exceptions; \
2214 break; \
2215 }\
2216 case e32: {\
2217 float64_t &vd = P.VU.elt<float64_t>(rd_num, i, true); \
2218 float64_t vs2 = P.VU.elt<float64_t>(rs2_num, i); \
2219 float64_t rs1 = f32_to_f64(f32(READ_FREG(rs1_num))); \
2220 BODY32; \
2221 set_fp_exceptions; \
2222 break; \
2223 }\
2224 default: \
2225 require(0); \
2226 }; \
2227 DEBUG_RVV_FP_VV; \
2228 VI_VFP_LOOP_END
2229
2230 #define VI_VFP_WV_LOOP_WIDE(BODY16, BODY32) \
2231 VI_CHECK_DDS(true); \
2232 VI_VFP_LOOP_BASE \
2233 switch(P.VU.vsew) { \
2234 case e16: {\
2235 float32_t &vd = P.VU.elt<float32_t>(rd_num, i, true); \
2236 float32_t vs2 = P.VU.elt<float32_t>(rs2_num, i); \
2237 float32_t vs1 = f16_to_f32(P.VU.elt<float16_t>(rs1_num, i)); \
2238 BODY16; \
2239 set_fp_exceptions; \
2240 break; \
2241 }\
2242 case e32: {\
2243 float64_t &vd = P.VU.elt<float64_t>(rd_num, i, true); \
2244 float64_t vs2 = P.VU.elt<float64_t>(rs2_num, i); \
2245 float64_t vs1 = f32_to_f64(P.VU.elt<float32_t>(rs1_num, i)); \
2246 BODY32; \
2247 set_fp_exceptions; \
2248 break; \
2249 }\
2250 default: \
2251 require(0); \
2252 }; \
2253 DEBUG_RVV_FP_VV; \
2254 VI_VFP_LOOP_END
2255
2256 #define VI_VFP_LOOP_SCALE_BASE \
2257 require_fp; \
2258 require_vector(true);\
2259 require((P.VU.vsew == e8 && p->extension_enabled(EXT_ZFH)) || \
2260 (P.VU.vsew == e16 && p->extension_enabled('F')) || \
2261 (P.VU.vsew == e32 && p->extension_enabled('D'))); \
2262 require(STATE.frm->read() < 0x5);\
2263 reg_t vl = P.VU.vl->read(); \
2264 reg_t rd_num = insn.rd(); \
2265 reg_t rs1_num = insn.rs1(); \
2266 reg_t rs2_num = insn.rs2(); \
2267 softfloat_roundingMode = STATE.frm->read(); \
2268 for (reg_t i=P.VU.vstart->read(); i<vl; ++i){ \
2269 VI_LOOP_ELEMENT_SKIP();
2270
2271 #define VI_VFP_CVT_SCALE(BODY8, BODY16, BODY32, \
2272 CHECK8, CHECK16, CHECK32, \
2273 is_widen, eew_check) \
2274 if (is_widen) { \
2275 VI_CHECK_DSS(false);\
2276 } else { \
2277 VI_CHECK_SDS(false); \
2278 } \
2279 require(eew_check); \
2280 switch(P.VU.vsew) { \
2281 case e8: {\
2282 CHECK8 \
2283 VI_VFP_LOOP_SCALE_BASE \
2284 BODY8 \
2285 set_fp_exceptions; \
2286 VI_VFP_LOOP_END \
2287 } \
2288 break; \
2289 case e16: {\
2290 CHECK16 \
2291 VI_VFP_LOOP_SCALE_BASE \
2292 BODY16 \
2293 set_fp_exceptions; \
2294 VI_VFP_LOOP_END \
2295 } \
2296 break; \
2297 case e32: {\
2298 CHECK32 \
2299 VI_VFP_LOOP_SCALE_BASE \
2300 BODY32 \
2301 set_fp_exceptions; \
2302 VI_VFP_LOOP_END \
2303 } \
2304 break; \
2305 default: \
2306 require(0); \
2307 break; \
2308 }
2309
2310 // The P-extension support is contributed by
2311 // Programming Language Lab, Department of Computer Science, National Tsing Hua University, Taiwan
2312
2313 #define P_FIELD(R, INDEX, SIZE) \
2314 (type_sew_t<SIZE>::type)get_field(R, make_mask64(((INDEX) * SIZE), SIZE))
2315
2316 #define P_UFIELD(R, INDEX, SIZE) \
2317 (type_usew_t<SIZE>::type)get_field(R, make_mask64(((INDEX) * SIZE), SIZE))
2318
2319 #define P_B(R, INDEX) P_UFIELD(R, INDEX, 8)
2320 #define P_H(R, INDEX) P_UFIELD(R, INDEX, 16)
2321 #define P_W(R, INDEX) P_UFIELD(R, INDEX, 32)
2322 #define P_SB(R, INDEX) P_FIELD(R, INDEX, 8)
2323 #define P_SH(R, INDEX) P_FIELD(R, INDEX, 16)
2324 #define P_SW(R, INDEX) P_FIELD(R, INDEX, 32)
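// Worked example: P_H(R, 1) expands to P_UFIELD(R, 1, 16), i.e. bits [31:16] of R as an
// unsigned 16-bit value; P_SH(R, 1) yields the same bits as a signed 16-bit value.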
2325
2326 #define READ_REG_PAIR(reg) ({ \
2327 require((reg) % 2 == 0); \
2328 (reg) == 0 ? reg_t(0) : \
2329 (READ_REG((reg) + 1) << 32) + zext32(READ_REG(reg)); })
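// On RV32, a 64-bit operand occupies an even/odd register pair: the even register holds the
// low 32 bits, the odd register the high 32 bits, and pair x0 always reads as zero.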
2330
2331 #define RS1_PAIR READ_REG_PAIR(insn.rs1())
2332 #define RS2_PAIR READ_REG_PAIR(insn.rs2())
2333 #define RD_PAIR READ_REG_PAIR(insn.rd())
2334
2335 #define WRITE_PD() \
2336 rd_tmp = set_field(rd_tmp, make_mask64((i * sizeof(pd) * 8), sizeof(pd) * 8), pd);
2337
2338 #define WRITE_RD_PAIR(value) \
2339 if (insn.rd() != 0) { \
2340 require(insn.rd() % 2 == 0); \
2341 WRITE_REG(insn.rd(), sext32(value)); \
2342 WRITE_REG(insn.rd() + 1, (sreg_t(value)) >> 32); \
2343 }
2344
2345 #define P_SET_OV(ov) \
2346 if (ov) P.VU.vxsat->write(1);
2347
2348 #define P_SAT(R, BIT) \
2349 if (R > INT##BIT##_MAX) { \
2350 R = INT##BIT##_MAX; \
2351 P_SET_OV(1); \
2352 } else if (R < INT##BIT##_MIN) { \
2353 R = INT##BIT##_MIN; \
2354 P_SET_OV(1); \
2355 }
2356
2357 #define P_SATU(R, BIT) \
2358 if (R > UINT##BIT##_MAX) { \
2359 R = UINT##BIT##_MAX; \
2360 P_SET_OV(1); \
2361 } else if (R < 0) { \
2362 P_SET_OV(1); \
2363 R = 0; \
2364 }
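// Worked example: P_SAT(res, 8) clamps res to [INT8_MIN, INT8_MAX] (e.g. 300 becomes 127) and
// sets vxsat whenever clamping occurs; P_SATU(res, 8) clamps to [0, UINT8_MAX] instead.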
2365
2366 #define P_LOOP_BASE(BIT) \
2367 require_extension(EXT_ZPN); \
2368 require(BIT == e8 || BIT == e16 || BIT == e32); \
2369 reg_t rd_tmp = RD; \
2370 reg_t rs1 = RS1; \
2371 reg_t rs2 = RS2; \
2372 sreg_t len = xlen / BIT; \
2373 for (sreg_t i = len - 1; i >= 0; --i) {
2374
2375 #define P_ONE_LOOP_BASE(BIT) \
2376 require_extension(EXT_ZPN); \
2377 require(BIT == e8 || BIT == e16 || BIT == e32); \
2378 reg_t rd_tmp = RD; \
2379 reg_t rs1 = RS1; \
2380 sreg_t len = xlen / BIT; \
2381 for (sreg_t i = len - 1; i >= 0; --i) {
2382
2383 #define P_I_LOOP_BASE(BIT, IMMBIT) \
2384 require_extension(EXT_ZPN); \
2385 require(BIT == e8 || BIT == e16 || BIT == e32); \
2386 reg_t rd_tmp = RD; \
2387 reg_t rs1 = RS1; \
2388 type_usew_t<BIT>::type imm##IMMBIT##u = insn.p_imm##IMMBIT(); \
2389 sreg_t len = xlen / BIT; \
2390 for (sreg_t i = len - 1; i >= 0; --i) {
2391
2392 #define P_X_LOOP_BASE(BIT, LOWBIT) \
2393 require_extension(EXT_ZPN); \
2394 require(BIT == e8 || BIT == e16 || BIT == e32); \
2395 reg_t rd_tmp = RD; \
2396 reg_t rs1 = RS1; \
2397 type_usew_t<BIT>::type sa = RS2 & ((uint64_t(1) << LOWBIT) - 1); \
2398 type_sew_t<BIT>::type ssa = int64_t(RS2) << (64 - LOWBIT) >> (64 - LOWBIT); \
2399 sreg_t len = xlen / BIT; \
2400 for (sreg_t i = len - 1; i >= 0; --i) {
2401
2402 #define P_MUL_LOOP_BASE(BIT) \
2403 require_extension(EXT_ZPN); \
2404 require(BIT == e8 || BIT == e16 || BIT == e32); \
2405 reg_t rd_tmp = RD; \
2406 reg_t rs1 = RS1; \
2407 reg_t rs2 = RS2; \
2408 sreg_t len = 32 / BIT; \
2409 for (sreg_t i = len - 1; i >= 0; --i) {
2410
2411 #define P_REDUCTION_LOOP_BASE(BIT, BIT_INNER, USE_RD) \
2412 require_extension(EXT_ZPN); \
2413 require(BIT == e16 || BIT == e32 || BIT == e64); \
2414 reg_t rd_tmp = USE_RD ? zext_xlen(RD) : 0; \
2415 reg_t rs1 = zext_xlen(RS1); \
2416 reg_t rs2 = zext_xlen(RS2); \
2417 sreg_t len = 64 / BIT; \
2418 sreg_t len_inner = BIT / BIT_INNER; \
2419 for (sreg_t i = len - 1; i >= 0; --i) { \
2420 sreg_t pd_res = P_FIELD(rd_tmp, i, BIT); \
2421 for (sreg_t j = i * len_inner; j < (i + 1) * len_inner; ++j) {
2422
2423 #define P_REDUCTION_ULOOP_BASE(BIT, BIT_INNER, USE_RD) \
2424 require_extension(EXT_ZPN); \
2425 require(BIT == e16 || BIT == e32 || BIT == e64); \
2426 reg_t rd_tmp = USE_RD ? zext_xlen(RD) : 0; \
2427 reg_t rs1 = zext_xlen(RS1); \
2428 reg_t rs2 = zext_xlen(RS2); \
2429 sreg_t len = 64 / BIT; \
2430 sreg_t len_inner = BIT / BIT_INNER; \
2431 for (sreg_t i = len - 1; i >=0; --i) { \
2432 reg_t pd_res = P_UFIELD(rd_tmp, i, BIT); \
2433 for (sreg_t j = i * len_inner; j < (i + 1) * len_inner; ++j) {
2434
2435 #define P_PARAMS(BIT) \
2436 auto pd = P_FIELD(rd_tmp, i, BIT); \
2437 auto ps1 = P_FIELD(rs1, i, BIT); \
2438 auto ps2 = P_FIELD(rs2, i, BIT);
2439
2440 #define P_UPARAMS(BIT) \
2441 auto pd = P_UFIELD(rd_tmp, i, BIT); \
2442 auto ps1 = P_UFIELD(rs1, i, BIT); \
2443 auto ps2 = P_UFIELD(rs2, i, BIT);
2444
2445 #define P_CROSS_PARAMS(BIT) \
2446 auto pd = P_FIELD(rd_tmp, i, BIT); \
2447 auto ps1 = P_FIELD(rs1, i, BIT); \
2448 auto ps2 = P_FIELD(rs2, (i ^ 1), BIT);
2449
2450 #define P_CROSS_UPARAMS(BIT) \
2451 auto pd = P_UFIELD(rd_tmp, i, BIT); \
2452 auto ps1 = P_UFIELD(rs1, i, BIT); \
2453 auto ps2 = P_UFIELD(rs2, (i ^ 1), BIT);
2454
2455 #define P_ONE_PARAMS(BIT) \
2456 auto pd = P_FIELD(rd_tmp, i, BIT); \
2457 auto ps1 = P_FIELD(rs1, i, BIT);
2458
2459 #define P_ONE_UPARAMS(BIT) \
2460 auto pd = P_UFIELD(rd_tmp, i, BIT); \
2461 auto ps1 = P_UFIELD(rs1, i, BIT);
2462
2463 #define P_ONE_SUPARAMS(BIT) \
2464 auto pd = P_UFIELD(rd_tmp, i, BIT); \
2465 auto ps1 = P_FIELD(rs1, i, BIT);
2466
2467 #define P_MUL_PARAMS(BIT) \
2468 auto pd = P_FIELD(rd_tmp, i, BIT * 2); \
2469 auto ps1 = P_FIELD(rs1, i, BIT); \
2470 auto ps2 = P_FIELD(rs2, i, BIT);
2471
2472 #define P_MUL_UPARAMS(BIT) \
2473 auto pd = P_UFIELD(rd_tmp, i, BIT * 2); \
2474 auto ps1 = P_UFIELD(rs1, i, BIT); \
2475 auto ps2 = P_UFIELD(rs2, i, BIT);
2476
2477 #define P_MUL_CROSS_PARAMS(BIT) \
2478 auto pd = P_FIELD(rd_tmp, i, BIT * 2); \
2479 auto ps1 = P_FIELD(rs1, i, BIT); \
2480 auto ps2 = P_FIELD(rs2, (i ^ 1), BIT);
2481
2482 #define P_MUL_CROSS_UPARAMS(BIT) \
2483 auto pd = P_UFIELD(rd_tmp, i, BIT*2); \
2484 auto ps1 = P_UFIELD(rs1, i, BIT); \
2485 auto ps2 = P_UFIELD(rs2, (i ^ 1), BIT);
2486
2487 #define P_REDUCTION_PARAMS(BIT_INNER) \
2488 auto ps1 = P_FIELD(rs1, j, BIT_INNER); \
2489 auto ps2 = P_FIELD(rs2, j, BIT_INNER);
2490
2491 #define P_REDUCTION_UPARAMS(BIT_INNER) \
2492 auto ps1 = P_UFIELD(rs1, j, BIT_INNER); \
2493 auto ps2 = P_UFIELD(rs2, j, BIT_INNER);
2494
2495 #define P_REDUCTION_SUPARAMS(BIT_INNER) \
2496 auto ps1 = P_FIELD(rs1, j, BIT_INNER); \
2497 auto ps2 = P_UFIELD(rs2, j, BIT_INNER);
2498
2499 #define P_REDUCTION_CROSS_PARAMS(BIT_INNER) \
2500 auto ps1 = P_FIELD(rs1, j, BIT_INNER); \
2501 auto ps2 = P_FIELD(rs2, (j ^ 1), BIT_INNER);
2502
2503 #define P_LOOP_BODY(BIT, BODY) { \
2504 P_PARAMS(BIT) \
2505 BODY \
2506 WRITE_PD(); \
2507 }
2508
2509 #define P_ULOOP_BODY(BIT, BODY) { \
2510 P_UPARAMS(BIT) \
2511 BODY \
2512 WRITE_PD(); \
2513 }
2514
2515 #define P_ONE_LOOP_BODY(BIT, BODY) { \
2516 P_ONE_PARAMS(BIT) \
2517 BODY \
2518 WRITE_PD(); \
2519 }
2520
2521 #define P_CROSS_LOOP_BODY(BIT, BODY) { \
2522 P_CROSS_PARAMS(BIT) \
2523 BODY \
2524 WRITE_PD(); \
2525 }
2526
2527 #define P_CROSS_ULOOP_BODY(BIT, BODY) { \
2528 P_CROSS_UPARAMS(BIT) \
2529 BODY \
2530 WRITE_PD(); \
2531 }
2532
2533 #define P_ONE_ULOOP_BODY(BIT, BODY) { \
2534 P_ONE_UPARAMS(BIT) \
2535 BODY \
2536 WRITE_PD(); \
2537 }
2538
2539 #define P_MUL_LOOP_BODY(BIT, BODY) { \
2540 P_MUL_PARAMS(BIT) \
2541 BODY \
2542 WRITE_PD(); \
2543 }
2544
2545 #define P_MUL_ULOOP_BODY(BIT, BODY) { \
2546 P_MUL_UPARAMS(BIT) \
2547 BODY \
2548 WRITE_PD(); \
2549 }
2550
2551 #define P_MUL_CROSS_LOOP_BODY(BIT, BODY) { \
2552 P_MUL_CROSS_PARAMS(BIT) \
2553 BODY \
2554 WRITE_PD(); \
2555 }
2556
2557 #define P_MUL_CROSS_ULOOP_BODY(BIT, BODY) { \
2558 P_MUL_CROSS_UPARAMS(BIT) \
2559 BODY \
2560 WRITE_PD(); \
2561 }
2562
2563 #define P_LOOP(BIT, BODY) \
2564 P_LOOP_BASE(BIT) \
2565 P_LOOP_BODY(BIT, BODY) \
2566 P_LOOP_END()
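// Usage sketch (illustrative): a plain element-wise SIMD op such as add16 is expected to
// expand roughly as P_LOOP(16, { pd = ps1 + ps2; }).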
2567
2568 #define P_ONE_LOOP(BIT, BODY) \
2569 P_ONE_LOOP_BASE(BIT) \
2570 P_ONE_LOOP_BODY(BIT, BODY) \
2571 P_LOOP_END()
2572
2573 #define P_ULOOP(BIT, BODY) \
2574 P_LOOP_BASE(BIT) \
2575 P_ULOOP_BODY(BIT, BODY) \
2576 P_LOOP_END()
2577
2578 #define P_CROSS_LOOP(BIT, BODY1, BODY2) \
2579 P_LOOP_BASE(BIT) \
2580 P_CROSS_LOOP_BODY(BIT, BODY1) \
2581 --i; \
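  /* an omitted BODY2 stringizes to "" (sizeof 1), in which case BODY1 is reused for the second element */ \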
2582 if (sizeof(#BODY2) == 1) { \
2583 P_CROSS_LOOP_BODY(BIT, BODY1) \
2584 } \
2585 else { \
2586 P_CROSS_LOOP_BODY(BIT, BODY2) \
2587 } \
2588 P_LOOP_END()
2589
2590 #define P_CROSS_ULOOP(BIT, BODY1, BODY2) \
2591 P_LOOP_BASE(BIT) \
2592 P_CROSS_ULOOP_BODY(BIT, BODY1) \
2593 --i; \
2594 P_CROSS_ULOOP_BODY(BIT, BODY2) \
2595 P_LOOP_END()
2596
2597 #define P_STRAIGHT_LOOP(BIT, BODY1, BODY2) \
2598 P_LOOP_BASE(BIT) \
2599 P_LOOP_BODY(BIT, BODY1) \
2600 --i; \
2601 P_LOOP_BODY(BIT, BODY2) \
2602 P_LOOP_END()
2603
2604 #define P_STRAIGHT_ULOOP(BIT, BODY1, BODY2) \
2605 P_LOOP_BASE(BIT) \
2606 P_ULOOP_BODY(BIT, BODY1) \
2607 --i; \
2608 P_ULOOP_BODY(BIT, BODY2) \
2609 P_LOOP_END()
2610
2611 #define P_X_LOOP(BIT, RS2_LOW_BIT, BODY) \
2612 P_X_LOOP_BASE(BIT, RS2_LOW_BIT) \
2613 P_ONE_LOOP_BODY(BIT, BODY) \
2614 P_LOOP_END()
2615
2616 #define P_X_ULOOP(BIT, RS2_LOW_BIT, BODY) \
2617 P_X_LOOP_BASE(BIT, RS2_LOW_BIT) \
2618 P_ONE_ULOOP_BODY(BIT, BODY) \
2619 P_LOOP_END()
2620
2621 #define P_I_LOOP(BIT, IMMBIT, BODY) \
2622 P_I_LOOP_BASE(BIT, IMMBIT) \
2623 P_ONE_LOOP_BODY(BIT, BODY) \
2624 P_LOOP_END()
2625
2626 #define P_I_ULOOP(BIT, IMMBIT, BODY) \
2627 P_I_LOOP_BASE(BIT, IMMBIT) \
2628 P_ONE_ULOOP_BODY(BIT, BODY) \
2629 P_LOOP_END()
2630
2631 #define P_MUL_LOOP(BIT, BODY) \
2632 P_MUL_LOOP_BASE(BIT) \
2633 P_MUL_LOOP_BODY(BIT, BODY) \
2634 P_PAIR_LOOP_END()
2635
2636 #define P_MUL_ULOOP(BIT, BODY) \
2637 P_MUL_LOOP_BASE(BIT) \
2638 P_MUL_ULOOP_BODY(BIT, BODY) \
2639 P_PAIR_LOOP_END()
2640
2641 #define P_MUL_CROSS_LOOP(BIT, BODY) \
2642 P_MUL_LOOP_BASE(BIT) \
2643 P_MUL_CROSS_LOOP_BODY(BIT, BODY) \
2644 P_PAIR_LOOP_END()
2645
2646 #define P_MUL_CROSS_ULOOP(BIT, BODY) \
2647 P_MUL_LOOP_BASE(BIT) \
2648 P_MUL_CROSS_ULOOP_BODY(BIT, BODY) \
2649 P_PAIR_LOOP_END()
2650
2651 #define P_REDUCTION_LOOP(BIT, BIT_INNER, USE_RD, IS_SAT, BODY) \
2652 P_REDUCTION_LOOP_BASE(BIT, BIT_INNER, USE_RD) \
2653 P_REDUCTION_PARAMS(BIT_INNER) \
2654 BODY \
2655 P_REDUCTION_LOOP_END(BIT, IS_SAT)
2656
2657 #define P_REDUCTION_ULOOP(BIT, BIT_INNER, USE_RD, IS_SAT, BODY) \
2658 P_REDUCTION_ULOOP_BASE(BIT, BIT_INNER, USE_RD) \
2659 P_REDUCTION_UPARAMS(BIT_INNER) \
2660 BODY \
2661 P_REDUCTION_ULOOP_END(BIT, IS_SAT)
2662
2663 #define P_REDUCTION_SULOOP(BIT, BIT_INNER, USE_RD, IS_SAT, BODY) \
2664 P_REDUCTION_LOOP_BASE(BIT, BIT_INNER, USE_RD) \
2665 P_REDUCTION_SUPARAMS(BIT_INNER) \
2666 BODY \
2667 P_REDUCTION_LOOP_END(BIT, IS_SAT)
2668
2669 #define P_REDUCTION_CROSS_LOOP(BIT, BIT_INNER, USE_RD, IS_SAT, BODY) \
2670 P_REDUCTION_LOOP_BASE(BIT, BIT_INNER, USE_RD) \
2671 P_REDUCTION_CROSS_PARAMS(BIT_INNER) \
2672 BODY \
2673 P_REDUCTION_LOOP_END(BIT, IS_SAT)
2674
2675 #define P_LOOP_END() \
2676 } \
2677 WRITE_RD(sext_xlen(rd_tmp));
2678
2679 #define P_PAIR_LOOP_END() \
2680 } \
2681 if (xlen == 32) { \
2682 WRITE_RD_PAIR(rd_tmp); \
2683 } \
2684 else { \
2685 WRITE_RD(sext_xlen(rd_tmp)); \
2686 }
2687
2688 #define P_REDUCTION_LOOP_END(BIT, IS_SAT) \
2689 } \
2690 if (IS_SAT) { \
2691 P_SAT(pd_res, BIT); \
2692 } \
2693 type_usew_t<BIT>::type pd = pd_res; \
2694 WRITE_PD(); \
2695 } \
2696 WRITE_RD(sext_xlen(rd_tmp));
2697
2698 #define P_REDUCTION_ULOOP_END(BIT, IS_SAT) \
2699 } \
2700 if (IS_SAT) { \
2701 P_SATU(pd_res, BIT); \
2702 } \
2703 type_usew_t<BIT>::type pd = pd_res; \
2704 WRITE_PD(); \
2705 } \
2706 WRITE_RD(sext_xlen(rd_tmp));
2707
2708 #define P_SUNPKD8(X, Y) \
2709 require_extension(EXT_ZPN); \
2710 reg_t rd_tmp = 0; \
2711 int16_t pd[4] = { \
2712 P_SB(RS1, Y), \
2713 P_SB(RS1, X), \
2714 P_SB(RS1, Y + 4), \
2715 P_SB(RS1, X + 4), \
2716 }; \
2717 if (xlen == 64) { \
2718 memcpy(&rd_tmp, pd, 8); \
2719 } else { \
2720 memcpy(&rd_tmp, pd, 4); \
2721 } \
2722 WRITE_RD(sext_xlen(rd_tmp));
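// Usage sketch (illustrative): the unpack instructions encode the selected byte pair in their
// name, e.g. sunpkd810 is expected to expand as P_SUNPKD8(1, 0).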
2723
2724 #define P_ZUNPKD8(X, Y) \
2725 require_extension(EXT_ZPN); \
2726 reg_t rd_tmp = 0; \
2727 uint16_t pd[4] = { \
2728 P_B(RS1, Y), \
2729 P_B(RS1, X), \
2730 P_B(RS1, Y + 4), \
2731 P_B(RS1, X + 4), \
2732 }; \
2733 if (xlen == 64) { \
2734 memcpy(&rd_tmp, pd, 8); \
2735 } else { \
2736 memcpy(&rd_tmp, pd, 4); \
2737 } \
2738 WRITE_RD(sext_xlen(rd_tmp));
2739
2740 #define P_PK(BIT, X, Y) \
2741 require_extension(EXT_ZPN); \
2742 require(BIT == e16 || BIT == e32); \
2743 reg_t rd_tmp = 0, rs1 = RS1, rs2 = RS2; \
2744 for (sreg_t i = 0; i < xlen / BIT / 2; i++) { \
2745 rd_tmp = set_field(rd_tmp, make_mask64(i * 2 * BIT, BIT), \
2746 P_UFIELD(RS2, i * 2 + Y, BIT)); \
2747 rd_tmp = set_field(rd_tmp, make_mask64((i * 2 + 1) * BIT, BIT), \
2748 P_UFIELD(RS1, i * 2 + X, BIT)); \
2749 } \
2750 WRITE_RD(sext_xlen(rd_tmp));
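// Usage sketch (illustrative): the pack instructions choose halves via X and Y, e.g. pkbb16
// (pack the bottom halves of rs1 and rs2) is expected to expand as P_PK(16, 0, 0).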
2751
2752 #define P_64_PROFILE_BASE() \
2753 require_extension(EXT_ZPSFOPERAND); \
2754 sreg_t rd, rs1, rs2;
2755
2756 #define P_64_UPROFILE_BASE() \
2757 require_extension(EXT_ZPSFOPERAND); \
2758 reg_t rd, rs1, rs2;
2759
2760 #define P_64_PROFILE_PARAM(USE_RD, INPUT_PAIR) \
2761 if (xlen == 32) { \
2762 rs1 = INPUT_PAIR ? RS1_PAIR : RS1; \
2763 rs2 = INPUT_PAIR ? RS2_PAIR : RS2; \
2764 rd = USE_RD ? RD_PAIR : 0; \
2765 } else { \
2766 rs1 = RS1; \
2767 rs2 = RS2; \
2768 rd = USE_RD ? RD : 0; \
2769 }
2770
2771 #define P_64_PROFILE(BODY) \
2772 P_64_PROFILE_BASE() \
2773 P_64_PROFILE_PARAM(false, true) \
2774 BODY \
2775 P_64_PROFILE_END() \
2776
2777 #define P_64_UPROFILE(BODY) \
2778 P_64_UPROFILE_BASE() \
2779 P_64_PROFILE_PARAM(false, true) \
2780 BODY \
2781 P_64_PROFILE_END() \
2782
2783 #define P_64_PROFILE_REDUCTION(BIT, BODY) \
2784 P_64_PROFILE_BASE() \
2785 P_64_PROFILE_PARAM(true, false) \
2786 for (sreg_t i = 0; i < xlen / BIT; i++) { \
2787 sreg_t ps1 = P_FIELD(rs1, i, BIT); \
2788 sreg_t ps2 = P_FIELD(rs2, i, BIT); \
2789 BODY \
2790 } \
2791 P_64_PROFILE_END() \
2792
2793 #define P_64_UPROFILE_REDUCTION(BIT, BODY) \
2794 P_64_UPROFILE_BASE() \
2795 P_64_PROFILE_PARAM(true, false) \
2796 for (sreg_t i = 0; i < xlen / BIT; i++) { \
2797 reg_t ps1 = P_UFIELD(rs1, i, BIT); \
2798 reg_t ps2 = P_UFIELD(rs2, i, BIT); \
2799 BODY \
2800 } \
2801 P_64_PROFILE_END() \
2802
2803 #define P_64_PROFILE_END() \
2804 if (xlen == 32) { \
2805 WRITE_RD_PAIR(rd); \
2806 } else { \
2807 WRITE_RD(sext_xlen(rd)); \
2808 }
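// Usage sketch (illustrative): 64-bit profile ops such as add64 are expected to expand roughly
// as P_64_PROFILE({ rd = rs1 + rs2; }), with the register-pair handling above covering RV32.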
2809
2810 #define DEBUG_START 0x0
2811 #define DEBUG_END (0x1000 - 1)
2812
2813 #endif
2814