1 // license:BSD-3-Clause
2 // copyright-holders:Ryan Holtz,Tyler J. Stachecki
3 /***************************************************************************
4 
5     rspcp2.h
6 
7     Interface file for Reality Signal Processor (RSP) vector extensions.
8 
9 ***************************************************************************/
10 #ifndef MAME_CPU_RSP_RSPCP2_H
11 #define MAME_CPU_RSP_RSPCP2_H
12 
13 #pragma once
14 
15 #include "rsp.h"
16 #include "rspdiv.h"
17 #include "cpu/drcuml.h"
18 
// Build-time kill switch: while this is non-zero, USE_SIMD below always
// evaluates to 0, regardless of what the compiler supports.
#define SIMD_OFF        (1)

// SSE_AVAILABLE is 1 when the compiler predefines any of the SSE2..SSE4.2
// feature macros, and 0 otherwise.
#if defined(__SSE2__) || defined(__SSE3__) || defined(__SSSE3__) || defined(__SSE4_1__) || defined(__SSE4_2__)
#define SSE_AVAILABLE   (1)
#else
#define SSE_AVAILABLE   (0)
#endif

// The SIMD code path needs all three of: the kill switch cleared, PTR64
// defined, and either SSE feature macros or an MSVC compiler.
#if !SIMD_OFF && defined(PTR64) && (SSE_AVAILABLE || defined(_MSC_VER))
#define USE_SIMD    (1)
#else
#define USE_SIMD    (0)
#endif

#if USE_SIMD

// Compiler-specific spelling of "align this object to 16 bytes".
#if defined(_MSC_VER)
#define __align16 __declspec(align(16))
#else
#define __align16 __attribute__((aligned(16)))
#endif

// Pull in the intrinsics header matching the newest advertised instruction
// set. _MSC_VER is tested in the first branch, so MSVC builds always take the
// SSE4.2 header and never reach the later branches.
#if defined(_MSC_VER) || defined(__SSE4_2__)
#include <nmmintrin.h>
#elif defined(__SSE4_1__)
#include <smmintrin.h>
#elif defined(__SSSE3__)
#include <tmmintrin.h>
#elif defined(__SSE3__)
#include <pmmintrin.h>
#else
#include <emmintrin.h>
#endif

// 128-bit integer vector type used by the SIMD helpers.
using rsp_vec_t = __m128i;

#endif // USE_SIMD
55 
/*
 * 128-bit vector register storage.
 *
 * All members overlay the same 16 bytes, so one register can be addressed at
 * 64-, 32-, 16- or 8-bit granularity. When USE_SIMD is enabled the same
 * storage is additionally exposed as a single rsp_vec_t.
 */
union VECTOR_REG
{
	uint64_t d[2];      // two 64-bit lanes
	uint32_t l[4];      // four 32-bit lanes
	uint16_t s[8];      // eight 16-bit lanes
	uint8_t b[16];      // sixteen bytes
#if USE_SIMD
	rsp_vec_t v;        // whole register as one SIMD value
#endif
};
66 
/*
 * 64-bit accumulator storage.
 *
 * The three members overlay the same 8 bytes, giving access to the value as
 * one quadword, two 32-bit halves or four 16-bit slices.
 */
union ACCUMULATOR_REG
{
	uint64_t q;         // entire 64-bit value
	uint32_t l[2];      // two 32-bit halves
	uint16_t w[4];      // four 16-bit slices
};
73 
74 struct compiler_state;
75 
76 class rsp_device::cop2
77 {
78 	friend class rsp_device;
79 
80 public:
81 	cop2(rsp_device &rsp, running_machine &machine);
82 
83 protected:
84 	virtual void init();
85 	virtual void start();
86 
generate_cop2(drcuml_block & block,rsp_device::compiler_state & compiler,const opcode_desc * desc)87 	virtual bool generate_cop2(drcuml_block &block, rsp_device::compiler_state &compiler, const opcode_desc *desc) { return true; }
generate_lwc2(drcuml_block & block,rsp_device::compiler_state & compiler,const opcode_desc * desc)88 	virtual bool generate_lwc2(drcuml_block &block, rsp_device::compiler_state &compiler, const opcode_desc *desc) { return true; }
generate_swc2(drcuml_block & block,rsp_device::compiler_state & compiler,const opcode_desc * desc)89 	virtual bool generate_swc2(drcuml_block &block, rsp_device::compiler_state &compiler, const opcode_desc *desc) { return true; }
90 
91 	virtual void state_string_export(const int index, std::string &str) const;
92 
93 public:
94 	virtual ~cop2();
95 
lbv()96 	virtual void lbv() { }
lsv()97 	virtual void lsv() { }
llv()98 	virtual void llv() { }
ldv()99 	virtual void ldv() { }
lqv()100 	virtual void lqv() { }
lrv()101 	virtual void lrv() { }
lpv()102 	virtual void lpv() { }
luv()103 	virtual void luv() { }
lhv()104 	virtual void lhv() { }
lfv()105 	virtual void lfv() { }
lwv()106 	virtual void lwv() { }
ltv()107 	virtual void ltv() { }
sbv()108 	virtual void sbv() { }
ssv()109 	virtual void ssv() { }
slv()110 	virtual void slv() { }
sdv()111 	virtual void sdv() { }
sqv()112 	virtual void sqv() { }
srv()113 	virtual void srv() { }
spv()114 	virtual void spv() { }
suv()115 	virtual void suv() { }
shv()116 	virtual void shv() { }
sfv()117 	virtual void sfv() { }
swv()118 	virtual void swv() { }
stv()119 	virtual void stv() { }
vmulf()120 	virtual void vmulf() { }
vmulu()121 	virtual void vmulu() { }
vmudl()122 	virtual void vmudl() { }
vmudm()123 	virtual void vmudm() { }
vmudn()124 	virtual void vmudn() { }
vmudh()125 	virtual void vmudh() { }
vmacf()126 	virtual void vmacf() { }
vmacu()127 	virtual void vmacu() { }
vmadl()128 	virtual void vmadl() { }
vmadm()129 	virtual void vmadm() { }
vmadn()130 	virtual void vmadn() { }
vmadh()131 	virtual void vmadh() { }
vadd()132 	virtual void vadd() { }
vsub()133 	virtual void vsub() { }
vabs()134 	virtual void vabs() { }
vaddc()135 	virtual void vaddc() { }
vsubc()136 	virtual void vsubc() { }
vaddb()137 	virtual void vaddb() { }
vsaw()138 	virtual void vsaw() { }
vlt()139 	virtual void vlt() { }
veq()140 	virtual void veq() { }
vne()141 	virtual void vne() { }
vge()142 	virtual void vge() { }
vcl()143 	virtual void vcl() { }
vch()144 	virtual void vch() { }
vcr()145 	virtual void vcr() { }
vmrg()146 	virtual void vmrg() { }
vand()147 	virtual void vand() { }
vnand()148 	virtual void vnand() { }
vor()149 	virtual void vor() { }
vnor()150 	virtual void vnor() { }
vxor()151 	virtual void vxor() { }
vnxor()152 	virtual void vnxor() { }
vrcp()153 	virtual void vrcp() { }
vrcpl()154 	virtual void vrcpl() { }
vrcph()155 	virtual void vrcph() { }
vmov()156 	virtual void vmov() { }
vrsql()157 	virtual void vrsql() { }
vrsqh()158 	virtual void vrsqh() { }
vrsq()159 	virtual void vrsq() { }
160 	virtual void mfc2();
161 	virtual void cfc2();
162 	virtual void mtc2();
163 	virtual void ctc2();
164 
165 	virtual void    handle_cop2(uint32_t op);
166 
167 	void            log_instruction_execution();
cfunc_unimplemented_opcode()168 	virtual void    cfunc_unimplemented_opcode() { }
169 
170 	void            dump(uint32_t op);
171 	void            dump_dmem();
172 
173 protected:
generate_vector_opcode(drcuml_block & block,rsp_device::compiler_state & compiler,const opcode_desc * desc)174 	virtual bool     generate_vector_opcode(drcuml_block &block, rsp_device::compiler_state &compiler, const opcode_desc *desc) { return true; }
175 
176 	uint16_t          SATURATE_ACCUM(int accum, int slice, uint16_t negative, uint16_t positive);
177 
178 	// Data that needs to be stored close to the generated DRC code
179 	struct internal_rspcop2_state
180 	{
181 		uint32_t      op;
182 	};
183 
184 	internal_rspcop2_state *m_rspcop2_state;
185 	rsp_device&     m_rsp;
186 	running_machine& m_machine;
187 	uint32_t          m_vres[8];          /* used for temporary vector results */
188 
189 #if USE_SIMD
190 	__align16 VECTOR_REG      m_v[32];
191 #else
192 	VECTOR_REG      m_v[32];
193 #endif
194 	ACCUMULATOR_REG m_accum[8];
195 	uint16_t          m_vflag[6][8];
196 
197 	int32_t           m_reciprocal_res;
198 	uint32_t          m_reciprocal_high;
199 	int32_t           m_dp_allowed;
200 
201 #if USE_SIMD
202 	enum rsp_flags_t {
203 		RSP_VCO = 0,
204 		RSP_VCC = 1,
205 		RSP_VCE = 2
206 	};
207 
208 	enum rsp_acc_t {
209 		RSP_ACC_LO = 16,
210 		RSP_ACC_MD = 8,
211 		RSP_ACC_HI = 0,
212 	};
213 
214 	enum rsp_mem_request_type {
215 		RSP_MEM_REQUEST_NONE,
216 		RSP_MEM_REQUEST_INT_MEM,
217 		RSP_MEM_REQUEST_VECTOR,
218 		RSP_MEM_REQUEST_FOURTH,
219 		RSP_MEM_REQUEST_HALF,
220 		RSP_MEM_REQUEST_PACK,
221 		RSP_MEM_REQUEST_QUAD,
222 		RSP_MEM_REQUEST_REST,
223 		RSP_MEM_REQUEST_UPACK
224 	};
225 
226 	union aligned_rsp_1vect_t {
227 		rsp_vec_t __align;
228 		uint16_t s[8];
229 	};
230 
231 	union aligned_rsp_2vect_t {
232 		rsp_vec_t __align[2];
233 		uint16_t s[16];
234 	};
235 
236 	union aligned_rsp_3vect_t {
237 		rsp_vec_t __align[3];
238 		uint16_t s[24];
239 	};
240 
241 	__align16 aligned_rsp_1vect_t m_vdqm;
242 	__align16 aligned_rsp_2vect_t m_flags[3];
243 	__align16 aligned_rsp_3vect_t m_acc;
244 	uint32_t m_dp_flag;
245 
246 	typedef struct
247 	{
248 		rsp_vec_t dummy_for_alignment;
249 		const uint16_t logic_mask[2][8];
250 		const uint16_t vrsq_mask_table[8][8];
251 		const uint16_t shuffle_keys[16][8];
252 		const uint16_t sll_b2l_keys[16][8];
253 		const uint16_t sll_l2b_keys[16][8];
254 		const uint16_t srl_b2l_keys[16][8];
255 		const uint16_t ror_b2l_keys[16][8];
256 		const uint16_t rol_l2b_keys[16][8];
257 		const uint16_t ror_l2b_keys[16][8];
258 		const uint16_t qr_lut[16][8];
259 		const uint16_t bdls_lut[4][4];
260 		const uint16_t word_reverse[8];
261 	} vec_helpers_t;
262 
263 	static const vec_helpers_t m_vec_helpers;
264 
265 	rsp_vec_t vec_load_and_shuffle_operand(const uint16_t* src, uint32_t element);
sign_extend_6(int32_t i)266 	static inline uint32_t sign_extend_6(int32_t i) {
267 		return ((i << (32 - 7)) >> (32 - 7)) & 0xfff;
268 	}
vec_load_unshuffled_operand(const void * src)269 	static inline rsp_vec_t vec_load_unshuffled_operand(const void* src)
270 	{
271 		return _mm_load_si128((rsp_vec_t*) src);
272 	}
vec_write_operand(uint16_t * dest,rsp_vec_t src)273 	static inline void vec_write_operand(uint16_t* dest, rsp_vec_t src)
274 	{
275 		_mm_store_si128((rsp_vec_t*) dest, src);
276 	}
read_acc_lo(const uint16_t * acc)277 	static inline rsp_vec_t read_acc_lo(const uint16_t* acc)
278 	{
279 		return vec_load_unshuffled_operand(acc + sizeof(rsp_vec_t));
280 	}
read_acc_mid(const uint16_t * acc)281 	static inline rsp_vec_t read_acc_mid(const uint16_t* acc)
282 	{
283 		return vec_load_unshuffled_operand(acc + (sizeof(rsp_vec_t) >> 1));
284 	}
read_acc_hi(const void * acc)285 	static inline rsp_vec_t read_acc_hi(const void* acc)
286 	{
287 		return vec_load_unshuffled_operand(acc);
288 	}
read_vcc_lo(const uint16_t * vcc)289 	static inline rsp_vec_t read_vcc_lo(const uint16_t *vcc)
290 	{
291 		return vec_load_unshuffled_operand(vcc + (sizeof(rsp_vec_t) >> 1));
292 	}
read_vcc_hi(const uint16_t * vcc)293 	static inline rsp_vec_t read_vcc_hi(const uint16_t *vcc)
294 	{
295 		return vec_load_unshuffled_operand(vcc);
296 	}
read_vco_lo(const uint16_t * vco)297 	static inline rsp_vec_t read_vco_lo(const uint16_t *vco)
298 	{
299 		return vec_load_unshuffled_operand(vco + (sizeof(rsp_vec_t) >> 1));
300 	}
read_vco_hi(const uint16_t * vco)301 	static inline rsp_vec_t read_vco_hi(const uint16_t *vco)
302 	{
303 		return vec_load_unshuffled_operand(vco);
304 	}
read_vce(const uint16_t * vce)305 	static inline rsp_vec_t read_vce(const uint16_t *vce)
306 	{
307 		return vec_load_unshuffled_operand(vce + (sizeof(rsp_vec_t) >> 1));
308 	}
write_acc_lo(uint16_t * acc,rsp_vec_t acc_lo)309 	static inline void write_acc_lo(uint16_t *acc, rsp_vec_t acc_lo)
310 	{
311 		return vec_write_operand(acc + sizeof(rsp_vec_t), acc_lo);
312 	}
write_acc_mid(uint16_t * acc,rsp_vec_t acc_mid)313 	static inline void write_acc_mid(uint16_t *acc, rsp_vec_t acc_mid)
314 	{
315 		return vec_write_operand(acc + (sizeof(rsp_vec_t) >> 1), acc_mid);
316 	}
write_acc_hi(uint16_t * acc,rsp_vec_t acc_hi)317 	static inline void write_acc_hi(uint16_t *acc, rsp_vec_t acc_hi)
318 	{
319 		return vec_write_operand(acc, acc_hi);
320 	}
write_vcc_lo(uint16_t * vcc,rsp_vec_t vcc_lo)321 	static inline void write_vcc_lo(uint16_t *vcc, rsp_vec_t vcc_lo)
322 	{
323 		return vec_write_operand(vcc + (sizeof(rsp_vec_t) >> 1), vcc_lo);
324 	}
write_vcc_hi(uint16_t * vcc,rsp_vec_t vcc_hi)325 	static inline void write_vcc_hi(uint16_t *vcc, rsp_vec_t vcc_hi)
326 	{
327 		return vec_write_operand(vcc, vcc_hi);
328 	}
write_vco_lo(uint16_t * vcc,rsp_vec_t vco_lo)329 	static inline void write_vco_lo(uint16_t *vcc, rsp_vec_t vco_lo)
330 	{
331 		return vec_write_operand(vcc + (sizeof(rsp_vec_t) >> 1), vco_lo);
332 	}
write_vco_hi(uint16_t * vcc,rsp_vec_t vco_hi)333 	static inline void write_vco_hi(uint16_t *vcc, rsp_vec_t vco_hi)
334 	{
335 		return vec_write_operand(vcc, vco_hi);
336 	}
write_vce(uint16_t * vce,rsp_vec_t vce_r)337 	static inline void write_vce(uint16_t *vce, rsp_vec_t vce_r)
338 	{
339 		return vec_write_operand(vce + (sizeof(rsp_vec_t) >> 1), vce_r);
340 	}
341 
get_flags(const uint16_t * flags)342 	static inline int16_t get_flags(const uint16_t *flags)
343 	{
344 		return _mm_movemask_epi8(_mm_packs_epi16(_mm_load_si128((rsp_vec_t*) (flags + (sizeof(rsp_vec_t) >> 1))), _mm_load_si128((rsp_vec_t*) flags)));
345 	}
346 
vec_zero()347 	static inline rsp_vec_t vec_zero()
348 	{
349 		return _mm_setzero_si128();
350 	}
351 
352 	void vec_load_group1(uint32_t addr, uint32_t element, uint16_t* regp, rsp_vec_t reg, rsp_vec_t dqm);
353 	void vec_load_group2(uint32_t addr, uint32_t element, uint16_t* regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type);
354 	void vec_load_group4(uint32_t addr, uint32_t element, uint16_t* regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type);
355 	void vec_store_group1(uint32_t addr, uint32_t element, uint16_t* regp, rsp_vec_t reg, rsp_vec_t dqm);
356 	void vec_store_group2(uint32_t addr, uint32_t element, uint16_t* regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type);
357 	void vec_store_group4(uint32_t addr, uint32_t element, uint16_t* regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type);
358 
359 #include "clamp.h"
360 #include "vabs.h"
361 #include "vadd.h"
362 #include "vaddc.h"
363 #include "vand.h"
364 #include "vch.h"
365 #include "vcmp.h"
366 #include "vcl.h"
367 #include "vcr.h"
368 #include "vdivh.h"
369 #include "vmac.h"
370 #include "vmov.h"
371 #include "vmrg.h"
372 #include "vmul.h"
373 #include "vmulh.h"
374 #include "vmull.h"
375 #include "vmulm.h"
376 #include "vmuln.h"
377 #include "vor.h"
378 #include "vrcpsq.h"
379 #include "vrsq.h"
380 #include "vsub.h"
381 #include "vsubc.h"
382 #include "vxor.h"
383 #include "vldst.h"
384 #endif
385 
386 private:
387 	void            handle_lwc2(uint32_t op);
388 	void            handle_swc2(uint32_t op);
389 	void            handle_vector_ops(uint32_t op);
390 
391 	uint32_t          m_div_in;
392 	uint32_t          m_div_out;
393 };
394 
395 #endif // MAME_CPU_RSP_RSPCP2_H
396