1 // license:BSD-3-Clause 2 // copyright-holders:Ryan Holtz,Tyler J. Stachecki 3 /*************************************************************************** 4 5 rspcp2.h 6 7 Interface file for Reality Signal Processor (RSP) vector extensions. 8 9 ***************************************************************************/ 10 #ifndef MAME_CPU_RSP_RSPCP2_H 11 #define MAME_CPU_RSP_RSPCP2_H 12 13 #pragma once 14 15 #include "rsp.h" 16 #include "rspdiv.h" 17 #include "cpu/drcuml.h" 18 19 #define SIMD_OFF (1) 20 21 #if (defined(__SSE2__) || defined(__SSE3__) || defined(__SSSE3__) || defined(__SSE4_1__) || defined(__SSE4_2__)) 22 #define SSE_AVAILABLE (1) 23 #else 24 #define SSE_AVAILABLE (0) 25 #endif 26 27 #if (SSE_AVAILABLE || defined(_MSC_VER)) && defined(PTR64) && !SIMD_OFF 28 #define USE_SIMD (1) 29 #else 30 #define USE_SIMD (0) 31 #endif 32 33 #if USE_SIMD 34 35 #ifdef _MSC_VER 36 #define __align16 __declspec(align(16)) 37 #else 38 #define __align16 __attribute__((aligned(16))) 39 #endif 40 41 #if (defined(__SSE4_2__) || defined(_MSC_VER)) 42 #include <nmmintrin.h> 43 #elif (defined(__SSE4_1__) || defined(_MSC_VER)) 44 #include <smmintrin.h> 45 #elif (defined(__SSSE3__) || defined(_MSC_VER)) 46 #include <tmmintrin.h> 47 #elif (defined(__SSE3__ ) || defined(_MSC_VER)) 48 #include <pmmintrin.h> 49 #else 50 #include <emmintrin.h> 51 #endif 52 53 typedef __m128i rsp_vec_t; 54 #endif 55 56 union VECTOR_REG 57 { 58 uint64_t d[2]; 59 uint32_t l[4]; 60 uint16_t s[8]; 61 uint8_t b[16]; 62 #if USE_SIMD 63 rsp_vec_t v; 64 #endif 65 }; 66 67 union ACCUMULATOR_REG 68 { 69 uint64_t q; 70 uint32_t l[2]; 71 uint16_t w[4]; 72 }; 73 74 struct compiler_state; 75 76 class rsp_device::cop2 77 { 78 friend class rsp_device; 79 80 public: 81 cop2(rsp_device &rsp, running_machine &machine); 82 83 protected: 84 virtual void init(); 85 virtual void start(); 86 generate_cop2(drcuml_block & block,rsp_device::compiler_state & compiler,const opcode_desc * desc)87 virtual bool generate_cop2(drcuml_block &block, rsp_device::compiler_state &compiler, const opcode_desc *desc) { return true; } generate_lwc2(drcuml_block & block,rsp_device::compiler_state & compiler,const opcode_desc * desc)88 virtual bool generate_lwc2(drcuml_block &block, rsp_device::compiler_state &compiler, const opcode_desc *desc) { return true; } generate_swc2(drcuml_block & block,rsp_device::compiler_state & compiler,const opcode_desc * desc)89 virtual bool generate_swc2(drcuml_block &block, rsp_device::compiler_state &compiler, const opcode_desc *desc) { return true; } 90 91 virtual void state_string_export(const int index, std::string &str) const; 92 93 public: 94 virtual ~cop2(); 95 lbv()96 virtual void lbv() { } lsv()97 virtual void lsv() { } llv()98 virtual void llv() { } ldv()99 virtual void ldv() { } lqv()100 virtual void lqv() { } lrv()101 virtual void lrv() { } lpv()102 virtual void lpv() { } luv()103 virtual void luv() { } lhv()104 virtual void lhv() { } lfv()105 virtual void lfv() { } lwv()106 virtual void lwv() { } ltv()107 virtual void ltv() { } sbv()108 virtual void sbv() { } ssv()109 virtual void ssv() { } slv()110 virtual void slv() { } sdv()111 virtual void sdv() { } sqv()112 virtual void sqv() { } srv()113 virtual void srv() { } spv()114 virtual void spv() { } suv()115 virtual void suv() { } shv()116 virtual void shv() { } sfv()117 virtual void sfv() { } swv()118 virtual void swv() { } stv()119 virtual void stv() { } vmulf()120 virtual void vmulf() { } vmulu()121 virtual void vmulu() { } vmudl()122 virtual void vmudl() { } vmudm()123 virtual void vmudm() { } vmudn()124 virtual void vmudn() { } vmudh()125 virtual void vmudh() { } vmacf()126 virtual void vmacf() { } vmacu()127 virtual void vmacu() { } vmadl()128 virtual void vmadl() { } vmadm()129 virtual void vmadm() { } vmadn()130 virtual void vmadn() { } vmadh()131 virtual void vmadh() { } vadd()132 virtual void vadd() { } vsub()133 virtual void vsub() { } vabs()134 virtual void vabs() { } vaddc()135 virtual void vaddc() { } vsubc()136 virtual void vsubc() { } vaddb()137 virtual void vaddb() { } vsaw()138 virtual void vsaw() { } vlt()139 virtual void vlt() { } veq()140 virtual void veq() { } vne()141 virtual void vne() { } vge()142 virtual void vge() { } vcl()143 virtual void vcl() { } vch()144 virtual void vch() { } vcr()145 virtual void vcr() { } vmrg()146 virtual void vmrg() { } vand()147 virtual void vand() { } vnand()148 virtual void vnand() { } vor()149 virtual void vor() { } vnor()150 virtual void vnor() { } vxor()151 virtual void vxor() { } vnxor()152 virtual void vnxor() { } vrcp()153 virtual void vrcp() { } vrcpl()154 virtual void vrcpl() { } vrcph()155 virtual void vrcph() { } vmov()156 virtual void vmov() { } vrsql()157 virtual void vrsql() { } vrsqh()158 virtual void vrsqh() { } vrsq()159 virtual void vrsq() { } 160 virtual void mfc2(); 161 virtual void cfc2(); 162 virtual void mtc2(); 163 virtual void ctc2(); 164 165 virtual void handle_cop2(uint32_t op); 166 167 void log_instruction_execution(); cfunc_unimplemented_opcode()168 virtual void cfunc_unimplemented_opcode() { } 169 170 void dump(uint32_t op); 171 void dump_dmem(); 172 173 protected: generate_vector_opcode(drcuml_block & block,rsp_device::compiler_state & compiler,const opcode_desc * desc)174 virtual bool generate_vector_opcode(drcuml_block &block, rsp_device::compiler_state &compiler, const opcode_desc *desc) { return true; } 175 176 uint16_t SATURATE_ACCUM(int accum, int slice, uint16_t negative, uint16_t positive); 177 178 // Data that needs to be stored close to the generated DRC code 179 struct internal_rspcop2_state 180 { 181 uint32_t op; 182 }; 183 184 internal_rspcop2_state *m_rspcop2_state; 185 rsp_device& m_rsp; 186 running_machine& m_machine; 187 uint32_t m_vres[8]; /* used for temporary vector results */ 188 189 #if USE_SIMD 190 __align16 VECTOR_REG m_v[32]; 191 #else 192 VECTOR_REG m_v[32]; 193 #endif 194 ACCUMULATOR_REG m_accum[8]; 195 uint16_t m_vflag[6][8]; 196 197 int32_t m_reciprocal_res; 198 uint32_t m_reciprocal_high; 199 int32_t m_dp_allowed; 200 201 #if USE_SIMD 202 enum rsp_flags_t { 203 RSP_VCO = 0, 204 RSP_VCC = 1, 205 RSP_VCE = 2 206 }; 207 208 enum rsp_acc_t { 209 RSP_ACC_LO = 16, 210 RSP_ACC_MD = 8, 211 RSP_ACC_HI = 0, 212 }; 213 214 enum rsp_mem_request_type { 215 RSP_MEM_REQUEST_NONE, 216 RSP_MEM_REQUEST_INT_MEM, 217 RSP_MEM_REQUEST_VECTOR, 218 RSP_MEM_REQUEST_FOURTH, 219 RSP_MEM_REQUEST_HALF, 220 RSP_MEM_REQUEST_PACK, 221 RSP_MEM_REQUEST_QUAD, 222 RSP_MEM_REQUEST_REST, 223 RSP_MEM_REQUEST_UPACK 224 }; 225 226 union aligned_rsp_1vect_t { 227 rsp_vec_t __align; 228 uint16_t s[8]; 229 }; 230 231 union aligned_rsp_2vect_t { 232 rsp_vec_t __align[2]; 233 uint16_t s[16]; 234 }; 235 236 union aligned_rsp_3vect_t { 237 rsp_vec_t __align[3]; 238 uint16_t s[24]; 239 }; 240 241 __align16 aligned_rsp_1vect_t m_vdqm; 242 __align16 aligned_rsp_2vect_t m_flags[3]; 243 __align16 aligned_rsp_3vect_t m_acc; 244 uint32_t m_dp_flag; 245 246 typedef struct 247 { 248 rsp_vec_t dummy_for_alignment; 249 const uint16_t logic_mask[2][8]; 250 const uint16_t vrsq_mask_table[8][8]; 251 const uint16_t shuffle_keys[16][8]; 252 const uint16_t sll_b2l_keys[16][8]; 253 const uint16_t sll_l2b_keys[16][8]; 254 const uint16_t srl_b2l_keys[16][8]; 255 const uint16_t ror_b2l_keys[16][8]; 256 const uint16_t rol_l2b_keys[16][8]; 257 const uint16_t ror_l2b_keys[16][8]; 258 const uint16_t qr_lut[16][8]; 259 const uint16_t bdls_lut[4][4]; 260 const uint16_t word_reverse[8]; 261 } vec_helpers_t; 262 263 static const vec_helpers_t m_vec_helpers; 264 265 rsp_vec_t vec_load_and_shuffle_operand(const uint16_t* src, uint32_t element); sign_extend_6(int32_t i)266 static inline uint32_t sign_extend_6(int32_t i) { 267 return ((i << (32 - 7)) >> (32 - 7)) & 0xfff; 268 } vec_load_unshuffled_operand(const void * src)269 static inline rsp_vec_t vec_load_unshuffled_operand(const void* src) 270 { 271 return _mm_load_si128((rsp_vec_t*) src); 272 } vec_write_operand(uint16_t * dest,rsp_vec_t src)273 static inline void vec_write_operand(uint16_t* dest, rsp_vec_t src) 274 { 275 _mm_store_si128((rsp_vec_t*) dest, src); 276 } read_acc_lo(const uint16_t * acc)277 static inline rsp_vec_t read_acc_lo(const uint16_t* acc) 278 { 279 return vec_load_unshuffled_operand(acc + sizeof(rsp_vec_t)); 280 } read_acc_mid(const uint16_t * acc)281 static inline rsp_vec_t read_acc_mid(const uint16_t* acc) 282 { 283 return vec_load_unshuffled_operand(acc + (sizeof(rsp_vec_t) >> 1)); 284 } read_acc_hi(const void * acc)285 static inline rsp_vec_t read_acc_hi(const void* acc) 286 { 287 return vec_load_unshuffled_operand(acc); 288 } read_vcc_lo(const uint16_t * vcc)289 static inline rsp_vec_t read_vcc_lo(const uint16_t *vcc) 290 { 291 return vec_load_unshuffled_operand(vcc + (sizeof(rsp_vec_t) >> 1)); 292 } read_vcc_hi(const uint16_t * vcc)293 static inline rsp_vec_t read_vcc_hi(const uint16_t *vcc) 294 { 295 return vec_load_unshuffled_operand(vcc); 296 } read_vco_lo(const uint16_t * vco)297 static inline rsp_vec_t read_vco_lo(const uint16_t *vco) 298 { 299 return vec_load_unshuffled_operand(vco + (sizeof(rsp_vec_t) >> 1)); 300 } read_vco_hi(const uint16_t * vco)301 static inline rsp_vec_t read_vco_hi(const uint16_t *vco) 302 { 303 return vec_load_unshuffled_operand(vco); 304 } read_vce(const uint16_t * vce)305 static inline rsp_vec_t read_vce(const uint16_t *vce) 306 { 307 return vec_load_unshuffled_operand(vce + (sizeof(rsp_vec_t) >> 1)); 308 } write_acc_lo(uint16_t * acc,rsp_vec_t acc_lo)309 static inline void write_acc_lo(uint16_t *acc, rsp_vec_t acc_lo) 310 { 311 return vec_write_operand(acc + sizeof(rsp_vec_t), acc_lo); 312 } write_acc_mid(uint16_t * acc,rsp_vec_t acc_mid)313 static inline void write_acc_mid(uint16_t *acc, rsp_vec_t acc_mid) 314 { 315 return vec_write_operand(acc + (sizeof(rsp_vec_t) >> 1), acc_mid); 316 } write_acc_hi(uint16_t * acc,rsp_vec_t acc_hi)317 static inline void write_acc_hi(uint16_t *acc, rsp_vec_t acc_hi) 318 { 319 return vec_write_operand(acc, acc_hi); 320 } write_vcc_lo(uint16_t * vcc,rsp_vec_t vcc_lo)321 static inline void write_vcc_lo(uint16_t *vcc, rsp_vec_t vcc_lo) 322 { 323 return vec_write_operand(vcc + (sizeof(rsp_vec_t) >> 1), vcc_lo); 324 } write_vcc_hi(uint16_t * vcc,rsp_vec_t vcc_hi)325 static inline void write_vcc_hi(uint16_t *vcc, rsp_vec_t vcc_hi) 326 { 327 return vec_write_operand(vcc, vcc_hi); 328 } write_vco_lo(uint16_t * vcc,rsp_vec_t vco_lo)329 static inline void write_vco_lo(uint16_t *vcc, rsp_vec_t vco_lo) 330 { 331 return vec_write_operand(vcc + (sizeof(rsp_vec_t) >> 1), vco_lo); 332 } write_vco_hi(uint16_t * vcc,rsp_vec_t vco_hi)333 static inline void write_vco_hi(uint16_t *vcc, rsp_vec_t vco_hi) 334 { 335 return vec_write_operand(vcc, vco_hi); 336 } write_vce(uint16_t * vce,rsp_vec_t vce_r)337 static inline void write_vce(uint16_t *vce, rsp_vec_t vce_r) 338 { 339 return vec_write_operand(vce + (sizeof(rsp_vec_t) >> 1), vce_r); 340 } 341 get_flags(const uint16_t * flags)342 static inline int16_t get_flags(const uint16_t *flags) 343 { 344 return _mm_movemask_epi8(_mm_packs_epi16(_mm_load_si128((rsp_vec_t*) (flags + (sizeof(rsp_vec_t) >> 1))), _mm_load_si128((rsp_vec_t*) flags))); 345 } 346 vec_zero()347 static inline rsp_vec_t vec_zero() 348 { 349 return _mm_setzero_si128(); 350 } 351 352 void vec_load_group1(uint32_t addr, uint32_t element, uint16_t* regp, rsp_vec_t reg, rsp_vec_t dqm); 353 void vec_load_group2(uint32_t addr, uint32_t element, uint16_t* regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type); 354 void vec_load_group4(uint32_t addr, uint32_t element, uint16_t* regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type); 355 void vec_store_group1(uint32_t addr, uint32_t element, uint16_t* regp, rsp_vec_t reg, rsp_vec_t dqm); 356 void vec_store_group2(uint32_t addr, uint32_t element, uint16_t* regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type); 357 void vec_store_group4(uint32_t addr, uint32_t element, uint16_t* regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type); 358 359 #include "clamp.h" 360 #include "vabs.h" 361 #include "vadd.h" 362 #include "vaddc.h" 363 #include "vand.h" 364 #include "vch.h" 365 #include "vcmp.h" 366 #include "vcl.h" 367 #include "vcr.h" 368 #include "vdivh.h" 369 #include "vmac.h" 370 #include "vmov.h" 371 #include "vmrg.h" 372 #include "vmul.h" 373 #include "vmulh.h" 374 #include "vmull.h" 375 #include "vmulm.h" 376 #include "vmuln.h" 377 #include "vor.h" 378 #include "vrcpsq.h" 379 #include "vrsq.h" 380 #include "vsub.h" 381 #include "vsubc.h" 382 #include "vxor.h" 383 #include "vldst.h" 384 #endif 385 386 private: 387 void handle_lwc2(uint32_t op); 388 void handle_swc2(uint32_t op); 389 void handle_vector_ops(uint32_t op); 390 391 uint32_t m_div_in; 392 uint32_t m_div_out; 393 }; 394 395 #endif // MAME_CPU_RSP_RSPCP2_H 396