// [Blend2D]
// 2D Vector Graphics Powered by a JIT Compiler.
//
// [License]
// Zlib - See LICENSE.md file in the package.

#ifndef BLEND2D_PIPEGEN_PIPECOMPILER_P_H
#define BLEND2D_PIPEGEN_PIPECOMPILER_P_H

#include "../pipegen/pipegencore_p.h"
#include "../pipegen/piperegusage_p.h"

//! \cond INTERNAL
//! \addtogroup blend2d_internal_pipegen
//! \{

namespace BLPipeGen {

// ============================================================================
// [BLPipeGen::PipeCompiler]
// ============================================================================

//! Pipeline compiler.
class PipeCompiler {
public:
  BL_NONCOPYABLE(PipeCompiler)

  //! AsmJit compiler.
  x86::Compiler* cc;
  //! Target CPU features.
  x86::Features _features;

  //! SIMD width.
  uint32_t _simdWidth;
  //! Number of registers available to the pipeline compiler.
  PipeRegUsage _availableRegs;
  //! Estimation of registers used by the pipeline temporarily.
  PipeRegUsage _temporaryRegs;
  //! Estimation of registers used by the pipeline permanently.
  PipeRegUsage _persistentRegs;

  //! Function node.
  asmjit::FuncNode* _funcNode;
  //! Function initialization hook.
  asmjit::BaseNode* _funcInit;
  //! Function end hook (to add 'unlikely' branches).
  asmjit::BaseNode* _funcEnd;

  //! Invalid GP register.
  x86::Gp _gpNone;
  //! Holds `BLPipeFillFunc::ctxData` argument.
  x86::Gp _ctxData;
  //! Holds `BLPipeFillFunc::fillData` argument.
  x86::Gp _fillData;
  //! Holds `BLPipeFillFunc::fetchData` argument.
  x86::Gp _fetchData;
  //! Temporary stack used to transfer SIMD regs to GP/MM.
  x86::Mem _tmpStack;

  //! Offset to get real ctx-data from the passed pointer.
  int _ctxDataOffset;
  //! Offset to get real fill-data from the passed pointer.
  int _fillDataOffset;
  //! Offset to get real fetch-data from the passed pointer.
  int _fetchDataOffset;

  //! Offset to the first constant of the `blCommonTable` global.
  int32_t _commonTableOff;
  //! Pointer to the `blCommonTable` constant pool (only used in 64-bit mode).
  x86::Gp _commonTablePtr;
  //! XMM constants.
  x86::Xmm _constantsXmm[4];

  // --------------------------------------------------------------------------
  // [PackedInst]
  // --------------------------------------------------------------------------

  //! Packs a generic (SSE) instruction and its AVX counterpart into a single
  //! 32-bit integer.
  //!
  //! AsmJit has around 1400 instructions for X86|X64, so at least 11 bits are
  //! needed to represent each. Typically only one instruction ID is needed at
  //! a time; however, since SSE and AVX instructions use different IDs, we need
  //! a way to pack both the SSE and the AVX instruction ID into one integer, as
  //! it's much easier to use a unified instruction set rather than maintaining
  //! separate code paths for SSE and AVX.
  //!
  //! PackedInst allows specifying the following:
  //!
  //!   - SSE instruction ID for up to SSE4.2 code generation.
  //!   - AVX instruction ID for AVX+ code generation.
  //!   - Maximum operation width: 0 (XMM), 1 (YMM), and 2 (ZMM).
  //!   - Special intrinsic used only by PipeCompiler.
  struct PackedInst {
    //! Limits the width of operands of vector instructions to Xmm|Ymm|Zmm.
    enum WidthLimit : uint32_t {
      kWidthX = 0,
      kWidthY = 1,
      kWidthZ = 2
    };

    enum Bits : uint32_t {
      kSseIdShift  = 0,
      kSseIdBits   = 0xFFF,

      kAvxIdShift  = 12,
      kAvxIdBits   = 0xFFF,

      kWidthShift  = 24,
      kWidthBits   = 0x3,

      kIntrinShift = 31,
      kIntrinBits  = 0x1
    };

    static inline uint32_t packIntrin(uint32_t intrinId, uint32_t width = kWidthZ) noexcept {
      return (intrinId << kSseIdShift ) |
             (width    << kWidthShift ) |
             (1u       << kIntrinShift) ;
    }

    static inline uint32_t packAvxSse(uint32_t avxId, uint32_t sseId, uint32_t width = kWidthZ) noexcept {
      return (avxId << kAvxIdShift) |
             (sseId << kSseIdShift) |
             (width << kWidthShift) ;
    }

    static inline uint32_t avxId(uint32_t packedId) noexcept { return (packedId >> kAvxIdShift) & kAvxIdBits; }
    static inline uint32_t sseId(uint32_t packedId) noexcept { return (packedId >> kSseIdShift) & kSseIdBits; }
    static inline uint32_t width(uint32_t packedId) noexcept { return (packedId >> kWidthShift) & kWidthBits; }

    static inline uint32_t isIntrin(uint32_t packedId) noexcept { return (packedId & (kIntrinBits << kIntrinShift)) != 0; }
    static inline uint32_t intrinId(uint32_t packedId) noexcept { return (packedId >> kSseIdShift) & kSseIdBits; }
  };
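  // A minimal sketch of how a packed ID is typically consumed (illustrative
  // only; the real dispatch lives in the vemit_* helpers implemented in
  // pipecompiler.cpp):
  //
  //   uint32_t packedId = PackedInst::packAvxSse(x86::Inst::kIdVpaddd, x86::Inst::kIdPaddd);
  //   if (PackedInst::isIntrin(packedId))
  //     ; // handle PackedInst::intrinId(packedId) as a PipeCompiler intrinsic
  //   else if (hasAVX())
  //     ; // emit PackedInst::avxId(packedId)  (vpaddd)
  //   else
  //     ; // emit PackedInst::sseId(packedId)  (paddd)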
  //! Intrinsic ID.
  //!
  //! Some operations are not available as a single instruction or are part
  //! of CPU extensions outside of the baseline instruction set. These are
  //! handled as intrinsics.
  enum IntrinId {
    kIntrin2Vloadi128uRO,

    kIntrin2Vmovu8u16,
    kIntrin2Vmovu8u32,
    kIntrin2Vmovu16u32,
    kIntrin2Vabsi8,
    kIntrin2Vabsi16,
    kIntrin2Vabsi32,
    kIntrin2Vabsi64,
    kIntrin2Vinv255u16,
    kIntrin2Vinv256u16,
    kIntrin2Vinv255u32,
    kIntrin2Vinv256u32,
    kIntrin2Vduplpd,
    kIntrin2Vduphpd,

    kIntrin2VBroadcastU16,
    kIntrin2VBroadcastU32,
    kIntrin2VBroadcastU64,

    kIntrin2iVswizps,
    kIntrin2iVswizpd,

    kIntrin3Vcombhli64,
    kIntrin3Vcombhld64,
    kIntrin3Vminu16,
    kIntrin3Vmaxu16,
    kIntrin3Vmulu64x32,
    kIntrin3Vhaddpd,

    kIntrin4Vpblendvb,
    kIntrin4VpblendvbDestructive
  };

  enum {
    //! Number of reserved GP registers for general use.
    //!
    //! \note In 32-bit mode constants are addressed absolutely; however, in
    //! 64-bit mode we can't address arbitrary 64-bit pointers, so one more
    //! register is reserved as compensation.
    kReservedGpRegs = 1 + uint32_t(BL_TARGET_ARCH_BITS >= 64),
    //! Number of spare MM registers to always reserve (not used).
    kReservedMmRegs = 0,
    //! Number of spare XMM|YMM|ZMM registers to always reserve.
    kReservedVecRegs = 1
  };

  // --------------------------------------------------------------------------
  // [Construction / Destruction]
  // --------------------------------------------------------------------------

  PipeCompiler(x86::Compiler* cc, const asmjit::x86::Features& features) noexcept;
  ~PipeCompiler() noexcept;

  // --------------------------------------------------------------------------
  // [Reset]
  // --------------------------------------------------------------------------

  void reset() noexcept;

  // --------------------------------------------------------------------------
  // [CPU SIMD Width]
  // --------------------------------------------------------------------------

  //! Returns the current SIMD width that this compiler and all its parts use.
  //!
  //! \note The returned width is in bytes.
  inline uint32_t simdWidth() const noexcept { return _simdWidth; }

  void initSimdWidth() noexcept;

  // --------------------------------------------------------------------------
  // [CPU Features]
  // --------------------------------------------------------------------------

  inline bool hasSSE2() const noexcept { return _features.hasSSE2(); }
  inline bool hasSSE3() const noexcept { return _features.hasSSE3(); }
  inline bool hasSSSE3() const noexcept { return _features.hasSSSE3(); }
  inline bool hasSSE4_1() const noexcept { return _features.hasSSE4_1(); }
  inline bool hasSSE4_2() const noexcept { return _features.hasSSE4_2(); }
  inline bool hasAVX() const noexcept { return _features.hasAVX(); }
  inline bool hasAVX2() const noexcept { return _features.hasAVX2(); }
  inline bool hasAVX512_F() const noexcept { return _features.hasAVX512_F(); }
  inline bool hasAVX512_BW() const noexcept { return _features.hasAVX512_BW(); }

  inline bool hasADX() const noexcept { return _features.hasADX(); }
  inline bool hasBMI() const noexcept { return _features.hasBMI(); }
  inline bool hasBMI2() const noexcept { return _features.hasBMI2(); }
  inline bool hasLZCNT() const noexcept { return _features.hasLZCNT(); }
  inline bool hasPOPCNT() const noexcept { return _features.hasPOPCNT(); }

  // --------------------------------------------------------------------------
  // [Data Offsets]
  // --------------------------------------------------------------------------

  inline int ctxDataOffset() const noexcept { return _ctxDataOffset; }
  inline int fillDataOffset() const noexcept { return _fillDataOffset; }
  inline int fetchDataOffset() const noexcept { return _fetchDataOffset; }

  inline void setCtxDataOffset(int offset) noexcept { _ctxDataOffset = offset; }
  inline void setFillDataOffset(int offset) noexcept { _fillDataOffset = offset; }
  inline void setFetchDataOffset(int offset) noexcept { _fetchDataOffset = offset; }

  // --------------------------------------------------------------------------
  // [Compilation]
  // --------------------------------------------------------------------------

  void beginFunction() noexcept;
  void endFunction() noexcept;
  // --------------------------------------------------------------------------
  // [Parts Management]
  // --------------------------------------------------------------------------

  // TODO: [PIPEGEN] There should be a getter on the asmjit side that returns
  // the `ZoneAllocator` object that can be used for this kind of purpose.
  // It doesn't make sense to create another ZoneAllocator.

  template<typename T>
  inline T* newPartT() noexcept {
    void* p = cc->_codeZone.alloc(sizeof(T), 8);
    if (BL_UNLIKELY(!p))
      return nullptr;
    return new(p) T(this);
  }

  template<typename T, typename... Args>
  inline T* newPartT(Args&&... args) noexcept {
    void* p = cc->_codeZone.alloc(sizeof(T), 8);
    if (BL_UNLIKELY(!p))
      return nullptr;
    return new(p) T(this, std::forward<Args>(args)...);
  }

  FillPart* newFillPart(uint32_t fillType, FetchPart* dstPart, CompOpPart* compOpPart) noexcept;
  FetchPart* newFetchPart(uint32_t fetchType, uint32_t fetchPayload, uint32_t format) noexcept;
  CompOpPart* newCompOpPart(uint32_t compOp, FetchPart* dstPart, FetchPart* srcPart) noexcept;

  // --------------------------------------------------------------------------
  // [Init]
  // --------------------------------------------------------------------------

  void initPipeline(PipePart* root) noexcept;

  void onPreInitPart(PipePart* part) noexcept;
  void onPostInitPart(PipePart* part) noexcept;

  //! Generates a function of the given `signature`.
  void compileFunc(uint32_t signature) noexcept;

  // --------------------------------------------------------------------------
  // [Miscellaneous]
  // --------------------------------------------------------------------------

  BL_INLINE void rename(const OpArray& opArray, const char* name) noexcept {
    for (uint32_t i = 0; i < opArray.size(); i++)
      cc->rename(opArray[i].as<asmjit::BaseReg>(), "%s%u", name, unsigned(i));
  }

  // --------------------------------------------------------------------------
  // [Constants]
  // --------------------------------------------------------------------------

  void _initCommonTablePtr() noexcept;

  x86::Mem constAsMem(const void* c) noexcept;
  x86::Xmm constAsXmm(const void* c) noexcept;

  // --------------------------------------------------------------------------
  // [Registers / Memory]
  // --------------------------------------------------------------------------

  BL_NOINLINE void newRegArray(OpArray& dst, uint32_t n, uint32_t regType, const char* name) noexcept {
    BL_ASSERT(n <= OpArray::kMaxSize);
    dst._size = n;
    for (uint32_t i = 0; i < n; i++) {
      cc->_newRegFmt(dst[i].as<asmjit::BaseReg>(), regType, "%s%i", name, i);
    }
  }

  BL_INLINE void newXmmArray(OpArray& dst, uint32_t n, const char* name) noexcept {
    newRegArray(dst, n, asmjit::x86::Reg::kTypeXmm, name);
  }

  BL_INLINE void newYmmArray(OpArray& dst, uint32_t n, const char* name) noexcept {
    newRegArray(dst, n, asmjit::x86::Reg::kTypeYmm, name);
  }

  x86::Mem tmpStack(uint32_t size) noexcept;
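  // A hedged usage sketch of the register-array helpers above (the array and
  // the register names are made up for illustration). `newXmmArray()` allocates
  // `n` virtual XMM registers named "<name>0".."<name>N-1", which is the naming
  // scheme `rename()` relies on as well:
  //
  //   OpArray px;
  //   newXmmArray(px, 4, "px");   // creates virtual registers px0..px3
  //   rename(px, "pix");          // renames them to pix0..pix3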
  // --------------------------------------------------------------------------
  // [Emit - Commons]
  // --------------------------------------------------------------------------

  // Emit helpers used by GP.
  void iemit2(uint32_t instId, const Operand_& op1, int imm) noexcept;
  void iemit2(uint32_t instId, const Operand_& op1, const Operand_& op2) noexcept;
  void iemit3(uint32_t instId, const Operand_& op1, const Operand_& op2, int imm) noexcept;

  // Emit helpers that emit a MOVE from SrcT to DstT, used by pre-AVX instructions.
  // The `width` parameter is important as it describes how many bytes to read
  // when `src` is a memory location. Some instructions like PMOVZXBW read only
  // 8 bytes, but to perform the same operation in pre-SSE4.1 code we need to
  // read 8 bytes from memory and use PUNPCKLBW to interleave those bytes with
  // zero. PUNPCKLBW would read 16 bytes from memory and would require them to
  // be aligned to 16 bytes if used with a memory operand.
  void vemit_xmov(const Operand_& dst, const Operand_& src, uint32_t width) noexcept;
  void vemit_xmov(const OpArray& dst, const Operand_& src, uint32_t width) noexcept;
  void vemit_xmov(const OpArray& dst, const OpArray& src, uint32_t width) noexcept;

  // Emit helpers used by SSE|AVX intrinsics.
  void vemit_vv_vv(uint32_t packedId, const Operand_& dst_, const Operand_& src_) noexcept;
  void vemit_vv_vv(uint32_t packedId, const OpArray& dst_, const Operand_& src_) noexcept;
  void vemit_vv_vv(uint32_t packedId, const OpArray& dst_, const OpArray& src_) noexcept;

  void vemit_vvi_vi(uint32_t packedId, const Operand_& dst_, const Operand_& src_, uint32_t imm) noexcept;
  void vemit_vvi_vi(uint32_t packedId, const OpArray& dst_, const Operand_& src_, uint32_t imm) noexcept;
  void vemit_vvi_vi(uint32_t packedId, const OpArray& dst_, const OpArray& src_, uint32_t imm) noexcept;

  void vemit_vvi_vvi(uint32_t packedId, const Operand_& dst_, const Operand_& src_, uint32_t imm) noexcept;
  void vemit_vvi_vvi(uint32_t packedId, const OpArray& dst_, const Operand_& src_, uint32_t imm) noexcept;
  void vemit_vvi_vvi(uint32_t packedId, const OpArray& dst_, const OpArray& src_, uint32_t imm) noexcept;

  void vemit_vvv_vv(uint32_t packedId, const Operand_& dst_, const Operand_& src1_, const Operand_& src2_) noexcept;
  void vemit_vvv_vv(uint32_t packedId, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_) noexcept;
  void vemit_vvv_vv(uint32_t packedId, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_) noexcept;
  void vemit_vvv_vv(uint32_t packedId, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_) noexcept;

  void vemit_vvvi_vvi(uint32_t packedId, const Operand_& dst_, const Operand_& src1_, const Operand_& src2_, uint32_t imm) noexcept;
  void vemit_vvvi_vvi(uint32_t packedId, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_, uint32_t imm) noexcept;
  void vemit_vvvi_vvi(uint32_t packedId, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_, uint32_t imm) noexcept;
  void vemit_vvvi_vvi(uint32_t packedId, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_, uint32_t imm) noexcept;

  void vemit_vvvv_vvv(uint32_t packedId, const Operand_& dst_, const Operand_& src1_, const Operand_& src2_, const Operand_& src3_) noexcept;
  void vemit_vvvv_vvv(uint32_t packedId, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_, const Operand_& src3_) noexcept;
  void vemit_vvvv_vvv(uint32_t packedId, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_, const OpArray& src3_) noexcept;

#define I_EMIT_2(NAME, INST_ID)                              \
  template<typename Op1, typename Op2>                       \
  inline void NAME(const Op1& o1,                            \
                   const Op2& o2) noexcept {                 \
    iemit2(x86::Inst::kId##INST_ID, o1, o2);                 \
  }

#define I_EMIT_3(NAME, INST_ID)                              \
  template<typename Op1, typename Op2, typename Op3>         \
  inline void NAME(const Op1& o1,                            \
                   const Op2& o2,                            \
                   const Op3& o3) noexcept {                 \
    iemit3(x86::Inst::kId##INST_ID, o1, o2, o3);             \
  }

#define V_EMIT_VV_VV(NAME, PACKED_ID)                        \
  template<typename DstT, typename SrcT>                     \
  inline void NAME(const DstT& dst,                          \
                   const SrcT& src) noexcept {               \
    vemit_vv_vv(PACKED_ID, dst, src);                        \
  }

#define V_EMIT_VVI_VI(NAME, PACKED_ID)                       \
  template<typename DstT, typename SrcT>                     \
  inline void NAME(const DstT& dst,                          \
                   const SrcT& src,                          \
                   uint32_t imm) noexcept {                  \
    vemit_vvi_vi(PACKED_ID, dst, src, imm);                  \
  }

#define V_EMIT_VVI_VVI(NAME, PACKED_ID)                      \
  template<typename DstT, typename SrcT>                     \
  inline void NAME(const DstT& dst,                          \
                   const SrcT& src,                          \
                   uint32_t imm) noexcept {                  \
    vemit_vvi_vvi(PACKED_ID, dst, src, imm);                 \
  }

#define V_EMIT_VVi_VVi(NAME, PACKED_ID, IMM_VALUE)           \
  template<typename DstT, typename SrcT>                     \
  inline void NAME(const DstT& dst,                          \
                   const SrcT& src) noexcept {               \
    vemit_vvi_vvi(PACKED_ID, dst, src, IMM_VALUE);           \
  }

#define V_EMIT_VVV_VV(NAME, PACKED_ID)                       \
  template<typename DstT, typename Src1T, typename Src2T>    \
  inline void NAME(const DstT& dst,                          \
                   const Src1T& src1,                        \
                   const Src2T& src2) noexcept {             \
    vemit_vvv_vv(PACKED_ID, dst, src1, src2);                \
  }

#define V_EMIT_VVVI_VVI(NAME, PACKED_ID)                     \
  template<typename DstT, typename Src1T, typename Src2T>    \
  inline void NAME(const DstT& dst,                          \
                   const Src1T& src1,                        \
                   const Src2T& src2,                        \
                   uint32_t imm) noexcept {                  \
    vemit_vvvi_vvi(PACKED_ID, dst, src1, src2, imm);         \
  }

#define V_EMIT_VVVi_VVi(NAME, PACKED_ID, IMM_VALUE)          \
  template<typename DstT, typename Src1T, typename Src2T>    \
  inline void NAME(const DstT& dst,                          \
                   const Src1T& src1,                        \
                   const Src2T& src2) noexcept {             \
    vemit_vvvi_vvi(PACKED_ID, dst, src1, src2, IMM_VALUE);   \
  }

#define V_EMIT_VVVV_VVV(NAME, PACKED_ID)                     \
  template<typename DstT, typename Src1T, typename Src2T, typename Src3T> \
  inline void NAME(const DstT& dst,                          \
                   const Src1T& src1,                        \
                   const Src2T& src2,                        \
                   const Src3T& src3) noexcept {             \
    vemit_vvvv_vvv(PACKED_ID, dst, src1, src2, src3);        \
  }

#define PACK_AVX_SSE(AVX_ID, SSE_ID, W) \
  PackedInst::packAvxSse(x86::Inst::kId##AVX_ID, x86::Inst::kId##SSE_ID, PackedInst::kWidth##W)
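  // For illustration, a single table entry below such as
  //   V_EMIT_VVV_VV(vaddi32, PACK_AVX_SSE(Vpaddd, Paddd, Z))
  // roughly expands to the following member template (a sketch, not verbatim):
  //
  //   template<typename DstT, typename Src1T, typename Src2T>
  //   inline void vaddi32(const DstT& dst, const Src1T& src1, const Src2T& src2) noexcept {
  //     vemit_vvv_vv(PackedInst::packAvxSse(x86::Inst::kIdVpaddd, x86::Inst::kIdPaddd, PackedInst::kWidthZ),
  //                  dst, src1, src2);
  //   }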
  // --------------------------------------------------------------------------
  // [Emit - 'I' General Purpose Instructions]
  // --------------------------------------------------------------------------

  BL_NOINLINE void load8(const x86::Gp& dst, const x86::Mem& src) noexcept {
    x86::Mem src8 = src;
    src8.setSize(1);
    cc->movzx(dst.r32(), src8);
  }

  BL_NOINLINE void load16(const x86::Gp& dst, const x86::Mem& src) noexcept {
    x86::Mem src16 = src;
    src16.setSize(2);
    cc->movzx(dst.r32(), src16);
  }

  BL_NOINLINE void store8(const x86::Mem& dst, const x86::Gp& src) noexcept {
    x86::Mem dst8 = dst;
    dst8.setSize(1);
    cc->mov(dst8, src.r8());
  }

  BL_NOINLINE void store16(const x86::Mem& dst, const x86::Gp& src) noexcept {
    x86::Mem dst16 = dst;
    dst16.setSize(2);
    cc->mov(dst16, src.r16());
  }

  BL_INLINE void uMov(const x86::Gp& dst, const x86::Gp& src) noexcept {
    cc->mov(dst, src);
  }

  BL_INLINE void uMov(const x86::Gp& dst, const x86::Mem& src) noexcept {
    cc->mov(dst, src);
  }

  //! dst = src1 + src2.
  BL_NOINLINE void uAdd(const x86::Gp& dst, const x86::Gp& src1, const x86::Gp& src2) noexcept {
    BL_ASSERT(dst.size() == src1.size());
    BL_ASSERT(dst.size() == src2.size());

    if (dst.id() == src1.id()) {
      cc->add(dst, src2);
    }
    else if (dst.id() == src2.id()) {
      cc->add(dst, src1);
    }
    else if (dst.size() >= 4) {
      cc->lea(dst, x86::ptr(src1, src2));
    }
    else {
      cc->mov(dst, src1);
      cc->add(dst, src2);
    }
  }

  //! dst = src1 + [src2].
  BL_NOINLINE void uAdd(const x86::Gp& dst, const x86::Gp& src1, const x86::Mem& src2) noexcept {
    if (dst.id() != src1.id())
      cc->mov(dst, src1);
    cc->add(dst, src2);
  }

  //! dst = src1 + src2.
  BL_NOINLINE void uAdd(const x86::Gp& dst, const x86::Gp& src1, const Imm& src2) noexcept {
    if (dst.id() != src1.id() && src2.isInt32()) {
      cc->lea(dst, x86::ptr(src1, src2.i32()));
    }
    else {
      if (dst.id() != src1.id())
        cc->mov(dst, src1);
      cc->add(dst, src2);
    }
  }

  BL_NOINLINE void uAddsU8(const x86::Gp& dst, const x86::Gp& src1, const x86::Gp& src2) noexcept {
    BL_ASSERT(dst.size() == src1.size());
    BL_ASSERT(dst.size() == src2.size());

    if (dst.id() == src1.id()) {
      cc->add(dst.r8(), src2.r8());
    }
    else if (dst.id() == src2.id()) {
      cc->add(dst.r8(), src1.r8());
    }
    else {
      cc->mov(dst, src1);
      cc->add(dst, src2);
    }

    x86::Gp u8_msk = cc->newUInt32("@u8_msk");
    cc->sbb(u8_msk, u8_msk);
    cc->or_(dst.r8(), u8_msk.r8());
  }

  //! dst = src1 - src2.
  BL_NOINLINE void uSub(const x86::Gp& dst, const x86::Gp& src1, const x86::Gp& src2) noexcept {
    BL_ASSERT(dst.size() == src1.size());
    BL_ASSERT(dst.size() == src2.size());

    if (src1.id() == src2.id()) {
      cc->xor_(dst, dst);
    }
    else if (dst.id() == src1.id()) {
      cc->sub(dst, src2);
    }
    else if (dst.id() == src2.id()) {
      cc->neg(dst);
      cc->add(dst, src1);
    }
    else {
      cc->mov(dst, src1);
      cc->sub(dst, src2);
    }
  }

  //! dst = src1 - [src2].
  BL_NOINLINE void uSub(const x86::Gp& dst, const x86::Gp& src1, const x86::Mem& src2) noexcept {
    if (dst.id() != src1.id())
      cc->mov(dst, src1);
    cc->sub(dst, src2);
  }
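  // The uAdd/uSub overloads above pick between ADD/SUB, LEA, and NEG+ADD based
  // on how the destination aliases the sources. A hedged usage sketch (the
  // register names are made up for illustration):
  //
  //   x86::Gp x = cc->newUInt32("x");
  //   x86::Gp w = cc->newUInt32("w");
  //   x86::Gp e = cc->newUInt32("e");
  //   uAdd(e, x, w);   // three distinct 32-bit regs -> emits `lea e, [x + w]`
  //   uAdd(x, x, w);   // dst aliases src1           -> emits `add x, w`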
  //! dst = src1 - src2.
  BL_NOINLINE void uSub(const x86::Gp& dst, const x86::Gp& src1, const Imm& src2) noexcept {
    if (dst.id() != src1.id())
      cc->mov(dst, src1);
    cc->sub(dst, src2);
  }

  //! dst = src1 * src2.
  BL_NOINLINE void uMul(const x86::Gp& dst, const x86::Gp& src1, const x86::Gp& src2) noexcept {
    BL_ASSERT(dst.size() == src1.size());
    BL_ASSERT(dst.size() == src2.size());

    if (dst.id() == src1.id()) {
      cc->imul(dst, src2);
    }
    else if (dst.id() == src2.id()) {
      cc->imul(dst, src1);
    }
    else {
      cc->mov(dst, src1);
      cc->imul(dst, src2);
    }
  }

  //! dst = src1 * [src2].
  BL_NOINLINE void uMul(const x86::Gp& dst, const x86::Gp& src1, const x86::Mem& src2) noexcept {
    BL_ASSERT(dst.size() == src1.size());
    BL_ASSERT(dst.size() == src2.size());

    if (dst.id() != src1.id())
      cc->mov(dst, src1);
    cc->imul(dst, src2);
  }

  //! dst = src1 * src2.
  BL_NOINLINE void uMul(const x86::Gp& dst, const x86::Gp& src1, int src2) noexcept {
    BL_ASSERT(dst.size() == src1.size());

    if (src2 > 0) {
      switch (src2) {
        case 1:
          if (dst.id() != src1.id())
            cc->mov(dst, src1);
          return;

        case 2:
          if (dst.id() == src1.id())
            cc->shl(dst, 1);
          else
            cc->lea(dst, x86::ptr(src1, src1, 0));
          return;

        case 3:
          cc->lea(dst, x86::ptr(src1, src1, 1));
          return;

        case 4:
        case 8: {
          int shift = 2 + (src2 == 8);
          if (dst.id() == src1.id())
            cc->shl(dst, shift);
          else
            break;
          return;
        }
      }
    }

    if (dst.id() == src1.id())
      cc->imul(dst, src2);
    else
      cc->imul(dst, src1, src2);
  }

  BL_NOINLINE void uInv8(const x86::Gp& dst, const x86::Gp& src) noexcept {
    if (dst.id() != src.id())
      cc->mov(dst, src);
    cc->xor_(dst.r8(), 0xFF);
  }
  //! Integer division by 255 with correct rounding semantics.
  BL_NOINLINE void uDiv255(const x86::Gp& dst, const x86::Gp& src) noexcept {
    BL_ASSERT(dst.size() == src.size());

    if (dst.id() == src.id()) {
      // tmp = src + 128;
      // dst = (tmp + (tmp >> 8)) >> 8
      x86::Gp tmp = cc->newSimilarReg(dst, "@tmp");
      cc->sub(dst, -128);
      cc->mov(tmp, dst);
      cc->shr(tmp, 8);
      cc->add(dst, tmp);
      cc->shr(dst, 8);
    }
    else {
      // dst = (src + 128 + ((src + 128) >> 8)) >> 8
      cc->lea(dst, x86::ptr(src, 128));
      cc->shr(dst, 8);
      cc->lea(dst, x86::ptr(dst, src, 0, 128));
      cc->shr(dst, 8);
    }
  }
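  // A worked example of the rounding division used by uDiv255() above
  // (arithmetic sketch only, not emitted code). For products of two 8-bit
  // values (0..65025) the sequence computes round(x / 255):
  //
  //   x = 200 * 255 = 51000
  //   t = x + 128              = 51128
  //   t >> 8                   = 199
  //   (t + (t >> 8)) >> 8      = (51128 + 199) >> 8 = 200   // == 51000 / 255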
  BL_NOINLINE void uMul257hu16(const x86::Gp& dst, const x86::Gp& src) {
    BL_ASSERT(dst.size() == src.size());
    cc->imul(dst, src, 257);
    cc->shr(dst, 16);
  }

  template<typename A, typename B>
  BL_NOINLINE void uZeroIfEq(const A& a, const B& b) noexcept {
    Label L = cc->newLabel();

    cc->cmp(a, b);
    cc->jne(L);
    cc->mov(a, 0);
    cc->bind(L);
  }

  BL_NOINLINE void uJumpIfNotOpaqueMask(const x86::Gp& msk, const Label& target) noexcept {
    cc->cmp(msk.r8(), 255);
    cc->jnz(target);
  }

  // dst = abs(src)
  template<typename DstT, typename SrcT>
  BL_NOINLINE void uAbs(const DstT& dst, const SrcT& src) noexcept {
    if (dst.id() == src.id()) {
      x86::Gp tmp = cc->newSimilarReg(dst, "@tmp");

      cc->mov(tmp, dst);
      cc->neg(dst);
      cc->cmovs(dst, tmp);
    }
    else {
      cc->mov(dst, src);
      cc->neg(dst);
      cc->cmovs(dst, src);
    }
  }

  template<typename DstT, typename ValueT, typename LimitT>
  BL_NOINLINE void uBound0ToN(const DstT& dst, const ValueT& value, const LimitT& limit) noexcept {
    if (dst.id() == value.id()) {
      x86::Gp zero = cc->newSimilarReg(dst, "@zero");

      cc->xor_(zero, zero);
      cc->cmp(dst, limit);
      cc->cmova(dst, zero);
      cc->cmovg(dst, limit);
    }
    else {
      cc->xor_(dst, dst);
      cc->cmp(value, limit);
      cc->cmovbe(dst, value);
      cc->cmovg(dst, limit);
    }
  }

  template<typename DstT, typename SrcT>
  BL_NOINLINE void uReflect(const DstT& dst, const SrcT& src) noexcept {
    BL_ASSERT(dst.size() == src.size());
    int nBits = int(dst.size()) * 8 - 1;

    if (dst.id() == src.id()) {
      DstT copy = cc->newSimilarReg(dst, "@copy");
      cc->mov(copy, dst);
      cc->sar(copy, nBits);
      cc->xor_(dst, copy);
    }
    else {
      cc->mov(dst, src);
      cc->sar(dst, nBits);
      cc->xor_(dst, src);
    }
  }

  template<typename DstT, typename SrcT>
  BL_NOINLINE void uMod(const DstT& dst, const SrcT& src) noexcept {
    x86::Gp mod = cc->newSimilarReg(dst, "@mod");

    cc->xor_(mod, mod);
    cc->div(mod, dst, src);
    cc->mov(dst, mod);
  }

  BL_NOINLINE void uAdvanceAndDecrement(const x86::Gp& p, int pAdd, const x86::Gp& i, int iDec) noexcept {
    cc->add(p, pAdd);
    cc->sub(i, iDec);
  }

  //! dst += a * b.
  BL_NOINLINE void uAddMulImm(const x86::Gp& dst, const x86::Gp& a, int b) noexcept {
    switch (b) {
      case 1:
        cc->add(dst, a);
        return;

      case 2:
      case 4:
      case 8: {
        uint32_t shift = b == 2 ? 1 :
                         b == 4 ? 2 : 3;
        cc->lea(dst, x86::ptr(dst, a, shift));
        return;
      }

      default: {
        x86::Gp tmp = cc->newSimilarReg(dst, "tmp");
        cc->imul(tmp, a, b);
        cc->add(dst, tmp);
        return;
      }
    }
  }

  BL_NOINLINE void uLeaBpp(const x86::Gp& dst, const x86::Gp& src_, const x86::Gp& idx_, uint32_t scale, int32_t disp = 0) noexcept {
    x86::Gp src = src_.cloneAs(dst);
    x86::Gp idx = idx_.cloneAs(dst);

    switch (scale) {
      case 1:
        if (dst.id() == src.id() && disp == 0)
          cc->add(dst, idx);
        else
          cc->lea(dst, x86::ptr(src, idx, 0, disp));
        break;

      case 2:
        cc->lea(dst, x86::ptr(src, idx, 1, disp));
        break;

      case 3:
        cc->lea(dst, x86::ptr(src, idx, 1, disp));
        cc->add(dst, idx);
        break;

      case 4:
        cc->lea(dst, x86::ptr(src, idx, 2, disp));
        break;

      default:
        BL_NOT_REACHED();
    }
  }
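  // uLeaBpp() above computes `dst = src + idx * scale + disp`, where `scale` is
  // the pixel size in bytes; e.g. for 3 bpp it emits `lea dst, [src + idx*2 + disp]`
  // followed by `add dst, idx`. A hedged usage sketch (hypothetical registers,
  // assuming 4 bytes per pixel):
  //
  //   uLeaBpp(pixPtr, rowPtr, xIndex, 4);   // pixPtr = rowPtr + xIndex * 4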
  BL_NOINLINE void uShl(const x86::Gp& dst, const x86::Gp& src1, const x86::Gp& src2) noexcept {
    if (hasBMI2()) {
      cc->shlx(dst, src1, src2.cloneAs(dst));
    }
    else {
      if (dst.id() != src1.id())
        cc->mov(dst, src1);
      cc->shl(dst, src2.r8());
    }
  }

  BL_NOINLINE void uShl(const x86::Gp& dst, const x86::Gp& src1, const Imm& src2) noexcept {
    if (dst.id() != src1.id())
      cc->mov(dst, src1);
    cc->shl(dst, src2);
  }

  BL_NOINLINE void uShr(const x86::Gp& dst, const x86::Gp& src1, const x86::Gp& src2) noexcept {
    if (hasBMI2()) {
      cc->shrx(dst, src1, src2.cloneAs(dst));
    }
    else {
      if (dst.id() != src1.id())
        cc->mov(dst, src1);
      cc->shr(dst, src2.r8());
    }
  }

  BL_NOINLINE void uShr(const x86::Gp& dst, const x86::Gp& src1, const Imm& src2) noexcept {
    if (dst.id() != src1.id())
      cc->mov(dst, src1);
    cc->shr(dst, src2);
  }

  inline void uCTZ(const Operand& dst, const Operand& src) noexcept {
    // INTEL - No difference, `bsf` and `tzcnt` both have latency ~2.5 cycles.
    // AMD   - Big difference, `tzcnt` has only ~1.5 cycle latency while `bsf` has ~2.5 cycles.
    cc->emit(hasBMI() ? x86::Inst::kIdTzcnt : x86::Inst::kIdBsf, dst, src);
  }

  inline void uTest(const x86::Gp& ptr, uint32_t mask) noexcept {
    if (mask <= 0xFF && cc->is64Bit()) {
      // Shorter, but would limit the register allocator to AL|BL|CL|DL in
      // 32-bit mode, so we only use this construct in 64-bit mode.
      cc->test(ptr.r8(), mask);
    }
    else {
      cc->test(ptr, mask);
    }
  }

  inline void uPrefetch(const x86::Mem& mem) noexcept {
    cc->prefetcht0(mem);
  }

  // --------------------------------------------------------------------------
  // [Emit - 'V' Vector Instructions (128..512-bit SSE|AVX|AVX512)]
  // --------------------------------------------------------------------------

  // To make the code generation easier and more parametrizable we support both
  // SSE|AVX through the same interface (always a non-destructive source form),
  // and each intrinsic can accept either `Operand_` or `OpArray`, which can hold
  // up to 4 registers to form scalars, pairs, and quads. Each 'V' instruction
  // maps directly to the ISA, so check the optimization level before using them,
  // or use instructions starting with 'x' that are generic and designed to map
  // to the best instruction(s) possible.
  //
  // Also, multiple overloads are provided for convenience. Similarly to the
  // AsmJit design, we don't want to inline the expansion of `OpArray(op)` here,
  // so these overloads are implemented in pipecompiler.cpp.
  //
  // SSE instructions that require SSE3+ are suffixed with `_` to make it clear
  // that they are not part of the baseline instruction set. Operations that are
  // always provided don't have such a suffix and will be emulated when the
  // required extension is not available.
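  // A hedged usage sketch of the unified interface (hypothetical virtual
  // registers; the emitted instruction depends on the detected CPU features):
  //
  //   x86::Xmm a = cc->newXmm("a");
  //   x86::Xmm b = cc->newXmm("b");
  //   x86::Xmm d = cc->newXmm("d");
  //   vaddi16(d, a, b);   // AVX2: `vpaddw d, a, b`; SSE2: a move into `d` if needed, then `paddw d, b`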
  // Integer SIMD - Core.

  V_EMIT_VV_VV(vmov           , PACK_AVX_SSE(Vmovaps    , Movaps    , Z)) // AVX  | SSE2
  V_EMIT_VV_VV(vmov64         , PACK_AVX_SSE(Vmovq      , Movq      , X)) // AVX  | SSE2

  V_EMIT_VV_VV(vmovi8i16_     , PACK_AVX_SSE(Vpmovsxbw  , Pmovsxbw  , Z)) // AVX2 | SSE4.1
  V_EMIT_VV_VV(vmovu8u16_     , PACK_AVX_SSE(Vpmovzxbw  , Pmovzxbw  , Z)) // AVX2 | SSE4.1
  V_EMIT_VV_VV(vmovi8i32_     , PACK_AVX_SSE(Vpmovsxbd  , Pmovsxbd  , Z)) // AVX2 | SSE4.1
  V_EMIT_VV_VV(vmovu8u32_     , PACK_AVX_SSE(Vpmovzxbd  , Pmovzxbd  , Z)) // AVX2 | SSE4.1
  V_EMIT_VV_VV(vmovi8i64_     , PACK_AVX_SSE(Vpmovsxbq  , Pmovsxbq  , Z)) // AVX2 | SSE4.1
  V_EMIT_VV_VV(vmovu8u64_     , PACK_AVX_SSE(Vpmovzxbq  , Pmovzxbq  , Z)) // AVX2 | SSE4.1

  V_EMIT_VV_VV(vmovi16i32_    , PACK_AVX_SSE(Vpmovsxwd  , Pmovsxwd  , Z)) // AVX2 | SSE4.1
  V_EMIT_VV_VV(vmovu16u32_    , PACK_AVX_SSE(Vpmovzxwd  , Pmovzxwd  , Z)) // AVX2 | SSE4.1
  V_EMIT_VV_VV(vmovi16i64_    , PACK_AVX_SSE(Vpmovsxwq  , Pmovsxwq  , Z)) // AVX2 | SSE4.1
  V_EMIT_VV_VV(vmovu16u64_    , PACK_AVX_SSE(Vpmovzxwq  , Pmovzxwq  , Z)) // AVX2 | SSE4.1

  V_EMIT_VV_VV(vmovi32i64_    , PACK_AVX_SSE(Vpmovsxdq  , Pmovsxdq  , Z)) // AVX2 | SSE4.1
  V_EMIT_VV_VV(vmovu32u64_    , PACK_AVX_SSE(Vpmovzxdq  , Pmovzxdq  , Z)) // AVX2 | SSE4.1

  V_EMIT_VV_VV(vmovmsku8      , PACK_AVX_SSE(Vpmovmskb  , Pmovmskb  , Z)) // AVX2 | SSE2

  V_EMIT_VVVI_VVI(vinsertu8_  , PACK_AVX_SSE(Vpinsrb    , Pinsrb    , X)) // AVX2 | SSE4_1
  V_EMIT_VVVI_VVI(vinsertu16  , PACK_AVX_SSE(Vpinsrw    , Pinsrw    , X)) // AVX2 | SSE2
  V_EMIT_VVVI_VVI(vinsertu32_ , PACK_AVX_SSE(Vpinsrd    , Pinsrd    , X)) // AVX2 | SSE4_1
  V_EMIT_VVVI_VVI(vinsertu64_ , PACK_AVX_SSE(Vpinsrq    , Pinsrq    , X)) // AVX2 | SSE4_1

  V_EMIT_VVI_VVI(vextractu8_  , PACK_AVX_SSE(Vpextrb    , Pextrb    , X)) // AVX2 | SSE4_1
  V_EMIT_VVI_VVI(vextractu16  , PACK_AVX_SSE(Vpextrw    , Pextrw    , X)) // AVX2 | SSE2
  V_EMIT_VVI_VVI(vextractu32_ , PACK_AVX_SSE(Vpextrd    , Pextrd    , X)) // AVX2 | SSE4_1
  V_EMIT_VVI_VVI(vextractu64_ , PACK_AVX_SSE(Vpextrq    , Pextrq    , X)) // AVX2 | SSE4_1

  V_EMIT_VVV_VV(vunpackli8    , PACK_AVX_SSE(Vpunpcklbw , Punpcklbw , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vunpackhi8    , PACK_AVX_SSE(Vpunpckhbw , Punpckhbw , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vunpackli16   , PACK_AVX_SSE(Vpunpcklwd , Punpcklwd , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vunpackhi16   , PACK_AVX_SSE(Vpunpckhwd , Punpckhwd , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vunpackli32   , PACK_AVX_SSE(Vpunpckldq , Punpckldq , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vunpackhi32   , PACK_AVX_SSE(Vpunpckhdq , Punpckhdq , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vunpackli64   , PACK_AVX_SSE(Vpunpcklqdq, Punpcklqdq, Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vunpackhi64   , PACK_AVX_SSE(Vpunpckhqdq, Punpckhqdq, Z)) // AVX2 | SSE2

  V_EMIT_VVV_VV(vpacki32i16   , PACK_AVX_SSE(Vpackssdw  , Packssdw  , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vpacki32u16_  , PACK_AVX_SSE(Vpackusdw  , Packusdw  , Z)) // AVX2 | SSE4.1
  V_EMIT_VVV_VV(vpacki16i8    , PACK_AVX_SSE(Vpacksswb  , Packsswb  , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vpacki16u8    , PACK_AVX_SSE(Vpackuswb  , Packuswb  , Z)) // AVX2 | SSE2

  V_EMIT_VVV_VV(vswizi8v_     , PACK_AVX_SSE(Vpshufb    , Pshufb    , Z)) // AVX2 | SSSE3
  V_EMIT_VVI_VVI(vswizli16    , PACK_AVX_SSE(Vpshuflw   , Pshuflw   , Z)) // AVX2 | SSE2
  V_EMIT_VVI_VVI(vswizhi16    , PACK_AVX_SSE(Vpshufhw   , Pshufhw   , Z)) // AVX2 | SSE2
  V_EMIT_VVI_VVI(vswizi32     , PACK_AVX_SSE(Vpshufd    , Pshufd    , Z)) // AVX2 | SSE2

  V_EMIT_VVVI_VVI(vshufi32    , PACK_AVX_SSE(Vshufps    , Shufps    , Z)) // AVX  | SSE
  V_EMIT_VVVI_VVI(vshufi64    , PACK_AVX_SSE(Vshufpd    , Shufpd    , Z)) // AVX  | SSE2

  V_EMIT_VVV_VV(vand          , PACK_AVX_SSE(Vpand      , Pand      , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vandnot_a     , PACK_AVX_SSE(Vpandn     , Pandn     , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vor           , PACK_AVX_SSE(Vpor       , Por       , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vxor          , PACK_AVX_SSE(Vpxor      , Pxor      , Z)) // AVX2 | SSE2

  V_EMIT_VVV_VV(vavgu8        , PACK_AVX_SSE(Vpavgb     , Pavgb     , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vavgu16       , PACK_AVX_SSE(Vpavgw     , Pavgw     , Z)) // AVX2 | SSE2

  V_EMIT_VVV_VV(vsigni8_      , PACK_AVX_SSE(Vpsignb    , Psignb    , Z)) // AVX2 | SSSE3
  V_EMIT_VVV_VV(vsigni16_     , PACK_AVX_SSE(Vpsignw    , Psignw    , Z)) // AVX2 | SSSE3
  V_EMIT_VVV_VV(vsigni32_     , PACK_AVX_SSE(Vpsignd    , Psignd    , Z)) // AVX2 | SSSE3

  V_EMIT_VVV_VV(vaddi8        , PACK_AVX_SSE(Vpaddb     , Paddb     , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vaddi16       , PACK_AVX_SSE(Vpaddw     , Paddw     , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vaddi32       , PACK_AVX_SSE(Vpaddd     , Paddd     , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vaddi64       , PACK_AVX_SSE(Vpaddq     , Paddq     , Z)) // AVX2 | SSE2

  V_EMIT_VVV_VV(vaddsi8       , PACK_AVX_SSE(Vpaddsb    , Paddsb    , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vaddsu8       , PACK_AVX_SSE(Vpaddusb   , Paddusb   , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vaddsi16      , PACK_AVX_SSE(Vpaddsw    , Paddsw    , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vaddsu16      , PACK_AVX_SSE(Vpaddusw   , Paddusw   , Z)) // AVX2 | SSE2

  V_EMIT_VVV_VV(vsubi8        , PACK_AVX_SSE(Vpsubb     , Psubb     , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vsubi16       , PACK_AVX_SSE(Vpsubw     , Psubw     , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vsubi32       , PACK_AVX_SSE(Vpsubd     , Psubd     , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vsubi64       , PACK_AVX_SSE(Vpsubq     , Psubq     , Z)) // AVX2 | SSE2

  V_EMIT_VVV_VV(vsubsi8       , PACK_AVX_SSE(Vpsubsb    , Psubsb    , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vsubsi16      , PACK_AVX_SSE(Vpsubsw    , Psubsw    , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vsubsu8       , PACK_AVX_SSE(Vpsubusb   , Psubusb   , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vsubsu16      , PACK_AVX_SSE(Vpsubusw   , Psubusw   , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vmuli16       , PACK_AVX_SSE(Vpmullw    , Pmullw    , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vmulu16       , PACK_AVX_SSE(Vpmullw    , Pmullw    , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vmulhi16      , PACK_AVX_SSE(Vpmulhw    , Pmulhw    , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vmulhu16      , PACK_AVX_SSE(Vpmulhuw   , Pmulhuw   , Z)) // AVX2 | SSE2

  V_EMIT_VVV_VV(vmuli32_      , PACK_AVX_SSE(Vpmulld    , Pmulld    , Z)) // AVX2 | SSE4.1
  V_EMIT_VVV_VV(vmulu32_      , PACK_AVX_SSE(Vpmulld    , Pmulld    , Z)) // AVX2 | SSE4.1
  V_EMIT_VVV_VV(vmulxlli32_   , PACK_AVX_SSE(Vpmuldq    , Pmuldq    , Z)) // AVX2 | SSE4.1
  V_EMIT_VVV_VV(vmulxllu32    , PACK_AVX_SSE(Vpmuludq   , Pmuludq   , Z)) // AVX2 | SSE2

  V_EMIT_VVVi_VVi(vmulxllu64_ , PACK_AVX_SSE(Vpclmulqdq , Pclmulqdq , Z), 0x00) // AVX2 | PCLMULQDQ
  V_EMIT_VVVi_VVi(vmulxhlu64_ , PACK_AVX_SSE(Vpclmulqdq , Pclmulqdq , Z), 0x01) // AVX2 | PCLMULQDQ
  V_EMIT_VVVi_VVi(vmulxlhu64_ , PACK_AVX_SSE(Vpclmulqdq , Pclmulqdq , Z), 0x10) // AVX2 | PCLMULQDQ
  V_EMIT_VVVi_VVi(vmulxhhu64_ , PACK_AVX_SSE(Vpclmulqdq , Pclmulqdq , Z), 0x11) // AVX2 | PCLMULQDQ

  V_EMIT_VVV_VV(vmini8_       , PACK_AVX_SSE(Vpminsb    , Pminsb    , Z)) // AVX2 | SSE4.1
  V_EMIT_VVV_VV(vmaxi8_       , PACK_AVX_SSE(Vpmaxsb    , Pmaxsb    , Z)) // AVX2 | SSE4.1
  V_EMIT_VVV_VV(vminu8        , PACK_AVX_SSE(Vpminub    , Pminub    , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vmaxu8        , PACK_AVX_SSE(Vpmaxub    , Pmaxub    , Z)) // AVX2 | SSE2

  V_EMIT_VVV_VV(vmini16       , PACK_AVX_SSE(Vpminsw    , Pminsw    , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vmaxi16       , PACK_AVX_SSE(Vpmaxsw    , Pmaxsw    , Z)) // AVX2 | SSE2

  V_EMIT_VVV_VV(vmini32_      , PACK_AVX_SSE(Vpminsd    , Pminsd    , Z)) // AVX2 | SSE4.1
  V_EMIT_VVV_VV(vmaxi32_      , PACK_AVX_SSE(Vpmaxsd    , Pmaxsd    , Z)) // AVX2 | SSE4.1
  V_EMIT_VVV_VV(vminu32_      , PACK_AVX_SSE(Vpminud    , Pminud    , Z)) // AVX2 | SSE4.1
  V_EMIT_VVV_VV(vmaxu32_      , PACK_AVX_SSE(Vpmaxud    , Pmaxud    , Z)) // AVX2 | SSE4.1

  V_EMIT_VVV_VV(vcmpeqi8      , PACK_AVX_SSE(Vpcmpeqb   , Pcmpeqb   , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vcmpeqi16     , PACK_AVX_SSE(Vpcmpeqw   , Pcmpeqw   , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vcmpeqi32     , PACK_AVX_SSE(Vpcmpeqd   , Pcmpeqd   , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vcmpeqi64_    , PACK_AVX_SSE(Vpcmpeqq   , Pcmpeqq   , Z)) // AVX2 | SSE4.1

  V_EMIT_VVV_VV(vcmpgti8      , PACK_AVX_SSE(Vpcmpgtb   , Pcmpgtb   , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vcmpgti16     , PACK_AVX_SSE(Vpcmpgtw   , Pcmpgtw   , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vcmpgti32     , PACK_AVX_SSE(Vpcmpgtd   , Pcmpgtd   , Z)) // AVX2 | SSE2
  V_EMIT_VVV_VV(vcmpgti64_    , PACK_AVX_SSE(Vpcmpgtq   , Pcmpgtq   , Z)) // AVX2 | SSE4.2

  V_EMIT_VVI_VI(vslli16       , PACK_AVX_SSE(Vpsllw     , Psllw     , Z)) // AVX2 | SSE2
  V_EMIT_VVI_VI(vsrli16       , PACK_AVX_SSE(Vpsrlw     , Psrlw     , Z)) // AVX2 | SSE2
  V_EMIT_VVI_VI(vsrai16       , PACK_AVX_SSE(Vpsraw     , Psraw     , Z)) // AVX2 | SSE2

  V_EMIT_VVI_VI(vslli32       , PACK_AVX_SSE(Vpslld     , Pslld     , Z)) // AVX2 | SSE2
  V_EMIT_VVI_VI(vsrli32       , PACK_AVX_SSE(Vpsrld     , Psrld     , Z)) // AVX2 | SSE2
  V_EMIT_VVI_VI(vsrai32       , PACK_AVX_SSE(Vpsrad     , Psrad     , Z)) // AVX2 | SSE2

  V_EMIT_VVI_VI(vslli64       , PACK_AVX_SSE(Vpsllq     , Psllq     , Z)) // AVX2 | SSE2
  V_EMIT_VVI_VI(vsrli64       , PACK_AVX_SSE(Vpsrlq     , Psrlq     , Z)) // AVX2 | SSE2

  V_EMIT_VVI_VI(vslli128b     , PACK_AVX_SSE(Vpslldq    , Pslldq    , Z)) // AVX2 | SSE2
  V_EMIT_VVI_VI(vsrli128b     , PACK_AVX_SSE(Vpsrldq    , Psrldq    , Z)) // AVX2 | SSE2

  V_EMIT_VVVV_VVV(vblendv8_   , PACK_AVX_SSE(Vpblendvb  , Pblendvb  , Z)) // AVX2 | SSE4.1
  V_EMIT_VVVI_VVI(vblend16_   , PACK_AVX_SSE(Vpblendw   , Pblendw   , Z)) // AVX2 | SSE4.1
  V_EMIT_VVV_VV(vhaddi16_     , PACK_AVX_SSE(Vphaddw    , Phaddw    , Z)) // AVX2 | SSSE3
  V_EMIT_VVV_VV(vhaddi32_     , PACK_AVX_SSE(Vphaddd    , Phaddd    , Z)) // AVX2 | SSSE3

  V_EMIT_VVV_VV(vhsubi16_     , PACK_AVX_SSE(Vphsubw    , Phsubw    , Z)) // AVX2 | SSSE3
  V_EMIT_VVV_VV(vhsubi32_     , PACK_AVX_SSE(Vphsubd    , Phsubd    , Z)) // AVX2 | SSSE3

  V_EMIT_VVV_VV(vhaddsi16_    , PACK_AVX_SSE(Vphaddsw   , Phaddsw   , Z)) // AVX2 | SSSE3
  V_EMIT_VVV_VV(vhsubsi16_    , PACK_AVX_SSE(Vphsubsw   , Phsubsw   , Z)) // AVX2 | SSSE3

  // Integer SIMD - Miscellaneous.

  V_EMIT_VV_VV(vtest_         , PACK_AVX_SSE(Vptest     , Ptest     , Z)) // AVX2 | SSE4_1

  // Integer SIMD - Consult X86 manual before using these...

  V_EMIT_VVV_VV(vsadu8        , PACK_AVX_SSE(Vpsadbw    , Psadbw    , Z)) // AVX2 | SSE2  [dst.u64[0..X] = SUM{0..7}(ABS(src1.u8[N] - src2.u8[N]))]
  V_EMIT_VVV_VV(vmulrhi16_    , PACK_AVX_SSE(Vpmulhrsw  , Pmulhrsw  , Z)) // AVX2 | SSSE3 [dst.i16[0..X] = (((src1.i16[0] * src2.i16[0]) >> 14) + 1) >> 1]
  V_EMIT_VVV_VV(vmaddsu8i8_   , PACK_AVX_SSE(Vpmaddubsw , Pmaddubsw , Z)) // AVX2 | SSSE3 [dst.i16[0..X] = SAT(src1.u8[0] * src2.i8[0] + src1.u8[1] * src2.i8[1])]
  V_EMIT_VVV_VV(vmaddi16      , PACK_AVX_SSE(Vpmaddwd   , Pmaddwd   , Z)) // AVX2 | SSE2  [dst.i32[0..X] = (src1.i16[0] * src2.i16[0] + src1.i16[1] * src2.i16[1])]
  V_EMIT_VVVI_VVI(vmpsadu8_   , PACK_AVX_SSE(Vmpsadbw   , Mpsadbw   , Z)) // AVX2 | SSE4.1
  V_EMIT_VVVI_VVI(valignr8_   , PACK_AVX_SSE(Vpalignr   , Palignr   , Z)) // AVX2 | SSSE3
  V_EMIT_VV_VV(vhminposu16_   , PACK_AVX_SSE(Vphminposuw, Phminposuw, Z)) // AVX2 | SSE4_1

  // Floating Point - Core.

  V_EMIT_VV_VV(vmovaps        , PACK_AVX_SSE(Vmovaps    , Movaps    , Z)) // AVX  | SSE
  V_EMIT_VV_VV(vmovapd        , PACK_AVX_SSE(Vmovapd    , Movapd    , Z)) // AVX  | SSE2
  V_EMIT_VV_VV(vmovups        , PACK_AVX_SSE(Vmovups    , Movups    , Z)) // AVX  | SSE
  V_EMIT_VV_VV(vmovupd        , PACK_AVX_SSE(Vmovupd    , Movupd    , Z)) // AVX  | SSE2

  V_EMIT_VVV_VV(vmovlps2x     , PACK_AVX_SSE(Vmovlps    , Movlps    , X)) // AVX  | SSE
  V_EMIT_VVV_VV(vmovhps2x     , PACK_AVX_SSE(Vmovhps    , Movhps    , X)) // AVX  | SSE

  V_EMIT_VVV_VV(vmovlhps2x    , PACK_AVX_SSE(Vmovlhps   , Movlhps   , X)) // AVX  | SSE
  V_EMIT_VVV_VV(vmovhlps2x    , PACK_AVX_SSE(Vmovhlps   , Movhlps   , X)) // AVX  | SSE

  V_EMIT_VVV_VV(vmovlpd       , PACK_AVX_SSE(Vmovlpd    , Movlpd    , X)) // AVX  | SSE
  V_EMIT_VVV_VV(vmovhpd       , PACK_AVX_SSE(Vmovhpd    , Movhpd    , X)) // AVX  | SSE

  V_EMIT_VV_VV(vmovduplps_    , PACK_AVX_SSE(Vmovsldup  , Movsldup  , Z)) // AVX  | SSE3
  V_EMIT_VV_VV(vmovduphps_    , PACK_AVX_SSE(Vmovshdup  , Movshdup  , Z)) // AVX  | SSE3

  V_EMIT_VV_VV(vmovduplpd_    , PACK_AVX_SSE(Vmovddup   , Movddup   , Z)) // AVX  | SSE3

  V_EMIT_VV_VV(vmovmskps      , PACK_AVX_SSE(Vmovmskps  , Movmskps  , Z)) // AVX  | SSE
  V_EMIT_VV_VV(vmovmskpd      , PACK_AVX_SSE(Vmovmskpd  , Movmskpd  , Z)) // AVX  | SSE2

  V_EMIT_VVI_VVI(vinsertss_   , PACK_AVX_SSE(Vinsertps  , Insertps  , X)) // AVX  | SSE4_1
  V_EMIT_VVI_VVI(vextractss_  , PACK_AVX_SSE(Vextractps , Extractps , X)) // AVX  | SSE4_1

  V_EMIT_VVV_VV(vunpacklps    , PACK_AVX_SSE(Vunpcklps  , Unpcklps  , Z)) // AVX  | SSE
  V_EMIT_VVV_VV(vunpacklpd    , PACK_AVX_SSE(Vunpcklpd  , Unpcklpd  , Z)) // AVX  | SSE2
  V_EMIT_VVV_VV(vunpackhps    , PACK_AVX_SSE(Vunpckhps  , Unpckhps  , Z)) // AVX  | SSE
  V_EMIT_VVV_VV(vunpackhpd    , PACK_AVX_SSE(Vunpckhpd  , Unpckhpd  , Z)) // AVX  | SSE2

  V_EMIT_VVVI_VVI(vshufps     , PACK_AVX_SSE(Vshufps    , Shufps    , Z)) // AVX  | SSE
  V_EMIT_VVVI_VVI(vshufpd     , PACK_AVX_SSE(Vshufpd    , Shufpd    , Z)) // AVX  | SSE2
  V_EMIT_VVV_VV(vandps        , PACK_AVX_SSE(Vandps     , Andps     , Z)) // AVX  | SSE
  V_EMIT_VVV_VV(vandpd        , PACK_AVX_SSE(Vandpd     , Andpd     , Z)) // AVX  | SSE2
  V_EMIT_VVV_VV(vandnot_aps   , PACK_AVX_SSE(Vandnps    , Andnps    , Z)) // AVX  | SSE
  V_EMIT_VVV_VV(vandnot_apd   , PACK_AVX_SSE(Vandnpd    , Andnpd    , Z)) // AVX  | SSE2
  V_EMIT_VVV_VV(vorps         , PACK_AVX_SSE(Vorps      , Orps      , Z)) // AVX  | SSE
  V_EMIT_VVV_VV(vorpd         , PACK_AVX_SSE(Vorpd      , Orpd      , Z)) // AVX  | SSE2
  V_EMIT_VVV_VV(vxorps        , PACK_AVX_SSE(Vxorps     , Xorps     , Z)) // AVX  | SSE
  V_EMIT_VVV_VV(vxorpd        , PACK_AVX_SSE(Vxorpd     , Xorpd     , Z)) // AVX  | SSE2

  V_EMIT_VVV_VV(vaddss        , PACK_AVX_SSE(Vaddss     , Addss     , X)) // AVX  | SSE
  V_EMIT_VVV_VV(vaddsd        , PACK_AVX_SSE(Vaddsd     , Addsd     , X)) // AVX  | SSE2
  V_EMIT_VVV_VV(vaddps        , PACK_AVX_SSE(Vaddps     , Addps     , Z)) // AVX  | SSE
  V_EMIT_VVV_VV(vaddpd        , PACK_AVX_SSE(Vaddpd     , Addpd     , Z)) // AVX  | SSE2

  V_EMIT_VVV_VV(vsubss        , PACK_AVX_SSE(Vsubss     , Subss     , X)) // AVX  | SSE
  V_EMIT_VVV_VV(vsubsd        , PACK_AVX_SSE(Vsubsd     , Subsd     , X)) // AVX  | SSE2
  V_EMIT_VVV_VV(vsubps        , PACK_AVX_SSE(Vsubps     , Subps     , Z)) // AVX  | SSE
  V_EMIT_VVV_VV(vsubpd        , PACK_AVX_SSE(Vsubpd     , Subpd     , Z)) // AVX  | SSE2

  V_EMIT_VVV_VV(vaddsubps_    , PACK_AVX_SSE(Vaddsubps  , Addsubps  , Z)) // AVX  | SSE3
  V_EMIT_VVV_VV(vaddsubpd_    , PACK_AVX_SSE(Vaddsubpd  , Addsubpd  , Z)) // AVX  | SSE3

  V_EMIT_VVV_VV(vmulss        , PACK_AVX_SSE(Vmulss     , Mulss     , X)) // AVX  | SSE
  V_EMIT_VVV_VV(vmulsd        , PACK_AVX_SSE(Vmulsd     , Mulsd     , X)) // AVX  | SSE2
  V_EMIT_VVV_VV(vmulps        , PACK_AVX_SSE(Vmulps     , Mulps     , Z)) // AVX  | SSE
  V_EMIT_VVV_VV(vmulpd        , PACK_AVX_SSE(Vmulpd     , Mulpd     , Z)) // AVX  | SSE2

  V_EMIT_VVV_VV(vdivss        , PACK_AVX_SSE(Vdivss     , Divss     , X)) // AVX  | SSE
  V_EMIT_VVV_VV(vdivsd        , PACK_AVX_SSE(Vdivsd     , Divsd     , X)) // AVX  | SSE2
  V_EMIT_VVV_VV(vdivps        , PACK_AVX_SSE(Vdivps     , Divps     , Z)) // AVX  | SSE
  V_EMIT_VVV_VV(vdivpd        , PACK_AVX_SSE(Vdivpd     , Divpd     , Z)) // AVX  | SSE2

  V_EMIT_VVV_VV(vminss        , PACK_AVX_SSE(Vminss     , Minss     , X)) // AVX  | SSE
  V_EMIT_VVV_VV(vminsd        , PACK_AVX_SSE(Vminsd     , Minsd     , X)) // AVX  | SSE2
  V_EMIT_VVV_VV(vminps        , PACK_AVX_SSE(Vminps     , Minps     , Z)) // AVX  | SSE
  V_EMIT_VVV_VV(vminpd        , PACK_AVX_SSE(Vminpd     , Minpd     , Z)) // AVX  | SSE2

  V_EMIT_VVV_VV(vmaxss        , PACK_AVX_SSE(Vmaxss     , Maxss     , X)) // AVX  | SSE
  V_EMIT_VVV_VV(vmaxsd        , PACK_AVX_SSE(Vmaxsd     , Maxsd     , X)) // AVX  | SSE2
  V_EMIT_VVV_VV(vmaxps        , PACK_AVX_SSE(Vmaxps     , Maxps     , Z)) // AVX  | SSE
  V_EMIT_VVV_VV(vmaxpd        , PACK_AVX_SSE(Vmaxpd     , Maxpd     , Z)) // AVX  | SSE2

  V_EMIT_VVV_VV(vsqrtss       , PACK_AVX_SSE(Vsqrtss    , Sqrtss    , X)) // AVX  | SSE
  V_EMIT_VVV_VV(vsqrtsd       , PACK_AVX_SSE(Vsqrtsd    , Sqrtsd    , X)) // AVX  | SSE2
  V_EMIT_VV_VV(vsqrtps        , PACK_AVX_SSE(Vsqrtps    , Sqrtps    , Z)) // AVX  | SSE
  V_EMIT_VV_VV(vsqrtpd        , PACK_AVX_SSE(Vsqrtpd    , Sqrtpd    , Z)) // AVX  | SSE2

  V_EMIT_VVV_VV(vrcpss        , PACK_AVX_SSE(Vrcpss     , Rcpss     , X)) // AVX  | SSE
  V_EMIT_VV_VV(vrcpps         , PACK_AVX_SSE(Vrcpps     , Rcpps     , Z)) // AVX  | SSE

  V_EMIT_VVV_VV(vrsqrtss      , PACK_AVX_SSE(Vrsqrtss   , Rsqrtss   , X)) // AVX  | SSE
  V_EMIT_VV_VV(vrsqrtps       , PACK_AVX_SSE(Vrsqrtps   , Rsqrtps   , Z)) // AVX  | SSE

  V_EMIT_VVVI_VVI(vdpps_      , PACK_AVX_SSE(Vdpps      , Dpps      , Z)) // AVX  | SSE4.1
  V_EMIT_VVVI_VVI(vdppd_      , PACK_AVX_SSE(Vdppd      , Dppd      , Z)) // AVX  | SSE4.1

  V_EMIT_VVVI_VVI(vroundss_   , PACK_AVX_SSE(Vroundss   , Roundss   , X)) // AVX  | SSE4.1
  V_EMIT_VVVI_VVI(vroundsd_   , PACK_AVX_SSE(Vroundsd   , Roundsd   , X)) // AVX  | SSE4.1
  V_EMIT_VVI_VVI(vroundps_    , PACK_AVX_SSE(Vroundps   , Roundps   , Z)) // AVX  | SSE4.1
  V_EMIT_VVI_VVI(vroundpd_    , PACK_AVX_SSE(Vroundpd   , Roundpd   , Z)) // AVX  | SSE4.1

  V_EMIT_VVVI_VVI(vcmpss      , PACK_AVX_SSE(Vcmpss     , Cmpss     , X)) // AVX  | SSE
  V_EMIT_VVVI_VVI(vcmpsd      , PACK_AVX_SSE(Vcmpsd     , Cmpsd     , X)) // AVX  | SSE2
  V_EMIT_VVVI_VVI(vcmpps      , PACK_AVX_SSE(Vcmpps     , Cmpps     , Z)) // AVX  | SSE
  V_EMIT_VVVI_VVI(vcmppd      , PACK_AVX_SSE(Vcmppd     , Cmppd     , Z)) // AVX  | SSE2

  V_EMIT_VVVV_VVV(vblendvps_  , PACK_AVX_SSE(Vblendvps  , Blendvps  , Z)) // AVX  | SSE4.1
  V_EMIT_VVVV_VVV(vblendvpd_  , PACK_AVX_SSE(Vblendvpd  , Blendvpd  , Z)) // AVX  | SSE4.1
  V_EMIT_VVVI_VVI(vblendps_   , PACK_AVX_SSE(Vblendps   , Blendps   , Z)) // AVX  | SSE4.1
  V_EMIT_VVVI_VVI(vblendpd_   , PACK_AVX_SSE(Vblendpd   , Blendpd   , Z)) // AVX  | SSE4.1

  V_EMIT_VV_VV(vcvti32ps      , PACK_AVX_SSE(Vcvtdq2ps  , Cvtdq2ps  , Z)) // AVX  | SSE2
  V_EMIT_VV_VV(vcvtpdps       , PACK_AVX_SSE(Vcvtpd2ps  , Cvtpd2ps  , Z)) // AVX  | SSE2

  V_EMIT_VV_VV(vcvti32pd      , PACK_AVX_SSE(Vcvtdq2pd  , Cvtdq2pd  , Z)) // AVX  | SSE2
  V_EMIT_VV_VV(vcvtpspd       , PACK_AVX_SSE(Vcvtps2pd  , Cvtps2pd  , Z)) // AVX  | SSE2

  V_EMIT_VV_VV(vcvtpsi32      , PACK_AVX_SSE(Vcvtps2dq  , Cvtps2dq  , Z)) // AVX  | SSE2
  V_EMIT_VV_VV(vcvtpdi32      , PACK_AVX_SSE(Vcvtpd2dq  , Cvtpd2dq  , Z)) // AVX  | SSE2

  V_EMIT_VV_VV(vcvttpsi32     , PACK_AVX_SSE(Vcvttps2dq , Cvttps2dq , Z)) // AVX  | SSE2
  V_EMIT_VV_VV(vcvttpdi32     , PACK_AVX_SSE(Vcvttpd2dq , Cvttpd2dq , Z)) // AVX  | SSE2

  V_EMIT_VVV_VV(vcvtsdss      , PACK_AVX_SSE(Vcvtsd2ss  , Cvtsd2ss  , X)) // AVX  | SSE2
  V_EMIT_VVV_VV(vcvtsssd      , PACK_AVX_SSE(Vcvtss2sd  , Cvtss2sd  , X)) // AVX  | SSE2

  V_EMIT_VVV_VV(vcvtsiss      , PACK_AVX_SSE(Vcvtsi2ss  , Cvtsi2ss  , X)) // AVX  | SSE
  V_EMIT_VVV_VV(vcvtsisd      , PACK_AVX_SSE(Vcvtsi2sd  , Cvtsi2sd  , X)) // AVX  | SSE2

  V_EMIT_VV_VV(vcvtsssi       , PACK_AVX_SSE(Vcvtss2si  , Cvtss2si  , X)) // AVX  | SSE
  V_EMIT_VV_VV(vcvtsdsi       , PACK_AVX_SSE(Vcvtsd2si  , Cvtsd2si  , X)) // AVX  | SSE2

  V_EMIT_VV_VV(vcvttsssi      , PACK_AVX_SSE(Vcvttss2si , Cvttss2si , X)) // AVX  | SSE
  V_EMIT_VV_VV(vcvttsdsi      , PACK_AVX_SSE(Vcvttsd2si , Cvttsd2si , X)) // AVX  | SSE2

  V_EMIT_VVV_VV(vhaddps_      , PACK_AVX_SSE(Vhaddps    , Haddps    , Z)) // AVX  | SSE3
  V_EMIT_VVV_VV(vhaddpd_      , PACK_AVX_SSE(Vhaddpd    , Haddpd    , Z)) // AVX  | SSE3
  V_EMIT_VVV_VV(vhsubps_      , PACK_AVX_SSE(Vhsubps    , Hsubps    , Z)) // AVX  | SSE3
  V_EMIT_VVV_VV(vhsubpd_      , PACK_AVX_SSE(Vhsubpd    , Hsubpd    , Z)) // AVX  | SSE3

  // Floating Point - Miscellaneous.

  V_EMIT_VV_VV(vcomiss        , PACK_AVX_SSE(Vcomiss    , Comiss    , X)) // AVX  | SSE
  V_EMIT_VV_VV(vcomisd        , PACK_AVX_SSE(Vcomisd    , Comisd    , X)) // AVX  | SSE2
  V_EMIT_VV_VV(vucomiss       , PACK_AVX_SSE(Vucomiss   , Ucomiss   , X)) // AVX  | SSE
  V_EMIT_VV_VV(vucomisd       , PACK_AVX_SSE(Vucomisd   , Ucomisd   , X)) // AVX  | SSE2

  // Initialization.
  inline void vzeropi(const Operand_& dst) noexcept { vemit_vvv_vv(PACK_AVX_SSE(Vpxor , Pxor , Z), dst, dst, dst); }
  inline void vzerops(const Operand_& dst) noexcept { vemit_vvv_vv(PACK_AVX_SSE(Vxorps, Xorps, Z), dst, dst, dst); }
  inline void vzeropd(const Operand_& dst) noexcept { vemit_vvv_vv(PACK_AVX_SSE(Vxorpd, Xorpd, Z), dst, dst, dst); }

  inline void vzeropi(const OpArray& dst) noexcept { for (uint32_t i = 0; i < dst.size(); i++) vzeropi(dst[i]); }
  inline void vzerops(const OpArray& dst) noexcept { for (uint32_t i = 0; i < dst.size(); i++) vzerops(dst[i]); }
  inline void vzeropd(const OpArray& dst) noexcept { for (uint32_t i = 0; i < dst.size(); i++) vzeropd(dst[i]); }

  // Conversion.

  inline void vmovsi32(const x86::Vec& dst, const x86::Gp& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovd, Movd, X), dst, src); }
  inline void vmovsi64(const x86::Vec& dst, const x86::Gp& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovq, Movq, X), dst, src); }

  inline void vmovsi32(const x86::Gp& dst, const x86::Vec& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovd, Movd, X), dst, src); }
  inline void vmovsi64(const x86::Gp& dst, const x86::Vec& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovq, Movq, X), dst, src); }

  // Memory Load & Store.

  BL_NOINLINE void vloadi8(const Operand_& dst, const x86::Mem& src) noexcept {
    if (hasSSE4_1()) {
      vzeropi(dst);
      vinsertu8_(dst, dst, src, 0);
    }
    else {
      x86::Gp tmp = cc->newUInt32("@tmp");
      load8(tmp, src);
      vmovsi32(dst.as<x86::Xmm>(), tmp);
    }
  }

  BL_NOINLINE void vloadu8_u16_2x(const Operand_& dst, const x86::Mem& lo, const x86::Mem& hi) noexcept {
    x86::Gp reg = cc->newUInt32("@tmp");
    x86::Mem mLo(lo);
    x86::Mem mHi(hi);

    mLo.setSize(1);
    mHi.setSize(1);

    cc->movzx(reg, mHi);
    cc->shl(reg, 16);
    cc->mov(reg.r8(), mLo);
    vmovsi32(dst.as<x86::Xmm>(), reg);
  }

  BL_NOINLINE void vloadi16(const Operand_& dst, const x86::Mem& src) noexcept {
    if (hasSSE4_1()) {
      vzeropi(dst);
      vinsertu16(dst, dst, src, 0);
    }
    else {
      x86::Gp tmp = cc->newUInt32("@tmp");
      load16(tmp, src);
      vmovsi32(dst.as<x86::Xmm>(), tmp);
    }
  }

  inline void vloadi32(const Operand_& dst, const x86::Mem& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovd, Movd, X), dst, src); }
  inline void vloadi64(const Operand_& dst, const x86::Mem& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovq, Movq, X), dst, src); }

  inline void vloadi128a(const Operand_& dst, const x86::Mem& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovdqa, Movaps, X), dst, src); }
  inline void vloadi128u(const Operand_& dst, const x86::Mem& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovdqu, Movups, X), dst, src); }
  inline void vloadi128u_ro(const Operand_& dst, const x86::Mem& src) noexcept { vemit_vv_vv(PackedInst::packIntrin(kIntrin2Vloadi128uRO), dst, src); }

  inline void vloadi256a(const Operand_& dst, const x86::Mem& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovdqa, Movaps, Y), dst, src); }
  inline void vloadi256u(const Operand_& dst, const x86::Mem& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovdqu, Movups, Y), dst, src); }
  inline void vloadi256u_ro(const Operand_& dst, const x86::Mem& src) noexcept { vemit_vv_vv(PackedInst::packIntrin(kIntrin2Vloadi128uRO), dst, src); }

  inline void vloadi64_u8u16_(const Operand_& dst, const x86::Mem& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vpmovzxbw, Pmovzxbw, X), dst, src); }
  inline void vloadi32_u8u32_(const Operand_& dst, const x86::Mem& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vpmovzxbd, Pmovzxbd, X), dst, src); }
  inline void vloadi16_u8u64_(const Operand_& dst, const x86::Mem& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vpmovzxbq, Pmovzxbq, X), dst, src); }
  inline void vloadi64_u16u32_(const Operand_& dst, const x86::Mem& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vpmovzxwd, Pmovzxwd, X), dst, src); }
  inline void vloadi32_u16u64_(const Operand_& dst, const x86::Mem& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vpmovzxwq, Pmovzxwq, X), dst, src); }
  inline void vloadi64_u32u64_(const Operand_& dst, const x86::Mem& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vpmovzxdq, Pmovzxdq, X), dst, src); }

  inline void vloadi64_i8i16_(const Operand_& dst, const x86::Mem& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vpmovsxbw, Pmovsxbw, X), dst, src); }
  inline void vloadi32_i8i32_(const Operand_& dst, const x86::Mem& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vpmovsxbd, Pmovsxbd, X), dst, src); }
  inline void vloadi16_i8i64_(const Operand_& dst, const x86::Mem& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vpmovsxbq, Pmovsxbq, X), dst, src); }
  inline void vloadi64_i16i32_(const Operand_& dst, const x86::Mem& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vpmovsxwd, Pmovsxwd, X), dst, src); }
  inline void vloadi32_i16i64_(const Operand_& dst, const x86::Mem& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vpmovsxwq, Pmovsxwq, X), dst, src); }
  inline void vloadi64_i32i64_(const Operand_& dst, const x86::Mem& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vpmovsxdq, Pmovsxdq, X), dst, src); }

  inline void vstorei32(const x86::Mem& dst, const Operand_& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovd, Movd, X), dst, src); }
  inline void vstorei32(const x86::Mem& dst, const Operand_& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovd, Movd, X), dst, src); }
  inline void vstorei64(const x86::Mem& dst, const Operand_& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovq, Movq, X), dst, src); }

  inline void vstorei128a(const x86::Mem& dst, const Operand_& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovdqa, Movaps, X), dst, src); }
  inline void vstorei128u(const x86::Mem& dst, const Operand_& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovdqu, Movups, X), dst, src); }

  inline void vstorei128x(const x86::Mem& dst, const x86::Vec& src, uint32_t alignment) noexcept {
    if (alignment >= 16)
      vstorei128a(dst, src);
    else
      vstorei128u(dst, src);
  }

  inline void vstorei256a(const x86::Mem& dst, const Operand_& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovdqa, Movaps, Y), dst, src); }
  inline void vstorei256u(const x86::Mem& dst, const Operand_& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovdqu, Movups, Y), dst, src); }

  inline void vstorei256x(const x86::Mem& dst, const x86::Vec& src, uint32_t alignment) noexcept {
    if (alignment >= 32)
      vstorei256a(dst, src);
    else
      vstorei256u(dst, src);
  }

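  // Note: vstorei128x() and vstorei256x() pick the aligned or unaligned form from
  // the `alignment` argument, which is known when the pipeline is compiled, so a
  // caller that has already proved 16-byte alignment can simply write, for example:
  //
  //   pc->vstorei128x(x86::ptr(dPtr), pixels, 16);  // emits an aligned store
  //   pc->vstorei128x(x86::ptr(dPtr), pixels, 1);   // emits an unaligned store
  //
  // (`pc`, `dPtr`, and `pixels` are illustrative names, not part of this header.)
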
  inline void vloadss(const Operand_& dst, const x86::Mem& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovss, Movss, X), dst, src); }
  inline void vloadsd(const Operand_& dst, const x86::Mem& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovsd, Movsd, X), dst, src); }

  inline void vloadps_64l(const Operand_& dst, const Operand_& src1, const x86::Mem& src2) noexcept { vemit_vvv_vv(PACK_AVX_SSE(Vmovlps, Movlps, X), dst, src1, src2); }
  inline void vloadps_64h(const Operand_& dst, const Operand_& src1, const x86::Mem& src2) noexcept { vemit_vvv_vv(PACK_AVX_SSE(Vmovhps, Movhps, X), dst, src1, src2); }
  inline void vloadpd_64l(const Operand_& dst, const Operand_& src1, const x86::Mem& src2) noexcept { vemit_vvv_vv(PACK_AVX_SSE(Vmovlpd, Movlpd, X), dst, src1, src2); }
  inline void vloadpd_64h(const Operand_& dst, const Operand_& src1, const x86::Mem& src2) noexcept { vemit_vvv_vv(PACK_AVX_SSE(Vmovhpd, Movhpd, X), dst, src1, src2); }

  inline void vloadps_128a(const Operand_& dst, const x86::Mem& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovaps, Movaps, X), dst, src); }
  inline void vloadps_128u(const Operand_& dst, const x86::Mem& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovups, Movups, X), dst, src); }
  inline void vloadpd_128a(const Operand_& dst, const x86::Mem& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovapd, Movaps, X), dst, src); }
  inline void vloadpd_128u(const Operand_& dst, const x86::Mem& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovupd, Movups, X), dst, src); }

  inline void vloadps_256a(const Operand_& dst, const x86::Mem& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovaps, Movaps, Y), dst, src); }
  inline void vloadps_256u(const Operand_& dst, const x86::Mem& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovups, Movups, Y), dst, src); }
  inline void vloadpd_256a(const Operand_& dst, const x86::Mem& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovapd, Movaps, Y), dst, src); }
  inline void vloadpd_256u(const Operand_& dst, const x86::Mem& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovupd, Movups, Y), dst, src); }

  inline void vstoress(const x86::Mem& dst, const Operand_& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovss, Movss, X), dst, src); }
  inline void vstoresd(const x86::Mem& dst, const Operand_& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovsd, Movsd, X), dst, src); }

  inline void vstoreps_64l(const x86::Mem& dst, const Operand_& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovlps, Movlps, X), dst, src); }
  inline void vstoreps_64h(const x86::Mem& dst, const Operand_& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovhps, Movhps, X), dst, src); }

  inline void vstorepd_64l(const x86::Mem& dst, const Operand_& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovsd , Movsd , X), dst, src); }
  inline void vstorepd_64h(const x86::Mem& dst, const Operand_& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovhpd, Movhpd, X), dst, src); }

  inline void vstoreps_128a(const x86::Mem& dst, const Operand_& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovaps, Movaps, X), dst, src); }
  inline void vstoreps_128u(const x86::Mem& dst, const Operand_& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovups, Movups, X), dst, src); }
  inline void vstorepd_128a(const x86::Mem& dst, const Operand_& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovapd, Movaps, X), dst, src); }
  inline void vstorepd_128u(const x86::Mem& dst, const Operand_& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovupd, Movups, X), dst, src); }

  inline void vstoreps_256a(const x86::Mem& dst, const Operand_& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovaps, Movaps, Y), dst, src); }
  inline void vstoreps_256u(const x86::Mem& dst, const Operand_& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovups, Movups, Y), dst, src); }
  inline void vstorepd_256a(const x86::Mem& dst, const Operand_& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovapd, Movaps, Y), dst, src); }
  inline void vstorepd_256u(const x86::Mem& dst, const Operand_& src) noexcept { vemit_vv_vv(PACK_AVX_SSE(Vmovupd, Movups, Y), dst, src); }

  // Intrinsics:
  //
  //   - vmov{x}{y}    - Move with sign or zero extension from {x} to {y}, similar
  //                     to instructions like `pmovzx..`, `pmovsx..`, and `punpckl..`.
  //
  //   - vswap{x}      - Swap low and high elements. If the vector has more than 2
  //                     elements it is treated as a sequence of 2-element vectors
  //                     and the swap is performed on each pair separately.
  //
  //   - vdup{l|h}{x}  - Duplicate either the low or the high element into both. If
  //                     the vector has more than 2 elements they are again treated
  //                     as separate 2-element units - for example a 4-element
  //                     vector is handled as two 2-element vectors on which the
  //                     duplication is performed independently.

  template<typename DstT, typename SrcT>
  inline void vmovu8u16(const DstT& dst, const SrcT& src) noexcept {
    vemit_vv_vv(PackedInst::packIntrin(kIntrin2Vmovu8u16), dst, src);
  }

  template<typename DstT, typename SrcT>
  inline void vmovu8u32(const DstT& dst, const SrcT& src) noexcept {
    vemit_vv_vv(PackedInst::packIntrin(kIntrin2Vmovu8u32), dst, src);
  }

  template<typename DstT, typename SrcT>
  inline void vmovu16u32(const DstT& dst, const SrcT& src) noexcept {
    vemit_vv_vv(PackedInst::packIntrin(kIntrin2Vmovu16u32), dst, src);
  }

  template<typename DstT, typename SrcT>
  inline void vabsi8(const DstT& dst, const SrcT& src) noexcept {
    vemit_vv_vv(PackedInst::packIntrin(kIntrin2Vabsi8), dst, src);
  }

  template<typename DstT, typename SrcT>
  inline void vabsi16(const DstT& dst, const SrcT& src) noexcept {
    vemit_vv_vv(PackedInst::packIntrin(kIntrin2Vabsi16), dst, src);
  }

  template<typename DstT, typename SrcT>
  inline void vabsi32(const DstT& dst, const SrcT& src) noexcept {
    vemit_vv_vv(PackedInst::packIntrin(kIntrin2Vabsi32), dst, src);
  }

  template<typename DstT, typename SrcT>
  inline void vabsi64(const DstT& dst, const SrcT& src) noexcept {
    vemit_vv_vv(PackedInst::packIntrin(kIntrin2Vabsi64), dst, src);
  }

  template<typename DstT, typename SrcT>
  inline void vswapi32(const DstT& dst, const SrcT& src) noexcept {
    vswizi32(dst, src, x86::Predicate::shuf(2, 3, 0, 1));
  }

  template<typename DstT, typename SrcT>
  inline void vswapi64(const DstT& dst, const SrcT& src) noexcept {
    vswizi32(dst, src, x86::Predicate::shuf(1, 0, 3, 2));
  }

  template<typename DstT, typename SrcT>
  inline void vdupli32(const DstT& dst, const SrcT& src) noexcept {
    vswizi32(dst, src, x86::Predicate::shuf(2, 2, 0, 0));
  }

  template<typename DstT, typename SrcT>
  inline void vduphi32(const DstT& dst, const SrcT& src) noexcept {
    vswizi32(dst, src, x86::Predicate::shuf(3, 3, 1, 1));
  }
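
  // For example, on a 4x32-bit register (lanes shown as [3 2 1 0]) the helpers
  // above expand to single PSHUFD-style shuffles:
  //
  //   vswapi32: [d c b a] -> [c d a b]   (shuffle 2, 3, 0, 1)
  //   vswapi64: [d c b a] -> [b a d c]   (shuffle 1, 0, 3, 2)
  //   vdupli32: [d c b a] -> [c c a a]   (shuffle 2, 2, 0, 0)
  //   vduphi32: [d c b a] -> [d d b b]   (shuffle 3, 3, 1, 1)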

  template<typename DstT, typename SrcT>
  inline void vdupli64(const DstT& dst, const SrcT& src) noexcept {
    vswizi32(dst, src, x86::Predicate::shuf(1, 0, 1, 0));
  }

  template<typename DstT, typename SrcT>
  inline void vduphi64(const DstT& dst, const SrcT& src) noexcept {
    vswizi32(dst, src, x86::Predicate::shuf(3, 2, 3, 2));
  }

  // Dst = (CondBit == 0) ? Src1 : Src2;
  template<typename DstT, typename Src1T, typename Src2T, typename CondT>
  inline void vblendv8(const DstT& dst, const Src1T& src1, const Src2T& src2, const CondT& cond) noexcept {
    vemit_vvvv_vvv(PackedInst::packIntrin(kIntrin4Vpblendvb), dst, src1, src2, cond);
  }

  // Dst = (CondBit == 0) ? Src1 : Src2;
  template<typename DstT, typename Src1T, typename Src2T, typename CondT>
  inline void vblendv8_destructive(const DstT& dst, const Src1T& src1, const Src2T& src2, const CondT& cond) noexcept {
    vemit_vvvv_vvv(PackedInst::packIntrin(kIntrin4VpblendvbDestructive), dst, src1, src2, cond);
  }

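  // Note: both blends follow PBLENDVB-style semantics - each byte of the result is
  // taken from `src1` where the corresponding byte of `cond` is 0x00 and from
  // `src2` where it is 0xFF; masks are expected to be fully 0x00/0xFF per byte so
  // that a bitwise AND/ANDN/OR fallback agrees with PBLENDVB. The "destructive"
  // variant presumably allows the emitter to clobber `cond` in that fallback.
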
  template<typename DstT, typename SrcT>
  inline void vinv255u16(const DstT& dst, const SrcT& src) noexcept {
    vemit_vv_vv(PackedInst::packIntrin(kIntrin2Vinv255u16), dst, src);
  }

  template<typename DstT, typename SrcT>
  inline void vinv256u16(const DstT& dst, const SrcT& src) noexcept {
    vemit_vv_vv(PackedInst::packIntrin(kIntrin2Vinv256u16), dst, src);
  }

  template<typename DstT, typename SrcT>
  inline void vinv255u32(const DstT& dst, const SrcT& src) noexcept {
    vemit_vv_vv(PackedInst::packIntrin(kIntrin2Vinv255u32), dst, src);
  }

  template<typename DstT, typename SrcT>
  inline void vinv256u32(const DstT& dst, const SrcT& src) noexcept {
    vemit_vv_vv(PackedInst::packIntrin(kIntrin2Vinv256u32), dst, src);
  }

  template<typename DstT, typename SrcT>
  inline void vduplpd(const DstT& dst, const SrcT& src) noexcept {
    vemit_vv_vv(PackedInst::packIntrin(kIntrin2Vduplpd), dst, src);
  }

  template<typename DstT, typename SrcT>
  inline void vduphpd(const DstT& dst, const SrcT& src) noexcept {
    vemit_vv_vv(PackedInst::packIntrin(kIntrin2Vduphpd), dst, src);
  }

  template<typename DstT, typename Src1T, typename Src2T>
  inline void vhaddpd(const DstT& dst, const Src1T& src1, const Src2T& src2) noexcept {
    vemit_vvv_vv(PackedInst::packIntrin(kIntrin3Vhaddpd), dst, src1, src2);
  }

  template<typename DstT, typename SrcT>
  inline void vexpandli32(const DstT& dst, const SrcT& src) noexcept {
    vswizi32(dst, src, x86::Predicate::shuf(0, 0, 0, 0));
  }

  // dst.u64[0] = src1.u64[1];
  // dst.u64[1] = src2.u64[0];
  template<typename DstT, typename Src1T, typename Src2T>
  inline void vcombhli64(const DstT& dst, const Src1T& src1, const Src2T& src2) noexcept {
    vemit_vvv_vv(PackedInst::packIntrin(kIntrin3Vcombhli64), dst, src1, src2);
  }

  // dst.d64[0] = src1.d64[1];
  // dst.d64[1] = src2.d64[0];
  template<typename DstT, typename Src1T, typename Src2T>
  inline void vcombhld64(const DstT& dst, const Src1T& src1, const Src2T& src2) noexcept {
    vemit_vvv_vv(PackedInst::packIntrin(kIntrin3Vcombhld64), dst, src1, src2);
  }

  template<typename DstT, typename Src1T, typename Src2T>
  inline void vminu16(const DstT& dst, const Src1T& src1, const Src2T& src2) noexcept {
    vemit_vvv_vv(PackedInst::packIntrin(kIntrin3Vminu16), dst, src1, src2);
  }

  template<typename DstT, typename Src1T, typename Src2T>
  inline void vmaxu16(const DstT& dst, const Src1T& src1, const Src2T& src2) noexcept {
    vemit_vvv_vv(PackedInst::packIntrin(kIntrin3Vmaxu16), dst, src1, src2);
  }

  // Multiplies each packed uint64_t in `src1` by the low uint32_t of the
  // corresponding element in `src2`.
  template<typename DstT, typename Src1T, typename Src2T>
  inline void vMulU64xU32Lo(const DstT& dst, const Src1T& src1, const Src2T& src2) noexcept {
    vemit_vvv_vv(PackedInst::packIntrin(kIntrin3Vmulu64x32), dst, src1, src2);
  }

  template<typename DstT, typename SrcT>
  BL_NOINLINE void vmul257hu16(const DstT& dst, const SrcT& src) {
    vmulhu16(dst, src, constAsXmm(blCommonTable.i128_0101010101010101));
  }

  // TODO: [PIPEGEN] Consolidate this to only one implementation.
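
  // vmul257hu16() above and vdiv255u16() below implement the usual rounded
  // division by 255 of 8-bit x 8-bit products:
  //
  //   vdiv255u16(x) == ((x + 128) * 257) >> 16 == (x + 128 + ((x + 128) >> 8)) >> 8
  //
  // vmulhu16() by 0x0101 (257) provides the `* 257 >> 16` part, so for any 16-bit
  // lane holding a value in [0, 255 * 255] the result is exactly round(x / 255).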
  template<typename DstSrcT>
  BL_NOINLINE void vdiv255u16(const DstSrcT& x) {
    vaddi16(x, x, constAsXmm(blCommonTable.i128_0080008000800080));
    vmul257hu16(x, x);
  }

  template<typename DstSrcT>
  BL_NOINLINE void vdiv255u16_2x(
    const DstSrcT& v0,
    const DstSrcT& v1) noexcept {

    x86::Xmm i128_0080008000800080 = constAsXmm(blCommonTable.i128_0080008000800080);
    x86::Xmm i128_0101010101010101 = constAsXmm(blCommonTable.i128_0101010101010101);

    vaddi16(v0, v0, i128_0080008000800080);
    vaddi16(v1, v1, i128_0080008000800080);

    vmulhu16(v0, v0, i128_0101010101010101);
    vmulhu16(v1, v1, i128_0101010101010101);
  }

  template<typename DstSrcT>
  BL_NOINLINE void vdiv255u16_3x(
    const DstSrcT& v0,
    const DstSrcT& v1,
    const DstSrcT& v2) noexcept {

    x86::Xmm i128_0080008000800080 = constAsXmm(blCommonTable.i128_0080008000800080);
    x86::Xmm i128_0101010101010101 = constAsXmm(blCommonTable.i128_0101010101010101);

    vaddi16(v0, v0, i128_0080008000800080);
    vaddi16(v1, v1, i128_0080008000800080);

    vmulhu16(v0, v0, i128_0101010101010101);
    vmulhu16(v1, v1, i128_0101010101010101);

    vaddi16(v2, v2, i128_0080008000800080);
    vmulhu16(v2, v2, i128_0101010101010101);
  }

  template<typename DstT, typename SrcT>
  inline void vexpandlps(const DstT& dst, const SrcT& src) noexcept {
    vexpandli32(dst, src);
  }

  template<typename DstT, typename SrcT>
  inline void vswizps(const DstT& dst, const SrcT& src, uint32_t imm) noexcept { vemit_vvi_vi(PackedInst::packIntrin(kIntrin2iVswizps), dst, src, imm); }

  template<typename DstT, typename SrcT>
  inline void vswizpd(const DstT& dst, const SrcT& src, uint32_t imm) noexcept { vemit_vvi_vi(PackedInst::packIntrin(kIntrin2iVswizpd), dst, src, imm); }

  template<typename DstT, typename SrcT>
  inline void vswapps(const DstT& dst, const SrcT& src) noexcept { vswizps(dst, src, x86::Predicate::shuf(2, 3, 0, 1)); }

  template<typename DstT, typename SrcT>
  inline void vswappd(const DstT& dst, const SrcT& src) noexcept { vswizpd(dst, src, x86::Predicate::shuf(0, 1)); }

  template<typename DstT, typename SrcT>
  inline void vbroadcast_u16(const DstT& dst, const SrcT& src) noexcept {
    vemit_vv_vv(PackedInst::packIntrin(kIntrin2VBroadcastU16), dst, src);
  }

  template<typename DstT, typename SrcT>
  inline void vbroadcast_u32(const DstT& dst, const SrcT& src) noexcept {
    vemit_vv_vv(PackedInst::packIntrin(kIntrin2VBroadcastU32), dst, src);
  }

  template<typename DstT, typename SrcT>
  inline void vbroadcast_u64(const DstT& dst, const SrcT& src) noexcept {
    vemit_vv_vv(PackedInst::packIntrin(kIntrin2VBroadcastU64), dst, src);
  }

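  // Note: the broadcast intrinsics above presumably replicate a single 16/32/64-bit
  // value across all lanes of the destination; the lowering (VPBROADCAST* where
  // AVX2 is available, PSHUFLW/PSHUFD-style shuffles otherwise) is handled by the
  // out-of-line intrinsic emitter, not here.
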
  template<typename DstT, typename Src1T, typename Src2T>
  BL_INLINE void vminmaxu8(const DstT& dst, const Src1T& src1, const Src2T& src2, bool isMin) noexcept {
    if (isMin)
      vminu8(dst, src1, src2);
    else
      vmaxu8(dst, src1, src2);
  }

  // --------------------------------------------------------------------------
  // [X-Emit - High-Level]
  // --------------------------------------------------------------------------

  // Kind of a hack: if we don't have SSE4.1 we have to load the byte into a GP
  // register first and then insert it with 'PINSRW', which is provided by baseline
  // SSE2. With SSE4.1 it's much easier as we can insert the byte directly via
  // 'PINSRB'.
  void xInsertWordOrByte(const x86::Vec& dst, const x86::Mem& src, uint32_t wordIndex) noexcept {
    x86::Mem m = src;
    m.setSize(1);

    if (hasSSE4_1()) {
      vinsertu8_(dst, dst, m, wordIndex * 2u);
    }
    else {
      x86::Gp tmp = cc->newUInt32("@tmp");
      cc->movzx(tmp, m);
      vinsertu16(dst, dst, tmp, wordIndex);
    }
  }

  void xInlinePixelFillLoop(x86::Gp& dst, x86::Vec& src, x86::Gp& i, uint32_t mainLoopSize, uint32_t itemSize, uint32_t itemGranularity) noexcept;
  void xInlinePixelCopyLoop(x86::Gp& dst, x86::Gp& src, x86::Gp& i, uint32_t mainLoopSize, uint32_t itemSize, uint32_t itemGranularity, uint32_t format) noexcept;

  void _xInlineMemCopySequenceXmm(
    const x86::Mem& dPtr, bool dstAligned,
    const x86::Mem& sPtr, bool srcAligned, uint32_t numBytes, const x86::Vec& fillMask) noexcept;

  // --------------------------------------------------------------------------
  // [Fetch Utilities]
  // --------------------------------------------------------------------------

  //! Fetches 1 pixel to XMM register(s) in `p` from memory location `sMem`.
  void xFetchPixel_1x(Pixel& p, uint32_t flags, uint32_t sFormat, const x86::Mem& sMem, uint32_t sAlignment) noexcept;
  //! Fetches 4 pixels to XMM register(s) in `p` from memory location `sMem`.
  void xFetchPixel_4x(Pixel& p, uint32_t flags, uint32_t sFormat, const x86::Mem& sMem, uint32_t sAlignment) noexcept;
  //! Fetches 8 pixels to XMM register(s) in `p` from memory location `sMem`.
  void xFetchPixel_8x(Pixel& p, uint32_t flags, uint32_t sFormat, const x86::Mem& sMem, uint32_t sAlignment) noexcept;

  //! Makes sure that the given pixel `p` has all the `flags`.
  void xSatisfyPixel(Pixel& p, uint32_t flags) noexcept;
  //! Makes sure that the given pixel `p` has all the `flags` (solid source only).
  void xSatisfySolid(Pixel& p, uint32_t flags) noexcept;

  void _xSatisfyPixelRGBA(Pixel& p, uint32_t flags) noexcept;
  void _xSatisfyPixelAlpha(Pixel& p, uint32_t flags) noexcept;

  void _xSatisfySolidRGBA(Pixel& p, uint32_t flags) noexcept;
  void _xSatisfySolidAlpha(Pixel& p, uint32_t flags) noexcept;

  void xFetchUnpackedA8_2x(const x86::Xmm& dst, uint32_t format, const x86::Mem& src1, const x86::Mem& src0) noexcept;

  void xAssignUnpackedAlphaValues(Pixel& p, uint32_t flags, x86::Xmm& vec) noexcept;

  //! Fills the alpha channel with 1 (fully opaque).
  void vFillAlpha(Pixel& p) noexcept;
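
  // Note: the expected flow is, roughly, that a fetch part calls one of the
  // xFetchPixel_Nx() helpers with the `flags` the compositor needs and then
  // xSatisfyPixel() derives any representation (packed, unpacked, alpha-only)
  // that the fetch could not produce directly. The concrete flag values are
  // defined by the Pixel structure elsewhere and are not restated here.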

  // --------------------------------------------------------------------------
  // [Utilities - SIMD]
  // --------------------------------------------------------------------------

  BL_NOINLINE void xStorePixel(const x86::Gp& dPtr, const x86::Vec& vSrc, uint32_t count, uint32_t bpp, uint32_t dAlignment) noexcept {
    x86::Mem dMem = x86::ptr(dPtr);
    uint32_t numBytes = bpp * count;

    switch (numBytes) {
      case 4:
        vstorei32(dMem, vSrc);
        break;

      case 8:
        vstorei64(dMem, vSrc);
        break;

      case 16:
        if (dAlignment == 16)
          vstorei128a(dMem, vSrc);
        else
          vstorei128u(dMem, vSrc);
        break;

      default:
        BL_NOT_REACHED();
    }
  }

  inline void xStore32_ARGB(const x86::Gp& dPtr, const x86::Vec& vSrc) noexcept {
    vstorei32(x86::ptr_32(dPtr), vSrc);
  }

  BL_NOINLINE void xMovzxBW_LoHi(const x86::Vec& d0, const x86::Vec& d1, const x86::Vec& s) noexcept {
    BL_ASSERT(d0.id() != d1.id());

    if (hasSSE4_1()) {
      if (d0.id() == s.id()) {
        vswizi32(d1, d0, x86::Predicate::shuf(1, 0, 3, 2));
        vmovu8u16_(d0, d0);
        vmovu8u16_(d1, d1);
      }
      else {
        vmovu8u16(d0, s);
        vswizi32(d1, s, x86::Predicate::shuf(1, 0, 3, 2));
        vmovu8u16(d1, d1);
      }
    }
    else {
      x86::Xmm i128_0000000000000000 = constAsXmm(blCommonTable.i128_0000000000000000);
      if (d1.id() != s.id()) {
        vunpackhi8(d1, s, i128_0000000000000000);
        vunpackli8(d0, s, i128_0000000000000000);
      }
      else {
        vunpackli8(d0, s, i128_0000000000000000);
        vunpackhi8(d1, s, i128_0000000000000000);
      }
    }
  }

  template<typename Dst, typename Src>
  inline void vExpandAlphaLo16(const Dst& d, const Src& s) noexcept { vswizli16(d, s, x86::Predicate::shuf(3, 3, 3, 3)); }

  template<typename Dst, typename Src>
  inline void vExpandAlphaHi16(const Dst& d, const Src& s) noexcept { vswizhi16(d, s, x86::Predicate::shuf(3, 3, 3, 3)); }

  template<typename Dst, typename Src>
  inline void vExpandAlpha16(const Dst& d, const Src& s, uint32_t useHiPart = 1) noexcept {
    vExpandAlphaLo16(d, s);
    if (useHiPart)
      vExpandAlphaHi16(d, d);
  }

  template<typename Dst, typename Src>
  inline void vExpandAlphaPS(const Dst& d, const Src& s) noexcept { vswizi32(d, s, x86::Predicate::shuf(3, 3, 3, 3)); }

  template<typename DstT, typename SrcT>
  inline void vFillAlpha255B(const DstT& dst, const SrcT& src) noexcept { vor(dst, src, constAsXmm(blCommonTable.i128_FF000000FF000000)); }
  template<typename DstT, typename SrcT>
  inline void vFillAlpha255W(const DstT& dst, const SrcT& src) noexcept { vor(dst, src, constAsMem(blCommonTable.i128_00FF000000000000)); }

  template<typename DstT, typename SrcT>
  inline void vZeroAlphaB(const DstT& dst, const SrcT& src) noexcept { vand(dst, src, constAsMem(blCommonTable.i128_00FFFFFF00FFFFFF)); }
  template<typename DstT, typename SrcT>
  inline void vZeroAlphaW(const DstT& dst, const SrcT& src) noexcept { vand(dst, src, constAsMem(blCommonTable.i128_0000FFFFFFFFFFFF)); }
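
  // Note: the B/W suffix selects the pixel representation the mask constant is
  // built for - 'B' operates on packed ARGB32 bytes (alpha mask 0xFF000000 per
  // pixel), 'W' on pixels unpacked to 16-bit lanes (alpha mask 0x00FF in the
  // highest word of each 64-bit pixel). vFillAlpha255* ORs the mask in,
  // vZeroAlpha* ANDs it out, and the vNeg* helpers below XOR it to invert the
  // selected channels.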

  template<typename DstT, typename SrcT>
  inline void vNegAlpha8B(const DstT& dst, const SrcT& src) noexcept { vxor(dst, src, constAsMem(blCommonTable.i128_FF000000FF000000)); }
  template<typename DstT, typename SrcT>
  inline void vNegAlpha8W(const DstT& dst, const SrcT& src) noexcept { vxor(dst, src, constAsMem(blCommonTable.i128_00FF000000000000)); }

  template<typename DstT, typename SrcT>
  inline void vNegRgb8B(const DstT& dst, const SrcT& src) noexcept { vxor(dst, src, constAsMem(blCommonTable.i128_00FFFFFF00FFFFFF)); }
  template<typename DstT, typename SrcT>
  inline void vNegRgb8W(const DstT& dst, const SrcT& src) noexcept { vxor(dst, src, constAsMem(blCommonTable.i128_000000FF00FF00FF)); }

  // d = int(floor(a / b) * b).
  template<typename XmmOrMem>
  BL_NOINLINE void vmodpd(const x86::Xmm& d, const x86::Xmm& a, const XmmOrMem& b) noexcept {
    if (hasSSE4_1()) {
      vdivpd(d, a, b);
      vroundpd_(d, d, x86::Predicate::kRoundTrunc | x86::Predicate::kRoundInexact);
      vmulpd(d, d, b);
    }
    else {
      x86::Xmm t = cc->newXmm("vmodpdTmp");

      vdivpd(d, a, b);
      vcvttpdi32(t, d);
      vcvti32pd(t, t);
      vcmppd(d, d, t, x86::Predicate::kCmpLT | x86::Predicate::kCmpUNORD);
      vandpd(d, d, constAsMem(blCommonTable.d128_m1));
      vaddpd(d, d, t);
      vmulpd(d, d, b);
    }
  }

  // Performs 32-bit unsigned modulo of 32-bit `a` (hi DWORD) with 32-bit `b` (lo DWORD).
  template<typename XmmOrMem_A, typename XmmOrMem_B>
  BL_NOINLINE void xModI64HIxU64LO(const x86::Xmm& d, const XmmOrMem_A& a, const XmmOrMem_B& b) noexcept {
    x86::Xmm t0 = cc->newXmm("t0");
    x86::Xmm t1 = cc->newXmm("t1");

    vswizi32(t1, b, x86::Predicate::shuf(3, 3, 2, 0));
    vswizi32(d , a, x86::Predicate::shuf(2, 0, 3, 1));

    vcvti32pd(t1, t1);
    vcvti32pd(t0, d);
    vmodpd(t0, t0, t1);
    vcvttpdi32(t0, t0);

    vsubi32(d, d, t0);
    vswizi32(d, d, x86::Predicate::shuf(1, 3, 0, 2));
  }

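  // Note: both modulo helpers reduce to vmodpd(): the 32-bit inputs are converted
  // to doubles (which represent them exactly), `a - trunc(a / b) * b` is evaluated
  // in double precision, and the remainder is converted back. The SSE2 path of
  // vmodpd() has no ROUNDPD, so it truncates with CVTTPD2DQ and then subtracts 1.0
  // (the masked add of d128_m1) whenever truncation rounded up, which can only
  // happen for negative non-integral quotients.
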
  // Performs 32-bit unsigned modulo of 32-bit `a` (hi DWORD) with 64-bit `b` (DOUBLE).
  template<typename XmmOrMem_A, typename XmmOrMem_B>
  BL_NOINLINE void xModI64HIxDouble(const x86::Xmm& d, const XmmOrMem_A& a, const XmmOrMem_B& b) noexcept {
    x86::Xmm t0 = cc->newXmm("t0");

    vswizi32(d, a, x86::Predicate::shuf(2, 0, 3, 1));
    vcvti32pd(t0, d);
    vmodpd(t0, t0, b);
    vcvttpdi32(t0, t0);

    vsubi32(d, d, t0);
    vswizi32(d, d, x86::Predicate::shuf(1, 3, 0, 2));
  }

  BL_NOINLINE void xExtractUnpackedAFromPackedARGB32_1(const x86::Xmm& d, const x86::Xmm& s) noexcept {
    vswizli16(d, s, x86::Predicate::shuf(1, 1, 1, 1));
    vsrli16(d, d, 8);
  }

  BL_NOINLINE void xExtractUnpackedAFromPackedARGB32_2(const x86::Xmm& d, const x86::Xmm& s) noexcept {
    if (hasSSSE3()) {
      vswizi8v_(d, s, constAsMem(blCommonTable.i128_pshufb_packed_argb32_2x_lo_to_unpacked_a8));
    }
    else {
      vswizli16(d, s, x86::Predicate::shuf(3, 3, 1, 1));
      vswizi32(d, d, x86::Predicate::shuf(1, 1, 0, 0));
      vsrli16(d, d, 8);
    }
  }

  BL_NOINLINE void xExtractUnpackedAFromPackedARGB32_4(const x86::Vec& d0, const x86::Vec& d1, const x86::Vec& s) noexcept {
    BL_ASSERT(d0.id() != d1.id());

    if (hasSSSE3()) {
      if (d0.id() == s.id()) {
        vswizi8v_(d1, s, constAsMem(blCommonTable.i128_pshufb_packed_argb32_2x_hi_to_unpacked_a8));
        vswizi8v_(d0, s, constAsMem(blCommonTable.i128_pshufb_packed_argb32_2x_lo_to_unpacked_a8));
      }
      else {
        vswizi8v_(d0, s, constAsMem(blCommonTable.i128_pshufb_packed_argb32_2x_lo_to_unpacked_a8));
        vswizi8v_(d1, s, constAsMem(blCommonTable.i128_pshufb_packed_argb32_2x_hi_to_unpacked_a8));
      }
    }
    else {
      if (d1.id() != s.id()) {
        vswizhi16(d1, s, x86::Predicate::shuf(3, 3, 1, 1));
        vswizli16(d0, s, x86::Predicate::shuf(3, 3, 1, 1));

        vswizi32(d1, d1, x86::Predicate::shuf(3, 3, 2, 2));
        vswizi32(d0, d0, x86::Predicate::shuf(1, 1, 0, 0));

        vsrli16(d1, d1, 8);
        vsrli16(d0, d0, 8);
      }
      else {
        vswizli16(d0, s, x86::Predicate::shuf(3, 3, 1, 1));
        vswizhi16(d1, s, x86::Predicate::shuf(3, 3, 1, 1));

        vswizi32(d0, d0, x86::Predicate::shuf(1, 1, 0, 0));
        vswizi32(d1, d1, x86::Predicate::shuf(3, 3, 2, 2));

        vsrli16(d0, d0, 8);
        vsrli16(d1, d1, 8);
      }
    }
  }

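  // Note: the extract helpers above turn packed ARGB32 pixels into "unpacked"
  // alpha, i.e. one 16-bit lane per pixel holding the alpha value. Without SSSE3
  // the alpha (the high byte of each 32-bit pixel) is replicated by PSHUFLW /
  // PSHUFHW / PSHUFD shuffles and then shifted right by 8 so each 16-bit lane
  // holds just the alpha; with SSSE3 a single PSHUFB with a precomputed control
  // mask from blCommonTable does the same in one step.
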
  BL_NOINLINE void xPackU32ToU16Lo(const x86::Vec& d0, const x86::Vec& s0) noexcept {
    if (hasSSE4_1()) {
      vpacki32u16_(d0, s0, s0);
    }
    else if (hasSSSE3()) {
      vswizi8v_(d0, s0, constAsMem(blCommonTable.i128_pshufb_u32_to_u16_lo));
    }
    else {
      // Sign extend the low 16 bits of each lane and then use `packssdw()`, which
      // can no longer saturate (SSE2 has no unsigned 32 -> 16 pack).
      vslli32(d0, s0, 16);
      vsrai32(d0, d0, 16);
      vpacki32i16(d0, d0, d0);
    }
  }

  BL_NOINLINE void xPackU32ToU16Lo(const VecArray& d0, const VecArray& s0) noexcept {
    for (uint32_t i = 0; i < d0.size(); i++)
      xPackU32ToU16Lo(d0[i], s0[i]);
  }

  // --------------------------------------------------------------------------
  // [Emit - End]
  // --------------------------------------------------------------------------

  #undef PACK_AVX_SSE
  #undef V_EMIT_VVVV_VVV
  #undef V_EMIT_VVVi_VVi
  #undef V_EMIT_VVVI_VVI
  #undef V_EMIT_VVV_VV
  #undef V_EMIT_VVi_VVi
  #undef V_EMIT_VVI_VVI
  #undef V_EMIT_VVI_VI
  #undef V_EMIT_VV_VV
  #undef I_EMIT_3
  #undef I_EMIT_2i
  #undef I_EMIT_2
};

// ============================================================================
// [BLPipeGen::PipeInjectAtTheEnd]
// ============================================================================

class PipeInjectAtTheEnd {
public:
  ScopedInjector _injector;

  BL_INLINE PipeInjectAtTheEnd(PipeCompiler* pc) noexcept
    : _injector(pc->cc, &pc->_funcEnd) {}
};

} // {BLPipeGen}

//! \}
//! \endcond

#endif // BLEND2D_PIPEGEN_PIPECOMPILER_P_H