;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2013 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Fiona Glaser <fiona@x264.com>
;*          Henrik Gramner <henrik@gramner.com>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************

; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.

; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible. Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well. Send patches or ideas
; to x264-devel@videolan.org .

%ifndef private_prefix
    %define private_prefix x264
%endif

%ifndef public_prefix
    %define public_prefix private_prefix
%endif

%define WIN64  0
%define UNIX64 0
%if ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,x64
        %define WIN64  1
    %else
        %define UNIX64 1
    %endif
%endif

%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif

; aout does not support align=
; NOTE: This section is out of sync with x264, in order to
; keep supporting OS/2.
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,aout
        section .text
    %else
        SECTION .rodata align=%1
    %endif
%endmacro

%macro SECTION_TEXT 0-1 16
    %ifidn __OUTPUT_FORMAT__,aout
        SECTION .text
    %else
        SECTION .text align=%1
    %endif
%endmacro
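
; As a usage sketch (the constant name and contents below are made up for
; illustration), a 32-byte aligned table for AVX2 code could be declared as
;     SECTION_RODATA 32
;     pw_1: times 16 dw 1
; which expands to "SECTION .rodata align=32" on everything except aout.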

%if WIN64
    %define PIC
%elif ARCH_X86_64 == 0
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
    %undef PIC
%endif
%ifdef PIC
    default rel
%endif

%macro CPUNOP 1
    %if HAVE_CPUNOP
        CPU %1
    %endif
%endmacro

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = (optional) stack size to be allocated. If not aligned (x86-32 ICC 10.x,
;      MSVC or YMM), the stack will be manually aligned (to 16 or 32 bytes),
;      and an extra register will be allocated to hold the original stack
;      pointer (to not invalidate r0m etc.). To prevent the use of an extra
;      register as stack pointer, request a negative stack size.
; %4+/%5+ = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,0, dst, src, tmp
; declares a function (foo), taking two args (dst and src) and one local variable (tmp)

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need a more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE, and returns.

; REP_RET:
; Use this instead of RET if it's a branch target.

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNh is the high 8 bits of the word size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size

%macro DECLARE_REG 2-3
    %define r%1q %2
    %define r%1d %2d
    %define r%1w %2w
    %define r%1b %2b
    %define r%1h %2h
    %define %2q %2
    %if %0 == 2
        %define r%1m  %2d
        %define r%1mp %2
    %elif ARCH_X86_64 ; memory
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp qword r %+ %1 %+ m
    %else
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp dword r %+ %1 %+ m
    %endif
    %define r%1 %2
%endmacro

%macro DECLARE_REG_SIZE 3
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1h %3
    %define e%1h %3
    %define r%1b %2
    %define e%1b %2
    %if ARCH_X86_64 == 0
        %define r%1 e%1
    %endif
%endmacro

DECLARE_REG_SIZE ax, al, ah
DECLARE_REG_SIZE bx, bl, bh
DECLARE_REG_SIZE cx, cl, ch
DECLARE_REG_SIZE dx, dl, dh
DECLARE_REG_SIZE si, sil, null
DECLARE_REG_SIZE di, dil, null
DECLARE_REG_SIZE bp, bpl, null

; t# defines for when per-arch register allocation is more complex than just function arguments

%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1h t%1 %+ h
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
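
; As an illustrative sketch (the register assignment is arbitrary):
;     DECLARE_REG_TMP 2,0,1
; makes t0 an alias for r2, t1 for r0 and t2 for r1, so per-arch scratch
; register choices can be made once instead of repeating r# throughout.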

%if ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

%macro PUSH 1
    push %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset+gprsize
    %endif
%endmacro

%macro POP 1
    pop %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset-gprsize
    %endif
%endmacro

%macro PUSH_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            PUSH r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro POP_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            pop r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro LOAD_IF_USED 1-*
    %rep %0
        %if %1 < num_args
            mov r%1, r %+ %1 %+ mp
        %endif
        %rotate 1
    %endrep
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

%macro ASSERT 1
    %if (%1) == 0
        %error assert failed
    %endif
%endmacro

%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, h
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name %+ %%i, mp
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %xdefine %%stack_offset stack_offset
    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1h r %+ %%i %+ h
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        %xdefine %1mp r %+ %%i %+ mp
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %xdefine stack_offset %%stack_offset
    %assign n_arg_names %0
%endmacro
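
; As a usage sketch (argument names made up for illustration):
;     DEFINE_ARGS dst, src, len
; aliases dstq/dstd/dstw/dstb to the corresponding sizes of r0, srcq etc. to
; r1 and lenq etc. to r2, so function bodies can refer to arguments by name.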

%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
    %ifnum %1
        %if %1 != 0
            %assign %%stack_alignment ((mmsize + 15) & ~15)
            %assign stack_size %1
            %if stack_size < 0
                %assign stack_size -stack_size
            %endif
            %assign stack_size_padded stack_size
            %if WIN64
                %assign stack_size_padded stack_size_padded + 32 ; reserve 32 bytes for shadow space
                %if mmsize != 8
                    %assign xmm_regs_used %2
                    %if xmm_regs_used > 8
                        %assign stack_size_padded stack_size_padded + (xmm_regs_used-8)*16
                    %endif
                %endif
            %endif
            %if mmsize <= 16 && HAVE_ALIGNED_STACK
                %assign stack_size_padded stack_size_padded + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1))
                SUB rsp, stack_size_padded
            %else
                %assign %%reg_num (regs_used - 1)
                %xdefine rstk r %+ %%reg_num
                ; align stack, and save original stack location directly above
                ; it, i.e. in [rsp+stack_size_padded], so we can restore the
                ; stack in a single instruction (i.e. mov rsp, rstk or mov
                ; rsp, [rsp+stack_size_padded])
                mov rstk, rsp
                %if %1 < 0 ; need to store rsp on stack
                    sub rsp, gprsize+stack_size_padded
                    and rsp, ~(%%stack_alignment-1)
                    %xdefine rstkm [rsp+stack_size_padded]
                    mov rstkm, rstk
                %else ; can keep rsp in rstk during whole function
                    sub rsp, stack_size_padded
                    and rsp, ~(%%stack_alignment-1)
                    %xdefine rstkm rstk
                %endif
            %endif
            WIN64_PUSH_XMM
        %endif
    %endif
%endmacro

%macro SETUP_STACK_POINTER 1
    %ifnum %1
        %if %1 != 0 && (HAVE_ALIGNED_STACK == 0 || mmsize == 32)
            %if %1 > 0
                %assign regs_used (regs_used + 1)
            %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2
                %warning "Stack pointer will overwrite register argument"
            %endif
        %endif
    %endif
%endmacro

%macro DEFINE_ARGS_INTERNAL 3+
    %ifnum %2
        DEFINE_ARGS %3
    %elif %1 == 4
        DEFINE_ARGS %2
    %elif %1 > 4
        DEFINE_ARGS %2, %3
    %endif
%endmacro

%if WIN64 ; Windows x64 ;=================================================

DECLARE_REG 0,  rcx
DECLARE_REG 1,  rdx
DECLARE_REG 2,  R8
DECLARE_REG 3,  R9
DECLARE_REG 4,  R10, 40
DECLARE_REG 5,  R11, 48
DECLARE_REG 6,  rax, 56
DECLARE_REG 7,  rdi, 64
DECLARE_REG 8,  rsi, 72
DECLARE_REG 9,  rbx, 80
DECLARE_REG 10, rbp, 88
DECLARE_REG 11, R12, 96
DECLARE_REG 12, R13, 104
DECLARE_REG 13, R14, 112
DECLARE_REG 14, R15, 120

%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4, %3
    %if mmsize != 8 && stack_size == 0
        WIN64_SPILL_XMM %3
    %endif
    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro
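
; As an illustrative sketch of the resulting WIN64 prologue (function and
; argument names made up):
;     cglobal foo, 4,6,8, dst, src, len, w
; finds args 0-3 already in rcx/rdx/R8/R9, uses R10/R11 as r4/r5 without
; pushing anything (only regs 7-14 are callee-saved here), and spills
; xmm6/xmm7 into the caller-provided shadow space via WIN64_SPILL_XMM.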

%macro WIN64_PUSH_XMM 0
    ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
    %if xmm_regs_used > 6
        movaps [rstk + stack_offset +  8], xmm6
    %endif
    %if xmm_regs_used > 7
        movaps [rstk + stack_offset + 24], xmm7
    %endif
    %if xmm_regs_used > 8
        %assign %%i 8
        %rep xmm_regs_used-8
            movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 8
        %assign stack_size_padded (xmm_regs_used-8)*16 + (~stack_offset&8) + 32
        SUB rsp, stack_size_padded
    %endif
    WIN64_PUSH_XMM
%endmacro

%macro WIN64_RESTORE_XMM_INTERNAL 1
    %assign %%pad_size 0
    %if xmm_regs_used > 8
        %assign %%i xmm_regs_used
        %rep xmm_regs_used-8
            %assign %%i %%i-1
            movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
        %endrep
    %endif
    %if stack_size_padded > 0
        %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0)
            mov rsp, rstkm
        %else
            add %1, stack_size_padded
            %assign %%pad_size stack_size_padded
        %endif
    %endif
    %if xmm_regs_used > 7
        movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
    %endif
    %if xmm_regs_used > 6
        movaps xmm6, [%1 + stack_offset - %%pad_size +  8]
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 1
    WIN64_RESTORE_XMM_INTERNAL %1
    %assign stack_offset (stack_offset-stack_size_padded)
    %assign xmm_regs_used 0
%endmacro

%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
    %if mmsize == 32
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%elif ARCH_X86_64 ; *nix x64 ;=============================================

DECLARE_REG 0,  rdi
DECLARE_REG 1,  rsi
DECLARE_REG 2,  rdx
DECLARE_REG 3,  rcx
DECLARE_REG 4,  R8
DECLARE_REG 5,  R9
DECLARE_REG 6,  rax, 8
DECLARE_REG 7,  R10, 16
DECLARE_REG 8,  R11, 24
DECLARE_REG 9,  rbx, 32
DECLARE_REG 10, rbp, 40
DECLARE_REG 11, R12, 48
DECLARE_REG 12, R13, 56
DECLARE_REG 13, R14, 64
DECLARE_REG 14, R15, 72

%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4
    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0

%macro RET 0
    %if stack_size_padded > 0
        %if mmsize == 32 || HAVE_ALIGNED_STACK == 0
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
        %endif
    %endif
    POP_IF_USED 14, 13, 12, 11, 10, 9
    %if mmsize == 32
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%else ; X86_32 ;==============================================================

DECLARE_REG 0, eax, 4
DECLARE_REG 1, ecx, 8
DECLARE_REG 2, edx, 12
DECLARE_REG 3, ebx, 16
DECLARE_REG 4, esi, 20
DECLARE_REG 5, edi, 24
DECLARE_REG 6, ebp, 28
%define rsp esp

%macro DECLARE_ARG 1-*
    %rep %0
        %define r%1m [rstk + stack_offset + 4*%1 + 4]
        %define r%1mp dword r%1m
        %rotate 1
    %endrep
%endmacro

DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
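
; As a worked example of the layout encoded above: on x86_32 all arguments
; live on the stack, so at function entry (stack_offset == 0) r0m is [esp+4]
; and r1m is [esp+8], while the DECLARE_ARG defines make r7m [esp+32],
; continuing the same 4-byte stride beyond the 7 named registers.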

%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    %if num_args > 7
        %assign num_args 7
    %endif
    %if regs_used > 7
        %assign regs_used 7
    %endif
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 7
    PUSH_IF_USED 3, 4, 5, 6
    ALLOC_STACK %4
    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0

%macro RET 0
    %if stack_size_padded > 0
        %if mmsize == 32 || HAVE_ALIGNED_STACK == 0
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
        %endif
    %endif
    POP_IF_USED 6, 5, 4, 3
    %if mmsize == 32
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%endif ;======================================================================

%if WIN64 == 0
    %macro WIN64_SPILL_XMM 1
    %endmacro
    %macro WIN64_RESTORE_XMM 1
    %endmacro
    %macro WIN64_PUSH_XMM 0
    %endmacro
%endif

; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
; a branch or a branch target. So switch to a 2-byte form of ret in that case.
; We can automatically detect "follows a branch", but not a branch target.
; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
%macro REP_RET 0
    %if has_epilogue
        RET
    %else
        rep ret
    %endif
%endmacro

%define last_branch_adr $$
%macro AUTO_REP_RET 0
    %ifndef cpuflags
        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
    %elif notcpuflag(ssse3)
        times ((last_branch_adr-$)>>31)+1 rep
    %endif
    ret
%endmacro

%macro BRANCH_INSTR 0-*
    %rep %0
        %macro %1 1-2 %1
            %2 %1
            %%branch_instr:
            %xdefine last_branch_adr %%branch_instr
        %endmacro
        %rotate 1
    %endrep
%endmacro

BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp

%macro TAIL_CALL 2 ; callee, is_nonadjacent
    %if has_epilogue
        call %1
        RET
    %elif %2
        jmp %1
    %endif
%endmacro

;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Begin a function.
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 1, %1 %+ SUFFIX, %2
%endmacro
%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 0, %1 %+ SUFFIX, %2
%endmacro
%macro cglobal_internal 2-3+
    %if %1
        %xdefine %%FUNCTION_PREFIX private_prefix
        %xdefine %%VISIBILITY hidden
    %else
        %xdefine %%FUNCTION_PREFIX public_prefix
        %xdefine %%VISIBILITY
    %endif
    %ifndef cglobaled_%2
        %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2)
        %xdefine %2.skip_prologue %2 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %2, 1
    %endif
    %xdefine current_function %2
    %ifidn __OUTPUT_FORMAT__,elf
        global %2:function %%VISIBILITY
    %else
        global %2
    %endif
    align function_align
    %2:
    RESET_MM_PERMUTATION        ; needed for x86-64, also makes disassembly somewhat nicer
    %xdefine rstk rsp           ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
    %assign stack_offset 0      ; stack pointer offset relative to the return address
    %assign stack_size 0        ; amount of stack space that can be freely used inside a function
    %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
    %ifnidn %3, ""
        PROLOGUE %3
    %endif
%endmacro
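
; As a naming sketch (function name made up, default private_prefix assumed):
; after "INIT_XMM sse2",
;     cglobal foo, 2,3
; emits the symbol x264_foo_sse2 (private_prefix, name, then SUFFIX), and
; "call foo" elsewhere in the file resolves to that mangled name through the
; call macro defined further down.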

%macro cextern 1
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

; like cextern, but without the prefix
%macro cextern_naked 1
    %xdefine %1 mangle(%1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

%macro const 1-2+
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    %ifidn __OUTPUT_FORMAT__,elf
        global %1:data hidden
    %else
        global %1
    %endif
    %1: %2
%endmacro

; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif

; cpuflags

%assign cpuflags_mmx      (1<<0)
%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2     (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
%assign cpuflags_sse3     (1<<7) | cpuflags_sse2
%assign cpuflags_ssse3    (1<<8) | cpuflags_sse3
%assign cpuflags_sse4     (1<<9) | cpuflags_ssse3
%assign cpuflags_sse42    (1<<10)| cpuflags_sse4
%assign cpuflags_avx      (1<<11)| cpuflags_sse42
%assign cpuflags_xop      (1<<12)| cpuflags_avx
%assign cpuflags_fma4     (1<<13)| cpuflags_avx
%assign cpuflags_avx2     (1<<14)| cpuflags_avx
%assign cpuflags_fma3     (1<<15)| cpuflags_avx

%assign cpuflags_cache32  (1<<16)
%assign cpuflags_cache64  (1<<17)
%assign cpuflags_slowctz  (1<<18)
%assign cpuflags_lzcnt    (1<<19)
%assign cpuflags_aligned  (1<<20) ; not a cpu feature, but a function variant
%assign cpuflags_atom     (1<<21)
%assign cpuflags_bmi1     (1<<22)|cpuflags_lzcnt
%assign cpuflags_bmi2     (1<<23)|cpuflags_bmi1

%define    cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
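
; Since every flag implies its prerequisites, cpuflag() acts as a "this level
; or higher" test. A minimal sketch:
;     %if cpuflag(ssse3)
;         pshufb m0, m1
;     %else
;         ; ...fallback shuffle for pre-SSSE3 cpus...
;     %endif
; Under "INIT_XMM avx" the ssse3 test above is true, because cpuflags_avx
; inherits all of the sse* bits.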

; Takes an arbitrary number of cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
%macro INIT_CPUFLAGS 0-*
    %xdefine SUFFIX
    %undef cpuname
    %assign cpuflags 0

    %if %0 >= 1
        %rep %0
            %ifdef cpuname
                %xdefine cpuname cpuname %+ _%1
            %else
                %xdefine cpuname %1
            %endif
            %assign cpuflags cpuflags | cpuflags_%1
            %rotate 1
        %endrep
        %xdefine SUFFIX _ %+ cpuname

        %if cpuflag(avx)
            %assign avx_enabled 1
        %endif
        %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        %if cpuflag(aligned)
            %define movu mova
        %elif cpuflag(sse3) && notcpuflag(ssse3)
            %define movu lddqu
        %endif
    %endif

    %if cpuflag(sse2)
        CPUNOP amdnop
    %else
        CPUNOP basicnop
    %endif
%endmacro

; Merge mmx and sse*
; m# is a simd register of the currently selected size
; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
; (All 3 remain in sync through SWAP.)

%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

%macro INIT_MMX 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX %1
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    %assign %%i 0
    %rep 8
        CAT_XDEFINE m, %%i, mm %+ %%i
        CAT_XDEFINE nnmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    %rep 8
        CAT_UNDEF m, %%i
        CAT_UNDEF nnmm, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_XMM 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM %1
    %define mmsize 16
    %define num_mmregs 8
    %if ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, xmm %+ %%i
        CAT_XDEFINE nnxmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

; FIXME: INIT_AVX can be replaced by INIT_XMM avx
%macro INIT_AVX 0
    INIT_XMM
    %assign avx_enabled 1
    %define PALIGNR PALIGNR_SSSE3
    %define RESET_MM_PERMUTATION INIT_AVX
%endmacro

%macro INIT_YMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM %1
    %define mmsize 32
    %define num_mmregs 8
    %if ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %undef movh
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, ymm %+ %%i
        CAT_XDEFINE nnymm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

INIT_XMM

%macro DECLARE_MMCAST 1
    %define  mmmm%1   mm%1
    %define  mmxmm%1  mm%1
    %define  mmymm%1  mm%1
    %define xmmmm%1   mm%1
    %define xmmxmm%1 xmm%1
    %define xmmymm%1 xmm%1
    %define ymmmm%1   mm%1
    %define ymmxmm%1 xmm%1
    %define ymmymm%1 ymm%1
    %define xm%1 xmm %+ m%1
    %define ym%1 ymm %+ m%1
%endmacro

%assign i 0
%rep 16
    DECLARE_MMCAST i
    %assign i i+1
%endrep

; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.

%macro PERMUTE 2-* ; takes a list of pairs to swap
    %rep %0/2
        %xdefine %%tmp%2 m%2
        %rotate 2
    %endrep
    %rep %0/2
        %xdefine m%1 %%tmp%2
        CAT_XDEFINE nn, m%1, %1
        %rotate 2
    %endrep
%endmacro

%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
    %ifnum %1 ; SWAP 0, 1, ...
        SWAP_INTERNAL_NUM %1, %2
    %else ; SWAP m0, m1, ...
        SWAP_INTERNAL_NAME %1, %2
    %endif
%endmacro

%macro SWAP_INTERNAL_NUM 2-*
    %rep %0-1
        %xdefine %%tmp m%1
        %xdefine m%1 m%2
        %xdefine m%2 %%tmp
        CAT_XDEFINE nn, m%1, %1
        CAT_XDEFINE nn, m%2, %2
        %rotate 1
    %endrep
%endmacro

%macro SWAP_INTERNAL_NAME 2-*
    %xdefine %%args nn %+ %1
    %rep %0-1
        %xdefine %%args %%args, nn %+ %2
        %rotate 1
    %endrep
    SWAP_INTERNAL_NUM %%args
%endmacro
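
; A minimal sketch of what SWAP does to the name mappings: after INIT_XMM,
;     SWAP 0, 1
; makes m0 refer to xmm1 and m1 refer to xmm0, so a butterfly macro that ends
; with this SWAP lets the following code keep using m0/m1 logically without
; spending any register-to-register moves.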

; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
; calls to that function will automatically load the permutation, so values can
; be returned in mmregs.
%macro SAVE_MM_PERMUTATION 0-1
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE %%f, %%i, m %+ %%i
        %assign %%i %%i+1
    %endrep
%endmacro

%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %ifdef %1_m0
        %assign %%i 0
        %rep num_mmregs
            CAT_XDEFINE m, %%i, %1_m %+ %%i
            CAT_XDEFINE nn, m %+ %%i, %%i
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
    call_internal %1 %+ SUFFIX, %1
%endmacro
%macro call_internal 2
    %xdefine %%i %2
    %ifndef cglobaled_%2
        %ifdef cglobaled_%1
            %xdefine %%i %1
        %endif
    %endif
    call %%i
    LOAD_MM_PERMUTATION %%i
%endmacro

; Substitutions that reduce instruction size but are functionally equivalent
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro
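
; Why the 128 special case helps: x86 sign-extends 8-bit immediates, so +128
; requires a full 32-bit immediate while -128 fits in one byte. For a 32-bit
; register other than eax:
;     add r1d, 128   ; 6 bytes (imm32 encoding)
;     sub r1d, -128  ; 3 bytes (imm8 encoding), same resulting value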

;=============================================================================
; AVX abstraction layer
;=============================================================================

%assign i 0
%rep 16
    %if i < 8
        CAT_XDEFINE sizeofmm, i, 8
    %endif
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
    %assign i i+1
%endrep
%undef i

%macro CHECK_AVX_INSTR_EMU 3-*
    %xdefine %%opcode %1
    %xdefine %%dst %2
    %rep %0-2
        %ifidn %%dst, %3
            %error non-avx emulation of ``%%opcode'' is not supported
        %endif
        %rotate 1
    %endrep
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
;%5+: operands
%macro RUN_AVX_INSTR 5-8+
    %ifnum sizeof%6
        %assign __sizeofreg sizeof%6
    %elifnum sizeof%5
        %assign __sizeofreg sizeof%5
    %else
        %assign __sizeofreg mmsize
    %endif
    %assign __emulate_avx 0
    %if avx_enabled && __sizeofreg >= 16
        %xdefine __instr v%1
    %else
        %xdefine __instr %1
        %if %0 >= 7+%3
            %assign __emulate_avx 1
        %endif
    %endif

    %if __emulate_avx
        %xdefine __src1 %6
        %xdefine __src2 %7
        %ifnidn %5, %6
            %if %0 >= 8
                CHECK_AVX_INSTR_EMU {%1 %5, %6, %7, %8}, %5, %7, %8
            %else
                CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
            %endif
            %if %4 && %3 == 0
                %ifnid %7
                    ; 3-operand AVX instructions with a memory arg can only have it in src2,
                    ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
                    ; So, if the instruction is commutative with a memory arg, swap them.
                    %xdefine __src1 %7
                    %xdefine __src2 %6
                %endif
            %endif
            %if __sizeofreg == 8
                MOVQ %5, __src1
            %elif %2
                MOVAPS %5, __src1
            %else
                MOVDQA %5, __src1
            %endif
        %endif
        %if %0 >= 8
            %1 %5, __src2, %8
        %else
            %1 %5, __src2
        %endif
    %elif %0 >= 8
        __instr %5, %6, %7, %8
    %elif %0 == 7
        __instr %5, %6, %7
    %elif %0 == 6
        __instr %5, %6
    %else
        __instr %5
    %endif
%endmacro
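
; A minimal sketch of the two expansion paths for one overloaded mnemonic:
;     mulps m0, m1, m2
; assembles as "vmulps xmm0, xmm1, xmm2" when avx_enabled, but under plain
; SSE it is emulated as "movaps xmm0, xmm1" followed by "mulps xmm0, xmm2"
; (with the sources swapped first if src2 were a memory operand, since mulps
; is declared commutative).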

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
%macro AVX_INSTR 1-4 0, 1, 0
    %macro %1 1-9 fnord, fnord, fnord, fnord, %1, %2, %3, %4
        %ifidn %2, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %1
        %elifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro

; Instructions with both VEX and non-VEX encodings
; Non-destructive instructions are written without parameters
AVX_INSTR addpd, 1, 0, 1
AVX_INSTR addps, 1, 0, 1
AVX_INSTR addsd, 1, 0, 1
AVX_INSTR addss, 1, 0, 1
AVX_INSTR addsubpd, 1, 0, 0
AVX_INSTR addsubps, 1, 0, 0
AVX_INSTR aesdec, 0, 0, 0
AVX_INSTR aesdeclast, 0, 0, 0
AVX_INSTR aesenc, 0, 0, 0
AVX_INSTR aesenclast, 0, 0, 0
AVX_INSTR aesimc
AVX_INSTR aeskeygenassist
AVX_INSTR andnpd, 1, 0, 0
AVX_INSTR andnps, 1, 0, 0
AVX_INSTR andpd, 1, 0, 1
AVX_INSTR andps, 1, 0, 1
AVX_INSTR blendpd, 1, 0, 0
AVX_INSTR blendps, 1, 0, 0
AVX_INSTR blendvpd, 1, 0, 0
AVX_INSTR blendvps, 1, 0, 0
AVX_INSTR cmppd, 1, 1, 0
AVX_INSTR cmpps, 1, 1, 0
AVX_INSTR cmpsd, 1, 1, 0
AVX_INSTR cmpss, 1, 1, 0
AVX_INSTR comisd
AVX_INSTR comiss
AVX_INSTR cvtdq2pd
AVX_INSTR cvtdq2ps
AVX_INSTR cvtpd2dq
AVX_INSTR cvtpd2ps
AVX_INSTR cvtps2dq
AVX_INSTR cvtps2pd
AVX_INSTR cvtsd2si
AVX_INSTR cvtsd2ss
AVX_INSTR cvtsi2sd
AVX_INSTR cvtsi2ss
AVX_INSTR cvtss2sd
AVX_INSTR cvtss2si
AVX_INSTR cvttpd2dq
AVX_INSTR cvttps2dq
AVX_INSTR cvttsd2si
AVX_INSTR cvttss2si
AVX_INSTR divpd, 1, 0, 0
AVX_INSTR divps, 1, 0, 0
AVX_INSTR divsd, 1, 0, 0
AVX_INSTR divss, 1, 0, 0
AVX_INSTR dppd, 1, 1, 0
AVX_INSTR dpps, 1, 1, 0
AVX_INSTR extractps
AVX_INSTR haddpd, 1, 0, 0
AVX_INSTR haddps, 1, 0, 0
AVX_INSTR hsubpd, 1, 0, 0
AVX_INSTR hsubps, 1, 0, 0
AVX_INSTR insertps, 1, 1, 0
AVX_INSTR lddqu
AVX_INSTR ldmxcsr
AVX_INSTR maskmovdqu
AVX_INSTR maxpd, 1, 0, 1
AVX_INSTR maxps, 1, 0, 1
AVX_INSTR maxsd, 1, 0, 1
AVX_INSTR maxss, 1, 0, 1
AVX_INSTR minpd, 1, 0, 1
AVX_INSTR minps, 1, 0, 1
AVX_INSTR minsd, 1, 0, 1
AVX_INSTR minss, 1, 0, 1
AVX_INSTR movapd
AVX_INSTR movaps
AVX_INSTR movd
AVX_INSTR movddup
AVX_INSTR movdqa
AVX_INSTR movdqu
AVX_INSTR movhlps, 1, 0, 0
AVX_INSTR movhpd, 1, 0, 0
AVX_INSTR movhps, 1, 0, 0
AVX_INSTR movlhps, 1, 0, 0
AVX_INSTR movlpd, 1, 0, 0
AVX_INSTR movlps, 1, 0, 0
AVX_INSTR movmskpd
AVX_INSTR movmskps
AVX_INSTR movntdq
AVX_INSTR movntdqa
AVX_INSTR movntpd
AVX_INSTR movntps
AVX_INSTR movq
AVX_INSTR movsd, 1, 0, 0
AVX_INSTR movshdup
AVX_INSTR movsldup
AVX_INSTR movss, 1, 0, 0
AVX_INSTR movupd
AVX_INSTR movups
AVX_INSTR mpsadbw, 0, 1, 0
AVX_INSTR mulpd, 1, 0, 1
AVX_INSTR mulps, 1, 0, 1
AVX_INSTR mulsd, 1, 0, 1
AVX_INSTR mulss, 1, 0, 1
AVX_INSTR orpd, 1, 0, 1
AVX_INSTR orps, 1, 0, 1
AVX_INSTR pabsb
AVX_INSTR pabsd
AVX_INSTR pabsw
AVX_INSTR packsswb, 0, 0, 0
AVX_INSTR packssdw, 0, 0, 0
AVX_INSTR packuswb, 0, 0, 0
AVX_INSTR packusdw, 0, 0, 0
AVX_INSTR paddb, 0, 0, 1
AVX_INSTR paddw, 0, 0, 1
AVX_INSTR paddd, 0, 0, 1
AVX_INSTR paddq, 0, 0, 1
AVX_INSTR paddsb, 0, 0, 1
AVX_INSTR paddsw, 0, 0, 1
AVX_INSTR paddusb, 0, 0, 1
AVX_INSTR paddusw, 0, 0, 1
AVX_INSTR palignr, 0, 1, 0
AVX_INSTR pand, 0, 0, 1
AVX_INSTR pandn, 0, 0, 0
AVX_INSTR pavgb, 0, 0, 1
AVX_INSTR pavgw, 0, 0, 1
AVX_INSTR pblendvb, 0, 0, 0
AVX_INSTR pblendw, 0, 1, 0
AVX_INSTR pclmulqdq, 0, 1, 0
AVX_INSTR pcmpestri
AVX_INSTR pcmpestrm
AVX_INSTR pcmpistri
AVX_INSTR pcmpistrm
AVX_INSTR pcmpeqb, 0, 0, 1
AVX_INSTR pcmpeqw, 0, 0, 1
AVX_INSTR pcmpeqd, 0, 0, 1
AVX_INSTR pcmpeqq, 0, 0, 1
AVX_INSTR pcmpgtb, 0, 0, 0
AVX_INSTR pcmpgtw, 0, 0, 0
AVX_INSTR pcmpgtd, 0, 0, 0
AVX_INSTR pcmpgtq, 0, 0, 0
AVX_INSTR pextrb
AVX_INSTR pextrd
AVX_INSTR pextrq
AVX_INSTR pextrw
AVX_INSTR phaddw, 0, 0, 0
AVX_INSTR phaddd, 0, 0, 0
AVX_INSTR phaddsw, 0, 0, 0
AVX_INSTR phminposuw
AVX_INSTR phsubw, 0, 0, 0
AVX_INSTR phsubd, 0, 0, 0
AVX_INSTR phsubsw, 0, 0, 0
AVX_INSTR pinsrb, 0, 1, 0
AVX_INSTR pinsrd, 0, 1, 0
AVX_INSTR pinsrq, 0, 1, 0
AVX_INSTR pinsrw, 0, 1, 0
AVX_INSTR pmaddwd, 0, 0, 1
AVX_INSTR pmaddubsw, 0, 0, 0
AVX_INSTR pmaxsb, 0, 0, 1
AVX_INSTR pmaxsw, 0, 0, 1
AVX_INSTR pmaxsd, 0, 0, 1
AVX_INSTR pmaxub, 0, 0, 1
AVX_INSTR pmaxuw, 0, 0, 1
AVX_INSTR pmaxud, 0, 0, 1
AVX_INSTR pminsb, 0, 0, 1
AVX_INSTR pminsw, 0, 0, 1
AVX_INSTR pminsd, 0, 0, 1
AVX_INSTR pminub, 0, 0, 1
AVX_INSTR pminuw, 0, 0, 1
AVX_INSTR pminud, 0, 0, 1
AVX_INSTR pmovmskb
AVX_INSTR pmovsxbw
AVX_INSTR pmovsxbd
AVX_INSTR pmovsxbq
AVX_INSTR pmovsxwd
AVX_INSTR pmovsxwq
AVX_INSTR pmovsxdq
AVX_INSTR pmovzxbw
AVX_INSTR pmovzxbd
AVX_INSTR pmovzxbq
AVX_INSTR pmovzxwd
AVX_INSTR pmovzxwq
AVX_INSTR pmovzxdq
AVX_INSTR pmuldq, 0, 0, 1
AVX_INSTR pmulhrsw, 0, 0, 1
AVX_INSTR pmulhuw, 0, 0, 1
AVX_INSTR pmulhw, 0, 0, 1
AVX_INSTR pmullw, 0, 0, 1
AVX_INSTR pmulld, 0, 0, 1
AVX_INSTR pmuludq, 0, 0, 1
AVX_INSTR por, 0, 0, 1
AVX_INSTR psadbw, 0, 0, 1
AVX_INSTR pshufb, 0, 0, 0
AVX_INSTR pshufd
AVX_INSTR pshufhw
AVX_INSTR pshuflw
AVX_INSTR psignb, 0, 0, 0
AVX_INSTR psignw, 0, 0, 0
AVX_INSTR psignd, 0, 0, 0
AVX_INSTR psllw, 0, 0, 0
AVX_INSTR pslld, 0, 0, 0
AVX_INSTR psllq, 0, 0, 0
AVX_INSTR pslldq, 0, 0, 0
AVX_INSTR psraw, 0, 0, 0
AVX_INSTR psrad, 0, 0, 0
AVX_INSTR psrlw, 0, 0, 0
AVX_INSTR psrld, 0, 0, 0
AVX_INSTR psrlq, 0, 0, 0
AVX_INSTR psrldq, 0, 0, 0
AVX_INSTR psubb, 0, 0, 0
AVX_INSTR psubw, 0, 0, 0
AVX_INSTR psubd, 0, 0, 0
AVX_INSTR psubq, 0, 0, 0
AVX_INSTR psubsb, 0, 0, 0
AVX_INSTR psubsw, 0, 0, 0
AVX_INSTR psubusb, 0, 0, 0
AVX_INSTR psubusw, 0, 0, 0
AVX_INSTR ptest
AVX_INSTR punpckhbw, 0, 0, 0
AVX_INSTR punpckhwd, 0, 0, 0
AVX_INSTR punpckhdq, 0, 0, 0
AVX_INSTR punpckhqdq, 0, 0, 0
AVX_INSTR punpcklbw, 0, 0, 0
AVX_INSTR punpcklwd, 0, 0, 0
AVX_INSTR punpckldq, 0, 0, 0
AVX_INSTR punpcklqdq, 0, 0, 0
AVX_INSTR pxor, 0, 0, 1
AVX_INSTR rcpps, 1, 0, 0
AVX_INSTR rcpss, 1, 0, 0
AVX_INSTR roundpd
AVX_INSTR roundps
AVX_INSTR roundsd
AVX_INSTR roundss
AVX_INSTR rsqrtps, 1, 0, 0
AVX_INSTR rsqrtss, 1, 0, 0
AVX_INSTR shufpd, 1, 1, 0
AVX_INSTR shufps, 1, 1, 0
AVX_INSTR sqrtpd, 1, 0, 0
AVX_INSTR sqrtps, 1, 0, 0
AVX_INSTR sqrtsd, 1, 0, 0
AVX_INSTR sqrtss, 1, 0, 0
AVX_INSTR stmxcsr
AVX_INSTR subpd, 1, 0, 0
AVX_INSTR subps, 1, 0, 0
AVX_INSTR subsd, 1, 0, 0
AVX_INSTR subss, 1, 0, 0
AVX_INSTR ucomisd
AVX_INSTR ucomiss
AVX_INSTR unpckhpd, 1, 0, 0
AVX_INSTR unpckhps, 1, 0, 0
AVX_INSTR unpcklpd, 1, 0, 0
AVX_INSTR unpcklps, 1, 0, 0
AVX_INSTR xorpd, 1, 0, 1
AVX_INSTR xorps, 1, 0, 1

; 3DNow instructions, for sharing code between AVX, SSE and 3DN
AVX_INSTR pfadd, 1, 0, 1
AVX_INSTR pfsub, 1, 0, 0
AVX_INSTR pfmul, 1, 0, 1

; base-4 constants for shuffles
%assign i 0
%rep 256
    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
    %if j < 10
        CAT_XDEFINE q000, j, i
    %elif j < 100
        CAT_XDEFINE q00, j, i
    %elif j < 1000
        CAT_XDEFINE q0, j, i
    %else
        CAT_XDEFINE q, j, i
    %endif
    %assign i i+1
%endrep
%undef i
%undef j
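
; A worked example of these constants: the digits of q#### give the source
; element written to each destination slot, most significant digit first, so
;     pshufd m0, m1, q1032
; sets m0 = { m1[1], m1[0], m1[3], m1[2] } (elements listed high to low),
; i.e. it swaps the two 64-bit halves of the register.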

; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
; This lets us use tzcnt without bumping the yasm version requirement yet.
%define tzcnt rep bsf

; convert FMA4 to FMA3 if possible
%macro FMA4_INSTR 4
    %macro %1 4-8 %1, %2, %3, %4
        %if cpuflag(fma4)
            v%5 %1, %2, %3, %4
        %elifidn %1, %2
            v%6 %1, %4, %3 ; %1 = %1 * %3 + %4
        %elifidn %1, %3
            v%7 %1, %2, %4 ; %1 = %2 * %1 + %4
        %elifidn %1, %4
            v%8 %1, %2, %3 ; %1 = %2 * %3 + %1
        %else
            %error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported
        %endif
    %endmacro
%endmacro

FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd
FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps
FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd
FMA4_INSTR fmaddss, fmadd132ss, fmadd213ss, fmadd231ss

FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd
FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps
FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd
FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps

FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd
FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps
FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd
FMA4_INSTR fmsubss, fmsub132ss, fmsub213ss, fmsub231ss

FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd
FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps
FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd
FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss

FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd
FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps
FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd
FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss

; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug
%if ARCH_X86_64 == 0
%macro vpbroadcastq 2
    %if sizeof%1 == 16
        movddup %1, %2
    %else
        vbroadcastsd %1, %2
    %endif
%endmacro
%endif