1;***************************************************************************** 2;* x86inc.asm: x264asm abstraction layer 3;***************************************************************************** 4;* Copyright (C) 2005-2014 x264 project 5;* 2013-2014 x265 project 6;* 7;* Authors: Loren Merritt <lorenm@u.washington.edu> 8;* Anton Mitrofanov <BugMaster@narod.ru> 9;* Fiona Glaser <fiona@x264.com> 10;* Henrik Gramner <henrik@gramner.com> 11;* Min Chen <chenm003@163.com> 12;* 13;* Permission to use, copy, modify, and/or distribute this software for any 14;* purpose with or without fee is hereby granted, provided that the above 15;* copyright notice and this permission notice appear in all copies. 16;* 17;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 18;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 19;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 20;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 21;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 22;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 23;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 24;***************************************************************************** 25 26; This is a header file for the x264ASM assembly language, which uses 27; NASM/YASM syntax combined with a large number of macros to provide easy 28; abstraction between different calling conventions (x86_32, win64, linux64). 29; It also has various other useful features to simplify writing the kind of 30; DSP functions that are most often used in x264. 31 32; Unlike the rest of x264, this file is available under an ISC license, as it 33; has significant usefulness outside of x264 and we want it to be available 34; to the largest audience possible. 
; Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well. Send patches or ideas
; to x264-devel@videolan.org .

; Symbol prefix applied to all private (hidden) functions declared with cglobal.
%ifndef private_prefix
    %define private_prefix X265_NS
%endif

; Symbol prefix applied to externally-visible functions declared with cvisible.
%ifndef public_prefix
    %define public_prefix private_prefix
%endif

; The stack alignment the environment guarantees at function entry;
; 16 on x86-64 ABIs, traditionally only 4 on x86-32.
%ifndef STACK_ALIGNMENT
    %if ARCH_X86_64
        %define STACK_ALIGNMENT 16
    %else
        %define STACK_ALIGNMENT 4
    %endif
%endif

; Detect the 64-bit calling convention from the object format:
; any Windows output format on a 64-bit arch means the Microsoft x64 ABI,
; everything else 64-bit means System V AMD64.
%define WIN64  0
%define UNIX64 0
%if ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,x64
        %define WIN64  1
    %else
        %define UNIX64 1
    %endif
%endif

; C symbol mangling: some platforms (e.g. Mach-O, win32 cdecl) prepend an
; underscore to C-visible symbols.
%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif

; Read-only data section with configurable alignment (default 32 for AVX constants).
%macro SECTION_RODATA 0-1 32
    SECTION .rodata align=%1
%endmacro

; Code section with configurable alignment.
%macro SECTION_TEXT 0-1 16
    SECTION .text align=%1
%endmacro

%if WIN64
    %define PIC
%elif ARCH_X86_64 == 0
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
    %undef PIC
%endif
%ifdef PIC
    default rel
%endif

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = (optional) stack size to be allocated. The stack will be aligned before
;      allocating the specified stack size. If the required stack alignment is
;      larger than the known stack alignment the stack will be manually aligned
;      and an extra register will be allocated to hold the original stack
;      pointer (to not invalidate r0m etc.). To prevent the use of an extra
;      register as stack pointer, request a negative stack size.
; %4+/%5+ = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,7,0x40, dst, src, tmp
; declares a function (foo) that automatically loads two arguments (dst and
; src) into registers, uses one additional register (tmp) plus 7 vector
; registers (m0-m6) and allocates 0x40 bytes of stack space.

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE, and returns.

; REP_RET:
; Use this instead of RET if it's a branch target.
; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNh is the high 8 bits of the word size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size

; Declare argument register N: %1 = index, %2 = register name,
; %3 = (optional) stack offset when the arg arrives on the stack.
; With only 2 args the argument is in a register, so rNm aliases the dword
; view of that register; otherwise rNm is a memory reference relative to rstk.
%macro DECLARE_REG 2-3
    %define r%1q %2
    %define r%1d %2d
    %define r%1w %2w
    %define r%1b %2b
    %define r%1h %2h
    %if %0 == 2
        %define r%1m %2d
        %define r%1mp %2
    %elif ARCH_X86_64 ; memory
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp qword r %+ %1 %+ m
    %else
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp dword r %+ %1 %+ m
    %endif
    %define r%1 %2
%endmacro

; Map the q/d/w/h/b size suffixes onto a legacy GPR (ax, bx, ...).
; %1 = 16-bit name, %2 = low-byte name, %3 = high-byte name (or "null"
; for si/di/bp, which have no high-byte form).
%macro DECLARE_REG_SIZE 3
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1h %3
    %define e%1h %3
    %define r%1b %2
    %define e%1b %2
%if ARCH_X86_64 == 0
    %define r%1 e%1
%endif
%endmacro

DECLARE_REG_SIZE ax, al, ah
DECLARE_REG_SIZE bx, bl, bh
DECLARE_REG_SIZE cx, cl, ch
DECLARE_REG_SIZE dx, dl, dh
DECLARE_REG_SIZE si, sil, null
DECLARE_REG_SIZE di, dil, null
DECLARE_REG_SIZE bp, bpl, null

; t# defines for when per-arch register allocation is more complex than just function arguments

; Map temporaries t0, t1, ... onto the argument registers given by index.
%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

; Give each t# temporary the same q/d/w/h/b size suffixes as the r# regs.
%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1h t%1 %+ h
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14

; Size of a general-purpose register / stack slot in bytes.
%if ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

; push/pop wrappers that keep stack_offset (and therefore the rNm stack-arg
; references) accurate. Tracking only applies while rstk still aliases rsp,
; i.e. before any manual stack realignment.
%macro PUSH 1
    push %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset+gprsize
    %endif
%endmacro

%macro POP 1
    pop %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset-gprsize
    %endif
%endmacro

; Push each listed register iff its index is below regs_used (callee-saved save).
%macro PUSH_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            PUSH r%1
        %endif
        %rotate 1
    %endrep
%endmacro

; Pop each listed register iff its index is below regs_used (callee-saved restore).
%macro POP_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            pop r%1
        %endif
        %rotate 1
    %endrep
%endmacro

; Load each listed argument from its original location (register or stack)
; into its assigned register, iff the function declared that many args.
%macro LOAD_IF_USED 1-*
    %rep %0
        %if %1 < num_args
            mov r%1, r %+ %1 %+ mp
        %endif
        %rotate 1
    %endrep
%endmacro

; sub/add wrappers that keep stack_offset accurate when adjusting the
; (still-aliased) stack pointer.
%macro SUB 2
    sub %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

; mov only if source and destination are not the same register.
%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

; movsxd only if source and destination differ (no-op self-extension elided).
%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

; Compile-time assertion on a preprocessor expression.
%macro ASSERT 1
    %if (%1) == 0
        %error assert failed
    %endif
%endmacro

; Bind human-readable names to the argument registers: each name gets
; q/d/w/h/b/m/mp suffixed aliases onto r0, r1, ... Previously defined
; names are undefined first so DEFINE_ARGS can be re-invoked mid-function.
%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, h
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name %+ %%i, mp
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %xdefine %%stack_offset stack_offset
    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1h r %+ %%i %+ h
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        %xdefine %1mp r %+ %%i %+ mp
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %xdefine stack_offset %%stack_offset
    %assign n_arg_names %0
%endmacro
; Alignment the current vector size requires (>= 16 once mmsize >= 16).
%define required_stack_alignment ((mmsize + 15) & ~15)

; Allocate %1 bytes of stack space (negative size = don't dedicate a register
; to the saved stack pointer; store it on the stack instead). On WIN64 this
; also accounts for the 32-byte shadow space and callee-saved xmm spills.
%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
    %ifnum %1
        %if %1 != 0
            %assign %%pad 0
            %assign stack_size %1
            %if stack_size < 0
                %assign stack_size -stack_size
            %endif
            %if WIN64
                %assign %%pad %%pad + 32 ; shadow space
                %if mmsize != 8
                    %assign xmm_regs_used %2
                    %if xmm_regs_used > 8
                        %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers
                    %endif
                %endif
            %endif
            %if required_stack_alignment <= STACK_ALIGNMENT
                ; maintain the current stack alignment
                %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
                SUB rsp, stack_size_padded
            %else
                %assign %%reg_num (regs_used - 1)
                %xdefine rstk r %+ %%reg_num
                ; align stack, and save original stack location directly above
                ; it, i.e. in [rsp+stack_size_padded], so we can restore the
                ; stack in a single instruction (i.e. mov rsp, rstk or mov
                ; rsp, [rsp+stack_size_padded])
                %if %1 < 0 ; need to store rsp on stack
                    %xdefine rstkm [rsp + stack_size + %%pad]
                    %assign %%pad %%pad + gprsize
                %else ; can keep rsp in rstk during whole function
                    %xdefine rstkm rstk
                %endif
                %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1))
                mov rstk, rsp
                and rsp, ~(required_stack_alignment-1)
                sub rsp, stack_size_padded
                movifnidn rstkm, rstk
            %endif
            WIN64_PUSH_XMM
        %endif
    %endif
%endmacro

; Reserve an extra register for the saved stack pointer when manual
; realignment will be needed; warn if that would clobber a register argument.
%macro SETUP_STACK_POINTER 1
    %ifnum %1
        %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
            %if %1 > 0
                %assign regs_used (regs_used + 1)
            %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2
                %warning "Stack pointer will overwrite register argument"
            %endif
        %endif
    %endif
%endmacro

; Dispatch the trailing cglobal/PROLOGUE parameters to DEFINE_ARGS, depending
; on whether a numeric stack size was supplied in the %4 slot.
%macro DEFINE_ARGS_INTERNAL 3+
    %ifnum %2
        DEFINE_ARGS %3
    %elif %1 == 4
        DEFINE_ARGS %2
    %elif %1 > 4
        DEFINE_ARGS %2, %3
    %endif
%endmacro

%if WIN64 ; Windows x64 ;=================================================

; Microsoft x64 ABI: args in rcx, rdx, r8, r9, then stack (offsets include
; the 32-byte shadow space); rdi/rsi/rbx/rbp/r12-r15 and xmm6-15 callee-saved.
DECLARE_REG 0,  rcx
DECLARE_REG 1,  rdx
DECLARE_REG 2,  R8
DECLARE_REG 3,  R9
DECLARE_REG 4,  R10, 40
DECLARE_REG 5,  R11, 48
DECLARE_REG 6,  rax, 56
DECLARE_REG 7,  rdi, 64
DECLARE_REG 8,  rsi, 72
DECLARE_REG 9,  rbx, 80
DECLARE_REG 10, rbp, 88
DECLARE_REG 11, R12, 96
DECLARE_REG 12, R13, 104
DECLARE_REG 13, R14, 112
DECLARE_REG 14, R15, 120

%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4, %3
    %if mmsize != 8 && stack_size == 0
        WIN64_SPILL_XMM %3
    %endif
    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%macro WIN64_PUSH_XMM 0
    ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
    %if xmm_regs_used > 6
        movaps [rstk + stack_offset +  8], xmm6
    %endif
    %if xmm_regs_used > 7
        movaps [rstk + stack_offset + 24], xmm7
    %endif
    %if xmm_regs_used > 8
        %assign %%i 8
        %rep xmm_regs_used-8
            movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 8
        ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
        %assign %%pad (xmm_regs_used-8)*16 + 32
        %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
        SUB rsp, stack_size_padded
    %endif
    WIN64_PUSH_XMM
%endmacro

; Restore callee-saved xmm registers and deallocate stack space.
; %1 = register to use as the base/stack pointer for the restores.
%macro WIN64_RESTORE_XMM_INTERNAL 1
    %assign %%pad_size 0
    %if xmm_regs_used > 8
        %assign %%i xmm_regs_used
        %rep xmm_regs_used-8
            %assign %%i %%i-1
            movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
        %endrep
    %endif
    %if stack_size_padded > 0
        %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add %1, stack_size_padded
            %assign %%pad_size stack_size_padded
        %endif
    %endif
    %if xmm_regs_used > 7
        movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
    %endif
    %if xmm_regs_used > 6
        movaps xmm6, [%1 + stack_offset - %%pad_size +  8]
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 1
    WIN64_RESTORE_XMM_INTERNAL %1
    %assign stack_offset (stack_offset-stack_size_padded)
    %assign xmm_regs_used 0
%endmacro

%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
%if mmsize == 32
    vzeroupper
%endif
    AUTO_REP_RET
%endmacro

%elif ARCH_X86_64 ; *nix x64 ;=============================================

; System V AMD64 ABI: args in rdi, rsi, rdx, rcx, r8, r9, then stack;
; rbx/rbp/r12-r15 callee-saved, all xmm registers volatile.
DECLARE_REG 0,  rdi
DECLARE_REG 1,  rsi
DECLARE_REG 2,  rdx
DECLARE_REG 3,  rcx
DECLARE_REG 4,  R8
DECLARE_REG 5,  R9
DECLARE_REG 6,  rax, 8
DECLARE_REG 7,  R10, 16
DECLARE_REG 8,  R11, 24
DECLARE_REG 9,  rbx, 32
DECLARE_REG 10, rbp, 40
DECLARE_REG 11, R12, 48
DECLARE_REG 12, R13, 56
DECLARE_REG 13, R14, 64
DECLARE_REG 14, R15, 72

%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4
    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0

%macro RET 0
%if stack_size_padded > 0
%if required_stack_alignment > STACK_ALIGNMENT
    mov rsp, rstkm
%else
    add rsp, stack_size_padded
%endif
%endif
    POP_IF_USED 14, 13, 12, 11, 10, 9
%if mmsize == 32
    vzeroupper
%endif
    AUTO_REP_RET
%endmacro

%else ; X86_32 ;==============================================================

; x86-32 cdecl: all args on the stack (offsets past the return address);
; ebx/esi/edi/ebp callee-saved.
DECLARE_REG 0, eax, 4
DECLARE_REG 1, ecx, 8
DECLARE_REG 2, edx, 12
DECLARE_REG 3, ebx, 16
DECLARE_REG 4, esi, 20
DECLARE_REG 5, edi, 24
DECLARE_REG 6, ebp, 28
%define rsp esp

; Stack locations for args beyond the 7 that get register aliases.
%macro DECLARE_ARG 1-*
    %rep %0
        %define r%1m [rstk + stack_offset + 4*%1 + 4]
        %define r%1mp dword r%1m
        %rotate 1
    %endrep
%endmacro

DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14

%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    %if num_args > 7
        %assign num_args 7
    %endif
    %if regs_used > 7
        %assign regs_used 7
    %endif
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 7
    PUSH_IF_USED 3, 4, 5, 6
    ALLOC_STACK %4
    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0

%macro RET 0
%if stack_size_padded > 0
%if required_stack_alignment > STACK_ALIGNMENT
    mov rsp, rstkm
%else
    add rsp, stack_size_padded
%endif
%endif
    POP_IF_USED 6, 5, 4, 3
%if mmsize == 32
    vzeroupper
%endif
    AUTO_REP_RET
%endmacro

%endif ;======================================================================

; No-op stubs so non-WIN64 code can invoke the WIN64 xmm macros unconditionally.
%if WIN64 == 0
%macro WIN64_SPILL_XMM 1
%endmacro
%macro WIN64_RESTORE_XMM 1
%endmacro
%macro WIN64_PUSH_XMM 0
%endmacro
%endif

; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
; a branch or a branch target. So switch to a 2-byte form of ret in that case.
; We can automatically detect "follows a branch", but not a branch target.
; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
%macro REP_RET 0
    %if has_epilogue
        RET
    %else
        rep ret
    %endif
%endmacro

%define last_branch_adr $$
%macro AUTO_REP_RET 0
    ; Emit a rep prefix before ret unless the previous instruction was a
    ; branch (tracked via last_branch_adr by the BRANCH_INSTR wrappers).
    %ifndef cpuflags
        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ != last_branch_adr.
    %elif notcpuflag(ssse3)
        times ((last_branch_adr-$)>>31)+1 rep
    %endif
    ret
%endmacro

; Wrap every conditional-branch mnemonic so that a label recording the address
; immediately after the branch is kept in last_branch_adr (see AUTO_REP_RET).
%macro BRANCH_INSTR 0-*
    %rep %0
        %macro %1 1-2 %1
            %2 %1
            %%branch_instr:
            %xdefine last_branch_adr %%branch_instr
        %endmacro
        %rotate 1
    %endrep
%endmacro

BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp

; Tail-call %1: a plain jmp when there is no epilogue to run and the callee
; is non-adjacent (%2), otherwise a call followed by this function's epilogue.
%macro TAIL_CALL 2 ; callee, is_nonadjacent
    %if has_epilogue
        call %1
        RET
    %elif %2
        jmp %1
    %endif
%endmacro

;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Begin a function.
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
; Declare a hidden (private) function; see the comment block above.
%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 1, %1 %+ SUFFIX, %2
%endmacro
; Like cglobal, but the symbol is exported with default (public) visibility.
%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 0, %1 %+ SUFFIX, %2
%endmacro
; %1 = 1 for private/hidden, 0 for public; %2 = mangled name; %3+ = PROLOGUE args.
%macro cglobal_internal 2-3+
    %if %1
        %xdefine %%FUNCTION_PREFIX private_prefix
        %xdefine %%VISIBILITY hidden
    %else
        %xdefine %%FUNCTION_PREFIX public_prefix
        %xdefine %%VISIBILITY
    %endif
    %ifndef cglobaled_%2
        %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2)
        %xdefine %2.skip_prologue %2 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %2, 1
    %endif
    %xdefine current_function %2
    %ifidn __OUTPUT_FORMAT__,elf
        global %2:function %%VISIBILITY
    %else
        global %2
    %endif
    align function_align
    %2:
    RESET_MM_PERMUTATION        ; needed for x86-64, also makes disassembly somewhat nicer
    %xdefine rstk rsp           ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
    %assign stack_offset 0      ; stack pointer offset relative to the return address
    %assign stack_size 0        ; amount of stack space that can be freely used inside a function
    %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
    %ifnidn %3, ""
        PROLOGUE %3
    %endif
%endmacro

; Declare an external symbol with the private prefix applied.
%macro cextern 1
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

; like cextern, but without the prefix
%macro cextern_naked 1
    %xdefine %1 mangle(%1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

; Declare a global data symbol (%1) with optional initializer (%2+).
%macro const 1-2+
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    %ifidn __OUTPUT_FORMAT__,elf
        global %1:data hidden
    %else
        global %1
    %endif
    ALIGN 32
    %1: %2
%endmacro

; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif

; cpuflags
; Each flag implies the flags it depends on, so cpuflag(x) can test a
; whole feature chain with a single mask comparison.

%assign cpuflags_mmx      (1<<0)
%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2     (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
%assign cpuflags_sse3     (1<<7) | cpuflags_sse2
%assign cpuflags_ssse3    (1<<8) | cpuflags_sse3
%assign cpuflags_sse4     (1<<9) | cpuflags_ssse3
%assign cpuflags_sse42    (1<<10)| cpuflags_sse4
%assign cpuflags_avx      (1<<11)| cpuflags_sse42
%assign cpuflags_xop      (1<<12)| cpuflags_avx
%assign cpuflags_fma4     (1<<13)| cpuflags_avx
%assign cpuflags_avx2     (1<<14)| cpuflags_avx
%assign cpuflags_fma3     (1<<15)| cpuflags_avx

%assign cpuflags_cache32  (1<<16)
%assign cpuflags_cache64  (1<<17)
%assign cpuflags_slowctz  (1<<18)
%assign cpuflags_lzcnt    (1<<19)
%assign cpuflags_aligned  (1<<20) ; not a cpu feature, but a function variant
%assign cpuflags_atom     (1<<21)
%assign cpuflags_bmi1     (1<<22)|cpuflags_lzcnt
%assign cpuflags_bmi2     (1<<23)|cpuflags_bmi1

%define    cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))

; Takes an arbitrary number of cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
%macro INIT_CPUFLAGS 0-*
    %xdefine SUFFIX
    %undef cpuname
    %assign cpuflags 0

    %if %0 >= 1
        %rep %0
            %ifdef cpuname
                %xdefine cpuname cpuname %+ _%1
            %else
                %xdefine cpuname %1
            %endif
            %assign cpuflags cpuflags | cpuflags_%1
            %rotate 1
        %endrep
        %xdefine SUFFIX _ %+ cpuname

        %if cpuflag(avx)
            %assign avx_enabled 1
        %endif
        ; Use single-precision moves when the integer forms of the current
        ; vector size aren't available on this cpu (pre-SSE2 / pre-AVX2).
        %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        %if cpuflag(aligned)
            %define movu mova
        %elif cpuflag(sse3) && notcpuflag(ssse3)
            %define movu lddqu
        %endif
    %endif

    ; Pick the nop padding style supported by the target cpu class.
    %if ARCH_X86_64 || cpuflag(sse2)
        CPU amdnop
    %else
        CPU basicnop
    %endif
%endmacro

; Merge mmx and sse*
; m# is a simd register of the currently selected size
; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
; (All 3 remain in sync through SWAP.)

; Token-pasting define: %1%2 = %3 (single-level expansion at define time).
%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

; Token-pasting undef: removes the definition of %1%2.
%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

; Select 64-bit MMX registers as the current vector size (mmsize == 8).
%macro INIT_MMX 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX %1
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    %assign %%i 0
    %rep 8
        CAT_XDEFINE m, %%i, mm %+ %%i
        CAT_XDEFINE nmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    ; Undefine m8-m15 left over from a previous INIT_XMM/INIT_YMM.
    %rep 8
        CAT_UNDEF m, %%i
        CAT_UNDEF nmm, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

; Select 128-bit XMM registers as the current vector size (mmsize == 16).
%macro INIT_XMM 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM %1
    %define mmsize 16
    %define num_mmregs 8
    %if ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, xmm %+ %%i
        CAT_XDEFINE nxmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

; Select 256-bit YMM registers as the current vector size (mmsize == 32).
%macro INIT_YMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM %1
    %define mmsize 32
    %define num_mmregs 8
    %if ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %undef movh
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, ymm %+ %%i
        CAT_XDEFINE nymm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

INIT_XMM

; Define cross-size register casts (mm<->xmm<->ymm) for register %1,
; used by the xm#/ym# aliases below.
%macro DECLARE_MMCAST 1
    %define  mmmm%1   mm%1
    %define  mmxmm%1  mm%1
    %define  mmymm%1  mm%1
    %define xmmmm%1   mm%1
    %define xmmxmm%1 xmm%1
    %define xmmymm%1 xmm%1
    %define ymmmm%1   mm%1
    %define ymmxmm%1 xmm%1
    %define ymmymm%1 ymm%1
    %define ymm%1xmm xmm%1
    %define xmm%1ymm ymm%1
    %define xm%1 xmm %+ m%1
    %define ym%1 ymm %+ m%1
%endmacro

%assign i 0
%rep 16
    DECLARE_MMCAST i
%assign i i+1
%endrep

; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.

%macro PERMUTE 2-* ; takes a list of pairs to swap
%rep %0/2
    %xdefine %%tmp%2 m%2
    %rotate 2
%endrep
%rep %0/2
    %xdefine m%1 %%tmp%2
    CAT_XDEFINE n, m%1, %1
    %rotate 2
%endrep
%endmacro

%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
%ifnum %1 ; SWAP 0, 1, ...
    SWAP_INTERNAL_NUM %1, %2
%else ; SWAP m0, m1, ...
    SWAP_INTERNAL_NAME %1, %2
%endif
%endmacro

; Swap a chain given as register numbers: SWAP 0,1,2 rotates m0<-m1<-m2 pairwise.
%macro SWAP_INTERNAL_NUM 2-*
    %rep %0-1
        %xdefine %%tmp m%1
        %xdefine m%1 m%2
        %xdefine m%2 %%tmp
        CAT_XDEFINE n, m%1, %1
        CAT_XDEFINE n, m%2, %2
        %rotate 1
    %endrep
%endmacro

; Swap a chain given as register names: translate each name to its number
; via the n* defines, then defer to SWAP_INTERNAL_NUM.
%macro SWAP_INTERNAL_NAME 2-*
    %xdefine %%args n %+ %1
    %rep %0-1
        %xdefine %%args %%args, n %+ %2
        %rotate 1
    %endrep
    SWAP_INTERNAL_NUM %%args
%endmacro

; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
; calls to that function will automatically load the permutation, so values can
; be returned in mmregs.
961%macro SAVE_MM_PERMUTATION 0-1 962 %if %0 963 %xdefine %%f %1_m 964 %else 965 %xdefine %%f current_function %+ _m 966 %endif 967 %assign %%i 0 968 %rep num_mmregs 969 CAT_XDEFINE %%f, %%i, m %+ %%i 970 %assign %%i %%i+1 971 %endrep 972%endmacro 973 974%macro LOAD_MM_PERMUTATION 1 ; name to load from 975 %ifdef %1_m0 976 %assign %%i 0 977 %rep num_mmregs 978 CAT_XDEFINE m, %%i, %1_m %+ %%i 979 CAT_XDEFINE n, m %+ %%i, %%i 980 %assign %%i %%i+1 981 %endrep 982 %endif 983%endmacro 984 985; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't 986%macro call 1 987 call_internal %1, %1 %+ SUFFIX 988%endmacro 989%macro call_internal 2 990 %xdefine %%i %1 991 %ifndef cglobaled_%1 992 %ifdef cglobaled_%2 993 %xdefine %%i %2 994 %endif 995 %endif 996 call %%i 997 LOAD_MM_PERMUTATION %%i 998%endmacro 999 1000; Substitutions that reduce instruction size but are functionally equivalent 1001%macro add 2 1002 %ifnum %2 1003 %if %2==128 1004 sub %1, -128 1005 %else 1006 add %1, %2 1007 %endif 1008 %else 1009 add %1, %2 1010 %endif 1011%endmacro 1012 1013%macro sub 2 1014 %ifnum %2 1015 %if %2==128 1016 add %1, -128 1017 %else 1018 sub %1, %2 1019 %endif 1020 %else 1021 sub %1, %2 1022 %endif 1023%endmacro 1024 1025;============================================================================= 1026; AVX abstraction layer 1027;============================================================================= 1028 1029%assign i 0 1030%rep 16 1031 %if i < 8 1032 CAT_XDEFINE sizeofmm, i, 8 1033 %endif 1034 CAT_XDEFINE sizeofxmm, i, 16 1035 CAT_XDEFINE sizeofymm, i, 32 1036%assign i i+1 1037%endrep 1038%undef i 1039 1040%macro CHECK_AVX_INSTR_EMU 3-* 1041 %xdefine %%opcode %1 1042 %xdefine %%dst %2 1043 %rep %0-2 1044 %ifidn %%dst, %3 1045 %error non-avx emulation of ``%%opcode'' is not supported 1046 %endif 1047 %rotate 1 1048 %endrep 1049%endmacro 1050 1051;%1 == instruction 1052;%2 == minimal instruction set 1053;%3 == 1 if float, 0 if int 1054;%4 
== 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise 1055;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not 1056;%6+: operands 1057%macro RUN_AVX_INSTR 6-9+ 1058 %ifnum sizeof%7 1059 %assign __sizeofreg sizeof%7 1060 %elifnum sizeof%6 1061 %assign __sizeofreg sizeof%6 1062 %else 1063 %assign __sizeofreg mmsize 1064 %endif 1065 %assign __emulate_avx 0 1066 %if avx_enabled && __sizeofreg >= 16 1067 %xdefine __instr v%1 1068 %else 1069 %xdefine __instr %1 1070 %if %0 >= 8+%4 1071 %assign __emulate_avx 1 1072 %endif 1073 %endif 1074 %ifnidn %2, fnord 1075 %ifdef cpuname 1076 %if notcpuflag(%2) 1077 %error use of ``%1'' %2 instruction in cpuname function: current_function 1078 %endif 1079 %endif 1080 %endif 1081 1082 %if __emulate_avx 1083 %xdefine __src1 %7 1084 %xdefine __src2 %8 1085 %ifnidn %6, %7 1086 %if %0 >= 9 1087 CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, %8, %9 1088 %else 1089 CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, %8 1090 %endif 1091 %if %5 && %4 == 0 1092 %ifnid %8 1093 ; 3-operand AVX instructions with a memory arg can only have it in src2, 1094 ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). 1095 ; So, if the instruction is commutative with a memory arg, swap them. 1096 %xdefine __src1 %8 1097 %xdefine __src2 %7 1098 %endif 1099 %endif 1100 %if __sizeofreg == 8 1101 MOVQ %6, __src1 1102 %elif %3 1103 MOVAPS %6, __src1 1104 %else 1105 MOVDQA %6, __src1 1106 %endif 1107 %endif 1108 %if %0 >= 9 1109 %1 %6, __src2, %9 1110 %else 1111 %1 %6, __src2 1112 %endif 1113 %elif %0 >= 9 1114 __instr %6, %7, %8, %9 1115 %elif %0 == 8 1116 __instr %6, %7, %8 1117 %elif %0 == 7 1118 __instr %6, %7 1119 %else 1120 __instr %6 1121 %endif 1122%endmacro 1123 1124;%1 == instruction 1125;%2 == minimal instruction set 1126;%3 == 1 if float, 0 if int 1127;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise 1128;%5 == 1 if commutative (i.e. 
doesn't matter which src arg is which), 0 if not 1129%macro AVX_INSTR 1-5 fnord, 0, 1, 0 1130 %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5 1131 %ifidn %2, fnord 1132 RUN_AVX_INSTR %6, %7, %8, %9, %10, %1 1133 %elifidn %3, fnord 1134 RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2 1135 %elifidn %4, fnord 1136 RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3 1137 %elifidn %5, fnord 1138 RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4 1139 %else 1140 RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5 1141 %endif 1142 %endmacro 1143%endmacro 1144 1145; Instructions with both VEX and non-VEX encodings 1146; Non-destructive instructions are written without parameters 1147AVX_INSTR addpd, sse2, 1, 0, 1 1148AVX_INSTR addps, sse, 1, 0, 1 1149AVX_INSTR addsd, sse2, 1, 0, 1 1150AVX_INSTR addss, sse, 1, 0, 1 1151AVX_INSTR addsubpd, sse3, 1, 0, 0 1152AVX_INSTR addsubps, sse3, 1, 0, 0 1153AVX_INSTR aesdec, fnord, 0, 0, 0 1154AVX_INSTR aesdeclast, fnord, 0, 0, 0 1155AVX_INSTR aesenc, fnord, 0, 0, 0 1156AVX_INSTR aesenclast, fnord, 0, 0, 0 1157AVX_INSTR aesimc 1158AVX_INSTR aeskeygenassist 1159AVX_INSTR andnpd, sse2, 1, 0, 0 1160AVX_INSTR andnps, sse, 1, 0, 0 1161AVX_INSTR andpd, sse2, 1, 0, 1 1162AVX_INSTR andps, sse, 1, 0, 1 1163AVX_INSTR blendpd, sse4, 1, 0, 0 1164AVX_INSTR blendps, sse4, 1, 0, 0 1165AVX_INSTR blendvpd, sse4, 1, 0, 0 1166AVX_INSTR blendvps, sse4, 1, 0, 0 1167AVX_INSTR cmppd, sse2, 1, 1, 0 1168AVX_INSTR cmpps, sse, 1, 1, 0 1169AVX_INSTR cmpsd, sse2, 1, 1, 0 1170AVX_INSTR cmpss, sse, 1, 1, 0 1171AVX_INSTR comisd, sse2 1172AVX_INSTR comiss, sse 1173AVX_INSTR cvtdq2pd, sse2 1174AVX_INSTR cvtdq2ps, sse2 1175AVX_INSTR cvtpd2dq, sse2 1176AVX_INSTR cvtpd2ps, sse2 1177AVX_INSTR cvtps2dq, sse2 1178AVX_INSTR cvtps2pd, sse2 1179AVX_INSTR cvtsd2si, sse2 1180AVX_INSTR cvtsd2ss, sse2 1181AVX_INSTR cvtsi2sd, sse2 1182AVX_INSTR cvtsi2ss, sse 1183AVX_INSTR cvtss2sd, sse2 1184AVX_INSTR cvtss2si, sse 1185AVX_INSTR cvttpd2dq, sse2 1186AVX_INSTR cvttps2dq, sse2 
; AVX_INSTR declarations, continued
; (arguments: instruction, minimal cpuflag, float, non-destructive/4-op, commutative)
AVX_INSTR cvttsd2si, sse2
AVX_INSTR cvttss2si, sse
AVX_INSTR divpd, sse2, 1, 0, 0
AVX_INSTR divps, sse, 1, 0, 0
AVX_INSTR divsd, sse2, 1, 0, 0
AVX_INSTR divss, sse, 1, 0, 0
AVX_INSTR dppd, sse4, 1, 1, 0
AVX_INSTR dpps, sse4, 1, 1, 0
AVX_INSTR extractps, sse4
AVX_INSTR haddpd, sse3, 1, 0, 0
AVX_INSTR haddps, sse3, 1, 0, 0
AVX_INSTR hsubpd, sse3, 1, 0, 0
AVX_INSTR hsubps, sse3, 1, 0, 0
AVX_INSTR insertps, sse4, 1, 1, 0
AVX_INSTR lddqu, sse3
AVX_INSTR ldmxcsr, sse
AVX_INSTR maskmovdqu, sse2
AVX_INSTR maxpd, sse2, 1, 0, 1
AVX_INSTR maxps, sse, 1, 0, 1
AVX_INSTR maxsd, sse2, 1, 0, 1
AVX_INSTR maxss, sse, 1, 0, 1
AVX_INSTR minpd, sse2, 1, 0, 1
AVX_INSTR minps, sse, 1, 0, 1
AVX_INSTR minsd, sse2, 1, 0, 1
AVX_INSTR minss, sse, 1, 0, 1
AVX_INSTR movapd, sse2
AVX_INSTR movaps, sse
AVX_INSTR movd
AVX_INSTR movddup, sse3
AVX_INSTR movdqa, sse2
AVX_INSTR movdqu, sse2
AVX_INSTR movhlps, sse, 1, 0, 0
AVX_INSTR movhpd, sse2, 1, 0, 0
AVX_INSTR movhps, sse, 1, 0, 0
AVX_INSTR movlhps, sse, 1, 0, 0
AVX_INSTR movlpd, sse2, 1, 0, 0
AVX_INSTR movlps, sse, 1, 0, 0
AVX_INSTR movmskpd, sse2
AVX_INSTR movmskps, sse
AVX_INSTR movntdq, sse2
AVX_INSTR movntdqa, sse4
AVX_INSTR movntpd, sse2
AVX_INSTR movntps, sse
AVX_INSTR movq
AVX_INSTR movsd, sse2, 1, 0, 0
AVX_INSTR movshdup, sse3
AVX_INSTR movsldup, sse3
AVX_INSTR movss, sse, 1, 0, 0
AVX_INSTR movupd, sse2
AVX_INSTR movups, sse
AVX_INSTR mpsadbw, sse4
AVX_INSTR mulpd, sse2, 1, 0, 1
AVX_INSTR mulps, sse, 1, 0, 1
AVX_INSTR mulsd, sse2, 1, 0, 1
AVX_INSTR mulss, sse, 1, 0, 1
AVX_INSTR orpd, sse2, 1, 0, 1
AVX_INSTR orps, sse, 1, 0, 1
AVX_INSTR pabsb, ssse3
AVX_INSTR pabsd, ssse3
AVX_INSTR pabsw, ssse3
AVX_INSTR packsswb, mmx, 0, 0, 0
AVX_INSTR packssdw, mmx, 0, 0, 0
AVX_INSTR packuswb, mmx, 0, 0, 0
; AVX_INSTR declarations, continued
; (arguments: instruction, minimal cpuflag, float, non-destructive/4-op, commutative)
AVX_INSTR packusdw, sse4, 0, 0, 0
AVX_INSTR paddb, mmx, 0, 0, 1
AVX_INSTR paddw, mmx, 0, 0, 1
AVX_INSTR paddd, mmx, 0, 0, 1
AVX_INSTR paddq, sse2, 0, 0, 1
AVX_INSTR paddsb, mmx, 0, 0, 1
AVX_INSTR paddsw, mmx, 0, 0, 1
AVX_INSTR paddusb, mmx, 0, 0, 1
AVX_INSTR paddusw, mmx, 0, 0, 1
AVX_INSTR palignr, ssse3
AVX_INSTR pand, mmx, 0, 0, 1
AVX_INSTR pandn, mmx, 0, 0, 0
AVX_INSTR pavgb, mmx2, 0, 0, 1
AVX_INSTR pavgw, mmx2, 0, 0, 1
AVX_INSTR pblendvb, sse4, 0, 0, 0
AVX_INSTR pblendw, sse4
AVX_INSTR pclmulqdq
AVX_INSTR pcmpestri, sse42
AVX_INSTR pcmpestrm, sse42
AVX_INSTR pcmpistri, sse42
AVX_INSTR pcmpistrm, sse42
AVX_INSTR pcmpeqb, mmx, 0, 0, 1
AVX_INSTR pcmpeqw, mmx, 0, 0, 1
AVX_INSTR pcmpeqd, mmx, 0, 0, 1
AVX_INSTR pcmpeqq, sse4, 0, 0, 1
AVX_INSTR pcmpgtb, mmx, 0, 0, 0
AVX_INSTR pcmpgtw, mmx, 0, 0, 0
AVX_INSTR pcmpgtd, mmx, 0, 0, 0
AVX_INSTR pcmpgtq, sse42, 0, 0, 0
AVX_INSTR pextrb, sse4
AVX_INSTR pextrd, sse4
AVX_INSTR pextrq, sse4
AVX_INSTR pextrw, mmx2
AVX_INSTR phaddw, ssse3, 0, 0, 0
AVX_INSTR phaddd, ssse3, 0, 0, 0
AVX_INSTR phaddsw, ssse3, 0, 0, 0
AVX_INSTR phminposuw, sse4
AVX_INSTR phsubw, ssse3, 0, 0, 0
AVX_INSTR phsubd, ssse3, 0, 0, 0
AVX_INSTR phsubsw, ssse3, 0, 0, 0
AVX_INSTR pinsrb, sse4
AVX_INSTR pinsrd, sse4
AVX_INSTR pinsrq, sse4
AVX_INSTR pinsrw, mmx2
AVX_INSTR pmaddwd, mmx, 0, 0, 1
AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
AVX_INSTR pmaxsb, sse4, 0, 0, 1
AVX_INSTR pmaxsw, mmx2, 0, 0, 1
AVX_INSTR pmaxsd, sse4, 0, 0, 1
AVX_INSTR pmaxub, mmx2, 0, 0, 1
AVX_INSTR pmaxuw, sse4, 0, 0, 1
AVX_INSTR pmaxud, sse4, 0, 0, 1
AVX_INSTR pminsb, sse4, 0, 0, 1
AVX_INSTR pminsw, mmx2, 0, 0, 1
AVX_INSTR pminsd, sse4, 0, 0, 1
AVX_INSTR pminub, mmx2, 0, 0, 1
AVX_INSTR pminuw, sse4, 0, 0, 1
AVX_INSTR pminud, sse4, 0, 0, 1
AVX_INSTR pmovmskb, mmx2
; AVX_INSTR declarations, continued
; (arguments: instruction, minimal cpuflag, float, non-destructive/4-op, commutative)
AVX_INSTR pmovsxbw, sse4
AVX_INSTR pmovsxbd, sse4
AVX_INSTR pmovsxbq, sse4
AVX_INSTR pmovsxwd, sse4
AVX_INSTR pmovsxwq, sse4
AVX_INSTR pmovsxdq, sse4
AVX_INSTR pmovzxbw, sse4
AVX_INSTR pmovzxbd, sse4
AVX_INSTR pmovzxbq, sse4
AVX_INSTR pmovzxwd, sse4
AVX_INSTR pmovzxwq, sse4
AVX_INSTR pmovzxdq, sse4
AVX_INSTR pmuldq, sse4, 0, 0, 1
AVX_INSTR pmulhrsw, ssse3, 0, 0, 1
AVX_INSTR pmulhuw, mmx2, 0, 0, 1
AVX_INSTR pmulhw, mmx, 0, 0, 1
AVX_INSTR pmullw, mmx, 0, 0, 1
AVX_INSTR pmulld, sse4, 0, 0, 1
AVX_INSTR pmuludq, sse2, 0, 0, 1
AVX_INSTR por, mmx, 0, 0, 1
AVX_INSTR psadbw, mmx2, 0, 0, 1
AVX_INSTR pshufb, ssse3, 0, 0, 0
AVX_INSTR pshufd, sse2
AVX_INSTR pshufhw, sse2
AVX_INSTR pshuflw, sse2
AVX_INSTR psignb, ssse3, 0, 0, 0
AVX_INSTR psignw, ssse3, 0, 0, 0
AVX_INSTR psignd, ssse3, 0, 0, 0
AVX_INSTR psllw, mmx, 0, 0, 0
AVX_INSTR pslld, mmx, 0, 0, 0
AVX_INSTR psllq, mmx, 0, 0, 0
AVX_INSTR pslldq, sse2, 0, 0, 0
AVX_INSTR psraw, mmx, 0, 0, 0
AVX_INSTR psrad, mmx, 0, 0, 0
AVX_INSTR psrlw, mmx, 0, 0, 0
AVX_INSTR psrld, mmx, 0, 0, 0
AVX_INSTR psrlq, mmx, 0, 0, 0
AVX_INSTR psrldq, sse2, 0, 0, 0
AVX_INSTR psubb, mmx, 0, 0, 0
AVX_INSTR psubw, mmx, 0, 0, 0
AVX_INSTR psubd, mmx, 0, 0, 0
AVX_INSTR psubq, sse2, 0, 0, 0
AVX_INSTR psubsb, mmx, 0, 0, 0
AVX_INSTR psubsw, mmx, 0, 0, 0
AVX_INSTR psubusb, mmx, 0, 0, 0
AVX_INSTR psubusw, mmx, 0, 0, 0
AVX_INSTR ptest, sse4
AVX_INSTR punpckhbw, mmx, 0, 0, 0
AVX_INSTR punpckhwd, mmx, 0, 0, 0
AVX_INSTR punpckhdq, mmx, 0, 0, 0
AVX_INSTR punpckhqdq, sse2, 0, 0, 0
AVX_INSTR punpcklbw, mmx, 0, 0, 0
AVX_INSTR punpcklwd, mmx, 0, 0, 0
AVX_INSTR punpckldq, mmx, 0, 0, 0
AVX_INSTR punpcklqdq, sse2, 0, 0, 0
AVX_INSTR pxor, mmx, 0, 0, 1
AVX_INSTR rcpps, sse, 1, 0, 0
AVX_INSTR rcpss, sse, 1, 0, 0
AVX_INSTR roundpd, sse4
; AVX_INSTR declarations, continued
; (arguments: instruction, minimal cpuflag, float, non-destructive/4-op, commutative)
AVX_INSTR roundps, sse4
AVX_INSTR roundsd, sse4
AVX_INSTR roundss, sse4
AVX_INSTR rsqrtps, sse, 1, 0, 0
AVX_INSTR rsqrtss, sse, 1, 0, 0
AVX_INSTR shufpd, sse2, 1, 1, 0
AVX_INSTR shufps, sse, 1, 1, 0
AVX_INSTR sqrtpd, sse2, 1, 0, 0
AVX_INSTR sqrtps, sse, 1, 0, 0
AVX_INSTR sqrtsd, sse2, 1, 0, 0
AVX_INSTR sqrtss, sse, 1, 0, 0
AVX_INSTR stmxcsr, sse
AVX_INSTR subpd, sse2, 1, 0, 0
AVX_INSTR subps, sse, 1, 0, 0
AVX_INSTR subsd, sse2, 1, 0, 0
AVX_INSTR subss, sse, 1, 0, 0
AVX_INSTR ucomisd, sse2
AVX_INSTR ucomiss, sse
AVX_INSTR unpckhpd, sse2, 1, 0, 0
AVX_INSTR unpckhps, sse, 1, 0, 0
AVX_INSTR unpcklpd, sse2, 1, 0, 0
AVX_INSTR unpcklps, sse, 1, 0, 0
AVX_INSTR xorpd, sse2, 1, 0, 1
AVX_INSTR xorps, sse, 1, 0, 1

; 3DNow instructions, for sharing code between AVX, SSE and 3DNow
AVX_INSTR pfadd, 3dnow, 1, 0, 1
AVX_INSTR pfsub, 3dnow, 1, 0, 0
AVX_INSTR pfmul, 3dnow, 1, 0, 1

; base-4 constants for shuffles
; Defines q0000..q3333: the name spells the shuffle selector in base 4
; (most-significant element first) and the value is the corresponding
; 8-bit immediate.  CAT_XDEFINE (defined earlier in this file) pastes the
; prefix and j together into the constant's name.
%assign i 0
%rep 256
    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
    %if j < 10
        CAT_XDEFINE q000, j, i
    %elif j < 100
        CAT_XDEFINE q00, j, i
    %elif j < 1000
        CAT_XDEFINE q0, j, i
    %else
        CAT_XDEFINE q, j, i
    %endif
%assign i i+1
%endrep
%undef i
%undef j

; Emulate XOP multiply-accumulate instructions (%1) with a multiply (%2)
; followed by an add (%3) when the xop cpuflag is absent.  The emulation
; overwrites dst with the product first, so it errors out when dst is the
; same register as the addend (%1 == %4).
%macro FMA_INSTR 3
    %macro %1 4-7 %1, %2, %3
        %if cpuflag(xop)
            v%5 %1, %2, %3, %4
        %elifnidn %1, %4
            %6 %1, %2, %3
            %7 %1, %4
        %else
            %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
        %endif
    %endmacro
%endmacro

FMA_INSTR pmacsww, pmullw, paddw
FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation
FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation
FMA_INSTR pmadcswd, pmaddwd, paddd

; convert FMA4 to FMA3 if possible
; FMA4 form: %1 = %2 * %3 + %4.  FMA3 is destructive, so the conversion
; only works when dst aliases one of the sources; the 132/213/231 variant
; (%6/%7/%8) is chosen to match which source dst aliases.
%macro FMA4_INSTR 4
    %macro %1 4-8 %1, %2, %3, %4
        %if cpuflag(fma4)
            v%5 %1, %2, %3, %4
        %elifidn %1, %2
            v%6 %1, %4, %3 ; %1 = %1 * %3 + %4
        %elifidn %1, %3
            v%7 %1, %2, %4 ; %1 = %2 * %1 + %4
        %elifidn %1, %4
            v%8 %1, %2, %3 ; %1 = %2 * %3 + %1
        %else
            %error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported
        %endif
    %endmacro
%endmacro

; (arguments: FMA4 instruction, FMA3 132/213/231 variants)
FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd
FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps
FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd
FMA4_INSTR fmaddss, fmadd132ss, fmadd213ss, fmadd231ss

FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd
FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps
FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd
FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps

FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd
FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps
FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd
FMA4_INSTR fmsubss, fmsub132ss, fmsub213ss, fmsub231ss

FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd
FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps
FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd
FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss

FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd
FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps
FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd
FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss

; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug
; For xmm destinations, broadcast a qword by duplicating it with movddup;
; for larger destinations, use vbroadcastsd instead.
%if ARCH_X86_64 == 0
%macro vpbroadcastq 2
%if sizeof%1 == 16
    movddup %1, %2
%else
    vbroadcastsd %1, %2
%endif
%endmacro
%endif