;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2016 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Fiona Glaser <fiona@x264.com>
;*          Henrik Gramner <henrik@gramner.com>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************

; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.

; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible. Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well. Send patches or ideas
; to x264-devel@videolan.org .

%include "vpx_config.asm"

%ifndef private_prefix
    %define private_prefix vpx
%endif

%ifndef public_prefix
    %define public_prefix private_prefix
%endif

%ifndef STACK_ALIGNMENT
    %if VPX_ARCH_X86_64
        %define STACK_ALIGNMENT 16
    %else
        %define STACK_ALIGNMENT 4
    %endif
%endif

%define WIN64 0
%define UNIX64 0
%if VPX_ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64 1
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN64 1
    %elifidn __OUTPUT_FORMAT__,x64
        %define WIN64 1
    %else
        %define UNIX64 1
    %endif
%endif

%define FORMAT_ELF 0
%ifidn __OUTPUT_FORMAT__,elf
    %define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,elf32
    %define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,elf64
    %define FORMAT_ELF 1
%endif

%define FORMAT_MACHO 0
%ifidn __OUTPUT_FORMAT__,macho32
    %define FORMAT_MACHO 1
%elifidn __OUTPUT_FORMAT__,macho64
    %define FORMAT_MACHO 1
%endif

; Set PREFIX for libvpx builds.
%if FORMAT_ELF
    %undef PREFIX
%elif WIN64
    %undef PREFIX
%else
    %define PREFIX
%endif

%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif

; In some instances macho32 tables get misaligned when using .rodata.
; When looking at the disassembly it appears that the offset is either
; correct or consistently off by 90. Placing them in the .text section
; works around the issue. It appears to be specific to the way libvpx
; handles the tables.
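;
; An illustrative use of the macro defined below (hypothetical table name, not
; part of this file):
;   SECTION_RODATA 32
;   my_table: times 16 dw 1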
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,macho32
        SECTION .text align=%1
        fakegot:
    %elifidn __OUTPUT_FORMAT__,aout
        SECTION .text
    %else
        SECTION .rodata align=%1
    %endif
%endmacro

; PIC macros are copied from vpx_ports/x86_abi_support.asm. The "define PIC"
; from the original code is added in for 64bit.
%ifidn __OUTPUT_FORMAT__,elf32
%define ABI_IS_32BIT 1
%elifidn __OUTPUT_FORMAT__,macho32
%define ABI_IS_32BIT 1
%elifidn __OUTPUT_FORMAT__,win32
%define ABI_IS_32BIT 1
%elifidn __OUTPUT_FORMAT__,aout
%define ABI_IS_32BIT 1
%else
%define ABI_IS_32BIT 0
%endif

%if ABI_IS_32BIT
  %if CONFIG_PIC=1
    %ifidn __OUTPUT_FORMAT__,elf32
      %define GET_GOT_DEFINED 1
      %define WRT_PLT wrt ..plt
      %macro GET_GOT 1
        extern _GLOBAL_OFFSET_TABLE_
        push %1
        call %%get_got
        %%sub_offset:
        jmp %%exitGG
        %%get_got:
        mov %1, [esp]
        add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc
        ret
        %%exitGG:
        %undef GLOBAL
        %define GLOBAL(x) x + %1 wrt ..gotoff
        %undef RESTORE_GOT
        %define RESTORE_GOT pop %1
      %endmacro
    %elifidn __OUTPUT_FORMAT__,macho32
      %define GET_GOT_DEFINED 1
      %macro GET_GOT 1
        push %1
        call %%get_got
        %%get_got:
        pop %1
        %undef GLOBAL
        %define GLOBAL(x) x + %1 - %%get_got
        %undef RESTORE_GOT
        %define RESTORE_GOT pop %1
      %endmacro
    %else
      %define GET_GOT_DEFINED 0
    %endif
  %endif

  %if VPX_ARCH_X86_64 == 0
    %undef PIC
  %endif

%else
  %macro GET_GOT 1
  %endmacro
  %define GLOBAL(x) rel x
  %define WRT_PLT wrt ..plt

  %if WIN64
    %define PIC
  %elifidn __OUTPUT_FORMAT__,macho64
    %define PIC
  %elif CONFIG_PIC
    %define PIC
  %endif
%endif

%ifnmacro GET_GOT
    %macro GET_GOT 1
    %endmacro
    %define GLOBAL(x) x
%endif
%ifndef RESTORE_GOT
    %define RESTORE_GOT
%endif
%ifndef WRT_PLT
    %define WRT_PLT
%endif

%ifdef PIC
    default rel
%endif

%ifndef GET_GOT_DEFINED
    %define GET_GOT_DEFINED 0
%endif
; Done with PIC macros

%ifdef __NASM_VER__
    %use smartalign
%endif

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = (optional) stack size to be allocated. The stack will be aligned before
;      allocating the specified stack size. If the required stack alignment is
;      larger than the known stack alignment the stack will be manually aligned
;      and an extra register will be allocated to hold the original stack
;      pointer (to not invalidate r0m etc.). To prevent the use of an extra
;      register as stack pointer, request a negative stack size.
; %4+/%5+ = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,7,0x40, dst, src, tmp
; declares a function (foo) that automatically loads two arguments (dst and
; src) into registers, uses one additional register (tmp) plus 7 vector
; registers (m0-m6) and allocates 0x40 bytes of stack space.

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need a more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE, and returns.

; REP_RET:
; Use this instead of RET if it's a branch target.

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNh is the high 8 bits of the word size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size

%macro DECLARE_REG 2-3
    %define r%1q %2
    %define r%1d %2d
    %define r%1w %2w
    %define r%1b %2b
    %define r%1h %2h
    %define %2q %2
    %if %0 == 2
        %define r%1m %2d
        %define r%1mp %2
    %elif VPX_ARCH_X86_64 ; memory
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp qword r %+ %1 %+ m
    %else
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp dword r %+ %1 %+ m
    %endif
    %define r%1 %2
%endmacro

%macro DECLARE_REG_SIZE 3
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1h %3
    %define e%1h %3
    %define r%1b %2
    %define e%1b %2
    %if VPX_ARCH_X86_64 == 0
        %define r%1 e%1
    %endif
%endmacro

DECLARE_REG_SIZE ax, al, ah
DECLARE_REG_SIZE bx, bl, bh
DECLARE_REG_SIZE cx, cl, ch
DECLARE_REG_SIZE dx, dl, dh
DECLARE_REG_SIZE si, sil, null
DECLARE_REG_SIZE di, dil, null
DECLARE_REG_SIZE bp, bpl, null

; t# defines for when per-arch register allocation is more complex than just function arguments

%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1h t%1 %+ h
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14

%if VPX_ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

%macro PUSH 1
    push %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset+gprsize
    %endif
%endmacro

%macro POP 1
    pop %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset-gprsize
    %endif
%endmacro

%macro PUSH_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            PUSH r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro POP_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            pop r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro LOAD_IF_USED 1-*
    %rep %0
        %if %1 < num_args
            mov r%1, r %+ %1 %+ mp
        %endif
        %rotate 1
    %endrep
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro
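
; Illustrative sketch (not assembled) of why PUSH/POP/SUB/ADD above track
; stack_offset: on x86_32, r4m expands to [rstk + stack_offset + 20], so in
;   PUSH r6          ; stack_offset += gprsize
;   mov  r0, r4m     ; still reads the caller's 5th argument
; the memory reference stays correct after the stack pointer has moved.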

%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

%macro ASSERT 1
    %if (%1) == 0
        %error assertion ``%1'' failed
    %endif
%endmacro

%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, h
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name %+ %%i, mp
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %xdefine %%stack_offset stack_offset
    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1h r %+ %%i %+ h
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        %xdefine %1mp r %+ %%i %+ mp
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %xdefine stack_offset %%stack_offset
    %assign n_arg_names %0
%endmacro

%define required_stack_alignment ((mmsize + 15) & ~15)
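
; Illustrative sketch (hypothetical names, not assembled): in a function
; declared as "cglobal filter_row, 3, 4, 8, dst, src, w", an extra temporary
; register can be named with
;   DEFINE_ARGS dst, src, w, tmp
; after which tmpq/tmpd alias r3/r3d just as dstq/dstd alias r0/r0d.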

%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
    %ifnum %1
        %if %1 != 0
            %assign %%pad 0
            %assign stack_size %1
            %if stack_size < 0
                %assign stack_size -stack_size
            %endif
            %if WIN64
                %assign %%pad %%pad + 32 ; shadow space
                %if mmsize != 8
                    %assign xmm_regs_used %2
                    %if xmm_regs_used > 8
                        %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers
                    %endif
                %endif
            %endif
            %if required_stack_alignment <= STACK_ALIGNMENT
                ; maintain the current stack alignment
                %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
                SUB rsp, stack_size_padded
            %else
                %assign %%reg_num (regs_used - 1)
                %xdefine rstk r %+ %%reg_num
                ; align stack, and save original stack location directly above
                ; it, i.e. in [rsp+stack_size_padded], so we can restore the
                ; stack in a single instruction (i.e. mov rsp, rstk or mov
                ; rsp, [rsp+stack_size_padded])
                %if %1 < 0 ; need to store rsp on stack
                    %xdefine rstkm [rsp + stack_size + %%pad]
                    %assign %%pad %%pad + gprsize
                %else ; can keep rsp in rstk during whole function
                    %xdefine rstkm rstk
                %endif
                %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1))
                mov rstk, rsp
                and rsp, ~(required_stack_alignment-1)
                sub rsp, stack_size_padded
                movifnidn rstkm, rstk
            %endif
            WIN64_PUSH_XMM
        %endif
    %endif
%endmacro

%macro SETUP_STACK_POINTER 1
    %ifnum %1
        %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
            %if %1 > 0
                %assign regs_used (regs_used + 1)
            %endif
            %if VPX_ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
                ; Ensure that we don't clobber any registers containing arguments
                %assign regs_used 5 + UNIX64 * 3
            %endif
        %endif
    %endif
%endmacro

%macro DEFINE_ARGS_INTERNAL 3+
    %ifnum %2
        DEFINE_ARGS %3
    %elif %1 == 4
        DEFINE_ARGS %2
    %elif %1 > 4
        DEFINE_ARGS %2, %3
    %endif
%endmacro

%if WIN64 ; Windows x64 ;=================================================

DECLARE_REG 0, rcx
DECLARE_REG 1, rdx
DECLARE_REG 2, R8
DECLARE_REG 3, R9
DECLARE_REG 4, R10, 40
DECLARE_REG 5, R11, 48
DECLARE_REG 6, rax, 56
DECLARE_REG 7, rdi, 64
DECLARE_REG 8, rsi, 72
DECLARE_REG 9, rbx, 80
DECLARE_REG 10, rbp, 88
DECLARE_REG 11, R12, 96
DECLARE_REG 12, R13, 104
DECLARE_REG 13, R14, 112
DECLARE_REG 14, R15, 120

%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4, %3
    %if mmsize != 8 && stack_size == 0
        WIN64_SPILL_XMM %3
    %endif
    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%macro WIN64_PUSH_XMM 0
    ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
    %if xmm_regs_used > 6
        movaps [rstk + stack_offset + 8], xmm6
    %endif
    %if xmm_regs_used > 7
        movaps [rstk + stack_offset + 24], xmm7
    %endif
    %if xmm_regs_used > 8
        %assign %%i 8
        %rep xmm_regs_used-8
            movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro
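
; Illustrative sketch (not assembled): on WIN64, "cglobal foo, 4, 6, 10" causes
; PROLOGUE to spill xmm6/xmm7 into the caller-provided shadow space and
; xmm8/xmm9 into freshly allocated stack space, and RET restores them; with the
; same declaration on UNIX64 nothing is spilled, since all xmm registers there
; are caller-saved.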

%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 8
        ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
        %assign %%pad (xmm_regs_used-8)*16 + 32
        %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
        SUB rsp, stack_size_padded
    %endif
    WIN64_PUSH_XMM
%endmacro

%macro WIN64_RESTORE_XMM_INTERNAL 1
    %assign %%pad_size 0
    %if xmm_regs_used > 8
        %assign %%i xmm_regs_used
        %rep xmm_regs_used-8
            %assign %%i %%i-1
            movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
        %endrep
    %endif
    %if stack_size_padded > 0
        %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add %1, stack_size_padded
            %assign %%pad_size stack_size_padded
        %endif
    %endif
    %if xmm_regs_used > 7
        movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
    %endif
    %if xmm_regs_used > 6
        movaps xmm6, [%1 + stack_offset - %%pad_size + 8]
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 1
    WIN64_RESTORE_XMM_INTERNAL %1
    %assign stack_offset (stack_offset-stack_size_padded)
    %assign xmm_regs_used 0
%endmacro

%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
    %if mmsize == 32
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%elif VPX_ARCH_X86_64 ; *nix x64 ;=============================================

DECLARE_REG 0, rdi
DECLARE_REG 1, rsi
DECLARE_REG 2, rdx
DECLARE_REG 3, rcx
DECLARE_REG 4, R8
DECLARE_REG 5, R9
DECLARE_REG 6, rax, 8
DECLARE_REG 7, R10, 16
DECLARE_REG 8, R11, 24
DECLARE_REG 9, rbx, 32
DECLARE_REG 10, rbp, 40
DECLARE_REG 11, R12, 48
DECLARE_REG 12, R13, 56
DECLARE_REG 13, R14, 64
DECLARE_REG 14, R15, 72

%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4
    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0

%macro RET 0
    %if stack_size_padded > 0
        %if required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
        %endif
    %endif
    POP_IF_USED 14, 13, 12, 11, 10, 9
    %if mmsize == 32
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%else ; X86_32 ;==============================================================

DECLARE_REG 0, eax, 4
DECLARE_REG 1, ecx, 8
DECLARE_REG 2, edx, 12
DECLARE_REG 3, ebx, 16
DECLARE_REG 4, esi, 20
DECLARE_REG 5, edi, 24
DECLARE_REG 6, ebp, 28
%define rsp esp

%macro DECLARE_ARG 1-*
    %rep %0
        %define r%1m [rstk + stack_offset + 4*%1 + 4]
        %define r%1mp dword r%1m
        %rotate 1
    %endrep
%endmacro

DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14

%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    %if num_args > 7
        %assign num_args 7
    %endif
    %if regs_used > 7
        %assign regs_used 7
    %endif
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 7
    PUSH_IF_USED 3, 4, 5, 6
    ALLOC_STACK %4
    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0

%macro RET 0
    %if stack_size_padded > 0
        %if required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
        %endif
    %endif
    POP_IF_USED 6, 5, 4, 3
    %if mmsize == 32
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%endif ;======================================================================

%if WIN64 == 0
    %macro WIN64_SPILL_XMM 1
    %endmacro
    %macro WIN64_RESTORE_XMM 1
    %endmacro
    %macro WIN64_PUSH_XMM 0
    %endmacro
%endif

; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
; a branch or a branch target. So switch to a 2-byte form of ret in that case.
; We can automatically detect "follows a branch", but not a branch target.
; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
%macro REP_RET 0
    %if has_epilogue
        RET
    %else
        rep ret
    %endif
    annotate_function_size
%endmacro

%define last_branch_adr $$
%macro AUTO_REP_RET 0
    %if notcpuflag(ssse3)
        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
    %endif
    ret
    annotate_function_size
%endmacro

%macro BRANCH_INSTR 0-*
    %rep %0
        %macro %1 1-2 %1
            %2 %1
            %if notcpuflag(ssse3)
                %%branch_instr equ $
                %xdefine last_branch_adr %%branch_instr
            %endif
        %endmacro
        %rotate 1
    %endrep
%endmacro

BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp

%macro TAIL_CALL 2 ; callee, is_nonadjacent
    %if has_epilogue
        call %1
        RET
    %elif %2
        jmp %1
    %endif
    annotate_function_size
%endmacro

;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Begin a function.
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 1, %1 %+ SUFFIX, %2
%endmacro
%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 0, %1 %+ SUFFIX, %2
%endmacro
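
; A minimal illustrative function built with these macros (hypothetical, not
; part of this file); assumes n is a positive multiple of mmsize and that both
; buffers are suitably aligned:
;   INIT_XMM sse2
;   cglobal add_bytes, 3, 3, 1, dst, src, n
;   .loop:
;       mova  m0, [dstq]
;       paddb m0, [srcq]
;       mova  [dstq], m0
;       add   dstq, mmsize
;       add   srcq, mmsize
;       sub   nd, mmsize
;       jg .loop
;       RET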
%macro cglobal_internal 2-3+
    annotate_function_size
    %if %1
        %xdefine %%FUNCTION_PREFIX private_prefix
        ; libvpx explicitly sets visibility in shared object builds. Avoid
        ; setting visibility to hidden as it may break builds that split
        ; sources on e.g., directory boundaries.
        %ifdef CHROMIUM
            %xdefine %%VISIBILITY hidden
        %else
            %xdefine %%VISIBILITY
        %endif
    %else
        %xdefine %%FUNCTION_PREFIX public_prefix
        %xdefine %%VISIBILITY
    %endif
    %ifndef cglobaled_%2
        %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2)
        %xdefine %2.skip_prologue %2 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %2, 1
    %endif
    %xdefine current_function %2
    %xdefine current_function_section __SECT__
    %if FORMAT_ELF
        global %2:function %%VISIBILITY
    %elif FORMAT_MACHO
        %ifdef __NASM_VER__
            global %2
        %else
            global %2:private_extern
        %endif
    %else
        global %2
    %endif
    align function_align
    %2:
    RESET_MM_PERMUTATION        ; needed for x86-64, also makes disassembly somewhat nicer
    %xdefine rstk rsp           ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
    %assign stack_offset 0      ; stack pointer offset relative to the return address
    %assign stack_size 0        ; amount of stack space that can be freely used inside a function
    %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
    %ifnidn %3, ""
        PROLOGUE %3
    %endif
%endmacro

%macro cextern 1
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

; like cextern, but without the prefix
%macro cextern_naked 1
    %ifdef PREFIX
        %xdefine %1 mangle(%1)
    %endif
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

%macro const 1-2+
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    %if FORMAT_ELF
        global %1:data hidden
    %else
        global %1
    %endif
    %1: %2
%endmacro

; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default.
%if FORMAT_ELF
    [SECTION .note.GNU-stack noalloc noexec nowrite progbits]
%endif

; Tell debuggers how large the function was.
; This may be invoked multiple times per function; we rely on later instances overriding earlier ones.
; This is invoked by RET and similar macros, and also cglobal does it for the previous function,
; but if the last function in a source file doesn't use any of the standard macros for its epilogue,
; then its size might be unspecified.
%macro annotate_function_size 0
    %ifdef __YASM_VER__
        %ifdef current_function
            %if FORMAT_ELF
                current_function_section
                %%ecf equ $
                size current_function %%ecf - current_function
                __SECT__
            %endif
        %endif
    %endif
%endmacro

; cpuflags

%assign cpuflags_mmx      (1<<0)
%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2     (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
%assign cpuflags_sse3     (1<<7) | cpuflags_sse2
%assign cpuflags_ssse3    (1<<8) | cpuflags_sse3
%assign cpuflags_sse4     (1<<9) | cpuflags_ssse3
%assign cpuflags_sse42    (1<<10)| cpuflags_sse4
%assign cpuflags_avx      (1<<11)| cpuflags_sse42
%assign cpuflags_xop      (1<<12)| cpuflags_avx
%assign cpuflags_fma4     (1<<13)| cpuflags_avx
%assign cpuflags_fma3     (1<<14)| cpuflags_avx
%assign cpuflags_avx2     (1<<15)| cpuflags_fma3

%assign cpuflags_cache32  (1<<16)
%assign cpuflags_cache64  (1<<17)
%assign cpuflags_slowctz  (1<<18)
%assign cpuflags_lzcnt    (1<<19)
%assign cpuflags_aligned  (1<<20) ; not a cpu feature, but a function variant
%assign cpuflags_atom     (1<<21)
%assign cpuflags_bmi1     (1<<22)|cpuflags_lzcnt
%assign cpuflags_bmi2     (1<<23)|cpuflags_bmi1

; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
%define notcpuflag(x) (cpuflag(x) ^ 1)

; Takes an arbitrary number of cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
%macro INIT_CPUFLAGS 0-*
    %xdefine SUFFIX
    %undef cpuname
    %assign cpuflags 0

    %if %0 >= 1
        %rep %0
            %ifdef cpuname
                %xdefine cpuname cpuname %+ _%1
            %else
                %xdefine cpuname %1
            %endif
            %assign cpuflags cpuflags | cpuflags_%1
            %rotate 1
        %endrep
        %xdefine SUFFIX _ %+ cpuname

        %if cpuflag(avx)
            %assign avx_enabled 1
        %endif
        %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        %if cpuflag(aligned)
            %define movu mova
        %elif cpuflag(sse3) && notcpuflag(ssse3)
            %define movu lddqu
        %endif
    %endif

    %if VPX_ARCH_X86_64 || cpuflag(sse2)
        %ifdef __NASM_VER__
            ALIGNMODE k8
        %else
            CPU amdnop
        %endif
    %else
        %ifdef __NASM_VER__
            ALIGNMODE nop
        %else
            CPU basicnop
        %endif
    %endif
%endmacro

; Merge mmx and sse*
; m# is a simd register of the currently selected size
; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
; (All 3 remain in sync through SWAP.)
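
; Illustrative sketch (not assembled): after "INIT_XMM ssse3", mmsize is 16,
; SUFFIX is _ssse3, cpuflag(ssse3) evaluates to 1, and m0/xm0 both refer to
; xmm0; after "INIT_YMM avx2", mmsize is 32, m0 refers to ymm0 while xm0 still
; refers to xmm0.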

%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

%macro INIT_MMX 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX %1
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    %assign %%i 0
    %rep 8
        CAT_XDEFINE m, %%i, mm %+ %%i
        CAT_XDEFINE nnmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    %rep 8
        CAT_UNDEF m, %%i
        CAT_UNDEF nnmm, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_XMM 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM %1
    %define mmsize 16
    %define num_mmregs 8
    %if VPX_ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, xmm %+ %%i
        CAT_XDEFINE nnxmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_YMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM %1
    %define mmsize 32
    %define num_mmregs 8
    %if VPX_ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %undef movh
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, ymm %+ %%i
        CAT_XDEFINE nnymm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

INIT_XMM

%macro DECLARE_MMCAST 1
    %define mmmm%1 mm%1
    %define mmxmm%1 mm%1
    %define mmymm%1 mm%1
    %define xmmmm%1 mm%1
    %define xmmxmm%1 xmm%1
    %define xmmymm%1 xmm%1
    %define ymmmm%1 mm%1
    %define ymmxmm%1 xmm%1
    %define ymmymm%1 ymm%1
    %define xm%1 xmm %+ m%1
    %define ym%1 ymm %+ m%1
%endmacro

%assign i 0
%rep 16
    DECLARE_MMCAST i
    %assign i i+1
%endrep

; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.

%macro PERMUTE 2-* ; takes a list of pairs to swap
    %rep %0/2
        %xdefine %%tmp%2 m%2
        %rotate 2
    %endrep
    %rep %0/2
        %xdefine m%1 %%tmp%2
        CAT_XDEFINE nn, m%1, %1
        %rotate 2
    %endrep
%endmacro
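
; Illustrative sketch (not assembled): "SWAP 0, 1" exchanges the names m0 and
; m1 without emitting any instruction, so code that just produced its result
; in m1 can be followed by the SWAP and everything afterwards reads the result
; as m0.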

%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
    %ifnum %1 ; SWAP 0, 1, ...
        SWAP_INTERNAL_NUM %1, %2
    %else ; SWAP m0, m1, ...
        SWAP_INTERNAL_NAME %1, %2
    %endif
%endmacro

%macro SWAP_INTERNAL_NUM 2-*
    %rep %0-1
        %xdefine %%tmp m%1
        %xdefine m%1 m%2
        %xdefine m%2 %%tmp
        CAT_XDEFINE nn, m%1, %1
        CAT_XDEFINE nn, m%2, %2
        %rotate 1
    %endrep
%endmacro

%macro SWAP_INTERNAL_NAME 2-*
    %xdefine %%args nn %+ %1
    %rep %0-1
        %xdefine %%args %%args, nn %+ %2
        %rotate 1
    %endrep
    SWAP_INTERNAL_NUM %%args
%endmacro

; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
; calls to that function will automatically load the permutation, so values can
; be returned in mmregs.
%macro SAVE_MM_PERMUTATION 0-1
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE %%f, %%i, m %+ %%i
        %assign %%i %%i+1
    %endrep
%endmacro

%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %ifdef %1_m0
        %assign %%i 0
        %rep num_mmregs
            CAT_XDEFINE m, %%i, %1_m %+ %%i
            CAT_XDEFINE nn, m %+ %%i, %%i
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
    call_internal %1 %+ SUFFIX, %1
%endmacro
%macro call_internal 2
    %xdefine %%i %2
    %ifndef cglobaled_%2
        %ifdef cglobaled_%1
            %xdefine %%i %1
        %endif
    %endif
    call %%i
    LOAD_MM_PERMUTATION %%i
%endmacro

; Substitutions that reduce instruction size but are functionally equivalent
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro

;=============================================================================
; AVX abstraction layer
;=============================================================================

%assign i 0
%rep 16
    %if i < 8
        CAT_XDEFINE sizeofmm, i, 8
    %endif
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
    %assign i i+1
%endrep
%undef i

%macro CHECK_AVX_INSTR_EMU 3-*
    %xdefine %%opcode %1
    %xdefine %%dst %2
    %rep %0-2
        %ifidn %%dst, %3
            %error non-avx emulation of ``%%opcode'' is not supported
        %endif
        %rotate 1
    %endrep
%endmacro

;%1 == instruction
;%2 == minimal instruction set
;%3 == 1 if float, 0 if int
;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
;%6+: operands
%macro RUN_AVX_INSTR 6-9+
    %ifnum sizeof%7
        %assign __sizeofreg sizeof%7
    %elifnum sizeof%6
        %assign __sizeofreg sizeof%6
    %else
        %assign __sizeofreg mmsize
    %endif
    %assign __emulate_avx 0
    %if avx_enabled && __sizeofreg >= 16
        %xdefine __instr v%1
    %else
        %xdefine __instr %1
        %if %0 >= 8+%4
            %assign __emulate_avx 1
        %endif
    %endif
    %ifnidn %2, fnord
        %ifdef cpuname
            %if notcpuflag(%2)
                %error use of ``%1'' %2 instruction in cpuname function: current_function
            %elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8
                %error use of ``%1'' sse2 instruction in cpuname function: current_function
            %endif
        %endif
    %endif

    %if __emulate_avx
        %xdefine __src1 %7
        %xdefine __src2 %8
        %ifnidn %6, %7
            %if %0 >= 9
                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, %8, %9
            %else
                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, %8
            %endif
            %if %5 && %4 == 0
                %ifnid %8
                    ; 3-operand AVX instructions with a memory arg can only have it in src2,
                    ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
                    ; So, if the instruction is commutative with a memory arg, swap them.
                    %xdefine __src1 %8
                    %xdefine __src2 %7
                %endif
            %endif
            %if __sizeofreg == 8
                MOVQ %6, __src1
            %elif %3
                MOVAPS %6, __src1
            %else
                MOVDQA %6, __src1
            %endif
        %endif
        %if %0 >= 9
            %1 %6, __src2, %9
        %else
            %1 %6, __src2
        %endif
    %elif %0 >= 9
        __instr %6, %7, %8, %9
    %elif %0 == 8
        __instr %6, %7, %8
    %elif %0 == 7
        __instr %6, %7
    %else
        __instr %6
    %endif
%endmacro
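
; Illustrative sketch (not assembled): with "INIT_XMM sse2", writing
; "paddw m0, m1, m2" is emulated as "movdqa xmm0, xmm1" followed by
; "paddw xmm0, xmm2", whereas with "INIT_XMM avx" the same line assembles
; directly to "vpaddw xmm0, xmm1, xmm2".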
;%1 == instruction
;%2 == minimal instruction set
;%3 == 1 if float, 0 if int
;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
%macro AVX_INSTR 1-5 fnord, 0, 1, 0
    %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5
        %ifidn %2, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1
        %elifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro

; Instructions with both VEX and non-VEX encodings
; Non-destructive instructions are written without parameters
AVX_INSTR addpd, sse2, 1, 0, 1
AVX_INSTR addps, sse, 1, 0, 1
AVX_INSTR addsd, sse2, 1, 0, 1
AVX_INSTR addss, sse, 1, 0, 1
AVX_INSTR addsubpd, sse3, 1, 0, 0
AVX_INSTR addsubps, sse3, 1, 0, 0
AVX_INSTR aesdec, fnord, 0, 0, 0
AVX_INSTR aesdeclast, fnord, 0, 0, 0
AVX_INSTR aesenc, fnord, 0, 0, 0
AVX_INSTR aesenclast, fnord, 0, 0, 0
AVX_INSTR aesimc
AVX_INSTR aeskeygenassist
AVX_INSTR andnpd, sse2, 1, 0, 0
AVX_INSTR andnps, sse, 1, 0, 0
AVX_INSTR andpd, sse2, 1, 0, 1
AVX_INSTR andps, sse, 1, 0, 1
AVX_INSTR blendpd, sse4, 1, 0, 0
AVX_INSTR blendps, sse4, 1, 0, 0
AVX_INSTR blendvpd, sse4, 1, 0, 0
AVX_INSTR blendvps, sse4, 1, 0, 0
AVX_INSTR cmppd, sse2, 1, 1, 0
AVX_INSTR cmpps, sse, 1, 1, 0
AVX_INSTR cmpsd, sse2, 1, 1, 0
AVX_INSTR cmpss, sse, 1, 1, 0
AVX_INSTR comisd, sse2
AVX_INSTR comiss, sse
AVX_INSTR cvtdq2pd, sse2
AVX_INSTR cvtdq2ps, sse2
AVX_INSTR cvtpd2dq, sse2
AVX_INSTR cvtpd2ps, sse2
AVX_INSTR cvtps2dq, sse2
AVX_INSTR cvtps2pd, sse2
AVX_INSTR cvtsd2si, sse2
AVX_INSTR cvtsd2ss, sse2
AVX_INSTR cvtsi2sd, sse2
AVX_INSTR cvtsi2ss, sse
AVX_INSTR cvtss2sd, sse2
AVX_INSTR cvtss2si, sse
AVX_INSTR cvttpd2dq, sse2
AVX_INSTR cvttps2dq, sse2
AVX_INSTR cvttsd2si, sse2
AVX_INSTR cvttss2si, sse
AVX_INSTR divpd, sse2, 1, 0, 0
AVX_INSTR divps, sse, 1, 0, 0
AVX_INSTR divsd, sse2, 1, 0, 0
AVX_INSTR divss, sse, 1, 0, 0
AVX_INSTR dppd, sse4, 1, 1, 0
AVX_INSTR dpps, sse4, 1, 1, 0
AVX_INSTR extractps, sse4
AVX_INSTR haddpd, sse3, 1, 0, 0
AVX_INSTR haddps, sse3, 1, 0, 0
AVX_INSTR hsubpd, sse3, 1, 0, 0
AVX_INSTR hsubps, sse3, 1, 0, 0
AVX_INSTR insertps, sse4, 1, 1, 0
AVX_INSTR lddqu, sse3
AVX_INSTR ldmxcsr, sse
AVX_INSTR maskmovdqu, sse2
AVX_INSTR maxpd, sse2, 1, 0, 1
AVX_INSTR maxps, sse, 1, 0, 1
AVX_INSTR maxsd, sse2, 1, 0, 1
AVX_INSTR maxss, sse, 1, 0, 1
AVX_INSTR minpd, sse2, 1, 0, 1
AVX_INSTR minps, sse, 1, 0, 1
AVX_INSTR minsd, sse2, 1, 0, 1
AVX_INSTR minss, sse, 1, 0, 1
AVX_INSTR movapd, sse2
AVX_INSTR movaps, sse
AVX_INSTR movd, mmx
AVX_INSTR movddup, sse3
AVX_INSTR movdqa, sse2
AVX_INSTR movdqu, sse2
AVX_INSTR movhlps, sse, 1, 0, 0
AVX_INSTR movhpd, sse2, 1, 0, 0
AVX_INSTR movhps, sse, 1, 0, 0
AVX_INSTR movlhps, sse, 1, 0, 0
AVX_INSTR movlpd, sse2, 1, 0, 0
AVX_INSTR movlps, sse, 1, 0, 0
AVX_INSTR movmskpd, sse2
AVX_INSTR movmskps, sse
AVX_INSTR movntdq, sse2
AVX_INSTR movntdqa, sse4
AVX_INSTR movntpd, sse2
AVX_INSTR movntps, sse
AVX_INSTR movq, mmx
AVX_INSTR movsd, sse2, 1, 0, 0
AVX_INSTR movshdup, sse3
AVX_INSTR movsldup, sse3
AVX_INSTR movss, sse, 1, 0, 0
AVX_INSTR movupd, sse2
AVX_INSTR movups, sse
AVX_INSTR mpsadbw, sse4
AVX_INSTR mulpd, sse2, 1, 0, 1
AVX_INSTR mulps, sse, 1, 0, 1
AVX_INSTR mulsd, sse2, 1, 0, 1
AVX_INSTR mulss, sse, 1, 0, 1
AVX_INSTR orpd, sse2, 1, 0, 1
AVX_INSTR orps, sse, 1, 0, 1
AVX_INSTR pabsb, ssse3
AVX_INSTR pabsd, ssse3
AVX_INSTR pabsw, ssse3
AVX_INSTR packsswb, mmx, 0, 0, 0
AVX_INSTR packssdw, mmx, 0, 0, 0
AVX_INSTR packuswb, mmx, 0, 0, 0
AVX_INSTR packusdw, sse4, 0, 0, 0
AVX_INSTR paddb, mmx, 0, 0, 1
AVX_INSTR paddw, mmx, 0, 0, 1
AVX_INSTR paddd, mmx, 0, 0, 1
AVX_INSTR paddq, sse2, 0, 0, 1
AVX_INSTR paddsb, mmx, 0, 0, 1
AVX_INSTR paddsw, mmx, 0, 0, 1
AVX_INSTR paddusb, mmx, 0, 0, 1
AVX_INSTR paddusw, mmx, 0, 0, 1
AVX_INSTR palignr, ssse3
AVX_INSTR pand, mmx, 0, 0, 1
AVX_INSTR pandn, mmx, 0, 0, 0
AVX_INSTR pavgb, mmx2, 0, 0, 1
AVX_INSTR pavgw, mmx2, 0, 0, 1
AVX_INSTR pblendvb, sse4, 0, 0, 0
AVX_INSTR pblendw, sse4
AVX_INSTR pclmulqdq
AVX_INSTR pcmpestri, sse42
AVX_INSTR pcmpestrm, sse42
AVX_INSTR pcmpistri, sse42
AVX_INSTR pcmpistrm, sse42
AVX_INSTR pcmpeqb, mmx, 0, 0, 1
AVX_INSTR pcmpeqw, mmx, 0, 0, 1
AVX_INSTR pcmpeqd, mmx, 0, 0, 1
AVX_INSTR pcmpeqq, sse4, 0, 0, 1
AVX_INSTR pcmpgtb, mmx, 0, 0, 0
AVX_INSTR pcmpgtw, mmx, 0, 0, 0
AVX_INSTR pcmpgtd, mmx, 0, 0, 0
AVX_INSTR pcmpgtq, sse42, 0, 0, 0
AVX_INSTR pextrb, sse4
AVX_INSTR pextrd, sse4
AVX_INSTR pextrq, sse4
AVX_INSTR pextrw, mmx2
AVX_INSTR phaddw, ssse3, 0, 0, 0
AVX_INSTR phaddd, ssse3, 0, 0, 0
AVX_INSTR phaddsw, ssse3, 0, 0, 0
AVX_INSTR phminposuw, sse4
AVX_INSTR phsubw, ssse3, 0, 0, 0
AVX_INSTR phsubd, ssse3, 0, 0, 0
AVX_INSTR phsubsw, ssse3, 0, 0, 0
AVX_INSTR pinsrb, sse4
AVX_INSTR pinsrd, sse4
AVX_INSTR pinsrq, sse4
AVX_INSTR pinsrw, mmx2
AVX_INSTR pmaddwd, mmx, 0, 0, 1
AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
AVX_INSTR pmaxsb, sse4, 0, 0, 1
AVX_INSTR pmaxsw, mmx2, 0, 0, 1
AVX_INSTR pmaxsd, sse4, 0, 0, 1
AVX_INSTR pmaxub, mmx2, 0, 0, 1
AVX_INSTR pmaxuw, sse4, 0, 0, 1
AVX_INSTR pmaxud, sse4, 0, 0, 1
AVX_INSTR pminsb, sse4, 0, 0, 1
AVX_INSTR pminsw, mmx2, 0, 0, 1
AVX_INSTR pminsd, sse4, 0, 0, 1
AVX_INSTR pminub, mmx2, 0, 0, 1
AVX_INSTR pminuw, sse4, 0, 0, 1
AVX_INSTR pminud, sse4, 0, 0, 1
AVX_INSTR pmovmskb, mmx2
AVX_INSTR pmovsxbw, sse4
AVX_INSTR pmovsxbd, sse4
AVX_INSTR pmovsxbq, sse4
AVX_INSTR pmovsxwd, sse4
AVX_INSTR pmovsxwq, sse4
AVX_INSTR pmovsxdq, sse4
AVX_INSTR pmovzxbw, sse4
AVX_INSTR pmovzxbd, sse4
AVX_INSTR pmovzxbq, sse4
AVX_INSTR pmovzxwd, sse4
AVX_INSTR pmovzxwq, sse4
AVX_INSTR pmovzxdq, sse4
AVX_INSTR pmuldq, sse4, 0, 0, 1
AVX_INSTR pmulhrsw, ssse3, 0, 0, 1
AVX_INSTR pmulhuw, mmx2, 0, 0, 1
AVX_INSTR pmulhw, mmx, 0, 0, 1
AVX_INSTR pmullw, mmx, 0, 0, 1
AVX_INSTR pmulld, sse4, 0, 0, 1
AVX_INSTR pmuludq, sse2, 0, 0, 1
AVX_INSTR por, mmx, 0, 0, 1
AVX_INSTR psadbw, mmx2, 0, 0, 1
AVX_INSTR pshufb, ssse3, 0, 0, 0
AVX_INSTR pshufd, sse2
AVX_INSTR pshufhw, sse2
AVX_INSTR pshuflw, sse2
AVX_INSTR psignb, ssse3, 0, 0, 0
AVX_INSTR psignw, ssse3, 0, 0, 0
AVX_INSTR psignd, ssse3, 0, 0, 0
AVX_INSTR psllw, mmx, 0, 0, 0
AVX_INSTR pslld, mmx, 0, 0, 0
AVX_INSTR psllq, mmx, 0, 0, 0
AVX_INSTR pslldq, sse2, 0, 0, 0
AVX_INSTR psraw, mmx, 0, 0, 0
AVX_INSTR psrad, mmx, 0, 0, 0
AVX_INSTR psrlw, mmx, 0, 0, 0
AVX_INSTR psrld, mmx, 0, 0, 0
AVX_INSTR psrlq, mmx, 0, 0, 0
AVX_INSTR psrldq, sse2, 0, 0, 0
AVX_INSTR psubb, mmx, 0, 0, 0
AVX_INSTR psubw, mmx, 0, 0, 0
AVX_INSTR psubd, mmx, 0, 0, 0
AVX_INSTR psubq, sse2, 0, 0, 0
AVX_INSTR psubsb, mmx, 0, 0, 0
AVX_INSTR psubsw, mmx, 0, 0, 0
AVX_INSTR psubusb, mmx, 0, 0, 0
AVX_INSTR psubusw, mmx, 0, 0, 0
AVX_INSTR ptest, sse4
AVX_INSTR punpckhbw, mmx, 0, 0, 0
AVX_INSTR punpckhwd, mmx, 0, 0, 0
AVX_INSTR punpckhdq, mmx, 0, 0, 0
AVX_INSTR punpckhqdq, sse2, 0, 0, 0
AVX_INSTR punpcklbw, mmx, 0, 0, 0
AVX_INSTR punpcklwd, mmx, 0, 0, 0
AVX_INSTR punpckldq, mmx, 0, 0, 0
AVX_INSTR punpcklqdq, sse2, 0, 0, 0
AVX_INSTR pxor, mmx, 0, 0, 1
AVX_INSTR rcpps, sse, 1, 0, 0
AVX_INSTR rcpss, sse, 1, 0, 0
AVX_INSTR roundpd, sse4
AVX_INSTR roundps, sse4
AVX_INSTR roundsd, sse4
AVX_INSTR roundss, sse4
AVX_INSTR rsqrtps, sse, 1, 0, 0
AVX_INSTR rsqrtss, sse, 1, 0, 0
AVX_INSTR shufpd, sse2, 1, 1, 0
AVX_INSTR shufps, sse, 1, 1, 0
AVX_INSTR sqrtpd, sse2, 1, 0, 0
AVX_INSTR sqrtps, sse, 1, 0, 0
AVX_INSTR sqrtsd, sse2, 1, 0, 0
AVX_INSTR sqrtss, sse, 1, 0, 0
AVX_INSTR stmxcsr, sse
AVX_INSTR subpd, sse2, 1, 0, 0
AVX_INSTR subps, sse, 1, 0, 0
AVX_INSTR subsd, sse2, 1, 0, 0
AVX_INSTR subss, sse, 1, 0, 0
AVX_INSTR ucomisd, sse2
AVX_INSTR ucomiss, sse
AVX_INSTR unpckhpd, sse2, 1, 0, 0
AVX_INSTR unpckhps, sse, 1, 0, 0
AVX_INSTR unpcklpd, sse2, 1, 0, 0
AVX_INSTR unpcklps, sse, 1, 0, 0
AVX_INSTR xorpd, sse2, 1, 0, 1
AVX_INSTR xorps, sse, 1, 0, 1

; 3DNow instructions, for sharing code between AVX, SSE and 3DN
AVX_INSTR pfadd, 3dnow, 1, 0, 1
AVX_INSTR pfsub, 3dnow, 1, 0, 0
AVX_INSTR pfmul, 3dnow, 1, 0, 1

; base-4 constants for shuffles
%assign i 0
%rep 256
    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
    %if j < 10
        CAT_XDEFINE q000, j, i
    %elif j < 100
        CAT_XDEFINE q00, j, i
    %elif j < 1000
        CAT_XDEFINE q0, j, i
    %else
        CAT_XDEFINE q, j, i
    %endif
    %assign i i+1
%endrep
%undef i
%undef j
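
; Illustrative sketch (not assembled): the constants generated above encode a
; shuffle immediate one element per digit, so "pshufd m0, m1, q3120" uses the
; immediate 0xD8 and, reading the digits left to right, places source elements
; 3, 1, 2, 0 into destination elements 3, 2, 1, 0.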

%macro FMA_INSTR 3
    %macro %1 4-7 %1, %2, %3
        %if cpuflag(xop)
            v%5 %1, %2, %3, %4
        %elifnidn %1, %4
            %6 %1, %2, %3
            %7 %1, %4
        %else
            %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
        %endif
    %endmacro
%endmacro

FMA_INSTR pmacsww, pmullw, paddw
FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation
FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation
FMA_INSTR pmadcswd, pmaddwd, paddd

; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax.
; FMA3 is only possible if dst is the same as one of the src registers.
; Either src2 or src3 can be a memory operand.
%macro FMA4_INSTR 2-*
    %push fma4_instr
    %xdefine %$prefix %1
    %rep %0 - 1
        %macro %$prefix%2 4-6 %$prefix, %2
            %if notcpuflag(fma3) && notcpuflag(fma4)
                %error use of ``%5%6'' fma instruction in cpuname function: current_function
            %elif cpuflag(fma4)
                v%5%6 %1, %2, %3, %4
            %elifidn %1, %2
                ; If %3 or %4 is a memory operand it needs to be encoded as the last operand.
                %ifid %3
                    v%{5}213%6 %2, %3, %4
                %else
                    v%{5}132%6 %2, %4, %3
                %endif
            %elifidn %1, %3
                v%{5}213%6 %3, %2, %4
            %elifidn %1, %4
                v%{5}231%6 %4, %2, %3
            %else
                %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported
            %endif
        %endmacro
        %rotate 1
    %endrep
    %pop
%endmacro

FMA4_INSTR fmadd, pd, ps, sd, ss
FMA4_INSTR fmaddsub, pd, ps
FMA4_INSTR fmsub, pd, ps, sd, ss
FMA4_INSTR fmsubadd, pd, ps
FMA4_INSTR fnmadd, pd, ps, sd, ss
FMA4_INSTR fnmsub, pd, ps, sd, ss

; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0)
%ifdef __YASM_VER__
    %if __YASM_VERSION_ID__ < 0x01030000 && VPX_ARCH_X86_64 == 0
        %macro vpbroadcastq 2
            %if sizeof%1 == 16
                movddup %1, %2
            %else
                vbroadcastsd %1, %2
            %endif
        %endmacro
    %endif
%endif
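
; Illustrative sketch (not assembled): with the FMA4_INSTR wrappers above,
; "fmaddps m0, m1, m2, m0" computes m0 = m1*m2 + m0 and is emitted as
; "vfmaddps m0, m1, m2, m0" on FMA4 cpus or as "vfmadd231ps m0, m1, m2" when
; only FMA3 is available.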