;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2016 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Fiona Glaser <fiona@x264.com>
;*          Henrik Gramner <henrik@gramner.com>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************

; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.

; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible.  Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well.  Send patches or ideas
; to x264-devel@videolan.org .

%include "vpx_config.asm"

%ifndef private_prefix
    %define private_prefix vpx
%endif

%ifndef public_prefix
    %define public_prefix private_prefix
%endif

%ifndef STACK_ALIGNMENT
    %if VPX_ARCH_X86_64
        %define STACK_ALIGNMENT 16
    %else
        %define STACK_ALIGNMENT 4
    %endif
%endif

%define WIN64  0
%define UNIX64 0
%if VPX_ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,x64
        %define WIN64  1
    %else
        %define UNIX64 1
    %endif
%endif

%define FORMAT_ELF 0
%ifidn __OUTPUT_FORMAT__,elf
    %define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,elf32
    %define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,elf64
    %define FORMAT_ELF 1
%endif

%define FORMAT_MACHO 0
%ifidn __OUTPUT_FORMAT__,macho32
     %define FORMAT_MACHO 1
%elifidn __OUTPUT_FORMAT__,macho64
     %define FORMAT_MACHO 1
%endif

; Set PREFIX for libvpx builds.
%if FORMAT_ELF
    %undef PREFIX
%elif WIN64
    %undef PREFIX
%else
    %define PREFIX
%endif

%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif

; In some instances macho32 tables get misaligned when using .rodata.
; When looking at the disassembly it appears that the offset is either
; correct or consistently off by 90. Placing them in the .text section
; works around the issue. It appears to be specific to the way libvpx
; handles the tables.
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,macho32
        SECTION .text align=%1
        fakegot:
    %elifidn __OUTPUT_FORMAT__,aout
        SECTION .text
    %else
        SECTION .rodata align=%1
    %endif
%endmacro
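
; Example (illustrative only; the label and data are hypothetical): declare an
; aligned read-only table with SECTION_RODATA. With no argument the default
; 16-byte alignment is used.
;     SECTION_RODATA 32
;     pw_64: times 16 dw 64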

; PIC macros are copied from vpx_ports/x86_abi_support.asm. The "define PIC"
; from the original code is added in for 64-bit.
%ifidn __OUTPUT_FORMAT__,elf32
%define ABI_IS_32BIT 1
%elifidn __OUTPUT_FORMAT__,macho32
%define ABI_IS_32BIT 1
%elifidn __OUTPUT_FORMAT__,win32
%define ABI_IS_32BIT 1
%elifidn __OUTPUT_FORMAT__,aout
%define ABI_IS_32BIT 1
%else
%define ABI_IS_32BIT 0
%endif

%if ABI_IS_32BIT
    %if CONFIG_PIC=1
        %ifidn __OUTPUT_FORMAT__,elf32
            %define GET_GOT_DEFINED 1
            %define WRT_PLT wrt ..plt
            %macro GET_GOT 1
                extern _GLOBAL_OFFSET_TABLE_
                push %1
                call %%get_got
                %%sub_offset:
                jmp %%exitGG
                %%get_got:
                mov %1, [esp]
                add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc
                ret
                %%exitGG:
                %undef GLOBAL
                %define GLOBAL(x) x + %1 wrt ..gotoff
                %undef RESTORE_GOT
                %define RESTORE_GOT pop %1
            %endmacro
        %elifidn __OUTPUT_FORMAT__,macho32
            %define GET_GOT_DEFINED 1
            %macro GET_GOT 1
                push %1
                call %%get_got
                %%get_got:
                pop  %1
                %undef GLOBAL
                %define GLOBAL(x) x + %1 - %%get_got
                %undef RESTORE_GOT
                %define RESTORE_GOT pop %1
            %endmacro
        %else
            %define GET_GOT_DEFINED 0
        %endif
    %endif

    %if VPX_ARCH_X86_64 == 0
        %undef PIC
    %endif

%else
    %macro GET_GOT 1
    %endmacro
    %define GLOBAL(x) rel x
    %define WRT_PLT wrt ..plt

    %if WIN64
        %define PIC
    %elifidn __OUTPUT_FORMAT__,macho64
        %define PIC
    %elif CONFIG_PIC
        %define PIC
    %endif
%endif

%ifnmacro GET_GOT
    %macro GET_GOT 1
    %endmacro
    %define GLOBAL(x) x
%endif
%ifndef RESTORE_GOT
    %define RESTORE_GOT
%endif
%ifndef WRT_PLT
    %define WRT_PLT
%endif

%ifdef PIC
    default rel
%endif

%ifndef GET_GOT_DEFINED
    %define GET_GOT_DEFINED 0
%endif
; Done with PIC macros
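
; Illustrative sketch of the 32-bit PIC pattern these macros support; the
; register choice and table name are hypothetical. A position-independent
; table load looks roughly like:
;     GET_GOT     r5          ; pushes r5, then loads the GOT/PC base into it
;     movq        mm0, [GLOBAL(some_table)]
;     RESTORE_GOT             ; pops r5
; On 64-bit (and on non-PIC 32-bit) builds GET_GOT/RESTORE_GOT expand to
; nothing and GLOBAL(x) becomes either "rel x" or plain "x".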

%ifdef __NASM_VER__
    %use smartalign
%endif

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = (optional) stack size to be allocated. The stack will be aligned before
;      allocating the specified stack size. If the required stack alignment is
;      larger than the known stack alignment the stack will be manually aligned
;      and an extra register will be allocated to hold the original stack
;      pointer (to not invalidate r0m etc.). To prevent the use of an extra
;      register as stack pointer, request a negative stack size.
; %4+/%5+ = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,7,0x40, dst, src, tmp
; declares a function (foo) that automatically loads two arguments (dst and
; src) into registers, uses one additional register (tmp) plus 7 vector
; registers (m0-m6) and allocates 0x40 bytes of stack space.

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need a more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE, and returns.

; REP_RET:
; Use this instead of RET if it's a branch target.

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNh is the high 8 bits of the word size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size
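
; e.g. (illustrative, hypothetical function) with "cglobal foo, 2,3" the first
; two arguments live in r0/r1 and r2 is a free scratch register:
;     mov  r2d, r0m    ; reload arg 0 from its original location (register or stack)
;     add  r0d, r1d    ; operate on the dword views of args 0 and 1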

%macro DECLARE_REG 2-3
    %define r%1q %2
    %define r%1d %2d
    %define r%1w %2w
    %define r%1b %2b
    %define r%1h %2h
    %define %2q %2
    %if %0 == 2
        %define r%1m  %2d
        %define r%1mp %2
    %elif VPX_ARCH_X86_64 ; memory
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp qword r %+ %1 %+ m
    %else
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp dword r %+ %1 %+ m
    %endif
    %define r%1  %2
%endmacro

%macro DECLARE_REG_SIZE 3
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1h %3
    %define e%1h %3
    %define r%1b %2
    %define e%1b %2
    %if VPX_ARCH_X86_64 == 0
        %define r%1 e%1
    %endif
%endmacro

DECLARE_REG_SIZE ax, al, ah
DECLARE_REG_SIZE bx, bl, bh
DECLARE_REG_SIZE cx, cl, ch
DECLARE_REG_SIZE dx, dl, dh
DECLARE_REG_SIZE si, sil, null
DECLARE_REG_SIZE di, dil, null
DECLARE_REG_SIZE bp, bpl, null

; t# defines for when per-arch register allocation is more complex than just function arguments

%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro
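
; Illustrative (hypothetical) use: map temporaries onto whichever registers are
; free for the current ABI, then refer to them as t0, t1, ... in shared code:
;     %if VPX_ARCH_X86_64
;         DECLARE_REG_TMP 4, 5    ; t0 = r4, t1 = r5
;     %else
;         DECLARE_REG_TMP 1, 2    ; t0 = r1, t1 = r2
;     %endif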

%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1h t%1 %+ h
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14

%if VPX_ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

%macro PUSH 1
    push %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset+gprsize
    %endif
%endmacro

%macro POP 1
    pop %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset-gprsize
    %endif
%endmacro

%macro PUSH_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            PUSH r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro POP_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            pop r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro LOAD_IF_USED 1-*
    %rep %0
        %if %1 < num_args
            mov r%1, r %+ %1 %+ mp
        %endif
        %rotate 1
    %endrep
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

%macro ASSERT 1
    %if (%1) == 0
        %error assertion ``%1'' failed
    %endif
%endmacro

%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, h
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name %+ %%i, mp
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %xdefine %%stack_offset stack_offset
    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1h r %+ %%i %+ h
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        %xdefine %1mp r %+ %%i %+ mp
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %xdefine stack_offset %%stack_offset
    %assign n_arg_names %0
%endmacro
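
; For example (names are hypothetical), inside a function declared with
; "cglobal bar, 3,4" the arguments can be (re)named after the fact:
;     DEFINE_ARGS dst, src, len, tmp
; after which dstq/srcq/lenq alias r0/r1/r2, dstd is r0d, srcm is r1m, and the
; extra name (tmp) maps onto the next register, r3.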

%define required_stack_alignment ((mmsize + 15) & ~15)

%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
    %ifnum %1
        %if %1 != 0
            %assign %%pad 0
            %assign stack_size %1
            %if stack_size < 0
                %assign stack_size -stack_size
            %endif
            %if WIN64
                %assign %%pad %%pad + 32 ; shadow space
                %if mmsize != 8
                    %assign xmm_regs_used %2
                    %if xmm_regs_used > 8
                        %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers
                    %endif
                %endif
            %endif
            %if required_stack_alignment <= STACK_ALIGNMENT
                ; maintain the current stack alignment
                %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
                SUB rsp, stack_size_padded
            %else
                %assign %%reg_num (regs_used - 1)
                %xdefine rstk r %+ %%reg_num
                ; align stack, and save original stack location directly above
                ; it, i.e. in [rsp+stack_size_padded], so we can restore the
                ; stack in a single instruction (i.e. mov rsp, rstk or mov
                ; rsp, [rsp+stack_size_padded])
                %if %1 < 0 ; need to store rsp on stack
                    %xdefine rstkm [rsp + stack_size + %%pad]
                    %assign %%pad %%pad + gprsize
                %else ; can keep rsp in rstk during whole function
                    %xdefine rstkm rstk
                %endif
                %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1))
                mov rstk, rsp
                and rsp, ~(required_stack_alignment-1)
                sub rsp, stack_size_padded
                movifnidn rstkm, rstk
            %endif
            WIN64_PUSH_XMM
        %endif
    %endif
%endmacro
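
; Stack space is normally requested through cglobal's stack_size argument,
; which routes through ALLOC_STACK above and SETUP_STACK_POINTER below, e.g.
; (hypothetical function):
;     cglobal foo, 2,3,8,0x20, dst, src, tmp   ; 0x20 bytes of aligned scratch
;     movu [rsp], m0                           ; rsp now points at the scratch area
; Requesting a negative size (e.g. -0x20) allocates the same amount but keeps
; the saved stack pointer in memory instead of tying up an extra register when
; manual alignment is required.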

%macro SETUP_STACK_POINTER 1
    %ifnum %1
        %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
            %if %1 > 0
                %assign regs_used (regs_used + 1)
            %endif
            %if VPX_ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
                ; Ensure that we don't clobber any registers containing arguments
                %assign regs_used 5 + UNIX64 * 3
            %endif
        %endif
    %endif
%endmacro

%macro DEFINE_ARGS_INTERNAL 3+
    %ifnum %2
        DEFINE_ARGS %3
    %elif %1 == 4
        DEFINE_ARGS %2
    %elif %1 > 4
        DEFINE_ARGS %2, %3
    %endif
%endmacro

%if WIN64 ; Windows x64 ;=================================================

DECLARE_REG 0,  rcx
DECLARE_REG 1,  rdx
DECLARE_REG 2,  R8
DECLARE_REG 3,  R9
DECLARE_REG 4,  R10, 40
DECLARE_REG 5,  R11, 48
DECLARE_REG 6,  rax, 56
DECLARE_REG 7,  rdi, 64
DECLARE_REG 8,  rsi, 72
DECLARE_REG 9,  rbx, 80
DECLARE_REG 10, rbp, 88
DECLARE_REG 11, R12, 96
DECLARE_REG 12, R13, 104
DECLARE_REG 13, R14, 112
DECLARE_REG 14, R15, 120

%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4, %3
    %if mmsize != 8 && stack_size == 0
        WIN64_SPILL_XMM %3
    %endif
    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%macro WIN64_PUSH_XMM 0
    ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
    %if xmm_regs_used > 6
        movaps [rstk + stack_offset +  8], xmm6
    %endif
    %if xmm_regs_used > 7
        movaps [rstk + stack_offset + 24], xmm7
    %endif
    %if xmm_regs_used > 8
        %assign %%i 8
        %rep xmm_regs_used-8
            movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 8
        ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
        %assign %%pad (xmm_regs_used-8)*16 + 32
        %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
        SUB rsp, stack_size_padded
    %endif
    WIN64_PUSH_XMM
%endmacro

%macro WIN64_RESTORE_XMM_INTERNAL 1
    %assign %%pad_size 0
    %if xmm_regs_used > 8
        %assign %%i xmm_regs_used
        %rep xmm_regs_used-8
            %assign %%i %%i-1
            movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
        %endrep
    %endif
    %if stack_size_padded > 0
        %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add %1, stack_size_padded
            %assign %%pad_size stack_size_padded
        %endif
    %endif
    %if xmm_regs_used > 7
        movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
    %endif
    %if xmm_regs_used > 6
        movaps xmm6, [%1 + stack_offset - %%pad_size +  8]
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 1
    WIN64_RESTORE_XMM_INTERNAL %1
    %assign stack_offset (stack_offset-stack_size_padded)
    %assign xmm_regs_used 0
%endmacro

%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
    %if mmsize == 32
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%elif VPX_ARCH_X86_64 ; *nix x64 ;=============================================

DECLARE_REG 0,  rdi
DECLARE_REG 1,  rsi
DECLARE_REG 2,  rdx
DECLARE_REG 3,  rcx
DECLARE_REG 4,  R8
DECLARE_REG 5,  R9
DECLARE_REG 6,  rax, 8
DECLARE_REG 7,  R10, 16
DECLARE_REG 8,  R11, 24
DECLARE_REG 9,  rbx, 32
DECLARE_REG 10, rbp, 40
DECLARE_REG 11, R12, 48
DECLARE_REG 12, R13, 56
DECLARE_REG 13, R14, 64
DECLARE_REG 14, R15, 72

%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4
    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0

%macro RET 0
    %if stack_size_padded > 0
        %if required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
        %endif
    %endif
    POP_IF_USED 14, 13, 12, 11, 10, 9
    %if mmsize == 32
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%else ; X86_32 ;==============================================================

DECLARE_REG 0, eax, 4
DECLARE_REG 1, ecx, 8
DECLARE_REG 2, edx, 12
DECLARE_REG 3, ebx, 16
DECLARE_REG 4, esi, 20
DECLARE_REG 5, edi, 24
DECLARE_REG 6, ebp, 28
%define rsp esp

%macro DECLARE_ARG 1-*
    %rep %0
        %define r%1m [rstk + stack_offset + 4*%1 + 4]
        %define r%1mp dword r%1m
        %rotate 1
    %endrep
%endmacro

DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14

%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    %if num_args > 7
        %assign num_args 7
    %endif
    %if regs_used > 7
        %assign regs_used 7
    %endif
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 7
    PUSH_IF_USED 3, 4, 5, 6
    ALLOC_STACK %4
    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0

%macro RET 0
    %if stack_size_padded > 0
        %if required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
        %endif
    %endif
    POP_IF_USED 6, 5, 4, 3
    %if mmsize == 32
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%endif ;======================================================================

%if WIN64 == 0
    %macro WIN64_SPILL_XMM 1
    %endmacro
    %macro WIN64_RESTORE_XMM 1
    %endmacro
    %macro WIN64_PUSH_XMM 0
    %endmacro
%endif

; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
; a branch or a branch target. So switch to a 2-byte form of ret in that case.
; We can automatically detect "follows a branch", but not a branch target.
; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
%macro REP_RET 0
    %if has_epilogue
        RET
    %else
        rep ret
    %endif
    annotate_function_size
%endmacro

%define last_branch_adr $$
%macro AUTO_REP_RET 0
    %if notcpuflag(ssse3)
        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
    %endif
    ret
    annotate_function_size
%endmacro
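
; Illustrative use (hypothetical label): a return that is itself the target of
; a branch should be written as REP_RET, since AUTO_REP_RET only detects the
; "immediately follows a branch" case, not the "is a branch target" case:
;         jz   .skip
;         ...
;         RET
;     .skip:
;         REP_RET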

%macro BRANCH_INSTR 0-*
    %rep %0
        %macro %1 1-2 %1
            %2 %1
            %if notcpuflag(ssse3)
                %%branch_instr equ $
                %xdefine last_branch_adr %%branch_instr
            %endif
        %endmacro
        %rotate 1
    %endrep
%endmacro

BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp

%macro TAIL_CALL 2 ; callee, is_nonadjacent
    %if has_epilogue
        call %1
        RET
    %elif %2
        jmp %1
    %endif
    annotate_function_size
%endmacro
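
; e.g. (hypothetical callee): dispatch to a shared tail. The second argument
; states whether the callee does not immediately follow, i.e. whether an
; explicit jmp is required when there is no epilogue to unwind:
;     TAIL_CALL some_other_function, 1
; With a 0 second argument and no epilogue, execution simply falls through
; into the adjacent callee.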

;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Begin a function.
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 1, %1 %+ SUFFIX, %2
%endmacro
%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 0, %1 %+ SUFFIX, %2
%endmacro
%macro cglobal_internal 2-3+
    annotate_function_size
    %if %1
        %xdefine %%FUNCTION_PREFIX private_prefix
        ; libvpx explicitly sets visibility in shared object builds. Avoid
        ; setting visibility to hidden as it may break builds that split
        ; sources on e.g., directory boundaries.
        %ifdef CHROMIUM
            %xdefine %%VISIBILITY hidden
        %else
            %xdefine %%VISIBILITY
        %endif
    %else
        %xdefine %%FUNCTION_PREFIX public_prefix
        %xdefine %%VISIBILITY
    %endif
    %ifndef cglobaled_%2
        %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2)
        %xdefine %2.skip_prologue %2 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %2, 1
    %endif
    %xdefine current_function %2
    %xdefine current_function_section __SECT__
    %if FORMAT_ELF
        global %2:function %%VISIBILITY
    %elif FORMAT_MACHO
        %ifdef __NASM_VER__
            global %2
        %else
            global %2:private_extern
        %endif
    %else
        global %2
    %endif
    align function_align
    %2:
    RESET_MM_PERMUTATION        ; needed for x86-64, also makes disassembly somewhat nicer
    %xdefine rstk rsp           ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
    %assign stack_offset 0      ; stack pointer offset relative to the return address
    %assign stack_size 0        ; amount of stack space that can be freely used inside a function
    %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
    %ifnidn %3, ""
        PROLOGUE %3
    %endif
%endmacro

%macro cextern 1
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

; like cextern, but without the prefix
%macro cextern_naked 1
    %ifdef PREFIX
        %xdefine %1 mangle(%1)
    %endif
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

%macro const 1-2+
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    %if FORMAT_ELF
        global %1:data hidden
    %else
        global %1
    %endif
    %1: %2
%endmacro
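
; e.g. (hypothetical table): after a SECTION_RODATA,
;     const pw_16, times 8 dw 16
; emits a prefixed, mangled, global read-only symbol that is addressable from
; both C and assembly.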

; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default.
%if FORMAT_ELF
    [SECTION .note.GNU-stack noalloc noexec nowrite progbits]
%endif

; Tell debuggers how large the function was.
; This may be invoked multiple times per function; we rely on later instances overriding earlier ones.
; This is invoked by RET and similar macros, and also cglobal does it for the previous function,
; but if the last function in a source file doesn't use any of the standard macros for its epilogue,
; then its size might be unspecified.
%macro annotate_function_size 0
    %ifdef __YASM_VER__
        %ifdef current_function
            %if FORMAT_ELF
                current_function_section
                %%ecf equ $
                size current_function %%ecf - current_function
                __SECT__
            %endif
        %endif
    %endif
%endmacro

; cpuflags

%assign cpuflags_mmx      (1<<0)
%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2     (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
%assign cpuflags_sse3     (1<<7) | cpuflags_sse2
%assign cpuflags_ssse3    (1<<8) | cpuflags_sse3
%assign cpuflags_sse4     (1<<9) | cpuflags_ssse3
%assign cpuflags_sse42    (1<<10)| cpuflags_sse4
%assign cpuflags_avx      (1<<11)| cpuflags_sse42
%assign cpuflags_xop      (1<<12)| cpuflags_avx
%assign cpuflags_fma4     (1<<13)| cpuflags_avx
%assign cpuflags_fma3     (1<<14)| cpuflags_avx
%assign cpuflags_avx2     (1<<15)| cpuflags_fma3

%assign cpuflags_cache32  (1<<16)
%assign cpuflags_cache64  (1<<17)
%assign cpuflags_slowctz  (1<<18)
%assign cpuflags_lzcnt    (1<<19)
%assign cpuflags_aligned  (1<<20) ; not a cpu feature, but a function variant
%assign cpuflags_atom     (1<<21)
%assign cpuflags_bmi1     (1<<22)|cpuflags_lzcnt
%assign cpuflags_bmi2     (1<<23)|cpuflags_bmi1

; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
%define    cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
%define notcpuflag(x) (cpuflag(x) ^ 1)
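
; e.g. (illustrative): branch at assembly time on the target feature set, here
; emulating pabsw on pre-SSSE3 cpus:
;     %if cpuflag(ssse3)
;         pabsw  m0, m0
;     %else
;         pxor   m1, m1
;         psubw  m1, m0
;         pmaxsw m0, m1
;     %endif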

; Takes an arbitrary number of cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX & co.
%macro INIT_CPUFLAGS 0-*
    %xdefine SUFFIX
    %undef cpuname
    %assign cpuflags 0

    %if %0 >= 1
        %rep %0
            %ifdef cpuname
                %xdefine cpuname cpuname %+ _%1
            %else
                %xdefine cpuname %1
            %endif
            %assign cpuflags cpuflags | cpuflags_%1
            %rotate 1
        %endrep
        %xdefine SUFFIX _ %+ cpuname

        %if cpuflag(avx)
            %assign avx_enabled 1
        %endif
        %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        %if cpuflag(aligned)
            %define movu mova
        %elif cpuflag(sse3) && notcpuflag(ssse3)
            %define movu lddqu
        %endif
    %endif

    %if VPX_ARCH_X86_64 || cpuflag(sse2)
        %ifdef __NASM_VER__
            ALIGNMODE k8
        %else
            CPU amdnop
        %endif
    %else
        %ifdef __NASM_VER__
            ALIGNMODE nop
        %else
            CPU basicnop
        %endif
    %endif
%endmacro

; Merge mmx and sse*
; m# is a simd register of the currently selected size
; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
; (All 3 remain in sync through SWAP.)

%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

%macro INIT_MMX 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX %1
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    %assign %%i 0
    %rep 8
        CAT_XDEFINE m, %%i, mm %+ %%i
        CAT_XDEFINE nnmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    %rep 8
        CAT_UNDEF m, %%i
        CAT_UNDEF nnmm, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_XMM 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM %1
    %define mmsize 16
    %define num_mmregs 8
    %if VPX_ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, xmm %+ %%i
        CAT_XDEFINE nnxmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_YMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM %1
    %define mmsize 32
    %define num_mmregs 8
    %if VPX_ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %undef movh
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, ymm %+ %%i
        CAT_XDEFINE nnymm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

INIT_XMM

%macro DECLARE_MMCAST 1
    %define  mmmm%1   mm%1
    %define  mmxmm%1  mm%1
    %define  mmymm%1  mm%1
    %define xmmmm%1   mm%1
    %define xmmxmm%1 xmm%1
    %define xmmymm%1 xmm%1
    %define ymmmm%1   mm%1
    %define ymmxmm%1 xmm%1
    %define ymmymm%1 ymm%1
    %define xm%1 xmm %+ m%1
    %define ym%1 ymm %+ m%1
%endmacro

%assign i 0
%rep 16
    DECLARE_MMCAST i
    %assign i i+1
%endrep

; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.
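
; e.g. (illustrative): after
;     SWAP 0, 2
; every following use of m0 refers to the register previously named m2 and
; vice versa; no instructions are emitted.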

%macro PERMUTE 2-* ; takes a list of pairs to swap
    %rep %0/2
        %xdefine %%tmp%2 m%2
        %rotate 2
    %endrep
    %rep %0/2
        %xdefine m%1 %%tmp%2
        CAT_XDEFINE nn, m%1, %1
        %rotate 2
    %endrep
%endmacro

%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
    %ifnum %1 ; SWAP 0, 1, ...
        SWAP_INTERNAL_NUM %1, %2
    %else ; SWAP m0, m1, ...
        SWAP_INTERNAL_NAME %1, %2
    %endif
%endmacro

%macro SWAP_INTERNAL_NUM 2-*
    %rep %0-1
        %xdefine %%tmp m%1
        %xdefine m%1 m%2
        %xdefine m%2 %%tmp
        CAT_XDEFINE nn, m%1, %1
        CAT_XDEFINE nn, m%2, %2
        %rotate 1
    %endrep
%endmacro

%macro SWAP_INTERNAL_NAME 2-*
    %xdefine %%args nn %+ %1
    %rep %0-1
        %xdefine %%args %%args, nn %+ %2
        %rotate 1
    %endrep
    SWAP_INTERNAL_NUM %%args
%endmacro

; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
; calls to that function will automatically load the permutation, so values can
; be returned in mmregs.
%macro SAVE_MM_PERMUTATION 0-1
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE %%f, %%i, m %+ %%i
        %assign %%i %%i+1
    %endrep
%endmacro

%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %ifdef %1_m0
        %assign %%i 0
        %rep num_mmregs
            CAT_XDEFINE m, %%i, %1_m %+ %%i
            CAT_XDEFINE nn, m %+ %%i, %%i
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
    call_internal %1 %+ SUFFIX, %1
%endmacro
%macro call_internal 2
    %xdefine %%i %2
    %ifndef cglobaled_%2
        %ifdef cglobaled_%1
            %xdefine %%i %1
        %endif
    %endif
    call %%i
    LOAD_MM_PERMUTATION %%i
%endmacro
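
; Illustrative pairing with SAVE/LOAD_MM_PERMUTATION (helper name hypothetical):
; if a helper ends with SAVE_MM_PERMUTATION, then
;     call some_helper
; (via the macro above) reloads that permutation in the caller, so values the
; helper left behind in renamed mmregs can be used directly.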

; Substitutions that reduce instruction size but are functionally equivalent
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro

;=============================================================================
; AVX abstraction layer
;=============================================================================

%assign i 0
%rep 16
    %if i < 8
        CAT_XDEFINE sizeofmm, i, 8
    %endif
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
    %assign i i+1
%endrep
%undef i

%macro CHECK_AVX_INSTR_EMU 3-*
    %xdefine %%opcode %1
    %xdefine %%dst %2
    %rep %0-2
        %ifidn %%dst, %3
            %error non-avx emulation of ``%%opcode'' is not supported
        %endif
        %rotate 1
    %endrep
%endmacro

;%1 == instruction
;%2 == minimal instruction set
;%3 == 1 if float, 0 if int
;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
;%6+: operands
%macro RUN_AVX_INSTR 6-9+
    %ifnum sizeof%7
        %assign __sizeofreg sizeof%7
    %elifnum sizeof%6
        %assign __sizeofreg sizeof%6
    %else
        %assign __sizeofreg mmsize
    %endif
    %assign __emulate_avx 0
    %if avx_enabled && __sizeofreg >= 16
        %xdefine __instr v%1
    %else
        %xdefine __instr %1
        %if %0 >= 8+%4
            %assign __emulate_avx 1
        %endif
    %endif
    %ifnidn %2, fnord
        %ifdef cpuname
            %if notcpuflag(%2)
                %error use of ``%1'' %2 instruction in cpuname function: current_function
            %elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8
                %error use of ``%1'' sse2 instruction in cpuname function: current_function
            %endif
        %endif
    %endif

    %if __emulate_avx
        %xdefine __src1 %7
        %xdefine __src2 %8
        %ifnidn %6, %7
            %if %0 >= 9
                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, %8, %9
            %else
                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, %8
            %endif
            %if %5 && %4 == 0
                %ifnid %8
                    ; 3-operand AVX instructions with a memory arg can only have it in src2,
                    ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
                    ; So, if the instruction is commutative with a memory arg, swap them.
                    %xdefine __src1 %8
                    %xdefine __src2 %7
                %endif
            %endif
            %if __sizeofreg == 8
                MOVQ %6, __src1
            %elif %3
                MOVAPS %6, __src1
            %else
                MOVDQA %6, __src1
            %endif
        %endif
        %if %0 >= 9
            %1 %6, __src2, %9
        %else
            %1 %6, __src2
        %endif
    %elif %0 >= 9
        __instr %6, %7, %8, %9
    %elif %0 == 8
        __instr %6, %7, %8
    %elif %0 == 7
        __instr %6, %7
    %else
        __instr %6
    %endif
%endmacro

;%1 == instruction
;%2 == minimal instruction set
;%3 == 1 if float, 0 if int
;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
%macro AVX_INSTR 1-5 fnord, 0, 1, 0
    %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5
        %ifidn %2, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1
        %elifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro
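
; e.g. once "AVX_INSTR addps, sse, 1, 0, 1" below has been processed, writing
;     addps m0, m1, m2
; assembles as "vaddps xmm0, xmm1, xmm2" when AVX is enabled and is emulated as
; "movaps xmm0, xmm1" followed by "addps xmm0, xmm2" otherwise (illustrative,
; assuming the default m0..m2 permutation and xmm-sized registers).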

; Instructions with both VEX and non-VEX encodings
; Non-destructive instructions are written without parameters
AVX_INSTR addpd, sse2, 1, 0, 1
AVX_INSTR addps, sse, 1, 0, 1
AVX_INSTR addsd, sse2, 1, 0, 1
AVX_INSTR addss, sse, 1, 0, 1
AVX_INSTR addsubpd, sse3, 1, 0, 0
AVX_INSTR addsubps, sse3, 1, 0, 0
AVX_INSTR aesdec, fnord, 0, 0, 0
AVX_INSTR aesdeclast, fnord, 0, 0, 0
AVX_INSTR aesenc, fnord, 0, 0, 0
AVX_INSTR aesenclast, fnord, 0, 0, 0
AVX_INSTR aesimc
AVX_INSTR aeskeygenassist
AVX_INSTR andnpd, sse2, 1, 0, 0
AVX_INSTR andnps, sse, 1, 0, 0
AVX_INSTR andpd, sse2, 1, 0, 1
AVX_INSTR andps, sse, 1, 0, 1
AVX_INSTR blendpd, sse4, 1, 0, 0
AVX_INSTR blendps, sse4, 1, 0, 0
AVX_INSTR blendvpd, sse4, 1, 0, 0
AVX_INSTR blendvps, sse4, 1, 0, 0
AVX_INSTR cmppd, sse2, 1, 1, 0
AVX_INSTR cmpps, sse, 1, 1, 0
AVX_INSTR cmpsd, sse2, 1, 1, 0
AVX_INSTR cmpss, sse, 1, 1, 0
AVX_INSTR comisd, sse2
AVX_INSTR comiss, sse
AVX_INSTR cvtdq2pd, sse2
AVX_INSTR cvtdq2ps, sse2
AVX_INSTR cvtpd2dq, sse2
AVX_INSTR cvtpd2ps, sse2
AVX_INSTR cvtps2dq, sse2
AVX_INSTR cvtps2pd, sse2
AVX_INSTR cvtsd2si, sse2
AVX_INSTR cvtsd2ss, sse2
AVX_INSTR cvtsi2sd, sse2
AVX_INSTR cvtsi2ss, sse
AVX_INSTR cvtss2sd, sse2
AVX_INSTR cvtss2si, sse
AVX_INSTR cvttpd2dq, sse2
AVX_INSTR cvttps2dq, sse2
AVX_INSTR cvttsd2si, sse2
AVX_INSTR cvttss2si, sse
AVX_INSTR divpd, sse2, 1, 0, 0
AVX_INSTR divps, sse, 1, 0, 0
AVX_INSTR divsd, sse2, 1, 0, 0
AVX_INSTR divss, sse, 1, 0, 0
AVX_INSTR dppd, sse4, 1, 1, 0
AVX_INSTR dpps, sse4, 1, 1, 0
AVX_INSTR extractps, sse4
AVX_INSTR haddpd, sse3, 1, 0, 0
AVX_INSTR haddps, sse3, 1, 0, 0
AVX_INSTR hsubpd, sse3, 1, 0, 0
AVX_INSTR hsubps, sse3, 1, 0, 0
AVX_INSTR insertps, sse4, 1, 1, 0
AVX_INSTR lddqu, sse3
AVX_INSTR ldmxcsr, sse
AVX_INSTR maskmovdqu, sse2
AVX_INSTR maxpd, sse2, 1, 0, 1
AVX_INSTR maxps, sse, 1, 0, 1
AVX_INSTR maxsd, sse2, 1, 0, 1
AVX_INSTR maxss, sse, 1, 0, 1
AVX_INSTR minpd, sse2, 1, 0, 1
AVX_INSTR minps, sse, 1, 0, 1
AVX_INSTR minsd, sse2, 1, 0, 1
AVX_INSTR minss, sse, 1, 0, 1
AVX_INSTR movapd, sse2
AVX_INSTR movaps, sse
AVX_INSTR movd, mmx
AVX_INSTR movddup, sse3
AVX_INSTR movdqa, sse2
AVX_INSTR movdqu, sse2
AVX_INSTR movhlps, sse, 1, 0, 0
AVX_INSTR movhpd, sse2, 1, 0, 0
AVX_INSTR movhps, sse, 1, 0, 0
AVX_INSTR movlhps, sse, 1, 0, 0
AVX_INSTR movlpd, sse2, 1, 0, 0
AVX_INSTR movlps, sse, 1, 0, 0
AVX_INSTR movmskpd, sse2
AVX_INSTR movmskps, sse
AVX_INSTR movntdq, sse2
AVX_INSTR movntdqa, sse4
AVX_INSTR movntpd, sse2
AVX_INSTR movntps, sse
AVX_INSTR movq, mmx
AVX_INSTR movsd, sse2, 1, 0, 0
AVX_INSTR movshdup, sse3
AVX_INSTR movsldup, sse3
AVX_INSTR movss, sse, 1, 0, 0
AVX_INSTR movupd, sse2
AVX_INSTR movups, sse
AVX_INSTR mpsadbw, sse4
AVX_INSTR mulpd, sse2, 1, 0, 1
AVX_INSTR mulps, sse, 1, 0, 1
AVX_INSTR mulsd, sse2, 1, 0, 1
AVX_INSTR mulss, sse, 1, 0, 1
AVX_INSTR orpd, sse2, 1, 0, 1
AVX_INSTR orps, sse, 1, 0, 1
AVX_INSTR pabsb, ssse3
AVX_INSTR pabsd, ssse3
AVX_INSTR pabsw, ssse3
AVX_INSTR packsswb, mmx, 0, 0, 0
AVX_INSTR packssdw, mmx, 0, 0, 0
AVX_INSTR packuswb, mmx, 0, 0, 0
AVX_INSTR packusdw, sse4, 0, 0, 0
AVX_INSTR paddb, mmx, 0, 0, 1
AVX_INSTR paddw, mmx, 0, 0, 1
AVX_INSTR paddd, mmx, 0, 0, 1
AVX_INSTR paddq, sse2, 0, 0, 1
AVX_INSTR paddsb, mmx, 0, 0, 1
AVX_INSTR paddsw, mmx, 0, 0, 1
AVX_INSTR paddusb, mmx, 0, 0, 1
AVX_INSTR paddusw, mmx, 0, 0, 1
AVX_INSTR palignr, ssse3
AVX_INSTR pand, mmx, 0, 0, 1
AVX_INSTR pandn, mmx, 0, 0, 0
AVX_INSTR pavgb, mmx2, 0, 0, 1
AVX_INSTR pavgw, mmx2, 0, 0, 1
AVX_INSTR pblendvb, sse4, 0, 0, 0
AVX_INSTR pblendw, sse4
AVX_INSTR pclmulqdq
AVX_INSTR pcmpestri, sse42
AVX_INSTR pcmpestrm, sse42
AVX_INSTR pcmpistri, sse42
AVX_INSTR pcmpistrm, sse42
AVX_INSTR pcmpeqb, mmx, 0, 0, 1
AVX_INSTR pcmpeqw, mmx, 0, 0, 1
AVX_INSTR pcmpeqd, mmx, 0, 0, 1
AVX_INSTR pcmpeqq, sse4, 0, 0, 1
AVX_INSTR pcmpgtb, mmx, 0, 0, 0
AVX_INSTR pcmpgtw, mmx, 0, 0, 0
AVX_INSTR pcmpgtd, mmx, 0, 0, 0
AVX_INSTR pcmpgtq, sse42, 0, 0, 0
AVX_INSTR pextrb, sse4
AVX_INSTR pextrd, sse4
AVX_INSTR pextrq, sse4
AVX_INSTR pextrw, mmx2
AVX_INSTR phaddw, ssse3, 0, 0, 0
AVX_INSTR phaddd, ssse3, 0, 0, 0
AVX_INSTR phaddsw, ssse3, 0, 0, 0
AVX_INSTR phminposuw, sse4
AVX_INSTR phsubw, ssse3, 0, 0, 0
AVX_INSTR phsubd, ssse3, 0, 0, 0
AVX_INSTR phsubsw, ssse3, 0, 0, 0
AVX_INSTR pinsrb, sse4
AVX_INSTR pinsrd, sse4
AVX_INSTR pinsrq, sse4
AVX_INSTR pinsrw, mmx2
AVX_INSTR pmaddwd, mmx, 0, 0, 1
AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
AVX_INSTR pmaxsb, sse4, 0, 0, 1
AVX_INSTR pmaxsw, mmx2, 0, 0, 1
AVX_INSTR pmaxsd, sse4, 0, 0, 1
AVX_INSTR pmaxub, mmx2, 0, 0, 1
AVX_INSTR pmaxuw, sse4, 0, 0, 1
AVX_INSTR pmaxud, sse4, 0, 0, 1
AVX_INSTR pminsb, sse4, 0, 0, 1
AVX_INSTR pminsw, mmx2, 0, 0, 1
AVX_INSTR pminsd, sse4, 0, 0, 1
AVX_INSTR pminub, mmx2, 0, 0, 1
AVX_INSTR pminuw, sse4, 0, 0, 1
AVX_INSTR pminud, sse4, 0, 0, 1
AVX_INSTR pmovmskb, mmx2
AVX_INSTR pmovsxbw, sse4
AVX_INSTR pmovsxbd, sse4
AVX_INSTR pmovsxbq, sse4
AVX_INSTR pmovsxwd, sse4
AVX_INSTR pmovsxwq, sse4
AVX_INSTR pmovsxdq, sse4
AVX_INSTR pmovzxbw, sse4
AVX_INSTR pmovzxbd, sse4
AVX_INSTR pmovzxbq, sse4
AVX_INSTR pmovzxwd, sse4
AVX_INSTR pmovzxwq, sse4
AVX_INSTR pmovzxdq, sse4
AVX_INSTR pmuldq, sse4, 0, 0, 1
AVX_INSTR pmulhrsw, ssse3, 0, 0, 1
AVX_INSTR pmulhuw, mmx2, 0, 0, 1
AVX_INSTR pmulhw, mmx, 0, 0, 1
AVX_INSTR pmullw, mmx, 0, 0, 1
AVX_INSTR pmulld, sse4, 0, 0, 1
AVX_INSTR pmuludq, sse2, 0, 0, 1
AVX_INSTR por, mmx, 0, 0, 1
AVX_INSTR psadbw, mmx2, 0, 0, 1
AVX_INSTR pshufb, ssse3, 0, 0, 0
AVX_INSTR pshufd, sse2
AVX_INSTR pshufhw, sse2
AVX_INSTR pshuflw, sse2
AVX_INSTR psignb, ssse3, 0, 0, 0
AVX_INSTR psignw, ssse3, 0, 0, 0
AVX_INSTR psignd, ssse3, 0, 0, 0
AVX_INSTR psllw, mmx, 0, 0, 0
AVX_INSTR pslld, mmx, 0, 0, 0
AVX_INSTR psllq, mmx, 0, 0, 0
AVX_INSTR pslldq, sse2, 0, 0, 0
AVX_INSTR psraw, mmx, 0, 0, 0
AVX_INSTR psrad, mmx, 0, 0, 0
AVX_INSTR psrlw, mmx, 0, 0, 0
AVX_INSTR psrld, mmx, 0, 0, 0
AVX_INSTR psrlq, mmx, 0, 0, 0
AVX_INSTR psrldq, sse2, 0, 0, 0
AVX_INSTR psubb, mmx, 0, 0, 0
AVX_INSTR psubw, mmx, 0, 0, 0
AVX_INSTR psubd, mmx, 0, 0, 0
AVX_INSTR psubq, sse2, 0, 0, 0
AVX_INSTR psubsb, mmx, 0, 0, 0
AVX_INSTR psubsw, mmx, 0, 0, 0
AVX_INSTR psubusb, mmx, 0, 0, 0
AVX_INSTR psubusw, mmx, 0, 0, 0
AVX_INSTR ptest, sse4
AVX_INSTR punpckhbw, mmx, 0, 0, 0
AVX_INSTR punpckhwd, mmx, 0, 0, 0
AVX_INSTR punpckhdq, mmx, 0, 0, 0
AVX_INSTR punpckhqdq, sse2, 0, 0, 0
AVX_INSTR punpcklbw, mmx, 0, 0, 0
AVX_INSTR punpcklwd, mmx, 0, 0, 0
AVX_INSTR punpckldq, mmx, 0, 0, 0
AVX_INSTR punpcklqdq, sse2, 0, 0, 0
AVX_INSTR pxor, mmx, 0, 0, 1
AVX_INSTR rcpps, sse, 1, 0, 0
AVX_INSTR rcpss, sse, 1, 0, 0
AVX_INSTR roundpd, sse4
AVX_INSTR roundps, sse4
AVX_INSTR roundsd, sse4
AVX_INSTR roundss, sse4
AVX_INSTR rsqrtps, sse, 1, 0, 0
AVX_INSTR rsqrtss, sse, 1, 0, 0
AVX_INSTR shufpd, sse2, 1, 1, 0
AVX_INSTR shufps, sse, 1, 1, 0
AVX_INSTR sqrtpd, sse2, 1, 0, 0
AVX_INSTR sqrtps, sse, 1, 0, 0
AVX_INSTR sqrtsd, sse2, 1, 0, 0
AVX_INSTR sqrtss, sse, 1, 0, 0
AVX_INSTR stmxcsr, sse
AVX_INSTR subpd, sse2, 1, 0, 0
AVX_INSTR subps, sse, 1, 0, 0
AVX_INSTR subsd, sse2, 1, 0, 0
AVX_INSTR subss, sse, 1, 0, 0
AVX_INSTR ucomisd, sse2
AVX_INSTR ucomiss, sse
AVX_INSTR unpckhpd, sse2, 1, 0, 0
AVX_INSTR unpckhps, sse, 1, 0, 0
AVX_INSTR unpcklpd, sse2, 1, 0, 0
AVX_INSTR unpcklps, sse, 1, 0, 0
AVX_INSTR xorpd, sse2, 1, 0, 1
AVX_INSTR xorps, sse, 1, 0, 1

; 3DNow instructions, for sharing code between AVX, SSE and 3DNow
AVX_INSTR pfadd, 3dnow, 1, 0, 1
AVX_INSTR pfsub, 3dnow, 1, 0, 0
AVX_INSTR pfmul, 3dnow, 1, 0, 1

; base-4 constants for shuffles
%assign i 0
%rep 256
    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
    %if j < 10
        CAT_XDEFINE q000, j, i
    %elif j < 100
        CAT_XDEFINE q00, j, i
    %elif j < 1000
        CAT_XDEFINE q0, j, i
    %else
        CAT_XDEFINE q, j, i
    %endif
    %assign i i+1
%endrep
%undef i
%undef j
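
; e.g. q1032 expands to the immediate 0x4e, so (illustrative)
;     pshufd m0, m1, q1032
; swaps the two 64-bit halves of m1; the selector digits are read from the
; highest dword down to the lowest.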

%macro FMA_INSTR 3
    %macro %1 4-7 %1, %2, %3
        %if cpuflag(xop)
            v%5 %1, %2, %3, %4
        %elifnidn %1, %4
            %6 %1, %2, %3
            %7 %1, %4
        %else
            %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
        %endif
    %endmacro
%endmacro

FMA_INSTR  pmacsww,  pmullw, paddw
FMA_INSTR  pmacsdd,  pmulld, paddd ; sse4 emulation
FMA_INSTR pmacsdql,  pmuldq, paddq ; sse4 emulation
FMA_INSTR pmadcswd, pmaddwd, paddd

; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax.
; FMA3 is only possible if dst is the same as one of the src registers.
; Either src2 or src3 can be a memory operand.
%macro FMA4_INSTR 2-*
    %push fma4_instr
    %xdefine %$prefix %1
    %rep %0 - 1
        %macro %$prefix%2 4-6 %$prefix, %2
            %if notcpuflag(fma3) && notcpuflag(fma4)
                %error use of ``%5%6'' fma instruction in cpuname function: current_function
            %elif cpuflag(fma4)
                v%5%6 %1, %2, %3, %4
            %elifidn %1, %2
                ; If %3 or %4 is a memory operand it needs to be encoded as the last operand.
                %ifid %3
                    v%{5}213%6 %2, %3, %4
                %else
                    v%{5}132%6 %2, %4, %3
                %endif
            %elifidn %1, %3
                v%{5}213%6 %3, %2, %4
            %elifidn %1, %4
                v%{5}231%6 %4, %2, %3
            %else
                %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported
            %endif
        %endmacro
        %rotate 1
    %endrep
    %pop
%endmacro

FMA4_INSTR fmadd,    pd, ps, sd, ss
FMA4_INSTR fmaddsub, pd, ps
FMA4_INSTR fmsub,    pd, ps, sd, ss
FMA4_INSTR fmsubadd, pd, ps
FMA4_INSTR fnmadd,   pd, ps, sd, ss
FMA4_INSTR fnmsub,   pd, ps, sd, ss

; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0)
%ifdef __YASM_VER__
    %if __YASM_VERSION_ID__ < 0x01030000 && VPX_ARCH_X86_64 == 0
        %macro vpbroadcastq 2
            %if sizeof%1 == 16
                movddup %1, %2
            %else
                vbroadcastsd %1, %2
            %endif
        %endmacro
    %endif
%endif