1;
2; Copyright (c) 2016, Alliance for Open Media. All rights reserved
3;
4; This source code is subject to the terms of the BSD 2 Clause License and
5; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6; was not distributed with this source code in the LICENSE file, you can
7; obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
8; Media Patent License 1.0 was not distributed with this source code in the
9; PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
10;
11
12;
13
14%include "../ASM_SSE2/x86_abi_support.asm"
15
;------------------------------------------------------------------------------
; GET_PARAM_4
; Loads the arguments and constants used by the 4-pixel-wide kernels.
;
; Register state on exit:
;   rsi  = arg(0)  src_ptr
;   rdi  = arg(2)  output_ptr
;   rax  = arg(1)  pixels_per_line (sign-extended dword)
;   rdx  = arg(3)  out_pitch       (sign-extended dword)
;   rcx  = arg(4)  output_height   (sign-extended dword)
;   xmm2 = 0x0100 in every word (multiplier for pmulhrsw)
;   xmm3 = byte pair {tap 3, tap 4} replicated across the low four words
;
; The filter table at arg(5) is read with movdqa, so it must be 16-byte
; aligned. ecx and rdx are used as scratch before they receive their final
; values, which is why the constants are built first.
;------------------------------------------------------------------------------
%macro GET_PARAM_4 0
    mov         ecx, 0x01000100             ;two words of 0x0100
    movd        xmm2, ecx
    pshufd      xmm2, xmm2, 0               ;replicate to all eight words

    mov         rdx, arg(5)                 ;filter ptr (16-bit taps)
    movdqa      xmm3, [rdx]
    psrldq      xmm3, 6                     ;shift out taps 0-2: tap 3 -> word 0
    packsswb    xmm3, xmm3                  ;narrow the taps to signed bytes
    pshuflw     xmm3, xmm3, 0               ;{tap3, tap4} into each low word

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm
34
;------------------------------------------------------------------------------
; APPLY_FILTER_4 avg
; Produces one row of four output bytes and steps both pointers to the next
; row.
;
;   %1  : 0 = store the filtered bytes; non-zero = average them (pavgb) with
;         the four bytes already at [rdi] before storing
;   In  : xmm0 = source bytes for the first tap, xmm1 = source bytes for the
;         second tap, xmm2/xmm3 = constants as set up by GET_PARAM_4
;   Out : rsi += rax, rdi += rdx, rcx -= 1. The final dec leaves ZF for the
;         caller's loop branch.
;   Clobbers xmm0 (and xmm1 when %1 is non-zero).
;------------------------------------------------------------------------------
%macro APPLY_FILTER_4 1
    punpcklbw   xmm0, xmm1                  ;interleave into (a, b) byte pairs
    pmaddubsw   xmm0, xmm3                  ;a*tap3 + b*tap4 for each pair
    pmulhrsw    xmm0, xmm2                  ;rounding(+64)+shift(>>7)
    packuswb    xmm0, xmm0                  ;saturate words to unsigned bytes
    lea         rsi, [rsi + rax]            ;rsi is not read below; step it now

%if %1
    movd        xmm1, [rdi]                 ;bytes currently in the destination
    pavgb       xmm0, xmm1
%endif
    movd        [rdi], xmm0

    lea         rdi, [rdi + rdx]
    dec         rcx                         ;keep last: its ZF drives the loop
%endm
51
;------------------------------------------------------------------------------
; GET_PARAM
; Loads the arguments and constants used by the 8- and 16-pixel-wide kernels.
;
; Register state on exit:
;   rsi  = arg(0)  src_ptr
;   rdi  = arg(2)  output_ptr
;   rax  = arg(1)  pixels_per_line (sign-extended dword)
;   rdx  = arg(3)  out_pitch       (sign-extended dword)
;   rcx  = arg(4)  output_height   (sign-extended dword)
;   xmm6 = 0x0100 in every word (multiplier for pmulhrsw)
;   xmm7 = byte pair {tap 3, tap 4} replicated across all eight words
;
; The filter table at arg(5) is read with movdqa, so it must be 16-byte
; aligned. ecx and rdx are used as scratch before they receive their final
; values, which is why the constants are built first.
;------------------------------------------------------------------------------
%macro GET_PARAM 0
    mov         ecx, 0x01000100             ;two words of 0x0100
    movd        xmm6, ecx
    pshufd      xmm6, xmm6, 0               ;replicate to all eight words

    mov         rdx, arg(5)                 ;filter ptr (16-bit taps)
    movdqa      xmm7, [rdx]
    psrldq      xmm7, 6                     ;shift out taps 0-2: tap 3 -> word 0
    packsswb    xmm7, xmm7                  ;narrow the taps to signed bytes
    pshuflw     xmm7, xmm7, 0               ;{tap3, tap4} into each low word
    punpcklwd   xmm7, xmm7                  ;...and on into the high four words

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm
71
;------------------------------------------------------------------------------
; APPLY_FILTER_8 avg
; Produces one row of eight output bytes and steps both pointers to the next
; row.
;
;   %1  : 0 = store the filtered bytes; non-zero = average them (pavgb) with
;         the eight bytes already at [rdi] before storing
;   In  : xmm0 = source bytes for the first tap, xmm1 = source bytes for the
;         second tap (low 8 bytes of each are used), xmm6/xmm7 = constants as
;         set up by GET_PARAM
;   Out : rsi += rax, rdi += rdx, rcx -= 1. The final dec leaves ZF for the
;         caller's loop branch.
;   Clobbers xmm0 (and xmm1 when %1 is non-zero).
;------------------------------------------------------------------------------
%macro APPLY_FILTER_8 1
    punpcklbw   xmm0, xmm1                  ;interleave into (a, b) byte pairs
    pmaddubsw   xmm0, xmm7                  ;a*tap3 + b*tap4 for each pair
    pmulhrsw    xmm0, xmm6                  ;rounding(+64)+shift(>>7)
    packuswb    xmm0, xmm0                  ;saturate words to unsigned bytes
    lea         rsi, [rsi + rax]            ;rsi is not read below; step it now

%if %1
    movq        xmm1, [rdi]                 ;bytes currently in the destination
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0                 ;write the eight result bytes

    lea         rdi, [rdi + rdx]
    dec         rcx                         ;keep last: its ZF drives the loop
%endm
89
;------------------------------------------------------------------------------
; APPLY_FILTER_16 avg
; Produces one row of sixteen output bytes and steps both pointers to the
; next row.
;
;   %1  : 0 = store the filtered bytes; non-zero = average them (pavgb) with
;         the sixteen bytes already at [rdi] before storing
;   In  : xmm0 = 16 source bytes for the first tap, xmm2 = copy of xmm0 (its
;         high half is consumed here), xmm1 = 16 source bytes for the second
;         tap, xmm6/xmm7 = constants as set up by GET_PARAM
;   Out : rsi += rax, rdi += rdx, rcx -= 1. The final dec leaves ZF for the
;         caller's loop branch.
;   Clobbers xmm0 and xmm2 (and xmm1 when %1 is non-zero).
;------------------------------------------------------------------------------
%macro APPLY_FILTER_16 1
    ; low eight pixels
    punpcklbw   xmm0, xmm1                  ;(a, b) pairs for bytes 0-7
    pmaddubsw   xmm0, xmm7                  ;a*tap3 + b*tap4 for each pair
    pmulhrsw    xmm0, xmm6                  ;rounding(+64)+shift(>>7)

    ; high eight pixels
    punpckhbw   xmm2, xmm1                  ;(a, b) pairs for bytes 8-15
    pmaddubsw   xmm2, xmm7
    pmulhrsw    xmm2, xmm6

    packuswb    xmm0, xmm2                  ;both halves back to 16 bytes
    lea         rsi, [rsi + rax]            ;rsi is not read below; step it now

%if %1
    movdqu      xmm1, [rdi]                 ;bytes currently in the destination
    pavgb       xmm0, xmm1
%endif
    movdqu      [rdi], xmm0                 ;write the sixteen result bytes

    lea         rdi, [rdi + rdx]
    dec         rcx                         ;keep last: its ZF drives the loop
%endm
110
111SECTION .text
112
;------------------------------------------------------------------------------
; svt_aom_filter_block1d4_v2_ssse3
; Two-tap filter, four bytes per row. Each output byte combines a source byte
; with the byte at the same column one line further on ([rsi + pixels_per_line]),
; weighted by taps 3 and 4 of the filter table.
;
;   arg(0) src_ptr          arg(1) pixels_per_line (dword)
;   arg(2) output_ptr       arg(3) out_pitch       (dword)
;   arg(4) output_height    arg(5) filter ptr (16-byte aligned, 16-bit taps)
;
; The loop tests the row counter after the first row has been written, so
; output_height must be at least 1.
; Uses xmm0-xmm3 only.
;------------------------------------------------------------------------------
global sym(svt_aom_filter_block1d4_v2_ssse3) PRIVATE
sym(svt_aom_filter_block1d4_v2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.next_row:
    movd        xmm1, [rsi + rax]           ;4 bytes from the following line
    movd        xmm0, [rsi]                 ;4 bytes from the current line

    APPLY_FILTER_4 0
    jnz         .next_row                   ;ZF comes from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
136
;------------------------------------------------------------------------------
; svt_aom_filter_block1d8_v2_ssse3
; Two-tap filter, eight bytes per row. Each output byte combines a source byte
; with the byte at the same column one line further on ([rsi + pixels_per_line]),
; weighted by taps 3 and 4 of the filter table.
;
;   arg(0) src_ptr          arg(1) pixels_per_line (dword)
;   arg(2) output_ptr       arg(3) out_pitch       (dword)
;   arg(4) output_height    arg(5) filter ptr (16-byte aligned, 16-bit taps)
;
; The loop tests the row counter after the first row has been written, so
; output_height must be at least 1.
; Uses xmm0, xmm1, xmm6 and xmm7; SAVE_XMM/RESTORE_XMM (from the ABI support
; include) bracket the body -- presumably to preserve xmm6/xmm7 where the
; calling convention requires it.
;------------------------------------------------------------------------------
global sym(svt_aom_filter_block1d8_v2_ssse3) PRIVATE
sym(svt_aom_filter_block1d8_v2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movq        xmm1, [rsi + rax]           ;8 bytes from the following line
    movq        xmm0, [rsi]                 ;8 bytes from the current line

    APPLY_FILTER_8 0
    jnz         .next_row                   ;ZF comes from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
162
;------------------------------------------------------------------------------
; svt_aom_filter_block1d16_v2_ssse3
; Two-tap filter, sixteen bytes per row. Each output byte combines a source
; byte with the byte at the same column one line further on
; ([rsi + pixels_per_line]), weighted by taps 3 and 4 of the filter table.
;
;   arg(0) src_ptr          arg(1) pixels_per_line (dword)
;   arg(2) output_ptr       arg(3) out_pitch       (dword)
;   arg(4) output_height    arg(5) filter ptr (16-byte aligned, 16-bit taps)
;
; The loop tests the row counter after the first row has been written, so
; output_height must be at least 1.
; Uses xmm0-xmm2, xmm6 and xmm7; SAVE_XMM/RESTORE_XMM (from the ABI support
; include) bracket the body -- presumably to preserve xmm6/xmm7 where the
; calling convention requires it.
;------------------------------------------------------------------------------
global sym(svt_aom_filter_block1d16_v2_ssse3) PRIVATE
sym(svt_aom_filter_block1d16_v2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movdqu      xmm0, [rsi]                 ;16 bytes from the current line
    movdqa      xmm2, xmm0                  ;second copy feeds the high half
    movdqu      xmm1, [rsi + rax]           ;16 bytes from the following line

    APPLY_FILTER_16 0
    jnz         .next_row                   ;ZF comes from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
189
;------------------------------------------------------------------------------
; svt_aom_filter_block1d4_h2_ssse3
; Two-tap filter, four bytes per row. Each output byte combines a source byte
; with its right-hand neighbour ([rsi + 1]), weighted by taps 3 and 4 of the
; filter table.
;
;   arg(0) src_ptr          arg(1) pixels_per_line (dword)
;   arg(2) output_ptr       arg(3) out_pitch       (dword)
;   arg(4) output_height    arg(5) filter ptr (16-byte aligned, 16-bit taps)
;
; Only source bytes 0..4 of a row contribute to the four stored bytes, so the
; row is fetched with two 4-byte loads rather than one 16-byte load; nothing
; beyond byte 4 is touched.
; The loop tests the row counter after the first row has been written, so
; output_height must be at least 1.
; Uses xmm0-xmm3 only.
;------------------------------------------------------------------------------
global sym(svt_aom_filter_block1d4_h2_ssse3) PRIVATE
sym(svt_aom_filter_block1d4_h2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.loop:
    movd        xmm0, [rsi]                 ;source bytes 0..3
    movd        xmm1, [rsi + 1]             ;source bytes 1..4 (right neighbours)

    APPLY_FILTER_4 0
    jnz         .loop                       ;ZF comes from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
214
;------------------------------------------------------------------------------
; svt_aom_filter_block1d8_h2_ssse3
; Two-tap filter, eight bytes per row. Each output byte combines a source byte
; with its right-hand neighbour ([rsi + 1]), weighted by taps 3 and 4 of the
; filter table.
;
;   arg(0) src_ptr          arg(1) pixels_per_line (dword)
;   arg(2) output_ptr       arg(3) out_pitch       (dword)
;   arg(4) output_height    arg(5) filter ptr (16-byte aligned, 16-bit taps)
;
; Only source bytes 0..8 of a row contribute to the eight stored bytes, so the
; row is fetched with two 8-byte loads rather than one 16-byte load; nothing
; beyond byte 8 is touched.
; The loop tests the row counter after the first row has been written, so
; output_height must be at least 1.
; Uses xmm0, xmm1, xmm6 and xmm7; SAVE_XMM/RESTORE_XMM (from the ABI support
; include) bracket the body -- presumably to preserve xmm6/xmm7 where the
; calling convention requires it.
;------------------------------------------------------------------------------
global sym(svt_aom_filter_block1d8_h2_ssse3) PRIVATE
sym(svt_aom_filter_block1d8_h2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movq        xmm0, [rsi]                 ;source bytes 0..7
    movq        xmm1, [rsi + 1]             ;source bytes 1..8 (right neighbours)

    APPLY_FILTER_8 0
    jnz         .loop                       ;ZF comes from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
241
;------------------------------------------------------------------------------
; svt_aom_filter_block1d16_h2_ssse3
; Two-tap filter, sixteen bytes per row. Each output byte combines a source
; byte with its right-hand neighbour ([rsi + 1]), weighted by taps 3 and 4 of
; the filter table.
;
;   arg(0) src_ptr          arg(1) pixels_per_line (dword)
;   arg(2) output_ptr       arg(3) out_pitch       (dword)
;   arg(4) output_height    arg(5) filter ptr (16-byte aligned, 16-bit taps)
;
; The loop tests the row counter after the first row has been written, so
; output_height must be at least 1.
; Uses xmm0-xmm2, xmm6 and xmm7; SAVE_XMM/RESTORE_XMM (from the ABI support
; include) bracket the body -- presumably to preserve xmm6/xmm7 where the
; calling convention requires it.
;------------------------------------------------------------------------------
global sym(svt_aom_filter_block1d16_h2_ssse3) PRIVATE
sym(svt_aom_filter_block1d16_h2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movdqu      xmm0, [rsi]                 ;source bytes 0..15
    movdqa      xmm2, xmm0                  ;second copy feeds the high half
    movdqu      xmm1, [rsi + 1]             ;source bytes 1..16 (right neighbours)

    APPLY_FILTER_16 0
    jnz         .next_row                   ;ZF comes from dec rcx in the macro

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
268