1;
2; Copyright (c) 2016, Alliance for Open Media. All rights reserved
3;
4; This source code is subject to the terms of the BSD 2 Clause License and
5; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6; was not distributed with this source code in the LICENSE file, you can
7; obtain it at www.aomedia.org/license/software. If the Alliance for Open
8; Media Patent License 1.0 was not distributed with this source code in the
9; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10;
11
12;
13
14%include "aom_ports/x86_abi_support.asm"
15
;-----------------------------------------------------------------------
; GET_PARAM_4
; Fetches the arguments and builds the constants used by the
; 4-pixel-wide kernels (consumed by APPLY_FILTER_4).
;
; On exit:
;   rsi  = arg(0) src_ptr
;   rax  = arg(1) pixels_per_line (read as 32 bits, sign-extended)
;   rdi  = arg(2) output_ptr
;   rdx  = arg(3) out_pitch       (read as 32 bits, sign-extended)
;   rcx  = arg(4) output_height   (read as 32 bits, sign-extended)
;   xmm2 = all zero
;   xmm3 = 64 in every 16-bit word (rounding term for the >>7)
;   xmm4 = filter word 3 in words 0-3, filter word 4 in words 4-7
;
; The filter at arg(5) is read with movdqa, so it has to be 16-byte
; aligned. rdx and rcx are used as scratch before they receive their
; final values.
;-----------------------------------------------------------------------
%macro GET_PARAM_4 0
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rdx, arg(5)                 ;filter ptr (rdx reloaded below)

    movdqa      xmm4, [rdx]                 ;all eight filter words
    pshuflw     xmm4, xmm4, 11111111b       ;words 0-3 <- k3
    pshufhw     xmm4, xmm4, 00000000b       ;words 4-7 <- k4, i.e. k3k4

    mov         rcx, 0x0400040              ;two 16-bit words of 64
    movq        xmm3, rcx
    pshufd      xmm3, xmm3, 0               ;rounding in every word

    pxor        xmm2, xmm2                  ;zero for byte->word unpack

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm
37
;-----------------------------------------------------------------------
; APPLY_FILTER_4 avg
; Produces one output row of four pixels.
;
; In:   low 4 bytes of xmm0 = pixels for the first tap
;       low 4 bytes of xmm1 = pixels for the second tap
;       xmm2 = zero, xmm3 = rounding words, xmm4 = first-tap weight in
;       words 0-3 and second-tap weight in words 4-7
;       rsi/rdi = current source/output row, rax/rdx = their strides,
;       rcx = rows left
; avg:  when non-zero, the result is averaged (pavgb) with the four
;       bytes already at [rdi] before being stored
; Out:  4 bytes written at [rdi]; rsi += rax; rdi += rdx; rcx -= 1
;       ZF is left by the final dec, so the caller can follow with jnz.
; Clobbers: xmm0, xmm1.
;-----------------------------------------------------------------------
%macro APPLY_FILTER_4 1

    punpckldq   xmm0, xmm1                  ;both 4-byte groups in low qword
    punpcklbw   xmm0, xmm2                  ;widen the 8 bytes to words
    pmullw      xmm0, xmm4                  ;apply one weight per group

    movdqa      xmm1, xmm0
    psrldq      xmm1, 8                     ;second-tap products to words 0-3
    paddsw      xmm0, xmm1                  ;sum of the two taps

    paddsw      xmm0, xmm3                  ;add rounding
    psraw       xmm0, 7                     ;scale back down
    packuswb    xmm0, xmm0                  ;saturate words to bytes

%if %1
    movd        xmm1, [rdi]                 ;existing output pixels
    pavgb       xmm0, xmm1
%endif

    movd        [rdi], xmm0                 ;store four pixels
    add         rsi, rax                    ;next source row
    add         rdi, rdx                    ;next output row
    dec         rcx                         ;must stay last: sets ZF for jnz
%endm
62
;-----------------------------------------------------------------------
; GET_PARAM
; Fetches the arguments and builds the constants used by the 8- and
; 16-pixel-wide kernels (consumed by APPLY_FILTER_8 / APPLY_FILTER_16).
;
; On exit:
;   rsi  = arg(0) src_ptr
;   rax  = arg(1) pixels_per_line (read as 32 bits, sign-extended)
;   rdi  = arg(2) output_ptr
;   rdx  = arg(3) out_pitch       (read as 32 bits, sign-extended)
;   rcx  = arg(4) output_height   (read as 32 bits, sign-extended)
;   xmm4 = 64 in every 16-bit word (rounding term for the >>7)
;   xmm5 = all zero
;   xmm6 = filter word 3 in all eight words
;   xmm7 = filter word 4 in all eight words
;
; The filter at arg(5) is read with movdqa, so it has to be 16-byte
; aligned. rdx and rcx are used as scratch before they receive their
; final values.
;-----------------------------------------------------------------------
%macro GET_PARAM 0
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rdx, arg(5)                 ;filter ptr (rdx reloaded below)

    movdqa      xmm7, [rdx]                 ;all eight filter words

    pshuflw     xmm6, xmm7, 11111111b       ;words 0-3 <- k3
    pshufd      xmm6, xmm6, 00000000b       ;dword 0 everywhere: 8 x k3
    pshufhw     xmm7, xmm7, 00000000b       ;words 4-7 <- k4
    pshufd      xmm7, xmm7, 10101010b       ;dword 2 everywhere: 8 x k4

    mov         rcx, 0x0400040              ;two 16-bit words of 64
    movq        xmm4, rcx
    pshufd      xmm4, xmm4, 0               ;rounding in every word

    pxor        xmm5, xmm5                  ;zero for byte->word unpack

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm
85
;-----------------------------------------------------------------------
; APPLY_FILTER_8 avg
; Produces one output row of eight pixels.
;
; In:   low 8 bytes of xmm0 = pixels for the first tap
;       low 8 bytes of xmm1 = pixels for the second tap
;       xmm4 = rounding words, xmm5 = zero,
;       xmm6 / xmm7 = first / second tap weight in every word
;       rsi/rdi = current source/output row, rax/rdx = their strides,
;       rcx = rows left
; avg:  when non-zero, the result is averaged (pavgb) with the eight
;       bytes already at [rdi] before being stored
; Out:  8 bytes written at [rdi]; rsi += rax; rdi += rdx; rcx -= 1
;       ZF is left by the final dec, so the caller can follow with jnz.
; Clobbers: xmm0, xmm1.
;-----------------------------------------------------------------------
%macro APPLY_FILTER_8 1
    punpcklbw   xmm0, xmm5                  ;first-tap pixels -> words
    pmullw      xmm0, xmm6                  ;weight them
    punpcklbw   xmm1, xmm5                  ;second-tap pixels -> words
    pmullw      xmm1, xmm7                  ;weight them

    paddsw      xmm0, xmm1                  ;sum of the two taps
    paddsw      xmm0, xmm4                  ;add rounding
    psraw       xmm0, 7                     ;scale back down
    packuswb    xmm0, xmm0                  ;saturate words to bytes
%if %1
    movq        xmm1, [rdi]                 ;existing output pixels
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0                 ;store eight pixels

    add         rsi, rax                    ;next source row
    add         rdi, rdx                    ;next output row
    dec         rcx                         ;must stay last: sets ZF for jnz
%endm
106
;-----------------------------------------------------------------------
; APPLY_FILTER_16 avg
; Produces one output row of sixteen pixels.
;
; In:   xmm0 and xmm2 both hold the 16 pixels for the first tap
;       xmm1 and xmm3 both hold the 16 pixels for the second tap
;       (xmm0/xmm1 supply the low eight pixels, xmm2/xmm3 the high eight)
;       xmm4 = rounding words, xmm5 = zero,
;       xmm6 / xmm7 = first / second tap weight in every word
;       rsi/rdi = current source/output row, rax/rdx = their strides,
;       rcx = rows left
; avg:  when non-zero, the result is averaged (pavgb) with the sixteen
;       bytes already at [rdi] before being stored
; Out:  16 bytes written at [rdi] (unaligned store); rsi += rax;
;       rdi += rdx; rcx -= 1
;       ZF is left by the final dec, so the caller can follow with jnz.
; Clobbers: xmm0, xmm1, xmm2, xmm3.
;-----------------------------------------------------------------------
%macro APPLY_FILTER_16 1
    ;pixels 0-7
    punpcklbw   xmm0, xmm5                  ;first-tap pixels -> words
    punpcklbw   xmm1, xmm5                  ;second-tap pixels -> words
    pmullw      xmm0, xmm6
    pmullw      xmm1, xmm7
    paddsw      xmm0, xmm1                  ;sum of the two taps
    paddsw      xmm0, xmm4                  ;add rounding
    psraw       xmm0, 7                     ;scale back down

    ;pixels 8-15
    punpckhbw   xmm2, xmm5                  ;first-tap pixels -> words
    punpckhbw   xmm3, xmm5                  ;second-tap pixels -> words
    pmullw      xmm2, xmm6
    pmullw      xmm3, xmm7
    paddsw      xmm2, xmm3                  ;sum of the two taps
    paddsw      xmm2, xmm4                  ;add rounding
    psraw       xmm2, 7                     ;scale back down

    packuswb    xmm0, xmm2                  ;saturate both halves to bytes
%if %1
    movdqu      xmm1, [rdi]                 ;existing output pixels
    pavgb       xmm0, xmm1
%endif
    movdqu      [rdi], xmm0                 ;store sixteen pixels

    add         rsi, rax                    ;next source row
    add         rdi, rdx                    ;next output row
    dec         rcx                         ;must stay last: sets ZF for jnz
%endm
136
137SECTION .text
138
;-----------------------------------------------------------------------
; aom_filter_block1d4_v2_sse2
; Vertical two-tap filter, four pixels per row.
;
; Arguments, as read by GET_PARAM_4:
;   arg(0) src_ptr          arg(1) pixels_per_line
;   arg(2) output_ptr       arg(3) out_pitch
;   arg(4) output_height    arg(5) filter (taps taken from words 3 and 4)
;
; Every output row combines the source row at rsi with the row one
; pixels_per_line further on. The row counter is only tested after a
; row has been written, so output_height must be at least 1.
;-----------------------------------------------------------------------
global sym(aom_filter_block1d4_v2_sse2) PRIVATE
sym(aom_filter_block1d4_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; prolog complete

    GET_PARAM_4
.next_row:
    movd        xmm1, [rsi + rax]           ;four pixels of the row below
    movd        xmm0, [rsi]                 ;four pixels of the current row

    ; filter, store, step both pointers, count the row down
    APPLY_FILTER_4 0
    jnz         .next_row

    ; epilog: undo the prolog in reverse order
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
162
;-----------------------------------------------------------------------
; aom_filter_block1d8_v2_sse2
; Vertical two-tap filter, eight pixels per row.
;
; Arguments, as read by GET_PARAM:
;   arg(0) src_ptr          arg(1) pixels_per_line
;   arg(2) output_ptr       arg(3) out_pitch
;   arg(4) output_height    arg(5) filter (taps taken from words 3 and 4)
;
; Every output row combines the source row at rsi with the row one
; pixels_per_line further on. The row counter is only tested after a
; row has been written, so output_height must be at least 1.
;-----------------------------------------------------------------------
global sym(aom_filter_block1d8_v2_sse2) PRIVATE
sym(aom_filter_block1d8_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    ; GET_PARAM writes xmm6 and xmm7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; prolog complete

    GET_PARAM
.next_row:
    movq        xmm1, [rsi + rax]           ;eight pixels of the row below
    movq        xmm0, [rsi]                 ;eight pixels of the current row

    ; filter, store, step both pointers, count the row down
    APPLY_FILTER_8 0
    jnz         .next_row

    ; epilog: undo the prolog in reverse order
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
188
;-----------------------------------------------------------------------
; aom_filter_block1d16_v2_sse2
; Vertical two-tap filter, sixteen pixels per row.
;
; Arguments, as read by GET_PARAM:
;   arg(0) src_ptr          arg(1) pixels_per_line
;   arg(2) output_ptr       arg(3) out_pitch
;   arg(4) output_height    arg(5) filter (taps taken from words 3 and 4)
;
; Every output row combines the source row at rsi with the row one
; pixels_per_line further on. Source and output rows are accessed with
; unaligned moves. The row counter is only tested after a row has been
; written, so output_height must be at least 1.
;-----------------------------------------------------------------------
global sym(aom_filter_block1d16_v2_sse2) PRIVATE
sym(aom_filter_block1d16_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    ; GET_PARAM writes xmm6 and xmm7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; prolog complete

    GET_PARAM
.next_row:
    ; APPLY_FILTER_16 unpacks the low half from xmm0/xmm1 and the high
    ; half from xmm2/xmm3, so each row is needed in two registers
    movdqu      xmm0, [rsi]                 ;current row
    movdqa      xmm2, xmm0
    movdqu      xmm1, [rsi + rax]           ;row below
    movdqa      xmm3, xmm1

    ; filter, store, step both pointers, count the row down
    APPLY_FILTER_16 0
    jnz         .next_row

    ; epilog: undo the prolog in reverse order
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
216
;-----------------------------------------------------------------------
; aom_filter_block1d4_h2_sse2
; Horizontal two-tap filter, four pixels per row.
;
; Arguments, as read by GET_PARAM_4:
;   arg(0) src_ptr          arg(1) pixels_per_line
;   arg(2) output_ptr       arg(3) out_pitch
;   arg(4) output_height    arg(5) filter (taps taken from words 3 and 4)
;
; Every output pixel combines source pixel x with pixel x+1 of the same
; row. Only the five source bytes that contribute to the result are
; read per row: two 4-byte loads at [rsi] and [rsi + 1], rather than a
; 16-byte load that would touch eleven bytes past the last needed pixel.
; The row counter is only tested after a row has been written, so
; output_height must be at least 1.
;-----------------------------------------------------------------------
global sym(aom_filter_block1d4_h2_sse2) PRIVATE
sym(aom_filter_block1d4_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.next_row:
    movd        xmm0, [rsi]                 ;pixels x   .. x+3
    movd        xmm1, [rsi + 1]             ;pixels x+1 .. x+4

    ; filter, store, step both pointers, count the row down
    APPLY_FILTER_4 0
    jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
241
;-----------------------------------------------------------------------
; aom_filter_block1d8_h2_sse2
; Horizontal two-tap filter, eight pixels per row.
;
; Arguments, as read by GET_PARAM:
;   arg(0) src_ptr          arg(1) pixels_per_line
;   arg(2) output_ptr       arg(3) out_pitch
;   arg(4) output_height    arg(5) filter (taps taken from words 3 and 4)
;
; Every output pixel combines source pixel x with pixel x+1 of the same
; row. Only the nine source bytes that contribute to the result are
; read per row: two 8-byte loads at [rsi] and [rsi + 1], rather than a
; 16-byte load that would touch seven bytes past the last needed pixel.
; The row counter is only tested after a row has been written, so
; output_height must be at least 1.
;-----------------------------------------------------------------------
global sym(aom_filter_block1d8_h2_sse2) PRIVATE
sym(aom_filter_block1d8_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    ; GET_PARAM writes xmm6 and xmm7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movq        xmm0, [rsi]                 ;pixels x   .. x+7
    movq        xmm1, [rsi + 1]             ;pixels x+1 .. x+8

    ; filter, store, step both pointers, count the row down
    APPLY_FILTER_8 0
    jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
268
;-----------------------------------------------------------------------
; aom_filter_block1d16_h2_sse2
; Horizontal two-tap filter, sixteen pixels per row.
;
; Arguments, as read by GET_PARAM:
;   arg(0) src_ptr          arg(1) pixels_per_line
;   arg(2) output_ptr       arg(3) out_pitch
;   arg(4) output_height    arg(5) filter (taps taken from words 3 and 4)
;
; Every output pixel combines source pixel x with pixel x+1 of the same
; row; seventeen source bytes are read per row through two unaligned
; 16-byte loads. The row counter is only tested after a row has been
; written, so output_height must be at least 1.
;-----------------------------------------------------------------------
global sym(aom_filter_block1d16_h2_sse2) PRIVATE
sym(aom_filter_block1d16_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    ; GET_PARAM writes xmm6 and xmm7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; prolog complete

    GET_PARAM
.next_row:
    ; APPLY_FILTER_16 unpacks the low half from xmm0/xmm1 and the high
    ; half from xmm2/xmm3, so each load is needed in two registers
    movdqu      xmm0, [rsi]                 ;pixels x   .. x+15
    movdqa      xmm2, xmm0
    movdqu      xmm1, [rsi + 1]             ;pixels x+1 .. x+16
    movdqa      xmm3, xmm1

    ; filter, store, step both pointers, count the row down
    APPLY_FILTER_16 0
    jnz         .next_row

    ; epilog: undo the prolog in reverse order
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
296