;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

SECTION .text

;void vpx_highbd_calc16x16var_sse2
;(
;    uint16_t        *  src_ptr,
;    int             src_stride,
;    uint16_t        *  ref_ptr,
;    int             ref_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
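;
; This routine only fills *SSE (sum of squared differences) and *Sum (raw sum
; of differences); it does not form the variance itself. A minimal caller
; sketch, assuming the standard identity variance = SSE - Sum^2 / N with
; N = 256 for a 16x16 block (hypothetical wrapper, not part of this file):
;
;     unsigned int sse;
;     int sum;
;     vpx_highbd_calc16x16var_sse2(src, src_stride, ref, ref_stride,
;                                  &sse, &sum);
;     unsigned int var = sse - (unsigned int)(((int64_t)sum * sum) >> 8);
;
; Here src/ref point to 16-bit samples and both strides are in samples; the
; code below doubles them to step through memory in bytes.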
global sym(vpx_highbd_calc16x16var_sse2) PRIVATE
sym(vpx_highbd_calc16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push rbx
    push rsi
    push rdi
    ; end prolog

        mov         rsi,            arg(0) ;[src_ptr]
        mov         rdi,            arg(2) ;[ref_ptr]

        movsxd      rax,            DWORD PTR arg(1) ;[src_stride]
        movsxd      rdx,            DWORD PTR arg(3) ;[ref_stride]
        add         rax,            rax ; source stride in bytes
        add         rdx,            rdx ; recon stride in bytes

        ; Prefetch data
        prefetcht0      [rsi]
        prefetcht0      [rsi+16]
        prefetcht0      [rsi+rax]
        prefetcht0      [rsi+rax+16]
        lea             rbx,    [rsi+rax*2]
        prefetcht0      [rbx]
        prefetcht0      [rbx+16]
        prefetcht0      [rbx+rax]
        prefetcht0      [rbx+rax+16]

        prefetcht0      [rdi]
        prefetcht0      [rdi+16]
        prefetcht0      [rdi+rdx]
        prefetcht0      [rdi+rdx+16]
        lea             rbx,    [rdi+rdx*2]
        prefetcht0      [rbx]
        prefetcht0      [rbx+16]
        prefetcht0      [rbx+rdx]
        prefetcht0      [rbx+rdx+16]

        pxor        xmm0,           xmm0     ; clear xmm0 for unpack
        pxor        xmm7,           xmm7     ; clear xmm7 for accumulating diffs

        pxor        xmm6,           xmm6     ; clear xmm6 for accumulating sse
        mov         rcx,            16

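        ; Each pass through .var16loop handles two rows of 16 16-bit samples
        ; (four 8-sample chunks). For each chunk the source/reference
        ; difference is added to the per-iteration row sum in xmm5 and its
        ; square (via pmaddwd) to the running SSE accumulator in xmm6.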
.var16loop:
        movdqu      xmm1,           XMMWORD PTR [rsi]
        movdqu      xmm2,           XMMWORD PTR [rdi]

        lea             rbx,    [rsi+rax*2]
        prefetcht0      [rbx]
        prefetcht0      [rbx+16]
        prefetcht0      [rbx+rax]
        prefetcht0      [rbx+rax+16]
        lea             rbx,    [rdi+rdx*2]
        prefetcht0      [rbx]
        prefetcht0      [rbx+16]
        prefetcht0      [rbx+rdx]
        prefetcht0      [rbx+rdx+16]

        pxor        xmm5,           xmm5

        psubw       xmm1,           xmm2
        movdqu      xmm3,           XMMWORD PTR [rsi+16]
        paddw       xmm5,           xmm1
        pmaddwd     xmm1,           xmm1
        movdqu      xmm2,           XMMWORD PTR [rdi+16]
        paddd       xmm6,           xmm1

        psubw       xmm3,           xmm2
        movdqu      xmm1,           XMMWORD PTR [rsi+rax]
        paddw       xmm5,           xmm3
        pmaddwd     xmm3,           xmm3
        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]
        paddd       xmm6,           xmm3

        psubw       xmm1,           xmm2
        movdqu      xmm3,           XMMWORD PTR [rsi+rax+16]
        paddw       xmm5,           xmm1
        pmaddwd     xmm1,           xmm1
        movdqu      xmm2,           XMMWORD PTR [rdi+rdx+16]
        paddd       xmm6,           xmm1

        psubw       xmm3,           xmm2
        paddw       xmm5,           xmm3
        pmaddwd     xmm3,           xmm3
        paddd       xmm6,           xmm3

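        ; xmm5 now holds eight signed 16-bit partial row sums. Build a 0xffff
        ; mask for the words that are neither greater than nor equal to zero
        ; (i.e. negative), then interleave it in as the high half of each
        ; lane, sign-extending the sums to 32 bits before adding them to the
        ; running Sum accumulator in xmm7.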
        movdqa      xmm1,           xmm5
        movdqa      xmm2,           xmm5
        pcmpgtw     xmm1,           xmm0
        pcmpeqw     xmm2,           xmm0
        por         xmm1,           xmm2
        pcmpeqw     xmm1,           xmm0
        movdqa      xmm2,           xmm5
        punpcklwd   xmm5,           xmm1
        punpckhwd   xmm2,           xmm1
        paddd       xmm7,           xmm5
        paddd       xmm7,           xmm2

        lea         rsi,            [rsi + 2*rax]
        lea         rdi,            [rdi + 2*rdx]
        sub         rcx,            2
        jnz         .var16loop

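        ; Horizontally fold the four 32-bit lanes of xmm6 (SSE) and xmm7 (Sum)
        ; so each total ends up in the low dword of its register.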
        movdqa      xmm4,           xmm6
        punpckldq   xmm6,           xmm0

        punpckhdq   xmm4,           xmm0
        movdqa      xmm5,           xmm7

        paddd       xmm6,           xmm4
        punpckldq   xmm7,           xmm0

        punpckhdq   xmm5,           xmm0
        paddd       xmm7,           xmm5

        movdqa      xmm4,           xmm6
        movdqa      xmm5,           xmm7

        psrldq      xmm4,           8
        psrldq      xmm5,           8

        paddd       xmm6,           xmm4
        paddd       xmm7,           xmm5

        mov         rdi,            arg(4)   ; [SSE]
        mov         rax,            arg(5)   ; [Sum]

        movd DWORD PTR [rdi],       xmm6
        movd DWORD PTR [rax],       xmm7


    ; begin epilog
    pop rdi
    pop rsi
    pop rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vpx_highbd_calc8x8var_sse2
;(
;    uint16_t        *  src_ptr,
;    int             src_stride,
;    uint16_t        *  ref_ptr,
;    int             ref_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
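;
; As with the 16x16 routine above, only *SSE and *Sum are written. A caller
; would typically form the 8x8 variance with the same identity but N = 64,
; e.g. sse - (unsigned int)(((int64_t)sum * sum) >> 6) (hypothetical wrapper,
; not part of this file).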
global sym(vpx_highbd_calc8x8var_sse2) PRIVATE
sym(vpx_highbd_calc8x8var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push rbx
    push rsi
    push rdi
    ; end prolog

        mov         rsi,            arg(0) ;[src_ptr]
        mov         rdi,            arg(2) ;[ref_ptr]

        movsxd      rax,            DWORD PTR arg(1) ;[src_stride]
        movsxd      rdx,            DWORD PTR arg(3) ;[ref_stride]
        add         rax,            rax ; source stride in bytes
        add         rdx,            rdx ; recon stride in bytes

        ; Prefetch data
        prefetcht0      [rsi]
        prefetcht0      [rsi+rax]
        lea             rbx,    [rsi+rax*2]
        prefetcht0      [rbx]
        prefetcht0      [rbx+rax]

        prefetcht0      [rdi]
        prefetcht0      [rdi+rdx]
        lea             rbx,    [rdi+rdx*2]
        prefetcht0      [rbx]
        prefetcht0      [rbx+rdx]

        pxor        xmm0,           xmm0     ; clear xmm0 for unpack
        pxor        xmm7,           xmm7     ; clear xmm7 for accumulating diffs

        pxor        xmm6,           xmm6     ; clear xmm6 for accumulating sse
        mov         rcx,            8

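        ; Each pass through .var8loop handles four rows of 8 16-bit samples;
        ; the sign extension of the row sums and the final horizontal
        ; reduction mirror the 16x16 routine above.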
.var8loop:
        movdqu      xmm1,           XMMWORD PTR [rsi]
        movdqu      xmm2,           XMMWORD PTR [rdi]

        lea             rbx,    [rsi+rax*4]
        prefetcht0      [rbx]
        prefetcht0      [rbx+rax]
        lea             rbx,    [rbx+rax*2]
        prefetcht0      [rbx]
        prefetcht0      [rbx+rax]
        lea             rbx,    [rdi+rdx*4]
        prefetcht0      [rbx]
        prefetcht0      [rbx+rdx]
        lea             rbx,    [rbx+rdx*2]
        prefetcht0      [rbx]
        prefetcht0      [rbx+rdx]

        pxor        xmm5,           xmm5

        psubw       xmm1,           xmm2
        movdqu      xmm3,           XMMWORD PTR [rsi+rax]
        paddw       xmm5,           xmm1
        pmaddwd     xmm1,           xmm1
        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]
        paddd       xmm6,           xmm1

        lea         rsi,            [rsi + 2*rax]
        lea         rdi,            [rdi + 2*rdx]

        psubw       xmm3,           xmm2
        movdqu      xmm1,           XMMWORD PTR [rsi]
        paddw       xmm5,           xmm3
        pmaddwd     xmm3,           xmm3
        movdqu      xmm2,           XMMWORD PTR [rdi]
        paddd       xmm6,           xmm3

        psubw       xmm1,           xmm2
        movdqu      xmm3,           XMMWORD PTR [rsi+rax]
        paddw       xmm5,           xmm1
        pmaddwd     xmm1,           xmm1
        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]
        paddd       xmm6,           xmm1

        psubw       xmm3,           xmm2
        paddw       xmm5,           xmm3
        pmaddwd     xmm3,           xmm3
        paddd       xmm6,           xmm3

        movdqa      xmm1,           xmm5
        movdqa      xmm2,           xmm5
        pcmpgtw     xmm1,           xmm0
        pcmpeqw     xmm2,           xmm0
        por         xmm1,           xmm2
        pcmpeqw     xmm1,           xmm0
        movdqa      xmm2,           xmm5
        punpcklwd   xmm5,           xmm1
        punpckhwd   xmm2,           xmm1
        paddd       xmm7,           xmm5
        paddd       xmm7,           xmm2

        lea         rsi,            [rsi + 2*rax]
        lea         rdi,            [rdi + 2*rdx]
        sub         rcx,            4
        jnz         .var8loop

        movdqa      xmm4,           xmm6
        punpckldq   xmm6,           xmm0

        punpckhdq   xmm4,           xmm0
        movdqa      xmm5,           xmm7

        paddd       xmm6,           xmm4
        punpckldq   xmm7,           xmm0

        punpckhdq   xmm5,           xmm0
        paddd       xmm7,           xmm5

        movdqa      xmm4,           xmm6
        movdqa      xmm5,           xmm7

        psrldq      xmm4,           8
        psrldq      xmm5,           8

        paddd       xmm6,           xmm4
        paddd       xmm7,           xmm5

        mov         rdi,            arg(4)   ; [SSE]
        mov         rax,            arg(5)   ; [Sum]

        movd DWORD PTR [rdi],       xmm6
        movd DWORD PTR [rax],       xmm7

    ; begin epilog
    pop rdi
    pop rsi
    pop rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret