1;
2; Copyright (c) 2016, Alliance for Open Media. All rights reserved
3;
4; This source code is subject to the terms of the BSD 2 Clause License and
5; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6; was not distributed with this source code in the LICENSE file, you can
7; obtain it at www.aomedia.org/license/software. If the Alliance for Open
8; Media Patent License 1.0 was not distributed with this source code in the
9; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10;
11
12;
13
14
15%include "aom_ports/x86_abi_support.asm"
16
17SECTION .text
18
;-----------------------------------------------------------------------
; aom_highbd_calc16x16var_sse2
;
; For a 16x16 block of 16-bit samples, accumulates
;     Sum = sum of (src - ref)
;     SSE = sum of (src - ref)^2
; and stores both as 32-bit values through the output pointers.
;
; Arguments (read through arg(n) after SHADOW_ARGS_TO_STACK 6):
;   arg(0)  src_ptr        first source sample
;   arg(1)  source_stride  source row pitch; doubled below to get bytes
;   arg(2)  ref_ptr        first reference sample
;   arg(3)  recon_stride   reference row pitch; doubled below to get bytes
;   arg(4)  SSE            receives the low 32 bits of the squared-diff total
;   arg(5)  Sum            receives the low 32 bits of the signed diff total
;
; Register roles inside the body:
;   rsi / rdi   current source / reference row
;   rax / rdx   source / reference row pitch in bytes
;   rbx         scratch address for prefetching
;   rcx         row pairs still to process
;   xmm5        word-wide difference sums of the current row pair
;   xmm6        four dword partial sums of squared differences
;   xmm7        four dword partial sums of differences
;
; NOTE(review): older comments described this as returning unsigned int
; and taking unsigned char pointers. The code never sets a return value
; (rax holds the Sum pointer on exit) and treats samples as 16-bit words;
; confirm against the C prototype.
; NOTE(review): each xmm5 lane adds four differences with wrapping paddw
; before it is widened, so inputs are presumed small enough that four
; differences fit in a signed word - confirm against the caller.
;-----------------------------------------------------------------------
global sym(aom_highbd_calc16x16var_sse2) PRIVATE
sym(aom_highbd_calc16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

        ; Row pitches arrive as 32-bit signed values; widen, then scale by
        ; two because every sample occupies one 16-bit word.
        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
        shl         rax,            1        ; source pitch in bytes
        shl         rdx,            1        ; reference pitch in bytes

        mov         rsi,            arg(0) ;[src_ptr]
        mov         rdi,            arg(2) ;[ref_ptr]

        ; Warm the cache with the first four rows (32 bytes each) of the
        ; source block ...
        prefetcht0      [rsi]
        prefetcht0      [rsi+16]
        prefetcht0      [rsi+rax]
        prefetcht0      [rsi+rax+16]
        lea             rbx,    [rsi+rax*2]
        prefetcht0      [rbx]
        prefetcht0      [rbx+16]
        prefetcht0      [rbx+rax]
        prefetcht0      [rbx+rax+16]

        ; ... and of the reference block.
        prefetcht0      [rdi]
        prefetcht0      [rdi+16]
        prefetcht0      [rdi+rdx]
        prefetcht0      [rdi+rdx+16]
        lea             rbx,    [rdi+rdx*2]
        prefetcht0      [rbx]
        prefetcht0      [rbx+16]
        prefetcht0      [rbx+rdx]
        prefetcht0      [rbx+rdx+16]

        pxor        xmm6,           xmm6     ; squared-difference accumulators
        pxor        xmm7,           xmm7     ; difference accumulators
        mov         rcx,            8        ; 16 rows, two per iteration

.var16_row_pair:
        ; Request the two rows that the next iteration will consume.
        lea             rbx,    [rsi+rax*2]
        prefetcht0      [rbx]
        prefetcht0      [rbx+16]
        prefetcht0      [rbx+rax]
        prefetcht0      [rbx+rax+16]
        lea             rbx,    [rdi+rdx*2]
        prefetcht0      [rbx]
        prefetcht0      [rbx+16]
        prefetcht0      [rbx+rdx]
        prefetcht0      [rbx+rdx+16]

        ; Row 0, samples 0..7. The first difference vector seeds xmm5.
        movdqu      xmm1,           XMMWORD PTR [rsi]
        movdqu      xmm2,           XMMWORD PTR [rdi]
        psubw       xmm1,           xmm2
        movdqa      xmm5,           xmm1
        pmaddwd     xmm1,           xmm1     ; adjacent squares summed to dwords
        paddd       xmm6,           xmm1

        ; Row 0, samples 8..15.
        movdqu      xmm1,           XMMWORD PTR [rsi+16]
        movdqu      xmm2,           XMMWORD PTR [rdi+16]
        psubw       xmm1,           xmm2
        paddw       xmm5,           xmm1
        pmaddwd     xmm1,           xmm1
        paddd       xmm6,           xmm1

        ; Row 1, samples 0..7.
        movdqu      xmm1,           XMMWORD PTR [rsi+rax]
        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]
        psubw       xmm1,           xmm2
        paddw       xmm5,           xmm1
        pmaddwd     xmm1,           xmm1
        paddd       xmm6,           xmm1

        ; Row 1, samples 8..15.
        movdqu      xmm1,           XMMWORD PTR [rsi+rax+16]
        movdqu      xmm2,           XMMWORD PTR [rdi+rdx+16]
        psubw       xmm1,           xmm2
        paddw       xmm5,           xmm1
        pmaddwd     xmm1,           xmm1
        paddd       xmm6,           xmm1

        ; Sign-extend the eight word sums to dwords: xmm1 supplies the
        ; upper halves (all ones for negative words, zero otherwise).
        movdqa      xmm1,           xmm5
        psraw       xmm1,           15
        movdqa      xmm2,           xmm5
        punpcklwd   xmm5,           xmm1     ; words 0..3 -> dwords
        punpckhwd   xmm2,           xmm1     ; words 4..7 -> dwords
        paddd       xmm7,           xmm5
        paddd       xmm7,           xmm2

        ; Step both pointers down two rows.
        lea         rsi,            [rsi + 2*rax]
        lea         rdi,            [rdi + 2*rdx]
        dec         rcx
        jnz         .var16_row_pair

        ; Horizontal add of the four dword lanes of xmm6 and xmm7; only
        ; lane 0 of each is meaningful afterwards.
        movdqa      xmm4,           xmm6
        movdqa      xmm5,           xmm7
        psrldq      xmm4,           8
        psrldq      xmm5,           8
        paddd       xmm6,           xmm4     ; lane0 = d0+d2, lane1 = d1+d3
        paddd       xmm7,           xmm5

        movdqa      xmm4,           xmm6
        movdqa      xmm5,           xmm7
        psrldq      xmm4,           4
        psrldq      xmm5,           4
        paddd       xmm6,           xmm4     ; lane0 = d0+d1+d2+d3
        paddd       xmm7,           xmm5

        mov         rdi,            arg(4)   ; [SSE]
        mov         rax,            arg(5)   ; [Sum]

        movd DWORD PTR [rdi],       xmm6
        movd DWORD PTR [rax],       xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
170
171
;-----------------------------------------------------------------------
; aom_highbd_calc8x8var_sse2
;
; For an 8x8 block of 16-bit samples, accumulates
;     Sum = sum of (src - ref)
;     SSE = sum of (src - ref)^2
; and stores both as 32-bit values through the output pointers.
;
; Arguments (read through arg(n) after SHADOW_ARGS_TO_STACK 6):
;   arg(0)  src_ptr        first source sample
;   arg(1)  source_stride  source row pitch; doubled below to get bytes
;   arg(2)  ref_ptr        first reference sample
;   arg(3)  recon_stride   reference row pitch; doubled below to get bytes
;   arg(4)  SSE            receives the low 32 bits of the squared-diff total
;   arg(5)  Sum            receives the low 32 bits of the signed diff total
;
; Register roles inside the body:
;   rsi / rdi   current source / reference row
;   rax / rdx   source / reference row pitch in bytes
;   rbx         scratch address for prefetching
;   rcx         groups of four rows still to process
;   xmm5        word-wide difference sums of the current four rows
;   xmm6        four dword partial sums of squared differences
;   xmm7        four dword partial sums of differences
;
; NOTE(review): older comments described this as returning unsigned int
; and taking unsigned char pointers. The code never sets a return value
; (rax holds the Sum pointer on exit) and treats samples as 16-bit words;
; confirm against the C prototype.
; NOTE(review): each xmm5 lane adds four differences with wrapping paddw
; before it is widened, so inputs are presumed small enough that four
; differences fit in a signed word - confirm against the caller.
;-----------------------------------------------------------------------
global sym(aom_highbd_calc8x8var_sse2) PRIVATE
sym(aom_highbd_calc8x8var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

        ; Row pitches arrive as 32-bit signed values; widen, then scale by
        ; two because every sample occupies one 16-bit word.
        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
        shl         rax,            1        ; source pitch in bytes
        shl         rdx,            1        ; reference pitch in bytes

        mov         rsi,            arg(0) ;[src_ptr]
        mov         rdi,            arg(2) ;[ref_ptr]

        ; Warm the cache with the first four rows (16 bytes each) of the
        ; source block ...
        prefetcht0      [rsi]
        prefetcht0      [rsi+rax]
        lea             rbx,    [rsi+rax*2]
        prefetcht0      [rbx]
        prefetcht0      [rbx+rax]

        ; ... and of the reference block.
        prefetcht0      [rdi]
        prefetcht0      [rdi+rdx]
        lea             rbx,    [rdi+rdx*2]
        prefetcht0      [rbx]
        prefetcht0      [rbx+rdx]

        pxor        xmm6,           xmm6     ; squared-difference accumulators
        pxor        xmm7,           xmm7     ; difference accumulators
        mov         rcx,            2        ; 8 rows, four per iteration

.var8_row_quad:
        ; Request the four rows that the next iteration will consume
        ; (rows +4 .. +7 relative to the current row pointers).
        lea             rbx,    [rsi+rax*4]
        prefetcht0      [rbx]
        prefetcht0      [rbx+rax]
        lea             rbx,    [rbx+rax*2]
        prefetcht0      [rbx]
        prefetcht0      [rbx+rax]
        lea             rbx,    [rdi+rdx*4]
        prefetcht0      [rbx]
        prefetcht0      [rbx+rdx]
        lea             rbx,    [rbx+rdx*2]
        prefetcht0      [rbx]
        prefetcht0      [rbx+rdx]

        ; Row 0. The first difference vector seeds xmm5.
        movdqu      xmm1,           XMMWORD PTR [rsi]
        movdqu      xmm2,           XMMWORD PTR [rdi]
        psubw       xmm1,           xmm2
        movdqa      xmm5,           xmm1
        pmaddwd     xmm1,           xmm1     ; adjacent squares summed to dwords
        paddd       xmm6,           xmm1

        ; Row 1.
        movdqu      xmm1,           XMMWORD PTR [rsi+rax]
        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]
        psubw       xmm1,           xmm2
        paddw       xmm5,           xmm1
        pmaddwd     xmm1,           xmm1
        paddd       xmm6,           xmm1

        ; Move to rows 2 and 3 of this group.
        lea         rsi,            [rsi + 2*rax]
        lea         rdi,            [rdi + 2*rdx]

        ; Row 2.
        movdqu      xmm1,           XMMWORD PTR [rsi]
        movdqu      xmm2,           XMMWORD PTR [rdi]
        psubw       xmm1,           xmm2
        paddw       xmm5,           xmm1
        pmaddwd     xmm1,           xmm1
        paddd       xmm6,           xmm1

        ; Row 3.
        movdqu      xmm1,           XMMWORD PTR [rsi+rax]
        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]
        psubw       xmm1,           xmm2
        paddw       xmm5,           xmm1
        pmaddwd     xmm1,           xmm1
        paddd       xmm6,           xmm1

        ; Sign-extend the eight word sums to dwords: xmm1 supplies the
        ; upper halves (all ones for negative words, zero otherwise).
        movdqa      xmm1,           xmm5
        psraw       xmm1,           15
        movdqa      xmm2,           xmm5
        punpcklwd   xmm5,           xmm1     ; words 0..3 -> dwords
        punpckhwd   xmm2,           xmm1     ; words 4..7 -> dwords
        paddd       xmm7,           xmm5
        paddd       xmm7,           xmm2

        ; Step both pointers past rows 2 and 3.
        lea         rsi,            [rsi + 2*rax]
        lea         rdi,            [rdi + 2*rdx]
        dec         rcx
        jnz         .var8_row_quad

        ; Horizontal add of the four dword lanes of xmm6 and xmm7; only
        ; lane 0 of each is meaningful afterwards.
        movdqa      xmm4,           xmm6
        movdqa      xmm5,           xmm7
        psrldq      xmm4,           8
        psrldq      xmm5,           8
        paddd       xmm6,           xmm4     ; lane0 = d0+d2, lane1 = d1+d3
        paddd       xmm7,           xmm5

        movdqa      xmm4,           xmm6
        movdqa      xmm5,           xmm7
        psrldq      xmm4,           4
        psrldq      xmm5,           4
        paddd       xmm6,           xmm4     ; lane0 = d0+d1+d2+d3
        paddd       xmm7,           xmm5

        mov         rdi,            arg(4)   ; [SSE]
        mov         rax,            arg(5)   ; [Sum]

        movd DWORD PTR [rdi],       xmm6
        movd DWORD PTR [rax],       xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
319