1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
; SAD16_ROW_X8 dst, src, ref
; Compares one 16-byte source row with eight horizontally adjacent
; reference positions.
;   %1  xmm register that receives eight 16-bit sums; word i is the SAD of
;       the source row against the reference row starting at byte i
;   %2  address expression of the source row (16 bytes, loaded with movdqa)
;   %3  address expression of the reference row (bytes 0..23 are read)
; Clobbers xmm0, xmm2, xmm3 and xmm4, so %1 must be none of those.
%macro SAD16_ROW_X8 3
        movdqa          xmm0,       XMMWORD PTR [%2]
        movq            %1,         MMWORD PTR [%3]
        movq            xmm3,       MMWORD PTR [%3+8]
        movq            xmm2,       MMWORD PTR [%3+16]
        punpcklqdq      %1,         xmm3            ; ref bytes 0..15
        punpcklqdq      xmm3,       xmm2            ; ref bytes 8..23

        ; mpsadbw imm8: bits 1:0 pick the 4-byte source group, bit 2 picks
        ; the starting offset (0 or 4) inside the destination operand.
        movdqa          xmm2,       %1
        mpsadbw         %1,         xmm0,  0x0      ; source bytes 0..3
        mpsadbw         xmm2,       xmm0,  0x5      ; source bytes 4..7

        psrldq          xmm0,       8               ; bring source bytes 8..15 down

        movdqa          xmm4,       xmm3
        mpsadbw         xmm3,       xmm0,  0x0      ; source bytes 8..11
        mpsadbw         xmm4,       xmm0,  0x5      ; source bytes 12..15

        paddw           %1,         xmm2
        paddw           %1,         xmm3
        paddw           %1,         xmm4
%endmacro

; PROCESS_16X2X8 first
; Handles two consecutive 16-byte-wide rows and folds their eight SADs into
; the 16-bit accumulators held in xmm1.
;   %1  non-zero: xmm1 is (re)started from the first row
;       zero:     both rows are added to the sums already in xmm1
; Expects rsi = source row, rdi = reference row, rax = source stride,
; rdx = reference stride. On exit rsi and rdi have advanced by two rows.
; Clobbers xmm0, xmm2, xmm3, xmm4 and xmm5.
%macro PROCESS_16X2X8 1
%if %1
        SAD16_ROW_X8    xmm1, rsi, rdi
%else
        SAD16_ROW_X8    xmm5, rsi, rdi
        paddw           xmm1,       xmm5
%endif
        SAD16_ROW_X8    xmm5, rsi + rax, rdi + rdx
        paddw           xmm1,       xmm5

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]
%endmacro
85
; SAD8_ROW_X8 dst, src, ref
; Compares one 8-byte source row with eight horizontally adjacent
; reference positions.
;   %1  xmm register that receives eight 16-bit sums; word i is the SAD of
;       the source row against the reference row starting at byte i
;   %2  address expression of the source row (8 bytes are read)
;   %3  address expression of the reference row (16 bytes are read)
; Clobbers xmm0, xmm2 and xmm3, so %1 must be none of those.
%macro SAD8_ROW_X8 3
        movq            xmm0,       MMWORD PTR [%2]
        movq            %1,         MMWORD PTR [%3]
        movq            xmm3,       MMWORD PTR [%3+8]
        punpcklqdq      %1,         xmm3            ; ref bytes 0..15

        ; mpsadbw imm8: bits 1:0 pick the 4-byte source group, bit 2 picks
        ; the starting offset (0 or 4) inside the destination operand.
        movdqa          xmm2,       %1
        mpsadbw         %1,         xmm0,  0x0      ; source bytes 0..3
        mpsadbw         xmm2,       xmm0,  0x5      ; source bytes 4..7
        paddw           %1,         xmm2
%endmacro

; PROCESS_8X2X8 first
; Handles two consecutive 8-byte-wide rows and folds their eight SADs into
; the 16-bit accumulators held in xmm1.
;   %1  non-zero: xmm1 is (re)started from the first row
;       zero:     both rows are added to the sums already in xmm1
; Expects rsi = source row, rdi = reference row, rax = source stride,
; rdx = reference stride. On exit rsi and rdi have advanced by two rows.
; Clobbers xmm0, xmm2, xmm3 and xmm5.
%macro PROCESS_8X2X8 1
%if %1
        SAD8_ROW_X8     xmm1, rsi, rdi
%else
        SAD8_ROW_X8     xmm5, rsi, rdi
        paddw           xmm1,       xmm5
%endif
        SAD8_ROW_X8     xmm5, rsi + rax, rdi + rdx
        paddw           xmm1,       xmm5

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]
%endmacro
125
; SAD4_ROW_X8 dst, src, ref
; Compares one 4-byte source row with eight horizontally adjacent
; reference positions.
;   %1  xmm register that receives eight 16-bit sums; word i is the SAD of
;       the source row against the reference row starting at byte i
;   %2  address expression of the source row (4 bytes are read)
;   %3  address expression of the reference row (16 bytes are read)
; Clobbers xmm0 and xmm3, so %1 must be neither of those.
%macro SAD4_ROW_X8 3
        movd            xmm0,       [%2]
        movq            %1,         MMWORD PTR [%3]
        movq            xmm3,       MMWORD PTR [%3+8]
        punpcklqdq      %1,         xmm3            ; ref bytes 0..15

        mpsadbw         %1,         xmm0,  0x0      ; source bytes 0..3
%endmacro

; PROCESS_4X2X8 first
; Handles two consecutive 4-byte-wide rows and folds their eight SADs into
; the 16-bit accumulators held in xmm1.
;   %1  non-zero: xmm1 is (re)started from the first row
;       zero:     both rows are added to the sums already in xmm1
; Expects rsi = source row, rdi = reference row, rax = source stride,
; rdx = reference stride. On exit rsi and rdi have advanced by two rows.
; Clobbers xmm0, xmm3 and xmm5.
%macro PROCESS_4X2X8 1
%if %1
        SAD4_ROW_X8     xmm1, rsi, rdi
%else
        SAD4_ROW_X8     xmm5, rsi, rdi
        paddw           xmm1,       xmm5
%endif
        SAD4_ROW_X8     xmm5, rsi + rax, rdi + rdx
        paddw           xmm1,       xmm5

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]
%endmacro
156
; WRITE_AS_INTS
; Zero-extends the eight 16-bit sums in xmm1 to 32 bits each and stores the
; resulting 32 bytes at the address passed as arg(4).
; Both stores use movdqa, so that address must be 16-byte aligned.
; Overwrites rdi, xmm0, xmm1 and xmm2.
%macro WRITE_AS_INTS 0
    pxor            xmm2, xmm2                   ; zero source for the widening
    movdqa          xmm0, xmm1                   ; keep a copy for words 4..7
    mov             rdi,        arg(4)           ;Results

    punpckhwd       xmm0, xmm2                   ; words 4..7 -> dwords
    punpcklwd       xmm1, xmm2                   ; words 0..3 -> dwords

    movdqa          [rdi],    xmm1
    movdqa          [rdi + 16],    xmm0
%endmacro
167
168SECTION .text
169
;void vpx_sad16x16x8_sse4_1(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned int *sad_array);
;
; Sums absolute differences of a 16-wide, 16-row source block against the
; eight reference blocks starting at ref_ptr + 0 .. ref_ptr + 7.
; sad_array receives eight 32-bit results (32 bytes, written by
; WRITE_AS_INTS with movdqa, so it must be 16-byte aligned).
; Source rows are loaded with movdqa inside PROCESS_16X2X8, so every source
; row address must be 16-byte aligned. 24 bytes of each reference row are read.
; The running sums are 16-bit; the largest possible total, 16*16*255 = 65280,
; still fits.
global sym(vpx_sad16x16x8_sse4_1) PRIVATE
sym(vpx_sad16x16x8_sse4_1):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov             rdi,        arg(2)           ;ref_ptr
    mov             rsi,        arg(0)           ;src_ptr

    movsxd          rdx,        dword ptr arg(3) ;ref_stride
    movsxd          rax,        dword ptr arg(1) ;src_stride

    ; rows 0-1 start the accumulator, rows 2-15 add to it
    PROCESS_16X2X8 1
%rep 7
    PROCESS_16X2X8 0
%endrep

    WRITE_AS_INTS

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
208
209
;void vpx_sad16x8x8_sse4_1(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned int *sad_array
;);
;
; Sums absolute differences of a 16-wide, 8-row source block against the
; eight reference blocks starting at ref_ptr + 0 .. ref_ptr + 7.
; sad_array receives eight 32-bit results (32 bytes, written by
; WRITE_AS_INTS with movdqa, so it must be 16-byte aligned).
; Source rows are loaded with movdqa inside PROCESS_16X2X8, so every source
; row address must be 16-byte aligned. 24 bytes of each reference row are read.
global sym(vpx_sad16x8x8_sse4_1) PRIVATE
sym(vpx_sad16x8x8_sse4_1):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov             rdi,        arg(2)           ;ref_ptr
    mov             rsi,        arg(0)           ;src_ptr

    movsxd          rdx,        dword ptr arg(3) ;ref_stride
    movsxd          rax,        dword ptr arg(1) ;src_stride

    ; rows 0-1 start the accumulator, rows 2-7 add to it
    PROCESS_16X2X8 1
%rep 3
    PROCESS_16X2X8 0
%endrep

    WRITE_AS_INTS

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
245
246
;void vpx_sad8x8x8_sse4_1(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned int *sad_array
;);
;
; Sums absolute differences of an 8-wide, 8-row source block against the
; eight reference blocks starting at ref_ptr + 0 .. ref_ptr + 7.
; sad_array receives eight 32-bit results (32 bytes, written by
; WRITE_AS_INTS with movdqa, so it must be 16-byte aligned).
; 16 bytes of each reference row are read.
global sym(vpx_sad8x8x8_sse4_1) PRIVATE
sym(vpx_sad8x8x8_sse4_1):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov             rdi,        arg(2)           ;ref_ptr
    mov             rsi,        arg(0)           ;src_ptr

    movsxd          rdx,        dword ptr arg(3) ;ref_stride
    movsxd          rax,        dword ptr arg(1) ;src_stride

    ; rows 0-1 start the accumulator, rows 2-7 add to it
    PROCESS_8X2X8 1
%rep 3
    PROCESS_8X2X8 0
%endrep

    WRITE_AS_INTS

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
282
283
;void vpx_sad8x16x8_sse4_1(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned int *sad_array
;);
;
; Sums absolute differences of an 8-wide, 16-row source block against the
; eight reference blocks starting at ref_ptr + 0 .. ref_ptr + 7.
; sad_array receives eight 32-bit results (32 bytes, written by
; WRITE_AS_INTS with movdqa, so it must be 16-byte aligned).
; 16 bytes of each reference row are read.
global sym(vpx_sad8x16x8_sse4_1) PRIVATE
sym(vpx_sad8x16x8_sse4_1):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov             rdi,        arg(2)           ;ref_ptr
    mov             rsi,        arg(0)           ;src_ptr

    movsxd          rdx,        dword ptr arg(3) ;ref_stride
    movsxd          rax,        dword ptr arg(1) ;src_stride

    ; rows 0-1 start the accumulator, rows 2-15 add to it
    PROCESS_8X2X8 1
%rep 7
    PROCESS_8X2X8 0
%endrep

    WRITE_AS_INTS

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
323
324
;void vpx_sad4x4x8_sse4_1(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned int *sad_array
;);
;
; Sums absolute differences of a 4-wide, 4-row source block against the
; eight reference blocks starting at ref_ptr + 0 .. ref_ptr + 7.
; sad_array receives eight 32-bit results (32 bytes, written by
; WRITE_AS_INTS with movdqa, so it must be 16-byte aligned).
; 16 bytes of each reference row are read.
global sym(vpx_sad4x4x8_sse4_1) PRIVATE
sym(vpx_sad4x4x8_sse4_1):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov             rdi,        arg(2)           ;ref_ptr
    mov             rsi,        arg(0)           ;src_ptr

    movsxd          rdx,        dword ptr arg(3) ;ref_stride
    movsxd          rax,        dword ptr arg(1) ;src_stride

    PROCESS_4X2X8 1                              ; rows 0-1 start the accumulator
    PROCESS_4X2X8 0                              ; rows 2-3 add to it

    WRITE_AS_INTS

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
358
359
360
361
362