;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

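; HIGH_SAD_FN: shared prologue for the SAD kernels below.
;   %1 = block width, %2 = block height
;   %3 = general-purpose register count (7 also sets up the *_stride3 regs)
;   %4 = 0 for plain SAD, 1 for the _avg variant taking a second predictor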
%macro HIGH_SAD_FN 4
%if %4 == 0
%if %3 == 5
cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%else ; avg
%if %3 == 5
cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
%if VPX_ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; avg/sad
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea         src_stride3q, [src_strideq*3]
  lea         ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
; convert src, ref & second_pred to short ptrs (from byte ptrs)
  shl                 srcq, 1
  shl                 refq, 1
%if %4 == 1
  shl         second_predq, 1
%endif
%endmacro

; unsigned int vpx_highbd_sad64x{32,64}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
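;
; Roughly, each SAD kernel below computes the following C sketch (illustrative
; only; the real prototypes take nominal uint8_t pointers that callers treat
; as uint16_t data via libvpx's CONVERT_TO_SHORTPTR, and `width` here stands
; for the block width baked into each macro):
;
;   uint32_t sad = 0;
;   for (row = 0; row < height; ++row, src += src_stride, ref += ref_stride)
;     for (col = 0; col < width; ++col)
;       sad += abs(src[col] - ref[col]);
;   return sad;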
%macro HIGH_SAD64XN 1-2 0
  HIGH_SAD_FN 64, %1, 5, %2
  mov              n_rowsd, %1
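  ; one iteration per row; each 64-sample (128-byte) row is processed in
  ; two halves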
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  ; first half of each row
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
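  ; _avg variant: average ref with the second predictor first;
  ; pavgw computes (a + b + 1) >> 1 per word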
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
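  ; absolute differences via unsigned saturation: psubusw clamps negative
  ; results to zero, so (src -us ref) | (ref -us src) == |src - ref|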
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+16]
  psubusw               m5, m2
  psubusw               m2, [srcq+16]
  por                   m2, m5
  mova                  m5, [srcq+32]
  psubusw               m5, m3
  psubusw               m3, [srcq+32]
  por                   m3, m5
  mova                  m5, [srcq+48]
  psubusw               m5, m4
  psubusw               m4, [srcq+48]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
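  ; widen the word partial sums to dwords against zero (m6) so the running
  ; total in m0 cannot overflow 16 bits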
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  paddd                 m0, m1
  paddd                 m0, m3
  ; second half of each row
  movu                  m1, [refq+64]
  movu                  m2, [refq+80]
  movu                  m3, [refq+96]
  movu                  m4, [refq+112]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq+64]
  psubusw               m5, m1
  psubusw               m1, [srcq+64]
  por                   m1, m5
  mova                  m5, [srcq+80]
  psubusw               m5, m2
  psubusw               m2, [srcq+80]
  por                   m2, m5
  mova                  m5, [srcq+96]
  psubusw               m5, m3
  psubusw               m3, [srcq+96]
  por                   m3, m5
  mova                  m5, [srcq+112]
  psubusw               m5, m4
  psubusw               m4, [srcq+112]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
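  ; strides count 16-bit samples, so step the byte pointers by stride*2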
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3

  dec              n_rowsd
  jg .loop

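  ; horizontal reduction: fold the four dword sums in m0 into one scalar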
  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2


; unsigned int vpx_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
%macro HIGH_SAD32XN 1-2 0
  HIGH_SAD_FN 32, %1, 5, %2
  mov              n_rowsd, %1
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+16]
  psubusw               m5, m2
  psubusw               m2, [srcq+16]
  por                   m2, m5
  mova                  m5, [srcq+32]
  psubusw               m5, m3
  psubusw               m3, [srcq+32]
  por                   m3, m5
  mova                  m5, [srcq+48]
  psubusw               m5, m4
  psubusw               m4, [srcq+48]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2

; unsigned int vpx_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
%macro HIGH_SAD16XN 1-2 0
  HIGH_SAD_FN 16, %1, 5, %2
  mov              n_rowsd, %1/2
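  ; the loop below processes two rows per iteration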
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+ref_strideq*2]
  movu                  m4, [refq+ref_strideq*2+16]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+16]
  psubusw               m5, m2
  psubusw               m2, [srcq+16]
  por                   m2, m5
  mova                  m5, [srcq+src_strideq*2]
  psubusw               m5, m3
  psubusw               m3, [srcq+src_strideq*2]
  por                   m3, m5
  mova                  m5, [srcq+src_strideq*2+16]
  psubusw               m5, m4
  psubusw               m4, [srcq+src_strideq*2+16]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
HIGH_SAD16XN  8 ; highbd_sad16x8_sse2
HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
HIGH_SAD16XN  8, 1 ; highbd_sad16x8_avg_sse2


; unsigned int vpx_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
%macro HIGH_SAD8XN 1-2 0
  HIGH_SAD_FN 8, %1, 7, %2
  mov              n_rowsd, %1/4
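  ; the loop below processes four rows per iteration, using the *_stride3
  ; registers set up by HIGH_SAD_FN to reach row 3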
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+ref_strideq*2]
  movu                  m3, [refq+ref_strideq*4]
  movu                  m4, [refq+ref_stride3q*2]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+src_strideq*2]
  psubusw               m5, m2
  psubusw               m2, [srcq+src_strideq*2]
  por                   m2, m5
  mova                  m5, [srcq+src_strideq*4]
  psubusw               m5, m3
  psubusw               m3, [srcq+src_strideq*4]
  por                   m3, m5
  mova                  m5, [srcq+src_stride3q*2]
  psubusw               m5, m4
  psubusw               m4, [srcq+src_stride3q*2]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  lea                 refq, [refq+ref_strideq*8]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*8]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
HIGH_SAD8XN  8 ; highbd_sad8x8_sse2
HIGH_SAD8XN  4 ; highbd_sad8x4_sse2
HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
HIGH_SAD8XN  8, 1 ; highbd_sad8x8_avg_sse2
HIGH_SAD8XN  4, 1 ; highbd_sad8x4_avg_sse2