;******************************************************************************
;* Pixel utilities SIMD
;*
;* Copyright (C) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (C) 2014 Clément Bœsch <u pkh me>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86util.asm"

SECTION .text

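; All routines below return the sum of absolute differences (SAD) between two
; blocks of w x h pixels (8x8, 16x16 or 32x32). A rough C equivalent, as an
; illustrative sketch only (not part of the build):
;
;     int sad_wxh(const uint8_t *src1, ptrdiff_t stride1,
;                 const uint8_t *src2, ptrdiff_t stride2)
;     {
;         int x, y, sum = 0;
;         for (y = 0; y < h; y++) {
;             for (x = 0; x < w; x++)
;                 sum += abs(src1[x] - src2[x]);
;             src1 += stride1;
;             src2 += stride2;
;         }
;         return sum;
;     }
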
;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_8x8_mmx(const uint8_t *src1, ptrdiff_t stride1,
;                               const uint8_t *src2, ptrdiff_t stride2);
;-------------------------------------------------------------------------------
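; Plain MMX has no psadbw, so the absolute difference is built by hand:
; |a - b| == (a - b) | (b - a) with unsigned saturating subtractions, then the
; bytes are widened against a zero register (m7) and accumulated as words in
; m6. Two rows are processed per %rep iteration.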
INIT_MMX mmx
cglobal pixelutils_sad_8x8, 4,4,0, src1, stride1, src2, stride2
    pxor        m7, m7
    pxor        m6, m6
%rep 4
    mova        m0, [src1q]
    mova        m2, [src1q + stride1q]
    mova        m1, [src2q]
    mova        m3, [src2q + stride2q]
    psubusb     m4, m0, m1
    psubusb     m5, m2, m3
    psubusb     m1, m0
    psubusb     m3, m2
    por         m1, m4
    por         m3, m5
    punpcklbw   m0, m1, m7
    punpcklbw   m2, m3, m7
    punpckhbw   m1, m7
    punpckhbw   m3, m7
    paddw       m0, m1
    paddw       m2, m3
    paddw       m0, m2
    paddw       m6, m0
    lea         src1q, [src1q + 2*stride1q]
    lea         src2q, [src2q + 2*stride2q]
%endrep
    psrlq       m0, m6, 32
    paddw       m6, m0
    psrlq       m0, m6, 16
    paddw       m6, m0
    movd        eax, m6
    movzx       eax, ax
    RET

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_8x8_mmxext(const uint8_t *src1, ptrdiff_t stride1,
;                                  const uint8_t *src2, ptrdiff_t stride2);
;-------------------------------------------------------------------------------
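; psadbw computes the SAD of 8 byte pairs directly, leaving the 16-bit sum in
; the low word of the mm register; two rows are accumulated into m2 per
; iteration and the result is read out with a single movd.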
INIT_MMX mmxext
cglobal pixelutils_sad_8x8, 4,4,0, src1, stride1, src2, stride2
    pxor        m2, m2
%rep 4
    mova        m0, [src1q]
    mova        m1, [src1q + stride1q]
    psadbw      m0, [src2q]
    psadbw      m1, [src2q + stride2q]
    paddw       m2, m0
    paddw       m2, m1
    lea         src1q, [src1q + 2*stride1q]
    lea         src2q, [src2q + 2*stride2q]
%endrep
    movd        eax, m2
    RET

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_16x16_mmxext(const uint8_t *src1, ptrdiff_t stride1,
;                                    const uint8_t *src2, ptrdiff_t stride2);
;-------------------------------------------------------------------------------
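; Each 16-pixel row is handled as two 8-byte psadbw operations ([src] and
; [src + 8]); the loop walks one row per iteration for all 16 rows.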
INIT_MMX mmxext
cglobal pixelutils_sad_16x16, 4,4,0, src1, stride1, src2, stride2
    pxor        m2, m2
%rep 16
    mova        m0, [src1q]
    mova        m1, [src1q + 8]
    psadbw      m0, [src2q]
    psadbw      m1, [src2q + 8]
    paddw       m2, m0
    paddw       m2, m1
    add         src1q, stride1q
    add         src2q, stride2q
%endrep
    movd        eax, m2
    RET

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
;                                  const uint8_t *src2, ptrdiff_t stride2);
;-------------------------------------------------------------------------------
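; One unaligned XMM load covers a whole 16-pixel row; psadbw leaves two partial
; sums, one in the low word of each 64-bit half of the register, so the final
; movhlps + paddw folds the high half onto the low one before movd.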
INIT_XMM sse2
cglobal pixelutils_sad_16x16, 4,4,5, src1, stride1, src2, stride2
    movu        m4, [src1q]
    movu        m2, [src2q]
    movu        m1, [src1q + stride1q]
    movu        m3, [src2q + stride2q]
    psadbw      m4, m2
    psadbw      m1, m3
    paddw       m4, m1
%rep 7
    lea         src1q, [src1q + 2*stride1q]
    lea         src2q, [src2q + 2*stride2q]
    movu        m0, [src1q]
    movu        m2, [src2q]
    movu        m1, [src1q + stride1q]
    movu        m3, [src2q + stride2q]
    psadbw      m0, m2
    psadbw      m1, m3
    paddw       m4, m0
    paddw       m4, m1
%endrep
    movhlps     m0, m4
    paddw       m4, m0
    movd        eax, m4
    RET

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_[au]_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
;                                       const uint8_t *src2, ptrdiff_t stride2);
;-------------------------------------------------------------------------------
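; %1 is the alignment suffix: mov%1 expands to mova (aligned) or movu
; (unaligned) for the src2 loads, while src1 is consumed directly as the
; memory operand of psadbw and is therefore expected to be 16-byte aligned in
; both variants.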
%macro SAD_XMM_16x16 1
INIT_XMM sse2
cglobal pixelutils_sad_%1_16x16, 4,4,3, src1, stride1, src2, stride2
    mov%1       m2, [src2q]
    psadbw      m2, [src1q]
    mov%1       m1, [src2q + stride2q]
    psadbw      m1, [src1q + stride1q]
    paddw       m2, m1
%rep 7
    lea         src1q, [src1q + 2*stride1q]
    lea         src2q, [src2q + 2*stride2q]
    mov%1       m0, [src2q]
    psadbw      m0, [src1q]
    mov%1       m1, [src2q + stride2q]
    psadbw      m1, [src1q + stride1q]
    paddw       m2, m0
    paddw       m2, m1
%endrep
    movhlps     m0, m2
    paddw       m2, m0
    movd        eax, m2
    RET
%endmacro

SAD_XMM_16x16 a
SAD_XMM_16x16 u

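; Accumulate the SAD of four 32-pixel rows into m0 using unaligned loads.
; Register layout matches the cglobal declarations below:
; r0 = src1, r1 = stride1, r2 = src2, r3 = stride2.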
%macro PROCESS_SAD_32x4_U 0
    movu    m1,  [r2]
    movu    m2,  [r2 + 16]
    movu    m3,  [r0]
    movu    m4,  [r0 + 16]
    psadbw  m1,  m3
    psadbw  m2,  m4
    paddd   m1,  m2
    paddd   m0,  m1
    lea     r2,  [r2 + r3]
    lea     r0,  [r0 + r1]

    movu    m1,  [r2]
    movu    m2,  [r2 + 16]
    movu    m3,  [r0]
    movu    m4,  [r0 + 16]
    psadbw  m1,  m3
    psadbw  m2,  m4
    paddd   m1,  m2
    paddd   m0,  m1
    lea     r2,  [r2 + r3]
    lea     r0,  [r0 + r1]

    movu    m1,  [r2]
    movu    m2,  [r2 + 16]
    movu    m3,  [r0]
    movu    m4,  [r0 + 16]
    psadbw  m1,  m3
    psadbw  m2,  m4
    paddd   m1,  m2
    paddd   m0,  m1
    lea     r2,  [r2 + r3]
    lea     r0,  [r0 + r1]

    movu    m1,  [r2]
    movu    m2,  [r2 + 16]
    movu    m3,  [r0]
    movu    m4,  [r0 + 16]
    psadbw  m1,  m3
    psadbw  m2,  m4
    paddd   m1,  m2
    paddd   m0,  m1
    lea     r2,  [r2 + r3]
    lea     r0,  [r0 + r1]
%endmacro

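; Same as above, but src2 is loaded with mov%1 (mova or movu depending on the
; alignment suffix) and src1 is used as the aligned memory operand of psadbw.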
%macro PROCESS_SAD_32x4 1
    mov%1   m1,  [r2]
    mov%1   m2,  [r2 + 16]
    psadbw  m1,  [r0]
    psadbw  m2,  [r0 + 16]
    paddd   m1,  m2
    paddd   m0,  m1
    lea     r2,  [r2 + r3]
    lea     r0,  [r0 + r1]

    mov%1   m1,  [r2]
    mov%1   m2,  [r2 + 16]
    psadbw  m1,  [r0]
    psadbw  m2,  [r0 + 16]
    paddd   m1,  m2
    paddd   m0,  m1
    lea     r2,  [r2 + r3]
    lea     r0,  [r0 + r1]

    mov%1   m1,  [r2]
    mov%1   m2,  [r2 + 16]
    psadbw  m1,  [r0]
    psadbw  m2,  [r0 + 16]
    paddd   m1,  m2
    paddd   m0,  m1
    lea     r2,  [r2 + r3]
    lea     r0,  [r0 + r1]

    mov%1   m1,  [r2]
    mov%1   m2,  [r2 + 16]
    psadbw  m1,  [r0]
    psadbw  m2,  [r0 + 16]
    paddd   m1,  m2
    paddd   m0,  m1
    lea     r2,  [r2 + r3]
    lea     r0,  [r0 + r1]
%endmacro

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
;                                  const uint8_t *src2, ptrdiff_t stride2);
;-------------------------------------------------------------------------------
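; 4 loop iterations x 2 macro calls x 4 rows per call = 32 rows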
INIT_XMM sse2
cglobal pixelutils_sad_32x32, 4,5,5, src1, stride1, src2, stride2
    pxor  m0,  m0
    mov   r4d, 4
.loop:
    PROCESS_SAD_32x4_U
    PROCESS_SAD_32x4_U
    dec r4d
    jnz .loop

    movhlps m1,  m0
    paddd   m0,  m1
    movd    eax, m0
    RET

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_[au]_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
;                                       const uint8_t *src2, ptrdiff_t stride2);
;-------------------------------------------------------------------------------
%macro SAD_XMM_32x32 1
INIT_XMM sse2
cglobal pixelutils_sad_%1_32x32, 4,5,3, src1, stride1, src2, stride2
    pxor  m0,  m0
    mov   r4d, 4
.loop:
    PROCESS_SAD_32x4 %1
    PROCESS_SAD_32x4 %1
    dec r4d
    jnz .loop

    movhlps m1,  m0
    paddd   m0,  m1
    movd    eax, m0
    RET
%endmacro

SAD_XMM_32x32 a
SAD_XMM_32x32 u

%if HAVE_AVX2_EXTERNAL
;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
;                                  const uint8_t *src2, ptrdiff_t stride2);
;-------------------------------------------------------------------------------
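; A single YMM load covers a full 32-pixel row. Four rows are processed per
; iteration, with r5/r6 holding 3*stride1/3*stride2 so that row 3 can be
; addressed directly; the qword partial sums in m0 are folded 256 -> 128 -> 64
; bits at the end (vextracti128 + pshufd).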
INIT_YMM avx2
cglobal pixelutils_sad_32x32, 4,7,5, src1, stride1, src2, stride2
    pxor            m0, m0
    mov             r4d, 32/4
    lea             r5, [stride1q * 3]
    lea             r6, [stride2q * 3]

.loop:
    movu           m1, [src1q]               ; row 0 of pix0
    movu           m2, [src2q]               ; row 0 of pix1
    movu           m3, [src1q + stride1q]    ; row 1 of pix0
    movu           m4, [src2q + stride2q]    ; row 1 of pix1

    psadbw         m1, m2
    psadbw         m3, m4
    paddd          m0, m1
    paddd          m0, m3

    movu           m1, [src1q + 2 * stride1q] ; row 2 of pix0
    movu           m2, [src2q + 2 * stride2q] ; row 2 of pix1
    movu           m3, [src1q + r5]           ; row 3 of pix0
    movu           m4, [src2q + r6]           ; row 3 of pix1

    psadbw         m1, m2
    psadbw         m3, m4
    paddd          m0, m1
    paddd          m0, m3

    lea            src2q,     [src2q + 4 * stride2q]
    lea            src1q,     [src1q + 4 * stride1q]

    dec            r4d
    jnz           .loop

    vextracti128   xm1, m0, 1
    paddd          xm0, xm1
    pshufd         xm1, xm0, 2
    paddd          xm0, xm1
    movd           eax, xm0
    RET

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_[au]_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
;                                       const uint8_t *src2, ptrdiff_t stride2);
;-------------------------------------------------------------------------------
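; Same loop structure as the plain AVX2 version above, except that src2 is
; loaded with mov%1 (mova or movu depending on the alignment suffix) and src1
; is consumed directly as a psadbw memory operand.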
%macro SAD_AVX2_32x32 1
INIT_YMM avx2
cglobal pixelutils_sad_%1_32x32, 4,7,3, src1, stride1, src2, stride2
    pxor           m0, m0
    mov            r4d, 32/4
    lea            r5, [stride1q * 3]
    lea            r6, [stride2q * 3]

.loop:
    mov%1          m1, [src2q]                ; row 0 of pix1
    psadbw         m1, [src1q]
    mov%1          m2, [src2q + stride2q]     ; row 1 of pix1
    psadbw         m2, [src1q + stride1q]

    paddd          m0, m1
    paddd          m0, m2

    mov%1          m1, [src2q + 2 * stride2q] ; row 2 of pix1
    psadbw         m1, [src1q + 2 * stride1q]
    mov%1          m2, [src2q + r6]           ; row 3 of pix1
    psadbw         m2, [src1q + r5]

    paddd          m0, m1
    paddd          m0, m2

    lea            src2q,     [src2q + 4 * stride2q]
    lea            src1q,     [src1q + 4 * stride1q]

    dec            r4d
    jnz           .loop

    vextracti128   xm1, m0, 1
    paddd          xm0, xm1
    pshufd         xm1, xm0, 2
    paddd          xm0, xm1
    movd           eax, xm0
    RET
%endmacro

SAD_AVX2_32x32 a
SAD_AVX2_32x32 u
%endif