;*****************************************************************************
;* SIMD-optimized motion estimation
;*****************************************************************************
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pb_1
cextern pb_80

SECTION .text

%macro DIFF_PIXELS_1 4
    movh            %1, %3
    movh            %2, %4
    punpcklbw       %2, %1
    punpcklbw       %1, %1
    psubw           %1, %2
%endmacro
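
; A hedged C model of what DIFF_PIXELS_1 computes (illustrative reference
; only, not part of the build; the _ref name is ours). After the two
; punpcklbw, each word of %1 is (p1 << 8) | p1 and each word of %2 is
; (p1 << 8) | p2; the high bytes are equal, so the 16-bit subtraction
; (borrow included) leaves exactly the signed difference p1 - p2:
;
;     static void diff_pixels_1_ref(int16_t *dst, const uint8_t *pix1,
;                                   const uint8_t *pix2, int n)
;     {
;         for (int i = 0; i < n; i++)
;             dst[i] = pix1[i] - pix2[i]; /* one word lane per pixel pair */
;     }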

; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
; %6=temporary storage location
; this macro requires mmsize bytes of aligned stack space at %6
; (except on SSE+x86-64, where a spare register is used instead)
%macro DIFF_PIXELS_8 6
    DIFF_PIXELS_1   m0, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m1, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    add             %1, %5
    add             %2, %5
    DIFF_PIXELS_1   m3, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m4, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    DIFF_PIXELS_1   m6, m7, [%1+%5  +%3], [%2+%5  +%3]
%ifdef m8
    DIFF_PIXELS_1   m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
%else
    mova          [%6], m0
    DIFF_PIXELS_1   m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
    mova            m0, [%6]
%endif
    sub             %1, %5
    sub             %2, %5
%endmacro

; 8-point Hadamard transform: each SUMSUB_BADC performs two sum/difference
; butterflies; the three rounds pair registers at strides 1, 2 and 4.
%macro HADAMARD8 0
    SUMSUB_BADC       w, 0, 1, 2, 3
    SUMSUB_BADC       w, 4, 5, 6, 7
    SUMSUB_BADC       w, 0, 2, 1, 3
    SUMSUB_BADC       w, 4, 6, 5, 7
    SUMSUB_BADC       w, 0, 4, 1, 5
    SUMSUB_BADC       w, 2, 6, 3, 7
%endmacro
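
; Hedged C sketch of the same 1-D transform applied to one 8-element column
; (illustrative reference only, not part of the build):
;
;     static void hadamard8_1d(int16_t v[8])
;     {
;         for (int stride = 1; stride < 8; stride *= 2)    /* 3 rounds */
;             for (int i = 0; i < 8; i += 2 * stride)
;                 for (int j = i; j < i + stride; j++) {
;                     int16_t a = v[j], b = v[j + stride];
;                     v[j]          = a + b;               /* sum        */
;                     v[j + stride] = a - b;               /* difference */
;                 }
;     }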

%macro ABS1_SUM 3
    ABS1            %1, %2
    paddusw         %3, %1
%endmacro

%macro ABS2_SUM 6
    ABS2            %1, %2, %3, %4
    paddusw         %5, %1
    paddusw         %6, %2
%endmacro

%macro ABS_SUM_8x8_64 1
    ABS2            m0, m1, m8, m9
    ABS2_SUM        m2, m3, m8, m9, m0, m1
    ABS2_SUM        m4, m5, m8, m9, m0, m1
    ABS2_SUM        m6, m7, m8, m9, m0, m1
    paddusw         m0, m1
%endmacro

%macro ABS_SUM_8x8_32 1
    mova          [%1], m7
    ABS1            m0, m7
    ABS1            m1, m7
    ABS1_SUM        m2, m7, m0
    ABS1_SUM        m3, m7, m1
    ABS1_SUM        m4, m7, m0
    ABS1_SUM        m5, m7, m1
    ABS1_SUM        m6, m7, m0
    mova            m2, [%1]
    ABS1_SUM        m2, m7, m1
    paddusw         m0, m1
%endmacro

; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can reach
; about 100k on extreme inputs. But that's very unlikely to occur in natural
; video, and it's even less likely that no alternative mvs/modes with lower
; cost exist.
%macro HSUM 3
%if cpuflag(sse2)
    movhlps         %2, %1
    paddusw         %1, %2
    pshuflw         %2, %1, 0xE
    paddusw         %1, %2
    pshuflw         %2, %1, 0x1
    paddusw         %1, %2
    movd            %3, %1
%elif cpuflag(mmxext)
    pshufw          %2, %1, 0xE
    paddusw         %1, %2
    pshufw          %2, %1, 0x1
    paddusw         %1, %2
    movd            %3, %1
%elif cpuflag(mmx)
    mova            %2, %1
    psrlq           %1, 32
    paddusw         %1, %2
    mova            %2, %1
    psrlq           %1, 16
    paddusw         %1, %2
    movd            %3, %1
%endif
%endmacro
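
; Hedged C model of HSUM (illustrative reference only, not part of the
; build): a horizontal sum of the 16-bit lanes with unsigned saturation, as
; noted in the FIXME above. The asm saturates pairwise in a tree rather than
; sequentially, so this model is not bit-exact for pathological inputs:
;
;     static unsigned hsum_words_ref(const uint16_t *v, int n) /* n = 4 or 8 */
;     {
;         unsigned sum = 0;
;         for (int i = 0; i < n; i++) {
;             sum += v[i];
;             if (sum > 0xFFFF)
;                 sum = 0xFFFF;                             /* paddusw */
;         }
;         return sum;
;     }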

%macro STORE4 5
    mova [%1+mmsize*0], %2
    mova [%1+mmsize*1], %3
    mova [%1+mmsize*2], %4
    mova [%1+mmsize*3], %5
%endmacro

%macro LOAD4 5
    mova            %2, [%1+mmsize*0]
    mova            %3, [%1+mmsize*1]
    mova            %4, [%1+mmsize*2]
    mova            %5, [%1+mmsize*3]
%endmacro

%macro hadamard8_16_wrapper 2
cglobal hadamard8_diff, 4, 4, %1
%ifndef m8
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB            rsp, pad
%endif
    call hadamard8x8_diff %+ SUFFIX
%ifndef m8
    ADD            rsp, pad
%endif
    RET

cglobal hadamard8_diff16, 5, 6, %1
%ifndef m8
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB            rsp, pad
%endif

    call hadamard8x8_diff %+ SUFFIX
    mov            r5d, eax

    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff %+ SUFFIX
    add            r5d, eax

    cmp            r4d, 16
    jne .done

    lea             r1, [r1+r3*8-8]
    lea             r2, [r2+r3*8-8]
    call hadamard8x8_diff %+ SUFFIX
    add            r5d, eax

    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff %+ SUFFIX
    add            r5d, eax

.done:
    mov            eax, r5d
%ifndef m8
    ADD            rsp, pad
%endif
    RET
%endmacro

%macro HADAMARD8_DIFF 0-1
%if cpuflag(sse2)
hadamard8x8_diff %+ SUFFIX:
    lea                          r0, [r3*3]
    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize
    HADAMARD8
%if ARCH_X86_64
    TRANSPOSE8x8W                 0,  1,  2,  3,  4,  5,  6,  7,  8
%else
    TRANSPOSE8x8W                 0,  1,  2,  3,  4,  5,  6,  7, [rsp+gprsize], [rsp+mmsize+gprsize]
%endif
    HADAMARD8
    ABS_SUM_8x8         rsp+gprsize
    HSUM                        m0, m1, eax
    and                         eax, 0xFFFF
    ret

hadamard8_16_wrapper %1, 3
%elif cpuflag(mmx)
ALIGN 16
; int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,
;                               uint8_t *src2, ptrdiff_t stride, int h)
; r0 = void *s = unused, int h = unused (always 8)
; note how r1, r2 and r3 are not clobbered in this function, so the 16x16
; version can simply call this 2x2 times (that is also why we access
; rsp+gprsize everywhere: it is the rsp of the calling function)
hadamard8x8_diff %+ SUFFIX:
    lea                          r0, [r3*3]

    ; first 4x8 pixels
    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova         [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W                 0,  1,  2,  3,  7
    STORE4              rsp+gprsize, m0, m1, m2, m3
    mova                         m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W                 4,  5,  6,  7,  0
    STORE4         rsp+gprsize+0x40, m4, m5, m6, m7

    ; second 4x8 pixels
    DIFF_PIXELS_8                r1, r2,  4, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova         [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W                 0,  1,  2,  3,  7
    STORE4         rsp+gprsize+0x20, m0, m1, m2, m3
    mova                         m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W                 4,  5,  6,  7,  0

    LOAD4          rsp+gprsize+0x40, m0, m1, m2, m3
    HADAMARD8
    ABS_SUM_8x8_32 rsp+gprsize+0x60
    mova         [rsp+gprsize+0x60], m0

    LOAD4          rsp+gprsize     , m0, m1, m2, m3
    LOAD4          rsp+gprsize+0x20, m4, m5, m6, m7
    HADAMARD8
    ABS_SUM_8x8_32 rsp+gprsize
    paddusw                      m0, [rsp+gprsize+0x60]

    HSUM                         m0, m1, eax
    and                         rax, 0xFFFF
    ret

hadamard8_16_wrapper 0, 14
%endif
%endmacro

INIT_MMX mmx
HADAMARD8_DIFF

INIT_MMX mmxext
HADAMARD8_DIFF

INIT_XMM sse2
%if ARCH_X86_64
%define ABS_SUM_8x8 ABS_SUM_8x8_64
%else
%define ABS_SUM_8x8 ABS_SUM_8x8_32
%endif
HADAMARD8_DIFF 10

INIT_XMM ssse3
%define ABS_SUM_8x8 ABS_SUM_8x8_64
HADAMARD8_DIFF 9
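
; Hedged C reference for one 8x8 hadamard8_diff block (illustrative only, not
; part of the build): transform the difference block along rows and columns
; with hadamard8_1d from the sketch above, then sum the absolute coefficients.
; The asm accumulates with paddusw and masks to 16 bits, so it saturates where
; this reference would not (see the HSUM FIXME):
;
;     static int hadamard8_diff_ref(const uint8_t *src1, const uint8_t *src2,
;                                   ptrdiff_t stride)
;     {
;         int16_t d[8][8];
;         int sum = 0;
;         for (int y = 0; y < 8; y++)
;             for (int x = 0; x < 8; x++)
;                 d[y][x] = src1[y * stride + x] - src2[y * stride + x];
;         for (int y = 0; y < 8; y++)
;             hadamard8_1d(d[y]);                        /* rows    */
;         for (int x = 0; x < 8; x++) {
;             int16_t col[8];
;             for (int y = 0; y < 8; y++) col[y] = d[y][x];
;             hadamard8_1d(col);                         /* columns */
;             for (int y = 0; y < 8; y++) d[y][x] = col[y];
;         }
;         for (int y = 0; y < 8; y++)
;             for (int x = 0; x < 8; x++)
;                 sum += abs(d[y][x]);
;         return sum & 0xFFFF;
;     }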

; int ff_sse*_*(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
;               ptrdiff_t line_size, int h)

%macro SUM_SQUARED_ERRORS 1
cglobal sse%1, 5,5,8, v, pix1, pix2, lsize, h
%if %1 == mmsize
    shr       hd, 1
%endif
    pxor      m0, m0         ; mm0 = 0
    pxor      m7, m7         ; mm7 holds the sum

.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
    movu      m1, [pix1q]    ; m1 = pix1[0][0-15], [0-7] for mmx
    movu      m2, [pix2q]    ; m2 = pix2[0][0-15], [0-7] for mmx
%if %1 == mmsize
    movu      m3, [pix1q+lsizeq] ; m3 = pix1[1][0-15], [0-7] for mmx
    movu      m4, [pix2q+lsizeq] ; m4 = pix2[1][0-15], [0-7] for mmx
%else  ; %1 / 2 == mmsize; mmx only
    mova      m3, [pix1q+8]  ; m3 = pix1[0][8-15]
    mova      m4, [pix2q+8]  ; m4 = pix2[0][8-15]
%endif

    ; todo: mm1-mm2, mm3-mm4
    ; algo: subtract mm1 from mm2 with saturation and vice versa
    ;       OR the result to get the absolute difference
    mova      m5, m1
    mova      m6, m3
    psubusb   m1, m2
    psubusb   m3, m4
    psubusb   m2, m5
    psubusb   m4, m6

    por       m2, m1
    por       m4, m3

    ; now convert to 16-bit vectors so we can square them
    mova      m1, m2
    mova      m3, m4

    punpckhbw m2, m0
    punpckhbw m4, m0
    punpcklbw m1, m0         ; mm1 now spread over (mm1, mm2)
    punpcklbw m3, m0         ; mm4 now spread over (mm3, mm4)

    pmaddwd   m2, m2
    pmaddwd   m4, m4
    pmaddwd   m1, m1
    pmaddwd   m3, m3

    paddd     m1, m2
    paddd     m3, m4
    paddd     m7, m1
    paddd     m7, m3

%if %1 == mmsize
    lea    pix1q, [pix1q + 2*lsizeq]
    lea    pix2q, [pix2q + 2*lsizeq]
%else
    add    pix1q, lsizeq
    add    pix2q, lsizeq
%endif
    dec       hd
    jnz .next2lines

    HADDD     m7, m1
    movd     eax, m7         ; return value
    RET
%endmacro

INIT_MMX mmx
SUM_SQUARED_ERRORS 8

INIT_MMX mmx
SUM_SQUARED_ERRORS 16

INIT_XMM sse2
SUM_SQUARED_ERRORS 16
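
; Hedged C reference for ff_sse{8,16} (illustrative only, not part of the
; build): the sum of squared pixel differences over a w x h block:
;
;     static int sse_ref(const uint8_t *pix1, const uint8_t *pix2,
;                        ptrdiff_t line_size, int h, int w /* 8 or 16 */)
;     {
;         int sum = 0;
;         for (int y = 0; y < h; y++) {
;             for (int x = 0; x < w; x++) {
;                 int d = pix1[x] - pix2[x];
;                 sum += d * d;
;             }
;             pix1 += line_size;
;             pix2 += line_size;
;         }
;         return sum;
;     }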

;-----------------------------------------------
;int ff_sum_abs_dctelem(int16_t *block)
;-----------------------------------------------
; %1 = number of xmm registers used
; %2 = number of inline loops

%macro SUM_ABS_DCTELEM 2
cglobal sum_abs_dctelem, 1, 1, %1, block
    pxor    m0, m0
    pxor    m1, m1
%assign %%i 0
%rep %2
    mova      m2, [blockq+mmsize*(0+%%i)]
    mova      m3, [blockq+mmsize*(1+%%i)]
    mova      m4, [blockq+mmsize*(2+%%i)]
    mova      m5, [blockq+mmsize*(3+%%i)]
    ABS1_SUM  m2, m6, m0
    ABS1_SUM  m3, m6, m1
    ABS1_SUM  m4, m6, m0
    ABS1_SUM  m5, m6, m1
%assign %%i %%i+4
%endrep
    paddusw m0, m1
    HSUM    m0, m1, eax
    and     eax, 0xFFFF
    RET
%endmacro

INIT_MMX mmx
SUM_ABS_DCTELEM 0, 4
INIT_MMX mmxext
SUM_ABS_DCTELEM 0, 4
INIT_XMM sse2
SUM_ABS_DCTELEM 7, 2
INIT_XMM ssse3
SUM_ABS_DCTELEM 6, 2
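
; Hedged C reference (illustrative only, not part of the build): the sum of
; absolute values over the 64 coefficients of one DCT block. The asm
; accumulates with saturating paddusw and masks the result to 16 bits:
;
;     static int sum_abs_dctelem_ref(const int16_t *block)
;     {
;         int sum = 0;
;         for (int i = 0; i < 64; i++)
;             sum += abs(block[i]);
;         return sum & 0xFFFF;
;     }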

;------------------------------------------------------------------------------
; int ff_hf_noise*_mmx(uint8_t *pix1, ptrdiff_t lsize, int h)
;------------------------------------------------------------------------------
; %1 = 8/16, %2-%5 = m# (register numbers)
%macro HF_NOISE_PART1 5
    mova      m%2, [pix1q]
%if %1 == 8
    mova      m%3, m%2
    psllq     m%2, 8
    psrlq     m%3, 8
    psrlq     m%2, 8
%else
    mova      m%3, [pix1q+1]
%endif
    mova      m%4, m%2
    mova      m%5, m%3
    punpcklbw m%2, m7
    punpcklbw m%3, m7
    punpckhbw m%4, m7
    punpckhbw m%5, m7
    psubw     m%2, m%3
    psubw     m%4, m%5
%endmacro

; %1-%4 = m# (register numbers)
%macro HF_NOISE_PART2 4
    psubw     m%1, m%3
    psubw     m%2, m%4
    pxor       m3, m3
    pxor       m1, m1
    pcmpgtw    m3, m%1
    pcmpgtw    m1, m%2
    pxor      m%1, m3
    pxor      m%2, m1
    psubw     m%1, m3
    psubw     m%2, m1
    paddw     m%2, m%1
    paddw      m6, m%2
%endmacro

; %1 = 8/16
%macro HF_NOISE 1
cglobal hf_noise%1, 3,3,0, pix1, lsize, h
    sub        hd, 2
    pxor       m7, m7
    pxor       m6, m6
    HF_NOISE_PART1 %1, 0, 1, 2, 3
    add     pix1q, lsizeq
    HF_NOISE_PART1 %1, 4, 1, 5, 3
    HF_NOISE_PART2     0, 2, 4, 5
    add     pix1q, lsizeq
.loop:
    HF_NOISE_PART1 %1, 0, 1, 2, 3
    HF_NOISE_PART2     4, 5, 0, 2
    add     pix1q, lsizeq
    HF_NOISE_PART1 %1, 4, 1, 5, 3
    HF_NOISE_PART2     0, 2, 4, 5
    add     pix1q, lsizeq
    sub        hd, 2
    jne .loop

    mova       m0, m6
    punpcklwd  m0, m7
    punpckhwd  m6, m7
    paddd      m6, m0
    mova       m0, m6
    psrlq      m6, 32
    paddd      m0, m6
    movd      eax, m0   ; eax = result of hf_noise8;
    REP_RET             ; return eax;
%endmacro

INIT_MMX mmx
HF_NOISE 8
HF_NOISE 16
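
; Hedged C sketch of what HF_NOISE accumulates (illustrative only, not part
; of the build): the absolute change of the horizontal gradient between
; vertically adjacent rows. Written here for the 8-wide case, where the
; shift trick above yields 7 horizontal pairs per row:
;
;     static int hf_noise8_ref(const uint8_t *pix, ptrdiff_t lsize, int h)
;     {
;         int sum = 0;
;         for (int y = 0; y < h - 1; y++) {
;             for (int x = 0; x < 7; x++) {
;                 int d0 = pix[x]         - pix[x + 1];         /* this row */
;                 int d1 = pix[lsize + x] - pix[lsize + x + 1]; /* next row */
;                 sum += abs(d0 - d1);
;             }
;             pix += lsize;
;         }
;         return sum;
;     }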

;---------------------------------------------------------------------------------------
;int ff_sad_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
;---------------------------------------------------------------------------------------
;%1 = 8/16
%macro SAD 1
cglobal sad%1, 5, 5, 3, v, pix1, pix2, stride, h
    movu      m2, [pix2q]
    movu      m1, [pix2q+strideq]
    psadbw    m2, [pix1q]
    psadbw    m1, [pix1q+strideq]
    paddw     m2, m1
%if %1 != mmsize
    movu      m0, [pix2q+8]
    movu      m1, [pix2q+strideq+8]
    psadbw    m0, [pix1q+8]
    psadbw    m1, [pix1q+strideq+8]
    paddw     m2, m0
    paddw     m2, m1
%endif
    sub       hd, 2

align 16
.loop:
    lea    pix1q, [pix1q+strideq*2]
    lea    pix2q, [pix2q+strideq*2]
    movu      m0, [pix2q]
    movu      m1, [pix2q+strideq]
    psadbw    m0, [pix1q]
    psadbw    m1, [pix1q+strideq]
    paddw     m2, m0
    paddw     m2, m1
%if %1 != mmsize
    movu      m0, [pix2q+8]
    movu      m1, [pix2q+strideq+8]
    psadbw    m0, [pix1q+8]
    psadbw    m1, [pix1q+strideq+8]
    paddw     m2, m0
    paddw     m2, m1
%endif
    sub       hd, 2
    jg .loop
%if mmsize == 16
    movhlps   m0, m2
    paddw     m2, m0
%endif
    movd     eax, m2
    RET
%endmacro

INIT_MMX mmxext
SAD 8
SAD 16
INIT_XMM sse2
SAD 16
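
; Hedged C reference for ff_sad{8,16} (illustrative only, not part of the
; build): a plain sum of absolute differences, which is what the psadbw
; chain above accumulates:
;
;     static int sad_ref(const uint8_t *pix1, const uint8_t *pix2,
;                        ptrdiff_t stride, int h, int w /* 8 or 16 */)
;     {
;         int sum = 0;
;         for (int y = 0; y < h; y++) {
;             for (int x = 0; x < w; x++)
;                 sum += abs(pix1[x] - pix2[x]);
;             pix1 += stride;
;             pix2 += stride;
;         }
;         return sum;
;     }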

;------------------------------------------------------------------------------------------
;int ff_sad_x2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
;------------------------------------------------------------------------------------------
;%1 = 8/16
%macro SAD_X2 1
cglobal sad%1_x2, 5, 5, 5, v, pix1, pix2, stride, h
    movu      m0, [pix2q]
    movu      m2, [pix2q+strideq]
%if mmsize == 16
    movu      m3, [pix2q+1]
    movu      m4, [pix2q+strideq+1]
    pavgb     m0, m3
    pavgb     m2, m4
%else
    pavgb     m0, [pix2q+1]
    pavgb     m2, [pix2q+strideq+1]
%endif
    psadbw    m0, [pix1q]
    psadbw    m2, [pix1q+strideq]
    paddw     m0, m2
%if %1 != mmsize
    movu      m1, [pix2q+8]
    movu      m2, [pix2q+strideq+8]
    pavgb     m1, [pix2q+9]
    pavgb     m2, [pix2q+strideq+9]
    psadbw    m1, [pix1q+8]
    psadbw    m2, [pix1q+strideq+8]
    paddw     m0, m1
    paddw     m0, m2
%endif
    sub       hd, 2

align 16
.loop:
    lea    pix1q, [pix1q+2*strideq]
    lea    pix2q, [pix2q+2*strideq]
    movu      m1, [pix2q]
    movu      m2, [pix2q+strideq]
%if mmsize == 16
    movu      m3, [pix2q+1]
    movu      m4, [pix2q+strideq+1]
    pavgb     m1, m3
    pavgb     m2, m4
%else
    pavgb     m1, [pix2q+1]
    pavgb     m2, [pix2q+strideq+1]
%endif
    psadbw    m1, [pix1q]
    psadbw    m2, [pix1q+strideq]
    paddw     m0, m1
    paddw     m0, m2
%if %1 != mmsize
    movu      m1, [pix2q+8]
    movu      m2, [pix2q+strideq+8]
    pavgb     m1, [pix2q+9]
    pavgb     m2, [pix2q+strideq+9]
    psadbw    m1, [pix1q+8]
    psadbw    m2, [pix1q+strideq+8]
    paddw     m0, m1
    paddw     m0, m2
%endif
    sub       hd, 2
    jg .loop
%if mmsize == 16
    movhlps   m1, m0
    paddw     m0, m1
%endif
    movd     eax, m0
    RET
%endmacro

INIT_MMX mmxext
SAD_X2 8
SAD_X2 16
INIT_XMM sse2
SAD_X2 16
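
; Hedged C reference for the x2 variant (illustrative only, not part of the
; build): pix2 is interpolated halfway towards pix2+1 before the SAD; pavgb
; rounds up, hence the +1 in the average:
;
;     static int sad_x2_ref(const uint8_t *pix1, const uint8_t *pix2,
;                           ptrdiff_t stride, int h, int w /* 8 or 16 */)
;     {
;         int sum = 0;
;         for (int y = 0; y < h; y++) {
;             for (int x = 0; x < w; x++) {
;                 int avg = (pix2[x] + pix2[x + 1] + 1) >> 1;
;                 sum += abs(pix1[x] - avg);
;             }
;             pix1 += stride;
;             pix2 += stride;
;         }
;         return sum;
;     }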

;------------------------------------------------------------------------------------------
;int ff_sad_y2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
;------------------------------------------------------------------------------------------
;%1 = 8/16
%macro SAD_Y2 1
cglobal sad%1_y2, 5, 5, 4, v, pix1, pix2, stride, h
    movu      m1, [pix2q]
    movu      m0, [pix2q+strideq]
    movu      m3, [pix2q+2*strideq]
    pavgb     m1, m0
    pavgb     m0, m3
    psadbw    m1, [pix1q]
    psadbw    m0, [pix1q+strideq]
    paddw     m0, m1
    mova      m1, m3
%if %1 != mmsize
    movu      m4, [pix2q+8]
    movu      m5, [pix2q+strideq+8]
    movu      m6, [pix2q+2*strideq+8]
    pavgb     m4, m5
    pavgb     m5, m6
    psadbw    m4, [pix1q+8]
    psadbw    m5, [pix1q+strideq+8]
    paddw     m0, m4
    paddw     m0, m5
    mova      m4, m6
%endif
    add    pix2q, strideq
    sub       hd, 2

align 16
.loop:
    lea    pix1q, [pix1q+2*strideq]
    lea    pix2q, [pix2q+2*strideq]
    movu      m2, [pix2q]
    movu      m3, [pix2q+strideq]
    pavgb     m1, m2
    pavgb     m2, m3
    psadbw    m1, [pix1q]
    psadbw    m2, [pix1q+strideq]
    paddw     m0, m1
    paddw     m0, m2
    mova      m1, m3
%if %1 != mmsize
    movu      m5, [pix2q+8]
    movu      m6, [pix2q+strideq+8]
    pavgb     m4, m5
    pavgb     m5, m6
    psadbw    m4, [pix1q+8]
    psadbw    m5, [pix1q+strideq+8]
    paddw     m0, m4
    paddw     m0, m5
    mova      m4, m6
%endif
    sub       hd, 2
    jg .loop
%if mmsize == 16
    movhlps   m1, m0
    paddw     m0, m1
%endif
    movd     eax, m0
    RET
%endmacro

INIT_MMX mmxext
SAD_Y2 8
SAD_Y2 16
INIT_XMM sse2
SAD_Y2 16
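
; Hedged C reference for the y2 variant (illustrative only, not part of the
; build): identical to sad_x2_ref above, except that pix2 is interpolated
; halfway towards the next row, i.e.
;
;     int avg = (pix2[x] + pix2[x + stride] + 1) >> 1;
;
; with the loops otherwise matching sad_x2_ref. The asm keeps the previous
; row's interpolation input in a register (m1/m4) so each line of pix2 is
; loaded only once.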

;-------------------------------------------------------------------------------------------
;int ff_sad_approx_xy2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
;-------------------------------------------------------------------------------------------
;%1 = 8/16
%macro SAD_APPROX_XY2 1
cglobal sad%1_approx_xy2, 5, 5, 7, v, pix1, pix2, stride, h
    mova      m4, [pb_1]
    movu      m1, [pix2q]
    movu      m0, [pix2q+strideq]
    movu      m3, [pix2q+2*strideq]
%if mmsize == 16
    movu      m5, [pix2q+1]
    movu      m6, [pix2q+strideq+1]
    movu      m2, [pix2q+2*strideq+1]
    pavgb     m1, m5
    pavgb     m0, m6
    pavgb     m3, m2
%else
    pavgb     m1, [pix2q+1]
    pavgb     m0, [pix2q+strideq+1]
    pavgb     m3, [pix2q+2*strideq+1]
%endif
    psubusb   m0, m4
    pavgb     m1, m0
    pavgb     m0, m3
    psadbw    m1, [pix1q]
    psadbw    m0, [pix1q+strideq]
    paddw     m0, m1
    mova      m1, m3
%if %1 != mmsize
    movu      m5, [pix2q+8]
    movu      m6, [pix2q+strideq+8]
    movu      m7, [pix2q+2*strideq+8]
    pavgb     m5, [pix2q+1+8]
    pavgb     m6, [pix2q+strideq+1+8]
    pavgb     m7, [pix2q+2*strideq+1+8]
    psubusb   m6, m4
    pavgb     m5, m6
    pavgb     m6, m7
    psadbw    m5, [pix1q+8]
    psadbw    m6, [pix1q+strideq+8]
    paddw     m0, m5
    paddw     m0, m6
    mova      m5, m7
%endif
    add    pix2q, strideq
    sub       hd, 2

align 16
.loop:
    lea    pix1q, [pix1q+2*strideq]
    lea    pix2q, [pix2q+2*strideq]
    movu      m2, [pix2q]
    movu      m3, [pix2q+strideq]
%if mmsize == 16
    movu      m5, [pix2q+1]
    movu      m6, [pix2q+strideq+1]
    pavgb     m2, m5
    pavgb     m3, m6
%else
    pavgb     m2, [pix2q+1]
    pavgb     m3, [pix2q+strideq+1]
%endif
    psubusb   m2, m4
    pavgb     m1, m2
    pavgb     m2, m3
    psadbw    m1, [pix1q]
    psadbw    m2, [pix1q+strideq]
    paddw     m0, m1
    paddw     m0, m2
    mova      m1, m3
%if %1 != mmsize
    movu      m6, [pix2q+8]
    movu      m7, [pix2q+strideq+8]
    pavgb     m6, [pix2q+8+1]
    pavgb     m7, [pix2q+strideq+8+1]
    psubusb   m6, m4
    pavgb     m5, m6
    pavgb     m6, m7
    psadbw    m5, [pix1q+8]
    psadbw    m6, [pix1q+strideq+8]
    paddw     m0, m5
    paddw     m0, m6
    mova      m5, m7
%endif
    sub       hd, 2
    jg .loop
%if mmsize == 16
    movhlps   m1, m0
    paddw     m0, m1
%endif
    movd     eax, m0
    RET
%endmacro

INIT_MMX mmxext
SAD_APPROX_XY2 8
SAD_APPROX_XY2 16
INIT_XMM sse2
SAD_APPROX_XY2 16
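
; Hedged C sketch of what the xy2 variant approximates (illustrative only,
; not part of the build): SAD against the four-pixel half-pel average. The
; asm chains two pavgb (each rounding up) and first subtracts pb_1 from the
; middle row to cancel most of the double round-up bias, so its output may
; differ from this exact reference by small per-pixel rounding errors
; (hence "approx"):
;
;     static int sad_xy2_ref(const uint8_t *pix1, const uint8_t *pix2,
;                            ptrdiff_t stride, int h, int w /* 8 or 16 */)
;     {
;         int sum = 0;
;         for (int y = 0; y < h; y++) {
;             for (int x = 0; x < w; x++) {
;                 int avg = (pix2[x]          + pix2[x + 1] +
;                            pix2[x + stride] + pix2[x + stride + 1] + 2) >> 2;
;                 sum += abs(pix1[x] - avg);
;             }
;             pix1 += stride;
;             pix2 += stride;
;         }
;         return sum;
;     }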

;--------------------------------------------------------------------
;int ff_vsad_intra(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
;                  ptrdiff_t line_size, int h);
;--------------------------------------------------------------------
; %1 = 8/16
%macro VSAD_INTRA 1
cglobal vsad_intra%1, 5, 5, 3, v, pix1, pix2, lsize, h
    mova      m0, [pix1q]
%if %1 == mmsize
    mova      m2, [pix1q+lsizeq]
    psadbw    m0, m2
%else
    mova      m2, [pix1q+lsizeq]
    mova      m3, [pix1q+8]
    mova      m4, [pix1q+lsizeq+8]
    psadbw    m0, m2
    psadbw    m3, m4
    paddw     m0, m3
%endif
    sub       hd, 2

.loop:
    lea    pix1q, [pix1q + 2*lsizeq]
%if %1 == mmsize
    mova      m1, [pix1q]
    psadbw    m2, m1
    paddw     m0, m2
    mova      m2, [pix1q+lsizeq]
    psadbw    m1, m2
    paddw     m0, m1
%else
    mova      m1, [pix1q]
    mova      m3, [pix1q+8]
    psadbw    m2, m1
    psadbw    m4, m3
    paddw     m0, m2
    paddw     m0, m4
    mova      m2, [pix1q+lsizeq]
    mova      m4, [pix1q+lsizeq+8]
    psadbw    m1, m2
    psadbw    m3, m4
    paddw     m0, m1
    paddw     m0, m3
%endif
    sub       hd, 2
    jg     .loop

%if mmsize == 16
    pshufd m1, m0, 0xe
    paddd  m0, m1
%endif
    movd eax, m0
    RET
%endmacro

INIT_MMX mmxext
VSAD_INTRA 8
VSAD_INTRA 16
INIT_XMM sse2
VSAD_INTRA 16
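
; Hedged C reference for vsad_intra (illustrative only, not part of the
; build): the SAD between each row of pix1 and the row below it; pix2 is
; unused:
;
;     static int vsad_intra_ref(const uint8_t *pix, ptrdiff_t line_size,
;                               int h, int w /* 8 or 16 */)
;     {
;         int sum = 0;
;         for (int y = 0; y < h - 1; y++) {
;             for (int x = 0; x < w; x++)
;                 sum += abs(pix[x] - pix[x + line_size]);
;             pix += line_size;
;         }
;         return sum;
;     }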

;---------------------------------------------------------------------
;int ff_vsad_approx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
;                   ptrdiff_t line_size, int h);
;---------------------------------------------------------------------
; %1 = 8/16
%macro VSAD_APPROX 1
cglobal vsad%1_approx, 5, 5, 5, v, pix1, pix2, lsize, h
    mova   m1, [pb_80]
    mova   m0, [pix1q]
%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
    mova   m4, [pix1q+lsizeq]
%if mmsize == 16
    movu   m3, [pix2q]
    movu   m2, [pix2q+lsizeq]
    psubb  m0, m3
    psubb  m4, m2
%else
    psubb  m0, [pix2q]
    psubb  m4, [pix2q+lsizeq]
%endif
    pxor   m0, m1
    pxor   m4, m1
    psadbw m0, m4
%else ; vsad16_mmxext
    mova   m3, [pix1q+8]
    psubb  m0, [pix2q]
    psubb  m3, [pix2q+8]
    pxor   m0, m1
    pxor   m3, m1
    mova   m4, [pix1q+lsizeq]
    mova   m5, [pix1q+lsizeq+8]
    psubb  m4, [pix2q+lsizeq]
    psubb  m5, [pix2q+lsizeq+8]
    pxor   m4, m1
    pxor   m5, m1
    psadbw m0, m4
    psadbw m3, m5
    paddw  m0, m3
%endif
    sub    hd, 2

.loop:
    lea pix1q, [pix1q + 2*lsizeq]
    lea pix2q, [pix2q + 2*lsizeq]
    mova   m2, [pix1q]
%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
%if mmsize == 16
    movu   m3, [pix2q]
    psubb  m2, m3
%else
    psubb  m2, [pix2q]
%endif
    pxor   m2, m1
    psadbw m4, m2
    paddw  m0, m4
    mova   m4, [pix1q+lsizeq]
    movu   m3, [pix2q+lsizeq]
    psubb  m4, m3
    pxor   m4, m1
    psadbw m2, m4
    paddw  m0, m2
%else ; vsad16_mmxext
    mova   m3, [pix1q+8]
    psubb  m2, [pix2q]
    psubb  m3, [pix2q+8]
    pxor   m2, m1
    pxor   m3, m1
    psadbw m4, m2
    psadbw m5, m3
    paddw  m0, m4
    paddw  m0, m5
    mova   m4, [pix1q+lsizeq]
    mova   m5, [pix1q+lsizeq+8]
    psubb  m4, [pix2q+lsizeq]
    psubb  m5, [pix2q+lsizeq+8]
    pxor   m4, m1
    pxor   m5, m1
    psadbw m2, m4
    psadbw m3, m5
    paddw  m0, m2
    paddw  m0, m3
%endif
    sub    hd, 2
    jg  .loop

%if mmsize == 16
    pshufd m1, m0, 0xe
    paddd  m0, m1
%endif
    movd  eax, m0
    RET
%endmacro

INIT_MMX mmxext
VSAD_APPROX 8
VSAD_APPROX 16
INIT_XMM sse2
VSAD_APPROX 16
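
; Hedged C reference for the exact vsad (illustrative only, not part of the
; build). The asm computes the per-pixel differences with wrapping psubb and
; a 0x80 bias before psadbw, so it only matches this reference while
; pix1[x] - pix2[x] stays within a signed byte (hence "approx"):
;
;     static int vsad_ref(const uint8_t *s1, const uint8_t *s2,
;                         ptrdiff_t line_size, int h, int w /* 8 or 16 */)
;     {
;         int sum = 0;
;         for (int y = 0; y < h - 1; y++) {
;             for (int x = 0; x < w; x++)
;                 sum += abs(s1[x] - s2[x] -
;                            s1[x + line_size] + s2[x + line_size]);
;             s1 += line_size;
;             s2 += line_size;
;         }
;         return sum;
;     }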