1;*****************************************************************************
2;* pixel.asm: x86 pixel metrics
3;*****************************************************************************
4;* Copyright (C) 2003-2013 x264 project
5;* Copyright (C) 2013-2020 MulticoreWare, Inc
6;*
7;* Authors: Loren Merritt <lorenm@u.washington.edu>
8;*          Holger Lubitz <holger@lubitz.org>
9;*          Laurent Aimar <fenrir@via.ecp.fr>
10;*          Alex Izvorski <aizvorksi@gmail.com>
11;*          Fiona Glaser <fiona@x264.com>
12;*          Oskar Arvidsson <oskar@irock.se>
13;*          Min Chen <chenm003@163.com>
14;*
15;* This program is free software; you can redistribute it and/or modify
16;* it under the terms of the GNU General Public License as published by
17;* the Free Software Foundation; either version 2 of the License, or
18;* (at your option) any later version.
19;*
20;* This program is distributed in the hope that it will be useful,
21;* but WITHOUT ANY WARRANTY; without even the implied warranty of
22;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23;* GNU General Public License for more details.
24;*
25;* You should have received a copy of the GNU General Public License
26;* along with this program; if not, write to the Free Software
27;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
28;*
29;* This program is also available under a commercial proprietary license.
30;* For more information, contact us at license @ x265.com.
31;*****************************************************************************
32
33%include "x86inc.asm"
34%include "x86util.asm"
35
36SECTION_RODATA 32
37hmul_8p:   times 8 db 1
38           times 4 db 1, -1
39           times 8 db 1
40           times 4 db 1, -1
41hmul_4p:   times 4 db 1, 1, 1, 1, 1, -1, 1, -1
42mask_10:   times 4 dw 0, -1
43mask_1100: times 2 dd 0, -1
44hmul_8w:   times 4 dw 1
45           times 2 dw 1, -1
46           times 4 dw 1
47           times 2 dw 1, -1
48psy_pp_shuff1:   dq 0, 1, 8, 9, 4, 5, 12, 13
49psy_pp_shuff2:   dq 2, 3, 10, 11, 6, 7, 14, 15
50psy_pp_shuff3:   dq 0, 0, 8, 8, 1, 1, 9, 9
51
52ALIGN 32
53transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
54transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15
55
56SECTION .text
57
58cextern pb_0
59cextern pb_1
60cextern pw_1
61cextern pw_8
62cextern pw_16
63cextern pw_32
64cextern pw_00ff
65cextern pw_ppppmmmm
66cextern pw_ppmmppmm
67cextern pw_pmpmpmpm
68cextern pw_pmmpzzzz
69cextern pd_1
70cextern pd_2
71cextern hmul_16p
72cextern pb_movemask
73cextern pb_movemask_32
74cextern pw_pixel_max
75
76%if BIT_DEPTH == 12
77    %define     SSIMRD_SHIFT          4
78%elif BIT_DEPTH == 10
79    %define     SSIMRD_SHIFT          2
80%elif BIT_DEPTH == 8
81    %define     SSIMRD_SHIFT          0
82%else
83    %error Unsupported BIT_DEPTH!
84%endif
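; SSIMRD_SHIFT is simply BIT_DEPTH - 8: the SSIM_DIST_* / NORM_FACT_* macros
; further down shift a copy of their input right by this amount before
; squaring it, so that term is accumulated on an 8-bit scale at every bit
; depth (12-bit values >> 4, 10-bit >> 2, 8-bit unchanged).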
85
86;=============================================================================
87; SATD
88;=============================================================================
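; SATD is the sum of absolute values of the 4x4 Hadamard transform of the
; pixel differences. As a rough reference only (not the code path used below,
; and ignoring the exact output scaling of these kernels), one 4x4 block in C,
; where 'pixel' is uint8_t or, for HIGH_BIT_DEPTH, uint16_t and hadamard4 is a
; hypothetical helper written just for this sketch:
;
;     static void hadamard4(int d[4])   /* 4-point Hadamard butterfly */
;     {
;         int t0 = d[0] + d[1], t1 = d[0] - d[1];
;         int t2 = d[2] + d[3], t3 = d[2] - d[3];
;         d[0] = t0 + t2;  d[1] = t1 + t3;
;         d[2] = t0 - t2;  d[3] = t1 - t3;
;     }
;
;     int satd_4x4_ref(const pixel *p1, intptr_t s1, const pixel *p2, intptr_t s2)
;     {
;         int d[4][4], sum = 0;
;         for (int y = 0; y < 4; y++)
;             for (int x = 0; x < 4; x++)
;                 d[y][x] = p1[y * s1 + x] - p2[y * s2 + x];
;         for (int y = 0; y < 4; y++)                    /* transform rows    */
;             hadamard4(d[y]);
;         for (int x = 0; x < 4; x++)                    /* transform columns */
;         {
;             int c[4] = { d[0][x], d[1][x], d[2][x], d[3][x] };
;             hadamard4(c);
;             sum += abs(c[0]) + abs(c[1]) + abs(c[2]) + abs(c[3]);
;         }
;         return sum;
;     }
;
; The macros below compute the same measure with packed arithmetic; on byte
; inputs the pmaddubsw-against-+/-1 constants (hmul_*) fold the source/ref
; subtraction into the first transform stage.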
89
90%macro JDUP 2
91%if cpuflag(sse4)
    ; just use shufps on anything post-Conroe
93    shufps %1, %2, 0
94%elif cpuflag(ssse3) && notcpuflag(atom)
95    ; join 2x 32 bit and duplicate them
    ; emulating shufps is faster on Conroe
97    punpcklqdq %1, %2
98    movsldup %1, %1
99%else
    ; no need to dup: the sse2 path zero-extends to words and does the full h_2d
101    punpckldq %1, %2
102%endif
103%endmacro
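; JDUP sketch, assuming %1 and %2 were each loaded with movd (one 4-pixel row
; in the low dword): the sse4 and ssse3 paths produce dwords {A, A, B, B},
; i.e. both rows joined and duplicated so one pmaddubsw pass can transform two
; rows at once; the sse2 fallback only joins them as {A, B, 0, 0}.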
104
105%macro HSUMSUB 5
106    pmaddubsw m%2, m%5
107    pmaddubsw m%1, m%5
108    pmaddubsw m%4, m%5
109    pmaddubsw m%3, m%5
110%endmacro
111
112%macro DIFF_UNPACK_SSE2 5
113    punpcklbw m%1, m%5
114    punpcklbw m%2, m%5
115    punpcklbw m%3, m%5
116    punpcklbw m%4, m%5
117    psubw m%1, m%2
118    psubw m%3, m%4
119%endmacro
120
121%macro DIFF_SUMSUB_SSSE3 5
122    HSUMSUB %1, %2, %3, %4, %5
123    psubw m%1, m%2
124    psubw m%3, m%4
125%endmacro
126
127%macro LOAD_DUP_2x4P 4 ; dst, tmp, 2* pointer
128    movd %1, %3
129    movd %2, %4
130    JDUP %1, %2
131%endmacro
132
133%macro LOAD_DUP_4x8P_CONROE 8 ; 4*dst, 4*pointer
134    movddup m%3, %6
135    movddup m%4, %8
136    movddup m%1, %5
137    movddup m%2, %7
138%endmacro
139
140%macro LOAD_DUP_4x8P_PENRYN 8
    ; Penryn and Nehalem run punpcklqdq and movddup in different units
142    movh m%3, %6
143    movh m%4, %8
144    punpcklqdq m%3, m%3
145    movddup m%1, %5
146    punpcklqdq m%4, m%4
147    movddup m%2, %7
148%endmacro
149
150%macro LOAD_SUMSUB_8x2P 9
151    LOAD_DUP_4x8P %1, %2, %3, %4, %6, %7, %8, %9
152    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
153%endmacro
154
155%macro LOAD_SUMSUB_8x4P_SSSE3 7-11 r0, r2, 0, 0
156; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
157    LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
158    LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
159%if %10
160    lea %8, [%8+4*r1]
161    lea %9, [%9+4*r3]
162%endif
163%endmacro
164
165%macro LOAD_SUMSUB_16P_SSSE3 7 ; 2*dst, 2*tmp, mul, 2*ptr
166    movddup m%1, [%7]
167    movddup m%2, [%7+8]
168    mova m%4, [%6]
169    movddup m%3, m%4
170    punpckhqdq m%4, m%4
171    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
172%endmacro
173
174%macro LOAD_SUMSUB_16P_SSE2 7 ; 2*dst, 2*tmp, mask, 2*ptr
175    movu  m%4, [%7]
176    mova  m%2, [%6]
177    DEINTB %1, %2, %3, %4, %5
178    psubw m%1, m%3
179    psubw m%2, m%4
180    SUMSUB_BA w, %1, %2, %3
181%endmacro
182
183%macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none
; 8x dest, 1x tmp, 1x mul, [2* ptr], [2nd tmp]
185    LOAD_SUMSUB_16P %1, %5, %2, %3, %10, %11, %12
186    LOAD_SUMSUB_16P %2, %6, %3, %4, %10, %11+r1, %12+r3
187    LOAD_SUMSUB_16P %3, %7, %4, %9, %10, %11+2*r1, %12+2*r3
188    LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5
189%endmacro
190
191%macro LOAD_SUMSUB_16x2P_AVX2 9
192; 2*dst, 2*tmp, mul, 4*ptr
193    vbroadcasti128 m%1, [%6]
194    vbroadcasti128 m%3, [%7]
195    vbroadcasti128 m%2, [%8]
196    vbroadcasti128 m%4, [%9]
197    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
198%endmacro
199
200%macro LOAD_SUMSUB_16x4P_AVX2 7-11 r0, r2, 0, 0
201; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
202    LOAD_SUMSUB_16x2P_AVX2 %1, %2, %5, %6, %7, %8, %9, %8+r1, %9+r3
203    LOAD_SUMSUB_16x2P_AVX2 %3, %4, %5, %6, %7, %8+2*r1, %9+2*r3, %8+r4, %9+r5
204%if %10
205    lea  %8, [%8+4*r1]
206    lea  %9, [%9+4*r3]
207%endif
208%endmacro
209
210%macro LOAD_DUP_4x16P_AVX2 8 ; 4*dst, 4*pointer
211    mova  xm%3, %6
212    mova  xm%4, %8
213    mova  xm%1, %5
214    mova  xm%2, %7
215    vpermq m%3, m%3, q0011
216    vpermq m%4, m%4, q0011
217    vpermq m%1, m%1, q0011
218    vpermq m%2, m%2, q0011
219%endmacro
220
221%macro LOAD_SUMSUB8_16x2P_AVX2 9
222; 2*dst, 2*tmp, mul, 4*ptr
223    LOAD_DUP_4x16P_AVX2 %1, %2, %3, %4, %6, %7, %8, %9
224    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
225%endmacro
226
227%macro LOAD_SUMSUB8_16x4P_AVX2 7-11 r0, r2, 0, 0
228; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
229    LOAD_SUMSUB8_16x2P_AVX2 %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
230    LOAD_SUMSUB8_16x2P_AVX2 %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
231%if %10
232    lea  %8, [%8+4*r1]
233    lea  %9, [%9+4*r3]
234%endif
235%endmacro
236
237; in: r4=3*stride1, r5=3*stride2
238; in: %2 = horizontal offset
239; in: %3 = whether we need to increment pix1 and pix2
240; clobber: m3..m7
241; out: %1 = satd
242%macro SATD_4x4_MMX 3
243    %xdefine %%n nn%1
244    %assign offset %2*SIZEOF_PIXEL
245    LOAD_DIFF m4, m3, none, [r0+     offset], [r2+     offset]
246    LOAD_DIFF m5, m3, none, [r0+  r1+offset], [r2+  r3+offset]
247    LOAD_DIFF m6, m3, none, [r0+2*r1+offset], [r2+2*r3+offset]
248    LOAD_DIFF m7, m3, none, [r0+  r4+offset], [r2+  r5+offset]
249%if %3
250    lea  r0, [r0+4*r1]
251    lea  r2, [r2+4*r3]
252%endif
253    HADAMARD4_2D 4, 5, 6, 7, 3, %%n
254    paddw m4, m6
255;%if HIGH_BIT_DEPTH && (BIT_DEPTH == 12)
256;    pxor m5, m5
257;    punpcklwd m6, m4, m5
258;    punpckhwd m4, m5
259;    paddd m4, m6
260;%endif
261    SWAP %%n, 4
262%endmacro
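; Usage sketch (see pixel_satd_4x4 below): "SATD_4x4_MMX m0, 0, 0" leaves one
; 4x4 SATD in m0 without advancing the pointers; a caller covering a wider or
; taller area passes a pixel offset in %2 and %3=1 to step r0/r2 down by four
; rows between blocks.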
263
264; in: %1 = horizontal if 0, vertical if 1
265%macro SATD_8x4_SSE 8-9
266%if %1
267    HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
268%else
269    HADAMARD4_V %2, %3, %4, %5, %6
270    ; doing the abs first is a slight advantage
271    ABSW2 m%2, m%4, m%2, m%4, m%6, m%7
272    ABSW2 m%3, m%5, m%3, m%5, m%6, m%7
273    HADAMARD 1, max, %2, %4, %6, %7
274%endif
275%ifnidn %9, swap
276  %if (BIT_DEPTH == 12)
277    pxor m%6, m%6
278    punpcklwd m%7, m%2, m%6
279    punpckhwd m%2, m%6
280    paddd m%8, m%7
281    paddd m%8, m%2
282  %else
283    paddw m%8, m%2
284  %endif
285%else
286    SWAP %8, %2
287  %if (BIT_DEPTH == 12)
288    pxor m%6, m%6
289    punpcklwd m%7, m%8, m%6
290    punpckhwd m%8, m%6
291    paddd m%8, m%7
292  %endif
293%endif
294%if %1
295  %if (BIT_DEPTH == 12)
296    pxor m%6, m%6
297    punpcklwd m%7, m%4, m%6
298    punpckhwd m%4, m%6
299    paddd m%8, m%7
300    paddd m%8, m%4
301  %else
302    paddw m%8, m%4
303  %endif
304%else
305    HADAMARD 1, max, %3, %5, %6, %7
306  %if (BIT_DEPTH == 12)
307    pxor m%6, m%6
308    punpcklwd m%7, m%3, m%6
309    punpckhwd m%3, m%6
310    paddd m%8, m%7
311    paddd m%8, m%3
312  %else
313    paddw m%8, m%3
314  %endif
315%endif
316%endmacro
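; Only the BIT_DEPTH == 12 paths above widen the per-block sums to dwords
; (punpcklwd/punpckhwd against zero, then paddd): at 12 bits the 16-bit word
; accumulator %8 could overflow once several 8x4 blocks are added, while at
; 8/10 bits word accumulation is kept and folded to dwords later where needed
; (HADDUW in SATD_END_SSE2 / SATD_ACCUM).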
317
318%macro SATD_8x4_1_SSE 10
319%if %1
320    HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
321%else
322    HADAMARD4_V %2, %3, %4, %5, %6
323    ; doing the abs first is a slight advantage
324    ABSW2 m%2, m%4, m%2, m%4, m%6, m%7
325    ABSW2 m%3, m%5, m%3, m%5, m%6, m%7
326    HADAMARD 1, max, %2, %4, %6, %7
327%endif
328
329    pxor m%10, m%10
330    punpcklwd m%9, m%2, m%10
331    paddd m%8, m%9
332    punpckhwd m%9, m%2, m%10
333    paddd m%8, m%9
334
335%if %1
336    pxor m%10, m%10
337    punpcklwd m%9, m%4, m%10
338    paddd m%8, m%9
339    punpckhwd m%9, m%4, m%10
340    paddd m%8, m%9
341%else
342    HADAMARD 1, max, %3, %5, %6, %7
343    pxor m%10, m%10
344    punpcklwd m%9, m%3, m%10
345    paddd m%8, m%9
346    punpckhwd m%9, m%3, m%10
347    paddd m%8, m%9
348%endif
349%endmacro
350
351%macro SATD_START_MMX 0
352    FIX_STRIDES r1, r3
353    lea  r4, [3*r1] ; 3*stride1
354    lea  r5, [3*r3] ; 3*stride2
355%endmacro
356
357%macro SATD_END_MMX 0
358%if HIGH_BIT_DEPTH
359    HADDUW      m0, m1
360    movd       eax, m0
361%else ; !HIGH_BIT_DEPTH
362    pshufw      m1, m0, q1032
363    paddw       m0, m1
364    pshufw      m1, m0, q2301
365    paddw       m0, m1
366    movd       eax, m0
367    and        eax, 0xffff
368%endif ; HIGH_BIT_DEPTH
369    EMMS
370    RET
371%endmacro
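; In the 8-bit branch above, the two pshufw/paddw pairs are a horizontal add
; of the four word lanes of m0 (swap the dword halves and add, then swap
; adjacent words and add), so the low word of m0 holds the total and
; "and eax, 0xffff" keeps just that word.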
372
373%macro SSIM_DIST_HIGH 2
374    vpsrld         m6,         m0,        SSIMRD_SHIFT
375    vpsubd         m0,         m1
376
377    vpmuldq        m2,         m0,        m0
378    vpsrldq        m0,         m0,        4
379    vpmuldq        m0,         m0,        m0
380    vpaddq         m0,         m2
381
382    vpmuldq        m2,         m6,        m6
383    vpsrldq        m6,         m6,        4
384    vpmuldq        m6,         m6,        m6
385    vpaddq         m6,         m2
386
387    vpaddq         m4,         m0
388    vpaddq         m7,         m6
389%endmacro
390
391%macro NORM_FACT_HIGH 1
392    vpsrld         m1,          m0,        SSIMRD_SHIFT
393    vpmuldq        m2,          m1,        m1
394    vpsrldq        m1,          m1,        4
395    vpmuldq        m1,          m1,        m1
396
397    vpaddq         m1,          m2
398    vpaddq         m3,          m1
399%endmacro
400
401%macro SSIM_DIST_LOW 2
402    vpsrlw         m6,         m0,        SSIMRD_SHIFT
403    vpsubw         m0,         m1
404
405    vpmaddwd       m0,         m0,        m0
406    vpmaddwd       m6,         m6,        m6
407
408    vpaddd         m4,         m0
409    vpaddd         m7,         m6
410%endmacro
411
412%macro NORM_FACT_LOW 1
413    vpsrlw         m1,          m0,        SSIMRD_SHIFT
414    vpmaddwd       m1,          m1,        m1
415    vpaddd         m3,          m1
416%endmacro
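; A scalar model of what one step of these macros accumulates, writing the
; inputs as a (m0) and b (m1) and the running sums as dist (m4), energy (m7)
; and norm (m3) -- what the caller actually loads into m0/m1 is up to it:
;
;     dist   += (a - b) * (a - b);
;     energy += (a >> SSIMRD_SHIFT) * (a >> SSIMRD_SHIFT);
;     norm   += (a >> SSIMRD_SHIFT) * (a >> SSIMRD_SHIFT);   /* NORM_FACT_* */
;
; The *_HIGH variants square dword lanes with pmuldq and keep 64-bit sums; the
; *_LOW variants use pmaddwd on word lanes and keep 32-bit sums.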
417
418; FIXME avoid the spilling of regs to hold 3*stride.
419; for small blocks on x86_32, modify pixel pointer instead.
420
421;-----------------------------------------------------------------------------
; int pixel_satd_4x4( uint8_t *, intptr_t, uint8_t *, intptr_t )
423;-----------------------------------------------------------------------------
424INIT_MMX mmx2
425cglobal pixel_satd_4x4, 4,6
426    SATD_START_MMX
427    SATD_4x4_MMX m0, 0, 0
428    SATD_END_MMX
429
430%macro SATD_START_SSE2 2-3 0
431    FIX_STRIDES r1, r3
432%if HIGH_BIT_DEPTH && %3
433    pxor    %2, %2
434%elif cpuflag(ssse3) && notcpuflag(atom)
435%if mmsize==32
436    mova    %2, [hmul_16p]
437%else
438    mova    %2, [hmul_8p]
439%endif
440%endif
441    lea     r4, [3*r1]
442    lea     r5, [3*r3]
443    pxor    %1, %1
444%endmacro
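; SATD_START_SSE2 %1, %2[, %3]: clears the accumulator %1, sets up the
; 3*stride registers r4/r5, and on the SSSE3 byte path preloads the +/-1
; multiplier into %2 (hmul_8p, or hmul_16p when mmsize == 32); a nonzero %3 in
; HIGH_BIT_DEPTH builds clears %2 instead.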
445
446%macro SATD_END_SSE2 1-2
447%if HIGH_BIT_DEPTH
448  %if BIT_DEPTH == 12
449    HADDD   %1, xm0
  %else ; BIT_DEPTH != 12
451    HADDUW  %1, xm0
452  %endif ; BIT_DEPTH == 12
453  %if %0 == 2
454    paddd   %1, %2
455  %endif
456%else
457    HADDW   %1, xm7
458%endif
459    movd   eax, %1
460    RET
461%endmacro
462
463%macro SATD_ACCUM 3
464%if HIGH_BIT_DEPTH
465    HADDUW %1, %2
466    paddd  %3, %1
467    pxor   %1, %1
468%endif
469%endmacro
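; SATD_ACCUM %1, %2, %3 (HIGH_BIT_DEPTH only): folds the 16-bit partial sums
; in %1 into dwords with HADDUW, adds them to the dword accumulator %3 and
; clears %1, so the word-level accumulation never runs long enough to
; overflow.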
470
471%macro BACKUP_POINTERS 0
472%if ARCH_X86_64
473%if WIN64
474    PUSH r7
475%endif
476    mov     r6, r0
477    mov     r7, r2
478%endif
479%endmacro
480
481%macro RESTORE_AND_INC_POINTERS 0
482%if ARCH_X86_64
483    lea     r0, [r6+8*SIZEOF_PIXEL]
484    lea     r2, [r7+8*SIZEOF_PIXEL]
485%if WIN64
486    POP r7
487%endif
488%else
489    mov     r0, r0mp
490    mov     r2, r2mp
491    add     r0, 8*SIZEOF_PIXEL
492    add     r2, 8*SIZEOF_PIXEL
493%endif
494%endmacro
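; BACKUP_POINTERS stashes the current pix1/pix2 pointers (in r6/r7 on x86-64,
; preserving r7 on Win64); RESTORE_AND_INC_POINTERS brings them back advanced
; by 8 pixels, i.e. positioned on the right half of a 16-wide block. On x86-32
; the originals are reloaded from the stack arguments instead.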
495
496%macro SATD_4x8_SSE 3-4
497%if HIGH_BIT_DEPTH
498    movh    m0, [r0+0*r1]
499    movh    m4, [r2+0*r3]
500    movh    m1, [r0+1*r1]
501    movh    m5, [r2+1*r3]
502    movhps  m0, [r0+4*r1]
503    movhps  m4, [r2+4*r3]
504    movh    m2, [r0+2*r1]
505    movh    m6, [r2+2*r3]
506    psubw   m0, m4
507    movh    m3, [r0+r4]
508    movh    m4, [r2+r5]
509    lea     r0, [r0+4*r1]
510    lea     r2, [r2+4*r3]
511    movhps  m1, [r0+1*r1]
512    movhps  m5, [r2+1*r3]
513    movhps  m2, [r0+2*r1]
514    movhps  m6, [r2+2*r3]
515    psubw   m1, m5
516    movhps  m3, [r0+r4]
517    movhps  m4, [r2+r5]
518    psubw   m2, m6
519    psubw   m3, m4
520%else ; !HIGH_BIT_DEPTH
521    movd m4, [r2]
522    movd m5, [r2+r3]
523    movd m6, [r2+2*r3]
524    add r2, r5
525    movd m0, [r0]
526    movd m1, [r0+r1]
527    movd m2, [r0+2*r1]
528    add r0, r4
529    movd m3, [r2+r3]
530    JDUP m4, m3
531    movd m3, [r0+r1]
532    JDUP m0, m3
533    movd m3, [r2+2*r3]
534    JDUP m5, m3
535    movd m3, [r0+2*r1]
536    JDUP m1, m3
537%if %1==0 && %2==1
538    mova m3, [hmul_4p]
539    DIFFOP 0, 4, 1, 5, 3
540%else
541    DIFFOP 0, 4, 1, 5, 7
542%endif
543    movd m5, [r2]
544    add r2, r5
545    movd m3, [r0]
546    add r0, r4
547    movd m4, [r2]
548    JDUP m6, m4
549    movd m4, [r0]
550    JDUP m2, m4
551    movd m4, [r2+r3]
552    JDUP m5, m4
553    movd m4, [r0+r1]
554    JDUP m3, m4
555%if %1==0 && %2==1
556    mova m4, [hmul_4p]
557    DIFFOP 2, 6, 3, 5, 4
558%else
559    DIFFOP 2, 6, 3, 5, 7
560%endif
561%endif ; HIGH_BIT_DEPTH
562%if %0 == 4
563    SATD_8x4_1_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3, %4
564%else
565    SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3
566%endif
567%endmacro
568
569;-----------------------------------------------------------------------------
; int pixel_satd_WxH( uint8_t *, intptr_t, uint8_t *, intptr_t )
571;-----------------------------------------------------------------------------
572%macro SATDS_SSE2 0
573%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
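; 'vertical' selects the plain vertical-Hadamard implementation: it is used
; when the pmaddubsw +/-1 trick is unavailable or slow (no SSSE3, or Atom) and
; always for HIGH_BIT_DEPTH, where the inputs are 16-bit words rather than the
; bytes pmaddubsw operates on.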
574
575%if cpuflag(ssse3) && (vertical==0 || HIGH_BIT_DEPTH)
576cglobal pixel_satd_4x4, 4, 6, 6
577    SATD_START_MMX
578    mova m4, [hmul_4p]
579    LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
580    LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
581    LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
582    LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4]
583    DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
584    HADAMARD 0, sumsub, 0, 1, 2, 3
585    HADAMARD 4, sumsub, 0, 1, 2, 3
586    HADAMARD 1, amax, 0, 1, 2, 3
587    HADDW m0, m1
588    movd eax, m0
589    RET
590%endif
591
592cglobal pixel_satd_4x8, 4, 6, 8
593    SATD_START_MMX
594%if vertical==0
595    mova m7, [hmul_4p]
596%endif
597    SATD_4x8_SSE vertical, 0, swap
598%if BIT_DEPTH == 12
599    HADDD m7, m1
600%else
601    HADDUW m7, m1
602%endif
603    movd eax, m7
604    RET
605
606cglobal pixel_satd_4x16, 4, 6, 8
607    SATD_START_MMX
608%if vertical==0
609    mova m7, [hmul_4p]
610%endif
611    SATD_4x8_SSE vertical, 0, swap
612    lea r0, [r0+r1*2*SIZEOF_PIXEL]
613    lea r2, [r2+r3*2*SIZEOF_PIXEL]
614    SATD_4x8_SSE vertical, 1, add
615%if BIT_DEPTH == 12
616    HADDD m7, m1
617%else
618    HADDUW m7, m1
619%endif
620    movd eax, m7
621    RET
622
623cglobal pixel_satd_8x8_internal
624    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
625    SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
626%%pixel_satd_8x4_internal:
627    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
628    SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
629    ret
630
631cglobal pixel_satd_8x8_internal2
632%if WIN64
633    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
634    SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 12, 13
635%%pixel_satd_8x4_internal2:
636    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
637    SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 12, 13
638%else
639    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
640    SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 4, 5
641%%pixel_satd_8x4_internal2:
642    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
643    SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 4, 5
644%endif
645    ret
646
; 16x8 regresses on Phenom Win64, 16x16 is almost the same (too many spilled registers)
648; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge)
649%if HIGH_BIT_DEPTH == 0 && (WIN64 || UNIX64) && notcpuflag(avx)
650
651cglobal pixel_satd_16x4_internal2
652    LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
653    lea  r2, [r2+4*r3]
654    lea  r0, [r0+4*r1]
655    SATD_8x4_1_SSE 0, 0, 1, 2, 3, 6, 11, 10, 12, 13
656    SATD_8x4_1_SSE 0, 4, 8, 5, 9, 6, 3, 10, 12, 13
657    ret
658
659cglobal pixel_satd_16x4, 4,6,14
660    SATD_START_SSE2 m10, m7
661%if vertical
662    mova m7, [pw_00ff]
663%endif
664    call pixel_satd_16x4_internal2
665    HADDD m10, m0
666    movd eax, m10
667    RET
668
669cglobal pixel_satd_16x8, 4,6,14
670    SATD_START_SSE2 m10, m7
671%if vertical
672    mova m7, [pw_00ff]
673%endif
674    jmp %%pixel_satd_16x8_internal
675
676cglobal pixel_satd_16x12, 4,6,14
677    SATD_START_SSE2 m10, m7
678%if vertical
679    mova m7, [pw_00ff]
680%endif
681    call pixel_satd_16x4_internal2
682    jmp %%pixel_satd_16x8_internal
683
684cglobal pixel_satd_16x32, 4,6,14
685    SATD_START_SSE2 m10, m7
686%if vertical
687    mova m7, [pw_00ff]
688%endif
689    call pixel_satd_16x4_internal2
690    call pixel_satd_16x4_internal2
691    call pixel_satd_16x4_internal2
692    call pixel_satd_16x4_internal2
693    call pixel_satd_16x4_internal2
694    call pixel_satd_16x4_internal2
695    jmp %%pixel_satd_16x8_internal
696
697cglobal pixel_satd_16x64, 4,6,14
698    SATD_START_SSE2 m10, m7
699%if vertical
700    mova m7, [pw_00ff]
701%endif
702    call pixel_satd_16x4_internal2
703    call pixel_satd_16x4_internal2
704    call pixel_satd_16x4_internal2
705    call pixel_satd_16x4_internal2
706    call pixel_satd_16x4_internal2
707    call pixel_satd_16x4_internal2
708    call pixel_satd_16x4_internal2
709    call pixel_satd_16x4_internal2
710    call pixel_satd_16x4_internal2
711    call pixel_satd_16x4_internal2
712    call pixel_satd_16x4_internal2
713    call pixel_satd_16x4_internal2
714    call pixel_satd_16x4_internal2
715    call pixel_satd_16x4_internal2
716    jmp %%pixel_satd_16x8_internal
717
718cglobal pixel_satd_16x16, 4,6,14
719    SATD_START_SSE2 m10, m7
720%if vertical
721    mova m7, [pw_00ff]
722%endif
723    call pixel_satd_16x4_internal2
724    call pixel_satd_16x4_internal2
725%%pixel_satd_16x8_internal:
726    call pixel_satd_16x4_internal2
727    call pixel_satd_16x4_internal2
728    HADDD m10, m0
729    movd eax, m10
730    RET
731
732cglobal pixel_satd_32x8, 4,8,14    ;if WIN64 && notcpuflag(avx)
733    SATD_START_SSE2 m10, m7
734    mov r6, r0
735    mov r7, r2
736%if vertical
737    mova m7, [pw_00ff]
738%endif
739    call pixel_satd_16x4_internal2
740    call pixel_satd_16x4_internal2
741    lea r0, [r6 + 16]
742    lea r2, [r7 + 16]
743    call pixel_satd_16x4_internal2
744    call pixel_satd_16x4_internal2
745    HADDD m10, m0
746    movd eax, m10
747    RET
748
749cglobal pixel_satd_32x16, 4,8,14    ;if WIN64 && notcpuflag(avx)
750    SATD_START_SSE2 m10, m7
751    mov r6, r0
752    mov r7, r2
753%if vertical
754    mova m7, [pw_00ff]
755%endif
756    call pixel_satd_16x4_internal2
757    call pixel_satd_16x4_internal2
758    call pixel_satd_16x4_internal2
759    call pixel_satd_16x4_internal2
760    lea r0, [r6 + 16]
761    lea r2, [r7 + 16]
762    call pixel_satd_16x4_internal2
763    call pixel_satd_16x4_internal2
764    call pixel_satd_16x4_internal2
765    call pixel_satd_16x4_internal2
766    HADDD m10, m0
767    movd    eax, m10
768    RET
769
770cglobal pixel_satd_32x24, 4,8,14    ;if WIN64 && notcpuflag(avx)
771    SATD_START_SSE2 m10, m7
772    mov r6, r0
773    mov r7, r2
774%if vertical
775    mova m7, [pw_00ff]
776%endif
777    call pixel_satd_16x4_internal2
778    call pixel_satd_16x4_internal2
779    call pixel_satd_16x4_internal2
780    call pixel_satd_16x4_internal2
781    call pixel_satd_16x4_internal2
782    call pixel_satd_16x4_internal2
783    lea r0, [r6 + 16]
784    lea r2, [r7 + 16]
785    call pixel_satd_16x4_internal2
786    call pixel_satd_16x4_internal2
787    call pixel_satd_16x4_internal2
788    call pixel_satd_16x4_internal2
789    call pixel_satd_16x4_internal2
790    call pixel_satd_16x4_internal2
791    HADDD m10, m0
792    movd eax, m10
793    RET
794
795cglobal pixel_satd_32x32, 4,8,14    ;if WIN64 && notcpuflag(avx)
796    SATD_START_SSE2 m10, m7
797    mov r6, r0
798    mov r7, r2
799%if vertical
800    mova m7, [pw_00ff]
801%endif
802    call pixel_satd_16x4_internal2
803    call pixel_satd_16x4_internal2
804    call pixel_satd_16x4_internal2
805    call pixel_satd_16x4_internal2
806    call pixel_satd_16x4_internal2
807    call pixel_satd_16x4_internal2
808    call pixel_satd_16x4_internal2
809    call pixel_satd_16x4_internal2
810    lea r0, [r6 + 16]
811    lea r2, [r7 + 16]
812    call pixel_satd_16x4_internal2
813    call pixel_satd_16x4_internal2
814    call pixel_satd_16x4_internal2
815    call pixel_satd_16x4_internal2
816    call pixel_satd_16x4_internal2
817    call pixel_satd_16x4_internal2
818    call pixel_satd_16x4_internal2
819    call pixel_satd_16x4_internal2
820    HADDD m10, m0
821    movd eax, m10
822    RET
823
824cglobal pixel_satd_32x64, 4,8,14    ;if WIN64 && notcpuflag(avx)
825    SATD_START_SSE2 m10, m7
826    mov r6, r0
827    mov r7, r2
828%if vertical
829    mova m7, [pw_00ff]
830%endif
831    call pixel_satd_16x4_internal2
832    call pixel_satd_16x4_internal2
833    call pixel_satd_16x4_internal2
834    call pixel_satd_16x4_internal2
835    call pixel_satd_16x4_internal2
836    call pixel_satd_16x4_internal2
837    call pixel_satd_16x4_internal2
838    call pixel_satd_16x4_internal2
839    call pixel_satd_16x4_internal2
840    call pixel_satd_16x4_internal2
841    call pixel_satd_16x4_internal2
842    call pixel_satd_16x4_internal2
843    call pixel_satd_16x4_internal2
844    call pixel_satd_16x4_internal2
845    call pixel_satd_16x4_internal2
846    call pixel_satd_16x4_internal2
847    lea r0, [r6 + 16]
848    lea r2, [r7 + 16]
849    call pixel_satd_16x4_internal2
850    call pixel_satd_16x4_internal2
851    call pixel_satd_16x4_internal2
852    call pixel_satd_16x4_internal2
853    call pixel_satd_16x4_internal2
854    call pixel_satd_16x4_internal2
855    call pixel_satd_16x4_internal2
856    call pixel_satd_16x4_internal2
857    call pixel_satd_16x4_internal2
858    call pixel_satd_16x4_internal2
859    call pixel_satd_16x4_internal2
860    call pixel_satd_16x4_internal2
861    call pixel_satd_16x4_internal2
862    call pixel_satd_16x4_internal2
863    call pixel_satd_16x4_internal2
864    call pixel_satd_16x4_internal2
865    HADDD m10, m0
866    movd eax, m10
867    RET
868
869cglobal pixel_satd_48x64, 4,8,14    ;if WIN64 && notcpuflag(avx)
870    SATD_START_SSE2 m10, m7
871    mov r6, r0
872    mov r7, r2
873%if vertical
874    mova m7, [pw_00ff]
875%endif
876    call pixel_satd_16x4_internal2
877    call pixel_satd_16x4_internal2
878    call pixel_satd_16x4_internal2
879    call pixel_satd_16x4_internal2
880    call pixel_satd_16x4_internal2
881    call pixel_satd_16x4_internal2
882    call pixel_satd_16x4_internal2
883    call pixel_satd_16x4_internal2
884    call pixel_satd_16x4_internal2
885    call pixel_satd_16x4_internal2
886    call pixel_satd_16x4_internal2
887    call pixel_satd_16x4_internal2
888    call pixel_satd_16x4_internal2
889    call pixel_satd_16x4_internal2
890    call pixel_satd_16x4_internal2
891    call pixel_satd_16x4_internal2
892    lea r0, [r6 + 16]
893    lea r2, [r7 + 16]
894    call pixel_satd_16x4_internal2
895    call pixel_satd_16x4_internal2
896    call pixel_satd_16x4_internal2
897    call pixel_satd_16x4_internal2
898    call pixel_satd_16x4_internal2
899    call pixel_satd_16x4_internal2
900    call pixel_satd_16x4_internal2
901    call pixel_satd_16x4_internal2
902    call pixel_satd_16x4_internal2
903    call pixel_satd_16x4_internal2
904    call pixel_satd_16x4_internal2
905    call pixel_satd_16x4_internal2
906    call pixel_satd_16x4_internal2
907    call pixel_satd_16x4_internal2
908    call pixel_satd_16x4_internal2
909    call pixel_satd_16x4_internal2
910    lea r0, [r6 + 32]
911    lea r2, [r7 + 32]
912    call pixel_satd_16x4_internal2
913    call pixel_satd_16x4_internal2
914    call pixel_satd_16x4_internal2
915    call pixel_satd_16x4_internal2
916    call pixel_satd_16x4_internal2
917    call pixel_satd_16x4_internal2
918    call pixel_satd_16x4_internal2
919    call pixel_satd_16x4_internal2
920    call pixel_satd_16x4_internal2
921    call pixel_satd_16x4_internal2
922    call pixel_satd_16x4_internal2
923    call pixel_satd_16x4_internal2
924    call pixel_satd_16x4_internal2
925    call pixel_satd_16x4_internal2
926    call pixel_satd_16x4_internal2
927    call pixel_satd_16x4_internal2
928    HADDD m10, m0
929    movd eax, m10
930    RET
931
932cglobal pixel_satd_64x16, 4,8,14    ;if WIN64 && notcpuflag(avx)
933    SATD_START_SSE2 m10, m7
934    mov r6, r0
935    mov r7, r2
936%if vertical
937    mova m7, [pw_00ff]
938%endif
939    call pixel_satd_16x4_internal2
940    call pixel_satd_16x4_internal2
941    call pixel_satd_16x4_internal2
942    call pixel_satd_16x4_internal2
943    lea r0, [r6 + 16]
944    lea r2, [r7 + 16]
945    call pixel_satd_16x4_internal2
946    call pixel_satd_16x4_internal2
947    call pixel_satd_16x4_internal2
948    call pixel_satd_16x4_internal2
949    lea r0, [r6 + 32]
950    lea r2, [r7 + 32]
951    call pixel_satd_16x4_internal2
952    call pixel_satd_16x4_internal2
953    call pixel_satd_16x4_internal2
954    call pixel_satd_16x4_internal2
955    lea r0, [r6 + 48]
956    lea r2, [r7 + 48]
957    call pixel_satd_16x4_internal2
958    call pixel_satd_16x4_internal2
959    call pixel_satd_16x4_internal2
960    call pixel_satd_16x4_internal2
961    HADDD m10, m0
962    movd eax, m10
963    RET
964
965cglobal pixel_satd_64x32, 4,8,14    ;if WIN64 && notcpuflag(avx)
966    SATD_START_SSE2 m10, m7
967    mov r6, r0
968    mov r7, r2
969%if vertical
970    mova m7, [pw_00ff]
971%endif
972    call pixel_satd_16x4_internal2
973    call pixel_satd_16x4_internal2
974    call pixel_satd_16x4_internal2
975    call pixel_satd_16x4_internal2
976    call pixel_satd_16x4_internal2
977    call pixel_satd_16x4_internal2
978    call pixel_satd_16x4_internal2
979    call pixel_satd_16x4_internal2
980    lea r0, [r6 + 16]
981    lea r2, [r7 + 16]
982    call pixel_satd_16x4_internal2
983    call pixel_satd_16x4_internal2
984    call pixel_satd_16x4_internal2
985    call pixel_satd_16x4_internal2
986    call pixel_satd_16x4_internal2
987    call pixel_satd_16x4_internal2
988    call pixel_satd_16x4_internal2
989    call pixel_satd_16x4_internal2
990    lea r0, [r6 + 32]
991    lea r2, [r7 + 32]
992    call pixel_satd_16x4_internal2
993    call pixel_satd_16x4_internal2
994    call pixel_satd_16x4_internal2
995    call pixel_satd_16x4_internal2
996    call pixel_satd_16x4_internal2
997    call pixel_satd_16x4_internal2
998    call pixel_satd_16x4_internal2
999    call pixel_satd_16x4_internal2
1000    lea r0, [r6 + 48]
1001    lea r2, [r7 + 48]
1002    call pixel_satd_16x4_internal2
1003    call pixel_satd_16x4_internal2
1004    call pixel_satd_16x4_internal2
1005    call pixel_satd_16x4_internal2
1006    call pixel_satd_16x4_internal2
1007    call pixel_satd_16x4_internal2
1008    call pixel_satd_16x4_internal2
1009    call pixel_satd_16x4_internal2
1010
1011    HADDD m10, m0
1012    movd eax, m10
1013    RET
1014
1015cglobal pixel_satd_64x48, 4,8,14    ;if WIN64 && notcpuflag(avx)
1016    SATD_START_SSE2 m10, m7
1017    mov r6, r0
1018    mov r7, r2
1019%if vertical
1020    mova m7, [pw_00ff]
1021%endif
1022    call pixel_satd_16x4_internal2
1023    call pixel_satd_16x4_internal2
1024    call pixel_satd_16x4_internal2
1025    call pixel_satd_16x4_internal2
1026    call pixel_satd_16x4_internal2
1027    call pixel_satd_16x4_internal2
1028    call pixel_satd_16x4_internal2
1029    call pixel_satd_16x4_internal2
1030    call pixel_satd_16x4_internal2
1031    call pixel_satd_16x4_internal2
1032    call pixel_satd_16x4_internal2
1033    call pixel_satd_16x4_internal2
1034    lea r0, [r6 + 16]
1035    lea r2, [r7 + 16]
1036    call pixel_satd_16x4_internal2
1037    call pixel_satd_16x4_internal2
1038    call pixel_satd_16x4_internal2
1039    call pixel_satd_16x4_internal2
1040    call pixel_satd_16x4_internal2
1041    call pixel_satd_16x4_internal2
1042    call pixel_satd_16x4_internal2
1043    call pixel_satd_16x4_internal2
1044    call pixel_satd_16x4_internal2
1045    call pixel_satd_16x4_internal2
1046    call pixel_satd_16x4_internal2
1047    call pixel_satd_16x4_internal2
1048    lea r0, [r6 + 32]
1049    lea r2, [r7 + 32]
1050    call pixel_satd_16x4_internal2
1051    call pixel_satd_16x4_internal2
1052    call pixel_satd_16x4_internal2
1053    call pixel_satd_16x4_internal2
1054    call pixel_satd_16x4_internal2
1055    call pixel_satd_16x4_internal2
1056    call pixel_satd_16x4_internal2
1057    call pixel_satd_16x4_internal2
1058    call pixel_satd_16x4_internal2
1059    call pixel_satd_16x4_internal2
1060    call pixel_satd_16x4_internal2
1061    call pixel_satd_16x4_internal2
1062    lea r0, [r6 + 48]
1063    lea r2, [r7 + 48]
1064    call pixel_satd_16x4_internal2
1065    call pixel_satd_16x4_internal2
1066    call pixel_satd_16x4_internal2
1067    call pixel_satd_16x4_internal2
1068    call pixel_satd_16x4_internal2
1069    call pixel_satd_16x4_internal2
1070    call pixel_satd_16x4_internal2
1071    call pixel_satd_16x4_internal2
1072    call pixel_satd_16x4_internal2
1073    call pixel_satd_16x4_internal2
1074    call pixel_satd_16x4_internal2
1075    call pixel_satd_16x4_internal2
1076
1077    HADDD m10, m0
1078    movd eax, m10
1079    RET
1080
1081cglobal pixel_satd_64x64, 4,8,14    ;if WIN64 && notcpuflag(avx)
1082    SATD_START_SSE2 m10, m7
1083    mov r6, r0
1084    mov r7, r2
1085%if vertical
1086    mova m7, [pw_00ff]
1087%endif
1088    call pixel_satd_16x4_internal2
1089    call pixel_satd_16x4_internal2
1090    call pixel_satd_16x4_internal2
1091    call pixel_satd_16x4_internal2
1092    call pixel_satd_16x4_internal2
1093    call pixel_satd_16x4_internal2
1094    call pixel_satd_16x4_internal2
1095    call pixel_satd_16x4_internal2
1096    call pixel_satd_16x4_internal2
1097    call pixel_satd_16x4_internal2
1098    call pixel_satd_16x4_internal2
1099    call pixel_satd_16x4_internal2
1100    call pixel_satd_16x4_internal2
1101    call pixel_satd_16x4_internal2
1102    call pixel_satd_16x4_internal2
1103    call pixel_satd_16x4_internal2
1104    lea r0, [r6 + 16]
1105    lea r2, [r7 + 16]
1106    call pixel_satd_16x4_internal2
1107    call pixel_satd_16x4_internal2
1108    call pixel_satd_16x4_internal2
1109    call pixel_satd_16x4_internal2
1110    call pixel_satd_16x4_internal2
1111    call pixel_satd_16x4_internal2
1112    call pixel_satd_16x4_internal2
1113    call pixel_satd_16x4_internal2
1114    call pixel_satd_16x4_internal2
1115    call pixel_satd_16x4_internal2
1116    call pixel_satd_16x4_internal2
1117    call pixel_satd_16x4_internal2
1118    call pixel_satd_16x4_internal2
1119    call pixel_satd_16x4_internal2
1120    call pixel_satd_16x4_internal2
1121    call pixel_satd_16x4_internal2
1122    lea r0, [r6 + 32]
1123    lea r2, [r7 + 32]
1124    call pixel_satd_16x4_internal2
1125    call pixel_satd_16x4_internal2
1126    call pixel_satd_16x4_internal2
1127    call pixel_satd_16x4_internal2
1128    call pixel_satd_16x4_internal2
1129    call pixel_satd_16x4_internal2
1130    call pixel_satd_16x4_internal2
1131    call pixel_satd_16x4_internal2
1132    call pixel_satd_16x4_internal2
1133    call pixel_satd_16x4_internal2
1134    call pixel_satd_16x4_internal2
1135    call pixel_satd_16x4_internal2
1136    call pixel_satd_16x4_internal2
1137    call pixel_satd_16x4_internal2
1138    call pixel_satd_16x4_internal2
1139    call pixel_satd_16x4_internal2
1140    lea r0, [r6 + 48]
1141    lea r2, [r7 + 48]
1142    call pixel_satd_16x4_internal2
1143    call pixel_satd_16x4_internal2
1144    call pixel_satd_16x4_internal2
1145    call pixel_satd_16x4_internal2
1146    call pixel_satd_16x4_internal2
1147    call pixel_satd_16x4_internal2
1148    call pixel_satd_16x4_internal2
1149    call pixel_satd_16x4_internal2
1150    call pixel_satd_16x4_internal2
1151    call pixel_satd_16x4_internal2
1152    call pixel_satd_16x4_internal2
1153    call pixel_satd_16x4_internal2
1154    call pixel_satd_16x4_internal2
1155    call pixel_satd_16x4_internal2
1156    call pixel_satd_16x4_internal2
1157    call pixel_satd_16x4_internal2
1158
1159    HADDD m10, m0
1160    movd eax, m10
1161    RET
1162
1163%else
1164%if WIN64
1165cglobal pixel_satd_16x24, 4,8,14    ;if WIN64 && cpuflag(avx)
1166    SATD_START_SSE2 m6, m7
1167    mov r6, r0
1168    mov r7, r2
1169    call pixel_satd_8x8_internal2
1170    call pixel_satd_8x8_internal2
1171    call pixel_satd_8x8_internal2
1172    lea r0, [r6 + 8*SIZEOF_PIXEL]
1173    lea r2, [r7 + 8*SIZEOF_PIXEL]
1174    call pixel_satd_8x8_internal2
1175    call pixel_satd_8x8_internal2
1176    call pixel_satd_8x8_internal2
1177    HADDD m6, m0
1178    movd   eax, m6
1179    RET
1180%else
1181cglobal pixel_satd_16x24, 4,7,8,0-gprsize    ;if !WIN64
1182    SATD_START_SSE2 m6, m7
1183    mov r6, r0
1184    mov [rsp], r2
1185    call pixel_satd_8x8_internal2
1186    call pixel_satd_8x8_internal2
1187    call pixel_satd_8x8_internal2
1188    lea r0, [r6 + 8*SIZEOF_PIXEL]
1189    mov r2, [rsp]
1190    add r2, 8*SIZEOF_PIXEL
1191    call pixel_satd_8x8_internal2
1192    call pixel_satd_8x8_internal2
1193    call pixel_satd_8x8_internal2
1194    HADDD m6, m0
1195    movd eax, m6
1196    RET
1197%endif
1198%if WIN64
1199cglobal pixel_satd_32x48, 4,8,14    ;if WIN64 && cpuflag(avx)
1200    SATD_START_SSE2 m6, m7
1201    mov r6, r0
1202    mov r7, r2
1203    call pixel_satd_8x8_internal2
1204    call pixel_satd_8x8_internal2
1205    call pixel_satd_8x8_internal2
1206    call pixel_satd_8x8_internal2
1207    call pixel_satd_8x8_internal2
1208    call pixel_satd_8x8_internal2
1209    lea r0, [r6 + 8*SIZEOF_PIXEL]
1210    lea r2, [r7 + 8*SIZEOF_PIXEL]
1211    call pixel_satd_8x8_internal2
1212    call pixel_satd_8x8_internal2
1213    call pixel_satd_8x8_internal2
1214    call pixel_satd_8x8_internal2
1215    call pixel_satd_8x8_internal2
1216    call pixel_satd_8x8_internal2
1217    lea r0, [r6 + 16*SIZEOF_PIXEL]
1218    lea r2, [r7 + 16*SIZEOF_PIXEL]
1219    call pixel_satd_8x8_internal2
1220    call pixel_satd_8x8_internal2
1221    call pixel_satd_8x8_internal2
1222    call pixel_satd_8x8_internal2
1223    call pixel_satd_8x8_internal2
1224    call pixel_satd_8x8_internal2
1225    lea r0, [r6 + 24*SIZEOF_PIXEL]
1226    lea r2, [r7 + 24*SIZEOF_PIXEL]
1227    call pixel_satd_8x8_internal2
1228    call pixel_satd_8x8_internal2
1229    call pixel_satd_8x8_internal2
1230    call pixel_satd_8x8_internal2
1231    call pixel_satd_8x8_internal2
1232    call pixel_satd_8x8_internal2
1233    HADDD m6, m0
1234    movd eax, m6
1235    RET
1236%else
1237cglobal pixel_satd_32x48, 4,7,8,0-gprsize    ;if !WIN64
1238    SATD_START_SSE2 m6, m7
1239    mov r6, r0
1240    mov [rsp], r2
1241    call pixel_satd_8x8_internal2
1242    call pixel_satd_8x8_internal2
1243    call pixel_satd_8x8_internal2
1244    call pixel_satd_8x8_internal2
1245    call pixel_satd_8x8_internal2
1246    call pixel_satd_8x8_internal2
1247    lea r0, [r6 + 8*SIZEOF_PIXEL]
1248    mov r2, [rsp]
1249    add r2, 8*SIZEOF_PIXEL
1250    call pixel_satd_8x8_internal2
1251    call pixel_satd_8x8_internal2
1252    call pixel_satd_8x8_internal2
1253    call pixel_satd_8x8_internal2
1254    call pixel_satd_8x8_internal2
1255    call pixel_satd_8x8_internal2
1256    lea r0, [r6 + 16*SIZEOF_PIXEL]
1257    mov r2, [rsp]
1258    add r2, 16*SIZEOF_PIXEL
1259    call pixel_satd_8x8_internal2
1260    call pixel_satd_8x8_internal2
1261    call pixel_satd_8x8_internal2
1262    call pixel_satd_8x8_internal2
1263    call pixel_satd_8x8_internal2
1264    call pixel_satd_8x8_internal2
1265    lea r0, [r6 + 24*SIZEOF_PIXEL]
1266    mov r2, [rsp]
1267    add r2, 24*SIZEOF_PIXEL
1268    call pixel_satd_8x8_internal2
1269    call pixel_satd_8x8_internal2
1270    call pixel_satd_8x8_internal2
1271    call pixel_satd_8x8_internal2
1272    call pixel_satd_8x8_internal2
1273    call pixel_satd_8x8_internal2
1274    HADDD m6, m0
1275    movd eax, m6
1276    RET
1277%endif
1278
1279%if WIN64
1280cglobal pixel_satd_24x64, 4,8,14    ;if WIN64 && cpuflag(avx)
1281    SATD_START_SSE2 m6, m7
1282    mov r6, r0
1283    mov r7, r2
1284    call pixel_satd_8x8_internal2
1285    call pixel_satd_8x8_internal2
1286    call pixel_satd_8x8_internal2
1287    call pixel_satd_8x8_internal2
1288    call pixel_satd_8x8_internal2
1289    call pixel_satd_8x8_internal2
1290    call pixel_satd_8x8_internal2
1291    call pixel_satd_8x8_internal2
1292    lea r0, [r6 + 8*SIZEOF_PIXEL]
1293    lea r2, [r7 + 8*SIZEOF_PIXEL]
1294    call pixel_satd_8x8_internal2
1295    call pixel_satd_8x8_internal2
1296    call pixel_satd_8x8_internal2
1297    call pixel_satd_8x8_internal2
1298    call pixel_satd_8x8_internal2
1299    call pixel_satd_8x8_internal2
1300    call pixel_satd_8x8_internal2
1301    call pixel_satd_8x8_internal2
1302    lea r0, [r6 + 16*SIZEOF_PIXEL]
1303    lea r2, [r7 + 16*SIZEOF_PIXEL]
1304    call pixel_satd_8x8_internal2
1305    call pixel_satd_8x8_internal2
1306    call pixel_satd_8x8_internal2
1307    call pixel_satd_8x8_internal2
1308    call pixel_satd_8x8_internal2
1309    call pixel_satd_8x8_internal2
1310    call pixel_satd_8x8_internal2
1311    call pixel_satd_8x8_internal2
1312    HADDD m6, m0
1313    movd eax, m6
1314    RET
1315%else
1316cglobal pixel_satd_24x64, 4,7,8,0-gprsize    ;if !WIN64
1317    SATD_START_SSE2 m6, m7
1318    mov r6, r0
1319    mov [rsp], r2
1320    call pixel_satd_8x8_internal2
1321    call pixel_satd_8x8_internal2
1322    call pixel_satd_8x8_internal2
1323    call pixel_satd_8x8_internal2
1324    call pixel_satd_8x8_internal2
1325    call pixel_satd_8x8_internal2
1326    call pixel_satd_8x8_internal2
1327    call pixel_satd_8x8_internal2
1328    lea r0, [r6 + 8*SIZEOF_PIXEL]
1329    mov r2, [rsp]
1330    add r2, 8*SIZEOF_PIXEL
1331    call pixel_satd_8x8_internal2
1332    call pixel_satd_8x8_internal2
1333    call pixel_satd_8x8_internal2
1334    call pixel_satd_8x8_internal2
1335    call pixel_satd_8x8_internal2
1336    call pixel_satd_8x8_internal2
1337    call pixel_satd_8x8_internal2
1338    call pixel_satd_8x8_internal2
1339    lea r0, [r6 + 16*SIZEOF_PIXEL]
1340    mov r2, [rsp]
1341    add r2, 16*SIZEOF_PIXEL
1342    call pixel_satd_8x8_internal2
1343    call pixel_satd_8x8_internal2
1344    call pixel_satd_8x8_internal2
1345    call pixel_satd_8x8_internal2
1346    call pixel_satd_8x8_internal2
1347    call pixel_satd_8x8_internal2
1348    call pixel_satd_8x8_internal2
1349    call pixel_satd_8x8_internal2
1350    HADDD m6, m0
1351    movd eax, m6
1352    RET
1353%endif
1354
1355%if WIN64
1356cglobal pixel_satd_8x64, 4,8,14    ;if WIN64 && cpuflag(avx)
1357    SATD_START_SSE2 m6, m7
1358    mov r6, r0
1359    mov r7, r2
1360    call pixel_satd_8x8_internal2
1361    call pixel_satd_8x8_internal2
1362    call pixel_satd_8x8_internal2
1363    call pixel_satd_8x8_internal2
1364    call pixel_satd_8x8_internal2
1365    call pixel_satd_8x8_internal2
1366    call pixel_satd_8x8_internal2
1367    call pixel_satd_8x8_internal2
1368    HADDD m6, m0
1369    movd eax, m6
1370    RET
1371%else
1372cglobal pixel_satd_8x64, 4,7,8,0-gprsize    ;if !WIN64
1373    SATD_START_SSE2 m6, m7
1374    mov r6, r0
1375    mov [rsp], r2
1376    call pixel_satd_8x8_internal2
1377    call pixel_satd_8x8_internal2
1378    call pixel_satd_8x8_internal2
1379    call pixel_satd_8x8_internal2
1380    call pixel_satd_8x8_internal2
1381    call pixel_satd_8x8_internal2
1382    call pixel_satd_8x8_internal2
1383    call pixel_satd_8x8_internal2
1384    HADDD m6, m0
1385    movd eax, m6
1386    RET
1387%endif
1388
1389%if WIN64
1390cglobal pixel_satd_8x12, 4,8,14    ;if WIN64 && cpuflag(avx)
1391    SATD_START_SSE2 m6, m7
1392    mov r6, r0
1393    mov r7, r2
1394    call pixel_satd_8x8_internal2
1395    call %%pixel_satd_8x4_internal2
1396    pxor    m7, m7
1397    movhlps m7, m6
1398    paddd   m6, m7
1399    pshufd  m7, m6, 1
1400    paddd   m6, m7
1401    movd   eax, m6
1402    RET
1403%else
1404cglobal pixel_satd_8x12, 4,7,8,0-gprsize    ;if !WIN64
1405    SATD_START_SSE2 m6, m7
1406    mov r6, r0
1407    mov [rsp], r2
1408    call pixel_satd_8x8_internal2
1409    call %%pixel_satd_8x4_internal2
1410    HADDD m6, m0
1411    movd eax, m6
1412    RET
1413%endif
1414
1415%if HIGH_BIT_DEPTH
1416%if WIN64
1417cglobal pixel_satd_12x32, 4,8,8   ;if WIN64 && cpuflag(avx)
1418    SATD_START_MMX
1419    mov r6, r0
1420    mov r7, r2
1421    pxor m7, m7
1422    SATD_4x8_SSE vertical, 0, 4, 5
1423    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1424    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1425    SATD_4x8_SSE vertical, 1, 4, 5
1426    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1427    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1428    SATD_4x8_SSE vertical, 1, 4, 5
1429    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1430    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1431    SATD_4x8_SSE vertical, 1, 4, 5
1432    lea r0, [r6 + 4*SIZEOF_PIXEL]
1433    lea r2, [r7 + 4*SIZEOF_PIXEL]
1434    SATD_4x8_SSE vertical, 1, 4, 5
1435    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1436    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1437    SATD_4x8_SSE vertical, 1, 4, 5
1438    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1439    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1440    SATD_4x8_SSE vertical, 1, 4, 5
1441    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1442    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1443    SATD_4x8_SSE vertical, 1, 4, 5
1444    lea r0, [r6 + 8*SIZEOF_PIXEL]
1445    lea r2, [r7 + 8*SIZEOF_PIXEL]
1446    SATD_4x8_SSE vertical, 1, 4, 5
1447    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1448    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1449    SATD_4x8_SSE vertical, 1, 4, 5
1450    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1451    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1452    SATD_4x8_SSE vertical, 1, 4, 5
1453    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1454    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1455    SATD_4x8_SSE vertical, 1, 4, 5
1456    HADDD m7, m0
1457    movd eax, m7
1458    RET
1459%else
1460cglobal pixel_satd_12x32, 4,7,8,0-gprsize
1461    SATD_START_MMX
1462    mov r6, r0
1463    mov [rsp], r2
1464    pxor m7, m7
1465    SATD_4x8_SSE vertical, 0, 4, 5
1466    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1467    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1468    SATD_4x8_SSE vertical, 1, 4, 5
1469    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1470    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1471    SATD_4x8_SSE vertical, 1, 4, 5
1472    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1473    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1474    SATD_4x8_SSE vertical, 1, 4, 5
1475    lea r0, [r6 + 4*SIZEOF_PIXEL]
1476    mov r2, [rsp]
1477    add r2, 4*SIZEOF_PIXEL
1478    SATD_4x8_SSE vertical, 1, 4, 5
1479    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1480    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1481    SATD_4x8_SSE vertical, 1, 4, 5
1482    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1483    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1484    SATD_4x8_SSE vertical, 1, 4, 5
1485    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1486    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1487    SATD_4x8_SSE vertical, 1, 4, 5
1488    lea r0, [r6 + 8*SIZEOF_PIXEL]
1489    mov r2, [rsp]
1490    add r2, 8*SIZEOF_PIXEL
1491    SATD_4x8_SSE vertical, 1, 4, 5
1492    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1493    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1494    SATD_4x8_SSE vertical, 1, 4, 5
1495    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1496    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1497    SATD_4x8_SSE vertical, 1, 4, 5
1498    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1499    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1500    SATD_4x8_SSE vertical, 1, 4, 5
1501    HADDD m7, m0
1502    movd eax, m7
1503    RET
1504%endif
1505%else ;HIGH_BIT_DEPTH
1506%if WIN64
1507cglobal pixel_satd_12x32, 4,8,8   ;if WIN64 && cpuflag(avx)
1508    SATD_START_MMX
1509    mov r6, r0
1510    mov r7, r2
1511%if vertical==0
1512    mova m7, [hmul_4p]
1513%endif
1514    SATD_4x8_SSE vertical, 0, swap
1515    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1516    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1517    SATD_4x8_SSE vertical, 1, add
1518    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1519    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1520    SATD_4x8_SSE vertical, 1, add
1521    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1522    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1523    SATD_4x8_SSE vertical, 1, add
1524    lea r0, [r6 + 4*SIZEOF_PIXEL]
1525    lea r2, [r7 + 4*SIZEOF_PIXEL]
1526    SATD_4x8_SSE vertical, 1, add
1527    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1528    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1529    SATD_4x8_SSE vertical, 1, add
1530    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1531    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1532    SATD_4x8_SSE vertical, 1, add
1533    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1534    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1535    SATD_4x8_SSE vertical, 1, add
1536    lea r0, [r6 + 8*SIZEOF_PIXEL]
1537    lea r2, [r7 + 8*SIZEOF_PIXEL]
1538    SATD_4x8_SSE vertical, 1, add
1539    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1540    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1541    SATD_4x8_SSE vertical, 1, add
1542    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1543    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1544    SATD_4x8_SSE vertical, 1, add
1545    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1546    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1547    SATD_4x8_SSE vertical, 1, add
1548    HADDW m7, m1
1549    movd eax, m7
1550    RET
1551%else
1552cglobal pixel_satd_12x32, 4,7,8,0-gprsize
1553    SATD_START_MMX
1554    mov r6, r0
1555    mov [rsp], r2
1556%if vertical==0
1557    mova m7, [hmul_4p]
1558%endif
1559    SATD_4x8_SSE vertical, 0, swap
1560    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1561    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1562    SATD_4x8_SSE vertical, 1, add
1563    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1564    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1565    SATD_4x8_SSE vertical, 1, add
1566    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1567    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1568    SATD_4x8_SSE vertical, 1, add
1569    lea r0, [r6 + 4*SIZEOF_PIXEL]
1570    mov r2, [rsp]
1571    add r2, 4*SIZEOF_PIXEL
1572    SATD_4x8_SSE vertical, 1, add
1573    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1574    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1575    SATD_4x8_SSE vertical, 1, add
1576    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1577    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1578    SATD_4x8_SSE vertical, 1, add
1579    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1580    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1581    SATD_4x8_SSE vertical, 1, add
1582    lea r0, [r6 + 8*SIZEOF_PIXEL]
1583    mov r2, [rsp]
1584    add r2, 8*SIZEOF_PIXEL
1585    SATD_4x8_SSE vertical, 1, add
1586    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1587    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1588    SATD_4x8_SSE vertical, 1, add
1589    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1590    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1591    SATD_4x8_SSE vertical, 1, add
1592    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1593    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1594    SATD_4x8_SSE vertical, 1, add
1595    HADDW m7, m1
1596    movd eax, m7
1597    RET
1598%endif
1599%endif
1600
1601%if HIGH_BIT_DEPTH
1602%if WIN64
1603cglobal pixel_satd_4x32, 4,8,8   ;if WIN64 && cpuflag(avx)
1604    SATD_START_MMX
1605    mov r6, r0
1606    mov r7, r2
1607    pxor m7, m7
1608    SATD_4x8_SSE vertical, 0, 4, 5
1609    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1610    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1611    SATD_4x8_SSE vertical, 1, 4, 5
1612    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1613    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1614    SATD_4x8_SSE vertical, 1, 4, 5
1615    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1616    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1617    SATD_4x8_SSE vertical, 1, 4, 5
1618    HADDD m7, m0
1619    movd eax, m7
1620    RET
1621%else
1622cglobal pixel_satd_4x32, 4,7,8,0-gprsize
1623    SATD_START_MMX
1624    mov r6, r0
1625    mov [rsp], r2
1626    pxor m7, m7
1627    SATD_4x8_SSE vertical, 0, 4, 5
1628    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1629    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1630    SATD_4x8_SSE vertical, 1, 4, 5
1631    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1632    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1633    SATD_4x8_SSE vertical, 1, 4, 5
1634    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1635    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1636    SATD_4x8_SSE vertical, 1, 4, 5
1637    pxor    m1, m1
1638    movhlps m1, m7
1639    paddd   m7, m1
1640    pshufd  m1, m7, 1
1641    paddd   m7, m1
1642    movd   eax, m7
1643    RET
1644%endif
1645%else
1646%if WIN64
1647cglobal pixel_satd_4x32, 4,8,8   ;if WIN64 && cpuflag(avx)
1648    SATD_START_MMX
1649    mov r6, r0
1650    mov r7, r2
1651%if vertical==0
1652    mova m7, [hmul_4p]
1653%endif
1654    SATD_4x8_SSE vertical, 0, swap
1655    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1656    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1657    SATD_4x8_SSE vertical, 1, add
1658    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1659    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1660    SATD_4x8_SSE vertical, 1, add
1661    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1662    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1663    SATD_4x8_SSE vertical, 1, add
1664    HADDW m7, m1
1665    movd eax, m7
1666    RET
1667%else
1668cglobal pixel_satd_4x32, 4,7,8,0-gprsize
1669    SATD_START_MMX
1670    mov r6, r0
1671    mov [rsp], r2
1672%if vertical==0
1673    mova m7, [hmul_4p]
1674%endif
1675    SATD_4x8_SSE vertical, 0, swap
1676    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1677    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1678    SATD_4x8_SSE vertical, 1, add
1679    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1680    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1681    SATD_4x8_SSE vertical, 1, add
1682    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
1683    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
1684    SATD_4x8_SSE vertical, 1, add
1685    HADDW m7, m1
1686    movd eax, m7
1687    RET
1688%endif
1689%endif
1690
1691%if WIN64
1692cglobal pixel_satd_32x8, 4,8,14    ;if WIN64 && cpuflag(avx)
1693    SATD_START_SSE2 m6, m7
1694    mov r6, r0
1695    mov r7, r2
1696    call pixel_satd_8x8_internal2
1697    lea r0, [r6 + 8*SIZEOF_PIXEL]
1698    lea r2, [r7 + 8*SIZEOF_PIXEL]
1699    call pixel_satd_8x8_internal2
1700    lea r0, [r6 + 16*SIZEOF_PIXEL]
1701    lea r2, [r7 + 16*SIZEOF_PIXEL]
1702    call pixel_satd_8x8_internal2
1703    lea r0, [r6 + 24*SIZEOF_PIXEL]
1704    lea r2, [r7 + 24*SIZEOF_PIXEL]
1705    call pixel_satd_8x8_internal2
1706    HADDD m6, m0
1707    movd eax, m6
1708    RET
1709%else
1710cglobal pixel_satd_32x8, 4,7,8,0-gprsize    ;if !WIN64
1711    SATD_START_SSE2 m6, m7
1712    mov r6, r0
1713    mov [rsp], r2
1714    call pixel_satd_8x8_internal2
1715    lea r0, [r6 + 8*SIZEOF_PIXEL]
1716    mov r2, [rsp]
1717    add r2, 8*SIZEOF_PIXEL
1718    call pixel_satd_8x8_internal2
1719    lea r0, [r6 + 16*SIZEOF_PIXEL]
1720    mov r2, [rsp]
1721    add r2, 16*SIZEOF_PIXEL
1722    call pixel_satd_8x8_internal2
1723    lea r0, [r6 + 24*SIZEOF_PIXEL]
1724    mov r2, [rsp]
1725    add r2, 24*SIZEOF_PIXEL
1726    call pixel_satd_8x8_internal2
1727    HADDD m6, m0
1728    movd eax, m6
1729    RET
1730%endif
1731
1732%if WIN64
1733cglobal pixel_satd_32x16, 4,8,14    ;if WIN64 && cpuflag(avx)
1734    SATD_START_SSE2 m6, m7
1735    mov r6, r0
1736    mov r7, r2
1737    call pixel_satd_8x8_internal2
1738    call pixel_satd_8x8_internal2
1739    lea r0, [r6 + 8*SIZEOF_PIXEL]
1740    lea r2, [r7 + 8*SIZEOF_PIXEL]
1741    call pixel_satd_8x8_internal2
1742    call pixel_satd_8x8_internal2
1743    lea r0, [r6 + 16*SIZEOF_PIXEL]
1744    lea r2, [r7 + 16*SIZEOF_PIXEL]
1745    call pixel_satd_8x8_internal2
1746    call pixel_satd_8x8_internal2
1747    lea r0, [r6 + 24*SIZEOF_PIXEL]
1748    lea r2, [r7 + 24*SIZEOF_PIXEL]
1749    call pixel_satd_8x8_internal2
1750    call pixel_satd_8x8_internal2
1751    HADDD m6, m0
1752    movd eax, m6
1753    RET
1754%else
1755cglobal pixel_satd_32x16, 4,7,8,0-gprsize   ;if !WIN64
1756    SATD_START_SSE2 m6, m7
1757    mov r6, r0
1758    mov [rsp], r2
1759    call pixel_satd_8x8_internal2
1760    call pixel_satd_8x8_internal2
1761    lea r0, [r6 + 8*SIZEOF_PIXEL]
1762    mov r2, [rsp]
1763    add r2, 8*SIZEOF_PIXEL
1764    call pixel_satd_8x8_internal2
1765    call pixel_satd_8x8_internal2
1766    lea r0, [r6 + 16*SIZEOF_PIXEL]
1767    mov r2, [rsp]
1768    add r2, 16*SIZEOF_PIXEL
1769    call pixel_satd_8x8_internal2
1770    call pixel_satd_8x8_internal2
1771    lea r0, [r6 + 24*SIZEOF_PIXEL]
1772    mov r2, [rsp]
1773    add r2, 24*SIZEOF_PIXEL
1774    call pixel_satd_8x8_internal2
1775    call pixel_satd_8x8_internal2
1776    HADDD m6, m0
1777    movd eax, m6
1778    RET
1779%endif
1780
1781%if WIN64
1782cglobal pixel_satd_32x24, 4,8,14    ;if WIN64 && cpuflag(avx)
1783    SATD_START_SSE2 m6, m7
1784    mov r6, r0
1785    mov r7, r2
1786    call pixel_satd_8x8_internal2
1787    call pixel_satd_8x8_internal2
1788    call pixel_satd_8x8_internal2
1789    lea r0, [r6 + 8*SIZEOF_PIXEL]
1790    lea r2, [r7 + 8*SIZEOF_PIXEL]
1791    call pixel_satd_8x8_internal2
1792    call pixel_satd_8x8_internal2
1793    call pixel_satd_8x8_internal2
1794    lea r0, [r6 + 16*SIZEOF_PIXEL]
1795    lea r2, [r7 + 16*SIZEOF_PIXEL]
1796    call pixel_satd_8x8_internal2
1797    call pixel_satd_8x8_internal2
1798    call pixel_satd_8x8_internal2
1799    lea r0, [r6 + 24*SIZEOF_PIXEL]
1800    lea r2, [r7 + 24*SIZEOF_PIXEL]
1801    call pixel_satd_8x8_internal2
1802    call pixel_satd_8x8_internal2
1803    call pixel_satd_8x8_internal2
1804    HADDD m6, m0
1805    movd eax, m6
1806    RET
1807%else
1808cglobal pixel_satd_32x24, 4,7,8,0-gprsize   ;if !WIN64
1809    SATD_START_SSE2 m6, m7
1810    mov r6, r0
1811    mov [rsp], r2
1812    call pixel_satd_8x8_internal2
1813    call pixel_satd_8x8_internal2
1814    call pixel_satd_8x8_internal2
1815    lea r0, [r6 + 8*SIZEOF_PIXEL]
1816    mov r2, [rsp]
1817    add r2, 8*SIZEOF_PIXEL
1818    call pixel_satd_8x8_internal2
1819    call pixel_satd_8x8_internal2
1820    call pixel_satd_8x8_internal2
1821    lea r0, [r6 + 16*SIZEOF_PIXEL]
1822    mov r2, [rsp]
1823    add r2, 16*SIZEOF_PIXEL
1824    call pixel_satd_8x8_internal2
1825    call pixel_satd_8x8_internal2
1826    call pixel_satd_8x8_internal2
1827    lea r0, [r6 + 24*SIZEOF_PIXEL]
1828    mov r2, [rsp]
1829    add r2, 24*SIZEOF_PIXEL
1830    call pixel_satd_8x8_internal2
1831    call pixel_satd_8x8_internal2
1832    call pixel_satd_8x8_internal2
1833    HADDD m6, m0
1834    movd eax, m6
1835    RET
1836%endif
1837
1838%if WIN64
1839cglobal pixel_satd_32x32, 4,8,14    ;if WIN64 && cpuflag(avx)
1840    SATD_START_SSE2 m6, m7
1841    mov r6, r0
1842    mov r7, r2
1843    call pixel_satd_8x8_internal2
1844    call pixel_satd_8x8_internal2
1845    call pixel_satd_8x8_internal2
1846    call pixel_satd_8x8_internal2
1847    lea r0, [r6 + 8*SIZEOF_PIXEL]
1848    lea r2, [r7 + 8*SIZEOF_PIXEL]
1849    call pixel_satd_8x8_internal2
1850    call pixel_satd_8x8_internal2
1851    call pixel_satd_8x8_internal2
1852    call pixel_satd_8x8_internal2
1853    lea r0, [r6 + 16*SIZEOF_PIXEL]
1854    lea r2, [r7 + 16*SIZEOF_PIXEL]
1855    call pixel_satd_8x8_internal2
1856    call pixel_satd_8x8_internal2
1857    call pixel_satd_8x8_internal2
1858    call pixel_satd_8x8_internal2
1859    lea r0, [r6 + 24*SIZEOF_PIXEL]
1860    lea r2, [r7 + 24*SIZEOF_PIXEL]
1861    call pixel_satd_8x8_internal2
1862    call pixel_satd_8x8_internal2
1863    call pixel_satd_8x8_internal2
1864    call pixel_satd_8x8_internal2
1865    HADDD m6, m0
1866    movd eax, m6
1867    RET
1868%else
1869cglobal pixel_satd_32x32, 4,7,8,0-gprsize   ;if !WIN64
1870    SATD_START_SSE2 m6, m7
1871    mov r6, r0
1872    mov [rsp], r2
1873    call pixel_satd_8x8_internal2
1874    call pixel_satd_8x8_internal2
1875    call pixel_satd_8x8_internal2
1876    call pixel_satd_8x8_internal2
1877    lea r0, [r6 + 8*SIZEOF_PIXEL]
1878    mov r2, [rsp]
1879    add r2, 8*SIZEOF_PIXEL
1880    call pixel_satd_8x8_internal2
1881    call pixel_satd_8x8_internal2
1882    call pixel_satd_8x8_internal2
1883    call pixel_satd_8x8_internal2
1884    lea r0, [r6 + 16*SIZEOF_PIXEL]
1885    mov r2, [rsp]
1886    add r2, 16*SIZEOF_PIXEL
1887    call pixel_satd_8x8_internal2
1888    call pixel_satd_8x8_internal2
1889    call pixel_satd_8x8_internal2
1890    call pixel_satd_8x8_internal2
1891    lea r0, [r6 + 24*SIZEOF_PIXEL]
1892    mov r2, [rsp]
1893    add r2, 24*SIZEOF_PIXEL
1894    call pixel_satd_8x8_internal2
1895    call pixel_satd_8x8_internal2
1896    call pixel_satd_8x8_internal2
1897    call pixel_satd_8x8_internal2
1898    HADDD m6, m0
1899    movd eax, m6
1900    RET
1901%endif
1902
1903%if WIN64
1904cglobal pixel_satd_32x64, 4,8,14    ;if WIN64 && cpuflag(avx)
1905    SATD_START_SSE2 m6, m7
1906    mov r6, r0
1907    mov r7, r2
1908    call pixel_satd_8x8_internal2
1909    call pixel_satd_8x8_internal2
1910    call pixel_satd_8x8_internal2
1911    call pixel_satd_8x8_internal2
1912    call pixel_satd_8x8_internal2
1913    call pixel_satd_8x8_internal2
1914    call pixel_satd_8x8_internal2
1915    call pixel_satd_8x8_internal2
1916    lea r0, [r6 + 8*SIZEOF_PIXEL]
1917    lea r2, [r7 + 8*SIZEOF_PIXEL]
1918    call pixel_satd_8x8_internal2
1919    call pixel_satd_8x8_internal2
1920    call pixel_satd_8x8_internal2
1921    call pixel_satd_8x8_internal2
1922    call pixel_satd_8x8_internal2
1923    call pixel_satd_8x8_internal2
1924    call pixel_satd_8x8_internal2
1925    call pixel_satd_8x8_internal2
1926    lea r0, [r6 + 16*SIZEOF_PIXEL]
1927    lea r2, [r7 + 16*SIZEOF_PIXEL]
1928    call pixel_satd_8x8_internal2
1929    call pixel_satd_8x8_internal2
1930    call pixel_satd_8x8_internal2
1931    call pixel_satd_8x8_internal2
1932    call pixel_satd_8x8_internal2
1933    call pixel_satd_8x8_internal2
1934    call pixel_satd_8x8_internal2
1935    call pixel_satd_8x8_internal2
1936    lea r0, [r6 + 24*SIZEOF_PIXEL]
1937    lea r2, [r7 + 24*SIZEOF_PIXEL]
1938    call pixel_satd_8x8_internal2
1939    call pixel_satd_8x8_internal2
1940    call pixel_satd_8x8_internal2
1941    call pixel_satd_8x8_internal2
1942    call pixel_satd_8x8_internal2
1943    call pixel_satd_8x8_internal2
1944    call pixel_satd_8x8_internal2
1945    call pixel_satd_8x8_internal2
1946    HADDD m6, m0
1947    movd eax, m6
1948    RET
1949%else
1950cglobal pixel_satd_32x64, 4,7,8,0-gprsize   ;if !WIN64
1951    SATD_START_SSE2 m6, m7
1952    mov r6, r0
1953    mov [rsp], r2
1954    call pixel_satd_8x8_internal2
1955    call pixel_satd_8x8_internal2
1956    call pixel_satd_8x8_internal2
1957    call pixel_satd_8x8_internal2
1958    call pixel_satd_8x8_internal2
1959    call pixel_satd_8x8_internal2
1960    call pixel_satd_8x8_internal2
1961    call pixel_satd_8x8_internal2
1962    lea r0, [r6 + 8*SIZEOF_PIXEL]
1963    mov r2, [rsp]
1964    add r2, 8*SIZEOF_PIXEL
1965    call pixel_satd_8x8_internal2
1966    call pixel_satd_8x8_internal2
1967    call pixel_satd_8x8_internal2
1968    call pixel_satd_8x8_internal2
1969    call pixel_satd_8x8_internal2
1970    call pixel_satd_8x8_internal2
1971    call pixel_satd_8x8_internal2
1972    call pixel_satd_8x8_internal2
1973    lea r0, [r6 + 16*SIZEOF_PIXEL]
1974    mov r2, [rsp]
1975    add r2, 16*SIZEOF_PIXEL
1976    call pixel_satd_8x8_internal2
1977    call pixel_satd_8x8_internal2
1978    call pixel_satd_8x8_internal2
1979    call pixel_satd_8x8_internal2
1980    call pixel_satd_8x8_internal2
1981    call pixel_satd_8x8_internal2
1982    call pixel_satd_8x8_internal2
1983    call pixel_satd_8x8_internal2
1984    lea r0, [r6 + 24*SIZEOF_PIXEL]
1985    mov r2, [rsp]
1986    add r2, 24*SIZEOF_PIXEL
1987    call pixel_satd_8x8_internal2
1988    call pixel_satd_8x8_internal2
1989    call pixel_satd_8x8_internal2
1990    call pixel_satd_8x8_internal2
1991    call pixel_satd_8x8_internal2
1992    call pixel_satd_8x8_internal2
1993    call pixel_satd_8x8_internal2
1994    call pixel_satd_8x8_internal2
1995    HADDD m6, m0
1996    movd eax, m6
1997    RET
1998%endif
1999
2000%if WIN64
2001cglobal pixel_satd_48x64, 4,8,14    ;if WIN64 && cpuflag(avx)
2002    SATD_START_SSE2 m6, m7
2003    mov r6, r0
2004    mov r7, r2
2005    call pixel_satd_8x8_internal2
2006    call pixel_satd_8x8_internal2
2007    call pixel_satd_8x8_internal2
2008    call pixel_satd_8x8_internal2
2009    call pixel_satd_8x8_internal2
2010    call pixel_satd_8x8_internal2
2011    call pixel_satd_8x8_internal2
2012    call pixel_satd_8x8_internal2
2013    lea r0, [r6 + 8*SIZEOF_PIXEL]
2014    lea r2, [r7 + 8*SIZEOF_PIXEL]
2015    call pixel_satd_8x8_internal2
2016    call pixel_satd_8x8_internal2
2017    call pixel_satd_8x8_internal2
2018    call pixel_satd_8x8_internal2
2019    call pixel_satd_8x8_internal2
2020    call pixel_satd_8x8_internal2
2021    call pixel_satd_8x8_internal2
2022    call pixel_satd_8x8_internal2
2023    lea r0, [r6 + 16*SIZEOF_PIXEL]
2024    lea r2, [r7 + 16*SIZEOF_PIXEL]
2025    call pixel_satd_8x8_internal2
2026    call pixel_satd_8x8_internal2
2027    call pixel_satd_8x8_internal2
2028    call pixel_satd_8x8_internal2
2029    call pixel_satd_8x8_internal2
2030    call pixel_satd_8x8_internal2
2031    call pixel_satd_8x8_internal2
2032    call pixel_satd_8x8_internal2
2033    lea r0, [r6 + 24*SIZEOF_PIXEL]
2034    lea r2, [r7 + 24*SIZEOF_PIXEL]
2035    call pixel_satd_8x8_internal2
2036    call pixel_satd_8x8_internal2
2037    call pixel_satd_8x8_internal2
2038    call pixel_satd_8x8_internal2
2039    call pixel_satd_8x8_internal2
2040    call pixel_satd_8x8_internal2
2041    call pixel_satd_8x8_internal2
2042    call pixel_satd_8x8_internal2
2043    lea r0, [r6 + 32*SIZEOF_PIXEL]
2044    lea r2, [r7 + 32*SIZEOF_PIXEL]
2045    call pixel_satd_8x8_internal2
2046    call pixel_satd_8x8_internal2
2047    call pixel_satd_8x8_internal2
2048    call pixel_satd_8x8_internal2
2049    call pixel_satd_8x8_internal2
2050    call pixel_satd_8x8_internal2
2051    call pixel_satd_8x8_internal2
2052    call pixel_satd_8x8_internal2
2053    lea r0, [r6 + 40*SIZEOF_PIXEL]
2054    lea r2, [r7 + 40*SIZEOF_PIXEL]
2055    call pixel_satd_8x8_internal2
2056    call pixel_satd_8x8_internal2
2057    call pixel_satd_8x8_internal2
2058    call pixel_satd_8x8_internal2
2059    call pixel_satd_8x8_internal2
2060    call pixel_satd_8x8_internal2
2061    call pixel_satd_8x8_internal2
2062    call pixel_satd_8x8_internal2
2063    HADDD m6, m0
2064    movd eax, m6
2065    RET
2066%else
2067cglobal pixel_satd_48x64, 4,7,8,0-gprsize   ;if !WIN64
2068    SATD_START_SSE2 m6, m7
2069    mov r6, r0
2070    mov [rsp], r2
2071    call pixel_satd_8x8_internal2
2072    call pixel_satd_8x8_internal2
2073    call pixel_satd_8x8_internal2
2074    call pixel_satd_8x8_internal2
2075    call pixel_satd_8x8_internal2
2076    call pixel_satd_8x8_internal2
2077    call pixel_satd_8x8_internal2
2078    call pixel_satd_8x8_internal2
2079    lea r0, [r6 + 8*SIZEOF_PIXEL]
2080    mov r2, [rsp]
    add r2, 8*SIZEOF_PIXEL
2082    call pixel_satd_8x8_internal2
2083    call pixel_satd_8x8_internal2
2084    call pixel_satd_8x8_internal2
2085    call pixel_satd_8x8_internal2
2086    call pixel_satd_8x8_internal2
2087    call pixel_satd_8x8_internal2
2088    call pixel_satd_8x8_internal2
2089    call pixel_satd_8x8_internal2
2090    lea r0, [r6 + 16*SIZEOF_PIXEL]
2091    mov r2, [rsp]
    add r2, 16*SIZEOF_PIXEL
2093    call pixel_satd_8x8_internal2
2094    call pixel_satd_8x8_internal2
2095    call pixel_satd_8x8_internal2
2096    call pixel_satd_8x8_internal2
2097    call pixel_satd_8x8_internal2
2098    call pixel_satd_8x8_internal2
2099    call pixel_satd_8x8_internal2
2100    call pixel_satd_8x8_internal2
2101    lea r0, [r6 + 24*SIZEOF_PIXEL]
2102    mov r2, [rsp]
    add r2, 24*SIZEOF_PIXEL
2104    call pixel_satd_8x8_internal2
2105    call pixel_satd_8x8_internal2
2106    call pixel_satd_8x8_internal2
2107    call pixel_satd_8x8_internal2
2108    call pixel_satd_8x8_internal2
2109    call pixel_satd_8x8_internal2
2110    call pixel_satd_8x8_internal2
2111    call pixel_satd_8x8_internal2
2112    lea r0, [r6 + 32*SIZEOF_PIXEL]
2113    mov r2, [rsp]
    add r2, 32*SIZEOF_PIXEL
2115    call pixel_satd_8x8_internal2
2116    call pixel_satd_8x8_internal2
2117    call pixel_satd_8x8_internal2
2118    call pixel_satd_8x8_internal2
2119    call pixel_satd_8x8_internal2
2120    call pixel_satd_8x8_internal2
2121    call pixel_satd_8x8_internal2
2122    call pixel_satd_8x8_internal2
2123    lea r0, [r6 + 40*SIZEOF_PIXEL]
2124    mov r2, [rsp]
    add r2, 40*SIZEOF_PIXEL
2126    call pixel_satd_8x8_internal2
2127    call pixel_satd_8x8_internal2
2128    call pixel_satd_8x8_internal2
2129    call pixel_satd_8x8_internal2
2130    call pixel_satd_8x8_internal2
2131    call pixel_satd_8x8_internal2
2132    call pixel_satd_8x8_internal2
2133    call pixel_satd_8x8_internal2
2134    HADDD m6, m0
2135    movd eax, m6
2136    RET
2137%endif
2138
2139
2140%if WIN64
2141cglobal pixel_satd_64x16, 4,8,14    ;if WIN64 && cpuflag(avx)
2142    SATD_START_SSE2 m6, m7
2143    mov r6, r0
2144    mov r7, r2
2145    call pixel_satd_8x8_internal2
2146    call pixel_satd_8x8_internal2
2147    lea r0, [r6 + 8*SIZEOF_PIXEL]
2148    lea r2, [r7 + 8*SIZEOF_PIXEL]
2149    call pixel_satd_8x8_internal2
2150    call pixel_satd_8x8_internal2
2151    lea r0, [r6 + 16*SIZEOF_PIXEL]
2152    lea r2, [r7 + 16*SIZEOF_PIXEL]
2153    call pixel_satd_8x8_internal2
2154    call pixel_satd_8x8_internal2
2155    lea r0, [r6 + 24*SIZEOF_PIXEL]
2156    lea r2, [r7 + 24*SIZEOF_PIXEL]
2157    call pixel_satd_8x8_internal2
2158    call pixel_satd_8x8_internal2
2159    lea r0, [r6 + 32*SIZEOF_PIXEL]
2160    lea r2, [r7 + 32*SIZEOF_PIXEL]
2161    call pixel_satd_8x8_internal2
2162    call pixel_satd_8x8_internal2
2163    lea r0, [r6 + 40*SIZEOF_PIXEL]
2164    lea r2, [r7 + 40*SIZEOF_PIXEL]
2165    call pixel_satd_8x8_internal2
2166    call pixel_satd_8x8_internal2
2167    lea r0, [r6 + 48*SIZEOF_PIXEL]
2168    lea r2, [r7 + 48*SIZEOF_PIXEL]
2169    call pixel_satd_8x8_internal2
2170    call pixel_satd_8x8_internal2
2171    lea r0, [r6 + 56*SIZEOF_PIXEL]
2172    lea r2, [r7 + 56*SIZEOF_PIXEL]
2173    call pixel_satd_8x8_internal2
2174    call pixel_satd_8x8_internal2
2175    HADDD m6, m0
2176    movd eax, m6
2177    RET
2178%else
2179cglobal pixel_satd_64x16, 4,7,8,0-gprsize   ;if !WIN64
2180    SATD_START_SSE2 m6, m7
2181    mov r6, r0
2182    mov [rsp], r2
2183    call pixel_satd_8x8_internal2
2184    call pixel_satd_8x8_internal2
2185    lea r0, [r6 + 8*SIZEOF_PIXEL]
2186    mov r2, [rsp]
    add r2, 8*SIZEOF_PIXEL
2188    call pixel_satd_8x8_internal2
2189    call pixel_satd_8x8_internal2
2190    lea r0, [r6 + 16*SIZEOF_PIXEL]
2191    mov r2, [rsp]
    add r2, 16*SIZEOF_PIXEL
2193    call pixel_satd_8x8_internal2
2194    call pixel_satd_8x8_internal2
2195    lea r0, [r6 + 24*SIZEOF_PIXEL]
2196    mov r2, [rsp]
    add r2, 24*SIZEOF_PIXEL
2198    call pixel_satd_8x8_internal2
2199    call pixel_satd_8x8_internal2
2200    lea r0, [r6 + 32*SIZEOF_PIXEL]
2201    mov r2, [rsp]
    add r2, 32*SIZEOF_PIXEL
2203    call pixel_satd_8x8_internal2
2204    call pixel_satd_8x8_internal2
2205    lea r0, [r6 + 40*SIZEOF_PIXEL]
2206    mov r2, [rsp]
    add r2, 40*SIZEOF_PIXEL
2208    call pixel_satd_8x8_internal2
2209    call pixel_satd_8x8_internal2
2210    lea r0, [r6 + 48*SIZEOF_PIXEL]
2211    mov r2, [rsp]
    add r2, 48*SIZEOF_PIXEL
2213    call pixel_satd_8x8_internal2
2214    call pixel_satd_8x8_internal2
2215    lea r0, [r6 + 56*SIZEOF_PIXEL]
2216    mov r2, [rsp]
    add r2, 56*SIZEOF_PIXEL
2218    call pixel_satd_8x8_internal2
2219    call pixel_satd_8x8_internal2
2220    HADDD m6, m0
2221    movd eax, m6
2222    RET
2223%endif
2224
2225%if WIN64
2226cglobal pixel_satd_64x32, 4,8,14    ;if WIN64 && cpuflag(avx)
2227    SATD_START_SSE2 m6, m7
2228    mov r6, r0
2229    mov r7, r2
2230    call pixel_satd_8x8_internal2
2231    call pixel_satd_8x8_internal2
2232    call pixel_satd_8x8_internal2
2233    call pixel_satd_8x8_internal2
2234    lea r0, [r6 + 8*SIZEOF_PIXEL]
2235    lea r2, [r7 + 8*SIZEOF_PIXEL]
2236    call pixel_satd_8x8_internal2
2237    call pixel_satd_8x8_internal2
2238    call pixel_satd_8x8_internal2
2239    call pixel_satd_8x8_internal2
2240    lea r0, [r6 + 16*SIZEOF_PIXEL]
2241    lea r2, [r7 + 16*SIZEOF_PIXEL]
2242    call pixel_satd_8x8_internal2
2243    call pixel_satd_8x8_internal2
2244    call pixel_satd_8x8_internal2
2245    call pixel_satd_8x8_internal2
2246    lea r0, [r6 + 24*SIZEOF_PIXEL]
2247    lea r2, [r7 + 24*SIZEOF_PIXEL]
2248    call pixel_satd_8x8_internal2
2249    call pixel_satd_8x8_internal2
2250    call pixel_satd_8x8_internal2
2251    call pixel_satd_8x8_internal2
2252    lea r0, [r6 + 32*SIZEOF_PIXEL]
2253    lea r2, [r7 + 32*SIZEOF_PIXEL]
2254    call pixel_satd_8x8_internal2
2255    call pixel_satd_8x8_internal2
2256    call pixel_satd_8x8_internal2
2257    call pixel_satd_8x8_internal2
2258    lea r0, [r6 + 40*SIZEOF_PIXEL]
2259    lea r2, [r7 + 40*SIZEOF_PIXEL]
2260    call pixel_satd_8x8_internal2
2261    call pixel_satd_8x8_internal2
2262    call pixel_satd_8x8_internal2
2263    call pixel_satd_8x8_internal2
2264    lea r0, [r6 + 48*SIZEOF_PIXEL]
2265    lea r2, [r7 + 48*SIZEOF_PIXEL]
2266    call pixel_satd_8x8_internal2
2267    call pixel_satd_8x8_internal2
2268    call pixel_satd_8x8_internal2
2269    call pixel_satd_8x8_internal2
2270    lea r0, [r6 + 56*SIZEOF_PIXEL]
2271    lea r2, [r7 + 56*SIZEOF_PIXEL]
2272    call pixel_satd_8x8_internal2
2273    call pixel_satd_8x8_internal2
2274    call pixel_satd_8x8_internal2
2275    call pixel_satd_8x8_internal2
2276    HADDD m6, m0
2277    movd eax, m6
2278    RET
2279%else
2280cglobal pixel_satd_64x32, 4,7,8,0-gprsize   ;if !WIN64
2281    SATD_START_SSE2 m6, m7
2282    mov r6, r0
2283    mov [rsp], r2
2284    call pixel_satd_8x8_internal2
2285    call pixel_satd_8x8_internal2
2286    call pixel_satd_8x8_internal2
2287    call pixel_satd_8x8_internal2
2288    lea r0, [r6 + 8*SIZEOF_PIXEL]
2289    mov r2, [rsp]
2290    add r2, 8*SIZEOF_PIXEL
2291    call pixel_satd_8x8_internal2
2292    call pixel_satd_8x8_internal2
2293    call pixel_satd_8x8_internal2
2294    call pixel_satd_8x8_internal2
2295    lea r0, [r6 + 16*SIZEOF_PIXEL]
2296    mov r2, [rsp]
2297    add r2, 16*SIZEOF_PIXEL
2298    call pixel_satd_8x8_internal2
2299    call pixel_satd_8x8_internal2
2300    call pixel_satd_8x8_internal2
2301    call pixel_satd_8x8_internal2
2302    lea r0, [r6 + 24*SIZEOF_PIXEL]
2303    mov r2, [rsp]
2304    add r2, 24*SIZEOF_PIXEL
2305    call pixel_satd_8x8_internal2
2306    call pixel_satd_8x8_internal2
2307    call pixel_satd_8x8_internal2
2308    call pixel_satd_8x8_internal2
2309    lea r0, [r6 + 32*SIZEOF_PIXEL]
2310    mov r2, [rsp]
2311    add r2, 32*SIZEOF_PIXEL
2312    call pixel_satd_8x8_internal2
2313    call pixel_satd_8x8_internal2
2314    call pixel_satd_8x8_internal2
2315    call pixel_satd_8x8_internal2
2316    lea r0, [r6 + 40*SIZEOF_PIXEL]
2317    mov r2, [rsp]
2318    add r2, 40*SIZEOF_PIXEL
2319    call pixel_satd_8x8_internal2
2320    call pixel_satd_8x8_internal2
2321    call pixel_satd_8x8_internal2
2322    call pixel_satd_8x8_internal2
2323    lea r0, [r6 + 48*SIZEOF_PIXEL]
2324    mov r2, [rsp]
2325    add r2, 48*SIZEOF_PIXEL
2326    call pixel_satd_8x8_internal2
2327    call pixel_satd_8x8_internal2
2328    call pixel_satd_8x8_internal2
2329    call pixel_satd_8x8_internal2
2330    lea r0, [r6 + 56*SIZEOF_PIXEL]
2331    mov r2, [rsp]
2332    add r2, 56*SIZEOF_PIXEL
2333    call pixel_satd_8x8_internal2
2334    call pixel_satd_8x8_internal2
2335    call pixel_satd_8x8_internal2
2336    call pixel_satd_8x8_internal2
2337    HADDD m6, m0
2338    movd eax, m6
2339    RET
2340%endif
2341
2342%if WIN64
2343cglobal pixel_satd_64x48, 4,8,14    ;if WIN64 && cpuflag(avx)
2344    SATD_START_SSE2 m6, m7
2345    mov r6, r0
2346    mov r7, r2
2347    call pixel_satd_8x8_internal2
2348    call pixel_satd_8x8_internal2
2349    call pixel_satd_8x8_internal2
2350    call pixel_satd_8x8_internal2
2351    call pixel_satd_8x8_internal2
2352    call pixel_satd_8x8_internal2
2353    lea r0, [r6 + 8*SIZEOF_PIXEL]
2354    lea r2, [r7 + 8*SIZEOF_PIXEL]
2355    call pixel_satd_8x8_internal2
2356    call pixel_satd_8x8_internal2
2357    call pixel_satd_8x8_internal2
2358    call pixel_satd_8x8_internal2
2359    call pixel_satd_8x8_internal2
2360    call pixel_satd_8x8_internal2
2361    lea r0, [r6 + 16*SIZEOF_PIXEL]
2362    lea r2, [r7 + 16*SIZEOF_PIXEL]
2363    call pixel_satd_8x8_internal2
2364    call pixel_satd_8x8_internal2
2365    call pixel_satd_8x8_internal2
2366    call pixel_satd_8x8_internal2
2367    call pixel_satd_8x8_internal2
2368    call pixel_satd_8x8_internal2
2369    lea r0, [r6 + 24*SIZEOF_PIXEL]
2370    lea r2, [r7 + 24*SIZEOF_PIXEL]
2371    call pixel_satd_8x8_internal2
2372    call pixel_satd_8x8_internal2
2373    call pixel_satd_8x8_internal2
2374    call pixel_satd_8x8_internal2
2375    call pixel_satd_8x8_internal2
2376    call pixel_satd_8x8_internal2
2377    lea r0, [r6 + 32*SIZEOF_PIXEL]
2378    lea r2, [r7 + 32*SIZEOF_PIXEL]
2379    call pixel_satd_8x8_internal2
2380    call pixel_satd_8x8_internal2
2381    call pixel_satd_8x8_internal2
2382    call pixel_satd_8x8_internal2
2383    call pixel_satd_8x8_internal2
2384    call pixel_satd_8x8_internal2
2385    lea r0, [r6 + 40*SIZEOF_PIXEL]
2386    lea r2, [r7 + 40*SIZEOF_PIXEL]
2387    call pixel_satd_8x8_internal2
2388    call pixel_satd_8x8_internal2
2389    call pixel_satd_8x8_internal2
2390    call pixel_satd_8x8_internal2
2391    call pixel_satd_8x8_internal2
2392    call pixel_satd_8x8_internal2
2393    lea r0, [r6 + 48*SIZEOF_PIXEL]
2394    lea r2, [r7 + 48*SIZEOF_PIXEL]
2395    call pixel_satd_8x8_internal2
2396    call pixel_satd_8x8_internal2
2397    call pixel_satd_8x8_internal2
2398    call pixel_satd_8x8_internal2
2399    call pixel_satd_8x8_internal2
2400    call pixel_satd_8x8_internal2
2401    lea r0, [r6 + 56*SIZEOF_PIXEL]
2402    lea r2, [r7 + 56*SIZEOF_PIXEL]
2403    call pixel_satd_8x8_internal2
2404    call pixel_satd_8x8_internal2
2405    call pixel_satd_8x8_internal2
2406    call pixel_satd_8x8_internal2
2407    call pixel_satd_8x8_internal2
2408    call pixel_satd_8x8_internal2
2409    HADDD m6, m0
2410    movd eax, m6
2411    RET
2412%else
2413cglobal pixel_satd_64x48, 4,7,8,0-gprsize   ;if !WIN64
2414    SATD_START_SSE2 m6, m7
2415    mov r6, r0
2416    mov [rsp], r2
2417    call pixel_satd_8x8_internal2
2418    call pixel_satd_8x8_internal2
2419    call pixel_satd_8x8_internal2
2420    call pixel_satd_8x8_internal2
2421    call pixel_satd_8x8_internal2
2422    call pixel_satd_8x8_internal2
2423    lea r0, [r6 + 8*SIZEOF_PIXEL]
2424    mov r2, [rsp]
2425    add r2, 8*SIZEOF_PIXEL
2426    call pixel_satd_8x8_internal2
2427    call pixel_satd_8x8_internal2
2428    call pixel_satd_8x8_internal2
2429    call pixel_satd_8x8_internal2
2430    call pixel_satd_8x8_internal2
2431    call pixel_satd_8x8_internal2
2432    lea r0, [r6 + 16*SIZEOF_PIXEL]
2433    mov r2, [rsp]
2434    add r2, 16*SIZEOF_PIXEL
2435    call pixel_satd_8x8_internal2
2436    call pixel_satd_8x8_internal2
2437    call pixel_satd_8x8_internal2
2438    call pixel_satd_8x8_internal2
2439    call pixel_satd_8x8_internal2
2440    call pixel_satd_8x8_internal2
2441    lea r0, [r6 + 24*SIZEOF_PIXEL]
2442    mov r2, [rsp]
2443    add r2, 24*SIZEOF_PIXEL
2444    call pixel_satd_8x8_internal2
2445    call pixel_satd_8x8_internal2
2446    call pixel_satd_8x8_internal2
2447    call pixel_satd_8x8_internal2
2448    call pixel_satd_8x8_internal2
2449    call pixel_satd_8x8_internal2
2450    lea r0, [r6 + 32*SIZEOF_PIXEL]
2451    mov r2, [rsp]
2452    add r2, 32*SIZEOF_PIXEL
2453    call pixel_satd_8x8_internal2
2454    call pixel_satd_8x8_internal2
2455    call pixel_satd_8x8_internal2
2456    call pixel_satd_8x8_internal2
2457    call pixel_satd_8x8_internal2
2458    call pixel_satd_8x8_internal2
2459    lea r0, [r6 + 40*SIZEOF_PIXEL]
2460    mov r2, [rsp]
2461    add r2, 40*SIZEOF_PIXEL
2462    call pixel_satd_8x8_internal2
2463    call pixel_satd_8x8_internal2
2464    call pixel_satd_8x8_internal2
2465    call pixel_satd_8x8_internal2
2466    call pixel_satd_8x8_internal2
2467    call pixel_satd_8x8_internal2
2468    lea r0, [r6 + 48*SIZEOF_PIXEL]
2469    mov r2, [rsp]
2470    add r2, 48*SIZEOF_PIXEL
2471    call pixel_satd_8x8_internal2
2472    call pixel_satd_8x8_internal2
2473    call pixel_satd_8x8_internal2
2474    call pixel_satd_8x8_internal2
2475    call pixel_satd_8x8_internal2
2476    call pixel_satd_8x8_internal2
2477    lea r0, [r6 + 56*SIZEOF_PIXEL]
2478    mov r2, [rsp]
2479    add r2, 56*SIZEOF_PIXEL
2480    call pixel_satd_8x8_internal2
2481    call pixel_satd_8x8_internal2
2482    call pixel_satd_8x8_internal2
2483    call pixel_satd_8x8_internal2
2484    call pixel_satd_8x8_internal2
2485    call pixel_satd_8x8_internal2
2486    HADDD m6, m0
2487    movd eax, m6
2488    RET
2489%endif
2490
2491%if WIN64
2492cglobal pixel_satd_64x64, 4,8,14    ;if WIN64 && cpuflag(avx)
2493    SATD_START_SSE2 m6, m7
2494    mov r6, r0
2495    mov r7, r2
2496    call pixel_satd_8x8_internal2
2497    call pixel_satd_8x8_internal2
2498    call pixel_satd_8x8_internal2
2499    call pixel_satd_8x8_internal2
2500    call pixel_satd_8x8_internal2
2501    call pixel_satd_8x8_internal2
2502    call pixel_satd_8x8_internal2
2503    call pixel_satd_8x8_internal2
2504    lea r0, [r6 + 8*SIZEOF_PIXEL]
2505    lea r2, [r7 + 8*SIZEOF_PIXEL]
2506    call pixel_satd_8x8_internal2
2507    call pixel_satd_8x8_internal2
2508    call pixel_satd_8x8_internal2
2509    call pixel_satd_8x8_internal2
2510    call pixel_satd_8x8_internal2
2511    call pixel_satd_8x8_internal2
2512    call pixel_satd_8x8_internal2
2513    call pixel_satd_8x8_internal2
2514    lea r0, [r6 + 16*SIZEOF_PIXEL]
2515    lea r2, [r7 + 16*SIZEOF_PIXEL]
2516    call pixel_satd_8x8_internal2
2517    call pixel_satd_8x8_internal2
2518    call pixel_satd_8x8_internal2
2519    call pixel_satd_8x8_internal2
2520    call pixel_satd_8x8_internal2
2521    call pixel_satd_8x8_internal2
2522    call pixel_satd_8x8_internal2
2523    call pixel_satd_8x8_internal2
2524    lea r0, [r6 + 24*SIZEOF_PIXEL]
2525    lea r2, [r7 + 24*SIZEOF_PIXEL]
2526    call pixel_satd_8x8_internal2
2527    call pixel_satd_8x8_internal2
2528    call pixel_satd_8x8_internal2
2529    call pixel_satd_8x8_internal2
2530    call pixel_satd_8x8_internal2
2531    call pixel_satd_8x8_internal2
2532    call pixel_satd_8x8_internal2
2533    call pixel_satd_8x8_internal2
2534    lea r0, [r6 + 32*SIZEOF_PIXEL]
2535    lea r2, [r7 + 32*SIZEOF_PIXEL]
2536    call pixel_satd_8x8_internal2
2537    call pixel_satd_8x8_internal2
2538    call pixel_satd_8x8_internal2
2539    call pixel_satd_8x8_internal2
2540    call pixel_satd_8x8_internal2
2541    call pixel_satd_8x8_internal2
2542    call pixel_satd_8x8_internal2
2543    call pixel_satd_8x8_internal2
2544    lea r0, [r6 + 40*SIZEOF_PIXEL]
2545    lea r2, [r7 + 40*SIZEOF_PIXEL]
2546    call pixel_satd_8x8_internal2
2547    call pixel_satd_8x8_internal2
2548    call pixel_satd_8x8_internal2
2549    call pixel_satd_8x8_internal2
2550    call pixel_satd_8x8_internal2
2551    call pixel_satd_8x8_internal2
2552    call pixel_satd_8x8_internal2
2553    call pixel_satd_8x8_internal2
2554    lea r0, [r6 + 48*SIZEOF_PIXEL]
2555    lea r2, [r7 + 48*SIZEOF_PIXEL]
2556    call pixel_satd_8x8_internal2
2557    call pixel_satd_8x8_internal2
2558    call pixel_satd_8x8_internal2
2559    call pixel_satd_8x8_internal2
2560    call pixel_satd_8x8_internal2
2561    call pixel_satd_8x8_internal2
2562    call pixel_satd_8x8_internal2
2563    call pixel_satd_8x8_internal2
2564    lea r0, [r6 + 56*SIZEOF_PIXEL]
2565    lea r2, [r7 + 56*SIZEOF_PIXEL]
2566    call pixel_satd_8x8_internal2
2567    call pixel_satd_8x8_internal2
2568    call pixel_satd_8x8_internal2
2569    call pixel_satd_8x8_internal2
2570    call pixel_satd_8x8_internal2
2571    call pixel_satd_8x8_internal2
2572    call pixel_satd_8x8_internal2
2573    call pixel_satd_8x8_internal2
2574    HADDD m6, m0
2575    movd eax, m6
2576    RET
2577%else
2578cglobal pixel_satd_64x64, 4,7,8,0-gprsize   ;if !WIN64
2579    SATD_START_SSE2 m6, m7
2580    mov r6, r0
2581    mov [rsp], r2
2582    call pixel_satd_8x8_internal2
2583    call pixel_satd_8x8_internal2
2584    call pixel_satd_8x8_internal2
2585    call pixel_satd_8x8_internal2
2586    call pixel_satd_8x8_internal2
2587    call pixel_satd_8x8_internal2
2588    call pixel_satd_8x8_internal2
2589    call pixel_satd_8x8_internal2
2590    lea r0, [r6 + 8*SIZEOF_PIXEL]
2591    mov r2, [rsp]
2592    add r2, 8*SIZEOF_PIXEL
2593    call pixel_satd_8x8_internal2
2594    call pixel_satd_8x8_internal2
2595    call pixel_satd_8x8_internal2
2596    call pixel_satd_8x8_internal2
2597    call pixel_satd_8x8_internal2
2598    call pixel_satd_8x8_internal2
2599    call pixel_satd_8x8_internal2
2600    call pixel_satd_8x8_internal2
2601    lea r0, [r6 + 16*SIZEOF_PIXEL]
2602    mov r2, [rsp]
2603    add r2, 16*SIZEOF_PIXEL
2604    call pixel_satd_8x8_internal2
2605    call pixel_satd_8x8_internal2
2606    call pixel_satd_8x8_internal2
2607    call pixel_satd_8x8_internal2
2608    call pixel_satd_8x8_internal2
2609    call pixel_satd_8x8_internal2
2610    call pixel_satd_8x8_internal2
2611    call pixel_satd_8x8_internal2
2612    lea r0, [r6 + 24*SIZEOF_PIXEL]
2613    mov r2, [rsp]
2614    add r2, 24*SIZEOF_PIXEL
2615    call pixel_satd_8x8_internal2
2616    call pixel_satd_8x8_internal2
2617    call pixel_satd_8x8_internal2
2618    call pixel_satd_8x8_internal2
2619    call pixel_satd_8x8_internal2
2620    call pixel_satd_8x8_internal2
2621    call pixel_satd_8x8_internal2
2622    call pixel_satd_8x8_internal2
2623    lea r0, [r6 + 32*SIZEOF_PIXEL]
2624    mov r2, [rsp]
2625    add r2, 32*SIZEOF_PIXEL
2626    call pixel_satd_8x8_internal2
2627    call pixel_satd_8x8_internal2
2628    call pixel_satd_8x8_internal2
2629    call pixel_satd_8x8_internal2
2630    call pixel_satd_8x8_internal2
2631    call pixel_satd_8x8_internal2
2632    call pixel_satd_8x8_internal2
2633    call pixel_satd_8x8_internal2
2634    lea r0, [r6 + 40*SIZEOF_PIXEL]
2635    mov r2, [rsp]
2636    add r2, 40*SIZEOF_PIXEL
2637    call pixel_satd_8x8_internal2
2638    call pixel_satd_8x8_internal2
2639    call pixel_satd_8x8_internal2
2640    call pixel_satd_8x8_internal2
2641    call pixel_satd_8x8_internal2
2642    call pixel_satd_8x8_internal2
2643    call pixel_satd_8x8_internal2
2644    call pixel_satd_8x8_internal2
2645    lea r0, [r6 + 48*SIZEOF_PIXEL]
2646    mov r2, [rsp]
2647    add r2, 48*SIZEOF_PIXEL
2648    call pixel_satd_8x8_internal2
2649    call pixel_satd_8x8_internal2
2650    call pixel_satd_8x8_internal2
2651    call pixel_satd_8x8_internal2
2652    call pixel_satd_8x8_internal2
2653    call pixel_satd_8x8_internal2
2654    call pixel_satd_8x8_internal2
2655    call pixel_satd_8x8_internal2
2656    lea r0, [r6 + 56*SIZEOF_PIXEL]
2657    mov r2, [rsp]
2658    add r2, 56*SIZEOF_PIXEL
2659    call pixel_satd_8x8_internal2
2660    call pixel_satd_8x8_internal2
2661    call pixel_satd_8x8_internal2
2662    call pixel_satd_8x8_internal2
2663    call pixel_satd_8x8_internal2
2664    call pixel_satd_8x8_internal2
2665    call pixel_satd_8x8_internal2
2666    call pixel_satd_8x8_internal2
2667    HADDD m6, m0
2668    movd eax, m6
2669    RET
2670%endif
2671
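; 16xN satd: each block is processed as two 8-pixel-wide columns; BACKUP_POINTERS
; saves the block origin and RESTORE_AND_INC_POINTERS restores it advanced by
; 8 pixels before the second column is accumulated into m6.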
2672%if WIN64
2673cglobal pixel_satd_16x4, 4,6,14
2674%else
2675cglobal pixel_satd_16x4, 4,6,8
2676%endif
2677    SATD_START_SSE2 m6, m7
2678    BACKUP_POINTERS
2679    call %%pixel_satd_8x4_internal2
2680    RESTORE_AND_INC_POINTERS
2681    call %%pixel_satd_8x4_internal2
2682    HADDD m6, m0
2683    movd eax, m6
2684    RET
2685
2686%if WIN64
2687cglobal pixel_satd_16x8, 4,6,14
2688%else
2689cglobal pixel_satd_16x8, 4,6,8
2690%endif
2691    SATD_START_SSE2 m6, m7
2692    BACKUP_POINTERS
2693    call pixel_satd_8x8_internal2
2694    RESTORE_AND_INC_POINTERS
2695    call pixel_satd_8x8_internal2
2696    HADDD m6, m0
2697    movd eax, m6
2698    RET
2699
2700%if WIN64
2701cglobal pixel_satd_16x12, 4,6,14
2702%else
2703cglobal pixel_satd_16x12, 4,6,8
2704%endif
2705    SATD_START_SSE2 m6, m7, 1
2706    BACKUP_POINTERS
2707    call pixel_satd_8x8_internal2
2708    call %%pixel_satd_8x4_internal2
2709    RESTORE_AND_INC_POINTERS
2710    call pixel_satd_8x8_internal2
2711    call %%pixel_satd_8x4_internal2
2712    HADDD m6, m0
2713    movd eax, m6
2714    RET
2715
2716%if WIN64
2717cglobal pixel_satd_16x16, 4,6,14
2718%else
2719cglobal pixel_satd_16x16, 4,6,8
2720%endif
2721    SATD_START_SSE2 m6, m7, 1
2722    BACKUP_POINTERS
2723    call pixel_satd_8x8_internal2
2724    call pixel_satd_8x8_internal2
2725    RESTORE_AND_INC_POINTERS
2726    call pixel_satd_8x8_internal2
2727    call pixel_satd_8x8_internal2
2728    HADDD m6, m0
2729    movd eax, m6
2730    RET
2731
2732%if WIN64
2733cglobal pixel_satd_16x32, 4,6,14
2734%else
2735cglobal pixel_satd_16x32, 4,6,8
2736%endif
2737    SATD_START_SSE2 m6, m7, 1
2738    BACKUP_POINTERS
2739    call pixel_satd_8x8_internal2
2740    call pixel_satd_8x8_internal2
2741    call pixel_satd_8x8_internal2
2742    call pixel_satd_8x8_internal2
2743    RESTORE_AND_INC_POINTERS
2744    call pixel_satd_8x8_internal2
2745    call pixel_satd_8x8_internal2
2746    call pixel_satd_8x8_internal2
2747    call pixel_satd_8x8_internal2
2748    HADDD m6, m0
2749    movd eax, m6
2750    RET
2751
2752%if WIN64
2753cglobal pixel_satd_16x64, 4,6,14
2754%else
2755cglobal pixel_satd_16x64, 4,6,8
2756%endif
2757    SATD_START_SSE2 m6, m7, 1
2758    BACKUP_POINTERS
2759    call pixel_satd_8x8_internal2
2760    call pixel_satd_8x8_internal2
2761    call pixel_satd_8x8_internal2
2762    call pixel_satd_8x8_internal2
2763    call pixel_satd_8x8_internal2
2764    call pixel_satd_8x8_internal2
2765    call pixel_satd_8x8_internal2
2766    call pixel_satd_8x8_internal2
2767    RESTORE_AND_INC_POINTERS
2768    call pixel_satd_8x8_internal2
2769    call pixel_satd_8x8_internal2
2770    call pixel_satd_8x8_internal2
2771    call pixel_satd_8x8_internal2
2772    call pixel_satd_8x8_internal2
2773    call pixel_satd_8x8_internal2
2774    call pixel_satd_8x8_internal2
2775    call pixel_satd_8x8_internal2
2776    HADDD m6, m0
2777    movd eax, m6
2778    RET
2779%endif
2780
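; 12x16 satd: three 4-pixel-wide columns, each built from two stacked
; SATD_4x8_SSE calls.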
2781%if HIGH_BIT_DEPTH
2782%if WIN64
2783cglobal pixel_satd_12x16, 4,8,8
2784    SATD_START_MMX
2785    mov r6, r0
2786    mov r7, r2
2787    pxor m7, m7
2788    SATD_4x8_SSE vertical, 0, 4, 5
2789    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2790    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2791    SATD_4x8_SSE vertical, 1, 4, 5
2792    lea r0, [r6 + 4*SIZEOF_PIXEL]
2793    lea r2, [r7 + 4*SIZEOF_PIXEL]
2794    SATD_4x8_SSE vertical, 1, 4, 5
2795    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2796    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2797    SATD_4x8_SSE vertical, 1, 4, 5
2798    lea r0, [r6 + 8*SIZEOF_PIXEL]
2799    lea r2, [r7 + 8*SIZEOF_PIXEL]
2800    SATD_4x8_SSE vertical, 1, 4, 5
2801    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2802    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2803    SATD_4x8_SSE vertical, 1, 4, 5
2804    HADDD m7, m0
2805    movd eax, m7
2806    RET
2807%else
2808cglobal pixel_satd_12x16, 4,7,8,0-gprsize
2809    SATD_START_MMX
2810    mov r6, r0
2811    mov [rsp], r2
2812    pxor m7, m7
2813    SATD_4x8_SSE vertical, 0, 4, 5
2814    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2815    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2816    SATD_4x8_SSE vertical, 1, 4, 5
2817    lea r0, [r6 + 4*SIZEOF_PIXEL]
2818    mov r2, [rsp]
2819    add r2, 4*SIZEOF_PIXEL
2820    SATD_4x8_SSE vertical, 1, 4, 5
2821    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2822    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2823    SATD_4x8_SSE vertical, 1, 4, 5
2824    lea r0, [r6 + 8*SIZEOF_PIXEL]
2825    mov r2, [rsp]
2826    add r2, 8*SIZEOF_PIXEL
2827    SATD_4x8_SSE vertical, 1, 4, 5
2828    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2829    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2830    SATD_4x8_SSE vertical, 1, 4, 5
2831    HADDD m7, m0
2832    movd eax, m7
2833    RET
2834%endif
2835%else    ;HIGH_BIT_DEPTH
2836%if WIN64
2837cglobal pixel_satd_12x16, 4,8,8
2838    SATD_START_MMX
2839    mov r6, r0
2840    mov r7, r2
%if vertical == 0
2842    mova m7, [hmul_4p]
2843%endif
2844    SATD_4x8_SSE vertical, 0, swap
2845    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2846    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2847    SATD_4x8_SSE vertical, 1, add
2848    lea r0, [r6 + 4*SIZEOF_PIXEL]
2849    lea r2, [r7 + 4*SIZEOF_PIXEL]
2850    SATD_4x8_SSE vertical, 1, add
2851    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2852    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2853    SATD_4x8_SSE vertical, 1, add
2854    lea r0, [r6 + 8*SIZEOF_PIXEL]
2855    lea r2, [r7 + 8*SIZEOF_PIXEL]
2856    SATD_4x8_SSE vertical, 1, add
2857    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2858    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2859    SATD_4x8_SSE vertical, 1, add
2860    HADDW m7, m1
2861    movd eax, m7
2862    RET
2863%else
2864cglobal pixel_satd_12x16, 4,7,8,0-gprsize
2865    SATD_START_MMX
2866    mov r6, r0
2867    mov [rsp], r2
%if vertical == 0
2869    mova m7, [hmul_4p]
2870%endif
2871    SATD_4x8_SSE vertical, 0, swap
2872    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2873    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2874    SATD_4x8_SSE vertical, 1, add
2875    lea r0, [r6 + 4*SIZEOF_PIXEL]
2876    mov r2, [rsp]
2877    add r2, 4*SIZEOF_PIXEL
2878    SATD_4x8_SSE vertical, 1, add
2879    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2880    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2881    SATD_4x8_SSE vertical, 1, add
2882    lea r0, [r6 + 8*SIZEOF_PIXEL]
2883    mov r2, [rsp]
2884    add r2, 8*SIZEOF_PIXEL
2885    SATD_4x8_SSE vertical, 1, add
2886    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
2887    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
2888    SATD_4x8_SSE vertical, 1, add
2889    HADDW m7, m1
2890    movd eax, m7
2891    RET
2892%endif
2893%endif
2894
2895%if WIN64
2896cglobal pixel_satd_24x32, 4,8,14
2897    SATD_START_SSE2 m6, m7
2898    mov r6, r0
2899    mov r7, r2
2900    call pixel_satd_8x8_internal2
2901    call pixel_satd_8x8_internal2
2902    call pixel_satd_8x8_internal2
2903    call pixel_satd_8x8_internal2
2904    lea r0, [r6 + 8*SIZEOF_PIXEL]
2905    lea r2, [r7 + 8*SIZEOF_PIXEL]
2906    call pixel_satd_8x8_internal2
2907    call pixel_satd_8x8_internal2
2908    call pixel_satd_8x8_internal2
2909    call pixel_satd_8x8_internal2
2910    lea r0, [r6 + 16*SIZEOF_PIXEL]
2911    lea r2, [r7 + 16*SIZEOF_PIXEL]
2912    call pixel_satd_8x8_internal2
2913    call pixel_satd_8x8_internal2
2914    call pixel_satd_8x8_internal2
2915    call pixel_satd_8x8_internal2
2916    HADDD m6, m0
2917    movd eax, m6
2918    RET
2919%else
2920cglobal pixel_satd_24x32, 4,7,8,0-gprsize
2921    SATD_START_SSE2 m6, m7
2922    mov r6, r0
2923    mov [rsp], r2
2924    call pixel_satd_8x8_internal2
2925    call pixel_satd_8x8_internal2
2926    call pixel_satd_8x8_internal2
2927    call pixel_satd_8x8_internal2
2928    lea r0, [r6 + 8*SIZEOF_PIXEL]
2929    mov r2, [rsp]
2930    add r2, 8*SIZEOF_PIXEL
2931    call pixel_satd_8x8_internal2
2932    call pixel_satd_8x8_internal2
2933    call pixel_satd_8x8_internal2
2934    call pixel_satd_8x8_internal2
2935    lea r0, [r6 + 16*SIZEOF_PIXEL]
2936    mov r2, [rsp]
2937    add r2, 16*SIZEOF_PIXEL
2938    call pixel_satd_8x8_internal2
2939    call pixel_satd_8x8_internal2
2940    call pixel_satd_8x8_internal2
2941    call pixel_satd_8x8_internal2
2942    HADDD m6, m0
2943    movd eax, m6
2944    RET
2945%endif    ;WIN64
2946
2947%if WIN64
2948cglobal pixel_satd_8x32, 4,6,14
2949%else
2950cglobal pixel_satd_8x32, 4,6,8
2951%endif
2952    SATD_START_SSE2 m6, m7
2953%if vertical
2954    mova m7, [pw_00ff]
2955%endif
2956    call pixel_satd_8x8_internal2
2957    call pixel_satd_8x8_internal2
2958    call pixel_satd_8x8_internal2
2959    call pixel_satd_8x8_internal2
2960    HADDD m6, m0
2961    movd eax, m6
2962    RET
2963
2964%if WIN64
2965cglobal pixel_satd_8x16, 4,6,14
2966%else
2967cglobal pixel_satd_8x16, 4,6,8
2968%endif
2969    SATD_START_SSE2 m6, m7
2970    call pixel_satd_8x8_internal2
2971    call pixel_satd_8x8_internal2
2972    HADDD m6, m0
2973    movd eax, m6
2974    RET
2975
2976cglobal pixel_satd_8x8, 4,6,8
2977    SATD_START_SSE2 m6, m7
2978    call pixel_satd_8x8_internal
2979    SATD_END_SSE2 m6
2980
2981%if WIN64
2982cglobal pixel_satd_8x4, 4,6,14
2983%else
2984cglobal pixel_satd_8x4, 4,6,8
2985%endif
2986    SATD_START_SSE2 m6, m7
2987    call %%pixel_satd_8x4_internal2
2988    SATD_END_SSE2 m6
2989%endmacro ; SATDS_SSE2
2990
2991
2992;=============================================================================
2993; SA8D
2994;=============================================================================
2995
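; Fold the partial sum of the current 8x8 block (rh) into the running total (lh);
; saturating word adds are enough at 8-bit depth, otherwise accumulate in dwords.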
2996%macro SA8D_INTER 0
2997%if ARCH_X86_64
2998    %define lh m10
2999    %define rh m0
3000%else
3001    %define lh m0
3002    %define rh [esp+48]
3003%endif
3004%if HIGH_BIT_DEPTH
3005    HADDUW  m0, m1
3006    paddd   lh, rh
3007%else
3008    paddusw lh, rh
3009%endif ; HIGH_BIT_DEPTH
3010%endmacro
3011
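; One 8x8 sa8d block: reduce the transform result, round with (x+1)>>1 and
; accumulate into m12.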
3012%macro SA8D_8x8 0
3013    call pixel_sa8d_8x8_internal
3014%if HIGH_BIT_DEPTH
3015    HADDUW m0, m1
3016%else
3017    HADDW m0, m1
3018%endif ; HIGH_BIT_DEPTH
3019    paddd  m0, [pd_1]
3020    psrld  m0, 1
3021    paddd  m12, m0
3022%endmacro
3023
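; Four 8x8 sa8d blocks covering a 16x16 area; the rounded sum is added to m12
; and r0/r2 are left pointing 8 rows below the block origin.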
3024%macro SA8D_16x16 0
3025    call pixel_sa8d_8x8_internal ; pix[0]
3026    add  r2, 8*SIZEOF_PIXEL
3027    add  r0, 8*SIZEOF_PIXEL
3028%if HIGH_BIT_DEPTH
3029    HADDUW m0, m1
3030%endif
3031    mova m10, m0
3032    call pixel_sa8d_8x8_internal ; pix[8]
3033    lea  r2, [r2+8*r3]
3034    lea  r0, [r0+8*r1]
3035    SA8D_INTER
3036    call pixel_sa8d_8x8_internal ; pix[8*stride+8]
3037    sub  r2, 8*SIZEOF_PIXEL
3038    sub  r0, 8*SIZEOF_PIXEL
3039    SA8D_INTER
3040    call pixel_sa8d_8x8_internal ; pix[8*stride]
3041    SA8D_INTER
3042    SWAP 0, 10
3043%if HIGH_BIT_DEPTH == 0
3044    HADDUW m0, m1
3045%endif
3046    paddd  m0, [pd_1]
3047    psrld  m0, 1
3048    paddd  m12, m0
3049%endmacro
3050
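; x86_32 only: fold in the other half via SA8D_INTER, round, and add the result
; to the running total kept at [esp+36].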
3051%macro AVG_16x16 0
3052    SA8D_INTER
3053%if HIGH_BIT_DEPTH == 0
3054    HADDUW m0, m1
3055%endif
3056    movd r4d, m0
3057    add  r4d, 1
3058    shr  r4d, 1
3059    add r4d, dword [esp+36]
3060    mov dword [esp+36], r4d
3061%endmacro
3062
3063%macro SA8D 0
3064; sse2 doesn't seem to like the horizontal way of doing things
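; vertical==1 keeps the whole transform in word arithmetic; otherwise (ssse3,
; 8-bit input) the first horizontal stage is folded into the loads via
; pmaddubsw with the hmul_* constants.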
3065%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
3066
3067%if ARCH_X86_64
3068;-----------------------------------------------------------------------------
3069; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
3070;-----------------------------------------------------------------------------
3071cglobal pixel_sa8d_8x8_internal
3072    lea  r6, [r0+4*r1]
3073    lea  r7, [r2+4*r3]
3074    LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
3075    LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r6, r7
3076%if vertical
3077    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
3078%else ; non-sse2
3079    HADAMARD8_2D_HMUL 0, 1, 2, 8, 4, 5, 3, 9, 6, 11
3080%endif
3081    paddw m0, m1
3082    paddw m0, m2
3083    paddw m0, m8
3084    SAVE_MM_PERMUTATION
3085    ret
3086
3087cglobal pixel_sa8d_8x8, 4,8,12
3088    FIX_STRIDES r1, r3
3089    lea  r4, [3*r1]
3090    lea  r5, [3*r3]
3091%if vertical == 0
3092    mova m7, [hmul_8p]
3093%endif
3094    call pixel_sa8d_8x8_internal
3095%if HIGH_BIT_DEPTH
3096    HADDUW m0, m1
3097%else
3098    HADDW m0, m1
3099%endif ; HIGH_BIT_DEPTH
3100    movd eax, m0
3101    add eax, 1
3102    shr eax, 1
3103    RET
3104
3105cglobal pixel_sa8d_16x16, 4,8,12
3106    FIX_STRIDES r1, r3
3107    lea  r4, [3*r1]
3108    lea  r5, [3*r3]
3109%if vertical == 0
3110    mova m7, [hmul_8p]
3111%endif
3112    call pixel_sa8d_8x8_internal ; pix[0]
3113    add  r2, 8*SIZEOF_PIXEL
3114    add  r0, 8*SIZEOF_PIXEL
3115%if HIGH_BIT_DEPTH
3116    HADDUW m0, m1
3117%endif
3118    mova m10, m0
3119    call pixel_sa8d_8x8_internal ; pix[8]
3120    lea  r2, [r2+8*r3]
3121    lea  r0, [r0+8*r1]
3122    SA8D_INTER
3123    call pixel_sa8d_8x8_internal ; pix[8*stride+8]
3124    sub  r2, 8*SIZEOF_PIXEL
3125    sub  r0, 8*SIZEOF_PIXEL
3126    SA8D_INTER
3127    call pixel_sa8d_8x8_internal ; pix[8*stride]
3128    SA8D_INTER
3129    SWAP 0, 10
3130%if HIGH_BIT_DEPTH == 0
3131    HADDUW m0, m1
3132%endif
3133    movd eax, m0
3134    add  eax, 1
3135    shr  eax, 1
3136    RET
3137
3138cglobal pixel_sa8d_8x16, 4,8,13
3139    FIX_STRIDES r1, r3
3140    lea  r4, [3*r1]
3141    lea  r5, [3*r3]
3142    pxor m12, m12
3143%if vertical == 0
3144    mova m7, [hmul_8p]
3145%endif
3146    SA8D_8x8
3147    lea r0, [r0 + 8*r1]
3148    lea r2, [r2 + 8*r3]
3149    SA8D_8x8
3150    movd eax, m12
3151    RET
3152
3153cglobal pixel_sa8d_8x32, 4,8,13
3154    FIX_STRIDES r1, r3
3155    lea  r4, [3*r1]
3156    lea  r5, [3*r3]
3157    pxor m12, m12
3158%if vertical == 0
3159    mova m7, [hmul_8p]
3160%endif
3161    SA8D_8x8
3162    lea r0, [r0 + r1*8]
3163    lea r2, [r2 + r3*8]
3164    SA8D_8x8
3165    lea r0, [r0 + r1*8]
3166    lea r2, [r2 + r3*8]
3167    SA8D_8x8
3168    lea r0, [r0 + r1*8]
3169    lea r2, [r2 + r3*8]
3170    SA8D_8x8
3171    movd eax, m12
3172    RET
3173
3174cglobal pixel_sa8d_16x8, 4,8,13
3175    FIX_STRIDES r1, r3
3176    lea  r4, [3*r1]
3177    lea  r5, [3*r3]
3178    pxor m12, m12
3179%if vertical == 0
3180    mova m7, [hmul_8p]
3181%endif
3182    SA8D_8x8
3183    add r0, 8*SIZEOF_PIXEL
3184    add r2, 8*SIZEOF_PIXEL
3185    SA8D_8x8
3186    movd eax, m12
3187    RET
3188
3189cglobal pixel_sa8d_16x32, 4,8,13
3190    FIX_STRIDES r1, r3
3191    lea  r4, [3*r1]
3192    lea  r5, [3*r3]
3193    pxor m12, m12
3194%if vertical == 0
3195    mova m7, [hmul_8p]
3196%endif
3197    SA8D_16x16
3198    lea r0, [r0+8*r1]
3199    lea r2, [r2+8*r3]
3200    SA8D_16x16
3201    movd eax, m12
3202    RET
3203
3204cglobal pixel_sa8d_16x64, 4,8,13
3205    FIX_STRIDES r1, r3
3206    lea  r4, [3*r1]
3207    lea  r5, [3*r3]
3208    pxor m12, m12
3209%if vertical == 0
3210    mova m7, [hmul_8p]
3211%endif
3212    SA8D_16x16
3213    lea r0, [r0+8*r1]
3214    lea r2, [r2+8*r3]
3215    SA8D_16x16
3216    lea r0, [r0+8*r1]
3217    lea r2, [r2+8*r3]
3218    SA8D_16x16
3219    lea r0, [r0+8*r1]
3220    lea r2, [r2+8*r3]
3221    SA8D_16x16
3222    movd eax, m12
3223    RET
3224
3225cglobal pixel_sa8d_24x32, 4,8,13
3226    FIX_STRIDES r1, r3
3227    lea  r4, [3*r1]
3228    lea  r5, [3*r3]
3229    pxor m12, m12
3230%if vertical == 0
3231    mova m7, [hmul_8p]
3232%endif
3233    SA8D_8x8
3234    add r0, 8*SIZEOF_PIXEL
3235    add r2, 8*SIZEOF_PIXEL
3236    SA8D_8x8
3237    add r0, 8*SIZEOF_PIXEL
3238    add r2, 8*SIZEOF_PIXEL
3239    SA8D_8x8
3240    lea r0, [r0 + r1*8]
3241    lea r2, [r2 + r3*8]
3242    SA8D_8x8
3243    sub r0, 8*SIZEOF_PIXEL
3244    sub r2, 8*SIZEOF_PIXEL
3245    SA8D_8x8
3246    sub r0, 8*SIZEOF_PIXEL
3247    sub r2, 8*SIZEOF_PIXEL
3248    SA8D_8x8
3249    lea r0, [r0 + r1*8]
3250    lea r2, [r2 + r3*8]
3251    SA8D_8x8
3252    add r0, 8*SIZEOF_PIXEL
3253    add r2, 8*SIZEOF_PIXEL
3254    SA8D_8x8
3255    add r0, 8*SIZEOF_PIXEL
3256    add r2, 8*SIZEOF_PIXEL
3257    SA8D_8x8
3258    lea r0, [r0 + r1*8]
3259    lea r2, [r2 + r3*8]
3260    SA8D_8x8
3261    sub r0, 8*SIZEOF_PIXEL
3262    sub r2, 8*SIZEOF_PIXEL
3263    SA8D_8x8
3264    sub r0, 8*SIZEOF_PIXEL
3265    sub r2, 8*SIZEOF_PIXEL
3266    SA8D_8x8
3267    movd eax, m12
3268    RET
3269
3270cglobal pixel_sa8d_32x8, 4,8,13
3271    FIX_STRIDES r1, r3
3272    lea  r4, [3*r1]
3273    lea  r5, [3*r3]
3274    pxor m12, m12
3275%if vertical == 0
3276    mova m7, [hmul_8p]
3277%endif
3278    SA8D_8x8
3279    add r0, 8*SIZEOF_PIXEL
3280    add r2, 8*SIZEOF_PIXEL
3281    SA8D_8x8
3282    add r0, 8*SIZEOF_PIXEL
3283    add r2, 8*SIZEOF_PIXEL
3284    SA8D_8x8
3285    add r0, 8*SIZEOF_PIXEL
3286    add r2, 8*SIZEOF_PIXEL
3287    SA8D_8x8
3288    movd eax, m12
3289    RET
3290
3291cglobal pixel_sa8d_32x16, 4,8,13
3292    FIX_STRIDES r1, r3
3293    lea  r4, [3*r1]
3294    lea  r5, [3*r3]
3295    pxor m12, m12
3296%if vertical == 0
3297    mova m7, [hmul_8p]
3298%endif
3299    SA8D_16x16
3300    lea  r4, [8*r1]
3301    lea  r5, [8*r3]
3302    sub  r0, r4
3303    sub  r2, r5
3304    add  r2, 16*SIZEOF_PIXEL
3305    add  r0, 16*SIZEOF_PIXEL
3306    lea  r4, [3*r1]
3307    lea  r5, [3*r3]
3308    SA8D_16x16
3309    movd eax, m12
3310    RET
3311
3312cglobal pixel_sa8d_32x24, 4,8,13
3313    FIX_STRIDES r1, r3
3314    lea  r4, [3*r1]
3315    lea  r5, [3*r3]
3316    pxor m12, m12
3317%if vertical == 0
3318    mova m7, [hmul_8p]
3319%endif
3320    SA8D_8x8
3321    add r0, 8*SIZEOF_PIXEL
3322    add r2, 8*SIZEOF_PIXEL
3323    SA8D_8x8
3324    add r0, 8*SIZEOF_PIXEL
3325    add r2, 8*SIZEOF_PIXEL
3326    SA8D_8x8
3327    add r0, 8*SIZEOF_PIXEL
3328    add r2, 8*SIZEOF_PIXEL
3329    SA8D_8x8
3330    lea r0, [r0 + r1*8]
3331    lea r2, [r2 + r3*8]
3332    SA8D_8x8
3333    sub r0, 8*SIZEOF_PIXEL
3334    sub r2, 8*SIZEOF_PIXEL
3335    SA8D_8x8
3336    sub r0, 8*SIZEOF_PIXEL
3337    sub r2, 8*SIZEOF_PIXEL
3338    SA8D_8x8
3339    sub r0, 8*SIZEOF_PIXEL
3340    sub r2, 8*SIZEOF_PIXEL
3341    SA8D_8x8
3342    lea r0, [r0 + r1*8]
3343    lea r2, [r2 + r3*8]
3344    SA8D_8x8
3345    add r0, 8*SIZEOF_PIXEL
3346    add r2, 8*SIZEOF_PIXEL
3347    SA8D_8x8
3348    add r0, 8*SIZEOF_PIXEL
3349    add r2, 8*SIZEOF_PIXEL
3350    SA8D_8x8
3351    add r0, 8*SIZEOF_PIXEL
3352    add r2, 8*SIZEOF_PIXEL
3353    SA8D_8x8
3354    movd eax, m12
3355    RET
3356
3357cglobal pixel_sa8d_32x32, 4,8,13
3358    FIX_STRIDES r1, r3
3359    lea  r4, [3*r1]
3360    lea  r5, [3*r3]
3361    pxor m12, m12
3362%if vertical == 0
3363    mova m7, [hmul_8p]
3364%endif
3365    SA8D_16x16
3366    lea  r4, [8*r1]
3367    lea  r5, [8*r3]
3368    sub  r0, r4
3369    sub  r2, r5
3370    add  r2, 16*SIZEOF_PIXEL
3371    add  r0, 16*SIZEOF_PIXEL
3372    lea  r4, [3*r1]
3373    lea  r5, [3*r3]
3374    SA8D_16x16
3375    lea r0, [r0+8*r1]
3376    lea r2, [r2+8*r3]
3377    SA8D_16x16
3378    lea  r4, [8*r1]
3379    lea  r5, [8*r3]
3380    sub  r0, r4
3381    sub  r2, r5
3382    sub  r2, 16*SIZEOF_PIXEL
3383    sub  r0, 16*SIZEOF_PIXEL
3384    lea  r4, [3*r1]
3385    lea  r5, [3*r3]
3386    SA8D_16x16
3387    movd eax, m12
3388    RET
3389
3390cglobal pixel_sa8d_32x64, 4,8,13
3391    FIX_STRIDES r1, r3
3392    lea  r4, [3*r1]
3393    lea  r5, [3*r3]
3394    pxor m12, m12
3395%if vertical == 0
3396    mova m7, [hmul_8p]
3397%endif
3398    SA8D_16x16
3399    lea  r4, [8*r1]
3400    lea  r5, [8*r3]
3401    sub  r0, r4
3402    sub  r2, r5
3403    add  r2, 16*SIZEOF_PIXEL
3404    add  r0, 16*SIZEOF_PIXEL
3405    lea  r4, [3*r1]
3406    lea  r5, [3*r3]
3407    SA8D_16x16
3408    lea r0, [r0+8*r1]
3409    lea r2, [r2+8*r3]
3410    SA8D_16x16
3411    lea  r4, [8*r1]
3412    lea  r5, [8*r3]
3413    sub  r0, r4
3414    sub  r2, r5
3415    sub  r2, 16*SIZEOF_PIXEL
3416    sub  r0, 16*SIZEOF_PIXEL
3417    lea  r4, [3*r1]
3418    lea  r5, [3*r3]
3419    SA8D_16x16
3420    lea r0, [r0+8*r1]
3421    lea r2, [r2+8*r3]
3422    SA8D_16x16
3423    lea  r4, [8*r1]
3424    lea  r5, [8*r3]
3425    sub  r0, r4
3426    sub  r2, r5
3427    add  r2, 16*SIZEOF_PIXEL
3428    add  r0, 16*SIZEOF_PIXEL
3429    lea  r4, [3*r1]
3430    lea  r5, [3*r3]
3431    SA8D_16x16
3432    lea r0, [r0+8*r1]
3433    lea r2, [r2+8*r3]
3434    SA8D_16x16
3435    lea  r4, [8*r1]
3436    lea  r5, [8*r3]
3437    sub  r0, r4
3438    sub  r2, r5
3439    sub  r2, 16*SIZEOF_PIXEL
3440    sub  r0, 16*SIZEOF_PIXEL
3441    lea  r4, [3*r1]
3442    lea  r5, [3*r3]
3443    SA8D_16x16
3444    movd eax, m12
3445    RET
3446
3447cglobal pixel_sa8d_48x64, 4,8,13
3448    FIX_STRIDES r1, r3
3449    lea  r4, [3*r1]
3450    lea  r5, [3*r3]
3451    pxor m12, m12
3452%if vertical == 0
3453    mova m7, [hmul_8p]
3454%endif
3455    SA8D_16x16
3456    lea  r4, [8*r1]
3457    lea  r5, [8*r3]
3458    sub  r0, r4
3459    sub  r2, r5
3460    add  r2, 16*SIZEOF_PIXEL
3461    add  r0, 16*SIZEOF_PIXEL
3462    lea  r4, [3*r1]
3463    lea  r5, [3*r3]
3464    SA8D_16x16
3465    lea  r4, [8*r1]
3466    lea  r5, [8*r3]
3467    sub  r0, r4
3468    sub  r2, r5
3469    add  r2, 16*SIZEOF_PIXEL
3470    add  r0, 16*SIZEOF_PIXEL
3471    lea  r4, [3*r1]
3472    lea  r5, [3*r3]
3473    SA8D_16x16
3474    lea r0, [r0+8*r1]
3475    lea r2, [r2+8*r3]
3476    SA8D_16x16
3477    lea  r4, [8*r1]
3478    lea  r5, [8*r3]
3479    sub  r0, r4
3480    sub  r2, r5
3481    sub  r2, 16*SIZEOF_PIXEL
3482    sub  r0, 16*SIZEOF_PIXEL
3483    lea  r4, [3*r1]
3484    lea  r5, [3*r3]
3485    SA8D_16x16
3486    lea  r4, [8*r1]
3487    lea  r5, [8*r3]
3488    sub  r0, r4
3489    sub  r2, r5
3490    sub  r2, 16*SIZEOF_PIXEL
3491    sub  r0, 16*SIZEOF_PIXEL
3492    lea  r4, [3*r1]
3493    lea  r5, [3*r3]
3494    SA8D_16x16
3495    lea r0, [r0+8*r1]
3496    lea r2, [r2+8*r3]
3497    SA8D_16x16
3498    lea  r4, [8*r1]
3499    lea  r5, [8*r3]
3500    sub  r0, r4
3501    sub  r2, r5
3502    add  r2, 16*SIZEOF_PIXEL
3503    add  r0, 16*SIZEOF_PIXEL
3504    lea  r4, [3*r1]
3505    lea  r5, [3*r3]
3506    SA8D_16x16
3507    lea  r4, [8*r1]
3508    lea  r5, [8*r3]
3509    sub  r0, r4
3510    sub  r2, r5
3511    add  r2, 16*SIZEOF_PIXEL
3512    add  r0, 16*SIZEOF_PIXEL
3513    lea  r4, [3*r1]
3514    lea  r5, [3*r3]
3515    SA8D_16x16
3516    lea r0, [r0+8*r1]
3517    lea r2, [r2+8*r3]
3518    SA8D_16x16
3519    lea  r4, [8*r1]
3520    lea  r5, [8*r3]
3521    sub  r0, r4
3522    sub  r2, r5
3523    sub  r2, 16*SIZEOF_PIXEL
3524    sub  r0, 16*SIZEOF_PIXEL
3525    lea  r4, [3*r1]
3526    lea  r5, [3*r3]
3527    SA8D_16x16
3528    lea  r4, [8*r1]
3529    lea  r5, [8*r3]
3530    sub  r0, r4
3531    sub  r2, r5
3532    sub  r2, 16*SIZEOF_PIXEL
3533    sub  r0, 16*SIZEOF_PIXEL
3534    lea  r4, [3*r1]
3535    lea  r5, [3*r3]
3536    SA8D_16x16
3537    movd eax, m12
3538    RET
3539
3540cglobal pixel_sa8d_64x16, 4,8,13
3541    FIX_STRIDES r1, r3
3542    lea  r4, [3*r1]
3543    lea  r5, [3*r3]
3544    pxor m12, m12
3545%if vertical == 0
3546    mova m7, [hmul_8p]
3547%endif
3548    SA8D_16x16
3549    lea  r4, [8*r1]
3550    lea  r5, [8*r3]
3551    sub  r0, r4
3552    sub  r2, r5
3553    add  r2, 16*SIZEOF_PIXEL
3554    add  r0, 16*SIZEOF_PIXEL
3555    lea  r4, [3*r1]
3556    lea  r5, [3*r3]
3557    SA8D_16x16
3558    lea  r4, [8*r1]
3559    lea  r5, [8*r3]
3560    sub  r0, r4
3561    sub  r2, r5
3562    add  r2, 16*SIZEOF_PIXEL
3563    add  r0, 16*SIZEOF_PIXEL
3564    lea  r4, [3*r1]
3565    lea  r5, [3*r3]
3566    SA8D_16x16
3567    lea  r4, [8*r1]
3568    lea  r5, [8*r3]
3569    sub  r0, r4
3570    sub  r2, r5
3571    add  r2, 16*SIZEOF_PIXEL
3572    add  r0, 16*SIZEOF_PIXEL
3573    lea  r4, [3*r1]
3574    lea  r5, [3*r3]
3575    SA8D_16x16
3576    movd eax, m12
3577    RET
3578
3579cglobal pixel_sa8d_64x32, 4,8,13
3580    FIX_STRIDES r1, r3
3581    lea  r4, [3*r1]
3582    lea  r5, [3*r3]
3583    pxor m12, m12
3584%if vertical == 0
3585    mova m7, [hmul_8p]
3586%endif
3587    SA8D_16x16
3588    lea  r4, [8*r1]
3589    lea  r5, [8*r3]
3590    sub  r0, r4
3591    sub  r2, r5
3592    add  r2, 16*SIZEOF_PIXEL
3593    add  r0, 16*SIZEOF_PIXEL
3594    lea  r4, [3*r1]
3595    lea  r5, [3*r3]
3596    SA8D_16x16
3597    lea  r4, [8*r1]
3598    lea  r5, [8*r3]
3599    sub  r0, r4
3600    sub  r2, r5
3601    add  r2, 16*SIZEOF_PIXEL
3602    add  r0, 16*SIZEOF_PIXEL
3603    lea  r4, [3*r1]
3604    lea  r5, [3*r3]
3605    SA8D_16x16
3606    lea  r4, [8*r1]
3607    lea  r5, [8*r3]
3608    sub  r0, r4
3609    sub  r2, r5
3610    add  r2, 16*SIZEOF_PIXEL
3611    add  r0, 16*SIZEOF_PIXEL
3612    lea  r4, [3*r1]
3613    lea  r5, [3*r3]
3614    SA8D_16x16
3615    lea r0, [r0+8*r1]
3616    lea r2, [r2+8*r3]
3617    SA8D_16x16
3618    lea  r4, [8*r1]
3619    lea  r5, [8*r3]
3620    sub  r0, r4
3621    sub  r2, r5
3622    sub  r2, 16*SIZEOF_PIXEL
3623    sub  r0, 16*SIZEOF_PIXEL
3624    lea  r4, [3*r1]
3625    lea  r5, [3*r3]
3626    SA8D_16x16
3627    lea  r4, [8*r1]
3628    lea  r5, [8*r3]
3629    sub  r0, r4
3630    sub  r2, r5
3631    sub  r2, 16*SIZEOF_PIXEL
3632    sub  r0, 16*SIZEOF_PIXEL
3633    lea  r4, [3*r1]
3634    lea  r5, [3*r3]
3635    SA8D_16x16
3636    lea  r4, [8*r1]
3637    lea  r5, [8*r3]
3638    sub  r0, r4
3639    sub  r2, r5
3640    sub  r2, 16*SIZEOF_PIXEL
3641    sub  r0, 16*SIZEOF_PIXEL
3642    lea  r4, [3*r1]
3643    lea  r5, [3*r3]
3644    SA8D_16x16
3645    movd eax, m12
3646    RET
3647
3648cglobal pixel_sa8d_64x48, 4,8,13
3649    FIX_STRIDES r1, r3
3650    lea  r4, [3*r1]
3651    lea  r5, [3*r3]
3652    pxor m12, m12
3653%if vertical == 0
3654    mova m7, [hmul_8p]
3655%endif
3656    SA8D_16x16
3657    lea  r4, [8*r1]
3658    lea  r5, [8*r3]
3659    sub  r0, r4
3660    sub  r2, r5
3661    add  r2, 16*SIZEOF_PIXEL
3662    add  r0, 16*SIZEOF_PIXEL
3663    lea  r4, [3*r1]
3664    lea  r5, [3*r3]
3665    SA8D_16x16
3666    lea  r4, [8*r1]
3667    lea  r5, [8*r3]
3668    sub  r0, r4
3669    sub  r2, r5
3670    add  r2, 16*SIZEOF_PIXEL
3671    add  r0, 16*SIZEOF_PIXEL
3672    lea  r4, [3*r1]
3673    lea  r5, [3*r3]
3674    SA8D_16x16
3675    lea  r4, [8*r1]
3676    lea  r5, [8*r3]
3677    sub  r0, r4
3678    sub  r2, r5
3679    add  r2, 16*SIZEOF_PIXEL
3680    add  r0, 16*SIZEOF_PIXEL
3681    lea  r4, [3*r1]
3682    lea  r5, [3*r3]
3683    SA8D_16x16
3684    lea r0, [r0+8*r1]
3685    lea r2, [r2+8*r3]
3686    SA8D_16x16
3687    lea  r4, [8*r1]
3688    lea  r5, [8*r3]
3689    sub  r0, r4
3690    sub  r2, r5
3691    sub  r2, 16*SIZEOF_PIXEL
3692    sub  r0, 16*SIZEOF_PIXEL
3693    lea  r4, [3*r1]
3694    lea  r5, [3*r3]
3695    SA8D_16x16
3696    lea  r4, [8*r1]
3697    lea  r5, [8*r3]
3698    sub  r0, r4
3699    sub  r2, r5
3700    sub  r2, 16*SIZEOF_PIXEL
3701    sub  r0, 16*SIZEOF_PIXEL
3702    lea  r4, [3*r1]
3703    lea  r5, [3*r3]
3704    SA8D_16x16
3705    lea  r4, [8*r1]
3706    lea  r5, [8*r3]
3707    sub  r0, r4
3708    sub  r2, r5
3709    sub  r2, 16*SIZEOF_PIXEL
3710    sub  r0, 16*SIZEOF_PIXEL
3711    lea  r4, [3*r1]
3712    lea  r5, [3*r3]
3713    SA8D_16x16
3714    lea r0, [r0+8*r1]
3715    lea r2, [r2+8*r3]
3716    SA8D_16x16
3717    lea  r4, [8*r1]
3718    lea  r5, [8*r3]
3719    sub  r0, r4
3720    sub  r2, r5
3721    add  r2, 16*SIZEOF_PIXEL
3722    add  r0, 16*SIZEOF_PIXEL
3723    lea  r4, [3*r1]
3724    lea  r5, [3*r3]
3725    SA8D_16x16
3726    lea  r4, [8*r1]
3727    lea  r5, [8*r3]
3728    sub  r0, r4
3729    sub  r2, r5
3730    add  r2, 16*SIZEOF_PIXEL
3731    add  r0, 16*SIZEOF_PIXEL
3732    lea  r4, [3*r1]
3733    lea  r5, [3*r3]
3734    SA8D_16x16
3735    lea  r4, [8*r1]
3736    lea  r5, [8*r3]
3737    sub  r0, r4
3738    sub  r2, r5
3739    add  r2, 16*SIZEOF_PIXEL
3740    add  r0, 16*SIZEOF_PIXEL
3741    lea  r4, [3*r1]
3742    lea  r5, [3*r3]
3743    SA8D_16x16
3744    movd eax, m12
3745    RET
3746
3747cglobal pixel_sa8d_64x64, 4,8,13
3748    FIX_STRIDES r1, r3
3749    lea  r4, [3*r1]
3750    lea  r5, [3*r3]
3751    pxor m12, m12
3752%if vertical == 0
3753    mova m7, [hmul_8p]
3754%endif
3755    SA8D_16x16
3756    lea  r4, [8*r1]
3757    lea  r5, [8*r3]
3758    sub  r0, r4
3759    sub  r2, r5
3760    add  r2, 16*SIZEOF_PIXEL
3761    add  r0, 16*SIZEOF_PIXEL
3762    lea  r4, [3*r1]
3763    lea  r5, [3*r3]
3764    SA8D_16x16
3765    lea  r4, [8*r1]
3766    lea  r5, [8*r3]
3767    sub  r0, r4
3768    sub  r2, r5
3769    add  r2, 16*SIZEOF_PIXEL
3770    add  r0, 16*SIZEOF_PIXEL
3771    lea  r4, [3*r1]
3772    lea  r5, [3*r3]
3773    SA8D_16x16
3774    lea  r4, [8*r1]
3775    lea  r5, [8*r3]
3776    sub  r0, r4
3777    sub  r2, r5
3778    add  r2, 16*SIZEOF_PIXEL
3779    add  r0, 16*SIZEOF_PIXEL
3780    lea  r4, [3*r1]
3781    lea  r5, [3*r3]
3782    SA8D_16x16
3783    lea r0, [r0+8*r1]
3784    lea r2, [r2+8*r3]
3785    SA8D_16x16
3786    lea  r4, [8*r1]
3787    lea  r5, [8*r3]
3788    sub  r0, r4
3789    sub  r2, r5
3790    sub  r2, 16*SIZEOF_PIXEL
3791    sub  r0, 16*SIZEOF_PIXEL
3792    lea  r4, [3*r1]
3793    lea  r5, [3*r3]
3794    SA8D_16x16
3795    lea  r4, [8*r1]
3796    lea  r5, [8*r3]
3797    sub  r0, r4
3798    sub  r2, r5
3799    sub  r2, 16*SIZEOF_PIXEL
3800    sub  r0, 16*SIZEOF_PIXEL
3801    lea  r4, [3*r1]
3802    lea  r5, [3*r3]
3803    SA8D_16x16
3804    lea  r4, [8*r1]
3805    lea  r5, [8*r3]
3806    sub  r0, r4
3807    sub  r2, r5
3808    sub  r2, 16*SIZEOF_PIXEL
3809    sub  r0, 16*SIZEOF_PIXEL
3810    lea  r4, [3*r1]
3811    lea  r5, [3*r3]
3812    SA8D_16x16
3813    lea r0, [r0+8*r1]
3814    lea r2, [r2+8*r3]
3815    SA8D_16x16
3816    lea  r4, [8*r1]
3817    lea  r5, [8*r3]
3818    sub  r0, r4
3819    sub  r2, r5
3820    add  r2, 16*SIZEOF_PIXEL
3821    add  r0, 16*SIZEOF_PIXEL
3822    lea  r4, [3*r1]
3823    lea  r5, [3*r3]
3824    SA8D_16x16
3825    lea  r4, [8*r1]
3826    lea  r5, [8*r3]
3827    sub  r0, r4
3828    sub  r2, r5
3829    add  r2, 16*SIZEOF_PIXEL
3830    add  r0, 16*SIZEOF_PIXEL
3831    lea  r4, [3*r1]
3832    lea  r5, [3*r3]
3833    SA8D_16x16
3834    lea  r4, [8*r1]
3835    lea  r5, [8*r3]
3836    sub  r0, r4
3837    sub  r2, r5
3838    add  r2, 16*SIZEOF_PIXEL
3839    add  r0, 16*SIZEOF_PIXEL
3840    lea  r4, [3*r1]
3841    lea  r5, [3*r3]
3842    SA8D_16x16
3843    lea r0, [r0+8*r1]
3844    lea r2, [r2+8*r3]
3845    SA8D_16x16
3846    lea  r4, [8*r1]
3847    lea  r5, [8*r3]
3848    sub  r0, r4
3849    sub  r2, r5
3850    sub  r2, 16*SIZEOF_PIXEL
3851    sub  r0, 16*SIZEOF_PIXEL
3852    lea  r4, [3*r1]
3853    lea  r5, [3*r3]
3854    SA8D_16x16
3855    lea  r4, [8*r1]
3856    lea  r5, [8*r3]
3857    sub  r0, r4
3858    sub  r2, r5
3859    sub  r2, 16*SIZEOF_PIXEL
3860    sub  r0, 16*SIZEOF_PIXEL
3861    lea  r4, [3*r1]
3862    lea  r5, [3*r3]
3863    SA8D_16x16
3864    lea  r4, [8*r1]
3865    lea  r5, [8*r3]
3866    sub  r0, r4
3867    sub  r2, r5
3868    sub  r2, 16*SIZEOF_PIXEL
3869    sub  r0, 16*SIZEOF_PIXEL
3870    lea  r4, [3*r1]
3871    lea  r5, [3*r3]
3872    SA8D_16x16
3873    movd eax, m12
3874    RET
3875
3876%else ; ARCH_X86_32
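; 32-bit versions.  With only r0-r6 available, every entry point saves esp in
; r6, aligns the stack and reserves a scratch area; the pixel pointers are
; re-read through the saved stack pointer ([r6+20]/[r6+28]) before each new
; sub-block, the running scalar total lives at [esp+36], and vector partial
; sums are parked at [esp+48] / [esp+64-mmsize] between calls.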
3877%if mmsize == 16
3878cglobal pixel_sa8d_8x8_internal
3879    %define spill0 [esp+4]
3880    %define spill1 [esp+20]
3881    %define spill2 [esp+36]
3882%if vertical
3883    LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1
3884    HADAMARD4_2D 0, 1, 2, 3, 4
3885    movdqa spill0, m3
3886    LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1
3887    HADAMARD4_2D 4, 5, 6, 7, 3
3888    HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax
3889    movdqa m3, spill0
3890    paddw m0, m1
3891    HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
%else ; non-sse2
3893    mova m7, [hmul_8p]
3894    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1
3895    ; could do first HADAMARD4_V here to save spilling later
3896    ; surprisingly, not a win on conroe or even p4
3897    mova spill0, m2
3898    mova spill1, m3
3899    mova spill2, m1
3900    SWAP 1, 7
3901    LOAD_SUMSUB_8x4P 4, 5, 6, 7, 2, 3, 1, r0, r2, 1
3902    HADAMARD4_V 4, 5, 6, 7, 3
3903    mova m1, spill2
3904    mova m2, spill0
3905    mova m3, spill1
3906    mova spill0, m6
3907    mova spill1, m7
3908    HADAMARD4_V 0, 1, 2, 3, 7
3909    SUMSUB_BADC w, 0, 4, 1, 5, 7
3910    HADAMARD 2, sumsub, 0, 4, 7, 6
3911    HADAMARD 2, sumsub, 1, 5, 7, 6
3912    HADAMARD 1, amax, 0, 4, 7, 6
3913    HADAMARD 1, amax, 1, 5, 7, 6
3914    mova m6, spill0
3915    mova m7, spill1
3916    paddw m0, m1
3917    SUMSUB_BADC w, 2, 6, 3, 7, 4
3918    HADAMARD 2, sumsub, 2, 6, 4, 5
3919    HADAMARD 2, sumsub, 3, 7, 4, 5
3920    HADAMARD 1, amax, 2, 6, 4, 5
3921    HADAMARD 1, amax, 3, 7, 4, 5
3922%endif ; sse2/non-sse2
3923    paddw m0, m2
3924    paddw m0, m3
3925    SAVE_MM_PERMUTATION
3926    ret
%endif ; mmsize == 16
3928
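; Identical to the vertical (sse2) path of pixel_sa8d_8x8_internal above, but
; kept as a separate entry point for the composed block sizes below.  Leaves
; the 8x8 partial sums in m0; callers finish with a horizontal add and the
; usual (sum + 1) >> 1 rounding.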
3929cglobal pixel_sa8d_8x8_internal2
3930    %define spill0 [esp+4]
3931    LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1
3932    HADAMARD4_2D 0, 1, 2, 3, 4
3933    movdqa spill0, m3
3934    LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1
3935    HADAMARD4_2D 4, 5, 6, 7, 3
3936    HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax
3937    movdqa m3, spill0
3938    paddw m0, m1
3939    HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
3940    paddw m0, m2
3941    paddw m0, m3
3942    SAVE_MM_PERMUTATION
3943    ret
3944
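; 8x8: one internal call, horizontal add, then round: sa8d = (sum + 1) >> 1.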
3945cglobal pixel_sa8d_8x8, 4,7
3946    FIX_STRIDES r1, r3
3947    mov    r6, esp
3948    and   esp, ~15
3949    sub   esp, 48
3950    lea    r4, [3*r1]
3951    lea    r5, [3*r3]
3952    call pixel_sa8d_8x8_internal
3953%if HIGH_BIT_DEPTH
3954    HADDUW m0, m1
3955%else
3956    HADDW  m0, m1
3957%endif ; HIGH_BIT_DEPTH
3958    movd  eax, m0
3959    add   eax, 1
3960    shr   eax, 1
3961    mov   esp, r6
3962    RET
3963
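; 16x16: four 8x8 calls combined with SA8D_INTER; the mmsize==8 low-bit-depth
; fallback widens the word sums to dwords by hand (punpcklwd/punpckhwd)
; before the final HADDD.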
3964cglobal pixel_sa8d_16x16, 4,7
3965    FIX_STRIDES r1, r3
3966    mov  r6, esp
3967    and  esp, ~15
3968    sub  esp, 64
3969    lea  r4, [3*r1]
3970    lea  r5, [3*r3]
3971    call pixel_sa8d_8x8_internal
3972%if mmsize == 8
3973    lea  r0, [r0+4*r1]
3974    lea  r2, [r2+4*r3]
3975%endif
3976%if HIGH_BIT_DEPTH
3977    HADDUW m0, m1
3978%endif
3979    mova [esp+48], m0
3980    call pixel_sa8d_8x8_internal
3981    mov  r0, [r6+20]
3982    mov  r2, [r6+28]
3983    add  r0, 8*SIZEOF_PIXEL
3984    add  r2, 8*SIZEOF_PIXEL
3985    SA8D_INTER
3986    mova [esp+48], m0
3987    call pixel_sa8d_8x8_internal
3988%if mmsize == 8
3989    lea  r0, [r0+4*r1]
3990    lea  r2, [r2+4*r3]
3991%else
3992    SA8D_INTER
3993%endif
3994    mova [esp+64-mmsize], m0
3995    call pixel_sa8d_8x8_internal
3996%if HIGH_BIT_DEPTH
3997    SA8D_INTER
3998%else ; !HIGH_BIT_DEPTH
3999    paddusw m0, [esp+64-mmsize]
4000%if mmsize == 16
4001    HADDUW m0, m1
4002%else
4003    mova m2, [esp+48]
4004    pxor m7, m7
4005    mova m1, m0
4006    mova m3, m2
4007    punpcklwd m0, m7
4008    punpckhwd m1, m7
4009    punpcklwd m2, m7
4010    punpckhwd m3, m7
4011    paddd m0, m1
4012    paddd m2, m3
4013    paddd m0, m2
4014    HADDD m0, m1
4015%endif
4016%endif ; HIGH_BIT_DEPTH
4017    movd eax, m0
4018    add  eax, 1
4019    shr  eax, 1
4020    mov  esp, r6
4021    RET
4022
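; The composed sizes below accumulate rounded partials in a scalar: after
; each sub-block, (sum + 1) >> 1 is added to the running total at [esp+36].
; 8x16 is simply two 8x8 blocks stacked vertically.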
4023cglobal pixel_sa8d_8x16, 4,7,8
4024    FIX_STRIDES r1, r3
4025    mov  r6, esp
4026    and  esp, ~15
4027    sub  esp, 64
4028
4029    lea  r4, [r1 + 2*r1]
4030    lea  r5, [r3 + 2*r3]
4031    call pixel_sa8d_8x8_internal2
4032    HADDUW m0, m1
4033    movd r4d, m0
4034    add  r4d, 1
4035    shr  r4d, 1
4036    mov dword [esp+36], r4d
4037
4038    mov  r0, [r6+20]
4039    mov  r2, [r6+28]
4040    lea  r0, [r0 + r1*8]
4041    lea  r2, [r2 + r3*8]
4042    lea  r4, [r1 + 2*r1]
4043    call pixel_sa8d_8x8_internal2
4044    HADDUW m0, m1
4045    movd r4d, m0
4046    add  r4d, 1
4047    shr  r4d, 1
4048    add r4d, dword [esp+36]
4049    mov eax, r4d
4050    mov esp, r6
4051    RET
4052
4053cglobal pixel_sa8d_8x32, 4,7,8
4054    FIX_STRIDES r1, r3
4055    mov  r6, esp
4056    and  esp, ~15
4057    sub  esp, 64
4058
4059    lea  r4, [r1 + 2*r1]
4060    lea  r5, [r3 + 2*r3]
4061    call pixel_sa8d_8x8_internal2
4062    HADDUW m0, m1
4063    movd r4d, m0
4064    add  r4d, 1
4065    shr  r4d, 1
4066    mov dword [esp+36], r4d
4067
4068    mov  r0, [r6+20]
4069    mov  r2, [r6+28]
4070    lea  r0, [r0 + r1*8]
4071    lea  r2, [r2 + r3*8]
4072    lea  r4, [r1 + 2*r1]
4073    call pixel_sa8d_8x8_internal2
4074    HADDUW m0, m1
4075    movd r4d, m0
4076    add  r4d, 1
4077    shr  r4d, 1
4078    add  r4d, dword [esp+36]
4079    mov dword [esp+36], r4d
4080
4081    mov  r0, [r6+20]
4082    mov  r2, [r6+28]
4083    lea  r0, [r0 + r1*8]
4084    lea  r2, [r2 + r3*8]
4085    lea  r0, [r0 + r1*8]
4086    lea  r2, [r2 + r3*8]
4087    lea  r4, [r1 + 2*r1]
4088    call pixel_sa8d_8x8_internal2
4089    HADDUW m0, m1
4090    movd r4d, m0
4091    add  r4d, 1
4092    shr  r4d, 1
4093    add  r4d, dword [esp+36]
4094    mov dword [esp+36], r4d
4095
4096    mov  r0, [r6+20]
4097    mov  r2, [r6+28]
4098    lea  r0, [r0 + r1*8]
4099    lea  r2, [r2 + r3*8]
4100    lea  r0, [r0 + r1*8]
4101    lea  r2, [r2 + r3*8]
4102    lea  r0, [r0 + r1*8]
4103    lea  r2, [r2 + r3*8]
4104    lea  r4, [r1 + 2*r1]
4105    call pixel_sa8d_8x8_internal2
4106    HADDUW m0, m1
4107    movd r4d, m0
4108    add  r4d, 1
4109    shr  r4d, 1
4110    add  r4d, dword [esp+36]
4111    mov eax, r4d
4112    mov esp, r6
4113    RET
4114
4115cglobal pixel_sa8d_16x8, 4,7,8
4116    FIX_STRIDES r1, r3
4117    mov  r6, esp
4118    and  esp, ~15
4119    sub  esp, 64
4120
4121    lea  r4, [r1 + 2*r1]
4122    lea  r5, [r3 + 2*r3]
4123    call pixel_sa8d_8x8_internal2
4124    HADDUW m0, m1
4125    movd r4d, m0
4126    add  r4d, 1
4127    shr  r4d, 1
4128    mov dword [esp+36], r4d
4129
4130    mov  r0, [r6+20]
4131    mov  r2, [r6+28]
4132    add  r0, 8*SIZEOF_PIXEL
4133    add  r2, 8*SIZEOF_PIXEL
4134    lea  r4, [r1 + 2*r1]
4135    call pixel_sa8d_8x8_internal2
4136    HADDUW m0, m1
4137    movd r4d, m0
4138    add  r4d, 1
4139    shr  r4d, 1
4140    add r4d, dword [esp+36]
4141    mov eax, r4d
4142    mov esp, r6
4143    RET
4144
4145cglobal pixel_sa8d_16x32, 4,7,8
4146    FIX_STRIDES r1, r3
4147    mov  r6, esp
4148    and  esp, ~15
4149    sub  esp, 64
4150
4151    lea  r4, [r1 + 2*r1]
4152    lea  r5, [r3 + 2*r3]
4153    call pixel_sa8d_8x8_internal2
4154%if HIGH_BIT_DEPTH
4155    HADDUW m0, m1
4156%endif
    mova [esp+48], m0
4158    call pixel_sa8d_8x8_internal2
4159    SA8D_INTER
4160    mova [esp+48], m0
4161
4162    mov  r0, [r6+20]
4163    mov  r2, [r6+28]
4164    add  r0, 8*SIZEOF_PIXEL
4165    add  r2, 8*SIZEOF_PIXEL
4166    call pixel_sa8d_8x8_internal2
4167    SA8D_INTER
4168    mova [esp+48], m0
4169    call pixel_sa8d_8x8_internal2
4170    SA8D_INTER
4171%if HIGH_BIT_DEPTH == 0
4172    HADDUW m0, m1
4173%endif
4174    movd r4d, m0
4175    add  r4d, 1
4176    shr  r4d, 1
4177    mov dword [esp+36], r4d
4178
4179    mov  r0, [r6+20]
4180    mov  r2, [r6+28]
4181    lea  r0, [r0 + r1*8]
4182    lea  r2, [r2 + r3*8]
4183    lea  r0, [r0 + r1*8]
4184    lea  r2, [r2 + r3*8]
4185    lea  r4, [r1 + 2*r1]
4186    call pixel_sa8d_8x8_internal2
4187%if HIGH_BIT_DEPTH
4188    HADDUW m0, m1
4189%endif
4190    mova [esp+48], m0
4191    call pixel_sa8d_8x8_internal2
4192    SA8D_INTER
4193    mova [esp+48], m0
4194
4195    mov  r0, [r6+20]
4196    mov  r2, [r6+28]
4197    lea  r0, [r0 + r1*8]
4198    lea  r2, [r2 + r3*8]
4199    lea  r0, [r0 + r1*8]
4200    lea  r2, [r2 + r3*8]
4201    add  r0, 8*SIZEOF_PIXEL
4202    add  r2, 8*SIZEOF_PIXEL
4203    call pixel_sa8d_8x8_internal2
4204    SA8D_INTER
4205    mova [esp+48], m0
4206    call pixel_sa8d_8x8_internal2
4207    SA8D_INTER
4208%if HIGH_BIT_DEPTH == 0
4209    HADDUW m0, m1
4210%endif
4211    movd r4d, m0
4212    add  r4d, 1
4213    shr  r4d, 1
4214    add r4d, dword [esp+36]
4215    mov eax, r4d
4216    mov esp, r6
4217    RET
4218
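; 16x64: four 16x16 sections stacked vertically.  Between sections the saved
; pointers at [r6+20]/[r6+28] are themselves advanced by 16 rows, so the same
; 16x16 body just repeats down the block.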
4219cglobal pixel_sa8d_16x64, 4,7,8
4220    FIX_STRIDES r1, r3
4221    mov  r6, esp
4222    and  esp, ~15
4223    sub  esp, 64
4224
4225    lea  r4, [r1 + 2*r1]
4226    lea  r5, [r3 + 2*r3]
4227    call pixel_sa8d_8x8_internal2
4228%if HIGH_BIT_DEPTH
4229    HADDUW m0, m1
4230%endif
    mova [esp+48], m0
4232    call pixel_sa8d_8x8_internal2
4233    SA8D_INTER
4234    mova [esp+48], m0
4235
4236    mov  r0, [r6+20]
4237    mov  r2, [r6+28]
4238    add  r0, 8*SIZEOF_PIXEL
4239    add  r2, 8*SIZEOF_PIXEL
4240    call pixel_sa8d_8x8_internal2
4241    SA8D_INTER
4242    mova [esp+48], m0
4243    call pixel_sa8d_8x8_internal2
4244    SA8D_INTER
4245%if HIGH_BIT_DEPTH == 0
4246    HADDUW m0, m1
4247%endif
4248    movd r4d, m0
4249    add  r4d, 1
4250    shr  r4d, 1
4251    mov dword [esp+36], r4d
4252
4253    mov  r0, [r6+20]
4254    mov  r2, [r6+28]
4255    lea  r0, [r0 + r1*8]
4256    lea  r2, [r2 + r3*8]
4257    lea  r0, [r0 + r1*8]
4258    lea  r2, [r2 + r3*8]
4259    mov  [r6+20], r0
4260    mov  [r6+28], r2
4261
4262    lea  r4, [r1 + 2*r1]
4263    call pixel_sa8d_8x8_internal2
4264%if HIGH_BIT_DEPTH
4265    HADDUW m0, m1
4266%endif
4267    mova [esp+48], m0
4268    call pixel_sa8d_8x8_internal2
4269    SA8D_INTER
4270    mova [esp+48], m0
4271
4272    mov  r0, [r6+20]
4273    mov  r2, [r6+28]
4274    add  r0, 8*SIZEOF_PIXEL
4275    add  r2, 8*SIZEOF_PIXEL
4276    call pixel_sa8d_8x8_internal2
4277    SA8D_INTER
4278    mova [esp+64-mmsize], m0
4279    call pixel_sa8d_8x8_internal2
4280    AVG_16x16
4281
4282    mov  r0, [r6+20]
4283    mov  r2, [r6+28]
4284    lea  r0, [r0 + r1*8]
4285    lea  r2, [r2 + r3*8]
4286    lea  r0, [r0 + r1*8]
4287    lea  r2, [r2 + r3*8]
4288    mov  [r6+20], r0
4289    mov  [r6+28], r2
4290
4291    lea  r4, [r1 + 2*r1]
4292    call pixel_sa8d_8x8_internal2
4293%if HIGH_BIT_DEPTH
4294    HADDUW m0, m1
4295%endif
4296    mova [esp+48], m0
4297    call pixel_sa8d_8x8_internal2
4298    SA8D_INTER
4299    mova [esp+48], m0
4300
4301    mov  r0, [r6+20]
4302    mov  r2, [r6+28]
4303    add  r0, 8*SIZEOF_PIXEL
4304    add  r2, 8*SIZEOF_PIXEL
4305    call pixel_sa8d_8x8_internal2
4306    SA8D_INTER
4307    mova [esp+64-mmsize], m0
4308    call pixel_sa8d_8x8_internal2
4309    AVG_16x16
4310
4311    mov  r0, [r6+20]
4312    mov  r2, [r6+28]
4313    lea  r0, [r0 + r1*8]
4314    lea  r2, [r2 + r3*8]
4315    lea  r0, [r0 + r1*8]
4316    lea  r2, [r2 + r3*8]
4317    mov  [r6+20], r0
4318    mov  [r6+28], r2
4319
4320    lea  r4, [r1 + 2*r1]
4321    call pixel_sa8d_8x8_internal2
4322%if HIGH_BIT_DEPTH
4323    HADDUW m0, m1
4324%endif
4325    mova [esp+48], m0
4326    call pixel_sa8d_8x8_internal2
4327    SA8D_INTER
4328    mova [esp+48], m0
4329
4330    mov  r0, [r6+20]
4331    mov  r2, [r6+28]
4332    add  r0, 8*SIZEOF_PIXEL
4333    add  r2, 8*SIZEOF_PIXEL
4334    call pixel_sa8d_8x8_internal2
4335    SA8D_INTER
4336    mova [esp+64-mmsize], m0
4337    call pixel_sa8d_8x8_internal2
4338    SA8D_INTER
4339%if HIGH_BIT_DEPTH == 0
4340    HADDUW m0, m1
4341%endif
4342    movd r4d, m0
4343    add  r4d, 1
4344    shr  r4d, 1
4345    add r4d, dword [esp+36]
4346    mov eax, r4d
4347    mov esp, r6
4348    RET
4349
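; 24x32: three 8-pixel-wide columns per 8-row band; each 8x8 result is
; rounded and added to [esp+36] individually, and the saved base pointers are
; advanced 8 rows at each band boundary.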
4350cglobal pixel_sa8d_24x32, 4,7,8
4351    FIX_STRIDES r1, r3
4352    mov  r6, esp
4353    and  esp, ~15
4354    sub  esp, 64
4355
4356    lea  r4, [r1 + 2*r1]
4357    lea  r5, [r3 + 2*r3]
4358    call pixel_sa8d_8x8_internal2
4359    HADDUW m0, m1
4360    movd r4d, m0
4361    add  r4d, 1
4362    shr  r4d, 1
4363    mov dword [esp+36], r4d
4364
4365    mov  r0, [r6+20]
4366    mov  r2, [r6+28]
4367    add  r0, 8*SIZEOF_PIXEL
4368    add  r2, 8*SIZEOF_PIXEL
4369    lea  r4, [r1 + 2*r1]
4370    call pixel_sa8d_8x8_internal2
4371    HADDUW m0, m1
4372    movd r4d, m0
4373    add  r4d, 1
4374    shr  r4d, 1
4375    add  r4d, dword [esp+36]
4376    mov dword [esp+36], r4d
4377
4378    mov  r0, [r6+20]
4379    mov  r2, [r6+28]
4380    add  r0, 16*SIZEOF_PIXEL
4381    add  r2, 16*SIZEOF_PIXEL
4382    lea  r4, [r1 + 2*r1]
4383    call pixel_sa8d_8x8_internal2
4384    HADDUW m0, m1
4385    movd r4d, m0
4386    add  r4d, 1
4387    shr  r4d, 1
4388    add  r4d, dword [esp+36]
4389    mov dword [esp+36], r4d
4390
4391    mov  r0, [r6+20]
4392    mov  r2, [r6+28]
4393    lea  r0, [r0 + r1*8]
4394    lea  r2, [r2 + r3*8]
4395    mov  [r6+20], r0
4396    mov  [r6+28], r2
4397    lea  r4, [r1 + 2*r1]
4398    call pixel_sa8d_8x8_internal2
4399    HADDUW m0, m1
4400    movd r4d, m0
4401    add  r4d, 1
4402    shr  r4d, 1
4403    add  r4d, dword [esp+36]
4404    mov dword [esp+36], r4d
4405
4406    mov  r0, [r6+20]
4407    mov  r2, [r6+28]
4408    add  r0, 8*SIZEOF_PIXEL
4409    add  r2, 8*SIZEOF_PIXEL
4410    lea  r4, [r1 + 2*r1]
4411    call pixel_sa8d_8x8_internal2
4412    HADDUW m0, m1
4413    movd r4d, m0
4414    add  r4d, 1
4415    shr  r4d, 1
4416    add  r4d, dword [esp+36]
4417    mov dword [esp+36], r4d
4418
4419    mov  r0, [r6+20]
4420    mov  r2, [r6+28]
4421    add  r0, 16*SIZEOF_PIXEL
4422    add  r2, 16*SIZEOF_PIXEL
4423    lea  r4, [r1 + 2*r1]
4424    call pixel_sa8d_8x8_internal2
4425    HADDUW m0, m1
4426    movd r4d, m0
4427    add  r4d, 1
4428    shr  r4d, 1
4429    add  r4d, dword [esp+36]
4430    mov dword [esp+36], r4d
4431
4432    mov  r0, [r6+20]
4433    mov  r2, [r6+28]
4434    lea  r0, [r0 + r1*8]
4435    lea  r2, [r2 + r3*8]
4436    mov  [r6+20], r0
4437    mov  [r6+28], r2
4438    lea  r4, [r1 + 2*r1]
4439    call pixel_sa8d_8x8_internal2
4440    HADDUW m0, m1
4441    movd r4d, m0
4442    add  r4d, 1
4443    shr  r4d, 1
4444    add  r4d, dword [esp+36]
4445    mov dword [esp+36], r4d
4446
4447    mov  r0, [r6+20]
4448    mov  r2, [r6+28]
4449    add  r0, 8*SIZEOF_PIXEL
4450    add  r2, 8*SIZEOF_PIXEL
4451    lea  r4, [r1 + 2*r1]
4452    call pixel_sa8d_8x8_internal2
4453    HADDUW m0, m1
4454    movd r4d, m0
4455    add  r4d, 1
4456    shr  r4d, 1
4457    add  r4d, dword [esp+36]
4458    mov dword [esp+36], r4d
4459
4460    mov  r0, [r6+20]
4461    mov  r2, [r6+28]
4462    add  r0, 16*SIZEOF_PIXEL
4463    add  r2, 16*SIZEOF_PIXEL
4464    lea  r4, [r1 + 2*r1]
4465    call pixel_sa8d_8x8_internal2
4466    HADDUW m0, m1
4467    movd r4d, m0
4468    add  r4d, 1
4469    shr  r4d, 1
4470    add  r4d, dword [esp+36]
4471    mov dword [esp+36], r4d
4472
4473    mov  r0, [r6+20]
4474    mov  r2, [r6+28]
4475    lea  r0, [r0 + r1*8]
4476    lea  r2, [r2 + r3*8]
4477    mov  [r6+20], r0
4478    mov  [r6+28], r2
4479    lea  r4, [r1 + 2*r1]
4480    call pixel_sa8d_8x8_internal2
4481    HADDUW m0, m1
4482    movd r4d, m0
4483    add  r4d, 1
4484    shr  r4d, 1
4485    add  r4d, dword [esp+36]
4486    mov dword [esp+36], r4d
4487
4488    mov  r0, [r6+20]
4489    mov  r2, [r6+28]
4490    add  r0, 8*SIZEOF_PIXEL
4491    add  r2, 8*SIZEOF_PIXEL
4492    lea  r4, [r1 + 2*r1]
4493    call pixel_sa8d_8x8_internal2
4494    HADDUW m0, m1
4495    movd r4d, m0
4496    add  r4d, 1
4497    shr  r4d, 1
4498    add  r4d, dword [esp+36]
4499    mov dword [esp+36], r4d
4500
4501    mov  r0, [r6+20]
4502    mov  r2, [r6+28]
4503    add  r0, 16*SIZEOF_PIXEL
4504    add  r2, 16*SIZEOF_PIXEL
4505    lea  r4, [r1 + 2*r1]
4506    call pixel_sa8d_8x8_internal2
4507    HADDUW m0, m1
4508    movd r4d, m0
4509    add  r4d, 1
4510    shr  r4d, 1
4511    add  r4d, dword [esp+36]
4512    mov eax, r4d
4513    mov esp, r6
4514    RET
4515
4516cglobal pixel_sa8d_32x8, 4,7,8
4517    FIX_STRIDES r1, r3
4518    mov  r6, esp
4519    and  esp, ~15
4520    sub  esp, 64
4521
4522    lea  r4, [r1 + 2*r1]
4523    lea  r5, [r3 + 2*r3]
4524    call pixel_sa8d_8x8_internal2
4525    HADDUW m0, m1
4526    movd r4d, m0
4527    add  r4d, 1
4528    shr  r4d, 1
4529    mov dword [esp+36], r4d
4530
4531    mov  r0, [r6+20]
4532    mov  r2, [r6+28]
4533    add  r0, 8*SIZEOF_PIXEL
4534    add  r2, 8*SIZEOF_PIXEL
4535    lea  r4, [r1 + 2*r1]
4536    call pixel_sa8d_8x8_internal2
4537    HADDUW m0, m1
4538    movd r4d, m0
4539    add  r4d, 1
4540    shr  r4d, 1
4541    add  r4d, dword [esp+36]
4542    mov dword [esp+36], r4d
4543
4544    mov  r0, [r6+20]
4545    mov  r2, [r6+28]
4546    add  r0, 16*SIZEOF_PIXEL
4547    add  r2, 16*SIZEOF_PIXEL
4548    lea  r4, [r1 + 2*r1]
4549    call pixel_sa8d_8x8_internal2
4550    HADDUW m0, m1
4551    movd r4d, m0
4552    add  r4d, 1
4553    shr  r4d, 1
4554    add  r4d, dword [esp+36]
4555    mov dword [esp+36], r4d
4556
4557    mov  r0, [r6+20]
4558    mov  r2, [r6+28]
4559    add  r0, 24*SIZEOF_PIXEL
4560    add  r2, 24*SIZEOF_PIXEL
4561    lea  r4, [r1 + 2*r1]
4562    call pixel_sa8d_8x8_internal2
4563    HADDUW m0, m1
4564    movd r4d, m0
4565    add  r4d, 1
4566    shr  r4d, 1
4567    add  r4d, dword [esp+36]
4568    mov eax, r4d
4569    mov esp, r6
4570    RET
4571
4572cglobal pixel_sa8d_32x16, 4,7,8
4573    FIX_STRIDES r1, r3
4574    mov  r6, esp
4575    and  esp, ~15
4576    sub  esp, 64
4577
4578    lea  r4, [r1 + 2*r1]
4579    lea  r5, [r3 + 2*r3]
4580    call pixel_sa8d_8x8_internal2
4581%if HIGH_BIT_DEPTH
4582    HADDUW m0, m1
4583%endif
    mova [esp+48], m0
4585    call pixel_sa8d_8x8_internal2
4586    SA8D_INTER
4587    mova [esp+48], m0
4588
4589    mov  r0, [r6+20]
4590    mov  r2, [r6+28]
4591    add  r0, 8*SIZEOF_PIXEL
4592    add  r2, 8*SIZEOF_PIXEL
4593    call pixel_sa8d_8x8_internal2
4594    SA8D_INTER
4595    mova [esp+48], m0
4596    call pixel_sa8d_8x8_internal2
4597    SA8D_INTER
4598%if HIGH_BIT_DEPTH == 0
4599    HADDUW m0, m1
4600%endif
4601    movd r4d, m0
4602    add  r4d, 1
4603    shr  r4d, 1
4604    mov dword [esp+36], r4d
4605
4606    mov  r0, [r6+20]
4607    mov  r2, [r6+28]
4608    add  r0, 16*SIZEOF_PIXEL
4609    add  r2, 16*SIZEOF_PIXEL
4610    lea  r4, [r1 + 2*r1]
4611    call pixel_sa8d_8x8_internal2
4612%if HIGH_BIT_DEPTH
4613    HADDUW m0, m1
4614%endif
4615    mova [esp+48], m0
4616    call pixel_sa8d_8x8_internal2
4617    SA8D_INTER
4618    mova [esp+48], m0
4619
4620    mov  r0, [r6+20]
4621    mov  r2, [r6+28]
4622    add  r0, 24*SIZEOF_PIXEL
4623    add  r2, 24*SIZEOF_PIXEL
4624    call pixel_sa8d_8x8_internal2
4625    SA8D_INTER
4626    mova [esp+64-mmsize], m0
4627    call pixel_sa8d_8x8_internal2
4628    SA8D_INTER
4629%if HIGH_BIT_DEPTH == 0
4630    HADDUW m0, m1
4631%endif
4632    movd r4d, m0
4633    add  r4d, 1
4634    shr  r4d, 1
4635    add r4d, dword [esp+36]
4636    mov eax, r4d
4637    mov esp, r6
4638    RET
4639
4640cglobal pixel_sa8d_32x24, 4,7,8
4641    FIX_STRIDES r1, r3
4642    mov  r6, esp
4643    and  esp, ~15
4644    sub  esp, 64
4645
4646    lea  r4, [r1 + 2*r1]
4647    lea  r5, [r3 + 2*r3]
4648    call pixel_sa8d_8x8_internal2
4649    HADDUW m0, m1
4650    movd r4d, m0
4651    add  r4d, 1
4652    shr  r4d, 1
4653    mov dword [esp+36], r4d
4654
4655    mov  r0, [r6+20]
4656    mov  r2, [r6+28]
4657    add  r0, 8*SIZEOF_PIXEL
4658    add  r2, 8*SIZEOF_PIXEL
4659    lea  r4, [r1 + 2*r1]
4660    call pixel_sa8d_8x8_internal2
4661    HADDUW m0, m1
4662    movd r4d, m0
4663    add  r4d, 1
4664    shr  r4d, 1
4665    add  r4d, dword [esp+36]
4666    mov dword [esp+36], r4d
4667
4668    mov  r0, [r6+20]
4669    mov  r2, [r6+28]
4670    add  r0, 16*SIZEOF_PIXEL
4671    add  r2, 16*SIZEOF_PIXEL
4672    lea  r4, [r1 + 2*r1]
4673    call pixel_sa8d_8x8_internal2
4674    HADDUW m0, m1
4675    movd r4d, m0
4676    add  r4d, 1
4677    shr  r4d, 1
4678    add  r4d, dword [esp+36]
4679    mov dword [esp+36], r4d
4680
4681    mov  r0, [r6+20]
4682    mov  r2, [r6+28]
4683    add  r0, 24*SIZEOF_PIXEL
4684    add  r2, 24*SIZEOF_PIXEL
4685    lea  r4, [r1 + 2*r1]
4686    call pixel_sa8d_8x8_internal2
4687    HADDUW m0, m1
4688    movd r4d, m0
4689    add  r4d, 1
4690    shr  r4d, 1
4691    add  r4d, dword [esp+36]
4692    mov dword [esp+36], r4d
4693
4694    mov  r0, [r6+20]
4695    mov  r2, [r6+28]
4696    lea  r0, [r0 + r1*8]
4697    lea  r2, [r2 + r3*8]
4698    mov  [r6+20], r0
4699    mov  [r6+28], r2
4700    lea  r4, [r1 + 2*r1]
4701    call pixel_sa8d_8x8_internal2
4702    HADDUW m0, m1
4703    movd r4d, m0
4704    add  r4d, 1
4705    shr  r4d, 1
4706    add  r4d, dword [esp+36]
4707    mov dword [esp+36], r4d
4708
4709    mov  r0, [r6+20]
4710    mov  r2, [r6+28]
4711    add  r0, 8*SIZEOF_PIXEL
4712    add  r2, 8*SIZEOF_PIXEL
4713    lea  r4, [r1 + 2*r1]
4714    call pixel_sa8d_8x8_internal2
4715    HADDUW m0, m1
4716    movd r4d, m0
4717    add  r4d, 1
4718    shr  r4d, 1
4719    add  r4d, dword [esp+36]
4720    mov dword [esp+36], r4d
4721
4722    mov  r0, [r6+20]
4723    mov  r2, [r6+28]
4724    add  r0, 16*SIZEOF_PIXEL
4725    add  r2, 16*SIZEOF_PIXEL
4726    lea  r4, [r1 + 2*r1]
4727    call pixel_sa8d_8x8_internal2
4728    HADDUW m0, m1
4729    movd r4d, m0
4730    add  r4d, 1
4731    shr  r4d, 1
4732    add  r4d, dword [esp+36]
4733    mov dword [esp+36], r4d
4734
4735    mov  r0, [r6+20]
4736    mov  r2, [r6+28]
4737    add  r0, 24*SIZEOF_PIXEL
4738    add  r2, 24*SIZEOF_PIXEL
4739    lea  r4, [r1 + 2*r1]
4740    call pixel_sa8d_8x8_internal2
4741    HADDUW m0, m1
4742    movd r4d, m0
4743    add  r4d, 1
4744    shr  r4d, 1
4745    add  r4d, dword [esp+36]
4746    mov dword [esp+36], r4d
4747
4748    mov  r0, [r6+20]
4749    mov  r2, [r6+28]
4750    lea  r0, [r0 + r1*8]
4751    lea  r2, [r2 + r3*8]
4752    mov  [r6+20], r0
4753    mov  [r6+28], r2
4754    lea  r4, [r1 + 2*r1]
4755    call pixel_sa8d_8x8_internal2
4756    HADDUW m0, m1
4757    movd r4d, m0
4758    add  r4d, 1
4759    shr  r4d, 1
4760    add  r4d, dword [esp+36]
4761    mov dword [esp+36], r4d
4762
4763    mov  r0, [r6+20]
4764    mov  r2, [r6+28]
4765    add  r0, 8*SIZEOF_PIXEL
4766    add  r2, 8*SIZEOF_PIXEL
4767    lea  r4, [r1 + 2*r1]
4768    call pixel_sa8d_8x8_internal2
4769    HADDUW m0, m1
4770    movd r4d, m0
4771    add  r4d, 1
4772    shr  r4d, 1
4773    add  r4d, dword [esp+36]
4774    mov dword [esp+36], r4d
4775
4776    mov  r0, [r6+20]
4777    mov  r2, [r6+28]
4778    add  r0, 16*SIZEOF_PIXEL
4779    add  r2, 16*SIZEOF_PIXEL
4780    lea  r4, [r1 + 2*r1]
4781    call pixel_sa8d_8x8_internal2
4782    HADDUW m0, m1
4783    movd r4d, m0
4784    add  r4d, 1
4785    shr  r4d, 1
4786    add  r4d, dword [esp+36]
4787    mov dword [esp+36], r4d
4788
4789    mov  r0, [r6+20]
4790    mov  r2, [r6+28]
4791    add  r0, 24*SIZEOF_PIXEL
4792    add  r2, 24*SIZEOF_PIXEL
4793    lea  r4, [r1 + 2*r1]
4794    call pixel_sa8d_8x8_internal2
4795    HADDUW m0, m1
4796    movd r4d, m0
4797    add  r4d, 1
4798    shr  r4d, 1
4799    add  r4d, dword [esp+36]
4800    mov eax, r4d
4801    mov esp, r6
4802    RET
4803
4804cglobal pixel_sa8d_32x32, 4,7,8
4805    FIX_STRIDES r1, r3
4806    mov  r6, esp
4807    and  esp, ~15
4808    sub  esp, 64
4809
4810    lea  r4, [r1 + 2*r1]
4811    lea  r5, [r3 + 2*r3]
4812    call pixel_sa8d_8x8_internal2
4813%if HIGH_BIT_DEPTH
4814    HADDUW m0, m1
4815%endif
    mova [esp+48], m0
4817    call pixel_sa8d_8x8_internal2
4818    SA8D_INTER
4819    mova [esp+48], m0
4820
4821    mov  r0, [r6+20]
4822    mov  r2, [r6+28]
4823    add  r0, 8*SIZEOF_PIXEL
4824    add  r2, 8*SIZEOF_PIXEL
4825    call pixel_sa8d_8x8_internal2
4826    SA8D_INTER
4827    mova [esp+48], m0
4828    call pixel_sa8d_8x8_internal2
4829    SA8D_INTER
4830%if HIGH_BIT_DEPTH == 0
4831    HADDUW m0, m1
4832%endif
4833    movd r4d, m0
4834    add  r4d, 1
4835    shr  r4d, 1
4836    mov dword [esp+36], r4d
4837
4838    mov  r0, [r6+20]
4839    mov  r2, [r6+28]
4840    add  r0, 16*SIZEOF_PIXEL
4841    add  r2, 16*SIZEOF_PIXEL
4842    lea  r4, [r1 + 2*r1]
4843    call pixel_sa8d_8x8_internal2
4844%if HIGH_BIT_DEPTH
4845    HADDUW m0, m1
4846%endif
4847    mova [esp+48], m0
4848    call pixel_sa8d_8x8_internal2
4849    SA8D_INTER
4850    mova [esp+48], m0
4851
4852    mov  r0, [r6+20]
4853    mov  r2, [r6+28]
4854    add  r0, 24*SIZEOF_PIXEL
4855    add  r2, 24*SIZEOF_PIXEL
4856    call pixel_sa8d_8x8_internal2
4857    SA8D_INTER
4858    mova [esp+64-mmsize], m0
4859    call pixel_sa8d_8x8_internal2
4860    AVG_16x16
4861
4862    mov  r0, [r6+20]
4863    mov  r2, [r6+28]
4864    lea  r0, [r0 + r1*8]
4865    lea  r2, [r2 + r3*8]
4866    lea  r0, [r0 + r1*8]
4867    lea  r2, [r2 + r3*8]
4868    lea  r4, [r1 + 2*r1]
4869    call pixel_sa8d_8x8_internal2
4870%if HIGH_BIT_DEPTH
4871    HADDUW m0, m1
4872%endif
4873    mova [esp+48], m0
4874    call pixel_sa8d_8x8_internal2
4875    SA8D_INTER
4876    mova [esp+48], m0
4877
4878    mov  r0, [r6+20]
4879    mov  r2, [r6+28]
4880    lea  r0, [r0 + r1*8]
4881    lea  r2, [r2 + r3*8]
4882    lea  r0, [r0 + r1*8]
4883    lea  r2, [r2 + r3*8]
4884    add  r0, 8*SIZEOF_PIXEL
4885    add  r2, 8*SIZEOF_PIXEL
4886    call pixel_sa8d_8x8_internal2
4887    SA8D_INTER
4888    mova [esp+64-mmsize], m0
4889    call pixel_sa8d_8x8_internal2
4890    AVG_16x16
4891
4892    mov  r0, [r6+20]
4893    mov  r2, [r6+28]
4894    lea  r0, [r0 + r1*8]
4895    lea  r2, [r2 + r3*8]
4896    lea  r0, [r0 + r1*8]
4897    lea  r2, [r2 + r3*8]
4898    add  r0, 16*SIZEOF_PIXEL
4899    add  r2, 16*SIZEOF_PIXEL
4900    lea  r4, [r1 + 2*r1]
4901    call pixel_sa8d_8x8_internal2
4902%if HIGH_BIT_DEPTH
4903    HADDUW m0, m1
4904%endif
4905    mova [esp+48], m0
4906    call pixel_sa8d_8x8_internal2
4907    SA8D_INTER
4908    mova [esp+48], m0
4909
4910    mov  r0, [r6+20]
4911    mov  r2, [r6+28]
4912    lea  r0, [r0 + r1*8]
4913    lea  r2, [r2 + r3*8]
4914    lea  r0, [r0 + r1*8]
4915    lea  r2, [r2 + r3*8]
4916    add  r0, 24*SIZEOF_PIXEL
4917    add  r2, 24*SIZEOF_PIXEL
4918    call pixel_sa8d_8x8_internal2
4919    SA8D_INTER
4920    mova [esp+64-mmsize], m0
4921    call pixel_sa8d_8x8_internal2
4922    SA8D_INTER
4923%if HIGH_BIT_DEPTH == 0
4924    HADDUW m0, m1
4925%endif
4926    movd r4d, m0
4927    add  r4d, 1
4928    shr  r4d, 1
4929    add r4d, dword [esp+36]
4930    mov eax, r4d
4931    mov esp, r6
4932    RET
4933
4934cglobal pixel_sa8d_32x64, 4,7,8
4935    FIX_STRIDES r1, r3
4936    mov  r6, esp
4937    and  esp, ~15
4938    sub  esp, 64
4939
4940    lea  r4, [r1 + 2*r1]
4941    lea  r5, [r3 + 2*r3]
4942    call pixel_sa8d_8x8_internal2
4943%if HIGH_BIT_DEPTH
4944    HADDUW m0, m1
4945%endif
    mova [esp+48], m0
4947    call pixel_sa8d_8x8_internal2
4948    SA8D_INTER
4949    mova [esp+48], m0
4950
4951    mov  r0, [r6+20]
4952    mov  r2, [r6+28]
4953    add  r0, 8*SIZEOF_PIXEL
4954    add  r2, 8*SIZEOF_PIXEL
4955    call pixel_sa8d_8x8_internal2
4956    SA8D_INTER
4957    mova [esp+48], m0
4958    call pixel_sa8d_8x8_internal2
4959    SA8D_INTER
4960%if HIGH_BIT_DEPTH == 0
4961    HADDUW m0, m1
4962%endif
4963    movd r4d, m0
4964    add  r4d, 1
4965    shr  r4d, 1
4966    mov dword [esp+36], r4d
4967
4968    mov  r0, [r6+20]
4969    mov  r2, [r6+28]
4970    add  r0, 16*SIZEOF_PIXEL
4971    add  r2, 16*SIZEOF_PIXEL
4972    lea  r4, [r1 + 2*r1]
4973    call pixel_sa8d_8x8_internal2
4974%if HIGH_BIT_DEPTH
4975    HADDUW m0, m1
4976%endif
4977    mova [esp+48], m0
4978    call pixel_sa8d_8x8_internal2
4979    SA8D_INTER
4980    mova [esp+48], m0
4981
4982    mov  r0, [r6+20]
4983    mov  r2, [r6+28]
4984    add  r0, 24*SIZEOF_PIXEL
4985    add  r2, 24*SIZEOF_PIXEL
4986    call pixel_sa8d_8x8_internal2
4987    SA8D_INTER
4988    mova [esp+64-mmsize], m0
4989    call pixel_sa8d_8x8_internal2
4990    AVG_16x16
4991
4992    mov  r0, [r6+20]
4993    mov  r2, [r6+28]
4994    lea  r0, [r0 + r1*8]
4995    lea  r2, [r2 + r3*8]
4996    lea  r0, [r0 + r1*8]
4997    lea  r2, [r2 + r3*8]
4998    mov  [r6+20], r0
4999    mov  [r6+28], r2
5000
5001    lea  r4, [r1 + 2*r1]
5002    call pixel_sa8d_8x8_internal2
5003%if HIGH_BIT_DEPTH
5004    HADDUW m0, m1
5005%endif
5006    mova [esp+48], m0
5007    call pixel_sa8d_8x8_internal2
5008    SA8D_INTER
5009    mova [esp+48], m0
5010
5011    mov  r0, [r6+20]
5012    mov  r2, [r6+28]
5013    add  r0, 8*SIZEOF_PIXEL
5014    add  r2, 8*SIZEOF_PIXEL
5015    call pixel_sa8d_8x8_internal2
5016    SA8D_INTER
5017    mova [esp+64-mmsize], m0
5018    call pixel_sa8d_8x8_internal2
5019    AVG_16x16
5020
5021    mov  r0, [r6+20]
5022    mov  r2, [r6+28]
5023    add  r0, 16*SIZEOF_PIXEL
5024    add  r2, 16*SIZEOF_PIXEL
5025    lea  r4, [r1 + 2*r1]
5026    call pixel_sa8d_8x8_internal2
5027%if HIGH_BIT_DEPTH
5028    HADDUW m0, m1
5029%endif
5030    mova [esp+48], m0
5031    call pixel_sa8d_8x8_internal2
5032    SA8D_INTER
5033    mova [esp+48], m0
5034
5035    mov  r0, [r6+20]
5036    mov  r2, [r6+28]
5037    add  r0, 24*SIZEOF_PIXEL
5038    add  r2, 24*SIZEOF_PIXEL
5039    call pixel_sa8d_8x8_internal2
5040    SA8D_INTER
5041    mova [esp+64-mmsize], m0
5042    call pixel_sa8d_8x8_internal2
5043    AVG_16x16
5044
5045    mov  r0, [r6+20]
5046    mov  r2, [r6+28]
5047    lea  r0, [r0 + r1*8]
5048    lea  r2, [r2 + r3*8]
5049    lea  r0, [r0 + r1*8]
5050    lea  r2, [r2 + r3*8]
5051    mov  [r6+20], r0
5052    mov  [r6+28], r2
5053
5054    lea  r4, [r1 + 2*r1]
5055    call pixel_sa8d_8x8_internal2
5056%if HIGH_BIT_DEPTH
5057    HADDUW m0, m1
5058%endif
5059    mova [esp+48], m0
5060    call pixel_sa8d_8x8_internal2
5061    SA8D_INTER
5062    mova [esp+48], m0
5063
5064    mov  r0, [r6+20]
5065    mov  r2, [r6+28]
5066    add  r0, 8*SIZEOF_PIXEL
5067    add  r2, 8*SIZEOF_PIXEL
5068    call pixel_sa8d_8x8_internal2
5069    SA8D_INTER
5070    mova [esp+64-mmsize], m0
5071    call pixel_sa8d_8x8_internal2
5072    AVG_16x16
5073
5074    mov  r0, [r6+20]
5075    mov  r2, [r6+28]
5076    add  r0, 16*SIZEOF_PIXEL
5077    add  r2, 16*SIZEOF_PIXEL
5078    lea  r4, [r1 + 2*r1]
5079    call pixel_sa8d_8x8_internal2
5080%if HIGH_BIT_DEPTH
5081    HADDUW m0, m1
5082%endif
5083    mova [esp+48], m0
5084    call pixel_sa8d_8x8_internal2
5085    SA8D_INTER
5086    mova [esp+48], m0
5087
5088    mov  r0, [r6+20]
5089    mov  r2, [r6+28]
5090    add  r0, 24*SIZEOF_PIXEL
5091    add  r2, 24*SIZEOF_PIXEL
5092    call pixel_sa8d_8x8_internal2
5093    SA8D_INTER
5094    mova [esp+64-mmsize], m0
5095    call pixel_sa8d_8x8_internal2
5096    AVG_16x16
5097
5098    mov  r0, [r6+20]
5099    mov  r2, [r6+28]
5100    lea  r0, [r0 + r1*8]
5101    lea  r2, [r2 + r3*8]
5102    lea  r0, [r0 + r1*8]
5103    lea  r2, [r2 + r3*8]
5104    mov  [r6+20], r0
5105    mov  [r6+28], r2
5106
5107    lea  r4, [r1 + 2*r1]
5108    call pixel_sa8d_8x8_internal2
5109%if HIGH_BIT_DEPTH
5110    HADDUW m0, m1
5111%endif
5112    mova [esp+48], m0
5113    call pixel_sa8d_8x8_internal2
5114    SA8D_INTER
5115    mova [esp+48], m0
5116
5117    mov  r0, [r6+20]
5118    mov  r2, [r6+28]
5119    add  r0, 8*SIZEOF_PIXEL
5120    add  r2, 8*SIZEOF_PIXEL
5121    call pixel_sa8d_8x8_internal2
5122    SA8D_INTER
5123    mova [esp+64-mmsize], m0
5124    call pixel_sa8d_8x8_internal2
5125    AVG_16x16
5126
5127    mov  r0, [r6+20]
5128    mov  r2, [r6+28]
5129    add  r0, 16*SIZEOF_PIXEL
5130    add  r2, 16*SIZEOF_PIXEL
5131    lea  r4, [r1 + 2*r1]
5132    call pixel_sa8d_8x8_internal2
5133%if HIGH_BIT_DEPTH
5134    HADDUW m0, m1
5135%endif
5136    mova [esp+48], m0
5137    call pixel_sa8d_8x8_internal2
5138    SA8D_INTER
5139    mova [esp+48], m0
5140
5141    mov  r0, [r6+20]
5142    mov  r2, [r6+28]
5143    add  r0, 24*SIZEOF_PIXEL
5144    add  r2, 24*SIZEOF_PIXEL
5145    call pixel_sa8d_8x8_internal2
5146    SA8D_INTER
5147    mova [esp+64-mmsize], m0
5148    call pixel_sa8d_8x8_internal2
5149    SA8D_INTER
5150%if HIGH_BIT_DEPTH == 0
5151    HADDUW m0, m1
5152%endif
5153    movd r4d, m0
5154    add  r4d, 1
5155    shr  r4d, 1
5156    add r4d, dword [esp+36]
5157    mov eax, r4d
5158    mov esp, r6
5159    RET
5160
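; 48x64: each 16-row band covers three 16-pixel-wide tiles (column offsets
; 0/8, 16/24 and 32/40); rounded 16x16 partials are folded into the running
; total at [esp+36], mostly via the AVG_16x16 helper defined earlier in this
; file (which, judging by its use here, rounds a combined 16x16 partial and
; adds it to [esp+36]).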
5161cglobal pixel_sa8d_48x64, 4,7,8
5162    FIX_STRIDES r1, r3
5163    mov  r6, esp
5164    and  esp, ~15
5165    sub  esp, 64
5166
5167    lea  r4, [r1 + 2*r1]
5168    lea  r5, [r3 + 2*r3]
5169    call pixel_sa8d_8x8_internal2
5170%if HIGH_BIT_DEPTH
5171    HADDUW m0, m1
5172%endif
    mova [esp+48], m0
5174    call pixel_sa8d_8x8_internal2
5175    SA8D_INTER
5176    mova [esp+48], m0
5177
5178    mov  r0, [r6+20]
5179    mov  r2, [r6+28]
5180    add  r0, 8*SIZEOF_PIXEL
5181    add  r2, 8*SIZEOF_PIXEL
5182    call pixel_sa8d_8x8_internal2
5183    SA8D_INTER
5184    mova [esp+48], m0
5185    call pixel_sa8d_8x8_internal2
5186    SA8D_INTER
5187%if HIGH_BIT_DEPTH == 0
5188    HADDUW m0, m1
5189%endif
5190    movd r4d, m0
5191    add  r4d, 1
5192    shr  r4d, 1
5193    mov dword [esp+36], r4d
5194
5195    mov  r0, [r6+20]
5196    mov  r2, [r6+28]
5197    add  r0, 16*SIZEOF_PIXEL
5198    add  r2, 16*SIZEOF_PIXEL
5199    lea  r4, [r1 + 2*r1]
5200    call pixel_sa8d_8x8_internal2
5201%if HIGH_BIT_DEPTH
5202    HADDUW m0, m1
5203%endif
5204    mova [esp+48], m0
5205    call pixel_sa8d_8x8_internal2
5206    SA8D_INTER
5207    mova [esp+48], m0
5208
5209    mov  r0, [r6+20]
5210    mov  r2, [r6+28]
5211    add  r0, 24*SIZEOF_PIXEL
5212    add  r2, 24*SIZEOF_PIXEL
5213    call pixel_sa8d_8x8_internal2
5214    SA8D_INTER
5215    mova [esp+64-mmsize], m0
5216    call pixel_sa8d_8x8_internal2
5217    AVG_16x16
5218
5219    mov  r0, [r6+20]
5220    mov  r2, [r6+28]
5221    add  r0, 32*SIZEOF_PIXEL
5222    add  r2, 32*SIZEOF_PIXEL
5223    lea  r4, [r1 + 2*r1]
5224    call pixel_sa8d_8x8_internal2
5225%if HIGH_BIT_DEPTH
5226    HADDUW m0, m1
5227%endif
5228    mova [esp+48], m0
5229    call pixel_sa8d_8x8_internal2
5230    SA8D_INTER
5231    mova [esp+48], m0
5232
5233    mov  r0, [r6+20]
5234    mov  r2, [r6+28]
5235    add  r0, 40*SIZEOF_PIXEL
5236    add  r2, 40*SIZEOF_PIXEL
5237    call pixel_sa8d_8x8_internal2
5238    SA8D_INTER
5239    mova [esp+64-mmsize], m0
5240    call pixel_sa8d_8x8_internal2
5241    AVG_16x16
5242
5243    mov  r0, [r6+20]
5244    mov  r2, [r6+28]
5245    lea  r0, [r0 + r1*8]
5246    lea  r2, [r2 + r3*8]
5247    lea  r0, [r0 + r1*8]
5248    lea  r2, [r2 + r3*8]
5249    mov  [r6+20], r0
5250    mov  [r6+28], r2
5251
5252    lea  r4, [r1 + 2*r1]
5253    call pixel_sa8d_8x8_internal2
5254%if HIGH_BIT_DEPTH
5255    HADDUW m0, m1
5256%endif
5257    mova [esp+48], m0
5258    call pixel_sa8d_8x8_internal2
5259    SA8D_INTER
5260    mova [esp+48], m0
5261
5262    mov  r0, [r6+20]
5263    mov  r2, [r6+28]
5264    add  r0, 8*SIZEOF_PIXEL
5265    add  r2, 8*SIZEOF_PIXEL
5266    call pixel_sa8d_8x8_internal2
5267    SA8D_INTER
5268    mova [esp+64-mmsize], m0
5269    call pixel_sa8d_8x8_internal2
5270    AVG_16x16
5271
5272    mov  r0, [r6+20]
5273    mov  r2, [r6+28]
5274    add  r0, 16*SIZEOF_PIXEL
5275    add  r2, 16*SIZEOF_PIXEL
5276    lea  r4, [r1 + 2*r1]
5277    call pixel_sa8d_8x8_internal2
5278%if HIGH_BIT_DEPTH
5279    HADDUW m0, m1
5280%endif
5281    mova [esp+48], m0
5282    call pixel_sa8d_8x8_internal2
5283    SA8D_INTER
5284    mova [esp+48], m0
5285
5286    mov  r0, [r6+20]
5287    mov  r2, [r6+28]
5288    add  r0, 24*SIZEOF_PIXEL
5289    add  r2, 24*SIZEOF_PIXEL
5290    call pixel_sa8d_8x8_internal2
5291    SA8D_INTER
5292    mova [esp+64-mmsize], m0
5293    call pixel_sa8d_8x8_internal2
5294    AVG_16x16
5295
5296    mov  r0, [r6+20]
5297    mov  r2, [r6+28]
5298    add  r0, 32*SIZEOF_PIXEL
5299    add  r2, 32*SIZEOF_PIXEL
5300    lea  r4, [r1 + 2*r1]
5301    call pixel_sa8d_8x8_internal2
5302%if HIGH_BIT_DEPTH
5303    HADDUW m0, m1
5304%endif
5305    mova [esp+48], m0
5306    call pixel_sa8d_8x8_internal2
5307    SA8D_INTER
5308    mova [esp+48], m0
5309
5310    mov  r0, [r6+20]
5311    mov  r2, [r6+28]
5312    add  r0, 40*SIZEOF_PIXEL
5313    add  r2, 40*SIZEOF_PIXEL
5314    call pixel_sa8d_8x8_internal2
5315    SA8D_INTER
5316    mova [esp+64-mmsize], m0
5317    call pixel_sa8d_8x8_internal2
5318    AVG_16x16
5319
5320    mov  r0, [r6+20]
5321    mov  r2, [r6+28]
5322    lea  r0, [r0 + r1*8]
5323    lea  r2, [r2 + r3*8]
5324    lea  r0, [r0 + r1*8]
5325    lea  r2, [r2 + r3*8]
5326    mov  [r6+20], r0
5327    mov  [r6+28], r2
5328
5329    lea  r4, [r1 + 2*r1]
5330    call pixel_sa8d_8x8_internal2
5331%if HIGH_BIT_DEPTH
5332    HADDUW m0, m1
5333%endif
5334    mova [esp+48], m0
5335    call pixel_sa8d_8x8_internal2
5336    SA8D_INTER
5337    mova [esp+48], m0
5338
5339    mov  r0, [r6+20]
5340    mov  r2, [r6+28]
5341    add  r0, 8*SIZEOF_PIXEL
5342    add  r2, 8*SIZEOF_PIXEL
5343    call pixel_sa8d_8x8_internal2
5344    SA8D_INTER
5345    mova [esp+64-mmsize], m0
5346    call pixel_sa8d_8x8_internal2
5347    AVG_16x16
5348
5349    mov  r0, [r6+20]
5350    mov  r2, [r6+28]
5351    add  r0, 16*SIZEOF_PIXEL
5352    add  r2, 16*SIZEOF_PIXEL
5353    lea  r4, [r1 + 2*r1]
5354    call pixel_sa8d_8x8_internal2
5355%if HIGH_BIT_DEPTH
5356    HADDUW m0, m1
5357%endif
5358    mova [esp+48], m0
5359    call pixel_sa8d_8x8_internal2
5360    SA8D_INTER
5361    mova [esp+48], m0
5362
5363    mov  r0, [r6+20]
5364    mov  r2, [r6+28]
5365    add  r0, 24*SIZEOF_PIXEL
5366    add  r2, 24*SIZEOF_PIXEL
5367    call pixel_sa8d_8x8_internal2
5368    SA8D_INTER
5369    mova [esp+64-mmsize], m0
5370    call pixel_sa8d_8x8_internal2
5371    AVG_16x16
5372
5373    mov  r0, [r6+20]
5374    mov  r2, [r6+28]
5375    add  r0, 32*SIZEOF_PIXEL
5376    add  r2, 32*SIZEOF_PIXEL
5377    lea  r4, [r1 + 2*r1]
5378    call pixel_sa8d_8x8_internal2
5379%if HIGH_BIT_DEPTH
5380    HADDUW m0, m1
5381%endif
5382    mova [esp+48], m0
5383    call pixel_sa8d_8x8_internal2
5384    SA8D_INTER
5385    mova [esp+48], m0
5386
5387    mov  r0, [r6+20]
5388    mov  r2, [r6+28]
5389    add  r0, 40*SIZEOF_PIXEL
5390    add  r2, 40*SIZEOF_PIXEL
5391    call pixel_sa8d_8x8_internal2
5392    SA8D_INTER
5393    mova [esp+64-mmsize], m0
5394    call pixel_sa8d_8x8_internal2
5395    AVG_16x16
5396
5397    mov  r0, [r6+20]
5398    mov  r2, [r6+28]
5399    lea  r0, [r0 + r1*8]
5400    lea  r2, [r2 + r3*8]
5401    lea  r0, [r0 + r1*8]
5402    lea  r2, [r2 + r3*8]
5403    mov  [r6+20], r0
5404    mov  [r6+28], r2
5405
5406    lea  r4, [r1 + 2*r1]
5407    call pixel_sa8d_8x8_internal2
5408%if HIGH_BIT_DEPTH
5409    HADDUW m0, m1
5410%endif
5411    mova [esp+48], m0
5412    call pixel_sa8d_8x8_internal2
5413    SA8D_INTER
5414    mova [esp+48], m0
5415
5416    mov  r0, [r6+20]
5417    mov  r2, [r6+28]
5418    add  r0, 8*SIZEOF_PIXEL
5419    add  r2, 8*SIZEOF_PIXEL
5420    call pixel_sa8d_8x8_internal2
5421    SA8D_INTER
5422    mova [esp+64-mmsize], m0
5423    call pixel_sa8d_8x8_internal2
5424    AVG_16x16
5425
5426    mov  r0, [r6+20]
5427    mov  r2, [r6+28]
5428    add  r0, 16*SIZEOF_PIXEL
5429    add  r2, 16*SIZEOF_PIXEL
5430    lea  r4, [r1 + 2*r1]
5431    call pixel_sa8d_8x8_internal2
5432%if HIGH_BIT_DEPTH
5433    HADDUW m0, m1
5434%endif
5435    mova [esp+48], m0
5436    call pixel_sa8d_8x8_internal2
5437    SA8D_INTER
5438    mova [esp+48], m0
5439
5440    mov  r0, [r6+20]
5441    mov  r2, [r6+28]
5442    add  r0, 24*SIZEOF_PIXEL
5443    add  r2, 24*SIZEOF_PIXEL
5444    call pixel_sa8d_8x8_internal2
5445    SA8D_INTER
5446    mova [esp+64-mmsize], m0
5447    call pixel_sa8d_8x8_internal2
5448    AVG_16x16
5449
5450    mov  r0, [r6+20]
5451    mov  r2, [r6+28]
5452    add  r0, 32*SIZEOF_PIXEL
5453    add  r2, 32*SIZEOF_PIXEL
5454    lea  r4, [r1 + 2*r1]
5455    call pixel_sa8d_8x8_internal2
5456%if HIGH_BIT_DEPTH
5457    HADDUW m0, m1
5458%endif
5459    mova [esp+48], m0
5460    call pixel_sa8d_8x8_internal2
5461    SA8D_INTER
5462    mova [esp+48], m0
5463
5464    mov  r0, [r6+20]
5465    mov  r2, [r6+28]
5466    add  r0, 40*SIZEOF_PIXEL
5467    add  r2, 40*SIZEOF_PIXEL
5468    call pixel_sa8d_8x8_internal2
5469    SA8D_INTER
5470    mova [esp+64-mmsize], m0
5471    call pixel_sa8d_8x8_internal2
5472    SA8D_INTER
5473%if HIGH_BIT_DEPTH == 0
5474    HADDUW m0, m1
5475%endif
5476    movd r4d, m0
5477    add  r4d, 1
5478    shr  r4d, 1
5479    add r4d, dword [esp+36]
5480    mov eax, r4d
5481    mov esp, r6
5482    RET
5483
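; 64x16: a single 16-row band of four 16-pixel-wide tiles (column offsets
; 0/8 through 48/56).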
5484cglobal pixel_sa8d_64x16, 4,7,8
5485    FIX_STRIDES r1, r3
5486    mov  r6, esp
5487    and  esp, ~15
5488    sub  esp, 64
5489
5490    lea  r4, [r1 + 2*r1]
5491    lea  r5, [r3 + 2*r3]
5492    call pixel_sa8d_8x8_internal2
5493%if HIGH_BIT_DEPTH
5494    HADDUW m0, m1
5495%endif
    mova [esp+48], m0
5497    call pixel_sa8d_8x8_internal2
5498    SA8D_INTER
5499    mova [esp+48], m0
5500
5501    mov  r0, [r6+20]
5502    mov  r2, [r6+28]
5503    add  r0, 8*SIZEOF_PIXEL
5504    add  r2, 8*SIZEOF_PIXEL
5505    call pixel_sa8d_8x8_internal2
5506    SA8D_INTER
5507    mova [esp+48], m0
5508    call pixel_sa8d_8x8_internal2
5509    SA8D_INTER
5510%if HIGH_BIT_DEPTH == 0
5511    HADDUW m0, m1
5512%endif
5513    movd r4d, m0
5514    add  r4d, 1
5515    shr  r4d, 1
5516    mov dword [esp+36], r4d
5517
5518    mov  r0, [r6+20]
5519    mov  r2, [r6+28]
5520    add  r0, 16*SIZEOF_PIXEL
5521    add  r2, 16*SIZEOF_PIXEL
5522    lea  r4, [r1 + 2*r1]
5523    call pixel_sa8d_8x8_internal2
5524%if HIGH_BIT_DEPTH
5525    HADDUW m0, m1
5526%endif
5527    mova [esp+48], m0
5528    call pixel_sa8d_8x8_internal2
5529    SA8D_INTER
5530    mova [esp+48], m0
5531
5532    mov  r0, [r6+20]
5533    mov  r2, [r6+28]
5534    add  r0, 24*SIZEOF_PIXEL
5535    add  r2, 24*SIZEOF_PIXEL
5536    call pixel_sa8d_8x8_internal2
5537    SA8D_INTER
5538    mova [esp+64-mmsize], m0
5539    call pixel_sa8d_8x8_internal2
5540    AVG_16x16
5541
5542    mov  r0, [r6+20]
5543    mov  r2, [r6+28]
5544    add  r0, 32*SIZEOF_PIXEL
5545    add  r2, 32*SIZEOF_PIXEL
5546    lea  r4, [r1 + 2*r1]
5547    call pixel_sa8d_8x8_internal2
5548%if HIGH_BIT_DEPTH
5549    HADDUW m0, m1
5550%endif
5551    mova [esp+48], m0
5552    call pixel_sa8d_8x8_internal2
5553    SA8D_INTER
5554    mova [esp+48], m0
5555
5556    mov  r0, [r6+20]
5557    mov  r2, [r6+28]
5558    add  r0, 40*SIZEOF_PIXEL
5559    add  r2, 40*SIZEOF_PIXEL
5560    call pixel_sa8d_8x8_internal2
5561    SA8D_INTER
5562    mova [esp+64-mmsize], m0
5563    call pixel_sa8d_8x8_internal2
5564    AVG_16x16
5565
5566    mov  r0, [r6+20]
5567    mov  r2, [r6+28]
5568    add  r0, 48*SIZEOF_PIXEL
5569    add  r2, 48*SIZEOF_PIXEL
5570    lea  r4, [r1 + 2*r1]
5571    call pixel_sa8d_8x8_internal2
5572%if HIGH_BIT_DEPTH
5573    HADDUW m0, m1
5574%endif
5575    mova [esp+48], m0
5576    call pixel_sa8d_8x8_internal2
5577    SA8D_INTER
5578    mova [esp+48], m0
5579
5580    mov  r0, [r6+20]
5581    mov  r2, [r6+28]
5582    add  r0, 56*SIZEOF_PIXEL
5583    add  r2, 56*SIZEOF_PIXEL
5584    call pixel_sa8d_8x8_internal2
5585    SA8D_INTER
5586    mova [esp+64-mmsize], m0
5587    call pixel_sa8d_8x8_internal2
5588    SA8D_INTER
5589%if HIGH_BIT_DEPTH == 0
5590    HADDUW m0, m1
5591%endif
5592    movd r4d, m0
5593    add  r4d, 1
5594    shr  r4d, 1
5595    add r4d, dword [esp+36]
5596    mov eax, r4d
5597    mov esp, r6
5598    RET
5599
5600cglobal pixel_sa8d_64x32, 4,7,8
5601    FIX_STRIDES r1, r3
5602    mov  r6, esp
5603    and  esp, ~15
5604    sub  esp, 64
5605
5606    lea  r4, [r1 + 2*r1]
5607    lea  r5, [r3 + 2*r3]
5608    call pixel_sa8d_8x8_internal2
5609%if HIGH_BIT_DEPTH
5610    HADDUW m0, m1
5611%endif
    mova [esp+48], m0
5613    call pixel_sa8d_8x8_internal2
5614    SA8D_INTER
5615    mova [esp+48], m0
5616
5617    mov  r0, [r6+20]
5618    mov  r2, [r6+28]
5619    add  r0, 8*SIZEOF_PIXEL
5620    add  r2, 8*SIZEOF_PIXEL
5621    call pixel_sa8d_8x8_internal2
5622    SA8D_INTER
5623    mova [esp+48], m0
5624    call pixel_sa8d_8x8_internal2
5625    SA8D_INTER
5626%if HIGH_BIT_DEPTH == 0
5627    HADDUW m0, m1
5628%endif
5629    movd r4d, m0
5630    add  r4d, 1
5631    shr  r4d, 1
5632    mov dword [esp+36], r4d
5633
5634    mov  r0, [r6+20]
5635    mov  r2, [r6+28]
5636    add  r0, 16*SIZEOF_PIXEL
5637    add  r2, 16*SIZEOF_PIXEL
5638    lea  r4, [r1 + 2*r1]
5639    call pixel_sa8d_8x8_internal2
5640%if HIGH_BIT_DEPTH
5641    HADDUW m0, m1
5642%endif
5643    mova [esp+48], m0
5644    call pixel_sa8d_8x8_internal2
5645    SA8D_INTER
5646    mova [esp+48], m0
5647
5648    mov  r0, [r6+20]
5649    mov  r2, [r6+28]
5650    add  r0, 24*SIZEOF_PIXEL
5651    add  r2, 24*SIZEOF_PIXEL
5652    call pixel_sa8d_8x8_internal2
5653    SA8D_INTER
5654    mova [esp+64-mmsize], m0
5655    call pixel_sa8d_8x8_internal2
5656    AVG_16x16
5657
5658    mov  r0, [r6+20]
5659    mov  r2, [r6+28]
5660    add  r0, 32*SIZEOF_PIXEL
5661    add  r2, 32*SIZEOF_PIXEL
5662    lea  r4, [r1 + 2*r1]
5663    call pixel_sa8d_8x8_internal2
5664%if HIGH_BIT_DEPTH
5665    HADDUW m0, m1
5666%endif
5667    mova [esp+48], m0
5668    call pixel_sa8d_8x8_internal2
5669    SA8D_INTER
5670    mova [esp+48], m0
5671
5672    mov  r0, [r6+20]
5673    mov  r2, [r6+28]
5674    add  r0, 40*SIZEOF_PIXEL
5675    add  r2, 40*SIZEOF_PIXEL
5676    call pixel_sa8d_8x8_internal2
5677    SA8D_INTER
5678    mova [esp+64-mmsize], m0
5679    call pixel_sa8d_8x8_internal2
5680    AVG_16x16
5681
5682    mov  r0, [r6+20]
5683    mov  r2, [r6+28]
5684    add  r0, 48*SIZEOF_PIXEL
5685    add  r2, 48*SIZEOF_PIXEL
5686    lea  r4, [r1 + 2*r1]
5687    call pixel_sa8d_8x8_internal2
5688%if HIGH_BIT_DEPTH
5689    HADDUW m0, m1
5690%endif
5691    mova [esp+48], m0
5692    call pixel_sa8d_8x8_internal2
5693    SA8D_INTER
5694    mova [esp+48], m0
5695
5696    mov  r0, [r6+20]
5697    mov  r2, [r6+28]
5698    add  r0, 56*SIZEOF_PIXEL
5699    add  r2, 56*SIZEOF_PIXEL
5700    call pixel_sa8d_8x8_internal2
5701    SA8D_INTER
5702    mova [esp+64-mmsize], m0
5703    call pixel_sa8d_8x8_internal2
5704    AVG_16x16
5705
5706    mov  r0, [r6+20]
5707    mov  r2, [r6+28]
5708    lea  r0, [r0 + r1*8]
5709    lea  r2, [r2 + r3*8]
5710    lea  r0, [r0 + r1*8]
5711    lea  r2, [r2 + r3*8]
5712    mov  [r6+20], r0
5713    mov  [r6+28], r2
5714
5715    lea  r4, [r1 + 2*r1]
5716    call pixel_sa8d_8x8_internal2
5717%if HIGH_BIT_DEPTH
5718    HADDUW m0, m1
5719%endif
5720    mova [esp+48], m0
5721    call pixel_sa8d_8x8_internal2
5722    SA8D_INTER
5723    mova [esp+48], m0
5724
5725    mov  r0, [r6+20]
5726    mov  r2, [r6+28]
5727    add  r0, 8*SIZEOF_PIXEL
5728    add  r2, 8*SIZEOF_PIXEL
5729    call pixel_sa8d_8x8_internal2
5730    SA8D_INTER
5731    mova [esp+64-mmsize], m0
5732    call pixel_sa8d_8x8_internal2
5733    AVG_16x16
5734
5735    mov  r0, [r6+20]
5736    mov  r2, [r6+28]
5737    add  r0, 16*SIZEOF_PIXEL
5738    add  r2, 16*SIZEOF_PIXEL
5739    lea  r4, [r1 + 2*r1]
5740    call pixel_sa8d_8x8_internal2
5741%if HIGH_BIT_DEPTH
5742    HADDUW m0, m1
5743%endif
5744    mova [esp+48], m0
5745    call pixel_sa8d_8x8_internal2
5746    SA8D_INTER
5747    mova [esp+48], m0
5748
5749    mov  r0, [r6+20]
5750    mov  r2, [r6+28]
5751    add  r0, 24*SIZEOF_PIXEL
5752    add  r2, 24*SIZEOF_PIXEL
5753    call pixel_sa8d_8x8_internal2
5754    SA8D_INTER
5755    mova [esp+64-mmsize], m0
5756    call pixel_sa8d_8x8_internal2
5757    AVG_16x16
5758
5759    mov  r0, [r6+20]
5760    mov  r2, [r6+28]
5761    add  r0, 32*SIZEOF_PIXEL
5762    add  r2, 32*SIZEOF_PIXEL
5763    lea  r4, [r1 + 2*r1]
5764    call pixel_sa8d_8x8_internal2
5765%if HIGH_BIT_DEPTH
5766    HADDUW m0, m1
5767%endif
5768    mova [esp+48], m0
5769    call pixel_sa8d_8x8_internal2
5770    SA8D_INTER
5771    mova [esp+48], m0
5772
5773    mov  r0, [r6+20]
5774    mov  r2, [r6+28]
5775    add  r0, 40*SIZEOF_PIXEL
5776    add  r2, 40*SIZEOF_PIXEL
5777    call pixel_sa8d_8x8_internal2
5778    SA8D_INTER
5779    mova [esp+64-mmsize], m0
5780    call pixel_sa8d_8x8_internal2
5781    AVG_16x16
5782
5783    mov  r0, [r6+20]
5784    mov  r2, [r6+28]
5785    add  r0, 48*SIZEOF_PIXEL
5786    add  r2, 48*SIZEOF_PIXEL
5787    lea  r4, [r1 + 2*r1]
5788    call pixel_sa8d_8x8_internal2
5789%if HIGH_BIT_DEPTH
5790    HADDUW m0, m1
5791%endif
5792    mova [esp+48], m0
5793    call pixel_sa8d_8x8_internal2
5794    SA8D_INTER
5795    mova [esp+48], m0
5796
5797    mov  r0, [r6+20]
5798    mov  r2, [r6+28]
5799    add  r0, 56*SIZEOF_PIXEL
5800    add  r2, 56*SIZEOF_PIXEL
5801    call pixel_sa8d_8x8_internal2
5802    SA8D_INTER
5803    mova [esp+64-mmsize], m0
5804    call pixel_sa8d_8x8_internal2
5805    SA8D_INTER
5806%if HIGH_BIT_DEPTH == 0
5807    HADDUW m0, m1
5808%endif
5809    movd r4d, m0
5810    add  r4d, 1
5811    shr  r4d, 1
5812    add r4d, dword [esp+36]
5813    mov eax, r4d
5814    mov esp, r6
5815    RET
5816
5817cglobal pixel_sa8d_64x48, 4,7,8
5818    FIX_STRIDES r1, r3
5819    mov  r6, esp
5820    and  esp, ~15
5821    sub  esp, 64
5822
5823    lea  r4, [r1 + 2*r1]
5824    lea  r5, [r3 + 2*r3]
5825    call pixel_sa8d_8x8_internal2
5826%if HIGH_BIT_DEPTH
5827    HADDUW m0, m1
5828%endif
    mova [esp+48], m0
5830    call pixel_sa8d_8x8_internal2
5831    SA8D_INTER
5832    mova [esp+48], m0
5833
5834    mov  r0, [r6+20]
5835    mov  r2, [r6+28]
5836    add  r0, 8*SIZEOF_PIXEL
5837    add  r2, 8*SIZEOF_PIXEL
5838    call pixel_sa8d_8x8_internal2
5839    SA8D_INTER
5840    mova [esp+48], m0
5841    call pixel_sa8d_8x8_internal2
5842    SA8D_INTER
5843%if HIGH_BIT_DEPTH == 0
5844    HADDUW m0, m1
5845%endif
5846    movd r4d, m0
5847    add  r4d, 1
5848    shr  r4d, 1
5849    mov dword [esp+36], r4d
5850
5851    mov  r0, [r6+20]
5852    mov  r2, [r6+28]
5853    add  r0, 16*SIZEOF_PIXEL
5854    add  r2, 16*SIZEOF_PIXEL
5855    lea  r4, [r1 + 2*r1]
5856    call pixel_sa8d_8x8_internal2
5857%if HIGH_BIT_DEPTH
5858    HADDUW m0, m1
5859%endif
5860    mova [esp+48], m0
5861    call pixel_sa8d_8x8_internal2
5862    SA8D_INTER
5863    mova [esp+48], m0
5864
5865    mov  r0, [r6+20]
5866    mov  r2, [r6+28]
5867    add  r0, 24*SIZEOF_PIXEL
5868    add  r2, 24*SIZEOF_PIXEL
5869    call pixel_sa8d_8x8_internal2
5870    SA8D_INTER
5871    mova [esp+64-mmsize], m0
5872    call pixel_sa8d_8x8_internal2
5873    AVG_16x16
5874
5875    mov  r0, [r6+20]
5876    mov  r2, [r6+28]
5877    add  r0, 32*SIZEOF_PIXEL
5878    add  r2, 32*SIZEOF_PIXEL
5879    lea  r4, [r1 + 2*r1]
5880    call pixel_sa8d_8x8_internal2
5881%if HIGH_BIT_DEPTH
5882    HADDUW m0, m1
5883%endif
5884    mova [esp+48], m0
5885    call pixel_sa8d_8x8_internal2
5886    SA8D_INTER
5887    mova [esp+48], m0
5888
5889    mov  r0, [r6+20]
5890    mov  r2, [r6+28]
5891    add  r0, 40*SIZEOF_PIXEL
5892    add  r2, 40*SIZEOF_PIXEL
5893    call pixel_sa8d_8x8_internal2
5894    SA8D_INTER
5895    mova [esp+64-mmsize], m0
5896    call pixel_sa8d_8x8_internal2
5897    AVG_16x16
5898
5899    mov  r0, [r6+20]
5900    mov  r2, [r6+28]
5901    add  r0, 48*SIZEOF_PIXEL
5902    add  r2, 48*SIZEOF_PIXEL
5903    lea  r4, [r1 + 2*r1]
5904    call pixel_sa8d_8x8_internal2
5905%if HIGH_BIT_DEPTH
5906    HADDUW m0, m1
5907%endif
5908    mova [esp+48], m0
5909    call pixel_sa8d_8x8_internal2
5910    SA8D_INTER
5911    mova [esp+48], m0
5912
5913    mov  r0, [r6+20]
5914    mov  r2, [r6+28]
5915    add  r0, 56*SIZEOF_PIXEL
5916    add  r2, 56*SIZEOF_PIXEL
5917    call pixel_sa8d_8x8_internal2
5918    SA8D_INTER
5919    mova [esp+64-mmsize], m0
5920    call pixel_sa8d_8x8_internal2
5921    AVG_16x16
5922
5923    mov  r0, [r6+20]
5924    mov  r2, [r6+28]
5925    lea  r0, [r0 + r1*8]
5926    lea  r2, [r2 + r3*8]
5927    lea  r0, [r0 + r1*8]
5928    lea  r2, [r2 + r3*8]
5929    mov  [r6+20], r0
5930    mov  [r6+28], r2
5931
5932    lea  r4, [r1 + 2*r1]
5933    call pixel_sa8d_8x8_internal2
5934%if HIGH_BIT_DEPTH
5935    HADDUW m0, m1
5936%endif
5937    mova [esp+48], m0
5938    call pixel_sa8d_8x8_internal2
5939    SA8D_INTER
5940    mova [esp+48], m0
5941
5942    mov  r0, [r6+20]
5943    mov  r2, [r6+28]
5944    add  r0, 8*SIZEOF_PIXEL
5945    add  r2, 8*SIZEOF_PIXEL
5946    call pixel_sa8d_8x8_internal2
5947    SA8D_INTER
5948    mova [esp+64-mmsize], m0
5949    call pixel_sa8d_8x8_internal2
5950    AVG_16x16
5951
5952    mov  r0, [r6+20]
5953    mov  r2, [r6+28]
5954    add  r0, 16*SIZEOF_PIXEL
5955    add  r2, 16*SIZEOF_PIXEL
5956    lea  r4, [r1 + 2*r1]
5957    call pixel_sa8d_8x8_internal2
5958%if HIGH_BIT_DEPTH
5959    HADDUW m0, m1
5960%endif
5961    mova [esp+48], m0
5962    call pixel_sa8d_8x8_internal2
5963    SA8D_INTER
5964    mova [esp+48], m0
5965
5966    mov  r0, [r6+20]
5967    mov  r2, [r6+28]
5968    add  r0, 24*SIZEOF_PIXEL
5969    add  r2, 24*SIZEOF_PIXEL
5970    call pixel_sa8d_8x8_internal2
5971    SA8D_INTER
5972    mova [esp+64-mmsize], m0
5973    call pixel_sa8d_8x8_internal2
5974    AVG_16x16
5975
5976    mov  r0, [r6+20]
5977    mov  r2, [r6+28]
5978    add  r0, 32*SIZEOF_PIXEL
5979    add  r2, 32*SIZEOF_PIXEL
5980    lea  r4, [r1 + 2*r1]
5981    call pixel_sa8d_8x8_internal2
5982%if HIGH_BIT_DEPTH
5983    HADDUW m0, m1
5984%endif
5985    mova [esp+48], m0
5986    call pixel_sa8d_8x8_internal2
5987    SA8D_INTER
5988    mova [esp+48], m0
5989
5990    mov  r0, [r6+20]
5991    mov  r2, [r6+28]
5992    add  r0, 40*SIZEOF_PIXEL
5993    add  r2, 40*SIZEOF_PIXEL
5994    call pixel_sa8d_8x8_internal2
5995    SA8D_INTER
5996    mova [esp+64-mmsize], m0
5997    call pixel_sa8d_8x8_internal2
5998    AVG_16x16
5999
6000    mov  r0, [r6+20]
6001    mov  r2, [r6+28]
6002    add  r0, 48*SIZEOF_PIXEL
6003    add  r2, 48*SIZEOF_PIXEL
6004    lea  r4, [r1 + 2*r1]
6005    call pixel_sa8d_8x8_internal2
6006%if HIGH_BIT_DEPTH
6007    HADDUW m0, m1
6008%endif
6009    mova [esp+48], m0
6010    call pixel_sa8d_8x8_internal2
6011    SA8D_INTER
6012    mova [esp+48], m0
6013
6014    mov  r0, [r6+20]
6015    mov  r2, [r6+28]
6016    add  r0, 56*SIZEOF_PIXEL
6017    add  r2, 56*SIZEOF_PIXEL
6018    call pixel_sa8d_8x8_internal2
6019    SA8D_INTER
6020    mova [esp+64-mmsize], m0
6021    call pixel_sa8d_8x8_internal2
6022    AVG_16x16
6023
6024    mov  r0, [r6+20]
6025    mov  r2, [r6+28]
6026    lea  r0, [r0 + r1*8]
6027    lea  r2, [r2 + r3*8]
6028    lea  r0, [r0 + r1*8]
6029    lea  r2, [r2 + r3*8]
6030    mov  [r6+20], r0
6031    mov  [r6+28], r2
6032
6033    lea  r4, [r1 + 2*r1]
6034    call pixel_sa8d_8x8_internal2
6035%if HIGH_BIT_DEPTH
6036    HADDUW m0, m1
6037%endif
6038    mova [esp+48], m0
6039    call pixel_sa8d_8x8_internal2
6040    SA8D_INTER
6041    mova [esp+48], m0
6042
6043    mov  r0, [r6+20]
6044    mov  r2, [r6+28]
6045    add  r0, 8*SIZEOF_PIXEL
6046    add  r2, 8*SIZEOF_PIXEL
6047    call pixel_sa8d_8x8_internal2
6048    SA8D_INTER
6049    mova [esp+64-mmsize], m0
6050    call pixel_sa8d_8x8_internal2
6051    AVG_16x16
6052
6053    mov  r0, [r6+20]
6054    mov  r2, [r6+28]
6055    add  r0, 16*SIZEOF_PIXEL
6056    add  r2, 16*SIZEOF_PIXEL
6057    lea  r4, [r1 + 2*r1]
6058    call pixel_sa8d_8x8_internal2
6059%if HIGH_BIT_DEPTH
6060    HADDUW m0, m1
6061%endif
6062    mova [esp+48], m0
6063    call pixel_sa8d_8x8_internal2
6064    SA8D_INTER
6065    mova [esp+48], m0
6066
6067    mov  r0, [r6+20]
6068    mov  r2, [r6+28]
6069    add  r0, 24*SIZEOF_PIXEL
6070    add  r2, 24*SIZEOF_PIXEL
6071    call pixel_sa8d_8x8_internal2
6072    SA8D_INTER
6073    mova [esp+64-mmsize], m0
6074    call pixel_sa8d_8x8_internal2
6075    AVG_16x16
6076
6077    mov  r0, [r6+20]
6078    mov  r2, [r6+28]
6079    add  r0, 32*SIZEOF_PIXEL
6080    add  r2, 32*SIZEOF_PIXEL
6081    lea  r4, [r1 + 2*r1]
6082    call pixel_sa8d_8x8_internal2
6083%if HIGH_BIT_DEPTH
6084    HADDUW m0, m1
6085%endif
6086    mova [esp+48], m0
6087    call pixel_sa8d_8x8_internal2
6088    SA8D_INTER
6089    mova [esp+48], m0
6090
6091    mov  r0, [r6+20]
6092    mov  r2, [r6+28]
6093    add  r0, 40*SIZEOF_PIXEL
6094    add  r2, 40*SIZEOF_PIXEL
6095    call pixel_sa8d_8x8_internal2
6096    SA8D_INTER
6097    mova [esp+64-mmsize], m0
6098    call pixel_sa8d_8x8_internal2
6099    AVG_16x16
6100
6101    mov  r0, [r6+20]
6102    mov  r2, [r6+28]
6103    add  r0, 48*SIZEOF_PIXEL
6104    add  r2, 48*SIZEOF_PIXEL
6105    lea  r4, [r1 + 2*r1]
6106    call pixel_sa8d_8x8_internal2
6107%if HIGH_BIT_DEPTH
6108    HADDUW m0, m1
6109%endif
6110    mova [esp+48], m0
6111    call pixel_sa8d_8x8_internal2
6112    SA8D_INTER
6113    mova [esp+48], m0
6114
6115    mov  r0, [r6+20]
6116    mov  r2, [r6+28]
6117    add  r0, 56*SIZEOF_PIXEL
6118    add  r2, 56*SIZEOF_PIXEL
6119    call pixel_sa8d_8x8_internal2
6120    SA8D_INTER
6121    mova [esp+64-mmsize], m0
6122    call pixel_sa8d_8x8_internal2
6123    SA8D_INTER
6124%if HIGH_BIT_DEPTH == 0
6125    HADDUW m0, m1
6126%endif
6127    movd r4d, m0
6128    add  r4d, 1
6129    shr  r4d, 1
6130    add r4d, dword [esp+36]
6131    mov eax, r4d
6132    mov esp, r6
6133    RET
6134
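; 32-bit (x86) layout used by these large SA8D sizes: r6 keeps the
; pre-alignment esp, [r6+20]/[r6+28] are the stack slots holding the current
; pix1/pix2 base pointers (re-stored as the walk moves down the block),
; [esp+36] carries the running total, and [esp+48]/[esp+64-mmsize] hold the
; vector partial sums consumed by SA8D_INTER / AVG_16x16.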
6135cglobal pixel_sa8d_64x64, 4,7,8
6136    FIX_STRIDES r1, r3
6137    mov  r6, esp
6138    and  esp, ~15
6139    sub  esp, 64
6140
6141    lea  r4, [r1 + 2*r1]
6142    lea  r5, [r3 + 2*r3]
6143    call pixel_sa8d_8x8_internal2
6144%if HIGH_BIT_DEPTH
6145    HADDUW m0, m1
6146%endif
6147    mova [esp+48], m0
6148    call pixel_sa8d_8x8_internal2
6149    SA8D_INTER
6150    mova [esp+48], m0
6151
6152    mov  r0, [r6+20]
6153    mov  r2, [r6+28]
6154    add  r0, 8*SIZEOF_PIXEL
6155    add  r2, 8*SIZEOF_PIXEL
6156    call pixel_sa8d_8x8_internal2
6157    SA8D_INTER
6158    mova [esp+48], m0
6159    call pixel_sa8d_8x8_internal2
6160    SA8D_INTER
6161%if HIGH_BIT_DEPTH == 0
6162    HADDUW m0, m1
6163%endif
6164    movd r4d, m0
6165    add  r4d, 1
6166    shr  r4d, 1
6167    mov dword [esp+36], r4d
6168
6169    mov  r0, [r6+20]
6170    mov  r2, [r6+28]
6171    add  r0, 16*SIZEOF_PIXEL
6172    add  r2, 16*SIZEOF_PIXEL
6173    lea  r4, [r1 + 2*r1]
6174    call pixel_sa8d_8x8_internal2
6175%if HIGH_BIT_DEPTH
6176    HADDUW m0, m1
6177%endif
6178    mova [esp+48], m0
6179    call pixel_sa8d_8x8_internal2
6180    SA8D_INTER
6181    mova [esp+48], m0
6182
6183    mov  r0, [r6+20]
6184    mov  r2, [r6+28]
6185    add  r0, 24*SIZEOF_PIXEL
6186    add  r2, 24*SIZEOF_PIXEL
6187    call pixel_sa8d_8x8_internal2
6188    SA8D_INTER
6189    mova [esp+64-mmsize], m0
6190    call pixel_sa8d_8x8_internal2
6191    AVG_16x16
6192
6193    mov  r0, [r6+20]
6194    mov  r2, [r6+28]
6195    add  r0, 32*SIZEOF_PIXEL
6196    add  r2, 32*SIZEOF_PIXEL
6197    lea  r4, [r1 + 2*r1]
6198    call pixel_sa8d_8x8_internal2
6199%if HIGH_BIT_DEPTH
6200    HADDUW m0, m1
6201%endif
6202    mova [esp+48], m0
6203    call pixel_sa8d_8x8_internal2
6204    SA8D_INTER
6205    mova [esp+48], m0
6206
6207    mov  r0, [r6+20]
6208    mov  r2, [r6+28]
6209    add  r0, 40*SIZEOF_PIXEL
6210    add  r2, 40*SIZEOF_PIXEL
6211    call pixel_sa8d_8x8_internal2
6212    SA8D_INTER
6213    mova [esp+64-mmsize], m0
6214    call pixel_sa8d_8x8_internal2
6215    AVG_16x16
6216
6217    mov  r0, [r6+20]
6218    mov  r2, [r6+28]
6219    add  r0, 48*SIZEOF_PIXEL
6220    add  r2, 48*SIZEOF_PIXEL
6221    lea  r4, [r1 + 2*r1]
6222    call pixel_sa8d_8x8_internal2
6223%if HIGH_BIT_DEPTH
6224    HADDUW m0, m1
6225%endif
6226    mova [esp+48], m0
6227    call pixel_sa8d_8x8_internal2
6228    SA8D_INTER
6229    mova [esp+48], m0
6230
6231    mov  r0, [r6+20]
6232    mov  r2, [r6+28]
6233    add  r0, 56*SIZEOF_PIXEL
6234    add  r2, 56*SIZEOF_PIXEL
6235    call pixel_sa8d_8x8_internal2
6236    SA8D_INTER
6237    mova [esp+64-mmsize], m0
6238    call pixel_sa8d_8x8_internal2
6239    AVG_16x16
6240
6241    mov  r0, [r6+20]
6242    mov  r2, [r6+28]
6243    lea  r0, [r0 + r1*8]
6244    lea  r2, [r2 + r3*8]
6245    lea  r0, [r0 + r1*8]
6246    lea  r2, [r2 + r3*8]
6247    mov  [r6+20], r0
6248    mov  [r6+28], r2
6249
6250    lea  r4, [r1 + 2*r1]
6251    call pixel_sa8d_8x8_internal2
6252%if HIGH_BIT_DEPTH
6253    HADDUW m0, m1
6254%endif
6255    mova [esp+48], m0
6256    call pixel_sa8d_8x8_internal2
6257    SA8D_INTER
6258    mova [esp+48], m0
6259
6260    mov  r0, [r6+20]
6261    mov  r2, [r6+28]
6262    add  r0, 8*SIZEOF_PIXEL
6263    add  r2, 8*SIZEOF_PIXEL
6264    call pixel_sa8d_8x8_internal2
6265    SA8D_INTER
6266    mova [esp+64-mmsize], m0
6267    call pixel_sa8d_8x8_internal2
6268    AVG_16x16
6269
6270    mov  r0, [r6+20]
6271    mov  r2, [r6+28]
6272    add  r0, 16*SIZEOF_PIXEL
6273    add  r2, 16*SIZEOF_PIXEL
6274    lea  r4, [r1 + 2*r1]
6275    call pixel_sa8d_8x8_internal2
6276%if HIGH_BIT_DEPTH
6277    HADDUW m0, m1
6278%endif
6279    mova [esp+48], m0
6280    call pixel_sa8d_8x8_internal2
6281    SA8D_INTER
6282    mova [esp+48], m0
6283
6284    mov  r0, [r6+20]
6285    mov  r2, [r6+28]
6286    add  r0, 24*SIZEOF_PIXEL
6287    add  r2, 24*SIZEOF_PIXEL
6288    call pixel_sa8d_8x8_internal2
6289    SA8D_INTER
6290    mova [esp+64-mmsize], m0
6291    call pixel_sa8d_8x8_internal2
6292    AVG_16x16
6293
6294    mov  r0, [r6+20]
6295    mov  r2, [r6+28]
6296    add  r0, 32*SIZEOF_PIXEL
6297    add  r2, 32*SIZEOF_PIXEL
6298    lea  r4, [r1 + 2*r1]
6299    call pixel_sa8d_8x8_internal2
6300%if HIGH_BIT_DEPTH
6301    HADDUW m0, m1
6302%endif
6303    mova [esp+48], m0
6304    call pixel_sa8d_8x8_internal2
6305    SA8D_INTER
6306    mova [esp+48], m0
6307
6308    mov  r0, [r6+20]
6309    mov  r2, [r6+28]
6310    add  r0, 40*SIZEOF_PIXEL
6311    add  r2, 40*SIZEOF_PIXEL
6312    call pixel_sa8d_8x8_internal2
6313    SA8D_INTER
6314    mova [esp+64-mmsize], m0
6315    call pixel_sa8d_8x8_internal2
6316    AVG_16x16
6317
6318    mov  r0, [r6+20]
6319    mov  r2, [r6+28]
6320    add  r0, 48*SIZEOF_PIXEL
6321    add  r2, 48*SIZEOF_PIXEL
6322    lea  r4, [r1 + 2*r1]
6323    call pixel_sa8d_8x8_internal2
6324%if HIGH_BIT_DEPTH
6325    HADDUW m0, m1
6326%endif
6327    mova [esp+48], m0
6328    call pixel_sa8d_8x8_internal2
6329    SA8D_INTER
6330    mova [esp+48], m0
6331
6332    mov  r0, [r6+20]
6333    mov  r2, [r6+28]
6334    add  r0, 56*SIZEOF_PIXEL
6335    add  r2, 56*SIZEOF_PIXEL
6336    call pixel_sa8d_8x8_internal2
6337    SA8D_INTER
6338    mova [esp+64-mmsize], m0
6339    call pixel_sa8d_8x8_internal2
6340    AVG_16x16
6341
6342    mov  r0, [r6+20]
6343    mov  r2, [r6+28]
6344    lea  r0, [r0 + r1*8]
6345    lea  r2, [r2 + r3*8]
6346    lea  r0, [r0 + r1*8]
6347    lea  r2, [r2 + r3*8]
6348    mov  [r6+20], r0
6349    mov  [r6+28], r2
6350
6351    lea  r4, [r1 + 2*r1]
6352    call pixel_sa8d_8x8_internal2
6353%if HIGH_BIT_DEPTH
6354    HADDUW m0, m1
6355%endif
6356    mova [esp+48], m0
6357    call pixel_sa8d_8x8_internal2
6358    SA8D_INTER
6359    mova [esp+48], m0
6360
6361    mov  r0, [r6+20]
6362    mov  r2, [r6+28]
6363    add  r0, 8*SIZEOF_PIXEL
6364    add  r2, 8*SIZEOF_PIXEL
6365    call pixel_sa8d_8x8_internal2
6366    SA8D_INTER
6367    mova [esp+64-mmsize], m0
6368    call pixel_sa8d_8x8_internal2
6369    AVG_16x16
6370
6371    mov  r0, [r6+20]
6372    mov  r2, [r6+28]
6373    add  r0, 16*SIZEOF_PIXEL
6374    add  r2, 16*SIZEOF_PIXEL
6375    lea  r4, [r1 + 2*r1]
6376    call pixel_sa8d_8x8_internal2
6377%if HIGH_BIT_DEPTH
6378    HADDUW m0, m1
6379%endif
6380    mova [esp+48], m0
6381    call pixel_sa8d_8x8_internal2
6382    SA8D_INTER
6383    mova [esp+48], m0
6384
6385    mov  r0, [r6+20]
6386    mov  r2, [r6+28]
6387    add  r0, 24*SIZEOF_PIXEL
6388    add  r2, 24*SIZEOF_PIXEL
6389    call pixel_sa8d_8x8_internal2
6390    SA8D_INTER
6391    mova [esp+64-mmsize], m0
6392    call pixel_sa8d_8x8_internal2
6393    AVG_16x16
6394
6395    mov  r0, [r6+20]
6396    mov  r2, [r6+28]
6397    add  r0, 32*SIZEOF_PIXEL
6398    add  r2, 32*SIZEOF_PIXEL
6399    lea  r4, [r1 + 2*r1]
6400    call pixel_sa8d_8x8_internal2
6401%if HIGH_BIT_DEPTH
6402    HADDUW m0, m1
6403%endif
6404    mova [esp+48], m0
6405    call pixel_sa8d_8x8_internal2
6406    SA8D_INTER
6407    mova [esp+48], m0
6408
6409    mov  r0, [r6+20]
6410    mov  r2, [r6+28]
6411    add  r0, 40*SIZEOF_PIXEL
6412    add  r2, 40*SIZEOF_PIXEL
6413    call pixel_sa8d_8x8_internal2
6414    SA8D_INTER
6415    mova [esp+64-mmsize], m0
6416    call pixel_sa8d_8x8_internal2
6417    AVG_16x16
6418
6419    mov  r0, [r6+20]
6420    mov  r2, [r6+28]
6421    add  r0, 48*SIZEOF_PIXEL
6422    add  r2, 48*SIZEOF_PIXEL
6423    lea  r4, [r1 + 2*r1]
6424    call pixel_sa8d_8x8_internal2
6425%if HIGH_BIT_DEPTH
6426    HADDUW m0, m1
6427%endif
6428    mova [esp+48], m0
6429    call pixel_sa8d_8x8_internal2
6430    SA8D_INTER
6431    mova [esp+48], m0
6432
6433    mov  r0, [r6+20]
6434    mov  r2, [r6+28]
6435    add  r0, 56*SIZEOF_PIXEL
6436    add  r2, 56*SIZEOF_PIXEL
6437    call pixel_sa8d_8x8_internal2
6438    SA8D_INTER
6439    mova [esp+64-mmsize], m0
6440    call pixel_sa8d_8x8_internal2
6441    AVG_16x16
6442
6443    mov  r0, [r6+20]
6444    mov  r2, [r6+28]
6445    lea  r0, [r0 + r1*8]
6446    lea  r2, [r2 + r3*8]
6447    lea  r0, [r0 + r1*8]
6448    lea  r2, [r2 + r3*8]
6449    mov  [r6+20], r0
6450    mov  [r6+28], r2
6451
6452    lea  r4, [r1 + 2*r1]
6453    call pixel_sa8d_8x8_internal2
6454%if HIGH_BIT_DEPTH
6455    HADDUW m0, m1
6456%endif
6457    mova [esp+48], m0
6458    call pixel_sa8d_8x8_internal2
6459    SA8D_INTER
6460    mova [esp+48], m0
6461
6462    mov  r0, [r6+20]
6463    mov  r2, [r6+28]
6464    add  r0, 8*SIZEOF_PIXEL
6465    add  r2, 8*SIZEOF_PIXEL
6466    call pixel_sa8d_8x8_internal2
6467    SA8D_INTER
6468    mova [esp+64-mmsize], m0
6469    call pixel_sa8d_8x8_internal2
6470    AVG_16x16
6471
6472    mov  r0, [r6+20]
6473    mov  r2, [r6+28]
6474    add  r0, 16*SIZEOF_PIXEL
6475    add  r2, 16*SIZEOF_PIXEL
6476    lea  r4, [r1 + 2*r1]
6477    call pixel_sa8d_8x8_internal2
6478%if HIGH_BIT_DEPTH
6479    HADDUW m0, m1
6480%endif
6481    mova [esp+48], m0
6482    call pixel_sa8d_8x8_internal2
6483    SA8D_INTER
6484    mova [esp+48], m0
6485
6486    mov  r0, [r6+20]
6487    mov  r2, [r6+28]
6488    add  r0, 24*SIZEOF_PIXEL
6489    add  r2, 24*SIZEOF_PIXEL
6490    call pixel_sa8d_8x8_internal2
6491    SA8D_INTER
6492    mova [esp+64-mmsize], m0
6493    call pixel_sa8d_8x8_internal2
6494    AVG_16x16
6495
6496    mov  r0, [r6+20]
6497    mov  r2, [r6+28]
6498    add  r0, 32*SIZEOF_PIXEL
6499    add  r2, 32*SIZEOF_PIXEL
6500    lea  r4, [r1 + 2*r1]
6501    call pixel_sa8d_8x8_internal2
6502%if HIGH_BIT_DEPTH
6503    HADDUW m0, m1
6504%endif
6505    mova [esp+48], m0
6506    call pixel_sa8d_8x8_internal2
6507    SA8D_INTER
6508    mova [esp+48], m0
6509
6510    mov  r0, [r6+20]
6511    mov  r2, [r6+28]
6512    add  r0, 40*SIZEOF_PIXEL
6513    add  r2, 40*SIZEOF_PIXEL
6514    call pixel_sa8d_8x8_internal2
6515    SA8D_INTER
6516    mova [esp+64-mmsize], m0
6517    call pixel_sa8d_8x8_internal2
6518    AVG_16x16
6519
6520    mov  r0, [r6+20]
6521    mov  r2, [r6+28]
6522    add  r0, 48*SIZEOF_PIXEL
6523    add  r2, 48*SIZEOF_PIXEL
6524    lea  r4, [r1 + 2*r1]
6525    call pixel_sa8d_8x8_internal2
6526%if HIGH_BIT_DEPTH
6527    HADDUW m0, m1
6528%endif
6529    mova [esp+48], m0
6530    call pixel_sa8d_8x8_internal2
6531    SA8D_INTER
6532    mova [esp+48], m0
6533
6534    mov  r0, [r6+20]
6535    mov  r2, [r6+28]
6536    add  r0, 56*SIZEOF_PIXEL
6537    add  r2, 56*SIZEOF_PIXEL
6538    call pixel_sa8d_8x8_internal2
6539    SA8D_INTER
6540    mova [esp+64-mmsize], m0
6541    call pixel_sa8d_8x8_internal2
6542    SA8D_INTER
6543%if HIGH_BIT_DEPTH == 0
6544    HADDUW m0, m1
6545%endif
6546    movd r4d, m0
6547    add  r4d, 1
6548    shr  r4d, 1
6549    add r4d, dword [esp+36]
6550    mov eax, r4d
6551    mov esp, r6
6552    RET
6553%endif ; !ARCH_X86_64
6554%endmacro ; SA8D
6555
6556
6557%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
6558INIT_YMM avx2
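; sa8d_8x8_12bit: SA8D of one 8x8 block of 12-bit (16-bit stored) pixels.
; Differences are widened to dwords (pmovzxwd/psubd) because the Hadamard
; sums can exceed 16 bits at this depth.  Expects r1/r3 = strides in bytes
; and r4/r5 = 3*stride (set by the callers); advances r0/r2 by four rows.
; Returns eight dword partial sums in m0 - the caller adds them horizontally
; and applies the final (x+1)>>1.  Clobbers m1-m9.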
6559cglobal sa8d_8x8_12bit
6560    pmovzxwd        m0, [r0]
6561    pmovzxwd        m9, [r2]
6562    psubd           m0, m9
6563
6564    pmovzxwd        m1, [r0 + r1]
6565    pmovzxwd        m9, [r2 + r3]
6566    psubd           m1, m9
6567
6568    pmovzxwd        m2, [r0 + r1 * 2]
6569    pmovzxwd        m9, [r2 + r3 * 2]
6570    psubd           m2, m9
6571
6572    pmovzxwd        m8, [r0 + r4]
6573    pmovzxwd        m9, [r2 + r5]
6574    psubd           m8, m9
6575
6576    lea             r0, [r0 + r1 * 4]
6577    lea             r2, [r2 + r3 * 4]
6578
6579    pmovzxwd        m4, [r0]
6580    pmovzxwd        m9, [r2]
6581    psubd           m4, m9
6582
6583    pmovzxwd        m5, [r0 + r1]
6584    pmovzxwd        m9, [r2 + r3]
6585    psubd           m5, m9
6586
6587    pmovzxwd        m3, [r0 + r1 * 2]
6588    pmovzxwd        m9, [r2 + r3 * 2]
6589    psubd           m3, m9
6590
6591    pmovzxwd        m7, [r0 + r4]
6592    pmovzxwd        m9, [r2 + r5]
6593    psubd           m7, m9
6594
6595    mova            m6, m0
6596    paddd           m0, m1
6597    psubd           m1, m6
6598    mova            m6, m2
6599    paddd           m2, m8
6600    psubd           m8, m6
6601    mova            m6, m0
6602
6603    punpckldq       m0, m1
6604    punpckhdq       m6, m1
6605
6606    mova            m1, m0
6607    paddd           m0, m6
6608    psubd           m6, m1
6609    mova            m1, m2
6610
6611    punpckldq       m2, m8
6612    punpckhdq       m1, m8
6613
6614    mova            m8, m2
6615    paddd           m2, m1
6616    psubd           m1, m8
6617    mova            m8, m4
6618    paddd           m4, m5
6619    psubd           m5, m8
6620    mova            m8, m3
6621    paddd           m3, m7
6622    psubd           m7, m8
6623    mova            m8, m4
6624
6625    punpckldq       m4, m5
6626    punpckhdq       m8, m5
6627
6628    mova            m5, m4
6629    paddd           m4, m8
6630    psubd           m8, m5
6631    mova            m5, m3
6632    punpckldq       m3, m7
6633    punpckhdq       m5, m7
6634
6635    mova            m7, m3
6636    paddd           m3, m5
6637    psubd           m5, m7
6638    mova            m7, m0
6639    paddd           m0, m2
6640    psubd           m2, m7
6641    mova            m7, m6
6642    paddd           m6, m1
6643    psubd           m1, m7
6644    mova            m7, m0
6645
6646    punpcklqdq      m0, m2
6647    punpckhqdq      m7, m2
6648
6649    mova            m2, m0
6650    paddd           m0, m7
6651    psubd           m7, m2
6652    mova            m2, m6
6653
6654    punpcklqdq      m6, m1
6655    punpckhqdq      m2, m1
6656
6657    mova            m1, m6
6658    paddd           m6, m2
6659    psubd           m2, m1
6660    mova            m1, m4
6661    paddd           m4, m3
6662    psubd           m3, m1
6663    mova            m1, m8
6664    paddd           m8, m5
6665    psubd           m5, m1
6666    mova            m1, m4
6667
6668    punpcklqdq      m4, m3
6669    punpckhqdq      m1, m3
6670
6671    mova            m3, m4
6672    paddd           m4, m1
6673    psubd           m1, m3
6674    mova            m3, m8
6675
6676    punpcklqdq      m8, m5
6677    punpckhqdq      m3, m5
6678
6679    mova            m5, m8
6680    paddd           m8, m3
6681    psubd           m3, m5
6682    mova            m5, m0
6683    paddd           m0, m4
6684    psubd           m4, m5
6685    mova            m5, m7
6686    paddd           m7, m1
6687    psubd           m1, m5
6688    mova            m5, m0
6689
6690    vinserti128     m0, m0, xm4, 1
6691    vperm2i128      m5, m5, m4, 00110001b
6692
6693    pxor            m4, m4
6694    psubd           m4, m0
6695    pmaxsd          m0, m4
6696    pxor            m4, m4
6697    psubd           m4, m5
6698    pmaxsd          m5, m4
6699    pmaxsd          m0, m5
6700    mova            m4, m7
6701
6702    vinserti128     m7, m7, xm1, 1
6703    vperm2i128      m4, m4, m1, 00110001b
6704
6705    pxor            m1, m1
6706    psubd           m1, m7
6707    pmaxsd          m7, m1
6708    pxor            m1, m1
6709    psubd           m1, m4
6710    pmaxsd          m4, m1
6711    pmaxsd          m7, m4
6712    mova            m1, m6
6713    paddd           m6, m8
6714    psubd           m8, m1
6715    mova            m1, m2
6716    paddd           m2, m3
6717    psubd           m3, m1
6718    mova            m1, m6
6719
6720    vinserti128     m6, m6, xm8, 1
6721    vperm2i128      m1, m1, m8, 00110001b
6722
6723    pxor            m8, m8
6724    psubd           m8, m6
6725    pmaxsd          m6, m8
6726    pxor            m8, m8
6727    psubd           m8, m1
6728    pmaxsd          m1, m8
6729    pmaxsd          m6, m1
6730    mova            m8, m2
6731
6732    vinserti128     m2, m2, xm3, 1
6733    vperm2i128      m8, m8, m3, 00110001b
6734
6735    pxor            m3, m3
6736    psubd           m3, m2
6737    pmaxsd          m2, m3
6738    pxor            m3, m3
6739    psubd           m3, m8
6740    pmaxsd          m8, m3
6741    pmaxsd          m2, m8
6742    paddd           m0, m6
6743    paddd           m0, m7
6744    paddd           m0, m2
6745    ret
6746
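; Every wrapper below finishes a block (or group of blocks) with the same
; reduce-and-round tail: collapse the eight dwords in m0 to one, then halve
; with rounding.  A rough scalar sketch (helper name is illustrative only):
;     sum  = hadd8_dwords(m0);   ; the vextracti128 + movhlps + pshuflw chain
;     cost = (sum + 1) >> 1;     ; the add/shr (or pd_1/psrld) step below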
6747cglobal pixel_sa8d_8x8, 4,6,10
6748    add             r1d, r1d
6749    add             r3d, r3d
6750    lea             r4, [r1 + r1 * 2]
6751    lea             r5, [r3 + r3 * 2]
6752
6753    call            sa8d_8x8_12bit
6754
6755    vextracti128    xm6, m0, 1
6756    paddd           xm0, xm6
6757
6758    movhlps         xm6, xm0
6759    paddd           xm0, xm6
6760
6761    pshuflw         xm6, xm0, 0Eh
6762    paddd           xm0, xm6
6763    movd            eax, xm0
6764    add             eax, 1
6765    shr             eax, 1
6766    RET
6767
6768cglobal pixel_sa8d_8x16, 4,7,11
6769    add             r1d, r1d
6770    add             r3d, r3d
6771    lea             r4, [r1 + r1 * 2]
6772    lea             r5, [r3 + r3 * 2]
6773    pxor            m10, m10
6774
6775    call            sa8d_8x8_12bit
6776
6777    vextracti128    xm6, m0, 1
6778    paddd           xm0, xm6
6779
6780    movhlps         xm6, xm0
6781    paddd           xm0, xm6
6782
6783    pshuflw         xm6, xm0, 0Eh
6784    paddd           xm0, xm6
6785    paddd           xm0, [pd_1]
6786    psrld           xm0, 1
6787    paddd           xm10, xm0
6788
6789    lea             r0, [r0 + r1 * 4]
6790    lea             r2, [r2 + r3 * 4]
6791    call            sa8d_8x8_12bit
6792
6793    vextracti128    xm6, m0, 1
6794    paddd           xm0, xm6
6795
6796    movhlps         xm6, xm0
6797    paddd           xm0, xm6
6798
6799    pshuflw         xm6, xm0, 0Eh
6800    paddd           xm0, xm6
6801    paddd           xm0, [pd_1]
6802    psrld           xm0, 1
6803    paddd           xm0, xm10
6804    movd            eax, xm0
6805    RET
6806
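; 16x16 accumulates the four raw 8x8 dword sums in m10 and rounds once at the
; end; the larger sizes below instead round each 16x16 sub-sum ((x+1)>>1)
; before adding it to the running total in xm11.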
6807cglobal pixel_sa8d_16x16, 4,8,11
6808    add             r1d, r1d
6809    add             r3d, r3d
6810    lea             r4, [r1 + r1 * 2]
6811    lea             r5, [r3 + r3 * 2]
6812    mov             r6, r0
6813    mov             r7, r2
6814    pxor            m10, m10
6815
6816    call            sa8d_8x8_12bit
6817    paddd           m10, m0
6818
6819    lea             r0, [r0 + r1 * 4]
6820    lea             r2, [r2 + r3 * 4]
6821    call            sa8d_8x8_12bit
6822    paddd           m10, m0
6823
6824    lea             r0, [r6 + 16]
6825    lea             r2, [r7 + 16]
6826    call            sa8d_8x8_12bit
6827    paddd           m10, m0
6828
6829    lea             r0, [r0 + r1 * 4]
6830    lea             r2, [r2 + r3 * 4]
6831    call            sa8d_8x8_12bit
6832    paddd           m0, m10
6833
6834    vextracti128    xm6, m0, 1
6835    paddd           xm0, xm6
6836
6837    movhlps         xm6, xm0
6838    paddd           xm0, xm6
6839
6840    pshuflw         xm6, xm0, 0Eh
6841    paddd           xm0, xm6
6842    movd            eax, xm0
6843    add             eax, 1
6844    shr             eax, 1
6845    RET
6846
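; For 16x32 and larger, r6/r7 track the top-left of the current 16-row band:
; [r6 + 16/32/48...] steps right in 8-pixel (16-byte) columns, and each pair
; of lea-by-stride*8 below moves the band down by 16 rows.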
6847cglobal pixel_sa8d_16x32, 4,8,12
6848    add             r1d, r1d
6849    add             r3d, r3d
6850    lea             r4, [r1 + r1 * 2]
6851    lea             r5, [r3 + r3 * 2]
6852    mov             r6, r0
6853    mov             r7, r2
6854    pxor            m10, m10
6855    pxor            m11, m11
6856
6857    call            sa8d_8x8_12bit
6858    paddd           m10, m0
6859
6860    lea             r0, [r0 + r1 * 4]
6861    lea             r2, [r2 + r3 * 4]
6862    call            sa8d_8x8_12bit
6863    paddd           m10, m0
6864
6865    lea             r0, [r6 + 16]
6866    lea             r2, [r7 + 16]
6867    call            sa8d_8x8_12bit
6868    paddd           m10, m0
6869
6870    lea             r0, [r0 + r1 * 4]
6871    lea             r2, [r2 + r3 * 4]
6872    call            sa8d_8x8_12bit
6873    paddd           m0, m10
6874
6875    vextracti128    xm6, m0, 1
6876    paddd           xm0, xm6
6877
6878    movhlps         xm6, xm0
6879    paddd           xm0, xm6
6880
6881    pshuflw         xm6, xm0, 0Eh
6882    paddd           xm0, xm6
6883    paddd           xm0, [pd_1]
6884    psrld           xm0, 1
6885    paddd           xm11, xm0
6886
6887    lea             r6, [r6 + r1 * 8]
6888    lea             r6, [r6 + r1 * 8]
6889    lea             r7, [r7 + r3 * 8]
6890    lea             r7, [r7 + r3 * 8]
6891    pxor            m10, m10
6892    mov             r0, r6
6893    mov             r2, r7
6894    call            sa8d_8x8_12bit
6895    paddd           m10, m0
6896
6897    lea             r0, [r0 + r1 * 4]
6898    lea             r2, [r2 + r3 * 4]
6899    call            sa8d_8x8_12bit
6900    paddd           m10, m0
6901
6902    lea             r0, [r6 + 16]
6903    lea             r2, [r7 + 16]
6904    call            sa8d_8x8_12bit
6905    paddd           m10, m0
6906
6907    lea             r0, [r0 + r1 * 4]
6908    lea             r2, [r2 + r3 * 4]
6909    call            sa8d_8x8_12bit
6910    paddd           m0, m10
6911
6912    vextracti128    xm6, m0, 1
6913    paddd           xm0, xm6
6914
6915    movhlps         xm6, xm0
6916    paddd           xm0, xm6
6917
6918    pshuflw         xm6, xm0, 0Eh
6919    paddd           xm0, xm6
6920    paddd           xm0, [pd_1]
6921    psrld           xm0, 1
6922    paddd           xm11, xm0
6923    movd            eax, xm11
6924    RET
6925
6926cglobal pixel_sa8d_32x32, 4,8,12
6927    add             r1d, r1d
6928    add             r3d, r3d
6929    lea             r4, [r1 + r1 * 2]
6930    lea             r5, [r3 + r3 * 2]
6931    mov             r6, r0
6932    mov             r7, r2
6933    pxor            m10, m10
6934    pxor            m11, m11
6935
6936    call            sa8d_8x8_12bit
6937    paddd           m10, m0
6938
6939    lea             r0, [r0 + r1 * 4]
6940    lea             r2, [r2 + r3 * 4]
6941    call            sa8d_8x8_12bit
6942    paddd           m10, m0
6943
6944    lea             r0, [r6 + 16]
6945    lea             r2, [r7 + 16]
6946    call            sa8d_8x8_12bit
6947    paddd           m10, m0
6948
6949    lea             r0, [r0 + r1 * 4]
6950    lea             r2, [r2 + r3 * 4]
6951    call            sa8d_8x8_12bit
6952    paddd           m0, m10
6953
6954    vextracti128    xm6, m0, 1
6955    paddd           xm0, xm6
6956
6957    movhlps         xm6, xm0
6958    paddd           xm0, xm6
6959
6960    pshuflw         xm6, xm0, 0Eh
6961    paddd           xm0, xm6
6962    paddd           xm0, [pd_1]
6963    psrld           xm0, 1
6964    paddd           xm11, xm0
6965
6966    pxor            m10, m10
6967    lea             r0, [r6 + 32]
6968    lea             r2, [r7 + 32]
6969    call            sa8d_8x8_12bit
6970    paddd           m10, m0
6971
6972    lea             r0, [r0 + r1 * 4]
6973    lea             r2, [r2 + r3 * 4]
6974    call            sa8d_8x8_12bit
6975    paddd           m10, m0
6976
6977    lea             r0, [r6 + 48]
6978    lea             r2, [r7 + 48]
6979    call            sa8d_8x8_12bit
6980    paddd           m10, m0
6981
6982    lea             r0, [r0 + r1 * 4]
6983    lea             r2, [r2 + r3 * 4]
6984    call            sa8d_8x8_12bit
6985    paddd           m0, m10
6986
6987    vextracti128    xm6, m0, 1
6988    paddd           xm0, xm6
6989
6990    movhlps         xm6, xm0
6991    paddd           xm0, xm6
6992
6993    pshuflw         xm6, xm0, 0Eh
6994    paddd           xm0, xm6
6995    paddd           xm0, [pd_1]
6996    psrld           xm0, 1
6997    paddd           xm11, xm0
6998
6999    lea             r6, [r6 + r1 * 8]
7000    lea             r6, [r6 + r1 * 8]
7001    lea             r7, [r7 + r3 * 8]
7002    lea             r7, [r7 + r3 * 8]
7003    pxor            m10, m10
7004    mov             r0, r6
7005    mov             r2, r7
7006    call            sa8d_8x8_12bit
7007    paddd           m10, m0
7008
7009    lea             r0, [r0 + r1 * 4]
7010    lea             r2, [r2 + r3 * 4]
7011    call            sa8d_8x8_12bit
7012    paddd           m10, m0
7013
7014    lea             r0, [r6 + 16]
7015    lea             r2, [r7 + 16]
7016    call            sa8d_8x8_12bit
7017    paddd           m10, m0
7018
7019    lea             r0, [r0 + r1 * 4]
7020    lea             r2, [r2 + r3 * 4]
7021    call            sa8d_8x8_12bit
7022    paddd           m0, m10
7023
7024    vextracti128    xm6, m0, 1
7025    paddd           xm0, xm6
7026
7027    movhlps         xm6, xm0
7028    paddd           xm0, xm6
7029
7030    pshuflw         xm6, xm0, 0Eh
7031    paddd           xm0, xm6
7032    paddd           xm0, [pd_1]
7033    psrld           xm0, 1
7034    paddd           xm11, xm0
7035
7036    pxor            m10, m10
7037    lea             r0, [r6 + 32]
7038    lea             r2, [r7 + 32]
7039    call            sa8d_8x8_12bit
7040    paddd           m10, m0
7041
7042    lea             r0, [r0 + r1 * 4]
7043    lea             r2, [r2 + r3 * 4]
7044    call            sa8d_8x8_12bit
7045    paddd           m10, m0
7046
7047    lea             r0, [r6 + 48]
7048    lea             r2, [r7 + 48]
7049    call            sa8d_8x8_12bit
7050    paddd           m10, m0
7051
7052    lea             r0, [r0 + r1 * 4]
7053    lea             r2, [r2 + r3 * 4]
7054    call            sa8d_8x8_12bit
7055    paddd           m0, m10
7056
7057    vextracti128    xm6, m0, 1
7058    paddd           xm0, xm6
7059
7060    movhlps         xm6, xm0
7061    paddd           xm0, xm6
7062
7063    pshuflw         xm6, xm0, 0Eh
7064    paddd           xm0, xm6
7065    paddd           xm0, [pd_1]
7066    psrld           xm0, 1
7067    paddd           xm11, xm0
7068    movd            eax, xm11
7069    RET
7070
7071cglobal pixel_sa8d_32x64, 4,8,12
7072    add             r1d, r1d
7073    add             r3d, r3d
7074    lea             r4, [r1 + r1 * 2]
7075    lea             r5, [r3 + r3 * 2]
7076    mov             r6, r0
7077    mov             r7, r2
7078    pxor            m10, m10
7079    pxor            m11, m11
7080
7081    call            sa8d_8x8_12bit
7082    paddd           m10, m0
7083
7084    lea             r0, [r0 + r1 * 4]
7085    lea             r2, [r2 + r3 * 4]
7086    call            sa8d_8x8_12bit
7087    paddd           m10, m0
7088
7089    lea             r0, [r6 + 16]
7090    lea             r2, [r7 + 16]
7091    call            sa8d_8x8_12bit
7092    paddd           m10, m0
7093
7094    lea             r0, [r0 + r1 * 4]
7095    lea             r2, [r2 + r3 * 4]
7096    call            sa8d_8x8_12bit
7097    paddd           m0, m10
7098
7099    vextracti128    xm6, m0, 1
7100    paddd           xm0, xm6
7101
7102    movhlps         xm6, xm0
7103    paddd           xm0, xm6
7104
7105    pshuflw         xm6, xm0, 0Eh
7106    paddd           xm0, xm6
7107    paddd           xm0, [pd_1]
7108    psrld           xm0, 1
7109    paddd           xm11, xm0
7110
7111    pxor            m10, m10
7112    lea             r0, [r6 + 32]
7113    lea             r2, [r7 + 32]
7114    call            sa8d_8x8_12bit
7115    paddd           m10, m0
7116
7117    lea             r0, [r0 + r1 * 4]
7118    lea             r2, [r2 + r3 * 4]
7119    call            sa8d_8x8_12bit
7120    paddd           m10, m0
7121
7122    lea             r0, [r6 + 48]
7123    lea             r2, [r7 + 48]
7124    call            sa8d_8x8_12bit
7125    paddd           m10, m0
7126
7127    lea             r0, [r0 + r1 * 4]
7128    lea             r2, [r2 + r3 * 4]
7129    call            sa8d_8x8_12bit
7130    paddd           m0, m10
7131
7132    vextracti128    xm6, m0, 1
7133    paddd           xm0, xm6
7134
7135    movhlps         xm6, xm0
7136    paddd           xm0, xm6
7137
7138    pshuflw         xm6, xm0, 0Eh
7139    paddd           xm0, xm6
7140    paddd           xm0, [pd_1]
7141    psrld           xm0, 1
7142    paddd           xm11, xm0
7143
7144    lea             r6, [r6 + r1 * 8]
7145    lea             r6, [r6 + r1 * 8]
7146    lea             r7, [r7 + r3 * 8]
7147    lea             r7, [r7 + r3 * 8]
7148    pxor            m10, m10
7149    mov             r0, r6
7150    mov             r2, r7
7151    call            sa8d_8x8_12bit
7152    paddd           m10, m0
7153
7154    lea             r0, [r0 + r1 * 4]
7155    lea             r2, [r2 + r3 * 4]
7156    call            sa8d_8x8_12bit
7157    paddd           m10, m0
7158
7159    lea             r0, [r6 + 16]
7160    lea             r2, [r7 + 16]
7161    call            sa8d_8x8_12bit
7162    paddd           m10, m0
7163
7164    lea             r0, [r0 + r1 * 4]
7165    lea             r2, [r2 + r3 * 4]
7166    call            sa8d_8x8_12bit
7167    paddd           m0, m10
7168
7169    vextracti128    xm6, m0, 1
7170    paddd           xm0, xm6
7171
7172    movhlps         xm6, xm0
7173    paddd           xm0, xm6
7174
7175    pshuflw         xm6, xm0, 0Eh
7176    paddd           xm0, xm6
7177    paddd           xm0, [pd_1]
7178    psrld           xm0, 1
7179    paddd           xm11, xm0
7180
7181    pxor            m10, m10
7182    lea             r0, [r6 + 32]
7183    lea             r2, [r7 + 32]
7184    call            sa8d_8x8_12bit
7185    paddd           m10, m0
7186
7187    lea             r0, [r0 + r1 * 4]
7188    lea             r2, [r2 + r3 * 4]
7189    call            sa8d_8x8_12bit
7190    paddd           m10, m0
7191
7192    lea             r0, [r6 + 48]
7193    lea             r2, [r7 + 48]
7194    call            sa8d_8x8_12bit
7195    paddd           m10, m0
7196
7197    lea             r0, [r0 + r1 * 4]
7198    lea             r2, [r2 + r3 * 4]
7199    call            sa8d_8x8_12bit
7200    paddd           m0, m10
7201
7202    vextracti128    xm6, m0, 1
7203    paddd           xm0, xm6
7204
7205    movhlps         xm6, xm0
7206    paddd           xm0, xm6
7207
7208    pshuflw         xm6, xm0, 0Eh
7209    paddd           xm0, xm6
7210    paddd           xm0, [pd_1]
7211    psrld           xm0, 1
7212    paddd           xm11, xm0
7213
7214    lea             r6, [r6 + r1 * 8]
7215    lea             r6, [r6 + r1 * 8]
7216    lea             r7, [r7 + r3 * 8]
7217    lea             r7, [r7 + r3 * 8]
7218    pxor            m10, m10
7219    mov             r0, r6
7220    mov             r2, r7
7221    call            sa8d_8x8_12bit
7222    paddd           m10, m0
7223
7224    lea             r0, [r0 + r1 * 4]
7225    lea             r2, [r2 + r3 * 4]
7226    call            sa8d_8x8_12bit
7227    paddd           m10, m0
7228
7229    lea             r0, [r6 + 16]
7230    lea             r2, [r7 + 16]
7231    call            sa8d_8x8_12bit
7232    paddd           m10, m0
7233
7234    lea             r0, [r0 + r1 * 4]
7235    lea             r2, [r2 + r3 * 4]
7236    call            sa8d_8x8_12bit
7237    paddd           m0, m10
7238
7239    vextracti128    xm6, m0, 1
7240    paddd           xm0, xm6
7241
7242    movhlps         xm6, xm0
7243    paddd           xm0, xm6
7244
7245    pshuflw         xm6, xm0, 0Eh
7246    paddd           xm0, xm6
7247    paddd           xm0, [pd_1]
7248    psrld           xm0, 1
7249    paddd           xm11, xm0
7250
7251    pxor            m10, m10
7252    lea             r0, [r6 + 32]
7253    lea             r2, [r7 + 32]
7254    call            sa8d_8x8_12bit
7255    paddd           m10, m0
7256
7257    lea             r0, [r0 + r1 * 4]
7258    lea             r2, [r2 + r3 * 4]
7259    call            sa8d_8x8_12bit
7260    paddd           m10, m0
7261
7262    lea             r0, [r6 + 48]
7263    lea             r2, [r7 + 48]
7264    call            sa8d_8x8_12bit
7265    paddd           m10, m0
7266
7267    lea             r0, [r0 + r1 * 4]
7268    lea             r2, [r2 + r3 * 4]
7269    call            sa8d_8x8_12bit
7270    paddd           m0, m10
7271
7272    vextracti128    xm6, m0, 1
7273    paddd           xm0, xm6
7274
7275    movhlps         xm6, xm0
7276    paddd           xm0, xm6
7277
7278    pshuflw         xm6, xm0, 0Eh
7279    paddd           xm0, xm6
7280    paddd           xm0, [pd_1]
7281    psrld           xm0, 1
7282    paddd           xm11, xm0
7283
7284    lea             r6, [r6 + r1 * 8]
7285    lea             r6, [r6 + r1 * 8]
7286    lea             r7, [r7 + r3 * 8]
7287    lea             r7, [r7 + r3 * 8]
7288    pxor            m10, m10
7289    mov             r0, r6
7290    mov             r2, r7
7291    call            sa8d_8x8_12bit
7292    paddd           m10, m0
7293
7294    lea             r0, [r0 + r1 * 4]
7295    lea             r2, [r2 + r3 * 4]
7296    call            sa8d_8x8_12bit
7297    paddd           m10, m0
7298
7299    lea             r0, [r6 + 16]
7300    lea             r2, [r7 + 16]
7301    call            sa8d_8x8_12bit
7302    paddd           m10, m0
7303
7304    lea             r0, [r0 + r1 * 4]
7305    lea             r2, [r2 + r3 * 4]
7306    call            sa8d_8x8_12bit
7307    paddd           m0, m10
7308
7309    vextracti128    xm6, m0, 1
7310    paddd           xm0, xm6
7311
7312    movhlps         xm6, xm0
7313    paddd           xm0, xm6
7314
7315    pshuflw         xm6, xm0, 0Eh
7316    paddd           xm0, xm6
7317    paddd           xm0, [pd_1]
7318    psrld           xm0, 1
7319    paddd           xm11, xm0
7320
7321    pxor            m10, m10
7322    lea             r0, [r6 + 32]
7323    lea             r2, [r7 + 32]
7324    call            sa8d_8x8_12bit
7325    paddd           m10, m0
7326
7327    lea             r0, [r0 + r1 * 4]
7328    lea             r2, [r2 + r3 * 4]
7329    call            sa8d_8x8_12bit
7330    paddd           m10, m0
7331
7332    lea             r0, [r6 + 48]
7333    lea             r2, [r7 + 48]
7334    call            sa8d_8x8_12bit
7335    paddd           m10, m0
7336
7337    lea             r0, [r0 + r1 * 4]
7338    lea             r2, [r2 + r3 * 4]
7339    call            sa8d_8x8_12bit
7340    paddd           m0, m10
7341
7342    vextracti128    xm6, m0, 1
7343    paddd           xm0, xm6
7344
7345    movhlps         xm6, xm0
7346    paddd           xm0, xm6
7347
7348    pshuflw         xm6, xm0, 0Eh
7349    paddd           xm0, xm6
7350    paddd           xm0, [pd_1]
7351    psrld           xm0, 1
7352    paddd           xm11, xm0
7353    movd            eax, xm11
7354    RET
7355
7356cglobal pixel_sa8d_64x64, 4,8,12
7357    add             r1d, r1d
7358    add             r3d, r3d
7359    lea             r4, [r1 + r1 * 2]
7360    lea             r5, [r3 + r3 * 2]
7361    mov             r6, r0
7362    mov             r7, r2
7363    pxor            m10, m10
7364    pxor            m11, m11
7365
7366    call            sa8d_8x8_12bit
7367    paddd           m10, m0
7368
7369    lea             r0, [r0 + r1 * 4]
7370    lea             r2, [r2 + r3 * 4]
7371    call            sa8d_8x8_12bit
7372    paddd           m10, m0
7373
7374    lea             r0, [r6 + 16]
7375    lea             r2, [r7 + 16]
7376    call            sa8d_8x8_12bit
7377    paddd           m10, m0
7378
7379    lea             r0, [r0 + r1 * 4]
7380    lea             r2, [r2 + r3 * 4]
7381    call            sa8d_8x8_12bit
7382    paddd           m0, m10
7383
7384    vextracti128    xm6, m0, 1
7385    paddd           xm0, xm6
7386
7387    movhlps         xm6, xm0
7388    paddd           xm0, xm6
7389
7390    pshuflw         xm6, xm0, 0Eh
7391    paddd           xm0, xm6
7392    paddd           xm0, [pd_1]
7393    psrld           xm0, 1
7394    paddd           xm11, xm0
7395
7396    pxor            m10, m10
7397    lea             r0, [r6 + 32]
7398    lea             r2, [r7 + 32]
7399    call            sa8d_8x8_12bit
7400    paddd           m10, m0
7401
7402    lea             r0, [r0 + r1 * 4]
7403    lea             r2, [r2 + r3 * 4]
7404    call            sa8d_8x8_12bit
7405    paddd           m10, m0
7406
7407    lea             r0, [r6 + 48]
7408    lea             r2, [r7 + 48]
7409    call            sa8d_8x8_12bit
7410    paddd           m10, m0
7411
7412    lea             r0, [r0 + r1 * 4]
7413    lea             r2, [r2 + r3 * 4]
7414    call            sa8d_8x8_12bit
7415    paddd           m0, m10
7416
7417    vextracti128    xm6, m0, 1
7418    paddd           xm0, xm6
7419
7420    movhlps         xm6, xm0
7421    paddd           xm0, xm6
7422
7423    pshuflw         xm6, xm0, 0Eh
7424    paddd           xm0, xm6
7425    paddd           xm0, [pd_1]
7426    psrld           xm0, 1
7427    paddd           xm11, xm0
7428
7429    pxor            m10, m10
7430    lea             r0, [r6 + 64]
7431    lea             r2, [r7 + 64]
7432    call            sa8d_8x8_12bit
7433    paddd           m10, m0
7434
7435    lea             r0, [r0 + r1 * 4]
7436    lea             r2, [r2 + r3 * 4]
7437    call            sa8d_8x8_12bit
7438    paddd           m10, m0
7439
7440    lea             r0, [r6 + 80]
7441    lea             r2, [r7 + 80]
7442    call            sa8d_8x8_12bit
7443    paddd           m10, m0
7444
7445    lea             r0, [r0 + r1 * 4]
7446    lea             r2, [r2 + r3 * 4]
7447    call            sa8d_8x8_12bit
7448    paddd           m0, m10
7449
7450    vextracti128    xm6, m0, 1
7451    paddd           xm0, xm6
7452
7453    movhlps         xm6, xm0
7454    paddd           xm0, xm6
7455
7456    pshuflw         xm6, xm0, 0Eh
7457    paddd           xm0, xm6
7458    paddd           xm0, [pd_1]
7459    psrld           xm0, 1
7460    paddd           xm11, xm0
7461
7462    pxor            m10, m10
7463    lea             r0, [r6 + 96]
7464    lea             r2, [r7 + 96]
7465    call            sa8d_8x8_12bit
7466    paddd           m10, m0
7467
7468    lea             r0, [r0 + r1 * 4]
7469    lea             r2, [r2 + r3 * 4]
7470    call            sa8d_8x8_12bit
7471    paddd           m10, m0
7472
7473    lea             r0, [r6 + 112]
7474    lea             r2, [r7 + 112]
7475    call            sa8d_8x8_12bit
7476    paddd           m10, m0
7477
7478    lea             r0, [r0 + r1 * 4]
7479    lea             r2, [r2 + r3 * 4]
7480    call            sa8d_8x8_12bit
7481    paddd           m0, m10
7482
7483    vextracti128    xm6, m0, 1
7484    paddd           xm0, xm6
7485
7486    movhlps         xm6, xm0
7487    paddd           xm0, xm6
7488
7489    pshuflw         xm6, xm0, 0Eh
7490    paddd           xm0, xm6
7491    paddd           xm0, [pd_1]
7492    psrld           xm0, 1
7493    paddd           xm11, xm0
7494
7495    lea             r6, [r6 + r1 * 8]
7496    lea             r6, [r6 + r1 * 8]
7497    lea             r7, [r7 + r3 * 8]
7498    lea             r7, [r7 + r3 * 8]
7499    pxor            m10, m10
7500    mov             r0, r6
7501    mov             r2, r7
7502    call            sa8d_8x8_12bit
7503    paddd           m10, m0
7504
7505    lea             r0, [r0 + r1 * 4]
7506    lea             r2, [r2 + r3 * 4]
7507    call            sa8d_8x8_12bit
7508    paddd           m10, m0
7509
7510    lea             r0, [r6 + 16]
7511    lea             r2, [r7 + 16]
7512    call            sa8d_8x8_12bit
7513    paddd           m10, m0
7514
7515    lea             r0, [r0 + r1 * 4]
7516    lea             r2, [r2 + r3 * 4]
7517    call            sa8d_8x8_12bit
7518    paddd           m0, m10
7519
7520    vextracti128    xm6, m0, 1
7521    paddd           xm0, xm6
7522
7523    movhlps         xm6, xm0
7524    paddd           xm0, xm6
7525
7526    pshuflw         xm6, xm0, 0Eh
7527    paddd           xm0, xm6
7528    paddd           xm0, [pd_1]
7529    psrld           xm0, 1
7530    paddd           xm11, xm0
7531
7532    pxor            m10, m10
7533    lea             r0, [r6 + 32]
7534    lea             r2, [r7 + 32]
7535    call            sa8d_8x8_12bit
7536    paddd           m10, m0
7537
7538    lea             r0, [r0 + r1 * 4]
7539    lea             r2, [r2 + r3 * 4]
7540    call            sa8d_8x8_12bit
7541    paddd           m10, m0
7542
7543    lea             r0, [r6 + 48]
7544    lea             r2, [r7 + 48]
7545    call            sa8d_8x8_12bit
7546    paddd           m10, m0
7547
7548    lea             r0, [r0 + r1 * 4]
7549    lea             r2, [r2 + r3 * 4]
7550    call            sa8d_8x8_12bit
7551    paddd           m0, m10
7552
7553    vextracti128    xm6, m0, 1
7554    paddd           xm0, xm6
7555
7556    movhlps         xm6, xm0
7557    paddd           xm0, xm6
7558
7559    pshuflw         xm6, xm0, 0Eh
7560    paddd           xm0, xm6
7561    paddd           xm0, [pd_1]
7562    psrld           xm0, 1
7563    paddd           xm11, xm0
7564
7565    pxor            m10, m10
7566    lea             r0, [r6 + 64]
7567    lea             r2, [r7 + 64]
7568    call            sa8d_8x8_12bit
7569    paddd           m10, m0
7570
7571    lea             r0, [r0 + r1 * 4]
7572    lea             r2, [r2 + r3 * 4]
7573    call            sa8d_8x8_12bit
7574    paddd           m10, m0
7575
7576    lea             r0, [r6 + 80]
7577    lea             r2, [r7 + 80]
7578    call            sa8d_8x8_12bit
7579    paddd           m10, m0
7580
7581    lea             r0, [r0 + r1 * 4]
7582    lea             r2, [r2 + r3 * 4]
7583    call            sa8d_8x8_12bit
7584    paddd           m0, m10
7585
7586    vextracti128    xm6, m0, 1
7587    paddd           xm0, xm6
7588
7589    movhlps         xm6, xm0
7590    paddd           xm0, xm6
7591
7592    pshuflw         xm6, xm0, 0Eh
7593    paddd           xm0, xm6
7594    paddd           xm0, [pd_1]
7595    psrld           xm0, 1
7596    paddd           xm11, xm0
7597
7598    pxor            m10, m10
7599    lea             r0, [r6 + 96]
7600    lea             r2, [r7 + 96]
7601    call            sa8d_8x8_12bit
7602    paddd           m10, m0
7603
7604    lea             r0, [r0 + r1 * 4]
7605    lea             r2, [r2 + r3 * 4]
7606    call            sa8d_8x8_12bit
7607    paddd           m10, m0
7608
7609    lea             r0, [r6 + 112]
7610    lea             r2, [r7 + 112]
7611    call            sa8d_8x8_12bit
7612    paddd           m10, m0
7613
7614    lea             r0, [r0 + r1 * 4]
7615    lea             r2, [r2 + r3 * 4]
7616    call            sa8d_8x8_12bit
7617    paddd           m0, m10
7618
7619    vextracti128    xm6, m0, 1
7620    paddd           xm0, xm6
7621
7622    movhlps         xm6, xm0
7623    paddd           xm0, xm6
7624
7625    pshuflw         xm6, xm0, 0Eh
7626    paddd           xm0, xm6
7627    paddd           xm0, [pd_1]
7628    psrld           xm0, 1
7629    paddd           xm11, xm0
7630
7631    lea             r6, [r6 + r1 * 8]
7632    lea             r6, [r6 + r1 * 8]
7633    lea             r7, [r7 + r3 * 8]
7634    lea             r7, [r7 + r3 * 8]
7635    pxor            m10, m10
7636    mov             r0, r6
7637    mov             r2, r7
7638    call            sa8d_8x8_12bit
7639    paddd           m10, m0
7640
7641    lea             r0, [r0 + r1 * 4]
7642    lea             r2, [r2 + r3 * 4]
7643    call            sa8d_8x8_12bit
7644    paddd           m10, m0
7645
7646    lea             r0, [r6 + 16]
7647    lea             r2, [r7 + 16]
7648    call            sa8d_8x8_12bit
7649    paddd           m10, m0
7650
7651    lea             r0, [r0 + r1 * 4]
7652    lea             r2, [r2 + r3 * 4]
7653    call            sa8d_8x8_12bit
7654    paddd           m0, m10
7655
7656    vextracti128    xm6, m0, 1
7657    paddd           xm0, xm6
7658
7659    movhlps         xm6, xm0
7660    paddd           xm0, xm6
7661
7662    pshuflw         xm6, xm0, 0Eh
7663    paddd           xm0, xm6
7664    paddd           xm0, [pd_1]
7665    psrld           xm0, 1
7666    paddd           xm11, xm0
7667
7668    pxor            m10, m10
7669    lea             r0, [r6 + 32]
7670    lea             r2, [r7 + 32]
7671    call            sa8d_8x8_12bit
7672    paddd           m10, m0
7673
7674    lea             r0, [r0 + r1 * 4]
7675    lea             r2, [r2 + r3 * 4]
7676    call            sa8d_8x8_12bit
7677    paddd           m10, m0
7678
7679    lea             r0, [r6 + 48]
7680    lea             r2, [r7 + 48]
7681    call            sa8d_8x8_12bit
7682    paddd           m10, m0
7683
7684    lea             r0, [r0 + r1 * 4]
7685    lea             r2, [r2 + r3 * 4]
7686    call            sa8d_8x8_12bit
7687    paddd           m0, m10
7688
7689    vextracti128    xm6, m0, 1
7690    paddd           xm0, xm6
7691
7692    movhlps         xm6, xm0
7693    paddd           xm0, xm6
7694
7695    pshuflw         xm6, xm0, 0Eh
7696    paddd           xm0, xm6
7697    paddd           xm0, [pd_1]
7698    psrld           xm0, 1
7699    paddd           xm11, xm0
7700
7701    pxor            m10, m10
7702    lea             r0, [r6 + 64]
7703    lea             r2, [r7 + 64]
7704    call            sa8d_8x8_12bit
7705    paddd           m10, m0
7706
7707    lea             r0, [r0 + r1 * 4]
7708    lea             r2, [r2 + r3 * 4]
7709    call            sa8d_8x8_12bit
7710    paddd           m10, m0
7711
7712    lea             r0, [r6 + 80]
7713    lea             r2, [r7 + 80]
7714    call            sa8d_8x8_12bit
7715    paddd           m10, m0
7716
7717    lea             r0, [r0 + r1 * 4]
7718    lea             r2, [r2 + r3 * 4]
7719    call            sa8d_8x8_12bit
7720    paddd           m0, m10
7721
7722    vextracti128    xm6, m0, 1
7723    paddd           xm0, xm6
7724
7725    movhlps         xm6, xm0
7726    paddd           xm0, xm6
7727
7728    pshuflw         xm6, xm0, 0Eh
7729    paddd           xm0, xm6
7730    paddd           xm0, [pd_1]
7731    psrld           xm0, 1
7732    paddd           xm11, xm0
7733
7734    pxor            m10, m10
7735    lea             r0, [r6 + 96]
7736    lea             r2, [r7 + 96]
7737    call            sa8d_8x8_12bit
7738    paddd           m10, m0
7739
7740    lea             r0, [r0 + r1 * 4]
7741    lea             r2, [r2 + r3 * 4]
7742    call            sa8d_8x8_12bit
7743    paddd           m10, m0
7744
7745    lea             r0, [r6 + 112]
7746    lea             r2, [r7 + 112]
7747    call            sa8d_8x8_12bit
7748    paddd           m10, m0
7749
7750    lea             r0, [r0 + r1 * 4]
7751    lea             r2, [r2 + r3 * 4]
7752    call            sa8d_8x8_12bit
7753    paddd           m0, m10
7754
7755    vextracti128    xm6, m0, 1
7756    paddd           xm0, xm6
7757
7758    movhlps         xm6, xm0
7759    paddd           xm0, xm6
7760
7761    pshuflw         xm6, xm0, 0Eh
7762    paddd           xm0, xm6
7763    paddd           xm0, [pd_1]
7764    psrld           xm0, 1
7765    paddd           xm11, xm0
7766
7767    lea             r6, [r6 + r1 * 8]
7768    lea             r6, [r6 + r1 * 8]
7769    lea             r7, [r7 + r3 * 8]
7770    lea             r7, [r7 + r3 * 8]
7771    pxor            m10, m10
7772    mov             r0, r6
7773    mov             r2, r7
7774    call            sa8d_8x8_12bit
7775    paddd           m10, m0
7776
7777    lea             r0, [r0 + r1 * 4]
7778    lea             r2, [r2 + r3 * 4]
7779    call            sa8d_8x8_12bit
7780    paddd           m10, m0
7781
7782    lea             r0, [r6 + 16]
7783    lea             r2, [r7 + 16]
7784    call            sa8d_8x8_12bit
7785    paddd           m10, m0
7786
7787    lea             r0, [r0 + r1 * 4]
7788    lea             r2, [r2 + r3 * 4]
7789    call            sa8d_8x8_12bit
7790    paddd           m0, m10
7791
7792    vextracti128    xm6, m0, 1
7793    paddd           xm0, xm6
7794
7795    movhlps         xm6, xm0
7796    paddd           xm0, xm6
7797
7798    pshuflw         xm6, xm0, 0Eh
7799    paddd           xm0, xm6
7800    paddd           xm0, [pd_1]
7801    psrld           xm0, 1
7802    paddd           xm11, xm0
7803
7804    pxor            m10, m10
7805    lea             r0, [r6 + 32]
7806    lea             r2, [r7 + 32]
7807    call            sa8d_8x8_12bit
7808    paddd           m10, m0
7809
7810    lea             r0, [r0 + r1 * 4]
7811    lea             r2, [r2 + r3 * 4]
7812    call            sa8d_8x8_12bit
7813    paddd           m10, m0
7814
7815    lea             r0, [r6 + 48]
7816    lea             r2, [r7 + 48]
7817    call            sa8d_8x8_12bit
7818    paddd           m10, m0
7819
7820    lea             r0, [r0 + r1 * 4]
7821    lea             r2, [r2 + r3 * 4]
7822    call            sa8d_8x8_12bit
7823    paddd           m0, m10
7824
7825    vextracti128    xm6, m0, 1
7826    paddd           xm0, xm6
7827
7828    movhlps         xm6, xm0
7829    paddd           xm0, xm6
7830
7831    pshuflw         xm6, xm0, 0Eh
7832    paddd           xm0, xm6
7833    paddd           xm0, [pd_1]
7834    psrld           xm0, 1
7835    paddd           xm11, xm0
7836
7837    pxor            m10, m10
7838    lea             r0, [r6 + 64]
7839    lea             r2, [r7 + 64]
7840    call            sa8d_8x8_12bit
7841    paddd           m10, m0
7842
7843    lea             r0, [r0 + r1 * 4]
7844    lea             r2, [r2 + r3 * 4]
7845    call            sa8d_8x8_12bit
7846    paddd           m10, m0
7847
7848    lea             r0, [r6 + 80]
7849    lea             r2, [r7 + 80]
7850    call            sa8d_8x8_12bit
7851    paddd           m10, m0
7852
7853    lea             r0, [r0 + r1 * 4]
7854    lea             r2, [r2 + r3 * 4]
7855    call            sa8d_8x8_12bit
7856    paddd           m0, m10
7857
7858    vextracti128    xm6, m0, 1
7859    paddd           xm0, xm6
7860
7861    movhlps         xm6, xm0
7862    paddd           xm0, xm6
7863
7864    pshuflw         xm6, xm0, 0Eh
7865    paddd           xm0, xm6
7866    paddd           xm0, [pd_1]
7867    psrld           xm0, 1
7868    paddd           xm11, xm0
7869
7870    pxor            m10, m10
7871    lea             r0, [r6 + 96]
7872    lea             r2, [r7 + 96]
7873    call            sa8d_8x8_12bit
7874    paddd           m10, m0
7875
7876    lea             r0, [r0 + r1 * 4]
7877    lea             r2, [r2 + r3 * 4]
7878    call            sa8d_8x8_12bit
7879    paddd           m10, m0
7880
7881    lea             r0, [r6 + 112]
7882    lea             r2, [r7 + 112]
7883    call            sa8d_8x8_12bit
7884    paddd           m10, m0
7885
7886    lea             r0, [r0 + r1 * 4]
7887    lea             r2, [r2 + r3 * 4]
7888    call            sa8d_8x8_12bit
7889    paddd           m0, m10
7890
7891    vextracti128    xm6, m0, 1
7892    paddd           xm0, xm6
7893
7894    movhlps         xm6, xm0
7895    paddd           xm0, xm6
7896
7897    pshuflw         xm6, xm0, 0Eh
7898    paddd           xm0, xm6
7899    paddd           xm0, [pd_1]
7900    psrld           xm0, 1
7901    paddd           xm11, xm0
7902    movd            eax, xm11
7903    RET
7904%endif
7905
7906
7907;=============================================================================
7908; INTRA SATD
7909;=============================================================================
7910%define TRANS TRANS_SSE2
7911%define DIFFOP DIFF_UNPACK_SSE2
7912%define LOAD_SUMSUB_8x4P LOAD_DIFF_8x4P
7913%define LOAD_SUMSUB_16P  LOAD_SUMSUB_16P_SSE2
7914%define movdqa movaps ; doesn't hurt pre-nehalem, might as well save size
7915%define movdqu movups
7916%define punpcklqdq movlhps
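; (movups and movlhps are likewise the float-domain spellings of movdqu and
; punpcklqdq: identical behaviour for the register-register uses here and one
; byte shorter to encode.  The aliases are %undef'd below before the sse4
; versions, where the int/float domain crossing can add bypass latency.)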
7917INIT_XMM sse2
7918%if BIT_DEPTH <= 10
7919SA8D
7920%endif
7921SATDS_SSE2
7922
7923%if HIGH_BIT_DEPTH == 0
7924INIT_XMM ssse3,atom
7925SATDS_SSE2
7926SA8D
7927%endif
7928
7929%define DIFFOP DIFF_SUMSUB_SSSE3
7930%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
7931%if HIGH_BIT_DEPTH == 0
7932%define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3
7933%define LOAD_SUMSUB_16P  LOAD_SUMSUB_16P_SSSE3
7934%endif
7935INIT_XMM ssse3
7936%if BIT_DEPTH <= 10
7937SA8D
7938%endif
7939SATDS_SSE2
7940%undef movdqa ; nehalem doesn't like movaps
7941%undef movdqu ; movups
7942%undef punpcklqdq ; or movlhps
7943
7944%define TRANS TRANS_SSE4
7945%define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN
7946INIT_XMM sse4
7947%if BIT_DEPTH <= 10
7948SA8D
7949%endif
7950SATDS_SSE2
7951
7952; Sandy/Ivy Bridge and Bulldozer do movddup in the load unit, so
7953; it's effectively free.
7954%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
7955INIT_XMM avx
7956SA8D
7957SATDS_SSE2
7958
7959%define TRANS TRANS_XOP
7960INIT_XMM xop
7961%if BIT_DEPTH <= 10
7962SA8D
7963%endif
7964SATDS_SSE2
7965
7966%if HIGH_BIT_DEPTH == 0
7967%define LOAD_SUMSUB_8x4P LOAD_SUMSUB8_16x4P_AVX2
7968%define LOAD_DUP_4x8P LOAD_DUP_4x16P_AVX2
7969%define TRANS TRANS_SSE4
7970
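; LOAD_SUMSUB_8x8P_AVX2: gathers all eight rows of an 8x8 byte block from
; src (r0) and ref (r2), two rows per ymm lane with each row duplicated by
; movddup, and converts each src/ref pair to sum/difference form with
; DIFF_SUMSUB_SSSE3 using the multiplier register passed as the last
; argument.  Leaves r0/r2 advanced by two rows.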
7971%macro LOAD_SUMSUB_8x8P_AVX2 7 ; 4*dst, 2*tmp, mul
7972    movddup xm%1, [r0]
7973    movddup xm%3, [r2]
7974    movddup xm%2, [r0+4*r1]
7975    movddup xm%5, [r2+4*r3]
7976    vinserti128 m%1, m%1, xm%2, 1
7977    vinserti128 m%3, m%3, xm%5, 1
7978
7979    movddup xm%2, [r0+r1]
7980    movddup xm%4, [r2+r3]
7981    movddup xm%5, [r0+r4]
7982    movddup xm%6, [r2+r5]
7983    vinserti128 m%2, m%2, xm%5, 1
7984    vinserti128 m%4, m%4, xm%6, 1
7985
7986    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %7
7987    lea      r0, [r0+2*r1]
7988    lea      r2, [r2+2*r3]
7989
7990    movddup xm%3, [r0]
7991    movddup xm%5, [r0+4*r1]
7992    vinserti128 m%3, m%3, xm%5, 1
7993
7994    movddup xm%5, [r2]
7995    movddup xm%4, [r2+4*r3]
7996    vinserti128 m%5, m%5, xm%4, 1
7997
7998    movddup xm%4, [r0+r1]
7999    movddup xm%6, [r0+r4]
8000    vinserti128 m%4, m%4, xm%6, 1
8001
8002    movq   xm%6, [r2+r3]
8003    movhps xm%6, [r2+r5]
8004    vpermq m%6, m%6, q1100
8005    DIFF_SUMSUB_SSSE3 %3, %5, %4, %6, %7
8006%endmacro
8007
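; SATD_START_AVX2: common prologue for the AVX2 SATD/SA8D code - fixes the
; strides for the pixel size, loads the hmul constant into %2 (hmul_8p for
; 8-wide blocks, hmul_16p otherwise), points r4/r5 at 5*stride resp. 3*stride,
; and clears the accumulator %1.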
8008%macro SATD_START_AVX2 2-3 0
8009    FIX_STRIDES r1, r3
8010%if %3
8011    mova    %2, [hmul_8p]
8012    lea     r4, [5*r1]
8013    lea     r5, [5*r3]
8014%else
8015    mova    %2, [hmul_16p]
8016    lea     r4, [3*r1]
8017    lea     r5, [3*r3]
8018%endif
8019    pxor    %1, %1
8020%endmacro
8021
8022%define TRANS TRANS_SSE4
8023INIT_YMM avx2
8024cglobal pixel_satd_16x8_internal
8025    LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
8026    SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
8027    LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 0
8028    SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
8029    ret
8030
8031cglobal pixel_satd_16x16, 4,6,8
8032    SATD_START_AVX2 m6, m7
8033    call pixel_satd_16x8_internal
8034    lea  r0, [r0+4*r1]
8035    lea  r2, [r2+4*r3]
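    ; the bare label below is a separate symbol from the cglobal'd
    ; pixel_satd_16x8_internal above (x86inc's call macro resolves to the
    ; cpuflag-suffixed name), so pixel_satd_16x8 can jmp here and reuse the
    ; final 16x8 pass and the epilogue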
8036pixel_satd_16x8_internal:
8037    call pixel_satd_16x8_internal
8038    vextracti128 xm0, m6, 1
8039    paddw        xm0, xm6
8040    SATD_END_SSE2 xm0
8041    RET
8042
8043cglobal pixel_satd_16x8, 4,6,8
8044    SATD_START_AVX2 m6, m7
8045    jmp pixel_satd_16x8_internal
8046
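; One pass covers the whole 8x8 block: LOAD_SUMSUB_8x8P_AVX2 puts rows 0-3 in
; the low ymm lanes and rows 4-7 in the high lanes, so a single SATD_8x4_SSE
; handles all eight rows; word sums accumulate in m6.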
8047cglobal pixel_satd_8x8_internal
8048    LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7
8049    SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
8050    ret
8051
8052cglobal pixel_satd_8x16, 4,6,8
8053    SATD_START_AVX2 m6, m7, 1
8054    call pixel_satd_8x8_internal
8055    lea  r0, [r0+2*r1]
8056    lea  r2, [r2+2*r3]
8057    lea  r0, [r0+4*r1]
8058    lea  r2, [r2+4*r3]
8059    call pixel_satd_8x8_internal
8060    vextracti128 xm0, m6, 1
8061    paddw        xm0, xm6
8062    SATD_END_SSE2 xm0
8063    RET
8064
8065cglobal pixel_satd_8x8, 4,6,8
8066    SATD_START_AVX2 m6, m7, 1
8067    call pixel_satd_8x8_internal
8068    vextracti128 xm0, m6, 1
8069    paddw        xm0, xm6
8070    SATD_END_SSE2 xm0
8071    RET
8072
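; 8x8 SA8D kernel: loads sum/diff pairs, runs the 8x8 Hadamard in registers,
; folds in absolute maxima (the amax passes) and accumulates the per-lane word
; sums into m6; callers reduce m6 horizontally and halve with (x+1)>>1.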
8073cglobal pixel_sa8d_8x8_internal
8074    LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7
8075    HADAMARD4_V 0, 1, 2, 3, 4
8076    HADAMARD 8, sumsub, 0, 1, 4, 5
8077    HADAMARD 8, sumsub, 2, 3, 4, 5
8078    HADAMARD 2, sumsub, 0, 1, 4, 5
8079    HADAMARD 2, sumsub, 2, 3, 4, 5
8080    HADAMARD 1, amax, 0, 1, 4, 5
8081    HADAMARD 1, amax, 2, 3, 4, 5
8082    paddw  m6, m0
8083    paddw  m6, m2
8084    ret
8085
8086cglobal pixel_sa8d_8x8, 4,6,8
8087    SATD_START_AVX2 m6, m7, 1
8088    call pixel_sa8d_8x8_internal
8089    vextracti128 xm1, m6, 1
8090    paddw xm6, xm1
8091    HADDW xm6, xm1
8092    movd  eax, xm6
8093    add   eax, 1
8094    shr   eax, 1
8095    RET
8096
8097cglobal pixel_sa8d_16x16, 4,6,8
8098    SATD_START_AVX2 m6, m7, 1
8099
8100    call pixel_sa8d_8x8_internal ; pix[0]
8101
8102    sub  r0, r1
8103    sub  r0, r1
8104    add  r0, 8*SIZEOF_PIXEL
8105    sub  r2, r3
8106    sub  r2, r3
8107    add  r2, 8*SIZEOF_PIXEL
8108    call pixel_sa8d_8x8_internal ; pix[8]
8109
8110    add  r0, r4
8111    add  r0, r1
8112    add  r2, r5
8113    add  r2, r3
8114    call pixel_sa8d_8x8_internal ; pix[8*stride+8]
8115
8116    sub  r0, r1
8117    sub  r0, r1
8118    sub  r0, 8*SIZEOF_PIXEL
8119    sub  r2, r3
8120    sub  r2, r3
8121    sub  r2, 8*SIZEOF_PIXEL
8122    call pixel_sa8d_8x8_internal ; pix[8*stride]
8123
8124    ; TODO: analyze Dynamic Range
8125    vextracti128 xm0, m6, 1
8126    paddusw xm6, xm0
8127    HADDUW xm6, xm0
8128    movd  eax, xm6
8129    add   eax, 1
8130    shr   eax, 1
8131    RET
8132
8133cglobal pixel_sa8d_16x16_internal
8134    call pixel_sa8d_8x8_internal ; pix[0]
8135
8136    sub  r0, r1
8137    sub  r0, r1
8138    add  r0, 8*SIZEOF_PIXEL
8139    sub  r2, r3
8140    sub  r2, r3
8141    add  r2, 8*SIZEOF_PIXEL
8142    call pixel_sa8d_8x8_internal ; pix[8]
8143
8144    add  r0, r4
8145    add  r0, r1
8146    add  r2, r5
8147    add  r2, r3
8148    call pixel_sa8d_8x8_internal ; pix[8*stride+8]
8149
8150    sub  r0, r1
8151    sub  r0, r1
8152    sub  r0, 8*SIZEOF_PIXEL
8153    sub  r2, r3
8154    sub  r2, r3
8155    sub  r2, 8*SIZEOF_PIXEL
8156    call pixel_sa8d_8x8_internal ; pix[8*stride]
8157
8158    ; TODO: analyze Dynamic Range
8159    vextracti128 xm0, m6, 1
8160    paddusw xm6, xm0
8161    HADDUW xm6, xm0
8162    movd  eax, xm6
8163    add   eax, 1
8164    shr   eax, 1
8165    ret
8166
8167%if ARCH_X86_64
8168cglobal pixel_sa8d_32x32, 4,8,8
    ; NOTE: r6 is RAX on x86-64, so the return value of each internal call can be read from eax directly
8170
8171    SATD_START_AVX2 m6, m7, 1
8172    xor     r7d, r7d
8173
8174    call    pixel_sa8d_16x16_internal   ; [0]
8175    pxor    m6, m6
8176    add     r7d, eax
8177
8178    add     r0, r4
8179    add     r0, r1
8180    add     r2, r5
8181    add     r2, r3
8182    call    pixel_sa8d_16x16_internal   ; [2]
8183    pxor    m6, m6
8184    add     r7d, eax
8185
8186    lea     eax, [r4 * 5 - 16]
8187    sub     r0, rax
8188    sub     r0, r1
8189    lea     eax, [r5 * 5 - 16]
8190    sub     r2, rax
8191    sub     r2, r3
8192    call    pixel_sa8d_16x16_internal   ; [1]
8193    pxor    m6, m6
8194    add     r7d, eax
8195
8196    add     r0, r4
8197    add     r0, r1
8198    add     r2, r5
8199    add     r2, r3
8200    call    pixel_sa8d_16x16_internal   ; [3]
8201    add     eax, r7d
8202    RET
8203%endif ; ARCH_X86_64=1
8204%endif ; HIGH_BIT_DEPTH
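; The SATD_AVX512_LOAD* helpers below gather several short rows into single
; registers by broadcasting each row to every element and merging only the
; elements selected by an opmask, so four (LOAD4) or eight (LOAD8) rows of
; both blocks end up interleaved without extra shuffles.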
8205
8206%macro SATD_AVX512_LOAD4 2 ; size, opmask
8207    vpbroadcast%1 m0, [r0]
8208    vpbroadcast%1 m0 {%2}, [r0+2*r1]
8209    vpbroadcast%1 m2, [r2]
8210    vpbroadcast%1 m2 {%2}, [r2+2*r3]
8211    add           r0, r1
8212    add           r2, r3
8213    vpbroadcast%1 m1, [r0]
8214    vpbroadcast%1 m1 {%2}, [r0+2*r1]
8215    vpbroadcast%1 m3, [r2]
8216    vpbroadcast%1 m3 {%2}, [r2+2*r3]
8217%endmacro
8218
8219%macro SATD_AVX512_LOAD8 5 ; size, halfreg, opmask1, opmask2, opmask3
8220    vpbroadcast%1 %{2}0, [r0]
8221    vpbroadcast%1 %{2}0 {%3}, [r0+2*r1]
8222    vpbroadcast%1 %{2}2, [r2]
8223    vpbroadcast%1 %{2}2 {%3}, [r2+2*r3]
8224    vpbroadcast%1    m0 {%4}, [r0+4*r1]
8225    vpbroadcast%1    m2 {%4}, [r2+4*r3]
8226    vpbroadcast%1    m0 {%5}, [r0+2*r4]
8227    vpbroadcast%1    m2 {%5}, [r2+2*r5]
8228    vpbroadcast%1 %{2}1, [r0+r1]
8229    vpbroadcast%1 %{2}1 {%3}, [r0+r4]
8230    vpbroadcast%1 %{2}3, [r2+r3]
8231    vpbroadcast%1 %{2}3 {%3}, [r2+r5]
8232    lea              r0, [r0+4*r1]
8233    lea              r2, [r2+4*r3]
8234    vpbroadcast%1    m1 {%4}, [r0+r1]
8235    vpbroadcast%1    m3 {%4}, [r2+r3]
8236    vpbroadcast%1    m1 {%5}, [r0+r4]
8237    vpbroadcast%1    m3 {%5}, [r2+r5]
8238%endmacro
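; SATD_AVX512_PACKED runs the sum/difference butterflies of the 4x4 Hadamard
; on the row-packed registers produced by the load helpers above and folds the
; result with HMAXABSW2, leaving packed partial sums in m0/m1.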
8239
8240%macro SATD_AVX512_PACKED 0
8241    DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
8242    SUMSUB_BA      w, 0, 1, 2
8243    SBUTTERFLY   qdq, 0, 1, 2
8244    SUMSUB_BA      w, 0, 1, 2
8245    HMAXABSW2         0, 1, 2, 3
8246%endmacro
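; SATD_AVX512_END combines the pairwise word maxima from m0/m1 (the k1 mask
; with zeroing gives a free zero-extension to dwords), reduces all lanes into
; eax and, when the optional sa8d argument is 1, applies the final
; (x + 1) >> 1 rounding.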
8247
8248%macro SATD_AVX512_END 0-1 0 ; sa8d
8249    paddw          m0 {k1}{z}, m1 ; zero-extend to dwords
8250%if ARCH_X86_64
8251%if mmsize == 64
8252    vextracti32x8 ym1, m0, 1
8253    paddd         ym0, ym1
8254%endif
8255%if mmsize >= 32
8256    vextracti128  xm1, ym0, 1
8257    paddd        xmm0, xm0, xm1
8258%endif
8259    punpckhqdq   xmm1, xmm0, xmm0
8260    paddd        xmm0, xmm1
8261    movq          rax, xmm0
8262    rorx          rdx, rax, 32
8263%if %1
8264    lea           eax, [rax+rdx+1]
8265    shr           eax, 1
8266%else
8267    add           eax, edx
8268%endif
8269%else
8270    HADDD          m0, m1
8271    movd          eax, xm0
8272%if %1
8273    inc           eax
8274    shr           eax, 1
8275%endif
8276%endif
8277    RET
8278%endmacro
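; HMAXABSW2 replaces each pair of adjacent words in %1 and %2 with the larger
; absolute value (the useful results land in the even word positions, which
; SATD_AVX512_END later selects via k1), folding the +/- halves of the
; Hadamard output before the final accumulation.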
8279
8280%macro HMAXABSW2 4 ; a, b, tmp1, tmp2
8281    pabsw     m%1, m%1
8282    pabsw     m%2, m%2
8283    psrldq    m%3, m%1, 2
8284    psrld     m%4, m%2, 16
8285    pmaxsw    m%1, m%3
8286    pmaxsw    m%2, m%4
8287%endmacro
%if HIGH_BIT_DEPTH == 0
8289INIT_ZMM avx512
8290cglobal pixel_satd_16x8_internal
8291    vbroadcasti64x4 m6, [hmul_16p]
8292    kxnorb           k2, k2, k2
8293    mov             r4d, 0x55555555
8294    knotw            k2, k2
8295    kmovd            k1, r4d
8296    lea              r4, [3*r1]
8297    lea              r5, [3*r3]
8298satd_16x8_avx512:
8299    vbroadcasti128  ym0,      [r0]
8300    vbroadcasti32x4  m0 {k2}, [r0+4*r1] ; 0 0 4 4
8301    vbroadcasti128  ym4,      [r2]
8302    vbroadcasti32x4  m4 {k2}, [r2+4*r3]
8303    vbroadcasti128  ym2,      [r0+2*r1]
8304    vbroadcasti32x4  m2 {k2}, [r0+2*r4] ; 2 2 6 6
8305    vbroadcasti128  ym5,      [r2+2*r3]
8306    vbroadcasti32x4  m5 {k2}, [r2+2*r5]
8307    DIFF_SUMSUB_SSSE3 0, 4, 2, 5, 6
8308    vbroadcasti128  ym1,      [r0+r1]
8309    vbroadcasti128  ym4,      [r2+r3]
8310    vbroadcasti128  ym3,      [r0+r4]
8311    vbroadcasti128  ym5,      [r2+r5]
8312    lea              r0, [r0+4*r1]
8313    lea              r2, [r2+4*r3]
8314    vbroadcasti32x4  m1 {k2}, [r0+r1] ; 1 1 5 5
8315    vbroadcasti32x4  m4 {k2}, [r2+r3]
8316    vbroadcasti32x4  m3 {k2}, [r0+r4] ; 3 3 7 7
8317    vbroadcasti32x4  m5 {k2}, [r2+r5]
8318    DIFF_SUMSUB_SSSE3 1, 4, 3, 5, 6
8319    HADAMARD4_V       0, 1, 2, 3, 4
8320    HMAXABSW2         0, 2, 4, 5
8321    HMAXABSW2         1, 3, 4, 5
8322    paddw            m4, m0, m2 ; m1
8323    paddw            m2, m1, m3 ; m0
8324    ret
8325
8326cglobal pixel_satd_8x8_internal
8327    vbroadcasti64x4 m4, [hmul_16p]
8328    mov     r4d, 0x55555555
8329    kmovd    k1, r4d   ; 01010101
8330    kshiftlb k2, k1, 5 ; 10100000
8331    kshiftlb k3, k1, 4 ; 01010000
8332    lea      r4, [3*r1]
8333    lea      r5, [3*r3]
8334satd_8x8_avx512:
8335    SATD_AVX512_LOAD8 q, ym, k1, k2, k3 ; 2 0 2 0 6 4 6 4
8336    SATD_AVX512_PACKED                  ; 3 1 3 1 7 5 7 5
8337    ret
8338
8339cglobal pixel_satd_16x8, 4,6
8340    call pixel_satd_16x8_internal_avx512
8341    jmp satd_zmm_avx512_end
8342
8343cglobal pixel_satd_16x16, 4,6
8344    call pixel_satd_16x8_internal_avx512
8345    lea      r0, [r0+4*r1]
8346    lea      r2, [r2+4*r3]
8347    paddw    m7, m0, m1
8348    call satd_16x8_avx512
8349    paddw    m1, m7
8350    jmp satd_zmm_avx512_end
8351
8352cglobal pixel_satd_8x8, 4,6
8353    call pixel_satd_8x8_internal_avx512
8354satd_zmm_avx512_end:
8355    SATD_AVX512_END
8356
8357cglobal pixel_satd_8x16, 4,6
8358    call pixel_satd_8x8_internal_avx512
8359    lea      r0, [r0+4*r1]
8360    lea      r2, [r2+4*r3]
8361    paddw    m5, m0, m1
8362    call satd_8x8_avx512
8363    paddw    m1, m5
8364    jmp satd_zmm_avx512_end
8365
8366INIT_YMM avx512
8367cglobal pixel_satd_4x8_internal
8368    vbroadcasti128 m4, [hmul_4p]
8369    mov     r4d, 0x55550c
8370    kmovd    k2, r4d   ; 00001100
8371    kshiftlb k3, k2, 2 ; 00110000
8372    kshiftlb k4, k2, 4 ; 11000000
8373    kshiftrd k1, k2, 8 ; 01010101
8374    lea      r4, [3*r1]
8375    lea      r5, [3*r3]
8376satd_4x8_avx512:
8377    SATD_AVX512_LOAD8 d, xm, k2, k3, k4 ; 0 0 2 2 4 4 6 6
8378satd_ymm_avx512:                        ; 1 1 3 3 5 5 7 7
8379    SATD_AVX512_PACKED
8380    ret
8381
8382cglobal pixel_satd_8x4, 4,5
8383    mova     m4, [hmul_16p]
8384    mov     r4d, 0x5555
8385    kmovw    k1, r4d
8386    SATD_AVX512_LOAD4 q, k1 ; 2 0 2 0
8387    call satd_ymm_avx512    ; 3 1 3 1
8388    jmp satd_ymm_avx512_end2
8389
8390cglobal pixel_satd_4x8, 4,6
8391    call pixel_satd_4x8_internal_avx512
8392satd_ymm_avx512_end:
8393%if ARCH_X86_64 == 0
8394    pop     r5d
8395    %assign regs_used 5
8396%endif
8397satd_ymm_avx512_end2:
8398    SATD_AVX512_END
8399
8400cglobal pixel_satd_4x16, 4,6
8401    call pixel_satd_4x8_internal_avx512
8402    lea      r0, [r0+4*r1]
8403    lea      r2, [r2+4*r3]
8404    paddw    m5, m0, m1
8405    call satd_4x8_avx512
8406    paddw    m1, m5
8407    jmp satd_ymm_avx512_end
8408
8409INIT_XMM avx512
8410cglobal pixel_satd_4x4, 4,5
8411    mova     m4, [hmul_4p]
8412    mov     r4d, 0x550c
8413    kmovw    k2, r4d
8414    kshiftrw k1, k2, 8
8415    SATD_AVX512_LOAD4 d, k2 ; 0 0 2 2
8416    SATD_AVX512_PACKED      ; 1 1 3 3
8417    SWAP      0, 1
8418    SATD_AVX512_END
8419
8420INIT_ZMM avx512
8421cglobal pixel_sa8d_8x8, 4,6
8422    vbroadcasti64x4 m4, [hmul_16p]
8423    mov     r4d, 0x55555555
8424    kmovd    k1, r4d   ; 01010101
8425    kshiftlb k2, k1, 5 ; 10100000
8426    kshiftlb k3, k1, 4 ; 01010000
8427    lea      r4, [3*r1]
8428    lea      r5, [3*r3]
8429    SATD_AVX512_LOAD8 q, ym, k1, k2, k3 ; 2 0 2 0 6 4 6 4
8430    DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4     ; 3 1 3 1 7 5 7 5
8431    SUMSUB_BA      w, 0, 1, 2
8432    SBUTTERFLY   qdq, 0, 1, 2
8433    SUMSUB_BA      w, 0, 1, 2
8434    shufps        m2, m0, m1, q2020
8435    shufps        m1, m0, m1, q3131
8436    SUMSUB_BA      w, 2, 1, 0
8437    vshufi32x4    m0, m2, m1, q1010
8438    vshufi32x4    m1, m2, m1, q3232
8439    SUMSUB_BA      w, 0, 1, 2
8440    HMAXABSW2      0, 1, 2, 3
8441    SATD_AVX512_END 1
8442%endif
8443; Input 10bit, Output 8bit
8444;------------------------------------------------------------------------------------------------------------------------
;void planecopy_sp(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
8446;------------------------------------------------------------------------------------------------------------------------
8447INIT_XMM sse2
8448cglobal downShift_16, 4,7,3
8449    mov         r4d, r4m
8450    mov         r5d, r5m
8451    movd        m0, r6m        ; m0 = shift
8452    add         r1, r1
8453
8454    dec         r5d
8455.loopH:
8456    xor         r6, r6
8457
8458.loopW:
8459    movu        m1, [r0 + r6 * 2]
8460    movu        m2, [r0 + r6 * 2 + mmsize]
8461    psrlw       m1, m0
8462    psrlw       m2, m0
8463    packuswb    m1, m2
8464    movu        [r2 + r6], m1
8465
8466    add         r6, mmsize
8467    cmp         r6d, r4d
8468    jl         .loopW
8469
8470    ; move to next row
8471    add         r0, r1
8472    add         r2, r3
8473    dec         r5d
8474    jnz        .loopH
8475
    ; process the last row of every frame (handles widths that are not a multiple of 16)
    ; NOTE: r4d (width) must be greater than or equal to 16 (mmsize)
8478.loop16:
8479    movu        m1, [r0 + (r4 - mmsize) * 2]
8480    movu        m2, [r0 + (r4 - mmsize) * 2 + mmsize]
8481    psrlw       m1, m0
8482    psrlw       m2, m0
8483    packuswb    m1, m2
8484    movu        [r2 + r4 - mmsize], m1
8485
8486    sub         r4d, mmsize
8487    jz         .end
8488    cmp         r4d, mmsize
8489    jge        .loop16
8490
8491    ; process partial pixels
8492    movu        m1, [r0]
8493    movu        m2, [r0 + mmsize]
8494    psrlw       m1, m0
8495    psrlw       m2, m0
8496    packuswb    m1, m2
8497    movu        [r2], m1
8498
8499.end:
8500    RET
8501
8502; Input 10bit, Output 8bit
8503;-------------------------------------------------------------------------------------------------------------------------------------
8504;void planecopy_sp(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
8505;-------------------------------------------------------------------------------------------------------------------------------------
8506INIT_YMM avx2
8507cglobal downShift_16, 4,7,3
8508    mov         r4d, r4m
8509    mov         r5d, r5m
8510    movd        xm0, r6m        ; m0 = shift
8511    add         r1d, r1d
8512
8513    dec         r5d
8514.loopH:
8515    xor         r6, r6
8516
8517.loopW:
8518    movu        m1, [r0 + r6 * 2 +  0]
8519    movu        m2, [r0 + r6 * 2 + 32]
8520    vpsrlw      m1, xm0
8521    vpsrlw      m2, xm0
8522    packuswb    m1, m2
8523    vpermq      m1, m1, 11011000b
8524    movu        [r2 + r6], m1
8525
8526    add         r6d, mmsize
8527    cmp         r6d, r4d
8528    jl         .loopW
8529
8530    ; move to next row
8531    add         r0, r1
8532    add         r2, r3
8533    dec         r5d
8534    jnz        .loopH
8535
    ; process the last row of every frame (handles widths that are not a multiple of 32)
8537
8538.loop32:
8539    movu        m1, [r0 + (r4 - mmsize) * 2]
8540    movu        m2, [r0 + (r4 - mmsize) * 2 + mmsize]
8541    psrlw       m1, xm0
8542    psrlw       m2, xm0
8543    packuswb    m1, m2
8544    vpermq      m1, m1, q3120
8545    movu        [r2 + r4 - mmsize], m1
8546
8547    sub         r4d, mmsize
8548    jz         .end
8549    cmp         r4d, mmsize
8550    jge        .loop32
8551
8552    ; process partial pixels
8553    movu        m1, [r0]
8554    movu        m2, [r0 + mmsize]
8555    psrlw       m1, xm0
8556    psrlw       m2, xm0
8557    packuswb    m1, m2
8558    vpermq      m1, m1, q3120
8559    movu        [r2], m1
8560
8561.end:
8562    RET
8563
8564; Input 8bit, Output 10bit
8565;---------------------------------------------------------------------------------------------------------------------
8566;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
8567;---------------------------------------------------------------------------------------------------------------------
8568INIT_XMM sse4
8569cglobal upShift_8, 6,7,3
8570    movd        xm2, r6m
8571    add         r3d, r3d
8572    dec         r5d
8573
8574.loopH:
8575    xor         r6, r6
8576.loopW:
8577    pmovzxbw    m0,[r0 + r6]
8578    pmovzxbw    m1,[r0 + r6 + mmsize/2]
8579    psllw       m0, m2
8580    psllw       m1, m2
8581    movu        [r2 + r6 * 2], m0
8582    movu        [r2 + r6 * 2 + mmsize], m1
8583
8584    add         r6d, mmsize
8585    cmp         r6d, r4d
8586    jl         .loopW
8587
8588    ; move to next row
8589    add         r0, r1
8590    add         r2, r3
8591    dec         r5d
8592    jg         .loopH
8593
    ; process the last row of every frame (handles widths that are not a multiple of 16)
    mov         r1d, (mmsize/2 - 1)
    and         r1d, r4d
    sub         r1, mmsize/2    ; r1 = (width % 8) - 8, offset of the right-aligned tail

    ; NOTE: width must be greater than or equal to 8
    shr         r4d, 3          ; width / 8 (mmsize/2 pixels per iteration)
8601.loopW8:
8602    pmovzxbw    m0,[r0]
8603    psllw       m0, m2
8604    movu        [r2], m0
8605    add         r0, mmsize/2
8606    add         r2, mmsize
8607    dec         r4d
8608    jg         .loopW8
8609
    ; avoid reading past the end of the buffer (this can fault, e.g. on Mac OS X): roll back and redo the right-aligned tail
8611    pmovzxbw    m0,[r0 + r1]
8612    psllw       m0, m2
8613    movu        [r2 + r1 * 2], m0
8614    RET
8615
8616
8617;---------------------------------------------------------------------------------------------------------------------
8618;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
8619;---------------------------------------------------------------------------------------------------------------------
8620%if ARCH_X86_64
8621INIT_YMM avx2
8622cglobal upShift_8, 6,7,3
8623    movd        xm2, r6m
8624    add         r3d, r3d
8625    dec         r5d
8626
8627.loopH:
8628    xor         r6, r6
8629.loopW:
8630    pmovzxbw    m0,[r0 + r6]
8631    pmovzxbw    m1,[r0 + r6 + mmsize/2]
8632    psllw       m0, xm2
8633    psllw       m1, xm2
8634    movu        [r2 + r6 * 2], m0
8635    movu        [r2 + r6 * 2 + mmsize], m1
8636
8637    add         r6d, mmsize
8638    cmp         r6d, r4d
8639    jl         .loopW
8640
8641    ; move to next row
8642    add         r0, r1
8643    add         r2, r3
8644    dec         r5d
8645    jg         .loopH
8646
    ; process the last row of every frame (handles widths that are not a multiple of 32)
    mov         r1d, (mmsize/2 - 1)
    and         r1d, r4d
    sub         r1, mmsize/2    ; r1 = (width % 16) - 16, offset of the right-aligned tail

    ; NOTE: width must be greater than or equal to 16
    shr         r4d, 4          ; width / 16 (mmsize/2 pixels per iteration)
8654.loopW16:
8655    pmovzxbw    m0,[r0]
8656    psllw       m0, xm2
8657    movu        [r2], m0
8658    add         r0, mmsize/2
8659    add         r2, mmsize
8660    dec         r4d
8661    jg         .loopW16
8662
    ; avoid reading past the end of the buffer (this can fault, e.g. on Mac OS X): roll back and redo the right-aligned tail
8664    pmovzxbw    m0,[r0 + r1]
8665    psllw       m0, xm2
8666    movu        [r2 + r1 * 2], m0
8667    RET
8668%endif
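; ABSD2 computes the packed absolute value of two dword vectors. With SSSE3 it
; is a pair of pabsd instructions; otherwise |x| is emulated as max(x, -x)
; with psubd/pmaxsd, using the temporaries when a destination aliases its
; source.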
8669
8670%macro ABSD2 6 ; dst1, dst2, src1, src2, tmp, tmp
8671%if cpuflag(ssse3)
8672    pabsd   %1, %3
8673    pabsd   %2, %4
8674%elifidn %1, %3
8675    pxor    %5, %5
8676    pxor    %6, %6
8677    psubd   %5, %1
8678    psubd   %6, %2
8679    pmaxsd  %1, %5
8680    pmaxsd  %2, %6
8681%else
8682    pxor    %1, %1
8683    pxor    %2, %2
8684    psubd   %1, %3
8685    psubd   %2, %4
8686    pmaxsd  %1, %3
8687    pmaxsd  %2, %4
8688%endif
8689%endmacro
8690
8691
8692; Input 10bit, Output 12bit
8693;------------------------------------------------------------------------------------------------------------------------
8694;void planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
8695;------------------------------------------------------------------------------------------------------------------------
8696INIT_XMM sse2
8697cglobal upShift_16, 4,7,4
8698    mov         r4d, r4m
8699    mov         r5d, r5m
8700    movd        m0, r6m        ; m0 = shift
8701    mova        m3, [pw_pixel_max]
8702    FIX_STRIDES r1d, r3d
8703    dec         r5d
8704.loopH:
8705    xor         r6d, r6d
8706.loopW:
8707    movu        m1, [r0 + r6 * SIZEOF_PIXEL]
8708    movu        m2, [r0 + r6 * SIZEOF_PIXEL + mmsize]
8709    psllw       m1, m0
8710    psllw       m2, m0
    ; TODO: if the input is guaranteed to be in range, the two pand instructions below can be removed.
8712    pand        m1, m3
8713    pand        m2, m3
8714    movu        [r2 + r6 * SIZEOF_PIXEL], m1
8715    movu        [r2 + r6 * SIZEOF_PIXEL + mmsize], m2
8716
8717    add         r6, mmsize * 2 / SIZEOF_PIXEL
8718    cmp         r6d, r4d
8719    jl         .loopW
8720
8721    ; move to next row
8722    add         r0, r1
8723    add         r2, r3
8724    dec         r5d
8725    jnz        .loopH
8726
    ; process the last row of every frame (handles widths that are not a multiple of 16)

    ; WARNING: width (r4d) must be greater than or equal to 16 (mmsize) here
8730.loop16:
8731    movu        m1, [r0 + (r4 - mmsize) * 2]
8732    movu        m2, [r0 + (r4 - mmsize) * 2 + mmsize]
8733    psllw       m1, m0
8734    psllw       m2, m0
8735    pand        m1, m3
8736    pand        m2, m3
8737    movu        [r2 + (r4 - mmsize) * 2], m1
8738    movu        [r2 + (r4 - mmsize) * 2 + mmsize], m2
8739
8740    sub         r4d, mmsize
8741    jz         .end
8742    cmp         r4d, mmsize
8743    jge        .loop16
8744
8745    ; process partial pixels
8746    movu        m1, [r0]
8747    movu        m2, [r0 + mmsize]
8748    psllw       m1, m0
8749    psllw       m2, m0
8750    pand        m1, m3
8751    pand        m2, m3
8752    movu        [r2], m1
8753    movu        [r2 + mmsize], m2
8754
8755.end:
8756    RET
8757
8758; Input 10bit, Output 12bit
8759;-------------------------------------------------------------------------------------------------------------------------------------
8760;void planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
8761;-------------------------------------------------------------------------------------------------------------------------------------
8762INIT_YMM avx2
8763cglobal upShift_16, 4,7,4
8764    mov         r4d, r4m
8765    mov         r5d, r5m
8766    movd        xm0, r6m        ; m0 = shift
8767    vbroadcasti128 m3, [pw_pixel_max]
8768    FIX_STRIDES r1d, r3d
8769    dec         r5d
8770.loopH:
8771    xor         r6d, r6d
8772.loopW:
8773    movu        m1, [r0 + r6 * SIZEOF_PIXEL]
8774    movu        m2, [r0 + r6 * SIZEOF_PIXEL + mmsize]
8775    psllw       m1, xm0
8776    psllw       m2, xm0
8777    pand        m1, m3
8778    pand        m2, m3
8779    movu        [r2 + r6 * SIZEOF_PIXEL], m1
8780    movu        [r2 + r6 * SIZEOF_PIXEL + mmsize], m2
8781
8782    add         r6, mmsize * 2 / SIZEOF_PIXEL
8783    cmp         r6d, r4d
8784    jl         .loopW
8785
8786    ; move to next row
8787    add         r0, r1
8788    add         r2, r3
8789    dec         r5d
8790    jnz        .loopH
8791
    ; process the last row of every frame (handles widths that are not a multiple of 32)
8793
8794.loop32:
8795    movu        m1, [r0 + (r4 - mmsize) * 2]
8796    movu        m2, [r0 + (r4 - mmsize) * 2 + mmsize]
8797    psllw       m1, xm0
8798    psllw       m2, xm0
8799    pand        m1, m3
8800    pand        m2, m3
8801    movu        [r2 + (r4 - mmsize) * 2], m1
8802    movu        [r2 + (r4 - mmsize) * 2 + mmsize], m2
8803
8804    sub         r4d, mmsize
8805    jz         .end
8806    cmp         r4d, mmsize
8807    jge        .loop32
8808
8809    ; process partial pixels
8810    movu        m1, [r0]
8811    movu        m2, [r0 + mmsize]
8812    psllw       m1, xm0
8813    psllw       m2, xm0
8814    pand        m1, m3
8815    pand        m2, m3
8816    movu        [r2], m1
8817    movu        [r2 + mmsize], m2
8818
8819.end:
8820    RET
8821INIT_ZMM avx512
8822cglobal upShift_16, 4,7,4
8823    mov         r4d, r4m
8824    mov         r5d, r5m
8825    movd        xm0, r6m        ; m0 = shift
8826    vbroadcasti32x4 m3, [pw_pixel_max]
8827    FIX_STRIDES r1d, r3d
8828    dec         r5d
8829.loopH:
8830    xor         r6d, r6d
8831.loopW:
8832    movu        m1, [r0 + r6 * SIZEOF_PIXEL]
8833    psllw       m1, xm0
8834    pand        m1, m3
8835    movu        [r2 + r6 * SIZEOF_PIXEL], m1
8836
8837    add         r6, mmsize / SIZEOF_PIXEL
8838    cmp         r6d, r4d
8839    jl         .loopW
8840
8841    ; move to next row
8842    add         r0, r1
8843    add         r2, r3
8844    dec         r5d
8845    jnz        .loopH
8846
    ; process the last row of every frame (handles widths that are not a multiple of 32)
8848
8849.loop32:
8850    movu        m1, [r0 + (r4 - mmsize/2) * 2]
8851    psllw       m1, xm0
8852    pand        m1, m3
8853    movu        [r2 + (r4 - mmsize/2) * 2], m1
8854
8855    sub         r4d, mmsize/2
8856    jz         .end
8857    cmp         r4d, mmsize/2
8858    jge        .loop32
8859
8860    ; process partial pixels
8861    movu        m1, [r0]
8862    psllw       m1, xm0
8863    pand        m1, m3
8864    movu        [r2], m1
8865
8866.end:
8867    RET
8868;---------------------------------------------------------------------------------------------------------------------
8869;int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
8870;---------------------------------------------------------------------------------------------------------------------
8871INIT_XMM sse4
8872cglobal psyCost_pp_4x4, 4, 5, 8
8873
8874%if HIGH_BIT_DEPTH
8875    FIX_STRIDES r1, r3
8876    lea             r4, [3 * r1]
8877    movddup         m0, [r0]
8878    movddup         m1, [r0 + r1]
8879    movddup         m2, [r0 + r1 * 2]
8880    movddup         m3, [r0 + r4]
8881    mova            m4, [hmul_8w]
8882    pmaddwd         m0, m4
8883    pmaddwd         m1, m4
8884    pmaddwd         m2, m4
8885    pmaddwd         m3, m4
8886
8887    paddd           m5, m0, m1
8888    paddd           m5, m2
8889    paddd           m5, m3
8890    psrldq          m4, m5, 4
8891    paddd           m5, m4
8892    psrld           m5, 2
8893
8894    SUMSUB_BA d, 0, 1, 4
8895    SUMSUB_BA d, 2, 3, 4
8896    SUMSUB_BA d, 0, 2, 4
8897    SUMSUB_BA d, 1, 3, 4
8898    %define ORDER unord
8899    TRANS q, ORDER, 0, 2, 4, 6
8900    TRANS q, ORDER, 1, 3, 4, 6
8901    ABSD2 m0, m2, m0, m2, m4, m6
8902    pmaxsd          m0, m2
8903    ABSD2 m1, m3, m1, m3, m4, m6
8904    pmaxsd          m1, m3
8905    paddd           m0, m1
8906    movhlps         m1, m0
8907    paddd           m0, m1
8908    psrldq          m1, m0, 4
8909    paddd           m0, m1
8910
8911    psubd           m7, m0, m5
8912
8913    lea             r4, [3 * r3]
8914    movddup         m0, [r2]
8915    movddup         m1, [r2 + r3]
8916    movddup         m2, [r2 + r3 * 2]
8917    movddup         m3, [r2 + r4]
8918    mova            m4, [hmul_8w]
8919    pmaddwd         m0, m4
8920    pmaddwd         m1, m4
8921    pmaddwd         m2, m4
8922    pmaddwd         m3, m4
8923
8924    paddd           m5, m0, m1
8925    paddd           m5, m2
8926    paddd           m5, m3
8927    psrldq          m4, m5, 4
8928    paddd           m5, m4
8929    psrld           m5, 2
8930
8931    SUMSUB_BA d, 0, 1, 4
8932    SUMSUB_BA d, 2, 3, 4
8933    SUMSUB_BA d, 0, 2, 4
8934    SUMSUB_BA d, 1, 3, 4
8935    %define ORDER unord
8936    TRANS q, ORDER, 0, 2, 4, 6
8937    TRANS q, ORDER, 1, 3, 4, 6
8938    ABSD2 m0, m2, m0, m2, m4, m6
8939    pmaxsd          m0, m2
8940    ABSD2 m1, m3, m1, m3, m4, m6
8941    pmaxsd          m1, m3
8942    paddd           m0, m1
8943    movhlps         m1, m0
8944    paddd           m0, m1
8945    psrldq          m1, m0, 4
8946    paddd           m0, m1
8947
8948    psubd           m0, m5
8949
8950    psubd           m7, m0
8951    pabsd           m0, m7
8952    movd            eax, m0
8953
8954%else ; !HIGH_BIT_DEPTH
8955    lea             r4, [3 * r1]
8956    movd            m0, [r0]
8957    movd            m1, [r0 + r1]
8958    movd            m2, [r0 + r1 * 2]
8959    movd            m3, [r0 + r4]
8960    shufps          m0, m1, 0
8961    shufps          m2, m3, 0
8962    mova            m4, [hmul_4p]
8963    pmaddubsw       m0, m4
8964    pmaddubsw       m2, m4
8965
8966    paddw           m5, m0, m2
8967    movhlps         m4, m5
8968    paddw           m5, m4
8969    pmaddwd         m5, [pw_1]
8970    psrld           m5, 2
8971
8972    HADAMARD 0, sumsub, 0, 2, 1, 3
8973    HADAMARD 4, sumsub, 0, 2, 1, 3
8974    HADAMARD 1, amax, 0, 2, 1, 3
8975    HADDW m0, m2
8976
8977    psubd           m6, m0, m5
8978
8979    lea             r4, [3 * r3]
8980    movd            m0, [r2]
8981    movd            m1, [r2 + r3]
8982    movd            m2, [r2 + r3 * 2]
8983    movd            m3, [r2 + r4]
8984    shufps          m0, m1, 0
8985    shufps          m2, m3, 0
8986    mova            m4, [hmul_4p]
8987    pmaddubsw       m0, m4
8988    pmaddubsw       m2, m4
8989
8990    paddw           m5, m0, m2
8991    movhlps         m4, m5
8992    paddw           m5, m4
8993    pmaddwd         m5, [pw_1]
8994    psrld           m5, 2
8995
8996    HADAMARD 0, sumsub, 0, 2, 1, 3
8997    HADAMARD 4, sumsub, 0, 2, 1, 3
8998    HADAMARD 1, amax, 0, 2, 1, 3
8999    HADDW m0, m2
9000
9001    psubd           m0, m5
9002
9003    psubd           m6, m0
9004    pabsd           m0, m6
9005    movd            eax, m0
9006%endif ; HIGH_BIT_DEPTH
9007    RET
9008
9009%if ARCH_X86_64
9010INIT_XMM sse4
9011cglobal psyCost_pp_8x8, 4, 6, 13
9012
9013%if HIGH_BIT_DEPTH
9014    FIX_STRIDES r1, r3
9015    lea             r4, [3 * r1]
9016    pxor            m10, m10
9017    movu            m0, [r0]
9018    movu            m1, [r0 + r1]
9019    movu            m2, [r0 + r1 * 2]
9020    movu            m3, [r0 + r4]
9021    lea             r5, [r0 + r1 * 4]
9022    movu            m4, [r5]
9023    movu            m5, [r5 + r1]
9024    movu            m6, [r5 + r1 * 2]
9025    movu            m7, [r5 + r4]
9026
9027    paddw           m8, m0, m1
9028    paddw           m8, m2
9029    paddw           m8, m3
9030    paddw           m8, m4
9031    paddw           m8, m5
9032    paddw           m8, m6
9033    paddw           m8, m7
9034    pmaddwd         m8, [pw_1]
9035    movhlps         m9, m8
9036    paddd           m8, m9
9037    psrldq          m9, m8, 4
9038    paddd           m8, m9
9039    psrld           m8, 2
9040
9041    HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
9042
9043    paddd           m0, m1
9044    paddd           m0, m2
9045    paddd           m0, m3
9046    HADDUW m0, m1
9047    paddd           m0, [pd_1]
9048    psrld           m0, 1
9049    psubd           m10, m0, m8
9050
9051    lea             r4, [3 * r3]
9052    movu            m0, [r2]
9053    movu            m1, [r2 + r3]
9054    movu            m2, [r2 + r3 * 2]
9055    movu            m3, [r2 + r4]
9056    lea             r5, [r2 + r3 * 4]
9057    movu            m4, [r5]
9058    movu            m5, [r5 + r3]
9059    movu            m6, [r5 + r3 * 2]
9060    movu            m7, [r5 + r4]
9061
9062    paddw           m8, m0, m1
9063    paddw           m8, m2
9064    paddw           m8, m3
9065    paddw           m8, m4
9066    paddw           m8, m5
9067    paddw           m8, m6
9068    paddw           m8, m7
9069    pmaddwd         m8, [pw_1]
9070    movhlps         m9, m8
9071    paddd           m8, m9
9072    psrldq          m9, m8, 4
9073    paddd           m8, m9
9074    psrld           m8, 2
9075
9076    HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
9077
9078    paddd           m0, m1
9079    paddd           m0, m2
9080    paddd           m0, m3
9081    HADDUW m0, m1
9082    paddd           m0, [pd_1]
9083    psrld           m0, 1
9084    psubd           m0, m8
9085    psubd           m10, m0
9086    pabsd           m0, m10
9087    movd            eax, m0
9088%else ; !HIGH_BIT_DEPTH
9089    lea             r4, [3 * r1]
9090    mova            m8, [hmul_8p]
9091
9092    movddup         m0, [r0]
9093    movddup         m1, [r0 + r1]
9094    movddup         m2, [r0 + r1 * 2]
9095    movddup         m3, [r0 + r4]
9096    lea             r5, [r0 + r1 * 4]
9097    movddup         m4, [r5]
9098    movddup         m5, [r5 + r1]
9099    movddup         m6, [r5 + r1 * 2]
9100    movddup         m7, [r5 + r4]
9101
9102    pmaddubsw       m0, m8
9103    pmaddubsw       m1, m8
9104    pmaddubsw       m2, m8
9105    pmaddubsw       m3, m8
9106    pmaddubsw       m4, m8
9107    pmaddubsw       m5, m8
9108    pmaddubsw       m6, m8
9109    pmaddubsw       m7, m8
9110
9111    paddw           m11, m0, m1
9112    paddw           m11, m2
9113    paddw           m11, m3
9114    paddw           m11, m4
9115    paddw           m11, m5
9116    paddw           m11, m6
9117    paddw           m11, m7
9118
9119    pmaddwd         m11, [pw_1]
9120    psrldq          m10, m11, 4
9121    paddd           m11, m10
9122    psrld           m11, 2
9123
9124    HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
9125
9126    paddw           m0, m1
9127    paddw           m0, m2
9128    paddw           m0, m3
9129    HADDW m0, m1
9130
9131    paddd           m0, [pd_1]
9132    psrld           m0, 1
9133    psubd           m12, m0, m11
9134
9135    lea             r4, [3 * r3]
9136
9137    movddup         m0, [r2]
9138    movddup         m1, [r2 + r3]
9139    movddup         m2, [r2 + r3 * 2]
9140    movddup         m3, [r2 + r4]
9141    lea             r5, [r2 + r3 * 4]
9142    movddup         m4, [r5]
9143    movddup         m5, [r5 + r3]
9144    movddup         m6, [r5 + r3 * 2]
9145    movddup         m7, [r5 + r4]
9146
9147    pmaddubsw       m0, m8
9148    pmaddubsw       m1, m8
9149    pmaddubsw       m2, m8
9150    pmaddubsw       m3, m8
9151    pmaddubsw       m4, m8
9152    pmaddubsw       m5, m8
9153    pmaddubsw       m6, m8
9154    pmaddubsw       m7, m8
9155
9156    paddw           m11, m0, m1
9157    paddw           m11, m2
9158    paddw           m11, m3
9159    paddw           m11, m4
9160    paddw           m11, m5
9161    paddw           m11, m6
9162    paddw           m11, m7
9163
9164    pmaddwd         m11, [pw_1]
9165    psrldq          m10, m11, 4
9166    paddd           m11, m10
9167    psrld           m11, 2
9168
9169    HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
9170
9171    paddw           m0, m1
9172    paddw           m0, m2
9173    paddw           m0, m3
9174    HADDW m0, m1
9175
9176    paddd           m0, [pd_1]
9177    psrld           m0, 1
9178    psubd           m0, m11
9179    psubd           m12, m0
9180    pabsd           m0, m12
9181    movd            eax, m0
9182%endif ; HIGH_BIT_DEPTH
9183    RET
9184%endif
9185
9186%if ARCH_X86_64
9187%if HIGH_BIT_DEPTH
9188INIT_XMM sse4
9189cglobal psyCost_pp_16x16, 4, 9, 14
9190
9191    FIX_STRIDES r1, r3
9192    lea             r4, [3 * r1]
9193    lea             r8, [3 * r3]
9194    mova            m12, [pw_1]
9195    mova            m13, [pd_1]
9196    pxor            m11, m11
9197    mov             r7d, 2
9198.loopH:
9199    mov             r6d, 2
9200.loopW:
9201    pxor            m10, m10
9202    movu            m0, [r0]
9203    movu            m1, [r0 + r1]
9204    movu            m2, [r0 + r1 * 2]
9205    movu            m3, [r0 + r4]
9206    lea             r5, [r0 + r1 * 4]
9207    movu            m4, [r5]
9208    movu            m5, [r5 + r1]
9209    movu            m6, [r5 + r1 * 2]
9210    movu            m7, [r5 + r4]
9211
9212    paddw           m8, m0, m1
9213    paddw           m8, m2
9214    paddw           m8, m3
9215    paddw           m8, m4
9216    paddw           m8, m5
9217    paddw           m8, m6
9218    paddw           m8, m7
9219    pmaddwd         m8, m12
9220    movhlps         m9, m8
9221    paddd           m8, m9
9222    psrldq          m9, m8, 4
9223    paddd           m8, m9
9224    psrld           m8, 2
9225
9226    HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
9227
9228    paddd           m0, m1
9229    paddd           m0, m2
9230    paddd           m0, m3
9231    HADDUW m0, m1
9232    paddd           m0, m13
9233    psrld           m0, 1
9234    psubd           m10, m0, m8
9235
9236    movu            m0, [r2]
9237    movu            m1, [r2 + r3]
9238    movu            m2, [r2 + r3 * 2]
9239    movu            m3, [r2 + r8]
9240    lea             r5, [r2 + r3 * 4]
9241    movu            m4, [r5]
9242    movu            m5, [r5 + r3]
9243    movu            m6, [r5 + r3 * 2]
9244    movu            m7, [r5 + r8]
9245
9246    paddw           m8, m0, m1
9247    paddw           m8, m2
9248    paddw           m8, m3
9249    paddw           m8, m4
9250    paddw           m8, m5
9251    paddw           m8, m6
9252    paddw           m8, m7
9253    pmaddwd         m8, m12
9254    movhlps         m9, m8
9255    paddd           m8, m9
9256    psrldq          m9, m8, 4
9257    paddd           m8, m9
9258    psrld           m8, 2
9259
9260    HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
9261
9262    paddd           m0, m1
9263    paddd           m0, m2
9264    paddd           m0, m3
9265    HADDUW m0, m1
9266    paddd           m0, m13
9267    psrld           m0, 1
9268    psubd           m0, m8
9269    psubd           m10, m0
9270    pabsd           m0, m10
9271    paddd           m11, m0
9272    add             r0, 16
9273    add             r2, 16
9274    dec             r6d
9275    jnz             .loopW
9276    lea             r0, [r0 + r1 * 8 - 32]
9277    lea             r2, [r2 + r3 * 8 - 32]
9278    dec             r7d
9279    jnz             .loopH
9280    movd            eax, m11
9281    RET
9282%else ; !HIGH_BIT_DEPTH
9283INIT_XMM sse4
9284cglobal psyCost_pp_16x16, 4, 9, 15
9285    lea             r4, [3 * r1]
9286    lea             r8, [3 * r3]
9287    mova            m8, [hmul_8p]
9288    mova            m10, [pw_1]
9289    mova            m14, [pd_1]
9290    pxor            m13, m13
9291    mov             r7d, 2
9292.loopH:
9293    mov             r6d, 2
9294.loopW:
9295    pxor            m12, m12
9296    movddup         m0, [r0]
9297    movddup         m1, [r0 + r1]
9298    movddup         m2, [r0 + r1 * 2]
9299    movddup         m3, [r0 + r4]
9300    lea             r5, [r0 + r1 * 4]
9301    movddup         m4, [r5]
9302    movddup         m5, [r5 + r1]
9303    movddup         m6, [r5 + r1 * 2]
9304    movddup         m7, [r5 + r4]
9305
9306    pmaddubsw       m0, m8
9307    pmaddubsw       m1, m8
9308    pmaddubsw       m2, m8
9309    pmaddubsw       m3, m8
9310    pmaddubsw       m4, m8
9311    pmaddubsw       m5, m8
9312    pmaddubsw       m6, m8
9313    pmaddubsw       m7, m8
9314
9315    paddw           m11, m0, m1
9316    paddw           m11, m2
9317    paddw           m11, m3
9318    paddw           m11, m4
9319    paddw           m11, m5
9320    paddw           m11, m6
9321    paddw           m11, m7
9322
9323    pmaddwd         m11, m10
9324    psrldq          m9, m11, 4
9325    paddd           m11, m9
9326    psrld           m11, 2
9327
9328    HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9
9329
9330    paddw           m0, m1
9331    paddw           m0, m2
9332    paddw           m0, m3
9333    HADDW m0, m1
9334
9335    paddd           m0, m14
9336    psrld           m0, 1
9337    psubd           m12, m0, m11
9338
9339    movddup         m0, [r2]
9340    movddup         m1, [r2 + r3]
9341    movddup         m2, [r2 + r3 * 2]
9342    movddup         m3, [r2 + r8]
9343    lea             r5, [r2 + r3 * 4]
9344    movddup         m4, [r5]
9345    movddup         m5, [r5 + r3]
9346    movddup         m6, [r5 + r3 * 2]
9347    movddup         m7, [r5 + r8]
9348
9349    pmaddubsw       m0, m8
9350    pmaddubsw       m1, m8
9351    pmaddubsw       m2, m8
9352    pmaddubsw       m3, m8
9353    pmaddubsw       m4, m8
9354    pmaddubsw       m5, m8
9355    pmaddubsw       m6, m8
9356    pmaddubsw       m7, m8
9357
9358    paddw           m11, m0, m1
9359    paddw           m11, m2
9360    paddw           m11, m3
9361    paddw           m11, m4
9362    paddw           m11, m5
9363    paddw           m11, m6
9364    paddw           m11, m7
9365
9366    pmaddwd         m11, m10
9367    psrldq          m9, m11, 4
9368    paddd           m11, m9
9369    psrld           m11, 2
9370
9371    HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9
9372
9373    paddw           m0, m1
9374    paddw           m0, m2
9375    paddw           m0, m3
9376    HADDW m0, m1
9377
9378    paddd           m0, m14
9379    psrld           m0, 1
9380    psubd           m0, m11
9381    psubd           m12, m0
9382    pabsd           m0, m12
9383    paddd           m13, m0
9384    add             r0, 8
9385    add             r2, 8
9386    dec             r6d
9387    jnz             .loopW
9388    lea             r0, [r0 + r1 * 8 - 16]
9389    lea             r2, [r2 + r3 * 8 - 16]
9390    dec             r7d
9391    jnz             .loopH
9392    movd            eax, m13
9393    RET
9394%endif ; HIGH_BIT_DEPTH
9395%endif
9396
9397%if ARCH_X86_64
9398%if HIGH_BIT_DEPTH
9399INIT_XMM sse4
9400cglobal psyCost_pp_32x32, 4, 9, 14
9401
9402    FIX_STRIDES r1, r3
9403    lea             r4, [3 * r1]
9404    lea             r8, [3 * r3]
9405    mova            m12, [pw_1]
9406    mova            m13, [pd_1]
9407    pxor            m11, m11
9408    mov             r7d, 4
9409.loopH:
9410    mov             r6d, 4
9411.loopW:
9412    pxor            m10, m10
9413    movu            m0, [r0]
9414    movu            m1, [r0 + r1]
9415    movu            m2, [r0 + r1 * 2]
9416    movu            m3, [r0 + r4]
9417    lea             r5, [r0 + r1 * 4]
9418    movu            m4, [r5]
9419    movu            m5, [r5 + r1]
9420    movu            m6, [r5 + r1 * 2]
9421    movu            m7, [r5 + r4]
9422
9423    paddw           m8, m0, m1
9424    paddw           m8, m2
9425    paddw           m8, m3
9426    paddw           m8, m4
9427    paddw           m8, m5
9428    paddw           m8, m6
9429    paddw           m8, m7
9430    pmaddwd         m8, m12
9431    movhlps         m9, m8
9432    paddd           m8, m9
9433    psrldq          m9, m8, 4
9434    paddd           m8, m9
9435    psrld           m8, 2
9436
9437    HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
9438
9439    paddd           m0, m1
9440    paddd           m0, m2
9441    paddd           m0, m3
9442    HADDUW m0, m1
9443    paddd           m0, m13
9444    psrld           m0, 1
9445    psubd           m10, m0, m8
9446
9447    movu            m0, [r2]
9448    movu            m1, [r2 + r3]
9449    movu            m2, [r2 + r3 * 2]
9450    movu            m3, [r2 + r8]
9451    lea             r5, [r2 + r3 * 4]
9452    movu            m4, [r5]
9453    movu            m5, [r5 + r3]
9454    movu            m6, [r5 + r3 * 2]
9455    movu            m7, [r5 + r8]
9456
9457    paddw           m8, m0, m1
9458    paddw           m8, m2
9459    paddw           m8, m3
9460    paddw           m8, m4
9461    paddw           m8, m5
9462    paddw           m8, m6
9463    paddw           m8, m7
9464    pmaddwd         m8, m12
9465    movhlps         m9, m8
9466    paddd           m8, m9
9467    psrldq          m9, m8, 4
9468    paddd           m8, m9
9469    psrld           m8, 2
9470
9471    HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
9472
9473    paddd           m0, m1
9474    paddd           m0, m2
9475    paddd           m0, m3
9476    HADDUW m0, m1
9477    paddd           m0, m13
9478    psrld           m0, 1
9479    psubd           m0, m8
9480    psubd           m10, m0
9481    pabsd           m0, m10
9482    paddd           m11, m0
9483    add             r0, 16
9484    add             r2, 16
9485    dec             r6d
9486    jnz             .loopW
9487    lea             r0, [r0 + r1 * 8 - 64]
9488    lea             r2, [r2 + r3 * 8 - 64]
9489    dec             r7d
9490    jnz             .loopH
9491    movd            eax, m11
9492    RET
9493
9494%else ; !HIGH_BIT_DEPTH
9495INIT_XMM sse4
9496cglobal psyCost_pp_32x32, 4, 9, 15
9497
9498    lea             r4, [3 * r1]
9499    lea             r8, [3 * r3]
9500    mova            m8, [hmul_8p]
9501    mova            m10, [pw_1]
9502    mova            m14, [pd_1]
9503    pxor            m13, m13
9504    mov             r7d, 4
9505.loopH:
9506    mov             r6d, 4
9507.loopW:
9508    pxor            m12, m12
9509    movddup         m0, [r0]
9510    movddup         m1, [r0 + r1]
9511    movddup         m2, [r0 + r1 * 2]
9512    movddup         m3, [r0 + r4]
9513    lea             r5, [r0 + r1 * 4]
9514    movddup         m4, [r5]
9515    movddup         m5, [r5 + r1]
9516    movddup         m6, [r5 + r1 * 2]
9517    movddup         m7, [r5 + r4]
9518
9519    pmaddubsw       m0, m8
9520    pmaddubsw       m1, m8
9521    pmaddubsw       m2, m8
9522    pmaddubsw       m3, m8
9523    pmaddubsw       m4, m8
9524    pmaddubsw       m5, m8
9525    pmaddubsw       m6, m8
9526    pmaddubsw       m7, m8
9527
9528    paddw           m11, m0, m1
9529    paddw           m11, m2
9530    paddw           m11, m3
9531    paddw           m11, m4
9532    paddw           m11, m5
9533    paddw           m11, m6
9534    paddw           m11, m7
9535
9536    pmaddwd         m11, m10
9537    psrldq          m9, m11, 4
9538    paddd           m11, m9
9539    psrld           m11, 2
9540
9541    HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9
9542
9543    paddw           m0, m1
9544    paddw           m0, m2
9545    paddw           m0, m3
9546    HADDW m0, m1
9547
9548    paddd           m0, m14
9549    psrld           m0, 1
9550    psubd           m12, m0, m11
9551
9552    movddup         m0, [r2]
9553    movddup         m1, [r2 + r3]
9554    movddup         m2, [r2 + r3 * 2]
9555    movddup         m3, [r2 + r8]
9556    lea             r5, [r2 + r3 * 4]
9557    movddup         m4, [r5]
9558    movddup         m5, [r5 + r3]
9559    movddup         m6, [r5 + r3 * 2]
9560    movddup         m7, [r5 + r8]
9561
9562    pmaddubsw       m0, m8
9563    pmaddubsw       m1, m8
9564    pmaddubsw       m2, m8
9565    pmaddubsw       m3, m8
9566    pmaddubsw       m4, m8
9567    pmaddubsw       m5, m8
9568    pmaddubsw       m6, m8
9569    pmaddubsw       m7, m8
9570
9571    paddw           m11, m0, m1
9572    paddw           m11, m2
9573    paddw           m11, m3
9574    paddw           m11, m4
9575    paddw           m11, m5
9576    paddw           m11, m6
9577    paddw           m11, m7
9578
9579    pmaddwd         m11, m10
9580    psrldq          m9, m11, 4
9581    paddd           m11, m9
9582    psrld           m11, 2
9583
9584    HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9
9585
9586    paddw           m0, m1
9587    paddw           m0, m2
9588    paddw           m0, m3
9589    HADDW m0, m1
9590
9591    paddd           m0, m14
9592    psrld           m0, 1
9593    psubd           m0, m11
9594    psubd           m12, m0
9595    pabsd           m0, m12
9596    paddd           m13, m0
9597    add             r0, 8
9598    add             r2, 8
9599    dec             r6d
9600    jnz             .loopW
9601    lea             r0, [r0 + r1 * 8 - 32]
9602    lea             r2, [r2 + r3 * 8 - 32]
9603    dec             r7d
9604    jnz             .loopH
9605    movd            eax, m13
9606    RET
9607%endif ; HIGH_BIT_DEPTH
9608%endif
9609
9610%if ARCH_X86_64
9611%if HIGH_BIT_DEPTH
9612INIT_XMM sse4
9613cglobal psyCost_pp_64x64, 4, 9, 14
9614
9615    FIX_STRIDES r1, r3
9616    lea             r4, [3 * r1]
9617    lea             r8, [3 * r3]
9618    mova            m12, [pw_1]
9619    mova            m13, [pd_1]
9620    pxor            m11, m11
9621    mov             r7d, 8
9622.loopH:
9623    mov             r6d, 8
9624.loopW:
9625    pxor            m10, m10
9626    movu            m0, [r0]
9627    movu            m1, [r0 + r1]
9628    movu            m2, [r0 + r1 * 2]
9629    movu            m3, [r0 + r4]
9630    lea             r5, [r0 + r1 * 4]
9631    movu            m4, [r5]
9632    movu            m5, [r5 + r1]
9633    movu            m6, [r5 + r1 * 2]
9634    movu            m7, [r5 + r4]
9635
9636    paddw           m8, m0, m1
9637    paddw           m8, m2
9638    paddw           m8, m3
9639    paddw           m8, m4
9640    paddw           m8, m5
9641    paddw           m8, m6
9642    paddw           m8, m7
9643    pmaddwd         m8, m12
9644    movhlps         m9, m8
9645    paddd           m8, m9
9646    psrldq          m9, m8, 4
9647    paddd           m8, m9
9648    psrld           m8, 2
9649
9650    HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
9651
9652    paddd           m0, m1
9653    paddd           m0, m2
9654    paddd           m0, m3
9655    HADDUW m0, m1
9656    paddd           m0, m13
9657    psrld           m0, 1
9658    psubd           m10, m0, m8
9659
9660    movu            m0, [r2]
9661    movu            m1, [r2 + r3]
9662    movu            m2, [r2 + r3 * 2]
9663    movu            m3, [r2 + r8]
9664    lea             r5, [r2 + r3 * 4]
9665    movu            m4, [r5]
9666    movu            m5, [r5 + r3]
9667    movu            m6, [r5 + r3 * 2]
9668    movu            m7, [r5 + r8]
9669
9670    paddw           m8, m0, m1
9671    paddw           m8, m2
9672    paddw           m8, m3
9673    paddw           m8, m4
9674    paddw           m8, m5
9675    paddw           m8, m6
9676    paddw           m8, m7
9677    pmaddwd         m8, m12
9678    movhlps         m9, m8
9679    paddd           m8, m9
9680    psrldq          m9, m8, 4
9681    paddd           m8, m9
9682    psrld           m8, 2
9683
9684    HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
9685
9686    paddd           m0, m1
9687    paddd           m0, m2
9688    paddd           m0, m3
9689    HADDUW m0, m1
9690    paddd           m0, m13
9691    psrld           m0, 1
9692    psubd           m0, m8
9693    psubd           m10, m0
9694    pabsd           m0, m10
9695    paddd           m11, m0
9696    add             r0, 16
9697    add             r2, 16
9698    dec             r6d
9699    jnz             .loopW
9700    lea             r0, [r0 + r1 * 8 - 128]
9701    lea             r2, [r2 + r3 * 8 - 128]
9702    dec             r7d
9703    jnz             .loopH
9704    movd            eax, m11
9705    RET
9706
9707%else ; !HIGH_BIT_DEPTH
9708INIT_XMM sse4
9709cglobal psyCost_pp_64x64, 4, 9, 15
9710
9711    lea             r4, [3 * r1]
9712    lea             r8, [3 * r3]
9713    mova            m8, [hmul_8p]
9714    mova            m10, [pw_1]
9715    mova            m14, [pd_1]
9716    pxor            m13, m13
9717    mov             r7d, 8
9718.loopH:
9719    mov             r6d, 8
9720.loopW:
9721    pxor            m12, m12
9722    movddup         m0, [r0]
9723    movddup         m1, [r0 + r1]
9724    movddup         m2, [r0 + r1 * 2]
9725    movddup         m3, [r0 + r4]
9726    lea             r5, [r0 + r1 * 4]
9727    movddup         m4, [r5]
9728    movddup         m5, [r5 + r1]
9729    movddup         m6, [r5 + r1 * 2]
9730    movddup         m7, [r5 + r4]
9731
9732    pmaddubsw       m0, m8
9733    pmaddubsw       m1, m8
9734    pmaddubsw       m2, m8
9735    pmaddubsw       m3, m8
9736    pmaddubsw       m4, m8
9737    pmaddubsw       m5, m8
9738    pmaddubsw       m6, m8
9739    pmaddubsw       m7, m8
9740
9741    paddw           m11, m0, m1
9742    paddw           m11, m2
9743    paddw           m11, m3
9744    paddw           m11, m4
9745    paddw           m11, m5
9746    paddw           m11, m6
9747    paddw           m11, m7
9748
9749    pmaddwd         m11, m10
9750    psrldq          m9, m11, 4
9751    paddd           m11, m9
9752    psrld           m11, 2
9753
9754    HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9
9755
9756    paddw           m0, m1
9757    paddw           m0, m2
9758    paddw           m0, m3
9759    HADDW m0, m1
9760
9761    paddd           m0, m14
9762    psrld           m0, 1
9763    psubd           m12, m0, m11
9764
9765    movddup         m0, [r2]
9766    movddup         m1, [r2 + r3]
9767    movddup         m2, [r2 + r3 * 2]
9768    movddup         m3, [r2 + r8]
9769    lea             r5, [r2 + r3 * 4]
9770    movddup         m4, [r5]
9771    movddup         m5, [r5 + r3]
9772    movddup         m6, [r5 + r3 * 2]
9773    movddup         m7, [r5 + r8]
9774
9775    pmaddubsw       m0, m8
9776    pmaddubsw       m1, m8
9777    pmaddubsw       m2, m8
9778    pmaddubsw       m3, m8
9779    pmaddubsw       m4, m8
9780    pmaddubsw       m5, m8
9781    pmaddubsw       m6, m8
9782    pmaddubsw       m7, m8
9783
9784    paddw           m11, m0, m1
9785    paddw           m11, m2
9786    paddw           m11, m3
9787    paddw           m11, m4
9788    paddw           m11, m5
9789    paddw           m11, m6
9790    paddw           m11, m7
9791
9792    pmaddwd         m11, m10
9793    psrldq          m9, m11, 4
9794    paddd           m11, m9
9795    psrld           m11, 2
9796
9797    HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9
9798
9799    paddw           m0, m1
9800    paddw           m0, m2
9801    paddw           m0, m3
9802    HADDW m0, m1
9803
9804    paddd           m0, m14
9805    psrld           m0, 1
9806    psubd           m0, m11
9807    psubd           m12, m0
9808    pabsd           m0, m12
9809    paddd           m13, m0
9810    add             r0, 8
9811    add             r2, 8
9812    dec             r6d
9813    jnz             .loopW
9814    lea             r0, [r0 + r1 * 8 - 64]
9815    lea             r2, [r2 + r3 * 8 - 64]
9816    dec             r7d
9817    jnz             .loopH
9818    movd            eax, m13
9819    RET
9820%endif ; HIGH_BIT_DEPTH
9821%endif
9822
9823INIT_YMM avx2
9824%if HIGH_BIT_DEPTH
9825cglobal psyCost_pp_4x4, 4, 5, 6
9826    add             r1d, r1d
9827    add             r3d, r3d
9828    lea              r4, [r1 * 3]
9829    movddup         xm0, [r0]
9830    movddup         xm1, [r0 + r1]
9831    movddup         xm2, [r0 + r1 * 2]
9832    movddup         xm3, [r0 + r4]
9833
9834    lea              r4, [r3 * 3]
9835    movddup         xm4, [r2]
9836    movddup         xm5, [r2 + r3]
9837    vinserti128      m0, m0, xm4, 1
9838    vinserti128      m1, m1, xm5, 1
9839    movddup         xm4, [r2 + r3 * 2]
9840    movddup         xm5, [r2 + r4]
9841    vinserti128      m2, m2, xm4, 1
9842    vinserti128      m3, m3, xm5, 1
9843
9844    mova             m4, [hmul_8w]
9845    pmaddwd          m0, m4
9846    pmaddwd          m1, m4
9847    pmaddwd          m2, m4
9848    pmaddwd          m3, m4
9849    paddd            m5, m0, m1
9850    paddd            m4, m2, m3
9851    paddd            m5, m4
9852    psrldq           m4, m5, 4
9853    paddd            m5, m4
9854    psrld            m5, 2
9855
9856    mova             m4, m0
9857    paddd            m0, m1
9858    psubd            m1, m4
9859    mova             m4, m2
9860    paddd            m2, m3
9861    psubd            m3, m4
9862    mova             m4, m0
9863    paddd            m0, m2
9864    psubd            m2, m4
9865    mova             m4, m1
9866    paddd            m1, m3
9867    psubd            m3, m4
9868    movaps           m4, m0
9869    vshufps          m4, m4, m2, 11011101b
9870    vshufps          m0, m0, m2, 10001000b
9871    movaps           m2, m1
9872    vshufps          m2, m2, m3, 11011101b
9873    vshufps          m1, m1, m3, 10001000b
9874    pabsd            m0, m0
9875    pabsd            m4, m4
9876    pmaxsd           m0, m4
9877    pabsd            m1, m1
9878    pabsd            m2, m2
9879    pmaxsd           m1, m2
9880    paddd            m0, m1
9881
9882    vpermq           m1, m0, 11110101b
9883    paddd            m0, m1
9884    psrldq           m1, m0, 4
9885    paddd            m0, m1
9886    psubd            m0, m5
9887
9888    vextracti128    xm1, m0, 1
9889    psubd           xm1, xm0
9890    pabsd           xm1, xm1
9891    movd            eax, xm1
9892    RET
9893%else ; !HIGH_BIT_DEPTH
9894cglobal psyCost_pp_4x4, 4, 5, 6
9895    lea             r4, [3 * r1]
9896    movd            xm0, [r0]
9897    movd            xm1, [r0 + r1]
9898    movd            xm2, [r0 + r1 * 2]
9899    movd            xm3, [r0 + r4]
9900    vshufps         xm0, xm1, 0
9901    vshufps         xm2, xm3, 0
9902
9903    lea             r4, [3 * r3]
9904    movd            xm1, [r2]
9905    movd            xm3, [r2 + r3]
9906    movd            xm4, [r2 + r3 * 2]
9907    movd            xm5, [r2 + r4]
9908    vshufps         xm1, xm3, 0
9909    vshufps         xm4, xm5, 0
9910
9911    vinserti128     m0, m0, xm1, 1
9912    vinserti128     m2, m2, xm4, 1
9913
9914    mova            m4, [hmul_4p]
9915    pmaddubsw       m0, m4
9916    pmaddubsw       m2, m4
9917
9918    paddw           m5, m0, m2
9919    mova            m1, m5
9920    psrldq          m4, m5, 8
9921    paddw           m5, m4
9922    pmaddwd         m5, [pw_1]
9923    psrld           m5, 2
9924
9925    vpsubw          m2, m2, m0
9926    vpunpckhqdq     m0, m1, m2
9927    vpunpcklqdq     m1, m1, m2
9928    vpaddw          m2, m1, m0
9929    vpsubw          m0, m0, m1
9930    vpblendw        m1, m2, m0, 10101010b
9931    vpslld          m0, m0, 10h
9932    vpsrld          m2, m2, 10h
9933    vpor            m0, m0, m2
9934    vpabsw          m1, m1
9935    vpabsw          m0, m0
9936    vpmaxsw         m1, m1, m0
9937    vpmaddwd        m1, m1, [pw_1]
9938    psrldq          m2, m1, 8
9939    paddd           m1, m2
9940    psrldq          m3, m1, 4
9941    paddd           m1, m3
9942    psubd           m1, m5
9943    vextracti128    xm2, m1, 1
9944    psubd           m1, m2
9945    pabsd           m1, m1
9946    movd            eax, xm1
9947    RET
9948%endif
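; PSY_PP_8x8 (below) evaluates the source and reconstructed 8x8 blocks in
; parallel: each ymm register holds a source row in its low 128-bit lane and
; the matching recon row in its high lane, so a single Hadamard pass produces
; both psy energies, which are then differenced.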
9949
9950%macro PSY_PP_8x8 0
9951    movddup         m0, [r0 + r1 * 0]
9952    movddup         m1, [r0 + r1 * 1]
9953    movddup         m2, [r0 + r1 * 2]
9954    movddup         m3, [r0 + r4 * 1]
9955
9956    lea             r5, [r0 + r1 * 4]
9957
9958    movddup         m4, [r2 + r3 * 0]
9959    movddup         m5, [r2 + r3 * 1]
9960    movddup         m6, [r2 + r3 * 2]
9961    movddup         m7, [r2 + r7 * 1]
9962
9963    lea             r6, [r2 + r3 * 4]
9964
9965    vinserti128     m0, m0, xm4, 1
9966    vinserti128     m1, m1, xm5, 1
9967    vinserti128     m2, m2, xm6, 1
9968    vinserti128     m3, m3, xm7, 1
9969
9970    movddup         m4, [r5 + r1 * 0]
9971    movddup         m5, [r5 + r1 * 1]
9972    movddup         m6, [r5 + r1 * 2]
9973    movddup         m7, [r5 + r4 * 1]
9974
9975    movddup         m9, [r6 + r3 * 0]
9976    movddup         m10, [r6 + r3 * 1]
9977    movddup         m11, [r6 + r3 * 2]
9978    movddup         m12, [r6 + r7 * 1]
9979
9980    vinserti128     m4, m4, xm9, 1
9981    vinserti128     m5, m5, xm10, 1
9982    vinserti128     m6, m6, xm11, 1
9983    vinserti128     m7, m7, xm12, 1
9984
9985    pmaddubsw       m0, m8
9986    pmaddubsw       m1, m8
9987    pmaddubsw       m2, m8
9988    pmaddubsw       m3, m8
9989    pmaddubsw       m4, m8
9990    pmaddubsw       m5, m8
9991    pmaddubsw       m6, m8
9992    pmaddubsw       m7, m8
9993
9994    paddw           m11, m0, m1
9995    paddw           m11, m2
9996    paddw           m11, m3
9997    paddw           m11, m4
9998    paddw           m11, m5
9999    paddw           m11, m6
10000    paddw           m11, m7
10001
10002    pmaddwd         m11, [pw_1]
10003    psrldq          m10, m11, 4
10004    paddd           m11, m10
10005    psrld           m11, 2
10006
10007    mova            m9, m0
10008    paddw           m0, m1      ; m0+m1
10009    psubw           m1, m9      ; m1-m0
10010    mova            m9, m2
10011    paddw           m2, m3      ; m2+m3
10012    psubw           m3, m9      ; m3-m2
10013    mova            m9, m0
10014    paddw           m0, m2      ; m0+m1+m2+m3
    psubw           m2, m9      ; (m2+m3)-(m0+m1)
10016    mova            m9, m1
10017    paddw           m1, m3      ; m1-m0+m3-m2
    psubw           m3, m9      ; (m3-m2)-(m1-m0)
10019
10020    movdqa          m9, m4
10021    paddw           m4, m5      ; m4+m5
10022    psubw           m5, m9      ; m5-m4
10023    movdqa          m9, m6
10024    paddw           m6, m7      ; m6+m7
10025    psubw           m7, m9      ; m7-m6
10026    movdqa          m9, m4
10027    paddw           m4, m6      ; m4+m5+m6+m7
    psubw           m6, m9      ; (m6+m7)-(m4+m5)
10029    movdqa          m9, m5
10030    paddw           m5, m7      ; m5-m4+m7-m6
    psubw           m7, m9      ; (m7-m6)-(m5-m4)
10032
10033    movdqa          m9, m0
10034    paddw           m0, m4      ; (m0+m1+m2+m3)+(m4+m5+m6+m7)
10035    psubw           m4, m9      ; (m4+m5+m6+m7)-(m0+m1+m2+m3)
10036    movdqa          m9, m1
10037    paddw           m1, m5      ; (m1-m0+m3-m2)+(m5-m4+m7-m6)
10038    psubw           m5, m9      ; (m5-m4+m7-m6)-(m1-m0+m3-m2)
10039
10040    mova            m9, m0
10041    vshufps         m9, m9, m4, 11011101b
10042    vshufps         m0, m0, m4, 10001000b
10043
10044    movdqa          m4, m0
10045    paddw           m0, m9      ; (a0 + a4) + (a4 - a0)
10046    psubw           m9, m4      ; (a0 + a4) - (a4 - a0) == (a0 + a4) + (a0 - a4)
10047
10048    movaps          m4, m1
10049    vshufps         m4, m4, m5, 11011101b
10050    vshufps         m1, m1, m5, 10001000b
10051
10052    movdqa          m5, m1
10053    paddw           m1, m4
10054    psubw           m4, m5
10055    movdqa          m5, m2
10056    paddw           m2, m6
10057    psubw           m6, m5
10058    movdqa          m5, m3
10059    paddw           m3, m7
10060    psubw           m7, m5
10061
10062    movaps          m5, m2
10063    vshufps         m5, m5, m6, 11011101b
10064    vshufps         m2, m2, m6, 10001000b
10065
10066    movdqa          m6, m2
10067    paddw           m2, m5
10068    psubw           m5, m6
10069    movaps          m6, m3
10070
10071    vshufps         m6, m6, m7, 11011101b
10072    vshufps         m3, m3, m7, 10001000b
10073
10074    movdqa          m7, m3
10075    paddw           m3, m6
10076    psubw           m6, m7
10077    movdqa          m7, m0
10078
10079    pblendw         m0, m9, 10101010b
10080    pslld           m9, 10h
10081    psrld           m7, 10h
10082    por             m9, m7
10083    pabsw           m0, m0
10084    pabsw           m9, m9
10085    pmaxsw          m0, m9
10086    movdqa          m7, m1
10087    pblendw         m1, m4, 10101010b
10088    pslld           m4, 10h
10089    psrld           m7, 10h
10090    por             m4, m7
10091    pabsw           m1, m1
10092    pabsw           m4, m4
10093    pmaxsw          m1, m4
10094    movdqa          m7, m2
10095    pblendw         m2, m5, 10101010b
10096    pslld           m5, 10h
10097    psrld           m7, 10h
10098    por             m5, m7
10099    pabsw           m2, m2
10100    pabsw           m5, m5
10101    pmaxsw          m2, m5
10102    mova            m7, m3
10103
10104    pblendw         m3, m6, 10101010b
10105    pslld           m6, 10h
10106    psrld           m7, 10h
10107    por             m6, m7
10108    pabsw           m3, m3
10109    pabsw           m6, m6
10110    pmaxsw          m3, m6
10111    paddw           m0, m1
10112    paddw           m0, m2
10113    paddw           m0, m3
10114    pmaddwd         m0, [pw_1]
10115    psrldq          m1, m0, 8
10116    paddd           m0, m1
10117
10118    pshuflw         m1, m0, 00001110b
10119    paddd           m0, m1
10120    paddd           m0, [pd_1]
10121    psrld           m0, 1
10122
10123    psubd           m0, m11
10124
10125    vextracti128    xm1, m0, 1
10126    psubd           m0, m1
10127    pabsd           m0, m0
10128%endmacro
10129
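; PSY_PP_8x8_AVX2: 10-bit psy cost for one 8x8 block.  Expects r0/r1 =
; source pointer/stride and r2/r3 = recon pointer/stride with the strides
; already converted to bytes; r4/r5 are clobbered.  Source rows sit in the
; low 128-bit lane and recon rows in the high lane; the per-lane SA8D energy
; minus the DC term (pixel sum >> 2) is formed and |energy_src - energy_rec|
; is left in the low dword of xm1.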
10130%macro PSY_PP_8x8_AVX2 0
10131    lea             r4, [r1 * 3]
10132    movu           xm0, [r0]
10133    movu           xm1, [r0 + r1]
10134    movu           xm2, [r0 + r1 * 2]
10135    movu           xm3, [r0 + r4]
10136    lea             r5, [r0 + r1 * 4]
10137    movu           xm4, [r5]
10138    movu           xm5, [r5 + r1]
10139    movu           xm6, [r5 + r1 * 2]
10140    movu           xm7, [r5 + r4]
10141
10142    lea             r4, [r3 * 3]
10143    vinserti128     m0, m0, [r2], 1
10144    vinserti128     m1, m1, [r2 + r3], 1
10145    vinserti128     m2, m2, [r2 + r3 * 2], 1
10146    vinserti128     m3, m3, [r2 + r4], 1
10147    lea             r5, [r2 + r3 * 4]
10148    vinserti128     m4, m4, [r5], 1
10149    vinserti128     m5, m5, [r5 + r3], 1
10150    vinserti128     m6, m6, [r5 + r3 * 2], 1
10151    vinserti128     m7, m7, [r5 + r4], 1
10152
10153    paddw           m8, m0, m1
10154    paddw           m8, m2
10155    paddw           m8, m3
10156    paddw           m8, m4
10157    paddw           m8, m5
10158    paddw           m8, m6
10159    paddw           m8, m7
10160    pmaddwd         m8, [pw_1]
10161
10162    psrldq          m9, m8, 8
10163    paddd           m8, m9
10164    psrldq          m9, m8, 4
10165    paddd           m8, m9
10166    psrld           m8, 2
10167
10168    psubw           m9, m1, m0
10169    paddw           m0, m1
10170    psubw           m1, m3, m2
10171    paddw           m2, m3
10172    punpckhwd       m3, m0, m9
10173    punpcklwd       m0, m9
10174    psubw           m9, m3, m0
10175    paddw           m0, m3
10176    punpckhwd       m3, m2, m1
10177    punpcklwd       m2, m1
10178    psubw           m10, m3, m2
10179    paddw           m2, m3
10180    psubw           m3, m5, m4
10181    paddw           m4, m5
10182    psubw           m5, m7, m6
10183    paddw           m6, m7
10184    punpckhwd       m1, m4, m3
10185    punpcklwd       m4, m3
10186    psubw           m7, m1, m4
10187    paddw           m4, m1
10188    punpckhwd       m3, m6, m5
10189    punpcklwd       m6, m5
10190    psubw           m1, m3, m6
10191    paddw           m6, m3
10192    psubw           m3, m2, m0
10193    paddw           m0, m2
10194    psubw           m2, m10, m9
10195    paddw           m9, m10
10196    punpckhdq       m5, m0, m3
10197    punpckldq       m0, m3
10198    psubw           m10, m5, m0
10199    paddw           m0, m5
10200    punpckhdq       m3, m9, m2
10201    punpckldq       m9, m2
10202    psubw           m5, m3, m9
10203    paddw           m9, m3
10204    psubw           m3, m6, m4
10205    paddw           m4, m6
10206    psubw           m6, m1, m7
10207    paddw           m7, m1
10208    punpckhdq       m2, m4, m3
10209    punpckldq       m4, m3
10210    psubw           m1, m2, m4
10211    paddw           m4, m2
10212    punpckhdq       m3, m7, m6
10213    punpckldq       m7, m6
10214    psubw           m2, m3, m7
10215    paddw           m7, m3
10216    psubw           m3, m4, m0
10217    paddw           m0, m4
10218    psubw           m4, m1, m10
10219    paddw           m10, m1
10220    punpckhqdq      m6, m0, m3
10221    punpcklqdq      m0, m3
10222    pabsw           m0, m0
10223    pabsw           m6, m6
10224    pmaxsw          m0, m6
10225    punpckhqdq      m3, m10, m4
10226    punpcklqdq      m10, m4
10227    pabsw           m10, m10
10228    pabsw           m3, m3
10229    pmaxsw          m10, m3
10230    psubw           m3, m7, m9
10231    paddw           m9, m7
10232    psubw           m7, m2, m5
10233    paddw           m5, m2
10234    punpckhqdq      m4, m9, m3
10235    punpcklqdq      m9, m3
10236    pabsw           m9, m9
10237    pabsw           m4, m4
10238    pmaxsw          m9, m4
10239    punpckhqdq      m3, m5, m7
10240    punpcklqdq      m5, m7
10241    pabsw           m5, m5
10242    pabsw           m3, m3
10243    pmaxsw          m5, m3
10244    paddd           m0, m9
10245    paddd           m0, m10
10246    paddd           m0, m5
10247    psrld           m9, m0, 16
10248    pslld           m0, 16
10249    psrld           m0, 16
10250    paddd           m0, m9
10251    psrldq          m9, m0, 8
10252    paddd           m0, m9
10253    psrldq          m9, m0, 4
10254    paddd           m0, m9
10255    paddd           m0, [pd_1]
10256    psrld           m0, 1
10257    psubd           m0, m8
10258
10259    vextracti128   xm1, m0, 1
10260    psubd          xm1, xm0
10261    pabsd          xm1, xm1
10262%endmacro
10263
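; PSY_COST_PP_8x8_MAIN12: 12-bit psy cost for one 8x8 block.  Pixels are
; widened to dwords (pmovzxwd) so the SA8D butterflies cannot overflow at
; 12 bits.  The source block is processed first and its energy
; (sa8d - (sad >> 2)) is kept in m11, then the recon block is processed the
; same way and |energy_src - energy_rec| is left in the low dword of xm11.
; Expects byte strides in r1/r3; r4/r5 are clobbered.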
10264%macro PSY_COST_PP_8x8_MAIN12 0
10265    ; load source pixels
10266    lea             r4, [r1 * 3]
10267    pmovzxwd        m0, [r0]
10268    pmovzxwd        m1, [r0 + r1]
10269    pmovzxwd        m2, [r0 + r1 * 2]
10270    pmovzxwd        m3, [r0 + r4]
10271    lea             r5, [r0 + r1 * 4]
10272    pmovzxwd        m4, [r5]
10273    pmovzxwd        m5, [r5 + r1]
10274    pmovzxwd        m6, [r5 + r1 * 2]
10275    pmovzxwd        m7, [r5 + r4]
10276
10277    ; source SAD
10278    paddd           m8, m0, m1
10279    paddd           m8, m2
10280    paddd           m8, m3
10281    paddd           m8, m4
10282    paddd           m8, m5
10283    paddd           m8, m6
10284    paddd           m8, m7
10285
10286    vextracti128    xm9, m8, 1
10287    paddd           m8, m9              ; sad_8x8
10288    movhlps         xm9, xm8
10289    paddd           xm8, xm9
10290    pshuflw         xm9, xm8, 0Eh
10291    paddd           xm8, xm9
10292    psrld           m8, 2
10293
10294    ; source SA8D
10295    psubd           m9, m1, m0
10296    paddd           m0, m1
10297    psubd           m1, m3, m2
10298    paddd           m2, m3
10299    punpckhdq       m3, m0, m9
10300    punpckldq       m0, m9
10301    psubd           m9, m3, m0
10302    paddd           m0, m3
10303    punpckhdq       m3, m2, m1
10304    punpckldq       m2, m1
10305    psubd           m10, m3, m2
10306    paddd           m2, m3
10307    psubd           m3, m5, m4
10308    paddd           m4, m5
10309    psubd           m5, m7, m6
10310    paddd           m6, m7
10311    punpckhdq       m1, m4, m3
10312    punpckldq       m4, m3
10313    psubd           m7, m1, m4
10314    paddd           m4, m1
10315    punpckhdq       m3, m6, m5
10316    punpckldq       m6, m5
10317    psubd           m1, m3, m6
10318    paddd           m6, m3
10319    psubd           m3, m2, m0
10320    paddd           m0, m2
10321    psubd           m2, m10, m9
10322    paddd           m9, m10
10323    punpckhqdq      m5, m0, m3
10324    punpcklqdq      m0, m3
10325    psubd           m10, m5, m0
10326    paddd           m0, m5
10327    punpckhqdq      m3, m9, m2
10328    punpcklqdq      m9, m2
10329    psubd           m5, m3, m9
10330    paddd           m9, m3
10331    psubd           m3, m6, m4
10332    paddd           m4, m6
10333    psubd           m6, m1, m7
10334    paddd           m7, m1
10335    punpckhqdq      m2, m4, m3
10336    punpcklqdq      m4, m3
10337    psubd           m1, m2, m4
10338    paddd           m4, m2
10339    punpckhqdq      m3, m7, m6
10340    punpcklqdq      m7, m6
10341    psubd           m2, m3, m7
10342    paddd           m7, m3
10343    psubd           m3, m4, m0
10344    paddd           m0, m4
10345    psubd           m4, m1, m10
10346    paddd           m10, m1
10347    vinserti128     m6, m0, xm3, 1
10348    vperm2i128      m0, m0, m3, 00110001b
10349    pabsd           m0, m0
10350    pabsd           m6, m6
10351    pmaxsd          m0, m6
10352    vinserti128     m3, m10, xm4, 1
10353    vperm2i128      m10, m10, m4, 00110001b
10354    pabsd           m10, m10
10355    pabsd           m3, m3
10356    pmaxsd          m10, m3
10357    psubd           m3, m7, m9
10358    paddd           m9, m7
10359    psubd           m7, m2, m5
10360    paddd           m5, m2
10361    vinserti128     m4, m9, xm3, 1
10362    vperm2i128      m9, m9, m3, 00110001b
10363    pabsd           m9, m9
10364    pabsd           m4, m4
10365    pmaxsd          m9, m4
10366    vinserti128     m3, m5, xm7, 1
10367    vperm2i128      m5, m5, m7, 00110001b
10368    pabsd           m5, m5
10369    pabsd           m3, m3
10370    pmaxsd          m5, m3
10371    paddd           m0, m9
10372    paddd           m0, m10
10373    paddd           m0, m5
10374
10375    vextracti128    xm9, m0, 1
    paddd           m0, m9              ; fold lanes for sa8d
10377    movhlps         xm9, xm0
10378    paddd           xm0, xm9
10379    pshuflw         xm9, xm0, 0Eh
10380    paddd           xm0, xm9
10381    paddd           m0, [pd_1]
10382    psrld           m0, 1               ; sa8d_8x8
10383    psubd           m11, m0, m8         ; sa8d_8x8 - sad_8x8
10384
10385    ; load recon pixels
10386    lea             r4, [r3 * 3]
10387    pmovzxwd        m0, [r2]
10388    pmovzxwd        m1, [r2 + r3]
10389    pmovzxwd        m2, [r2 + r3 * 2]
10390    pmovzxwd        m3, [r2 + r4]
10391    lea             r5, [r2 + r3 * 4]
10392    pmovzxwd        m4, [r5]
10393    pmovzxwd        m5, [r5 + r3]
10394    pmovzxwd        m6, [r5 + r3 * 2]
10395    pmovzxwd        m7, [r5 + r4]
10396
10397    ; recon SAD
10398    paddd           m8, m0, m1
10399    paddd           m8, m2
10400    paddd           m8, m3
10401    paddd           m8, m4
10402    paddd           m8, m5
10403    paddd           m8, m6
10404    paddd           m8, m7
10405
10406    vextracti128    xm9, m8, 1
10407    paddd           m8, m9              ; sad_8x8
10408    movhlps         xm9, xm8
10409    paddd           xm8, xm9
10410    pshuflw         xm9, xm8, 0Eh
10411    paddd           xm8, xm9
10412    psrld           m8, 2
10413
10414    ; recon SA8D
10415    psubd           m9, m1, m0
10416    paddd           m0, m1
10417    psubd           m1, m3, m2
10418    paddd           m2, m3
10419    punpckhdq       m3, m0, m9
10420    punpckldq       m0, m9
10421    psubd           m9, m3, m0
10422    paddd           m0, m3
10423    punpckhdq       m3, m2, m1
10424    punpckldq       m2, m1
10425    psubd           m10, m3, m2
10426    paddd           m2, m3
10427    psubd           m3, m5, m4
10428    paddd           m4, m5
10429    psubd           m5, m7, m6
10430    paddd           m6, m7
10431    punpckhdq       m1, m4, m3
10432    punpckldq       m4, m3
10433    psubd           m7, m1, m4
10434    paddd           m4, m1
10435    punpckhdq       m3, m6, m5
10436    punpckldq       m6, m5
10437    psubd           m1, m3, m6
10438    paddd           m6, m3
10439    psubd           m3, m2, m0
10440    paddd           m0, m2
10441    psubd           m2, m10, m9
10442    paddd           m9, m10
10443    punpckhqdq      m5, m0, m3
10444    punpcklqdq      m0, m3
10445    psubd           m10, m5, m0
10446    paddd           m0, m5
10447    punpckhqdq      m3, m9, m2
10448    punpcklqdq      m9, m2
10449    psubd           m5, m3, m9
10450    paddd           m9, m3
10451    psubd           m3, m6, m4
10452    paddd           m4, m6
10453    psubd           m6, m1, m7
10454    paddd           m7, m1
10455    punpckhqdq      m2, m4, m3
10456    punpcklqdq      m4, m3
10457    psubd           m1, m2, m4
10458    paddd           m4, m2
10459    punpckhqdq      m3, m7, m6
10460    punpcklqdq      m7, m6
10461    psubd           m2, m3, m7
10462    paddd           m7, m3
10463    psubd           m3, m4, m0
10464    paddd           m0, m4
10465    psubd           m4, m1, m10
10466    paddd           m10, m1
10467    vinserti128     m6, m0, xm3, 1
10468    vperm2i128      m0, m0, m3, 00110001b
10469    pabsd           m0, m0
10470    pabsd           m6, m6
10471    pmaxsd          m0, m6
10472    vinserti128     m3, m10, xm4, 1
10473    vperm2i128      m10, m10, m4, 00110001b
10474    pabsd           m10, m10
10475    pabsd           m3, m3
10476    pmaxsd          m10, m3
10477    psubd           m3, m7, m9
10478    paddd           m9, m7
10479    psubd           m7, m2, m5
10480    paddd           m5, m2
10481    vinserti128     m4, m9, xm3, 1
10482    vperm2i128      m9, m9, m3, 00110001b
10483    pabsd           m9, m9
10484    pabsd           m4, m4
10485    pmaxsd          m9, m4
10486    vinserti128     m3, m5, xm7, 1
10487    vperm2i128      m5, m5, m7, 00110001b
10488    pabsd           m5, m5
10489    pabsd           m3, m3
10490    pmaxsd          m5, m3
10491    paddd           m0, m9
10492    paddd           m0, m10
10493    paddd           m0, m5
10494
10495    vextracti128    xm9, m0, 1
    paddd           m0, m9              ; fold lanes for sa8d
10497    movhlps         xm9, xm0
10498    paddd           xm0, xm9
10499    pshuflw         xm9, xm0, 0Eh
10500    paddd           xm0, xm9
10501    paddd           m0, [pd_1]
10502    psrld           m0, 1               ; sa8d_8x8
10503    psubd           m0, m8              ; sa8d_8x8 - sad_8x8
10504
10505    psubd          m11, m0
10506    pabsd          m11, m11
10507%endmacro
10508
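; PSY_COST_PP_8x8_AVX512_MAIN12: AVX-512 variant of the 12-bit kernel above.
; The source block is widened into the low 256 bits and the recon block into
; the high 256 bits of each zmm, so both energies are computed in one pass.
; The caller must preload m13 = [psy_pp_shuff1] and m14 = [psy_pp_shuff2];
; |energy_src - energy_rec| is left in the low dword of xm11.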
10509%macro PSY_COST_PP_8x8_AVX512_MAIN12 0
10510    ; load source and recon pixels
10511    lea             r4, [r1 * 3]
10512    pmovzxwd        ym0, [r0]
10513    pmovzxwd        ym1, [r0 + r1]
10514    pmovzxwd        ym2, [r0 + r1 * 2]
10515    pmovzxwd        ym3, [r0 + r4]
10516    lea             r5, [r0 + r1 * 4]
10517    pmovzxwd        ym4, [r5]
10518    pmovzxwd        ym5, [r5 + r1]
10519    pmovzxwd        ym6, [r5 + r1 * 2]
10520    pmovzxwd        ym7, [r5 + r4]
10521
10522    lea             r4, [r3 * 3]
10523    pmovzxwd        ym16, [r2]
10524    pmovzxwd        ym17, [r2 + r3]
10525    pmovzxwd        ym18, [r2 + r3 * 2]
10526    pmovzxwd        ym19, [r2 + r4]
10527    lea               r5, [r2 + r3 * 4]
10528    pmovzxwd        ym20, [r5]
10529    pmovzxwd        ym21, [r5 + r3]
10530    pmovzxwd        ym22, [r5 + r3 * 2]
10531    pmovzxwd        ym23, [r5 + r4]
10532
10533    vinserti64x4    m0, m0, ym16, 1
10534    vinserti64x4    m1, m1, ym17, 1
10535    vinserti64x4    m2, m2, ym18, 1
10536    vinserti64x4    m3, m3, ym19, 1
10537    vinserti64x4    m4, m4, ym20, 1
10538    vinserti64x4    m5, m5, ym21, 1
10539    vinserti64x4    m6, m6, ym22, 1
10540    vinserti64x4    m7, m7, ym23, 1
10541
    ; source and recon SAD
10543    paddd           m8, m0, m1
10544    paddd           m8, m2
10545    paddd           m8, m3
10546    paddd           m8, m4
10547    paddd           m8, m5
10548    paddd           m8, m6
10549    paddd           m8, m7
10550
10551    vextracti64x4   ym15, m8, 1
10552
10553    vextracti128    xm9, ym8, 1
10554    paddd           ym8, ym9              ; sad_8x8
10555    movhlps         xm9, xm8
10556    paddd           xm8, xm9
10557    pshuflw         xm9, xm8, 0Eh
10558    paddd           xm8, xm9
10559    psrld           ym8, 2
10560
10561    vextracti128    xm9, ym15, 1
10562    paddd           ym15, ym9              ; sad_8x8
10563    movhlps         xm9, xm15
10564    paddd           xm15, xm9
10565    pshuflw         xm9, xm15, 0Eh
10566    paddd           xm15, xm9
10567    psrld           ym15, 2
10568
10569    ; source and recon SA8D
10570    psubd           m9, m1, m0
10571    paddd           m0, m1
10572    psubd           m1, m3, m2
10573    paddd           m2, m3
10574    punpckhdq       m3, m0, m9
10575    punpckldq       m0, m9
10576    psubd           m9, m3, m0
10577    paddd           m0, m3
10578    punpckhdq       m3, m2, m1
10579    punpckldq       m2, m1
10580    psubd           m10, m3, m2
10581    paddd           m2, m3
10582    psubd           m3, m5, m4
10583    paddd           m4, m5
10584    psubd           m5, m7, m6
10585    paddd           m6, m7
10586    punpckhdq       m1, m4, m3
10587    punpckldq       m4, m3
10588    psubd           m7, m1, m4
10589    paddd           m4, m1
10590    punpckhdq       m3, m6, m5
10591    punpckldq       m6, m5
10592    psubd           m1, m3, m6
10593    paddd           m6, m3
10594    psubd           m3, m2, m0
10595    paddd           m0, m2
10596    psubd           m2, m10, m9
10597    paddd           m9, m10
10598    punpckhqdq      m5, m0, m3
10599    punpcklqdq      m0, m3
10600    psubd           m10, m5, m0
10601    paddd           m0, m5
10602    punpckhqdq      m3, m9, m2
10603    punpcklqdq      m9, m2
10604    psubd           m5, m3, m9
10605    paddd           m9, m3
10606    psubd           m3, m6, m4
10607    paddd           m4, m6
10608    psubd           m6, m1, m7
10609    paddd           m7, m1
10610    punpckhqdq      m2, m4, m3
10611    punpcklqdq      m4, m3
10612    psubd           m1, m2, m4
10613    paddd           m4, m2
10614    punpckhqdq      m3, m7, m6
10615    punpcklqdq      m7, m6
10616
10617    psubd           m2, m3, m7
10618    paddd           m7, m3
10619    psubd           m3, m4, m0
10620    paddd           m0, m4
10621    psubd           m4, m1, m10
10622    paddd           m10, m1
10623
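    ; psy_pp_shuff1/2 (preloaded in m13/m14 by the caller) pair up the two
    ; halves of the Hadamard result for the source (low 256 bits) and recon
    ; (high 256 bits) at once - the AVX-512 counterpart of the
    ; vinserti128/vperm2i128 pairs used in PSY_COST_PP_8x8_MAIN12.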
10624    mova       m16,    m13
10625    mova       m17,    m14
10626    vpermi2q   m16,    m0, m3
10627    vpermi2q   m17,    m0, m3
10628
10629    pabsd           m17, m17
10630    pabsd           m16, m16
10631    pmaxsd          m17, m16
10632
10633    mova       m18,    m13
10634    mova       m19,    m14
10635    vpermi2q   m18,    m10, m4
10636    vpermi2q   m19,    m10, m4
10637
10638    pabsd           m19, m19
10639    pabsd           m18, m18
10640    pmaxsd          m19, m18
10641    psubd           m18, m7, m9
10642    paddd           m9, m7
10643    psubd           m7, m2, m5
10644    paddd           m5, m2
10645
10646    mova       m20,    m13
10647    mova       m21,    m14
10648    vpermi2q   m20,    m9, m18
10649    vpermi2q   m21,    m9, m18
10650
10651    pabsd           m21, m21
10652    pabsd           m20, m20
10653    pmaxsd          m21, m20
10654
10655    mova       m22,    m13
10656    mova       m23,    m14
10657    vpermi2q   m22,    m5, m7
10658    vpermi2q   m23,    m5, m7
10659
10660    pabsd           m23, m23
10661    pabsd           m22, m22
10662    pmaxsd          m23, m22
10663    paddd           m17, m21
10664    paddd           m17, m19
10665    paddd           m17, m23
10666
10667    vextracti64x4   ym26, m17, 1
10668
10669    vextracti128    xm9, m17, 1
    paddd           ym17, ym9              ; fold lanes for sa8d
10671    movhlps         xm9, xm17
10672    paddd           xm17, xm9
10673    pshuflw         xm9, xm17, 0Eh
10674    paddd           xm17, xm9
10675    paddd           ym17, [pd_1]
10676    psrld           ym17, 1               ; sa8d_8x8
10677
10678    vextracti128    xm9, ym26, 1
    paddd           ym26, ym9              ; fold lanes for sa8d
10680    movhlps         xm9, xm26
10681    paddd           xm26, xm9
10682    pshuflw         xm9, xm26, 0Eh
10683    paddd           xm26, xm9
10684    paddd           ym26, [pd_1]
10685    psrld           ym26, 1               ; sa8d_8x8
10686
10689    psubd           ym11, ym17, ym8         ; sa8d_8x8 - sad_8x8
10690    psubd           ym12, ym26, ym15        ; sa8d_8x8 - sad_8x8
10691
10692    psubd          ym11, ym12
10693    pabsd          ym11, ym11
10694%endmacro
10695
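; PSY_PP_INPUT_AVX512_MAIN10: loads a 16x8 region of 10-bit pixels for
; PSY_PP_16x8_AVX512_MAIN10.  Each of m0-m7 holds one row with the four
; 128-bit lanes filled as: source block 0, recon block 0, source block 1,
; recon block 1.  r0/r2 are advanced by 16 bytes (one 8-pixel block); the
; callers' loop adjustments account for this.  r4/r5 are clobbered.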
10696%macro PSY_PP_INPUT_AVX512_MAIN10 0
10697    lea             r4, [r1 * 3]
10698    movu           xm0, [r0]
10699    movu           xm1, [r0 + r1]
10700    movu           xm2, [r0 + r1 * 2]
10701    movu           xm3, [r0 + r4]
10702    lea             r5, [r0 + r1 * 4]
10703    movu           xm4, [r5]
10704    movu           xm5, [r5 + r1]
10705    movu           xm6, [r5 + r1 * 2]
10706    movu           xm7, [r5 + r4]
10707
10708    lea             r4, [r3 * 3]
10709    vinserti128     ym0, ym0, [r2], 1
10710    vinserti128     ym1, ym1, [r2 + r3], 1
10711    vinserti128     ym2, ym2, [r2 + r3 * 2], 1
10712    vinserti128     ym3, ym3, [r2 + r4], 1
10713    lea             r5, [r2 + r3 * 4]
10714    vinserti128     ym4, ym4, [r5], 1
10715    vinserti128     ym5, ym5, [r5 + r3], 1
10716    vinserti128     ym6, ym6, [r5 + r3 * 2], 1
10717    vinserti128     ym7, ym7, [r5 + r4], 1
10718
10719    add             r0, 16
10720    add             r2, 16
10721
10722    lea             r4, [r1 * 3]
10723    vinserti32x4    m0, m0, [r0], 2
10724    vinserti32x4    m1, m1, [r0 + r1], 2
10725    vinserti32x4    m2, m2, [r0 + r1 * 2], 2
10726    vinserti32x4    m3, m3, [r0 + r4], 2
10727    lea             r5, [r0 + r1 * 4]
10728    vinserti32x4    m4, m4, [r5], 2
10729    vinserti32x4    m5, m5, [r5 + r1], 2
10730    vinserti32x4    m6, m6, [r5 + r1 * 2], 2
10731    vinserti32x4    m7, m7, [r5 + r4], 2
10732
10733    lea             r4, [r3 * 3]
10734    vinserti32x4    m0, m0, [r2], 3
10735    vinserti32x4    m1, m1, [r2 + r3], 3
10736    vinserti32x4    m2, m2, [r2 + r3 * 2], 3
10737    vinserti32x4    m3, m3, [r2 + r4], 3
10738    lea             r5, [r2 + r3 * 4]
10739    vinserti32x4    m4, m4, [r5], 3
10740    vinserti32x4    m5, m5, [r5 + r3], 3
10741    vinserti32x4    m6, m6, [r5 + r3 * 2], 3
10742    vinserti32x4    m7, m7, [r5 + r4], 3
10743%endmacro
10744
10745
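; PSY_PP_16x8_AVX512_MAIN10: computes the psy energy independently in each
; 128-bit lane of the rows loaded by PSY_PP_INPUT_AVX512_MAIN10 and leaves
; the sum of the two 8x8 costs, |src - rec| per block, in the low dword of
; xm1.  Expects m14 = broadcast [pw_1] and m15 = broadcast [pd_1].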
10746%macro PSY_PP_16x8_AVX512_MAIN10 0
10747    paddw           m8, m0, m1
10748    paddw           m8, m2
10749    paddw           m8, m3
10750    paddw           m8, m4
10751    paddw           m8, m5
10752    paddw           m8, m6
10753    paddw           m8, m7
10754    pmaddwd         m8, m14
10755
10756    psrldq          m9, m8, 8
10757    paddd           m8, m9
10758    psrldq          m9, m8, 4
10759    paddd           m8, m9
10760    psrld           m8, 2
10761
10762    psubw           m9, m1, m0
10763    paddw           m0, m1
10764    psubw           m1, m3, m2
10765    paddw           m2, m3
10766    punpckhwd       m3, m0, m9
10767    punpcklwd       m0, m9
10768    psubw           m9, m3, m0
10769    paddw           m0, m3
10770    punpckhwd       m3, m2, m1
10771    punpcklwd       m2, m1
10772    psubw           m10, m3, m2
10773    paddw           m2, m3
10774
10775    psubw           m3, m5, m4
10776    paddw           m4, m5
10777    psubw           m5, m7, m6
10778    paddw           m6, m7
10779    punpckhwd       m1, m4, m3
10780    punpcklwd       m4, m3
10781    psubw           m7, m1, m4
10782    paddw           m4, m1
10783    punpckhwd       m3, m6, m5
10784    punpcklwd       m6, m5
10785    psubw           m1, m3, m6
10786    paddw           m6, m3
10787
10788    psubw           m3, m2, m0
10789    paddw           m0, m2
10790    psubw           m2, m10, m9
10791    paddw           m9, m10
10792    punpckhdq       m5, m0, m3
10793    punpckldq       m0, m3
10794    psubw           m10, m5, m0
10795    paddw           m0, m5
10796    punpckhdq       m3, m9, m2
10797    punpckldq       m9, m2
10798    psubw           m5, m3, m9
10799    paddw           m9, m3
10800
10801    psubw           m3, m6, m4
10802    paddw           m4, m6
10803    psubw           m6, m1, m7
10804    paddw           m7, m1
10805    punpckhdq       m2, m4, m3
10806    punpckldq       m4, m3
10807    psubw           m1, m2, m4
10808    paddw           m4, m2
10809    punpckhdq       m3, m7, m6
10810    punpckldq       m7, m6
10811    psubw           m2, m3, m7
10812    paddw           m7, m3
10813
10814    psubw           m3, m4, m0
10815    paddw           m0, m4
10816    psubw           m4, m1, m10
10817    paddw           m10, m1
10818    punpckhqdq      m6, m0, m3
10819    punpcklqdq      m0, m3
10820    pabsw           m0, m0
10821    pabsw           m6, m6
10822    pmaxsw          m0, m6
10823    punpckhqdq      m3, m10, m4
10824    punpcklqdq      m10, m4
10825    pabsw           m10, m10
10826    pabsw           m3, m3
10827    pmaxsw          m10, m3
10828
10829    psubw           m3, m7, m9
10830    paddw           m9, m7
10831    psubw           m7, m2, m5
10832    paddw           m5, m2
10833    punpckhqdq      m4, m9, m3
10834    punpcklqdq      m9, m3
10835    pabsw           m9, m9
10836    pabsw           m4, m4
10837    pmaxsw          m9, m4
10838    punpckhqdq      m3, m5, m7
10839    punpcklqdq      m5, m7
10840    pabsw           m5, m5
10841    pabsw           m3, m3
10842    pmaxsw          m5, m3
10843
10844    paddd           m0, m9
10845    paddd           m0, m10
10846    paddd           m0, m5
10847    psrld           m9, m0, 16
10848    pslld           m0, 16
10849    psrld           m0, 16
10850    paddd           m0, m9
10851    psrldq          m9, m0, 8
10852    paddd           m0, m9
10853    psrldq          m9, m0, 4
10854    paddd           m0, m9
10855    paddd           m0, m15
10856    psrld           m0, 1
10857    psubd           m0, m8
10858
10859    vextracti64x4   ym2, m0, 1
10860
10861    vextracti128   xm3, ym2, 1
10862    psubd          xm3, xm2
10863    pabsd          xm3, xm3
10864
10865    vextracti128   xm1, ym0, 1
10866    psubd          xm1, xm0
10867    pabsd          xm1, xm1
10868    paddd          xm1, xm3
10869%endmacro
10870
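; PSY_PP_INPUT_AVX512_MAIN: 8-bit input loader for PSY_PP_16x8_AVX512_MAIN.
; Each 16-byte row of source and recon covers two adjacent 8x8 blocks; the
; vpermi2q with m26 = [psy_pp_shuff3] duplicates every 8-pixel half into its
; own 128-bit lane (source block 0, recon block 0, source block 1, recon
; block 1), matching the movddup + hmul_8p layout used by PSY_PP_8x8.
; Expects r4 = 3 * r1 and r7 = 3 * r3; r5/r6 and m16-m23 are clobbered.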
10871%macro PSY_PP_INPUT_AVX512_MAIN 0
10872    movu       xm16, [r0 + r1 * 0]
10873    movu       xm17, [r0 + r1 * 1]
10874    movu       xm18, [r0 + r1 * 2]
10875    movu       xm19, [r0 + r4 * 1]
10876
10877    movu       xm20, [r2 + r3 * 0]
10878    movu       xm21, [r2 + r3 * 1]
10879    movu       xm22, [r2 + r3 * 2]
10880    movu       xm23, [r2 + r7 * 1]
10881
10882    mova         m0, m26
10883    vpermi2q     m0, m16, m20
10884    mova         m1, m26
10885    vpermi2q     m1, m17, m21
10886    mova         m2, m26
10887    vpermi2q     m2, m18, m22
10888    mova         m3, m26
10889    vpermi2q     m3, m19, m23
10890
10892    lea          r5, [r0 + r1 * 4]
10893    lea          r6, [r2 + r3 * 4]
10894
10895    movu      xm16, [r5 + r1 * 0]
10896    movu      xm17, [r5 + r1 * 1]
10897    movu      xm18, [r5 + r1 * 2]
10898    movu      xm19, [r5 + r4 * 1]
10899
10900    movu      xm20, [r6 + r3 * 0]
10901    movu      xm21, [r6 + r3 * 1]
10902    movu      xm22, [r6 + r3 * 2]
10903    movu      xm23, [r6 + r7 * 1]
10904
10905    mova        m4, m26
10906    vpermi2q    m4, m16, m20
10907    mova        m5, m26
10908    vpermi2q    m5, m17, m21
10909    mova        m6, m26
10910    vpermi2q    m6, m18, m22
10911    mova        m7, m26
10912    vpermi2q    m7, m19, m23
10913%endmacro
10914
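; PSY_PP_16x8_AVX512_MAIN: 8-bit psy cost for the two 8x8 blocks prepared by
; PSY_PP_INPUT_AVX512_MAIN - the same arithmetic as PSY_PP_8x8, applied per
; 128-bit lane.  Expects m8 = broadcast [hmul_8p], m14 = broadcast [pw_1]
; and m15 = broadcast [pd_1]; the sum of the two block costs is left in the
; low dword of xm16.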
10915%macro PSY_PP_16x8_AVX512_MAIN 0
10916    pmaddubsw       m0, m8
10917    pmaddubsw       m1, m8
10918    pmaddubsw       m2, m8
10919    pmaddubsw       m3, m8
10920    pmaddubsw       m4, m8
10921    pmaddubsw       m5, m8
10922    pmaddubsw       m6, m8
10923    pmaddubsw       m7, m8
10924
10925    paddw           m11, m0, m1
10926    paddw           m11, m2
10927    paddw           m11, m3
10928    paddw           m11, m4
10929    paddw           m11, m5
10930    paddw           m11, m6
10931    paddw           m11, m7
10932
10933    pmaddwd         m11, m14
10934    psrldq          m10, m11, 4
10935    paddd           m11, m10
10936    psrld           m11, 2
10937
10938    mova            m9, m0
10939    paddw           m0, m1
10940    psubw           m1, m9
10941    mova            m9, m2
10942    paddw           m2, m3
10943    psubw           m3, m9
10944    mova            m9, m0
10945    paddw           m0, m2
10946    psubw           m2, m9
10947    mova            m9, m1
10948    paddw           m1, m3
10949    psubw           m3, m9
10950
10951    movdqa          m9, m4
10952    paddw           m4, m5
10953    psubw           m5, m9
10954    movdqa          m9, m6
10955    paddw           m6, m7
10956    psubw           m7, m9
10957    movdqa          m9, m4
10958    paddw           m4, m6
10959    psubw           m6, m9
10960    movdqa          m9, m5
10961    paddw           m5, m7
10962    psubw           m7, m9
10963
10964    movdqa          m9, m0
10965    paddw           m0, m4
10966    psubw           m4, m9
10967    movdqa          m9, m1
10968    paddw           m1, m5
10969    psubw           m5, m9
10970
10971    mova            m9, m0
10972    vshufps         m9, m9, m4, 11011101b
10973    vshufps         m0, m0, m4, 10001000b
10974
10975    movdqa          m4, m0
10976    paddw           m16, m0, m9
10977    psubw           m17, m9, m4
10978
10979    movaps          m4, m1
10980    vshufps         m4, m4, m5, 11011101b
10981    vshufps         m1, m1, m5, 10001000b
10982
10983    movdqa          m5, m1
10984    paddw           m18, m1, m4
10985    psubw           m19, m4, m5
10986
10987    movdqa          m5, m2
10988    paddw           m2, m6
10989    psubw           m6, m5
10990    movdqa          m5, m3
10991    paddw           m3, m7
10992    psubw           m7, m5
10993
10994    movaps          m5, m2
10995    vshufps         m5, m5, m6, 11011101b
10996    vshufps         m2, m2, m6, 10001000b
10997
10998    movdqa          m6, m2
10999    paddw           m20, m2, m5
11000    psubw           m21, m5, m6
11001
11002    movaps          m6, m3
11003
11004    vshufps         m6, m6, m7, 11011101b
11005    vshufps         m3, m3, m7, 10001000b
11006
11007    movdqa          m7, m3
11008    paddw           m22, m3, m6
11009    psubw           m23, m6, m7
11010
11011    movdqa          m7, m16
11012
11013    vextracti64x4    ym24,  m16, 1
11014    vextracti64x4    ym25,  m17, 1
11015    pblendw          ym16, ym17, 10101010b
11016    pblendw          ym24, ym25, 10101010b
11017    vinserti64x4     m16, m16, ym24, 1
11018
11019    pslld           m17, 10h
11020    psrld           m7, 10h
11021    por             m17, m7
11022    pabsw           m16, m16
11023    pabsw           m17, m17
11024    pmaxsw          m16, m17
11025    movdqa          m7, m18
11026
11027    vextracti64x4    ym24,  m18, 1
11028    vextracti64x4    ym25,  m19, 1
11029    pblendw          ym18,  ym19, 10101010b
11030    pblendw          ym24,  ym25, 10101010b
11031    vinserti64x4     m18, m18, ym24, 1
11032
11033    pslld           m19, 10h
11034    psrld           m7, 10h
11035    por             m19, m7
11036    pabsw           m18, m18
11037    pabsw           m19, m19
11038    pmaxsw          m18, m19
11039    movdqa          m7, m20
11040
11041    vextracti64x4    ym24,  m20, 1
11042    vextracti64x4    ym25,  m21, 1
11043    pblendw          ym20,  ym21, 10101010b
11044    pblendw          ym24,  ym25, 10101010b
11045    vinserti64x4     m20,   m20, ym24, 1
11046
11047    pslld           m21, 10h
11048    psrld           m7, 10h
11049    por             m21, m7
11050    pabsw           m20, m20
11051    pabsw           m21, m21
11052    pmaxsw          m20, m21
11053    mova            m7, m22
11054
11055    vextracti64x4    ym24,  m22, 1
11056    vextracti64x4    ym25,  m23, 1
11057    pblendw          ym22,  ym23, 10101010b
11058    pblendw          ym24,  ym25, 10101010b
11059    vinserti64x4     m22,   m22,  ym24, 1
11060
11061    pslld           m23, 10h
11062    psrld           m7, 10h
11063    por             m23, m7
11064    pabsw           m22, m22
11065    pabsw           m23, m23
11066    pmaxsw          m22, m23
11067    paddw           m16, m18
11068    paddw           m16, m20
11069    paddw           m16, m22
11070    pmaddwd         m16, m14
11071    psrldq          m1, m16, 8
11072    paddd           m16, m1
11073
11074    pshuflw         m1, m16, 00001110b
11075    paddd           m16, m1
11076    paddd           m16, m15
11077    psrld           m16, 1
11078
11079    psubd           m16, m11
11080    vextracti64x4   ym2, m16, 1
11081
11082    vextracti128    xm1, ym16, 1
11083    psubd           xm16, xm1
11084    pabsd           xm16, xm16
11085
11086    vextracti128   xm3, ym2, 1
11087    psubd          xm3, xm2
11088    pabsd          xm3, xm3
11089    paddd          xm16, xm3
11090%endmacro
11091
11092
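; psyCost_pp entry points: the 8x8/16x16/32x32/64x64 versions below simply
; iterate the 8x8 or 16x8 kernels above over the block.  The 10/12-bit
; variants first convert the pixel strides to byte strides
; (add r1d, r1d / add r3d, r3d) before calling the kernels.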
11093%if ARCH_X86_64
11094INIT_YMM avx2
11095%if HIGH_BIT_DEPTH && BIT_DEPTH == 12
11096cglobal psyCost_pp_8x8, 4, 8, 12
11097    add             r1d, r1d
11098    add             r3d, r3d
11099    PSY_COST_PP_8x8_MAIN12
11100    movd           eax, xm11
11101    RET
11102%endif
11103
11104%if HIGH_BIT_DEPTH && BIT_DEPTH == 10
11105cglobal psyCost_pp_8x8, 4, 8, 11
11106    add            r1d, r1d
11107    add            r3d, r3d
11108    PSY_PP_8x8_AVX2
11109    movd           eax, xm1
11110    RET
11111%endif
11112
11113%if BIT_DEPTH == 8
11114cglobal psyCost_pp_8x8, 4, 8, 13
11115    lea             r4, [3 * r1]
11116    lea             r7, [3 * r3]
11117    mova            m8, [hmul_8p]
11118
11119    PSY_PP_8x8
11120
11121    movd            eax, xm0
11122    RET
11123%endif
11124%endif
11125
11126%if ARCH_X86_64
11127INIT_YMM avx2
11128%if HIGH_BIT_DEPTH && BIT_DEPTH == 12
11129cglobal psyCost_pp_16x16, 4, 10, 13
11130    add            r1d, r1d
11131    add            r3d, r3d
11132    pxor           m12, m12
11133
11134    mov            r8d, 2
11135.loopH:
11136    mov            r9d, 2
11137.loopW:
11138    PSY_COST_PP_8x8_MAIN12
11139
11140    paddd         xm12, xm11
11141    add             r0, 16
11142    add             r2, 16
11143    dec            r9d
11144    jnz            .loopW
11145    lea             r0, [r0 + r1 * 8 - 32]
11146    lea             r2, [r2 + r3 * 8 - 32]
11147    dec            r8d
11148    jnz            .loopH
11149    movd           eax, xm12
11150    RET
11151%endif
11152
11153%if HIGH_BIT_DEPTH && BIT_DEPTH == 10
11154cglobal psyCost_pp_16x16, 4, 10, 12
11155    add            r1d, r1d
11156    add            r3d, r3d
11157    pxor           m11, m11
11158
11159    mov            r8d, 2
11160.loopH:
11161    mov            r9d, 2
11162.loopW:
11163    PSY_PP_8x8_AVX2
11164
11165    paddd         xm11, xm1
11166    add             r0, 16
11167    add             r2, 16
11168    dec            r9d
11169    jnz            .loopW
11170    lea             r0, [r0 + r1 * 8 - 32]
11171    lea             r2, [r2 + r3 * 8 - 32]
11172    dec            r8d
11173    jnz            .loopH
11174    movd           eax, xm11
11175    RET
11176%endif
11177
11178%if BIT_DEPTH == 8
11179cglobal psyCost_pp_16x16, 4, 10, 14
11180    lea             r4, [3 * r1]
11181    lea             r7, [3 * r3]
11182    mova            m8, [hmul_8p]
11183    pxor            m13, m13
11184
11185    mov             r8d, 2
11186.loopH:
11187    mov             r9d, 2
11188.loopW:
11189    PSY_PP_8x8
11190
11191    paddd           m13, m0
11192    add             r0, 8
11193    add             r2, 8
11194    dec             r9d
11195    jnz             .loopW
11196    lea             r0, [r0 + r1 * 8 - 16]
11197    lea             r2, [r2 + r3 * 8 - 16]
11198    dec             r8d
11199    jnz             .loopH
11200    movd            eax, xm13
11201    RET
11202%endif
11203%endif
11204
11205%if ARCH_X86_64
11206INIT_YMM avx2
11207%if HIGH_BIT_DEPTH && BIT_DEPTH == 12
11208cglobal psyCost_pp_32x32, 4, 10, 13
11209    add            r1d, r1d
11210    add            r3d, r3d
11211    pxor           m12, m12
11212
11213    mov            r8d, 4
11214.loopH:
11215    mov            r9d, 4
11216.loopW:
11217    PSY_COST_PP_8x8_MAIN12
11218
11219    paddd         xm12, xm11
11220    add             r0, 16
11221    add             r2, 16
11222    dec            r9d
11223    jnz            .loopW
11224    lea             r0, [r0 + r1 * 8 - 64]
11225    lea             r2, [r2 + r3 * 8 - 64]
11226    dec            r8d
11227    jnz            .loopH
11228    movd           eax, xm12
11229    RET
11230%endif
11231
11232%if HIGH_BIT_DEPTH && BIT_DEPTH == 10
11233cglobal psyCost_pp_32x32, 4, 10, 12
11234    add            r1d, r1d
11235    add            r3d, r3d
11236    pxor           m11, m11
11237
11238    mov            r8d, 4
11239.loopH:
11240    mov            r9d, 4
11241.loopW:
11242    PSY_PP_8x8_AVX2
11243
11244    paddd         xm11, xm1
11245    add             r0, 16
11246    add             r2, 16
11247    dec            r9d
11248    jnz            .loopW
11249    lea             r0, [r0 + r1 * 8 - 64]
11250    lea             r2, [r2 + r3 * 8 - 64]
11251    dec            r8d
11252    jnz            .loopH
11253    movd           eax, xm11
11254    RET
11255%endif
11256
11257%if BIT_DEPTH == 8
11258cglobal psyCost_pp_32x32, 4, 10, 14
11259    lea             r4, [3 * r1]
11260    lea             r7, [3 * r3]
11261    mova            m8, [hmul_8p]
11262    pxor            m13, m13
11263
11264    mov             r8d, 4
11265.loopH:
11266    mov             r9d, 4
11267.loopW:
11268    PSY_PP_8x8
11269
11270    paddd           m13, m0
11271    add             r0, 8
11272    add             r2, 8
11273    dec             r9d
11274    jnz             .loopW
11275    lea             r0, [r0 + r1 * 8 - 32]
11276    lea             r2, [r2 + r3 * 8 - 32]
11277    dec             r8d
11278    jnz             .loopH
11279    movd            eax, xm13
11280    RET
11281%endif
11282%endif
11283
11284%if ARCH_X86_64
11285INIT_YMM avx2
11286%if HIGH_BIT_DEPTH && BIT_DEPTH == 12
11287cglobal psyCost_pp_64x64, 4, 10, 13
11288    add            r1d, r1d
11289    add            r3d, r3d
11290    pxor           m12, m12
11291
11292    mov            r8d, 8
11293.loopH:
11294    mov            r9d, 8
11295.loopW:
11296    PSY_COST_PP_8x8_MAIN12
11297
11298    paddd         xm12, xm11
11299    add             r0, 16
11300    add             r2, 16
11301    dec            r9d
11302    jnz            .loopW
11303    lea             r0, [r0 + r1 * 8 - 128]
11304    lea             r2, [r2 + r3 * 8 - 128]
11305    dec            r8d
11306    jnz            .loopH
11307    movd           eax, xm12
11308    RET
11309%endif
11310
11311%if HIGH_BIT_DEPTH && BIT_DEPTH == 10
11312cglobal psyCost_pp_64x64, 4, 10, 12
11313    add            r1d, r1d
11314    add            r3d, r3d
11315    pxor           m11, m11
11316
11317    mov            r8d, 8
11318.loopH:
11319    mov            r9d, 8
11320.loopW:
11321    PSY_PP_8x8_AVX2
11322
11323    paddd         xm11, xm1
11324    add             r0, 16
11325    add             r2, 16
11326    dec            r9d
11327    jnz            .loopW
11328    lea             r0, [r0 + r1 * 8 - 128]
11329    lea             r2, [r2 + r3 * 8 - 128]
11330    dec            r8d
11331    jnz            .loopH
11332    movd           eax, xm11
11333    RET
11334%endif
11335
11336%if BIT_DEPTH == 8
11337cglobal psyCost_pp_64x64, 4, 10, 14
11338    lea             r4, [3 * r1]
11339    lea             r7, [3 * r3]
11340    mova            m8, [hmul_8p]
11341    pxor            m13, m13
11342
11343    mov             r8d, 8
11344.loopH:
11345    mov             r9d, 8
11346.loopW:
11347    PSY_PP_8x8
11348
11349    paddd           m13, m0
11350    add             r0, 8
11351    add             r2, 8
11352    dec             r9d
11353    jnz             .loopW
11354    lea             r0, [r0 + r1 * 8 - 64]
11355    lea             r2, [r2 + r3 * 8 - 64]
11356    dec             r8d
11357    jnz             .loopH
11358    movd            eax, xm13
11359    RET
11360%endif
11361%endif
11362%if ARCH_X86_64
11363INIT_ZMM avx512
11364%if HIGH_BIT_DEPTH && BIT_DEPTH == 12
11365cglobal psyCost_pp_16x16, 4, 10, 27
11366    add            r1d, r1d
11367    add            r3d, r3d
11368    pxor           m24, m24
11369    movu       m13,    [psy_pp_shuff1]
11370    movu       m14,    [psy_pp_shuff2]
11371
11372    mov            r8d, 2
11373.loopH:
11374    mov            r9d, 2
11375.loopW:
11376    PSY_COST_PP_8x8_AVX512_MAIN12
11377
11378    paddd         xm24, xm11
11379    add             r0, 16
11380    add             r2, 16
11381    dec            r9d
11382    jnz            .loopW
11383    lea             r0, [r0 + r1 * 8 - 32]
11384    lea             r2, [r2 + r3 * 8 - 32]
11385    dec            r8d
11386    jnz            .loopH
11387    movd           eax, xm24
11388    RET
11389%endif
11390
11391%if HIGH_BIT_DEPTH && BIT_DEPTH == 10
11392cglobal psyCost_pp_16x16, 4, 10, 16
11393    add            r1d, r1d
11394    add            r3d, r3d
11395    pxor           m11, m11
11396    vbroadcasti32x8 m14, [pw_1]
11397    vbroadcasti32x8 m15, [pd_1]
11398
11399    mov            r8d, 2
11400.loopH:
11401    PSY_PP_INPUT_AVX512_MAIN10
11402    PSY_PP_16x8_AVX512_MAIN10
11403
11404    paddd         xm11, xm1
11405    lea             r0, [r0 + r1 * 8 - 16]
11406    lea             r2, [r2 + r3 * 8 - 16]
11407    dec            r8d
11408    jnz            .loopH
11409    movd           eax, xm11
11410    RET
11411%endif
11412
11413%if BIT_DEPTH == 8
11414cglobal psyCost_pp_16x16, 4, 10, 27
11415    lea             r4, [3 * r1]
11416    lea             r7, [3 * r3]
11417    vbroadcasti32x8  m8, [hmul_8p]
11418    pxor            m13, m13
11419    vbroadcasti32x8 m14, [pw_1]
11420    vbroadcasti32x8 m15, [pd_1]
11421    movu            m26, [psy_pp_shuff3]
11422
11423    mov             r8d, 2
11424.loopH:
11425    PSY_PP_INPUT_AVX512_MAIN
11426    PSY_PP_16x8_AVX512_MAIN
11427
11428    paddd           m13, m16
11429    lea             r0, [r0 + r1 * 8]
11430    lea             r2, [r2 + r3 * 8]
11431    dec             r8d
11432    jnz             .loopH
11433    movd            eax, xm13
11434    RET
11435%endif
11436%endif
11437
11438%if ARCH_X86_64
11439INIT_ZMM avx512
11440%if HIGH_BIT_DEPTH && BIT_DEPTH == 12
11441cglobal psyCost_pp_32x32, 4, 10, 27
11442    add            r1d, r1d
11443    add            r3d, r3d
11444    pxor           m24, m24
11445    movu       m13,    [psy_pp_shuff1]
11446    movu       m14,    [psy_pp_shuff2]
11447
11448    mov            r8d, 4
11449.loopH:
11450    mov            r9d, 4
11451.loopW:
11452    PSY_COST_PP_8x8_AVX512_MAIN12
11453
11454    paddd         xm24, xm11
11455    add             r0, 16
11456    add             r2, 16
11457    dec            r9d
11458    jnz            .loopW
11459    lea             r0, [r0 + r1 * 8 - 64]
11460    lea             r2, [r2 + r3 * 8 - 64]
11461    dec            r8d
11462    jnz            .loopH
11463    movd           eax, xm24
11464    RET
11465%endif
11466
11467%if HIGH_BIT_DEPTH && BIT_DEPTH == 10
11468cglobal psyCost_pp_32x32, 4, 10, 16
11469    add            r1d, r1d
11470    add            r3d, r3d
11471    pxor           m11, m11
11472    vbroadcasti32x8 m14, [pw_1]
11473    vbroadcasti32x8 m15, [pd_1]
11474
11475    mov            r8d, 4
11476.loopH:
11477    mov            r9d, 2
11478.loopW:
11479    PSY_PP_INPUT_AVX512_MAIN10
11480    PSY_PP_16x8_AVX512_MAIN10
11481
11482    paddd         xm11, xm1
11483    add             r0, 16
11484    add             r2, 16
11485    dec            r9d
11486    jnz            .loopW
11487    lea             r0, [r0 + r1 * 8 - 64]
11488    lea             r2, [r2 + r3 * 8 - 64]
11489    dec            r8d
11490    jnz            .loopH
11491    movd           eax, xm11
11492    RET
11493%endif
11494
11495%if BIT_DEPTH == 8
11496cglobal psyCost_pp_32x32, 4, 10, 27
11497    lea             r4, [3 * r1]
11498    lea             r7, [3 * r3]
11499    vbroadcasti32x8  m8, [hmul_8p]
11500    pxor            m13, m13
11501    vbroadcasti32x8 m14, [pw_1]
11502    vbroadcasti32x8 m15, [pd_1]
11503    movu            m26, [psy_pp_shuff3]
11504
11505    mov             r8d, 4
11506.loopH:
11507    mov             r9d, 2
11508.loopW:
11509    PSY_PP_INPUT_AVX512_MAIN
11510    PSY_PP_16x8_AVX512_MAIN
11511
11512    paddd           m13, m16
11513    add             r0, 16
11514    add             r2, 16
11515    dec             r9d
11516    jnz             .loopW
11517    lea             r0, [r0 + r1 * 8 - 32]
11518    lea             r2, [r2 + r3 * 8 - 32]
11519    dec             r8d
11520    jnz             .loopH
11521    movd            eax, xm13
11522    RET
11523%endif
11524%endif
11525
11526%if ARCH_X86_64
11527INIT_ZMM avx512
11528%if HIGH_BIT_DEPTH && BIT_DEPTH == 12
11529cglobal psyCost_pp_64x64, 4, 10, 27
11530    add            r1d, r1d
11531    add            r3d, r3d
11532    pxor           m24, m24
11533    movu       m13,    [psy_pp_shuff1]
11534    movu       m14,    [psy_pp_shuff2]
11535
11536    mov            r8d, 8
11537.loopH:
11538    mov            r9d, 8
11539.loopW:
11540    PSY_COST_PP_8x8_AVX512_MAIN12
11541
11542    paddd         xm24, xm11
11543    add             r0, 16
11544    add             r2, 16
11545    dec            r9d
11546    jnz            .loopW
11547    lea             r0, [r0 + r1 * 8 - 128]
11548    lea             r2, [r2 + r3 * 8 - 128]
11549    dec            r8d
11550    jnz            .loopH
11551    movd           eax, xm24
11552    RET
11553%endif
11554
11555%if HIGH_BIT_DEPTH && BIT_DEPTH == 10
11556cglobal psyCost_pp_64x64, 4, 10, 16
11557    add            r1d, r1d
11558    add            r3d, r3d
11559    pxor           m11, m11
11560    vbroadcasti32x8 m14, [pw_1]
11561    vbroadcasti32x8 m15, [pd_1]
11562
11563    mov            r8d, 8
11564.loopH:
11565    mov            r9d, 4
11566.loopW:
11567    PSY_PP_INPUT_AVX512_MAIN10
11568    PSY_PP_16x8_AVX512_MAIN10
11569
11570    paddd         xm11, xm1
11571    add             r0, 16
11572    add             r2, 16
11573    dec            r9d
11574    jnz            .loopW
11575    lea             r0, [r0 + r1 * 8 - 128]
11576    lea             r2, [r2 + r3 * 8 - 128]
11577    dec            r8d
11578    jnz            .loopH
11579    movd           eax, xm11
11580    RET
11581%endif
11582
11583%if BIT_DEPTH == 8
11584cglobal psyCost_pp_64x64, 4, 10, 27
11585    lea             r4, [3 * r1]
11586    lea             r7, [3 * r3]
11587    vbroadcasti32x8  m8, [hmul_8p]
11588    pxor            m13, m13
11589    vbroadcasti32x8 m14, [pw_1]
11590    vbroadcasti32x8 m15, [pd_1]
11591    movu            m26, [psy_pp_shuff3]
11592
11593    mov             r8d, 8
11594.loopH:
11595    mov             r9d, 4
11596.loopW:
11597    PSY_PP_INPUT_AVX512_MAIN
11598    PSY_PP_16x8_AVX512_MAIN
11599
11600    paddd           m13, m16
11601    add             r0, 16
11602    add             r2, 16
11603    dec             r9d
11604    jnz             .loopW
11605    lea             r0, [r0 + r1 * 8 - 64]
11606    lea             r2, [r2 + r3 * 8 - 64]
11607    dec             r8d
11608    jnz             .loopH
11609    movd            eax, xm13
11610    RET
11611%endif
11612%endif
11613
11614;---------------------------------------------------------------------------------------------------------------------
11615;int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
11616;---------------------------------------------------------------------------------------------------------------------
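; The _ss versions operate on int16 residual blocks: the DC term is the sum
; of absolute coefficients (>> 2) and the energy is a Hadamard (SATD/SA8D)
; sum of the coefficients, so the return value is again
; |energy_src - energy_rec| per block.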
11617INIT_XMM sse4
11618cglobal psyCost_ss_4x4, 4, 5, 8
11619
11620    add             r1, r1
11621    lea             r4, [3 * r1]
11622    movddup         m0, [r0]
11623    movddup         m1, [r0 + r1]
11624    movddup         m2, [r0 + r1 * 2]
11625    movddup         m3, [r0 + r4]
11626
11627    pabsw           m4, m0
11628    pabsw           m5, m1
11629    paddw           m5, m4
11630    pabsw           m4, m2
11631    paddw           m5, m4
11632    pabsw           m4, m3
11633    paddw           m5, m4
11634    pmaddwd         m5, [pw_1]
11635    psrldq          m4, m5, 4
11636    paddd           m5, m4
11637    psrld           m6, m5, 2
11638
11639    mova            m4, [hmul_8w]
11640    pmaddwd         m0, m4
11641    pmaddwd         m1, m4
11642    pmaddwd         m2, m4
11643    pmaddwd         m3, m4
11644
11645    psrldq          m4, m0, 4
11646    psubd           m5, m0, m4
11647    paddd           m0, m4
11648    shufps          m0, m5, 10001000b
11649
11650    psrldq          m4, m1, 4
11651    psubd           m5, m1, m4
11652    paddd           m1, m4
11653    shufps          m1, m5, 10001000b
11654
11655    psrldq          m4, m2, 4
11656    psubd           m5, m2, m4
11657    paddd           m2, m4
11658    shufps          m2, m5, 10001000b
11659
11660    psrldq          m4, m3, 4
11661    psubd           m5, m3, m4
11662    paddd           m3, m4
11663    shufps          m3, m5, 10001000b
11664
11665    mova            m4, m0
11666    paddd           m0, m1
11667    psubd           m1, m4
11668    mova            m4, m2
11669    paddd           m2, m3
11670    psubd           m3, m4
11671    mova            m4, m0
11672    paddd           m0, m2
11673    psubd           m2, m4
11674    mova            m4, m1
11675    paddd           m1, m3
11676    psubd           m3, m4
11677
11678    pabsd           m0, m0
11679    pabsd           m2, m2
11680    pabsd           m1, m1
11681    pabsd           m3, m3
11682    paddd           m0, m2
11683    paddd           m1, m3
11684    paddd           m0, m1
11685    movhlps         m1, m0
11686    paddd           m0, m1
11687    psrldq          m1, m0, 4
11688    paddd           m0, m1
11689    psrld           m0, 1
11690    psubd           m7, m0, m6
11691
11692    add             r3, r3
11693    lea             r4, [3 * r3]
11694    movddup         m0, [r2]
11695    movddup         m1, [r2 + r3]
11696    movddup         m2, [r2 + r3 * 2]
11697    movddup         m3, [r2 + r4]
11698
11699    pabsw           m4, m0
11700    pabsw           m5, m1
11701    paddw           m5, m4
11702    pabsw           m4, m2
11703    paddw           m5, m4
11704    pabsw           m4, m3
11705    paddw           m5, m4
11706    pmaddwd         m5, [pw_1]
11707    psrldq          m4, m5, 4
11708    paddd           m5, m4
11709    psrld           m6, m5, 2
11710
11711    mova            m4, [hmul_8w]
11712    pmaddwd         m0, m4
11713    pmaddwd         m1, m4
11714    pmaddwd         m2, m4
11715    pmaddwd         m3, m4
11716
11717    psrldq          m4, m0, 4
11718    psubd           m5, m0, m4
11719    paddd           m0, m4
11720    shufps          m0, m5, 10001000b
11721
11722    psrldq          m4, m1, 4
11723    psubd           m5, m1, m4
11724    paddd           m1, m4
11725    shufps          m1, m5, 10001000b
11726
11727    psrldq          m4, m2, 4
11728    psubd           m5, m2, m4
11729    paddd           m2, m4
11730    shufps          m2, m5, 10001000b
11731
11732    psrldq          m4, m3, 4
11733    psubd           m5, m3, m4
11734    paddd           m3, m4
11735    shufps          m3, m5, 10001000b
11736
11737    mova            m4, m0
11738    paddd           m0, m1
11739    psubd           m1, m4
11740    mova            m4, m2
11741    paddd           m2, m3
11742    psubd           m3, m4
11743    mova            m4, m0
11744    paddd           m0, m2
11745    psubd           m2, m4
11746    mova            m4, m1
11747    paddd           m1, m3
11748    psubd           m3, m4
11749
11750    pabsd           m0, m0
11751    pabsd           m2, m2
11752    pabsd           m1, m1
11753    pabsd           m3, m3
11754    paddd           m0, m2
11755    paddd           m1, m3
11756    paddd           m0, m1
11757    movhlps         m1, m0
11758    paddd           m0, m1
11759    psrldq          m1, m0, 4
11760    paddd           m0, m1
11761    psrld           m0, 1
11762    psubd           m0, m6
11763    psubd           m7, m0
11764    pabsd           m0, m7
11765    movd            eax, m0
11766    RET
11767
11768%if ARCH_X86_64
11769INIT_XMM sse4
11770cglobal psyCost_ss_8x8, 4, 6, 15
11771
11772    mova            m13, [pw_pmpmpmpm]
11773    mova            m14, [pw_1]
11774    add             r1, r1
11775    add             r3, r3
11776    lea             r4, [3 * r1]
11777    movu            m0, [r0]
11778    movu            m1, [r0 + r1]
11779    movu            m2, [r0 + r1 * 2]
11780    movu            m3, [r0 + r4]
11781    lea             r5, [r0 + r1 * 4]
11782    movu            m4, [r5]
11783    movu            m5, [r5 + r1]
11784    movu            m6, [r5 + r1 * 2]
11785    movu            m7, [r5 + r4]
11786
11787    pabsw           m8, m0
11788    pabsw           m9, m1
11789    paddw           m8, m9
11790    pabsw           m10, m2
11791    pabsw           m11, m3
11792    paddw           m10, m11
11793    paddw           m8, m10
11794    pabsw           m9, m4
11795    pabsw           m10, m5
11796    paddw           m9, m10
11797    pabsw           m11, m6
11798    pabsw           m12, m7
11799    paddw           m11, m12
11800    paddw           m9, m11
11801    paddw           m8, m9
11802    movhlps         m9, m8
11803    pmovzxwd        m8, m8
11804    pmovzxwd        m9, m9
11805    paddd           m8, m9
11806    movhlps         m9, m8
11807    paddd           m8, m9
11808    psrldq          m9, m8, 4
11809    paddd           m8, m9
11810    psrld           m8, 2
11811
11812    pmaddwd         m0, m13
11813    pmaddwd         m1, m13
11814    pmaddwd         m2, m13
11815    pmaddwd         m3, m13
11816
11817    psrldq          m9, m0, 4
11818    psubd           m10, m0, m9
11819    paddd           m0, m9
11820    shufps          m0, m10, 10001000b
11821    psrldq          m9, m0, 4
11822    psubd           m10, m0, m9
11823    paddd           m0, m9
11824    shufps          m0, m10, 10001000b
11825
11826    psrldq          m9, m1, 4
11827    psubd           m10, m1, m9
11828    paddd           m1, m9
11829    shufps          m1, m10, 10001000b
11830    psrldq          m9, m1, 4
11831    psubd           m10, m1, m9
11832    paddd           m1, m9
11833    shufps          m1, m10, 10001000b
11834
11835    psrldq          m9, m2, 4
11836    psubd           m10, m2, m9
11837    paddd           m2, m9
11838    shufps          m2, m10, 10001000b
11839    psrldq          m9, m2, 4
11840    psubd           m10, m2, m9
11841    paddd           m2, m9
11842    shufps          m2, m10, 10001000b
11843
11844    psrldq          m9, m3, 4
11845    psubd           m10, m3, m9
11846    paddd           m3, m9
11847    shufps          m3, m10, 10001000b
11848    psrldq          m9, m3, 4
11849    psubd           m10, m3, m9
11850    paddd           m3, m9
11851    shufps          m3, m10, 10001000b
11852
11853    SUMSUB_BA d, 0, 1, 9
11854    SUMSUB_BA d, 2, 3, 9
11855    SUMSUB_BA d, 0, 2, 9
11856    SUMSUB_BA d, 1, 3, 9
11857
11858    pmaddwd         m4, m13
11859    pmaddwd         m5, m13
11860    pmaddwd         m6, m13
11861    pmaddwd         m7, m13
11862
11863    psrldq          m9, m4, 4
11864    psubd           m10, m4, m9
11865    paddd           m4, m9
11866    shufps          m4, m10, 10001000b
11867    psrldq          m9, m4, 4
11868    psubd           m10, m4, m9
11869    paddd           m4, m9
11870    shufps          m4, m10, 10001000b
11871
11872    psrldq          m9, m5, 4
11873    psubd           m10, m5, m9
11874    paddd           m5, m9
11875    shufps          m5, m10, 10001000b
11876    psrldq          m9, m5, 4
11877    psubd           m10, m5, m9
11878    paddd           m5, m9
11879    shufps          m5, m10, 10001000b
11880
11881    psrldq          m9, m6, 4
11882    psubd           m10, m6, m9
11883    paddd           m6, m9
11884    shufps          m6, m10, 10001000b
11885    psrldq          m9, m6, 4
11886    psubd           m10, m6, m9
11887    paddd           m6, m9
11888    shufps          m6, m10, 10001000b
11889
11890    psrldq          m9, m7, 4
11891    psubd           m10, m7, m9
11892    paddd           m7, m9
11893    shufps          m7, m10, 10001000b
11894    psrldq          m9, m7, 4
11895    psubd           m10, m7, m9
11896    paddd           m7, m9
11897    shufps          m7, m10, 10001000b
11898
11899    SUMSUB_BA d, 4, 5, 9
11900    SUMSUB_BA d, 6, 7, 9
11901    SUMSUB_BA d, 4, 6, 9
11902    SUMSUB_BA d, 5, 7, 9
11903
11904    SUMSUB_BA d, 0, 4, 9
11905    SUMSUB_BA d, 1, 5, 9
11906    SUMSUB_BA d, 2, 6, 9
11907    SUMSUB_BA d, 3, 7, 9
11908
11909    pabsd           m0, m0
11910    pabsd           m2, m2
11911    pabsd           m1, m1
11912    pabsd           m3, m3
11913    pabsd           m4, m4
11914    pabsd           m5, m5
11915    pabsd           m6, m6
11916    pabsd           m7, m7
11917
11918    paddd           m0, m2
11919    paddd           m1, m3
11920    paddd           m0, m1
11921    paddd           m5, m4
11922    paddd           m0, m5
11923    paddd           m7, m6
11924    paddd           m11, m0, m7
11925
11926    movu            m0, [r0]
11927    movu            m1, [r0 + r1]
11928    movu            m2, [r0 + r1 * 2]
11929    movu            m3, [r0 + r4]
11930
11931    pmaddwd         m0, m14
11932    pmaddwd         m1, m14
11933    pmaddwd         m2, m14
11934    pmaddwd         m3, m14
11935
11936    psrldq          m9, m0, 4
11937    psubd           m10, m0, m9
11938    paddd           m0, m9
11939    shufps          m0, m10, 10001000b
11940    psrldq          m9, m0, 4
11941    psubd           m10, m0, m9
11942    paddd           m0, m9
11943    shufps          m0, m10, 10001000b
11944
11945    psrldq          m9, m1, 4
11946    psubd           m10, m1, m9
11947    paddd           m1, m9
11948    shufps          m1, m10, 10001000b
11949    psrldq          m9, m1, 4
11950    psubd           m10, m1, m9
11951    paddd           m1, m9
11952    shufps          m1, m10, 10001000b
11953
11954    psrldq          m9, m2, 4
11955    psubd           m10, m2, m9
11956    paddd           m2, m9
11957    shufps          m2, m10, 10001000b
11958    psrldq          m9, m2, 4
11959    psubd           m10, m2, m9
11960    paddd           m2, m9
11961    shufps          m2, m10, 10001000b
11962
11963    psrldq          m9, m3, 4
11964    psubd           m10, m3, m9
11965    paddd           m3, m9
11966    shufps          m3, m10, 10001000b
11967    psrldq          m9, m3, 4
11968    psubd           m10, m3, m9
11969    paddd           m3, m9
11970    shufps          m3, m10, 10001000b
11971
11972    SUMSUB_BA d, 0, 1, 9
11973    SUMSUB_BA d, 2, 3, 9
11974    SUMSUB_BA d, 0, 2, 9
11975    SUMSUB_BA d, 1, 3, 9
11976
11977    movu            m4, [r5]
11978    movu            m5, [r5 + r1]
11979    movu            m6, [r5 + r1 * 2]
11980    movu            m7, [r5 + r4]
11981
11982    pmaddwd         m4, m14
11983    pmaddwd         m5, m14
11984    pmaddwd         m6, m14
11985    pmaddwd         m7, m14
11986
11987    psrldq          m9, m4, 4
11988    psubd           m10, m4, m9
11989    paddd           m4, m9
11990    shufps          m4, m10, 10001000b
11991    psrldq          m9, m4, 4
11992    psubd           m10, m4, m9
11993    paddd           m4, m9
11994    shufps          m4, m10, 10001000b
11995
11996    psrldq          m9, m5, 4
11997    psubd           m10, m5, m9
11998    paddd           m5, m9
11999    shufps          m5, m10, 10001000b
12000    psrldq          m9, m5, 4
12001    psubd           m10, m5, m9
12002    paddd           m5, m9
12003    shufps          m5, m10, 10001000b
12004
12005    psrldq          m9, m6, 4
12006    psubd           m10, m6, m9
12007    paddd           m6, m9
12008    shufps          m6, m10, 10001000b
12009    psrldq          m9, m6, 4
12010    psubd           m10, m6, m9
12011    paddd           m6, m9
12012    shufps          m6, m10, 10001000b
12013
12014    psrldq          m9, m7, 4
12015    psubd           m10, m7, m9
12016    paddd           m7, m9
12017    shufps          m7, m10, 10001000b
12018    psrldq          m9, m7, 4
12019    psubd           m10, m7, m9
12020    paddd           m7, m9
12021    shufps          m7, m10, 10001000b
12022
12023    SUMSUB_BA d, 4, 5, 9
12024    SUMSUB_BA d, 6, 7, 9
12025    SUMSUB_BA d, 4, 6, 9
12026    SUMSUB_BA d, 5, 7, 9
12027
12028    SUMSUB_BA d, 0, 4, 9
12029    SUMSUB_BA d, 1, 5, 9
12030    SUMSUB_BA d, 2, 6, 9
12031    SUMSUB_BA d, 3, 7, 9
12032
12033    pabsd           m0, m0
12034    pabsd           m2, m2
12035    pabsd           m1, m1
12036    pabsd           m3, m3
12037    pabsd           m4, m4
12038    pabsd           m5, m5
12039    pabsd           m6, m6
12040    pabsd           m7, m7
12041
12042    paddd           m0, m2
12043    paddd           m1, m3
12044    paddd           m0, m1
12045    paddd           m5, m4
12046    paddd           m0, m5
12047    paddd           m7, m6
12048    paddd           m0, m7
12049    paddd           m0, m11
12050
12051    movhlps         m1, m0
12052    paddd           m0, m1
12053    psrldq          m1, m0, 4
12054    paddd           m0, m1
12055    paddd           m0, [pd_2]
12056    psrld           m0, 2
12057    psubd           m12, m0, m8
12058
12059    lea             r4, [3 * r3]
12060    movu            m0, [r2]
12061    movu            m1, [r2 + r3]
12062    movu            m2, [r2 + r3 * 2]
12063    movu            m3, [r2 + r4]
12064    lea             r5, [r2 + r3 * 4]
12065    movu            m4, [r5]
12066    movu            m5, [r5 + r3]
12067    movu            m6, [r5 + r3 * 2]
12068    movu            m7, [r5 + r4]
12069
12070    pabsw           m8, m0
12071    pabsw           m9, m1
12072    paddw           m8, m9
12073    pabsw           m10, m2
12074    pabsw           m11, m3
12075    paddw           m10, m11
12076    paddw           m8, m10
12077    pabsw           m9, m4
12078    pabsw           m10, m5
12079    paddw           m9, m10
12080    pabsw           m11, m6
12081    pabsw           m10, m7
12082    paddw           m11, m10
12083    paddw           m9, m11
12084    paddw           m8, m9
12085    movhlps         m9, m8
12086    pmovzxwd        m8, m8
12087    pmovzxwd        m9, m9
12088    paddd           m8, m9
12089    movhlps         m9, m8
12090    paddd           m8, m9
12091    psrldq          m9, m8, 4
12092    paddd           m8, m9
12093    psrld           m8, 2
12094
12095    pmaddwd         m0, m13
12096    pmaddwd         m1, m13
12097    pmaddwd         m2, m13
12098    pmaddwd         m3, m13
12099
12100    psrldq          m9, m0, 4
12101    psubd           m10, m0, m9
12102    paddd           m0, m9
12103    shufps          m0, m10, 10001000b
12104    psrldq          m9, m0, 4
12105    psubd           m10, m0, m9
12106    paddd           m0, m9
12107    shufps          m0, m10, 10001000b
12108
12109    psrldq          m9, m1, 4
12110    psubd           m10, m1, m9
12111    paddd           m1, m9
12112    shufps          m1, m10, 10001000b
12113    psrldq          m9, m1, 4
12114    psubd           m10, m1, m9
12115    paddd           m1, m9
12116    shufps          m1, m10, 10001000b
12117
12118    psrldq          m9, m2, 4
12119    psubd           m10, m2, m9
12120    paddd           m2, m9
12121    shufps          m2, m10, 10001000b
12122    psrldq          m9, m2, 4
12123    psubd           m10, m2, m9
12124    paddd           m2, m9
12125    shufps          m2, m10, 10001000b
12126
12127    psrldq          m9, m3, 4
12128    psubd           m10, m3, m9
12129    paddd           m3, m9
12130    shufps          m3, m10, 10001000b
12131    psrldq          m9, m3, 4
12132    psubd           m10, m3, m9
12133    paddd           m3, m9
12134    shufps          m3, m10, 10001000b
12135
12136    SUMSUB_BA d, 0, 1, 9
12137    SUMSUB_BA d, 2, 3, 9
12138    SUMSUB_BA d, 0, 2, 9
12139    SUMSUB_BA d, 1, 3, 9
12140
12141    pmaddwd         m4, m13
12142    pmaddwd         m5, m13
12143    pmaddwd         m6, m13
12144    pmaddwd         m7, m13
12145
12146    psrldq          m9, m4, 4
12147    psubd           m10, m4, m9
12148    paddd           m4, m9
12149    shufps          m4, m10, 10001000b
12150    psrldq          m9, m4, 4
12151    psubd           m10, m4, m9
12152    paddd           m4, m9
12153    shufps          m4, m10, 10001000b
12154
12155    psrldq          m9, m5, 4
12156    psubd           m10, m5, m9
12157    paddd           m5, m9
12158    shufps          m5, m10, 10001000b
12159    psrldq          m9, m5, 4
12160    psubd           m10, m5, m9
12161    paddd           m5, m9
12162    shufps          m5, m10, 10001000b
12163
12164    psrldq          m9, m6, 4
12165    psubd           m10, m6, m9
12166    paddd           m6, m9
12167    shufps          m6, m10, 10001000b
12168    psrldq          m9, m6, 4
12169    psubd           m10, m6, m9
12170    paddd           m6, m9
12171    shufps          m6, m10, 10001000b
12172
12173    psrldq          m9, m7, 4
12174    psubd           m10, m7, m9
12175    paddd           m7, m9
12176    shufps          m7, m10, 10001000b
12177    psrldq          m9, m7, 4
12178    psubd           m10, m7, m9
12179    paddd           m7, m9
12180    shufps          m7, m10, 10001000b
12181
12182    SUMSUB_BA d, 4, 5, 9
12183    SUMSUB_BA d, 6, 7, 9
12184    SUMSUB_BA d, 4, 6, 9
12185    SUMSUB_BA d, 5, 7, 9
12186
12187    SUMSUB_BA d, 0, 4, 9
12188    SUMSUB_BA d, 1, 5, 9
12189    SUMSUB_BA d, 2, 6, 9
12190    SUMSUB_BA d, 3, 7, 9
12191
12192    pabsd           m0, m0
12193    pabsd           m2, m2
12194    pabsd           m1, m1
12195    pabsd           m3, m3
12196    pabsd           m4, m4
12197    pabsd           m5, m5
12198    pabsd           m6, m6
12199    pabsd           m7, m7
12200
12201    paddd           m0, m2
12202    paddd           m1, m3
12203    paddd           m0, m1
12204    paddd           m5, m4
12205    paddd           m0, m5
12206    paddd           m7, m6
12207    paddd           m11, m0, m7
12208
12209    movu            m0, [r2]
12210    movu            m1, [r2 + r3]
12211    movu            m2, [r2 + r3 * 2]
12212    movu            m3, [r2 + r4]
12213
12214    pmaddwd         m0, m14
12215    pmaddwd         m1, m14
12216    pmaddwd         m2, m14
12217    pmaddwd         m3, m14
12218
12219    psrldq          m9, m0, 4
12220    psubd           m10, m0, m9
12221    paddd           m0, m9
12222    shufps          m0, m10, 10001000b
12223    psrldq          m9, m0, 4
12224    psubd           m10, m0, m9
12225    paddd           m0, m9
12226    shufps          m0, m10, 10001000b
12227
12228    psrldq          m9, m1, 4
12229    psubd           m10, m1, m9
12230    paddd           m1, m9
12231    shufps          m1, m10, 10001000b
12232    psrldq          m9, m1, 4
12233    psubd           m10, m1, m9
12234    paddd           m1, m9
12235    shufps          m1, m10, 10001000b
12236
12237    psrldq          m9, m2, 4
12238    psubd           m10, m2, m9
12239    paddd           m2, m9
12240    shufps          m2, m10, 10001000b
12241    psrldq          m9, m2, 4
12242    psubd           m10, m2, m9
12243    paddd           m2, m9
12244    shufps          m2, m10, 10001000b
12245
12246    psrldq          m9, m3, 4
12247    psubd           m10, m3, m9
12248    paddd           m3, m9
12249    shufps          m3, m10, 10001000b
12250    psrldq          m9, m3, 4
12251    psubd           m10, m3, m9
12252    paddd           m3, m9
12253    shufps          m3, m10, 10001000b
12254
12255    SUMSUB_BA d, 0, 1, 9
12256    SUMSUB_BA d, 2, 3, 9
12257    SUMSUB_BA d, 0, 2, 9
12258    SUMSUB_BA d, 1, 3, 9
12259
12260    movu            m4, [r5]
12261    movu            m5, [r5 + r3]
12262    movu            m6, [r5 + r3 * 2]
12263    movu            m7, [r5 + r4]
12264
12265    pmaddwd         m4, m14
12266    pmaddwd         m5, m14
12267    pmaddwd         m6, m14
12268    pmaddwd         m7, m14
12269
12270    psrldq          m9, m4, 4
12271    psubd           m10, m4, m9
12272    paddd           m4, m9
12273    shufps          m4, m10, 10001000b
12274    psrldq          m9, m4, 4
12275    psubd           m10, m4, m9
12276    paddd           m4, m9
12277    shufps          m4, m10, 10001000b
12278
12279    psrldq          m9, m5, 4
12280    psubd           m10, m5, m9
12281    paddd           m5, m9
12282    shufps          m5, m10, 10001000b
12283    psrldq          m9, m5, 4
12284    psubd           m10, m5, m9
12285    paddd           m5, m9
12286    shufps          m5, m10, 10001000b
12287
12288    psrldq          m9, m6, 4
12289    psubd           m10, m6, m9
12290    paddd           m6, m9
12291    shufps          m6, m10, 10001000b
12292    psrldq          m9, m6, 4
12293    psubd           m10, m6, m9
12294    paddd           m6, m9
12295    shufps          m6, m10, 10001000b
12296
12297    psrldq          m9, m7, 4
12298    psubd           m10, m7, m9
12299    paddd           m7, m9
12300    shufps          m7, m10, 10001000b
12301    psrldq          m9, m7, 4
12302    psubd           m10, m7, m9
12303    paddd           m7, m9
12304    shufps          m7, m10, 10001000b
12305
12306    SUMSUB_BA d, 4, 5, 9
12307    SUMSUB_BA d, 6, 7, 9
12308    SUMSUB_BA d, 4, 6, 9
12309    SUMSUB_BA d, 5, 7, 9
12310
12311    SUMSUB_BA d, 0, 4, 9
12312    SUMSUB_BA d, 1, 5, 9
12313    SUMSUB_BA d, 2, 6, 9
12314    SUMSUB_BA d, 3, 7, 9
12315
12316    pabsd           m0, m0
12317    pabsd           m2, m2
12318    pabsd           m1, m1
12319    pabsd           m3, m3
12320    pabsd           m4, m4
12321    pabsd           m5, m5
12322    pabsd           m6, m6
12323    pabsd           m7, m7
12324
12325    paddd           m0, m2
12326    paddd           m1, m3
12327    paddd           m0, m1
12328    paddd           m5, m4
12329    paddd           m0, m5
12330    paddd           m7, m6
12331    paddd           m0, m7
12332    paddd           m0, m11
12333
12334    movhlps         m1, m0
12335    paddd           m0, m1
12336    psrldq          m1, m0, 4
12337    paddd           m0, m1
12338    paddd           m0, [pd_2]
12339    psrld           m0, 2
12340    psubd           m0, m8
12341
12342    psubd           m12, m0
12343    pabsd           m0, m12
12344    movd            eax, m0
12345    RET
12346%endif
12347
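; Tile helper for the SSE4 psyCost_ss_16x16/32x32/64x64 wrappers below: it
; computes the same 8x8 energy difference as psyCost_ss_8x8 for the tile at
; r0/r2 (with r4 = 3 * r1 and r6 = 3 * r3 prepared by the caller) and
; accumulates the absolute difference in m15.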
12348%macro psy_cost_ss 0
12349    movu            m0, [r0]
12350    movu            m1, [r0 + r1]
12351    movu            m2, [r0 + r1 * 2]
12352    movu            m3, [r0 + r4]
12353    lea             r5, [r0 + r1 * 4]
12354    movu            m4, [r5]
12355    movu            m5, [r5 + r1]
12356    movu            m6, [r5 + r1 * 2]
12357    movu            m7, [r5 + r4]
12358
12359    pabsw           m8, m0
12360    pabsw           m9, m1
12361    paddw           m8, m9
12362    pabsw           m10, m2
12363    pabsw           m11, m3
12364    paddw           m10, m11
12365    paddw           m8, m10
12366    pabsw           m9, m4
12367    pabsw           m10, m5
12368    paddw           m9, m10
12369    pabsw           m11, m6
12370    pabsw           m12, m7
12371    paddw           m11, m12
12372    paddw           m9, m11
12373    paddw           m8, m9
12374    movhlps         m9, m8
12375    pmovzxwd        m8, m8
12376    pmovzxwd        m9, m9
12377    paddd           m8, m9
12378    movhlps         m9, m8
12379    paddd           m8, m9
12380    psrldq          m9, m8, 4
12381    paddd           m8, m9
12382    psrld           m8, 2
12383
12384    pmaddwd         m0, m13
12385    pmaddwd         m1, m13
12386    pmaddwd         m2, m13
12387    pmaddwd         m3, m13
12388
12389    psrldq          m9, m0, 4
12390    psubd           m10, m0, m9
12391    paddd           m0, m9
12392    shufps          m0, m10, 10001000b
12393    psrldq          m9, m0, 4
12394    psubd           m10, m0, m9
12395    paddd           m0, m9
12396    shufps          m0, m10, 10001000b
12397
12398    psrldq          m9, m1, 4
12399    psubd           m10, m1, m9
12400    paddd           m1, m9
12401    shufps          m1, m10, 10001000b
12402    psrldq          m9, m1, 4
12403    psubd           m10, m1, m9
12404    paddd           m1, m9
12405    shufps          m1, m10, 10001000b
12406
12407    psrldq          m9, m2, 4
12408    psubd           m10, m2, m9
12409    paddd           m2, m9
12410    shufps          m2, m10, 10001000b
12411    psrldq          m9, m2, 4
12412    psubd           m10, m2, m9
12413    paddd           m2, m9
12414    shufps          m2, m10, 10001000b
12415
12416    psrldq          m9, m3, 4
12417    psubd           m10, m3, m9
12418    paddd           m3, m9
12419    shufps          m3, m10, 10001000b
12420    psrldq          m9, m3, 4
12421    psubd           m10, m3, m9
12422    paddd           m3, m9
12423    shufps          m3, m10, 10001000b
12424
12425    SUMSUB_BA d, 0, 1, 9
12426    SUMSUB_BA d, 2, 3, 9
12427    SUMSUB_BA d, 0, 2, 9
12428    SUMSUB_BA d, 1, 3, 9
12429
12430    pmaddwd         m4, m13
12431    pmaddwd         m5, m13
12432    pmaddwd         m6, m13
12433    pmaddwd         m7, m13
12434
12435    psrldq          m9, m4, 4
12436    psubd           m10, m4, m9
12437    paddd           m4, m9
12438    shufps          m4, m10, 10001000b
12439    psrldq          m9, m4, 4
12440    psubd           m10, m4, m9
12441    paddd           m4, m9
12442    shufps          m4, m10, 10001000b
12443
12444    psrldq          m9, m5, 4
12445    psubd           m10, m5, m9
12446    paddd           m5, m9
12447    shufps          m5, m10, 10001000b
12448    psrldq          m9, m5, 4
12449    psubd           m10, m5, m9
12450    paddd           m5, m9
12451    shufps          m5, m10, 10001000b
12452
12453    psrldq          m9, m6, 4
12454    psubd           m10, m6, m9
12455    paddd           m6, m9
12456    shufps          m6, m10, 10001000b
12457    psrldq          m9, m6, 4
12458    psubd           m10, m6, m9
12459    paddd           m6, m9
12460    shufps          m6, m10, 10001000b
12461
12462    psrldq          m9, m7, 4
12463    psubd           m10, m7, m9
12464    paddd           m7, m9
12465    shufps          m7, m10, 10001000b
12466    psrldq          m9, m7, 4
12467    psubd           m10, m7, m9
12468    paddd           m7, m9
12469    shufps          m7, m10, 10001000b
12470
12471    SUMSUB_BA d, 4, 5, 9
12472    SUMSUB_BA d, 6, 7, 9
12473    SUMSUB_BA d, 4, 6, 9
12474    SUMSUB_BA d, 5, 7, 9
12475
12476    SUMSUB_BA d, 0, 4, 9
12477    SUMSUB_BA d, 1, 5, 9
12478    SUMSUB_BA d, 2, 6, 9
12479    SUMSUB_BA d, 3, 7, 9
12480
12481    pabsd           m0, m0
12482    pabsd           m2, m2
12483    pabsd           m1, m1
12484    pabsd           m3, m3
12485    pabsd           m4, m4
12486    pabsd           m5, m5
12487    pabsd           m6, m6
12488    pabsd           m7, m7
12489
12490    paddd           m0, m2
12491    paddd           m1, m3
12492    paddd           m0, m1
12493    paddd           m5, m4
12494    paddd           m0, m5
12495    paddd           m7, m6
12496    paddd           m11, m0, m7
12497
12498    movu            m0, [r0]
12499    movu            m1, [r0 + r1]
12500    movu            m2, [r0 + r1 * 2]
12501    movu            m3, [r0 + r4]
12502
12503    pmaddwd         m0, m14
12504    pmaddwd         m1, m14
12505    pmaddwd         m2, m14
12506    pmaddwd         m3, m14
12507
12508    psrldq          m9, m0, 4
12509    psubd           m10, m0, m9
12510    paddd           m0, m9
12511    shufps          m0, m10, 10001000b
12512    psrldq          m9, m0, 4
12513    psubd           m10, m0, m9
12514    paddd           m0, m9
12515    shufps          m0, m10, 10001000b
12516
12517    psrldq          m9, m1, 4
12518    psubd           m10, m1, m9
12519    paddd           m1, m9
12520    shufps          m1, m10, 10001000b
12521    psrldq          m9, m1, 4
12522    psubd           m10, m1, m9
12523    paddd           m1, m9
12524    shufps          m1, m10, 10001000b
12525
12526    psrldq          m9, m2, 4
12527    psubd           m10, m2, m9
12528    paddd           m2, m9
12529    shufps          m2, m10, 10001000b
12530    psrldq          m9, m2, 4
12531    psubd           m10, m2, m9
12532    paddd           m2, m9
12533    shufps          m2, m10, 10001000b
12534
12535    psrldq          m9, m3, 4
12536    psubd           m10, m3, m9
12537    paddd           m3, m9
12538    shufps          m3, m10, 10001000b
12539    psrldq          m9, m3, 4
12540    psubd           m10, m3, m9
12541    paddd           m3, m9
12542    shufps          m3, m10, 10001000b
12543
12544    SUMSUB_BA d, 0, 1, 9
12545    SUMSUB_BA d, 2, 3, 9
12546    SUMSUB_BA d, 0, 2, 9
12547    SUMSUB_BA d, 1, 3, 9
12548
12549    movu            m4, [r5]
12550    movu            m5, [r5 + r1]
12551    movu            m6, [r5 + r1 * 2]
12552    movu            m7, [r5 + r4]
12553
12554    pmaddwd         m4, m14
12555    pmaddwd         m5, m14
12556    pmaddwd         m6, m14
12557    pmaddwd         m7, m14
12558
12559    psrldq          m9, m4, 4
12560    psubd           m10, m4, m9
12561    paddd           m4, m9
12562    shufps          m4, m10, 10001000b
12563    psrldq          m9, m4, 4
12564    psubd           m10, m4, m9
12565    paddd           m4, m9
12566    shufps          m4, m10, 10001000b
12567
12568    psrldq          m9, m5, 4
12569    psubd           m10, m5, m9
12570    paddd           m5, m9
12571    shufps          m5, m10, 10001000b
12572    psrldq          m9, m5, 4
12573    psubd           m10, m5, m9
12574    paddd           m5, m9
12575    shufps          m5, m10, 10001000b
12576
12577    psrldq          m9, m6, 4
12578    psubd           m10, m6, m9
12579    paddd           m6, m9
12580    shufps          m6, m10, 10001000b
12581    psrldq          m9, m6, 4
12582    psubd           m10, m6, m9
12583    paddd           m6, m9
12584    shufps          m6, m10, 10001000b
12585
12586    psrldq          m9, m7, 4
12587    psubd           m10, m7, m9
12588    paddd           m7, m9
12589    shufps          m7, m10, 10001000b
12590    psrldq          m9, m7, 4
12591    psubd           m10, m7, m9
12592    paddd           m7, m9
12593    shufps          m7, m10, 10001000b
12594
12595    SUMSUB_BA d, 4, 5, 9
12596    SUMSUB_BA d, 6, 7, 9
12597    SUMSUB_BA d, 4, 6, 9
12598    SUMSUB_BA d, 5, 7, 9
12599
12600    SUMSUB_BA d, 0, 4, 9
12601    SUMSUB_BA d, 1, 5, 9
12602    SUMSUB_BA d, 2, 6, 9
12603    SUMSUB_BA d, 3, 7, 9
12604
12605    pabsd           m0, m0
12606    pabsd           m2, m2
12607    pabsd           m1, m1
12608    pabsd           m3, m3
12609    pabsd           m4, m4
12610    pabsd           m5, m5
12611    pabsd           m6, m6
12612    pabsd           m7, m7
12613
12614    paddd           m0, m2
12615    paddd           m1, m3
12616    paddd           m0, m1
12617    paddd           m5, m4
12618    paddd           m0, m5
12619    paddd           m7, m6
12620    paddd           m0, m7
12621    paddd           m0, m11
12622
12623    movhlps         m1, m0
12624    paddd           m0, m1
12625    psrldq          m1, m0, 4
12626    paddd           m0, m1
12627    paddd           m0, [pd_2]
12628    psrld           m0, 2
12629    psubd           m12, m0, m8
12630
12631    movu            m0, [r2]
12632    movu            m1, [r2 + r3]
12633    movu            m2, [r2 + r3 * 2]
12634    movu            m3, [r2 + r6]
12635    lea             r5, [r2 + r3 * 4]
12636    movu            m4, [r5]
12637    movu            m5, [r5 + r3]
12638    movu            m6, [r5 + r3 * 2]
12639    movu            m7, [r5 + r6]
12640
12641    pabsw           m8, m0
12642    pabsw           m9, m1
12643    paddw           m8, m9
12644    pabsw           m10, m2
12645    pabsw           m11, m3
12646    paddw           m10, m11
12647    paddw           m8, m10
12648    pabsw           m9, m4
12649    pabsw           m10, m5
12650    paddw           m9, m10
12651    pabsw           m11, m6
12652    pabsw           m10, m7
12653    paddw           m11, m10
12654    paddw           m9, m11
12655    paddw           m8, m9
12656    movhlps         m9, m8
12657    pmovzxwd        m8, m8
12658    pmovzxwd        m9, m9
12659    paddd           m8, m9
12660    movhlps         m9, m8
12661    paddd           m8, m9
12662    psrldq          m9, m8, 4
12663    paddd           m8, m9
12664    psrld           m8, 2
12665
12666    pmaddwd         m0, m13
12667    pmaddwd         m1, m13
12668    pmaddwd         m2, m13
12669    pmaddwd         m3, m13
12670
12671    psrldq          m9, m0, 4
12672    psubd           m10, m0, m9
12673    paddd           m0, m9
12674    shufps          m0, m10, 10001000b
12675    psrldq          m9, m0, 4
12676    psubd           m10, m0, m9
12677    paddd           m0, m9
12678    shufps          m0, m10, 10001000b
12679
12680    psrldq          m9, m1, 4
12681    psubd           m10, m1, m9
12682    paddd           m1, m9
12683    shufps          m1, m10, 10001000b
12684    psrldq          m9, m1, 4
12685    psubd           m10, m1, m9
12686    paddd           m1, m9
12687    shufps          m1, m10, 10001000b
12688
12689    psrldq          m9, m2, 4
12690    psubd           m10, m2, m9
12691    paddd           m2, m9
12692    shufps          m2, m10, 10001000b
12693    psrldq          m9, m2, 4
12694    psubd           m10, m2, m9
12695    paddd           m2, m9
12696    shufps          m2, m10, 10001000b
12697
12698    psrldq          m9, m3, 4
12699    psubd           m10, m3, m9
12700    paddd           m3, m9
12701    shufps          m3, m10, 10001000b
12702    psrldq          m9, m3, 4
12703    psubd           m10, m3, m9
12704    paddd           m3, m9
12705    shufps          m3, m10, 10001000b
12706
12707    SUMSUB_BA d, 0, 1, 9
12708    SUMSUB_BA d, 2, 3, 9
12709    SUMSUB_BA d, 0, 2, 9
12710    SUMSUB_BA d, 1, 3, 9
12711
12712    pmaddwd         m4, m13
12713    pmaddwd         m5, m13
12714    pmaddwd         m6, m13
12715    pmaddwd         m7, m13
12716
12717    psrldq          m9, m4, 4
12718    psubd           m10, m4, m9
12719    paddd           m4, m9
12720    shufps          m4, m10, 10001000b
12721    psrldq          m9, m4, 4
12722    psubd           m10, m4, m9
12723    paddd           m4, m9
12724    shufps          m4, m10, 10001000b
12725
12726    psrldq          m9, m5, 4
12727    psubd           m10, m5, m9
12728    paddd           m5, m9
12729    shufps          m5, m10, 10001000b
12730    psrldq          m9, m5, 4
12731    psubd           m10, m5, m9
12732    paddd           m5, m9
12733    shufps          m5, m10, 10001000b
12734
12735    psrldq          m9, m6, 4
12736    psubd           m10, m6, m9
12737    paddd           m6, m9
12738    shufps          m6, m10, 10001000b
12739    psrldq          m9, m6, 4
12740    psubd           m10, m6, m9
12741    paddd           m6, m9
12742    shufps          m6, m10, 10001000b
12743
12744    psrldq          m9, m7, 4
12745    psubd           m10, m7, m9
12746    paddd           m7, m9
12747    shufps          m7, m10, 10001000b
12748    psrldq          m9, m7, 4
12749    psubd           m10, m7, m9
12750    paddd           m7, m9
12751    shufps          m7, m10, 10001000b
12752
12753    SUMSUB_BA d, 4, 5, 9
12754    SUMSUB_BA d, 6, 7, 9
12755    SUMSUB_BA d, 4, 6, 9
12756    SUMSUB_BA d, 5, 7, 9
12757
12758    SUMSUB_BA d, 0, 4, 9
12759    SUMSUB_BA d, 1, 5, 9
12760    SUMSUB_BA d, 2, 6, 9
12761    SUMSUB_BA d, 3, 7, 9
12762
12763    pabsd           m0, m0
12764    pabsd           m2, m2
12765    pabsd           m1, m1
12766    pabsd           m3, m3
12767    pabsd           m4, m4
12768    pabsd           m5, m5
12769    pabsd           m6, m6
12770    pabsd           m7, m7
12771
12772    paddd           m0, m2
12773    paddd           m1, m3
12774    paddd           m0, m1
12775    paddd           m5, m4
12776    paddd           m0, m5
12777    paddd           m7, m6
12778    paddd           m11, m0, m7
12779
12780    movu            m0, [r2]
12781    movu            m1, [r2 + r3]
12782    movu            m2, [r2 + r3 * 2]
12783    movu            m3, [r2 + r6]
12784
12785    pmaddwd         m0, m14
12786    pmaddwd         m1, m14
12787    pmaddwd         m2, m14
12788    pmaddwd         m3, m14
12789
12790    psrldq          m9, m0, 4
12791    psubd           m10, m0, m9
12792    paddd           m0, m9
12793    shufps          m0, m10, 10001000b
12794    psrldq          m9, m0, 4
12795    psubd           m10, m0, m9
12796    paddd           m0, m9
12797    shufps          m0, m10, 10001000b
12798
12799    psrldq          m9, m1, 4
12800    psubd           m10, m1, m9
12801    paddd           m1, m9
12802    shufps          m1, m10, 10001000b
12803    psrldq          m9, m1, 4
12804    psubd           m10, m1, m9
12805    paddd           m1, m9
12806    shufps          m1, m10, 10001000b
12807
12808    psrldq          m9, m2, 4
12809    psubd           m10, m2, m9
12810    paddd           m2, m9
12811    shufps          m2, m10, 10001000b
12812    psrldq          m9, m2, 4
12813    psubd           m10, m2, m9
12814    paddd           m2, m9
12815    shufps          m2, m10, 10001000b
12816
12817    psrldq          m9, m3, 4
12818    psubd           m10, m3, m9
12819    paddd           m3, m9
12820    shufps          m3, m10, 10001000b
12821    psrldq          m9, m3, 4
12822    psubd           m10, m3, m9
12823    paddd           m3, m9
12824    shufps          m3, m10, 10001000b
12825
12826    SUMSUB_BA d, 0, 1, 9
12827    SUMSUB_BA d, 2, 3, 9
12828    SUMSUB_BA d, 0, 2, 9
12829    SUMSUB_BA d, 1, 3, 9
12830
12831    movu            m4, [r5]
12832    movu            m5, [r5 + r3]
12833    movu            m6, [r5 + r3 * 2]
12834    movu            m7, [r5 + r6]
12835
12836    pmaddwd         m4, m14
12837    pmaddwd         m5, m14
12838    pmaddwd         m6, m14
12839    pmaddwd         m7, m14
12840
12841    psrldq          m9, m4, 4
12842    psubd           m10, m4, m9
12843    paddd           m4, m9
12844    shufps          m4, m10, 10001000b
12845    psrldq          m9, m4, 4
12846    psubd           m10, m4, m9
12847    paddd           m4, m9
12848    shufps          m4, m10, 10001000b
12849
12850    psrldq          m9, m5, 4
12851    psubd           m10, m5, m9
12852    paddd           m5, m9
12853    shufps          m5, m10, 10001000b
12854    psrldq          m9, m5, 4
12855    psubd           m10, m5, m9
12856    paddd           m5, m9
12857    shufps          m5, m10, 10001000b
12858
12859    psrldq          m9, m6, 4
12860    psubd           m10, m6, m9
12861    paddd           m6, m9
12862    shufps          m6, m10, 10001000b
12863    psrldq          m9, m6, 4
12864    psubd           m10, m6, m9
12865    paddd           m6, m9
12866    shufps          m6, m10, 10001000b
12867
12868    psrldq          m9, m7, 4
12869    psubd           m10, m7, m9
12870    paddd           m7, m9
12871    shufps          m7, m10, 10001000b
12872    psrldq          m9, m7, 4
12873    psubd           m10, m7, m9
12874    paddd           m7, m9
12875    shufps          m7, m10, 10001000b
12876
12877    SUMSUB_BA d, 4, 5, 9
12878    SUMSUB_BA d, 6, 7, 9
12879    SUMSUB_BA d, 4, 6, 9
12880    SUMSUB_BA d, 5, 7, 9
12881
12882    SUMSUB_BA d, 0, 4, 9
12883    SUMSUB_BA d, 1, 5, 9
12884    SUMSUB_BA d, 2, 6, 9
12885    SUMSUB_BA d, 3, 7, 9
12886
12887    pabsd           m0, m0
12888    pabsd           m2, m2
12889    pabsd           m1, m1
12890    pabsd           m3, m3
12891    pabsd           m4, m4
12892    pabsd           m5, m5
12893    pabsd           m6, m6
12894    pabsd           m7, m7
12895
12896    paddd           m0, m2
12897    paddd           m1, m3
12898    paddd           m0, m1
12899    paddd           m5, m4
12900    paddd           m0, m5
12901    paddd           m7, m6
12902    paddd           m0, m7
12903    paddd           m0, m11
12904
12905    movhlps         m1, m0
12906    paddd           m0, m1
12907    psrldq          m1, m0, 4
12908    paddd           m0, m1
12909    paddd           m0, [pd_2]
12910    psrld           m0, 2
12911    psubd           m0, m8
12912
12913    psubd           m12, m0
12914    pabsd           m0, m12
12915    paddd           m15, m0
12916%endmacro
12917
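; The larger SSE4 block sizes loop psy_cost_ss over 8x8 tiles: r8d counts
; tiles across a row (stepping r0/r2 by 16 bytes = 8 samples), r7d counts
; rows of tiles (stepping down 8 lines and rewinding the width).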
12918%if ARCH_X86_64
12919INIT_XMM sse4
12920cglobal psyCost_ss_16x16, 4, 9, 16
12921
12922    mova            m13, [pw_pmpmpmpm]
12923    mova            m14, [pw_1]
12924    add             r1, r1
12925    add             r3, r3
12926    lea             r4, [3 * r1]
12927    lea             r6, [3 * r3]
12928    pxor            m15, m15
12929    mov             r7d, 2
12930.loopH:
12931    mov             r8d, 2
12932.loopW:
12933    psy_cost_ss
12934    add             r0, 16
12935    add             r2, 16
12936    dec             r8d
12937    jnz             .loopW
12938    lea             r0, [r0 + r1 * 8 - 32]
12939    lea             r2, [r2 + r3 * 8 - 32]
12940    dec             r7d
12941    jnz             .loopH
12942    movd            eax, m15
12943    RET
12944%endif
12945
12946%if ARCH_X86_64
12947INIT_XMM sse4
12948cglobal psyCost_ss_32x32, 4, 9, 16
12949
12950    mova            m13, [pw_pmpmpmpm]
12951    mova            m14, [pw_1]
12952    add             r1, r1
12953    add             r3, r3
12954    lea             r4, [3 * r1]
12955    lea             r6, [3 * r3]
12956    pxor            m15, m15
12957    mov             r7d, 4
12958.loopH:
12959    mov             r8d, 4
12960.loopW:
12961    psy_cost_ss
12962    add             r0, 16
12963    add             r2, 16
12964    dec             r8d
12965    jnz             .loopW
12966    lea             r0, [r0 + r1 * 8 - 64]
12967    lea             r2, [r2 + r3 * 8 - 64]
12968    dec             r7d
12969    jnz             .loopH
12970    movd            eax, m15
12971    RET
12972%endif
12973
12974%if ARCH_X86_64
12975INIT_XMM sse4
12976cglobal psyCost_ss_64x64, 4, 9, 16
12977
12978    mova            m13, [pw_pmpmpmpm]
12979    mova            m14, [pw_1]
12980    add             r1, r1
12981    add             r3, r3
12982    lea             r4, [3 * r1]
12983    lea             r6, [3 * r3]
12984    pxor            m15, m15
12985    mov             r7d, 8
12986.loopH:
12987    mov             r8d, 8
12988.loopW:
12989    psy_cost_ss
12990    add             r0, 16
12991    add             r2, 16
12992    dec             r8d
12993    jnz             .loopW
12994    lea             r0, [r0 + r1 * 8 - 128]
12995    lea             r2, [r2 + r3 * 8 - 128]
12996    dec             r7d
12997    jnz             .loopH
12998    movd            eax, m15
12999    RET
13000%endif
13001
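; AVX2 4x4: both blocks are handled in a single pass by packing the r0 rows
; into the low 128-bit lane and the r2 rows into the high lane of each ymm
; register; the two energies are then differenced across lanes at the end.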
13002INIT_YMM avx2
13003cglobal psyCost_ss_4x4, 4, 5, 8
13004    add             r1, r1
13005    add             r3, r3
13006    lea             r4, [3 * r1]
13007    movddup         m0, [r0]
13008    movddup         m1, [r0 + r1]
13009    movddup         m2, [r0 + r1 * 2]
13010    movddup         m3, [r0 + r4]
13011
13012    lea             r4, [3 * r3]
13013    movddup         m4, [r2]
13014    movddup         m5, [r2 + r3]
13015    movddup         m6, [r2 + r3 * 2]
13016    movddup         m7, [r2 + r4]
13017
13018    vinserti128     m0, m0, xm4, 1
13019    vinserti128     m1, m1, xm5, 1
13020    vinserti128     m2, m2, xm6, 1
13021    vinserti128     m3, m3, xm7, 1
13022
13023    pabsw           m4, m0
13024    pabsw           m5, m1
13025    paddw           m5, m4
13026    pabsw           m4, m2
13027    paddw           m5, m4
13028    pabsw           m4, m3
13029    paddw           m5, m4
13030    pmaddwd         m5, [pw_1]
13031    psrldq          m4, m5, 4
13032    paddd           m5, m4
13033    psrld           m6, m5, 2
13034
13035    mova            m4, [hmul_8w]
13036    pmaddwd         m0, m4
13037    pmaddwd         m1, m4
13038    pmaddwd         m2, m4
13039    pmaddwd         m3, m4
13040
13041    psrldq          m4, m0, 4
13042    psubd           m5, m0, m4
13043    paddd           m0, m4
13044    shufps          m0, m0, m5, 10001000b
13045
13046    psrldq          m4, m1, 4
13047    psubd           m5, m1, m4
13048    paddd           m1, m4
13049    shufps          m1, m1, m5, 10001000b
13050
13051    psrldq          m4, m2, 4
13052    psubd           m5, m2, m4
13053    paddd           m2, m4
13054    shufps          m2, m2, m5, 10001000b
13055
13056    psrldq          m4, m3, 4
13057    psubd           m5, m3, m4
13058    paddd           m3, m4
13059    shufps          m3, m3, m5, 10001000b
13060
13061    mova            m4, m0
13062    paddd           m0, m1
13063    psubd           m1, m4
13064    mova            m4, m2
13065    paddd           m2, m3
13066    psubd           m3, m4
13067    mova            m4, m0
13068    paddd           m0, m2
13069    psubd           m2, m4
13070    mova            m4, m1
13071    paddd           m1, m3
13072    psubd           m3, m4
13073
13074    pabsd           m0, m0
13075    pabsd           m2, m2
13076    pabsd           m1, m1
13077    pabsd           m3, m3
13078    paddd           m0, m2
13079    paddd           m1, m3
13080    paddd           m0, m1
13081    psrldq          m1, m0, 8
13082    paddd           m0, m1
13083    psrldq          m1, m0, 4
13084    paddd           m0, m1
13085    psrld           m0, 1
13086    psubd           m0, m6
13087    vextracti128    xm1, m0, 1
13088    psubd           m0, m1
13089    pabsd           m0, m0
13090    movd            eax, xm0
13091    RET
13092
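; AVX2 8x8 kernel: same computation as the SSE4 version, with the r0 block in
; the low 128-bit lane and the r2 block in the high lane so one transform pass
; covers both; the final vextracti128/psubd/pabsd leaves |energy(r0) -
; energy(r2)| in xm0. Callers provide m12 = [pw_1], m13 = [pw_pmpmpmpm] and
; 8 * mmsize bytes of aligned scratch at rsp.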
13093%macro PSY_SS_8x8 0
13094    lea             r4, [3 * r1]
13095    lea             r6, [r0 + r1 * 4]
13096    movu            xm0, [r0]
13097    movu            xm1, [r0 + r1]
13098    movu            xm2, [r0 + r1 * 2]
13099    movu            xm3, [r0 + r4]
13100    movu            xm4, [r6]
13101    movu            xm5, [r6 + r1]
13102    movu            xm6, [r6 + r1 * 2]
13103    movu            xm7, [r6 + r4]
13104
13105    lea             r4, [3 * r3]
13106    lea             r6, [r2 + r3 * 4]
13107    movu            xm8, [r2]
13108    movu            xm9, [r2 + r3]
13109    movu            xm10, [r2 + r3 * 2]
13110    movu            xm11, [r2 + r4]
13111    vinserti128     m0, m0, xm8, 1
13112    vinserti128     m1, m1, xm9, 1
13113    vinserti128     m2, m2, xm10, 1
13114    vinserti128     m3, m3, xm11, 1
13115    movu            xm8, [r6]
13116    movu            xm9, [r6 + r3]
13117    movu            xm10, [r6 + r3 * 2]
13118    movu            xm11, [r6 + r4]
13119    vinserti128     m4, m4, xm8, 1
13120    vinserti128     m5, m5, xm9, 1
13121    vinserti128     m6, m6, xm10, 1
13122    vinserti128     m7, m7, xm11, 1
13123
    ;; spill the eight input rows; they are reloaded below for the pw_1 (m12) pass
13125    mova            [rsp + 0 * mmsize], m0
13126    mova            [rsp + 1 * mmsize], m1
13127    mova            [rsp + 2 * mmsize], m2
13128    mova            [rsp + 3 * mmsize], m3
13129    mova            [rsp + 4 * mmsize], m4
13130    mova            [rsp + 5 * mmsize], m5
13131    mova            [rsp + 6 * mmsize], m6
13132    mova            [rsp + 7 * mmsize], m7
13133
13134    pabsw           m8, m0
13135    pabsw           m9, m1
13136    paddw           m8, m9
13137    pabsw           m10, m2
13138    pabsw           m11, m3
13139    paddw           m10, m11
13140    paddw           m8, m10
13141    pabsw           m9, m4
13142    pabsw           m10, m5
13143    paddw           m9, m10
13144    pabsw           m11, m6
13145    pabsw           m10, m7
13146    paddw           m11, m10
13147    paddw           m9, m11
13148    paddw           m8, m9
13149    psrldq          m9, m8, 8
13150
13151    vextracti128    xm10, m8, 1
13152    vextracti128    xm11, m9, 1
13153
13154    vpmovzxwd       m8, xm8
13155    vpmovzxwd       m9, xm9
13156    vpmovzxwd       m10, xm10
13157    vpmovzxwd       m11, xm11
13158
13159    vinserti128     m8, m8, xm10, 1
13160    vinserti128     m9, m9, xm11, 1
13161
13162    paddd           m8, m9
13163    psrldq          m9, m8, 8
13164    paddd           m8, m9
13165    psrldq          m9, m8, 4
13166    paddd           m8, m9
    psrld           m8, 2       ; abs-sum of the 8x8 block >> 2, per 128-bit lane (low = r0, high = r2)
13168
13169    pmaddwd         m0, m13
13170    pmaddwd         m1, m13
13171    pmaddwd         m2, m13
13172    pmaddwd         m3, m13
13173
13174    psrldq          m9, m0, 4
13175    psubd           m10, m0, m9
13176    paddd           m0, m9
13177    vshufps         m0, m0, m10, 10001000b
13178    psrldq          m9, m0, 4
13179    psubd           m10, m0, m9
13180    paddd           m0, m9
13181    vshufps         m0, m0, m10, 10001000b
13182
13183    psrldq          m9, m1, 4
13184    psubd           m10, m1, m9
13185    paddd           m1, m9
13186    vshufps         m1, m1, m10, 10001000b
13187    psrldq          m9, m1, 4
13188    psubd           m10, m1, m9
13189    paddd           m1, m9
13190    vshufps         m1, m1, m10, 10001000b
13191
13192    psrldq          m9, m2, 4
13193    psubd           m10, m2, m9
13194    paddd           m2, m9
13195    vshufps         m2, m2, m10, 10001000b
13196    psrldq          m9, m2, 4
13197    psubd           m10, m2, m9
13198    paddd           m2, m9
13199    vshufps         m2, m2, m10, 10001000b
13200
13201    psrldq          m9, m3, 4
13202    psubd           m10, m3, m9
13203    paddd           m3, m9
13204    vshufps         m3, m3, m10, 10001000b
13205    psrldq          m9, m3, 4
13206    psubd           m10, m3, m9
13207    paddd           m3, m9
13208    vshufps         m3, m3, m10, 10001000b
13209
13210    SUMSUB_BA d, 0, 1, 9
13211    SUMSUB_BA d, 2, 3, 9
13212    SUMSUB_BA d, 0, 2, 9
13213    SUMSUB_BA d, 1, 3, 9
13214
13215    pmaddwd         m4, m13
13216    pmaddwd         m5, m13
13217    pmaddwd         m6, m13
13218    pmaddwd         m7, m13
13219
13220    psrldq          m9, m4, 4
13221    psubd           m10, m4, m9
13222    paddd           m4, m9
13223    vshufps         m4, m4, m10, 10001000b
13224    psrldq          m9, m4, 4
13225    psubd           m10, m4, m9
13226    paddd           m4, m9
13227    vshufps         m4, m4, m10, 10001000b
13228
13229    psrldq          m9, m5, 4
13230    psubd           m10, m5, m9
13231    paddd           m5, m9
13232    vshufps         m5, m5, m10, 10001000b
13233    psrldq          m9, m5, 4
13234    psubd           m10, m5, m9
13235    paddd           m5, m9
13236    vshufps         m5, m5, m10, 10001000b
13237
13238    psrldq          m9, m6, 4
13239    psubd           m10, m6, m9
13240    paddd           m6, m9
13241    vshufps         m6, m6, m10, 10001000b
13242    psrldq          m9, m6, 4
13243    psubd           m10, m6, m9
13244    paddd           m6, m9
13245    vshufps         m6, m6, m10, 10001000b
13246
13247    psrldq          m9, m7, 4
13248    psubd           m10, m7, m9
13249    paddd           m7, m9
13250    vshufps         m7, m7, m10, 10001000b
13251    psrldq          m9, m7, 4
13252    psubd           m10, m7, m9
13253    paddd           m7, m9
13254    vshufps         m7, m7, m10, 10001000b
13255
13256    SUMSUB_BA d, 4, 5, 9
13257    SUMSUB_BA d, 6, 7, 9
13258    SUMSUB_BA d, 4, 6, 9
13259    SUMSUB_BA d, 5, 7, 9
13260
13261    SUMSUB_BA d, 0, 4, 9
13262    SUMSUB_BA d, 1, 5, 9
13263    SUMSUB_BA d, 2, 6, 9
13264    SUMSUB_BA d, 3, 7, 9
13265
13266    pabsd           m0, m0
13267    pabsd           m2, m2
13268    pabsd           m1, m1
13269    pabsd           m3, m3
13270    pabsd           m4, m4
13271    pabsd           m5, m5
13272    pabsd           m6, m6
13273    pabsd           m7, m7
13274
13275    paddd           m0, m2
13276    paddd           m1, m3
13277    paddd           m0, m1
13278    paddd           m5, m4
13279    paddd           m0, m5
13280    paddd           m7, m6
13281    paddd           m11, m0, m7
13282
13283    pmaddwd         m0, m12, [rsp + 0 * mmsize]
13284    pmaddwd         m1, m12, [rsp + 1 * mmsize]
13285    pmaddwd         m2, m12, [rsp + 2 * mmsize]
13286    pmaddwd         m3, m12, [rsp + 3 * mmsize]
13287
13288    psrldq          m9, m0, 4
13289    psubd           m10, m0, m9
13290    paddd           m0, m9
13291    vshufps         m0, m0, m10, 10001000b
13292    psrldq          m9, m0, 4
13293    psubd           m10, m0, m9
13294    paddd           m0, m9
13295    vshufps         m0, m0, m10, 10001000b
13296
13297    psrldq          m9, m1, 4
13298    psubd           m10, m1, m9
13299    paddd           m1, m9
13300    vshufps         m1, m1, m10, 10001000b
13301    psrldq          m9, m1, 4
13302    psubd           m10, m1, m9
13303    paddd           m1, m9
13304    vshufps         m1, m1, m10, 10001000b
13305
13306    psrldq          m9, m2, 4
13307    psubd           m10, m2, m9
13308    paddd           m2, m9
13309    vshufps         m2, m2, m10, 10001000b
13310    psrldq          m9, m2, 4
13311    psubd           m10, m2, m9
13312    paddd           m2, m9
13313    vshufps         m2, m2, m10, 10001000b
13314
13315    psrldq          m9, m3, 4
13316    psubd           m10, m3, m9
13317    paddd           m3, m9
13318    vshufps         m3, m3, m10, 10001000b
13319    psrldq          m9, m3, 4
13320    psubd           m10, m3, m9
13321    paddd           m3, m9
13322    vshufps         m3, m3, m10, 10001000b
13323
13324    SUMSUB_BA d, 0, 1, 9
13325    SUMSUB_BA d, 2, 3, 9
13326    SUMSUB_BA d, 0, 2, 9
13327    SUMSUB_BA d, 1, 3, 9
13328
13329    pmaddwd         m4, m12, [rsp + 4 * mmsize]
13330    pmaddwd         m5, m12, [rsp + 5 * mmsize]
13331    pmaddwd         m6, m12, [rsp + 6 * mmsize]
13332    pmaddwd         m7, m12, [rsp + 7 * mmsize]
13333
13334    psrldq          m9, m4, 4
13335    psubd           m10, m4, m9
13336    paddd           m4, m9
13337    vshufps         m4, m4, m10, 10001000b
13338    psrldq          m9, m4, 4
13339    psubd           m10, m4, m9
13340    paddd           m4, m9
13341    vshufps         m4, m4, m10, 10001000b
13342
13343    psrldq          m9, m5, 4
13344    psubd           m10, m5, m9
13345    paddd           m5, m9
13346    vshufps         m5, m5, m10, 10001000b
13347    psrldq          m9, m5, 4
13348    psubd           m10, m5, m9
13349    paddd           m5, m9
13350    vshufps         m5, m5, m10, 10001000b
13351
13352    psrldq          m9, m6, 4
13353    psubd           m10, m6, m9
13354    paddd           m6, m9
13355    vshufps         m6, m6, m10, 10001000b
13356    psrldq          m9, m6, 4
13357    psubd           m10, m6, m9
13358    paddd           m6, m9
13359    vshufps         m6, m6, m10, 10001000b
13360
13361    psrldq          m9, m7, 4
13362    psubd           m10, m7, m9
13363    paddd           m7, m9
13364    vshufps         m7, m7, m10, 10001000b
13365    psrldq          m9, m7, 4
13366    psubd           m10, m7, m9
13367    paddd           m7, m9
13368    vshufps         m7, m7, m10, 10001000b
13369
13370    SUMSUB_BA d, 4, 5, 9
13371    SUMSUB_BA d, 6, 7, 9
13372    SUMSUB_BA d, 4, 6, 9
13373    SUMSUB_BA d, 5, 7, 9
13374
13375    SUMSUB_BA d, 0, 4, 9
13376    SUMSUB_BA d, 1, 5, 9
13377    SUMSUB_BA d, 2, 6, 9
13378    SUMSUB_BA d, 3, 7, 9
13379
13380    pabsd           m0, m0
13381    pabsd           m2, m2
13382    pabsd           m1, m1
13383    pabsd           m3, m3
13384    pabsd           m4, m4
13385    pabsd           m5, m5
13386    pabsd           m6, m6
13387    pabsd           m7, m7
13388
13389    paddd           m0, m2
13390    paddd           m1, m3
13391    paddd           m0, m1
13392    paddd           m5, m4
13393    paddd           m0, m5
13394    paddd           m7, m6
13395    paddd           m0, m7
13396    paddd           m0, m11
13397
13398    psrldq          m1, m0, 8
13399    paddd           m0, m1
13400    psrldq          m1, m0, 4
13401    paddd           m0, m1
13402    paddd           m0, [pd_2]
13403    psrld           m0, 2
13404    psubd           m0, m8
13405    vextracti128    xm1, m0, 1
13406    psubd           m0, m1
13407    pabsd           m0, m0
13408%endmacro
13409
13410%if ARCH_X86_64
13411INIT_YMM avx2
13412cglobal psyCost_ss_8x8, 4, 7, 14
    ; NOTE: align the stack to 64 bytes so that none of the 32-byte spills below crosses a cache-line boundary
13414    mov             r5, rsp
13415    sub             rsp, 8*mmsize
13416    and             rsp, ~63
13417
13418    mova            m12, [pw_1]
13419    mova            m13, [pw_pmpmpmpm]
13420    add             r1, r1
13421    add             r3, r3
13422
13423    PSY_SS_8x8
13424
13425    movd            eax, xm0
13426    mov             rsp, r5
13427    RET
13428%endif
13429
13430%if ARCH_X86_64
13431INIT_YMM avx2
13432cglobal psyCost_ss_16x16, 4, 9, 15
    ; NOTE: align the stack to 64 bytes so that none of the 32-byte spills below crosses a cache-line boundary
13434    mov             r5, rsp
13435    sub             rsp, 8*mmsize
13436    and             rsp, ~63
13437
13438    mova            m12, [pw_1]
13439    mova            m13, [pw_pmpmpmpm]
13440    add             r1, r1
13441    add             r3, r3
13442    pxor            m14, m14
13443
13444    mov             r7d, 2
13445.loopH:
13446    mov             r8d, 2
13447.loopW:
13448    PSY_SS_8x8
13449
13450    paddd           m14, m0
13451    add             r0, 16
13452    add             r2, 16
13453    dec             r8d
13454    jnz             .loopW
13455    lea             r0, [r0 + r1 * 8 - 32]
13456    lea             r2, [r2 + r3 * 8 - 32]
13457    dec             r7d
13458    jnz             .loopH
13459    movd            eax, xm14
13460    mov             rsp, r5
13461    RET
13462%endif
13463
13464%if ARCH_X86_64
13465INIT_YMM avx2
13466cglobal psyCost_ss_32x32, 4, 9, 15
    ; NOTE: align the stack to 64 bytes so that none of the 32-byte spills below crosses a cache-line boundary
13468    mov             r5, rsp
13469    sub             rsp, 8*mmsize
13470    and             rsp, ~63
13471
13472    mova            m12, [pw_1]
13473    mova            m13, [pw_pmpmpmpm]
13474    add             r1, r1
13475    add             r3, r3
13476    pxor            m14, m14
13477
13478    mov             r7d, 4
13479.loopH:
13480    mov             r8d, 4
13481.loopW:
13482    PSY_SS_8x8
13483
13484    paddd           m14, m0
13485    add             r0, 16
13486    add             r2, 16
13487    dec             r8d
13488    jnz             .loopW
13489    lea             r0, [r0 + r1 * 8 - 64]
13490    lea             r2, [r2 + r3 * 8 - 64]
13491    dec             r7d
13492    jnz             .loopH
13493    movd            eax, xm14
13494    mov             rsp, r5
13495    RET
13496%endif
13497
13498%if ARCH_X86_64
13499INIT_YMM avx2
13500cglobal psyCost_ss_64x64, 4, 9, 15
    ; NOTE: align the stack to 64 bytes so that none of the 32-byte spills below crosses a cache-line boundary
13502    mov             r5, rsp
13503    sub             rsp, 8*mmsize
13504    and             rsp, ~63
13505
13506    mova            m12, [pw_1]
13507    mova            m13, [pw_pmpmpmpm]
13508    add             r1, r1
13509    add             r3, r3
13510    pxor            m14, m14
13511
13512    mov             r7d, 8
13513.loopH:
13514    mov             r8d, 8
13515.loopW:
13516    PSY_SS_8x8
13517
13518    paddd           m14, m0
13519    add             r0, 16
13520    add             r2, 16
13521    dec             r8d
13522    jnz             .loopW
13523    lea             r0, [r0 + r1 * 8 - 128]
13524    lea             r2, [r2 + r3 * 8 - 128]
13525    dec             r7d
13526    jnz             .loopH
13527    movd            eax, xm14
13528    mov             rsp, r5
13529    RET
13530%endif
13531
13532;;---------------------------------------------------------------
13533;; SATD AVX2
13534;; int pixel_satd(const pixel*, intptr_t, const pixel*, intptr_t)
13535;;---------------------------------------------------------------
13536;; r0   - pix0
13537;; r1   - pix0Stride
13538;; r2   - pix1
13539;; r3   - pix1Stride
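;;
;; For reference, the scalar computation these kernels vectorize is roughly
;; the sketch below (illustrative only; the function name and the final >> 1
;; scaling are assumptions, not code taken from this project):
;;
;;   static int satd_4x4_ref(const pixel *p0, intptr_t s0,
;;                           const pixel *p1, intptr_t s1)
;;   {
;;       int d[16], h[16], sum = 0;
;;       for (int i = 0; i < 4; i++, p0 += s0, p1 += s1)    /* differences */
;;           for (int j = 0; j < 4; j++)
;;               d[i * 4 + j] = p0[j] - p1[j];
;;       for (int i = 0; i < 4; i++)          /* horizontal 4-pt Hadamard */
;;       {
;;           int a = d[i*4+0] + d[i*4+1], b = d[i*4+0] - d[i*4+1];
;;           int c = d[i*4+2] + d[i*4+3], e = d[i*4+2] - d[i*4+3];
;;           h[i*4+0] = a + c; h[i*4+1] = a - c;
;;           h[i*4+2] = b + e; h[i*4+3] = b - e;
;;       }
;;       for (int j = 0; j < 4; j++)          /* vertical pass + abs-sum */
;;       {
;;           int a = h[0*4+j] + h[1*4+j], b = h[0*4+j] - h[1*4+j];
;;           int c = h[2*4+j] + h[3*4+j], e = h[2*4+j] - h[3*4+j];
;;           sum += abs(a + c) + abs(a - c) + abs(b + e) + abs(b - e);
;;       }
;;       return sum >> 1;
;;   }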
13540
13541%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
13542INIT_YMM avx2
13543cglobal calc_satd_16x8    ; function to compute satd cost for 16 columns, 8 rows
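    ; caller contract (as used below, an inferred convention): r4 and r5 hold
    ; 3 * r1 and 3 * r3, m7 holds the +1/-1 pmaddubsw multiplier mask, and the
    ; per-lane sums are accumulated into m8/m9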
13544    pxor                m6, m6
13545    vbroadcasti128      m0, [r0]
13546    vbroadcasti128      m4, [r2]
13547    vbroadcasti128      m1, [r0 + r1]
13548    vbroadcasti128      m5, [r2 + r3]
13549    pmaddubsw           m4, m7
13550    pmaddubsw           m0, m7
13551    pmaddubsw           m5, m7
13552    pmaddubsw           m1, m7
13553    psubw               m0, m4
13554    psubw               m1, m5
13555    vbroadcasti128      m2, [r0 + r1 * 2]
13556    vbroadcasti128      m4, [r2 + r3 * 2]
13557    vbroadcasti128      m3, [r0 + r4]
13558    vbroadcasti128      m5, [r2 + r5]
13559    pmaddubsw           m4, m7
13560    pmaddubsw           m2, m7
13561    pmaddubsw           m5, m7
13562    pmaddubsw           m3, m7
13563    psubw               m2, m4
13564    psubw               m3, m5
13565    lea                 r0, [r0 + r1 * 4]
13566    lea                 r2, [r2 + r3 * 4]
13567    paddw               m4, m0, m1
13568    psubw               m1, m1, m0
13569    paddw               m0, m2, m3
13570    psubw               m3, m2
13571    paddw               m2, m4, m0
13572    psubw               m0, m4
13573    paddw               m4, m1, m3
13574    psubw               m3, m1
13575    pabsw               m2, m2
13576    pabsw               m0, m0
13577    pabsw               m4, m4
13578    pabsw               m3, m3
13579    pblendw             m1, m2, m0, 10101010b
13580    pslld               m0, 16
13581    psrld               m2, 16
13582    por                 m0, m2
13583    pmaxsw              m1, m0
13584    paddw               m6, m1
13585    pblendw             m2, m4, m3, 10101010b
13586    pslld               m3, 16
13587    psrld               m4, 16
13588    por                 m3, m4
13589    pmaxsw              m2, m3
13590    paddw               m6, m2
13591    vbroadcasti128      m1, [r0]
13592    vbroadcasti128      m4, [r2]
13593    vbroadcasti128      m2, [r0 + r1]
13594    vbroadcasti128      m5, [r2 + r3]
13595    pmaddubsw           m4, m7
13596    pmaddubsw           m1, m7
13597    pmaddubsw           m5, m7
13598    pmaddubsw           m2, m7
13599    psubw               m1, m4
13600    psubw               m2, m5
13601    vbroadcasti128      m0, [r0 + r1 * 2]
13602    vbroadcasti128      m4, [r2 + r3 * 2]
13603    vbroadcasti128      m3, [r0 + r4]
13604    vbroadcasti128      m5, [r2 + r5]
13605    lea                 r0, [r0 + r1 * 4]
13606    lea                 r2, [r2 + r3 * 4]
13607    pmaddubsw           m4, m7
13608    pmaddubsw           m0, m7
13609    pmaddubsw           m5, m7
13610    pmaddubsw           m3, m7
13611    psubw               m0, m4
13612    psubw               m3, m5
13613    paddw               m4, m1, m2
13614    psubw               m2, m1
13615    paddw               m1, m0, m3
13616    psubw               m3, m0
13617    paddw               m0, m4, m1
13618    psubw               m1, m4
13619    paddw               m4, m2, m3
13620    psubw               m3, m2
13621    pabsw               m0, m0
13622    pabsw               m1, m1
13623    pabsw               m4, m4
13624    pabsw               m3, m3
13625    pblendw             m2, m0, m1, 10101010b
13626    pslld               m1, 16
13627    psrld               m0, 16
13628    por                 m1, m0
13629    pmaxsw              m2, m1
13630    paddw               m6, m2
13631    pblendw             m0, m4, m3, 10101010b
13632    pslld               m3, 16
13633    psrld               m4, 16
13634    por                 m3, m4
13635    pmaxsw              m0, m3
13636    paddw               m6, m0
13637    vextracti128        xm0, m6, 1
13638    pmovzxwd            m6, xm6
13639    pmovzxwd            m0, xm0
13640    paddd               m8, m6
13641    paddd               m9, m0
13642    ret
13643
13644cglobal calc_satd_16x4    ; function to compute satd cost for 16 columns, 4 rows
13645    pxor                m6, m6
13646    vbroadcasti128      m0, [r0]
13647    vbroadcasti128      m4, [r2]
13648    vbroadcasti128      m1, [r0 + r1]
13649    vbroadcasti128      m5, [r2 + r3]
13650    pmaddubsw           m4, m7
13651    pmaddubsw           m0, m7
13652    pmaddubsw           m5, m7
13653    pmaddubsw           m1, m7
13654    psubw               m0, m4
13655    psubw               m1, m5
13656    vbroadcasti128      m2, [r0 + r1 * 2]
13657    vbroadcasti128      m4, [r2 + r3 * 2]
13658    vbroadcasti128      m3, [r0 + r4]
13659    vbroadcasti128      m5, [r2 + r5]
13660    pmaddubsw           m4, m7
13661    pmaddubsw           m2, m7
13662    pmaddubsw           m5, m7
13663    pmaddubsw           m3, m7
13664    psubw               m2, m4
13665    psubw               m3, m5
13666    paddw               m4, m0, m1
13667    psubw               m1, m1, m0
13668    paddw               m0, m2, m3
13669    psubw               m3, m2
13670    paddw               m2, m4, m0
13671    psubw               m0, m4
13672    paddw               m4, m1, m3
13673    psubw               m3, m1
13674    pabsw               m2, m2
13675    pabsw               m0, m0
13676    pabsw               m4, m4
13677    pabsw               m3, m3
13678    pblendw             m1, m2, m0, 10101010b
13679    pslld               m0, 16
13680    psrld               m2, 16
13681    por                 m0, m2
13682    pmaxsw              m1, m0
13683    paddw               m6, m1
13684    pblendw             m2, m4, m3, 10101010b
13685    pslld               m3, 16
13686    psrld               m4, 16
13687    por                 m3, m4
13688    pmaxsw              m2, m3
13689    paddw               m6, m2
13690    vextracti128        xm0, m6, 1
13691    pmovzxwd            m6, xm6
13692    pmovzxwd            m0, xm0
13693    paddd               m8, m6
13694    paddd               m9, m0
13695    ret
13696
13697cglobal pixel_satd_16x4, 4,6,10         ; if WIN64 && cpuflag(avx2)
13698    mova            m7, [hmul_16p]
13699    lea             r4, [3 * r1]
13700    lea             r5, [3 * r3]
13701    pxor            m8, m8
13702    pxor            m9, m9
13703
13704    call            calc_satd_16x4
13705
13706    paddd           m8, m9
13707    vextracti128    xm0, m8, 1
13708    paddd           xm0, xm8
13709    movhlps         xm1, xm0
13710    paddd           xm0, xm1
13711    pshuflw         xm1, xm0, q0032
13712    paddd           xm0, xm1
13713    movd            eax, xm0
13714    RET
13715
13716cglobal pixel_satd_16x12, 4,6,10        ; if WIN64 && cpuflag(avx2)
13717    mova            m7, [hmul_16p]
13718    lea             r4, [3 * r1]
13719    lea             r5, [3 * r3]
13720    pxor            m8, m8
13721    pxor            m9, m9
13722
13723    call            calc_satd_16x8
13724    call            calc_satd_16x4
13725
13726    paddd           m8, m9
13727    vextracti128    xm0, m8, 1
13728    paddd           xm0, xm8
13729    movhlps         xm1, xm0
13730    paddd           xm0, xm1
13731    pshuflw         xm1, xm0, q0032
13732    paddd           xm0, xm1
13733    movd            eax, xm0
13734    RET
13735
13736cglobal pixel_satd_16x32, 4,6,10        ; if WIN64 && cpuflag(avx2)
13737    mova            m7, [hmul_16p]
13738    lea             r4, [3 * r1]
13739    lea             r5, [3 * r3]
13740    pxor            m8, m8
13741    pxor            m9, m9
13742
13743    call            calc_satd_16x8
13744    call            calc_satd_16x8
13745    call            calc_satd_16x8
13746    call            calc_satd_16x8
13747
13748    paddd           m8, m9
13749    vextracti128    xm0, m8, 1
13750    paddd           xm0, xm8
13751    movhlps         xm1, xm0
13752    paddd           xm0, xm1
13753    pshuflw         xm1, xm0, q0032
13754    paddd           xm0, xm1
13755    movd            eax, xm0
13756    RET
13757
13758cglobal pixel_satd_16x64, 4,6,10        ; if WIN64 && cpuflag(avx2)
13759    mova            m7, [hmul_16p]
13760    lea             r4, [3 * r1]
13761    lea             r5, [3 * r3]
13762    pxor            m8, m8
13763    pxor            m9, m9
13764
13765    call            calc_satd_16x8
13766    call            calc_satd_16x8
13767    call            calc_satd_16x8
13768    call            calc_satd_16x8
13769    call            calc_satd_16x8
13770    call            calc_satd_16x8
13771    call            calc_satd_16x8
13772    call            calc_satd_16x8
13773
13774    paddd           m8, m9
13775    vextracti128    xm0, m8, 1
13776    paddd           xm0, xm8
13777    movhlps         xm1, xm0
13778    paddd           xm0, xm1
13779    pshuflw         xm1, xm0, q0032
13780    paddd           xm0, xm1
13781    movd            eax, xm0
13782    RET
13783
13784cglobal pixel_satd_32x8, 4,8,10          ; if WIN64 && cpuflag(avx2)
13785    mova            m7, [hmul_16p]
13786    lea             r4, [3 * r1]
13787    lea             r5, [3 * r3]
13788    pxor            m8, m8
13789    pxor            m9, m9
13790    mov             r6, r0
13791    mov             r7, r2
13792
13793    call            calc_satd_16x8
13794
13795    lea             r0, [r6 + 16]
13796    lea             r2, [r7 + 16]
13797
13798    call            calc_satd_16x8
13799
13800    paddd           m8, m9
13801    vextracti128    xm0, m8, 1
13802    paddd           xm0, xm8
13803    movhlps         xm1, xm0
13804    paddd           xm0, xm1
13805    pshuflw         xm1, xm0, q0032
13806    paddd           xm0, xm1
13807    movd            eax, xm0
13808    RET
13809
13810cglobal pixel_satd_32x16, 4,8,10         ; if WIN64 && cpuflag(avx2)
13811    mova            m7, [hmul_16p]
13812    lea             r4, [3 * r1]
13813    lea             r5, [3 * r3]
13814    pxor            m8, m8
13815    pxor            m9, m9
13816    mov             r6, r0
13817    mov             r7, r2
13818
13819    call            calc_satd_16x8
13820    call            calc_satd_16x8
13821
13822    lea             r0, [r6 + 16]
13823    lea             r2, [r7 + 16]
13824
13825    call            calc_satd_16x8
13826    call            calc_satd_16x8
13827
13828    paddd           m8, m9
13829    vextracti128    xm0, m8, 1
13830    paddd           xm0, xm8
13831    movhlps         xm1, xm0
13832    paddd           xm0, xm1
13833    pshuflw         xm1, xm0, q0032
13834    paddd           xm0, xm1
13835    movd            eax, xm0
13836    RET
13837
13838cglobal pixel_satd_32x24, 4,8,10         ; if WIN64 && cpuflag(avx2)
13839    mova            m7, [hmul_16p]
13840    lea             r4, [3 * r1]
13841    lea             r5, [3 * r3]
13842    pxor            m8, m8
13843    pxor            m9, m9
13844    mov             r6, r0
13845    mov             r7, r2
13846
13847    call            calc_satd_16x8
13848    call            calc_satd_16x8
13849    call            calc_satd_16x8
13850
13851    lea             r0, [r6 + 16]
13852    lea             r2, [r7 + 16]
13853
13854    call            calc_satd_16x8
13855    call            calc_satd_16x8
13856    call            calc_satd_16x8
13857
13858    paddd           m8, m9
13859    vextracti128    xm0, m8, 1
13860    paddd           xm0, xm8
13861    movhlps         xm1, xm0
13862    paddd           xm0, xm1
13863    pshuflw         xm1, xm0, q0032
13864    paddd           xm0, xm1
13865    movd            eax, xm0
13866    RET
13867
13868cglobal pixel_satd_32x32, 4,8,10         ; if WIN64 && cpuflag(avx2)
13869    mova            m7, [hmul_16p]
13870    lea             r4, [3 * r1]
13871    lea             r5, [3 * r3]
13872    pxor            m8, m8
13873    pxor            m9, m9
13874    mov             r6, r0
13875    mov             r7, r2
13876
13877    call            calc_satd_16x8
13878    call            calc_satd_16x8
13879    call            calc_satd_16x8
13880    call            calc_satd_16x8
13881
13882    lea             r0, [r6 + 16]
13883    lea             r2, [r7 + 16]
13884
13885    call            calc_satd_16x8
13886    call            calc_satd_16x8
13887    call            calc_satd_16x8
13888    call            calc_satd_16x8
13889
13890    paddd           m8, m9
13891    vextracti128    xm0, m8, 1
13892    paddd           xm0, xm8
13893    movhlps         xm1, xm0
13894    paddd           xm0, xm1
13895    pshuflw         xm1, xm0, q0032
13896    paddd           xm0, xm1
13897    movd            eax, xm0
13898    RET
13899
13900cglobal pixel_satd_32x64, 4,8,10         ; if WIN64 && cpuflag(avx2)
13901    mova            m7, [hmul_16p]
13902    lea             r4, [3 * r1]
13903    lea             r5, [3 * r3]
13904    pxor            m8, m8
13905    pxor            m9, m9
13906    mov             r6, r0
13907    mov             r7, r2
13908
13909    call            calc_satd_16x8
13910    call            calc_satd_16x8
13911    call            calc_satd_16x8
13912    call            calc_satd_16x8
13913    call            calc_satd_16x8
13914    call            calc_satd_16x8
13915    call            calc_satd_16x8
13916    call            calc_satd_16x8
13917
13918    lea             r0, [r6 + 16]
13919    lea             r2, [r7 + 16]
13920
13921    call            calc_satd_16x8
13922    call            calc_satd_16x8
13923    call            calc_satd_16x8
13924    call            calc_satd_16x8
13925    call            calc_satd_16x8
13926    call            calc_satd_16x8
13927    call            calc_satd_16x8
13928    call            calc_satd_16x8
13929
13930    paddd           m8, m9
13931    vextracti128    xm0, m8, 1
13932    paddd           xm0, xm8
13933    movhlps         xm1, xm0
13934    paddd           xm0, xm1
13935    pshuflw         xm1, xm0, q0032
13936    paddd           xm0, xm1
13937    movd            eax, xm0
13938    RET
13939
13940cglobal pixel_satd_48x64, 4,8,10        ; if WIN64 && cpuflag(avx2)
13941    mova            m7, [hmul_16p]
13942    lea             r4, [3 * r1]
13943    lea             r5, [3 * r3]
13944    pxor            m8, m8
13945    pxor            m9, m9
13946    mov             r6, r0
13947    mov             r7, r2
13948
13949    call            calc_satd_16x8
13950    call            calc_satd_16x8
13951    call            calc_satd_16x8
13952    call            calc_satd_16x8
13953    call            calc_satd_16x8
13954    call            calc_satd_16x8
13955    call            calc_satd_16x8
13956    call            calc_satd_16x8
13957    lea             r0, [r6 + 16]
13958    lea             r2, [r7 + 16]
13959    call            calc_satd_16x8
13960    call            calc_satd_16x8
13961    call            calc_satd_16x8
13962    call            calc_satd_16x8
13963    call            calc_satd_16x8
13964    call            calc_satd_16x8
13965    call            calc_satd_16x8
13966    call            calc_satd_16x8
13967    lea             r0, [r6 + 32]
13968    lea             r2, [r7 + 32]
13969    call            calc_satd_16x8
13970    call            calc_satd_16x8
13971    call            calc_satd_16x8
13972    call            calc_satd_16x8
13973    call            calc_satd_16x8
13974    call            calc_satd_16x8
13975    call            calc_satd_16x8
13976    call            calc_satd_16x8
13977
13978    paddd           m8, m9
13979    vextracti128    xm0, m8, 1
13980    paddd           xm0, xm8
13981    movhlps         xm1, xm0
13982    paddd           xm0, xm1
13983    pshuflw         xm1, xm0, q0032
13984    paddd           xm0, xm1
13985    movd            eax, xm0
13986    RET
13987
13988cglobal pixel_satd_64x16, 4,8,10         ; if WIN64 && cpuflag(avx2)
13989    mova            m7, [hmul_16p]
13990    lea             r4, [3 * r1]
13991    lea             r5, [3 * r3]
13992    pxor            m8, m8
13993    pxor            m9, m9
13994    mov             r6, r0
13995    mov             r7, r2
13996
13997    call            calc_satd_16x8
13998    call            calc_satd_16x8
13999    lea             r0, [r6 + 16]
14000    lea             r2, [r7 + 16]
14001    call            calc_satd_16x8
14002    call            calc_satd_16x8
14003    lea             r0, [r6 + 32]
14004    lea             r2, [r7 + 32]
14005    call            calc_satd_16x8
14006    call            calc_satd_16x8
14007    lea             r0, [r6 + 48]
14008    lea             r2, [r7 + 48]
14009    call            calc_satd_16x8
14010    call            calc_satd_16x8
14011
14012    paddd           m8, m9
14013    vextracti128    xm0, m8, 1
14014    paddd           xm0, xm8
14015    movhlps         xm1, xm0
14016    paddd           xm0, xm1
14017    pshuflw         xm1, xm0, q0032
14018    paddd           xm0, xm1
14019    movd            eax, xm0
14020    RET
14021
14022cglobal pixel_satd_64x32, 4,8,10         ; if WIN64 && cpuflag(avx2)
14023    mova            m7, [hmul_16p]
14024    lea             r4, [3 * r1]
14025    lea             r5, [3 * r3]
14026    pxor            m8, m8
14027    pxor            m9, m9
14028    mov             r6, r0
14029    mov             r7, r2
14030
14031    call            calc_satd_16x8
14032    call            calc_satd_16x8
14033    call            calc_satd_16x8
14034    call            calc_satd_16x8
14035    lea             r0, [r6 + 16]
14036    lea             r2, [r7 + 16]
14037    call            calc_satd_16x8
14038    call            calc_satd_16x8
14039    call            calc_satd_16x8
14040    call            calc_satd_16x8
14041    lea             r0, [r6 + 32]
14042    lea             r2, [r7 + 32]
14043    call            calc_satd_16x8
14044    call            calc_satd_16x8
14045    call            calc_satd_16x8
14046    call            calc_satd_16x8
14047    lea             r0, [r6 + 48]
14048    lea             r2, [r7 + 48]
14049    call            calc_satd_16x8
14050    call            calc_satd_16x8
14051    call            calc_satd_16x8
14052    call            calc_satd_16x8
14053
14054    paddd           m8, m9
14055    vextracti128    xm0, m8, 1
14056    paddd           xm0, xm8
14057    movhlps         xm1, xm0
14058    paddd           xm0, xm1
14059    pshuflw         xm1, xm0, q0032
14060    paddd           xm0, xm1
14061    movd            eax, xm0
14062    RET
14063
14064cglobal pixel_satd_64x48, 4,8,10        ; if WIN64 && cpuflag(avx2)
14065    mova            m7, [hmul_16p]
14066    lea             r4, [3 * r1]
14067    lea             r5, [3 * r3]
14068    pxor            m8, m8
14069    pxor            m9, m9
14070    mov             r6, r0
14071    mov             r7, r2
14072
14073    call            calc_satd_16x8
14074    call            calc_satd_16x8
14075    call            calc_satd_16x8
14076    call            calc_satd_16x8
14077    call            calc_satd_16x8
14078    call            calc_satd_16x8
14079    lea             r0, [r6 + 16]
14080    lea             r2, [r7 + 16]
14081    call            calc_satd_16x8
14082    call            calc_satd_16x8
14083    call            calc_satd_16x8
14084    call            calc_satd_16x8
14085    call            calc_satd_16x8
14086    call            calc_satd_16x8
14087    lea             r0, [r6 + 32]
14088    lea             r2, [r7 + 32]
14089    call            calc_satd_16x8
14090    call            calc_satd_16x8
14091    call            calc_satd_16x8
14092    call            calc_satd_16x8
14093    call            calc_satd_16x8
14094    call            calc_satd_16x8
14095    lea             r0, [r6 + 48]
14096    lea             r2, [r7 + 48]
14097    call            calc_satd_16x8
14098    call            calc_satd_16x8
14099    call            calc_satd_16x8
14100    call            calc_satd_16x8
14101    call            calc_satd_16x8
14102    call            calc_satd_16x8
14103
14104    paddd           m8, m9
14105    vextracti128    xm0, m8, 1
14106    paddd           xm0, xm8
14107    movhlps         xm1, xm0
14108    paddd           xm0, xm1
14109    pshuflw         xm1, xm0, q0032
14110    paddd           xm0, xm1
14111    movd            eax, xm0
14112    RET
14113
14114cglobal pixel_satd_64x64, 4,8,10        ; if WIN64 && cpuflag(avx2)
14115    mova            m7, [hmul_16p]
14116    lea             r4, [3 * r1]
14117    lea             r5, [3 * r3]
14118    pxor            m8, m8
14119    pxor            m9, m9
14120    mov             r6, r0
14121    mov             r7, r2
14122
14123    call            calc_satd_16x8
14124    call            calc_satd_16x8
14125    call            calc_satd_16x8
14126    call            calc_satd_16x8
14127    call            calc_satd_16x8
14128    call            calc_satd_16x8
14129    call            calc_satd_16x8
14130    call            calc_satd_16x8
14131    lea             r0, [r6 + 16]
14132    lea             r2, [r7 + 16]
14133    call            calc_satd_16x8
14134    call            calc_satd_16x8
14135    call            calc_satd_16x8
14136    call            calc_satd_16x8
14137    call            calc_satd_16x8
14138    call            calc_satd_16x8
14139    call            calc_satd_16x8
14140    call            calc_satd_16x8
14141    lea             r0, [r6 + 32]
14142    lea             r2, [r7 + 32]
14143    call            calc_satd_16x8
14144    call            calc_satd_16x8
14145    call            calc_satd_16x8
14146    call            calc_satd_16x8
14147    call            calc_satd_16x8
14148    call            calc_satd_16x8
14149    call            calc_satd_16x8
14150    call            calc_satd_16x8
14151    lea             r0, [r6 + 48]
14152    lea             r2, [r7 + 48]
14153    call            calc_satd_16x8
14154    call            calc_satd_16x8
14155    call            calc_satd_16x8
14156    call            calc_satd_16x8
14157    call            calc_satd_16x8
14158    call            calc_satd_16x8
14159    call            calc_satd_16x8
14160    call            calc_satd_16x8
14161
14162    paddd           m8, m9
14163    vextracti128    xm0, m8, 1
14164    paddd           xm0, xm8
14165    movhlps         xm1, xm0
14166    paddd           xm0, xm1
14167    pshuflw         xm1, xm0, q0032
14168    paddd           xm0, xm1
14169    movd            eax, xm0
14170    RET
14171
14172%macro PROCESS_SATD_32x4_AVX512 0        ; macro to compute satd cost for 32 columns, 4 rows
14173    ; rows 0-3
14174    pmovzxbw         m0, [r0]
14175    pmovzxbw         m4, [r2]
14176    psubw           m0, m4
14177    pmovzxbw         m1, [r0 + r1]
14178    pmovzxbw         m5, [r2 + r3]
14179    psubw           m1, m5
14180    pmovzxbw         m2, [r0 + r1 * 2]
14181    pmovzxbw         m4, [r2 + r3 * 2]
14182    psubw           m2, m4
14183    pmovzxbw         m3, [r0 + r4]
14184    pmovzxbw         m5, [r2 + r5]
14185    psubw           m3, m5
14186    paddw           m4, m0, m1
14187    psubw           m1, m0
14188    paddw           m0, m2, m3
14189    psubw           m3, m2
14190    punpckhwd       m2, m4, m1
14191    punpcklwd       m4, m1
14192    punpckhwd       m1, m0, m3
14193    punpcklwd       m0, m3
14194    paddw           m3, m4, m0
14195    psubw           m0, m4
14196    paddw           m4, m2, m1
14197    psubw           m1, m2
14198    punpckhdq       m2, m3, m0
14199    punpckldq       m3, m0
14200    paddw           m0, m3, m2
14201    psubw           m2, m3
14202    punpckhdq       m3, m4, m1
14203    punpckldq       m4, m1
14204    paddw           m1, m4, m3
14205    psubw           m3, m4
14206    punpckhqdq      m4, m0, m1
14207    punpcklqdq      m0, m1
14208    pabsw           m0, m0
14209    pabsw           m4, m4
14210    pmaxsw          m0, m0, m4
14211    punpckhqdq      m1, m2, m3
14212    punpcklqdq      m2, m3
14213    pabsw           m2, m2
14214    pabsw           m1, m1
14215    pmaxsw          m2, m1
14216    pxor            m7, m7
14217    mova            m1, m0
14218    punpcklwd       m1, m7
14219    paddd           m6, m1
14220    mova            m1, m0
14221    punpckhwd       m1, m7
14222    paddd           m6, m1
14223    pxor            m7, m7
14224    mova            m1, m2
14225    punpcklwd       m1, m7
14226    paddd           m6, m1
14227    mova            m1, m2
14228    punpckhwd       m1, m7
14229    paddd           m6, m1
14230%endmacro
14231
14232%macro SATD_MAIN_AVX512_END 0
14233    vextracti32x8   ym7,   m6,   1
14234    paddd           ym6,   ym7
14235    vextracti128    xm7,   ym6,  1
14236    paddd           xm6,   xm6,  xm7
14237    punpckhqdq      xm7,   xm6,  xm6
14238    paddd           xm6,   xm7
14239    movq            rax,   xm6
14240    rorx            rdx,   rax,  32
14241    add             eax,   edx
14242%endmacro
14243
14244%macro SATD_32xN_AVX512 1
14245INIT_ZMM avx512
14246cglobal pixel_satd_32x%1, 4,6,8
14247    lea             r4, [3 * r1]
14248    lea             r5, [3 * r3]
14249    pxor            m6, m6
14250%rep %1/4 - 1
14251    PROCESS_SATD_32x4_AVX512
14252    lea             r0, [r0 + 4 * r1]
14253    lea             r2, [r2 + 4 * r3]
14254%endrep
14255    PROCESS_SATD_32x4_AVX512
14256    SATD_MAIN_AVX512_END
14257    RET
14258%endmacro
14259
14260SATD_32xN_AVX512 8
14261SATD_32xN_AVX512 16
14262SATD_32xN_AVX512 24
14263SATD_32xN_AVX512 32
14264SATD_32xN_AVX512 48
14265SATD_32xN_AVX512 64
14266
14267%macro SATD_64xN_AVX512 1
14268INIT_ZMM avx512
14269cglobal pixel_satd_64x%1, 4,8,8
14270    lea             r4, [3 * r1]
14271    lea             r5, [3 * r3]
14272    pxor            m6, m6
14273    mov             r6, r0
14274    mov             r7, r2
14275
14276%rep %1/4 - 1
14277    PROCESS_SATD_32x4_AVX512
14278    lea             r0, [r0 + 4 * r1]
14279    lea             r2, [r2 + 4 * r3]
14280%endrep
14281    PROCESS_SATD_32x4_AVX512
14282    lea             r0, [r6 + mmsize/2]
14283    lea             r2, [r7 + mmsize/2]
14284%rep %1/4 - 1
14285    PROCESS_SATD_32x4_AVX512
14286    lea             r0, [r0 + 4 * r1]
14287    lea             r2, [r2 + 4 * r3]
14288%endrep
14289    PROCESS_SATD_32x4_AVX512
14290    SATD_MAIN_AVX512_END
14291    RET
14292%endmacro
14293
14294SATD_64xN_AVX512 16
14295SATD_64xN_AVX512 32
14296SATD_64xN_AVX512 48
14297SATD_64xN_AVX512 64
14298%endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
14299%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 1
14300INIT_YMM avx2
14301cglobal calc_satd_16x8    ; function to compute satd cost for 16 columns, 8 rows
14302    ; rows 0-3
14303    movu            m0, [r0]
14304    movu            m4, [r2]
14305    psubw           m0, m4
14306    movu            m1, [r0 + r1]
14307    movu            m5, [r2 + r3]
14308    psubw           m1, m5
14309    movu            m2, [r0 + r1 * 2]
14310    movu            m4, [r2 + r3 * 2]
14311    psubw           m2, m4
14312    movu            m3, [r0 + r4]
14313    movu            m5, [r2 + r5]
14314    psubw           m3, m5
14315    lea             r0, [r0 + r1 * 4]
14316    lea             r2, [r2 + r3 * 4]
14317    paddw           m4, m0, m1
14318    psubw           m1, m0
14319    paddw           m0, m2, m3
14320    psubw           m3, m2
14321    punpckhwd       m2, m4, m1
14322    punpcklwd       m4, m1
14323    punpckhwd       m1, m0, m3
14324    punpcklwd       m0, m3
14325    paddw           m3, m4, m0
14326    psubw           m0, m4
14327    paddw           m4, m2, m1
14328    psubw           m1, m2
14329    punpckhdq       m2, m3, m0
14330    punpckldq       m3, m0
14331    paddw           m0, m3, m2
14332    psubw           m2, m3
14333    punpckhdq       m3, m4, m1
14334    punpckldq       m4, m1
14335    paddw           m1, m4, m3
14336    psubw           m3, m4
14337    punpckhqdq      m4, m0, m1
14338    punpcklqdq      m0, m1
14339    pabsw           m0, m0
14340    pabsw           m4, m4
14341    pmaxsw          m0, m0, m4
14342    punpckhqdq      m1, m2, m3
14343    punpcklqdq      m2, m3
14344    pabsw           m2, m2
14345    pabsw           m1, m1
14346    pmaxsw          m2, m1
14347    pxor            m7, m7
14348    mova            m1, m0
14349    punpcklwd       m1, m7
14350    paddd           m6, m1
14351    mova            m1, m0
14352    punpckhwd       m1, m7
14353    paddd           m6, m1
14354    pxor            m7, m7
14355    mova            m1, m2
14356    punpcklwd       m1, m7
14357    paddd           m6, m1
14358    mova            m1, m2
14359    punpckhwd       m1, m7
14360    paddd           m6, m1
14361    ; rows 4-7
14362    movu            m0, [r0]
14363    movu            m4, [r2]
14364    psubw           m0, m4
14365    movu            m1, [r0 + r1]
14366    movu            m5, [r2 + r3]
14367    psubw           m1, m5
14368    movu            m2, [r0 + r1 * 2]
14369    movu            m4, [r2 + r3 * 2]
14370    psubw           m2, m4
14371    movu            m3, [r0 + r4]
14372    movu            m5, [r2 + r5]
14373    psubw           m3, m5
14374    lea             r0, [r0 + r1 * 4]
14375    lea             r2, [r2 + r3 * 4]
14376    paddw           m4, m0, m1
14377    psubw           m1, m0
14378    paddw           m0, m2, m3
14379    psubw           m3, m2
14380    punpckhwd       m2, m4, m1
14381    punpcklwd       m4, m1
14382    punpckhwd       m1, m0, m3
14383    punpcklwd       m0, m3
14384    paddw           m3, m4, m0
14385    psubw           m0, m4
14386    paddw           m4, m2, m1
14387    psubw           m1, m2
14388    punpckhdq       m2, m3, m0
14389    punpckldq       m3, m0
14390    paddw           m0, m3, m2
14391    psubw           m2, m3
14392    punpckhdq       m3, m4, m1
14393    punpckldq       m4, m1
14394    paddw           m1, m4, m3
14395    psubw           m3, m4
14396    punpckhqdq      m4, m0, m1
14397    punpcklqdq      m0, m1
14398    pabsw           m0, m0
14399    pabsw           m4, m4
14400    pmaxsw          m0, m0, m4
14401    punpckhqdq      m1, m2, m3
14402    punpcklqdq      m2, m3
14403    pabsw           m2, m2
14404    pabsw           m1, m1
14405    pmaxsw          m2, m1
14406    pxor            m7, m7
14407    mova            m1, m0
14408    punpcklwd       m1, m7
14409    paddd           m6, m1
14410    mova            m1, m0
14411    punpckhwd       m1, m7
14412    paddd           m6, m1
14413    pxor            m7, m7
14414    mova            m1, m2
14415    punpcklwd       m1, m7
14416    paddd           m6, m1
14417    mova            m1, m2
14418    punpckhwd       m1, m7
14419    paddd           m6, m1
14420    ret
14421
14422cglobal calc_satd_16x4    ; function to compute satd cost for 16 columns, 4 rows
14423    ; rows 0-3
14424    movu            m0, [r0]
14425    movu            m4, [r2]
14426    psubw           m0, m4
14427    movu            m1, [r0 + r1]
14428    movu            m5, [r2 + r3]
14429    psubw           m1, m5
14430    movu            m2, [r0 + r1 * 2]
14431    movu            m4, [r2 + r3 * 2]
14432    psubw           m2, m4
14433    movu            m3, [r0 + r4]
14434    movu            m5, [r2 + r5]
14435    psubw           m3, m5
14436    lea             r0, [r0 + r1 * 4]
14437    lea             r2, [r2 + r3 * 4]
14438    paddw           m4, m0, m1
14439    psubw           m1, m0
14440    paddw           m0, m2, m3
14441    psubw           m3, m2
14442    punpckhwd       m2, m4, m1
14443    punpcklwd       m4, m1
14444    punpckhwd       m1, m0, m3
14445    punpcklwd       m0, m3
14446    paddw           m3, m4, m0
14447    psubw           m0, m4
14448    paddw           m4, m2, m1
14449    psubw           m1, m2
14450    punpckhdq       m2, m3, m0
14451    punpckldq       m3, m0
14452    paddw           m0, m3, m2
14453    psubw           m2, m3
14454    punpckhdq       m3, m4, m1
14455    punpckldq       m4, m1
14456    paddw           m1, m4, m3
14457    psubw           m3, m4
14458    punpckhqdq      m4, m0, m1
14459    punpcklqdq      m0, m1
14460    pabsw           m0, m0
14461    pabsw           m4, m4
14462    pmaxsw          m0, m0, m4
14463    punpckhqdq      m1, m2, m3
14464    punpcklqdq      m2, m3
14465    pabsw           m2, m2
14466    pabsw           m1, m1
14467    pmaxsw          m2, m1
14468    pxor            m7, m7
14469    mova            m1, m0
14470    punpcklwd       m1, m7
14471    paddd           m6, m1
14472    mova            m1, m0
14473    punpckhwd       m1, m7
14474    paddd           m6, m1
14475    pxor            m7, m7
14476    mova            m1, m2
14477    punpcklwd       m1, m7
14478    paddd           m6, m1
14479    mova            m1, m2
14480    punpckhwd       m1, m7
14481    paddd           m6, m1
14482    ret
14483
14484cglobal pixel_satd_16x4, 4,6,8
14485    add             r1d, r1d
14486    add             r3d, r3d
14487    lea             r4, [3 * r1]
14488    lea             r5, [3 * r3]
14489    pxor            m6, m6
14490
14491    call            calc_satd_16x4
14492
14493    vextracti128    xm7, m6, 1
14494    paddd           xm6, xm7
14495    pxor            xm7, xm7
14496    movhlps         xm7, xm6
14497    paddd           xm6, xm7
14498    pshufd          xm7, xm6, 1
14499    paddd           xm6, xm7
14500    movd            eax, xm6
14501    RET
14502
14503cglobal pixel_satd_16x8, 4,6,8
14504    add             r1d, r1d
14505    add             r3d, r3d
14506    lea             r4, [3 * r1]
14507    lea             r5, [3 * r3]
14508    pxor            m6, m6
14509
14510    call            calc_satd_16x8
14511
14512    vextracti128    xm7, m6, 1
14513    paddd           xm6, xm7
14514    pxor            xm7, xm7
14515    movhlps         xm7, xm6
14516    paddd           xm6, xm7
14517    pshufd          xm7, xm6, 1
14518    paddd           xm6, xm7
14519    movd            eax, xm6
14520    RET
14521
14522cglobal pixel_satd_16x12, 4,6,8
14523    add             r1d, r1d
14524    add             r3d, r3d
14525    lea             r4, [3 * r1]
14526    lea             r5, [3 * r3]
14527    pxor            m6, m6
14528
14529    call            calc_satd_16x8
14530    call            calc_satd_16x4
14531
14532    vextracti128    xm7, m6, 1
14533    paddd           xm6, xm7
14534    pxor            xm7, xm7
14535    movhlps         xm7, xm6
14536    paddd           xm6, xm7
14537    pshufd          xm7, xm6, 1
14538    paddd           xm6, xm7
14539    movd            eax, xm6
14540    RET
14541
14542cglobal pixel_satd_16x16, 4,6,8
14543    add             r1d, r1d
14544    add             r3d, r3d
14545    lea             r4, [3 * r1]
14546    lea             r5, [3 * r3]
14547    pxor            m6, m6
14548
14549    call            calc_satd_16x8
14550    call            calc_satd_16x8
14551
14552    vextracti128    xm7, m6, 1
14553    paddd           xm6, xm7
14554    pxor            xm7, xm7
14555    movhlps         xm7, xm6
14556    paddd           xm6, xm7
14557    pshufd          xm7, xm6, 1
14558    paddd           xm6, xm7
14559    movd            eax, xm6
14560    RET
14561
14562cglobal pixel_satd_16x32, 4,6,8
14563    add             r1d, r1d
14564    add             r3d, r3d
14565    lea             r4, [3 * r1]
14566    lea             r5, [3 * r3]
14567    pxor            m6, m6
14568
14569    call            calc_satd_16x8
14570    call            calc_satd_16x8
14571    call            calc_satd_16x8
14572    call            calc_satd_16x8
14573
14574    vextracti128    xm7, m6, 1
14575    paddd           xm6, xm7
14576    pxor            xm7, xm7
14577    movhlps         xm7, xm6
14578    paddd           xm6, xm7
14579    pshufd          xm7, xm6, 1
14580    paddd           xm6, xm7
14581    movd            eax, xm6
14582    RET
14583
14584cglobal pixel_satd_16x64, 4,6,8
14585    add             r1d, r1d
14586    add             r3d, r3d
14587    lea             r4, [3 * r1]
14588    lea             r5, [3 * r3]
14589    pxor            m6, m6
14590
14591    call            calc_satd_16x8
14592    call            calc_satd_16x8
14593    call            calc_satd_16x8
14594    call            calc_satd_16x8
14595    call            calc_satd_16x8
14596    call            calc_satd_16x8
14597    call            calc_satd_16x8
14598    call            calc_satd_16x8
14599
14600    vextracti128    xm7, m6, 1
14601    paddd           xm6, xm7
14602    pxor            xm7, xm7
14603    movhlps         xm7, xm6
14604    paddd           xm6, xm7
14605    pshufd          xm7, xm6, 1
14606    paddd           xm6, xm7
14607    movd            eax, xm6
14608    RET
14609
14610cglobal pixel_satd_32x8, 4,8,8
14611    add             r1d, r1d
14612    add             r3d, r3d
14613    lea             r4, [3 * r1]
14614    lea             r5, [3 * r3]
14615    pxor            m6, m6
14616    mov             r6, r0
14617    mov             r7, r2
14618
14619    call            calc_satd_16x8
14620
14621    lea             r0, [r6 + 32]
14622    lea             r2, [r7 + 32]
14623
14624    call            calc_satd_16x8
14625
14626    vextracti128    xm7, m6, 1
14627    paddd           xm6, xm7
14628    pxor            xm7, xm7
14629    movhlps         xm7, xm6
14630    paddd           xm6, xm7
14631    pshufd          xm7, xm6, 1
14632    paddd           xm6, xm7
14633    movd            eax, xm6
14634    RET
14635
14636cglobal pixel_satd_32x16, 4,8,8
14637    add             r1d, r1d
14638    add             r3d, r3d
14639    lea             r4, [3 * r1]
14640    lea             r5, [3 * r3]
14641    pxor            m6, m6
14642    mov             r6, r0
14643    mov             r7, r2
14644
14645    call            calc_satd_16x8
14646    call            calc_satd_16x8
14647
14648    lea             r0, [r6 + 32]
14649    lea             r2, [r7 + 32]
14650
14651    call            calc_satd_16x8
14652    call            calc_satd_16x8
14653
14654    vextracti128    xm7, m6, 1
14655    paddd           xm6, xm7
14656    pxor            xm7, xm7
14657    movhlps         xm7, xm6
14658    paddd           xm6, xm7
14659    pshufd          xm7, xm6, 1
14660    paddd           xm6, xm7
14661    movd            eax, xm6
14662    RET
14663
14664cglobal pixel_satd_32x24, 4,8,8
14665    add             r1d, r1d
14666    add             r3d, r3d
14667    lea             r4, [3 * r1]
14668    lea             r5, [3 * r3]
14669    pxor            m6, m6
14670    mov             r6, r0
14671    mov             r7, r2
14672
14673    call            calc_satd_16x8
14674    call            calc_satd_16x8
14675    call            calc_satd_16x8
14676
14677    lea             r0, [r6 + 32]
14678    lea             r2, [r7 + 32]
14679
14680    call            calc_satd_16x8
14681    call            calc_satd_16x8
14682    call            calc_satd_16x8
14683
14684    vextracti128    xm7, m6, 1
14685    paddd           xm6, xm7
14686    pxor            xm7, xm7
14687    movhlps         xm7, xm6
14688    paddd           xm6, xm7
14689    pshufd          xm7, xm6, 1
14690    paddd           xm6, xm7
14691    movd            eax, xm6
14692    RET
14693
14694cglobal pixel_satd_32x32, 4,8,8
14695    add             r1d, r1d
14696    add             r3d, r3d
14697    lea             r4, [3 * r1]
14698    lea             r5, [3 * r3]
14699    pxor            m6, m6
14700    mov             r6, r0
14701    mov             r7, r2
14702
14703    call            calc_satd_16x8
14704    call            calc_satd_16x8
14705    call            calc_satd_16x8
14706    call            calc_satd_16x8
14707
14708    lea             r0, [r6 + 32]
14709    lea             r2, [r7 + 32]
14710
14711    call            calc_satd_16x8
14712    call            calc_satd_16x8
14713    call            calc_satd_16x8
14714    call            calc_satd_16x8
14715
14716    vextracti128    xm7, m6, 1
14717    paddd           xm6, xm7
14718    pxor            xm7, xm7
14719    movhlps         xm7, xm6
14720    paddd           xm6, xm7
14721    pshufd          xm7, xm6, 1
14722    paddd           xm6, xm7
14723    movd            eax, xm6
14724    RET
14725
14726cglobal pixel_satd_32x64, 4,8,8
14727    add             r1d, r1d
14728    add             r3d, r3d
14729    lea             r4, [3 * r1]
14730    lea             r5, [3 * r3]
14731    pxor            m6, m6
14732    mov             r6, r0
14733    mov             r7, r2
14734
14735    call            calc_satd_16x8
14736    call            calc_satd_16x8
14737    call            calc_satd_16x8
14738    call            calc_satd_16x8
14739    call            calc_satd_16x8
14740    call            calc_satd_16x8
14741    call            calc_satd_16x8
14742    call            calc_satd_16x8
14743
14744    lea             r0, [r6 + 32]
14745    lea             r2, [r7 + 32]
14746
14747    call            calc_satd_16x8
14748    call            calc_satd_16x8
14749    call            calc_satd_16x8
14750    call            calc_satd_16x8
14751    call            calc_satd_16x8
14752    call            calc_satd_16x8
14753    call            calc_satd_16x8
14754    call            calc_satd_16x8
14755
14756    vextracti128    xm7, m6, 1
14757    paddd           xm6, xm7
14758    pxor            xm7, xm7
14759    movhlps         xm7, xm6
14760    paddd           xm6, xm7
14761    pshufd          xm7, xm6, 1
14762    paddd           xm6, xm7
14763    movd            eax, xm6
14764    RET
14765
14766cglobal pixel_satd_48x64, 4,8,8
14767    add             r1d, r1d
14768    add             r3d, r3d
14769    lea             r4, [3 * r1]
14770    lea             r5, [3 * r3]
14771    pxor            m6, m6
14772    mov             r6, r0
14773    mov             r7, r2
14774
14775    call            calc_satd_16x8
14776    call            calc_satd_16x8
14777    call            calc_satd_16x8
14778    call            calc_satd_16x8
14779    call            calc_satd_16x8
14780    call            calc_satd_16x8
14781    call            calc_satd_16x8
14782    call            calc_satd_16x8
14783
14784    lea             r0, [r6 + 32]
14785    lea             r2, [r7 + 32]
14786
14787    call            calc_satd_16x8
14788    call            calc_satd_16x8
14789    call            calc_satd_16x8
14790    call            calc_satd_16x8
14791    call            calc_satd_16x8
14792    call            calc_satd_16x8
14793    call            calc_satd_16x8
14794    call            calc_satd_16x8
14795
14796    lea             r0, [r6 + 64]
14797    lea             r2, [r7 + 64]
14798
14799    call            calc_satd_16x8
14800    call            calc_satd_16x8
14801    call            calc_satd_16x8
14802    call            calc_satd_16x8
14803    call            calc_satd_16x8
14804    call            calc_satd_16x8
14805    call            calc_satd_16x8
14806    call            calc_satd_16x8
14807
14808    vextracti128    xm7, m6, 1
14809    paddd           xm6, xm7
14810    pxor            xm7, xm7
14811    movhlps         xm7, xm6
14812    paddd           xm6, xm7
14813    pshufd          xm7, xm6, 1
14814    paddd           xm6, xm7
14815    movd            eax, xm6
14816    RET
14817
14818cglobal pixel_satd_64x16, 4,8,8
14819    add             r1d, r1d
14820    add             r3d, r3d
14821    lea             r4, [3 * r1]
14822    lea             r5, [3 * r3]
14823    pxor            m6, m6
14824    mov             r6, r0
14825    mov             r7, r2
14826
14827    call            calc_satd_16x8
14828    call            calc_satd_16x8
14829
14830    lea             r0, [r6 + 32]
14831    lea             r2, [r7 + 32]
14832
14833    call            calc_satd_16x8
14834    call            calc_satd_16x8
14835
14836    lea             r0, [r6 + 64]
14837    lea             r2, [r7 + 64]
14838
14839    call            calc_satd_16x8
14840    call            calc_satd_16x8
14841
14842    lea             r0, [r6 + 96]
14843    lea             r2, [r7 + 96]
14844
14845    call            calc_satd_16x8
14846    call            calc_satd_16x8
14847
14848    vextracti128    xm7, m6, 1
14849    paddd           xm6, xm7
14850    pxor            xm7, xm7
14851    movhlps         xm7, xm6
14852    paddd           xm6, xm7
14853    pshufd          xm7, xm6, 1
14854    paddd           xm6, xm7
14855    movd            eax, xm6
14856    RET
14857
14858cglobal pixel_satd_64x32, 4,8,8
14859    add             r1d, r1d
14860    add             r3d, r3d
14861    lea             r4, [3 * r1]
14862    lea             r5, [3 * r3]
14863    pxor            m6, m6
14864    mov             r6, r0
14865    mov             r7, r2
14866
14867    call            calc_satd_16x8
14868    call            calc_satd_16x8
14869    call            calc_satd_16x8
14870    call            calc_satd_16x8
14871
14872    lea             r0, [r6 + 32]
14873    lea             r2, [r7 + 32]
14874
14875    call            calc_satd_16x8
14876    call            calc_satd_16x8
14877    call            calc_satd_16x8
14878    call            calc_satd_16x8
14879
14880    lea             r0, [r6 + 64]
14881    lea             r2, [r7 + 64]
14882
14883    call            calc_satd_16x8
14884    call            calc_satd_16x8
14885    call            calc_satd_16x8
14886    call            calc_satd_16x8
14887
14888    lea             r0, [r6 + 96]
14889    lea             r2, [r7 + 96]
14890
14891    call            calc_satd_16x8
14892    call            calc_satd_16x8
14893    call            calc_satd_16x8
14894    call            calc_satd_16x8
14895
14896    vextracti128    xm7, m6, 1
14897    paddd           xm6, xm7
14898    pxor            xm7, xm7
14899    movhlps         xm7, xm6
14900    paddd           xm6, xm7
14901    pshufd          xm7, xm6, 1
14902    paddd           xm6, xm7
14903    movd            eax, xm6
14904    RET
14905
14906cglobal pixel_satd_64x48, 4,8,8
14907    add             r1d, r1d
14908    add             r3d, r3d
14909    lea             r4, [3 * r1]
14910    lea             r5, [3 * r3]
14911    pxor            m6, m6
14912    mov             r6, r0
14913    mov             r7, r2
14914
14915    call            calc_satd_16x8
14916    call            calc_satd_16x8
14917    call            calc_satd_16x8
14918    call            calc_satd_16x8
14919    call            calc_satd_16x8
14920    call            calc_satd_16x8
14921
14922    lea             r0, [r6 + 32]
14923    lea             r2, [r7 + 32]
14924
14925    call            calc_satd_16x8
14926    call            calc_satd_16x8
14927    call            calc_satd_16x8
14928    call            calc_satd_16x8
14929    call            calc_satd_16x8
14930    call            calc_satd_16x8
14931
14932    lea             r0, [r6 + 64]
14933    lea             r2, [r7 + 64]
14934
14935    call            calc_satd_16x8
14936    call            calc_satd_16x8
14937    call            calc_satd_16x8
14938    call            calc_satd_16x8
14939    call            calc_satd_16x8
14940    call            calc_satd_16x8
14941
14942    lea             r0, [r6 + 96]
14943    lea             r2, [r7 + 96]
14944
14945    call            calc_satd_16x8
14946    call            calc_satd_16x8
14947    call            calc_satd_16x8
14948    call            calc_satd_16x8
14949    call            calc_satd_16x8
14950    call            calc_satd_16x8
14951
14952    vextracti128    xm7, m6, 1
14953    paddd           xm6, xm7
14954    pxor            xm7, xm7
14955    movhlps         xm7, xm6
14956    paddd           xm6, xm7
14957    pshufd          xm7, xm6, 1
14958    paddd           xm6, xm7
14959    movd            eax, xm6
14960    RET
14961
14962cglobal pixel_satd_64x64, 4,8,8
14963    add             r1d, r1d
14964    add             r3d, r3d
14965    lea             r4, [3 * r1]
14966    lea             r5, [3 * r3]
14967    pxor            m6, m6
14968    mov             r6, r0
14969    mov             r7, r2
14970
14971    call            calc_satd_16x8
14972    call            calc_satd_16x8
14973    call            calc_satd_16x8
14974    call            calc_satd_16x8
14975    call            calc_satd_16x8
14976    call            calc_satd_16x8
14977    call            calc_satd_16x8
14978    call            calc_satd_16x8
14979
14980    lea             r0, [r6 + 32]
14981    lea             r2, [r7 + 32]
14982
14983    call            calc_satd_16x8
14984    call            calc_satd_16x8
14985    call            calc_satd_16x8
14986    call            calc_satd_16x8
14987    call            calc_satd_16x8
14988    call            calc_satd_16x8
14989    call            calc_satd_16x8
14990    call            calc_satd_16x8
14991
14992    lea             r0, [r6 + 64]
14993    lea             r2, [r7 + 64]
14994
14995    call            calc_satd_16x8
14996    call            calc_satd_16x8
14997    call            calc_satd_16x8
14998    call            calc_satd_16x8
14999    call            calc_satd_16x8
15000    call            calc_satd_16x8
15001    call            calc_satd_16x8
15002    call            calc_satd_16x8
15003
15004    lea             r0, [r6 + 96]
15005    lea             r2, [r7 + 96]
15006
15007    call            calc_satd_16x8
15008    call            calc_satd_16x8
15009    call            calc_satd_16x8
15010    call            calc_satd_16x8
15011    call            calc_satd_16x8
15012    call            calc_satd_16x8
15013    call            calc_satd_16x8
15014    call            calc_satd_16x8
15015
15016    vextracti128    xm7, m6, 1
15017    paddd           xm6, xm7
15018    pxor            xm7, xm7
15019    movhlps         xm7, xm6
15020    paddd           xm6, xm7
15021    pshufd          xm7, xm6, 1
15022    paddd           xm6, xm7
15023    movd            eax, xm6
15024    RET
15025
15026%macro SATD_HBD_AVX512_END 0
15027    vextracti32x8   ym7, m6, 1
15028    paddd           ym6, ym7
15029    vextracti128    xm7, ym6, 1
15030    paddd           xm6, xm7
15031    pxor            xm7, xm7
15032    movhlps         xm7, xm6
15033    paddd           xm6, xm7
15034    pshufd          xm7, xm6, 1
15035    paddd           xm6, xm7
15036    movd            eax, xm6
15037%endmacro
15038%macro PROCESS_SATD_16x8_HBD_AVX512 0        ; macro to compute satd cost for 16 columns, 8 rows
15039    ; rows 0-3
15040    lea             r6, [r0 + r1 * 4]
15041    lea             r7, [r2 + r3 * 4]
15042    movu            ym0, [r0]
15043    movu            ym4, [r2]
15044    vinserti32x8    m0, [r6], 1
15045    vinserti32x8    m4, [r7], 1
15046    psubw           m0, m4
15047    movu            ym1, [r0 + r1]
15048    movu            ym5, [r2 + r3]
15049    vinserti32x8    m1, [r6 + r1], 1
15050    vinserti32x8    m5, [r7 + r3], 1
15051    psubw           m1, m5
15052    movu            ym2, [r0 + r1 * 2]
15053    movu            ym4, [r2 + r3 * 2]
15054    vinserti32x8    m2, [r6 + r1 * 2], 1
15055    vinserti32x8    m4, [r7 + r3 * 2], 1
15056    psubw           m2, m4
15057    movu            ym3, [r0 + r4]
15058    movu            ym5, [r2 + r5]
15059    vinserti32x8    m3, [r6 + r4], 1
15060    vinserti32x8    m5, [r7 + r5], 1
15061    psubw           m3, m5
15062
15063    paddw           m4, m0, m1
15064    psubw           m1, m0
15065    paddw           m0, m2, m3
15066    psubw           m3, m2
15067    punpckhwd       m2, m4, m1
15068    punpcklwd       m4, m1
15069    punpckhwd       m1, m0, m3
15070    punpcklwd       m0, m3
15071    paddw           m3, m4, m0
15072    psubw           m0, m4
15073    paddw           m4, m2, m1
15074    psubw           m1, m2
15075    punpckhdq       m2, m3, m0
15076    punpckldq       m3, m0
15077    paddw           m0, m3, m2
15078    psubw           m2, m3
15079    punpckhdq       m3, m4, m1
15080    punpckldq       m4, m1
15081    paddw           m1, m4, m3
15082    psubw           m3, m4
15083    punpckhqdq      m4, m0, m1
15084    punpcklqdq      m0, m1
15085    pabsw           m0, m0
15086    pabsw           m4, m4
15087    pmaxsw          m0, m0, m4
15088    punpckhqdq      m1, m2, m3
15089    punpcklqdq      m2, m3
15090    pabsw           m2, m2
15091    pabsw           m1, m1
15092    pmaxsw          m2, m1
15093    pxor            m7, m7
15094    mova            m1, m0
15095    punpcklwd       m1, m7
15096    paddd           m6, m1
15097    mova            m1, m0
15098    punpckhwd       m1, m7
15099    paddd           m6, m1
15100    pxor            m7, m7
15101    mova            m1, m2
15102    punpcklwd       m1, m7
15103    paddd           m6, m1
15104    mova            m1, m2
15105    punpckhwd       m1, m7
15106    paddd           m6, m1
15107%endmacro
15108%macro PROCESS_SATD_32x4_HBD_AVX512 0        ; macro to compute satd cost for 32 columns, 4 rows
15109    ; rows 0-3
15110    movu            m0, [r0]
15111    movu            m4, [r2]
15112    psubw           m0, m4
15113    movu            m1, [r0 + r1]
15114    movu            m5, [r2 + r3]
15115    psubw           m1, m5
15116    movu            m2, [r0 + r1 * 2]
15117    movu            m4, [r2 + r3 * 2]
15118    psubw           m2, m4
15119    movu            m3, [r0 + r4]
15120    movu            m5, [r2 + r5]
15121    psubw           m3, m5
15122    paddw           m4, m0, m1
15123    psubw           m1, m0
15124    paddw           m0, m2, m3
15125    psubw           m3, m2
15126    punpckhwd       m2, m4, m1
15127    punpcklwd       m4, m1
15128    punpckhwd       m1, m0, m3
15129    punpcklwd       m0, m3
15130    paddw           m3, m4, m0
15131    psubw           m0, m4
15132    paddw           m4, m2, m1
15133    psubw           m1, m2
15134    punpckhdq       m2, m3, m0
15135    punpckldq       m3, m0
15136    paddw           m0, m3, m2
15137    psubw           m2, m3
15138    punpckhdq       m3, m4, m1
15139    punpckldq       m4, m1
15140    paddw           m1, m4, m3
15141    psubw           m3, m4
15142    punpckhqdq      m4, m0, m1
15143    punpcklqdq      m0, m1
15144    pabsw           m0, m0
15145    pabsw           m4, m4
15146    pmaxsw          m0, m0, m4
15147    punpckhqdq      m1, m2, m3
15148    punpcklqdq      m2, m3
15149    pabsw           m2, m2
15150    pabsw           m1, m1
15151    pmaxsw          m2, m1
15152    pxor            m7, m7
15153    mova            m1, m0
15154    punpcklwd       m1, m7
15155    paddd           m6, m1
15156    mova            m1, m0
15157    punpckhwd       m1, m7
15158    paddd           m6, m1
15159    pxor            m7, m7
15160    mova            m1, m2
15161    punpcklwd       m1, m7
15162    paddd           m6, m1
15163    mova            m1, m2
15164    punpckhwd       m1, m7
15165    paddd           m6, m1
15166%endmacro
15167
15168%macro SATD_16xN_HBD_AVX512 1
15169INIT_ZMM avx512
15170cglobal pixel_satd_16x%1, 4,8,8
15171    add             r1d, r1d
15172    add             r3d, r3d
15173    lea             r4, [3 * r1]
15174    lea             r5, [3 * r3]
15175    pxor            m6, m6
15176
15177%rep %1/8 - 1
15178    PROCESS_SATD_16x8_HBD_AVX512
15179    lea             r0, [r6 + 4 * r1]
15180    lea             r2, [r7 + 4 * r3]
15181%endrep
15182    PROCESS_SATD_16x8_HBD_AVX512
15183    SATD_HBD_AVX512_END
15184    RET
15185%endmacro
15186
15187SATD_16xN_HBD_AVX512 8
15188SATD_16xN_HBD_AVX512 16
15189SATD_16xN_HBD_AVX512 32
15190SATD_16xN_HBD_AVX512 64
15191
15192%macro SATD_32xN_HBD_AVX512 1
15193INIT_ZMM avx512
15194cglobal pixel_satd_32x%1, 4,8,8
15195    add             r1d, r1d
15196    add             r3d, r3d
15197    lea             r4, [3 * r1]
15198    lea             r5, [3 * r3]
15199    pxor            m6, m6
15200    mov             r6, r0
15201    mov             r7, r2
15202%rep %1/4 - 1
15203    PROCESS_SATD_32x4_HBD_AVX512
15204    lea             r0, [r0 + 4 * r1]
15205    lea             r2, [r2 + 4 * r3]
15206%endrep
15207    PROCESS_SATD_32x4_HBD_AVX512
15208    SATD_HBD_AVX512_END
15209    RET
15210%endmacro
15211
15212SATD_32xN_HBD_AVX512 8
15213SATD_32xN_HBD_AVX512 16
15214SATD_32xN_HBD_AVX512 24
15215SATD_32xN_HBD_AVX512 32
15216SATD_32xN_HBD_AVX512 64
15217INIT_ZMM avx512
15218cglobal pixel_satd_48x64, 4,10,8
15219    add             r1d, r1d
15220    add             r3d, r3d
15221    lea             r4, [3 * r1]
15222    lea             r5, [3 * r3]
15223    pxor            m6, m6
15224    mov             r8, r0
15225    mov             r9, r2
15226
15227%rep 15
15228    PROCESS_SATD_32x4_HBD_AVX512
15229    lea             r0, [r0 + 4 * r1]
15230    lea             r2, [r2 + 4 * r3]
15231%endrep
15232    PROCESS_SATD_32x4_HBD_AVX512
15233    lea             r0, [r8 + mmsize]
15234    lea             r2, [r9 + mmsize]
15235%rep 7
15236    PROCESS_SATD_16x8_HBD_AVX512
15237    lea             r0, [r6 + 4 * r1]
15238    lea             r2, [r7 + 4 * r3]
15239%endrep
15240    PROCESS_SATD_16x8_HBD_AVX512
15241    SATD_HBD_AVX512_END
15242    RET
15243
15244%macro SATD_64xN_HBD_AVX512 1
15245INIT_ZMM avx512
15246cglobal pixel_satd_64x%1, 4,8,8
15247    add             r1d, r1d
15248    add             r3d, r3d
15249    lea             r4, [3 * r1]
15250    lea             r5, [3 * r3]
15251    pxor            m6, m6
15252    mov             r6, r0
15253    mov             r7, r2
15254%rep %1/4 - 1
15255    PROCESS_SATD_32x4_HBD_AVX512
15256    lea             r0, [r0 + 4 * r1]
15257    lea             r2, [r2 + 4 * r3]
15258%endrep
15259    PROCESS_SATD_32x4_HBD_AVX512
15260    lea             r0, [r6 + mmsize]
15261    lea             r2, [r7 + mmsize]
15262%rep %1/4 - 1
15263    PROCESS_SATD_32x4_HBD_AVX512
15264    lea             r0, [r0 + 4 * r1]
15265    lea             r2, [r2 + 4 * r3]
15266%endrep
15267    PROCESS_SATD_32x4_HBD_AVX512
15268    SATD_HBD_AVX512_END
15269    RET
15270%endmacro
15271
15272SATD_64xN_HBD_AVX512 16
15273SATD_64xN_HBD_AVX512 32
15274SATD_64xN_HBD_AVX512 48
15275SATD_64xN_HBD_AVX512 64
15276%endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 1
15277
15278
15279;-------------------------------------------------------------------------------------------------------------------------------------
15280; pixel planeClipAndMax(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix)
15281;-------------------------------------------------------------------------------------------------------------------------------------
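;
; For reference, a minimal scalar sketch in C (kept as a comment; it assumes
; pixel == uint8_t, matching the HIGH_BIT_DEPTH == 0 guard below) of the
; behaviour implemented here: clamp every sample to [minPix, maxPix] in place,
; return the sum of the clamped samples through *outsum, and return the
; clamped maximum as the result:
;
;   #include <stdint.h>
;   static uint8_t planeClipAndMax_ref(uint8_t *src, intptr_t stride,
;                                      int width, int height, uint64_t *outsum,
;                                      uint8_t minPix, uint8_t maxPix)
;   {
;       uint64_t sum = 0;
;       uint8_t maxLumaLevel = 0;
;       for (int y = 0; y < height; y++, src += stride)
;           for (int x = 0; x < width; x++)
;           {
;               uint8_t v = src[x];
;               v = v < minPix ? minPix : (v > maxPix ? maxPix : v);
;               src[x] = v;                     /* store the clamped sample */
;               sum += v;                       /* accumulate sumLuma       */
;               if (v > maxLumaLevel)           /* track maxLumaLevel       */
;                   maxLumaLevel = v;
;           }
;       *outsum = sum;
;       return maxLumaLevel;
;   }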
15282%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
15283INIT_YMM avx2
15284cglobal planeClipAndMax, 5,7,8
15285    movd            xm0, r5m
15286    vpbroadcastb    m0, xm0                 ; m0 = [min]
15287    vpbroadcastb    m1, r6m                 ; m1 = [max]
15288    pxor            m2, m2                  ; m2 = sumLuma
15289    pxor            m3, m3                  ; m3 = maxLumaLevel
15290    pxor            m4, m4                  ; m4 = zero
15291
    ; build the mask for the leftover (partial-width) pixels in the last columns
15293    mov             r5d, r2d
15294    and             r2d, ~(mmsize - 1)
15295    sub             r5d, r2d
15296    lea             r6, [pb_movemask_32 + mmsize]
15297    sub             r6, r5
    movu            m5, [r6]                ; m5 = mask for the last partial columns
15299
15300.loopH:
15301    lea             r5d, [r2 - mmsize]
15302
15303.loopW:
15304    movu            m6, [r0 + r5]
15305    pmaxub          m6, m0
15306    pminub          m6, m1
15307    movu            [r0 + r5], m6           ; store back
15308    pmaxub          m3, m6                  ; update maxLumaLevel
15309    psadbw          m6, m4
15310    paddq           m2, m6
15311
15312    sub             r5d, mmsize
15313    jge            .loopW
15314
15315    ; partial pixels
15316    movu            m7, [r0 + r2]
15317    pmaxub          m6, m7, m0
15318    pminub          m6, m1
15319
    pand            m7, m5                  ; keep the invalid (out-of-width) pixels unchanged
    pandn           m6, m5, m6              ; clear the invalid lanes of the clipped pixels
    por             m7, m6                  ; combine valid (clipped) and invalid (original) pixels
15323    movu            [r0 + r2], m7           ; store back
15324    pmaxub          m3, m6                  ; update maxLumaLevel
15325    psadbw          m6, m4
15326    paddq           m2, m6
15327
15328.next:
15329    add             r0, r1
15330    dec             r3d
15331    jg             .loopH
15332
15333    ; sumLuma
15334    vextracti128    xm0, m2, 1
15335    paddq           xm0, xm2
15336    movhlps         xm1, xm0
15337    paddq           xm0, xm1
15338    movq            [r4], xm0
15339
15340    ; maxLumaLevel
15341    vextracti128    xm0, m3, 1
15342    pmaxub          xm0, xm3
15343    movhlps         xm3, xm0
15344    pmaxub          xm0, xm3
15345    pmovzxbw        xm0, xm0
15346    pxor            xm0, [pb_movemask + 16]
15347    phminposuw      xm0, xm0
15348
15349    movd            eax, xm0
15350    not             al
15351    movzx           eax, al
15352    RET
15353%endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
15354
15355
15356%if HIGH_BIT_DEPTH == 1 && BIT_DEPTH == 10
15357%macro LOAD_DIFF_AVX2 4
15358    movu       %1, %3
15359    movu       %2, %4
15360    psubw      %1, %2
15361%endmacro
15362
15363%macro LOAD_DIFF_8x4P_AVX2 6-8 r0,r2 ; 4x dest, 2x temp, 2x pointer
15364    LOAD_DIFF_AVX2 xm%1, xm%5, [%7],      [%8]
15365    LOAD_DIFF_AVX2 xm%2, xm%6, [%7+r1],   [%8+r3]
15366    LOAD_DIFF_AVX2 xm%3, xm%5, [%7+2*r1], [%8+2*r3]
15367    LOAD_DIFF_AVX2 xm%4, xm%6, [%7+r4],   [%8+r5]
15368
15369    ;lea %7, [%7+4*r1]
15370    ;lea %8, [%8+4*r3]
15371%endmacro
15372
15373%if ARCH_X86_64
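; pixel_satd_8x8 (10-bit): each YMM register pairs one row of the top 8x4 half
; in its low 128-bit lane with the matching row four strides below in its high
; lane, so a single SATD_8x4_SSE pass covers all eight rows.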
15374INIT_YMM avx2
15375cglobal pixel_satd_8x8, 4,4,7
15376
15377    FIX_STRIDES r1, r3
15378    pxor    xm6, xm6
15379
15380    ; load_diff 0 & 4
15381    movu    xm0, [r0]
15382    movu    xm1, [r2]
15383    vinserti128 m0, m0, [r0 + r1 * 4], 1
15384    vinserti128 m1, m1, [r2 + r3 * 4], 1
15385    psubw   m0, m1
15386    add     r0, r1
15387    add     r2, r3
15388
15389    ; load_diff 1 & 5
15390    movu    xm1, [r0]
15391    movu    xm2, [r2]
15392    vinserti128 m1, m1, [r0 + r1 * 4], 1
15393    vinserti128 m2, m2, [r2 + r3 * 4], 1
15394    psubw   m1, m2
15395    add     r0, r1
15396    add     r2, r3
15397
15398    ; load_diff 2 & 6
15399    movu    xm2, [r0]
15400    movu    xm3, [r2]
15401    vinserti128 m2, m2, [r0 + r1 * 4], 1
15402    vinserti128 m3, m3, [r2 + r3 * 4], 1
15403    psubw   m2, m3
15404    add     r0, r1
15405    add     r2, r3
15406
15407    ; load_diff 3 & 7
15408    movu    xm3, [r0]
15409    movu    xm4, [r2]
15410    vinserti128 m3, m3, [r0 + r1 * 4], 1
15411    vinserti128 m4, m4, [r2 + r3 * 4], 1
15412    psubw   m3, m4
15413
15414    SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
15415
15416    vextracti128 xm0, m6, 1
15417    paddw xm6, xm0
15418    HADDUW xm6, xm0
15419    movd   eax, xm6
15420    RET
15421
15422INIT_XMM avx2
15423cglobal pixel_sa8d_8x8_internal
15424    lea  r6, [r0+4*r1]
15425    lea  r7, [r2+4*r3]
15426    LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
15427    LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
15428
15429    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
15430    ;HADAMARD2_2D 0, 1, 2, 8, 6, wd
15431    ;HADAMARD2_2D 4, 5, 3, 9, 6, wd
15432    ;HADAMARD2_2D 0, 2, 1, 8, 6, dq
15433    ;HADAMARD2_2D 4, 3, 5, 9, 6, dq
15434    ;HADAMARD2_2D 0, 4, 2, 3, 6, qdq, amax
15435    ;HADAMARD2_2D 1, 5, 8, 9, 6, qdq, amax
15436
15437    paddw m0, m1
15438    paddw m0, m2
15439    paddw m0, m8
15440    SAVE_MM_PERMUTATION
15441    ret
15442
15443
15444INIT_XMM avx2
15445cglobal pixel_sa8d_8x8, 4,8,12
15446    FIX_STRIDES r1, r3
15447    lea  r4, [3*r1]
15448    lea  r5, [3*r3]
15449    call pixel_sa8d_8x8_internal
15450    HADDUW m0, m1
15451    movd eax, m0
15452    add eax, 1
15453    shr eax, 1
15454    RET
15455
15456
15457INIT_YMM avx2
15458cglobal pixel_sa8d_16x16, 4,8,12
15459    FIX_STRIDES r1, r3
15460    lea  r4, [3*r1]
15461    lea  r5, [3*r3]
15462    lea  r6, [r0+4*r1]
15463    lea  r7, [r2+4*r3]
15464    vbroadcasti128 m7, [pw_1]
15465
15466    ; Top 16x8
15467    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
15468    movu m0, [r0]                                   ; 10 bits
15469    movu m5, [r2]
15470    psubw m0, m5                                    ; 11 bits
15471    movu m1, [r0 + r1]
15472    movu m6, [r2 + r3]
15473    psubw m1, m6
15474    movu m2, [r0 + r1 * 2]
15475    movu m5, [r2 + r3 * 2]
15476    psubw m2, m5
15477    movu m8, [r0 + r4]
15478    movu m6, [r2 + r5]
15479    psubw m8, m6
15480
15481    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
15482    movu m4, [r6]
15483    movu m11, [r7]
15484    psubw m4, m11
15485    movu m5, [r6 + r1]
15486    movu m6, [r7 + r3]
15487    psubw m5, m6
15488    movu m3, [r6 + r1 * 2]
15489    movu m11, [r7 + r3 * 2]
15490    psubw m3, m11
15491    movu m9, [r6 + r4]
15492    movu m6, [r7 + r5]
15493    psubw m9, m6
15494
15495    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax    ; 16 bits
15496    pmaddwd m0, m7
15497    pmaddwd m1, m7
15498    pmaddwd m2, m7
15499    pmaddwd m8, m7
15500    paddd m0, m1
15501    paddd m2, m8
15502    paddd m10, m0, m2
15503
15504    lea  r0, [r0+8*r1]
15505    lea  r2, [r2+8*r3]
15506    lea  r6, [r6+8*r1]
15507    lea  r7, [r7+8*r3]
15508
15509    ; Bottom 16x8
15510    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
15511    movu m0, [r0]
15512    movu m5, [r2]
15513    psubw m0, m5
15514    movu m1, [r0 + r1]
15515    movu m6, [r2 + r3]
15516    psubw m1, m6
15517    movu m2, [r0 + r1 * 2]
15518    movu m5, [r2 + r3 * 2]
15519    psubw m2, m5
15520    movu m8, [r0 + r4]
15521    movu m6, [r2 + r5]
15522    psubw m8, m6
15523
15524    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
15525    movu m4, [r6]
15526    movu m11, [r7]
15527    psubw m4, m11
15528    movu m5, [r6 + r1]
15529    movu m6, [r7 + r3]
15530    psubw m5, m6
15531    movu m3, [r6 + r1 * 2]
15532    movu m11, [r7 + r3 * 2]
15533    psubw m3, m11
15534    movu m9, [r6 + r4]
15535    movu m6, [r7 + r5]
15536    psubw m9, m6
15537
15538    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
15539    pmaddwd m0, m7
15540    pmaddwd m1, m7
15541    pmaddwd m2, m7
15542    pmaddwd m8, m7
15543    paddd m0, m1
15544    paddd m2, m8
15545    paddd m10, m0
15546    paddd m10, m2
15547
15548    HADDD m10, m0
15549
15550    movd eax, xm10
15551    add  eax, 1
15552    shr  eax, 1
15553    RET
15554
15555
; TODO: optimize me; 2 more YMM registers are needed because the C model produces a partial result for every 16x16 block (each 16x16 sum is rounded separately before the final add)
15557INIT_YMM avx2
15558cglobal pixel_sa8d_32x32, 4,8,14
15559    FIX_STRIDES r1, r3
15560    lea  r4, [3*r1]
15561    lea  r5, [3*r3]
15562    lea  r6, [r0+4*r1]
15563    lea  r7, [r2+4*r3]
15564    vbroadcasti128 m7, [pw_1]
15565
15566
15567    ;SA8D[16x8] ; pix[0]
15568    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
15569    movu m0, [r0]
15570    movu m5, [r2]
15571    psubw m0, m5
15572    movu m1, [r0 + r1]
15573    movu m6, [r2 + r3]
15574    psubw m1, m6
15575    movu m2, [r0 + r1 * 2]
15576    movu m5, [r2 + r3 * 2]
15577    psubw m2, m5
15578    movu m8, [r0 + r4]
15579    movu m6, [r2 + r5]
15580    psubw m8, m6
15581
15582    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
15583    movu m4, [r6]
15584    movu m11, [r7]
15585    psubw m4, m11
15586    movu m5, [r6 + r1]
15587    movu m6, [r7 + r3]
15588    psubw m5, m6
15589    movu m3, [r6 + r1 * 2]
15590    movu m11, [r7 + r3 * 2]
15591    psubw m3, m11
15592    movu m9, [r6 + r4]
15593    movu m6, [r7 + r5]
15594    psubw m9, m6
15595
15596    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
15597    pmaddwd m0, m7
15598    pmaddwd m1, m7
15599    pmaddwd m2, m7
15600    pmaddwd m8, m7
15601    paddd m0, m1
15602    paddd m2, m8
15603    paddd m10, m0, m2
15604
15605
15606    ; SA8D[16x8] ; pix[16]
15607    add  r0, mmsize
15608    add  r2, mmsize
15609    add  r6, mmsize
15610    add  r7, mmsize
15611
15612    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
15613    movu m0, [r0]
15614    movu m5, [r2]
15615    psubw m0, m5
15616    movu m1, [r0 + r1]
15617    movu m6, [r2 + r3]
15618    psubw m1, m6
15619    movu m2, [r0 + r1 * 2]
15620    movu m5, [r2 + r3 * 2]
15621    psubw m2, m5
15622    movu m8, [r0 + r4]
15623    movu m6, [r2 + r5]
15624    psubw m8, m6
15625
15626    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
15627    movu m4, [r6]
15628    movu m11, [r7]
15629    psubw m4, m11
15630    movu m5, [r6 + r1]
15631    movu m6, [r7 + r3]
15632    psubw m5, m6
15633    movu m3, [r6 + r1 * 2]
15634    movu m11, [r7 + r3 * 2]
15635    psubw m3, m11
15636    movu m9, [r6 + r4]
15637    movu m6, [r7 + r5]
15638    psubw m9, m6
15639
15640    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
15641    pmaddwd m0, m7
15642    pmaddwd m1, m7
15643    pmaddwd m2, m7
15644    pmaddwd m8, m7
15645    paddd m0, m1
15646    paddd m2, m8
15647    paddd m12, m0, m2
15648
15649
15650    ; SA8D[16x8] ; pix[8*stride+16]
15651    lea  r0, [r0+8*r1]
15652    lea  r2, [r2+8*r3]
15653    lea  r6, [r6+8*r1]
15654    lea  r7, [r7+8*r3]
15655
15656    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
15657    movu m0, [r0]
15658    movu m5, [r2]
15659    psubw m0, m5
15660    movu m1, [r0 + r1]
15661    movu m6, [r2 + r3]
15662    psubw m1, m6
15663    movu m2, [r0 + r1 * 2]
15664    movu m5, [r2 + r3 * 2]
15665    psubw m2, m5
15666    movu m8, [r0 + r4]
15667    movu m6, [r2 + r5]
15668    psubw m8, m6
15669
15670    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
15671    movu m4, [r6]
15672    movu m11, [r7]
15673    psubw m4, m11
15674    movu m5, [r6 + r1]
15675    movu m6, [r7 + r3]
15676    psubw m5, m6
15677    movu m3, [r6 + r1 * 2]
15678    movu m11, [r7 + r3 * 2]
15679    psubw m3, m11
15680    movu m9, [r6 + r4]
15681    movu m6, [r7 + r5]
15682    psubw m9, m6
15683
15684    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
15685    pmaddwd m0, m7
15686    pmaddwd m1, m7
15687    pmaddwd m2, m7
15688    pmaddwd m8, m7
15689    paddd m0, m1
15690    paddd m2, m8
15691    paddd m12, m0
15692    paddd m12, m2
15693
15694    ; sum[1]
15695    HADDD m12, m0
15696
15697
15698    ; SA8D[16x8] ; pix[8*stride]
15699    sub  r0, mmsize
15700    sub  r2, mmsize
15701    sub  r6, mmsize
15702    sub  r7, mmsize
15703
15704    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
15705    movu m0, [r0]
15706    movu m5, [r2]
15707    psubw m0, m5
15708    movu m1, [r0 + r1]
15709    movu m6, [r2 + r3]
15710    psubw m1, m6
15711    movu m2, [r0 + r1 * 2]
15712    movu m5, [r2 + r3 * 2]
15713    psubw m2, m5
15714    movu m8, [r0 + r4]
15715    movu m6, [r2 + r5]
15716    psubw m8, m6
15717
15718    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
15719    movu m4, [r6]
15720    movu m11, [r7]
15721    psubw m4, m11
15722    movu m5, [r6 + r1]
15723    movu m6, [r7 + r3]
15724    psubw m5, m6
15725    movu m3, [r6 + r1 * 2]
15726    movu m11, [r7 + r3 * 2]
15727    psubw m3, m11
15728    movu m9, [r6 + r4]
15729    movu m6, [r7 + r5]
15730    psubw m9, m6
15731
15732    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
15733    pmaddwd m0, m7
15734    pmaddwd m1, m7
15735    pmaddwd m2, m7
15736    pmaddwd m8, m7
15737    paddd m0, m1
15738    paddd m2, m8
15739    paddd m10, m0
15740    paddd m10, m2
15741
15742    ; sum[0]
15743    HADDD m10, m0
15744    punpckldq xm10, xm12
15745
15746
    ;SA8D[16x8] ; pix[16*stride]
15748    lea  r0, [r0+8*r1]
15749    lea  r2, [r2+8*r3]
15750    lea  r6, [r6+8*r1]
15751    lea  r7, [r7+8*r3]
15752
15753    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
15754    movu m0, [r0]
15755    movu m5, [r2]
15756    psubw m0, m5
15757    movu m1, [r0 + r1]
15758    movu m6, [r2 + r3]
15759    psubw m1, m6
15760    movu m2, [r0 + r1 * 2]
15761    movu m5, [r2 + r3 * 2]
15762    psubw m2, m5
15763    movu m8, [r0 + r4]
15764    movu m6, [r2 + r5]
15765    psubw m8, m6
15766
15767    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
15768    movu m4, [r6]
15769    movu m11, [r7]
15770    psubw m4, m11
15771    movu m5, [r6 + r1]
15772    movu m6, [r7 + r3]
15773    psubw m5, m6
15774    movu m3, [r6 + r1 * 2]
15775    movu m11, [r7 + r3 * 2]
15776    psubw m3, m11
15777    movu m9, [r6 + r4]
15778    movu m6, [r7 + r5]
15779    psubw m9, m6
15780
15781    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
15782    pmaddwd m0, m7
15783    pmaddwd m1, m7
15784    pmaddwd m2, m7
15785    pmaddwd m8, m7
15786    paddd m0, m1
15787    paddd m2, m8
15788    paddd m12, m0, m2
15789
15790
15791    ; SA8D[16x8] ; pix[16*stride+16]
15792    add  r0, mmsize
15793    add  r2, mmsize
15794    add  r6, mmsize
15795    add  r7, mmsize
15796
15797    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
15798    movu m0, [r0]
15799    movu m5, [r2]
15800    psubw m0, m5
15801    movu m1, [r0 + r1]
15802    movu m6, [r2 + r3]
15803    psubw m1, m6
15804    movu m2, [r0 + r1 * 2]
15805    movu m5, [r2 + r3 * 2]
15806    psubw m2, m5
15807    movu m8, [r0 + r4]
15808    movu m6, [r2 + r5]
15809    psubw m8, m6
15810
15811    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
15812    movu m4, [r6]
15813    movu m11, [r7]
15814    psubw m4, m11
15815    movu m5, [r6 + r1]
15816    movu m6, [r7 + r3]
15817    psubw m5, m6
15818    movu m3, [r6 + r1 * 2]
15819    movu m11, [r7 + r3 * 2]
15820    psubw m3, m11
15821    movu m9, [r6 + r4]
15822    movu m6, [r7 + r5]
15823    psubw m9, m6
15824
15825    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
15826    pmaddwd m0, m7
15827    pmaddwd m1, m7
15828    pmaddwd m2, m7
15829    pmaddwd m8, m7
15830    paddd m0, m1
15831    paddd m2, m8
15832    paddd m13, m0, m2
15833
15834
15835    ; SA8D[16x8] ; pix[24*stride+16]
15836    lea  r0, [r0+8*r1]
15837    lea  r2, [r2+8*r3]
15838    lea  r6, [r6+8*r1]
15839    lea  r7, [r7+8*r3]
15840
15841    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
15842    movu m0, [r0]
15843    movu m5, [r2]
15844    psubw m0, m5
15845    movu m1, [r0 + r1]
15846    movu m6, [r2 + r3]
15847    psubw m1, m6
15848    movu m2, [r0 + r1 * 2]
15849    movu m5, [r2 + r3 * 2]
15850    psubw m2, m5
15851    movu m8, [r0 + r4]
15852    movu m6, [r2 + r5]
15853    psubw m8, m6
15854
15855    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
15856    movu m4, [r6]
15857    movu m11, [r7]
15858    psubw m4, m11
15859    movu m5, [r6 + r1]
15860    movu m6, [r7 + r3]
15861    psubw m5, m6
15862    movu m3, [r6 + r1 * 2]
15863    movu m11, [r7 + r3 * 2]
15864    psubw m3, m11
15865    movu m9, [r6 + r4]
15866    movu m6, [r7 + r5]
15867    psubw m9, m6
15868
15869    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
15870    pmaddwd m0, m7
15871    pmaddwd m1, m7
15872    pmaddwd m2, m7
15873    pmaddwd m8, m7
15874    paddd m0, m1
15875    paddd m2, m8
15876    paddd m13, m0
15877    paddd m13, m2
15878
15879    ; sum[3]
15880    HADDD m13, m0
15881
15882
15883    ; SA8D[16x8] ; pix[24*stride]
15884    sub  r0, mmsize
15885    sub  r2, mmsize
15886    sub  r6, mmsize
15887    sub  r7, mmsize
15888
15889    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
15890    movu m0, [r0]
15891    movu m5, [r2]
15892    psubw m0, m5
15893    movu m1, [r0 + r1]
15894    movu m6, [r2 + r3]
15895    psubw m1, m6
15896    movu m2, [r0 + r1 * 2]
15897    movu m5, [r2 + r3 * 2]
15898    psubw m2, m5
15899    movu m8, [r0 + r4]
15900    movu m6, [r2 + r5]
15901    psubw m8, m6
15902
15903    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
15904    movu m4, [r6]
15905    movu m11, [r7]
15906    psubw m4, m11
15907    movu m5, [r6 + r1]
15908    movu m6, [r7 + r3]
15909    psubw m5, m6
15910    movu m3, [r6 + r1 * 2]
15911    movu m11, [r7 + r3 * 2]
15912    psubw m3, m11
15913    movu m9, [r6 + r4]
15914    movu m6, [r7 + r5]
15915    psubw m9, m6
15916
15917    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
15918    pmaddwd m0, m7
15919    pmaddwd m1, m7
15920    pmaddwd m2, m7
15921    pmaddwd m8, m7
15922    paddd m0, m1
15923    paddd m2, m8
15924    paddd m12, m0
15925    paddd m12, m2
15926
15927    ; sum[2]
15928    HADDD m12, m0
15929    punpckldq xm12, xm13
15930
15931    ; SA8D
15932    punpcklqdq xm0, xm10, xm12
15933    paddd xm0, [pd_1]
15934    psrld xm0, 1
15935    HADDD xm0, xm1
15936
15937    movd eax, xm0
15938    RET
15939%endif
15940%endif ; HIGH_BIT_DEPTH == 1 && BIT_DEPTH == 10
15941
15942;template<int log2TrSize>
15943;static void ssimDist_c(const pixel* fenc, uint32_t fStride, const pixel* recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k)
15944;{
15945;    *ssBlock = 0;
15946;    const uint32_t trSize = 1 << log2TrSize;
15947;    for (int y = 0; y < trSize; y++)
15948;    {
15949;        for (int x = 0; x < trSize; x++)
15950;        {
15951;            int temp = fenc[y * fStride + x] - recon[y * rstride + x]; // copy of residual coeff
15952;            *ssBlock += temp * temp;
15953;        }
15954;    }
15955;
15956;    *ac_k = 0;
15957;    for (int block_yy = 0; block_yy < trSize; block_yy += 1)
15958;    {
15959;        for (int block_xx = 0; block_xx < trSize; block_xx += 1)
15960;        {
15961;            uint32_t temp = fenc[block_yy * fStride + block_xx] >> shift;
15962;            *ac_k += temp * temp;
15963;        }
15964;    }
15965;}
15966;-----------------------------------------------------------------------------------------------------------------
15967; void ssimDist_c(const pixel* fenc, uint32_t fStride, const pixel* recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k)
15968;-----------------------------------------------------------------------------------------------------------------
15969
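; ssimDist4: a 4x4 block has only four pixels per row, so they are zero-extended
; straight to qwords (vpmovzxwq / vpmovzxbq) and the squared terms are
; accumulated in 64-bit lanes without the SSIM_DIST_* helper macros.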
15970INIT_YMM avx2
15971cglobal ssimDist4, 7, 8, 8
15972    mov            r5d,        4
15973    vpxor          m4,         m4                              ;ssBlock
15974    vpxor          m3,         m3
15975    vpxor          m7,         m7                              ;ac_k
15976.row:
15977%if HIGH_BIT_DEPTH
15978    vpmovzxwq      m0,        [r0]                             ;fenc
15979    vpmovzxwq      m1,        [r2]                             ;recon
15980%elif BIT_DEPTH == 8
15981    vpmovzxbq      m0,        [r0]
15982    vpmovzxbq      m1,        [r2]
15983%else
15984    %error Unsupported BIT_DEPTH!
15985%endif
15986    vpsrlq         m6,        m0,        SSIMRD_SHIFT
15987    vpsubq         m0,        m1
15988    vpmuldq        m0,        m0,        m0
15989    vpmuldq        m6,        m6,        m6
15990    vpaddq         m4,        m0
15991    vpaddq         m7,        m6
15992
15993%if HIGH_BIT_DEPTH
15994    lea            r0,        [r0 + 2 * r1]
15995    lea            r2,        [r2 + 2 * r3]
15996%else
15997    lea            r0,        [r0 + r1]
15998    lea            r2,        [r2 + r3]
15999%endif
16000    dec            r5d
16001    jnz           .row
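    ; horizontal reduction: fold the four 64-bit lanes of m4 (ssBlock) and
    ; m7 (ac_k) down to a single qword each before the stores below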
16002    vextracti128   xm5,       m4,        1
16003    vpaddq         xm4,       xm5
16004    punpckhqdq     xm2,       xm4,       xm3
16005    paddq          xm4,       xm2
16006
16007    vextracti128   xm5,       m7,        1
16008    vpaddq         xm7,       xm5
16009    punpckhqdq     xm2,       xm7,       xm3
16010    paddq          xm7,       xm2
16011
16012    movq          [r4],       xm4
16013    movq          [r6],       xm7
16014    RET
16015
16016
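; ssimDist8/16/32/64 below use the SSIM_DIST_HIGH / SSIM_DIST_LOW helper macros
; (defined earlier in this file); like the inline code in ssimDist4 above, they
; are expected to accumulate (fenc - recon)^2 into m4 and
; (fenc >> SSIMRD_SHIFT)^2 into m7.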
16017INIT_YMM avx2
16018cglobal ssimDist8, 7, 8, 8
16019    mov            r5d,        8
16020    vpxor          m4,         m4                              ;ssBlock
16021    vpxor          m3,         m3
16022    vpxor          m7,         m7                              ;ac_k
16023.row:
16024%if HIGH_BIT_DEPTH
16025    vpmovzxwd      m0,        [r0]                             ;fenc
16026    vpmovzxwd      m1,        [r2]                             ;recon
16027%elif BIT_DEPTH == 8
16028    vpmovzxbd      m0,        [r0]
16029    vpmovzxbd      m1,        [r2]
16030%else
16031    %error Unsupported BIT_DEPTH!
16032%endif
16033
16034    SSIM_DIST_HIGH m0,          m1
16035
16036%if HIGH_BIT_DEPTH
16037    lea            r0,         [r0 + 2 * r1]
16038    lea            r2,         [r2 + 2 * r3]
16039%else
16040    lea            r0,         [r0 + r1]
16041    lea            r2,         [r2 + r3]
16042%endif
16043    dec            r5d
16044    jnz            .row
16045    vextracti128   xm5,        m4,        1
16046    vpaddq         xm4,        xm5
16047    punpckhqdq     xm2,        xm4,       xm3
16048    paddq          xm4,        xm2
16049
16050    vextracti128   xm5,        m7,       1
16051    vpaddq         xm7,        xm5
16052    punpckhqdq     xm2,        xm7,      xm3
16053    paddq          xm7,        xm2
16054
16055    movq           [r4],       xm4
16056    movq           [r6],       xm7
16057    RET
16058
16059
16060INIT_YMM avx2
16061cglobal ssimDist16, 7, 8, 8
16062    mov            r5d,         16
16063    vpxor          m4,          m4                                ;ssBlock
16064    vpxor          m3,          m3
16065    vpxor          m7,          m7                                ;ac_k
16066.row:
16067%if HIGH_BIT_DEPTH
16068;Col 1-8
16069    vpmovzxwd      m0,          [r0]                              ;fenc
16070    vpmovzxwd      m1,          [r2]                              ;recon
16071
16072    SSIM_DIST_HIGH m0,          m1
16073
16074;Col 9-16
16075    vpmovzxwd      m0,          [r0 + 16]
16076    vpmovzxwd      m1,          [r2 + 16]
16077
16078    SSIM_DIST_HIGH m0,          m1
16079
16080    lea            r0,         [r0 + 2 * r1]
16081    lea            r2,         [r2 + 2 * r3]
16082%elif BIT_DEPTH == 8
;col 1-16
16084    vpmovzxbw      m0,         [r0]                             ;fenc
16085    vpmovzxbw      m1,         [r2]                             ;recon
16086
16087    SSIM_DIST_LOW  m0,         m1
16088
16089    lea            r0,         [r0 + r1]
16090    lea            r2,         [r2 + r3]
16091%else
16092    %error Unsupported BIT_DEPTH!
16093%endif
16094    dec            r5d
16095    jnz           .row
16096
16097%if HIGH_BIT_DEPTH
16098    vextracti128   xm5,        m4,        1
16099    vpaddq         xm4,        xm5
16100    punpckhqdq     xm2,        xm4,       xm3
16101    paddq          xm4,        xm2
16102
16103    vextracti128   xm5,        m7,        1
16104    vpaddq         xm7,        xm5
16105    punpckhqdq     xm2,        xm7,       xm3
16106    paddq          xm7,        xm2
16107%else
16108    vextracti128   xm5,        m4,        1
16109    vpaddd         xm4,        xm5
16110    punpckhqdq     xm2,        xm4,       xm3
16111    paddd          xm4,        xm2
16112    punpckldq      xm4,        xm4,       xm3
16113    punpckhqdq     xm2,        xm4,       xm3
16114    paddd          xm4,        xm2
16115
16116    vextracti128   xm5,        m7,        1
16117    vpaddd         xm7,        xm5
16118    punpckhqdq     xm2,        xm7,       xm3
16119    paddd          xm7,        xm2
16120    punpckldq      xm7,        xm7,       xm3
16121    punpckhqdq     xm2,        xm7,       xm3
16122    paddd          xm7,        xm2
16123%endif
16124    movq           [r4],       xm4
16125    movq           [r6],       xm7
16126    RET
16127
16128
16129INIT_YMM avx2
16130cglobal ssimDist32, 7, 8, 8
16131    mov            r5d,        32
16132    vpxor          m4,         m4                              ;ssBlock
16133    vpxor          m3,         m3
16134    vpxor          m7,         m7                              ;ac_k
16135.row:
16136%if HIGH_BIT_DEPTH
16137;Col 1-8
16138    vpmovzxwd      m0,         [r0]                            ;fenc
16139    vpmovzxwd      m1,         [r2]                            ;recon
16140
16141    SSIM_DIST_HIGH m0,          m1
16142
16143;Col 9-16
16144    vpmovzxwd      m0,          [r0 + 16]
16145    vpmovzxwd      m1,          [r2 + 16]
16146
16147    SSIM_DIST_HIGH m0,          m1
16148
16149;Col 17-24
16150    vpmovzxwd      m0,          [r0 + 32]
16151    vpmovzxwd      m1,          [r2 + 32]
16152
16153    SSIM_DIST_HIGH m0,          m1
16154
16155;Col 25-32
16156    vpmovzxwd      m0,          [r0 + 48]
16157    vpmovzxwd      m1,          [r2 + 48]
16158
16159    SSIM_DIST_HIGH m0,          m1
16160
16161    lea            r0,          [r0 + 2 * r1]
16162    lea            r2,          [r2 + 2 * r3]
16163%elif BIT_DEPTH == 8
16164;col 1-16
16165    vpmovzxbw      m0,         [r0]                             ;fenc
16166    vpmovzxbw      m1,         [r2]                             ;recon
16167
16168    SSIM_DIST_LOW  m0,         m1
16169
16170;col 17-32
16171    vpmovzxbw      m0,         [r0 + 16]
16172    vpmovzxbw      m1,         [r2 + 16]
16173
16174    SSIM_DIST_LOW  m0,         m1
16175
16176    lea            r0,          [r0 + r1]
16177    lea            r2,          [r2 + r3]
16178%else
16179    %error Unsupported BIT_DEPTH!
16180%endif
16181    dec            r5d
16182    jnz           .row
16183
16184%if HIGH_BIT_DEPTH
16185    vextracti128   xm5,         m4,        1
16186    vpaddq         xm4,         xm5
16187    punpckhqdq     xm2,         xm4,       xm3
16188    paddq          xm4,         xm2
16189
16190    vextracti128   xm5,         m7,        1
16191    vpaddq         xm7,         xm5
16192    punpckhqdq     xm2,         xm7,       xm3
16193    paddq          xm7,         xm2
16194%else
16195    vextracti128   xm5,        m4,        1
16196    vpaddd         xm4,        xm5
16197    punpckhqdq     xm2,        xm4,       xm3
16198    paddd          xm4,        xm2
16199    punpckldq      xm4,        xm4,       xm3
16200    punpckhqdq     xm2,        xm4,       xm3
16201    paddd          xm4,        xm2
16202
16203    vextracti128   xm5,        m7,        1
16204    vpaddd         xm7,        xm5
16205    punpckhqdq     xm2,        xm7,       xm3
16206    paddd          xm7,        xm2
16207    punpckldq      xm7,        xm7,       xm3
16208    punpckhqdq     xm2,        xm7,       xm3
16209    paddd          xm7,        xm2
16210%endif
16211    movq           [r4],        xm4
16212    movq           [r6],        xm7
16213    RET
16214
16215
16216INIT_YMM avx2
16217cglobal ssimDist64, 7, 8, 8
16218    mov            r5d,         64
16219    vpxor          m4,          m4                             ;ssBlock
16220    vpxor          m3,          m3
16221    vpxor          m7,          m7                             ;ac_k
16222.row:
16223%if HIGH_BIT_DEPTH
16224;Col 1-8
16225    vpmovzxwd      m0,          [r0]                           ;fenc
16226    vpmovzxwd      m1,          [r2]                           ;recon
16227
16228    SSIM_DIST_HIGH m0,          m1
16229
16230;Col 9-16
16231    vpmovzxwd      m0,          [r0 + 16]
16232    vpmovzxwd      m1,          [r2 + 16]
16233
16234    SSIM_DIST_HIGH m0,          m1
16235
16236;Col 17-24
16237    vpmovzxwd      m0,          [r0 + 32]
16238    vpmovzxwd      m1,          [r2 + 32]
16239
16240    SSIM_DIST_HIGH m0,          m1
16241
16242;Col 25-32
16243    vpmovzxwd      m0,          [r0 + 48]
16244    vpmovzxwd      m1,          [r2 + 48]
16245
16246    SSIM_DIST_HIGH m0,          m1
16247
16248;Col 33-40
16249    vpmovzxwd      m0,          [r0 + 64]
16250    vpmovzxwd      m1,          [r2 + 64]
16251
16252    SSIM_DIST_HIGH m0,          m1
16253
16254;Col 41-48
16255    vpmovzxwd      m0,          [r0 + 80]
16256    vpmovzxwd      m1,          [r2 + 80]
16257
16258    SSIM_DIST_HIGH m0,          m1
16259
16260;Col 49-56
16261    vpmovzxwd      m0,          [r0 + 96]
16262    vpmovzxwd      m1,          [r2 + 96]
16263
16264    SSIM_DIST_HIGH m0,          m1
16265
16266;Col 57-64
16267    vpmovzxwd      m0,          [r0 + 112]
16268    vpmovzxwd      m1,          [r2 + 112]
16269
16270    SSIM_DIST_HIGH m0,          m1
16271
16272    lea            r0,          [r0 + 2 * r1]
16273    lea            r2,          [r2 + 2 * r3]
16274%elif BIT_DEPTH == 8
16275;col 1-16
16276    vpmovzxbw      m0,         [r0]                             ;fenc
16277    vpmovzxbw      m1,         [r2]                             ;recon
16278
16279    SSIM_DIST_LOW  m0,         m1
16280
16281;col 17-32
16282    vpmovzxbw      m0,         [r0 + 16]
16283    vpmovzxbw      m1,         [r2 + 16]
16284
16285    SSIM_DIST_LOW  m0,         m1
16286
16287;col 33-48
16288    vpmovzxbw      m0,         [r0 + 32]
16289    vpmovzxbw      m1,         [r2 + 32]
16290
16291    SSIM_DIST_LOW  m0,         m1
16292
16293;col 49-64
16294    vpmovzxbw      m0,         [r0 + 48]
16295    vpmovzxbw      m1,         [r2 + 48]
16296
16297    SSIM_DIST_LOW  m0,         m1
16298
16299    lea            r0,          [r0 + r1]
16300    lea            r2,          [r2 + r3]
%else
    %error Unsupported BIT_DEPTH!
%endif
16302    dec            r5d
16303    jnz            .row
16304
16305%if HIGH_BIT_DEPTH
16306    vextracti128   xm5,          m4,        1
16307    vpaddq         xm4,          xm5
16308    punpckhqdq     xm2,          xm4,       xm3
16309    paddq          xm4,          xm2
16310
16311    vextracti128   xm5,          m7,        1
16312    vpaddq         xm7,          xm5
16313    punpckhqdq     xm2,          xm7,       xm3
16314    paddq          xm7,          xm2
16315%else
16316    vextracti128   xm5,        m4,        1
16317    vpaddd         xm4,        xm5
16318    punpckhqdq     xm2,        xm4,       xm3
16319    paddd          xm4,        xm2
16320    punpckldq      xm4,        xm4,       xm3
16321    punpckhqdq     xm2,        xm4,       xm3
16322    paddd          xm4,        xm2
16323
16324    vextracti128   xm5,        m7,        1
16325    vpaddd         xm7,        xm5
16326    punpckhqdq     xm2,        xm7,       xm3
16327    paddd          xm7,        xm2
16328    punpckldq      xm7,        xm7,       xm3
16329    punpckhqdq     xm2,        xm7,       xm3
16330    paddd          xm7,        xm2
16331%endif
16332    movq           [r4],         xm4
16333    movq           [r6],         xm7
16334    RET
16335
16336
16337;static void normFact_c(const pixel* src, uint32_t blockSize, int shift, uint64_t *z_k)
16338;{
16339;    *z_k = 0;
16340;    for (uint32_t block_yy = 0; block_yy < blockSize; block_yy += 1)
16341;    {
16342;        for (uint32_t block_xx = 0; block_xx < blockSize; block_xx += 1)
16343;        {
16344;            uint32_t temp = src[block_yy * blockSize + block_xx] >> shift;
16345;            *z_k += temp * temp;
16346;        }
16347;    }
16348;}
16349;--------------------------------------------------------------------------------------
16350; void normFact_c(const pixel* src, uint32_t blockSize, int shift, uint64_t *z_k)
16351;--------------------------------------------------------------------------------------
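; normFact8/16/32/64 use the NORM_FACT_HIGH / NORM_FACT_LOW helper macros
; (defined earlier in this file), expected to accumulate (src >> SSIMRD_SHIFT)^2
; into the m3 accumulator, matching the C model above.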
16352INIT_YMM avx2
16353cglobal normFact8, 4, 5, 6
16354    mov            r4d,       8
16355    vpxor          m3,        m3                               ;z_k
16356    vpxor          m5,        m5
16357.row:
16358%if HIGH_BIT_DEPTH
16359    vpmovzxwd      m0,        [r0]                             ;src
16360%elif BIT_DEPTH == 8
16361    vpmovzxbd      m0,        [r0]
16362%else
16363    %error Unsupported BIT_DEPTH!
16364%endif
16365
16366    NORM_FACT_HIGH m0
16367
16368%if HIGH_BIT_DEPTH
16369    lea            r0,         [r0 + 2 * r1]
16370%else
16371    lea            r0,         [r0 + r1]
16372%endif
16373    dec            r4d
16374    jnz           .row
16375    vextracti128   xm4,         m3,        1
16376    vpaddq         xm3,         xm4
16377    punpckhqdq     xm2,         xm3,       xm5
16378    paddq          xm3,         xm2
16379    movq           [r3],        xm3
16380    RET
16381
16382
16383INIT_YMM avx2
16384cglobal normFact16, 4, 5, 6
16385    mov            r4d,         16
16386    vpxor          m3,          m3                                ;z_k
16387    vpxor          m5,          m5
16388.row:
16389%if HIGH_BIT_DEPTH
16390;Col 1-8
16391    vpmovzxwd      m0,          [r0]                              ;src
16392
16393    NORM_FACT_HIGH  m0
16394
16395;Col 9-16
16396    vpmovzxwd      m0,          [r0 + 16]
16397
16398    NORM_FACT_HIGH m0
16399
16400    lea            r0,         [r0 + 2 * r1]
16401%elif BIT_DEPTH == 8
16402;col 1-16
16403    vpmovzxbw      m0,         [r0]                             ;src
16404
16405    NORM_FACT_LOW  m0
16406
16407    lea            r0,         [r0 + r1]
16408%else
16409    %error Unsupported BIT_DEPTH!
16410%endif
16411    dec            r4d
16412    jnz           .row
16413
16414%if HIGH_BIT_DEPTH
16415    vextracti128   xm4,         m3,        1
16416    vpaddq         xm3,         xm4
16417    punpckhqdq     xm2,         xm3,       xm5
16418    paddq          xm3,         xm2
16419%else
16420    vextracti128   xm4,        m3,        1
16421    vpaddd         xm3,        xm4
16422    punpckhqdq     xm2,        xm3,       xm5
16423    paddd          xm3,        xm2
16424    punpckldq      xm3,        xm3,       xm5
16425    punpckhqdq     xm2,        xm3,       xm5
16426    paddd          xm3,        xm2
16427%endif
16428    movq           [r3],        xm3
16429    RET
16430
16431
16432INIT_YMM avx2
16433cglobal normFact32, 4, 5, 6
16434    mov            r4d,         32
16435    vpxor          m3,          m3                              ;z_k
16436    vpxor          m5,          m5
16437.row:
16438%if HIGH_BIT_DEPTH
16439;Col 1-8
16440    vpmovzxwd      m0,         [r0]                             ;src
16441
16442    NORM_FACT_HIGH m0
16443
16444;Col 9-16
16445    vpmovzxwd      m0,          [r0 + 16]
16446
16447    NORM_FACT_HIGH m0
16448
16449;Col 17-24
16450    vpmovzxwd      m0,          [r0 + 32]
16451
16452    NORM_FACT_HIGH  m0
16453
16454;Col 25-32
16455    vpmovzxwd      m0,          [r0 + 48]
16456
16457    NORM_FACT_HIGH m0
16458
16459    lea            r0,          [r0 + 2 * r1]
16460%elif BIT_DEPTH == 8
16461;col 1-16
16462    vpmovzxbw      m0,         [r0]                             ;src
16463
16464    NORM_FACT_LOW  m0
16465;col 17-32
16466    vpmovzxbw      m0,         [r0 + 16]
16467
16468    NORM_FACT_LOW  m0
16469
16470    lea            r0,          [r0 + r1]
16471%else
16472    %error Unsupported BIT_DEPTH!
16473%endif
16474    dec            r4d
16475    jnz           .row
16476
16477%if HIGH_BIT_DEPTH
16478    vextracti128   xm4,         m3,        1
16479    vpaddq         xm3,         xm4
16480    punpckhqdq     xm2,         xm3,       xm5
16481    paddq          xm3,         xm2
16482%else
16483    vextracti128   xm4,        m3,        1
16484    vpaddd         xm3,        xm4
16485    punpckhqdq     xm2,        xm3,       xm5
16486    paddd          xm3,        xm2
16487    punpckldq      xm3,        xm3,       xm5
16488    punpckhqdq     xm2,        xm3,       xm5
16489    paddd          xm3,        xm2
16490%endif
16491    movq           [r3],        xm3
16492    RET
16493
16494
16495INIT_YMM avx2
16496cglobal normFact64, 4, 5, 6
16497    mov            r4d,         64
16498    vpxor          m3,          m3                             ;z_k
16499    vpxor          m5,          m5
16500.row:
16501%if HIGH_BIT_DEPTH
16502;Col 1-8
16503    vpmovzxwd      m0,          [r0]                           ;src
16504
16505    NORM_FACT_HIGH m0
16506
16507;Col 9-16
16508    vpmovzxwd      m0,          [r0 + 16]
16509
16510    NORM_FACT_HIGH m0
16511
16512;Col 17-24
16513    vpmovzxwd      m0,          [r0 + 32]
16514
16515    NORM_FACT_HIGH  m0
16516
16517;Col 25-32
16518    vpmovzxwd      m0,          [r0 + 48]
16519
16520    NORM_FACT_HIGH  m0
16521
16522;Col 33-40
16523    vpmovzxwd      m0,          [r0 + 64]
16524
16525    NORM_FACT_HIGH  m0
16526
16527;Col 41-48
16528    vpmovzxwd      m0,          [r0 + 80]
16529
16530    NORM_FACT_HIGH  m0
16531
16532;Col 49-56
16533    vpmovzxwd      m0,          [r0 + 96]
16534
16535    NORM_FACT_HIGH  m0
16536
16537;Col 57-64
16538    vpmovzxwd      m0,          [r0 + 112]
16539
16540    NORM_FACT_HIGH m0
16541
16542    lea            r0,          [r0 + 2 * r1]
16543%elif BIT_DEPTH == 8
16544;col 1-16
16545    vpmovzxbw      m0,         [r0]                             ;src
16546
16547    NORM_FACT_LOW  m0
16548;col 17-32
16549    vpmovzxbw      m0,         [r0 + 16]
16550
16551    NORM_FACT_LOW  m0
16552;col 33-48
16553    vpmovzxbw      m0,         [r0 + 32]
16554
16555    NORM_FACT_LOW  m0
;col 49-64
16557    vpmovzxbw      m0,         [r0 + 48]
16558
16559    NORM_FACT_LOW  m0
16560
16561    lea            r0,          [r0 + r1]
16562%else
16563    %error Unsupported BIT_DEPTH!
16564%endif
16565    dec            r4d
16566    jnz           .row
16567
16568%if HIGH_BIT_DEPTH
16569    vextracti128   xm4,         m3,        1
16570    vpaddq         xm3,         xm4
16571    punpckhqdq     xm2,         xm3,       xm5
16572    paddq          xm3,         xm2
16573%else
16574    vextracti128   xm4,        m3,        1
16575    vpaddd         xm3,        xm4
16576    punpckhqdq     xm2,        xm3,       xm5
16577    paddd          xm3,        xm2
16578    punpckldq      xm3,        xm3,       xm5
16579    punpckhqdq     xm2,        xm3,       xm5
16580    paddd          xm3,        xm2
16581%endif
16582    movq           [r3],        xm3
16583    RET
16584