1;*****************************************************************************
2;* ssd-a.asm: x86 ssd functions
3;*****************************************************************************
4;* Copyright (C) 2003-2013 x264 project
5;*
6;* Authors: Loren Merritt <lorenm@u.washington.edu>
7;*          Fiona Glaser <fiona@x264.com>
8;*          Laurent Aimar <fenrir@via.ecp.fr>
9;*          Alex Izvorski <aizvorksi@gmail.com>
10;*
11;* This program is free software; you can redistribute it and/or modify
12;* it under the terms of the GNU General Public License as published by
13;* the Free Software Foundation; either version 2 of the License, or
14;* (at your option) any later version.
15;*
16;* This program is distributed in the hope that it will be useful,
17;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19;* GNU General Public License for more details.
20;*
21;* You should have received a copy of the GNU General Public License
22;* along with this program; if not, write to the Free Software
23;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
24;*
25;* This program is also available under a commercial proprietary license.
26;* For more information, contact us at license @ x265.com.
27;*****************************************************************************
28
29%include "x86inc.asm"
30%include "x86util.asm"
31
32SECTION_RODATA 32
33
34SECTION .text
35
36cextern pw_00ff
37cextern hsub_mul
38
39;=============================================================================
40; SSD
41;=============================================================================
42
43%if HIGH_BIT_DEPTH
44;-----------------------------------------------------------------------------
45; int pixel_ssd_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t )
46;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; SSD_ONE width, height
;
; Defines pixel_ssd_ss_<width>x<height>: the sum of squared differences of two
; blocks of 16-bit samples.
;   r0 / r1 : first block and its stride
;   r2 / r3 : second block and its stride
; Both strides are passed through FIX_STRIDES before they are used.
;
; Every loop pass loads four vectors from each block.  Depending on how the
; row size (2*width bytes) compares with mmsize these four vectors are
;   - four consecutive rows            (mmsize == 2*width)
;   - two rows of two vectors each     (mmsize == width)
;   - one row of four vectors          (mmsize == width/2)
;
; Result: eax.  The BIT_DEPTH == 12 XMM build widens the four dword partial
; sums to qwords before adding them and returns the 64-bit total in rax.
; Clobbers r4 (loop counter), r5/r6 (3*stride, first layout only), m0-m7.
;-----------------------------------------------------------------------------
%macro SSD_ONE 2
cglobal pixel_ssd_ss_%1x%2, 4,7,8
    FIX_STRIDES r1, r3
%if mmsize == %1*2
    ; one vector per row: the loads below walk rows 0..3
    %define offset0_1 r1
    %define offset0_2 r1*2
    %define offset0_3 r5
    %define offset1_1 r3
    %define offset1_2 r3*2
    %define offset1_3 r6
    lea     r5, [3*r1]
    lea     r6, [3*r3]
%elif mmsize == %1
    ; two vectors per row: rows 0 and 1
    %define offset0_1 mmsize
    %define offset0_2 r1
    %define offset0_3 r1+mmsize
    %define offset1_1 mmsize
    %define offset1_2 r3
    %define offset1_3 r3+mmsize
%elif mmsize == %1/2
    ; four vectors per row: a single row
    %define offset0_1 mmsize
    %define offset0_2 mmsize*2
    %define offset0_3 mmsize*3
    %define offset1_1 mmsize
    %define offset1_2 mmsize*2
    %define offset1_3 mmsize*3
%endif
    %assign %%n %2/(2*mmsize/%1)        ; loop passes needed for the whole block
%if %%n > 1
    mov     r4d, %%n
%endif
    pxor    m0, m0                      ; m0 = running dword partial sums
.loop:
    movu    m1, [r0]
    movu    m2, [r0+offset0_1]
    movu    m3, [r0+offset0_2]
    movu    m4, [r0+offset0_3]
    movu    m6, [r2]
    movu    m7, [r2+offset1_1]
    psubw   m1, m6
    psubw   m2, m7
    movu    m6, [r2+offset1_2]
    movu    m7, [r2+offset1_3]
    psubw   m3, m6
    psubw   m4, m7
%if %%n > 1
    ; advance by the number of rows one pass consumed (height / passes)
    lea     r0, [r0+r1*(%2/%%n)]
    lea     r2, [r2+r3*(%2/%%n)]
%endif
    pmaddwd m1, m1                      ; square words, add adjacent pairs -> dwords
    pmaddwd m2, m2
    pmaddwd m3, m3
    pmaddwd m4, m4
    paddd   m1, m2
    paddd   m3, m4
    paddd   m0, m1
    paddd   m0, m3
%if %%n > 1
    dec     r4d
    jg      .loop
%endif

%if BIT_DEPTH == 12 && mmsize == 16
    ; zero-extend the four dword sums to qwords, then fold them together
    mova        m5, m0                  ; plain register copy
    pxor        m6, m6
    punpckldq   m0, m6
    punpckhdq   m5, m6
    paddq       m0, m5
    movhlps     m5, m0
    paddq       m0, m5
    movq        rax, xm0                ; name the return register directly instead of a numbered alias
%else
    HADDD   m0, m5
    movd    eax, xm0
%endif
%ifidn movu,movq ; detect MMX
    EMMS
%endif
    RET
%endmacro
127
;-----------------------------------------------------------------------------
; SSD_TWO_QUAD src, ref
; Adds the squared differences of four consecutive 16-byte vectors starting
; at [src] / [ref] to the accumulator m0.  Clobbers m1-m4, m6, m7.
;-----------------------------------------------------------------------------
%macro SSD_TWO_QUAD 2
    movu    m1, [%1]
    movu    m2, [%1 + 16]
    movu    m3, [%1 + 32]
    movu    m4, [%1 + 48]
    movu    m6, [%2]
    movu    m7, [%2 + 16]
    psubw   m1, m6
    psubw   m2, m7
    movu    m6, [%2 + 32]
    movu    m7, [%2 + 48]
    psubw   m3, m6
    psubw   m4, m7
    pmaddwd m1, m1
    pmaddwd m2, m2
    pmaddwd m3, m3
    pmaddwd m4, m4
    paddd   m1, m2
    paddd   m3, m4
    paddd   m0, m1
    paddd   m0, m3
%endmacro

;-----------------------------------------------------------------------------
; SSD_TWO_PAIR src, ref, byte_offset, tmpA, tmpB
; Same as above for two vectors at [src+offset] / [ref+offset]; tmpA/tmpB are
; the indices of the scratch registers to use.  Also clobbers m6, m7.
;-----------------------------------------------------------------------------
%macro SSD_TWO_PAIR 5
    movu    m%4, [%1 + %3]
    movu    m%5, [%1 + %3 + 16]
    movu    m6,  [%2 + %3]
    movu    m7,  [%2 + %3 + 16]
    psubw   m%4, m6
    psubw   m%5, m7
    pmaddwd m%4, m%4
    pmaddwd m%5, m%5
    paddd   m%4, m%5
    paddd   m0,  m%4
%endmacro

;-----------------------------------------------------------------------------
; SSD_TWO width, height
; Defines pixel_ssd_ss_<width>x<height> for rows that span six (width 48) or
; eight (width 64) 16-byte vectors of 16-bit samples.  Two rows are handled
; per loop pass; r5/r6 hold twice the (FIX_STRIDES-adjusted) strides.
; Result in eax.
;-----------------------------------------------------------------------------
%macro SSD_TWO 2
cglobal pixel_ssd_ss_%1x%2, 4,7,8
    FIX_STRIDES r1, r3
    pxor    m0,  m0                     ; m0 = running dword partial sums
    mov     r4d, %2/2                   ; passes = height / 2
    lea     r5,  [r1 * 2]
    lea     r6,  [r3 * 2]
.loop:
    ; first row of the pair
    SSD_TWO_QUAD r0, r2
    SSD_TWO_PAIR r0, r2, 64, 1, 2
%if %1 == 64
    SSD_TWO_PAIR r0, r2, 96, 3, 4
%endif
    ; second row of the pair
    SSD_TWO_QUAD r0 + r1, r2 + r3
    SSD_TWO_PAIR r0 + r1, r2 + r3, 64, 1, 2
%if %1 == 64
    SSD_TWO_PAIR r0 + r1, r2 + r3, 96, 3, 4
%endif
    lea     r0,  [r0 + r5]
    lea     r2,  [r2 + r6]
    dec     r4d
    jnz     .loop
    HADDD   m0,  m5
    movd    eax, xm0
    RET
%endmacro
;-----------------------------------------------------------------------------
; SSD_24 width, height
; Defines pixel_ssd_ss_<width>x<height> for rows made of three 16-byte vectors
; of 16-bit samples.  Two rows per loop pass; result in eax.
;-----------------------------------------------------------------------------
%macro SSD_24 2
cglobal pixel_ssd_ss_%1x%2, 4,7,8
    FIX_STRIDES r1, r3
    pxor    m0,  m0                     ; m0 = running dword partial sums
    mov     r4d, %2/2                   ; passes = height / 2
    lea     r5,  [r1 * 2]
    lea     r6,  [r3 * 2]
.loop:
    ; ---- first row -------------------------------------------------------
    movu    m1,  [r0]
    movu    m5,  [r2]
    psubw   m1,  m5
    movu    m2,  [r0 + 16]
    movu    m6,  [r2 + 16]
    psubw   m2,  m6
    movu    m3,  [r0 + 32]
    movu    m7,  [r2 + 32]
    psubw   m3,  m7
    pmaddwd m1,  m1
    pmaddwd m2,  m2
    pmaddwd m3,  m3                     ; stays in m3 until the second row's third vector joins it
    paddd   m1,  m2
    paddd   m0,  m1
    ; ---- second row ------------------------------------------------------
    movu    m1,  [r0 + r1]
    movu    m5,  [r2 + r3]
    psubw   m1,  m5
    movu    m2,  [r0 + r1 + 16]
    movu    m6,  [r2 + r3 + 16]
    psubw   m2,  m6
    movu    m4,  [r0 + r1 + 32]
    movu    m7,  [r2 + r3 + 32]
    psubw   m4,  m7
    pmaddwd m1,  m1
    pmaddwd m2,  m2
    pmaddwd m4,  m4
    paddd   m1,  m2
    paddd   m3,  m4                     ; third vectors of both rows
    paddd   m0,  m1
    paddd   m0,  m3
    ; ---- next row pair ---------------------------------------------------
    lea     r0,  [r0 + r5]
    lea     r2,  [r2 + r6]
    dec     r4d
    jnz     .loop
    HADDD   m0,  m5
    movd    eax, xm0
    RET
%endmacro
;-----------------------------------------------------------------------------
; SSD_12_PAIR reg
; Squared differences of two rows at [r0]/[r2] and [r0+r1]/[r2+r3].
;   m1   <- first vector of row 0, already summed with m2
;   m2   <- the half vectors at byte 16 of both rows, packed into one register
;   m<reg> <- first vector of row 1 (left for the caller to accumulate)
; Clobbers m4, m7.
;-----------------------------------------------------------------------------
%macro SSD_12_PAIR 1
    movu        m1,   [r0]
    movh        m2,   [r0 + 16]
    movu        m%1,  [r0 + r1]
    punpcklqdq  m2,   [r0 + r1 + 16]    ; low qword of row 1 goes into the high qword of m2
    movu        m7,   [r2]
    psubw       m1,   m7
    movh        m4,   [r2 + 16]
    movu        m7,   [r2 + r3]
    psubw       m%1,  m7
    punpcklqdq  m4,   [r2 + r3 + 16]
    psubw       m2,   m4
    pmaddwd     m1,   m1
    pmaddwd     m2,   m2
    pmaddwd     m%1,  m%1
    paddd       m1,   m2
%endmacro

;-----------------------------------------------------------------------------
; SSD_12 width, height
; Defines pixel_ssd_ss_<width>x<height> for rows of one full 16-byte vector
; plus half a vector of 16-bit samples.  Four rows per loop pass (two row
; pairs); r5/r6 hold twice the strides.  Result in eax.
;-----------------------------------------------------------------------------
%macro SSD_12 2
cglobal pixel_ssd_ss_%1x%2, 4,7,8
    FIX_STRIDES r1, r3
    pxor    m0,  m0                     ; m0 = running dword partial sums
    mov     r4d, %2/4                   ; passes = height / 4
    lea     r5,  [r1 * 2]
    lea     r6,  [r3 * 2]
.loop:
    ; rows 0 and 1; row 1's first vector waits in m3
    SSD_12_PAIR 3
    paddd       m0,  m1

    ; rows 2 and 3: step both pointers first so the same addressing applies
    lea         r0,  [r0 + r5]
    lea         r2,  [r2 + r6]
    SSD_12_PAIR 6
    paddd       m3,  m6                 ; first vectors of rows 1 and 3
    paddd       m0,  m1
    paddd       m0,  m3

    lea         r0,  [r0 + r5]
    lea         r2,  [r2 + r6]
    dec         r4d
    jnz         .loop
    HADDD   m0,  m5
    movd    eax, xm0
    RET
%endmacro
327
;-----------------------------------------------------------------------------
; pixel_ssd_16x16 (AVX2): sum of squared differences of two blocks whose rows
; each fill one YMM vector of 16-bit samples.
;   r0 / r1 : first block and stride, r2 / r3 : second block and stride
; Four rows per pass, four passes.  r5/r6 = 3*stride.  Result in eax.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal pixel_ssd_16x16, 4,7,8
    FIX_STRIDES r1, r3
    pxor    m0, m0                      ; m0 = running dword partial sums
    mov     r4d, 4
    lea     r5, [3 * r1]
    lea     r6, [3 * r3]
.loop:
    ; row 0
    movu    m1, [r0]
    movu    m6, [r2]
    psubw   m1, m6
    ; row 1
    movu    m2, [r0 + r1]
    movu    m7, [r2 + r3]
    psubw   m2, m7
    ; row 2
    movu    m3, [r0 + r1 * 2]
    movu    m6, [r2 + r3 * 2]
    psubw   m3, m6
    ; row 3
    movu    m4, [r0 + r5]
    movu    m7, [r2 + r6]
    psubw   m4, m7

    pmaddwd m1, m1
    pmaddwd m2, m2
    pmaddwd m3, m3
    pmaddwd m4, m4
    paddd   m1, m2
    paddd   m3, m4
    paddd   m0, m1
    paddd   m0, m3

    lea     r0, [r0 + r1 * 4]
    lea     r2, [r2 + r3 * 4]
    dec     r4d
    jg      .loop

    HADDD   m0, m5
    movd    eax, xm0
    RET
367
;-----------------------------------------------------------------------------
; pixel_ssd_32x32 (AVX2): sum of squared differences of two blocks whose rows
; each span two YMM vectors of 16-bit samples.
;   r0 / r1 : first block and stride, r2 / r3 : second block and stride
; Two rows per pass, sixteen passes.  Result in eax.
; Only r4 is needed beyond the four arguments, so five GPRs are requested.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal pixel_ssd_32x32, 4,5,8
    FIX_STRIDES r1, r3                  ; same stride adjustment as the 16x16 and 64x64 variants
    mov     r4d, 16
    pxor    m0, m0                      ; m0 = running dword partial sums
.loop:
    movu    m1, [r0]
    movu    m2, [r0 + 32]
    movu    m3, [r0 + r1]
    movu    m4, [r0 + r1 + 32]
    movu    m6, [r2]
    movu    m7, [r2 + 32]
    psubw   m1, m6
    psubw   m2, m7
    movu    m6, [r2 + r3]
    movu    m7, [r2 + r3 + 32]
    psubw   m3, m6
    psubw   m4, m7

    lea     r0, [r0 + r1 * 2]
    lea     r2, [r2 + r3 * 2]

    pmaddwd m1, m1                      ; square words, add adjacent pairs -> dwords
    pmaddwd m2, m2
    pmaddwd m3, m3
    pmaddwd m4, m4
    paddd   m1, m2
    paddd   m3, m4
    paddd   m0, m1
    paddd   m0, m3

    dec     r4d
    jg      .loop

    HADDD   m0, m5
    movd    eax, xm0
    RET
406
;-----------------------------------------------------------------------------
; pixel_ssd_64x64 (AVX2): sum of squared differences of two blocks whose rows
; each span four YMM vectors of 16-bit samples.
;   r0 / r1 : first block and stride, r2 / r3 : second block and stride
; One row per pass, 64 passes.  Result in eax.
; Only r4 is needed beyond the four arguments, so five GPRs are requested.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal pixel_ssd_64x64, 4,5,8
    FIX_STRIDES r1, r3
    mov     r4d, 64
    pxor    m0, m0                      ; m0 = running dword partial sums
.loop:
    movu    m1, [r0]
    movu    m2, [r0+32]
    movu    m3, [r0+32*2]
    movu    m4, [r0+32*3]
    movu    m6, [r2]
    movu    m7, [r2+32]
    psubw   m1, m6
    psubw   m2, m7
    movu    m6, [r2+32*2]
    movu    m7, [r2+32*3]
    psubw   m3, m6
    psubw   m4, m7

    lea     r0, [r0+r1]
    lea     r2, [r2+r3]

    pmaddwd m1, m1                      ; square words, add adjacent pairs -> dwords
    pmaddwd m2, m2
    pmaddwd m3, m3
    pmaddwd m4, m4
    paddd   m1, m2
    paddd   m3, m4
    paddd   m0, m1
    paddd   m0, m3

    dec     r4d
    jg      .loop

    HADDD   m0, m5
    movd    eax, xm0
    RET
444
; ---- pixel_ssd_ss instances, MMX2 build ------------------------------------
INIT_MMX mmx2
SSD_ONE  4,  4
SSD_ONE  4,  8
SSD_ONE  4, 16
SSD_ONE  8,  4
SSD_ONE  8,  8
SSD_ONE  8, 16
SSD_ONE 16,  8
SSD_ONE 16, 16

; ---- pixel_ssd_ss instances, SSE2 build ------------------------------------
; Widths 12, 24, 48 and 64 do not fit SSD_ONE's layouts and use their own macros.
INIT_XMM sse2
SSD_ONE  8,  4
SSD_ONE  8,  8
SSD_ONE  8, 16
SSD_ONE  8, 32
SSD_12  12, 16
SSD_ONE 16,  4
SSD_ONE 16,  8
SSD_ONE 16, 12
SSD_ONE 16, 16
SSD_ONE 16, 32
SSD_ONE 16, 64
SSD_24  24, 32
SSD_ONE 32,  8
SSD_ONE 32, 16
SSD_ONE 32, 24
SSD_ONE 32, 32
SSD_ONE 32, 64
SSD_TWO 48, 64
SSD_TWO 64, 16
SSD_TWO 64, 32
SSD_TWO 64, 48
SSD_TWO 64, 64

; ---- pixel_ssd_ss instances, AVX2 build ------------------------------------
INIT_YMM avx2
SSD_ONE 16,  8
SSD_ONE 16, 16
SSD_ONE 32, 32
SSD_ONE 64, 64
SSD_ONE 16, 32
SSD_ONE 32, 64
484%endif ; HIGH_BIT_DEPTH
485
486;-----------------------------------------------------------------------------
; int pixel_ssd_ss_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t )
488;-----------------------------------------------------------------------------
489%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
; SSD_SS_ACC width, src, ref
; Loads one vector from [src] and [ref] (a half vector when width is 4), and
; adds the squared word differences to m0.  Clobbers m1, m2.
;-----------------------------------------------------------------------------
%macro SSD_SS_ACC 3
%if %1 == 4
    movh    m1, [%2]
    movh    m2, [%3]
%else
    movu    m1, [%2]
    movu    m2, [%3]
%endif
    psubw   m1, m2
    pmaddwd m1, m1
    paddd   m0, m1
%endmacro

;-----------------------------------------------------------------------------
; SSD_SS width, height
; Defines pixel_ssd_ss_<width>x<height>: sum of squared differences of two
; blocks of 16-bit values.
;   r0 / r1 : first block and stride, r2 / r3 : second block and stride
; Strides are scaled by two in every address computed here.
; Each pass handles four loads per block: four rows when a row fits in one
; (half) vector, otherwise two rows of two vectors.  Result in eax.
;-----------------------------------------------------------------------------
%macro SSD_SS 2
cglobal pixel_ssd_ss_%1x%2, 4,7,6
    FIX_STRIDES r1, r3
%if mmsize == %1*4 || mmsize == %1*2
    ; one load per row: rows 0..3, r5/r6 = 6*stride reach row 3
    %define offset0_1 r1*2
    %define offset0_2 r1*4
    %define offset0_3 r5
    %define offset1_1 r3*2
    %define offset1_2 r3*4
    %define offset1_3 r6
    lea     r5, [4*r1]
    lea     r6, [4*r3]
    lea     r5, [r5 + 2*r1]
    lea     r6, [r6 + 2*r3]
%elif mmsize == %1
    ; two loads per row: rows 0 and 1
    %define offset0_1 16
    %define offset0_2 r1*2
    %define offset0_3 r1*2+16
    %define offset1_1 16
    %define offset1_2 r3*2
    %define offset1_3 r3*2+16
%endif
    ; number of loop passes
%if %1 == 4
    %assign %%n %2/(mmsize/%1)
%else
    %assign %%n %2/(2*mmsize/%1)
%endif
%if %%n > 1
    mov     r4d, %%n
%endif
    pxor    m0, m0                      ; m0 = running dword partial sums
.loop:
    SSD_SS_ACC %1, r0,             r2
    SSD_SS_ACC %1, r0 + offset0_1, r2 + offset1_1
    SSD_SS_ACC %1, r0 + offset0_2, r2 + offset1_2
    SSD_SS_ACC %1, r0 + offset0_3, r2 + offset1_3
    ; step over the rows consumed by this pass (height / passes)
    lea     r0, [r0+r1*(%2/%%n)*2]
    lea     r2, [r2+r3*(%2/%%n)*2]
%if %%n > 1
    dec     r4d
    jg      .loop
%endif
    ; horizontal reduction; for width 4 only the two low dwords carry data
%if %1 != 4
    HADDD   m0, m1
%elif cpuflag(ssse3)
    phaddd  m0, m0
%else
    pshufd  m1, m0, 1
    paddd   m0, m1
%endif
    movd    eax, m0
    RET
%endmacro
;-----------------------------------------------------------------------------
; SSD_SS_ONE: instantiates SSD_SS for every block size of width 4, 8 and 16.
;-----------------------------------------------------------------------------
%macro SSD_SS_ONE 0
; width 4
SSD_SS  4,  4
SSD_SS  4,  8
SSD_SS  4, 16
; width 8
SSD_SS  8,  4
SSD_SS  8,  8
SSD_SS  8, 16
SSD_SS  8, 32
; width 16
SSD_SS 16,  4
SSD_SS 16,  8
SSD_SS 16, 12
SSD_SS 16, 16
SSD_SS 16, 32
SSD_SS 16, 64
%endmacro
599
;-----------------------------------------------------------------------------
; SSD_SS_12x16: defines pixel_ssd_ss_12x16 (16-bit inputs, 12 values per row).
; A row is one full 16-byte vector plus the low half of a second one.
; Two rows per pass, eight passes.  Result in eax.
;-----------------------------------------------------------------------------
%macro SSD_SS_12x16 0
cglobal pixel_ssd_ss_12x16, 4,7,6
    FIX_STRIDES r1, r3
    mov     r4d, 8
    pxor    m0, m0                      ; m0 = running dword partial sums
.loop:
%rep 2                                  ; one repetition per row
    ; values 0..7
    movu    m1, [r0]
    movu    m2, [r2]
    psubw   m1, m2
    pmaddwd m1, m1
    paddd   m0, m1
    ; values 8..11: the load covers 8..15, so the upper two dword sums are dropped
    movu    m1, [r0 + 16]
    movu    m2, [r2 + 16]
    psubw   m1, m2
    pmaddwd m1, m1
    pslldq  m1, 8                       ; shift the high 8 bytes out ...
    psrldq  m1, 8                       ; ... and bring the low 8 bytes back
    paddd   m0, m1
    lea     r0, [r0 + 2*r1]
    lea     r2, [r2 + 2*r3]
%endrep
    dec     r4d
    jnz     .loop
    HADDD   m0, m1
    movd    eax, m0
    RET
%endmacro
640
;-----------------------------------------------------------------------------
; SSD_SS_32 height
; Defines pixel_ssd_ss_32x<height> (16-bit inputs): four 16-byte vectors per
; row, two rows per pass, height/2 passes.  Result in eax.
;-----------------------------------------------------------------------------
%macro SSD_SS_32 1
cglobal pixel_ssd_ss_32x%1, 4,7,6
    FIX_STRIDES r1, r3
    mov     r4d, %1/2
    pxor    m0, m0                      ; m0 = running dword partial sums
.loop:
%rep 2                                  ; one repetition per row
    %assign %%off 0
%rep 4                                  ; one repetition per 16-byte vector
    movu    m1, [r0 + %%off]
    movu    m2, [r2 + %%off]
    psubw   m1, m2
    pmaddwd m1, m1
    paddd   m0, m1
    %assign %%off %%off + 16
%endrep
    lea     r0, [r0 + 2*r1]
    lea     r2, [r2 + 2*r3]
%endrep
    dec     r4d
    jnz     .loop
    HADDD   m0, m1
    movd    eax, m0
    RET
%endmacro
697
;-----------------------------------------------------------------------------
; SSD_SS_32xN: instantiates SSD_SS_32 for heights 8, 16, 24, 32 and 64.
;-----------------------------------------------------------------------------
%macro SSD_SS_32xN 0
    SSD_SS_32  8
    SSD_SS_32 16
    SSD_SS_32 24
    SSD_SS_32 32
    SSD_SS_32 64
%endmacro
705
;-----------------------------------------------------------------------------
; SSD_SS_24: defines pixel_ssd_ss_24x32 (16-bit inputs): three 16-byte vectors
; per row, two rows per pass, sixteen passes.  Result in eax.
;-----------------------------------------------------------------------------
%macro SSD_SS_24 0
cglobal pixel_ssd_ss_24x32, 4,7,6
    FIX_STRIDES r1, r3
    mov     r4d, 16
    pxor    m0, m0                      ; m0 = running dword partial sums
.loop:
%rep 2                                  ; one repetition per row
    %assign %%off 0
%rep 3                                  ; one repetition per 16-byte vector
    movu    m1, [r0 + %%off]
    movu    m2, [r2 + %%off]
    psubw   m1, m2
    pmaddwd m1, m1
    paddd   m0, m1
    %assign %%off %%off + 16
%endrep
    lea     r0, [r0 + 2*r1]
    lea     r2, [r2 + 2*r3]
%endrep
    dec     r4d
    jnz     .loop
    HADDD   m0, m1
    movd    eax, m0
    RET
%endmacro
752
;-----------------------------------------------------------------------------
; SSD_SS_48: defines pixel_ssd_ss_48x64 (16-bit inputs): six 16-byte vectors
; per row, two rows per pass, 32 passes.  Result in eax.
;-----------------------------------------------------------------------------
%macro SSD_SS_48 0
cglobal pixel_ssd_ss_48x64, 4,7,6
    FIX_STRIDES r1, r3
    mov     r4d, 32
    pxor    m0, m0                      ; m0 = running dword partial sums
.loop:
%rep 2                                  ; one repetition per row
    %assign %%off 0
%rep 6                                  ; one repetition per 16-byte vector
    movu    m1, [r0 + %%off]
    movu    m2, [r2 + %%off]
    psubw   m1, m2
    pmaddwd m1, m1
    paddd   m0, m1
    %assign %%off %%off + 16
%endrep
    lea     r0, [r0 + 2*r1]
    lea     r2, [r2 + 2*r3]
%endrep
    dec     r4d
    jnz     .loop
    HADDD   m0, m1
    movd    eax, m0
    RET
%endmacro
829
;-----------------------------------------------------------------------------
; SSD_SS_64 height
; Defines pixel_ssd_ss_64x<height> (16-bit inputs): eight 16-byte vectors per
; row, two rows per pass, height/2 passes.  Result in eax.
;-----------------------------------------------------------------------------
%macro SSD_SS_64 1
cglobal pixel_ssd_ss_64x%1, 4,7,6
    FIX_STRIDES r1, r3
    mov     r4d, %1/2
    pxor    m0, m0                      ; m0 = running dword partial sums
.loop:
%rep 2                                  ; one repetition per row
    %assign %%off 0
%rep 8                                  ; one repetition per 16-byte vector
    movu    m1, [r0 + %%off]
    movu    m2, [r2 + %%off]
    psubw   m1, m2
    pmaddwd m1, m1
    paddd   m0, m1
    %assign %%off %%off + 16
%endrep
    lea     r0, [r0 + 2*r1]
    lea     r2, [r2 + 2*r3]
%endrep
    dec     r4d
    jnz     .loop
    HADDD   m0, m1
    movd    eax, m0
    RET
%endmacro
926
;-----------------------------------------------------------------------------
; SSD_SS_64xN: instantiates SSD_SS_64 for heights 16, 32, 48 and 64.
;-----------------------------------------------------------------------------
%macro SSD_SS_64xN 0
    SSD_SS_64 16
    SSD_SS_64 32
    SSD_SS_64 48
    SSD_SS_64 64
%endmacro
933
; Every pixel_ssd_ss block size, emitted once per instruction-set build below.
%macro SSD_SS_ALL_SIZES 0
SSD_SS_ONE
SSD_SS_12x16
SSD_SS_24
SSD_SS_32xN
SSD_SS_48
SSD_SS_64xN
%endmacro

INIT_XMM sse2
SSD_SS_ALL_SIZES
INIT_XMM sse4
SSD_SS_ALL_SIZES
INIT_XMM avx
SSD_SS_ALL_SIZES
955%endif ; !HIGH_BIT_DEPTH
956
957%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
; SSD_LOAD_FULL off_a0, off_b0, off_a1, off_b1, advance
; Loads two full vectors from each block:
;   m1 = [t0+off_a0]   m2 = [t2+off_b0]
;   m3 = [t0+off_a1]   m4 = [t2+off_b1]
; then moves t0/t2 forward by one stride (advance == 1) or two strides
; (advance == 2); any other value leaves the pointers alone.
;-----------------------------------------------------------------------------
%macro SSD_LOAD_FULL 5
    ; first block
    movu    m1, [t0+%1]
    movu    m3, [t0+%3]
    ; second block
    movu    m2, [t2+%2]
    movu    m4, [t2+%4]
%if %5==1
    add     t0, t1
    add     t2, t3
%elif %5==2
    lea     t0, [t0+2*t1]
    lea     t2, [t2+2*t3]
%endif
%endmacro
971
;-----------------------------------------------------------------------------
; LOAD regA, regB, memA, memB, advance
; Half-vector loads of two rows of the first block into m<regA> and m<regB>.
; A non-zero <advance> then steps t0 forward by two strides.
;-----------------------------------------------------------------------------
%macro LOAD 5
    movh    m%1, %3
    movh    m%2, %4
%if %5
    lea     t0,  [t0+2*t1]              ; must follow the loads: %3/%4 are t0-relative
%endif
%endmacro
979
;-----------------------------------------------------------------------------
; JOIN a0, a1, b0, b1, memB0, memB1, advance
; Loads two rows of the second block into m<b0>/m<b1>, widens all four byte
; rows to words and leaves the differences in m<a0> and m<a1>.
; m7 supplies the upper byte of every word; because both operands of each
; subtraction are interleaved with the same m7, its contents cancel out.
; A non-zero <advance> steps t2 forward by two strides.
;-----------------------------------------------------------------------------
%macro JOIN 7
    movh      m%3, %5
    movh      m%4, %6
%if %7
    lea       t2,  [t2+2*t3]
%endif
    ; bytes -> words
    punpcklbw m%1, m7
    punpcklbw m%2, m7
    punpcklbw m%3, m7
    punpcklbw m%4, m7
    ; first block minus second block
    psubw     m%1, m%3
    psubw     m%2, m%4
%endmacro
993
;-----------------------------------------------------------------------------
; JOIN_SSE2 a0, a1, b0, b1, memB0, memB1, advance
; SSE2 flavour of JOIN: the two half rows of each block are packed into one
; register, split by DEINTB (defined elsewhere; m7 is handed to it as the
; mask register), and subtracted.  Differences end up in m<a0> and m<a1>.
; A non-zero <advance> steps t2 forward by two strides.
;-----------------------------------------------------------------------------
%macro JOIN_SSE2 7
    movh       m%3, %5
    movh       m%4, %6
%if %7
    lea        t2,  [t2+2*t3]
%endif
    punpcklqdq m%3, m%4                 ; second block: row pair in one register
    punpcklqdq m%1, m%2                 ; first block:  row pair in one register
    DEINTB     %2, %1, %4, %3, 7
    psubw      m%1, m%3
    psubw      m%2, m%4
%endmacro
1006
;-----------------------------------------------------------------------------
; JOIN_SSSE3 a0, a1, b0, b1, memB0, memB1, advance
; SSSE3 flavour of JOIN: interleaves the bytes of each first-block row with
; the matching second-block row; no subtraction happens here (SSD_CORE_SSSE3
; applies pmaddubsw to the interleaved bytes afterwards).
; A non-zero <advance> steps t2 forward by two strides.
;-----------------------------------------------------------------------------
%macro JOIN_SSSE3 7
    movh      m%4, %6
    movh      m%3, %5
%if %7
    lea       t2,  [t2+2*t3]
%endif
    punpcklbw m%2, m%4
    punpcklbw m%1, m%3
%endmacro
1016
;-----------------------------------------------------------------------------
; LOAD_AVX2 regA, regB, memA, memB, advance
; AVX2 flavour of LOAD: both rows land in m<regA> (memA in the low 128 bits,
; memB in the high 128 bits); <regB> is not written.
; A non-zero <advance> steps t0 forward by two strides.
;-----------------------------------------------------------------------------
%macro LOAD_AVX2 5
    mova        xm%1, %3
    vinserti128 m%1,  m%1, %4, 1
%if %5
    lea         t0,   [t0+2*t1]
%endif
%endmacro
1024
;-----------------------------------------------------------------------------
; JOIN_AVX2 a0, a1, b0, b1, memB0, memB1, advance
; AVX2 flavour of JOIN: loads two rows of the second block into m<a1> (one
; per 128-bit lane) and byte-interleaves it with m<a0> through SBUTTERFLY,
; using m<b0> as scratch.  A non-zero <advance> steps t2 by two strides.
;-----------------------------------------------------------------------------
%macro JOIN_AVX2 7
    mova        xm%2, %5
    vinserti128 m%2,  m%2, %6, 1
%if %7
    lea         t2,   [t2+2*t3]
%endif
    SBUTTERFLY  bw, %1, %2, %3
%endmacro
1033
;-----------------------------------------------------------------------------
; SSD_LOAD_HALF off_a0, off_b0, off_a1, off_b1, advance
; Fetches four rows that are narrower than a vector, as two LOAD/JOIN pairs.
; The first pair always advances t0/t2; the second pair advances only when
; <advance> is non-zero.  Results are left in m1..m4.
;-----------------------------------------------------------------------------
%macro SSD_LOAD_HALF 5
    ; rows 0 and 1
    LOAD 1, 2,       [t0+%1], [t0+%3], 1
    JOIN 1, 2, 3, 4, [t2+%2], [t2+%4], 1
    ; rows 2 and 3
    LOAD 3, 4,       [t0+%1], [t0+%3], %5
    JOIN 3, 4, 5, 6, [t2+%2], [t2+%4], %5
%endmacro
1040
;-----------------------------------------------------------------------------
; SSD_CORE a, b, c, d, zero, tmp0, tmp1 [, FULL]
; Squares the word differences in m<a>..m<d> (pmaddwd with itself).
; With FULL as the eighth argument, m<a>/m<b> and m<c>/m<d> still hold raw
; bytes of the two blocks: the absolute byte difference is built from two
; saturating subtractions OR-ed together, then unpacked to words against
; m<zero> before squaring.
;-----------------------------------------------------------------------------
%macro SSD_CORE 7-8
%ifidn %8, FULL
    ; |a - b| -> words in m<b> (low half) and m<a> (high half)
    mova      m%6, m%2
    psubusb   m%2, m%1
    psubusb   m%1, m%6
    por       m%1, m%2
    punpcklbw m%2, m%1, m%5
    punpckhbw m%1, m%5
    ; |c - d| -> words in m<d> (low half) and m<c> (high half)
    mova      m%7, m%4
    psubusb   m%4, m%3
    psubusb   m%3, m%7
    por       m%3, m%4
    punpcklbw m%4, m%3, m%5
    punpckhbw m%3, m%5
%endif
    pmaddwd   m%1, m%1
    pmaddwd   m%2, m%2
    pmaddwd   m%3, m%3
    pmaddwd   m%4, m%4
%endmacro
1061
;-----------------------------------------------------------------------------
; SSD_CORE_SSE2 a, b, c, d, mask, tmp0, tmp1 [, FULL]
; SSE2 flavour of SSD_CORE.  With FULL, each register pair is first split by
; DEINTB (defined elsewhere; m<mask> is passed to it) and subtracted; the
; SWAPs rename registers so the differences sit in m<a>..m<d> again.
; The statement order inside the FULL part matters because SWAP changes what
; the register names refer to.
;-----------------------------------------------------------------------------
%macro SSD_CORE_SSE2 7-8
%ifidn %8, FULL
    ; pair a/b
    DEINTB    %6, %1, %7, %2, %5
    psubw     m%6, m%7
    psubw     m%1, m%2
    SWAP      %6, %2, %1
    ; pair c/d
    DEINTB    %6, %3, %7, %4, %5
    psubw     m%6, m%7
    psubw     m%3, m%4
    SWAP      %6, %4, %3
%endif
    ; square and pair-sum the word differences
    pmaddwd   m%1, m%1
    pmaddwd   m%2, m%2
    pmaddwd   m%3, m%3
    pmaddwd   m%4, m%4
%endmacro
1078
;-----------------------------------------------------------------------------
; SSD_CORE_SSSE3 a, b, c, d, mul, tmp0, tmp1 [, FULL]
; SSSE3 flavour of SSD_CORE.  The inputs are byte-interleaved pixels of both
; blocks (FULL performs that interleave here); pmaddubsw with the multiplier
; in m<mul> turns every byte pair into one word, which pmaddwd then squares.
; NOTE(review): m<mul> is loaded from hsub_mul by the SSD macro; its contents
; are not visible in this file - presumably +1/-1 byte pairs, confirm.
;-----------------------------------------------------------------------------
%macro SSD_CORE_SSSE3 7-8
%ifidn %8, FULL
    punpckhbw m%6, m%1, m%2
    punpckhbw m%7, m%3, m%4
    punpcklbw m%1, m%2
    punpcklbw m%3, m%4
    SWAP      %6, %2, %3
    SWAP      %7, %4
%endif
    pmaddubsw m%1, m%5
    pmaddwd   m%1, m%1
    pmaddubsw m%2, m%5
    pmaddwd   m%2, m%2
    pmaddubsw m%3, m%5
    pmaddwd   m%3, m%3
    pmaddubsw m%4, m%5
    pmaddwd   m%4, m%4
%endmacro
1097
;-----------------------------------------------------------------------------
; SSD_ITER mode, off_a0, off_b0, off_a1, off_b1, advance
; One loop pass: loads through SSD_LOAD_<mode> (FULL or HALF), squares via
; SSD_CORE with m7 as its constant register and m5/m6 as scratch, and adds
; the four dword results to the accumulator m0.
;-----------------------------------------------------------------------------
%macro SSD_ITER 6
    SSD_LOAD_%1 %2,%3,%4,%5,%6
    SSD_CORE    1, 2, 3, 4, 7, 5, 6, %1
    paddd       m1, m2
    paddd       m0, m1
    paddd       m3, m4
    paddd       m0, m3
%endmacro
1106
1107;-----------------------------------------------------------------------------
1108; int pixel_ssd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
1109;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; SSD width, height
; Defines pixel_ssd_<width>x<height> for 8-bit pixels.
;
; al is loaded with the number of loop passes.  Square sizes carry the real
; implementation behind the local label .startloop; every non-square size is
; only a stub that sets al and jumps to .startloop of the square function of
; the same width, so that function must also be instantiated.
;
; Temporaries: t0/t1 = first block and stride, t2/t3 = second block and
; stride, m0 = accumulator, m7 = constant used by the active SSD_CORE/JOIN.
; Result in eax.
;-----------------------------------------------------------------------------
%macro SSD 2
%if %1 == %2
    %assign function_align 16
%else
    %assign function_align 8
%endif
cglobal pixel_ssd_%1x%2, 0,0,0
    mov     al, %1*%2/mmsize/2          ; loop passes for this block size

%if %1 == %2

.startloop:
%if ARCH_X86_64
    DECLARE_REG_TMP 0,1,2,3
    PROLOGUE 0,0,8
%else
    PROLOGUE 0,5
    DECLARE_REG_TMP 1,2,3,4
    ; fetch the four arguments from their stack slots
    mov     t0, r0m
    mov     t1, r1m
    mov     t2, r2m
    mov     t3, r3m
%endif

    ; m7: constant for the SSD_CORE / JOIN variant in use.  When none of the
    ; cases applies m7 is left untouched.
%if cpuflag(ssse3)
    mova    m7, [hsub_mul]
%elifidn cpuname, sse2
    mova    m7, [pw_00ff]
%elif %1 >= mmsize
    pxor    m7, m7
%endif
    pxor    m0, m0

ALIGN 16
.loop:
%if %1 > mmsize
    ; a row is two vectors wide: one row per pass
    SSD_ITER FULL, 0, 0, mmsize, mmsize, 1
%elif %1 == mmsize
    ; a row is exactly one vector: two rows per pass
    SSD_ITER FULL, 0, 0, t1, t3, 2
%else
    ; a row is narrower than a vector
    SSD_ITER HALF, 0, 0, t1, t3, 2
%endif
    dec     al
    jg      .loop
%if mmsize != 32
    HADDD   m0, m1
    movd    eax, m0
%else
    ; fold the upper 128-bit lane into the lower one first
    vextracti128 xm1, m0, 1
    paddd   xm0, xm1
    HADDD   xm0, xm1
    movd    eax, xm0
%endif
%if mmsize == 8
    emms
%endif
    RET

%else
    ; non-square: continue in the square function of the same width
    jmp mangle(private_prefix %+ _ %+ pixel_ssd_%1x%1 %+ SUFFIX %+ .startloop)
%endif
%endmacro
1171
;-----------------------------------------------------------------------------
; HEVC_SSD: instantiates SSD for the block sizes listed below.
; Non-square sizes jump into the square function of the same width, so each
; width used here also appears as a square size.
;-----------------------------------------------------------------------------
%macro HEVC_SSD 0
    SSD 32, 64
    SSD 16, 64
    SSD 32, 32
    SSD 32, 16
    SSD 16, 32
    SSD 32,  8
    SSD  8, 32
    SSD 32, 24
    SSD 24, 24 ; not used, but resolves x265_pixel_ssd_24x24_sse2.startloop symbol
    SSD  8,  4
    SSD  8,  8
    SSD 16, 16
    SSD 16, 12
    SSD 16,  8
    SSD  8, 16
    SSD 16,  4
%endmacro
1190
; ---- pixel_ssd instances ----------------------------------------------------
; The order of the %define lines matters: each build below uses whichever
; SSD_CORE / LOAD / JOIN variant was selected most recently.

; MMX: generic SSD_CORE / LOAD / JOIN
INIT_MMX mmx
SSD 16, 16
SSD 16,  8
SSD  8,  8
SSD  8, 16
SSD  4,  4
SSD  8,  4
SSD  4,  8
SSD  4, 16

; SSE2 (slow variant): still the generic SSD_CORE / JOIN
INIT_XMM sse2slow
SSD 16, 16
SSD  8,  8
SSD 16,  8
SSD  8, 16
SSD  8,  4

; SSE2
INIT_XMM sse2
%define SSD_CORE SSD_CORE_SSE2
%define JOIN JOIN_SSE2
HEVC_SSD

; SSSE3 and later builds share the pmaddubsw-based core
INIT_XMM ssse3
%define SSD_CORE SSD_CORE_SSSE3
%define JOIN JOIN_SSSE3
HEVC_SSD
INIT_XMM avx
HEVC_SSD
INIT_MMX ssse3
SSD  4,  4
SSD  4,  8
SSD  4, 16
INIT_XMM xop
SSD 16, 16
SSD  8,  8
SSD 16,  8
SSD  8, 16
SSD  8,  4

; AVX2: two rows per register through the AVX2 LOAD / JOIN variants
%define LOAD LOAD_AVX2
%define JOIN JOIN_AVX2
INIT_YMM avx2
SSD 16, 16
SSD 16,  8
SSD 32, 32
SSD 64, 64
%assign function_align 16
1234%endif ; !HIGH_BIT_DEPTH
1235
1236;-----------------------------------------------------------------------------
1237; int pixel_ssd_12x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
1238;-----------------------------------------------------------------------------
; Rows are 12 bytes wide.  Two rows are handled together: bytes 0..7 of each
; row are widened on their own, bytes 8..11 of both rows are gathered into a
; single register first.  Four rows per loop pass, four passes.
; m6 = running dword partial sums; result in eax.
INIT_XMM sse4
cglobal pixel_ssd_12x16, 4, 5, 7, src1, stride1, src2, stride2

    pxor        m6, m6
    mov         r4d, 4

.loop:
%rep 2                                  ; one repetition per row pair
    movu        m0, [r0]
    movu        m1, [r2]
    movu        m2, [r0 + r1]
    movu        m3, [r2 + r3]

    ; low 8 bytes of m4/m5 = bytes 8..11 of the first row, then of the second
    punpckhdq   m4, m0, m2
    punpckhdq   m5, m1, m3

    ; low 8 bytes of every register -> 8 words
    pmovzxbw    m0, m0
    pmovzxbw    m1, m1
    pmovzxbw    m2, m2
    pmovzxbw    m3, m3
    pmovzxbw    m4, m4
    pmovzxbw    m5, m5

    psubw       m0, m1
    psubw       m2, m3
    psubw       m4, m5

    pmaddwd     m0, m0
    pmaddwd     m2, m2
    pmaddwd     m4, m4

    paddd       m0, m2
    paddd       m6, m4
    paddd       m6, m0

    lea         r0, [r0 + 2 * r1]
    lea         r2, [r2 + 2 * r3]
%endrep

    dec         r4d
    jnz         .loop

    HADDD       m6, m1
    movd        eax, m6

    RET
1311
1312;-----------------------------------------------------------------------------
1313; int pixel_ssd_24x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
1314;-----------------------------------------------------------------------------
; Rows are 24 bytes wide: one 16-byte load split into two word vectors plus
; an 8-byte load widened by pmovzxbw.  Two rows per loop pass, 16 passes.
; m6 = zero (for punpckhbw), m7 = running dword partial sums; result in eax.
INIT_XMM sse4
cglobal pixel_ssd_24x32, 4, 5, 8, src1, stride1, src2, stride2

    pxor        m6, m6
    pxor        m7, m7
    mov         r4d, 16

.loop:
%rep 2                                  ; one repetition per row
    ; first block: bytes 0..7 -> m0, 8..15 -> m1, 16..23 -> m2
    movu        m1, [r0]
    pmovzxbw    m0, m1
    punpckhbw   m1, m6
    pmovzxbw    m2, [r0 + 16]
    ; second block: same split into m3, m4, m5
    movu        m4, [r2]
    pmovzxbw    m3, m4
    punpckhbw   m4, m6
    pmovzxbw    m5, [r2 + 16]

    psubw       m0, m3
    psubw       m1, m4
    psubw       m2, m5

    pmaddwd     m0, m0
    pmaddwd     m1, m1
    pmaddwd     m2, m2

    paddd       m0, m1
    paddd       m7, m2
    paddd       m7, m0

    lea         r0, [r0 + r1]
    lea         r2, [r2 + r3]
%endrep

    dec         r4d
    jnz         .loop

    HADDD       m7, m1
    movd        eax, m7

    RET
1374
; Load 16 bytes from %3 and widen them to words:
; low 8 pixels -> %1, high 8 pixels -> %2. Expects m6 == 0.
%macro SSD16_PP_LOAD 3
    movu         %2, %3
    pmovzxbw     %1, %2
    punpckhbw    %2, m6
%endmacro

; %1/%2 = word differences (low/high half) between the 16-pixel rows at
; memory operands %3 (src1) and %4 (src2). Overwrites m2 and m3.
%macro SSD16_PP_DIFF 4
    SSD16_PP_LOAD %1, %2, %3
    SSD16_PP_LOAD m2, m3, %4
    psubw        %1, m2
    psubw        %2, m3
%endmacro

; Square the differences held in m0, m1, m4, m5 and add them to m7.
%macro SSD16_PP_ACCUM 0
    pmaddwd      m0, m0
    pmaddwd      m1, m1
    pmaddwd      m4, m4
    pmaddwd      m5, m5

    paddd        m0, m1
    paddd        m4, m5
    paddd        m4, m0
    paddd        m7, m4
%endmacro

; Accumulate into m7 the squared differences of a 16x4 block of 8-bit pixels.
; In:  r0/r1 = src1 pointer/stride, r2/r3 = src2 pointer/stride,
;      r6 = 2 * r1, m6 = 0, m7 = running dword sums
; Out: r0 and r2 are advanced by two rows; m0-m5 are overwritten
%macro PIXEL_SSD_16x4 0
    ; rows 0 and 1
    SSD16_PP_DIFF m0, m1, [r0], [r2]
    SSD16_PP_DIFF m4, m5, [r0 + r1], [r2 + r3]
    SSD16_PP_ACCUM

    ; rows 2 and 3 (pointers step forward between them)
    SSD16_PP_DIFF m0, m1, [r0 + r6], [r2 + 2 * r3]
    lea          r0, [r0 + r6]
    lea          r2, [r2 + 2 * r3]
    SSD16_PP_DIFF m4, m5, [r0 + r1], [r2 + r3]
    SSD16_PP_ACCUM
%endmacro
1438
; Helper: accumulate the SSD of a 16x16 block of 8-bit pixels into m7.
; Same register contract as PIXEL_SSD_16x4 (r6 = 2 * r1, m6 = 0).
; On return r0/r2 point at row 14 of the block, so the caller adds two more
; rows to reach the next 16-row block.
cglobal pixel_ssd_16x16_internal
%rep 3
    PIXEL_SSD_16x4
    lea     r0, [r0 + r6]               ; macro advanced 2 rows, add 2 more
    lea     r2, [r2 + 2 * r3]
%endrep
    PIXEL_SSD_16x4
    ret
1451
1452;-----------------------------------------------------------------------------
1453; int pixel_ssd_48x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
1454;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_48x64, 4, 7, 8, src1, stride1, src2, stride2
    ; r4/r5 keep the original src1/src2 pointers, r6 = 2 * stride1
    ; m6 = zero, m7 = dword accumulator
    pxor    m6, m6
    pxor    m7, m7
    mov     r5, r2
    mov     r4, r0
    lea     r6, [r1 * 2]

    ; three 16-pixel columns, each made of four 16x16 blocks
%assign ssd_xoff 0
%rep 3
  %if ssd_xoff > 0
    lea     r0, [r4 + ssd_xoff]         ; restart at the top of the next column
    lea     r2, [r5 + ssd_xoff]
  %endif
    call    pixel_ssd_16x16_internal
  %rep 3
    lea     r0, [r0 + r6]
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_16x16_internal
  %endrep
  %assign ssd_xoff ssd_xoff + 16
%endrep
%undef ssd_xoff

    HADDD   m7, m1
    movd    eax, m7

    RET
1503
1504;-----------------------------------------------------------------------------
1505; int pixel_ssd_64x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
1506;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_64x16, 4, 7, 8, src1, stride1, src2, stride2
    ; r4/r5 keep the original src1/src2 pointers, r6 = 2 * stride1
    ; m6 = zero, m7 = dword accumulator
    pxor    m6, m6
    pxor    m7, m7
    mov     r5, r2
    mov     r4, r0
    lea     r6, [r1 * 2]

    ; four 16-pixel columns, one 16x16 block each
%assign ssd_xoff 0
%rep 4
  %if ssd_xoff > 0
    lea     r0, [r4 + ssd_xoff]
    lea     r2, [r5 + ssd_xoff]
  %endif
    call    pixel_ssd_16x16_internal
  %assign ssd_xoff ssd_xoff + 16
%endrep
%undef ssd_xoff

    HADDD   m7, m1
    movd    eax, m7

    RET
1531
1532;-----------------------------------------------------------------------------
1533; int pixel_ssd_64x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
1534;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_64x32, 4, 7, 8, src1, stride1, src2, stride2
    ; r4/r5 keep the original src1/src2 pointers, r6 = 2 * stride1
    ; m6 = zero, m7 = dword accumulator
    pxor    m6, m6
    pxor    m7, m7
    mov     r5, r2
    mov     r4, r0
    lea     r6, [r1 * 2]

    ; four 16-pixel columns, each made of two 16x16 blocks
%assign ssd_xoff 0
%rep 4
  %if ssd_xoff > 0
    lea     r0, [r4 + ssd_xoff]         ; restart at the top of the next column
    lea     r2, [r5 + ssd_xoff]
  %endif
    call    pixel_ssd_16x16_internal
    lea     r0, [r0 + r6]
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_16x16_internal
  %assign ssd_xoff ssd_xoff + 16
%endrep
%undef ssd_xoff

    HADDD   m7, m1
    movd    eax, m7

    RET
1571
1572;-----------------------------------------------------------------------------
1573; int pixel_ssd_64x48( uint8_t *, intptr_t, uint8_t *, intptr_t )
1574;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_64x48, 4, 7, 8, src1, stride1, src2, stride2
    ; r4/r5 keep the original src1/src2 pointers, r6 = 2 * stride1
    ; m6 = zero, m7 = dword accumulator
    pxor    m6, m6
    pxor    m7, m7
    mov     r5, r2
    mov     r4, r0
    lea     r6, [r1 * 2]

    ; four 16-pixel columns, each made of three 16x16 blocks
%assign ssd_xoff 0
%rep 4
  %if ssd_xoff > 0
    lea     r0, [r4 + ssd_xoff]         ; restart at the top of the next column
    lea     r2, [r5 + ssd_xoff]
  %endif
    call    pixel_ssd_16x16_internal
  %rep 2
    lea     r0, [r0 + r6]
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_16x16_internal
  %endrep
  %assign ssd_xoff ssd_xoff + 16
%endrep
%undef ssd_xoff

    HADDD   m7, m1
    movd    eax, m7

    RET
1623
1624;-----------------------------------------------------------------------------
1625; int pixel_ssd_64x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
1626;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_64x64, 4, 7, 8, src1, stride1, src2, stride2
    ; r4/r5 keep the original src1/src2 pointers, r6 = 2 * stride1
    ; m6 = zero, m7 = dword accumulator
    pxor    m6, m6
    pxor    m7, m7
    mov     r5, r2
    mov     r4, r0
    lea     r6, [r1 * 2]

    ; four 16-pixel columns, each made of four 16x16 blocks
%assign ssd_xoff 0
%rep 4
  %if ssd_xoff > 0
    lea     r0, [r4 + ssd_xoff]         ; restart at the top of the next column
    lea     r2, [r5 + ssd_xoff]
  %endif
    call    pixel_ssd_16x16_internal
  %rep 3
    lea     r0, [r0 + r6]
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_16x16_internal
  %endrep
  %assign ssd_xoff ssd_xoff + 16
%endrep
%undef ssd_xoff

    HADDD   m7, m1
    movd    eax, m7

    RET
1687
1688;-----------------------------------------------------------------------------
1689; int pixel_ssd_sp ( int16_t *, intptr_t, uint8_t *, intptr_t )
1690;-----------------------------------------------------------------------------
1691
; Helper: accumulate into m7 the SSD of a 4x4 block.
; In:  r0 = 16-bit source, r1 = its stride in bytes, r4 = 3 * r1,
;      r2 = 8-bit source,  r3 = its stride, m7 = running dword sums
; Out: r2 is advanced by two rows, r0 is unchanged; m0-m6 are overwritten
cglobal pixel_ssd_sp_4x4_internal
    ; rows 0-1: two 4-sample rows packed into one register per source
    movd          m2, [r2]
    movd          m3, [r2 + r3]
    movh          m0, [r0]
    movh          m1, [r0 + r1]
    punpckldq     m2, m3
    punpcklqdq    m0, m1
    pmovzxbw      m2, m2                ; 8 pixels -> 8 words
    psubw         m0, m2

    ; rows 2-3: step the 8-bit pointer first, then load relative to it
    lea           r2, [r2 + 2 * r3]
    movh          m4, [r0 + 2 * r1]
    movh          m5, [r0 + r4]
    movd          m6, [r2]
    movd          m1, [r2 + r3]
    punpcklqdq    m4, m5
    punpckldq     m6, m1
    pmovzxbw      m6, m6
    psubw         m4, m6

    ; square, pair-sum and accumulate
    pmaddwd       m0, m0
    pmaddwd       m4, m4
    paddd         m0, m4
    paddd         m7, m0
    ret
1715
1716;-----------------------------------------------------------------------------
1717; int pixel_ssd_sp_4x4( int16_t *, intptr_t, uint8_t *, intptr_t )
1718;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_4x4, 4, 5, 8, src1, stride1, src2, stride2
    ; src1 holds 16-bit samples: doubling r1 presumably turns an element
    ; stride into a byte stride (confirm against callers)
    add      r1, r1
    lea      r4, [r1 * 3]               ; row-3 offset used by the helper
    pxor     m7, m7                     ; dword accumulator

    call     pixel_ssd_sp_4x4_internal

    HADDD    m7, m1                     ; reduce the accumulator lanes
    movd     eax, m7
    RET
1728
1729;-----------------------------------------------------------------------------
1730; int pixel_ssd_sp_4x8( int16_t *, intptr_t, uint8_t *, intptr_t )
1731;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_4x8, 4, 5, 8, src1, stride1, src2, stride2
    add      r1, r1                     ; double the 16-bit source stride
    lea      r4, [r1 * 3]               ; row-3 offset used by the helper
    pxor     m7, m7                     ; dword accumulator

    ; two stacked 4x4 blocks
    call     pixel_ssd_sp_4x4_internal
    lea      r0, [r0 + 4 * r1]          ; helper leaves r0 untouched
    lea      r2, [r2 + 2 * r3]          ; helper already advanced r2 by 2 rows
    call     pixel_ssd_sp_4x4_internal

    HADDD    m7, m1
    movd     eax, m7
    RET
1744
1745;-----------------------------------------------------------------------------
1746; int pixel_ssd_sp_4x16( int16_t *, intptr_t, uint8_t *, intptr_t )
1747;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_4x16, 4, 5, 8, src1, stride1, src2, stride2
    add      r1, r1                     ; double the 16-bit source stride
    lea      r4, [r1 * 3]               ; row-3 offset used by the helper
    pxor     m7, m7                     ; dword accumulator

    ; four stacked 4x4 blocks
    call     pixel_ssd_sp_4x4_internal
%rep 3
    lea      r0, [r0 + 4 * r1]          ; helper leaves r0 untouched
    lea      r2, [r2 + 2 * r3]          ; helper already advanced r2 by 2 rows
    call     pixel_ssd_sp_4x4_internal
%endrep

    HADDD    m7, m1
    movd     eax, m7
    RET
1766
; Helper: accumulate into m7 the SSD of an 8x4 block.
; In:  r0 = 16-bit source, r1 = its stride in bytes, r4 = 3 * r1,
;      r2 = 8-bit source,  r3 = its stride,          r5 = 3 * r3,
;      m7 = running dword sums
; Out: r0 and r2 are unchanged; m0-m5 are overwritten
cglobal pixel_ssd_sp_8x4_internal
    ; rows 0-1
    movu         m0,    [r0]
    movu         m1,    [r0 + r1]
    pmovzxbw     m2,    [r2]            ; load 8 pixels and widen in one step
    pmovzxbw     m3,    [r2 + r3]

    psubw        m0,    m2
    psubw        m1,    m3

    ; rows 2-3
    movu         m4,    [r0 + 2 * r1]
    movu         m5,    [r0 + r4]
    pmovzxbw     m2,    [r2 + 2 * r3]
    pmovzxbw     m3,    [r2 + r5]

    psubw        m4,    m2
    psubw        m5,    m3

    ; square, pair-sum and accumulate
    pmaddwd      m0,    m0
    pmaddwd      m1,    m1
    pmaddwd      m4,    m4
    pmaddwd      m5,    m5

    paddd        m0,    m1
    paddd        m4,    m5
    paddd        m4,    m0
    paddd        m7,    m4
    ret
1798
1799;-----------------------------------------------------------------------------
1800; int pixel_ssd_sp_8x4( int16_t *, intptr_t, uint8_t *, intptr_t )
1801;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_8x4, 4, 6, 8, src1, stride1, src2, stride2
    add      r1, r1                     ; double the 16-bit source stride
    lea      r5, [r3 * 3]               ; row-3 offset in the 8-bit source
    lea      r4, [r1 * 3]               ; row-3 offset in the 16-bit source
    pxor     m7, m7                     ; dword accumulator

    call     pixel_ssd_sp_8x4_internal

    HADDD    m7, m1                     ; reduce the accumulator lanes
    movd     eax, m7
    RET
1812
1813;-----------------------------------------------------------------------------
1814; int pixel_ssd_sp_8x8( int16_t *, intptr_t, uint8_t *, intptr_t )
1815;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_8x8, 4, 6, 8, src1, stride1, src2, stride2
    add      r1, r1                     ; double the 16-bit source stride
    lea      r5, [r3 * 3]               ; row-3 offset in the 8-bit source
    lea      r4, [r1 * 3]               ; row-3 offset in the 16-bit source
    pxor     m7, m7                     ; dword accumulator

    ; two stacked 8x4 blocks; the helper does not move r0/r2
    call     pixel_ssd_sp_8x4_internal
    lea      r0, [r0 + 4 * r1]
    lea      r2, [r2 + 4 * r3]
    call     pixel_ssd_sp_8x4_internal

    HADDD    m7, m1
    movd     eax, m7
    RET
1829
1830;-----------------------------------------------------------------------------
1831; int pixel_ssd_sp_8x16( int16_t *, intptr_t, uint8_t *, intptr_t )
1832;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_8x16, 4, 6, 8, src1, stride1, src2, stride2
    add      r1, r1                     ; double the 16-bit source stride
    lea      r5, [r3 * 3]               ; row-3 offset in the 8-bit source
    lea      r4, [r1 * 3]               ; row-3 offset in the 16-bit source
    pxor     m7, m7                     ; dword accumulator

    ; four stacked 8x4 blocks; the helper does not move r0/r2
    call     pixel_ssd_sp_8x4_internal
%rep 3
    lea      r0, [r0 + 4 * r1]
    lea      r2, [r2 + 4 * r3]
    call     pixel_ssd_sp_8x4_internal
%endrep

    HADDD    m7, m1
    movd     eax, m7
    RET
1852
1853;-----------------------------------------------------------------------------
1854; int pixel_ssd_sp_8x32( int16_t *, intptr_t, uint8_t *, intptr_t )
1855;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_8x32, 4, 6, 8, src1, stride1, src2, stride2
    add      r1, r1                     ; double the 16-bit source stride
    lea      r5, [r3 * 3]               ; row-3 offset in the 8-bit source
    lea      r4, [r1 * 3]               ; row-3 offset in the 16-bit source
    pxor     m7, m7                     ; dword accumulator

    ; eight stacked 8x4 blocks; the helper does not move r0/r2
    call     pixel_ssd_sp_8x4_internal
%rep 7
    lea      r0, [r0 + 4 * r1]
    lea      r2, [r2 + 4 * r3]
    call     pixel_ssd_sp_8x4_internal
%endrep

    HADDD    m7, m1
    movd     eax, m7
    RET
1887
1888;-----------------------------------------------------------------------------
1889; int pixel_ssd_sp_12x16( int16_t *, intptr_t, uint8_t *, intptr_t )
1890;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_12x16, 4, 7, 8, src1, stride1, src2, stride2
    mov      r5, r0                     ; remember both block origins
    mov      r6, r2
    add      r1, r1                     ; double the 16-bit source stride
    lea      r4, [r1 * 3]               ; row-3 offset in the 16-bit source
    pxor     m7, m7                     ; dword accumulator

    ; columns 0-3: four stacked 4x4 blocks
    call     pixel_ssd_sp_4x4_internal
%rep 3
    lea      r0, [r0 + 4 * r1]          ; 4x4 helper leaves r0 untouched
    lea      r2, [r2 + 2 * r3]          ; ...and has already advanced r2 by 2 rows
    call     pixel_ssd_sp_4x4_internal
%endrep

    ; columns 4-11: four stacked 8x4 blocks
    lea      r0, [r5 + 8]               ; 4 samples * 2 bytes
    lea      r2, [r6 + 4]               ; 4 pixels  * 1 byte
    lea      r5, [r3 * 3]               ; r5 now serves as the 8x4 helper's row-3 offset
    call     pixel_ssd_sp_8x4_internal
%rep 3
    lea      r0, [r0 + 4 * r1]
    lea      r2, [r2 + 4 * r3]
    call     pixel_ssd_sp_8x4_internal
%endrep

    HADDD    m7, m1
    movd     eax, m7
    RET
1924
; %1/%2 = word differences (low/high half) between the 16-sample row of
; 16-bit data at address expression %3 and the 16-pixel row of 8-bit data at
; address expression %4. Expects m6 == 0; overwrites m2 and m3.
%macro SSD16_SP_DIFF 4
    movu         %1, [%3]               ; samples 0-7
    movu         %2, [%3 + 16]          ; samples 8-15
    movu         m3, [%4]               ; 16 pixels
    pmovzxbw     m2, m3                 ; pixels 0-7  -> words
    punpckhbw    m3, m6                 ; pixels 8-15 -> words

    psubw        %1, m2
    psubw        %2, m3
%endmacro

; Square the differences held in m0, m1, m4, m5 and add them to m7.
%macro SSD16_SP_ACCUM 0
    pmaddwd      m0, m0
    pmaddwd      m1, m1
    pmaddwd      m4, m4
    pmaddwd      m5, m5

    paddd        m0, m1
    paddd        m4, m5
    paddd        m4, m0
    paddd        m7, m4
%endmacro

; Accumulate into m7 the squared differences of a 16x4 block.
; In:  r0 = 16-bit source, r1 = its stride in bytes,
;      r2 = 8-bit source,  r3 = its stride, m6 = 0, m7 = running dword sums
; Out: r0 and r2 are advanced by two rows; m0-m5 are overwritten
%macro PIXEL_SSD_SP_16x4 0
    ; rows 0 and 1
    SSD16_SP_DIFF m0, m1, r0, r2
    SSD16_SP_DIFF m4, m5, r0 + r1, r2 + r3
    SSD16_SP_ACCUM

    ; rows 2 and 3 (pointers step forward between them)
    SSD16_SP_DIFF m0, m1, r0 + 2 * r1, r2 + 2 * r3
    lea          r0, [r0 + 2 * r1]
    lea          r2, [r2 + 2 * r3]
    SSD16_SP_DIFF m4, m5, r0 + r1, r2 + r3
    SSD16_SP_ACCUM
%endmacro
1984
1985;-----------------------------------------------------------------------------
1986; int pixel_ssd_sp_16x4( int16_t *, intptr_t, uint8_t *, intptr_t )
1987;-----------------------------------------------------------------------------
INIT_XMM sse4
; Only r0-r3 are used here, so four GPRs are declared (as in
; pixel_ssd_sp_16x8); declaring more makes the prologue save registers
; that are never touched on targets where they are callee-saved.
cglobal pixel_ssd_sp_16x4, 4, 4, 8, src1, stride1, src2, stride2

    pxor        m6,     m6              ; zero half for punpckhbw in the macro
    pxor        m7,     m7              ; dword accumulator
    add         r1,     r1              ; double the 16-bit source stride
    PIXEL_SSD_SP_16x4
    HADDD   m7, m1                      ; reduce the accumulator lanes
    movd   eax, m7

    RET
1999
2000;-----------------------------------------------------------------------------
2001; int pixel_ssd_sp_16x8( int16_t *, intptr_t, uint8_t *, intptr_t )
2002;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_16x8, 4, 4, 8, src1, stride1, src2, stride2
    add     r1, r1                      ; double the 16-bit source stride
    pxor    m7, m7                      ; dword accumulator
    pxor    m6, m6                      ; zero half for punpckhbw in the macro

    ; two stacked 16x4 blocks
    PIXEL_SSD_SP_16x4
    lea     r0, [r0 + 2 * r1]           ; macro advanced 2 rows, add 2 more
    lea     r2, [r2 + 2 * r3]
    PIXEL_SSD_SP_16x4

    HADDD   m7, m1
    movd    eax, m7
    RET
2016
2017;-----------------------------------------------------------------------------
2018; int pixel_ssd_sp_16x12( int16_t *, intptr_t, uint8_t *, intptr_t )
2019;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_16x12, 4, 6, 8, src1, stride1, src2, stride2
    add     r1, r1                      ; double the 16-bit source stride
    lea     r5, [r3 * 2]                ; two rows of the 8-bit source
    lea     r4, [r1 * 2]                ; two rows of the 16-bit source
    pxor    m7, m7                      ; dword accumulator
    pxor    m6, m6                      ; zero half for punpckhbw in the macro

    ; three stacked 16x4 blocks
    PIXEL_SSD_SP_16x4
%rep 2
    lea     r0, [r0 + r4]               ; macro advanced 2 rows, add 2 more
    lea     r2, [r2 + r5]
    PIXEL_SSD_SP_16x4
%endrep

    HADDD   m7, m1
    movd    eax, m7
    RET
2038
2039;-----------------------------------------------------------------------------
2040; int pixel_ssd_sp_16x16( int16_t *, intptr_t, uint8_t *, intptr_t )
2041;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_16x16, 4, 6, 8, src1, stride1, src2, stride2
    add     r1, r1                      ; double the 16-bit source stride
    lea     r5, [r3 * 2]                ; two rows of the 8-bit source
    lea     r4, [r1 * 2]                ; two rows of the 16-bit source
    pxor    m7, m7                      ; dword accumulator
    pxor    m6, m6                      ; zero half for punpckhbw in the macro

    ; four stacked 16x4 blocks
    PIXEL_SSD_SP_16x4
%rep 3
    lea     r0, [r0 + r4]               ; macro advanced 2 rows, add 2 more
    lea     r2, [r2 + r5]
    PIXEL_SSD_SP_16x4
%endrep

    HADDD   m7, m1
    movd    eax, m7
    RET
2063
; Helper: accumulate into m7 the SSD of a 16x16 block.
; Same register contract as PIXEL_SSD_SP_16x4, plus r4 = 2 * r1.
; On return r0/r2 point at row 14 of the block, so the caller adds two more
; rows to reach the next 16-row block.
cglobal pixel_ssd_sp_16x16_internal
%rep 3
    PIXEL_SSD_SP_16x4
    lea     r0, [r0 + r4]               ; macro advanced 2 rows, add 2 more
    lea     r2, [r2 + 2 * r3]
%endrep
    PIXEL_SSD_SP_16x4
    ret
2076
2077;-----------------------------------------------------------------------------
2078; int pixel_ssd_sp_16x32( int16_t *, intptr_t, uint8_t *, intptr_t )
2079;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_16x32, 4, 5, 8, src1, stride1, src2, stride2
    add      r1, r1                     ; double the 16-bit source stride
    lea      r4, [r1 * 2]               ; two rows of the 16-bit source
    pxor     m7, m7                     ; dword accumulator
    pxor     m6, m6                     ; zero half for punpckhbw in the helper

    ; two stacked 16x16 blocks
    call     pixel_ssd_sp_16x16_internal
    lea      r0, [r0 + r4]              ; helper stops at row 14, add 2 more
    lea      r2, [r2 + 2 * r3]
    call     pixel_ssd_sp_16x16_internal

    HADDD    m7, m1
    movd     eax, m7
    RET
2094
2095;-----------------------------------------------------------------------------
2096; int pixel_ssd_sp_16x64( int16_t *, intptr_t, uint8_t *, intptr_t )
2097;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_16x64, 4, 6, 8, src1, stride1, src2, stride2
    add      r1, r1                     ; double the 16-bit source stride
    lea      r5, [r3 * 2]               ; two rows of the 8-bit source
    lea      r4, [r1 * 2]               ; two rows of the 16-bit source
    pxor     m7, m7                     ; dword accumulator
    pxor     m6, m6                     ; zero half for punpckhbw in the helper

    ; four stacked 16x16 blocks
    call     pixel_ssd_sp_16x16_internal
%rep 3
    lea      r0, [r0 + r4]              ; helper stops at row 14, add 2 more
    lea      r2, [r2 + r5]
    call     pixel_ssd_sp_16x16_internal
%endrep

    HADDD    m7, m1
    movd     eax, m7
    RET
2120
2121;-----------------------------------------------------------------------------
2122; int pixel_ssd_sp_24x32( int16_t *, intptr_t, uint8_t *, intptr_t )
2123;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_24x32, 4, 7, 8, src1, stride1, src2, stride2
    mov      r5, r0                     ; remember both block origins
    mov      r6, r2
    add      r1, r1                     ; double the 16-bit source stride
    lea      r4, [r1 * 2]               ; two rows of the 16-bit source
    pxor     m7, m7                     ; dword accumulator
    pxor     m6, m6                     ; zero half for punpckhbw in the 16x16 helper

    ; columns 0-15: two stacked 16x16 blocks
    call     pixel_ssd_sp_16x16_internal
    lea      r0, [r0 + r4]              ; helper stops at row 14, add 2 more
    lea      r2, [r2 + 2 * r3]
    call     pixel_ssd_sp_16x16_internal

    ; columns 16-23: eight stacked 8x4 blocks
    lea      r0, [r5 + 32]              ; 16 samples * 2 bytes
    lea      r2, [r6 + 16]              ; 16 pixels  * 1 byte
    lea      r4, [r1 * 3]               ; r4/r5 now hold the 8x4 helper's row-3 offsets
    lea      r5, [r3 * 3]
    call     pixel_ssd_sp_8x4_internal
%rep 7
    lea      r0, [r0 + 4 * r1]
    lea      r2, [r2 + 4 * r3]
    call     pixel_ssd_sp_8x4_internal
%endrep

    HADDD    m7, m1
    movd     eax, m7
    RET
2165
2166;-----------------------------------------------------------------------------
; int pixel_ssd_sp_32x8( int16_t *, intptr_t, uint8_t *, intptr_t )
2168;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_32x8, 4, 7, 8, src1, stride1, src2, stride2
    mov      r5, r0                     ; remember both block origins
    mov      r6, r2
    add      r1, r1                     ; double the 16-bit source stride
    lea      r4, [r1 * 2]               ; two rows of the 16-bit source
    pxor     m6, m6                     ; zero half for punpckhbw in the macro
    pxor     m7, m7                     ; dword accumulator

    ; two 16-pixel columns, each made of two stacked 16x4 blocks
%assign ssd_xoff 0
%rep 2
  %if ssd_xoff > 0
    lea      r0, [r5 + 2 * ssd_xoff]    ; 2 bytes per sample in the 16-bit source
    lea      r2, [r6 + ssd_xoff]
  %endif
    PIXEL_SSD_SP_16x4
    lea      r0, [r0 + r4]              ; macro advanced 2 rows, add 2 more
    lea      r2, [r2 + 2 * r3]
    PIXEL_SSD_SP_16x4
  %assign ssd_xoff ssd_xoff + 16
%endrep
%undef ssd_xoff

    HADDD    m7, m1
    movd     eax, m7
    RET
2191
2192;-----------------------------------------------------------------------------
; int pixel_ssd_sp_32x16( int16_t *, intptr_t, uint8_t *, intptr_t )
2194;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_32x16, 4, 7, 8, src1, stride1, src2, stride2
    mov      r5, r0                     ; remember both block origins
    mov      r6, r2
    add      r1, r1                     ; double the 16-bit source stride
    lea      r4, [r1 * 2]               ; two rows of the 16-bit source
    pxor     m6, m6                     ; zero half for punpckhbw in the helper
    pxor     m7, m7                     ; dword accumulator

    ; left 16x16 block
    call     pixel_ssd_sp_16x16_internal

    ; right 16x16 block
    lea      r0, [r5 + 32]              ; 16 samples * 2 bytes
    lea      r2, [r6 + 16]              ; 16 pixels  * 1 byte
    call     pixel_ssd_sp_16x16_internal

    HADDD    m7, m1
    movd     eax, m7
    RET
2211
2212;-----------------------------------------------------------------------------
; int pixel_ssd_sp_32x24( int16_t *, intptr_t, uint8_t *, intptr_t )
2214;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_32x24, 4, 7, 8, src1, stride1, src2, stride2
    mov      r5, r0                     ; remember both block origins
    mov      r6, r2
    add      r1, r1                     ; double the 16-bit source stride
    lea      r4, [r1 * 2]               ; two rows of the 16-bit source
    pxor     m6, m6                     ; zero half for punpckhbw
    pxor     m7, m7                     ; dword accumulator

    ; two 16-pixel columns, each one 16x16 block followed by two 16x4 blocks
%assign ssd_xoff 0
%rep 2
  %if ssd_xoff > 0
    lea      r0, [r5 + 2 * ssd_xoff]    ; 2 bytes per sample in the 16-bit source
    lea      r2, [r6 + ssd_xoff]
  %endif
    call     pixel_ssd_sp_16x16_internal
  %rep 2
    lea      r0, [r0 + r4]              ; previous step stopped 2 rows short
    lea      r2, [r2 + 2 * r3]
    PIXEL_SSD_SP_16x4
  %endrep
  %assign ssd_xoff ssd_xoff + 16
%endrep
%undef ssd_xoff

    HADDD    m7, m1
    movd     eax, m7
    RET
2243
2244;-----------------------------------------------------------------------------
; int pixel_ssd_sp_32x32( int16_t *, intptr_t, uint8_t *, intptr_t )
2246;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_32x32, 4, 7, 8, src1, stride1, src2, stride2
    mov      r5, r0                     ; remember both block origins
    mov      r6, r2
    add      r1, r1                     ; double the 16-bit source stride
    lea      r4, [r1 * 2]               ; two rows of the 16-bit source
    pxor     m6, m6                     ; zero half for punpckhbw in the helper
    pxor     m7, m7                     ; dword accumulator

    ; two 16-pixel columns, each made of two stacked 16x16 blocks
%assign ssd_xoff 0
%rep 2
  %if ssd_xoff > 0
    lea      r0, [r5 + 2 * ssd_xoff]    ; 2 bytes per sample in the 16-bit source
    lea      r2, [r6 + ssd_xoff]
  %endif
    call     pixel_ssd_sp_16x16_internal
    lea      r0, [r0 + r4]              ; helper stops at row 14, add 2 more
    lea      r2, [r2 + 2 * r3]
    call     pixel_ssd_sp_16x16_internal
  %assign ssd_xoff ssd_xoff + 16
%endrep
%undef ssd_xoff

    HADDD    m7, m1
    movd     eax, m7
    RET
2269
2270;-----------------------------------------------------------------------------
2271; int pixel_ssd_32x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
2272;-----------------------------------------------------------------------------
2273INIT_XMM sse4
2274cglobal pixel_ssd_sp_32x64, 4, 7, 8, src1, stride1, src2, stride2
2275
2276    pxor     m7,     m7
2277    pxor     m6,     m6
2278    mov      r5,     r0
2279    mov      r6,     r2
2280    add      r1,     r1
2281    lea      r4,     [r1 * 2]
2282    call     pixel_ssd_sp_16x16_internal
2283    lea      r0,     [r0 + r4]
2284    lea      r2,     [r2 + 2 * r3]
2285    call     pixel_ssd_sp_16x16_internal
2286    lea      r0,     [r0 + r4]
2287    lea      r2,     [r2 + 2 * r3]
2288    call     pixel_ssd_sp_16x16_internal
2289    lea      r0,     [r0 + r4]
2290    lea      r2,     [r2 + 2 * r3]
2291    call     pixel_ssd_sp_16x16_internal
2292    lea      r0,     [r5 + 32]
2293    lea      r2,     [r6 + 16]
2294    call     pixel_ssd_sp_16x16_internal
2295    lea      r0,     [r0 + r4]
2296    lea      r2,     [r2 + 2 * r3]
2297    call     pixel_ssd_sp_16x16_internal
2298    lea      r0,     [r0 + r4]
2299    lea      r2,     [r2 + 2 * r3]
2300    call     pixel_ssd_sp_16x16_internal
2301    lea      r0,     [r0 + r4]
2302    lea      r2,     [r2 + 2 * r3]
2303    call     pixel_ssd_sp_16x16_internal
2304    HADDD    m7,     m1
2305    movd     eax,    m7
2306    RET
2307
2308;-----------------------------------------------------------------------------
2309; int pixel_ssd_48x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
2310;-----------------------------------------------------------------------------
2311INIT_XMM sse4
2312cglobal pixel_ssd_sp_48x64, 4, 7, 8, src1, stride1, src2, stride2
2313
2314    pxor     m7,     m7
2315    pxor     m6,     m6
2316    mov      r5,     r0
2317    mov      r6,     r2
2318    add      r1,     r1
2319    lea      r4,     [r1 * 2]
2320    call     pixel_ssd_sp_16x16_internal
2321    lea      r0,     [r0 + r4]
2322    lea      r2,     [r2 + 2 * r3]
2323    call     pixel_ssd_sp_16x16_internal
2324    lea      r0,     [r0 + r4]
2325    lea      r2,     [r2 + 2 * r3]
2326    call     pixel_ssd_sp_16x16_internal
2327    lea      r0,     [r0 + r4]
2328    lea      r2,     [r2 + 2 * r3]
2329    call     pixel_ssd_sp_16x16_internal
2330    lea      r0,     [r5 + 32]
2331    lea      r2,     [r6 + 16]
2332    call     pixel_ssd_sp_16x16_internal
2333    lea      r0,     [r0 + r4]
2334    lea      r2,     [r2 + 2 * r3]
2335    call     pixel_ssd_sp_16x16_internal
2336    lea      r0,     [r0 + r4]
2337    lea      r2,     [r2 + 2 * r3]
2338    call     pixel_ssd_sp_16x16_internal
2339    lea      r0,     [r0 + r4]
2340    lea      r2,     [r2 + 2 * r3]
2341    call     pixel_ssd_sp_16x16_internal
2342    lea      r0,     [r5 + 64]
2343    lea      r2,     [r6 + 32]
2344    call     pixel_ssd_sp_16x16_internal
2345    lea      r0,     [r0 + r4]
2346    lea      r2,     [r2 + 2 * r3]
2347    call     pixel_ssd_sp_16x16_internal
2348    lea      r0,     [r0 + r4]
2349    lea      r2,     [r2 + 2 * r3]
2350    call     pixel_ssd_sp_16x16_internal
2351    lea      r0,     [r0 + r4]
2352    lea      r2,     [r2 + 2 * r3]
2353    call     pixel_ssd_sp_16x16_internal
2354    HADDD    m7,     m1
2355    movd     eax,    m7
2356    RET
2357
2358;-----------------------------------------------------------------------------
2359; int pixel_ssd_64x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
2360;-----------------------------------------------------------------------------
2361INIT_XMM sse4
2362cglobal pixel_ssd_sp_64x16, 4, 7, 8, src1, stride1, src2, stride2
2363
2364    pxor     m7,     m7
2365    pxor     m6,     m6
2366    mov      r5,     r0
2367    mov      r6,     r2
2368    add      r1,     r1
2369    lea      r4,     [r1 * 2]
2370    call     pixel_ssd_sp_16x16_internal
2371    lea      r0,     [r5 + 32]
2372    lea      r2,     [r6 + 16]
2373    call     pixel_ssd_sp_16x16_internal
2374    lea      r0,     [r5 + 64]
2375    lea      r2,     [r6 + 32]
2376    call     pixel_ssd_sp_16x16_internal
2377    lea      r0,     [r5 + 96]
2378    lea      r2,     [r6 + 48]
2379    call     pixel_ssd_sp_16x16_internal
2380    HADDD    m7,     m1
2381    movd     eax,    m7
2382    RET
2383
2384;-----------------------------------------------------------------------------
2385; int pixel_ssd_64x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
2386;-----------------------------------------------------------------------------
2387INIT_XMM sse4
2388cglobal pixel_ssd_sp_64x32, 4, 7, 8, src1, stride1, src2, stride2
2389
2390    pxor     m7,     m7
2391    pxor     m6,     m6
2392    mov      r5,     r0
2393    mov      r6,     r2
2394    add      r1,     r1
2395    lea      r4,     [r1 * 2]
2396    call     pixel_ssd_sp_16x16_internal
2397    lea      r0,     [r0 + r4]
2398    lea      r2,     [r2 + 2 * r3]
2399    call     pixel_ssd_sp_16x16_internal
2400    lea      r0,     [r5 + 32]
2401    lea      r2,     [r6 + 16]
2402    call     pixel_ssd_sp_16x16_internal
2403    lea      r0,     [r0 + r4]
2404    lea      r2,     [r2 + 2 * r3]
2405    call     pixel_ssd_sp_16x16_internal
2406    lea      r0,     [r5 + 64]
2407    lea      r2,     [r6 + 32]
2408    call     pixel_ssd_sp_16x16_internal
2409    lea      r0,     [r0 + r4]
2410    lea      r2,     [r2 + 2 * r3]
2411    call     pixel_ssd_sp_16x16_internal
2412    lea      r0,     [r5 + 96]
2413    lea      r2,     [r6 + 48]
2414    call     pixel_ssd_sp_16x16_internal
2415    lea      r0,     [r0 + r4]
2416    lea      r2,     [r2 + 2 * r3]
2417    call     pixel_ssd_sp_16x16_internal
2418    HADDD    m7,     m1
2419    movd     eax,    m7
2420    RET
2421
2422;-----------------------------------------------------------------------------
2423; int pixel_ssd_64x48( uint8_t *, intptr_t, uint8_t *, intptr_t )
2424;-----------------------------------------------------------------------------
2425INIT_XMM sse4
2426cglobal pixel_ssd_sp_64x48, 4, 7, 8, src1, stride1, src2, stride2
2427
2428    pxor     m7,     m7
2429    pxor     m6,     m6
2430    mov      r5,     r0
2431    mov      r6,     r2
2432    add      r1,     r1
2433    lea      r4,     [r1 * 2]
2434    call     pixel_ssd_sp_16x16_internal
2435    lea      r0,     [r0 + r4]
2436    lea      r2,     [r2 + 2 * r3]
2437    call     pixel_ssd_sp_16x16_internal
2438    lea      r0,     [r0 + r4]
2439    lea      r2,     [r2 + 2 * r3]
2440    call     pixel_ssd_sp_16x16_internal
2441    lea      r0,     [r5 + 32]
2442    lea      r2,     [r6 + 16]
2443    call     pixel_ssd_sp_16x16_internal
2444    lea      r0,     [r0 + r4]
2445    lea      r2,     [r2 + 2 * r3]
2446    call     pixel_ssd_sp_16x16_internal
2447    lea      r0,     [r0 + r4]
2448    lea      r2,     [r2 + 2 * r3]
2449    call     pixel_ssd_sp_16x16_internal
2450    lea      r0,     [r5 + 64]
2451    lea      r2,     [r6 + 32]
2452    call     pixel_ssd_sp_16x16_internal
2453    lea      r0,     [r0 + r4]
2454    lea      r2,     [r2 + 2 * r3]
2455    call     pixel_ssd_sp_16x16_internal
2456    lea      r0,     [r0 + r4]
2457    lea      r2,     [r2 + 2 * r3]
2458    call     pixel_ssd_sp_16x16_internal
2459    lea      r0,     [r5 + 96]
2460    lea      r2,     [r6 + 48]
2461    call     pixel_ssd_sp_16x16_internal
2462    lea      r0,     [r0 + r4]
2463    lea      r2,     [r2 + 2 * r3]
2464    call     pixel_ssd_sp_16x16_internal
2465    lea      r0,     [r0 + r4]
2466    lea      r2,     [r2 + 2 * r3]
2467    call     pixel_ssd_sp_16x16_internal
2468    HADDD    m7,     m1
2469    movd     eax,    m7
2470    RET
2471
2472;-----------------------------------------------------------------------------
2473; int pixel_ssd_64x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
2474;-----------------------------------------------------------------------------
2475INIT_XMM sse4
2476cglobal pixel_ssd_sp_64x64, 4, 7, 8, src1, stride1, src2, stride2
2477
2478    pxor     m7,     m7
2479    pxor     m6,     m6
2480    mov      r5,     r0
2481    mov      r6,     r2
2482    add      r1,     r1
2483    lea      r4,     [r1 * 2]
2484    call     pixel_ssd_sp_16x16_internal
2485    lea      r0,     [r0 + r4]
2486    lea      r2,     [r2 + 2 * r3]
2487    call     pixel_ssd_sp_16x16_internal
2488    lea      r0,     [r0 + r4]
2489    lea      r2,     [r2 + 2 * r3]
2490    call     pixel_ssd_sp_16x16_internal
2491    lea      r0,     [r0 + r4]
2492    lea      r2,     [r2 + 2 * r3]
2493    call     pixel_ssd_sp_16x16_internal
2494    lea      r0,     [r5 + 32]
2495    lea      r2,     [r6 + 16]
2496    call     pixel_ssd_sp_16x16_internal
2497    lea      r0,     [r0 + r4]
2498    lea      r2,     [r2 + 2 * r3]
2499    call     pixel_ssd_sp_16x16_internal
2500    lea      r0,     [r0 + r4]
2501    lea      r2,     [r2 + 2 * r3]
2502    call     pixel_ssd_sp_16x16_internal
2503    lea      r0,     [r0 + r4]
2504    lea      r2,     [r2 + 2 * r3]
2505    call     pixel_ssd_sp_16x16_internal
2506    lea      r0,     [r5 + 64]
2507    lea      r2,     [r6 + 32]
2508    call     pixel_ssd_sp_16x16_internal
2509    lea      r0,     [r0 + r4]
2510    lea      r2,     [r2 + 2 * r3]
2511    call     pixel_ssd_sp_16x16_internal
2512    lea      r0,     [r0 + r4]
2513    lea      r2,     [r2 + 2 * r3]
2514    call     pixel_ssd_sp_16x16_internal
2515    lea      r0,     [r0 + r4]
2516    lea      r2,     [r2 + 2 * r3]
2517    call     pixel_ssd_sp_16x16_internal
2518    lea      r0,     [r5 + 96]
2519    lea      r2,     [r6 + 48]
2520    call     pixel_ssd_sp_16x16_internal
2521    lea      r0,     [r0 + r4]
2522    lea      r2,     [r2 + 2 * r3]
2523    call     pixel_ssd_sp_16x16_internal
2524    lea      r0,     [r0 + r4]
2525    lea      r2,     [r2 + 2 * r3]
2526    call     pixel_ssd_sp_16x16_internal
2527    lea      r0,     [r0 + r4]
2528    lea      r2,     [r2 + 2 * r3]
2529    call     pixel_ssd_sp_16x16_internal
2530    HADDD    m7,     m1
2531    movd     eax,    m7
2532    RET
2533
2534
2535;-----------------------------------------------------------------------------
2536; int pixel_ssd_s( int16_t *ref, intptr_t i_stride )
2537;-----------------------------------------------------------------------------
; 4x4 variant: sum of squares of four rows of four 16-bit samples.
INIT_XMM sse2
cglobal pixel_ssd_s_4, 2,2,2
    add     r1, r1                      ; stride: 16-bit elements -> bytes

    ; rows 0 and 1 packed into the low/high halves of m0, then squared
    movh    m0, [r0]
    movhps  m0, [r0 + r1]
    pmaddwd m0, m0                      ; square each word, add adjacent pairs

    ; rows 2 and 3, same treatment in m1
    lea     r0, [r0 + r1 * 2]
    movh    m1, [r0]
    movhps  m1, [r0 + r1]
    pmaddwd m1, m1

    paddd   m0, m1                      ; combine the two row pairs

    ; fold the dword lanes and return the low dword
    HADDD   m0, m1
    movd    eax, m0
    RET
2556
2557
; 8x8 variant: sum of squares of eight rows of eight 16-bit samples
; (one full XMM load per row).
INIT_XMM sse2
cglobal pixel_ssd_s_8, 2,3,5
    add     r1, r1                      ; stride: 16-bit elements -> bytes
    lea     r2, [r1 * 3]                ; r2 = 3 rows, to address the 4th row of a group

    ; ---- rows 0-3, summed into m0 ----
    movu    m0, [r0]
    movu    m1, [r0 + r1]
    pmaddwd m0, m0                      ; square each word, add adjacent pairs
    pmaddwd m1, m1
    paddd   m0, m1

    movu    m2, [r0 + r1 * 2]
    movu    m3, [r0 + r2]
    pmaddwd m2, m2
    pmaddwd m3, m3
    paddd   m2, m3
    paddd   m0, m2

    lea     r0, [r0 + r1 * 4]           ; move down four rows

    ; ---- rows 4-7, summed into m4 ----
    movu    m4, [r0]
    movu    m1, [r0 + r1]
    pmaddwd m4, m4
    pmaddwd m1, m1
    paddd   m4, m1

    movu    m2, [r0 + r1 * 2]
    movu    m3, [r0 + r2]
    pmaddwd m2, m2
    pmaddwd m3, m3
    paddd   m2, m3
    paddd   m4, m2

    paddd   m0, m4                      ; total of both halves

    ; fold the dword lanes and return the low dword
    HADDD   m0, m1
    movd    eax, m0
    RET
2594
2595
; 16x16 variant (SSE2): each row is two XMM loads; the loop runs 4 times and
; handles four rows per iteration as two identical two-row steps.
INIT_XMM sse2
cglobal pixel_ssd_s_16, 2,3,5
    add     r1, r1                      ; stride: 16-bit elements -> bytes

    mov     r2d, 4                      ; 4 iterations x 4 rows = 16 rows
    pxor    m0, m0                      ; m0 = running dword sums
.loop:
%rep 2                                  ; two rows per repetition
    movu    m1, [r0]
    movu    m2, [r0 + mmsize]
    movu    m3, [r0 + r1]
    movu    m4, [r0 + r1 + mmsize]
    lea     r0, [r0 + r1 * 2]           ; advance two rows

    pmaddwd m1, m1                      ; square each word, add adjacent pairs
    pmaddwd m2, m2
    pmaddwd m3, m3
    pmaddwd m4, m4
    paddd   m1, m2
    paddd   m3, m4
    paddd   m1, m3
    paddd   m0, m1
%endrep

    dec     r2d
    jnz    .loop

    ; fold the dword lanes and return the low dword
    HADDD   m0, m1
    movd    eax, m0
    RET
2640
2641
; 32x32 variant (SSE2): each row is four XMM loads; the loop runs 16 times and
; handles two rows per iteration as two identical one-row steps.
INIT_XMM sse2
cglobal pixel_ssd_s_32, 2,3,5
    add     r1, r1                      ; stride: 16-bit elements -> bytes

    mov     r2d, 16                     ; 16 iterations x 2 rows = 32 rows
    pxor    m0, m0                      ; m0 = running dword sums
.loop:
%rep 2                                  ; one row per repetition
    movu    m1, [r0 + 0 * mmsize]
    movu    m2, [r0 + 1 * mmsize]
    movu    m3, [r0 + 2 * mmsize]
    movu    m4, [r0 + 3 * mmsize]
    add     r0, r1                      ; next row

    pmaddwd m1, m1                      ; square each word, add adjacent pairs
    pmaddwd m2, m2
    pmaddwd m3, m3
    pmaddwd m4, m4
    paddd   m1, m2
    paddd   m3, m4
    paddd   m1, m3
    paddd   m0, m1
%endrep

    dec     r2d
    jnz    .loop

    ; fold the dword lanes and return the low dword
    HADDD   m0, m1
    movd    eax, m0
    RET
2686
; 16x16 variant (AVX2): one YMM load covers a whole row of sixteen 16-bit
; samples; four rows are consumed per loop iteration.
INIT_YMM avx2
cglobal pixel_ssd_s_16, 2,4,5
    add     r1, r1                      ; stride: 16-bit elements -> bytes
    lea     r3, [r1 * 3]                ; r3 = 3 rows, to address the 4th row of a group
    mov     r2d, 16/4                   ; 16 rows, 4 per iteration
    pxor    m0, m0                      ; m0 = running dword sums
.loop:
    ; rows 0 and 1 of this group
    movu    m1, [r0]
    movu    m2, [r0 + r1]
    pmaddwd m1, m1                      ; square each word, add adjacent pairs
    pmaddwd m2, m2
    paddd   m1, m2

    ; rows 2 and 3 of this group
    movu    m3, [r0 + 2 * r1]
    movu    m4, [r0 + r3]
    pmaddwd m3, m3
    pmaddwd m4, m4
    paddd   m3, m4

    paddd   m1, m3
    paddd   m0, m1

    lea     r0, [r0 + r1 * 4]           ; advance four rows (lea leaves flags alone)
    dec     r2d
    jnz    .loop

    ; fold the dword lanes and return the low dword
    HADDD   m0, m1
    movd    eax, xm0
    RET
2716
; 32x32 variant (AVX2): each row of thirty-two 16-bit samples is two YMM
; loads; four rows are consumed per loop iteration.
INIT_YMM avx2
cglobal pixel_ssd_s_32, 2,4,5
    add     r1, r1                      ; stride: 16-bit elements -> bytes
    lea     r3, [r1 * 3]                ; r3 = 3 rows, to address the 4th row of a group

    mov     r2d, 8                      ; 8 iterations x 4 rows = 32 rows
    pxor    m0, m0                      ; m0 = running dword sums
.loop:
    ; row 0
    movu    m1, [r0 + 0 * mmsize]
    movu    m2, [r0 + 1 * mmsize]
    pmaddwd m1, m1                      ; square each word, add adjacent pairs
    pmaddwd m2, m2
    paddd   m1, m2
    ; row 1
    movu    m3, [r0 + r1 + 0 * mmsize]
    movu    m4, [r0 + r1 + 1 * mmsize]
    pmaddwd m3, m3
    pmaddwd m4, m4
    paddd   m3, m4

    paddd   m1, m3
    paddd   m0, m1

    ; row 2
    movu    m1, [r0 + r1 * 2 + 0 * mmsize]
    movu    m2, [r0 + r1 * 2 + 1 * mmsize]
    pmaddwd m1, m1
    pmaddwd m2, m2
    paddd   m1, m2
    ; row 3
    movu    m3, [r0 + r3 + 0 * mmsize]
    movu    m4, [r0 + r3 + 1 * mmsize]
    pmaddwd m3, m3
    pmaddwd m4, m4
    paddd   m3, m4

    paddd   m1, m3
    paddd   m0, m1

    lea     r0, [r0 + 4 * r1]           ; advance four rows (lea leaves flags alone)
    dec     r2d
    jnz    .loop

    ; fold the dword lanes and return the low dword
    HADDD   m0, m1
    movd    eax, xm0
    RET
2761