1;*****************************************************************************
2;* sad-a.asm: x86 sad functions
3;*****************************************************************************
4;* Copyright (C) 2003-2013 x264 project
5;*
6;* Authors: Loren Merritt <lorenm@u.washington.edu>
7;*          Fiona Glaser <fiona@x264.com>
8;*          Laurent Aimar <fenrir@via.ecp.fr>
9;*          Alex Izvorski <aizvorksi@gmail.com>
10;*          Min Chen <chenm003@163.com>
11;*
12;* This program is free software; you can redistribute it and/or modify
13;* it under the terms of the GNU General Public License as published by
14;* the Free Software Foundation; either version 2 of the License, or
15;* (at your option) any later version.
16;*
17;* This program is distributed in the hope that it will be useful,
18;* but WITHOUT ANY WARRANTY; without even the implied warranty of
19;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20;* GNU General Public License for more details.
21;*
22;* You should have received a copy of the GNU General Public License
23;* along with this program; if not, write to the Free Software
24;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
25;*
26;* This program is also available under a commercial proprietary license.
27;* For more information, contact us at license @ x265.com.
28;*****************************************************************************
29
30%include "x86inc.asm"
31%include "x86util.asm"
32
33SECTION_RODATA 32
34
35MSK:                  db 255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0
36
37SECTION .text
38
39cextern pb_3
40cextern pb_shuf8x8c
41cextern pw_8
42cextern pd_64
43
44;=============================================================================
45; SAD MMX
46;=============================================================================
47
48%macro SAD_INC_2x16P 0
49    movq    mm1,    [r0]
50    movq    mm2,    [r0+8]
51    movq    mm3,    [r0+r1]
52    movq    mm4,    [r0+r1+8]
53    psadbw  mm1,    [r2]
54    psadbw  mm2,    [r2+8]
55    psadbw  mm3,    [r2+r3]
56    psadbw  mm4,    [r2+r3+8]
57    lea     r0,     [r0+2*r1]
58    paddw   mm1,    mm2
59    paddw   mm3,    mm4
60    lea     r2,     [r2+2*r3]
61    paddw   mm0,    mm1
62    paddw   mm0,    mm3
63%endmacro
64
65%macro SAD_INC_2x8P 0
66    movq    mm1,    [r0]
67    movq    mm2,    [r0+r1]
68    psadbw  mm1,    [r2]
69    psadbw  mm2,    [r2+r3]
70    lea     r0,     [r0+2*r1]
71    paddw   mm0,    mm1
72    paddw   mm0,    mm2
73    lea     r2,     [r2+2*r3]
74%endmacro
75
76%macro SAD_INC_2x4P 0
77    movd    mm1,    [r0]
78    movd    mm2,    [r2]
79    punpckldq mm1,  [r0+r1]
80    punpckldq mm2,  [r2+r3]
81    psadbw  mm1,    mm2
82    paddw   mm0,    mm1
83    lea     r0,     [r0+2*r1]
84    lea     r2,     [r2+2*r3]
85%endmacro
86
87;-----------------------------------------------------------------------------
88; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
89;-----------------------------------------------------------------------------
90%macro SAD 2
91cglobal pixel_sad_%1x%2_mmx2, 4,4
92    pxor    mm0, mm0
93%rep %2/2
94    SAD_INC_2x%1P
95%endrep
96    movd    eax, mm0
97    RET
98%endmacro
99
100SAD 16, 16
101SAD 16,  8
102SAD  8, 16
103SAD  8,  8
104SAD  8,  4
105SAD  4, 16
106SAD  4,  8
107SAD  4,  4
108
109
110
111;=============================================================================
112; SAD XMM
113;=============================================================================
114
115%macro SAD_END_SSE2 0
116    movhlps m1, m0
117    paddw   m0, m1
118    movd   eax, m0
119    RET
120%endmacro
121
122%macro PROCESS_SAD_12x4 0
123    movu    m1,  [r2]
124    movu    m2,  [r0]
125    pand    m1,  m4
126    pand    m2,  m4
127    psadbw  m1,  m2
128    paddd   m0,  m1
129    lea     r2,  [r2 + r3]
130    lea     r0,  [r0 + r1]
131    movu    m1,  [r2]
132    movu    m2,  [r0]
133    pand    m1,  m4
134    pand    m2,  m4
135    psadbw  m1,  m2
136    paddd   m0,  m1
137    lea     r2,  [r2 + r3]
138    lea     r0,  [r0 + r1]
139    movu    m1,  [r2]
140    movu    m2,  [r0]
141    pand    m1,  m4
142    pand    m2,  m4
143    psadbw  m1,  m2
144    paddd   m0,  m1
145    lea     r2,  [r2 + r3]
146    lea     r0,  [r0 + r1]
147    movu    m1,  [r2]
148    movu    m2,  [r0]
149    pand    m1,  m4
150    pand    m2,  m4
151    psadbw  m1,  m2
152    paddd   m0,  m1
153%endmacro
154
155%macro PROCESS_SAD_16x4 0
156    movu    m1,  [r2]
157    movu    m2,  [r2 + r3]
158    psadbw  m1,  [r0]
159    psadbw  m2,  [r0 + r1]
160    paddd   m1,  m2
161    paddd   m0,  m1
162    lea     r2,  [r2 + 2 * r3]
163    lea     r0,  [r0 + 2 * r1]
164    movu    m1,  [r2]
165    movu    m2,  [r2 + r3]
166    psadbw  m1,  [r0]
167    psadbw  m2,  [r0 + r1]
168    paddd   m1,  m2
169    paddd   m0,  m1
170    lea     r2,  [r2 + 2 * r3]
171    lea     r0,  [r0 + 2 * r1]
172%endmacro
173
174%macro PROCESS_SAD_24x4 0
175    movu        m1,  [r2]
176    movq        m2,  [r2 + 16]
177    lea         r2,  [r2 + r3]
178    movu        m3,  [r2]
179    movq        m4,  [r2 + 16]
180    psadbw      m1,  [r0]
181    psadbw      m3,  [r0 + r1]
182    paddd       m0,  m1
183    paddd       m0,  m3
184    movq        m1,  [r0 + 16]
185    lea         r0,  [r0 + r1]
186    movq        m3,  [r0 + 16]
187    punpcklqdq  m2,  m4
188    punpcklqdq  m1,  m3
189    psadbw      m2, m1
190    paddd       m0, m2
191    lea         r2,  [r2 + r3]
192    lea         r0,  [r0 + r1]
193
194    movu        m1,  [r2]
195    movq        m2,  [r2 + 16]
196    lea         r2,  [r2 + r3]
197    movu        m3,  [r2]
198    movq        m4,  [r2 + 16]
199    psadbw      m1,  [r0]
200    psadbw      m3,  [r0 + r1]
201    paddd       m0,  m1
202    paddd       m0,  m3
203    movq        m1,  [r0 + 16]
204    lea         r0,  [r0 + r1]
205    movq        m3,  [r0 + 16]
206    punpcklqdq  m2,  m4
207    punpcklqdq  m1,  m3
208    psadbw      m2, m1
209    paddd       m0, m2
210%endmacro
211
212%macro PROCESS_SAD_32x4 0
213    movu    m1,  [r2]
214    movu    m2,  [r2 + 16]
215    psadbw  m1,  [r0]
216    psadbw  m2,  [r0 + 16]
217    paddd   m1,  m2
218    paddd   m0,  m1
219    lea     r2,  [r2 + r3]
220    lea     r0,  [r0 + r1]
221    movu    m1,  [r2]
222    movu    m2,  [r2 + 16]
223    psadbw  m1,  [r0]
224    psadbw  m2,  [r0 + 16]
225    paddd   m1,  m2
226    paddd   m0,  m1
227    lea     r2,  [r2 + r3]
228    lea     r0,  [r0 + r1]
229    movu    m1,  [r2]
230    movu    m2,  [r2 + 16]
231    psadbw  m1,  [r0]
232    psadbw  m2,  [r0 + 16]
233    paddd   m1,  m2
234    paddd   m0,  m1
235    lea     r2,  [r2 + r3]
236    lea     r0,  [r0 + r1]
237    movu    m1,  [r2]
238    movu    m2,  [r2 + 16]
239    psadbw  m1,  [r0]
240    psadbw  m2,  [r0 + 16]
241    paddd   m1,  m2
242    paddd   m0,  m1
243    lea     r2,  [r2 + r3]
244    lea     r0,  [r0 + r1]
245%endmacro
246
247%macro PROCESS_SAD_48x4 0
248    movu    m1,  [r2]
249    movu    m2,  [r2 + 16]
250    movu    m3,  [r2 + 32]
251    psadbw  m1,  [r0]
252    psadbw  m2,  [r0 + 16]
253    psadbw  m3,  [r0 + 32]
254    paddd   m1,  m2
255    paddd   m0,  m1
256    paddd   m0,  m3
257    lea     r2,  [r2 + r3]
258    lea     r0,  [r0 + r1]
259
260    movu    m1,  [r2]
261    movu    m2,  [r2 + 16]
262    movu    m3,  [r2 + 32]
263    psadbw  m1,  [r0]
264    psadbw  m2,  [r0 + 16]
265    psadbw  m3,  [r0 + 32]
266    paddd   m1,  m2
267    paddd   m0,  m1
268    paddd   m0,  m3
269    lea     r2,  [r2 + r3]
270    lea     r0,  [r0 + r1]
271
272    movu    m1,  [r2]
273    movu    m2,  [r2 + 16]
274    movu    m3,  [r2 + 32]
275    psadbw  m1,  [r0]
276    psadbw  m2,  [r0 + 16]
277    psadbw  m3,  [r0 + 32]
278    paddd   m1,  m2
279    paddd   m0,  m1
280    paddd   m0,  m3
281    lea     r2,  [r2 + r3]
282    lea     r0,  [r0 + r1]
283
284    movu    m1,  [r2]
285    movu    m2,  [r2 + 16]
286    movu    m3,  [r2 + 32]
287    psadbw  m1,  [r0]
288    psadbw  m2,  [r0 + 16]
289    psadbw  m3,  [r0 + 32]
290    paddd   m1,  m2
291    paddd   m0,  m1
292    paddd   m0,  m3
293%endmacro
294
295%macro PROCESS_SAD_8x4 0
296    movq        m1, [r2]
297    movq        m2, [r2 + r3]
298    lea         r2, [r2 + 2 * r3]
299    movq        m3, [r0]
300    movq        m4, [r0 + r1]
301    lea         r0, [r0 + 2 * r1]
302    punpcklqdq  m1, m2
303    punpcklqdq  m3, m4
304    psadbw      m1, m3
305    paddd       m0, m1
306    movq        m1, [r2]
307    movq        m2, [r2 + r3]
308    lea         r2, [r2 + 2 * r3]
309    movq        m3, [r0]
310    movq        m4, [r0 + r1]
311    lea         r0, [r0 + 2 * r1]
312    punpcklqdq  m1, m2
313    punpcklqdq  m3, m4
314    psadbw      m1, m3
315    paddd       m0, m1
316%endmacro
317
318%macro PROCESS_SAD_64x4 0
319    movu    m1,  [r2]
320    movu    m2,  [r2 + 16]
321    movu    m3,  [r2 + 32]
322    movu    m4,  [r2 + 48]
323    psadbw  m1,  [r0]
324    psadbw  m2,  [r0 + 16]
325    psadbw  m3,  [r0 + 32]
326    psadbw  m4,  [r0 + 48]
327    paddd   m1,  m2
328    paddd   m3,  m4
329    paddd   m0,  m1
330    paddd   m0,  m3
331    lea     r2,  [r2 + r3]
332    lea     r0,  [r0 + r1]
333
334    movu    m1,  [r2]
335    movu    m2,  [r2 + 16]
336    movu    m3,  [r2 + 32]
337    movu    m4,  [r2 + 48]
338    psadbw  m1,  [r0]
339    psadbw  m2,  [r0 + 16]
340    psadbw  m3,  [r0 + 32]
341    psadbw  m4,  [r0 + 48]
342    paddd   m1,  m2
343    paddd   m3,  m4
344    paddd   m0,  m1
345    paddd   m0,  m3
346    lea     r2,  [r2 + r3]
347    lea     r0,  [r0 + r1]
348
349    movu    m1,  [r2]
350    movu    m2,  [r2 + 16]
351    movu    m3,  [r2 + 32]
352    movu    m4,  [r2 + 48]
353    psadbw  m1,  [r0]
354    psadbw  m2,  [r0 + 16]
355    psadbw  m3,  [r0 + 32]
356    psadbw  m4,  [r0 + 48]
357    paddd   m1,  m2
358    paddd   m3,  m4
359    paddd   m0,  m1
360    paddd   m0,  m3
361    lea     r2,  [r2 + r3]
362    lea     r0,  [r0 + r1]
363
364    movu    m1,  [r2]
365    movu    m2,  [r2 + 16]
366    movu    m3,  [r2 + 32]
367    movu    m4,  [r2 + 48]
368    psadbw  m1,  [r0]
369    psadbw  m2,  [r0 + 16]
370    psadbw  m3,  [r0 + 32]
371    psadbw  m4,  [r0 + 48]
372    paddd   m1,  m2
373    paddd   m3,  m4
374    paddd   m0,  m1
375    paddd   m0,  m3
376    lea     r2,  [r2 + r3]
377    lea     r0,  [r0 + r1]
378%endmacro
379
380%macro SAD_W16 0
381;-----------------------------------------------------------------------------
382; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
383;-----------------------------------------------------------------------------
384cglobal pixel_sad_16x16, 4,4,8
385    movu    m0, [r2]
386    movu    m1, [r2+r3]
387    lea     r2, [r2+2*r3]
388    movu    m2, [r2]
389    movu    m3, [r2+r3]
390    lea     r2, [r2+2*r3]
391    psadbw  m0, [r0]
392    psadbw  m1, [r0+r1]
393    lea     r0, [r0+2*r1]
394    movu    m4, [r2]
395    paddw   m0, m1
396    psadbw  m2, [r0]
397    psadbw  m3, [r0+r1]
398    lea     r0, [r0+2*r1]
399    movu    m5, [r2+r3]
400    lea     r2, [r2+2*r3]
401    paddw   m2, m3
402    movu    m6, [r2]
403    movu    m7, [r2+r3]
404    lea     r2, [r2+2*r3]
405    paddw   m0, m2
406    psadbw  m4, [r0]
407    psadbw  m5, [r0+r1]
408    lea     r0, [r0+2*r1]
409    movu    m1, [r2]
410    paddw   m4, m5
411    psadbw  m6, [r0]
412    psadbw  m7, [r0+r1]
413    lea     r0, [r0+2*r1]
414    movu    m2, [r2+r3]
415    lea     r2, [r2+2*r3]
416    paddw   m6, m7
417    movu    m3, [r2]
418    paddw   m0, m4
419    movu    m4, [r2+r3]
420    lea     r2, [r2+2*r3]
421    paddw   m0, m6
422    psadbw  m1, [r0]
423    psadbw  m2, [r0+r1]
424    lea     r0, [r0+2*r1]
425    movu    m5, [r2]
426    paddw   m1, m2
427    psadbw  m3, [r0]
428    psadbw  m4, [r0+r1]
429    lea     r0, [r0+2*r1]
430    movu    m6, [r2+r3]
431    lea     r2, [r2+2*r3]
432    paddw   m3, m4
433    movu    m7, [r2]
434    paddw   m0, m1
435    movu    m1, [r2+r3]
436    paddw   m0, m3
437    psadbw  m5, [r0]
438    psadbw  m6, [r0+r1]
439    lea     r0, [r0+2*r1]
440    paddw   m5, m6
441    psadbw  m7, [r0]
442    psadbw  m1, [r0+r1]
443    paddw   m7, m1
444    paddw   m0, m5
445    paddw   m0, m7
446    SAD_END_SSE2
447
448;-----------------------------------------------------------------------------
449; int pixel_sad_16x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
450;-----------------------------------------------------------------------------
451cglobal pixel_sad_16x8, 4,4
452    movu    m0, [r2]
453    movu    m2, [r2+r3]
454    lea     r2, [r2+2*r3]
455    movu    m3, [r2]
456    movu    m4, [r2+r3]
457    psadbw  m0, [r0]
458    psadbw  m2, [r0+r1]
459    lea     r0, [r0+2*r1]
460    psadbw  m3, [r0]
461    psadbw  m4, [r0+r1]
462    lea     r0, [r0+2*r1]
463    lea     r2, [r2+2*r3]
464    paddw   m0, m2
465    paddw   m3, m4
466    paddw   m0, m3
467    movu    m1, [r2]
468    movu    m2, [r2+r3]
469    lea     r2, [r2+2*r3]
470    movu    m3, [r2]
471    movu    m4, [r2+r3]
472    psadbw  m1, [r0]
473    psadbw  m2, [r0+r1]
474    lea     r0, [r0+2*r1]
475    psadbw  m3, [r0]
476    psadbw  m4, [r0+r1]
477    lea     r0, [r0+2*r1]
478    lea     r2, [r2+2*r3]
479    paddw   m1, m2
480    paddw   m3, m4
481    paddw   m0, m1
482    paddw   m0, m3
483    SAD_END_SSE2
484
485;-----------------------------------------------------------------------------
486; int pixel_sad_16x12( uint8_t *, intptr_t, uint8_t *, intptr_t )
487;-----------------------------------------------------------------------------
488cglobal pixel_sad_16x12, 4,4,3
489    pxor m0, m0
490
491    PROCESS_SAD_16x4
492    PROCESS_SAD_16x4
493    PROCESS_SAD_16x4
494
495    movhlps m1, m0
496    paddd   m0, m1
497    movd    eax, m0
498    RET
499
500;-----------------------------------------------------------------------------
501; int pixel_sad_16x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
502;-----------------------------------------------------------------------------
503cglobal pixel_sad_16x32, 4,5,3
504    pxor m0,  m0
505    mov  r4d, 4
506.loop:
507    PROCESS_SAD_16x4
508    PROCESS_SAD_16x4
509    dec  r4d
510    jnz .loop
511
512    movhlps m1, m0
513    paddd   m0, m1
514    movd    eax, m0
515    RET
516
517;-----------------------------------------------------------------------------
518; int pixel_sad_16x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
519;-----------------------------------------------------------------------------
520cglobal pixel_sad_16x64, 4,5,3
521    pxor m0,  m0
522    mov  r4d, 8
523.loop:
524    PROCESS_SAD_16x4
525    PROCESS_SAD_16x4
526    dec  r4d
527    jnz .loop
528
529    movhlps m1, m0
530    paddd   m0, m1
531    movd    eax, m0
532    RET
533
534;-----------------------------------------------------------------------------
535; int pixel_sad_16x4( uint8_t *, intptr_t, uint8_t *, intptr_t )
536;-----------------------------------------------------------------------------
537cglobal pixel_sad_16x4, 4,4,3
538
539    movu    m0,  [r2]
540    movu    m1,  [r2 + r3]
541    psadbw  m0,  [r0]
542    psadbw  m1,  [r0 + r1]
543    paddd   m0,  m1
544    lea     r2,  [r2 + 2 * r3]
545    lea     r0,  [r0 + 2 * r1]
546    movu    m1,  [r2]
547    movu    m2,  [r2 + r3]
548    psadbw  m1,  [r0]
549    psadbw  m2,  [r0 + r1]
550    paddd   m1,  m2
551    paddd   m0,  m1
552
553    movhlps m1,  m0
554    paddd   m0,  m1
555    movd    eax, m0
556    RET
557
558;-----------------------------------------------------------------------------
559; int pixel_sad_32x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
560;-----------------------------------------------------------------------------
561cglobal pixel_sad_32x8, 4,4,3
562    pxor  m0,  m0
563
564    PROCESS_SAD_32x4
565    PROCESS_SAD_32x4
566
567    movhlps m1,  m0
568    paddd   m0,  m1
569    movd    eax, m0
570    RET
571
572;-----------------------------------------------------------------------------
573; int pixel_sad_32x24( uint8_t *, intptr_t, uint8_t *, intptr_t )
574;-----------------------------------------------------------------------------
575cglobal pixel_sad_32x24, 4,5,3
576    pxor  m0,  m0
577    mov   r4d, 3
578.loop:
579    PROCESS_SAD_32x4
580    PROCESS_SAD_32x4
581    dec r4d
582    jnz .loop
583
584    movhlps m1,  m0
585    paddd   m0,  m1
586    movd    eax, m0
587    RET
588
589;-----------------------------------------------------------------------------
590; int pixel_sad_32x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
591;-----------------------------------------------------------------------------
592cglobal pixel_sad_32x32, 4,5,3
593    pxor  m0,  m0
594    mov   r4d, 4
595.loop:
596    PROCESS_SAD_32x4
597    PROCESS_SAD_32x4
598    dec r4d
599    jnz .loop
600
601    movhlps m1,  m0
602    paddd   m0,  m1
603    movd    eax, m0
604    RET
605
606;-----------------------------------------------------------------------------
607; int pixel_sad_32x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
608;-----------------------------------------------------------------------------
609cglobal pixel_sad_32x16, 4,4,3
610    pxor  m0,  m0
611
612    PROCESS_SAD_32x4
613    PROCESS_SAD_32x4
614    PROCESS_SAD_32x4
615    PROCESS_SAD_32x4
616
617    movhlps m1,  m0
618    paddd   m0,  m1
619    movd    eax, m0
620    RET
621
622;-----------------------------------------------------------------------------
623; int pixel_sad_32x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
624;-----------------------------------------------------------------------------
625cglobal pixel_sad_32x64, 4,5,3
626    pxor  m0,  m0
627    mov   r4d, 8
628.loop:
629    PROCESS_SAD_32x4
630    PROCESS_SAD_32x4
631    dec  r4d
632    jnz .loop
633
634    movhlps m1,  m0
635    paddd   m0,  m1
636    movd    eax, m0
637    RET
638
639;-----------------------------------------------------------------------------
640; int pixel_sad_8x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
641;-----------------------------------------------------------------------------
642cglobal pixel_sad_8x32, 4,5,3
643    pxor  m0,  m0
644    mov   r4d, 4
645.loop:
646    PROCESS_SAD_8x4
647    PROCESS_SAD_8x4
648    dec  r4d
649    jnz .loop
650
651    movhlps m1,  m0
652    paddd   m0,  m1
653    movd    eax, m0
654    RET
655
656;-----------------------------------------------------------------------------
657; int pixel_sad_64x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
658;-----------------------------------------------------------------------------
659cglobal pixel_sad_64x16, 4,4,5
660    pxor  m0,  m0
661
662    PROCESS_SAD_64x4
663    PROCESS_SAD_64x4
664    PROCESS_SAD_64x4
665    PROCESS_SAD_64x4
666
667    movhlps m1,  m0
668    paddd   m0,  m1
669    movd    eax, m0
670    RET
671
672;-----------------------------------------------------------------------------
673; int pixel_sad_64x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
674;-----------------------------------------------------------------------------
675cglobal pixel_sad_64x32, 4,5,5
676    pxor  m0,  m0
677    mov   r4,  4
678
679.loop:
680    PROCESS_SAD_64x4
681    PROCESS_SAD_64x4
682
683    dec   r4
684    jnz   .loop
685
686    movhlps m1,  m0
687    paddd   m0,  m1
688    movd    eax, m0
689    RET
690
691;-----------------------------------------------------------------------------
692; int pixel_sad_64x48( uint8_t *, intptr_t, uint8_t *, intptr_t )
693;-----------------------------------------------------------------------------
694cglobal pixel_sad_64x48, 4,5,5
695    pxor  m0,  m0
696    mov   r4,  6
697
698.loop:
699    PROCESS_SAD_64x4
700    PROCESS_SAD_64x4
701    dec     r4d
702    jnz     .loop
703
704    movhlps m1,  m0
705    paddd   m0,  m1
706    movd    eax, m0
707    RET
708
709;-----------------------------------------------------------------------------
710; int pixel_sad_64x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
711;-----------------------------------------------------------------------------
712cglobal pixel_sad_64x64, 4,5,5
713    pxor  m0,  m0
714    mov   r4,  8
715
716.loop:
717    PROCESS_SAD_64x4
718    PROCESS_SAD_64x4
719    dec   r4
720    jnz   .loop
721
722    movhlps m1,  m0
723    paddd   m0,  m1
724    movd    eax, m0
725    RET
726
727;-----------------------------------------------------------------------------
728; int pixel_sad_48x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
729;-----------------------------------------------------------------------------
730cglobal pixel_sad_48x64, 4,5,5
731    pxor  m0,  m0
732    mov   r4,  64
733
734.loop:
735    PROCESS_SAD_48x4
736    lea     r2,  [r2 + r3]
737    lea     r0,  [r0 + r1]
738
739    PROCESS_SAD_48x4
740    lea     r2,  [r2 + r3]
741    lea     r0,  [r0 + r1]
742
743    sub   r4,  8
744    cmp   r4,  8
745
746jnz .loop
747    PROCESS_SAD_48x4
748    lea   r2,  [r2 + r3]
749    lea   r0,  [r0 + r1]
750    PROCESS_SAD_48x4
751
752    movhlps m1,  m0
753    paddd   m0,  m1
754    movd    eax, m0
755    RET
756
757;-----------------------------------------------------------------------------
758; int pixel_sad_24x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
759;-----------------------------------------------------------------------------
760cglobal pixel_sad_24x32, 4,5,4
761    pxor  m0,  m0
762    mov   r4,  32
763
764.loop:
765    PROCESS_SAD_24x4
766    lea         r2,  [r2 + r3]
767    lea         r0,  [r0 + r1]
768    PROCESS_SAD_24x4
769    lea         r2,  [r2 + r3]
770    lea         r0,  [r0 + r1]
771    sub   r4,  8
772    cmp   r4,  8
773jnz .loop
774    PROCESS_SAD_24x4
775    lea         r2,  [r2 + r3]
776    lea         r0,  [r0 + r1]
777    PROCESS_SAD_24x4
778
779    movhlps m1,  m0
780    paddd   m0,  m1
781    movd    eax, m0
782    RET
783
784;-----------------------------------------------------------------------------
785; int pixel_sad_12x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
786;-----------------------------------------------------------------------------
787cglobal pixel_sad_12x16, 4,4,4
788    mova  m4,  [MSK]
789    pxor  m0,  m0
790
791    PROCESS_SAD_12x4
792    lea         r2,  [r2 + r3]
793    lea         r0,  [r0 + r1]
794    PROCESS_SAD_12x4
795    lea         r2,  [r2 + r3]
796    lea         r0,  [r0 + r1]
797    PROCESS_SAD_12x4
798    lea         r2,  [r2 + r3]
799    lea         r0,  [r0 + r1]
800    PROCESS_SAD_12x4
801
802    movhlps m1,  m0
803    paddd   m0,  m1
804    movd    eax, m0
805    RET
806
807%endmacro
808
809INIT_XMM sse2
810SAD_W16
811INIT_XMM sse3
812SAD_W16
813INIT_XMM sse2, aligned
814SAD_W16
815
816%macro SAD_INC_4x8P_SSE 1
817    movq    m1, [r0]
818    movq    m2, [r0+r1]
819    lea     r0, [r0+2*r1]
820    movq    m3, [r2]
821    movq    m4, [r2+r3]
822    lea     r2, [r2+2*r3]
823    movhps  m1, [r0]
824    movhps  m2, [r0+r1]
825    movhps  m3, [r2]
826    movhps  m4, [r2+r3]
827    lea     r0, [r0+2*r1]
828    psadbw  m1, m3
829    psadbw  m2, m4
830    lea     r2, [r2+2*r3]
831    ACCUM paddw, 0, 1, %1
832    paddw   m0, m2
833%endmacro
834
835INIT_XMM
836;Even on Nehalem, no sizes other than 8x16 benefit from this method.
837cglobal pixel_sad_8x16_sse2, 4,4
838    SAD_INC_4x8P_SSE 0
839    SAD_INC_4x8P_SSE 1
840    SAD_INC_4x8P_SSE 1
841    SAD_INC_4x8P_SSE 1
842    SAD_END_SSE2
843    RET
844
845;=============================================================================
846; SAD x3/x4 MMX
847;=============================================================================
848
849%macro SAD_X3_START_1x8P 0
850    movq    mm3,    [r0]
851    movq    mm0,    [r1]
852    movq    mm1,    [r2]
853    movq    mm2,    [r3]
854    psadbw  mm0,    mm3
855    psadbw  mm1,    mm3
856    psadbw  mm2,    mm3
857%endmacro
858
859%macro SAD_X3_1x8P 2
860    movq    mm3,    [r0+%1]
861    movq    mm4,    [r1+%2]
862    movq    mm5,    [r2+%2]
863    movq    mm6,    [r3+%2]
864    psadbw  mm4,    mm3
865    psadbw  mm5,    mm3
866    psadbw  mm6,    mm3
867    paddw   mm0,    mm4
868    paddw   mm1,    mm5
869    paddw   mm2,    mm6
870%endmacro
871
872%macro SAD_X3_START_2x4P 3
873    movd      mm3,  [r0]
874    movd      %1,   [r1]
875    movd      %2,   [r2]
876    movd      %3,   [r3]
877    punpckldq mm3,  [r0+FENC_STRIDE]
878    punpckldq %1,   [r1+r4]
879    punpckldq %2,   [r2+r4]
880    punpckldq %3,   [r3+r4]
881    psadbw    %1,   mm3
882    psadbw    %2,   mm3
883    psadbw    %3,   mm3
884%endmacro
885
886%macro SAD_X3_2x16P 1
887%if %1
888    SAD_X3_START_1x8P
889%else
890    SAD_X3_1x8P 0, 0
891%endif
892    SAD_X3_1x8P 8, 8
893    SAD_X3_1x8P FENC_STRIDE, r4
894    SAD_X3_1x8P FENC_STRIDE+8, r4+8
895    add     r0, 2*FENC_STRIDE
896    lea     r1, [r1+2*r4]
897    lea     r2, [r2+2*r4]
898    lea     r3, [r3+2*r4]
899%endmacro
900
901%macro SAD_X3_2x8P 1
902%if %1
903    SAD_X3_START_1x8P
904%else
905    SAD_X3_1x8P 0, 0
906%endif
907    SAD_X3_1x8P FENC_STRIDE, r4
908    add     r0, 2*FENC_STRIDE
909    lea     r1, [r1+2*r4]
910    lea     r2, [r2+2*r4]
911    lea     r3, [r3+2*r4]
912%endmacro
913
914%macro SAD_X3_2x4P 1
915%if %1
916    SAD_X3_START_2x4P mm0, mm1, mm2
917%else
918    SAD_X3_START_2x4P mm4, mm5, mm6
919    paddw     mm0,  mm4
920    paddw     mm1,  mm5
921    paddw     mm2,  mm6
922%endif
923    add     r0, 2*FENC_STRIDE
924    lea     r1, [r1+2*r4]
925    lea     r2, [r2+2*r4]
926    lea     r3, [r3+2*r4]
927%endmacro
928
929%macro SAD_X4_START_1x8P 0
930    movq    mm7,    [r0]
931    movq    mm0,    [r1]
932    movq    mm1,    [r2]
933    movq    mm2,    [r3]
934    movq    mm3,    [r4]
935    psadbw  mm0,    mm7
936    psadbw  mm1,    mm7
937    psadbw  mm2,    mm7
938    psadbw  mm3,    mm7
939%endmacro
940
941%macro SAD_X4_1x8P 2
942    movq    mm7,    [r0+%1]
943    movq    mm4,    [r1+%2]
944    movq    mm5,    [r2+%2]
945    movq    mm6,    [r3+%2]
946    psadbw  mm4,    mm7
947    psadbw  mm5,    mm7
948    psadbw  mm6,    mm7
949    psadbw  mm7,    [r4+%2]
950    paddw   mm0,    mm4
951    paddw   mm1,    mm5
952    paddw   mm2,    mm6
953    paddw   mm3,    mm7
954%endmacro
955
956%macro SAD_X4_START_2x4P 0
957    movd      mm7,  [r0]
958    movd      mm0,  [r1]
959    movd      mm1,  [r2]
960    movd      mm2,  [r3]
961    movd      mm3,  [r4]
962    punpckldq mm7,  [r0+FENC_STRIDE]
963    punpckldq mm0,  [r1+r5]
964    punpckldq mm1,  [r2+r5]
965    punpckldq mm2,  [r3+r5]
966    punpckldq mm3,  [r4+r5]
967    psadbw    mm0,  mm7
968    psadbw    mm1,  mm7
969    psadbw    mm2,  mm7
970    psadbw    mm3,  mm7
971%endmacro
972
973%macro SAD_X4_INC_2x4P 0
974    movd      mm7,  [r0]
975    movd      mm4,  [r1]
976    movd      mm5,  [r2]
977    punpckldq mm7,  [r0+FENC_STRIDE]
978    punpckldq mm4,  [r1+r5]
979    punpckldq mm5,  [r2+r5]
980    psadbw    mm4,  mm7
981    psadbw    mm5,  mm7
982    paddw     mm0,  mm4
983    paddw     mm1,  mm5
984    movd      mm4,  [r3]
985    movd      mm5,  [r4]
986    punpckldq mm4,  [r3+r5]
987    punpckldq mm5,  [r4+r5]
988    psadbw    mm4,  mm7
989    psadbw    mm5,  mm7
990    paddw     mm2,  mm4
991    paddw     mm3,  mm5
992%endmacro
993
994%macro SAD_X4_2x16P 1
995%if %1
996    SAD_X4_START_1x8P
997%else
998    SAD_X4_1x8P 0, 0
999%endif
1000    SAD_X4_1x8P 8, 8
1001    SAD_X4_1x8P FENC_STRIDE, r5
1002    SAD_X4_1x8P FENC_STRIDE+8, r5+8
1003    add     r0, 2*FENC_STRIDE
1004    lea     r1, [r1+2*r5]
1005    lea     r2, [r2+2*r5]
1006    lea     r3, [r3+2*r5]
1007    lea     r4, [r4+2*r5]
1008%endmacro
1009
1010%macro SAD_X4_2x8P 1
1011%if %1
1012    SAD_X4_START_1x8P
1013%else
1014    SAD_X4_1x8P 0, 0
1015%endif
1016    SAD_X4_1x8P FENC_STRIDE, r5
1017    add     r0, 2*FENC_STRIDE
1018    lea     r1, [r1+2*r5]
1019    lea     r2, [r2+2*r5]
1020    lea     r3, [r3+2*r5]
1021    lea     r4, [r4+2*r5]
1022%endmacro
1023
1024%macro SAD_X4_2x4P 1
1025%if %1
1026    SAD_X4_START_2x4P
1027%else
1028    SAD_X4_INC_2x4P
1029%endif
1030    add     r0, 2*FENC_STRIDE
1031    lea     r1, [r1+2*r5]
1032    lea     r2, [r2+2*r5]
1033    lea     r3, [r3+2*r5]
1034    lea     r4, [r4+2*r5]
1035%endmacro
1036
1037%macro SAD_X3_END 0
1038%if UNIX64
1039    movd    [r5+0], mm0
1040    movd    [r5+4], mm1
1041    movd    [r5+8], mm2
1042%else
1043    mov     r0, r5mp
1044    movd    [r0+0], mm0
1045    movd    [r0+4], mm1
1046    movd    [r0+8], mm2
1047%endif
1048    RET
1049%endmacro
1050
1051%macro SAD_X4_END 0
1052    mov     r0, r6mp
1053    movd    [r0+0], mm0
1054    movd    [r0+4], mm1
1055    movd    [r0+8], mm2
1056    movd    [r0+12], mm3
1057    RET
1058%endmacro
1059
1060%macro SAD_X3_12x4 0
1061    mova    m3,  [r0]
1062    movu    m5,  [r1]
1063    pand    m3,  m4
1064    pand    m5,  m4
1065    psadbw  m5,  m3
1066    paddd   m0,  m5
1067    movu    m5,  [r2]
1068    pand    m5,  m4
1069    psadbw  m5,  m3
1070    paddd   m1,  m5
1071    movu    m5,  [r3]
1072    pand    m5,  m4
1073    psadbw  m5,  m3
1074    paddd   m2,  m5
1075    mova    m3,  [r0 + FENC_STRIDE]
1076    movu    m5,  [r1 + r4]
1077    pand    m3,  m4
1078    pand    m5,  m4
1079    psadbw  m5,  m3
1080    paddd   m0,  m5
1081    movu    m5,  [r2 + r4]
1082    pand    m5,  m4
1083    psadbw  m5,  m3
1084    paddd   m1,  m5
1085    movu    m5,  [r3 + r4]
1086    pand    m5,  m4
1087    psadbw  m5,  m3
1088    paddd   m2,  m5
1089    mova    m3,  [r0 + FENC_STRIDE * 2]
1090    movu    m5,  [r1 + r4 * 2]
1091    pand    m3,  m4
1092    pand    m5,  m4
1093    psadbw  m5,  m3
1094    paddd   m0,  m5
1095    movu    m5,  [r2 + r4 * 2]
1096    pand    m5,  m4
1097    psadbw  m5,  m3
1098    paddd   m1,  m5
1099    movu    m5,  [r3 + r4 * 2]
1100    pand    m5,  m4
1101    psadbw  m5,  m3
1102    paddd   m2,  m5
1103    lea     r1, [r1 + r4 * 2]
1104    lea     r2, [r2 + r4 * 2]
1105    lea     r3, [r3 + r4 * 2]
1106    mova    m3,  [r0 + FENC_STRIDE + FENC_STRIDE * 2]
1107    movu    m5,  [r1 + r4]
1108    pand    m3,  m4
1109    pand    m5,  m4
1110    psadbw  m5,  m3
1111    paddd   m0,  m5
1112    movu    m5,  [r2 + r4]
1113    pand    m5,  m4
1114    psadbw  m5,  m3
1115    paddd   m1,  m5
1116    movu    m5,  [r3 + r4]
1117    pand    m5,  m4
1118    psadbw  m5,  m3
1119    paddd   m2,  m5
1120    lea     r0,  [r0 + FENC_STRIDE * 4]
1121    lea     r1,  [r1 + r4 * 2]
1122    lea     r2,  [r2 + r4 * 2]
1123    lea     r3,  [r3 + r4 * 2]
1124%endmacro
1125
1126%macro SAD_X4_12x4 0
1127    mova    m4,  [r0]
1128    movu    m5,  [r1]
1129    pand    m4,  m6
1130    pand    m5,  m6
1131    psadbw  m5,  m4
1132    paddd   m0,  m5
1133    movu    m5,  [r2]
1134    pand    m5,  m6
1135    psadbw  m5,  m4
1136    paddd   m1,  m5
1137    movu    m5,  [r3]
1138    pand    m5,  m6
1139    psadbw  m5,  m4
1140    paddd   m2,  m5
1141    movu    m5,  [r4]
1142    pand    m5,  m6
1143    psadbw  m5,  m4
1144    paddd   m3,  m5
1145    mova    m4,  [r0 + FENC_STRIDE]
1146    movu    m5,  [r1 + r5]
1147    pand    m4,  m6
1148    pand    m5,  m6
1149    psadbw  m5,  m4
1150    paddd   m0,  m5
1151    movu    m5,  [r2 + r5]
1152    pand    m5,  m6
1153    psadbw  m5,  m4
1154    paddd   m1,  m5
1155    movu    m5,  [r3 + r5]
1156    pand    m5,  m6
1157    psadbw  m5,  m4
1158    paddd   m2,  m5
1159    movu    m5,  [r4 + r5]
1160    pand    m5,  m6
1161    psadbw  m5,  m4
1162    paddd   m3,  m5
1163    mova    m4,  [r0 + FENC_STRIDE * 2]
1164    movu    m5,  [r1 + r5 * 2]
1165    pand    m4,  m6
1166    pand    m5,  m6
1167    psadbw  m5,  m4
1168    paddd   m0,  m5
1169    movu    m5,  [r2 + r5 * 2]
1170    pand    m5,  m6
1171    psadbw  m5,  m4
1172    paddd   m1,  m5
1173    movu    m5,  [r3 + r5 * 2]
1174    pand    m5,  m6
1175    psadbw  m5,  m4
1176    paddd   m2,  m5
1177    movu    m5,  [r4 + r5 * 2]
1178    pand    m5,  m6
1179    psadbw  m5,  m4
1180    paddd   m3,  m5
1181    lea     r1, [r1 + r5 * 2]
1182    lea     r2, [r2 + r5 * 2]
1183    lea     r3, [r3 + r5 * 2]
1184    lea     r4, [r4 + r5 * 2]
1185    mova    m4,  [r0 + FENC_STRIDE + FENC_STRIDE * 2]
1186    movu    m5,  [r1 + r5]
1187    pand    m4,  m6
1188    pand    m5,  m6
1189    psadbw  m5,  m4
1190    paddd   m0,  m5
1191    movu    m5,  [r2 + r5]
1192    pand    m5,  m6
1193    psadbw  m5,  m4
1194    paddd   m1,  m5
1195    movu    m5,  [r3 + r5]
1196    pand    m5,  m6
1197    psadbw  m5,  m4
1198    paddd   m2,  m5
1199    movu    m5,  [r4 + r5]
1200    pand    m5,  m6
1201    psadbw  m5,  m4
1202    paddd   m3,  m5
1203    lea     r0,  [r0 + FENC_STRIDE * 4]
1204    lea     r1,  [r1 + r5 * 2]
1205    lea     r2,  [r2 + r5 * 2]
1206    lea     r3,  [r3 + r5 * 2]
1207    lea     r4,  [r4 + r5 * 2]
1208%endmacro
1209
1210%macro SAD_X3_24x4 0
1211    mova    m3,  [r0]
1212    mova    m4,  [r0 + 16]
1213    movu    m5,  [r1]
1214    movu    m6,  [r1 + 16]
1215    psadbw  m5,  m3
1216    psadbw  m6,  m4
1217    pshufd  m6,  m6, 84
1218    paddd   m5,  m6
1219    paddd   m0,  m5
1220    movu    m5,  [r2]
1221    movu    m6,  [r2 + 16]
1222    psadbw  m5,  m3
1223    psadbw  m6,  m4
1224    pshufd  m6,  m6, 84
1225    paddd   m5,  m6
1226    paddd   m1,  m5
1227    movu    m5,  [r3]
1228    movu    m6,  [r3 + 16]
1229    psadbw  m5,  m3
1230    psadbw  m6,  m4
1231    pshufd  m6,  m6, 84
1232    paddd   m5,  m6
1233    paddd   m2,  m5
1234
1235    mova    m3,  [r0 + FENC_STRIDE]
1236    mova    m4,  [r0 + 16 + FENC_STRIDE]
1237    movu    m5,  [r1 + r4]
1238    movu    m6,  [r1 + 16 + r4]
1239    psadbw  m5,  m3
1240    psadbw  m6,  m4
1241    pshufd  m6,  m6, 84
1242    paddd   m5,  m6
1243    paddd   m0,  m5
1244    movu    m5,  [r2 + r4]
1245    movu    m6,  [r2 + 16 + r4]
1246    psadbw  m5,  m3
1247    psadbw  m6,  m4
1248    pshufd  m6,  m6, 84
1249    paddd   m5,  m6
1250    paddd   m1,  m5
1251    movu    m5,  [r3 + r4]
1252    movu    m6,  [r3 + 16 + r4]
1253    psadbw  m5,  m3
1254    psadbw  m6,  m4
1255    pshufd  m6,  m6, 84
1256    paddd   m5,  m6
1257    paddd   m2,  m5
1258
1259    mova    m3,  [r0 + FENC_STRIDE * 2]
1260    mova    m4,  [r0 + 16 + FENC_STRIDE * 2]
1261    movu    m5,  [r1 + r4 * 2]
1262    movu    m6,  [r1 + 16 + r4 * 2]
1263    psadbw  m5,  m3
1264    psadbw  m6,  m4
1265    pshufd  m6,  m6, 84
1266    paddd   m5,  m6
1267    paddd   m0,  m5
1268    movu    m5,  [r2 + r4 * 2]
1269    movu    m6,  [r2 + 16 + r4 * 2]
1270    psadbw  m5,  m3
1271    psadbw  m6,  m4
1272    pshufd  m6,  m6, 84
1273    paddd   m5,  m6
1274    paddd   m1,  m5
1275    movu    m5,  [r3 + r4 * 2]
1276    movu    m6,  [r3 + 16 + r4 * 2]
1277    psadbw  m5,  m3
1278    psadbw  m6,  m4
1279    pshufd  m6,  m6, 84
1280    paddd   m5,  m6
1281    paddd   m2,  m5
1282    lea     r0,  [r0 + FENC_STRIDE * 2]
1283    lea     r1,  [r1 + r4 * 2]
1284    lea     r2,  [r2 + r4 * 2]
1285    lea     r3,  [r3 + r4 * 2]
1286
1287    mova    m3,  [r0 + FENC_STRIDE]
1288    mova    m4,  [r0 + 16 + FENC_STRIDE]
1289    movu    m5,  [r1 + r4]
1290    movu    m6,  [r1 + 16 + r4]
1291    psadbw  m5,  m3
1292    psadbw  m6,  m4
1293    pshufd  m6,  m6, 84
1294    paddd   m5,  m6
1295    paddd   m0,  m5
1296    movu    m5,  [r2 + r4]
1297    movu    m6,  [r2 + 16 + r4]
1298    psadbw  m5,  m3
1299    psadbw  m6,  m4
1300    pshufd  m6,  m6, 84
1301    paddd   m5,  m6
1302    paddd   m1,  m5
1303    movu    m5,  [r3 + r4]
1304    movu    m6,  [r3 + 16 + r4]
1305    psadbw  m5,  m3
1306    psadbw  m6,  m4
1307    pshufd  m6,  m6, 84
1308    paddd   m5,  m6
1309    paddd   m2,  m5
1310    lea     r0,  [r0 + FENC_STRIDE * 2]
1311    lea     r1,  [r1 + r4 * 2]
1312    lea     r2,  [r2 + r4 * 2]
1313    lea     r3,  [r3 + r4 * 2]
1314%endmacro
1315
1316%macro SAD_X4_24x4 0
1317    mova    m4,  [r0]
1318    mova    m5,  [r0 + 16]
1319    movu    m6,  [r1]
1320    movu    m7,  [r1 + 16]
1321    psadbw  m6,  m4
1322    psadbw  m7,  m5
1323    pshufd  m7,  m7, 84
1324    paddd   m6,  m7
1325    paddd   m0,  m6
1326    movu    m6,  [r2]
1327    movu    m7,  [r2 + 16]
1328    psadbw  m6,  m4
1329    psadbw  m7,  m5
1330    pshufd  m7,  m7, 84
1331    paddd   m6,  m7
1332    paddd   m1,  m6
1333    movu    m6,  [r3]
1334    movu    m7,  [r3 + 16]
1335    psadbw  m6,  m4
1336    psadbw  m7,  m5
1337    pshufd  m7,  m7, 84
1338    paddd   m6,  m7
1339    paddd   m2,  m6
1340    movu    m6,  [r4]
1341    movu    m7,  [r4 + 16]
1342    psadbw  m6,  m4
1343    psadbw  m7,  m5
1344    pshufd  m7,  m7, 84
1345    paddd   m6,  m7
1346    paddd   m3,  m6
1347
1348    mova    m4,  [r0 + FENC_STRIDE]
1349    mova    m5,  [r0 + 16 + FENC_STRIDE]
1350    movu    m6,  [r1 + r5]
1351    movu    m7,  [r1 + 16 + r5]
1352    psadbw  m6,  m4
1353    psadbw  m7,  m5
1354    pshufd  m7,  m7, 84
1355    paddd   m6,  m7
1356    paddd   m0,  m6
1357    movu    m6,  [r2 + r5]
1358    movu    m7,  [r2 + 16 + r5]
1359    psadbw  m6,  m4
1360    psadbw  m7,  m5
1361    pshufd  m7,  m7, 84
1362    paddd   m6,  m7
1363    paddd   m1,  m6
1364    movu    m6,  [r3 + r5]
1365    movu    m7,  [r3 + 16 + r5]
1366    psadbw  m6,  m4
1367    psadbw  m7,  m5
1368    pshufd  m7,  m7, 84
1369    paddd   m6,  m7
1370    paddd   m2,  m6
1371    movu    m6,  [r4 + r5]
1372    movu    m7,  [r4 + 16 + r5]
1373    psadbw  m6,  m4
1374    psadbw  m7,  m5
1375    pshufd  m7,  m7, 84
1376    paddd   m6,  m7
1377    paddd   m3,  m6
1378
1379    mova    m4,  [r0 + FENC_STRIDE * 2]
1380    mova    m5,  [r0 + 16 + FENC_STRIDE * 2]
1381    movu    m6,  [r1 + r5 * 2]
1382    movu    m7,  [r1 + 16 + r5 * 2]
1383    psadbw  m6,  m4
1384    psadbw  m7,  m5
1385    pshufd  m7,  m7, 84
1386    paddd   m6,  m7
1387    paddd   m0,  m6
1388    movu    m6,  [r2 + r5 * 2]
1389    movu    m7,  [r2 + 16 + r5 * 2]
1390    psadbw  m6,  m4
1391    psadbw  m7,  m5
1392    pshufd  m7,  m7, 84
1393    paddd   m6,  m7
1394    paddd   m1,  m6
1395    movu    m6,  [r3 + r5 * 2]
1396    movu    m7,  [r3 + 16 + r5 * 2]
1397    psadbw  m6,  m4
1398    psadbw  m7,  m5
1399    pshufd  m7,  m7, 84
1400    paddd   m6,  m7
1401    paddd   m2,  m6
1402    movu    m6,  [r4 + r5 * 2]
1403    movu    m7,  [r4 + 16 + r5 * 2]
1404    psadbw  m6,  m4
1405    psadbw  m7,  m5
1406    pshufd  m7,  m7, 84
1407    paddd   m6,  m7
1408    paddd   m3,  m6
1409    lea     r0,  [r0 + FENC_STRIDE * 2]
1410    lea     r1,  [r1 + r5 * 2]
1411    lea     r2,  [r2 + r5 * 2]
1412    lea     r3,  [r3 + r5 * 2]
1413    lea     r4,  [r4 + r5 * 2]
1414    mova    m4,  [r0 + FENC_STRIDE]
1415    mova    m5,  [r0 + 16 + FENC_STRIDE]
1416    movu    m6,  [r1 + r5]
1417    movu    m7,  [r1 + 16 + r5]
1418    psadbw  m6,  m4
1419    psadbw  m7,  m5
1420    pshufd  m7,  m7, 84
1421    paddd   m6,  m7
1422    paddd   m0,  m6
1423    movu    m6,  [r2 + r5]
1424    movu    m7,  [r2 + 16 + r5]
1425    psadbw  m6,  m4
1426    psadbw  m7,  m5
1427    pshufd  m7,  m7, 84
1428    paddd   m6,  m7
1429    paddd   m1,  m6
1430    movu    m6,  [r3 + r5]
1431    movu    m7,  [r3 + 16 + r5]
1432    psadbw  m6,  m4
1433    psadbw  m7,  m5
1434    pshufd  m7,  m7, 84
1435    paddd   m6,  m7
1436    paddd   m2,  m6
1437    movu    m6,  [r4 + r5]
1438    movu    m7,  [r4 + 16 + r5]
1439    psadbw  m6,  m4
1440    psadbw  m7,  m5
1441    pshufd  m7,  m7, 84
1442    paddd   m6,  m7
1443    paddd   m3,  m6
1444    lea     r0,  [r0 + FENC_STRIDE * 2]
1445    lea     r1,  [r1 + r5 * 2]
1446    lea     r2,  [r2 + r5 * 2]
1447    lea     r3,  [r3 + r5 * 2]
1448    lea     r4,  [r4 + r5 * 2]
1449%endmacro
1450
1451%macro SAD_X3_32x4 0
1452    mova    m3,  [r0]
1453    mova    m4,  [r0 + 16]
1454    movu    m5,  [r1]
1455    movu    m6,  [r1 + 16]
1456    psadbw  m5,  m3
1457    psadbw  m6,  m4
1458    paddd   m5,  m6
1459    paddd   m0,  m5
1460    movu    m5,  [r2]
1461    movu    m6,  [r2 + 16]
1462    psadbw  m5,  m3
1463    psadbw  m6,  m4
1464    paddd   m5,  m6
1465    paddd   m1,  m5
1466    movu    m5,  [r3]
1467    movu    m6,  [r3 + 16]
1468    psadbw  m5,  m3
1469    psadbw  m6,  m4
1470    paddd   m5,  m6
1471    paddd   m2,  m5
1472    lea     r0,  [r0 + FENC_STRIDE]
1473    lea     r1,  [r1 + r4]
1474    lea     r2,  [r2 + r4]
1475    lea     r3,  [r3 + r4]
1476    mova    m3,  [r0]
1477    mova    m4,  [r0 + 16]
1478    movu    m5,  [r1]
1479    movu    m6,  [r1 + 16]
1480    psadbw  m5,  m3
1481    psadbw  m6,  m4
1482    paddd   m5,  m6
1483    paddd   m0,  m5
1484    movu    m5,  [r2]
1485    movu    m6,  [r2 + 16]
1486    psadbw  m5,  m3
1487    psadbw  m6,  m4
1488    paddd   m5,  m6
1489    paddd   m1,  m5
1490    movu    m5,  [r3]
1491    movu    m6,  [r3 + 16]
1492    psadbw  m5,  m3
1493    psadbw  m6,  m4
1494    paddd   m5,  m6
1495    paddd   m2,  m5
1496    lea     r0,  [r0 + FENC_STRIDE]
1497    lea     r1,  [r1 + r4]
1498    lea     r2,  [r2 + r4]
1499    lea     r3,  [r3 + r4]
1500    mova    m3,  [r0]
1501    mova    m4,  [r0 + 16]
1502    movu    m5,  [r1]
1503    movu    m6,  [r1 + 16]
1504    psadbw  m5,  m3
1505    psadbw  m6,  m4
1506    paddd   m5,  m6
1507    paddd   m0,  m5
1508    movu    m5,  [r2]
1509    movu    m6,  [r2 + 16]
1510    psadbw  m5,  m3
1511    psadbw  m6,  m4
1512    paddd   m5,  m6
1513    paddd   m1,  m5
1514    movu    m5,  [r3]
1515    movu    m6,  [r3 + 16]
1516    psadbw  m5,  m3
1517    psadbw  m6,  m4
1518    paddd   m5,  m6
1519    paddd   m2,  m5
1520    lea     r0,  [r0 + FENC_STRIDE]
1521    lea     r1,  [r1 + r4]
1522    lea     r2,  [r2 + r4]
1523    lea     r3,  [r3 + r4]
1524    mova    m3,  [r0]
1525    mova    m4,  [r0 + 16]
1526    movu    m5,  [r1]
1527    movu    m6,  [r1 + 16]
1528    psadbw  m5,  m3
1529    psadbw  m6,  m4
1530    paddd   m5,  m6
1531    paddd   m0,  m5
1532    movu    m5,  [r2]
1533    movu    m6,  [r2 + 16]
1534    psadbw  m5,  m3
1535    psadbw  m6,  m4
1536    paddd   m5,  m6
1537    paddd   m1,  m5
1538    movu    m5,  [r3]
1539    movu    m6,  [r3 + 16]
1540    psadbw  m5,  m3
1541    psadbw  m6,  m4
1542    paddd   m5,  m6
1543    paddd   m2,  m5
1544    lea     r0,  [r0 + FENC_STRIDE]
1545    lea     r1,  [r1 + r4]
1546    lea     r2,  [r2 + r4]
1547    lea     r3,  [r3 + r4]
1548%endmacro
1549
1550%macro SAD_X4_32x4 0
1551    mova    m4,  [r0]
1552    mova    m5,  [r0 + 16]
1553    movu    m6,  [r1]
1554    movu    m7,  [r1 + 16]
1555    psadbw  m6,  m4
1556    psadbw  m7,  m5
1557    paddd   m6,  m7
1558    paddd   m0,  m6
1559    movu    m6,  [r2]
1560    movu    m7,  [r2 + 16]
1561    psadbw  m6,  m4
1562    psadbw  m7,  m5
1563    paddd   m6,  m7
1564    paddd   m1,  m6
1565    movu    m6,  [r3]
1566    movu    m7,  [r3 + 16]
1567    psadbw  m6,  m4
1568    psadbw  m7,  m5
1569    paddd   m6,  m7
1570    paddd   m2,  m6
1571    movu    m6,  [r4]
1572    movu    m7,  [r4 + 16]
1573    psadbw  m6,  m4
1574    psadbw  m7,  m5
1575    paddd   m6,  m7
1576    paddd   m3,  m6
1577    lea     r0,  [r0 + FENC_STRIDE]
1578    lea     r1,  [r1 + r5]
1579    lea     r2,  [r2 + r5]
1580    lea     r3,  [r3 + r5]
1581    lea     r4,  [r4 + r5]
1582    mova    m4,  [r0]
1583    mova    m5,  [r0 + 16]
1584    movu    m6,  [r1]
1585    movu    m7,  [r1 + 16]
1586    psadbw  m6,  m4
1587    psadbw  m7,  m5
1588    paddd   m6,  m7
1589    paddd   m0,  m6
1590    movu    m6,  [r2]
1591    movu    m7,  [r2 + 16]
1592    psadbw  m6,  m4
1593    psadbw  m7,  m5
1594    paddd   m6,  m7
1595    paddd   m1,  m6
1596    movu    m6,  [r3]
1597    movu    m7,  [r3 + 16]
1598    psadbw  m6,  m4
1599    psadbw  m7,  m5
1600    paddd   m6,  m7
1601    paddd   m2,  m6
1602    movu    m6,  [r4]
1603    movu    m7,  [r4 + 16]
1604    psadbw  m6,  m4
1605    psadbw  m7,  m5
1606    paddd   m6,  m7
1607    paddd   m3,  m6
1608    lea     r0,  [r0 + FENC_STRIDE]
1609    lea     r1,  [r1 + r5]
1610    lea     r2,  [r2 + r5]
1611    lea     r3,  [r3 + r5]
1612    lea     r4,  [r4 + r5]
1613    mova    m4,  [r0]
1614    mova    m5,  [r0 + 16]
1615    movu    m6,  [r1]
1616    movu    m7,  [r1 + 16]
1617    psadbw  m6,  m4
1618    psadbw  m7,  m5
1619    paddd   m6,  m7
1620    paddd   m0,  m6
1621    movu    m6,  [r2]
1622    movu    m7,  [r2 + 16]
1623    psadbw  m6,  m4
1624    psadbw  m7,  m5
1625    paddd   m6,  m7
1626    paddd   m1,  m6
1627    movu    m6,  [r3]
1628    movu    m7,  [r3 + 16]
1629    psadbw  m6,  m4
1630    psadbw  m7,  m5
1631    paddd   m6,  m7
1632    paddd   m2,  m6
1633    movu    m6,  [r4]
1634    movu    m7,  [r4 + 16]
1635    psadbw  m6,  m4
1636    psadbw  m7,  m5
1637    paddd   m6,  m7
1638    paddd   m3,  m6
1639    lea     r0,  [r0 + FENC_STRIDE]
1640    lea     r1,  [r1 + r5]
1641    lea     r2,  [r2 + r5]
1642    lea     r3,  [r3 + r5]
1643    lea     r4,  [r4 + r5]
1644    mova    m4,  [r0]
1645    mova    m5,  [r0 + 16]
1646    movu    m6,  [r1]
1647    movu    m7,  [r1 + 16]
1648    psadbw  m6,  m4
1649    psadbw  m7,  m5
1650    paddd   m6,  m7
1651    paddd   m0,  m6
1652    movu    m6,  [r2]
1653    movu    m7,  [r2 + 16]
1654    psadbw  m6,  m4
1655    psadbw  m7,  m5
1656    paddd   m6,  m7
1657    paddd   m1,  m6
1658    movu    m6,  [r3]
1659    movu    m7,  [r3 + 16]
1660    psadbw  m6,  m4
1661    psadbw  m7,  m5
1662    paddd   m6,  m7
1663    paddd   m2,  m6
1664    movu    m6,  [r4]
1665    movu    m7,  [r4 + 16]
1666    psadbw  m6,  m4
1667    psadbw  m7,  m5
1668    paddd   m6,  m7
1669    paddd   m3,  m6
1670    lea     r0,  [r0 + FENC_STRIDE]
1671    lea     r1,  [r1 + r5]
1672    lea     r2,  [r2 + r5]
1673    lea     r3,  [r3 + r5]
1674    lea     r4,  [r4 + r5]
1675%endmacro
1676
1677%macro SAD_X3_48x4 0
1678    mova    m3,  [r0]
1679    mova    m4,  [r0 + 16]
1680    mova    m5,  [r0 + 32]
1681    movu    m6,  [r1]
1682    psadbw  m6,  m3
1683    paddd   m0,  m6
1684    movu    m6,  [r1 + 16]
1685    psadbw  m6,  m4
1686    paddd   m0,  m6
1687    movu    m6,  [r1 + 32]
1688    psadbw  m6,  m5
1689    paddd   m0,  m6
1690    movu    m6,  [r2]
1691    psadbw  m6,  m3
1692    paddd   m1,  m6
1693    movu    m6,  [r2 + 16]
1694    psadbw  m6,  m4
1695    paddd   m1,  m6
1696    movu    m6,  [r2 + 32]
1697    psadbw  m6,  m5
1698    paddd   m1,  m6
1699    movu    m6,  [r3]
1700    psadbw  m6,  m3
1701    paddd   m2,  m6
1702    movu    m6,  [r3 + 16]
1703    psadbw  m6,  m4
1704    paddd   m2,  m6
1705    movu    m6,  [r3 + 32]
1706    psadbw  m6,  m5
1707    paddd   m2,  m6
1708
1709    mova    m3,  [r0 + FENC_STRIDE]
1710    mova    m4,  [r0 + 16 + FENC_STRIDE]
1711    mova    m5,  [r0 + 32 + FENC_STRIDE]
1712    movu    m6,  [r1 + r4]
1713    psadbw  m6,  m3
1714    paddd   m0,  m6
1715    movu    m6,  [r1 + 16 + r4]
1716    psadbw  m6,  m4
1717    paddd   m0,  m6
1718    movu    m6,  [r1 + 32 + r4]
1719    psadbw  m6,  m5
1720    paddd   m0,  m6
1721    movu    m6,  [r2 + r4]
1722    psadbw  m6,  m3
1723    paddd   m1,  m6
1724    movu    m6,  [r2 + 16 + r4]
1725    psadbw  m6,  m4
1726    paddd   m1,  m6
1727    movu    m6,  [r2 + 32 + r4]
1728    psadbw  m6,  m5
1729    paddd   m1,  m6
1730    movu    m6,  [r3 + r4]
1731    psadbw  m6,  m3
1732    paddd   m2,  m6
1733    movu    m6,  [r3 + 16 + r4]
1734    psadbw  m6,  m4
1735    paddd   m2,  m6
1736    movu    m6,  [r3 + 32 + r4]
1737    psadbw  m6,  m5
1738    paddd   m2,  m6
1739
1740    mova    m3,  [r0 + FENC_STRIDE * 2]
1741    mova    m4,  [r0 + 16 + FENC_STRIDE * 2]
1742    mova    m5,  [r0 + 32 + FENC_STRIDE * 2]
1743    movu    m6,  [r1 + r4 * 2]
1744    psadbw  m6,  m3
1745    paddd   m0,  m6
1746    movu    m6,  [r1 + 16 + r4 * 2]
1747    psadbw  m6,  m4
1748    paddd   m0,  m6
1749    movu    m6,  [r1 + 32 + r4 * 2]
1750    psadbw  m6,  m5
1751    paddd   m0,  m6
1752    movu    m6,  [r2 + r4 * 2]
1753    psadbw  m6,  m3
1754    paddd   m1,  m6
1755    movu    m6,  [r2 + 16 + r4 * 2]
1756    psadbw  m6,  m4
1757    paddd   m1,  m6
1758    movu    m6,  [r2 + 32 + r4 * 2]
1759    psadbw  m6,  m5
1760    paddd   m1,  m6
1761    movu    m6,  [r3 + r4 * 2]
1762    psadbw  m6,  m3
1763    paddd   m2,  m6
1764    movu    m6,  [r3 + 16 + r4 * 2]
1765    psadbw  m6,  m4
1766    paddd   m2,  m6
1767    movu    m6,  [r3 + 32 + r4 * 2]
1768    psadbw  m6,  m5
1769    paddd   m2,  m6
1770
1771    lea     r0,  [r0 + FENC_STRIDE * 2]
1772    lea     r1,  [r1 + r4 * 2]
1773    lea     r2,  [r2 + r4 * 2]
1774    lea     r3,  [r3 + r4 * 2]
1775    mova    m3,  [r0 + FENC_STRIDE]
1776    mova    m4,  [r0 + 16 + FENC_STRIDE]
1777    mova    m5,  [r0 + 32 + FENC_STRIDE]
1778    movu    m6,  [r1 + r4]
1779    psadbw  m6,  m3
1780    paddd   m0,  m6
1781    movu    m6,  [r1 + 16 + r4]
1782    psadbw  m6,  m4
1783    paddd   m0,  m6
1784    movu    m6,  [r1 + 32 + r4]
1785    psadbw  m6,  m5
1786    paddd   m0,  m6
1787    movu    m6,  [r2 + r4]
1788    psadbw  m6,  m3
1789    paddd   m1,  m6
1790    movu    m6,  [r2 + 16 + r4]
1791    psadbw  m6,  m4
1792    paddd   m1,  m6
1793    movu    m6,  [r2 + 32 + r4]
1794    psadbw  m6,  m5
1795    paddd   m1,  m6
1796    movu    m6,  [r3 + r4]
1797    psadbw  m6,  m3
1798    paddd   m2,  m6
1799    movu    m6,  [r3 + 16 + r4]
1800    psadbw  m6,  m4
1801    paddd   m2,  m6
1802    movu    m6,  [r3 + 32 + r4]
1803    psadbw  m6,  m5
1804    paddd   m2,  m6
1805    lea     r0,  [r0 + FENC_STRIDE * 2]
1806    lea     r1,  [r1 + r4 * 2]
1807    lea     r2,  [r2 + r4 * 2]
1808    lea     r3,  [r3 + r4 * 2]
1809%endmacro
1810
1811%macro SAD_X4_48x4 0
1812    mova    m4,  [r0]
1813    mova    m5,  [r0 + 16]
1814    mova    m6,  [r0 + 32]
1815    movu    m7,  [r1]
1816    psadbw  m7,  m4
1817    paddd   m0,  m7
1818    movu    m7,  [r1 + 16]
1819    psadbw  m7,  m5
1820    paddd   m0,  m7
1821    movu    m7,  [r1 + 32]
1822    psadbw  m7,  m6
1823    paddd   m0,  m7
1824    movu    m7,  [r2]
1825    psadbw  m7,  m4
1826    paddd   m1,  m7
1827    movu    m7,  [r2 + 16]
1828    psadbw  m7,  m5
1829    paddd   m1,  m7
1830    movu    m7,  [r2 + 32]
1831    psadbw  m7,  m6
1832    paddd   m1,  m7
1833    movu    m7,  [r3]
1834    psadbw  m7,  m4
1835    paddd   m2,  m7
1836    movu    m7,  [r3 + 16]
1837    psadbw  m7,  m5
1838    paddd   m2,  m7
1839    movu    m7,  [r3 + 32]
1840    psadbw  m7,  m6
1841    paddd   m2,  m7
1842    movu    m7,  [r4]
1843    psadbw  m7,  m4
1844    paddd   m3,  m7
1845    movu    m7,  [r4 + 16]
1846    psadbw  m7,  m5
1847    paddd   m3,  m7
1848    movu    m7,  [r4 + 32]
1849    psadbw  m7,  m6
1850    paddd   m3,  m7
1851
1852    mova    m4,  [r0 + FENC_STRIDE]
1853    mova    m5,  [r0 + 16 + FENC_STRIDE]
1854    mova    m6,  [r0 + 32 + FENC_STRIDE]
1855    movu    m7,  [r1 + r5]
1856    psadbw  m7,  m4
1857    paddd   m0,  m7
1858    movu    m7,  [r1 + 16 + r5]
1859    psadbw  m7,  m5
1860    paddd   m0,  m7
1861    movu    m7,  [r1 + 32 + r5]
1862    psadbw  m7,  m6
1863    paddd   m0,  m7
1864    movu    m7,  [r2 + r5]
1865    psadbw  m7,  m4
1866    paddd   m1,  m7
1867    movu    m7,  [r2 + 16 + r5]
1868    psadbw  m7,  m5
1869    paddd   m1,  m7
1870    movu    m7,  [r2 + 32 + r5]
1871    psadbw  m7,  m6
1872    paddd   m1,  m7
1873    movu    m7,  [r3 + r5]
1874    psadbw  m7,  m4
1875    paddd   m2,  m7
1876    movu    m7,  [r3 + 16 + r5]
1877    psadbw  m7,  m5
1878    paddd   m2,  m7
1879    movu    m7,  [r3 + 32 + r5]
1880    psadbw  m7,  m6
1881    paddd   m2,  m7
1882    movu    m7,  [r4 + r5]
1883    psadbw  m7,  m4
1884    paddd   m3,  m7
1885    movu    m7,  [r4 + 16 + r5]
1886    psadbw  m7,  m5
1887    paddd   m3,  m7
1888    movu    m7,  [r4 + 32 + r5]
1889    psadbw  m7,  m6
1890    paddd   m3,  m7
1891
1892    mova    m4,  [r0 + FENC_STRIDE * 2]
1893    mova    m5,  [r0 + 16 + FENC_STRIDE * 2]
1894    mova    m6,  [r0 + 32 + FENC_STRIDE * 2]
1895    movu    m7,  [r1 + r5 * 2]
1896    psadbw  m7,  m4
1897    paddd   m0,  m7
1898    movu    m7,  [r1 + 16 + r5 * 2]
1899    psadbw  m7,  m5
1900    paddd   m0,  m7
1901    movu    m7,  [r1 + 32 + r5 * 2]
1902    psadbw  m7,  m6
1903    paddd   m0,  m7
1904    movu    m7,  [r2 + r5 * 2]
1905    psadbw  m7,  m4
1906    paddd   m1,  m7
1907    movu    m7,  [r2 + 16 + r5 * 2]
1908    psadbw  m7,  m5
1909    paddd   m1,  m7
1910    movu    m7,  [r2 + 32 + r5 * 2]
1911    psadbw  m7,  m6
1912    paddd   m1,  m7
1913    movu    m7,  [r3 + r5 * 2]
1914    psadbw  m7,  m4
1915    paddd   m2,  m7
1916    movu    m7,  [r3 + 16 + r5 * 2]
1917    psadbw  m7,  m5
1918    paddd   m2,  m7
1919    movu    m7,  [r3 + 32 + r5 * 2]
1920    psadbw  m7,  m6
1921    paddd   m2,  m7
1922    movu    m7,  [r4 + r5 * 2]
1923    psadbw  m7,  m4
1924    paddd   m3,  m7
1925    movu    m7,  [r4 + 16 + r5 * 2]
1926    psadbw  m7,  m5
1927    paddd   m3,  m7
1928    movu    m7,  [r4 + 32 + r5 * 2]
1929    psadbw  m7,  m6
1930    paddd   m3,  m7
1931
1932    lea     r0,  [r0 + FENC_STRIDE * 2]
1933    lea     r1,  [r1 + r5 * 2]
1934    lea     r2,  [r2 + r5 * 2]
1935    lea     r3,  [r3 + r5 * 2]
1936    lea     r4,  [r4 + r5 * 2]
1937    mova    m4,  [r0 + FENC_STRIDE]
1938    mova    m5,  [r0 + 16 + FENC_STRIDE]
1939    mova    m6,  [r0 + 32 + FENC_STRIDE]
1940    movu    m7,  [r1 + r5]
1941    psadbw  m7,  m4
1942    paddd   m0,  m7
1943    movu    m7,  [r1 + 16 + r5]
1944    psadbw  m7,  m5
1945    paddd   m0,  m7
1946    movu    m7,  [r1 + 32 + r5]
1947    psadbw  m7,  m6
1948    paddd   m0,  m7
1949    movu    m7,  [r2 + r5]
1950    psadbw  m7,  m4
1951    paddd   m1,  m7
1952    movu    m7,  [r2 + 16 + r5]
1953    psadbw  m7,  m5
1954    paddd   m1,  m7
1955    movu    m7,  [r2 + 32 + r5]
1956    psadbw  m7,  m6
1957    paddd   m1,  m7
1958    movu    m7,  [r3 + r5]
1959    psadbw  m7,  m4
1960    paddd   m2,  m7
1961    movu    m7,  [r3 + 16 + r5]
1962    psadbw  m7,  m5
1963    paddd   m2,  m7
1964    movu    m7,  [r3 + 32 + r5]
1965    psadbw  m7,  m6
1966    paddd   m2,  m7
1967    movu    m7,  [r4 + r5]
1968    psadbw  m7,  m4
1969    paddd   m3,  m7
1970    movu    m7,  [r4 + 16 + r5]
1971    psadbw  m7,  m5
1972    paddd   m3,  m7
1973    movu    m7,  [r4 + 32 + r5]
1974    psadbw  m7,  m6
1975    paddd   m3,  m7
1976    lea     r0,  [r0 + FENC_STRIDE * 2]
1977    lea     r1,  [r1 + r5 * 2]
1978    lea     r2,  [r2 + r5 * 2]
1979    lea     r3,  [r3 + r5 * 2]
1980    lea     r4,  [r4 + r5 * 2]
1981%endmacro
1982
1983%macro SAD_X3_64x4 0
1984    mova    m3,  [r0]
1985    mova    m4,  [r0 + 16]
1986    movu    m5,  [r1]
1987    psadbw  m5,  m3
1988    paddd   m0,  m5
1989    movu    m5,  [r1 + 16]
1990    psadbw  m5,  m4
1991    paddd   m0,  m5
1992    movu    m5,  [r2]
1993    psadbw  m5,  m3
1994    paddd   m1,  m5
1995    movu    m5,  [r2 + 16]
1996    psadbw  m5,  m4
1997    paddd   m1,  m5
1998    movu    m5,  [r3]
1999    psadbw  m5,  m3
2000    paddd   m2,  m5
2001    movu    m5,  [r3 + 16]
2002    psadbw  m5,  m4
2003    paddd   m2,  m5
2004    mova    m3,  [r0 + 32]
2005    mova    m4,  [r0 + 48]
2006    movu    m5,  [r1 + 32]
2007    psadbw  m5,  m3
2008    paddd   m0,  m5
2009    movu    m5,  [r1 + 48]
2010    psadbw  m5,  m4
2011    paddd   m0,  m5
2012    movu    m5,  [r2 + 32]
2013    psadbw  m5,  m3
2014    paddd   m1,  m5
2015    movu    m5,  [r2 + 48]
2016    psadbw  m5,  m4
2017    paddd   m1,  m5
2018    movu    m5,  [r3 + 32]
2019    psadbw  m5,  m3
2020    paddd   m2,  m5
2021    movu    m5,  [r3 + 48]
2022    psadbw  m5,  m4
2023    paddd   m2,  m5
2024
2025    mova    m3,  [r0 + FENC_STRIDE]
2026    mova    m4,  [r0 + 16 + FENC_STRIDE]
2027    movu    m5,  [r1 + r4]
2028    psadbw  m5,  m3
2029    paddd   m0,  m5
2030    movu    m5,  [r1 + 16 + r4]
2031    psadbw  m5,  m4
2032    paddd   m0,  m5
2033    movu    m5,  [r2 + r4]
2034    psadbw  m5,  m3
2035    paddd   m1,  m5
2036    movu    m5,  [r2 + 16 + r4]
2037    psadbw  m5,  m4
2038    paddd   m1,  m5
2039    movu    m5,  [r3 + r4]
2040    psadbw  m5,  m3
2041    paddd   m2,  m5
2042    movu    m5,  [r3 + 16 + r4]
2043    psadbw  m5,  m4
2044    paddd   m2,  m5
2045    mova    m3,  [r0 + 32 + FENC_STRIDE]
2046    mova    m4,  [r0 + 48 + FENC_STRIDE]
2047    movu    m5,  [r1 + 32 + r4]
2048    psadbw  m5,  m3
2049    paddd   m0,  m5
2050    movu    m5,  [r1 + 48 + r4]
2051    psadbw  m5,  m4
2052    paddd   m0,  m5
2053    movu    m5,  [r2 + 32 + r4]
2054    psadbw  m5,  m3
2055    paddd   m1,  m5
2056    movu    m5,  [r2 + 48 + r4]
2057    psadbw  m5,  m4
2058    paddd   m1,  m5
2059    movu    m5,  [r3 + 32 + r4]
2060    psadbw  m5,  m3
2061    paddd   m2,  m5
2062    movu    m5,  [r3 + 48 + r4]
2063    psadbw  m5,  m4
2064    paddd   m2,  m5
2065
2066    mova    m3,  [r0 + FENC_STRIDE * 2]
2067    mova    m4,  [r0 + 16 + FENC_STRIDE * 2]
2068    movu    m5,  [r1 + r4 * 2]
2069    psadbw  m5,  m3
2070    paddd   m0,  m5
2071    movu    m5,  [r1 + 16 + r4 * 2]
2072    psadbw  m5,  m4
2073    paddd   m0,  m5
2074    movu    m5,  [r2 + r4 * 2]
2075    psadbw  m5,  m3
2076    paddd   m1,  m5
2077    movu    m5,  [r2 + 16 + r4 * 2]
2078    psadbw  m5,  m4
2079    paddd   m1,  m5
2080    movu    m5,  [r3 + r4 * 2]
2081    psadbw  m5,  m3
2082    paddd   m2,  m5
2083    movu    m5,  [r3 + 16 + r4 * 2]
2084    psadbw  m5,  m4
2085    paddd   m2,  m5
2086    mova    m3,  [r0 + 32 + FENC_STRIDE * 2]
2087    mova    m4,  [r0 + 48 + FENC_STRIDE * 2]
2088    movu    m5,  [r1 + 32 + r4 * 2]
2089    psadbw  m5,  m3
2090    paddd   m0,  m5
2091    movu    m5,  [r1 + 48 + r4 * 2]
2092    psadbw  m5,  m4
2093    paddd   m0,  m5
2094    movu    m5,  [r2 + 32 + r4 * 2]
2095    psadbw  m5,  m3
2096    paddd   m1,  m5
2097    movu    m5,  [r2 + 48 + r4 * 2]
2098    psadbw  m5,  m4
2099    paddd   m1,  m5
2100    movu    m5,  [r3 + 32 + r4 * 2]
2101    psadbw  m5,  m3
2102    paddd   m2,  m5
2103    movu    m5,  [r3 + 48 + r4 * 2]
2104    psadbw  m5,  m4
2105    paddd   m2,  m5
2106
2107    lea     r0,  [r0 + FENC_STRIDE * 2]
2108    lea     r1,  [r1 + r4 * 2]
2109    lea     r2,  [r2 + r4 * 2]
2110    lea     r3,  [r3 + r4 * 2]
2111    mova    m3,  [r0 + FENC_STRIDE]
2112    mova    m4,  [r0 + 16 + FENC_STRIDE]
2113    movu    m5,  [r1 + r4]
2114    psadbw  m5,  m3
2115    paddd   m0,  m5
2116    movu    m5,  [r1 + 16 + r4]
2117    psadbw  m5,  m4
2118    paddd   m0,  m5
2119    movu    m5,  [r2 + r4]
2120    psadbw  m5,  m3
2121    paddd   m1,  m5
2122    movu    m5,  [r2 + 16 + r4]
2123    psadbw  m5,  m4
2124    paddd   m1,  m5
2125    movu    m5,  [r3 + r4]
2126    psadbw  m5,  m3
2127    paddd   m2,  m5
2128    movu    m5,  [r3 + 16 + r4]
2129    psadbw  m5,  m4
2130    paddd   m2,  m5
2131    mova    m3,  [r0 + 32 + FENC_STRIDE]
2132    mova    m4,  [r0 + 48 + FENC_STRIDE]
2133    movu    m5,  [r1 + 32 + r4]
2134    psadbw  m5,  m3
2135    paddd   m0,  m5
2136    movu    m5,  [r1 + 48 + r4]
2137    psadbw  m5,  m4
2138    paddd   m0,  m5
2139    movu    m5,  [r2 + 32 + r4]
2140    psadbw  m5,  m3
2141    paddd   m1,  m5
2142    movu    m5,  [r2 + 48 + r4]
2143    psadbw  m5,  m4
2144    paddd   m1,  m5
2145    movu    m5,  [r3 + 32 + r4]
2146    psadbw  m5,  m3
2147    paddd   m2,  m5
2148    movu    m5,  [r3 + 48 + r4]
2149    psadbw  m5,  m4
2150    paddd   m2,  m5
2151    lea     r0,  [r0 + FENC_STRIDE * 2]
2152    lea     r1,  [r1 + r4 * 2]
2153    lea     r2,  [r2 + r4 * 2]
2154    lea     r3,  [r3 + r4 * 2]
2155%endmacro
2156
2157%macro SAD_X4_64x4 0
2158    mova    m4,  [r0]
2159    mova    m5,  [r0 + 16]
2160    movu    m6,  [r1]
2161    psadbw  m6,  m4
2162    paddd   m0,  m6
2163    movu    m6,  [r1 + 16]
2164    psadbw  m6,  m5
2165    paddd   m0,  m6
2166    movu    m6,  [r2]
2167    psadbw  m6,  m4
2168    paddd   m1,  m6
2169    movu    m6,  [r2 + 16]
2170    psadbw  m6,  m5
2171    paddd   m1,  m6
2172    movu    m6,  [r3]
2173    psadbw  m6,  m4
2174    paddd   m2,  m6
2175    movu    m6,  [r3 + 16]
2176    psadbw  m6,  m5
2177    paddd   m2,  m6
2178    movu    m6,  [r4]
2179    psadbw  m6,  m4
2180    paddd   m3,  m6
2181    movu    m6,  [r4 + 16]
2182    psadbw  m6,  m5
2183    paddd   m3,  m6
2184    mova    m4,  [r0 + 32]
2185    mova    m5,  [r0 + 48]
2186    movu    m6,  [r1 + 32]
2187    psadbw  m6,  m4
2188    paddd   m0,  m6
2189    movu    m6,  [r1 + 48]
2190    psadbw  m6,  m5
2191    paddd   m0,  m6
2192    movu    m6,  [r2 + 32]
2193    psadbw  m6,  m4
2194    paddd   m1,  m6
2195    movu    m6,  [r2 + 48]
2196    psadbw  m6,  m5
2197    paddd   m1,  m6
2198    movu    m6,  [r3 + 32]
2199    psadbw  m6,  m4
2200    paddd   m2,  m6
2201    movu    m6,  [r3 + 48]
2202    psadbw  m6,  m5
2203    paddd   m2,  m6
2204    movu    m6,  [r4 + 32]
2205    psadbw  m6,  m4
2206    paddd   m3,  m6
2207    movu    m6,  [r4 + 48]
2208    psadbw  m6,  m5
2209    paddd   m3,  m6
2210
2211    mova    m4,  [r0 + FENC_STRIDE]
2212    mova    m5,  [r0 + 16 + FENC_STRIDE]
2213    movu    m6,  [r1 + r5]
2214    psadbw  m6,  m4
2215    paddd   m0,  m6
2216    movu    m6,  [r1 + 16 + r5]
2217    psadbw  m6,  m5
2218    paddd   m0,  m6
2219    movu    m6,  [r2 + r5]
2220    psadbw  m6,  m4
2221    paddd   m1,  m6
2222    movu    m6,  [r2 + 16 + r5]
2223    psadbw  m6,  m5
2224    paddd   m1,  m6
2225    movu    m6,  [r3 + r5]
2226    psadbw  m6,  m4
2227    paddd   m2,  m6
2228    movu    m6,  [r3 + 16 + r5]
2229    psadbw  m6,  m5
2230    paddd   m2,  m6
2231    movu    m6,  [r4 + r5]
2232    psadbw  m6,  m4
2233    paddd   m3,  m6
2234    movu    m6,  [r4 + 16 + r5]
2235    psadbw  m6,  m5
2236    paddd   m3,  m6
2237    mova    m4,  [r0 + 32 + FENC_STRIDE]
2238    mova    m5,  [r0 + 48 + FENC_STRIDE]
2239    movu    m6,  [r1 + 32 + r5]
2240    psadbw  m6,  m4
2241    paddd   m0,  m6
2242    movu    m6,  [r1 + 48 + r5]
2243    psadbw  m6,  m5
2244    paddd   m0,  m6
2245    movu    m6,  [r2 + 32 + r5]
2246    psadbw  m6,  m4
2247    paddd   m1,  m6
2248    movu    m6,  [r2 + 48 + r5]
2249    psadbw  m6,  m5
2250    paddd   m1,  m6
2251    movu    m6,  [r3 + 32 + r5]
2252    psadbw  m6,  m4
2253    paddd   m2,  m6
2254    movu    m6,  [r3 + 48 + r5]
2255    psadbw  m6,  m5
2256    paddd   m2,  m6
2257    movu    m6,  [r4 + 32 + r5]
2258    psadbw  m6,  m4
2259    paddd   m3,  m6
2260    movu    m6,  [r4 + 48 + r5]
2261    psadbw  m6,  m5
2262    paddd   m3,  m6
2263
2264    mova    m4,  [r0 + FENC_STRIDE * 2]
2265    mova    m5,  [r0 + 16 + FENC_STRIDE * 2]
2266    movu    m6,  [r1 + r5 * 2]
2267    psadbw  m6,  m4
2268    paddd   m0,  m6
2269    movu    m6,  [r1 + 16 + r5 * 2]
2270    psadbw  m6,  m5
2271    paddd   m0,  m6
2272    movu    m6,  [r2 + r5 * 2]
2273    psadbw  m6,  m4
2274    paddd   m1,  m6
2275    movu    m6,  [r2 + 16 + r5 * 2]
2276    psadbw  m6,  m5
2277    paddd   m1,  m6
2278    movu    m6,  [r3 + r5 * 2]
2279    psadbw  m6,  m4
2280    paddd   m2,  m6
2281    movu    m6,  [r3 + 16 + r5 * 2]
2282    psadbw  m6,  m5
2283    paddd   m2,  m6
2284    movu    m6,  [r4 + r5 * 2]
2285    psadbw  m6,  m4
2286    paddd   m3,  m6
2287    movu    m6,  [r4 + 16 + r5 * 2]
2288    psadbw  m6,  m5
2289    paddd   m3,  m6
2290    mova    m4,  [r0 + 32 + FENC_STRIDE * 2]
2291    mova    m5,  [r0 + 48 + FENC_STRIDE * 2]
2292    movu    m6,  [r1 + 32 + r5 * 2]
2293    psadbw  m6,  m4
2294    paddd   m0,  m6
2295    movu    m6,  [r1 + 48 + r5 * 2]
2296    psadbw  m6,  m5
2297    paddd   m0,  m6
2298    movu    m6,  [r2 + 32 + r5 * 2]
2299    psadbw  m6,  m4
2300    paddd   m1,  m6
2301    movu    m6,  [r2 + 48 + r5 * 2]
2302    psadbw  m6,  m5
2303    paddd   m1,  m6
2304    movu    m6,  [r3 + 32 + r5 * 2]
2305    psadbw  m6,  m4
2306    paddd   m2,  m6
2307    movu    m6,  [r3 + 48 + r5 * 2]
2308    psadbw  m6,  m5
2309    paddd   m2,  m6
2310    movu    m6,  [r4 + 32 + r5 * 2]
2311    psadbw  m6,  m4
2312    paddd   m3,  m6
2313    movu    m6,  [r4 + 48 + r5 * 2]
2314    psadbw  m6,  m5
2315    paddd   m3,  m6
2316
2317    lea     r0,  [r0 + FENC_STRIDE * 2]
2318    lea     r1,  [r1 + r5 * 2]
2319    lea     r2,  [r2 + r5 * 2]
2320    lea     r3,  [r3 + r5 * 2]
2321    lea     r4,  [r4 + r5 * 2]
2322    mova    m4,  [r0 + FENC_STRIDE]
2323    mova    m5,  [r0 + 16 + FENC_STRIDE]
2324    movu    m6,  [r1 + r5]
2325    psadbw  m6,  m4
2326    paddd   m0,  m6
2327    movu    m6,  [r1 + 16 + r5]
2328    psadbw  m6,  m5
2329    paddd   m0,  m6
2330    movu    m6,  [r2 + r5]
2331    psadbw  m6,  m4
2332    paddd   m1,  m6
2333    movu    m6,  [r2 + 16 + r5]
2334    psadbw  m6,  m5
2335    paddd   m1,  m6
2336    movu    m6,  [r3 + r5]
2337    psadbw  m6,  m4
2338    paddd   m2,  m6
2339    movu    m6,  [r3 + 16 + r5]
2340    psadbw  m6,  m5
2341    paddd   m2,  m6
2342    movu    m6,  [r4 + r5]
2343    psadbw  m6,  m4
2344    paddd   m3,  m6
2345    movu    m6,  [r4 + 16 + r5]
2346    psadbw  m6,  m5
2347    paddd   m3,  m6
2348    mova    m4,  [r0 + 32 + FENC_STRIDE]
2349    mova    m5,  [r0 + 48 + FENC_STRIDE]
2350    movu    m6,  [r1 + 32 + r5]
2351    psadbw  m6,  m4
2352    paddd   m0,  m6
2353    movu    m6,  [r1 + 48 + r5]
2354    psadbw  m6,  m5
2355    paddd   m0,  m6
2356    movu    m6,  [r2 + 32 + r5]
2357    psadbw  m6,  m4
2358    paddd   m1,  m6
2359    movu    m6,  [r2 + 48 + r5]
2360    psadbw  m6,  m5
2361    paddd   m1,  m6
2362    movu    m6,  [r3 + 32 + r5]
2363    psadbw  m6,  m4
2364    paddd   m2,  m6
2365    movu    m6,  [r3 + 48 + r5]
2366    psadbw  m6,  m5
2367    paddd   m2,  m6
2368    movu    m6,  [r4 + 32 + r5]
2369    psadbw  m6,  m4
2370    paddd   m3,  m6
2371    movu    m6,  [r4 + 48 + r5]
2372    psadbw  m6,  m5
2373    paddd   m3,  m6
2374    lea     r0,  [r0 + FENC_STRIDE * 2]
2375    lea     r1,  [r1 + r5 * 2]
2376    lea     r2,  [r2 + r5 * 2]
2377    lea     r3,  [r3 + r5 * 2]
2378    lea     r4,  [r4 + r5 * 2]
2379%endmacro
2380
2381;-----------------------------------------------------------------------------
2382; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
2383;                          uint8_t *pix2, intptr_t i_stride, int scores[3] )
2384;-----------------------------------------------------------------------------
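; For reference, a rough C equivalent of what the x3 kernels compute is
; sketched below (illustrative only, not part of the build; the helper name
; and the explicit width/height parameters are hypothetical, abs() is
; <stdlib.h>'s, and FENC_STRIDE is the fixed encoder-block stride used
; throughout this file):
;
;   static void sad_x3_ref( const uint8_t *fenc, const uint8_t *pix0,
;                           const uint8_t *pix1, const uint8_t *pix2,
;                           intptr_t i_stride, int scores[3],
;                           int width, int height )
;   {
;       scores[0] = scores[1] = scores[2] = 0;
;       for( int y = 0; y < height; y++ )
;       {
;           for( int x = 0; x < width; x++ )
;           {
;               scores[0] += abs( fenc[x] - pix0[x] );
;               scores[1] += abs( fenc[x] - pix1[x] );
;               scores[2] += abs( fenc[x] - pix2[x] );
;           }
;           fenc += FENC_STRIDE;
;           pix0 += i_stride;
;           pix1 += i_stride;
;           pix2 += i_stride;
;       }
;   }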
2385%macro SAD_X 3
2386cglobal pixel_sad_x%1_%2x%3_mmx2, %1+2, %1+2
2387    SAD_X%1_2x%2P 1
2388%rep %3/2-1
2389    SAD_X%1_2x%2P 0
2390%endrep
2391    SAD_X%1_END
2392%endmacro
2393
2394INIT_MMX
2395SAD_X 3, 16, 16
2396SAD_X 3, 16,  8
2397SAD_X 3,  8, 16
2398SAD_X 3,  8,  8
2399SAD_X 3,  8,  4
2400SAD_X 3,  4, 16
2401SAD_X 3,  4,  8
2402SAD_X 3,  4,  4
2403SAD_X 4, 16, 16
2404SAD_X 4, 16,  8
2405SAD_X 4,  8, 16
2406SAD_X 4,  8,  8
2407SAD_X 4,  8,  4
2408SAD_X 4,  4, 16
2409SAD_X 4,  4,  8
2410SAD_X 4,  4,  4
2411
2412
2413
2414;=============================================================================
2415; SAD x3/x4 XMM
2416;=============================================================================
2417
2418%macro SAD_X3_START_1x16P_SSE2 0
2419    mova     m2, [r0]
2420%if cpuflag(avx)
2421    psadbw   m0, m2, [r1]
2422    psadbw   m1, m2, [r2]
2423    psadbw   m2, [r3]
2424%else
2425    movu     m0, [r1]
2426    movu     m1, [r2]
2427    movu     m3, [r3]
2428    psadbw   m0, m2
2429    psadbw   m1, m2
2430    psadbw   m2, m3
2431%endif
2432%endmacro
2433
2434%macro SAD_X3_1x16P_SSE2 2
2435    mova     m3, [r0+%1]
2436%if cpuflag(avx)
2437    psadbw   m4, m3, [r1+%2]
2438    psadbw   m5, m3, [r2+%2]
2439    psadbw   m3, [r3+%2]
2440%else
2441    movu     m4, [r1+%2]
2442    movu     m5, [r2+%2]
2443    movu     m6, [r3+%2]
2444    psadbw   m4, m3
2445    psadbw   m5, m3
2446    psadbw   m3, m6
2447%endif
2448    paddd    m0, m4
2449    paddd    m1, m5
2450    paddd    m2, m3
2451%endmacro
2452
2453%if ARCH_X86_64
2454    DECLARE_REG_TMP 6
2455%else
2456    DECLARE_REG_TMP 5
2457%endif
2458
2459%macro SAD_X3_4x16P_SSE2 2
2460%if %1==0
2461    lea  t0, [r4*3]
2462    SAD_X3_START_1x16P_SSE2
2463%else
2464    SAD_X3_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0
2465%endif
2466    SAD_X3_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r4*1
2467    SAD_X3_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2
2468    SAD_X3_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), t0
2469%if %1 != %2-1
2470%if (%1&1) != 0
2471    add  r0, 8*FENC_STRIDE
2472%endif
2473    lea  r1, [r1+4*r4]
2474    lea  r2, [r2+4*r4]
2475    lea  r3, [r3+4*r4]
2476%endif
2477%endmacro
2478
2479%macro SAD_X3_START_2x8P_SSE2 0
2480    movq     m3, [r0]
2481    movq     m0, [r1]
2482    movq     m1, [r2]
2483    movq     m2, [r3]
2484    movhps   m3, [r0+FENC_STRIDE]
2485    movhps   m0, [r1+r4]
2486    movhps   m1, [r2+r4]
2487    movhps   m2, [r3+r4]
2488    psadbw   m0, m3
2489    psadbw   m1, m3
2490    psadbw   m2, m3
2491%endmacro
2492
2493%macro SAD_X3_2x8P_SSE2 4
2494    movq     m6, [r0+%1]
2495    movq     m3, [r1+%2]
2496    movq     m4, [r2+%2]
2497    movq     m5, [r3+%2]
2498    movhps   m6, [r0+%3]
2499    movhps   m3, [r1+%4]
2500    movhps   m4, [r2+%4]
2501    movhps   m5, [r3+%4]
2502    psadbw   m3, m6
2503    psadbw   m4, m6
2504    psadbw   m5, m6
2505    paddd    m0, m3
2506    paddd    m1, m4
2507    paddd    m2, m5
2508%endmacro
2509
2510%macro SAD_X4_START_2x8P_SSE2 0
2511    movq     m4, [r0]
2512    movq     m0, [r1]
2513    movq     m1, [r2]
2514    movq     m2, [r3]
2515    movq     m3, [r4]
2516    movhps   m4, [r0+FENC_STRIDE]
2517    movhps   m0, [r1+r5]
2518    movhps   m1, [r2+r5]
2519    movhps   m2, [r3+r5]
2520    movhps   m3, [r4+r5]
2521    psadbw   m0, m4
2522    psadbw   m1, m4
2523    psadbw   m2, m4
2524    psadbw   m3, m4
2525%endmacro
2526
2527%macro SAD_X4_2x8P_SSE2 4
2528    movq     m6, [r0+%1]
2529    movq     m4, [r1+%2]
2530    movq     m5, [r2+%2]
2531    movhps   m6, [r0+%3]
2532    movhps   m4, [r1+%4]
2533    movhps   m5, [r2+%4]
2534    psadbw   m4, m6
2535    psadbw   m5, m6
2536    paddd    m0, m4
2537    paddd    m1, m5
2538    movq     m4, [r3+%2]
2539    movq     m5, [r4+%2]
2540    movhps   m4, [r3+%4]
2541    movhps   m5, [r4+%4]
2542    psadbw   m4, m6
2543    psadbw   m5, m6
2544    paddd    m2, m4
2545    paddd    m3, m5
2546%endmacro
2547
2548%macro SAD_X4_START_1x16P_SSE2 0
2549    mova     m3, [r0]
2550%if cpuflag(avx)
2551    psadbw   m0, m3, [r1]
2552    psadbw   m1, m3, [r2]
2553    psadbw   m2, m3, [r3]
2554    psadbw   m3, [r4]
2555%else
2556    movu     m0, [r1]
2557    movu     m1, [r2]
2558    movu     m2, [r3]
2559    movu     m4, [r4]
2560    psadbw   m0, m3
2561    psadbw   m1, m3
2562    psadbw   m2, m3
2563    psadbw   m3, m4
2564%endif
2565%endmacro
2566
2567%macro SAD_X4_1x16P_SSE2 2
2568    mova     m6, [r0+%1]
2569%if cpuflag(avx)
2570    psadbw   m4, m6, [r1+%2]
2571    psadbw   m5, m6, [r2+%2]
2572%else
2573    movu     m4, [r1+%2]
2574    movu     m5, [r2+%2]
2575    psadbw   m4, m6
2576    psadbw   m5, m6
2577%endif
2578    paddd    m0, m4
2579    paddd    m1, m5
2580%if cpuflag(avx)
2581    psadbw   m4, m6, [r3+%2]
2582    psadbw   m5, m6, [r4+%2]
2583%else
2584    movu     m4, [r3+%2]
2585    movu     m5, [r4+%2]
2586    psadbw   m4, m6
2587    psadbw   m5, m6
2588%endif
2589    paddd    m2, m4
2590    paddd    m3, m5
2591%endmacro
2592
2593%macro SAD_X4_4x16P_SSE2 2
2594%if %1==0
2595    lea  r6, [r5*3]
2596    SAD_X4_START_1x16P_SSE2
2597%else
2598    SAD_X4_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0
2599%endif
2600    SAD_X4_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r5*1
2601    SAD_X4_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r5*2
2602    SAD_X4_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), r6
2603%if %1 != %2-1
2604%if (%1&1) != 0
2605    add  r0, 8*FENC_STRIDE
2606%endif
2607    lea  r1, [r1+4*r5]
2608    lea  r2, [r2+4*r5]
2609    lea  r3, [r3+4*r5]
2610    lea  r4, [r4+4*r5]
2611%endif
2612%endmacro
2613
2614%macro SAD_X3_4x8P_SSE2 2
2615%if %1==0
2616    lea  t0, [r4*3]
2617    SAD_X3_START_2x8P_SSE2
2618%else
2619    SAD_X3_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0, FENC_STRIDE*(1+(%1&1)*4), r4*1
2620%endif
2621    SAD_X3_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2, FENC_STRIDE*(3+(%1&1)*4), t0
2622%if %1 != %2-1
2623%if (%1&1) != 0
2624    add  r0, 8*FENC_STRIDE
2625%endif
2626    lea  r1, [r1+4*r4]
2627    lea  r2, [r2+4*r4]
2628    lea  r3, [r3+4*r4]
2629%endif
2630%endmacro
2631
2632%macro SAD_X4_4x8P_SSE2 2
2633%if %1==0
2634    lea    r6, [r5*3]
2635    SAD_X4_START_2x8P_SSE2
2636%else
2637    SAD_X4_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1
2638%endif
2639    SAD_X4_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6
2640%if %1 != %2-1
2641%if (%1&1) != 0
2642    add  r0, 8*FENC_STRIDE
2643%endif
2644    lea  r1, [r1+4*r5]
2645    lea  r2, [r2+4*r5]
2646    lea  r3, [r3+4*r5]
2647    lea  r4, [r4+4*r5]
2648%endif
2649%endmacro
2650
2651%macro SAD_X3_END_SSE2 1
2652    movifnidn r5, r5mp
2653    movhlps    m3, m0
2654    movhlps    m4, m1
2655    movhlps    m5, m2
2656    paddd      m0, m3
2657    paddd      m1, m4
2658    paddd      m2, m5
2659    movd   [r5+0], m0
2660    movd   [r5+4], m1
2661    movd   [r5+8], m2
2662    RET
2663%endmacro
2664
2665%macro SAD_X4_END_SSE2 1
2666    mov      r0, r6mp
2667    psllq      m1, 32
2668    psllq      m3, 32
2669    paddd      m0, m1
2670    paddd      m2, m3
2671    movhlps    m1, m0
2672    movhlps    m3, m2
2673    paddd      m0, m1
2674    paddd      m2, m3
2675    movq   [r0+0], m0
2676    movq   [r0+8], m2
2677    RET
2678%endmacro
2679
2680%macro SAD_X3_START_2x16P_AVX2 0
2681    movu    m3, [r0] ; assumes FENC_STRIDE == 16
2682    movu   xm0, [r1]
2683    movu   xm1, [r2]
2684    movu   xm2, [r3]
2685    vinserti128  m0, m0, [r1+r4], 1
2686    vinserti128  m1, m1, [r2+r4], 1
2687    vinserti128  m2, m2, [r3+r4], 1
2688    psadbw  m0, m3
2689    psadbw  m1, m3
2690    psadbw  m2, m3
2691%endmacro
2692
2693%macro SAD_X3_2x16P_AVX2 3
2694    movu    m3, [r0+%1] ; assumes FENC_STRIDE == 16
2695    movu   xm4, [r1+%2]
2696    movu   xm5, [r2+%2]
2697    movu   xm6, [r3+%2]
2698    vinserti128  m4, m4, [r1+%3], 1
2699    vinserti128  m5, m5, [r2+%3], 1
2700    vinserti128  m6, m6, [r3+%3], 1
2701    psadbw  m4, m3
2702    psadbw  m5, m3
2703    psadbw  m6, m3
2704    paddw   m0, m4
2705    paddw   m1, m5
2706    paddw   m2, m6
2707%endmacro
2708
2709%macro SAD_X3_4x16P_AVX2 2
2710%if %1==0
2711    lea  t0, [r4*3]
2712    SAD_X3_START_2x16P_AVX2
2713%else
2714    SAD_X3_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r4*0, r4*1
2715%endif
2716    SAD_X3_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r4*2, t0
2717%if %1 != %2-1
2718%if (%1&1) != 0
2719    add  r0, 8*FENC_STRIDE
2720%endif
2721    lea  r1, [r1+4*r4]
2722    lea  r2, [r2+4*r4]
2723    lea  r3, [r3+4*r4]
2724%endif
2725%endmacro
2726
2727%macro SAD_X4_START_2x16P_AVX2 0
2728    vbroadcasti128 m4, [r0]
2729    vbroadcasti128 m5, [r0+FENC_STRIDE]
2730    movu   xm0, [r1]
2731    movu   xm1, [r2]
2732    movu   xm2, [r1+r5]
2733    movu   xm3, [r2+r5]
2734    vinserti128 m0, m0, [r3], 1
2735    vinserti128 m1, m1, [r4], 1
2736    vinserti128 m2, m2, [r3+r5], 1
2737    vinserti128 m3, m3, [r4+r5], 1
2738    psadbw  m0, m4
2739    psadbw  m1, m4
2740    psadbw  m2, m5
2741    psadbw  m3, m5
2742    paddw   m0, m2
2743    paddw   m1, m3
2744%endmacro
2745
2746%macro SAD_X4_2x16P_AVX2 4
2747    vbroadcasti128 m6, [r0+%1]
2748    vbroadcasti128 m7, [r0+%3]
2749    movu   xm2, [r1+%2]
2750    movu   xm3, [r2+%2]
2751    movu   xm4, [r1+%4]
2752    movu   xm5, [r2+%4]
2753    vinserti128 m2, m2, [r3+%2], 1
2754    vinserti128 m3, m3, [r4+%2], 1
2755    vinserti128 m4, m4, [r3+%4], 1
2756    vinserti128 m5, m5, [r4+%4], 1
2757    psadbw  m2, m6
2758    psadbw  m3, m6
2759    psadbw  m4, m7
2760    psadbw  m5, m7
2761    paddd   m0, m2
2762    paddd   m1, m3
2763    paddd   m0, m4
2764    paddd   m1, m5
2765%endmacro
2766
2767%macro SAD_X4_4x16P_AVX2 2
2768%if %1==0
2769    lea  r6, [r5*3]
2770    SAD_X4_START_2x16P_AVX2
2771%else
2772    SAD_X4_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1
2773%endif
2774    SAD_X4_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6
2775%if %1 != %2-1
2776%if (%1&1) != 0
2777    add  r0, 8*FENC_STRIDE
2778%endif
2779    lea  r1, [r1+4*r5]
2780    lea  r2, [r2+4*r5]
2781    lea  r3, [r3+4*r5]
2782    lea  r4, [r4+4*r5]
2783%endif
2784%endmacro
2785
2786%macro SAD_X4_START_2x32P_AVX2 0
2787    mova        m4, [r0]
2788    movu        m0, [r1]
2789    movu        m2, [r2]
2790    movu        m1, [r3]
2791    movu        m3, [r4]
2792    psadbw      m0, m4
2793    psadbw      m2, m4
2794    psadbw      m1, m4
2795    psadbw      m3, m4
2796    packusdw    m0, m2
2797    packusdw    m1, m3
2798
2799    mova        m6, [r0+FENC_STRIDE]
2800    movu        m2, [r1+r5]
2801    movu        m4, [r2+r5]
2802    movu        m3, [r3+r5]
2803    movu        m5, [r4+r5]
2804    psadbw      m2, m6
2805    psadbw      m4, m6
2806    psadbw      m3, m6
2807    psadbw      m5, m6
2808    packusdw    m2, m4
2809    packusdw    m3, m5
2810    paddd       m0, m2
2811    paddd       m1, m3
2812%endmacro
2813
2814%macro SAD_X4_2x32P_AVX2 4
2815    mova        m6, [r0+%1]
2816    movu        m2, [r1+%2]
2817    movu        m4, [r2+%2]
2818    movu        m3, [r3+%2]
2819    movu        m5, [r4+%2]
2820    psadbw      m2, m6
2821    psadbw      m4, m6
2822    psadbw      m3, m6
2823    psadbw      m5, m6
2824    packusdw    m2, m4
2825    packusdw    m3, m5
2826    paddd       m0, m2
2827    paddd       m1, m3
2828
2829    mova        m6, [r0+%3]
2830    movu        m2, [r1+%4]
2831    movu        m4, [r2+%4]
2832    movu        m3, [r3+%4]
2833    movu        m5, [r4+%4]
2834    psadbw      m2, m6
2835    psadbw      m4, m6
2836    psadbw      m3, m6
2837    psadbw      m5, m6
2838    packusdw    m2, m4
2839    packusdw    m3, m5
2840    paddd       m0, m2
2841    paddd       m1, m3
2842%endmacro
2843
2844%macro SAD_X4_4x32P_AVX2 2
2845%if %1==0
2846    lea  r6, [r5*3]
2847    SAD_X4_START_2x32P_AVX2
2848%else
2849    SAD_X4_2x32P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1
2850%endif
2851    SAD_X4_2x32P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6
2852%if %1 != %2-1
2853%if (%1&1) != 0
2854    add  r0, 8*FENC_STRIDE
2855%endif
2856    lea  r1, [r1+4*r5]
2857    lea  r2, [r2+4*r5]
2858    lea  r3, [r3+4*r5]
2859    lea  r4, [r4+4*r5]
2860%endif
2861%endmacro
2862
2863%macro SAD_X3_END_AVX2 0
2864    movifnidn r5, r5mp
2865    packssdw  m0, m1        ; 0 0 1 1 0 0 1 1
2866    packssdw  m2, m2        ; 2 2 _ _ 2 2 _ _
2867    phaddd    m0, m2        ; 0 1 2 _ 0 1 2 _
2868    vextracti128 xm1, m0, 1
2869    paddd    xm0, xm1       ; 0 1 2 _
2870    mova    [r5], xm0
2871    RET
2872%endmacro
2873
2874%macro SAD_X4_END_AVX2 0
2875    mov       r0, r6mp
2876    pshufd     m0, m0, 0x8
2877    pshufd     m1, m1, 0x8
2878    vextracti128 xm2, m0, 1
2879    vextracti128 xm3, m1, 1
2880    punpcklqdq   xm0, xm1
2881    punpcklqdq   xm2, xm3
2882    phaddd   xm0, xm2       ; 0 1 2 3
2883    mova    [r0], xm0
2884    RET
2885%endmacro
2886
2887%macro SAD_X4_32P_END_AVX2 0
2888    mov          r0, r6mp
2889    vextracti128 xm2, m0, 1
2890    vextracti128 xm3, m1, 1
2891    paddd        xm0, xm2
2892    paddd        xm1, xm3
2893    phaddd       xm0, xm1
2894    mova         [r0], xm0
2895    RET
2896%endmacro
2897
2898;-----------------------------------------------------------------------------
2899; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
2900;                          uint8_t *pix2, intptr_t i_stride, int scores[3] )
2901;-----------------------------------------------------------------------------
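; The x4 variants generated from the same macros below add a fourth candidate
; block and write four scores; assuming the same calling convention, their C
; prototype is:
;   void pixel_sad_x4_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
;                            uint8_t *pix2, uint8_t *pix3, intptr_t i_stride,
;                            int scores[4] )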
2902%macro SAD_X_SSE2 4
2903cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4
2904%assign x 0
2905%rep %3/4
2906    SAD_X%1_4x%2P_SSE2 x, %3/4
2907%assign x x+1
2908%endrep
2909%if %3 == 64
2910    SAD_X%1_END_SSE2 1
2911%else
2912    SAD_X%1_END_SSE2 0
2913%endif
2914%endmacro
2915
2916%macro SAD_X3_W12 0
2917cglobal pixel_sad_x3_12x16, 5, 7, 8
2918    mova  m4,  [MSK]
2919    pxor  m0,  m0
2920    pxor  m1,  m1
2921    pxor  m2,  m2
2922
2923    SAD_X3_12x4
2924    SAD_X3_12x4
2925    SAD_X3_12x4
2926    SAD_X3_12x4
2927    SAD_X3_END_SSE2 1
2928%endmacro
2929
2930%macro SAD_X4_W12 0
2931cglobal pixel_sad_x4_12x16, 6, 8, 8
2932    mova  m6,  [MSK]
2933    pxor  m0,  m0
2934    pxor  m1,  m1
2935    pxor  m2,  m2
2936    pxor  m3,  m3
2937
2938    SAD_X4_12x4
2939    SAD_X4_12x4
2940    SAD_X4_12x4
2941    SAD_X4_12x4
2942    SAD_X4_END_SSE2 1
2943%endmacro
2944
2945%macro SAD_X3_W24 0
2946cglobal pixel_sad_x3_24x32, 5, 7, 8
2947    pxor  m0, m0
2948    pxor  m1, m1
2949    pxor  m2, m2
2950    mov   r6, 32
2951
2952.loop:
2953    SAD_X3_24x4
2954    SAD_X3_24x4
2955    SAD_X3_24x4
2956    SAD_X3_24x4
2957
2958    sub r6,  16
2959    cmp r6,  0
2960    jnz .loop
2961    SAD_X3_END_SSE2 1
2962%endmacro
2963
2964%macro SAD_X4_W24 0
2965%if ARCH_X86_64 == 1
2966cglobal pixel_sad_x4_24x32, 6, 8, 8
2967%define count r7
2968%else
2969cglobal pixel_sad_x4_24x32, 6, 7, 8, 0-4
2970%define count dword [rsp]
2971%endif
2972    pxor  m0, m0
2973    pxor  m1, m1
2974    pxor  m2, m2
2975    pxor  m3, m3
2976    mov   count, 32
2977
2978.loop:
2979    SAD_X4_24x4
2980    SAD_X4_24x4
2981    SAD_X4_24x4
2982    SAD_X4_24x4
2983
2984    sub count,  16
2985    jnz .loop
2986    SAD_X4_END_SSE2 1
2987
2988%endmacro
2989
2990%macro SAD_X3_W32 0
2991cglobal pixel_sad_x3_32x8, 5, 6, 8
2992    pxor  m0, m0
2993    pxor  m1, m1
2994    pxor  m2, m2
2995
2996    SAD_X3_32x4
2997    SAD_X3_32x4
2998    SAD_X3_END_SSE2 1
2999
3000cglobal pixel_sad_x3_32x16, 5, 6, 8
3001    pxor  m0, m0
3002    pxor  m1, m1
3003    pxor  m2, m2
3004
3005    SAD_X3_32x4
3006    SAD_X3_32x4
3007    SAD_X3_32x4
3008    SAD_X3_32x4
3009    SAD_X3_END_SSE2 1
3010
3011cglobal pixel_sad_x3_32x24, 5, 6, 8
3012    pxor  m0, m0
3013    pxor  m1, m1
3014    pxor  m2, m2
3015
3016    SAD_X3_32x4
3017    SAD_X3_32x4
3018    SAD_X3_32x4
3019    SAD_X3_32x4
3020    SAD_X3_32x4
3021    SAD_X3_32x4
3022    SAD_X3_END_SSE2 1
3023
3024cglobal pixel_sad_x3_32x32, 5, 7, 8
3025    pxor  m0, m0
3026    pxor  m1, m1
3027    pxor  m2, m2
3028    mov   r6, 32
3029
3030.loop:
3031    SAD_X3_32x4
3032    SAD_X3_32x4
3033    SAD_X3_32x4
3034    SAD_X3_32x4
3035
3036    sub r6,  16
3037    cmp r6,  0
3038    jnz .loop
3039    SAD_X3_END_SSE2 1
3040
3041cglobal pixel_sad_x3_32x64, 5, 7, 8
3042    pxor  m0, m0
3043    pxor  m1, m1
3044    pxor  m2, m2
3045    mov   r6, 64
3046
3047.loop1:
3048    SAD_X3_32x4
3049    SAD_X3_32x4
3050    SAD_X3_32x4
3051    SAD_X3_32x4
3052
3053    sub r6,  16
3054    cmp r6,  0
3055    jnz .loop1
3056    SAD_X3_END_SSE2 1
3057%endmacro
3058
3059%macro SAD_X4_W32 0
3060cglobal pixel_sad_x4_32x8, 6, 7, 8
3061    pxor  m0, m0
3062    pxor  m1, m1
3063    pxor  m2, m2
3064    pxor  m3, m3
3065
3066    SAD_X4_32x4
3067    SAD_X4_32x4
3068    SAD_X4_END_SSE2 1
3069
3070cglobal pixel_sad_x4_32x16, 6, 7, 8
3071    pxor  m0, m0
3072    pxor  m1, m1
3073    pxor  m2, m2
3074    pxor  m3, m3
3075
3076    SAD_X4_32x4
3077    SAD_X4_32x4
3078    SAD_X4_32x4
3079    SAD_X4_32x4
3080    SAD_X4_END_SSE2 1
3081
3082cglobal pixel_sad_x4_32x24, 6, 7, 8
3083    pxor  m0, m0
3084    pxor  m1, m1
3085    pxor  m2, m2
3086    pxor  m3, m3
3087
3088    SAD_X4_32x4
3089    SAD_X4_32x4
3090    SAD_X4_32x4
3091    SAD_X4_32x4
3092    SAD_X4_32x4
3093    SAD_X4_32x4
3094    SAD_X4_END_SSE2 1
3095
3096%if ARCH_X86_64 == 1
3097cglobal pixel_sad_x4_32x32, 6, 8, 8
3098%define count r7
3099%else
3100cglobal pixel_sad_x4_32x32, 6, 7, 8, 0-4
3101%define count dword [rsp]
3102%endif
3103    pxor  m0, m0
3104    pxor  m1, m1
3105    pxor  m2, m2
3106    pxor  m3, m3
3107    mov   count, 32
3108
3109.loop:
3110    SAD_X4_32x4
3111    SAD_X4_32x4
3112    SAD_X4_32x4
3113    SAD_X4_32x4
3114
3115    sub count,  16
3116    jnz .loop
3117    SAD_X4_END_SSE2 1
3118
3119%if ARCH_X86_64 == 1
3120cglobal pixel_sad_x4_32x64, 6, 8, 8
3121%define count r7
3122%else
3123cglobal pixel_sad_x4_32x64, 6, 7, 8, 0-4
3124%define count dword [rsp]
3125%endif
3126    pxor  m0, m0
3127    pxor  m1, m1
3128    pxor  m2, m2
3129    pxor  m3, m3
3130    mov   count, 64
3131
3132.loop:
3133    SAD_X4_32x4
3134    SAD_X4_32x4
3135    SAD_X4_32x4
3136    SAD_X4_32x4
3137
3138    sub count,  16
3139    jnz .loop
3140    SAD_X4_END_SSE2 1
3141
3142%endmacro
3143
3144%macro SAD_X3_W48 0
3145cglobal pixel_sad_x3_48x64, 5, 7, 8
3146    pxor  m0, m0
3147    pxor  m1, m1
3148    pxor  m2, m2
3149    mov   r6, 64
3150
3151.loop:
3152    SAD_X3_48x4
3153    SAD_X3_48x4
3154    SAD_X3_48x4
3155    SAD_X3_48x4
3156
3157    sub r6,  16
3158    jnz .loop
3159    SAD_X3_END_SSE2 1
3160%endmacro
3161
3162%macro SAD_X4_W48 0
3163%if ARCH_X86_64 == 1
3164cglobal pixel_sad_x4_48x64, 6, 8, 8
3165%define count r7
3166%else
3167cglobal pixel_sad_x4_48x64, 6, 7, 8, 0-4
3168%define count dword [rsp]
3169%endif
3170    pxor  m0, m0
3171    pxor  m1, m1
3172    pxor  m2, m2
3173    pxor  m3, m3
3174    mov   count, 64
3175
3176.loop:
3177    SAD_X4_48x4
3178    SAD_X4_48x4
3179    SAD_X4_48x4
3180    SAD_X4_48x4
3181
3182    sub count,  16
3183    jnz .loop
3184    SAD_X4_END_SSE2 1
3185%endmacro
3186
3187%macro SAD_X3_W64 0
3188cglobal pixel_sad_x3_64x16, 5, 7, 7
3189    pxor  m0, m0
3190    pxor  m1, m1
3191    pxor  m2, m2
3192    mov   r6, 16
3193
3194.loop:
3195    SAD_X3_64x4
3196    SAD_X3_64x4
3197
3198    sub r6,  8
3199    jnz .loop
3200    SAD_X3_END_SSE2 1
3201
3202cglobal pixel_sad_x3_64x32, 5, 7, 7
3203    pxor  m0, m0
3204    pxor  m1, m1
3205    pxor  m2, m2
3206    mov   r6, 32
3207
3208.loop:
3209    SAD_X3_64x4
3210    SAD_X3_64x4
3211
3212    sub r6,  8
3213    jnz .loop
3214    SAD_X3_END_SSE2 1
3215
3216cglobal pixel_sad_x3_64x48, 5, 7, 7
3217    pxor  m0, m0
3218    pxor  m1, m1
3219    pxor  m2, m2
3220    mov   r6, 48
3221
3222.loop:
3223    SAD_X3_64x4
3224    SAD_X3_64x4
3225
3226    sub r6,  8
3227    jnz .loop
3228    SAD_X3_END_SSE2 1
3229
3230cglobal pixel_sad_x3_64x64, 5, 7, 7
3231    pxor  m0, m0
3232    pxor  m1, m1
3233    pxor  m2, m2
3234    mov   r6, 64
3235
3236.loop:
3237    SAD_X3_64x4
3238    SAD_X3_64x4
3239
3240    sub r6,  8
3241    jnz .loop
3242    SAD_X3_END_SSE2 1
3243%endmacro
3244
3245%macro SAD_X4_W64 0
3246%if ARCH_X86_64 == 1
3247cglobal pixel_sad_x4_64x16, 6, 8, 8
3248%define count r7
3249%else
3250cglobal pixel_sad_x4_64x16, 6, 7, 8, 0-4
3251%define count dword [rsp]
3252%endif
3253    pxor  m0, m0
3254    pxor  m1, m1
3255    pxor  m2, m2
3256    pxor  m3, m3
3257    mov   count, 16
3258
3259.loop:
3260    SAD_X4_64x4
3261    SAD_X4_64x4
3262
3263    sub count,  8
3264    jnz .loop
3265    SAD_X4_END_SSE2 1
3266
3267%if ARCH_X86_64 == 1
3268cglobal pixel_sad_x4_64x32, 6, 8, 8
3269%define count r7
3270%else
3271cglobal pixel_sad_x4_64x32, 6, 7, 8, 0-4
3272%define count dword [rsp]
3273%endif
3274    pxor  m0, m0
3275    pxor  m1, m1
3276    pxor  m2, m2
3277    pxor  m3, m3
3278    mov   count, 32
3279
3280.loop:
3281    SAD_X4_64x4
3282    SAD_X4_64x4
3283
3284    sub count,  8
3285    jnz .loop
3286    SAD_X4_END_SSE2 1
3287
3288%if ARCH_X86_64 == 1
3289cglobal pixel_sad_x4_64x48, 6, 8, 8
3290%define count r7
3291%else
3292cglobal pixel_sad_x4_64x48, 6, 7, 8, 0-4
3293%define count dword [rsp]
3294%endif
3295    pxor  m0, m0
3296    pxor  m1, m1
3297    pxor  m2, m2
3298    pxor  m3, m3
3299    mov   count, 48
3300
3301.loop:
3302    SAD_X4_64x4
3303    SAD_X4_64x4
3304
3305    sub count,  8
3306    jnz .loop
3307    SAD_X4_END_SSE2 1
3308
3309%if ARCH_X86_64 == 1
3310cglobal pixel_sad_x4_64x64, 6, 8, 8
3311%define count r7
3312%else
3313cglobal pixel_sad_x4_64x64, 6, 7, 8, 0-4
3314%define count dword [rsp]
3315%endif
3316    pxor  m0, m0
3317    pxor  m1, m1
3318    pxor  m2, m2
3319    pxor  m3, m3
3320    mov   count, 64
3321
3322.loop:
3323    SAD_X4_64x4
3324    SAD_X4_64x4
3325
3326    sub count,  8
3327    jnz .loop
3328    SAD_X4_END_SSE2 1
3329%endmacro
3330
3331INIT_XMM sse2
3332SAD_X_SSE2 3, 16, 16, 7
3333SAD_X_SSE2 3, 16,  8, 7
3334SAD_X_SSE2 3,  8, 16, 7
3335SAD_X_SSE2 3,  8,  8, 7
3336SAD_X_SSE2 3,  8,  4, 7
3337SAD_X_SSE2 4, 16, 16, 7
3338SAD_X_SSE2 4, 16,  8, 7
3339SAD_X_SSE2 4,  8, 16, 7
3340SAD_X_SSE2 4,  8,  8, 7
3341SAD_X_SSE2 4,  8,  4, 7
3342
3343INIT_XMM sse3
3344SAD_X_SSE2 3, 16, 16, 7
3345SAD_X_SSE2 3, 16,  8, 7
3346SAD_X_SSE2 3, 16,  4, 7
3347SAD_X_SSE2 4, 16, 16, 7
3348SAD_X_SSE2 4, 16,  8, 7
3349SAD_X_SSE2 4, 16,  4, 7
3350
3351INIT_XMM ssse3
3352SAD_X3_W12
3353SAD_X3_W32
3354SAD_X3_W24
3355SAD_X3_W48
3356SAD_X3_W64
3357SAD_X_SSE2  3, 16, 64, 7
3358SAD_X_SSE2  3, 16, 32, 7
3359SAD_X_SSE2  3, 16, 16, 7
3360SAD_X_SSE2  3, 16, 12, 7
3361SAD_X_SSE2  3, 16,  8, 7
3362SAD_X_SSE2  3,  8, 32, 7
3363SAD_X_SSE2  3,  8, 16, 7
3364SAD_X4_W12
3365SAD_X4_W24
3366SAD_X4_W32
3367SAD_X4_W48
3368SAD_X4_W64
3369SAD_X_SSE2  4, 16, 64, 7
3370SAD_X_SSE2  4, 16, 32, 7
3371SAD_X_SSE2  4, 16, 16, 7
3372SAD_X_SSE2  4, 16, 12, 7
3373SAD_X_SSE2  4, 16,  8, 7
3374SAD_X_SSE2  4,  8, 32, 7
3375SAD_X_SSE2  4,  8, 16, 7
3376SAD_X_SSE2  4,  8,  8, 7
3377SAD_X_SSE2  4,  8,  4, 7
3378
3379INIT_XMM avx
3380SAD_X3_W12
3381SAD_X3_W32
3382SAD_X3_W24
3383SAD_X3_W48
3384SAD_X3_W64
3385SAD_X_SSE2 3, 16, 64, 7
3386SAD_X_SSE2 3, 16, 32, 6
3387SAD_X_SSE2 3, 16, 16, 6
3388SAD_X_SSE2 3, 16, 12, 6
3389SAD_X_SSE2 3, 16,  8, 6
3390SAD_X_SSE2 3, 16,  4, 6
3391SAD_X4_W12
3392SAD_X4_W24
3393SAD_X4_W32
3394SAD_X4_W48
3395SAD_X4_W64
3396SAD_X_SSE2 4, 16, 64, 7
3397SAD_X_SSE2 4, 16, 32, 7
3398SAD_X_SSE2 4, 16, 16, 7
3399SAD_X_SSE2 4, 16, 12, 7
3400SAD_X_SSE2 4, 16,  8, 7
3401SAD_X_SSE2 4, 16,  4, 7
3402
3403%macro SAD_X_AVX2 4
3404cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4
3405%assign x 0
3406%rep %3/4
3407    SAD_X%1_4x%2P_AVX2 x, %3/4
3408%assign x x+1
3409%endrep
3410
3411  %if (%1==4) && (%2==32)
3412    SAD_X%1_32P_END_AVX2
3413  %else
3414    SAD_X%1_END_AVX2
3415  %endif
3416%endmacro
3417
3418INIT_YMM avx2
3419SAD_X_AVX2 3, 16, 32, 7
3420SAD_X_AVX2 3, 16, 16, 7
3421SAD_X_AVX2 3, 16, 12, 7
3422SAD_X_AVX2 3, 16,  8, 7
3423SAD_X_AVX2 4, 16, 32, 8
3424SAD_X_AVX2 4, 16, 16, 8
3425SAD_X_AVX2 4, 16, 12, 8
3426SAD_X_AVX2 4, 16,  8, 8
3427
3428SAD_X_AVX2 4, 32,  8, 8
3429SAD_X_AVX2 4, 32, 16, 8
3430SAD_X_AVX2 4, 32, 24, 8
3431SAD_X_AVX2 4, 32, 32, 8
3432SAD_X_AVX2 4, 32, 64, 8
3433
3434;=============================================================================
3435; SAD cacheline split
3436;=============================================================================
3437
3438; Core2 (Conroe) can load unaligned data just as quickly as aligned data...
3439; unless the unaligned data spans the border between 2 cachelines, in which
3440; case it's really slow. The exact numbers may differ, but all Intel CPUs prior
3441; to Nehalem have a large penalty for cacheline splits.
3442; (8-byte alignment exactly halfway between two cachelines is OK, though.)
3443; LDDQU was supposed to fix this, but it only works on Pentium 4.
3444; So in the split case we load aligned data and explicitly perform the
3445; alignment between registers, much as on archs that have only aligned loads,
3446; except complicated by the fact that PALIGNR takes only an immediate, not
3447; a variable alignment.
3448; It is also possible to hoist the realignment to the macroblock level (keep
3449; 2 copies of the reference frame, offset by 32 bytes), but the extra memory
3450; needed for that method often makes it slower.
3451
3452; sad 16x16 costs on Core2:
3453; good offsets: 49 cycles (50/64 of all mvs)
3454; cacheline split: 234 cycles (14/64 of all mvs. amortized: +40 cycles)
3455; page split: 3600 cycles (14/4096 of all mvs. amortized: +11.5 cycles)
3456; cache or page split with palignr: 57 cycles (amortized: +2 cycles)
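
; In the split case, each unaligned 16-byte reference load is therefore
; synthesized from the two aligned chunks that straddle it (sketch; 'shift'
; is the misalignment of the reference pointer within its 16-byte chunk):
;
;   dst = (chunk_hi << (16-shift)) | (chunk_lo >> shift)   ; byte-wise shifts
;
; Because PSLLDQ/PSRLDQ/PALIGNR only take immediate shift counts, one copy of
; the inner loop is emitted per possible shift (1..15) and a computed jump
; selects the matching copy at runtime.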
3457
3458; computed jump assumes this loop is exactly 80 bytes
3459%macro SAD16_CACHELINE_LOOP_SSE2 1 ; alignment
3460ALIGN 16
3461sad_w16_align%1_sse2:
3462    movdqa  xmm1, [r2+16]
3463    movdqa  xmm2, [r2+r3+16]
3464    movdqa  xmm3, [r2]
3465    movdqa  xmm4, [r2+r3]
3466    pslldq  xmm1, 16-%1
3467    pslldq  xmm2, 16-%1
3468    psrldq  xmm3, %1
3469    psrldq  xmm4, %1
3470    por     xmm1, xmm3
3471    por     xmm2, xmm4
3472    psadbw  xmm1, [r0]
3473    psadbw  xmm2, [r0+r1]
3474    paddw   xmm0, xmm1
3475    paddw   xmm0, xmm2
3476    lea     r0,   [r0+2*r1]
3477    lea     r2,   [r2+2*r3]
3478    dec     r4
3479    jg sad_w16_align%1_sse2
3480    ret
3481%endmacro
3482
3483; computed jump assumes this loop is exactly 64 bytes
3484%macro SAD16_CACHELINE_LOOP_SSSE3 1 ; alignment
3485ALIGN 16
3486sad_w16_align%1_ssse3:
3487    movdqa  xmm1, [r2+16]
3488    movdqa  xmm2, [r2+r3+16]
3489    palignr xmm1, [r2], %1
3490    palignr xmm2, [r2+r3], %1
3491    psadbw  xmm1, [r0]
3492    psadbw  xmm2, [r0+r1]
3493    paddw   xmm0, xmm1
3494    paddw   xmm0, xmm2
3495    lea     r0,   [r0+2*r1]
3496    lea     r2,   [r2+2*r3]
3497    dec     r4
3498    jg sad_w16_align%1_ssse3
3499    ret
3500%endmacro
3501
3502%macro SAD16_CACHELINE_FUNC 2 ; cpu, height
3503cglobal pixel_sad_16x%2_cache64_%1
3504    mov     eax, r2m
3505    and     eax, 0x37
3506    cmp     eax, 0x30
3507    jle pixel_sad_16x%2_sse2
3508    PROLOGUE 4,6
3509    mov     r4d, r2d
3510    and     r4d, 15
3511%ifidn %1, ssse3
3512    shl     r4d, 6  ; code size = 64
3513%else
3514    lea     r4, [r4*5]
3515    shl     r4d, 4  ; code size = 80
3516%endif
3517%define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
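; sad_w16_align2 is emitted one loop-body size above sad_w16_align1, so this
; expression is the address a hypothetical align0 copy would have; adding
; r4 = misalignment * code_size then lands on the copy whose immediate shift
; matches the actual misalignment.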
3518%ifdef PIC
3519    lea     r5, [sad_w16_addr]
3520    add     r5, r4
3521%else
3522    lea     r5, [sad_w16_addr + r4]
3523%endif
3524    and     r2, ~15
3525    mov     r4d, %2/2
3526    pxor    xmm0, xmm0
3527    call    r5
3528    movhlps xmm1, xmm0
3529    paddw   xmm0, xmm1
3530    movd    eax,  xmm0
3531    RET
3532%endmacro
3533
3534%macro SAD_CACHELINE_START_MMX2 4 ; width, height, iterations, cacheline
3535    mov    eax, r2m
3536    and    eax, 0x17|%1|(%4>>1)
3537    cmp    eax, 0x10|%1|(%4>>1)
3538    jle pixel_sad_%1x%2_mmx2
3539    and    eax, 7
3540    shl    eax, 3
3541    movd   mm6, [pd_64]
3542    movd   mm7, eax
3543    psubw  mm6, mm7
3544    PROLOGUE 4,5
3545    and    r2, ~7
3546    mov    r4d, %3
3547    pxor   mm0, mm0
3548%endmacro
3549
3550%macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
3551cglobal pixel_sad_16x%1_cache%2_mmx2
3552    SAD_CACHELINE_START_MMX2 16, %1, %1, %2
3553.loop:
3554    movq   mm1, [r2]
3555    movq   mm2, [r2+8]
3556    movq   mm3, [r2+16]
3557    movq   mm4, mm2
3558    psrlq  mm1, mm7
3559    psllq  mm2, mm6
3560    psllq  mm3, mm6
3561    psrlq  mm4, mm7
3562    por    mm1, mm2
3563    por    mm3, mm4
3564    psadbw mm1, [r0]
3565    psadbw mm3, [r0+8]
3566    paddw  mm0, mm1
3567    paddw  mm0, mm3
3568    add    r2, r3
3569    add    r0, r1
3570    dec    r4
3571    jg .loop
3572    movd   eax, mm0
3573    RET
3574%endmacro
3575
3576%macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline
3577cglobal pixel_sad_8x%1_cache%2_mmx2
3578    SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2
3579.loop:
3580    movq   mm1, [r2+8]
3581    movq   mm2, [r2+r3+8]
3582    movq   mm3, [r2]
3583    movq   mm4, [r2+r3]
3584    psllq  mm1, mm6
3585    psllq  mm2, mm6
3586    psrlq  mm3, mm7
3587    psrlq  mm4, mm7
3588    por    mm1, mm3
3589    por    mm2, mm4
3590    psadbw mm1, [r0]
3591    psadbw mm2, [r0+r1]
3592    paddw  mm0, mm1
3593    paddw  mm0, mm2
3594    lea    r2, [r2+2*r3]
3595    lea    r0, [r0+2*r1]
3596    dec    r4
3597    jg .loop
3598    movd   eax, mm0
3599    RET
3600%endmacro
3601
3602; sad_x3/x4_cache64: check each mv.
3603; if they're all within a cacheline, use normal sad_x3/x4.
3604; otherwise, send them individually to sad_cache64.
3605%macro CHECK_SPLIT 3 ; pix, width, cacheline
3606    mov  eax, %1
3607    and  eax, 0x17|%2|(%3>>1)
3608    cmp  eax, 0x10|%2|(%3>>1)
3609    jg .split
3610%endmacro
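; With width 16 and a 64-byte cacheline this expands to the same
; "and eax, 0x37 / cmp eax, 0x30 / jg" test used by SAD16_CACHELINE_FUNC
; above; a candidate is sent down the .split path only if its block would
; actually straddle a cacheline boundary.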
3611
3612%macro SADX3_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
3613cglobal pixel_sad_x3_%1x%2_cache%3_%6
3614    CHECK_SPLIT r1m, %1, %3
3615    CHECK_SPLIT r2m, %1, %3
3616    CHECK_SPLIT r3m, %1, %3
3617    jmp pixel_sad_x3_%1x%2_%4
3618.split:
3619%if ARCH_X86_64
3620    PROLOGUE 6,9
3621    push r3
3622    push r2
3623%if WIN64
3624    movsxd r4, r4d
3625    sub rsp, 40 ; shadow space and alignment
3626%endif
3627    mov  r2, r1
3628    mov  r1, FENC_STRIDE
3629    mov  r3, r4
3630    mov  r7, r0
3631    mov  r8, r5
3632    call pixel_sad_%1x%2_cache%3_%5
3633    mov  [r8], eax
3634%if WIN64
3635    mov  r2, [rsp+40+0*8]
3636%else
3637    pop  r2
3638%endif
3639    mov  r0, r7
3640    call pixel_sad_%1x%2_cache%3_%5
3641    mov  [r8+4], eax
3642%if WIN64
3643    mov  r2, [rsp+40+1*8]
3644%else
3645    pop  r2
3646%endif
3647    mov  r0, r7
3648    call pixel_sad_%1x%2_cache%3_%5
3649    mov  [r8+8], eax
3650%if WIN64
3651    add  rsp, 40+2*8
3652%endif
3653    RET
3654%else
3655    push edi
3656    mov  edi, [esp+28]
3657    push dword [esp+24]
3658    push dword [esp+16]
3659    push dword 16
3660    push dword [esp+20]
3661    call pixel_sad_%1x%2_cache%3_%5
3662    mov  ecx, [esp+32]
3663    mov  [edi], eax
3664    mov  [esp+8], ecx
3665    call pixel_sad_%1x%2_cache%3_%5
3666    mov  ecx, [esp+36]
3667    mov  [edi+4], eax
3668    mov  [esp+8], ecx
3669    call pixel_sad_%1x%2_cache%3_%5
3670    mov  [edi+8], eax
3671    add  esp, 16
3672    pop  edi
3673    ret
3674%endif
3675%endmacro
3676
3677%macro SADX4_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
3678cglobal pixel_sad_x4_%1x%2_cache%3_%6
3679    CHECK_SPLIT r1m, %1, %3
3680    CHECK_SPLIT r2m, %1, %3
3681    CHECK_SPLIT r3m, %1, %3
3682    CHECK_SPLIT r4m, %1, %3
3683    jmp pixel_sad_x4_%1x%2_%4
3684.split:
3685%if ARCH_X86_64
3686    PROLOGUE 6,9
3687    mov  r8,  r6mp
3688    push r4
3689    push r3
3690    push r2
3691%if WIN64
3692    sub rsp, 32 ; shadow space
3693%endif
3694    mov  r2, r1
3695    mov  r1, FENC_STRIDE
3696    mov  r3, r5
3697    mov  r7, r0
3698    call pixel_sad_%1x%2_cache%3_%5
3699    mov  [r8], eax
3700%if WIN64
3701    mov  r2, [rsp+32+0*8]
3702%else
3703    pop  r2
3704%endif
3705    mov  r0, r7
3706    call pixel_sad_%1x%2_cache%3_%5
3707    mov  [r8+4], eax
3708%if WIN64
3709    mov  r2, [rsp+32+1*8]
3710%else
3711    pop  r2
3712%endif
3713    mov  r0, r7
3714    call pixel_sad_%1x%2_cache%3_%5
3715    mov  [r8+8], eax
3716%if WIN64
3717    mov  r2, [rsp+32+2*8]
3718%else
3719    pop  r2
3720%endif
3721    mov  r0, r7
3722    call pixel_sad_%1x%2_cache%3_%5
3723    mov  [r8+12], eax
3724%if WIN64
3725    add  rsp, 32+3*8
3726%endif
3727    RET
3728%else
3729    push edi
3730    mov  edi, [esp+32]
3731    push dword [esp+28]
3732    push dword [esp+16]
3733    push dword 16
3734    push dword [esp+20]
3735    call pixel_sad_%1x%2_cache%3_%5
3736    mov  ecx, [esp+32]
3737    mov  [edi], eax
3738    mov  [esp+8], ecx
3739    call pixel_sad_%1x%2_cache%3_%5
3740    mov  ecx, [esp+36]
3741    mov  [edi+4], eax
3742    mov  [esp+8], ecx
3743    call pixel_sad_%1x%2_cache%3_%5
3744    mov  ecx, [esp+40]
3745    mov  [edi+8], eax
3746    mov  [esp+8], ecx
3747    call pixel_sad_%1x%2_cache%3_%5
3748    mov  [edi+12], eax
3749    add  esp, 16
3750    pop  edi
3751    ret
3752%endif
3753%endmacro
3754
3755%macro SADX34_CACHELINE_FUNC 1+
3756    SADX3_CACHELINE_FUNC %1
3757    SADX4_CACHELINE_FUNC %1
3758%endmacro
3759
3760
3761; instantiate the aligned sads
3762
3763INIT_MMX
3764%if ARCH_X86_64 == 0
3765SAD16_CACHELINE_FUNC_MMX2  8, 32
3766SAD16_CACHELINE_FUNC_MMX2 16, 32
3767SAD8_CACHELINE_FUNC_MMX2   4, 32
3768SAD8_CACHELINE_FUNC_MMX2   8, 32
3769SAD8_CACHELINE_FUNC_MMX2  16, 32
3770SAD16_CACHELINE_FUNC_MMX2  8, 64
3771SAD16_CACHELINE_FUNC_MMX2 16, 64
3772%endif ; !ARCH_X86_64
3773SAD8_CACHELINE_FUNC_MMX2   4, 64
3774SAD8_CACHELINE_FUNC_MMX2   8, 64
3775SAD8_CACHELINE_FUNC_MMX2  16, 64
3776
3777%if ARCH_X86_64 == 0
3778SADX34_CACHELINE_FUNC 16, 16, 32, mmx2, mmx2, mmx2
3779SADX34_CACHELINE_FUNC 16,  8, 32, mmx2, mmx2, mmx2
3780SADX34_CACHELINE_FUNC  8, 16, 32, mmx2, mmx2, mmx2
3781SADX34_CACHELINE_FUNC  8,  8, 32, mmx2, mmx2, mmx2
3782SADX34_CACHELINE_FUNC 16, 16, 64, mmx2, mmx2, mmx2
3783SADX34_CACHELINE_FUNC 16,  8, 64, mmx2, mmx2, mmx2
3784%endif ; !ARCH_X86_64
3785SADX34_CACHELINE_FUNC  8, 16, 64, mmx2, mmx2, mmx2
3786SADX34_CACHELINE_FUNC  8,  8, 64, mmx2, mmx2, mmx2
3787
3788%if ARCH_X86_64 == 0
3789SAD16_CACHELINE_FUNC sse2, 8
3790SAD16_CACHELINE_FUNC sse2, 16
3791%assign i 1
3792%rep 15
3793SAD16_CACHELINE_LOOP_SSE2 i
3794%assign i i+1
3795%endrep
3796SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2, sse2
3797SADX34_CACHELINE_FUNC 16,  8, 64, sse2, sse2, sse2
3798%endif ; !ARCH_X86_64
3799SADX34_CACHELINE_FUNC  8, 16, 64, sse2, mmx2, sse2
3800
3801SAD16_CACHELINE_FUNC ssse3, 8
3802SAD16_CACHELINE_FUNC ssse3, 16
3803%assign i 1
3804%rep 15
3805SAD16_CACHELINE_LOOP_SSSE3 i
3806%assign i i+1
3807%endrep
3808SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3, ssse3
3809SADX34_CACHELINE_FUNC 16,  8, 64, sse2, ssse3, ssse3
3810
3811%if HIGH_BIT_DEPTH==0
3812INIT_YMM avx2
3813cglobal pixel_sad_x3_8x4, 6,6,5
3814    xorps           m0, m0
3815    xorps           m1, m1
3816
3817    sub             r2, r1          ; rebase on pointer r1
3818    sub             r3, r1
3819
3820    ; row 0
3821    vpbroadcastq   xm2, [r0 + 0 * FENC_STRIDE]
3822    movq           xm3, [r1]
3823    movhps         xm3, [r1 + r2]
3824    movq           xm4, [r1 + r3]
3825    psadbw         xm3, xm2
3826    psadbw         xm4, xm2
3827    paddd          xm0, xm3
3828    paddd          xm1, xm4
3829    add             r1, r4
3830
3831    ; row 1
3832    vpbroadcastq   xm2, [r0 + 1 * FENC_STRIDE]
3833    movq           xm3, [r1]
3834    movhps         xm3, [r1 + r2]
3835    movq           xm4, [r1 + r3]
3836    psadbw         xm3, xm2
3837    psadbw         xm4, xm2
3838    paddd          xm0, xm3
3839    paddd          xm1, xm4
3840    add             r1, r4
3841
3842    ; row 2
3843    vpbroadcastq   xm2, [r0 + 2 * FENC_STRIDE]
3844    movq           xm3, [r1]
3845    movhps         xm3, [r1 + r2]
3846    movq           xm4, [r1 + r3]
3847    psadbw         xm3, xm2
3848    psadbw         xm4, xm2
3849    paddd          xm0, xm3
3850    paddd          xm1, xm4
3851    add             r1, r4
3852
3853    ; row 3
3854    vpbroadcastq   xm2, [r0 + 3 * FENC_STRIDE]
3855    movq           xm3, [r1]
3856    movhps         xm3, [r1 + r2]
3857    movq           xm4, [r1 + r3]
3858    psadbw         xm3, xm2
3859    psadbw         xm4, xm2
3860    paddd          xm0, xm3
3861    paddd          xm1, xm4
3862
3863    pshufd          xm0, xm0, q0020
3864    movq            [r5 + 0], xm0
3865    movd            [r5 + 8], xm1
3866    RET
3867
3868INIT_YMM avx2
3869cglobal pixel_sad_x3_8x8, 6,6,5
3870    xorps           m0, m0
3871    xorps           m1, m1
3872
3873    sub             r2, r1          ; rebase on pointer r1
3874    sub             r3, r1
3875%assign x 0
3876%rep 4
3877    ; row 0
3878    vpbroadcastq   xm2, [r0 + 0 * FENC_STRIDE]
3879    movq           xm3, [r1]
3880    movhps         xm3, [r1 + r2]
3881    movq           xm4, [r1 + r3]
3882    psadbw         xm3, xm2
3883    psadbw         xm4, xm2
3884    paddd          xm0, xm3
3885    paddd          xm1, xm4
3886    add             r1, r4
3887
3888    ; row 1
3889    vpbroadcastq   xm2, [r0 + 1 * FENC_STRIDE]
3890    movq           xm3, [r1]
3891    movhps         xm3, [r1 + r2]
3892    movq           xm4, [r1 + r3]
3893    psadbw         xm3, xm2
3894    psadbw         xm4, xm2
3895    paddd          xm0, xm3
3896    paddd          xm1, xm4
3897
3898%assign x x+1
3899  %if x < 4
3900    add             r1, r4
3901    add             r0, 2 * FENC_STRIDE
3902  %endif
3903%endrep
3904
3905    pshufd          xm0, xm0, q0020
3906    movq            [r5 + 0], xm0
3907    movd            [r5 + 8], xm1
3908    RET
3909
3910INIT_YMM avx2
3911cglobal pixel_sad_x3_8x16, 6,6,5
3912    xorps           m0, m0
3913    xorps           m1, m1
3914
3915    sub             r2, r1          ; rebase on pointer r1
3916    sub             r3, r1
3917%assign x 0
3918%rep 8
3919    ; row 0
3920    vpbroadcastq   xm2, [r0 + 0 * FENC_STRIDE]
3921    movq           xm3, [r1]
3922    movhps         xm3, [r1 + r2]
3923    movq           xm4, [r1 + r3]
3924    psadbw         xm3, xm2
3925    psadbw         xm4, xm2
3926    paddd          xm0, xm3
3927    paddd          xm1, xm4
3928    add             r1, r4
3929
3930    ; row 1
3931    vpbroadcastq   xm2, [r0 + 1 * FENC_STRIDE]
3932    movq           xm3, [r1]
3933    movhps         xm3, [r1 + r2]
3934    movq           xm4, [r1 + r3]
3935    psadbw         xm3, xm2
3936    psadbw         xm4, xm2
3937    paddd          xm0, xm3
3938    paddd          xm1, xm4
3939
3940%assign x x+1
3941  %if x < 8
3942    add             r1, r4
3943    add             r0, 2 * FENC_STRIDE
3944  %endif
3945%endrep
3946
3947    pshufd          xm0, xm0, q0020
3948    movq            [r5 + 0], xm0
3949    movd            [r5 + 8], xm1
3950    RET
3951
3952INIT_YMM avx2
3953cglobal pixel_sad_x4_8x8, 7,7,5
3954    xorps           m0, m0
3955    xorps           m1, m1
3956
3957    sub             r2, r1          ; rebase on pointer r1
3958    sub             r3, r1
3959    sub             r4, r1
3960%assign x 0
3961%rep 4
3962    ; row 0
3963    vpbroadcastq   xm2, [r0 + 0 * FENC_STRIDE]
3964    movq           xm3, [r1]
3965    movhps         xm3, [r1 + r2]
3966    movq           xm4, [r1 + r3]
3967    movhps         xm4, [r1 + r4]
3968    psadbw         xm3, xm2
3969    psadbw         xm4, xm2
3970    paddd          xm0, xm3
3971    paddd          xm1, xm4
3972    add             r1, r5
3973
3974    ; row 1
3975    vpbroadcastq   xm2, [r0 + 1 * FENC_STRIDE]
3976    movq           xm3, [r1]
3977    movhps         xm3, [r1 + r2]
3978    movq           xm4, [r1 + r3]
3979    movhps         xm4, [r1 + r4]
3980    psadbw         xm3, xm2
3981    psadbw         xm4, xm2
3982    paddd          xm0, xm3
3983    paddd          xm1, xm4
3984
3985%assign x x+1
3986  %if x < 4
3987    add             r1, r5
3988    add             r0, 2 * FENC_STRIDE
3989  %endif
3990%endrep
3991
3992    pshufd          xm0, xm0, q0020
3993    pshufd          xm1, xm1, q0020
3994    movq            [r6 + 0], xm0
3995    movq            [r6 + 8], xm1
3996    RET
3997
3998INIT_YMM avx2
3999cglobal pixel_sad_32x8, 4,4,6
4000    xorps           m0, m0
4001    xorps           m5, m5
4002
4003    movu           m1, [r0]               ; row 0 of pix0
4004    movu           m2, [r2]               ; row 0 of pix1
4005    movu           m3, [r0 + r1]          ; row 1 of pix0
4006    movu           m4, [r2 + r3]          ; row 1 of pix1
4007
4008    psadbw         m1, m2
4009    psadbw         m3, m4
4010    paddd          m0, m1
4011    paddd          m5, m3
4012
4013    lea     r2,     [r2 + 2 * r3]
4014    lea     r0,     [r0 + 2 * r1]
4015
4016    movu           m1, [r0]               ; row 2 of pix0
4017    movu           m2, [r2]               ; row 2 of pix1
4018    movu           m3, [r0 + r1]          ; row 3 of pix0
4019    movu           m4, [r2 + r3]          ; row 3 of pix1
4020
4021    psadbw         m1, m2
4022    psadbw         m3, m4
4023    paddd          m0, m1
4024    paddd          m5, m3
4025
4026    lea     r2,     [r2 + 2 * r3]
4027    lea     r0,     [r0 + 2 * r1]
4028
4029    movu           m1, [r0]               ; row 4 of pix0
4030    movu           m2, [r2]               ; row 4 of pix1
4031    movu           m3, [r0 + r1]          ; row 5 of pix0
4032    movu           m4, [r2 + r3]          ; row 5 of pix1
4033
4034    psadbw         m1, m2
4035    psadbw         m3, m4
4036    paddd          m0, m1
4037    paddd          m5, m3
4038
4039    lea     r2,     [r2 + 2 * r3]
4040    lea     r0,     [r0 + 2 * r1]
4041
4042    movu           m1, [r0]               ; row 6 of pix0
4043    movu           m2, [r2]               ; row 6 of pix1
4044    movu           m3, [r0 + r1]          ; row 7 of pix0
4045    movu           m4, [r2 + r3]          ; row 7 of pix1
4046
4047    psadbw         m1, m2
4048    psadbw         m3, m4
4049    paddd          m0, m1
4050    paddd          m5, m3
4051
4052    paddd          m0, m5
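    ; horizontal reduction: fold the upper 128-bit lane into the lower one,
    ; then add the two remaining 64-bit partial sums (dwords 0 and 2)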
4053    vextracti128   xm1, m0, 1
4054    paddd          xm0, xm1
4055    pshufd         xm1, xm0, 2
4056    paddd          xm0, xm1
4057    movd           eax, xm0
4058    RET
4059
4060INIT_YMM avx2
4061cglobal pixel_sad_32x16, 4,5,6
4062    xorps           m0, m0
4063    xorps           m5, m5
4064    mov             r4d, 4
4065
4066.loop:
4067    movu           m1, [r0]               ; row 0 of pix0
4068    movu           m2, [r2]               ; row 0 of pix1
4069    movu           m3, [r0 + r1]          ; row 1 of pix0
4070    movu           m4, [r2 + r3]          ; row 1 of pix1
4071
4072    psadbw         m1, m2
4073    psadbw         m3, m4
4074    paddd          m0, m1
4075    paddd          m5, m3
4076
4077    lea     r2,     [r2 + 2 * r3]
4078    lea     r0,     [r0 + 2 * r1]
4079
4080    movu           m1, [r0]               ; row 2 of pix0
4081    movu           m2, [r2]               ; row 2 of pix1
4082    movu           m3, [r0 + r1]          ; row 3 of pix0
4083    movu           m4, [r2 + r3]          ; row 3 of pix1
4084
4085    psadbw         m1, m2
4086    psadbw         m3, m4
4087    paddd          m0, m1
4088    paddd          m5, m3
4089
4090    lea     r2,     [r2 + 2 * r3]
4091    lea     r0,     [r0 + 2 * r1]
4092
4093    dec         r4d
4094    jnz         .loop
4095
4096    paddd          m0, m5
4097    vextracti128   xm1, m0, 1
4098    paddd          xm0, xm1
4099    pshufd         xm1, xm0, 2
4100    paddd          xm0, xm1
4101    movd           eax, xm0
4102    RET
4103
4104INIT_YMM avx2
4105cglobal pixel_sad_32x24, 4,7,6
4106    xorps           m0, m0
4107    xorps           m5, m5
4108    mov             r4d, 6
4109    lea             r5, [r1 * 3]
4110    lea             r6, [r3 * 3]
4111.loop:
4112    movu           m1, [r0]               ; row 0 of pix0
4113    movu           m2, [r2]               ; row 0 of pix1
4114    movu           m3, [r0 + r1]          ; row 1 of pix0
4115    movu           m4, [r2 + r3]          ; row 1 of pix1
4116
4117    psadbw         m1, m2
4118    psadbw         m3, m4
4119    paddd          m0, m1
4120    paddd          m5, m3
4121
4122    movu           m1, [r0 + 2 * r1]      ; row 2 of pix0
4123    movu           m2, [r2 + 2 * r3]      ; row 2 of pix1
4124    movu           m3, [r0 + r5]          ; row 3 of pix0
4125    movu           m4, [r2 + r6]          ; row 3 of pix1
4126
4127    psadbw         m1, m2
4128    psadbw         m3, m4
4129    paddd          m0, m1
4130    paddd          m5, m3
4131
4132    lea     r2,     [r2 + 4 * r3]
4133    lea     r0,     [r0 + 4 * r1]
4134
4135    dec         r4d
4136    jnz         .loop
4137
4138    paddd          m0, m5
4139    vextracti128   xm1, m0, 1
4140    paddd          xm0, xm1
4141    pshufd         xm1, xm0, 2
4142    paddd          xm0, xm1
4143    movd           eax, xm0
4144    RET
4145
4146INIT_YMM avx2
4147cglobal pixel_sad_32x32, 4,7,5
4148    xorps           m0, m0
4149    mov             r4d, 32/4
4150    lea             r5, [r1 * 3]
4151    lea             r6, [r3 * 3]
4152
4153.loop:
4154    movu           m1, [r0]               ; row 0 of pix0
4155    movu           m2, [r2]               ; row 0 of pix1
4156    movu           m3, [r0 + r1]          ; row 1 of pix0
4157    movu           m4, [r2 + r3]          ; row 1 of pix1
4158
4159    psadbw         m1, m2
4160    psadbw         m3, m4
4161    paddd          m0, m1
4162    paddd          m0, m3
4163
4164    movu           m1, [r0 + 2 * r1]      ; row 2 of pix0
4165    movu           m2, [r2 + 2 * r3]      ; row 2 of pix1
4166    movu           m3, [r0 + r5]          ; row 3 of pix0
4167    movu           m4, [r2 + r6]          ; row 3 of pix1
4168
4169    psadbw         m1, m2
4170    psadbw         m3, m4
4171    paddd          m0, m1
4172    paddd          m0, m3
4173
4174    lea            r2,     [r2 + 4 * r3]
4175    lea            r0,     [r0 + 4 * r1]
4176
4177    dec            r4d
4178    jnz           .loop
4179
4180    vextracti128   xm1, m0, 1
4181    paddd          xm0, xm1
4182    pshufd         xm1, xm0, 2
4183    paddd          xm0, xm1
4184    movd            eax, xm0
4185    RET
4186
4187INIT_YMM avx2
4188cglobal pixel_sad_32x64, 4,7,5
4189    xorps           m0, m0
4190    mov             r4d, 64/8
4191    lea             r5, [r1 * 3]
4192    lea             r6, [r3 * 3]
4193
4194.loop:
4195    movu           m1, [r0]               ; row 0 of pix0
4196    movu           m2, [r2]               ; row 0 of pix1
4197    movu           m3, [r0 + r1]          ; row 1 of pix0
4198    movu           m4, [r2 + r3]          ; row 1 of pix1
4199
4200    psadbw         m1, m2
4201    psadbw         m3, m4
4202    paddd          m0, m1
4203    paddd          m0, m3
4204
4205    movu           m1, [r0 + 2 * r1]      ; row 2 of pix0
4206    movu           m2, [r2 + 2 * r3]      ; row 2 of pix1
4207    movu           m3, [r0 + r5]          ; row 3 of pix0
4208    movu           m4, [r2 + r6]          ; row 3 of pix1
4209
4210    psadbw         m1, m2
4211    psadbw         m3, m4
4212    paddd          m0, m1
4213    paddd          m0, m3
4214
4215    lea            r2,     [r2 + 4 * r3]
4216    lea            r0,     [r0 + 4 * r1]
4217
4218    movu           m1, [r0]               ; row 4 of pix0
4219    movu           m2, [r2]               ; row 4 of pix1
4220    movu           m3, [r0 + r1]          ; row 5 of pix0
4221    movu           m4, [r2 + r3]          ; row 5 of pix1
4222
4223    psadbw         m1, m2
4224    psadbw         m3, m4
4225    paddd          m0, m1
4226    paddd          m0, m3
4227
4228    movu           m1, [r0 + 2 * r1]      ; row 6 of pix0
4229    movu           m2, [r2 + 2 * r3]      ; row 6 of pix1
4230    movu           m3, [r0 + r5]          ; row 7 of pix0
4231    movu           m4, [r2 + r6]          ; row 7 of pix1
4232
4233    psadbw         m1, m2
4234    psadbw         m3, m4
4235    paddd          m0, m1
4236    paddd          m0, m3
4237
4238    lea            r2,     [r2 + 4 * r3]
4239    lea            r0,     [r0 + 4 * r1]
4240
4241    dec            r4d
4242    jnz           .loop
4243
4244    vextracti128   xm1, m0, 1
4245    paddd          xm0, xm1
4246    pshufd         xm1, xm0, 2
4247    paddd          xm0,xm1
4248    movd            eax, xm0
4249    RET
4250
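;-----------------------------------------------------------------------------
; int pixel_sad_48x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------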
INIT_YMM avx2
cglobal pixel_sad_48x64, 4,7,7
    xorps           m0, m0
    mov             r4d, 64/4
    lea             r5, [r1 * 3]
    lea             r6, [r3 * 3]
.loop
    movu           m1, [r0]               ; first 32 of row 0 of pix0
    movu           m2, [r2]               ; first 32 of row 0 of pix1
    movu           m3, [r0 + r1]          ; first 32 of row 1 of pix0
    movu           m4, [r2 + r3]          ; first 32 of row 1 of pix1
    movu           xm5, [r0 + 32]         ; last 16 of row 0 of pix0
    vinserti128    m5, m5, [r0 + r1 + 32], 1   ; last 16 of row 1 of pix0
    movu           xm6, [r2 + 32]         ; last 16 of row 0 of pix1
    vinserti128    m6, m6, [r2 + r3 + 32], 1   ; last 16 of row 1 of pix1

    psadbw         m1, m2
    psadbw         m3, m4
    psadbw         m5, m6
    paddd          m0, m1
    paddd          m0, m3
    paddd          m0, m5

    movu           m1, [r0 + 2 * r1]      ; first 32 of row 2 of pix0
    movu           m2, [r2 + 2 * r3]      ; first 32 of row 2 of pix1
    movu           m3, [r0 + r5]          ; first 32 of row 3 of pix0
    movu           m4, [r2 + r6]          ; first 32 of row 3 of pix1
    movu           xm5, [r0 + 32 + 2 * r1]     ; last 16 of row 2 of pix0
    vinserti128    m5, m5, [r0 + r5 + 32], 1   ; last 16 of row 3 of pix0
    movu           xm6, [r2 + 32 + 2 * r3]     ; last 16 of row 2 of pix1
    vinserti128    m6, m6, [r2 + r6 + 32], 1   ; last 16 of row 3 of pix1

    psadbw         m1, m2
    psadbw         m3, m4
    psadbw         m5, m6
    paddd          m0, m1
    paddd          m0, m3
    paddd          m0, m5

    lea     r2,     [r2 + 4 * r3]
    lea     r0,     [r0 + 4 * r1]

    dec         r4d
    jnz         .loop

    vextracti128   xm1, m0, 1
    paddd          xm0, xm1
    pshufd         xm1, xm0, 2
    paddd          xm0, xm1
    movd           eax, xm0
    RET

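;-----------------------------------------------------------------------------
; int pixel_sad_64x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------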
INIT_YMM avx2
cglobal pixel_sad_64x16, 4,5,6
    xorps           m0, m0
    xorps           m5, m5
    mov             r4d, 4
.loop
    movu           m1, [r0]               ; first 32 of row 0 of pix0
    movu           m2, [r2]               ; first 32 of row 0 of pix1
    movu           m3, [r0 + 32]          ; second 32 of row 0 of pix0
    movu           m4, [r2 + 32]          ; second 32 of row 0 of pix1

    psadbw         m1, m2
    psadbw         m3, m4
    paddd          m0, m1
    paddd          m5, m3

    movu           m1, [r0 + r1]          ; first 32 of row 1 of pix0
    movu           m2, [r2 + r3]          ; first 32 of row 1 of pix1
    movu           m3, [r0 + 32 + r1]     ; second 32 of row 1 of pix0
    movu           m4, [r2 + 32 + r3]     ; second 32 of row 1 of pix1

    psadbw         m1, m2
    psadbw         m3, m4
    paddd          m0, m1
    paddd          m5, m3

    lea     r2,     [r2 + 2 * r3]
    lea     r0,     [r0 + 2 * r1]

    movu           m1, [r0]               ; first 32 of row 2 of pix0
    movu           m2, [r2]               ; first 32 of row 2 of pix1
    movu           m3, [r0 + 32]          ; second 32 of row 2 of pix0
    movu           m4, [r2 + 32]          ; second 32 of row 2 of pix1

    psadbw         m1, m2
    psadbw         m3, m4
    paddd          m0, m1
    paddd          m5, m3

    movu           m1, [r0 + r1]          ; first 32 of row 3 of pix0
    movu           m2, [r2 + r3]          ; first 32 of row 3 of pix1
    movu           m3, [r0 + 32 + r1]     ; second 32 of row 3 of pix0
    movu           m4, [r2 + 32 + r3]     ; second 32 of row 3 of pix1

    psadbw         m1, m2
    psadbw         m3, m4
    paddd          m0, m1
    paddd          m5, m3

    lea     r2,     [r2 + 2 * r3]
    lea     r0,     [r0 + 2 * r1]

    dec         r4d
    jnz         .loop

    paddd          m0, m5
    vextracti128   xm1, m0, 1
    paddd          xm0, xm1
    pshufd         xm1, xm0, 2
    paddd          xm0, xm1
    movd           eax, xm0
    RET

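;-----------------------------------------------------------------------------
; int pixel_sad_64x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------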
INIT_YMM avx2
cglobal pixel_sad_64x32, 4,5,6
    xorps           m0, m0
    xorps           m5, m5
    mov             r4d, 16
.loop
    movu           m1, [r0]               ; first 32 of row 0 of pix0
    movu           m2, [r2]               ; first 32 of row 0 of pix1
    movu           m3, [r0 + 32]          ; second 32 of row 0 of pix0
    movu           m4, [r2 + 32]          ; second 32 of row 0 of pix1

    psadbw         m1, m2
    psadbw         m3, m4
    paddd          m0, m1
    paddd          m5, m3

    movu           m1, [r0 + r1]          ; first 32 of row 1 of pix0
    movu           m2, [r2 + r3]          ; first 32 of row 1 of pix1
    movu           m3, [r0 + 32 + r1]     ; second 32 of row 1 of pix0
    movu           m4, [r2 + 32 + r3]     ; second 32 of row 1 of pix1

    psadbw         m1, m2
    psadbw         m3, m4
    paddd          m0, m1
    paddd          m5, m3

    lea     r2,     [r2 + 2 * r3]
    lea     r0,     [r0 + 2 * r1]

    dec         r4d
    jnz         .loop

    paddd          m0, m5
    vextracti128   xm1, m0, 1
    paddd          xm0, xm1
    pshufd         xm1, xm0, 2
    paddd          xm0, xm1
    movd           eax, xm0
    RET

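;-----------------------------------------------------------------------------
; int pixel_sad_64x48( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------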
INIT_YMM avx2
cglobal pixel_sad_64x48, 4,7,6
    xorps           m0, m0
    xorps           m5, m5
    mov             r4d, 12
    lea             r5, [r1 * 3]
    lea             r6, [r3 * 3]
.loop
    movu           m1, [r0]               ; first 32 of row 0 of pix0
    movu           m2, [r2]               ; first 32 of row 0 of pix1
    movu           m3, [r0 + 32]          ; second 32 of row 0 of pix0
    movu           m4, [r2 + 32]          ; second 32 of row 0 of pix1

    psadbw         m1, m2
    psadbw         m3, m4
    paddd          m0, m1
    paddd          m5, m3

    movu           m1, [r0 + r1]          ; first 32 of row 1 of pix0
    movu           m2, [r2 + r3]          ; first 32 of row 1 of pix1
    movu           m3, [r0 + 32 + r1]     ; second 32 of row 1 of pix0
    movu           m4, [r2 + 32 + r3]     ; second 32 of row 1 of pix1

    psadbw         m1, m2
    psadbw         m3, m4
    paddd          m0, m1
    paddd          m5, m3

    movu           m1, [r0 + 2 * r1]      ; first 32 of row 2 of pix0
    movu           m2, [r2 + 2 * r3]      ; first 32 of row 2 of pix1
    movu           m3, [r0 + 2 * r1 + 32] ; second 32 of row 2 of pix0
    movu           m4, [r2 + 2 * r3 + 32] ; second 32 of row 2 of pix1

    psadbw         m1, m2
    psadbw         m3, m4
    paddd          m0, m1
    paddd          m5, m3

    movu           m1, [r0 + r5]          ; first 32 of row 3 of pix0
    movu           m2, [r2 + r6]          ; first 32 of row 3 of pix1
    movu           m3, [r0 + 32 + r5]     ; second 32 of row 3 of pix0
    movu           m4, [r2 + 32 + r6]     ; second 32 of row 3 of pix1

    psadbw         m1, m2
    psadbw         m3, m4
    paddd          m0, m1
    paddd          m5, m3

    lea     r2,     [r2 + 4 * r3]
    lea     r0,     [r0 + 4 * r1]

    dec         r4d
    jnz         .loop

    paddd          m0, m5
    vextracti128   xm1, m0, 1
    paddd          xm0, xm1
    pshufd         xm1, xm0, 2
    paddd          xm0, xm1
    movd           eax, xm0
    RET

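;-----------------------------------------------------------------------------
; int pixel_sad_64x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------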
INIT_YMM avx2
cglobal pixel_sad_64x64, 4,7,6
    xorps           m0, m0
    xorps           m5, m5
    mov             r4d, 8
    lea             r5, [r1 * 3]
    lea             r6, [r3 * 3]
.loop
    movu           m1, [r0]               ; first 32 of row 0 of pix0
    movu           m2, [r2]               ; first 32 of row 0 of pix1
    movu           m3, [r0 + 32]          ; second 32 of row 0 of pix0
    movu           m4, [r2 + 32]          ; second 32 of row 0 of pix1

    psadbw         m1, m2
    psadbw         m3, m4
    paddd          m0, m1
    paddd          m5, m3

    movu           m1, [r0 + r1]          ; first 32 of row 1 of pix0
    movu           m2, [r2 + r3]          ; first 32 of row 1 of pix1
    movu           m3, [r0 + 32 + r1]     ; second 32 of row 1 of pix0
    movu           m4, [r2 + 32 + r3]     ; second 32 of row 1 of pix1

    psadbw         m1, m2
    psadbw         m3, m4
    paddd          m0, m1
    paddd          m5, m3

    movu           m1, [r0 + 2 * r1]      ; first 32 of row 2 of pix0
    movu           m2, [r2 + 2 * r3]      ; first 32 of row 2 of pix1
    movu           m3, [r0 + 2 * r1 + 32] ; second 32 of row 2 of pix0
    movu           m4, [r2 + 2 * r3 + 32] ; second 32 of row 2 of pix1

    psadbw         m1, m2
    psadbw         m3, m4
    paddd          m0, m1
    paddd          m5, m3

    movu           m1, [r0 + r5]          ; first 32 of row 3 of pix0
    movu           m2, [r2 + r6]          ; first 32 of row 3 of pix1
    movu           m3, [r0 + 32 + r5]     ; second 32 of row 3 of pix0
    movu           m4, [r2 + 32 + r6]     ; second 32 of row 3 of pix1

    psadbw         m1, m2
    psadbw         m3, m4
    paddd          m0, m1
    paddd          m5, m3

    lea     r2,     [r2 + 4 * r3]
    lea     r0,     [r0 + 4 * r1]

    movu           m1, [r0]               ; first 32 of row 4 of pix0
    movu           m2, [r2]               ; first 32 of row 4 of pix1
    movu           m3, [r0 + 32]          ; second 32 of row 4 of pix0
    movu           m4, [r2 + 32]          ; second 32 of row 4 of pix1

    psadbw         m1, m2
    psadbw         m3, m4
    paddd          m0, m1
    paddd          m5, m3

    movu           m1, [r0 + r1]          ; first 32 of row 5 of pix0
    movu           m2, [r2 + r3]          ; first 32 of row 5 of pix1
    movu           m3, [r0 + 32 + r1]     ; second 32 of row 5 of pix0
    movu           m4, [r2 + 32 + r3]     ; second 32 of row 5 of pix1

    psadbw         m1, m2
    psadbw         m3, m4
    paddd          m0, m1
    paddd          m5, m3

    movu           m1, [r0 + 2 * r1]      ; first 32 of row 6 of pix0
    movu           m2, [r2 + 2 * r3]      ; first 32 of row 6 of pix1
    movu           m3, [r0 + 2 * r1 + 32] ; second 32 of row 6 of pix0
    movu           m4, [r2 + 2 * r3 + 32] ; second 32 of row 6 of pix1

    psadbw         m1, m2
    psadbw         m3, m4
    paddd          m0, m1
    paddd          m5, m3

    movu           m1, [r0 + r5]          ; first 32 of row 7 of pix0
    movu           m2, [r2 + r6]          ; first 32 of row 7 of pix1
    movu           m3, [r0 + 32 + r5]     ; second 32 of row 7 of pix0
    movu           m4, [r2 + 32 + r6]     ; second 32 of row 7 of pix1

    psadbw         m1, m2
    psadbw         m3, m4
    paddd          m0, m1
    paddd          m5, m3

    lea     r2,     [r2 + 4 * r3]
    lea     r0,     [r0 + 4 * r1]

    dec         r4d
    jnz         .loop

    paddd          m0, m5
    vextracti128   xm1, m0, 1
    paddd          xm0, xm1
    pshufd         xm1, xm0, 2
    paddd          xm0, xm1
    movd           eax, xm0
    RET

%endif
