;*****************************************************************************
;* sad16-a.asm: x86 high bit depth sad functions
;*****************************************************************************
;* Copyright (C) 2010-2013 x264 project
;*
;* Authors: Oskar Arvidsson <oskar@irock.se>
;*          Henrik Gramner <henrik@gramner.com>
;*          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
;*          Min Chen <chenm003@163.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION .text

cextern pw_1

;=============================================================================
; SAD MMX
;=============================================================================

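; Note: each SAD_INC_*_MMX step below takes per-lane absolute differences with
; psubw + ABSW2. For BIT_DEPTH <= 10 the sums are kept as 16-bit words: in the
; worst case handled here (16x16) a lane accumulates 64 diffs of at most 1023,
; i.e. 65472, which still fits an unsigned word, so 16x16 finishes with HADDUW
; and the smaller blocks with HADDW. Above 10 bits that bound no longer holds,
; so each step is widened to dwords with pmaddwd against pw_1 and the final
; reduction is HADDD.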
%macro SAD_INC_1x16P_MMX 0
    movu    m1, [r0+ 0]
    movu    m2, [r0+ 8]
    movu    m3, [r0+16]
    movu    m4, [r0+24]
    psubw   m1, [r2+ 0]
    psubw   m2, [r2+ 8]
    psubw   m3, [r2+16]
    psubw   m4, [r2+24]
    ABSW2   m1, m2, m1, m2, m5, m6
    ABSW2   m3, m4, m3, m4, m7, m5
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    paddw   m1, m2
    paddw   m3, m4
  %if BIT_DEPTH <= 10
    paddw   m0, m1
    paddw   m0, m3
  %else
    paddw   m1, m3
    pmaddwd m1, [pw_1]
    paddd   m0, m1
  %endif
%endmacro

%macro SAD_INC_2x8P_MMX 0
    movu    m1, [r0+0]
    movu    m2, [r0+8]
    movu    m3, [r0+2*r1+0]
    movu    m4, [r0+2*r1+8]
    psubw   m1, [r2+0]
    psubw   m2, [r2+8]
    psubw   m3, [r2+2*r3+0]
    psubw   m4, [r2+2*r3+8]
    ABSW2   m1, m2, m1, m2, m5, m6
    ABSW2   m3, m4, m3, m4, m7, m5
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    paddw   m1, m2
    paddw   m3, m4
  %if BIT_DEPTH <= 10
    paddw   m0, m1
    paddw   m0, m3
  %else
    paddw   m1, m3
    pmaddwd m1, [pw_1]
    paddd   m0, m1
  %endif
%endmacro

%macro SAD_INC_2x4P_MMX 0
    movu    m1, [r0]
    movu    m2, [r0+2*r1]
    psubw   m1, [r2]
    psubw   m2, [r2+2*r3]
    ABSW2   m1, m2, m1, m2, m3, m4
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
  %if BIT_DEPTH <= 10
    paddw   m0, m1
    paddw   m0, m2
  %else
    paddw   m1, m2
    pmaddwd m1, [pw_1]
    paddd   m0, m1
  %endif
%endmacro

;-----------------------------------------------------------------------------
; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------
%macro SAD_MMX 3
cglobal pixel_sad_%1x%2, 4,5-(%2&4/4)
    pxor    m0, m0
%if %2 == 4
    SAD_INC_%3x%1P_MMX
    SAD_INC_%3x%1P_MMX
%else
    mov    r4d, %2/%3
.loop:
    SAD_INC_%3x%1P_MMX
    dec    r4d
    jg .loop
%endif
%if %1*%2 == 256
  %if BIT_DEPTH <= 10
    HADDUW  m0, m1
  %else
    HADDD   m0, m1
  %endif
%else
  %if BIT_DEPTH <= 10
    HADDW   m0, m1
  %else
    HADDD   m0, m1
  %endif
%endif
    movd   eax, m0
    RET
%endmacro

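; SAD_MMX width, height, rows-per-step: the third argument selects the row macro
; expanded per loop iteration (1 -> SAD_INC_1x16P_MMX, one 16-pixel row;
; 2 -> SAD_INC_2x8P_MMX or SAD_INC_2x4P_MMX, two rows, picked by the width).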
INIT_MMX mmx2
SAD_MMX 16, 16, 1
SAD_MMX 16,  8, 1
SAD_MMX  8, 16, 2
SAD_MMX  8,  8, 2
SAD_MMX  8,  4, 2
SAD_MMX  4,  8, 2
SAD_MMX  4,  4, 2
SAD_MMX  4, 16, 2
INIT_MMX ssse3
SAD_MMX  4,  8, 2
SAD_MMX  4,  4, 2

;=============================================================================
; SAD XMM
;=============================================================================

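; The SSE2 row helpers below (SAD_1x32/1x24/1x48/1x64/1x12) widen every row to
; dword sums with pmaddwd against pw_1 before accumulating into m0, so they are
; safe at any supported bit depth and only a final HADDD is needed.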
%macro SAD_1x32 0
    movu    m1, [r2+ 0]
    movu    m2, [r2+16]
    movu    m3, [r2+32]
    movu    m4, [r2+48]
    psubw   m1, [r0+0]
    psubw   m2, [r0+16]
    psubw   m3, [r0+32]
    psubw   m4, [r0+48]
    ABSW2   m1, m2, m1, m2, m5, m6
    pmaddwd m1, [pw_1]
    pmaddwd m2, [pw_1]
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    ABSW2   m3, m4, m3, m4, m7, m5
    pmaddwd m3, [pw_1]
    pmaddwd m4, [pw_1]
    paddd   m1, m2
    paddd   m3, m4
    paddd   m0, m1
    paddd   m0, m3
%endmacro

%macro SAD_1x24 0
    movu    m1, [r2+ 0]
    movu    m2, [r2+16]
    movu    m3, [r2+32]
    psubw   m1, [r0+0]
    psubw   m2, [r0+16]
    psubw   m3, [r0+32]
    ABSW2   m1, m2, m1, m2, m4, m6
    pmaddwd m1, [pw_1]
    pmaddwd m2, [pw_1]
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    pxor    m4, m4
    psubw   m4, m3
    pmaxsw  m3, m4
    pmaddwd m3, [pw_1]
    paddd   m1, m2
    paddd   m0, m1
    paddd   m0, m3
%endmacro

%macro SAD_1x48 0
    movu    m1, [r2+ 0]
    movu    m2, [r2+16]
    movu    m3, [r2+32]
    movu    m4, [r2+48]
    psubw   m1, [r0+0]
    psubw   m2, [r0+16]
    psubw   m3, [r0+32]
    psubw   m4, [r0+48]
    ABSW2   m1, m2, m1, m2, m5, m6
    pmaddwd m1, [pw_1]
    pmaddwd m2, [pw_1]
    ABSW2   m3, m4, m3, m4, m7, m5
    pmaddwd m3, [pw_1]
    pmaddwd m4, [pw_1]
    paddd   m1, m2
    paddd   m3, m4
    paddd   m0, m1
    paddd   m0, m3
    movu    m1, [r2+64]
    movu    m2, [r2+80]
    psubw   m1, [r0+64]
    psubw   m2, [r0+80]
    ABSW2   m1, m2, m1, m2, m3, m4
    pmaddwd m1, [pw_1]
    pmaddwd m2, [pw_1]
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    paddd   m0, m1
    paddd   m0, m2
%endmacro

%macro SAD_1x64 0
    movu    m1, [r2+ 0]
    movu    m2, [r2+16]
    movu    m3, [r2+32]
    movu    m4, [r2+48]
    psubw   m1, [r0+0]
    psubw   m2, [r0+16]
    psubw   m3, [r0+32]
    psubw   m4, [r0+48]
    ABSW2   m1, m2, m1, m2, m5, m6
    pmaddwd m1, [pw_1]
    pmaddwd m2, [pw_1]
    ABSW2   m3, m4, m3, m4, m7, m5
    pmaddwd m3, [pw_1]
    pmaddwd m4, [pw_1]
    paddd   m1, m2
    paddd   m3, m4
    paddd   m0, m1
    paddd   m0, m3
    movu    m1, [r2+64]
    movu    m2, [r2+80]
    movu    m3, [r2+96]
    movu    m4, [r2+112]
    psubw   m1, [r0+64]
    psubw   m2, [r0+80]
    psubw   m3, [r0+96]
    psubw   m4, [r0+112]
    ABSW2   m1, m2, m1, m2, m5, m6
    pmaddwd m1, [pw_1]
    pmaddwd m2, [pw_1]
    ABSW2   m3, m4, m3, m4, m7, m5
    pmaddwd m3, [pw_1]
    pmaddwd m4, [pw_1]
    paddd   m1, m2
    paddd   m3, m4
    paddd   m0, m1
    paddd   m0, m3
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
%endmacro

%macro SAD_1x12 0
    movu    m1, [r2+0]
    movh    m2, [r2+16]
    psubw   m1, [r0+0]
    movh    m3, [r0+16]
    psubw   m2, m3
    ABSW2   m1, m2, m1, m2, m4, m6
    pmaddwd m1, [pw_1]
    pmaddwd m2, [pw_1]
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    paddd   m1, m2
    paddd   m0, m1
%endmacro

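; SAD_INC_2ROW processes two rows per expansion: when the row does not fit one
; register (2*width > mmsize, i.e. 16-pixel rows with SSE2) it uses two loads per
; row, otherwise a single load. Each pair of rows is widened with pmaddwd [pw_1]
; before the dword accumulate, so the word sums never have to survive more than
; two rows.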
%macro SAD_INC_2ROW 1
%if 2*%1 > mmsize
    movu    m1, [r2+ 0]
    movu    m2, [r2+16]
    movu    m3, [r2+2*r3+ 0]
    movu    m4, [r2+2*r3+16]
    psubw   m1, [r0+ 0]
    psubw   m2, [r0+16]
    psubw   m3, [r0+2*r1+ 0]
    psubw   m4, [r0+2*r1+16]
    ABSW2   m1, m2, m1, m2, m5, m6
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    ABSW2   m3, m4, m3, m4, m7, m5
    paddw   m1, m2
    paddw   m3, m4
    paddw   m1, m3
    pmaddwd m1, [pw_1]
    paddd   m0, m1
%else
    movu    m1, [r2]
    movu    m2, [r2+2*r3]
    psubw   m1, [r0]
    psubw   m2, [r0+2*r1]
    ABSW2   m1, m2, m1, m2, m3, m4
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    paddw   m1, m2
    pmaddwd m1, [pw_1]
    paddd   m0, m1
%endif
%endmacro

%macro SAD_INC_2ROW_Nx64 1
%if 2*%1 > mmsize
    movu    m1, [r2 + 0]
    movu    m2, [r2 + 16]
    movu    m3, [r2 + 2 * r3 + 0]
    movu    m4, [r2 + 2 * r3 + 16]
    psubw   m1, [r0 + 0]
    psubw   m2, [r0 + 16]
    psubw   m3, [r0 + 2 * r1 + 0]
    psubw   m4, [r0 + 2 * r1 + 16]
    ABSW2   m1, m2, m1, m2, m5, m6
    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 4 * r3]
    ABSW2   m3, m4, m3, m4, m7, m5
    paddw   m1, m2
    paddw   m3, m4
    paddw   m1, m3
    pmaddwd m1, [pw_1]
    paddd   m0, m1
%else
    movu    m1, [r2]
    movu    m2, [r2 + 2 * r3]
    psubw   m1, [r0]
    psubw   m2, [r0 + 2 * r1]
    ABSW2   m1, m2, m1, m2, m3, m4
    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 4 * r3]
    paddw   m1, m2
    pmaddwd m1, [pw_1]
    paddd   m0, m1
%endif
%endmacro

;-----------------------------------------------------------------------------
; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------
%macro SAD 2
cglobal pixel_sad_%1x%2, 4,5,8
    pxor    m0, m0
%if %2 == 4
    SAD_INC_2ROW %1
    SAD_INC_2ROW %1
%else
    mov     r4d, %2/2
.loop:
    SAD_INC_2ROW %1
    dec    r4d
    jg .loop
%endif
    HADDD   m0, m1
    movd    eax, xm0
    RET
%endmacro

;-----------------------------------------------------------------------------
; int pixel_sad_Nx64( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------
%macro SAD_Nx64 1
cglobal pixel_sad_%1x64, 4,5,8
    pxor    m0, m0
    mov     r4d, 64 / 2
.loop:
    SAD_INC_2ROW_Nx64 %1
    dec    r4d
    jg .loop

    HADDD   m0, m1
    movd    eax, xm0
    RET
%endmacro

INIT_XMM sse2
SAD  16,  4
SAD  16,  8
SAD  16, 12
SAD  16, 16
SAD  16, 32
SAD_Nx64  16

INIT_XMM sse2
SAD  8,  4
SAD  8,  8
SAD  8, 16
SAD  8, 32

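; The AVX2 kernels below first double r1/r3 so the pixel strides become byte
; strides ([r0 + r1] then addresses the next row) and, where four rows are
; processed per iteration, keep 3*stride in r5/r6. Differences are accumulated
; as 16-bit words, split across one or more ymm accumulators, and only reduced
; at the end (HADDW for the short 32x8/32x16 blocks, HADDUWD + HADDD elsewhere);
; the per-lane row counts are small enough that the word lanes cannot overflow,
; assuming <= 10-bit input.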
INIT_YMM avx2
cglobal pixel_sad_16x64, 4,7,4
    pxor    m0, m0
    pxor    m3, m3
    mov     r4d, 64 / 8
    add     r3d, r3d
    add     r1d, r1d
    lea     r5,     [r1 * 3]
    lea     r6,     [r3 * 3]
.loop:
    movu    m1, [r2]
    movu    m2, [r2 + r3]
    psubw   m1, [r0]
    psubw   m2, [r0 + r1]
    pabsw   m1, m1
    pabsw   m2, m2
    paddw   m0, m1
    paddw   m3, m2

    movu    m1, [r2 + 2 * r3]
    movu    m2, [r2 + r6]
    psubw   m1, [r0 + 2 * r1]
    psubw   m2, [r0 + r5]
    pabsw   m1, m1
    pabsw   m2, m2
    paddw   m0, m1
    paddw   m3, m2

    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 4 * r3]

    movu    m1, [r2]
    movu    m2, [r2 + r3]
    psubw   m1, [r0]
    psubw   m2, [r0 + r1]
    pabsw   m1, m1
    pabsw   m2, m2
    paddw   m0, m1
    paddw   m3, m2

    movu    m1, [r2 + 2 * r3]
    movu    m2, [r2 + r6]
    psubw   m1, [r0 + 2 * r1]
    psubw   m2, [r0 + r5]
    pabsw   m1, m1
    pabsw   m2, m2
    paddw   m0, m1
    paddw   m3, m2

    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 4 * r3]

    dec    r4d
    jg .loop

    HADDUWD m0, m1
    HADDUWD m3, m1
    HADDD   m0, m1
    HADDD   m3, m1
    paddd   m0, m3

    movd    eax, xm0
    RET

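; 32-pixel rows take two 32-byte loads per row ([r2] and [r2 + 32] after the
; stride doubling above). pixel_sad_32x8 and pixel_sad_32x16 keep a single word
; accumulator and finish with HADDW: each lane ends up summing 16 and 32
; absolute differences respectively, which still fits a signed word for
; 10-bit input.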
INIT_YMM avx2
cglobal pixel_sad_32x8, 4,7,5
    pxor    m0, m0
    mov     r4d, 8/4
    add     r3d, r3d
    add     r1d, r1d
    lea     r5,     [r1 * 3]
    lea     r6,     [r3 * 3]
.loop:
    movu    m1, [r2]
    movu    m2, [r2 + 32]
    movu    m3, [r2 + r3]
    movu    m4, [r2 + r3 + 32]
    psubw   m1, [r0]
    psubw   m2, [r0 + 32]
    psubw   m3, [r0 + r1]
    psubw   m4, [r0 + r1 + 32]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3

    movu    m1, [r2 + 2 * r3]
    movu    m2, [r2 + 2 * r3 + 32]
    movu    m3, [r2 + r6]
    movu    m4, [r2 + r6 + 32]
    psubw   m1, [r0 + 2 * r1]
    psubw   m2, [r0 + 2 * r1 + 32]
    psubw   m3, [r0 + r5]
    psubw   m4, [r0 + r5 + 32]
    pabsw   m1, m1
    pabsw   m2, m2
    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 4 * r3]
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3

    dec    r4d
    jg .loop

    HADDW   m0, m1
    movd    eax, xm0
    RET

INIT_YMM avx2
cglobal pixel_sad_32x16, 4,7,5
    pxor    m0, m0
    mov     r4d, 16/8
    add     r3d, r3d
    add     r1d, r1d
    lea     r5,     [r1 * 3]
    lea     r6,     [r3 * 3]
.loop:
    movu    m1, [r2]
    movu    m2, [r2 + 32]
    movu    m3, [r2 + r3]
    movu    m4, [r2 + r3 + 32]
    psubw   m1, [r0]
    psubw   m2, [r0 + 32]
    psubw   m3, [r0 + r1]
    psubw   m4, [r0 + r1 + 32]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3

    movu    m1, [r2 + 2 * r3]
    movu    m2, [r2 + 2 * r3 + 32]
    movu    m3, [r2 + r6]
    movu    m4, [r2 + r6 + 32]
    psubw   m1, [r0 + 2 * r1]
    psubw   m2, [r0 + 2 * r1 + 32]
    psubw   m3, [r0 + r5]
    psubw   m4, [r0 + r5 + 32]
    pabsw   m1, m1
    pabsw   m2, m2
    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 4 * r3]
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3

    movu    m1, [r2]
    movu    m2, [r2 + 32]
    movu    m3, [r2 + r3]
    movu    m4, [r2 + r3 + 32]
    psubw   m1, [r0]
    psubw   m2, [r0 + 32]
    psubw   m3, [r0 + r1]
    psubw   m4, [r0 + r1 + 32]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3

    movu    m1, [r2 + 2 * r3]
    movu    m2, [r2 + 2 * r3 + 32]
    movu    m3, [r2 + r6]
    movu    m4, [r2 + r6 + 32]
    psubw   m1, [r0 + 2 * r1]
    psubw   m2, [r0 + 2 * r1 + 32]
    psubw   m3, [r0 + r5]
    psubw   m4, [r0 + r5 + 32]
    pabsw   m1, m1
    pabsw   m2, m2
    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 4 * r3]
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3

    dec    r4d
    jg .loop

    HADDW   m0, m1
    movd    eax, xm0
    RET

INIT_YMM avx2
cglobal pixel_sad_32x24, 4,7,5
    pxor    m0, m0
    mov     r4d, 24/4
    add     r3d, r3d
    add     r1d, r1d
    lea     r5,     [r1 * 3]
    lea     r6,     [r3 * 3]
.loop:
    movu    m1, [r2]
    movu    m2, [r2 + 32]
    movu    m3, [r2 + r3]
    movu    m4, [r2 + r3 + 32]
    psubw   m1, [r0]
    psubw   m2, [r0 + 32]
    psubw   m3, [r0 + r1]
    psubw   m4, [r0 + r1 + 32]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3

    movu    m1, [r2 + 2 * r3]
    movu    m2, [r2 + 2 * r3 + 32]
    movu    m3, [r2 + r6]
    movu    m4, [r2 + r6 + 32]
    psubw   m1, [r0 + 2 * r1]
    psubw   m2, [r0 + 2 * r1 + 32]
    psubw   m3, [r0 + r5]
    psubw   m4, [r0 + r5 + 32]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3

    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 4 * r3]

    dec    r4d
    jg .loop

    HADDUWD m0, m1
    HADDD   m0, m1
    movd    eax, xm0
    RET


INIT_YMM avx2
cglobal pixel_sad_32x32, 4,7,5
    pxor    m0, m0
    mov     r4d, 32/4
    add     r3d, r3d
    add     r1d, r1d
    lea     r5,     [r1 * 3]
    lea     r6,     [r3 * 3]
.loop:
    movu    m1, [r2]
    movu    m2, [r2 + 32]
    movu    m3, [r2 + r3]
    movu    m4, [r2 + r3 + 32]
    psubw   m1, [r0]
    psubw   m2, [r0 + 32]
    psubw   m3, [r0 + r1]
    psubw   m4, [r0 + r1 + 32]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3

    movu    m1, [r2 + 2 * r3]
    movu    m2, [r2 + 2 * r3 + 32]
    movu    m3, [r2 + r6]
    movu    m4, [r2 + r6 + 32]
    psubw   m1, [r0 + 2 * r1]
    psubw   m2, [r0 + 2 * r1 + 32]
    psubw   m3, [r0 + r5]
    psubw   m4, [r0 + r5 + 32]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3

    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 4 * r3]

    dec    r4d
    jg .loop

    HADDUWD m0, m1
    HADDD   m0, m1
    movd    eax, xm0
    RET

INIT_YMM avx2
cglobal pixel_sad_32x64, 4,7,6
    pxor    m0, m0
    pxor    m5, m5
    mov     r4d, 64 / 4
    add     r3d, r3d
    add     r1d, r1d
    lea     r5,     [r1 * 3]
    lea     r6,     [r3 * 3]
.loop:
    movu    m1, [r2]
    movu    m2, [r2 + 32]
    movu    m3, [r2 + r3]
    movu    m4, [r2 + r3 + 32]
    psubw   m1, [r0]
    psubw   m2, [r0 + 32]
    psubw   m3, [r0 + r1]
    psubw   m4, [r0 + r1 + 32]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m5, m3

    movu    m1, [r2 + 2 * r3]
    movu    m2, [r2 + 2 * r3 + 32]
    movu    m3, [r2 + r6]
    movu    m4, [r2 + r6 + 32]
    psubw   m1, [r0 + 2 * r1]
    psubw   m2, [r0 + 2 * r1 + 32]
    psubw   m3, [r0 + r5]
    psubw   m4, [r0 + r5 + 32]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m5, m3
    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 4 * r3]

    dec    r4d
    jg .loop

    HADDUWD m0, m1
    HADDUWD m5, m1
    HADDD   m0, m1
    HADDD   m5, m1
    paddd   m0, m5

    movd    eax, xm0
    RET

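; pixel_sad_48x64 keeps one word accumulator per 16-pixel column (m0/m5/m6); each
; lane sums 64 absolute differences (one per row), i.e. at most 64*1023 = 65472
; for 10-bit input, which still fits an unsigned word before the HADDUWD widening.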
INIT_YMM avx2
cglobal pixel_sad_48x64, 4, 5, 7
    pxor    m0, m0
    pxor    m5, m5
    pxor    m6, m6
    mov     r4d, 64/2
    add     r3d, r3d
    add     r1d, r1d
.loop:
    movu    m1, [r2 + 0 * mmsize]
    movu    m2, [r2 + 1 * mmsize]
    movu    m3, [r2 + 2 * mmsize]
    psubw   m1, [r0 + 0 * mmsize]
    psubw   m2, [r0 + 1 * mmsize]
    psubw   m3, [r0 + 2 * mmsize]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    paddw   m0, m1
    paddw   m5, m2
    paddw   m6, m3

    movu    m1, [r2 + r3 + 0 * mmsize]
    movu    m2, [r2 + r3 + 1 * mmsize]
    movu    m3, [r2 + r3 + 2 * mmsize]
    psubw   m1, [r0 + r1 + 0 * mmsize]
    psubw   m2, [r0 + r1 + 1 * mmsize]
    psubw   m3, [r0 + r1 + 2 * mmsize]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    paddw   m0, m1
    paddw   m5, m2
    paddw   m6, m3

    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]

    dec     r4d
    jg      .loop

    HADDUWD m0, m1
    HADDUWD m5, m1
    HADDUWD m6, m1
    paddd   m0, m5
    paddd   m0, m6
    HADDD   m0, m1
    movd    eax, xm0
    RET

INIT_YMM avx2
cglobal pixel_sad_64x16, 4, 5, 5
    pxor    m0, m0
    mov     r4d, 16 / 2
    add     r3d, r3d
    add     r1d, r1d
.loop:
    movu    m1, [r2 + 0]
    movu    m2, [r2 + 32]
    movu    m3, [r2 + 2 * 32]
    movu    m4, [r2 + 3 * 32]
    psubw   m1, [r0 + 0]
    psubw   m2, [r0 + 32]
    psubw   m3, [r0 + 2 * 32]
    psubw   m4, [r0 + 3 * 32]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3
    movu    m1, [r2 + r3]
    movu    m2, [r2 + r3 + 32]
    movu    m3, [r2 + r3 + 64]
    movu    m4, [r2 + r3 + 96]
    psubw   m1, [r0 + r1]
    psubw   m2, [r0 + r1 + 32]
    psubw   m3, [r0 + r1 + 64]
    psubw   m4, [r0 + r1 + 96]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3
    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]

    dec    r4d
    jg     .loop

    HADDUWD m0, m1
    HADDD   m0, m1
    movd    eax, xm0
    RET

INIT_YMM avx2
cglobal pixel_sad_64x32, 4, 5, 6
    pxor    m0, m0
    pxor    m5, m5
    mov     r4d, 32 / 2
    add     r3d, r3d
    add     r1d, r1d
.loop:
    movu    m1, [r2 + 0]
    movu    m2, [r2 + 32]
    movu    m3, [r2 + 2 * 32]
    movu    m4, [r2 + 3 * 32]
    psubw   m1, [r0 + 0]
    psubw   m2, [r0 + 32]
    psubw   m3, [r0 + 2 * 32]
    psubw   m4, [r0 + 3 * 32]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m5, m3

    movu    m1, [r2 + r3]
    movu    m2, [r2 + r3 + 32]
    movu    m3, [r2 + r3 + 64]
    movu    m4, [r2 + r3 + 96]
    psubw   m1, [r0 + r1]
    psubw   m2, [r0 + r1 + 32]
    psubw   m3, [r0 + r1 + 64]
    psubw   m4, [r0 + r1 + 96]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m5, m3
    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]

    dec    r4d
    jg     .loop

    HADDUWD m0, m1
    HADDUWD m5, m1
    paddd   m0, m5
    HADDD   m0, m1

    movd    eax, xm0
    RET

INIT_YMM avx2
cglobal pixel_sad_64x48, 4, 5, 8
    pxor    m0, m0
    pxor    m5, m5
    pxor    m6, m6
    pxor    m7, m7
    mov     r4d, 48 / 2
    add     r3d, r3d
    add     r1d, r1d
.loop:
    movu    m1, [r2 + 0]
    movu    m2, [r2 + 32]
    movu    m3, [r2 + 64]
    movu    m4, [r2 + 96]
    psubw   m1, [r0 + 0]
    psubw   m2, [r0 + 32]
    psubw   m3, [r0 + 64]
    psubw   m4, [r0 + 96]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m0, m1
    paddw   m5, m2
    paddw   m6, m3
    paddw   m7, m4

    movu    m1, [r2 + r3]
    movu    m2, [r2 + r3 + 32]
    movu    m3, [r2 + r3 + 64]
    movu    m4, [r2 + r3 + 96]
    psubw   m1, [r0 + r1]
    psubw   m2, [r0 + r1 + 32]
    psubw   m3, [r0 + r1 + 64]
    psubw   m4, [r0 + r1 + 96]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m0, m1
    paddw   m5, m2
    paddw   m6, m3
    paddw   m7, m4

    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]

    dec    r4d
    jg     .loop

    HADDUWD m0, m1
    HADDUWD m5, m1
    HADDUWD m6, m1
    HADDUWD m7, m1
    paddd   m0, m5
    paddd   m0, m6
    paddd   m0, m7
    HADDD   m0, m1
    movd    eax, xm0
    RET

INIT_YMM avx2
cglobal pixel_sad_64x64, 4, 5, 8
    pxor    m0, m0
    pxor    m5, m5
    pxor    m6, m6
    pxor    m7, m7
    mov     r4d, 64 / 2
    add     r3d, r3d
    add     r1d, r1d
.loop:
    movu    m1, [r2 + 0]
    movu    m2, [r2 + 32]
    movu    m3, [r2 + 64]
    movu    m4, [r2 + 96]
    psubw   m1, [r0 + 0]
    psubw   m2, [r0 + 32]
    psubw   m3, [r0 + 64]
    psubw   m4, [r0 + 96]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m0, m1
    paddw   m5, m2
    paddw   m6, m3
    paddw   m7, m4

    movu    m1, [r2 + r3]
    movu    m2, [r2 + r3 + 32]
    movu    m3, [r2 + r3 + 64]
    movu    m4, [r2 + r3 + 96]
    psubw   m1, [r0 + r1]
    psubw   m2, [r0 + r1 + 32]
    psubw   m3, [r0 + r1 + 64]
    psubw   m4, [r0 + r1 + 96]
    pabsw   m1, m1
    pabsw   m2, m2
    pabsw   m3, m3
    pabsw   m4, m4
    paddw   m0, m1
    paddw   m5, m2
    paddw   m6, m3
    paddw   m7, m4

    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]

    dec    r4d
    jg     .loop

    HADDUWD m0, m1
    HADDUWD m5, m1
    HADDUWD m6, m1
    HADDUWD m7, m1
    paddd   m0, m5
    paddd   m0, m6
    paddd   m0, m7
    HADDD   m0, m1
    movd    eax, xm0
    RET

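; The SSE2 wrappers below pair the SAD_1xN row helpers with a simple row loop:
; each iteration expands the helper four times, so r4d counts height/4 groups,
; and since the helpers already produce dword sums a single HADDD finishes the
; reduction.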
;------------------------------------------------------------------
; int pixel_sad_32xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
;------------------------------------------------------------------
%macro SAD_32 2
cglobal pixel_sad_%1x%2, 4,5,8
    pxor    m0,  m0
    mov     r4d, %2/4
.loop:
    SAD_1x32
    SAD_1x32
    SAD_1x32
    SAD_1x32
    dec     r4d
    jnz     .loop

    HADDD   m0, m1
    movd    eax, xm0
    RET
%endmacro

INIT_XMM sse2
SAD_32  32,  8
SAD_32  32, 16
SAD_32  32, 24
SAD_32  32, 32
SAD_32  32, 64

;------------------------------------------------------------------
; int pixel_sad_64xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
;------------------------------------------------------------------
%macro SAD_64 2
cglobal pixel_sad_%1x%2, 4,5,8
    pxor    m0, m0
    mov     r4d, %2/4
.loop:
    SAD_1x64
    SAD_1x64
    SAD_1x64
    SAD_1x64
    dec     r4d
    jnz     .loop

    HADDD   m0, m1
    movd    eax, xmm0
    RET
%endmacro

INIT_XMM sse2
SAD_64  64, 16
SAD_64  64, 32
SAD_64  64, 48
SAD_64  64, 64

;------------------------------------------------------------------
; int pixel_sad_48xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
;------------------------------------------------------------------
%macro SAD_48 2
cglobal pixel_sad_%1x%2, 4,5,8
    pxor    m0, m0
    mov     r4d, %2/4
.loop:
    SAD_1x48
    SAD_1x48
    SAD_1x48
    SAD_1x48
    dec     r4d
    jnz     .loop

    HADDD   m0, m1
    movd    eax, xmm0
    RET
%endmacro

INIT_XMM sse2
SAD_48  48, 64

;------------------------------------------------------------------
; int pixel_sad_24xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
;------------------------------------------------------------------
%macro SAD_24 2
cglobal pixel_sad_%1x%2, 4,5,8
    pxor    m0, m0
    mov     r4d, %2/4
.loop:
    SAD_1x24
    SAD_1x24
    SAD_1x24
    SAD_1x24
    dec     r4d
    jnz     .loop

    HADDD   m0, m1
    movd    eax, xmm0
    RET
%endmacro

INIT_XMM sse2
SAD_24  24, 32

;------------------------------------------------------------------
; int pixel_sad_12xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
;------------------------------------------------------------------
%macro SAD_12 2
cglobal pixel_sad_%1x%2, 4,5,8
    pxor    m0,  m0
    mov     r4d, %2/4
.loop:
    SAD_1x12
    SAD_1x12
    SAD_1x12
    SAD_1x12
    dec     r4d
    jnz     .loop

    HADDD   m0, m1
    movd    eax, xmm0
    RET
%endmacro

INIT_XMM sse2
SAD_12  12, 16


;=============================================================================
; SAD x3/x4
;=============================================================================

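; sad_x3/sad_x4 compute the SAD of one encoder block (fenc, laid out with a fixed
; FENC_STRIDE) against three or four candidate reference blocks that share a
; single stride, writing the results to a small scores array. A C reference
; sketch of the x3 form (illustrative only, not part of this file):
;
;   for (int i = 0; i < 3; i++) scores[i] = 0;
;   for (int y = 0; y < H; y++)
;       for (int x = 0; x < W; x++)
;       {
;           scores[0] += abs(fenc[y * FENC_STRIDE + x] - pix0[y * i_stride + x]);
;           scores[1] += abs(fenc[y * FENC_STRIDE + x] - pix1[y * i_stride + x]);
;           scores[2] += abs(fenc[y * FENC_STRIDE + x] - pix2[y * i_stride + x]);
;       }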
%macro SAD_X3_INC_P 0
    add     r0, 4*FENC_STRIDE
    lea     r1, [r1+4*r4]
    lea     r2, [r2+4*r4]
    lea     r3, [r3+4*r4]
%endmacro

%macro SAD_X3_ONE_START 0
    mova    m3, [r0]
    movu    m0, [r1]
    movu    m1, [r2]
    movu    m2, [r3]
    psubw   m0, m3
    psubw   m1, m3
    psubw   m2, m3
    ABSW2   m0, m1, m0, m1, m4, m5
    ABSW    m2, m2, m6
    pmaddwd m0, [pw_1]
    pmaddwd m1, [pw_1]
    pmaddwd m2, [pw_1]
%endmacro

%macro SAD_X3_ONE 2
    mova    m6, [r0+%1]
    movu    m3, [r1+%2]
    movu    m4, [r2+%2]
    movu    m5, [r3+%2]
    psubw   m3, m6
    psubw   m4, m6
    psubw   m5, m6
    ABSW2   m3, m4, m3, m4, m7, m6
    ABSW    m5, m5, m6
    pmaddwd m3, [pw_1]
    pmaddwd m4, [pw_1]
    pmaddwd m5, [pw_1]
    paddd   m0, m3
    paddd   m1, m4
    paddd   m2, m5
%endmacro

%macro SAD_X3_END 2
%if mmsize == 8 && %1*%2 == 256
    HADDUW   m0, m3
    HADDUW   m1, m4
    HADDUW   m2, m5
%else
    HADDD    m0, m3
    HADDD    m1, m4
    HADDD    m2, m5
%endif
%if UNIX64
    movd [r5+0], xm0
    movd [r5+4], xm1
    movd [r5+8], xm2
%else
    mov      r0, r5mp
    movd [r0+0], xm0
    movd [r0+4], xm1
    movd [r0+8], xm2
%endif
    RET
%endmacro

%macro SAD_X4_INC_P 0
    add     r0, 4*FENC_STRIDE
    lea     r1, [r1+4*r5]
    lea     r2, [r2+4*r5]
    lea     r3, [r3+4*r5]
    lea     r4, [r4+4*r5]
%endmacro

%macro SAD_X4_ONE_START 0
    mova    m4, [r0]
    movu    m0, [r1]
    movu    m1, [r2]
    movu    m2, [r3]
    movu    m3, [r4]
    psubw   m0, m4
    psubw   m1, m4
    psubw   m2, m4
    psubw   m3, m4
    ABSW2   m0, m1, m0, m1, m5, m6
    ABSW2   m2, m3, m2, m3, m4, m7
    pmaddwd m0, [pw_1]
    pmaddwd m1, [pw_1]
    pmaddwd m2, [pw_1]
    pmaddwd m3, [pw_1]
%endmacro

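; SAD_X4_ONE picks one of three code paths depending on the register budget:
; with more than 8 vector registers all four references are handled in parallel;
; with SSSE3 and only 8 registers, pabsw needs no scratch register and m4 is
; reloaded/reused for the fourth reference; otherwise the references are
; processed two at a time.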
%macro SAD_X4_ONE 2
    mova    m4, [r0+%1]
    movu    m5, [r1+%2]
    movu    m6, [r2+%2]
%if num_mmregs > 8
    movu    m7, [r3+%2]
    movu    m8, [r4+%2]
    psubw   m5, m4
    psubw   m6, m4
    psubw   m7, m4
    psubw   m8, m4
    ABSW2   m5, m6, m5, m6, m9, m10
    ABSW2   m7, m8, m7, m8, m9, m10
    pmaddwd m5, [pw_1]
    pmaddwd m6, [pw_1]
    pmaddwd m7, [pw_1]
    pmaddwd m8, [pw_1]
    paddd   m0, m5
    paddd   m1, m6
    paddd   m2, m7
    paddd   m3, m8
%elif cpuflag(ssse3)
    movu    m7, [r3+%2]
    psubw   m5, m4
    psubw   m6, m4
    psubw   m7, m4
    movu    m4, [r4+%2]
    pabsw   m5, m5
    psubw   m4, [r0+%1]
    pabsw   m6, m6
    pabsw   m7, m7
    pabsw   m4, m4
    pmaddwd m5, [pw_1]
    pmaddwd m6, [pw_1]
    pmaddwd m7, [pw_1]
    pmaddwd m4, [pw_1]
    paddd   m0, m5
    paddd   m1, m6
    paddd   m2, m7
    paddd   m3, m4
%else ; num_mmregs == 8 && !ssse3
    psubw   m5, m4
    psubw   m6, m4
    ABSW    m5, m5, m7
    ABSW    m6, m6, m7
    pmaddwd m5, [pw_1]
    pmaddwd m6, [pw_1]
    paddd   m0, m5
    paddd   m1, m6
    movu    m5, [r3+%2]
    movu    m6, [r4+%2]
    psubw   m5, m4
    psubw   m6, m4
    ABSW2   m5, m6, m5, m6, m7, m4
    pmaddwd m5, [pw_1]
    pmaddwd m6, [pw_1]
    paddd   m2, m5
    paddd   m3, m6
%endif
%endmacro

%macro SAD_X4_END 2
%if mmsize == 8 && %1*%2 == 256
    HADDUW    m0, m4
    HADDUW    m1, m5
    HADDUW    m2, m6
    HADDUW    m3, m7
%else
    HADDD     m0, m4
    HADDD     m1, m5
    HADDD     m2, m6
    HADDD     m3, m7
%endif
    mov       r0, r6mp
    movd [r0+ 0], xm0
    movd [r0+ 4], xm1
    movd [r0+ 8], xm2
    movd [r0+12], xm3
    RET
%endmacro

%macro SAD_X_2xNP 4
    %assign x %3
%rep %4
    SAD_X%1_ONE x*mmsize, x*mmsize
    SAD_X%1_ONE 2*FENC_STRIDE+x*mmsize, 2*%2+x*mmsize
    %assign x x+1
%endrep
%endmacro

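; pixel_vsad sums the absolute differences between each pair of vertically
; adjacent rows in a single 16-pixel-wide plane: r0 = plane, r1 = stride
; (in pixels), r2d = number of rows. A C reference sketch (illustrative only):
;
;   int vsad = 0;
;   for (int y = 0; y < height - 1; y++)
;       for (int x = 0; x < 16; x++)
;           vsad += abs(src[y * stride + x] - src[(y + 1) * stride + x]);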
%macro PIXEL_VSAD 0
cglobal pixel_vsad, 3,3,8
    mova      m0, [r0]
    mova      m1, [r0+16]
    mova      m2, [r0+2*r1]
    mova      m3, [r0+2*r1+16]
    lea       r0, [r0+4*r1]
    psubw     m0, m2
    psubw     m1, m3
    ABSW2     m0, m1, m0, m1, m4, m5
    paddw     m0, m1
    sub      r2d, 2
    je .end
.loop:
    mova      m4, [r0]
    mova      m5, [r0+16]
    mova      m6, [r0+2*r1]
    mova      m7, [r0+2*r1+16]
    lea       r0, [r0+4*r1]
    psubw     m2, m4
    psubw     m3, m5
    psubw     m4, m6
    psubw     m5, m7
    ABSW      m2, m2, m1
    ABSW      m3, m3, m1
    ABSW      m4, m4, m1
    ABSW      m5, m5, m1
    paddw     m0, m2
    paddw     m0, m3
    paddw     m0, m4
    paddw     m0, m5
    mova      m2, m6
    mova      m3, m7
    sub      r2d, 2
    jg .loop
.end:
%if BIT_DEPTH == 9
    HADDW     m0, m1 ; max sum: 62(pixel diffs)*511(pixel_max)=31682
%else
    HADDUW    m0, m1 ; max sum: 62(pixel diffs)*1023(pixel_max)=63426
%endif
    movd     eax, m0
    RET
%endmacro
INIT_XMM sse2
PIXEL_VSAD
INIT_XMM ssse3
PIXEL_VSAD
INIT_XMM xop
PIXEL_VSAD

INIT_YMM avx2
cglobal pixel_vsad, 3,3
    mova      m0, [r0]
    mova      m1, [r0+2*r1]
    lea       r0, [r0+4*r1]
    psubw     m0, m1
    pabsw     m0, m0
    sub      r2d, 2
    je .end
.loop:
    mova      m2, [r0]
    mova      m3, [r0+2*r1]
    lea       r0, [r0+4*r1]
    psubw     m1, m2
    psubw     m2, m3
    pabsw     m1, m1
    pabsw     m2, m2
    paddw     m0, m1
    paddw     m0, m2
    mova      m1, m3
    sub      r2d, 2
    jg .loop
.end:
%if BIT_DEPTH == 9
    HADDW     m0, m1
%else
    HADDUW    m0, m1
%endif
    movd     eax, xm0
    RET
;-----------------------------------------------------------------------------
; void pixel_sad_x3_WxH( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1,
;                        uint16_t *pix2, intptr_t i_stride, int scores[3] )
; void pixel_sad_x4_WxH( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1,
;                        uint16_t *pix2, uint16_t *pix3, intptr_t i_stride,
;                        int scores[4] )
;-----------------------------------------------------------------------------
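; The generic SAD_X macro peels the first two rows (ONE_START + ONE), then runs
; height/2-1 iterations of two rows each; STRIDE resolves to r4 for x3 and r5
; for x4, and XMM_REGS is set by the callers below to the number of vector
; registers the expansion needs.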
%macro SAD_X 3
cglobal pixel_sad_x%1_%2x%3, 6,7,XMM_REGS
    %assign regnum %1+1
    %xdefine STRIDE r %+ regnum
    mov     r6, %3/2-1
    SAD_X%1_ONE_START
    SAD_X%1_ONE 2*FENC_STRIDE, 2*STRIDE
    SAD_X_2xNP %1, STRIDE, 1, %2/(mmsize/2)-1
.loop:
    SAD_X%1_INC_P
    SAD_X_2xNP %1, STRIDE, 0, %2/(mmsize/2)
    dec     r6
    jg .loop
%if %1 == 4
    mov     r6, r6m
%endif
    SAD_X%1_END %2, %3
%endmacro

INIT_MMX mmx2
%define XMM_REGS 0
SAD_X 3, 16, 16
SAD_X 3, 16,  8
SAD_X 3, 12, 16
SAD_X 3,  8, 16
SAD_X 3,  8,  8
SAD_X 3,  8,  4
SAD_X 3,  4, 16
SAD_X 3,  4,  8
SAD_X 3,  4,  4
SAD_X 4, 16, 16
SAD_X 4, 16,  8
SAD_X 4, 12, 16
SAD_X 4,  8, 16
SAD_X 4,  8,  8
SAD_X 4,  8,  4
SAD_X 4,  4, 16
SAD_X 4,  4,  8
SAD_X 4,  4,  4
INIT_MMX ssse3
SAD_X 3,  4,  8
SAD_X 3,  4,  4
SAD_X 4,  4,  8
SAD_X 4,  4,  4
INIT_XMM ssse3
%define XMM_REGS 7
SAD_X 3, 16, 16
SAD_X 3, 16,  8
SAD_X 3,  8, 16
SAD_X 3,  8,  8
SAD_X 3,  8,  4
%define XMM_REGS 9
SAD_X 4, 16, 16
SAD_X 4, 16,  8
SAD_X 4,  8, 16
SAD_X 4,  8,  8
SAD_X 4,  8,  4
INIT_XMM sse2
%define XMM_REGS 8
SAD_X 3, 64, 64
SAD_X 3, 64, 48
SAD_X 3, 64, 32
SAD_X 3, 64, 16
SAD_X 3, 48, 64
SAD_X 3, 32, 64
SAD_X 3, 32, 32
SAD_X 3, 32, 24
SAD_X 3, 32, 16
SAD_X 3, 32,  8
SAD_X 3, 24, 32
SAD_X 3, 16, 64
SAD_X 3, 16, 32
SAD_X 3, 16, 16
SAD_X 3, 16, 12
SAD_X 3, 16,  8
SAD_X 3, 16,  4
SAD_X 3,  8, 32
SAD_X 3,  8, 16
SAD_X 3,  8,  8
SAD_X 3,  8,  4
%define XMM_REGS 11
SAD_X 4, 64, 64
SAD_X 4, 64, 48
SAD_X 4, 64, 32
SAD_X 4, 64, 16
SAD_X 4, 48, 64
SAD_X 4, 32, 64
SAD_X 4, 32, 32
SAD_X 4, 32, 24
SAD_X 4, 32, 16
SAD_X 4, 32,  8
SAD_X 4, 24, 32
SAD_X 4, 16, 64
SAD_X 4, 16, 32
SAD_X 4, 16, 16
SAD_X 4, 16, 12
SAD_X 4, 16,  8
SAD_X 4, 16,  4
SAD_X 4,  8, 32
SAD_X 4,  8, 16
SAD_X 4,  8,  8
SAD_X 4,  8,  4
INIT_YMM avx2
%define XMM_REGS 7
SAD_X 3, 16,  4
SAD_X 3, 16,  8
SAD_X 3, 16, 12
SAD_X 3, 16, 16
SAD_X 3, 16, 32
SAD_X 3, 16, 64
SAD_X 3, 32,  8
SAD_X 3, 32, 16
SAD_X 3, 32, 24
SAD_X 3, 32, 32
SAD_X 3, 32, 64
SAD_X 3, 48, 64
SAD_X 3, 64, 16
SAD_X 3, 64, 32
SAD_X 3, 64, 48
SAD_X 3, 64, 64
%define XMM_REGS 9
SAD_X 4, 16,  4
SAD_X 4, 16,  8
SAD_X 4, 16, 12
SAD_X 4, 16, 16
SAD_X 4, 16, 32
SAD_X 4, 16, 64
SAD_X 4, 32,  8
SAD_X 4, 32, 16
SAD_X 4, 32, 24
SAD_X 4, 32, 32
SAD_X 4, 32, 64
SAD_X 4, 48, 64
SAD_X 4, 64, 16
SAD_X 4, 64, 32
SAD_X 4, 64, 48
SAD_X 4, 64, 64
