;*****************************************************************************
;* sad16-a.asm: x86 high depth sad functions
;*****************************************************************************
;* Copyright (C) 2010-2021 x264 project
;*
;* Authors: Oskar Arvidsson <oskar@irock.se>
;*          Henrik Gramner <henrik@gramner.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION .text

cextern pw_1
cextern pw_4
cextern pw_8

;=============================================================================
; SAD MMX
;=============================================================================

%macro SAD_INC_1x16P_MMX 0
    movu    m1, [r0+ 0]
    movu    m2, [r0+ 8]
    movu    m3, [r0+16]
    movu    m4, [r0+24]
    psubw   m1, [r2+ 0]
    psubw   m2, [r2+ 8]
    psubw   m3, [r2+16]
    psubw   m4, [r2+24]
    ABSW2   m1, m2, m1, m2, m5, m6
    ABSW2   m3, m4, m3, m4, m7, m5
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3
%endmacro

%macro SAD_INC_2x8P_MMX 0
    movu    m1, [r0+0]
    movu    m2, [r0+8]
    movu    m3, [r0+2*r1+0]
    movu    m4, [r0+2*r1+8]
    psubw   m1, [r2+0]
    psubw   m2, [r2+8]
    psubw   m3, [r2+2*r3+0]
    psubw   m4, [r2+2*r3+8]
    ABSW2   m1, m2, m1, m2, m5, m6
    ABSW2   m3, m4, m3, m4, m7, m5
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3
%endmacro

%macro SAD_INC_2x4P_MMX 0
    movu    m1, [r0]
    movu    m2, [r0+2*r1]
    psubw   m1, [r2]
    psubw   m2, [r2+2*r3]
    ABSW2   m1, m2, m1, m2, m3, m4
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    paddw   m0, m1
    paddw   m0, m2
%endmacro

;-----------------------------------------------------------------------------
; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------
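; A rough C reference for what the kernels below compute (a sketch, not part
; of the build; the function name and parameter names are illustrative only).
; Both strides are in pixels; the asm scales them by 2 for 16-bit samples.
;
;     int sad_NxM( uint16_t *pix1, intptr_t stride1, uint16_t *pix2, intptr_t stride2 )
;     {
;         int sum = 0;
;         for( int y = 0; y < M; y++, pix1 += stride1, pix2 += stride2 )
;             for( int x = 0; x < N; x++ )
;                 sum += abs( pix1[x] - pix2[x] );
;         return sum;
;     }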
%macro SAD_MMX 3
cglobal pixel_sad_%1x%2, 4,5-(%2&4/4)
    pxor    m0, m0
%if %2 == 4
    SAD_INC_%3x%1P_MMX
    SAD_INC_%3x%1P_MMX
%else
    mov    r4d, %2/%3
.loop:
    SAD_INC_%3x%1P_MMX
    dec    r4d
    jg .loop
%endif
%if %1*%2 == 256
    HADDUW  m0, m1
%else
    HADDW   m0, m1
%endif
    movd   eax, m0
    RET
%endmacro

INIT_MMX mmx2
SAD_MMX 16, 16, 1
SAD_MMX 16,  8, 1
SAD_MMX  8, 16, 2
SAD_MMX  8,  8, 2
SAD_MMX  8,  4, 2
SAD_MMX  4,  8, 2
SAD_MMX  4,  4, 2
INIT_MMX ssse3
SAD_MMX  4,  8, 2
SAD_MMX  4,  4, 2

;=============================================================================
; SAD XMM
;=============================================================================

%macro SAD_INC_2ROW 1
%if 2*%1 > mmsize
    movu    m1, [r2+ 0]
    movu    m2, [r2+16]
    movu    m3, [r2+2*r3+ 0]
    movu    m4, [r2+2*r3+16]
    psubw   m1, [r0+ 0]
    psubw   m2, [r0+16]
    psubw   m3, [r0+2*r1+ 0]
    psubw   m4, [r0+2*r1+16]
    ABSW2   m1, m2, m1, m2, m5, m6
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    ABSW2   m3, m4, m3, m4, m7, m5
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3
%else
    movu    m1, [r2]
    movu    m2, [r2+2*r3]
    psubw   m1, [r0]
    psubw   m2, [r0+2*r1]
    ABSW2   m1, m2, m1, m2, m3, m4
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    paddw   m0, m1
    paddw   m0, m2
%endif
%endmacro

;-----------------------------------------------------------------------------
; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------
%macro SAD 2
cglobal pixel_sad_%1x%2, 4,5-(%2&4/4),8*(%1/mmsize)
    pxor    m0, m0
%if %2 == 4
    SAD_INC_2ROW %1
    SAD_INC_2ROW %1
%else
    mov    r4d, %2/2
.loop:
    SAD_INC_2ROW %1
    dec    r4d
    jg .loop
%endif
    HADDW   m0, m1
    movd   eax, xm0
    RET
%endmacro

INIT_XMM sse2
SAD 16, 16
SAD 16,  8
SAD  8, 16
SAD  8,  8
SAD  8,  4
INIT_XMM sse2, aligned
SAD 16, 16
SAD 16,  8
SAD  8, 16
SAD  8,  8
INIT_XMM ssse3
SAD 16, 16
SAD 16,  8
SAD  8, 16
SAD  8,  8
SAD  8,  4
INIT_XMM ssse3, aligned
SAD 16, 16
SAD 16,  8
SAD  8, 16
SAD  8,  8
INIT_YMM avx2
SAD 16, 16
SAD 16,  8

;=============================================================================
; SAD x3/x4
;=============================================================================

%macro SAD_X3_INC_P 0
    add     r0, 4*FENC_STRIDE
    lea     r1, [r1+4*r4]
    lea     r2, [r2+4*r4]
    lea     r3, [r3+4*r4]
%endmacro

%macro SAD_X3_ONE_START 0
    mova    m3, [r0]
    movu    m0, [r1]
    movu    m1, [r2]
    movu    m2, [r3]
    psubw   m0, m3
    psubw   m1, m3
    psubw   m2, m3
    ABSW2   m0, m1, m0, m1, m4, m5
    ABSW    m2, m2, m6
%endmacro

%macro SAD_X3_ONE 2
    mova    m6, [r0+%1]
    movu    m3, [r1+%2]
    movu    m4, [r2+%2]
    movu    m5, [r3+%2]
    psubw   m3, m6
    psubw   m4, m6
    psubw   m5, m6
    ABSW2   m3, m4, m3, m4, m7, m6
    ABSW    m5, m5, m6
    paddw   m0, m3
    paddw   m1, m4
    paddw   m2, m5
%endmacro

%macro SAD_X3_END 2
%if mmsize == 8 && %1*%2 == 256
    HADDUW   m0, m3
    HADDUW   m1, m4
    HADDUW   m2, m5
%else
    HADDW    m0, m3
    HADDW    m1, m4
    HADDW    m2, m5
%endif
%if UNIX64
    movd [r5+0], xm0
    movd [r5+4], xm1
    movd [r5+8], xm2
%else
    mov      r0, r5mp
    movd [r0+0], xm0
    movd [r0+4], xm1
    movd [r0+8], xm2
%endif
    RET
%endmacro

%macro SAD_X4_INC_P 0
    add     r0, 4*FENC_STRIDE
    lea     r1, [r1+4*r5]
    lea     r2, [r2+4*r5]
    lea     r3, [r3+4*r5]
    lea     r4, [r4+4*r5]
%endmacro

%macro SAD_X4_ONE_START 0
    mova    m4, [r0]
    movu    m0, [r1]
    movu    m1, [r2]
    movu    m2, [r3]
    movu    m3, [r4]
    psubw   m0, m4
    psubw   m1, m4
    psubw   m2, m4
    psubw   m3, m4
    ABSW2   m0, m1, m0, m1, m5, m6
    ABSW2   m2, m3, m2, m3, m4, m7
%endmacro

%macro SAD_X4_ONE 2
    mova    m4, [r0+%1]
    movu    m5, [r1+%2]
    movu    m6, [r2+%2]
%if num_mmregs > 8
    movu    m7, [r3+%2]
    movu    m8, [r4+%2]
    psubw   m5, m4
    psubw   m6, m4
    psubw   m7, m4
    psubw   m8, m4
    ABSW2   m5, m6, m5, m6, m9, m10
    ABSW2   m7, m8, m7, m8, m9, m10
    paddw   m0, m5
    paddw   m1, m6
    paddw   m2, m7
    paddw   m3, m8
%elif cpuflag(ssse3)
    movu    m7, [r3+%2]
    psubw   m5, m4
    psubw   m6, m4
    psubw   m7, m4
    movu    m4, [r4+%2]
    pabsw   m5, m5
    psubw   m4, [r0+%1]
    pabsw   m6, m6
    pabsw   m7, m7
    pabsw   m4, m4
    paddw   m0, m5
    paddw   m1, m6
    paddw   m2, m7
    paddw   m3, m4
%else ; num_mmregs == 8 && !ssse3
    psubw   m5, m4
    psubw   m6, m4
    ABSW    m5, m5, m7
    ABSW    m6, m6, m7
    paddw   m0, m5
    paddw   m1, m6
    movu    m5, [r3+%2]
    movu    m6, [r4+%2]
    psubw   m5, m4
    psubw   m6, m4
    ABSW2   m5, m6, m5, m6, m7, m4
    paddw   m2, m5
    paddw   m3, m6
%endif
%endmacro

%macro SAD_X4_END 2
%if mmsize == 8 && %1*%2 == 256
    HADDUW    m0, m4
    HADDUW    m1, m5
    HADDUW    m2, m6
    HADDUW    m3, m7
%else
    HADDW     m0, m4
    HADDW     m1, m5
    HADDW     m2, m6
    HADDW     m3, m7
%endif
    mov       r0, r6mp
    movd [r0+ 0], xm0
    movd [r0+ 4], xm1
    movd [r0+ 8], xm2
    movd [r0+12], xm3
    RET
%endmacro

%macro SAD_X_2xNP 4
    %assign x %3
%rep %4
    SAD_X%1_ONE x*mmsize, x*mmsize
    SAD_X%1_ONE 2*FENC_STRIDE+x*mmsize, 2*%2+x*mmsize
    %assign x x+1
%endrep
%endmacro

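;-----------------------------------------------------------------------------
; int pixel_vsad( uint16_t *src, intptr_t stride, int height )
;-----------------------------------------------------------------------------
; Presumed prototype; the parameter names are assumptions. The kernel sums the
; absolute differences between vertically adjacent rows of a 16-pixel-wide
; column, roughly (a sketch, not part of the build):
;
;     int vsad = 0;
;     for( int y = 0; y < height-1; y++ )
;         for( int x = 0; x < 16; x++ )
;             vsad += abs( src[y*stride+x] - src[(y+1)*stride+x] );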
%macro PIXEL_VSAD 0
cglobal pixel_vsad, 3,3,8
    mova      m0, [r0]
    mova      m1, [r0+16]
    mova      m2, [r0+2*r1]
    mova      m3, [r0+2*r1+16]
    lea       r0, [r0+4*r1]
    psubw     m0, m2
    psubw     m1, m3
    ABSW2     m0, m1, m0, m1, m4, m5
    paddw     m0, m1
    sub      r2d, 2
    je .end
.loop:
    mova      m4, [r0]
    mova      m5, [r0+16]
    mova      m6, [r0+2*r1]
    mova      m7, [r0+2*r1+16]
    lea       r0, [r0+4*r1]
    psubw     m2, m4
    psubw     m3, m5
    psubw     m4, m6
    psubw     m5, m7
    ABSW      m2, m2, m1
    ABSW      m3, m3, m1
    ABSW      m4, m4, m1
    ABSW      m5, m5, m1
    paddw     m0, m2
    paddw     m0, m3
    paddw     m0, m4
    paddw     m0, m5
    mova      m2, m6
    mova      m3, m7
    sub r2d, 2
    jg .loop
.end:
%if BIT_DEPTH == 9
    HADDW     m0, m1 ; max sum: 62(pixel diffs)*511(pixel_max)=31682
%else
    HADDUW    m0, m1 ; max sum: 62(pixel diffs)*1023(pixel_max)=63426
%endif
    movd     eax, m0
    RET
%endmacro
INIT_XMM sse2
PIXEL_VSAD
INIT_XMM ssse3
PIXEL_VSAD
INIT_XMM xop
PIXEL_VSAD

INIT_YMM avx2
cglobal pixel_vsad, 3,3
    mova      m0, [r0]
    mova      m1, [r0+2*r1]
    lea       r0, [r0+4*r1]
    psubw     m0, m1
    pabsw     m0, m0
    sub      r2d, 2
    je .end
.loop:
    mova      m2, [r0]
    mova      m3, [r0+2*r1]
    lea       r0, [r0+4*r1]
    psubw     m1, m2
    psubw     m2, m3
    pabsw     m1, m1
    pabsw     m2, m2
    paddw     m0, m1
    paddw     m0, m2
    mova      m1, m3
    sub      r2d, 2
    jg .loop
.end:
%if BIT_DEPTH == 9
    HADDW     m0, m1
%else
    HADDUW    m0, m1
%endif
    movd     eax, xm0
    RET

;-----------------------------------------------------------------------------
; void pixel_sad_xN_WxH( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1,
;                        uint16_t *pix2, intptr_t i_stride, int scores[3] )
;-----------------------------------------------------------------------------
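; Roughly (a sketch, not part of the build): the x3 kernels compute three SADs
; of one cached fenc block (fixed FENC_STRIDE) against three candidate
; references that share a single stride; the x4 kernels take an extra pix3
; pointer and fill scores[4] instead:
;
;     for( int i = 0; i < 3; i++ )
;         scores[i] = sad_WxH( fenc, FENC_STRIDE, pix[i], i_stride );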
%macro SAD_X 3
cglobal pixel_sad_x%1_%2x%3, 6,7,XMM_REGS
    %assign regnum %1+1
    %xdefine STRIDE r %+ regnum
    mov     r6, %3/2-1
    SAD_X%1_ONE_START
    SAD_X%1_ONE 2*FENC_STRIDE, 2*STRIDE
    SAD_X_2xNP %1, STRIDE, 1, %2/(mmsize/2)-1
.loop:
    SAD_X%1_INC_P
    SAD_X_2xNP %1, STRIDE, 0, %2/(mmsize/2)
    dec     r6
    jg .loop
%if %1 == 4
    mov     r6, r6m
%endif
    SAD_X%1_END %2, %3
%endmacro

INIT_MMX mmx2
%define XMM_REGS 0
SAD_X 3, 16, 16
SAD_X 3, 16,  8
SAD_X 3,  8, 16
SAD_X 3,  8,  8
SAD_X 3,  8,  4
SAD_X 3,  4,  8
SAD_X 3,  4,  4
SAD_X 4, 16, 16
SAD_X 4, 16,  8
SAD_X 4,  8, 16
SAD_X 4,  8,  8
SAD_X 4,  8,  4
SAD_X 4,  4,  8
SAD_X 4,  4,  4
INIT_MMX ssse3
SAD_X 3,  4,  8
SAD_X 3,  4,  4
SAD_X 4,  4,  8
SAD_X 4,  4,  4
INIT_XMM ssse3
%define XMM_REGS 7
SAD_X 3, 16, 16
SAD_X 3, 16,  8
SAD_X 3,  8, 16
SAD_X 3,  8,  8
SAD_X 3,  8,  4
%define XMM_REGS 9
SAD_X 4, 16, 16
SAD_X 4, 16,  8
SAD_X 4,  8, 16
SAD_X 4,  8,  8
SAD_X 4,  8,  4
INIT_XMM sse2
%define XMM_REGS 8
SAD_X 3, 16, 16
SAD_X 3, 16,  8
SAD_X 3,  8, 16
SAD_X 3,  8,  8
SAD_X 3,  8,  4
%define XMM_REGS 11
SAD_X 4, 16, 16
SAD_X 4, 16,  8
SAD_X 4,  8, 16
SAD_X 4,  8,  8
SAD_X 4,  8,  4
INIT_XMM xop
%define XMM_REGS 7
SAD_X 3, 16, 16
SAD_X 3, 16,  8
SAD_X 3,  8, 16
SAD_X 3,  8,  8
SAD_X 3,  8,  4
%define XMM_REGS 9
SAD_X 4, 16, 16
SAD_X 4, 16,  8
SAD_X 4,  8, 16
SAD_X 4,  8,  8
SAD_X 4,  8,  4
INIT_YMM avx2
%define XMM_REGS 7
SAD_X 3, 16, 16
SAD_X 3, 16,  8
%define XMM_REGS 9
SAD_X 4, 16, 16
SAD_X 4, 16,  8

;-----------------------------------------------------------------------------
; void intra_sad_x3_4x4( uint16_t *fenc, uint16_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
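; res[] receives the SADs of the 4x4 fenc block against the vertical,
; horizontal and DC intra predictions derived from the decoded neighbours in
; fdec, in that order (see the V/H/DC cost stores below). A rough sketch with
; notional prediction buffers (not part of the build):
;
;     res[0] = sad_4x4( fenc, FENC_STRIDE, pred_v,  4 );
;     res[1] = sad_4x4( fenc, FENC_STRIDE, pred_h,  4 );
;     res[2] = sad_4x4( fenc, FENC_STRIDE, pred_dc, 4 );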

%macro INTRA_SAD_X3_4x4 0
cglobal intra_sad_x3_4x4, 3,3,7
%if cpuflag(ssse3)
    movddup   m0, [r1-1*FDEC_STRIDEB]
%else
    movq      m0, [r1-1*FDEC_STRIDEB]
    punpcklqdq m0, m0
%endif
    movq      m1, [r0+0*FENC_STRIDEB]
    movq      m2, [r0+2*FENC_STRIDEB]
    pshuflw   m6, m0, q1032
    paddw     m6, m0
    pshuflw   m5, m6, q2301
    paddw     m6, m5
    punpcklqdq m6, m6       ; A+B+C+D 8 times
    movhps    m1, [r0+1*FENC_STRIDEB]
    movhps    m2, [r0+3*FENC_STRIDEB]
    psubw     m3, m1, m0
    psubw     m0, m2
    ABSW2     m3, m0, m3, m0, m4, m5
    paddw     m0, m3
    movd      m3, [r1+0*FDEC_STRIDEB-4]
    movd      m4, [r1+2*FDEC_STRIDEB-4]
    movhps    m3, [r1+1*FDEC_STRIDEB-8]
    movhps    m4, [r1+3*FDEC_STRIDEB-8]
    pshufhw   m3, m3, q3333
    pshufhw   m4, m4, q3333
    pshuflw   m3, m3, q1111 ; FF FF EE EE
    pshuflw   m4, m4, q1111 ; HH HH GG GG
    paddw     m5, m3, m4
    paddw     m6, [pw_4]
    paddw     m6, m5
    pshufd    m5, m5, q1032
    paddw     m5, m6
    psrlw     m5, 3
    psubw     m6, m5, m2
    psubw     m5, m1
    psubw     m1, m3
    psubw     m2, m4
    ABSW2     m5, m6, m5, m6, m3, m4
    ABSW2     m1, m2, m1, m2, m3, m4
    paddw     m5, m6
    paddw     m1, m2
%if cpuflag(ssse3)
    phaddw    m0, m1
    movhlps   m3, m5
    paddw     m5, m3
    phaddw    m0, m5
    pmaddwd   m0, [pw_1]
    mova    [r2], m0
%else
    HADDW     m0, m3
    HADDW     m1, m3
    HADDW     m5, m3
    movd    [r2], m0 ; V prediction cost
    movd  [r2+4], m1 ; H prediction cost
    movd  [r2+8], m5 ; DC prediction cost
%endif
    RET
%endmacro

INIT_XMM sse2
INTRA_SAD_X3_4x4
INIT_XMM ssse3
INTRA_SAD_X3_4x4
INIT_XMM avx
INTRA_SAD_X3_4x4

;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8( pixel *fenc, pixel edge[36], int res[3] );
;-----------------------------------------------------------------------------
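; As above, res[] is written in the order {V, H, DC}. The left neighbours are
; read starting at edge[7] and the top neighbours at edge[16] (see the loads
; below), and the DC value is formed roughly as:
;
;     dc = ( sum(edge[7..14]) + sum(edge[16..23]) + 8 ) >> 4;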

;m0 = DC
;m6 = V
;m7 = H
;m1 = DC score
;m2 = V score
;m3 = H score
;m5 = temp
;m4 = pixel row

%macro INTRA_SAD_HVDC_ITER 2
    mova        m4, [r0+(%1-4)*FENC_STRIDEB]
    psubw       m4, m0
    ABSW        m4, m4, m5
    ACCUM    paddw, 1, 4, %1
    mova        m4, [r0+(%1-4)*FENC_STRIDEB]
    psubw       m4, m6
    ABSW        m4, m4, m5
    ACCUM    paddw, 2, 4, %1
    pshufd      m5, m7, %2
    psubw       m5, [r0+(%1-4)*FENC_STRIDEB]
    ABSW        m5, m5, m4
    ACCUM    paddw, 3, 5, %1
%endmacro

%macro INTRA_SAD_X3_8x8 0
cglobal intra_sad_x3_8x8, 3,3,8
    add         r0, 4*FENC_STRIDEB
    movu        m0, [r1+7*SIZEOF_PIXEL]
    mova        m6, [r1+16*SIZEOF_PIXEL] ;V prediction
    mova        m7, m0
    paddw       m0, m6
    punpckhwd   m7, m7
    HADDW       m0, m4
    paddw       m0, [pw_8]
    psrlw       m0, 4
    SPLATW      m0, m0
    INTRA_SAD_HVDC_ITER 0, q3333
    INTRA_SAD_HVDC_ITER 1, q2222
    INTRA_SAD_HVDC_ITER 2, q1111
    INTRA_SAD_HVDC_ITER 3, q0000
    movq        m7, [r1+7*SIZEOF_PIXEL]
    punpcklwd   m7, m7
    INTRA_SAD_HVDC_ITER 4, q3333
    INTRA_SAD_HVDC_ITER 5, q2222
    INTRA_SAD_HVDC_ITER 6, q1111
    INTRA_SAD_HVDC_ITER 7, q0000
%if cpuflag(ssse3)
    phaddw      m2, m3     ; 2 2 2 2 3 3 3 3
    movhlps     m3, m1
    paddw       m1, m3     ; 1 1 1 1 _ _ _ _
    phaddw      m2, m1     ; 2 2 3 3 1 1 _ _
    pmaddwd     m2, [pw_1] ; 2 3 1 _
    mova      [r2], m2
%else
    HADDW       m2, m4
    HADDW       m3, m4
    HADDW       m1, m4
    movd    [r2+0], m2
    movd    [r2+4], m3
    movd    [r2+8], m1
%endif
    RET
%endmacro

INIT_XMM sse2
INTRA_SAD_X3_8x8
INIT_XMM ssse3
INTRA_SAD_X3_8x8

%macro INTRA_SAD_HVDC_ITER_YMM 2
    mova       xm4, [r0+(%1-4)*FENC_STRIDEB]
    vinserti128 m4, m4, [r0+%1*FENC_STRIDEB], 1
    pshufd      m5, m7, %2
    psubw       m5, m4
    pabsw       m5, m5
    ACCUM    paddw, 2, 5, %1 ; H
    psubw       m5, m4, m6
    psubw       m4, m0
    pabsw       m5, m5
    pabsw       m4, m4
    ACCUM    paddw, 1, 5, %1 ; V
    ACCUM    paddw, 3, 4, %1 ; DC
%endmacro

INIT_YMM avx2
cglobal intra_sad_x3_8x8, 3,3,8
    add            r0, 4*FENC_STRIDEB
    movu          xm0, [r1+7*SIZEOF_PIXEL]
    vbroadcasti128 m6, [r1+16*SIZEOF_PIXEL] ; V prediction
    vpermq         m7, m0, q0011
    paddw         xm0, xm6
    paddw         xm0, [pw_1] ; equal to +8 after HADDW
    HADDW         xm0, xm4
    psrld         xm0, 4
    vpbroadcastw   m0, xm0
    punpcklwd      m7, m7
    INTRA_SAD_HVDC_ITER_YMM 0, q3333
    INTRA_SAD_HVDC_ITER_YMM 1, q2222
    INTRA_SAD_HVDC_ITER_YMM 2, q1111
    INTRA_SAD_HVDC_ITER_YMM 3, q0000
    phaddw         m1, m2     ; 1 1 1 1 2 2 2 2 1 1 1 1 2 2 2 2
    punpckhqdq     m2, m3, m3
    paddw          m3, m2     ; 3 3 3 3 _ _ _ _ 3 3 3 3 _ _ _ _
    phaddw         m1, m3     ; 1 1 2 2 3 3 _ _ 1 1 2 2 3 3 _ _
    vextracti128  xm2, m1, 1
    paddw         xm1, xm2    ; 1 1 2 2 3 3 _ _
    pmaddwd       xm1, [pw_1] ; 1 2 3 _
    mova         [r2], xm1
    RET