1; Copyright © 2018, VideoLAN and dav1d authors
2; Copyright © 2018, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28
29SECTION_RODATA 16
30
; Emit one signed byte pair (w-128, 127-w) per argument.  The offsets let
; smooth prediction compute w*a + (256-w)*b with pmaddubsw, whose second
; operand must be signed bytes (see the SMOOTH macro for the identity used).
%macro SMOOTH_WEIGHT_TABLE 1-*
    %rep %0
        db %1-128, 127-%1
        %rotate 1
    %endrep
%endmacro
37
; sm_weights[], but modified to precalculate x and 256-x with offsets to
; enable efficient use of pmaddubsw (which requires signed values)
; Layout: one 2-byte pair per weight; the pairs for block size N occupy
; pair indices N..2N-1, i.e. bytes [smooth_weights + 2*N, smooth_weights + 4*N).
smooth_weights: SMOOTH_WEIGHT_TABLE         \
      0,   0, 255, 128, 255, 149,  85,  64, \
    255, 197, 146, 105,  73,  50,  37,  32, \
    255, 225, 196, 170, 145, 123, 102,  84, \
     68,  54,  43,  33,  26,  20,  17,  16, \
    255, 240, 225, 210, 196, 182, 169, 157, \
    145, 133, 122, 111, 101,  92,  83,  74, \
     66,  59,  52,  45,  39,  34,  29,  25, \
     21,  17,  14,  12,  10,   9,   8,   8, \
    255, 248, 240, 233, 225, 218, 210, 203, \
    196, 189, 182, 176, 169, 163, 156, 150, \
    144, 138, 133, 127, 121, 116, 111, 106, \
    101,  96,  91,  86,  82,  77,  73,  69, \
     65,  61,  57,  54,  50,  47,  44,  41, \
     38,  35,  32,  29,  27,  25,  22,  20, \
     18,  16,  15,  13,  12,  10,   9,   8, \
      7,   6,   6,   5,   5,   4,   4,   4
57
; pshufb control vectors (a -1 byte lane selects zero in the result)
ipred_v_shuf      : db  0,  1,  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7
ipred_h_shuf      : db  3,  3,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  0,  0,  0,  0
ipred_paeth_shuf  : db  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  0,  0,  0,  0,  0
filter_shuf1      : db  3,  4,  3,  4,  5,  6,  5,  6,  7,  2,  7,  2,  1, -1,  1, -1
filter_shuf2      : db  3,  4,  3,  4,  5,  6,  5,  6,  7, 11,  7, 11, 15, -1, 15, -1
63
; Broadcast constants.  Several are deliberately stored as only 4 or 8
; bytes and widened at load time with movd/movq/movddup.
pw_8        : times 8  dw 8
pb_3        : times 16 db 3
pb_128      : times 8  db 128
pw_128      : times 4  dw 128
pw_255      : times 4  dw 255
pb_2        : times 8  db 2
pb_4        : times 8  db 4
pb_127_m127 : times 4  db 127, -127
pd_32768    : times 1  dd 32768
73
74
; Build a jump table of 32-bit offsets.  Both the exported table symbol
; and the stored entries are biased by -2*4 so that a tzcnt(size) index
; (which starts at 2 for size 4) addresses the first entry directly:
;   movsxd wq, [table + tzcnt(w)*4] ; add wq, table ; jmp wq
; %1 = function name, %2 = isa suffix, %3... = target label names.
%macro JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - 2*4)
    %xdefine %%base mangle(private_prefix %+ _%1_%2)
    %%table:
    %rep %0 - 2
        dd %%base %+ .%3 - (%%table - 2*4)
        %rotate 1
    %endrep
%endmacro
84
; The dc table holds 5 height entries (.h4-.h64), 5 width entries
; (.w4-.w64), then 5 store-loop entries (.s4-.s64, stored with a -10*4
; bias so they can be indexed through the aliased splat table below).
; The cfl table follows the same scheme with 4 sizes per group.  The
; splat tables are plain aliases into the store sections; no separate
; table data is emitted for them.
%define ipred_dc_splat_ssse3_table (ipred_dc_ssse3_table + 10*4)
%define ipred_cfl_splat_ssse3_table (ipred_cfl_ssse3_table + 8*4)

JMP_TABLE ipred_h,          ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_dc,         ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
                                s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
JMP_TABLE ipred_dc_left,    ssse3, h4, h8, h16, h32, h64
JMP_TABLE ipred_smooth,     ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_v,   ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h,   ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_paeth,      ssse3, w4, w8, w16, w32, w64
JMP_TABLE pal_pred,         ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_cfl,        ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \
                                s4-8*4, s8-8*4, s16-8*4, s32-8*4
JMP_TABLE ipred_cfl_left,   ssse3, h4, h8, h16, h32
JMP_TABLE ipred_filter,     ssse3, w4, w8, w16, w32
101
102cextern filter_intra_taps
103
104
105SECTION .text
106
107;---------------------------------------------------------------------------------------
108;int dav1d_ipred_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
109;                                    const int width, const int height, const int a);
110;---------------------------------------------------------------------------------------
%macro IPRED_SET   3                                          ; %1 = width, %2 = dst byte offset, %3 = pshuflw imm8
    pshuflw                      m1, m0, %3                   ; broadcast one (byte-duplicated) left pixel to 8 bytes
    punpcklqdq                   m1, m1                       ; ...and then to all 16 bytes
    mova           [dstq +      %2], m1
%if %1 > 16
    mova           [dstq + 16 + %2], m1
%endif
%if %1 > 32
    mova           [dstq + 32 + %2], m1
    mova           [dstq + 48 + %2], m1
%endif
%endmacro
123
; Emit the body for one .w<N> case of ipred_h: replicate each of 4 left
; pixels across its own output row, 4 rows per iteration.
%macro IPRED_H 1                                            ; %1 = block width
    sub                         tlq, 4
    movd                         m0, [tlq]                  ; get 4 bytes of topleft data
    punpcklbw                    m0, m0                     ; duplicate each left pixel into a word
%if %1 == 4
    pshuflw                      m1, m0, q2233
    movd           [dstq+strideq*0], m1
    psrlq                        m1, 32
    movd           [dstq+strideq*1], m1
    pshuflw                      m0, m0, q0011
    movd           [dstq+strideq*2], m0
    psrlq                        m0, 32
    movd           [dstq+stride3q ], m0

%elif %1 == 8
    punpcklwd                    m0, m0                     ; each pixel now fills a dword
    punpckhdq                    m1, m0, m0
    punpckldq                    m0, m0
    movq           [dstq+strideq*1], m1
    movhps         [dstq+strideq*0], m1
    movq           [dstq+stride3q ], m0
    movhps         [dstq+strideq*2], m0
%else
    ; widths 16/32/64: one full-row broadcast per IPRED_SET invocation
    IPRED_SET                    %1,         0, q3333
    IPRED_SET                    %1,   strideq, q2222
    IPRED_SET                    %1, strideq*2, q1111
    IPRED_SET                    %1,  stride3q, q0000
%endif
    lea                        dstq, [dstq+strideq*4]
    sub                          hd, 4
    jg .w%1                                                 ; loop back to the caller-provided label
    RET
%endmacro
157
INIT_XMM ssse3
; Horizontal prediction: each row is filled with its left neighbour pixel.
; Dispatches on log2(width) through ipred_h_ssse3_table; per-width bodies
; are generated by IPRED_H (which contains the row loop and RET).
cglobal ipred_h, 3, 6, 2, dst, stride, tl, w, h, stride3
    LEA                          r5, ipred_h_ssse3_table
    tzcnt                        wd, wm                     ; table index = log2(width)
    movifnidn                    hd, hm
    movsxd                       wq, [r5+wq*4]
    add                          wq, r5                     ; wq = absolute address of .w<width>
    lea                    stride3q, [strideq*3]
    jmp                          wq
.w4:
    IPRED_H                       4
.w8:
    IPRED_H                       8
.w16:
    IPRED_H                      16
.w32:
    IPRED_H                      32
.w64:
    IPRED_H                      64
177
178;---------------------------------------------------------------------------------------
179;int dav1d_ipred_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
180;                                    const int width, const int height, const int a);
181;---------------------------------------------------------------------------------------
; Vertical prediction: every row is a copy of the top neighbour row.
; Loads up to 64 top pixels into m0-m3 (the wider loads are harmless
; over-reads for small widths) and tail-jumps into the shared
; .s4-.s64 splat/store loops of ipred_dc.
cglobal ipred_v, 3, 7, 6, dst, stride, tl, w, h, stride3
    LEA                  r5, ipred_dc_splat_ssse3_table
    tzcnt                wd, wm
    movu                 m0, [tlq+ 1]
    movu                 m1, [tlq+17]
    movu                 m2, [tlq+33]
    movu                 m3, [tlq+49]
    movifnidn            hd, hm
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  wq
194
195;---------------------------------------------------------------------------------------
196;int dav1d_ipred_dc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
197;                                    const int width, const int height, const int a);
198;---------------------------------------------------------------------------------------
; dc prediction: every output pixel is the rounded average of the w top
; and h left neighbours: dc = (sum(top) + sum(left) + ((w+h)>>1)) / (w+h).
; Sums are accumulated *negated* via pmaddubsw/pmaddwd against all-ones
; (-1) vectors; the two negations cancel in the final pmaddwd.  The
; division is a plain shift when w == h; otherwise w+h is 3<<k or 5<<k,
; handled as a shift followed by a pmulhuw with 0x5556 (~(1/3)<<16) or
; 0x3334 (~(1/5)<<16).
cglobal ipred_dc, 3, 7, 6, dst, stride, tl, w, h, stride3
    movifnidn                    hd, hm
    movifnidn                    wd, wm
    tzcnt                       r6d, hd
    lea                         r5d, [wq+hq]
    movd                         m4, r5d                           ; m4 = w + h
    tzcnt                       r5d, r5d
    movd                         m5, r5d                           ; m5 = ctz(w + h), shift amount
    LEA                          r5, ipred_dc_ssse3_table
    tzcnt                        wd, wd
    movsxd                       r6, [r5+r6*4]                     ; height handler .h4-.h64
    movsxd                       wq, [r5+wq*4+20]                  ; width handler .w4-.w64 (5 entries in)
    pcmpeqd                      m3, m3                            ; all -1: summing constant for pmaddubsw/pmaddwd
    psrlw                        m4, 1                             ; dc = (width + height) >> 1;
    add                          r6, r5
    add                          wq, r5
    lea                    stride3q, [strideq*3]
    jmp r6
.h4:
    movd                         m0, [tlq-4]
    pmaddubsw                    m0, m3                            ; m0 = -(pairwise sums of left pixels)
    jmp                          wq
.w4:
    movd                         m1, [tlq+1]
    pmaddubsw                    m1, m3                            ; m1 = -(pairwise sums of top pixels)
    psubw                        m0, m4                            ; fold in the rounding bias
    paddw                        m0, m1
    pmaddwd                      m0, m3                            ; horizontal add; the negations cancel
    cmp                          hd, 4
    jg .w4_mul
    psrlw                        m0, 3                             ; dc >>= ctz(width + height);
    jmp .w4_end
.w4_mul:
    punpckhqdq                   m1, m0, m0
    paddw                        m0, m1
    psrlq                        m1, m0, 32
    paddw                        m0, m1
    psrlw                        m0, 2
    mov                         r6d, 0x5556                        ; ~(1/3)<<16
    mov                         r2d, 0x3334                        ; ~(1/5)<<16
    test                         hd, 8
    cmovz                       r6d, r2d
    movd                         m5, r6d
    pmulhuw                      m0, m5
.w4_end:
    pxor                         m1, m1
    pshufb                       m0, m1                            ; broadcast the dc byte to all 16 lanes
.s4:
    movd           [dstq+strideq*0], m0
    movd           [dstq+strideq*1], m0
    movd           [dstq+strideq*2], m0
    movd           [dstq+stride3q ], m0
    lea                        dstq, [dstq+strideq*4]
    sub                          hd, 4
    jg .s4
    RET
ALIGN function_align
.h8:
    movq                         m0, [tlq-8]
    pmaddubsw                    m0, m3
    jmp                          wq
.w8:
    movq                         m1, [tlq+1]
    pmaddubsw                    m1, m3
    psubw                        m4, m0
    punpckhqdq                   m0, m0
    psubw                        m0, m4
    paddw                        m0, m1
    pshuflw                      m1, m0, q1032                  ; psrlq  m1, m0, 32
    paddw                        m0, m1
    pmaddwd                      m0, m3
    psrlw                        m0, m5
    cmp                          hd, 8
    je .w8_end
    mov                         r6d, 0x5556
    mov                         r2d, 0x3334
    cmp                          hd, 32
    cmovz                       r6d, r2d
    movd                         m1, r6d
    pmulhuw                      m0, m1
.w8_end:
    pxor                         m1, m1
    pshufb                       m0, m1
.s8:
    movq           [dstq+strideq*0], m0
    movq           [dstq+strideq*1], m0
    movq           [dstq+strideq*2], m0
    movq           [dstq+stride3q ], m0
    lea                        dstq, [dstq+strideq*4]
    sub                          hd, 4
    jg .s8
    RET
ALIGN function_align
.h16:
    mova                         m0, [tlq-16]
    pmaddubsw                    m0, m3
    jmp                          wq
.w16:
    movu                         m1, [tlq+1]
    pmaddubsw                    m1, m3
    paddw                        m0, m1
    psubw                        m4, m0
    punpckhqdq                   m0, m0
    psubw                        m0, m4
    pshuflw                      m1, m0, q1032                  ; psrlq  m1, m0, 32
    paddw                        m0, m1
    pmaddwd                      m0, m3
    psrlw                        m0, m5
    cmp                          hd, 16
    je .w16_end
    mov                         r6d, 0x5556
    mov                         r2d, 0x3334
    test                         hd, 8|32
    cmovz                       r6d, r2d
    movd                         m1, r6d
    pmulhuw                      m0, m1
.w16_end:
    pxor                         m1, m1
    pshufb                       m0, m1
.s16:
    mova           [dstq+strideq*0], m0
    mova           [dstq+strideq*1], m0
    mova           [dstq+strideq*2], m0
    mova           [dstq+stride3q ], m0
    lea                        dstq, [dstq+strideq*4]
    sub                          hd, 4
    jg .s16
    RET
ALIGN function_align
.h32:
    mova                         m0, [tlq-32]
    pmaddubsw                    m0, m3
    mova                         m2, [tlq-16]
    pmaddubsw                    m2, m3
    paddw                        m0, m2
    jmp wq
.w32:
    movu                         m1, [tlq+1]
    pmaddubsw                    m1, m3
    movu                         m2, [tlq+17]
    pmaddubsw                    m2, m3
    paddw                        m1, m2
    paddw                        m0, m1
    psubw                        m4, m0
    punpckhqdq                   m0, m0
    psubw                        m0, m4
    pshuflw                      m1, m0, q1032                   ; psrlq  m1, m0, 32
    paddw                        m0, m1
    pmaddwd                      m0, m3
    psrlw                        m0, m5
    cmp                          hd, 32
    je .w32_end
    ; (a dead "lea r2d, [hq*2]" was removed here; its result was
    ;  unconditionally overwritten by the mov below)
    mov                         r6d, 0x5556
    mov                         r2d, 0x3334
    test                         hd, 64|16
    cmovz                       r6d, r2d
    movd                         m1, r6d
    pmulhuw                      m0, m1
.w32_end:
    pxor                         m1, m1
    pshufb                       m0, m1
    mova                         m1, m0
.s32:
    mova                     [dstq], m0
    mova                  [dstq+16], m1
    mova             [dstq+strideq], m0
    mova          [dstq+strideq+16], m1
    mova           [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m1
    mova            [dstq+stride3q], m0
    mova         [dstq+stride3q+16], m1
    lea                        dstq, [dstq+strideq*4]
    sub                          hd, 4
    jg .s32
    RET
ALIGN function_align
.h64:
    mova                         m0, [tlq-64]
    mova                         m1, [tlq-48]
    pmaddubsw                    m0, m3
    pmaddubsw                    m1, m3
    paddw                        m0, m1
    mova                         m1, [tlq-32]
    pmaddubsw                    m1, m3
    paddw                        m0, m1
    mova                         m1, [tlq-16]
    pmaddubsw                    m1, m3
    paddw                        m0, m1
    jmp wq
.w64:
    movu                         m1, [tlq+ 1]
    movu                         m2, [tlq+17]
    pmaddubsw                    m1, m3
    pmaddubsw                    m2, m3
    paddw                        m1, m2
    movu                         m2, [tlq+33]
    pmaddubsw                    m2, m3
    paddw                        m1, m2
    movu                         m2, [tlq+49]
    pmaddubsw                    m2, m3
    paddw                        m1, m2
    paddw                        m0, m1
    psubw                        m4, m0
    punpckhqdq                   m0, m0
    psubw                        m0, m4
    pshuflw                      m1, m0, q1032                   ; psrlq  m1, m0, 32
    paddw                        m0, m1
    pmaddwd                      m0, m3
    psrlw                        m0, m5
    cmp                          hd, 64
    je .w64_end
    mov                         r6d, 0x5556
    mov                         r2d, 0x3334
    test                         hd, 32
    cmovz                       r6d, r2d
    movd                         m1, r6d
    pmulhuw                      m0, m1
.w64_end:
    pxor                         m1, m1
    pshufb                       m0, m1
    mova                         m1, m0
    mova                         m2, m0
    mova                         m3, m0
.s64:
    mova                     [dstq], m0
    mova                  [dstq+16], m1
    mova                  [dstq+32], m2
    mova                  [dstq+48], m3
    mova             [dstq+strideq], m0
    mova          [dstq+strideq+16], m1
    mova          [dstq+strideq+32], m2
    mova          [dstq+strideq+48], m3
    lea                        dstq, [dstq+strideq*2]
    sub                          hd, 2
    jg .s64
    RET
436
437;---------------------------------------------------------------------------------------
438;int dav1d_ipred_dc_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
439;                                    const int width, const int height, const int a);
440;---------------------------------------------------------------------------------------
; dc_left: dc is the rounded average of the h left neighbours only.
; The .h64/.h32/.h16 accumulation labels fall through into each other;
; ipred_dc_top jumps into them too (with tlq pointing at the top row,
; hence the "unaligned" notes).  Stores go through the shared .s4-.s64
; splat loops of ipred_dc.
cglobal ipred_dc_left, 3, 7, 6, dst, stride, tl, w, h, stride3
    LEA                  r5, ipred_dc_left_ssse3_table
    mov                  hd, hm                ; zero upper half
    tzcnt               r6d, hd
    sub                 tlq, hq                ; step back so [tlq] reads the h left pixels
    tzcnt                wd, wm
    movu                 m0, [tlq]
    movd                 m3, [r5-ipred_dc_left_ssse3_table+pd_32768]
    movd                 m2, r6d
    psrld                m3, m2                ; m3 = 2^(15-ctz(h)); pmulhrsw by it = rounded >> ctz(h)
    movsxd               r6, [r5+r6*4]
    pcmpeqd              m2, m2                ; all -1: summing constant
    pmaddubsw            m0, m2                ; negated pairwise sums of the first 16 pixels
    add                  r6, r5
    add                  r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table
    movsxd               wq, [r5+wq*4]
    add                  wq, r5                ; wq = shared .s4-.s64 store loop for this width
    jmp                  r6
.h64:
    movu                 m1, [tlq+48]                           ; unaligned when jumping here from dc_top
    pmaddubsw            m1, m2
    paddw                m0, m1
    movu                 m1, [tlq+32]                           ; unaligned when jumping here from dc_top
    pmaddubsw            m1, m2
    paddw                m0, m1
.h32:
    movu                 m1, [tlq+16]                           ; unaligned when jumping here from dc_top
    pmaddubsw            m1, m2
    paddw                m0, m1
.h16:
    pshufd               m1, m0, q3232                          ; high qword down (acts as psrlq m1, m0, 64)
    paddw                m0, m1
.h8:
    pshuflw              m1, m0, q1032                          ; psrlq               m1, m0, 32
    paddw                m0, m1
.h4:
    pmaddwd              m0, m2                                 ; horizontal add; the negations cancel
    pmulhrsw             m0, m3                                 ; rounded divide by the pixel count
    lea            stride3q, [strideq*3]
    pxor                 m1, m1
    pshufb               m0, m1                                 ; broadcast dc byte to all lanes
    mova                 m1, m0
    mova                 m2, m0
    mova                 m3, m0
    jmp                  wq
486
487;---------------------------------------------------------------------------------------
488;int dav1d_ipred_dc_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
489;                                    const int width, const int height, const int a);
490;---------------------------------------------------------------------------------------
; dc_128: no neighbour pixels are read at all - just splat the mid-grey
; value 128 through the shared .s4-.s64 store loops of ipred_dc.
cglobal ipred_dc_128, 2, 7, 6, dst, stride, tl, w, h, stride3
    LEA                  r5, ipred_dc_splat_ssse3_table
    tzcnt                wd, wm
    movifnidn            hd, hm
    movsxd               wq, [r5+wq*4]
    movddup              m0, [r5-ipred_dc_splat_ssse3_table+pb_128]
    mova                 m1, m0
    mova                 m2, m0
    mova                 m3, m0
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  wq
503
504;---------------------------------------------------------------------------------------
505;int dav1d_ipred_dc_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
506;                                    const int width, const int height, const int a);
507;---------------------------------------------------------------------------------------
; dc_top: dc is the rounded average of the w top neighbours only.
; Reuses the dc_left .h* accumulation code on the top row (tlq+1, hence
; the unaligned loads there), with the shift derived from ctz(w).
cglobal ipred_dc_top, 3, 7, 6, dst, stride, tl, w, h
    LEA                  r5, ipred_dc_left_ssse3_table
    tzcnt                wd, wm
    inc                 tlq                   ; tlq now points at the top row
    movu                 m0, [tlq]
    movifnidn            hd, hm
    movd                 m3, [r5-ipred_dc_left_ssse3_table+pd_32768]
    movd                 m2, wd
    psrld                m3, m2                ; m3 = 2^(15-ctz(w)) for the pmulhrsw divide
    movsxd               r6, [r5+wq*4]        ; accumulation handler, indexed by width here
    pcmpeqd              m2, m2
    pmaddubsw            m0, m2                ; negated pairwise sums of the first 16 pixels
    add                  r6, r5
    add                  r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table
    movsxd               wq, [r5+wq*4]
    add                  wq, r5                ; wq = shared .s4-.s64 store loop
    jmp                  r6
525
526;---------------------------------------------------------------------------------------
527;int dav1d_ipred_smooth_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
528;                                    const int width, const int height, const int a);
529;---------------------------------------------------------------------------------------
%macro SMOOTH 6 ; weights[1-2] (signed pairs), pixels[3-4] (unsigned pairs), add-terms[5-6]
                ; Result: packed bytes in m6; clobbers m0.
                ;            w * a         = (w - 128) * a + 128 * a
                ;            (256 - w) * b = (127 - w) * b + 129 * b
                ; => w * a + (256 - w) * b = [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b]
    pmaddubsw            m6, m%3, m%1
    pmaddubsw            m0, m%4, m%2                    ; (w - 128) * a + (127 - w) * b
    paddw                m6, m%5
    paddw                m0, m%6                         ; [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b + 128]
    psrlw                m6, 8
    psrlw                m0, 8
    packuswb             m6, m0
%endmacro
542
; smooth_v: each output row is a weighted blend of the top neighbour row
; and the bottom-left neighbour pixel, with per-row weights taken from
; smooth_weights (see the SMOOTH macro for the pmaddubsw decomposition).
cglobal ipred_smooth_v, 3, 7, 7, dst, stride, tl, w, h, weights
%define base r6-ipred_smooth_v_ssse3_table
    LEA                  r6, ipred_smooth_v_ssse3_table
    tzcnt                wd, wm
    mov                  hd, hm
    movsxd               wq, [r6+wq*4]
    movddup              m0, [base+pb_127_m127]
    movddup              m1, [base+pw_128]
    lea            weightsq, [base+smooth_weights+hq*4]
    neg                  hq                              ; row counter runs from -h up to 0
    movd                 m5, [tlq+hq]                    ; bottom-left neighbour pixel
    pxor                 m2, m2
    pshufb               m5, m2                          ; broadcast it to all lanes
    add                  wq, r6
    jmp                  wq
.w4:
    movd                 m2, [tlq+1]
    punpckldq            m2, m2
    punpcklbw            m2, m5                          ; top, bottom
    lea                  r3, [strideq*3]
    mova                 m4, [base+ipred_v_shuf]
    mova                 m5, m4
    punpckldq            m4, m4
    punpckhdq            m5, m5
    pmaddubsw            m3, m2, m0                      ; m3: 127 * top - 127 * bottom
    paddw                m1, m2                          ; m1:   1 * top + 256 * bottom + 128, overflow is ok
    paddw                m3, m1                          ; m3: 128 * top + 129 * bottom + 128
.w4_loop:
    movu                 m1, [weightsq+hq*2]             ; weight pairs for the next 4 rows
    pshufb               m0, m1, m4                      ;m2, m3, m4 and m5 should be stable in loop
    pshufb               m1, m5
    SMOOTH                0, 1, 2, 2, 3, 3
    movd   [dstq+strideq*0], m6
    pshuflw              m1, m6, q1032
    movd   [dstq+strideq*1], m1
    punpckhqdq           m6, m6
    movd   [dstq+strideq*2], m6
    psrlq                m6, 32
    movd   [dstq+r3       ], m6
    lea                dstq, [dstq+strideq*4]
    add                  hq, 4
    jl .w4_loop
    RET
ALIGN function_align
.w8:
    movq                 m2, [tlq+1]
    punpcklbw            m2, m5
    mova                 m5, [base+ipred_v_shuf]
    lea                  r3, [strideq*3]
    pshufd               m4, m5, q0000
    pshufd               m5, m5, q1111
    pmaddubsw            m3, m2, m0
    paddw                m1, m2
    paddw                m3, m1                           ; m3 is output for loop
.w8_loop:
    movq                 m1, [weightsq+hq*2]             ; weight pairs for the next 2 rows
    pshufb               m0, m1, m4
    pshufb               m1, m5
    SMOOTH                0, 1, 2, 2, 3, 3
    movq   [dstq+strideq*0], m6
    movhps [dstq+strideq*1], m6
    lea                dstq, [dstq+strideq*2]
    add                  hq, 2
    jl .w8_loop
    RET
ALIGN function_align
.w16:
    movu                 m3, [tlq+1]
    punpcklbw            m2, m3, m5
    punpckhbw            m3, m5
    pmaddubsw            m4, m2, m0
    pmaddubsw            m5, m3, m0
    paddw                m0, m1, m2
    paddw                m1, m3
    paddw                m4, m0
    paddw                m5, m1                           ; m4 and m5 is output for loop
.w16_loop:
    movd                 m1, [weightsq+hq*2]
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1                           ; broadcast this row's weight pair
    SMOOTH 1, 1, 2, 3, 4, 5
    mova             [dstq], m6
    add                dstq, strideq
    add                  hq, 1
    jl .w16_loop
    RET
ALIGN function_align
.w32:
%if WIN64
    movaps         [rsp+24], xmm7                         ; xmm6-xmm15 are callee-saved in the Windows x64 ABI
    %define xmm_regs_used 8
%endif
    mova                 m7, m5                           ; keep the bottom pixel; m5 is reused below
.w32_loop_init:
    mov                 r3d, 2                            ; two 16-byte columns per row
.w32_loop:
    movddup              m0, [base+pb_127_m127]
    movddup              m1, [base+pw_128]
    movu                 m3, [tlq+1]
    punpcklbw            m2, m3, m7
    punpckhbw            m3, m7
    pmaddubsw            m4, m2, m0
    pmaddubsw            m5, m3, m0
    paddw                m0, m1, m2
    paddw                m1, m3
    paddw                m4, m0
    paddw                m5, m1
    movd                 m1, [weightsq+hq*2]
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
    SMOOTH                1, 1, 2, 3, 4, 5
    mova             [dstq], m6
    add                 tlq, 16
    add                dstq, 16
    dec                 r3d
    jg .w32_loop
    lea                dstq, [dstq-32+strideq]            ; rewind columns, advance one row
    sub                 tlq, 32
    add                  hq, 1
    jl .w32_loop_init
    RET
ALIGN function_align
.w64:
%if WIN64
    movaps         [rsp+24], xmm7                         ; xmm6-xmm15 are callee-saved in the Windows x64 ABI
    %define xmm_regs_used 8
%endif
    mova                 m7, m5                           ; keep the bottom pixel; m5 is reused below
.w64_loop_init:
    mov                 r3d, 4                            ; four 16-byte columns per row
.w64_loop:
    movddup              m0, [base+pb_127_m127]
    movddup              m1, [base+pw_128]
    movu                 m3, [tlq+1]
    punpcklbw            m2, m3, m7
    punpckhbw            m3, m7
    pmaddubsw            m4, m2, m0
    pmaddubsw            m5, m3, m0
    paddw                m0, m1, m2
    paddw                m1, m3
    paddw                m4, m0
    paddw                m5, m1
    movd                 m1, [weightsq+hq*2]
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
    SMOOTH                1, 1, 2, 3, 4, 5
    mova             [dstq], m6
    add                 tlq, 16
    add                dstq, 16
    dec                 r3d
    jg .w64_loop
    lea                dstq, [dstq-64+strideq]            ; rewind columns, advance one row
    sub                 tlq, 64
    add                  hq, 1
    jl .w64_loop_init
    RET
699
700;---------------------------------------------------------------------------------------
701;int dav1d_ipred_smooth_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
702;                                    const int width, const int height, const int a);
703;---------------------------------------------------------------------------------------
; smooth_h: each output pixel is a weighted blend of its row's left
; neighbour pixel and the top-right neighbour pixel, with per-column
; weights from smooth_weights (same pmaddubsw offset decomposition as
; the SMOOTH macro: 127*l - 127*r plus 128*l + 129*r + 128, then >> 8).
cglobal ipred_smooth_h, 3, 7, 8, dst, stride, tl, w, h
%define base r6-ipred_smooth_h_ssse3_table
    LEA                  r6, ipred_smooth_h_ssse3_table
    mov                  wd, wm
    movd                 m3, [tlq+wq]                    ; top-right neighbour pixel
    pxor                 m1, m1
    pshufb               m3, m1                          ; right
    tzcnt                wd, wd
    mov                  hd, hm
    movsxd               wq, [r6+wq*4]
    movddup              m4, [base+pb_127_m127]
    movddup              m5, [base+pw_128]
    add                  wq, r6
    jmp                  wq
.w4:
    movddup              m6, [base+smooth_weights+4*2]   ; width-4 weight pairs
    mova                 m7, [base+ipred_h_shuf]
    sub                 tlq, 4
    sub                 tlq, hq                          ; step back so [tlq+hq] reads 4 left pixels
    lea                  r3, [strideq*3]
.w4_loop:
    movd                 m2, [tlq+hq]                    ; left
    pshufb               m2, m7                          ; one left pixel broadcast per output row
    punpcklbw            m1, m2, m3                      ; left, right
    punpckhbw            m2, m3
    pmaddubsw            m0, m1, m4                      ; 127 * left - 127 * right
    paddw                m0, m1                          ; 128 * left + 129 * right
    pmaddubsw            m1, m6
    paddw                m1, m5
    paddw                m0, m1
    pmaddubsw            m1, m2, m4
    paddw                m1, m2
    pmaddubsw            m2, m6
    paddw                m2, m5
    paddw                m1, m2
    psrlw                m0, 8
    psrlw                m1, 8
    packuswb             m0, m1
    movd   [dstq+strideq*0], m0
    pshuflw              m1, m0, q1032
    movd   [dstq+strideq*1], m1
    punpckhqdq           m0, m0
    movd   [dstq+strideq*2], m0
    psrlq                m0, 32
    movd   [dstq+r3       ], m0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4_loop
    RET
ALIGN function_align
.w8:
    mova                 m6, [base+smooth_weights+8*2]   ; width-8 weight pairs
    mova                 m7, [base+ipred_h_shuf]
    sub                 tlq, 4
    sub                 tlq, hq
    punpckldq            m7, m7                          ; shuffle now spans 2 rows of 8
.w8_loop:
    movd                 m2, [tlq+hq]                    ; left
    pshufb               m2, m7
    punpcklbw            m1, m2, m3                      ; left, right
    punpckhbw            m2, m3
    pmaddubsw            m0, m1, m4                      ; 127 * left - 127 * right
    paddw                m0, m1                          ; 128 * left + 129 * right
    pmaddubsw            m1, m6
    paddw                m1, m5
    paddw                m0, m1
    pmaddubsw            m1, m2, m4
    paddw                m1, m2
    pmaddubsw            m2, m6
    paddw                m2, m5
    paddw                m1, m2
    psrlw                m0, 8
    psrlw                m1, 8
    packuswb             m0, m1
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w8_loop
    RET
ALIGN function_align
.w16:
    mova                 m6, [base+smooth_weights+16*2]  ; width-16 weight pairs, columns 0-7
    mova                 m7, [base+smooth_weights+16*3]  ; width-16 weight pairs, columns 8-15
    sub                 tlq, 1
    sub                 tlq, hq
.w16_loop:
    pxor                 m1, m1
    movd                 m2, [tlq+hq]                    ; left
    pshufb               m2, m1                          ; broadcast this row's left pixel
    punpcklbw            m1, m2, m3                      ; left, right
    punpckhbw            m2, m3
    pmaddubsw            m0, m1, m4                      ; 127 * left - 127 * right
    paddw                m0, m1                          ; 128 * left + 129 * right
    pmaddubsw            m1, m6
    paddw                m1, m5
    paddw                m0, m1
    pmaddubsw            m1, m2, m4
    paddw                m1, m2
    pmaddubsw            m2, m7
    paddw                m2, m5
    paddw                m1, m2
    psrlw                m0, 8
    psrlw                m1, 8
    packuswb             m0, m1
    mova             [dstq], m0
810    lea                dstq, [dstq+strideq]
811    sub                  hd, 1
812    jg .w16_loop
813    RET
814ALIGN function_align
815.w32:
816    sub                 tlq, 1
817    sub                 tlq, hq
818    pxor                 m6, m6
819.w32_loop_init:
820    mov                  r5, 2
821    lea                  r3, [base+smooth_weights+16*4]
822.w32_loop:
823    mova                 m7, [r3]
824    add                  r3, 16
825    movd                 m2, [tlq+hq]                    ; left
826    pshufb               m2, m6
827    punpcklbw            m1, m2, m3                      ; left, right
828    punpckhbw            m2, m3
829    pmaddubsw            m0, m1, m4                      ; 127 * left - 127 * right
830    paddw                m0, m1                          ; 128 * left + 129 * right
831    pmaddubsw            m1, m7
832    paddw                m1, m5
833    paddw                m0, m1
834    pmaddubsw            m1, m2, m4
835    paddw                m1, m2
836    mova                 m7, [r3]
837    add                  r3, 16
838    pmaddubsw            m2, m7
839    paddw                m2, m5
840    paddw                m1, m2
841    psrlw                m0, 8
842    psrlw                m1, 8
843    packuswb             m0, m1
844    mova             [dstq], m0
845    add                dstq, 16
846    dec                  r5
847    jg .w32_loop
848    lea                dstq, [dstq-32+strideq]
849    sub                  hd, 1
850    jg .w32_loop_init
851    RET
852ALIGN function_align
853.w64:
854    sub                 tlq, 1
855    sub                 tlq, hq
856    pxor                 m6, m6
857.w64_loop_init:
858    mov                  r5, 4
859    lea                  r3, [base+smooth_weights+16*8]
860.w64_loop:
861    mova                 m7, [r3]
862    add                  r3, 16
863    movd                 m2, [tlq+hq]                    ; left
864    pshufb               m2, m6
865    punpcklbw            m1, m2, m3                      ; left, right
866    punpckhbw            m2, m3
867    pmaddubsw            m0, m1, m4                      ; 127 * left - 127 * right
868    paddw                m0, m1                          ; 128 * left + 129 * right
869    pmaddubsw            m1, m7
870    paddw                m1, m5
871    paddw                m0, m1
872    pmaddubsw            m1, m2, m4
873    paddw                m1, m2
874    mova                 m7, [r3]
875    add                  r3, 16
876    pmaddubsw            m2, m7
877    paddw                m2, m5
878    paddw                m1, m2
879    psrlw                m0, 8
880    psrlw                m1, 8
881    packuswb             m0, m1
882    mova             [dstq], m0
883    add                dstq, 16
884    dec                  r5
885    jg .w64_loop
886    lea                dstq, [dstq-64+strideq]
887    sub                  hd, 1
888    jg .w64_loop_init
889    RET
890
891;---------------------------------------------------------------------------------------
892;int dav1d_ipred_smooth_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
893;                                    const int width, const int height, const int a);
894;---------------------------------------------------------------------------------------
; SMOOTH_2D_END src1, src2, mul1, mul2, add1, add2, m3
; Final blend step of the 2D smooth prediction: computes the two 16-bit
; horizontal sums (pmaddubsw of src*mul), adds the precomputed vertical
; terms (%5/%6), averages with the other direction's sums already held in
; m2/m3, then >>8 and packs to 8-bit pixels in m0.
; %5/%6/%7 may be register numbers or memory operands; %ifnum selects the
; addressing form.  m6 is used as scratch because at the call sites %2 can
; be m1 itself, and x86inc's SSE 2-operand emulation of the 3-operand form
; would clobber that source if the destination aliased it.
%macro SMOOTH_2D_END  7                                  ; src[1-2], mul[1-2], add[1-2], m3
    pmaddubsw            m6, m%3, m%1                    ; scratch: keep m0/m1 sources intact
    mova                 m0, m6
    pmaddubsw            m6, m%4, m%2
    mova                 m1, m6
%ifnum %5
    paddw                m0, m%5
%else
    paddw                m0, %5                          ; vertical term from memory
%endif
%ifnum %6
    paddw                m1, m%6
%else
    paddw                m1, %6
%endif
%ifnum %7
%else
    mova                 m3, %7                          ; load second accumulator from memory
%endif
    pavgw                m0, m2                          ; average hor/ver blends
    pavgw                m1, m3
    psrlw                m0, 8                           ; drop the weight fraction
    psrlw                m1, 8
    packuswb             m0, m1                          ; 16 output pixels in m0
%endmacro
920
; SMOOTH_OUTPUT_16B top, buf1, buf2, buf3, buf4, w_hor0, w_hor1, acc, vshuf, m0sav, m4sav, m5sav
; Emits one 16-pixel-wide row of the 2D smooth prediction at [dstq].
; Vertical blend: top row from [rsp+16*%1] against bottom (m0); horizontal
; blend: left pixel (topleft[-(1+y)], broadcast) against right (m4), with
; horizontal weights %6/%7 and the per-row vertical weight from [v_weightsq].
; %2-%5 and %8 are rsp spill slots for intermediates; %9 holds the shuffle
; that broadcasts the vertical weight; %10-%12 restore the constants in
; m0/m4/m5 that this macro clobbers.  Statement order is load-bearing: the
; spills at %2-%5 are reloaded into m4/m5/m3/m7 right before SMOOTH_2D_END.
%macro SMOOTH_OUTPUT_16B  12      ; m1, [buffer1, buffer2, buffer3, buffer4,] [w1, w2,] m3, m7, [m0, m4, m5]
    mova                 m1, [rsp+16*%1]                  ; top
    punpckhbw            m6, m1, m0                       ; top, bottom
    punpcklbw            m1, m0                           ; top, bottom
    pmaddubsw            m2, m1, m5
    mova        [rsp+16*%2], m1
    paddw                m1, m3                           ;   1 * top + 255 * bottom + 255
    paddw                m2, m1                           ; 128 * top + 129 * bottom + 255
    mova        [rsp+16*%3], m2
    pmaddubsw            m2, m6, m5
    mova        [rsp+16*%4], m6
    paddw                m6, m3                           ;   1 * top + 255 * bottom + 255
    paddw                m2, m6                           ; 128 * top + 129 * bottom + 255
    mova        [rsp+16*%5], m2
    movd                 m1, [tlq+hq]                     ; left
    pshufb               m1, [base+pb_3]                  ; topleft[-(1 + y)]
    punpcklbw            m1, m4                           ; left, right
    pmaddubsw            m2, m1, m5                       ; 127 * left - 127 * right
    paddw                m2, m1                           ; 128 * left + 129 * right
    mova                 m3, m2
    pmaddubsw            m0, m1, %6                       ; weights_hor = &dav1d_sm_weights[width];
    pmaddubsw            m1, %7
    paddw                m2, m3, m0
    paddw                m3, m1
    movd                 m1, [v_weightsq]                 ; weights_ver = &dav1d_sm_weights[height];
    mova                 m7, [rsp+16*%9]
    pshufb               m1, m7                           ; broadcast this row's vertical weight
    mova        [rsp+16*%8], m3
    mova                 m4, [rsp+16*%2]
    mova                 m5, [rsp+16*%3]
    mova                 m3, [rsp+16*%4]
    mova                 m7, [rsp+16*%5]
    SMOOTH_2D_END         1, 1, 4, 3, 5, 7, [rsp+16*%8]
    mova             [dstq], m0
    movddup              m3, [base+pw_255]                ; recovery
    mova                 m0, [rsp+16*%10]                 ; recovery
    mova                 m4, [rsp+16*%11]                 ; recovery
    mova                 m5, [rsp+16*%12]                 ; recovery
%endmacro
960
; 2D smooth intra prediction: each output pixel averages a vertical blend
; (top row vs. bottom-left pixel) and a horizontal blend (left column vs.
; top-right pixel), with sm_weights-based weights in each direction.
; 13 16-byte stack slots spill constants and per-row intermediates.
; Register roles after setup: m0 = bottom pixel (broadcast), m3 = pw_255,
; m4 = right pixel (broadcast), m5 = pb_127_m127; v_weightsq walks the
; per-row vertical weight table.
cglobal ipred_smooth, 3, 7, 8, -13*16, dst, stride, tl, w, h, v_weights
%define base r6-ipred_smooth_ssse3_table
    mov                  wd, wm
    mov                  hd, hm
    LEA                  r6, ipred_smooth_ssse3_table
    movd                 m4, [tlq+wq]                     ; right
    pxor                 m2, m2
    pshufb               m4, m2                           ; broadcast right pixel
    tzcnt                wd, wd
    mov                  r5, tlq
    sub                  r5, hq                           ; r5 = &topleft[-h] (bottom-left pixel)
    movsxd               wq, [r6+wq*4]
    movddup              m5, [base+pb_127_m127]
    movd                 m0, [r5]
    pshufb               m0, m2                           ; bottom
    movddup              m3, [base+pw_255]
    add                  wq, r6
    lea          v_weightsq, [base+smooth_weights+hq*2]   ; weights_ver = &dav1d_sm_weights[height]
    jmp                  wq
.w4:
    mova                 m7, [base+ipred_v_shuf]
    movd                 m1, [tlq+1]                      ; left
    pshufd               m1, m1, q0000
    sub                 tlq, 4
    lea                  r3, [strideq*3]
    sub                 tlq, hq
    punpcklbw            m1, m0                           ; top, bottom
    pshufd               m6, m7, q1100                    ; shuffles broadcasting v-weight pairs
    pshufd               m7, m7, q3322
    pmaddubsw            m2, m1, m5
    paddw                m3, m1                           ;   1 * top + 255 * bottom + 255
    paddw                m2, m3                           ; 128 * top + 129 * bottom + 255
    ; stack layout for the loop: 0 = top/bottom interleave, 1 = vertical
    ; blend term, 2 = horizontal weights, 3 = right, 4 = m6 shuffle, 5 = m5
    mova         [rsp+16*0], m1
    mova         [rsp+16*1], m2
    movq                 m1,  [base+smooth_weights+4*2]   ; weights_hor = &dav1d_sm_weights[width];
    punpcklqdq           m1, m1
    mova         [rsp+16*2], m1
    mova         [rsp+16*3], m4
    mova         [rsp+16*4], m6
    mova         [rsp+16*5], m5
.w4_loop:
    movd                 m1, [tlq+hq]                 ; left
    pshufb               m1, [base+ipred_h_shuf]
    punpcklbw            m0, m1, m4                   ; left, right
    punpckhbw            m1, m4
    pmaddubsw            m2, m0, m5                   ; 127 * left - 127 * right
    pmaddubsw            m3, m1, m5
    paddw                m2, m0                       ; 128 * left + 129 * right
    paddw                m3, m1
    mova                 m4, [rsp+16*2]
    pmaddubsw            m0, m4
    pmaddubsw            m1, m4
    paddw                m2, m0
    paddw                m3, m1
    movq                 m1, [v_weightsq]             ; weights_ver = &dav1d_sm_weights[height];
    add          v_weightsq, 8
    pshufb               m0, m1, m6
    pshufb               m1, m7
    mova                 m4, [rsp+16*0]
    mova                 m5, [rsp+16*1]
    SMOOTH_2D_END         0, 1, 4, 4, 5, 5, 3
    mova                 m4, [rsp+16*3]               ; restore clobbered constants
    mova                 m6, [rsp+16*4]
    mova                 m5, [rsp+16*5]
    movd   [dstq+strideq*0], m0
    pshuflw              m1, m0, q1032
    movd   [dstq+strideq*1], m1
    punpckhqdq           m0, m0
    movd   [dstq+strideq*2], m0
    psrlq                m0, 32
    movd   [dstq+r3       ], m0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4_loop
    RET
ALIGN function_align
.w8:
    mova                 m7, [base+ipred_v_shuf]
    movq                 m1, [tlq+1]                  ; left
    punpcklqdq           m1, m1
    sub                 tlq, 4
    sub                 tlq, hq
    punpcklbw            m1, m0
    pshufd               m6, m7, q0000
    pshufd               m7, m7, q1111
    pmaddubsw            m2, m1, m5
    paddw                m3, m1
    paddw                m2, m3
    mova         [rsp+16*0], m1
    mova         [rsp+16*1], m2
    mova                 m1, [base+smooth_weights+8*2] ; weights_hor = &dav1d_sm_weights[width];
    mova         [rsp+16*2], m1
    mova         [rsp+16*3], m4
    mova         [rsp+16*4], m6
    mova         [rsp+16*5], m5
.w8_loop:
    movd                 m1, [tlq+hq]                  ; left
    pshufb               m1, [base+ipred_h_shuf]
    pshufd               m1, m1, q1100
    punpcklbw            m0, m1, m4
    punpckhbw            m1, m4
    pmaddubsw            m2, m0, m5
    pmaddubsw            m3, m1, m5
    paddw                m2, m0
    paddw                m3, m1
    mova                 m4,  [rsp+16*2]
    pmaddubsw            m0, m4
    pmaddubsw            m1, m4
    paddw                m2, m0
    paddw                m3, m1
    movd                 m1, [v_weightsq]              ; weights_ver = &dav1d_sm_weights[height];
    add          v_weightsq, 4
    pshufb               m0, m1, m6
    pshufb               m1, m7
    mova                 m4, [rsp+16*0]
    mova                 m5, [rsp+16*1]
    SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3
    mova                 m4, [rsp+16*3]                ; restore clobbered constants
    mova                 m6, [rsp+16*4]
    mova                 m5, [rsp+16*5]
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w8_loop
    RET
ALIGN function_align
.w16:
    mova                 m7, [base+ipred_v_shuf]
    movu                 m1, [tlq+1]                     ; left
    sub                 tlq, 4
    sub                 tlq, hq
    punpckhbw            m6, m1, m0                      ; top, bottom
    punpcklbw            m1, m0                          ; top, bottom
    pshufd               m7, m7, q0000
    mova         [rsp+16*2], m7
    pmaddubsw            m2, m6, m5
    mova         [rsp+16*5], m6
    paddw                m6, m3                          ;   1 * top + 255 * bottom + 255
    paddw                m2, m6                          ; 128 * top + 129 * bottom + 255
    mova         [rsp+16*6], m2
    pmaddubsw            m2, m1, m5
    paddw                m3, m1                          ;   1 * top + 255 * bottom + 255
    mova         [rsp+16*0], m1
    paddw                m2, m3                          ; 128 * top + 129 * bottom + 255
    mova         [rsp+16*1], m2
    mova         [rsp+16*3], m4
    mova         [rsp+16*4], m5
.w16_loop:
    movd                 m1, [tlq+hq]                    ; left
    pshufb               m1, [base+pb_3]                 ; topleft[-(1 + y)]
    punpcklbw            m1, m4                          ; left, right
    pmaddubsw            m2, m1, m5                      ; 127 * left - 127 * right
    paddw                m2, m1                          ; 128 * left + 129 * right
    mova                 m0, m1
    mova                 m3, m2
    pmaddubsw            m0, [base+smooth_weights+16*2]  ; weights_hor = &dav1d_sm_weights[width];
    pmaddubsw            m1, [base+smooth_weights+16*3]
    paddw                m2, m0
    paddw                m3, m1
    movd                 m1, [v_weightsq]                ; weights_ver = &dav1d_sm_weights[height];
    add          v_weightsq, 2
    mova                 m7, [rsp+16*2]
    pshufb               m1, m7                          ; broadcast vertical weight
    mova         [rsp+16*7], m3
    mova                 m4, [rsp+16*0]
    mova                 m5, [rsp+16*1]
    mova                 m3, [rsp+16*5]
    mova                 m7, [rsp+16*6]
    SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*7]
    mova                 m4, [rsp+16*3]                  ; restore clobbered constants
    mova                 m5, [rsp+16*4]
    mova             [dstq], m0
    lea                dstq, [dstq+strideq]
    sub                  hd, 1
    jg .w16_loop
    RET
ALIGN function_align
.w32:
    ; top row cached in two stack slots; SMOOTH_OUTPUT_16B does one
    ; 16-pixel column per invocation and restores m0/m4/m5 itself
    movu                 m1, [tlq+1]                     ; top     topleft[1 + x]
    movu                 m2, [tlq+17]                    ; top
    mova         [rsp+16*0], m1
    mova         [rsp+16*1], m2
    sub                 tlq, 4
    sub                 tlq, hq
    mova                 m7, [base+ipred_v_shuf]
    pshufd               m7, m7, q0000
    mova         [rsp+16*2], m7
    mova         [rsp+16*3], m0
    mova         [rsp+16*4], m4
    mova         [rsp+16*5], m5
.w32_loop:
    SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*4], [base+smooth_weights+16*5], 10, 2, 3, 4, 5
    add                dstq, 16
    SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*6], [base+smooth_weights+16*7], 10, 2, 3, 4, 5
    lea                dstq, [dstq-16+strideq]
    add          v_weightsq, 2
    sub                  hd, 1
    jg .w32_loop
    RET
ALIGN function_align
.w64:
    ; same as .w32 but four 16-pixel columns; top cached in slots 0,1,11,12
    movu                 m1, [tlq+1]                     ; top     topleft[1 + x]
    movu                 m2, [tlq+17]                    ; top
    mova         [rsp+16*0], m1
    mova         [rsp+16*1], m2
    movu                 m1, [tlq+33]                    ; top
    movu                 m2, [tlq+49]                    ; top
    mova        [rsp+16*11], m1
    mova        [rsp+16*12], m2
    sub                 tlq, 4
    sub                 tlq, hq
    mova                 m7, [base+ipred_v_shuf]
    pshufd               m7, m7, q0000
    mova         [rsp+16*2], m7
    mova         [rsp+16*3], m0
    mova         [rsp+16*4], m4
    mova         [rsp+16*5], m5
.w64_loop:
    SMOOTH_OUTPUT_16B  0, 6, 7, 8, 9,  [base+smooth_weights+16*8],  [base+smooth_weights+16*9], 10, 2, 3, 4, 5
    add                dstq, 16
    SMOOTH_OUTPUT_16B  1, 6, 7, 8, 9, [base+smooth_weights+16*10], [base+smooth_weights+16*11], 10, 2, 3, 4, 5
    add                dstq, 16
    SMOOTH_OUTPUT_16B 11, 6, 7, 8, 9, [base+smooth_weights+16*12], [base+smooth_weights+16*13], 10, 2, 3, 4, 5
    add                dstq, 16
    SMOOTH_OUTPUT_16B 12, 6, 7, 8, 9, [base+smooth_weights+16*14], [base+smooth_weights+16*15], 10, 2, 3, 4, 5
    lea                dstq, [dstq-48+strideq]
    add          v_weightsq, 2
    sub                  hd, 1
    jg .w64_loop
    RET
1192
1193;---------------------------------------------------------------------------------------
1194;int dav1d_pal_pred_ssse3(pixel *dst, const ptrdiff_t stride, const uint16_t *const pal,
1195;                                         const uint8_t *idx, const int w, const int h);
1196;---------------------------------------------------------------------------------------
; Palette prediction: dst[x] = pal[idx[x]].  The 16-bit palette entries are
; packed down to bytes in m4 and each pshufb performs 16 table lookups at
; once; idx supplies one nibble-expanded byte index per pixel.
cglobal pal_pred, 4, 6, 5, dst, stride, pal, idx, w, h
    mova                 m4, [palq]
    LEA                  r2, pal_pred_ssse3_table
    tzcnt                wd, wm
    movifnidn            hd, hm
    movsxd               wq, [r2+wq*4]
    packuswb             m4, m4                          ; palette as 8-bit in both halves
    add                  wq, r2
    lea                  r2, [strideq*3]
    jmp                  wq
.w4:
    ; 16 indices -> 4 rows of 4 pixels
    pshufb               m0, m4, [idxq]
    add                idxq, 16
    movd   [dstq          ], m0
    pshuflw              m1, m0, q1032
    movd   [dstq+strideq  ], m1
    punpckhqdq           m0, m0
    movd   [dstq+strideq*2], m0
    psrlq                m0, 32
    movd   [dstq+r2       ], m0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4
    RET
ALIGN function_align
.w8:
    ; 32 indices -> 4 rows of 8 pixels
    pshufb               m0, m4, [idxq]
    pshufb               m1, m4, [idxq+16]
    add                idxq, 32
    movq   [dstq          ], m0
    movhps [dstq+strideq  ], m0
    movq   [dstq+strideq*2], m1
    movhps [dstq+r2       ], m1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w8
    RET
ALIGN function_align
.w16:
    ; 64 indices -> 4 rows of 16 pixels
    pshufb               m0, m4, [idxq]
    pshufb               m1, m4, [idxq+16]
    pshufb               m2, m4, [idxq+32]
    pshufb               m3, m4, [idxq+48]
    add                idxq, 64
    mova   [dstq          ], m0
    mova   [dstq+strideq  ], m1
    mova   [dstq+strideq*2], m2
    mova   [dstq+r2       ], m3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w16
    RET
ALIGN function_align
.w32:
    ; 64 indices -> 2 rows of 32 pixels
    pshufb               m0, m4, [idxq]
    pshufb               m1, m4, [idxq+16]
    pshufb               m2, m4, [idxq+32]
    pshufb               m3, m4, [idxq+48]
    add                idxq, 64
    mova  [dstq           ], m0
    mova  [dstq+16        ], m1
    mova  [dstq+strideq   ], m2
    mova  [dstq+strideq+16], m3
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w32
    RET
ALIGN function_align
.w64:
    ; 64 indices -> 1 row of 64 pixels
    pshufb               m0, m4, [idxq]
    pshufb               m1, m4, [idxq+16]
    pshufb               m2, m4, [idxq+32]
    pshufb               m3, m4, [idxq+48]
    add                idxq, 64
    mova          [dstq   ], m0
    mova          [dstq+16], m1
    mova          [dstq+32], m2
    mova          [dstq+48], m3
    add                dstq, strideq
    sub                  hd, 1
    jg .w64
    RET
1279
1280;---------------------------------------------------------------------------------------
1281;void dav1d_ipred_cfl_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
1282;                           const int width, const int height, const int16_t *ac, const int alpha);
1283;---------------------------------------------------------------------------------------
; IPRED_CFL reg
; m%1 = dc + ((ac * alpha + 32) >> 6), done in sign-magnitude form so the
; shift rounds toward zero: m1 = alpha (broadcast word), m2 = |alpha| << 9,
; so pmulhrsw(|ac|, m2) = (|ac| * |alpha| + 32) >> 6.  The first psignw
; captures the sign of the product in m3 (zeroing lanes where either factor
; is 0), the second reapplies it, then the dc in m0 is added.  Clobbers m3.
%macro IPRED_CFL 1                   ; ac in, unpacked pixels out
    psignw               m3, m%1, m1                     ; m3 = ac with the product's sign
    pabsw               m%1, m%1
    pmulhrsw            m%1, m2                          ; (|ac| * |alpha| + 32) >> 6
    psignw              m%1, m3                          ; restore sign
    paddw               m%1, m0                          ; + dc
%endmacro
1291
; Scratch GPR t0 for the cfl functions: r7 on UNIX64, r5 elsewhere
; (Win64 / x86-32 have fewer freely usable registers here).
%if UNIX64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 5
%endif
1297
; DC + chroma-from-luma prediction.  Phase 1 (.h*/.w* labels, reached via
; two jump tables) computes the dc value: pmaddubsw with m3 = all-ones
; (-1 bytes) accumulates NEGATED pair sums of the edge pixels; the final
; pmaddwd by -1 flips the sign back while also folding in m4 = (w+h)/2 as
; rounding bias.  The shift count m5 = ctz(w+h) divides by the pixel count
; when w+h is a power of two; otherwise a pmulhuw by 0x5556 (~(1<<16)/3)
; or 0x3334 (~(1<<16)/5) supplies the remaining /3 or /5 factor.
; Phase 2 (.s* labels) broadcasts dc into m0 and applies alpha*ac per
; pixel via IPRED_CFL (m1 = alpha, m2 = |alpha| << 9).
cglobal ipred_cfl, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
    movifnidn            wd, wm
    movifnidn            hd, hm
    tzcnt               r6d, hd
    lea                 t0d, [wq+hq]
    movd                 m4, t0d
    tzcnt               t0d, t0d
    movd                 m5, t0d                          ; m5 = ctz(w + h)
    LEA                  t0, ipred_cfl_ssse3_table
    tzcnt                wd, wd
    movsxd               r6, [t0+r6*4]
    movsxd               wq, [t0+wq*4+16]
    pcmpeqd              m3, m3                           ; all-ones: -1 per byte/word
    psrlw                m4, 1                            ; m4 = (w + h) / 2 rounding bias
    add                  r6, t0
    add                  wq, t0
    movifnidn           acq, acmp
    jmp                  r6
.h4:
    movd                 m0, [tlq-4]
    pmaddubsw            m0, m3                           ; -(left pair sums)
    jmp                  wq
.w4:
    movd                 m1, [tlq+1]
    pmaddubsw            m1, m3                           ; -(top pair sums)
    psubw                m0, m4                           ; fold in rounding bias (negated domain)
    paddw                m0, m1
    pmaddwd              m0, m3                           ; negate back while summing pairs
    cmp                  hd, 4
    jg .w4_mul
    psrlw                m0, 3                             ; dc >>= ctz(width + height);
    jmp .w4_end
.w4_mul:
    punpckhqdq           m1, m0, m0
    paddw                m0, m1
    pshuflw              m1, m0, q1032                     ; psrlq                m1, m0, 32
    paddw                m0, m1
    psrlw                m0, 2
    mov                 r6d, 0x5556                        ; ~(1<<16)/3: /12 total for h == 8
    mov                 r2d, 0x3334                        ; ~(1<<16)/5: /20 total for h == 16
    test                 hd, 8
    cmovz               r6d, r2d
    movd                 m5, r6d
    pmulhuw              m0, m5
.w4_end:
    pshuflw              m0, m0, q0000                     ; broadcast dc
    punpcklqdq           m0, m0
.s4:
    movd                 m1, alpham
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1                            ; m1 = alpha broadcast
    lea                  r6, [strideq*3]
    pabsw                m2, m1
    psllw                m2, 9                             ; m2 = |alpha| << 9 for pmulhrsw
.s4_loop:
    mova                 m4, [acq]
    mova                 m5, [acq+16]
    IPRED_CFL             4
    IPRED_CFL             5
    packuswb             m4, m5
    movd   [dstq+strideq*0], m4
    pshuflw              m4, m4, q1032
    movd   [dstq+strideq*1], m4
    punpckhqdq           m4, m4
    movd   [dstq+strideq*2], m4
    psrlq                m4, 32
    movd   [dstq+r6       ], m4
    lea                dstq, [dstq+strideq*4]
    add                 acq, 32
    sub                  hd, 4
    jg .s4_loop
    RET
ALIGN function_align
.h8:
    movq                 m0, [tlq-8]
    pmaddubsw            m0, m3
    jmp                  wq
.w8:
    movq                 m1, [tlq+1]
    pmaddubsw            m1, m3
    psubw                m4, m0                            ; bias - left sums (sign juggling)
    punpckhqdq           m0, m0
    psubw                m0, m4
    paddw                m0, m1
    pshuflw              m1, m0, q1032                  ; psrlq  m1, m0, 32
    paddw                m0, m1
    pmaddwd              m0, m3
    psrlw                m0, m5                            ; / (w + h) when power of two
    cmp                  hd, 8
    je .w8_end
    mov                 r6d, 0x5556                        ; /3 factor (h == 4 or 16)
    mov                 r2d, 0x3334                        ; /5 factor (h == 32)
    cmp                  hd, 32
    cmovz               r6d, r2d
    movd                 m1, r6d
    pmulhuw              m0, m1
.w8_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0
.s8:
    movd                 m1, alpham
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
    lea                  r6, [strideq*3]
    pabsw                m2, m1
    psllw                m2, 9
.s8_loop:
    mova                 m4, [acq]
    mova                 m5, [acq+16]
    IPRED_CFL             4
    IPRED_CFL             5
    packuswb             m4, m5
    movq   [dstq          ], m4
    movhps [dstq+strideq  ], m4
    mova                 m4, [acq+32]
    mova                 m5, [acq+48]
    IPRED_CFL             4
    IPRED_CFL             5
    packuswb             m4, m5
    movq   [dstq+strideq*2], m4
    movhps [dstq+r6       ], m4
    lea                dstq, [dstq+strideq*4]
    add                 acq, 64
    sub                  hd, 4
    jg .s8_loop
    RET
ALIGN function_align
.h16:
    mova                 m0, [tlq-16]
    pmaddubsw            m0, m3
    jmp                  wq
.w16:
    movu                 m1, [tlq+1]
    pmaddubsw            m1, m3
    paddw                m0, m1
    psubw                m4, m0
    punpckhqdq           m0, m0
    psubw                m0, m4
    pshuflw              m1, m0, q1032                  ; psrlq  m1, m0, 32
    paddw                m0, m1
    pmaddwd              m0, m3
    psrlw                m0, m5
    cmp                  hd, 16
    je .w16_end
    mov                 r6d, 0x5556                        ; /3 factor (h == 8 or 32)
    mov                 r2d, 0x3334                        ; /5 factor (h == 4 or 64)
    test                 hd, 8|32
    cmovz               r6d, r2d
    movd                 m1, r6d
    pmulhuw              m0, m1
.w16_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0
.s16:
    movd                 m1, alpham
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
    pabsw                m2, m1
    psllw                m2, 9
.s16_loop:
    mova                 m4, [acq]
    mova                 m5, [acq+16]
    IPRED_CFL             4
    IPRED_CFL             5
    packuswb             m4, m5
    mova             [dstq], m4
    mova                 m4, [acq+32]
    mova                 m5, [acq+48]
    IPRED_CFL             4
    IPRED_CFL             5
    packuswb             m4, m5
    mova     [dstq+strideq], m4
    lea                dstq, [dstq+strideq*2]
    add                 acq, 64
    sub                  hd, 2
    jg .s16_loop
    RET
ALIGN function_align
.h32:
    mova                 m0, [tlq-32]
    pmaddubsw            m0, m3
    mova                 m2, [tlq-16]
    pmaddubsw            m2, m3
    paddw                m0, m2
    jmp                  wq
.w32:
    movu                 m1, [tlq+1]
    pmaddubsw            m1, m3
    movu                 m2, [tlq+17]
    pmaddubsw            m2, m3
    paddw                m1, m2
    paddw                m0, m1
    psubw                m4, m0
    punpckhqdq           m0, m0
    psubw                m0, m4
    pshuflw              m1, m0, q1032                   ; psrlq  m1, m0, 32
    paddw                m0, m1
    pmaddwd              m0, m3
    psrlw                m0, m5
    cmp                  hd, 32
    je .w32_end
    lea                 r2d, [hq*2]                       ; NOTE(review): dead store — r2d is overwritten two lines below; likely a leftover
    mov                 r6d, 0x5556                       ; /3 factor (h == 16 or 64)
    mov                 r2d, 0x3334                       ; /5 factor (h == 8)
    test                 hd, 64|16
    cmovz               r6d, r2d
    movd                 m1, r6d
    pmulhuw              m0, m1
.w32_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0
.s32:
    movd                 m1, alpham
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
    pabsw                m2, m1
    psllw                m2, 9
.s32_loop:
    mova                 m4, [acq]
    mova                 m5, [acq+16]
    IPRED_CFL             4
    IPRED_CFL             5
    packuswb             m4, m5
    mova             [dstq], m4
    mova                 m4, [acq+32]
    mova                 m5, [acq+48]
    IPRED_CFL             4
    IPRED_CFL             5
    packuswb             m4, m5
    mova          [dstq+16], m4
    add                dstq, strideq
    add                 acq, 64
    dec                  hd
    jg .s32_loop
    RET
1533
1534;---------------------------------------------------------------------------------------
1535;void dav1d_ipred_cfl_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
1536;                           const int width, const int height, const int16_t *ac, const int alpha);
1537;---------------------------------------------------------------------------------------
; cfl with dc taken from the left edge only.  m2 = all-ones so pmaddubsw
; gives negated pair sums and the final pmaddwd by -1 restores the sign
; while summing.  m3 = 0x8000 >> ctz(h): pmulhrsw by 2^(15-k) computes
; (sum + (1 << (k-1))) >> k, i.e. a rounded division by h.  Falls through
; into the shared splat code (.s* in ipred_cfl) via the second jump table.
cglobal ipred_cfl_left, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
    mov                  hd, hm                                 ; zero upper half
    tzcnt               r6d, hd
    sub                 tlq, hq                                 ; tlq = &topleft[-h]
    tzcnt                wd, wm
    movu                 m0, [tlq]
    mov                 t0d, 0x8000
    movd                 m3, t0d
    movd                 m2, r6d
    psrld                m3, m2                                 ; m3 = 0x8000 >> ctz(h)
    LEA                  t0, ipred_cfl_left_ssse3_table
    movsxd               r6, [t0+r6*4]
    pcmpeqd              m2, m2                                 ; all-ones: -1 multiplier
    pmaddubsw            m0, m2                                 ; -(pair sums of left pixels)
    add                  r6, t0
    add                  t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table
    movsxd               wq, [t0+wq*4]
    add                  wq, t0
    movifnidn           acq, acmp
    jmp                  r6
.h32:
    movu                 m1, [tlq+16]                           ; unaligned when jumping here from dc_top
    pmaddubsw            m1, m2
    paddw                m0, m1
.h16:
    pshufd               m1, m0, q3232                          ; psrlq               m1, m0, 16
    paddw                m0, m1
.h8:
    pshuflw              m1, m0, q1032                          ; psrlq               m1, m0, 32
    paddw                m0, m1
.h4:
    pmaddwd              m0, m2                                 ; negate back, final pair sum
    pmulhrsw             m0, m3                                 ; rounded division by h
    pshuflw              m0, m0, q0000                          ; broadcast dc
    punpcklqdq           m0, m0
    jmp                  wq                                     ; into ipred_cfl splat code
1574
1575;---------------------------------------------------------------------------------------
1576;void dav1d_ipred_cfl_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
1577;                           const int width, const int height, const int16_t *ac, const int alpha);
1578;---------------------------------------------------------------------------------------
;-----------------------------------------------------------------------
; dc_top predictor for CfL: dc = rounded average of the w top-edge
; pixels. Reuses ipred_cfl_left's summing handlers, indexed by width
; instead of height.
;-----------------------------------------------------------------------
cglobal ipred_cfl_top, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
    LEA                  t0, ipred_cfl_left_ssse3_table
    tzcnt                wd, wm                                 ; wd = log2(w)
    inc                 tlq                                     ; tlq -> first top-edge pixel
    movu                 m0, [tlq]
    movifnidn            hd, hm
    mov                 r6d, 0x8000
    movd                 m3, r6d
    movd                 m2, wd
    psrld                m3, m2                                 ; m3 = 0x8000 >> log2(w): pmulhrsw factor for rounded /w
    movsxd               r6, [t0+wq*4]                          ; width-indexed summing entry in the *left* table
    pcmpeqd              m2, m2                                 ; m2 = all ones
    pmaddubsw            m0, m2                                 ; -(p0+p1) per pixel pair
    add                  r6, t0
    add                  t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table
    movsxd               wq, [t0+wq*4]                          ; width-indexed splat entry
    add                  wq, t0
    movifnidn           acq, acmp
    jmp                  r6
1598
1599;---------------------------------------------------------------------------------------
1600;void dav1d_ipred_cfl_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
1601;                           const int width, const int height, const int16_t *ac, const int alpha);
1602;---------------------------------------------------------------------------------------
;-----------------------------------------------------------------------
; dc_128 predictor for CfL: dc is the fixed mid-gray value taken from
; pw_128, handed straight to the width-specific splat code.
;-----------------------------------------------------------------------
cglobal ipred_cfl_128, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
    tzcnt                wd, wm
    movifnidn            hd, hm
    LEA                  r6, ipred_cfl_splat_ssse3_table
    movsxd               wq, [r6+wq*4]
    movddup              m0, [r6-ipred_cfl_splat_ssse3_table+pw_128]  ; dc = 128 in every word
    add                  wq, r6
    movifnidn           acq, acmp
    jmp                  wq
1612
; Restore acq from the value saved at function entry. On x86-64 ac_bakq
; is a register; on x86-32 it is a memory operand (the acmp argument
; slot, or a local stack slot) that still holds the original pointer.
; NOTE(review): the macro argument (%1) is unused; all call sites pass
; acq.
%macro RELOAD_ACQ_32 1
    mov                 acq, ac_bakq       ; restore acq
%endmacro
1616
;-----------------------------------------------------------------------
; cfl_ac for 4:2:0: box-downsample luma 2x2 into ac words (each output
; is the 2x2 pixel sum doubled, via the pb_2 pmaddubsw weights),
; replicate the last column into the wpad area and the last row into
; the hpad area, then subtract the rounded average so ac is zero-mean.
; w/h/wpad/hpad are in chroma units; hpad is passed in units of 4 rows.
;-----------------------------------------------------------------------
%if ARCH_X86_64
cglobal ipred_cfl_ac_420, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak
DECLARE_REG_TMP 7
    movddup              m2, [pb_2]
%else
cglobal ipred_cfl_ac_420, 4, 7, 7, ac, y, stride, wpad, hpad, w, h
DECLARE_REG_TMP 4
%define ac_bakq acmp
    mov                 t0d, 0x02020202
    movd                 m2, t0d
    pshufd               m2, m2, q0000                          ; m2 = pb_2 equivalent, built without rodata
%endif
    movifnidn            wd, wm
    mov                 t0d, hm
    mov                  hd, t0d
    imul                t0d, wd                                 ; t0d = w*h = total ac entry count (incl. padding)
    movd                 m5, t0d
    movifnidn         hpadd, hpadm
%if ARCH_X86_64
    mov             ac_bakq, acq                                ; keep a copy for the final mean subtraction
%endif
    shl               hpadd, 2                                  ; hpad: units of 4 rows -> rows
    sub                  hd, hpadd                              ; hd = rows actually read from luma
    pxor                 m4, m4                                 ; m4 = running word sum for the average
    cmp                  wd, 8
    jg .w16
    je .w8
    ; fall-through
%if ARCH_X86_64
    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak
%else
    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
%endif
.w4:
    lea            stride3q, [strideq*3]
.w4_loop:
    ; 2 output rows (4 words each) per register; 4 luma rows per iteration
    movq                 m0, [yq]
    movq                 m1, [yq+strideq]
    movhps               m0, [yq+strideq*2]
    movhps               m1, [yq+stride3q]
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    paddw                m0, m1
    mova              [acq], m0
    paddw                m4, m0
    lea                  yq, [yq+strideq*4]
    add                 acq, 16
    sub                  hd, 2
    jg .w4_loop
    test              hpadd, hpadd
    jz .calc_avg_4_8
    punpckhqdq           m0, m0                                 ; replicate last output row into both halves
.w4_hpad_loop:
    mova              [acq], m0
    paddw                m4, m0
    add                 acq, 16
    sub               hpadd, 2
    jg .w4_hpad_loop
    jmp .calc_avg_4_8
.w8:
    lea            stride3q, [strideq*3]
    test              wpadd, wpadd
    jnz .w8_wpad
.w8_loop:
    ; 1 output row (8 words) per 2 luma rows; 2 output rows per iteration
    mova                 m0, [yq]
    mova                 m1, [yq+strideq]
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    paddw                m0, m1
    mova              [acq], m0
    paddw                m4, m0
    mova                 m0, [yq+strideq*2]
    mova                 m1, [yq+stride3q]
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    paddw                m0, m1
    mova           [acq+16], m0
    paddw                m4, m0
    lea                  yq, [yq+strideq*4]
    add                 acq, 32
    sub                  hd, 2
    jg .w8_loop
    test              hpadd, hpadd
    jz .calc_avg_4_8
    jmp .w8_hpad
.w8_wpad:                                              ; wpadd=1
    movddup              m0, [yq]                               ; only the left 4 chroma columns are valid
    movddup              m1, [yq+strideq]
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    paddw                m0, m1
    pshufhw              m0, m0, q3333                          ; replicate last valid word into the pad columns
    mova              [acq], m0
    paddw                m4, m0
    lea                  yq, [yq+strideq*2]
    add                 acq, 16
    sub                  hd, 1
    jg .w8_wpad
    test              hpadd, hpadd
    jz .calc_avg_4_8
.w8_hpad:
    mova              [acq], m0                                 ; m0 = last output row
    paddw                m4, m0
    add                 acq, 16
    sub               hpadd, 1
    jg .w8_hpad
    jmp .calc_avg_4_8
.w16:
    test              wpadd, wpadd
    jnz .w16_wpad
.w16_loop:
    ; 1 output row = 16 words (two stores) per 2 luma rows
    mova                 m0, [yq]
    mova                 m1, [yq+strideq]
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    paddw                m0, m1
    mova              [acq], m0
    paddw                m4, m0
    mova                 m6, [yq+16]
    mova                 m1, [yq+strideq+16]
    pmaddubsw            m6, m2
    pmaddubsw            m1, m2
    paddw                m6, m1
    mova           [acq+16], m6
    paddw                m4, m6
    lea                  yq, [yq+strideq*2]
    add                 acq, 32
    dec                  hd
    jg .w16_loop
    test              hpadd, hpadd
    jz .calc_avg16
    jmp .w16_hpad_loop
.w16_wpad:
    ; wpad is in units of 4 chroma px: 1 -> 12 valid, 2 -> 8, 3 -> 4
    cmp               wpadd, 2
    jl .w16_pad1
    je .w16_pad2
.w16_pad3:
    movddup              m0, [yq]                               ; 4 valid chroma columns (8 luma bytes)
    movddup              m1, [yq+strideq]
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    paddw                m0, m1
    pshufhw              m0, m0, q3333                          ; words 4-7 = last valid word
    mova              [acq], m0
    paddw                m4, m0
    punpckhqdq           m6, m0, m0                             ; right half is entirely padding
    mova           [acq+16], m6
    paddw                m4, m6
    lea                  yq, [yq+strideq*2]
    add                 acq, 32
    dec                  hd
    jg .w16_pad3
    jmp .w16_wpad_done
.w16_pad2:
    mova                 m0, [yq]                               ; 8 valid chroma columns
    mova                 m1, [yq+strideq]
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    paddw                m0, m1
    mova              [acq], m0
    paddw                m4, m0
    pshufhw              m6, m0, q3333
    punpckhqdq           m6, m6                                 ; all lanes = last valid word
    mova           [acq+16], m6
    paddw                m4, m6
    lea                  yq, [yq+strideq*2]
    add                 acq, 32
    dec                  hd
    jg .w16_pad2
    jmp .w16_wpad_done
.w16_pad1:
    mova                 m0, [yq]                               ; 12 valid chroma columns
    mova                 m1, [yq+strideq]
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    paddw                m0, m1
    mova              [acq], m0
    paddw                m4, m0
    movddup              m6, [yq+16]
    movddup              m1, [yq+strideq+16]
    pmaddubsw            m6, m2
    pmaddubsw            m1, m2
    paddw                m6, m1
    pshufhw              m6, m6, q3333
    mova           [acq+16], m6
    paddw                m4, m6
    lea                  yq, [yq+strideq*2]
    add                 acq, 32
    dec                  hd
    jg .w16_pad1
.w16_wpad_done:
    test              hpadd, hpadd
    jz .calc_avg16
.w16_hpad_loop:
    mova              [acq], m0                                 ; m0/m6 = halves of the last output row
    paddw                m4, m0
    mova           [acq+16], m6
    paddw                m4, m6
    add                 acq, 32
    dec               hpadd
    jg .w16_hpad_loop
    jmp .calc_avg16

%if ARCH_X86_64
    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak
%else
    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h
%endif
.calc_avg_4_8:
    psrlw                m2, 9                                  ; 0x0202 >> 9 = 1 per word
    pmaddwd              m4, m2                                 ; widen word sums to dword sums
    jmp .calc_avg
.calc_avg16:
    psrld                m0, m4, 16
    pslld                m4, 16
    psrld                m4, 16
    paddd                m4, m0                                 ; dword sums = lo16 + hi16 of each lane
.calc_avg:
    ; average = (sum + sz/2) >> log2(sz), then subtract it everywhere
    movd                szd, m5
    psrad                m5, 1                                  ; sz/2 in lane 0: rounding bias
    tzcnt               r1d, szd                                ; log2(w*h)
    paddd                m4, m5
    movd                 m1, r1d
    pshufd               m0, m4, q2301
    paddd                m0, m4
    pshufd               m4, m0, q1032
    paddd                m0, m4                                 ; horizontal dword reduction
    psrad                m0, m1                        ; sum >>= log2sz;
    packssdw             m0, m0
    RELOAD_ACQ_32       acq
.sub_loop:
    mova                 m1, [acq]
    psubw                m1, m0                        ; ac[x] -= sum;
    mova              [acq], m1
    add                 acq, 16
    sub                 szd, 8                                  ; 8 words per 16-byte step
    jg .sub_loop
    RET
1856
;-----------------------------------------------------------------------
; cfl_ac for 4:2:2: downsample luma 2x1 (horizontal only) into ac words
; (each output = horizontal pixel pair sum * 4, via the pb_4 pmaddubsw
; weights), replicate the last column/row into the wpad/hpad area, then
; subtract the rounded average. Two word accumulators (m4/m5) are kept
; to halve the risk of overflow; they are merged in .calc_avg.
; w/h/wpad/hpad are in chroma units; hpad is passed in units of 4 rows.
;-----------------------------------------------------------------------
%if ARCH_X86_64
cglobal ipred_cfl_ac_422, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak
    movddup              m2, [pb_4]
%else
cglobal ipred_cfl_ac_422, 4, 7, 7, ac, y, stride, wpad, hpad, w, h
    mov                 t0d, 0x04040404
    movd                 m2, t0d
    pshufd               m2, m2, q0000                          ; m2 = pb_4 equivalent, built without rodata
%endif
    movifnidn            wd, wm
    mov                 t0d, hm
    mov                  hd, t0d
    imul                t0d, wd                                 ; t0d = w*h = total ac entry count (incl. padding)
    movd                 m6, t0d
    movifnidn         hpadd, hpadm
%if ARCH_X86_64
    mov             ac_bakq, acq                                ; keep a copy for the final mean subtraction
%endif
    shl               hpadd, 2                                  ; hpad: units of 4 rows -> rows
    sub                  hd, hpadd                              ; hd = rows actually read from luma
    pxor                 m4, m4                                 ; first running sum
    pxor                 m5, m5                                 ; second running sum
    cmp                  wd, 8
    jg .w16
    je .w8
    ; fall-through

%if ARCH_X86_64
    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak
%else
    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
%endif
.w4:
    lea            stride3q, [strideq*3]
.w4_loop:
    ; 2 output rows (4 words each) per register; 4 rows per iteration
    movq                 m1, [yq]
    movhps               m1, [yq+strideq]
    movq                 m0, [yq+strideq*2]
    movhps               m0, [yq+stride3q]
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    mova              [acq], m1
    mova           [acq+16], m0
    paddw                m4, m0
    paddw                m5, m1
    lea                  yq, [yq+strideq*4]
    add                 acq, 32
    sub                  hd, 4
    jg .w4_loop
    test              hpadd, hpadd
    jz .calc_avg_4
    punpckhqdq           m0, m0                                 ; replicate the last output row into both halves
.w4_hpad_loop:
    mova              [acq], m0
    paddw                m4, m0
    add                 acq, 16
    sub               hpadd, 2
    jg .w4_hpad_loop
    jmp .calc_avg_4
.w8:
    lea            stride3q, [strideq*3]
    test              wpadd, wpadd
    jnz .w8_wpad
.w8_loop:
    ; 1 output row (8 words) per luma row; 4 rows per iteration
    mova                 m1, [yq]
    mova                 m0, [yq+strideq]
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    mova              [acq], m1
    mova           [acq+16], m0
    paddw                m4, m0
    paddw                m5, m1
    mova                 m1, [yq+strideq*2]
    mova                 m0, [yq+stride3q]
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    mova           [acq+32], m1
    mova           [acq+48], m0
    paddw                m4, m0
    paddw                m5, m1
    lea                  yq, [yq+strideq*4]
    add                 acq, 64
    sub                  hd, 4
    jg .w8_loop
    test              hpadd, hpadd
    jz .calc_avg_8_16
    jmp .w8_hpad
.w8_wpad:
    movddup              m1, [yq]                               ; only the left 4 chroma columns are valid
    pmaddubsw            m1, m2
    pshufhw              m1, m1, q3333                          ; replicate last valid word into the pad columns
    mova              [acq], m1
    paddw                m5, m1
    movddup              m0, [yq+strideq]
    pmaddubsw            m0, m2
    pshufhw              m0, m0, q3333
    mova           [acq+16], m0
    paddw                m4, m0
    lea                  yq, [yq+strideq*2]
    add                 acq, 32
    sub                  hd, 2
    jg .w8_wpad
    test              hpadd, hpadd
    jz .calc_avg_8_16
.w8_hpad:
    mova              [acq], m0                                 ; m0 = last output row
    paddw                m4, m0
    mova           [acq+16], m0
    paddw                m4, m0
    add                 acq, 32
    sub               hpadd, 2
    jg .w8_hpad
    jmp .calc_avg_8_16
.w16:
    test              wpadd, wpadd
    jnz .w16_wpad
.w16_loop:
    ; 1 output row = 16 words (two stores) per luma row; 2 rows/iter
    mova                 m1, [yq]
    mova                 m0, [yq+16]
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    mova              [acq], m1
    mova           [acq+16], m0
    paddw                m5, m0
    paddw                m5, m1
    mova                 m1, [yq+strideq]
    mova                 m0, [yq+strideq+16]
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    mova           [acq+32], m1
    mova           [acq+48], m0
    paddw                m4, m0
    paddw                m4, m1
    lea                  yq, [yq+strideq*2]
    add                 acq, 64
    sub                  hd, 2
    jg .w16_loop
    test              hpadd, hpadd
    jz .calc_avg_8_16
    jmp .w16_hpad_loop
.w16_wpad:
    ; wpad is in units of 4 chroma px: 1 -> 12 valid, 2 -> 8, 3 -> 4
    cmp               wpadd, 2
    jl .w16_pad1
    je .w16_pad2
.w16_pad3:
    movddup              m1, [yq]                               ; 4 valid chroma columns
    pmaddubsw            m1, m2
    pshufhw              m1, m1, q3333                          ; words 4-7 = last valid word
    mova              [acq], m1
    paddw                m5, m1
    punpckhqdq           m1, m1                                 ; right half is entirely padding
    mova           [acq+16], m1
    paddw                m5, m1
    movddup              m1, [yq+strideq]
    pmaddubsw            m1, m2
    pshufhw              m1, m1, q3333
    mova           [acq+32], m1
    paddw                m4, m1
    punpckhqdq           m0, m1, m1
    mova           [acq+48], m0
    paddw                m4, m0
    lea                  yq, [yq+strideq*2]
    add                 acq, 64
    sub                  hd, 2
    jg .w16_pad3
    jmp .w16_wpad_done
.w16_pad2:
    mova                 m1, [yq]                               ; 8 valid chroma columns
    pmaddubsw            m1, m2
    mova              [acq], m1
    paddw                m5, m1
    pshufhw              m1, m1, q3333
    punpckhqdq           m1, m1                                 ; all lanes = last valid word
    mova           [acq+16], m1
    paddw                m5, m1
    mova                 m1, [yq+strideq]
    pmaddubsw            m1, m2
    mova           [acq+32], m1
    paddw                m4, m1
    mova                 m0, m1
    pshufhw              m0, m0, q3333
    punpckhqdq           m0, m0
    mova           [acq+48], m0
    paddw                m4, m0
    lea                  yq, [yq+strideq*2]
    add                 acq, 64
    sub                  hd, 2
    jg .w16_pad2
    jmp .w16_wpad_done
.w16_pad1:
    mova                 m1, [yq]                               ; 12 valid chroma columns
    pmaddubsw            m1, m2
    mova              [acq], m1
    paddw                m5, m1
    movddup              m0, [yq+16]
    pmaddubsw            m0, m2
    pshufhw              m0, m0, q3333
    mova           [acq+16], m0
    paddw                m5, m0
    mova                 m1, [yq+strideq]
    pmaddubsw            m1, m2
    mova           [acq+32], m1
    paddw                m4, m1
    movddup              m0, [yq+strideq+16]
    pmaddubsw            m0, m2
    pshufhw              m0, m0, q3333
    mova           [acq+48], m0
    paddw                m4, m0
    lea                  yq, [yq+strideq*2]
    add                 acq, 64
    sub                  hd, 2
    jg .w16_pad1
.w16_wpad_done:
    test              hpadd, hpadd
    jz .calc_avg_8_16
.w16_hpad_loop:
    mova              [acq], m1                                 ; m1/m0 = halves of the last output row
    mova           [acq+16], m0
    paddw                m4, m1
    paddw                m5, m0
    mova           [acq+32], m1
    mova           [acq+48], m0
    paddw                m4, m1
    paddw                m5, m0
    add                 acq, 64
    sub               hpadd, 2
    jg .w16_hpad_loop
    jmp .calc_avg_8_16

%if ARCH_X86_64
    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak
%else
    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h
%endif
.calc_avg_4:
    psrlw                m2, 10                                 ; 0x0404 >> 10 = 1 per word
    pmaddwd              m5, m2                                 ; widen both accumulators to dword sums
    pmaddwd              m0, m4, m2
    jmp .calc_avg
.calc_avg_8_16:
    ; widen word sums to dword sums: lo16 + hi16 of each lane
    mova                 m0, m5
    psrld                m5, 16
    pslld                m0, 16
    psrld                m0, 16
    paddd                m5, m0
    mova                 m0, m4
    psrld                m0, 16
    pslld                m4, 16
    psrld                m4, 16
    paddd                m0, m4
.calc_avg:
    ; average = (sum + sz/2) >> log2(sz), then subtract it everywhere
    paddd                m5, m0                                 ; merge the two accumulators
    movd                szd, m6
    psrad                m6, 1                                  ; sz/2 in lane 0: rounding bias
    tzcnt               r1d, szd                       ; const int log2sz = ctz(width) + ctz(height);
    paddd                m5, m6
    movd                 m1, r1d
    pshufd               m0, m5, q2301
    paddd                m0, m5
    pshufd               m5, m0, q1032
    paddd                m0, m5                                 ; horizontal dword reduction
    psrad                m0, m1                        ; sum >>= log2sz;
    packssdw             m0, m0
    RELOAD_ACQ_32       acq                            ; ac = ac_orig
.sub_loop:
    mova                 m1, [acq]
    psubw                m1, m0                                 ; ac[x] -= average
    mova              [acq], m1
    add                 acq, 16
    sub                 szd, 8                                  ; 8 words per 16-byte step
    jg .sub_loop
    RET
2129
2130%if ARCH_X86_64
2131cglobal ipred_cfl_ac_444, 4, 8, 7, -4*16, ac, y, stride, wpad, hpad, w, h, ac_bak
2132    movddup              m2, [pb_4]
2133%else
2134cglobal ipred_cfl_ac_444, 4, 7, 7, -5*16, ac, y, stride, wpad, hpad, w, h
2135%define ac_bakq [rsp+16*4]
2136    mov                 t0d, 0x04040404
2137    movd                 m2, t0d
2138    pshufd               m2, m2, q0000
2139%endif
2140    movifnidn            wd, wm
2141    movifnidn         hpadd, hpadm
2142    movd                 m0, hpadd
2143    mov                 t0d, hm
2144    mov                  hd, t0d
2145    imul                t0d, wd
2146    movd                 m6, t0d
2147    movd              hpadd, m0
2148    mov             ac_bakq, acq
2149    shl               hpadd, 2
2150    sub                  hd, hpadd
2151    pxor                 m5, m5
2152    pxor                 m4, m4
2153    cmp                  wd, 16
2154    jg .w32
2155    cmp                  wd, 8
2156    jg .w16
2157    je .w8
2158    ; fall-through
2159
2160%if ARCH_X86_64
2161    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak
2162%else
2163    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
2164%endif
2165.w4:
2166    lea            stride3q, [strideq*3]
2167.w4_loop:
2168    movd                 m1, [yq]
2169    movd                 m3, [yq+strideq]
2170    punpckldq            m1, m3
2171    punpcklbw            m1, m1
2172    movd                 m0, [yq+strideq*2]
2173    movd                 m3, [yq+stride3q]
2174    punpckldq            m0, m3
2175    punpcklbw            m0, m0
2176    pmaddubsw            m1, m2
2177    pmaddubsw            m0, m2
2178    mova              [acq], m1
2179    mova           [acq+16], m0
2180    paddw                m5, m0
2181    paddw                m5, m1
2182    lea                  yq, [yq+strideq*4]
2183    add                 acq, 32
2184    sub                  hd, 4
2185    jg .w4_loop
2186    test              hpadd, hpadd
2187    jz .calc_avg_4
2188    punpckhqdq           m0, m0
2189.w4_hpad_loop:
2190    mova              [acq], m0
2191    paddw                m5, m0
2192    add                 acq, 16
2193    sub               hpadd, 2
2194    jg .w4_hpad_loop
2195.calc_avg_4:
2196    psrlw                m2, 10
2197    pmaddwd              m5, m2
2198    jmp .calc_avg
2199
2200.w8:
2201    lea            stride3q, [strideq*3]
2202    test              wpadd, wpadd
2203    jnz .w8_wpad
2204.w8_loop:
2205    movq                 m1, [yq]
2206    punpcklbw            m1, m1
2207    pmaddubsw            m1, m2
2208    mova              [acq], m1
2209    paddw                m5, m1
2210    movq                 m0, [yq+strideq]
2211    punpcklbw            m0, m0
2212    pmaddubsw            m0, m2
2213    mova           [acq+16], m0
2214    paddw                m5, m0
2215    movq                 m1, [yq+strideq*2]
2216    punpcklbw            m1, m1
2217    pmaddubsw            m1, m2
2218    mova           [acq+32], m1
2219    paddw                m4, m1
2220    movq                 m0, [yq+stride3q]
2221    punpcklbw            m0, m0
2222    pmaddubsw            m0, m2
2223    mova           [acq+48], m0
2224    paddw                m4, m0
2225    lea                  yq, [yq+strideq*4]
2226    add                 acq, 64
2227    sub                  hd, 4
2228    jg .w8_loop
2229    test              hpadd, hpadd
2230    jz .calc_avg_8_16
2231    jmp .w8_hpad
2232.w8_wpad:
2233    movd                 m1, [yq]
2234    punpcklbw            m1, m1
2235    punpcklqdq           m1, m1
2236    pmaddubsw            m1, m2
2237    pshufhw              m1, m1, q3333
2238    mova              [acq], m1
2239    paddw                m5, m1
2240    movd                 m0, [yq+strideq]
2241    punpcklbw            m0, m0
2242    punpcklqdq           m0, m0
2243    pmaddubsw            m0, m2
2244    pshufhw              m0, m0, q3333
2245    mova           [acq+16], m0
2246    paddw                m4, m0
2247    lea                  yq, [yq+strideq*2]
2248    add                 acq, 32
2249    sub                  hd, 2
2250    jg .w8_wpad
2251    test              hpadd, hpadd
2252    jz .calc_avg_8_16
2253.w8_hpad:
2254    mova              [acq], m0
2255    paddw                m5, m0
2256    mova           [acq+16], m0
2257    paddw                m4, m0
2258    add                 acq, 32
2259    sub               hpadd, 2
2260    jg .w8_hpad
2261    jmp .calc_avg_8_16
2262
2263.w16:
2264    test              wpadd, wpadd
2265    jnz .w16_wpad
2266.w16_loop:
2267    mova                 m0, [yq]
2268    mova                 m1, m0
2269    punpcklbw            m1, m1
2270    pmaddubsw            m1, m2
2271    mova              [acq], m1
2272    paddw                m5, m1
2273    punpckhbw            m0, m0
2274    pmaddubsw            m0, m2
2275    mova           [acq+16], m0
2276    paddw                m5, m0
2277    mova                 m0, [yq+strideq]
2278    mova                 m1, m0
2279    punpcklbw            m1, m1
2280    pmaddubsw            m1, m2
2281    mova           [acq+32], m1
2282    paddw                m4, m1
2283    punpckhbw            m0, m0
2284    pmaddubsw            m0, m2
2285    mova           [acq+48], m0
2286    paddw                m4, m0
2287    lea                  yq, [yq+strideq*2]
2288    add                 acq, 64
2289    sub                  hd, 2
2290    jg .w16_loop
2291    test              hpadd, hpadd
2292    jz .calc_avg_8_16
2293    jmp .w16_hpad_loop
2294.w16_wpad:
2295    cmp               wpadd, 2
2296    jl .w16_pad1
2297    je .w16_pad2
2298.w16_pad3:
2299    movd                 m1, [yq]
2300    punpcklbw            m1, m1
2301    punpcklqdq           m1, m1
2302    pshufhw              m1, m1, q3333
2303    pmaddubsw            m1, m2
2304    mova              [acq], m1
2305    paddw                m5, m1
2306    punpckhqdq           m1, m1
2307    mova           [acq+16], m1
2308    paddw                m5, m1
2309    movd                 m1, [yq+strideq]
2310    punpcklbw            m1, m1
2311    punpcklqdq           m1, m1
2312    pshufhw              m1, m1, q3333
2313    pmaddubsw            m1, m2
2314    mova           [acq+32], m1
2315    paddw                m4, m1
2316    punpckhqdq           m0, m1, m1
2317    mova           [acq+48], m0
2318    paddw                m4, m0
2319    lea                  yq, [yq+strideq*2]
2320    add                 acq, 64
2321    sub                  hd, 2
2322    jg .w16_pad3
2323    jmp .w16_wpad_done
2324.w16_pad2:
2325    movq                 m1, [yq]
2326    punpcklbw            m1, m1
2327    pmaddubsw            m1, m2
2328    mova              [acq], m1
2329    paddw                m5, m1
2330    pshufhw              m1, m1, q3333
2331    punpckhqdq           m1, m1
2332    mova           [acq+16], m1
2333    paddw                m5, m1
2334    movq                 m1, [yq+strideq]
2335    punpcklbw            m1, m1
2336    pmaddubsw            m1, m2
2337    mova           [acq+32], m1
2338    paddw                m4, m1
2339    mova                 m0, m1
2340    pshufhw              m0, m0, q3333
2341    punpckhqdq           m0, m0
2342    mova           [acq+48], m0
2343    paddw                m4, m0
2344    lea                  yq, [yq+strideq*2]
2345    add                 acq, 64
2346    sub                  hd, 2
2347    jg .w16_pad2
2348    jmp .w16_wpad_done
2349.w16_pad1:
2350    mova                 m0, [yq]
2351    mova                 m1, m0
2352    punpcklbw            m1, m1
2353    pmaddubsw            m1, m2
2354    mova              [acq], m1
2355    paddw                m5, m1
2356    punpckhbw            m0, m0
2357    punpcklqdq           m0, m0
2358    pshufhw              m0, m0, q3333
2359    pmaddubsw            m0, m2
2360    mova           [acq+16], m0
2361    paddw                m5, m0
2362    mova                 m0, [yq+strideq]
2363    mova                 m1, m0
2364    punpcklbw            m1, m1
2365    pmaddubsw            m1, m2
2366    mova           [acq+32], m1
2367    paddw                m4, m1
2368    punpckhbw            m0, m0
2369    punpcklqdq           m0, m0
2370    pshufhw              m0, m0, q3333
2371    pmaddubsw            m0, m2
2372    mova           [acq+48], m0
2373    paddw                m4, m0
2374    lea                  yq, [yq+strideq*2]
2375    add                 acq, 64
2376    sub                  hd, 2
2377    jg .w16_pad1
2378.w16_wpad_done:
2379    test              hpadd, hpadd
2380    jz .calc_avg_8_16
2381.w16_hpad_loop:
2382    mova              [acq], m1
2383    mova           [acq+16], m0
2384    paddw                m4, m1
2385    paddw                m5, m0
2386    mova           [acq+32], m1
2387    mova           [acq+48], m0
2388    paddw                m4, m1
2389    paddw                m5, m0
2390    add                 acq, 64
2391    sub               hpadd, 2
2392    jg .w16_hpad_loop
2393.calc_avg_8_16:
2394    mova                 m0, m5
2395    psrld                m5, 16
2396    pslld                m0, 16
2397    psrld                m0, 16
2398    paddd                m5, m0
2399    mova                 m0, m4
2400    psrld                m0, 16
2401    pslld                m4, 16
2402    psrld                m4, 16
2403    paddd                m0, m4
2404    paddd                m5, m0
2405    jmp .calc_avg
2406
2407.w32:
2408    pxor                 m0, m0
2409    mova           [rsp   ], m0
2410    mova           [rsp+16], m0
2411    mova           [rsp+32], m0
2412    mova           [rsp+48], m0
2413    test              wpadd, wpadd
2414    jnz .w32_wpad
2415.w32_loop:
2416    mova                 m0, [yq]
2417    mova                 m1, m0
2418    punpcklbw            m1, m1
2419    pmaddubsw            m1, m2
2420    mova              [acq], m1
2421    paddw                m5, m1, [rsp]
2422    mova           [rsp   ], m5
2423    punpckhbw            m0, m0
2424    pmaddubsw            m0, m2
2425    mova           [acq+16], m0
2426    paddw                m5, m0, [rsp+16]
2427    mova           [rsp+16], m5
2428    mova                 m4, [yq+16]
2429    mova                 m3, m4
2430    punpcklbw            m3, m3
2431    pmaddubsw            m3, m2
2432    mova           [acq+32], m3
2433    paddw                m5, m3, [rsp+32]
2434    mova           [rsp+32], m5
2435    punpckhbw            m4, m4
2436    pmaddubsw            m4, m2
2437    mova           [acq+48], m4
2438    paddw                m5, m4, [rsp+48]
2439    mova           [rsp+48], m5
2440    lea                  yq, [yq+strideq]
2441    add                 acq, 64
2442    sub                  hd, 1
2443    jg .w32_loop
2444    test              hpadd, hpadd
2445    jz .calc_avg_32
2446    jmp .w32_hpad_loop
2447.w32_wpad:
2448    cmp               wpadd, 2
2449    jl .w32_pad1
2450    je .w32_pad2
2451    cmp               wpadd, 4
2452    jl .w32_pad3
2453    je .w32_pad4
2454    cmp               wpadd, 6
2455    jl .w32_pad5
2456    je .w32_pad6
2457.w32_pad7:
2458    movd                 m1, [yq]
2459    punpcklbw            m1, m1
2460    punpcklqdq           m1, m1
2461    pshufhw              m1, m1, q3333
2462    pmaddubsw            m1, m2
2463    mova              [acq], m1
2464    paddw                m5, m1, [rsp]
2465    mova           [rsp   ], m5
2466    mova                 m0, m1
2467    punpckhqdq           m0, m0
2468    mova           [acq+16], m0
2469    paddw                m5, m0, [rsp+16]
2470    mova           [rsp+16], m5
2471    mova                 m3, m0
2472    mova           [acq+32], m3
2473    paddw                m5, m3, [rsp+32]
2474    mova           [rsp+32], m5
2475    mova                 m4, m3
2476    mova           [acq+48], m4
2477    paddw                m5, m4, [rsp+48]
2478    mova           [rsp+48], m5
2479    lea                  yq, [yq+strideq]
2480    add                 acq, 64
2481    sub                  hd, 1
2482    jg .w32_pad7
2483    jmp .w32_wpad_done
2484.w32_pad6:
2485    mova                 m0, [yq]
2486    mova                 m1, m0
2487    punpcklbw            m1, m1
2488    pmaddubsw            m1, m2
2489    mova              [acq], m1
2490    paddw                m5, m1, [rsp]
2491    mova           [rsp   ], m5
2492    pshufhw              m0, m1, q3333
2493    punpckhqdq           m0, m0
2494    mova           [acq+16], m0
2495    paddw                m5, m0, [rsp+16]
2496    mova           [rsp+16], m5
2497    mova                 m3, m0
2498    mova           [acq+32], m3
2499    paddw                m5, m3, [rsp+32]
2500    mova           [rsp+32], m5
2501    mova                 m4, m3
2502    mova           [acq+48], m4
2503    paddw                m5, m4, [rsp+48]
2504    mova           [rsp+48], m5
2505    lea                  yq, [yq+strideq]
2506    add                 acq, 64
2507    sub                  hd, 1
2508    jg .w32_pad6
2509    jmp .w32_wpad_done
2510.w32_pad5:
2511    mova                 m0, [yq]
2512    mova                 m1, m0
2513    punpcklbw            m1, m1
2514    pmaddubsw            m1, m2
2515    mova              [acq], m1
2516    mova                 m5, [rsp]
2517    paddw                m5, m1
2518    mova           [rsp   ], m5
2519    punpckhbw            m0, m0
2520    punpcklqdq           m0, m0
2521    pshufhw              m0, m0, q3333
2522    pmaddubsw            m0, m2
2523    mova           [acq+16], m0
2524    paddw                m5, m0, [rsp+16]
2525    mova           [rsp+16], m5
2526    mova                 m3, m0
2527    punpckhqdq           m3, m3
2528    mova           [acq+32], m3
2529    paddw                m5, m3, [rsp+32]
2530    mova           [rsp+32], m5
2531    mova                 m4, m3
2532    mova           [acq+48], m4
2533    paddw                m5, m4, [rsp+48]
2534    mova           [rsp+48], m5
2535    lea                  yq, [yq+strideq]
2536    add                 acq, 64
2537    sub                  hd, 1
2538    jg .w32_pad5
2539    jmp .w32_wpad_done
2540.w32_pad4:
2541    mova                 m0, [yq]
2542    mova                 m1, m0
2543    punpcklbw            m1, m1
2544    pmaddubsw            m1, m2
2545    mova              [acq], m1
2546    paddw                m5, m1, [rsp]
2547    mova           [rsp   ], m5
2548    punpckhbw            m0, m0
2549    pmaddubsw            m0, m2
2550    mova           [acq+16], m0
2551    paddw                m5, m0, [rsp+16]
2552    mova           [rsp+16], m5
2553    mova                 m3, m0
2554    pshufhw              m3, m3, q3333
2555    punpckhqdq           m3, m3
2556    mova           [acq+32], m3
2557    paddw                m5, m3, [rsp+32]
2558    mova           [rsp+32], m5
2559    mova                 m4, m3
2560    mova           [acq+48], m4
2561    paddw                m5, m4, [rsp+48]
2562    mova           [rsp+48], m5
2563    lea                  yq, [yq+strideq]
2564    add                 acq, 64
2565    sub                  hd, 1
2566    jg .w32_pad4
2567    jmp .w32_wpad_done
2568.w32_pad3:
2569    mova                 m0, [yq]
2570    mova                 m1, m0
2571    punpcklbw            m1, m1
2572    pmaddubsw            m1, m2
2573    mova              [acq], m1
2574    paddw                m5, m1, [rsp]
2575    mova           [rsp   ], m5
2576    punpckhbw            m0, m0
2577    pmaddubsw            m0, m2
2578    mova           [acq+16], m0
2579    paddw                m5, m0, [rsp+16]
2580    mova           [rsp+16], m5
2581    movd                 m3, [yq+16]
2582    punpcklbw            m3, m3
2583    punpcklqdq           m3, m3
2584    pshufhw              m3, m3, q3333
2585    pmaddubsw            m3, m2
2586    mova           [acq+32], m3
2587    paddw                m5, m3, [rsp+32]
2588    mova           [rsp+32], m5
2589    mova                 m4, m3
2590    punpckhqdq           m4, m4
2591    mova           [acq+48], m4
2592    paddw                m5, m4, [rsp+48]
2593    mova           [rsp+48], m5
2594    lea                  yq, [yq+strideq]
2595    add                 acq, 64
2596    sub                  hd, 1
2597    jg .w32_pad3
2598    jmp .w32_wpad_done
2599.w32_pad2:
2600    mova                 m0, [yq]
2601    mova                 m1, m0
2602    punpcklbw            m1, m1
2603    pmaddubsw            m1, m2
2604    mova              [acq], m1
2605    paddw                m5, m1, [rsp]
2606    mova           [rsp   ], m5
2607    punpckhbw            m0, m0
2608    pmaddubsw            m0, m2
2609    mova           [acq+16], m0
2610    paddw                m5, m0, [rsp+16]
2611    mova           [rsp+16], m5
2612    mova                 m3, [yq+16]
2613    punpcklbw            m3, m3
2614    pmaddubsw            m3, m2
2615    mova           [acq+32], m3
2616    paddw                m5, m3, [rsp+32]
2617    mova           [rsp+32], m5
2618    pshufhw              m4, m3, q3333
2619    punpckhqdq           m4, m4
2620    mova           [acq+48], m4
2621    paddw                m5, m4, [rsp+48]
2622    mova           [rsp+48], m5
2623    lea                  yq, [yq+strideq]
2624    add                 acq, 64
2625    sub                  hd, 1
2626    jg .w32_pad2
2627    jmp .w32_wpad_done
2628.w32_pad1:
2629    mova                 m0, [yq]
2630    mova                 m1, m0
2631    punpcklbw            m1, m1
2632    pmaddubsw            m1, m2
2633    mova              [acq], m1
2634    paddw                m5, m1, [rsp]
2635    mova           [rsp   ], m5
2636    punpckhbw            m0, m0
2637    pmaddubsw            m0, m2
2638    mova           [acq+16], m0
2639    paddw                m5, m0, [rsp+16]
2640    mova           [rsp+16], m5
2641    mova                 m4, [yq+16]
2642    mova                 m3, m4
2643    punpcklbw            m3, m3
2644    pmaddubsw            m3, m2
2645    mova           [acq+32], m3
2646    paddw                m5, m3, [rsp+32]
2647    mova           [rsp+32], m5
2648    punpckhbw            m4, m4
2649    punpcklqdq           m4, m4
2650    pshufhw              m4, m4, q3333
2651    pmaddubsw            m4, m2
2652    mova           [acq+48], m4
2653    paddw                m5, m4, [rsp+48]
2654    mova           [rsp+48], m5
2655    lea                  yq, [yq+strideq]
2656    add                 acq, 64
2657    sub                  hd, 1
2658    jg .w32_pad1
2659.w32_wpad_done:
2660    test              hpadd, hpadd
2661    jz .calc_avg_32
2662.w32_hpad_loop:
2663    mova              [acq], m1
2664    mova           [acq+16], m0
2665    paddw                m5, m1, [rsp]
2666    mova           [rsp   ], m5
2667    paddw                m5, m0, [rsp+16]
2668    mova           [rsp+16], m5
2669    mova           [acq+32], m3
2670    mova           [acq+48], m4
2671    paddw                m5, m3, [rsp+32]
2672    mova           [rsp+32], m5
2673    paddw                m5, m4, [rsp+48]
2674    mova           [rsp+48], m5
2675    add                 acq, 64
2676    sub               hpadd, 1
2677    jg .w32_hpad_loop
2678
2679%if ARCH_X86_64
2680    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak
2681%else
2682    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h
2683%endif
2684
2685.calc_avg_32:
2686    mova                 m5, [rsp]
2687    mova                 m0, m5
2688    psrld                m5, 16
2689    pslld                m0, 16
2690    psrld                m0, 16
2691    paddd                m5, m0
2692    mova                 m0, [rsp+16]
2693    mova                 m3, m0
2694    psrld                m0, 16
2695    pslld                m3, 16
2696    psrld                m3, 16
2697    paddd                m0, m3
2698    paddd                m5, m0
2699    mova                 m0, [rsp+32]
2700    mova                 m3, m0
2701    psrld                m0, 16
2702    pslld                m3, 16
2703    psrld                m3, 16
2704    paddd                m0, m3
2705    mova                 m1, [rsp+48]
2706    mova                 m3, m1
2707    psrld                m1, 16
2708    pslld                m3, 16
2709    psrld                m3, 16
2710    paddd                m1, m3
2711    paddd                m1, m0
2712    paddd                m5, m1
2713.calc_avg:
2714    movd                szd, m6
2715    psrad                m6, 1
2716    tzcnt               r1d, szd                       ; const int log2sz = ctz(width) + ctz(height);
2717    paddd                m5, m6
2718    movd                 m1, r1d
2719    pshufd               m0, m5, q2301
2720    paddd                m0, m5
2721    pshufd               m5, m0, q1032
2722    paddd                m0, m5
2723    psrad                m0, m1                        ; sum >>= log2sz;
2724    packssdw             m0, m0
2725    RELOAD_ACQ_32       acq                            ; ac = ac_orig
2726.sub_loop:
2727    mova                 m1, [acq]
2728    psubw                m1, m0
2729    mova              [acq], m1
2730    add                 acq, 16
2731    sub                 szd, 8
2732    jg .sub_loop
2733    RET
2734
; Per-byte select between two values using a mask (0x00/0xff per lane):
; result = (mask & true) | (~mask & false).
; %1 simd register that hold the mask and will hold the result (mask is
;    destroyed — pandn overwrites it)
; %2 simd register that holds the "true" values (clobbered by pand)
; %3 location of the "false" values (simd register/memory)
%macro BLEND 3 ; mask, true, false
    pand  %2, %1          ; keep "true" bytes where mask is all-ones
    pandn %1, %3          ; keep "false" bytes where mask is zero
    por   %1, %2          ; merge both halves into %1
%endmacro
2743
; Compute the Paeth predictor for one 16-byte vector: per pixel, pick the
; neighbor (left, top or topleft) closest to base = left + top - topleft.
; In:  m3  = left pixels (broadcast per row by the caller)
;      m5  = topleft pixel (broadcast)
;      m%1 = top row pixels
;      %2  = ldiff = |topleft - top| (register number, or memory operand)
;      m4  = per-byte mask from ipred_paeth_shuf used to recover the pavgb
;            rounding bit (NOTE(review): assumed to select the low bit of
;            each byte — confirm against the table contents)
; Out: m1 = predicted pixels; clobbers m0, m2.
%macro PAETH 2                                 ; top, ldiff
    pavgb                m1, m%1, m3          ; (top + left + 1) >> 1
    pxor                 m0, m%1, m3
    pand                 m0, m4               ; rounding bit lost by pavgb
    psubusb              m2, m5, m1
    psubb                m1, m0               ; floor((top + left) / 2)
    psubusb              m1, m5
    por                  m1, m2               ; |(top + left)/2 - topleft|
    paddusb              m1, m1               ; x2, saturating
    por                  m1, m0               ; min(tldiff, 255)
    psubusb              m2, m5, m3
    psubusb              m0, m3, m5
    por                  m2, m0               ; tdiff = |topleft - left|
%ifnum %2
    pminub               m2, m%2
    pcmpeqb              m0, m%2, m2          ; ldiff <= tdiff
%else
    mova                 m0, %2
    pminub               m2, m0
    pcmpeqb              m0, m2
%endif
    pminub               m1, m2
    pcmpeqb              m1, m2               ; ldiff <= tldiff && tdiff <= tldiff
    mova                 m2, m3
    BLEND                m0, m2, m%1          ; left if ldiff <= tdiff else top
    BLEND                m1, m0, m5           ; topleft if it wins, else the above
%endmacro
2771
;-----------------------------------------------------------------------
; Paeth intra prediction (SSSE3, 8bpc).
; Args (visible here): dst, stride, tl (pointer to the top-left edge
; pixel; left pixels are read at decreasing addresses, top pixels at
; tl+1 onwards), w, h.
; NOTE(review): the full C prototype has trailing arguments not used
; here — confirm against the dav1d ipred declarations.
; Dispatches on log2(width) via ipred_paeth_ssse3_table; the 7*16-byte
; stack frame is scratch for the w32/w64 top-row and ldiff spills.
;-----------------------------------------------------------------------
cglobal ipred_paeth, 3, 6, 8, -7*16, dst, stride, tl, w, h
%define base r5-ipred_paeth_ssse3_table
    tzcnt                wd, wm
    movifnidn            hd, hm
    pxor                 m0, m0
    movd                 m5, [tlq]
    pshufb               m5, m0                 ; broadcast topleft to all lanes
    LEA                  r5, ipred_paeth_ssse3_table
    movsxd               wq, [r5+wq*4]
    movddup              m4, [base+ipred_paeth_shuf]
    add                  wq, r5
    jmp                  wq                     ; jump to .w4/.w8/.w16/.w32/.w64
.w4:
    movd                 m6, [tlq+1]            ; top
    pshufd               m6, m6, q0000          ; replicate the 4 top pixels
    lea                  r3, [strideq*3]
    psubusb              m7, m5, m6
    psubusb              m0, m6, m5
    por                  m7, m0                 ; ldiff
.w4_loop:
    sub                 tlq, 4
    movd                 m3, [tlq]              ; 4 left pixels (rows n..n+3)
    mova                 m1, [base+ipred_h_shuf]
    pshufb               m3, m1                 ; left
    PAETH                 6, 7
    ; scatter the 16 result bytes as 4 rows of 4 pixels
    movd   [dstq          ], m1
    pshuflw              m0, m1, q1032
    movd   [dstq+strideq  ], m0
    punpckhqdq           m1, m1
    movd   [dstq+strideq*2], m1
    psrlq                m1, 32
    movd   [dstq+r3       ], m1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4_loop
    RET
ALIGN function_align
.w8:
    movddup              m6, [tlq+1]            ; top, replicated to both halves
    psubusb              m7, m5, m6
    psubusb              m0, m6, m5
    por                  m7, m0                 ; ldiff
.w8_loop:
    sub                 tlq, 2
    movd                 m3, [tlq]              ; 2 left pixels
    pshufb               m3, [base+ipred_paeth_shuf] ; one per 8-byte half
    PAETH                 6, 7
    movq     [dstq        ], m1
    movhps   [dstq+strideq], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w8_loop
    RET
ALIGN function_align
.w16:
    movu                 m6, [tlq+1]            ; 16 top pixels
    psubusb              m7, m5, m6
    psubusb              m0, m6, m5
    por                  m7, m0                 ; ldiff
.w16_loop:
    sub                 tlq, 1
    movd                 m3, [tlq]
    pxor                 m1, m1
    pshufb               m3, m1                 ; broadcast this row's left pixel
    PAETH                 6, 7
    mova             [dstq], m1
    add                dstq, strideq
    sub                  hd, 1
    jg .w16_loop
    RET
ALIGN function_align
.w32:
    ; first 16 columns: top/ldiff spilled to the stack frame,
    ; second 16 columns: top spilled, ldiff kept live in m7
    movu                 m6, [tlq+1]
    psubusb              m7, m5, m6
    psubusb              m0, m6, m5
    por                  m7, m0
    mova           [rsp   ], m6                 ; top[0..15]
    mova           [rsp+16], m7                 ; ldiff[0..15]
    movu                 m6, [tlq+17]
    psubusb              m7, m5, m6
    psubusb              m0, m6, m5
    por                  m7, m0                 ; ldiff[16..31] stays in m7
    mova           [rsp+32], m6                 ; top[16..31]
.w32_loop:
    dec                 tlq
    movd                 m3, [tlq]
    pxor                 m1, m1
    pshufb               m3, m1                 ; broadcast this row's left pixel
    mova                 m6, [rsp]
    PAETH                 6, [rsp+16]
    mova          [dstq   ], m1
    mova                 m6, [rsp+32]
    PAETH                 6, 7
    mova          [dstq+16], m1
    add                dstq, strideq
    dec                  hd
    jg .w32_loop
    RET
ALIGN function_align
.w64:
    ; four 16-pixel groups; top/ldiff spilled at rsp+0..96,
    ; last group's ldiff kept live in m7
    movu                 m6, [tlq+1]
    psubusb              m7, m5, m6
    psubusb              m0, m6, m5
    por                  m7, m0
    mova           [rsp   ], m6
    mova           [rsp+16], m7
    movu                 m6, [tlq+17]
    psubusb              m7, m5, m6
    psubusb              m0, m6, m5
    por                  m7, m0
    mova           [rsp+32], m6
    mova           [rsp+48], m7
    movu                 m6, [tlq+33]
    psubusb              m7, m5, m6
    psubusb              m0, m6, m5
    por                  m7, m0
    mova           [rsp+64], m6
    mova           [rsp+80], m7
    movu                 m6, [tlq+49]
    psubusb              m7, m5, m6
    psubusb              m0, m6, m5
    por                  m7, m0
    mova           [rsp+96], m6
.w64_loop:
    dec                 tlq
    movd                 m3, [tlq]
    pxor                 m1, m1
    pshufb               m3, m1                 ; broadcast this row's left pixel
    mova                 m6, [rsp]
    PAETH                 6, [rsp+16]
    mova          [dstq   ], m1
    mova                 m6, [rsp+32]
    PAETH                 6, [rsp+48]
    mova          [dstq+16], m1
    mova                 m6, [rsp+64]
    PAETH                 6, [rsp+80]
    mova          [dstq+32], m1
    mova                 m6, [rsp+96]
    PAETH                 6, 7
    mova          [dstq+48], m1
    add                dstq, strideq
    dec                  hd
    jg .w64_loop
    RET
2916
2917
; Filter-intra kernel: produce one 4x2 block of predicted pixels from 7
; neighbor pixels using the pre-loaded tap registers.
; %1 dst reg index — receives 8 filtered bytes (packed, duplicated)
; %2 src reg index — holds the 7 input pixels; clobbered by the shuffle
; %3 tmp reg index — clobbered
; %4 shuffle arranging the inputs into (p0 p1)(p2 p3)(p4 p5)(p6 _) pairs:
;    a register number, or a memory operand
; Requires taps pre-loaded by the caller: m2, m3, m4, m5 (filter_intra_taps)
%macro FILTER 4  ;dst, src, tmp, shuf
%ifnum %4
    pshufb               m%2, m%4
%else
    pshufb               m%2, %4
%endif
    pshufd               m%1, m%2, q0000           ;p0 p1
    pmaddubsw            m%1, m2
    pshufd               m%3, m%2, q1111           ;p2 p3
    pmaddubsw            m%3, m3
    paddw                m%1, [base+pw_8]          ; +8: rounding for the >>4 below
    paddw                m%1, m%3
    pshufd               m%3, m%2, q2222           ;p4 p5
    pmaddubsw            m%3, m4
    paddw                m%1, m%3
    pshufd               m%3, m%2, q3333           ;p6 __
    pmaddubsw            m%3, m5
    paddw                m%1, m%3
    psraw                m%1, 4                    ; scale the weighted sum
    packuswb             m%1, m%1                  ; clamp to u8 and pack
%endmacro
2939
;-----------------------------------------------------------------------
; Recursive filter intra prediction (SSSE3, 8bpc).
; Args (visible here): dst, stride, tl (top-left edge pointer), w, h,
; filter (filter-mode index selecting one 64-byte set of taps from
; filter_intra_taps).
; Each 4x2 sub-block is predicted from 7 already-known neighbor pixels
; (top row / left column / previously predicted sub-blocks), so blocks
; are processed left-to-right, two rows at a time.
; NOTE(review): full C prototype not visible in this chunk — confirm
; trailing arguments against the dav1d ipred declarations.
;-----------------------------------------------------------------------
cglobal ipred_filter, 3, 7, 8, dst, stride, tl, w, h, filter
%define base r6-$$
    LEA                   r6, $$
    tzcnt                 wd, wm
%ifidn filterd, filterm
    movzx            filterd, filterb
%else
    movzx            filterd, byte filterm
%endif
    shl              filterd, 6                    ; 64 bytes of taps per filter
    lea              filterq, [base+filter_intra_taps+filterq]
    movq                  m0, [tlq-3]                     ;_ 6 5 0 1 2 3 4
    movsxd                wq, [base+ipred_filter_ssse3_table+wq*4]
    ; pre-load the 4 tap vectors used by FILTER (kept live throughout)
    mova                  m2, [filterq+16*0]
    mova                  m3, [filterq+16*1]
    mova                  m4, [filterq+16*2]
    mova                  m5, [filterq+16*3]
    lea                   wq, [base+ipred_filter_ssse3_table+wq]
    mov                   hd, hm
    jmp                   wq                        ; dispatch on log2(width)
.w4:
    mova                  m1, [base+filter_shuf1]
    sub                  tlq, 3
    sub                  tlq, hq                    ; tlq -> bottom of left column
    jmp .w4_loop_start
.w4_loop:
    movd                  m0, [tlq+hq]              ; next 2 left pixels + topleft
    punpckldq             m0, m6                    ; append previous output row
    lea                 dstq, [dstq+strideq*2]
.w4_loop_start:
    FILTER                 6, 0, 7, 1
    movd    [dstq+strideq*0], m6
    pshuflw               m6, m6, q1032
    movd    [dstq+strideq*1], m6
    sub                   hd, 2
    jg .w4_loop
    RET

ALIGN function_align
.w8:
    movq                  m6, [tlq+1]                   ;_ _ _ 0 1 2 3 4
    sub                  tlq, 5
    sub                  tlq, hq

.w8_loop:
    ; left 4x2 block, then right 4x2 block fed by the left one's output
    FILTER                 7, 0, 1, [base+filter_shuf1]
    punpcklqdq            m6, m7                        ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
    FILTER                 0, 6, 1, [base+filter_shuf2]

    punpckldq             m6, m7, m0
    movq    [dstq+strideq*0], m6
    punpckhqdq            m6, m6
    movq    [dstq+strideq*1], m6

    movd                  m0, [tlq+hq]                  ;_ 6 5 0
    punpckldq             m0, m6                        ;_ 6 5 0 1 2 3 4

    lea                 dstq, [dstq+strideq*2]
    sub                   hd, 2
    jg .w8_loop
    RET

ALIGN function_align
.w16:
    movu                  m6, [tlq+1]                   ;top row
    sub                  tlq, 5
    sub                  tlq, hq

.w16_loop:
    ; four 4x2 blocks across the row, each seeded by its left neighbor
    FILTER                 7, 0, 1, [base+filter_shuf1]
    punpcklqdq            m0, m6, m7                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
    movd    [dstq+strideq*0], m7
    psrlq                 m7, 32
    palignr               m7, m6, 4

    FILTER                 6, 0, 1, [base+filter_shuf2]
    punpcklqdq            m0, m7, m6                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
    movd  [dstq+4+strideq*0], m6
    psrlq                 m6, 32
    palignr               m6, m7, 4

    FILTER                 7, 0, 1, [base+filter_shuf2]
    punpcklqdq            m0, m6, m7                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
    movd  [dstq+8+strideq*0], m7
    psrlq                 m7, 32
    palignr               m7, m6, 4

    FILTER                 6, 0, 1, [base+filter_shuf2]
    movd [dstq+12+strideq*0], m6
    psrlq                 m6, 32
    palignr               m6, m7, 4
    mova    [dstq+strideq*1], m6                        ; second output row

    movd                  m0, [tlq+hq]                  ;_ 6 5 0
    punpckldq             m0, m6                        ;_ 6 5 0 1 2 3 4

    lea                 dstq, [dstq+strideq*2]
    sub                   hd, 2
    jg .w16_loop
    RET

ALIGN function_align
.w32:
    movu                  m6, [tlq+1]                   ;top row
    lea              filterq, [tlq+17]                  ; filterq reused: right-half top row ptr
    sub                  tlq, 5
    sub                  tlq, hq

.w32_loop:
    ; left 16 columns (as in .w16) ...
    FILTER                 7, 0, 1, [base+filter_shuf1]
    punpcklqdq            m0, m6, m7                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
    movd    [dstq+strideq*0], m7
    psrlq                 m7, 32
    palignr               m7, m6, 4

    FILTER                 6, 0, 1, [base+filter_shuf2]
    punpcklqdq            m0, m7, m6                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
    movd  [dstq+4+strideq*0], m6
    psrlq                 m6, 32
    palignr               m6, m7, 4

    FILTER                 7, 0, 1, [base+filter_shuf2]
    punpcklqdq            m0, m6, m7                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
    movd  [dstq+8+strideq*0], m7
    psrlq                 m7, 32
    palignr               m7, m6, 4

    FILTER                 6, 0, 1, [base+filter_shuf2]
    movu                  m1, [filterq]                 ; right-half top pixels
    punpckldq             m0, m7, m1                    ;_ _ _ 0 1 2 3 4 _ _ _ _ _ _ _ _
    punpcklqdq            m0, m6                        ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
    movd [dstq+12+strideq*0], m6
    psrlq                 m6, 32
    palignr               m6, m7, 4
    mova    [dstq+strideq*1], m6

    mova                  m6, m1

    ; ... then the right 16 columns, seeded from [filterq]
    FILTER                 7, 0, 6, [base+filter_shuf2]
    punpcklqdq            m0, m1, m7                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
    movd [dstq+16+strideq*0], m7
    psrlq                 m7, 32
    palignr               m7, m1, 4

    FILTER                 6, 0, 1, [base+filter_shuf2]
    punpcklqdq            m0, m7, m6                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
    movd [dstq+20+strideq*0], m6
    psrlq                 m6, 32
    palignr               m6, m7, 4

    FILTER                 7, 0, 1, [base+filter_shuf2]
    punpcklqdq            m0, m6, m7                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
    movd [dstq+24+strideq*0], m7
    psrlq                 m7, 32
    palignr               m7, m6, 4

    FILTER                 6, 0, 1, [base+filter_shuf2]
    movd [dstq+28+strideq*0], m6
    psrlq                 m6, 32
    palignr               m6, m7, 4
    mova [dstq+16+strideq*1], m6

    mova                  m6, [dstq+strideq*1]          ; reload left-half output row
    movd                  m0, [tlq+hq]                  ;_ 6 5 0
    punpckldq             m0, m6                        ;_ 6 5 0 1 2 3 4
    lea              filterq, [dstq+16+strideq*1]       ; next iteration's right-half seed
    lea                 dstq, [dstq+strideq*2]
    sub                   hd, 2
    jg .w32_loop
    RET
3110