1; Copyright © 2021, VideoLAN and dav1d authors
2; Copyright © 2021, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28
29SECTION_RODATA 64
30
; Expand the variadic list of smoothing weights into two lookup tables:
;   smooth_weights_1d_16bpc: each weight pre-shifted left by 7 (*128) so a
;                            single pmulhrsw gives weight*diff/256, rounded
;   smooth_weights_2d_16bpc: interleaved (w, 256-w) pairs for pmaddwd blends
%macro SMOOTH_WEIGHTS 1-*
const smooth_weights_1d_16bpc ; sm_weights[] << 7
    %rep %0
        dw %1*128
        %rotate 1
    %endrep
const smooth_weights_2d_16bpc ; sm_weights[], 256 - sm_weights[]
    %rep %0
        dw %1, 256-%1
        %rotate 1
    %endrep
%endmacro
43
; Smooth-prediction weights, concatenated per block size: 2+2 leading dummy
; entries (sizes 1/2, unused), then 4, 8, 16, 32 and 64 entries for the
; corresponding block dimensions (128 words per table in total).
SMOOTH_WEIGHTS   0,   0, 255, 128, 255, 149,  85,  64, \
               255, 197, 146, 105,  73,  50,  37,  32, \
               255, 225, 196, 170, 145, 123, 102,  84, \
                68,  54,  43,  33,  26,  20,  17,  16, \
               255, 240, 225, 210, 196, 182, 169, 157, \
               145, 133, 122, 111, 101,  92,  83,  74, \
                66,  59,  52,  45,  39,  34,  29,  25, \
                21,  17,  14,  12,  10,   9,   8,   8, \
               255, 248, 240, 233, 225, 218, 210, 203, \
               196, 189, 182, 176, 169, 163, 156, 150, \
               144, 138, 133, 127, 121, 116, 111, 106, \
               101,  96,  91,  86,  82,  77,  73,  69, \
                65,  61,  57,  54,  50,  47,  44,  41, \
                38,  35,  32,  29,  27,  25,  22,  20, \
                18,  16,  15,  13,  12,  10,   9,   8, \
                 7,   6,   6,   5,   5,   4,   4,   4
60
%if ARCH_X86_64

; Shuffle masks and constants for the 64-bit AVX2 kernels below.
ipred_hv_shuf: db  6,  7,  6,  7,  0,  1,  2,  3,  2,  3,  2,  3,  8,  9, 10, 11
               db  4,  5,  4,  5,  4,  5,  6,  7,  0,  1,  0,  1, 12, 13, 14, 15
filter_shuf1:  db  8,  9,  0,  1,  2,  3,  4,  5,  6,  7, 14, 15, 12, 13, -1, -1
filter_shuf2:  db  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,  4,  5,  2,  3, -1, -1
filter_shuf3:  db 12, 13,  0,  1,  2,  3,  4,  5,  6,  7, 10, 11,  8,  9, -1, -1
pal_pred_shuf: db  0,  2,  4,  6,  8, 10, 12, 14,  1,  3,  5,  7,  9, 11, 13, 15
z_base_inc:    dw   0*64,   1*64,   2*64,   3*64,   4*64,   5*64,   6*64,   7*64
               dw   8*64,   9*64,  10*64,  11*64,  12*64,  13*64,  14*64,  15*64
z_filter_t0:   db 55,127, 39,127, 39,127,  7, 15, 31,  7, 15, 31,  0,  3, 31,  0
z_filter_t1:   db 39, 63, 19, 47, 19, 47,  3,  3,  3,  3,  3,  3,  0,  0,  0,  0
z_filter_wh:   db  7,  7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
               db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1
pw_m1024:      times 2 dw -1024
pw_1to16:      dw  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16
pw_16to1:      dw 16, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1
z2_ymul:       dw  1,  2,  1,  2,  1,  2,  1,  2,  3,  4,  3,  4,  3,  4,  3,  4
z2_ymul8:      dw  1,  2,  5,  6,  3,  4,  7,  8,  5,  6, 16, 16,  7,  8 ; 16,16 doubles as pw_16
pb_90:         times 4 db 90
z2_y_shuf_h4:  dd  3,  7,  2,  6,  1,  5,  0,  4
z_upsample:    db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
z2_x_shuf:     db  0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9
z2_y_shuf:     db  6,  7, 14, 15,  4,  5, 12, 13,  4,  5, 12, 13,  2,  3, 10, 11
z2_y_shuf_us:  db  6,  7, 14, 15,  2,  3, 10, 11,  4,  5, 12, 13,  0,  1,  8,  9
z_filter_k:    dw  4,  4,  5,  5,  4,  4
               dw  8,  8,  6,  6,  4,  4
               dw  0,  0,  0,  0,  2,  2

; Scalar word constants aliased into existing tables to save rodata space:
; z_filter_k+0 holds 4,4; z_filter_k+32 holds 2,2; z2_ymul8+20 holds 16,16.
%define pw_2  (z_filter_k+32)
%define pw_4  (z_filter_k+ 0)
%define pw_16 (z2_ymul8  +20)

pw_1:    times 2 dw 1
pw_3:    times 2 dw 3
pw_62:   times 2 dw 62
pw_512:  times 2 dw 512
pw_2048: times 2 dw 2048
pd_8:    dd 8
100
; Build a jump table of 32-bit offsets to local labels of function %1_%2.
; The exported table symbol is biased by -2*4 because callers index it with
; log2(size)*4 and the smallest block size is 4 (tzcnt >= 2), so the first
; two slots would never be used.
%macro JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - 2*4)
    %xdefine %%base mangle(private_prefix %+ _%1_%2)
    %%table:
    %rep %0 - 2
        dd %%base %+ .%3 - (%%table - 2*4)
        %rotate 1
    %endrep
%endmacro
110
; The splat (store) tables are aliased into the dc/cfl tables: the s* entries
; start after the 10 (h*+w*) resp. 8 (h*+w*) leading entries.
%define ipred_dc_splat_16bpc_avx2_table (ipred_dc_16bpc_avx2_table + 10*4)
%define ipred_cfl_splat_16bpc_avx2_table (ipred_cfl_16bpc_avx2_table + 8*4)

JMP_TABLE ipred_dc_16bpc,         avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
                                        s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
JMP_TABLE ipred_dc_left_16bpc,    avx2, h4, h8, h16, h32, h64
JMP_TABLE ipred_h_16bpc,          avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_paeth_16bpc,      avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_16bpc,     avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h_16bpc,   avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_v_16bpc,   avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_z1_16bpc,         avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_z2_16bpc,         avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_z3_16bpc,         avx2, h4, h8, h16, h32, h64
JMP_TABLE ipred_filter_16bpc,     avx2, w4, w8, w16, w32
JMP_TABLE ipred_cfl_16bpc,        avx2, h4, h8, h16, h32, w4, w8, w16, w32, \
                                        s4-8*4, s8-8*4, s16-8*4, s32-8*4
JMP_TABLE ipred_cfl_left_16bpc,   avx2, h4, h8, h16, h32
JMP_TABLE ipred_cfl_ac_444_16bpc, avx2, w4, w8, w16, w32
JMP_TABLE pal_pred_16bpc,         avx2, w4, w8, w16, w32, w64
131
132cextern dr_intra_derivative
133cextern filter_intra_taps
134
135SECTION .text
136
INIT_YMM avx2
;-----------------------------------------------------------------------------
; ipred_dc_top_16bpc(dst, stride, tl, w, h, ...)
; DC prediction from the top neighbors only: dst[] = avg(top row).
; The top row starts at tlq+2 (tlq points at the top-left pixel).
; Reuses ipred_dc_left's .h* summation code (indexed by log2(w)) and the
; shared .s* splat/store code from the dc table.
;-----------------------------------------------------------------------------
cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h
    movifnidn            hd, hm
    add                 tlq, 2
    movd                xm4, wd
    pxor                xm3, xm3
    pavgw               xm4, xm3 ; xm4 = (w+1)>>1, rounding bias for the average
    tzcnt                wd, wd  ; wd = log2(w)
    movd                xm5, wd  ; final right-shift amount
    movu                 m0, [tlq]
    lea                  r5, [ipred_dc_left_16bpc_avx2_table]
    movsxd               r6, [r5+wq*4] ; r6 = summation code (.h* label)
    add                  r6, r5
    add                  r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table
    movsxd               wq, [r5+wq*4] ; wq = store code (.s* label), taken after summation
    add                  wq, r5
    jmp                  r6
154
;-----------------------------------------------------------------------------
; ipred_dc_left_16bpc(dst, stride, tl, w, h, ...)
; DC prediction from the left neighbors only: dst[] = avg(left column).
; The h left pixels are stored in the 2*h bytes immediately below tlq.
; The .h* labels below are also jumped to from ipred_dc_top above.
;-----------------------------------------------------------------------------
cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
    mov                  hd, hm
    sub                 tlq, hq
    movd                xm4, hd
    sub                 tlq, hq  ; tlq -= 2*h: first left pixel
    pxor                xm3, xm3
    pavgw               xm4, xm3 ; xm4 = (h+1)>>1, rounding bias
    tzcnt               r6d, hd  ; log2(h)
    movd                xm5, r6d ; final right-shift amount
    movu                 m0, [tlq]
    lea                  r5, [ipred_dc_left_16bpc_avx2_table]
    movsxd               r6, [r5+r6*4] ; summation entry for this height
    add                  r6, r5
    add                  r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table
    tzcnt                wd, wd
    movsxd               wq, [r5+wq*4] ; store entry for this width
    add                  wq, r5
    jmp                  r6
.h64:
    paddw                m0, [tlq+96]
    paddw                m0, [tlq+64]
.h32:
    paddw                m0, [tlq+32]
.h16:
    vextracti128        xm1, m0, 1
    paddw               xm0, xm1
.h8:
    psrldq              xm1, xm0, 8
    paddw               xm0, xm1
.h4:
    ; Each word now sums at most 16 pixels (<= 16*4095), so no 16-bit
    ; overflow occurred; widen to dwords for the final folds.
    punpcklwd           xm0, xm3
    psrlq               xm1, xm0, 32
    paddd               xm0, xm1
    psrldq              xm1, xm0, 8
    paddd               xm0, xm1
    paddd               xm0, xm4 ; += rounding bias
    psrld               xm0, xm5 ; /= pixel count (power of two)
    lea            stride3q, [strideq*3]
    vpbroadcastw         m0, xm0 ; splat the dc value for the .s* stores
    mova                 m1, m0
    mova                 m2, m0
    mova                 m3, m0
    jmp                  wq
198
;-----------------------------------------------------------------------------
; ipred_dc_16bpc(dst, stride, tl, w, h, ...)
; Full DC prediction: dst[] = avg of the w top + h left neighbors.
; Left pixels are below tlq, the top row starts at tlq+2.
; Control flow: jump to .h<h> (load/sum left), which falls through via
; "jmp wq" to .w<w> (add top + divide), which falls into .s<w> (store).
; When w+h is not a power of two the divide is done as a shift by
; tzcnt(w+h) followed by a Q16 fixed-point reciprocal multiply:
; 0xAAAB ~= 2/3 and 0x6667 ~= 2/5 (pmulhuw then psrlw 1).
;-----------------------------------------------------------------------------
cglobal ipred_dc_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
    movifnidn            hd, hm
    tzcnt               r6d, hd
    lea                 r5d, [wq+hq]
    movd                xm4, r5d
    tzcnt               r5d, r5d  ; log2 of the largest power of two dividing w+h
    movd                xm5, r5d
    lea                  r5, [ipred_dc_16bpc_avx2_table]
    tzcnt                wd, wd
    movsxd               r6, [r5+r6*4]     ; .h* entry (height)
    movsxd               wq, [r5+wq*4+5*4] ; .w* entry (width), after the 5 h* slots
    pxor                 m3, m3
    psrlw               xm4, 1  ; xm4 = (w+h)/2, rounding bias
    add                  r6, r5
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  r6
.h4:
    movq                xm0, [tlq-8] ; 4 left pixels
    jmp                  wq
.w4:
    movq                xm1, [tlq+2] ; 4 top pixels
    paddw                m0, m4      ; += rounding bias
    paddw                m0, m1
    psrlq                m1, m0, 32  ; horizontal word sum of the low 4 words
    paddw                m0, m1
    psrld                m1, m0, 16
    paddw                m0, m1
    cmp                  hd, 4
    jg .w4_mul
    psrlw               xm0, 3       ; w+h == 8: plain shift
    jmp .w4_end
.w4_mul:
    ; h > 4: left sum still spread over the full ymm; finish the reduction,
    ; then divide by 12 (h=8) or 20 (h=16) via reciprocal multiply.
    vextracti128        xm1, m0, 1
    paddw               xm0, xm1
    lea                 r2d, [hq*2]
    mov                 r6d, 0xAAAB6667
    shrx                r6d, r6d, r2d ; low word: 0xAAAB (h=8) or 0x6667 (h=16)
    punpckhwd           xm1, xm0, xm3
    punpcklwd           xm0, xm3
    paddd               xm0, xm1
    movd                xm1, r6d
    psrld               xm0, 2
    pmulhuw             xm0, xm1
    psrlw               xm0, 1
.w4_end:
    vpbroadcastw        xm0, xm0
.s4:
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm0
    movq   [dstq+strideq*2], xm0
    movq   [dstq+stride3q ], xm0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s4
    RET
ALIGN function_align
.h8:
    mova                xm0, [tlq-16] ; 8 left pixels
    jmp                  wq
.w8:
    vextracti128        xm1, m0, 1
    paddw               xm0, [tlq+2]  ; += 8 top pixels
    paddw               xm0, xm4
    paddw               xm0, xm1
    psrld               xm1, xm0, 16
    paddw               xm0, xm1
    pblendw             xm0, xm3, 0xAA ; zero the odd words before dword folds
    psrlq               xm1, xm0, 32
    paddd               xm0, xm1
    psrldq              xm1, xm0, 8
    paddd               xm0, xm1
    psrld               xm0, xm5
    cmp                  hd, 8
    je .w8_end
    ; w+h = 12/24 -> *2/3, w+h = 40 -> *2/5
    mov                 r6d, 0xAAAB
    mov                 r2d, 0x6667
    cmp                  hd, 32
    cmovz               r6d, r2d
    movd                xm1, r6d
    pmulhuw             xm0, xm1
    psrlw               xm0, 1
.w8_end:
    vpbroadcastw        xm0, xm0
.s8:
    mova   [dstq+strideq*0], xm0
    mova   [dstq+strideq*1], xm0
    mova   [dstq+strideq*2], xm0
    mova   [dstq+stride3q ], xm0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s8
    RET
ALIGN function_align
.h16:
    mova                 m0, [tlq-32] ; 16 left pixels
    jmp                  wq
.w16:
    paddw                m0, [tlq+2]  ; += 16 top pixels
    vextracti128        xm1, m0, 1
    paddw               xm0, xm4
    paddw               xm0, xm1
    punpckhwd           xm1, xm0, xm3 ; widen to dwords before folding
    punpcklwd           xm0, xm3
    paddd               xm0, xm1
    psrlq               xm1, xm0, 32
    paddd               xm0, xm1
    psrldq              xm1, xm0, 8
    paddd               xm0, xm1
    psrld               xm0, xm5
    cmp                  hd, 16
    je .w16_end
    ; h in {4,32}: *2/3; h in {8,64} would select *2/5 via the bit test
    mov                 r6d, 0xAAAB
    mov                 r2d, 0x6667
    test                 hb, 8|32
    cmovz               r6d, r2d
    movd                xm1, r6d
    pmulhuw             xm0, xm1
    psrlw               xm0, 1
.w16_end:
    vpbroadcastw         m0, xm0
.s16:
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m0
    mova   [dstq+strideq*2], m0
    mova   [dstq+stride3q ], m0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s16
    RET
ALIGN function_align
.h32:
    mova                 m0, [tlq-64] ; 32 left pixels
    paddw                m0, [tlq-32]
    jmp                  wq
.w32:
    paddw                m0, [tlq+ 2] ; += 32 top pixels
    paddw                m0, [tlq+34]
    vextracti128        xm1, m0, 1
    paddw               xm0, xm4
    paddw               xm0, xm1
    punpcklwd           xm1, xm0, xm3
    punpckhwd           xm0, xm3
    paddd               xm0, xm1
    psrlq               xm1, xm0, 32
    paddd               xm0, xm1
    psrldq              xm1, xm0, 8
    paddd               xm0, xm1
    psrld               xm0, xm5
    cmp                  hd, 32
    je .w32_end
    lea                 r2d, [hq*2]
    mov                 r6d, 0x6667AAAB
    shrx                r6d, r6d, r2d ; low word: 0x6667 (h=8) or 0xAAAB (h=16/64)
    movd                xm1, r6d
    pmulhuw             xm0, xm1
    psrlw               xm0, 1
.w32_end:
    vpbroadcastw         m0, xm0
    mova                 m1, m0
.s32:
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m1
    mova [dstq+strideq*1+32*0], m0
    mova [dstq+strideq*1+32*1], m1
    mova [dstq+strideq*2+32*0], m0
    mova [dstq+strideq*2+32*1], m1
    mova [dstq+stride3q +32*0], m0
    mova [dstq+stride3q +32*1], m1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s32
    RET
ALIGN function_align
.h64:
    mova                 m0, [tlq-128] ; 64 left pixels
    mova                 m1, [tlq- 96]
    paddw                m0, [tlq- 64]
    paddw                m1, [tlq- 32]
    paddw                m0, m1
    jmp                  wq
.w64:
    movu                 m1, [tlq+ 2]  ; += 64 top pixels
    paddw                m0, [tlq+34]
    paddw                m1, [tlq+66]
    paddw                m0, [tlq+98]
    paddw                m0, m1
    vextracti128        xm1, m0, 1
    paddw               xm0, xm1
    punpcklwd           xm1, xm0, xm3
    punpckhwd           xm0, xm3
    paddd               xm1, xm4
    paddd               xm0, xm1
    psrlq               xm1, xm0, 32
    paddd               xm0, xm1
    psrldq              xm1, xm0, 8
    paddd               xm0, xm1
    psrld               xm0, xm5
    cmp                  hd, 64
    je .w64_end
    mov                 r6d, 0x6667AAAB
    shrx                r6d, r6d, hd ; low word: 0x6667 (h=16) or 0xAAAB (h=32)
    movd                xm1, r6d
    pmulhuw             xm0, xm1
    psrlw               xm0, 1
.w64_end:
    vpbroadcastw         m0, xm0
    mova                 m1, m0
    mova                 m2, m0
    mova                 m3, m0
.s64:
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m1
    mova [dstq+strideq*0+32*2], m2
    mova [dstq+strideq*0+32*3], m3
    mova [dstq+strideq*1+32*0], m0
    mova [dstq+strideq*1+32*1], m1
    mova [dstq+strideq*1+32*2], m2
    mova [dstq+strideq*1+32*3], m3
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .s64
    RET
422
;-----------------------------------------------------------------------------
; ipred_dc_128_16bpc(dst, stride, tl, w, h, ...)
; Fill the block with the mid-gray value (half the pixel range).
; r8m is presumably bitdepth_max (0x3ff/0xfff) -- TODO confirm against the
; C prototype; >>11 yields 0 for 10-bit and 1 for 12-bit, indexing the
; adjacent pw_512/pw_2048 constants. Stores via the shared .s* splat code.
;-----------------------------------------------------------------------------
cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
    mov                 r6d, r8m
    shr                 r6d, 11 ; 0 or 1 -> 512 or 2048
    lea                  r5, [ipred_dc_splat_16bpc_avx2_table]
    tzcnt                wd, wd
    movifnidn            hd, hm
    movsxd               wq, [r5+wq*4]
    vpbroadcastd         m0, [r5-ipred_dc_splat_16bpc_avx2_table+pw_512+r6*4]
    mova                 m1, m0
    mova                 m2, m0
    mova                 m3, m0
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  wq
437
;-----------------------------------------------------------------------------
; ipred_v_16bpc(dst, stride, tl, w, h, ...)
; Vertical prediction: every row is a copy of the top neighbor row.
; Loads up to 64 top pixels into m0..m3 (narrower widths only use what the
; chosen .s* store routine reads) and jumps to the shared splat code.
;-----------------------------------------------------------------------------
cglobal ipred_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
    movifnidn            hd, hm
    movu                 m0, [tlq+ 2]
    movu                 m1, [tlq+34]
    movu                 m2, [tlq+66]
    movu                 m3, [tlq+98]
    lea                  r5, [ipred_dc_splat_16bpc_avx2_table]
    tzcnt                wd, wd
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  wq
450
; Horizontal-prediction row group: broadcast the 4 most recent left pixels
; (stored in decreasing address order below tlq) across one register each
; and store 4 rows. Expands inside ipred_h's .w%1 label and loops back to it;
; %2 selects the store width (mov"q"/mov"a").
%macro IPRED_H 2 ; w, store_type
    vpbroadcastw         m0, [tlq-2]
    vpbroadcastw         m1, [tlq-4]
    vpbroadcastw         m2, [tlq-6]
    vpbroadcastw         m3, [tlq-8]
    sub                 tlq, 8
    mov%2  [dstq+strideq*0], m0
    mov%2  [dstq+strideq*1], m1
    mov%2  [dstq+strideq*2], m2
    mov%2  [dstq+stride3q ], m3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w%1
    RET
ALIGN function_align
%endmacro
467
;-----------------------------------------------------------------------------
; ipred_h_16bpc(dst, stride, tl, w, h, ...)
; Horizontal prediction: row y is filled with the left neighbor of row y.
; w4/w8 run as 128-bit (XMM) code, wider sizes as 256-bit (YMM).
;-----------------------------------------------------------------------------
cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3
    movifnidn            hd, hm
    lea                  r5, [ipred_h_16bpc_avx2_table]
    tzcnt                wd, wd
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  wq
INIT_XMM avx2
.w4:
    IPRED_H               4, q
.w8:
    IPRED_H               8, a
INIT_YMM avx2
.w16:
    IPRED_H              16, a
.w32:
    ; 32-wide: two 32-byte stores per row, same broadcast-4-rows pattern.
    vpbroadcastw         m0, [tlq-2]
    vpbroadcastw         m1, [tlq-4]
    vpbroadcastw         m2, [tlq-6]
    vpbroadcastw         m3, [tlq-8]
    sub                 tlq, 8
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m0
    mova [dstq+strideq*1+32*0], m1
    mova [dstq+strideq*1+32*1], m1
    mova [dstq+strideq*2+32*0], m2
    mova [dstq+strideq*2+32*1], m2
    mova [dstq+stride3q +32*0], m3
    mova [dstq+stride3q +32*1], m3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w32
    RET
.w64:
    ; 64-wide: four 32-byte stores per row, two rows per iteration.
    vpbroadcastw         m0, [tlq-2]
    vpbroadcastw         m1, [tlq-4]
    sub                 tlq, 4
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m0
    mova [dstq+strideq*0+32*2], m0
    mova [dstq+strideq*0+32*3], m0
    mova [dstq+strideq*1+32*0], m1
    mova [dstq+strideq*1+32*1], m1
    mova [dstq+strideq*1+32*2], m1
    mova [dstq+strideq*1+32*3], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w64
    RET
518
; Per-lane Paeth predictor. Inputs: m1 = left, m3 = topleft,
; m%1 = top, m%2 = top - topleft (signed), m%3 = |top - topleft|.
; base = left + top - topleft; since base-left = top-tl, m%3 is the
; left distance, |base-top| = |left-tl| and |base-tl| are computed here.
; Result in m0: the neighbor (left/top/topleft) closest to base.
%macro PAETH 3 ; top, signed_ldiff, ldiff
    paddw               m0, m%2, m1 ; m0 = base = left + top - topleft
    psubw               m7, m3, m0  ; tldiff
    psubw               m0, m%1     ; tdiff
    pabsw               m7, m7
    pabsw               m0, m0
    pminsw              m7, m0      ; min(tdiff, tldiff)
    pcmpeqw             m0, m7      ; mask: tdiff <= tldiff
    pcmpgtw             m7, m%3, m7 ; mask: ldiff > min(tdiff, tldiff)
    vpblendvb           m0, m3, m%1, m0 ; top if tdiff<=tldiff else topleft
    vpblendvb           m0, m1, m0, m7  ; ... unless left is closest
%endmacro
531
;-----------------------------------------------------------------------------
; ipred_paeth_16bpc(dst, stride, tl, w, h)
; Paeth prediction: per pixel, pick the neighbor (left, top or topleft)
; closest to left + top - topleft. Top-dependent terms (top, top-tl,
; |top-tl|) are hoisted out of the row loops; only the left pixel is
; (re)broadcast per row.
;-----------------------------------------------------------------------------
cglobal ipred_paeth_16bpc, 3, 6, 8, dst, stride, tl, w, h
%define base r5-ipred_paeth_16bpc_avx2_table
    movifnidn           hd, hm
    lea                 r5, [ipred_paeth_16bpc_avx2_table]
    tzcnt               wd, wd
    movsxd              wq, [r5+wq*4]
    vpbroadcastw        m3, [tlq]   ; topleft
    add                 wq, r5
    jmp                 wq
.w4:
    vpbroadcastq        m2, [tlq+2] ; top
    movsldup            m6, [base+ipred_hv_shuf]
    lea                 r3, [strideq*3]
    psubw               m4, m2, m3  ; top - topleft
    pabsw               m5, m4
.w4_loop:
    sub                tlq, 8       ; consume 4 left pixels per iteration
    vpbroadcastq        m1, [tlq]
    pshufb              m1, m6      ; left
    PAETH                2, 4, 5
    vextracti128       xm1, m0, 1
    movq  [dstq+strideq*0], xm0
    movq  [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+r3       ], xm1
    lea               dstq, [dstq+strideq*4]
    sub                 hd, 4
    jg .w4_loop
    RET
ALIGN function_align
.w8:
    vbroadcasti128      m2, [tlq+2]
    movsldup            m6, [base+ipred_hv_shuf]
    psubw               m4, m2, m3
    pabsw               m5, m4
.w8_loop:
    sub                tlq, 4       ; 2 left pixels -> 2 rows per iteration
    vpbroadcastd        m1, [tlq]
    pshufb              m1, m6
    PAETH                2, 4, 5
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    lea               dstq, [dstq+strideq*2]
    sub                 hd, 2
    jg .w8_loop
    RET
ALIGN function_align
.w16:
    movu                m2, [tlq+2]
    psubw               m4, m2, m3
    pabsw               m5, m4
.w16_loop:
    sub                tlq, 2       ; one left pixel / one row per iteration
    vpbroadcastw        m1, [tlq]
    PAETH                2, 4, 5
    mova            [dstq], m0
    add               dstq, strideq
    dec                 hd
    jg .w16_loop
    RET
ALIGN function_align
.w32:
    movu                m2, [tlq+2]
    movu                m6, [tlq+34]
%if WIN64
    ; Spill xmm8/9 into otherwise-unused stack argument slots (Win64
    ; requires xmm6-15 to be preserved).
    movaps             r4m, xmm8
    movaps             r6m, xmm9
%endif
    psubw               m4, m2, m3
    psubw               m8, m6, m3
    pabsw               m5, m4
    pabsw               m9, m8
.w32_loop:
    sub                tlq, 2
    vpbroadcastw        m1, [tlq]
    PAETH                2, 4, 5
    mova       [dstq+32*0], m0
    PAETH                6, 8, 9
    mova       [dstq+32*1], m0
    add               dstq, strideq
    dec                 hd
    jg .w32_loop
%if WIN64
    movaps            xmm8, r4m
    movaps            xmm9, r6m
%endif
    RET
ALIGN function_align
.w64:
    WIN64_SPILL_XMM 16
    ; Four 16-pixel column groups, each with its own top/diff/absdiff regs.
    movu                m2, [tlq+ 2]
    movu                m6, [tlq+34]
    movu               m10, [tlq+66]
    movu               m13, [tlq+98]
    psubw               m4, m2, m3
    psubw               m8, m6, m3
    psubw              m11, m10, m3
    psubw              m14, m13, m3
    pabsw               m5, m4
    pabsw               m9, m8
    pabsw              m12, m11
    pabsw              m15, m14
.w64_loop:
    sub                tlq, 2
    vpbroadcastw        m1, [tlq]
    PAETH                2, 4, 5
    mova       [dstq+32*0], m0
    PAETH                6, 8, 9
    mova       [dstq+32*1], m0
    PAETH               10, 11, 12
    mova       [dstq+32*2], m0
    PAETH               13, 14, 15
    mova       [dstq+32*3], m0
    add               dstq, strideq
    dec                 hd
    jg .w64_loop
    RET
649
;-----------------------------------------------------------------------------
; ipred_smooth_v_16bpc(dst, stride, tl, w, h, ...)
; Vertical smooth prediction: each row blends the top row towards the
; bottom-left pixel: dst = bottom + weight[y]*(top - bottom)/256, rounded.
; The 1-d weights are pre-shifted <<7 so pmulhrsw ((a*b + 0x4000) >> 15)
; performs the /256 with rounding. h is negated and counted up to 0.
;-----------------------------------------------------------------------------
cglobal ipred_smooth_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, weights
%define base r6-ipred_smooth_v_16bpc_avx2_table
    lea                  r6, [ipred_smooth_v_16bpc_avx2_table]
    tzcnt                wd, wm
    mov                  hd, hm
    movsxd               wq, [r6+wq*4]
    lea            weightsq, [base+smooth_weights_1d_16bpc+hq*4]
    neg                  hq
    vpbroadcastw         m5, [tlq+hq*2] ; bottom
    add                  wq, r6
    jmp                  wq
.w4:
    vpbroadcastq         m4, [tlq+2]    ; top
    movsldup             m3, [base+ipred_hv_shuf]
    lea                  r6, [strideq*3]
    psubw                m4, m5         ; top - bottom
.w4_loop:
    vpbroadcastq         m0, [weightsq+hq*2] ; 4 row weights
    pshufb               m0, m3
    pmulhrsw             m0, m4
    paddw                m0, m5
    vextracti128        xm1, m0, 1
    movhps [dstq+strideq*0], xm1
    movhps [dstq+strideq*1], xm0
    movq   [dstq+strideq*2], xm1
    movq   [dstq+r6       ], xm0
    lea                dstq, [dstq+strideq*4]
    add                  hq, 4
    jl .w4_loop
.ret:
    RET
.w8:
    vbroadcasti128       m4, [tlq+2]
    movsldup             m3, [base+ipred_hv_shuf]
    lea                  r6, [strideq*3]
    psubw                m4, m5
.w8_loop:
    vpbroadcastd         m0, [weightsq+hq*2+0] ; 2 row weights per register
    vpbroadcastd         m1, [weightsq+hq*2+4]
    pshufb               m0, m3
    pshufb               m1, m3
    pmulhrsw             m0, m4
    pmulhrsw             m1, m4
    paddw                m0, m5
    paddw                m1, m5
    vextracti128 [dstq+strideq*0], m0, 1
    mova         [dstq+strideq*1], xm0
    vextracti128 [dstq+strideq*2], m1, 1
    mova         [dstq+r6       ], xm1
    lea                dstq, [dstq+strideq*4]
    add                  hq, 4
    jl .w8_loop
    RET
.w16:
    movu                 m4, [tlq+2]
    lea                  r6, [strideq*3]
    psubw                m4, m5
.w16_loop:
    vpbroadcastw         m0, [weightsq+hq*2+0] ; one weight per row
    vpbroadcastw         m1, [weightsq+hq*2+2]
    vpbroadcastw         m2, [weightsq+hq*2+4]
    vpbroadcastw         m3, [weightsq+hq*2+6]
    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
    REPX   {paddw    x, m5}, m0, m1, m2, m3
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    mova   [dstq+strideq*2], m2
    mova   [dstq+r6       ], m3
    lea                dstq, [dstq+strideq*4]
    add                  hq, 4
    jl .w16_loop
    RET
.w32:
    WIN64_SPILL_XMM       7
    movu                 m4, [tlq+ 2] ; two 16-pixel top halves
    movu                 m6, [tlq+34]
    psubw                m4, m5
    psubw                m6, m5
.w32_loop:
    vpbroadcastw         m1, [weightsq+hq*2+0]
    vpbroadcastw         m3, [weightsq+hq*2+2]
    pmulhrsw             m0, m4, m1
    pmulhrsw             m1, m6
    pmulhrsw             m2, m4, m3
    pmulhrsw             m3, m6
    REPX      {paddw x, m5}, m0, m1, m2, m3
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m1
    mova [dstq+strideq*1+32*0], m2
    mova [dstq+strideq*1+32*1], m3
    lea                dstq, [dstq+strideq*2]
    add                  hq, 2
    jl .w32_loop
    RET
.w64:
    WIN64_SPILL_XMM       8
    movu                 m3, [tlq+ 2] ; four 16-pixel top quarters
    movu                 m4, [tlq+34]
    movu                 m6, [tlq+66]
    movu                 m7, [tlq+98]
    REPX      {psubw x, m5}, m3, m4, m6, m7
.w64_loop:
    vpbroadcastw         m2, [weightsq+hq*2]
    pmulhrsw             m0, m3, m2
    pmulhrsw             m1, m4, m2
    paddw                m0, m5
    paddw                m1, m5
    mova        [dstq+32*0], m0
    pmulhrsw             m0, m6, m2
    mova        [dstq+32*1], m1
    pmulhrsw             m1, m7, m2
    paddw                m0, m5
    paddw                m1, m5
    mova        [dstq+32*2], m0
    mova        [dstq+32*3], m1
    add                dstq, strideq
    inc                  hq
    jl .w64_loop
    RET
769
;-----------------------------------------------------------------------------
; ipred_smooth_h_16bpc(dst, stride, tl, w, h, ...)
; Horizontal smooth prediction: each column blends the left column towards
; the top-right pixel: dst = right + weight[x]*(left - right)/256, rounded
; (pmulhrsw with the <<7 weights). hd is doubled up front so hq counts
; bytes of left pixels; tlq is rewound to the start of the left column.
;-----------------------------------------------------------------------------
cglobal ipred_smooth_h_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
%define base r6-ipred_smooth_h_16bpc_avx2_table
    lea                  r6, [ipred_smooth_h_16bpc_avx2_table]
    mov                  wd, wm
    movifnidn            hd, hm
    vpbroadcastw         m5, [tlq+wq*2] ; right
    tzcnt                wd, wd
    add                  hd, hd         ; h in bytes
    movsxd               wq, [r6+wq*4]
    sub                 tlq, hq
    lea            stride3q, [strideq*3]
    add                  wq, r6
    jmp                  wq
.w4:
    vpbroadcastq         m4, [base+smooth_weights_1d_16bpc+4*2]
    movsldup             m3, [base+ipred_hv_shuf]
.w4_loop:
    vpbroadcastq         m0, [tlq+hq-8] ; left (4 rows)
    pshufb               m0, m3
    psubw                m0, m5         ; left - right
    pmulhrsw             m0, m4
    paddw                m0, m5
    vextracti128        xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4*2
    jg .w4_loop
    RET
.w8:
    vbroadcasti128       m4, [base+smooth_weights_1d_16bpc+8*2]
    movsldup             m3, [base+ipred_hv_shuf]
.w8_loop:
    vpbroadcastd         m0, [tlq+hq-4] ; 2 left pixels per register
    vpbroadcastd         m1, [tlq+hq-8]
    pshufb               m0, m3
    pshufb               m1, m3
    psubw                m0, m5
    psubw                m1, m5
    pmulhrsw             m0, m4
    pmulhrsw             m1, m4
    paddw                m0, m5
    paddw                m1, m5
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    mova         [dstq+strideq*2], xm1
    vextracti128 [dstq+stride3q ], m1, 1
    lea                dstq, [dstq+strideq*4]
    sub                  hq, 4*2
    jg .w8_loop
    RET
.w16:
    movu                 m4, [base+smooth_weights_1d_16bpc+16*2]
.w16_loop:
    vpbroadcastq         m3, [tlq+hq-8] ; 4 left pixels, splatted per row below
    punpcklwd            m3, m3
    psubw                m3, m5
    pshufd               m0, m3, q3333
    pshufd               m1, m3, q2222
    pshufd               m2, m3, q1111
    pshufd               m3, m3, q0000
    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
    REPX   {paddw    x, m5}, m0, m1, m2, m3
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    mova   [dstq+strideq*2], m2
    mova   [dstq+stride3q ], m3
    lea                dstq, [dstq+strideq*4]
    sub                  hq, 4*2
    jg .w16_loop
    RET
.w32:
    WIN64_SPILL_XMM       7
    movu                 m4, [base+smooth_weights_1d_16bpc+32*2] ; two weight halves
    movu                 m6, [base+smooth_weights_1d_16bpc+32*3]
.w32_loop:
    vpbroadcastw         m1, [tlq+hq-2]
    vpbroadcastw         m3, [tlq+hq-4]
    psubw                m1, m5
    psubw                m3, m5
    pmulhrsw             m0, m4, m1
    pmulhrsw             m1, m6
    pmulhrsw             m2, m4, m3
    pmulhrsw             m3, m6
    REPX      {paddw x, m5}, m0, m1, m2, m3
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m1
    mova [dstq+strideq*1+32*0], m2
    mova [dstq+strideq*1+32*1], m3
    lea                dstq, [dstq+strideq*2]
    sub                  hq, 2*2
    jg .w32_loop
    RET
.w64:
    WIN64_SPILL_XMM       8
    movu                 m3, [base+smooth_weights_1d_16bpc+32*4] ; four weight quarters
    movu                 m4, [base+smooth_weights_1d_16bpc+32*5]
    movu                 m6, [base+smooth_weights_1d_16bpc+32*6]
    movu                 m7, [base+smooth_weights_1d_16bpc+32*7]
.w64_loop:
    vpbroadcastw         m2, [tlq+hq-2]
    psubw                m2, m5
    pmulhrsw             m0, m3, m2
    pmulhrsw             m1, m4, m2
    paddw                m0, m5
    paddw                m1, m5
    mova        [dstq+32*0], m0
    pmulhrsw             m0, m6, m2
    mova        [dstq+32*1], m1
    pmulhrsw             m1, m7, m2
    paddw                m0, m5
    paddw                m1, m5
    mova        [dstq+32*2], m0
    mova        [dstq+32*3], m1
    add                dstq, strideq
    sub                  hq, 1*2
    jg .w64_loop
    RET
890
; Finish one output row (or row pair) of 2-D smooth prediction:
;   m0 = (m<src1> dot m<mul1>) + m<add1>   ; low-half dword sums
;   m1 = (m<src2> dot m<mul2>) + m<add2>   ; high-half dword sums
; then shift both down by 8, re-pack to 16 words, and pavgw with m5.
; m5 is zeroed in ipred_smooth_16bpc's prologue, so the pavgw acts as
; a final (x+1)>>1 rounding step on the packed result.
; The src operands hold interleaved (value, weight-complement) word
; pairs and the mul operands the matching (w, 256-w) weight pairs, so
; each pmaddwd produces one blended dword per output pixel.
%macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2]
    pmaddwd             m0, m%1, m%3
    pmaddwd             m1, m%2, m%4
    paddd               m0, m%5
    paddd               m1, m%6
    psrld               m0, 8
    psrld               m1, 8
    packssdw            m0, m1 ; dword sums fit in 16 bits after >>8, so pack is lossless
    pavgw               m0, m5
%endmacro
901
; void ipred_smooth_16bpc(pixel *dst, ptrdiff_t stride, const pixel *tl,
;                         int w, int h, ...)
; AV1 SMOOTH_PRED, 16bpc, AVX2: every output pixel is a weighted blend
; of the top & bottom edge rows (selected by per-row vertical weights)
; and the left & right edge columns (per-column horizontal weights).
; The 2-D weight table stores word pairs (w, 256-w) so each blend is a
; single pmaddwd against an interleaved (value_a, value_b) pair;
; SMOOTH_2D_END combines the vertical and horizontal halves.
; Register invariants across all width cases:
;   m4 = right-edge pixel (broadcast), m5 = 0 (rounding via pavgw).
cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights
%define base r6-ipred_smooth_16bpc_avx2_table
    lea                 r6, [ipred_smooth_16bpc_avx2_table]
    mov                 wd, wm
    vpbroadcastw        m4, [tlq+wq*2] ; right
    tzcnt               wd, wd
    mov                 hd, hm
    sub                tlq, hq        ; rewind tl over the left edge:
    sub                tlq, hq        ; h word pixels = 2*h bytes
    movsxd              wq, [r6+wq*4]
    pxor                m5, m5        ; m5 = 0 for SMOOTH_2D_END's pavgw
    add                 wq, r6
    lea         v_weightsq, [base+smooth_weights_2d_16bpc+hq*4] ; 4 bytes (w, 256-w) per row
    jmp                 wq
.w4:
    WIN64_SPILL_XMM     11
    vpbroadcastw        m0, [tlq] ; bottom
    vpbroadcastq        m6, [tlq+hq*2+2] ; top row (4 pixels)
    movsldup            m7, [base+ipred_hv_shuf]
    movshdup            m9, [base+ipred_hv_shuf]
    vbroadcasti128     m10, [base+smooth_weights_2d_16bpc+4*4] ; w=4 horizontal weights
    punpcklwd           m6, m0 ; top, bottom
    punpcklqdq          m8, m9, m9
    punpckhqdq          m9, m9
    lea                 r3, [strideq*3]
.w4_loop: ; 4 rows per iteration, left edge consumed bottom-up via hq
    vpbroadcastq        m3, [tlq+hq*2-8] ; next 4 left-edge pixels
    vbroadcasti128      m1, [v_weightsq] ; vertical weights for these 4 rows
    pshufb              m3, m7
    punpcklwd           m2, m3, m4 ; left, right
    punpckhwd           m3, m4
    pmaddwd             m2, m10    ; horizontal part: left*w + right*(256-w)
    pmaddwd             m3, m10
    pshufb              m0, m1, m8 ; distribute v_weights across the row pairs
    pshufb              m1, m9
    SMOOTH_2D_END        0, 1, 6, 6, 2, 3
    vextracti128       xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+r3       ], xm1
    lea               dstq, [dstq+strideq*4]
    add         v_weightsq, 16 ; 4 rows * 4 bytes
    sub                 hd, 4
    jg .w4_loop
    RET
.w8:
%assign stack_offset stack_offset - stack_size_padded ; width cases are alternative entry points, not sequential
    WIN64_SPILL_XMM     12
    vpbroadcastw        m0, [tlq] ; bottom
    vbroadcasti128      m7, [tlq+hq*2+2] ; top row (8 pixels)
    movsldup            m8, [base+ipred_hv_shuf]
    movshdup            m9, [base+ipred_hv_shuf]
    vbroadcasti128     m10, [base+smooth_weights_2d_16bpc+8*4+16*0]
    vbroadcasti128     m11, [base+smooth_weights_2d_16bpc+8*4+16*1]
    punpcklwd           m6, m7, m0 ; top, bottom
    punpckhwd           m7, m0
.w8_loop: ; 2 rows per iteration
    vpbroadcastd        m3, [tlq+hq*2-4] ; next 2 left-edge pixels
    vpbroadcastq        m1, [v_weightsq] ; vertical weights for 2 rows
    pshufb              m3, m8
    punpcklwd           m2, m3, m4 ; left, right
    punpckhwd           m3, m4
    pmaddwd             m2, m10    ; horizontal part, low 4 columns
    pmaddwd             m3, m11    ; horizontal part, high 4 columns
    pshufb              m1, m9
    SMOOTH_2D_END        1, 1, 6, 7, 2, 3
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    lea               dstq, [dstq+strideq*2]
    add         v_weightsq, 8 ; 2 rows * 4 bytes
    sub                 hd, 2
    jg .w8_loop
    RET
.w16:
%assign stack_offset stack_offset - stack_size_padded
    WIN64_SPILL_XMM     11
    vpbroadcastw        m0, [tlq] ; bottom
    movu                m7, [tlq+hq*2+2] ; top row (16 pixels)
    mova               xm8, [base+smooth_weights_2d_16bpc+16*4+16*0]
    mova               xm9, [base+smooth_weights_2d_16bpc+16*4+16*1]
    vinserti128         m8, [base+smooth_weights_2d_16bpc+16*4+16*2], 1
    vinserti128         m9, [base+smooth_weights_2d_16bpc+16*4+16*3], 1
    punpcklwd           m6, m7, m0 ; top, bottom
    punpckhwd           m7, m0
.w16_loop: ; 2 rows per iteration, one SMOOTH_2D_END per row
    vpbroadcastd        m3, [tlq+hq*2-4] ; next 2 left-edge pixels
    vpbroadcastd        m1, [v_weightsq+0] ; v_weights, first row
    punpcklwd           m3, m4     ; left, right
    pshufd              m2, m3, q1111 ; (left, right) of row 0
    pmaddwd            m10, m8, m2
    pmaddwd             m2, m9
    pshufd              m3, m3, q0000 ; (left, right) of row 1
    SMOOTH_2D_END        1, 1, 6, 7, 10, 2
    vpbroadcastd        m1, [v_weightsq+4] ; v_weights, second row
    pmaddwd             m2, m8, m3
    pmaddwd             m3, m9
    mova  [dstq+strideq*0], m0
    SMOOTH_2D_END        1, 1, 6, 7, 2, 3
    mova  [dstq+strideq*1], m0
    lea               dstq, [dstq+strideq*2]
    add         v_weightsq, 8
    sub                 hq, 2
    jg .w16_loop
    RET
.w32:
%assign stack_offset stack_offset - stack_size_padded
    WIN64_SPILL_XMM     15
    vpbroadcastw        m0, [tlq] ; bottom
    movu                m7, [tlq+hq*2+ 2] ; top row, pixels 0-15
    movu                m9, [tlq+hq*2+34] ; top row, pixels 16-31
    mova              xm10, [base+smooth_weights_2d_16bpc+32*4+16*0]
    mova              xm11, [base+smooth_weights_2d_16bpc+32*4+16*1]
    vinserti128        m10, [base+smooth_weights_2d_16bpc+32*4+16*2], 1
    vinserti128        m11, [base+smooth_weights_2d_16bpc+32*4+16*3], 1
    mova              xm12, [base+smooth_weights_2d_16bpc+32*4+16*4]
    mova              xm13, [base+smooth_weights_2d_16bpc+32*4+16*5]
    vinserti128        m12, [base+smooth_weights_2d_16bpc+32*4+16*6], 1
    vinserti128        m13, [base+smooth_weights_2d_16bpc+32*4+16*7], 1
    punpcklwd           m6, m7, m0 ; (top, bottom), pixels 0-15
    punpckhwd           m7, m0
    punpcklwd           m8, m9, m0 ; (top, bottom), pixels 16-31
    punpckhwd           m9, m0
.w32_loop: ; 1 row per iteration, two 16-pixel halves
    vpbroadcastw        m3, [tlq+hq*2-2] ; this row's left-edge pixel
    vpbroadcastd       m14, [v_weightsq] ; this row's vertical weights
    punpcklwd           m3, m4           ; (left, right)
    pmaddwd             m1, m10, m3      ; horizontal part, pixels 0-7
    pmaddwd             m2, m11, m3      ; pixels 8-15
    pmaddwd             m0, m6, m14      ; vertical part, pixels 0-7
    paddd               m0, m1
    pmaddwd             m1, m7, m14      ; vertical part, pixels 8-15
    paddd               m1, m2
    pmaddwd             m2, m12, m3      ; horizontal part, pixels 16-23
    pmaddwd             m3, m13          ; pixels 24-31
    psrld               m0, 8            ; inlined SMOOTH_2D_END tail for the
    psrld               m1, 8            ; first half (scheduling overlap)
    packssdw            m0, m1
    pavgw               m0, m5
    mova       [dstq+32*0], m0
    SMOOTH_2D_END       14, 14, 8, 9, 2, 3 ; second half, pixels 16-31
    mova       [dstq+32*1], m0
    add               dstq, strideq
    add         v_weightsq, 4
    dec                 hd
    jg .w32_loop
    RET
.w64:
%assign stack_offset stack_offset - stack_size_padded
    ; re-declare registers: needs extra GPRs to iterate two 32-pixel
    ; column strips (x) over all rows (y) with rewinding base pointers
    PROLOGUE 0, 11, 16, dst, stride, tl, tl_base, h, v_weights, dummy, v_weights_base, x, y, dst_base
    mov          dst_baseq, dstq
    mov           tl_baseq, tlq
    mov    v_weights_baseq, v_weightsq
    xor                 xq, xq
.w64_loop_x: ; once per 32-pixel-wide column strip (two strips total)
    mov                 yq, hq
    lea                tlq, [tl_baseq+hq*2]
    vpbroadcastw        m0, [tl_baseq] ; bottom
    movu                m7, [tlq+xq*2+ 2] ; top, strip pixels 0-15
    movu                m9, [tlq+xq*2+34] ; top, strip pixels 16-31
    mova              xm10, [base+smooth_weights_2d_16bpc+64*4+16*0]
    mova              xm11, [base+smooth_weights_2d_16bpc+64*4+16*1]
    vinserti128        m10, [base+smooth_weights_2d_16bpc+64*4+16*2], 1
    vinserti128        m11, [base+smooth_weights_2d_16bpc+64*4+16*3], 1
    mova              xm12, [base+smooth_weights_2d_16bpc+64*4+16*4]
    mova              xm13, [base+smooth_weights_2d_16bpc+64*4+16*5]
    vinserti128        m12, [base+smooth_weights_2d_16bpc+64*4+16*6], 1
    vinserti128        m13, [base+smooth_weights_2d_16bpc+64*4+16*7], 1
    punpcklwd           m6, m7, m0 ; (top, bottom) pairs for the strip
    punpckhwd           m7, m0
    punpcklwd           m8, m9, m0
    punpckhwd           m9, m0
    lea                tlq, [tl_baseq-2]
.w64_loop_y: ; 1 row per iteration within the current strip
    vpbroadcastw        m3, [tlq+yq*2] ; this row's left-edge pixel
    vpbroadcastd        m1, [v_weightsq]
    punpcklwd           m3, m4         ; (left, right)
    pmaddwd            m14, m10, m3    ; horizontal part, 4 quarters
    pmaddwd            m15, m11, m3
    pmaddwd             m2, m12, m3
    pmaddwd             m3, m13
    pmaddwd             m0, m6, m1     ; vertical part, first half
    paddd               m0, m14
    pmaddwd            m14, m7, m1
    paddd              m14, m15
    psrld               m0, 8          ; inlined SMOOTH_2D_END tail, first half
    psrld              m14, 8
    packssdw            m0, m14
    pavgw               m0, m5
    mova       [dstq+32*0], m0
    SMOOTH_2D_END        8, 9, 1, 1, 2, 3 ; second half of the strip
    mova       [dstq+32*1], m0
    add               dstq, strideq
    add         v_weightsq, 4
    dec                 yq
    jg .w64_loop_y
    lea               dstq, [dst_baseq+32*2] ; jump to the right-hand strip
    add                 r6, 16*8 ; shift base so the 64*4+16*n loads address
                                 ; the second half of the w=64 weight row
    mov         v_weightsq, v_weights_baseq
    add                 xq, 32
    test                xb, 64   ; set after the second strip (x == 64)
    jz .w64_loop_x
    RET
1105
1106cglobal ipred_z1_16bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
1107    %assign org_stack_offset stack_offset
1108    lea                  r6, [ipred_z1_16bpc_avx2_table]
1109    tzcnt                wd, wm
1110    movifnidn        angled, anglem
1111    movifnidn            hd, hm
1112    lea                  r7, [dr_intra_derivative]
1113    movsxd               wq, [r6+wq*4]
1114    add                 tlq, 2
1115    add                  wq, r6
1116    mov                 dxd, angled
1117    and                 dxd, 0x7e
1118    add              angled, 165 ; ~90
1119    movzx               dxd, word [r7+dxq]
1120    xor              angled, 0x4ff ; d = 90 - angle
1121    vpbroadcastd         m5, [pw_62]
1122    jmp                  wq
1123.w4:
1124    ALLOC_STACK         -64, 7
1125    cmp              angleb, 40
1126    jae .w4_no_upsample
1127    lea                 r3d, [angleq-1024]
1128    sar                 r3d, 7
1129    add                 r3d, hd
1130    jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
1131    vpbroadcastw        xm3, [tlq+14]
1132    movu                xm1, [tlq+ 0]    ; 1 2 3 4 5 6 7 8
1133    palignr             xm0, xm3, xm1, 4 ; 3 4 5 6 7 8 8 8
1134    paddw               xm0, [tlq- 2]    ; 0 1 2 3 4 5 6 7
1135    add                 dxd, dxd
1136    palignr             xm2, xm3, xm1, 2 ; 2 3 4 5 6 7 8 8
1137    paddw               xm2, xm1         ; -1 * a + 9 * b + 9 * c + -1 * d
1138    psubw               xm0, xm2, xm0    ; = (b + c - a - d + (b + c) << 3 + 8) >> 4
1139    psraw               xm0, 3           ; = ((b + c - a - d) >> 3 + b + c + 1) >> 1
1140    pxor                xm4, xm4
1141    paddw               xm2, xm0
1142    vpbroadcastw        xm0, r8m         ; pixel_max
1143    mova           [rsp+32], xm3
1144    movd                xm3, dxd
1145    pmaxsw              xm2, xm4
1146    mov                 r3d, dxd
1147    pavgw               xm2, xm4
1148    vpbroadcastw         m3, xm3
1149    pminsw              xm2, xm0
1150    punpcklwd           xm0, xm1, xm2
1151    punpckhwd           xm1, xm2
1152    lea                  r5, [strideq*3]
1153    pslldq               m2, m3, 8
1154    mova           [rsp+ 0], xm0
1155    mova           [rsp+16], xm1
1156    paddw                m6, m3, m3
1157    paddw                m3, m2
1158    vpblendd             m4, m6, 0xf0
1159    paddw                m6, m6
1160    paddw                m3, m4 ; xpos0 xpos1 xpos2 xpos3
1161    vbroadcasti128       m4, [z_upsample]
1162.w4_upsample_loop:
1163    lea                 r2d, [r3+dxq]
1164    shr                 r3d, 6 ; base0
1165    movu                xm1, [rsp+r3*2]
1166    lea                 r3d, [r2+dxq]
1167    shr                 r2d, 6 ; base1
1168    movu                xm2, [rsp+r2*2]
1169    lea                 r2d, [r3+dxq]
1170    shr                 r3d, 6 ; base2
1171    vinserti128          m1, [rsp+r3*2], 1 ; 0 2
1172    lea                 r3d, [r2+dxq]
1173    shr                 r2d, 6 ; base3
1174    vinserti128          m2, [rsp+r2*2], 1 ; 1 3
1175    pshufb               m1, m4
1176    pshufb               m2, m4
1177    punpcklqdq           m0, m1, m2
1178    punpckhqdq           m1, m2
1179    pand                 m2, m5, m3 ; frac
1180    psllw                m2, 9      ; (a * (64 - frac) + b * frac + 32) >> 6
1181    psubw                m1, m0     ; = a + (((b - a) * frac + 32) >> 6)
1182    pmulhrsw             m1, m2     ; = a + (((b - a) * (frac << 9) + 16384) >> 15)
1183    paddw                m3, m6     ; xpos += dx
1184    paddw                m0, m1
1185    vextracti128        xm1, m0, 1
1186    movq   [dstq+strideq*0], xm0
1187    movhps [dstq+strideq*1], xm0
1188    movq   [dstq+strideq*2], xm1
1189    movhps [dstq+r5       ], xm1
1190    lea                dstq, [dstq+strideq*4]
1191    sub                  hd, 4
1192    jg .w4_upsample_loop
1193    RET
1194ALIGN function_align
1195.filter_strength: ; w4/w8/w16
1196%define base r3-z_filter_t0
1197    movd                xm0, maxbased
1198    lea                  r3, [z_filter_t0]
1199    movd                xm1, angled
1200    shr              angled, 8 ; is_sm << 1
1201    vpbroadcastb         m0, xm0
1202    vpbroadcastb         m1, xm1
1203    pcmpeqb              m0, [base+z_filter_wh]
1204    mova                xm2, [r3+angleq*8]
1205    pand                 m0, m1
1206    pcmpgtb              m0, m2
1207    pmovmskb            r5d, m0
1208    ret
1209.w4_no_upsample:
1210    mov            maxbased, 7
1211    test             angled, 0x400 ; !enable_intra_edge_filter
1212    jnz .w4_main
1213    lea            maxbased, [hq+3]
1214    call .filter_strength
1215    mov            maxbased, 7
1216    test                r5d, r5d
1217    jz .w4_main ; filter_strength == 0
1218    popcnt              r5d, r5d
1219    vpbroadcastw        xm3, [tlq+14]
1220    mova                xm0, [tlq- 2]      ; 0 1 2 3 4 5 6 7
1221    vpbroadcastd        xm1, [base+z_filter_k-4+r5*4+12*1]
1222    vpbroadcastd        xm4, [base+z_filter_k-4+r5*4+12*0]
1223    palignr             xm2, xm3, xm0, 4   ; 2 3 4 5 6 7 8 8
1224    pmullw              xm1, [tlq+ 0]      ; 1 2 3 4 5 6 7 8
1225    paddw               xm2, xm0
1226    pmullw              xm2, xm4
1227    movd           [rsp+16], xm3
1228    cmp                 r5d, 3
1229    jne .w4_3tap
1230    paddw               xm1, xm2
1231    palignr             xm2, xm3, xm0, 6   ; 3 4 5 6 7 8 8 8
1232    pblendw             xm0, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6
1233    movzx               r3d, word [tlq+14]
1234    movzx               r2d, word [tlq+12]
1235    inc            maxbased
1236    paddw               xm2, xm0
1237    sub                 r2d, r3d
1238    paddw               xm2, xm2
1239    lea                 r2d, [r2+r3*8+4]
1240    shr                 r2d, 3 ; (1 * top[6] + 7 * top[7] + 4) >> 3
1241    mov            [rsp+16], r2w
1242.w4_3tap:
1243    pxor                xm0, xm0
1244    paddw               xm1, xm2
1245    mov                 tlq, rsp
1246    psrlw               xm1, 3
1247    cmp                  hd, 8
1248    sbb            maxbased, -1
1249    pavgw               xm0, xm1
1250    mova              [tlq], xm0
1251.w4_main:
1252    movd                xm3, dxd
1253    vpbroadcastq         m1, [z_base_inc]
1254    vpbroadcastw         m6, [tlq+maxbaseq*2] ; top[max_base_x]
1255    shl            maxbased, 6
1256    vpbroadcastw         m3, xm3
1257    movd                xm0, maxbased
1258    mov                 r3d, dxd      ; xpos
1259    vpbroadcastw         m0, xm0
1260    paddw                m4, m3, m3
1261    psubw                m1, m0       ; -max_base_x
1262    vpblendd             m3, m4, 0xcc
1263    paddw                m0, m4, m3
1264    vpblendd             m3, m0, 0xf0 ; xpos0 xpos1 xpos2 xpos3
1265    paddw                m4, m4
1266    paddw                m3, m1
1267.w4_loop:
1268    lea                 r5d, [r3+dxq]
1269    shr                 r3d, 6 ; base0
1270    movu                xm1, [tlq+r3*2]
1271    lea                 r3d, [r5+dxq]
1272    shr                 r5d, 6 ; base1
1273    movu                xm2, [tlq+r5*2]
1274    lea                 r5d, [r3+dxq]
1275    shr                 r3d, 6 ; base2
1276    vinserti128          m1, [tlq+r3*2], 1 ; 0 2
1277    lea                 r3d, [r5+dxq]
1278    shr                 r5d, 6 ; base3
1279    vinserti128          m2, [tlq+r5*2], 1 ; 1 3
1280    punpcklqdq           m0, m1, m2
1281    psrldq               m1, 2
1282    pslldq               m2, 6
1283    vpblendd             m1, m2, 0xcc
1284    pand                 m2, m5, m3
1285    psllw                m2, 9
1286    psubw                m1, m0
1287    pmulhrsw             m1, m2
1288    psraw                m2, m3, 15 ; xpos < max_base_x
1289    paddw                m3, m4
1290    paddw                m0, m1
1291    vpblendvb            m0, m6, m0, m2
1292    vextracti128        xm1, m0, 1
1293    movq   [dstq+strideq*0], xm0
1294    movhps [dstq+strideq*1], xm0
1295    lea                dstq, [dstq+strideq*2]
1296    movq   [dstq+strideq*0], xm1
1297    movhps [dstq+strideq*1], xm1
1298    sub                  hd, 4
1299    jz .w4_end
1300    lea                dstq, [dstq+strideq*2]
1301    cmp                 r3d, maxbased
1302    jb .w4_loop
1303    lea                  r6, [strideq*3]
1304.w4_end_loop:
1305    movq   [dstq+strideq*0], xm6
1306    movq   [dstq+strideq*1], xm6
1307    movq   [dstq+strideq*2], xm6
1308    movq   [dstq+r6       ], xm6
1309    lea                dstq, [dstq+strideq*4]
1310    sub                  hd, 4
1311    jg .w4_end_loop
1312.w4_end:
1313    RET
1314.w8:
1315    %assign stack_offset org_stack_offset
1316    ALLOC_STACK         -64, 7
1317    lea                 r3d, [angleq+216]
1318    mov                 r3b, hb
1319    cmp                 r3d, 8
1320    ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
1321    movu                 m2, [tlq+2]    ; 2 3 4 5 6 7 8 9   a b c d e f g _
1322    movu                 m0, [tlq+4]    ; 3 4 5 6 7 8 9 a   b c d e f g _ _
1323    movu                 m1, [tlq+0]    ; 1 2 3 4 5 6 7 8   9 a b c d e f g
1324    cmp                  hd, 4
1325    jne .w8_upsample_h8 ; awkward single-pixel edge case
1326    vpblendd             m0, m2, 0x20   ; 3 4 5 6 7 8 9 a   b c c _ _ _ _ _
1327.w8_upsample_h8:
1328    paddw                m2, m1
1329    paddw                m0, [tlq-2]    ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
1330    add                 dxd, dxd
1331    psubw                m0, m2, m0
1332    psraw                m0, 3
1333    pxor                 m4, m4
1334    paddw                m2, m0
1335    vpbroadcastw         m0, r8m
1336    movd                xm3, dxd
1337    pmaxsw               m2, m4
1338    mov                 r3d, dxd
1339    pavgw                m2, m4
1340    vpbroadcastw         m3, xm3
1341    pminsw               m2, m0
1342    punpcklwd            m0, m1, m2
1343    punpckhwd            m1, m2
1344    vbroadcasti128       m4, [z_upsample]
1345    mova           [rsp+ 0], xm0
1346    mova           [rsp+16], xm1
1347    paddw                m6, m3, m3
1348    vextracti128   [rsp+32], m0, 1
1349    vextracti128   [rsp+48], m1, 1
1350    vpblendd             m3, m6, 0xf0 ; xpos0 xpos1
1351.w8_upsample_loop:
1352    lea                 r2d, [r3+dxq]
1353    shr                 r3d, 6 ; base0
1354    movu                xm1, [rsp+r3*2]
1355    movu                xm2, [rsp+r3*2+16]
1356    lea                 r3d, [r2+dxq]
1357    shr                 r2d, 6 ; base1
1358    vinserti128          m1, [rsp+r2*2], 1
1359    vinserti128          m2, [rsp+r2*2+16], 1
1360    pshufb               m1, m4
1361    pshufb               m2, m4
1362    punpcklqdq           m0, m1, m2
1363    punpckhqdq           m1, m2
1364    pand                 m2, m5, m3
1365    psllw                m2, 9
1366    psubw                m1, m0
1367    pmulhrsw             m1, m2
1368    paddw                m3, m6
1369    paddw                m0, m1
1370    mova         [dstq+strideq*0], xm0
1371    vextracti128 [dstq+strideq*1], m0, 1
1372    lea                dstq, [dstq+strideq*2]
1373    sub                  hd, 2
1374    jg .w8_upsample_loop
1375    RET
1376.w8_no_intra_edge_filter:
1377    and            maxbased, 7
1378    or             maxbased, 8 ; imin(h+7, 15)
1379    jmp .w8_main
1380.w8_no_upsample:
1381    lea            maxbased, [hq+7]
1382    test             angled, 0x400
1383    jnz .w8_no_intra_edge_filter
1384    call .filter_strength
1385    test                r5d, r5d
1386    jz .w8_main
1387    popcnt              r5d, r5d
1388    vpbroadcastd         m1, [base+z_filter_k-4+r5*4+12*1]
1389    vpbroadcastd         m4, [base+z_filter_k-4+r5*4+12*0]
1390    mova                 m0, [tlq-2]           ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
1391    movu                 m2, [tlq+0]           ; 1 2 3 4 5 6 7 8   9 a b c d e f g
1392    pmullw               m1, m2
1393    cmp                  hd, 8
1394    jl .w8_filter_h4
1395    punpckhwd            m2, m2
1396    vpblendd             m3, m2, [tlq+2], 0x7f ; 2 3 4 5 6 7 8 9   a b c d e f g g
1397    je .w8_filter_end ; 8x4 and 8x8 are always 3-tap
1398    movzx               r3d, word [tlq+30]
1399    mov            maxbased, 16
1400    mov            [rsp+32], r3d
1401    cmp                 r5d, 3
1402    jne .w8_filter_end
1403    punpcklwd           xm6, xm0, xm0
1404    vpblendd             m2, [tlq+4], 0x7f     ; 3 4 5 6 7 8 9 a   b c d e f g g g
1405    vpblendd             m6, [tlq-4], 0xfe     ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
1406    movzx               r5d, word [tlq+28]
1407    mov            [rsp+34], r3w
1408    paddw                m2, m6
1409    sub                 r5d, r3d
1410    inc            maxbased
1411    paddw                m2, m2
1412    lea                 r3d, [r5+r3*8+4]
1413    paddw                m1, m2
1414    shr                 r3d, 3
1415    mov            [rsp+32], r3w
1416    jmp .w8_filter_end
1417.w8_filter_h4:
1418    pshuflw              m3, m2, q3321
1419    vinserti128          m3, [tlq+2], 0        ; 2 3 4 5 6 7 8 9   a b c c _ _ _ _
1420.w8_filter_end:
1421    paddw                m0, m3
1422    pmullw               m0, m4
1423    mov                 tlq, rsp
1424    pxor                 m2, m2
1425    paddw                m0, m1
1426    psrlw                m0, 3
1427    pavgw                m0, m2
1428    mova              [tlq], m0
1429.w8_main:
1430    movd                xm3, dxd
1431    vbroadcasti128       m1, [z_base_inc]
1432    vpbroadcastw         m6, [tlq+maxbaseq*2]
1433    shl            maxbased, 6
1434    vpbroadcastw         m3, xm3
1435    movd                xm0, maxbased
1436    mov                 r3d, dxd
1437    vpbroadcastw         m0, xm0
1438    paddw                m4, m3, m3
1439    psubw                m1, m0
1440    vpblendd             m3, m4, 0xf0 ; xpos0 xpos1
1441    paddw                m3, m1
1442.w8_loop:
1443    lea                 r5d, [r3+dxq]
1444    shr                 r3d, 6
1445    movu                xm0, [tlq+r3*2]
1446    movu                xm1, [tlq+r3*2+2]
1447    lea                 r3d, [r5+dxq]
1448    shr                 r5d, 6
1449    vinserti128          m0, [tlq+r5*2], 1
1450    vinserti128          m1, [tlq+r5*2+2], 1
1451    pand                 m2, m5, m3
1452    psllw                m2, 9
1453    psubw                m1, m0
1454    pmulhrsw             m1, m2
1455    psraw                m2, m3, 15
1456    paddw                m3, m4
1457    paddw                m0, m1
1458    vpblendvb            m0, m6, m0, m2
1459    mova         [dstq+strideq*0], xm0
1460    vextracti128 [dstq+strideq*1], m0, 1
1461    sub                  hd, 2
1462    jz .w8_end
1463    lea                dstq, [dstq+strideq*2]
1464    cmp                 r3d, maxbased
1465    jb .w8_loop
1466.w8_end_loop:
1467    mova   [dstq+strideq*0], xm6
1468    mova   [dstq+strideq*1], xm6
1469    lea                dstq, [dstq+strideq*2]
1470    sub                  hd, 2
1471    jg .w8_end_loop
1472.w8_end:
1473    RET
1474.w16_no_intra_edge_filter:
1475    and            maxbased, 15
1476    or             maxbased, 16 ; imin(h+15, 31)
1477    jmp .w16_main
1478.w16:
1479    %assign stack_offset org_stack_offset
1480    ALLOC_STACK         -96, 7
1481    lea            maxbased, [hq+15]
1482    test             angled, 0x400
1483    jnz .w16_no_intra_edge_filter
1484    call .filter_strength
1485    test                r5d, r5d
1486    jz .w16_main
1487    popcnt              r5d, r5d
1488    mova                 m0, [tlq-2]            ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
1489    paddw                m1, m0, [tlq+2]        ; 2 3 4 5 6 7 8 9   a b c d e f g h
1490    cmp                 r5d, 3
1491    jne .w16_filter_3tap
1492    vpbroadcastd         m2, [base+pw_3]
1493    punpcklwd           xm0, xm0
1494    vpblendd             m0, [tlq-4], 0xfe      ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
1495    paddw                m1, [tlq+0]            ; 1 2 3 4 5 6 7 8   9 a b c d e f g
1496    paddw                m0, m2
1497    pavgw                m0, [tlq+4]            ; 3 4 5 6 7 8 9 a   b c d e f g h i
1498    paddw                m0, m1
1499    psrlw                m0, 2
1500    movu                 m3, [tlq+32]           ; 2 3 4 5 6 7 8 9   a b c d e f g h
1501    paddw                m2, [tlq+28]           ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
1502    paddw                m1, m3, [tlq+30]       ; 1 2 3 4 5 6 7 8   9 a b c d e f g
1503    cmp                  hd, 8
1504    jl .w16_filter_5tap_h4
1505    punpckhwd            m3, m3
1506    je .w16_filter_5tap_h8
1507    vpblendd             m4, m3, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b   c d e f g h h h
1508    vpblendd             m3, [tlq+34], 0x7f     ; 3 4 5 6 7 8 9 a   b c d e f g h h
1509    movzx               r3d, word [tlq+62]
1510    movzx               r2d, word [tlq+60]
1511    pavgw                m2, m4
1512    sub                 r2d, r3d
1513    paddw                m1, m3
1514    lea                 r2d, [r2+r3*8+4]
1515    paddw                m1, m2
1516    shr                 r2d, 3
1517    psrlw                m1, 2
1518    mov            [rsp+66], r3w
1519    mov            [rsp+64], r2w
1520    mov                 tlq, rsp
1521    mov                 r3d, 33
1522    cmp                  hd, 16
1523    cmovg          maxbased, r3d
1524    jmp .w16_filter_end2
1525.w16_filter_5tap_h8:
1526    vpblendd            xm4, xm3, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9
1527    vpblendd            xm3, [tlq+34], 0x07      ; 3 4 5 6 7 8 9 9
1528    pavgw               xm2, xm4
1529    paddw               xm1, xm3
1530    paddw               xm1, xm2
1531    psrlw               xm1, 2
1532    jmp .w16_filter_end2
1533.w16_filter_5tap_h4:
1534    pshuflw             xm4, xm3, q3332          ; 4 5 5 5
1535    pshuflw             xm3, xm3, q3321          ; 3 4 5 5
1536    pavgw               xm2, xm4
1537    paddw               xm1, xm3
1538    paddw               xm1, xm2
1539    psrlw               xm1, 2
1540    jmp .w16_filter_end2
1541.w16_filter_3tap:
1542    vpbroadcastd         m3, [base+z_filter_k-4+r5*4+12*1]
1543    vpbroadcastd         m4, [base+z_filter_k-4+r5*4+12*0]
1544    pmullw               m0, m3, [tlq+0]    ; 1 2 3 4 5 6 7 8   9 a b c d e f g
1545    movu                 m2, [tlq+32]       ; 1 2 3 4 5 6 7 8   9 a b c d e f g
1546    pmullw               m1, m4
1547    pmullw               m3, m2
1548    paddw                m0, m1
1549    cmp                  hd, 8
1550    je .w16_filter_3tap_h8
1551    jl .w16_filter_3tap_h4
1552    punpckhwd            m2, m2
1553    vpblendd             m2, [tlq+34], 0x7f ; 2 3 4 5 6 7 8 9   a b c d e f g g
1554    jmp .w16_filter_end
1555.w16_filter_3tap_h4:
1556    pshuflw             xm2, xm2, q3321     ; 2 3 4 4 _ _ _ _
1557    jmp .w16_filter_end
1558.w16_filter_3tap_h8:
1559    psrldq              xm2, 2
1560    pshufhw             xm2, xm2, q2210     ; 2 3 4 5 6 7 8 8
1561.w16_filter_end:
1562    paddw                m2, [tlq+30]       ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
1563    pmullw               m2, m4
1564    psrlw                m0, 3
1565    pxor                 m1, m1
1566    paddw                m2, m3
1567    psrlw                m2, 3
1568    pavgw                m0, m1
1569    pavgw                m1, m2
1570.w16_filter_end2:
1571    mov                 tlq, rsp
1572    mova           [tlq+ 0], m0
1573    mova           [tlq+32], m1
1574.w16_main:
1575    movd                xm4, dxd
1576    vpbroadcastw         m6, [tlq+maxbaseq*2]
1577    shl            maxbased, 6
1578    vpbroadcastw         m4, xm4
1579    movd                xm0, maxbased
1580    mov                 r3d, dxd
1581    vpbroadcastw         m0, xm0
1582    paddw                m3, m4, [z_base_inc]
1583    psubw                m3, m0
1584.w16_loop:
1585    lea                 r5d, [r3+dxq]
1586    shr                 r3d, 6
1587    movu                 m0, [tlq+r3*2]
1588    movu                 m1, [tlq+r3*2+2]
1589    lea                 r3d, [r5+dxq]
1590    shr                 r5d, 6
1591    pand                 m2, m5, m3
1592    psllw                m2, 9
1593    psubw                m1, m0
1594    pmulhrsw             m1, m2
1595    psraw                m2, m3, 15
1596    paddw                m3, m4
1597    paddw                m1, m0
1598    movu                 m0, [tlq+r5*2]
1599    vpblendvb            m2, m6, m1, m2
1600    movu                 m1, [tlq+r5*2+2]
1601    mova   [dstq+strideq*0], m2
1602    pand                 m2, m5, m3
1603    psllw                m2, 9
1604    psubw                m1, m0
1605    pmulhrsw             m1, m2
1606    psraw                m2, m3, 15
1607    paddw                m3, m4
1608    paddw                m0, m1
1609    vpblendvb            m0, m6, m0, m2
1610    mova   [dstq+strideq*1], m0
1611    sub                  hd, 2
1612    jz .w16_end
1613    lea                dstq, [dstq+strideq*2]
1614    cmp                 r3d, maxbased
1615    jb .w16_loop
1616.w16_end_loop:
1617    mova   [dstq+strideq*0], m6
1618    mova   [dstq+strideq*1], m6
1619    lea                dstq, [dstq+strideq*2]
1620    sub                  hd, 2
1621    jg .w16_end_loop
1622.w16_end:
1623    RET
.w32: ; z1 prediction, width 32
    %assign stack_offset org_stack_offset
    ALLOC_STACK        -160, 8           ; stack buffer holds the filtered top edge
    lea            maxbased, [hq+31]     ; max_base_x = h + 31
    mov                 r3d, 63
    cmp                  hd, 32
    cmova          maxbased, r3d         ; cap at 63 when h > 32
    test             angled, 0x400
    jnz .w32_main ; !enable_intra_edge_filter
    ; w >= 32 always filters the edge with strength 3:
    ; out[i] = (in[i-2] + in[i-1]*2 + in[i]*2 + in[i+1]*2 + in[i+2] + rnd) >> 3,
    ; computed as paddw chains + pavgw with a pw_3 rounding bias.
    vpbroadcastd         m2, [pw_3]
    mova                 m0, [tlq-2]       ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
    punpcklwd           xm1, xm0, xm0
    vpblendd             m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
    paddw                m0, [tlq+0]       ; 1 2 3 4 5 6 7 8   9 a b c d e f g
    paddw                m1, m2
    paddw                m0, [tlq+2]       ; 2 3 4 5 6 7 8 9   a b c d e f g h
    pavgw                m1, [tlq+4]       ; 3 4 5 6 7 8 9 a   b c d e f g h i
    mov                  r3, rsp
    paddw                m0, m1
    lea                 r5d, [maxbaseq-31] ; edge pixels left to filter
    psrlw                m0, 2
    mova               [r3], m0
.w32_filter_loop: ; filter the remaining edge, 16 pixels per iteration
    mova                 m0, [tlq+30]
    paddw                m1, m2, [tlq+28]
    add                 tlq, 32
    paddw                m0, [tlq+0]
    pavgw                m1, [tlq+4]
    paddw                m0, [tlq+2]
    add                  r3, 32
    paddw                m0, m1
    psrlw                m0, 2
    mova               [r3], m0
    sub                 r5d, 16
    jg .w32_filter_loop
    movu                 m0, [tlq+32]           ; 2 3 4 5 6 7 8 9   a b c d e f g h
    punpckhwd            m1, m0, m0
    paddw                m2, [tlq+28]           ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
    paddw                m0, [tlq+30]           ; 1 2 3 4 5 6 7 8   9 a b c d e f g
    jl .w32_filter_h8 ; fewer than 16 pixels in the final chunk: 8-pixel tail
    vpblendd             m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b   c d e f g h h h
    vpblendd             m1, [tlq+34], 0x7f     ; 3 4 5 6 7 8 9 a   b c d e f g h h
    movzx               r5d, word [tlq+62]      ; last edge pixel
    movzx               r2d, word [tlq+60]      ; second-to-last edge pixel
    pavgw                m2, m3
    sub                 r2d, r5d
    paddw                m0, m1
    lea                 r2d, [r2+r5*8+4]        ; (px[n-1] + 7*px[n] + 4) >> 3 for the edge end
    paddw                m0, m2
    shr                 r2d, 3
    psrlw                m0, 2
    mova            [r3+32], m0
    mov             [r3+66], r5w
    mov             [r3+64], r2w
    mov                 tlq, rsp                ; predict from the filtered copy
    mov                 r3d, 65
    cmp                  hd, 64
    cmove          maxbased, r3d
    jmp .w32_main
.w32_filter_h8: ; final chunk is only 8 valid pixels wide
    vpblendd            xm3, xm1, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9
    vpblendd            xm1, [tlq+34], 0x07      ; 3 4 5 6 7 8 9 9
    pavgw               xm2, xm3
    paddw               xm0, xm1
    mov                 tlq, rsp
    paddw               xm0, xm2
    psrlw               xm0, 2
    mova            [r3+32], xm0
.w32_main:
    movd                xm4, dxd
    vpbroadcastw         m6, [tlq+maxbaseq*2] ; pixel at max_base_x, replicated past the edge
    shl            maxbased, 6                ; positions are in 1/64 units
    vpbroadcastw         m4, xm4
    movd                xm0, maxbased
    mov                 r5d, dxd              ; xpos accumulator
    vpbroadcastd         m7, [pw_m1024] ; -16 * 64
    vpbroadcastw         m0, xm0
    paddw                m3, m4, [z_base_inc]
    psubw                m3, m0               ; per-lane xpos - max_base_x
.w32_loop:
    mov                 r3d, r5d
    shr                 r3d, 6                ; base_x = xpos >> 6
    movu                 m0, [tlq+r3*2]
    movu                 m1, [tlq+r3*2+2]
    pand                 m2, m5, m3            ; frac_x
    psllw                m2, 9
    psubw                m1, m0
    pmulhrsw             m1, m2
    paddw                m0, m1                ; interpolate pixels 0-15
    psraw                m1, m3, 15            ; base_x >= max_base_x for lanes 0-15
    vpblendvb            m0, m6, m0, m1
    mova        [dstq+32*0], m0
    movu                 m0, [tlq+r3*2+32]
    movu                 m1, [tlq+r3*2+34]
    add                 r5d, dxd               ; xpos += dx
    psubw                m1, m0
    pmulhrsw             m1, m2
    pcmpgtw              m2, m7, m3            ; edge test shifted by 16 pixels (-16*64)
    paddw                m3, m4
    paddw                m0, m1                ; interpolate pixels 16-31
    vpblendvb            m0, m6, m0, m2
    mova        [dstq+32*1], m0
    dec                  hd
    jz .w32_end
    add                dstq, strideq
    cmp                 r5d, maxbased
    jb .w32_loop
.w32_end_loop: ; all remaining rows are the replicated max-base pixel
    mova        [dstq+32*0], m6
    mova        [dstq+32*1], m6
    add                dstq, strideq
    dec                  hd
    jg .w32_end_loop
.w32_end:
    RET
.w64: ; z1 prediction, width 64
    %assign stack_offset org_stack_offset
    ALLOC_STACK        -256, 10          ; stack buffer holds the filtered top edge
    lea            maxbased, [hq+63]     ; max_base_x = h + 63
    test             angled, 0x400
    jnz .w64_main ; !enable_intra_edge_filter
    ; strength-3 edge filter, same scheme as .w32 (see there for the tap layout)
    vpbroadcastd         m2, [pw_3]
    mova                 m0, [tlq-2]       ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
    punpcklwd           xm1, xm0, xm0
    vpblendd             m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
    paddw                m0, [tlq+0]       ; 1 2 3 4 5 6 7 8   9 a b c d e f g
    paddw                m1, m2
    paddw                m0, [tlq+2]       ; 2 3 4 5 6 7 8 9   a b c d e f g h
    pavgw                m1, [tlq+4]       ; 3 4 5 6 7 8 9 a   b c d e f g h i
    mov                  r3, rsp
    paddw                m0, m1
    lea                 r5d, [hq+32]      ; edge pixels left to filter
    psrlw                m0, 2
    mova               [r3], m0
.w64_filter_loop: ; filter the remaining edge, 16 pixels per iteration
    mova                 m0, [tlq+30]
    paddw                m1, m2, [tlq+28]
    add                 tlq, 32
    paddw                m0, [tlq+0]
    pavgw                m1, [tlq+4]
    paddw                m0, [tlq+2]
    add                  r3, 32
    paddw                m0, m1
    psrlw                m0, 2
    mova               [r3], m0
    sub                 r5d, 16
    jg .w64_filter_loop
    ; final 16-pixel chunk with edge replication at the end
    movu                 m0, [tlq+32]           ; 2 3 4 5 6 7 8 9   a b c d e f g h
    punpckhwd            m1, m0, m0
    paddw                m2, [tlq+28]           ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
    paddw                m0, [tlq+30]           ; 1 2 3 4 5 6 7 8   9 a b c d e f g
    vpblendd             m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b   c d e f g h h h
    vpblendd             m1, [tlq+34], 0x7f     ; 3 4 5 6 7 8 9 a   b c d e f g h h
    pavgw                m2, m3
    paddw                m0, m1
    paddw                m0, m2
    mov                 tlq, rsp                ; predict from the filtered copy
    psrlw                m0, 2
    mova            [r3+32], m0
.w64_main:
    movd                xm4, dxd
    vpbroadcastw         m6, [tlq+maxbaseq*2] ; pixel at max_base_x, replicated past the edge
    shl            maxbased, 6                ; positions are in 1/64 units
    vpbroadcastw         m4, xm4
    movd                xm0, maxbased
    mov                 r5d, dxd              ; xpos accumulator
    vpbroadcastd         m7, [pw_m1024] ; -16 * 64
    vpbroadcastw         m0, xm0
    paddw                m3, m4, [z_base_inc]
    paddw                m8, m7, m7     ; -32 * 64
    psubw                m3, m0         ; per-lane xpos - max_base_x
    paddw                m9, m8, m7     ; -48 * 64
.w64_loop: ; one 64-pixel row per iteration, four 16-pixel quarters
    mov                 r3d, r5d
    shr                 r3d, 6          ; base_x = xpos >> 6
    movu                 m0, [tlq+r3*2]
    movu                 m1, [tlq+r3*2+2]
    pand                 m2, m5, m3     ; frac_x
    psllw                m2, 9
    psubw                m1, m0
    pmulhrsw             m1, m2
    paddw                m0, m1
    psraw                m1, m3, 15     ; edge test for pixels 0-15
    vpblendvb            m0, m6, m0, m1
    mova        [dstq+32*0], m0
    movu                 m0, [tlq+r3*2+32]
    movu                 m1, [tlq+r3*2+34]
    psubw                m1, m0
    pmulhrsw             m1, m2
    paddw                m0, m1
    pcmpgtw              m1, m7, m3     ; edge test shifted by 16 pixels
    vpblendvb            m0, m6, m0, m1
    mova        [dstq+32*1], m0
    movu                 m0, [tlq+r3*2+64]
    movu                 m1, [tlq+r3*2+66]
    psubw                m1, m0
    pmulhrsw             m1, m2
    paddw                m0, m1
    pcmpgtw              m1, m8, m3     ; edge test shifted by 32 pixels
    vpblendvb            m0, m6, m0, m1
    mova        [dstq+32*2], m0
    movu                 m0, [tlq+r3*2+96]
    movu                 m1, [tlq+r3*2+98]
    add                 r5d, dxd        ; xpos += dx
    psubw                m1, m0
    pmulhrsw             m1, m2
    pcmpgtw              m2, m9, m3     ; edge test shifted by 48 pixels
    paddw                m3, m4
    paddw                m0, m1
    vpblendvb            m0, m6, m0, m2
    mova        [dstq+32*3], m0
    dec                  hd
    jz .w64_end
    add                dstq, strideq
    cmp                 r5d, maxbased
    jb .w64_loop
.w64_end_loop: ; all remaining rows are the replicated max-base pixel
    mova        [dstq+32*0], m6
    mova        [dstq+32*1], m6
    mova        [dstq+32*2], m6
    mova        [dstq+32*3], m6
    add                dstq, strideq
    dec                  hd
    jg .w64_end_loop
.w64_end:
    RET
1850
; Directional intra prediction, zone 2 (90 < angle < 180): predicts from both
; the top and the left edge. Looks up per-angle dx/dy steps, copies the left
; edge (reversed layout, up to 128 pixels) plus the top row into a contiguous
; stack buffer around rsp+128, then dispatches on block width.
; NOTE(review): the function body continues beyond this chunk.
cglobal ipred_z2_16bpc, 3, 12, 12, 352, dst, stride, tl, w, h, angle, dx, dy
%define base r9-z_filter_t0
    lea                  r9, [ipred_z2_16bpc_avx2_table]
    tzcnt                wd, wm
    movifnidn        angled, anglem
    movifnidn            hd, hm
    lea                 dxq, [dr_intra_derivative-90]
    movsxd               wq, [r9+wq*4]
    mova                 m1, [tlq-  0]       ; left edge, rows 1-16
    movzx               dyd, angleb
    xor              angled, 0x400           ; flip the edge-filter flag for z2
    mova                 m2, [tlq- 32]       ; left edge, rows 17-32
    mov                  r8, dxq
    sub                 dxq, dyq
    mova                 m3, [tlq- 64]
    add                  wq, r9
    add                  r9, z_filter_t0-ipred_z2_16bpc_avx2_table
    mova                 m4, [tlq- 96]
    and                 dyd, ~1              ; derivative table entries are 2 bytes apart
    mova                 m5, [tlq-128]
    and                 dxq, ~1
    movzx               dyd, word [r8+dyq]  ; angle - 90
    movzx               dxd, word [dxq+270] ; 180 - angle
    vpbroadcastd        m11, [base+pw_62]   ; frac mask, live across the whole function
    mova          [rsp+128], m1             ; top-left sits at rsp+128; left edge below it
    mova          [rsp+ 96], m2
    mova          [rsp+ 64], m3
    neg                 dxd                 ; x steps leftward toward the top-left corner
    mova          [rsp+ 32], m4
    neg                 dyq
    mova          [rsp+  0], m5
    jmp                  wq
.w4: ; z2 prediction, width 4
    vbroadcasti128      m10, [base+z2_x_shuf]
    vpbroadcastq         m6, [base+z_base_inc+2]
    lea                 r8d, [dxq+(65<<6)] ; xpos
    mov                r10d, (63-4)<<6    ; limit for the left-only fallback
    test             angled, 0x400
    jnz .w4_main ; !enable_intra_edge_filter
    lea                 r3d, [hq+2]
    add              angled, 1022
    shl                 r3d, 6
    test                r3d, angled
    jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
    ; upsample the top edge by 2x: out = clip((9*(a+b) - (prev+next) + 8) >> 4)
    ; built from psraw/pavgw and clamped to [0, pixel_max]
    movq                xm0, [tlq+2]    ; 1 2 3 4
    movq                xm1, [tlq+0]    ; 0 1 2 3
    pshuflw             xm2, xm0, q3321 ; 2 3 4 4
    pshuflw             xm3, xm1, q2100 ; 0 0 1 2
    vpbroadcastw        xm4, r8m        ; pixel_max
    vbroadcasti128      m10, [base+z_upsample]
    paddw               xm1, xm0
    paddw               xm2, xm3
    lea                 r8d, [r8+dxq+(1<<6)]
    psubw               xm2, xm1, xm2
    add                 dxd, dxd        ; dx *= 2 to index the upsampled edge
    psraw               xm2, 3
    pxor                xm3, xm3
    sub                r10d, 3<<6
    paddw               xm1, xm2
    paddw                m6, m6
    pmaxsw              xm1, xm3
    sub              angled, 1075 ; angle - 53
    pavgw               xm1, xm3
    lea                 r3d, [hq+3]
    pminsw              xm1, xm4
    xor              angled, 0x7f ; 180 - angle
    punpcklwd           xm1, xm0 ; interleave upsampled and original pixels
    movu          [rsp+130], xm1
    call .filter_strength
    jmp .w4_filter_left
ALIGN function_align
.filter_strength: ; in: r3d = w+h-derived size, angled = adjusted angle
                  ; out: r3d = per-strength hit mask (popcnt gives strength 1-3),
                  ;      m7 = broadcast angle, m8 = size-match mask, m9 = thresholds
                  ; clobbers: m0
    movd                xm8, r3d
    mov                 r3d, angled
    movd                xm7, angled
    vpbroadcastb         m8, xm8
    shr                 r3d, 8 ; is_sm << 1, selects the threshold row
    vpbroadcastb         m7, xm7
    pcmpeqb              m8, [base+z_filter_wh]
    mova                xm9, [r9+r3*8]
    pand                 m0, m8, m7
    pcmpgtb              m0, m9 ; angle > threshold per strength level
    pmovmskb            r3d, m0
    ret
ALIGN function_align
.upsample_left: ; h4/h8 -- 2x upsample of the left edge (stored in reverse order),
                ; result written back interleaved at rsp+96/112 (+gprsize for the
                ; return address pushed by the call)
    mova                xm0, [tlq-16]            ; 8 7 6 5 4 3 2 1
    movu                xm1, [tlq-14]            ; 7 6 5 4 3 2 1 0
%if STACK_ALIGNMENT < 32
    vpbroadcastw        xm4, r8m ; pixel_max
%else
    vpbroadcastw        xm4, r9m ; r8m -> r9m due to call
%endif
    cmp                  hd, 8
    je .upsample_left_h8
    pshufhw             xm2, xm0, q2100          ; _ _ _ _ 4 4 3 2
    pshufhw             xm3, xm1, q3321          ; _ _ _ _ 2 1 0 0
    jmp .upsample_left_end
.upsample_left_h8:
    pblendw             xm2, xm0, [tlq-18], 0xfe ; 8 8 7 6 5 4 3 2
    pblendw             xm3, xm1, [tlq-12], 0x7f ; 6 5 4 3 2 1 0 0
.upsample_left_end:
    ; out = clip((9*(a+b) - (prev+next) + 8) >> 4), clamped to [0, pixel_max]
    paddw               xm1, xm0
    paddw               xm2, xm3
    psubw               xm2, xm1, xm2
    add                 dyq, dyq                 ; dy *= 2 to index the upsampled edge
    psraw               xm2, 3
    pxor                xm3, xm3
    paddw               xm1, xm2
    pmaxsw              xm1, xm3
    pavgw               xm1, xm3
    pminsw              xm1, xm4
    punpcklwd           xm2, xm0, xm1            ; interleave original and upsampled
    punpckhwd           xm0, xm1
    mova  [rsp+ 96+gprsize], xm2
    mova  [rsp+112+gprsize], xm0
    ret
.w4_no_upsample_above:
    lea                 r3d, [hq+3]
    sub              angled, 1112 ; angle - 90
    call .filter_strength
    test                r3d, r3d
    jz .w4_no_filter_above
    ; 3-tap/5-tap FIR on the 4 top pixels, kernel selected by filter strength
    popcnt              r3d, r3d
    vpbroadcastd        xm4, [base+z_filter_k-4+r3*4+12*1]
    vpbroadcastd        xm5, [base+z_filter_k-4+r3*4+12*0]
    psrldq              xm0, xm1, 2     ; 1 2 3 4
    pshuflw             xm2, xm1, q2100 ; 0 0 1 2
    pmullw              xm4, xm0
    pshuflw             xm3, xm0, q3321 ; 2 3 4 4
    paddw               xm1, xm3
    pshuflw             xm3, xm0, q3332 ; 3 4 4 4
    pmullw              xm1, xm5
    vpbroadcastd        xm5, [base+z_filter_k-4+r3*4+12*2]
    paddw               xm2, xm3
    vpbroadcastd        xm3, r6m ; max_width
    pmullw              xm2, xm5
    packssdw            xm3, xm3
    paddw               xm1, xm4
    paddw               xm1, xm2
    psubw               xm3, [base+pw_1to16] ; per-lane: max_width - position
    pxor                xm4, xm4
    psrlw               xm1, 3
    pminsw              xm3, xm11 ; clip to byte range since there's no variable word blend
    pavgw               xm1, xm4
    vpblendvb           xm1, xm0, xm3 ; keep unfiltered pixels beyond max_width
    movq          [rsp+130], xm1
.w4_no_filter_above:
    lea                 r3d, [hq+2]
    add              angled, 973 ; angle + 883
    shl                 r3d, 6
    test                r3d, angled
    jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
    vpbroadcastd        xm0, [base+pb_90]
    psubb               xm0, xm7 ; 180 - angle
    pand                xm0, xm8 ; reuse from previous filter_strength call
    pcmpgtb             xm0, xm9
    pmovmskb            r3d, xm0
.w4_filter_left:
    test                r3d, r3d
    jz .w4_main
    popcnt              r3d, r3d
    mova                 m0, [tlq-32]  ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
    vpbroadcastd         m5, r7m ; max_height
    cmp                 r3d, 3
    je .w4_filter_left_s3
    ; strength 1/2: 3-tap FIR with kernel from z_filter_k
    vpbroadcastd         m2, [base+z_filter_k-4+r3*4+12*1]
    vpbroadcastd         m3, [base+z_filter_k-4+r3*4+12*0]
    pmullw               m2, m0
    cmp                  hd, 8
    jl .w4_filter_left_h4
    movu                 m4, [tlq-34]
    punpcklwd            m1, m0, m0
    vpblendd             m1, m4, 0xee  ; 0 0 1 2 3 4 5 6   8 8 9 a b c d e
    je .w4_filter_left_end ; h == 8
    vpblendd             m1, m4, 0x10  ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
    jmp .w4_filter_left_end
.w4_upsample_left:
    call .upsample_left
    mov                 r11, -16       ; left pointer step: 2x because of upsampling
    vbroadcasti128       m9, [base+z_upsample]
    jmp .w4_main_upsample_left
.w4_filter_left_s3: ; can only be h16 -- 5-tap strength-3 filter
    movu                 m2, [tlq-30]           ; 1 2 3 4 5 6 7 8   9 a b c d e f g
    vpbroadcastd         m4, [base+pw_3]
    paddw                m1, m0, m2
    punpckhwd            m2, m2
    vpblendd             m2, [tlq-28], 0x7f     ; 2 3 4 5 6 7 8 9   a b c d e f g g
    punpcklwd           xm3, xm0, xm0
    paddw                m2, m4
    vpblendd             m4, m3, [tlq-34], 0xfe ; 0 0 1 2 3 4 5 6   8 8 9 a b c d e
    vpblendd             m3, [tlq-36], 0xfe     ; 0 0 0 1 2 3 4 5   6 8 8 9 a b c d
    paddw                m1, m4
    pavgw                m2, m3
    paddw                m1, m2
    psrlw                m1, 2
    jmp .w4_filter_left_end2
.w4_filter_left_h4:
    pshufhw              m1, m0, q2100 ; _ _ _ _ _ _ _ _   _ _ _ _ c c d e
.w4_filter_left_end:
    paddw                m1, [tlq-30]  ; 1 2 3 4 5 6 7 8   9 a b c d e f g
    pmullw               m1, m3
    paddw                m1, m2
    pxor                 m2, m2
    psrlw                m1, 3
    pavgw                m1, m2
.w4_filter_left_end2:
    ; keep unfiltered pixels beyond max_height (edge stored in reverse order)
    packssdw             m5, m5
    psubw                m5, [base+pw_16to1]
    pminsw               m5, m11
    vpblendvb            m1, m0, m5
    mova           [rsp+96], m1
.w4_main:
    vbroadcasti128       m9, [base+z2_x_shuf]
    mov                 r11, -8        ; left pointer step per 4 rows
.w4_main_upsample_left:
    movd                xm5, dyd
    mova                 m4, [base+z2_y_shuf_h4]
    mov                 r2d, r8d       ; xpos
    movd                xm0, dxd
    vpbroadcastw         m5, xm5
    rorx                 r5, dyq, 5    ; dy*2 in the high bits, combined below
    lea                 r8d, [dyq*3]
    pmullw               m5, [base+z2_ymul]
    rorx                 r9, dyq, 4    ; dy*4
    sar                 dyd, 6         ; base_y step per row
    vpbroadcastw         m0, xm0
    sar                 r8d, 6
    pand                 m5, m11       ; frac_y
    neg                 dyd
    psllw                m5, 9
    add                 r5d, dyd       ; per-row base_y offsets (rows 1-3)
    add                 r8d, dyd
    add                 r9d, dyd
    paddw                m7, m0, m0    ; 2*dx
    lea                 dyq, [rsp+dyq*2+126] ; pointer into the stacked left edge
    vpblendd             m0, m7, 0xcc
    add                 dyq, r11
    neg                 r5d
    paddw                m1, m0, m7
    neg                 r8d
    vpblendd             m0, m1, 0xf0  ; xpos0 xpos1 xpos2 xpos3
    neg                 r9d
    paddw                m7, m7        ; 4*dx = xpos step per iteration
    paddw                m6, m0
.w4_loop: ; 4 rows per iteration; top part first, left part blended in after
    lea                 r3d, [r2+dxq]
    shr                 r2d, 6         ; base_x0
    movu                xm1, [rsp+r2*2]
    lea                 r2d, [r3+dxq]
    shr                 r3d, 6         ; base_x1
    movu                xm3, [rsp+r3*2]
    lea                 r3d, [r2+dxq]
    shr                 r2d, 6         ; base_x2
    vinserti128          m1, [rsp+r2*2], 1
    lea                 r2d, [r3+dxq]
    shr                 r3d, 6         ; base_x3
    vinserti128          m3, [rsp+r3*2], 1
    pshufb               m1, m10 ; a0 a1 a2 a3 A0 A1 A2 A3
    pshufb               m3, m10 ; b0 b1 b2 b3 B0 B1 B2 B3
    pand                 m2, m11, m6   ; frac_x
    punpcklqdq           m0, m1, m3
    punpckhqdq           m1, m3
    psllw                m2, 9
    psubw                m1, m0
    pmulhrsw             m1, m2
    paddw                m0, m1        ; interpolated top prediction
    cmp                 r3d, 64
    jge .w4_toponly ; whole block comes from the top edge
    ; gather the four left-edge rows and interpolate vertically
    movu                xm2, [dyq]
    vinserti128          m2, [dyq+r8*2], 1
    movu                xm3, [dyq+r5*2]
    vinserti128          m3, [dyq+r9*2], 1
    pshufb               m2, m9
    pshufb               m3, m9
    punpckhwd            m1, m2, m3 ; a3 b3 a2 b2 a1 b1 a0 b0
    punpcklwd            m2, m3
    psubw                m2, m1
    pmulhrsw             m2, m5
    psraw                m3, m6, 15 ; base_x < topleft
    paddw                m1, m2
    vpermd               m1, m4, m1 ; a0 b0 c0 d0 a1 b1 c1 d1   a2 b2 c2 d2 a3 b3 c3 d3
    vpblendvb            m0, m1, m3 ; left prediction where x went past the corner
.w4_toponly:
    paddw                m6, m7     ; xpos += dx
    lea                  r3, [strideq*3]
    add                 dyq, r11    ; advance down the left edge
    vextracti128        xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq   [dstq+strideq*2], xm1
    movhps [dstq+r3       ], xm1
    sub                  hd, 4
    jz .w4_end
    lea                dstq, [dstq+strideq*4]
    cmp                 r2d, r10d   ; any top-edge pixels left in range?
    jge .w4_loop
.w4_leftonly_loop: ; everything now comes from the left edge
    movu                xm1, [dyq]
    vinserti128          m1, [dyq+r8*2], 1
    movu                xm2, [dyq+r5*2]
    vinserti128          m2, [dyq+r9*2], 1
    add                 dyq, r11
    pshufb               m1, m9
    pshufb               m2, m9
    punpckhwd            m0, m1, m2
    punpcklwd            m1, m2
    psubw                m1, m0
    pmulhrsw             m1, m5
    paddw                m0, m1
    vpermd               m0, m4, m0
    vextracti128        xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq   [dstq+strideq*2], xm1
    movhps [dstq+r3       ], xm1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4_leftonly_loop
.w4_end:
    RET
.w8: ; z2 prediction, width 8 (also the main loop for w16/w32/w64 via .w8_main)
    mov                r10d, hd
    test             angled, 0x400
    jnz .w8_main ; !enable_intra_edge_filter
    lea                 r3d, [angleq+126]
    xor                 r8d, r8d
    mov                 r3b, hb       ; pack h into the low byte for a combined test
    cmp                 r3d, 8
    ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
    ; 2x upsample of the top edge (same formula as .w4; see there)
    movu                xm0, [tlq+2]            ; 1 2 3 4 5 6 7 8
    mova                xm1, [tlq+0]            ; 0 1 2 3 4 5 6 7
    pblendw             xm2, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 8
    pblendw             xm3, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6
    vpbroadcastw        xm4, r8m ; pixel_max
    paddw               xm1, xm0
    paddw               xm2, xm3
    not                 r8d     ; flag: top edge was upsampled
    psubw               xm2, xm1, xm2
    add                 dxd, dxd
    psraw               xm2, 3
    sub              angled, 53 ; angle - 53
    pxor                xm3, xm3
    paddw               xm2, xm1
    lea                 r3d, [hq+7]
    pmaxsw              xm2, xm3
    xor              angled, 0x7f ; 180 - angle
    pavgw               xm2, xm3
    pminsw              xm2, xm4
    punpcklwd           xm1, xm2, xm0
    punpckhwd           xm2, xm0
    movu          [rsp+130], xm1
    movu          [rsp+146], xm2
    call .filter_strength
    jmp .w8_filter_left
.w8_no_upsample_above:
    lea                 r3d, [hq+7]
    sub              angled, 90 ; angle - 90
    call .filter_strength
    test                r3d, r3d
    jz .w8_no_filter_above
    ; FIR filter on the 8 top pixels, kernel selected by filter strength
    popcnt              r3d, r3d
    vpbroadcastd        xm4, [base+z_filter_k-4+r3*4+12*1]
    vpbroadcastd        xm5, [base+z_filter_k-4+r3*4+12*0]
    vpbroadcastd        xm6, [base+z_filter_k-4+r3*4+12*2]
    movu                xm0, [tlq+2]            ; 1 2 3 4 5 6 7 8 x
    pblendw             xm2, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 x
    pmullw              xm4, xm0
    pblendw             xm3, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 8 x
    paddw               xm1, xm3
    vpblendd            xm3, [tlq+6], 0x07      ; 3 4 5 6 7 8 8 8 x
    paddw               xm2, xm3
    vpbroadcastd        xm3, r6m ; max_width
    pmullw              xm1, xm5
    pmullw              xm2, xm6
    packssdw            xm3, xm3
    paddw               xm1, xm4
    paddw               xm1, xm2
    psubw               xm3, [base+pw_1to16] ; per-lane: max_width - position
    pxor                xm4, xm4
    psrlw               xm1, 3
    pminsw              xm3, xm11
    pavgw               xm1, xm4
    vpblendvb           xm1, xm0, xm3 ; keep unfiltered pixels beyond max_width
    movu          [rsp+130], xm1
.w8_no_filter_above:
    lea                 r3d, [angleq-51]
    mov                 r3b, hb
    cmp                 r3d, 8
    jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm
    vpbroadcastd         m0, [base+pb_90]
    psubb                m0, m7 ; 180 - angle
    pand                 m0, m8 ; reuse masks from the filter_strength call
    pcmpgtb              m0, m9
    pmovmskb            r3d, m0
.w8_filter_left:
    test                r3d, r3d
    jz .w8_main
    popcnt              r3d, r3d
    cmp                 r3d, 3
    jne .w8_filter_left_s12
    vpbroadcastd         m6, [base+pw_3]
    vpbroadcastd         m7, [base+pw_16]
    cmp                  hd, 16 ; flags needed for later
    jmp .filter_left_s3b
.w8_upsample_left:
    call .upsample_left
    vbroadcasti128       m7, [base+z2_y_shuf_us]
    lea                 r11, [rsp+118]
    mov                  r8, -8
    jmp .w8_main_upsample_left
.w16_filter_left_s12: ; shared entry from the w16 path (r8d cleared: no upsampling)
    xor                 r8d, r8d
.w8_filter_left_s12: ; strength 1/2 FIR on the left edge (stored in reverse order)
    mova                 m0, [tlq-32]  ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
    vpbroadcastd         m5, r7m ; max_height
    vpbroadcastd         m2, [base+z_filter_k-4+r3*4+12*1]
    vpbroadcastd         m3, [base+z_filter_k-4+r3*4+12*0]
    pmullw               m2, m0
    cmp                  hd, 8
    jl .w8_filter_left_h4
    movu                 m4, [tlq-34]
    punpcklwd            m1, m0, m0
    vpblendd             m1, m4, 0xee  ; 0 0 1 2 3 4 5 6   8 8 9 a b c d e
    je .w8_filter_left_end ; h == 8
    vpblendd             m1, m4, 0x10  ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
    jmp .w8_filter_left_end
.w8_filter_left_h4:
    pshufhw              m1, m0, q2100 ; _ _ _ _ _ _ _ _   _ _ _ _ c c d e
.w8_filter_left_end:
    paddw                m1, [tlq-30]  ; 1 2 3 4 5 6 7 8   9 a b c d e f g
    pmullw               m1, m3
    paddw                m1, m2
    pxor                 m2, m2
    psrlw                m1, 3
    pavgw                m1, m2
    ; keep unfiltered pixels beyond max_height
    packssdw             m5, m5
    psubw                m5, [base+pw_16to1]
    pminsw               m5, m11
    vpblendvb            m1, m0, m5
    mova           [rsp+96], m1
    test                r8d, r8d ; top edge upsampled?
    jz .w8_main
; upsample_main -- like .w8_main but reads the 2x-upsampled top edge
    vbroadcasti128      m10, [base+z_upsample]
    vbroadcasti128       m7, [base+z2_y_shuf]
    lea                  r5, [rsp+120]
    movd                xm1, dyd
    vbroadcasti128       m4, [base+z_base_inc+2]
    movd                xm2, dxd
    vpbroadcastw         m1, xm1
    vpbroadcastw         m2, xm2
    mov                  r7, dstq
    paddw                m4, m4        ; base increments doubled for upsampled edge
    pmullw               m0, m1, [base+z2_ymul8]
    paddw                m5, m2, m2    ; 2*dx = xpos step
    psllw               xm1, 3
    vpblendd             m2, m5, 0xf0
    lea                 r2d, [dxq+(66<<6)] ; xpos
    paddw                m4, m2
    pshufd               m6, m0, q2020
    psraw               xm0, 6
    pxor                xm1, xm1
    psubw               xm8, xm1, xm0  ; base_y (negated: edge is stored in reverse)
    pand                 m6, m11       ; frac_y
    punpckhwd           xm9, xm8, xm1
    psllw                m6, 9
    punpcklwd           xm8, xm1
.w8_upsample_above_loop: ; 2 rows per iteration
    lea                 r3d, [r2+dxq]
    shr                 r2d, 6
    movu                xm1, [rsp+r2*2]
    movu                xm2, [rsp+r2*2+16]
    lea                 r2d, [r3+dxq]
    shr                 r3d, 6
    vinserti128          m1, [rsp+r3*2], 1
    vinserti128          m2, [rsp+r3*2+16], 1
    pshufb               m1, m10
    pshufb               m2, m10
    punpcklqdq           m0, m1, m2   ; a0 b0 c0 d0 e0 f0 g0 h0
    punpckhqdq           m1, m2
    pand                 m2, m11, m4  ; frac_x
    psubw                m1, m0
    psllw                m2, 9
    pmulhrsw             m1, m2
    paddw                m0, m1       ; interpolated top prediction
    cmp                 r3d, 64
    jge .w8_upsample_above_toponly
    ; vpgatherdq clobbers its mask register: save/restore m5 around each gather
    mova                 m1, m5
    vpgatherdq           m3, [r5+xm9*2], m5
    mova                 m5, m1
    vpgatherdq           m2, [r5+xm8*2], m1
    pshufb               m3, m7
    pshufb               m2, m7
    punpckldq            m1, m2, m3
    punpckhdq            m2, m3
    psubw                m2, m1
    pmulhrsw             m2, m6
    paddw                m1, m2
    vpermq               m1, m1, q3120
    psraw                m2, m4, 15   ; base_x < topleft
    vpblendvb            m0, m1, m2
.w8_upsample_above_toponly:
    paddw                m4, m5       ; xpos += dx
    sub                  r5, 4
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    sub                  hd, 2
    jz .w8_ret
    lea                dstq, [dstq+strideq*2]
    jmp .w8_upsample_above_loop
.w8_main: ; main z2 loop: 8 columns per pass, outer loop over 8-column strips
    vbroadcasti128       m7, [base+z2_y_shuf]
    lea                 r11, [rsp+120]
    mov                  r8, -4        ; left pointer step per 2 rows
.w8_main_upsample_left:
    movd                xm1, dyd
    vbroadcasti128       m4, [base+z_base_inc+2]
    movd                xm2, dxd
    vpbroadcastw         m1, xm1
    vpbroadcastw         m2, xm2
    mov                  r7, dstq
    pmullw               m0, m1, [base+z2_ymul8]
    paddw                m5, m2, m2
    psllw               xm1, 3
    vpblendd             m2, m5, 0xf0 ; xpos0 xpos1
    lea                 r9d, [dxq+(65<<6)] ; xpos
    paddw                m4, m2
    movd          [rsp+284], xm1      ; 8*dy, used to advance base_y per strip
.w8_loop0: ; per-strip setup; spill ypos/xpos state for the next strip
    mov                 r2d, r9d
    mova          [rsp+288], m0
    mov                  r5, r11
    mova          [rsp+320], m4
    pshufd               m6, m0, q2020
    psraw               xm0, 6
    pxor                xm1, xm1
    psubw               xm8, xm1, xm0 ; base_y
    pand                 m6, m11      ; frac_y
    punpckhwd           xm9, xm8, xm1 ; base_y 2 3 6 7
    psllw                m6, 9
    punpcklwd           xm8, xm1      ; base_y 0 1 4 5
.w8_loop: ; 2 rows per iteration
    lea                 r3d, [r2+dxq]
    shr                 r2d, 6        ; base_x0
    movu                xm0, [rsp+r2*2]
    movu                xm1, [rsp+r2*2+2]
    lea                 r2d, [r3+dxq]
    shr                 r3d, 6        ; base_x1
    vinserti128          m0, [rsp+r3*2], 1
    vinserti128          m1, [rsp+r3*2+2], 1
    pand                 m2, m11, m4  ; frac_x
    psubw                m1, m0
    psllw                m2, 9
    pmulhrsw             m1, m2
    paddw                m0, m1       ; interpolated top prediction
    cmp                 r3d, 64
    jge .w8_toponly ; whole 2-row group comes from the top edge
    ; vpgatherdq clobbers its mask register: save/restore m5 around each gather
    mova                 m1, m5
    vpgatherdq           m3, [r5+xm9*2], m5
    mova                 m5, m1
    vpgatherdq           m2, [r5+xm8*2], m1
    pshufb               m3, m7       ; c0 d0 c1 d1               g0 h0 g1 h1
    pshufb               m2, m7       ; a0 b0 a1 b1               e0 f0 e1 f1
    punpckldq            m1, m2, m3   ; a0 b0 c0 d0 a1 b1 c1 d1   e0 f0 g0 h0 e1 f1 g1 h1
    punpckhdq            m2, m3
    psubw                m2, m1
    pmulhrsw             m2, m6
    paddw                m1, m2       ; interpolated left prediction
    vpermq               m1, m1, q3120
    psraw                m2, m4, 15   ; base_x < topleft
    vpblendvb            m0, m1, m2
.w8_toponly:
    paddw                m4, m5       ; xpos += dx
    add                  r5, r8       ; advance down the left edge
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    sub                  hd, 2
    jz .w8_end
    lea                dstq, [dstq+strideq*2]
    cmp                 r2d, (63-8)<<6
    jge .w8_loop
.w8_leftonly_loop: ; everything now comes from the left edge
    mova                 m0, m5
    vpgatherdq           m4, [r5+xm9*2], m5
    mova                 m5, m0
    vpgatherdq           m3, [r5+xm8*2], m0
    add                  r5, r8
    pshufb               m2, m4, m7
    pshufb               m1, m3, m7
    punpckldq            m0, m1, m2
    punpckhdq            m1, m2
    psubw                m1, m0
    pmulhrsw             m1, m6
    paddw                m0, m1
    vpermq               m0, m0, q3120
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w8_leftonly_loop
.w8_end:
    sub                r10d, 1<<8     ; r10 high bits count remaining 8-col strips
    jl .w8_ret
    vpbroadcastd         m0, [rsp+284]
    add                  r7, 16       ; next 8-column strip
    paddw                m0, [rsp+288] ; base_y += 8*dy
    add                 r9d, 8<<6
    vpbroadcastd         m4, [pw_512]
    movzx                hd, r10b     ; reload full height for the new strip
    paddw                m4, [rsp+320] ; base_x += 8*64
    mov                dstq, r7
    jmp .w8_loop0
.w8_ret:
    RET
2466.w16:
2467    movd                xm0, [tlq+32]
2468    lea                r10d, [hq+(1<<8)]
2469    movd          [rsp+160], xm0
2470    test             angled, 0x400
2471    jnz .w8_main
2472    lea                 r3d, [hq+15]
2473    sub              angled, 90
2474    call .filter_strength
2475    test                r3d, r3d
2476    jz .w16_no_filter_above
2477    popcnt              r3d, r3d
2478    vpbroadcastd         m4, [base+z_filter_k-4+r3*4+12*1]
2479    vpbroadcastd         m5, [base+z_filter_k-4+r3*4+12*0]
2480    vpbroadcastd         m6, [base+z_filter_k-4+r3*4+12*2]
2481    movu                 m0, [tlq+2]           ; 1 2 3 4 5 6 7 8   9 a b c d e f g
2482    punpcklwd           xm2, xm1, xm1
2483    vpblendd             m2, [tlq-2], 0xfe     ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
2484    punpckhwd            m3, m0, m0
2485    pmullw               m4, m0
2486    vpblendd             m3, [tlq+4], 0x7f     ; 2 3 4 5 6 7 8 9   a b c d e f g g
2487    paddw                m1, m3
2488    vpblendd             m3, [tlq+6], 0x7f     ; 3 4 5 6 7 8 9 a   b c d e f g g g
2489    paddw                m2, m3
2490    vpbroadcastd         m3, r6m ; max_width
2491    pmullw               m1, m5
2492    pmullw               m2, m6
2493    packssdw             m3, m3
2494    paddw                m1, m4
2495    paddw                m1, m2
2496    psubw                m3, [base+pw_1to16]
2497    pxor                 m4, m4
2498    psrlw                m1, 3
2499    pminsw               m3, m11
2500    pavgw                m1, m4
2501    vpblendvb            m1, m0, m3
2502    movu          [rsp+130], m1
2503.w16_no_filter_above:
2504    vpbroadcastd         m0, [base+pb_90]
2505    psubb                m0, m7
2506    pand                 m0, m8
2507    pcmpgtb              m0, m9
2508    pmovmskb            r3d, m0
2509    test                r3d, r3d
2510    jz .w8_main
2511    popcnt              r3d, r3d
2512    cmp                 r3d, 3
2513    jne .w16_filter_left_s12
2514    vpbroadcastd         m6, [base+pw_3]
2515    vpbroadcastd         m7, [base+pw_16]
2516    cmp                  hd, 4
2517    jne .filter_left_s3
2518    movq                xm0, [tlq-8]    ; 0 1 2 3
2519    movq                xm1, [tlq-6]    ; 1 2 3 4
2520    vpbroadcastd        xm5, r7m ; max_height
2521    movq                xm4, [base+pw_16to1+24] ; 4to1
2522    pshuflw             xm2, xm0, q2100 ; 0 0 1 2
2523    pshuflw             xm3, xm1, q3321 ; 2 3 4 4
2524    paddw               xm1, xm0
2525    paddw               xm1, xm2
2526    pshuflw             xm2, xm0, q1000 ; 0 0 0 1
2527    paddw               xm3, xm6
2528    packssdw            xm5, xm5
2529    pavgw               xm2, xm3
2530    psubw               xm5, xm4
2531    paddw               xm1, xm2
2532    pminsw              xm5, xm11
2533    psrlw               xm1, 2
2534    vpblendvb           xm1, xm0, xm5
2535    movq          [rsp+120], xm1
2536    jmp .w8_main
2537.w32:
2538    mova                 m2, [tlq+32]
2539    movd                xm0, [tlq+64]
2540    lea                r10d, [hq+(3<<8)]
2541    mova          [rsp+160], m2
2542    movd          [rsp+192], xm0
2543    test             angled, 0x400
2544    jnz .w8_main
2545    vpbroadcastd         m6, [base+pw_3]
2546    vpbroadcastd         m0, r6m ; max_width
2547    vpbroadcastd         m7, [base+pw_16]
2548    mov                 r3d, 32
2549    packssdw             m0, m0
2550    psubw                m0, [base+pw_1to16]
2551    pminsw               m8, m0, m11
2552    psubw                m9, m8, m7
2553.w32_filter_above:
2554    movu                 m0, [tlq+2]
2555    punpcklwd           xm4, xm1, xm1
2556    paddw                m2, m6, [tlq+6]
2557    paddw                m1, m0
2558    vpblendd             m4, [tlq-2], 0xfe        ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
2559    paddw                m1, [tlq+4]
2560    movu                 m3, [tlq+r3+2]
2561    paddw                m5, m6, [tlq+r3-2]
2562    pavgw                m2, m4
2563    punpckhwd            m4, m3, m3
2564    paddw                m1, m2
2565    vpblendd             m2, m4, [tlq+r3+6], 0x7f ; 4 5 6 7 8 9 a b   c d e f g h h h
2566    vpblendd             m4, [tlq+r3+4], 0x7f     ; 3 4 5 6 7 8 9 a   b c d e f g h h
2567    pavgw                m2, m5
2568    paddw                m5, m3, [tlq+r3]
2569    paddw                m4, m5
2570    psrlw                m1, 2
2571    paddw                m2, m4
2572    vpblendvb            m1, m0, m8
2573    psrlw                m2, 2
2574    vpblendvb            m2, m3, m9
2575    movu          [rsp+130], m1
2576    movu       [rsp+r3+130], m2
2577.filter_left_s3:
2578    cmp                  hd, 16
2579    jl .filter_left_s3_h8 ; h8
2580.filter_left_s3b:
2581    mova                 m0, [tlq-32]       ; 2 3 4 5 6 7 8 9   a b c d e f g h
2582    movu                 m2, [tlq-30]       ; 3 4 5 6 7 8 9 a   b c d e f g h i
2583    vpbroadcastd         m5, r7m ; max_height
2584    paddw                m1, m0, m2
2585    punpckhwd            m2, m2
2586    mov                 r3d, hd
2587    vpblendd             m2, [tlq-28], 0x7f ; 4 5 6 7 8 9 a b   c d e f g h i i
2588    packssdw             m5, m5
2589    not                  r3
2590    psubw                m5, [base+pw_16to1]
2591    paddw                m2, m6
2592    pminsw               m8, m11, m5
2593    je .filter_left_s3_end ; h16
2594    paddw                m1, [tlq-34]       ; 1 2 3 4 5 6 7 8   9 a b c d e f g
2595    pavgw                m2, [tlq-36]       ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
2596    paddw                m1, m2
2597    psrlw                m1, 2
2598    vpblendvb            m3, m1, m0, m8
2599    mova                 m0, [tlq-64]       ; 2 3 4 5 6 7 8 9   a b c d e f g h
2600    paddw                m1, m0, [tlq-62]   ; 3 4 5 6 7 8 9 a   b c d e f g h i
2601    paddw                m2, m6, [tlq-60]   ; 4 5 6 7 8 9 a b   c d e f g h i j
2602    psubw                m8, m7
2603    mova           [rsp+96], m3
2604    jnp .filter_left_s3_end ; h32
2605    mova                 m5, [tlq-96]
2606    paddw                m1, [tlq-66]
2607    pavgw                m2, [tlq-68]
2608    paddw                m1, m2
2609    paddw                m4, m5, [tlq-94]
2610    paddw                m2, m6, [tlq-92]
2611    psrlw                m1, 2
2612    paddw                m4, [tlq- 98]
2613    pavgw                m2, [tlq-100]
2614    vpblendvb            m3, m1, m0, m8
2615    mova                 m0, [tlq-128]
2616    psubw                m8, m7
2617    paddw                m4, m2
2618    paddw                m1, m0, [tlq-126]
2619    paddw                m2, m6, [tlq-124]
2620    psrlw                m4, 2
2621    mova           [rsp+64], m3
2622    vpblendvb            m4, m5, m8
2623    psubw                m8, m7
2624    mova           [rsp+32], m4
2625.filter_left_s3_end:
2626    punpcklwd           xm3, xm0, xm0
2627    vpblendd             m4, m3, [tlq+r3*2], 0xfe ; 2 2 3 4 5 6 7 8   9 a b c d e f g
2628    vpblendd             m3, [tlq+r3*2-2], 0xfe   ; 2 2 2 3 4 5 6 7   8 9 a b c d e f
2629    paddw                m1, m4
2630    pavgw                m2, m3
2631    paddw                m1, m2
2632    psrlw                m1, 2
2633    vpblendvb            m1, m0, m8
2634    mova     [rsp+r3*2+130], m1
2635    jmp .w8_main
2636.filter_left_s3_h8:
2637    mova                xm0, [tlq-16]            ; 0 1 2 3 4 5 6 7
2638    movu                xm3, [tlq-14]            ; 1 2 3 4 5 6 7 8
2639    pblendw             xm2, xm0, [tlq-18], 0xfe ; 0 0 1 2 3 4 5 6
2640    vpbroadcastd        xm5, r7m ; max_height
2641    paddw               xm1, xm0, xm3
2642    pblendw             xm3, [tlq-12], 0x7f      ; 2 3 4 5 6 7 8 8
2643    paddw               xm1, xm2
2644    vpblendd            xm2, [tlq-20], 0x0e      ; 0 0 0 1 2 3 4 5
2645    paddw               xm3, xm6
2646    packssdw            xm5, xm5
2647    pavgw               xm2, xm3
2648    psubw               xm5, [base+pw_16to1+16] ; 8to1
2649    paddw               xm1, xm2
2650    pminsw              xm5, xm11
2651    psrlw               xm1, 2
2652    vpblendvb           xm1, xm0, xm5
2653    mova          [rsp+112], xm1
2654    jmp .w8_main
2655.w64:
2656    mova                 m2, [tlq+ 32]
2657    mova                 m3, [tlq+ 64]
2658    mova                 m4, [tlq+ 96]
2659    movd                xm0, [tlq+128]
2660    lea                r10d, [hq+(7<<8)]
2661    mova          [rsp+160], m2
2662    mova          [rsp+192], m3
2663    mova          [rsp+224], m4
2664    movd          [rsp+256], xm0
2665    test             angled, 0x400
2666    jnz .w8_main
2667    vpbroadcastd         m6, [base+pw_3]
2668    movu                 m0, [tlq+34]     ; 2 3 4 5 6 7 8 9   a b c d e f g h
2669    paddw                m2, m6, [tlq+30] ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
2670    paddw                m5, m0, [tlq+32] ; 1 2 3 4 5 6 7 8   9 a b c d e f g
2671    pavgw                m2, [tlq+38]     ; 4 5 6 7 8 9 a b   c d e f g h h h
2672    paddw                m5, [tlq+36]     ; 3 4 5 6 7 8 9 a   b c d e f g h h
2673    movu                 m4, [tlq+66]
2674    paddw                m3, m6, [tlq+62]
2675    paddw                m7, m4, [tlq+64]
2676    pavgw                m3, [tlq+70]
2677    paddw                m7, [tlq+68]
2678    paddw                m2, m5
2679    vpbroadcastd         m5, r6m ; max_width
2680    mov                 r3d, 96
2681    packssdw             m5, m5
2682    paddw                m3, m7
2683    psubw                m5, [base+pw_1to16]
2684    psrlw                m2, 2
2685    vpbroadcastd         m7, [base+pw_16]
2686    psrlw                m3, 2
2687    pminsw               m8, m11, m5
2688    psubw                m9, m8, m7
2689    vpblendvb            m2, m0, m9
2690    psubw                m9, m7
2691    vpblendvb            m3, m4, m9
2692    psubw                m9, m7
2693    movu          [rsp+162], m2
2694    movu          [rsp+194], m3
2695    jmp .w32_filter_above
2696
2697cglobal ipred_z3_16bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase
2698    %assign org_stack_offset stack_offset
2699    lea                  r6, [ipred_z3_16bpc_avx2_table]
2700    tzcnt                hd, hm
2701    movifnidn        angled, anglem
2702    lea                  r7, [dr_intra_derivative+45*2-1]
2703    sub                 tlq, 2
2704    movsxd               hq, [r6+hq*4]
2705    sub              angled, 180
2706    add                  hq, r6
2707    mov                 dyd, angled
2708    neg                 dyd
2709    xor              angled, 0x400
2710    or                  dyq, ~0x7e
2711    movzx               dyd, word [r7+dyq]
2712    vpbroadcastd         m5, [pw_62]
2713    mov              org_wd, wd
2714    jmp                  hq
2715.h4:
2716    ALLOC_STACK         -64, 7
2717    lea                  r7, [strideq*3]
2718    cmp              angleb, 40
2719    jae .h4_no_upsample
2720    lea                 r4d, [angleq-1024]
2721    sar                 r4d, 7
2722    add                 r4d, wd
2723    jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm)
2724    mova                xm2, [tlq-14]            ; 0 1 2 3 4 5 6 7
2725    pblendw             xm1, xm2, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6
2726    vpblendd            xm0, xm1, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5
2727    pshufd              xm3, xm1, q0000
2728    paddw               xm1, xm2
2729    paddw               xm0, [tlq-12]            ; 1 2 3 4 5 6 7 8
2730    vpbroadcastw        xm4, r8m ; pixel_max
2731    add                 dyd, dyd
2732    psubw               xm0, xm1, xm0
2733    mova           [rsp+ 0], xm3
2734    movd                xm3, dyd
2735    psraw               xm0, 3
2736    neg                 dyd
2737    paddw               xm1, xm0
2738    pxor                xm0, xm0
2739    lea                 r2d, [dyq+(16<<6)+63] ; ypos
2740    pmaxsw              xm1, xm0
2741    pavgw               xm1, xm0
2742    vpbroadcastw         m3, xm3
2743    pminsw              xm1, xm4
2744    punpckhwd           xm0, xm1, xm2
2745    punpcklwd           xm1, xm2
2746    paddw                m2, m3, m3
2747    mova           [rsp+32], xm0
2748    punpcklwd            m3, m2
2749    mova           [rsp+16], xm1
2750    paddw                m4, m2, m2
2751    paddw                m2, m3
2752    vpblendd             m3, m2, 0xf0 ; ypos0 ypos1   ypos2 ypos3
2753.h4_upsample_loop:
2754    lea                 r4d, [r2+dyq]
2755    shr                 r2d, 6
2756    movu                xm1, [rsp+r2*2]
2757    lea                 r2d, [r4+dyq]
2758    shr                 r4d, 6
2759    movu                xm2, [rsp+r4*2]
2760    lea                 r4d, [r2+dyq]
2761    shr                 r2d, 6
2762    vinserti128          m1, [rsp+r2*2], 1
2763    lea                 r2d, [r4+dyq]
2764    shr                 r4d, 6
2765    vinserti128          m2, [rsp+r4*2], 1
2766    psrld                m0, m1, 16
2767    pblendw              m0, m2, 0xaa ; a3 b3 a2 b2 a1 b1 a0 b0   c3 d3 c2 d2 c1 d1 c0 d0
2768    pslld                m2, 16
2769    pblendw              m1, m2, 0xaa
2770    pand                 m2, m5, m3
2771    psllw                m2, 9
2772    psubw                m1, m0
2773    pmulhrsw             m1, m2
2774    paddw                m3, m4
2775    paddw                m1, m0
2776    vextracti128        xm2, m1, 1
2777    punpckhdq           xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0
2778    punpckldq           xm1, xm2      ; a3 b3 c3 d3 a2 b2 c2 d2
2779    movhps [dstq+strideq*0], xm0
2780    movq   [dstq+strideq*1], xm0
2781    movhps [dstq+strideq*2], xm1
2782    movq   [dstq+r7       ], xm1
2783    add                dstq, 8
2784    sub                  wd, 4
2785    jg .h4_upsample_loop
2786    RET
2787ALIGN function_align
2788.filter_strength: ; h4/h8/h16
2789%define base r4-z_filter_t0
2790    lea                  r4, [z_filter_t0]
2791    movd                xm0, maxbased
2792    movd                xm1, angled
2793    shr              angled, 8 ; is_sm << 1
2794    vpbroadcastb         m0, xm0
2795    vpbroadcastb         m1, xm1
2796    pcmpeqb              m0, [base+z_filter_wh]
2797    pand                 m0, m1
2798    mova                xm1, [r4+angleq*8]
2799    pcmpgtb              m0, m1
2800    pmovmskb            r5d, m0
2801    ret
2802.h4_no_upsample:
2803    mov            maxbased, 7
2804    test             angled, 0x400 ; !enable_intra_edge_filter
2805    jnz .h4_main
2806    lea            maxbased, [wq+3]
2807    call .filter_strength
2808    mov            maxbased, 7
2809    test                r5d, r5d
2810    jz .h4_main ; filter_strength == 0
2811    popcnt              r5d, r5d
2812    mova                xm0, [tlq-14]       ; 0 1 2 3 4 5 6 7
2813    movu                xm3, [tlq-12]       ; 1 2 3 4 5 6 7 8
2814    vpbroadcastd        xm2, [base+z_filter_k-4+r5*4+12*1]
2815    vpbroadcastd        xm4, [base+z_filter_k-4+r5*4+12*0]
2816    pmullw              xm2, xm0
2817    pblendw             xm0, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6
2818    paddw               xm1, xm0, xm3
2819    movd           [rsp+12], xm0
2820    pmullw              xm1, xm4
2821    cmp                 r5d, 3
2822    jne .h4_filter_3tap
2823    pblendw             xm3, [tlq-10], 0x7f ; 2 3 4 5 6 7 8 8
2824    vpblendd            xm0, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5
2825    movzx               r4d, word [tlq-14]
2826    movzx               r2d, word [tlq-12]
2827    inc            maxbased
2828    paddw               xm1, xm2
2829    paddw               xm0, xm3
2830    sub                 r2d, r4d
2831    paddw               xm2, xm0, xm0
2832    lea                 r2d, [r2+r4*8+4]
2833    shr                 r2d, 3
2834    mov            [rsp+14], r2w
2835.h4_filter_3tap:
2836    pxor                xm0, xm0
2837    paddw               xm1, xm2
2838    lea                 tlq, [rsp+30]
2839    psrlw               xm1, 3
2840    cmp                  wd, 8
2841    sbb            maxbased, -1
2842    pavgw               xm0, xm1
2843    mova           [rsp+16], xm0
2844.h4_main:
2845    movd                xm3, dyd
2846    neg            maxbaseq
2847    vbroadcasti128       m1, [z_base_inc]
2848    vpbroadcastw         m6, [tlq+maxbaseq*2]
2849    shl            maxbased, 6
2850    vpbroadcastw         m3, xm3
2851    lea                 r4d, [maxbaseq+3*64]
2852    neg                 dyq
2853    movd                xm2, r4d
2854    sub                 tlq, 8
2855    lea                  r4, [dyq+63] ; ypos
2856    punpcklwd            m1, m1
2857    paddw                m0, m3, m3
2858    vpbroadcastw         m2, xm2
2859    punpcklwd            m3, m0
2860    paddw                m4, m0, m0
2861    paddw                m0, m3
2862    psubw                m2, m1
2863    vpblendd             m3, m0, 0xf0 ; ypos0 ypos1   ypos2 ypos3
2864    or             maxbased, 63
2865    paddw                m3, m2
2866.h4_loop:
2867    lea                  r5, [r4+dyq]
2868    sar                  r4, 6 ; base0
2869    movu                xm1, [tlq+r4*2]
2870    lea                  r4, [r5+dyq]
2871    sar                  r5, 6 ; base1
2872    movu                xm2, [tlq+r5*2]
2873    lea                  r5, [r4+dyq]
2874    sar                  r4, 6 ; base2
2875    vinserti128          m1, [tlq+r4*2], 1
2876    lea                  r4, [r5+dyq]
2877    sar                  r5, 6 ; base3
2878    vinserti128          m2, [tlq+r5*2], 1
2879    punpckhwd            m0, m1, m2
2880    punpcklwd            m1, m2
2881    pand                 m2, m5, m3
2882    palignr              m0, m1, 4    ; a3 b3 a2 b2 a1 b1 a0 b0   c3 d3 c2 d2 c1 d1 c0 d0
2883    psllw                m2, 9
2884    psubw                m1, m0
2885    pmulhrsw             m1, m2
2886    psraw                m2, m3, 15   ; ypos < max_base_y
2887    paddw                m3, m4
2888    paddw                m1, m0
2889    vpblendvb            m1, m6, m1, m2
2890    vextracti128        xm2, m1, 1
2891    punpckhdq           xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0
2892    punpckldq           xm1, xm2      ; a3 b3 c3 d3 a2 b2 c2 d2
2893    movhps [dstq+strideq*0], xm0
2894    movq   [dstq+strideq*1], xm0
2895    movhps [dstq+strideq*2], xm1
2896    movq   [dstq+r7       ], xm1
2897    sub                  wd, 4
2898    jz .h4_end
2899    add                dstq, 8
2900    cmp                 r4d, maxbased
2901    jg .h4_loop
2902.h4_end_loop:
2903    movq   [dstq+strideq*0], xm6
2904    movq   [dstq+strideq*1], xm6
2905    movq   [dstq+strideq*2], xm6
2906    movq   [dstq+r7       ], xm6
2907    add                dstq, 8
2908    sub                  wd, 4
2909    jg .h4_end_loop
2910.h4_end:
2911    RET
2912.h8:
2913    lea                 r4d, [angleq+216]
2914    %assign stack_offset org_stack_offset
2915    ALLOC_STACK         -64, 8
2916    mov                 r4b, wb
2917    lea                  r7, [strideq*3]
2918    cmp                 r4d, 8
2919    ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
2920    mova                 m2, [tlq-30]     ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
2921    paddw                m1, m2, [tlq-32] ; _ 0 1 2 3 4 5 6   7 8 9 a b c d e
2922    movu                 m0, [tlq-34]     ; _ _ 0 1 2 3 4 5   6 7 8 9 a b c d
2923    cmp                  wd, 8
2924    je .h8_upsample_w8
2925    pshufhw             xm3, xm2, q1000
2926    vpblendd             m0, m3, 0x0f     ; _ _ _ _ 4 4 4 5   6 7 8 9 a b c d
2927.h8_upsample_w8:
2928    paddw                m0, [tlq-28]     ; 1 2 3 4 5 6 7 8   9 a b c d e f g
2929    vpbroadcastw         m4, r8m ; pixel_max
2930    add                 dyd, dyd
2931    psubw                m0, m1, m0
2932    movd                xm6, dyd
2933    psraw                m0, 3
2934    neg                 dyd
2935    paddw                m1, m0
2936    pxor                 m0, m0
2937    pmaxsw               m1, m0
2938    lea                 r4d, [dyq+(16<<6)+63] ; ypos
2939    pavgw                m1, m0
2940    vpbroadcastw         m6, xm6
2941    pminsw               m1, m4
2942    punpckhwd            m0, m1, m2
2943    punpcklwd            m1, m2
2944    vextracti128   [rsp+48], m0, 1
2945    vextracti128   [rsp+32], m1, 1
2946    paddw                m7, m6, m6
2947    mova           [rsp+16], xm0
2948    mova           [rsp+ 0], xm1
2949    punpcklwd            m6, m7 ; ypos0 ypos1
2950.h8_upsample_loop:
2951    lea                 r2d, [r4+dyq]
2952    shr                 r4d, 6 ; base0
2953    movu                 m1, [rsp+r4*2]
2954    lea                 r4d, [r2+dyq]
2955    shr                 r2d, 6 ; base1
2956    movu                 m2, [rsp+r2*2]
2957    lea                 r2d, [r4+dyq]
2958    shr                 r4d, 6 ; base2
2959    movu                 m3, [rsp+r4*2]
2960    lea                 r4d, [r2+dyq]
2961    shr                 r2d, 6 ; base3
2962    movu                 m4, [rsp+r2*2]
2963    psrld                m0, m1, 16
2964    pblendw              m0, m2, 0xaa ; a7 b7 a6 b6 a5 b5 a4 b4   a3 b3 a2 b2 a1 b1 a0 b0
2965    pslld                m2, 16
2966    pblendw              m1, m2, 0xaa
2967    psrld                m2, m3, 16
2968    pblendw              m2, m4, 0xaa ; c7 d7 c6 d6 c5 d5 c4 d4   c3 d3 c2 d2 c1 d1 c0 d0
2969    pslld                m4, 16
2970    pblendw              m3, m4, 0xaa
2971    pand                 m4, m5, m6
2972    paddw                m6, m7
2973    psllw                m4, 9
2974    psubw                m1, m0
2975    pmulhrsw             m1, m4
2976    pand                 m4, m5, m6
2977    psllw                m4, 9
2978    psubw                m3, m2
2979    pmulhrsw             m3, m4
2980    paddw                m6, m7
2981    lea                  r2, [dstq+strideq*4]
2982    paddw                m1, m0
2983    paddw                m3, m2
2984    punpckhdq            m0, m1, m3   ; a5 b5 c5 d5 a4 b4 c4 d4   a1 b1 c1 d1 a0 b0 c0 d0
2985    punpckldq            m1, m3       ; a7 b7 c7 d7 a6 b6 c6 d6   a3 b3 c3 d3 a2 b2 c2 d2
2986    vextracti128        xm2, m0, 1
2987    vextracti128        xm3, m1, 1
2988    movhps [r2  +strideq*0], xm0
2989    movq   [r2  +strideq*1], xm0
2990    movhps [r2  +strideq*2], xm1
2991    movq   [r2  +r7       ], xm1
2992    movhps [dstq+strideq*0], xm2
2993    movq   [dstq+strideq*1], xm2
2994    movhps [dstq+strideq*2], xm3
2995    movq   [dstq+r7       ], xm3
2996    add                dstq, 8
2997    sub                  wd, 4
2998    jg .h8_upsample_loop
2999    RET
3000.h8_no_intra_edge_filter:
3001    and            maxbased, 7
3002    or             maxbased, 8 ; imin(w+7, 15)
3003    jmp .h8_main
3004.h8_no_upsample:
3005    lea            maxbased, [wq+7]
3006    test             angled, 0x400
3007    jnz .h8_no_intra_edge_filter
3008    call .filter_strength
3009    test                r5d, r5d
3010    jz .h8_main
3011    popcnt              r5d, r5d
3012    mova                 m0, [tlq-30]           ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
3013    movu                 m3, [tlq-28]           ; 1 2 3 4 5 6 7 8   9 a b c d e f g
3014    vpbroadcastd         m2, [base+z_filter_k-4+r5*4+12*1]
3015    vpbroadcastd         m4, [base+z_filter_k-4+r5*4+12*0]
3016    pmullw               m2, m0
3017    cmp                  wd, 8
3018    jl .h8_filter_w4
3019    punpcklwd           xm0, xm0
3020    vpblendd             m1, m0, [tlq-32], 0xfe ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
3021    movd           [rsp+28], xm0
3022    paddw                m1, m3
3023    mov                 r4d, 16
3024    pmullw               m1, m4
3025    cmovg          maxbased, r4d
3026    cmp                 r5d, 3
3027    jne .h8_filter_3tap
3028    punpckhwd            m3, m3
3029    vpblendd             m0, [tlq-34], 0xfe     ; 0 0 0 1 2 3 4 5   6 7 8 9 a b c d
3030    vpblendd             m3, [tlq-26], 0x7f     ; 2 3 4 5 6 7 8 9   a b c d e f g g
3031    movzx               r4d, word [tlq-30]
3032    movzx               r2d, word [tlq-28]
3033    inc            maxbased
3034    paddw                m1, m2
3035    paddw                m0, m3
3036    sub                 r2d, r4d
3037    paddw                m2, m0, m0
3038    lea                 r2d, [r2+r4*8+4]
3039    shr                 r2d, 3
3040    mov            [rsp+30], r2w
3041    jmp .h8_filter_3tap
3042.h8_filter_w4:
3043    pshufhw             xm1, xm0, q2100
3044    vinserti128          m1, [tlq-16], 1        ; _ _ _ _ 4 4 5 6   7 8 9 a b c d e
3045    paddw                m1, m3
3046    pmullw               m1, m4
3047.h8_filter_3tap:
3048    pxor                 m0, m0
3049    paddw                m1, m2
3050    lea                 tlq, [rsp+62]
3051    psrlw                m1, 3
3052    pavgw                m0, m1
3053    mova           [rsp+32], m0
3054.h8_main:
3055    movd                xm4, dyd
3056    neg            maxbaseq
3057    vbroadcasti128       m1, [z_base_inc]
3058    vpbroadcastw         m7, [tlq+maxbaseq*2]
3059    shl            maxbased, 6
3060    vpbroadcastw         m4, xm4
3061    lea                 r4d, [maxbaseq+7*64]
3062    neg                 dyq
3063    movd                xm2, r4d
3064    sub                 tlq, 16
3065    lea                  r4, [dyq+63]
3066    paddw                m6, m4, m4
3067    vpbroadcastw         m2, xm2
3068    vpblendd             m4, m6, 0xf0 ; ypos0 ypos1
3069    psubw                m2, m1
3070    or             maxbased, 63
3071    paddw                m4, m2
3072.h8_loop:
3073    lea                  r5, [r4+dyq]
3074    sar                  r4, 6 ; base0
3075    movu                xm0, [tlq+r4*2+2]
3076    movu                xm1, [tlq+r4*2]
3077    lea                  r4, [r5+dyq]
3078    sar                  r5, 6 ; base1
3079    vinserti128          m0, [tlq+r5*2+2], 1
3080    vinserti128          m1, [tlq+r5*2], 1
3081    lea                  r5, [r4+dyq]
3082    sar                  r4, 6 ; base2
3083    pand                 m3, m5, m4
3084    psllw                m3, 9
3085    psubw                m1, m0
3086    pmulhrsw             m1, m3
3087    psraw                m3, m4, 15
3088    paddw                m4, m6
3089    paddw                m0, m1
3090    movu                xm1, [tlq+r4*2+2]
3091    movu                xm2, [tlq+r4*2]
3092    lea                  r4, [r5+dyq]
3093    sar                  r5, 6 ; base3
3094    vpblendvb            m0, m7, m0, m3
3095    vinserti128          m1, [tlq+r5*2+2], 1
3096    vinserti128          m2, [tlq+r5*2], 1
3097    pand                 m3, m5, m4
3098    psllw                m3, 9
3099    psubw                m2, m1
3100    pmulhrsw             m2, m3
3101    psraw                m3, m4, 15
3102    paddw                m4, m6
3103    lea                  r5, [dstq+strideq*4]
3104    paddw                m1, m2
3105    vpblendvb            m1, m7, m1, m3
3106    punpckhwd            m2, m0, m1   ; a3 c3 a2 c2 a1 c1 a0 c0   b3 d3 b2 d2 b1 d1 b0 d0
3107    vextracti128        xm3, m2, 1
3108    punpcklwd            m0, m1       ; a7 c7 a6 c6 a5 c5 a4 c5   b7 d7 b6 d6 b5 d5 b4 d4
3109    punpckhwd           xm1, xm2, xm3 ; a1 b1 c1 d1 a0 b0 c0 d0
3110    punpcklwd           xm2, xm3      ; a3 b3 c3 d3 a2 b2 c2 d2
3111    vextracti128        xm3, m0, 1
3112    movhps [dstq+strideq*0], xm1
3113    movq   [dstq+strideq*1], xm1
3114    movhps [dstq+strideq*2], xm2
3115    movq   [dstq+r7       ], xm2
3116    punpckhwd           xm1, xm0, xm3 ; a5 b5 c5 d5 a4 b4 c4 d4
3117    punpcklwd           xm0, xm3      ; a7 b7 c7 d7 a6 b6 c6 d6
3118    movhps [r5  +strideq*0], xm1
3119    movq   [r5  +strideq*1], xm1
3120    movhps [r5  +strideq*2], xm0
3121    movq   [r5  +r7       ], xm0
3122    sub                  wd, 4
3123    jz .h8_end
3124    add                dstq, 8
3125    cmp                 r4d, maxbased
3126    jg .h8_loop
3127    lea                  r6, [strideq*5]
3128    lea                  r2, [strideq+r7*2] ; stride*7
3129    test                 wd, 4
3130    jz .h8_end_loop
3131    movq   [dstq+strideq*0], xm7
3132    movq   [dstq+strideq*1], xm7
3133    movq   [dstq+strideq*2], xm7
3134    movq   [dstq+r7       ], xm7
3135    movq   [dstq+strideq*4], xm7
3136    movq   [dstq+r6       ], xm7
3137    movq   [dstq+r7*2     ], xm7
3138    movq   [dstq+r2       ], xm7
3139    add                dstq, 8
3140    sub                  wd, 4
3141    jz .h8_end
3142.h8_end_loop:
3143    mova   [dstq+strideq*0], xm7
3144    mova   [dstq+strideq*1], xm7
3145    mova   [dstq+strideq*2], xm7
3146    mova   [dstq+r7       ], xm7
3147    mova   [dstq+strideq*4], xm7
3148    mova   [dstq+r6       ], xm7
3149    mova   [dstq+r7*2     ], xm7
3150    mova   [dstq+r2       ], xm7
3151    add                dstq, 16
3152    sub                  wd, 8
3153    jg .h8_end_loop
3154.h8_end:
3155    RET
.h16_no_intra_edge_filter:
    and            maxbased, 15
    or             maxbased, 16 ; imin(w+15, 31)
    jmp .h16_main
ALIGN function_align
.h16:
    ; h=16 path: optionally smooth the left edge (3- or 5-tap, taps from
    ; z_filter_k selected by filter_strength) into a stack buffer, then
    ; fall into .h16_main. Pixel index comments (0 1 2 ...) track which
    ; source samples each lane holds after the unaligned loads/blends.
    %assign stack_offset org_stack_offset
    ALLOC_STACK         -96, 10
    lea            maxbased, [wq+15]
    lea                  r7, [strideq*3]
    test             angled, 0x400          ; ANGLE_SMOOTH_EDGE absent?
    jnz .h16_no_intra_edge_filter
    call .filter_strength
    test                r5d, r5d
    jz .h16_main ; filter_strength == 0
    popcnt              r5d, r5d            ; r5d = tap-set index
    movu                 m0, [tlq-28]            ; 3 4 5 6 7 8 9 a   b c d e f g h i
    paddw                m1, m0, [tlq-32]        ; 1 2 3 4 5 6 7 8   9 a b c d e f g
    vpbroadcastd         m6, [base+z_filter_k-4+r5*4+12*1]
    vpbroadcastd         m7, [base+z_filter_k-4+r5*4+12*0]
    pmullw               m2, m6, [tlq-30]        ; 2 3 4 5 6 7 8 9   a b c d e f g h
    pmullw               m1, m7
    paddw                m1, m2
    cmp                  wd, 8
    jg .h16_filter_w16
    mova                xm3, [tlq-46]            ; 0 1 2 3 4 5 6 7
    pmullw              xm6, xm3
    jl .h16_filter_w4
    ; w == 8: clamp indices below 0 to sample 0 via blends
    pblendw             xm3, [tlq-48], 0xfe      ; 0 0 1 2 3 4 5 6
    cmp                 r5d, 3
    jne .h16_filter_w8_3tap
    vpblendd            xm4, xm3, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5
.h16_filter_w8_5tap:
    ; 5-tap: add the doubled outer neighbors on top of the 3-tap sums
    punpckhwd            m0, m0
    vpblendd             m0, [tlq-26], 0x7f      ; 4 5 6 7 8 9 a b   c d e f g h i i
    paddw               xm4, [tlq-42]            ; 2 3 4 5 6 7 8 9
    paddw                m0, [tlq-34]            ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
    paddw               xm4, xm4
    paddw                m0, m0
    paddw               xm6, xm4
    paddw                m1, m0
.h16_filter_w8_3tap:
    paddw               xm3, [tlq-44]            ; 1 2 3 4 5 6 7 8
    pmullw              xm3, xm7
    pxor                 m0, m0
    paddw               xm3, xm6
    psrlw               xm3, 3
    pavgw               xm3, xm0                 ; (sum >> 3 + 1) >> 1 => /16 rounded
    mova           [rsp+48], xm3
    jmp .h16_filter_end
.h16_filter_w4:
    ; w == 4: replicate sample 4 for out-of-range indices
    pshufhw             xm3, xm3, q2100          ; _ _ _ _ 4 4 5 6
    cmp                 r5d, 3
    jne .h16_filter_w8_3tap
    pshufhw             xm4, xm3, q2100          ; _ _ _ _ 4 4 4 5
    jmp .h16_filter_w8_5tap
.h16_filter_w16:
    mova                 m3, [tlq-62]            ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
    pmullw               m6, m3
    punpcklwd           xm3, xm3
    vpblendd             m4, m3, [tlq-64], 0xfe  ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
    paddw                m4, [tlq-60]            ; 1 2 3 4 5 6 7 8   9 a b c d e f g
    mov                 r4d, 32
    cmp                  wd, 16
    cmovg          maxbased, r4d
    movd           [rsp+28], xm3
    pmullw               m4, m7
    cmp                 r5d, 3
    jne .h16_filter_w16_3tap
    punpckhwd            m0, m0
    vpblendd             m3, [tlq-66], 0xfe      ; 0 0 0 1 2 3 4 5   6 7 8 9 a b c d
    vpblendd             m0, [tlq-26], 0x7f      ; 4 5 6 7 8 9 a b   c d e f g h i i
    paddw                m3, [tlq-58]            ; 2 3 4 5 6 7 8 9   a b c d e f g h
    paddw                m0, [tlq-34]            ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
    movzx               r4d, word [tlq-62]
    movzx               r2d, word [tlq-60]
    or             maxbased, 1
    paddw                m3, m3
    sub                 r2d, r4d
    paddw                m0, m0
    lea                 r2d, [r2+r4*8+4]         ; 7*p0 + p1 + 4
    paddw                m4, m3
    shr                 r2d, 3                   ; corner: (7*p0 + p1 + 4) >> 3
    paddw                m1, m0
    mov            [rsp+30], r2w
.h16_filter_w16_3tap:
    pxor                 m0, m0
    paddw                m4, m6
    psrlw                m4, 3
    pavgw                m4, m0
    mova           [rsp+32], m4
.h16_filter_end:
    psrlw                m1, 3
    lea                 tlq, [rsp+94]            ; point tlq at the filtered copy
    pavgw                m1, m0
    mova           [rsp+64], m1
.h16_main:
    ; Main h=16 projection loop. Per column: base = r4 >> 6 selects the
    ; source pair, the low 6 bits are the interpolation fraction.
    ; m8 = dy step (broadcast), m9 = clamp/fill pixel at maxbase,
    ; m7 = running per-lane positions (z_base_inc offsets), and m5 is the
    ; fraction mask set up before this chunk (not visible here — verify).
    movd                xm8, dyd
    neg            maxbaseq
    vpbroadcastw         m9, [tlq+maxbaseq*2]
    shl            maxbased, 6
    vpbroadcastw         m8, xm8
    lea                 r4d, [maxbaseq+dyq+15*64]
    neg                 dyq
    movd                xm7, r4d
    sub                 tlq, 32
    lea                  r4, [dyq+63]
    vpbroadcastw         m7, xm7
    or             maxbased, 63
    psubw                m7, [z_base_inc]
.h16_loop:
    ; 4 columns per iteration; each is a + (b-a)*frac (pmulhrsw after
    ; psllw 9 gives the rounded 6-bit blend), clamped to m9 once the
    ; position sign bit (psraw 15) says we ran past maxbase.
    lea                  r5, [r4+dyq]
    sar                  r4, 6 ; base0
    movu                 m0, [tlq+r4*2+2]
    movu                 m2, [tlq+r4*2]
    lea                  r4, [r5+dyq]
    sar                  r5, 6 ; base1
    movu                 m1, [tlq+r5*2+2]
    movu                 m3, [tlq+r5*2]
    lea                  r5, [r4+dyq]
    sar                  r4, 6 ; base2
    pand                 m6, m5, m7
    psllw                m6, 9
    psubw                m2, m0
    pmulhrsw             m2, m6
    psraw                m6, m7, 15
    paddw                m7, m8
    paddw                m0, m2
    movu                 m2, [tlq+r4*2+2]
    movu                 m4, [tlq+r4*2]
    lea                  r4, [r5+dyq]
    sar                  r5, 6 ; base3
    vpblendvb            m0, m9, m0, m6
    pand                 m6, m5, m7
    psllw                m6, 9
    psubw                m3, m1
    pmulhrsw             m3, m6
    psraw                m6, m7, 15
    paddw                m7, m8
    paddw                m1, m3
    vpblendvb            m1, m9, m1, m6
    pand                 m6, m5, m7
    psllw                m6, 9
    psubw                m4, m2
    pmulhrsw             m4, m6
    psraw                m6, m7, 15
    paddw                m7, m8
    paddw                m2, m4
    movu                 m3, [tlq+r5*2+2]
    movu                 m4, [tlq+r5*2]
    vpblendvb            m2, m9, m2, m6
    pand                 m6, m5, m7
    psllw                m6, 9
    psubw                m4, m3
    pmulhrsw             m4, m6
    psraw                m6, m7, 15
    paddw                m7, m8
    lea                  r5, [dstq+strideq*4]
    paddw                m3, m4
    vpblendvb            m3, m9, m3, m6
    ; 4x16 transpose: interleave the four column vectors and scatter
    ; them as 16 rows of 4 pixels each (horizontal prediction writes
    ; the columns out as rows, hence the reversed lane comments).
    punpckhwd            m4, m0, m1 ; ab bb aa ba a9 b9 a8 b8   a3 b3 a2 b2 a1 b1 a0 b0
    punpcklwd            m0, m1     ; af bf ae be ad bd ac bc   a7 b7 a6 b6 a5 b5 a4 b4
    punpckhwd            m1, m2, m3 ; cb db ca da c9 d9 c8 d8   c3 d3 c2 d2 c1 d1 c0 d0
    punpcklwd            m2, m3     ; cf df ce de cd dd cc dc   c7 d7 c6 d6 c5 d5 c4 d4
    punpckhdq            m3, m4, m1 ; a9 b9 c9 d9 a8 b8 c8 d8   a1 b1 c1 d1 a0 b0 c0 d0
    vextracti128        xm6, m3, 1
    punpckldq            m4, m1     ; ab bb cb db aa ba ca da   a3 b3 c3 d3 a2 b2 c2 d2
    punpckhdq            m1, m0, m2 ; ad bd cd dd ac bc cc dc   a5 b5 c5 d5 a4 b4 c4 d4
    punpckldq            m0, m2     ; af bf cf df ae be ce de   a7 b7 c7 d7 a6 b6 c6 d6
    vextracti128        xm2, m4, 1
    movhps [dstq+strideq*0], xm6
    movq   [dstq+strideq*1], xm6
    vextracti128        xm6, m1, 1
    movhps [dstq+strideq*2], xm2
    movq   [dstq+r7       ], xm2
    vextracti128        xm2, m0, 1
    movhps [r5  +strideq*0], xm6
    movq   [r5  +strideq*1], xm6
    movhps [r5  +strideq*2], xm2
    movq   [r5  +r7       ], xm2
    lea                  r5, [dstq+strideq*8]
    movhps [r5  +strideq*0], xm3
    movq   [r5  +strideq*1], xm3
    movhps [r5  +strideq*2], xm4
    movq   [r5  +r7       ], xm4
    lea                  r5, [r5+strideq*4]
    movhps [r5  +strideq*0], xm1
    movq   [r5  +strideq*1], xm1
    movhps [r5  +strideq*2], xm0
    movq   [r5  +r7       ], xm0
    sub                  wd, 4
    jz .h16_end
    add                dstq, 8
    cmp                 r4d, maxbased
    jg .h16_loop
    ; Remaining columns are entirely past maxbase: fill with xm9.
    mov                  hd, 4
.h16_end_loop0:
    mov                 r6d, wd
    mov                  r2, dstq
    test                 wb, 4
    jz .h16_end_loop
    movq   [dstq+strideq*0], xm9
    movq   [dstq+strideq*1], xm9
    movq   [dstq+strideq*2], xm9
    movq   [dstq+r7       ], xm9
    and                 r6d, 120
    jz .h16_end_w4
    add                dstq, 8
.h16_end_loop:
    mova   [dstq+strideq*0], xm9
    mova   [dstq+strideq*1], xm9
    mova   [dstq+strideq*2], xm9
    mova   [dstq+r7       ], xm9
    add                dstq, 16
    sub                 r6d, 8
    jg .h16_end_loop
.h16_end_w4:
    lea                dstq, [r2+strideq*4]
    dec                  hd
    jg .h16_end_loop0
.h16_end:
    RET
.h32:
    ; h=32 path: the edge filter here is a fixed (1,2,2,2,1)/8-style
    ; smoothing built from paddw/pavgw (pw_3 supplies rounding), run in
    ; 16-sample strips into a stack buffer that becomes the new tlq.
    %assign stack_offset org_stack_offset
    ALLOC_STACK        -160, 9
    lea            maxbased, [wq+31]
    and            maxbased, 31
    or             maxbased, 32 ; imin(w+31, 63)
    test             angled, 0x400
    jnz .h32_main
    vpbroadcastd         m2, [pw_3]
    movu                 m0, [tlq-28]       ; 3 4 5 6 7 8 9 a   b c d e f g h i
    punpckhwd            m1, m0, m0
    vpblendd             m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b   c d e f g h i i
    paddw                m0, [tlq-30]       ; 2 3 4 5 6 7 8 9   a b c d e f g h
    paddw                m1, m2
    paddw                m0, [tlq-32]       ; 1 2 3 4 5 6 7 8   9 a b c d e f g
    pavgw                m1, [tlq-34]       ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
    lea                  r4, [rsp+128]
    paddw                m0, m1
    lea                 r5d, [maxbaseq-31]  ; samples left to filter
    psrlw                m0, 2
    mova               [r4], m0
.h32_filter_loop:
    mova                 m0, [tlq-62]
    paddw                m1, m2, [tlq-66]
    paddw                m0, [tlq-64]
    pavgw                m1, [tlq-58]
    paddw                m0, [tlq-60]
    sub                 tlq, 32
    sub                  r4, 32
    paddw                m0, m1
    psrlw                m0, 2
    mova               [r4], m0
    sub                 r5d, 16
    jg .h32_filter_loop
    jl .h32_filter_h8                       ; <16 samples left: 8-wide tail
    ; exactly 16 left: final strip with index-0 clamping via blends
    mova                 m0, [tlq-62]           ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
    punpcklwd           xm1, xm0, xm0
    paddw                m2, [tlq-58]           ; 2 3 4 5 6 7 8 9   a b c d e f g h
    paddw                m0, [tlq-60]           ; 1 2 3 4 5 6 7 8   9 a b c d e f g
    vpblendd             m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5   6 7 8 9 a b c d
    vpblendd             m1, [tlq-64], 0xfe     ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
    movzx               r5d, word [tlq-62]
    movzx               r2d, word [tlq-60]
    pavgw                m2, m3
    sub                 r2d, r5d
    paddw                m0, m1
    lea                 r2d, [r2+r5*8+4]        ; 7*p0 + p1 + 4
    paddw                m0, m2
    shr                 r2d, 3                  ; corner: (7*p0 + p1 + 4) >> 3
    psrlw                m0, 2
    mova            [r4-32], m0
    mov             [r4-36], r5w
    mov             [r4-34], r2w
    lea                 tlq, [rsp+158]          ; use the filtered edge
    mov                 r4d, 65
    cmp                  wd, 64
    cmove          maxbased, r4d
    jmp .h32_main
.h32_filter_h8:
    mova                xm0, [tlq-46]            ; 0 1 2 3 4 5 6 7
    pblendw             xm1, xm0, [tlq-48], 0xfe ; 0 0 1 2 3 4 5 6
    paddw               xm2, [tlq-42]            ; 2 3 4 5 6 7 8 9
    paddw               xm0, [tlq-44]            ; 1 2 3 4 5 6 7 8
    vpblendd            xm3, xm1, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5
    lea                 tlq, [rsp+158]
    pavgw               xm2, xm3
    paddw               xm0, xm1
    paddw               xm0, xm2
    psrlw               xm0, 2
    mova            [r4-16], xm0
.h32_main:
    ; Main h=32 loop: one 32-pixel column per iteration, written to a
    ; growing stack area (sub rsp, 64 each pass), then transposed to dst
    ; in 8-column strips. pw_m1024 (m8) detects positions past maxbase
    ; for the upper half; psraw 15 handles the lower half.
    movd                xm6, dyd
    neg            maxbaseq
    vpbroadcastw         m7, [tlq+maxbaseq*2]
    shl            maxbased, 6
    vpbroadcastw         m6, xm6
    lea                 r4d, [maxbaseq+dyq+15*64]
    neg                 dyq
    movd                xm4, r4d
    vpbroadcastd         m8, [pw_m1024]
    lea                  r4, [dyq+63]
    vpbroadcastw         m4, xm4
    or             maxbased, 63
    psubw                m4, [z_base_inc]
.h32_loop:
    mov                  r5, r4
    sar                  r5, 6                   ; base for this column
    movu                 m1, [tlq+r5*2-64]
    movu                 m0, [tlq+r5*2-62]
    pand                 m3, m5, m4              ; 6-bit fraction
    psllw                m3, 9
    psubw                m1, m0
    pmulhrsw             m1, m3                  ; (b-a)*frac, rounded
    pcmpgtw              m2, m8, m4
    paddw                m0, m1
    vpblendvb            m0, m7, m0, m2          ; clamp to edge pixel
    movu                 m2, [tlq+r5*2-32]
    movu                 m1, [tlq+r5*2-30]
    add                  r4, dyq
    sub                 rsp, 64
    psubw                m2, m1
    pmulhrsw             m2, m3
    psraw                m3, m4, 15
    paddw                m4, m6
    mova         [rsp+32*0], m0
    paddw                m1, m2
    vpblendvb            m1, m7, m1, m3
    mova         [rsp+32*1], m1
    dec                  wd
    jz .h32_transpose
    cmp                 r4d, maxbased
    jg .h32_loop
.h32_end_loop:
    ; remaining columns are past maxbase: store the fill pixel
    sub                 rsp, 64
    mova         [rsp+32*0], m7
    mova         [rsp+32*1], m7
    dec                  wd
    jg .h32_end_loop
.h32_transpose:
    ; 8x8-at-a-time word transpose from the stack buffer to dst;
    ; columns were pushed in reverse, hence reading from [r6+64*7] down.
    lea                  r3, [strideq*3]
    lea                  r4, [strideq*5]
    mov                  r8, dstq
    lea                  r5, [strideq+r3*2]      ; strideq*7
.h32_transpose_loop0:
    lea                  r6, [rsp+32]
    lea                  r2, [r8+org_wq*2-16]
.h32_transpose_loop:
    mova                 m0, [r6+64*7]
    mova                 m1, [r6+64*6]
    mova                 m2, [r6+64*5]
    mova                 m3, [r6+64*4]
    mova                 m4, [r6+64*3]
    mova                 m5, [r6+64*2]
    mova                 m6, [r6+64*1]
    mova                 m7, [r6+64*0]
    punpckhwd            m8, m0, m1 ; a3 b3 a2 b2 a1 b1 a0 b0
    punpcklwd            m0, m1     ; a7 b7 a6 b6 a5 b5 a4 b4
    punpckhwd            m1, m2, m3 ; c3 d3 c2 d2 c1 d1 c0 d0
    punpcklwd            m2, m3     ; c7 d7 c6 d6 c5 d5 c4 d4
    punpckhwd            m3, m4, m5 ; e3 f3 e2 f2 e1 f1 e0 f0
    punpcklwd            m4, m5     ; e7 f7 e6 f6 e5 f5 e4 f4
    punpckhwd            m5, m6, m7 ; g3 h3 g2 h2 g1 h1 g0 h0
    punpcklwd            m6, m7     ; g7 h7 g6 h6 g5 h5 g4 h4
    lea                dstq, [r2+strideq*8]
    sub                  r6, 32
    punpckhdq            m7, m8, m1 ; a1 b1 c1 d1 a0 b0 c0 d0
    punpckldq            m8, m1     ; a3 b3 c3 d3 a2 b2 c2 d2
    punpckhdq            m1, m3, m5 ; e1 f1 g1 h1 e0 f0 g0 h0
    punpckldq            m3, m5     ; e3 f3 g3 h3 e2 f2 g2 h2
    punpckhqdq           m5, m7, m1 ;  8  0
    vextracti128 [r2  +strideq*0], m5, 1
    punpcklqdq           m7, m1     ;  9  1
    mova         [dstq+strideq*0], xm5
    punpckhqdq           m1, m8, m3 ; 10  2
    vextracti128 [r2  +strideq*1], m7, 1
    punpcklqdq           m8, m3     ; 11  3
    mova         [dstq+strideq*1], xm7
    punpckhdq            m3, m0, m2 ; a5 b5 c5 d5 a4 b4 c4 d4
    vextracti128 [r2  +strideq*2], m1, 1
    punpckldq            m0, m2     ; a7 b7 c7 d7 a6 b6 c6 d6
    mova         [dstq+strideq*2], xm1
    punpckhdq            m2, m4, m6 ; e5 f5 g5 h5 e4 f4 g4 h4
    vextracti128 [r2  +r3       ], m8, 1
    punpckldq            m4, m6     ; e7 f7 g7 h7 e6 f6 g6 h6
    mova         [dstq+r3       ], xm8
    punpckhqdq           m6, m3, m2 ; 12  4
    vextracti128 [r2  +strideq*4], m6, 1
    punpcklqdq           m3, m2     ; 13  5
    mova         [dstq+strideq*4], xm6
    punpckhqdq           m2, m0, m4 ; 14  6
    vextracti128 [r2  +r4       ], m3, 1
    punpcklqdq           m0, m4     ; 15  7
    mova         [dstq+r4       ], xm3
    vextracti128 [r2  +r3*2     ], m2, 1
    mova         [dstq+r3*2     ], xm2
    vextracti128 [r2  +r5       ], m0, 1
    mova         [dstq+r5       ], xm0
    lea                  r2, [dstq+strideq*8]
    cmp                  r6, rsp
    jae .h32_transpose_loop
    add                 rsp, 64*8               ; release this strip's buffer
    sub              org_wd, 8
    jg .h32_transpose_loop0
.h32_end:
    RET
.h64:
    ; h=64 path: same pavgw/pw_3 edge smoothing as .h32, but the strip
    ; loop always ends on a 16-sample tail (no 8-wide variant needed).
    %assign stack_offset org_stack_offset
    ALLOC_STACK        -256, 10
    lea            maxbased, [wq+63]
    test             angled, 0x400
    jnz .h64_main
    vpbroadcastd         m2, [pw_3]
    movu                 m0, [tlq-28]       ; 3 4 5 6 7 8 9 a   b c d e f g h i
    punpckhwd            m1, m0, m0
    vpblendd             m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b   c d e f g h i i
    paddw                m0, [tlq-30]       ; 2 3 4 5 6 7 8 9   a b c d e f g h
    paddw                m1, m2
    paddw                m0, [tlq-32]       ; 1 2 3 4 5 6 7 8   9 a b c d e f g
    pavgw                m1, [tlq-34]       ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
    lea                  r4, [rsp+224]
    paddw                m0, m1
    lea                 r5d, [wq+32]        ; samples left to filter
    psrlw                m0, 2
    mova               [r4], m0
.h64_filter_loop:
    mova                 m0, [tlq-62]
    paddw                m1, m2, [tlq-66]
    paddw                m0, [tlq-64]
    pavgw                m1, [tlq-58]
    paddw                m0, [tlq-60]
    sub                 tlq, 32
    sub                  r4, 32
    paddw                m0, m1
    psrlw                m0, 2
    mova               [r4], m0
    sub                 r5d, 16
    jg .h64_filter_loop
    ; final strip with index-0 clamping via blends
    mova                 m0, [tlq-62]           ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
    punpcklwd           xm1, xm0, xm0
    paddw                m2, [tlq-58]           ; 2 3 4 5 6 7 8 9   a b c d e f g h
    paddw                m0, [tlq-60]           ; 1 2 3 4 5 6 7 8   9 a b c d e f g
    vpblendd             m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5   6 7 8 9 a b c d
    vpblendd             m1, [tlq-64], 0xfe     ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
    lea                 tlq, [rsp+254]          ; use the filtered edge
    pavgw                m2, m3
    paddw                m0, m1
    paddw                m0, m2
    psrlw                m0, 2
    mova            [r4-32], m0
.h64_main:
    ; Main h=64 loop: one 64-pixel column per pass into a stack buffer
    ; (sub rsp, 128 each). Past-maxbase clamping thresholds for the four
    ; 16-pixel quarters: m9 (-3072), m8 (-2048), m7 (-1024), and the
    ; position sign bit for the last quarter.
    neg            maxbaseq
    movd                xm4, dyd
    vpbroadcastw         m6, [tlq+maxbaseq*2]
    shl            maxbased, 6
    vpbroadcastw         m4, xm4
    lea                 r4d, [maxbaseq+dyq+15*64]
    neg                 dyq
    vpbroadcastd         m7, [pw_m1024]
    movd                xm3, r4d
    lea                  r4, [dyq+63]
    paddw                m8, m7, m7              ; -2048
    vpbroadcastw         m3, xm3
    or             maxbased, 63
    paddw                m9, m8, m7              ; -3072
    psubw                m3, [z_base_inc]
.h64_loop:
    mov                  r5, r4
    sar                  r5, 6                   ; base for this column
    movu                 m1, [tlq+r5*2-128]
    movu                 m0, [tlq+r5*2-126]
    pand                 m2, m5, m3              ; 6-bit fraction
    psllw                m2, 9
    psubw                m1, m0
    pmulhrsw             m1, m2
    sub                 rsp, 128
    paddw                m0, m1
    pcmpgtw              m1, m9, m3
    vpblendvb            m0, m6, m0, m1
    mova         [rsp+32*0], m0
    movu                 m1, [tlq+r5*2-96]
    movu                 m0, [tlq+r5*2-94]
    psubw                m1, m0
    pmulhrsw             m1, m2
    paddw                m0, m1
    pcmpgtw              m1, m8, m3
    vpblendvb            m0, m6, m0, m1
    mova         [rsp+32*1], m0
    movu                 m1, [tlq+r5*2-64]
    movu                 m0, [tlq+r5*2-62]
    psubw                m1, m0
    pmulhrsw             m1, m2
    paddw                m0, m1
    pcmpgtw              m1, m7, m3
    vpblendvb            m0, m6, m0, m1
    mova         [rsp+32*2], m0
    movu                 m1, [tlq+r5*2-32]
    movu                 m0, [tlq+r5*2-30]
    psubw                m1, m0
    pmulhrsw             m1, m2
    add                  r4, dyq
    psraw                m2, m3, 15
    paddw                m3, m4
    paddw                m0, m1
    vpblendvb            m0, m6, m0, m2
    mova         [rsp+32*3], m0
    dec                  wd
    jz .h64_transpose
    cmp                 r4d, maxbased
    jg .h64_loop
.h64_end_loop:
    ; remaining columns are past maxbase: store the fill pixel
    sub                 rsp, 128
    mova         [rsp+32*0], m6
    mova         [rsp+32*1], m6
    mova         [rsp+32*2], m6
    mova         [rsp+32*3], m6
    dec                  wd
    jg .h64_end_loop
.h64_transpose:
    ; Transpose the stacked columns to dst, 16 columns x 8 rows per
    ; inner pass; vinserti128 pairs rows 8 apart so whole 32-byte rows
    ; come out of the qdq stage directly.
    lea                  r2, [strideq*3]
    lea                  r3, [strideq*5]
    mov                  r5, dstq
    lea                  r4, [strideq+r2*2]      ; strideq*7
.h64_transpose_loop0:
    lea                  r6, [rsp+112]
    lea                dstq, [r5+org_wq*2-32]
.h64_transpose_loop:
    mova                xm0, [r6+128*15]
    vinserti128          m0, [r6+128* 7], 1
    mova                xm1, [r6+128*14]
    vinserti128          m1, [r6+128* 6], 1
    mova                xm2, [r6+128*13]
    vinserti128          m2, [r6+128* 5], 1
    mova                xm3, [r6+128*12]
    vinserti128          m3, [r6+128* 4], 1
    mova                xm4, [r6+128*11]
    vinserti128          m4, [r6+128* 3], 1
    mova                xm5, [r6+128*10]
    vinserti128          m5, [r6+128* 2], 1
    mova                xm6, [r6+128* 9]
    vinserti128          m6, [r6+128* 1], 1
    mova                xm7, [r6+128* 8]
    vinserti128          m7, [r6+128* 0], 1
    punpckhwd            m8, m0, m1
    punpcklwd            m0, m1
    punpckhwd            m1, m2, m3
    punpcklwd            m2, m3
    punpckhwd            m3, m4, m5
    punpcklwd            m4, m5
    punpckhwd            m5, m6, m7
    punpcklwd            m6, m7
    sub                  r6, 16
    punpckhdq            m7, m8, m1
    punpckldq            m8, m1
    punpckhdq            m1, m3, m5
    punpckldq            m3, m5
    punpckhqdq           m5, m7, m1
    punpcklqdq           m7, m1
    punpckhqdq           m1, m8, m3
    punpcklqdq           m8, m3
    punpckhdq            m3, m0, m2
    mova   [dstq+strideq*0], m5
    punpckldq            m0, m2
    mova   [dstq+strideq*1], m7
    punpckhdq            m2, m4, m6
    mova   [dstq+strideq*2], m1
    punpckldq            m4, m6
    mova   [dstq+r2       ], m8
    punpckhqdq           m6, m3, m2
    mova   [dstq+strideq*4], m6
    punpcklqdq           m3, m2
    mova   [dstq+r3       ], m3
    punpckhqdq           m2, m0, m4
    mova   [dstq+r2*2     ], m2
    punpcklqdq           m0, m4
    mova   [dstq+r4       ], m0
    lea                dstq, [dstq+strideq*8]
    cmp                  r6, rsp
    jae .h64_transpose_loop
    add                 rsp, 128*16             ; release this strip's buffer
    sub              org_wd, 16
    jg .h64_transpose_loop0
.h64_end:
    RET
3741
%macro FILTER_1BLK 5 ; dst, src, tmp, shuf, bdmax
; Filter one 4x2 block for recursive intra prediction:
; pshufb gathers the 7 input samples into pair order, then the four
; broadcast word-pairs (q0000..q3333) are dotted against the tap sets
; in m2-m5 (pmaddwd), biased by m1 (rounding, pd_8 from the function
; head), shifted >>4, packed to unsigned words and clamped to bdmax.
; Both 128-bit lanes compute the same block (vinserti128 duplicates).
%ifnum %4
    pshufb             xm%2, xm%4
%else
    pshufb             xm%2, %4               ; shuffle given as a memory operand
%endif
    vinserti128         m%2, xm%2, 1          ; duplicate low lane into high lane
    pshufd              m%1, m%2, q0000
    pmaddwd             m%1, m2
    pshufd              m%3, m%2, q1111
    pmaddwd             m%3, m3
    paddd               m%1, m1               ; + rounding bias
    paddd               m%1, m%3
    pshufd              m%3, m%2, q2222
    pmaddwd             m%3, m4
    paddd               m%1, m%3
    pshufd              m%3, m%2, q3333
    pmaddwd             m%3, m5
    paddd               m%1, m%3
    psrad               m%1, 4
    packusdw            m%1, m%1
    pminsw              m%1, m%5              ; clamp to bitdepth_max
%endmacro
3765
%macro FILTER_2BLK 7 ; dst, src, tmp_dst, tmp_src, tmp, shuf, bdmax
; Filter two independent 4x2 blocks at once: the source's high lane is
; split off via vpermq into %4 while the low lane is duplicated, then
; both run the same tap pipeline as FILTER_1BLK (taps m2-m5, bias m1,
; >>4, pack, clamp). Results end up interleaved low/high in %1.
    pshufb              m%2, m%6
    vpermq              m%4, m%2, q3232       ; second block (high lane) in both lanes
    vinserti128         m%2, xm%2, 1          ; first block (low lane) in both lanes
    pshufd              m%1, m%2, q0000
    pshufd              m%3, m%4, q0000
    pmaddwd             m%1, m2
    pmaddwd             m%3, m2
    paddd               m%1, m1               ; + rounding bias
    paddd               m%3, m1
    pshufd              m%5, m%2, q1111
    pmaddwd             m%5, m3
    paddd               m%1, m%5
    pshufd              m%5, m%4, q1111
    pmaddwd             m%5, m3
    paddd               m%3, m%5
    pshufd              m%5, m%2, q2222
    pmaddwd             m%5, m4
    paddd               m%1, m%5
    pshufd              m%5, m%4, q2222
    pmaddwd             m%5, m4
    paddd               m%3, m%5
    pshufd              m%5, m%2, q3333
    pmaddwd             m%5, m5
    paddd               m%1, m%5
    pshufd              m%5, m%4, q3333
    pmaddwd             m%5, m5
    paddd               m%3, m%5
    psrad               m%1, 4
    psrad               m%3, 4
    packusdw            m%1, m%3              ; pack both blocks together
    pminsw              m%1, m%7              ; clamp to bitdepth_max
%endmacro
3799
3800; The ipred_filter SIMD processes 4x2 blocks in the following order which
3801; increases parallelism compared to doing things row by row. One redundant
3802; block is calculated for w8 and w16, two for w32.
3803;     w4     w8       w16             w32
3804;     1     1 2     1 2 3 5     1 2 3 5 b c d f
3805;     2     2 3     2 4 5 7     2 4 5 7 c e f h
3806;     3     3 4     4 6 7 9     4 6 7 9 e g h j
3807; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___
3808;           5       8           8       i
3809
; ipred_filter_16bpc(dst, stride, tl, w, h, filter_idx, ..., bitdepth_max)
; Entry: loads the per-mode filter taps (filter_idx*64 bytes of int8,
; sign-extended to words into m2-m5), the rounding bias pd_8 into m1,
; the last 4 top-left samples into m0, then tail-jumps through the
; width jump table. Registers m1-m5 stay live for FILTER_*BLK.
cglobal ipred_filter_16bpc, 3, 9, 0, dst, stride, tl, w, h, filter
%assign org_stack_offset stack_offset
%define base r6-ipred_filter_16bpc_avx2_table
    lea                  r6, [filter_intra_taps]
    tzcnt                wd, wm                 ; log2(w) indexes the table
%ifidn filterd, filterm
    movzx           filterd, filterb
%else
    movzx           filterd, byte filterm
%endif
    shl             filterd, 6                  ; *64 = bytes per tap set
    add             filterq, r6
    lea                  r6, [ipred_filter_16bpc_avx2_table]
    vbroadcasti128       m0, [tlq-6]
    movsxd               wq, [r6+wq*4]
    vpbroadcastd         m1, [base+pd_8]        ; rounding bias for >>4
    pmovsxbw             m2, [filterq+16*0]
    pmovsxbw             m3, [filterq+16*1]
    pmovsxbw             m4, [filterq+16*2]
    pmovsxbw             m5, [filterq+16*3]
    add                  wq, r6
    mov                  hd, hm
    jmp                  wq
.w4:
    ; One FILTER_1BLK per 4x2 block; the previous output (low half of
    ; m6) plus the left column fetched via [tlq+hq*2] feed the next.
    WIN64_SPILL_XMM      10
    mova                xm8, [base+filter_shuf2]
    vpbroadcastw         m9, r8m ; bitdepth_max
    lea                  r7, [6+hq*2]
    sub                 tlq, r7                 ; tlq -> bottom of left column
    jmp .w4_loop_start
.w4_loop:
    pinsrq              xm0, [tlq+hq*2], 0      ; next two left-edge pixels
    lea                dstq, [dstq+strideq*2]
.w4_loop_start:
    FILTER_1BLK           6, 0, 7, 8, 9
    vextracti128        xm0, m6, 1
    movq   [dstq+strideq*0], xm6
    movq   [dstq+strideq*1], xm0
    sub                  hd, 2
    jg .w4_loop
    RET
ALIGN function_align
.w8:
    ; Two 4x2 blocks per row pair (see the pipeline diagram above the
    ; macros): block outputs are rewound via vpermq/psrldq into the
    ; "_056 4321" input layout expected by .main / FILTER_2BLK.
    %assign stack_offset stack_offset - stack_size_padded
    WIN64_SPILL_XMM      16
    vbroadcasti128      m14, [base+filter_shuf3]
    vpbroadcastw        m15, r8m ; bitdepth_max
    FILTER_1BLK          10, 0, 7, [base+filter_shuf2], 15
    vpermq               m6, m10, q1302         ; ____ ____ | ____ 4321
    pslldq               m8, m0, 4
    psrldq               m7, m6, 2
    psrldq               m0, m6, 10
    punpcklwd            m7, m0
    vpblendd             m8, m6, 0x33           ; _0__ 4321 | ____ 4321
    vpblendd             m8, m7, 0x40           ; _056 4321 | ____ 4321
    vpblendd             m8, [tlq-6], 0x30      ; _056 4321 | ____ 4321
    lea                  r7, [16+hq*2]
    sub                 tlq, r7                 ; tlq -> bottom of left column
    jmp .w8_loop_start
.w8_loop:
    vpermq               m8, m9, q1302          ; ____ 4321 | ____ 4321
    vpermq               m6, m9, q2031
    psrldq               m0, m6, 2
    psrldq               m6, 10
    punpcklwd            m6, m0
    vpblendd             m8, m7, 0x80           ; _0__ 4321 | ____ 4321
    vpblendd             m8, m6, 0x40           ; _056 4321 | ____ 4321
    mova                m10, m9
.w8_loop_start:
    vpblendd             m8, [tlq+hq*2], 0x0C   ; _056 4321 | _056 4321
    call .main
    vpblendd            m10, m9, 0xCC
    mova         [dstq+strideq*0], xm10
    vextracti128 [dstq+strideq*1], m10, 1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w8_loop
    RET
ALIGN function_align
.w16:
    ; Four blocks per row pair, staged across FILTER_1BLK/FILTER_2BLK
    ; plus .main per the diagonal scheduling diagram above the macros.
    ; Ends with `ret` (not RET) so .w32 can call it as a subroutine.
    %assign stack_offset stack_offset - stack_size_padded
    ALLOC_STACK          32, 16
    vpbroadcastw        m15, r8m ; bitdepth_max
    sub                  hd, 2
    TAIL_CALL .w16_main, 0
.w16_main:
    ; prologue: compute the first two output rows and prime the
    ; rotating block state (m9/m11/m12/m13) used by the steady loop
    mova               xm10, [base+filter_shuf2]
    FILTER_1BLK          13, 0, 6, 10, 15
    vpermq              m12, m13, q3120
    mova               xm14, [base+filter_shuf3]
    vinserti128         m14, [base+filter_shuf1], 1
    vpbroadcastq         m0, [tlq+10]
    vpblendd             m0, [tlq-16], 0x4C     ; ___0 4321 | _056 ____
    psrldq               m6, m12, 8
    vpblendd             m0, m6, 0x03           ; ___0 4321 | _056 4321
    punpcklwd            m6, m12
    vpblendd             m0, m6, 0x80           ; 56_0 4321 | _056 4321
    FILTER_2BLK          12, 0, 6, 7, 8, 14, 15
    vpblendd            m13, m12, 0xCC
    vpermq              m12, m12, q2031         ; 6___ 5___
    psrldq              xm6, xm12, 2
    psrldq              xm8, xm12, 12
    vpblendd            xm6, xm8, 0x01
    pblendw             xm6, [tlq+10], 0xF8     ; 4321 056_
    FILTER_1BLK          11, 6, 8, 10, 15
    vpermq              m11, m11, q3120
    pshufd               m9, m11, q1032
    movu                 m8, [tlq+6]            ; __43 210_ | ____ ____
    pshufd               m8, m8, q3021          ; __0_ 4321 | ____ ____
    pshufhw              m8, m8, q3201          ; ___0 4321 | ____ ____
    vpblendd             m9, m8, 0x70           ; ___0 4321 | ____ 4321
    mova         [dstq+strideq*0], xm13
    vextracti128 [dstq+strideq*1], m13, 1
    lea                  r7, [20+hq*2]
    sub                 tlq, r7                 ; tlq -> bottom of left column
    vpermq               m6, m12, q0123         ; ____ 4321 | ____ 4321
    jmp .w16_loop_start
.w16_loop:
    vpermq              m13, m13, q3322
    vpermq              m11,  m9, q2020
    vpermq               m9,  m9, q1302
    vpermq               m6, m12, q0123
    psrldq               m7, 4
    vpblendd            m13, m10, 0xCC
    vpblendd             m9, m7, 0x40
    mova                 m0, [rsp+8]            ; saved inputs from last pass
    mova         [dstq+strideq*0], xm13
    vextracti128 [dstq+strideq*1], m13, 1
.w16_loop_start:
    mova                m13, m12
    vpblendd             m0, [tlq+hq*2], 0x0C   ; merge next left-edge pixels
    psrldq               m7, m12, 8
    punpcklwd            m7, m12
    vpblendd             m0, m6, 0x33           ; ___0 4321 | _056 4321
    vpblendd             m0, m7, 0x80           ; 56_0 4321 | _056 4321
    FILTER_2BLK          10, 0, 6, 7, 8, 14, 15
    vpermq              m12, m10, q2031
    mova            [rsp+8], m0                 ; stash inputs for next pass
    psrldq               m8, m11, 8
    psrldq              xm6, xm12, 2
    psrldq              xm7, xm12, 10
    psrldq              xm0, xm13, 2
    punpcklwd            m8, m11
    punpcklwd           xm7, xm6
    vpblendd             m8, m9, 0x73           ; 56_0 4321 | ____ 4321
    vpblendd             m8, m7, 0x04           ; 56_0 4321 | __56 4321
    vpblendd             m8, m0, 0x08           ; 56_0 4321 | _056 4321
    call .main
    vpermq               m8, m11, q3120
    vpblendd             m6, m8, m9, 0xCC
    mova         [dstq+strideq*0+16], xm6
    vextracti128 [dstq+strideq*1+16], m6, 1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w16_loop
    ; epilogue: one trailing block to finish the bottom-right corner
    vpermq               m8, m9, q3120
    vextracti128        xm0, m8, 1              ; 4321 ____
    pshufd             xm11, xm11, q1032
    vpblendd            xm0, xm11, 0x02         ; 4321 0___
    psrldq              xm6, xm8, 2
    psrldq              xm7, xm8, 12
    pblendw             xm0, xm6, 0x4           ; 4321 05__
    pblendw             xm0, xm7, 0x2           ; 4321 056_
    FILTER_1BLK           6, 0, 7, [base+filter_shuf2], 15
    vpermq              m12, m13, q1302
    vpblendd            m12, m10, 0xCC
    vpblendd             m9, m6, 0xCC
    mova         [dstq+strideq*0+ 0], xm12
    mova         [dstq+strideq*0+16], xm9
    vextracti128 [dstq+strideq*1+ 0], m12, 1
    vextracti128 [dstq+strideq*1+16], m9, 1
    ret                                         ; plain ret: callable from .w32
3982ALIGN function_align
3983.w32:
3984    %assign stack_offset org_stack_offset
3985    ALLOC_STACK          64, 16
3986    vpbroadcastw        m15, r8m ; bitdepth_max
3987    sub                  hd, 2
3988    lea                  r3, [dstq+32]
3989    lea                 r5d, [hd*2+20]
3990    call .w16_main
3991    mov                dstq, r3
3992    lea                 tlq, [tlq+r5+32]
3993    sub                 r5d, 20
3994    shr                 r5d, 1
3995    sub                 r5d, 2
3996    lea                  r4, [dstq+strideq*2-2]
3997DEFINE_ARGS dst, stride, tl, stride3, left, h
3998    lea            stride3q, [strideq*3]
3999    movu                 m8, [tlq-6]                        ; 4321 0___
4000    mova               xm10, [base+filter_shuf2]
4001    pinsrw              xm0, xm8, [dstq+strideq*0-2], 2
4002    pinsrw              xm0, xm0, [dstq+strideq*1-2], 1     ; 4321 056_
4003    pinsrw              xm9, [leftq+strideq*0], 5
4004    pinsrw              xm9, [leftq+strideq*1], 4
4005    FILTER_1BLK          13, 0, 6, 10, 15
4006    vpermq              m12, m13, q3120
4007    mova               xm14, [base+filter_shuf3]
4008    vinserti128         m14, [base+filter_shuf1], 1
4009    psrldq               m6, m12, 8
4010    punpcklwd            m7, m6, m12
4011    vpblendd             m0, m6, 0x03           ; ___0 ____ | _0__ 4321
4012    vpblendd             m0, m7, 0x80           ; 56_0 ____ | _0__ 4321
4013    vpblendd             m0, m8, 0x30           ; 56_0 4321 | _0__ 4321
4014    vpblendd             m0, m9, 0x04           ; 56_0 4321 | _056 4321
4015    FILTER_2BLK          12, 0, 6, 7, 8, 14, 15
4016    vpblendd            m13, m12, 0xCC
4017    pinsrw              xm9, [leftq+strideq*2], 3
4018    pinsrw              xm9, [leftq+stride3q ], 2
4019    lea               leftq, [leftq+strideq*4]
4020    pinsrw              xm9, [leftq+strideq*0], 1
4021    pinsrw              xm9, [leftq+strideq*1], 0
4022    movq           [rsp+32], xm9
4023    mov                 r7d, 1
4024    pslldq               m8, m9, 4
4025    vpblendd             m0, m8, 0x0C           ; ___0 ____ | _056 ____
4026    vpermq              m12, m12, q2031         ; 6___ 5___
4027    psrldq              xm6, xm12, 2
4028    psrldq              xm7, xm12, 12
4029    vpblendd            xm6, xm7, 0x01          ; ____ _56_
4030    pblendw             xm6, [tlq+10], 0xF8     ; 4321 056_
4031    FILTER_1BLK          11, 6, 7, 10, 15
4032    vpermq              m11, m11, q3120
4033    pshufd               m9, m11, q1032
4034    vbroadcasti128       m8, [tlq+22]           ; __43 210_ | ____ ____
4035    pshufd               m8, m8, q3021          ; __0_ 4321 | ____ ____
4036    pshufhw              m8, m8, q3201          ; ___0 4321 | ____ ____
4037    vpblendd             m9, m8, 0x70           ; ___0 4321 | ____ 4321
4038    mova         [dstq+strideq*0], xm13
4039    vextracti128 [dstq+strideq*1], m13, 1
4040    vpermq               m6, m12, q0123         ; ____ 4321 | ____ 4321
4041    jmp .w32_loop_start
4042.w32_loop_last:
4043    mova                 m0, [rsp+0]
4044    jmp .w32_loop
4045.w32_loop_left:
4046    mova                 m0, [rsp+0]
4047    vpblendd             m0, [rsp+32+r7*4-12], 0x0C
4048    dec                 r7d
4049    jg .w32_loop
4050    cmp                  hd, 2
4051    je .w32_loop
4052    pinsrw              xm6, [rsp+32], 6
4053    pinsrw              xm6, [leftq+strideq*2], 5
4054    pinsrw              xm6, [leftq+stride3q ], 4
4055    lea               leftq, [leftq+strideq*4]
4056    pinsrw              xm6, [leftq+strideq*0], 3
4057    pinsrw              xm6, [leftq+strideq*1], 2
4058    pinsrw              xm6, [leftq+strideq*2], 1
4059    pinsrw              xm6, [leftq+stride3q ], 0
4060    lea               leftq, [leftq+strideq*4]
4061    movu           [rsp+36], xm6
4062    pinsrw              xm6, [leftq+strideq*0], 1
4063    pinsrw              xm6, [leftq+strideq*1], 0
4064    movd           [rsp+32], xm6
4065    mov                 r7d, 4
4066.w32_loop:
4067    vpermq              m13, m13, q3322
4068    vpermq              m11,  m9, q2020
4069    vpermq               m9,  m9, q1302
4070    vpermq               m6, m12, q0123
4071    psrldq               m7, 4
4072    vpblendd            m13, m10, 0xCC
4073    vpblendd             m9, m7, 0x40           ; ___0 4321 | ____ 4321
4074    mova         [dstq+strideq*0], xm13
4075    vextracti128 [dstq+strideq*1], m13, 1
4076.w32_loop_start:
4077    mova                m13, m12
4078    psrldq               m7, m12, 8
4079    punpcklwd            m7, m12
4080    vpblendd             m0, m6, 0x33           ; ___0 4321 | _056 4321
4081    vpblendd             m0, m7, 0x80           ; 56_0 4321 | _056 4321
4082    FILTER_2BLK          10, 0, 6, 7, 8, 14, 15
4083    vpermq              m12, m10, q2031
4084    mova            [rsp+0], m0
4085    psrldq               m8, m11, 8
4086    psrldq              xm6, xm12, 2
4087    psrldq              xm7, xm12, 10
4088    psrldq              xm0, xm13, 2
4089    punpcklwd            m8, m11
4090    punpcklwd           xm7, xm6
4091    vpblendd             m8, m9, 0x73           ; 56_0 4321 | ____ 4321
4092    vpblendd             m8, m7, 0x04           ; 56_0 4321 | __56 4321
4093    vpblendd             m8, m0, 0x08           ; 56_0 4321 | _056 4321
4094    call .main
4095    vpermq               m8, m11, q3120
4096    vpblendd             m6, m8, m9, 0xCC
4097    mova         [dstq+strideq*0+16], xm6
4098    vextracti128 [dstq+strideq*1+16], m6, 1
4099    lea                dstq, [dstq+strideq*2]
4100    sub                  hd, 2
4101    jg .w32_loop_left
4102    jz .w32_loop_last
4103    vpermq               m8, m9, q3120
4104    vextracti128        xm0, m8, 1              ; 4321 ____
4105    pshufd             xm11, xm11, q1032
4106    vpblendd            xm0, xm11, 0x02         ; 4321 0___
4107    psrldq              xm6, xm8, 2
4108    psrldq              xm7, xm8, 12
4109    pblendw             xm0, xm6, 0x4           ; 4321 05__
4110    pblendw             xm0, xm7, 0x2           ; 4321 056_
4111    FILTER_1BLK           6, 0, 7, [base+filter_shuf2], 15
4112    vpermq              m12, m13, q1302
4113    vpblendd            m12, m10, 0xCC
4114    vpblendd             m9, m6, 0xCC
4115    mova         [dstq+strideq*0+ 0], xm12
4116    mova         [dstq+strideq*0+16], xm9
4117    vextracti128 [dstq+strideq*1+ 0], m12, 1
4118    vextracti128 [dstq+strideq*1+16], m9, 1
4119    RET
4120.main:
4121    FILTER_2BLK           9, 8, 6, 7, 0, 14, 15
4122    ret
4123
; t0 (x86inc temp-register alias) is used below to address the jump/data
; tables; pick a register that is free of argument duty on each ABI.
%if WIN64
DECLARE_REG_TMP 5
%else
DECLARE_REG_TMP 7
%endif
4129
%macro IPRED_CFL 1 ; ac in, unpacked pixels out
    ; Computes dc + ((alpha * ac) >> 6), rounded, per signed 16-bit lane.
    ; Register contract (established by the .s* callers below):
    ;   m0 = dc (broadcast), m1 = alpha (broadcast, signed),
    ;   m2 = |alpha| << 9, so pmulhrsw yields |alpha*ac| >> 6 with rounding.
    psignw               m3, m%1, m1    ; m3 = ac with alpha's sign applied
    pabsw               m%1, m%1        ; |ac| (must come after m3 is taken)
    pmulhrsw            m%1, m2         ; (|ac| * (|alpha|<<9) + 2^14) >> 15
    psignw              m%1, m3         ; restore the sign of alpha*ac
    paddw               m%1, m0         ; + dc
%endmacro
4137
cglobal ipred_cfl_top_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
; dc is the average of the top edge only.  Reuses ipred_cfl_left's .h* sum
; handlers (indexed here by log2(w) instead of log2(h)), which then tail-jump
; to the shared .s* splat code via wq.
    movifnidn            hd, hm
    add                 tlq, 2                  ; skip top-left px; tlq -> top row
    movd                xm4, wd
    pxor                 m6, m6
    vpbroadcastw         m7, r7m                ; bitdepth_max, pixel clamp in .s*
    pavgw               xm4, xm6                ; xm4 = (w+1)>>1, rounding bias
    tzcnt                wd, wd                 ; log2(w); w is a power of two
    movd                xm5, wd                 ; shift count for the divide by w
    movu                 m0, [tlq]              ; first 16 top pixels (w32 adds more)
    lea                  t0, [ipred_cfl_left_16bpc_avx2_table]
    movsxd               r6, [t0+wq*4]          ; sum handler (.h4/.h8/.h16/.h32)
    add                  r6, t0
    add                  t0, ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table
    movsxd               wq, [t0+wq*4]          ; splat handler (.s*) for later
    add                  wq, t0
    movifnidn           acq, acmp
    jmp                  r6
4156
cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
; dc is the average of the left edge only.  h is a power of two, so the
; division is an exact shift.  The .h* ladder below is also entered from
; ipred_cfl_top (with w playing the role of h).
    mov                  hd, hm ; zero upper half
    sub                 tlq, hq
    movd                xm4, hd
    sub                 tlq, hq                 ; tlq -= 2*h: leftmost left pixel
    pxor                 m6, m6
    vpbroadcastw         m7, r7m                ; bitdepth_max, pixel clamp in .s*
    pavgw               xm4, xm6                ; xm4 = (h+1)>>1, rounding bias
    tzcnt               r6d, hd                 ; log2(h)
    movd                xm5, r6d                ; shift count for the divide by h
    movu                 m0, [tlq]              ; first 16 pixels (.h32 adds more)
    lea                  t0, [ipred_cfl_left_16bpc_avx2_table]
    movsxd               r6, [t0+r6*4]
    add                  r6, t0
    add                  t0, ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table
    tzcnt                wd, wd
    movsxd               wq, [t0+wq*4]          ; .s* splat handler, taken in .h4
    add                  wq, t0
    movifnidn           acq, acmp
    jmp                  r6
.h32:
    paddw                m0, [tlq+32]           ; fold pixels 16..31 into the 16 sums
.h16:
    vextracti128        xm1, m0, 1
    paddw               xm0, xm1                ; 16 -> 8 word sums
.h8:
    psrldq              xm1, xm0, 8
    paddw               xm0, xm1                ; 8 -> 4 word sums (still fit 16 bit)
.h4:
    punpcklwd           xm0, xm6                ; widen to dwords before final adds
    psrlq               xm1, xm0, 32
    paddd               xm0, xm1
    psrldq              xm1, xm0, 8
    paddd               xm0, xm1                ; total edge sum in lane 0
    paddd               xm0, xm4                ; + h/2
    psrld               xm0, xm5                ; / h (exact: h is a power of two)
    vpbroadcastw         m0, xm0                ; broadcast dc to all lanes
    jmp                  wq
4195
cglobal ipred_cfl_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
; Full CfL DC prediction: dc = (sum(top) + sum(left) + (w+h)/2) / (w+h), then
; every output pixel is clip(dc + (alpha*ac >> 6), 0, bitdepth_max).
; Flow: .h<h> sums the left edge, jumps (via wq) to .w<w> which adds the top
; edge and divides, then falls into .s<w> which applies IPRED_CFL and stores.
; When w != h, w+h is not a power of two: the psrld by xm5 (tzcnt of w+h)
; removes the power-of-2 factor and a fixed-point reciprocal multiply
; (0xAAAB ~ 2/3, 0x6667 ~ 2/5, each followed by psrlw 1) handles the
; remaining factor of 3 or 5.
    movifnidn            hd, hm
    movifnidn            wd, wm
    tzcnt               r6d, hd
    lea                 t0d, [wq+hq]
    movd                xm4, t0d                ; w+h
    tzcnt               t0d, t0d
    movd                xm5, t0d                ; log2 of power-of-2 factor of w+h
    lea                  t0, [ipred_cfl_16bpc_avx2_table]
    tzcnt                wd, wd
    movsxd               r6, [t0+r6*4]          ; .h* handler, indexed by log2(h)
    movsxd               wq, [t0+wq*4+4*4]      ; .w* handler, indexed by log2(w)
    psrlw               xm4, 1                  ; (w+h)/2, rounding bias
    pxor                 m6, m6
    vpbroadcastw         m7, r7m                ; bitdepth_max
    add                  r6, t0
    add                  wq, t0
    movifnidn           acq, acmp
    jmp                  r6
.h4:
    movq                xm0, [tlq-8]            ; 4 left pixels
    jmp                  wq
.w4:
    movq                xm1, [tlq+2]            ; 4 top pixels
    paddw                m0, m4                 ; + (w+h)/2 bias
    paddw                m0, m1
    psrlq                m1, m0, 32
    paddw                m0, m1
    psrld                m1, m0, 16
    paddw                m0, m1                 ; horizontal word sum
    cmp                  hd, 4
    jg .w4_mul
    psrlw               xm0, 3                  ; w==h==4: exact /8
    jmp .w4_end
.w4_mul:
    vextracti128        xm1, m0, 1
    paddw               xm0, xm1
    lea                 r2d, [hq*2]
    mov                 r6d, 0xAAAB6667
    shrx                r6d, r6d, r2d           ; h==8 -> 0xAAAB; h==16: count is
                                                ; masked mod 32, low word = 0x6667
    punpckhwd           xm1, xm0, xm6
    punpcklwd           xm0, xm6
    paddd               xm0, xm1
    movd                xm1, r6d
    psrld               xm0, 2                  ; / power-of-2 factor of w+h
    pmulhuw             xm0, xm1                ; * reciprocal (2/3 or 2/5)
    psrlw               xm0, 1                  ; completes the / (w+h)
.w4_end:
    vpbroadcastw         m0, xm0                ; broadcast dc
.s4:
    vpbroadcastw         m1, alpham             ; alpha (signed)
    lea                  r6, [strideq*3]
    pabsw                m2, m1
    psllw                m2, 9                  ; |alpha|<<9 for IPRED_CFL
.s4_loop:
    mova                 m4, [acq]              ; 4 rows x 4 ac values
    IPRED_CFL             4
    pmaxsw               m4, m6                 ; clamp to [0, bitdepth_max]
    pminsw               m4, m7
    vextracti128        xm5, m4, 1
    movq   [dstq+strideq*0], xm4
    movq   [dstq+strideq*2], xm5
    movhps [dstq+strideq*1], xm4
    movhps [dstq+r6       ], xm5
    lea                dstq, [dstq+strideq*4]
    add                 acq, 32
    sub                  hd, 4
    jg .s4_loop
    RET
ALIGN function_align
.h8:
    mova                xm0, [tlq-16]           ; 8 left pixels
    jmp                  wq
.w8:
    vextracti128        xm1, m0, 1
    paddw               xm0, [tlq+2]            ; + 8 top pixels
    paddw               xm0, xm4                ; + (w+h)/2 bias
    paddw               xm0, xm1
    psrld               xm1, xm0, 16
    paddw               xm0, xm1
    pblendw             xm0, xm6, 0xAA          ; zero odd words for dword adds
    psrlq               xm1, xm0, 32
    paddd               xm0, xm1
    psrldq              xm1, xm0, 8
    paddd               xm0, xm1
    psrld               xm0, xm5                ; / power-of-2 factor of w+h
    cmp                  hd, 8
    je .w8_end                                  ; w==h: division was exact
    mov                 r6d, 0xAAAB             ; ~2/3 for h==4/16 (w+h = 12/24)
    mov                 r2d, 0x6667             ; ~2/5 for h==32   (w+h = 40)
    cmp                  hd, 32
    cmovz               r6d, r2d
    movd                xm1, r6d
    pmulhuw             xm0, xm1
    psrlw               xm0, 1
.w8_end:
    vpbroadcastw         m0, xm0                ; broadcast dc
.s8:
    vpbroadcastw         m1, alpham
    lea                  r6, [strideq*3]
    pabsw                m2, m1
    psllw                m2, 9                  ; |alpha|<<9 for IPRED_CFL
.s8_loop:
    mova                 m4, [acq]              ; rows 0-1
    mova                 m5, [acq+32]           ; rows 2-3
    IPRED_CFL             4
    IPRED_CFL             5
    pmaxsw               m4, m6                 ; clamp to [0, bitdepth_max]
    pmaxsw               m5, m6
    pminsw               m4, m7
    pminsw               m5, m7
    mova         [dstq+strideq*0], xm4
    mova         [dstq+strideq*2], xm5
    vextracti128 [dstq+strideq*1], m4, 1
    vextracti128 [dstq+r6       ], m5, 1
    lea                dstq, [dstq+strideq*4]
    add                 acq, 64
    sub                  hd, 4
    jg .s8_loop
    RET
ALIGN function_align
.h16:
    mova                 m0, [tlq-32]           ; 16 left pixels
    jmp                  wq
.w16:
    paddw                m0, [tlq+2]            ; + 16 top pixels
    vextracti128        xm1, m0, 1
    paddw               xm0, xm4                ; + (w+h)/2 bias
    paddw               xm0, xm1
    punpckhwd           xm1, xm0, xm6
    punpcklwd           xm0, xm6
    paddd               xm0, xm1
    psrlq               xm1, xm0, 32
    paddd               xm0, xm1
    psrldq              xm1, xm0, 8
    paddd               xm0, xm1
    psrld               xm0, xm5                ; / power-of-2 factor of w+h
    cmp                  hd, 16
    je .w16_end                                 ; w==h: division was exact
    mov                 r6d, 0xAAAB             ; ~2/3 for h==8/32 (w+h = 24/48)
    mov                 r2d, 0x6667             ; ~2/5 for h==4/64 (w+h = 20/80)
    test                 hb, 8|32
    cmovz               r6d, r2d
    movd                xm1, r6d
    pmulhuw             xm0, xm1
    psrlw               xm0, 1
.w16_end:
    vpbroadcastw         m0, xm0                ; broadcast dc
.s16:
    vpbroadcastw         m1, alpham
    pabsw                m2, m1
    psllw                m2, 9                  ; |alpha|<<9 for IPRED_CFL
.s16_loop:
    mova                 m4, [acq]              ; row 0
    mova                 m5, [acq+32]           ; row 1
    IPRED_CFL             4
    IPRED_CFL             5
    pmaxsw               m4, m6                 ; clamp to [0, bitdepth_max]
    pmaxsw               m5, m6
    pminsw               m4, m7
    pminsw               m5, m7
    mova   [dstq+strideq*0], m4
    mova   [dstq+strideq*1], m5
    lea                dstq, [dstq+strideq*2]
    add                 acq, 64
    sub                  hd, 2
    jg .s16_loop
    RET
ALIGN function_align
.h32:
    mova                 m0, [tlq-64]           ; 32 left pixels (two halves)
    paddw                m0, [tlq-32]
    jmp                  wq
.w32:
    paddw                m0, [tlq+ 2]           ; + 32 top pixels (two halves)
    paddw                m0, [tlq+34]
    vextracti128        xm1, m0, 1
    paddw               xm0, xm4                ; + (w+h)/2 bias
    paddw               xm0, xm1
    punpcklwd           xm1, xm0, xm6
    punpckhwd           xm0, xm6
    paddd               xm0, xm1
    psrlq               xm1, xm0, 32
    paddd               xm0, xm1
    psrldq              xm1, xm0, 8
    paddd               xm0, xm1
    psrld               xm0, xm5                ; / power-of-2 factor of w+h
    cmp                  hd, 32
    je .w32_end                                 ; w==h: division was exact
    lea                 r2d, [hq*2]
    mov                 r6d, 0x6667AAAB
    shrx                r6d, r6d, r2d           ; h==8 -> 0x6667; h==16/64 ->
                                                ; 0xAAAB (shrx count is mod 32)
    movd                xm1, r6d
    pmulhuw             xm0, xm1
    psrlw               xm0, 1
.w32_end:
    vpbroadcastw         m0, xm0                ; broadcast dc
.s32:
    vpbroadcastw         m1, alpham
    pabsw                m2, m1
    psllw                m2, 9                  ; |alpha|<<9 for IPRED_CFL
.s32_loop:
    mova                 m4, [acq]              ; one row, left half
    mova                 m5, [acq+32]           ; one row, right half
    IPRED_CFL             4
    IPRED_CFL             5
    pmaxsw               m4, m6                 ; clamp to [0, bitdepth_max]
    pmaxsw               m5, m6
    pminsw               m4, m7
    pminsw               m5, m7
    mova        [dstq+32*0], m4
    mova        [dstq+32*1], m5
    add                dstq, strideq
    add                 acq, 64
    dec                  hd
    jg .s32_loop
    RET
4413
cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
; CfL with no usable edges: dc is the midpoint of the pixel range instead of
; an edge average, then the shared .s* splat code applies alpha*ac as usual.
    mov                 r6d, r7m                ; bitdepth_max
    shr                 r6d, 11                 ; 0 for 10-bit (1023), 1 for 12-bit (4095)
    lea                  t0, [ipred_cfl_splat_16bpc_avx2_table]
    tzcnt                wd, wd
    movifnidn            hd, hm
    movsxd               wq, [t0+wq*4]
    ; dc = 512 for 10-bit, or the constant following pw_512 for 12-bit
    ; (presumably 2048 = 1<<11; defined elsewhere in the file -- not visible here)
    vpbroadcastd         m0, [t0-ipred_cfl_splat_16bpc_avx2_table+pw_512+r6*4]
    pxor                 m6, m6
    vpbroadcastw         m7, r7m                ; bitdepth_max, pixel clamp in .s*
    add                  wq, t0
    movifnidn           acq, acmp
    jmp                  wq
4427
cglobal ipred_cfl_ac_420_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h
; CfL "ac" extraction for 4:2:0 luma: each output word is 2 * (sum of a 2x2
; luma block) = 8 * average (pmaddwd by pw_2 per row, two rows added).
; wpad/hpad describe how much of the right/bottom is replicated from the
; last valid column/row.  m4 accumulates the dword sum of everything stored;
; .dc finally subtracts the rounded mean so the ac block is zero-mean.
    movifnidn         hpadd, hpadm
    vpbroadcastd         m5, [pw_2]
    mov                  hd, hm
    shl               hpadd, 2                  ; hpad*4 = padded output rows
    pxor                 m4, m4                 ; running sum of all ac values
    sub                  hd, hpadd              ; rows actually read from ypx
    cmp            dword wm, 8
    jg .w16
    je .w8
.w4:
    lea                  r3, [strideq*3]
    mov                  r5, acq                ; remember start of ac for .dc
.w4_loop:
    ; rows 2,3 in the low lanes, rows 0,1 in the high lanes, so the
    ; packssdw below emits rows 0,1 in order
    mova                xm0, [ypxq+strideq*2]
    mova                xm1, [ypxq+r3       ]
    vinserti128          m0, [ypxq+strideq*0], 1
    vinserti128          m1, [ypxq+strideq*1], 1
    lea                ypxq, [ypxq+strideq*4]
    pmaddwd              m0, m5                 ; 2*(a+b) per horizontal pair
    pmaddwd              m1, m5
    paddd                m0, m1                 ; + next row: 2 * (2x2 sum)
    vextracti128        xm1, m0, 1
    paddd                m4, m0
    packssdw            xm1, xm0                ; 2 output rows of 4 words
    mova              [acq], xm1
    add                 acq, 16
    sub                  hd, 2
    jg .w4_loop
    test              hpadd, hpadd
    jz .dc
    vpermq               m1, m1, q1111          ; replicate last output row x4
    pslld               xm0, 2                  ; scale its sum for 4 padded rows
.w4_hpad_loop:
    mova              [acq], m1
    paddd                m4, m0
    add                 acq, 32
    sub               hpadd, 4
    jg .w4_hpad_loop
    jmp .dc
.w8:
    mov                  r5, acq                ; remember start of ac for .dc
    test              wpadd, wpadd
    jnz .w8_wpad1
.w8_loop:
    pmaddwd              m0, m5, [ypxq+strideq*0]
    pmaddwd              m1, m5, [ypxq+strideq*1]
    lea                ypxq, [ypxq+strideq*2]
    paddd                m0, m1                 ; 2 * (2x2 sums), one output row
    vextracti128        xm1, m0, 1
    paddd                m4, m0
    packssdw            xm1, xm0, xm1
    mova              [acq], xm1
    add                 acq, 16
    dec                  hd
    jg .w8_loop
.w8_hpad:
    test              hpadd, hpadd
    jz .dc
    vinserti128          m1, xm1, 1             ; last row in both lanes
    pslld                m0, 2                  ; scale its sum for padded rows
    jmp .hpad
.w8_wpad1:
    ; right half padded: compute the 4 valid outputs, replicate the last one
    pmaddwd             xm0, xm5, [ypxq+strideq*0]
    pmaddwd             xm3, xm5, [ypxq+strideq*1]
    lea                ypxq, [ypxq+strideq*2]
    paddd               xm0, xm3
    pshufd              xm3, xm0, q3333         ; broadcast last valid output
    packssdw            xm1, xm0, xm3
    paddd               xm0, xm3
    paddd               xm4, xm0
    mova              [acq], xm1
    add                 acq, 16
    dec                  hd
    jg .w8_wpad1
    jmp .w8_hpad
.w16_wpad:
    ; wpad selects how many of the rightmost 4-output groups are replicated
    mova                 m0, [ypxq+strideq*0+ 0]
    mova                 m1, [ypxq+strideq*1+ 0]
    cmp               wpadd, 2
    jl .w16_wpad1
    je .w16_wpad2
    vpbroadcastd         m2, [ypxq+strideq*0+12] ; wpad==3: last valid px pair
    vpbroadcastd         m3, [ypxq+strideq*1+12]
    vpblendd             m0, m2, 0xf0
    vpblendd             m1, m3, 0xf0
    jmp .w16_wpad_end
.w16_wpad2:
    vpbroadcastd         m2, [ypxq+strideq*0+28] ; wpad==2: pad whole 2nd half
    vpbroadcastd         m3, [ypxq+strideq*1+28]
    jmp .w16_wpad_end
.w16_wpad1:
    vpbroadcastd         m2, [ypxq+strideq*0+44] ; wpad==1: pad last quarter
    vpbroadcastd         m3, [ypxq+strideq*1+44]
    vinserti128          m2, [ypxq+strideq*0+32], 0
    vinserti128          m3, [ypxq+strideq*1+32], 0
.w16_wpad_end:
    lea                ypxq, [ypxq+strideq*2]
    REPX    {pmaddwd x, m5}, m0, m1, m2, m3
    paddd                m0, m1
    paddd                m2, m3                 ; 2 * (2x2 sums), left/right half
    packssdw             m1, m0, m2
    paddd                m0, m2
    vpermq               m1, m1, q3120          ; fix lane order after pack
    paddd                m4, m0
    mova              [acq], m1
    add                 acq, 32
    dec                  hd
    jg .w16_wpad
    jmp .w16_hpad
.w16:
    mov                  r5, acq                ; remember start of ac for .dc
    test              wpadd, wpadd
    jnz .w16_wpad
.w16_loop:
    pmaddwd              m0, m5, [ypxq+strideq*0+ 0]
    pmaddwd              m2, m5, [ypxq+strideq*0+32]
    pmaddwd              m1, m5, [ypxq+strideq*1+ 0]
    pmaddwd              m3, m5, [ypxq+strideq*1+32]
    lea                ypxq, [ypxq+strideq*2]
    paddd                m0, m1
    paddd                m2, m3                 ; 2 * (2x2 sums), left/right half
    packssdw             m1, m0, m2
    paddd                m0, m2
    vpermq               m1, m1, q3120          ; fix lane order after pack
    paddd                m4, m0
    mova              [acq], m1
    add                 acq, 32
    dec                  hd
    jg .w16_loop
.w16_hpad:
    add               hpadd, hpadd              ; .hpad emits 2 w16 rows per 4 counts
    jz .dc
    paddd                m0, m0
.hpad:
    ; shared bottom-padding loop: repeat the last output row (m1), keep the
    ; running sum (m4) in step via the pre-scaled row sum in m0
    mova         [acq+32*0], m1
    paddd                m4, m0
    mova         [acq+32*1], m1
    add                 acq, 32*2
    sub               hpadd, 4
    jg .hpad
.dc:
    ; subtract the rounded mean from every stored ac value
    vextracti128        xm1, m4, 1
    sub                  r5, acq ; -w*h*2
    tzcnt               r1d, r5d                ; log2(w*h) + 1
    paddd               xm4, xm1
    sub                 r1d, 2                  ; shift = log2(w*h) - 1
    punpckhqdq          xm1, xm4, xm4
    movd                xm0, r1d
    paddd               xm1, xm4
    pshuflw             xm4, xm1, q1032
    paddd               xm1, xm4                ; total sum of all ac values
    psrld               xm1, xm0                ; sum / (w*h/2)
    pxor                xm0, xm0
    pavgw               xm1, xm0                ; >>1 w/ rounding: round(sum/(w*h))
    vpbroadcastw         m1, xm1
.dc_loop:
    mova                 m0, [acq+r5]
    psubw                m0, m1                 ; center the block on zero
    mova           [acq+r5], m0
    add                  r5, 32
    jl .dc_loop
    RET
4591
cglobal ipred_cfl_ac_422_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h
; CfL "ac" extraction for 4:2:2 luma: each output word is 4 * (sum of a
; horizontal pixel pair) = 8 * average (pmaddwd by pw_4), matching the 420
; path's scaling.  Bottom padding and the final mean subtraction reuse the
; 420 function's .hpad / .w16_hpad / .dc code via mangled jumps, so the
; register contract there (m0/m1/m4, r5 = ac start) must be maintained.
    movifnidn         hpadd, hpadm
    vpbroadcastd         m5, [pw_4]
    mov                  hd, hm
    shl               hpadd, 2                  ; hpad*4 = padded output rows
    pxor                 m4, m4                 ; running sum of all ac values
    sub                  hd, hpadd              ; rows actually read from ypx
    cmp            dword wm, 8
    jg .w16
    je .w8
.w4:
    lea                  r3, [strideq*3]
    mov                  r5, acq                ; remember start of ac for .dc
.w4_loop:
    mova                xm0, [ypxq+strideq*0]
    mova                xm1, [ypxq+strideq*1]
    vinserti128          m0, [ypxq+strideq*2], 1
    vinserti128          m1, [ypxq+r3       ], 1
    lea                ypxq, [ypxq+strideq*4]
    pmaddwd              m0, m5                 ; 4*(a+b) per horizontal pair
    pmaddwd              m1, m5
    paddd                m4, m0
    packssdw             m0, m1                 ; 4 output rows of 4 words
    paddd                m4, m1
    mova              [acq], m0
    add                 acq, 32
    sub                  hd, 4
    jg .w4_loop
    test              hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
    vextracti128        xm1, m1, 1
    vpermq               m0, m0, q3333          ; replicate last output row x4
    pslld               xm1, 2                  ; scale its sum for 4 padded rows
.w4_hpad_loop:
    mova              [acq], m0
    paddd                m4, m1
    add                 acq, 32
    sub               hpadd, 4
    jg .w4_hpad_loop
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
.w8:
    mov                  r5, acq                ; remember start of ac for .dc
    test              wpadd, wpadd
    jnz .w8_wpad1
.w8_loop:
    pmaddwd              m1, m5, [ypxq+strideq*0]
    pmaddwd              m0, m5, [ypxq+strideq*1]
    lea                ypxq, [ypxq+strideq*2]
    paddd                m4, m1
    packssdw             m1, m0                 ; 2 output rows of 8 words
    paddd                m4, m0
    vpermq               m2, m1, q3120          ; fix lane order after pack
    mova              [acq], m2
    add                 acq, 32
    sub                  hd, 2
    jg .w8_loop
.w8_hpad:
    test              hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
    vpermq               m1, m1, q3131          ; last output row in both lanes
    pslld                m0, 2                  ; scale its sum for padded rows
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad
.w8_wpad1:
    ; right half padded: replicate the last valid pixel pair before summing
    vpbroadcastd         m1, [ypxq+strideq*0+12]
    vpbroadcastd         m0, [ypxq+strideq*1+12]
    vinserti128          m1, [ypxq+strideq*0+ 0], 0
    vinserti128          m0, [ypxq+strideq*1+ 0], 0
    lea                ypxq, [ypxq+strideq*2]
    pmaddwd              m1, m5
    pmaddwd              m0, m5
    paddd                m4, m1
    packssdw             m1, m0
    paddd                m4, m0
    vpermq               m2, m1, q3120          ; fix lane order after pack
    mova              [acq], m2
    add                 acq, 32
    sub                  hd, 2
    jg .w8_wpad1
    jmp .w8_hpad
.w16:
    mov                  r5, acq                ; remember start of ac for .dc
    test              wpadd, wpadd
    jnz .w16_wpad
.w16_loop:
    pmaddwd              m2, m5, [ypxq+strideq*0+ 0]
    pmaddwd              m1, m5, [ypxq+strideq*0+32]
    pmaddwd              m0, m5, [ypxq+strideq*1+ 0]
    pmaddwd              m3, m5, [ypxq+strideq*1+32]
    lea                ypxq, [ypxq+strideq*2]
    paddd                m4, m2
    packssdw             m2, m1                 ; output row 0
    paddd                m4, m1
    packssdw             m1, m0, m3             ; output row 1
    paddd                m0, m3
    vpermq               m2, m2, q3120          ; fix lane order after pack
    paddd                m4, m0
    vpermq               m1, m1, q3120
    mova         [acq+32*0], m2
    mova         [acq+32*1], m1
    add                 acq, 32*2
    sub                  hd, 2
    jg .w16_loop
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).w16_hpad
.w16_wpad:
    ; wpad selects how many of the rightmost 4-output groups are replicated
    mova                 m2, [ypxq+strideq*0+ 0]
    mova                 m0, [ypxq+strideq*1+ 0]
    cmp               wpadd, 2
    jl .w16_wpad1
    je .w16_wpad2
    vpbroadcastd         m1, [ypxq+strideq*0+12] ; wpad==3: last valid px pair
    vpbroadcastd         m3, [ypxq+strideq*1+12]
    vpblendd             m2, m1, 0xf0
    vpblendd             m0, m3, 0xf0
    jmp .w16_wpad_end
.w16_wpad2:
    vpbroadcastd         m1, [ypxq+strideq*0+28] ; wpad==2: pad whole 2nd half
    vpbroadcastd         m3, [ypxq+strideq*1+28]
    jmp .w16_wpad_end
.w16_wpad1:
    vpbroadcastd         m1, [ypxq+strideq*0+44] ; wpad==1: pad last quarter
    vpbroadcastd         m3, [ypxq+strideq*1+44]
    vinserti128          m1, [ypxq+strideq*0+32], 0
    vinserti128          m3, [ypxq+strideq*1+32], 0
.w16_wpad_end:
    lea                ypxq, [ypxq+strideq*2]
    REPX    {pmaddwd x, m5}, m2, m0, m1, m3
    paddd                m4, m2
    packssdw             m2, m1                 ; output row 0
    paddd                m4, m1
    packssdw             m1, m0, m3             ; output row 1
    paddd                m0, m3
    vpermq               m2, m2, q3120          ; fix lane order after pack
    paddd                m4, m0
    vpermq               m1, m1, q3120
    mova         [acq+32*0], m2
    mova         [acq+32*1], m1
    add                 acq, 32*2
    sub                  hd, 2
    jg .w16_wpad
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).w16_hpad
4732
4733cglobal ipred_cfl_ac_444_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h
4734    lea                  r6, [ipred_cfl_ac_444_16bpc_avx2_table]
4735    tzcnt                wd, wm
4736    movifnidn         hpadd, hpadm
; Tail of a chroma-from-luma AC function whose cglobal header is above this
; excerpt.  Rows are loaded without vertical averaging and scaled with
; psllw 3 (NOTE(review): this matches the 4:4:4 subsampling path -- confirm
; against the function header).  The shared tails of the 4:2:0 variant are
; reused via mangle(..._420_16bpc_avx2).dc (average + DC subtraction) and
; .hpad (bottom-padding loop).
; Register roles in this excerpt:
;   m5 = broadcast pw_1: pmaddwd against it gives pairwise word sums
;   m4 = running dword sum of every AC word stored (consumed by .dc)
;   r5 = start of the AC buffer, saved for the shared tail
;   hpadd/wpadd = bottom/right padding amounts; ypxq = source; acq = output
    vpbroadcastd         m5, [pw_1]
    movsxd               wq, [r6+wq*4]       ; pick width-specific branch from table at r6
    shl               hpadd, 2
    add                  wq, r6
    mov                  hd, hm
    pxor                 m4, m4              ; running sum = 0
    sub                  hd, hpadd           ; h = rows actually read from ypxq
    jmp                  wq
.w4:
    lea                  r3, [strideq*3]
    mov                  r5, acq
.w4_loop:
    ; Pack four 4-pixel rows (8 bytes each) into the four qwords of m0.
    movq                xm0, [ypxq+strideq*0]
    movhps              xm0, [ypxq+strideq*1]
    vpbroadcastq         m1, [ypxq+strideq*2]
    vpbroadcastq         m2, [ypxq+r3       ]
    lea                ypxq, [ypxq+strideq*4]
    vpblendd             m0, m1, 0x30        ; row 2 -> qword 2
    vpblendd             m0, m2, 0xc0        ; row 3 -> qword 3
    psllw                m0, 3               ; scale samples by 8
    pmaddwd              m1, m0, m5          ; per-pair sums feeding the average
    mova              [acq], m0
    add                 acq, 32
    paddd                m4, m1
    sub                  hd, 4
    jg .w4_loop
    test              hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
    ; Bottom padding: the two stores below emit 8 copies of the last row
    ; (qword 3 of m0); the doubled, broadcast pair sums in m1 add the
    ; matching 8-row contribution to the running sum.
    vpermq               m0, m0, q3333       ; broadcast last 4-pixel row
    paddd                m1, m1
    mova         [acq+32*0], m0
    vpermq               m1, m1, q3333       ; broadcast its (doubled) sums
    mova         [acq+32*1], m0
    add                 acq, 32*2
    paddd                m4, m1
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
.w8:
    lea                  r3, [strideq*3]
    mov                  r5, acq
.w8_loop:
    ; Two 8-pixel rows per ymm register, four rows per iteration.
    mova                xm2, [ypxq+strideq*0]
    vinserti128          m2, [ypxq+strideq*1], 1
    mova                xm1, [ypxq+strideq*2]
    vinserti128          m1, [ypxq+r3       ], 1
    lea                ypxq, [ypxq+strideq*4]
    psllw                m2, 3
    psllw                m1, 3
    mova         [acq+32*0], m2
    pmaddwd              m2, m5
    mova         [acq+32*1], m1
    pmaddwd              m0, m1, m5          ; keep m1 intact for the hpad setup
    add                 acq, 32*2
    paddd                m4, m2
    paddd                m4, m0
    sub                  hd, 4
    jg .w8_loop
    test              hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
    ; Set up for the shared .hpad loop: m1 = last row in both lanes,
    ; m0 = that row's sums scaled for the padded rows, with the stale low
    ; half zeroed out (NOTE(review): .hpad itself is outside this excerpt).
    vperm2i128           m1, m1, 0x11        ; replicate last 8-pixel row
    pslld                m0, 2
    pxor                 m2, m2
    vpblendd             m0, m2, 0x0f        ; keep only the last row's sums
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad
.w16_wpad2:
    ; Right padding: replicate the last valid pixel (word 7, byte offset 14)
    ; of each row across pixels 8-15 (the upper 128-bit lane).
    vpbroadcastw         m3, [ypxq+strideq*0+14]
    vpbroadcastw         m0, [ypxq+strideq*1+14]
    vpblendd             m2, m3, 0xf0
    vpblendd             m1, m0, 0xf0
    jmp .w16_wpad_end
.w16:
    mov                  r5, acq
.w16_loop:
    ; One 16-pixel row per ymm register, two rows per iteration.
    mova                 m2, [ypxq+strideq*0]
    mova                 m1, [ypxq+strideq*1]
    test              wpadd, wpadd
    jnz .w16_wpad2
.w16_wpad_end:
    lea                ypxq, [ypxq+strideq*2]
    psllw                m2, 3
    psllw                m1, 3
    mova         [acq+32*0], m2
    pmaddwd              m2, m5
    mova         [acq+32*1], m1
    pmaddwd              m0, m1, m5          ; m0/m1 survive for the hpad setup
    add                 acq, 32*2
    paddd                m4, m2
    paddd                m4, m0
    sub                  hd, 2
    jg .w16_loop
    add               hpadd, hpadd           ; two 32-byte rows per padded pair
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
    paddd                m0, m0              ; last row's sums, doubled for padding
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad
.w32:
    mov                  r5, acq
    test              wpadd, wpadd
    jnz .w32_wpad
.w32_loop:
    ; One 32-pixel row per iteration, split across two ymm registers.
    mova                 m0, [ypxq+ 0]
    mova                 m1, [ypxq+32]
    add                ypxq, strideq
    psllw                m0, 3
    psllw                m1, 3
    pmaddwd              m2, m0, m5
    mova         [acq+32*0], m0
    pmaddwd              m3, m1, m5
    mova         [acq+32*1], m1
    add                 acq, 32*2
    paddd                m2, m3              ; full-row sum contribution
    paddd                m4, m2
    dec                  hd
    jg .w32_loop
.w32_hpad:
    test              hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
    paddd                m2, m2              ; row sums doubled: 2 padded rows/iter
.w32_hpad_loop:
    ; Replicate the last row (m0:m1) twice per iteration.
    mova         [acq+32*0], m0
    mova         [acq+32*1], m1
    paddd                m4, m2
    mova         [acq+32*2], m0
    mova         [acq+32*3], m1
    add                 acq, 32*4
    sub               hpadd, 2
    jg .w32_hpad_loop
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
.w32_wpad:
    ; Right-padded 32-wide rows; wpadd appears to count padding in 4-pixel
    ; units (NOTE(review): confirm against the caller).
    mova                 m0, [ypxq+ 0]
    cmp               wpadd, 4
    jl .w32_wpad2
    je .w32_wpad4
    ; wpadd > 4: only pixels 0-7 are valid; replicate pixel 7 everywhere else.
    vpbroadcastw         m1, [ypxq+14]
    vpblendd             m0, m1, 0xf0
    jmp .w32_wpad_end
.w32_wpad4:
    ; Pixels 0-15 valid; replicate pixel 15 across pixels 16-31.
    vpbroadcastw         m1, [ypxq+30]
    jmp .w32_wpad_end
.w32_wpad2:
    ; Pixels 0-23 valid; replicate pixel 23 across pixels 24-31.
    vpbroadcastw         m1, [ypxq+46]
    vinserti128          m1, [ypxq+32], 0
.w32_wpad_end:
    add                ypxq, strideq
    psllw                m0, 3
    psllw                m1, 3
    pmaddwd              m2, m0, m5
    mova         [acq+32*0], m0
    pmaddwd              m3, m1, m5
    mova         [acq+32*1], m1
    add                 acq, 32*2
    paddd                m2, m3
    paddd                m4, m2
    dec                  hd
    jg .w32_wpad
    jmp .w32_hpad
4891
;-----------------------------------------------------------------------
; void pal_pred_16bpc(pixel *dst, ptrdiff_t stride,
;                     const pixel *pal, const uint8_t *idx, int w, int h)
;
; Expands palette index bytes into 16-bit pixels.  The 16-byte palette
; (8 16bpc entries) is broadcast to both lanes and split into two byte
; planes with pal_pred_shuf (defined elsewhere in this file): after the
; pshufb, m3 holds the low byte of every entry in its low qwords and the
; high byte in its high qwords, so the punpckhqdq yields m4 = a pure
; high-byte lookup table (NOTE(review): layout inferred from the
; punpckhqdq -- confirm against the pal_pred_shuf constant).  One pshufb
; per table then maps each index byte to its entry's low/high byte, and
; punpcklbw/punpckhbw reassemble the 16-bit pixels.
; NOTE(review): m5 is used in the .w16+ paths although only 5 vector regs
; are declared; harmless since xmm5 is caller-saved on both ABIs.
cglobal pal_pred_16bpc, 4, 6, 5, dst, stride, pal, idx, w, h
    vbroadcasti128       m3, [palq]          ; 8 palette entries in each lane
    lea                  r2, [pal_pred_16bpc_avx2_table]
    tzcnt                wd, wm
    vbroadcasti128       m4, [pal_pred_shuf]
    movifnidn            hd, hm
    movsxd               wq, [r2+wq*4]       ; pick width-specific branch
    pshufb               m3, m4              ; split entries into byte planes
    punpckhqdq           m4, m3, m3          ; m4 = high-byte table
    add                  wq, r2
DEFINE_ARGS dst, stride, stride3, idx, w, h
    lea            stride3q, [strideq*3]
    jmp                  wq
.w4:
    ; 16 index bytes = four 4-pixel rows per iteration.
    mova                xm2, [idxq]
    add                idxq, 16
    pshufb              xm1, xm3, xm2        ; low bytes of each pixel
    pshufb              xm2, xm4, xm2        ; high bytes of each pixel
    punpcklbw           xm0, xm1, xm2        ; interleave into 16-bit pixels
    punpckhbw           xm1, xm2
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*2], xm1
    movhps [dstq+strideq*1], xm0
    movhps [dstq+stride3q ], xm1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4
    RET
.w8:
    ; 32 index bytes = four 8-pixel rows per iteration.
    movu                 m2, [idxq] ; only 16-byte alignment
    add                idxq, 32
    pshufb               m1, m3, m2
    pshufb               m2, m4, m2
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova         [dstq+strideq*0], xm0
    mova         [dstq+strideq*1], xm1
    vextracti128 [dstq+strideq*2], m0, 1
    vextracti128 [dstq+stride3q ], m1, 1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w8
    RET
.w16:
    ; 64 index bytes = four 16-pixel rows.  The q3120 qword permute
    ; pre-swizzles the indices so the in-lane unpacks below produce rows
    ; in raster order.
    vpermq               m2, [idxq+ 0], q3120
    vpermq               m5, [idxq+32], q3120
    add                idxq, 64
    pshufb               m1, m3, m2
    pshufb               m2, m4, m2
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    pshufb               m1, m3, m5
    pshufb               m2, m4, m5
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova   [dstq+strideq*2], m0
    mova   [dstq+stride3q ], m1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w16
    RET
.w32:
    ; 64 index bytes = two 32-pixel rows per iteration.
    vpermq               m2, [idxq+ 0], q3120
    vpermq               m5, [idxq+32], q3120
    add                idxq, 64
    pshufb               m1, m3, m2
    pshufb               m2, m4, m2
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova [dstq+strideq*0+ 0], m0
    mova [dstq+strideq*0+32], m1
    pshufb               m1, m3, m5
    pshufb               m2, m4, m5
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova [dstq+strideq*1+ 0], m0
    mova [dstq+strideq*1+32], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w32
    RET
.w64:
    ; 64 index bytes = one 64-pixel row per iteration.
    vpermq               m2, [idxq+ 0], q3120
    vpermq               m5, [idxq+32], q3120
    add                idxq, 64
    pshufb               m1, m3, m2
    pshufb               m2, m4, m2
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova          [dstq+ 0], m0
    mova          [dstq+32], m1
    pshufb               m1, m3, m5
    pshufb               m2, m4, m5
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova          [dstq+64], m0
    mova          [dstq+96], m1
    add                 dstq, strideq
    dec                   hd
    jg .w64
    RET
4995
4996%endif
4997