; Copyright © 2020, VideoLAN and dav1d authors
; Copyright © 2020, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if HAVE_AVX512ICL && ARCH_X86_64

%macro DUP4 1-*
    %rep %0
        times 4 db %1
        %rotate 1
    %endrep
%endmacro
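
; e.g. "DUP4 18, 19" emits "db 18,18,18,18" followed by "db 19,19,19,19";
; px_idx below uses this to repeat each pixel's lut index 4 times, matching
; the 4 byte products that vpdpbusd accumulates into each dword lane.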

%macro DIRS 16 ; cdef_directions[]
    %rep 4 + 16 + 4 ; 6 7   0 1 2 3 4 5 6 7   0 1
        ; masking away unused bits allows us to use a single vpaddd {1to16}
        ; instruction instead of having to do vpbroadcastd + paddb
        db %13 & 0x3f, -%13 & 0x3f
        %rotate 1
    %endrep
%endmacro
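
; Why the masking is safe: the px_idx bytes are at most 45 and the masked
; offsets at most 63, so no per-byte sum exceeds 255 and the dword add never
; carries between bytes (vpaddd behaves like paddb here). vpermb in turn only
; consumes the low 6 bits of each index byte, so e.g. an offset of -7 stored
; as -7 & 0x3f = 57 still selects (idx - 7) & 63, the intended signed offset
; within a 64-byte lut.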

SECTION_RODATA 64

lut_perm_4x4:  db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79
               db 16, 17,  0,  1,  2,  3,  4,  5, 18, 19,  8,  9, 10, 11, 12, 13
               db 20, 21, 80, 81, 82, 83, 84, 85, 22, 23, 32, 33, 34, 35, 36, 37
               db 98, 99,100,101,102,103,104,105, 50, 51, 52, 53, 54, 55, 56, 57
lut_perm_4x8a: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79
              db  96, 97,  0,  1,  2,  3,  4,  5, 98, 99,  8,  9, 10, 11, 12, 13
lut_perm_4x8b:db 100,101, 16, 17, 18, 19, 20, 21,102,103, 24, 25, 26, 27, 28, 29
              db 104,105, 32, 33, 34, 35, 36, 37,106,107, 40, 41, 42, 43, 44, 45
              db 108,109, 48, 49, 50, 51, 52, 53,110,111, 56, 57, 58, 59, 60, 61
               db 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95
pd_01234567:   dd  0,  1,  2,  3,  4,  5,  6,  7
lut_perm_8x8a: db  0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
               db -1, -1, 34, 35, 36, 37, 38, 39, -1, -1, 50, 51, 52, 53, 54, 55
               db -1, -1, 66, 67, 68, 69, 70, 71, -1, -1, 82, 83, 84, 85, 86, 87
               db 96, 97, 98, 99,100,101,102,103,112,113,114,115,116,117,118,119
lut_perm_8x8b: db  4,  5,  6,  7,  8,  9, 10, 11, 20, 21, 22, 23, 24, 25, 26, 27
               db 36, 37, 38, 39, 40, 41, 42, 43, 52, 53, 54, 55, 56, 57, 58, 59
               db 68, 69, 70, 71, 72, 73, 74, 75, 84, 85, 86, 87, 88, 89, 90, 91
              db 100,101,102,103,104,105,106,107,116,117,118,119,120,121,122,123
edge_mask:     dq 0x00003c3c3c3c0000, 0x00003f3f3f3f0000 ; 0000, 0001
               dq 0x0000fcfcfcfc0000, 0x0000ffffffff0000 ; 0010, 0011
               dq 0x00003c3c3c3c3c3c, 0x00003f3f3f3f3f3f ; 0100, 0101
               dq 0x0000fcfcfcfcfcfc, 0x0000ffffffffffff ; 0110, 0111
               dq 0x3c3c3c3c3c3c0000, 0x3f3f3f3f3f3f0000 ; 1000, 1001
               dq 0xfcfcfcfcfcfc0000, 0xffffffffffff0000 ; 1010, 1011
               dq 0x3c3c3c3c3c3c3c3c, 0x3f3f3f3f3f3f3f3f ; 1100, 1101
               dq 0xfcfcfcfcfcfcfcfc, 0xffffffffffffffff ; 1110, 1111
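; Each edge_mask qword is an 8x8 bitmap of lut positions holding valid
; pixels: one byte per lut row, one bit per lut column, indexed by the
; HAVE_* edge flags (bit 0 = left, 1 = right, 2 = top, 3 = bottom). The
; .mask_edges paths broadcast the relevant qword and feed it to
; vpshufbitqmb, turning out-of-frame taps into copies of px.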
px_idx:      DUP4 18, 19, 20, 21, 26, 27, 28, 29, 34, 35, 36, 37, 42, 43, 44, 45
cdef_dirs:   DIRS -7,-14,  1, -6,  1,  2,  1, 10,  9, 18,  8, 17,  8, 16,  8, 15
gf_shr:        dq 0x0102040810204080, 0x0102040810204080 ; >> 0, >> 0
               dq 0x0204081020408000, 0x0408102040800000 ; >> 1, >> 2
               dq 0x0810204080000000, 0x1020408000000000 ; >> 3, >> 4
               dq 0x2040800000000000, 0x4080000000000000 ; >> 5, >> 6
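; gf_shr turns gf2p8affineqb into a per-byte variable right shift: with the
; identity bit-matrix 0x0102040810204080 the affine transform returns its
; input unchanged, and shifting the 8 matrix bytes up by n yields src >> n
; in every byte. The first entry is duplicated on purpose: indexing with
; damping - 30 + lzcnt(strength) = damping + 1 - ulog2(strength) then
; realizes shift = imax(0, damping - ulog2(strength)) without a branch.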
      times 16 db  0 ; realign (introduced by cdef_dirs)
end_perm_w8clip:db  0,  4,  8, 12,  2,  6, 10, 14, 16, 20, 24, 28, 18, 22, 26, 30
               db 32, 36, 40, 44, 34, 38, 42, 46, 48, 52, 56, 60, 50, 54, 58, 62
               db  1,  5,  9, 13,  3,  7, 11, 15, 17, 21, 25, 29, 19, 23, 27, 31
               db 33, 37, 41, 45, 35, 39, 43, 47, 49, 53, 57, 61, 51, 55, 59, 63
end_perm:      db  1,  5,  9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
               db  3,  7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63
pri_tap:       db 64, 64, 32, 32, 48, 48, 48, 48         ; left-shifted by 4
sec_tap:       db 32, 32, 16, 16
pd_268435568:  dd 268435568
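; Bias constant for the vpdpbusd accumulators: (1 << 28) + (7 << 4). The
; taps are stored pre-shifted left by 4, so after all vpdpbusd steps each
; dword holds (1 << 28) + ((sum + 7) << 4), where bit 28 survives exactly
; when sum > -8. The epilogues exploit this to compute dav1d's rounding,
;   px + ((8 + sum - (sum < 0)) >> 4)
; (see .end_no_clip) without ever materializing sign(sum).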
92
93SECTION .text
94
95%if WIN64
96DECLARE_REG_TMP 5, 6
97%else
98DECLARE_REG_TMP 8, 5
99%endif
100
101; lut:
102; t0 t1 t2 t3 t4 t5 t6 t7
103; T0 T1 T2 T3 T4 T5 T6 T7
104; L0 L1 00 01 02 03 04 05
105; L2 L3 10 11 12 13 14 15
106; L4 L5 20 21 22 23 24 25
107; L6 L7 30 31 32 33 34 35
108; 4e 4f 40 41 42 43 44 45
109; 5e 5f 50 51 52 53 54 55
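; (t/T = the two rows from topq, L = pixels from leftq, two-digit hex =
;  the block rows themselves, 4e..5f = the two left-extended rows below
;  the block; everything is packed into one zmm so that each filter tap
;  becomes a single vpermb lookup.)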

INIT_ZMM avx512icl
cglobal cdef_filter_4x4, 4, 8, 13, dst, stride, left, top, \
                                   pri, sec, dir, damping, edge
%define base r7-edge_mask
    movq         xmm0, [dstq+strideq*0]
    movhps       xmm0, [dstq+strideq*1]
    lea            r7, [edge_mask]
    movq         xmm1, [topq+strideq*0-2]
    movhps       xmm1, [topq+strideq*1-2]
    mov           r6d, edgem
    vinserti32x4  ym0, ymm0, [leftq], 1
    lea            r2, [strideq*3]
    vinserti32x4  ym1, ymm1, [dstq+strideq*2], 1
    mova           m5, [base+lut_perm_4x4]
    vinserti32x4   m0, [dstq+r2], 2
    test          r6b, 0x08      ; avoid buffer overread
    jz .main
    lea            r3, [dstq+strideq*4-4]
    vinserti32x4   m1, [r3+strideq*0], 2
    vinserti32x4   m0, [r3+strideq*1], 3
.main:
    movifnidn    prid, prim
    mov           t0d, dirm
    mova           m3, [base+px_idx]
    mov           r3d, dampingm
    vpermi2b       m5, m0, m1    ; lut
    vpbroadcastd   m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
    pxor           m7, m7
    lea            r3, [r7+r3*8] ; gf_shr + (damping - 30) * 8
    vpermb         m6, m3, m5    ; px
    cmp           r6d, 0x0f
    jne .mask_edges              ; mask edges only if required
    test         prid, prid
    jz .sec_only
    vpaddd         m1, m3, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
    vpermb         m1, m1, m5    ; k0p0 k0p1 k1p0 k1p1
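; In rough C terms (mirroring dav1d's constrain(); the names here are just
; illustrative), the macro below accumulates per primary tap:
;   int diff    = p - px;
;   int adiff   = abs(diff);
;   int clamped = imin(adiff, imax(0, pri_strength - (adiff >> shift)));
;   sum        += pri_tap * apply_sign(clamped, diff);
; with pri_tap pre-shifted left by 4 and ">> shift" done via gf_shr.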
%macro CDEF_FILTER_4x4_PRI 0
    vpcmpub        k1, m6, m1, 6 ; px > pN
    psubb          m2, m1, m6
    lzcnt         r6d, prid
    vpsubb     m2{k1}, m6, m1    ; abs(diff)
    vpbroadcastb   m4, prid
    and          prid, 1
    vgf2p8affineqb m9, m2, [r3+r6*8] {1to8}, 0 ; abs(diff) >> shift
    movifnidn     t1d, secm
    vpbroadcastd  m10, [base+pri_tap+priq*4]
    vpsubb    m10{k1}, m7, m10   ; apply_sign(pri_tap)
    psubusb        m4, m9        ; imax(0, pri_strength - (abs(diff) >> shift))
    pminub         m2, m4
    vpdpbusd       m0, m2, m10   ; sum
%endmacro
    CDEF_FILTER_4x4_PRI
    test          t1d, t1d       ; sec
    jz .end_no_clip
    call .sec
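; The clipped epilogue bounds the result to [min, max] of px and all tap
; pixels: the pmin/pmax ladder below reduces those bounds across the tap
; vectors (byte pairs folded via the psrldq/pslldq steps) before clamping
; the biased sum.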
.end_clip:
    pminub         m4, m6, m1
    pmaxub         m1, m6
    pminub         m5, m2, m3
    pmaxub         m2, m3
    pminub         m4, m5
    pmaxub         m2, m1
    psrldq         m1, m4, 2
    psrldq         m3, m2, 2
    pminub         m1, m4
    vpcmpw         k1, m0, m7, 1
    vpshldd        m6, m0, 8
    pmaxub         m2, m3
    pslldq         m3, m1, 1
    psubw          m7, m0
    paddusw        m0, m6     ; clip >0xff
    vpsubusw   m0{k1}, m6, m7 ; clip <0x00
    pslldq         m4, m2, 1
    pminub         m1, m3
    pmaxub         m2, m4
    pmaxub         m0, m1
    pminub         m0, m2
    jmp .end
.sec_only:
    movifnidn     t1d, secm
    call .sec
.end_no_clip:
    vpshldd        m6, m0, 8  ; (px << 8) + ((sum > -8) << 4)
    paddw          m0, m6     ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
.end:
    mova          xm1, [base+end_perm]
    vpermb         m0, m1, m0 ; output in bits 8-15 of each dword
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+r2       ], xm0, 3
    RET
.mask_edges_sec_only:
    movifnidn     t1d, secm
    call .mask_edges_sec
    jmp .end_no_clip
ALIGN function_align
.mask_edges:
    vpbroadcastq   m8, [base+edge_mask+r6*8]
    test         prid, prid
    jz .mask_edges_sec_only
    vpaddd         m2, m3, [base+cdef_dirs+(t0+2)*4] {1to16}
    vpshufbitqmb   k1, m8, m2 ; index in-range
    mova           m1, m6
    vpermb     m1{k1}, m2, m5
    CDEF_FILTER_4x4_PRI
    test          t1d, t1d
    jz .end_no_clip
    call .mask_edges_sec
    jmp .end_clip
.mask_edges_sec:
    vpaddd         m4, m3, [base+cdef_dirs+(t0+4)*4] {1to16}
    vpaddd         m9, m3, [base+cdef_dirs+(t0+0)*4] {1to16}
    vpshufbitqmb   k1, m8, m4
    mova           m2, m6
    vpermb     m2{k1}, m4, m5
    vpshufbitqmb   k1, m8, m9
    mova           m3, m6
    vpermb     m3{k1}, m9, m5
    jmp .sec_main
ALIGN function_align
.sec:
    vpaddd         m2, m3, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
    vpaddd         m3,     [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
    vpermb         m2, m2, m5 ; k0s0 k0s1 k1s0 k1s1
    vpermb         m3, m3, m5 ; k0s2 k0s3 k1s2 k1s3
.sec_main:
    vpbroadcastd   m8, [base+sec_tap]
    vpcmpub        k1, m6, m2, 6
    psubb          m4, m2, m6
    vpbroadcastb  m12, t1d
    lzcnt         t1d, t1d
    vpsubb     m4{k1}, m6, m2
    vpcmpub        k2, m6, m3, 6
    vpbroadcastq  m11, [r3+t1*8]
    gf2p8affineqb m10, m4, m11, 0
    psubb          m5, m3, m6
    mova           m9, m8
    vpsubb     m8{k1}, m7, m8
    psubusb       m10, m12, m10
    vpsubb     m5{k2}, m6, m3
    pminub         m4, m10
    vpdpbusd       m0, m4, m8
    gf2p8affineqb m11, m5, m11, 0
    vpsubb     m9{k2}, m7, m9
    psubusb       m12, m11
    pminub         m5, m12
    vpdpbusd       m0, m5, m9
    ret

DECLARE_REG_TMP 2, 7

;         lut top                lut bottom
; t0 t1 t2 t3 t4 t5 t6 t7  L4 L5 20 21 22 23 24 25
; T0 T1 T2 T3 T4 T5 T6 T7  L6 L7 30 31 32 33 34 35
; L0 L1 00 01 02 03 04 05  L8 L9 40 41 42 43 44 45
; L2 L3 10 11 12 13 14 15  La Lb 50 51 52 53 54 55
; L4 L5 20 21 22 23 24 25  Lc Ld 60 61 62 63 64 65
; L6 L7 30 31 32 33 34 35  Le Lf 70 71 72 73 74 75
; L8 L9 40 41 42 43 44 45  8e 8f 80 81 82 83 84 85
; La Lb 50 51 52 53 54 55  9e 9f 90 91 92 93 94 95

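; The 4x8 case runs the 4x4 kernel on two stacked 4x4 luts at once (top in
; m14, bottom in m15; note the 4-row overlap above), with separate dword
; accumulators m0/m1 and a vpgatherdq/vpscatterdd pair addressing the 8
; destination rows through ym21 = stride * {0..7}.
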
cglobal cdef_filter_4x8, 4, 9, 22, dst, stride, left, top, \
                                   pri, sec, dir, damping, edge
%define base r8-edge_mask
    vpbroadcastd ym21, strided
    mov           r6d, edgem
    lea            r8, [edge_mask]
    movq          xm1, [topq+strideq*0-2]
    pmulld       ym21, [base+pd_01234567]
    kxnorb         k1, k1, k1
    movq          xm2, [topq+strideq*1-2]
    vpgatherdq m0{k1}, [dstq+ym21]  ; +0+1 +2+3 +4+5 +6+7
    mova          m14, [base+lut_perm_4x8a]
    movu          m15, [base+lut_perm_4x8b]
    test          r6b, 0x08         ; avoid buffer overread
    jz .main
    lea            r7, [dstq+strideq*8-2]
    vinserti32x4  ym1, [r7+strideq*0], 1
    vinserti32x4  ym2, [r7+strideq*1], 1
.main:
    punpcklqdq    ym1, ym2
    vinserti32x4   m1, [leftq], 2   ; -2-1 +8+9 left ____
    movifnidn    prid, prim
    mov           t0d, dirm
    mova          m16, [base+px_idx]
    mov           r3d, dampingm
    vpermi2b      m14, m0, m1    ; lut top
    vpermi2b      m15, m0, m1    ; lut bottom
    vpbroadcastd   m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
    pxor          m20, m20
    lea            r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8
    vpermb         m2, m16, m14  ; pxt
    vpermb         m3, m16, m15  ; pxb
    mova           m1, m0
    cmp           r6b, 0x0f
    jne .mask_edges              ; mask edges only if required
    test         prid, prid
    jz .sec_only
    vpaddd         m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
    vpermb         m4, m6, m14   ; pNt k0p0 k0p1 k1p0 k1p1
    vpermb         m5, m6, m15   ; pNb
%macro CDEF_FILTER_4x8_PRI 0
    vpcmpub        k1, m2, m4, 6 ; pxt > pNt
    vpcmpub        k2, m3, m5, 6 ; pxb > pNb
    psubb          m6, m4, m2
    psubb          m7, m5, m3
    lzcnt         r6d, prid
    vpsubb     m6{k1}, m2, m4    ; abs(diff_top)
    vpsubb     m7{k2}, m3, m5    ; abs(diff_bottom)
    vpbroadcastb  m13, prid
    vpbroadcastq   m9, [r3+r6*8]
    and          prid, 1
    vpbroadcastd  m11, [base+pri_tap+priq*4]
    vgf2p8affineqb m8, m6, m9, 0 ; abs(dt) >> shift
    vgf2p8affineqb m9, m7, m9, 0 ; abs(db) >> shift
    mova          m10, m11
    movifnidn     t1d, secm
    vpsubb    m10{k1}, m20, m11  ; apply_sign(pri_tap_top)
    vpsubb    m11{k2}, m20, m11  ; apply_sign(pri_tap_bottom)
    psubusb       m12, m13, m8   ; imax(0, pri_strength - (abs(dt) >> shift))
    psubusb       m13, m13, m9   ; imax(0, pri_strength - (abs(db) >> shift))
    pminub         m6, m12
    pminub         m7, m13
    vpdpbusd       m0, m6, m10   ; sum top
    vpdpbusd       m1, m7, m11   ; sum bottom
%endmacro
    CDEF_FILTER_4x8_PRI
    test          t1d, t1d       ; sec
    jz .end_no_clip
    call .sec
.end_clip:
    pminub        m10, m4, m2
    pminub        m12, m6, m8
    pminub        m11, m5, m3
    pminub        m13, m7, m9
    pmaxub         m4, m2
    pmaxub         m6, m8
    pmaxub         m5, m3
    pmaxub         m7, m9
    pminub        m10, m12
    pminub        m11, m13
    pmaxub         m4, m6
    pmaxub         m5, m7
    mov           r2d, 0xAAAAAAAA
    kmovd          k1, r2d
    kxnorb         k2, k2, k2       ;   hw   lw
    vpshrdd       m12, m0, m1, 16   ;  m1lw m0hw
    vpshrdd        m6, m10, m11, 16 ; m11lw m10hw
    vpshrdd        m8, m4, m5, 16   ;  m5lw m4hw
    vpblendmw  m7{k1}, m10, m11     ; m11hw m10lw
    vpblendmw  m9{k1}, m4, m5       ;  m5hw m4lw
    vpblendmw  m4{k1}, m0, m12      ;  m1lw m0lw
    vpblendmw  m5{k1}, m12, m1      ;  m1hw m0hw
    vpshrdd        m2, m3, 16
    pminub         m6, m7
    pmaxub         m8, m9
    mova         ym14, [base+end_perm]
    vpcmpw         k1, m4, m20, 1
    vpshldw        m2, m5, 8
    pslldq         m7, m6, 1
    pslldq         m9, m8, 1
    psubw          m5, m20, m4
    paddusw        m0, m4, m2 ; clip >0xff
    pminub         m6, m7
    pmaxub         m8, m9
    psubusw    m0{k1}, m2, m5 ; clip <0x00
    pmaxub         m0, m6
    pminub         m0, m8
    vpermb         m0, m14, m0
    vpscatterdd [dstq+ym21]{k2}, ym0
    RET
.sec_only:
    movifnidn     t1d, secm
    call .sec
.end_no_clip:
    mova          ym4, [base+end_perm]
    kxnorb         k1, k1, k1
    vpshldd        m2, m0, 8  ; (px << 8) + ((sum > -8) << 4)
    vpshldd        m3, m1, 8
    paddw          m0, m2     ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
    paddw          m1, m3
    pslld          m0, 16
    vpshrdd        m0, m1, 16
    vpermb         m0, m4, m0 ; output in bits 8-15 of each word
    vpscatterdd [dstq+ym21]{k1}, ym0
    RET
.mask_edges_sec_only:
    movifnidn     t1d, secm
    call .mask_edges_sec
    jmp .end_no_clip
ALIGN function_align
.mask_edges:
    mov           t1d, r6d
    or            r6d, 8 ; top 4x4 has bottom
    or            t1d, 4 ; bottom 4x4 has top
    vpbroadcastq  m17, [base+edge_mask+r6*8]
    vpbroadcastq  m18, [base+edge_mask+t1*8]
    test         prid, prid
    jz .mask_edges_sec_only
    vpaddd         m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16}
    vpshufbitqmb   k1, m17, m6 ; index in-range
    vpshufbitqmb   k2, m18, m6
    mova           m4, m2
    mova           m5, m3
    vpermb     m4{k1}, m6, m14
    vpermb     m5{k2}, m6, m15
    CDEF_FILTER_4x8_PRI
    test          t1d, t1d
    jz .end_no_clip
    call .mask_edges_sec
    jmp .end_clip
.mask_edges_sec:
    vpaddd        m10, m16, [base+cdef_dirs+(t0+4)*4] {1to16}
    vpaddd        m11, m16, [base+cdef_dirs+(t0+0)*4] {1to16}
    vpshufbitqmb   k1, m17, m10
    vpshufbitqmb   k2, m18, m10
    vpshufbitqmb   k3, m17, m11
    vpshufbitqmb   k4, m18, m11
    mova           m6, m2
    mova           m7, m3
    mova           m8, m2
    mova           m9, m3
    vpermb     m6{k1}, m10, m14
    vpermb     m7{k2}, m10, m15
    vpermb     m8{k3}, m11, m14
    vpermb     m9{k4}, m11, m15
    jmp .sec_main
ALIGN function_align
.sec:
    vpaddd         m8, m16, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
    vpaddd         m9, m16, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
    vpermb         m6, m8, m14 ; pNt k0s0 k0s1 k1s0 k1s1
    vpermb         m7, m8, m15 ; pNb
    vpermb         m8, m9, m14 ; pNt k0s2 k0s3 k1s2 k1s3
    vpermb         m9, m9, m15 ; pNb
.sec_main:
    vpbroadcastb  m18, t1d
    lzcnt         t1d, t1d
    vpcmpub        k1, m2, m6, 6
    vpcmpub        k2, m3, m7, 6
    vpcmpub        k3, m2, m8, 6
    vpcmpub        k4, m3, m9, 6
    vpbroadcastq  m17, [r3+t1*8]
    psubb         m10, m6, m2
    psubb         m11, m7, m3
    psubb         m12, m8, m2
    psubb         m13, m9, m3
    vpsubb    m10{k1}, m2, m6      ; abs(dt0)
    vpsubb    m11{k2}, m3, m7      ; abs(db0)
    vpsubb    m12{k3}, m2, m8      ; abs(dt1)
    vpsubb    m13{k4}, m3, m9      ; abs(db1)
    vpbroadcastd  m19, [base+sec_tap]
    gf2p8affineqb m14, m10, m17, 0 ; abs(dt0) >> shift
    gf2p8affineqb m15, m11, m17, 0 ; abs(db0) >> shift
    gf2p8affineqb m16, m12, m17, 0 ; abs(dt1) >> shift
    gf2p8affineqb m17, m13, m17, 0 ; abs(db1) >> shift
    psubusb       m14, m18, m14    ; imax(0, sec_strength - (abs(dt0) >> shift))
    psubusb       m15, m18, m15    ; imax(0, sec_strength - (abs(db0) >> shift))
    psubusb       m16, m18, m16    ; imax(0, sec_strength - (abs(dt1) >> shift))
    psubusb       m17, m18, m17    ; imax(0, sec_strength - (abs(db1) >> shift))
    pminub        m10, m14
    pminub        m11, m15
    pminub        m12, m16
    pminub        m13, m17
    mova          m14, m19
    mova          m15, m19
    mova          m16, m19
    vpsubb    m14{k1}, m20, m19    ; apply_sign(sec_tap_top_0)
    vpsubb    m15{k2}, m20, m19    ; apply_sign(sec_tap_bottom_0)
    vpsubb    m16{k3}, m20, m19    ; apply_sign(sec_tap_top_1)
    vpsubb    m19{k4}, m20, m19    ; apply_sign(sec_tap_bottom_1)
    vpdpbusd       m0, m10, m14
    vpdpbusd       m1, m11, m15
    vpdpbusd       m0, m12, m16
    vpdpbusd       m1, m13, m19
    ret

;         lut tl                   lut tr
; t0 t1 t2 t3 t4 t5 t6 t7  t6 t7 t8 t9 ta tb tc td
; T0 T1 T2 T3 T4 T5 T6 T7  T6 T7 T8 T9 TA TB TC TD
; L0 L1 00 01 02 03 04 05  04 05 06 07 08 09 0a 0b
; L2 L3 10 11 12 13 14 15  14 15 16 17 18 19 1a 1b
; L4 L5 20 21 22 23 24 25  24 25 26 27 28 29 2a 2b
; L6 L7 30 31 32 33 34 35  34 35 36 37 38 39 3a 3b
; L8 L9 40 41 42 43 44 45  44 45 46 47 48 49 4a 4b
; La Lb 50 51 52 53 54 55  54 55 56 57 58 59 5a 5b
;         lut bl                   lut br
; L4 L5 20 21 22 23 24 25  24 25 26 27 28 29 2a 2b
; L6 L7 30 31 32 33 34 35  34 35 36 37 38 39 3a 3b
; L8 L9 40 41 42 43 44 45  44 45 46 47 48 49 4a 4b
; La Lb 50 51 52 53 54 55  54 55 56 57 58 59 5a 5b
; Lc Ld 60 61 62 63 64 65  64 65 66 67 68 69 6a 6b
; Le Lf 70 71 72 73 74 75  74 75 76 77 78 79 7a 7b
; 8e 8f 80 81 82 83 84 85  84 85 86 87 88 89 8a 8b
; 9e 9f 90 91 92 93 94 95  94 95 96 97 98 99 9a 9b

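; The 8x8 case extends the same scheme to four overlapping 4x4 quadrant
; luts (tl/tr in m12/m13, bl/br in m14/m15) with one dword accumulator
; each in m0-m3; .mask_edges additionally spills the four edge bitmaps to
; the 4*64-byte stack frame around the CDEF_FILTER_8x8_PRI expansion,
; which clobbers m16-m29.
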
cglobal cdef_filter_8x8, 4, 11, 32, 4*64, dst, stride, left, top, \
                                          pri, sec, dir, damping, edge
%define base r8-edge_mask
    mov           r6d, edgem
    lea           r10, [dstq+strideq*4-2]
    movu         xmm0, [topq+strideq*0-2]
    movu         xmm1, [dstq+strideq*2-2]
    movu         xmm2, [r10 +strideq*2  ]
    lea            r8, [edge_mask]
    lea            r9, [strideq*3]
    pmovzxwq      m10, [leftq-4]
    vinserti32x4  ym0, ymm0, [topq+strideq*1-2], 1
    vinserti32x4  ym1, ymm1, [dstq+r9       -2], 1
    vinserti32x4  ym2, ymm2, [r10 +r9         ], 1
    lea            r7, [r10 +strideq*4  ]
    pmovzxwq      m11, [leftq+4]
    vinserti32x4   m0, [dstq+strideq*0-2], 2
    vinserti32x4   m1, [r10 +strideq*0  ], 2
    mova          m12, [base+lut_perm_8x8a]
    movu          m13, [base+lut_perm_8x8b]
    vinserti32x4   m0, [dstq+strideq*1-2], 3
    vinserti32x4   m1, [r10 +strideq*1  ], 3
    test          r6b, 0x08       ; avoid buffer overread
    jz .main
    vinserti32x4   m2, [r7  +strideq*0], 2
    vinserti32x4   m2, [r7  +strideq*1], 3
.main:
    mov           t1d, 0x11111100
    mova          m14, m12
    mova          m15, m13
    kmovd          k1, t1d
    kshiftrd       k2, k1, 8
    movifnidn    prid, prim
    mov           t0d, dirm
    mova          m30, [base+px_idx]
    mov           r3d, dampingm
    vpermi2b      m12, m0, m1     ; lut tl
    vpermi2b      m14, m1, m2     ; lut bl
    vpermi2b      m13, m0, m1     ; lut tr
    vpermi2b      m15, m1, m2     ; lut br
    vpblendmw m12{k1}, m12, m10
    vpblendmw m14{k2}, m14, m11
    vpbroadcastd   m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
    pxor          m31, m31
    lea            r3, [r8+r3*8]  ; gf_shr + (damping - 30) * 8
    vpermb         m4, m30, m12   ; pxtl
    vpermb         m5, m30, m13   ; pxtr
    vpermb         m6, m30, m14   ; pxbl
    vpermb         m7, m30, m15   ; pxbr
    mova           m1, m0
    mova           m2, m0
    mova           m3, m0
    cmp           r6b, 0x0f
    jne .mask_edges               ; mask edges only if required
    test         prid, prid
    jz .sec_only
    vpaddd        m11, m30, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
    vpermb         m8, m11, m12   ; pNtl k0p0 k0p1 k1p0 k1p1
    vpermb         m9, m11, m13   ; pNtr
    vpermb        m10, m11, m14   ; pNbl
    vpermb        m11, m11, m15   ; pNbr
%macro CDEF_FILTER_8x8_PRI 0
    vpcmpub        k1, m4, m8, 6  ; pxtl > pNtl
    vpcmpub        k2, m5, m9, 6  ; pxtr > pNtr
    vpcmpub        k3, m6, m10, 6 ; pxbl > pNbl
    vpcmpub        k4, m7, m11, 6 ; pxbr > pNbr
    psubb         m16, m8, m4
    psubb         m17, m9, m5
    psubb         m18, m10, m6
    psubb         m19, m11, m7
    lzcnt         r6d, prid
    vpsubb    m16{k1}, m4, m8     ; abs(diff_tl)
    vpsubb    m17{k2}, m5, m9     ; abs(diff_tr)
    vpsubb    m18{k3}, m6, m10    ; abs(diff_bl)
    vpsubb    m19{k4}, m7, m11    ; abs(diff_br)
    vpbroadcastq  m28, [r3+r6*8]
    vpbroadcastb  m29, prid
    and          prid, 1
    vpbroadcastd  m27, [base+pri_tap+priq*4]
    vgf2p8affineqb m20, m16, m28, 0 ; abs(dtl) >> shift
    vgf2p8affineqb m21, m17, m28, 0 ; abs(dtr) >> shift
    vgf2p8affineqb m22, m18, m28, 0 ; abs(dbl) >> shift
    vgf2p8affineqb m23, m19, m28, 0 ; abs(dbr) >> shift
    mova          m24, m27
    mova          m25, m27
    mova          m26, m27
    movifnidn     t1d, secm
    vpsubb    m24{k1}, m31, m27   ; apply_sign(pri_tap_tl)
    vpsubb    m25{k2}, m31, m27   ; apply_sign(pri_tap_tr)
    vpsubb    m26{k3}, m31, m27   ; apply_sign(pri_tap_bl)
    vpsubb    m27{k4}, m31, m27   ; apply_sign(pri_tap_br)
    psubusb       m20, m29, m20   ; imax(0, pri_strength - (abs(dtl) >> shift))
    psubusb       m21, m29, m21   ; imax(0, pri_strength - (abs(dtr) >> shift))
    psubusb       m22, m29, m22   ; imax(0, pri_strength - (abs(dbl) >> shift))
    psubusb       m23, m29, m23   ; imax(0, pri_strength - (abs(dbr) >> shift))
    pminub        m16, m20
    pminub        m17, m21
    pminub        m18, m22
    pminub        m19, m23
    vpdpbusd       m0, m16, m24   ; sum tl
    vpdpbusd       m1, m17, m25   ; sum tr
    vpdpbusd       m2, m18, m26   ; sum bl
    vpdpbusd       m3, m19, m27   ; sum br
%endmacro
    CDEF_FILTER_8x8_PRI
    test          t1d, t1d        ; sec
    jz .end_no_clip
    call .sec
.end_clip:
    pminub        m20, m8, m4
    pminub        m24, m12, m16
    pminub        m21, m9, m5
    pminub        m25, m13, m17
    pminub        m22, m10, m6
    pminub        m26, m14, m18
    pminub        m23, m11, m7
    pminub        m27, m15, m19
    pmaxub         m8, m4
    pmaxub        m12, m16
    pmaxub         m9, m5
    pmaxub        m13, m17
    pmaxub        m10, m6
    pmaxub        m14, m18
    pmaxub        m11, m7
    pmaxub        m15, m19
    pminub        m20, m24
    pminub        m21, m25
    pminub        m22, m26
    pminub        m23, m27
    pmaxub         m8, m12
    pmaxub         m9, m13
    pmaxub        m10, m14
    pmaxub        m11, m15
    mov           r2d, 0xAAAAAAAA
    kmovd          k1, r2d
    vpshrdd       m24,  m0,  m1, 16
    vpshrdd       m25,  m2,  m3, 16
    vpshrdd       m12, m20, m21, 16
    vpshrdd       m14, m22, m23, 16
    vpshrdd       m16,  m8,  m9, 16
    vpshrdd       m18, m10, m11, 16
    vpblendmw m13{k1}, m20, m21
    vpblendmw m15{k1}, m22, m23
    vpblendmw m17{k1},  m8, m9
    vpblendmw m19{k1}, m10, m11
    vpblendmw m20{k1},  m0, m24
    vpblendmw m21{k1}, m24, m1
    vpblendmw m22{k1},  m2, m25
    vpblendmw m23{k1}, m25, m3
    vpshrdd        m4, m5, 16
    vpshrdd        m6, m7, 16
    pminub        m12, m13
    pminub        m14, m15
    pmaxub        m16, m17
    pmaxub        m18, m19
    mova           m8, [base+end_perm_w8clip]
    vpcmpw         k2, m20, m31, 1
    vpcmpw         k3, m22, m31, 1
    vpshldw        m4, m21, 8
    vpshldw        m6, m23, 8
    kunpckdq       k1, k1, k1
    kxnorb         k4, k4, k4
    vpshrdw       m11, m12, m14, 8
    vpshrdw       m15, m16, m18, 8
    vpblendmb m13{k1}, m12, m14
    vpblendmb m17{k1}, m16, m18
    psubw         m21, m31, m20
    psubw         m23, m31, m22
    paddusw        m0, m20, m4  ; clip >0xff
    paddusw        m1, m22, m6
    pminub        m11, m13
    pmaxub        m15, m17
    psubusw    m0{k2}, m4, m21  ; clip <0x00
    psubusw    m1{k3}, m6, m23
    psrlw          m0, 8
    vmovdqu8   m0{k1}, m1
    pmaxub         m0, m11
    pminub         m0, m15
    vpermb         m0, m8, m0
    add           r10, 2
    vextracti32x4 xm1, m0, 1
    vextracti32x4 xm2, m0, 2
    vextracti32x4 xm3, m0, 3
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*2], xm1
    movq   [r10 +strideq*0], xm2
    movq   [r10 +strideq*2], xm3
    movhps [dstq+strideq*1], xm0
    movhps [dstq+r9       ], xm1
    movhps [r10 +strideq*1], xm2
    movhps [r10 +r9       ], xm3
    RET
.sec_only:
    movifnidn     t1d, secm
    call .sec
.end_no_clip:
    mova          xm8, [base+end_perm]
    kxnorb         k1, k1, k1
    vpshldd        m4, m0, 8  ; (px << 8) + ((sum > -8) << 4)
    vpshldd        m5, m1, 8
    vpshldd        m6, m2, 8
    vpshldd        m7, m3, 8
    paddw          m0, m4     ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
    paddw          m1, m5
    paddw          m2, m6
    paddw          m3, m7
    vpermb         m0, m8, m0
    vpermb         m1, m8, m1
    vpermb         m2, m8, m2
    vpermb         m3, m8, m3
    add           r10, 2
    punpckldq      m4, m0, m1
    punpckhdq      m0, m1
    punpckldq      m5, m2, m3
    punpckhdq      m2, m3
    movq   [dstq+strideq*0], xm4
    movq   [dstq+strideq*2], xm0
    movq   [r10 +strideq*0], xm5
    movq   [r10 +strideq*2], xm2
    movhps [dstq+strideq*1], xm4
    movhps [dstq+r9       ], xm0
    movhps [r10 +strideq*1], xm5
    movhps [r10 +r9       ], xm2
    RET
.mask_edges_sec_only:
    movifnidn     t1d, secm
    call .mask_edges_sec
    jmp .end_no_clip
ALIGN function_align
.mask_edges:
    mov           t0d, r6d
    mov           t1d, r6d
    or            t0d, 0xA ; top-left 4x4 has bottom and right
    or            t1d, 0x9 ; top-right 4x4 has bottom and left
    vpbroadcastq  m26, [base+edge_mask+t0*8]
    vpbroadcastq  m27, [base+edge_mask+t1*8]
    mov           t1d, r6d
    or            r6d, 0x6 ; bottom-left 4x4 has top and right
    or            t1d, 0x5 ; bottom-right 4x4 has top and left
    vpbroadcastq  m28, [base+edge_mask+r6*8]
    vpbroadcastq  m29, [base+edge_mask+t1*8]
    mov           t0d, dirm
    test         prid, prid
    jz .mask_edges_sec_only
    vpaddd        m20, m30, [base+cdef_dirs+(t0+2)*4] {1to16}
    vpshufbitqmb   k1, m26, m20 ; index in-range
    vpshufbitqmb   k2, m27, m20
    vpshufbitqmb   k3, m28, m20
    vpshufbitqmb   k4, m29, m20
    mova           m8, m4
    mova           m9, m5
    mova          m10, m6
    mova          m11, m7
    vpermb     m8{k1}, m20, m12
    vpermb     m9{k2}, m20, m13
    vpermb    m10{k3}, m20, m14
    vpermb    m11{k4}, m20, m15
    mova   [rsp+0x00], m26
    mova   [rsp+0x40], m27
    mova   [rsp+0x80], m28
    mova   [rsp+0xC0], m29
    CDEF_FILTER_8x8_PRI
    test          t1d, t1d
    jz .end_no_clip
    mova          m26, [rsp+0x00]
    mova          m27, [rsp+0x40]
    mova          m28, [rsp+0x80]
    mova          m29, [rsp+0xC0]
    call .mask_edges_sec
    jmp .end_clip
.mask_edges_sec:
    vpaddd        m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16}
    vpaddd        m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16}
    vpshufbitqmb   k1, m26, m20
    vpshufbitqmb   k2, m27, m20
    vpshufbitqmb   k3, m28, m20
    vpshufbitqmb   k4, m29, m20
    mova          m16, m4
    mova          m17, m5
    mova          m18, m6
    mova          m19, m7
    vpermb    m16{k1}, m20, m12
    vpermb    m17{k2}, m20, m13
    vpermb    m18{k3}, m20, m14
    vpermb    m19{k4}, m20, m15
    vpshufbitqmb   k1, m26, m21
    vpshufbitqmb   k2, m27, m21
    vpshufbitqmb   k3, m28, m21
    vpshufbitqmb   k4, m29, m21
    vpermb        m12, m21, m12
    vpermb        m13, m21, m13
    vpermb        m14, m21, m14
    vpermb        m15, m21, m15
    vpblendmb m12{k1}, m4, m12
    vpblendmb m13{k2}, m5, m13
    vpblendmb m14{k3}, m6, m14
    vpblendmb m15{k4}, m7, m15
    jmp .sec_main
ALIGN function_align
.sec:
    vpaddd        m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
    vpaddd        m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
    vpermb        m16, m20, m12 ; pNtl k0s0 k0s1 k1s0 k1s1
    vpermb        m17, m20, m13 ; pNtr
    vpermb        m18, m20, m14 ; pNbl
    vpermb        m19, m20, m15 ; pNbr
    vpermb        m12, m21, m12 ; pNtl k0s2 k0s3 k1s2 k1s3
    vpermb        m13, m21, m13 ; pNtr
    vpermb        m14, m21, m14 ; pNbl
    vpermb        m15, m21, m15 ; pNbr
.sec_main:
%macro CDEF_FILTER_8x8_SEC 4-5 0 ; load constants
    vpcmpub        k1, m4, %1, 6
    vpcmpub        k2, m5, %2, 6
    vpcmpub        k3, m6, %3, 6
    vpcmpub        k4, m7, %4, 6
    psubb         m20, %1, m4
    psubb         m21, %2, m5
    psubb         m22, %3, m6
    psubb         m23, %4, m7
%if %5
    vpbroadcastb  m28, t1d
    lzcnt         t1d, t1d
    vpbroadcastq  m29, [r3+t1*8]
%endif
    vpsubb    m20{k1}, m4, %1
    vpsubb    m21{k2}, m5, %2
    vpsubb    m22{k3}, m6, %3
    vpsubb    m23{k4}, m7, %4
    gf2p8affineqb m24, m20, m29, 0
    gf2p8affineqb m25, m21, m29, 0
    gf2p8affineqb m26, m22, m29, 0
    gf2p8affineqb m27, m23, m29, 0
%if %5
    vpbroadcastd  m30, [base+sec_tap]
%endif
    psubusb       m24, m28, m24
    psubusb       m25, m28, m25
    psubusb       m26, m28, m26
    psubusb       m27, m28, m27
    pminub        m20, m24
    pminub        m21, m25
    pminub        m22, m26
    pminub        m23, m27
    mova          m24, m30
    mova          m25, m30
    mova          m26, m30
    mova          m27, m30
    vpsubb    m24{k1}, m31, m30
    vpsubb    m25{k2}, m31, m30
    vpsubb    m26{k3}, m31, m30
    vpsubb    m27{k4}, m31, m30
    vpdpbusd       m0, m20, m24
    vpdpbusd       m1, m21, m25
    vpdpbusd       m2, m22, m26
    vpdpbusd       m3, m23, m27
%endmacro
    CDEF_FILTER_8x8_SEC m16, m17, m18, m19, 1
    CDEF_FILTER_8x8_SEC m12, m13, m14, m15
    ret

%endif ; HAVE_AVX512ICL && ARCH_X86_64
