; Copyright © 2020, VideoLAN and dav1d authors
; Copyright © 2020, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

%macro DUP4 1-*
    %rep %0
        times 4 db %1
        %rotate 1
    %endrep
%endmacro

%macro DIRS 16 ; cdef_directions[]
    %rep 4 + 16 + 4 ; 6 7   0 1 2 3 4 5 6 7   0 1
        ; masking away unused bits allows us to use a single vpaddd {1to16}
        ; instruction instead of having to do vpbroadcastd + paddb
        db %13 & 0x3f, -%13 & 0x3f
        %rotate 1
    %endrep
%endmacro
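
; cdef_dirs below is laid out as 4 + 16 + 4 byte pairs: wraparound copies of
; dirs 6-7 in front and dirs 0-1 behind the 8 real entries, so the dir-2 and
; dir+2 lookups used for the secondary taps never index outside the table.
; A rough scalar equivalent of what the padding buys (cdef_dir_offset is a
; hypothetical helper, not part of this file):
;
; static int cdef_dir_offset(const int dir, const int k) {
;     return cdef_directions[(dir + 8) & 7][k]; /* dir may be in [-2, 9] */
; }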

SECTION_RODATA 64

lut_perm_4x4:  db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79
               db 16, 17,  0,  1,  2,  3,  4,  5, 18, 19,  8,  9, 10, 11, 12, 13
               db 20, 21, 80, 81, 82, 83, 84, 85, 22, 23, 32, 33, 34, 35, 36, 37
               db 98, 99,100,101,102,103,104,105, 50, 51, 52, 53, 54, 55, 56, 57
lut_perm_4x8a: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79
               db 96, 97,  0,  1,  2,  3,  4,  5, 98, 99,  8,  9, 10, 11, 12, 13
lut_perm_4x8b:db 100,101, 16, 17, 18, 19, 20, 21,102,103, 24, 25, 26, 27, 28, 29
              db 104,105, 32, 33, 34, 35, 36, 37,106,107, 40, 41, 42, 43, 44, 45
              db 108,109, 48, 49, 50, 51, 52, 53,110,111, 56, 57, 58, 59, 60, 61
               db 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95
pd_01234567:   dd  0,  1,  2,  3,  4,  5,  6,  7
lut_perm_8x8a: db 32, 33, 34, 35, 36, 37, 38, 39, 48, 49, 50, 51, 52, 53, 54, 55
               db 36, 37, 38, 39, 40, 41, 42, 43, 52, 53, 54, 55, 56, 57, 58, 59
lut_perm_8x8b: db 12, 13,  0,  1,  2,  3,  4,  5, 14, 15, 16, 17, 18, 19, 20, 21
               db  2,  3,  4,  5,  6,  7,  8,  9, 18, 19, 20, 21, 22, 23, 24, 25
               db 28, 29, 32, 33, 34, 35, 36, 37, 30, 31, 48, 49, 50, 51, 52, 53
               db 34, 35, 36, 37, 38, 39, 40, 41, 50, 51, 52, 53, 54, 55, 56, 57
end_perm:      db  1,  5,  9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
               db  3,  7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63
end_perm_clip: db  0,  4,  8, 12,  2,  6, 10, 14, 16, 20, 24, 28, 18, 22, 26, 30
               db 32, 36, 40, 44, 34, 38, 42, 46, 48, 52, 56, 60, 50, 54, 58, 62
               db  1,  5,  9, 13,  3,  7, 11, 15, 17, 21, 25, 29, 19, 23, 27, 31
               db 33, 37, 41, 45, 35, 39, 43, 47, 49, 53, 57, 61, 51, 55, 59, 63
edge_mask:     dq 0x00003c3c3c3c0000, 0x00003f3f3f3f0000 ; 0000, 0001
               dq 0x0000fcfcfcfc0000, 0x0000ffffffff0000 ; 0010, 0011
               dq 0x00003c3c3c3c3c3c, 0x00003f3f3f3f3f3f ; 0100, 0101
               dq 0x0000fcfcfcfcfcfc, 0x0000ffffffffffff ; 0110, 0111
               dq 0x3c3c3c3c3c3c0000, 0x3f3f3f3f3f3f0000 ; 1000, 1001
               dq 0xfcfcfcfcfcfc0000, 0xffffffffffff0000 ; 1010, 1011
               dq 0x3c3c3c3c3c3c3c3c, 0x3f3f3f3f3f3f3f3f ; 1100, 1101
               dq 0xfcfcfcfcfcfcfcfc, 0xffffffffffffffff ; 1110, 1111
px_idx:      DUP4 18, 19, 20, 21, 26, 27, 28, 29, 34, 35, 36, 37, 42, 43, 44, 45
cdef_dirs:   DIRS -7,-14,  1, -6,  1,  2,  1, 10,  9, 18,  8, 17,  8, 16,  8, 15
gf_shr:        dq 0x0102040810204080, 0x0102040810204080 ; >> 0, >> 0
               dq 0x0204081020408000, 0x0408102040800000 ; >> 1, >> 2
               dq 0x0810204080000000, 0x1020408000000000 ; >> 3, >> 4
               dq 0x2040800000000000, 0x4080000000000000 ; >> 5, >> 6
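; Each gf_shr qword is an 8x8 bit matrix for (v)gf2p8affineqb. Under that
; instruction's bit ordering, 0x0102040810204080 is the identity matrix, and
; sliding the diagonal by n columns turns the affine transform into a logical
; right shift by n of every byte; this stands in for the per-byte variable
; shift that AVX-512 otherwise lacks. A minimal scalar sketch of what one of
; these matrix lookups computes, assuming the shift is already in [0, 6]:
;
; static uint8_t shr_byte(const uint8_t x, const int shift) {
;     return x >> shift; /* what gf2p8affineqb(x, matrix, 0) yields here */
; }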
pri_tap:       db 64, 64, 32, 32, 48, 48, 48, 48         ; left-shifted by 4
sec_tap:       db 32, 32, 16, 16
pd_268435568:  dd 268435568
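; Seeding each dword accumulator with pd_268435568 = (1 << 28) + (7 << 4)
; keeps the running tap sum non-negative (so it can be handled with unsigned
; saturating ops later) and pre-adds most of the rounding bias; the taps
; themselves are stored pre-shifted by 4 (see pri_tap above). The scalar
; rounding this reproduces is, roughly, per the reference code:
;
;   dst[x] = iclip(px + ((8 + sum - (sum < 0)) >> 4), min, max);
;
; with the final +1 for sum > -8 added in the .end/.end_no_clip paths.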

SECTION .text

%if WIN64
DECLARE_REG_TMP 4
%else
DECLARE_REG_TMP 8
%endif

; lut:
; t0 t1 t2 t3 t4 t5 t6 t7
; T0 T1 T2 T3 T4 T5 T6 T7
; L0 L1 00 01 02 03 04 05
; L2 L3 10 11 12 13 14 15
; L4 L5 20 21 22 23 24 25
; L6 L7 30 31 32 33 34 35
; b0 b1 b2 b3 b4 b5 b6 b7
; B0 B1 B2 B3 B4 B5 B6 B7

INIT_ZMM avx512icl
cglobal cdef_filter_4x4_8bpc, 5, 8, 13, dst, stride, left, top, bot, \
                                        pri, sec, dir, damping, edge
%define base r7-edge_mask
    movq         xmm0, [dstq+strideq*0]
    movhps       xmm0, [dstq+strideq*1]
    lea            r7, [edge_mask]
    movq         xmm1, [topq+strideq*0-2]
    movhps       xmm1, [topq+strideq*1-2]
    mov           r6d, edgem
    vinserti32x4  ym0, ymm0, [leftq], 1
    lea            r2, [strideq*3]
    vinserti32x4  ym1, ymm1, [dstq+strideq*2], 1
    mova           m5, [base+lut_perm_4x4]
    vinserti32x4   m0, [dstq+r2], 2
    test          r6b, 0x08      ; avoid buffer overread
    jz .main
    vinserti32x4   m1, [botq+strideq*0-4], 2
    vinserti32x4   m0, [botq+strideq*1-4], 3
.main:
    movifnidn    prid, prim
    mov           t0d, dirm
    mova           m3, [base+px_idx]
    mov           r3d, dampingm
    vpermi2b       m5, m0, m1    ; lut
    vpbroadcastd   m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
    pxor           m7, m7
    lea            r3, [r7+r3*8] ; gf_shr + (damping - 30) * 8
    vpermb         m6, m3, m5    ; px
    cmp           r6d, 0x0f
    jne .mask_edges              ; mask edges only if required
    test         prid, prid
    jz .sec_only
    vpaddd         m1, m3, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
    vpermb         m1, m1, m5    ; k0p0 k0p1 k1p0 k1p1
%macro CDEF_FILTER_4x4_PRI 0
    vpcmpub        k1, m6, m1, 6 ; px > pN
    psubb          m2, m1, m6
    lzcnt         r6d, prid
    vpsubb     m2{k1}, m6, m1    ; abs(diff)
    vpbroadcastb   m4, prid
    and          prid, 1
    vgf2p8affineqb m9, m2, [r3+r6*8] {1to8}, 0 ; abs(diff) >> shift
    movifnidn    secd, secm
    vpbroadcastd  m10, [base+pri_tap+priq*4]
    vpsubb    m10{k1}, m7, m10   ; apply_sign(pri_tap)
    psubusb        m4, m9        ; imax(0, pri_strength - (abs(diff) >> shift))
    pminub         m2, m4
    vpdpbusd       m0, m2, m10   ; sum
%endmacro
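; The macro above is the SIMD version of the scalar constrain() step. A
; sketch of the reference logic (helper names as in dav1d's C template;
; shift is derived from damping and the strength):
;
; static int constrain(const int diff, const int threshold, const int shift) {
;     const int adiff = abs(diff);
;     return apply_sign(imin(adiff, imax(0, threshold - (adiff >> shift))), diff);
; }
;
; for each tap k: sum += pri_tap[k] * constrain(p[k] - px, pri_strength, shift);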
    CDEF_FILTER_4x4_PRI
    test         secd, secd
    jz .end_no_clip
    call .sec
.end_clip:
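    ; when both the primary and secondary filter ran, the result is clamped
    ; to the [min, max] range of px and every tap that was read, as in the
    ; scalar reference:
    ;   dst[x] = iclip(px + ((8 + sum - (sum < 0)) >> 4), min, max)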
    pminub         m4, m6, m1
    pmaxub         m1, m6
    pminub         m5, m2, m3
    pmaxub         m2, m3
    pminub         m4, m5
    pmaxub         m2, m1
    psrldq         m1, m4, 2
    psrldq         m3, m2, 2
    pminub         m1, m4
    vpcmpw         k1, m0, m7, 1
    vpshldd        m6, m0, 8
    pmaxub         m2, m3
    pslldq         m3, m1, 1
    psubw          m7, m0
    paddusw        m0, m6     ; clip >0xff
    vpsubusw   m0{k1}, m6, m7 ; clip <0x00
    pslldq         m4, m2, 1
    pminub         m1, m3
    pmaxub         m2, m4
    pmaxub         m0, m1
    pminub         m0, m2
    jmp .end
.sec_only:
    movifnidn    secd, secm
    call .sec
.end_no_clip:
    vpshldd        m6, m0, 8  ; (px << 8) + ((sum > -8) << 4)
    paddw          m0, m6     ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
.end:
    mova          xm1, [base+end_perm]
    vpermb         m0, m1, m0 ; output in bits 8-15 of each dword
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+r2       ], xm0, 3
    RET
.mask_edges_sec_only:
    movifnidn    secd, secm
    call .mask_edges_sec
    jmp .end_no_clip
ALIGN function_align
.mask_edges:
    vpbroadcastq   m8, [base+edge_mask+r6*8]
    test         prid, prid
    jz .mask_edges_sec_only
    vpaddd         m2, m3, [base+cdef_dirs+(t0+2)*4] {1to16}
    vpshufbitqmb   k1, m8, m2 ; index in-range
    mova           m1, m6
    vpermb     m1{k1}, m2, m5
    CDEF_FILTER_4x4_PRI
    test         secd, secd
    jz .end_no_clip
    call .mask_edges_sec
    jmp .end_clip
.mask_edges_sec:
    vpaddd         m4, m3, [base+cdef_dirs+(t0+4)*4] {1to16}
    vpaddd         m9, m3, [base+cdef_dirs+(t0+0)*4] {1to16}
    vpshufbitqmb   k1, m8, m4
    mova           m2, m6
    vpermb     m2{k1}, m4, m5
    vpshufbitqmb   k1, m8, m9
    mova           m3, m6
    vpermb     m3{k1}, m9, m5
    jmp .sec_main
ALIGN function_align
.sec:
    vpaddd         m2, m3, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
    vpaddd         m3,     [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
    vpermb         m2, m2, m5 ; k0s0 k0s1 k1s0 k1s1
    vpermb         m3, m3, m5 ; k0s2 k0s3 k1s2 k1s3
.sec_main:
    vpbroadcastd   m8, [base+sec_tap]
    vpcmpub        k1, m6, m2, 6
    psubb          m4, m2, m6
    vpbroadcastb  m12, secd
    lzcnt        secd, secd
    vpsubb     m4{k1}, m6, m2
    vpcmpub        k2, m6, m3, 6
    vpbroadcastq  m11, [r3+secq*8]
    gf2p8affineqb m10, m4, m11, 0
    psubb          m5, m3, m6
    mova           m9, m8
    vpsubb     m8{k1}, m7, m8
    psubusb       m10, m12, m10
    vpsubb     m5{k2}, m6, m3
    pminub         m4, m10
    vpdpbusd       m0, m4, m8
    gf2p8affineqb m11, m5, m11, 0
    vpsubb     m9{k2}, m7, m9
    psubusb       m12, m11
    pminub         m5, m12
    vpdpbusd       m0, m5, m9
    ret

DECLARE_REG_TMP 2, 7

;         lut top                lut bottom
; t0 t1 t2 t3 t4 t5 t6 t7  L4 L5 20 21 22 23 24 25
; T0 T1 T2 T3 T4 T5 T6 T7  L6 L7 30 31 32 33 34 35
; L0 L1 00 01 02 03 04 05  L8 L9 40 41 42 43 44 45
; L2 L3 10 11 12 13 14 15  La Lb 50 51 52 53 54 55
; L4 L5 20 21 22 23 24 25  Lc Ld 60 61 62 63 64 65
; L6 L7 30 31 32 33 34 35  Le Lf 70 71 72 73 74 75
; L8 L9 40 41 42 43 44 45  b0 b1 b2 b3 b4 b5 b6 b7
; La Lb 50 51 52 53 54 55  B0 B1 B2 B3 B4 B5 B6 B7

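; The 4x8 function handles the block as two stacked 4x4 halves (lut top /
; lut bottom above), with one lut register per half (m14/m15) and one dword
; accumulator per half (m0/m1); the per-tap arithmetic matches the 4x4 case.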
cglobal cdef_filter_4x8_8bpc, 5, 9, 22, dst, stride, left, top, bot, \
                                        pri, sec, dir, damping, edge
%define base r8-edge_mask
    vpbroadcastd ym21, strided
    mov           r6d, edgem
    lea            r8, [edge_mask]
    movq          xm1, [topq+strideq*0-2]
    pmulld       ym21, [base+pd_01234567]
    kxnorb         k1, k1, k1
    movq          xm2, [topq+strideq*1-2]
    vpgatherdq m0{k1}, [dstq+ym21]  ; +0+1 +2+3 +4+5 +6+7
    mova          m14, [base+lut_perm_4x8a]
    movu          m15, [base+lut_perm_4x8b]
    test          r6b, 0x08         ; avoid buffer overread
    jz .main
    vinserti32x4  ym1, [botq+strideq*0-2], 1
    vinserti32x4  ym2, [botq+strideq*1-2], 1
.main:
    punpcklqdq    ym1, ym2
    vinserti32x4   m1, [leftq], 2   ; -2-1 +8+9 left ____
    movifnidn    prid, prim
    mov           t0d, dirm
    mova          m16, [base+px_idx]
    mov           r3d, dampingm
    vpermi2b      m14, m0, m1    ; lut top
    vpermi2b      m15, m0, m1    ; lut bottom
    vpbroadcastd   m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
    pxor          m20, m20
    lea            r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8
    vpermb         m2, m16, m14  ; pxt
    vpermb         m3, m16, m15  ; pxb
    mova           m1, m0
    cmp           r6b, 0x0f
    jne .mask_edges              ; mask edges only if required
    test         prid, prid
    jz .sec_only
    vpaddd         m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
    vpermb         m4, m6, m14   ; pNt k0p0 k0p1 k1p0 k1p1
    vpermb         m5, m6, m15   ; pNb
%macro CDEF_FILTER_4x8_PRI 0
    vpcmpub        k1, m2, m4, 6 ; pxt > pNt
    vpcmpub        k2, m3, m5, 6 ; pxb > pNb
    psubb          m6, m4, m2
    psubb          m7, m5, m3
    lzcnt         r6d, prid
    vpsubb     m6{k1}, m2, m4    ; abs(diff_top)
    vpsubb     m7{k2}, m3, m5    ; abs(diff_bottom)
    vpbroadcastb  m13, prid
    vpbroadcastq   m9, [r3+r6*8]
    and          prid, 1
    vpbroadcastd  m11, [base+pri_tap+priq*4]
    vgf2p8affineqb m8, m6, m9, 0 ; abs(dt) >> shift
    vgf2p8affineqb m9, m7, m9, 0 ; abs(db) >> shift
    mova          m10, m11
    movifnidn     t1d, secm
    vpsubb    m10{k1}, m20, m11  ; apply_sign(pri_tap_top)
    vpsubb    m11{k2}, m20, m11  ; apply_sign(pri_tap_bottom)
    psubusb       m12, m13, m8   ; imax(0, pri_strength - (abs(dt) >> shift))
    psubusb       m13, m13, m9   ; imax(0, pri_strength - (abs(db) >> shift))
    pminub         m6, m12
    pminub         m7, m13
    vpdpbusd       m0, m6, m10   ; sum top
    vpdpbusd       m1, m7, m11   ; sum bottom
%endmacro
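; Same constrain() arithmetic as CDEF_FILTER_4x4_PRI, unrolled twice for the
; top and bottom halves so the two independent vpdpbusd dependency chains
; can execute in parallel.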
    CDEF_FILTER_4x8_PRI
    test          t1d, t1d       ; sec
    jz .end_no_clip
    call .sec
.end_clip:
    pminub        m10, m4, m2
    pminub        m12, m6, m8
    pminub        m11, m5, m3
    pminub        m13, m7, m9
    pmaxub         m4, m2
    pmaxub         m6, m8
    pmaxub         m5, m3
    pmaxub         m7, m9
    pminub        m10, m12
    pminub        m11, m13
    pmaxub         m4, m6
    pmaxub         m5, m7
    mov           r2d, 0xAAAAAAAA
    kmovd          k1, r2d
    kxnorb         k2, k2, k2       ;   hw   lw
    vpshrdd       m12, m0, m1, 16   ;  m1lw m0hw
    vpshrdd        m6, m10, m11, 16 ; m11lw m10hw
    vpshrdd        m8, m4, m5, 16   ;  m5lw m4hw
    vpblendmw  m7{k1}, m10, m11     ; m11hw m10lw
    vpblendmw  m9{k1}, m4, m5       ;  m5hw m4lw
    vpblendmw  m4{k1}, m0, m12      ;  m1lw m0lw
    vpblendmw  m5{k1}, m12, m1      ;  m1hw m0hw
    vpshrdd        m2, m3, 16
    pminub         m6, m7
    pmaxub         m8, m9
    mova         ym14, [base+end_perm]
    vpcmpw         k1, m4, m20, 1
    vpshldw        m2, m5, 8
    pslldq         m7, m6, 1
    pslldq         m9, m8, 1
    psubw          m5, m20, m4
    paddusw        m0, m4, m2 ; clip >0xff
    pminub         m6, m7
    pmaxub         m8, m9
    psubusw    m0{k1}, m2, m5 ; clip <0x00
    pmaxub         m0, m6
    pminub         m0, m8
    vpermb         m0, m14, m0
    vpscatterdd [dstq+ym21]{k2}, ym0
    RET
.sec_only:
    movifnidn     t1d, secm
    call .sec
.end_no_clip:
    mova          ym4, [base+end_perm]
    kxnorb         k1, k1, k1
    vpshldd        m2, m0, 8  ; (px << 8) + ((sum > -8) << 4)
    vpshldd        m3, m1, 8
    paddw          m0, m2     ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
    paddw          m1, m3
    pslld          m0, 16
    vpshrdd        m0, m1, 16
    vpermb         m0, m4, m0 ; output in bits 8-15 of each word
    vpscatterdd [dstq+ym21]{k1}, ym0
    RET
.mask_edges_sec_only:
    movifnidn     t1d, secm
    call .mask_edges_sec
    jmp .end_no_clip
ALIGN function_align
.mask_edges:
    mov           t1d, r6d
    or            r6d, 8 ; top 4x4 has bottom
    or            t1d, 4 ; bottom 4x4 has top
    vpbroadcastq  m17, [base+edge_mask+r6*8]
    vpbroadcastq  m18, [base+edge_mask+t1*8]
    test         prid, prid
    jz .mask_edges_sec_only
    vpaddd         m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16}
    vpshufbitqmb   k1, m17, m6 ; index in-range
    vpshufbitqmb   k2, m18, m6
    mova           m4, m2
    mova           m5, m3
    vpermb     m4{k1}, m6, m14
    vpermb     m5{k2}, m6, m15
    CDEF_FILTER_4x8_PRI
    test          t1d, t1d
    jz .end_no_clip
    call .mask_edges_sec
    jmp .end_clip
.mask_edges_sec:
    vpaddd        m10, m16, [base+cdef_dirs+(t0+4)*4] {1to16}
    vpaddd        m11, m16, [base+cdef_dirs+(t0+0)*4] {1to16}
    vpshufbitqmb   k1, m17, m10
    vpshufbitqmb   k2, m18, m10
    vpshufbitqmb   k3, m17, m11
    vpshufbitqmb   k4, m18, m11
    mova           m6, m2
    mova           m7, m3
    mova           m8, m2
    mova           m9, m3
    vpermb     m6{k1}, m10, m14
    vpermb     m7{k2}, m10, m15
    vpermb     m8{k3}, m11, m14
    vpermb     m9{k4}, m11, m15
    jmp .sec_main
ALIGN function_align
.sec:
    vpaddd         m8, m16, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
    vpaddd         m9, m16, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
    vpermb         m6, m8, m14 ; pNt k0s0 k0s1 k1s0 k1s1
    vpermb         m7, m8, m15 ; pNb
    vpermb         m8, m9, m14 ; pNt k0s2 k0s3 k1s2 k1s3
    vpermb         m9, m9, m15 ; pNb
.sec_main:
    vpbroadcastb  m18, t1d
    lzcnt         t1d, t1d
    vpcmpub        k1, m2, m6, 6
    vpcmpub        k2, m3, m7, 6
    vpcmpub        k3, m2, m8, 6
    vpcmpub        k4, m3, m9, 6
    vpbroadcastq  m17, [r3+t1*8]
    psubb         m10, m6, m2
    psubb         m11, m7, m3
    psubb         m12, m8, m2
    psubb         m13, m9, m3
    vpsubb    m10{k1}, m2, m6      ; abs(dt0)
    vpsubb    m11{k2}, m3, m7      ; abs(db0)
    vpsubb    m12{k3}, m2, m8      ; abs(dt1)
    vpsubb    m13{k4}, m3, m9      ; abs(db1)
    vpbroadcastd  m19, [base+sec_tap]
    gf2p8affineqb m14, m10, m17, 0 ; abs(dt0) >> shift
    gf2p8affineqb m15, m11, m17, 0 ; abs(db0) >> shift
    gf2p8affineqb m16, m12, m17, 0 ; abs(dt1) >> shift
    gf2p8affineqb m17, m13, m17, 0 ; abs(db1) >> shift
    psubusb       m14, m18, m14    ; imax(0, sec_strength - (abs(dt0) >> shift))
    psubusb       m15, m18, m15    ; imax(0, sec_strength - (abs(db0) >> shift))
    psubusb       m16, m18, m16    ; imax(0, sec_strength - (abs(dt1) >> shift))
    psubusb       m17, m18, m17    ; imax(0, sec_strength - (abs(db1) >> shift))
    pminub        m10, m14
    pminub        m11, m15
    pminub        m12, m16
    pminub        m13, m17
    mova          m14, m19
    mova          m15, m19
    mova          m16, m19
    vpsubb    m14{k1}, m20, m19    ; apply_sign(sec_tap_top_0)
    vpsubb    m15{k2}, m20, m19    ; apply_sign(sec_tap_bottom_0)
    vpsubb    m16{k3}, m20, m19    ; apply_sign(sec_tap_top_1)
    vpsubb    m19{k4}, m20, m19    ; apply_sign(sec_tap_bottom_1)
    vpdpbusd       m0, m10, m14
    vpdpbusd       m1, m11, m15
    vpdpbusd       m0, m12, m16
    vpdpbusd       m1, m13, m19
    ret

;         lut tl                   lut tr
; t0 t1 t2 t3 t4 t5 t6 t7  t4 t5 t6 t7 t8 t9 ta tb
; T0 T1 T2 T3 T4 T5 T6 T7  T4 T5 T6 T7 T8 T9 Ta Tb
; L0 L1 00 01 02 03 04 05  02 03 04 05 06 07 08 09
; L2 L3 10 11 12 13 14 15  12 13 14 15 16 17 18 19
; L4 L5 20 21 22 23 24 25  22 23 24 25 26 27 28 29
; L6 L7 30 31 32 33 34 35  32 33 34 35 36 37 38 39
; L8 L9 40 41 42 43 44 45  42 43 44 45 46 47 48 49
; La Lb 50 51 52 53 54 55  52 53 54 55 56 57 58 59
;         lut bl                   lut br
; L4 L5 20 21 22 23 24 25  22 23 24 25 26 27 28 29
; L6 L7 30 31 32 33 34 35  32 33 34 35 36 37 38 39
; L8 L9 40 41 42 43 44 45  42 43 44 45 46 47 48 49
; La Lb 50 51 52 53 54 55  52 53 54 55 56 57 58 59
; Lc Ld 60 61 62 63 64 65  62 63 64 65 66 67 68 69
; Le Lf 70 71 72 73 74 75  72 73 74 75 76 77 78 79
; b0 b1 b2 b3 b4 b5 b6 b7  b4 b5 b6 b7 b8 b9 ba bb
; B0 B1 B2 B3 B4 B5 B6 B7  B4 B5 B6 B7 B8 B9 Ba Bb

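; The 8x8 function splits the block into four 4x4 quadrants whose luts
; overlap as shown above (tl/tr/bl/br), each with its own lut register
; (m12-m15) and dword accumulator (m0-m3); it uses all 32 zmm registers,
; which is why .mask_edges below spills the edge masks to the stack.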
cglobal cdef_filter_8x8_8bpc, 5, 11, 32, 4*64, dst, stride, left, top, bot, \
                                               pri, sec, dir, damping, edge
%define base r8-edge_mask
    movu         xm16, [dstq+strideq*0]
    pinsrd       xm16, [leftq+4*0], 3
    mov           r6d, edgem
    vinserti128  ym16, [dstq+strideq*1], 1
    lea           r10, [dstq+strideq*4]
    movu         xm17, [dstq+strideq*2]
    vinserti32x4  m16, [topq+strideq*0-2], 2
    lea            r9, [strideq*3]
    pinsrd       xm17, [leftq+4*1], 3
    vinserti32x4  m16, [topq+strideq*1-2], 3 ; 0 1 t T
    lea            r8, [edge_mask]
    vinserti128  ym17, [dstq+r9       ], 1
    vpbroadcastd ym18, [leftq+4*2]
    vpblendd     ym17, ym18, 0x80
    movu         xm18, [r10 +strideq*2]
    vinserti32x4  m17, [r10 +strideq*0], 2
    pinsrd       xm18, [leftq+4*3], 3
    vinserti32x4  m17, [r10 +strideq*1], 3   ; 2 3 4 5
    vinserti128  ym18, [r10 +r9       ], 1
    test          r6b, 0x08       ; avoid buffer overread
    jz .main
    vinserti32x4  m18, [botq+strideq*0-2], 2
    vinserti32x4  m18, [botq+strideq*1-2], 3 ; 6 7 b B
.main:
    mova           m0, [base+lut_perm_8x8a]
    movu           m1, [base+lut_perm_8x8b]
    mova          m30, [base+px_idx]
    vpermb        m16, m0, m16
    movifnidn    prid, prim
    vpermb        m17, m1, m17
    mov           t0d, dirm
    vpermb        m18, m0, m18
    mov           r3d, dampingm
    vshufi32x4    m12, m16, m17, q2020 ; lut tl
    vshufi32x4    m13, m16, m17, q3131 ; lut tr
    vshufi32x4    m14, m17, m18, q0220 ; lut bl
    vshufi32x4    m15, m17, m18, q1331 ; lut br
    vpbroadcastd   m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
    pxor          m31, m31
    lea            r3, [r8+r3*8]  ; gf_shr + (damping - 30) * 8
    vpermb         m4, m30, m12   ; pxtl
    mova           m1, m0
    vpermb         m5, m30, m13   ; pxtr
    mova           m2, m0
    vpermb         m6, m30, m14   ; pxbl
    mova           m3, m0
    vpermb         m7, m30, m15   ; pxbr
    cmp           r6b, 0x0f
    jne .mask_edges               ; mask edges only if required
    test         prid, prid
    jz .sec_only
    vpaddd        m11, m30, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
    vpermb         m8, m11, m12   ; pNtl k0p0 k0p1 k1p0 k1p1
    vpermb         m9, m11, m13   ; pNtr
    vpermb        m10, m11, m14   ; pNbl
    vpermb        m11, m11, m15   ; pNbr
%macro CDEF_FILTER_8x8_PRI 0
    vpcmpub        k1, m4, m8, 6  ; pxtl > pNtl
    vpcmpub        k2, m5, m9, 6  ; pxtr > pNtr
    vpcmpub        k3, m6, m10, 6 ; pxbl > pNbl
    vpcmpub        k4, m7, m11, 6 ; pxbr > pNbr
    psubb         m16, m8, m4
    psubb         m17, m9, m5
    psubb         m18, m10, m6
    psubb         m19, m11, m7
    lzcnt         r6d, prid
    vpsubb    m16{k1}, m4, m8     ; abs(diff_tl)
    vpsubb    m17{k2}, m5, m9     ; abs(diff_tr)
    vpsubb    m18{k3}, m6, m10    ; abs(diff_bl)
    vpsubb    m19{k4}, m7, m11    ; abs(diff_br)
    vpbroadcastq  m28, [r3+r6*8]
    vpbroadcastb  m29, prid
    and          prid, 1
    vpbroadcastd  m27, [base+pri_tap+priq*4]
    vgf2p8affineqb m20, m16, m28, 0 ; abs(dtl) >> shift
    vgf2p8affineqb m21, m17, m28, 0 ; abs(dtr) >> shift
    vgf2p8affineqb m22, m18, m28, 0 ; abs(dbl) >> shift
    vgf2p8affineqb m23, m19, m28, 0 ; abs(dbr) >> shift
    mova          m24, m27
    mova          m25, m27
    mova          m26, m27
    movifnidn     t1d, secm
    vpsubb    m24{k1}, m31, m27   ; apply_sign(pri_tap_tl)
    vpsubb    m25{k2}, m31, m27   ; apply_sign(pri_tap_tr)
    vpsubb    m26{k3}, m31, m27   ; apply_sign(pri_tap_bl)
    vpsubb    m27{k4}, m31, m27   ; apply_sign(pri_tap_br)
    psubusb       m20, m29, m20   ; imax(0, pri_strength - (abs(dtl) >> shift))
    psubusb       m21, m29, m21   ; imax(0, pri_strength - (abs(dtr) >> shift))
    psubusb       m22, m29, m22   ; imax(0, pri_strength - (abs(dbl) >> shift))
    psubusb       m23, m29, m23   ; imax(0, pri_strength - (abs(dbr) >> shift))
    pminub        m16, m20
    pminub        m17, m21
    pminub        m18, m22
    pminub        m19, m23
    vpdpbusd       m0, m16, m24   ; sum tl
    vpdpbusd       m1, m17, m25   ; sum tr
    vpdpbusd       m2, m18, m26   ; sum bl
    vpdpbusd       m3, m19, m27   ; sum br
%endmacro
    CDEF_FILTER_8x8_PRI
    test          t1d, t1d        ; sec
    jz .end_no_clip
    call .sec
.end_clip:
    pminub        m20, m8, m4
    pminub        m24, m12, m16
    pminub        m21, m9, m5
    pminub        m25, m13, m17
    pminub        m22, m10, m6
    pminub        m26, m14, m18
    pminub        m23, m11, m7
    pminub        m27, m15, m19
    pmaxub         m8, m4
    pmaxub        m12, m16
    pmaxub         m9, m5
    pmaxub        m13, m17
    pmaxub        m10, m6
    pmaxub        m14, m18
    pmaxub        m11, m7
    pmaxub        m15, m19
    pminub        m20, m24
    pminub        m21, m25
    pminub        m22, m26
    pminub        m23, m27
    pmaxub         m8, m12
    pmaxub         m9, m13
    pmaxub        m10, m14
    pmaxub        m11, m15
    mov           r2d, 0xAAAAAAAA
    kmovd          k1, r2d
    vpshrdd       m24,  m0,  m1, 16
    vpshrdd       m25,  m2,  m3, 16
    vpshrdd       m12, m20, m21, 16
    vpshrdd       m14, m22, m23, 16
    vpshrdd       m16,  m8,  m9, 16
    vpshrdd       m18, m10, m11, 16
    vpblendmw m13{k1}, m20, m21
    vpblendmw m15{k1}, m22, m23
    vpblendmw m17{k1},  m8, m9
    vpblendmw m19{k1}, m10, m11
    vpblendmw m20{k1},  m0, m24
    vpblendmw m21{k1}, m24, m1
    vpblendmw m22{k1},  m2, m25
    vpblendmw m23{k1}, m25, m3
    vpshrdd        m4, m5, 16
    vpshrdd        m6, m7, 16
    pminub        m12, m13
    pminub        m14, m15
    pmaxub        m16, m17
    pmaxub        m18, m19
    mova           m8, [base+end_perm_clip]
    vpcmpw         k2, m20, m31, 1
    vpcmpw         k3, m22, m31, 1
    vpshldw        m4, m21, 8
    vpshldw        m6, m23, 8
    kunpckdq       k1, k1, k1
    kxnorb         k4, k4, k4
    vpshrdw       m11, m12, m14, 8
    vpshrdw       m15, m16, m18, 8
    vpblendmb m13{k1}, m12, m14
    vpblendmb m17{k1}, m16, m18
    psubw         m21, m31, m20
    psubw         m23, m31, m22
    paddusw        m0, m20, m4  ; clip >0xff
    paddusw        m1, m22, m6
    pminub        m11, m13
    pmaxub        m15, m17
    psubusw    m0{k2}, m4, m21  ; clip <0x00
    psubusw    m1{k3}, m6, m23
    psrlw          m0, 8
    vmovdqu8   m0{k1}, m1
    pmaxub         m0, m11
    pminub         m0, m15
    vpermb         m0, m8, m0
    vextracti32x4 xm1, m0, 1
    vextracti32x4 xm2, m0, 2
    vextracti32x4 xm3, m0, 3
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*2], xm1
    movq   [r10 +strideq*0], xm2
    movq   [r10 +strideq*2], xm3
    movhps [dstq+strideq*1], xm0
    movhps [dstq+r9       ], xm1
    movhps [r10 +strideq*1], xm2
    movhps [r10 +r9       ], xm3
    RET
.sec_only:
    movifnidn     t1d, secm
    call .sec
.end_no_clip:
    mova          xm8, [base+end_perm]
    kxnorb         k1, k1, k1
    vpshldd        m4, m0, 8  ; (px << 8) + ((sum > -8) << 4)
    vpshldd        m5, m1, 8
    vpshldd        m6, m2, 8
    vpshldd        m7, m3, 8
    paddw          m0, m4     ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
    paddw          m1, m5
    paddw          m2, m6
    paddw          m3, m7
    vpermb         m0, m8, m0
    vpermb         m1, m8, m1
    vpermb         m2, m8, m2
    vpermb         m3, m8, m3
    punpckldq      m4, m0, m1
    punpckhdq      m0, m1
    punpckldq      m5, m2, m3
    punpckhdq      m2, m3
    movq   [dstq+strideq*0], xm4
    movq   [dstq+strideq*2], xm0
    movq   [r10 +strideq*0], xm5
    movq   [r10 +strideq*2], xm2
    movhps [dstq+strideq*1], xm4
    movhps [dstq+r9       ], xm0
    movhps [r10 +strideq*1], xm5
    movhps [r10 +r9       ], xm2
    RET
.mask_edges_sec_only:
    movifnidn     t1d, secm
    call .mask_edges_sec
    jmp .end_no_clip
ALIGN function_align
.mask_edges:
    mov           t0d, r6d
    mov           t1d, r6d
    or            t0d, 0xA ; top-left 4x4 has bottom and right
    or            t1d, 0x9 ; top-right 4x4 has bottom and left
    vpbroadcastq  m26, [base+edge_mask+t0*8]
    vpbroadcastq  m27, [base+edge_mask+t1*8]
    mov           t1d, r6d
    or            r6d, 0x6 ; bottom-left 4x4 has top and right
    or            t1d, 0x5 ; bottom-right 4x4 has top and left
    vpbroadcastq  m28, [base+edge_mask+r6*8]
    vpbroadcastq  m29, [base+edge_mask+t1*8]
    mov           t0d, dirm
    test         prid, prid
    jz .mask_edges_sec_only
    vpaddd        m20, m30, [base+cdef_dirs+(t0+2)*4] {1to16}
    vpshufbitqmb   k1, m26, m20 ; index in-range
    vpshufbitqmb   k2, m27, m20
    vpshufbitqmb   k3, m28, m20
    vpshufbitqmb   k4, m29, m20
    mova           m8, m4
    mova           m9, m5
    mova          m10, m6
    mova          m11, m7
    vpermb     m8{k1}, m20, m12
    vpermb     m9{k2}, m20, m13
    vpermb    m10{k3}, m20, m14
    vpermb    m11{k4}, m20, m15
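    ; CDEF_FILTER_8x8_PRI clobbers m16-m29, so preserve the edge masks on
    ; the stack across it (reloaded below if the secondary filter runs)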
    mova   [rsp+0x00], m26
    mova   [rsp+0x40], m27
    mova   [rsp+0x80], m28
    mova   [rsp+0xC0], m29
    CDEF_FILTER_8x8_PRI
    test          t1d, t1d
    jz .end_no_clip
    mova          m26, [rsp+0x00]
    mova          m27, [rsp+0x40]
    mova          m28, [rsp+0x80]
    mova          m29, [rsp+0xC0]
    call .mask_edges_sec
    jmp .end_clip
.mask_edges_sec:
    vpaddd        m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16}
    vpaddd        m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16}
    vpshufbitqmb   k1, m26, m20
    vpshufbitqmb   k2, m27, m20
    vpshufbitqmb   k3, m28, m20
    vpshufbitqmb   k4, m29, m20
    mova          m16, m4
    mova          m17, m5
    mova          m18, m6
    mova          m19, m7
    vpermb    m16{k1}, m20, m12
    vpermb    m17{k2}, m20, m13
    vpermb    m18{k3}, m20, m14
    vpermb    m19{k4}, m20, m15
    vpshufbitqmb   k1, m26, m21
    vpshufbitqmb   k2, m27, m21
    vpshufbitqmb   k3, m28, m21
    vpshufbitqmb   k4, m29, m21
    vpermb        m12, m21, m12
    vpermb        m13, m21, m13
    vpermb        m14, m21, m14
    vpermb        m15, m21, m15
    vpblendmb m12{k1}, m4, m12
    vpblendmb m13{k2}, m5, m13
    vpblendmb m14{k3}, m6, m14
    vpblendmb m15{k4}, m7, m15
    jmp .sec_main
ALIGN function_align
.sec:
    vpaddd        m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
    vpaddd        m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
    vpermb        m16, m20, m12 ; pNtl k0s0 k0s1 k1s0 k1s1
    vpermb        m17, m20, m13 ; pNtr
    vpermb        m18, m20, m14 ; pNbl
    vpermb        m19, m20, m15 ; pNbr
    vpermb        m12, m21, m12 ; pNtl k0s2 k0s3 k1s2 k1s3
    vpermb        m13, m21, m13 ; pNtr
    vpermb        m14, m21, m14 ; pNbl
    vpermb        m15, m21, m15 ; pNbr
.sec_main:
%macro CDEF_FILTER_8x8_SEC 4-5 0 ; load constants
    vpcmpub        k1, m4, %1, 6
    vpcmpub        k2, m5, %2, 6
    vpcmpub        k3, m6, %3, 6
    vpcmpub        k4, m7, %4, 6
    psubb         m20, %1, m4
    psubb         m21, %2, m5
    psubb         m22, %3, m6
    psubb         m23, %4, m7
%if %5
    vpbroadcastb  m28, t1d
    lzcnt         t1d, t1d
    vpbroadcastq  m29, [r3+t1*8]
%endif
    vpsubb    m20{k1}, m4, %1
    vpsubb    m21{k2}, m5, %2
    vpsubb    m22{k3}, m6, %3
    vpsubb    m23{k4}, m7, %4
    gf2p8affineqb m24, m20, m29, 0
    gf2p8affineqb m25, m21, m29, 0
    gf2p8affineqb m26, m22, m29, 0
    gf2p8affineqb m27, m23, m29, 0
%if %5
    vpbroadcastd  m30, [base+sec_tap]
%endif
    psubusb       m24, m28, m24
    psubusb       m25, m28, m25
    psubusb       m26, m28, m26
    psubusb       m27, m28, m27
    pminub        m20, m24
    pminub        m21, m25
    pminub        m22, m26
    pminub        m23, m27
    mova          m24, m30
    mova          m25, m30
    mova          m26, m30
    mova          m27, m30
    vpsubb    m24{k1}, m31, m30
    vpsubb    m25{k2}, m31, m30
    vpsubb    m26{k3}, m31, m30
    vpsubb    m27{k4}, m31, m30
    vpdpbusd       m0, m20, m24
    vpdpbusd       m1, m21, m25
    vpdpbusd       m2, m22, m26
    vpdpbusd       m3, m23, m27
%endmacro
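    ; invoked twice: first for the dir+2 taps with %5=1 to load the
    ; strength/shift constants, then for the dir-2 taps reusing them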
    CDEF_FILTER_8x8_SEC m16, m17, m18, m19, 1
    CDEF_FILTER_8x8_SEC m12, m13, m14, m15
    ret

%endif ; ARCH_X86_64