; Copyright © 2020, VideoLAN and dav1d authors
; Copyright © 2020, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 26%include "config.asm" 27%include "ext/x86/x86inc.asm" 28 29%if HAVE_AVX512ICL && ARCH_X86_64 30 31%macro DUP4 1-* 32 %rep %0 33 times 4 db %1 34 %rotate 1 35 %endrep 36%endmacro 37 38%macro DIRS 16 ; cdef_directions[] 39 %rep 4 + 16 + 4 ; 6 7 0 1 2 3 4 5 6 7 0 1 40 ; masking away unused bits allows us to use a single vpaddd {1to16} 41 ; instruction instead of having to do vpbroadcastd + paddb 42 db %13 & 0x3f, -%13 & 0x3f 43 %rotate 1 44 %endrep 45%endmacro 46 47SECTION_RODATA 64 48 49lut_perm_4x4: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79 50 db 16, 17, 0, 1, 2, 3, 4, 5, 18, 19, 8, 9, 10, 11, 12, 13 51 db 20, 21, 80, 81, 82, 83, 84, 85, 22, 23, 32, 33, 34, 35, 36, 37 52 db 98, 99,100,101,102,103,104,105, 50, 51, 52, 53, 54, 55, 56, 57 53lut_perm_4x8a: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79 54 db 96, 97, 0, 1, 2, 3, 4, 5, 98, 99, 8, 9, 10, 11, 12, 13 55lut_perm_4x8b:db 100,101, 16, 17, 18, 19, 20, 21,102,103, 24, 25, 26, 27, 28, 29 56 db 104,105, 32, 33, 34, 35, 36, 37,106,107, 40, 41, 42, 43, 44, 45 57 db 108,109, 48, 49, 50, 51, 52, 53,110,111, 56, 57, 58, 59, 60, 61 58 db 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95 59pd_01234567: dd 0, 1, 2, 3, 4, 5, 6, 7 60lut_perm_8x8a: db 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 61 db -1, -1, 34, 35, 36, 37, 38, 39, -1, -1, 50, 51, 52, 53, 54, 55 62 db -1, -1, 66, 67, 68, 69, 70, 71, -1, -1, 82, 83, 84, 85, 86, 87 63 db 96, 97, 98, 99,100,101,102,103,112,113,114,115,116,117,118,119 64lut_perm_8x8b: db 4, 5, 6, 7, 8, 9, 10, 11, 20, 21, 22, 23, 24, 25, 26, 27 65 db 36, 37, 38, 39, 40, 41, 42, 43, 52, 53, 54, 55, 56, 57, 58, 59 66 db 68, 69, 70, 71, 72, 73, 74, 75, 84, 85, 86, 87, 88, 89, 90, 91 67 db 100,101,102,103,104,105,106,107,116,117,118,119,120,121,122,123 68edge_mask: dq 0x00003c3c3c3c0000, 0x00003f3f3f3f0000 ; 0000, 0001 69 dq 0x0000fcfcfcfc0000, 0x0000ffffffff0000 ; 0010, 0011 70 dq 0x00003c3c3c3c3c3c, 0x00003f3f3f3f3f3f ; 0100, 
0101 71 dq 0x0000fcfcfcfcfcfc, 0x0000ffffffffffff ; 0110, 0111 72 dq 0x3c3c3c3c3c3c0000, 0x3f3f3f3f3f3f0000 ; 1000, 1001 73 dq 0xfcfcfcfcfcfc0000, 0xffffffffffff0000 ; 1010, 1011 74 dq 0x3c3c3c3c3c3c3c3c, 0x3f3f3f3f3f3f3f3f ; 1100, 1101 75 dq 0xfcfcfcfcfcfcfcfc, 0xffffffffffffffff ; 1110, 1111 76px_idx: DUP4 18, 19, 20, 21, 26, 27, 28, 29, 34, 35, 36, 37, 42, 43, 44, 45 77cdef_dirs: DIRS -7,-14, 1, -6, 1, 2, 1, 10, 9, 18, 8, 17, 8, 16, 8, 15 78gf_shr: dq 0x0102040810204080, 0x0102040810204080 ; >> 0, >> 0 79 dq 0x0204081020408000, 0x0408102040800000 ; >> 1, >> 2 80 dq 0x0810204080000000, 0x1020408000000000 ; >> 3, >> 4 81 dq 0x2040800000000000, 0x4080000000000000 ; >> 5, >> 6 82 times 16 db 0 ; realign (introduced by cdef_dirs) 83end_perm_w8clip:db 0, 4, 8, 12, 2, 6, 10, 14, 16, 20, 24, 28, 18, 22, 26, 30 84 db 32, 36, 40, 44, 34, 38, 42, 46, 48, 52, 56, 60, 50, 54, 58, 62 85 db 1, 5, 9, 13, 3, 7, 11, 15, 17, 21, 25, 29, 19, 23, 27, 31 86 db 33, 37, 41, 45, 35, 39, 43, 47, 49, 53, 57, 61, 51, 55, 59, 63 87end_perm: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 88 db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63 89pri_tap: db 64, 64, 32, 32, 48, 48, 48, 48 ; left-shifted by 4 90sec_tap: db 32, 32, 16, 16 91pd_268435568: dd 268435568 92 93SECTION .text 94 95%if WIN64 96DECLARE_REG_TMP 5, 6 97%else 98DECLARE_REG_TMP 8, 5 99%endif 100 101; lut: 102; t0 t1 t2 t3 t4 t5 t6 t7 103; T0 T1 T2 T3 T4 T5 T6 T7 104; L0 L1 00 01 02 03 04 05 105; L2 L3 10 11 12 13 14 15 106; L4 L5 20 21 22 23 24 25 107; L6 L7 30 31 32 33 34 35 108; 4e 4f 40 41 42 43 44 45 109; 5e 5f 50 51 52 53 54 55 110 111INIT_ZMM avx512icl 112cglobal cdef_filter_4x4, 4, 8, 13, dst, stride, left, top, pri, sec, dir, damping, edge 113%define base r7-edge_mask 114 movq xmm0, [dstq+strideq*0] 115 movhps xmm0, [dstq+strideq*1] 116 lea r7, [edge_mask] 117 movq xmm1, [topq+strideq*0-2] 118 movhps xmm1, [topq+strideq*1-2] 119 mov r6d, edgem 120 vinserti32x4 ym0, ymm0, [leftq], 1 121 lea 
r2, [strideq*3] 122 vinserti32x4 ym1, ymm1, [dstq+strideq*2], 1 123 mova m5, [base+lut_perm_4x4] 124 vinserti32x4 m0, [dstq+r2], 2 125 test r6b, 0x08 ; avoid buffer overread 126 jz .main 127 lea r3, [dstq+strideq*4-4] 128 vinserti32x4 m1, [r3+strideq*0], 2 129 vinserti32x4 m0, [r3+strideq*1], 3 130.main: 131 movifnidn prid, prim 132 mov t0d, dirm 133 mova m3, [base+px_idx] 134 mov r3d, dampingm 135 vpermi2b m5, m0, m1 ; lut 136 vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4) 137 pxor m7, m7 138 lea r3, [r7+r3*8] ; gf_shr + (damping - 30) * 8 139 vpermb m6, m3, m5 ; px 140 cmp r6d, 0x0f 141 jne .mask_edges ; mask edges only if required 142 test prid, prid 143 jz .sec_only 144 vpaddd m1, m3, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir 145 vpermb m1, m1, m5 ; k0p0 k0p1 k1p0 k1p1 146%macro CDEF_FILTER_4x4_PRI 0 147 vpcmpub k1, m6, m1, 6 ; px > pN 148 psubb m2, m1, m6 149 lzcnt r6d, prid 150 vpsubb m2{k1}, m6, m1 ; abs(diff) 151 vpbroadcastb m4, prid 152 and prid, 1 153 vgf2p8affineqb m9, m2, [r3+r6*8] {1to8}, 0 ; abs(diff) >> shift 154 movifnidn t1d, secm 155 vpbroadcastd m10, [base+pri_tap+priq*4] 156 vpsubb m10{k1}, m7, m10 ; apply_sign(pri_tap) 157 psubusb m4, m9 ; imax(0, pri_strength - (abs(diff) >> shift))) 158 pminub m2, m4 159 vpdpbusd m0, m2, m10 ; sum 160%endmacro 161 CDEF_FILTER_4x4_PRI 162 test t1d, t1d ; sec 163 jz .end_no_clip 164 call .sec 165.end_clip: 166 pminub m4, m6, m1 167 pmaxub m1, m6 168 pminub m5, m2, m3 169 pmaxub m2, m3 170 pminub m4, m5 171 pmaxub m2, m1 172 psrldq m1, m4, 2 173 psrldq m3, m2, 2 174 pminub m1, m4 175 vpcmpw k1, m0, m7, 1 176 vpshldd m6, m0, 8 177 pmaxub m2, m3 178 pslldq m3, m1, 1 179 psubw m7, m0 180 paddusw m0, m6 ; clip >0xff 181 vpsubusw m0{k1}, m6, m7 ; clip <0x00 182 pslldq m4, m2, 1 183 pminub m1, m3 184 pmaxub m2, m4 185 pmaxub m0, m1 186 pminub m0, m2 187 jmp .end 188.sec_only: 189 movifnidn t1d, secm 190 call .sec 191.end_no_clip: 192 vpshldd m6, m0, 8 ; (px << 8) + ((sum > -8) << 4) 193 paddw m0, m6 ; (px 
<< 8) + ((sum + (sum > -8) + 7) << 4) 194.end: 195 mova xm1, [base+end_perm] 196 vpermb m0, m1, m0 ; output in bits 8-15 of each dword 197 movd [dstq+strideq*0], xm0 198 pextrd [dstq+strideq*1], xm0, 1 199 pextrd [dstq+strideq*2], xm0, 2 200 pextrd [dstq+r2 ], xm0, 3 201 RET 202.mask_edges_sec_only: 203 movifnidn t1d, secm 204 call .mask_edges_sec 205 jmp .end_no_clip 206ALIGN function_align 207.mask_edges: 208 vpbroadcastq m8, [base+edge_mask+r6*8] 209 test prid, prid 210 jz .mask_edges_sec_only 211 vpaddd m2, m3, [base+cdef_dirs+(t0+2)*4] {1to16} 212 vpshufbitqmb k1, m8, m2 ; index in-range 213 mova m1, m6 214 vpermb m1{k1}, m2, m5 215 CDEF_FILTER_4x4_PRI 216 test t1d, t1d 217 jz .end_no_clip 218 call .mask_edges_sec 219 jmp .end_clip 220.mask_edges_sec: 221 vpaddd m4, m3, [base+cdef_dirs+(t0+4)*4] {1to16} 222 vpaddd m9, m3, [base+cdef_dirs+(t0+0)*4] {1to16} 223 vpshufbitqmb k1, m8, m4 224 mova m2, m6 225 vpermb m2{k1}, m4, m5 226 vpshufbitqmb k1, m8, m9 227 mova m3, m6 228 vpermb m3{k1}, m9, m5 229 jmp .sec_main 230ALIGN function_align 231.sec: 232 vpaddd m2, m3, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2 233 vpaddd m3, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2 234 vpermb m2, m2, m5 ; k0s0 k0s1 k1s0 k1s1 235 vpermb m3, m3, m5 ; k0s2 k0s3 k1s2 k1s3 236.sec_main: 237 vpbroadcastd m8, [base+sec_tap] 238 vpcmpub k1, m6, m2, 6 239 psubb m4, m2, m6 240 vpbroadcastb m12, t1d 241 lzcnt t1d, t1d 242 vpsubb m4{k1}, m6, m2 243 vpcmpub k2, m6, m3, 6 244 vpbroadcastq m11, [r3+t1*8] 245 gf2p8affineqb m10, m4, m11, 0 246 psubb m5, m3, m6 247 mova m9, m8 248 vpsubb m8{k1}, m7, m8 249 psubusb m10, m12, m10 250 vpsubb m5{k2}, m6, m3 251 pminub m4, m10 252 vpdpbusd m0, m4, m8 253 gf2p8affineqb m11, m5, m11, 0 254 vpsubb m9{k2}, m7, m9 255 psubusb m12, m11 256 pminub m5, m12 257 vpdpbusd m0, m5, m9 258 ret 259 260DECLARE_REG_TMP 2, 7 261 262; lut top lut bottom 263; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25 264; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35 265; L0 L1 00 
01 02 03 04 05 L8 L9 40 41 42 43 44 45 266; L2 L3 10 11 12 13 14 15 La Lb 50 51 52 53 54 55 267; L4 L5 20 21 22 23 24 25 Lc Ld 60 61 62 63 64 65 268; L6 L7 30 31 32 33 34 35 Le Lf 70 71 72 73 74 75 269; L8 L9 40 41 42 43 44 45 8e 8f 80 81 82 83 84 85 270; La Lb 50 51 52 53 54 55 9e 9f 90 91 92 93 94 95 271 272cglobal cdef_filter_4x8, 4, 9, 22, dst, stride, left, top, \ 273 pri, sec, dir, damping, edge 274%define base r8-edge_mask 275 vpbroadcastd ym21, strided 276 mov r6d, edgem 277 lea r8, [edge_mask] 278 movq xm1, [topq+strideq*0-2] 279 pmulld ym21, [base+pd_01234567] 280 kxnorb k1, k1, k1 281 movq xm2, [topq+strideq*1-2] 282 vpgatherdq m0{k1}, [dstq+ym21] ; +0+1 +2+3 +4+5 +6+7 283 mova m14, [base+lut_perm_4x8a] 284 movu m15, [base+lut_perm_4x8b] 285 test r6b, 0x08 ; avoid buffer overread 286 jz .main 287 lea r7, [dstq+strideq*8-2] 288 vinserti32x4 ym1, [r7+strideq*0], 1 289 vinserti32x4 ym2, [r7+strideq*1], 1 290.main: 291 punpcklqdq ym1, ym2 292 vinserti32x4 m1, [leftq], 2 ; -2-1 +8+9 left ____ 293 movifnidn prid, prim 294 mov t0d, dirm 295 mova m16, [base+px_idx] 296 mov r3d, dampingm 297 vpermi2b m14, m0, m1 ; lut top 298 vpermi2b m15, m0, m1 ; lut bottom 299 vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4) 300 pxor m20, m20 301 lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8 302 vpermb m2, m16, m14 ; pxt 303 vpermb m3, m16, m15 ; pxb 304 mova m1, m0 305 cmp r6b, 0x0f 306 jne .mask_edges ; mask edges only if required 307 test prid, prid 308 jz .sec_only 309 vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir 310 vpermb m4, m6, m14 ; pNt k0p0 k0p1 k1p0 k1p1 311 vpermb m5, m6, m15 ; pNb 312%macro CDEF_FILTER_4x8_PRI 0 313 vpcmpub k1, m2, m4, 6 ; pxt > pNt 314 vpcmpub k2, m3, m5, 6 ; pxb > pNb 315 psubb m6, m4, m2 316 psubb m7, m5, m3 317 lzcnt r6d, prid 318 vpsubb m6{k1}, m2, m4 ; abs(diff_top) 319 vpsubb m7{k2}, m3, m5 ; abs(diff_bottom) 320 vpbroadcastb m13, prid 321 vpbroadcastq m9, [r3+r6*8] 322 and prid, 1 323 vpbroadcastd m11, 
[base+pri_tap+priq*4] 324 vgf2p8affineqb m8, m6, m9, 0 ; abs(dt) >> shift 325 vgf2p8affineqb m9, m7, m9, 0 ; abs(db) >> shift 326 mova m10, m11 327 movifnidn t1d, secm 328 vpsubb m10{k1}, m20, m11 ; apply_sign(pri_tap_top) 329 vpsubb m11{k2}, m20, m11 ; apply_sign(pri_tap_bottom) 330 psubusb m12, m13, m8 ; imax(0, pri_strength - (abs(dt) >> shift))) 331 psubusb m13, m13, m9 ; imax(0, pri_strength - (abs(db) >> shift))) 332 pminub m6, m12 333 pminub m7, m13 334 vpdpbusd m0, m6, m10 ; sum top 335 vpdpbusd m1, m7, m11 ; sum bottom 336%endmacro 337 CDEF_FILTER_4x8_PRI 338 test t1d, t1d ; sec 339 jz .end_no_clip 340 call .sec 341.end_clip: 342 pminub m10, m4, m2 343 pminub m12, m6, m8 344 pminub m11, m5, m3 345 pminub m13, m7, m9 346 pmaxub m4, m2 347 pmaxub m6, m8 348 pmaxub m5, m3 349 pmaxub m7, m9 350 pminub m10, m12 351 pminub m11, m13 352 pmaxub m4, m6 353 pmaxub m5, m7 354 mov r2d, 0xAAAAAAAA 355 kmovd k1, r2d 356 kxnorb k2, k2, k2 ; hw lw 357 vpshrdd m12, m0, m1, 16 ; m1lw m0hw 358 vpshrdd m6, m10, m11, 16 ; m11lw m10hw 359 vpshrdd m8, m4, m5, 16 ; m5lw m4hw 360 vpblendmw m7{k1}, m10, m11 ; m11hw m10lw 361 vpblendmw m9{k1}, m4, m5 ; m5hw m4lw 362 vpblendmw m4{k1}, m0, m12 ; m1lw m0lw 363 vpblendmw m5{k1}, m12, m1 ; m1hw m0hw 364 vpshrdd m2, m3, 16 365 pminub m6, m7 366 pmaxub m8, m9 367 mova ym14, [base+end_perm] 368 vpcmpw k1, m4, m20, 1 369 vpshldw m2, m5, 8 370 pslldq m7, m6, 1 371 pslldq m9, m8, 1 372 psubw m5, m20, m4 373 paddusw m0, m4, m2 ; clip >0xff 374 pminub m6, m7 375 pmaxub m8, m9 376 psubusw m0{k1}, m2, m5 ; clip <0x00 377 pmaxub m0, m6 378 pminub m0, m8 379 vpermb m0, m14, m0 380 vpscatterdd [dstq+ym21]{k2}, ym0 381 RET 382.sec_only: 383 movifnidn t1d, secm 384 call .sec 385.end_no_clip: 386 mova ym4, [base+end_perm] 387 kxnorb k1, k1, k1 388 vpshldd m2, m0, 8 ; (px << 8) + ((sum > -8) << 4) 389 vpshldd m3, m1, 8 390 paddw m0, m2 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) 391 paddw m1, m3 392 pslld m0, 16 393 vpshrdd m0, m1, 16 394 vpermb m0, m4, 
m0 ; output in bits 8-15 of each word 395 vpscatterdd [dstq+ym21]{k1}, ym0 396 RET 397.mask_edges_sec_only: 398 movifnidn t1d, secm 399 call .mask_edges_sec 400 jmp .end_no_clip 401ALIGN function_align 402.mask_edges: 403 mov t1d, r6d 404 or r6d, 8 ; top 4x4 has bottom 405 or t1d, 4 ; bottom 4x4 has top 406 vpbroadcastq m17, [base+edge_mask+r6*8] 407 vpbroadcastq m18, [base+edge_mask+t1*8] 408 test prid, prid 409 jz .mask_edges_sec_only 410 vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} 411 vpshufbitqmb k1, m17, m6 ; index in-range 412 vpshufbitqmb k2, m18, m6 413 mova m4, m2 414 mova m5, m3 415 vpermb m4{k1}, m6, m14 416 vpermb m5{k2}, m6, m15 417 CDEF_FILTER_4x8_PRI 418 test t1d, t1d 419 jz .end_no_clip 420 call .mask_edges_sec 421 jmp .end_clip 422.mask_edges_sec: 423 vpaddd m10, m16, [base+cdef_dirs+(t0+4)*4] {1to16} 424 vpaddd m11, m16, [base+cdef_dirs+(t0+0)*4] {1to16} 425 vpshufbitqmb k1, m17, m10 426 vpshufbitqmb k2, m18, m10 427 vpshufbitqmb k3, m17, m11 428 vpshufbitqmb k4, m18, m11 429 mova m6, m2 430 mova m7, m3 431 mova m8, m2 432 mova m9, m3 433 vpermb m6{k1}, m10, m14 434 vpermb m7{k2}, m10, m15 435 vpermb m8{k3}, m11, m14 436 vpermb m9{k4}, m11, m15 437 jmp .sec_main 438ALIGN function_align 439.sec: 440 vpaddd m8, m16, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2 441 vpaddd m9, m16, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2 442 vpermb m6, m8, m14 ; pNt k0s0 k0s1 k1s0 k1s1 443 vpermb m7, m8, m15 ; pNb 444 vpermb m8, m9, m14 ; pNt k0s2 k0s3 k1s2 k1s3 445 vpermb m9, m9, m15 ; pNb 446.sec_main: 447 vpbroadcastb m18, t1d 448 lzcnt t1d, t1d 449 vpcmpub k1, m2, m6, 6 450 vpcmpub k2, m3, m7, 6 451 vpcmpub k3, m2, m8, 6 452 vpcmpub k4, m3, m9, 6 453 vpbroadcastq m17, [r3+t1*8] 454 psubb m10, m6, m2 455 psubb m11, m7, m3 456 psubb m12, m8, m2 457 psubb m13, m9, m3 458 vpsubb m10{k1}, m2, m6 ; abs(dt0) 459 vpsubb m11{k2}, m3, m7 ; abs(db0) 460 vpsubb m12{k3}, m2, m8 ; abs(dt1) 461 vpsubb m13{k4}, m3, m9 ; abs(db1) 462 vpbroadcastd m19, [base+sec_tap] 463 
gf2p8affineqb m14, m10, m17, 0 ; abs(dt0) >> shift 464 gf2p8affineqb m15, m11, m17, 0 ; abs(db0) >> shift 465 gf2p8affineqb m16, m12, m17, 0 ; abs(dt1) >> shift 466 gf2p8affineqb m17, m13, m17, 0 ; abs(db1) >> shift 467 psubusb m14, m18, m14 ; imax(0, sec_strength - (abs(dt0) >> shift))) 468 psubusb m15, m18, m15 ; imax(0, sec_strength - (abs(db0) >> shift))) 469 psubusb m16, m18, m16 ; imax(0, sec_strength - (abs(dt1) >> shift))) 470 psubusb m17, m18, m17 ; imax(0, sec_strength - (abs(db1) >> shift))) 471 pminub m10, m14 472 pminub m11, m15 473 pminub m12, m16 474 pminub m13, m17 475 mova m14, m19 476 mova m15, m19 477 mova m16, m19 478 vpsubb m14{k1}, m20, m19 ; apply_sign(sec_tap_top_0) 479 vpsubb m15{k2}, m20, m19 ; apply_sign(sec_tap_bottom_0) 480 vpsubb m16{k3}, m20, m19 ; apply_sign(sec_tap_top_1) 481 vpsubb m19{k4}, m20, m19 ; apply_sign(sec_tap_bottom_1) 482 vpdpbusd m0, m10, m14 483 vpdpbusd m1, m11, m15 484 vpdpbusd m0, m12, m16 485 vpdpbusd m1, m13, m19 486 ret 487 488; lut tl lut tr 489; t0 t1 t2 t3 t4 t5 t6 t7 t6 t7 t8 t9 ta tb tc td 490; T0 T1 T2 T3 T4 T5 T6 T7 T6 T7 T8 T9 TA TB TC TD 491; L0 L1 00 01 02 03 04 05 04 05 06 07 08 09 0a 0b 492; L2 L3 10 11 12 13 14 15 14 15 16 17 18 19 1a 1b 493; L4 L5 20 21 22 23 24 25 24 25 26 27 28 29 2a 2b 494; L6 L7 30 31 32 33 34 35 34 35 36 37 38 39 3a 3b 495; L8 L9 40 41 42 43 44 45 44 45 46 47 48 49 4a 4b 496; La Lb 50 51 52 53 54 55 54 55 56 57 58 59 5a 5b 497; lut bl lut br 498; L4 L5 20 21 22 23 24 25 24 25 26 27 28 29 2a 2b 499; L6 L7 30 31 32 33 34 35 34 35 36 37 38 39 3a 3b 500; L8 L9 40 41 42 43 44 45 44 45 46 47 48 49 4a 4b 501; La Lb 50 51 52 53 54 55 54 55 56 57 58 59 5a 5b 502; Lc Ld 60 61 62 63 64 65 64 65 66 67 68 69 6a 6b 503; Le Lf 70 71 72 73 74 75 74 75 76 77 78 79 7a 7b 504; 8e 8f 80 81 82 83 84 85 84 85 86 87 88 89 8a 8b 505; 9e 9f 90 91 92 93 94 95 94 95 96 97 98 99 9a 9b 506 507cglobal cdef_filter_8x8, 4, 11, 32, 4*64, dst, stride, left, top, \ 508 pri, sec, dir, damping, edge 509%define 
base r8-edge_mask 510 mov r6d, edgem 511 lea r10, [dstq+strideq*4-2] 512 movu xmm0, [topq+strideq*0-2] 513 movu xmm1, [dstq+strideq*2-2] 514 movu xmm2, [r10 +strideq*2 ] 515 lea r8, [edge_mask] 516 lea r9, [strideq*3] 517 pmovzxwq m10, [leftq-4] 518 vinserti32x4 ym0, ymm0, [topq+strideq*1-2], 1 519 vinserti32x4 ym1, ymm1, [dstq+r9 -2], 1 520 vinserti32x4 ym2, ymm2, [r10 +r9 ], 1 521 lea r7, [r10 +strideq*4 ] 522 pmovzxwq m11, [leftq+4] 523 vinserti32x4 m0, [dstq+strideq*0-2], 2 524 vinserti32x4 m1, [r10 +strideq*0 ], 2 525 mova m12, [base+lut_perm_8x8a] 526 movu m13, [base+lut_perm_8x8b] 527 vinserti32x4 m0, [dstq+strideq*1-2], 3 528 vinserti32x4 m1, [r10 +strideq*1 ], 3 529 test r6b, 0x08 ; avoid buffer overread 530 jz .main 531 vinserti32x4 m2, [r7 +strideq*0], 2 532 vinserti32x4 m2, [r7 +strideq*1], 3 533.main: 534 mov t1d, 0x11111100 535 mova m14, m12 536 mova m15, m13 537 kmovd k1, t1d 538 kshiftrd k2, k1, 8 539 movifnidn prid, prim 540 mov t0d, dirm 541 mova m30, [base+px_idx] 542 mov r3d, dampingm 543 vpermi2b m12, m0, m1 ; lut tl 544 vpermi2b m14, m1, m2 ; lut bl 545 vpermi2b m13, m0, m1 ; lut tr 546 vpermi2b m15, m1, m2 ; lut br 547 vpblendmw m12{k1}, m12, m10 548 vpblendmw m14{k2}, m14, m11 549 vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4) 550 pxor m31, m31 551 lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8 552 vpermb m4, m30, m12 ; pxtl 553 vpermb m5, m30, m13 ; pxtr 554 vpermb m6, m30, m14 ; pxbl 555 vpermb m7, m30, m15 ; pxbr 556 mova m1, m0 557 mova m2, m0 558 mova m3, m0 559 cmp r6b, 0x0f 560 jne .mask_edges ; mask edges only if required 561 test prid, prid 562 jz .sec_only 563 vpaddd m11, m30, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir 564 vpermb m8, m11, m12 ; pNtl k0p0 k0p1 k1p0 k1p1 565 vpermb m9, m11, m13 ; pNtr 566 vpermb m10, m11, m14 ; pNbl 567 vpermb m11, m11, m15 ; pNbr 568%macro CDEF_FILTER_8x8_PRI 0 569 vpcmpub k1, m4, m8, 6 ; pxtl > pNtl 570 vpcmpub k2, m5, m9, 6 ; pxtr > pNtr 571 vpcmpub k3, m6, m10, 6 ; pxbl > pNbl 572 
vpcmpub k4, m7, m11, 6 ; pxbr > pNbr 573 psubb m16, m8, m4 574 psubb m17, m9, m5 575 psubb m18, m10, m6 576 psubb m19, m11, m7 577 lzcnt r6d, prid 578 vpsubb m16{k1}, m4, m8 ; abs(diff_tl) 579 vpsubb m17{k2}, m5, m9 ; abs(diff_tr) 580 vpsubb m18{k3}, m6, m10 ; abs(diff_bl) 581 vpsubb m19{k4}, m7, m11 ; abs(diff_br) 582 vpbroadcastq m28, [r3+r6*8] 583 vpbroadcastb m29, prid 584 and prid, 1 585 vpbroadcastd m27, [base+pri_tap+priq*4] 586 vgf2p8affineqb m20, m16, m28, 0 ; abs(dtl) >> shift 587 vgf2p8affineqb m21, m17, m28, 0 ; abs(dtr) >> shift 588 vgf2p8affineqb m22, m18, m28, 0 ; abs(dbl) >> shift 589 vgf2p8affineqb m23, m19, m28, 0 ; abs(dbl) >> shift 590 mova m24, m27 591 mova m25, m27 592 mova m26, m27 593 movifnidn t1d, secm 594 vpsubb m24{k1}, m31, m27 ; apply_sign(pri_tap_tl) 595 vpsubb m25{k2}, m31, m27 ; apply_sign(pri_tap_tr) 596 vpsubb m26{k3}, m31, m27 ; apply_sign(pri_tap_tl) 597 vpsubb m27{k4}, m31, m27 ; apply_sign(pri_tap_tr) 598 psubusb m20, m29, m20 ; imax(0, pri_strength - (abs(dtl) >> shift))) 599 psubusb m21, m29, m21 ; imax(0, pri_strength - (abs(dtr) >> shift))) 600 psubusb m22, m29, m22 ; imax(0, pri_strength - (abs(dbl) >> shift))) 601 psubusb m23, m29, m23 ; imax(0, pri_strength - (abs(dbr) >> shift))) 602 pminub m16, m20 603 pminub m17, m21 604 pminub m18, m22 605 pminub m19, m23 606 vpdpbusd m0, m16, m24 ; sum tl 607 vpdpbusd m1, m17, m25 ; sum tr 608 vpdpbusd m2, m18, m26 ; sum bl 609 vpdpbusd m3, m19, m27 ; sum br 610%endmacro 611 CDEF_FILTER_8x8_PRI 612 test t1d, t1d ; sec 613 jz .end_no_clip 614 call .sec 615.end_clip: 616 pminub m20, m8, m4 617 pminub m24, m12, m16 618 pminub m21, m9, m5 619 pminub m25, m13, m17 620 pminub m22, m10, m6 621 pminub m26, m14, m18 622 pminub m23, m11, m7 623 pminub m27, m15, m19 624 pmaxub m8, m4 625 pmaxub m12, m16 626 pmaxub m9, m5 627 pmaxub m13, m17 628 pmaxub m10, m6 629 pmaxub m14, m18 630 pmaxub m11, m7 631 pmaxub m15, m19 632 pminub m20, m24 633 pminub m21, m25 634 pminub m22, m26 635 pminub m23, 
m27 636 pmaxub m8, m12 637 pmaxub m9, m13 638 pmaxub m10, m14 639 pmaxub m11, m15 640 mov r2d, 0xAAAAAAAA 641 kmovd k1, r2d 642 vpshrdd m24, m0, m1, 16 643 vpshrdd m25, m2, m3, 16 644 vpshrdd m12, m20, m21, 16 645 vpshrdd m14, m22, m23, 16 646 vpshrdd m16, m8, m9, 16 647 vpshrdd m18, m10, m11, 16 648 vpblendmw m13{k1}, m20, m21 649 vpblendmw m15{k1}, m22, m23 650 vpblendmw m17{k1}, m8, m9 651 vpblendmw m19{k1}, m10, m11 652 vpblendmw m20{k1}, m0, m24 653 vpblendmw m21{k1}, m24, m1 654 vpblendmw m22{k1}, m2, m25 655 vpblendmw m23{k1}, m25, m3 656 vpshrdd m4, m5, 16 657 vpshrdd m6, m7, 16 658 pminub m12, m13 659 pminub m14, m15 660 pmaxub m16, m17 661 pmaxub m18, m19 662 mova m8, [base+end_perm_w8clip] 663 vpcmpw k2, m20, m31, 1 664 vpcmpw k3, m22, m31, 1 665 vpshldw m4, m21, 8 666 vpshldw m6, m23, 8 667 kunpckdq k1, k1, k1 668 kxnorb k4, k4, k4 669 vpshrdw m11, m12, m14, 8 670 vpshrdw m15, m16, m18, 8 671 vpblendmb m13{k1}, m12, m14 672 vpblendmb m17{k1}, m16, m18 673 psubw m21, m31, m20 674 psubw m23, m31, m22 675 paddusw m0, m20, m4 ; clip >0xff 676 paddusw m1, m22, m6 677 pminub m11, m13 678 pmaxub m15, m17 679 psubusw m0{k2}, m4, m21 ; clip <0x00 680 psubusw m1{k3}, m6, m23 681 psrlw m0, 8 682 vmovdqu8 m0{k1}, m1 683 pmaxub m0, m11 684 pminub m0, m15 685 vpermb m0, m8, m0 686 add r10, 2 687 vextracti32x4 xm1, m0, 1 688 vextracti32x4 xm2, m0, 2 689 vextracti32x4 xm3, m0, 3 690 movq [dstq+strideq*0], xm0 691 movq [dstq+strideq*2], xm1 692 movq [r10 +strideq*0], xm2 693 movq [r10 +strideq*2], xm3 694 movhps [dstq+strideq*1], xm0 695 movhps [dstq+r9 ], xm1 696 movhps [r10 +strideq*1], xm2 697 movhps [r10 +r9 ], xm3 698 RET 699.sec_only: 700 movifnidn t1d, secm 701 call .sec 702.end_no_clip: 703 mova xm8, [base+end_perm] 704 kxnorb k1, k1, k1 705 vpshldd m4, m0, 8 ; (px << 8) + ((sum > -8) << 4) 706 vpshldd m5, m1, 8 707 vpshldd m6, m2, 8 708 vpshldd m7, m3, 8 709 paddw m0, m4 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) 710 paddw m1, m5 711 paddw m2, m6 712 paddw m3, 
m7 713 vpermb m0, m8, m0 714 vpermb m1, m8, m1 715 vpermb m2, m8, m2 716 vpermb m3, m8, m3 717 add r10, 2 718 punpckldq m4, m0, m1 719 punpckhdq m0, m1 720 punpckldq m5, m2, m3 721 punpckhdq m2, m3 722 movq [dstq+strideq*0], xm4 723 movq [dstq+strideq*2], xm0 724 movq [r10 +strideq*0], xm5 725 movq [r10 +strideq*2], xm2 726 movhps [dstq+strideq*1], xm4 727 movhps [dstq+r9 ], xm0 728 movhps [r10 +strideq*1], xm5 729 movhps [r10 +r9 ], xm2 730 RET 731.mask_edges_sec_only: 732 movifnidn t1d, secm 733 call .mask_edges_sec 734 jmp .end_no_clip 735ALIGN function_align 736.mask_edges: 737 mov t0d, r6d 738 mov t1d, r6d 739 or t0d, 0xA ; top-left 4x4 has bottom and right 740 or t1d, 0x9 ; top-right 4x4 has bottom and left 741 vpbroadcastq m26, [base+edge_mask+t0*8] 742 vpbroadcastq m27, [base+edge_mask+t1*8] 743 mov t1d, r6d 744 or r6d, 0x6 ; bottom-left 4x4 has top and right 745 or t1d, 0x5 ; bottom-right 4x4 has top and left 746 vpbroadcastq m28, [base+edge_mask+r6*8] 747 vpbroadcastq m29, [base+edge_mask+t1*8] 748 mov t0d, dirm 749 test prid, prid 750 jz .mask_edges_sec_only 751 vpaddd m20, m30, [base+cdef_dirs+(t0+2)*4] {1to16} 752 vpshufbitqmb k1, m26, m20 ; index in-range 753 vpshufbitqmb k2, m27, m20 754 vpshufbitqmb k3, m28, m20 755 vpshufbitqmb k4, m29, m20 756 mova m8, m4 757 mova m9, m5 758 mova m10, m6 759 mova m11, m7 760 vpermb m8{k1}, m20, m12 761 vpermb m9{k2}, m20, m13 762 vpermb m10{k3}, m20, m14 763 vpermb m11{k4}, m20, m15 764 mova [rsp+0x00], m26 765 mova [rsp+0x40], m27 766 mova [rsp+0x80], m28 767 mova [rsp+0xC0], m29 768 CDEF_FILTER_8x8_PRI 769 test t1d, t1d 770 jz .end_no_clip 771 mova m26, [rsp+0x00] 772 mova m27, [rsp+0x40] 773 mova m28, [rsp+0x80] 774 mova m29, [rsp+0xC0] 775 call .mask_edges_sec 776 jmp .end_clip 777.mask_edges_sec: 778 vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} 779 vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} 780 vpshufbitqmb k1, m26, m20 781 vpshufbitqmb k2, m27, m20 782 vpshufbitqmb k3, m28, m20 783 vpshufbitqmb 
k4, m29, m20 784 mova m16, m4 785 mova m17, m5 786 mova m18, m6 787 mova m19, m7 788 vpermb m16{k1}, m20, m12 789 vpermb m17{k2}, m20, m13 790 vpermb m18{k3}, m20, m14 791 vpermb m19{k4}, m20, m15 792 vpshufbitqmb k1, m26, m21 793 vpshufbitqmb k2, m27, m21 794 vpshufbitqmb k3, m28, m21 795 vpshufbitqmb k4, m29, m21 796 vpermb m12, m21, m12 797 vpermb m13, m21, m13 798 vpermb m14, m21, m14 799 vpermb m15, m21, m15 800 vpblendmb m12{k1}, m4, m12 801 vpblendmb m13{k2}, m5, m13 802 vpblendmb m14{k3}, m6, m14 803 vpblendmb m15{k4}, m7, m15 804 jmp .sec_main 805ALIGN function_align 806.sec: 807 vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2 808 vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2 809 vpermb m16, m20, m12 ; pNtl k0s0 k0s1 k1s0 k1s1 810 vpermb m17, m20, m13 ; pNtr 811 vpermb m18, m20, m14 ; pNbl 812 vpermb m19, m20, m15 ; pNbr 813 vpermb m12, m21, m12 ; pNtl k0s2 k0s3 k1s2 k1s3 814 vpermb m13, m21, m13 ; pNtr 815 vpermb m14, m21, m14 ; pNbl 816 vpermb m15, m21, m15 ; pNbr 817.sec_main: 818%macro CDEF_FILTER_8x8_SEC 4-5 0 ; load constants 819 vpcmpub k1, m4, %1, 6 820 vpcmpub k2, m5, %2, 6 821 vpcmpub k3, m6, %3, 6 822 vpcmpub k4, m7, %4, 6 823 psubb m20, %1, m4 824 psubb m21, %2, m5 825 psubb m22, %3, m6 826 psubb m23, %4, m7 827%if %5 828 vpbroadcastb m28, t1d 829 lzcnt t1d, t1d 830 vpbroadcastq m29, [r3+t1*8] 831%endif 832 vpsubb m20{k1}, m4, %1 833 vpsubb m21{k2}, m5, %2 834 vpsubb m22{k3}, m6, %3 835 vpsubb m23{k4}, m7, %4 836 gf2p8affineqb m24, m20, m29, 0 837 gf2p8affineqb m25, m21, m29, 0 838 gf2p8affineqb m26, m22, m29, 0 839 gf2p8affineqb m27, m23, m29, 0 840%if %5 841 vpbroadcastd m30, [base+sec_tap] 842%endif 843 psubusb m24, m28, m24 844 psubusb m25, m28, m25 845 psubusb m26, m28, m26 846 psubusb m27, m28, m27 847 pminub m20, m24 848 pminub m21, m25 849 pminub m22, m26 850 pminub m23, m27 851 mova m24, m30 852 mova m25, m30 853 mova m26, m30 854 mova m27, m30 855 vpsubb m24{k1}, m31, m30 856 vpsubb m25{k2}, m31, m30 857 
vpsubb m26{k3}, m31, m30 858 vpsubb m27{k4}, m31, m30 859 vpdpbusd m0, m20, m24 860 vpdpbusd m1, m21, m25 861 vpdpbusd m2, m22, m26 862 vpdpbusd m3, m23, m27 863%endmacro 864 CDEF_FILTER_8x8_SEC m16, m17, m18, m19, 1 865 CDEF_FILTER_8x8_SEC m12, m13, m14, m15 866 ret 867 868%endif ; HAVE_AVX512ICL && ARCH_X86_64 869