; Copyright © 2020, VideoLAN and dav1d authors
; Copyright © 2020, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

%macro DUP4 1-*
    %rep %0
        times 4 db %1
        %rotate 1
    %endrep
%endmacro

%macro DIRS 16 ; cdef_directions[]
    %rep 4 + 16 + 4 ; 6 7 0 1 2 3 4 5 6 7 0 1
        ; masking away unused bits allows us to use a single vpaddd {1to16}
        ; instruction instead of having to do vpbroadcastd + paddb
        db %13 & 0x3f, -%13 & 0x3f
        %rotate 1
    %endrep
%endmacro

SECTION_RODATA 64

lut_perm_4x4:  db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79
               db 16, 17, 0, 1, 2, 3, 4, 5, 18, 19, 8, 9, 10, 11, 12, 13
               db 20, 21, 80, 81, 82, 83, 84, 85, 22, 23, 32, 33, 34, 35, 36, 37
               db 98, 99,100,101,102,103,104,105, 50, 51, 52, 53, 54, 55, 56, 57
lut_perm_4x8a: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79
               db 96, 97, 0, 1, 2, 3, 4, 5, 98, 99, 8, 9, 10, 11, 12, 13
lut_perm_4x8b: db 100,101, 16, 17, 18, 19, 20, 21,102,103, 24, 25, 26, 27, 28, 29
               db 104,105, 32, 33, 34, 35, 36, 37,106,107, 40, 41, 42, 43, 44, 45
               db 108,109, 48, 49, 50, 51, 52, 53,110,111, 56, 57, 58, 59, 60, 61
               db 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95
pd_01234567:   dd 0, 1, 2, 3, 4, 5, 6, 7
lut_perm_8x8a: db 32, 33, 34, 35, 36, 37, 38, 39, 48, 49, 50, 51, 52, 53, 54, 55
               db 36, 37, 38, 39, 40, 41, 42, 43, 52, 53, 54, 55, 56, 57, 58, 59
lut_perm_8x8b: db 12, 13, 0, 1, 2, 3, 4, 5, 14, 15, 16, 17, 18, 19, 20, 21
               db 2, 3, 4, 5, 6, 7, 8, 9, 18, 19, 20, 21, 22, 23, 24, 25
               db 28, 29, 32, 33, 34, 35, 36, 37, 30, 31, 48, 49, 50, 51, 52, 53
               db 34, 35, 36, 37, 38, 39, 40, 41, 50, 51, 52, 53, 54, 55, 56, 57
end_perm:      db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
               db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63
end_perm_clip: db 0, 4, 8, 12, 2, 6, 10, 14, 16, 20, 24, 28, 18, 22, 26, 30
               db 32, 36, 40, 44, 34, 38, 42, 46, 48, 52, 56, 60, 50, 54, 58, 62
               db 1, 5, 9, 13, 3, 7, 11, 15, 17, 21, 25, 29, 19, 23, 27, 31
               db 33, 37, 41, 45, 35, 39, 43, 47, 49, 53, 57, 61, 51, 55, 59, 63
edge_mask:     dq 0x00003c3c3c3c0000, 0x00003f3f3f3f0000 ; 0000, 0001
               dq 0x0000fcfcfcfc0000, 0x0000ffffffff0000 ; 0010, 0011
               dq 0x00003c3c3c3c3c3c, 0x00003f3f3f3f3f3f ; 0100, 0101
               dq 0x0000fcfcfcfcfcfc, 0x0000ffffffffffff ; 0110, 0111
               dq 0x3c3c3c3c3c3c0000, 0x3f3f3f3f3f3f0000 ; 1000, 1001
               dq 0xfcfcfcfcfcfc0000, 0xffffffffffff0000 ; 1010, 1011
               dq 0x3c3c3c3c3c3c3c3c, 0x3f3f3f3f3f3f3f3f ; 1100, 1101
               dq 0xfcfcfcfcfcfcfcfc, 0xffffffffffffffff ; 1110, 1111
px_idx:        DUP4 18, 19, 20, 21, 26, 27, 28, 29, 34, 35, 36, 37, 42, 43, 44, 45
cdef_dirs:     DIRS -7,-14, 1, -6, 1, 2, 1, 10, 9, 18, 8, 17, 8, 16, 8, 15
gf_shr:        dq 0x0102040810204080, 0x0102040810204080 ; >> 0, >> 0
               dq 0x0204081020408000, 0x0408102040800000 ; >> 1, >> 2
               dq 0x0810204080000000, 0x1020408000000000 ; >> 3, >> 4
               dq 0x2040800000000000, 0x4080000000000000 ; >> 5, >> 6
pri_tap:       db 64, 64, 32, 32, 48, 48, 48, 48 ; left-shifted by 4
sec_tap:       db 32, 32, 16, 16
pd_268435568:  dd 268435568

SECTION .text

%if WIN64
DECLARE_REG_TMP 4
%else
DECLARE_REG_TMP 8
%endif

; lut:
; t0 t1 t2 t3 t4 t5 t6 t7
; T0 T1 T2 T3 T4 T5 T6 T7
; L0 L1 00 01 02 03 04 05
; L2 L3 10 11 12 13 14 15
; L4 L5 20 21 22 23 24 25
; L6 L7 30 31 32 33 34 35
; b0 b1 b2 b3 b4 b5 b6 b7
; B0 B1 B2 B3 B4 B5 B6 B7

INIT_ZMM avx512icl
cglobal cdef_filter_4x4_8bpc, 5, 8, 13, dst, stride, left, top, bot, \
                                        pri, sec, dir, damping, edge
%define base r7-edge_mask
    movq          xmm0, [dstq+strideq*0]
    movhps        xmm0, [dstq+strideq*1]
    lea           r7, [edge_mask]
    movq          xmm1, [topq+strideq*0-2]
    movhps        xmm1, [topq+strideq*1-2]
    mov           r6d, edgem
    vinserti32x4  ym0, ymm0, [leftq], 1
    lea           r2, [strideq*3]
    vinserti32x4  ym1, ymm1, [dstq+strideq*2], 1
    mova          m5, [base+lut_perm_4x4]
    vinserti32x4  m0, [dstq+r2], 2
    test          r6b, 0x08 ; avoid buffer overread
    jz .main
    vinserti32x4  m1, [botq+strideq*0-4], 2
    vinserti32x4  m0, [botq+strideq*1-4], 3
.main:
    movifnidn     prid, prim
    mov           t0d, dirm
    mova          m3, [base+px_idx]
    mov           r3d, dampingm
    vpermi2b      m5, m0, m1 ; lut
    vpbroadcastd  m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
    pxor          m7, m7
    lea           r3, [r7+r3*8] ; gf_shr + (damping - 30) * 8
    vpermb        m6, m3, m5 ; px
    cmp           r6d, 0x0f
    jne .mask_edges ; mask edges only if required
    test          prid, prid
    jz .sec_only
    vpaddd        m1, m3, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
    vpermb        m1, m1, m5 ; k0p0 k0p1 k1p0 k1p1
%macro CDEF_FILTER_4x4_PRI 0
    vpcmpub       k1, m6, m1, 6 ; px > pN
    psubb         m2, m1, m6
    lzcnt         r6d, prid
    vpsubb        m2{k1}, m6, m1 ; abs(diff)
    vpbroadcastb  m4, prid
    and           prid, 1
    vgf2p8affineqb m9, m2, [r3+r6*8] {1to8}, 0 ; abs(diff) >> shift
    movifnidn     secd, secm
    vpbroadcastd  m10, [base+pri_tap+priq*4]
    vpsubb        m10{k1}, m7, m10 ; apply_sign(pri_tap)
    psubusb       m4, m9 ; imax(0, pri_strength - (abs(diff) >> shift)))
    pminub        m2, m4
    vpdpbusd      m0, m2, m10 ; sum
%endmacro
    CDEF_FILTER_4x4_PRI
    test          secd, secd
    jz .end_no_clip
    call .sec
.end_clip:
    pminub        m4, m6, m1
    pmaxub        m1, m6
    pminub        m5, m2, m3
    pmaxub        m2, m3
    pminub        m4, m5
    pmaxub        m2, m1
    psrldq        m1, m4, 2
    psrldq        m3, m2, 2
    pminub        m1, m4
    vpcmpw        k1, m0, m7, 1
    vpshldd       m6, m0, 8
    pmaxub        m2, m3
    pslldq        m3, m1, 1
    psubw         m7, m0
    paddusw       m0, m6 ; clip >0xff
    vpsubusw      m0{k1}, m6, m7 ; clip <0x00
    pslldq        m4, m2, 1
    pminub        m1, m3
    pmaxub        m2, m4
    pmaxub        m0, m1
    pminub        m0, m2
    jmp .end
.sec_only:
    movifnidn     secd, secm
    call .sec
.end_no_clip:
    vpshldd       m6, m0, 8 ; (px << 8) + ((sum > -8) << 4)
    paddw         m0, m6    ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
.end:
    mova          xm1, [base+end_perm]
    vpermb        m0, m1, m0 ; output in bits 8-15 of each dword
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+r2       ], xm0, 3
    RET
.mask_edges_sec_only:
    movifnidn     secd, secm
    call .mask_edges_sec
    jmp .end_no_clip
ALIGN function_align
.mask_edges:
    vpbroadcastq  m8, [base+edge_mask+r6*8]
    test          prid, prid
    jz .mask_edges_sec_only
    vpaddd        m2, m3, [base+cdef_dirs+(t0+2)*4] {1to16}
    vpshufbitqmb  k1, m8, m2 ; index in-range
    mova          m1, m6
    vpermb        m1{k1}, m2, m5
    CDEF_FILTER_4x4_PRI
    test          secd, secd
    jz .end_no_clip
    call .mask_edges_sec
    jmp .end_clip
.mask_edges_sec:
    vpaddd        m4, m3, [base+cdef_dirs+(t0+4)*4] {1to16}
    vpaddd        m9, m3, [base+cdef_dirs+(t0+0)*4] {1to16}
    vpshufbitqmb  k1, m8, m4
    mova          m2, m6
    vpermb        m2{k1}, m4, m5
    vpshufbitqmb  k1, m8, m9
    mova          m3, m6
    vpermb        m3{k1}, m9, m5
    jmp .sec_main
ALIGN function_align
.sec:
    vpaddd        m2, m3, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
    vpaddd        m3, [base+cdef_dirs+(t0+0)*4] {1to16}     ; dir - 2
    vpermb        m2, m2, m5 ; k0s0 k0s1 k1s0 k1s1
    vpermb        m3, m3, m5 ; k0s2 k0s3 k1s2 k1s3
.sec_main:
    vpbroadcastd  m8, [base+sec_tap]
    vpcmpub       k1, m6, m2, 6
    psubb         m4, m2, m6
    vpbroadcastb  m12, secd
    lzcnt         secd, secd
    vpsubb        m4{k1}, m6, m2
    vpcmpub       k2, m6, m3, 6
    vpbroadcastq  m11, [r3+secq*8]
    gf2p8affineqb m10, m4, m11, 0
    psubb         m5, m3, m6
    mova          m9, m8
    vpsubb        m8{k1}, m7, m8
    psubusb       m10, m12, m10
    vpsubb        m5{k2}, m6, m3
    pminub        m4, m10
    vpdpbusd      m0, m4, m8
    gf2p8affineqb m11, m5, m11, 0
    vpsubb        m9{k2}, m7, m9
    psubusb       m12, m11
    pminub        m5, m12
    vpdpbusd      m0, m5, m9
    ret

DECLARE_REG_TMP 2, 7

; lut top                     lut bottom
; t0 t1 t2 t3 t4 t5 t6 t7     L4 L5 20 21 22 23 24 25
; T0 T1 T2 T3 T4 T5 T6 T7     L6 L7 30 31 32 33 34 35
; L0 L1 00 01 02 03 04 05     L8 L9 40 41 42 43 44 45
; L2 L3 10 11 12 13 14 15     La Lb 50 51 52 53 54 55
; L4 L5 20 21 22 23 24 25     Lc Ld 60 61 62 63 64 65
; L6 L7 30 31 32 33 34 35     Le Lf 70 71 72 73 74 75
; L8 L9 40 41 42 43 44 45     b0 b1 b2 b3 b4 b5 b6 b7
; La Lb 50 51 52 53 54 55     B0 B1 B2 B3 B4 B5 B6 B7

cglobal cdef_filter_4x8_8bpc, 5, 9, 22, dst, stride, left, top, bot, \
                                        pri, sec, dir, damping, edge
%define base r8-edge_mask
    vpbroadcastd  ym21, strided
    mov           r6d, edgem
    lea           r8, [edge_mask]
    movq          xm1, [topq+strideq*0-2]
    pmulld        ym21, [base+pd_01234567]
    kxnorb        k1, k1, k1
    movq          xm2, [topq+strideq*1-2]
    vpgatherdq    m0{k1}, [dstq+ym21] ; +0+1 +2+3 +4+5 +6+7
    mova          m14, [base+lut_perm_4x8a]
    movu          m15, [base+lut_perm_4x8b]
    test          r6b, 0x08 ; avoid buffer overread
    jz .main
    vinserti32x4  ym1, [botq+strideq*0-2], 1
    vinserti32x4  ym2, [botq+strideq*1-2], 1
.main:
    punpcklqdq    ym1, ym2
    vinserti32x4  m1, [leftq], 2 ; -2-1 +8+9 left ____
    movifnidn     prid, prim
    mov           t0d, dirm
    mova          m16, [base+px_idx]
    mov           r3d, dampingm
    vpermi2b      m14, m0, m1 ; lut top
    vpermi2b      m15, m0, m1 ; lut bottom
    vpbroadcastd  m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
    pxor          m20, m20
    lea           r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8
    vpermb        m2, m16, m14 ; pxt
    vpermb        m3, m16, m15 ; pxb
    mova          m1, m0
    cmp           r6b, 0x0f
    jne .mask_edges ; mask edges only if required
    test          prid, prid
    jz .sec_only
    vpaddd        m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
    vpermb        m4, m6, m14 ; pNt k0p0 k0p1 k1p0 k1p1
    vpermb        m5, m6, m15 ; pNb
%macro CDEF_FILTER_4x8_PRI 0
    vpcmpub       k1, m2, m4, 6 ; pxt > pNt
    vpcmpub       k2, m3, m5, 6 ; pxb > pNb
    psubb         m6, m4, m2
    psubb         m7, m5, m3
    lzcnt         r6d, prid
    vpsubb        m6{k1}, m2, m4 ; abs(diff_top)
    vpsubb        m7{k2}, m3, m5 ; abs(diff_bottom)
    vpbroadcastb  m13, prid
    vpbroadcastq  m9, [r3+r6*8]
    and           prid, 1
    vpbroadcastd  m11, [base+pri_tap+priq*4]
    vgf2p8affineqb m8, m6, m9, 0 ; abs(dt) >> shift
    vgf2p8affineqb m9, m7, m9, 0 ; abs(db) >> shift
    mova          m10, m11
    movifnidn     t1d, secm
    vpsubb        m10{k1}, m20, m11 ; apply_sign(pri_tap_top)
    vpsubb        m11{k2}, m20, m11 ; apply_sign(pri_tap_bottom)
    psubusb       m12, m13, m8 ; imax(0, pri_strength - (abs(dt) >> shift)))
    psubusb       m13, m13, m9 ; imax(0, pri_strength - (abs(db) >> shift)))
    pminub        m6, m12
    pminub        m7, m13
    vpdpbusd      m0, m6, m10 ; sum top
    vpdpbusd      m1, m7, m11 ; sum bottom
%endmacro
    CDEF_FILTER_4x8_PRI
    test          t1d, t1d ; sec
    jz .end_no_clip
    call .sec
.end_clip:
    pminub        m10, m4, m2
    pminub        m12, m6, m8
    pminub        m11, m5, m3
    pminub        m13, m7, m9
    pmaxub        m4, m2
    pmaxub        m6, m8
    pmaxub        m5, m3
    pmaxub        m7, m9
    pminub        m10, m12
    pminub        m11, m13
    pmaxub        m4, m6
    pmaxub        m5, m7
    mov           r2d, 0xAAAAAAAA
    kmovd         k1, r2d
    kxnorb        k2, k2, k2           ; hw   lw
    vpshrdd       m12, m0, m1, 16      ; m1lw m0hw
    vpshrdd       m6, m10, m11, 16     ; m11lw m10hw
    vpshrdd       m8, m4, m5, 16       ; m5lw m4hw
    vpblendmw     m7{k1}, m10, m11     ; m11hw m10lw
    vpblendmw     m9{k1}, m4, m5       ; m5hw m4lw
    vpblendmw     m4{k1}, m0, m12      ; m1lw m0lw
    vpblendmw     m5{k1}, m12, m1      ; m1hw m0hw
    vpshrdd       m2, m3, 16
    pminub        m6, m7
    pmaxub        m8, m9
    mova          ym14, [base+end_perm]
    vpcmpw        k1, m4, m20, 1
    vpshldw       m2, m5, 8
    pslldq        m7, m6, 1
    pslldq        m9, m8, 1
    psubw         m5, m20, m4
    paddusw       m0, m4, m2 ; clip >0xff
    pminub        m6, m7
    pmaxub        m8, m9
    psubusw       m0{k1}, m2, m5 ; clip <0x00
    pmaxub        m0, m6
    pminub        m0, m8
    vpermb        m0, m14, m0
    vpscatterdd [dstq+ym21]{k2}, ym0
    RET
.sec_only:
    movifnidn     t1d, secm
    call .sec
.end_no_clip:
    mova          ym4, [base+end_perm]
    kxnorb        k1, k1, k1
    vpshldd       m2, m0, 8 ; (px << 8) + ((sum > -8) << 4)
    vpshldd       m3, m1, 8
    paddw         m0, m2    ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
    paddw         m1, m3
    pslld         m0, 16
    vpshrdd       m0, m1, 16
    vpermb        m0, m4, m0 ; output in bits 8-15 of each word
    vpscatterdd [dstq+ym21]{k1}, ym0
    RET
.mask_edges_sec_only:
    movifnidn     t1d, secm
    call .mask_edges_sec
    jmp .end_no_clip
ALIGN function_align
.mask_edges:
    mov           t1d, r6d
    or            r6d, 8 ; top 4x4 has bottom
    or            t1d, 4 ; bottom 4x4 has top
    vpbroadcastq  m17, [base+edge_mask+r6*8]
    vpbroadcastq  m18, [base+edge_mask+t1*8]
    test          prid, prid
    jz .mask_edges_sec_only
    vpaddd        m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16}
    vpshufbitqmb  k1, m17, m6 ; index in-range
    vpshufbitqmb  k2, m18, m6
    mova          m4, m2
    mova          m5, m3
    vpermb        m4{k1}, m6, m14
    vpermb        m5{k2}, m6, m15
    CDEF_FILTER_4x8_PRI
    test          t1d, t1d
    jz .end_no_clip
    call .mask_edges_sec
    jmp .end_clip
.mask_edges_sec:
    vpaddd        m10, m16, [base+cdef_dirs+(t0+4)*4] {1to16}
    vpaddd        m11, m16, [base+cdef_dirs+(t0+0)*4] {1to16}
    vpshufbitqmb  k1, m17, m10
    vpshufbitqmb  k2, m18, m10
    vpshufbitqmb  k3, m17, m11
    vpshufbitqmb  k4, m18, m11
    mova          m6, m2
    mova          m7, m3
    mova          m8, m2
    mova          m9, m3
    vpermb        m6{k1}, m10, m14
    vpermb        m7{k2}, m10, m15
    vpermb        m8{k3}, m11, m14
    vpermb        m9{k4}, m11, m15
    jmp .sec_main
ALIGN function_align
.sec:
    vpaddd        m8, m16, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
    vpaddd        m9, m16, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
    vpermb        m6, m8, m14 ; pNt k0s0 k0s1 k1s0 k1s1
    vpermb        m7, m8, m15 ; pNb
    vpermb        m8, m9, m14 ; pNt k0s2 k0s3 k1s2 k1s3
    vpermb        m9, m9, m15 ; pNb
.sec_main:
    vpbroadcastb  m18, t1d
    lzcnt         t1d, t1d
    vpcmpub       k1, m2, m6, 6
    vpcmpub       k2, m3, m7, 6
    vpcmpub       k3, m2, m8, 6
    vpcmpub       k4, m3, m9, 6
    vpbroadcastq  m17, [r3+t1*8]
    psubb         m10, m6, m2
    psubb         m11, m7, m3
    psubb         m12, m8, m2
    psubb         m13, m9, m3
    vpsubb        m10{k1}, m2, m6 ; abs(dt0)
    vpsubb        m11{k2}, m3, m7 ; abs(db0)
    vpsubb        m12{k3}, m2, m8 ; abs(dt1)
    vpsubb        m13{k4}, m3, m9 ; abs(db1)
    vpbroadcastd  m19, [base+sec_tap]
    gf2p8affineqb m14, m10, m17, 0 ; abs(dt0) >> shift
    gf2p8affineqb m15, m11, m17, 0 ; abs(db0) >> shift
    gf2p8affineqb m16, m12, m17, 0 ; abs(dt1) >> shift
    gf2p8affineqb m17, m13, m17, 0 ; abs(db1) >> shift
    psubusb       m14, m18, m14 ; imax(0, sec_strength - (abs(dt0) >> shift)))
    psubusb       m15, m18, m15 ; imax(0, sec_strength - (abs(db0) >> shift)))
    psubusb       m16, m18, m16 ; imax(0, sec_strength - (abs(dt1) >> shift)))
    psubusb       m17, m18, m17 ; imax(0, sec_strength - (abs(db1) >> shift)))
    pminub        m10, m14
    pminub        m11, m15
    pminub        m12, m16
    pminub        m13, m17
    mova          m14, m19
    mova          m15, m19
    mova          m16, m19
    vpsubb        m14{k1}, m20, m19 ; apply_sign(sec_tap_top_0)
    vpsubb        m15{k2}, m20, m19 ; apply_sign(sec_tap_bottom_0)
    vpsubb        m16{k3}, m20, m19 ; apply_sign(sec_tap_top_1)
    vpsubb        m19{k4}, m20, m19 ; apply_sign(sec_tap_bottom_1)
    vpdpbusd      m0, m10, m14
    vpdpbusd      m1, m11, m15
    vpdpbusd      m0, m12, m16
    vpdpbusd      m1, m13, m19
    ret

; lut tl                      lut tr
; t0 t1 t2 t3 t4 t5 t6 t7     t4 t5 t6 t7 t8 t9 ta tb
; T0 T1 T2 T3 T4 T5 T6 T7     T4 T5 T6 T7 T8 T9 Ta Tb
; L0 L1 00 01 02 03 04 05     02 03 04 05 06 07 08 09
; L2 L3 10 11 12 13 14 15     12 13 14 15 16 17 18 19
; L4 L5 20 21 22 23 24 25     22 23 24 25 26 27 28 29
; L6 L7 30 31 32 33 34 35     32 33 34 35 36 37 38 39
; L8 L9 40 41 42 43 44 45     42 43 44 45 46 47 48 49
; La Lb 50 51 52 53 54 55     52 53 54 55 56 57 58 59
; lut bl                      lut br
; L4 L5 20 21 22 23 24 25     22 23 24 25 26 27 28 29
; L6 L7 30 31 32 33 34 35     32 33 34 35 36 37 38 39
; L8 L9 40 41 42 43 44 45     42 43 44 45 46 47 48 49
; La Lb 50 51 52 53 54 55     52 53 54 55 56 57 58 59
; Lc Ld 60 61 62 63 64 65     62 63 64 65 66 67 68 69
; Le Lf 70 71 72 73 74 75     72 73 74 75 76 77 78 79
; b0 b1 b2 b3 b4 b5 b6 b7     b4 b5 b6 b7 b8 b9 ba bb
; B0 B1 B2 B3 B4 B5 B6 B7     B4 B5 B6 B7 B8 B9 Ba Bb

cglobal cdef_filter_8x8_8bpc, 5, 11, 32, 4*64, dst, stride, left, top, bot, \
                                               pri, sec, dir, damping, edge
%define base r8-edge_mask
    movu          xm16, [dstq+strideq*0]
    pinsrd        xm16, [leftq+4*0], 3
    mov           r6d, edgem
    vinserti128   ym16, [dstq+strideq*1], 1
    lea           r10, [dstq+strideq*4]
    movu          xm17, [dstq+strideq*2]
    vinserti32x4  m16, [topq+strideq*0-2], 2
    lea           r9, [strideq*3]
    pinsrd        xm17, [leftq+4*1], 3
    vinserti32x4  m16, [topq+strideq*1-2], 3 ; 0 1 t T
    lea           r8, [edge_mask]
    vinserti128   ym17, [dstq+r9 ], 1
    vpbroadcastd  ym18, [leftq+4*2]
    vpblendd      ym17, ym18, 0x80
    movu          xm18, [r10 +strideq*2]
    vinserti32x4  m17, [r10 +strideq*0], 2
    pinsrd        xm18, [leftq+4*3], 3
    vinserti32x4  m17, [r10 +strideq*1], 3 ; 2 3 4 5
    vinserti128   ym18, [r10 +r9 ], 1
    test          r6b, 0x08 ; avoid buffer overread
    jz .main
    vinserti32x4  m18, [botq+strideq*0-2], 2
    vinserti32x4  m18, [botq+strideq*1-2], 3 ; 6 7 b B
.main:
    mova          m0, [base+lut_perm_8x8a]
    movu          m1, [base+lut_perm_8x8b]
    mova          m30, [base+px_idx]
    vpermb        m16, m0, m16
    movifnidn     prid, prim
    vpermb        m17, m1, m17
    mov           t0d, dirm
    vpermb        m18, m0, m18
    mov           r3d, dampingm
    vshufi32x4    m12, m16, m17, q2020 ; lut tl
    vshufi32x4    m13, m16, m17, q3131 ; lut tr
    vshufi32x4    m14, m17, m18, q0220 ; lut bl
    vshufi32x4    m15, m17, m18, q1331 ; lut br
    vpbroadcastd  m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
    pxor          m31, m31
    lea           r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8
    vpermb        m4, m30, m12 ; pxtl
    mova          m1, m0
    vpermb        m5, m30, m13 ; pxtr
    mova          m2, m0
    vpermb        m6, m30, m14 ; pxbl
    mova          m3, m0
    vpermb        m7, m30, m15 ; pxbr
    cmp           r6b, 0x0f
    jne .mask_edges ; mask edges only if required
    test          prid, prid
    jz .sec_only
    vpaddd        m11, m30, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
    vpermb        m8, m11, m12 ; pNtl k0p0 k0p1 k1p0 k1p1
    vpermb        m9, m11, m13 ; pNtr
    vpermb        m10, m11, m14 ; pNbl
    vpermb        m11, m11, m15 ; pNbr
%macro CDEF_FILTER_8x8_PRI 0
    vpcmpub       k1, m4, m8, 6 ; pxtl > pNtl
    vpcmpub       k2, m5, m9, 6 ; pxtr > pNtr
    vpcmpub       k3, m6, m10, 6 ; pxbl > pNbl
    vpcmpub       k4, m7, m11, 6 ; pxbr > pNbr
    psubb         m16, m8, m4
    psubb         m17, m9, m5
    psubb         m18, m10, m6
    psubb         m19, m11, m7
    lzcnt         r6d, prid
    vpsubb        m16{k1}, m4, m8 ; abs(diff_tl)
    vpsubb        m17{k2}, m5, m9 ; abs(diff_tr)
    vpsubb        m18{k3}, m6, m10 ; abs(diff_bl)
    vpsubb        m19{k4}, m7, m11 ; abs(diff_br)
    vpbroadcastq  m28, [r3+r6*8]
    vpbroadcastb  m29, prid
    and           prid, 1
    vpbroadcastd  m27, [base+pri_tap+priq*4]
    vgf2p8affineqb m20, m16, m28, 0 ; abs(dtl) >> shift
    vgf2p8affineqb m21, m17, m28, 0 ; abs(dtr) >> shift
    vgf2p8affineqb m22, m18, m28, 0 ; abs(dbl) >> shift
    vgf2p8affineqb m23, m19, m28, 0 ; abs(dbl) >> shift
    mova          m24, m27
    mova          m25, m27
    mova          m26, m27
    movifnidn     t1d, secm
    vpsubb        m24{k1}, m31, m27 ; apply_sign(pri_tap_tl)
    vpsubb        m25{k2}, m31, m27 ; apply_sign(pri_tap_tr)
    vpsubb        m26{k3}, m31, m27 ; apply_sign(pri_tap_tl)
    vpsubb        m27{k4}, m31, m27 ; apply_sign(pri_tap_tr)
    psubusb       m20, m29, m20 ; imax(0, pri_strength - (abs(dtl) >> shift)))
    psubusb       m21, m29, m21 ; imax(0, pri_strength - (abs(dtr) >> shift)))
    psubusb       m22, m29, m22 ; imax(0, pri_strength - (abs(dbl) >> shift)))
    psubusb       m23, m29, m23 ; imax(0, pri_strength - (abs(dbr) >> shift)))
    pminub        m16, m20
    pminub        m17, m21
    pminub        m18, m22
    pminub        m19, m23
    vpdpbusd      m0, m16, m24 ; sum tl
    vpdpbusd      m1, m17, m25 ; sum tr
    vpdpbusd      m2, m18, m26 ; sum bl
    vpdpbusd      m3, m19, m27 ; sum br
%endmacro
    CDEF_FILTER_8x8_PRI
    test          t1d, t1d ; sec
    jz .end_no_clip
    call .sec
.end_clip:
    pminub        m20, m8, m4
    pminub        m24, m12, m16
    pminub        m21, m9, m5
    pminub        m25, m13, m17
    pminub        m22, m10, m6
    pminub        m26, m14, m18
    pminub        m23, m11, m7
    pminub        m27, m15, m19
    pmaxub        m8, m4
    pmaxub        m12, m16
    pmaxub        m9, m5
    pmaxub        m13, m17
    pmaxub        m10, m6
    pmaxub        m14, m18
    pmaxub        m11, m7
    pmaxub        m15, m19
    pminub        m20, m24
    pminub        m21, m25
    pminub        m22, m26
    pminub        m23, m27
    pmaxub        m8, m12
    pmaxub        m9, m13
    pmaxub        m10, m14
    pmaxub        m11, m15
    mov           r2d, 0xAAAAAAAA
    kmovd         k1, r2d
    vpshrdd       m24, m0, m1, 16
    vpshrdd       m25, m2, m3, 16
    vpshrdd       m12, m20, m21, 16
    vpshrdd       m14, m22, m23, 16
    vpshrdd       m16, m8, m9, 16
    vpshrdd       m18, m10, m11, 16
    vpblendmw     m13{k1}, m20, m21
    vpblendmw     m15{k1}, m22, m23
    vpblendmw     m17{k1}, m8, m9
    vpblendmw     m19{k1}, m10, m11
    vpblendmw     m20{k1}, m0, m24
    vpblendmw     m21{k1}, m24, m1
    vpblendmw     m22{k1}, m2, m25
    vpblendmw     m23{k1}, m25, m3
    vpshrdd       m4, m5, 16
    vpshrdd       m6, m7, 16
    pminub        m12, m13
    pminub        m14, m15
    pmaxub        m16, m17
    pmaxub        m18, m19
    mova          m8, [base+end_perm_clip]
    vpcmpw        k2, m20, m31, 1
    vpcmpw        k3, m22, m31, 1
    vpshldw       m4, m21, 8
    vpshldw       m6, m23, 8
    kunpckdq      k1, k1, k1
    kxnorb        k4, k4, k4
    vpshrdw       m11, m12, m14, 8
    vpshrdw       m15, m16, m18, 8
    vpblendmb     m13{k1}, m12, m14
    vpblendmb     m17{k1}, m16, m18
    psubw         m21, m31, m20
    psubw         m23, m31, m22
    paddusw       m0, m20, m4 ; clip >0xff
    paddusw       m1, m22, m6
    pminub        m11, m13
    pmaxub        m15, m17
    psubusw       m0{k2}, m4, m21 ; clip <0x00
    psubusw       m1{k3}, m6, m23
    psrlw         m0, 8
    vmovdqu8      m0{k1}, m1
    pmaxub        m0, m11
    pminub        m0, m15
    vpermb        m0, m8, m0
    vextracti32x4 xm1, m0, 1
    vextracti32x4 xm2, m0, 2
    vextracti32x4 xm3, m0, 3
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*2], xm1
    movq   [r10 +strideq*0], xm2
    movq   [r10 +strideq*2], xm3
    movhps [dstq+strideq*1], xm0
    movhps [dstq+r9       ], xm1
    movhps [r10 +strideq*1], xm2
    movhps [r10 +r9       ], xm3
    RET
.sec_only:
    movifnidn     t1d, secm
    call .sec
.end_no_clip:
    mova          xm8, [base+end_perm]
    kxnorb        k1, k1, k1
    vpshldd       m4, m0, 8 ; (px << 8) + ((sum > -8) << 4)
    vpshldd       m5, m1, 8
    vpshldd       m6, m2, 8
    vpshldd       m7, m3, 8
    paddw         m0, m4    ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
    paddw         m1, m5
    paddw         m2, m6
    paddw         m3, m7
    vpermb        m0, m8, m0
    vpermb        m1, m8, m1
    vpermb        m2, m8, m2
    vpermb        m3, m8, m3
    punpckldq     m4, m0, m1
    punpckhdq     m0, m1
    punpckldq     m5, m2, m3
    punpckhdq     m2, m3
    movq   [dstq+strideq*0], xm4
    movq   [dstq+strideq*2], xm0
    movq   [r10 +strideq*0], xm5
    movq   [r10 +strideq*2], xm2
    movhps [dstq+strideq*1], xm4
    movhps [dstq+r9       ], xm0
    movhps [r10 +strideq*1], xm5
    movhps [r10 +r9       ], xm2
    RET
.mask_edges_sec_only:
    movifnidn     t1d, secm
    call .mask_edges_sec
    jmp .end_no_clip
ALIGN function_align
.mask_edges:
    mov           t0d, r6d
    mov           t1d, r6d
    or            t0d, 0xA ; top-left 4x4 has bottom and right
    or            t1d, 0x9 ; top-right 4x4 has bottom and left
    vpbroadcastq  m26, [base+edge_mask+t0*8]
    vpbroadcastq  m27, [base+edge_mask+t1*8]
    mov           t1d, r6d
    or            r6d, 0x6 ; bottom-left 4x4 has top and right
    or            t1d, 0x5 ; bottom-right 4x4 has top and left
    vpbroadcastq  m28, [base+edge_mask+r6*8]
    vpbroadcastq  m29, [base+edge_mask+t1*8]
    mov           t0d, dirm
    test          prid, prid
    jz .mask_edges_sec_only
    vpaddd        m20, m30, [base+cdef_dirs+(t0+2)*4] {1to16}
    vpshufbitqmb  k1, m26, m20 ; index in-range
    vpshufbitqmb  k2, m27, m20
    vpshufbitqmb  k3, m28, m20
    vpshufbitqmb  k4, m29, m20
    mova          m8, m4
    mova          m9, m5
    mova          m10, m6
    mova          m11, m7
    vpermb        m8{k1}, m20, m12
    vpermb        m9{k2}, m20, m13
    vpermb        m10{k3}, m20, m14
    vpermb        m11{k4}, m20, m15
    mova   [rsp+0x00], m26
    mova   [rsp+0x40], m27
    mova   [rsp+0x80], m28
    mova   [rsp+0xC0], m29
    CDEF_FILTER_8x8_PRI
    test          t1d, t1d
    jz .end_no_clip
    mova          m26, [rsp+0x00]
    mova          m27, [rsp+0x40]
    mova          m28, [rsp+0x80]
    mova          m29, [rsp+0xC0]
    call .mask_edges_sec
    jmp .end_clip
.mask_edges_sec:
    vpaddd        m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16}
    vpaddd        m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16}
    vpshufbitqmb  k1, m26, m20
    vpshufbitqmb  k2, m27, m20
    vpshufbitqmb  k3, m28, m20
    vpshufbitqmb  k4, m29, m20
    mova          m16, m4
    mova          m17, m5
    mova          m18, m6
    mova          m19, m7
    vpermb        m16{k1}, m20, m12
    vpermb        m17{k2}, m20, m13
    vpermb        m18{k3}, m20, m14
    vpermb        m19{k4}, m20, m15
    vpshufbitqmb  k1, m26, m21
    vpshufbitqmb  k2, m27, m21
    vpshufbitqmb  k3, m28, m21
    vpshufbitqmb  k4, m29, m21
    vpermb        m12, m21, m12
    vpermb        m13, m21, m13
    vpermb        m14, m21, m14
    vpermb        m15, m21, m15
    vpblendmb     m12{k1}, m4, m12
    vpblendmb     m13{k2}, m5, m13
    vpblendmb     m14{k3}, m6, m14
    vpblendmb     m15{k4}, m7, m15
    jmp .sec_main
ALIGN function_align
.sec:
    vpaddd        m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
    vpaddd        m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
    vpermb        m16, m20, m12 ; pNtl k0s0 k0s1 k1s0 k1s1
    vpermb        m17, m20, m13 ; pNtr
    vpermb        m18, m20, m14 ; pNbl
    vpermb        m19, m20, m15 ; pNbr
    vpermb        m12, m21, m12 ; pNtl k0s2 k0s3 k1s2 k1s3
    vpermb        m13, m21, m13 ; pNtr
    vpermb        m14, m21, m14 ; pNbl
    vpermb        m15, m21, m15 ; pNbr
.sec_main:
%macro CDEF_FILTER_8x8_SEC 4-5 0 ; load constants
    vpcmpub       k1, m4, %1, 6
    vpcmpub       k2, m5, %2, 6
    vpcmpub       k3, m6, %3, 6
    vpcmpub       k4, m7, %4, 6
    psubb         m20, %1, m4
    psubb         m21, %2, m5
    psubb         m22, %3, m6
    psubb         m23, %4, m7
%if %5
    vpbroadcastb  m28, t1d
    lzcnt         t1d, t1d
    vpbroadcastq  m29, [r3+t1*8]
%endif
    vpsubb        m20{k1}, m4, %1
    vpsubb        m21{k2}, m5, %2
    vpsubb        m22{k3}, m6, %3
    vpsubb        m23{k4}, m7, %4
    gf2p8affineqb m24, m20, m29, 0
    gf2p8affineqb m25, m21, m29, 0
    gf2p8affineqb m26, m22, m29, 0
    gf2p8affineqb m27, m23, m29, 0
%if %5
    vpbroadcastd  m30, [base+sec_tap]
%endif
    psubusb       m24, m28, m24
    psubusb       m25, m28, m25
    psubusb       m26, m28, m26
    psubusb       m27, m28, m27
    pminub        m20, m24
    pminub        m21, m25
    pminub        m22, m26
    pminub        m23, m27
    mova          m24, m30
    mova          m25, m30
    mova          m26, m30
    mova          m27, m30
    vpsubb        m24{k1}, m31, m30
    vpsubb        m25{k2}, m31, m30
    vpsubb        m26{k3}, m31, m30
    vpsubb        m27{k4}, m31, m30
    vpdpbusd      m0, m20, m24
    vpdpbusd      m1, m21, m25
    vpdpbusd      m2, m22, m26
    vpdpbusd      m3, m23, m27
%endmacro
    CDEF_FILTER_8x8_SEC m16, m17, m18, m19, 1
    CDEF_FILTER_8x8_SEC m12, m13, m14, m15
    ret

%endif ; ARCH_X86_64