; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 32

; pshufb control mask: within each 128-bit half, replicates source bytes
; 0, 4, 8 and 12 four times each.
; NOTE(review): the label reads "1/5/9/13" but the indices stored here are
; 0/4/8/12 - confirm which was intended before renaming.
pb_4x1_4x5_4x9_4x13: times 2 db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12

; Byte pairs repeated 16 times (32 bytes per table).
pb_7_1:  times 16 db 7, 1
pb_3_1:  times 16 db 3, 1
pb_2_1:  times 16 db 2, 1
pb_m1_0: times 16 db -1, 0
pb_m1_1: times 16 db -1, 1
pb_m1_2: times 16 db -1, 2

; A single byte value repeated 32 times.
pb_1:    times 32 db 1
pb_2:    times 32 db 2
pb_3:    times 32 db 3
pb_4:    times 32 db 4
pb_16:   times 32 db 16
pb_63:   times 32 db 63
pb_64:   times 32 db 64
pb_128:  times 32 db 0x80
pb_129:  times 32 db 0x81
pb_240:  times 32 db 0xf0
pb_248:  times 32 db 0xf8
pb_254:  times 32 db 0xfe

; A single word value repeated 16 times.
pw_2048: times 16 dw 2048
pw_4096: times 16 dw 4096

; Eight dwords, each with a different single bit set (1 << lane index).
; Despite the pb_ prefix this table holds dwords, not bytes.
pb_mask: dd 1, 2, 4, 8, 16, 32, 64, 128

SECTION .text

; ABSSUB dst, a, b, tmp
;   dst = |a - b| for every unsigned byte.
;   Both saturating differences are formed; the one that would have gone
;   negative saturates to zero, so OR-ing them leaves the magnitude.
;   tmp is overwritten. The order of the two subtractions is significant
;   if operands alias each other, so keep it as is.
%macro ABSSUB 4 ; dst, a, b, tmp
    psubusb     %1, %2, %3
    psubusb     %4, %3, %2
    por         %1, %4
%endmacro

; WRITE_4x4_ROWS reg
;   Private helper for TRANSPOSE_16x4_AND_WRITE_4x32.
;   Stores dwords 0..3 of xm<reg> to four successive rows, each at byte
;   offset -2 from the row start, then advances dstq by four rows.
;   stride3q is supplied by the enclosing function (presumably 3*strideq -
;   confirm against the function prologue).
%macro WRITE_4x4_ROWS 1 ; register index
    movd        [dstq+strideq*0-2], xm%1
    pextrd      [dstq+strideq*1-2], xm%1, 1
    pextrd      [dstq+strideq*2-2], xm%1, 2
    pextrd      [dstq+stride3q-2], xm%1, 3
    lea         dstq, [dstq+strideq*4]
%endmacro

; TRANSPOSE_16x4_AND_WRITE_4x32 a, b, c, d, tmp
;   a..d are the register indices of four input vectors; tmp is the index
;   of a scratch register (written before it is read).
;   The byte/word interleaves regroup the data so that every dword holds
;   one byte from each of a, b, c and d. Those dwords are then stored one
;   per row: the low 128-bit halves cover the first 16 rows, the high
;   halves (extracted afterwards) the following 16.
;   Side effects: all five registers are overwritten and dstq ends up
;   32 rows further down.
%macro TRANSPOSE_16x4_AND_WRITE_4x32 5
    ; transpose 16x4 (independently within each 128-bit half)
    punpcklbw   m%5, m%1, m%2
    punpckhbw   m%1, m%2
    punpcklbw   m%2, m%3, m%4
    punpckhbw   m%3, m%4
    punpcklwd   m%4, m%5, m%2
    punpckhwd   m%5, m%2
    punpcklwd   m%2, m%1, m%3
    punpckhwd   m%1, m%3

    ; low halves: four groups of four rows
    WRITE_4x4_ROWS %4
    WRITE_4x4_ROWS %5
    WRITE_4x4_ROWS %2
    WRITE_4x4_ROWS %1

    ; move the high halves down so the same dword stores can be reused
    vextracti128 xm%4, m%4, 1
    vextracti128 xm%5, m%5, 1
    vextracti128 xm%2, m%2, 1
    vextracti128 xm%1, m%1, 1

    ; high halves: the next four groups of four rows
    WRITE_4x4_ROWS %4
    WRITE_4x4_ROWS %5
    WRITE_4x4_ROWS %2
    WRITE_4x4_ROWS %1
%endmacro

; TRANSPOSE_16X16B in_load_15_from_mem, out_store_0_in_mem, mem
;   Byte transpose of the sixteen vectors m0..m15 using four rounds of
;   unpack (byte, word, dword, qword). All sixteen registers are live, so
;   the memory operand %3 serves as the one spill slot that every round
;   rotates a value through.
;     %1 == 0 : input 15 is in m15 and is spilled to %3 on entry.
;     %1 != 0 : input 15 is expected to be in %3 already; m15 is not read.
;     %2 == 0 : output 0 is reloaded from %3 so all outputs end in registers.
;     %2 != 0 : output 0 is left in %3; m0 does not hold it on exit.
;   The closing SWAPs only rename registers so that output N is called mN.
;   NOTE(review): the unpack instructions work per 128-bit half, so with
;   256-bit registers this is presumably two side-by-side 16x16 transposes.
%macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem
%if %1 == 0
    mova        %3, m15
%endif

    ; round 1 (bytes): inputs in m0-14, input 15 in %3
    punpcklbw   m15, m0, m1
    punpckhbw   m0, m1
    punpcklbw   m1, m2, m3
    punpckhbw   m2, m3
    punpcklbw   m3, m4, m5
    punpckhbw   m4, m5
    punpcklbw   m5, m6, m7
    punpckhbw   m6, m7
    punpcklbw   m7, m8, m9
    punpckhbw   m8, m9
    punpcklbw   m9, m10, m11
    punpckhbw   m10, m11
    punpcklbw   m11, m12, m13
    punpckhbw   m12, m13
    mova        m13, %3             ; fetch input 15 ...
    mova        %3, m12             ; ... and park the 12/13 high result
    punpcklbw   m12, m14, m13
    punpckhbw   m13, m14, m13

    ; round 2 (words)
    ; data now in m15,0,1,2,3,4,5,6,7,8,9,10,11,[%3],12,13
    punpcklwd   m14, m15, m1
    punpckhwd   m15, m1
    punpcklwd   m1, m0, m2
    punpckhwd   m0, m2
    punpcklwd   m2, m3, m5
    punpckhwd   m3, m5
    punpcklwd   m5, m4, m6
    punpckhwd   m4, m6
    punpcklwd   m6, m7, m9
    punpckhwd   m7, m9
    punpcklwd   m9, m8, m10
    punpckhwd   m8, m10
    punpcklwd   m10, m11, m12
    punpckhwd   m11, m12
    mova        m12, %3
    mova        %3, m11
    punpcklwd   m11, m12, m13
    punpckhwd   m12, m13

    ; round 3 (dwords)
    ; data now in m14,15,1,0,2,3,5,4,6,7,9,8,10,[%3],11,12
    punpckldq   m13, m14, m2
    punpckhdq   m14, m2
    punpckldq   m2, m15, m3
    punpckhdq   m15, m3
    punpckldq   m3, m1, m5
    punpckhdq   m1, m5
    punpckldq   m5, m0, m4
    punpckhdq   m0, m4
    punpckldq   m4, m6, m10
    punpckhdq   m6, m10
    punpckldq   m10, m9, m11
    punpckhdq   m9, m11
    punpckldq   m11, m8, m12
    punpckhdq   m8, m12
    mova        m12, %3
    mova        %3, m8
    punpckldq   m8, m7, m12
    punpckhdq   m7, m12

    ; round 4 (qwords)
    ; data now in m13,14,2,15,3,1,5,0,4,6,8,7,10,9,11,[%3]
    punpcklqdq  m12, m13, m4
    punpckhqdq  m13, m4
    punpcklqdq  m4, m14, m6
    punpckhqdq  m14, m6
    punpcklqdq  m6, m2, m8
    punpckhqdq  m2, m8
    punpcklqdq  m8, m15, m7
    punpckhqdq  m15, m7
    punpcklqdq  m7, m3, m10
    punpckhqdq  m3, m10
    punpcklqdq  m10, m1, m9
    punpckhqdq  m1, m9
    punpcklqdq  m9, m5, m11
    punpckhqdq  m5, m11
    mova        m11, %3
    mova        %3, m12             ; output 0 goes to the spill slot
    punpcklqdq  m12, m0, m11
    punpckhqdq  m0, m11
%if %2 == 0
    mova        m11, %3             ; bring output 0 back into a register
%endif

    ; outputs 0..15 sit in m11,13,4,14,6,2,8,15,7,3,10,1,9,5,12,0;
    ; rename them to m0..m15
    SWAP         0, 11,  1, 13,  5,  2,  4,  6,  8,  7, 15
    SWAP         3, 14, 12,  9
%endmacro

%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
    ; load data
%ifidn %2, v
%if %1 == 4
    lea         tmpq, [dstq+mstrideq*2]
    mova        m3, [tmpq+strideq*0]        ; p1
    mova        m4, [tmpq+strideq*1]        ; p0
    mova        m5, [tmpq+strideq*2]        ; q0
    mova        m6, [tmpq+stride3q]         ; q1
%else
    ; load 6-8 pixels, remainder (for wd=16) will be read inline
    lea         tmpq, [dstq+mstrideq*4]
%if %1 != 6
    mova        m12, [tmpq+strideq*0]
%endif
    mova        m13, [tmpq+strideq*1]
    mova        m3, [tmpq+strideq*2]
    mova        m4, [tmpq+stride3q]
    mova        m5, [dstq+strideq*0]
    mova        m6, [dstq+strideq*1]
    mova        m14, [dstq+strideq*2]
%if %1 != 6
    mova        m15, [dstq+stride3q]
%endif
%endif
%else
    ; load lines
%if %1 == 4
    movd        xm3, [dstq+strideq*0-2]
    movd        xm4, [dstq+strideq*1-2]
    movd        xm5, [dstq+strideq*2-2]
    movd        xm6, [dstq+stride3q -2]
    lea         tmpq, [dstq+strideq*4]
    pinsrd      xm3, [tmpq+strideq*0-2], 2
    pinsrd      xm4, [tmpq+strideq*1-2], 2
    pinsrd      xm5, [tmpq+strideq*2-2], 2
    pinsrd      xm6, [tmpq+stride3q -2], 2
    lea         tmpq,
[tmpq+strideq*4] 257 pinsrd xm3, [tmpq+strideq*0-2], 1 258 pinsrd xm4, [tmpq+strideq*1-2], 1 259 pinsrd xm5, [tmpq+strideq*2-2], 1 260 pinsrd xm6, [tmpq+stride3q -2], 1 261 lea tmpq, [tmpq+strideq*4] 262 pinsrd xm3, [tmpq+strideq*0-2], 3 263 pinsrd xm4, [tmpq+strideq*1-2], 3 264 pinsrd xm5, [tmpq+strideq*2-2], 3 265 pinsrd xm6, [tmpq+stride3q -2], 3 266 lea tmpq, [tmpq+strideq*4] 267 movd xm12, [tmpq+strideq*0-2] 268 movd xm13, [tmpq+strideq*1-2] 269 movd xm14, [tmpq+strideq*2-2] 270 movd xm15, [tmpq+stride3q -2] 271 lea tmpq, [tmpq+strideq*4] 272 pinsrd xm12, [tmpq+strideq*0-2], 2 273 pinsrd xm13, [tmpq+strideq*1-2], 2 274 pinsrd xm14, [tmpq+strideq*2-2], 2 275 pinsrd xm15, [tmpq+stride3q -2], 2 276 lea tmpq, [tmpq+strideq*4] 277 pinsrd xm12, [tmpq+strideq*0-2], 1 278 pinsrd xm13, [tmpq+strideq*1-2], 1 279 pinsrd xm14, [tmpq+strideq*2-2], 1 280 pinsrd xm15, [tmpq+stride3q -2], 1 281 lea tmpq, [tmpq+strideq*4] 282 pinsrd xm12, [tmpq+strideq*0-2], 3 283 pinsrd xm13, [tmpq+strideq*1-2], 3 284 pinsrd xm14, [tmpq+strideq*2-2], 3 285 pinsrd xm15, [tmpq+stride3q -2], 3 286 vinserti128 m3, xm12, 1 287 vinserti128 m4, xm13, 1 288 vinserti128 m5, xm14, 1 289 vinserti128 m6, xm15, 1 290 291 ; transpose 4x16 292 ; xm3: A-D0,A-D8,A-D4,A-D12 293 ; xm4: A-D1,A-D9,A-D5,A-D13 294 ; xm5: A-D2,A-D10,A-D6,A-D14 295 ; xm6: A-D3,A-D11,A-D7,A-D15 296 punpcklbw m7, m3, m4 297 punpckhbw m3, m4 298 punpcklbw m4, m5, m6 299 punpckhbw m5, m6 300 ; xm7: A0-1,B0-1,C0-1,D0-1,A8-9,B8-9,C8-9,D8-9 301 ; xm3: A4-5,B4-5,C4-5,D4-5,A12-13,B12-13,C12-13,D12-13 302 ; xm4: A2-3,B2-3,C2-3,D2-3,A10-11,B10-11,C10-11,D10-11 303 ; xm5: A6-7,B6-7,C6-7,D6-7,A14-15,B14-15,C14-15,D14-15 304 punpcklwd m6, m7, m4 305 punpckhwd m7, m4 306 punpcklwd m4, m3, m5 307 punpckhwd m3, m5 308 ; xm6: A0-3,B0-3,C0-3,D0-3 309 ; xm7: A8-11,B8-11,C8-11,D8-11 310 ; xm4: A4-7,B4-7,C4-7,D4-7 311 ; xm3: A12-15,B12-15,C12-15,D12-15 312 punpckldq m5, m6, m4 313 punpckhdq m6, m4 314 punpckldq m4, m7, m3 315 punpckhdq m7, m3 316 ; xm5: 
A0-7,B0-7 317 ; xm6: C0-7,D0-7 318 ; xm4: A8-15,B8-15 319 ; xm7: C8-15,D8-15 320 punpcklqdq m3, m5, m4 321 punpckhqdq m4, m5, m4 322 punpcklqdq m5, m6, m7 323 punpckhqdq m6, m7 324 ; xm3: A0-15 325 ; xm5: B0-15 326 ; xm4: C0-15 327 ; xm6: D0-15 328%elif %1 == 6 || %1 == 8 329 movq xm3, [dstq+strideq*0-%1/2] 330 movq xm4, [dstq+strideq*1-%1/2] 331 movq xm5, [dstq+strideq*2-%1/2] 332 movq xm6, [dstq+stride3q -%1/2] 333 lea tmpq, [dstq+strideq*8] 334 movhps xm3, [tmpq+strideq*0-%1/2] 335 movhps xm4, [tmpq+strideq*1-%1/2] 336 movhps xm5, [tmpq+strideq*2-%1/2] 337 movhps xm6, [tmpq+stride3q -%1/2] 338 lea tmpq, [tmpq+strideq*8] 339 movq xm7, [tmpq+strideq*0-%1/2] 340 movq xm8, [tmpq+strideq*1-%1/2] 341 movq xm9, [tmpq+strideq*2-%1/2] 342 movq xm11, [tmpq+stride3q -%1/2] 343 lea tmpq, [tmpq+strideq*8] 344 movhps xm7, [tmpq+strideq*0-%1/2] 345 movhps xm8, [tmpq+strideq*1-%1/2] 346 movhps xm9, [tmpq+strideq*2-%1/2] 347 movhps xm11, [tmpq+stride3q -%1/2] 348 vinserti128 m3, xm7, 1 349 vinserti128 m4, xm8, 1 350 vinserti128 m5, xm9, 1 351 vinserti128 m6, xm11, 1 352 lea tmpq, [dstq+strideq*4] 353 movq xm12, [tmpq+strideq*0-%1/2] 354 movq xm13, [tmpq+strideq*1-%1/2] 355 movq xm14, [tmpq+strideq*2-%1/2] 356 movq xm15, [tmpq+stride3q -%1/2] 357 lea tmpq, [tmpq+strideq*8] 358 movhps xm12, [tmpq+strideq*0-%1/2] 359 movhps xm13, [tmpq+strideq*1-%1/2] 360 movhps xm14, [tmpq+strideq*2-%1/2] 361 movhps xm15, [tmpq+stride3q -%1/2] 362 lea tmpq, [tmpq+strideq*8] 363 movq xm7, [tmpq+strideq*0-%1/2] 364 movq xm8, [tmpq+strideq*1-%1/2] 365 movq xm9, [tmpq+strideq*2-%1/2] 366 movq xm11, [tmpq+stride3q -%1/2] 367 lea tmpq, [tmpq+strideq*8] 368 movhps xm7, [tmpq+strideq*0-%1/2] 369 movhps xm8, [tmpq+strideq*1-%1/2] 370 movhps xm9, [tmpq+strideq*2-%1/2] 371 movhps xm11, [tmpq+stride3q -%1/2] 372 vinserti128 m12, xm7, 1 373 vinserti128 m13, xm8, 1 374 vinserti128 m14, xm9, 1 375 vinserti128 m15, xm11, 1 376 377 ; transpose 8x16 378 ; xm3: A-H0,A-H8 379 ; xm4: A-H1,A-H9 380 ; xm5: A-H2,A-H10 
381 ; xm6: A-H3,A-H11 382 ; xm12: A-H4,A-H12 383 ; xm13: A-H5,A-H13 384 ; xm14: A-H6,A-H14 385 ; xm15: A-H7,A-H15 386 punpcklbw m7, m3, m4 387 punpckhbw m3, m4 388 punpcklbw m4, m5, m6 389 punpckhbw m5, m6 390 punpcklbw m6, m12, m13 391 punpckhbw m12, m13 392 punpcklbw m13, m14, m15 393 punpckhbw m14, m15 394 ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1 395 ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9 396 ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3 397 ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11 398 ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5 399 ; xm12: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13 400 ; xm13: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7 401 ; xm14: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15 402 punpcklwd m15, m7, m4 403 punpckhwd m7, m4 404 punpcklwd m4, m3, m5 405 punpckhwd m3, m5 406 punpcklwd m5, m6, m13 407 punpckhwd m6, m13 408 punpcklwd m13, m12, m14 409 punpckhwd m12, m14 410 ; xm15: A0-3,B0-3,C0-3,D0-3 411 ; xm7: E0-3,F0-3,G0-3,H0-3 412 ; xm4: A8-11,B8-11,C8-11,D8-11 413 ; xm3: E8-11,F8-11,G8-11,H8-11 414 ; xm5: A4-7,B4-7,C4-7,D4-7 415 ; xm6: E4-7,F4-7,G4-7,H4-7 416 ; xm13: A12-15,B12-15,C12-15,D12-15 417 ; xm12: E12-15,F12-15,G12-15,H12-15 418 punpckldq m14, m15, m5 419 punpckhdq m15, m5 420 punpckldq m5, m7, m6 421%if %1 != 6 422 punpckhdq m7, m6 423%endif 424 punpckldq m6, m4, m13 425 punpckhdq m4, m13 426 punpckldq m13, m3, m12 427%if %1 != 6 428 punpckhdq m12, m3, m12 429%endif 430 ; xm14: A0-7,B0-7 431 ; xm15: C0-7,D0-7 432 ; xm5: E0-7,F0-7 433 ; xm7: G0-7,H0-7 434 ; xm6: A8-15,B8-15 435 ; xm4: C8-15,D8-15 436 ; xm13: E8-15,F8-15 437 ; xm12: G8-15,H8-15 438 punpcklqdq m3, m14, m6 439 punpckhqdq m14, m6 440 punpckhqdq m6, m15, m4 441 punpcklqdq m15, m4 442 punpcklqdq m4, m5, m13 443 punpckhqdq m13, m5, m13 444%if %1 == 8 445 punpcklqdq m5, m7, m12 446 punpckhqdq m12, m7, m12 447 ; xm3: A0-15 448 ; xm14: B0-15 449 ; xm15: C0-15 450 ; xm6: D0-15 451 ; xm4: E0-15 452 ; xm13: F0-15 453 ; 
xm5: G0-15 454 ; xm12: H0-15 455 SWAP 12, 3, 15 456 SWAP 13, 14, 5, 4, 6 457 ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,15 458%else 459 SWAP 13, 3, 14 460 SWAP 6, 4, 15, 5 461 ; 3,14,15,6,4,13 -> 13,3,4,5,6,14 462%endif 463%else 464 ; load and 16x16 transpose. We only use 14 pixels but we'll need the 465 ; remainder at the end for the second transpose 466 movu xm0, [dstq+strideq*0-8] 467 movu xm1, [dstq+strideq*1-8] 468 movu xm2, [dstq+strideq*2-8] 469 movu xm3, [dstq+stride3q -8] 470 lea tmpq, [dstq+strideq*4] 471 movu xm4, [tmpq+strideq*0-8] 472 movu xm5, [tmpq+strideq*1-8] 473 movu xm6, [tmpq+strideq*2-8] 474 movu xm7, [tmpq+stride3q -8] 475 lea tmpq, [tmpq+strideq*4] 476 movu xm8, [tmpq+strideq*0-8] 477 movu xm9, [tmpq+strideq*1-8] 478 movu xm10, [tmpq+strideq*2-8] 479 movu xm11, [tmpq+stride3q -8] 480 lea tmpq, [tmpq+strideq*4] 481 movu xm12, [tmpq+strideq*0-8] 482 movu xm13, [tmpq+strideq*1-8] 483 movu xm14, [tmpq+strideq*2-8] 484 movu xm15, [tmpq+stride3q -8] 485 lea tmpq, [tmpq+strideq*4] 486 vinserti128 m0, [tmpq+strideq*0-8], 1 487 vinserti128 m1, [tmpq+strideq*1-8], 1 488 vinserti128 m2, [tmpq+strideq*2-8], 1 489 vinserti128 m3, [tmpq+stride3q -8], 1 490 lea tmpq, [tmpq+strideq*4] 491 vinserti128 m4, [tmpq+strideq*0-8], 1 492 vinserti128 m5, [tmpq+strideq*1-8], 1 493 vinserti128 m6, [tmpq+strideq*2-8], 1 494 vinserti128 m7, [tmpq+stride3q -8], 1 495 lea tmpq, [tmpq+strideq*4] 496 vinserti128 m8, [tmpq+strideq*0-8], 1 497 vinserti128 m9, [tmpq+strideq*1-8], 1 498 vinserti128 m10, [tmpq+strideq*2-8], 1 499 vinserti128 m11, [tmpq+stride3q -8], 1 500 lea tmpq, [tmpq+strideq*4] 501 vinserti128 m12, [tmpq+strideq*0-8], 1 502 vinserti128 m13, [tmpq+strideq*1-8], 1 503 vinserti128 m14, [tmpq+strideq*2-8], 1 504 vinserti128 m15, [tmpq+stride3q -8], 1 505 506 TRANSPOSE_16X16B 0, 1, [rsp+11*32] 507 mova [rsp+12*32], m1 508 mova [rsp+13*32], m2 509 mova [rsp+14*32], m3 510 mova [rsp+15*32], m12 511 mova [rsp+16*32], m13 512 mova [rsp+17*32], m14 513 mova [rsp+18*32], 
m15 514 ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15 515 SWAP 12, 4, 7 516 SWAP 13, 5, 8 517 SWAP 3, 6, 9 518 SWAP 10, 14 519 SWAP 11, 15 520%endif 521%endif 522 523 ; load L/E/I/H 524%ifidn %2, v 525 movu m1, [lq] 526 movu m0, [lq+l_strideq] 527%else 528 movq xm1, [lq] 529 movq xm2, [lq+l_strideq*2] 530 movhps xm1, [lq+l_strideq] 531 movhps xm2, [lq+l_stride3q] 532 lea lq, [lq+l_strideq*4] 533 movq xm10, [lq] 534 movq xm0, [lq+l_strideq*2] 535 movhps xm10, [lq+l_strideq] 536 movhps xm0, [lq+l_stride3q] 537 lea lq, [lq+l_strideq*4] 538 vinserti128 m1, xm10, 1 539 vinserti128 m2, xm0, 1 540 shufps m0, m1, m2, q3131 541 shufps m1, m2, q2020 542%endif 543 pxor m2, m2 544 pcmpeqb m10, m2, m0 545 pand m1, m10 546 por m0, m1 ; l[x][] ? l[x][] : l[x-stride][] 547 pshufb m0, [pb_4x1_4x5_4x9_4x13] ; l[x][1] 548 pcmpeqb m10, m2, m0 ; !L 549 psrlq m2, m0, [lutq+128] 550 pand m2, [pb_63] 551 vpbroadcastb m1, [lutq+136] 552 pminub m2, m1 553 pmaxub m2, [pb_1] ; I 554 pand m1, m0, [pb_240] 555 psrlq m1, 4 ; H 556 paddb m0, [pb_2] 557 paddb m0, m0 558 paddb m0, m2 ; E 559 pxor m1, [pb_128] 560 pxor m2, [pb_128] 561 pxor m0, [pb_128] 562 563 ABSSUB m8, m3, m4, m9 ; abs(p1-p0) 564 pmaxub m8, m10 565 ABSSUB m9, m5, m6, m10 ; abs(q1-q0) 566 pmaxub m8, m9 567%if %1 == 4 568 pxor m8, [pb_128] 569 pcmpgtb m7, m8, m1 ; hev 570%else 571 pxor m7, m8, [pb_128] 572 pcmpgtb m7, m1 ; hev 573 574%if %1 == 6 575 ABSSUB m9, m13, m4, m10 ; abs(p2-p0) 576 pmaxub m9, m8 577%else 578 ABSSUB m9, m12, m4, m10 ; abs(p3-p0) 579 pmaxub m9, m8 580 ABSSUB m10, m13, m4, m11 ; abs(p2-p0) 581 pmaxub m9, m10 582%endif 583 ABSSUB m10, m5, m14, m11 ; abs(q2-q0) 584 pmaxub m9, m10 585%if %1 != 6 586 ABSSUB m10, m5, m15, m11 ; abs(q3-q0) 587 pmaxub m9, m10 588%endif 589 pxor m9, [pb_128] 590 pcmpgtb m9, [pb_129] ; !flat8in 591 592%if %1 == 6 593 ABSSUB m10, m13, m3, m1 ; abs(p2-p1) 594%else 595 ABSSUB m10, m12, m13, m11 ; abs(p3-p2) 596 ABSSUB m11, m13, m3, m1 ; abs(p2-p1) 597 pmaxub m10, m11 598 ABSSUB m11, m14, 
m15, m1 ; abs(q3-q2) 599 pmaxub m10, m11 600%endif 601 ABSSUB m11, m14, m6, m1 ; abs(q2-q1) 602 pmaxub m10, m11 603%if %1 == 16 604 vpbroadcastd m11, [maskq+8] 605 vpbroadcastd m1, [maskq+4] 606 por m11, m1 607 pand m11, [pb_mask] 608 pcmpeqd m11, [pb_mask] 609 pand m10, m11 610%else 611 vpbroadcastd m11, [maskq+4] 612 pand m11, [pb_mask] 613 pcmpeqd m11, [pb_mask] 614 pand m10, m11 ; only apply fm-wide to wd>4 blocks 615%endif 616 pmaxub m8, m10 617 618 pxor m8, [pb_128] 619%endif 620 pcmpgtb m8, m2 621 622 ABSSUB m10, m3, m6, m11 ; abs(p1-q1) 623 ABSSUB m11, m4, m5, m2 ; abs(p0-q0) 624 paddusb m11, m11 625 pand m10, [pb_254] 626 psrlq m10, 1 627 paddusb m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1) 628 pxor m10, [pb_128] 629 pcmpgtb m10, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E 630 por m8, m10 631 632%if %1 == 16 633%ifidn %2, v 634 lea tmpq, [dstq+mstrideq*8] 635 mova m0, [tmpq+strideq*1] 636%else 637 mova m0, [rsp+12*32] 638%endif 639 ABSSUB m1, m0, m4, m2 640%ifidn %2, v 641 mova m0, [tmpq+strideq*2] 642%else 643 mova m0, [rsp+13*32] 644%endif 645 ABSSUB m2, m0, m4, m10 646 pmaxub m1, m2 647%ifidn %2, v 648 mova m0, [tmpq+stride3q] 649%else 650 mova m0, [rsp+14*32] 651%endif 652 ABSSUB m2, m0, m4, m10 653 pmaxub m1, m2 654%ifidn %2, v 655 lea tmpq, [dstq+strideq*4] 656 mova m0, [tmpq+strideq*0] 657%else 658 mova m0, [rsp+15*32] 659%endif 660 ABSSUB m2, m0, m5, m10 661 pmaxub m1, m2 662%ifidn %2, v 663 mova m0, [tmpq+strideq*1] 664%else 665 mova m0, [rsp+16*32] 666%endif 667 ABSSUB m2, m0, m5, m10 668 pmaxub m1, m2 669%ifidn %2, v 670 mova m0, [tmpq+strideq*2] 671%else 672 mova m0, [rsp+17*32] 673%endif 674 ABSSUB m2, m0, m5, m10 675 pmaxub m1, m2 676 pxor m1, [pb_128] 677 pcmpgtb m1, [pb_129] ; !flat8out 678 por m1, m9 ; !flat8in | !flat8out 679 vpbroadcastd m2, [maskq+8] 680 pand m10, m2, [pb_mask] 681 pcmpeqd m10, [pb_mask] 682 pandn m1, m10 ; flat16 683 pandn m1, m8, m1 ; flat16 & fm 684 685 vpbroadcastd m10, [maskq+4] 686 por m10, m2 687 pand m2, m10, [pb_mask] 688 
pcmpeqd m2, [pb_mask] 689 pandn m9, m2 ; flat8in 690 pandn m9, m8, m9 691 vpbroadcastd m2, [maskq+0] 692 por m2, m10 693 pand m2, [pb_mask] 694 pcmpeqd m2, [pb_mask] 695 pandn m8, m2 696 pandn m8, m9, m8 ; fm & !flat8 & !flat16 697 pandn m9, m1, m9 ; flat8 & !flat16 698%elif %1 != 4 699 vpbroadcastd m0, [maskq+4] 700 pand m2, m0, [pb_mask] 701 pcmpeqd m2, [pb_mask] 702 pandn m9, m2 703 pandn m9, m8, m9 ; flat8 & fm 704 vpbroadcastd m2, [maskq+0] 705 por m0, m2 706 pand m0, [pb_mask] 707 pcmpeqd m0, [pb_mask] 708 pandn m8, m0 709 pandn m8, m9, m8 ; fm & !flat8 710%else 711 vpbroadcastd m0, [maskq+0] 712 pand m0, [pb_mask] 713 pcmpeqd m0, [pb_mask] 714 pandn m8, m0 ; fm 715%endif 716 717 ; short filter 718 719 pxor m3, [pb_128] 720 pxor m6, [pb_128] 721 psubsb m10, m3, m6 ; iclip_diff(p1-q1) 722 pand m10, m7 ; f=iclip_diff(p1-q1)&hev 723 pxor m4, [pb_128] 724 pxor m5, [pb_128] 725 psubsb m11, m5, m4 726 paddsb m10, m11 727 paddsb m10, m11 728 paddsb m10, m11 ; f=iclip_diff(3*(q0-p0)+f) 729 pand m8, m10 ; f&=fm 730 paddsb m10, m8, [pb_3] 731 paddsb m8, [pb_4] 732 pand m10, [pb_248] 733 pand m8, [pb_248] 734 psrlq m10, 3 735 psrlq m8, 3 736 pxor m10, [pb_16] 737 pxor m8, [pb_16] 738 psubb m10, [pb_16] ; f2 739 psubb m8, [pb_16] ; f1 740 paddsb m4, m10 741 psubsb m5, m8 742 pxor m4, [pb_128] 743 pxor m5, [pb_128] 744 745 pxor m8, [pb_128] 746 pxor m10, m10 747 pavgb m8, m10 ; f=(f1+1)>>1 748 psubb m8, [pb_64] 749 pandn m8, m7, m8 ; f&=!hev 750 paddsb m3, m8 751 psubsb m6, m8 752 pxor m3, [pb_128] 753 pxor m6, [pb_128] 754 755%if %1 == 16 756 ; flat16 filter 757%ifidn %2, v 758 lea tmpq, [dstq+mstrideq*8] 759 mova m0, [tmpq+strideq*1] ; p6 760 mova m2, [tmpq+strideq*2] ; p5 761 mova m7, [tmpq+stride3q] ; p4 762%else 763 mova m0, [rsp+12*32] 764 mova m2, [rsp+13*32] 765 mova m7, [rsp+14*32] 766%endif 767 768 mova [rsp+0*32], m9 769 mova [rsp+1*32], m14 770 mova [rsp+2*32], m15 771 772 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A 773 ; write -6 774 
punpcklbw m14, m0, m12 775 punpckhbw m15, m0, m12 776 pmaddubsw m10, m14, [pb_7_1] 777 pmaddubsw m11, m15, [pb_7_1] ; p6*7+p3 778 punpcklbw m8, m2, m7 779 punpckhbw m9, m2, m7 780 pmaddubsw m8, [pb_2] 781 pmaddubsw m9, [pb_2] 782 paddw m10, m8 783 paddw m11, m9 ; p6*7+p5*2+p4*2+p3 784 punpcklbw m8, m13, m3 785 punpckhbw m9, m13, m3 786 pmaddubsw m8, [pb_1] 787 pmaddubsw m9, [pb_1] 788 paddw m10, m8 789 paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1 790 punpcklbw m8, m4, m5 791 punpckhbw m9, m4, m5 792 pmaddubsw m8, [pb_1] 793 pmaddubsw m9, [pb_1] 794 paddw m10, m8 795 paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 796 pmulhrsw m8, m10, [pw_2048] 797 pmulhrsw m9, m11, [pw_2048] 798 packuswb m8, m9 799 pand m8, m1 800 pandn m9, m1, m2 801 por m8, m9 802%ifidn %2, v 803 mova [tmpq+strideq*2], m8 ; p5 804%else 805 mova [rsp+13*32], m8 806%endif 807 808 ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B 809 ; write -5 810 pmaddubsw m14, [pb_m1_1] 811 pmaddubsw m15, [pb_m1_1] 812 paddw m10, m14 813 paddw m11, m15 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0 814 punpcklbw m8, m0, m6 815 punpckhbw m9, m0, m6 816 pmaddubsw m8, [pb_m1_1] 817 pmaddubsw m9, [pb_m1_1] 818 mova [rsp+3*32], m8 819 mova [rsp+4*32], m9 820 paddw m10, m8 821 paddw m11, m9 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1 822 pmulhrsw m8, m10, [pw_2048] 823 pmulhrsw m9, m11, [pw_2048] 824 packuswb m8, m9 825 pand m8, m1 826 pandn m9, m1, m7 827 por m8, m9 828%ifidn %2, v 829 mova [tmpq+stride3q], m8 ; p4 830%else 831 mova [rsp+14*32], m8 832%endif 833 834 ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C 835 ; write -4 836 mova m14, [rsp+1*32] 837 punpcklbw m8, m0, m13 838 punpckhbw m9, m0, m13 839 pmaddubsw m8, [pb_m1_1] 840 pmaddubsw m9, [pb_m1_1] 841 paddw m10, m8 842 paddw m11, m9 ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1 843 punpcklbw m8, m2, m14 844 punpckhbw m2, m14 845 pmaddubsw m8, [pb_m1_1] 846 pmaddubsw m2, [pb_m1_1] 847 mova [rsp+1*32], m8 848 paddw m10, m8 849 paddw m11, m2 ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2 
850 pmulhrsw m8, m10, [pw_2048] 851 pmulhrsw m9, m11, [pw_2048] 852 packuswb m8, m9 853 pand m8, m1 854 pandn m9, m1, m12 855 por m8, m9 856%ifidn %2, v 857 mova [tmpq+strideq*4], m8 ; p3 858%else 859 mova [rsp+19*32], m8 860%endif 861 862 ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D 863 ; write -3 864 mova m15, [rsp+2*32] 865 punpcklbw m8, m0, m3 866 punpckhbw m9, m0, m3 867 pmaddubsw m8, [pb_m1_1] 868 pmaddubsw m9, [pb_m1_1] 869 paddw m10, m8 870 paddw m11, m9 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2 871 punpcklbw m8, m7, m15 872 punpckhbw m7, m15 873 pmaddubsw m8, [pb_m1_1] 874 pmaddubsw m7, [pb_m1_1] 875 mova [rsp+2*32], m8 876 paddw m10, m8 877 paddw m11, m7 ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3 878 pmulhrsw m8, m10, [pw_2048] 879 pmulhrsw m9, m11, [pw_2048] 880 packuswb m8, m9 881 pand m8, m1 882 pandn m9, m1, m13 883 por m8, m9 884 mova [rsp+6*32], m8 ; don't clobber p2/m13 since we need it in F 885 886 ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E 887 ; write -2 888%ifidn %2, v 889 lea tmpq, [dstq+strideq*4] 890%endif 891 punpcklbw m8, m0, m4 892 punpckhbw m9, m0, m4 893 pmaddubsw m8, [pb_m1_1] 894 pmaddubsw m9, [pb_m1_1] 895 paddw m10, m8 896 paddw m11, m9 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3 897%ifidn %2, v 898 mova m9, [tmpq+strideq*0] ; q4 899%else 900 mova m9, [rsp+15*32] 901%endif 902 punpcklbw m8, m12, m9 903 punpckhbw m9, m12, m9 904 pmaddubsw m8, [pb_m1_1] 905 pmaddubsw m9, [pb_m1_1] 906 mova [rsp+7*32], m8 907 mova [rsp+5*32], m9 908 paddw m10, m8 909 paddw m11, m9 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4 910 pmulhrsw m8, m10, [pw_2048] 911 pmulhrsw m9, m11, [pw_2048] 912 packuswb m8, m9 913 pand m8, m1 914 pandn m9, m1, m3 915 por m8, m9 916 mova [rsp+8*32], m8 ; don't clobber p1/m3 since we need it in G 917 918 ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F 919 ; write -1 920%ifidn %2, v 921 mova m9, [tmpq+strideq*1] ; q5 922%else 923 mova m9, [rsp+16*32] 924%endif 925 punpcklbw m8, m0, m5 926 punpckhbw m0, m5 927 
pmaddubsw m8, [pb_m1_1] 928 pmaddubsw m0, [pb_m1_1] 929 paddw m10, m8 930 paddw m11, m0 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4 931 punpcklbw m0, m13, m9 932 punpckhbw m9, m13, m9 933 mova m13, [rsp+6*32] 934 pmaddubsw m0, [pb_m1_1] 935 pmaddubsw m9, [pb_m1_1] 936 mova [rsp+ 9*32], m0 937 mova [rsp+10*32], m9 938 paddw m10, m0 939 paddw m11, m9 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5 940 pmulhrsw m0, m10, [pw_2048] 941 pmulhrsw m8, m11, [pw_2048] 942 packuswb m0, m8 943 pand m0, m1 944 pandn m8, m1, m4 945 por m0, m8 946 mova [rsp+6*32], m0 ; don't clobber p0/m4 since we need it in H 947 948 ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G 949 ; write +0 950%ifidn %2, v 951 mova m0, [tmpq+strideq*2] ; q6 952%else 953 mova m0, [rsp+17*32] 954%endif 955 paddw m10, [rsp+3*32] 956 paddw m11, [rsp+4*32] ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5 957 punpcklbw m8, m3, m0 958 punpckhbw m9, m3, m0 959 mova m3, [rsp+8*32] 960 pmaddubsw m8, [pb_m1_1] 961 pmaddubsw m9, [pb_m1_1] 962 mova [rsp+3*32], m8 963 mova [rsp+4*32], m9 964 paddw m10, m8 965 paddw m11, m9 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6 966 pmulhrsw m8, m10, [pw_2048] 967 pmulhrsw m9, m11, [pw_2048] 968 packuswb m8, m9 969 pand m8, m1 970 pandn m9, m1, m5 971 por m8, m9 972 mova [rsp+8*32], m8 ; don't clobber q0/m5 since we need it in I 973 974 ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H 975 ; write +1 976 paddw m10, [rsp+1*32] 977 paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6 978 punpcklbw m8, m4, m0 979 punpckhbw m2, m4, m0 980 mova m4, [rsp+6*32] 981 pmaddubsw m8, [pb_m1_1] 982 pmaddubsw m2, [pb_m1_1] 983 paddw m10, m8 984 paddw m11, m2 ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2 985 pmulhrsw m2, m10, [pw_2048] 986 pmulhrsw m9, m11, [pw_2048] 987 packuswb m2, m9 988 pand m2, m1 989 pandn m9, m1, m6 990 por m2, m9 ; don't clobber q1/m6 since we need it in K 991 992 ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I 993 ; write +2 994 paddw 
m10, [rsp+2*32] 995 paddw m11, m7 ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2 996 punpcklbw m8, m5, m0 997 punpckhbw m9, m5, m0 998 mova m5, [rsp+8*32] 999 pmaddubsw m8, [pb_m1_1] 1000 pmaddubsw m9, [pb_m1_1] 1001 paddw m10, m8 1002 paddw m11, m9 ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3 1003 pmulhrsw m7, m10, [pw_2048] 1004 pmulhrsw m9, m11, [pw_2048] 1005 packuswb m7, m9 1006 pand m7, m1 1007 pandn m9, m1, m14 1008 por m7, m9 ; don't clobber q2/m14 since we need it in K 1009 1010 ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J 1011 ; write +3 1012 paddw m10, [rsp+7*32] 1013 paddw m11, [rsp+5*32] ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3 1014 punpcklbw m8, m6, m0 1015 punpckhbw m9, m6, m0 1016 SWAP 2, 6 1017 pmaddubsw m8, [pb_m1_1] 1018 pmaddubsw m9, [pb_m1_1] 1019 paddw m10, m8 1020 paddw m11, m9 ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4 1021 pmulhrsw m8, m10, [pw_2048] 1022 pmulhrsw m9, m11, [pw_2048] 1023 packuswb m8, m9 1024 pand m8, m1 1025 pandn m9, m1, m15 1026 por m8, m9 1027%ifidn %2, v 1028 mova [tmpq+mstrideq], m8 ; q3 1029%else 1030 mova [rsp+20*32], m8 1031%endif 1032 1033 ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K 1034 ; write +4 1035 paddw m10, [rsp+ 9*32] 1036 paddw m11, [rsp+10*32] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 1037 punpcklbw m8, m14, m0 1038 punpckhbw m9, m14, m0 1039 SWAP 14, 7 1040 pmaddubsw m8, [pb_m1_1] 1041 pmaddubsw m9, [pb_m1_1] 1042 paddw m10, m8 1043 paddw m11, m9 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 1044 pmulhrsw m8, m10, [pw_2048] 1045 pmulhrsw m9, m11, [pw_2048] 1046 packuswb m8, m9 1047 pand m8, m1 1048%ifidn %2, v 1049 pandn m9, m1, [tmpq+strideq*0] 1050%else 1051 pandn m9, m1, [rsp+15*32] 1052%endif 1053 por m8, m9 1054%ifidn %2, v 1055 mova [tmpq+strideq*0], m8 ; q4 1056%else 1057 mova [rsp+15*32], m8 1058%endif 1059 1060 ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L 1061 ; write +5 1062 paddw m10, [rsp+3*32] 1063 paddw m11, [rsp+4*32] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 1064 punpcklbw 
m8, m15, m0 1065 punpckhbw m9, m15, m0 1066 pmaddubsw m8, [pb_m1_1] 1067 pmaddubsw m9, [pb_m1_1] 1068 paddw m10, m8 1069 paddw m11, m9 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 1070 pmulhrsw m10, [pw_2048] 1071 pmulhrsw m11, [pw_2048] 1072 packuswb m10, m11 1073 pand m10, m1 1074%ifidn %2, v 1075 pandn m11, m1, [tmpq+strideq*1] 1076%else 1077 pandn m11, m1, [rsp+16*32] 1078%endif 1079 por m10, m11 1080%ifidn %2, v 1081 mova [tmpq+strideq*1], m10 ; q5 1082%else 1083 mova [rsp+16*32], m10 1084%endif 1085 1086 mova m9, [rsp+0*32] 1087%ifidn %2, v 1088 lea tmpq, [dstq+mstrideq*4] 1089%endif 1090%endif 1091%if %1 >= 8 1092 ; flat8 filter 1093 punpcklbw m0, m12, m3 1094 punpckhbw m1, m12, m3 1095 pmaddubsw m2, m0, [pb_3_1] 1096 pmaddubsw m7, m1, [pb_3_1] ; 3 * p3 + p1 1097 punpcklbw m8, m13, m4 1098 punpckhbw m11, m13, m4 1099 pmaddubsw m8, [pb_2_1] 1100 pmaddubsw m11, [pb_2_1] 1101 paddw m2, m8 1102 paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 1103 punpcklbw m8, m5, [pb_4] 1104 punpckhbw m11, m5, [pb_4] 1105 pmaddubsw m8, [pb_1] 1106 pmaddubsw m11, [pb_1] 1107 paddw m2, m8 1108 paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4 1109 psrlw m8, m2, 3 1110 psrlw m11, m7, 3 1111 packuswb m8, m11 1112 pand m8, m9 1113 pandn m11, m9, m13 1114 por m10, m8, m11 ; p2 1115%ifidn %2, v 1116 mova [tmpq+strideq*1], m10 ; p2 1117%endif 1118 1119 pmaddubsw m8, m0, [pb_m1_1] 1120 pmaddubsw m11, m1, [pb_m1_1] 1121 paddw m2, m8 1122 paddw m7, m11 1123 punpcklbw m8, m13, m6 1124 punpckhbw m11, m13, m6 1125 pmaddubsw m8, [pb_m1_1] 1126 pmaddubsw m11, [pb_m1_1] 1127 paddw m2, m8 1128 paddw m7, m11 ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4 1129 psrlw m8, m2, 3 1130 psrlw m11, m7, 3 1131 packuswb m8, m11 1132 pand m8, m9 1133 pandn m11, m9, m3 1134 por m8, m11 ; p1 1135%ifidn %2, v 1136 mova [tmpq+strideq*2], m8 ; p1 1137%else 1138 mova [rsp+0*32], m8 1139%endif 1140 1141 pmaddubsw m0, [pb_1] 1142 pmaddubsw m1, [pb_1] 1143 psubw m2, m0 1144 psubw m7, m1 1145 punpcklbw m8, m4, m14 1146 punpckhbw m11, 
m4, m14 1147 pmaddubsw m8, [pb_1] 1148 pmaddubsw m11, [pb_1] 1149 paddw m2, m8 1150 paddw m7, m11 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4 1151 psrlw m8, m2, 3 1152 psrlw m11, m7, 3 1153 packuswb m8, m11 1154 pand m8, m9 1155 pandn m11, m9, m4 1156 por m8, m11 ; p0 1157%ifidn %2, v 1158 mova [tmpq+stride3q ], m8 ; p0 1159%else 1160 mova [rsp+1*32], m8 1161%endif 1162 1163 punpcklbw m0, m5, m15 1164 punpckhbw m1, m5, m15 1165 pmaddubsw m8, m0, [pb_1] 1166 pmaddubsw m11, m1, [pb_1] 1167 paddw m2, m8 1168 paddw m7, m11 1169 punpcklbw m8, m4, m12 1170 punpckhbw m11, m4, m12 1171 pmaddubsw m8, [pb_1] 1172 pmaddubsw m11, [pb_1] 1173 psubw m2, m8 1174 psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4 1175 psrlw m8, m2, 3 1176 psrlw m11, m7, 3 1177 packuswb m8, m11 1178 pand m8, m9 1179 pandn m11, m9, m5 1180 por m11, m8, m11 ; q0 1181%ifidn %2, v 1182 mova [dstq+strideq*0], m11 ; q0 1183%endif 1184 1185 pmaddubsw m0, [pb_m1_1] 1186 pmaddubsw m1, [pb_m1_1] 1187 paddw m2, m0 1188 paddw m7, m1 1189 punpcklbw m8, m13, m6 1190 punpckhbw m13, m6 1191 pmaddubsw m8, [pb_m1_1] 1192 pmaddubsw m13, [pb_m1_1] 1193 paddw m2, m8 1194 paddw m7, m13 ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4 1195 psrlw m8, m2, 3 1196 psrlw m13, m7, 3 1197 packuswb m8, m13 1198 pand m8, m9 1199 pandn m13, m9, m6 1200 por m13, m8, m13 ; q1 1201%ifidn %2, v 1202 mova [dstq+strideq*1], m13 ; q1 1203%endif 1204 1205 punpcklbw m0, m3, m6 1206 punpckhbw m1, m3, m6 1207 pmaddubsw m0, [pb_1] 1208 pmaddubsw m1, [pb_1] 1209 psubw m2, m0 1210 psubw m7, m1 1211 punpcklbw m0, m14, m15 1212 punpckhbw m1, m14, m15 1213 pmaddubsw m0, [pb_1] 1214 pmaddubsw m1, [pb_1] 1215 paddw m2, m0 1216 paddw m7, m1 ; p0 + q0 + q1 + q2 + 2 * q2 + 3 * q3 + 4 1217 psrlw m2, 3 1218 psrlw m7, 3 1219 packuswb m2, m7 1220 pand m2, m9 1221 pandn m7, m9, m14 1222 por m2, m7 ; q2 1223%ifidn %2, v 1224 mova [dstq+strideq*2], m2 ; q2 1225%else 1226 mova m0, [rsp+0*32] 1227 mova m1, [rsp+1*32] 1228%if %1 == 8 1229 ; 16x8 transpose 1230 
punpcklbw m3, m12, m10 1231 punpckhbw m12, m10 1232 punpcklbw m10, m0, m1 1233 punpckhbw m0, m1 1234 punpcklbw m1, m11, m13 1235 punpckhbw m11, m13 1236 punpcklbw m13, m2, m15 1237 punpckhbw m2, m15 1238 1239 punpcklwd m15, m3, m10 1240 punpckhwd m3, m10 1241 punpcklwd m10, m12, m0 1242 punpckhwd m12, m0 1243 punpcklwd m0, m1, m13 1244 punpckhwd m1, m13 1245 punpcklwd m13, m11, m2 1246 punpckhwd m11, m2 1247 1248 punpckldq m2, m15, m0 1249 punpckhdq m15, m0 1250 punpckldq m0, m3, m1 1251 punpckhdq m3, m1 1252 punpckldq m1, m10, m13 1253 punpckhdq m10, m13 1254 punpckldq m13, m12, m11 1255 punpckhdq m12, m11 1256 1257 ; write 8x32 1258 movq [dstq+strideq*0-4], xm2 1259 movhps [dstq+strideq*1-4], xm2 1260 movq [dstq+strideq*2-4], xm15 1261 movhps [dstq+stride3q -4], xm15 1262 lea dstq, [dstq+strideq*4] 1263 movq [dstq+strideq*0-4], xm0 1264 movhps [dstq+strideq*1-4], xm0 1265 movq [dstq+strideq*2-4], xm3 1266 movhps [dstq+stride3q -4], xm3 1267 lea dstq, [dstq+strideq*4] 1268 movq [dstq+strideq*0-4], xm1 1269 movhps [dstq+strideq*1-4], xm1 1270 movq [dstq+strideq*2-4], xm10 1271 movhps [dstq+stride3q -4], xm10 1272 lea dstq, [dstq+strideq*4] 1273 movq [dstq+strideq*0-4], xm13 1274 movhps [dstq+strideq*1-4], xm13 1275 movq [dstq+strideq*2-4], xm12 1276 movhps [dstq+stride3q -4], xm12 1277 lea dstq, [dstq+strideq*4] 1278 1279 vextracti128 xm2, m2, 1 1280 vextracti128 xm15, m15, 1 1281 vextracti128 xm0, m0, 1 1282 vextracti128 xm3, m3, 1 1283 vextracti128 xm1, m1, 1 1284 vextracti128 xm10, m10, 1 1285 vextracti128 xm13, m13, 1 1286 vextracti128 xm12, m12, 1 1287 1288 movq [dstq+strideq*0-4], xm2 1289 movhps [dstq+strideq*1-4], xm2 1290 movq [dstq+strideq*2-4], xm15 1291 movhps [dstq+stride3q -4], xm15 1292 lea dstq, [dstq+strideq*4] 1293 movq [dstq+strideq*0-4], xm0 1294 movhps [dstq+strideq*1-4], xm0 1295 movq [dstq+strideq*2-4], xm3 1296 movhps [dstq+stride3q -4], xm3 1297 lea dstq, [dstq+strideq*4] 1298 movq [dstq+strideq*0-4], xm1 1299 movhps [dstq+strideq*1-4], xm1 
1300 movq [dstq+strideq*2-4], xm10 1301 movhps [dstq+stride3q -4], xm10 1302 lea dstq, [dstq+strideq*4] 1303 movq [dstq+strideq*0-4], xm13 1304 movhps [dstq+strideq*1-4], xm13 1305 movq [dstq+strideq*2-4], xm12 1306 movhps [dstq+stride3q -4], xm12 1307 lea dstq, [dstq+strideq*4] 1308%else 1309 ; 16x16 transpose and store 1310 SWAP 5, 10, 2 1311 SWAP 6, 0 1312 SWAP 7, 1 1313 SWAP 8, 11 1314 SWAP 9, 13 1315 mova m0, [rsp+11*32] 1316 mova m1, [rsp+12*32] 1317 mova m2, [rsp+13*32] 1318 mova m3, [rsp+14*32] 1319 mova m4, [rsp+19*32] 1320 mova m11, [rsp+20*32] 1321 mova m12, [rsp+15*32] 1322 mova m13, [rsp+16*32] 1323 mova m14, [rsp+17*32] 1324 TRANSPOSE_16X16B 1, 0, [rsp+18*32] 1325 movu [dstq+strideq*0-8], xm0 1326 movu [dstq+strideq*1-8], xm1 1327 movu [dstq+strideq*2-8], xm2 1328 movu [dstq+stride3q -8], xm3 1329 lea dstq, [dstq+strideq*4] 1330 movu [dstq+strideq*0-8], xm4 1331 movu [dstq+strideq*1-8], xm5 1332 movu [dstq+strideq*2-8], xm6 1333 movu [dstq+stride3q -8], xm7 1334 lea dstq, [dstq+strideq*4] 1335 movu [dstq+strideq*0-8], xm8 1336 movu [dstq+strideq*1-8], xm9 1337 movu [dstq+strideq*2-8], xm10 1338 movu [dstq+stride3q -8], xm11 1339 lea dstq, [dstq+strideq*4] 1340 movu [dstq+strideq*0-8], xm12 1341 movu [dstq+strideq*1-8], xm13 1342 movu [dstq+strideq*2-8], xm14 1343 movu [dstq+stride3q -8], xm15 1344 lea dstq, [dstq+strideq*4] 1345 vextracti128 [dstq+strideq*0-8], m0, 1 1346 vextracti128 [dstq+strideq*1-8], m1, 1 1347 vextracti128 [dstq+strideq*2-8], m2, 1 1348 vextracti128 [dstq+stride3q -8], m3, 1 1349 lea dstq, [dstq+strideq*4] 1350 vextracti128 [dstq+strideq*0-8], m4, 1 1351 vextracti128 [dstq+strideq*1-8], m5, 1 1352 vextracti128 [dstq+strideq*2-8], m6, 1 1353 vextracti128 [dstq+stride3q -8], m7, 1 1354 lea dstq, [dstq+strideq*4] 1355 vextracti128 [dstq+strideq*0-8], m8, 1 1356 vextracti128 [dstq+strideq*1-8], m9, 1 1357 vextracti128 [dstq+strideq*2-8], m10, 1 1358 vextracti128 [dstq+stride3q -8], m11, 1 1359 lea dstq, [dstq+strideq*4] 1360 
vextracti128 [dstq+strideq*0-8], m12, 1 1361 vextracti128 [dstq+strideq*1-8], m13, 1 1362 vextracti128 [dstq+strideq*2-8], m14, 1 1363 vextracti128 [dstq+stride3q -8], m15, 1 1364 lea dstq, [dstq+strideq*4] 1365%endif 1366%endif 1367%elif %1 == 6 1368 ; flat6 filter 1369 1370 punpcklbw m8, m13, m5 1371 punpckhbw m11, m13, m5 1372 pmaddubsw m0, m8, [pb_3_1] 1373 pmaddubsw m1, m11, [pb_3_1] 1374 punpcklbw m7, m4, m3 1375 punpckhbw m10, m4, m3 1376 pmaddubsw m2, m7, [pb_2] 1377 pmaddubsw m12, m10, [pb_2] 1378 paddw m0, m2 1379 paddw m1, m12 1380 pmulhrsw m2, m0, [pw_4096] 1381 pmulhrsw m12, m1, [pw_4096] 1382 packuswb m2, m12 1383 pand m2, m9 1384 pandn m12, m9, m3 1385 por m2, m12 1386%ifidn %2, v 1387 mova [tmpq+strideq*2], m2 ; p1 1388%endif 1389 1390 pmaddubsw m8, [pb_m1_1] 1391 pmaddubsw m11, [pb_m1_1] 1392 paddw m0, m8 1393 paddw m1, m11 1394 punpcklbw m8, m13, m6 1395 punpckhbw m11, m13, m6 1396 pmaddubsw m8, [pb_m1_1] 1397 pmaddubsw m11, [pb_m1_1] 1398 paddw m0, m8 1399 paddw m1, m11 1400 pmulhrsw m12, m0, [pw_4096] 1401 pmulhrsw m13, m1, [pw_4096] 1402 packuswb m12, m13 1403 pand m12, m9 1404 pandn m13, m9, m4 1405 por m12, m13 1406%ifidn %2, v 1407 mova [tmpq+stride3q], m12 ; p0 1408%endif 1409 1410 paddw m0, m8 1411 paddw m1, m11 1412 punpcklbw m8, m3, m14 1413 punpckhbw m11, m3, m14 1414 pmaddubsw m14, m8, [pb_m1_1] 1415 pmaddubsw m13, m11, [pb_m1_1] 1416 paddw m0, m14 1417 paddw m1, m13 1418 pmulhrsw m14, m0, [pw_4096] 1419 pmulhrsw m13, m1, [pw_4096] 1420 packuswb m14, m13 1421 pand m14, m9 1422 pandn m13, m9, m5 1423 por m14, m13 1424%ifidn %2, v 1425 mova [dstq+strideq*0], m14 ; q0 1426%endif 1427 1428 pmaddubsw m8, [pb_m1_2] 1429 pmaddubsw m11, [pb_m1_2] 1430 paddw m0, m8 1431 paddw m1, m11 1432 pmaddubsw m7, [pb_m1_0] 1433 pmaddubsw m10, [pb_m1_0] 1434 paddw m0, m7 1435 paddw m1, m10 1436 pmulhrsw m0, [pw_4096] 1437 pmulhrsw m1, [pw_4096] 1438 packuswb m0, m1 1439 pand m0, m9 1440 pandn m9, m6 1441 por m0, m9 1442%ifidn %2, v 1443 mova 
[dstq+strideq*1], m0 ; q1
%else
    ; %2 is not v: transpose the four result vectors (stored as p1, p0, q0
    ; and q1 in the v branch above) and write them out
    TRANSPOSE_16x4_AND_WRITE_4x32 2, 12, 14, 0, 1
%endif
%else
    ; %1 is neither >= 8 nor 6: only m3..m6 are written back
%ifidn %2, v
    mova [tmpq+strideq*0], m3                   ; p1
    mova [tmpq+strideq*1], m4                   ; p0
    mova [tmpq+strideq*2], m5                   ; q0
    mova [tmpq+stride3q ], m6                   ; q1
%else
    TRANSPOSE_16x4_AND_WRITE_4x32 3, 4, 5, 6, 7
%endif
%endif
%endmacro

;-----------------------------------------------------------------------
; lpf_v_sb_y
; Named registers: dst, stride, mask, l, l_stride, lut, w (the 7 loaded
; arguments) followed by the scratch names stride3, mstride and tmp.
; Every loop iteration inspects one byte at mask+8, mask+4 and mask+0
; (in that order) and expands FILTER 16, 8 or 4 for the first non-zero
; one; if all three are zero nothing is filtered.  Afterwards dst and l
; move forward by 32, mask by 1, and w drops by 8; the loop repeats
; while w is still greater than zero.
;-----------------------------------------------------------------------
INIT_YMM avx2
cglobal lpf_v_sb_y, 7, 10, 16, 32 * 11, \
                    dst, stride, mask, l, l_stride, lut, \
                    w, stride3, mstride, tmp
    shl         l_strideq, 2                    ; l_stride *= 4
    sub         lq, l_strideq                   ; l -= scaled l_stride
    lea         stride3q, [strideq+strideq*2]   ; stride3 = 3 * stride
    mov         mstrideq, strideq
    neg         mstrideq                        ; mstride = -stride

.loop:
    cmp         byte [maskq+8], 0               ; vmask[2]
    jz          .no_flat16

    FILTER      16, v
    jmp         .end

.no_flat16:
    cmp         byte [maskq+4], 0               ; vmask[1]
    jz          .no_flat

    FILTER      8, v
    jmp         .end

.no_flat:
    cmp         byte [maskq], 0                 ; vmask[0]
    jz          .end                            ; all three bytes clear

    FILTER      4, v

.end:
    add         dstq, 32
    add         lq, 32
    add         maskq, 1                        ; next byte of each mask entry
    sub         wd, 8
    jg          .loop
    RET

;-----------------------------------------------------------------------
; lpf_h_sb_y
; Same dispatch as lpf_v_sb_y, but expanding the h flavour of FILTER and
; counting down h instead of w.  Requests 32*21 bytes of stack.
; When no mask byte is set, .no_filter steps dst forward by 32 * stride
; (24 + 8) and l by 8 * the scaled l_stride.  The filtered paths jump
; straight to .end; the store code of FILTER visible in this file steps
; dstq itself, and presumably l is advanced inside the macro as well
; (NOTE(review): not visible from here, confirm).
;-----------------------------------------------------------------------
INIT_YMM avx2
cglobal lpf_h_sb_y, 7, 10, 16, 32 * 21, \
                    dst, stride, mask, l, l_stride, lut, \
                    h, stride3, l_stride3, tmp
    shl         l_strideq, 2                    ; l_stride *= 4
    sub         lq, 4
    lea         stride3q, [strideq+strideq*2]   ; stride3 = 3 * stride
    lea         l_stride3q, [l_strideq+l_strideq*2] ; 3 * scaled l_stride

.loop:
    cmp         byte [maskq+8], 0               ; vmask[2]
    jz          .no_flat16

    FILTER      16, h
    jmp         .end

.no_flat16:
    cmp         byte [maskq+4], 0               ; vmask[1]
    jz          .no_flat

    FILTER      8, h
    jmp         .end

.no_flat:
    cmp         byte [maskq], 0                 ; vmask[0]
    jz          .no_filter

    FILTER      4, h
    jmp         .end

.no_filter:
    ; nothing to filter here: just step the pointers over this block
    lea         dstq, [dstq+stride3q*8]         ; dst += 24 * stride
    lea         dstq, [dstq+strideq*8]          ; dst +=  8 * stride
    lea         lq, [lq+l_strideq*8]            ; l   +=  8 * scaled l_stride
.end:
    add         maskq, 1                        ; next byte of each mask entry
    sub         hd, 8
    jg          .loop
    RET

;-----------------------------------------------------------------------
; lpf_v_sb_uv
; Variant of lpf_v_sb_y with only two levels: the byte at mask+4 selects
; FILTER 6, otherwise the byte at mask+0 selects FILTER 4.  No stack
; size is passed to cglobal.
;-----------------------------------------------------------------------
INIT_YMM avx2
cglobal lpf_v_sb_uv, 7, 10, 16, \
                     dst, stride, mask, l, l_stride, lut, \
                     w, stride3, mstride, tmp
    shl         l_strideq, 2                    ; l_stride *= 4
    sub         lq, l_strideq                   ; l -= scaled l_stride
    lea         stride3q, [strideq+strideq*2]   ; stride3 = 3 * stride
    mov         mstrideq, strideq
    neg         mstrideq                        ; mstride = -stride

.loop:
    cmp         byte [maskq+4], 0               ; vmask[1]
    jz          .no_flat

    FILTER      6, v
    jmp         .end

.no_flat:
    cmp         byte [maskq], 0                 ; vmask[0]
    jz          .end                            ; both bytes clear

    FILTER      4, v

.end:
    add         dstq, 32
    add         lq, 32
    add         maskq, 1                        ; next byte of each mask entry
    sub         wd, 8
    jg          .loop
    RET

;-----------------------------------------------------------------------
; lpf_h_sb_uv
; Variant of lpf_h_sb_y with only two levels: the byte at mask+4 selects
; FILTER 6, otherwise the byte at mask+0 selects FILTER 4.  The
; .no_filter path performs the same pointer stepping as in lpf_h_sb_y.
;-----------------------------------------------------------------------
INIT_YMM avx2
cglobal lpf_h_sb_uv, 7, 10, 16, \
                     dst, stride, mask, l, l_stride, lut, \
                     h, stride3, l_stride3, tmp
    shl         l_strideq, 2                    ; l_stride *= 4
    sub         lq, 4
    lea         stride3q, [strideq+strideq*2]   ; stride3 = 3 * stride
    lea         l_stride3q, [l_strideq+l_strideq*2] ; 3 * scaled l_stride

.loop:
    cmp         byte [maskq+4], 0               ; vmask[1]
    jz          .no_flat

    FILTER      6, h
    jmp         .end

.no_flat:
    cmp         byte [maskq], 0                 ; vmask[0]
    jz          .no_filter

    FILTER      4, h
    jmp         .end

.no_filter:
    ; nothing to filter here: just step the pointers over this block
    lea         dstq, [dstq+stride3q*8]         ; dst += 24 * stride
    lea         dstq, [dstq+strideq*8]          ; dst +=  8 * stride
    lea         lq, [lq+l_strideq*8]            ; l   +=  8 * scaled l_stride
.end:
    add         maskq, 1                        ; next byte of each mask entry
    sub         hd, 8
    jg          .loop
    RET

%endif ; ARCH_X86_64