; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

%macro JMP_TABLE 2-*
 %xdefine %1_jmptable %%table
 %xdefine %%base mangle(private_prefix %+ _%1_avx2)
 %%table:
 %rep %0 - 1
    dd %%base %+ .%2 - %%table
 %rotate 1
 %endrep
%endmacro

%macro CDEF_FILTER_JMP_TABLE 1
JMP_TABLE cdef_filter_%1_8bpc, \
    d6k0, d6k1, d7k0, d7k1, \
    d0k0, d0k1, d1k0, d1k1, d2k0, d2k1, d3k0, d3k1, \
    d4k0, d4k1, d5k0, d5k1, d6k0, d6k1, d7k0, d7k1, \
    d0k0, d0k1, d1k0, d1k1
%endmacro

SECTION_RODATA 32

pd_47130256:  dd  4,  7,  1,  3,  0,  2,  5,  6
blend_4x4:    dd 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00
              dd 0x80, 0x00, 0x00
blend_4x8_0:  dd 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
blend_4x8_1:  dd 0x00, 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
              dd 0x00, 0x00
blend_4x8_2:  dd 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
              dd 0x0000
blend_4x8_3:  dd 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
              dd 0x0000, 0x0000
blend_8x8_0:  dq 0x00, 0x00, 0x80, 0x80, 0x80, 0x80
blend_8x8_1:  dq 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x0000, 0x0000
div_table:    dd 840, 420, 280, 210, 168, 140, 120, 105, 420, 210, 140, 105
shufw_6543210x: db 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1, 14, 15
shufb_lohi:   db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
pw_128:       times 2 dw 128
pw_2048:      times 2 dw 2048
tap_table:    ; masks for 8 bit shifts
              db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01
              ; weights
              db  4,  2,  3,  3,  2,  1
              db -1 * 16 + 1, -2 * 16 + 2
              db  0 * 16 + 1, -1 * 16 + 2
              db  0 * 16 + 1,  0 * 16 + 2
              db  0 * 16 + 1,  1 * 16 + 2
              db  1 * 16 + 1,  2 * 16 + 2
              db  1 * 16 + 0,  2 * 16 + 1
              db  1 * 16 + 0,  2 * 16 + 0
              db  1 * 16 + 0,  2 * 16 - 1
              ; the last 6 are repeats of the first 6 so we don't need to & 7
              db -1 * 16 + 1, -2 * 16 + 2
              db  0 * 16 + 1, -1 * 16 + 2
              db  0 * 16 + 1,  0 * 16 + 2
              db  0 * 16 + 1,  1 * 16 + 2
              db  1 * 16 + 1,  2 * 16 + 2
              db  1 * 16 + 0,  2 * 16 + 1

CDEF_FILTER_JMP_TABLE 4x4
CDEF_FILTER_JMP_TABLE 4x8
CDEF_FILTER_JMP_TABLE 8x8

SECTION .text
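; The jump tables above store each .dXkY label as a 32-bit offset relative
; to the table itself: PREP_REGS below indexes the table by (dir, k), and
; ACCUMULATE_TAP_BYTE rebases the fetched offset with the table address
; before calling it, which keeps the tables position-independent.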

%macro PREP_REGS 2 ; w, h
    ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
    mov           dird, r7m
    lea           tableq, [cdef_filter_%1x%2_8bpc_jmptable]
    lea           dirq, [tableq+dirq*2*4]
%if %1 == 4
 %if %2 == 4
    DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \
                table, dir, dirjmp, stride3, k
 %else
    DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \
                table, dir, dirjmp, dst4, stride3, k
    lea           dst4q, [dstq+strideq*4]
 %endif
%else
    DEFINE_ARGS dst, stride, h, top1, bot, pri, sec, \
                table, dir, dirjmp, top2, stride3, k
    mov           hq, -8
    lea           top1q, [top1q+strideq*0]
    lea           top2q, [top1q+strideq*1]
%endif
%if %1 == 4
    lea           stride3q, [strideq*3]
%endif
%endmacro

%macro LOAD_BLOCK 2-3 0 ; w, h, init_min_max
    mov           kd, 1
    pxor          m15, m15 ; sum
%if %2 == 8
    pxor          m12, m12
 %if %1 == 4
    movd          xm4, [dstq +strideq*0]
    movd          xm6, [dstq +strideq*1]
    movd          xm5, [dstq +strideq*2]
    movd          xm7, [dstq +stride3q ]
    vinserti128   m4, [dst4q+strideq*0], 1
    vinserti128   m6, [dst4q+strideq*1], 1
    vinserti128   m5, [dst4q+strideq*2], 1
    vinserti128   m7, [dst4q+stride3q ], 1
    punpckldq     m4, m6
    punpckldq     m5, m7
 %else
    movq          xm4, [dstq+strideq*0]
    movq          xm5, [dstq+strideq*1]
    vinserti128   m4, [dstq+strideq*2], 1
    vinserti128   m5, [dstq+stride3q ], 1
 %endif
    punpcklqdq    m4, m5
%else
    movd          xm4, [dstq+strideq*0]
    movd          xm5, [dstq+strideq*1]
    vinserti128   m4, [dstq+strideq*2], 1
    vinserti128   m5, [dstq+stride3q ], 1
    punpckldq     m4, m5
%endif
%if %3 == 1
    mova          m7, m4 ; min
    mova          m8, m4 ; max
%endif
%endmacro

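; ACCUMULATE_TAP_BYTE below applies the CDEF constraint entirely in the
; byte domain: psubusb computes max(0, strength - (abs_diff >> shift)),
; pminub clamps that against abs_diff, and psignb folds the sign of the
; difference into the tap weight before pmaddubsw accumulates. x86 has no
; per-byte shift, so psrlw shifts word lanes and pand with a mask from
; tap_table discards the bits that crossed byte boundaries.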
%macro ACCUMULATE_TAP_BYTE 7-8 0 ; tap_offset, shift, mask, strength
                                 ; mul_tap, w, h, clip
    ; load p0/p1
    movsxd        dirjmpq, [dirq+kq*4+%1*2*4]
    add           dirjmpq, tableq
    call          dirjmpq

%if %8 == 1
    pmaxub        m7, m5
    pminub        m8, m5
    pmaxub        m7, m6
    pminub        m8, m6
%endif

    ; accumulate sum[m15] over p0/p1
%if %7 == 4
    punpcklbw     m5, m6
    punpcklbw     m6, m4, m4
    psubusb       m9, m5, m6
    psubusb       m5, m6, m5
    por           m9, m5     ; abs_diff_p01(p01 - px)
    pcmpeqb       m5, m9
    por           m5, %5
    psignb        m6, %5, m5
    psrlw         m5, m9, %2 ; emulate 8-bit shift
    pand          m5, %3
    psubusb       m5, %4, m5
    pminub        m5, m9
    pmaddubsw     m5, m6
    paddw         m15, m5
%else
    psubusb       m9, m5, m4
    psubusb       m5, m4, m5
    psubusb       m11, m6, m4
    psubusb       m6, m4, m6
    por           m9, m5      ; abs_diff_p0(p0 - px)
    por           m11, m6     ; abs_diff_p1(p1 - px)
    pcmpeqb       m5, m9
    pcmpeqb       m6, m11
    punpckhbw     m10, m9, m11
    punpcklbw     m9, m11
    por           m5, %5
    por           m11, m6, %5
    punpckhbw     m6, m5, m11
    punpcklbw     m5, m11
    psignb        m11, %5, m6
    psrlw         m6, m10, %2 ; emulate 8-bit shift
    pand          m6, %3
    psubusb       m6, %4, m6
    pminub        m6, m10
    pmaddubsw     m6, m11
    paddw         m12, m6
    psignb        m11, %5, m5
    psrlw         m5, m9, %2  ; emulate 8-bit shift
    pand          m5, %3
    psubusb       m5, %4, m5
    pminub        m5, m9
    pmaddubsw     m5, m11
    paddw         m15, m5
%endif
%endmacro

%macro ADJUST_PIXEL 4-5 0 ; w, h, zero, pw_2048, clip
%if %2 == 4
 %if %5 == 1
    punpcklbw     m4, %3
 %endif
    pcmpgtw       %3, m15
    paddw         m15, %3
    pmulhrsw      m15, %4
 %if %5 == 0
    packsswb      m15, m15
    paddb         m4, m15
 %else
    paddw         m4, m15
    packuswb      m4, m4 ; clip px in [0x0,0xff]
    pminub        m4, m7
    pmaxub        m4, m8
 %endif
    vextracti128  xm5, m4, 1
    movd          [dstq+strideq*0], xm4
    movd          [dstq+strideq*2], xm5
    pextrd        [dstq+strideq*1], xm4, 1
    pextrd        [dstq+stride3q ], xm5, 1
%else
    pcmpgtw       m6, %3, m12
    pcmpgtw       m5, %3, m15
    paddw         m12, m6
    paddw         m15, m5
 %if %5 == 1
    punpckhbw     m5, m4, %3
    punpcklbw     m4, %3
 %endif
    pmulhrsw      m12, %4
    pmulhrsw      m15, %4
 %if %5 == 0
    packsswb      m15, m12
    paddb         m4, m15
 %else
    paddw         m5, m12
    paddw         m4, m15
    packuswb      m4, m5 ; clip px in [0x0,0xff]
    pminub        m4, m7
    pmaxub        m4, m8
 %endif
    vextracti128  xm5, m4, 1
 %if %1 == 4
    movd          [dstq +strideq*0], xm4
    movd          [dst4q+strideq*0], xm5
    pextrd        [dstq +strideq*1], xm4, 1
    pextrd        [dst4q+strideq*1], xm5, 1
    pextrd        [dstq +strideq*2], xm4, 2
    pextrd        [dst4q+strideq*2], xm5, 2
    pextrd        [dstq +stride3q ], xm4, 3
    pextrd        [dst4q+stride3q ], xm5, 3
 %else
    movq          [dstq+strideq*0], xm4
    movq          [dstq+strideq*2], xm5
    movhps        [dstq+strideq*1], xm4
    movhps        [dstq+stride3q ], xm5
 %endif
%endif
%endmacro

%macro BORDER_PREP_REGS 2 ; w, h
    ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
    mov           dird, r7m
    lea           dirq, [tableq+dirq*2+14]
%if %1*%2*2/mmsize > 1
 %if %1 == 4
    DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, h, off
 %else
    DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, h, off
 %endif
    mov           hd, %1*%2*2/mmsize
%else
    DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, off
%endif
    lea           stkq, [px]
    pxor          m11, m11
%endmacro

%macro BORDER_LOAD_BLOCK 2-3 0 ; w, h, init_min_max
    mov           kd, 1
%if %1 == 4
    movq          xm4, [stkq+32*0]
    movhps        xm4, [stkq+32*1]
    movq          xm5, [stkq+32*2]
    movhps        xm5, [stkq+32*3]
    vinserti128   m4, xm5, 1
%else
    mova          xm4, [stkq+32*0] ; px
    vinserti128   m4, [stkq+32*1], 1
%endif
    pxor          m15, m15 ; sum
%if %3 == 1
    mova          m7, m4 ; max
    mova          m8, m4 ; min
%endif
%endmacro

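; The BORDER_* macros operate on a 16-bit copy of the block (the px
; buffer built in .border_block) rather than on the 8-bit source rows:
; the extra headroom lets missing edge pixels be marked with a 0x8000
; sentinel, which ACCUMULATE_TAP_WORD below excludes from the min/max
; clip range via the signed-max/unsigned-min trick.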
%macro ACCUMULATE_TAP_WORD 6-7 0 ; tap_offset, shift, mask, strength
                                 ; mul_tap, w, clip
    ; load p0/p1
    movsx         offq, byte [dirq+kq+%1] ; off1
%if %6 == 4
    movq          xm5, [stkq+offq*2+32*0] ; p0
    movq          xm6, [stkq+offq*2+32*2]
    movhps        xm5, [stkq+offq*2+32*1]
    movhps        xm6, [stkq+offq*2+32*3]
    vinserti128   m5, xm6, 1
%else
    movu          xm5, [stkq+offq*2+32*0] ; p0
    vinserti128   m5, [stkq+offq*2+32*1], 1
%endif
    neg           offq ; -off1
%if %6 == 4
    movq          xm6, [stkq+offq*2+32*0] ; p1
    movq          xm9, [stkq+offq*2+32*2]
    movhps        xm6, [stkq+offq*2+32*1]
    movhps        xm9, [stkq+offq*2+32*3]
    vinserti128   m6, xm9, 1
%else
    movu          xm6, [stkq+offq*2+32*0] ; p1
    vinserti128   m6, [stkq+offq*2+32*1], 1
%endif
%if %7 == 1
    ; out of bounds values are set to a value that is both a large unsigned
    ; value and a negative signed value.
    ; use signed max and unsigned min to remove them
    pmaxsw        m7, m5 ; max after p0
    pminuw        m8, m5 ; min after p0
    pmaxsw        m7, m6 ; max after p1
    pminuw        m8, m6 ; min after p1
%endif

    ; accumulate sum[m15] over p0/p1
    ; calculate difference before converting
    psubw         m5, m4 ; diff_p0(p0 - px)
    psubw         m6, m4 ; diff_p1(p1 - px)

    ; convert to 8-bits with signed saturation
    ; saturating to large diffs has no impact on the results
    packsswb      m5, m6

    ; group into pairs so we can accumulate using maddubsw
    pshufb        m5, m12
    pabsb         m9, m5
    psignb        m10, %5, m5
    psrlw         m5, m9, %2 ; emulate 8-bit shift
    pand          m5, %3
    psubusb       m5, %4, m5

    ; use unsigned min since abs diff can equal 0x80
    pminub        m5, m9
    pmaddubsw     m5, m10
    paddw         m15, m5
%endmacro

%macro BORDER_ADJUST_PIXEL 2-3 0 ; w, pw_2048, clip
    pcmpgtw       m9, m11, m15
    paddw         m15, m9
    pmulhrsw      m15, %2
    paddw         m4, m15
%if %3 == 1
    pminsw        m4, m7
    pmaxsw        m4, m8
%endif
    packuswb      m4, m4
    vextracti128  xm5, m4, 1
%if %1 == 4
    movd          [dstq+strideq*0], xm4
    pextrd        [dstq+strideq*1], xm4, 1
    movd          [dstq+strideq*2], xm5
    pextrd        [dstq+stride3q ], xm5, 1
%else
    movq          [dstq+strideq*0], xm4
    movq          [dstq+strideq*1], xm5
%endif
%endmacro

%macro CDEF_FILTER 2 ; w, h
INIT_YMM avx2
cglobal cdef_filter_%1x%2_8bpc, 5, 10, 0, dst, stride, left, top, bot, \
                                          pri, sec, dir, damping, edge
%assign stack_offset_entry stack_offset
    mov           edged, edgem
    cmp           edged, 0xf
    jne .border_block

    PUSH          r10
    PUSH          r11
%if %2 == 4
 %assign regs_used 12
 %if STACK_ALIGNMENT < 32
    PUSH          r%+regs_used
  %assign regs_used regs_used+1
 %endif
    ALLOC_STACK 0x60, 16
    pmovzxbw      xm0, [leftq+1]
    vpermq        m0, m0, q0110
    psrldq        m1, m0, 4
    vpalignr      m2, m0, m0, 12
    movu          [rsp+0x10], m0
    movu          [rsp+0x28], m1
    movu          [rsp+0x40], m2
%elif %1 == 4
    PUSH          r12
 %assign regs_used 13
 %if STACK_ALIGNMENT < 32
    PUSH          r%+regs_used
  %assign regs_used regs_used+1
 %endif
    ALLOC_STACK 8*2+%1*%2*1, 16
    pmovzxwd      m0, [leftq]
    mova          [rsp+0x10], m0
%else
    PUSH          r12
    PUSH          r13
 %assign regs_used 14
 %if STACK_ALIGNMENT < 32
    PUSH          r%+regs_used
  %assign regs_used regs_used+1
 %endif
    ALLOC_STACK 8*4+%1*%2*2+32, 16
    lea           r11, [strideq*3]
    movu          xm4, [dstq+strideq*2]
    pmovzxwq      m0, [leftq+0]
    pmovzxwq      m1, [leftq+8]
    vinserti128   m4, [dstq+r11], 1
    pmovzxbd      m2, [leftq+1]
    pmovzxbd      m3, [leftq+9]
    mov           [rsp+16], botq
    mova          [rsp+0x20], m0
    mova          [rsp+0x40], m1
    mova          [rsp+0x60], m2
    mova          [rsp+0x80], m3
    mova          [rsp+0xa0], m4
    lea           botq, [dstq+strideq*4]
%endif

    DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, zero, pridmp, damping
    mov           dampingd, r8m
    xor           zerod, zerod
    movifnidn     prid, prim
    sub           dampingd, 31
    movifnidn     secdmpd, secdmpm
    test          prid, prid
    jz .sec_only
    movd          xm0, prid
    lzcnt         pridmpd, prid
    add           pridmpd, dampingd
    cmovs         pridmpd, zerod
    mov           [rsp+0], pridmpq ; pri_shift
    test          secdmpd, secdmpd
    jz .pri_only
    movd          xm1, secdmpd
    lzcnt         secdmpd, secdmpd
    add           secdmpd, dampingd
    mov           [rsp+8], secdmpq ; sec_shift

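; For a nonzero 32-bit strength, lzcnt returns 31 - floor(log2(strength)),
; so the sequence above evaluates pri/sec_shift as
; damping - 31 + lzcnt(strength) = damping - floor(log2(strength));
; cmovs clamps a negative pri_shift to zero.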
    DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, table, pridmp
    lea           tableq, [tap_table]
    vpbroadcastb  m13, [tableq+pridmpq] ; pri_shift_mask
    vpbroadcastb  m14, [tableq+secdmpq] ; sec_shift_mask

    ; pri/sec_taps[k] [4 total]
    DEFINE_ARGS dst, stride, left, top, bot, pri, sec, table, dir
    vpbroadcastb  m0, xm0 ; pri_strength
    vpbroadcastb  m1, xm1 ; sec_strength
    and           prid, 1
    lea           priq, [tableq+priq*2+8] ; pri_taps
    lea           secq, [tableq+12]       ; sec_taps

    PREP_REGS %1, %2
%if %1*%2 > mmsize
.v_loop:
%endif
    LOAD_BLOCK %1, %2, 1
.k_loop:
    vpbroadcastb  m2, [priq+kq] ; pri_taps
    vpbroadcastb  m3, [secq+kq] ; sec_taps
    ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2, 1 ; dir + 0
    ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir + 2
    ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir - 2
    dec           kq
    jge .k_loop

    vpbroadcastd  m10, [pw_2048]
    pxor          m9, m9
    ADJUST_PIXEL %1, %2, m9, m10, 1
%if %1*%2 > mmsize
    lea           dstq, [dstq+strideq*4]
    lea           top1q, [rsp+0xa0]
    lea           top2q, [rsp+0xb0]
    mov           botq, [rsp+16]
    add           hq, 4
    jl .v_loop
%endif
    RET

.pri_only:
    DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, pridmp
    lea           tableq, [tap_table]
    vpbroadcastb  m13, [tableq+pridmpq] ; pri_shift_mask
    ; pri/sec_taps[k] [4 total]
    DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, dir
    vpbroadcastb  m0, xm0 ; pri_strength
    and           prid, 1
    lea           priq, [tableq+priq*2+8] ; pri_taps
    PREP_REGS %1, %2
    vpbroadcastd  m3, [pw_2048]
    pxor          m1, m1
%if %1*%2 > mmsize
.pri_v_loop:
%endif
    LOAD_BLOCK %1, %2
.pri_k_loop:
    vpbroadcastb  m2, [priq+kq] ; pri_taps
    ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2 ; dir + 0
    dec           kq
    jge .pri_k_loop
    ADJUST_PIXEL %1, %2, m1, m3
%if %1*%2 > mmsize
    lea           dstq, [dstq+strideq*4]
    lea           top1q, [rsp+0xa0]
    lea           top2q, [rsp+0xb0]
    mov           botq, [rsp+16]
    add           hq, 4
    jl .pri_v_loop
%endif
    RET

.sec_only:
    DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, zero, _, damping
    movd          xm1, secdmpd
    lzcnt         secdmpd, secdmpd
    add           secdmpd, dampingd
    mov           [rsp+8], secdmpq ; sec_shift
    DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, table
    lea           tableq, [tap_table]
    vpbroadcastb  m14, [tableq+secdmpq] ; sec_shift_mask
    ; pri/sec_taps[k] [4 total]
    DEFINE_ARGS dst, stride, left, top, bot, _, sec, table, dir
    vpbroadcastb  m1, xm1 ; sec_strength
    lea           secq, [tableq+12] ; sec_taps
    PREP_REGS %1, %2
    vpbroadcastd  m2, [pw_2048]
    pxor          m0, m0
%if %1*%2 > mmsize
.sec_v_loop:
%endif
    LOAD_BLOCK %1, %2
.sec_k_loop:
    vpbroadcastb  m3, [secq+kq] ; sec_taps
    ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2 ; dir + 2
    ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2 ; dir - 2
    dec           kq
    jge .sec_k_loop
    ADJUST_PIXEL %1, %2, m0, m2
%if %1*%2 > mmsize
    lea           dstq, [dstq+strideq*4]
    lea           top1q, [rsp+0xa0]
    lea           top2q, [rsp+0xb0]
    mov           botq, [rsp+16]
    add           hq, 4
    jl .sec_v_loop
%endif
    RET

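; Each .dXkY helper below gathers the two filter taps for direction X at
; distance k: it returns p0 in m5 and p1 in m6 (px stays in m4), pulling
; rows from dst/top/bot and patching in the left-edge pixels saved on the
; stack using the blend masks from the data section.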
.d0k0:
%if %1 == 4
 %if %2 == 4
    vpbroadcastq  m6, [dstq+strideq*1-1]
    vpbroadcastq  m10, [dstq+strideq*2-1]
    movd          xm5, [topq+strideq*1+1]
    movd          xm9, [dstq+strideq*0+1]
    psrldq        m11, m6, 2
    psrldq        m12, m10, 2
    vinserti128   m6, [dstq+stride3q -1], 1
    vinserti128   m10, [botq          -1], 1
    vpblendd      m5, m11, 0x10
    vpblendd      m9, m12, 0x10
    movu          m11, [blend_4x4+16]
    punpckldq     m6, m10
    punpckldq     m5, m9
    vpblendvb     m6, [rsp+gprsize+0x28], m11
 %else
    movd          xm5, [topq +strideq*1+1]
    movq          xm6, [dstq +strideq*1-1]
    movq          xm10, [dstq +stride3q -1]
    movq          xm11, [dst4q+strideq*1-1]
    pinsrd        xm5, [dstq +strideq*0+1], 1
    movhps        xm6, [dstq +strideq*2-1]
    movhps        xm10, [dst4q+strideq*0-1]
    movhps        xm11, [dst4q+strideq*2-1]
    psrldq        xm9, xm6, 2
    shufps        xm5, xm9, q2010   ; -1 +0 +1 +2
    shufps        xm6, xm10, q2020  ; +1 +2 +3 +4
    psrldq        xm9, xm11, 2
    psrldq        xm10, 2
    shufps        xm10, xm9, q2020  ; +3 +4 +5 +6
    movd          xm9, [dst4q+stride3q -1]
    pinsrd        xm9, [botq           -1], 1
    shufps        xm11, xm9, q1020   ; +5 +6 +7 +8
    pmovzxbw      m9, [leftq+3]
    vinserti128   m6, xm11, 1
    movu          m11, [blend_4x8_0+4]
    vinserti128   m5, xm10, 1
    vpblendvb     m6, m9, m11
 %endif
%else
    lea           r13, [blend_8x8_0+16]
    movq          xm5, [top2q         +1]
    vbroadcasti128 m10, [dstq+strideq*1-1]
    vbroadcasti128 m11, [dstq+strideq*2-1]
    movhps        xm5, [dstq+strideq*0+1]
    vinserti128   m6, m10, [dstq+stride3q-1], 1
    vinserti128   m9, m11, [botq         -1], 1
    psrldq        m10, 2
    psrldq        m11, 2
    punpcklqdq    m6, m9
    movu          m9, [r13+hq*2*1+16*1]
    punpcklqdq    m10, m11
    vpblendd      m5, m10, 0xF0
    vpblendvb     m6, [rsp+gprsize+0x60+hq*8+64+8*1], m9
%endif
    ret
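; d1k0, d2k0 and d3k0 share one body because their first tap sits at the
; same tap_table offset, (0, +1); likewise d5k0/d6k0/d7k0 further down
; all use (+1, 0). Only the second tap (k=1) distinguishes these
; directions.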
.d1k0:
.d2k0:
.d3k0:
%if %1 == 4
 %if %2 == 4
    movq          xm6, [dstq+strideq*0-1]
    movq          xm9, [dstq+strideq*1-1]
    vinserti128   m6, [dstq+strideq*2-1], 1
    vinserti128   m9, [dstq+stride3q -1], 1
    movu          m11, [rsp+gprsize+0x10]
    pcmpeqd       m12, m12
    psrldq        m5, m6, 2
    psrldq        m10, m9, 2
    psrld         m12, 24
    punpckldq     m6, m9
    punpckldq     m5, m10
    vpblendvb     m6, m11, m12
 %else
    movq          xm6, [dstq +strideq*0-1]
    movq          xm9, [dstq +strideq*2-1]
    movhps        xm6, [dstq +strideq*1-1]
    movhps        xm9, [dstq +stride3q -1]
    movq          xm10, [dst4q+strideq*0-1]
    movhps        xm10, [dst4q+strideq*1-1]
    psrldq        xm5, xm6, 2
    psrldq        xm11, xm9, 2
    shufps        xm5, xm11, q2020
    movq          xm11, [dst4q+strideq*2-1]
    movhps        xm11, [dst4q+stride3q -1]
    shufps        xm6, xm9, q2020
    shufps        xm9, xm10, xm11, q2020
    vinserti128   m6, xm9, 1
    pmovzxbw      m9, [leftq+1]
    psrldq        xm10, 2
    psrldq        xm11, 2
    shufps        xm10, xm11, q2020
    vpbroadcastd  m11, [blend_4x8_0+4]
    vinserti128   m5, xm10, 1
    vpblendvb     m6, m9, m11
 %endif
%else
    movu          xm5, [dstq+strideq*0-1]
    movu          xm9, [dstq+strideq*1-1]
    vinserti128   m5, [dstq+strideq*2-1], 1
    vinserti128   m9, [dstq+stride3q -1], 1
    movu          m10, [blend_8x8_0+16]
    punpcklqdq    m6, m5, m9
    vpblendvb     m6, [rsp+gprsize+0x60+hq*8+64], m10
    psrldq        m5, 2
    psrldq        m9, 2
    punpcklqdq    m5, m9
%endif
    ret
.d4k0:
%if %1 == 4
 %if %2 == 4
    vpbroadcastq  m10, [dstq+strideq*1-1]
    vpbroadcastq  m11, [dstq+strideq*2-1]
    movd          xm6, [topq+strideq*1-1]
    movd          xm9, [dstq+strideq*0-1]
    psrldq        m5, m10, 2
    psrldq        m12, m11, 2
    vpblendd      m6, m10, 0x10
    vpblendd      m9, m11, 0x10
    movu          m10, [blend_4x4]
    vinserti128   m5, [dstq+stride3q +1], 1
    vinserti128   m12, [botq          +1], 1
    punpckldq     m6, m9
    punpckldq     m5, m12
    vpblendvb     m6, [rsp+gprsize+0x40], m10
 %else
    movd          xm6, [topq +strideq*1-1]
    movq          xm9, [dstq +strideq*1-1]
    movq          xm10, [dstq +stride3q -1]
    movq          xm11, [dst4q+strideq*1-1]
    pinsrd        xm6, [dstq +strideq*0-1], 1
    movhps        xm9, [dstq +strideq*2-1]
    movhps        xm10, [dst4q+strideq*0-1]
    movhps        xm11, [dst4q+strideq*2-1]
    psrldq        xm5, xm9, 2
    shufps        xm6, xm9, q2010
    psrldq        xm9, xm10, 2
    shufps        xm5, xm9, q2020
    shufps        xm10, xm11, q2020
    movd          xm9, [dst4q+stride3q +1]
    vinserti128   m6, xm10, 1
    pinsrd        xm9, [botq           +1], 1
    psrldq        xm11, 2
    pmovzxbw      m10, [leftq-1]
    shufps        xm11, xm9, q1020
    movu          m9, [blend_4x8_0]
    vinserti128   m5, xm11, 1
    vpblendvb     m6, m10, m9
 %endif
%else
    lea           r13, [blend_8x8_0+8]
    movq          xm6, [top2q         -1]
    vbroadcasti128 m5, [dstq+strideq*1-1]
    vbroadcasti128 m9, [dstq+strideq*2-1]
    movhps        xm6, [dstq+strideq*0-1]
    movu          m11, [r13+hq*2*1+16*1]
    punpcklqdq    m10, m5, m9
    vinserti128   m5, [dstq+stride3q -1], 1
    vinserti128   m9, [botq           -1], 1
    vpblendd      m6, m10, 0xF0
    vpblendvb     m6, [rsp+gprsize+0x60+hq*8+64-8*1], m11
    psrldq        m5, 2
    psrldq        m9, 2
    punpcklqdq    m5, m9
%endif
    ret
.d5k0:
.d6k0:
.d7k0:
%if %1 == 4
 %if %2 == 4
    movd          xm6, [topq+strideq*1  ]
    vpbroadcastd  m5, [dstq+strideq*1  ]
    vpbroadcastd  m9, [dstq+strideq*2  ]
    vpblendd      xm6, [dstq+strideq*0-4], 0x2
    vpblendd      m5, m9, 0x22
    vpblendd      m6, m5, 0x30
    vinserti128   m5, [dstq+stride3q  ], 1
    vpblendd      m5, [botq          -20], 0x20
 %else
    movd          xm6, [topq +strideq*1]
    movd          xm5, [dstq +strideq*1]
    movd          xm9, [dstq +stride3q ]
    movd          xm10, [dst4q+strideq*1]
    movd          xm11, [dst4q+stride3q ]
    pinsrd        xm6, [dstq +strideq*0], 1
    pinsrd        xm5, [dstq +strideq*2], 1
    pinsrd        xm9, [dst4q+strideq*0], 1
    pinsrd        xm10, [dst4q+strideq*2], 1
    pinsrd        xm11, [botq           ], 1
    punpcklqdq    xm6, xm5
    punpcklqdq    xm5, xm9
    punpcklqdq    xm9, xm10
    punpcklqdq    xm10, xm11
    vinserti128   m6, xm9, 1
    vinserti128   m5, xm10, 1
 %endif
%else
    movq          xm6, [top2q        ]
    movq          xm5, [dstq+strideq*1]
    movq          xm9, [dstq+stride3q ]
    movhps        xm6, [dstq+strideq*0]
    movhps        xm5, [dstq+strideq*2]
    movhps        xm9, [botq          ]
    vinserti128   m6, xm5, 1
    vinserti128   m5, xm9, 1
%endif
    ret
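; The k=1 helpers below fetch the second tap of each direction, whose
; tap_table offsets reach up to two rows/columns away from px (e.g.
; (-2, +2) for d0), so the row loads shift by 2 instead of 1.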
.d0k1:
%if %1 == 4
 %if %2 == 4
    movd          xm6, [dstq+strideq*2-2]
    movd          xm9, [dstq+stride3q -2]
    movd          xm5, [topq+strideq*0+2]
    movd          xm10, [topq+strideq*1+2]
    pinsrw        xm6, [leftq+4], 0
    pinsrw        xm9, [leftq+6], 0
    vinserti128   m5, [dstq+strideq*0+2], 1
    vinserti128   m10, [dstq+strideq*1+2], 1
    vinserti128   m6, [botq+strideq*0-2], 1
    vinserti128   m9, [botq+strideq*1-2], 1
    punpckldq     m5, m10
    punpckldq     m6, m9
 %else
    movq          xm6, [dstq +strideq*2-2]
    movd          xm10, [dst4q+strideq*2-2]
    movd          xm5, [topq +strideq*0+2]
    movq          xm9, [dst4q+strideq*0-2]
    movhps        xm6, [dstq +stride3q -2]
    pinsrw        xm10, [dst4q+stride3q ], 3
    pinsrd        xm5, [topq +strideq*1+2], 1
    movhps        xm9, [dst4q+strideq*1-2]
    pinsrd        xm10, [botq +strideq*0-2], 2
    pinsrd        xm5, [dstq +strideq*0+2], 2
    pinsrd        xm10, [botq +strideq*1-2], 3
    pinsrd        xm5, [dstq +strideq*1+2], 3
    shufps        xm11, xm6, xm9, q3131
    shufps        xm6, xm9, q2020
    movu          m9, [blend_4x8_3+8]
    vinserti128   m6, xm10, 1
    vinserti128   m5, xm11, 1
    vpblendvb     m6, [rsp+gprsize+0x10+8], m9
 %endif
%else
    lea           r13, [blend_8x8_1+16]
    movq          xm6, [dstq+strideq*2-2]
    movq          xm9, [dstq+stride3q -2]
    movq          xm5, [top1q         +2]
    movq          xm10, [top2q        +2]
    movu          m11, [r13+hq*2*2+16*2]
    vinserti128   m6, [botq+strideq*0-2], 1
    vinserti128   m9, [botq+strideq*1-2], 1
    vinserti128   m5, [dstq+strideq*0+2], 1
    vinserti128   m10, [dstq+strideq*1+2], 1
    punpcklqdq    m6, m9
    punpcklqdq    m5, m10
    vpblendvb     m6, [rsp+gprsize+0x20+hq*8+64+8*2], m11
%endif
    ret
.d1k1:
%if %1 == 4
 %if %2 == 4
    vpbroadcastq  m6, [dstq+strideq*1-2]
    vpbroadcastq  m9, [dstq+strideq*2-2]
    movd          xm5, [topq+strideq*1+2]
    movd          xm10, [dstq+strideq*0+2]
    psrldq        m11, m6, 4
    psrldq        m12, m9, 4
    vpblendd      m5, m11, 0x10
    movq          xm11, [leftq+2]
    vinserti128   m6, [dstq+stride3q-2], 1
    punpckldq     xm11, xm11
    vpblendd      m10, m12, 0x10
    pcmpeqd       m12, m12
    pmovzxwd      m11, xm11
    psrld         m12, 16
    punpckldq     m6, m9
    vpbroadcastd  m9, [botq-2]
    vpblendvb     m6, m11, m12
    punpckldq     m5, m10
    vpblendd      m6, m9, 0x20
 %else
    movd          xm5, [topq +strideq*1+2]
    movq          xm6, [dstq +strideq*1-2]
    movq          xm9, [dstq +stride3q -2]
    movq          xm10, [dst4q+strideq*1-2]
    movd          xm11, [dst4q+stride3q -2]
    pinsrd        xm5, [dstq +strideq*0+2], 1
    movhps        xm6, [dstq +strideq*2-2]
    movhps        xm9, [dst4q+strideq*0-2]
    movhps        xm10, [dst4q+strideq*2-2]
    pinsrd        xm11, [botq           -2], 1
    shufps        xm5, xm6, q3110
    shufps        xm6, xm9, q2020
    shufps        xm9, xm10, q3131
    shufps        xm10, xm11, q1020
    movu          m11, [blend_4x8_2+4]
    vinserti128   m6, xm10, 1
    vinserti128   m5, xm9, 1
    vpblendvb     m6, [rsp+gprsize+0x10+4], m11
 %endif
%else
    lea           r13, [blend_8x8_1+16]
    movq          xm5, [top2q         +2]
    vbroadcasti128 m6, [dstq+strideq*1-2]
    vbroadcasti128 m9, [dstq+strideq*2-2]
    movhps        xm5, [dstq+strideq*0+2]
    shufps        m10, m6, m9, q2121
    vinserti128   m6, [dstq+stride3q -2], 1
    vinserti128   m9, [botq           -2], 1
    movu          m11, [r13+hq*2*1+16*1]
    vpblendd      m5, m10, 0xF0
    punpcklqdq    m6, m9
    vpblendvb     m6, [rsp+gprsize+0x20+hq*8+64+8*1], m11
%endif
    ret
.d2k1:
%if %1 == 4
 %if %2 == 4
    movq          xm11, [leftq]
    movq          xm6, [dstq+strideq*0-2]
    movq          xm9, [dstq+strideq*1-2]
    vinserti128   m6, [dstq+strideq*2-2], 1
    vinserti128   m9, [dstq+stride3q -2], 1
    punpckldq     xm11, xm11
    psrldq        m5, m6, 4
    psrldq        m10, m9, 4
    pmovzxwd      m11, xm11
    punpckldq     m6, m9
    punpckldq     m5, m10
    pblendw       m6, m11, 0x05
 %else
    movq          xm5, [dstq +strideq*0-2]
    movq          xm9, [dstq +strideq*2-2]
    movq          xm10, [dst4q+strideq*0-2]
    movq          xm11, [dst4q+strideq*2-2]
    movhps        xm5, [dstq +strideq*1-2]
    movhps        xm9, [dstq +stride3q -2]
    movhps        xm10, [dst4q+strideq*1-2]
    movhps        xm11, [dst4q+stride3q -2]
    shufps        xm6, xm5, xm9, q2020
    shufps        xm5, xm9, q3131
    shufps        xm9, xm10, xm11, q2020
    shufps        xm10, xm11, q3131
    pmovzxwd      m11, [leftq]
    vinserti128   m6, xm9, 1
    vinserti128   m5, xm10, 1
    pblendw       m6, m11, 0x55
 %endif
%else
    mova          m11, [rsp+gprsize+0x20+hq*8+64]
    movu          xm5, [dstq+strideq*0-2]
    movu          xm9, [dstq+strideq*1-2]
    vinserti128   m5, [dstq+strideq*2-2], 1
    vinserti128   m9, [dstq+stride3q -2], 1
    shufps        m6, m5, m9, q1010
    shufps        m5, m9, q2121
    pblendw       m6, m11, 0x11
%endif
    ret
.d3k1:
%if %1 == 4
 %if %2 == 4
    vpbroadcastq  m11, [dstq+strideq*1-2]
    vpbroadcastq  m12, [dstq+strideq*2-2]
    movd          xm6, [topq+strideq*1-2]
    movd          xm9, [dstq+strideq*0-2]
    pblendw       m11, [leftq-16+2], 0x01
    pblendw       m12, [leftq-16+4], 0x01
    pinsrw        xm9, [leftq- 0+0], 0
    psrldq        m5, m11, 4
    psrldq        m10, m12, 4
    vinserti128   m5, [dstq+stride3q +2], 1
    vinserti128   m10, [botq          +2], 1
    vpblendd      m6, m11, 0x10
    vpblendd      m9, m12, 0x10
    punpckldq     m6, m9
    punpckldq     m5, m10
 %else
    movd          xm6, [topq +strideq*1-2]
    movq          xm5, [dstq +strideq*1-2]
    movq          xm9, [dstq +stride3q -2]
    movq          xm10, [dst4q+strideq*1-2]
    movd          xm11, [dst4q+stride3q +2]
    pinsrw        xm6, [dstq +strideq*0  ], 3
    movhps        xm5, [dstq +strideq*2-2]
    movhps        xm9, [dst4q+strideq*0-2]
    movhps        xm10, [dst4q+strideq*2-2]
    pinsrd        xm11, [botq           +2], 1
    shufps        xm6, xm5, q2010
    shufps        xm5, xm9, q3131
    shufps        xm9, xm10, q2020
    shufps        xm10, xm11, q1031
    movu          m11, [blend_4x8_2]
    vinserti128   m6, xm9, 1
    vinserti128   m5, xm10, 1
    vpblendvb     m6, [rsp+gprsize+0x10-4], m11
 %endif
%else
    lea           r13, [blend_8x8_1+8]
    movq          xm6, [top2q         -2]
    vbroadcasti128 m5, [dstq+strideq*1-2]
    vbroadcasti128 m10, [dstq+strideq*2-2]
    movhps        xm6, [dstq+strideq*0-2]
    punpcklqdq    m9, m5, m10
    vinserti128   m5, [dstq+stride3q -2], 1
    vinserti128   m10, [botq          -2], 1
    movu          m11, [r13+hq*2*1+16*1]
    vpblendd      m6, m9, 0xF0
    shufps        m5, m10, q2121
    vpblendvb     m6, [rsp+gprsize+0x20+hq*8+64-8*1], m11
%endif
    ret
.d4k1:
%if %1 == 4
 %if %2 == 4
    vinserti128   m6, [dstq+strideq*0-2], 1
    vinserti128   m9, [dstq+strideq*1-2], 1
    movd          xm5, [dstq+strideq*2+2]
    movd          xm10, [dstq+stride3q +2]
    pblendw       m6, [leftq-16+0], 0x01
    pblendw       m9, [leftq-16+2], 0x01
    vinserti128   m5, [botq+strideq*0+2], 1
    vinserti128   m10, [botq+strideq*1+2], 1
    vpblendd      m6, [topq+strideq*0-2], 0x01
    vpblendd      m9, [topq+strideq*1-2], 0x01
    punpckldq     m5, m10
    punpckldq     m6, m9
 %else
    movd          xm6, [topq +strideq*0-2]
    movq          xm5, [dstq +strideq*2-2]
    movq          xm9, [dst4q+strideq*0-2]
    movd          xm10, [dst4q+strideq*2+2]
    pinsrd        xm6, [topq +strideq*1-2], 1
    movhps        xm5, [dstq +stride3q -2]
    movhps        xm9, [dst4q+strideq*1-2]
    pinsrd        xm10, [dst4q+stride3q +2], 1
    pinsrd        xm6, [dstq +strideq*0-2], 2
    pinsrd        xm10, [botq +strideq*0+2], 2
    pinsrd        xm6, [dstq +strideq*1-2], 3
    pinsrd        xm10, [botq +strideq*1+2], 3
    shufps        xm11, xm5, xm9, q2020
    shufps        xm5, xm9, q3131
    movu          m9, [blend_4x8_3]
    vinserti128   m6, xm11, 1
    vinserti128   m5, xm10, 1
    vpblendvb     m6, [rsp+gprsize+0x10-8], m9
 %endif
%else
    lea           r13, [blend_8x8_1]
    movu          m11, [r13+hq*2*2+16*2]
    movq          xm6, [top1q         -2]
    movq          xm9, [top2q         -2]
    movq          xm5, [dstq+strideq*2+2]
    movq          xm10, [dstq+stride3q +2]
    vinserti128   m6, [dstq+strideq*0-2], 1
    vinserti128   m9, [dstq+strideq*1-2], 1
    vinserti128   m5, [botq+strideq*0+2], 1
    vinserti128   m10, [botq+strideq*1+2], 1
    punpcklqdq    m6, m9
    vpblendvb     m6, [rsp+gprsize+0x20+hq*8+64-8*2], m11
    punpcklqdq    m5, m10
%endif
    ret
.d5k1:
%if %1 == 4
 %if %2 == 4
    movd          xm6, [topq+strideq*0-1]
    movd          xm9, [topq+strideq*1-1]
    movd          xm5, [dstq+strideq*2+1]
    movd          xm10, [dstq+stride3q +1]
    pcmpeqd       m12, m12
    pmovzxbw      m11, [leftq-8+1]
    psrld         m12, 24
    vinserti128   m6, [dstq+strideq*0-1], 1
    vinserti128   m9, [dstq+strideq*1-1], 1
    vinserti128   m5, [botq+strideq*0+1], 1
    vinserti128   m10, [botq+strideq*1+1], 1
    punpckldq     m6, m9
    pxor          m9, m9
    vpblendd      m12, m9, 0x0F
    punpckldq     m5, m10
    vpblendvb     m6, m11, m12
 %else
    movd          xm6, [topq +strideq*0-1]
    movq          xm5, [dstq +strideq*2-1]
    movq          xm9, [dst4q+strideq*0-1]
    movd          xm10, [dst4q+strideq*2+1]
    pinsrd        xm6, [topq +strideq*1-1], 1
    movhps        xm5, [dstq +stride3q -1]
    movhps        xm9, [dst4q+strideq*1-1]
    pinsrd        xm10, [dst4q+stride3q +1], 1
    pinsrd        xm6, [dstq +strideq*0-1], 2
    pinsrd        xm10, [botq +strideq*0+1], 2
    pinsrd        xm6, [dstq +strideq*1-1], 3
    pinsrd        xm10, [botq +strideq*1+1], 3
    shufps        xm11, xm5, xm9, q2020
    vinserti128   m6, xm11, 1
    pmovzxbw      m11, [leftq-3]
    psrldq        xm5, 2
    psrldq        xm9, 2
    shufps        xm5, xm9, q2020
    movu          m9, [blend_4x8_1]
    vinserti128   m5, xm10, 1
    vpblendvb     m6, m11, m9
 %endif
%else
    lea           r13, [blend_8x8_0]
    movu          m11, [r13+hq*2*2+16*2]
    movq          xm6, [top1q         -1]
    movq          xm9, [top2q         -1]
    movq          xm5, [dstq+strideq*2+1]
    movq          xm10, [dstq+stride3q +1]
    vinserti128   m6, [dstq+strideq*0-1], 1
    vinserti128   m9, [dstq+strideq*1-1], 1
    vinserti128   m5, [botq+strideq*0+1], 1
    vinserti128   m10, [botq+strideq*1+1], 1
    punpcklqdq    m6, m9
    punpcklqdq    m5, m10
    vpblendvb     m6, [rsp+gprsize+0x60+hq*8+64-8*2], m11
%endif
    ret
.d6k1:
%if %1 == 4
 %if %2 == 4
    movd          xm6, [topq+strideq*0]
    movd          xm9, [topq+strideq*1]
    movd          xm5, [dstq+strideq*2]
    movd          xm10, [dstq+stride3q ]
    vinserti128   m6, [dstq+strideq*0], 1
    vinserti128   m9, [dstq+strideq*1], 1
    vinserti128   m5, [botq+strideq*0], 1
    vinserti128   m10, [botq+strideq*1], 1
    punpckldq     m6, m9
    punpckldq     m5, m10
 %else
    movd          xm5, [dstq +strideq*2]
    movd          xm6, [topq +strideq*0]
    movd          xm9, [dst4q+strideq*2]
    pinsrd        xm5, [dstq +stride3q ], 1
    pinsrd        xm6, [topq +strideq*1], 1
    pinsrd        xm9, [dst4q+stride3q ], 1
    pinsrd        xm5, [dst4q+strideq*0], 2
    pinsrd        xm6, [dstq +strideq*0], 2
    pinsrd        xm9, [botq +strideq*0], 2
    pinsrd        xm5, [dst4q+strideq*1], 3
    pinsrd        xm6, [dstq +strideq*1], 3
    pinsrd        xm9, [botq +strideq*1], 3
    vinserti128   m6, xm5, 1
    vinserti128   m5, xm9, 1
 %endif
%else
    movq          xm5, [dstq+strideq*2]
    movq          xm9, [botq+strideq*0]
    movq          xm6, [top1q        ]
    movq          xm10, [dstq+strideq*0]
    movhps        xm5, [dstq+stride3q ]
    movhps        xm9, [botq+strideq*1]
    movhps        xm6, [top2q        ]
    movhps        xm10, [dstq+strideq*1]
    vinserti128   m5, xm9, 1
    vinserti128   m6, xm10, 1
%endif
    ret
.d7k1:
%if %1 == 4
 %if %2 == 4
    movd          xm5, [dstq+strideq*2-1]
    movd          xm9, [dstq+stride3q -1]
    movd          xm6, [topq+strideq*0+1]
    movd          xm10, [topq+strideq*1+1]
    pinsrb        xm5, [leftq+ 5], 0
    pinsrb        xm9, [leftq+ 7], 0
    vinserti128   m6, [dstq+strideq*0+1], 1
    vinserti128   m10, [dstq+strideq*1+1], 1
    vinserti128   m5, [botq+strideq*0-1], 1
    vinserti128   m9, [botq+strideq*1-1], 1
    punpckldq     m6, m10
    punpckldq     m5, m9
 %else
    movd          xm6, [topq +strideq*0+1]
    movq          xm9, [dstq +strideq*2-1]
    movq          xm10, [dst4q+strideq*0-1]
    movd          xm11, [dst4q+strideq*2-1]
    pinsrd        xm6, [topq +strideq*1+1], 1
    movhps        xm9, [dstq +stride3q -1]
    movhps        xm10, [dst4q+strideq*1-1]
    pinsrd        xm11, [dst4q+stride3q -1], 1
    pinsrd        xm6, [dstq +strideq*0+1], 2
    pinsrd        xm11, [botq +strideq*0-1], 2
    pinsrd        xm6, [dstq +strideq*1+1], 3
    pinsrd        xm11, [botq +strideq*1-1], 3
    shufps        xm5, xm9, xm10, q2020
    vinserti128   m5, xm11, 1
    pmovzxbw      m11, [leftq+5]
    psrldq        xm9, 2
    psrldq        xm10, 2
    shufps        xm9, xm10, q2020
    movu          m10, [blend_4x8_1+8]
    vinserti128   m6, xm9, 1
    vpblendvb     m5, m11, m10
 %endif
%else
    lea           r13, [blend_8x8_0+16]
    movq          xm5, [dstq+strideq*2-1]
    movq          xm9, [botq+strideq*0-1]
    movq          xm6, [top1q         +1]
    movq          xm10, [dstq+strideq*0+1]
    movhps        xm5, [dstq+stride3q -1]
    movhps        xm9, [botq+strideq*1-1]
    movhps        xm6, [top2q         +1]
    movhps        xm10, [dstq+strideq*1+1]
    movu          m11, [r13+hq*2*2+16*2]
    vinserti128   m5, xm9, 1
    vinserti128   m6, xm10, 1
    vpblendvb     m5, [rsp+gprsize+0x60+hq*8+64+8*2], m11
%endif
    ret

.border_block:
    DEFINE_ARGS dst, stride, left, top, bot, pri, sec, stride3, dst4, edge
%define rstk rsp
%assign stack_offset stack_offset_entry
%assign regs_used 10
%if STACK_ALIGNMENT < 32
    PUSH          r%+regs_used
 %assign regs_used regs_used+1
%endif
    ALLOC_STACK 2*16+(%2+4)*32, 16
%define px rsp+2*16+2*32
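; px buffer layout: (%2+4) rows spaced 32 bytes apart, holding 16-bit
; pixels; px points two rows into the buffer, so rows -2/-1 carry the top
; edge and rows %2/%2+1 the bottom edge, and the 32-byte row pitch leaves
; room for the 2-pixel left/right edges around the %1 block pixels.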

    pcmpeqw       m14, m14
    psllw         m14, 15 ; 0x8000

    ; prepare pixel buffers - body/right
%if %1 == 4
    INIT_XMM avx2
%endif
%if %2 == 8
    lea           dst4q, [dstq+strideq*4]
%endif
    lea           stride3q, [strideq*3]
    test          edgeb, 2 ; have_right
    jz .no_right
    pmovzxbw      m1, [dstq+strideq*0]
    pmovzxbw      m2, [dstq+strideq*1]
    pmovzxbw      m3, [dstq+strideq*2]
    pmovzxbw      m4, [dstq+stride3q]
    mova          [px+0*32], m1
    mova          [px+1*32], m2
    mova          [px+2*32], m3
    mova          [px+3*32], m4
%if %2 == 8
    pmovzxbw      m1, [dst4q+strideq*0]
    pmovzxbw      m2, [dst4q+strideq*1]
    pmovzxbw      m3, [dst4q+strideq*2]
    pmovzxbw      m4, [dst4q+stride3q]
    mova          [px+4*32], m1
    mova          [px+5*32], m2
    mova          [px+6*32], m3
    mova          [px+7*32], m4
%endif
    jmp .body_done
.no_right:
%if %1 == 4
    movd          xm1, [dstq+strideq*0]
    movd          xm2, [dstq+strideq*1]
    movd          xm3, [dstq+strideq*2]
    movd          xm4, [dstq+stride3q]
    pmovzxbw      xm1, xm1
    pmovzxbw      xm2, xm2
    pmovzxbw      xm3, xm3
    pmovzxbw      xm4, xm4
    movq          [px+0*32], xm1
    movq          [px+1*32], xm2
    movq          [px+2*32], xm3
    movq          [px+3*32], xm4
%else
    pmovzxbw      xm1, [dstq+strideq*0]
    pmovzxbw      xm2, [dstq+strideq*1]
    pmovzxbw      xm3, [dstq+strideq*2]
    pmovzxbw      xm4, [dstq+stride3q]
    mova          [px+0*32], xm1
    mova          [px+1*32], xm2
    mova          [px+2*32], xm3
    mova          [px+3*32], xm4
%endif
    movd          [px+0*32+%1*2], xm14
    movd          [px+1*32+%1*2], xm14
    movd          [px+2*32+%1*2], xm14
    movd          [px+3*32+%1*2], xm14
%if %2 == 8
 %if %1 == 4
    movd          xm1, [dst4q+strideq*0]
    movd          xm2, [dst4q+strideq*1]
    movd          xm3, [dst4q+strideq*2]
    movd          xm4, [dst4q+stride3q]
    pmovzxbw      xm1, xm1
    pmovzxbw      xm2, xm2
    pmovzxbw      xm3, xm3
    pmovzxbw      xm4, xm4
    movq          [px+4*32], xm1
    movq          [px+5*32], xm2
    movq          [px+6*32], xm3
    movq          [px+7*32], xm4
 %else
    pmovzxbw      xm1, [dst4q+strideq*0]
    pmovzxbw      xm2, [dst4q+strideq*1]
    pmovzxbw      xm3, [dst4q+strideq*2]
    pmovzxbw      xm4, [dst4q+stride3q]
    mova          [px+4*32], xm1
    mova          [px+5*32], xm2
    mova          [px+6*32], xm3
    mova          [px+7*32], xm4
 %endif
    movd          [px+4*32+%1*2], xm14
    movd          [px+5*32+%1*2], xm14
    movd          [px+6*32+%1*2], xm14
    movd          [px+7*32+%1*2], xm14
%endif
.body_done:

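; edge bits: 1 = have_left, 2 = have_right, 4 = have_top, 8 = have_bottom
; (a fully-interior block, edge == 0xf, took the fast path above).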
    ; top
    test          edgeb, 4 ; have_top
    jz .no_top
    test          edgeb, 1 ; have_left
    jz .top_no_left
    test          edgeb, 2 ; have_right
    jz .top_no_right
    pmovzxbw      m1, [topq+strideq*0-(%1/2)]
    pmovzxbw      m2, [topq+strideq*1-(%1/2)]
    movu          [px-2*32-%1], m1
    movu          [px-1*32-%1], m2
    jmp .top_done
.top_no_right:
    pmovzxbw      m1, [topq+strideq*0-%1]
    pmovzxbw      m2, [topq+strideq*1-%1]
    movu          [px-2*32-%1*2], m1
    movu          [px-1*32-%1*2], m2
    movd          [px-2*32+%1*2], xm14
    movd          [px-1*32+%1*2], xm14
    jmp .top_done
.top_no_left:
    test          edgeb, 2 ; have_right
    jz .top_no_left_right
    pmovzxbw      m1, [topq+strideq*0]
    pmovzxbw      m2, [topq+strideq*1]
    mova          [px-2*32+0], m1
    mova          [px-1*32+0], m2
    movd          [px-2*32-4], xm14
    movd          [px-1*32-4], xm14
    jmp .top_done
.top_no_left_right:
%if %1 == 4
    movd          xm1, [topq+strideq*0]
    pinsrd        xm1, [topq+strideq*1], 1
    pmovzxbw      xm1, xm1
    movq          [px-2*32+0], xm1
    movhps        [px-1*32+0], xm1
%else
    pmovzxbw      xm1, [topq+strideq*0]
    pmovzxbw      xm2, [topq+strideq*1]
    mova          [px-2*32+0], xm1
    mova          [px-1*32+0], xm2
%endif
    movd          [px-2*32-4], xm14
    movd          [px-1*32-4], xm14
    movd          [px-2*32+%1*2], xm14
    movd          [px-1*32+%1*2], xm14
    jmp .top_done
.no_top:
    movu          [px-2*32-%1], m14
    movu          [px-1*32-%1], m14
.top_done:

    ; left
    test          edgeb, 1 ; have_left
    jz .no_left
    pmovzxbw      xm1, [leftq+ 0]
%if %2 == 8
    pmovzxbw      xm2, [leftq+ 8]
%endif
    movd          [px+0*32-4], xm1
    pextrd        [px+1*32-4], xm1, 1
    pextrd        [px+2*32-4], xm1, 2
    pextrd        [px+3*32-4], xm1, 3
%if %2 == 8
    movd          [px+4*32-4], xm2
    pextrd        [px+5*32-4], xm2, 1
    pextrd        [px+6*32-4], xm2, 2
    pextrd        [px+7*32-4], xm2, 3
%endif
    jmp .left_done
.no_left:
    movd          [px+0*32-4], xm14
    movd          [px+1*32-4], xm14
    movd          [px+2*32-4], xm14
    movd          [px+3*32-4], xm14
%if %2 == 8
    movd          [px+4*32-4], xm14
    movd          [px+5*32-4], xm14
    movd          [px+6*32-4], xm14
    movd          [px+7*32-4], xm14
%endif
.left_done:

    ; bottom
    DEFINE_ARGS dst, stride, _, _, bot, pri, sec, stride3, _, edge
    test          edgeb, 8 ; have_bottom
    jz .no_bottom
    test          edgeb, 1 ; have_left
    jz .bottom_no_left
    test          edgeb, 2 ; have_right
    jz .bottom_no_right
    pmovzxbw      m1, [botq+strideq*0-(%1/2)]
    pmovzxbw      m2, [botq+strideq*1-(%1/2)]
    movu          [px+(%2+0)*32-%1], m1
    movu          [px+(%2+1)*32-%1], m2
    jmp .bottom_done
.bottom_no_right:
    pmovzxbw      m1, [botq+strideq*0-%1]
    pmovzxbw      m2, [botq+strideq*1-%1]
    movu          [px+(%2+0)*32-%1*2], m1
    movu          [px+(%2+1)*32-%1*2], m2
%if %1 == 8
    movd          [px+(%2-1)*32+%1*2], xm14 ; overwritten by previous movu
%endif
    movd          [px+(%2+0)*32+%1*2], xm14
    movd          [px+(%2+1)*32+%1*2], xm14
    jmp .bottom_done
.bottom_no_left:
    test          edgeb, 2 ; have_right
    jz .bottom_no_left_right
    pmovzxbw      m1, [botq+strideq*0]
    pmovzxbw      m2, [botq+strideq*1]
    mova          [px+(%2+0)*32+0], m1
    mova          [px+(%2+1)*32+0], m2
    movd          [px+(%2+0)*32-4], xm14
    movd          [px+(%2+1)*32-4], xm14
    jmp .bottom_done
.bottom_no_left_right:
%if %1 == 4
    movd          xm1, [botq+strideq*0]
    pinsrd        xm1, [botq+strideq*1], 1
    pmovzxbw      xm1, xm1
    movq          [px+(%2+0)*32+0], xm1
    movhps        [px+(%2+1)*32+0], xm1
%else
    pmovzxbw      xm1, [botq+strideq*0]
    pmovzxbw      xm2, [botq+strideq*1]
    mova          [px+(%2+0)*32+0], xm1
    mova          [px+(%2+1)*32+0], xm2
%endif
    movd          [px+(%2+0)*32-4], xm14
    movd          [px+(%2+1)*32-4], xm14
    movd          [px+(%2+0)*32+%1*2], xm14
    movd          [px+(%2+1)*32+%1*2], xm14
    jmp .bottom_done
.no_bottom:
    movu          [px+(%2+0)*32-%1], m14
    movu          [px+(%2+1)*32-%1], m14
.bottom_done:

    ; actual filter
    INIT_YMM avx2
    DEFINE_ARGS dst, stride, _, pridmp, damping, pri, secdmp, stride3, zero
%undef edged
    ; register to shuffle values into after packing
    vbroadcasti128 m12, [shufb_lohi]

    mov           dampingd, r8m
    xor           zerod, zerod
    movifnidn     prid, prim
    sub           dampingd, 31
    movifnidn     secdmpd, secdmpm
    test          prid, prid
    jz .border_sec_only
    movd          xm0, prid
    lzcnt         pridmpd, prid
    add           pridmpd, dampingd
    cmovs         pridmpd, zerod
    mov           [rsp+0], pridmpq ; pri_shift
    test          secdmpd, secdmpd
    jz .border_pri_only
    movd          xm1, secdmpd
    lzcnt         secdmpd, secdmpd
    add           secdmpd, dampingd
    mov           [rsp+8], secdmpq ; sec_shift

    DEFINE_ARGS dst, stride, _, pridmp, table, pri, secdmp, stride3
    lea           tableq, [tap_table]
    vpbroadcastb  m13, [tableq+pridmpq] ; pri_shift_mask
    vpbroadcastb  m14, [tableq+secdmpq] ; sec_shift_mask

    ; pri/sec_taps[k] [4 total]
    DEFINE_ARGS dst, stride, _, dir, table, pri, sec, stride3
    vpbroadcastb  m0, xm0 ; pri_strength
    vpbroadcastb  m1, xm1 ; sec_strength
    and           prid, 1
    lea           priq, [tableq+priq*2+8] ; pri_taps
    lea           secq, [tableq+12]       ; sec_taps

    BORDER_PREP_REGS %1, %2
%if %1*%2*2/mmsize > 1
.border_v_loop:
%endif
    BORDER_LOAD_BLOCK %1, %2, 1
.border_k_loop:
    vpbroadcastb  m2, [priq+kq] ; pri_taps
    vpbroadcastb  m3, [secq+kq] ; sec_taps
    ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1, 1
    ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1, 1
    ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1, 1
    dec           kq
    jge .border_k_loop

    vpbroadcastd  m10, [pw_2048]
    BORDER_ADJUST_PIXEL %1, m10, 1
%if %1*%2*2/mmsize > 1
 %define vloop_lines (mmsize/(%1*2))
    lea           dstq, [dstq+strideq*vloop_lines]
    add           stkq, 32*vloop_lines
    dec           hd
    jg .border_v_loop
%endif
    RET

.border_pri_only:
    DEFINE_ARGS dst, stride, _, pridmp, table, pri, _, stride3
    lea           tableq, [tap_table]
    vpbroadcastb  m13, [tableq+pridmpq] ; pri_shift_mask
    DEFINE_ARGS dst, stride, _, dir, table, pri, _, stride3
    vpbroadcastb  m0, xm0 ; pri_strength
    and           prid, 1
    lea           priq, [tableq+priq*2+8] ; pri_taps
    BORDER_PREP_REGS %1, %2
    vpbroadcastd  m1, [pw_2048]
%if %1*%2*2/mmsize > 1
.border_pri_v_loop:
%endif
    BORDER_LOAD_BLOCK %1, %2
.border_pri_k_loop:
    vpbroadcastb  m2, [priq+kq] ; pri_taps
    ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1
    dec           kq
    jge .border_pri_k_loop
    BORDER_ADJUST_PIXEL %1, m1
%if %1*%2*2/mmsize > 1
 %define vloop_lines (mmsize/(%1*2))
    lea           dstq, [dstq+strideq*vloop_lines]
    add           stkq, 32*vloop_lines
    dec           hd
    jg .border_pri_v_loop
%endif
    RET

.border_sec_only:
    DEFINE_ARGS dst, stride, _, _, damping, _, secdmp, stride3
    movd          xm1, secdmpd
    lzcnt         secdmpd, secdmpd
    add           secdmpd, dampingd
    mov           [rsp+8], secdmpq ; sec_shift
    DEFINE_ARGS dst, stride, _, _, table, _, secdmp, stride3
    lea           tableq, [tap_table]
    vpbroadcastb  m14, [tableq+secdmpq] ; sec_shift_mask
    DEFINE_ARGS dst, stride, _, dir, table, _, sec, stride3
    vpbroadcastb  m1, xm1 ; sec_strength
    lea           secq, [tableq+12] ; sec_taps
    BORDER_PREP_REGS %1, %2
    vpbroadcastd  m0, [pw_2048]
%if %1*%2*2/mmsize > 1
.border_sec_v_loop:
%endif
    BORDER_LOAD_BLOCK %1, %2
.border_sec_k_loop:
    vpbroadcastb  m3, [secq+kq] ; sec_taps
    ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1
    ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1
    dec           kq
    jge .border_sec_k_loop
    BORDER_ADJUST_PIXEL %1, m0
%if %1*%2*2/mmsize > 1
 %define vloop_lines (mmsize/(%1*2))
    lea           dstq, [dstq+strideq*vloop_lines]
    add           stkq, 32*vloop_lines
    dec           hd
    jg .border_sec_v_loop
%endif
    RET
%endmacro

CDEF_FILTER 8, 8
CDEF_FILTER 4, 8
CDEF_FILTER 4, 4

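; cdef_dir picks the dominant edge direction of an 8x8 block: it sums the
; pixels along each of the 8 candidate directions, squares the partial
; sums and scales them onto a common basis (see div_table), then takes
; the direction with the largest cost; the variance output is derived
; from the gap to the orthogonal (idx^4) direction's cost.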
INIT_YMM avx2
cglobal cdef_dir_8bpc, 3, 4, 6, src, stride, var, stride3
    lea           stride3q, [strideq*3]
    movq          xm0, [srcq+strideq*0]
    movq          xm1, [srcq+strideq*1]
    movq          xm2, [srcq+strideq*2]
    movq          xm3, [srcq+stride3q ]
    lea           srcq, [srcq+strideq*4]
    vpbroadcastq  m4, [srcq+stride3q ]
    vpbroadcastq  m5, [srcq+strideq*2]
    vpblendd      m0, m4, 0xf0
    vpblendd      m1, m5, 0xf0
    vpbroadcastq  m4, [srcq+strideq*1]
    vpbroadcastq  m5, [srcq+strideq*0]
    vpblendd      m2, m4, 0xf0
    vpblendd      m3, m5, 0xf0
    pxor          m4, m4
    punpcklbw     m0, m4
    punpcklbw     m1, m4
    punpcklbw     m2, m4
    punpcklbw     m3, m4
cglobal_label .main
    vpbroadcastd  m4, [pw_128]
    PROLOGUE 3, 4, 15
    psubw         m0, m4
    psubw         m1, m4
    psubw         m2, m4
    psubw         m3, m4

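; Each cost below is the sum over a direction's lines of
; (partial_sum^2) * 840/len; div_table stores 840/n (840 = lcm of 1..8),
; so lines with different pixel counts contribute on a common scale.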
    ; shuffle registers to generate partial_sum_diag[0-1] together
    vperm2i128    m7, m0, m0, 0x01
    vperm2i128    m6, m1, m1, 0x01
    vperm2i128    m5, m2, m2, 0x01
    vperm2i128    m4, m3, m3, 0x01

    ; start with partial_sum_hv[0-1]
    paddw         m8, m0, m1
    paddw         m9, m2, m3
    phaddw        m10, m0, m1
    phaddw        m11, m2, m3
    paddw         m8, m9
    phaddw        m10, m11
    vextracti128  xm9, m8, 1
    vextracti128  xm11, m10, 1
    paddw         xm8, xm9   ; partial_sum_hv[1]
    phaddw        xm10, xm11 ; partial_sum_hv[0]
    vinserti128   m8, xm10, 1
    vpbroadcastd  m9, [div_table+44]
    pmaddwd       m8, m8
    pmulld        m8, m9     ; cost6[2a-d] | cost2[a-d]

    ; create aggregates [lower half]:
    ; m9 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234+
    ;      m4:xxxx0123+m5:xxxxx012+m6:xxxxxx01+m7:xxxxxxx0
    ; m10= m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx+
    ;      m4:4567xxxx+m5:34567xxx+m6:234567xx+m7:1234567x
    ; and [upper half]:
    ; m9 = m0:xxxxxxx0+m1:xxxxxx01+m2:xxxxx012+m3:xxxx0123+
    ;      m4:xxx01234+m5:xx012345+m6:x0123456+m7:01234567
    ; m10= m0:1234567x+m1:234567xx+m2:34567xxx+m3:4567xxxx+
    ;      m4:567xxxxx+m5:67xxxxxx+m6:7xxxxxxx
    ; and then shuffle m11 [shufw_6543210x], unpcklwd, pmaddwd, pmulld, paddd

    pslldq        m9, m1, 2
    psrldq        m10, m1, 14
    pslldq        m11, m2, 4
    psrldq        m12, m2, 12
    pslldq        m13, m3, 6
    psrldq        m14, m3, 10
    paddw         m9, m11
    paddw         m10, m12
    paddw         m9, m13
    paddw         m10, m14
    pslldq        m11, m4, 8
    psrldq        m12, m4, 8
    pslldq        m13, m5, 10
    psrldq        m14, m5, 6
    paddw         m9, m11
    paddw         m10, m12
    paddw         m9, m13
    paddw         m10, m14 ; partial_sum_diag[0/1][8-14,zero]
    vbroadcasti128 m14, [shufw_6543210x]
    vbroadcasti128 m13, [div_table+16]
    vbroadcasti128 m12, [div_table+0]
    paddw         m9, m0   ; partial_sum_diag[0/1][0-7]
    pshufb        m10, m14
    punpckhwd     m11, m9, m10
    punpcklwd     m9, m10
    pmaddwd       m11, m11
    pmaddwd       m9, m9
    pmulld        m11, m13
    pmulld        m9, m12
    paddd         m9, m11  ; cost0[a-d] | cost4[a-d]

    ; merge horizontally and vertically for partial_sum_alt[0-3]
    paddw         m10, m0, m1
    paddw         m11, m2, m3
    paddw         m12, m4, m5
    paddw         m13, m6, m7
    phaddw        m0, m4
    phaddw        m1, m5
    phaddw        m2, m6
    phaddw        m3, m7

    ; create aggregates [lower half]:
    ; m4 = m10:01234567+m11:x0123456+m12:xx012345+m13:xxx01234
    ; m11= m11:7xxxxxxx+m12:67xxxxxx+m13:567xxxxx
    ; and [upper half]:
    ; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567
    ; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx
    ; and then pshuflw m11 3012, unpcklwd, pmaddwd, pmulld, paddd

    pslldq        m4, m11, 2
    psrldq        m11, 14
    pslldq        m5, m12, 4
    psrldq        m12, 12
    pslldq        m6, m13, 6
    psrldq        m13, 10
    paddw         m4, m10
    paddw         m11, m12
    vpbroadcastd  m12, [div_table+44]
    paddw         m5, m6
    paddw         m11, m13 ; partial_sum_alt[3/2] right
    vbroadcasti128 m13, [div_table+32]
    paddw         m4, m5   ; partial_sum_alt[3/2] left
    pshuflw       m5, m11, q3012
    punpckhwd     m6, m11, m4
    punpcklwd     m4, m5
    pmaddwd       m6, m6
    pmaddwd       m4, m4
    pmulld        m6, m12
    pmulld        m4, m13
    paddd         m4, m6   ; cost7[a-d] | cost5[a-d]

    ; create aggregates [lower half]:
    ; m5 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234
    ; m1 = m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx
    ; and [upper half]:
    ; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567
    ; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx
    ; and then pshuflw m1 3012, unpcklwd, pmaddwd, pmulld, paddd

    pslldq        m5, m1, 2
    psrldq        m1, 14
    pslldq        m6, m2, 4
    psrldq        m2, 12
    pslldq        m7, m3, 6
    psrldq        m3, 10
    paddw         m5, m0
    paddw         m1, m2
    paddw         m6, m7
    paddw         m1, m3 ; partial_sum_alt[0/1] right
    paddw         m5, m6 ; partial_sum_alt[0/1] left
    pshuflw       m0, m1, q3012
    punpckhwd     m1, m5
    punpcklwd     m5, m0
    pmaddwd       m1, m1
    pmaddwd       m5, m5
    pmulld        m1, m12
    pmulld        m5, m13
    paddd         m5, m1 ; cost1[a-d] | cost3[a-d]

    mova          xm0, [pd_47130256+ 16]
    mova          m1, [pd_47130256]
    phaddd        m9, m8
    phaddd        m5, m4
    phaddd        m9, m5
    vpermd        m0, m9 ; cost[0-3]
    vpermd        m1, m9 ; cost[4-7] | cost[0-3]

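; Finding the argmax without a horizontal scan: reduce the 8 costs to
; their maximum, subtract it so only the winning lane becomes zero (all
; others go negative), pack to signed words, and let phminposuw report
; the index of the minimum unsigned word - which is the winning
; direction, since the negative lanes turn into large unsigned values.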
    ; now find the best cost
    pmaxsd        xm2, xm0, xm1
    pshufd        xm3, xm2, q1032
    pmaxsd        xm2, xm3
    pshufd        xm3, xm2, q2301
    pmaxsd        xm2, xm3 ; best cost

    ; find the idx using minpos
    ; make everything other than the best cost negative via subtraction
    ; find the min of unsigned 16-bit ints to sort out the negative values
    psubd         xm4, xm1, xm2
    psubd         xm3, xm0, xm2
    packssdw      xm3, xm4
    phminposuw    xm3, xm3

    ; convert idx to 32-bits
    psrld         xm3, 16
    movd          eax, xm3

    ; get idx^4 complement
    vpermd        m3, m1
    psubd         xm2, xm3
    psrld         xm2, 10
    movd          [varq], xm2
    RET

%endif ; ARCH_X86_64