;******************************************************************************
;* VP9 Intra prediction SIMD optimizations
;*
;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
;* Copyright (c) 2015 Henrik Gramner <henrik gramner com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

; dword rounding biases used before the >>2 / >>3 / >>4 shifts in the
; various DC-averaging functions below
pd_2: times 8 dd 2
pd_4: times 8 dd 4
pd_8: times 8 dd 8

; pshufb control: shift one word left, duplicating the last word
; (abcdefgh -> bcdefghh); see SHIFT_RIGHT/SHIFT_RIGHTx2
pb_2to15_14_15:   db  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 14, 15
pb_4_5_8to13_8x0: db  4,  5,  8,  9, 10, 11, 12, 13,  0,  0,  0,  0,  0,  0,  0,  0
pb_0to7_67x4:     db  0,  1,  2,  3,  4,  5,  6,  7,  6,  7,  6,  7,  6,  7,  6,  7

cextern pw_1
cextern pw_1023
cextern pw_4095
cextern pd_16
cextern pd_32
cextern pd_65535

; FIXME most top-only functions (ddl, vl, v, dc_top) can be modified to take
; only 3 registers on x86-32, which would make it one cycle faster, but that
; would make the code quite a bit uglier...
SECTION .text

; Spill register m%1 out of the way: on x86-64 swap it into high register
; m%2 (no memory traffic); on x86-32 store it to the stack slot %3.
; Optional 4th arg names the spilled value as reg_%4 for later operand use.
%macro SCRATCH 3-4
%if ARCH_X86_64
    SWAP                %1, %2
%if %0 == 4
%define reg_%4 m%2
%endif
%else
    mova              [%3], m%1
%if %0 == 4
%define reg_%4 [%3]
%endif
%endif
%endmacro

; Inverse of SCRATCH: bring the value back into m%1.
%macro UNSCRATCH 3-4
%if ARCH_X86_64
    SWAP                %1, %2
%else
    mova               m%1, [%3]
%endif
%if %0 == 4
%undef reg_%4
%endif
%endmacro

; Load [%2] into m%1 on x86-64 (plenty of registers); on x86-32 leave it
; in memory. Either way reg_%3 can be used as an operand afterwards.
%macro PRELOAD 2-3
%if ARCH_X86_64
    mova               m%1, [%2]
%if %0 == 3
%define reg_%3 m%1
%endif
%elif %0 == 3
%define reg_%3 [%2]
%endif
%endmacro

; vertical prediction: every row is a copy of the "above" edge
INIT_MMX mmx
cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    RET

INIT_XMM sse
cglobal vp9_ipred_v_8x8_16, 2, 4, 1, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    RET

INIT_XMM sse
cglobal vp9_ipred_v_16x16_16, 2, 4, 2, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq]
    mova                    m1, [aq+mmsize]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 4
.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m1
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m1
    mova   [dstq+strideq*2+ 0], m0
    mova   [dstq+strideq*2+16], m1
    mova   [dstq+stride3q + 0], m0
    mova   [dstq+stride3q +16], m1
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET

INIT_XMM sse
cglobal vp9_ipred_v_32x32_16, 2, 4, 4, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq+mmsize*0]
    mova                    m1, [aq+mmsize*1]
    mova                    m2, [aq+mmsize*2]
    mova                    m3, [aq+mmsize*3]
    DEFINE_ARGS dst, stride, cnt
    mov                   cntd, 16
.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m1
    mova   [dstq+strideq*0+32], m2
    mova   [dstq+strideq*0+48], m3
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m1
    mova   [dstq+strideq*1+32], m2
    mova   [dstq+strideq*1+48], m3
    lea                   dstq, [dstq+strideq*2]
    dec                   cntd
    jg .loop
    RET

; horizontal prediction: each row is filled with one "left" edge pixel
INIT_MMX mmxext
cglobal vp9_ipred_h_4x4_16, 3, 3, 4, dst, stride, l, a
    mova                    m3, [lq]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pshufw                  m0, m3, q3333
    pshufw                  m1, m3, q2222
    pshufw                  m2, m3, q1111
    pshufw                  m3, m3, q0000
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m1
    mova      [dstq+strideq*2], m2
    mova      [dstq+stride3q ], m3
    RET

INIT_XMM sse2
cglobal vp9_ipred_h_8x8_16, 3, 3, 4, dst, stride, l, a
    mova                    m2, [lq]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    punpckhwd               m3, m2, m2
    pshufd                  m0, m3, q3333
    pshufd                  m1, m3, q2222
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m1
    pshufd                  m0, m3, q1111
    pshufd                  m1, m3, q0000
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m1
    lea                   dstq, [dstq+strideq*4]
    punpcklwd               m2, m2
    pshufd                  m0, m2, q3333
    pshufd                  m1, m2, q2222
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m1
    pshufd                  m0, m2, q1111
    pshufd                  m1, m2, q0000
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m1
    RET

INIT_XMM sse2
cglobal vp9_ipred_h_16x16_16, 3, 5, 4, dst, stride, l, stride3, cnt
    mov                   cntd, 3
    lea               stride3q, [strideq*3]
.loop:
    movh                    m3, [lq+cntq*8]
    punpcklwd               m3, m3
    pshufd                  m0, m3, q3333
    pshufd                  m1, m3, q2222
    pshufd                  m2, m3, q1111
    pshufd                  m3, m3, q0000
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*1+ 0], m1
    mova   [dstq+strideq*1+16], m1
    mova   [dstq+strideq*2+ 0], m2
    mova   [dstq+strideq*2+16], m2
    mova   [dstq+stride3q + 0], m3
    mova   [dstq+stride3q +16], m3
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jge .loop
    RET

INIT_XMM sse2
cglobal vp9_ipred_h_32x32_16, 3, 5, 4, dst, stride, l, stride3, cnt
    mov                   cntd, 7
    lea               stride3q, [strideq*3]
.loop:
    movh                    m3, [lq+cntq*8]
    punpcklwd               m3, m3
    pshufd                  m0, m3, q3333
    pshufd                  m1, m3, q2222
    pshufd                  m2, m3, q1111
    pshufd                  m3, m3, q0000
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*0+32], m0
    mova   [dstq+strideq*0+48], m0
    mova   [dstq+strideq*1+ 0], m1
    mova   [dstq+strideq*1+16], m1
    mova   [dstq+strideq*1+32], m1
    mova   [dstq+strideq*1+48], m1
    mova   [dstq+strideq*2+ 0], m2
    mova   [dstq+strideq*2+16], m2
    mova   [dstq+strideq*2+32], m2
    mova   [dstq+strideq*2+48], m2
    mova   [dstq+stride3q + 0], m3
    mova   [dstq+stride3q +16], m3
    mova   [dstq+stride3q +32], m3
    mova   [dstq+stride3q +48], m3
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jge .loop
    RET

; dc prediction: whole block = rounded average of left+above edge pixels
INIT_MMX mmxext
cglobal vp9_ipred_dc_4x4_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [lq]
    paddw                   m0, [aq]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pmaddwd                 m0, [pw_1]
    pshufw                  m1, m0, q3232
    paddd                   m0, [pd_4]
    paddd                   m0, m1
    psrad                   m0, 3
    pshufw                  m0, m0, q0000
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    RET

INIT_XMM sse2
cglobal vp9_ipred_dc_8x8_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [lq]
    paddw                   m0, [aq]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pmaddwd                 m0, [pw_1]
    pshufd                  m1, m0, q3232
    paddd                   m0, m1
    pshufd                  m1, m0, q1111
    paddd                   m0, [pd_8]
    paddd                   m0, m1
    psrad                   m0, 4
    pshuflw                 m0, m0, q0000
    punpcklqdq              m0, m0
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    RET

INIT_XMM sse2
cglobal vp9_ipred_dc_16x16_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [lq]
    paddw                   m0, [lq+mmsize]
    paddw                   m0, [aq]
    paddw                   m0, [aq+mmsize]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 4
    pmaddwd                 m0, [pw_1]
    pshufd                  m1, m0, q3232
    paddd                   m0, m1
    pshufd                  m1, m0, q1111
    paddd                   m0, [pd_16]
    paddd                   m0, m1
    psrad                   m0, 5
    pshuflw                 m0, m0, q0000
    punpcklqdq              m0, m0
.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m0
    mova   [dstq+strideq*2+ 0], m0
    mova   [dstq+strideq*2+16], m0
    mova   [dstq+stride3q + 0], m0
    mova   [dstq+stride3q +16], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET

INIT_XMM sse2
cglobal vp9_ipred_dc_32x32_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [lq+mmsize*0]
    paddw                   m0, [lq+mmsize*1]
    paddw                   m0, [lq+mmsize*2]
    paddw                   m0, [lq+mmsize*3]
    paddw                   m0, [aq+mmsize*0]
    paddw                   m0, [aq+mmsize*1]
    paddw                   m0, [aq+mmsize*2]
    paddw                   m0, [aq+mmsize*3]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 16
    pmaddwd                 m0, [pw_1]
    pshufd                  m1, m0, q3232
    paddd                   m0, m1
    pshufd                  m1, m0, q1111
    paddd                   m0, [pd_32]
    paddd                   m0, m1
    psrad                   m0, 6
    pshuflw                 m0, m0, q0000
    punpcklqdq              m0, m0
.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*0+32], m0
    mova   [dstq+strideq*0+48], m0
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m0
    mova   [dstq+strideq*1+32], m0
    mova   [dstq+strideq*1+48], m0
    lea                   dstq, [dstq+strideq*2]
    dec                   cntd
    jg .loop
    RET

; dc_top / dc_left prediction: same as dc but averaging only one edge;
; %1 = function name suffix (top/left), %2 = edge pointer register (aq/lq)
%macro DC_1D_FNS 2
INIT_MMX mmxext
cglobal vp9_ipred_dc_%1_4x4_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [%2]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pmaddwd                 m0, [pw_1]
    pshufw                  m1, m0, q3232
    paddd                   m0, [pd_2]
    paddd                   m0, m1
    psrad                   m0, 2
    pshufw                  m0, m0, q0000
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    RET

INIT_XMM sse2
cglobal vp9_ipred_dc_%1_8x8_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [%2]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pmaddwd                 m0, [pw_1]
    pshufd                  m1, m0, q3232
    paddd                   m0, m1
    pshufd                  m1, m0, q1111
    paddd                   m0, [pd_4]
    paddd                   m0, m1
    psrad                   m0, 3
    pshuflw                 m0, m0, q0000
    punpcklqdq              m0, m0
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    RET

INIT_XMM sse2
cglobal vp9_ipred_dc_%1_16x16_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [%2]
    paddw                   m0, [%2+mmsize]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 4
    pmaddwd                 m0, [pw_1]
    pshufd                  m1, m0, q3232
    paddd                   m0, m1
    pshufd                  m1, m0, q1111
    paddd                   m0, [pd_8]
    paddd                   m0, m1
    psrad                   m0, 4
    pshuflw                 m0, m0, q0000
    punpcklqdq              m0, m0
.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m0
    mova   [dstq+strideq*2+ 0], m0
    mova   [dstq+strideq*2+16], m0
    mova   [dstq+stride3q + 0], m0
    mova   [dstq+stride3q +16], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET

INIT_XMM sse2
cglobal vp9_ipred_dc_%1_32x32_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [%2+mmsize*0]
    paddw                   m0, [%2+mmsize*1]
    paddw                   m0, [%2+mmsize*2]
    paddw                   m0, [%2+mmsize*3]
    DEFINE_ARGS dst, stride, cnt
    mov                   cntd, 16
    pmaddwd                 m0, [pw_1]
    pshufd                  m1, m0, q3232
    paddd                   m0, m1
    pshufd                  m1, m0, q1111
    paddd                   m0, [pd_16]
    paddd                   m0, m1
    psrad                   m0, 5
    pshuflw                 m0, m0, q0000
    punpcklqdq              m0, m0
.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*0+32], m0
    mova   [dstq+strideq*0+48], m0
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m0
    mova   [dstq+strideq*1+32], m0
    mova   [dstq+strideq*1+48], m0
    lea                   dstq, [dstq+strideq*2]
    dec                   cntd
    jg .loop
    RET
%endmacro

DC_1D_FNS top,  aq
DC_1D_FNS left, lq

; tm (TrueMotion) prediction: pixel = left + (above - topleft), clamped to
; [0, pixel_max]; the _10 bodies are shared by the _12 entry points, which
; only load a different clamp constant (pw_1023 vs pw_4095)
INIT_MMX mmxext
cglobal vp9_ipred_tm_4x4_10, 4, 4, 6, dst, stride, l, a
    mova                    m5, [pw_1023]
.body:
    mova                    m4, [aq]
    mova                    m3, [lq]
    movd                    m0, [aq-4]
    pshufw                  m0, m0, q1111
    psubw                   m4, m0
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pshufw                  m0, m3, q3333
    pshufw                  m1, m3, q2222
    pshufw                  m2, m3, q1111
    pshufw                  m3, m3, q0000
    paddw                   m0, m4
    paddw                   m1, m4
    paddw                   m2, m4
    paddw                   m3, m4
    pxor                    m4, m4
    pmaxsw                  m0, m4
    pmaxsw                  m1, m4
    pmaxsw                  m2, m4
    pmaxsw                  m3, m4
    pminsw                  m0, m5
    pminsw                  m1, m5
    pminsw                  m2, m5
    pminsw                  m3, m5
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m1
    mova      [dstq+strideq*2], m2
    mova      [dstq+stride3q ], m3
    RET

cglobal vp9_ipred_tm_4x4_12, 4, 4, 6, dst, stride, l, a
    mova                    m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_4x4_10 %+ SUFFIX).body

INIT_XMM sse2
cglobal vp9_ipred_tm_8x8_10, 4, 5, 7, dst, stride, l, a
    mova                    m4, [pw_1023]
.body:
    pxor                    m6, m6
    mova                    m5, [aq]
    movd                    m0, [aq-4]
    pshuflw                 m0, m0, q1111
    punpcklqdq              m0, m0
    psubw                   m5, m0
    DEFINE_ARGS dst, stride, l, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 1
.loop:
    movh                    m3, [lq+cntq*8]
    punpcklwd               m3, m3
    pshufd                  m0, m3, q3333
    pshufd                  m1, m3, q2222
    pshufd                  m2, m3, q1111
    pshufd                  m3, m3, q0000
    paddw                   m0, m5
    paddw                   m1, m5
    paddw                   m2, m5
    paddw                   m3, m5
    pmaxsw                  m0, m6
    pmaxsw                  m1, m6
    pmaxsw                  m2, m6
    pmaxsw                  m3, m6
    pminsw                  m0, m4
    pminsw                  m1, m4
    pminsw                  m2, m4
    pminsw                  m3, m4
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m1
    mova      [dstq+strideq*2], m2
    mova      [dstq+stride3q ], m3
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jge .loop
    RET

cglobal vp9_ipred_tm_8x8_12, 4, 5, 7, dst, stride, l, a
    mova                    m4, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_8x8_10 %+ SUFFIX).body

INIT_XMM sse2
cglobal vp9_ipred_tm_16x16_10, 4, 4, 8, dst, stride, l, a
    mova                    m7, [pw_1023]
.body:
    pxor                    m6, m6
    mova                    m4, [aq]
    mova                    m5, [aq+mmsize]
    movd                    m0, [aq-4]
    pshuflw                 m0, m0, q1111
    punpcklqdq              m0, m0
    psubw                   m4, m0
    psubw                   m5, m0
    DEFINE_ARGS dst, stride, l, cnt
    mov                   cntd, 7
.loop:
    movd                    m3, [lq+cntq*4]
    punpcklwd               m3, m3
    pshufd                  m2, m3, q1111
    pshufd                  m3, m3, q0000
    paddw                   m0, m2, m4
    paddw                   m2, m5
    paddw                   m1, m3, m4
    paddw                   m3, m5
    pmaxsw                  m0, m6
    pmaxsw                  m2, m6
    pmaxsw                  m1, m6
    pmaxsw                  m3, m6
    pminsw                  m0, m7
    pminsw                  m2, m7
    pminsw                  m1, m7
    pminsw                  m3, m7
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m2
    mova   [dstq+strideq*1+ 0], m1
    mova   [dstq+strideq*1+16], m3
    lea                   dstq, [dstq+strideq*2]
    dec                   cntd
    jge .loop
    RET

cglobal vp9_ipred_tm_16x16_12, 4, 4, 8, dst, stride, l, a
    mova                    m7, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_16x16_10 %+ SUFFIX).body

INIT_XMM sse2
cglobal vp9_ipred_tm_32x32_10, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a
    mova                    m0, [pw_1023]
.body:
    pxor                    m1, m1
%if ARCH_X86_64
    SWAP                     0, 8
    SWAP                     1, 9
%define reg_min m9
%define reg_max m8
%else
    mova             [rsp+ 0], m0
    mova             [rsp+16], m1
%define reg_min [rsp+16]
%define reg_max [rsp+ 0]
%endif

    mova                    m4, [aq+mmsize*0]
    mova                    m5, [aq+mmsize*1]
    mova                    m6, [aq+mmsize*2]
    mova                    m7, [aq+mmsize*3]
    movd                    m0, [aq-4]
    pshuflw                 m0, m0, q1111
    punpcklqdq              m0, m0
    psubw                   m4, m0
    psubw                   m5, m0
    psubw                   m6, m0
    psubw                   m7, m0
    DEFINE_ARGS dst, stride, l, cnt
    mov                   cntd, 31
.loop:
    pinsrw                  m3, [lq+cntq*2], 0
    punpcklwd               m3, m3
    pshufd                  m3, m3, q0000
    paddw                   m0, m3, m4
    paddw                   m1, m3, m5
    paddw                   m2, m3, m6
    paddw                   m3, m7
    pmaxsw                  m0, reg_min
    pmaxsw                  m1, reg_min
    pmaxsw                  m2, reg_min
    pmaxsw                  m3, reg_min
    pminsw                  m0, reg_max
    pminsw                  m1, reg_max
    pminsw                  m2, reg_max
    pminsw                  m3, reg_max
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m1
    mova   [dstq+strideq*0+32], m2
    mova   [dstq+strideq*0+48], m3
    add                   dstq, strideq
    dec                   cntd
    jge .loop
    RET

cglobal vp9_ipred_tm_32x32_12, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a
    mova                    m0, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body

; Directional intra prediction functions
;
; in the functions below, 'abcdefgh' refers to above data (sometimes simply
; abbreviated as a[N-M]). 'stuvwxyz' refers to left data (sometimes simply
; abbreviated as l[N-M]). * is top-left data. ABCDEFG or A[N-M] is filtered
; above data, STUVWXYZ or L[N-M] is filtered left data, and # is filtered
; top-left data.

; left=(left+2*center+right+2)>>2
%macro LOWPASS 3 ; left [dst], center, right
    paddw                  m%1, m%3
    psraw                  m%1, 1
    pavgw                  m%1, m%2
%endmacro

; abcdefgh (src) -> bcdefghh (dst)
; dst/src can be the same register
%macro SHIFT_RIGHT 2-3 [pb_2to15_14_15] ; dst, src, [ssse3_shift_reg]
%if cpuflag(ssse3)
    pshufb                  %1, %2, %3                 ; abcdefgh -> bcdefghh
%else
    psrldq                  %1, %2, 2                  ; abcdefgh -> bcdefgh.
    pshufhw                 %1, %1, q2210              ; bcdefgh. -> bcdefghh
%endif
%endmacro

; abcdefgh (src) -> bcdefghh (dst1) and cdefghhh (dst2)
%macro SHIFT_RIGHTx2 3-4 [pb_2to15_14_15] ; dst1, dst2, src, [ssse3_shift_reg]
%if cpuflag(ssse3)
    pshufb                  %1, %3, %4                 ; abcdefgh -> bcdefghh
    pshufb                  %2, %1, %4                 ; bcdefghh -> cdefghhh
%else
    psrldq                  %1, %3, 2                  ; abcdefgh -> bcdefgh.
    psrldq                  %2, %3, 4                  ; abcdefgh -> cdefgh..
    pshufhw                 %1, %1, q2210              ; bcdefgh. -> bcdefghh
    pshufhw                 %2, %2, q1110              ; cdefgh.. -> cdefghhh
%endif
%endmacro

; diagonal down-left prediction, instantiated for sse2/ssse3/avx below
%macro DL_FUNCS 0
cglobal vp9_ipred_dl_4x4_16, 2, 4, 3, dst, stride, l, a
    movifnidn               aq, amp
    movu                    m1, [aq]                   ; abcdefgh
    pshufhw                 m0, m1, q3310              ; abcdefhh
    SHIFT_RIGHT             m1, m1                     ; bcdefghh
    psrldq                  m2, m1, 2                  ; cdefghh.
    LOWPASS                  0, 1, 2                   ; BCDEFGh.
    pshufd                  m1, m0, q3321              ; DEFGh...
    movh      [dstq+strideq*0], m0
    movh      [dstq+strideq*2], m1
    add                   dstq, strideq
    psrldq                  m0, 2                      ; CDEFGh..
    psrldq                  m1, 2                      ; EFGh....
    movh      [dstq+strideq*0], m0
    movh      [dstq+strideq*2], m1
    RET

cglobal vp9_ipred_dl_8x8_16, 2, 4, 5, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq]                   ; abcdefgh
%if cpuflag(ssse3)
    mova                    m4, [pb_2to15_14_15]
%endif
    SHIFT_RIGHTx2           m1, m2, m0, m4             ; bcdefghh/cdefghhh
    LOWPASS                  0, 1, 2                   ; BCDEFGHh
    shufps                  m1, m0, m2, q3332          ; FGHhhhhh
    shufps                  m3, m0, m1, q2121          ; DEFGHhhh
    DEFINE_ARGS dst, stride, stride5
    lea               stride5q, [strideq*5]

    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*4], m1
    SHIFT_RIGHT             m0, m0, m4                 ; CDEFGHhh
    pshuflw                 m1, m1, q3321              ; GHhhhhhh
    pshufd                  m2, m0, q3321              ; EFGHhhhh
    mova      [dstq+strideq*1], m0
    mova      [dstq+stride5q ], m1
    lea                   dstq, [dstq+strideq*2]
    pshuflw                 m1, m1, q3321              ; Hhhhhhhh
    mova      [dstq+strideq*0], m3
    mova      [dstq+strideq*4], m1
    pshuflw                 m1, m1, q3321              ; hhhhhhhh
    mova      [dstq+strideq*1], m2
    mova      [dstq+stride5q ], m1
    RET

cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq]                   ; abcdefgh
    mova                    m3, [aq+mmsize]            ; ijklmnop
    PALIGNR                 m1, m3, m0, 2, m4          ; bcdefghi
    PALIGNR                 m2, m3, m0, 4, m4          ; cdefghij
    LOWPASS                  0, 1, 2                   ; BCDEFGHI
%if cpuflag(ssse3)
    mova                    m4, [pb_2to15_14_15]
%endif
    SHIFT_RIGHTx2           m2, m1, m3, m4             ; jklmnopp/klmnoppp
    LOWPASS                  1, 2, 3                   ; JKLMNOPp
    pshufd                  m2, m2, q3333              ; pppppppp
    DEFINE_ARGS dst, stride, cnt
    mov                   cntd, 8

.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m1
    mova   [dstq+strideq*8+ 0], m1
    mova   [dstq+strideq*8+16], m2
    add                   dstq, strideq
%if cpuflag(avx)
    vpalignr                m0, m1, m0, 2
%else
    PALIGNR                 m3, m1, m0, 2, m4
    mova                    m0, m3
%endif
    SHIFT_RIGHT             m1, m1, m4
    dec                   cntd
    jg .loop
    RET

cglobal vp9_ipred_dl_32x32_16, 2, 5, 7, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq+mmsize*0]          ; abcdefgh
    mova                    m1, [aq+mmsize*1]          ; ijklmnop
    mova                    m2, [aq+mmsize*2]          ; qrstuvwx
    mova                    m3, [aq+mmsize*3]          ; yz012345
    PALIGNR                 m4, m1, m0, 2, m6
    PALIGNR                 m5, m1, m0, 4, m6
    LOWPASS                  0, 4, 5                   ; BCDEFGHI
    PALIGNR                 m4, m2, m1, 2, m6
    PALIGNR                 m5, m2, m1, 4, m6
    LOWPASS                  1, 4, 5                   ; JKLMNOPQ
    PALIGNR                 m4, m3, m2, 2, m6
    PALIGNR                 m5, m3, m2, 4, m6
    LOWPASS                  2, 4, 5                   ; RSTUVWXY
%if cpuflag(ssse3)
    mova                    m6, [pb_2to15_14_15]
%endif
    SHIFT_RIGHTx2           m4, m5, m3, m6
    LOWPASS                  3, 4, 5                   ; Z0123455
    pshufd                  m4, m4, q3333              ; 55555555
    DEFINE_ARGS dst, stride, stride8, stride24, cnt
    mov                   cntd, 8
    lea               stride8q, [strideq*8]
    lea              stride24q, [stride8q*3]

.loop:
    mova  [dstq+stride8q*0+ 0], m0
    mova  [dstq+stride8q*0+16], m1
    mova  [dstq+stride8q*0+32], m2
    mova  [dstq+stride8q*0+48], m3
    mova  [dstq+stride8q*1+ 0], m1
    mova  [dstq+stride8q*1+16], m2
    mova  [dstq+stride8q*1+32], m3
    mova  [dstq+stride8q*1+48], m4
    mova  [dstq+stride8q*2+ 0], m2
    mova  [dstq+stride8q*2+16], m3
    mova  [dstq+stride8q*2+32], m4
    mova  [dstq+stride8q*2+48], m4
    mova  [dstq+stride24q + 0], m3
    mova  [dstq+stride24q +16], m4
    mova  [dstq+stride24q +32], m4
    mova  [dstq+stride24q +48], m4
    add                   dstq, strideq
%if cpuflag(avx)
    vpalignr                m0, m1, m0, 2
    vpalignr                m1, m2, m1, 2
    vpalignr                m2, m3, m2, 2
%else
    PALIGNR                 m5, m1, m0, 2, m6
    mova                    m0, m5
    PALIGNR                 m5, m2, m1, 2, m6
    mova                    m1, m5
    PALIGNR                 m5, m3, m2, 2, m6
    mova                    m2, m5
%endif
    SHIFT_RIGHT             m3, m3, m6
    dec                   cntd
    jg .loop
    RET
%endmacro

INIT_XMM sse2
DL_FUNCS
INIT_XMM ssse3
DL_FUNCS
INIT_XMM avx
DL_FUNCS

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq]                   ; abcdefghijklmnop
    vpbroadcastw           xm1, [aq+30]                ; pppppppp
    vperm2i128              m2, m0, m1, q0201          ; ijklmnoppppppppp
    vpalignr                m3, m2, m0, 2              ; bcdefghijklmnopp
    vpalignr                m4, m2, m0, 4              ; cdefghijklmnoppp
    LOWPASS                  0, 3, 4                   ; BCDEFGHIJKLMNOPp
    vperm2i128              m2, m0, m1, q0201          ; JKLMNOPppppppppp
    DEFINE_ARGS dst, stride, stride3, cnt
    mov                   cntd, 2
    lea               stride3q, [strideq*3]

.loop:
    mova      [dstq+strideq*0], m0
    vpalignr                m3, m2, m0, 2
    vpalignr                m4, m2, m0, 4
    mova      [dstq+strideq*1], m3
    mova      [dstq+strideq*2], m4
    vpalignr                m3, m2, m0, 6
    vpalignr                m4, m2, m0, 8
    mova      [dstq+stride3q ], m3
    lea                   dstq, [dstq+strideq*4]
    mova      [dstq+strideq*0], m4
    vpalignr                m3, m2, m0, 10
    vpalignr                m4, m2, m0, 12
    mova      [dstq+strideq*1], m3
    mova      [dstq+strideq*2], m4
    vpalignr                m3, m2, m0, 14
    mova      [dstq+stride3q ], m3
    lea                   dstq, [dstq+strideq*4]
    mova                    m0, m2
    vperm2i128              m2, m2, m2, q0101          ; pppppppppppppppp
    dec                   cntd
    jg .loop
    RET

cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq+mmsize*0+ 0]       ; abcdefghijklmnop
    mova                    m1, [aq+mmsize*1+ 0]       ; qrstuvwxyz012345
    vpbroadcastw           xm4, [aq+mmsize*1+30]       ; 55555555
    vperm2i128              m5, m0, m1, q0201          ; ijklmnopqrstuvwx
    vpalignr                m2, m5, m0, 2              ; bcdefghijklmnopq
    vpalignr                m3, m5, m0, 4              ; cdefghijklmnopqr
    LOWPASS                  0, 2, 3                   ; BCDEFGHIJKLMNOPQ
    vperm2i128              m5, m1, m4, q0201          ; yz01234555555555
    vpalignr                m2, m5, m1, 2              ; rstuvwxyz0123455
    vpalignr                m3, m5, m1, 4              ; stuvwxyz01234555
    LOWPASS                  1, 2, 3                   ; RSTUVWXYZ......5
    vperm2i128              m2, m1, m4, q0201          ; Z......555555555
    vperm2i128              m5, m0, m1, q0201          ; JKLMNOPQRSTUVWXY
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 4

.loop:
    mova  [dstq+strideq*0 + 0], m0
    mova  [dstq+strideq*0 +32], m1
    vpalignr                m3, m5, m0, 2
    vpalignr                m4, m2, m1, 2
    mova  [dstq+strideq*1 + 0], m3
    mova  [dstq+strideq*1 +32], m4
    vpalignr                m3, m5, m0, 4
    vpalignr                m4, m2, m1, 4
    mova  [dstq+strideq*2 + 0], m3
    mova  [dstq+strideq*2 +32], m4
    vpalignr                m3, m5, m0, 6
    vpalignr                m4, m2, m1, 6
    mova  [dstq+stride3q*1+ 0], m3
    mova  [dstq+stride3q*1+32], m4
    lea                   dstq, [dstq+strideq*4]
    vpalignr                m3, m5, m0, 8
    vpalignr                m4, m2, m1, 8
    mova  [dstq+strideq*0 + 0], m3
    mova  [dstq+strideq*0 +32], m4
    vpalignr                m3, m5, m0, 10
    vpalignr                m4, m2, m1, 10
    mova  [dstq+strideq*1 + 0], m3
    mova  [dstq+strideq*1 +32], m4
    vpalignr                m3, m5, m0, 12
    vpalignr                m4, m2, m1, 12
    mova  [dstq+strideq*2 + 0], m3
    mova  [dstq+strideq*2 +32], m4
    vpalignr                m3, m5, m0, 14
    vpalignr                m4, m2, m1, 14
    mova  [dstq+stride3q  + 0], m3
    mova  [dstq+stride3q  +32], m4
    vpalignr                m3, m5, m0, 16
    vpalignr                m4, m2, m1, 16
    vperm2i128              m5, m3, m4, q0201
    vperm2i128              m2, m4, m4, q0101
    mova                    m0, m3
    mova                    m1, m4
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET
%endif

; diagonal down-right prediction, instantiated for sse2/ssse3/avx below
%macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function
cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a
    movh                    m0, [lq]                   ; wxyz....
    movhps                  m0, [aq-2]                 ; wxyz*abc
    movd                    m1, [aq+6]                 ; d.......
    PALIGNR                 m1, m0, 2, m2              ; xyz*abcd
    psrldq                  m2, m1, 2                  ; yz*abcd.
    LOWPASS                  0, 1, 2                   ; XYZ#ABC.
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]

    movh      [dstq+stride3q ], m0
    psrldq                  m0, 2                      ; YZ#ABC..
    movh      [dstq+strideq*2], m0
    psrldq                  m0, 2                      ; Z#ABC...
    movh      [dstq+strideq*1], m0
    psrldq                  m0, 2                      ; #ABC....
    movh      [dstq+strideq*0], m0
    RET

cglobal vp9_ipred_dr_8x8_16, 4, 4, 5, dst, stride, l, a
    mova                    m0, [lq]                   ; stuvwxyz
    movu                    m1, [aq-2]                 ; *abcdefg
    mova                    m2, [aq]                   ; abcdefgh
    psrldq                  m3, m2, 2                  ; bcdefgh.
    LOWPASS                  3, 2, 1                   ; ABCDEFG.
    PALIGNR                 m1, m0, 2, m4              ; tuvwxyz*
    PALIGNR                 m2, m1, 2, m4              ; uvwxyz*a
    LOWPASS                  2, 1, 0                   ; TUVWXYZ#
    DEFINE_ARGS dst, stride, dst4, stride3
    lea               stride3q, [strideq*3]
    lea                  dst4q, [dstq+strideq*4]

    movhps [dstq +stride3q +0], m2
    movh   [dstq +stride3q +8], m3
    mova   [dst4q+stride3q +0], m2
    PALIGNR                 m1, m3, m2, 2, m0
    psrldq                  m3, 2
    movhps [dstq +strideq*2+0], m1
    movh   [dstq +strideq*2+8], m3
    mova   [dst4q+strideq*2+0], m1
    PALIGNR                 m2, m3, m1, 2, m0
    psrldq                  m3, 2
    movhps [dstq +strideq*1+0], m2
    movh   [dstq +strideq*1+8], m3
    mova   [dst4q+strideq*1+0], m2
    PALIGNR                 m1, m3, m2, 2, m0
    psrldq                  m3, 2
    movhps [dstq +strideq*0+0], m1
    movh   [dstq +strideq*0+8], m3
    mova   [dst4q+strideq*0+0], m1
    RET

cglobal vp9_ipred_dr_16x16_16, 4, 4, 7, dst, stride, l, a
    mova                    m0, [lq]                   ; klmnopqr
    mova                    m1, [lq+mmsize]            ; stuvwxyz
    movu                    m2, [aq-2]                 ; *abcdefg
    movu                    m3, [aq+mmsize-2]          ; hijklmno
    mova                    m4, [aq]                   ; abcdefgh
    mova                    m5, [aq+mmsize]            ; ijklmnop
    psrldq                  m6, m5, 2                  ; jklmnop.
    LOWPASS                  6, 5, 3                   ; IJKLMNO.
    PALIGNR                 m5, m4, 2, m3              ; bcdefghi
    LOWPASS                  5, 4, 2                   ; ABCDEFGH
    PALIGNR                 m2, m1, 2, m3              ; tuvwxyz*
    PALIGNR                 m4, m2, 2, m3              ; uvwxyz*a
    LOWPASS                  4, 2, 1                   ; TUVWXYZ#
    PALIGNR                 m1, m0, 2, m3              ; lmnopqrs
    PALIGNR                 m2, m1, 2, m3              ; mnopqrst
    LOWPASS                  2, 1, 0                   ; LMNOPQRS
    DEFINE_ARGS dst, stride, dst8, cnt
    lea                  dst8q, [dstq+strideq*8]
    mov                   cntd, 8

.loop:
    sub                  dst8q, strideq
    mova  [dst8q+strideq*0+ 0], m4
    mova  [dst8q+strideq*0+16], m5
    mova  [dst8q+strideq*8+ 0], m2
    mova  [dst8q+strideq*8+16], m4
%if cpuflag(avx)
    vpalignr                m2, m4, m2, 2
    vpalignr                m4, m5, m4, 2
    vpalignr                m5, m6, m5, 2
%else
    PALIGNR                 m0, m4, m2, 2, m1
    mova                    m2, m0
    PALIGNR                 m0, m5, m4, 2, m1
    mova                    m4, m0
    PALIGNR                 m0, m6, m5, 2, m1
    mova                    m5, m0
%endif
    psrldq                  m6, 2
    dec                   cntd
    jg .loop
    RET

cglobal vp9_ipred_dr_32x32_16, 4, 5, 10 + notcpuflag(ssse3), \
                               %1 * ARCH_X86_32 * -mmsize, dst, stride, l, a
    mova                    m0, [aq+mmsize*3]          ; a[24-31]
    movu                    m1, [aq+mmsize*3-2]        ; a[23-30]
    psrldq                  m2, m0, 2                  ; a[25-31].
    LOWPASS                  2, 0, 1                   ; A[24-30].
    mova                    m1, [aq+mmsize*2]          ; a[16-23]
    movu                    m3, [aq+mmsize*2-2]        ; a[15-22]
    PALIGNR                 m0, m1, 2, m4              ; a[17-24]
    LOWPASS                  0, 1, 3                   ; A[16-23]
    mova                    m3, [aq+mmsize*1]          ; a[8-15]
    movu                    m4, [aq+mmsize*1-2]        ; a[7-14]
    PALIGNR                 m1, m3, 2, m5              ; a[9-16]
    LOWPASS                  1, 3, 4                   ; A[8-15]
    mova                    m4, [aq+mmsize*0]          ; a[0-7]
    movu                    m5, [aq+mmsize*0-2]        ; *a[0-6]
    PALIGNR                 m3, m4, 2, m6              ; a[1-8]
    LOWPASS                  3, 4, 5                   ; A[0-7]
    SCRATCH                  1, 8, rsp+0*mmsize
    SCRATCH                  3, 9, rsp+1*mmsize
%if notcpuflag(ssse3)
    SCRATCH                  0, 10, rsp+2*mmsize
%endif
    mova                    m6, [lq+mmsize*3]          ; l[24-31]
    PALIGNR                 m5, m6, 2, m0              ; l[25-31]*
    PALIGNR                 m4, m5, 2, m0              ; l[26-31]*a
    LOWPASS                  4, 5, 6                   ; L[25-31]#
    mova                    m7, [lq+mmsize*2]          ; l[16-23]
    PALIGNR                 m6, m7, 2, m0              ; l[17-24]
    PALIGNR                 m5, m6, 2, m0              ; l[18-25]
    LOWPASS                  5, 6, 7                   ; L[17-24]
    mova                    m1, [lq+mmsize*1]          ; l[8-15]
    PALIGNR                 m7, m1, 2, m0              ; l[9-16]
    PALIGNR                 m6, m7, 2, m0              ; l[10-17]
    LOWPASS                  6, 7, 1                   ; L[9-16]
    mova                    m3, [lq+mmsize*0]          ; l[0-7]
    PALIGNR                 m1, m3, 2, m0              ; l[1-8]
    PALIGNR                 m7, m1, 2, m0              ; l[2-9]
    LOWPASS                  7, 1, 3                   ; L[1-8]
%if cpuflag(ssse3)
%if cpuflag(avx)
    UNSCRATCH                1, 8, rsp+0*mmsize
%endif
    UNSCRATCH                3, 9, rsp+1*mmsize
%else
    UNSCRATCH                0, 10, rsp+2*mmsize
%endif
    DEFINE_ARGS dst8, stride, stride8, stride24, cnt
    lea               stride8q, [strideq*8]
    lea              stride24q, [stride8q*3]
    lea                  dst8q, [dst8q+strideq*8]
    mov                   cntd, 8

.loop:
    sub                  dst8q, strideq
%if notcpuflag(avx)
    UNSCRATCH                1, 8, rsp+0*mmsize
%if notcpuflag(ssse3)
    UNSCRATCH                3, 9, rsp+1*mmsize
%endif
%endif
    mova [dst8q+stride8q*0+ 0], m4
    mova [dst8q+stride8q*0+16], m3
    mova [dst8q+stride8q*0+32], m1
    mova [dst8q+stride8q*0+48], m0
    mova [dst8q+stride8q*1+ 0], m5
    mova [dst8q+stride8q*1+16], m4
    mova [dst8q+stride8q*1+32], m3
    mova [dst8q+stride8q*1+48], m1
    mova [dst8q+stride8q*2+ 0], m6
    mova [dst8q+stride8q*2+16], m5
    mova [dst8q+stride8q*2+32], m4
    mova [dst8q+stride8q*2+48], m3
    mova [dst8q+stride24q + 0], m7
    mova [dst8q+stride24q +16], m6
    mova [dst8q+stride24q +32], m5
    mova [dst8q+stride24q +48], m4
%if cpuflag(avx)
    vpalignr                m7, m6, m7, 2
    vpalignr                m6, m5, m6, 2
    vpalignr                m5, m4, m5, 2
    vpalignr                m4, m3, m4, 2
    vpalignr                m3, m1, m3, 2
    vpalignr                m1, m0, m1, 2
    vpalignr                m0, m2, m0, 2
%else
    SCRATCH                  2, 8, rsp+0*mmsize
%if notcpuflag(ssse3)
    SCRATCH                  0, 9, rsp+1*mmsize
%endif
    PALIGNR                 m2, m6, m7, 2, m0
    mova                    m7, m2
    PALIGNR                 m2, m5, m6, 2, m0
    mova                    m6, m2
    PALIGNR                 m2, m4, m5, 2, m0
    mova                    m5, m2
    PALIGNR                 m2, m3, m4, 2, m0
    mova                    m4, m2
    PALIGNR                 m2, m1, m3, 2, m0
    mova                    m3, m2
%if notcpuflag(ssse3)
    UNSCRATCH                0, 9, rsp+1*mmsize
    SCRATCH                  3, 9, rsp+1*mmsize
%endif
    PALIGNR                 m2, m0, m1, 2, m3
    mova                    m1, m2
    UNSCRATCH                2, 8, rsp+0*mmsize
    SCRATCH                  1, 8, rsp+0*mmsize
    PALIGNR                 m1, m2, m0, 2, m3
    mova                    m0, m1
%endif
    psrldq                  m2, 2
    dec                   cntd
    jg .loop
    RET
%endmacro

INIT_XMM sse2
DR_FUNCS 3
INIT_XMM ssse3
DR_FUNCS 2
INIT_XMM avx
DR_FUNCS 2

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
cglobal vp9_ipred_dr_16x16_16, 4, 5, 6, dst, stride, l, a
    mova                    m0, [lq]                   ; klmnopqrstuvwxyz
    movu                    m1, [aq-2]                 ; *abcdefghijklmno
    mova                    m2, [aq]                   ; abcdefghijklmnop
    vperm2i128              m4, m2, m2, q2001          ; ijklmnop........
    vpalignr                m5, m4, m2, 2              ; bcdefghijklmnop.
    vperm2i128              m3, m0, m1, q0201          ; stuvwxyz*abcdefg
    LOWPASS                  1, 2, 5                   ; ABCDEFGHIJKLMNO.
    vpalignr                m4, m3, m0, 2              ; lmnopqrstuvwxyz*
    vpalignr                m5, m3, m0, 4              ; mnopqrstuvwxyz*a
    LOWPASS                  0, 4, 5                   ; LMNOPQRSTUVWXYZ#
    vperm2i128              m5, m0, m1, q0201          ; TUVWXYZ#ABCDEFGH
    DEFINE_ARGS dst, stride, stride3, stride5, dst3
    lea                  dst3q, [dstq+strideq*4]
    lea               stride3q, [strideq*3]
    lea               stride5q, [stride3q+strideq*2]

    vpalignr                m3, m5, m0, 2
    vpalignr                m4, m1, m5, 2
    mova   [dst3q+stride5q*2], m3                      ; 14
    mova   [ dstq+stride3q*2], m4                      ; 6
    vpalignr                m3, m5, m0, 4
    vpalignr                m4, m1, m5, 4
    sub                  dst3q, strideq
    mova   [dst3q+stride5q*2], m3                      ; 13
    mova   [dst3q+strideq*2 ], m4                      ; 5
    mova   [dst3q+stride3q*4], m0                      ; 15
    vpalignr                m3, m5, m0, 6
    vpalignr                m4, m1, m5, 6
    mova   [dstq+stride3q*4 ], m3                      ; 12
    mova   [dst3q+strideq*1 ], m4                      ; 4
    vpalignr                m3, m5, m0, 8
    vpalignr                m4, m1, m5, 8
    mova   [dst3q+strideq*8 ], m3                      ; 11
    mova   [dst3q+strideq*0 ], m4                      ; 3
    vpalignr                m3, m5, m0, 10
    vpalignr                m4, m1, m5, 10
    mova   [dstq+stride5q*2 ], m3                      ; 10
    mova   [dstq+strideq*2  ], m4                      ; 2
    vpalignr                m3, m5, m0, 12
    vpalignr                m4, m1, m5, 12
    mova   [dst3q+stride3q*2], m3                      ; 9
    mova   [dstq+strideq*1  ], m4                      ; 1
    vpalignr                m3, m5, m0, 14
    vpalignr                m4, m1, m5, 14
    mova   [dstq+strideq*8  ], m3                      ; 8
    mova   [dstq+strideq*0  ], m4                      ; 0
    mova   [dst3q+strideq*4 ], m5                      ; 7
    RET

%if ARCH_X86_64
cglobal vp9_ipred_dr_32x32_16, 4, 7, 10, dst, stride, l, a
    mova                    m0, [lq+mmsize*0+0]        ; l[0-15]
    mova                    m1, [lq+mmsize*1+0]        ; l[16-31]
    movu                    m2, [aq+mmsize*0-2]        ; *abcdefghijklmno
    mova                    m3, [aq+mmsize*0+0]        ; abcdefghijklmnop
    mova                    m4, [aq+mmsize*1+0]        ; qrstuvwxyz012345
    vperm2i128              m5, m0, m1, q0201          ; lmnopqrstuvwxyz0
    vpalignr                m6, m5, m0, 2              ; mnopqrstuvwxyz01
    vpalignr                m7, m5, m0, 4              ; nopqrstuvwxyz012
    LOWPASS                  0, 6, 7                   ; L[0-15]
    vperm2i128              m7, m1, m2, q0201          ; stuvwxyz*abcdefg
    vpalignr                m5, m7, m1, 2              ; lmnopqrstuvwxyz*
    vpalignr                m6, m7, m1, 4              ; mnopqrstuvwxyz*a
    LOWPASS                  1, 5, 6                   ; L[16-31]#
    vperm2i128              m5, m3, m4, q0201          ; ijklmnopqrstuvwx
    vpalignr                m6, m5, m3, 2              ; bcdefghijklmnopq
    LOWPASS                  2, 3, 6                   ; A[0-15]
    movu                    m3, [aq+mmsize*1-2]        ; pqrstuvwxyz01234
    vperm2i128              m6, m4, m4, q2001          ; yz012345........
    vpalignr                m7, m6, m4, 2              ; rstuvwxyz012345.
    LOWPASS                  3, 4, 7                   ; A[16-31].
    vperm2i128              m4, m1, m2, q0201          ; TUVWXYZ#ABCDEFGH
    vperm2i128              m5, m0, m1, q0201          ; L[7-15]L[16-23]
    vperm2i128              m8, m2, m3, q0201          ; IJKLMNOPQRSTUVWX
    DEFINE_ARGS dst8, stride, stride3, stride7, stride5, dst24, cnt
    lea               stride3q, [strideq*3]
    lea               stride5q, [stride3q+strideq*2]
    lea               stride7q, [strideq*4+stride3q]
    lea                 dst24q, [dst8q+stride3q*8]
    lea                  dst8q, [dst8q+strideq*8]
    mov                   cntd, 2

.loop:
    mova  [dst24q+stride7q+ 0], m0                     ; 31 23 15 7
    mova  [dst24q+stride7q+32], m1
    mova  [dst8q +stride7q+ 0], m1
    mova  [dst8q +stride7q+32], m2
    vpalignr                m6, m4, m1, 2
    vpalignr                m7, m5, m0, 2
    vpalignr                m9, m8, m2, 2
    mova [dst24q+stride3q*2+ 0], m7                    ; 30 22 14 6
    mova [dst24q+stride3q*2+32], m6
    mova [dst8q +stride3q*2+ 0], m6
    mova [dst8q +stride3q*2+32], m9
    vpalignr                m6, m4, m1, 4
    vpalignr                m7, m5, m0, 4
    vpalignr                m9, m8, m2, 4
    mova  [dst24q+stride5q+ 0], m7                     ; 29 21 13 5
    mova  [dst24q+stride5q+32], m6
    mova  [dst8q +stride5q+ 0], m6
    mova  [dst8q +stride5q+32], m9
    vpalignr                m6, m4, m1, 6
    vpalignr                m7, m5, m0, 6
    vpalignr                m9, m8, m2, 6
    mova [dst24q+strideq*4+ 0], m7                     ; 28 20 12 4
    mova [dst24q+strideq*4+32], m6
    mova [dst8q +strideq*4+ 0], m6
    mova [dst8q +strideq*4+32], m9
    vpalignr                m6, m4, m1, 8
    vpalignr                m7, m5, m0, 8
    vpalignr                m9, m8, m2, 8
    mova  [dst24q+stride3q+ 0], m7                     ; 27 19 11 3
    mova  [dst24q+stride3q+32], m6
    mova  [dst8q +stride3q+ 0], m6
    mova  [dst8q +stride3q+32], m9
    vpalignr                m6, m4, m1, 10
    vpalignr                m7, m5, m0, 10
    vpalignr                m9, m8, m2, 10
    mova [dst24q+strideq*2+ 0], m7                     ; 26 18 10 2
    mova [dst24q+strideq*2+32], m6
    mova [dst8q +strideq*2+ 0], m6
    mova [dst8q +strideq*2+32], m9
    vpalignr                m6, m4, m1, 12
    vpalignr                m7, m5, m0, 12
    vpalignr                m9, m8, m2, 12
    mova   [dst24q+strideq+ 0], m7                     ; 25 17 9 1
    mova   [dst24q+strideq+32], m6
    mova   [dst8q +strideq+ 0], m6
    mova   [dst8q +strideq+32], m9
    vpalignr                m6, m4, m1, 14
    vpalignr                m7, m5, m0, 14
    vpalignr                m9, m8, m2, 14
    mova [dst24q+strideq*0+ 0], m7                     ; 24 16 8 0
    mova [dst24q+strideq*0+32], m6
    mova [dst8q +strideq*0+ 0], m6
    mova [dst8q +strideq*0+32], m9
    mova                    m0, m5
    mova                    m5, m1
    mova                    m1, m4
    mova                    m4, m2
    mova                    m2, m8
    mova                    m8, m3
    sub                 dst24q, stride7q
    sub                 dst24q, strideq
    sub                  dst8q, stride7q
    sub                  dst8q, strideq
    dec                   cntd
    jg .loop
    RET
%endif
%endif

; vertical-left prediction (continues past this chunk)
%macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function
cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a
    movifnidn               aq, amp
    movu                    m0, [aq]                   ; abcdefgh
    psrldq                  m1, m0, 2                  ; bcdefgh.
    psrldq                  m2, m0, 4                  ; cdefgh..
    LOWPASS                  2, 1, 0                   ; BCDEFGH.
    pavgw                   m1, m0                     ; ABCDEFG.
    DEFINE_ARGS dst, stride, stride3
    lea           stride3q, [strideq*3]

    ; rows alternate between the pavgw result (half-pel) and the lowpass
    ; result (full-pel), each shifted one pixel left every two rows
    movh      [dstq+strideq*0], m1
    movh      [dstq+strideq*1], m2
    psrldq              m1, 2
    psrldq              m2, 2
    movh      [dstq+strideq*2], m1
    movh      [dstq+stride3q ], m2
    RET

cglobal vp9_ipred_vl_8x8_16, 2, 4, 4, dst, stride, l, a
    movifnidn           aq, amp
    mova                m0, [aq]                   ; abcdefgh
%if cpuflag(ssse3)
    ; pshufb mask that shifts left by one word while replicating the last
    ; pixel (edge extension); SHIFT_RIGHT* presumably uses it on ssse3+
    mova                m3, [pb_2to15_14_15]
%endif
    SHIFT_RIGHTx2       m1, m2, m0, m3             ; bcdefghh/cdefghhh
    LOWPASS              2,  1,  0                 ; BCDEFGHh
    pavgw               m1, m0                     ; ABCDEFGh
    DEFINE_ARGS dst, stride, stride3
    lea           stride3q, [strideq*3]

    mova      [dstq+strideq*0], m1
    mova      [dstq+strideq*1], m2
    SHIFT_RIGHT         m1, m1, m3
    SHIFT_RIGHT         m2, m2, m3
    mova      [dstq+strideq*2], m1
    mova      [dstq+stride3q ], m2
    lea               dstq, [dstq+strideq*4]
    SHIFT_RIGHT         m1, m1, m3
    SHIFT_RIGHT         m2, m2, m3
    mova      [dstq+strideq*0], m1
    mova      [dstq+strideq*1], m2
    SHIFT_RIGHT         m1, m1, m3
    SHIFT_RIGHT         m2, m2, m3
    mova      [dstq+strideq*2], m1
    mova      [dstq+stride3q ], m2
    RET

cglobal vp9_ipred_vl_16x16_16, 2, 4, 6, dst, stride, l, a
    movifnidn           aq, amp
    mova                m0, [aq]
    mova                m1, [aq+mmsize]
    PALIGNR             m2, m1, m0, 2, m3
    PALIGNR             m3, m1, m0, 4, m4
    LOWPASS              3,  2,  0
    pavgw               m2, m0
%if cpuflag(ssse3)
    mova                m4, [pb_2to15_14_15]
%endif
    SHIFT_RIGHTx2       m5, m0, m1, m4
    LOWPASS              0,  5,  1
    pavgw               m1, m5
    DEFINE_ARGS dst, stride, cnt
    mov               cntd, 8

.loop:
    mova  [dstq+strideq*0+ 0], m2
    mova  [dstq+strideq*0+16], m1
    mova  [dstq+strideq*1+ 0], m3
    mova  [dstq+strideq*1+16], m0
    lea               dstq, [dstq+strideq*2]
%if cpuflag(avx)
    vpalignr            m2, m1, m2, 2
    vpalignr            m3, m0, m3, 2
%else
    PALIGNR             m5, m1, m2, 2, m4
    mova                m2, m5
    PALIGNR             m5, m0, m3, 2, m4
    mova                m3, m5
%endif
    SHIFT_RIGHT         m1, m1, m4
    SHIFT_RIGHT         m0, m0, m4
    dec               cntd
    jg .loop
    RET

cglobal vp9_ipred_vl_32x32_16, 2, 5, 11, %1 * mmsize * ARCH_X86_32, dst, stride, l, a
    movifnidn           aq, amp
    mova                m0, [aq+mmsize*0]
    mova                m1, [aq+mmsize*1]
    mova                m2, [aq+mmsize*2]
    PALIGNR             m6, m1, m0, 2, m5
    PALIGNR             m7, m1, m0, 4, m5
    LOWPASS              7,  6,  0
    pavgw               m6, m0
    SCRATCH              6,  8, rsp+0*mmsize
    PALIGNR             m4, m2, m1, 2, m0
    PALIGNR             m5, m2, m1, 4, m0
    LOWPASS              5,  4,  1
    pavgw               m4, m1
    mova                m0, [aq+mmsize*3]
    PALIGNR             m1, m0, m2, 2, m6
    PALIGNR             m3, m0, m2, 4, m6
    LOWPASS              3,  1,  2
    pavgw               m2, m1
%if cpuflag(ssse3)
    PRELOAD             10, pb_2to15_14_15, shuf
%endif
    SHIFT_RIGHTx2       m6, m1, m0, reg_shuf
    LOWPASS              1,  6,  0
    pavgw               m0, m6
%if ARCH_X86_64
    ; m9 = rightmost pixel replicated; fills the last 16 columns of rows 16+
    pshufd              m9, m6, q3333
%endif
%if cpuflag(avx)
    UNSCRATCH            6,  8, rsp+0*mmsize
%endif
    DEFINE_ARGS dst, stride, cnt, stride16, stride17
    mov          stride16q, strideq
    mov               cntd, 8
    shl          stride16q, 4
    lea          stride17q, [stride16q+strideq]

    ; FIXME m8 is unused for avx, so we could save one register here for win64
.loop:
%if notcpuflag(avx)
    UNSCRATCH            6,  8, rsp+0*mmsize
%endif
    mova  [dstq+strideq*0+ 0], m6
    mova  [dstq+strideq*0+16], m4
    mova  [dstq+strideq*0+32], m2
    mova  [dstq+strideq*0+48], m0
    mova  [dstq+strideq*1+ 0], m7
    mova  [dstq+strideq*1+16], m5
    mova  [dstq+strideq*1+32], m3
    mova  [dstq+strideq*1+48], m1
    ; row N+16 is row N shifted 16 pixels left, so reuse the same registers
    ; one slot over; last 16 columns come from m9 (x86-64) or the fixup
    ; loop after .loop (x86-32)
    mova  [dstq+stride16q+ 0], m4
    mova  [dstq+stride16q+16], m2
    mova  [dstq+stride16q+32], m0
%if ARCH_X86_64
    mova  [dstq+stride16q+48], m9
%endif
    mova  [dstq+stride17q+ 0], m5
    mova  [dstq+stride17q+16], m3
    mova  [dstq+stride17q+32], m1
%if ARCH_X86_64
    mova  [dstq+stride17q+48], m9
%endif
    lea               dstq, [dstq+strideq*2]
%if cpuflag(avx)
    vpalignr            m6, m4, m6, 2
    vpalignr            m4, m2, m4, 2
    vpalignr            m2, m0, m2, 2
    vpalignr            m7, m5, m7, 2
    vpalignr            m5, m3, m5, 2
    vpalignr            m3, m1, m3, 2
%else
    SCRATCH              3,  8, rsp+0*mmsize
%if notcpuflag(ssse3)
    SCRATCH              1, 10, rsp+1*mmsize
%endif
    PALIGNR             m3, m4, m6, 2, m1
    mova                m6, m3
    PALIGNR             m3, m2, m4, 2, m1
    mova                m4, m3
    PALIGNR             m3, m0, m2, 2, m1
    mova                m2, m3
    PALIGNR             m3, m5, m7, 2, m1
    mova                m7, m3
    UNSCRATCH            3,  8, rsp+0*mmsize
    SCRATCH              6,  8, rsp+0*mmsize
%if notcpuflag(ssse3)
    UNSCRATCH            1, 10, rsp+1*mmsize
    SCRATCH              7, 10, rsp+1*mmsize
%endif
    PALIGNR             m6, m3, m5, 2, m7
    mova                m5, m6
    PALIGNR             m6, m1, m3, 2, m7
    mova                m3, m6
%if notcpuflag(ssse3)
    UNSCRATCH            7, 10, rsp+1*mmsize
%endif
%endif
    SHIFT_RIGHT         m1, m1, reg_shuf
    SHIFT_RIGHT         m0, m0, reg_shuf
    dec               cntd
    jg .loop

%if ARCH_X86_32
    ; x86-32 had no spare register for the replicated edge pixel, so fill
    ; the last 16 columns of the bottom 16 rows here instead
    DEFINE_ARGS dst, stride, stride3
    lea           stride3q, [strideq*3]
%assign %%n 0
%rep 4
    mova  [dstq+strideq*0+48], m0
    mova  [dstq+strideq*1+48], m0
    mova  [dstq+strideq*2+48], m0
    mova  [dstq+stride3q +48], m0
%if %%n < 3
    lea               dstq, [dstq+strideq*4]
%endif
%assign %%n (%%n+1)
%endrep
%endif
    RET
%endmacro

INIT_XMM sse2
VL_FUNCS 2
INIT_XMM ssse3
VL_FUNCS 1
INIT_XMM avx
VL_FUNCS 1

; vr (vertical-right) intra prediction, 16bpp; uses top and left edges.
%macro VR_FUNCS 0
cglobal vp9_ipred_vr_4x4_16, 4, 4, 3, dst, stride, l, a
    movu                m0, [aq-2]
    movhps              m1, [lq]
    PALIGNR             m0, m1, 10, m2             ; xyz*abcd
    pslldq              m1, m0, 2                  ; .xyz*abc
    pslldq              m2, m0, 4                  ; ..xyz*ab
    LOWPASS              2,  1,  0                 ; ..YZ#ABC
    pavgw               m1, m0                     ; ....#ABC
    DEFINE_ARGS dst, stride, stride3
    lea           stride3q, [strideq*3]

    movhps    [dstq+strideq*0], m1
    movhps    [dstq+strideq*1], m2
    shufps              m0, m2, m1, q3210
%if cpuflag(ssse3)
    pshufb              m2, [pb_4_5_8to13_8x0]
%else
    pshuflw             m2, m2, q2222
    psrldq              m2, 6
%endif
    psrldq              m0, 6
    movh      [dstq+strideq*2], m0
    movh      [dstq+stride3q ], m2
    RET

cglobal vp9_ipred_vr_8x8_16, 4, 4, 5, dst, stride, l, a
    movu                m1, [aq-2]                 ; *abcdefg
    movu                m2, [lq]                   ; stuvwxyz
    mova                m0, [aq]                   ; abcdefgh
    PALIGNR             m3, m1, m2, 14, m4         ; z*abcdef
    LOWPASS              3,  1,  0
    pavgw               m0, m1
    PALIGNR             m1, m2, 2, m4              ; tuvwxyz*
    pslldq              m4, m2, 2                  ; .stuvwxy
    LOWPASS              4,  2,  1
    DEFINE_ARGS dst, stride, stride3
    lea           stride3q, [strideq*3]

    ; even rows come from the half-pel top row (m0), odd rows from the
    ; filtered top row (m3); each pair shifts in one filtered left pixel
    ; from m4 on the right-to-left diagonal
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m3
    PALIGNR             m0, m4, 14, m1
    pslldq              m4, 2
    PALIGNR             m3, m4, 14, m1
    pslldq              m4, 2
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m3
    lea               dstq, [dstq+strideq*4]
    PALIGNR             m0, m4, 14, m1
    pslldq              m4, 2
    PALIGNR             m3, m4, 14, m1
    pslldq              m4, 2
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m3
    PALIGNR             m0, m4, 14, m1
    pslldq              m4, 2
    PALIGNR             m3, m4, 14, m4
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m3
    RET

cglobal vp9_ipred_vr_16x16_16, 4, 4, 8, dst, stride, l, a
    movu                m1, [aq-2]                 ; *abcdefg
    movu                m2, [aq+mmsize-2]          ; hijklmno
    mova                m3, [aq]                   ; abcdefgh
    mova                m4, [aq+mmsize]            ; ijklmnop
    mova                m5, [lq+mmsize]            ; stuvwxyz
    PALIGNR             m0, m1, m5, 14, m6         ; z*abcdef
    movu                m6, [aq+mmsize-4]          ; ghijklmn
    LOWPASS              6,  2,  4
    pavgw               m2, m4
    LOWPASS              0,  1,  3
    pavgw               m3, m1
    PALIGNR             m1, m5, 2, m7              ; tuvwxyz*
    movu                m7, [lq+mmsize-2]          ; rstuvwxy
    LOWPASS              1,  5,  7
    movu                m5, [lq+2]                 ; lmnopqrs
    pslldq              m4, m5, 2                  ; .lmnopqr
    pslldq              m7, m5, 4                  ; ..lmnopq
    LOWPASS              5,  4,  7
    ; deinterleave the filtered left pixels into even/odd streams: vr only
    ; consumes every other left pixel per row pair
    psrld               m4, m1, 16
    psrld               m7, m5, 16
    pand                m1, [pd_65535]
    pand                m5, [pd_65535]
    packssdw            m7, m4
    packssdw            m5, m1
    DEFINE_ARGS dst, stride, cnt
    mov               cntd, 8

.loop:
    mova  [dstq+strideq*0+ 0], m3
    mova  [dstq+strideq*0+16], m2
    mova  [dstq+strideq*1+ 0], m0
    mova  [dstq+strideq*1+16], m6
    lea               dstq, [dstq+strideq*2]
    PALIGNR             m2, m3, 14, m4
    PALIGNR             m3, m7, 14, m4
    pslldq              m7, 2
    PALIGNR             m6, m0, 14, m4
    PALIGNR             m0, m5, 14, m4
    pslldq              m5, 2
    dec               cntd
    jg .loop
    RET

cglobal vp9_ipred_vr_32x32_16, 4, 5, 14, 6 * mmsize * ARCH_X86_32, dst, stride, l, a
    movu                m0, [aq+mmsize*0-2]        ; *a[0-6]
    movu                m1, [aq+mmsize*1-2]        ; a[7-14]
    movu                m2, [aq+mmsize*2-2]        ; a[15-22]
    movu                m3, [aq+mmsize*3-2]        ; a[23-30]
    mova                m4, [aq+mmsize*3+0]        ; a[24-31]
    movu                m5, [aq+mmsize*3-4]        ; a[22-29]
    LOWPASS              5,  3,  4                 ; A[23-30]
    SCRATCH              5,  8, rsp+0*mmsize
    pavgw               m3, m4
    mova                m4, [aq+mmsize*2+0]        ; a[16-23]
    movu                m6, [aq+mmsize*2-4]        ; a[14-21]
    LOWPASS              6,  2,  4                 ; A[15-22]
    SCRATCH              6,  9, rsp+1*mmsize
    pavgw               m2, m4
    mova                m4, [aq+mmsize*1+0]        ; a[8-15]
    movu                m7, [aq+mmsize*1-4]        ; a[6-13]
    LOWPASS              7,  1,  4                 ; A[7-14]
    SCRATCH              7, 10, rsp+2*mmsize
    pavgw               m1, m4
    mova                m4, [aq+mmsize*0+0]        ; a[0-7]
    mova                m5, [lq+mmsize*3+0]        ; l[24-31]
    PALIGNR             m6, m0, m5, 14, m7         ; l[31]*a[0-5]
    LOWPASS              6,  0,  4                 ; #A[0-6]
    SCRATCH              6, 11, rsp+3*mmsize
    pavgw               m4, m0
    PALIGNR             m0, m5, 2, m7              ; l[25-31]*
    movu                m7, [lq+mmsize*3-2]        ; l[23-30]
    LOWPASS              0,  5,  7                 ; L[24-31]
    movu                m5, [lq+mmsize*2-2]        ; l[15-22]
    mova                m7, [lq+mmsize*2+0]        ; l[16-23]
    movu                m6, [lq+mmsize*2+2]        ; l[17-24]
    LOWPASS              5,  7,  6                 ; L[16-23]
    ; split filtered left pixels into even/odd word streams (see 16x16)
    psrld               m7, m0, 16
    psrld               m6, m5, 16
    pand                m0, [pd_65535]
    pand                m5, [pd_65535]
    packssdw            m6, m7
    packssdw            m5, m0
    SCRATCH              5, 12, rsp+4*mmsize
    SCRATCH              6, 13, rsp+5*mmsize
    movu                m6, [lq+mmsize*1-2]        ; l[7-14]
    mova                m0, [lq+mmsize*1+0]        ; l[8-15]
    movu                m5, [lq+mmsize*1+2]        ; l[9-16]
    LOWPASS              6,  0,  5                 ; L[8-15]
    movu                m0, [lq+mmsize*0+2]        ; l[1-8]
    pslldq              m5, m0, 2                  ; .l[1-7]
    pslldq              m7, m0, 4                  ; ..l[1-6]
    LOWPASS              0,  5,  7
    psrld               m5, m6, 16
    psrld               m7, m0, 16
    pand                m6, [pd_65535]
    pand                m0, [pd_65535]
    packssdw            m7, m5
    packssdw            m0, m6
    UNSCRATCH            6, 13, rsp+5*mmsize
    DEFINE_ARGS dst, stride, stride16, cnt, stride17
    mov          stride16q, strideq
    mov               cntd, 8
    shl          stride16q, 4
%if ARCH_X86_64
    lea          stride17q, [stride16q+strideq]
%endif

.loop:
    ; on x86-64, odd rows (m8-m12) are interleaved here; on x86-32 they are
    ; produced by .loop2 below from the SCRATCHed copies
    mova  [dstq+strideq*0+ 0], m4
    mova  [dstq+strideq*0+16], m1
    mova  [dstq+strideq*0+32], m2
    mova  [dstq+strideq*0+48], m3
%if ARCH_X86_64
    mova  [dstq+strideq*1+ 0], m11
    mova  [dstq+strideq*1+16], m10
    mova  [dstq+strideq*1+32], m9
    mova  [dstq+strideq*1+48], m8
%endif
    mova  [dstq+stride16q+ 0], m6
    mova  [dstq+stride16q+16], m4
    mova  [dstq+stride16q+32], m1
    mova  [dstq+stride16q+48], m2
%if ARCH_X86_64
    mova  [dstq+stride17q+ 0], m12
    mova  [dstq+stride17q+16], m11
    mova  [dstq+stride17q+32], m10
    mova  [dstq+stride17q+48], m9
%endif
    lea               dstq, [dstq+strideq*2]
    PALIGNR             m3, m2, 14, m5
    PALIGNR             m2, m1, 14, m5
    PALIGNR             m1, m4, 14, m5
    PALIGNR             m4, m6, 14, m5
    PALIGNR             m6, m7, 14, m5
    pslldq              m7, 2
%if ARCH_X86_64
    PALIGNR             m8, m9, 14, m5
    PALIGNR             m9, m10, 14, m5
    PALIGNR            m10, m11, 14, m5
    PALIGNR            m11, m12, 14, m5
    PALIGNR            m12, m0, 14, m5
    pslldq              m0, 2
%endif
    dec               cntd
    jg .loop

%if ARCH_X86_32
    ; second pass for the odd rows on x86-32 using the spilled filtered rows
    UNSCRATCH            5, 12, rsp+4*mmsize
    UNSCRATCH            4, 11, rsp+3*mmsize
    UNSCRATCH            3, 10, rsp+2*mmsize
    UNSCRATCH            2,  9, rsp+1*mmsize
    UNSCRATCH            1,  8, rsp+0*mmsize
    mov               dstq, dstm
    mov               cntd, 8
    add               dstq, strideq
.loop2:
    mova  [dstq+strideq*0+ 0], m4
    mova  [dstq+strideq*0+16], m3
    mova  [dstq+strideq*0+32], m2
    mova  [dstq+strideq*0+48], m1
    mova  [dstq+stride16q+ 0], m5
    mova  [dstq+stride16q+16], m4
    mova  [dstq+stride16q+32], m3
    mova  [dstq+stride16q+48], m2
    lea               dstq, [dstq+strideq*2]
    PALIGNR             m1, m2, 14, m6
    PALIGNR             m2, m3, 14, m6
    PALIGNR             m3, m4, 14, m6
    PALIGNR             m4, m5, 14, m6
    PALIGNR             m5, m0, 14, m6
    pslldq              m0, 2
    dec               cntd
    jg .loop2
%endif
    RET
%endmacro

INIT_XMM sse2
VR_FUNCS
INIT_XMM ssse3
VR_FUNCS
INIT_XMM avx
VR_FUNCS

; hu (horizontal-up) intra prediction, 16bpp; uses only the left edge.
; Parameter %1 = stack slots needed by the 32x32 variant on 32-bit.
%macro HU_FUNCS 1 ; stack_mem_for_32x32_32bit_function
cglobal vp9_ipred_hu_4x4_16, 3, 3, 3, dst, stride, l, a
    movh                m0, [lq]                   ; abcd
%if cpuflag(ssse3)
    pshufb              m0, [pb_0to7_67x4]         ; abcddddd
%else
    punpcklqdq          m0, m0
    pshufhw             m0, m0, q3333              ; abcddddd
%endif
    psrldq              m1, m0, 2                  ; bcddddd.
    psrldq              m2, m0, 4                  ; cddddd..
    LOWPASS              2,  1,  0                 ; BCDddd..
    pavgw               m1, m0                     ; abcddddd
    ; interleave half-pel and full-pel values: hu rows read pairs along
    ; the left edge going upward
    SBUTTERFLY          wd,  1,  2,  0             ; aBbCcDdd, dddddddd
    PALIGNR             m2, m1, 4, m0              ; bCcDdddd
    DEFINE_ARGS dst, stride, stride3
    lea           stride3q, [strideq*3]

    movh      [dstq+strideq*0], m1                 ; aBbC
    movh      [dstq+strideq*1], m2                 ; bCcD
    movhps    [dstq+strideq*2], m1                 ; cDdd
    movhps    [dstq+stride3q ], m2                 ; dddd
    RET

cglobal vp9_ipred_hu_8x8_16, 3, 3, 4, dst, stride, l, a
    mova                m0, [lq]
%if cpuflag(ssse3)
    mova                m3, [pb_2to15_14_15]
%endif
    SHIFT_RIGHTx2       m1, m2, m0, m3
    LOWPASS              2,  1,  0
    pavgw               m1, m0
    SBUTTERFLY          wd,  1,  2,  0
    shufps              m0, m1, m2, q1032
    pshufd              m3, m2, q3332
    DEFINE_ARGS dst, stride, stride3
    lea           stride3q, [strideq*3]

    ; write even rows first, then advance one row and repeat with all
    ; registers shifted one pixel pair for the odd rows
    mova  [dstq+strideq *0], m1
    mova  [dstq+strideq *2], m0
    mova  [dstq+strideq *4], m2
    mova  [dstq+stride3q*2], m3
    add               dstq, strideq
%if cpuflag(avx)
    vpalignr            m1, m2, m1, 4
%else
    PALIGNR             m0, m2, m1, 4, m3
    mova                m1, m0
%endif
    pshufd              m2, m2, q3321
    shufps              m0, m1, m2, q1032
    pshufd              m3, m2, q3332
    mova  [dstq+strideq *0], m1
    mova  [dstq+strideq *2], m0
    mova  [dstq+strideq *4], m2
    mova  [dstq+stride3q*2], m3
    RET

cglobal vp9_ipred_hu_16x16_16, 3, 4, 6 + notcpuflag(ssse3), dst, stride, l, a
    mova                m0, [lq]
    mova                m3, [lq+mmsize]
    movu                m1, [lq+2]
    movu                m2, [lq+4]
    LOWPASS              2,  1,  0
    pavgw               m1, m0
    SBUTTERFLY          wd,  1,  2,  0
%if cpuflag(ssse3)
    mova                m5, [pb_2to15_14_15]
%endif
    SHIFT_RIGHTx2       m0, m4, m3, m5
    LOWPASS              4,  0,  3
    pavgw               m3, m0
    SBUTTERFLY          wd,  3,  4,  5
    ; m0 = last pixel replicated, used as the bottom-edge extension
    pshufd              m0, m0, q3333
    DEFINE_ARGS dst, stride, stride3, cnt
    lea           stride3q, [strideq*3]
    mov               cntd, 4

.loop:
    mova  [dstq+strideq *0+ 0], m1
    mova  [dstq+strideq *0+16], m2
    mova  [dstq+strideq *4+ 0], m2
    mova  [dstq+strideq *4+16], m3
    mova  [dstq+strideq *8+ 0], m3
    mova  [dstq+strideq *8+16], m4
    mova  [dstq+stride3q*4+ 0], m4
    mova  [dstq+stride3q*4+16], m0
    add               dstq, strideq
%if cpuflag(avx)
    vpalignr            m1, m2, m1, 4
    vpalignr            m2, m3, m2, 4
    vpalignr            m3, m4, m3, 4
    vpalignr            m4, m0, m4, 4
%else
    PALIGNR             m5, m2, m1, 4, m6
    mova                m1, m5
    PALIGNR             m5, m3, m2, 4, m6
    mova                m2, m5
    PALIGNR             m5, m4, m3, 4, m6
    mova                m3, m5
    PALIGNR             m5, m0, m4, 4, m6
    mova                m4, m5
%endif
    dec               cntd
    jg .loop
    RET

cglobal vp9_ipred_hu_32x32_16, 3, 7, 10 + notcpuflag(ssse3), \
                               %1 * -mmsize * ARCH_X86_32, dst, stride, l, a
    mova                m2, [lq+mmsize*0+0]
    movu                m1, [lq+mmsize*0+2]
    movu                m0, [lq+mmsize*0+4]
    LOWPASS              0,  1,  2
    pavgw               m1, m2
    SBUTTERFLY          wd,  1,  0,  2
    SCRATCH              1,  8, rsp+0*mmsize
    mova                m4, [lq+mmsize*1+0]
    movu                m3, [lq+mmsize*1+2]
    movu                m2, [lq+mmsize*1+4]
    LOWPASS              2,  3,  4
    pavgw               m3, m4
    SBUTTERFLY          wd,  3,  2,  4
    mova                m6, [lq+mmsize*2+0]
    movu                m5, [lq+mmsize*2+2]
    movu                m4, [lq+mmsize*2+4]
    LOWPASS              4,  5,  6
    pavgw               m5, m6
    SBUTTERFLY          wd,  5,  4,  6
    mova                m7, [lq+mmsize*3+0]
    SCRATCH              0,  9, rsp+1*mmsize
%if cpuflag(ssse3)
    mova                m0, [pb_2to15_14_15]
%endif
    SHIFT_RIGHTx2       m1, m6, m7, m0
    LOWPASS              6,  1,  7
    pavgw               m7, m1
    SBUTTERFLY          wd,  7,  6,  0
    pshufd              m1, m1, q3333
    UNSCRATCH            0,  9, rsp+1*mmsize
    DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
    lea           stride3q, [strideq*3]
    lea           stride4q, [strideq*4]
    lea          stride28q, [stride4q*8]
    lea          stride20q, [stride4q*5]
    sub          stride28q, stride4q
    mov               cntd, 4

.loop:
    ; m1 doubles as the edge-extension value and as row data; swap it with
    ; the spilled first row (m8/stack slot 0) around each half of the body
%if ARCH_X86_64
    SWAP                 1,  8
%else
    mova  [rsp+1*mmsize], m1
    mova                m1, [rsp+0*mmsize]
%endif
    mova  [dstq+strideq *0+ 0], m1
    mova  [dstq+strideq *0+16], m0
    mova  [dstq+strideq *0+32], m3
    mova  [dstq+strideq *0+48], m2
    mova  [dstq+stride4q*1+ 0], m0
    mova  [dstq+stride4q*1+16], m3
    mova  [dstq+stride4q*1+32], m2
    mova  [dstq+stride4q*1+48], m5
    mova  [dstq+stride4q*2+ 0], m3
    mova  [dstq+stride4q*2+16], m2
    mova  [dstq+stride4q*2+32], m5
    mova  [dstq+stride4q*2+48], m4
%if cpuflag(avx)
    vpalignr            m1, m0, m1, 4
    vpalignr            m0, m3, m0, 4
    vpalignr            m3, m2, m3, 4
%else
    SCRATCH              6,  9, rsp+2*mmsize
%if notcpuflag(ssse3)
    SCRATCH              7, 10, rsp+3*mmsize
%endif
    PALIGNR             m6, m0, m1, 4, m7
    mova                m1, m6
    PALIGNR             m6, m3, m0, 4, m7
    mova                m0, m6
    PALIGNR             m6, m2, m3, 4, m7
    mova                m3, m6
    UNSCRATCH            6,  9, rsp+2*mmsize
    SCRATCH              0,  9, rsp+2*mmsize
%if notcpuflag(ssse3)
    UNSCRATCH            7, 10, rsp+3*mmsize
    SCRATCH              3, 10, rsp+3*mmsize
%endif
%endif
%if ARCH_X86_64
    SWAP                 1,  8
%else
    mova  [rsp+0*mmsize], m1
    mova                m1, [rsp+1*mmsize]
%endif
    mova  [dstq+stride3q*4+ 0], m2
    mova  [dstq+stride3q*4+16], m5
    mova  [dstq+stride3q*4+32], m4
    mova  [dstq+stride3q*4+48], m7
    mova  [dstq+stride4q*4+ 0], m5
    mova  [dstq+stride4q*4+16], m4
    mova  [dstq+stride4q*4+32], m7
    mova  [dstq+stride4q*4+48], m6
    mova  [dstq+stride20q + 0], m4
    mova  [dstq+stride20q +16], m7
    mova  [dstq+stride20q +32], m6
    mova  [dstq+stride20q +48], m1
    mova  [dstq+stride3q*8+ 0], m7
    mova  [dstq+stride3q*8+16], m6
    mova  [dstq+stride3q*8+32], m1
    mova  [dstq+stride3q*8+48], m1
    mova  [dstq+stride28q + 0], m6
    mova  [dstq+stride28q +16], m1
    mova  [dstq+stride28q +32], m1
    mova  [dstq+stride28q +48], m1
%if cpuflag(avx)
    vpalignr            m2, m5, m2, 4
    vpalignr            m5, m4, m5, 4
    vpalignr            m4, m7, m4, 4
    vpalignr            m7, m6, m7, 4
    vpalignr            m6, m1, m6, 4
%else
    PALIGNR             m0, m5, m2, 4, m3
    mova                m2, m0
    PALIGNR             m0, m4, m5, 4, m3
    mova                m5, m0
    PALIGNR             m0, m7, m4, 4, m3
    mova                m4, m0
    PALIGNR             m0, m6, m7, 4, m3
    mova                m7, m0
    PALIGNR             m0, m1, m6, 4, m3
    mova                m6, m0
    UNSCRATCH            0,  9, rsp+2*mmsize
%if notcpuflag(ssse3)
    UNSCRATCH            3, 10, rsp+3*mmsize
%endif
%endif
    add               dstq, strideq
    dec               cntd
    jg .loop
    RET
%endmacro

INIT_XMM sse2
HU_FUNCS 4
INIT_XMM ssse3
HU_FUNCS 3
INIT_XMM avx
HU_FUNCS 2

; hd (horizontal-down) intra prediction, 16bpp; uses top and left edges.
%macro HD_FUNCS 0
cglobal vp9_ipred_hd_4x4_16, 4, 4, 4, dst, stride, l, a
    movh                m0, [lq]
    movhps              m0, [aq-2]
    psrldq              m1, m0, 2
    psrldq              m2, m0, 4
    LOWPASS              2,  1,  0
    pavgw               m1, m0
    punpcklwd           m1, m2
    DEFINE_ARGS dst, stride, stride3
    lea           stride3q, [strideq*3]

    ; rows are written bottom-up: each row up shifts one pixel pair right
    movh      [dstq+stride3q ], m1
    movhps    [dstq+strideq*1], m1
    movhlps             m2, m2
    PALIGNR             m2, m1, 4, m0
    movh      [dstq+strideq*2], m2
    movhps    [dstq+strideq*0], m2
    RET

cglobal vp9_ipred_hd_8x8_16, 4, 4, 5, dst, stride, l, a
    mova                m0, [lq]
    movu                m1, [aq-2]
    PALIGNR             m2, m1, m0, 2, m3
    PALIGNR             m3, m1, m0, 4, m4
    LOWPASS              3,  2,  0
    pavgw               m2, m0
    SBUTTERFLY          wd,  2,  3,  0
    psrldq              m0, m1, 2
    psrldq              m4, m1, 4
    LOWPASS              1,  0,  4
    ; mstride starts positive to reach the last row, then is negated so the
    ; loop walks upward
    DEFINE_ARGS dst8, mstride, cnt
    lea              dst8q, [dst8q+mstrideq*8]
    neg           mstrideq
    mov               cntd, 4

.loop:
    add              dst8q, mstrideq
    mova  [dst8q+mstrideq*0], m2
    mova  [dst8q+mstrideq*4], m3
%if cpuflag(avx)
    vpalignr            m2, m3, m2, 4
    vpalignr            m3, m1, m3, 4
%else
    PALIGNR             m0, m3, m2, 4, m4
    mova                m2, m0
    PALIGNR             m0, m1, m3, 4, m4
    mova                m3, m0
%endif
    psrldq              m1, 4
    dec               cntd
    jg .loop
    RET

cglobal vp9_ipred_hd_16x16_16, 4, 4, 8, dst, stride, l, a
    mova                m2, [lq]
    movu                m1, [lq+2]
    movu                m0, [lq+4]
    LOWPASS              0,  1,  2
    pavgw               m1, m2
    mova                m4, [lq+mmsize]
    movu                m5, [aq-2]
    PALIGNR             m3, m5, m4, 2, m6
    PALIGNR             m2, m5, m4, 4, m6
    LOWPASS              2,  3,  4
    pavgw               m3, m4
    SBUTTERFLY          wd,  1,  0,  4
    SBUTTERFLY          wd,  3,  2,  4
    mova                m6, [aq]
    movu                m4, [aq+2]
    LOWPASS              4,  6,  5
    movu                m5, [aq+mmsize-2]
    psrldq              m6, m5, 2
    psrldq              m7, m5, 4
    LOWPASS              5,  6,  7
    ; negative stride: start at the last row and walk upward, shifting the
    ; diagonal one pixel pair right per row
    DEFINE_ARGS dst, mstride, mstride3, cnt
    lea               dstq, [dstq+mstrideq*8]
    lea               dstq, [dstq+mstrideq*8]
    neg           mstrideq
    lea          mstride3q, [mstrideq*3]
    mov               cntd, 4

.loop:
    add               dstq, mstrideq
    mova  [dstq+mstride3q*4+ 0], m2
    mova  [dstq+mstride3q*4+16], m4
    mova  [dstq+mstrideq *8+ 0], m3
    mova  [dstq+mstrideq *8+16], m2
    mova  [dstq+mstrideq *4+ 0], m0
    mova  [dstq+mstrideq *4+16], m3
    mova  [dstq+mstrideq *0+ 0], m1
    mova  [dstq+mstrideq *0+16], m0
%if cpuflag(avx)
    vpalignr            m1, m0, m1, 4
    vpalignr            m0, m3, m0, 4
    vpalignr            m3, m2, m3, 4
    vpalignr            m2, m4, m2, 4
    vpalignr            m4, m5, m4, 4
%else
    PALIGNR             m6, m0, m1, 4, m7
    mova                m1, m6
    PALIGNR             m6, m3, m0, 4, m7
    mova                m0, m6
    PALIGNR             m6, m2, m3, 4, m7
    mova                m3, m6
    PALIGNR             m6, m4, m2, 4, m7
    mova                m2, m6
    PALIGNR             m6, m5, m4, 4, m7
    mova                m4, m6
%endif
    psrldq              m5, 4
    dec               cntd
    jg .loop
    RET

cglobal vp9_ipred_hd_32x32_16, 4, 4 + 3 * ARCH_X86_64, 14, \
                               10 * -mmsize * ARCH_X86_32, dst, stride, l, a
    mova                m2, [lq+mmsize*0+0]
    movu                m1, [lq+mmsize*0+2]
    movu                m0, [lq+mmsize*0+4]
    LOWPASS              0,  1,  2
    pavgw               m1, m2
    SBUTTERFLY          wd,  1,  0,  2
    mova                m4, [lq+mmsize*1+0]
    movu                m3, [lq+mmsize*1+2]
    movu                m2, [lq+mmsize*1+4]
    LOWPASS              2,  3,  4
    pavgw               m3, m4
    SBUTTERFLY          wd,  3,  2,  4
    SCRATCH              0,  8, rsp+0*mmsize
    SCRATCH              1,  9, rsp+1*mmsize
    SCRATCH              2, 10, rsp+2*mmsize
    SCRATCH              3, 11, rsp+3*mmsize
    mova                m6, [lq+mmsize*2+0]
    movu                m5, [lq+mmsize*2+2]
    movu                m4, [lq+mmsize*2+4]
    LOWPASS              4,  5,  6
    pavgw               m5, m6
    SBUTTERFLY          wd,  5,  4,  6
    mova                m0, [lq+mmsize*3+0]
    movu                m1, [aq+mmsize*0-2]
    PALIGNR             m7, m1, m0, 2, m2
    PALIGNR             m6, m1, m0, 4, m2
    LOWPASS              6,  7,  0
    pavgw               m7, m0
    SBUTTERFLY          wd,  7,  6,  0
    mova                m2, [aq+mmsize*0+0]
    movu                m0, [aq+mmsize*0+2]
    LOWPASS              0,  2,  1
    movu                m1, [aq+mmsize*1-2]
    mova                m2, [aq+mmsize*1+0]
    movu                m3, [aq+mmsize*1+2]
    LOWPASS              1,  2,  3
    SCRATCH              6, 12, rsp+6*mmsize
    SCRATCH              7, 13, rsp+7*mmsize
    movu                m2, [aq+mmsize*2-2]
    mova                m3, [aq+mmsize*2+0]
    movu                m6, [aq+mmsize*2+2]
    LOWPASS              2,  3,  6
    movu                m3, [aq+mmsize*3-2]
    psrldq              m6, m3, 2
    psrldq              m7, m3, 4
    LOWPASS              3,  6,  7
    UNSCRATCH            6, 12, rsp+6*mmsize
    UNSCRATCH            7, 13, rsp+7*mmsize
%if ARCH_X86_32
    mova  [rsp+4*mmsize], m4
    mova  [rsp+5*mmsize], m5
    ; we already backed up m6/m7 earlier on x86-32 in SCRATCH, so we don't need
    ; to do it again here
%endif
    DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
    mov               cntd, 4
    lea           stride3q, [strideq*3]
%if ARCH_X86_64
    lea           stride4q, [strideq*4]
    lea          stride28q, [stride4q*8]
    lea          stride20q, [stride4q*5]
    sub          stride28q, stride4q
%endif
    add               dstq, stride3q

    ; x86-32 doesn't have enough registers, so on that platform, we split
    ; the loop in 2... Otherwise you spend most of the loop (un)scratching
.loop:
%if ARCH_X86_64
    mova  [dstq+stride28q + 0], m9
    mova  [dstq+stride28q +16], m8
    mova  [dstq+stride28q +32], m11
    mova  [dstq+stride28q +48], m10
    mova  [dstq+stride3q*8+ 0], m8
    mova  [dstq+stride3q*8+16], m11
    mova  [dstq+stride3q*8+32], m10
    mova  [dstq+stride3q*8+48], m5
    mova  [dstq+stride20q + 0], m11
    mova  [dstq+stride20q +16], m10
    mova  [dstq+stride20q +32], m5
    mova  [dstq+stride20q +48], m4
    mova  [dstq+stride4q*4+ 0], m10
    mova  [dstq+stride4q*4+16], m5
    mova  [dstq+stride4q*4+32], m4
    mova  [dstq+stride4q*4+48], m7
%endif
    mova  [dstq+stride3q*4+ 0], m5
    mova  [dstq+stride3q*4+16], m4
    mova  [dstq+stride3q*4+32], m7
    mova  [dstq+stride3q*4+48], m6
    mova  [dstq+strideq* 8+ 0], m4
    mova  [dstq+strideq* 8+16], m7
    mova  [dstq+strideq* 8+32], m6
    mova  [dstq+strideq* 8+48], m0
    mova  [dstq+strideq* 4+ 0], m7
    mova  [dstq+strideq* 4+16], m6
    mova  [dstq+strideq* 4+32], m0
    mova  [dstq+strideq* 4+48], m1
    mova  [dstq+strideq* 0+ 0], m6
    mova  [dstq+strideq* 0+16], m0
    mova  [dstq+strideq* 0+32], m1
    mova  [dstq+strideq* 0+48], m2
    sub               dstq, strideq
%if cpuflag(avx)
%if ARCH_X86_64
    vpalignr            m9, m8, m9, 4
    vpalignr            m8, m11, m8, 4
    vpalignr           m11, m10, m11, 4
    vpalignr           m10, m5, m10, 4
%endif
    vpalignr            m5, m4, m5, 4
    vpalignr            m4, m7, m4, 4
    vpalignr            m7, m6, m7, 4
    vpalignr            m6, m0, m6, 4
    vpalignr            m0, m1, m0, 4
    vpalignr            m1, m2, m1, 4
    vpalignr            m2, m3, m2, 4
%else
%if ARCH_X86_64
    PALIGNR            m12, m8, m9, 4, m13
    mova                m9, m12
    PALIGNR            m12, m11, m8, 4, m13
    mova                m8, m12
    PALIGNR            m12, m10, m11, 4, m13
    mova               m11, m12
    PALIGNR            m12, m5, m10, 4, m13
    mova               m10, m12
%endif
    SCRATCH              3, 12, rsp+8*mmsize, sh
%if notcpuflag(ssse3)
    SCRATCH              2, 13, rsp+9*mmsize
%endif
    PALIGNR             m3, m4, m5, 4, m2
    mova                m5, m3
    PALIGNR             m3, m7, m4, 4, m2
    mova                m4, m3
    PALIGNR             m3, m6, m7, 4, m2
    mova                m7, m3
    PALIGNR             m3, m0, m6, 4, m2
    mova                m6, m3
    PALIGNR             m3, m1, m0, 4, m2
    mova                m0, m3
%if notcpuflag(ssse3)
    UNSCRATCH            2, 13, rsp+9*mmsize
    SCRATCH              0, 13, rsp+9*mmsize
%endif
    PALIGNR             m3, m2, m1, 4, m0
    mova                m1, m3
    PALIGNR             m3, reg_sh, m2, 4, m0
    mova                m2, m3
%if notcpuflag(ssse3)
    UNSCRATCH            0, 13, rsp+9*mmsize
%endif
    UNSCRATCH            3, 12, rsp+8*mmsize, sh
%endif
    psrldq              m3, 4
    dec               cntd
    jg .loop

%if ARCH_X86_32
    ; second pass for the upper rows on x86-32, using the rows spilled to
    ; the stack before the first loop
    UNSCRATCH            0,  8, rsp+0*mmsize
    UNSCRATCH            1,  9, rsp+1*mmsize
    UNSCRATCH            2, 10, rsp+2*mmsize
    UNSCRATCH            3, 11, rsp+3*mmsize
    mova                m4, [rsp+4*mmsize]
    mova                m5, [rsp+5*mmsize]
    mova                m6, [rsp+6*mmsize]
    mova                m7, [rsp+7*mmsize]
    DEFINE_ARGS dst, stride, stride5, stride3
    lea           stride5q, [strideq*5]
    lea               dstq, [dstq+stride5q*4]
    DEFINE_ARGS dst, stride, cnt, stride3
    mov               cntd, 4
.loop_2:
    mova  [dstq+stride3q*4+ 0], m1
    mova  [dstq+stride3q*4+16], m0
    mova  [dstq+stride3q*4+32], m3
    mova  [dstq+stride3q*4+48], m2
    mova  [dstq+strideq* 8+ 0], m0
    mova  [dstq+strideq* 8+16], m3
    mova  [dstq+strideq* 8+32], m2
    mova  [dstq+strideq* 8+48], m5
    mova  [dstq+strideq* 4+ 0], m3
    mova  [dstq+strideq* 4+16], m2
    mova  [dstq+strideq* 4+32], m5
    mova  [dstq+strideq* 4+48], m4
    mova  [dstq+strideq* 0+ 0], m2
    mova  [dstq+strideq* 0+16], m5
    mova  [dstq+strideq* 0+32], m4
    mova  [dstq+strideq* 0+48], m7
    sub               dstq, strideq
%if cpuflag(avx)
    vpalignr            m1, m0, m1, 4
    vpalignr            m0, m3, m0, 4
    vpalignr            m3, m2, m3, 4
    vpalignr            m2, m5, m2, 4
    vpalignr            m5, m4, m5, 4
    vpalignr            m4, m7, m4, 4
    vpalignr            m7, m6, m7, 4
%else
    SCRATCH              6, 12, rsp+8*mmsize, sh
%if notcpuflag(ssse3)
    SCRATCH              7, 13, rsp+9*mmsize
%endif
    PALIGNR             m6, m0, m1, 4, m7
    mova                m1, m6
    PALIGNR             m6, m3, m0, 4, m7
    mova                m0, m6
    PALIGNR             m6, m2, m3, 4, m7
    mova                m3, m6
    PALIGNR             m6, m5, m2, 4, m7
    mova                m2, m6
    PALIGNR             m6, m4, m5, 4, m7
    mova                m5, m6
%if notcpuflag(ssse3)
    UNSCRATCH            7, 13, rsp+9*mmsize
    SCRATCH              5, 13, rsp+9*mmsize
%endif
    PALIGNR             m6, m7, m4, 4, m5
    mova                m4, m6
    PALIGNR             m6, reg_sh, m7, 4, m5
    mova                m7, m6
%if notcpuflag(ssse3)
    UNSCRATCH            5, 13, rsp+9*mmsize
%endif
    UNSCRATCH            6, 12, rsp+8*mmsize, sh
%endif
    psrldq              m6, 4
    dec               cntd
    jg .loop_2
%endif
    RET
%endmacro

INIT_XMM sse2
HD_FUNCS
INIT_XMM ssse3
HD_FUNCS
INIT_XMM avx
HD_FUNCS