;*****************************************************************************
;* predict-a.asm: x86 intra prediction
;*****************************************************************************
;* Copyright (C) 2005-2021 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <holger@lubitz.org>
;*          Fiona Glaser <fiona@x264.com>
;*          Henrik Gramner <henrik@gramner.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
27;***************************************************************************** 28 29%include "x86inc.asm" 30%include "x86util.asm" 31 32SECTION_RODATA 32 33 34pw_43210123: times 2 dw -3, -2, -1, 0, 1, 2, 3, 4 35pw_m3: times 16 dw -3 36pw_m7: times 16 dw -7 37pb_00s_ff: times 8 db 0 38pb_0s_ff: times 7 db 0 39 db 0xff 40shuf_fixtr: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 41shuf_nop: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 42shuf_hu: db 7,6,5,4,3,2,1,0,0,0,0,0,0,0,0,0 43shuf_vr: db 2,4,6,8,9,10,11,12,13,14,15,0,1,3,5,7 44pw_reverse: db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1 45 46SECTION .text 47 48cextern pb_0 49cextern pb_1 50cextern pb_3 51cextern pw_1 52cextern pw_2 53cextern pw_4 54cextern pw_8 55cextern pw_16 56cextern pw_00ff 57cextern pw_pixel_max 58cextern pw_0to15 59 60%macro STORE8 1 61 mova [r0+0*FDEC_STRIDEB], %1 62 mova [r0+1*FDEC_STRIDEB], %1 63 add r0, 4*FDEC_STRIDEB 64 mova [r0-2*FDEC_STRIDEB], %1 65 mova [r0-1*FDEC_STRIDEB], %1 66 mova [r0+0*FDEC_STRIDEB], %1 67 mova [r0+1*FDEC_STRIDEB], %1 68 mova [r0+2*FDEC_STRIDEB], %1 69 mova [r0+3*FDEC_STRIDEB], %1 70%endmacro 71 72%macro STORE16 1-4 73%if %0 > 1 74 mov r1d, 2*%0 75.loop: 76 mova [r0+0*FDEC_STRIDEB+0*mmsize], %1 77 mova [r0+0*FDEC_STRIDEB+1*mmsize], %2 78 mova [r0+1*FDEC_STRIDEB+0*mmsize], %1 79 mova [r0+1*FDEC_STRIDEB+1*mmsize], %2 80%ifidn %0, 4 81 mova [r0+0*FDEC_STRIDEB+2*mmsize], %3 82 mova [r0+0*FDEC_STRIDEB+3*mmsize], %4 83 mova [r0+1*FDEC_STRIDEB+2*mmsize], %3 84 mova [r0+1*FDEC_STRIDEB+3*mmsize], %4 85 add r0, 2*FDEC_STRIDEB 86%else ; %0 == 2 87 add r0, 4*FDEC_STRIDEB 88 mova [r0-2*FDEC_STRIDEB+0*mmsize], %1 89 mova [r0-2*FDEC_STRIDEB+1*mmsize], %2 90 mova [r0-1*FDEC_STRIDEB+0*mmsize], %1 91 mova [r0-1*FDEC_STRIDEB+1*mmsize], %2 92%endif 93 dec r1d 94 jg .loop 95%else ; %0 == 1 96 STORE8 %1 97%if HIGH_BIT_DEPTH ; Different code paths to reduce code size 98 add r0, 6*FDEC_STRIDEB 99 mova [r0-2*FDEC_STRIDEB], %1 100 mova [r0-1*FDEC_STRIDEB], %1 101 mova 
[r0+0*FDEC_STRIDEB], %1 102 mova [r0+1*FDEC_STRIDEB], %1 103 add r0, 4*FDEC_STRIDEB 104 mova [r0-2*FDEC_STRIDEB], %1 105 mova [r0-1*FDEC_STRIDEB], %1 106 mova [r0+0*FDEC_STRIDEB], %1 107 mova [r0+1*FDEC_STRIDEB], %1 108%else 109 add r0, 8*FDEC_STRIDE 110 mova [r0-4*FDEC_STRIDE], %1 111 mova [r0-3*FDEC_STRIDE], %1 112 mova [r0-2*FDEC_STRIDE], %1 113 mova [r0-1*FDEC_STRIDE], %1 114 mova [r0+0*FDEC_STRIDE], %1 115 mova [r0+1*FDEC_STRIDE], %1 116 mova [r0+2*FDEC_STRIDE], %1 117 mova [r0+3*FDEC_STRIDE], %1 118%endif ; HIGH_BIT_DEPTH 119%endif 120%endmacro 121 122%macro PRED_H_LOAD 2 ; reg, offset 123%if cpuflag(avx2) 124 vpbroadcastpix %1, [r0+(%2)*FDEC_STRIDEB-SIZEOF_PIXEL] 125%elif HIGH_BIT_DEPTH 126 movd %1, [r0+(%2)*FDEC_STRIDEB-4] 127 SPLATW %1, %1, 1 128%else 129 SPLATB_LOAD %1, r0+(%2)*FDEC_STRIDE-1, m2 130%endif 131%endmacro 132 133%macro PRED_H_STORE 3 ; reg, offset, width 134%assign %%w %3*SIZEOF_PIXEL 135%if %%w == 8 136 movq [r0+(%2)*FDEC_STRIDEB], %1 137%else 138 %assign %%i 0 139 %rep %%w/mmsize 140 mova [r0+(%2)*FDEC_STRIDEB+%%i], %1 141 %assign %%i %%i+mmsize 142 %endrep 143%endif 144%endmacro 145 146%macro PRED_H_4ROWS 2 ; width, inc_ptr 147 PRED_H_LOAD m0, 0 148 PRED_H_LOAD m1, 1 149 PRED_H_STORE m0, 0, %1 150 PRED_H_STORE m1, 1, %1 151 PRED_H_LOAD m0, 2 152%if %2 153 add r0, 4*FDEC_STRIDEB 154%endif 155 PRED_H_LOAD m1, 3-4*%2 156 PRED_H_STORE m0, 2-4*%2, %1 157 PRED_H_STORE m1, 3-4*%2, %1 158%endmacro 159 160; dest, left, right, src, tmp 161; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 162%macro PRED8x8_LOWPASS 4-5 163%if HIGH_BIT_DEPTH 164 paddw %2, %3 165 psrlw %2, 1 166 pavgw %1, %4, %2 167%else 168 mova %5, %2 169 pavgb %2, %3 170 pxor %3, %5 171 pand %3, [pb_1] 172 psubusb %2, %3 173 pavgb %1, %4, %2 174%endif 175%endmacro 176 177;----------------------------------------------------------------------------- 178; void predict_4x4_h( pixel *src ) 179;----------------------------------------------------------------------------- 180%if 
HIGH_BIT_DEPTH 181INIT_XMM avx2 182cglobal predict_4x4_h, 1,1 183 PRED_H_4ROWS 4, 0 184 RET 185%endif 186 187;----------------------------------------------------------------------------- 188; void predict_4x4_ddl( pixel *src ) 189;----------------------------------------------------------------------------- 190%macro PREDICT_4x4_DDL 0 191cglobal predict_4x4_ddl, 1,1 192 movu m1, [r0-FDEC_STRIDEB] 193 PSLLPIX m2, m1, 1 194 mova m0, m1 195%if HIGH_BIT_DEPTH 196 PSRLPIX m1, m1, 1 197 pshufhw m1, m1, q2210 198%else 199 pxor m1, m2 200 PSRLPIX m1, m1, 1 201 pxor m1, m0 202%endif 203 PRED8x8_LOWPASS m0, m2, m1, m0, m3 204 205%assign Y 0 206%rep 4 207 PSRLPIX m0, m0, 1 208 movh [r0+Y*FDEC_STRIDEB], m0 209%assign Y (Y+1) 210%endrep 211 212 RET 213%endmacro 214 215%if HIGH_BIT_DEPTH 216INIT_XMM sse2 217PREDICT_4x4_DDL 218INIT_XMM avx 219PREDICT_4x4_DDL 220INIT_MMX mmx2 221cglobal predict_4x4_ddl, 1,2 222 movu m1, [r0-FDEC_STRIDEB+4] 223 PRED8x8_LOWPASS m0, m1, [r0-FDEC_STRIDEB+0], [r0-FDEC_STRIDEB+2] 224 mova m3, [r0-FDEC_STRIDEB+8] 225 mova [r0+0*FDEC_STRIDEB], m0 226 pshufw m4, m3, q3321 227 PRED8x8_LOWPASS m2, m4, [r0-FDEC_STRIDEB+6], m3 228 mova [r0+3*FDEC_STRIDEB], m2 229 pshufw m1, m0, q0021 230 punpckldq m1, m2 231 mova [r0+1*FDEC_STRIDEB], m1 232 psllq m0, 16 233 PALIGNR m2, m0, 6, m0 234 mova [r0+2*FDEC_STRIDEB], m2 235 RET 236%else ; !HIGH_BIT_DEPTH 237INIT_MMX mmx2 238PREDICT_4x4_DDL 239%endif 240 241;----------------------------------------------------------------------------- 242; void predict_4x4_vr( pixel *src ) 243;----------------------------------------------------------------------------- 244%if HIGH_BIT_DEPTH == 0 245INIT_MMX ssse3 246cglobal predict_4x4_vr, 1,1 247 movd m1, [r0-1*FDEC_STRIDEB] ; ........t3t2t1t0 248 mova m4, m1 249 palignr m1, [r0-1*FDEC_STRIDEB-8], 7 ; ......t3t2t1t0lt 250 pavgb m4, m1 251 palignr m1, [r0+0*FDEC_STRIDEB-8], 7 ; ....t3t2t1t0ltl0 252 mova m0, m1 253 palignr m1, [r0+1*FDEC_STRIDEB-8], 7 ; ..t3t2t1t0ltl0l1 254 mova m2, m1 
255 palignr m1, [r0+2*FDEC_STRIDEB-8], 7 ; t3t2t1t0ltl0l1l2 256 PRED8x8_LOWPASS m2, m0, m1, m2, m3 257 pshufw m0, m2, 0 258 psrlq m2, 16 259 movd [r0+0*FDEC_STRIDEB], m4 260 palignr m4, m0, 7 261 movd [r0+1*FDEC_STRIDEB], m2 262 psllq m0, 8 263 movd [r0+2*FDEC_STRIDEB], m4 264 palignr m2, m0, 7 265 movd [r0+3*FDEC_STRIDEB], m2 266 RET 267%endif ; !HIGH_BIT_DEPTH 268 269;----------------------------------------------------------------------------- 270; void predict_4x4_ddr( pixel *src ) 271;----------------------------------------------------------------------------- 272%macro PREDICT_4x4 4 273cglobal predict_4x4_ddr, 1,1 274%if HIGH_BIT_DEPTH 275 movu m2, [r0-1*FDEC_STRIDEB-8] 276 pinsrw m2, [r0+0*FDEC_STRIDEB-2], 2 277 pinsrw m2, [r0+1*FDEC_STRIDEB-2], 1 278 pinsrw m2, [r0+2*FDEC_STRIDEB-2], 0 279 movhps m3, [r0+3*FDEC_STRIDEB-8] 280%else ; !HIGH_BIT_DEPTH 281 movd m0, [r0+2*FDEC_STRIDEB-4] 282 movd m1, [r0+0*FDEC_STRIDEB-4] 283 punpcklbw m0, [r0+1*FDEC_STRIDEB-4] 284 punpcklbw m1, [r0-1*FDEC_STRIDEB-4] 285 punpckhwd m0, m1 286 movd m2, [r0-1*FDEC_STRIDEB] 287%if cpuflag(ssse3) 288 palignr m2, m0, 4 289%else 290 psllq m2, 32 291 punpckhdq m0, m2 292 SWAP 2, 0 293%endif 294 movd m3, [r0+3*FDEC_STRIDEB-4] 295 psllq m3, 32 296%endif ; !HIGH_BIT_DEPTH 297 298 PSRLPIX m1, m2, 1 299 mova m0, m2 300 PALIGNR m2, m3, 7*SIZEOF_PIXEL, m3 301 PRED8x8_LOWPASS m0, m2, m1, m0, m3 302%assign Y 3 303 movh [r0+Y*FDEC_STRIDEB], m0 304%rep 3 305%assign Y (Y-1) 306 PSRLPIX m0, m0, 1 307 movh [r0+Y*FDEC_STRIDEB], m0 308%endrep 309 RET 310 311;----------------------------------------------------------------------------- 312; void predict_4x4_vr( pixel *src ) 313;----------------------------------------------------------------------------- 314cglobal predict_4x4_vr, 1,1 315%if HIGH_BIT_DEPTH 316 movu m1, [r0-1*FDEC_STRIDEB-8] 317 pinsrw m1, [r0+0*FDEC_STRIDEB-2], 2 318 pinsrw m1, [r0+1*FDEC_STRIDEB-2], 1 319 pinsrw m1, [r0+2*FDEC_STRIDEB-2], 0 320%else ; !HIGH_BIT_DEPTH 321 movd m0, 
[r0+2*FDEC_STRIDEB-4] 322 movd m1, [r0+0*FDEC_STRIDEB-4] 323 punpcklbw m0, [r0+1*FDEC_STRIDEB-4] 324 punpcklbw m1, [r0-1*FDEC_STRIDEB-4] 325 punpckhwd m0, m1 326 movd m1, [r0-1*FDEC_STRIDEB] 327%if cpuflag(ssse3) 328 palignr m1, m0, 4 329%else 330 psllq m1, 32 331 punpckhdq m0, m1 332 SWAP 1, 0 333%endif 334%endif ; !HIGH_BIT_DEPTH 335 PSRLPIX m2, m1, 1 336 PSRLPIX m0, m1, 2 337 pavg%1 m4, m1, m2 338 PSRLPIX m4, m4, 3 339 PRED8x8_LOWPASS m2, m0, m1, m2, m3 340 PSLLPIX m0, m2, 6 341 PSRLPIX m2, m2, 2 342 movh [r0+0*FDEC_STRIDEB], m4 343 PALIGNR m4, m0, 7*SIZEOF_PIXEL, m3 344 movh [r0+1*FDEC_STRIDEB], m2 345 PSLLPIX m0, m0, 1 346 movh [r0+2*FDEC_STRIDEB], m4 347 PALIGNR m2, m0, 7*SIZEOF_PIXEL, m0 348 movh [r0+3*FDEC_STRIDEB], m2 349 RET 350 351;----------------------------------------------------------------------------- 352; void predict_4x4_hd( pixel *src ) 353;----------------------------------------------------------------------------- 354cglobal predict_4x4_hd, 1,1 355%if HIGH_BIT_DEPTH 356 movu m1, [r0-1*FDEC_STRIDEB-8] 357 PSLLPIX m1, m1, 1 358 pinsrw m1, [r0+0*FDEC_STRIDEB-2], 3 359 pinsrw m1, [r0+1*FDEC_STRIDEB-2], 2 360 pinsrw m1, [r0+2*FDEC_STRIDEB-2], 1 361 pinsrw m1, [r0+3*FDEC_STRIDEB-2], 0 362%else 363 movd m0, [r0-1*FDEC_STRIDEB-4] ; lt .. 364 punpckldq m0, [r0-1*FDEC_STRIDEB] ; t3 t2 t1 t0 lt .. .. .. 365 PSLLPIX m0, m0, 1 ; t2 t1 t0 lt .. .. .. .. 366 movd m1, [r0+3*FDEC_STRIDEB-4] ; l3 367 punpcklbw m1, [r0+2*FDEC_STRIDEB-4] ; l2 l3 368 movd m2, [r0+1*FDEC_STRIDEB-4] ; l1 369 punpcklbw m2, [r0+0*FDEC_STRIDEB-4] ; l0 l1 370 punpckh%3 m1, m2 ; l0 l1 l2 l3 371 punpckh%4 m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3 372%endif 373 PSRLPIX m2, m1, 1 ; .. t2 t1 t0 lt l0 l1 l2 374 PSRLPIX m0, m1, 2 ; .. .. 
t2 t1 t0 lt l0 l1 375 pavg%1 m5, m1, m2 376 PRED8x8_LOWPASS m3, m1, m0, m2, m4 377 punpckl%2 m5, m3 378 PSRLPIX m3, m3, 4 379 PALIGNR m3, m5, 6*SIZEOF_PIXEL, m4 380%assign Y 3 381 movh [r0+Y*FDEC_STRIDEB], m5 382%rep 2 383%assign Y (Y-1) 384 PSRLPIX m5, m5, 2 385 movh [r0+Y*FDEC_STRIDEB], m5 386%endrep 387 movh [r0+0*FDEC_STRIDEB], m3 388 RET 389%endmacro ; PREDICT_4x4 390 391;----------------------------------------------------------------------------- 392; void predict_4x4_ddr( pixel *src ) 393;----------------------------------------------------------------------------- 394%if HIGH_BIT_DEPTH 395INIT_MMX mmx2 396cglobal predict_4x4_ddr, 1,1 397 mova m0, [r0+1*FDEC_STRIDEB-8] 398 punpckhwd m0, [r0+0*FDEC_STRIDEB-8] 399 mova m3, [r0+3*FDEC_STRIDEB-8] 400 punpckhwd m3, [r0+2*FDEC_STRIDEB-8] 401 punpckhdq m3, m0 402 403 pshufw m0, m3, q3321 404 pinsrw m0, [r0-1*FDEC_STRIDEB-2], 3 405 pshufw m1, m0, q3321 406 PRED8x8_LOWPASS m0, m1, m3, m0 407 movq [r0+3*FDEC_STRIDEB], m0 408 409 movq m2, [r0-1*FDEC_STRIDEB-0] 410 pshufw m4, m2, q2100 411 pinsrw m4, [r0-1*FDEC_STRIDEB-2], 0 412 movq m1, m4 413 PALIGNR m4, m3, 6, m3 414 PRED8x8_LOWPASS m1, m4, m2, m1 415 movq [r0+0*FDEC_STRIDEB], m1 416 417 pshufw m2, m0, q3321 418 punpckldq m2, m1 419 psllq m0, 16 420 PALIGNR m1, m0, 6, m0 421 movq [r0+1*FDEC_STRIDEB], m1 422 movq [r0+2*FDEC_STRIDEB], m2 423 movd [r0+3*FDEC_STRIDEB+4], m1 424 RET 425 426;----------------------------------------------------------------------------- 427; void predict_4x4_hd( pixel *src ) 428;----------------------------------------------------------------------------- 429cglobal predict_4x4_hd, 1,1 430 mova m0, [r0+1*FDEC_STRIDEB-8] 431 punpckhwd m0, [r0+0*FDEC_STRIDEB-8] 432 mova m1, [r0+3*FDEC_STRIDEB-8] 433 punpckhwd m1, [r0+2*FDEC_STRIDEB-8] 434 punpckhdq m1, m0 435 mova m0, m1 436 437 movu m3, [r0-1*FDEC_STRIDEB-2] 438 pshufw m4, m1, q0032 439 mova m7, m3 440 punpckldq m4, m3 441 PALIGNR m3, m1, 2, m2 442 PRED8x8_LOWPASS m2, m4, m1, m3 443 444 
pavgw m0, m3 445 punpcklwd m5, m0, m2 446 punpckhwd m4, m0, m2 447 mova [r0+3*FDEC_STRIDEB], m5 448 mova [r0+1*FDEC_STRIDEB], m4 449 psrlq m5, 32 450 punpckldq m5, m4 451 mova [r0+2*FDEC_STRIDEB], m5 452 453 pshufw m4, m7, q2100 454 mova m6, [r0-1*FDEC_STRIDEB+0] 455 pinsrw m4, [r0+0*FDEC_STRIDEB-2], 0 456 PRED8x8_LOWPASS m3, m4, m6, m7 457 PALIGNR m3, m0, 6, m0 458 mova [r0+0*FDEC_STRIDEB], m3 459 RET 460 461INIT_XMM sse2 462PREDICT_4x4 w, wd, dq, qdq 463INIT_XMM ssse3 464PREDICT_4x4 w, wd, dq, qdq 465INIT_XMM avx 466PREDICT_4x4 w, wd, dq, qdq 467%else ; !HIGH_BIT_DEPTH 468INIT_MMX mmx2 469PREDICT_4x4 b, bw, wd, dq 470INIT_MMX ssse3 471%define predict_4x4_vr_ssse3 predict_4x4_vr_cache64_ssse3 472PREDICT_4x4 b, bw, wd, dq 473%endif 474 475;----------------------------------------------------------------------------- 476; void predict_4x4_hu( pixel *src ) 477;----------------------------------------------------------------------------- 478%if HIGH_BIT_DEPTH 479INIT_MMX 480cglobal predict_4x4_hu_mmx2, 1,1 481 movq m0, [r0+0*FDEC_STRIDEB-8] 482 punpckhwd m0, [r0+1*FDEC_STRIDEB-8] 483 movq m1, [r0+2*FDEC_STRIDEB-8] 484 punpckhwd m1, [r0+3*FDEC_STRIDEB-8] 485 punpckhdq m0, m1 486 pshufw m1, m1, q3333 487 movq [r0+3*FDEC_STRIDEB], m1 488 pshufw m3, m0, q3321 489 pshufw m4, m0, q3332 490 pavgw m2, m0, m3 491 PRED8x8_LOWPASS m3, m0, m4, m3 492 punpcklwd m4, m2, m3 493 mova [r0+0*FDEC_STRIDEB], m4 494 psrlq m2, 16 495 psrlq m3, 16 496 punpcklwd m2, m3 497 mova [r0+1*FDEC_STRIDEB], m2 498 punpckhdq m2, m1 499 mova [r0+2*FDEC_STRIDEB], m2 500 RET 501 502%else ; !HIGH_BIT_DEPTH 503INIT_MMX 504cglobal predict_4x4_hu_mmx2, 1,1 505 movd m1, [r0+0*FDEC_STRIDEB-4] 506 punpcklbw m1, [r0+1*FDEC_STRIDEB-4] 507 movd m0, [r0+2*FDEC_STRIDEB-4] 508 punpcklbw m0, [r0+3*FDEC_STRIDEB-4] 509 punpckhwd m1, m0 510 movq m0, m1 511 punpckhbw m1, m1 512 pshufw m1, m1, q3333 513 punpckhdq m0, m1 514 movq m2, m0 515 movq m3, m0 516 movq m5, m0 517 psrlq m3, 8 518 psrlq m2, 16 519 pavgb m5, m3 520 
PRED8x8_LOWPASS m3, m0, m2, m3, m4 521 movd [r0+3*FDEC_STRIDEB], m1 522 punpcklbw m5, m3 523 movd [r0+0*FDEC_STRIDEB], m5 524 psrlq m5, 16 525 movd [r0+1*FDEC_STRIDEB], m5 526 psrlq m5, 16 527 movd [r0+2*FDEC_STRIDEB], m5 528 RET 529%endif ; HIGH_BIT_DEPTH 530 531;----------------------------------------------------------------------------- 532; void predict_4x4_vl( pixel *src ) 533;----------------------------------------------------------------------------- 534%macro PREDICT_4x4_V1 1 535cglobal predict_4x4_vl, 1,1 536 movu m1, [r0-FDEC_STRIDEB] 537 PSRLPIX m3, m1, 1 538 PSRLPIX m2, m1, 2 539 pavg%1 m4, m3, m1 540 PRED8x8_LOWPASS m0, m1, m2, m3, m5 541 542 movh [r0+0*FDEC_STRIDEB], m4 543 movh [r0+1*FDEC_STRIDEB], m0 544 PSRLPIX m4, m4, 1 545 PSRLPIX m0, m0, 1 546 movh [r0+2*FDEC_STRIDEB], m4 547 movh [r0+3*FDEC_STRIDEB], m0 548 RET 549%endmacro 550 551%if HIGH_BIT_DEPTH 552INIT_XMM sse2 553PREDICT_4x4_V1 w 554INIT_XMM avx 555PREDICT_4x4_V1 w 556 557INIT_MMX mmx2 558cglobal predict_4x4_vl, 1,4 559 mova m1, [r0-FDEC_STRIDEB+0] 560 mova m2, [r0-FDEC_STRIDEB+8] 561 mova m0, m2 562 PALIGNR m2, m1, 4, m4 563 PALIGNR m0, m1, 2, m4 564 mova m3, m0 565 pavgw m3, m1 566 mova [r0+0*FDEC_STRIDEB], m3 567 psrlq m3, 16 568 mova [r0+2*FDEC_STRIDEB], m3 569 PRED8x8_LOWPASS m0, m1, m2, m0 570 mova [r0+1*FDEC_STRIDEB], m0 571 psrlq m0, 16 572 mova [r0+3*FDEC_STRIDEB], m0 573 574 movzx r1d, word [r0-FDEC_STRIDEB+ 8] 575 movzx r2d, word [r0-FDEC_STRIDEB+10] 576 movzx r3d, word [r0-FDEC_STRIDEB+12] 577 lea r1d, [r1+r2+1] 578 add r3d, r2d 579 lea r3d, [r3+r1+1] 580 shr r1d, 1 581 shr r3d, 2 582 mov [r0+2*FDEC_STRIDEB+6], r1w 583 mov [r0+3*FDEC_STRIDEB+6], r3w 584 RET 585%else ; !HIGH_BIT_DEPTH 586INIT_MMX mmx2 587PREDICT_4x4_V1 b 588%endif 589 590;----------------------------------------------------------------------------- 591; void predict_4x4_dc( pixel *src ) 592;----------------------------------------------------------------------------- 593INIT_MMX mmx2 594%if HIGH_BIT_DEPTH 
595cglobal predict_4x4_dc, 1,1 596 mova m2, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL] 597 paddw m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL] 598 paddw m2, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL] 599 paddw m2, [r0+3*FDEC_STRIDEB-4*SIZEOF_PIXEL] 600 psrlq m2, 48 601 mova m0, [r0-FDEC_STRIDEB] 602 HADDW m0, m1 603 paddw m0, [pw_4] 604 paddw m0, m2 605 psrlw m0, 3 606 SPLATW m0, m0 607 mova [r0+0*FDEC_STRIDEB], m0 608 mova [r0+1*FDEC_STRIDEB], m0 609 mova [r0+2*FDEC_STRIDEB], m0 610 mova [r0+3*FDEC_STRIDEB], m0 611 RET 612 613%else ; !HIGH_BIT_DEPTH 614cglobal predict_4x4_dc, 1,4 615 pxor mm7, mm7 616 movd mm0, [r0-FDEC_STRIDEB] 617 psadbw mm0, mm7 618 movd r3d, mm0 619 movzx r1d, byte [r0-1] 620%assign Y 1 621%rep 3 622 movzx r2d, byte [r0+FDEC_STRIDEB*Y-1] 623 add r1d, r2d 624%assign Y Y+1 625%endrep 626 lea r1d, [r1+r3+4] 627 shr r1d, 3 628 imul r1d, 0x01010101 629 mov [r0+FDEC_STRIDEB*0], r1d 630 mov [r0+FDEC_STRIDEB*1], r1d 631 mov [r0+FDEC_STRIDEB*2], r1d 632 mov [r0+FDEC_STRIDEB*3], r1d 633 RET 634%endif ; HIGH_BIT_DEPTH 635 636%macro PREDICT_FILTER 4 637;----------------------------------------------------------------------------- 638;void predict_8x8_filter( pixel *src, pixel edge[36], int i_neighbor, int i_filters ) 639;----------------------------------------------------------------------------- 640cglobal predict_8x8_filter, 4,6,6 641 add r0, 0x58*SIZEOF_PIXEL 642%define src r0-0x58*SIZEOF_PIXEL 643%if ARCH_X86_64 == 0 644 mov r4, r1 645%define t1 r4 646%define t4 r1 647%else 648%define t1 r1 649%define t4 r4 650%endif 651 test r3b, 1 652 je .check_top 653 mov t4d, r2d 654 and t4d, 8 655 neg t4 656 mova m0, [src+0*FDEC_STRIDEB-8*SIZEOF_PIXEL] 657 punpckh%1%2 m0, [src+0*FDEC_STRIDEB-8*SIZEOF_PIXEL+t4*(FDEC_STRIDEB/8)] 658 mova m1, [src+2*FDEC_STRIDEB-8*SIZEOF_PIXEL] 659 punpckh%1%2 m1, [src+1*FDEC_STRIDEB-8*SIZEOF_PIXEL] 660 punpckh%2%3 m1, m0 661 mova m2, [src+4*FDEC_STRIDEB-8*SIZEOF_PIXEL] 662 punpckh%1%2 m2, [src+3*FDEC_STRIDEB-8*SIZEOF_PIXEL] 663 mova m3, 
[src+6*FDEC_STRIDEB-8*SIZEOF_PIXEL] 664 punpckh%1%2 m3, [src+5*FDEC_STRIDEB-8*SIZEOF_PIXEL] 665 punpckh%2%3 m3, m2 666 punpckh%3%4 m3, m1 667 mova m0, [src+7*FDEC_STRIDEB-8*SIZEOF_PIXEL] 668 mova m1, [src-1*FDEC_STRIDEB] 669 PALIGNR m4, m3, m0, 7*SIZEOF_PIXEL, m0 670 PALIGNR m1, m1, m3, 1*SIZEOF_PIXEL, m2 671 PRED8x8_LOWPASS m3, m1, m4, m3, m5 672 mova [t1+8*SIZEOF_PIXEL], m3 673 movzx t4d, pixel [src+7*FDEC_STRIDEB-1*SIZEOF_PIXEL] 674 movzx r5d, pixel [src+6*FDEC_STRIDEB-1*SIZEOF_PIXEL] 675 lea t4d, [t4*3+2] 676 add t4d, r5d 677 shr t4d, 2 678 mov [t1+7*SIZEOF_PIXEL], t4%1 679 mov [t1+6*SIZEOF_PIXEL], t4%1 680 test r3b, 2 681 je .done 682.check_top: 683%if SIZEOF_PIXEL==1 && cpuflag(ssse3) 684INIT_XMM cpuname 685 movu m3, [src-1*FDEC_STRIDEB] 686 movhps m0, [src-1*FDEC_STRIDEB-8] 687 test r2b, 8 688 je .fix_lt_2 689.do_top: 690 and r2d, 4 691%if ARCH_X86_64 692 lea r3, [shuf_fixtr] 693 pshufb m3, [r3+r2*4] 694%else 695 pshufb m3, [shuf_fixtr+r2*4] ; neighbor&MB_TOPRIGHT ? shuf_nop : shuf_fixtr 696%endif 697 psrldq m1, m3, 15 698 PALIGNR m2, m3, m0, 15, m0 699 PALIGNR m1, m3, 1, m5 700 PRED8x8_LOWPASS m0, m2, m1, m3, m5 701 mova [t1+16*SIZEOF_PIXEL], m0 702 psrldq m0, 15 703 movd [t1+32*SIZEOF_PIXEL], m0 704.done: 705 REP_RET 706.fix_lt_2: 707 pslldq m0, m3, 15 708 jmp .do_top 709 710%else 711 mova m0, [src-1*FDEC_STRIDEB-8*SIZEOF_PIXEL] 712 mova m3, [src-1*FDEC_STRIDEB] 713 mova m1, [src-1*FDEC_STRIDEB+8*SIZEOF_PIXEL] 714 test r2b, 8 715 je .fix_lt_2 716 test r2b, 4 717 je .fix_tr_1 718.do_top: 719 PALIGNR m2, m3, m0, 7*SIZEOF_PIXEL, m0 720 PALIGNR m0, m1, m3, 1*SIZEOF_PIXEL, m5 721 PRED8x8_LOWPASS m4, m2, m0, m3, m5 722 mova [t1+16*SIZEOF_PIXEL], m4 723 test r3b, 4 724 je .done 725 PSRLPIX m5, m1, 7 726 PALIGNR m2, m1, m3, 7*SIZEOF_PIXEL, m3 727 PALIGNR m5, m1, 1*SIZEOF_PIXEL, m4 728 PRED8x8_LOWPASS m0, m2, m5, m1, m4 729 mova [t1+24*SIZEOF_PIXEL], m0 730 PSRLPIX m0, m0, 7 731 movd [t1+32*SIZEOF_PIXEL], m0 732.done: 733 REP_RET 734.fix_lt_2: 735 PSLLPIX m0, m3, 7 
736 test r2b, 4 737 jne .do_top 738.fix_tr_1: 739 punpckh%1%2 m1, m3, m3 740 pshuf%2 m1, m1, q3333 741 jmp .do_top 742%endif 743%endmacro 744 745%if HIGH_BIT_DEPTH 746INIT_XMM sse2 747PREDICT_FILTER w, d, q, dq 748INIT_XMM ssse3 749PREDICT_FILTER w, d, q, dq 750INIT_XMM avx 751PREDICT_FILTER w, d, q, dq 752%else 753INIT_MMX mmx2 754PREDICT_FILTER b, w, d, q 755INIT_MMX ssse3 756PREDICT_FILTER b, w, d, q 757%endif 758 759;----------------------------------------------------------------------------- 760; void predict_8x8_v( pixel *src, pixel *edge ) 761;----------------------------------------------------------------------------- 762%macro PREDICT_8x8_V 0 763cglobal predict_8x8_v, 2,2 764 mova m0, [r1+16*SIZEOF_PIXEL] 765 STORE8 m0 766 RET 767%endmacro 768 769%if HIGH_BIT_DEPTH 770INIT_XMM sse 771PREDICT_8x8_V 772%else 773INIT_MMX mmx2 774PREDICT_8x8_V 775%endif 776 777;----------------------------------------------------------------------------- 778; void predict_8x8_h( pixel *src, pixel edge[36] ) 779;----------------------------------------------------------------------------- 780%macro PREDICT_8x8_H 2 781cglobal predict_8x8_h, 2,2 782 movu m1, [r1+7*SIZEOF_PIXEL] 783 add r0, 4*FDEC_STRIDEB 784 punpckl%1 m2, m1, m1 785 punpckh%1 m1, m1 786%assign Y 0 787%rep 8 788%assign i 1+Y/4 789 SPLAT%2 m0, m %+ i, (3-Y)&3 790 mova [r0+(Y-4)*FDEC_STRIDEB], m0 791%assign Y Y+1 792%endrep 793 RET 794%endmacro 795 796%if HIGH_BIT_DEPTH 797INIT_XMM sse2 798PREDICT_8x8_H wd, D 799%else 800INIT_MMX mmx2 801PREDICT_8x8_H bw, W 802%endif 803 804;----------------------------------------------------------------------------- 805; void predict_8x8_dc( pixel *src, pixel *edge ); 806;----------------------------------------------------------------------------- 807%if HIGH_BIT_DEPTH 808INIT_XMM sse2 809cglobal predict_8x8_dc, 2,2 810 movu m0, [r1+14] 811 paddw m0, [r1+32] 812 HADDW m0, m1 813 paddw m0, [pw_8] 814 psrlw m0, 4 815 SPLATW m0, m0 816 STORE8 m0 817 RET 818 819%else ; 
!HIGH_BIT_DEPTH 820INIT_MMX mmx2 821cglobal predict_8x8_dc, 2,2 822 pxor mm0, mm0 823 pxor mm1, mm1 824 psadbw mm0, [r1+7] 825 psadbw mm1, [r1+16] 826 paddw mm0, [pw_8] 827 paddw mm0, mm1 828 psrlw mm0, 4 829 pshufw mm0, mm0, 0 830 packuswb mm0, mm0 831 STORE8 mm0 832 RET 833%endif ; HIGH_BIT_DEPTH 834 835;----------------------------------------------------------------------------- 836; void predict_8x8_dc_top ( pixel *src, pixel *edge ); 837; void predict_8x8_dc_left( pixel *src, pixel *edge ); 838;----------------------------------------------------------------------------- 839%if HIGH_BIT_DEPTH 840%macro PREDICT_8x8_DC 3 841cglobal %1, 2,2 842 %3 m0, [r1+%2] 843 HADDW m0, m1 844 paddw m0, [pw_4] 845 psrlw m0, 3 846 SPLATW m0, m0 847 STORE8 m0 848 RET 849%endmacro 850INIT_XMM sse2 851PREDICT_8x8_DC predict_8x8_dc_top , 32, mova 852PREDICT_8x8_DC predict_8x8_dc_left, 14, movu 853 854%else ; !HIGH_BIT_DEPTH 855%macro PREDICT_8x8_DC 2 856cglobal %1, 2,2 857 pxor mm0, mm0 858 psadbw mm0, [r1+%2] 859 paddw mm0, [pw_4] 860 psrlw mm0, 3 861 pshufw mm0, mm0, 0 862 packuswb mm0, mm0 863 STORE8 mm0 864 RET 865%endmacro 866INIT_MMX 867PREDICT_8x8_DC predict_8x8_dc_top_mmx2, 16 868PREDICT_8x8_DC predict_8x8_dc_left_mmx2, 7 869%endif ; HIGH_BIT_DEPTH 870 871; sse2 is faster even on amd for 8-bit, so there's no sense in spending exe 872; size on the 8-bit mmx functions below if we know sse2 is available. 
873%macro PREDICT_8x8_DDLR 0 874;----------------------------------------------------------------------------- 875; void predict_8x8_ddl( pixel *src, pixel *edge ) 876;----------------------------------------------------------------------------- 877cglobal predict_8x8_ddl, 2,2,7 878 mova m0, [r1+16*SIZEOF_PIXEL] 879 mova m1, [r1+24*SIZEOF_PIXEL] 880%if cpuflag(cache64) 881 movd m5, [r1+32*SIZEOF_PIXEL] 882 palignr m3, m1, m0, 1*SIZEOF_PIXEL 883 palignr m5, m5, m1, 1*SIZEOF_PIXEL 884 palignr m4, m1, m0, 7*SIZEOF_PIXEL 885%else 886 movu m3, [r1+17*SIZEOF_PIXEL] 887 movu m4, [r1+23*SIZEOF_PIXEL] 888 movu m5, [r1+25*SIZEOF_PIXEL] 889%endif 890 PSLLPIX m2, m0, 1 891 add r0, FDEC_STRIDEB*4 892 PRED8x8_LOWPASS m0, m2, m3, m0, m6 893 PRED8x8_LOWPASS m1, m4, m5, m1, m6 894 mova [r0+3*FDEC_STRIDEB], m1 895%assign Y 2 896%rep 6 897 PALIGNR m1, m0, 7*SIZEOF_PIXEL, m2 898 PSLLPIX m0, m0, 1 899 mova [r0+Y*FDEC_STRIDEB], m1 900%assign Y (Y-1) 901%endrep 902 PALIGNR m1, m0, 7*SIZEOF_PIXEL, m0 903 mova [r0+Y*FDEC_STRIDEB], m1 904 RET 905 906;----------------------------------------------------------------------------- 907; void predict_8x8_ddr( pixel *src, pixel *edge ) 908;----------------------------------------------------------------------------- 909cglobal predict_8x8_ddr, 2,2,7 910 add r0, FDEC_STRIDEB*4 911 mova m0, [r1+ 8*SIZEOF_PIXEL] 912 mova m1, [r1+16*SIZEOF_PIXEL] 913 ; edge[] is 32byte aligned, so some of the unaligned loads are known to be not cachesplit 914 movu m2, [r1+ 7*SIZEOF_PIXEL] 915 movu m5, [r1+17*SIZEOF_PIXEL] 916%if cpuflag(cache64) 917 palignr m3, m1, m0, 1*SIZEOF_PIXEL 918 palignr m4, m1, m0, 7*SIZEOF_PIXEL 919%else 920 movu m3, [r1+ 9*SIZEOF_PIXEL] 921 movu m4, [r1+15*SIZEOF_PIXEL] 922%endif 923 PRED8x8_LOWPASS m0, m2, m3, m0, m6 924 PRED8x8_LOWPASS m1, m4, m5, m1, m6 925 mova [r0+3*FDEC_STRIDEB], m0 926%assign Y -4 927%rep 6 928 PALIGNR m1, m0, 7*SIZEOF_PIXEL, m2 929 PSLLPIX m0, m0, 1 930 mova [r0+Y*FDEC_STRIDEB], m1 931%assign Y (Y+1) 932%endrep 933 
PALIGNR m1, m0, 7*SIZEOF_PIXEL, m0 934 mova [r0+Y*FDEC_STRIDEB], m1 935 RET 936%endmacro ; PREDICT_8x8_DDLR 937 938%if HIGH_BIT_DEPTH 939INIT_XMM sse2 940PREDICT_8x8_DDLR 941INIT_XMM ssse3 942PREDICT_8x8_DDLR 943INIT_XMM cache64, ssse3 944PREDICT_8x8_DDLR 945%elif ARCH_X86_64 == 0 946INIT_MMX mmx2 947PREDICT_8x8_DDLR 948%endif 949 950;----------------------------------------------------------------------------- 951; void predict_8x8_hu( pixel *src, pixel *edge ) 952;----------------------------------------------------------------------------- 953%macro PREDICT_8x8_HU 2 954cglobal predict_8x8_hu, 2,2,8 955 add r0, 4*FDEC_STRIDEB 956%if HIGH_BIT_DEPTH 957%if cpuflag(ssse3) 958 movu m5, [r1+7*SIZEOF_PIXEL] 959 pshufb m5, [pw_reverse] 960%else 961 movq m6, [r1+7*SIZEOF_PIXEL] 962 movq m5, [r1+11*SIZEOF_PIXEL] 963 pshuflw m6, m6, q0123 964 pshuflw m5, m5, q0123 965 movlhps m5, m6 966%endif ; cpuflag 967 psrldq m2, m5, 2 968 pshufd m3, m5, q0321 969 pshufhw m2, m2, q2210 970 pshufhw m3, m3, q1110 971 pavgw m4, m5, m2 972%else ; !HIGH_BIT_DEPTH 973 movu m1, [r1+7*SIZEOF_PIXEL] ; l0 l1 l2 l3 l4 l5 l6 l7 974 pshufw m0, m1, q0123 ; l6 l7 l4 l5 l2 l3 l0 l1 975 psllq m1, 56 ; l7 .. .. .. .. .. .. .. 
976 mova m2, m0 977 psllw m0, 8 978 psrlw m2, 8 979 por m2, m0 980 mova m3, m2 981 mova m4, m2 982 mova m5, m2 ; l7 l6 l5 l4 l3 l2 l1 l0 983 psrlq m3, 16 984 psrlq m2, 8 985 por m2, m1 ; l7 l7 l6 l5 l4 l3 l2 l1 986 punpckhbw m1, m1 987 por m3, m1 ; l7 l7 l7 l6 l5 l4 l3 l2 988 pavgb m4, m2 989%endif ; !HIGH_BIT_DEPTH 990 PRED8x8_LOWPASS m2, m3, m5, m2, m6 991 punpckh%2 m0, m4, m2 ; p8 p7 p6 p5 992 punpckl%2 m4, m2 ; p4 p3 p2 p1 993 PALIGNR m5, m0, m4, 2*SIZEOF_PIXEL, m3 994 pshuf%1 m1, m0, q3321 995 PALIGNR m6, m0, m4, 4*SIZEOF_PIXEL, m3 996 pshuf%1 m2, m0, q3332 997 PALIGNR m7, m0, m4, 6*SIZEOF_PIXEL, m3 998 pshuf%1 m3, m0, q3333 999 mova [r0-4*FDEC_STRIDEB], m4 1000 mova [r0-3*FDEC_STRIDEB], m5 1001 mova [r0-2*FDEC_STRIDEB], m6 1002 mova [r0-1*FDEC_STRIDEB], m7 1003 mova [r0+0*FDEC_STRIDEB], m0 1004 mova [r0+1*FDEC_STRIDEB], m1 1005 mova [r0+2*FDEC_STRIDEB], m2 1006 mova [r0+3*FDEC_STRIDEB], m3 1007 RET 1008%endmacro 1009 1010%if HIGH_BIT_DEPTH 1011INIT_XMM sse2 1012PREDICT_8x8_HU d, wd 1013INIT_XMM ssse3 1014PREDICT_8x8_HU d, wd 1015INIT_XMM avx 1016PREDICT_8x8_HU d, wd 1017%elif ARCH_X86_64 == 0 1018INIT_MMX mmx2 1019PREDICT_8x8_HU w, bw 1020%endif 1021 1022;----------------------------------------------------------------------------- 1023; void predict_8x8_vr( pixel *src, pixel *edge ) 1024;----------------------------------------------------------------------------- 1025%macro PREDICT_8x8_VR 1 1026cglobal predict_8x8_vr, 2,3 1027 mova m2, [r1+16*SIZEOF_PIXEL] 1028%ifidn cpuname, ssse3 1029 mova m0, [r1+8*SIZEOF_PIXEL] 1030 palignr m3, m2, m0, 7*SIZEOF_PIXEL 1031 palignr m1, m2, m0, 6*SIZEOF_PIXEL 1032%else 1033 movu m3, [r1+15*SIZEOF_PIXEL] 1034 movu m1, [r1+14*SIZEOF_PIXEL] 1035%endif 1036 pavg%1 m4, m3, m2 1037 add r0, FDEC_STRIDEB*4 1038 PRED8x8_LOWPASS m3, m1, m2, m3, m5 1039 mova [r0-4*FDEC_STRIDEB], m4 1040 mova [r0-3*FDEC_STRIDEB], m3 1041 mova m1, [r1+8*SIZEOF_PIXEL] 1042 PSLLPIX m0, m1, 1 1043 PSLLPIX m2, m1, 2 1044 PRED8x8_LOWPASS m0, m1, m2, m0, m6 
1045 1046%assign Y -2 1047%rep 5 1048 PALIGNR m4, m0, 7*SIZEOF_PIXEL, m5 1049 mova [r0+Y*FDEC_STRIDEB], m4 1050 PSLLPIX m0, m0, 1 1051 SWAP 3, 4 1052%assign Y (Y+1) 1053%endrep 1054 PALIGNR m4, m0, 7*SIZEOF_PIXEL, m0 1055 mova [r0+Y*FDEC_STRIDEB], m4 1056 RET 1057%endmacro 1058 1059%if HIGH_BIT_DEPTH 1060INIT_XMM sse2 1061PREDICT_8x8_VR w 1062INIT_XMM ssse3 1063PREDICT_8x8_VR w 1064INIT_XMM avx 1065PREDICT_8x8_VR w 1066%elif ARCH_X86_64 == 0 1067INIT_MMX mmx2 1068PREDICT_8x8_VR b 1069%endif 1070 1071%macro LOAD_PLANE_ARGS 0 1072%if cpuflag(avx2) && ARCH_X86_64 == 0 1073 vpbroadcastw m0, r1m 1074 vpbroadcastw m2, r2m 1075 vpbroadcastw m4, r3m 1076%elif mmsize == 8 ; MMX is only used on x86_32 1077 SPLATW m0, r1m 1078 SPLATW m2, r2m 1079 SPLATW m4, r3m 1080%else 1081 movd xm0, r1m 1082 movd xm2, r2m 1083 movd xm4, r3m 1084 SPLATW m0, xm0 1085 SPLATW m2, xm2 1086 SPLATW m4, xm4 1087%endif 1088%endmacro 1089 1090;----------------------------------------------------------------------------- 1091; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c ) 1092;----------------------------------------------------------------------------- 1093%if ARCH_X86_64 == 0 && HIGH_BIT_DEPTH == 0 1094%macro PREDICT_CHROMA_P_MMX 1 1095cglobal predict_8x%1c_p_core, 1,2 1096 LOAD_PLANE_ARGS 1097 movq m1, m2 1098 pmullw m2, [pw_0to15] 1099 psllw m1, 2 1100 paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b} 1101 paddsw m1, m0 ; m1 = {i+4*b, i+5*b, i+6*b, i+7*b} 1102 mov r1d, %1 1103ALIGN 4 1104.loop: 1105 movq m5, m0 1106 movq m6, m1 1107 psraw m5, 5 1108 psraw m6, 5 1109 packuswb m5, m6 1110 movq [r0], m5 1111 1112 paddsw m0, m4 1113 paddsw m1, m4 1114 add r0, FDEC_STRIDE 1115 dec r1d 1116 jg .loop 1117 RET 1118%endmacro ; PREDICT_CHROMA_P_MMX 1119 1120INIT_MMX mmx2 1121PREDICT_CHROMA_P_MMX 8 1122PREDICT_CHROMA_P_MMX 16 1123%endif ; !ARCH_X86_64 && !HIGH_BIT_DEPTH 1124 1125%macro PREDICT_CHROMA_P 1 1126%if HIGH_BIT_DEPTH 1127cglobal predict_8x%1c_p_core, 1,2,7 1128 LOAD_PLANE_ARGS 
1129 mova m3, [pw_pixel_max] 1130 pxor m1, m1 1131 pmullw m2, [pw_43210123] ; b 1132%if %1 == 16 1133 pmullw m5, m4, [pw_m7] ; c 1134%else 1135 pmullw m5, m4, [pw_m3] 1136%endif 1137 paddw m5, [pw_16] 1138%if mmsize == 32 1139 mova xm6, xm4 1140 paddw m4, m4 1141 paddw m5, m6 1142%endif 1143 mov r1d, %1/(mmsize/16) 1144.loop: 1145 paddsw m6, m2, m5 1146 paddsw m6, m0 1147 psraw m6, 5 1148 CLIPW m6, m1, m3 1149 paddw m5, m4 1150%if mmsize == 32 1151 vextracti128 [r0], m6, 1 1152 mova [r0+FDEC_STRIDEB], xm6 1153 add r0, 2*FDEC_STRIDEB 1154%else 1155 mova [r0], m6 1156 add r0, FDEC_STRIDEB 1157%endif 1158 dec r1d 1159 jg .loop 1160 RET 1161%else ; !HIGH_BIT_DEPTH 1162cglobal predict_8x%1c_p_core, 1,2 1163 LOAD_PLANE_ARGS 1164%if mmsize == 32 1165 vbroadcasti128 m1, [pw_0to15] ; 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 1166 pmullw m2, m1 1167 mova xm1, xm4 ; zero upper half 1168 paddsw m4, m4 1169 paddsw m0, m1 1170%else 1171 pmullw m2, [pw_0to15] 1172%endif 1173 paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b} 1174 paddsw m1, m0, m4 1175 paddsw m4, m4 1176 mov r1d, %1/(mmsize/8) 1177.loop: 1178 psraw m2, m0, 5 1179 psraw m3, m1, 5 1180 paddsw m0, m4 1181 paddsw m1, m4 1182 packuswb m2, m3 1183%if mmsize == 32 1184 movq [r0+FDEC_STRIDE*1], xm2 1185 movhps [r0+FDEC_STRIDE*3], xm2 1186 vextracti128 xm2, m2, 1 1187 movq [r0+FDEC_STRIDE*0], xm2 1188 movhps [r0+FDEC_STRIDE*2], xm2 1189%else 1190 movq [r0+FDEC_STRIDE*0], xm2 1191 movhps [r0+FDEC_STRIDE*1], xm2 1192%endif 1193 add r0, FDEC_STRIDE*mmsize/8 1194 dec r1d 1195 jg .loop 1196 RET 1197%endif ; HIGH_BIT_DEPTH 1198%endmacro ; PREDICT_CHROMA_P 1199 1200INIT_XMM sse2 1201PREDICT_CHROMA_P 8 1202PREDICT_CHROMA_P 16 1203INIT_XMM avx 1204PREDICT_CHROMA_P 8 1205PREDICT_CHROMA_P 16 1206INIT_YMM avx2 1207PREDICT_CHROMA_P 8 1208PREDICT_CHROMA_P 16 1209 1210;----------------------------------------------------------------------------- 1211; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c ) 
1212;----------------------------------------------------------------------------- 1213%if HIGH_BIT_DEPTH == 0 && ARCH_X86_64 == 0 1214INIT_MMX mmx2 1215cglobal predict_16x16_p_core, 1,2 1216 LOAD_PLANE_ARGS 1217 movq mm5, mm2 1218 movq mm1, mm2 1219 pmullw mm5, [pw_0to15] 1220 psllw mm2, 3 1221 psllw mm1, 2 1222 movq mm3, mm2 1223 paddsw mm0, mm5 ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b} 1224 paddsw mm1, mm0 ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b} 1225 paddsw mm2, mm0 ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b} 1226 paddsw mm3, mm1 ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b} 1227 1228 mov r1d, 16 1229ALIGN 4 1230.loop: 1231 movq mm5, mm0 1232 movq mm6, mm1 1233 psraw mm5, 5 1234 psraw mm6, 5 1235 packuswb mm5, mm6 1236 movq [r0], mm5 1237 1238 movq mm5, mm2 1239 movq mm6, mm3 1240 psraw mm5, 5 1241 psraw mm6, 5 1242 packuswb mm5, mm6 1243 movq [r0+8], mm5 1244 1245 paddsw mm0, mm4 1246 paddsw mm1, mm4 1247 paddsw mm2, mm4 1248 paddsw mm3, mm4 1249 add r0, FDEC_STRIDE 1250 dec r1d 1251 jg .loop 1252 RET 1253%endif ; !HIGH_BIT_DEPTH && !ARCH_X86_64 1254 1255%macro PREDICT_16x16_P 0 1256cglobal predict_16x16_p_core, 1,2,8 1257 movd m0, r1m 1258 movd m1, r2m 1259 movd m2, r3m 1260 SPLATW m0, m0, 0 1261 SPLATW m1, m1, 0 1262 SPLATW m2, m2, 0 1263 pmullw m3, m1, [pw_0to15] 1264 psllw m1, 3 1265%if HIGH_BIT_DEPTH 1266 pxor m6, m6 1267 mov r1d, 16 1268.loop: 1269 mova m4, m0 1270 mova m5, m0 1271 mova m7, m3 1272 paddsw m7, m6 1273 paddsw m4, m7 1274 paddsw m7, m1 1275 paddsw m5, m7 1276 psraw m4, 5 1277 psraw m5, 5 1278 CLIPW m4, [pb_0], [pw_pixel_max] 1279 CLIPW m5, [pb_0], [pw_pixel_max] 1280 mova [r0], m4 1281 mova [r0+16], m5 1282 add r0, FDEC_STRIDEB 1283 paddw m6, m2 1284%else ; !HIGH_BIT_DEPTH 1285 paddsw m0, m3 ; m0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b} 1286 paddsw m1, m0 ; m1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b} 1287 paddsw m7, m2, m2 1288 mov r1d, 8 1289ALIGN 4 1290.loop: 1291 psraw m3, m0, 5 1292 psraw m4, 
m1, 5 1293 paddsw m5, m0, m2 1294 paddsw m6, m1, m2 1295 psraw m5, 5 1296 psraw m6, 5 1297 packuswb m3, m4 1298 packuswb m5, m6 1299 mova [r0+FDEC_STRIDE*0], m3 1300 mova [r0+FDEC_STRIDE*1], m5 1301 paddsw m0, m7 1302 paddsw m1, m7 1303 add r0, FDEC_STRIDE*2 1304%endif ; !HIGH_BIT_DEPTH 1305 dec r1d 1306 jg .loop 1307 RET 1308%endmacro ; PREDICT_16x16_P 1309 1310INIT_XMM sse2 1311PREDICT_16x16_P 1312%if HIGH_BIT_DEPTH == 0 1313INIT_XMM avx 1314PREDICT_16x16_P 1315%endif 1316 1317INIT_YMM avx2 1318cglobal predict_16x16_p_core, 1,2,8*HIGH_BIT_DEPTH 1319 LOAD_PLANE_ARGS 1320%if HIGH_BIT_DEPTH 1321 pmullw m2, [pw_0to15] 1322 pxor m5, m5 1323 pxor m6, m6 1324 mova m7, [pw_pixel_max] 1325 mov r1d, 8 1326.loop: 1327 paddsw m1, m2, m5 1328 paddw m5, m4 1329 paddsw m1, m0 1330 paddsw m3, m2, m5 1331 psraw m1, 5 1332 paddsw m3, m0 1333 psraw m3, 5 1334 CLIPW m1, m6, m7 1335 mova [r0+0*FDEC_STRIDEB], m1 1336 CLIPW m3, m6, m7 1337 mova [r0+1*FDEC_STRIDEB], m3 1338 paddw m5, m4 1339 add r0, 2*FDEC_STRIDEB 1340%else ; !HIGH_BIT_DEPTH 1341 vbroadcasti128 m1, [pw_0to15] 1342 mova xm3, xm4 ; zero high bits 1343 pmullw m1, m2 1344 psllw m2, 3 1345 paddsw m0, m3 1346 paddsw m0, m1 ; X+1*C X+0*C 1347 paddsw m1, m0, m2 ; Y+1*C Y+0*C 1348 paddsw m4, m4 1349 mov r1d, 4 1350.loop: 1351 psraw m2, m0, 5 1352 psraw m3, m1, 5 1353 paddsw m0, m4 1354 paddsw m1, m4 1355 packuswb m2, m3 ; X+1*C Y+1*C X+0*C Y+0*C 1356 vextracti128 [r0+0*FDEC_STRIDE], m2, 1 1357 mova [r0+1*FDEC_STRIDE], xm2 1358 psraw m2, m0, 5 1359 psraw m3, m1, 5 1360 paddsw m0, m4 1361 paddsw m1, m4 1362 packuswb m2, m3 ; X+3*C Y+3*C X+2*C Y+2*C 1363 vextracti128 [r0+2*FDEC_STRIDE], m2, 1 1364 mova [r0+3*FDEC_STRIDE], xm2 1365 add r0, FDEC_STRIDE*4 1366%endif ; !HIGH_BIT_DEPTH 1367 dec r1d 1368 jg .loop 1369 RET 1370 1371%if HIGH_BIT_DEPTH == 0 1372%macro PREDICT_8x8 0 1373;----------------------------------------------------------------------------- 1374; void predict_8x8_ddl( uint8_t *src, uint8_t *edge ) 
1375;----------------------------------------------------------------------------- 1376cglobal predict_8x8_ddl, 2,2 1377 mova m0, [r1+16] 1378%ifidn cpuname, ssse3 1379 movd m2, [r1+32] 1380 palignr m2, m0, 1 1381%else 1382 movu m2, [r1+17] 1383%endif 1384 pslldq m1, m0, 1 1385 add r0, FDEC_STRIDE*4 1386 PRED8x8_LOWPASS m0, m1, m2, m0, m3 1387 1388%assign Y -4 1389%rep 8 1390 psrldq m0, 1 1391 movq [r0+Y*FDEC_STRIDE], m0 1392%assign Y (Y+1) 1393%endrep 1394 RET 1395 1396%ifnidn cpuname, ssse3 1397;----------------------------------------------------------------------------- 1398; void predict_8x8_ddr( uint8_t *src, uint8_t *edge ) 1399;----------------------------------------------------------------------------- 1400cglobal predict_8x8_ddr, 2,2 1401 movu m0, [r1+8] 1402 movu m1, [r1+7] 1403 psrldq m2, m0, 1 1404 add r0, FDEC_STRIDE*4 1405 PRED8x8_LOWPASS m0, m1, m2, m0, m3 1406 1407 psrldq m1, m0, 1 1408%assign Y 3 1409%rep 3 1410 movq [r0+Y*FDEC_STRIDE], m0 1411 movq [r0+(Y-1)*FDEC_STRIDE], m1 1412 psrldq m0, 2 1413 psrldq m1, 2 1414%assign Y (Y-2) 1415%endrep 1416 movq [r0-3*FDEC_STRIDE], m0 1417 movq [r0-4*FDEC_STRIDE], m1 1418 RET 1419 1420;----------------------------------------------------------------------------- 1421; void predict_8x8_vl( uint8_t *src, uint8_t *edge ) 1422;----------------------------------------------------------------------------- 1423cglobal predict_8x8_vl, 2,2 1424 mova m0, [r1+16] 1425 pslldq m1, m0, 1 1426 psrldq m2, m0, 1 1427 pavgb m3, m0, m2 1428 add r0, FDEC_STRIDE*4 1429 PRED8x8_LOWPASS m0, m1, m2, m0, m5 1430; m0: (t0 + 2*t1 + t2 + 2) >> 2 1431; m3: (t0 + t1 + 1) >> 1 1432 1433%assign Y -4 1434%rep 3 1435 psrldq m0, 1 1436 movq [r0+ Y *FDEC_STRIDE], m3 1437 movq [r0+(Y+1)*FDEC_STRIDE], m0 1438 psrldq m3, 1 1439%assign Y (Y+2) 1440%endrep 1441 psrldq m0, 1 1442 movq [r0+ Y *FDEC_STRIDE], m3 1443 movq [r0+(Y+1)*FDEC_STRIDE], m0 1444 RET 1445%endif ; !ssse3 1446 
1447;----------------------------------------------------------------------------- 1448; void predict_8x8_vr( uint8_t *src, uint8_t *edge ) 1449;----------------------------------------------------------------------------- 1450cglobal predict_8x8_vr, 2,2 1451 movu m2, [r1+8] 1452 add r0, 4*FDEC_STRIDE 1453 pslldq m1, m2, 2 1454 pslldq m0, m2, 1 1455 pavgb m3, m2, m0 1456 PRED8x8_LOWPASS m0, m2, m1, m0, m4 1457 movhps [r0-4*FDEC_STRIDE], m3 1458 movhps [r0-3*FDEC_STRIDE], m0 1459%if cpuflag(ssse3) 1460 punpckhqdq m3, m3 1461 pshufb m0, [shuf_vr] 1462 palignr m3, m0, 13 1463%else 1464 mova m2, m0 1465 mova m1, [pw_00ff] 1466 pand m1, m0 1467 psrlw m0, 8 1468 packuswb m1, m0 1469 pslldq m1, 4 1470 movhlps m3, m1 1471 shufps m1, m2, q3210 1472 psrldq m3, 5 1473 psrldq m1, 5 1474 SWAP 0, 1 1475%endif 1476 movq [r0+3*FDEC_STRIDE], m0 1477 movq [r0+2*FDEC_STRIDE], m3 1478 psrldq m0, 1 1479 psrldq m3, 1 1480 movq [r0+1*FDEC_STRIDE], m0 1481 movq [r0+0*FDEC_STRIDE], m3 1482 psrldq m0, 1 1483 psrldq m3, 1 1484 movq [r0-1*FDEC_STRIDE], m0 1485 movq [r0-2*FDEC_STRIDE], m3 1486 RET 1487%endmacro ; PREDICT_8x8 1488 1489INIT_XMM sse2 1490PREDICT_8x8 1491INIT_XMM ssse3 1492PREDICT_8x8 1493INIT_XMM avx 1494PREDICT_8x8 1495 1496%endif ; !HIGH_BIT_DEPTH 1497 1498;----------------------------------------------------------------------------- 1499; void predict_8x8_vl( pixel *src, pixel *edge ) 1500;----------------------------------------------------------------------------- 1501%macro PREDICT_8x8_VL_10 1 1502cglobal predict_8x8_vl, 2,2,8 1503 mova m0, [r1+16*SIZEOF_PIXEL] 1504 mova m1, [r1+24*SIZEOF_PIXEL] 1505 PALIGNR m2, m1, m0, SIZEOF_PIXEL*1, m4 1506 PSRLPIX m4, m1, 1 1507 pavg%1 m6, m0, m2 1508 pavg%1 m7, m1, m4 1509 add r0, FDEC_STRIDEB*4 1510 mova [r0-4*FDEC_STRIDEB], m6 1511 PALIGNR m3, m7, m6, SIZEOF_PIXEL*1, m5 1512 mova [r0-2*FDEC_STRIDEB], m3 1513 PALIGNR m3, m7, m6, SIZEOF_PIXEL*2, m5 1514 mova [r0+0*FDEC_STRIDEB], m3 1515 PALIGNR m7, m7, m6, SIZEOF_PIXEL*3, m5 1516 mova 
[r0+2*FDEC_STRIDEB], m7 1517 PALIGNR m3, m1, m0, SIZEOF_PIXEL*7, m6 1518 PSLLPIX m5, m0, 1 1519 PRED8x8_LOWPASS m0, m5, m2, m0, m7 1520 PRED8x8_LOWPASS m1, m3, m4, m1, m7 1521 PALIGNR m4, m1, m0, SIZEOF_PIXEL*1, m2 1522 mova [r0-3*FDEC_STRIDEB], m4 1523 PALIGNR m4, m1, m0, SIZEOF_PIXEL*2, m2 1524 mova [r0-1*FDEC_STRIDEB], m4 1525 PALIGNR m4, m1, m0, SIZEOF_PIXEL*3, m2 1526 mova [r0+1*FDEC_STRIDEB], m4 1527 PALIGNR m1, m1, m0, SIZEOF_PIXEL*4, m2 1528 mova [r0+3*FDEC_STRIDEB], m1 1529 RET 1530%endmacro 1531%if HIGH_BIT_DEPTH 1532INIT_XMM sse2 1533PREDICT_8x8_VL_10 w 1534INIT_XMM ssse3 1535PREDICT_8x8_VL_10 w 1536INIT_XMM avx 1537PREDICT_8x8_VL_10 w 1538%else 1539INIT_MMX mmx2 1540PREDICT_8x8_VL_10 b 1541%endif 1542 1543;----------------------------------------------------------------------------- 1544; void predict_8x8_hd( pixel *src, pixel *edge ) 1545;----------------------------------------------------------------------------- 1546%macro PREDICT_8x8_HD 2 1547cglobal predict_8x8_hd, 2,2 1548 add r0, 4*FDEC_STRIDEB 1549 mova m0, [r1+ 8*SIZEOF_PIXEL] ; lt l0 l1 l2 l3 l4 l5 l6 1550 movu m1, [r1+ 7*SIZEOF_PIXEL] ; l0 l1 l2 l3 l4 l5 l6 l7 1551%ifidn cpuname, ssse3 1552 mova m2, [r1+16*SIZEOF_PIXEL] ; t7 t6 t5 t4 t3 t2 t1 t0 1553 mova m4, m2 ; t7 t6 t5 t4 t3 t2 t1 t0 1554 palignr m2, m0, 7*SIZEOF_PIXEL ; t6 t5 t4 t3 t2 t1 t0 lt 1555 palignr m4, m0, 1*SIZEOF_PIXEL ; t0 lt l0 l1 l2 l3 l4 l5 1556%else 1557 movu m2, [r1+15*SIZEOF_PIXEL] 1558 movu m4, [r1+ 9*SIZEOF_PIXEL] 1559%endif ; cpuflag 1560 pavg%1 m3, m0, m1 1561 PRED8x8_LOWPASS m0, m4, m1, m0, m5 1562 PSRLPIX m4, m2, 2 ; .. .. t6 t5 t4 t3 t2 t1 1563 PSRLPIX m1, m2, 1 ; .. t6 t5 t4 t3 t2 t1 t0 1564 PRED8x8_LOWPASS m1, m4, m2, m1, m5 1565 ; .. 
p11 p10 p9 1566 punpckh%2 m2, m3, m0 ; p8 p7 p6 p5 1567 punpckl%2 m3, m0 ; p4 p3 p2 p1 1568 mova [r0+3*FDEC_STRIDEB], m3 1569 PALIGNR m0, m2, m3, 2*SIZEOF_PIXEL, m5 1570 mova [r0+2*FDEC_STRIDEB], m0 1571 PALIGNR m0, m2, m3, 4*SIZEOF_PIXEL, m5 1572 mova [r0+1*FDEC_STRIDEB], m0 1573 PALIGNR m0, m2, m3, 6*SIZEOF_PIXEL, m3 1574 mova [r0+0*FDEC_STRIDEB], m0 1575 mova [r0-1*FDEC_STRIDEB], m2 1576 PALIGNR m0, m1, m2, 2*SIZEOF_PIXEL, m5 1577 mova [r0-2*FDEC_STRIDEB], m0 1578 PALIGNR m0, m1, m2, 4*SIZEOF_PIXEL, m5 1579 mova [r0-3*FDEC_STRIDEB], m0 1580 PALIGNR m1, m1, m2, 6*SIZEOF_PIXEL, m2 1581 mova [r0-4*FDEC_STRIDEB], m1 1582 RET 1583%endmacro 1584 1585%if HIGH_BIT_DEPTH 1586INIT_XMM sse2 1587PREDICT_8x8_HD w, wd 1588INIT_XMM ssse3 1589PREDICT_8x8_HD w, wd 1590INIT_XMM avx 1591PREDICT_8x8_HD w, wd 1592%else 1593INIT_MMX mmx2 1594PREDICT_8x8_HD b, bw 1595 1596;----------------------------------------------------------------------------- 1597; void predict_8x8_hd( uint8_t *src, uint8_t *edge ) 1598;----------------------------------------------------------------------------- 1599%macro PREDICT_8x8_HD 0 1600cglobal predict_8x8_hd, 2,2 1601 add r0, 4*FDEC_STRIDE 1602 movu m1, [r1+7] 1603 movu m3, [r1+8] 1604 movu m2, [r1+9] 1605 pavgb m4, m1, m3 1606 PRED8x8_LOWPASS m0, m1, m2, m3, m5 1607 punpcklbw m4, m0 1608 movhlps m0, m4 1609 1610%assign Y 3 1611%rep 3 1612 movq [r0+(Y)*FDEC_STRIDE], m4 1613 movq [r0+(Y-4)*FDEC_STRIDE], m0 1614 psrldq m4, 2 1615 psrldq m0, 2 1616%assign Y (Y-1) 1617%endrep 1618 movq [r0+(Y)*FDEC_STRIDE], m4 1619 movq [r0+(Y-4)*FDEC_STRIDE], m0 1620 RET 1621%endmacro 1622 1623INIT_XMM sse2 1624PREDICT_8x8_HD 1625INIT_XMM avx 1626PREDICT_8x8_HD 1627%endif ; HIGH_BIT_DEPTH 1628 1629%if HIGH_BIT_DEPTH == 0 1630;----------------------------------------------------------------------------- 1631; void predict_8x8_hu( uint8_t *src, uint8_t *edge ) 1632;----------------------------------------------------------------------------- 1633INIT_MMX 1634cglobal 
predict_8x8_hu_sse2, 2,2 1635 add r0, 4*FDEC_STRIDE 1636 movq mm1, [r1+7] ; l0 l1 l2 l3 l4 l5 l6 l7 1637 pshufw mm0, mm1, q0123 ; l6 l7 l4 l5 l2 l3 l0 l1 1638 movq mm2, mm0 1639 psllw mm0, 8 1640 psrlw mm2, 8 1641 por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0 1642 psllq mm1, 56 ; l7 .. .. .. .. .. .. .. 1643 movq mm3, mm2 1644 movq mm4, mm2 1645 movq mm5, mm2 1646 psrlq mm2, 8 1647 psrlq mm3, 16 1648 por mm2, mm1 ; l7 l7 l6 l5 l4 l3 l2 l1 1649 punpckhbw mm1, mm1 1650 por mm3, mm1 ; l7 l7 l7 l6 l5 l4 l3 l2 1651 pavgb mm4, mm2 1652 PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6 1653 1654 movq2dq xmm0, mm4 1655 movq2dq xmm1, mm1 1656 punpcklbw xmm0, xmm1 1657 punpckhbw mm4, mm1 1658%assign Y -4 1659%rep 3 1660 movq [r0+Y*FDEC_STRIDE], xmm0 1661 psrldq xmm0, 2 1662%assign Y (Y+1) 1663%endrep 1664 pshufw mm5, mm4, q3321 1665 pshufw mm6, mm4, q3332 1666 pshufw mm7, mm4, q3333 1667 movq [r0+Y*FDEC_STRIDE], xmm0 1668 movq [r0+0*FDEC_STRIDE], mm4 1669 movq [r0+1*FDEC_STRIDE], mm5 1670 movq [r0+2*FDEC_STRIDE], mm6 1671 movq [r0+3*FDEC_STRIDE], mm7 1672 RET 1673 1674INIT_XMM 1675cglobal predict_8x8_hu_ssse3, 2,2 1676 add r0, 4*FDEC_STRIDE 1677 movq m3, [r1+7] 1678 pshufb m3, [shuf_hu] 1679 psrldq m1, m3, 1 1680 psrldq m2, m3, 2 1681 pavgb m0, m1, m3 1682 PRED8x8_LOWPASS m1, m3, m2, m1, m4 1683 punpcklbw m0, m1 1684%assign Y -4 1685%rep 3 1686 movq [r0+ Y *FDEC_STRIDE], m0 1687 movhps [r0+(Y+4)*FDEC_STRIDE], m0 1688 psrldq m0, 2 1689 pshufhw m0, m0, q2210 1690%assign Y (Y+1) 1691%endrep 1692 movq [r0+ Y *FDEC_STRIDE], m0 1693 movhps [r0+(Y+4)*FDEC_STRIDE], m0 1694 RET 1695%endif ; !HIGH_BIT_DEPTH 1696 1697;----------------------------------------------------------------------------- 1698; void predict_8x8c_v( uint8_t *src ) 1699;----------------------------------------------------------------------------- 1700 1701%macro PREDICT_8x8C_V 0 1702cglobal predict_8x8c_v, 1,1 1703 mova m0, [r0 - FDEC_STRIDEB] 1704 STORE8 m0 1705 RET 1706%endmacro 1707 1708%if HIGH_BIT_DEPTH 1709INIT_XMM sse 
1710PREDICT_8x8C_V 1711%else 1712INIT_MMX mmx 1713PREDICT_8x8C_V 1714%endif 1715 1716%if HIGH_BIT_DEPTH 1717 1718INIT_MMX 1719cglobal predict_8x8c_v_mmx, 1,1 1720 mova m0, [r0 - FDEC_STRIDEB] 1721 mova m1, [r0 - FDEC_STRIDEB + 8] 1722%assign Y 0 1723%rep 8 1724 mova [r0 + (Y&1)*FDEC_STRIDEB], m0 1725 mova [r0 + (Y&1)*FDEC_STRIDEB + 8], m1 1726%if (Y&1) && (Y!=7) 1727 add r0, FDEC_STRIDEB*2 1728%endif 1729%assign Y Y+1 1730%endrep 1731 RET 1732 1733%endif 1734 1735%macro PREDICT_8x16C_V 0 1736cglobal predict_8x16c_v, 1,1 1737 mova m0, [r0 - FDEC_STRIDEB] 1738 STORE16 m0 1739 RET 1740%endmacro 1741 1742%if HIGH_BIT_DEPTH 1743INIT_XMM sse 1744PREDICT_8x16C_V 1745%else 1746INIT_MMX mmx 1747PREDICT_8x16C_V 1748%endif 1749 1750;----------------------------------------------------------------------------- 1751; void predict_8x8c_h( uint8_t *src ) 1752;----------------------------------------------------------------------------- 1753%macro PREDICT_C_H 0 1754cglobal predict_8x8c_h, 1,1 1755%if cpuflag(ssse3) && notcpuflag(avx2) 1756 mova m2, [pb_3] 1757%endif 1758 PRED_H_4ROWS 8, 1 1759 PRED_H_4ROWS 8, 0 1760 RET 1761 1762cglobal predict_8x16c_h, 1,2 1763%if cpuflag(ssse3) && notcpuflag(avx2) 1764 mova m2, [pb_3] 1765%endif 1766 mov r1d, 4 1767.loop: 1768 PRED_H_4ROWS 8, 1 1769 dec r1d 1770 jg .loop 1771 RET 1772%endmacro 1773 1774INIT_MMX mmx2 1775PREDICT_C_H 1776%if HIGH_BIT_DEPTH 1777INIT_XMM sse2 1778PREDICT_C_H 1779INIT_XMM avx2 1780PREDICT_C_H 1781%else 1782INIT_MMX ssse3 1783PREDICT_C_H 1784%endif 1785 1786;----------------------------------------------------------------------------- 1787; void predict_8x8c_dc( pixel *src ) 1788;----------------------------------------------------------------------------- 1789%macro LOAD_LEFT 1 1790 movzx r1d, pixel [r0+FDEC_STRIDEB*(%1-4)-SIZEOF_PIXEL] 1791 movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-3)-SIZEOF_PIXEL] 1792 add r1d, r2d 1793 movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-2)-SIZEOF_PIXEL] 1794 add r1d, r2d 1795 movzx r2d, pixel 
[r0+FDEC_STRIDEB*(%1-1)-SIZEOF_PIXEL] 1796 add r1d, r2d 1797%endmacro 1798 1799%macro PREDICT_8x8C_DC 0 1800cglobal predict_8x8c_dc, 1,3 1801 pxor m7, m7 1802%if HIGH_BIT_DEPTH 1803 movq m0, [r0-FDEC_STRIDEB+0] 1804 movq m1, [r0-FDEC_STRIDEB+8] 1805 HADDW m0, m2 1806 HADDW m1, m2 1807%else ; !HIGH_BIT_DEPTH 1808 movd m0, [r0-FDEC_STRIDEB+0] 1809 movd m1, [r0-FDEC_STRIDEB+4] 1810 psadbw m0, m7 ; s0 1811 psadbw m1, m7 ; s1 1812%endif 1813 add r0, FDEC_STRIDEB*4 1814 1815 LOAD_LEFT 0 ; s2 1816 movd m2, r1d 1817 LOAD_LEFT 4 ; s3 1818 movd m3, r1d 1819 1820 punpcklwd m0, m1 1821 punpcklwd m2, m3 1822 punpckldq m0, m2 ; s0, s1, s2, s3 1823 pshufw m3, m0, q3312 ; s2, s1, s3, s3 1824 pshufw m0, m0, q1310 ; s0, s1, s3, s1 1825 paddw m0, m3 1826 psrlw m0, 2 1827 pavgw m0, m7 ; s0+s2, s1, s3, s1+s3 1828%if HIGH_BIT_DEPTH 1829%if cpuflag(sse2) 1830 movq2dq xmm0, m0 1831 punpcklwd xmm0, xmm0 1832 pshufd xmm1, xmm0, q3322 1833 punpckldq xmm0, xmm0 1834%assign Y 0 1835%rep 8 1836%assign i (0 + (Y/4)) 1837 movdqa [r0+FDEC_STRIDEB*(Y-4)+0], xmm %+ i 1838%assign Y Y+1 1839%endrep 1840%else ; !sse2 1841 pshufw m1, m0, q0000 1842 pshufw m2, m0, q1111 1843 pshufw m3, m0, q2222 1844 pshufw m4, m0, q3333 1845%assign Y 0 1846%rep 8 1847%assign i (1 + (Y/4)*2) 1848%assign j (2 + (Y/4)*2) 1849 movq [r0+FDEC_STRIDEB*(Y-4)+0], m %+ i 1850 movq [r0+FDEC_STRIDEB*(Y-4)+8], m %+ j 1851%assign Y Y+1 1852%endrep 1853%endif 1854%else ; !HIGH_BIT_DEPTH 1855 packuswb m0, m0 1856 punpcklbw m0, m0 1857 movq m1, m0 1858 punpcklbw m0, m0 1859 punpckhbw m1, m1 1860%assign Y 0 1861%rep 8 1862%assign i (0 + (Y/4)) 1863 movq [r0+FDEC_STRIDEB*(Y-4)], m %+ i 1864%assign Y Y+1 1865%endrep 1866%endif 1867 RET 1868%endmacro 1869 1870INIT_MMX mmx2 1871PREDICT_8x8C_DC 1872%if HIGH_BIT_DEPTH 1873INIT_MMX sse2 1874PREDICT_8x8C_DC 1875%endif 1876 1877%if HIGH_BIT_DEPTH 1878%macro STORE_4LINES 3 1879%if cpuflag(sse2) 1880 movdqa [r0+FDEC_STRIDEB*(%3-4)], %1 1881 movdqa [r0+FDEC_STRIDEB*(%3-3)], %1 1882 movdqa 
[r0+FDEC_STRIDEB*(%3-2)], %1 1883 movdqa [r0+FDEC_STRIDEB*(%3-1)], %1 1884%else 1885 movq [r0+FDEC_STRIDEB*(%3-4)+0], %1 1886 movq [r0+FDEC_STRIDEB*(%3-4)+8], %2 1887 movq [r0+FDEC_STRIDEB*(%3-3)+0], %1 1888 movq [r0+FDEC_STRIDEB*(%3-3)+8], %2 1889 movq [r0+FDEC_STRIDEB*(%3-2)+0], %1 1890 movq [r0+FDEC_STRIDEB*(%3-2)+8], %2 1891 movq [r0+FDEC_STRIDEB*(%3-1)+0], %1 1892 movq [r0+FDEC_STRIDEB*(%3-1)+8], %2 1893%endif 1894%endmacro 1895%else 1896%macro STORE_4LINES 2 1897 movq [r0+FDEC_STRIDEB*(%2-4)], %1 1898 movq [r0+FDEC_STRIDEB*(%2-3)], %1 1899 movq [r0+FDEC_STRIDEB*(%2-2)], %1 1900 movq [r0+FDEC_STRIDEB*(%2-1)], %1 1901%endmacro 1902%endif 1903 1904%macro PREDICT_8x16C_DC 0 1905cglobal predict_8x16c_dc, 1,3 1906 pxor m7, m7 1907%if HIGH_BIT_DEPTH 1908 movq m0, [r0-FDEC_STRIDEB+0] 1909 movq m1, [r0-FDEC_STRIDEB+8] 1910 HADDW m0, m2 1911 HADDW m1, m2 1912%else 1913 movd m0, [r0-FDEC_STRIDEB+0] 1914 movd m1, [r0-FDEC_STRIDEB+4] 1915 psadbw m0, m7 ; s0 1916 psadbw m1, m7 ; s1 1917%endif 1918 punpcklwd m0, m1 ; s0, s1 1919 1920 add r0, FDEC_STRIDEB*4 1921 LOAD_LEFT 0 ; s2 1922 pinsrw m0, r1d, 2 1923 LOAD_LEFT 4 ; s3 1924 pinsrw m0, r1d, 3 ; s0, s1, s2, s3 1925 add r0, FDEC_STRIDEB*8 1926 LOAD_LEFT 0 ; s4 1927 pinsrw m1, r1d, 2 1928 LOAD_LEFT 4 ; s5 1929 pinsrw m1, r1d, 3 ; s1, __, s4, s5 1930 sub r0, FDEC_STRIDEB*8 1931 1932 pshufw m2, m0, q1310 ; s0, s1, s3, s1 1933 pshufw m0, m0, q3312 ; s2, s1, s3, s3 1934 pshufw m3, m1, q0302 ; s4, s1, s5, s1 1935 pshufw m1, m1, q3322 ; s4, s4, s5, s5 1936 paddw m0, m2 1937 paddw m1, m3 1938 psrlw m0, 2 1939 psrlw m1, 2 1940 pavgw m0, m7 1941 pavgw m1, m7 1942%if HIGH_BIT_DEPTH 1943%if cpuflag(sse2) 1944 movq2dq xmm0, m0 1945 movq2dq xmm1, m1 1946 punpcklwd xmm0, xmm0 1947 punpcklwd xmm1, xmm1 1948 pshufd xmm2, xmm0, q3322 1949 pshufd xmm3, xmm1, q3322 1950 punpckldq xmm0, xmm0 1951 punpckldq xmm1, xmm1 1952 STORE_4LINES xmm0, xmm0, 0 1953 STORE_4LINES xmm2, xmm2, 4 1954 STORE_4LINES xmm1, xmm1, 8 1955 STORE_4LINES xmm3, xmm3, 12 
1956%else 1957 pshufw m2, m0, q0000 1958 pshufw m3, m0, q1111 1959 pshufw m4, m0, q2222 1960 pshufw m5, m0, q3333 1961 STORE_4LINES m2, m3, 0 1962 STORE_4LINES m4, m5, 4 1963 pshufw m2, m1, q0000 1964 pshufw m3, m1, q1111 1965 pshufw m4, m1, q2222 1966 pshufw m5, m1, q3333 1967 STORE_4LINES m2, m3, 8 1968 STORE_4LINES m4, m5, 12 1969%endif 1970%else 1971 packuswb m0, m0 ; dc0, dc1, dc2, dc3 1972 packuswb m1, m1 ; dc4, dc5, dc6, dc7 1973 punpcklbw m0, m0 1974 punpcklbw m1, m1 1975 pshufw m2, m0, q1100 1976 pshufw m3, m0, q3322 1977 pshufw m4, m1, q1100 1978 pshufw m5, m1, q3322 1979 STORE_4LINES m2, 0 1980 STORE_4LINES m3, 4 1981 add r0, FDEC_STRIDEB*8 1982 STORE_4LINES m4, 0 1983 STORE_4LINES m5, 4 1984%endif 1985 RET 1986%endmacro 1987 1988INIT_MMX mmx2 1989PREDICT_8x16C_DC 1990%if HIGH_BIT_DEPTH 1991INIT_MMX sse2 1992PREDICT_8x16C_DC 1993%endif 1994 1995%macro PREDICT_C_DC_TOP 1 1996%if HIGH_BIT_DEPTH 1997INIT_XMM 1998cglobal predict_8x%1c_dc_top_sse2, 1,1 1999 pxor m2, m2 2000 mova m0, [r0 - FDEC_STRIDEB] 2001 pshufd m1, m0, q2301 2002 paddw m0, m1 2003 pshuflw m1, m0, q2301 2004 pshufhw m1, m1, q2301 2005 paddw m0, m1 2006 psrlw m0, 1 2007 pavgw m0, m2 2008 STORE%1 m0 2009 RET 2010%else ; !HIGH_BIT_DEPTH 2011INIT_MMX 2012cglobal predict_8x%1c_dc_top_mmx2, 1,1 2013 movq mm0, [r0 - FDEC_STRIDE] 2014 pxor mm1, mm1 2015 pxor mm2, mm2 2016 punpckhbw mm1, mm0 2017 punpcklbw mm0, mm2 2018 psadbw mm1, mm2 ; s1 2019 psadbw mm0, mm2 ; s0 2020 psrlw mm1, 1 2021 psrlw mm0, 1 2022 pavgw mm1, mm2 2023 pavgw mm0, mm2 2024 pshufw mm1, mm1, 0 2025 pshufw mm0, mm0, 0 ; dc0 (w) 2026 packuswb mm0, mm1 ; dc0,dc1 (b) 2027 STORE%1 mm0 2028 RET 2029%endif 2030%endmacro 2031 2032PREDICT_C_DC_TOP 8 2033PREDICT_C_DC_TOP 16 2034 2035;----------------------------------------------------------------------------- 2036; void predict_16x16_v( pixel *src ) 2037;----------------------------------------------------------------------------- 2038 2039%macro PREDICT_16x16_V 0 2040cglobal 
predict_16x16_v, 1,2 2041%assign %%i 0 2042%rep 16*SIZEOF_PIXEL/mmsize 2043 mova m %+ %%i, [r0-FDEC_STRIDEB+%%i*mmsize] 2044%assign %%i %%i+1 2045%endrep 2046%if 16*SIZEOF_PIXEL/mmsize == 4 2047 STORE16 m0, m1, m2, m3 2048%elif 16*SIZEOF_PIXEL/mmsize == 2 2049 STORE16 m0, m1 2050%else 2051 STORE16 m0 2052%endif 2053 RET 2054%endmacro 2055 2056INIT_MMX mmx2 2057PREDICT_16x16_V 2058INIT_XMM sse 2059PREDICT_16x16_V 2060%if HIGH_BIT_DEPTH 2061INIT_YMM avx 2062PREDICT_16x16_V 2063%endif 2064 2065;----------------------------------------------------------------------------- 2066; void predict_16x16_h( pixel *src ) 2067;----------------------------------------------------------------------------- 2068%macro PREDICT_16x16_H 0 2069cglobal predict_16x16_h, 1,2 2070%if cpuflag(ssse3) && notcpuflag(avx2) 2071 mova m2, [pb_3] 2072%endif 2073 mov r1d, 4 2074.loop: 2075 PRED_H_4ROWS 16, 1 2076 dec r1d 2077 jg .loop 2078 RET 2079%endmacro 2080 2081INIT_MMX mmx2 2082PREDICT_16x16_H 2083%if HIGH_BIT_DEPTH 2084INIT_XMM sse2 2085PREDICT_16x16_H 2086INIT_YMM avx2 2087PREDICT_16x16_H 2088%else 2089;no SSE2 for 8-bit, it's slower than MMX on all systems that don't support SSSE3 2090INIT_XMM ssse3 2091PREDICT_16x16_H 2092%endif 2093 2094;----------------------------------------------------------------------------- 2095; void predict_16x16_dc( pixel *src ) 2096;----------------------------------------------------------------------------- 2097%if WIN64 2098DECLARE_REG_TMP 6 ; Reduces code size due to fewer REX prefixes 2099%else 2100DECLARE_REG_TMP 3 2101%endif 2102 2103INIT_XMM 2104; Returns the sum of the left pixels in r1d+r2d 2105cglobal predict_16x16_dc_left_internal, 0,4 2106 movzx r1d, pixel [r0-SIZEOF_PIXEL] 2107 movzx r2d, pixel [r0+FDEC_STRIDEB-SIZEOF_PIXEL] 2108%assign i 2*FDEC_STRIDEB 2109%rep 7 2110 movzx t0d, pixel [r0+i-SIZEOF_PIXEL] 2111 add r1d, t0d 2112 movzx t0d, pixel [r0+i+FDEC_STRIDEB-SIZEOF_PIXEL] 2113 add r2d, t0d 2114%assign i i+2*FDEC_STRIDEB 2115%endrep 2116 RET 
2117 2118%macro PRED16x16_DC 2 2119%if HIGH_BIT_DEPTH 2120 mova xm0, [r0 - FDEC_STRIDEB+ 0] 2121 paddw xm0, [r0 - FDEC_STRIDEB+16] 2122 HADDW xm0, xm2 2123 paddw xm0, %1 2124 psrlw xm0, %2 2125 SPLATW m0, xm0 2126%if mmsize == 32 2127 STORE16 m0 2128%else 2129 STORE16 m0, m0 2130%endif 2131%else ; !HIGH_BIT_DEPTH 2132 pxor m0, m0 2133 psadbw m0, [r0 - FDEC_STRIDE] 2134 MOVHL m1, m0 2135 paddw m0, m1 2136 paddusw m0, %1 2137 psrlw m0, %2 ; dc 2138 SPLATW m0, m0 2139 packuswb m0, m0 ; dc in bytes 2140 STORE16 m0 2141%endif 2142%endmacro 2143 2144%macro PREDICT_16x16_DC 0 2145cglobal predict_16x16_dc, 1,3 2146 call predict_16x16_dc_left_internal 2147 lea r1d, [r1+r2+16] 2148 movd xm3, r1d 2149 PRED16x16_DC xm3, 5 2150 RET 2151 2152cglobal predict_16x16_dc_top, 1,2 2153 PRED16x16_DC [pw_8], 4 2154 RET 2155 2156cglobal predict_16x16_dc_left, 1,3 2157 call predict_16x16_dc_left_internal 2158 lea r1d, [r1+r2+8] 2159 shr r1d, 4 2160 movd xm0, r1d 2161 SPLATW m0, xm0 2162%if HIGH_BIT_DEPTH && mmsize == 16 2163 STORE16 m0, m0 2164%else 2165%if HIGH_BIT_DEPTH == 0 2166 packuswb m0, m0 2167%endif 2168 STORE16 m0 2169%endif 2170 RET 2171%endmacro 2172 2173INIT_XMM sse2 2174PREDICT_16x16_DC 2175%if HIGH_BIT_DEPTH 2176INIT_YMM avx2 2177PREDICT_16x16_DC 2178%else 2179INIT_XMM avx2 2180PREDICT_16x16_DC 2181%endif 2182