;/*****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - Quarter-pixel interpolation -
; *  Copyright(C) 2002 Pascal Massimino <skal@planet-d.net>
; *
; *  This file is part of Xvid, a free MPEG-4 video encoder/decoder
; *
; *  Xvid is free software; you can redistribute it and/or modify it
; *  under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * $Id: qpel_mmx.asm,v 1.13 2010-11-28 15:18:21 Isibaar Exp $
; *
; *************************************************************************/

;/**************************************************************************
; *
; *	History:
; *
; * 22.10.2002  initial coding. unoptimized 'proof of concept',
; *             just to heft the qpel filtering. - Skal -
; *
; *************************************************************************/


; Build-time switch: when defined, the FIR filtering uses the precomputed
; xvid_FIR_x_x_x_x lookup tables (one pmullw-free table load per pixel)
; instead of expanding each byte with xvid_Expand_mmx and multiplying by
; the FIR_Rx tap rows.
%define USE_TABLES      ; in order to use xvid_FIR_x_x_x_x tables
                        ; instead of xvid_Expand_mmx...


%include "nasm.inc"

;//////////////////////////////////////////////////////////////////////
;// Declarations
;//   all signatures are:
;// void XXX(uint8_t *dst, const uint8_t *src,
;//          int32_t length, int32_t stride, int32_t rounding)
;//////////////////////////////////////////////////////////////////////

cglobal xvid_H_Pass_16_mmx
cglobal xvid_H_Pass_Avrg_16_mmx
cglobal xvid_H_Pass_Avrg_Up_16_mmx
cglobal xvid_V_Pass_16_mmx
cglobal xvid_V_Pass_Avrg_16_mmx
cglobal xvid_V_Pass_Avrg_Up_16_mmx
cglobal xvid_H_Pass_8_mmx
cglobal xvid_H_Pass_Avrg_8_mmx
cglobal xvid_H_Pass_Avrg_Up_8_mmx
cglobal xvid_V_Pass_8_mmx
cglobal xvid_V_Pass_Avrg_8_mmx
cglobal xvid_V_Pass_Avrg_Up_8_mmx

; "_Add" variants additionally average the filtered result into *dst
; (second-stage rounding), instead of plain stores.
cglobal xvid_H_Pass_Add_16_mmx
cglobal xvid_H_Pass_Avrg_Add_16_mmx
cglobal xvid_H_Pass_Avrg_Up_Add_16_mmx
cglobal xvid_V_Pass_Add_16_mmx
cglobal xvid_V_Pass_Avrg_Add_16_mmx
cglobal xvid_V_Pass_Avrg_Up_Add_16_mmx
cglobal xvid_H_Pass_8_Add_mmx
cglobal xvid_H_Pass_Avrg_8_Add_mmx
cglobal xvid_H_Pass_Avrg_Up_8_Add_mmx
cglobal xvid_V_Pass_8_Add_mmx
cglobal xvid_V_Pass_Avrg_8_Add_mmx
cglobal xvid_V_Pass_Avrg_Up_8_Add_mmx

cglobal xvid_Expand_mmx

cglobal xvid_FIR_1_0_0_0
cglobal xvid_FIR_3_1_0_0
cglobal xvid_FIR_6_3_1_0
cglobal xvid_FIR_14_3_2_1
cglobal xvid_FIR_20_6_3_1
cglobal xvid_FIR_20_20_6_3
cglobal xvid_FIR_23_19_6_3
cglobal xvid_FIR_7_20_20_6
cglobal xvid_FIR_6_20_20_6
cglobal xvid_FIR_6_20_20_7
cglobal xvid_FIR_3_6_20_20
cglobal xvid_FIR_3_6_19_23
cglobal xvid_FIR_1_3_6_20
cglobal xvid_FIR_1_2_3_14
cglobal xvid_FIR_0_1_3_6
cglobal xvid_FIR_0_0_1_3
cglobal xvid_FIR_0_0_0_1

; Writable lookup tables, zero-filled here.
; NOTE(review): they are declared as storage only — presumably filled at
; runtime by an init routine elsewhere (each entry xvid_FIR_a_b_c_d[p]
; would hold the 4 products {p*a, p*b, p*c, p*d}); confirm against the
; C-side initialization code.
SECTION .data align=SECTION_ALIGN

align SECTION_ALIGN
xvid_Expand_mmx:
times 256*4 dw 0        ; uint16_t xvid_Expand_mmx[256][4]
ENDFUNC

xvid_FIR_1_0_0_0:
times 256*4 dw 0
ENDFUNC

xvid_FIR_3_1_0_0:
times 256*4 dw 0
ENDFUNC

xvid_FIR_6_3_1_0:
times 256*4 dw 0
ENDFUNC

xvid_FIR_14_3_2_1:
times 256*4 dw 0
ENDFUNC

xvid_FIR_20_6_3_1:
times 256*4 dw 0
ENDFUNC

xvid_FIR_20_20_6_3:
times 256*4 dw 0
ENDFUNC

xvid_FIR_23_19_6_3:
times 256*4 dw 0
ENDFUNC

xvid_FIR_7_20_20_6:
times 256*4 dw 0
ENDFUNC

xvid_FIR_6_20_20_6:
times 256*4 dw 0
ENDFUNC

xvid_FIR_6_20_20_7:
times 256*4 dw 0
ENDFUNC

xvid_FIR_3_6_20_20:
times 256*4 dw 0
ENDFUNC

xvid_FIR_3_6_19_23:
times 256*4 dw 0
ENDFUNC

xvid_FIR_1_3_6_20:
times 256*4 dw 0
ENDFUNC

xvid_FIR_1_2_3_14:
times 256*4 dw 0
ENDFUNC

xvid_FIR_0_1_3_6:
times 256*4 dw 0
ENDFUNC

xvid_FIR_0_0_1_3:
times 256*4 dw 0
ENDFUNC

xvid_FIR_0_0_0_1:
times 256*4 dw 0
ENDFUNC

;//////////////////////////////////////////////////////////////////////

DATA

; Rounding constants for the 2-sample averaging step (MIX/V_MIX).
; Indexed with (Rnd & 1)*8: Rounder1 (=1) for rounding==0, Rounder0 (=0)
; for rounding==1.
align SECTION_ALIGN
Rounder1_MMX:
times 4 dw 1
Rounder0_MMX:
times 4 dw 0

; Rounding constant added before the final >>5 of the FIR sum:
; 16 for rounding==0, 15 for rounding==1 (selected with (Rnd & 1)*8).
align SECTION_ALIGN
Rounder_QP_MMX:
times 4 dw 16
times 4 dw 15

%ifndef USE_TABLES

align SECTION_ALIGN

  ; H-Pass table shared by 16x? and 8x? filters
  ; FIR_Rn holds the tap row(s) applied to source pixel n; rows with 8 or
  ; 12 words feed two or three 4-pixel accumulators at once (see
  ; ACCUM2/ACCUM3). FIR_R0/FIR_R16 are the asymmetric edge rows.

FIR_R0:  dw 14, -3, 2, -1
align SECTION_ALIGN
FIR_R1:  dw 23, 19, -6, 3,   -1,  0,  0,  0

FIR_R2:  dw -7, 20, 20, -6,   3, -1,  0,  0

FIR_R3:  dw  3, -6, 20, 20,  -6,  3, -1,  0

FIR_R4:  dw -1,  3, -6, 20,  20, -6,  3, -1

FIR_R5:  dw  0, -1,  3, -6,  20, 20, -6,  3,   -1,  0,  0,  0
align SECTION_ALIGN
FIR_R6:  dw  0,  0, -1,  3,  -6, 20, 20, -6,    3, -1,  0,  0
align SECTION_ALIGN
FIR_R7:  dw  0,  0,  0, -1,   3, -6, 20, 20,   -6,  3, -1,  0
align SECTION_ALIGN
FIR_R8:  dw -1,  3, -6, 20,  20, -6,  3, -1

FIR_R9:  dw  0, -1,  3, -6,  20, 20, -6,  3,   -1,  0,  0,  0
align SECTION_ALIGN
FIR_R10: dw  0,  0, -1,  3,  -6, 20, 20, -6,    3, -1,  0,  0
align SECTION_ALIGN
FIR_R11: dw  0,  0,  0, -1,   3, -6, 20, 20,   -6,  3, -1,  0
align SECTION_ALIGN
FIR_R12: dw -1,  3, -6, 20,  20, -6,  3, -1

FIR_R13: dw  0, -1,  3, -6,  20, 20, -6,  3

FIR_R14: dw  0,  0, -1,  3,  -6, 20, 20, -7

FIR_R15: dw  0,  0,  0, -1,   3, -6, 19, 23

FIR_R16: dw -1,  2, -3, 14

%endif  ; !USE_TABLES

  ; V-Pass taps: each constant broadcast to 4 words, multiplied against
  ; 4 horizontally-adjacent source pixels at a time.

align SECTION_ALIGN
FIR_Cm7: times 4 dw -7
FIR_Cm6: times 4 dw -6
FIR_Cm3: times 4 dw -3
FIR_Cm1: times 4 dw -1
FIR_C2:  times 4 dw  2
FIR_C3:  times 4 dw  3
FIR_C14: times 4 dw 14
FIR_C19: times 4 dw 19
FIR_C20: times 4 dw 20
FIR_C23: times 4 dw 23

TEXT

;//////////////////////////////////////////////////////////////////////
;// Here we go with the Q-Pel mess.
;//  For horizontal passes, we process 4 *output* pixels in parallel
;//  For vertical ones, we process 4 *input* pixels in parallel.
;//////////////////////////////////////////////////////////////////////

; XVID_MOVQ / XVID_PADDW: table-indexed movq/paddw.
; On x86-64 the table address may not fit a 32-bit displacement, so it is
; materialized in r9 first (r9 is clobbered); on 32-bit the plain
; [table + index] form is used.  SRC_PTR/DST_PTR map to the calling
; convention's argument registers on x86-64, and to ESI/EDI (loaded from
; the stack in the prologs) on 32-bit.
%ifdef ARCH_IS_X86_64
%macro XVID_MOVQ 3          ; %1=dst mmreg, %2=table, %3=scaled index
  lea r9, [%2]
  movq %1, [r9 + %3]
%endmacro
%macro XVID_PADDW 3         ; %1=dst mmreg, %2=table, %3=scaled index
  lea r9, [%2]
  paddw %1, [r9 + %3]
%endmacro
%define SRC_PTR prm2
%define DST_PTR prm1
%else
%macro XVID_MOVQ 3
  movq %1, [%2 + %3]
%endmacro
%macro XVID_PADDW 3
  paddw %1, [%2 + %3]
%endmacro
%define SRC_PTR _ESI
%define DST_PTR _EDI
%endif

; Prolog for the plain (non-averaging) passes.
; Loads Size into TMP0, stride (BpS) into _EBP, and the FIR rounder
; (16 or 15, per Rnd&1) into mm7.  On 32-bit, also loads Dst/Src from the
; stack after saving ESI/EDI.
%macro PROLOG_NO_AVRG 0
  mov TMP0, prm3      ; Size
  mov TMP1, prm4      ; BpS
  mov eax, prm5d      ; Rnd

%ifndef ARCH_IS_X86_64
  push SRC_PTR
  push DST_PTR
%endif
  push _EBP
  mov _EBP, TMP1      ; _EBP = stride for the whole pass

%ifndef ARCH_IS_X86_64
  mov DST_PTR, [_ESP+16 + 0*4]  ; Dst  (+16: 3 pushes + return addr)
  mov SRC_PTR, [_ESP+16 + 1*4]  ; Src
%endif

  and _EAX, 1
  lea TMP1, [Rounder_QP_MMX]
  movq mm7, [TMP1+_EAX*8]       ; rounder: 16 (Rnd=0) or 15 (Rnd=1)
%endmacro

%macro EPILOG_NO_AVRG 0
  pop _EBP
%ifndef ARCH_IS_X86_64
  pop DST_PTR
  pop SRC_PTR
%endif
  ret
%endmacro

; Prolog for the averaging passes.  Same as PROLOG_NO_AVRG, plus _EBX is
; pointed at the 2nd-stage rounder pair (Rounder1/Rounder0, per Rnd&1)
; used by MIX/V_MIX; EBX is saved/restored around the call.
%macro PROLOG_AVRG 0
  mov TMP0, prm3      ; Size
  mov TMP1, prm4      ; BpS
  mov eax, prm5d      ; Rnd

  push _EBX
  push _EBP
%ifndef ARCH_IS_X86_64
  push SRC_PTR
  push DST_PTR
%endif
  mov _EBP, TMP1

%ifndef ARCH_IS_X86_64
  mov DST_PTR, [_ESP+20 + 0*4]  ; Dst  (+20: 4 pushes + return addr)
  mov SRC_PTR, [_ESP+20 + 1*4]  ; Src
%endif

  and _EAX, 1
  lea TMP1, [Rounder_QP_MMX]
  movq mm7, [TMP1+_EAX*8]       ; rounder for the FIR sum
  lea TMP1, [Rounder1_MMX]
  lea _EBX, [TMP1+_EAX*8]       ; *Rounder2 (averaging rounder)
%endmacro

%macro EPILOG_AVRG 0
%ifndef ARCH_IS_X86_64
  pop DST_PTR
  pop SRC_PTR
%endif
  pop _EBP
  pop _EBX
  ret
%endmacro

;//////////////////////////////////////////////////////////////////////
;//
;// All horizontal passes
;//
;//////////////////////////////////////////////////////////////////////

  ; macros for USE_TABLES

; TLOAD: start the accumulators from the two edge pixels using the
; precomputed asymmetric-tap tables, and fold in the mm7 rounder.
; %1,%2: offsets of the first/last source pixel.  Clobbers _EAX, TMP1.
%macro TLOAD 2   ; %1,%2: src pixels
  movzx _EAX, byte [SRC_PTR+%1]
  movzx TMP1, byte [SRC_PTR+%2]
  XVID_MOVQ mm0, xvid_FIR_14_3_2_1, _EAX*8
  XVID_MOVQ mm3, xvid_FIR_1_2_3_14, TMP1*8
  paddw mm0, mm7
  paddw mm3, mm7
%endmacro

; TACCUM2: accumulate one source pixel into two 4-output accumulators,
; via two precomputed-product tables.  Clobbers _EAX.
%macro TACCUM2 5   ;%1:src pixel/%2-%3:Taps tables/ %4-%5:dst regs
  movzx _EAX, byte [SRC_PTR+%1]
  XVID_PADDW %4, %2, _EAX*8
  XVID_PADDW %5, %3, _EAX*8
%endmacro

; TACCUM3: same as TACCUM2 but feeds three accumulators.
%macro TACCUM3 7   ;%1:src pixel/%2-%4:Taps tables/%5-%7:dst regs
  movzx _EAX, byte [SRC_PTR+%1]
  XVID_PADDW %5, %2, _EAX*8
  XVID_PADDW %6, %3, _EAX*8
  XVID_PADDW %7, %4, _EAX*8
%endmacro

;//////////////////////////////////////////////////////////////////////

  ; macros without USE_TABLES

; LOAD: non-table variant of TLOAD — expand the two edge bytes to words
; and multiply by the edge tap rows.  Clobbers _EAX, TMP1.
%macro LOAD 2   ; %1,%2: src pixels
  movzx _EAX, byte [SRC_PTR+%1]
  movzx TMP1, byte [SRC_PTR+%2]
  XVID_MOVQ mm0, xvid_Expand_mmx, _EAX*8
  XVID_MOVQ mm3, xvid_Expand_mmx, TMP1*8
  pmullw mm0, [FIR_R0 ]
  pmullw mm3, [FIR_R16]
  paddw mm0, mm7
  paddw mm3, mm7
%endmacro

; ACCUM2: expand one source byte, multiply by two consecutive 4-tap rows
; of %2, accumulate into %3/%4.  Clobbers _EAX, mm4, mm5.
%macro ACCUM2 4   ;src pixel/Taps/dst regs #1-#2
  movzx _EAX, byte [SRC_PTR+%1]
  XVID_MOVQ mm4, xvid_Expand_mmx, _EAX*8
  movq mm5, mm4
  pmullw mm4, [%2]
  pmullw mm5, [%2+8]
  paddw %3, mm4
  paddw %4, mm5
%endmacro

; ACCUM3: like ACCUM2 with a third tap row / accumulator.
; Clobbers _EAX, mm4, mm5, mm6.
%macro ACCUM3 5   ;src pixel/Taps/dst regs #1-#2-#3
  movzx _EAX, byte [SRC_PTR+%1]
  XVID_MOVQ mm4, xvid_Expand_mmx, _EAX*8
  movq mm5, mm4
  movq mm6, mm5
  pmullw mm4, [%2   ]
  pmullw mm5, [%2+ 8]
  pmullw mm6, [%2+16]
  paddw %3, mm4
  paddw %4, mm5
  paddw %5, mm6
%endmacro

;//////////////////////////////////////////////////////////////////////

; MIX: average the 8 packed bytes in %1 with the 8 bytes at [%2], using
; the 4-word rounder at [%3]:  %1 = (%1 + [%2] + rounder) >> 1, packed.
; Clobbers mm1, mm4, mm5, mm6.
%macro MIX 3   ; %1:reg, %2:src, %3:rounder
  pxor mm6, mm6
  movq mm4, [%2]
  movq mm1, %1
  movq mm5, mm4
  punpcklbw %1, mm6
  punpcklbw mm4, mm6
  punpckhbw mm1, mm6
  punpckhbw mm5, mm6
  movq mm6, [%3]   ; rounder #2
  paddusw %1, mm4
  paddusw mm1, mm5
  paddusw %1, mm6
  paddusw mm1, mm6
  psrlw %1, 1
  psrlw mm1, 1
  packuswb %1, mm1
%endmacro

;//////////////////////////////////////////////////////////////////////

; H_PASS_16: horizontal 8-tap FIR over one 16-pixel row per loop
; iteration (Size iterations, advancing Src/Dst by the stride).
; %1 selects pre-averaging with the source (0=none, 1=src, 2=src+1) and
; %2 post-averaging with the destination.
%macro H_PASS_16  2   ; %1:src-op (0=NONE,1=AVRG,2=AVRG-UP), %2:dst-op (NONE/AVRG)

%if (%2==0) && (%1==0)
  PROLOG_NO_AVRG
%else
  PROLOG_AVRG
%endif

.Loop:

    ; mm0..mm3 serves as a 4x4 delay line

%ifndef USE_TABLES

  LOAD 0, 16    ; special case for first/last pixel
  movq mm1, mm7
  movq mm2, mm7

  ACCUM2 1,  FIR_R1,  mm0, mm1
  ACCUM2 2,  FIR_R2,  mm0, mm1
  ACCUM2 3,  FIR_R3,  mm0, mm1
  ACCUM2 4,  FIR_R4,  mm0, mm1

  ACCUM3 5,  FIR_R5,  mm0, mm1, mm2
  ACCUM3 6,  FIR_R6,  mm0, mm1, mm2
  ACCUM3 7,  FIR_R7,  mm0, mm1, mm2
  ACCUM2 8,  FIR_R8,  mm1, mm2
  ACCUM3 9,  FIR_R9,  mm1, mm2, mm3
  ACCUM3 10, FIR_R10, mm1, mm2, mm3
  ACCUM3 11, FIR_R11, mm1, mm2, mm3

  ACCUM2 12, FIR_R12, mm2, mm3
  ACCUM2 13, FIR_R13, mm2, mm3
  ACCUM2 14, FIR_R14, mm2, mm3
  ACCUM2 15, FIR_R15, mm2, mm3

%else

  TLOAD 0, 16   ; special case for first/last pixel
  movq mm1, mm7
  movq mm2, mm7

  TACCUM2 1,  xvid_FIR_23_19_6_3, xvid_FIR_1_0_0_0 , mm0, mm1
  TACCUM2 2,  xvid_FIR_7_20_20_6, xvid_FIR_3_1_0_0 , mm0, mm1
  TACCUM2 3,  xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0 , mm0, mm1
  TACCUM2 4,  xvid_FIR_1_3_6_20 , xvid_FIR_20_6_3_1, mm0, mm1

  TACCUM3 5,  xvid_FIR_0_1_3_6 , xvid_FIR_20_20_6_3, xvid_FIR_1_0_0_0 , mm0, mm1, mm2
  TACCUM3 6,  xvid_FIR_0_0_1_3 , xvid_FIR_6_20_20_6, xvid_FIR_3_1_0_0 , mm0, mm1, mm2
  TACCUM3 7,  xvid_FIR_0_0_0_1 , xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0 , mm0, mm1, mm2

  TACCUM2 8,  xvid_FIR_1_3_6_20 , xvid_FIR_20_6_3_1 , mm1, mm2

  TACCUM3 9,  xvid_FIR_0_1_3_6 , xvid_FIR_20_20_6_3, xvid_FIR_1_0_0_0, mm1, mm2, mm3
  TACCUM3 10, xvid_FIR_0_0_1_3 , xvid_FIR_6_20_20_6, xvid_FIR_3_1_0_0, mm1, mm2, mm3
  TACCUM3 11, xvid_FIR_0_0_0_1 , xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0, mm1, mm2, mm3

  TACCUM2 12, xvid_FIR_1_3_6_20, xvid_FIR_20_6_3_1 , mm2, mm3
  TACCUM2 13, xvid_FIR_0_1_3_6 , xvid_FIR_20_20_6_3, mm2, mm3
  TACCUM2 14, xvid_FIR_0_0_1_3 , xvid_FIR_6_20_20_7, mm2, mm3
  TACCUM2 15, xvid_FIR_0_0_0_1 , xvid_FIR_3_6_19_23, mm2, mm3

%endif

  ; normalize the accumulated sums (filter gain 32) and pack to bytes
  psraw mm0, 5
  psraw mm1, 5
  psraw mm2, 5
  psraw mm3, 5
  packuswb mm0, mm1
  packuswb mm2, mm3

%if (%1==1)
  MIX mm0, SRC_PTR, _EBX
%elif (%1==2)
  MIX mm0, SRC_PTR+1, _EBX
%endif
%if (%2==1)
  MIX mm0, DST_PTR, Rounder1_MMX
%endif

%if (%1==1)
  MIX mm2, SRC_PTR+8, _EBX
%elif (%1==2)
  MIX mm2, SRC_PTR+9, _EBX
%endif
%if (%2==1)
  MIX mm2, DST_PTR+8, Rounder1_MMX
%endif

  lea SRC_PTR, [SRC_PTR+_EBP]

  movq [DST_PTR+0], mm0
  movq [DST_PTR+8], mm2

  add DST_PTR, _EBP
  dec TMP0
  jg .Loop

%if (%2==0) && (%1==0)
  EPILOG_NO_AVRG
%else
  EPILOG_AVRG
%endif

%endmacro


;//////////////////////////////////////////////////////////////////////

; H_PASS_8: horizontal 8-tap FIR over one 8-pixel row per iteration.
; Same %1/%2 semantics as H_PASS_16.
%macro H_PASS_8  2   ; %1:src-op (0=NONE,1=AVRG,2=AVRG-UP), %2:dst-op (NONE/AVRG)

%if (%2==0) && (%1==0)
  PROLOG_NO_AVRG
%else
  PROLOG_AVRG
%endif

.Loop:
    ; mm0..mm3 serves as a 4x4 delay line

%ifndef USE_TABLES

  LOAD 0, 8     ; special case for first/last pixel
  ACCUM2 1, FIR_R1,  mm0, mm3
  ACCUM2 2, FIR_R2,  mm0, mm3
  ACCUM2 3, FIR_R3,  mm0, mm3
  ACCUM2 4, FIR_R4,  mm0, mm3

  ACCUM2 5, FIR_R13, mm0, mm3
  ACCUM2 6, FIR_R14, mm0, mm3
  ACCUM2 7, FIR_R15, mm0, mm3

%else

%if 0   ; test with no unrolling

  TLOAD 0, 8    ; special case for first/last pixel
  TACCUM2 1, xvid_FIR_23_19_6_3, xvid_FIR_1_0_0_0  , mm0, mm3
  TACCUM2 2, xvid_FIR_7_20_20_6, xvid_FIR_3_1_0_0  , mm0, mm3
  TACCUM2 3, xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0  , mm0, mm3
  TACCUM2 4, xvid_FIR_1_3_6_20 , xvid_FIR_20_6_3_1 , mm0, mm3
  TACCUM2 5, xvid_FIR_0_1_3_6  , xvid_FIR_20_20_6_3, mm0, mm3
  TACCUM2 6, xvid_FIR_0_0_1_3  , xvid_FIR_6_20_20_7, mm0, mm3
  TACCUM2 7, xvid_FIR_0_0_0_1  , xvid_FIR_3_6_19_23, mm0, mm3

%else   ; test with unrolling (little faster, but not much)
        ; same as the TLOAD/TACCUM2 sequence above, hand-interleaved so
        ; each movzx overlaps the previous pixel's table adds

  movzx _EAX, byte [SRC_PTR]
  movzx TMP1, byte [SRC_PTR+8]
  XVID_MOVQ mm0, xvid_FIR_14_3_2_1, _EAX*8   ; special case: first pixel
  movzx _EAX, byte [SRC_PTR+1]
  XVID_MOVQ mm3, xvid_FIR_1_2_3_14, TMP1*8   ; special case: last pixel
  paddw mm0, mm7
  paddw mm3, mm7

  movzx TMP1, byte [SRC_PTR+2]
  XVID_PADDW mm0, xvid_FIR_23_19_6_3, _EAX*8
  XVID_PADDW mm3, xvid_FIR_1_0_0_0, _EAX*8

  movzx _EAX, byte [SRC_PTR+3]
  XVID_PADDW mm0, xvid_FIR_7_20_20_6, TMP1*8
  XVID_PADDW mm3, xvid_FIR_3_1_0_0, TMP1*8

  movzx TMP1, byte [SRC_PTR+4]
  XVID_PADDW mm0, xvid_FIR_3_6_20_20, _EAX*8
  XVID_PADDW mm3, xvid_FIR_6_3_1_0, _EAX*8

  movzx _EAX, byte [SRC_PTR+5]
  XVID_PADDW mm0, xvid_FIR_1_3_6_20, TMP1*8
  XVID_PADDW mm3, xvid_FIR_20_6_3_1, TMP1*8

  movzx TMP1, byte [SRC_PTR+6]
  XVID_PADDW mm0, xvid_FIR_0_1_3_6, _EAX*8
  XVID_PADDW mm3, xvid_FIR_20_20_6_3, _EAX*8

  movzx _EAX, byte [SRC_PTR+7]
  XVID_PADDW mm0, xvid_FIR_0_0_1_3, TMP1*8
  XVID_PADDW mm3, xvid_FIR_6_20_20_7, TMP1*8

  XVID_PADDW mm0, xvid_FIR_0_0_0_1, _EAX*8
  XVID_PADDW mm3, xvid_FIR_3_6_19_23, _EAX*8

%endif

%endif    ; !USE_TABLES

  psraw mm0, 5
  psraw mm3, 5
  packuswb mm0, mm3

%if (%1==1)
  MIX mm0, SRC_PTR, _EBX
%elif (%1==2)
  MIX mm0, SRC_PTR+1, _EBX
%endif
%if (%2==1)
  MIX mm0, DST_PTR, Rounder1_MMX
%endif

  movq [DST_PTR], mm0

  add DST_PTR, _EBP
  add SRC_PTR, _EBP
  dec TMP0
  jg .Loop

%if (%2==0) && (%1==0)
  EPILOG_NO_AVRG
%else
  EPILOG_AVRG
%endif

%endmacro

;//////////////////////////////////////////////////////////////////////
;// 16x? copy Functions

xvid_H_Pass_16_mmx:
  H_PASS_16 0, 0
ENDFUNC
xvid_H_Pass_Avrg_16_mmx:
  H_PASS_16 1, 0
ENDFUNC
xvid_H_Pass_Avrg_Up_16_mmx:
  H_PASS_16 2, 0
ENDFUNC

;//////////////////////////////////////////////////////////////////////
;// 8x? copy Functions

xvid_H_Pass_8_mmx:
  H_PASS_8 0, 0
ENDFUNC
xvid_H_Pass_Avrg_8_mmx:
  H_PASS_8 1, 0
ENDFUNC
xvid_H_Pass_Avrg_Up_8_mmx:
  H_PASS_8 2, 0
ENDFUNC

;//////////////////////////////////////////////////////////////////////
;// 16x? avrg Functions

xvid_H_Pass_Add_16_mmx:
  H_PASS_16 0, 1
ENDFUNC
xvid_H_Pass_Avrg_Add_16_mmx:
  H_PASS_16 1, 1
ENDFUNC
xvid_H_Pass_Avrg_Up_Add_16_mmx:
  H_PASS_16 2, 1
ENDFUNC

;//////////////////////////////////////////////////////////////////////
;// 8x? avrg Functions

xvid_H_Pass_8_Add_mmx:
  H_PASS_8 0, 1
ENDFUNC
xvid_H_Pass_Avrg_8_Add_mmx:
  H_PASS_8 1, 1
ENDFUNC
xvid_H_Pass_Avrg_Up_8_Add_mmx:
  H_PASS_8 2, 1
ENDFUNC



;//////////////////////////////////////////////////////////////////////
;//
;// All vertical passes
;//
;//////////////////////////////////////////////////////////////////////

; V_LOAD: fetch 4 horizontally-adjacent source bytes from [TMP1] and
; expand to words in mm4.  Unless %1 (last row), advance TMP1 by the
; stride.  Clobbers mm6.
%macro V_LOAD 1  ; %1=Last?

  movd mm4, dword [TMP1]
  pxor mm6, mm6
%if (%1==0)
  add TMP1, _EBP
%endif
  punpcklbw mm4, mm6

%endmacro

; V_ACC1: accumulate mm4 * tap into one register (destroys mm4).
%macro V_ACC1 2  ; %1:reg; 2:tap
  pmullw mm4, [%2]
  paddw %1, mm4
%endmacro

; V_ACC2: accumulate mm4 against two taps into two registers,
; preserving mm4 (clobbers mm5, mm6).
%macro V_ACC2 4  ; %1-%2: regs, %3-%4: taps
  movq mm5, mm4
  movq mm6, mm4
  pmullw mm5, [%3]
  pmullw mm6, [%4]
  paddw %1, mm5
  paddw %2, mm6
%endmacro

; V_ACC2l: like V_ACC2 but the last use of mm4 — destroys it instead of
; taking a third copy (clobbers mm5).
%macro V_ACC2l 4  ; %1-%2: regs, %3-%4: taps
  movq mm5, mm4
  pmullw mm5, [%3]
  pmullw mm4, [%4]
  paddw %1, mm5
  paddw %2, mm4
%endmacro

; V_ACC4: accumulate mm4 into all four delay-line registers.
%macro V_ACC4 8  ; %1-%4: regs, %5-%8: taps
  V_ACC2 %1,%2, %5,%6
  V_ACC2l %3,%4, %7,%8
%endmacro


; V_MIX: average the low 4 bytes of %1 with [%2] using rounder [%3]
; (vertical-pass counterpart of MIX; clobbers mm4, mm6).
%macro V_MIX 3  ; %1:dst-reg, %2:src, %3: rounder
  pxor mm6, mm6
  movq mm4, [%2]
  punpcklbw %1, mm6
  punpcklbw mm4, mm6
  paddusw %1, mm4
  paddusw %1, [%3]
  psrlw %1, 1
  packuswb %1, %1
%endmacro

; V_STORE: finalize one output row: >>5, pack, optional src/dst
; averaging (same %1/%2 codes as the pass macros), store 4 bytes at
; [DST_PTR], then advance DST_PTR (and SRC_PTR when src-mixing) unless
; %4 flags the last row.  Clobbers eax.
%macro V_STORE 4   ; %1-%2: mix ops, %3: reg, %4:last?

  psraw %3, 5
  packuswb %3, %3

%if (%1==1)
  V_MIX %3, SRC_PTR, _EBX
  add SRC_PTR, _EBP
%elif (%1==2)
  add SRC_PTR, _EBP
  V_MIX %3, SRC_PTR, _EBX
%endif
%if (%2==1)
  V_MIX %3, DST_PTR, Rounder1_MMX
%endif

  movd eax, %3
  mov dword [DST_PTR], eax

%if (%4==0)
  add DST_PTR, _EBP
%endif

%endmacro

;//////////////////////////////////////////////////////////////////////

; V_PASS_16: vertical 8-tap FIR producing a 4x16 stripe per loop
; iteration; the 16 output rows are computed in four groups of 4, with
; mm0..mm3 as a sliding 4x4 delay line.  Needs the FIR sum spread over
; overlapping input windows, hence the staggered tap patterns.
%macro V_PASS_16  2   ; %1:src-op (0=NONE,1=AVRG,2=AVRG-UP), %2:dst-op (NONE/AVRG)

%if (%2==0) && (%1==0)
  PROLOG_NO_AVRG
%else
  PROLOG_AVRG
%endif

    ; we process one stripe of 4x16 pixel each time.
    ; the size (3rd argument) is meant to be a multiple of 4
    ; mm0..mm3 serves as a 4x4 delay line

.Loop:

  push DST_PTR
  push SRC_PTR      ; SRC_PTR is preserved for src-mixing
  mov TMP1, SRC_PTR

    ; output rows [0..3], from input rows [0..8]

  movq mm0, mm7
  movq mm1, mm7
  movq mm2, mm7
  movq mm3, mm7

  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2,  FIR_Cm1
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C3,  FIR_Cm6, FIR_C20, FIR_C20
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3,  FIR_Cm6, FIR_C20
  V_STORE %1, %2, mm0, 0

  V_LOAD 0
  V_ACC2 mm1, mm2, FIR_Cm1, FIR_C3
  V_ACC1 mm3, FIR_Cm6
  V_STORE %1, %2, mm1, 0

  V_LOAD 0
  V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3
  V_STORE %1, %2, mm2, 0

  V_LOAD 1
  V_ACC1 mm3, FIR_Cm1
  V_STORE %1, %2, mm3, 0

    ; output rows [4..7], from input rows [1..11] (!!)

  mov SRC_PTR, [_ESP]
  lea TMP1, [SRC_PTR+_EBP]

  lea SRC_PTR, [SRC_PTR+4*_EBP]  ; for src-mixing
  push SRC_PTR                   ; this will be the new value for next round

  movq mm0, mm7
  movq mm1, mm7
  movq mm2, mm7
  movq mm3, mm7

  V_LOAD 0
  V_ACC1 mm0, FIR_Cm1

  V_LOAD 0
  V_ACC2l mm0, mm1, FIR_C3, FIR_Cm1

  V_LOAD 0
  V_ACC2 mm0, mm1, FIR_Cm6, FIR_C3
  V_ACC1 mm2, FIR_Cm1

  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C20, FIR_Cm6, FIR_C3,  FIR_Cm1
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C20, FIR_C20, FIR_Cm6, FIR_C3
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm6, FIR_C20, FIR_C20, FIR_Cm6
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C3,  FIR_Cm6, FIR_C20, FIR_C20
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3,  FIR_Cm6, FIR_C20
  V_STORE %1, %2, mm0, 0

  V_LOAD 0
  V_ACC2 mm1, mm2, FIR_Cm1, FIR_C3
  V_ACC1 mm3, FIR_Cm6
  V_STORE %1, %2, mm1, 0

  V_LOAD 0
  V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3
  V_STORE %1, %2, mm2, 0

  V_LOAD 1
  V_ACC1 mm3, FIR_Cm1
  V_STORE %1, %2, mm3, 0

    ; output rows [8..11], from input rows [5..15]

  pop SRC_PTR
  lea TMP1, [SRC_PTR+_EBP]

  lea SRC_PTR, [SRC_PTR+4*_EBP]  ; for src-mixing
  push SRC_PTR                   ; this will be the new value for next round

  movq mm0, mm7
  movq mm1, mm7
  movq mm2, mm7
  movq mm3, mm7

  V_LOAD 0
  V_ACC1 mm0, FIR_Cm1

  V_LOAD 0
  V_ACC2l mm0, mm1, FIR_C3, FIR_Cm1

  V_LOAD 0
  V_ACC2 mm0, mm1, FIR_Cm6, FIR_C3
  V_ACC1 mm2, FIR_Cm1

  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C20, FIR_Cm6, FIR_C3,  FIR_Cm1
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C20, FIR_C20, FIR_Cm6, FIR_C3
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm6, FIR_C20, FIR_C20, FIR_Cm6
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C3,  FIR_Cm6, FIR_C20, FIR_C20
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3,  FIR_Cm6, FIR_C20

  V_STORE %1, %2, mm0, 0

  V_LOAD 0
  V_ACC2 mm1, mm2, FIR_Cm1, FIR_C3
  V_ACC1 mm3, FIR_Cm6
  V_STORE %1, %2, mm1, 0

  V_LOAD 0
  V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3
  V_STORE %1, %2, mm2, 0

  V_LOAD 1
  V_ACC1 mm3, FIR_Cm1
  V_STORE %1, %2, mm3, 0


    ; output rows [12..15], from input rows [9..16]
    ; (mirrored tap order: rows are accumulated bottom-up, so they are
    ;  stored mm3..mm0)

  pop SRC_PTR
  lea TMP1, [SRC_PTR+_EBP]

%if (%1!=0)
  lea SRC_PTR, [SRC_PTR+4*_EBP]  ; for src-mixing
%endif

  movq mm0, mm7
  movq mm1, mm7
  movq mm2, mm7
  movq mm3, mm7

  V_LOAD 0
  V_ACC1 mm3, FIR_Cm1

  V_LOAD 0
  V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3

  V_LOAD 0
  V_ACC2 mm1, mm2, FIR_Cm1, FIR_C3
  V_ACC1 mm3, FIR_Cm6

  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3,  FIR_Cm6, FIR_C20
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C3,  FIR_Cm6, FIR_C20, FIR_C20
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3
  V_LOAD 1
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2,  FIR_Cm1

  V_STORE %1, %2, mm3, 0
  V_STORE %1, %2, mm2, 0
  V_STORE %1, %2, mm1, 0
  V_STORE %1, %2, mm0, 1

    ; ... next 4 columns

  pop SRC_PTR
  pop DST_PTR
  add SRC_PTR, 4
  add DST_PTR, 4
  sub TMP0, 4
  jg .Loop

%if (%2==0) && (%1==0)
  EPILOG_NO_AVRG
%else
  EPILOG_AVRG
%endif

%endmacro

;//////////////////////////////////////////////////////////////////////

; V_PASS_8: vertical 8-tap FIR producing a 4x8 stripe per loop
; iteration; two groups of 4 output rows, same delay-line scheme as
; V_PASS_16.
%macro V_PASS_8  2   ; %1:src-op (0=NONE,1=AVRG,2=AVRG-UP), %2:dst-op (NONE/AVRG)

%if (%2==0) && (%1==0)
  PROLOG_NO_AVRG
%else
  PROLOG_AVRG
%endif

    ; we process one stripe of 4x8 pixel each time
    ; the size (3rd argument) is meant to be a multiple of 4
    ; mm0..mm3 serves as a 4x4 delay line
.Loop:

  push DST_PTR
  push SRC_PTR      ; SRC_PTR is preserved for src-mixing
  mov TMP1, SRC_PTR

    ; output rows [0..3], from input rows [0..8]

  movq mm0, mm7
  movq mm1, mm7
  movq mm2, mm7
  movq mm3, mm7

  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2,  FIR_Cm1
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C3,  FIR_Cm6, FIR_C20, FIR_C20
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3,  FIR_Cm6, FIR_C20
  V_STORE %1, %2, mm0, 0

  V_LOAD 0
  V_ACC2 mm1, mm2, FIR_Cm1, FIR_C3
  V_ACC1 mm3, FIR_Cm6

  V_STORE %1, %2, mm1, 0

  V_LOAD 0
  V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3
  V_STORE %1, %2, mm2, 0

  V_LOAD 1
  V_ACC1 mm3, FIR_Cm1
  V_STORE %1, %2, mm3, 0

    ; output rows [4..7], from input rows [1..9]

  mov SRC_PTR, [_ESP]
  lea TMP1, [SRC_PTR+_EBP]

%if (%1!=0)
  lea SRC_PTR, [SRC_PTR+4*_EBP]  ; for src-mixing
%endif

  movq mm0, mm7
  movq mm1, mm7
  movq mm2, mm7
  movq mm3, mm7

  V_LOAD 0
  V_ACC1 mm3, FIR_Cm1

  V_LOAD 0
  V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3

  V_LOAD 0
  V_ACC2 mm1, mm2, FIR_Cm1, FIR_C3
  V_ACC1 mm3, FIR_Cm6

  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3,  FIR_Cm6, FIR_C20
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C3,  FIR_Cm6, FIR_C20, FIR_C20
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6
  V_LOAD 0
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3
  V_LOAD 1
  V_ACC4 mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2,  FIR_Cm1

  V_STORE %1, %2, mm3, 0
  V_STORE %1, %2, mm2, 0
  V_STORE %1, %2, mm1, 0
  V_STORE %1, %2, mm0, 1

    ; ... next 4 columns

  pop SRC_PTR
  pop DST_PTR
  add SRC_PTR, 4
  add DST_PTR, 4
  sub TMP0, 4
  jg .Loop

%if (%2==0) && (%1==0)
  EPILOG_NO_AVRG
%else
  EPILOG_AVRG
%endif

%endmacro


;//////////////////////////////////////////////////////////////////////
;// 16x? copy Functions

xvid_V_Pass_16_mmx:
  V_PASS_16 0, 0
ENDFUNC
xvid_V_Pass_Avrg_16_mmx:
  V_PASS_16 1, 0
ENDFUNC
xvid_V_Pass_Avrg_Up_16_mmx:
  V_PASS_16 2, 0
ENDFUNC

;//////////////////////////////////////////////////////////////////////
;// 8x? copy Functions

xvid_V_Pass_8_mmx:
  V_PASS_8 0, 0
ENDFUNC
xvid_V_Pass_Avrg_8_mmx:
  V_PASS_8 1, 0
ENDFUNC
xvid_V_Pass_Avrg_Up_8_mmx:
  V_PASS_8 2, 0
ENDFUNC

;//////////////////////////////////////////////////////////////////////
;// 16x? avrg Functions

xvid_V_Pass_Add_16_mmx:
  V_PASS_16 0, 1
ENDFUNC
xvid_V_Pass_Avrg_Add_16_mmx:
  V_PASS_16 1, 1
ENDFUNC
xvid_V_Pass_Avrg_Up_Add_16_mmx:
  V_PASS_16 2, 1
ENDFUNC

;//////////////////////////////////////////////////////////////////////
avrg Functions 1136 1137xvid_V_Pass_8_Add_mmx: 1138 V_PASS_8 0, 1 1139ENDFUNC 1140xvid_V_Pass_Avrg_8_Add_mmx: 1141 V_PASS_8 1, 1 1142ENDFUNC 1143xvid_V_Pass_Avrg_Up_8_Add_mmx: 1144 V_PASS_8 2, 1 1145ENDFUNC 1146 1147;////////////////////////////////////////////////////////////////////// 1148 1149%undef SRC_PTR 1150%undef DST_PTR 1151 1152NON_EXEC_STACK 1153