;/****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - 8<->16 bit transfer functions -
; *
; *  Copyright (C) 2001 Peter Ross <pross@xvid.org>
; *                2001-2008 Michael Militzer <michael@xvid.org>
; *                2002 Pascal Massimino <skal@planet-d.net>
; *
; *  This program is free software ; you can redistribute it and/or modify
; *  it under the terms of the GNU General Public License as published by
; *  the Free Software Foundation ; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY ; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program ; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
; *
; * $Id: mem_transfer_mmx.asm,v 1.22 2009-09-16 17:07:58 Isibaar Exp $
; *
; ***************************************************************************/

%include "nasm.inc"

;=============================================================================
; Read only data
;=============================================================================

DATA

ALIGN SECTION_ALIGN
mmx_one:
  dw 1, 1, 1, 1

;=============================================================================
; Code
;=============================================================================

TEXT

cglobal transfer_8to16copy_mmx
cglobal transfer_16to8copy_mmx
cglobal transfer_8to16sub_mmx
cglobal transfer_8to16subro_mmx
cglobal transfer_8to16sub2_mmx
cglobal transfer_8to16sub2_xmm
cglobal transfer_8to16sub2ro_xmm
cglobal transfer_16to8add_mmx
cglobal transfer8x8_copy_mmx
cglobal transfer8x4_copy_mmx

;-----------------------------------------------------------------------------
;
; void transfer_8to16copy_mmx(int16_t * const dst,
;                             const uint8_t * const src,
;                             uint32_t stride);
;
;-----------------------------------------------------------------------------
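; Rough C-equivalent sketch of the routine below (illustration only; the
; loop bounds assume the usual 8x8 block with a 64-entry int16_t dst):
;
;   for (j = 0; j < 8; j++)
;     for (i = 0; i < 8; i++)
;       dst[j*8 + i] = (int16_t) src[j*stride + i];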
%macro COPY_8_TO_16 1
  movq mm0, [_EAX]
  movq mm1, [_EAX+TMP1]
  movq mm2, mm0
  movq mm3, mm1
  punpcklbw mm0, mm7
  movq [TMP0+%1*32], mm0
  punpcklbw mm1, mm7
  movq [TMP0+%1*32+16], mm1
  punpckhbw mm2, mm7
  punpckhbw mm3, mm7
  lea _EAX, [_EAX+2*TMP1]
  movq [TMP0+%1*32+8], mm2
  movq [TMP0+%1*32+24], mm3
%endmacro

ALIGN SECTION_ALIGN
transfer_8to16copy_mmx:

  mov TMP0, prm1 ; Dst
  mov _EAX, prm2 ; Src
  mov TMP1, prm3 ; Stride
  pxor mm7, mm7

  COPY_8_TO_16 0
  COPY_8_TO_16 1
  COPY_8_TO_16 2
  COPY_8_TO_16 3
  ret
ENDFUNC

;-----------------------------------------------------------------------------
;
; void transfer_16to8copy_mmx(uint8_t * const dst,
;                             const int16_t * const src,
;                             uint32_t stride);
;
;-----------------------------------------------------------------------------

%macro COPY_16_TO_8 1
  movq mm0, [_EAX+%1*32]
  movq mm1, [_EAX+%1*32+8]
  packuswb mm0, mm1
  movq [TMP0], mm0
  movq mm2, [_EAX+%1*32+16]
  movq mm3, [_EAX+%1*32+24]
  packuswb mm2, mm3
  movq [TMP0+TMP1], mm2
%endmacro

ALIGN SECTION_ALIGN
transfer_16to8copy_mmx:

  mov TMP0, prm1 ; Dst
  mov _EAX, prm2 ; Src
  mov TMP1, prm3 ; Stride

  COPY_16_TO_8 0
  lea TMP0, [TMP0+2*TMP1]
  COPY_16_TO_8 1
  lea TMP0, [TMP0+2*TMP1]
  COPY_16_TO_8 2
  lea TMP0, [TMP0+2*TMP1]
  COPY_16_TO_8 3
  ret
ENDFUNC

;-----------------------------------------------------------------------------
;
; void transfer_8to16sub_mmx(int16_t * const dct,
;                            uint8_t * const cur,
;                            const uint8_t * const ref,
;                            const uint32_t stride);
;
;-----------------------------------------------------------------------------
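; Rough C-equivalent sketch (illustration only, assuming an 8x8 block):
; the cur - ref difference is written to dct; transfer_8to16sub_mmx also
; copies ref back into cur, whereas transfer_8to16subro_mmx further below
; leaves cur untouched.
;
;   for (j = 0; j < 8; j++)
;     for (i = 0; i < 8; i++) {
;       dct[j*8 + i] = (int16_t) cur[j*stride + i] - ref[j*stride + i];
;       cur[j*stride + i] = ref[j*stride + i];   /* _mmx variant only */
;     }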
; when second argument == 1, reference (ebx) block is copied to current (_EAX)
%macro COPY_8_TO_16_SUB 2
  movq mm0, [_EAX] ; cur
  movq mm2, [_EAX+TMP1]
  movq mm1, mm0
  movq mm3, mm2

  punpcklbw mm0, mm7
  punpcklbw mm2, mm7
  movq mm4, [_EBX] ; ref
  punpckhbw mm1, mm7
  punpckhbw mm3, mm7
  movq mm5, [_EBX+TMP1] ; ref

  movq mm6, mm4
%if %2 == 1
  movq [_EAX], mm4
  movq [_EAX+TMP1], mm5
%endif
  punpcklbw mm4, mm7
  punpckhbw mm6, mm7
  psubsw mm0, mm4
  psubsw mm1, mm6
  movq mm6, mm5
  punpcklbw mm5, mm7
  punpckhbw mm6, mm7
  psubsw mm2, mm5
  lea _EAX, [_EAX+2*TMP1]
  psubsw mm3, mm6
  lea _EBX, [_EBX+2*TMP1]

  movq [TMP0+%1*32+ 0], mm0 ; dst
  movq [TMP0+%1*32+ 8], mm1
  movq [TMP0+%1*32+16], mm2
  movq [TMP0+%1*32+24], mm3
%endmacro

ALIGN SECTION_ALIGN
transfer_8to16sub_mmx:
  mov TMP0, prm1 ; Dst
  mov _EAX, prm2 ; Cur
  mov TMP1, prm4 ; Stride

  push _EBX
%ifdef ARCH_IS_X86_64
  mov _EBX, prm3
%else
  mov _EBX, [_ESP+4+12] ; Ref
%endif
  pxor mm7, mm7

  COPY_8_TO_16_SUB 0, 1
  COPY_8_TO_16_SUB 1, 1
  COPY_8_TO_16_SUB 2, 1
  COPY_8_TO_16_SUB 3, 1

  pop _EBX
  ret
ENDFUNC


ALIGN SECTION_ALIGN
transfer_8to16subro_mmx:
  mov TMP0, prm1 ; Dst
  mov _EAX, prm2 ; Cur
  mov TMP1, prm4 ; Stride

  push _EBX
%ifdef ARCH_IS_X86_64
  mov _EBX, prm3
%else
  mov _EBX, [_ESP+4+12] ; Ref
%endif
  pxor mm7, mm7

  COPY_8_TO_16_SUB 0, 0
  COPY_8_TO_16_SUB 1, 0
  COPY_8_TO_16_SUB 2, 0
  COPY_8_TO_16_SUB 3, 0

  pop _EBX
  ret
ENDFUNC


;-----------------------------------------------------------------------------
;
; void transfer_8to16sub2_mmx(int16_t * const dct,
;                             uint8_t * const cur,
;                             const uint8_t * ref1,
;                             const uint8_t * ref2,
;                             const uint32_t stride)
;
;-----------------------------------------------------------------------------
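; Rough C-equivalent sketch (illustration only, assuming an 8x8 block):
; cur is replaced by the rounded average of ref1 and ref2, and the
; difference against that average is written to dct. The _xmm variant
; below uses pavgb for the same rounding; the 2ro variant does not write
; back to cur.
;
;   for (j = 0; j < 8; j++)
;     for (i = 0; i < 8; i++) {
;       uint8_t avg = (ref1[j*stride + i] + ref2[j*stride + i] + 1) >> 1;
;       dct[j*8 + i] = (int16_t) cur[j*stride + i] - avg;
;       cur[j*stride + i] = avg;
;     }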
%macro COPY_8_TO_16_SUB2_MMX 1
  movq mm0, [_EAX] ; cur
  movq mm2, [_EAX+TMP1]

  ; mm4 <- (ref1+ref2+1) / 2
  movq mm4, [_EBX] ; ref1
  movq mm1, [_ESI] ; ref2
  movq mm6, mm4
  movq mm3, mm1
  punpcklbw mm4, mm7
  punpcklbw mm1, mm7
  punpckhbw mm6, mm7
  punpckhbw mm3, mm7
  paddusw mm4, mm1
  paddusw mm6, mm3
  paddusw mm4, [mmx_one]
  paddusw mm6, [mmx_one]
  psrlw mm4, 1
  psrlw mm6, 1
  packuswb mm4, mm6
  movq [_EAX], mm4

  ; mm5 <- (ref1+ref2+1) / 2
  movq mm5, [_EBX+TMP1] ; ref1
  movq mm1, [_ESI+TMP1] ; ref2
  movq mm6, mm5
  movq mm3, mm1
  punpcklbw mm5, mm7
  punpcklbw mm1, mm7
  punpckhbw mm6, mm7
  punpckhbw mm3, mm7
  paddusw mm5, mm1
  paddusw mm6, mm3
  paddusw mm5, [mmx_one]
  paddusw mm6, [mmx_one]
  lea _ESI, [_ESI+2*TMP1]
  psrlw mm5, 1
  psrlw mm6, 1
  packuswb mm5, mm6
  movq [_EAX+TMP1], mm5

  movq mm1, mm0
  movq mm3, mm2
  punpcklbw mm0, mm7
  punpcklbw mm2, mm7
  punpckhbw mm1, mm7
  punpckhbw mm3, mm7

  movq mm6, mm4
  punpcklbw mm4, mm7
  punpckhbw mm6, mm7
  psubsw mm0, mm4
  psubsw mm1, mm6
  movq mm6, mm5
  punpcklbw mm5, mm7
  punpckhbw mm6, mm7
  psubsw mm2, mm5
  lea _EAX, [_EAX+2*TMP1]
  psubsw mm3, mm6
  lea _EBX, [_EBX+2*TMP1]

  movq [TMP0+%1*32+ 0], mm0 ; dst
  movq [TMP0+%1*32+ 8], mm1
  movq [TMP0+%1*32+16], mm2
  movq [TMP0+%1*32+24], mm3
%endmacro

ALIGN SECTION_ALIGN
transfer_8to16sub2_mmx:
  mov TMP0, prm1 ; Dst
  mov TMP1d, prm5d ; Stride
  mov _EAX, prm2 ; Cur

  push _EBX
%ifdef ARCH_IS_X86_64
  mov _EBX, prm3
%else
  mov _EBX, [_ESP+4+12] ; Ref1
%endif

  push _ESI
%ifdef ARCH_IS_X86_64
  mov _ESI, prm4
%else
  mov _ESI, [_ESP+8+16] ; Ref2
%endif

  pxor mm7, mm7

  COPY_8_TO_16_SUB2_MMX 0
  COPY_8_TO_16_SUB2_MMX 1
  COPY_8_TO_16_SUB2_MMX 2
  COPY_8_TO_16_SUB2_MMX 3

  pop _ESI
  pop _EBX
  ret
ENDFUNC

;-----------------------------------------------------------------------------
;
; void transfer_8to16sub2_xmm(int16_t * const dct,
;                             uint8_t * const cur,
;                             const uint8_t * ref1,
;                             const uint8_t * ref2,
;                             const uint32_t stride)
;
;-----------------------------------------------------------------------------

%macro COPY_8_TO_16_SUB2_SSE 1
  movq mm0, [_EAX] ; cur
  movq mm2, [_EAX+TMP1]
  movq mm1, mm0
  movq mm3, mm2

  punpcklbw mm0, mm7
  punpcklbw mm2, mm7
  movq mm4, [_EBX] ; ref1
  pavgb mm4, [_ESI] ; ref2
  movq [_EAX], mm4
  punpckhbw mm1, mm7
  punpckhbw mm3, mm7
  movq mm5, [_EBX+TMP1] ; ref1
  pavgb mm5, [_ESI+TMP1] ; ref2
  movq [_EAX+TMP1], mm5

  movq mm6, mm4
  punpcklbw mm4, mm7
  punpckhbw mm6, mm7
  psubsw mm0, mm4
  psubsw mm1, mm6
  lea _ESI, [_ESI+2*TMP1]
  movq mm6, mm5
  punpcklbw mm5, mm7
  punpckhbw mm6, mm7
  psubsw mm2, mm5
  lea _EAX, [_EAX+2*TMP1]
  psubsw mm3, mm6
  lea _EBX, [_EBX+2*TMP1]

  movq [TMP0+%1*32+ 0], mm0 ; dst
  movq [TMP0+%1*32+ 8], mm1
  movq [TMP0+%1*32+16], mm2
  movq [TMP0+%1*32+24], mm3
%endmacro

ALIGN SECTION_ALIGN
transfer_8to16sub2_xmm:
  mov TMP0, prm1 ; Dst
  mov _EAX, prm2 ; Cur
  mov TMP1d, prm5d ; Stride

  push _EBX
%ifdef ARCH_IS_X86_64
  mov _EBX, prm3 ; Ref1
%else
  mov _EBX, [_ESP+4+12] ; Ref1
%endif

  push _ESI
%ifdef ARCH_IS_X86_64
  mov _ESI, prm4 ; Ref2
%else
  mov _ESI, [_ESP+8+16] ; Ref2
%endif

  pxor mm7, mm7

  COPY_8_TO_16_SUB2_SSE 0
  COPY_8_TO_16_SUB2_SSE 1
  COPY_8_TO_16_SUB2_SSE 2
  COPY_8_TO_16_SUB2_SSE 3

  pop _ESI
  pop _EBX
  ret
ENDFUNC


;-----------------------------------------------------------------------------
;
; void transfer_8to16sub2ro_xmm(int16_t * const dct,
;                               const uint8_t * const cur,
;                               const uint8_t * ref1,
;                               const uint8_t * ref2,
;                               const uint32_t stride)
;
;-----------------------------------------------------------------------------

%macro COPY_8_TO_16_SUB2RO_SSE 1
  movq mm0, [_EAX] ; cur
  movq mm2, [_EAX+TMP1]
  movq mm1, mm0
  movq mm3, mm2

  punpcklbw mm0, mm7
  punpcklbw mm2, mm7
  movq mm4, [_EBX] ; ref1
  pavgb mm4, [_ESI] ; ref2
  punpckhbw mm1, mm7
  punpckhbw mm3, mm7
  movq mm5, [_EBX+TMP1] ; ref1
  pavgb mm5, [_ESI+TMP1] ; ref2

  movq mm6, mm4
  punpcklbw mm4, mm7
  punpckhbw mm6, mm7
  psubsw mm0, mm4
  psubsw mm1, mm6
  lea _ESI, [_ESI+2*TMP1]
  movq mm6, mm5
  punpcklbw mm5, mm7
  punpckhbw mm6, mm7
  psubsw mm2, mm5
  lea _EAX, [_EAX+2*TMP1]
  psubsw mm3, mm6
  lea _EBX, [_EBX+2*TMP1]

  movq [TMP0+%1*32+ 0], mm0 ; dst
  movq [TMP0+%1*32+ 8], mm1
  movq [TMP0+%1*32+16], mm2
  movq [TMP0+%1*32+24], mm3
%endmacro

ALIGN SECTION_ALIGN
transfer_8to16sub2ro_xmm:
  pxor mm7, mm7
  mov TMP0, prm1 ; Dst
  mov _EAX, prm2 ; Cur
  mov TMP1d, prm5d ; Stride

  push _EBX
%ifdef ARCH_IS_X86_64
  mov _EBX, prm3
%else
  mov _EBX, [_ESP+4+12] ; Ref1
%endif

  push _ESI
%ifdef ARCH_IS_X86_64
  mov _ESI, prm4
%else
  mov _ESI, [_ESP+8+16] ; Ref2
%endif

  COPY_8_TO_16_SUB2RO_SSE 0
  COPY_8_TO_16_SUB2RO_SSE 1
  COPY_8_TO_16_SUB2RO_SSE 2
  COPY_8_TO_16_SUB2RO_SSE 3

  pop _ESI
  pop _EBX
  ret
ENDFUNC


;-----------------------------------------------------------------------------
;
; void transfer_16to8add_mmx(uint8_t * const dst,
;                            const int16_t * const src,
;                            uint32_t stride);
;
;-----------------------------------------------------------------------------

%macro COPY_16_TO_8_ADD 1
  movq mm0, [TMP0]
  movq mm2, [TMP0+TMP1]
  movq mm1, mm0
  movq mm3, mm2
  punpcklbw mm0, mm7
  punpcklbw mm2, mm7
  punpckhbw mm1, mm7
  punpckhbw mm3, mm7
  paddsw mm0, [_EAX+%1*32+ 0]
  paddsw mm1, [_EAX+%1*32+ 8]
  paddsw mm2, [_EAX+%1*32+16]
  paddsw mm3, [_EAX+%1*32+24]
  packuswb mm0, mm1
  movq [TMP0], mm0
  packuswb mm2, mm3
  movq [TMP0+TMP1], mm2
%endmacro


ALIGN SECTION_ALIGN
transfer_16to8add_mmx:
  mov TMP0, prm1 ; Dst
  mov _EAX, prm2 ; Src
  mov TMP1, prm3 ; Stride
  pxor mm7, mm7

  COPY_16_TO_8_ADD 0
  lea TMP0, [TMP0+2*TMP1]
  COPY_16_TO_8_ADD 1
  lea TMP0, [TMP0+2*TMP1]
  COPY_16_TO_8_ADD 2
  lea TMP0, [TMP0+2*TMP1]
  COPY_16_TO_8_ADD 3
  ret
ENDFUNC

;-----------------------------------------------------------------------------
;
; void transfer8x8_copy_mmx(uint8_t * const dst,
;                           const uint8_t * const src,
;                           const uint32_t stride);
;
;
;-----------------------------------------------------------------------------

%macro COPY_8_TO_8 0
  movq mm0, [_EAX]
  movq mm1, [_EAX+TMP1]
  movq [TMP0], mm0
  lea _EAX, [_EAX+2*TMP1]
  movq [TMP0+TMP1], mm1
%endmacro

ALIGN SECTION_ALIGN
transfer8x8_copy_mmx:
  mov TMP0, prm1 ; Dst
  mov _EAX, prm2 ; Src
  mov TMP1, prm3 ; Stride

  COPY_8_TO_8
  lea TMP0, [TMP0+2*TMP1]
  COPY_8_TO_8
  lea TMP0, [TMP0+2*TMP1]
  COPY_8_TO_8
  lea TMP0, [TMP0+2*TMP1]
  COPY_8_TO_8
  ret
ENDFUNC

;-----------------------------------------------------------------------------
;
; void transfer8x4_copy_mmx(uint8_t * const dst,
;                           const uint8_t * const src,
;                           const uint32_t stride);
;
;
;-----------------------------------------------------------------------------

ALIGN SECTION_ALIGN
transfer8x4_copy_mmx:
  mov TMP0, prm1 ; Dst
  mov _EAX, prm2 ; Src
  mov TMP1, prm3 ; Stride

  COPY_8_TO_8
  lea TMP0, [TMP0+2*TMP1]
  COPY_8_TO_8
  ret
ENDFUNC

NON_EXEC_STACK