;/****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - 8<->16 bit transfer functions -
; *
; *  Copyright (C) 2002 Jaan Kalda
; *
; *  This program is free software ; you can redistribute it and/or modify
; *  it under the terms of the GNU General Public License as published by
; *  the Free Software Foundation ; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY ; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program ; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
; *
; *  $Id: mem_transfer_3dne.asm,v 1.13 2009-09-16 17:07:58 Isibaar Exp $
; *
; ***************************************************************************/

; these 3dne functions are compatible with iSSE, but are optimized specifically
; for K7 pipelines
;
; Only MMX registers and the iSSE integer extension (pavgb) are used; the
; exact instruction interleaving below is deliberate K7 scheduling — do not
; reorder instructions without re-profiling.

%include "nasm.inc"

;=============================================================================
; Read only data
;=============================================================================

DATA

ALIGN SECTION_ALIGN
; 8 bytes of zero. NOTE(review): not referenced by any function in this file;
; presumably kept for ABI/layout compatibility or external use — confirm
; before removing.
mm_zero:
	dd 0,0
;=============================================================================
; Macros
;=============================================================================

; nop4: 4-byte no-op used as padding to keep hot loop bodies aligned on
; 32-bit builds. The byte sequence 8D 74 26 00 encodes "lea esi,[esi+0]"
; (with a SIB byte), which executes as a no-op. On x86-64 it expands to
; nothing (the assembler's ALIGN handles placement there).
%ifdef ARCH_IS_X86_64
%define nop4
%else
%macro nop4 0
	db 08Dh, 074h, 026h, 0
%endmacro
%endif

;=============================================================================
; Code
;=============================================================================

TEXT

cglobal transfer_8to16copy_3dne
cglobal transfer_16to8copy_3dne
cglobal transfer_8to16sub_3dne
cglobal transfer_8to16subro_3dne
cglobal transfer_8to16sub2_3dne
cglobal transfer_16to8add_3dne
cglobal transfer8x8_copy_3dne
cglobal transfer8x4_copy_3dne

;-----------------------------------------------------------------------------
;
; void transfer_8to16copy_3dne(int16_t * const dst,
;                              const uint8_t * const src,
;                              uint32_t stride);
;
;-----------------------------------------------------------------------------
; Zero-extends an 8x8 block of unsigned bytes (src, strided) into 64
; contiguous 16-bit words at dst (two 64-byte halves, 4 rows each).
;
; Two zero-extension idioms are interleaved for K7 throughput:
;   - "punpcklbw mmN, [mem]" places the source bytes in the HIGH byte of
;     each word (the low bytes are whatever was in mmN); a following
;     "psrlw mmN, 8" shifts them down, yielding zero-extended words without
;     needing a zero register.
;   - rows 1 and 1b use the classic movq + punpcklbw/punpckhbw against
;     mm7 (cleared with pxor).
; The "byte" keyword in some effective addresses forces a short (disp8)
; encoding — instruction-length tuning for the K7 decoders, not semantics.
; Store offsets look shuffled (e.g. +56 before +48) but each source row
; still lands at its own 16-byte slot in dst.

ALIGN SECTION_ALIGN
transfer_8to16copy_3dne:

  mov _EAX, prm2 ; Src
  mov TMP1, prm3 ; Stride
  mov TMP0, prm1 ; Dst
  punpcklbw mm0, [byte _EAX]     ; row 0, low 4 bytes -> high bytes of words
  punpcklbw mm1, [_EAX+4]        ; row 0, high 4 bytes
  movq mm2, [_EAX+TMP1]          ; row 1 (same load twice: low/high split)
  movq mm3, [_EAX+TMP1]
  pxor mm7, mm7                  ; mm7 = 0 for the unpack path
  lea _EAX, [_EAX+2*TMP1]        ; advance src by 2 rows
  punpcklbw mm2, mm7             ; row 1 low -> words
  punpckhbw mm3, mm7             ; row 1 high -> words
  psrlw mm0, 8                   ; finish zero-extension of row 0
  psrlw mm1, 8
  punpcklbw mm4, [_EAX]          ; row 2 low
  punpcklbw mm5, [_EAX+TMP1+4]   ; row 3 high
  movq [byte TMP0+0*64], mm0     ; dst row 0
  movq [TMP0+0*64+8], mm1
  punpcklbw mm6, [_EAX+TMP1]     ; row 3 low
  punpcklbw mm7, [_EAX+4]        ; row 2 high (mm7 reused as data from here)
  lea _EAX, [byte _EAX+2*TMP1]
  psrlw mm4, 8
  psrlw mm5, 8
  punpcklbw mm0, [_EAX]          ; row 4 low
  punpcklbw mm1, [_EAX+TMP1+4]   ; row 5 high
  movq [TMP0+0*64+16], mm2       ; dst row 1
  movq [TMP0+0*64+24], mm3
  psrlw mm6, 8
  psrlw mm7, 8
  punpcklbw mm2, [_EAX+TMP1]     ; row 5 low
  punpcklbw mm3, [_EAX+4]        ; row 4 high
  lea _EAX, [byte _EAX+2*TMP1]
  movq [byte TMP0+0*64+32], mm4  ; dst row 2 low
  movq [TMP0+0*64+56], mm5       ; dst row 3 high
  psrlw mm0, 8
  psrlw mm1, 8
  punpcklbw mm4, [_EAX]          ; row 6 low
  punpcklbw mm5, [_EAX+TMP1+4]   ; row 7 high
  movq [byte TMP0+0*64+48], mm6  ; dst row 3 low
  movq [TMP0+0*64+40], mm7       ; dst row 2 high
  psrlw mm2, 8
  psrlw mm3, 8
  punpcklbw mm6, [_EAX+TMP1]     ; row 7 low
  punpcklbw mm7, [_EAX+4]        ; row 6 high
  movq [byte TMP0+1*64], mm0     ; dst row 4 low
  movq [TMP0+1*64+24], mm1       ; dst row 5 high
  psrlw mm4, 8
  psrlw mm5, 8
  movq [TMP0+1*64+16], mm2       ; dst row 5 low
  movq [TMP0+1*64+8], mm3        ; dst row 4 high
  psrlw mm6, 8
  psrlw mm7, 8
  movq [byte TMP0+1*64+32], mm4  ; dst row 6 low
  movq [TMP0+1*64+56], mm5       ; dst row 7 high
  movq [byte TMP0+1*64+48], mm6  ; dst row 7 low
  movq [TMP0+1*64+40], mm7       ; dst row 6 high
  ret
ENDFUNC

;-----------------------------------------------------------------------------
;
; void transfer_16to8copy_3dne(uint8_t * const dst,
;                              const int16_t * const src,
;                              uint32_t stride);
;
;-----------------------------------------------------------------------------
; Packs 64 contiguous 16-bit words (src, 8 rows x 32 bytes per row pair)
; back into an 8x8 byte block at dst (strided), saturating each word to
; [0,255] via packuswb.
; Rows 0-4 are addressed from TMP0 (dst), rows 3/5/7 from _EAX = dst+3*stride,
; row 6 from TMP0 after it is advanced by 4*stride — this avoids any
; multiply-by-stride beyond the x86 scale factors 2 and 4.

ALIGN SECTION_ALIGN
transfer_16to8copy_3dne:

  mov _EAX, prm2 ; Src
  mov TMP0, prm1 ; Dst
  mov TMP1, prm3 ; Stride

  movq mm0, [byte _EAX+0*32]     ; row 0
  packuswb mm0, [_EAX+0*32+8]
  movq mm1, [_EAX+0*32+16]       ; row 1
  packuswb mm1, [_EAX+0*32+24]
  movq mm5, [_EAX+2*32+16]       ; row 5 (loaded early for scheduling)
  movq mm2, [_EAX+1*32]          ; row 2
  packuswb mm2, [_EAX+1*32+8]
  movq mm3, [_EAX+1*32+16]       ; row 3
  packuswb mm3, [_EAX+1*32+24]
  movq mm6, [_EAX+3*32]          ; row 6
  movq mm4, [_EAX+2*32]          ; row 4
  packuswb mm4, [_EAX+2*32+8]
  packuswb mm5, [_EAX+2*32+24]
  movq mm7, [_EAX+3*32+16]       ; row 7
  packuswb mm7, [_EAX+3*32+24]
  packuswb mm6, [_EAX+3*32+8]
  movq [TMP0], mm0               ; dst row 0
  lea _EAX, [3*TMP1]             ; _EAX now = dst + 3*stride
  add _EAX, TMP0
  movq [TMP0+TMP1], mm1          ; dst row 1
  movq [TMP0+2*TMP1], mm2        ; dst row 2
  movq [byte _EAX], mm3          ; dst row 3
  movq [TMP0+4*TMP1], mm4        ; dst row 4
  lea TMP0, [byte TMP0+4*TMP1]   ; TMP0 now = dst + 4*stride
  movq [_EAX+2*TMP1], mm5        ; dst row 5
  movq [_EAX+4*TMP1], mm7        ; dst row 7
  movq [TMP0+2*TMP1], mm6        ; dst row 6
  ret
ENDFUNC

;-----------------------------------------------------------------------------
;
; void transfer_8to16sub_3dne(int16_t * const dct,
;                             uint8_t * const cur,
;                             const uint8_t * const ref,
;                             const uint32_t stride);
;
;-----------------------------------------------------------------------------

; Expands two rows of cur (_EAX) and ref (TMP0) to words and stores the
; saturated difference cur-ref into row pair %1 of the dct block (_EDI).
; When the second argument == 1, the reference block is also copied into the
; current block (cur := ref), as transfer_8to16sub requires; with 0 the
; reference is left untouched (the "ro" read-only variant).
; CAUTION: the %1 == 3 expansion pops the caller's saved _EDI off the stack
; into TMP0 ("mov TMP0,[_ESP] / add _ESP, PTR_SIZE") — epilogue work folded
; into the macro; the function then restores it via "mov _EDI, TMP0".
%macro COPY_8_TO_16_SUB 2
  movq mm1, [_EAX] ; cur
  movq mm0, mm1
  movq mm4, [TMP0] ; ref
  movq mm6, mm4
%if %2 == 1
  movq [_EAX], mm4               ; cur := ref (row 0 of pair)
%endif
  punpckhbw mm1, mm7
  punpckhbw mm6, mm7
  punpcklbw mm4, mm7
ALIGN SECTION_ALIGN
  movq mm2, [byte _EAX+TMP1]     ; cur, second row (same load twice:
  punpcklbw mm0, mm7
  movq mm3, [byte _EAX+TMP1]     ;  low/high byte split)
  punpcklbw mm2, mm7
  movq mm5, [byte TMP0+TMP1] ; ref
  punpckhbw mm3, mm7
%if %2 == 1
  movq [byte _EAX+TMP1], mm5     ; cur := ref (row 1 of pair)
%endif
  psubsw mm1, mm6

  movq mm6, mm5
  psubsw mm0, mm4
%if (%1 < 3)
  lea _EAX,[_EAX+2*TMP1]         ; advance cur/ref by two rows
  lea TMP0,[TMP0+2*TMP1]
%else
  mov TMP0,[_ESP]                ; last pair: pop saved _EDI into TMP0
  add _ESP,byte PTR_SIZE
%endif
  movq [_EDI+%1*32+ 8], mm1
  movq [byte _EDI+%1*32+ 0], mm0 ; dst
  punpcklbw mm5, mm7
  punpckhbw mm6, mm7
  psubsw mm2, mm5
  psubsw mm3, mm6
  movq [_EDI+%1*32+16], mm2
  movq [_EDI+%1*32+24], mm3
%endmacro

; dct := cur - ref (16-bit, saturated) and cur := ref.
; _EAX=cur, TMP0=ref, TMP1=stride, _EDI=dct; mm7 is the zero register.
ALIGN SECTION_ALIGN
transfer_8to16sub_3dne:
  mov _EAX, prm2 ; Cur
  mov TMP0, prm3 ; Ref
  mov TMP1, prm4 ; Stride

  push _EDI                      ; callee-saved; restored via the macro's pop
%ifdef ARCH_IS_X86_64
  mov _EDI, prm1
%else
  mov _EDI, [_ESP+4+4] ; Dst     ; +4 pushed _EDI, +4 return address
%endif

  pxor mm7, mm7
  nop
ALIGN SECTION_ALIGN
  COPY_8_TO_16_SUB 0, 1
  COPY_8_TO_16_SUB 1, 1
  COPY_8_TO_16_SUB 2, 1
  COPY_8_TO_16_SUB 3, 1          ; also pops saved _EDI into TMP0
  mov _EDI, TMP0                 ; restore callee-saved _EDI
  ret
ENDFUNC

; Read-only variant: dct := cur - ref, cur is NOT modified.
ALIGN SECTION_ALIGN
transfer_8to16subro_3dne:
  mov _EAX, prm2 ; Cur
  mov TMP0, prm3 ; Ref
  mov TMP1, prm4 ; Stride

  push _EDI
%ifdef ARCH_IS_X86_64
  mov _EDI, prm1
%else
  mov _EDI, [_ESP+4+ 4] ; Dst
%endif

  pxor mm7, mm7
  nop
ALIGN SECTION_ALIGN
  COPY_8_TO_16_SUB 0, 0
  COPY_8_TO_16_SUB 1, 0
  COPY_8_TO_16_SUB 2, 0
  COPY_8_TO_16_SUB 3, 0          ; pops saved _EDI into TMP0
  mov _EDI, TMP0
  ret
ENDFUNC


;-----------------------------------------------------------------------------
;
; void transfer_8to16sub2_3dne(int16_t * const dct,
;                              uint8_t * const cur,
;                              const uint8_t * ref1,
;                              const uint8_t * ref2,
;                              const uint32_t stride)
;
;-----------------------------------------------------------------------------

; Per row pair %1: prediction = pavgb(ref1, ref2) (rounded byte average,
; iSSE); the prediction is written back into cur, and dct receives the
; saturated 16-bit difference cur - prediction.
; The raw "db" sequences are hand-picked longer encodings of
; "movq mm0/mm1,[_EAX]" (SIB + disp8) used purely as instruction-length
; padding for decoder alignment — semantics are the commented movq.
; CAUTION: the %1 == 3 expansion pops the caller's saved _ESI and _EBX —
; epilogue work folded into the macro.
%macro COPY_8_TO_16_SUB2_SSE 1
  db 0Fh, 6Fh, 44h, 20h, 00 ;movq mm0, [byte _EAX] ; cur
  punpcklbw mm0, mm7
  movq mm2, [byte _EAX+TMP1]
  punpcklbw mm2, mm7
  db 0Fh, 6Fh, 4ch, 20h, 00 ;movq mm1, [byte _EAX]
  punpckhbw mm1, mm7
  movq mm3, [byte _EAX+TMP1]
  punpckhbw mm3, mm7

  movq mm4, [byte _EBX] ; ref1
  pavgb mm4, [byte _ESI] ; ref2   ; rounded average of the two references
  movq [_EAX], mm4               ; cur := prediction (row 0 of pair)
  movq mm5, [_EBX+TMP1] ; ref
  pavgb mm5, [_ESI+TMP1] ; ref2
  movq [_EAX+TMP1], mm5          ; cur := prediction (row 1 of pair)
  movq mm6, mm4
  punpcklbw mm4, mm7
  punpckhbw mm6, mm7
%if (%1 < 3)
  lea _ESI,[_ESI+2*TMP1]         ; advance all three pointers two rows
  lea _EBX,[byte _EBX+2*TMP1]
  lea _EAX,[_EAX+2*TMP1]
%else
  mov _ESI,[_ESP]                ; last pair: pop saved _ESI and _EBX
  mov _EBX,[_ESP+PTR_SIZE]
  add _ESP,byte 2*PTR_SIZE
%endif
  psubsw mm0, mm4
  psubsw mm1, mm6
  movq mm6, mm5
  punpcklbw mm5, mm7
  punpckhbw mm6, mm7
  psubsw mm2, mm5
  psubsw mm3, mm6
  movq [byte TMP0+%1*32+ 0], mm0 ; dst
  movq [TMP0+%1*32+ 8], mm1
  movq [TMP0+%1*32+16], mm2
  movq [TMP0+%1*32+24], mm3
%endmacro

; dct := cur - avg(ref1,ref2); cur := avg(ref1,ref2).
; TMP0=dct, _EAX=cur, _EBX=ref1, _ESI=ref2, TMP1=stride, mm7=zero.
ALIGN SECTION_ALIGN
transfer_8to16sub2_3dne:
  mov TMP1d, prm5d ; Stride      ; stride is a 32-bit argument
  mov TMP0, prm1 ; Dst
  mov _EAX, prm2 ; Cur
  push _EBX                      ; callee-saved; restored via the macro's pop
  lea _EBP,[byte _EBP]           ; "lea ebp,[ebp]": multi-byte nop padding

%ifdef ARCH_IS_X86_64
  mov _EBX, prm3
%else
  mov _EBX, [_ESP+4+12] ; Ref1   ; +4 pushed _EBX, +4 return addr, arg slot 3
%endif

  push _ESI                      ; callee-saved; restored via the macro's pop
  pxor mm7, mm7

%ifdef ARCH_IS_X86_64
  mov _ESI, prm4
%else
  mov _ESI, [_ESP+8+16] ; Ref2   ; +8 two pushes, +4 return addr, arg slot 4
%endif

  nop4
  COPY_8_TO_16_SUB2_SSE 0
  COPY_8_TO_16_SUB2_SSE 1
  COPY_8_TO_16_SUB2_SSE 2
  COPY_8_TO_16_SUB2_SSE 3        ; also pops saved _ESI and _EBX

  ret
ENDFUNC


;-----------------------------------------------------------------------------
;
; void transfer_16to8add_3dne(uint8_t * const dst,
;                             const int16_t * const src,
;                             uint32_t stride);
;
;-----------------------------------------------------------------------------

; Per row pair %1: dst (bytes) += src (words), with signed-saturated 16-bit
; addition, then unsigned-saturated repack to bytes.
%macro COPY_16_TO_8_ADD 1
  movq mm0, [byte TMP0]          ; dst row 0 of pair (same load twice:
  punpcklbw mm0, mm7
  movq mm2, [byte TMP0+TMP1]     ;  low/high byte split)
  punpcklbw mm2, mm7
  movq mm1, [byte TMP0]
  punpckhbw mm1, mm7
  movq mm3, [byte TMP0+TMP1]
  punpckhbw mm3, mm7
  paddsw mm0, [byte _EAX+%1*32+ 0]
  paddsw mm1, [_EAX+%1*32+ 8]
  paddsw mm2, [_EAX+%1*32+16]
  paddsw mm3, [_EAX+%1*32+24]
  packuswb mm0, mm1              ; clamp to [0,255] and repack
  packuswb mm2, mm3
  mov _ESP, _ESP                 ; filler no-op: K7 decode-slot padding
  movq [byte TMP0], mm0
  movq [TMP0+TMP1], mm2
%endmacro


; dst := saturate_u8(dst + src) for an 8x8 block.
; TMP0=dst (strided), _EAX=src (contiguous words), TMP1=stride, mm7=zero.
ALIGN SECTION_ALIGN
transfer_16to8add_3dne:
  mov TMP0, prm1 ; Dst
  mov TMP1, prm3 ; Stride
  mov _EAX, prm2 ; Src
  pxor mm7, mm7
  nop

  COPY_16_TO_8_ADD 0
  lea TMP0,[byte TMP0+2*TMP1]
  COPY_16_TO_8_ADD 1
  lea TMP0,[byte TMP0+2*TMP1]
  COPY_16_TO_8_ADD 2
  lea TMP0,[byte TMP0+2*TMP1]
  COPY_16_TO_8_ADD 3
  ret
ENDFUNC

;-----------------------------------------------------------------------------
;
; void transfer8x8_copy_3dne(uint8_t * const dst,
;                            const uint8_t * const src,
;                            const uint32_t stride);
;
;
;-----------------------------------------------------------------------------

; Copies two 8-byte rows from src (_EAX) to dst (TMP0) and advances src by
; two rows; the caller advances dst between expansions.
%macro COPY_8_TO_8 0
  movq mm0, [byte _EAX]
  movq mm1, [_EAX+TMP1]
  movq [byte TMP0], mm0
  lea _EAX,[byte _EAX+2*TMP1]
  movq [TMP0+TMP1], mm1
%endmacro

; Straight 8x8 byte-block copy, both blocks using the same stride.
ALIGN SECTION_ALIGN
transfer8x8_copy_3dne:
  mov _EAX, prm2 ; Src
  mov TMP1, prm3 ; Stride
  mov TMP0, prm1 ; Dst

  COPY_8_TO_8
  lea TMP0,[byte TMP0+2*TMP1]
  COPY_8_TO_8
  lea TMP0,[byte TMP0+2*TMP1]
  COPY_8_TO_8
  lea TMP0,[byte TMP0+2*TMP1]
  COPY_8_TO_8
  ret
ENDFUNC

;-----------------------------------------------------------------------------
;
; void transfer8x4_copy_3dne(uint8_t * const dst,
;                            const uint8_t * const src,
;                            const uint32_t stride);
;
;
;-----------------------------------------------------------------------------

; Straight 8x4 byte-block copy (half-height variant of the above).
ALIGN SECTION_ALIGN
transfer8x4_copy_3dne:
  mov _EAX, prm2 ; Src
  mov TMP1, prm3 ; Stride
  mov TMP0, prm1 ; Dst

  COPY_8_TO_8
  lea TMP0,[byte TMP0+2*TMP1]
  COPY_8_TO_8
  ret
ENDFUNC

; Emit the note section marking the stack non-executable (ELF GNU-stack).
NON_EXEC_STACK