/*
 *  rgbtoyuv.S
 *
 *     Copyright (C) Peter Schlaile - February 2001
 *
 *  This file is part of libdv, a free DV (IEC 61834/SMPTE 314M)
 *  codec.
 *
 *  libdv is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser Public License as published by
 *  the Free Software Foundation; either version 2.1, or (at your
 *  option) any later version.
 *
 *  libdv is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser Public License
 *  along with libdv; see the file COPYING.  If not, write to
 *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *  The libdv homepage is http://libdv.sourceforge.net/.
 */

# The main loop processes interleaved RGB values for 8 pixels per
# iteration.  The notation in the comments that describe the data places
# the first byte on the right.  For example, in a register containing
# G2R2B1G1R1B0G0R0, R0 occupies the least significant byte and G2 the
# most significant byte.  The output goes to separate Y, U, and V
# buffers.  Inputs are bytes; outputs are words.

#define CONSTSHIFT 15
#define PRECISION 1
#define FIXPSHIFT CONSTSHIFT-PRECISION

#define DV_WIDTH_SHORT      720*2
#define DV_WIDTH_BYTE       720
#define DV_WIDTH_SHORT_HALF 720
#define DV_WIDTH_BYTE_HALF  360

.global _dv_rgbtoycb_mmx_x86_64
# .global yuvtoycb_mmx_x86_64

.data

.align 8
ZEROSX: .word 0,0,0,0
ZEROS:  .long 0,0

ALLONE: .word 1,1,1,1

OFFSETDX: .word 0,64,0,64          #offset used before shift
OFFSETD:  .long 0,0
OFFSETWX: .word 128,0,128,0        #offset used before pack 32
OFFSETW:  .long 0,0
OFFSETBX: .word 128,128,128,128
OFFSETB:  .long 0,0
OFFSETY:  .word (16-128) << PRECISION
          .word (16-128) << PRECISION
          .word (16-128) << PRECISION
          .word (16-128) << PRECISION

TEMP0: .long 0,0
TEMPY: .long 0,0
TEMPU: .long 0,0
TEMPV: .long 0,0

#if 0 /* Original YUV */
YR0GRX: .word 9798,19235,0,9798
YBG0BX: .word 3736,0,19235,3736
YR0GR:  .long 0,0
YBG0B:  .long 0,0
UR0GRX: .word -4784,-9437,0,-4784
UBG0BX: .word 14221,0,-9437,14221
UR0GR:  .long 0,0
UBG0B:  .long 0,0
VR0GRX: .word 20218,-16941,0,20218
VBG0BX: .word -3277,0,-16941,-3277
VR0GR:  .long 0,0
VBG0B:  .long 0,0

YR0GRX: .word 8420,16529,0,8420
YBG0BX: .word 3203,0,16529,3203
YR0GR:  .long 0,0
YBG0B:  .long 0,0
UR0GRX: .word 14391,-12055,0,14391
UBG0BX: .word -2336,0,-12055,-2336
UR0GR:  .long 0,0
UBG0B:  .long 0,0
VR0GRX: .word -4857,-9534,0,-4857
VBG0BX: .word 14391,0,-9534,14391
VR0GR:  .long 0,0
VBG0B:  .long 0,0

#else
YR0GRX: .word 8414,16519,0,8414
YBG0BX: .word 3208,0,16519,3208
YR0GR:  .long 0,0
YBG0B:  .long 0,0
UR0GRX: .word 14392,-12061,0,14392
UBG0BX: .word -2332,0,-12061,-2332
UR0GR:  .long 0,0
UBG0B:  .long 0,0
VR0GRX: .word -4864,-9528,0,-4864
VBG0BX: .word 14392,0,-9528,14392
VR0GR:  .long 0,0
VBG0B:  .long 0,0

#endif

.section .note.GNU-stack, "", @progbits

.text

/* Stack offsets inherited from the 32-bit x86 version; the x86_64 entry
   points below receive their arguments in registers and never reference
   these. */
#define _inPtr 8
#define _rows 12
#define _columns 16
#define _outyPtr 20
#define _outuPtr 24
#define _outvPtr 28
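/* A scalar reference sketch of what _dv_rgbtoycb_mmx_x86_64 below
   computes, using the coefficient tables selected by the #else branch
   above (coefficients are scaled by 2^CONSTSHIFT; results keep
   PRECISION fractional bits).  The helper name is hypothetical, it
   assumes an arithmetic >> on negative ints, and rows*columns must be
   a multiple of 8 as in the MMX loop.  Chroma comes out at half rate:
   each stored U/V word is the average of two neighboring samples,
   which the loop obtains with pmaddwd against ALLONE followed by
   psraw $1.

   static void rgbtoycb_ref(const unsigned char *rgb, int pixels,
                            short *y, short *u, short *v)
   {
       int i, j;
       for (i = 0; i < pixels; i += 2) {
           int usum = 0, vsum = 0;
           for (j = 0; j < 2; j++) {
               int r = rgb[0], g = rgb[1], b = rgb[2];
               rgb += 3;
               *y++ = (short)(((8414 * r + 16519 * g + 3208 * b) >> FIXPSHIFT)
                              + ((16 - 128) << PRECISION));
               usum += (14392 * r - 12061 * g - 2332 * b) >> FIXPSHIFT;
               vsum += (-4864 * r - 9528 * g + 14392 * b) >> FIXPSHIFT;
           }
           *u++ = (short)(usum >> 1);   // pmaddwd ALLONE + psraw $1
           *v++ = (short)(vsum >> 1);
       }
   }
*/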
.global _dv_rgbtoycb_mmx_x86_64
.hidden _dv_rgbtoycb_mmx_x86_64
.type   _dv_rgbtoycb_mmx_x86_64,@function
_dv_rgbtoycb_mmx_x86_64:

/* extern void _dv_rgbtoycb_mmx_x86_64(unsigned char* inPtr,    rdi
                                       int rows,                rsi
                                       int columns,             rdx
                                       short* outyPtr,          rcx
                                       short* outuPtr,          r8
                                       short* outvPtr);         r9
*/

	push	%rax
	push	%rbx
	push	%r12
	push	%r13

	lea	ZEROSX(%rip), %rax      #This section gets around a bug
	movq	(%rax), %mm0            #unlikely to persist
	movq	%mm0, ZEROS(%rip)
	lea	OFFSETDX(%rip), %rax
	movq	(%rax), %mm0
	movq	%mm0, OFFSETD(%rip)
	lea	OFFSETWX(%rip), %rax
	movq	(%rax), %mm0
	movq	%mm0, OFFSETW(%rip)
	lea	OFFSETBX(%rip), %rax
	movq	(%rax), %mm0
	movq	%mm0, OFFSETB(%rip)
	lea	YR0GRX(%rip), %rax
	movq	(%rax), %mm0
	movq	%mm0, YR0GR(%rip)
	lea	YBG0BX(%rip), %rax
	movq	(%rax), %mm0
	movq	%mm0, YBG0B(%rip)
	lea	UR0GRX(%rip), %rax
	movq	(%rax), %mm0
	movq	%mm0, UR0GR(%rip)
	lea	UBG0BX(%rip), %rax
	movq	(%rax), %mm0
	movq	%mm0, UBG0B(%rip)
	lea	VR0GRX(%rip), %rax
	movq	(%rax), %mm0
	movq	%mm0, VR0GR(%rip)
	lea	VBG0BX(%rip), %rax
	movq	(%rax), %mm0
	movq	%mm0, VBG0B(%rip)

	mov	%rsi, %rax              #rows
	mov	%rdx, %rbx              #columns
	mul	%rbx                    #number of pixels
	shr	$3, %rax                #number of loops (8 pixels each)
	mov	%rax, %r11              #loop counter in r11

	mov	%rdi, %rax              #inPtr
	mov	%rcx, %rbx              #outyPtr
	mov	%r8, %r12               #outuPtr
	mov	%r9, %r13               #outvPtr
rgbtoycb_mmx_loop:
	movq	(%rax), %mm1            #load G2R2B1G1R1B0G0R0
	pxor	%mm6, %mm6              #0 -> mm6

	movq	%mm1, %mm0              #G2R2B1G1R1B0G0R0 -> mm0
	psrlq	$16, %mm1               #00G2R2B1G1R1B0 -> mm1

	punpcklbw %mm6, %mm0            #R1B0G0R0 -> mm0
	movq	%mm1, %mm7              #00G2R2B1G1R1B0 -> mm7

	punpcklbw %mm6, %mm1            #B1G1R1B0 -> mm1
	movq	%mm0, %mm2              #R1B0G0R0 -> mm2

	pmaddwd	YR0GR(%rip), %mm0       #yrR1,ygG0+yrR0 -> mm0
	movq	%mm1, %mm3              #B1G1R1B0 -> mm3

	pmaddwd	YBG0B(%rip), %mm1       #ybB1+ygG1,ybB0 -> mm1
	movq	%mm2, %mm4              #R1B0G0R0 -> mm4

	pmaddwd	UR0GR(%rip), %mm2       #urR1,ugG0+urR0 -> mm2
	movq	%mm3, %mm5              #B1G1R1B0 -> mm5

	pmaddwd	UBG0B(%rip), %mm3       #ubB1+ugG1,ubB0 -> mm3
	punpckhbw %mm6, %mm7            #00G2R2 -> mm7

	pmaddwd	VR0GR(%rip), %mm4       #vrR1,vgG0+vrR0 -> mm4
	paddd	%mm1, %mm0              #Y1Y0 -> mm0

	pmaddwd	VBG0B(%rip), %mm5       #vbB1+vgG1,vbB0 -> mm5

	movq	8(%rax), %mm1           #R5B4G4R4B3G3R3B2 -> mm1
	paddd	%mm3, %mm2              #U1U0 -> mm2

	movq	%mm1, %mm6              #R5B4G4R4B3G3R3B2 -> mm6

	punpcklbw ZEROS(%rip), %mm1     #B3G3R3B2 -> mm1
	paddd	%mm5, %mm4              #V1V0 -> mm4

	movq	%mm1, %mm5              #B3G3R3B2 -> mm5
	psllq	$32, %mm1               #R3B200 -> mm1

	paddd	%mm7, %mm1              #R3B200+00G2R2=R3B2G2R2 -> mm1

	punpckhbw ZEROS(%rip), %mm6     #R5B4G4R4 -> mm6
	movq	%mm1, %mm3              #R3B2G2R2 -> mm3

	pmaddwd	YR0GR(%rip), %mm1       #yrR3,ygG2+yrR2 -> mm1
	movq	%mm5, %mm7              #B3G3R3B2 -> mm7

	pmaddwd	YBG0B(%rip), %mm5       #ybB3+ygG3,ybB2 -> mm5
	psrad	$FIXPSHIFT, %mm0        #32-bit scaled Y1Y0 -> mm0

	movq	%mm6, TEMP0(%rip)       #R5B4G4R4 -> TEMP0
	movq	%mm3, %mm6              #R3B2G2R2 -> mm6
	pmaddwd	UR0GR(%rip), %mm6       #urR3,ugG2+urR2 -> mm6
	psrad	$FIXPSHIFT, %mm2        #32-bit scaled U1U0 -> mm2

	paddd	%mm5, %mm1              #Y3Y2 -> mm1
	movq	%mm7, %mm5              #B3G3R3B2 -> mm5
	pmaddwd	UBG0B(%rip), %mm7       #ubB3+ugG3,ubB2 -> mm7
	psrad	$FIXPSHIFT, %mm1        #32-bit scaled Y3Y2 -> mm1

	pmaddwd	VR0GR(%rip), %mm3       #vrR3,vgG2+vrR2 -> mm3
	packssdw %mm1, %mm0             #Y3Y2Y1Y0 -> mm0

	pmaddwd	VBG0B(%rip), %mm5       #vbB3+vgG3,vbB2 -> mm5
	psrad	$FIXPSHIFT, %mm4        #32-bit scaled V1V0 -> mm4

	movq	16(%rax), %mm1          #B7G7R7B6G6R6B5G5 -> mm1
	paddd	%mm7, %mm6              #U3U2 -> mm6

	movq	%mm1, %mm7              #B7G7R7B6G6R6B5G5 -> mm7
	psrad	$FIXPSHIFT, %mm6        #32-bit scaled U3U2 -> mm6

	paddd	%mm5, %mm3              #V3V2 -> mm3
	psllq	$16, %mm7               #R7B6G6R6B5G500 -> mm7

	movq	%mm7, %mm5              #R7B6G6R6B5G500 -> mm5
	psrad	$FIXPSHIFT, %mm3        #32-bit scaled V3V2 -> mm3

	paddw	OFFSETY(%rip), %mm0
	movq	%mm0, (%rbx)            #store Y3Y2Y1Y0
	packssdw %mm6, %mm2             #32-bit scaled U3U2U1U0 -> mm2

	movq	TEMP0(%rip), %mm0       #R5B4G4R4 -> mm0
	add	$8, %rbx

	punpcklbw ZEROS(%rip), %mm7     #B5G500 -> mm7
	movq	%mm0, %mm6              #R5B4G4R4 -> mm6

	movq	%mm2, TEMPU(%rip)       #32-bit scaled U3U2U1U0 -> TEMPU
	psrlq	$32, %mm0               #00R5B4 -> mm0

	paddw	%mm0, %mm7              #B5G5R5B4 -> mm7
	movq	%mm6, %mm2              #R5B4G4R4 -> mm2

	pmaddwd	YR0GR(%rip), %mm2       #yrR5,ygG4+yrR4 -> mm2
	movq	%mm7, %mm0              #B5G5R5B4 -> mm0

	pmaddwd	YBG0B(%rip), %mm7       #ybB5+ygG5,ybB4 -> mm7
	packssdw %mm3, %mm4             #32-bit scaled V3V2V1V0 -> mm4

	add	$24, %rax               #increment RGB count

	movq	%mm4, TEMPV(%rip)       #32-bit scaled V3V2V1V0 -> TEMPV
	movq	%mm6, %mm4              #R5B4G4R4 -> mm4

	pmaddwd	UR0GR(%rip), %mm6       #urR5,ugG4+urR4 -> mm6
	movq	%mm0, %mm3              #B5G5R5B4 -> mm3

	pmaddwd	UBG0B(%rip), %mm0       #ubB5+ugG5,ubB4 -> mm0
	paddd	%mm7, %mm2              #Y5Y4 -> mm2

	pmaddwd	VR0GR(%rip), %mm4       #vrR5,vgG4+vrR4 -> mm4
	pxor	%mm7, %mm7              #0 -> mm7

	pmaddwd	VBG0B(%rip), %mm3       #vbB5+vgG5,vbB4 -> mm3
	punpckhbw %mm7, %mm1            #B7G7R7B6 -> mm1

	paddd	%mm6, %mm0              #U5U4 -> mm0
	movq	%mm1, %mm6              #B7G7R7B6 -> mm6

	pmaddwd	YBG0B(%rip), %mm6       #ybB7+ygG7,ybB6 -> mm6
	punpckhbw %mm7, %mm5            #R7B6G6R6 -> mm5

	movq	%mm5, %mm7              #R7B6G6R6 -> mm7
	paddd	%mm4, %mm3              #V5V4 -> mm3

	pmaddwd	YR0GR(%rip), %mm5       #yrR7,ygG6+yrR6 -> mm5
	movq	%mm1, %mm4              #B7G7R7B6 -> mm4

	pmaddwd	UBG0B(%rip), %mm4       #ubB7+ugG7,ubB6 -> mm4
	psrad	$FIXPSHIFT, %mm0        #32-bit scaled U5U4 -> mm0

	psrad	$FIXPSHIFT, %mm2        #32-bit scaled Y5Y4 -> mm2

	paddd	%mm5, %mm6              #Y7Y6 -> mm6
	movq	%mm7, %mm5              #R7B6G6R6 -> mm5

	pmaddwd	UR0GR(%rip), %mm7       #urR7,ugG6+urR6 -> mm7
	psrad	$FIXPSHIFT, %mm3        #32-bit scaled V5V4 -> mm3

	pmaddwd	VBG0B(%rip), %mm1       #vbB7+vgG7,vbB6 -> mm1
	psrad	$FIXPSHIFT, %mm6        #32-bit scaled Y7Y6 -> mm6

	packssdw %mm6, %mm2             #Y7Y6Y5Y4 -> mm2

	pmaddwd	VR0GR(%rip), %mm5       #vrR7,vgG6+vrR6 -> mm5
	paddd	%mm4, %mm7              #U7U6 -> mm7

	psrad	$FIXPSHIFT, %mm7        #32-bit scaled U7U6 -> mm7
	paddw	OFFSETY(%rip), %mm2
	movq	%mm2, (%rbx)            #store Y7Y6Y5Y4

	movq	ALLONE(%rip), %mm6
	packssdw %mm7, %mm0             #32-bit scaled U7U6U5U4 -> mm0

	movq	TEMPU(%rip), %mm4       #32-bit scaled U3U2U1U0 -> mm4
	pmaddwd	%mm6, %mm0              #sum pairs: (U7+U6)(U5+U4) = UU3 UU2 -> mm0

	pmaddwd	%mm6, %mm4              #sum pairs: (U3+U2)(U1+U0) = UU1 UU0 -> mm4

	paddd	%mm5, %mm1              #V7V6 -> mm1
	packssdw %mm0, %mm4             #UU3UU2UU1UU0 -> mm4

	psrad	$FIXPSHIFT, %mm1        #32-bit scaled V7V6 -> mm1
	psraw	$1, %mm4                #divide UU3 UU2 UU1 UU0 by 2 -> mm4

	movq	TEMPV(%rip), %mm5       #32-bit scaled V3V2V1V0 -> mm5

	movq	%mm4, (%r12)            #store U

	pmaddwd	%mm6, %mm5              #sum pairs: VV1 VV0 -> mm5
	packssdw %mm1, %mm3             #V7V6V5V4 -> mm3

	pmaddwd	%mm6, %mm3              #sum pairs: VV3 VV2 -> mm3

	packssdw %mm3, %mm5             #VV3 VV2 VV1 VV0 -> mm5
	psraw	$1, %mm5

	add	$8, %rbx                #increment Y count
	add	$8, %r12                #increment U count

	movq	%mm5, (%r13)            #store V

	add	$8, %r13                #increment V count

	dec	%r11                    #decrement loop counter
	jnz	rgbtoycb_mmx_loop       #do 24 more bytes if not 0

	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rax

	ret
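/* The copy below pulls one 8x8 block of Y samples out of a frame whose
   rows are DV_WIDTH_SHORT bytes (720 shorts) apart.  A scalar sketch of
   the same operation (the helper name is hypothetical, for
   documentation only):

   static void ppm_copy_y_block_ref(short *dst, const short *src)
   {
       int row, col;
       for (row = 0; row < 8; row++)
           for (col = 0; col < 8; col++)
               dst[8 * row + col] = src[720 * row + col];
   }
*/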
.global _dv_ppm_copy_y_block_mmx_x86_64
.hidden _dv_ppm_copy_y_block_mmx_x86_64
.type   _dv_ppm_copy_y_block_mmx_x86_64,@function
_dv_ppm_copy_y_block_mmx_x86_64:

/* extern void _dv_ppm_copy_y_block_mmx_x86_64(short * dst, short * src); */

	/* arguments are dst=rdi, src=rsi */

	movq	(%rsi), %mm0
	movq	8(%rsi), %mm1
	movq	%mm0, 0*8(%rdi)
	movq	%mm1, 1*8(%rdi)
	movq	DV_WIDTH_SHORT(%rsi), %mm2
	movq	DV_WIDTH_SHORT+8(%rsi), %mm3
	movq	%mm2, 2*8(%rdi)
	movq	%mm3, 3*8(%rdi)
	movq	DV_WIDTH_SHORT*2(%rsi), %mm4
	movq	DV_WIDTH_SHORT*2+8(%rsi), %mm5
	movq	%mm4, 4*8(%rdi)
	movq	%mm5, 5*8(%rdi)
	movq	DV_WIDTH_SHORT*3(%rsi), %mm6
	movq	DV_WIDTH_SHORT*3+8(%rsi), %mm7
	movq	%mm6, 6*8(%rdi)
	movq	%mm7, 7*8(%rdi)

	movq	DV_WIDTH_SHORT*4(%rsi), %mm0
	movq	DV_WIDTH_SHORT*4+8(%rsi), %mm1
	movq	%mm0, 8*8(%rdi)
	movq	%mm1, 9*8(%rdi)
	movq	DV_WIDTH_SHORT*5(%rsi), %mm2
	movq	DV_WIDTH_SHORT*5+8(%rsi), %mm3
	movq	%mm2, 10*8(%rdi)
	movq	%mm3, 11*8(%rdi)
	movq	DV_WIDTH_SHORT*6(%rsi), %mm4
	movq	DV_WIDTH_SHORT*6+8(%rsi), %mm5
	movq	%mm4, 12*8(%rdi)
	movq	%mm5, 13*8(%rdi)
	movq	DV_WIDTH_SHORT*7(%rsi), %mm6
	movq	DV_WIDTH_SHORT*7+8(%rsi), %mm7
	movq	%mm6, 14*8(%rdi)
	movq	%mm7, 15*8(%rdi)

	ret
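/* The next loader widens 8-bit PGM luma to words, scales by
   2^PRECISION, and applies the OFFSETY level shift of
   (16-128) << PRECISION.  Scalar sketch (hypothetical helper):

   static void pgm_copy_y_block_ref(short *dst, const unsigned char *src)
   {
       int row, col;
       for (row = 0; row < 8; row++)
           for (col = 0; col < 8; col++)
               dst[8 * row + col] =
                   ((int)src[720 * row + col] << PRECISION)
                   + ((16 - 128) << PRECISION);
   }
*/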
.global _dv_pgm_copy_y_block_mmx_x86_64
.hidden _dv_pgm_copy_y_block_mmx_x86_64
.type   _dv_pgm_copy_y_block_mmx_x86_64,@function
_dv_pgm_copy_y_block_mmx_x86_64:

/* extern void _dv_pgm_copy_y_block_mmx_x86_64(short * dst, unsigned char * src); */

	/* arguments are dst=rdi, src=rsi */

	movq	OFFSETY(%rip), %mm7
	pxor	%mm6, %mm6

	movq	(%rsi), %mm0
	movq	DV_WIDTH_BYTE(%rsi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif

	paddw	%mm7, %mm0
	paddw	%mm7, %mm1
	paddw	%mm7, %mm2
	paddw	%mm7, %mm3

	movq	%mm0, (%rdi)
	movq	%mm2, 8(%rdi)
	movq	%mm1, 16(%rdi)
	movq	%mm3, 24(%rdi)

	add	$2*DV_WIDTH_BYTE, %rsi
	add	$32, %rdi

	movq	(%rsi), %mm0
	movq	DV_WIDTH_BYTE(%rsi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif

	paddw	%mm7, %mm0
	paddw	%mm7, %mm1
	paddw	%mm7, %mm2
	paddw	%mm7, %mm3

	movq	%mm0, (%rdi)
	movq	%mm2, 8(%rdi)
	movq	%mm1, 16(%rdi)
	movq	%mm3, 24(%rdi)

	add	$2*DV_WIDTH_BYTE, %rsi
	add	$32, %rdi

	movq	(%rsi), %mm0
	movq	DV_WIDTH_BYTE(%rsi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif
	paddw	%mm7, %mm0
	paddw	%mm7, %mm1
	paddw	%mm7, %mm2
	paddw	%mm7, %mm3

	movq	%mm0, (%rdi)
	movq	%mm2, 8(%rdi)
	movq	%mm1, 16(%rdi)
	movq	%mm3, 24(%rdi)

	add	$2*DV_WIDTH_BYTE, %rsi
	add	$32, %rdi

	movq	(%rsi), %mm0
	movq	DV_WIDTH_BYTE(%rsi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif
	paddw	%mm7, %mm0
	paddw	%mm7, %mm1
	paddw	%mm7, %mm2
	paddw	%mm7, %mm3

	movq	%mm0, (%rdi)
	movq	%mm2, 8(%rdi)
	movq	%mm1, 16(%rdi)
	movq	%mm3, 24(%rdi)

	ret

.global _dv_video_copy_y_block_mmx_x86_64
.hidden _dv_video_copy_y_block_mmx_x86_64
.type   _dv_video_copy_y_block_mmx_x86_64,@function
_dv_video_copy_y_block_mmx_x86_64:

/* extern void _dv_video_copy_y_block_mmx_x86_64(short * dst, unsigned char * src); */

	/* arguments are dst=rdi, src=rsi */

	movq	OFFSETBX(%rip), %mm7
	pxor	%mm6, %mm6

	movq	(%rsi), %mm0
	movq	DV_WIDTH_BYTE(%rsi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

	psubw	%mm7, %mm0
	psubw	%mm7, %mm1
	psubw	%mm7, %mm2
	psubw	%mm7, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif

	movq	%mm0, (%rdi)
	movq	%mm2, 8(%rdi)
	movq	%mm1, 16(%rdi)
	movq	%mm3, 24(%rdi)

	add	$2*DV_WIDTH_BYTE, %rsi
	add	$32, %rdi

	movq	(%rsi), %mm0
	movq	DV_WIDTH_BYTE(%rsi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

	psubw	%mm7, %mm0
	psubw	%mm7, %mm1
	psubw	%mm7, %mm2
	psubw	%mm7, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif

	movq	%mm0, (%rdi)
	movq	%mm2, 8(%rdi)
	movq	%mm1, 16(%rdi)
	movq	%mm3, 24(%rdi)

	add	$2*DV_WIDTH_BYTE, %rsi
	add	$32, %rdi

	movq	(%rsi), %mm0
	movq	DV_WIDTH_BYTE(%rsi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

	psubw	%mm7, %mm0
	psubw	%mm7, %mm1
	psubw	%mm7, %mm2
	psubw	%mm7, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif

	movq	%mm0, (%rdi)
	movq	%mm2, 8(%rdi)
	movq	%mm1, 16(%rdi)
	movq	%mm3, 24(%rdi)

	add	$2*DV_WIDTH_BYTE, %rsi
	add	$32, %rdi

	movq	(%rsi), %mm0
	movq	DV_WIDTH_BYTE(%rsi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

	psubw	%mm7, %mm0
	psubw	%mm7, %mm1
	psubw	%mm7, %mm2
	psubw	%mm7, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif

	movq	%mm0, (%rdi)
	movq	%mm2, 8(%rdi)
	movq	%mm1, 16(%rdi)
	movq	%mm3, 24(%rdi)

	ret
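/* PAL chroma blocks are built by averaging vertically adjacent chroma
   rows, which sit DV_WIDTH_SHORT_HALF bytes (360 shorts) apart within a
   row pair; successive output rows advance by DV_WIDTH_SHORT bytes
   (720 shorts).  Scalar sketch (hypothetical helper):

   static void ppm_copy_pal_c_block_ref(short *dst, const short *src)
   {
       int row, col;
       for (row = 0; row < 8; row++)
           for (col = 0; col < 8; col++)
               dst[8 * row + col] =
                   (src[720 * row + col] + src[720 * row + 360 + col]) >> 1;
   }
*/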
.global _dv_ppm_copy_pal_c_block_mmx_x86_64
.hidden _dv_ppm_copy_pal_c_block_mmx_x86_64
.type   _dv_ppm_copy_pal_c_block_mmx_x86_64,@function
_dv_ppm_copy_pal_c_block_mmx_x86_64:

/* extern void _dv_ppm_copy_pal_c_block_mmx_x86_64(short * dst, short * src); */

	/* arguments are dst=rdi, src=rsi */

	movq	(%rsi), %mm0
	movq	DV_WIDTH_SHORT_HALF(%rsi), %mm1
	movq	8(%rsi), %mm2
	movq	DV_WIDTH_SHORT_HALF+8(%rsi), %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3
	psraw	$1, %mm1
	psraw	$1, %mm3

	movq	%mm1, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)

	add	$DV_WIDTH_SHORT, %rsi
	add	$16, %rdi

	movq	(%rsi), %mm0
	movq	DV_WIDTH_SHORT_HALF(%rsi), %mm1
	movq	8(%rsi), %mm2
	movq	DV_WIDTH_SHORT_HALF+8(%rsi), %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3
	psraw	$1, %mm1
	psraw	$1, %mm3

	movq	%mm1, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)

	add	$DV_WIDTH_SHORT, %rsi
	add	$16, %rdi

	movq	(%rsi), %mm0
	movq	DV_WIDTH_SHORT_HALF(%rsi), %mm1
	movq	8(%rsi), %mm2
	movq	DV_WIDTH_SHORT_HALF+8(%rsi), %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3
	psraw	$1, %mm1
	psraw	$1, %mm3

	movq	%mm1, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)

	add	$DV_WIDTH_SHORT, %rsi
	add	$16, %rdi

	movq	(%rsi), %mm0
	movq	DV_WIDTH_SHORT_HALF(%rsi), %mm1
	movq	8(%rsi), %mm2
	movq	DV_WIDTH_SHORT_HALF+8(%rsi), %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3
	psraw	$1, %mm1
	psraw	$1, %mm3

	movq	%mm1, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)

	add	$DV_WIDTH_SHORT, %rsi
	add	$16, %rdi

	movq	(%rsi), %mm0
	movq	DV_WIDTH_SHORT_HALF(%rsi), %mm1
	movq	8(%rsi), %mm2
	movq	DV_WIDTH_SHORT_HALF+8(%rsi), %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3
	psraw	$1, %mm1
	psraw	$1, %mm3

	movq	%mm1, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)

	add	$DV_WIDTH_SHORT, %rsi
	add	$16, %rdi

	movq	(%rsi), %mm0
	movq	DV_WIDTH_SHORT_HALF(%rsi), %mm1
	movq	8(%rsi), %mm2
	movq	DV_WIDTH_SHORT_HALF+8(%rsi), %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3
	psraw	$1, %mm1
	psraw	$1, %mm3

	movq	%mm1, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)

	add	$DV_WIDTH_SHORT, %rsi
	add	$16, %rdi

	movq	(%rsi), %mm0
	movq	DV_WIDTH_SHORT_HALF(%rsi), %mm1
	movq	8(%rsi), %mm2
	movq	DV_WIDTH_SHORT_HALF+8(%rsi), %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3
	psraw	$1, %mm1
	psraw	$1, %mm3

	movq	%mm1, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)

	add	$DV_WIDTH_SHORT, %rsi
	add	$16, %rdi

	movq	(%rsi), %mm0
	movq	DV_WIDTH_SHORT_HALF(%rsi), %mm1
	movq	8(%rsi), %mm2
	movq	DV_WIDTH_SHORT_HALF+8(%rsi), %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3
	psraw	$1, %mm1
	psraw	$1, %mm3

	movq	%mm1, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)

	ret
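/* _dv_pgm_copy_pal_c_block below widens unsigned 8-bit chroma to words,
   recentres it around zero by subtracting the OFFSETBX constant (128),
   and scales by 2^PRECISION -- the same pattern as
   _dv_video_copy_y_block above.  Scalar sketch (hypothetical helper):

   static void pgm_copy_pal_c_block_ref(short *dst, const unsigned char *src)
   {
       int row, col;
       for (row = 0; row < 8; row++)
           for (col = 0; col < 8; col++)
               dst[8 * row + col] =
                   ((int)src[720 * row + col] - 128) << PRECISION;
   }
*/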
.global _dv_pgm_copy_pal_c_block_mmx_x86_64
.hidden _dv_pgm_copy_pal_c_block_mmx_x86_64
.type   _dv_pgm_copy_pal_c_block_mmx_x86_64,@function
_dv_pgm_copy_pal_c_block_mmx_x86_64:

/* extern void _dv_pgm_copy_pal_c_block_mmx_x86_64(short * dst, unsigned char * src); */

	/* arguments are dst=rdi, src=rsi */

	movq	OFFSETBX(%rip), %mm7
	pxor	%mm6, %mm6

	movq	(%rsi), %mm0
	movq	DV_WIDTH_BYTE(%rsi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

	psubw	%mm7, %mm0
	psubw	%mm7, %mm1
	psubw	%mm7, %mm2
	psubw	%mm7, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif

	movq	%mm0, (%rdi)
	movq	%mm2, 8(%rdi)
	movq	%mm1, 16(%rdi)
	movq	%mm3, 24(%rdi)

	add	$2*DV_WIDTH_BYTE, %rsi
	add	$32, %rdi

	movq	(%rsi), %mm0
	movq	DV_WIDTH_BYTE(%rsi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

	psubw	%mm7, %mm0
	psubw	%mm7, %mm1
	psubw	%mm7, %mm2
	psubw	%mm7, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif

	movq	%mm0, (%rdi)
	movq	%mm2, 8(%rdi)
	movq	%mm1, 16(%rdi)
	movq	%mm3, 24(%rdi)

	add	$2*DV_WIDTH_BYTE, %rsi
	add	$32, %rdi

	movq	(%rsi), %mm0
	movq	DV_WIDTH_BYTE(%rsi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

	psubw	%mm7, %mm0
	psubw	%mm7, %mm1
	psubw	%mm7, %mm2
	psubw	%mm7, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif

	movq	%mm0, (%rdi)
	movq	%mm2, 8(%rdi)
	movq	%mm1, 16(%rdi)
	movq	%mm3, 24(%rdi)

	add	$2*DV_WIDTH_BYTE, %rsi
	add	$32, %rdi

	movq	(%rsi), %mm0
	movq	DV_WIDTH_BYTE(%rsi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

	psubw	%mm7, %mm0
	psubw	%mm7, %mm1
	psubw	%mm7, %mm2
	psubw	%mm7, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif

	movq	%mm0, (%rdi)
	movq	%mm2, 8(%rdi)
	movq	%mm1, 16(%rdi)
	movq	%mm3, 24(%rdi)

	ret

.global _dv_video_copy_pal_c_block_mmx_x86_64
.hidden _dv_video_copy_pal_c_block_mmx_x86_64
.type   _dv_video_copy_pal_c_block_mmx_x86_64,@function
_dv_video_copy_pal_c_block_mmx_x86_64:

	/* extern void _dv_video_copy_pal_c_block_mmx_x86_64(short * dst, unsigned char * src); */

	/* arguments are dst=rdi, src=rsi */

	push	%rbx

	movq	OFFSETBX(%rip), %mm7
	paddw	%mm7, %mm7
	pxor	%mm6, %mm6

	mov	$4, %rbx

video_copy_pal_c_block_mmx_loop:

	movq	(%rsi), %mm0
	movq	DV_WIDTH_BYTE_HALF(%rsi), %mm2

	movq	%mm0, %mm1
	movq	%mm2, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm2

	punpckhbw %mm6, %mm1
	punpckhbw %mm6, %mm3

	paddw	%mm0, %mm2
	paddw	%mm1, %mm3

	psubw	%mm7, %mm2
	psubw	%mm7, %mm3

#if PRECISION == 0
	psraw	$1, %mm2
	psraw	$1, %mm3
#else
#if PRECISION > 1
	psllw	$PRECISION-1, %mm2
	psllw	$PRECISION-1, %mm3
#endif
#endif
	movq	%mm2, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)

	add	$DV_WIDTH_BYTE, %rsi
	add	$16, %rdi

	movq	(%rsi), %mm0
	movq	DV_WIDTH_BYTE_HALF(%rsi), %mm2

	movq	%mm0, %mm1
	movq	%mm2, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm2

	punpckhbw %mm6, %mm1
	punpckhbw %mm6, %mm3

	paddw	%mm0, %mm2
	paddw	%mm1, %mm3

	psubw	%mm7, %mm2
	psubw	%mm7, %mm3

#if PRECISION == 0
	psraw	$1, %mm2
	psraw	$1, %mm3
#else
#if PRECISION > 1
	psllw	$PRECISION-1, %mm2
	psllw	$PRECISION-1, %mm3
#endif
#endif
	movq	%mm2, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)

	add	$DV_WIDTH_BYTE, %rsi
	add	$16, %rdi

	dec	%rbx
	jnz	video_copy_pal_c_block_mmx_loop

	pop	%rbx

	ret
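/* NTSC (4:1:1) chroma is decimated horizontally: pmaddwd against the
   ALLONE constant sums adjacent word pairs in one step, and psraw $1
   turns each sum into an average.  Scalar sketch of the loader below
   (hypothetical helper; rows are DV_WIDTH_SHORT_HALF bytes, i.e. 360
   shorts, apart):

   static void ppm_copy_ntsc_c_block_ref(short *dst, const short *src)
   {
       int row, col;
       for (row = 0; row < 8; row++)
           for (col = 0; col < 8; col++)
               dst[8 * row + col] = (src[360 * row + 2 * col]
                                   + src[360 * row + 2 * col + 1]) >> 1;
   }
*/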
.global _dv_ppm_copy_ntsc_c_block_mmx_x86_64
.hidden _dv_ppm_copy_ntsc_c_block_mmx_x86_64
.type   _dv_ppm_copy_ntsc_c_block_mmx_x86_64,@function
_dv_ppm_copy_ntsc_c_block_mmx_x86_64:

	/* extern void _dv_ppm_copy_ntsc_c_block_mmx_x86_64(short * dst, short * src); */

	/* arguments are dst=rdi, src=rsi */

	push	%rbx

	mov	$4, %rbx                # loop counter

	movq	ALLONE(%rip), %mm6

ppm_copy_ntsc_c_block_mmx_loop:

	movq	(%rsi), %mm0
	movq	8(%rsi), %mm1
	movq	16(%rsi), %mm2
	movq	24(%rsi), %mm3

	pmaddwd	%mm6, %mm0
	pmaddwd	%mm6, %mm1

	pmaddwd	%mm6, %mm2
	pmaddwd	%mm6, %mm3

	packssdw %mm1, %mm0
	packssdw %mm3, %mm2

	psraw	$1, %mm0
	psraw	$1, %mm2

	movq	%mm0, 0*8(%rdi)
	movq	%mm2, 1*8(%rdi)

	add	$DV_WIDTH_SHORT_HALF, %rsi
	add	$16, %rdi

	movq	(%rsi), %mm0
	movq	8(%rsi), %mm1
	movq	16(%rsi), %mm2
	movq	24(%rsi), %mm3

	pmaddwd	%mm6, %mm0
	pmaddwd	%mm6, %mm1

	pmaddwd	%mm6, %mm2
	pmaddwd	%mm6, %mm3

	packssdw %mm1, %mm0
	packssdw %mm3, %mm2

	psraw	$1, %mm0
	psraw	$1, %mm2

	movq	%mm0, 0*8(%rdi)
	movq	%mm2, 1*8(%rdi)

	add	$DV_WIDTH_SHORT_HALF, %rsi
	add	$16, %rdi

	dec	%rbx
	jnz	ppm_copy_ntsc_c_block_mmx_loop

	pop	%rbx

	ret
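/* Both NTSC loaders below reduce two unsigned 8-bit samples to one
   signed short: the pair is summed, recentred by the doubled OFFSETBX
   (2*128), and rescaled so the result is the pair's mean times
   2^PRECISION.  Scalar sketch (hypothetical helper; with PRECISION == 1
   the sum is stored as-is, matching the assembly, where neither
   conditional shift is compiled in):

   static short avg2_recentre(int a, int b)
   {
       int s = a + b - 2 * 128;
   #if PRECISION == 0
       return (short)(s >> 1);
   #elif PRECISION > 1
       return (short)(s << (PRECISION - 1));
   #else
       return (short)s;
   #endif
   }

   _dv_pgm_copy_ntsc_c_block additionally writes each averaged row
   twice, doubling lines vertically. */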
.global _dv_pgm_copy_ntsc_c_block_mmx_x86_64
.hidden _dv_pgm_copy_ntsc_c_block_mmx_x86_64
.type   _dv_pgm_copy_ntsc_c_block_mmx_x86_64,@function
_dv_pgm_copy_ntsc_c_block_mmx_x86_64:

	/* extern void _dv_pgm_copy_ntsc_c_block_mmx_x86_64(short * dst, unsigned char * src); */

	/* arguments are dst=rdi, src=rsi */

	movq	OFFSETBX(%rip), %mm7
	paddw	%mm7, %mm7
	pxor	%mm6, %mm6

	movq	(%rsi), %mm0
	movq	8(%rsi), %mm2

	movq	%mm0, %mm1
	movq	%mm2, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm2

	punpckhbw %mm6, %mm1
	punpckhbw %mm6, %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3

	psubw	%mm7, %mm1
	psubw	%mm7, %mm3

#if PRECISION == 0
	psraw	$1, %mm1
	psraw	$1, %mm3
#else
#if PRECISION > 1
	psllw	$PRECISION-1, %mm1
	psllw	$PRECISION-1, %mm3
#endif
#endif
	movq	%mm1, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)
	movq	%mm1, 2*8(%rdi)
	movq	%mm3, 3*8(%rdi)

	add	$DV_WIDTH_BYTE, %rsi
	add	$32, %rdi

	movq	(%rsi), %mm0
	movq	8(%rsi), %mm2

	movq	%mm0, %mm1
	movq	%mm2, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm2

	punpckhbw %mm6, %mm1
	punpckhbw %mm6, %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3

	psubw	%mm7, %mm1
	psubw	%mm7, %mm3

#if PRECISION == 0
	psraw	$1, %mm1
	psraw	$1, %mm3
#else
#if PRECISION > 1
	psllw	$PRECISION-1, %mm1
	psllw	$PRECISION-1, %mm3
#endif
#endif
	movq	%mm1, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)
	movq	%mm1, 2*8(%rdi)
	movq	%mm3, 3*8(%rdi)

	add	$DV_WIDTH_BYTE, %rsi
	add	$32, %rdi

	movq	(%rsi), %mm0
	movq	8(%rsi), %mm2

	movq	%mm0, %mm1
	movq	%mm2, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm2

	punpckhbw %mm6, %mm1
	punpckhbw %mm6, %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3

	psubw	%mm7, %mm1
	psubw	%mm7, %mm3

#if PRECISION == 0
	psraw	$1, %mm1
	psraw	$1, %mm3
#else
#if PRECISION > 1
	psllw	$PRECISION-1, %mm1
	psllw	$PRECISION-1, %mm3
#endif
#endif
	movq	%mm1, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)
	movq	%mm1, 2*8(%rdi)
	movq	%mm3, 3*8(%rdi)

	add	$DV_WIDTH_BYTE, %rsi
	add	$32, %rdi

	movq	(%rsi), %mm0
	movq	8(%rsi), %mm2

	movq	%mm0, %mm1
	movq	%mm2, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm2

	punpckhbw %mm6, %mm1
	punpckhbw %mm6, %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3

	psubw	%mm7, %mm1
	psubw	%mm7, %mm3

#if PRECISION == 0
	psraw	$1, %mm1
	psraw	$1, %mm3
#else
#if PRECISION > 1
	psllw	$PRECISION-1, %mm1
	psllw	$PRECISION-1, %mm3
#endif
#endif
	movq	%mm1, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)
	movq	%mm1, 2*8(%rdi)
	movq	%mm3, 3*8(%rdi)

	ret

.global _dv_video_copy_ntsc_c_block_mmx_x86_64
.hidden _dv_video_copy_ntsc_c_block_mmx_x86_64
.type   _dv_video_copy_ntsc_c_block_mmx_x86_64,@function
_dv_video_copy_ntsc_c_block_mmx_x86_64:

	/* extern void _dv_video_copy_ntsc_c_block_mmx_x86_64(short * dst, unsigned char * src); */

	/* arguments are dst=rdi, src=rsi */

	push	%rbx

	movq	OFFSETBX(%rip), %mm7
	paddw	%mm7, %mm7
	pxor	%mm6, %mm6

	mov	$4, %rbx                # loop counter

video_copy_ntsc_c_block_mmx_loop:

	movq	(%rsi), %mm0
	movq	8(%rsi), %mm2

	movq	%mm0, %mm1
	movq	%mm2, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm2

	punpckhbw %mm6, %mm1
	punpckhbw %mm6, %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3

	psubw	%mm7, %mm1
	psubw	%mm7, %mm3

#if PRECISION == 0
	psraw	$1, %mm1
	psraw	$1, %mm3
#else
#if PRECISION > 1
	psllw	$PRECISION-1, %mm1
	psllw	$PRECISION-1, %mm3
#endif
#endif
	movq	%mm1, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)

	add	$DV_WIDTH_BYTE_HALF, %rsi
	add	$16, %rdi

	movq	(%rsi), %mm0
	movq	8(%rsi), %mm2

	movq	%mm0, %mm1
	movq	%mm2, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm2

	punpckhbw %mm6, %mm1
	punpckhbw %mm6, %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3

	psubw	%mm7, %mm1
	psubw	%mm7, %mm3

#if PRECISION == 0
	psraw	$1, %mm1
	psraw	$1, %mm3
#else
#if PRECISION > 1
	psllw	$PRECISION-1, %mm1
	psllw	$PRECISION-1, %mm3
#endif
#endif
	movq	%mm1, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)

	add	$DV_WIDTH_BYTE_HALF, %rsi
	add	$16, %rdi

	dec	%rbx
	jnz	video_copy_ntsc_c_block_mmx_loop

	pop	%rbx

	ret