/*
 *  encode_x86.S
 *
 *     Copyright (C) Peter Schlaile - February 2001
 *
 *  This file is part of libdv, a free DV (IEC 61834/SMPTE 314M)
 *  codec.
 *
 *  libdv is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser Public License as published by
 *  the Free Software Foundation; either version 2.1, or (at your
 *  option) any later version.
 *
 *  libdv is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser Public License
 *  along with libdv; see the file COPYING.  If not, write to
 *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *  The libdv homepage is http://libdv.sourceforge.net/.
 */

.data
ALLONE:         .word 1,1,1,1
VLCADDMASK:     .byte 255,0,0,0,255,0,0,0

.section .note.GNU-stack, "", @progbits

.text

/*
 * _dv_vlc_encode_block_mmx(src, &dst):
 *   VLC-encodes the 63 AC coefficients of one block.  Every non-zero
 *   amplitude (together with the run of zeroes preceding it) is turned
 *   into an index into vlc_encode_lookup; the 8-byte table entry is
 *   written to the output buffer through *dst, and the updated output
 *   position is stored back.  The length bytes of the emitted entries
 *   (isolated with VLCADDMASK) are accumulated in %mm2, and the total
 *   number of bits is returned in %eax.
 */
.global _dv_vlc_encode_block_mmx
.hidden _dv_vlc_encode_block_mmx
.type   _dv_vlc_encode_block_mmx,@function
_dv_vlc_encode_block_mmx:
        pushl   %ebx
        pushl   %esi
        pushl   %edi
        pushl   %ebp

        xorl    %eax, %eax
        xorl    %edx, %edx
        movl    4+4*4(%esp), %edi       # src
        movl    4+4*4+4(%esp), %edx     # &dst
        movl    (%edx), %edx
        addl    $2, %edi                # skip the DC coefficient

        movl    $63, %ecx               # 63 AC coefficients

        movl    vlc_encode_lookup, %esi

        pxor    %mm0, %mm0
        pxor    %mm2, %mm2
        movq    VLCADDMASK, %mm1
        xorl    %ebp, %ebp
        subl    $8, %edx
vlc_encode_block_mmx_loop:
        pand    %mm1, %mm0              # keep the length bytes of the previous entry
        movw    (%edi), %ax
        addl    $8, %edx
        paddd   %mm0, %mm2              # accumulate bit count
        cmpw    $0, %ax
        jz      vlc_encode_block_amp_zero
        addw    $255, %ax               # bias the amplitude into table range
        addl    $2, %edi
        movq    (%esi, %eax, 8), %mm0
        movq    %mm0, (%edx)
        decl    %ecx
        jnz     vlc_encode_block_mmx_loop
        pand    %mm1, %mm0
        paddd   %mm0, %mm2
        jmp     vlc_encode_block_out

vlc_encode_block_amp_zero:
        movl    %ecx, %ebp
        incl    %ecx
        repz    scasw                   # skip the run of zero coefficients
        jecxz   vlc_encode_block_out
        movw    -2(%edi), %ax
        subl    %ecx, %ebp              # run length
        addw    $255, %ax
        shll    $9, %ebp                # run goes into bits 9 and up of the index
        orl     %ebp, %eax

        movq    (%esi, %eax, 8), %mm0
        movq    %mm0, (%edx)

        decl    %ecx
        jnz     vlc_encode_block_mmx_loop

        pand    %mm1, %mm0
        paddd   %mm0, %mm2

vlc_encode_block_out:
        movq    %mm2, %mm0
        psrlq   $32, %mm0
        paddd   %mm0, %mm2              # fold the two dword counters together

        movl    4+4*4+4(%esp), %ebx
        movl    %edx, (%ebx)

        movd    %mm2, %eax

        popl    %ebp
        popl    %edi
        popl    %esi
        popl    %ebx
        ret
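/*
 * _dv_vlc_num_bits_block_x86(src):
 *   Walks the 63 AC coefficients exactly like _dv_vlc_encode_block_mmx
 *   above, but instead of emitting table entries it only looks up the
 *   code lengths in the byte table vlc_num_bits_lookup and returns the
 *   summed number of bits in %eax.
 */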
.global _dv_vlc_num_bits_block_x86
.hidden _dv_vlc_num_bits_block_x86
.type   _dv_vlc_num_bits_block_x86,@function
_dv_vlc_num_bits_block_x86:
        pushl   %ebx
        pushl   %esi
        pushl   %edi
        pushl   %ebp

        xorl    %eax, %eax
        xorl    %edx, %edx
        xorl    %ebx, %ebx
        xorl    %ebp, %ebp

        movl    4+4*4(%esp), %edi       # src
        addl    $2, %edi

        movl    $63, %ecx
        movl    vlc_num_bits_lookup, %esi

vlc_num_bits_block_x86_loop:
        movw    (%edi), %ax
        addl    %ebx, %edx
        cmpw    $0, %ax
        jz      vlc_num_bits_block_amp_zero
        addw    $255, %ax
        addl    $2, %edi
        movb    (%esi, %eax), %bl

        decl    %ecx
        jnz     vlc_num_bits_block_x86_loop
        addl    %ebx, %edx
        jmp     vlc_num_bits_block_out

vlc_num_bits_block_amp_zero:
        movl    %ecx, %ebp
        incl    %ecx
        repz    scasw
        jecxz   vlc_num_bits_block_out

        subl    %ecx, %ebp
        movw    -2(%edi), %ax
        shll    $9, %ebp
        addw    $255, %ax
        orl     %ebp, %eax
        movb    (%esi, %eax), %bl

        decl    %ecx
        jnz     vlc_num_bits_block_x86_loop
        addl    %ebx, %edx

vlc_num_bits_block_out:
        movl    %edx, %eax

        popl    %ebp
        popl    %edi
        popl    %esi
        popl    %ebx
        ret

/*
 * _dv_vlc_encode_block_pass_1_x86(start, end, bit_budget, bit_offset, vsbuffer):
 *   Packs the 32-bit VLC entries between *start and end (low byte = code
 *   length, upper 24 bits = code value) into the big-endian bitstream in
 *   vsbuffer, beginning at bit position *bit_offset.  Packing stops as
 *   soon as an entry no longer fits into *bit_budget; that entry is left
 *   for the next pass.  *start, *bit_budget and *bit_offset are updated
 *   before returning.
 */
.global _dv_vlc_encode_block_pass_1_x86
.hidden _dv_vlc_encode_block_pass_1_x86
.type   _dv_vlc_encode_block_pass_1_x86,@function
_dv_vlc_encode_block_pass_1_x86:
        pushl   %ebx
        pushl   %esi
        pushl   %edi
        pushl   %ebp

        subl    $4, %esp                # local: remaining bit budget

        movl    1*4+5*4(%esp), %esi     # start
        movl    (%esi), %esi
        movl    2*4+5*4(%esp), %edi     # end
        movl    3*4+5*4(%esp), %eax     # bit_budget
        movl    (%eax), %eax
        movl    %eax, (%esp)
        movl    4*4+5*4(%esp), %ebp     # bit_offset
        movl    (%ebp), %ebp
        /*      5*4+5*4(%esp)           # vsbuffer */
        xorl    %ecx, %ecx
        xorl    %edx, %edx

vlc_encode_block_pass_1_x86_loop:
        lodsl
        movb    %al, %cl

        subl    %ecx, (%esp)            # bit_budget -= len
        jl      vlc_encode_block_pass1_x86_out

        movl    %ebp, %ebx              # bit_offset
        negl    %ecx                    # -len

        andl    $7, %ebx                # bit_offset & 7
        addl    $32, %ecx               # 32-len

        movb    %al, %dl                # len
        subl    %ebx, %ecx              # 32-len-(bit_offset & 7)

        shrl    $8, %eax                # value
        movl    %ebp, %ebx              # bit_offset

        shll    %cl, %eax               # value <<= 32-len-(bit_offset & 7)
        shrl    $3, %ebx                # bit_offset >> 3

        bswap   %eax
        addl    5*4+5*4(%esp), %ebx     # vsbuffer + (bit_offset >> 3)

        addl    %edx, %ebp              # bit_offset += len
        orl     %eax, (%ebx)            # store value

        cmpl    %esi, %edi
        jnz     vlc_encode_block_pass_1_x86_loop

        xorl    %ecx, %ecx
        addl    $4, %esi

vlc_encode_block_pass1_x86_out:
        subl    $4, %esi                # point %esi at the first entry not packed
        addl    (%esp), %ecx            # remaining bit_budget

        movl    1*4+5*4(%esp), %eax     # start
        movl    %esi, (%eax)

        movl    3*4+5*4(%esp), %eax     # bit_budget
        movl    %ecx, (%eax)

        movl    4*4+5*4(%esp), %eax     # bit_offset
        movl    %ebp, (%eax)

        addl    $4, %esp

        popl    %ebp
        popl    %edi
        popl    %esi
        popl    %ebx
        ret
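/*
 * _dv_classify_mmx(source, amp_ofs, amp_cmp):
 *   Adds the four-word constant at amp_ofs to every coefficient of the
 *   block and compares the biased values against the constant at amp_cmp
 *   (pcmpgtw).  The compare masks are accumulated over all 64
 *   coefficients and the packed sum is returned in %eax, so a non-zero
 *   result means at least one coefficient exceeded the threshold.
 */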
.global _dv_classify_mmx
.hidden _dv_classify_mmx
.type   _dv_classify_mmx,@function
_dv_classify_mmx:

        pushl   %ebp
        movl    %esp, %ebp
        pushl   %esi

        movl    12(%ebp), %esi
        movq    (%esi), %mm7            # amp_ofs
        movl    16(%ebp), %esi
        movq    (%esi), %mm6            # amp_cmp

        movl    8(%ebp), %esi           # source

        movq    %mm7, %mm5
        movq    %mm6, %mm4

        pxor    %mm3, %mm3
        pxor    %mm2, %mm2

        movq    0*8(%esi), %mm0
        movq    1*8(%esi), %mm1

        paddw   %mm7, %mm0
        paddw   %mm5, %mm1
        pcmpgtw %mm6, %mm0
        pcmpgtw %mm4, %mm1
        paddw   %mm0, %mm3
        paddw   %mm1, %mm2

        movq    2*8(%esi), %mm0
        movq    3*8(%esi), %mm1
        paddw   %mm7, %mm0
        paddw   %mm5, %mm1
        pcmpgtw %mm6, %mm0
        pcmpgtw %mm4, %mm1
        paddw   %mm0, %mm3
        paddw   %mm1, %mm2

        movq    4*8(%esi), %mm0
        movq    5*8(%esi), %mm1
        paddw   %mm7, %mm0
        paddw   %mm5, %mm1
        pcmpgtw %mm6, %mm0
        pcmpgtw %mm4, %mm1
        paddw   %mm0, %mm3
        paddw   %mm1, %mm2

        movq    6*8(%esi), %mm0
        movq    7*8(%esi), %mm1
        paddw   %mm7, %mm0
        paddw   %mm5, %mm1
        pcmpgtw %mm6, %mm0
        pcmpgtw %mm4, %mm1
        paddw   %mm0, %mm3
        paddw   %mm1, %mm2

        movq    8*8(%esi), %mm0
        movq    9*8(%esi), %mm1
        paddw   %mm7, %mm0
        paddw   %mm5, %mm1
        pcmpgtw %mm6, %mm0
        pcmpgtw %mm4, %mm1
        paddw   %mm0, %mm3
        paddw   %mm1, %mm2

        movq    10*8(%esi), %mm0
        movq    11*8(%esi), %mm1
        paddw   %mm7, %mm0
        paddw   %mm5, %mm1
        pcmpgtw %mm6, %mm0
        pcmpgtw %mm4, %mm1
        paddw   %mm0, %mm3
        paddw   %mm1, %mm2

        movq    12*8(%esi), %mm0
        movq    13*8(%esi), %mm1
        paddw   %mm7, %mm0
        paddw   %mm5, %mm1
        pcmpgtw %mm6, %mm0
        pcmpgtw %mm4, %mm1
        paddw   %mm0, %mm3
        paddw   %mm1, %mm2

        movq    14*8(%esi), %mm0
        movq    15*8(%esi), %mm1
        paddw   %mm7, %mm0
        paddw   %mm5, %mm1
        pcmpgtw %mm6, %mm0
        pcmpgtw %mm4, %mm1
        paddw   %mm0, %mm3
        paddw   %mm1, %mm2

        paddw   %mm2, %mm3
        packsswb %mm3, %mm3

        movd    %mm3, %eax

        pop     %esi
        pop     %ebp
        ret

/* FIXME: _dv_reorder_block_mmx isn't really _that_ faster than the C version...
   don't know why... */
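/*
 * For reference, a minimal C sketch of what _dv_reorder_block_mmx below
 * does, assuming reorder_table[i] holds the destination byte offset of
 * coefficient i (that is how the assembly indexes its scratch buffer).
 * The helper name and types are made up for this illustration; it is
 * not libdv's actual C implementation:
 *
 *      #include <stdint.h>
 *      #include <string.h>
 *
 *      static void reorder_block_ref(int16_t *block,
 *                                    const uint16_t *reorder_table)
 *      {
 *              uint8_t tmp[128];
 *              int i;
 *
 *              for (i = 0; i < 64; i++)
 *                      memcpy(tmp + reorder_table[i], &block[i], 2);
 *              memcpy(block, tmp, 128);
 *      }
 */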
/*
 * _dv_reorder_block_mmx(block, reorder_table):
 *   Scatters the 64 coefficients of the block into a 128-byte stack
 *   buffer at the byte offsets given by reorder_table, then copies the
 *   buffer back over the block.
 */
.global _dv_reorder_block_mmx
.hidden _dv_reorder_block_mmx
.type   _dv_reorder_block_mmx,@function
_dv_reorder_block_mmx:

        pushl   %ebp
        movl    %esp, %ebp
        pushl   %esi
        pushl   %edi
        pushl   %ebx
        pushl   %ecx
        pushl   %edx

        movl    8(%ebp), %esi           # source
        movl    12(%ebp), %edi          # reorder_table

        xorl    %ebp, %ebp
        xorl    %eax, %eax
        xorl    %ebx, %ebx
        xorl    %ecx, %ecx
        xorl    %edx, %edx

        subl    $128, %esp              # scratch buffer for the reordered block

reorder_loop:

        movw     (%esi, %ebp), %ax
        movw    2(%esi, %ebp), %bx

        movw     (%edi, %ebp), %cx
        movw    2(%edi, %ebp), %dx

        movw    %ax, (%esp,%ecx)
        movw    %bx, (%esp,%edx)

        movw    4(%esi, %ebp), %ax
        movw    6(%esi, %ebp), %bx

        movw    4(%edi, %ebp), %cx
        movw    6(%edi, %ebp), %dx

        movw    %ax, (%esp,%ecx)
        movw    %bx, (%esp,%edx)

        movw    8(%esi, %ebp), %ax
        movw    10(%esi, %ebp), %bx

        movw    8(%edi, %ebp), %cx
        movw    10(%edi, %ebp), %dx

        movw    %ax, (%esp,%ecx)
        movw    %bx, (%esp,%edx)

        movw    12(%esi, %ebp), %ax
        movw    14(%esi, %ebp), %bx

        movw    12(%edi, %ebp), %cx
        movw    14(%edi, %ebp), %dx

        movw    %ax, (%esp,%ecx)
        movw    %bx, (%esp,%edx)

        addl    $16, %ebp

        cmpl    $128, %ebp
        jne     reorder_loop

        movq      (%esp), %mm0          # copy the scratch buffer back over the block
        movq     8(%esp), %mm1
        movq    16(%esp), %mm2
        movq    24(%esp), %mm3

        movq    %mm0,   (%esi)
        movq    %mm1,  8(%esi)
        movq    %mm2, 16(%esi)
        movq    %mm3, 24(%esi)

        movq    32(%esp), %mm0
        movq    32+8(%esp), %mm1
        movq    32+16(%esp), %mm2
        movq    32+24(%esp), %mm3

        movq    %mm0, 32(%esi)
        movq    %mm1, 32+8(%esi)
        movq    %mm2, 32+16(%esi)
        movq    %mm3, 32+24(%esi)

        movq    64(%esp), %mm0
        movq    64+8(%esp), %mm1
        movq    64+16(%esp), %mm2
        movq    64+24(%esp), %mm3

        movq    %mm0, 64(%esi)
        movq    %mm1, 64+8(%esi)
        movq    %mm2, 64+16(%esi)
        movq    %mm3, 64+24(%esi)

        movq    96(%esp), %mm0
        movq    96+8(%esp), %mm1
        movq    96+16(%esp), %mm2
        movq    96+24(%esp), %mm3

        addl    $128, %esp

        movq    %mm0, 96(%esi)
        movq    %mm1, 96+8(%esi)
        movq    %mm2, 96+16(%esi)
        movq    %mm3, 96+24(%esi)

        popl    %edx
        popl    %ecx
        popl    %ebx
        popl    %edi
        popl    %esi
        popl    %ebp
        ret

/*
 * _dv_need_dct_248_mmx_rows(block):
 *   Sums the absolute differences between each pair of vertically
 *   adjacent rows of the 8x8 block and returns the total in %eax,
 *   presumably so the caller can decide between the 8-8 and 2-4-8 DCT.
 */
.global _dv_need_dct_248_mmx_rows
.hidden _dv_need_dct_248_mmx_rows
.type   _dv_need_dct_248_mmx_rows,@function
_dv_need_dct_248_mmx_rows:

        pushl   %ebp
        movl    %esp, %ebp
        pushl   %esi
        pushl   %edi

        movl    8(%ebp), %esi           # source

        # accumulate |row r - row r+1| for r = 0..6 in %mm0/%mm1
        movq    (0*8+0)*2(%esi), %mm0
        movq    (0*8+4)*2(%esi), %mm1
        psubw   (1*8+0)*2(%esi), %mm0
        psubw   (1*8+4)*2(%esi), %mm1
        movq    %mm0, %mm2
        movq    %mm1, %mm3
        psraw   $15, %mm2
        psraw   $15, %mm3
        pxor    %mm2, %mm0
        pxor    %mm3, %mm1
        psubw   %mm2, %mm0
        psubw   %mm3, %mm1

        movq    (1*8+0)*2(%esi), %mm4
        movq    (1*8+4)*2(%esi), %mm5
        psubw   (2*8+0)*2(%esi), %mm4
        psubw   (2*8+4)*2(%esi), %mm5
        movq    %mm4, %mm6
        movq    %mm5, %mm7
        psraw   $15, %mm6
        psraw   $15, %mm7
        pxor    %mm6, %mm4
        pxor    %mm7, %mm5
        psubw   %mm6, %mm4
        psubw   %mm7, %mm5

        paddw   %mm4, %mm0
        paddw   %mm5, %mm1

        movq    (2*8+0)*2(%esi), %mm4
        movq    (2*8+4)*2(%esi), %mm5
        psubw   (3*8+0)*2(%esi), %mm4
        psubw   (3*8+4)*2(%esi), %mm5
        movq    %mm4, %mm6
        movq    %mm5, %mm7
        psraw   $15, %mm6
        psraw   $15, %mm7
        pxor    %mm6, %mm4
        pxor    %mm7, %mm5
        psubw   %mm6, %mm4
        psubw   %mm7, %mm5

        paddw   %mm4, %mm0
        paddw   %mm5, %mm1

        movq    (3*8+0)*2(%esi), %mm4
        movq    (3*8+4)*2(%esi), %mm5
        psubw   (4*8+0)*2(%esi), %mm4
        psubw   (4*8+4)*2(%esi), %mm5
        movq    %mm4, %mm6
        movq    %mm5, %mm7
        psraw   $15, %mm6
        psraw   $15, %mm7
        pxor    %mm6, %mm4
        pxor    %mm7, %mm5
        psubw   %mm6, %mm4
        psubw   %mm7, %mm5

        paddw   %mm4, %mm0
        paddw   %mm5, %mm1

        movq    (4*8+0)*2(%esi), %mm4
        movq    (4*8+4)*2(%esi), %mm5
        psubw   (5*8+0)*2(%esi), %mm4
        psubw   (5*8+4)*2(%esi), %mm5
        movq    %mm4, %mm6
        movq    %mm5, %mm7
        psraw   $15, %mm6
        psraw   $15, %mm7
        pxor    %mm6, %mm4
        pxor    %mm7, %mm5
        psubw   %mm6, %mm4
        psubw   %mm7, %mm5

        paddw   %mm4, %mm0
        paddw   %mm5, %mm1

        movq    (5*8+0)*2(%esi), %mm4
        movq    (5*8+4)*2(%esi), %mm5
        psubw   (6*8+0)*2(%esi), %mm4
        psubw   (6*8+4)*2(%esi), %mm5
        movq    %mm4, %mm6
        movq    %mm5, %mm7
        psraw   $15, %mm6
        psraw   $15, %mm7
        pxor    %mm6, %mm4
        pxor    %mm7, %mm5
        psubw   %mm6, %mm4
        psubw   %mm7, %mm5

        paddw   %mm4, %mm0
        paddw   %mm5, %mm1

        movq    (6*8+0)*2(%esi), %mm4
        movq    (6*8+4)*2(%esi), %mm5
        psubw   (7*8+0)*2(%esi), %mm4
        psubw   (7*8+4)*2(%esi), %mm5
        movq    %mm4, %mm6
        movq    %mm5, %mm7
        psraw   $15, %mm6
        psraw   $15, %mm7
        pxor    %mm6, %mm4
        pxor    %mm7, %mm5
        psubw   %mm6, %mm4
        psubw   %mm7, %mm5

        paddw   %mm4, %mm0
        paddw   %mm5, %mm1

        paddw   %mm1, %mm0

        # horizontal sum of the eight word accumulators
        pmaddwd ALLONE, %mm0
        movq    %mm0, %mm1
        psrlq   $32, %mm1
        paddd   %mm1, %mm0

        movd    %mm0, %eax

        popl    %edi
        popl    %esi
        popl    %ebp

        ret