;*****************************************************************************
;* x86-optimized functions for fspp filter
;*
;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License along
;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; 8 rows of 8 dither bytes; store_slice/store_slice2 consume one row per
; output line.
pb_dither: db 0,  48, 12, 60, 3,  51, 15, 63, 32, 16, 44, 28, 35, 19, 47, 31, \
              8,  56, 4,  52, 11, 59, 7,  55, 40, 24, 36, 20, 43, 27, 39, 23, \
              2,  50, 14, 62, 1,  49, 13, 61, 34, 18, 46, 30, 33, 17, 45, 29, \
              10, 58, 6,  54, 9,  57, 5,  53, 42, 26, 38, 22, 41, 25, 37, 21

; Fixed-point multipliers for pmulhw, each replicated over four words.
pw_187E:   times 4 dw 0x187E ; FIX64(0.382683433, 14)
pw_22A3:   times 4 dw 0x22A3 ; FIX64(1.082392200, 13)
pw_2D41:   times 4 dw 0x2D41 ; FIX64(1.414213562, 13)
pw_539F:   times 4 dw 0x539F ; FIX64(1.306562965, 14)
pw_5A82:   times 4 dw 0x5A82 ; FIX64(1.414213562, 14)
pw_3B21:   times 4 dw 0x3B21 ; FIX64(1.847759065, 13)
pw_AC62:   times 4 dw 0xAC62 ; FIX64(-2.613125930, 13)
pw_3642:   times 4 dw 0x3642 ; FIX64(0.847759065, 14)
pw_2441:   times 4 dw 0x2441 ; FIX64(0.566454497, 14)
pw_0CBB:   times 4 dw 0x0CBB ; FIX64(0.198912367, 14)
pw_4:      times 4 dw 4
pw_2:      times 4 dw 2

SECTION .text

%define DCTSIZE 8

INIT_MMX mmx

;------------------------------------------------------------------------------
;void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
;                        ptrdiff_t dst_stride, ptrdiff_t src_stride,
;                        ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
;
; For every group of 8 samples in a row:
;   dst[x] = packuswb((src[x] + (dither[x] >> log2_scale)) >> (6 - log2_scale))
; and then clears both src[x] and the word 16*src_stride bytes before it.
; width is rounded up to a multiple of 8, so whole 8-sample groups are
; always read and written.
; src advances by 2*src_stride bytes per row, dst by dst_stride bytes.
; One 8-byte row of pb_dither is used per output row; the loop body always
; runs at least once.
; NOTE(review): pb_dither holds 8 rows, so height is presumably <= 8 -
; confirm against the callers.
; The register that carries log2_scale on entry is reused as the dither
; pointer once both shift counts have been moved to m5/m2.
; On x86-32 the two stride arguments are updated in place in their stack
; slots (r2m/r3m).
;------------------------------------------------------------------------------
%if ARCH_X86_64
cglobal store_slice, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
%else
cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
%define dst_strideq r2m
%define src_strideq r3m
    mov       widthq, r4m
    mov       dither_heightq, r5m
    mov       ditherq, r6m ; log2_scale
%endif
    add       widthq, 7
    mov       tmpq, src_strideq
    and       widthq, ~7                ; width = (width + 7) & ~7
    sub       dst_strideq, widthq       ; dst skip after each row
    movd      m5, ditherd ; log2_scale
    xor       ditherq, -1 ; log2_scale
    mov       tmp2q, tmpq
    add       ditherq, 7 ; log2_scale   ; ~x + 7 == 6 - x
    neg       tmpq
    sub       tmp2q, widthq
    movd      m2, ditherd ; log2_scale  ; m2 = 6 - log2_scale
    add       tmp2q, tmp2q              ; (src_stride - width) * 2 bytes
    lea       ditherq, [pb_dither]
    mov       src_strideq, tmp2q        ; src skip after each row
    shl       tmpq, 4                   ; tmp = -16 * src_stride (byte offset)
    lea       dither_heightq, [ditherq+dither_heightq*8] ; end of dither rows
    pxor      m7, m7                    ; m7 = 0 for unpacking and clearing

.loop_height:
    movq      m3, [ditherq]
    movq      m4, m3
    punpcklbw m3, m7                    ; dither bytes 0-3 -> words
    punpckhbw m4, m7                    ; dither bytes 4-7 -> words
    mov       tmp2q, widthq             ; samples left in this row
    psraw     m3, m5
    psraw     m4, m5

.loop_width:
    movq      [srcq+tmpq], m7
    movq      m0, [srcq]
    movq      m1, [srcq+8]
    movq      [srcq+tmpq+8], m7
    paddw     m0, m3
    paddw     m1, m4
    movq      [srcq], m7
    psraw     m0, m2
    psraw     m1, m2
    movq      [srcq+8], m7
    packuswb  m0, m1                    ; saturate to unsigned bytes
    add       srcq, 16
    movq      [dstq], m0
    add       dstq, 8
    sub       tmp2q, 8
    jg .loop_width

    add       srcq, src_strideq
    add       ditherq, 8
    add       dstq, dst_strideq
    cmp       ditherq, dither_heightq
    jb .loop_height                     ; unsigned: both operands are addresses
    RET

;------------------------------------------------------------------------------
;void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
;                         ptrdiff_t dst_stride, ptrdiff_t src_stride,
;                         ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
;
; Same row/column walk as store_slice, but every output sample is built from
; two source words:
;   dst[x] = packuswb((src[x] + src2[x] + (dither[x] >> log2_scale))
;                     >> (6 - log2_scale))
; where src2 lies 32*src_stride bytes after src. Only src2[x] is cleared
; afterwards; src[x] is left untouched.
;------------------------------------------------------------------------------
%if ARCH_X86_64
cglobal store_slice2, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
%else
cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
%define dst_strideq r2m
%define src_strideq r3m
    mov       dstq, dstm
    mov       srcq, srcm
    mov       widthq, r4m
    mov       dither_heightq, r5m
    mov       ditherq, r6m ; log2_scale
%endif
    add       widthq, 7
    mov       tmpq, src_strideq
    and       widthq, ~7                ; width = (width + 7) & ~7
    sub       dst_strideq, widthq       ; dst skip after each row
    movd      m5, ditherd ; log2_scale
    xor       ditherq, -1 ; log2_scale
    mov       tmp2q, tmpq
    add       ditherq, 7 ; log2_scale   ; ~x + 7 == 6 - x
    sub       tmp2q, widthq
    movd      m2, ditherd ; log2_scale  ; m2 = 6 - log2_scale
    add       tmp2q, tmp2q              ; (src_stride - width) * 2 bytes
    lea       ditherq, [pb_dither]
    mov       src_strideq, tmp2q        ; src skip after each row
    shl       tmpq, 5                   ; tmp = 32 * src_stride (byte offset)
    lea       dither_heightq, [ditherq+dither_heightq*8] ; end of dither rows
    pxor      m7, m7                    ; m7 = 0 for unpacking and clearing

.loop_height:
    movq      m3, [ditherq]
    movq      m4, m3
    punpcklbw m3, m7                    ; dither bytes 0-3 -> words
    punpckhbw m4, m7                    ; dither bytes 4-7 -> words
    mov       tmp2q, widthq             ; samples left in this row
    psraw     m3, m5
    psraw     m4, m5

.loop_width:
    movq      m0, [srcq]
    movq      m1, [srcq+8]
    paddw     m0, m3
    paddw     m0, [srcq+tmpq]
    paddw     m1, m4
    movq      m6, [srcq+tmpq+8]
    movq      [srcq+tmpq], m7
    psraw     m0, m2
    paddw     m1, m6
    movq      [srcq+tmpq+8], m7
    psraw     m1, m2
    packuswb  m0, m1                    ; saturate to unsigned bytes
    movq      [dstq], m0
    add       srcq, 16
    add       dstq, 8
    sub       tmp2q, 8
    jg .loop_width

    add       srcq, src_strideq
    add       ditherq, 8
    add       dstq, dst_strideq
    cmp       ditherq, dither_heightq
    jb .loop_height                     ; unsigned: both operands are addresses
    RET

;------------------------------------------------------------------------------
;void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
;
; Multiplies 64 int16 values (16 qwords) read from thr_adr_noq by the low
; 16 bits of q and stores the low 16 bits of each product to thr_adr.
; Fully unrolled; loads, multiplies and stores are interleaved.
;------------------------------------------------------------------------------
cglobal mul_thrmat, 3, 3, 0, thrn, thr, q
    movd      m7, qd
    movq      m0, [thrnq]
    punpcklwd m7, m7
    movq      m1, [thrnq+8]
    punpckldq m7, m7                    ; m7 = low word of q in all four words
    pmullw    m0, m7
    movq      m2, [thrnq+8*2]
    pmullw    m1, m7
    movq      m3, [thrnq+8*3]
    pmullw    m2, m7
    movq      [thrq], m0
    movq      m4, [thrnq+8*4]
    pmullw    m3, m7
    movq      [thrq+8], m1
    movq      m5, [thrnq+8*5]
    pmullw    m4, m7
    movq      [thrq+8*2], m2
    movq      m6, [thrnq+8*6]
    pmullw    m5, m7
    movq      [thrq+8*3], m3
    movq      m0, [thrnq+8*7]
    pmullw    m6, m7
    movq      [thrq+8*4], m4
    movq      m1, [thrnq+8*7+8]
    pmullw    m0, m7
    movq      [thrq+8*5], m5
    movq      m2, [thrnq+8*7+8*2]
    pmullw    m1, m7
    movq      [thrq+8*6], m6
    movq      m3, [thrnq+8*7+8*3]
    pmullw    m2, m7
    movq      [thrq+8*7], m0
    movq      m4, [thrnq+8*7+8*4]
    pmullw    m3, m7
    movq      [thrq+8*7+8], m1
    movq      m5, [thrnq+8*7+8*5]
    pmullw    m4, m7
    movq      [thrq+8*7+8*2], m2
    movq      m6, [thrnq+8*7+8*6]
    pmullw    m5, m7
    movq      [thrq+8*7+8*3], m3
    movq      m0, [thrnq+14*8]
    pmullw    m6, m7
    movq      [thrq+8*7+8*4], m4
    movq      m1, [thrnq+14*8+8]
    pmullw    m0, m7
    movq      [thrq+8*7+8*5], m5
    pmullw    m1, m7
    movq      [thrq+8*7+8*6], m6
    movq      [thrq+14*8], m0
    movq      [thrq+14*8+8], m1
    RET

;------------------------------------------------------------------------------
; COLUMN_FDCT %1 [, %2 [, %3]]
;   %1  label to branch to when any of the three values tested after the
;       threshold step (m1, m5, m6) is non-zero
;   %2  byte offset added to every thrq access (default 0)
;   %3  extra bytes added to the 8-byte srcq/outq advance (default 0)
;
; Works on four int16 columns at once (one qword per row), reading the eight
; rows at srcq+DCTSIZE*k*2. Each intermediate value is run through a
; psubw/paddusw/paddw/psubusw sequence against its thrq entry.
; NOTE(review): that sequence presumably zeroes values inside the threshold
; dead zone - confirm against the C reference.
; [rsp]..[rsp+8*3] are used as four qword spill slots.
;
; Branch to %1 taken: m0, m1, m5, m6 and the four spill slots hold the inputs
; that COLUMN_IDCT reads; srcq/outq are not yet advanced.
; Fall-through: the output is derived from m0 and the spill slots only.
; outq rows 0-5 are added to, rows 6 and 7 are overwritten, and srcq/outq
; advance by 8+%3.
; Clobbers m0-m7, tmpd and flags.
;------------------------------------------------------------------------------
%macro COLUMN_FDCT 1-3 0, 0
    movq      m1, [srcq+DCTSIZE*0*2]
    movq      m7, [srcq+DCTSIZE*3*2]
    movq      m0, m1
    paddw     m1, [srcq+DCTSIZE*7*2]
    movq      m3, m7
    paddw     m7, [srcq+DCTSIZE*4*2]
    movq      m5, m1
    movq      m6, [srcq+DCTSIZE*1*2]
    psubw     m1, m7
    movq      m2, [srcq+DCTSIZE*2*2]
    movq      m4, m6
    paddw     m6, [srcq+DCTSIZE*6*2]
    paddw     m5, m7
    paddw     m2, [srcq+DCTSIZE*5*2]
    movq      m7, m6
    paddw     m6, m2
    psubw     m7, m2
    movq      m2, m5
    paddw     m5, m6
    psubw     m2, m6
    paddw     m7, m1
    movq      m6, [thrq+4*16+%2]
    psllw     m7, 2
    psubw     m5, [thrq+%2]
    psubw     m2, m6
    paddusw   m5, [thrq+%2]
    paddusw   m2, m6
    pmulhw    m7, [pw_2D41]
    paddw     m5, [thrq+%2]
    paddw     m2, m6
    psubusw   m5, [thrq+%2]
    psubusw   m2, m6
    paddw     m5, [pw_2]
    movq      m6, m2
    paddw     m2, m5
    psubw     m5, m6
    movq      m6, m1
    paddw     m1, m7
    psubw     m1, [thrq+2*16+%2]
    psubw     m6, m7
    movq      m7, [thrq+6*16+%2]
    psraw     m5, 2
    paddusw   m1, [thrq+2*16+%2]
    psubw     m6, m7
    paddw     m1, [thrq+2*16+%2]
    paddusw   m6, m7
    psubusw   m1, [thrq+2*16+%2]
    paddw     m6, m7
    psubw     m3, [srcq+DCTSIZE*4*2]
    psubusw   m6, m7
    movq      m7, m1
    psraw     m2, 2
    psubw     m4, [srcq+DCTSIZE*6*2]
    psubw     m1, m6
    psubw     m0, [srcq+DCTSIZE*7*2]
    paddw     m6, m7
    psraw     m6, 2
    movq      m7, m2
    pmulhw    m1, [pw_5A82]
    paddw     m2, m6
    movq      [rsp], m2                 ; spill slot 0
    psubw     m7, m6
    movq      m2, [srcq+DCTSIZE*2*2]
    psubw     m1, m6
    psubw     m2, [srcq+DCTSIZE*5*2]
    movq      m6, m5
    movq      [rsp+8*3], m7             ; spill slot 3
    paddw     m3, m2
    paddw     m2, m4
    paddw     m4, m0
    movq      m7, m3
    psubw     m3, m4
    psllw     m3, 2
    psllw     m7, 2
    pmulhw    m3, [pw_187E]
    psllw     m4, 2
    pmulhw    m7, [pw_22A3]
    psllw     m2, 2
    pmulhw    m4, [pw_539F]
    paddw     m5, m1
    pmulhw    m2, [pw_2D41]
    psubw     m6, m1
    paddw     m7, m3
    movq      [rsp+8], m5               ; spill slot 1
    paddw     m4, m3
    movq      m3, [thrq+3*16+%2]
    movq      m1, m0
    movq      [rsp+8*2], m6             ; spill slot 2
    psubw     m1, m2
    paddw     m0, m2
    movq      m5, m1
    movq      m2, [thrq+5*16+%2]
    psubw     m1, m7
    paddw     m5, m7
    psubw     m1, m3
    movq      m7, [thrq+16+%2]
    psubw     m5, m2
    movq      m6, m0
    paddw     m0, m4
    paddusw   m1, m3
    psubw     m6, m4
    movq      m4, [thrq+7*16+%2]
    psubw     m0, m7
    psubw     m6, m4
    paddusw   m5, m2
    paddusw   m6, m4
    paddw     m1, m3
    paddw     m5, m2
    paddw     m6, m4
    psubusw   m1, m3
    psubusw   m5, m2
    psubusw   m6, m4
    movq      m4, m1
    por       m4, m5
    paddusw   m0, m7
    por       m4, m6                    ; m4 = m1 | m5 | m6
    paddw     m0, m7
    packssdw  m4, m4                    ; fold the 64-bit OR into 32 bits
    psubusw   m0, m7
    movd      tmpd, m4
    test      tmpd, tmpd
    jnz %1                              ; something non-zero: take the full path
    ; Short path: only m0 and the four spill slots contribute.
    movq      m4, [rsp]
    movq      m1, m0
    pmulhw    m0, [pw_3642]
    movq      m2, m1
    movq      m5, [outq+DCTSIZE*0*2]
    movq      m3, m2
    pmulhw    m1, [pw_2441]
    paddw     m5, m4
    movq      m6, [rsp+8]
    psraw     m3, 2
    pmulhw    m2, [pw_0CBB]
    psubw     m4, m3
    movq      m7, [outq+DCTSIZE*1*2]
    paddw     m5, m3
    movq      [outq+DCTSIZE*7*2], m4
    paddw     m7, m6
    movq      m3, [rsp+8*2]
    psubw     m6, m0
    movq      m4, [outq+DCTSIZE*2*2]
    paddw     m7, m0
    movq      [outq], m5
    paddw     m4, m3
    movq      [outq+DCTSIZE*6*2], m6
    psubw     m3, m1
    movq      m5, [outq+DCTSIZE*5*2]
    paddw     m4, m1
    movq      m6, [outq+DCTSIZE*3*2]
    paddw     m5, m3
    movq      m0, [rsp+8*3]
    add       srcq, 8+%3
    movq      [outq+DCTSIZE*1*2], m7
    paddw     m6, m0
    movq      [outq+DCTSIZE*2*2], m4
    psubw     m0, m2
    movq      m7, [outq+DCTSIZE*4*2]
    paddw     m6, m2
    movq      [outq+DCTSIZE*5*2], m5
    paddw     m7, m0
    movq      [outq+DCTSIZE*3*2], m6
    movq      [outq+DCTSIZE*4*2], m7
    add       outq, 8+%3
%endmacro

;------------------------------------------------------------------------------
; COLUMN_IDCT [%1]
;   %1  extra bytes added to the 8-byte srcq/outq advance (default 0)
;
; Continuation of COLUMN_FDCT for the branch-taken case. Expects m0, m1, m5,
; m6 and the spill slots [rsp]..[rsp+8*3] exactly as COLUMN_FDCT leaves them
; at its jnz.
; outq rows 0-5 are added to, rows 6 and 7 are overwritten, and srcq/outq
; advance by 8+%1.
; Clobbers m0-m7.
;------------------------------------------------------------------------------
%macro COLUMN_IDCT 0-1 0
    movq      m3, m5
    psubw     m5, m1
    psllw     m5, 1
    paddw     m3, m1
    movq      m2, m0
    psubw     m0, m6
    movq      m1, m5
    psllw     m0, 1
    pmulhw    m1, [pw_AC62]
    paddw     m5, m0
    pmulhw    m5, [pw_3B21]
    paddw     m2, m6
    pmulhw    m0, [pw_22A3]
    movq      m7, m2
    movq      m4, [rsp]
    psubw     m2, m3
    psllw     m2, 1
    paddw     m7, m3
    pmulhw    m2, [pw_2D41]
    movq      m6, m4
    psraw     m7, 2
    paddw     m4, [outq]
    psubw     m6, m7
    movq      m3, [rsp+8]
    paddw     m4, m7
    movq      [outq+DCTSIZE*7*2], m6
    paddw     m1, m5
    movq      [outq], m4
    psubw     m1, m7
    movq      m7, [rsp+8*2]
    psubw     m0, m5
    movq      m6, [rsp+8*3]
    movq      m5, m3
    paddw     m3, [outq+DCTSIZE*1*2]
    psubw     m5, m1
    psubw     m2, m1
    paddw     m3, m1
    movq      [outq+DCTSIZE*6*2], m5
    movq      m4, m7
    paddw     m7, [outq+DCTSIZE*2*2]
    psubw     m4, m2
    paddw     m4, [outq+DCTSIZE*5*2]
    paddw     m7, m2
    movq      [outq+DCTSIZE*1*2], m3
    paddw     m0, m2
    movq      [outq+DCTSIZE*2*2], m7
    movq      m1, m6
    paddw     m6, [outq+DCTSIZE*4*2]
    psubw     m1, m0
    paddw     m1, [outq+DCTSIZE*3*2]
    paddw     m6, m0
    movq      [outq+DCTSIZE*5*2], m4
    add       srcq, 8+%1
    movq      [outq+DCTSIZE*4*2], m6
    movq      [outq+DCTSIZE*3*2], m1
    add       outq, 8+%1
%endmacro

;------------------------------------------------------------------------------
;void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
;
; Each pass of the outer loop handles two groups of four columns:
;   group 1: thr offset 0, src/out advance by 8 bytes
;   group 2: thr offset 8, src/out advance by 8+16 bytes
; so src and out move 32 bytes per pass and cnt drops by 2. The loop repeats
; while cnt stays positive (signed). Per group, COLUMN_FDCT either finishes
; on its own short path or branches to the matching COLUMN_IDCT.
; 32 bytes of stack are reserved for the four qword spill slots.
;------------------------------------------------------------------------------
cglobal column_fidct, 4, 5, 0, 32, thr, src, out, cnt, tmp
.fdct1:
    COLUMN_FDCT .idct1
    jmp .fdct2

.idct1:
    COLUMN_IDCT

.fdct2:
    COLUMN_FDCT .idct2, 8, 16
    sub       cntd, 2
    jg .fdct1
    RET

.idct2:
    COLUMN_IDCT 16
    sub       cntd, 2
    jg .fdct1
    RET

;------------------------------------------------------------------------------
;void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
;
; Per iteration: reads four 8-coefficient rows (64 bytes) from the workspace,
; transposes them with punpck*, runs the inverse transform, rounds with
; (+4) >> 3, and adds the result to a 4-word-wide, 8-row-high column of the
; output. The workspace pointer then advances by 64 bytes and the output
; pointer by 8 bytes. Runs cnt iterations; the body always runs at least once.
; The stride argument is doubled on entry.
; NOTE(review): that presumably converts an int16 element stride to bytes -
; confirm against the caller.
; 16 bytes of stack hold two qword temporaries.
;------------------------------------------------------------------------------
cglobal row_idct, 4, 5, 0, 16, src, dst, stride, cnt, stride3
    add       strideq, strideq
    lea       stride3q, [strideq+strideq*2]
.loop:
    movq      m0, [srcq+DCTSIZE*0*2]
    movq      m1, [srcq+DCTSIZE*1*2]
    movq      m4, m0
    movq      m2, [srcq+DCTSIZE*2*2]
    punpcklwd m0, m1
    movq      m3, [srcq+DCTSIZE*3*2]
    punpckhwd m4, m1
    movq      m7, m2
    punpcklwd m2, m3
    movq      m6, m0
    punpckldq m0, m2
    punpckhdq m6, m2
    movq      m5, m0
    punpckhwd m7, m3
    psubw     m0, m6
    pmulhw    m0, [pw_5A82]
    movq      m2, m4
    punpckldq m4, m7
    paddw     m5, m6
    punpckhdq m2, m7
    movq      m1, m4
    psllw     m0, 2
    paddw     m4, m2
    movq      m3, [srcq+DCTSIZE*0*2+8]
    psubw     m1, m2
    movq      m2, [srcq+DCTSIZE*1*2+8]
    psubw     m0, m5
    movq      m6, m4
    paddw     m4, m5
    psubw     m6, m5
    movq      m7, m1
    movq      m5, [srcq+DCTSIZE*2*2+8]
    paddw     m1, m0
    movq      [rsp], m4
    movq      m4, m3
    movq      [rsp+8], m6
    punpcklwd m3, m2
    movq      m6, [srcq+DCTSIZE*3*2+8]
    punpckhwd m4, m2
    movq      m2, m5
    punpcklwd m5, m6
    psubw     m7, m0
    punpckhwd m2, m6
    movq      m0, m3
    punpckldq m3, m5
    punpckhdq m0, m5
    movq      m5, m4
    movq      m6, m3
    punpckldq m4, m2
    psubw     m3, m0
    punpckhdq m5, m2
    paddw     m6, m0
    movq      m2, m4
    movq      m0, m3
    psubw     m4, m5
    pmulhw    m0, [pw_AC62]
    paddw     m3, m4
    pmulhw    m3, [pw_3B21]
    paddw     m2, m5
    pmulhw    m4, [pw_22A3]
    movq      m5, m2
    psubw     m2, m6
    paddw     m5, m6
    pmulhw    m2, [pw_2D41]
    paddw     m0, m3
    psllw     m0, 3
    psubw     m4, m3
    movq      m6, [rsp]
    movq      m3, m1
    psllw     m4, 3
    psubw     m0, m5
    psllw     m2, 3
    paddw     m1, m0
    psubw     m2, m0
    psubw     m3, m0
    paddw     m4, m2
    movq      m0, m7
    paddw     m7, m2
    psubw     m0, m2
    movq      m2, [pw_4]                ; rounding term for the >> 3 below
    psubw     m6, m5
    paddw     m5, [rsp]
    paddw     m1, m2
    paddw     m5, m2
    psraw     m1, 3
    paddw     m7, m2
    psraw     m5, 3
    paddw     m5, [dstq]
    psraw     m7, 3
    paddw     m1, [dstq+strideq*1]
    paddw     m0, m2
    paddw     m7, [dstq+strideq*2]
    paddw     m3, m2
    movq      [dstq], m5
    paddw     m6, m2
    movq      [dstq+strideq*1], m1
    psraw     m0, 3
    movq      [dstq+strideq*2], m7
    add       dstq, stride3q            ; dst now points at row 3
    movq      m5, [rsp+8]
    psraw     m3, 3
    paddw     m0, [dstq+strideq*2]
    psubw     m5, m4
    paddw     m3, [dstq+stride3q*1]
    psraw     m6, 3
    paddw     m4, [rsp+8]
    paddw     m5, m2
    paddw     m6, [dstq+strideq*4]
    paddw     m4, m2
    movq      [dstq+strideq*2], m0
    psraw     m5, 3
    paddw     m5, [dstq]
    psraw     m4, 3
    paddw     m4, [dstq+strideq*1]
    add       srcq, DCTSIZE*2*4
    movq      [dstq+stride3q*1], m3
    movq      [dstq+strideq*4], m6
    movq      [dstq], m5
    movq      [dstq+strideq*1], m4
    sub       dstq, stride3q            ; back to row 0
    add       dstq, 8                   ; next four output columns
    dec       cntd
    jnz .loop
    RET

;------------------------------------------------------------------------------
;void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
;
; Per iteration: loads 4 pixels from each of 8 consecutive lines, widens them
; to words, runs the forward transform down the 8 lines, transposes the
; result and stores it as four 8-word rows (64 bytes) to data. The pixel
; pointer then advances by 4 bytes and the data pointer by 64 bytes.
; Runs cnt iterations; the body always runs at least once.
; Note that the first cglobal argument (data, the output) is named "src".
; 16 bytes of stack hold two qword temporaries.
;------------------------------------------------------------------------------
cglobal row_fdct, 4, 5, 0, 16, src, pix, stride, cnt, stride3
    lea       stride3q, [strideq+strideq*2]
.loop:
    movd      m0, [pixq]
    pxor      m7, m7                    ; re-zeroed each pass: m7 is reused below
    movd      m1, [pixq+strideq*1]
    punpcklbw m0, m7
    movd      m2, [pixq+strideq*2]
    punpcklbw m1, m7
    punpcklbw m2, m7
    add       pixq,stride3q             ; pix now points at line 3
    movq      m5, m0
    movd      m3, [pixq+strideq*4]
    movq      m6, m1
    movd      m4, [pixq+stride3q*1]
    punpcklbw m3, m7
    psubw     m5, m3
    punpcklbw m4, m7
    paddw     m0, m3
    psubw     m6, m4
    movd      m3, [pixq+strideq*2]
    paddw     m1, m4
    movq      [rsp], m5
    punpcklbw m3, m7
    movq      [rsp+8], m6
    movq      m4, m2
    movd      m5, [pixq]
    paddw     m2, m3
    movd      m6, [pixq+strideq*1]
    punpcklbw m5, m7
    psubw     m4, m3
    punpcklbw m6, m7
    movq      m3, m5
    paddw     m5, m6
    psubw     m3, m6
    movq      m6, m0
    movq      m7, m1
    psubw     m0, m5
    psubw     m1, m2
    paddw     m7, m2
    paddw     m1, m0
    movq      m2, m7
    psllw     m1, 2
    paddw     m6, m5
    pmulhw    m1, [pw_2D41]
    paddw     m7, m6
    psubw     m6, m2
    movq      m5, m0
    movq      m2, m7
    punpcklwd m7, m6
    paddw     m0, m1
    punpckhwd m2, m6
    psubw     m5, m1
    movq      m6, m0
    movq      m1, [rsp+8]
    punpcklwd m0, m5
    punpckhwd m6, m5
    movq      m5, m0
    punpckldq m0, m7
    paddw     m3, m4
    punpckhdq m5, m7
    movq      m7, m6
    movq      [srcq+DCTSIZE*0*2], m0
    punpckldq m6, m2
    movq      [srcq+DCTSIZE*1*2], m5
    punpckhdq m7, m2
    movq      [srcq+DCTSIZE*2*2], m6
    paddw     m4, m1
    movq      [srcq+DCTSIZE*3*2], m7
    psllw     m3, 2
    movq      m2, [rsp]
    psllw     m4, 2
    pmulhw    m4, [pw_2D41]
    paddw     m1, m2
    psllw     m1, 2
    movq      m0, m3
    pmulhw    m0, [pw_22A3]
    psubw     m3, m1
    pmulhw    m3, [pw_187E]
    movq      m5, m2
    pmulhw    m1, [pw_539F]
    psubw     m2, m4
    paddw     m5, m4
    movq      m6, m2
    paddw     m0, m3
    movq      m7, m5
    paddw     m2, m0
    psubw     m6, m0
    movq      m4, m2
    paddw     m1, m3
    punpcklwd m2, m6
    paddw     m5, m1
    punpckhwd m4, m6
    psubw     m7, m1
    movq      m6, m5
    punpcklwd m5, m7
    punpckhwd m6, m7
    movq      m7, m2
    punpckldq m2, m5
    sub       pixq, stride3q            ; back to line 0
    punpckhdq m7, m5
    movq      m5, m4
    movq      [srcq+DCTSIZE*0*2+8], m2
    punpckldq m4, m6
    movq      [srcq+DCTSIZE*1*2+8], m7
    punpckhdq m5, m6
    movq      [srcq+DCTSIZE*2*2+8], m4
    add       pixq, 4                   ; next four pixel columns
    movq      [srcq+DCTSIZE*3*2+8], m5
    add       srcq, DCTSIZE*4*2
    dec       cntd
    jnz .loop
    RET