;
; Simple IDCT MMX
;
; Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
;
; Conversion from gcc syntax to x264asm syntax with minimal modifications
; by James Darnley <jdarnley@obe.tv>.
;
; This file is part of FFmpeg.
;
; FFmpeg is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public
; License as published by the Free Software Foundation; either
; version 2.1 of the License, or (at your option) any later version.
;
; FFmpeg is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with FFmpeg; if not, write to the Free Software
; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;/

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pb_80

; Mask that keeps the odd 16-bit lanes (1 and 3) of a quadword and clears the
; even ones. DC_COND_IDCT uses it to ignore the two lanes annotated r0/R0 when
; testing a row pair for "DC only".
wm1010: dw 0, 0xffff, 0, 0xffff
; Value added on the DC-only shortcut of DC_COND_IDCT before the final shift:
; 4 << 16 in the low dword, 0 in the high dword.
d40000: dd 4 << 16, 0

; 23170.475006
; 22725.260826
; 21406.727617
; 19265.545870
; 16384.000000
; 12872.826198
; 8866.956905
; 4520.335430

%define C0 23170 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C1 22725 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C2 21407 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C3 19266 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C4 16383 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
%define C5 12873 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C6 8867  ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C7 4520  ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5

; Shift applied by the first pass (block -> stack scratch) and by the second
; pass (stack scratch -> block) of the IDCT macro below.
; NOTE: the DC-only shortcut in DC_COND_IDCT hard-codes its own shifts
; (pslld 16 / psrad 13) and does not follow ROW_SHIFT.
%define ROW_SHIFT 11
%define COL_SHIFT 20 ; 6

; Constant table, addressed by byte offset from `coeffs`.
; In the instruction comments throughout this file the 16-bit lanes of a
; register are listed from most- to least-significant, i.e. in the reverse of
; the memory order used here.
coeffs:
    ; +0: first-pass rounder used by Z_COND_IDCT (two dwords)
    dw 1 << (ROW_SHIFT - 1), 0
    dw 1 << (ROW_SHIFT - 1), 0
    ; +8: first-pass rounder used by DC_COND_IDCT; the low dword has an
    ;     additional 1 << 16
    dw 1 << (ROW_SHIFT - 1), 1
    dw 1 << (ROW_SHIFT - 1), 0

    dw C4,  C4,  C4,  C4    ; +16
    dw C4, -C4,  C4, -C4    ; +24

    dw C2,  C6,  C2,  C6    ; +32
    dw C6, -C2,  C6, -C2    ; +40

    dw C1,  C3,  C1,  C3    ; +48
    dw C5,  C7,  C5,  C7    ; +56

    dw C3, -C7,  C3, -C7    ; +64
    dw -C1, -C5, -C1, -C5   ; +72

    dw C5, -C1,  C5, -C1    ; +80
    dw C7,  C3,  C7,  C3    ; +88

    dw C7, -C5,  C7, -C5    ; +96
    dw C3, -C1,  C3, -C1    ; +104

SECTION .text

;-----------------------------------------------------------------------
; DC_COND_IDCT src0, src1, src2, src3, dst, unused, shift
;   %1-%4  byte offsets from blockq of the four input quadwords
;   %5     address expression of the 32-byte destination
;   %6     not referenced by the macro body
;   %7     arithmetic right shift applied to every result
; Transforms one pair of rows (lower-case / upper-case in the comments).
; If every input lane other than r0/R0 is zero, takes the shortcut at %%1
; and fills the destination from those two lanes alone.
; Clobbers mm0-mm7, t0 and flags.
;-----------------------------------------------------------------------
%macro DC_COND_IDCT 7
    movq        mm0, [blockq + %1]  ; R4 R0 r4 r0
    movq        mm1, [blockq + %2]  ; R6 R2 r6 r2
    movq        mm2, [blockq + %3]  ; R3 R1 r3 r1
    movq        mm3, [blockq + %4]  ; R7 R5 r7 r5
    movq        mm4, [wm1010]
    pand        mm4, mm0            ; drop r0/R0 from the zero test
    por         mm4, mm1
    por         mm4, mm2
    por         mm4, mm3
    packssdw    mm4, mm4            ; fold both dwords into the low 32 bits
    movd        t0d, mm4
    or          t0d, t0d
    jz          %%1                 ; only r0/R0 can be non-zero
    movq        mm4, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd     mm4, mm0            ; C4R4+C4R0 C4r4+C4r0
    movq        mm5, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd     mm0, mm5            ; -C4R4+C4R0 -C4r4+C4r0
    movq        mm5, [coeffs + 32]  ; C6 C2 C6 C2
    pmaddwd     mm5, mm1            ; C6R6+C2R2 C6r6+C2r2
    movq        mm6, [coeffs + 40]  ; -C2 C6 -C2 C6
    pmaddwd     mm1, mm6            ; -C2R6+C6R2 -C2r6+C6r2
    movq        mm7, [coeffs + 48]  ; C3 C1 C3 C1
    pmaddwd     mm7, mm2            ; C3R3+C1R1 C3r3+C1r1
    paddd       mm4, [coeffs + 8]   ; add rounder
    movq        mm6, mm4            ; C4R4+C4R0 C4r4+C4r0
    paddd       mm4, mm5            ; A0 a0
    psubd       mm6, mm5            ; A3 a3
    movq        mm5, [coeffs + 56]  ; C7 C5 C7 C5
    pmaddwd     mm5, mm3            ; C7R7+C5R5 C7r7+C5r5
    paddd       mm0, [coeffs + 8]   ; add rounder
    paddd       mm1, mm0            ; A1 a1
    paddd       mm0, mm0
    psubd       mm0, mm1            ; A2 a2
    pmaddwd     mm2, [coeffs + 64]  ; -C7R3+C3R1 -C7r3+C3r1
    paddd       mm7, mm5            ; B0 b0
    movq        mm5, [coeffs + 72]  ; -C5 -C1 -C5 -C1
    pmaddwd     mm5, mm3            ; -C5R7-C1R5 -C5r7-C1r5
    paddd       mm7, mm4            ; A0+B0 a0+b0
    paddd       mm4, mm4            ; 2A0 2a0
    psubd       mm4, mm7            ; A0-B0 a0-b0
    paddd       mm5, mm2            ; B1 b1
    psrad       mm7, %7
    psrad       mm4, %7
    movq        mm2, mm1            ; A1 a1
    paddd       mm1, mm5            ; A1+B1 a1+b1
    psubd       mm2, mm5            ; A1-B1 a1-b1
    psrad       mm1, %7
    psrad       mm2, %7
    packssdw    mm7, mm1            ; A1+B1 a1+b1 A0+B0 a0+b0
    packssdw    mm2, mm4            ; A0-B0 a0-b0 A1-B1 a1-b1
    movq        [%5], mm7
    movq        mm1, [blockq + %3]  ; R3 R1 r3 r1
    movq        mm4, [coeffs + 80]  ; -C1 C5 -C1 C5
    movq        [24 + %5], mm2
    pmaddwd     mm4, mm1            ; -C1R3+C5R1 -C1r3+C5r1
    movq        mm7, [coeffs + 88]  ; C3 C7 C3 C7
    pmaddwd     mm1, [coeffs + 96]  ; -C5R3+C7R1 -C5r3+C7r1
    pmaddwd     mm7, mm3            ; C3R7+C7R5 C3r7+C7r5
    movq        mm2, mm0            ; A2 a2
    pmaddwd     mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
    paddd       mm4, mm7            ; B2 b2
    paddd       mm2, mm4            ; A2+B2 a2+b2
    psubd       mm0, mm4            ; A2-B2 a2-b2
    psrad       mm2, %7
    psrad       mm0, %7
    movq        mm4, mm6            ; A3 a3
    paddd       mm3, mm1            ; B3 b3
    paddd       mm6, mm3            ; A3+B3 a3+b3
    psubd       mm4, mm3            ; A3-B3 a3-b3
    psrad       mm6, %7
    packssdw    mm2, mm6            ; A3+B3 a3+b3 A2+B2 a2+b2
    movq        [8 + %5], mm2
    psrad       mm4, %7
    packssdw    mm4, mm0            ; A2-B2 a2-b2 A3-B3 a3-b3
    movq        [16 + %5], mm4
    jmp         %%2
%%1:
    ; DC-only shortcut: mm0 still holds the first input quadword and its odd
    ; lanes are known to be zero here.
    pslld       mm0, 16             ; move r0/R0 to the top of each dword
    paddd       mm0, [d40000]       ; bias the low dword only
    psrad       mm0, 13
    packssdw    mm0, mm0            ; replicate the two results
    movq        [%5], mm0
    movq        [8 + %5], mm0
    movq        [16 + %5], mm0
    movq        [24 + %5], mm0
%%2:
%endmacro

;-----------------------------------------------------------------------
; Z_COND_IDCT src0, src1, src2, src3, dst, unused, shift, zero_label
;   %1-%4  byte offsets from blockq of the four input quadwords
;   %5     address expression of the 32-byte destination
;   %6     not referenced by the macro body
;   %7     arithmetic right shift applied to every result
;   %8     label jumped to when all four input quadwords are zero; in that
;          case NOTHING is written to the destination
; Same arithmetic as the general path of DC_COND_IDCT, except that the
; rounder is taken from [coeffs] instead of [coeffs + 8].
; Clobbers mm0-mm7, t0 and flags.
;-----------------------------------------------------------------------
%macro Z_COND_IDCT 8
    movq        mm0, [blockq + %1]  ; R4 R0 r4 r0
    movq        mm1, [blockq + %2]  ; R6 R2 r6 r2
    movq        mm2, [blockq + %3]  ; R3 R1 r3 r1
    movq        mm3, [blockq + %4]  ; R7 R5 r7 r5
    movq        mm4, mm0
    por         mm4, mm1
    por         mm4, mm2
    por         mm4, mm3
    packssdw    mm4, mm4            ; fold both dwords into the low 32 bits
    movd        t0d, mm4
    or          t0d, t0d
    jz          %8                  ; whole row pair is zero
    movq        mm4, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd     mm4, mm0            ; C4R4+C4R0 C4r4+C4r0
    movq        mm5, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd     mm0, mm5            ; -C4R4+C4R0 -C4r4+C4r0
    movq        mm5, [coeffs + 32]  ; C6 C2 C6 C2
    pmaddwd     mm5, mm1            ; C6R6+C2R2 C6r6+C2r2
    movq        mm6, [coeffs + 40]  ; -C2 C6 -C2 C6
    pmaddwd     mm1, mm6            ; -C2R6+C6R2 -C2r6+C6r2
    movq        mm7, [coeffs + 48]  ; C3 C1 C3 C1
    pmaddwd     mm7, mm2            ; C3R3+C1R1 C3r3+C1r1
    paddd       mm4, [coeffs]       ; add rounder
    movq        mm6, mm4            ; C4R4+C4R0 C4r4+C4r0
    paddd       mm4, mm5            ; A0 a0
    psubd       mm6, mm5            ; A3 a3
    movq        mm5, [coeffs + 56]  ; C7 C5 C7 C5
    pmaddwd     mm5, mm3            ; C7R7+C5R5 C7r7+C5r5
    paddd       mm0, [coeffs]       ; add rounder
    paddd       mm1, mm0            ; A1 a1
    paddd       mm0, mm0
    psubd       mm0, mm1            ; A2 a2
    pmaddwd     mm2, [coeffs + 64]  ; -C7R3+C3R1 -C7r3+C3r1
    paddd       mm7, mm5            ; B0 b0
    movq        mm5, [coeffs + 72]  ; -C5 -C1 -C5 -C1
    pmaddwd     mm5, mm3            ; -C5R7-C1R5 -C5r7-C1r5
    paddd       mm7, mm4            ; A0+B0 a0+b0
    paddd       mm4, mm4            ; 2A0 2a0
    psubd       mm4, mm7            ; A0-B0 a0-b0
    paddd       mm5, mm2            ; B1 b1
    psrad       mm7, %7
    psrad       mm4, %7
    movq        mm2, mm1            ; A1 a1
    paddd       mm1, mm5            ; A1+B1 a1+b1
    psubd       mm2, mm5            ; A1-B1 a1-b1
    psrad       mm1, %7
    psrad       mm2, %7
    packssdw    mm7, mm1            ; A1+B1 a1+b1 A0+B0 a0+b0
    packssdw    mm2, mm4            ; A0-B0 a0-b0 A1-B1 a1-b1
    movq        [%5], mm7
    movq        mm1, [blockq + %3]  ; R3 R1 r3 r1
    movq        mm4, [coeffs + 80]  ; -C1 C5 -C1 C5
    movq        [24 + %5], mm2
    pmaddwd     mm4, mm1            ; -C1R3+C5R1 -C1r3+C5r1
    movq        mm7, [coeffs + 88]  ; C3 C7 C3 C7
    pmaddwd     mm1, [coeffs + 96]  ; -C5R3+C7R1 -C5r3+C7r1
    pmaddwd     mm7, mm3            ; C3R7+C7R5 C3r7+C7r5
    movq        mm2, mm0            ; A2 a2
    pmaddwd     mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
    paddd       mm4, mm7            ; B2 b2
    paddd       mm2, mm4            ; A2+B2 a2+b2
    psubd       mm0, mm4            ; A2-B2 a2-b2
    psrad       mm2, %7
    psrad       mm0, %7
    movq        mm4, mm6            ; A3 a3
    paddd       mm3, mm1            ; B3 b3
    paddd       mm6, mm3            ; A3+B3 a3+b3
    psubd       mm4, mm3            ; A3-B3 a3-b3
    psrad       mm6, %7
    packssdw    mm2, mm6            ; A3+B3 a3+b3 A2+B2 a2+b2
    movq        [8 + %5], mm2
    psrad       mm4, %7
    packssdw    mm4, mm0            ; A2-B2 a2-b2 A3-B3 a3-b3
    movq        [16 + %5], mm4
%endmacro

;-----------------------------------------------------------------------
; Second-pass kernels IDCT1 .. IDCT8
;
; Common parameter list:  in0, in1, in2, in3, dst, shift
;   %1-%4  the four inputs. IDCT1-5 and IDCT7 take complete memory operands
;          (e.g. [rsp + 0]); IDCT6 and IDCT8 take bare address expressions
;          and dereference [%n] and [8 + %n] themselves.
;   %5     destination address expression; results are stored at
;          %5 + 0, 16, 32, ... 112 (eight rows, 16 bytes apart)
;   %6     arithmetic right shift applied to every result
; IDCT1-5 and IDCT7 store 4 bytes per row (movd); IDCT6 and IDCT8 store
; 8 bytes per row (movq).
; Each variant omits the inputs that the IDCT driver macro has already
; established to be zero (and which were therefore never written to the
; scratch area), so a variant must only be used on its matching path.
; All of them clobber MMX registers; none touches general-purpose registers.
;-----------------------------------------------------------------------

; IDCT1: general case, reads all four inputs.
%macro IDCT1 6
    movq        mm0, %1             ; R4 R0 r4 r0
    movq        mm1, %2             ; R6 R2 r6 r2
    movq        mm2, %3             ; R3 R1 r3 r1
    movq        mm3, %4             ; R7 R5 r7 r5
    movq        mm4, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd     mm4, mm0            ; C4R4+C4R0 C4r4+C4r0
    movq        mm5, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd     mm0, mm5            ; -C4R4+C4R0 -C4r4+C4r0
    movq        mm5, [coeffs + 32]  ; C6 C2 C6 C2
    pmaddwd     mm5, mm1            ; C6R6+C2R2 C6r6+C2r2
    movq        mm6, [coeffs + 40]  ; -C2 C6 -C2 C6
    pmaddwd     mm1, mm6            ; -C2R6+C6R2 -C2r6+C6r2
    movq        mm6, mm4            ; C4R4+C4R0 C4r4+C4r0
    movq        mm7, [coeffs + 48]  ; C3 C1 C3 C1
    pmaddwd     mm7, mm2            ; C3R3+C1R1 C3r3+C1r1
    paddd       mm4, mm5            ; A0 a0
    psubd       mm6, mm5            ; A3 a3
    movq        mm5, mm0            ; -C4R4+C4R0 -C4r4+C4r0
    paddd       mm0, mm1            ; A1 a1
    psubd       mm5, mm1            ; A2 a2
    movq        mm1, [coeffs + 56]  ; C7 C5 C7 C5
    pmaddwd     mm1, mm3            ; C7R7+C5R5 C7r7+C5r5
    pmaddwd     mm2, [coeffs + 64]  ; -C7R3+C3R1 -C7r3+C3r1
    paddd       mm7, mm1            ; B0 b0
    movq        mm1, [coeffs + 72]  ; -C5 -C1 -C5 -C1
    pmaddwd     mm1, mm3            ; -C5R7-C1R5 -C5r7-C1r5
    paddd       mm7, mm4            ; A0+B0 a0+b0
    paddd       mm4, mm4            ; 2A0 2a0
    psubd       mm4, mm7            ; A0-B0 a0-b0
    paddd       mm1, mm2            ; B1 b1
    psrad       mm7, %6
    psrad       mm4, %6
    movq        mm2, mm0            ; A1 a1
    paddd       mm0, mm1            ; A1+B1 a1+b1
    psubd       mm2, mm1            ; A1-B1 a1-b1
    psrad       mm0, %6
    psrad       mm2, %6
    packssdw    mm7, mm7            ; A0+B0 a0+b0
    movd        [%5], mm7
    packssdw    mm0, mm0            ; A1+B1 a1+b1
    movd        [16 + %5], mm0
    packssdw    mm2, mm2            ; A1-B1 a1-b1
    movd        [96 + %5], mm2
    packssdw    mm4, mm4            ; A0-B0 a0-b0
    movd        [112 + %5], mm4
    movq        mm0, %3             ; R3 R1 r3 r1
    movq        mm4, [coeffs + 80]  ; -C1 C5 -C1 C5
    pmaddwd     mm4, mm0            ; -C1R3+C5R1 -C1r3+C5r1
    movq        mm7, [coeffs + 88]  ; C3 C7 C3 C7
    pmaddwd     mm0, [coeffs + 96]  ; -C5R3+C7R1 -C5r3+C7r1
    pmaddwd     mm7, mm3            ; C3R7+C7R5 C3r7+C7r5
    movq        mm2, mm5            ; A2 a2
    pmaddwd     mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
    paddd       mm4, mm7            ; B2 b2
    paddd       mm2, mm4            ; A2+B2 a2+b2
    psubd       mm5, mm4            ; A2-B2 a2-b2
    psrad       mm2, %6
    psrad       mm5, %6
    movq        mm4, mm6            ; A3 a3
    paddd       mm3, mm0            ; B3 b3
    paddd       mm6, mm3            ; A3+B3 a3+b3
    psubd       mm4, mm3            ; A3-B3 a3-b3
    psrad       mm6, %6
    psrad       mm4, %6
    packssdw    mm2, mm2            ; A2+B2 a2+b2
    packssdw    mm6, mm6            ; A3+B3 a3+b3
    movd        [32 + %5], mm2
    packssdw    mm4, mm4            ; A3-B3 a3-b3
    packssdw    mm5, mm5            ; A2-B2 a2-b2
    movd        [48 + %5], mm6
    movd        [64 + %5], mm4
    movd        [80 + %5], mm5
%endmacro

; IDCT2: %3 is not read (treated as zero).
%macro IDCT2 6
    movq        mm0, %1             ; R4 R0 r4 r0
    movq        mm1, %2             ; R6 R2 r6 r2
    movq        mm3, %4             ; R7 R5 r7 r5
    movq        mm4, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd     mm4, mm0            ; C4R4+C4R0 C4r4+C4r0
    movq        mm5, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd     mm0, mm5            ; -C4R4+C4R0 -C4r4+C4r0
    movq        mm5, [coeffs + 32]  ; C6 C2 C6 C2
    pmaddwd     mm5, mm1            ; C6R6+C2R2 C6r6+C2r2
    movq        mm6, [coeffs + 40]  ; -C2 C6 -C2 C6
    pmaddwd     mm1, mm6            ; -C2R6+C6R2 -C2r6+C6r2
    movq        mm6, mm4            ; C4R4+C4R0 C4r4+C4r0
    paddd       mm4, mm5            ; A0 a0
    psubd       mm6, mm5            ; A3 a3
    movq        mm5, mm0            ; -C4R4+C4R0 -C4r4+C4r0
    paddd       mm0, mm1            ; A1 a1
    psubd       mm5, mm1            ; A2 a2
    movq        mm1, [coeffs + 56]  ; C7 C5 C7 C5
    pmaddwd     mm1, mm3            ; C7R7+C5R5 C7r7+C5r5
    movq        mm7, [coeffs + 72]  ; -C5 -C1 -C5 -C1
    pmaddwd     mm7, mm3            ; -C5R7-C1R5 -C5r7-C1r5
    paddd       mm1, mm4            ; A0+B0 a0+b0
    paddd       mm4, mm4            ; 2A0 2a0
    psubd       mm4, mm1            ; A0-B0 a0-b0
    psrad       mm1, %6
    psrad       mm4, %6
    movq        mm2, mm0            ; A1 a1
    paddd       mm0, mm7            ; A1+B1 a1+b1
    psubd       mm2, mm7            ; A1-B1 a1-b1
    psrad       mm0, %6
    psrad       mm2, %6
    packssdw    mm1, mm1            ; A0+B0 a0+b0
    movd        [%5], mm1
    packssdw    mm0, mm0            ; A1+B1 a1+b1
    movd        [16 + %5], mm0
    packssdw    mm2, mm2            ; A1-B1 a1-b1
    movd        [96 + %5], mm2
    packssdw    mm4, mm4            ; A0-B0 a0-b0
    movd        [112 + %5], mm4
    movq        mm1, [coeffs + 88]  ; C3 C7 C3 C7
    pmaddwd     mm1, mm3            ; C3R7+C7R5 C3r7+C7r5
    movq        mm2, mm5            ; A2 a2
    pmaddwd     mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
    paddd       mm2, mm1            ; A2+B2 a2+b2
    psubd       mm5, mm1            ; A2-B2 a2-b2
    psrad       mm2, %6
    psrad       mm5, %6
    movq        mm1, mm6            ; A3 a3
    paddd       mm6, mm3            ; A3+B3 a3+b3
    psubd       mm1, mm3            ; A3-B3 a3-b3
    psrad       mm6, %6
    psrad       mm1, %6
    packssdw    mm2, mm2            ; A2+B2 a2+b2
    packssdw    mm6, mm6            ; A3+B3 a3+b3
    movd        [32 + %5], mm2
    packssdw    mm1, mm1            ; A3-B3 a3-b3
    packssdw    mm5, mm5            ; A2-B2 a2-b2
    movd        [48 + %5], mm6
    movd        [64 + %5], mm1
    movd        [80 + %5], mm5
%endmacro

; IDCT3: %2 and %3 are not read (treated as zero).
%macro IDCT3 6
    movq        mm0, %1             ; R4 R0 r4 r0
    movq        mm3, %4             ; R7 R5 r7 r5
    movq        mm4, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd     mm4, mm0            ; C4R4+C4R0 C4r4+C4r0
    movq        mm5, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd     mm0, mm5            ; -C4R4+C4R0 -C4r4+C4r0
    movq        mm6, mm4            ; C4R4+C4R0 C4r4+C4r0
    movq        mm5, mm0            ; -C4R4+C4R0 -C4r4+C4r0
    movq        mm1, [coeffs + 56]  ; C7 C5 C7 C5
    pmaddwd     mm1, mm3            ; C7R7+C5R5 C7r7+C5r5
    movq        mm7, [coeffs + 72]  ; -C5 -C1 -C5 -C1
    pmaddwd     mm7, mm3            ; -C5R7-C1R5 -C5r7-C1r5
    paddd       mm1, mm4            ; A0+B0 a0+b0
    paddd       mm4, mm4            ; 2A0 2a0
    psubd       mm4, mm1            ; A0-B0 a0-b0
    psrad       mm1, %6
    psrad       mm4, %6
    movq        mm2, mm0            ; A1 a1
    paddd       mm0, mm7            ; A1+B1 a1+b1
    psubd       mm2, mm7            ; A1-B1 a1-b1
    psrad       mm0, %6
    psrad       mm2, %6
    packssdw    mm1, mm1            ; A0+B0 a0+b0
    movd        [%5], mm1
    packssdw    mm0, mm0            ; A1+B1 a1+b1
    movd        [16 + %5], mm0
    packssdw    mm2, mm2            ; A1-B1 a1-b1
    movd        [96 + %5], mm2
    packssdw    mm4, mm4            ; A0-B0 a0-b0
    movd        [112 + %5], mm4
    movq        mm1, [coeffs + 88]  ; C3 C7 C3 C7
    pmaddwd     mm1, mm3            ; C3R7+C7R5 C3r7+C7r5
    movq        mm2, mm5            ; A2 a2
    pmaddwd     mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
    paddd       mm2, mm1            ; A2+B2 a2+b2
    psubd       mm5, mm1            ; A2-B2 a2-b2
    psrad       mm2, %6
    psrad       mm5, %6
    movq        mm1, mm6            ; A3 a3
    paddd       mm6, mm3            ; A3+B3 a3+b3
    psubd       mm1, mm3            ; A3-B3 a3-b3
    psrad       mm6, %6
    psrad       mm1, %6
    packssdw    mm2, mm2            ; A2+B2 a2+b2
    packssdw    mm6, mm6            ; A3+B3 a3+b3
    movd        [32 + %5], mm2
    packssdw    mm1, mm1            ; A3-B3 a3-b3
    packssdw    mm5, mm5            ; A2-B2 a2-b2
    movd        [48 + %5], mm6
    movd        [64 + %5], mm1
    movd        [80 + %5], mm5
%endmacro

; IDCT4: %2 is not read (treated as zero).
%macro IDCT4 6
    movq        mm0, %1             ; R4 R0 r4 r0
    movq        mm2, %3             ; R3 R1 r3 r1
    movq        mm3, %4             ; R7 R5 r7 r5
    movq        mm4, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd     mm4, mm0            ; C4R4+C4R0 C4r4+C4r0
    movq        mm5, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd     mm0, mm5            ; -C4R4+C4R0 -C4r4+C4r0
    movq        mm6, mm4            ; C4R4+C4R0 C4r4+C4r0
    movq        mm7, [coeffs + 48]  ; C3 C1 C3 C1
    pmaddwd     mm7, mm2            ; C3R3+C1R1 C3r3+C1r1
    movq        mm5, mm0            ; -C4R4+C4R0 -C4r4+C4r0
    movq        mm1, [coeffs + 56]  ; C7 C5 C7 C5
    pmaddwd     mm1, mm3            ; C7R7+C5R5 C7r7+C5r5
    pmaddwd     mm2, [coeffs + 64]  ; -C7R3+C3R1 -C7r3+C3r1
    paddd       mm7, mm1            ; B0 b0
    movq        mm1, [coeffs + 72]  ; -C5 -C1 -C5 -C1
    pmaddwd     mm1, mm3            ; -C5R7-C1R5 -C5r7-C1r5
    paddd       mm7, mm4            ; A0+B0 a0+b0
    paddd       mm4, mm4            ; 2A0 2a0
    psubd       mm4, mm7            ; A0-B0 a0-b0
    paddd       mm1, mm2            ; B1 b1
    psrad       mm7, %6
    psrad       mm4, %6
    movq        mm2, mm0            ; A1 a1
    paddd       mm0, mm1            ; A1+B1 a1+b1
    psubd       mm2, mm1            ; A1-B1 a1-b1
    psrad       mm0, %6
    psrad       mm2, %6
    packssdw    mm7, mm7            ; A0+B0 a0+b0
    movd        [%5], mm7
    packssdw    mm0, mm0            ; A1+B1 a1+b1
    movd        [16 + %5], mm0
    packssdw    mm2, mm2            ; A1-B1 a1-b1
    movd        [96 + %5], mm2
    packssdw    mm4, mm4            ; A0-B0 a0-b0
    movd        [112 + %5], mm4
    movq        mm0, %3             ; R3 R1 r3 r1
    movq        mm4, [coeffs + 80]  ; -C1 C5 -C1 C5
    pmaddwd     mm4, mm0            ; -C1R3+C5R1 -C1r3+C5r1
    movq        mm7, [coeffs + 88]  ; C3 C7 C3 C7
    pmaddwd     mm0, [coeffs + 96]  ; -C5R3+C7R1 -C5r3+C7r1
    pmaddwd     mm7, mm3            ; C3R7+C7R5 C3r7+C7r5
    movq        mm2, mm5            ; A2 a2
    pmaddwd     mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
    paddd       mm4, mm7            ; B2 b2
    paddd       mm2, mm4            ; A2+B2 a2+b2
    psubd       mm5, mm4            ; A2-B2 a2-b2
    psrad       mm2, %6
    psrad       mm5, %6
    movq        mm4, mm6            ; A3 a3
    paddd       mm3, mm0            ; B3 b3
    paddd       mm6, mm3            ; A3+B3 a3+b3
    psubd       mm4, mm3            ; A3-B3 a3-b3
    psrad       mm6, %6
    psrad       mm4, %6
    packssdw    mm2, mm2            ; A2+B2 a2+b2
    packssdw    mm6, mm6            ; A3+B3 a3+b3
    movd        [32 + %5], mm2
    packssdw    mm4, mm4            ; A3-B3 a3-b3
    packssdw    mm5, mm5            ; A2-B2 a2-b2
    movd        [48 + %5], mm6
    movd        [64 + %5], mm4
    movd        [80 + %5], mm5
%endmacro

; IDCT5: %2 and %4 are not read (treated as zero).
%macro IDCT5 6
    movq        mm0, %1             ; R4 R0 r4 r0
    movq        mm2, %3             ; R3 R1 r3 r1
    movq        mm4, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd     mm4, mm0            ; C4R4+C4R0 C4r4+C4r0
    movq        mm5, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd     mm0, mm5            ; -C4R4+C4R0 -C4r4+C4r0
    movq        mm6, mm4            ; C4R4+C4R0 C4r4+C4r0
    movq        mm7, [coeffs + 48]  ; C3 C1 C3 C1
    pmaddwd     mm7, mm2            ; C3R3+C1R1 C3r3+C1r1
    movq        mm5, mm0            ; -C4R4+C4R0 -C4r4+C4r0
    movq        mm3, [coeffs + 64]
    pmaddwd     mm3, mm2            ; -C7R3+C3R1 -C7r3+C3r1
    paddd       mm7, mm4            ; A0+B0 a0+b0
    paddd       mm4, mm4            ; 2A0 2a0
    psubd       mm4, mm7            ; A0-B0 a0-b0
    psrad       mm7, %6
    psrad       mm4, %6
    movq        mm1, mm0            ; A1 a1
    paddd       mm0, mm3            ; A1+B1 a1+b1
    psubd       mm1, mm3            ; A1-B1 a1-b1
    psrad       mm0, %6
    psrad       mm1, %6
    packssdw    mm7, mm7            ; A0+B0 a0+b0
    movd        [%5], mm7
    packssdw    mm0, mm0            ; A1+B1 a1+b1
    movd        [16 + %5], mm0
    packssdw    mm1, mm1            ; A1-B1 a1-b1
    movd        [96 + %5], mm1
    packssdw    mm4, mm4            ; A0-B0 a0-b0
    movd        [112 + %5], mm4
    movq        mm4, [coeffs + 80]  ; -C1 C5 -C1 C5
    pmaddwd     mm4, mm2            ; -C1R3+C5R1 -C1r3+C5r1
    pmaddwd     mm2, [coeffs + 96]  ; -C5R3+C7R1 -C5r3+C7r1
    movq        mm1, mm5            ; A2 a2
    paddd       mm1, mm4            ; A2+B2 a2+b2
    psubd       mm5, mm4            ; A2-B2 a2-b2
    psrad       mm1, %6
    psrad       mm5, %6
    movq        mm4, mm6            ; A3 a3
    paddd       mm6, mm2            ; A3+B3 a3+b3
    psubd       mm4, mm2            ; A3-B3 a3-b3
    psrad       mm6, %6
    psrad       mm4, %6
    packssdw    mm1, mm1            ; A2+B2 a2+b2
    packssdw    mm6, mm6            ; A3+B3 a3+b3
    movd        [32 + %5], mm1
    packssdw    mm4, mm4            ; A3-B3 a3-b3
    packssdw    mm5, mm5            ; A2-B2 a2-b2
    movd        [48 + %5], mm6
    movd        [64 + %5], mm4
    movd        [80 + %5], mm5
%endmacro

; IDCT6: only %1 and %2 are read (as addresses, two quadwords each);
; %3 and %4 are treated as zero. Handles 8 bytes per row per invocation and
; stores each result to two mirrored rows.
%macro IDCT6 6
    movq        mm0, [%1]           ; R4 R0 r4 r0
    movq        mm1, [%2]           ; R6 R2 r6 r2
    movq        mm4, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd     mm4, mm0            ; C4R4+C4R0 C4r4+C4r0
    movq        mm5, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd     mm0, mm5            ; -C4R4+C4R0 -C4r4+C4r0
    movq        mm5, [coeffs + 32]  ; C6 C2 C6 C2
    pmaddwd     mm5, mm1            ; C6R6+C2R2 C6r6+C2r2
    movq        mm6, [coeffs + 40]  ; -C2 C6 -C2 C6
    pmaddwd     mm1, mm6            ; -C2R6+C6R2 -C2r6+C6r2
    movq        mm6, mm4            ; C4R4+C4R0 C4r4+C4r0
    paddd       mm4, mm5            ; A0 a0
    psubd       mm6, mm5            ; A3 a3
    movq        mm5, mm0            ; -C4R4+C4R0 -C4r4+C4r0
    paddd       mm0, mm1            ; A1 a1
    psubd       mm5, mm1            ; A2 a2
    movq        mm2, [8 + %1]       ; R4 R0 r4 r0
    movq        mm3, [8 + %2]       ; R6 R2 r6 r2
    movq        mm1, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd     mm1, mm2            ; C4R4+C4R0 C4r4+C4r0
    movq        mm7, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd     mm2, mm7            ; -C4R4+C4R0 -C4r4+C4r0
    movq        mm7, [coeffs + 32]  ; C6 C2 C6 C2
    pmaddwd     mm7, mm3            ; C6R6+C2R2 C6r6+C2r2
    pmaddwd     mm3, [coeffs + 40]  ; -C2R6+C6R2 -C2r6+C6r2
    paddd       mm7, mm1            ; A0 a0
    paddd       mm1, mm1            ; 2C0 2c0
    psubd       mm1, mm7            ; A3 a3
    paddd       mm3, mm2            ; A1 a1
    paddd       mm2, mm2            ; 2C1 2c1
    psubd       mm2, mm3            ; A2 a2
    psrad       mm4, %6
    psrad       mm7, %6
    psrad       mm3, %6
    packssdw    mm4, mm7            ; A0 a0
    movq        [%5], mm4
    psrad       mm0, %6
    packssdw    mm0, mm3            ; A1 a1
    movq        [16 + %5], mm0
    movq        [96 + %5], mm0
    movq        [112 + %5], mm4
    psrad       mm5, %6
    psrad       mm6, %6
    psrad       mm2, %6
    packssdw    mm5, mm2            ; A2-B2 a2-b2
    movq        [32 + %5], mm5
    psrad       mm1, %6
    packssdw    mm6, mm1            ; A3+B3 a3+b3
    movq        [48 + %5], mm6
    movq        [64 + %5], mm6
    movq        [80 + %5], mm5
%endmacro

; IDCT7: %4 is not read (treated as zero).
%macro IDCT7 6
    movq        mm0, %1             ; R4 R0 r4 r0
    movq        mm1, %2             ; R6 R2 r6 r2
    movq        mm2, %3             ; R3 R1 r3 r1
    movq        mm4, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd     mm4, mm0            ; C4R4+C4R0 C4r4+C4r0
    movq        mm5, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd     mm0, mm5            ; -C4R4+C4R0 -C4r4+C4r0
    movq        mm5, [coeffs + 32]  ; C6 C2 C6 C2
    pmaddwd     mm5, mm1            ; C6R6+C2R2 C6r6+C2r2
    movq        mm6, [coeffs + 40]  ; -C2 C6 -C2 C6
    pmaddwd     mm1, mm6            ; -C2R6+C6R2 -C2r6+C6r2
    movq        mm6, mm4            ; C4R4+C4R0 C4r4+C4r0
    movq        mm7, [coeffs + 48]  ; C3 C1 C3 C1
    pmaddwd     mm7, mm2            ; C3R3+C1R1 C3r3+C1r1
    paddd       mm4, mm5            ; A0 a0
    psubd       mm6, mm5            ; A3 a3
    movq        mm5, mm0            ; -C4R4+C4R0 -C4r4+C4r0
    paddd       mm0, mm1            ; A1 a1
    psubd       mm5, mm1            ; A2 a2
    movq        mm1, [coeffs + 64]
    pmaddwd     mm1, mm2            ; -C7R3+C3R1 -C7r3+C3r1
    paddd       mm7, mm4            ; A0+B0 a0+b0
    paddd       mm4, mm4            ; 2A0 2a0
    psubd       mm4, mm7            ; A0-B0 a0-b0
    psrad       mm7, %6
    psrad       mm4, %6
    movq        mm3, mm0            ; A1 a1
    paddd       mm0, mm1            ; A1+B1 a1+b1
    psubd       mm3, mm1            ; A1-B1 a1-b1
    psrad       mm0, %6
    psrad       mm3, %6
    packssdw    mm7, mm7            ; A0+B0 a0+b0
    movd        [%5], mm7
    packssdw    mm0, mm0            ; A1+B1 a1+b1
    movd        [16 + %5], mm0
    packssdw    mm3, mm3            ; A1-B1 a1-b1
    movd        [96 + %5], mm3
    packssdw    mm4, mm4            ; A0-B0 a0-b0
    movd        [112 + %5], mm4
    movq        mm4, [coeffs + 80]  ; -C1 C5 -C1 C5
    pmaddwd     mm4, mm2            ; -C1R3+C5R1 -C1r3+C5r1
    pmaddwd     mm2, [coeffs + 96]  ; -C5R3+C7R1 -C5r3+C7r1
    movq        mm3, mm5            ; A2 a2
    paddd       mm3, mm4            ; A2+B2 a2+b2
    psubd       mm5, mm4            ; A2-B2 a2-b2
    psrad       mm3, %6
    psrad       mm5, %6
    movq        mm4, mm6            ; A3 a3
    paddd       mm6, mm2            ; A3+B3 a3+b3
    psubd       mm4, mm2            ; A3-B3 a3-b3
    psrad       mm6, %6
    packssdw    mm3, mm3            ; A2+B2 a2+b2
    movd        [32 + %5], mm3
    psrad       mm4, %6
    packssdw    mm6, mm6            ; A3+B3 a3+b3
    movd        [48 + %5], mm6
    packssdw    mm4, mm4            ; A3-B3 a3-b3
    packssdw    mm5, mm5            ; A2-B2 a2-b2
    movd        [64 + %5], mm4
    movd        [80 + %5], mm5
%endmacro

; IDCT8: only %1 is read (as an address, two quadwords); %2-%4 are treated as
; zero. Handles 8 bytes per row per invocation; the two computed vectors are
; replicated across all eight rows.
%macro IDCT8 6
    movq        mm0, [%1]           ; R4 R0 r4 r0
    movq        mm4, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd     mm4, mm0            ; C4R4+C4R0 C4r4+C4r0
    movq        mm5, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd     mm0, mm5            ; -C4R4+C4R0 -C4r4+C4r0
    psrad       mm4, %6
    psrad       mm0, %6
    movq        mm2, [8 + %1]       ; R4 R0 r4 r0
    movq        mm1, [coeffs + 16]  ; C4 C4 C4 C4
    pmaddwd     mm1, mm2            ; C4R4+C4R0 C4r4+C4r0
    movq        mm7, [coeffs + 24]  ; -C4 C4 -C4 C4
    pmaddwd     mm2, mm7            ; -C4R4+C4R0 -C4r4+C4r0
    ; (a load of [coeffs + 32] into mm7 used to sit here; mm7 was never read
    ; afterwards, so the dead load has been dropped)
    psrad       mm1, %6
    packssdw    mm4, mm1            ; A0 a0
    movq        [%5], mm4
    psrad       mm2, %6
    packssdw    mm0, mm2            ; A1 a1
    movq        [16 + %5], mm0
    movq        [96 + %5], mm0
    movq        [112 + %5], mm4
    movq        [32 + %5], mm0
    movq        [48 + %5], mm4
    movq        [64 + %5], mm4
    movq        [80 + %5], mm0
%endmacro

;-----------------------------------------------------------------------
; IDCT
; Complete two-pass transform of the 128 bytes at blockq, in place.
;
; First pass (shift ROW_SHIFT): the block is processed as four 32-byte
; groups g0..g3 (block offsets 0, 32, 64, 96) and written to the scratch
; area rsp+0 .. rsp+127. g0 goes through DC_COND_IDCT; g1..g3 go through
; Z_COND_IDCT, which skips the store and branches when the group is all zero.
;
; Second pass (shift COL_SHIFT): reads the scratch area and writes blockq.
; The kernel is chosen by which groups were zero:
;   fall-through  none zero            IDCT1
;   %%4           g1 zero              IDCT2
;   %%6           g1, g2 zero          IDCT3
;   %%2           g2 zero              IDCT4
;   %%3           g2, g3 zero          IDCT5
;   %%5           g1, g3 zero          IDCT6
;   %%1           g3 zero              IDCT7
;   %%7           g1, g2, g3 zero      IDCT8
;
; Requires: blockq, t0 and 128 bytes of stack at rsp (see the cglobal lines).
; Clobbers: mm0-mm7, t0, flags, and the scratch area.
;-----------------------------------------------------------------------
%macro IDCT 0
    DC_COND_IDCT  0,   8,  16,  24, rsp +  0, null, ROW_SHIFT
    Z_COND_IDCT  32,  40,  48,  56, rsp + 32, null, ROW_SHIFT, %%4
    Z_COND_IDCT  64,  72,  80,  88, rsp + 64, null, ROW_SHIFT, %%2
    Z_COND_IDCT  96, 104, 112, 120, rsp + 96, null, ROW_SHIFT, %%1

    IDCT1 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, COL_SHIFT
    IDCT1 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, COL_SHIFT
    IDCT1 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, COL_SHIFT
    IDCT1 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, COL_SHIFT
    jmp %%9

    ALIGN 16
%%4:
    Z_COND_IDCT  64,  72,  80,  88, rsp + 64, null, ROW_SHIFT, %%6
    Z_COND_IDCT  96, 104, 112, 120, rsp + 96, null, ROW_SHIFT, %%5

    IDCT2 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, COL_SHIFT
    IDCT2 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, COL_SHIFT
    IDCT2 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, COL_SHIFT
    IDCT2 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, COL_SHIFT
    jmp %%9

    ALIGN 16
%%6:
    Z_COND_IDCT  96, 104, 112, 120, rsp + 96, null, ROW_SHIFT, %%7

    IDCT3 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, COL_SHIFT
    IDCT3 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, COL_SHIFT
    IDCT3 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, COL_SHIFT
    IDCT3 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, COL_SHIFT
    jmp %%9

    ALIGN 16
%%2:
    Z_COND_IDCT  96, 104, 112, 120, rsp + 96, null, ROW_SHIFT, %%3

    IDCT4 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, COL_SHIFT
    IDCT4 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, COL_SHIFT
    IDCT4 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, COL_SHIFT
    IDCT4 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, COL_SHIFT
    jmp %%9

    ALIGN 16
%%3:

    IDCT5 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, COL_SHIFT
    IDCT5 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, COL_SHIFT
    IDCT5 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, COL_SHIFT
    IDCT5 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, COL_SHIFT
    jmp %%9

    ALIGN 16
%%5:

    IDCT6 rsp +  0, rsp + 64, rsp + 32, rsp +  96, blockq +  0, COL_SHIFT
    IDCT6 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq +  8, COL_SHIFT
    jmp %%9

    ALIGN 16
%%1:

    IDCT7 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, COL_SHIFT
    IDCT7 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, COL_SHIFT
    IDCT7 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, COL_SHIFT
    IDCT7 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, COL_SHIFT
    jmp %%9

    ALIGN 16
%%7:

    IDCT8 rsp +  0, rsp + 64, rsp + 32, rsp +  96, blockq +  0, COL_SHIFT
    IDCT8 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq +  8, COL_SHIFT

%%9:
%endmacro

;-----------------------------------------------------------------------
; PUT_PIXELS_CLAMPED_HALF block_offset
;   %1  byte offset into blockq of the first of 64 input bytes
; Packs 64 bytes of 16-bit values starting at blockq + %1 to bytes with
; unsigned saturation (packuswb) and stores them as four 8-byte rows at
; pixelsq, pixelsq + lsizeq, pixelsq + 2*lsizeq and pixelsq + lsize3q.
; Requires lsize3q to be set up by the caller. Clobbers m0-m1 (m0-m3 when
; mmsize == 8).
; NOTE(review): loads use mova, so blockq is presumably expected to be
; mmsize-aligned - confirm against callers.
;-----------------------------------------------------------------------
%macro PUT_PIXELS_CLAMPED_HALF 1
    mova        m0, [blockq+mmsize*0+%1]
    mova        m1, [blockq+mmsize*2+%1]
%if mmsize == 8
    mova        m2, [blockq+mmsize*4+%1]
    mova        m3, [blockq+mmsize*6+%1]
%endif
    packuswb    m0, [blockq+mmsize*1+%1]
    packuswb    m1, [blockq+mmsize*3+%1]
%if mmsize == 8
    packuswb    m2, [blockq+mmsize*5+%1]
    packuswb    m3, [blockq+mmsize*7+%1]
    movq        [pixelsq], m0
    movq        [lsizeq+pixelsq], m1
    movq        [2*lsizeq+pixelsq], m2
    movq        [lsize3q+pixelsq], m3
%else
    ; each 16-byte register holds two output rows: low half, then high half
    movq        [pixelsq], m0
    movhps      [lsizeq+pixelsq], m0
    movq        [2*lsizeq+pixelsq], m1
    movhps      [lsize3q+pixelsq], m1
%endif
%endmacro

;-----------------------------------------------------------------------
; ADD_PIXELS_CLAMPED block_offset
;   %1  byte offset into blockq of the first of 32 input bytes
; Adds 32 bytes of 16-bit values starting at blockq + %1 to two 8-byte rows
; of pixels (pixelsq and pixelsq + lsizeq) using signed-saturating word adds
; (paddsw), packs the sums to bytes with unsigned saturation (packuswb) and
; stores them back to the same two rows.
; Requires m4 == 0 on entry (used to zero-extend the pixel bytes).
; Clobbers m0-m3 (and m5-m7 when mmsize == 8).
;-----------------------------------------------------------------------
%macro ADD_PIXELS_CLAMPED 1
    mova        m0, [blockq+mmsize*0+%1]
    mova        m1, [blockq+mmsize*1+%1]
%if mmsize == 8
    mova        m5, [blockq+mmsize*2+%1]
    mova        m6, [blockq+mmsize*3+%1]
%endif
    movq        m2, [pixelsq]
    movq        m3, [pixelsq+lsizeq]
%if mmsize == 8
    mova        m7, m2
    punpcklbw   m2, m4
    punpckhbw   m7, m4
    paddsw      m0, m2
    paddsw      m1, m7
    mova        m7, m3
    punpcklbw   m3, m4
    punpckhbw   m7, m4
    paddsw      m5, m3
    paddsw      m6, m7
%else
    punpcklbw   m2, m4
    punpcklbw   m3, m4
    paddsw      m0, m2
    paddsw      m1, m3
%endif
    packuswb    m0, m1
%if mmsize == 8
    packuswb    m5, m6
    movq        [pixelsq], m0
    movq        [pixelsq+lsizeq], m5
%else
    movq        [pixelsq], m0
    movhps      [pixelsq+lsizeq], m0
%endif
%endmacro

INIT_MMX mmx

; simple_idct(block): transform the block in place.
; cglobal arguments: 1 argument, 2 GPRs (block, t0), 8 vector registers,
; 128 bytes of stack used by IDCT as scratch.
cglobal simple_idct, 1, 2, 8, 128, block, t0
    IDCT
RET

; simple_idct_put(pixels, lsize, block): transform the block in place, then
; store all eight rows, clamped to bytes, to pixels with row stride lsize.
cglobal simple_idct_put, 3, 5, 8, 128, pixels, lsize, block, lsize3, t0
    IDCT
    lea lsize3q, [lsizeq*3]
    PUT_PIXELS_CLAMPED_HALF 0
    lea pixelsq, [pixelsq+lsizeq*4]
    PUT_PIXELS_CLAMPED_HALF 64
RET

; simple_idct_add(pixels, lsize, block): transform the block in place, then
; add it to the eight rows at pixels (row stride lsize) with clamping.
cglobal simple_idct_add, 3, 4, 8, 128, pixels, lsize, block, t0
    IDCT
    pxor m4, m4                     ; zero register required by ADD_PIXELS_CLAMPED
    ADD_PIXELS_CLAMPED 0
    lea pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 32
    lea pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 64
    lea pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 96
RET

INIT_XMM sse2

; SSE2 build of simple_idct_put: the IDCT itself still names mm0-mm7
; explicitly; only the final pack/store stage uses the 16-byte registers.
cglobal simple_idct_put, 3, 5, 8, 128, pixels, lsize, block, lsize3, t0
    IDCT
    lea lsize3q, [lsizeq*3]
    PUT_PIXELS_CLAMPED_HALF 0
    lea pixelsq, [pixelsq+lsizeq*4]
    PUT_PIXELS_CLAMPED_HALF 64
RET

; SSE2 build of simple_idct_add: the IDCT itself still names mm0-mm7
; explicitly; only the final add/pack/store stage uses the 16-byte registers.
cglobal simple_idct_add, 3, 4, 8, 128, pixels, lsize, block, t0
    IDCT
    pxor m4, m4                     ; zero register required by ADD_PIXELS_CLAMPED
    ADD_PIXELS_CLAMPED 0
    lea pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 32
    lea pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 64
    lea pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 96
RET