;
; jidctflt.asm - floating-point IDCT (SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the inverse DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jidctflt.c; see the jidctflt.c for more details.
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Two-operand shuffle helpers used by the second transpose phase.
; Lane numbering: %1 = (0 1 2 3), %2 = (4 5 6 7).

%macro  unpcklps2 2             ; result: %1 = (0 1 4 5), low halves of both
        shufps    %1, %2, 0x44
%endmacro

%macro  unpckhps2 2             ; result: %1 = (2 3 6 7), high halves of both
        shufps    %1, %2, 0xEE
%endmacro

; --------------------------------------------------------------------------
        SECTION   SEG_CONST

        alignz    16
        global    EXTN(jconst_idct_float_sse2)

; Constant table; every entry is one 16-byte vector with the value repeated
; in each lane, and entries are addressed through GOTOFF(ebx, <label>).

EXTN(jconst_idct_float_sse2):

PD_1_414        times 4 dd  1.414213562373095048801689
PD_1_847        times 4 dd  1.847759065022573512256366
PD_1_082        times 4 dd  1.082392200292393968799446
PD_M2_613       times 4 dd -2.613125929752753055713286
; Rounding constant = (float)(0x00C00000 << 3).  It is added to each result
; in pass 2, after which only the low 16 bits of every lane are kept.
PD_RNDINT_MAGIC times 4 dd  100663296.0
; Byte-wise bias added to the packed samples before they are stored.
PB_CENTERJSAMP  times 16 db CENTERJSAMPLE

        alignz    16

; --------------------------------------------------------------------------
        SECTION   SEG_TEXT
        BITS      32
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_float_sse2 (void *dct_table, JCOEFPTR coef_block,
;                        JSAMPARRAY output_buf, JDIMENSION output_col)
;
; Structure (four columns / four rows are handled per loop iteration):
;   Pass 1  converts the 16-bit coefficients to float, multiplies them by
;           the dct_table entries, runs the column transform and stores the
;           transposed result into a FAST_FLOAT workspace on the stack.
;   Pass 2  runs the row transform on the workspace, adds PD_RNDINT_MAGIC
;           and keeps the low word of each lane, packs to bytes, adds
;           PB_CENTERJSAMP and stores 8 bytes per output row at
;           output_buf[row] + output_col.
;
; Register roles:
;   ebx = GOT address (temporarily reused as a row pointer in pass 2)
;   ecx = loop counter, DCTSIZE/4 iterations per pass
;   pass 1: edx = quantptr, esi = inptr, edi = wsptr
;   pass 2: esi = wsptr, edi = output_buf, eax = output_col
; ebx, esi and edi are saved and restored; ecx and edx are not.
;

; Stack arguments, addressed from the base value b (esp right after
; "push ebp", i.e. the address of the saved ebp).
%define dct_table(b)    (b)+8           ; void *dct_table
%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
%define output_col(b)   (b)+20          ; JDIMENSION output_col

; Locals, addressed from the 16-byte aligned ebp.
%define WK_NUM          2
%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define workspace       wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
                                        ; FAST_FLOAT workspace[DCTSIZE2]

        align     16
        global    EXTN(jsimd_idct_float_sse2)

EXTN(jsimd_idct_float_sse2):
        push      ebp
        mov       eax, esp                      ; eax = base for the stack arguments
        sub       esp, byte 4                   ; room for the saved pointer
        and       esp, byte (-SIZEOF_XMMWORD)   ; round esp down to 128 bits
        mov       [esp], eax                    ; [original_ebp] = unaligned base
        mov       ebp, esp                      ; ebp = aligned frame base
        lea       esp, [workspace]              ; reserve wk[] and workspace[]
        push      ebx
;       push      ecx                           ; need not be preserved
;       push      edx                           ; need not be preserved
        push      esi
        push      edi

        get_GOT   ebx                           ; ebx = GOT address

        ; ---- Pass 1: process columns from input, store into work array.

        ; The reload of eax is left disabled: the code below uses the value
        ; that was placed in eax by the prologue.
;       mov       eax, [original_ebp]
        mov       edx, POINTER [dct_table(eax)]         ; edx = quantptr
        mov       esi, JCOEFPTR [coef_block(eax)]       ; esi = inptr
        lea       edi, [workspace]                      ; edi = wsptr (FAST_FLOAT *)
        mov       ecx, DCTSIZE/4                        ; ecx = ctr
        alignx    16,7
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
        ; Cheap pre-test: if the first dword of row 1 or row 2 is non-zero
        ; the full transform is needed, skip the wider check.
        mov       eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
        or        eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
        jnz       near .columnDCT

        ; Full test: OR rows 1..7 of the four current columns together.
        movq      xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
        movq      xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
        movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
        movq      xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
        movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
        movq      xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
        movq      xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
        por       xmm1, xmm2
        por       xmm3, xmm4
        por       xmm5, xmm6
        por       xmm1, xmm3
        por       xmm5, xmm7
        por       xmm1, xmm5
        packsswb  xmm1, xmm1                    ; squeeze four words into one dword
        movd      eax, xmm1
        test      eax, eax
        jnz       short .columnDCT

        ; -- AC terms all zero: each column is its dequantized DC value

        movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]

        punpcklwd xmm0, xmm0                    ; xmm0=(00 00 01 01 02 02 03 03)
        psrad     xmm0, (DWORD_BIT-WORD_BIT)    ; sign-extend: xmm0=in0=(00 01 02 03)
        cvtdq2ps  xmm0, xmm0                    ; int32 -> float

        mulps     xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

        movaps    xmm1, xmm0
        movaps    xmm2, xmm0
        movaps    xmm3, xmm0

        shufps    xmm0, xmm0, 0x00              ; broadcast lane 0: (00 00 00 00)
        shufps    xmm1, xmm1, 0x55              ; broadcast lane 1: (01 01 01 01)
        shufps    xmm2, xmm2, 0xAA              ; broadcast lane 2: (02 02 02 02)
        shufps    xmm3, xmm3, 0xFF              ; broadcast lane 3: (03 03 03 03)

        ; Fill both halves of the four workspace rows with the broadcasts.
        movaps    XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
        movaps    XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
        movaps    XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
        movaps    XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
        movaps    XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
        movaps    XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
        movaps    XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
        movaps    XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
        jmp       near .nextcolumn
        alignx    16,7
%endif
.columnDCT:

        ; -- Even part (rows 0, 2, 4, 6)

        movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
        movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
        movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
        movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]

        ; word -> sign-extended dword -> float
        punpcklwd xmm0, xmm0                    ; xmm0=(00 00 01 01 02 02 03 03)
        punpcklwd xmm1, xmm1                    ; xmm1=(20 20 21 21 22 22 23 23)
        psrad     xmm0, (DWORD_BIT-WORD_BIT)    ; xmm0=in0=(00 01 02 03)
        psrad     xmm1, (DWORD_BIT-WORD_BIT)    ; xmm1=in2=(20 21 22 23)
        cvtdq2ps  xmm0, xmm0                    ; xmm0=in0 as float
        cvtdq2ps  xmm1, xmm1                    ; xmm1=in2 as float

        punpcklwd xmm2, xmm2                    ; xmm2=(40 40 41 41 42 42 43 43)
        punpcklwd xmm3, xmm3                    ; xmm3=(60 60 61 61 62 62 63 63)
        psrad     xmm2, (DWORD_BIT-WORD_BIT)    ; xmm2=in4=(40 41 42 43)
        psrad     xmm3, (DWORD_BIT-WORD_BIT)    ; xmm3=in6=(60 61 62 63)
        cvtdq2ps  xmm2, xmm2                    ; xmm2=in4 as float
        cvtdq2ps  xmm3, xmm3                    ; xmm3=in6 as float

        ; dequantize
        mulps     xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps     xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps     xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps     xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

        movaps    xmm4, xmm0
        movaps    xmm5, xmm1
        subps     xmm0, xmm2                    ; xmm0=tmp11 = in0 - in4
        subps     xmm1, xmm3                    ;            in2 - in6
        addps     xmm4, xmm2                    ; xmm4=tmp10 = in0 + in4
        addps     xmm5, xmm3                    ; xmm5=tmp13 = in2 + in6

        mulps     xmm1, [GOTOFF(ebx,PD_1_414)]
        subps     xmm1, xmm5                    ; xmm1=tmp12

        movaps    xmm6, xmm4
        movaps    xmm7, xmm0
        subps     xmm4, xmm5                    ; xmm4=tmp3 = tmp10 - tmp13
        subps     xmm0, xmm1                    ; xmm0=tmp2 = tmp11 - tmp12
        addps     xmm6, xmm5                    ; xmm6=tmp0 = tmp10 + tmp13
        addps     xmm7, xmm1                    ; xmm7=tmp1 = tmp11 + tmp12

        ; Spill tmp2/tmp3: their registers are reused by the odd part.
        movaps    XMMWORD [wk(1)], xmm4         ; wk(1)=tmp3
        movaps    XMMWORD [wk(0)], xmm0         ; wk(0)=tmp2

        ; -- Odd part (rows 1, 3, 5, 7)

        movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
        movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
        movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
        movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]

        punpcklwd xmm2, xmm2                    ; xmm2=(10 10 11 11 12 12 13 13)
        punpcklwd xmm3, xmm3                    ; xmm3=(30 30 31 31 32 32 33 33)
        psrad     xmm2, (DWORD_BIT-WORD_BIT)    ; xmm2=in1=(10 11 12 13)
        psrad     xmm3, (DWORD_BIT-WORD_BIT)    ; xmm3=in3=(30 31 32 33)
        cvtdq2ps  xmm2, xmm2                    ; xmm2=in1 as float
        cvtdq2ps  xmm3, xmm3                    ; xmm3=in3 as float

        punpcklwd xmm5, xmm5                    ; xmm5=(50 50 51 51 52 52 53 53)
        punpcklwd xmm1, xmm1                    ; xmm1=(70 70 71 71 72 72 73 73)
        psrad     xmm5, (DWORD_BIT-WORD_BIT)    ; xmm5=in5=(50 51 52 53)
        psrad     xmm1, (DWORD_BIT-WORD_BIT)    ; xmm1=in7=(70 71 72 73)
        cvtdq2ps  xmm5, xmm5                    ; xmm5=in5 as float
        cvtdq2ps  xmm1, xmm1                    ; xmm1=in7 as float

        ; dequantize
        mulps     xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps     xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps     xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps     xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

        movaps    xmm4, xmm2
        movaps    xmm0, xmm5
        addps     xmm2, xmm1                    ; xmm2=z11 = in1 + in7
        addps     xmm5, xmm3                    ; xmm5=z13 = in5 + in3
        subps     xmm4, xmm1                    ; xmm4=z12 = in1 - in7
        subps     xmm0, xmm3                    ; xmm0=z10 = in5 - in3

        movaps    xmm1, xmm2
        subps     xmm2, xmm5                    ;            z11 - z13
        addps     xmm1, xmm5                    ; xmm1=tmp7 = z11 + z13

        mulps     xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11

        movaps    xmm3, xmm0
        addps     xmm0, xmm4                    ;            z10 + z12
        mulps     xmm0, [GOTOFF(ebx,PD_1_847)]  ; xmm0=z5
        mulps     xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
        mulps     xmm4, [GOTOFF(ebx,PD_1_082)]  ; xmm4=(z12 * 1.082392200)
        addps     xmm3, xmm0                    ; xmm3=tmp12
        subps     xmm4, xmm0                    ; xmm4=tmp10

        ; -- Final output stage

        subps     xmm3, xmm1                    ; xmm3=tmp6 = tmp12 - tmp7
        movaps    xmm5, xmm6
        movaps    xmm0, xmm7
        addps     xmm6, xmm1                    ; xmm6=data0=(00 01 02 03)
        addps     xmm7, xmm3                    ; xmm7=data1=(10 11 12 13)
        subps     xmm5, xmm1                    ; xmm5=data7=(70 71 72 73)
        subps     xmm0, xmm3                    ; xmm0=data6=(60 61 62 63)
        subps     xmm2, xmm3                    ; xmm2=tmp5 = tmp11 - tmp6

        ; transpose, phase 1: interleave row pairs (0,1) and (6,7)
        movaps    xmm1, xmm6
        unpcklps  xmm6, xmm7                    ; xmm6=(00 10 01 11)
        unpckhps  xmm1, xmm7                    ; xmm1=(02 12 03 13)
        movaps    xmm3, xmm0
        unpcklps  xmm0, xmm5                    ; xmm0=(60 70 61 71)
        unpckhps  xmm3, xmm5                    ; xmm3=(62 72 63 73)

        ; Swap: fetch the spilled tmp2/tmp3, park the (6,7) pairs instead.
        movaps    xmm7, XMMWORD [wk(0)]         ; xmm7=tmp2
        movaps    xmm5, XMMWORD [wk(1)]         ; xmm5=tmp3

        movaps    XMMWORD [wk(0)], xmm0         ; wk(0)=(60 70 61 71)
        movaps    XMMWORD [wk(1)], xmm3         ; wk(1)=(62 72 63 73)

        addps     xmm4, xmm2                    ; xmm4=tmp4 = tmp10 + tmp5
        movaps    xmm0, xmm7
        movaps    xmm3, xmm5
        addps     xmm7, xmm2                    ; xmm7=data2=(20 21 22 23)
        addps     xmm5, xmm4                    ; xmm5=data4=(40 41 42 43)
        subps     xmm0, xmm2                    ; xmm0=data5=(50 51 52 53)
        subps     xmm3, xmm4                    ; xmm3=data3=(30 31 32 33)

        ; transpose, phase 1: interleave row pairs (2,3) and (4,5)
        movaps    xmm2, xmm7
        unpcklps  xmm7, xmm3                    ; xmm7=(20 30 21 31)
        unpckhps  xmm2, xmm3                    ; xmm2=(22 32 23 33)
        movaps    xmm4, xmm5
        unpcklps  xmm5, xmm0                    ; xmm5=(40 50 41 51)
        unpckhps  xmm4, xmm0                    ; xmm4=(42 52 43 53)

        ; transpose, phase 2: combine the pairs of rows 0..3
        movaps    xmm3, xmm6
        unpcklps2 xmm6, xmm7                    ; xmm6=(00 10 20 30)
        unpckhps2 xmm3, xmm7                    ; xmm3=(01 11 21 31)
        movaps    xmm0, xmm1
        unpcklps2 xmm1, xmm2                    ; xmm1=(02 12 22 32)
        unpckhps2 xmm0, xmm2                    ; xmm0=(03 13 23 33)

        movaps    xmm7, XMMWORD [wk(0)]         ; xmm7=(60 70 61 71)
        movaps    xmm2, XMMWORD [wk(1)]         ; xmm2=(62 72 63 73)

        movaps    XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
        movaps    XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
        movaps    XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
        movaps    XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0

        ; transpose, phase 2: combine the pairs of rows 4..7
        movaps    xmm6, xmm5
        unpcklps2 xmm5, xmm7                    ; xmm5=(40 50 60 70)
        unpckhps2 xmm6, xmm7                    ; xmm6=(41 51 61 71)
        movaps    xmm3, xmm4
        unpcklps2 xmm4, xmm2                    ; xmm4=(42 52 62 72)
        unpckhps2 xmm3, xmm2                    ; xmm3=(43 53 63 73)

        movaps    XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
        movaps    XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
        movaps    XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
        movaps    XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3

.nextcolumn:
        ; Step all three pointers to the next group of four columns.
        add       esi, byte 4*SIZEOF_JCOEF              ; coef_block
        add       edx, byte 4*SIZEOF_FLOAT_MULT_TYPE    ; quantptr
        add       edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT      ; wsptr
        dec       ecx                                   ; ctr
        jnz       near .columnloop

        ; -- Prefetch the next coefficient block

        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]

        ; ---- Pass 2: process rows from work array, store into output array.

        mov       eax, [original_ebp]                   ; eax = base for stack arguments
        lea       esi, [workspace]                      ; esi = wsptr (FAST_FLOAT *)
        mov       edi, JSAMPARRAY [output_buf(eax)]     ; edi = (JSAMPROW *)
        mov       eax, JDIMENSION [output_col(eax)]     ; eax = output_col from here on
        mov       ecx, DCTSIZE/4                        ; ecx = ctr
        alignx    16,7
.rowloop:

        ; -- Even part

        movaps    xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
        movaps    xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
        movaps    xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
        movaps    xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]

        movaps    xmm4, xmm0
        movaps    xmm5, xmm1
        subps     xmm0, xmm2                    ; xmm0=tmp11
        subps     xmm1, xmm3
        addps     xmm4, xmm2                    ; xmm4=tmp10
        addps     xmm5, xmm3                    ; xmm5=tmp13

        mulps     xmm1, [GOTOFF(ebx,PD_1_414)]
        subps     xmm1, xmm5                    ; xmm1=tmp12

        movaps    xmm6, xmm4
        movaps    xmm7, xmm0
        subps     xmm4, xmm5                    ; xmm4=tmp3
        subps     xmm0, xmm1                    ; xmm0=tmp2
        addps     xmm6, xmm5                    ; xmm6=tmp0
        addps     xmm7, xmm1                    ; xmm7=tmp1

        ; Spill tmp2/tmp3 while the odd part uses their registers.
        movaps    XMMWORD [wk(1)], xmm4         ; wk(1)=tmp3
        movaps    XMMWORD [wk(0)], xmm0         ; wk(0)=tmp2

        ; -- Odd part

        movaps    xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
        movaps    xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
        movaps    xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
        movaps    xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]

        movaps    xmm4, xmm2
        movaps    xmm0, xmm5
        addps     xmm2, xmm1                    ; xmm2=z11
        addps     xmm5, xmm3                    ; xmm5=z13
        subps     xmm4, xmm1                    ; xmm4=z12
        subps     xmm0, xmm3                    ; xmm0=z10

        movaps    xmm1, xmm2
        subps     xmm2, xmm5
        addps     xmm1, xmm5                    ; xmm1=tmp7

        mulps     xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11

        movaps    xmm3, xmm0
        addps     xmm0, xmm4
        mulps     xmm0, [GOTOFF(ebx,PD_1_847)]  ; xmm0=z5
        mulps     xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
        mulps     xmm4, [GOTOFF(ebx,PD_1_082)]  ; xmm4=(z12 * 1.082392200)
        addps     xmm3, xmm0                    ; xmm3=tmp12
        subps     xmm4, xmm0                    ; xmm4=tmp10

        ; -- Final output stage

        subps     xmm3, xmm1                    ; xmm3=tmp6
        movaps    xmm5, xmm6
        movaps    xmm0, xmm7
        addps     xmm6, xmm1                    ; xmm6=data0=(00 10 20 30)
        addps     xmm7, xmm3                    ; xmm7=data1=(01 11 21 31)
        subps     xmm5, xmm1                    ; xmm5=data7=(07 17 27 37)
        subps     xmm0, xmm3                    ; xmm0=data6=(06 16 26 36)
        subps     xmm2, xmm3                    ; xmm2=tmp5

        movaps    xmm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)]   ; xmm1=[PD_RNDINT_MAGIC]
        pcmpeqd   xmm3, xmm3                    ; all ones ...
        psrld     xmm3, WORD_BIT                ; ... xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}

        ; After the add, the low word of each lane is the rounded result.
        addps     xmm6, xmm1    ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
        addps     xmm7, xmm1    ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
        addps     xmm0, xmm1    ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
        addps     xmm5, xmm1    ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)

        ; Merge two results per dword: one in the low word, one in the high.
        pand      xmm6, xmm3                    ; xmm6=(00 -- 10 -- 20 -- 30 --)
        pslld     xmm7, WORD_BIT                ; xmm7=(-- 01 -- 11 -- 21 -- 31)
        pand      xmm0, xmm3                    ; xmm0=(06 -- 16 -- 26 -- 36 --)
        pslld     xmm5, WORD_BIT                ; xmm5=(-- 07 -- 17 -- 27 -- 37)
        por       xmm6, xmm7                    ; xmm6=(00 01 10 11 20 21 30 31)
        por       xmm0, xmm5                    ; xmm0=(06 07 16 17 26 27 36 37)

        movaps    xmm1, XMMWORD [wk(0)]         ; xmm1=tmp2
        movaps    xmm3, XMMWORD [wk(1)]         ; xmm3=tmp3

        addps     xmm4, xmm2                    ; xmm4=tmp4
        movaps    xmm7, xmm1
        movaps    xmm5, xmm3
        addps     xmm1, xmm2                    ; xmm1=data2=(02 12 22 32)
        addps     xmm3, xmm4                    ; xmm3=data4=(04 14 24 34)
        subps     xmm7, xmm2                    ; xmm7=data5=(05 15 25 35)
        subps     xmm5, xmm4                    ; xmm5=data3=(03 13 23 33)

        ; Same rounding/merge sequence for data2..data5; the constant and
        ; the mask are rebuilt because their registers were reused above.
        movaps    xmm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)]   ; xmm2=[PD_RNDINT_MAGIC]
        pcmpeqd   xmm4, xmm4
        psrld     xmm4, WORD_BIT                ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}

        addps     xmm3, xmm2    ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
        addps     xmm7, xmm2    ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
        addps     xmm1, xmm2    ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
        addps     xmm5, xmm2    ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)

        pand      xmm3, xmm4                    ; xmm3=(04 -- 14 -- 24 -- 34 --)
        pslld     xmm7, WORD_BIT                ; xmm7=(-- 05 -- 15 -- 25 -- 35)
        pand      xmm1, xmm4                    ; xmm1=(02 -- 12 -- 22 -- 32 --)
        pslld     xmm5, WORD_BIT                ; xmm5=(-- 03 -- 13 -- 23 -- 33)
        por       xmm3, xmm7                    ; xmm3=(04 05 14 15 24 25 34 35)
        por       xmm1, xmm5                    ; xmm1=(02 03 12 13 22 23 32 33)

        movdqa    xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)]    ; xmm2=[PB_CENTERJSAMP]

        ; Saturate words to signed bytes, then add the per-byte bias.
        packsswb  xmm6, xmm3    ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
        packsswb  xmm1, xmm0    ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
        paddb     xmm6, xmm2
        paddb     xmm1, xmm2

        ; transpose, phase 2: word interleave
        movdqa    xmm4, xmm6
        punpcklwd xmm6, xmm1    ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
        punpckhwd xmm4, xmm1    ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)

        ; transpose, phase 3: dword interleave, two complete rows per register
        movdqa    xmm7, xmm6
        punpckldq xmm6, xmm4    ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
        punpckhdq xmm7, xmm4    ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)

        ; Swap the 64-bit halves so rows 1 and 3 sit in the low quadword.
        pshufd    xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
        pshufd    xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)

        pushpic   ebx                   ; save GOT address (ebx is reused below)

        ; Store the low 8 bytes of each register to output rows 0, 2, 1, 3.
        mov       edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
        mov       ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
        movq      XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
        movq      XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
        mov       edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
        mov       ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
        movq      XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
        movq      XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3

        poppic    ebx                   ; restore GOT address

        ; Advance to the next four workspace columns / output rows.
        add       esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
        add       edi, byte 4*SIZEOF_JSAMPROW
        dec       ecx                           ; ctr
        jnz       near .rowloop

        pop       edi
        pop       esi
;       pop       edx                   ; need not be preserved
;       pop       ecx                   ; need not be preserved
        pop       ebx
        mov       esp, ebp              ; esp <- aligned frame base
        pop       esp                   ; esp <- value saved at [original_ebp]
        pop       ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align     16