;
; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it can
; *not* be assembled with Microsoft's MASM or any compatible assembler
; (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the inverse DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jidctflt.c; see that file for more details.
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------

%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
        shufps  %1,%2,0x44
%endmacro

%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
        shufps  %1,%2,0xEE
%endmacro

; --------------------------------------------------------------------------
        SECTION SEG_CONST

        alignz  16
        global  EXTN(jconst_idct_float_sse2)

EXTN(jconst_idct_float_sse2):

PD_1_414        times 4 dd 1.414213562373095048801689
PD_1_847        times 4 dd 1.847759065022573512256366
PD_1_082        times 4 dd 1.082392200292393968799446
PD_M2_613       times 4 dd -2.613125929752753055713286
PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3)
PB_CENTERJSAMP  times 16 db CENTERJSAMPLE

        alignz  16
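
; For reference: the PD_* values are the AAN scale factors from jidctflt.c
; (1.414213562 = 2*c4, 1.847759065 = 2*c2, 1.082392200 = 2*(c2-c6),
; -2.613125930 = -2*(c2+c6), where ck = cos(k*pi/16)).  PD_RNDINT_MAGIC is
; 1.5 * 2^26; adding it to a float value in pass 2 leaves roundint(value/8)
; in the low word of the float's bit pattern, which the pass-2 code then
; extracts with integer masks and shifts instead of an explicit
; float-to-int conversion.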

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    64
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_float_sse2 (void *dct_table, JCOEFPTR coef_block,
;                        JSAMPARRAY output_buf, JDIMENSION output_col)
;

; r10 = void *dct_table
; r11 = JCOEFPTR coef_block
; r12 = JSAMPARRAY output_buf
; r13 = JDIMENSION output_col

%define original_rbp    rbp+0
%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM          2
%define workspace       wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
                                        ; FAST_FLOAT workspace[DCTSIZE2]

        align   16
        global  EXTN(jsimd_idct_float_sse2)

EXTN(jsimd_idct_float_sse2):
        push    rbp
        mov     rax,rsp                         ; rax = original rbp
        sub     rsp, byte 4
        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
        mov     [rsp],rax
        mov     rbp,rsp                         ; rbp = aligned rbp
        lea     rsp, [workspace]
        collect_args
        push    rbx

        ; ---- Pass 1: process columns from input, store into work array.
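
        ; For reference, each 1-D pass below computes the following
        ; (C-like pseudocode following jidctflt.c; the tmp*/z* names match
        ; the register comments):
        ;
        ;   /* even part */
        ;   tmp10 = in0 + in4;            tmp11 = in0 - in4;
        ;   tmp13 = in2 + in6;            tmp12 = (in2-in6)*1.414213562 - tmp13;
        ;   tmp0 = tmp10 + tmp13;         tmp3 = tmp10 - tmp13;
        ;   tmp1 = tmp11 + tmp12;         tmp2 = tmp11 - tmp12;
        ;
        ;   /* odd part */
        ;   z11 = in1 + in7;  z12 = in1 - in7;
        ;   z13 = in5 + in3;  z10 = in5 - in3;
        ;   tmp7  = z11 + z13;
        ;   tmp11 = (z11 - z13) * 1.414213562;
        ;   z5    = (z10 + z12) * 1.847759065;
        ;   tmp10 =  1.082392200 * z12 - z5;
        ;   tmp12 = -2.613125930 * z10 + z5;
        ;   tmp6 = tmp12 - tmp7;  tmp5 = tmp11 - tmp6;  tmp4 = tmp10 + tmp5;
        ;
        ;   out0 = tmp0 + tmp7;   out7 = tmp0 - tmp7;
        ;   out1 = tmp1 + tmp6;   out6 = tmp1 - tmp6;
        ;   out2 = tmp2 + tmp5;   out5 = tmp2 - tmp5;
        ;   out4 = tmp3 + tmp4;   out3 = tmp3 - tmp4;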

        mov     rdx, r10                ; quantptr
        mov     rsi, r11                ; inptr
        lea     rdi, [workspace]        ; FAST_FLOAT *wsptr
        mov     rcx, DCTSIZE/4          ; ctr
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
        mov     eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
        or      eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
        jnz     near .columnDCT

        movq    xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
        movq    xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
        movq    xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
        movq    xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
        movq    xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
        movq    xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
        movq    xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
        por     xmm1,xmm2
        por     xmm3,xmm4
        por     xmm5,xmm6
        por     xmm1,xmm3
        por     xmm5,xmm7
        por     xmm1,xmm5
        packsswb xmm1,xmm1
        movd    eax,xmm1
        test    rax,rax
        jnz     short .columnDCT

        ; -- AC terms all zero
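        ; (If the AC terms of these four columns are all zero, the 1-D IDCT
        ; of each column is a constant -- its dequantized DC value -- so that
        ; value is simply broadcast to the column's entries in the work
        ; array.)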

        movq    xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]

        punpcklwd xmm0,xmm0                     ; xmm0=(00 00 01 01 02 02 03 03)
        psrad   xmm0,(DWORD_BIT-WORD_BIT)       ; xmm0=in0=(00 01 02 03)
        cvtdq2ps xmm0,xmm0                      ; xmm0=in0=(00 01 02 03)

        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

        movaps  xmm1,xmm0
        movaps  xmm2,xmm0
        movaps  xmm3,xmm0

        shufps  xmm0,xmm0,0x00                  ; xmm0=(00 00 00 00)
        shufps  xmm1,xmm1,0x55                  ; xmm1=(01 01 01 01)
        shufps  xmm2,xmm2,0xAA                  ; xmm2=(02 02 02 02)
        shufps  xmm3,xmm3,0xFF                  ; xmm3=(03 03 03 03)

        movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
        movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
        movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
        movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
        movaps  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
        movaps  XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
        movaps  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
        movaps  XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
        jmp     near .nextcolumn
%endif
.columnDCT:

        ; -- Even part

        movq    xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
        movq    xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
        movq    xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
        movq    xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]

        punpcklwd xmm0,xmm0                     ; xmm0=(00 00 01 01 02 02 03 03)
        punpcklwd xmm1,xmm1                     ; xmm1=(20 20 21 21 22 22 23 23)
        psrad   xmm0,(DWORD_BIT-WORD_BIT)       ; xmm0=in0=(00 01 02 03)
        psrad   xmm1,(DWORD_BIT-WORD_BIT)       ; xmm1=in2=(20 21 22 23)
        cvtdq2ps xmm0,xmm0                      ; xmm0=in0=(00 01 02 03)
        cvtdq2ps xmm1,xmm1                      ; xmm1=in2=(20 21 22 23)

        punpcklwd xmm2,xmm2                     ; xmm2=(40 40 41 41 42 42 43 43)
        punpcklwd xmm3,xmm3                     ; xmm3=(60 60 61 61 62 62 63 63)
        psrad   xmm2,(DWORD_BIT-WORD_BIT)       ; xmm2=in4=(40 41 42 43)
        psrad   xmm3,(DWORD_BIT-WORD_BIT)       ; xmm3=in6=(60 61 62 63)
        cvtdq2ps xmm2,xmm2                      ; xmm2=in4=(40 41 42 43)
        cvtdq2ps xmm3,xmm3                      ; xmm3=in6=(60 61 62 63)

        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps   xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps   xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps   xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

        movaps  xmm4,xmm0
        movaps  xmm5,xmm1
        subps   xmm0,xmm2               ; xmm0=tmp11
        subps   xmm1,xmm3
        addps   xmm4,xmm2               ; xmm4=tmp10
        addps   xmm5,xmm3               ; xmm5=tmp13

        mulps   xmm1,[rel PD_1_414]
        subps   xmm1,xmm5               ; xmm1=tmp12

        movaps  xmm6,xmm4
        movaps  xmm7,xmm0
        subps   xmm4,xmm5               ; xmm4=tmp3
        subps   xmm0,xmm1               ; xmm0=tmp2
        addps   xmm6,xmm5               ; xmm6=tmp0
        addps   xmm7,xmm1               ; xmm7=tmp1

        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
        movaps  XMMWORD [wk(0)], xmm0   ; tmp2

        ; -- Odd part

        movq    xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
        movq    xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
        movq    xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
        movq    xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]

        punpcklwd xmm2,xmm2                     ; xmm2=(10 10 11 11 12 12 13 13)
        punpcklwd xmm3,xmm3                     ; xmm3=(30 30 31 31 32 32 33 33)
        psrad   xmm2,(DWORD_BIT-WORD_BIT)       ; xmm2=in1=(10 11 12 13)
        psrad   xmm3,(DWORD_BIT-WORD_BIT)       ; xmm3=in3=(30 31 32 33)
        cvtdq2ps xmm2,xmm2                      ; xmm2=in1=(10 11 12 13)
        cvtdq2ps xmm3,xmm3                      ; xmm3=in3=(30 31 32 33)

        punpcklwd xmm5,xmm5                     ; xmm5=(50 50 51 51 52 52 53 53)
        punpcklwd xmm1,xmm1                     ; xmm1=(70 70 71 71 72 72 73 73)
        psrad   xmm5,(DWORD_BIT-WORD_BIT)       ; xmm5=in5=(50 51 52 53)
        psrad   xmm1,(DWORD_BIT-WORD_BIT)       ; xmm1=in7=(70 71 72 73)
        cvtdq2ps xmm5,xmm5                      ; xmm5=in5=(50 51 52 53)
        cvtdq2ps xmm1,xmm1                      ; xmm1=in7=(70 71 72 73)

        mulps   xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps   xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps   xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps   xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

        movaps  xmm4,xmm2
        movaps  xmm0,xmm5
        addps   xmm2,xmm1               ; xmm2=z11
        addps   xmm5,xmm3               ; xmm5=z13
        subps   xmm4,xmm1               ; xmm4=z12
        subps   xmm0,xmm3               ; xmm0=z10

        movaps  xmm1,xmm2
        subps   xmm2,xmm5
        addps   xmm1,xmm5               ; xmm1=tmp7

        mulps   xmm2,[rel PD_1_414]     ; xmm2=tmp11

        movaps  xmm3,xmm0
        addps   xmm0,xmm4
        mulps   xmm0,[rel PD_1_847]     ; xmm0=z5
        mulps   xmm3,[rel PD_M2_613]    ; xmm3=(z10 * -2.613125930)
        mulps   xmm4,[rel PD_1_082]     ; xmm4=(z12 * 1.082392200)
        addps   xmm3,xmm0               ; xmm3=tmp12
        subps   xmm4,xmm0               ; xmm4=tmp10

        ; -- Final output stage

        subps   xmm3,xmm1               ; xmm3=tmp6
        movaps  xmm5,xmm6
        movaps  xmm0,xmm7
        addps   xmm6,xmm1               ; xmm6=data0=(00 01 02 03)
        addps   xmm7,xmm3               ; xmm7=data1=(10 11 12 13)
        subps   xmm5,xmm1               ; xmm5=data7=(70 71 72 73)
        subps   xmm0,xmm3               ; xmm0=data6=(60 61 62 63)
        subps   xmm2,xmm3               ; xmm2=tmp5
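
        ; (The unpcklps/unpckhps "phase 1" steps and unpcklps2/unpckhps2
        ; "phase 2" steps below transpose each 4x4 float tile, so the column
        ; results are stored transposed in the work array and pass 2 can run
        ; the same 1-D code on what are then contiguous rows.)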

        movaps  xmm1,xmm6               ; transpose coefficients(phase 1)
        unpcklps xmm6,xmm7              ; xmm6=(00 10 01 11)
        unpckhps xmm1,xmm7              ; xmm1=(02 12 03 13)
        movaps  xmm3,xmm0               ; transpose coefficients(phase 1)
        unpcklps xmm0,xmm5              ; xmm0=(60 70 61 71)
        unpckhps xmm3,xmm5              ; xmm3=(62 72 63 73)

        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
        movaps  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3

        movaps  XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
        movaps  XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)

        addps   xmm4,xmm2               ; xmm4=tmp4
        movaps  xmm0,xmm7
        movaps  xmm3,xmm5
        addps   xmm7,xmm2               ; xmm7=data2=(20 21 22 23)
        addps   xmm5,xmm4               ; xmm5=data4=(40 41 42 43)
        subps   xmm0,xmm2               ; xmm0=data5=(50 51 52 53)
        subps   xmm3,xmm4               ; xmm3=data3=(30 31 32 33)

        movaps  xmm2,xmm7               ; transpose coefficients(phase 1)
        unpcklps xmm7,xmm3              ; xmm7=(20 30 21 31)
        unpckhps xmm2,xmm3              ; xmm2=(22 32 23 33)
        movaps  xmm4,xmm5               ; transpose coefficients(phase 1)
        unpcklps xmm5,xmm0              ; xmm5=(40 50 41 51)
        unpckhps xmm4,xmm0              ; xmm4=(42 52 43 53)

        movaps  xmm3,xmm6               ; transpose coefficients(phase 2)
        unpcklps2 xmm6,xmm7             ; xmm6=(00 10 20 30)
        unpckhps2 xmm3,xmm7             ; xmm3=(01 11 21 31)
        movaps  xmm0,xmm1               ; transpose coefficients(phase 2)
        unpcklps2 xmm1,xmm2             ; xmm1=(02 12 22 32)
        unpckhps2 xmm0,xmm2             ; xmm0=(03 13 23 33)

        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
        movaps  xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)

        movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
        movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
        movaps  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
        movaps  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0

        movaps  xmm6,xmm5               ; transpose coefficients(phase 2)
        unpcklps2 xmm5,xmm7             ; xmm5=(40 50 60 70)
        unpckhps2 xmm6,xmm7             ; xmm6=(41 51 61 71)
        movaps  xmm3,xmm4               ; transpose coefficients(phase 2)
        unpcklps2 xmm4,xmm2             ; xmm4=(42 52 62 72)
        unpckhps2 xmm3,xmm2             ; xmm3=(43 53 63 73)

        movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
        movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
        movaps  XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
        movaps  XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3

.nextcolumn:
        add     rsi, byte 4*SIZEOF_JCOEF        ; coef_block
        add     rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
        add     rdi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
        dec     rcx                             ; ctr
        jnz     near .columnloop

        ; -- Prefetch the next coefficient block

        prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
        prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
        prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
        prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]

        ; ---- Pass 2: process rows from work array, store into output array.
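
        ; (Pass 2 turns the float results into samples via PD_RNDINT_MAGIC:
        ; after the addps, the low word of each float's bit pattern holds
        ; roundint(value/8); the pand/pslld/por steps extract and interleave
        ; those words, packsswb saturates them to signed bytes -- which also
        ; serves as the range limit -- and paddb with PB_CENTERJSAMP shifts
        ; them into the unsigned sample range.)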

        mov     rax, [original_rbp]
        lea     rsi, [workspace]        ; FAST_FLOAT *wsptr
        mov     rdi, r12                ; (JSAMPROW *)
        mov     eax, r13d
        mov     rcx, DCTSIZE/4          ; ctr
.rowloop:

        ; -- Even part

        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
        movaps  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
        movaps  xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
        movaps  xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]

        movaps  xmm4,xmm0
        movaps  xmm5,xmm1
        subps   xmm0,xmm2               ; xmm0=tmp11
        subps   xmm1,xmm3
        addps   xmm4,xmm2               ; xmm4=tmp10
        addps   xmm5,xmm3               ; xmm5=tmp13

        mulps   xmm1,[rel PD_1_414]
        subps   xmm1,xmm5               ; xmm1=tmp12

        movaps  xmm6,xmm4
        movaps  xmm7,xmm0
        subps   xmm4,xmm5               ; xmm4=tmp3
        subps   xmm0,xmm1               ; xmm0=tmp2
        addps   xmm6,xmm5               ; xmm6=tmp0
        addps   xmm7,xmm1               ; xmm7=tmp1

        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
        movaps  XMMWORD [wk(0)], xmm0   ; tmp2

        ; -- Odd part

        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
        movaps  xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
        movaps  xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
        movaps  xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]

        movaps  xmm4,xmm2
        movaps  xmm0,xmm5
        addps   xmm2,xmm1               ; xmm2=z11
        addps   xmm5,xmm3               ; xmm5=z13
        subps   xmm4,xmm1               ; xmm4=z12
        subps   xmm0,xmm3               ; xmm0=z10

        movaps  xmm1,xmm2
        subps   xmm2,xmm5
        addps   xmm1,xmm5               ; xmm1=tmp7

        mulps   xmm2,[rel PD_1_414]     ; xmm2=tmp11

        movaps  xmm3,xmm0
        addps   xmm0,xmm4
        mulps   xmm0,[rel PD_1_847]     ; xmm0=z5
        mulps   xmm3,[rel PD_M2_613]    ; xmm3=(z10 * -2.613125930)
        mulps   xmm4,[rel PD_1_082]     ; xmm4=(z12 * 1.082392200)
        addps   xmm3,xmm0               ; xmm3=tmp12
        subps   xmm4,xmm0               ; xmm4=tmp10

        ; -- Final output stage

        subps   xmm3,xmm1               ; xmm3=tmp6
        movaps  xmm5,xmm6
        movaps  xmm0,xmm7
        addps   xmm6,xmm1               ; xmm6=data0=(00 10 20 30)
        addps   xmm7,xmm3               ; xmm7=data1=(01 11 21 31)
        subps   xmm5,xmm1               ; xmm5=data7=(07 17 27 37)
        subps   xmm0,xmm3               ; xmm0=data6=(06 16 26 36)
        subps   xmm2,xmm3               ; xmm2=tmp5

        movaps  xmm1,[rel PD_RNDINT_MAGIC]      ; xmm1=[rel PD_RNDINT_MAGIC]
        pcmpeqd xmm3,xmm3
        psrld   xmm3,WORD_BIT           ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}

        addps   xmm6,xmm1               ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
        addps   xmm7,xmm1               ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
        addps   xmm0,xmm1               ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
        addps   xmm5,xmm1               ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)

        pand    xmm6,xmm3               ; xmm6=(00 -- 10 -- 20 -- 30 --)
        pslld   xmm7,WORD_BIT           ; xmm7=(-- 01 -- 11 -- 21 -- 31)
        pand    xmm0,xmm3               ; xmm0=(06 -- 16 -- 26 -- 36 --)
        pslld   xmm5,WORD_BIT           ; xmm5=(-- 07 -- 17 -- 27 -- 37)
        por     xmm6,xmm7               ; xmm6=(00 01 10 11 20 21 30 31)
        por     xmm0,xmm5               ; xmm0=(06 07 16 17 26 27 36 37)

        movaps  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
        movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3

        addps   xmm4,xmm2               ; xmm4=tmp4
        movaps  xmm7,xmm1
        movaps  xmm5,xmm3
        addps   xmm1,xmm2               ; xmm1=data2=(02 12 22 32)
        addps   xmm3,xmm4               ; xmm3=data4=(04 14 24 34)
        subps   xmm7,xmm2               ; xmm7=data5=(05 15 25 35)
        subps   xmm5,xmm4               ; xmm5=data3=(03 13 23 33)

        movaps  xmm2,[rel PD_RNDINT_MAGIC]      ; xmm2=[rel PD_RNDINT_MAGIC]
        pcmpeqd xmm4,xmm4
        psrld   xmm4,WORD_BIT           ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}

        addps   xmm3,xmm2               ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
        addps   xmm7,xmm2               ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
        addps   xmm1,xmm2               ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
        addps   xmm5,xmm2               ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)

        pand    xmm3,xmm4               ; xmm3=(04 -- 14 -- 24 -- 34 --)
        pslld   xmm7,WORD_BIT           ; xmm7=(-- 05 -- 15 -- 25 -- 35)
        pand    xmm1,xmm4               ; xmm1=(02 -- 12 -- 22 -- 32 --)
        pslld   xmm5,WORD_BIT           ; xmm5=(-- 03 -- 13 -- 23 -- 33)
        por     xmm3,xmm7               ; xmm3=(04 05 14 15 24 25 34 35)
        por     xmm1,xmm5               ; xmm1=(02 03 12 13 22 23 32 33)

        movdqa  xmm2,[rel PB_CENTERJSAMP]       ; xmm2=[rel PB_CENTERJSAMP]

        packsswb xmm6,xmm3      ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
        packsswb xmm1,xmm0      ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
        paddb   xmm6,xmm2
        paddb   xmm1,xmm2

        movdqa  xmm4,xmm6       ; transpose coefficients(phase 2)
        punpcklwd xmm6,xmm1     ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
        punpckhwd xmm4,xmm1     ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)

        movdqa  xmm7,xmm6       ; transpose coefficients(phase 3)
        punpckldq xmm6,xmm4     ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
        punpckhdq xmm7,xmm4     ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)

        pshufd  xmm5,xmm6,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
        pshufd  xmm3,xmm7,0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)

        mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
        mov     rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
        movq    XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
        mov     rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
        mov     rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
        movq    XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3

        add     rsi, byte 4*SIZEOF_FAST_FLOAT   ; wsptr
        add     rdi, byte 4*SIZEOF_JSAMPROW
        dec     rcx                             ; ctr
        jnz     near .rowloop

        pop     rbx
        uncollect_args
        mov     rsp,rbp                 ; rsp <- aligned rbp
        pop     rsp                     ; rsp <- original rbp
        pop     rbp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16