;
; jidctflt.asm - floating-point IDCT (SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the inverse DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jidctflt.c; see jidctflt.c for more details.
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------

; unpcklps2 dst, src
;   Combine the low 64-bit halves of two packed-float registers:
;   dst = (dst[0] dst[1] src[0] src[1]).
;   The shufps selector 0x44 picks elements 0,1 from %1 (low half of the
;   result) and elements 0,1 from %2 (high half of the result).
;   Only %1 is modified.
%macro  unpcklps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
        shufps  %1,%2,0x44
%endmacro

; unpckhps2 dst, src
;   Combine the high 64-bit halves of two packed-float registers:
;   dst = (dst[2] dst[3] src[2] src[3]).
;   The shufps selector 0xEE picks elements 2,3 from %1 (low half of the
;   result) and elements 2,3 from %2 (high half of the result).
;   Only %1 is modified.
%macro  unpckhps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
        shufps  %1,%2,0xEE
%endmacro

; --------------------------------------------------------------------------
        SECTION SEG_CONST

; Constant table used by jsimd_idct_float_sse2.  Every entry is one full
; 16-byte vector (4 identical floats, or 16 identical bytes) so that it can
; be used directly as a packed memory operand; the table start is aligned
; to 16 bytes and each entry is exactly 16 bytes long, which keeps every
; entry 16-byte aligned.

        alignz  16
        global  EXTN(jconst_idct_float_sse2)

EXTN(jconst_idct_float_sse2):

; Multipliers of the IDCT butterflies (1.414... is approximately sqrt(2)).
PD_1_414        times 4 dd  1.414213562373095048801689
PD_1_847        times 4 dd  1.847759065022573512256366
PD_1_082        times 4 dd  1.082392200292393968799446
PD_M2_613       times 4 dd -2.613125929752753055713286
; Added to each result in pass 2 before the low 16 bits of the float's bit
; pattern are extracted as the rounded, descaled integer sample.
PD_RNDINT_MAGIC times 4 dd  100663296.0 ; (float)(0x00C00000 << 3)
; Added (byte-wise) to the packed signed results to re-centre the samples.
PB_CENTERJSAMP  times 16 db CENTERJSAMPLE

        alignz  16

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_float_sse2 (void *dct_table, JCOEFPTR coef_block,
;                        JSAMPARRAY output_buf, JDIMENSION output_col)
;
; Arguments (32-bit, all passed on the stack; offsets are relative to the
; address of the saved ebp, see the %defines below):
;   dct_table   +8   table of float multipliers applied to the coefficients
;   coef_block  +12  input coefficients, read as 16-bit signed words
;   output_buf  +16  array of row pointers for the output samples
;   output_col  +20  column offset added to every row pointer
;
; Stack frame (ebp is re-aligned to 16 bytes so that movaps can be used):
;   [ebp+0]            address of the saved ebp on the caller-aligned stack
;   wk(0), wk(1)       two 16-byte spill slots just below ebp
;   workspace          DCTSIZE2 FAST_FLOATs below wk(0); holds the pass-1
;                      result, stored transposed, for consumption by pass 2
;
; Structure:
;   Pass 1 runs DCTSIZE/4 times; each iteration dequantizes and transforms
;          4 columns of the input and stores them transposed in workspace.
;   Pass 2 runs DCTSIZE/4 times; each iteration transforms 4 workspace
;          vectors, rounds/descales, packs to bytes, re-centres the samples
;          and writes 4 output rows of 8 samples each.
;
; Registers: ebx, esi, edi and ebp are saved and restored.
; Clobbers:  eax, ecx, edx, xmm0-xmm7, flags.
;

%define dct_table(b)    (b)+8           ; void *dct_table
%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
%define output_col(b)   (b)+20          ; JDIMENSION output_col

%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM          2
%define workspace       wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
                                        ; FAST_FLOAT workspace[DCTSIZE2]

        align   16
        global  EXTN(jsimd_idct_float_sse2)

EXTN(jsimd_idct_float_sse2):
        push    ebp
        mov     eax,esp                         ; eax = address of saved ebp
        sub     esp, byte 4
        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
        mov     [esp],eax                       ; keep it at [original_ebp]
        mov     ebp,esp                         ; ebp = aligned ebp
        lea     esp, [workspace]                ; reserve wk[] and workspace
        push    ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        get_GOT ebx             ; get GOT address

        ; ---- Pass 1: process columns from input, store into work array.
        ;
        ; eax still holds the address of the saved ebp from the prologue,
        ; so the reload below is not needed.
        ; edx = quantptr, esi = inptr, edi = wsptr, ecx = loop counter.

;       mov     eax, [original_ebp]
        mov     edx, POINTER [dct_table(eax)]           ; quantptr
        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
        lea     edi, [workspace]                        ; FAST_FLOAT *wsptr
        mov     ecx, DCTSIZE/4                          ; ctr
        alignx  16,7
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
        ; Quick reject: if the first dword of row 1 or row 2 is non-zero
        ; there is no point in doing the full all-zero test.
        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
        jnz     near .columnDCT

        ; OR together rows 1..7 of the 4 columns being processed.
        movq    xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
        movq    xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
        movq    xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
        movq    xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
        movq    xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
        movq    xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
        movq    xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
        por     xmm1,xmm2
        por     xmm3,xmm4
        por     xmm5,xmm6
        por     xmm1,xmm3
        por     xmm5,xmm7
        por     xmm1,xmm5
        packsswb xmm1,xmm1      ; 4 words -> 4 bytes (saturation keeps
                                ; non-zero values non-zero)
        movd    eax,xmm1
        test    eax,eax
        jnz     short .columnDCT

        ; -- AC terms all zero
        ;
        ; Only row 0 contributes: each column's output is its dequantized
        ; row-0 value repeated 8 times.

        movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]

        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
        psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
        cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)

        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

        movaps  xmm1,xmm0
        movaps  xmm2,xmm0
        movaps  xmm3,xmm0

        shufps  xmm0,xmm0,0x00                  ; xmm0=(00 00 00 00)
        shufps  xmm1,xmm1,0x55                  ; xmm1=(01 01 01 01)
        shufps  xmm2,xmm2,0xAA                  ; xmm2=(02 02 02 02)
        shufps  xmm3,xmm3,0xFF                  ; xmm3=(03 03 03 03)

        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
        movaps  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
        movaps  XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
        movaps  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
        movaps  XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
        jmp     near .nextcolumn
        alignx  16,7
%endif
.columnDCT:

        ; -- Even part
        ;
        ; Each input row is loaded as 4 words, sign-extended to dwords
        ; (punpcklwd + arithmetic shift), converted to float and multiplied
        ; by the matching row of the multiplier table.

        movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
        movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
        movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
        movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]

        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
        punpcklwd xmm1,xmm1             ; xmm1=(20 20 21 21 22 22 23 23)
        psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
        psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in2=(20 21 22 23)
        cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
        cvtdq2ps  xmm1,xmm1                     ; xmm1=in2=(20 21 22 23)

        punpcklwd xmm2,xmm2             ; xmm2=(40 40 41 41 42 42 43 43)
        punpcklwd xmm3,xmm3             ; xmm3=(60 60 61 61 62 62 63 63)
        psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in4=(40 41 42 43)
        psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in6=(60 61 62 63)
        cvtdq2ps  xmm2,xmm2                     ; xmm2=in4=(40 41 42 43)
        cvtdq2ps  xmm3,xmm3                     ; xmm3=in6=(60 61 62 63)

        mulps     xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps     xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps     xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps     xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

        movaps  xmm4,xmm0
        movaps  xmm5,xmm1
        subps   xmm0,xmm2               ; xmm0=tmp11
        subps   xmm1,xmm3
        addps   xmm4,xmm2               ; xmm4=tmp10
        addps   xmm5,xmm3               ; xmm5=tmp13

        mulps   xmm1,[GOTOFF(ebx,PD_1_414)]
        subps   xmm1,xmm5               ; xmm1=tmp12

        movaps  xmm6,xmm4
        movaps  xmm7,xmm0
        subps   xmm4,xmm5               ; xmm4=tmp3
        subps   xmm0,xmm1               ; xmm0=tmp2
        addps   xmm6,xmm5               ; xmm6=tmp0
        addps   xmm7,xmm1               ; xmm7=tmp1

        ; tmp2/tmp3 are spilled; tmp0/tmp1 stay live in xmm6/xmm7.
        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
        movaps  XMMWORD [wk(0)], xmm0   ; tmp2

        ; -- Odd part

        movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
        movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
        movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
        movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]

        punpcklwd xmm2,xmm2             ; xmm2=(10 10 11 11 12 12 13 13)
        punpcklwd xmm3,xmm3             ; xmm3=(30 30 31 31 32 32 33 33)
        psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in1=(10 11 12 13)
        psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in3=(30 31 32 33)
        cvtdq2ps  xmm2,xmm2                     ; xmm2=in1=(10 11 12 13)
        cvtdq2ps  xmm3,xmm3                     ; xmm3=in3=(30 31 32 33)

        punpcklwd xmm5,xmm5             ; xmm5=(50 50 51 51 52 52 53 53)
        punpcklwd xmm1,xmm1             ; xmm1=(70 70 71 71 72 72 73 73)
        psrad     xmm5,(DWORD_BIT-WORD_BIT)     ; xmm5=in5=(50 51 52 53)
        psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in7=(70 71 72 73)
        cvtdq2ps  xmm5,xmm5                     ; xmm5=in5=(50 51 52 53)
        cvtdq2ps  xmm1,xmm1                     ; xmm1=in7=(70 71 72 73)

        mulps     xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps     xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps     xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        mulps     xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

        movaps  xmm4,xmm2
        movaps  xmm0,xmm5
        addps   xmm2,xmm1               ; xmm2=z11
        addps   xmm5,xmm3               ; xmm5=z13
        subps   xmm4,xmm1               ; xmm4=z12
        subps   xmm0,xmm3               ; xmm0=z10

        movaps  xmm1,xmm2
        subps   xmm2,xmm5
        addps   xmm1,xmm5               ; xmm1=tmp7

        mulps   xmm2,[GOTOFF(ebx,PD_1_414)]     ; xmm2=tmp11

        movaps  xmm3,xmm0
        addps   xmm0,xmm4
        mulps   xmm0,[GOTOFF(ebx,PD_1_847)]     ; xmm0=z5
        mulps   xmm3,[GOTOFF(ebx,PD_M2_613)]    ; xmm3=(z10 * -2.613125930)
        mulps   xmm4,[GOTOFF(ebx,PD_1_082)]     ; xmm4=(z12 * 1.082392200)
        addps   xmm3,xmm0               ; xmm3=tmp12
        subps   xmm4,xmm0               ; xmm4=tmp10

        ; -- Final output stage
        ;
        ; Combine even and odd parts, then transpose the 8x4 result in two
        ; phases (unpcklps/unpckhps, then the 64-bit-half macros) so that
        ; it is stored transposed in the workspace.

        subps   xmm3,xmm1               ; xmm3=tmp6
        movaps  xmm5,xmm6
        movaps  xmm0,xmm7
        addps   xmm6,xmm1               ; xmm6=data0=(00 01 02 03)
        addps   xmm7,xmm3               ; xmm7=data1=(10 11 12 13)
        subps   xmm5,xmm1               ; xmm5=data7=(70 71 72 73)
        subps   xmm0,xmm3               ; xmm0=data6=(60 61 62 63)
        subps   xmm2,xmm3               ; xmm2=tmp5

        movaps    xmm1,xmm6             ; transpose coefficients(phase 1)
        unpcklps  xmm6,xmm7             ; xmm6=(00 10 01 11)
        unpckhps  xmm1,xmm7             ; xmm1=(02 12 03 13)
        movaps    xmm3,xmm0             ; transpose coefficients(phase 1)
        unpcklps  xmm0,xmm5             ; xmm0=(60 70 61 71)
        unpckhps  xmm3,xmm5             ; xmm3=(62 72 63 73)

        ; Swap: reload the spilled tmp2/tmp3 and reuse the same two slots
        ; for the half-transposed data6/data7.
        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
        movaps  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3

        movaps  XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
        movaps  XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)

        addps   xmm4,xmm2               ; xmm4=tmp4
        movaps  xmm0,xmm7
        movaps  xmm3,xmm5
        addps   xmm7,xmm2               ; xmm7=data2=(20 21 22 23)
        addps   xmm5,xmm4               ; xmm5=data4=(40 41 42 43)
        subps   xmm0,xmm2               ; xmm0=data5=(50 51 52 53)
        subps   xmm3,xmm4               ; xmm3=data3=(30 31 32 33)

        movaps    xmm2,xmm7             ; transpose coefficients(phase 1)
        unpcklps  xmm7,xmm3             ; xmm7=(20 30 21 31)
        unpckhps  xmm2,xmm3             ; xmm2=(22 32 23 33)
        movaps    xmm4,xmm5             ; transpose coefficients(phase 1)
        unpcklps  xmm5,xmm0             ; xmm5=(40 50 41 51)
        unpckhps  xmm4,xmm0             ; xmm4=(42 52 43 53)

        movaps    xmm3,xmm6             ; transpose coefficients(phase 2)
        unpcklps2 xmm6,xmm7             ; xmm6=(00 10 20 30)
        unpckhps2 xmm3,xmm7             ; xmm3=(01 11 21 31)
        movaps    xmm0,xmm1             ; transpose coefficients(phase 2)
        unpcklps2 xmm1,xmm2             ; xmm1=(02 12 22 32)
        unpckhps2 xmm0,xmm2             ; xmm0=(03 13 23 33)

        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
        movaps  xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)

        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
        movaps  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
        movaps  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0

        movaps    xmm6,xmm5             ; transpose coefficients(phase 2)
        unpcklps2 xmm5,xmm7             ; xmm5=(40 50 60 70)
        unpckhps2 xmm6,xmm7             ; xmm6=(41 51 61 71)
        movaps    xmm3,xmm4             ; transpose coefficients(phase 2)
        unpcklps2 xmm4,xmm2             ; xmm4=(42 52 62 72)
        unpckhps2 xmm3,xmm2             ; xmm3=(43 53 63 73)

        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
        movaps  XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
        movaps  XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3

.nextcolumn:
        ; Advance by 4 input columns / 4 workspace rows.
        add     esi, byte 4*SIZEOF_JCOEF                ; coef_block
        add     edx, byte 4*SIZEOF_FLOAT_MULT_TYPE      ; quantptr
        add     edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT   ; wsptr
        dec     ecx                                     ; ctr
        jnz     near .columnloop

        ; -- Prefetch the next coefficient block
        ;
        ; esi has advanced by 8 coefficients during pass 1, so
        ; esi + (DCTSIZE2-8)*SIZEOF_JCOEF is the first byte after the
        ; current block; 4 prefetches at 32-byte steps are issued from there.

        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]

        ; ---- Pass 2: process rows from work array, store into output array.
        ;
        ; esi = wsptr, edi = pointer into the output row-pointer array,
        ; eax = output_col, ecx = loop counter.

        mov     eax, [original_ebp]
        lea     esi, [workspace]                        ; FAST_FLOAT *wsptr
        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
        mov     eax, JDIMENSION [output_col(eax)]
        mov     ecx, DCTSIZE/4                          ; ctr
        alignx  16,7
.rowloop:

        ; -- Even part

        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]

        movaps  xmm4,xmm0
        movaps  xmm5,xmm1
        subps   xmm0,xmm2               ; xmm0=tmp11
        subps   xmm1,xmm3
        addps   xmm4,xmm2               ; xmm4=tmp10
        addps   xmm5,xmm3               ; xmm5=tmp13

        mulps   xmm1,[GOTOFF(ebx,PD_1_414)]
        subps   xmm1,xmm5               ; xmm1=tmp12

        movaps  xmm6,xmm4
        movaps  xmm7,xmm0
        subps   xmm4,xmm5               ; xmm4=tmp3
        subps   xmm0,xmm1               ; xmm0=tmp2
        addps   xmm6,xmm5               ; xmm6=tmp0
        addps   xmm7,xmm1               ; xmm7=tmp1

        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
        movaps  XMMWORD [wk(0)], xmm0   ; tmp2

        ; -- Odd part

        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]

        movaps  xmm4,xmm2
        movaps  xmm0,xmm5
        addps   xmm2,xmm1               ; xmm2=z11
        addps   xmm5,xmm3               ; xmm5=z13
        subps   xmm4,xmm1               ; xmm4=z12
        subps   xmm0,xmm3               ; xmm0=z10

        movaps  xmm1,xmm2
        subps   xmm2,xmm5
        addps   xmm1,xmm5               ; xmm1=tmp7

        mulps   xmm2,[GOTOFF(ebx,PD_1_414)]     ; xmm2=tmp11

        movaps  xmm3,xmm0
        addps   xmm0,xmm4
        mulps   xmm0,[GOTOFF(ebx,PD_1_847)]     ; xmm0=z5
        mulps   xmm3,[GOTOFF(ebx,PD_M2_613)]    ; xmm3=(z10 * -2.613125930)
        mulps   xmm4,[GOTOFF(ebx,PD_1_082)]     ; xmm4=(z12 * 1.082392200)
        addps   xmm3,xmm0               ; xmm3=tmp12
        subps   xmm4,xmm0               ; xmm4=tmp10

        ; -- Final output stage

        subps   xmm3,xmm1               ; xmm3=tmp6
        movaps  xmm5,xmm6
        movaps  xmm0,xmm7
        addps   xmm6,xmm1               ; xmm6=data0=(00 10 20 30)
        addps   xmm7,xmm3               ; xmm7=data1=(01 11 21 31)
        subps   xmm5,xmm1               ; xmm5=data7=(07 17 27 37)
        subps   xmm0,xmm3               ; xmm0=data6=(06 16 26 36)
        subps   xmm2,xmm3               ; xmm2=tmp5

        ; Float -> integer conversion without a cvt instruction: after
        ; PD_RNDINT_MAGIC is added, the low word of each float's bit pattern
        ; is taken as the rounded, descaled (/8) result.  Even-numbered
        ; outputs keep that word in place (pand with 0x0000FFFF), odd-
        ; numbered outputs are shifted into the high word, and the two are
        ; merged with por.

        movaps  xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)]      ; xmm1=[PD_RNDINT_MAGIC]
        pcmpeqd xmm3,xmm3
        psrld   xmm3,WORD_BIT           ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}

        addps   xmm6,xmm1       ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
        addps   xmm7,xmm1       ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
        addps   xmm0,xmm1       ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
        addps   xmm5,xmm1       ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)

        pand    xmm6,xmm3               ; xmm6=(00 -- 10 -- 20 -- 30 --)
        pslld   xmm7,WORD_BIT           ; xmm7=(-- 01 -- 11 -- 21 -- 31)
        pand    xmm0,xmm3               ; xmm0=(06 -- 16 -- 26 -- 36 --)
        pslld   xmm5,WORD_BIT           ; xmm5=(-- 07 -- 17 -- 27 -- 37)
        por     xmm6,xmm7               ; xmm6=(00 01 10 11 20 21 30 31)
        por     xmm0,xmm5               ; xmm0=(06 07 16 17 26 27 36 37)

        movaps  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
        movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3

        addps   xmm4,xmm2               ; xmm4=tmp4
        movaps  xmm7,xmm1
        movaps  xmm5,xmm3
        addps   xmm1,xmm2               ; xmm1=data2=(02 12 22 32)
        addps   xmm3,xmm4               ; xmm3=data4=(04 14 24 34)
        subps   xmm7,xmm2               ; xmm7=data5=(05 15 25 35)
        subps   xmm5,xmm4               ; xmm5=data3=(03 13 23 33)

        ; The magic constant and the word mask are rebuilt here because
        ; their previous registers (xmm1/xmm3) have been reused.
        movaps  xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)]      ; xmm2=[PD_RNDINT_MAGIC]
        pcmpeqd xmm4,xmm4
        psrld   xmm4,WORD_BIT           ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}

        addps   xmm3,xmm2       ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
        addps   xmm7,xmm2       ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
        addps   xmm1,xmm2       ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
        addps   xmm5,xmm2       ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)

        pand    xmm3,xmm4               ; xmm3=(04 -- 14 -- 24 -- 34 --)
        pslld   xmm7,WORD_BIT           ; xmm7=(-- 05 -- 15 -- 25 -- 35)
        pand    xmm1,xmm4               ; xmm1=(02 -- 12 -- 22 -- 32 --)
        pslld   xmm5,WORD_BIT           ; xmm5=(-- 03 -- 13 -- 23 -- 33)
        por     xmm3,xmm7               ; xmm3=(04 05 14 15 24 25 34 35)
        por     xmm1,xmm5               ; xmm1=(02 03 12 13 22 23 32 33)

        movdqa    xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)]     ; xmm2=[PB_CENTERJSAMP]

        ; Saturate the words to signed bytes, then add CENTERJSAMPLE to
        ; every byte (wrapping add) to re-centre the samples.
        packsswb  xmm6,xmm3     ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
        packsswb  xmm1,xmm0     ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
        paddb     xmm6,xmm2
        paddb     xmm1,xmm2

        movdqa    xmm4,xmm6     ; transpose coefficients(phase 2)
        punpcklwd xmm6,xmm1     ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
        punpckhwd xmm4,xmm1     ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)

        movdqa    xmm7,xmm6     ; transpose coefficients(phase 3)
        punpckldq xmm6,xmm4     ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
        punpckhdq xmm7,xmm4     ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)

        ; Swap the 64-bit halves so that rows 1 and 3 end up in the low
        ; quadword, where movq can store them.
        pshufd  xmm5,xmm6,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
        pshufd  xmm3,xmm7,0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)

        ; ebx is borrowed as a second row pointer for the stores below.
        pushpic ebx                     ; save GOT address

        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
        mov     ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6       ; row 0
        movq    XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7       ; row 2
        mov     edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
        mov     ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5       ; row 1
        movq    XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3       ; row 3

        poppic  ebx                     ; restore GOT address

        ; Advance by 4 workspace columns / 4 output rows.
        add     esi, byte 4*SIZEOF_FAST_FLOAT   ; wsptr
        add     edi, byte 4*SIZEOF_JSAMPROW
        dec     ecx                             ; ctr
        jnz     near .rowloop

        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        pop     ebx
        mov     esp,ebp         ; esp <- aligned ebp
        pop     esp             ; esp <- address of saved ebp (pre-alignment)
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16
498