1;
2; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2009, D. R. Commander.
6;
7; Based on the x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler); it
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16;
17; This file contains a floating-point implementation of the inverse DCT
18; (Discrete Cosine Transform). The following code is based directly on
19; the IJG's original jidctflt.c; see the jidctflt.c for more details.
20;
21; [TAB8]
22
23%include "jsimdext.inc"
24%include "jdct.inc"
25
26; --------------------------------------------------------------------------
27
; unpcklps2 dst,src -- combine the low halves of two 4-float vectors.
; dst keeps its own floats 0-1 in lanes 0-1 and receives src's floats 0-1 in
; lanes 2-3 (shufps selector 0x44 = lanes 0,1 from %1 then lanes 0,1 from %2).
; Unlike the unpcklps instruction, this moves pairs of floats as a unit
; rather than interleaving single floats.
28%macro  unpcklps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
29        shufps  %1,%2,0x44
30%endmacro
31
; unpckhps2 dst,src -- combine the high halves of two 4-float vectors.
; dst receives its own floats 2-3 in lanes 0-1 and src's floats 2-3 in
; lanes 2-3 (shufps selector 0xEE = lanes 2,3 from %1 then lanes 2,3 from %2).
; Unlike the unpckhps instruction, this moves pairs of floats as a unit
; rather than interleaving single floats.
32%macro  unpckhps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
33        shufps  %1,%2,0xEE
34%endmacro
35
36; --------------------------------------------------------------------------
; Constant table for the floating-point IDCT. Every entry fills one full
; 16-byte XMM word so it can be used directly as a packed memory operand.
37        SECTION SEG_CONST
38
39        alignz  16
40        global  EXTN(jconst_idct_float_sse2)
41
42EXTN(jconst_idct_float_sse2):
43
; Butterfly multipliers, each replicated into all four float lanes; they are
; applied with mulps in both passes of the transform.
44PD_1_414        times 4 dd  1.414213562373095048801689
45PD_1_847        times 4 dd  1.847759065022573512256366
46PD_1_082        times 4 dd  1.082392200292393968799446
47PD_M2_613       times 4 dd -2.613125929752753055713286
; Rounding magic: 100663296.0 = 1.5 * 2^26. A single-precision float in
; [2^26, 2^27) has a unit in the last place of 8, so adding this constant to
; a small value x rounds x to a multiple of 8 and leaves round(x/8) (two's
; complement) in the low mantissa bits, which pass 2 extracts by masking or
; shifting out the upper 16 bits of each dword.
48PD_RNDINT_MAGIC times 4 dd  100663296.0 ; (float)(0x00C00000 << 3)
; CENTERJSAMPLE replicated into 16 bytes; added with paddb after the results
; have been packed to signed bytes. (CENTERJSAMPLE is defined outside this file.)
49PB_CENTERJSAMP  times 16 db CENTERJSAMPLE
50
51        alignz  16
52
53; --------------------------------------------------------------------------
54        SECTION SEG_TEXT
55        BITS    64
56;
57; Perform dequantization and inverse DCT on one block of coefficients.
58;
59; GLOBAL(void)
60; jsimd_idct_float_sse2 (void *dct_table, JCOEFPTR coef_block,
61;                        JSAMPARRAY output_buf, JDIMENSION output_col)
62;
; The routine runs in two passes. Pass 1 (.columnloop) converts the 16-bit
; coefficients to float, multiplies them by the entries of dct_table and
; transforms the columns into a float workspace on the stack. Pass 2
; (.rowloop) transforms the workspace in the other direction, rounds the
; results, re-centers them with PB_CENTERJSAMP and stores 8 bytes per output
; row through the row pointers in output_buf, at column offset output_col.
;
; Registers written: rax, rcx, rdx, rsi, rdi, rbx (saved/restored with
; push/pop in this file), xmm0-xmm7.
; NOTE(review): rsi, rdi and xmm6/xmm7 are modified without an explicit save
; in this file; on ABIs where they are callee-saved this relies on
; collect_args/uncollect_args (defined outside this file) -- confirm.
63
64; r10 = void *dct_table
65; r11 = JCOEFPTR coef_block
66; r12 = JSAMPARRAY output_buf
67; r13 = JDIMENSION output_col
; NOTE(review): presumably the collect_args macro moves the four arguments
; into r10-r13; the macro is not defined in this file -- confirm.
68
; Stack frame, relative to the 16-byte-aligned rbp established in the prologue:
;   [rbp+0]                     pre-alignment stack pointer (original_rbp)
;   [rbp-1*SIZEOF_XMMWORD]      wk(1)   \ two XMM-sized scratch slots
;   [rbp-2*SIZEOF_XMMWORD]      wk(0)   /
;   [wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT .. wk(0))   workspace[DCTSIZE2]
69%define original_rbp    rbp+0
70%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
71%define WK_NUM          2
72%define workspace       wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
73                                        ; FAST_FLOAT workspace[DCTSIZE2]
74
75        align   16
76        global  EXTN(jsimd_idct_float_sse2)
77
78EXTN(jsimd_idct_float_sse2):
79        push    rbp
80        mov     rax,rsp                         ; rax = original rbp
        ; (rax is the stack pointer right after the push, i.e. the address of
        ; the saved rbp; the epilogue returns to it with "pop rsp".)
81        sub     rsp, byte 4
82        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
        ; NOTE(review): the 8-byte store below stays clear of the saved rbp
        ; only if rsp was at least 8-byte aligned on entry; the "byte 4"
        ; adjustment looks inherited from a 32-bit variant -- confirm.
83        mov     [rsp],rax
84        mov     rbp,rsp                         ; rbp = aligned rbp
        ; Reserve wk[] and workspace[]: rsp now points at the workspace base.
85        lea     rsp, [workspace]
86        collect_args
        ; rbx is used as a second row-pointer scratch register in pass 2.
87        push    rbx
88
89        ; ---- Pass 1: process columns from input, store into work array.
90
91        mov     rdx, r10                ; quantptr
92        mov     rsi, r11                ; inptr
93        lea     rdi, [workspace]                        ; FAST_FLOAT *wsptr
        ; Four columns are processed per iteration, hence DCTSIZE/4 iterations.
94        mov     rcx, DCTSIZE/4                          ; ctr
; Pass-1 loop body. Each iteration handles four adjacent columns of the
; coefficient block, one column per float lane. At .nextcolumn rsi (input)
; advances by 4 coefficients, rdx (dequantization multipliers) by 4 entries
; and rdi (workspace) by 4*DCTSIZE floats; rcx counts the iterations.
; The results are transposed before being stored, so each group of four
; columns ends up as four rows of eight floats in the workspace.
95.columnloop:
96%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
        ; Cheap early-out: if the dwords tested in rows 1 and 2 are not both
        ; zero there are AC terms, so go straight to the full transform.
97        mov     eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
98        or      eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
99        jnz     near .columnDCT
100
        ; Full test: OR together the four coefficients of rows 1..7.
101        movq    xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
102        movq    xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
103        movq    xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
104        movq    xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
105        movq    xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
106        movq    xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
107        movq    xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
108        por     xmm1,xmm2
109        por     xmm3,xmm4
110        por     xmm5,xmm6
111        por     xmm1,xmm3
112        por     xmm5,xmm7
113        por     xmm1,xmm5
        ; Squeeze the four OR-ed words into four bytes so one 32-bit test
        ; covers them; signed saturation keeps any non-zero word non-zero.
114        packsswb xmm1,xmm1
115        movd    eax,xmm1
116        test    rax,rax
117        jnz     short .columnDCT
118
119        ; -- AC terms all zero
        ; With only the DC term present, all eight outputs of a column equal
        ; the dequantized row-0 coefficient, so just broadcast it.
120
121        movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
122
        ; Duplicate each word, then arithmetic-shift right: sign-extends the
        ; 16-bit coefficients to 32 bits before the int->float conversion.
123        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
124        psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
125        cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
126
        ; Dequantize.
127        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
128
129        movaps  xmm1,xmm0
130        movaps  xmm2,xmm0
131        movaps  xmm3,xmm0
132
        ; Broadcast lane k (column k's DC value) to all four lanes.
133        shufps  xmm0,xmm0,0x00                  ; xmm0=(00 00 00 00)
134        shufps  xmm1,xmm1,0x55                  ; xmm1=(01 01 01 01)
135        shufps  xmm2,xmm2,0xAA                  ; xmm2=(02 02 02 02)
136        shufps  xmm3,xmm3,0xFF                  ; xmm3=(03 03 03 03)
137
        ; Each broadcast vector fills both halves of its workspace row.
138        movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
139        movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
140        movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
141        movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
142        movaps  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
143        movaps  XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
144        movaps  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
145        movaps  XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
146        jmp     near .nextcolumn
147%endif
148.columnDCT:
149
150        ; -- Even part
        ; Load rows 0,2,4,6, sign-extend to 32 bits, convert to float and
        ; dequantize with the matching rows of the multiplier table.
151
152        movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
153        movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
154        movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
155        movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
156
157        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
158        punpcklwd xmm1,xmm1             ; xmm1=(20 20 21 21 22 22 23 23)
159        psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
160        psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in2=(20 21 22 23)
161        cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
162        cvtdq2ps  xmm1,xmm1                     ; xmm1=in2=(20 21 22 23)
163
164        punpcklwd xmm2,xmm2             ; xmm2=(40 40 41 41 42 42 43 43)
165        punpcklwd xmm3,xmm3             ; xmm3=(60 60 61 61 62 62 63 63)
166        psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in4=(40 41 42 43)
167        psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in6=(60 61 62 63)
168        cvtdq2ps  xmm2,xmm2                     ; xmm2=in4=(40 41 42 43)
169        cvtdq2ps  xmm3,xmm3                     ; xmm3=in6=(60 61 62 63)
170
171        mulps     xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
172        mulps     xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
173        mulps     xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
174        mulps     xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
175
176        movaps  xmm4,xmm0
177        movaps  xmm5,xmm1
178        subps   xmm0,xmm2               ; xmm0=tmp11
179        subps   xmm1,xmm3
180        addps   xmm4,xmm2               ; xmm4=tmp10
181        addps   xmm5,xmm3               ; xmm5=tmp13
182
183        mulps   xmm1,[rel PD_1_414]
184        subps   xmm1,xmm5               ; xmm1=tmp12
185
186        movaps  xmm6,xmm4
187        movaps  xmm7,xmm0
188        subps   xmm4,xmm5               ; xmm4=tmp3
189        subps   xmm0,xmm1               ; xmm0=tmp2
190        addps   xmm6,xmm5               ; xmm6=tmp0
191        addps   xmm7,xmm1               ; xmm7=tmp1
192
        ; Only eight XMM registers are used, so tmp2/tmp3 are spilled to the
        ; wk[] scratch slots while the odd part is computed.
193        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
194        movaps  XMMWORD [wk(0)], xmm0   ; tmp2
195
196        ; -- Odd part
        ; Same load / sign-extend / convert / dequantize sequence for
        ; rows 1,3,5,7.
197
198        movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
199        movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
200        movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
201        movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
202
203        punpcklwd xmm2,xmm2             ; xmm2=(10 10 11 11 12 12 13 13)
204        punpcklwd xmm3,xmm3             ; xmm3=(30 30 31 31 32 32 33 33)
205        psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in1=(10 11 12 13)
206        psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in3=(30 31 32 33)
207        cvtdq2ps  xmm2,xmm2                     ; xmm2=in1=(10 11 12 13)
208        cvtdq2ps  xmm3,xmm3                     ; xmm3=in3=(30 31 32 33)
209
210        punpcklwd xmm5,xmm5             ; xmm5=(50 50 51 51 52 52 53 53)
211        punpcklwd xmm1,xmm1             ; xmm1=(70 70 71 71 72 72 73 73)
212        psrad     xmm5,(DWORD_BIT-WORD_BIT)     ; xmm5=in5=(50 51 52 53)
213        psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in7=(70 71 72 73)
214        cvtdq2ps  xmm5,xmm5                     ; xmm5=in5=(50 51 52 53)
215        cvtdq2ps  xmm1,xmm1                     ; xmm1=in7=(70 71 72 73)
216
217        mulps     xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
218        mulps     xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
219        mulps     xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
220        mulps     xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
221
222        movaps  xmm4,xmm2
223        movaps  xmm0,xmm5
224        addps   xmm2,xmm1               ; xmm2=z11
225        addps   xmm5,xmm3               ; xmm5=z13
226        subps   xmm4,xmm1               ; xmm4=z12
227        subps   xmm0,xmm3               ; xmm0=z10
228
229        movaps  xmm1,xmm2
230        subps   xmm2,xmm5
231        addps   xmm1,xmm5               ; xmm1=tmp7
232
233        mulps   xmm2,[rel PD_1_414]     ; xmm2=tmp11
234
235        movaps  xmm3,xmm0
236        addps   xmm0,xmm4
237        mulps   xmm0,[rel PD_1_847]     ; xmm0=z5
238        mulps   xmm3,[rel PD_M2_613]    ; xmm3=(z10 * -2.613125930)
239        mulps   xmm4,[rel PD_1_082]     ; xmm4=(z12 * 1.082392200)
240        addps   xmm3,xmm0               ; xmm3=tmp12
241        subps   xmm4,xmm0               ; xmm4=tmp10
242
243        ; -- Final output stage
244
245        subps   xmm3,xmm1               ; xmm3=tmp6
246        movaps  xmm5,xmm6
247        movaps  xmm0,xmm7
248        addps   xmm6,xmm1               ; xmm6=data0=(00 01 02 03)
249        addps   xmm7,xmm3               ; xmm7=data1=(10 11 12 13)
250        subps   xmm5,xmm1               ; xmm5=data7=(70 71 72 73)
251        subps   xmm0,xmm3               ; xmm0=data6=(60 61 62 63)
252        subps   xmm2,xmm3               ; xmm2=tmp5
253
254        movaps    xmm1,xmm6             ; transpose coefficients(phase 1)
255        unpcklps  xmm6,xmm7             ; xmm6=(00 10 01 11)
256        unpckhps  xmm1,xmm7             ; xmm1=(02 12 03 13)
257        movaps    xmm3,xmm0             ; transpose coefficients(phase 1)
258        unpcklps  xmm0,xmm5             ; xmm0=(60 70 61 71)
259        unpckhps  xmm3,xmm5             ; xmm3=(62 72 63 73)
260
        ; Swap spilled values: fetch tmp2/tmp3 back and park the half-
        ; transposed rows 6/7 in the same wk[] slots.
261        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
262        movaps  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
263
264        movaps  XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
265        movaps  XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
266
267        addps   xmm4,xmm2               ; xmm4=tmp4
268        movaps  xmm0,xmm7
269        movaps  xmm3,xmm5
270        addps   xmm7,xmm2               ; xmm7=data2=(20 21 22 23)
271        addps   xmm5,xmm4               ; xmm5=data4=(40 41 42 43)
272        subps   xmm0,xmm2               ; xmm0=data5=(50 51 52 53)
273        subps   xmm3,xmm4               ; xmm3=data3=(30 31 32 33)
274
275        movaps    xmm2,xmm7             ; transpose coefficients(phase 1)
276        unpcklps  xmm7,xmm3             ; xmm7=(20 30 21 31)
277        unpckhps  xmm2,xmm3             ; xmm2=(22 32 23 33)
278        movaps    xmm4,xmm5             ; transpose coefficients(phase 1)
279        unpcklps  xmm5,xmm0             ; xmm5=(40 50 41 51)
280        unpckhps  xmm4,xmm0             ; xmm4=(42 52 43 53)
281
282        movaps    xmm3,xmm6             ; transpose coefficients(phase 2)
283        unpcklps2 xmm6,xmm7             ; xmm6=(00 10 20 30)
284        unpckhps2 xmm3,xmm7             ; xmm3=(01 11 21 31)
285        movaps    xmm0,xmm1             ; transpose coefficients(phase 2)
286        unpcklps2 xmm1,xmm2             ; xmm1=(02 12 22 32)
287        unpckhps2 xmm0,xmm2             ; xmm0=(03 13 23 33)
288
289        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
290        movaps  xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
291
        ; First half (rows 0-3 of each input column) of workspace rows 0..3.
292        movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
293        movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
294        movaps  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
295        movaps  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
296
297        movaps    xmm6,xmm5             ; transpose coefficients(phase 2)
298        unpcklps2 xmm5,xmm7             ; xmm5=(40 50 60 70)
299        unpckhps2 xmm6,xmm7             ; xmm6=(41 51 61 71)
300        movaps    xmm3,xmm4             ; transpose coefficients(phase 2)
301        unpcklps2 xmm4,xmm2             ; xmm4=(42 52 62 72)
302        unpckhps2 xmm3,xmm2             ; xmm3=(43 53 63 73)
303
        ; Second half (rows 4-7 of each input column) of workspace rows 0..3.
304        movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
305        movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
306        movaps  XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
307        movaps  XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
308
309.nextcolumn:
        ; Step to the next group of four columns.
310        add     rsi, byte 4*SIZEOF_JCOEF                ; coef_block
311        add     rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE      ; quantptr
312        add     rdi,      4*DCTSIZE*SIZEOF_FAST_FLOAT   ; wsptr
313        dec     rcx                                     ; ctr
314        jnz     near .columnloop
315
316        ; -- Prefetch the next coefficient block
        ; Pass 1 advanced rsi by DCTSIZE/4 * 4 coefficients, so with DCTSIZE
        ; equal to 8 (assumed; defined outside this file) the address
        ; rsi + (DCTSIZE2-8)*SIZEOF_JCOEF is the first byte past the current
        ; block. Four non-temporal prefetch hints cover the next 4*32 bytes.
317
318        prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
319        prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
320        prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
321        prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
322
323        ; ---- Pass 2: process rows from work array, store into output array.
324
        ; output_buf and output_col are already held in r12/r13, so nothing
        ; has to be reloaded from the stack frame before the row loop.
326        lea     rsi, [workspace]                        ; FAST_FLOAT *wsptr
327        mov     rdi, r12        ; (JSAMPROW *)
        ; Writing eax zero-extends into rax, which is used below as the
        ; column index in [row + rax*SIZEOF_JSAMPLE].
328        mov     eax, r13d
        ; Four output rows are produced per iteration.
329        mov     rcx, DCTSIZE/4                          ; ctr
; Pass-2 loop body. Each iteration reads the eight workspace vectors
; XMMBLOCK(0..7,0,rsi) (one float lane per output row), runs the same
; even/odd butterfly as pass 1, rounds and packs the results to bytes, and
; stores 8 bytes to each of the four rows whose pointers are at
; [rdi+0..3*SIZEOF_JSAMPROW]. rsi then advances by 4 floats and rdi by
; 4 row pointers; rcx counts the iterations. rax holds output_col.
330.rowloop:
331
332        ; -- Even part
333
334        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
335        movaps  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
336        movaps  xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
337        movaps  xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
338
339        movaps  xmm4,xmm0
340        movaps  xmm5,xmm1
341        subps   xmm0,xmm2               ; xmm0=tmp11
342        subps   xmm1,xmm3
343        addps   xmm4,xmm2               ; xmm4=tmp10
344        addps   xmm5,xmm3               ; xmm5=tmp13
345
346        mulps   xmm1,[rel PD_1_414]
347        subps   xmm1,xmm5               ; xmm1=tmp12
348
349        movaps  xmm6,xmm4
350        movaps  xmm7,xmm0
351        subps   xmm4,xmm5               ; xmm4=tmp3
352        subps   xmm0,xmm1               ; xmm0=tmp2
353        addps   xmm6,xmm5               ; xmm6=tmp0
354        addps   xmm7,xmm1               ; xmm7=tmp1
355
        ; Spill tmp2/tmp3 to the wk[] scratch slots to free registers for
        ; the odd part.
356        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
357        movaps  XMMWORD [wk(0)], xmm0   ; tmp2
358
359        ; -- Odd part
360
361        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
362        movaps  xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
363        movaps  xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
364        movaps  xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
365
366        movaps  xmm4,xmm2
367        movaps  xmm0,xmm5
368        addps   xmm2,xmm1               ; xmm2=z11
369        addps   xmm5,xmm3               ; xmm5=z13
370        subps   xmm4,xmm1               ; xmm4=z12
371        subps   xmm0,xmm3               ; xmm0=z10
372
373        movaps  xmm1,xmm2
374        subps   xmm2,xmm5
375        addps   xmm1,xmm5               ; xmm1=tmp7
376
377        mulps   xmm2,[rel PD_1_414]     ; xmm2=tmp11
378
379        movaps  xmm3,xmm0
380        addps   xmm0,xmm4
381        mulps   xmm0,[rel PD_1_847]     ; xmm0=z5
382        mulps   xmm3,[rel PD_M2_613]    ; xmm3=(z10 * -2.613125930)
383        mulps   xmm4,[rel PD_1_082]     ; xmm4=(z12 * 1.082392200)
384        addps   xmm3,xmm0               ; xmm3=tmp12
385        subps   xmm4,xmm0               ; xmm4=tmp10
386
387        ; -- Final output stage
388
389        subps   xmm3,xmm1               ; xmm3=tmp6
390        movaps  xmm5,xmm6
391        movaps  xmm0,xmm7
392        addps   xmm6,xmm1               ; xmm6=data0=(00 10 20 30)
393        addps   xmm7,xmm3               ; xmm7=data1=(01 11 21 31)
394        subps   xmm5,xmm1               ; xmm5=data7=(07 17 27 37)
395        subps   xmm0,xmm3               ; xmm0=data6=(06 16 26 36)
396        subps   xmm2,xmm3               ; xmm2=tmp5
397
        ; Float -> integer without a conversion instruction: adding
        ; PD_RNDINT_MAGIC (1.5 * 2^26, where one float ulp is 8) rounds each
        ; value to a multiple of 8 and leaves round(value/8) in the low 16
        ; bits of the dword. xmm3 becomes a mask selecting those low 16 bits.
398        movaps  xmm1,[rel PD_RNDINT_MAGIC]      ; xmm1=[rel PD_RNDINT_MAGIC]
399        pcmpeqd xmm3,xmm3
400        psrld   xmm3,WORD_BIT           ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
401
402        addps   xmm6,xmm1       ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
403        addps   xmm7,xmm1       ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
404        addps   xmm0,xmm1       ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
405        addps   xmm5,xmm1       ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
406
        ; Merge two result vectors into one vector of 16-bit words: the even
        ; column keeps its low word (mask), the odd column is shifted into
        ; the high word, and the two are OR-ed together.
407        pand    xmm6,xmm3               ; xmm6=(00 -- 10 -- 20 -- 30 --)
408        pslld   xmm7,WORD_BIT           ; xmm7=(-- 01 -- 11 -- 21 -- 31)
409        pand    xmm0,xmm3               ; xmm0=(06 -- 16 -- 26 -- 36 --)
410        pslld   xmm5,WORD_BIT           ; xmm5=(-- 07 -- 17 -- 27 -- 37)
411        por     xmm6,xmm7               ; xmm6=(00 01 10 11 20 21 30 31)
412        por     xmm0,xmm5               ; xmm0=(06 07 16 17 26 27 36 37)
413
414        movaps  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
415        movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3
416
417        addps   xmm4,xmm2               ; xmm4=tmp4
418        movaps  xmm7,xmm1
419        movaps  xmm5,xmm3
420        addps   xmm1,xmm2               ; xmm1=data2=(02 12 22 32)
421        addps   xmm3,xmm4               ; xmm3=data4=(04 14 24 34)
422        subps   xmm7,xmm2               ; xmm7=data5=(05 15 25 35)
423        subps   xmm5,xmm4               ; xmm5=data3=(03 13 23 33)
424
        ; Same rounding and word-merging sequence for columns 2..5; the magic
        ; constant and mask are rebuilt because their registers were reused.
425        movaps  xmm2,[rel PD_RNDINT_MAGIC]      ; xmm2=[rel PD_RNDINT_MAGIC]
426        pcmpeqd xmm4,xmm4
427        psrld   xmm4,WORD_BIT           ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
428
429        addps   xmm3,xmm2       ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
430        addps   xmm7,xmm2       ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
431        addps   xmm1,xmm2       ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
432        addps   xmm5,xmm2       ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
433
434        pand    xmm3,xmm4               ; xmm3=(04 -- 14 -- 24 -- 34 --)
435        pslld   xmm7,WORD_BIT           ; xmm7=(-- 05 -- 15 -- 25 -- 35)
436        pand    xmm1,xmm4               ; xmm1=(02 -- 12 -- 22 -- 32 --)
437        pslld   xmm5,WORD_BIT           ; xmm5=(-- 03 -- 13 -- 23 -- 33)
438        por     xmm3,xmm7               ; xmm3=(04 05 14 15 24 25 34 35)
439        por     xmm1,xmm5               ; xmm1=(02 03 12 13 22 23 32 33)
440
441        movdqa    xmm2,[rel PB_CENTERJSAMP]     ; xmm2=[rel PB_CENTERJSAMP]
442
        ; packsswb saturates the 16-bit results to signed bytes; adding
        ; PB_CENTERJSAMP to every byte then re-centers them.
443        packsswb  xmm6,xmm3     ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
444        packsswb  xmm1,xmm0     ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
445        paddb     xmm6,xmm2
446        paddb     xmm1,xmm2
447
        ; Reorder the bytes so that each 8-byte group is one complete row.
448        movdqa    xmm4,xmm6     ; transpose coefficients(phase 2)
449        punpcklwd xmm6,xmm1     ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
450        punpckhwd xmm4,xmm1     ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
451
452        movdqa    xmm7,xmm6     ; transpose coefficients(phase 3)
453        punpckldq xmm6,xmm4     ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
454        punpckhdq xmm7,xmm4     ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
455
        ; Swap the 64-bit halves so rows 1 and 3 land in the low quadword,
        ; which is what movq stores.
456        pshufd  xmm5,xmm6,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
457        pshufd  xmm3,xmm7,0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
458
        ; Store rows 0 and 2, then rows 1 and 3; rdx/rbx hold the row
        ; pointers and rax the column offset.
459        mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
460        mov     rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
461        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
462        movq    XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
463        mov     rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
464        mov     rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
465        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
466        movq    XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
467
        ; Step to the next four rows.
468        add     rsi, byte 4*SIZEOF_FAST_FLOAT   ; wsptr
469        add     rdi, byte 4*SIZEOF_JSAMPROW
470        dec     rcx                             ; ctr
471        jnz     near .rowloop
472
        ; Epilogue: undo the prologue in reverse order.
473        pop     rbx
        ; uncollect_args is defined outside this file; presumably it reverses
        ; whatever collect_args did in the prologue -- confirm.
474        uncollect_args
        ; Discard wk[] and workspace[]; rsp now points at the slot where the
        ; prologue stored the pre-alignment stack pointer.
475        mov     rsp,rbp         ; rsp <- aligned rbp
        ; Reload that saved value into rsp, which then points at the saved rbp.
476        pop     rsp             ; rsp <- original rbp
477        pop     rbp
478        ret
479
480; For some reason, the OS X linker does not honor the request to align the
481; segment unless we do this.
482        align   16
483