;
; jdmrgext.asm - merged upsampling/color conversion (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
;
; GLOBAL(void)
; jsimd_h2v1_merged_upsample_mmx (JDIMENSION output_width,
;                                 JSAMPIMAGE input_buf,
;                                 JDIMENSION in_row_group_ctr,
;                                 JSAMPARRAY output_buf);
;
; Arguments are read from the stack at the offsets given by the macros
; below.  One row is taken from each of input_buf[0..2], indexed by
; in_row_group_ctr, and the result is written to the row output_buf[0].
; Nothing is done when output_width is 0.
;
; Stack frame built by the prologue:
;   eax           esp right after "push ebp"; only valid until the column
;                 loop starts, used as the base for the argument macros
;   ebp           an SIZEOF_MMWORD-aligned address below that; [ebp] holds
;                 the unaligned value so the epilogue can restore esp
;   wk(0)..wk(2)  three mmword scratch slots directly below ebp
;   gotptr        pointer-sized slot directly below wk(0)
;
; Register roles inside the column loop:
;   esi = inptr0 (row from input_buf[0]), advanced one mmword per Y pass
;   ebx = inptr1 (row from input_buf[1]), advanced one mmword per loop pass
;   edx = inptr2 (row from input_buf[2]), advanced one mmword per loop pass
;   edi = outptr
;   ecx = number of output columns still to be written
;   al  = Yctr, counts the two Y passes made for each chroma mmword
;         (eax holds the GOT address while the constants are fetched)
;
; ebx, esi and edi are saved and restored; eax, ecx and edx are not.
;
; NOTE(review): pushpic/get_GOT/movpic/GOTOFF/alignx, the mmA..mmH register
; aliases, RGB_PIXELSIZE, RGBX_FILLER_0XFF, SCALEBITS and the PW_*/PD_*
; constants are not defined in this file; presumably they come from
; jcolsamp.inc or from the file that includes this one -- confirm there.
;

%define output_width(b) (b)+8                   ; JDIMENSION output_width
%define input_buf(b)            (b)+12          ; JSAMPIMAGE input_buf
%define in_row_group_ctr(b)     (b)+16          ; JDIMENSION in_row_group_ctr
%define output_buf(b)           (b)+20          ; JSAMPARRAY output_buf

%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
%define WK_NUM          3
%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr

        align   16
        global  EXTN(jsimd_h2v1_merged_upsample_mmx)

EXTN(jsimd_h2v1_merged_upsample_mmx):
        push    ebp
        mov     eax,esp                         ; eax = original ebp
        sub     esp, byte 4
        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
        mov     [esp],eax                       ; keep unaligned frame for epilogue
        mov     ebp,esp                         ; ebp = aligned ebp
        lea     esp, [wk(0)]                    ; reserve wk[WK_NUM]
        pushpic eax             ; make a room for GOT address
        push    ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        get_GOT ebx                     ; get GOT address
        movpic  POINTER [gotptr], ebx   ; save GOT address

        mov     ecx, JDIMENSION [output_width(eax)]     ; col
        test    ecx,ecx
        jz      near .return            ; nothing to do; no MMX state touched yet

        push    ecx                     ; ecx is reused as the row index below

        mov     edi, JSAMPIMAGE [input_buf(eax)]
        mov     ecx, JDIMENSION [in_row_group_ctr(eax)]
        mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
        mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
        mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
        mov     edi, JSAMPARRAY [output_buf(eax)]
        mov     esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]         ; inptr0
        mov     ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]         ; inptr1
        mov     edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]         ; inptr2
        mov     edi, JSAMPROW [edi]                             ; outptr

        pop     ecx                     ; col

        ; Each pass of .columnloop consumes one mmword (8 samples) of Cb and
        ; of Cr and derives the (R-Y), (G-Y) and (B-Y) terms for them.  The
        ; low four are used at once with the first mmword of Y; the high four
        ; are parked in wk() and used with the following mmword of Y.
        alignx  16,7
.columnloop:
        movpic  eax, POINTER [gotptr]   ; load GOT address (eax)

        movq      mm6, MMWORD [ebx]     ; mm6=Cb(01234567)
        movq      mm7, MMWORD [edx]     ; mm7=Cr(01234567)

        pxor      mm1,mm1               ; mm1=(all 0's)
        pcmpeqw   mm3,mm3
        psllw     mm3,7                 ; mm3={0xFF80 0xFF80 0xFF80 0xFF80}

        movq      mm4,mm6
        punpckhbw mm6,mm1               ; mm6=Cb(4567)=CbH
        punpcklbw mm4,mm1               ; mm4=Cb(0123)=CbL
        movq      mm0,mm7
        punpckhbw mm7,mm1               ; mm7=Cr(4567)=CrH
        punpcklbw mm0,mm1               ; mm0=Cr(0123)=CrL

        ; Adding 0xFF80 to each word subtracts 128, centring Cb/Cr on zero.
        paddw     mm6,mm3
        paddw     mm4,mm3
        paddw     mm7,mm3
        paddw     mm0,mm3

        ; (Original)
        ; R = Y                + 1.40200 * Cr
        ; G = Y - 0.34414 * Cb - 0.71414 * Cr
        ; B = Y + 1.77200 * Cb
        ;
        ; (This implementation)
        ; R = Y                + 0.40200 * Cr + Cr
        ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
        ; B = Y - 0.22800 * Cb + Cb + Cb

        movq    mm5,mm6                 ; mm5=CbH
        movq    mm2,mm4                 ; mm2=CbL
        paddw   mm6,mm6                 ; mm6=2*CbH
        paddw   mm4,mm4                 ; mm4=2*CbL
        movq    mm1,mm7                 ; mm1=CrH
        movq    mm3,mm0                 ; mm3=CrL
        paddw   mm7,mm7                 ; mm7=2*CrH
        paddw   mm0,mm0                 ; mm0=2*CrL

        pmulhw  mm6,[GOTOFF(eax,PW_MF0228)]     ; mm6=(2*CbH * -FIX(0.22800))
        pmulhw  mm4,[GOTOFF(eax,PW_MF0228)]     ; mm4=(2*CbL * -FIX(0.22800))
        pmulhw  mm7,[GOTOFF(eax,PW_F0402)]      ; mm7=(2*CrH * FIX(0.40200))
        pmulhw  mm0,[GOTOFF(eax,PW_F0402)]      ; mm0=(2*CrL * FIX(0.40200))

        ; Add PW_ONE before the arithmetic shift by one to undo the doubling.
        paddw   mm6,[GOTOFF(eax,PW_ONE)]
        paddw   mm4,[GOTOFF(eax,PW_ONE)]
        psraw   mm6,1                   ; mm6=(CbH * -FIX(0.22800))
        psraw   mm4,1                   ; mm4=(CbL * -FIX(0.22800))
        paddw   mm7,[GOTOFF(eax,PW_ONE)]
        paddw   mm0,[GOTOFF(eax,PW_ONE)]
        psraw   mm7,1                   ; mm7=(CrH * FIX(0.40200))
        psraw   mm0,1                   ; mm0=(CrL * FIX(0.40200))

        paddw   mm6,mm5
        paddw   mm4,mm2
        paddw   mm6,mm5                 ; mm6=(CbH * FIX(1.77200))=(B-Y)H
        paddw   mm4,mm2                 ; mm4=(CbL * FIX(1.77200))=(B-Y)L
        paddw   mm7,mm1                 ; mm7=(CrH * FIX(1.40200))=(R-Y)H
        paddw   mm0,mm3                 ; mm0=(CrL * FIX(1.40200))=(R-Y)L

        movq    MMWORD [wk(0)], mm6     ; wk(0)=(B-Y)H
        movq    MMWORD [wk(1)], mm7     ; wk(1)=(R-Y)H

        ; Interleave Cb with Cr word-wise so that one pmaddwd per register
        ; yields Cb*(first constant) + Cr*(second constant) as dwords.
        movq      mm6,mm5
        movq      mm7,mm2
        punpcklwd mm5,mm1
        punpckhwd mm6,mm1
        pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]
        pmaddwd   mm6,[GOTOFF(eax,PW_MF0344_F0285)]
        punpcklwd mm2,mm3
        punpckhwd mm7,mm3
        pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
        pmaddwd   mm7,[GOTOFF(eax,PW_MF0344_F0285)]

        paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]
        paddd     mm6,[GOTOFF(eax,PD_ONEHALF)]
        psrad     mm5,SCALEBITS
        psrad     mm6,SCALEBITS
        paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]
        paddd     mm7,[GOTOFF(eax,PD_ONEHALF)]
        psrad     mm2,SCALEBITS
        psrad     mm7,SCALEBITS

        packssdw  mm5,mm6       ; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
        packssdw  mm2,mm7       ; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
        psubw     mm5,mm1       ; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
        psubw     mm2,mm3       ; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L

        movq    MMWORD [wk(2)], mm5     ; wk(2)=(G-Y)H

        ; From here on eax no longer holds the GOT address: al is Yctr.
        ; mm0/mm2/mm4 already hold the low-half (R-Y)/(G-Y)/(B-Y) terms, so
        ; the first Y pass skips the reload done at .Yloop_2nd.
        mov     al,2                    ; Yctr
        jmp     short .Yloop_1st
        alignx  16,7

.Yloop_2nd:
        movq    mm0, MMWORD [wk(1)]     ; mm0=(R-Y)H
        movq    mm2, MMWORD [wk(2)]     ; mm2=(G-Y)H
        movq    mm4, MMWORD [wk(0)]     ; mm4=(B-Y)H
        alignx  16,7

.Yloop_1st:
        movq    mm7, MMWORD [esi]       ; mm7=Y(01234567)

        ; Split Y into even and odd samples; both members of a pair share
        ; the same chroma-derived term (2:1 horizontal upsampling).
        pcmpeqw mm6,mm6
        psrlw   mm6,BYTE_BIT            ; mm6={0xFF 0x00 0xFF 0x00 ..}
        pand    mm6,mm7                 ; mm6=Y(0246)=YE
        psrlw   mm7,BYTE_BIT            ; mm7=Y(1357)=YO

        movq    mm1,mm0                 ; mm1=mm0=(R-Y)(L/H)
        movq    mm3,mm2                 ; mm3=mm2=(G-Y)(L/H)
        movq    mm5,mm4                 ; mm5=mm4=(B-Y)(L/H)

        paddw     mm0,mm6               ; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
        paddw     mm1,mm7               ; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
        packuswb  mm0,mm0               ; mm0=(R0 R2 R4 R6 ** ** ** **)
        packuswb  mm1,mm1               ; mm1=(R1 R3 R5 R7 ** ** ** **)

        paddw     mm2,mm6               ; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
        paddw     mm3,mm7               ; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
        packuswb  mm2,mm2               ; mm2=(G0 G2 G4 G6 ** ** ** **)
        packuswb  mm3,mm3               ; mm3=(G1 G3 G5 G7 ** ** ** **)

        paddw     mm4,mm6               ; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
        paddw     mm5,mm7               ; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
        packuswb  mm4,mm4               ; mm4=(B0 B2 B4 B6 ** ** ** **)
        packuswb  mm5,mm5               ; mm5=(B1 B3 B5 B7 ** ** ** **)

%if RGB_PIXELSIZE == 3 ; ---------------

        ; In the lane diagrams below the first digit is the byte position
        ; within a pixel and the second digit is the pixel number.

        ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
        ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
        ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
        ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)

        punpcklbw mmA,mmC               ; mmA=(00 10 02 12 04 14 06 16)
        punpcklbw mmE,mmB               ; mmE=(20 01 22 03 24 05 26 07)
        punpcklbw mmD,mmF               ; mmD=(11 21 13 23 15 25 17 27)

        movq      mmG,mmA
        movq      mmH,mmA
        punpcklwd mmA,mmE               ; mmA=(00 10 20 01 02 12 22 03)
        punpckhwd mmG,mmE               ; mmG=(04 14 24 05 06 16 26 07)

        psrlq     mmH,2*BYTE_BIT        ; mmH=(02 12 04 14 06 16 -- --)
        psrlq     mmE,2*BYTE_BIT        ; mmE=(22 03 24 05 26 07 -- --)

        movq      mmC,mmD
        movq      mmB,mmD
        punpcklwd mmD,mmH               ; mmD=(11 21 02 12 13 23 04 14)
        punpckhwd mmC,mmH               ; mmC=(15 25 06 16 17 27 -- --)

        psrlq     mmB,2*BYTE_BIT        ; mmB=(13 23 15 25 17 27 -- --)

        movq      mmF,mmE
        punpcklwd mmE,mmB               ; mmE=(22 03 13 23 24 05 15 25)
        punpckhwd mmF,mmB               ; mmF=(26 07 17 27 -- -- -- --)

        punpckldq mmA,mmD               ; mmA=(00 10 20 01 11 21 02 12)
        punpckldq mmE,mmG               ; mmE=(22 03 13 23 04 14 24 05)
        punpckldq mmC,mmF               ; mmC=(15 25 06 16 26 07 17 27)

        cmp     ecx, byte SIZEOF_MMWORD
        jb      short .column_st16      ; fewer than 8 columns left: partial store

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmE
        movq    MMWORD [edi+2*SIZEOF_MMWORD], mmC

        sub     ecx, byte SIZEOF_MMWORD
        jz      near .endcolumn

        add     edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; outptr
        add     esi, byte SIZEOF_MMWORD                 ; inptr0
        dec     al                      ; Yctr
        jnz     near .Yloop_2nd

        add     ebx, byte SIZEOF_MMWORD                 ; inptr1
        add     edx, byte SIZEOF_MMWORD                 ; inptr2
        jmp     near .columnloop
        alignx  16,7

        ; Partial store: ecx (< 8 columns) is turned into a byte count and
        ; the pending bytes are written in pieces of 16, 8, 4, 2 and 1.
        ; mmA always holds the next bytes to write; eax is scratch here.
.column_st16:
        lea     ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
        cmp     ecx, byte 2*SIZEOF_MMWORD
        jb      short .column_st8
        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmE
        movq    mmA,mmC
        sub     ecx, byte 2*SIZEOF_MMWORD
        add     edi, byte 2*SIZEOF_MMWORD
        jmp     short .column_st4       ; at most 21-16=5 bytes remain
.column_st8:
        cmp     ecx, byte SIZEOF_MMWORD
        jb      short .column_st4
        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
        movq    mmA,mmE
        sub     ecx, byte SIZEOF_MMWORD
        add     edi, byte SIZEOF_MMWORD
.column_st4:
        movd    eax,mmA
        cmp     ecx, byte SIZEOF_DWORD
        jb      short .column_st2
        mov     DWORD [edi+0*SIZEOF_DWORD], eax
        psrlq   mmA,DWORD_BIT
        movd    eax,mmA
        sub     ecx, byte SIZEOF_DWORD
        add     edi, byte SIZEOF_DWORD
.column_st2:
        cmp     ecx, byte SIZEOF_WORD
        jb      short .column_st1
        mov     WORD [edi+0*SIZEOF_WORD], ax
        shr     eax,WORD_BIT
        sub     ecx, byte SIZEOF_WORD
        add     edi, byte SIZEOF_WORD
.column_st1:
        cmp     ecx, byte SIZEOF_BYTE
        jb      short .endcolumn
        mov     BYTE [edi+0*SIZEOF_BYTE], al

%else ; RGB_PIXELSIZE == 4 ; -----------

        ; mm6/mm7 are free again (Y has been consumed) and supply the
        ; filler byte: all ones or all zeros.
%ifdef RGBX_FILLER_0XFF
        pcmpeqb   mm6,mm6               ; mm6=(X0 X2 X4 X6 ** ** ** **)
        pcmpeqb   mm7,mm7               ; mm7=(X1 X3 X5 X7 ** ** ** **)
%else
        pxor      mm6,mm6               ; mm6=(X0 X2 X4 X6 ** ** ** **)
        pxor      mm7,mm7               ; mm7=(X1 X3 X5 X7 ** ** ** **)
%endif
        ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
        ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
        ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
        ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)

        punpcklbw mmA,mmC               ; mmA=(00 10 02 12 04 14 06 16)
        punpcklbw mmE,mmG               ; mmE=(20 30 22 32 24 34 26 36)
        punpcklbw mmB,mmD               ; mmB=(01 11 03 13 05 15 07 17)
        punpcklbw mmF,mmH               ; mmF=(21 31 23 33 25 35 27 37)

        movq      mmC,mmA
        punpcklwd mmA,mmE               ; mmA=(00 10 20 30 02 12 22 32)
        punpckhwd mmC,mmE               ; mmC=(04 14 24 34 06 16 26 36)
        movq      mmG,mmB
        punpcklwd mmB,mmF               ; mmB=(01 11 21 31 03 13 23 33)
        punpckhwd mmG,mmF               ; mmG=(05 15 25 35 07 17 27 37)

        movq      mmD,mmA
        punpckldq mmA,mmB               ; mmA=(00 10 20 30 01 11 21 31)
        punpckhdq mmD,mmB               ; mmD=(02 12 22 32 03 13 23 33)
        movq      mmH,mmC
        punpckldq mmC,mmG               ; mmC=(04 14 24 34 05 15 25 35)
        punpckhdq mmH,mmG               ; mmH=(06 16 26 36 07 17 27 37)

        cmp     ecx, byte SIZEOF_MMWORD
        jb      short .column_st16      ; fewer than 8 columns left: partial store

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmD
        movq    MMWORD [edi+2*SIZEOF_MMWORD], mmC
        movq    MMWORD [edi+3*SIZEOF_MMWORD], mmH

        sub     ecx, byte SIZEOF_MMWORD
        jz      short .endcolumn

        add     edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; outptr
        add     esi, byte SIZEOF_MMWORD                 ; inptr0
        dec     al                      ; Yctr
        jnz     near .Yloop_2nd

        add     ebx, byte SIZEOF_MMWORD                 ; inptr1
        add     edx, byte SIZEOF_MMWORD                 ; inptr2
        jmp     near .columnloop
        alignx  16,7

        ; Partial store: ecx stays a column count (< 8).  Each mmword holds
        ; two 4-byte pixels, so 4, 2 and 1 remaining columns correspond to
        ; stores of two mmwords, one mmword and one dword.
.column_st16:
        cmp     ecx, byte SIZEOF_MMWORD/2
        jb      short .column_st8
        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmD
        movq    mmA,mmC
        movq    mmD,mmH
        sub     ecx, byte SIZEOF_MMWORD/2
        add     edi, byte 2*SIZEOF_MMWORD
.column_st8:
        cmp     ecx, byte SIZEOF_MMWORD/4
        jb      short .column_st4
        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
        movq    mmA,mmD
        sub     ecx, byte SIZEOF_MMWORD/4
        add     edi, byte 1*SIZEOF_MMWORD
.column_st4:
        cmp     ecx, byte SIZEOF_MMWORD/8
        jb      short .endcolumn
        movd    DWORD [edi+0*SIZEOF_DWORD], mmA

%endif ; RGB_PIXELSIZE ; ---------------

.endcolumn:
        emms            ; empty MMX state

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        pop     ebx
        mov     esp,ebp         ; esp <- aligned ebp
        pop     esp             ; esp <- original ebp
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
;
; GLOBAL(void)
; jsimd_h2v2_merged_upsample_mmx (JDIMENSION output_width,
;                                 JSAMPIMAGE input_buf,
;                                 JDIMENSION in_row_group_ctr,
;                                 JSAMPARRAY output_buf);
;
; Implemented as two calls to jsimd_h2v1_merged_upsample_mmx that share the
; same rows of input_buf[1] and input_buf[2] and use two consecutive rows
; of input_buf[0], writing output_buf[0] and then output_buf[1].
;
; A substitute three-entry input_buf array is built on the stack.  Its
; first entry is input_buf[0] advanced by in_row_group_ctr row pointers;
; the callee indexes it by in_row_group_ctr once more, so it reads row
; 2*in_row_group_ctr on the first call.  Between the calls that entry and
; the pushed output_buf argument are each advanced by one row pointer in
; place, which relies on the callee leaving its stack arguments untouched
; and preserving ebx, esi and edi.
;
; ebx, esi and edi are saved and restored; eax, ecx and edx are not.
;

%define output_width(b) (b)+8                   ; JDIMENSION output_width
%define input_buf(b)            (b)+12          ; JSAMPIMAGE input_buf
%define in_row_group_ctr(b)     (b)+16          ; JDIMENSION in_row_group_ctr
%define output_buf(b)           (b)+20          ; JSAMPARRAY output_buf

        align   16
        global  EXTN(jsimd_h2v2_merged_upsample_mmx)

EXTN(jsimd_h2v2_merged_upsample_mmx):
        push    ebp
        mov     ebp,esp
        push    ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        mov     eax, JDIMENSION [output_width(ebp)]

        mov     edi, JSAMPIMAGE [input_buf(ebp)]
        mov     ecx, JDIMENSION [in_row_group_ctr(ebp)]
        mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
        mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
        mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
        mov     edi, JSAMPARRAY [output_buf(ebp)]
        lea     esi, [esi+ecx*SIZEOF_JSAMPROW]  ; &input_buf[0][in_row_group_ctr]

        ; Substitute input_buf array, pushed so that entry 0 is lowest.
        push    edx                     ; inptr2
        push    ebx                     ; inptr1
        push    esi                     ; inptr00
        mov     ebx,esp                 ; ebx = address of the array

        ; Arguments for the h2v1 routine, last one first.
        push    edi                     ; output_buf (outptr0)
        push    ecx                     ; in_row_group_ctr
        push    ebx                     ; input_buf
        push    eax                     ; output_width

        call    near EXTN(jsimd_h2v1_merged_upsample_mmx)

        ; Patch the stack in place for the second row instead of re-pushing:
        ; [ebx+0] is array entry 0, [ebx-1*SIZEOF_POINTER] is the pushed
        ; output_buf argument.
        add     esi, byte SIZEOF_JSAMPROW       ; inptr01
        add     edi, byte SIZEOF_JSAMPROW       ; outptr1
        mov     POINTER [ebx+0*SIZEOF_POINTER], esi
        mov     POINTER [ebx-1*SIZEOF_POINTER], edi

        call    near EXTN(jsimd_h2v1_merged_upsample_mmx)

        add     esp, byte 7*SIZEOF_DWORD        ; drop 4 arguments + 3 array entries

        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        pop     ebx
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16
