1;
2; jcsample.asm - downsampling (SSE2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2016, D. R. Commander.
6;
7; Based on the x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler),
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16
17%include "jsimdext.inc"
18
19; --------------------------------------------------------------------------
20    SECTION     SEG_TEXT
21    BITS        32
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing: every output sample is the biased average of two
; horizontally adjacent input samples.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Arguments are read from the stack relative to ebp (after the prologue,
; [ebp] = saved ebp, [ebp+4] = return address, first argument at [ebp+8]).
;
; Preserved: ebp, esi, edi (saved/restored here); ebx is never touched.
; Clobbered: eax, ecx, edx, xmm0-xmm3, xmm6, xmm7, flags.
; Rows are accessed with movdqa, so every input and output row pointer must
; be 16-byte aligned.
;

%define img_width(b)    (b) + 8         ; JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
    push        ebp
    mov         ebp, esp
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; ecx = output_cols (width_in_blocks * 8)
    jz          near .done              ; zero blocks: nothing to produce

    mov         edx, JDIMENSION [img_width(ebp)]  ; edx = image_width

    ; ---- Step 1: replicate the last real sample of each input row into
    ; ---- columns [image_width, output_cols * 2) so the averaging loop can
    ; ---- consume whole 16-byte chunks.

    push        ecx                     ; keep output_cols for step 2
    add         ecx, ecx                ; ecx = output_cols * 2 (input columns)
    sub         ecx, edx                ; ecx = number of columns to pad
    jle         short .pad_done         ; row already wide enough

    mov         eax, INT [max_v_samp(ebp)]  ; eax = rows still to pad
    test        eax, eax
    jle         short .pad_done

    mov         esi, JSAMPARRAY [input_data(ebp)]  ; esi -> current row pointer
    cld                                 ; rep stosb must store forward
    alignx      16, 7
.pad_row:
    push        ecx                     ; rep stosb consumes ecx
    push        eax                     ; al is about to hold the fill byte

    mov         edi, JSAMPROW [esi]     ; edi = start of this row
    mov         al, JSAMPLE [edi+edx-1] ; al = last real sample of the row
    add         edi, edx                ; edi -> first padding column

    rep stosb                           ; fill ecx padding bytes with al

    pop         eax
    pop         ecx

    add         esi, byte SIZEOF_JSAMPROW  ; advance to next row pointer
    dec         eax
    jg          short .pad_row

.pad_done:
    pop         ecx                     ; ecx = output_cols again

    ; ---- Step 2: average horizontal pairs, one output row per input row.

    mov         eax, JDIMENSION [v_samp(ebp)]  ; eax = rows left to produce
    test        eax, eax
    jle         near .done

    pcmpeqw     xmm6, xmm6              ; all bits set
    psrlw       xmm6, BYTE_BIT          ; xmm6 = 0x00FF in every word
    mov         edx, 0x00010000         ; low word 0, high word 1
    movd        xmm7, edx
    pshufd      xmm7, xmm7, 0x00        ; xmm7 = bias words {0,1,0,1,0,1,0,1}

    mov         edi, JSAMPARRAY [output_data(ebp)]  ; edi -> output row pointer
    mov         esi, JSAMPARRAY [input_data(ebp)]   ; esi -> input row pointer
    alignx      16, 7
.next_row:
    push        esi                     ; row-array cursors and the column
    push        edi                     ; count are reused inside the row,
    push        ecx                     ; so park them on the stack

    mov         edi, JSAMPROW [edi]     ; edi = outptr
    mov         esi, JSAMPROW [esi]     ; esi = inptr

    cmp         ecx, byte SIZEOF_XMMWORD
    jae         short .full_chunk
    alignx      16, 7

    ; Fewer than 16 output columns remain. output_cols is a multiple of 8,
    ; so exactly 8 are left: they need one 16-byte input load, and the
    ; second half of the result is synthesized from zeros. The 16-byte store
    ; below then writes 8 zero bytes past column output_cols.
    ; NOTE(review): presumably the caller's output rows are padded to allow
    ; that store - confirm against the buffer allocation.
.tail_chunk:
    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    pxor        xmm1, xmm1
    mov         ecx, SIZEOF_XMMWORD     ; makes the count hit zero after this pass
    jmp         short .average
    alignx      16, 7

    ; 32 input samples -> 16 output samples.
.full_chunk:
    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]

.average:
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm1

    psrlw       xmm2, BYTE_BIT          ; odd-numbered samples, one per word
    psrlw       xmm3, BYTE_BIT
    pand        xmm0, xmm6              ; even-numbered samples, one per word
    pand        xmm1, xmm6

    paddw       xmm0, xmm2              ; even + odd
    paddw       xmm1, xmm3
    paddw       xmm0, xmm7              ; + alternating 0/1 bias
    paddw       xmm1, xmm7
    psrlw       xmm0, 1                 ; / 2
    psrlw       xmm1, 1

    packuswb    xmm0, xmm1              ; 16 words -> 16 output bytes

    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr  += 32
    add         edi, byte 1*SIZEOF_XMMWORD  ; outptr += 16
    sub         ecx, byte SIZEOF_XMMWORD    ; 16 fewer output columns to go
    cmp         ecx, byte SIZEOF_XMMWORD
    jae         short .full_chunk
    test        ecx, ecx
    jnz         short .tail_chunk

    pop         ecx                     ; ecx = output_cols
    pop         edi                     ; edi -> output row pointer
    pop         esi                     ; esi -> input row pointer

    add         edi, byte SIZEOF_JSAMPROW  ; next output row
    add         esi, byte SIZEOF_JSAMPROW  ; next input row
    dec         eax                        ; one row done
    jg          near .next_row

.done:
    pop         edi
    pop         esi
    pop         ebp
    ret
176
177; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing: every output sample is the biased average of a 2x2
; square of input samples taken from two consecutive input rows.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Arguments are read from the stack relative to ebp (after the prologue,
; [ebp] = saved ebp, [ebp+4] = return address, first argument at [ebp+8]).
;
; Preserved: ebp, esi, edi (saved/restored here); ebx is never touched.
; Clobbered: eax, ecx, edx, xmm0-xmm7, flags.
; Rows are accessed with movdqa, so every input and output row pointer must
; be 16-byte aligned.
;

%define img_width(b)    (b) + 8         ; JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
    push        ebp
    mov         ebp, esp
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; ecx = output_cols (width_in_blocks * 8)
    jz          near .done              ; zero blocks: nothing to produce

    mov         edx, JDIMENSION [img_width(ebp)]  ; edx = image_width

    ; ---- Step 1: replicate the last real sample of each input row into
    ; ---- columns [image_width, output_cols * 2) so the averaging loop can
    ; ---- consume whole 16-byte chunks.

    push        ecx                     ; keep output_cols for step 2
    add         ecx, ecx                ; ecx = output_cols * 2 (input columns)
    sub         ecx, edx                ; ecx = number of columns to pad
    jle         short .pad_done         ; rows already wide enough

    mov         eax, INT [max_v_samp(ebp)]  ; eax = rows still to pad
    test        eax, eax
    jle         short .pad_done

    mov         esi, JSAMPARRAY [input_data(ebp)]  ; esi -> current row pointer
    cld                                 ; rep stosb must store forward
    alignx      16, 7
.pad_row:
    push        ecx                     ; rep stosb consumes ecx
    push        eax                     ; al is about to hold the fill byte

    mov         edi, JSAMPROW [esi]     ; edi = start of this row
    mov         al, JSAMPLE [edi+edx-1] ; al = last real sample of the row
    add         edi, edx                ; edi -> first padding column

    rep stosb                           ; fill ecx padding bytes with al

    pop         eax
    pop         ecx

    add         esi, byte SIZEOF_JSAMPROW  ; advance to next row pointer
    dec         eax
    jg          short .pad_row

.pad_done:
    pop         ecx                     ; ecx = output_cols again

    ; ---- Step 2: average 2x2 squares, one output row per two input rows.

    mov         eax, JDIMENSION [v_samp(ebp)]  ; eax = output rows left
    test        eax, eax
    jle         near .done

    pcmpeqw     xmm6, xmm6              ; all bits set
    psrlw       xmm6, BYTE_BIT          ; xmm6 = 0x00FF in every word
    mov         edx, 0x00020001         ; low word 1, high word 2
    movd        xmm7, edx
    pshufd      xmm7, xmm7, 0x00        ; xmm7 = bias words {1,2,1,2,1,2,1,2}

    mov         edi, JSAMPARRAY [output_data(ebp)]  ; edi -> output row pointer
    mov         esi, JSAMPARRAY [input_data(ebp)]   ; esi -> input row pointer
    alignx      16, 7
.next_row:
    push        esi                     ; row-array cursors and the column
    push        edi                     ; count are reused inside the row,
    push        ecx                     ; so park them on the stack

    mov         edi, JSAMPROW [edi]                    ; edi = outptr
    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; edx = inptr0 (upper row)
    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; esi = inptr1 (lower row)

    cmp         ecx, byte SIZEOF_XMMWORD
    jae         short .full_chunk
    alignx      16, 7

    ; Fewer than 16 output columns remain. output_cols is a multiple of 8,
    ; so exactly 8 are left: they need one 16-byte load per input row, and
    ; the second half of the result is synthesized from zeros. The 16-byte
    ; store below then writes 8 zero bytes past column output_cols.
    ; NOTE(review): presumably the caller's output rows are padded to allow
    ; that store - confirm against the buffer allocation.
.tail_chunk:
    movdqa      xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
    pxor        xmm2, xmm2
    pxor        xmm3, xmm3
    mov         ecx, SIZEOF_XMMWORD     ; makes the count hit zero after this pass
    jmp         short .average
    alignx      16, 7

    ; 2 rows x 32 input samples -> 16 output samples.
.full_chunk:
    movdqa      xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]  ; upper row, first half
    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]  ; lower row, first half
    movdqa      xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]  ; upper row, second half
    movdqa      xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]  ; lower row, second half

.average:
    ; First half: horizontal pair sums for each row, then add the two rows.
    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm1
    psrlw       xmm4, BYTE_BIT          ; odd-numbered samples, one per word
    psrlw       xmm5, BYTE_BIT
    pand        xmm0, xmm6              ; even-numbered samples, one per word
    pand        xmm1, xmm6
    paddw       xmm0, xmm4              ; upper row: even + odd
    paddw       xmm1, xmm5              ; lower row: even + odd
    paddw       xmm0, xmm1              ; xmm0 = 2x2 sums, first half

    ; Second half: same computation, reusing xmm4/xmm5 as temporaries.
    movdqa      xmm4, xmm2
    movdqa      xmm5, xmm3
    psrlw       xmm4, BYTE_BIT
    psrlw       xmm5, BYTE_BIT
    pand        xmm2, xmm6
    pand        xmm3, xmm6
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5
    paddw       xmm2, xmm3              ; xmm2 = 2x2 sums, second half

    paddw       xmm0, xmm7              ; + alternating 1/2 bias
    paddw       xmm2, xmm7
    psrlw       xmm0, 2                 ; / 4
    psrlw       xmm2, 2

    packuswb    xmm0, xmm2              ; 16 words -> 16 output bytes

    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

    add         edx, byte 2*SIZEOF_XMMWORD  ; inptr0 += 32
    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr1 += 32
    add         edi, byte 1*SIZEOF_XMMWORD  ; outptr += 16
    sub         ecx, byte SIZEOF_XMMWORD    ; 16 fewer output columns to go
    cmp         ecx, byte SIZEOF_XMMWORD
    jae         near .full_chunk
    test        ecx, ecx
    jnz         near .tail_chunk

    pop         ecx                     ; ecx = output_cols
    pop         edi                     ; edi -> output row pointer
    pop         esi                     ; esi -> input row pointer

    add         edi, byte 1*SIZEOF_JSAMPROW  ; next output row
    add         esi, byte 2*SIZEOF_JSAMPROW  ; skip the two input rows just used
    dec         eax                          ; one output row done
    jg          near .next_row

.done:
    pop         edi
    pop         esi
    pop         ebp
    ret
348
349; For some reason, the OS X linker does not honor the request to align the
350; segment unless we do this.
351    align       32
352