1;
2; jcsample.asm - downsampling (64-bit SSE2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2009, 2016, D. R. Commander.
6; Copyright (C) 2018, Matthias Räncker.
7;
8; Based on the x86 SIMD extension for IJG JPEG library
9; Copyright (C) 1999-2006, MIYASAKA Masaru.
10; For conditions of distribution and use, see copyright notice in jsimdext.inc
11;
; This file should be assembled with NASM (Netwide Assembler); it can *not*
; be assembled with Microsoft's MASM or any compatible assembler (including
; Borland's Turbo Assembler).
15; NASM is available from http://nasm.sourceforge.net/ or
16; http://sourceforge.net/project/showfiles.php?group_id=6208
17
18%include "jsimdext.inc"
19
20; --------------------------------------------------------------------------
21    SECTION     SEG_TEXT
22    BITS        64
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; The routine works in two phases:
;
;   1. expand_right_edge: output_cols is width_in_blocks * 8.  If
;      2 * output_cols exceeds image_width, the last sample of each of the
;      first max_v_samp_factor input rows is replicated up to column
;      2 * output_cols.
;   2. h2v1_downsample: for each of the v_samp_factor rows, every output
;      sample is (in[2*i] + in[2*i+1] + bias) >> 1, where the bias alternates
;      0, 1, 0, 1, ... across output columns.
;
; Row buffers are accessed with movdqa and therefore have to be 16-byte
; aligned.  When 8 output columns remain, a whole XMMWORD is still loaded from
; the input row and a whole XMMWORD is stored to the output row (its upper
; 8 bytes are zero), so the rows must have room for that.
;
; All 32-bit arguments are read through the 32-bit register names only: the
; calling conventions leave the upper half of a 64-bit argument register
; undefined when it carries a 32-bit value.
;

; r10d = JDIMENSION image_width
; r11d = int max_v_samp_factor
; r12d = JDIMENSION v_samp_factor
; r13d = JDIMENSION width_in_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
    push        rbp
    mov         rax, rsp                ; not read below before being reloaded;
                                        ; presumably consumed by collect_args
                                        ; (jsimdext.inc) - keep it ahead of the
                                        ; macro
    mov         rbp, rsp
    collect_args 6

    mov         ecx, r13d
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return            ; zero blocks: nothing to do

    mov         edx, r10d               ; rdx = image_width (zero-extended)

    ; -- expand_right_edge

    push        rcx                     ; keep output_cols for the second phase
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx                ; rcx = samples to append to each row
    jle         short .expand_end       ; rows are already wide enough

    mov         eax, r11d               ; row counter; only the low 32 bits of
                                        ; r11 carry the int argument
    test        eax, eax
    jle         short .expand_end       ; max_v_samp_factor <= 0

    cld                                 ; stosb must advance rdi
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax                     ; al is reused as the fill value
    push        rcx                     ; rep stosb counts rcx down to zero

    mov         rdip, JSAMPROW [rsi]
    add         rdi, rdx                ; rdi = row + image_width
    mov         al, JSAMPLE [rdi-1]     ; last real sample of the row

    rep stosb                           ; replicate it rcx times

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW
    dec         rax                     ; rax is zero-extended and > 0 here
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v1_downsample

    mov         eax, r12d               ; rowctr
    test        eax, eax
    jle         near .return

    mov         edx, 0x00010000         ; bias pattern (low word 0, high word 1)
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6
    pshufd      xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx                     ; output_cols is consumed per row
    push        rdi                     ; row-pointer cursors are reused as
    push        rsi                     ; sample pointers below

    mov         rsip, JSAMPROW [rsi]    ; inptr
    mov         rdip, JSAMPROW [rdi]    ; outptr

    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop

.columnloop_r8:
    ; Fewer than SIZEOF_XMMWORD output columns remain: take one XMMWORD of
    ; input, use zeros for the second one, and finish after this pass.
    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    pxor        xmm1, xmm1
    mov         rcx, SIZEOF_XMMWORD     ; makes the sub below leave rcx == 0
    jmp         short .downsample

.columnloop:
    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm1

    pand        xmm0, xmm6              ; even-indexed samples as words
    psrlw       xmm2, BYTE_BIT          ; odd-indexed samples as words
    pand        xmm1, xmm6
    psrlw       xmm3, BYTE_BIT

    paddw       xmm0, xmm2              ; pairwise sums
    paddw       xmm1, xmm3
    paddw       xmm0, xmm7              ; + alternating bias
    paddw       xmm1, xmm7
    psrlw       xmm0, 1                 ; / 2
    psrlw       xmm1, 1

    packuswb    xmm0, xmm1              ; 16 words -> 16 output samples

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

    sub         rcx, byte SIZEOF_XMMWORD    ; outcol
    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
    add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop
    test        rcx, rcx
    jnz         short .columnloop_r8

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec         rax                        ; rowctr
    jg          near .rowloop

.return:
    uncollect_args 6
    pop         rbp
    ret
166
167; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; The routine works in two phases:
;
;   1. expand_right_edge: output_cols is width_in_blocks * 8.  If
;      2 * output_cols exceeds image_width, the last sample of each of the
;      first max_v_samp_factor input rows is replicated up to column
;      2 * output_cols.
;   2. h2v2_downsample: each of the v_samp_factor output rows is built from
;      two consecutive input rows; every output sample is the sum of a 2x2
;      group of input samples plus a bias, shifted right by 2.  The bias
;      alternates 1, 2, 1, 2, ... across output columns.
;
; Row buffers are accessed with movdqa and therefore have to be 16-byte
; aligned.  When 8 output columns remain, a whole XMMWORD is still loaded from
; each input row and a whole XMMWORD is stored to the output row (its upper
; 8 bytes are zero), so the rows must have room for that.
;
; All 32-bit arguments are read through the 32-bit register names only: the
; calling conventions leave the upper half of a 64-bit argument register
; undefined when it carries a 32-bit value.
;

; r10d = JDIMENSION image_width
; r11d = int max_v_samp_factor
; r12d = JDIMENSION v_samp_factor
; r13d = JDIMENSION width_in_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
    push        rbp
    mov         rax, rsp                ; not read below before being reloaded;
                                        ; presumably consumed by collect_args
                                        ; (jsimdext.inc) - keep it ahead of the
                                        ; macro
    mov         rbp, rsp
    collect_args 6

    mov         ecx, r13d
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return            ; zero blocks: nothing to do

    mov         edx, r10d               ; rdx = image_width (zero-extended)

    ; -- expand_right_edge

    push        rcx                     ; keep output_cols for the second phase
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx                ; rcx = samples to append to each row
    jle         short .expand_end       ; rows are already wide enough

    mov         eax, r11d               ; row counter; only the low 32 bits of
                                        ; r11 carry the int argument
    test        eax, eax
    jle         short .expand_end       ; max_v_samp_factor <= 0

    cld                                 ; stosb must advance rdi
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax                     ; al is reused as the fill value
    push        rcx                     ; rep stosb counts rcx down to zero

    mov         rdip, JSAMPROW [rsi]
    add         rdi, rdx                ; rdi = row + image_width
    mov         al, JSAMPLE [rdi-1]     ; last real sample of the row

    rep stosb                           ; replicate it rcx times

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW
    dec         rax                     ; rax is zero-extended and > 0 here
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v2_downsample

    mov         eax, r12d               ; rowctr
    test        eax, eax                ; same 32-bit check as the h2v1 routine
    jle         near .return

    mov         edx, 0x00020001         ; bias pattern (low word 1, high word 2)
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6
    pshufd      xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx                     ; output_cols is consumed per row
    push        rdi                     ; row-pointer cursors are reused as
    push        rsi                     ; sample pointers below

    mov         rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
    mov         rdip, JSAMPROW [rdi]                    ; outptr

    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop

.columnloop_r8:
    ; Fewer than SIZEOF_XMMWORD output columns remain: take one XMMWORD from
    ; each input row, use zeros for the second pair, and finish after this
    ; pass.
    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    pxor        xmm2, xmm2
    pxor        xmm3, xmm3
    mov         rcx, SIZEOF_XMMWORD     ; makes the sub below leave rcx == 0
    jmp         short .downsample

.columnloop:
    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqa      xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
    movdqa      xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
    ; First XMMWORD of both rows: horizontal pair sums as words.
    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm1
    pand        xmm0, xmm6              ; even-indexed samples
    psrlw       xmm4, BYTE_BIT          ; odd-indexed samples
    pand        xmm1, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm0, xmm4
    paddw       xmm1, xmm5

    ; Second XMMWORD of both rows: same treatment.
    movdqa      xmm4, xmm2
    movdqa      xmm5, xmm3
    pand        xmm2, xmm6
    psrlw       xmm4, BYTE_BIT
    pand        xmm3, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    paddw       xmm0, xmm1              ; row 0 + row 1 -> 2x2 sums
    paddw       xmm2, xmm3
    paddw       xmm0, xmm7              ; + alternating bias
    paddw       xmm2, xmm7
    psrlw       xmm0, 2                 ; / 4
    psrlw       xmm2, 2

    packuswb    xmm0, xmm2              ; 16 words -> 16 output samples

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

    sub         rcx, byte SIZEOF_XMMWORD    ; outcol
    add         rdx, byte 2*SIZEOF_XMMWORD  ; inptr0
    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr1
    add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop
    test        rcx, rcx
    jnz         near .columnloop_r8

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte 2*SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte 1*SIZEOF_JSAMPROW  ; output_data
    dec         rax                          ; rowctr
    jg          near .rowloop

.return:
    uncollect_args 6
    pop         rbp
    ret
327
328; For some reason, the OS X linker does not honor the request to align the
329; segment unless we do this.
330    align       32
331