1;
2; jcsample.asm - downsampling (64-bit SSE2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2009, 2016, D. R. Commander.
6;
7; Based on the x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler),
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16
17%include "jsimdext.inc"
18
19; --------------------------------------------------------------------------
20    SECTION     SEG_TEXT
21    BITS        64
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; The routine runs in two phases:
;
;   1. expand_right_edge:  with output_cols = width_in_blocks * 8, each of
;      the first max_v_samp_factor input rows is padded from column
;      image_width up to (but not including) column 2 * output_cols by
;      replicating the sample at column image_width - 1.  Nothing is done if
;      image_width already reaches 2 * output_cols.
;
;   2. h2v1_downsample:  for each of v_samp_factor rows,
;          out[i] = (in[2*i] + in[2*i+1] + bias) >> 1
;      where bias is 0 for even i and 1 for odd i.
;
; Rows are accessed with movdqa, so every input and output row pointer must
; be 16-byte aligned.  Output is always stored 16 bytes at a time; when only
; 8 output columns remain, the upper 8 bytes of that final store are zero.
;
; Scratch registers used by the body: rax, rcx, rdx, rsi, rdi, xmm0-xmm3,
; xmm6, xmm7.
;

; r10d = JDIMENSION image_width
; r11d = int max_v_samp_factor      (only the low 32 bits are meaningful)
; r12d = JDIMENSION v_samp_factor
; r13d = JDIMENSION width_in_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
    push        rbp
    mov         rax, rsp                ; rax = frame base; presumably read by
                                        ; collect_args to reach stack-passed
                                        ; arguments (see jsimdext.inc)
    mov         rbp, rsp
    collect_args 6

    mov         ecx, r13d               ; zero-extends width_in_blocks
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return            ; nothing to do for zero blocks

    mov         edx, r10d               ; rdx = image_width (zero-extended)

    ; -- expand_right_edge

    push        rcx                     ; keep output_cols for phase 2
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx                ; rcx = number of padding samples
    jle         short .expand_end       ; row is already wide enough

    ; max_v_samp_factor is a 32-bit int.  The calling conventions do not
    ; define the upper 32 bits of the register that carries it, so only r11d
    ; is read and it is sign-extended before being used as a 64-bit counter.
    movsxd      rax, r11d               ; rax = rows left to pad
    test        rax, rax
    jle         short .expand_end

    cld                                 ; rep stosb must move forward
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax                     ; al and rcx are consumed by rep stosb
    push        rcx

    mov         rdi, JSAMPROW [rsi]
    add         rdi, rdx                ; rdi -> first column past the image
    mov         al, JSAMPLE [rdi-1]     ; al = last real sample of the row

    rep stosb                           ; replicate it rcx times

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW
    dec         rax
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v1_downsample

    mov         eax, r12d               ; rowctr
    test        eax, eax
    jle         near .return

    mov         edx, 0x00010000         ; bias pattern (image_width in rdx is
                                        ; no longer needed)
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6
    pshufd      xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx                     ; per-row copies of output_cols and of
    push        rdi                     ; the two row-pointer cursors
    push        rsi

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr

    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop

.columnloop_r8:
    ; Fewer than 16 output columns remain: load a single 16-byte input
    ; vector, treat the second one as zero, and force rcx so that the
    ; subtraction below ends the column loop.
    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    pxor        xmm1, xmm1
    mov         rcx, SIZEOF_XMMWORD
    jmp         short .downsample

.columnloop:
    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm1

    pand        xmm0, xmm6              ; even-column samples as words
    psrlw       xmm2, BYTE_BIT          ; odd-column samples as words
    pand        xmm1, xmm6
    psrlw       xmm3, BYTE_BIT

    paddw       xmm0, xmm2              ; even + odd
    paddw       xmm1, xmm3
    paddw       xmm0, xmm7              ; + alternating 0/1 bias
    paddw       xmm1, xmm7
    psrlw       xmm0, 1                 ; / 2
    psrlw       xmm1, 1

    packuswb    xmm0, xmm1              ; 16 words -> 16 output bytes

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

    sub         rcx, byte SIZEOF_XMMWORD    ; outcol
    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
    add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop
    test        rcx, rcx
    jnz         short .columnloop_r8        ; partial chunk left over

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec         rax                        ; rowctr
    jg          near .rowloop

.return:
    uncollect_args 6
    pop         rbp
    ret
165
166; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; The routine runs in two phases:
;
;   1. expand_right_edge:  with output_cols = width_in_blocks * 8, each of
;      the first max_v_samp_factor input rows is padded from column
;      image_width up to (but not including) column 2 * output_cols by
;      replicating the sample at column image_width - 1.  Nothing is done if
;      image_width already reaches 2 * output_cols.
;
;   2. h2v2_downsample:  for each of v_samp_factor output rows, two
;      consecutive input rows (in0, in1) are consumed and
;          out[i] = (in0[2*i] + in0[2*i+1] +
;                    in1[2*i] + in1[2*i+1] + bias) >> 2
;      where bias is 1 for even i and 2 for odd i.
;
; Rows are accessed with movdqa, so every input and output row pointer must
; be 16-byte aligned.  Output is always stored 16 bytes at a time; when only
; 8 output columns remain, the upper 8 bytes of that final store are zero.
;
; Scratch registers used by the body: rax, rcx, rdx, rsi, rdi, xmm0-xmm7.
;

; r10d = JDIMENSION image_width
; r11d = int max_v_samp_factor      (only the low 32 bits are meaningful)
; r12d = JDIMENSION v_samp_factor
; r13d = JDIMENSION width_in_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
    push        rbp
    mov         rax, rsp                ; rax = frame base; presumably read by
                                        ; collect_args to reach stack-passed
                                        ; arguments (see jsimdext.inc)
    mov         rbp, rsp
    collect_args 6

    mov         ecx, r13d               ; zero-extends width_in_blocks
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return            ; nothing to do for zero blocks

    mov         edx, r10d               ; rdx = image_width (zero-extended)

    ; -- expand_right_edge

    push        rcx                     ; keep output_cols for phase 2
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx                ; rcx = number of padding samples
    jle         short .expand_end       ; row is already wide enough

    ; max_v_samp_factor is a 32-bit int.  The calling conventions do not
    ; define the upper 32 bits of the register that carries it, so only r11d
    ; is read and it is sign-extended before being used as a 64-bit counter.
    movsxd      rax, r11d               ; rax = rows left to pad
    test        rax, rax
    jle         short .expand_end

    cld                                 ; rep stosb must move forward
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax                     ; al and rcx are consumed by rep stosb
    push        rcx

    mov         rdi, JSAMPROW [rsi]
    add         rdi, rdx                ; rdi -> first column past the image
    mov         al, JSAMPLE [rdi-1]     ; al = last real sample of the row

    rep stosb                           ; replicate it rcx times

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW
    dec         rax
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v2_downsample

    mov         eax, r12d               ; rowctr
    test        eax, eax                ; 32-bit signed check, the same guard
    jle         near .return            ; the h2v1 routine applies

    mov         edx, 0x00020001         ; bias pattern (image_width in rdx is
                                        ; no longer needed)
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6
    pshufd      xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx                     ; per-row copies of output_cols and of
    push        rdi                     ; the two row-pointer cursors
    push        rsi

    mov         rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
    mov         rdi, JSAMPROW [rdi]                    ; outptr

    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop

.columnloop_r8:
    ; Fewer than 16 output columns remain: load a single 16-byte vector
    ; from each input row, treat the second pair as zero, and force rcx so
    ; that the subtraction below ends the column loop.
    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    pxor        xmm2, xmm2
    pxor        xmm3, xmm3
    mov         rcx, SIZEOF_XMMWORD
    jmp         short .downsample

.columnloop:
    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqa      xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
    movdqa      xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
    ; First 16 input columns of both rows: horizontal pair sums.
    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm1
    pand        xmm0, xmm6              ; even-column samples as words
    psrlw       xmm4, BYTE_BIT          ; odd-column samples as words
    pand        xmm1, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm0, xmm4              ; row 0: even + odd
    paddw       xmm1, xmm5              ; row 1: even + odd

    ; Next 16 input columns of both rows: same treatment.
    movdqa      xmm4, xmm2
    movdqa      xmm5, xmm3
    pand        xmm2, xmm6
    psrlw       xmm4, BYTE_BIT
    pand        xmm3, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    paddw       xmm0, xmm1              ; row 0 + row 1
    paddw       xmm2, xmm3
    paddw       xmm0, xmm7              ; + alternating 1/2 bias
    paddw       xmm2, xmm7
    psrlw       xmm0, 2                 ; / 4
    psrlw       xmm2, 2

    packuswb    xmm0, xmm2              ; 16 words -> 16 output bytes

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

    sub         rcx, byte SIZEOF_XMMWORD    ; outcol
    add         rdx, byte 2*SIZEOF_XMMWORD  ; inptr0
    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr1
    add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop
    test        rcx, rcx
    jnz         near .columnloop_r8         ; partial chunk left over

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte 2*SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte 1*SIZEOF_JSAMPROW  ; output_data
    dec         rax                          ; rowctr
    jg          near .rowloop

.return:
    uncollect_args 6
    pop         rbp
    ret
326
327; For some reason, the OS X linker does not honor the request to align the
328; segment unless we do this.
329    align       32
330