;
; jcsample.asm - downsampling (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Flow (as implemented below):
;   1. output_cols = width_in_blocks * 8; return immediately if it is zero.
;   2. expand_right_edge: for each of max_v_samp_factor input rows, copy the
;      last real sample (column image_width-1) into columns
;      image_width .. 2*output_cols-1.
;   3. For each of v_samp_factor rows, replace every horizontal pair of input
;      samples with (a + b + bias) >> 1, where bias alternates 0, 1, 0, 1, ...
;      across output columns.
;
; Registers after collect_args (only the low 32 bits of the 32-bit arguments
; are read, because the upper halves of those registers are not defined by
; the 64-bit calling conventions):
;

; r10d = JDIMENSION image_width
; r11d = int max_v_samp_factor
; r12d = JDIMENSION v_samp_factor
; r13d = JDIMENSION width_in_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

; Scratch registers written by this routine: rax, rcx, rdx, rsi, rdi,
; xmm0-xmm3, xmm6, xmm7.
; NOTE(review): rsi, rdi, xmm6 and xmm7 are callee-saved in the Microsoft x64
; ABI; presumably collect_args/uncollect_args preserve them on WIN64 --
; confirm in jsimdext.inc.

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
    push        rbp
    mov         rax, rsp                ; NOTE(review): presumably consumed by
                                        ; collect_args to reach stack-passed
                                        ; arguments -- confirm in jsimdext.inc
    mov         rbp, rsp
    collect_args 6

    mov         ecx, r13d               ; zero-extend width_in_blocks
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return            ; nothing to do for zero width

    mov         edx, r10d               ; rdx = image_width (zero-extended)

    ; -- expand_right_edge

    push        rcx                     ; keep output_cols for the main loop
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx                ; rcx = number of padding samples
    jle         short .expand_end       ; row already wide enough

    ; Sign-extend from r11d instead of copying all of r11: only the low
    ; 32 bits of an int argument are defined, and both the signed guard below
    ; and the 64-bit loop counter depend on the full width of rax.
    movsxd      rax, r11d               ; rax = rows to pad
    test        rax, rax
    jle         short .expand_end       ; no rows (or a negative count)

    cld                                 ; rep stosb must walk upwards
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax                     ; al is reused as the fill value
    push        rcx                     ; rep stosb consumes rcx

    mov         rdi, JSAMPROW [rsi]     ; rdi = start of this input row
    add         rdi, rdx                ; rdi = first column past the image
    mov         al, JSAMPLE [rdi-1]     ; al = last real sample of the row

    rep stosb                           ; replicate it rcx times

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW   ; next row pointer
    dec         rax
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v1_downsample

    mov         eax, r12d               ; rowctr
    test        eax, eax
    jle         near .return

    ; Only edx is consumed by movd, so a 32-bit immediate load is sufficient.
    mov         edx, 0x00010000         ; bias pattern
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6              ; all ones
    pshufd      xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx                     ; output_cols is consumed per row
    push        rdi
    push        rsi

    ; movdqa is used for every load and store below, so both row pointers
    ; are assumed to be XMMWORD-aligned -- TODO confirm against the allocator.
    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr

    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop

    ; Tail: fewer than SIZEOF_XMMWORD output columns remain.  Load a single
    ; input vector, zero the second one, and force rcx so that the subtract
    ; after the store leaves zero.  A full XMMWORD is still written to outptr.
    ; NOTE(review): this relies on the output row having room beyond
    ; output_cols -- confirm with the buffer allocation.
.columnloop_r8:
    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    pxor        xmm1, xmm1
    mov         rcx, SIZEOF_XMMWORD
    jmp         short .downsample

.columnloop:
    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm1

    pand        xmm0, xmm6              ; even-indexed samples as words
    psrlw       xmm2, BYTE_BIT          ; odd-indexed samples as words
    pand        xmm1, xmm6
    psrlw       xmm3, BYTE_BIT

    paddw       xmm0, xmm2              ; pair sums
    paddw       xmm1, xmm3
    paddw       xmm0, xmm7              ; + alternating bias
    paddw       xmm1, xmm7
    psrlw       xmm0, 1                 ; / 2
    psrlw       xmm1, 1

    packuswb    xmm0, xmm1              ; words back to bytes

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

    sub         rcx, byte SIZEOF_XMMWORD    ; outcol
    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
    add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop
    test        rcx, rcx
    jnz         short .columnloop_r8

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte SIZEOF_JSAMPROW   ; input_data
    add         rdi, byte SIZEOF_JSAMPROW   ; output_data
    dec         rax                         ; rowctr
    jg          near .rowloop

.return:
    uncollect_args 6
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Flow (as implemented below):
;   1. output_cols = width_in_blocks * 8; return immediately if it is zero.
;   2. expand_right_edge: for each of max_v_samp_factor input rows, copy the
;      last real sample (column image_width-1) into columns
;      image_width .. 2*output_cols-1.
;   3. For each of v_samp_factor output rows, read two consecutive input rows
;      and replace every 2x2 group of samples with (sum + bias) >> 2, where
;      bias alternates 1, 2, 1, 2, ... across output columns.
;
; Registers after collect_args (only the low 32 bits of the 32-bit arguments
; are read, because the upper halves of those registers are not defined by
; the 64-bit calling conventions):
;

; r10d = JDIMENSION image_width
; r11d = int max_v_samp_factor
; r12d = JDIMENSION v_samp_factor
; r13d = JDIMENSION width_in_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

; Scratch registers written by this routine: rax, rcx, rdx, rsi, rdi,
; xmm0-xmm7.
; NOTE(review): rsi, rdi, xmm6 and xmm7 are callee-saved in the Microsoft x64
; ABI; presumably collect_args/uncollect_args preserve them on WIN64 --
; confirm in jsimdext.inc.

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
    push        rbp
    mov         rax, rsp                ; NOTE(review): presumably consumed by
                                        ; collect_args to reach stack-passed
                                        ; arguments -- confirm in jsimdext.inc
    mov         rbp, rsp
    collect_args 6

    mov         ecx, r13d               ; zero-extend width_in_blocks
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return            ; nothing to do for zero width

    mov         edx, r10d               ; rdx = image_width (zero-extended)

    ; -- expand_right_edge

    push        rcx                     ; keep output_cols for the main loop
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx                ; rcx = number of padding samples
    jle         short .expand_end       ; row already wide enough

    ; Sign-extend from r11d instead of copying all of r11: only the low
    ; 32 bits of an int argument are defined, and both the signed guard below
    ; and the 64-bit loop counter depend on the full width of rax.
    movsxd      rax, r11d               ; rax = rows to pad
    test        rax, rax
    jle         short .expand_end       ; no rows (or a negative count)

    cld                                 ; rep stosb must walk upwards
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax                     ; al is reused as the fill value
    push        rcx                     ; rep stosb consumes rcx

    mov         rdi, JSAMPROW [rsi]     ; rdi = start of this input row
    add         rdi, rdx                ; rdi = first column past the image
    mov         al, JSAMPLE [rdi-1]     ; al = last real sample of the row

    rep stosb                           ; replicate it rcx times

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW   ; next row pointer
    dec         rax
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v2_downsample

    mov         eax, r12d               ; rowctr
    test        rax, rax
    jle         near .return

    ; Only edx is consumed by movd, so a 32-bit immediate load is sufficient.
    mov         edx, 0x00020001         ; bias pattern
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6              ; all ones
    pshufd      xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx                     ; output_cols is consumed per row
    push        rdi
    push        rsi

    ; movdqa is used for every load and store below, so all three row
    ; pointers are assumed to be XMMWORD-aligned -- TODO confirm against the
    ; allocator.
    mov         rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
    mov         rdi, JSAMPROW [rdi]                    ; outptr

    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop

    ; Tail: fewer than SIZEOF_XMMWORD output columns remain.  Load a single
    ; vector from each input row, zero the second pair, and force rcx so that
    ; the subtract after the store leaves zero.  A full XMMWORD is still
    ; written to outptr.
    ; NOTE(review): this relies on the output row having room beyond
    ; output_cols -- confirm with the buffer allocation.
.columnloop_r8:
    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    pxor        xmm2, xmm2
    pxor        xmm3, xmm3
    mov         rcx, SIZEOF_XMMWORD
    jmp         short .downsample

.columnloop:
    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqa      xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
    movdqa      xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
    ; First vector of each row: horizontal pair sums as words.
    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm1
    pand        xmm0, xmm6              ; even-indexed samples as words
    psrlw       xmm4, BYTE_BIT          ; odd-indexed samples as words
    pand        xmm1, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm0, xmm4
    paddw       xmm1, xmm5

    ; Second vector of each row: same treatment.
    movdqa      xmm4, xmm2
    movdqa      xmm5, xmm3
    pand        xmm2, xmm6
    psrlw       xmm4, BYTE_BIT
    pand        xmm3, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    paddw       xmm0, xmm1              ; row0 + row1 = 2x2 sums
    paddw       xmm2, xmm3
    paddw       xmm0, xmm7              ; + alternating bias
    paddw       xmm2, xmm7
    psrlw       xmm0, 2                 ; / 4
    psrlw       xmm2, 2

    packuswb    xmm0, xmm2              ; words back to bytes

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

    sub         rcx, byte SIZEOF_XMMWORD    ; outcol
    add         rdx, byte 2*SIZEOF_XMMWORD  ; inptr0
    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr1
    add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop
    test        rcx, rcx
    jnz         near .columnloop_r8

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte 2*SIZEOF_JSAMPROW ; input_data
    add         rdi, byte 1*SIZEOF_JSAMPROW ; output_data
    dec         rax                         ; rowctr
    jg          near .rowloop

.return:
    uncollect_args 6
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32