;
; jcsample.asm - downsampling (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Operation, as implemented below:
;   1. output_cols = width_in_blocks << 3.  If output_cols * 2 exceeds
;      image_width, each of the first max_v_samp_factor input rows is padded
;      on the right, in place, by replicating its last sample until the row
;      holds output_cols * 2 samples.
;   2. For each of v_samp_factor rows, every horizontal pair of input samples
;      is replaced by (a + b + bias) >> 1, where bias alternates 0, 1 from one
;      output column to the next.
;
; All arguments are read from the stack relative to ebp (see the %defines
; below).  Nothing is returned.
;
; Registers: ebp, esi and edi are saved and restored.  eax, ecx, edx, xmm0-3,
; xmm6 and xmm7 are modified and not restored.  The direction flag is cleared
; (cld) if the right-edge expansion runs.
;
; Memory access notes:
;   - Row data is read and written with movdqa, which faults on addresses
;     that are not 16-byte aligned.
;   - The input rows are written to during right-edge expansion.
;   - When 8 output columns remain in a row, a full XMMWORD is still stored
;     to the output row (see .columnloop_r8).
;
; NOTE(review): the counts are tested with signed jle/jg.  If JDIMENSION is
; unsigned (its definition is not in this file), a value with bit 31 set would
; be treated as "nothing to do" -- confirm this cannot occur.
;

; Offsets of the stack arguments from the frame pointer (b), valid after the
; "push ebp / mov ebp, esp" prologue: (b)+0 is the saved ebp, (b)+4 the return
; address, and the first argument is at (b)+8.
%define img_width(b)    (b) + 8         ; JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
    jz          near .return            ; ZF from the shift: no columns, no work

    mov         edx, JDIMENSION [img_width(ebp)]

    ; -- expand_right_edge
    ; Pad each input row from image_width up to output_cols * 2 samples by
    ; repeating the row's last sample.

    push        ecx                     ; keep output_cols for the downsample pass
    shl         ecx, 1                  ; output_cols * 2
    sub         ecx, edx                ; ecx = number of samples to append per row
    jle         short .expand_end       ; rows already wide enough (signed test)

    mov         eax, INT [max_v_samp(ebp)]  ; eax = rows left to expand
    test        eax, eax
    jle         short .expand_end

    cld                                 ; stosb must advance edi upwards
    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    alignx      16, 7
.expandloop:
    ; Invariants: esi -> current entry of the row-pointer array,
    ; ecx = pad count, edx = image_width, eax = rows remaining (> 0).
    push        eax                     ; al is needed as the stosb fill value
    push        ecx                     ; rep stosb counts ecx down to 0

    mov         edi, JSAMPROW [esi]
    add         edi, edx                ; edi -> first sample past the image width
    mov         al, JSAMPLE [edi-1]     ; al = last sample of the row

    rep stosb                           ; append ecx copies of al

    pop         ecx
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW   ; next row pointer
    dec         eax
    jg          short .expandloop

.expand_end:
    pop         ecx                     ; output_cols

    ; -- h2v1_downsample

    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
    test        eax, eax
    jle         near .return

    ; Build the two constants used by the column loop:
    ;   xmm7 = per-word rounding bias, alternating 0, 1 (one word per output
    ;          sample; 0x00010000 is the word pair {0, 1} and pshufd 0x00
    ;          copies that dword into all four dword lanes)
    ;   xmm6 = 0x00FF in every word: all-ones from pcmpeqw, shifted right by
    ;          BYTE_BIT; used to isolate the even-indexed byte of each pair
    mov         edx, 0x00010000         ; bias pattern
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6
    pshufd      xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
    alignx      16, 7
.rowloop:
    ; Invariants: esi/edi -> current entries of the input/output row-pointer
    ; arrays, ecx = output_cols, eax = rows remaining (> 0).
    push        ecx
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]     ; inptr
    mov         edi, JSAMPROW [edi]     ; outptr

    cmp         ecx, byte SIZEOF_XMMWORD
    jae         short .columnloop
    alignx      16, 7

.columnloop_r8:
    ; Remainder path: fewer than SIZEOF_XMMWORD output columns are left.
    ; ecx is a nonzero multiple of 8 here, so (with SIZEOF_XMMWORD being 16,
    ; the size of an XMM register) exactly 8 remain.  Only one XMMWORD of
    ; input is loaded; the second source register is zeroed, so the upper
    ; half of the stored XMMWORD is written as zeros.
    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    pxor        xmm1, xmm1
    mov         ecx, SIZEOF_XMMWORD     ; makes the "sub ecx" below reach 0
    jmp         short .downsample
    alignx      16, 7

.columnloop:
    ; Main path: two XMMWORDs of input produce one XMMWORD of output.
    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]

.downsample:
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm1

    ; Split every byte pair into two word values: the masked copy holds the
    ; even-indexed byte, the shifted copy holds the odd-indexed byte.
    pand        xmm0, xmm6
    psrlw       xmm2, BYTE_BIT
    pand        xmm1, xmm6
    psrlw       xmm3, BYTE_BIT

    ; (even + odd + bias) >> 1.  The sum is at most 255 + 255 + 1, so the
    ; result always fits in a byte.
    paddw       xmm0, xmm2
    paddw       xmm1, xmm3
    paddw       xmm0, xmm7
    paddw       xmm1, xmm7
    psrlw       xmm0, 1
    psrlw       xmm1, 1

    packuswb    xmm0, xmm1              ; 2 x 8 words -> 16 output bytes

    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

    sub         ecx, byte SIZEOF_XMMWORD    ; outcol
    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr
    add         edi, byte 1*SIZEOF_XMMWORD  ; outptr
    cmp         ecx, byte SIZEOF_XMMWORD
    jae         short .columnloop       ; at least a full XMMWORD of output left
    test        ecx, ecx
    jnz         short .columnloop_r8    ; partial XMMWORD left

    pop         esi
    pop         edi
    pop         ecx

    add         esi, byte SIZEOF_JSAMPROW  ; input_data
    add         edi, byte SIZEOF_JSAMPROW  ; output_data
    dec         eax                        ; rowctr
    jg          near .rowloop

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
;   pop         ebx                     ; unused
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Operation, as implemented below:
;   1. output_cols = width_in_blocks << 3.  If output_cols * 2 exceeds
;      image_width, each of the first max_v_samp_factor input rows is padded
;      on the right, in place, by replicating its last sample until the row
;      holds output_cols * 2 samples.
;   2. For each of v_samp_factor output rows, two consecutive input rows are
;      consumed and every 2x2 group of input samples is replaced by
;      (a + b + c + d + bias) >> 2, where bias alternates 1, 2 from one output
;      column to the next.
;
; All arguments are read from the stack relative to ebp (see the %defines
; below).  Nothing is returned.
;
; Registers: ebp, esi and edi are saved and restored.  eax, ecx, edx and
; xmm0-xmm7 are modified and not restored.  The direction flag is cleared
; (cld) if the right-edge expansion runs.
;
; Memory access notes:
;   - Row data is read and written with movdqa, which faults on addresses
;     that are not 16-byte aligned.
;   - The input rows are written to during right-edge expansion.
;   - When 8 output columns remain in a row, a full XMMWORD is still stored
;     to the output row (see .columnloop_r8).
;
; NOTE(review): the counts are tested with signed jle/jg.  If JDIMENSION is
; unsigned (its definition is not in this file), a value with bit 31 set would
; be treated as "nothing to do" -- confirm this cannot occur.
;

; Offsets of the stack arguments from the frame pointer (b), valid after the
; "push ebp / mov ebp, esp" prologue: (b)+0 is the saved ebp, (b)+4 the return
; address, and the first argument is at (b)+8.
%define img_width(b)    (b) + 8         ; JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
    jz          near .return            ; ZF from the shift: no columns, no work

    mov         edx, JDIMENSION [img_width(ebp)]

    ; -- expand_right_edge
    ; Pad each input row from image_width up to output_cols * 2 samples by
    ; repeating the row's last sample.

    push        ecx                     ; keep output_cols for the downsample pass
    shl         ecx, 1                  ; output_cols * 2
    sub         ecx, edx                ; ecx = number of samples to append per row
    jle         short .expand_end       ; rows already wide enough (signed test)

    mov         eax, INT [max_v_samp(ebp)]  ; eax = rows left to expand
    test        eax, eax
    jle         short .expand_end

    cld                                 ; stosb must advance edi upwards
    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    alignx      16, 7
.expandloop:
    ; Invariants: esi -> current entry of the row-pointer array,
    ; ecx = pad count, edx = image_width, eax = rows remaining (> 0).
    push        eax                     ; al is needed as the stosb fill value
    push        ecx                     ; rep stosb counts ecx down to 0

    mov         edi, JSAMPROW [esi]
    add         edi, edx                ; edi -> first sample past the image width
    mov         al, JSAMPLE [edi-1]     ; al = last sample of the row

    rep stosb                           ; append ecx copies of al

    pop         ecx
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW   ; next row pointer
    dec         eax
    jg          short .expandloop

.expand_end:
    pop         ecx                     ; output_cols

    ; -- h2v2_downsample

    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
    test        eax, eax
    jle         near .return

    ; Build the two constants used by the column loop:
    ;   xmm7 = per-word rounding bias, alternating 1, 2 (one word per output
    ;          sample; 0x00020001 is the word pair {1, 2} and pshufd 0x00
    ;          copies that dword into all four dword lanes)
    ;   xmm6 = 0x00FF in every word: all-ones from pcmpeqw, shifted right by
    ;          BYTE_BIT; used to isolate the even-indexed byte of each pair
    mov         edx, 0x00020001         ; bias pattern
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6
    pshufd      xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
    alignx      16, 7
.rowloop:
    ; Invariants: esi/edi -> current entries of the input/output row-pointer
    ; arrays, ecx = output_cols, eax = output rows remaining (> 0).
    push        ecx
    push        edi
    push        esi

    ; edx is free again here: image_width is only needed during expansion.
    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1
    mov         edi, JSAMPROW [edi]                    ; outptr

    cmp         ecx, byte SIZEOF_XMMWORD
    jae         short .columnloop
    alignx      16, 7

.columnloop_r8:
    ; Remainder path: fewer than SIZEOF_XMMWORD output columns are left.
    ; ecx is a nonzero multiple of 8 here, so (with SIZEOF_XMMWORD being 16,
    ; the size of an XMM register) exactly 8 remain.  Only one XMMWORD is
    ; loaded from each input row; the second pair of source registers is
    ; zeroed, so the upper half of the stored XMMWORD is written as zeros.
    movdqa      xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
    pxor        xmm2, xmm2
    pxor        xmm3, xmm3
    mov         ecx, SIZEOF_XMMWORD     ; makes the "sub ecx" below reach 0
    jmp         short .downsample
    alignx      16, 7

.columnloop:
    ; Main path: two XMMWORDs from each of the two input rows produce one
    ; XMMWORD of output.  xmm0/xmm2 come from inptr0, xmm1/xmm3 from inptr1.
    movdqa      xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
    movdqa      xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
    movdqa      xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]

.downsample:
    ; First XMMWORD of each row: per-row horizontal pair sums as words
    ; (masked copy = even-indexed byte, shifted copy = odd-indexed byte).
    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm1
    pand        xmm0, xmm6
    psrlw       xmm4, BYTE_BIT
    pand        xmm1, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm0, xmm4
    paddw       xmm1, xmm5

    ; Second XMMWORD of each row: same computation.
    movdqa      xmm4, xmm2
    movdqa      xmm5, xmm3
    pand        xmm2, xmm6
    psrlw       xmm4, BYTE_BIT
    pand        xmm3, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    ; Add the two rows, then (sum + bias) >> 2.  The sum is at most
    ; 4 * 255 + 2, so the result always fits in a byte.
    paddw       xmm0, xmm1
    paddw       xmm2, xmm3
    paddw       xmm0, xmm7
    paddw       xmm2, xmm7
    psrlw       xmm0, 2
    psrlw       xmm2, 2

    packuswb    xmm0, xmm2              ; 2 x 8 words -> 16 output bytes

    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

    sub         ecx, byte SIZEOF_XMMWORD    ; outcol
    add         edx, byte 2*SIZEOF_XMMWORD  ; inptr0
    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr1
    add         edi, byte 1*SIZEOF_XMMWORD  ; outptr
    cmp         ecx, byte SIZEOF_XMMWORD
    jae         near .columnloop        ; at least a full XMMWORD of output left
    test        ecx, ecx
    jnz         near .columnloop_r8     ; partial XMMWORD left

    pop         esi
    pop         edi
    pop         ecx

    ; Two input rows are consumed for every output row.
    add         esi, byte 2*SIZEOF_JSAMPROW  ; input_data
    add         edi, byte 1*SIZEOF_JSAMPROW  ; output_data
    dec         eax                          ; rowctr
    jg          near .rowloop

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
;   pop         ebx                     ; unused
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32