;
; jquant.asm - sample data conversion and quantization (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_mmx (JSAMPARRAY sample_data, JDIMENSION start_col,
;                     DCTELEM *workspace);
;
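; As a scalar reference for what the MMX loop below computes, here is a
; C sketch of the same conversion (illustrative only, not part of the
; library; it assumes the usual 8-bit JSAMPLE case, for which
; CENTERJSAMPLE is 128):
;
; void convsamp_ref (JSAMPARRAY sample_data, JDIMENSION start_col,
;                    DCTELEM *workspace)
; {
;   int row, col;
;
;   for (row = 0; row < DCTSIZE; row++) {
;     JSAMPROW elem = sample_data[row] + start_col;
;     for (col = 0; col < DCTSIZE; col++)
;       *workspace++ = (DCTELEM) (*elem++ - 128);   /* 0xFF80 == -128 */
;   }
; }
;
; The MMX version gets the same result by zero-extending each byte to a
; word (punpcklbw/punpckhbw against zero) and adding 0xFF80 with paddw.
;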

%define sample_data     ebp+8           ; JSAMPARRAY sample_data
%define start_col       ebp+12          ; JDIMENSION start_col
%define workspace       ebp+16          ; DCTELEM *workspace

        align   16
        global  EXTN(jsimd_convsamp_mmx)

EXTN(jsimd_convsamp_mmx):
        push    ebp
        mov     ebp,esp
        push    ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        pxor    mm6,mm6                 ; mm6=(all 0's)
        pcmpeqw mm7,mm7
        psllw   mm7,7                   ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}

        mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
        mov     eax, JDIMENSION [start_col]
        mov     edi, POINTER [workspace]        ; (DCTELEM *)
        mov     ecx, DCTSIZE/4
        alignx  16,7
.convloop:
        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)

        movq    mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]    ; mm0=(01234567)
        movq    mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]    ; mm1=(89ABCDEF)

        mov     ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
        mov     edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]   ; (JSAMPLE *)

        movq    mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE]    ; mm2=(GHIJKLMN)
        movq    mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE]    ; mm3=(OPQRSTUV)

        movq      mm4,mm0
        punpcklbw mm0,mm6               ; mm0=(0123)
        punpckhbw mm4,mm6               ; mm4=(4567)
        movq      mm5,mm1
        punpcklbw mm1,mm6               ; mm1=(89AB)
        punpckhbw mm5,mm6               ; mm5=(CDEF)

        paddw   mm0,mm7
        paddw   mm4,mm7
        paddw   mm1,mm7
        paddw   mm5,mm7

        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5

        movq      mm0,mm2
        punpcklbw mm2,mm6               ; mm2=(GHIJ)
        punpckhbw mm0,mm6               ; mm0=(KLMN)
        movq      mm4,mm3
        punpcklbw mm3,mm6               ; mm3=(OPQR)
        punpckhbw mm4,mm6               ; mm4=(STUV)

        paddw   mm2,mm7
        paddw   mm0,mm7
        paddw   mm3,mm7
        paddw   mm4,mm7

        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
        movq    MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
        movq    MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4

        add     esi, byte 4*SIZEOF_JSAMPROW
        add     edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
        dec     ecx
        jnz     short .convloop

        emms            ; empty MMX state

        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        pop     ebx
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; This implementation is based on an algorithm described in
;   "How to optimize for the Pentium family of microprocessors"
;   (http://www.agner.org/assem/).
;
; GLOBAL(void)
; jsimd_quantize_mmx (JCOEFPTR coef_block, DCTELEM *divisors,
;                     DCTELEM *workspace);
;

%define RECIPROCAL(m,n,b) MMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
%define CORRECTION(m,n,b) MMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
%define SCALE(m,n,b)      MMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
%define SHIFT(m,n,b)      MMBLOCK(DCTSIZE*3+(m),(n),(b),SIZEOF_DCTELEM)

%define coef_block      ebp+8           ; JCOEFPTR coef_block
%define divisors        ebp+12          ; DCTELEM *divisors
%define workspace       ebp+16          ; DCTELEM *workspace

        align   16
        global  EXTN(jsimd_quantize_mmx)

EXTN(jsimd_quantize_mmx):
        push    ebp
        mov     ebp,esp
;       push    ebx             ; unused
;       push    ecx             ; unused
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        mov     esi, POINTER [workspace]
        mov     edx, POINTER [divisors]
        mov     edi, JCOEFPTR [coef_block]
        mov     ah, 2
        alignx  16,7
.quantloop1:
        mov     al, DCTSIZE2/8/2
        alignx  16,7
.quantloop2:
        movq    mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
        movq    mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]

        movq    mm0,mm2
        movq    mm1,mm3

        psraw   mm2,(WORD_BIT-1)        ; -1 if value < 0, 0 otherwise
        psraw   mm3,(WORD_BIT-1)

        pxor    mm0,mm2                 ; val = abs(val): for negative values,
        pxor    mm1,mm3                 ; flip all bits and subtract the -1 mask
        psubw   mm0,mm2                 ; (two's complement negation)
        psubw   mm1,mm3

        ;
        ; MMX is an annoyingly crappy instruction set. It has two
        ; misfeatures that cause problems here:
        ;
        ; - All multiplications are signed.
        ;
        ; - The second operand for the shifts is not treated as packed.
        ;
        ;
        ; We work around the first problem by implementing this algorithm:
        ;
        ; unsigned long unsigned_multiply(unsigned short x, unsigned short y)
        ; {
        ;   enum { SHORT_BIT = 16 };
        ;   signed short sx = (signed short) x;
        ;   signed short sy = (signed short) y;
        ;   signed long sz;
        ;
        ;   sz = (long) sx * (long) sy;     /* signed multiply */
        ;
        ;   if (sx < 0) sz += (long) sy << SHORT_BIT;
        ;   if (sy < 0) sz += (long) sx << SHORT_BIT;
        ;
        ;   return (unsigned long) sz;
        ; }
        ;
        ; (note that a negative sx adds _sy_ and vice versa)
        ;
        ; For the second problem, we replace the shift with a multiplication.
        ; Unfortunately, that means we have to deal with the signed issue again.
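        ;
        ; Since only the high word of each product is kept (which is what
        ; pmulhw delivers), the corrections from unsigned_multiply() above
        ; land directly in that high word. As a sketch (illustrative only;
        ; the helper name is not part of the library):
        ;
        ; unsigned short unsigned_pmulhw(unsigned short x, unsigned short y)
        ; {
        ;   signed short sx = (signed short) x;
        ;   signed short sy = (signed short) y;
        ;   signed short hi = (signed short) (((long) sx * (long) sy) >> 16);
        ;
        ;   /* the "<< SHORT_BIT" corrections only affect the high word */
        ;   if (sx < 0) hi += sy;
        ;   if (sy < 0) hi += sx;
        ;
        ;   return (unsigned short) hi;
        ; }
        ;
        ; For the scale multiply below, the psraw/pand/paddw pairs implement
        ; these two conditional additions without branches; for the
        ; reciprocal multiply, the first addition is unconditional because
        ; the reciprocal always has its MSB set.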
        ;

        paddw   mm0, MMWORD [CORRECTION(0,0,edx)]       ; correction + roundfactor
        paddw   mm1, MMWORD [CORRECTION(0,1,edx)]

        movq    mm4,mm0                 ; store current value for later
        movq    mm5,mm1
        pmulhw  mm0, MMWORD [RECIPROCAL(0,0,edx)]       ; reciprocal
        pmulhw  mm1, MMWORD [RECIPROCAL(0,1,edx)]
        paddw   mm0,mm4                 ; reciprocal is always negative (MSB=1),
        paddw   mm1,mm5                 ; so we always need to add the initial value
                                        ; (the input value is never negative, as we
                                        ; inverted it at the start of this routine)

        ; here it gets a bit tricky, as both the scale
        ; and mm0/mm1 can be negative
        movq    mm6, MMWORD [SCALE(0,0,edx)]    ; scale
        movq    mm7, MMWORD [SCALE(0,1,edx)]
        movq    mm4,mm0
        movq    mm5,mm1
        pmulhw  mm0,mm6
        pmulhw  mm1,mm7

        psraw   mm6,(WORD_BIT-1)        ; determine if the scale is negative
        psraw   mm7,(WORD_BIT-1)

        pand    mm6,mm4                 ; and add the input if it is
        pand    mm7,mm5
        paddw   mm0,mm6
        paddw   mm1,mm7

        psraw   mm4,(WORD_BIT-1)        ; then check if the input is negative
        psraw   mm5,(WORD_BIT-1)

        pand    mm4, MMWORD [SCALE(0,0,edx)]    ; and add the scale if it is
        pand    mm5, MMWORD [SCALE(0,1,edx)]
        paddw   mm0,mm4
        paddw   mm1,mm5

        pxor    mm0,mm2                 ; restore the original sign by negating
        pxor    mm1,mm3                 ; the values that were negative on input
        psubw   mm0,mm2
        psubw   mm1,mm3

        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1

        add     esi, byte 8*SIZEOF_DCTELEM
        add     edx, byte 8*SIZEOF_DCTELEM
        add     edi, byte 8*SIZEOF_JCOEF
        dec     al
        jnz     near .quantloop2
        dec     ah
        jnz     near .quantloop1        ; to avoid branch misprediction

        emms            ; empty MMX state

        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; unused
;       pop     ebx             ; unused
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16