1; vim:filetype=nasm ts=8 2 3; libFLAC - Free Lossless Audio Codec library 4; Copyright (C) 2001-2009 Josh Coalson 5; Copyright (C) 2011-2013 Xiph.Org Foundation 6; 7; Redistribution and use in source and binary forms, with or without 8; modification, are permitted provided that the following conditions 9; are met: 10; 11; - Redistributions of source code must retain the above copyright 12; notice, this list of conditions and the following disclaimer. 13; 14; - Redistributions in binary form must reproduce the above copyright 15; notice, this list of conditions and the following disclaimer in the 16; documentation and/or other materials provided with the distribution. 17; 18; - Neither the name of the Xiph.org Foundation nor the names of its 19; contributors may be used to endorse or promote products derived from 20; this software without specific prior written permission. 21; 22; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 25; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR 26; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 27; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 28; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 29; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 30; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 31; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 32; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
33 34%include "nasm.h" 35 36 data_section 37 38cglobal FLAC__lpc_compute_autocorrelation_asm_ia32 39cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4 40cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8 41cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12 42cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16 43cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow 44cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32 45cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx 46cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32 47cglobal FLAC__lpc_restore_signal_asm_ia32 48cglobal FLAC__lpc_restore_signal_asm_ia32_mmx 49cglobal FLAC__lpc_restore_signal_wide_asm_ia32 50 51 code_section 52 53; ********************************************************************** 54; 55; void FLAC__lpc_compute_autocorrelation_asm(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) 56; { 57; FLAC__real d; 58; unsigned sample, coeff; 59; const unsigned limit = data_len - lag; 60; 61; FLAC__ASSERT(lag > 0); 62; FLAC__ASSERT(lag <= data_len); 63; 64; for(coeff = 0; coeff < lag; coeff++) 65; autoc[coeff] = 0.0; 66; for(sample = 0; sample <= limit; sample++) { 67; d = data[sample]; 68; for(coeff = 0; coeff < lag; coeff++) 69; autoc[coeff] += d * data[sample+coeff]; 70; } 71; for(; sample < data_len; sample++) { 72; d = data[sample]; 73; for(coeff = 0; coeff < data_len - sample; coeff++) 74; autoc[coeff] += d * data[sample+coeff]; 75; } 76; } 77; 78 ALIGN 16 79cident FLAC__lpc_compute_autocorrelation_asm_ia32 80 ;[esp + 28] == autoc[] 81 ;[esp + 24] == lag 82 ;[esp + 20] == data_len 83 ;[esp + 16] == data[] 84 85 ;ASSERT(lag > 0) 86 ;ASSERT(lag <= 33) 87 ;ASSERT(lag <= data_len) 88 89.begin: 90 push esi 91 push edi 92 push ebx 93 94 ; for(coeff = 0; coeff < lag; coeff++) 95 ; autoc[coeff] = 0.0; 96 mov edi, [esp + 28] ; edi == autoc 97 mov ecx, [esp + 24] ; ecx = # of dwords 
(=lag) of 0 to write 98 xor eax, eax 99 rep stosd 100 101 ; const unsigned limit = data_len - lag; 102 mov eax, [esp + 24] ; eax == lag 103 mov ecx, [esp + 20] 104 sub ecx, eax ; ecx == limit 105 106 mov edi, [esp + 28] ; edi == autoc 107 mov esi, [esp + 16] ; esi == data 108 inc ecx ; we are looping <= limit so we add one to the counter 109 110 ; for(sample = 0; sample <= limit; sample++) { 111 ; d = data[sample]; 112 ; for(coeff = 0; coeff < lag; coeff++) 113 ; autoc[coeff] += d * data[sample+coeff]; 114 ; } 115 fld dword [esi] ; ST = d <- data[sample] 116 ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax) 117 lea edx, [eax + eax*2] 118 neg edx 119 lea edx, [eax + edx*4 + .jumper1_0 - .get_eip1] 120 call .mov_eip_to_ebx 121.get_eip1: 122 add edx, ebx 123 inc edx ; compensate for the shorter opcode on the last iteration 124 inc edx ; compensate for the shorter opcode on the last iteration 125 inc edx ; compensate for the shorter opcode on the last iteration 126 cmp eax, 33 127 jne .loop1_start 128 sub edx, byte 9 ; compensate for the longer opcodes on the first iteration 129.loop1_start: 130 jmp edx 131 132.mov_eip_to_ebx: 133 mov ebx, [esp] 134 ret 135 136 fld st0 ; ST = d d 137 fmul dword [esi + (32*4)] ; ST = d*data[sample+32] d WATCHOUT: not a byte displacement here! 138 fadd dword [edi + (32*4)] ; ST = autoc[32]+d*data[sample+32] d WATCHOUT: not a byte displacement here! 139 fstp dword [edi + (32*4)] ; autoc[32]+=d*data[sample+32] ST = d WATCHOUT: not a byte displacement here! 
140 fld st0 ; ST = d d 141 fmul dword [esi + (31*4)] ; ST = d*data[sample+31] d 142 fadd dword [edi + (31*4)] ; ST = autoc[31]+d*data[sample+31] d 143 fstp dword [edi + (31*4)] ; autoc[31]+=d*data[sample+31] ST = d 144 fld st0 ; ST = d d 145 fmul dword [esi + (30*4)] ; ST = d*data[sample+30] d 146 fadd dword [edi + (30*4)] ; ST = autoc[30]+d*data[sample+30] d 147 fstp dword [edi + (30*4)] ; autoc[30]+=d*data[sample+30] ST = d 148 fld st0 ; ST = d d 149 fmul dword [esi + (29*4)] ; ST = d*data[sample+29] d 150 fadd dword [edi + (29*4)] ; ST = autoc[29]+d*data[sample+29] d 151 fstp dword [edi + (29*4)] ; autoc[29]+=d*data[sample+29] ST = d 152 fld st0 ; ST = d d 153 fmul dword [esi + (28*4)] ; ST = d*data[sample+28] d 154 fadd dword [edi + (28*4)] ; ST = autoc[28]+d*data[sample+28] d 155 fstp dword [edi + (28*4)] ; autoc[28]+=d*data[sample+28] ST = d 156 fld st0 ; ST = d d 157 fmul dword [esi + (27*4)] ; ST = d*data[sample+27] d 158 fadd dword [edi + (27*4)] ; ST = autoc[27]+d*data[sample+27] d 159 fstp dword [edi + (27*4)] ; autoc[27]+=d*data[sample+27] ST = d 160 fld st0 ; ST = d d 161 fmul dword [esi + (26*4)] ; ST = d*data[sample+26] d 162 fadd dword [edi + (26*4)] ; ST = autoc[26]+d*data[sample+26] d 163 fstp dword [edi + (26*4)] ; autoc[26]+=d*data[sample+26] ST = d 164 fld st0 ; ST = d d 165 fmul dword [esi + (25*4)] ; ST = d*data[sample+25] d 166 fadd dword [edi + (25*4)] ; ST = autoc[25]+d*data[sample+25] d 167 fstp dword [edi + (25*4)] ; autoc[25]+=d*data[sample+25] ST = d 168 fld st0 ; ST = d d 169 fmul dword [esi + (24*4)] ; ST = d*data[sample+24] d 170 fadd dword [edi + (24*4)] ; ST = autoc[24]+d*data[sample+24] d 171 fstp dword [edi + (24*4)] ; autoc[24]+=d*data[sample+24] ST = d 172 fld st0 ; ST = d d 173 fmul dword [esi + (23*4)] ; ST = d*data[sample+23] d 174 fadd dword [edi + (23*4)] ; ST = autoc[23]+d*data[sample+23] d 175 fstp dword [edi + (23*4)] ; autoc[23]+=d*data[sample+23] ST = d 176 fld st0 ; ST = d d 177 fmul dword [esi + (22*4)] ; ST = 
d*data[sample+22] d 178 fadd dword [edi + (22*4)] ; ST = autoc[22]+d*data[sample+22] d 179 fstp dword [edi + (22*4)] ; autoc[22]+=d*data[sample+22] ST = d 180 fld st0 ; ST = d d 181 fmul dword [esi + (21*4)] ; ST = d*data[sample+21] d 182 fadd dword [edi + (21*4)] ; ST = autoc[21]+d*data[sample+21] d 183 fstp dword [edi + (21*4)] ; autoc[21]+=d*data[sample+21] ST = d 184 fld st0 ; ST = d d 185 fmul dword [esi + (20*4)] ; ST = d*data[sample+20] d 186 fadd dword [edi + (20*4)] ; ST = autoc[20]+d*data[sample+20] d 187 fstp dword [edi + (20*4)] ; autoc[20]+=d*data[sample+20] ST = d 188 fld st0 ; ST = d d 189 fmul dword [esi + (19*4)] ; ST = d*data[sample+19] d 190 fadd dword [edi + (19*4)] ; ST = autoc[19]+d*data[sample+19] d 191 fstp dword [edi + (19*4)] ; autoc[19]+=d*data[sample+19] ST = d 192 fld st0 ; ST = d d 193 fmul dword [esi + (18*4)] ; ST = d*data[sample+18] d 194 fadd dword [edi + (18*4)] ; ST = autoc[18]+d*data[sample+18] d 195 fstp dword [edi + (18*4)] ; autoc[18]+=d*data[sample+18] ST = d 196 fld st0 ; ST = d d 197 fmul dword [esi + (17*4)] ; ST = d*data[sample+17] d 198 fadd dword [edi + (17*4)] ; ST = autoc[17]+d*data[sample+17] d 199 fstp dword [edi + (17*4)] ; autoc[17]+=d*data[sample+17] ST = d 200 fld st0 ; ST = d d 201 fmul dword [esi + (16*4)] ; ST = d*data[sample+16] d 202 fadd dword [edi + (16*4)] ; ST = autoc[16]+d*data[sample+16] d 203 fstp dword [edi + (16*4)] ; autoc[16]+=d*data[sample+16] ST = d 204 fld st0 ; ST = d d 205 fmul dword [esi + (15*4)] ; ST = d*data[sample+15] d 206 fadd dword [edi + (15*4)] ; ST = autoc[15]+d*data[sample+15] d 207 fstp dword [edi + (15*4)] ; autoc[15]+=d*data[sample+15] ST = d 208 fld st0 ; ST = d d 209 fmul dword [esi + (14*4)] ; ST = d*data[sample+14] d 210 fadd dword [edi + (14*4)] ; ST = autoc[14]+d*data[sample+14] d 211 fstp dword [edi + (14*4)] ; autoc[14]+=d*data[sample+14] ST = d 212 fld st0 ; ST = d d 213 fmul dword [esi + (13*4)] ; ST = d*data[sample+13] d 214 fadd dword [edi + (13*4)] ; ST = 
autoc[13]+d*data[sample+13] d 215 fstp dword [edi + (13*4)] ; autoc[13]+=d*data[sample+13] ST = d 216 fld st0 ; ST = d d 217 fmul dword [esi + (12*4)] ; ST = d*data[sample+12] d 218 fadd dword [edi + (12*4)] ; ST = autoc[12]+d*data[sample+12] d 219 fstp dword [edi + (12*4)] ; autoc[12]+=d*data[sample+12] ST = d 220 fld st0 ; ST = d d 221 fmul dword [esi + (11*4)] ; ST = d*data[sample+11] d 222 fadd dword [edi + (11*4)] ; ST = autoc[11]+d*data[sample+11] d 223 fstp dword [edi + (11*4)] ; autoc[11]+=d*data[sample+11] ST = d 224 fld st0 ; ST = d d 225 fmul dword [esi + (10*4)] ; ST = d*data[sample+10] d 226 fadd dword [edi + (10*4)] ; ST = autoc[10]+d*data[sample+10] d 227 fstp dword [edi + (10*4)] ; autoc[10]+=d*data[sample+10] ST = d 228 fld st0 ; ST = d d 229 fmul dword [esi + ( 9*4)] ; ST = d*data[sample+9] d 230 fadd dword [edi + ( 9*4)] ; ST = autoc[9]+d*data[sample+9] d 231 fstp dword [edi + ( 9*4)] ; autoc[9]+=d*data[sample+9] ST = d 232 fld st0 ; ST = d d 233 fmul dword [esi + ( 8*4)] ; ST = d*data[sample+8] d 234 fadd dword [edi + ( 8*4)] ; ST = autoc[8]+d*data[sample+8] d 235 fstp dword [edi + ( 8*4)] ; autoc[8]+=d*data[sample+8] ST = d 236 fld st0 ; ST = d d 237 fmul dword [esi + ( 7*4)] ; ST = d*data[sample+7] d 238 fadd dword [edi + ( 7*4)] ; ST = autoc[7]+d*data[sample+7] d 239 fstp dword [edi + ( 7*4)] ; autoc[7]+=d*data[sample+7] ST = d 240 fld st0 ; ST = d d 241 fmul dword [esi + ( 6*4)] ; ST = d*data[sample+6] d 242 fadd dword [edi + ( 6*4)] ; ST = autoc[6]+d*data[sample+6] d 243 fstp dword [edi + ( 6*4)] ; autoc[6]+=d*data[sample+6] ST = d 244 fld st0 ; ST = d d 245 fmul dword [esi + ( 5*4)] ; ST = d*data[sample+4] d 246 fadd dword [edi + ( 5*4)] ; ST = autoc[4]+d*data[sample+4] d 247 fstp dword [edi + ( 5*4)] ; autoc[4]+=d*data[sample+4] ST = d 248 fld st0 ; ST = d d 249 fmul dword [esi + ( 4*4)] ; ST = d*data[sample+4] d 250 fadd dword [edi + ( 4*4)] ; ST = autoc[4]+d*data[sample+4] d 251 fstp dword [edi + ( 4*4)] ; autoc[4]+=d*data[sample+4] ST 
= d 252 fld st0 ; ST = d d 253 fmul dword [esi + ( 3*4)] ; ST = d*data[sample+3] d 254 fadd dword [edi + ( 3*4)] ; ST = autoc[3]+d*data[sample+3] d 255 fstp dword [edi + ( 3*4)] ; autoc[3]+=d*data[sample+3] ST = d 256 fld st0 ; ST = d d 257 fmul dword [esi + ( 2*4)] ; ST = d*data[sample+2] d 258 fadd dword [edi + ( 2*4)] ; ST = autoc[2]+d*data[sample+2] d 259 fstp dword [edi + ( 2*4)] ; autoc[2]+=d*data[sample+2] ST = d 260 fld st0 ; ST = d d 261 fmul dword [esi + ( 1*4)] ; ST = d*data[sample+1] d 262 fadd dword [edi + ( 1*4)] ; ST = autoc[1]+d*data[sample+1] d 263 fstp dword [edi + ( 1*4)] ; autoc[1]+=d*data[sample+1] ST = d 264 fld st0 ; ST = d d 265 fmul dword [esi] ; ST = d*data[sample] d WATCHOUT: no displacement byte here! 266 fadd dword [edi] ; ST = autoc[0]+d*data[sample] d WATCHOUT: no displacement byte here! 267 fstp dword [edi] ; autoc[0]+=d*data[sample] ST = d WATCHOUT: no displacement byte here! 268.jumper1_0: 269 270 fstp st0 ; pop d, ST = empty 271 add esi, byte 4 ; sample++ 272 dec ecx 273 jz .loop1_end 274 fld dword [esi] ; ST = d <- data[sample] 275 jmp edx 276.loop1_end: 277 278 ; for(; sample < data_len; sample++) { 279 ; d = data[sample]; 280 ; for(coeff = 0; coeff < data_len - sample; coeff++) 281 ; autoc[coeff] += d * data[sample+coeff]; 282 ; } 283 mov ecx, [esp + 24] ; ecx <- lag 284 dec ecx ; ecx <- lag - 1 285 jz near .end ; skip loop if 0 (i.e. 
lag == 1) 286 287 fld dword [esi] ; ST = d <- data[sample] 288 mov eax, ecx ; eax <- lag - 1 == data_len - sample the first time through 289 ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax) 290 lea edx, [eax + eax*2] 291 neg edx 292 lea edx, [eax + edx*4 + .jumper2_0 - .get_eip2] 293 call .mov_eip_to_ebx 294.get_eip2: 295 add edx, ebx 296 inc edx ; compensate for the shorter opcode on the last iteration 297 inc edx ; compensate for the shorter opcode on the last iteration 298 inc edx ; compensate for the shorter opcode on the last iteration 299 jmp edx 300 301 fld st0 ; ST = d d 302 fmul dword [esi + (31*4)] ; ST = d*data[sample+31] d 303 fadd dword [edi + (31*4)] ; ST = autoc[31]+d*data[sample+31] d 304 fstp dword [edi + (31*4)] ; autoc[31]+=d*data[sample+31] ST = d 305 fld st0 ; ST = d d 306 fmul dword [esi + (30*4)] ; ST = d*data[sample+30] d 307 fadd dword [edi + (30*4)] ; ST = autoc[30]+d*data[sample+30] d 308 fstp dword [edi + (30*4)] ; autoc[30]+=d*data[sample+30] ST = d 309 fld st0 ; ST = d d 310 fmul dword [esi + (29*4)] ; ST = d*data[sample+29] d 311 fadd dword [edi + (29*4)] ; ST = autoc[29]+d*data[sample+29] d 312 fstp dword [edi + (29*4)] ; autoc[29]+=d*data[sample+29] ST = d 313 fld st0 ; ST = d d 314 fmul dword [esi + (28*4)] ; ST = d*data[sample+28] d 315 fadd dword [edi + (28*4)] ; ST = autoc[28]+d*data[sample+28] d 316 fstp dword [edi + (28*4)] ; autoc[28]+=d*data[sample+28] ST = d 317 fld st0 ; ST = d d 318 fmul dword [esi + (27*4)] ; ST = d*data[sample+27] d 319 fadd dword [edi + (27*4)] ; ST = autoc[27]+d*data[sample+27] d 320 fstp dword [edi + (27*4)] ; autoc[27]+=d*data[sample+27] ST = d 321 fld st0 ; ST = d d 322 fmul dword [esi + (26*4)] ; ST = d*data[sample+26] d 323 fadd dword [edi + (26*4)] ; ST = autoc[26]+d*data[sample+26] d 324 fstp dword [edi + (26*4)] ; autoc[26]+=d*data[sample+26] ST = d 325 fld st0 ; ST = d d 326 fmul dword [esi + (25*4)] ; ST = d*data[sample+25] d 327 fadd dword [edi + (25*4)] ; ST = 
autoc[25]+d*data[sample+25] d 328 fstp dword [edi + (25*4)] ; autoc[25]+=d*data[sample+25] ST = d 329 fld st0 ; ST = d d 330 fmul dword [esi + (24*4)] ; ST = d*data[sample+24] d 331 fadd dword [edi + (24*4)] ; ST = autoc[24]+d*data[sample+24] d 332 fstp dword [edi + (24*4)] ; autoc[24]+=d*data[sample+24] ST = d 333 fld st0 ; ST = d d 334 fmul dword [esi + (23*4)] ; ST = d*data[sample+23] d 335 fadd dword [edi + (23*4)] ; ST = autoc[23]+d*data[sample+23] d 336 fstp dword [edi + (23*4)] ; autoc[23]+=d*data[sample+23] ST = d 337 fld st0 ; ST = d d 338 fmul dword [esi + (22*4)] ; ST = d*data[sample+22] d 339 fadd dword [edi + (22*4)] ; ST = autoc[22]+d*data[sample+22] d 340 fstp dword [edi + (22*4)] ; autoc[22]+=d*data[sample+22] ST = d 341 fld st0 ; ST = d d 342 fmul dword [esi + (21*4)] ; ST = d*data[sample+21] d 343 fadd dword [edi + (21*4)] ; ST = autoc[21]+d*data[sample+21] d 344 fstp dword [edi + (21*4)] ; autoc[21]+=d*data[sample+21] ST = d 345 fld st0 ; ST = d d 346 fmul dword [esi + (20*4)] ; ST = d*data[sample+20] d 347 fadd dword [edi + (20*4)] ; ST = autoc[20]+d*data[sample+20] d 348 fstp dword [edi + (20*4)] ; autoc[20]+=d*data[sample+20] ST = d 349 fld st0 ; ST = d d 350 fmul dword [esi + (19*4)] ; ST = d*data[sample+19] d 351 fadd dword [edi + (19*4)] ; ST = autoc[19]+d*data[sample+19] d 352 fstp dword [edi + (19*4)] ; autoc[19]+=d*data[sample+19] ST = d 353 fld st0 ; ST = d d 354 fmul dword [esi + (18*4)] ; ST = d*data[sample+18] d 355 fadd dword [edi + (18*4)] ; ST = autoc[18]+d*data[sample+18] d 356 fstp dword [edi + (18*4)] ; autoc[18]+=d*data[sample+18] ST = d 357 fld st0 ; ST = d d 358 fmul dword [esi + (17*4)] ; ST = d*data[sample+17] d 359 fadd dword [edi + (17*4)] ; ST = autoc[17]+d*data[sample+17] d 360 fstp dword [edi + (17*4)] ; autoc[17]+=d*data[sample+17] ST = d 361 fld st0 ; ST = d d 362 fmul dword [esi + (16*4)] ; ST = d*data[sample+16] d 363 fadd dword [edi + (16*4)] ; ST = autoc[16]+d*data[sample+16] d 364 fstp dword [edi + (16*4)] ; 
autoc[16]+=d*data[sample+16] ST = d 365 fld st0 ; ST = d d 366 fmul dword [esi + (15*4)] ; ST = d*data[sample+15] d 367 fadd dword [edi + (15*4)] ; ST = autoc[15]+d*data[sample+15] d 368 fstp dword [edi + (15*4)] ; autoc[15]+=d*data[sample+15] ST = d 369 fld st0 ; ST = d d 370 fmul dword [esi + (14*4)] ; ST = d*data[sample+14] d 371 fadd dword [edi + (14*4)] ; ST = autoc[14]+d*data[sample+14] d 372 fstp dword [edi + (14*4)] ; autoc[14]+=d*data[sample+14] ST = d 373 fld st0 ; ST = d d 374 fmul dword [esi + (13*4)] ; ST = d*data[sample+13] d 375 fadd dword [edi + (13*4)] ; ST = autoc[13]+d*data[sample+13] d 376 fstp dword [edi + (13*4)] ; autoc[13]+=d*data[sample+13] ST = d 377 fld st0 ; ST = d d 378 fmul dword [esi + (12*4)] ; ST = d*data[sample+12] d 379 fadd dword [edi + (12*4)] ; ST = autoc[12]+d*data[sample+12] d 380 fstp dword [edi + (12*4)] ; autoc[12]+=d*data[sample+12] ST = d 381 fld st0 ; ST = d d 382 fmul dword [esi + (11*4)] ; ST = d*data[sample+11] d 383 fadd dword [edi + (11*4)] ; ST = autoc[11]+d*data[sample+11] d 384 fstp dword [edi + (11*4)] ; autoc[11]+=d*data[sample+11] ST = d 385 fld st0 ; ST = d d 386 fmul dword [esi + (10*4)] ; ST = d*data[sample+10] d 387 fadd dword [edi + (10*4)] ; ST = autoc[10]+d*data[sample+10] d 388 fstp dword [edi + (10*4)] ; autoc[10]+=d*data[sample+10] ST = d 389 fld st0 ; ST = d d 390 fmul dword [esi + ( 9*4)] ; ST = d*data[sample+9] d 391 fadd dword [edi + ( 9*4)] ; ST = autoc[9]+d*data[sample+9] d 392 fstp dword [edi + ( 9*4)] ; autoc[9]+=d*data[sample+9] ST = d 393 fld st0 ; ST = d d 394 fmul dword [esi + ( 8*4)] ; ST = d*data[sample+8] d 395 fadd dword [edi + ( 8*4)] ; ST = autoc[8]+d*data[sample+8] d 396 fstp dword [edi + ( 8*4)] ; autoc[8]+=d*data[sample+8] ST = d 397 fld st0 ; ST = d d 398 fmul dword [esi + ( 7*4)] ; ST = d*data[sample+7] d 399 fadd dword [edi + ( 7*4)] ; ST = autoc[7]+d*data[sample+7] d 400 fstp dword [edi + ( 7*4)] ; autoc[7]+=d*data[sample+7] ST = d 401 fld st0 ; ST = d d 402 fmul dword [esi 
+ ( 6*4)] ; ST = d*data[sample+6] d 403 fadd dword [edi + ( 6*4)] ; ST = autoc[6]+d*data[sample+6] d 404 fstp dword [edi + ( 6*4)] ; autoc[6]+=d*data[sample+6] ST = d 405 fld st0 ; ST = d d 406 fmul dword [esi + ( 5*4)] ; ST = d*data[sample+4] d 407 fadd dword [edi + ( 5*4)] ; ST = autoc[4]+d*data[sample+4] d 408 fstp dword [edi + ( 5*4)] ; autoc[4]+=d*data[sample+4] ST = d 409 fld st0 ; ST = d d 410 fmul dword [esi + ( 4*4)] ; ST = d*data[sample+4] d 411 fadd dword [edi + ( 4*4)] ; ST = autoc[4]+d*data[sample+4] d 412 fstp dword [edi + ( 4*4)] ; autoc[4]+=d*data[sample+4] ST = d 413 fld st0 ; ST = d d 414 fmul dword [esi + ( 3*4)] ; ST = d*data[sample+3] d 415 fadd dword [edi + ( 3*4)] ; ST = autoc[3]+d*data[sample+3] d 416 fstp dword [edi + ( 3*4)] ; autoc[3]+=d*data[sample+3] ST = d 417 fld st0 ; ST = d d 418 fmul dword [esi + ( 2*4)] ; ST = d*data[sample+2] d 419 fadd dword [edi + ( 2*4)] ; ST = autoc[2]+d*data[sample+2] d 420 fstp dword [edi + ( 2*4)] ; autoc[2]+=d*data[sample+2] ST = d 421 fld st0 ; ST = d d 422 fmul dword [esi + ( 1*4)] ; ST = d*data[sample+1] d 423 fadd dword [edi + ( 1*4)] ; ST = autoc[1]+d*data[sample+1] d 424 fstp dword [edi + ( 1*4)] ; autoc[1]+=d*data[sample+1] ST = d 425 fld st0 ; ST = d d 426 fmul dword [esi] ; ST = d*data[sample] d WATCHOUT: no displacement byte here! 427 fadd dword [edi] ; ST = autoc[0]+d*data[sample] d WATCHOUT: no displacement byte here! 428 fstp dword [edi] ; autoc[0]+=d*data[sample] ST = d WATCHOUT: no displacement byte here! 
429.jumper2_0: 430 431 fstp st0 ; pop d, ST = empty 432 add esi, byte 4 ; sample++ 433 dec ecx 434 jz .loop2_end 435 add edx, byte 11 ; adjust our inner loop counter by adjusting the jump target 436 fld dword [esi] ; ST = d <- data[sample] 437 jmp edx 438.loop2_end: 439 440.end: 441 pop ebx 442 pop edi 443 pop esi 444 ret 445 446 ALIGN 16 447cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4 448 ;[esp + 16] == autoc[] 449 ;[esp + 12] == lag 450 ;[esp + 8] == data_len 451 ;[esp + 4] == data[] 452 453 ;ASSERT(lag > 0) 454 ;ASSERT(lag <= 4) 455 ;ASSERT(lag <= data_len) 456 457 ; for(coeff = 0; coeff < lag; coeff++) 458 ; autoc[coeff] = 0.0; 459 xorps xmm5, xmm5 460 461 mov edx, [esp + 8] ; edx == data_len 462 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0] 463 464 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0] 465 add eax, 4 466 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0] 467 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0] 468.warmup: ; xmm2 == data[sample-3],data[sample-2],data[sample-1],data[sample] 469 mulps xmm0, xmm2 ; xmm0 = xmm0 * xmm2 470 addps xmm5, xmm0 ; xmm5 += xmm0 * xmm2 471 dec edx 472 jz .loop_end 473 ALIGN 16 474.loop_start: 475 ; start by reading the next sample 476 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample] 477 add eax, 4 478 shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample] 479 shufps xmm2, xmm2, 93h ; 93h=2-1-0-3 => xmm2 gets rotated left by one float 480 movss xmm2, xmm0 481 mulps xmm0, xmm2 ; xmm0 = xmm0 * xmm2 482 addps xmm5, xmm0 ; xmm5 += xmm0 * xmm2 483 dec edx 484 jnz .loop_start 485.loop_end: 486 ; store autoc 487 mov edx, [esp + 16] ; edx == autoc 488 movups [edx], xmm5 489 490.end: 491 ret 492 493 ALIGN 16 494cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8 495 ;[esp + 16] == autoc[] 496 ;[esp + 12] == lag 497 ;[esp + 8] == data_len 498 ;[esp + 4] == data[] 499 500 ;ASSERT(lag > 0) 501 ;ASSERT(lag <= 8) 502 ;ASSERT(lag 
<= data_len) 503 504 ; for(coeff = 0; coeff < lag; coeff++) 505 ; autoc[coeff] = 0.0; 506 xorps xmm5, xmm5 507 xorps xmm6, xmm6 508 509 mov edx, [esp + 8] ; edx == data_len 510 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0] 511 512 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0] 513 add eax, 4 514 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0] 515 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0] 516 movaps xmm1, xmm0 ; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0] 517 xorps xmm3, xmm3 ; xmm3 = 0,0,0,0 518.warmup: ; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample] 519 mulps xmm0, xmm2 520 mulps xmm1, xmm3 ; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2 521 addps xmm5, xmm0 522 addps xmm6, xmm1 ; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2 523 dec edx 524 jz .loop_end 525 ALIGN 16 526.loop_start: 527 ; start by reading the next sample 528 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample] 529 ; here we reorder the instructions; see the (#) indexes for a logical order 530 shufps xmm2, xmm2, 93h ; (3) 93h=2-1-0-3 => xmm2 gets rotated left by one float 531 add eax, 4 ; (0) 532 shufps xmm3, xmm3, 93h ; (4) 93h=2-1-0-3 => xmm3 gets rotated left by one float 533 shufps xmm0, xmm0, 0 ; (1) xmm0 = data[sample],data[sample],data[sample],data[sample] 534 movss xmm3, xmm2 ; (5) 535 movaps xmm1, xmm0 ; (2) xmm1 = data[sample],data[sample],data[sample],data[sample] 536 movss xmm2, xmm0 ; (6) 537 mulps xmm1, xmm3 ; (8) 538 mulps xmm0, xmm2 ; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2 539 addps xmm6, xmm1 ; (10) 540 addps xmm5, xmm0 ; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2 541 dec edx 542 jnz .loop_start 543.loop_end: 544 ; store autoc 545 mov edx, [esp + 16] ; edx == autoc 546 movups [edx], xmm5 547 movups [edx + 16], xmm6 548 549.end: 550 ret 551 552 ALIGN 16 553cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12 554 ;[esp + 16] == autoc[] 555 ;[esp + 12] == lag 556 ;[esp + 8] == 
data_len 557 ;[esp + 4] == data[] 558 559 ;ASSERT(lag > 0) 560 ;ASSERT(lag <= 12) 561 ;ASSERT(lag <= data_len) 562 563 ; for(coeff = 0; coeff < lag; coeff++) 564 ; autoc[coeff] = 0.0; 565 xorps xmm5, xmm5 566 xorps xmm6, xmm6 567 xorps xmm7, xmm7 568 569 mov edx, [esp + 8] ; edx == data_len 570 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0] 571 572 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0] 573 add eax, 4 574 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0] 575 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0] 576 xorps xmm3, xmm3 ; xmm3 = 0,0,0,0 577 xorps xmm4, xmm4 ; xmm4 = 0,0,0,0 578.warmup: ; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample] 579 movaps xmm1, xmm0 580 mulps xmm1, xmm2 581 addps xmm5, xmm1 582 movaps xmm1, xmm0 583 mulps xmm1, xmm3 584 addps xmm6, xmm1 585 mulps xmm0, xmm4 586 addps xmm7, xmm0 ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2 587 dec edx 588 jz .loop_end 589 ALIGN 16 590.loop_start: 591 ; start by reading the next sample 592 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample] 593 add eax, 4 594 shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample] 595 596 ; shift xmm4:xmm3:xmm2 left by one float 597 shufps xmm2, xmm2, 93h ; 93h=2-1-0-3 => xmm2 gets rotated left by one float 598 shufps xmm3, xmm3, 93h ; 93h=2-1-0-3 => xmm3 gets rotated left by one float 599 shufps xmm4, xmm4, 93h ; 93h=2-1-0-3 => xmm4 gets rotated left by one float 600 movss xmm4, xmm3 601 movss xmm3, xmm2 602 movss xmm2, xmm0 603 604 ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2 605 movaps xmm1, xmm0 606 mulps xmm1, xmm2 607 addps xmm5, xmm1 608 movaps xmm1, xmm0 609 mulps xmm1, xmm3 610 addps xmm6, xmm1 611 mulps xmm0, xmm4 612 addps xmm7, xmm0 613 614 dec edx 615 jnz .loop_start 616.loop_end: 617 ; store autoc 618 mov edx, [esp + 16] ; edx == autoc 619 movups [edx], xmm5 620 movups [edx + 16], xmm6 621 movups [edx + 32], xmm7 622 623.end: 624 ret 625 626 
ALIGN 16 627cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16 628 ;[ebp + 20] == autoc[] 629 ;[ebp + 16] == lag 630 ;[ebp + 12] == data_len 631 ;[ebp + 8] == data[] 632 ;[esp] == __m128 633 ;[esp + 16] == __m128 634 635 push ebp 636 mov ebp, esp 637 and esp, -16 ; stack realign for SSE instructions 'movaps' and 'addps' 638 sub esp, 32 639 640 ;ASSERT(lag > 0) 641 ;ASSERT(lag <= 12) 642 ;ASSERT(lag <= data_len) 643 ;ASSERT(data_len > 0) 644 645 ; for(coeff = 0; coeff < lag; coeff++) 646 ; autoc[coeff] = 0.0; 647 xorps xmm5, xmm5 648 xorps xmm6, xmm6 649 movaps [esp], xmm5 650 movaps [esp + 16], xmm6 651 652 mov edx, [ebp + 12] ; edx == data_len 653 mov eax, [ebp + 8] ; eax == &data[sample] <- &data[0] 654 655 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0] 656 add eax, 4 657 movaps xmm1, xmm0 ; xmm1 = 0,0,0,data[0] 658 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0] 659 xorps xmm2, xmm2 ; xmm2 = 0,0,0,0 660 xorps xmm3, xmm3 ; xmm3 = 0,0,0,0 661 xorps xmm4, xmm4 ; xmm4 = 0,0,0,0 662 movaps xmm7, xmm0 663 mulps xmm7, xmm1 664 addps xmm5, xmm7 665 dec edx 666 jz .loop_end 667 ALIGN 16 668.loop_start: 669 ; start by reading the next sample 670 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample] 671 add eax, 4 672 shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample] 673 674 ; shift xmm4:xmm3:xmm2:xmm1 left by one float 675 shufps xmm1, xmm1, 93h 676 shufps xmm2, xmm2, 93h 677 shufps xmm3, xmm3, 93h 678 shufps xmm4, xmm4, 93h 679 movss xmm4, xmm3 680 movss xmm3, xmm2 681 movss xmm2, xmm1 682 movss xmm1, xmm0 683 684 ; xmmB:xmmA:xmm6:xmm5 += xmm0:xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2:xmm1 685 movaps xmm7, xmm0 686 mulps xmm7, xmm1 687 addps xmm5, xmm7 688 movaps xmm7, xmm0 689 mulps xmm7, xmm2 690 addps xmm6, xmm7 691 movaps xmm7, xmm0 692 mulps xmm7, xmm3 693 mulps xmm0, xmm4 694 addps xmm7, [esp] 695 addps xmm0, [esp + 16] 696 movaps [esp], xmm7 697 movaps [esp + 16], xmm0 698 699 
dec edx 700 jnz .loop_start 701.loop_end: 702 ; store autoc 703 mov edx, [ebp + 20] ; edx == autoc 704 movups [edx], xmm5 705 movups [edx + 16], xmm6 706 movaps xmm5, [esp] 707 movaps xmm6, [esp + 16] 708 movups [edx + 32], xmm5 709 movups [edx + 48], xmm6 710.end: 711 mov esp, ebp 712 pop ebp 713 ret 714 715 ALIGN 16 716cident FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow 717 ;[ebp + 32] autoc 718 ;[ebp + 28] lag 719 ;[ebp + 24] data_len 720 ;[ebp + 20] data 721 722 push ebp 723 push ebx 724 push esi 725 push edi 726 mov ebp, esp 727 728 mov esi, [ebp + 20] 729 mov edi, [ebp + 24] 730 mov edx, [ebp + 28] 731 inc edx 732 and edx, byte -2 733 mov eax, edx 734 neg eax 735 and esp, byte -8 736 lea esp, [esp + 4 * eax] 737 mov ecx, edx 738 xor eax, eax 739.loop0: 740 dec ecx 741 mov [esp + 4 * ecx], eax 742 jnz short .loop0 743 744 mov eax, edi 745 sub eax, edx 746 mov ebx, edx 747 and ebx, byte 1 748 sub eax, ebx 749 lea ecx, [esi + 4 * eax - 12] 750 cmp esi, ecx 751 mov eax, esi 752 ja short .loop2_pre 753 ALIGN 16 ;4 nops 754.loop1_i: 755 movd mm0, [eax] 756 movd mm2, [eax + 4] 757 movd mm4, [eax + 8] 758 movd mm6, [eax + 12] 759 mov ebx, edx 760 punpckldq mm0, mm0 761 punpckldq mm2, mm2 762 punpckldq mm4, mm4 763 punpckldq mm6, mm6 764 ALIGN 16 ;3 nops 765.loop1_j: 766 sub ebx, byte 2 767 movd mm1, [eax + 4 * ebx] 768 movd mm3, [eax + 4 * ebx + 4] 769 movd mm5, [eax + 4 * ebx + 8] 770 movd mm7, [eax + 4 * ebx + 12] 771 punpckldq mm1, mm3 772 punpckldq mm3, mm5 773 pfmul mm1, mm0 774 punpckldq mm5, mm7 775 pfmul mm3, mm2 776 punpckldq mm7, [eax + 4 * ebx + 16] 777 pfmul mm5, mm4 778 pfmul mm7, mm6 779 pfadd mm1, mm3 780 movq mm3, [esp + 4 * ebx] 781 pfadd mm5, mm7 782 pfadd mm1, mm5 783 pfadd mm3, mm1 784 movq [esp + 4 * ebx], mm3 785 jg short .loop1_j 786 787 add eax, byte 16 788 cmp eax, ecx 789 jb short .loop1_i 790 791.loop2_pre: 792 mov ebx, eax 793 sub eax, esi 794 shr eax, 2 795 lea ecx, [esi + 4 * edi] 796 mov esi, ebx 797.loop2_i: 798 movd mm0, [esi] 799 
	; --- tail of FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow ---
	; (the head of this routine is above this excerpt; code below is
	; unchanged, comments added only.  ebp holds the pre-alignment esp,
	; restored by `mov esp, ebp` below; [ebp+28]/[ebp+32] are presumably
	; the lag and autoc[] arguments -- head not visible here, TODO confirm)
	mov	ebx, edi
	sub	ebx, eax
	cmp	ebx, edx
	jbe	short .loop2_j		; inner count = min(edi - eax, edx)
	mov	ebx, edx
.loop2_j:
	dec	ebx
	movd	mm1, [esi + 4 * ebx]	; mm1 = data[sample + j]
	pfmul	mm1, mm0		; 3DNow! single-precision multiply by d
	movd	mm2, [esp + 4 * ebx]	; running autoc[j] accumulator on the stack
	pfadd	mm1, mm2
	movd	[esp + 4 * ebx], mm1	; autoc[j] += d * data[sample + j]

	jnz	short .loop2_j		; MMX/3DNow! ops leave EFLAGS alone, so
					; ZF here is still from `dec ebx` above

	add	esi, byte 4		; advance to next sample
	inc	eax
	cmp	esi, ecx
	jnz	short .loop2_i

	mov	edi, [ebp + 32]
	mov	edx, [ebp + 28]
.loop3:
	dec	edx
	mov	eax, [esp + 4 * edx]	; copy stack accumulators out to autoc[]
	mov	[edi + 4 * edx], eax
	jnz	short .loop3

	femms				; fast exit from MMX/3DNow! state

	mov	esp, ebp		; drop the 8-byte-aligned scratch frame
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
;
; for(i = 0; i < data_len; i++) {
;	sum = 0;
;	for(j = 0; j < order; j++)
;		sum += qlp_coeff[j] * data[i-j-1];
;	residual[i] = data[i] - (sum >> lp_quantization);
; }
;
	ALIGN	16
cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
	;[esp + 40]	residual[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	data[]
	; (cdecl arg offsets are +16 because of the four register pushes below)

	;ASSERT(order > 0)

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	esi, [esp + 20]		; esi = data[]
	mov	edi, [esp + 40]		; edi = residual[]
	mov	eax, [esp + 32]		; eax = order
	mov	ebx, [esp + 24]		; ebx = data_len

	test	ebx, ebx
	jz	near .end		; do nothing if data_len == 0
.begin:
	cmp	eax, byte 1
	jg	short .i_1more

	; ---- special case: order == 1 ----
	mov	ecx, [esp + 28]
	mov	edx, [ecx]		; edx = qlp_coeff[0]
	mov	eax, [esi - 4]		; eax = data[-1]
	mov	ecx, [esp + 36]		; cl = lp_quantization
	ALIGN	16
.i_1_loop_i:
	imul	eax, edx		; eax = qlp_coeff[0] * data[i-1]
	sar	eax, cl			; eax = sum >> lp_quantization
	neg	eax
	add	eax, [esi]		; eax = data[i] - (sum >> lp_quantization)
	mov	[edi], eax		; residual[i] = eax
	mov	eax, [esi]		; eax = data[i], becomes data[i-1] next pass
	add	edi, byte 4
	add	esi, byte 4
	dec	ebx
	jnz	.i_1_loop_i

	jmp	.end

.i_1more:
	cmp	eax, byte 32		; for order <= 32 there is a faster routine
	jbe	short .i_32

	; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
	ALIGN 16
.i_32more_loop_i:
	xor	ebp, ebp		; ebp = sum = 0
	mov	ecx, [esp + 32]
	mov	edx, ecx
	shl	edx, 2
	add	edx, [esp + 28]		; edx = &qlp_coeff[order], walked downward
	neg	ecx			; ecx = -order, counts up toward 0
	ALIGN	16
.i_32more_loop_j:
	sub	edx, byte 4
	mov	eax, [edx]		; eax = qlp_coeff[j]
	imul	eax, [esi + 4 * ecx]	; eax = qlp_coeff[j] * data[i-j-1]
	add	ebp, eax		; sum += qlp_coeff[j] * data[i-j-1]
	inc	ecx
	jnz	short .i_32more_loop_j

	mov	ecx, [esp + 36]
	sar	ebp, cl			; ebp = sum >> lp_quantization
	neg	ebp
	add	ebp, [esi]		; ebp = data[i] - (sum >> lp_quantization)
	mov	[edi], ebp		; residual[i] = ebp
	add	esi, byte 4
	add	edi, byte 4

	dec	ebx
	jnz	.i_32more_loop_i

	jmp	.end

.mov_eip_to_eax:			; helper: returns caller's EIP in eax
	mov	eax, [esp]
	ret

.i_32:
	; Computed-goto path for 2 <= order <= 32: jump into the middle of the
	; fully unrolled 32-tap chain below so that exactly `order` taps run per
	; sample.  edx = .jumper_0 - 9*order: each unrolled group must assemble
	; to exactly 9 bytes (3-byte mov + 4-byte imul + 2-byte add); do NOT
	; reorder or resize the groups.
	sub	edi, esi		; edi = residual[] - data[]; store uses [edi + esi]
	neg	eax
	lea	edx, [eax + eax * 8 + .jumper_0 - .get_eip0]	; -9*order + label delta
	call	.mov_eip_to_eax
.get_eip0:
	add	edx, eax		; edx = absolute address of first needed group
	inc	edx			; compensate for the shorter opcode on the last iteration
	mov	eax, [esp + 28]		; eax = qlp_coeff[]
	xor	ebp, ebp		; ebp = sum = 0
	jmp	edx

	; 32 identical 9-byte groups, highest tap first:
	;   sum += qlp_coeff[j] * data[i-j-1]
	mov	ecx, [eax + 124]	; ecx = qlp_coeff[31]
	imul	ecx, [esi - 128]	; ecx = qlp_coeff[31] * data[i-32]
	add	ebp, ecx		; sum += qlp_coeff[31] * data[i-32]
	mov	ecx, [eax + 120]
	imul	ecx, [esi - 124]
	add	ebp, ecx
	mov	ecx, [eax + 116]
	imul	ecx, [esi - 120]
	add	ebp, ecx
	mov	ecx, [eax + 112]
	imul	ecx, [esi - 116]
	add	ebp, ecx
	mov	ecx, [eax + 108]
	imul	ecx, [esi - 112]
	add	ebp, ecx
	mov	ecx, [eax + 104]
	imul	ecx, [esi - 108]
	add	ebp, ecx
	mov	ecx, [eax + 100]
	imul	ecx, [esi - 104]
	add	ebp, ecx
	mov	ecx, [eax + 96]
	imul	ecx, [esi - 100]
	add	ebp, ecx
	mov	ecx, [eax + 92]
	imul	ecx, [esi - 96]
	add	ebp, ecx
	mov	ecx, [eax + 88]
	imul	ecx, [esi - 92]
	add	ebp, ecx
	mov	ecx, [eax + 84]
	imul	ecx, [esi - 88]
	add	ebp, ecx
	mov	ecx, [eax + 80]
	imul	ecx, [esi - 84]
	add	ebp, ecx
	mov	ecx, [eax + 76]
	imul	ecx, [esi - 80]
	add	ebp, ecx
	mov	ecx, [eax + 72]
	imul	ecx, [esi - 76]
	add	ebp, ecx
	mov	ecx, [eax + 68]
	imul	ecx, [esi - 72]
	add	ebp, ecx
	mov	ecx, [eax + 64]
	imul	ecx, [esi - 68]
	add	ebp, ecx
	mov	ecx, [eax + 60]
	imul	ecx, [esi - 64]
	add	ebp, ecx
	mov	ecx, [eax + 56]
	imul	ecx, [esi - 60]
	add	ebp, ecx
	mov	ecx, [eax + 52]
	imul	ecx, [esi - 56]
	add	ebp, ecx
	mov	ecx, [eax + 48]
	imul	ecx, [esi - 52]
	add	ebp, ecx
	mov	ecx, [eax + 44]
	imul	ecx, [esi - 48]
	add	ebp, ecx
	mov	ecx, [eax + 40]
	imul	ecx, [esi - 44]
	add	ebp, ecx
	mov	ecx, [eax + 36]
	imul	ecx, [esi - 40]
	add	ebp, ecx
	mov	ecx, [eax + 32]
	imul	ecx, [esi - 36]
	add	ebp, ecx
	mov	ecx, [eax + 28]
	imul	ecx, [esi - 32]
	add	ebp, ecx
	mov	ecx, [eax + 24]
	imul	ecx, [esi - 28]
	add	ebp, ecx
	mov	ecx, [eax + 20]
	imul	ecx, [esi - 24]
	add	ebp, ecx
	mov	ecx, [eax + 16]
	imul	ecx, [esi - 20]
	add	ebp, ecx
	mov	ecx, [eax + 12]
	imul	ecx, [esi - 16]
	add	ebp, ecx
	mov	ecx, [eax + 8]
	imul	ecx, [esi - 12]
	add	ebp, ecx
	mov	ecx, [eax + 4]
	imul	ecx, [esi - 8]
	add	ebp, ecx
	mov	ecx, [eax]		; qlp_coeff[0] (NOTE: no disp8 => this group is
					; one byte short; the `inc edx` above compensates)
	imul	ecx, [esi - 4]
	add	ebp, ecx
.jumper_0:

	mov	ecx, [esp + 36]
	sar	ebp, cl			; ebp = sum >> lp_quantization
	neg	ebp
	add	ebp, [esi]		; ebp = data[i] - (sum >> lp_quantization)
	mov	[edi + esi], ebp	; residual[i] (edi is the residual-data bias)
	add	esi, byte 4

	dec	ebx
	jz	short .end
	xor	ebp, ebp
	jmp	edx			; next sample

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
; the channel and qlp_coeffs must be <= 16. Especially note that this routine
; cannot be used for side-channel coded 16bps channels since the effective bps
; is 17.
	; MMX version: coefficients and samples are packed to 16-bit words so
	; pmaddwd can do two MACs per instruction (hence the 16-bit-data
	; restriction in the WATCHOUT above).
	ALIGN	16
cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
	;[esp + 40]	residual[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	data[]

	;ASSERT(order > 0)

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	esi, [esp + 20]		; esi = data[]
	mov	edi, [esp + 40]		; edi = residual[]
	mov	eax, [esp + 32]		; eax = order
	mov	ebx, [esp + 24]		; ebx = data_len

	test	ebx, ebx
	jz	near .end		; do nothing if data_len == 0
	dec	ebx
	test	ebx, ebx
	jz	near .last_one		; a single sample: use the plain ia32 path

	mov	edx, [esp + 28]		; edx = qlp_coeff[]
	movd	mm6, [esp + 36]		; mm6 = 0:lp_quantization
	mov	ebp, esp		; save esp; arg offsets below use ebp-free [esp+..]
					; only before this point

	and	esp, 0xfffffff8		; 8-align stack for the qword accesses below

	; Copy qlp_coeff[] to the stack as packed 16-bit words (low halves only --
	; valid per the WATCHOUT restriction), then zero-pad to a multiple of 4
	; words, bumping eax (order) to the padded count.
	xor	ecx, ecx
.copy_qlp_loop:
	push	word [edx + 4 * ecx]
	inc	ecx
	cmp	ecx, eax
	jnz	short .copy_qlp_loop

	and	ecx, 0x3
	test	ecx, ecx
	je	short .za_end
	sub	ecx, byte 4
.za_loop:
	push	word 0
	inc	eax
	inc	ecx
	jnz	short .za_loop
.za_end:

	movq	mm5, [esp + 2 * eax - 8]	; mm5 = first 4 coeffs, packed words
						; (word order reversed vs. memory:
						; highest-index coeff in the low word)
	movd	mm4, [esi - 16]		; mm4 = history data[i-4..i-1] as 4 words
	punpckldq	mm4, [esi - 12]
	movd	mm0, [esi - 8]
	punpckldq	mm0, [esi - 4]
	packssdw	mm4, mm0

	cmp	eax, byte 4
	jnbe	short .mmx_4more

	; ---- padded order == 4: two samples per iteration ----
	ALIGN	16
.mmx_4_loop_i:
	movd	mm1, [esi]		; mm1 = data[i]:data[i+1] (dwords)
	movq	mm3, mm4		; mm3 = history for sample i
	punpckldq	mm1, [esi + 4]
	psrlq	mm4, 16			; shift history, splice in data[i]
	movq	mm0, mm1
	psllq	mm0, 48
	por	mm4, mm0
	movq	mm2, mm4		; mm2 = history for sample i+1
	psrlq	mm4, 16			; shift again, splice in data[i+1]
	pxor	mm0, mm0
	punpckhdq	mm0, mm1
	pmaddwd	mm3, mm5		; two partial sums for sample i
	pmaddwd	mm2, mm5		; two partial sums for sample i+1
	psllq	mm0, 16
	por	mm4, mm0
	movq	mm0, mm3
	punpckldq	mm3, mm2	; gather the partial sums and add pairs
	punpckhdq	mm0, mm2
	paddd	mm3, mm0		; mm3 = sum(i) : sum(i+1)
	psrad	mm3, mm6		; >> lp_quantization (arithmetic)
	psubd	mm1, mm3		; data - (sum >> lp_q)
	movd	[edi], mm1		; residual[i]
	punpckhdq	mm1, mm1
	movd	[edi + 4], mm1		; residual[i+1]

	add	edi, byte 8
	add	esi, byte 8

	sub	ebx, 2
	jg	.mmx_4_loop_i
	jmp	.mmx_end

.mmx_4more:
	; ---- padded order > 4: inner loop over 4-coeff chunks ----
	shl	eax, 2
	neg	eax
	add	eax, byte 16		; eax = 16 - 4*paddedorder (negative byte offset)

	ALIGN	16
.mmx_4more_loop_i:
	movd	mm1, [esi]		; mm1 = data[i]:data[i+1]
	punpckldq	mm1, [esi + 4]
	movq	mm3, mm4		; mm3/mm2 = newest-4-sample partial sums
	psrlq	mm4, 16			; roll the 4-word history as in the 4-loop
	movq	mm0, mm1
	psllq	mm0, 48
	por	mm4, mm0
	movq	mm2, mm4
	psrlq	mm4, 16
	pxor	mm0, mm0
	punpckhdq	mm0, mm1
	pmaddwd	mm3, mm5
	pmaddwd	mm2, mm5
	psllq	mm0, 16
	por	mm4, mm0

	mov	ecx, esi
	add	ecx, eax		; ecx = &data[i - paddedorder + 4]
	mov	edx, esp		; edx walks the packed coeff words

	ALIGN	16
.mmx_4more_loop_j:
	; accumulate 4 older taps for sample i (mm3) and sample i+1 (mm2)
	movd	mm0, [ecx - 16]
	movd	mm7, [ecx - 8]
	punpckldq	mm0, [ecx - 12]
	punpckldq	mm7, [ecx - 4]
	packssdw	mm0, mm7
	pmaddwd	mm0, [edx]
	punpckhdq	mm7, mm7
	paddd	mm3, mm0
	movd	mm0, [ecx - 12]
	punpckldq	mm0, [ecx - 8]
	punpckldq	mm7, [ecx]
	packssdw	mm0, mm7
	pmaddwd	mm0, [edx]
	paddd	mm2, mm0

	add	edx, byte 8
	add	ecx, byte 16
	cmp	ecx, esi
	jnz	.mmx_4more_loop_j

	movq	mm0, mm3
	punpckldq	mm3, mm2
	punpckhdq	mm0, mm2
	paddd	mm3, mm0		; mm3 = sum(i) : sum(i+1)
	psrad	mm3, mm6
	psubd	mm1, mm3
	movd	[edi], mm1		; residual[i]
	punpckhdq	mm1, mm1
	movd	[edi + 4], mm1		; residual[i+1]

	add	edi, byte 8
	add	esi, byte 8

	sub	ebx, 2
	jg	near .mmx_4more_loop_i

.mmx_end:
	emms
	mov	esp, ebp		; drop the packed-coeff scratch area
.last_one:
	; odd trailing sample (or the order<2 fallback): ebx was pre-decremented,
	; so inc makes it the remaining count; reuse the plain ia32 routine,
	; whose stack layout matches ours exactly.
	mov	eax, [esp + 32]
	inc	ebx
	jnz	near FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32.begin

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

; **********************************************************************
;
; void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
; {
; 	unsigned i, j;
; 	FLAC__int32 sum;
;
; 	FLAC__ASSERT(order > 0);
;
; 	for(i = 0; i < data_len; i++) {
; 		sum = 0;
; 		for(j = 0; j < order; j++)
; 			sum += qlp_coeff[j] * data[i-j-1];
; 		data[i] = residual[i] + (sum >> lp_quantization);
; 	}
; }
	ALIGN	16
cident FLAC__lpc_restore_signal_asm_ia32
	;[esp + 40]	data[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	residual[]

	;ASSERT(order > 0)

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	esi, [esp + 20]		; esi = residual[]
	mov	edi, [esp + 40]		; edi = data[]
	mov	eax, [esp + 32]		; eax = order
	mov	ebx, [esp + 24]		; ebx = data_len

	test	ebx, ebx
	jz	near .end		; do nothing if data_len == 0

.begin:
	cmp	eax, byte 1
	jg	short .x87_1more

	; ---- special case: order == 1 ----
	mov	ecx, [esp + 28]
	mov	edx, [ecx]		; edx = qlp_coeff[0]
	mov	eax, [edi - 4]		; eax = data[-1]
	mov	ecx, [esp + 36]		; cl = lp_quantization
	ALIGN	16
.x87_1_loop_i:
	imul	eax, edx		; eax = qlp_coeff[0] * data[i-1]
	sar	eax, cl			; eax = sum >> lp_quantization
	add	eax, [esi]		; eax = residual[i] + (sum >> lp_quantization)
	mov	[edi], eax		; data[i] = eax (feeds next iteration)
	add	esi, byte 4
	add	edi, byte 4
	dec	ebx
	jnz	.x87_1_loop_i

	jmp	.end

.x87_1more:
	cmp	eax, byte 32		; for order <= 32 there is a faster routine
	jbe	short .x87_32

	; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
	ALIGN	16
.x87_32more_loop_i:
	xor	ebp, ebp		; ebp = sum = 0
	mov	ecx, [esp + 32]
	mov	edx, ecx
	shl	edx, 2
	add	edx, [esp + 28]		; edx = &qlp_coeff[order], walked downward
	neg	ecx			; ecx = -order
	ALIGN	16
.x87_32more_loop_j:
	sub	edx, byte 4
	mov	eax, [edx]		; eax = qlp_coeff[j]
	imul	eax, [edi + 4 * ecx]	; eax = qlp_coeff[j] * data[i-j-1]
	add	ebp, eax		; sum += qlp_coeff[j] * data[i-j-1]
	inc	ecx
	jnz	short .x87_32more_loop_j

	mov	ecx, [esp + 36]
	sar	ebp, cl			; ebp = sum >> lp_quantization
	add	ebp, [esi]		; ebp = residual[i] + (sum >> lp_quantization)
	mov	[edi], ebp		; data[i] = ebp
	add	edi, byte 4
	add	esi, byte 4

	dec	ebx
	jnz	.x87_32more_loop_i

	jmp	.end

.mov_eip_to_eax:			; helper: returns caller's EIP in eax
	mov	eax, [esp]
	ret

.x87_32:
	; Computed-goto path, same 9-bytes-per-group scheme as the residual
	; routine above: edx = .jumper_0 - 9*order.  Groups must stay exactly
	; 9 bytes each.
	sub	esi, edi		; esi = residual[] - data[]; load uses [esi + edi]
	neg	eax
	lea	edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
	call	.mov_eip_to_eax
.get_eip0:
	add	edx, eax
	inc	edx			; compensate for the shorter opcode on the last iteration
	mov	eax, [esp + 28]		; eax = qlp_coeff[]
	xor	ebp, ebp		; ebp = sum = 0
	jmp	edx

	mov	ecx, [eax + 124]	; ecx = qlp_coeff[31]
	imul	ecx, [edi - 128]	; ecx = qlp_coeff[31] * data[i-32]
	add	ebp, ecx		; sum += qlp_coeff[31] * data[i-32]
	mov	ecx, [eax + 120]	; ecx = qlp_coeff[30]
	imul	ecx, [edi - 124]	; ecx = qlp_coeff[30] * data[i-31]
	add	ebp, ecx		; sum += qlp_coeff[30] * data[i-31]
	mov	ecx, [eax + 116]	; ecx = qlp_coeff[29]
	imul	ecx, [edi - 120]	; ecx = qlp_coeff[29] * data[i-30]
	add	ebp, ecx		; sum += qlp_coeff[29] * data[i-30]
	mov	ecx, [eax + 112]	; ecx = qlp_coeff[28]
	imul	ecx, [edi - 116]	; ecx = qlp_coeff[28] * data[i-29]
	add	ebp, ecx		; sum += qlp_coeff[28] * data[i-29]
	mov	ecx, [eax + 108]	; ecx = qlp_coeff[27]
	imul	ecx, [edi - 112]	; ecx = qlp_coeff[27] * data[i-28]
	add	ebp, ecx		; sum += qlp_coeff[27] * data[i-28]
	mov	ecx, [eax + 104]	; ecx = qlp_coeff[26]
	imul	ecx, [edi - 108]	; ecx = qlp_coeff[26] * data[i-27]
	add	ebp, ecx		; sum += qlp_coeff[26] * data[i-27]
	mov	ecx, [eax + 100]	; ecx = qlp_coeff[25]
	imul	ecx, [edi - 104]	; ecx = qlp_coeff[25] * data[i-26]
	add	ebp, ecx		; sum += qlp_coeff[25] * data[i-26]
	mov	ecx, [eax + 96]		; ecx = qlp_coeff[24]
	imul	ecx, [edi - 100]	; ecx = qlp_coeff[24] * data[i-25]
	add	ebp, ecx		; sum += qlp_coeff[24] * data[i-25]
	mov	ecx, [eax + 92]		; ecx = qlp_coeff[23]
	imul	ecx, [edi - 96]		; ecx = qlp_coeff[23] * data[i-24]
	add	ebp, ecx		; sum += qlp_coeff[23] * data[i-24]
	mov	ecx, [eax + 88]		; ecx = qlp_coeff[22]
	imul	ecx, [edi - 92]		; ecx = qlp_coeff[22] * data[i-23]
	add	ebp, ecx		; sum += qlp_coeff[22] * data[i-23]
	mov	ecx, [eax + 84]		; ecx = qlp_coeff[21]
	imul	ecx, [edi - 88]		; ecx = qlp_coeff[21] * data[i-22]
	add	ebp, ecx		; sum += qlp_coeff[21] * data[i-22]
	mov	ecx, [eax + 80]		; ecx = qlp_coeff[20]
	imul	ecx, [edi - 84]		; ecx = qlp_coeff[20] * data[i-21]
	add	ebp, ecx		; sum += qlp_coeff[20] * data[i-21]
	mov	ecx, [eax + 76]		; ecx = qlp_coeff[19]
	imul	ecx, [edi - 80]		; ecx = qlp_coeff[19] * data[i-20]
	add	ebp, ecx		; sum += qlp_coeff[19] * data[i-20]
	mov	ecx, [eax + 72]		; ecx = qlp_coeff[18]
	imul	ecx, [edi - 76]		; ecx = qlp_coeff[18] * data[i-19]
	add	ebp, ecx		; sum += qlp_coeff[18] * data[i-19]
	mov	ecx, [eax + 68]		; ecx = qlp_coeff[17]
	imul	ecx, [edi - 72]		; ecx = qlp_coeff[17] * data[i-18]
	add	ebp, ecx		; sum += qlp_coeff[17] * data[i-18]
	mov	ecx, [eax + 64]		; ecx = qlp_coeff[16]
	imul	ecx, [edi - 68]		; ecx = qlp_coeff[16] * data[i-17]
	add	ebp, ecx		; sum += qlp_coeff[16] * data[i-17]
	mov	ecx, [eax + 60]		; ecx = qlp_coeff[15]
	imul	ecx, [edi - 64]		; ecx = qlp_coeff[15] * data[i-16]
	add	ebp, ecx		; sum += qlp_coeff[15] * data[i-16]
	mov	ecx, [eax + 56]		; ecx = qlp_coeff[14]
	imul	ecx, [edi - 60]		; ecx = qlp_coeff[14] * data[i-15]
	add	ebp, ecx		; sum += qlp_coeff[14] * data[i-15]
	mov	ecx, [eax + 52]		; ecx = qlp_coeff[13]
	imul	ecx, [edi - 56]		; ecx = qlp_coeff[13] * data[i-14]
	add	ebp, ecx		; sum += qlp_coeff[13] * data[i-14]
	mov	ecx, [eax + 48]		; ecx = qlp_coeff[12]
	imul	ecx, [edi - 52]		; ecx = qlp_coeff[12] * data[i-13]
	add	ebp, ecx		; sum += qlp_coeff[12] * data[i-13]
	mov	ecx, [eax + 44]		; ecx = qlp_coeff[11]
	imul	ecx, [edi - 48]		; ecx = qlp_coeff[11] * data[i-12]
	add	ebp, ecx		; sum += qlp_coeff[11] * data[i-12]
	mov	ecx, [eax + 40]		; ecx = qlp_coeff[10]
	imul	ecx, [edi - 44]		; ecx = qlp_coeff[10] * data[i-11]
	add	ebp, ecx		; sum += qlp_coeff[10] * data[i-11]
	mov	ecx, [eax + 36]		; ecx = qlp_coeff[ 9]
	imul	ecx, [edi - 40]		; ecx = qlp_coeff[ 9] * data[i-10]
	add	ebp, ecx		; sum += qlp_coeff[ 9] * data[i-10]
	mov	ecx, [eax + 32]		; ecx = qlp_coeff[ 8]
	imul	ecx, [edi - 36]		; ecx = qlp_coeff[ 8] * data[i- 9]
	add	ebp, ecx		; sum += qlp_coeff[ 8] * data[i- 9]
	mov	ecx, [eax + 28]		; ecx = qlp_coeff[ 7]
	imul	ecx, [edi - 32]		; ecx = qlp_coeff[ 7] * data[i- 8]
	add	ebp, ecx		; sum += qlp_coeff[ 7] * data[i- 8]
	mov	ecx, [eax + 24]		; ecx = qlp_coeff[ 6]
	imul	ecx, [edi - 28]		; ecx = qlp_coeff[ 6] * data[i- 7]
	add	ebp, ecx		; sum += qlp_coeff[ 6] * data[i- 7]
	mov	ecx, [eax + 20]		; ecx = qlp_coeff[ 5]
	imul	ecx, [edi - 24]		; ecx = qlp_coeff[ 5] * data[i- 6]
	add	ebp, ecx		; sum += qlp_coeff[ 5] * data[i- 6]
	mov	ecx, [eax + 16]		; ecx = qlp_coeff[ 4]
	imul	ecx, [edi - 20]		; ecx = qlp_coeff[ 4] * data[i- 5]
	add	ebp, ecx		; sum += qlp_coeff[ 4] * data[i- 5]
	mov	ecx, [eax + 12]		; ecx = qlp_coeff[ 3]
	imul	ecx, [edi - 16]		; ecx = qlp_coeff[ 3] * data[i- 4]
	add	ebp, ecx		; sum += qlp_coeff[ 3] * data[i- 4]
	mov	ecx, [eax + 8]		; ecx = qlp_coeff[ 2]
	imul	ecx, [edi - 12]		; ecx = qlp_coeff[ 2] * data[i- 3]
	add	ebp, ecx		; sum += qlp_coeff[ 2] * data[i- 3]
	mov	ecx, [eax + 4]		; ecx = qlp_coeff[ 1]
	imul	ecx, [edi - 8]		; ecx = qlp_coeff[ 1] * data[i- 2]
	add	ebp, ecx		; sum += qlp_coeff[ 1] * data[i- 2]
	mov	ecx, [eax]		; ecx = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
	imul	ecx, [edi - 4]		; ecx = qlp_coeff[ 0] * data[i- 1]
	add	ebp, ecx		; sum += qlp_coeff[ 0] * data[i- 1]
.jumper_0:

	mov	ecx, [esp + 36]
	sar	ebp, cl			; ebp = (sum >> lp_quantization)
	add	ebp, [esi + edi]	; ebp = residual[i] + (sum >> lp_quantization)
	mov	[edi], ebp		; data[i] = residual[i] + (sum >> lp_quantization)
	add	edi, byte 4

	dec	ebx
	jz	short .end
	xor	ebp, ebp
	jmp	edx

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
; the channel and qlp_coeffs must be <= 16. Especially note that this routine
; cannot be used for side-channel coded 16bps channels since the effective bps
; is 17.
; WATCHOUT: this routine requires that each data array have a buffer of up to
; 3 zeroes in front (at negative indices) for alignment purposes, i.e. for each
; channel n, data[n][-1] through data[n][-3] should be accessible and zero.
	ALIGN	16
cident FLAC__lpc_restore_signal_asm_ia32_mmx
	;[esp + 40]	data[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	residual[]

	;ASSERT(order > 0)

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	esi, [esp + 20]		; esi = residual[]
	mov	edi, [esp + 40]		; edi = data[]
	mov	eax, [esp + 32]		; eax = order
	mov	ebx, [esp + 24]		; ebx = data_len

	test	ebx, ebx
	jz	near .end		; do nothing if data_len == 0
	cmp	eax, byte 4
	jb	near FLAC__lpc_restore_signal_asm_ia32.begin	; small orders: plain
					; ia32 version (same stack layout as ours)

	mov	edx, [esp + 28]		; edx = qlp_coeff[]
	movd	mm6, [esp + 36]		; mm6 = 0:lp_quantization
	mov	ebp, esp		; save esp for the realignment below

	and	esp, 0xfffffff8		; 8-align stack for qword accesses

	; Copy qlp_coeff[] to the stack as 16-bit words and zero-pad to a
	; multiple of 4 words (eax becomes the padded order) -- identical
	; scheme to the residual MMX routine above.
	xor	ecx, ecx
.copy_qlp_loop:
	push	word [edx + 4 * ecx]
	inc	ecx
	cmp	ecx, eax
	jnz	short .copy_qlp_loop

	and	ecx, 0x3
	test	ecx, ecx
	je	short .za_end
	sub	ecx, byte 4
.za_loop:
	push	word 0
	inc	eax
	inc	ecx
	jnz	short .za_loop
.za_end:

	movq	mm5, [esp + 2 * eax - 8]	; mm5 = first 4 coeffs, packed words
	movd	mm4, [edi - 16]			; mm4 = history data[i-4..i-1] as words
	punpckldq	mm4, [edi - 12]
	movd	mm0, [edi - 8]
	punpckldq	mm0, [edi - 4]
	packssdw	mm4, mm0

	cmp	eax, byte 4
	jnbe	short .mmx_4more

	; ---- padded order == 4: one sample per iteration (restore is serial:
	; ---- each output feeds the next history) ----
	ALIGN	16
.mmx_4_loop_i:
	movq	mm7, mm4
	pmaddwd	mm7, mm5		; two packed partial sums
	movq	mm0, mm7
	punpckhdq	mm7, mm7
	paddd	mm7, mm0		; mm7 low dword = sum
	psrad	mm7, mm6		; sum >> lp_quantization
	movd	mm1, [esi]
	paddd	mm7, mm1		; += residual[i]
	movd	[edi], mm7		; data[i]
	psllq	mm7, 48			; roll data[i] into the word history
	psrlq	mm4, 16
	por	mm4, mm7

	add	esi, byte 4
	add	edi, byte 4

	dec	ebx
	jnz	.mmx_4_loop_i
	jmp	.mmx_end
.mmx_4more:
	shl	eax, 2
	neg	eax
	add	eax, byte 16		; eax = 16 - 4*paddedorder
	ALIGN	16
.mmx_4more_loop_i:
	mov	ecx, edi
	add	ecx, eax		; ecx = &data[i - paddedorder + 4]
	mov	edx, esp		; edx walks the packed coeff words

	movq	mm7, mm4		; newest 4 taps from the rolling history
	pmaddwd	mm7, mm5

	ALIGN	16
.mmx_4more_loop_j:
	; accumulate 4 older taps per pass
	movd	mm0, [ecx - 16]
	punpckldq	mm0, [ecx - 12]
	movd	mm1, [ecx - 8]
	punpckldq	mm1, [ecx - 4]
	packssdw	mm0, mm1
	pmaddwd	mm0, [edx]
	paddd	mm7, mm0

	add	edx, byte 8
	add	ecx, byte 16
	cmp	ecx, edi
	jnz	.mmx_4more_loop_j

	movq	mm0, mm7
	punpckhdq	mm7, mm7
	paddd	mm7, mm0		; fold to a single sum
	psrad	mm7, mm6		; sum >> lp_quantization
	movd	mm1, [esi]
	paddd	mm7, mm1		; += residual[i]
	movd	[edi], mm7		; data[i]
	psllq	mm7, 48			; roll data[i] into the history
	psrlq	mm4, 16
	por	mm4, mm7

	add	esi, byte 4
	add	edi, byte 4

	dec	ebx
	jnz	short .mmx_4more_loop_i
.mmx_end:
	emms
	mov	esp, ebp		; drop the packed-coeff scratch area

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret


; **********************************************************************
;
;void FLAC__lpc_compute_residual_from_qlp_coefficients_wide(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
; {
; 	unsigned i, j;
; 	FLAC__int64 sum;
;
; 	FLAC__ASSERT(order > 0);
;
; 	for(i = 0; i < data_len; i++) {
; 		sum = 0;
; 		for(j = 0; j < order; j++)
; 			sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1];
; 		residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
; 	}
; }
	ALIGN	16
cident FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32
	;[esp + 40]	residual[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	data[]

	;ASSERT(order > 0)
	;ASSERT(order <= 32)
	;ASSERT(lp_quantization <= 31)

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	ebx, [esp + 24]		; ebx = data_len
	test	ebx, ebx
	jz	near .end		; do nothing if data_len == 0

.begin:
	mov	eax, [esp + 32]		; eax = order
	cmp	eax, 1
	jg	short .i_32

	; ---- special case: order == 1, 64-bit product via one-operand imul ----
	mov	esi, [esp + 40]		; esi = residual[]
	mov	edi, [esp + 20]		; edi = data[]
	mov	ecx, [esp + 28]		; ecx = qlp_coeff[]
	mov	ebp, [ecx]		; ebp = qlp_coeff[0]
	mov	eax, [edi - 4]		; eax = data[-1]
	mov	ecx, [esp + 36]		; cl = lp_quantization
	ALIGN	16
.i_1_loop_i:
	imul	ebp			; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1]
	shrd	eax, edx, cl		; eax = sum >> lp_quantization
					; (shrd with cl is valid for 0..31, matching
					; the lp_quantization <= 31 assert; the old
					; "<= 15" note here was stale)
	neg	eax
	add	eax, [edi]		; eax = data[i] - (sum >> lp_quantization)
	mov	[esi], eax		; residual[i] = eax
	mov	eax, [edi]		; eax = data[i] for the next iteration
	add	esi, 4
	add	edi, 4
	dec	ebx
	jnz	.i_1_loop_i
	jmp	.end

.mov_eip_to_eax:			; helper: returns caller's EIP in eax
	mov	eax, [esp]
	ret

.i_32:	; eax = order
	; Computed-goto path: each unrolled group below is exactly 10 bytes
	; (3-byte mov + 3-byte imul + 2-byte add + 2-byte adc), hence the
	; -10*order bias (eax = -2*order, then *5).  Do not resize the groups.
	neg	eax
	add	eax, eax
	lea	ebp, [eax + eax * 4 + .jumper_0 - .get_eip0]
	call	.mov_eip_to_eax
.get_eip0:
	add	ebp, eax
	inc	ebp			; compensate for the shorter opcode on the last iteration

	mov	ebx, [esp + 28]		; ebx = qlp_coeff[]
	mov	edi, [esp + 20]		; edi = data[]
	sub	[esp + 40], edi		; residual[] -= data[]; store uses [edi + bias]

	xor	ecx, ecx		; esi:ecx = sum = 0
	xor	esi, esi
	jmp	ebp

;eax = --
;edx = --
;ecx = 0	(sum low dword)
;esi = 0	(sum high dword)
;
;ebx = qlp_coeff[]
;edi = data[]
;ebp = @address

	mov	eax, [ebx + 124]	; eax = qlp_coeff[31]
	imul	dword [edi - 128]	; edx:eax = qlp_coeff[31] * data[i-32]
	add	ecx, eax
	adc	esi, edx		; sum += qlp_coeff[31] * data[i-32]

	mov	eax, [ebx + 120]	; eax = qlp_coeff[30]
	imul	dword [edi - 124]	; edx:eax = qlp_coeff[30] * data[i-31]
	add	ecx, eax
	adc	esi, edx		; sum += qlp_coeff[30] * data[i-31]

	mov	eax, [ebx + 116]
	imul	dword [edi - 120]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 112]
	imul	dword [edi - 116]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 108]
	imul	dword [edi - 112]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 104]
	imul	dword [edi - 108]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 100]
	imul	dword [edi - 104]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 96]
	imul	dword [edi - 100]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 92]
	imul	dword [edi - 96]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 88]
	imul	dword [edi - 92]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 84]
	imul	dword [edi - 88]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 80]
	imul	dword [edi - 84]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 76]
	imul	dword [edi - 80]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 72]
	imul	dword [edi - 76]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 68]
	imul	dword [edi - 72]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 64]
	imul	dword [edi - 68]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 60]
	imul	dword [edi - 64]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 56]
	imul	dword [edi - 60]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 52]
	imul	dword [edi - 56]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 48]
	imul	dword [edi - 52]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 44]
	imul	dword [edi - 48]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 40]
	imul	dword [edi - 44]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 36]
	imul	dword [edi - 40]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 32]
	imul	dword [edi - 36]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 28]
	imul	dword [edi - 32]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 24]
	imul	dword [edi - 28]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 20]
	imul	dword [edi - 24]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 16]
	imul	dword [edi - 20]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 12]
	imul	dword [edi - 16]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 8]
	imul	dword [edi - 12]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 4]
	imul	dword [edi - 8]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx]		; eax = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
	imul	dword [edi - 4]		; edx:eax = qlp_coeff[ 0] * data[i- 1]
	add	ecx, eax
	adc	esi, edx		; sum += qlp_coeff[ 0] * data[i- 1]

.jumper_0:
	mov	edx, ecx
;esi:edx = sum
	mov	ecx, [esp + 36]		; cl = lp_quantization
	shrd	edx, esi, cl		; edx = (sum >> lp_quantization)
;eax = --
;ecx = --
;edx = sum >> lp_q
;esi = --
	neg	edx			; edx = -(sum >> lp_quantization)
	mov	eax, [esp + 40]		; eax = residual[] - data[]
	add	edx, [edi]		; edx = data[i] - (sum >> lp_quantization)
	mov	[edi + eax], edx	; residual[i] = edx
	add	edi, 4

	dec	dword [esp + 24]	; data_len is the loop counter in memory here
	jz	short .end
	xor	ecx, ecx
	xor	esi, esi
	jmp	ebp

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

; **********************************************************************
;
; void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
; {
; 	unsigned i, j;
; 	FLAC__int64 sum;
;
; 	FLAC__ASSERT(order > 0);
;
; 	for(i = 0; i < data_len; i++) {
; 		sum = 0;
; 		for(j = 0; j < order; j++)
; 			sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1];
; 		data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization);
; 	}
; }
	ALIGN	16
cident FLAC__lpc_restore_signal_wide_asm_ia32
	;[esp + 40]	data[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	residual[]

	;ASSERT(order > 0)
	;ASSERT(order <= 32)

	;ASSERT(lp_quantization <= 31)

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	ebx, [esp + 24]		; ebx = data_len
	test	ebx, ebx
	jz	near .end		; do nothing if data_len == 0

.begin:
	mov	eax, [esp + 32]		; eax = order
	cmp	eax, 1
	jg	short .x87_32

	; ---- special case: order == 1, 64-bit product via one-operand imul ----
	mov	esi, [esp + 20]		; esi = residual[]
	mov	edi, [esp + 40]		; edi = data[]
	mov	ecx, [esp + 28]		; ecx = qlp_coeff[]
	mov	ebp, [ecx]		; ebp = qlp_coeff[0]
	mov	eax, [edi - 4]		; eax = data[-1]
	mov	ecx, [esp + 36]		; cl = lp_quantization
	ALIGN	16
.x87_1_loop_i:
	imul	ebp			; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1]
	shrd	eax, edx, cl		; eax = sum >> lp_quantization (valid cl 0..31)
;
	add	eax, [esi]		; eax = residual[i] + (sum >> lp_quantization)
	mov	[edi], eax		; data[i] = eax (feeds next iteration)
;
	add	esi, 4
	add	edi, 4
	dec	ebx
	jnz	.x87_1_loop_i
	jmp	.end

.mov_eip_to_eax:			; helper: returns caller's EIP in eax
	mov	eax, [esp]
	ret

.x87_32:	; eax = order
	; Computed-goto path: 10-byte unrolled groups, jump to
	; .jumper_0 - 10*order so exactly `order` taps run per sample.
	neg	eax
	add	eax, eax
	lea	ebp, [eax + eax * 4 + .jumper_0 - .get_eip0]
	call	.mov_eip_to_eax
.get_eip0:
	add	ebp, eax
	inc	ebp			; compensate for the shorter opcode on the last iteration

	mov	ebx, [esp + 28]		; ebx = qlp_coeff[]
	mov	edi, [esp + 40]		; edi = data[] (old comment said esi -- it is edi)
	sub	[esp + 20], edi		; residual[] -= data[]; load uses [edi + bias]

	xor	ecx, ecx		; esi:ecx = sum = 0
	xor	esi, esi
	jmp	ebp

;eax = --
;edx = --
;ecx = 0	(sum low dword)
;esi = 0	(sum high dword)
;
;ebx = qlp_coeff[]
;edi = data[]
;ebp = @address

	mov	eax, [ebx + 124]	; eax = qlp_coeff[31]
	imul	dword [edi - 128]	; edx:eax = qlp_coeff[31] * data[i-32]
	add	ecx, eax
	adc	esi, edx		; sum += qlp_coeff[31] * data[i-32]

	mov	eax, [ebx + 120]	; eax = qlp_coeff[30]
	imul	dword [edi - 124]	; edx:eax = qlp_coeff[30] * data[i-31]
	add	ecx, eax
	adc	esi, edx		; sum += qlp_coeff[30] * data[i-31]

	mov	eax, [ebx + 116]
	imul	dword [edi - 120]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 112]
	imul	dword [edi - 116]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 108]
	imul	dword [edi - 112]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 104]
	imul	dword [edi - 108]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 100]
	imul	dword [edi - 104]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 96]
	imul	dword [edi - 100]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 92]
	imul	dword [edi - 96]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 88]
	imul	dword [edi - 92]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 84]
	imul	dword [edi - 88]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 80]
	imul	dword [edi - 84]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 76]
	imul	dword [edi - 80]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 72]
	imul	dword [edi - 76]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 68]
	imul	dword [edi - 72]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 64]
	imul	dword [edi - 68]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 60]
	imul	dword [edi - 64]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 56]
	imul	dword [edi - 60]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 52]
	imul	dword [edi - 56]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 48]
	imul	dword [edi - 52]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 44]
	imul	dword [edi - 48]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 40]
	imul	dword [edi - 44]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 36]
	imul	dword [edi - 40]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 32]
	imul	dword [edi - 36]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 28]
	imul	dword [edi - 32]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 24]
	imul	dword [edi - 28]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 20]
	imul	dword [edi - 24]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 16]
	imul	dword [edi - 20]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 12]
	imul	dword [edi - 16]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 8]
	imul	dword [edi - 12]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 4]
	imul	dword [edi - 8]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx]		; eax = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
	imul	dword [edi - 4]		; edx:eax = qlp_coeff[ 0] * data[i- 1]
	add	ecx, eax
	adc	esi, edx		; sum += qlp_coeff[ 0] * data[i- 1]

.jumper_0:
	mov	edx, ecx
;esi:edx = sum
	mov	ecx, [esp + 36]		; cl = lp_quantization
	shrd	edx, esi, cl		; edx = (sum >> lp_quantization)
;eax = --
;ecx = --
;edx = sum >> lp_q
;esi = --
;
	mov	eax, [esp + 20]		; eax = residual[] - data[]
	add	edx, [edi + eax]	; edx = residual[i] + (sum >> lp_quantization)
	mov	[edi], edx		; data[i] = residual[i] + (sum >> lp_quantization)
	add	edi, 4

	dec	dword [esp + 24]	; data_len is the loop counter in memory here
	jz	short .end
	xor	ecx, ecx
	xor	esi, esi
	jmp	ebp

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

; end