1;  vim:filetype=nasm ts=8
2
3;  libFLAC - Free Lossless Audio Codec library
4;  Copyright (C) 2001-2009  Josh Coalson
5;  Copyright (C) 2011-2013  Xiph.Org Foundation
6;
7;  Redistribution and use in source and binary forms, with or without
8;  modification, are permitted provided that the following conditions
9;  are met:
10;
11;  - Redistributions of source code must retain the above copyright
12;  notice, this list of conditions and the following disclaimer.
13;
14;  - Redistributions in binary form must reproduce the above copyright
15;  notice, this list of conditions and the following disclaimer in the
16;  documentation and/or other materials provided with the distribution.
17;
18;  - Neither the name of the Xiph.org Foundation nor the names of its
19;  contributors may be used to endorse or promote products derived from
20;  this software without specific prior written permission.
21;
22;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23;  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25;  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
26;  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
27;  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
28;  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
29;  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
30;  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31;  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32;  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33
34%include "nasm.h"
35
36	data_section
37
38cglobal FLAC__lpc_compute_autocorrelation_asm_ia32
39cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
40cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
41cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
42cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16
43cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
44cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
45cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
46cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32
47cglobal FLAC__lpc_restore_signal_asm_ia32
48cglobal FLAC__lpc_restore_signal_asm_ia32_mmx
49cglobal FLAC__lpc_restore_signal_wide_asm_ia32
50
51	code_section
52
53; **********************************************************************
54;
55; void FLAC__lpc_compute_autocorrelation_asm(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
56; {
57;	FLAC__real d;
58;	unsigned sample, coeff;
59;	const unsigned limit = data_len - lag;
60;
61;	FLAC__ASSERT(lag > 0);
62;	FLAC__ASSERT(lag <= data_len);
63;
64;	for(coeff = 0; coeff < lag; coeff++)
65;		autoc[coeff] = 0.0;
66;	for(sample = 0; sample <= limit; sample++) {
67;		d = data[sample];
68;		for(coeff = 0; coeff < lag; coeff++)
69;			autoc[coeff] += d * data[sample+coeff];
70;	}
71;	for(; sample < data_len; sample++) {
72;		d = data[sample];
73;		for(coeff = 0; coeff < data_len - sample; coeff++)
74;			autoc[coeff] += d * data[sample+coeff];
75;	}
76; }
77;
78	ALIGN 16
79cident FLAC__lpc_compute_autocorrelation_asm_ia32
80	;[esp + 28] == autoc[]
81	;[esp + 24] == lag
82	;[esp + 20] == data_len
83	;[esp + 16] == data[]
84
85	;ASSERT(lag > 0)
86	;ASSERT(lag <= 33)
87	;ASSERT(lag <= data_len)
88
89.begin:
90	push	esi
91	push	edi
92	push	ebx
93
94	;	for(coeff = 0; coeff < lag; coeff++)
95	;		autoc[coeff] = 0.0;
96	mov	edi, [esp + 28]			; edi == autoc
97	mov	ecx, [esp + 24]			; ecx = # of dwords (=lag) of 0 to write
98	xor	eax, eax
99	rep	stosd
100
101	;	const unsigned limit = data_len - lag;
102	mov	eax, [esp + 24]			; eax == lag
103	mov	ecx, [esp + 20]
104	sub	ecx, eax			; ecx == limit
105
106	mov	edi, [esp + 28]			; edi == autoc
107	mov	esi, [esp + 16]			; esi == data
108	inc	ecx				; we are looping <= limit so we add one to the counter
109
110	;	for(sample = 0; sample <= limit; sample++) {
111	;		d = data[sample];
112	;		for(coeff = 0; coeff < lag; coeff++)
113	;			autoc[coeff] += d * data[sample+coeff];
114	;	}
115	fld	dword [esi]			; ST = d <- data[sample]
116	; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
117	lea	edx, [eax + eax*2]
118	neg	edx
119	lea	edx, [eax + edx*4 + .jumper1_0 - .get_eip1]
120	call	.mov_eip_to_ebx
121.get_eip1:
122	add	edx, ebx
123	inc	edx				; compensate for the shorter opcode on the last iteration
124	inc	edx				; compensate for the shorter opcode on the last iteration
125	inc	edx				; compensate for the shorter opcode on the last iteration
126	cmp	eax, 33
127	jne	.loop1_start
128	sub	edx, byte 9			; compensate for the longer opcodes on the first iteration
129.loop1_start:
130	jmp	edx
131
132.mov_eip_to_ebx:
133	mov	ebx, [esp]
134	ret
135
136	fld	st0				; ST = d d
137	fmul	dword [esi + (32*4)]		; ST = d*data[sample+32] d		WATCHOUT: not a byte displacement here!
138	fadd	dword [edi + (32*4)]		; ST = autoc[32]+d*data[sample+32] d	WATCHOUT: not a byte displacement here!
139	fstp	dword [edi + (32*4)]		; autoc[32]+=d*data[sample+32]  ST = d	WATCHOUT: not a byte displacement here!
140	fld	st0				; ST = d d
141	fmul	dword [esi + (31*4)]		; ST = d*data[sample+31] d
142	fadd	dword [edi + (31*4)]		; ST = autoc[31]+d*data[sample+31] d
143	fstp	dword [edi + (31*4)]		; autoc[31]+=d*data[sample+31]  ST = d
144	fld	st0				; ST = d d
145	fmul	dword [esi + (30*4)]		; ST = d*data[sample+30] d
146	fadd	dword [edi + (30*4)]		; ST = autoc[30]+d*data[sample+30] d
147	fstp	dword [edi + (30*4)]		; autoc[30]+=d*data[sample+30]  ST = d
148	fld	st0				; ST = d d
149	fmul	dword [esi + (29*4)]		; ST = d*data[sample+29] d
150	fadd	dword [edi + (29*4)]		; ST = autoc[29]+d*data[sample+29] d
151	fstp	dword [edi + (29*4)]		; autoc[29]+=d*data[sample+29]  ST = d
152	fld	st0				; ST = d d
153	fmul	dword [esi + (28*4)]		; ST = d*data[sample+28] d
154	fadd	dword [edi + (28*4)]		; ST = autoc[28]+d*data[sample+28] d
155	fstp	dword [edi + (28*4)]		; autoc[28]+=d*data[sample+28]  ST = d
156	fld	st0				; ST = d d
157	fmul	dword [esi + (27*4)]		; ST = d*data[sample+27] d
158	fadd	dword [edi + (27*4)]		; ST = autoc[27]+d*data[sample+27] d
159	fstp	dword [edi + (27*4)]		; autoc[27]+=d*data[sample+27]  ST = d
160	fld	st0				; ST = d d
161	fmul	dword [esi + (26*4)]		; ST = d*data[sample+26] d
162	fadd	dword [edi + (26*4)]		; ST = autoc[26]+d*data[sample+26] d
163	fstp	dword [edi + (26*4)]		; autoc[26]+=d*data[sample+26]  ST = d
164	fld	st0				; ST = d d
165	fmul	dword [esi + (25*4)]		; ST = d*data[sample+25] d
166	fadd	dword [edi + (25*4)]		; ST = autoc[25]+d*data[sample+25] d
167	fstp	dword [edi + (25*4)]		; autoc[25]+=d*data[sample+25]  ST = d
168	fld	st0				; ST = d d
169	fmul	dword [esi + (24*4)]		; ST = d*data[sample+24] d
170	fadd	dword [edi + (24*4)]		; ST = autoc[24]+d*data[sample+24] d
171	fstp	dword [edi + (24*4)]		; autoc[24]+=d*data[sample+24]  ST = d
172	fld	st0				; ST = d d
173	fmul	dword [esi + (23*4)]		; ST = d*data[sample+23] d
174	fadd	dword [edi + (23*4)]		; ST = autoc[23]+d*data[sample+23] d
175	fstp	dword [edi + (23*4)]		; autoc[23]+=d*data[sample+23]  ST = d
176	fld	st0				; ST = d d
177	fmul	dword [esi + (22*4)]		; ST = d*data[sample+22] d
178	fadd	dword [edi + (22*4)]		; ST = autoc[22]+d*data[sample+22] d
179	fstp	dword [edi + (22*4)]		; autoc[22]+=d*data[sample+22]  ST = d
180	fld	st0				; ST = d d
181	fmul	dword [esi + (21*4)]		; ST = d*data[sample+21] d
182	fadd	dword [edi + (21*4)]		; ST = autoc[21]+d*data[sample+21] d
183	fstp	dword [edi + (21*4)]		; autoc[21]+=d*data[sample+21]  ST = d
184	fld	st0				; ST = d d
185	fmul	dword [esi + (20*4)]		; ST = d*data[sample+20] d
186	fadd	dword [edi + (20*4)]		; ST = autoc[20]+d*data[sample+20] d
187	fstp	dword [edi + (20*4)]		; autoc[20]+=d*data[sample+20]  ST = d
188	fld	st0				; ST = d d
189	fmul	dword [esi + (19*4)]		; ST = d*data[sample+19] d
190	fadd	dword [edi + (19*4)]		; ST = autoc[19]+d*data[sample+19] d
191	fstp	dword [edi + (19*4)]		; autoc[19]+=d*data[sample+19]  ST = d
192	fld	st0				; ST = d d
193	fmul	dword [esi + (18*4)]		; ST = d*data[sample+18] d
194	fadd	dword [edi + (18*4)]		; ST = autoc[18]+d*data[sample+18] d
195	fstp	dword [edi + (18*4)]		; autoc[18]+=d*data[sample+18]  ST = d
196	fld	st0				; ST = d d
197	fmul	dword [esi + (17*4)]		; ST = d*data[sample+17] d
198	fadd	dword [edi + (17*4)]		; ST = autoc[17]+d*data[sample+17] d
199	fstp	dword [edi + (17*4)]		; autoc[17]+=d*data[sample+17]  ST = d
200	fld	st0				; ST = d d
201	fmul	dword [esi + (16*4)]		; ST = d*data[sample+16] d
202	fadd	dword [edi + (16*4)]		; ST = autoc[16]+d*data[sample+16] d
203	fstp	dword [edi + (16*4)]		; autoc[16]+=d*data[sample+16]  ST = d
204	fld	st0				; ST = d d
205	fmul	dword [esi + (15*4)]		; ST = d*data[sample+15] d
206	fadd	dword [edi + (15*4)]		; ST = autoc[15]+d*data[sample+15] d
207	fstp	dword [edi + (15*4)]		; autoc[15]+=d*data[sample+15]  ST = d
208	fld	st0				; ST = d d
209	fmul	dword [esi + (14*4)]		; ST = d*data[sample+14] d
210	fadd	dword [edi + (14*4)]		; ST = autoc[14]+d*data[sample+14] d
211	fstp	dword [edi + (14*4)]		; autoc[14]+=d*data[sample+14]  ST = d
212	fld	st0				; ST = d d
213	fmul	dword [esi + (13*4)]		; ST = d*data[sample+13] d
214	fadd	dword [edi + (13*4)]		; ST = autoc[13]+d*data[sample+13] d
215	fstp	dword [edi + (13*4)]		; autoc[13]+=d*data[sample+13]  ST = d
216	fld	st0				; ST = d d
217	fmul	dword [esi + (12*4)]		; ST = d*data[sample+12] d
218	fadd	dword [edi + (12*4)]		; ST = autoc[12]+d*data[sample+12] d
219	fstp	dword [edi + (12*4)]		; autoc[12]+=d*data[sample+12]  ST = d
220	fld	st0				; ST = d d
221	fmul	dword [esi + (11*4)]		; ST = d*data[sample+11] d
222	fadd	dword [edi + (11*4)]		; ST = autoc[11]+d*data[sample+11] d
223	fstp	dword [edi + (11*4)]		; autoc[11]+=d*data[sample+11]  ST = d
224	fld	st0				; ST = d d
225	fmul	dword [esi + (10*4)]		; ST = d*data[sample+10] d
226	fadd	dword [edi + (10*4)]		; ST = autoc[10]+d*data[sample+10] d
227	fstp	dword [edi + (10*4)]		; autoc[10]+=d*data[sample+10]  ST = d
228	fld	st0				; ST = d d
229	fmul	dword [esi + ( 9*4)]		; ST = d*data[sample+9] d
230	fadd	dword [edi + ( 9*4)]		; ST = autoc[9]+d*data[sample+9] d
231	fstp	dword [edi + ( 9*4)]		; autoc[9]+=d*data[sample+9]  ST = d
232	fld	st0				; ST = d d
233	fmul	dword [esi + ( 8*4)]		; ST = d*data[sample+8] d
234	fadd	dword [edi + ( 8*4)]		; ST = autoc[8]+d*data[sample+8] d
235	fstp	dword [edi + ( 8*4)]		; autoc[8]+=d*data[sample+8]  ST = d
236	fld	st0				; ST = d d
237	fmul	dword [esi + ( 7*4)]		; ST = d*data[sample+7] d
238	fadd	dword [edi + ( 7*4)]		; ST = autoc[7]+d*data[sample+7] d
239	fstp	dword [edi + ( 7*4)]		; autoc[7]+=d*data[sample+7]  ST = d
240	fld	st0				; ST = d d
241	fmul	dword [esi + ( 6*4)]		; ST = d*data[sample+6] d
242	fadd	dword [edi + ( 6*4)]		; ST = autoc[6]+d*data[sample+6] d
243	fstp	dword [edi + ( 6*4)]		; autoc[6]+=d*data[sample+6]  ST = d
244	fld	st0				; ST = d d
245	fmul	dword [esi + ( 5*4)]		; ST = d*data[sample+4] d
246	fadd	dword [edi + ( 5*4)]		; ST = autoc[4]+d*data[sample+4] d
247	fstp	dword [edi + ( 5*4)]		; autoc[4]+=d*data[sample+4]  ST = d
248	fld	st0				; ST = d d
249	fmul	dword [esi + ( 4*4)]		; ST = d*data[sample+4] d
250	fadd	dword [edi + ( 4*4)]		; ST = autoc[4]+d*data[sample+4] d
251	fstp	dword [edi + ( 4*4)]		; autoc[4]+=d*data[sample+4]  ST = d
252	fld	st0				; ST = d d
253	fmul	dword [esi + ( 3*4)]		; ST = d*data[sample+3] d
254	fadd	dword [edi + ( 3*4)]		; ST = autoc[3]+d*data[sample+3] d
255	fstp	dword [edi + ( 3*4)]		; autoc[3]+=d*data[sample+3]  ST = d
256	fld	st0				; ST = d d
257	fmul	dword [esi + ( 2*4)]		; ST = d*data[sample+2] d
258	fadd	dword [edi + ( 2*4)]		; ST = autoc[2]+d*data[sample+2] d
259	fstp	dword [edi + ( 2*4)]		; autoc[2]+=d*data[sample+2]  ST = d
260	fld	st0				; ST = d d
261	fmul	dword [esi + ( 1*4)]		; ST = d*data[sample+1] d
262	fadd	dword [edi + ( 1*4)]		; ST = autoc[1]+d*data[sample+1] d
263	fstp	dword [edi + ( 1*4)]		; autoc[1]+=d*data[sample+1]  ST = d
264	fld	st0				; ST = d d
265	fmul	dword [esi]			; ST = d*data[sample] d			WATCHOUT: no displacement byte here!
266	fadd	dword [edi]			; ST = autoc[0]+d*data[sample] d	WATCHOUT: no displacement byte here!
267	fstp	dword [edi]			; autoc[0]+=d*data[sample]  ST = d	WATCHOUT: no displacement byte here!
268.jumper1_0:
269
270	fstp	st0				; pop d, ST = empty
271	add	esi, byte 4			; sample++
272	dec	ecx
273	jz	.loop1_end
274	fld	dword [esi]			; ST = d <- data[sample]
275	jmp	edx
276.loop1_end:
277
278	;	for(; sample < data_len; sample++) {
279	;		d = data[sample];
280	;		for(coeff = 0; coeff < data_len - sample; coeff++)
281	;			autoc[coeff] += d * data[sample+coeff];
282	;	}
283	mov	ecx, [esp + 24]			; ecx <- lag
284	dec	ecx				; ecx <- lag - 1
285	jz	near .end			; skip loop if 0 (i.e. lag == 1)
286
287	fld	dword [esi]			; ST = d <- data[sample]
288	mov	eax, ecx			; eax <- lag - 1 == data_len - sample the first time through
289	; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
290	lea	edx, [eax + eax*2]
291	neg	edx
292	lea	edx, [eax + edx*4 + .jumper2_0 - .get_eip2]
293	call	.mov_eip_to_ebx
294.get_eip2:
295	add	edx, ebx
296	inc	edx				; compensate for the shorter opcode on the last iteration
297	inc	edx				; compensate for the shorter opcode on the last iteration
298	inc	edx				; compensate for the shorter opcode on the last iteration
299	jmp	edx
300
301	fld	st0				; ST = d d
302	fmul	dword [esi + (31*4)]		; ST = d*data[sample+31] d
303	fadd	dword [edi + (31*4)]		; ST = autoc[31]+d*data[sample+31] d
304	fstp	dword [edi + (31*4)]		; autoc[31]+=d*data[sample+31]  ST = d
305	fld	st0				; ST = d d
306	fmul	dword [esi + (30*4)]		; ST = d*data[sample+30] d
307	fadd	dword [edi + (30*4)]		; ST = autoc[30]+d*data[sample+30] d
308	fstp	dword [edi + (30*4)]		; autoc[30]+=d*data[sample+30]  ST = d
309	fld	st0				; ST = d d
310	fmul	dword [esi + (29*4)]		; ST = d*data[sample+29] d
311	fadd	dword [edi + (29*4)]		; ST = autoc[29]+d*data[sample+29] d
312	fstp	dword [edi + (29*4)]		; autoc[29]+=d*data[sample+29]  ST = d
313	fld	st0				; ST = d d
314	fmul	dword [esi + (28*4)]		; ST = d*data[sample+28] d
315	fadd	dword [edi + (28*4)]		; ST = autoc[28]+d*data[sample+28] d
316	fstp	dword [edi + (28*4)]		; autoc[28]+=d*data[sample+28]  ST = d
317	fld	st0				; ST = d d
318	fmul	dword [esi + (27*4)]		; ST = d*data[sample+27] d
319	fadd	dword [edi + (27*4)]		; ST = autoc[27]+d*data[sample+27] d
320	fstp	dword [edi + (27*4)]		; autoc[27]+=d*data[sample+27]  ST = d
321	fld	st0				; ST = d d
322	fmul	dword [esi + (26*4)]		; ST = d*data[sample+26] d
323	fadd	dword [edi + (26*4)]		; ST = autoc[26]+d*data[sample+26] d
324	fstp	dword [edi + (26*4)]		; autoc[26]+=d*data[sample+26]  ST = d
325	fld	st0				; ST = d d
326	fmul	dword [esi + (25*4)]		; ST = d*data[sample+25] d
327	fadd	dword [edi + (25*4)]		; ST = autoc[25]+d*data[sample+25] d
328	fstp	dword [edi + (25*4)]		; autoc[25]+=d*data[sample+25]  ST = d
329	fld	st0				; ST = d d
330	fmul	dword [esi + (24*4)]		; ST = d*data[sample+24] d
331	fadd	dword [edi + (24*4)]		; ST = autoc[24]+d*data[sample+24] d
332	fstp	dword [edi + (24*4)]		; autoc[24]+=d*data[sample+24]  ST = d
333	fld	st0				; ST = d d
334	fmul	dword [esi + (23*4)]		; ST = d*data[sample+23] d
335	fadd	dword [edi + (23*4)]		; ST = autoc[23]+d*data[sample+23] d
336	fstp	dword [edi + (23*4)]		; autoc[23]+=d*data[sample+23]  ST = d
337	fld	st0				; ST = d d
338	fmul	dword [esi + (22*4)]		; ST = d*data[sample+22] d
339	fadd	dword [edi + (22*4)]		; ST = autoc[22]+d*data[sample+22] d
340	fstp	dword [edi + (22*4)]		; autoc[22]+=d*data[sample+22]  ST = d
341	fld	st0				; ST = d d
342	fmul	dword [esi + (21*4)]		; ST = d*data[sample+21] d
343	fadd	dword [edi + (21*4)]		; ST = autoc[21]+d*data[sample+21] d
344	fstp	dword [edi + (21*4)]		; autoc[21]+=d*data[sample+21]  ST = d
345	fld	st0				; ST = d d
346	fmul	dword [esi + (20*4)]		; ST = d*data[sample+20] d
347	fadd	dword [edi + (20*4)]		; ST = autoc[20]+d*data[sample+20] d
348	fstp	dword [edi + (20*4)]		; autoc[20]+=d*data[sample+20]  ST = d
349	fld	st0				; ST = d d
350	fmul	dword [esi + (19*4)]		; ST = d*data[sample+19] d
351	fadd	dword [edi + (19*4)]		; ST = autoc[19]+d*data[sample+19] d
352	fstp	dword [edi + (19*4)]		; autoc[19]+=d*data[sample+19]  ST = d
353	fld	st0				; ST = d d
354	fmul	dword [esi + (18*4)]		; ST = d*data[sample+18] d
355	fadd	dword [edi + (18*4)]		; ST = autoc[18]+d*data[sample+18] d
356	fstp	dword [edi + (18*4)]		; autoc[18]+=d*data[sample+18]  ST = d
357	fld	st0				; ST = d d
358	fmul	dword [esi + (17*4)]		; ST = d*data[sample+17] d
359	fadd	dword [edi + (17*4)]		; ST = autoc[17]+d*data[sample+17] d
360	fstp	dword [edi + (17*4)]		; autoc[17]+=d*data[sample+17]  ST = d
361	fld	st0				; ST = d d
362	fmul	dword [esi + (16*4)]		; ST = d*data[sample+16] d
363	fadd	dword [edi + (16*4)]		; ST = autoc[16]+d*data[sample+16] d
364	fstp	dword [edi + (16*4)]		; autoc[16]+=d*data[sample+16]  ST = d
365	fld	st0				; ST = d d
366	fmul	dword [esi + (15*4)]		; ST = d*data[sample+15] d
367	fadd	dword [edi + (15*4)]		; ST = autoc[15]+d*data[sample+15] d
368	fstp	dword [edi + (15*4)]		; autoc[15]+=d*data[sample+15]  ST = d
369	fld	st0				; ST = d d
370	fmul	dword [esi + (14*4)]		; ST = d*data[sample+14] d
371	fadd	dword [edi + (14*4)]		; ST = autoc[14]+d*data[sample+14] d
372	fstp	dword [edi + (14*4)]		; autoc[14]+=d*data[sample+14]  ST = d
373	fld	st0				; ST = d d
374	fmul	dword [esi + (13*4)]		; ST = d*data[sample+13] d
375	fadd	dword [edi + (13*4)]		; ST = autoc[13]+d*data[sample+13] d
376	fstp	dword [edi + (13*4)]		; autoc[13]+=d*data[sample+13]  ST = d
377	fld	st0				; ST = d d
378	fmul	dword [esi + (12*4)]		; ST = d*data[sample+12] d
379	fadd	dword [edi + (12*4)]		; ST = autoc[12]+d*data[sample+12] d
380	fstp	dword [edi + (12*4)]		; autoc[12]+=d*data[sample+12]  ST = d
381	fld	st0				; ST = d d
382	fmul	dword [esi + (11*4)]		; ST = d*data[sample+11] d
383	fadd	dword [edi + (11*4)]		; ST = autoc[11]+d*data[sample+11] d
384	fstp	dword [edi + (11*4)]		; autoc[11]+=d*data[sample+11]  ST = d
385	fld	st0				; ST = d d
386	fmul	dword [esi + (10*4)]		; ST = d*data[sample+10] d
387	fadd	dword [edi + (10*4)]		; ST = autoc[10]+d*data[sample+10] d
388	fstp	dword [edi + (10*4)]		; autoc[10]+=d*data[sample+10]  ST = d
389	fld	st0				; ST = d d
390	fmul	dword [esi + ( 9*4)]		; ST = d*data[sample+9] d
391	fadd	dword [edi + ( 9*4)]		; ST = autoc[9]+d*data[sample+9] d
392	fstp	dword [edi + ( 9*4)]		; autoc[9]+=d*data[sample+9]  ST = d
393	fld	st0				; ST = d d
394	fmul	dword [esi + ( 8*4)]		; ST = d*data[sample+8] d
395	fadd	dword [edi + ( 8*4)]		; ST = autoc[8]+d*data[sample+8] d
396	fstp	dword [edi + ( 8*4)]		; autoc[8]+=d*data[sample+8]  ST = d
397	fld	st0				; ST = d d
398	fmul	dword [esi + ( 7*4)]		; ST = d*data[sample+7] d
399	fadd	dword [edi + ( 7*4)]		; ST = autoc[7]+d*data[sample+7] d
400	fstp	dword [edi + ( 7*4)]		; autoc[7]+=d*data[sample+7]  ST = d
401	fld	st0				; ST = d d
402	fmul	dword [esi + ( 6*4)]		; ST = d*data[sample+6] d
403	fadd	dword [edi + ( 6*4)]		; ST = autoc[6]+d*data[sample+6] d
404	fstp	dword [edi + ( 6*4)]		; autoc[6]+=d*data[sample+6]  ST = d
405	fld	st0				; ST = d d
406	fmul	dword [esi + ( 5*4)]		; ST = d*data[sample+4] d
407	fadd	dword [edi + ( 5*4)]		; ST = autoc[4]+d*data[sample+4] d
408	fstp	dword [edi + ( 5*4)]		; autoc[4]+=d*data[sample+4]  ST = d
409	fld	st0				; ST = d d
410	fmul	dword [esi + ( 4*4)]		; ST = d*data[sample+4] d
411	fadd	dword [edi + ( 4*4)]		; ST = autoc[4]+d*data[sample+4] d
412	fstp	dword [edi + ( 4*4)]		; autoc[4]+=d*data[sample+4]  ST = d
413	fld	st0				; ST = d d
414	fmul	dword [esi + ( 3*4)]		; ST = d*data[sample+3] d
415	fadd	dword [edi + ( 3*4)]		; ST = autoc[3]+d*data[sample+3] d
416	fstp	dword [edi + ( 3*4)]		; autoc[3]+=d*data[sample+3]  ST = d
417	fld	st0				; ST = d d
418	fmul	dword [esi + ( 2*4)]		; ST = d*data[sample+2] d
419	fadd	dword [edi + ( 2*4)]		; ST = autoc[2]+d*data[sample+2] d
420	fstp	dword [edi + ( 2*4)]		; autoc[2]+=d*data[sample+2]  ST = d
421	fld	st0				; ST = d d
422	fmul	dword [esi + ( 1*4)]		; ST = d*data[sample+1] d
423	fadd	dword [edi + ( 1*4)]		; ST = autoc[1]+d*data[sample+1] d
424	fstp	dword [edi + ( 1*4)]		; autoc[1]+=d*data[sample+1]  ST = d
425	fld	st0				; ST = d d
426	fmul	dword [esi]			; ST = d*data[sample] d			WATCHOUT: no displacement byte here!
427	fadd	dword [edi]			; ST = autoc[0]+d*data[sample] d	WATCHOUT: no displacement byte here!
428	fstp	dword [edi]			; autoc[0]+=d*data[sample]  ST = d	WATCHOUT: no displacement byte here!
429.jumper2_0:
430
431	fstp	st0				; pop d, ST = empty
432	add	esi, byte 4			; sample++
433	dec	ecx
434	jz	.loop2_end
435	add	edx, byte 11			; adjust our inner loop counter by adjusting the jump target
436	fld	dword [esi]			; ST = d <- data[sample]
437	jmp	edx
438.loop2_end:
439
440.end:
441	pop	ebx
442	pop	edi
443	pop	esi
444	ret
445
446	ALIGN 16
447cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
448	;[esp + 16] == autoc[]
449	;[esp + 12] == lag
450	;[esp + 8] == data_len
451	;[esp + 4] == data[]
452
453	;ASSERT(lag > 0)
454	;ASSERT(lag <= 4)
455	;ASSERT(lag <= data_len)
456
457	;	for(coeff = 0; coeff < lag; coeff++)
458	;		autoc[coeff] = 0.0;
459	xorps	xmm5, xmm5
460
461	mov	edx, [esp + 8]			; edx == data_len
462	mov	eax, [esp + 4]			; eax == &data[sample] <- &data[0]
463
464	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[0]
465	add	eax, 4
466	movaps	xmm2, xmm0			; xmm2 = 0,0,0,data[0]
467	shufps	xmm0, xmm0, 0			; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
468.warmup:					; xmm2 == data[sample-3],data[sample-2],data[sample-1],data[sample]
469	mulps	xmm0, xmm2			; xmm0 = xmm0 * xmm2
470	addps	xmm5, xmm0			; xmm5 += xmm0 * xmm2
471	dec	edx
472	jz	.loop_end
473	ALIGN 16
474.loop_start:
475	; start by reading the next sample
476	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[sample]
477	add	eax, 4
478	shufps	xmm0, xmm0, 0			; xmm0 = data[sample],data[sample],data[sample],data[sample]
479	shufps	xmm2, xmm2, 93h			; 93h=2-1-0-3 => xmm2 gets rotated left by one float
480	movss	xmm2, xmm0
481	mulps	xmm0, xmm2			; xmm0 = xmm0 * xmm2
482	addps	xmm5, xmm0			; xmm5 += xmm0 * xmm2
483	dec	edx
484	jnz	.loop_start
485.loop_end:
486	; store autoc
487	mov	edx, [esp + 16]			; edx == autoc
488	movups	[edx], xmm5
489
490.end:
491	ret
492
493	ALIGN 16
494cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
495	;[esp + 16] == autoc[]
496	;[esp + 12] == lag
497	;[esp + 8] == data_len
498	;[esp + 4] == data[]
499
500	;ASSERT(lag > 0)
501	;ASSERT(lag <= 8)
502	;ASSERT(lag <= data_len)
503
504	;	for(coeff = 0; coeff < lag; coeff++)
505	;		autoc[coeff] = 0.0;
506	xorps	xmm5, xmm5
507	xorps	xmm6, xmm6
508
509	mov	edx, [esp + 8]			; edx == data_len
510	mov	eax, [esp + 4]			; eax == &data[sample] <- &data[0]
511
512	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[0]
513	add	eax, 4
514	movaps	xmm2, xmm0			; xmm2 = 0,0,0,data[0]
515	shufps	xmm0, xmm0, 0			; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
516	movaps	xmm1, xmm0			; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
517	xorps	xmm3, xmm3			; xmm3 = 0,0,0,0
518.warmup:					; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
519	mulps	xmm0, xmm2
520	mulps	xmm1, xmm3			; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
521	addps	xmm5, xmm0
522	addps	xmm6, xmm1			; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
523	dec	edx
524	jz	.loop_end
525	ALIGN 16
526.loop_start:
527	; start by reading the next sample
528	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[sample]
529	; here we reorder the instructions; see the (#) indexes for a logical order
530	shufps	xmm2, xmm2, 93h			; (3) 93h=2-1-0-3 => xmm2 gets rotated left by one float
531	add	eax, 4				; (0)
532	shufps	xmm3, xmm3, 93h			; (4) 93h=2-1-0-3 => xmm3 gets rotated left by one float
533	shufps	xmm0, xmm0, 0			; (1) xmm0 = data[sample],data[sample],data[sample],data[sample]
534	movss	xmm3, xmm2			; (5)
535	movaps	xmm1, xmm0			; (2) xmm1 = data[sample],data[sample],data[sample],data[sample]
536	movss	xmm2, xmm0			; (6)
537	mulps	xmm1, xmm3			; (8)
538	mulps	xmm0, xmm2			; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
539	addps	xmm6, xmm1			; (10)
540	addps	xmm5, xmm0			; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
541	dec	edx
542	jnz	.loop_start
543.loop_end:
544	; store autoc
545	mov	edx, [esp + 16]			; edx == autoc
546	movups	[edx], xmm5
547	movups	[edx + 16], xmm6
548
549.end:
550	ret
551
552	ALIGN 16
553cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
554	;[esp + 16] == autoc[]
555	;[esp + 12] == lag
556	;[esp + 8] == data_len
557	;[esp + 4] == data[]
558
559	;ASSERT(lag > 0)
560	;ASSERT(lag <= 12)
561	;ASSERT(lag <= data_len)
562
563	;	for(coeff = 0; coeff < lag; coeff++)
564	;		autoc[coeff] = 0.0;
565	xorps	xmm5, xmm5
566	xorps	xmm6, xmm6
567	xorps	xmm7, xmm7
568
569	mov	edx, [esp + 8]			; edx == data_len
570	mov	eax, [esp + 4]			; eax == &data[sample] <- &data[0]
571
572	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[0]
573	add	eax, 4
574	movaps	xmm2, xmm0			; xmm2 = 0,0,0,data[0]
575	shufps	xmm0, xmm0, 0			; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
576	xorps	xmm3, xmm3			; xmm3 = 0,0,0,0
577	xorps	xmm4, xmm4			; xmm4 = 0,0,0,0
578.warmup:					; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
579	movaps	xmm1, xmm0
580	mulps	xmm1, xmm2
581	addps	xmm5, xmm1
582	movaps	xmm1, xmm0
583	mulps	xmm1, xmm3
584	addps	xmm6, xmm1
585	mulps	xmm0, xmm4
586	addps	xmm7, xmm0			; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
587	dec	edx
588	jz	.loop_end
589	ALIGN 16
590.loop_start:
591	; start by reading the next sample
592	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[sample]
593	add	eax, 4
594	shufps	xmm0, xmm0, 0			; xmm0 = data[sample],data[sample],data[sample],data[sample]
595
596	; shift xmm4:xmm3:xmm2 left by one float
597	shufps	xmm2, xmm2, 93h			; 93h=2-1-0-3 => xmm2 gets rotated left by one float
598	shufps	xmm3, xmm3, 93h			; 93h=2-1-0-3 => xmm3 gets rotated left by one float
599	shufps	xmm4, xmm4, 93h			; 93h=2-1-0-3 => xmm4 gets rotated left by one float
600	movss	xmm4, xmm3
601	movss	xmm3, xmm2
602	movss	xmm2, xmm0
603
604	; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
605	movaps	xmm1, xmm0
606	mulps	xmm1, xmm2
607	addps	xmm5, xmm1
608	movaps	xmm1, xmm0
609	mulps	xmm1, xmm3
610	addps	xmm6, xmm1
611	mulps	xmm0, xmm4
612	addps	xmm7, xmm0
613
614	dec	edx
615	jnz	.loop_start
616.loop_end:
617	; store autoc
618	mov	edx, [esp + 16]			; edx == autoc
619	movups	[edx], xmm5
620	movups	[edx + 16], xmm6
621	movups	[edx + 32], xmm7
622
623.end:
624	ret
625
626	ALIGN 16
627cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16
628	;[ebp + 20] == autoc[]
629	;[ebp + 16] == lag
630	;[ebp + 12] == data_len
631	;[ebp +  8] == data[]
632	;[esp] == __m128
633	;[esp + 16] == __m128
634
635	push	ebp
636	mov	ebp, esp
637	and	esp, -16 ; stack realign for SSE instructions 'movaps' and 'addps'
638	sub	esp, 32
639
640	;ASSERT(lag > 0)
641	;ASSERT(lag <= 12)
642	;ASSERT(lag <= data_len)
643	;ASSERT(data_len > 0)
644
645	;	for(coeff = 0; coeff < lag; coeff++)
646	;		autoc[coeff] = 0.0;
647	xorps	xmm5, xmm5
648	xorps	xmm6, xmm6
649	movaps	[esp], xmm5
650	movaps	[esp + 16], xmm6
651
652	mov	edx, [ebp + 12]			; edx == data_len
653	mov	eax, [ebp +  8]			; eax == &data[sample] <- &data[0]
654
655	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[0]
656	add	eax, 4
657	movaps	xmm1, xmm0			; xmm1 = 0,0,0,data[0]
658	shufps	xmm0, xmm0, 0		; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
659	xorps	xmm2, xmm2			; xmm2 = 0,0,0,0
660	xorps	xmm3, xmm3			; xmm3 = 0,0,0,0
661	xorps	xmm4, xmm4			; xmm4 = 0,0,0,0
662	movaps	xmm7, xmm0
663	mulps	xmm7, xmm1
664	addps	xmm5, xmm7
665	dec	edx
666	jz	.loop_end
667	ALIGN 16
668.loop_start:
669	; start by reading the next sample
670	movss	xmm0, [eax]				; xmm0 = 0,0,0,data[sample]
671	add	eax, 4
672	shufps	xmm0, xmm0, 0			; xmm0 = data[sample],data[sample],data[sample],data[sample]
673
674	; shift xmm4:xmm3:xmm2:xmm1 left by one float
675	shufps	xmm1, xmm1, 93h
676	shufps	xmm2, xmm2, 93h
677	shufps	xmm3, xmm3, 93h
678	shufps	xmm4, xmm4, 93h
679	movss	xmm4, xmm3
680	movss	xmm3, xmm2
681	movss	xmm2, xmm1
682	movss	xmm1, xmm0
683
684	; xmmB:xmmA:xmm6:xmm5 += xmm0:xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2:xmm1
685	movaps	xmm7, xmm0
686	mulps	xmm7, xmm1
687	addps	xmm5, xmm7
688	movaps	xmm7, xmm0
689	mulps	xmm7, xmm2
690	addps	xmm6, xmm7
691	movaps	xmm7, xmm0
692	mulps	xmm7, xmm3
693	mulps	xmm0, xmm4
694	addps	xmm7, [esp]
695	addps	xmm0, [esp + 16]
696	movaps	[esp], xmm7
697	movaps	[esp + 16], xmm0
698
699	dec	edx
700	jnz	.loop_start
701.loop_end:
702	; store autoc
703	mov	edx, [ebp + 20]				; edx == autoc
704	movups	[edx], xmm5
705	movups	[edx + 16], xmm6
706	movaps	xmm5, [esp]
707	movaps	xmm6, [esp + 16]
708	movups	[edx + 32], xmm5
709	movups	[edx + 48], xmm6
710.end:
711	mov	esp, ebp
712	pop	ebp
713	ret
714
715	ALIGN 16
716cident FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
717	;[ebp + 32] autoc
718	;[ebp + 28] lag
719	;[ebp + 24] data_len
720	;[ebp + 20] data
721
722	push	ebp
723	push	ebx
724	push	esi
725	push	edi
726	mov	ebp, esp
727
728	mov	esi, [ebp + 20]
729	mov	edi, [ebp + 24]
730	mov	edx, [ebp + 28]
731	inc	edx
732	and	edx, byte -2
733	mov	eax, edx
734	neg	eax
735	and	esp, byte -8
736	lea	esp, [esp + 4 * eax]
737	mov	ecx, edx
738	xor	eax, eax
739.loop0:
740	dec	ecx
741	mov	[esp + 4 * ecx], eax
742	jnz	short .loop0
743
744	mov	eax, edi
745	sub	eax, edx
746	mov	ebx, edx
747	and	ebx, byte 1
748	sub	eax, ebx
749	lea	ecx, [esi + 4 * eax - 12]
750	cmp	esi, ecx
751	mov	eax, esi
752	ja	short .loop2_pre
753	ALIGN	16		;4 nops
754.loop1_i:
755	movd	mm0, [eax]
756	movd	mm2, [eax + 4]
757	movd	mm4, [eax + 8]
758	movd	mm6, [eax + 12]
759	mov	ebx, edx
760	punpckldq	mm0, mm0
761	punpckldq	mm2, mm2
762	punpckldq	mm4, mm4
763	punpckldq	mm6, mm6
764	ALIGN	16		;3 nops
765.loop1_j:
766	sub	ebx, byte 2
767	movd	mm1, [eax + 4 * ebx]
768	movd	mm3, [eax + 4 * ebx + 4]
769	movd	mm5, [eax + 4 * ebx + 8]
770	movd	mm7, [eax + 4 * ebx + 12]
771	punpckldq	mm1, mm3
772	punpckldq	mm3, mm5
773	pfmul	mm1, mm0
774	punpckldq	mm5, mm7
775	pfmul	mm3, mm2
776	punpckldq	mm7, [eax + 4 * ebx + 16]
777	pfmul	mm5, mm4
778	pfmul	mm7, mm6
779	pfadd	mm1, mm3
780	movq	mm3, [esp + 4 * ebx]
781	pfadd	mm5, mm7
782	pfadd	mm1, mm5
783	pfadd	mm3, mm1
784	movq	[esp + 4 * ebx], mm3
785	jg	short .loop1_j
786
787	add	eax, byte 16
788	cmp	eax, ecx
789	jb	short .loop1_i
790
791.loop2_pre:
792	mov	ebx, eax
793	sub	eax, esi
794	shr	eax, 2
795	lea	ecx, [esi + 4 * edi]
796	mov	esi, ebx
797.loop2_i:
798	movd	mm0, [esi]
799	mov	ebx, edi
800	sub	ebx, eax
801	cmp	ebx, edx
802	jbe	short .loop2_j
803	mov	ebx, edx
804.loop2_j:
805	dec	ebx
806	movd	mm1, [esi + 4 * ebx]
807	pfmul	mm1, mm0
808	movd	mm2, [esp + 4 * ebx]
809	pfadd	mm1, mm2
810	movd	[esp + 4 * ebx], mm1
811
812	jnz	short .loop2_j
813
814	add	esi, byte 4
815	inc	eax
816	cmp	esi, ecx
817	jnz	short .loop2_i
818
819	mov	edi, [ebp + 32]
820	mov	edx, [ebp + 28]
821.loop3:
822	dec	edx
823	mov	eax, [esp + 4 * edx]
824	mov	[edi + 4 * edx], eax
825	jnz	short .loop3
826
827	femms
828
829	mov	esp, ebp
830	pop	edi
831	pop	esi
832	pop	ebx
833	pop	ebp
834	ret
835
836;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
837;
838;	for(i = 0; i < data_len; i++) {
839;		sum = 0;
840;		for(j = 0; j < order; j++)
841;			sum += qlp_coeff[j] * data[i-j-1];
842;		residual[i] = data[i] - (sum >> lp_quantization);
843;	}
844;
845	ALIGN	16
846cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
847	;[esp + 40]	residual[]
848	;[esp + 36]	lp_quantization
849	;[esp + 32]	order
850	;[esp + 28]	qlp_coeff[]
851	;[esp + 24]	data_len
852	;[esp + 20]	data[]
853
854	;ASSERT(order > 0)
855
856	push	ebp
857	push	ebx
858	push	esi
859	push	edi
860
861	mov	esi, [esp + 20]			; esi = data[]
862	mov	edi, [esp + 40]			; edi = residual[]
863	mov	eax, [esp + 32]			; eax = order
864	mov	ebx, [esp + 24]			; ebx = data_len
865
866	test	ebx, ebx
867	jz	near .end			; do nothing if data_len == 0
868.begin:
869	cmp	eax, byte 1
870	jg	short .i_1more
871
872	mov	ecx, [esp + 28]
873	mov	edx, [ecx]			; edx = qlp_coeff[0]
874	mov	eax, [esi - 4]			; eax = data[-1]
875	mov	ecx, [esp + 36]			; cl = lp_quantization
876	ALIGN	16
877.i_1_loop_i:
878	imul	eax, edx
879	sar	eax, cl
880	neg	eax
881	add	eax, [esi]
882	mov	[edi], eax
883	mov	eax, [esi]
884	add	edi, byte 4
885	add	esi, byte 4
886	dec	ebx
887	jnz	.i_1_loop_i
888
889	jmp	.end
890
891.i_1more:
892	cmp	eax, byte 32			; for order <= 32 there is a faster routine
893	jbe	short .i_32
894
895	; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
896	ALIGN 16
897.i_32more_loop_i:
898	xor	ebp, ebp
899	mov	ecx, [esp + 32]
900	mov	edx, ecx
901	shl	edx, 2
902	add	edx, [esp + 28]
903	neg	ecx
904	ALIGN	16
905.i_32more_loop_j:
906	sub	edx, byte 4
907	mov	eax, [edx]
908	imul	eax, [esi + 4 * ecx]
909	add	ebp, eax
910	inc	ecx
911	jnz	short .i_32more_loop_j
912
913	mov	ecx, [esp + 36]
914	sar	ebp, cl
915	neg	ebp
916	add	ebp, [esi]
917	mov	[edi], ebp
918	add	esi, byte 4
919	add	edi, byte 4
920
921	dec	ebx
922	jnz	.i_32more_loop_i
923
924	jmp	.end
925
926.mov_eip_to_eax:
927	mov	eax, [esp]
928	ret
929
930.i_32:
931	sub	edi, esi
932	neg	eax
933	lea	edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
934	call	.mov_eip_to_eax
935.get_eip0:
936	add	edx, eax
937	inc	edx
938	mov	eax, [esp + 28]			; eax = qlp_coeff[]
939	xor	ebp, ebp
940	jmp	edx
941
942	mov	ecx, [eax + 124]
943	imul	ecx, [esi - 128]
944	add	ebp, ecx
945	mov	ecx, [eax + 120]
946	imul	ecx, [esi - 124]
947	add	ebp, ecx
948	mov	ecx, [eax + 116]
949	imul	ecx, [esi - 120]
950	add	ebp, ecx
951	mov	ecx, [eax + 112]
952	imul	ecx, [esi - 116]
953	add	ebp, ecx
954	mov	ecx, [eax + 108]
955	imul	ecx, [esi - 112]
956	add	ebp, ecx
957	mov	ecx, [eax + 104]
958	imul	ecx, [esi - 108]
959	add	ebp, ecx
960	mov	ecx, [eax + 100]
961	imul	ecx, [esi - 104]
962	add	ebp, ecx
963	mov	ecx, [eax + 96]
964	imul	ecx, [esi - 100]
965	add	ebp, ecx
966	mov	ecx, [eax + 92]
967	imul	ecx, [esi - 96]
968	add	ebp, ecx
969	mov	ecx, [eax + 88]
970	imul	ecx, [esi - 92]
971	add	ebp, ecx
972	mov	ecx, [eax + 84]
973	imul	ecx, [esi - 88]
974	add	ebp, ecx
975	mov	ecx, [eax + 80]
976	imul	ecx, [esi - 84]
977	add	ebp, ecx
978	mov	ecx, [eax + 76]
979	imul	ecx, [esi - 80]
980	add	ebp, ecx
981	mov	ecx, [eax + 72]
982	imul	ecx, [esi - 76]
983	add	ebp, ecx
984	mov	ecx, [eax + 68]
985	imul	ecx, [esi - 72]
986	add	ebp, ecx
987	mov	ecx, [eax + 64]
988	imul	ecx, [esi - 68]
989	add	ebp, ecx
990	mov	ecx, [eax + 60]
991	imul	ecx, [esi - 64]
992	add	ebp, ecx
993	mov	ecx, [eax + 56]
994	imul	ecx, [esi - 60]
995	add	ebp, ecx
996	mov	ecx, [eax + 52]
997	imul	ecx, [esi - 56]
998	add	ebp, ecx
999	mov	ecx, [eax + 48]
1000	imul	ecx, [esi - 52]
1001	add	ebp, ecx
1002	mov	ecx, [eax + 44]
1003	imul	ecx, [esi - 48]
1004	add	ebp, ecx
1005	mov	ecx, [eax + 40]
1006	imul	ecx, [esi - 44]
1007	add	ebp, ecx
1008	mov	ecx, [eax + 36]
1009	imul	ecx, [esi - 40]
1010	add	ebp, ecx
1011	mov	ecx, [eax + 32]
1012	imul	ecx, [esi - 36]
1013	add	ebp, ecx
1014	mov	ecx, [eax + 28]
1015	imul	ecx, [esi - 32]
1016	add	ebp, ecx
1017	mov	ecx, [eax + 24]
1018	imul	ecx, [esi - 28]
1019	add	ebp, ecx
1020	mov	ecx, [eax + 20]
1021	imul	ecx, [esi - 24]
1022	add	ebp, ecx
1023	mov	ecx, [eax + 16]
1024	imul	ecx, [esi - 20]
1025	add	ebp, ecx
1026	mov	ecx, [eax + 12]
1027	imul	ecx, [esi - 16]
1028	add	ebp, ecx
1029	mov	ecx, [eax + 8]
1030	imul	ecx, [esi - 12]
1031	add	ebp, ecx
1032	mov	ecx, [eax + 4]
1033	imul	ecx, [esi - 8]
1034	add	ebp, ecx
1035	mov	ecx, [eax]			; there is one byte missing
1036	imul	ecx, [esi - 4]
1037	add	ebp, ecx
1038.jumper_0:
1039
1040	mov	ecx, [esp + 36]
1041	sar	ebp, cl
1042	neg	ebp
1043	add	ebp, [esi]
1044	mov	[edi + esi], ebp
1045	add	esi, byte 4
1046
1047	dec	ebx
1048	jz	short .end
1049	xor	ebp, ebp
1050	jmp	edx
1051
1052.end:
1053	pop	edi
1054	pop	esi
1055	pop	ebx
1056	pop	ebp
1057	ret
1058
1059; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
1060; the channel and qlp_coeffs must be <= 16.  Especially note that this routine
1061; cannot be used for side-channel coded 16bps channels since the effective bps
1062; is 17.
1063	ALIGN	16
1064cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
1065	;[esp + 40]	residual[]
1066	;[esp + 36]	lp_quantization
1067	;[esp + 32]	order
1068	;[esp + 28]	qlp_coeff[]
1069	;[esp + 24]	data_len
1070	;[esp + 20]	data[]
1071
1072	;ASSERT(order > 0)
1073
1074	push	ebp
1075	push	ebx
1076	push	esi
1077	push	edi
1078
1079	mov	esi, [esp + 20]			; esi = data[]
1080	mov	edi, [esp + 40]			; edi = residual[]
1081	mov	eax, [esp + 32]			; eax = order
1082	mov	ebx, [esp + 24]			; ebx = data_len
1083
1084	test	ebx, ebx
1085	jz	near .end			; do nothing if data_len == 0
1086	dec	ebx
1087	test	ebx, ebx
1088	jz	near .last_one
1089
1090	mov	edx, [esp + 28]			; edx = qlp_coeff[]
1091	movd	mm6, [esp + 36]			; mm6 = 0:lp_quantization
1092	mov	ebp, esp
1093
1094	and	esp, 0xfffffff8
1095
1096	xor	ecx, ecx
1097.copy_qlp_loop:
1098	push	word [edx + 4 * ecx]
1099	inc	ecx
1100	cmp	ecx, eax
1101	jnz	short .copy_qlp_loop
1102
1103	and	ecx, 0x3
1104	test	ecx, ecx
1105	je	short .za_end
1106	sub	ecx, byte 4
1107.za_loop:
1108	push	word 0
1109	inc	eax
1110	inc	ecx
1111	jnz	short .za_loop
1112.za_end:
1113
1114	movq	mm5, [esp + 2 * eax - 8]
1115	movd	mm4, [esi - 16]
1116	punpckldq	mm4, [esi - 12]
1117	movd	mm0, [esi - 8]
1118	punpckldq	mm0, [esi - 4]
1119	packssdw	mm4, mm0
1120
1121	cmp	eax, byte 4
1122	jnbe	short .mmx_4more
1123
1124	ALIGN	16
1125.mmx_4_loop_i:
1126	movd	mm1, [esi]
1127	movq	mm3, mm4
1128	punpckldq	mm1, [esi + 4]
1129	psrlq	mm4, 16
1130	movq	mm0, mm1
1131	psllq	mm0, 48
1132	por	mm4, mm0
1133	movq	mm2, mm4
1134	psrlq	mm4, 16
1135	pxor	mm0, mm0
1136	punpckhdq	mm0, mm1
1137	pmaddwd	mm3, mm5
1138	pmaddwd	mm2, mm5
1139	psllq	mm0, 16
1140	por	mm4, mm0
1141	movq	mm0, mm3
1142	punpckldq	mm3, mm2
1143	punpckhdq	mm0, mm2
1144	paddd	mm3, mm0
1145	psrad	mm3, mm6
1146	psubd	mm1, mm3
1147	movd	[edi], mm1
1148	punpckhdq	mm1, mm1
1149	movd	[edi + 4], mm1
1150
1151	add	edi, byte 8
1152	add	esi, byte 8
1153
1154	sub	ebx, 2
1155	jg	.mmx_4_loop_i
1156	jmp	.mmx_end
1157
1158.mmx_4more:
1159	shl	eax, 2
1160	neg	eax
1161	add	eax, byte 16
1162
1163	ALIGN	16
1164.mmx_4more_loop_i:
1165	movd	mm1, [esi]
1166	punpckldq	mm1, [esi + 4]
1167	movq	mm3, mm4
1168	psrlq	mm4, 16
1169	movq	mm0, mm1
1170	psllq	mm0, 48
1171	por	mm4, mm0
1172	movq	mm2, mm4
1173	psrlq	mm4, 16
1174	pxor	mm0, mm0
1175	punpckhdq	mm0, mm1
1176	pmaddwd	mm3, mm5
1177	pmaddwd	mm2, mm5
1178	psllq	mm0, 16
1179	por	mm4, mm0
1180
1181	mov	ecx, esi
1182	add	ecx, eax
1183	mov	edx, esp
1184
1185	ALIGN	16
1186.mmx_4more_loop_j:
1187	movd	mm0, [ecx - 16]
1188	movd	mm7, [ecx - 8]
1189	punpckldq	mm0, [ecx - 12]
1190	punpckldq	mm7, [ecx - 4]
1191	packssdw	mm0, mm7
1192	pmaddwd	mm0, [edx]
1193	punpckhdq	mm7, mm7
1194	paddd	mm3, mm0
1195	movd	mm0, [ecx - 12]
1196	punpckldq	mm0, [ecx - 8]
1197	punpckldq	mm7, [ecx]
1198	packssdw	mm0, mm7
1199	pmaddwd	mm0, [edx]
1200	paddd	mm2, mm0
1201
1202	add	edx, byte 8
1203	add	ecx, byte 16
1204	cmp	ecx, esi
1205	jnz	.mmx_4more_loop_j
1206
1207	movq	mm0, mm3
1208	punpckldq	mm3, mm2
1209	punpckhdq	mm0, mm2
1210	paddd	mm3, mm0
1211	psrad	mm3, mm6
1212	psubd	mm1, mm3
1213	movd	[edi], mm1
1214	punpckhdq	mm1, mm1
1215	movd	[edi + 4], mm1
1216
1217	add	edi, byte 8
1218	add	esi, byte 8
1219
1220	sub	ebx, 2
1221	jg	near .mmx_4more_loop_i
1222
1223.mmx_end:
1224	emms
1225	mov	esp, ebp
1226.last_one:
1227	mov	eax, [esp + 32]
1228	inc	ebx
1229	jnz	near FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32.begin
1230
1231.end:
1232	pop	edi
1233	pop	esi
1234	pop	ebx
1235	pop	ebp
1236	ret
1237
1238; **********************************************************************
1239;
1240; void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
1241; {
1242; 	unsigned i, j;
1243; 	FLAC__int32 sum;
1244;
1245; 	FLAC__ASSERT(order > 0);
1246;
1247; 	for(i = 0; i < data_len; i++) {
1248; 		sum = 0;
1249; 		for(j = 0; j < order; j++)
1250; 			sum += qlp_coeff[j] * data[i-j-1];
1251; 		data[i] = residual[i] + (sum >> lp_quantization);
1252; 	}
1253; }
1254	ALIGN	16
1255cident FLAC__lpc_restore_signal_asm_ia32
1256	;[esp + 40]	data[]
1257	;[esp + 36]	lp_quantization
1258	;[esp + 32]	order
1259	;[esp + 28]	qlp_coeff[]
1260	;[esp + 24]	data_len
1261	;[esp + 20]	residual[]
1262
1263	;ASSERT(order > 0)
1264
1265	push	ebp
1266	push	ebx
1267	push	esi
1268	push	edi
1269
1270	mov	esi, [esp + 20]			; esi = residual[]
1271	mov	edi, [esp + 40]			; edi = data[]
1272	mov	eax, [esp + 32]			; eax = order
1273	mov	ebx, [esp + 24]			; ebx = data_len
1274
1275	test	ebx, ebx
1276	jz	near .end			; do nothing if data_len == 0
1277
1278.begin:
1279	cmp	eax, byte 1
1280	jg	short .x87_1more
1281
1282	mov	ecx, [esp + 28]
1283	mov	edx, [ecx]
1284	mov	eax, [edi - 4]
1285	mov	ecx, [esp + 36]
1286	ALIGN	16
1287.x87_1_loop_i:
1288	imul	eax, edx
1289	sar	eax, cl
1290	add	eax, [esi]
1291	mov	[edi], eax
1292	add	esi, byte 4
1293	add	edi, byte 4
1294	dec	ebx
1295	jnz	.x87_1_loop_i
1296
1297	jmp	.end
1298
1299.x87_1more:
1300	cmp	eax, byte 32			; for order <= 32 there is a faster routine
1301	jbe	short .x87_32
1302
1303	; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
1304	ALIGN 16
1305.x87_32more_loop_i:
1306	xor	ebp, ebp
1307	mov	ecx, [esp + 32]
1308	mov	edx, ecx
1309	shl	edx, 2
1310	add	edx, [esp + 28]
1311	neg	ecx
1312	ALIGN	16
1313.x87_32more_loop_j:
1314	sub	edx, byte 4
1315	mov	eax, [edx]
1316	imul	eax, [edi + 4 * ecx]
1317	add	ebp, eax
1318	inc	ecx
1319	jnz	short .x87_32more_loop_j
1320
1321	mov	ecx, [esp + 36]
1322	sar	ebp, cl
1323	add	ebp, [esi]
1324	mov	[edi], ebp
1325	add	edi, byte 4
1326	add	esi, byte 4
1327
1328	dec	ebx
1329	jnz	.x87_32more_loop_i
1330
1331	jmp	.end
1332
1333.mov_eip_to_eax:
1334	mov	eax, [esp]
1335	ret
1336
1337.x87_32:
1338	sub	esi, edi
1339	neg	eax
1340	lea	edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
1341	call	.mov_eip_to_eax
1342.get_eip0:
1343	add	edx, eax
1344	inc	edx				; compensate for the shorter opcode on the last iteration
1345	mov	eax, [esp + 28]			; eax = qlp_coeff[]
1346	xor	ebp, ebp
1347	jmp	edx
1348
1349	mov	ecx, [eax + 124]		; ecx =  qlp_coeff[31]
1350	imul	ecx, [edi - 128]		; ecx =  qlp_coeff[31] * data[i-32]
1351	add	ebp, ecx			; sum += qlp_coeff[31] * data[i-32]
1352	mov	ecx, [eax + 120]		; ecx =  qlp_coeff[30]
1353	imul	ecx, [edi - 124]		; ecx =  qlp_coeff[30] * data[i-31]
1354	add	ebp, ecx			; sum += qlp_coeff[30] * data[i-31]
1355	mov	ecx, [eax + 116]		; ecx =  qlp_coeff[29]
1356	imul	ecx, [edi - 120]		; ecx =  qlp_coeff[29] * data[i-30]
1357	add	ebp, ecx			; sum += qlp_coeff[29] * data[i-30]
1358	mov	ecx, [eax + 112]		; ecx =  qlp_coeff[28]
1359	imul	ecx, [edi - 116]		; ecx =  qlp_coeff[28] * data[i-29]
1360	add	ebp, ecx			; sum += qlp_coeff[28] * data[i-29]
1361	mov	ecx, [eax + 108]		; ecx =  qlp_coeff[27]
1362	imul	ecx, [edi - 112]		; ecx =  qlp_coeff[27] * data[i-28]
1363	add	ebp, ecx			; sum += qlp_coeff[27] * data[i-28]
1364	mov	ecx, [eax + 104]		; ecx =  qlp_coeff[26]
1365	imul	ecx, [edi - 108]		; ecx =  qlp_coeff[26] * data[i-27]
1366	add	ebp, ecx			; sum += qlp_coeff[26] * data[i-27]
1367	mov	ecx, [eax + 100]		; ecx =  qlp_coeff[25]
1368	imul	ecx, [edi - 104]		; ecx =  qlp_coeff[25] * data[i-26]
1369	add	ebp, ecx			; sum += qlp_coeff[25] * data[i-26]
1370	mov	ecx, [eax + 96]			; ecx =  qlp_coeff[24]
1371	imul	ecx, [edi - 100]		; ecx =  qlp_coeff[24] * data[i-25]
1372	add	ebp, ecx			; sum += qlp_coeff[24] * data[i-25]
1373	mov	ecx, [eax + 92]			; ecx =  qlp_coeff[23]
1374	imul	ecx, [edi - 96]			; ecx =  qlp_coeff[23] * data[i-24]
1375	add	ebp, ecx			; sum += qlp_coeff[23] * data[i-24]
1376	mov	ecx, [eax + 88]			; ecx =  qlp_coeff[22]
1377	imul	ecx, [edi - 92]			; ecx =  qlp_coeff[22] * data[i-23]
1378	add	ebp, ecx			; sum += qlp_coeff[22] * data[i-23]
1379	mov	ecx, [eax + 84]			; ecx =  qlp_coeff[21]
1380	imul	ecx, [edi - 88]			; ecx =  qlp_coeff[21] * data[i-22]
1381	add	ebp, ecx			; sum += qlp_coeff[21] * data[i-22]
1382	mov	ecx, [eax + 80]			; ecx =  qlp_coeff[20]
1383	imul	ecx, [edi - 84]			; ecx =  qlp_coeff[20] * data[i-21]
1384	add	ebp, ecx			; sum += qlp_coeff[20] * data[i-21]
1385	mov	ecx, [eax + 76]			; ecx =  qlp_coeff[19]
1386	imul	ecx, [edi - 80]			; ecx =  qlp_coeff[19] * data[i-20]
1387	add	ebp, ecx			; sum += qlp_coeff[19] * data[i-20]
1388	mov	ecx, [eax + 72]			; ecx =  qlp_coeff[18]
1389	imul	ecx, [edi - 76]			; ecx =  qlp_coeff[18] * data[i-19]
1390	add	ebp, ecx			; sum += qlp_coeff[18] * data[i-19]
1391	mov	ecx, [eax + 68]			; ecx =  qlp_coeff[17]
1392	imul	ecx, [edi - 72]			; ecx =  qlp_coeff[17] * data[i-18]
1393	add	ebp, ecx			; sum += qlp_coeff[17] * data[i-18]
1394	mov	ecx, [eax + 64]			; ecx =  qlp_coeff[16]
1395	imul	ecx, [edi - 68]			; ecx =  qlp_coeff[16] * data[i-17]
1396	add	ebp, ecx			; sum += qlp_coeff[16] * data[i-17]
1397	mov	ecx, [eax + 60]			; ecx =  qlp_coeff[15]
1398	imul	ecx, [edi - 64]			; ecx =  qlp_coeff[15] * data[i-16]
1399	add	ebp, ecx			; sum += qlp_coeff[15] * data[i-16]
1400	mov	ecx, [eax + 56]			; ecx =  qlp_coeff[14]
1401	imul	ecx, [edi - 60]			; ecx =  qlp_coeff[14] * data[i-15]
1402	add	ebp, ecx			; sum += qlp_coeff[14] * data[i-15]
1403	mov	ecx, [eax + 52]			; ecx =  qlp_coeff[13]
1404	imul	ecx, [edi - 56]			; ecx =  qlp_coeff[13] * data[i-14]
1405	add	ebp, ecx			; sum += qlp_coeff[13] * data[i-14]
1406	mov	ecx, [eax + 48]			; ecx =  qlp_coeff[12]
1407	imul	ecx, [edi - 52]			; ecx =  qlp_coeff[12] * data[i-13]
1408	add	ebp, ecx			; sum += qlp_coeff[12] * data[i-13]
1409	mov	ecx, [eax + 44]			; ecx =  qlp_coeff[11]
1410	imul	ecx, [edi - 48]			; ecx =  qlp_coeff[11] * data[i-12]
1411	add	ebp, ecx			; sum += qlp_coeff[11] * data[i-12]
1412	mov	ecx, [eax + 40]			; ecx =  qlp_coeff[10]
1413	imul	ecx, [edi - 44]			; ecx =  qlp_coeff[10] * data[i-11]
1414	add	ebp, ecx			; sum += qlp_coeff[10] * data[i-11]
1415	mov	ecx, [eax + 36]			; ecx =  qlp_coeff[ 9]
1416	imul	ecx, [edi - 40]			; ecx =  qlp_coeff[ 9] * data[i-10]
1417	add	ebp, ecx			; sum += qlp_coeff[ 9] * data[i-10]
1418	mov	ecx, [eax + 32]			; ecx =  qlp_coeff[ 8]
1419	imul	ecx, [edi - 36]			; ecx =  qlp_coeff[ 8] * data[i- 9]
1420	add	ebp, ecx			; sum += qlp_coeff[ 8] * data[i- 9]
1421	mov	ecx, [eax + 28]			; ecx =  qlp_coeff[ 7]
1422	imul	ecx, [edi - 32]			; ecx =  qlp_coeff[ 7] * data[i- 8]
1423	add	ebp, ecx			; sum += qlp_coeff[ 7] * data[i- 8]
1424	mov	ecx, [eax + 24]			; ecx =  qlp_coeff[ 6]
1425	imul	ecx, [edi - 28]			; ecx =  qlp_coeff[ 6] * data[i- 7]
1426	add	ebp, ecx			; sum += qlp_coeff[ 6] * data[i- 7]
1427	mov	ecx, [eax + 20]			; ecx =  qlp_coeff[ 5]
1428	imul	ecx, [edi - 24]			; ecx =  qlp_coeff[ 5] * data[i- 6]
1429	add	ebp, ecx			; sum += qlp_coeff[ 5] * data[i- 6]
1430	mov	ecx, [eax + 16]			; ecx =  qlp_coeff[ 4]
1431	imul	ecx, [edi - 20]			; ecx =  qlp_coeff[ 4] * data[i- 5]
1432	add	ebp, ecx			; sum += qlp_coeff[ 4] * data[i- 5]
1433	mov	ecx, [eax + 12]			; ecx =  qlp_coeff[ 3]
1434	imul	ecx, [edi - 16]			; ecx =  qlp_coeff[ 3] * data[i- 4]
1435	add	ebp, ecx			; sum += qlp_coeff[ 3] * data[i- 4]
1436	mov	ecx, [eax + 8]			; ecx =  qlp_coeff[ 2]
1437	imul	ecx, [edi - 12]			; ecx =  qlp_coeff[ 2] * data[i- 3]
1438	add	ebp, ecx			; sum += qlp_coeff[ 2] * data[i- 3]
1439	mov	ecx, [eax + 4]			; ecx =  qlp_coeff[ 1]
1440	imul	ecx, [edi - 8]			; ecx =  qlp_coeff[ 1] * data[i- 2]
1441	add	ebp, ecx			; sum += qlp_coeff[ 1] * data[i- 2]
1442	mov	ecx, [eax]			; ecx =  qlp_coeff[ 0] (NOTE: one byte missing from instruction)
1443	imul	ecx, [edi - 4]			; ecx =  qlp_coeff[ 0] * data[i- 1]
1444	add	ebp, ecx			; sum += qlp_coeff[ 0] * data[i- 1]
1445.jumper_0:
1446
1447	mov	ecx, [esp + 36]
1448	sar	ebp, cl				; ebp = (sum >> lp_quantization)
1449	add	ebp, [esi + edi]		; ebp = residual[i] + (sum >> lp_quantization)
1450	mov	[edi], ebp			; data[i] = residual[i] + (sum >> lp_quantization)
1451	add	edi, byte 4
1452
1453	dec	ebx
1454	jz	short .end
1455	xor	ebp, ebp
1456	jmp	edx
1457
1458.end:
1459	pop	edi
1460	pop	esi
1461	pop	ebx
1462	pop	ebp
1463	ret
1464
1465; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
1466; the channel and qlp_coeffs must be <= 16.  Especially note that this routine
1467; cannot be used for side-channel coded 16bps channels since the effective bps
1468; is 17.
1469; WATCHOUT: this routine requires that each data array have a buffer of up to
1470; 3 zeroes in front (at negative indices) for alignment purposes, i.e. for each
1471; channel n, data[n][-1] through data[n][-3] should be accessible and zero.
1472	ALIGN	16
1473cident FLAC__lpc_restore_signal_asm_ia32_mmx
1474	;[esp + 40]	data[]
1475	;[esp + 36]	lp_quantization
1476	;[esp + 32]	order
1477	;[esp + 28]	qlp_coeff[]
1478	;[esp + 24]	data_len
1479	;[esp + 20]	residual[]
1480
1481	;ASSERT(order > 0)
1482
1483	push	ebp
1484	push	ebx
1485	push	esi
1486	push	edi
1487
1488	mov	esi, [esp + 20]
1489	mov	edi, [esp + 40]
1490	mov	eax, [esp + 32]
1491	mov	ebx, [esp + 24]
1492
1493	test	ebx, ebx
1494	jz	near .end			; do nothing if data_len == 0
1495	cmp	eax, byte 4
1496	jb	near FLAC__lpc_restore_signal_asm_ia32.begin
1497
1498	mov	edx, [esp + 28]
1499	movd	mm6, [esp + 36]
1500	mov	ebp, esp
1501
1502	and	esp, 0xfffffff8
1503
1504	xor	ecx, ecx
1505.copy_qlp_loop:
1506	push	word [edx + 4 * ecx]
1507	inc	ecx
1508	cmp	ecx, eax
1509	jnz	short .copy_qlp_loop
1510
1511	and	ecx, 0x3
1512	test	ecx, ecx
1513	je	short .za_end
1514	sub	ecx, byte 4
1515.za_loop:
1516	push	word 0
1517	inc	eax
1518	inc	ecx
1519	jnz	short .za_loop
1520.za_end:
1521
1522	movq	mm5, [esp + 2 * eax - 8]
1523	movd	mm4, [edi - 16]
1524	punpckldq	mm4, [edi - 12]
1525	movd	mm0, [edi - 8]
1526	punpckldq	mm0, [edi - 4]
1527	packssdw	mm4, mm0
1528
1529	cmp	eax, byte 4
1530	jnbe	short .mmx_4more
1531
1532	ALIGN	16
1533.mmx_4_loop_i:
1534	movq	mm7, mm4
1535	pmaddwd	mm7, mm5
1536	movq	mm0, mm7
1537	punpckhdq	mm7, mm7
1538	paddd	mm7, mm0
1539	psrad	mm7, mm6
1540	movd	mm1, [esi]
1541	paddd	mm7, mm1
1542	movd	[edi], mm7
1543	psllq	mm7, 48
1544	psrlq	mm4, 16
1545	por	mm4, mm7
1546
1547	add	esi, byte 4
1548	add	edi, byte 4
1549
1550	dec	ebx
1551	jnz	.mmx_4_loop_i
1552	jmp	.mmx_end
1553.mmx_4more:
1554	shl	eax, 2
1555	neg	eax
1556	add	eax, byte 16
1557	ALIGN	16
1558.mmx_4more_loop_i:
1559	mov	ecx, edi
1560	add	ecx, eax
1561	mov	edx, esp
1562
1563	movq	mm7, mm4
1564	pmaddwd	mm7, mm5
1565
1566	ALIGN	16
1567.mmx_4more_loop_j:
1568	movd	mm0, [ecx - 16]
1569	punpckldq	mm0, [ecx - 12]
1570	movd	mm1, [ecx - 8]
1571	punpckldq	mm1, [ecx - 4]
1572	packssdw	mm0, mm1
1573	pmaddwd	mm0, [edx]
1574	paddd	mm7, mm0
1575
1576	add	edx, byte 8
1577	add	ecx, byte 16
1578	cmp	ecx, edi
1579	jnz	.mmx_4more_loop_j
1580
1581	movq	mm0, mm7
1582	punpckhdq	mm7, mm7
1583	paddd	mm7, mm0
1584	psrad	mm7, mm6
1585	movd	mm1, [esi]
1586	paddd	mm7, mm1
1587	movd	[edi], mm7
1588	psllq	mm7, 48
1589	psrlq	mm4, 16
1590	por	mm4, mm7
1591
1592	add	esi, byte 4
1593	add	edi, byte 4
1594
1595	dec	ebx
1596	jnz	short .mmx_4more_loop_i
1597.mmx_end:
1598	emms
1599	mov	esp, ebp
1600
1601.end:
1602	pop	edi
1603	pop	esi
1604	pop	ebx
1605	pop	ebp
1606	ret
1607
1608
1609; **********************************************************************
1610;
1611;void FLAC__lpc_compute_residual_from_qlp_coefficients_wide(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
1612; {
1613; 	unsigned i, j;
1614; 	FLAC__int64 sum;
1615;
1616; 	FLAC__ASSERT(order > 0);
1617;
1618;	for(i = 0; i < data_len; i++) {
1619;		sum = 0;
1620;		for(j = 0; j < order; j++)
1621;			sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1];
1622;		residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
1623;	}
1624; }
1625	ALIGN	16
1626cident FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32
1627	;[esp + 40]	residual[]
1628	;[esp + 36]	lp_quantization
1629	;[esp + 32]	order
1630	;[esp + 28]	qlp_coeff[]
1631	;[esp + 24]	data_len
1632	;[esp + 20]	data[]
1633
1634	;ASSERT(order > 0)
1635	;ASSERT(order <= 32)
1636	;ASSERT(lp_quantization <= 31)
1637
1638	push	ebp
1639	push	ebx
1640	push	esi
1641	push	edi
1642
1643	mov	ebx, [esp + 24]			; ebx = data_len
1644	test	ebx, ebx
1645	jz	near .end				; do nothing if data_len == 0
1646
1647.begin:
1648	mov	eax, [esp + 32]			; eax = order
1649	cmp	eax, 1
1650	jg	short .i_32
1651
1652	mov	esi, [esp + 40]			; esi = residual[]
1653	mov	edi, [esp + 20]			; edi = data[]
1654	mov	ecx, [esp + 28]			; ecx = qlp_coeff[]
1655	mov	ebp, [ecx]				; ebp = qlp_coeff[0]
1656	mov	eax, [edi - 4]			; eax = data[-1]
1657	mov	ecx, [esp + 36]			; cl = lp_quantization
1658	ALIGN	16
1659.i_1_loop_i:
1660	imul	ebp					; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1]
1661	shrd	eax, edx, cl		; 0 <= lp_quantization <= 15
1662	neg	eax
1663	add	eax, [edi]
1664	mov	[esi], eax
1665	mov	eax, [edi]
1666	add	esi, 4
1667	add	edi, 4
1668	dec	ebx
1669	jnz	.i_1_loop_i
1670	jmp	.end
1671
1672.mov_eip_to_eax:
1673	mov	eax, [esp]
1674	ret
1675
1676.i_32:	; eax = order
1677	neg	eax
1678	add	eax, eax
1679	lea	ebp, [eax + eax * 4 + .jumper_0 - .get_eip0]
1680	call	.mov_eip_to_eax
1681.get_eip0:
1682	add	ebp, eax
1683	inc	ebp				; compensate for the shorter opcode on the last iteration
1684
1685	mov	ebx, [esp + 28]			; ebx = qlp_coeff[]
1686	mov	edi, [esp + 20]			; edi = data[]
1687	sub	[esp + 40], edi			; residual[] -= data[]
1688
1689	xor	ecx, ecx
1690	xor	esi, esi
1691	jmp	ebp
1692
1693;eax = --
1694;edx = --
1695;ecx = 0
1696;esi = 0
1697;
1698;ebx = qlp_coeff[]
1699;edi = data[]
1700;ebp = @address
1701
1702	mov	eax, [ebx + 124]			; eax =  qlp_coeff[31]
1703	imul	dword [edi - 128]		; edx:eax =  qlp_coeff[31] * data[i-32]
1704	add	ecx, eax
1705	adc	esi, edx					; sum += qlp_coeff[31] * data[i-32]
1706
1707	mov	eax, [ebx + 120]			; eax =  qlp_coeff[30]
1708	imul	dword [edi - 124]		; edx:eax =  qlp_coeff[30] * data[i-31]
1709	add	ecx, eax
1710	adc	esi, edx					; sum += qlp_coeff[30] * data[i-31]
1711
1712	mov	eax, [ebx + 116]
1713	imul	dword [edi - 120]
1714	add	ecx, eax
1715	adc	esi, edx
1716
1717	mov	eax, [ebx + 112]
1718	imul	dword [edi - 116]
1719	add	ecx, eax
1720	adc	esi, edx
1721
1722	mov	eax, [ebx + 108]
1723	imul	dword [edi - 112]
1724	add	ecx, eax
1725	adc	esi, edx
1726
1727	mov	eax, [ebx + 104]
1728	imul	dword [edi - 108]
1729	add	ecx, eax
1730	adc	esi, edx
1731
1732	mov	eax, [ebx + 100]
1733	imul	dword [edi - 104]
1734	add	ecx, eax
1735	adc	esi, edx
1736
1737	mov	eax, [ebx + 96]
1738	imul	dword [edi - 100]
1739	add	ecx, eax
1740	adc	esi, edx
1741
1742	mov	eax, [ebx + 92]
1743	imul	dword [edi - 96]
1744	add	ecx, eax
1745	adc	esi, edx
1746
1747	mov	eax, [ebx + 88]
1748	imul	dword [edi - 92]
1749	add	ecx, eax
1750	adc	esi, edx
1751
1752	mov	eax, [ebx + 84]
1753	imul	dword [edi - 88]
1754	add	ecx, eax
1755	adc	esi, edx
1756
1757	mov	eax, [ebx + 80]
1758	imul	dword [edi - 84]
1759	add	ecx, eax
1760	adc	esi, edx
1761
1762	mov	eax, [ebx + 76]
1763	imul	dword [edi - 80]
1764	add	ecx, eax
1765	adc	esi, edx
1766
1767	mov	eax, [ebx + 72]
1768	imul	dword [edi - 76]
1769	add	ecx, eax
1770	adc	esi, edx
1771
1772	mov	eax, [ebx + 68]
1773	imul	dword [edi - 72]
1774	add	ecx, eax
1775	adc	esi, edx
1776
1777	mov	eax, [ebx + 64]
1778	imul	dword [edi - 68]
1779	add	ecx, eax
1780	adc	esi, edx
1781
1782	mov	eax, [ebx + 60]
1783	imul	dword [edi - 64]
1784	add	ecx, eax
1785	adc	esi, edx
1786
1787	mov	eax, [ebx + 56]
1788	imul	dword [edi - 60]
1789	add	ecx, eax
1790	adc	esi, edx
1791
1792	mov	eax, [ebx + 52]
1793	imul	dword [edi - 56]
1794	add	ecx, eax
1795	adc	esi, edx
1796
1797	mov	eax, [ebx + 48]
1798	imul	dword [edi - 52]
1799	add	ecx, eax
1800	adc	esi, edx
1801
1802	mov	eax, [ebx + 44]
1803	imul	dword [edi - 48]
1804	add	ecx, eax
1805	adc	esi, edx
1806
1807	mov	eax, [ebx + 40]
1808	imul	dword [edi - 44]
1809	add	ecx, eax
1810	adc	esi, edx
1811
1812	mov	eax, [ebx + 36]
1813	imul	dword [edi - 40]
1814	add	ecx, eax
1815	adc	esi, edx
1816
1817	mov	eax, [ebx + 32]
1818	imul	dword [edi - 36]
1819	add	ecx, eax
1820	adc	esi, edx
1821
1822	mov	eax, [ebx + 28]
1823	imul	dword [edi - 32]
1824	add	ecx, eax
1825	adc	esi, edx
1826
1827	mov	eax, [ebx + 24]
1828	imul	dword [edi - 28]
1829	add	ecx, eax
1830	adc	esi, edx
1831
1832	mov	eax, [ebx + 20]
1833	imul	dword [edi - 24]
1834	add	ecx, eax
1835	adc	esi, edx
1836
1837	mov	eax, [ebx + 16]
1838	imul	dword [edi - 20]
1839	add	ecx, eax
1840	adc	esi, edx
1841
1842	mov	eax, [ebx + 12]
1843	imul	dword [edi - 16]
1844	add	ecx, eax
1845	adc	esi, edx
1846
1847	mov	eax, [ebx + 8]
1848	imul	dword [edi - 12]
1849	add	ecx, eax
1850	adc	esi, edx
1851
1852	mov	eax, [ebx + 4]
1853	imul	dword [edi - 8]
1854	add	ecx, eax
1855	adc	esi, edx
1856
1857	mov	eax, [ebx]					; eax =  qlp_coeff[ 0] (NOTE: one byte missing from instruction)
1858	imul	dword [edi - 4]			; edx:eax =  qlp_coeff[ 0] * data[i- 1]
1859	add	ecx, eax
1860	adc	esi, edx					; sum += qlp_coeff[ 0] * data[i- 1]
1861
1862.jumper_0:
1863	mov	edx, ecx
1864;esi:edx = sum
1865	mov	ecx, [esp + 36]			; cl = lp_quantization
1866	shrd	edx, esi, cl		; edx = (sum >> lp_quantization)
1867;eax = --
1868;ecx = --
1869;edx = sum >> lp_q
1870;esi = --
1871	neg	edx						; edx = -(sum >> lp_quantization)
1872	mov	eax, [esp + 40]			; residual[] - data[]
1873	add	edx, [edi]				; edx = data[i] - (sum >> lp_quantization)
1874	mov	[edi + eax], edx
1875	add	edi, 4
1876
1877	dec	dword [esp + 24]
1878	jz	short .end
1879	xor	ecx, ecx
1880	xor	esi, esi
1881	jmp	ebp
1882
1883.end:
1884	pop	edi
1885	pop	esi
1886	pop	ebx
1887	pop	ebp
1888	ret
1889
1890; **********************************************************************
1891;
1892; void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
1893; {
1894; 	unsigned i, j;
1895; 	FLAC__int64 sum;
1896;
1897; 	FLAC__ASSERT(order > 0);
1898;
1899; 	for(i = 0; i < data_len; i++) {
1900; 		sum = 0;
1901; 		for(j = 0; j < order; j++)
1902; 			sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1];
1903; 		data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization);
1904; 	}
1905; }
1906	ALIGN	16
1907cident FLAC__lpc_restore_signal_wide_asm_ia32
1908	;[esp + 40]	data[]
1909	;[esp + 36]	lp_quantization
1910	;[esp + 32]	order
1911	;[esp + 28]	qlp_coeff[]
1912	;[esp + 24]	data_len
1913	;[esp + 20]	residual[]
1914
1915	;ASSERT(order > 0)
1916	;ASSERT(order <= 32)
1917	;ASSERT(lp_quantization <= 31)
1918
1919	push	ebp
1920	push	ebx
1921	push	esi
1922	push	edi
1923
1924	mov	ebx, [esp + 24]			; ebx = data_len
1925	test	ebx, ebx
1926	jz	near .end				; do nothing if data_len == 0
1927
1928.begin:
1929	mov	eax, [esp + 32]			; eax = order
1930	cmp	eax, 1
1931	jg	short .x87_32
1932
1933	mov	esi, [esp + 20]			; esi = residual[]
1934	mov	edi, [esp + 40]			; edi = data[]
1935	mov	ecx, [esp + 28]			; ecx = qlp_coeff[]
1936	mov	ebp, [ecx]				; ebp = qlp_coeff[0]
1937	mov	eax, [edi - 4]			; eax = data[-1]
1938	mov	ecx, [esp + 36]			; cl = lp_quantization
1939	ALIGN	16
1940.x87_1_loop_i:
1941	imul	ebp					; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1]
1942	shrd	eax, edx, cl		; 0 <= lp_quantization <= 15
1943;
1944	add	eax, [esi]
1945	mov	[edi], eax
1946;
1947	add	esi, 4
1948	add	edi, 4
1949	dec	ebx
1950	jnz	.x87_1_loop_i
1951	jmp	.end
1952
1953.mov_eip_to_eax:
1954	mov	eax, [esp]
1955	ret
1956
1957.x87_32:	; eax = order
1958	neg	eax
1959	add	eax, eax
1960	lea	ebp, [eax + eax * 4 + .jumper_0 - .get_eip0]
1961	call	.mov_eip_to_eax
1962.get_eip0:
1963	add	ebp, eax
1964	inc	ebp				; compensate for the shorter opcode on the last iteration
1965
1966	mov	ebx, [esp + 28]			; ebx = qlp_coeff[]
1967	mov	edi, [esp + 40]			; esi = data[]
1968	sub	[esp + 20], edi			; residual[] -= data[]
1969
1970	xor	ecx, ecx
1971	xor	esi, esi
1972	jmp	ebp
1973
1974;eax = --
1975;edx = --
1976;ecx = 0
1977;esi = 0
1978;
1979;ebx = qlp_coeff[]
1980;edi = data[]
1981;ebp = @address
1982
1983	mov	eax, [ebx + 124]			; eax =  qlp_coeff[31]
1984	imul	dword [edi - 128]		; edx:eax =  qlp_coeff[31] * data[i-32]
1985	add	ecx, eax
1986	adc	esi, edx					; sum += qlp_coeff[31] * data[i-32]
1987
1988	mov	eax, [ebx + 120]			; eax =  qlp_coeff[30]
1989	imul	dword [edi - 124]		; edx:eax =  qlp_coeff[30] * data[i-31]
1990	add	ecx, eax
1991	adc	esi, edx					; sum += qlp_coeff[30] * data[i-31]
1992
1993	mov	eax, [ebx + 116]
1994	imul	dword [edi - 120]
1995	add	ecx, eax
1996	adc	esi, edx
1997
1998	mov	eax, [ebx + 112]
1999	imul	dword [edi - 116]
2000	add	ecx, eax
2001	adc	esi, edx
2002
2003	mov	eax, [ebx + 108]
2004	imul	dword [edi - 112]
2005	add	ecx, eax
2006	adc	esi, edx
2007
2008	mov	eax, [ebx + 104]
2009	imul	dword [edi - 108]
2010	add	ecx, eax
2011	adc	esi, edx
2012
2013	mov	eax, [ebx + 100]
2014	imul	dword [edi - 104]
2015	add	ecx, eax
2016	adc	esi, edx
2017
2018	mov	eax, [ebx + 96]
2019	imul	dword [edi - 100]
2020	add	ecx, eax
2021	adc	esi, edx
2022
2023	mov	eax, [ebx + 92]
2024	imul	dword [edi - 96]
2025	add	ecx, eax
2026	adc	esi, edx
2027
2028	mov	eax, [ebx + 88]
2029	imul	dword [edi - 92]
2030	add	ecx, eax
2031	adc	esi, edx
2032
2033	mov	eax, [ebx + 84]
2034	imul	dword [edi - 88]
2035	add	ecx, eax
2036	adc	esi, edx
2037
2038	mov	eax, [ebx + 80]
2039	imul	dword [edi - 84]
2040	add	ecx, eax
2041	adc	esi, edx
2042
2043	mov	eax, [ebx + 76]
2044	imul	dword [edi - 80]
2045	add	ecx, eax
2046	adc	esi, edx
2047
2048	mov	eax, [ebx + 72]
2049	imul	dword [edi - 76]
2050	add	ecx, eax
2051	adc	esi, edx
2052
2053	mov	eax, [ebx + 68]
2054	imul	dword [edi - 72]
2055	add	ecx, eax
2056	adc	esi, edx
2057
2058	mov	eax, [ebx + 64]
2059	imul	dword [edi - 68]
2060	add	ecx, eax
2061	adc	esi, edx
2062
2063	mov	eax, [ebx + 60]
2064	imul	dword [edi - 64]
2065	add	ecx, eax
2066	adc	esi, edx
2067
2068	mov	eax, [ebx + 56]
2069	imul	dword [edi - 60]
2070	add	ecx, eax
2071	adc	esi, edx
2072
2073	mov	eax, [ebx + 52]
2074	imul	dword [edi - 56]
2075	add	ecx, eax
2076	adc	esi, edx
2077
2078	mov	eax, [ebx + 48]
2079	imul	dword [edi - 52]
2080	add	ecx, eax
2081	adc	esi, edx
2082
2083	mov	eax, [ebx + 44]
2084	imul	dword [edi - 48]
2085	add	ecx, eax
2086	adc	esi, edx
2087
2088	mov	eax, [ebx + 40]
2089	imul	dword [edi - 44]
2090	add	ecx, eax
2091	adc	esi, edx
2092
2093	mov	eax, [ebx + 36]
2094	imul	dword [edi - 40]
2095	add	ecx, eax
2096	adc	esi, edx
2097
2098	mov	eax, [ebx + 32]
2099	imul	dword [edi - 36]
2100	add	ecx, eax
2101	adc	esi, edx
2102
2103	mov	eax, [ebx + 28]
2104	imul	dword [edi - 32]
2105	add	ecx, eax
2106	adc	esi, edx
2107
2108	mov	eax, [ebx + 24]
2109	imul	dword [edi - 28]
2110	add	ecx, eax
2111	adc	esi, edx
2112
2113	mov	eax, [ebx + 20]
2114	imul	dword [edi - 24]
2115	add	ecx, eax
2116	adc	esi, edx
2117
2118	mov	eax, [ebx + 16]
2119	imul	dword [edi - 20]
2120	add	ecx, eax
2121	adc	esi, edx
2122
2123	mov	eax, [ebx + 12]
2124	imul	dword [edi - 16]
2125	add	ecx, eax
2126	adc	esi, edx
2127
2128	mov	eax, [ebx + 8]
2129	imul	dword [edi - 12]
2130	add	ecx, eax
2131	adc	esi, edx
2132
2133	mov	eax, [ebx + 4]
2134	imul	dword [edi - 8]
2135	add	ecx, eax
2136	adc	esi, edx
2137
2138	mov	eax, [ebx]					; eax =  qlp_coeff[ 0] (NOTE: one byte missing from instruction)
2139	imul	dword [edi - 4]			; edx:eax =  qlp_coeff[ 0] * data[i- 1]
2140	add	ecx, eax
2141	adc	esi, edx					; sum += qlp_coeff[ 0] * data[i- 1]
2142
2143.jumper_0:
2144	mov	edx, ecx
2145;esi:edx = sum
2146	mov	ecx, [esp + 36]			; cl = lp_quantization
2147	shrd	edx, esi, cl		; edx = (sum >> lp_quantization)
2148;eax = --
2149;ecx = --
2150;edx = sum >> lp_q
2151;esi = --
2152;
2153	mov	eax, [esp + 20]			; residual[] - data[]
2154	add	edx, [edi + eax]		; edx = residual[i] + (sum >> lp_quantization)
2155	mov	[edi], edx				; data[i] = residual[i] + (sum >> lp_quantization)
2156	add	edi, 4
2157
2158	dec	dword [esp + 24]
2159	jz	short .end
2160	xor	ecx, ecx
2161	xor	esi, esi
2162	jmp	ebp
2163
2164.end:
2165	pop	edi
2166	pop	esi
2167	pop	ebx
2168	pop	ebp
2169	ret
2170
2171; end
2172