1;  vim:filetype=nasm ts=8
2
3;  libFLAC - Free Lossless Audio Codec library
4;  Copyright (C) 2001-2009  Josh Coalson
5;  Copyright (C) 2011-2014  Xiph.Org Foundation
6;
7;  Redistribution and use in source and binary forms, with or without
8;  modification, are permitted provided that the following conditions
9;  are met:
10;
11;  - Redistributions of source code must retain the above copyright
12;  notice, this list of conditions and the following disclaimer.
13;
14;  - Redistributions in binary form must reproduce the above copyright
15;  notice, this list of conditions and the following disclaimer in the
16;  documentation and/or other materials provided with the distribution.
17;
18;  - Neither the name of the Xiph.org Foundation nor the names of its
19;  contributors may be used to endorse or promote products derived from
20;  this software without specific prior written permission.
21;
22;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23;  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25;  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
26;  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
27;  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
28;  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
29;  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
30;  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31;  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32;  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33
34%include "nasm.h"
35
36	data_section
37
38cglobal FLAC__lpc_compute_autocorrelation_asm_ia32
39cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
40cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
41cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
42cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16
43cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
44cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
45cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32
46cglobal FLAC__lpc_restore_signal_asm_ia32
47cglobal FLAC__lpc_restore_signal_asm_ia32_mmx
48cglobal FLAC__lpc_restore_signal_wide_asm_ia32
49
50	code_section
51
52; **********************************************************************
53;
54; void FLAC__lpc_compute_autocorrelation_asm(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
55; {
56;	FLAC__real d;
57;	unsigned sample, coeff;
58;	const unsigned limit = data_len - lag;
59;
60;	FLAC__ASSERT(lag > 0);
61;	FLAC__ASSERT(lag <= data_len);
62;
63;	for(coeff = 0; coeff < lag; coeff++)
64;		autoc[coeff] = 0.0;
65;	for(sample = 0; sample <= limit; sample++) {
66;		d = data[sample];
67;		for(coeff = 0; coeff < lag; coeff++)
68;			autoc[coeff] += d * data[sample+coeff];
69;	}
70;	for(; sample < data_len; sample++) {
71;		d = data[sample];
72;		for(coeff = 0; coeff < data_len - sample; coeff++)
73;			autoc[coeff] += d * data[sample+coeff];
74;	}
75; }
76;
77	ALIGN 16
78cident FLAC__lpc_compute_autocorrelation_asm_ia32
79	;[esp + 28] == autoc[]
80	;[esp + 24] == lag
81	;[esp + 20] == data_len
82	;[esp + 16] == data[]
83
84	;ASSERT(lag > 0)
85	;ASSERT(lag <= 33)
86	;ASSERT(lag <= data_len)
87
88.begin:
89	push	esi
90	push	edi
91	push	ebx
92
93	;	for(coeff = 0; coeff < lag; coeff++)
94	;		autoc[coeff] = 0.0;
95	mov	edi, [esp + 28]			; edi == autoc
96	mov	ecx, [esp + 24]			; ecx = # of dwords (=lag) of 0 to write
97	xor	eax, eax
98	rep	stosd
99
100	;	const unsigned limit = data_len - lag;
101	mov	eax, [esp + 24]			; eax == lag
102	mov	ecx, [esp + 20]
103	sub	ecx, eax			; ecx == limit
104
105	mov	edi, [esp + 28]			; edi == autoc
106	mov	esi, [esp + 16]			; esi == data
	inc	ecx				; the loop runs while sample <= limit, so add one to the counter
108
109	;	for(sample = 0; sample <= limit; sample++) {
110	;		d = data[sample];
111	;		for(coeff = 0; coeff < lag; coeff++)
112	;			autoc[coeff] += d * data[sample+coeff];
113	;	}
114	fld	dword [esi]			; ST = d <- data[sample]
	; each iteration is 11 bytes, so we need (-eax)*11; compute it as (-12*eax + eax)
116	lea	edx, [eax + eax*2]
117	neg	edx
118	lea	edx, [eax + edx*4 + .jumper1_0 - .get_eip1]
119	call	.mov_eip_to_ebx
120.get_eip1:
121	add	edx, ebx
122	inc	edx				; compensate for the shorter opcode on the last iteration
123	inc	edx				; compensate for the shorter opcode on the last iteration
124	inc	edx				; compensate for the shorter opcode on the last iteration
125	cmp	eax, 33
126	jne	.loop1_start
127	sub	edx, byte 9			; compensate for the longer opcodes on the first iteration
128.loop1_start:
129	jmp	edx
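	; Summary of the computed jump above: each of the 33 unrolled blocks below
	; (coeff 32 down to coeff 0) is 11 bytes, except the first, whose three memory
	; operands need 4-byte displacements (9 bytes longer), and the last, which
	; needs no displacement bytes (3 bytes shorter).  So edx = .jumper1_0 - 11*lag + 3,
	; minus 9 more when lag == 33, which makes the jump execute exactly the last
	; `lag' blocks for this sample.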
130
131.mov_eip_to_ebx:
132	mov	ebx, [esp]
133	ret
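	; (The call/ret pair above is the usual IA-32 idiom for reading EIP: the
	; return address pushed by the call is the address of the .get_eip* label
	; that follows it, which lets the jump targets above be computed
	; position-independently.)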
134
135	fld	st0				; ST = d d
136	fmul	dword [esi + (32*4)]		; ST = d*data[sample+32] d		WATCHOUT: not a byte displacement here!
137	fadd	dword [edi + (32*4)]		; ST = autoc[32]+d*data[sample+32] d	WATCHOUT: not a byte displacement here!
138	fstp	dword [edi + (32*4)]		; autoc[32]+=d*data[sample+32]  ST = d	WATCHOUT: not a byte displacement here!
139	fld	st0				; ST = d d
140	fmul	dword [esi + (31*4)]		; ST = d*data[sample+31] d
141	fadd	dword [edi + (31*4)]		; ST = autoc[31]+d*data[sample+31] d
142	fstp	dword [edi + (31*4)]		; autoc[31]+=d*data[sample+31]  ST = d
143	fld	st0				; ST = d d
144	fmul	dword [esi + (30*4)]		; ST = d*data[sample+30] d
145	fadd	dword [edi + (30*4)]		; ST = autoc[30]+d*data[sample+30] d
146	fstp	dword [edi + (30*4)]		; autoc[30]+=d*data[sample+30]  ST = d
147	fld	st0				; ST = d d
148	fmul	dword [esi + (29*4)]		; ST = d*data[sample+29] d
149	fadd	dword [edi + (29*4)]		; ST = autoc[29]+d*data[sample+29] d
150	fstp	dword [edi + (29*4)]		; autoc[29]+=d*data[sample+29]  ST = d
151	fld	st0				; ST = d d
152	fmul	dword [esi + (28*4)]		; ST = d*data[sample+28] d
153	fadd	dword [edi + (28*4)]		; ST = autoc[28]+d*data[sample+28] d
154	fstp	dword [edi + (28*4)]		; autoc[28]+=d*data[sample+28]  ST = d
155	fld	st0				; ST = d d
156	fmul	dword [esi + (27*4)]		; ST = d*data[sample+27] d
157	fadd	dword [edi + (27*4)]		; ST = autoc[27]+d*data[sample+27] d
158	fstp	dword [edi + (27*4)]		; autoc[27]+=d*data[sample+27]  ST = d
159	fld	st0				; ST = d d
160	fmul	dword [esi + (26*4)]		; ST = d*data[sample+26] d
161	fadd	dword [edi + (26*4)]		; ST = autoc[26]+d*data[sample+26] d
162	fstp	dword [edi + (26*4)]		; autoc[26]+=d*data[sample+26]  ST = d
163	fld	st0				; ST = d d
164	fmul	dword [esi + (25*4)]		; ST = d*data[sample+25] d
165	fadd	dword [edi + (25*4)]		; ST = autoc[25]+d*data[sample+25] d
166	fstp	dword [edi + (25*4)]		; autoc[25]+=d*data[sample+25]  ST = d
167	fld	st0				; ST = d d
168	fmul	dword [esi + (24*4)]		; ST = d*data[sample+24] d
169	fadd	dword [edi + (24*4)]		; ST = autoc[24]+d*data[sample+24] d
170	fstp	dword [edi + (24*4)]		; autoc[24]+=d*data[sample+24]  ST = d
171	fld	st0				; ST = d d
172	fmul	dword [esi + (23*4)]		; ST = d*data[sample+23] d
173	fadd	dword [edi + (23*4)]		; ST = autoc[23]+d*data[sample+23] d
174	fstp	dword [edi + (23*4)]		; autoc[23]+=d*data[sample+23]  ST = d
175	fld	st0				; ST = d d
176	fmul	dword [esi + (22*4)]		; ST = d*data[sample+22] d
177	fadd	dword [edi + (22*4)]		; ST = autoc[22]+d*data[sample+22] d
178	fstp	dword [edi + (22*4)]		; autoc[22]+=d*data[sample+22]  ST = d
179	fld	st0				; ST = d d
180	fmul	dword [esi + (21*4)]		; ST = d*data[sample+21] d
181	fadd	dword [edi + (21*4)]		; ST = autoc[21]+d*data[sample+21] d
182	fstp	dword [edi + (21*4)]		; autoc[21]+=d*data[sample+21]  ST = d
183	fld	st0				; ST = d d
184	fmul	dword [esi + (20*4)]		; ST = d*data[sample+20] d
185	fadd	dword [edi + (20*4)]		; ST = autoc[20]+d*data[sample+20] d
186	fstp	dword [edi + (20*4)]		; autoc[20]+=d*data[sample+20]  ST = d
187	fld	st0				; ST = d d
188	fmul	dword [esi + (19*4)]		; ST = d*data[sample+19] d
189	fadd	dword [edi + (19*4)]		; ST = autoc[19]+d*data[sample+19] d
190	fstp	dword [edi + (19*4)]		; autoc[19]+=d*data[sample+19]  ST = d
191	fld	st0				; ST = d d
192	fmul	dword [esi + (18*4)]		; ST = d*data[sample+18] d
193	fadd	dword [edi + (18*4)]		; ST = autoc[18]+d*data[sample+18] d
194	fstp	dword [edi + (18*4)]		; autoc[18]+=d*data[sample+18]  ST = d
195	fld	st0				; ST = d d
196	fmul	dword [esi + (17*4)]		; ST = d*data[sample+17] d
197	fadd	dword [edi + (17*4)]		; ST = autoc[17]+d*data[sample+17] d
198	fstp	dword [edi + (17*4)]		; autoc[17]+=d*data[sample+17]  ST = d
199	fld	st0				; ST = d d
200	fmul	dword [esi + (16*4)]		; ST = d*data[sample+16] d
201	fadd	dword [edi + (16*4)]		; ST = autoc[16]+d*data[sample+16] d
202	fstp	dword [edi + (16*4)]		; autoc[16]+=d*data[sample+16]  ST = d
203	fld	st0				; ST = d d
204	fmul	dword [esi + (15*4)]		; ST = d*data[sample+15] d
205	fadd	dword [edi + (15*4)]		; ST = autoc[15]+d*data[sample+15] d
206	fstp	dword [edi + (15*4)]		; autoc[15]+=d*data[sample+15]  ST = d
207	fld	st0				; ST = d d
208	fmul	dword [esi + (14*4)]		; ST = d*data[sample+14] d
209	fadd	dword [edi + (14*4)]		; ST = autoc[14]+d*data[sample+14] d
210	fstp	dword [edi + (14*4)]		; autoc[14]+=d*data[sample+14]  ST = d
211	fld	st0				; ST = d d
212	fmul	dword [esi + (13*4)]		; ST = d*data[sample+13] d
213	fadd	dword [edi + (13*4)]		; ST = autoc[13]+d*data[sample+13] d
214	fstp	dword [edi + (13*4)]		; autoc[13]+=d*data[sample+13]  ST = d
215	fld	st0				; ST = d d
216	fmul	dword [esi + (12*4)]		; ST = d*data[sample+12] d
217	fadd	dword [edi + (12*4)]		; ST = autoc[12]+d*data[sample+12] d
218	fstp	dword [edi + (12*4)]		; autoc[12]+=d*data[sample+12]  ST = d
219	fld	st0				; ST = d d
220	fmul	dword [esi + (11*4)]		; ST = d*data[sample+11] d
221	fadd	dword [edi + (11*4)]		; ST = autoc[11]+d*data[sample+11] d
222	fstp	dword [edi + (11*4)]		; autoc[11]+=d*data[sample+11]  ST = d
223	fld	st0				; ST = d d
224	fmul	dword [esi + (10*4)]		; ST = d*data[sample+10] d
225	fadd	dword [edi + (10*4)]		; ST = autoc[10]+d*data[sample+10] d
226	fstp	dword [edi + (10*4)]		; autoc[10]+=d*data[sample+10]  ST = d
227	fld	st0				; ST = d d
228	fmul	dword [esi + ( 9*4)]		; ST = d*data[sample+9] d
229	fadd	dword [edi + ( 9*4)]		; ST = autoc[9]+d*data[sample+9] d
230	fstp	dword [edi + ( 9*4)]		; autoc[9]+=d*data[sample+9]  ST = d
231	fld	st0				; ST = d d
232	fmul	dword [esi + ( 8*4)]		; ST = d*data[sample+8] d
233	fadd	dword [edi + ( 8*4)]		; ST = autoc[8]+d*data[sample+8] d
234	fstp	dword [edi + ( 8*4)]		; autoc[8]+=d*data[sample+8]  ST = d
235	fld	st0				; ST = d d
236	fmul	dword [esi + ( 7*4)]		; ST = d*data[sample+7] d
237	fadd	dword [edi + ( 7*4)]		; ST = autoc[7]+d*data[sample+7] d
238	fstp	dword [edi + ( 7*4)]		; autoc[7]+=d*data[sample+7]  ST = d
239	fld	st0				; ST = d d
240	fmul	dword [esi + ( 6*4)]		; ST = d*data[sample+6] d
241	fadd	dword [edi + ( 6*4)]		; ST = autoc[6]+d*data[sample+6] d
242	fstp	dword [edi + ( 6*4)]		; autoc[6]+=d*data[sample+6]  ST = d
243	fld	st0				; ST = d d
	fmul	dword [esi + ( 5*4)]		; ST = d*data[sample+5] d
	fadd	dword [edi + ( 5*4)]		; ST = autoc[5]+d*data[sample+5] d
	fstp	dword [edi + ( 5*4)]		; autoc[5]+=d*data[sample+5]  ST = d
247	fld	st0				; ST = d d
248	fmul	dword [esi + ( 4*4)]		; ST = d*data[sample+4] d
249	fadd	dword [edi + ( 4*4)]		; ST = autoc[4]+d*data[sample+4] d
250	fstp	dword [edi + ( 4*4)]		; autoc[4]+=d*data[sample+4]  ST = d
251	fld	st0				; ST = d d
252	fmul	dword [esi + ( 3*4)]		; ST = d*data[sample+3] d
253	fadd	dword [edi + ( 3*4)]		; ST = autoc[3]+d*data[sample+3] d
254	fstp	dword [edi + ( 3*4)]		; autoc[3]+=d*data[sample+3]  ST = d
255	fld	st0				; ST = d d
256	fmul	dword [esi + ( 2*4)]		; ST = d*data[sample+2] d
257	fadd	dword [edi + ( 2*4)]		; ST = autoc[2]+d*data[sample+2] d
258	fstp	dword [edi + ( 2*4)]		; autoc[2]+=d*data[sample+2]  ST = d
259	fld	st0				; ST = d d
260	fmul	dword [esi + ( 1*4)]		; ST = d*data[sample+1] d
261	fadd	dword [edi + ( 1*4)]		; ST = autoc[1]+d*data[sample+1] d
262	fstp	dword [edi + ( 1*4)]		; autoc[1]+=d*data[sample+1]  ST = d
263	fld	st0				; ST = d d
264	fmul	dword [esi]			; ST = d*data[sample] d			WATCHOUT: no displacement byte here!
265	fadd	dword [edi]			; ST = autoc[0]+d*data[sample] d	WATCHOUT: no displacement byte here!
266	fstp	dword [edi]			; autoc[0]+=d*data[sample]  ST = d	WATCHOUT: no displacement byte here!
267.jumper1_0:
268
269	fstp	st0				; pop d, ST = empty
270	add	esi, byte 4			; sample++
271	dec	ecx
272	jz	.loop1_end
273	fld	dword [esi]			; ST = d <- data[sample]
274	jmp	edx
275.loop1_end:
276
277	;	for(; sample < data_len; sample++) {
278	;		d = data[sample];
279	;		for(coeff = 0; coeff < data_len - sample; coeff++)
280	;			autoc[coeff] += d * data[sample+coeff];
281	;	}
282	mov	ecx, [esp + 24]			; ecx <- lag
283	dec	ecx				; ecx <- lag - 1
284	jz	near .end			; skip loop if 0 (i.e. lag == 1)
285
286	fld	dword [esi]			; ST = d <- data[sample]
287	mov	eax, ecx			; eax <- lag - 1 == data_len - sample the first time through
	; each iteration is 11 bytes, so we need (-eax)*11; compute it as (-12*eax + eax)
289	lea	edx, [eax + eax*2]
290	neg	edx
291	lea	edx, [eax + edx*4 + .jumper2_0 - .get_eip2]
292	call	.mov_eip_to_ebx
293.get_eip2:
294	add	edx, ebx
295	inc	edx				; compensate for the shorter opcode on the last iteration
296	inc	edx				; compensate for the shorter opcode on the last iteration
297	inc	edx				; compensate for the shorter opcode on the last iteration
298	jmp	edx
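	; Same computed-jump layout as loop 1, except the unrolled block below starts
	; at coeff 31 (there is no coeff-32 iteration), and the target is moved forward
	; by 11 bytes per sample (see "add edx, byte 11" after .jumper2_0), so one fewer
	; coefficient is accumulated for each successive sample, matching the shrinking
	; data_len - sample bound.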
299
300	fld	st0				; ST = d d
301	fmul	dword [esi + (31*4)]		; ST = d*data[sample+31] d
302	fadd	dword [edi + (31*4)]		; ST = autoc[31]+d*data[sample+31] d
303	fstp	dword [edi + (31*4)]		; autoc[31]+=d*data[sample+31]  ST = d
304	fld	st0				; ST = d d
305	fmul	dword [esi + (30*4)]		; ST = d*data[sample+30] d
306	fadd	dword [edi + (30*4)]		; ST = autoc[30]+d*data[sample+30] d
307	fstp	dword [edi + (30*4)]		; autoc[30]+=d*data[sample+30]  ST = d
308	fld	st0				; ST = d d
309	fmul	dword [esi + (29*4)]		; ST = d*data[sample+29] d
310	fadd	dword [edi + (29*4)]		; ST = autoc[29]+d*data[sample+29] d
311	fstp	dword [edi + (29*4)]		; autoc[29]+=d*data[sample+29]  ST = d
312	fld	st0				; ST = d d
313	fmul	dword [esi + (28*4)]		; ST = d*data[sample+28] d
314	fadd	dword [edi + (28*4)]		; ST = autoc[28]+d*data[sample+28] d
315	fstp	dword [edi + (28*4)]		; autoc[28]+=d*data[sample+28]  ST = d
316	fld	st0				; ST = d d
317	fmul	dword [esi + (27*4)]		; ST = d*data[sample+27] d
318	fadd	dword [edi + (27*4)]		; ST = autoc[27]+d*data[sample+27] d
319	fstp	dword [edi + (27*4)]		; autoc[27]+=d*data[sample+27]  ST = d
320	fld	st0				; ST = d d
321	fmul	dword [esi + (26*4)]		; ST = d*data[sample+26] d
322	fadd	dword [edi + (26*4)]		; ST = autoc[26]+d*data[sample+26] d
323	fstp	dword [edi + (26*4)]		; autoc[26]+=d*data[sample+26]  ST = d
324	fld	st0				; ST = d d
325	fmul	dword [esi + (25*4)]		; ST = d*data[sample+25] d
326	fadd	dword [edi + (25*4)]		; ST = autoc[25]+d*data[sample+25] d
327	fstp	dword [edi + (25*4)]		; autoc[25]+=d*data[sample+25]  ST = d
328	fld	st0				; ST = d d
329	fmul	dword [esi + (24*4)]		; ST = d*data[sample+24] d
330	fadd	dword [edi + (24*4)]		; ST = autoc[24]+d*data[sample+24] d
331	fstp	dword [edi + (24*4)]		; autoc[24]+=d*data[sample+24]  ST = d
332	fld	st0				; ST = d d
333	fmul	dword [esi + (23*4)]		; ST = d*data[sample+23] d
334	fadd	dword [edi + (23*4)]		; ST = autoc[23]+d*data[sample+23] d
335	fstp	dword [edi + (23*4)]		; autoc[23]+=d*data[sample+23]  ST = d
336	fld	st0				; ST = d d
337	fmul	dword [esi + (22*4)]		; ST = d*data[sample+22] d
338	fadd	dword [edi + (22*4)]		; ST = autoc[22]+d*data[sample+22] d
339	fstp	dword [edi + (22*4)]		; autoc[22]+=d*data[sample+22]  ST = d
340	fld	st0				; ST = d d
341	fmul	dword [esi + (21*4)]		; ST = d*data[sample+21] d
342	fadd	dword [edi + (21*4)]		; ST = autoc[21]+d*data[sample+21] d
343	fstp	dword [edi + (21*4)]		; autoc[21]+=d*data[sample+21]  ST = d
344	fld	st0				; ST = d d
345	fmul	dword [esi + (20*4)]		; ST = d*data[sample+20] d
346	fadd	dword [edi + (20*4)]		; ST = autoc[20]+d*data[sample+20] d
347	fstp	dword [edi + (20*4)]		; autoc[20]+=d*data[sample+20]  ST = d
348	fld	st0				; ST = d d
349	fmul	dword [esi + (19*4)]		; ST = d*data[sample+19] d
350	fadd	dword [edi + (19*4)]		; ST = autoc[19]+d*data[sample+19] d
351	fstp	dword [edi + (19*4)]		; autoc[19]+=d*data[sample+19]  ST = d
352	fld	st0				; ST = d d
353	fmul	dword [esi + (18*4)]		; ST = d*data[sample+18] d
354	fadd	dword [edi + (18*4)]		; ST = autoc[18]+d*data[sample+18] d
355	fstp	dword [edi + (18*4)]		; autoc[18]+=d*data[sample+18]  ST = d
356	fld	st0				; ST = d d
357	fmul	dword [esi + (17*4)]		; ST = d*data[sample+17] d
358	fadd	dword [edi + (17*4)]		; ST = autoc[17]+d*data[sample+17] d
359	fstp	dword [edi + (17*4)]		; autoc[17]+=d*data[sample+17]  ST = d
360	fld	st0				; ST = d d
361	fmul	dword [esi + (16*4)]		; ST = d*data[sample+16] d
362	fadd	dword [edi + (16*4)]		; ST = autoc[16]+d*data[sample+16] d
363	fstp	dword [edi + (16*4)]		; autoc[16]+=d*data[sample+16]  ST = d
364	fld	st0				; ST = d d
365	fmul	dword [esi + (15*4)]		; ST = d*data[sample+15] d
366	fadd	dword [edi + (15*4)]		; ST = autoc[15]+d*data[sample+15] d
367	fstp	dword [edi + (15*4)]		; autoc[15]+=d*data[sample+15]  ST = d
368	fld	st0				; ST = d d
369	fmul	dword [esi + (14*4)]		; ST = d*data[sample+14] d
370	fadd	dword [edi + (14*4)]		; ST = autoc[14]+d*data[sample+14] d
371	fstp	dword [edi + (14*4)]		; autoc[14]+=d*data[sample+14]  ST = d
372	fld	st0				; ST = d d
373	fmul	dword [esi + (13*4)]		; ST = d*data[sample+13] d
374	fadd	dword [edi + (13*4)]		; ST = autoc[13]+d*data[sample+13] d
375	fstp	dword [edi + (13*4)]		; autoc[13]+=d*data[sample+13]  ST = d
376	fld	st0				; ST = d d
377	fmul	dword [esi + (12*4)]		; ST = d*data[sample+12] d
378	fadd	dword [edi + (12*4)]		; ST = autoc[12]+d*data[sample+12] d
379	fstp	dword [edi + (12*4)]		; autoc[12]+=d*data[sample+12]  ST = d
380	fld	st0				; ST = d d
381	fmul	dword [esi + (11*4)]		; ST = d*data[sample+11] d
382	fadd	dword [edi + (11*4)]		; ST = autoc[11]+d*data[sample+11] d
383	fstp	dword [edi + (11*4)]		; autoc[11]+=d*data[sample+11]  ST = d
384	fld	st0				; ST = d d
385	fmul	dword [esi + (10*4)]		; ST = d*data[sample+10] d
386	fadd	dword [edi + (10*4)]		; ST = autoc[10]+d*data[sample+10] d
387	fstp	dword [edi + (10*4)]		; autoc[10]+=d*data[sample+10]  ST = d
388	fld	st0				; ST = d d
389	fmul	dword [esi + ( 9*4)]		; ST = d*data[sample+9] d
390	fadd	dword [edi + ( 9*4)]		; ST = autoc[9]+d*data[sample+9] d
391	fstp	dword [edi + ( 9*4)]		; autoc[9]+=d*data[sample+9]  ST = d
392	fld	st0				; ST = d d
393	fmul	dword [esi + ( 8*4)]		; ST = d*data[sample+8] d
394	fadd	dword [edi + ( 8*4)]		; ST = autoc[8]+d*data[sample+8] d
395	fstp	dword [edi + ( 8*4)]		; autoc[8]+=d*data[sample+8]  ST = d
396	fld	st0				; ST = d d
397	fmul	dword [esi + ( 7*4)]		; ST = d*data[sample+7] d
398	fadd	dword [edi + ( 7*4)]		; ST = autoc[7]+d*data[sample+7] d
399	fstp	dword [edi + ( 7*4)]		; autoc[7]+=d*data[sample+7]  ST = d
400	fld	st0				; ST = d d
401	fmul	dword [esi + ( 6*4)]		; ST = d*data[sample+6] d
402	fadd	dword [edi + ( 6*4)]		; ST = autoc[6]+d*data[sample+6] d
403	fstp	dword [edi + ( 6*4)]		; autoc[6]+=d*data[sample+6]  ST = d
404	fld	st0				; ST = d d
	fmul	dword [esi + ( 5*4)]		; ST = d*data[sample+5] d
	fadd	dword [edi + ( 5*4)]		; ST = autoc[5]+d*data[sample+5] d
	fstp	dword [edi + ( 5*4)]		; autoc[5]+=d*data[sample+5]  ST = d
408	fld	st0				; ST = d d
409	fmul	dword [esi + ( 4*4)]		; ST = d*data[sample+4] d
410	fadd	dword [edi + ( 4*4)]		; ST = autoc[4]+d*data[sample+4] d
411	fstp	dword [edi + ( 4*4)]		; autoc[4]+=d*data[sample+4]  ST = d
412	fld	st0				; ST = d d
413	fmul	dword [esi + ( 3*4)]		; ST = d*data[sample+3] d
414	fadd	dword [edi + ( 3*4)]		; ST = autoc[3]+d*data[sample+3] d
415	fstp	dword [edi + ( 3*4)]		; autoc[3]+=d*data[sample+3]  ST = d
416	fld	st0				; ST = d d
417	fmul	dword [esi + ( 2*4)]		; ST = d*data[sample+2] d
418	fadd	dword [edi + ( 2*4)]		; ST = autoc[2]+d*data[sample+2] d
419	fstp	dword [edi + ( 2*4)]		; autoc[2]+=d*data[sample+2]  ST = d
420	fld	st0				; ST = d d
421	fmul	dword [esi + ( 1*4)]		; ST = d*data[sample+1] d
422	fadd	dword [edi + ( 1*4)]		; ST = autoc[1]+d*data[sample+1] d
423	fstp	dword [edi + ( 1*4)]		; autoc[1]+=d*data[sample+1]  ST = d
424	fld	st0				; ST = d d
425	fmul	dword [esi]			; ST = d*data[sample] d			WATCHOUT: no displacement byte here!
426	fadd	dword [edi]			; ST = autoc[0]+d*data[sample] d	WATCHOUT: no displacement byte here!
427	fstp	dword [edi]			; autoc[0]+=d*data[sample]  ST = d	WATCHOUT: no displacement byte here!
428.jumper2_0:
429
430	fstp	st0				; pop d, ST = empty
431	add	esi, byte 4			; sample++
432	dec	ecx
433	jz	.loop2_end
434	add	edx, byte 11			; adjust our inner loop counter by adjusting the jump target
435	fld	dword [esi]			; ST = d <- data[sample]
436	jmp	edx
437.loop2_end:
438
439.end:
440	pop	ebx
441	pop	edi
442	pop	esi
443	ret
444
445	ALIGN 16
446cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
447	;[esp + 16] == autoc[]
448	;[esp + 12] == lag
449	;[esp + 8] == data_len
450	;[esp + 4] == data[]
451
452	;ASSERT(lag > 0)
453	;ASSERT(lag <= 4)
454	;ASSERT(lag <= data_len)
455
456	;	for(coeff = 0; coeff < lag; coeff++)
457	;		autoc[coeff] = 0.0;
458	xorps	xmm5, xmm5
459
460	mov	edx, [esp + 8]			; edx == data_len
461	mov	eax, [esp + 4]			; eax == &data[sample] <- &data[0]
462
463	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[0]
464	add	eax, 4
465	movaps	xmm2, xmm0			; xmm2 = 0,0,0,data[0]
466	shufps	xmm0, xmm0, 0			; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
467.warmup:					; xmm2 == data[sample-3],data[sample-2],data[sample-1],data[sample]
468	mulps	xmm0, xmm2			; xmm0 = xmm0 * xmm2
469	addps	xmm5, xmm0			; xmm5 += xmm0 * xmm2
470	dec	edx
471	jz	.loop_end
472	ALIGN 16
473.loop_start:
474	; start by reading the next sample
475	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[sample]
476	add	eax, 4
477	shufps	xmm0, xmm0, 0			; xmm0 = data[sample],data[sample],data[sample],data[sample]
478	shufps	xmm2, xmm2, 93h			; 93h=2-1-0-3 => xmm2 gets rotated left by one float
479	movss	xmm2, xmm0
480	mulps	xmm0, xmm2			; xmm0 = xmm0 * xmm2
481	addps	xmm5, xmm0			; xmm5 += xmm0 * xmm2
482	dec	edx
483	jnz	.loop_start
484.loop_end:
485	; store autoc
486	mov	edx, [esp + 16]			; edx == autoc
487	movups	[edx], xmm5
488
489.end:
490	ret
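; A rough scalar sketch of the per-sample update in the SSE lag-4 kernel above
; (xmm2 is the 4-float sliding window, xmm5 holds the four running sums):
;
;	d = data[sample];
;	window = { d, window[0], window[1], window[2] };	/* shufps 93h + movss */
;	for(k = 0; k < 4; k++)
;		autoc[k] += d * window[k];			/* mulps + addps      */
;
; i.e. window[k] == data[sample-k], so autoc[k] accumulates data[sample]*data[sample-k].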
491
492	ALIGN 16
493cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
494	;[esp + 16] == autoc[]
495	;[esp + 12] == lag
496	;[esp + 8] == data_len
497	;[esp + 4] == data[]
498
499	;ASSERT(lag > 0)
500	;ASSERT(lag <= 8)
501	;ASSERT(lag <= data_len)
502
503	;	for(coeff = 0; coeff < lag; coeff++)
504	;		autoc[coeff] = 0.0;
505	xorps	xmm5, xmm5
506	xorps	xmm6, xmm6
507
508	mov	edx, [esp + 8]			; edx == data_len
509	mov	eax, [esp + 4]			; eax == &data[sample] <- &data[0]
510
511	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[0]
512	add	eax, 4
513	movaps	xmm2, xmm0			; xmm2 = 0,0,0,data[0]
514	shufps	xmm0, xmm0, 0			; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
515	movaps	xmm1, xmm0			; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
516	xorps	xmm3, xmm3			; xmm3 = 0,0,0,0
517.warmup:					; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
518	mulps	xmm0, xmm2
519	mulps	xmm1, xmm3			; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
520	addps	xmm5, xmm0
521	addps	xmm6, xmm1			; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
522	dec	edx
523	jz	.loop_end
524	ALIGN 16
525.loop_start:
526	; start by reading the next sample
527	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[sample]
528	; here we reorder the instructions; see the (#) indexes for a logical order
529	shufps	xmm2, xmm2, 93h			; (3) 93h=2-1-0-3 => xmm2 gets rotated left by one float
530	add	eax, 4				; (0)
531	shufps	xmm3, xmm3, 93h			; (4) 93h=2-1-0-3 => xmm3 gets rotated left by one float
532	shufps	xmm0, xmm0, 0			; (1) xmm0 = data[sample],data[sample],data[sample],data[sample]
533	movss	xmm3, xmm2			; (5)
534	movaps	xmm1, xmm0			; (2) xmm1 = data[sample],data[sample],data[sample],data[sample]
535	movss	xmm2, xmm0			; (6)
536	mulps	xmm1, xmm3			; (8)
537	mulps	xmm0, xmm2			; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
538	addps	xmm6, xmm1			; (10)
539	addps	xmm5, xmm0			; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
540	dec	edx
541	jnz	.loop_start
542.loop_end:
543	; store autoc
544	mov	edx, [esp + 16]			; edx == autoc
545	movups	[edx], xmm5
546	movups	[edx + 16], xmm6
547
548.end:
549	ret
550
551	ALIGN 16
552cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
553	;[esp + 16] == autoc[]
554	;[esp + 12] == lag
555	;[esp + 8] == data_len
556	;[esp + 4] == data[]
557
558	;ASSERT(lag > 0)
559	;ASSERT(lag <= 12)
560	;ASSERT(lag <= data_len)
561
562	;	for(coeff = 0; coeff < lag; coeff++)
563	;		autoc[coeff] = 0.0;
564	xorps	xmm5, xmm5
565	xorps	xmm6, xmm6
566	xorps	xmm7, xmm7
567
568	mov	edx, [esp + 8]			; edx == data_len
569	mov	eax, [esp + 4]			; eax == &data[sample] <- &data[0]
570
571	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[0]
572	add	eax, 4
573	movaps	xmm2, xmm0			; xmm2 = 0,0,0,data[0]
574	shufps	xmm0, xmm0, 0			; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
575	xorps	xmm3, xmm3			; xmm3 = 0,0,0,0
576	xorps	xmm4, xmm4			; xmm4 = 0,0,0,0
.warmup:					; xmm4:xmm3:xmm2 == data[sample-11],data[sample-10],...,data[sample]
578	movaps	xmm1, xmm0
579	mulps	xmm1, xmm2
580	addps	xmm5, xmm1
581	movaps	xmm1, xmm0
582	mulps	xmm1, xmm3
583	addps	xmm6, xmm1
584	mulps	xmm0, xmm4
585	addps	xmm7, xmm0			; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
586	dec	edx
587	jz	.loop_end
588	ALIGN 16
589.loop_start:
590	; start by reading the next sample
591	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[sample]
592	add	eax, 4
593	shufps	xmm0, xmm0, 0			; xmm0 = data[sample],data[sample],data[sample],data[sample]
594
595	; shift xmm4:xmm3:xmm2 left by one float
596	shufps	xmm2, xmm2, 93h			; 93h=2-1-0-3 => xmm2 gets rotated left by one float
597	shufps	xmm3, xmm3, 93h			; 93h=2-1-0-3 => xmm3 gets rotated left by one float
598	shufps	xmm4, xmm4, 93h			; 93h=2-1-0-3 => xmm4 gets rotated left by one float
599	movss	xmm4, xmm3
600	movss	xmm3, xmm2
601	movss	xmm2, xmm0
602
603	; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
604	movaps	xmm1, xmm0
605	mulps	xmm1, xmm2
606	addps	xmm5, xmm1
607	movaps	xmm1, xmm0
608	mulps	xmm1, xmm3
609	addps	xmm6, xmm1
610	mulps	xmm0, xmm4
611	addps	xmm7, xmm0
612
613	dec	edx
614	jnz	.loop_start
615.loop_end:
616	; store autoc
617	mov	edx, [esp + 16]			; edx == autoc
618	movups	[edx], xmm5
619	movups	[edx + 16], xmm6
620	movups	[edx + 32], xmm7
621
622.end:
623	ret
624
625	ALIGN 16
626cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16
627	;[ebp + 20] == autoc[]
628	;[ebp + 16] == lag
629	;[ebp + 12] == data_len
630	;[ebp +  8] == data[]
631	;[esp] == __m128
632	;[esp + 16] == __m128
633
634	push	ebp
635	mov	ebp, esp
636	and	esp, -16 ; stack realign for SSE instructions 'movaps' and 'addps'
637	sub	esp, 32
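	; IA-32 has only xmm0-xmm7, so two of the four accumulator vectors
	; (autoc[8..11] and autoc[12..15]) live in this 16-byte-aligned 32-byte
	; scratch area at [esp] and [esp + 16] instead of in registers.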
638
639	;ASSERT(lag > 0)
	;ASSERT(lag <= 16)
641	;ASSERT(lag <= data_len)
642	;ASSERT(data_len > 0)
643
644	;	for(coeff = 0; coeff < lag; coeff++)
645	;		autoc[coeff] = 0.0;
646	xorps	xmm5, xmm5
647	xorps	xmm6, xmm6
648	movaps	[esp], xmm5
649	movaps	[esp + 16], xmm6
650
651	mov	edx, [ebp + 12]			; edx == data_len
652	mov	eax, [ebp +  8]			; eax == &data[sample] <- &data[0]
653
654	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[0]
655	add	eax, 4
656	movaps	xmm1, xmm0			; xmm1 = 0,0,0,data[0]
657	shufps	xmm0, xmm0, 0		; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
658	xorps	xmm2, xmm2			; xmm2 = 0,0,0,0
659	xorps	xmm3, xmm3			; xmm3 = 0,0,0,0
660	xorps	xmm4, xmm4			; xmm4 = 0,0,0,0
661	movaps	xmm7, xmm0
662	mulps	xmm7, xmm1
663	addps	xmm5, xmm7
664	dec	edx
665	jz	.loop_end
666	ALIGN 16
667.loop_start:
668	; start by reading the next sample
669	movss	xmm0, [eax]				; xmm0 = 0,0,0,data[sample]
670	add	eax, 4
671	shufps	xmm0, xmm0, 0			; xmm0 = data[sample],data[sample],data[sample],data[sample]
672
673	; shift xmm4:xmm3:xmm2:xmm1 left by one float
674	shufps	xmm1, xmm1, 93h
675	shufps	xmm2, xmm2, 93h
676	shufps	xmm3, xmm3, 93h
677	shufps	xmm4, xmm4, 93h
678	movss	xmm4, xmm3
679	movss	xmm3, xmm2
680	movss	xmm2, xmm1
681	movss	xmm1, xmm0
682
	; [esp+16]:[esp]:xmm6:xmm5 += xmm0:xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2:xmm1
684	movaps	xmm7, xmm0
685	mulps	xmm7, xmm1
686	addps	xmm5, xmm7
687	movaps	xmm7, xmm0
688	mulps	xmm7, xmm2
689	addps	xmm6, xmm7
690	movaps	xmm7, xmm0
691	mulps	xmm7, xmm3
692	mulps	xmm0, xmm4
693	addps	xmm7, [esp]
694	addps	xmm0, [esp + 16]
695	movaps	[esp], xmm7
696	movaps	[esp + 16], xmm0
697
698	dec	edx
699	jnz	.loop_start
700.loop_end:
701	; store autoc
702	mov	edx, [ebp + 20]				; edx == autoc
703	movups	[edx], xmm5
704	movups	[edx + 16], xmm6
705	movaps	xmm5, [esp]
706	movaps	xmm6, [esp + 16]
707	movups	[edx + 32], xmm5
708	movups	[edx + 48], xmm6
709.end:
710	mov	esp, ebp
711	pop	ebp
712	ret
713
714;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
715;
716;	for(i = 0; i < data_len; i++) {
717;		sum = 0;
718;		for(j = 0; j < order; j++)
719;			sum += qlp_coeff[j] * data[i-j-1];
720;		residual[i] = data[i] - (sum >> lp_quantization);
721;	}
722;
723	ALIGN	16
724cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
725	;[esp + 40]	residual[]
726	;[esp + 36]	lp_quantization
727	;[esp + 32]	order
728	;[esp + 28]	qlp_coeff[]
729	;[esp + 24]	data_len
730	;[esp + 20]	data[]
731
732	;ASSERT(order > 0)
733
734	push	ebp
735	push	ebx
736	push	esi
737	push	edi
738
739	mov	esi, [esp + 20]			; esi = data[]
740	mov	edi, [esp + 40]			; edi = residual[]
741	mov	eax, [esp + 32]			; eax = order
742	mov	ebx, [esp + 24]			; ebx = data_len
743
744	test	ebx, ebx
745	jz	near .end			; do nothing if data_len == 0
746.begin:
747	cmp	eax, byte 1
748	jg	short .i_1more
749
750	mov	ecx, [esp + 28]
751	mov	edx, [ecx]			; edx = qlp_coeff[0]
752	mov	eax, [esi - 4]			; eax = data[-1]
753	mov	ecx, [esp + 36]			; cl = lp_quantization
754	ALIGN	16
755.i_1_loop_i:
756	imul	eax, edx
757	sar	eax, cl
758	neg	eax
759	add	eax, [esi]
760	mov	[edi], eax
761	mov	eax, [esi]
762	add	edi, byte 4
763	add	esi, byte 4
764	dec	ebx
765	jnz	.i_1_loop_i
766
767	jmp	.end
768
769.i_1more:
770	cmp	eax, byte 32			; for order <= 32 there is a faster routine
771	jbe	short .i_32
772
773	; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
774	ALIGN 16
775.i_32more_loop_i:
776	xor	ebp, ebp
777	mov	ecx, [esp + 32]
778	mov	edx, ecx
779	shl	edx, 2
780	add	edx, [esp + 28]
781	neg	ecx
782	ALIGN	16
783.i_32more_loop_j:
784	sub	edx, byte 4
785	mov	eax, [edx]
786	imul	eax, [esi + 4 * ecx]
787	add	ebp, eax
788	inc	ecx
789	jnz	short .i_32more_loop_j
790
791	mov	ecx, [esp + 36]
792	sar	ebp, cl
793	neg	ebp
794	add	ebp, [esi]
795	mov	[edi], ebp
796	add	esi, byte 4
797	add	edi, byte 4
798
799	dec	ebx
800	jnz	.i_32more_loop_i
801
802	jmp	.end
803
804.mov_eip_to_eax:
805	mov	eax, [esp]
806	ret
807
808.i_32:
809	sub	edi, esi
810	neg	eax
811	lea	edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
812	call	.mov_eip_to_eax
813.get_eip0:
814	add	edx, eax
	inc	edx				; compensate for the shorter opcode on the last iteration
816	mov	eax, [esp + 28]			; eax = qlp_coeff[]
817	xor	ebp, ebp
818	jmp	edx
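	; Each unrolled block below (mov/imul/add) is 9 bytes, so the lea above
	; computes .jumper_0 - 9*order; the single "inc edx" accounts for the last
	; block being one byte shorter (its final mov uses [eax] with no displacement
	; byte).  The jump therefore runs exactly `order' multiply-accumulates per sample.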
819
820	mov	ecx, [eax + 124]
821	imul	ecx, [esi - 128]
822	add	ebp, ecx
823	mov	ecx, [eax + 120]
824	imul	ecx, [esi - 124]
825	add	ebp, ecx
826	mov	ecx, [eax + 116]
827	imul	ecx, [esi - 120]
828	add	ebp, ecx
829	mov	ecx, [eax + 112]
830	imul	ecx, [esi - 116]
831	add	ebp, ecx
832	mov	ecx, [eax + 108]
833	imul	ecx, [esi - 112]
834	add	ebp, ecx
835	mov	ecx, [eax + 104]
836	imul	ecx, [esi - 108]
837	add	ebp, ecx
838	mov	ecx, [eax + 100]
839	imul	ecx, [esi - 104]
840	add	ebp, ecx
841	mov	ecx, [eax + 96]
842	imul	ecx, [esi - 100]
843	add	ebp, ecx
844	mov	ecx, [eax + 92]
845	imul	ecx, [esi - 96]
846	add	ebp, ecx
847	mov	ecx, [eax + 88]
848	imul	ecx, [esi - 92]
849	add	ebp, ecx
850	mov	ecx, [eax + 84]
851	imul	ecx, [esi - 88]
852	add	ebp, ecx
853	mov	ecx, [eax + 80]
854	imul	ecx, [esi - 84]
855	add	ebp, ecx
856	mov	ecx, [eax + 76]
857	imul	ecx, [esi - 80]
858	add	ebp, ecx
859	mov	ecx, [eax + 72]
860	imul	ecx, [esi - 76]
861	add	ebp, ecx
862	mov	ecx, [eax + 68]
863	imul	ecx, [esi - 72]
864	add	ebp, ecx
865	mov	ecx, [eax + 64]
866	imul	ecx, [esi - 68]
867	add	ebp, ecx
868	mov	ecx, [eax + 60]
869	imul	ecx, [esi - 64]
870	add	ebp, ecx
871	mov	ecx, [eax + 56]
872	imul	ecx, [esi - 60]
873	add	ebp, ecx
874	mov	ecx, [eax + 52]
875	imul	ecx, [esi - 56]
876	add	ebp, ecx
877	mov	ecx, [eax + 48]
878	imul	ecx, [esi - 52]
879	add	ebp, ecx
880	mov	ecx, [eax + 44]
881	imul	ecx, [esi - 48]
882	add	ebp, ecx
883	mov	ecx, [eax + 40]
884	imul	ecx, [esi - 44]
885	add	ebp, ecx
886	mov	ecx, [eax + 36]
887	imul	ecx, [esi - 40]
888	add	ebp, ecx
889	mov	ecx, [eax + 32]
890	imul	ecx, [esi - 36]
891	add	ebp, ecx
892	mov	ecx, [eax + 28]
893	imul	ecx, [esi - 32]
894	add	ebp, ecx
895	mov	ecx, [eax + 24]
896	imul	ecx, [esi - 28]
897	add	ebp, ecx
898	mov	ecx, [eax + 20]
899	imul	ecx, [esi - 24]
900	add	ebp, ecx
901	mov	ecx, [eax + 16]
902	imul	ecx, [esi - 20]
903	add	ebp, ecx
904	mov	ecx, [eax + 12]
905	imul	ecx, [esi - 16]
906	add	ebp, ecx
907	mov	ecx, [eax + 8]
908	imul	ecx, [esi - 12]
909	add	ebp, ecx
910	mov	ecx, [eax + 4]
911	imul	ecx, [esi - 8]
912	add	ebp, ecx
	mov	ecx, [eax]			; ecx = qlp_coeff[0] (NOTE: no displacement byte, so this opcode is one byte shorter)
914	imul	ecx, [esi - 4]
915	add	ebp, ecx
916.jumper_0:
917
918	mov	ecx, [esp + 36]
919	sar	ebp, cl
920	neg	ebp
921	add	ebp, [esi]
922	mov	[edi + esi], ebp
923	add	esi, byte 4
924
925	dec	ebx
926	jz	short .end
927	xor	ebp, ebp
928	jmp	edx
929
930.end:
931	pop	edi
932	pop	esi
933	pop	ebx
934	pop	ebp
935	ret
936
937; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
938; the channel and qlp_coeffs must be <= 16.  Especially note that this routine
939; cannot be used for side-channel coded 16bps channels since the effective bps
940; is 17.
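; (The limit exists because the quantized coefficients are copied to the stack
; as 16-bit words and the samples are packed with packssdw before being
; multiplied with pmaddwd, a signed 16x16->32 multiply; values that do not fit
; in 16 bits would be truncated or saturated.)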
941	ALIGN	16
942cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
943	;[esp + 40]	residual[]
944	;[esp + 36]	lp_quantization
945	;[esp + 32]	order
946	;[esp + 28]	qlp_coeff[]
947	;[esp + 24]	data_len
948	;[esp + 20]	data[]
949
950	;ASSERT(order > 0)
951
952	push	ebp
953	push	ebx
954	push	esi
955	push	edi
956
957	mov	esi, [esp + 20]			; esi = data[]
958	mov	edi, [esp + 40]			; edi = residual[]
959	mov	eax, [esp + 32]			; eax = order
960	mov	ebx, [esp + 24]			; ebx = data_len
961
962	test	ebx, ebx
963	jz	near .end			; do nothing if data_len == 0
964	dec	ebx
965	test	ebx, ebx
966	jz	near .last_one
967
968	mov	edx, [esp + 28]			; edx = qlp_coeff[]
969	movd	mm6, [esp + 36]			; mm6 = 0:lp_quantization
970	mov	ebp, esp
971
972	and	esp, 0xfffffff8
973
974	xor	ecx, ecx
975.copy_qlp_loop:
976	push	word [edx + 4 * ecx]
977	inc	ecx
978	cmp	ecx, eax
979	jnz	short .copy_qlp_loop
980
981	and	ecx, 0x3
982	test	ecx, ecx
983	je	short .za_end
984	sub	ecx, byte 4
985.za_loop:
986	push	word 0
987	inc	eax
988	inc	ecx
989	jnz	short .za_loop
990.za_end:
991
992	movq	mm5, [esp + 2 * eax - 8]
993	movd	mm4, [esi - 16]
994	punpckldq	mm4, [esi - 12]
995	movd	mm0, [esi - 8]
996	punpckldq	mm0, [esi - 4]
997	packssdw	mm4, mm0
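	; At this point mm5 holds the four lowest-order coefficients qlp_coeff[3..0]
	; as signed 16-bit words (highest index in the low word, zero-padded when
	; order < 4) and mm4 holds data[i-4..i-1] packed the same way, so one
	; pmaddwd of the pair yields two partial dot products that get summed.
	; The loops below compute two residuals per iteration.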
998
999	cmp	eax, byte 4
1000	jnbe	short .mmx_4more
1001
1002	ALIGN	16
1003.mmx_4_loop_i:
1004	movd	mm1, [esi]
1005	movq	mm3, mm4
1006	punpckldq	mm1, [esi + 4]
1007	psrlq	mm4, 16
1008	movq	mm0, mm1
1009	psllq	mm0, 48
1010	por	mm4, mm0
1011	movq	mm2, mm4
1012	psrlq	mm4, 16
1013	pxor	mm0, mm0
1014	punpckhdq	mm0, mm1
1015	pmaddwd	mm3, mm5
1016	pmaddwd	mm2, mm5
1017	psllq	mm0, 16
1018	por	mm4, mm0
1019	movq	mm0, mm3
1020	punpckldq	mm3, mm2
1021	punpckhdq	mm0, mm2
1022	paddd	mm3, mm0
1023	psrad	mm3, mm6
1024	psubd	mm1, mm3
1025	movd	[edi], mm1
1026	punpckhdq	mm1, mm1
1027	movd	[edi + 4], mm1
1028
1029	add	edi, byte 8
1030	add	esi, byte 8
1031
1032	sub	ebx, 2
1033	jg	.mmx_4_loop_i
1034	jmp	.mmx_end
1035
1036.mmx_4more:
1037	shl	eax, 2
1038	neg	eax
1039	add	eax, byte 16
1040
1041	ALIGN	16
1042.mmx_4more_loop_i:
1043	movd	mm1, [esi]
1044	punpckldq	mm1, [esi + 4]
1045	movq	mm3, mm4
1046	psrlq	mm4, 16
1047	movq	mm0, mm1
1048	psllq	mm0, 48
1049	por	mm4, mm0
1050	movq	mm2, mm4
1051	psrlq	mm4, 16
1052	pxor	mm0, mm0
1053	punpckhdq	mm0, mm1
1054	pmaddwd	mm3, mm5
1055	pmaddwd	mm2, mm5
1056	psllq	mm0, 16
1057	por	mm4, mm0
1058
1059	mov	ecx, esi
1060	add	ecx, eax
1061	mov	edx, esp
1062
1063	ALIGN	16
1064.mmx_4more_loop_j:
1065	movd	mm0, [ecx - 16]
1066	movd	mm7, [ecx - 8]
1067	punpckldq	mm0, [ecx - 12]
1068	punpckldq	mm7, [ecx - 4]
1069	packssdw	mm0, mm7
1070	pmaddwd	mm0, [edx]
1071	punpckhdq	mm7, mm7
1072	paddd	mm3, mm0
1073	movd	mm0, [ecx - 12]
1074	punpckldq	mm0, [ecx - 8]
1075	punpckldq	mm7, [ecx]
1076	packssdw	mm0, mm7
1077	pmaddwd	mm0, [edx]
1078	paddd	mm2, mm0
1079
1080	add	edx, byte 8
1081	add	ecx, byte 16
1082	cmp	ecx, esi
1083	jnz	.mmx_4more_loop_j
1084
1085	movq	mm0, mm3
1086	punpckldq	mm3, mm2
1087	punpckhdq	mm0, mm2
1088	paddd	mm3, mm0
1089	psrad	mm3, mm6
1090	psubd	mm1, mm3
1091	movd	[edi], mm1
1092	punpckhdq	mm1, mm1
1093	movd	[edi + 4], mm1
1094
1095	add	edi, byte 8
1096	add	esi, byte 8
1097
1098	sub	ebx, 2
1099	jg	near .mmx_4more_loop_i
1100
1101.mmx_end:
1102	emms
1103	mov	esp, ebp
1104.last_one:
1105	mov	eax, [esp + 32]
1106	inc	ebx
1107	jnz	near FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32.begin
1108
1109.end:
1110	pop	edi
1111	pop	esi
1112	pop	ebx
1113	pop	ebp
1114	ret
1115
1116; **********************************************************************
1117;
1118; void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
1119; {
1120; 	unsigned i, j;
1121; 	FLAC__int32 sum;
1122;
1123; 	FLAC__ASSERT(order > 0);
1124;
1125; 	for(i = 0; i < data_len; i++) {
1126; 		sum = 0;
1127; 		for(j = 0; j < order; j++)
1128; 			sum += qlp_coeff[j] * data[i-j-1];
1129; 		data[i] = residual[i] + (sum >> lp_quantization);
1130; 	}
1131; }
1132	ALIGN	16
1133cident FLAC__lpc_restore_signal_asm_ia32
1134	;[esp + 40]	data[]
1135	;[esp + 36]	lp_quantization
1136	;[esp + 32]	order
1137	;[esp + 28]	qlp_coeff[]
1138	;[esp + 24]	data_len
1139	;[esp + 20]	residual[]
1140
1141	;ASSERT(order > 0)
1142
1143	push	ebp
1144	push	ebx
1145	push	esi
1146	push	edi
1147
1148	mov	esi, [esp + 20]			; esi = residual[]
1149	mov	edi, [esp + 40]			; edi = data[]
1150	mov	eax, [esp + 32]			; eax = order
1151	mov	ebx, [esp + 24]			; ebx = data_len
1152
1153	test	ebx, ebx
1154	jz	near .end			; do nothing if data_len == 0
1155
1156.begin:
1157	cmp	eax, byte 1
1158	jg	short .x87_1more
1159
1160	mov	ecx, [esp + 28]
1161	mov	edx, [ecx]
1162	mov	eax, [edi - 4]
1163	mov	ecx, [esp + 36]
1164	ALIGN	16
1165.x87_1_loop_i:
1166	imul	eax, edx
1167	sar	eax, cl
1168	add	eax, [esi]
1169	mov	[edi], eax
1170	add	esi, byte 4
1171	add	edi, byte 4
1172	dec	ebx
1173	jnz	.x87_1_loop_i
1174
1175	jmp	.end
1176
1177.x87_1more:
1178	cmp	eax, byte 32			; for order <= 32 there is a faster routine
1179	jbe	short .x87_32
1180
1181	; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
1182	ALIGN 16
1183.x87_32more_loop_i:
1184	xor	ebp, ebp
1185	mov	ecx, [esp + 32]
1186	mov	edx, ecx
1187	shl	edx, 2
1188	add	edx, [esp + 28]
1189	neg	ecx
1190	ALIGN	16
1191.x87_32more_loop_j:
1192	sub	edx, byte 4
1193	mov	eax, [edx]
1194	imul	eax, [edi + 4 * ecx]
1195	add	ebp, eax
1196	inc	ecx
1197	jnz	short .x87_32more_loop_j
1198
1199	mov	ecx, [esp + 36]
1200	sar	ebp, cl
1201	add	ebp, [esi]
1202	mov	[edi], ebp
1203	add	edi, byte 4
1204	add	esi, byte 4
1205
1206	dec	ebx
1207	jnz	.x87_32more_loop_i
1208
1209	jmp	.end
1210
1211.mov_eip_to_eax:
1212	mov	eax, [esp]
1213	ret
1214
1215.x87_32:
1216	sub	esi, edi
1217	neg	eax
1218	lea	edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
1219	call	.mov_eip_to_eax
1220.get_eip0:
1221	add	edx, eax
1222	inc	edx				; compensate for the shorter opcode on the last iteration
1223	mov	eax, [esp + 28]			; eax = qlp_coeff[]
1224	xor	ebp, ebp
1225	jmp	edx
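	; As in the residual routine above: the unrolled blocks below are 9 bytes
	; each and the jump lands .jumper_0 - 9*order + 1 bytes back, so exactly
	; `order' taps execute per sample.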
1226
1227	mov	ecx, [eax + 124]		; ecx =  qlp_coeff[31]
1228	imul	ecx, [edi - 128]		; ecx =  qlp_coeff[31] * data[i-32]
1229	add	ebp, ecx			; sum += qlp_coeff[31] * data[i-32]
1230	mov	ecx, [eax + 120]		; ecx =  qlp_coeff[30]
1231	imul	ecx, [edi - 124]		; ecx =  qlp_coeff[30] * data[i-31]
1232	add	ebp, ecx			; sum += qlp_coeff[30] * data[i-31]
1233	mov	ecx, [eax + 116]		; ecx =  qlp_coeff[29]
1234	imul	ecx, [edi - 120]		; ecx =  qlp_coeff[29] * data[i-30]
1235	add	ebp, ecx			; sum += qlp_coeff[29] * data[i-30]
1236	mov	ecx, [eax + 112]		; ecx =  qlp_coeff[28]
1237	imul	ecx, [edi - 116]		; ecx =  qlp_coeff[28] * data[i-29]
1238	add	ebp, ecx			; sum += qlp_coeff[28] * data[i-29]
1239	mov	ecx, [eax + 108]		; ecx =  qlp_coeff[27]
1240	imul	ecx, [edi - 112]		; ecx =  qlp_coeff[27] * data[i-28]
1241	add	ebp, ecx			; sum += qlp_coeff[27] * data[i-28]
1242	mov	ecx, [eax + 104]		; ecx =  qlp_coeff[26]
1243	imul	ecx, [edi - 108]		; ecx =  qlp_coeff[26] * data[i-27]
1244	add	ebp, ecx			; sum += qlp_coeff[26] * data[i-27]
1245	mov	ecx, [eax + 100]		; ecx =  qlp_coeff[25]
1246	imul	ecx, [edi - 104]		; ecx =  qlp_coeff[25] * data[i-26]
1247	add	ebp, ecx			; sum += qlp_coeff[25] * data[i-26]
1248	mov	ecx, [eax + 96]			; ecx =  qlp_coeff[24]
1249	imul	ecx, [edi - 100]		; ecx =  qlp_coeff[24] * data[i-25]
1250	add	ebp, ecx			; sum += qlp_coeff[24] * data[i-25]
1251	mov	ecx, [eax + 92]			; ecx =  qlp_coeff[23]
1252	imul	ecx, [edi - 96]			; ecx =  qlp_coeff[23] * data[i-24]
1253	add	ebp, ecx			; sum += qlp_coeff[23] * data[i-24]
1254	mov	ecx, [eax + 88]			; ecx =  qlp_coeff[22]
1255	imul	ecx, [edi - 92]			; ecx =  qlp_coeff[22] * data[i-23]
1256	add	ebp, ecx			; sum += qlp_coeff[22] * data[i-23]
1257	mov	ecx, [eax + 84]			; ecx =  qlp_coeff[21]
1258	imul	ecx, [edi - 88]			; ecx =  qlp_coeff[21] * data[i-22]
1259	add	ebp, ecx			; sum += qlp_coeff[21] * data[i-22]
1260	mov	ecx, [eax + 80]			; ecx =  qlp_coeff[20]
1261	imul	ecx, [edi - 84]			; ecx =  qlp_coeff[20] * data[i-21]
1262	add	ebp, ecx			; sum += qlp_coeff[20] * data[i-21]
1263	mov	ecx, [eax + 76]			; ecx =  qlp_coeff[19]
1264	imul	ecx, [edi - 80]			; ecx =  qlp_coeff[19] * data[i-20]
1265	add	ebp, ecx			; sum += qlp_coeff[19] * data[i-20]
1266	mov	ecx, [eax + 72]			; ecx =  qlp_coeff[18]
1267	imul	ecx, [edi - 76]			; ecx =  qlp_coeff[18] * data[i-19]
1268	add	ebp, ecx			; sum += qlp_coeff[18] * data[i-19]
1269	mov	ecx, [eax + 68]			; ecx =  qlp_coeff[17]
1270	imul	ecx, [edi - 72]			; ecx =  qlp_coeff[17] * data[i-18]
1271	add	ebp, ecx			; sum += qlp_coeff[17] * data[i-18]
1272	mov	ecx, [eax + 64]			; ecx =  qlp_coeff[16]
1273	imul	ecx, [edi - 68]			; ecx =  qlp_coeff[16] * data[i-17]
1274	add	ebp, ecx			; sum += qlp_coeff[16] * data[i-17]
1275	mov	ecx, [eax + 60]			; ecx =  qlp_coeff[15]
1276	imul	ecx, [edi - 64]			; ecx =  qlp_coeff[15] * data[i-16]
1277	add	ebp, ecx			; sum += qlp_coeff[15] * data[i-16]
1278	mov	ecx, [eax + 56]			; ecx =  qlp_coeff[14]
1279	imul	ecx, [edi - 60]			; ecx =  qlp_coeff[14] * data[i-15]
1280	add	ebp, ecx			; sum += qlp_coeff[14] * data[i-15]
1281	mov	ecx, [eax + 52]			; ecx =  qlp_coeff[13]
1282	imul	ecx, [edi - 56]			; ecx =  qlp_coeff[13] * data[i-14]
1283	add	ebp, ecx			; sum += qlp_coeff[13] * data[i-14]
1284	mov	ecx, [eax + 48]			; ecx =  qlp_coeff[12]
1285	imul	ecx, [edi - 52]			; ecx =  qlp_coeff[12] * data[i-13]
1286	add	ebp, ecx			; sum += qlp_coeff[12] * data[i-13]
1287	mov	ecx, [eax + 44]			; ecx =  qlp_coeff[11]
1288	imul	ecx, [edi - 48]			; ecx =  qlp_coeff[11] * data[i-12]
1289	add	ebp, ecx			; sum += qlp_coeff[11] * data[i-12]
1290	mov	ecx, [eax + 40]			; ecx =  qlp_coeff[10]
1291	imul	ecx, [edi - 44]			; ecx =  qlp_coeff[10] * data[i-11]
1292	add	ebp, ecx			; sum += qlp_coeff[10] * data[i-11]
1293	mov	ecx, [eax + 36]			; ecx =  qlp_coeff[ 9]
1294	imul	ecx, [edi - 40]			; ecx =  qlp_coeff[ 9] * data[i-10]
1295	add	ebp, ecx			; sum += qlp_coeff[ 9] * data[i-10]
1296	mov	ecx, [eax + 32]			; ecx =  qlp_coeff[ 8]
1297	imul	ecx, [edi - 36]			; ecx =  qlp_coeff[ 8] * data[i- 9]
1298	add	ebp, ecx			; sum += qlp_coeff[ 8] * data[i- 9]
1299	mov	ecx, [eax + 28]			; ecx =  qlp_coeff[ 7]
1300	imul	ecx, [edi - 32]			; ecx =  qlp_coeff[ 7] * data[i- 8]
1301	add	ebp, ecx			; sum += qlp_coeff[ 7] * data[i- 8]
1302	mov	ecx, [eax + 24]			; ecx =  qlp_coeff[ 6]
1303	imul	ecx, [edi - 28]			; ecx =  qlp_coeff[ 6] * data[i- 7]
1304	add	ebp, ecx			; sum += qlp_coeff[ 6] * data[i- 7]
1305	mov	ecx, [eax + 20]			; ecx =  qlp_coeff[ 5]
1306	imul	ecx, [edi - 24]			; ecx =  qlp_coeff[ 5] * data[i- 6]
1307	add	ebp, ecx			; sum += qlp_coeff[ 5] * data[i- 6]
1308	mov	ecx, [eax + 16]			; ecx =  qlp_coeff[ 4]
1309	imul	ecx, [edi - 20]			; ecx =  qlp_coeff[ 4] * data[i- 5]
1310	add	ebp, ecx			; sum += qlp_coeff[ 4] * data[i- 5]
1311	mov	ecx, [eax + 12]			; ecx =  qlp_coeff[ 3]
1312	imul	ecx, [edi - 16]			; ecx =  qlp_coeff[ 3] * data[i- 4]
1313	add	ebp, ecx			; sum += qlp_coeff[ 3] * data[i- 4]
1314	mov	ecx, [eax + 8]			; ecx =  qlp_coeff[ 2]
1315	imul	ecx, [edi - 12]			; ecx =  qlp_coeff[ 2] * data[i- 3]
1316	add	ebp, ecx			; sum += qlp_coeff[ 2] * data[i- 3]
1317	mov	ecx, [eax + 4]			; ecx =  qlp_coeff[ 1]
1318	imul	ecx, [edi - 8]			; ecx =  qlp_coeff[ 1] * data[i- 2]
1319	add	ebp, ecx			; sum += qlp_coeff[ 1] * data[i- 2]
1320	mov	ecx, [eax]			; ecx =  qlp_coeff[ 0] (NOTE: one byte missing from instruction)
1321	imul	ecx, [edi - 4]			; ecx =  qlp_coeff[ 0] * data[i- 1]
1322	add	ebp, ecx			; sum += qlp_coeff[ 0] * data[i- 1]
1323.jumper_0:
1324
1325	mov	ecx, [esp + 36]
1326	sar	ebp, cl				; ebp = (sum >> lp_quantization)
1327	add	ebp, [esi + edi]		; ebp = residual[i] + (sum >> lp_quantization)
1328	mov	[edi], ebp			; data[i] = residual[i] + (sum >> lp_quantization)
1329	add	edi, byte 4
1330
1331	dec	ebx
1332	jz	short .end
1333	xor	ebp, ebp
1334	jmp	edx
1335
1336.end:
1337	pop	edi
1338	pop	esi
1339	pop	ebx
1340	pop	ebp
1341	ret
1342
1343; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
1344; the channel and qlp_coeffs must be <= 16.  Especially note that this routine
1345; cannot be used for side-channel coded 16bps channels since the effective bps
1346; is 17.
1347; WATCHOUT: this routine requires that each data array have a buffer of up to
1348; 3 zeroes in front (at negative indices) for alignment purposes, i.e. for each
1349; channel n, data[n][-1] through data[n][-3] should be accessible and zero.
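; A caller-side layout that satisfies this could look like (illustrative only):
;
;	FLAC__int32 buf[3 + blocksize];	/* buf[0..2] zeroed                   */
;	FLAC__int32 *data = buf + 3;	/* so data[-3..-1] are readable zeroes */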
1350	ALIGN	16
1351cident FLAC__lpc_restore_signal_asm_ia32_mmx
1352	;[esp + 40]	data[]
1353	;[esp + 36]	lp_quantization
1354	;[esp + 32]	order
1355	;[esp + 28]	qlp_coeff[]
1356	;[esp + 24]	data_len
1357	;[esp + 20]	residual[]
1358
1359	;ASSERT(order > 0)
1360
1361	push	ebp
1362	push	ebx
1363	push	esi
1364	push	edi
1365
1366	mov	esi, [esp + 20]
1367	mov	edi, [esp + 40]
1368	mov	eax, [esp + 32]
1369	mov	ebx, [esp + 24]
1370
1371	test	ebx, ebx
1372	jz	near .end			; do nothing if data_len == 0
1373	cmp	eax, byte 4
1374	jb	near FLAC__lpc_restore_signal_asm_ia32.begin
1375
1376	mov	edx, [esp + 28]
1377	movd	mm6, [esp + 36]
1378	mov	ebp, esp
1379
1380	and	esp, 0xfffffff8
1381
1382	xor	ecx, ecx
1383.copy_qlp_loop:
1384	push	word [edx + 4 * ecx]
1385	inc	ecx
1386	cmp	ecx, eax
1387	jnz	short .copy_qlp_loop
1388
1389	and	ecx, 0x3
1390	test	ecx, ecx
1391	je	short .za_end
1392	sub	ecx, byte 4
1393.za_loop:
1394	push	word 0
1395	inc	eax
1396	inc	ecx
1397	jnz	short .za_loop
1398.za_end:
1399
1400	movq	mm5, [esp + 2 * eax - 8]
1401	movd	mm4, [edi - 16]
1402	punpckldq	mm4, [edi - 12]
1403	movd	mm0, [edi - 8]
1404	punpckldq	mm0, [edi - 4]
1405	packssdw	mm4, mm0
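	; Same packing as the residual MMX routine above: mm5 = the zero-padded
	; 16-bit coefficients, mm4 = data[i-4..i-1] as 16-bit words; each loop
	; iteration computes one sample and shifts it into the mm4 history.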
1406
1407	cmp	eax, byte 4
1408	jnbe	short .mmx_4more
1409
1410	ALIGN	16
1411.mmx_4_loop_i:
1412	movq	mm7, mm4
1413	pmaddwd	mm7, mm5
1414	movq	mm0, mm7
1415	punpckhdq	mm7, mm7
1416	paddd	mm7, mm0
1417	psrad	mm7, mm6
1418	movd	mm1, [esi]
1419	paddd	mm7, mm1
1420	movd	[edi], mm7
1421	psllq	mm7, 48
1422	psrlq	mm4, 16
1423	por	mm4, mm7
1424
1425	add	esi, byte 4
1426	add	edi, byte 4
1427
1428	dec	ebx
1429	jnz	.mmx_4_loop_i
1430	jmp	.mmx_end
1431.mmx_4more:
1432	shl	eax, 2
1433	neg	eax
1434	add	eax, byte 16
1435	ALIGN	16
1436.mmx_4more_loop_i:
1437	mov	ecx, edi
1438	add	ecx, eax
1439	mov	edx, esp
1440
1441	movq	mm7, mm4
1442	pmaddwd	mm7, mm5
1443
1444	ALIGN	16
1445.mmx_4more_loop_j:
1446	movd	mm0, [ecx - 16]
1447	punpckldq	mm0, [ecx - 12]
1448	movd	mm1, [ecx - 8]
1449	punpckldq	mm1, [ecx - 4]
1450	packssdw	mm0, mm1
1451	pmaddwd	mm0, [edx]
1452	paddd	mm7, mm0
1453
1454	add	edx, byte 8
1455	add	ecx, byte 16
1456	cmp	ecx, edi
1457	jnz	.mmx_4more_loop_j
1458
1459	movq	mm0, mm7
1460	punpckhdq	mm7, mm7
1461	paddd	mm7, mm0
1462	psrad	mm7, mm6
1463	movd	mm1, [esi]
1464	paddd	mm7, mm1
1465	movd	[edi], mm7
1466	psllq	mm7, 48
1467	psrlq	mm4, 16
1468	por	mm4, mm7
1469
1470	add	esi, byte 4
1471	add	edi, byte 4
1472
1473	dec	ebx
1474	jnz	short .mmx_4more_loop_i
1475.mmx_end:
1476	emms
1477	mov	esp, ebp
1478
1479.end:
1480	pop	edi
1481	pop	esi
1482	pop	ebx
1483	pop	ebp
1484	ret
1485
1486
1487; **********************************************************************
1488;
1489;void FLAC__lpc_compute_residual_from_qlp_coefficients_wide(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
1490; {
1491; 	unsigned i, j;
1492; 	FLAC__int64 sum;
1493;
1494; 	FLAC__ASSERT(order > 0);
1495;
1496;	for(i = 0; i < data_len; i++) {
1497;		sum = 0;
1498;		for(j = 0; j < order; j++)
1499;			sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1];
1500;		residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
1501;	}
1502; }
1503	ALIGN	16
1504cident FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32
1505	;[esp + 40]	residual[]
1506	;[esp + 36]	lp_quantization
1507	;[esp + 32]	order
1508	;[esp + 28]	qlp_coeff[]
1509	;[esp + 24]	data_len
1510	;[esp + 20]	data[]
1511
1512	;ASSERT(order > 0)
1513	;ASSERT(order <= 32)
1514	;ASSERT(lp_quantization <= 31)
1515
1516	push	ebp
1517	push	ebx
1518	push	esi
1519	push	edi
1520
1521	mov	ebx, [esp + 24]			; ebx = data_len
1522	test	ebx, ebx
1523	jz	near .end				; do nothing if data_len == 0
1524
1525.begin:
1526	mov	eax, [esp + 32]			; eax = order
1527	cmp	eax, 1
1528	jg	short .i_32
1529
1530	mov	esi, [esp + 40]			; esi = residual[]
1531	mov	edi, [esp + 20]			; edi = data[]
1532	mov	ecx, [esp + 28]			; ecx = qlp_coeff[]
1533	mov	ebp, [ecx]				; ebp = qlp_coeff[0]
1534	mov	eax, [edi - 4]			; eax = data[-1]
1535	mov	ecx, [esp + 36]			; cl = lp_quantization
1536	ALIGN	16
1537.i_1_loop_i:
1538	imul	ebp					; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1]
	shrd	eax, edx, cl		; 0 <= lp_quantization <= 31 (see ASSERT above)
1540	neg	eax
1541	add	eax, [edi]
1542	mov	[esi], eax
1543	mov	eax, [edi]
1544	add	esi, 4
1545	add	edi, 4
1546	dec	ebx
1547	jnz	.i_1_loop_i
1548	jmp	.end
1549
1550.mov_eip_to_eax:
1551	mov	eax, [esp]
1552	ret
1553
1554.i_32:	; eax = order
1555	neg	eax
1556	add	eax, eax
1557	lea	ebp, [eax + eax * 4 + .jumper_0 - .get_eip0]
1558	call	.mov_eip_to_eax
1559.get_eip0:
1560	add	ebp, eax
1561	inc	ebp				; compensate for the shorter opcode on the last iteration
1562
1563	mov	ebx, [esp + 28]			; ebx = qlp_coeff[]
1564	mov	edi, [esp + 20]			; edi = data[]
1565	sub	[esp + 40], edi			; residual[] -= data[]
1566
1567	xor	ecx, ecx
1568	xor	esi, esi
1569	jmp	ebp
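	; Each unrolled block below (mov/imul/add/adc) is 10 bytes, so the lea above
	; computes .jumper_0 - 10*order (+1 for the shorter last block).  The widening
	; imul produces edx:eax, and esi:ecx accumulates the 64-bit sum via add/adc.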
1570
1571;eax = --
1572;edx = --
1573;ecx = 0
1574;esi = 0
1575;
1576;ebx = qlp_coeff[]
1577;edi = data[]
1578;ebp = @address
1579
1580	mov	eax, [ebx + 124]			; eax =  qlp_coeff[31]
1581	imul	dword [edi - 128]		; edx:eax =  qlp_coeff[31] * data[i-32]
1582	add	ecx, eax
1583	adc	esi, edx					; sum += qlp_coeff[31] * data[i-32]
1584
1585	mov	eax, [ebx + 120]			; eax =  qlp_coeff[30]
1586	imul	dword [edi - 124]		; edx:eax =  qlp_coeff[30] * data[i-31]
1587	add	ecx, eax
1588	adc	esi, edx					; sum += qlp_coeff[30] * data[i-31]
1589
1590	mov	eax, [ebx + 116]
1591	imul	dword [edi - 120]
1592	add	ecx, eax
1593	adc	esi, edx
1594
1595	mov	eax, [ebx + 112]
1596	imul	dword [edi - 116]
1597	add	ecx, eax
1598	adc	esi, edx
1599
1600	mov	eax, [ebx + 108]
1601	imul	dword [edi - 112]
1602	add	ecx, eax
1603	adc	esi, edx
1604
1605	mov	eax, [ebx + 104]
1606	imul	dword [edi - 108]
1607	add	ecx, eax
1608	adc	esi, edx
1609
1610	mov	eax, [ebx + 100]
1611	imul	dword [edi - 104]
1612	add	ecx, eax
1613	adc	esi, edx
1614
1615	mov	eax, [ebx + 96]
1616	imul	dword [edi - 100]
1617	add	ecx, eax
1618	adc	esi, edx
1619
1620	mov	eax, [ebx + 92]
1621	imul	dword [edi - 96]
1622	add	ecx, eax
1623	adc	esi, edx
1624
1625	mov	eax, [ebx + 88]
1626	imul	dword [edi - 92]
1627	add	ecx, eax
1628	adc	esi, edx
1629
1630	mov	eax, [ebx + 84]
1631	imul	dword [edi - 88]
1632	add	ecx, eax
1633	adc	esi, edx
1634
1635	mov	eax, [ebx + 80]
1636	imul	dword [edi - 84]
1637	add	ecx, eax
1638	adc	esi, edx
1639
1640	mov	eax, [ebx + 76]
1641	imul	dword [edi - 80]
1642	add	ecx, eax
1643	adc	esi, edx
1644
1645	mov	eax, [ebx + 72]
1646	imul	dword [edi - 76]
1647	add	ecx, eax
1648	adc	esi, edx
1649
1650	mov	eax, [ebx + 68]
1651	imul	dword [edi - 72]
1652	add	ecx, eax
1653	adc	esi, edx
1654
1655	mov	eax, [ebx + 64]
1656	imul	dword [edi - 68]
1657	add	ecx, eax
1658	adc	esi, edx
1659
1660	mov	eax, [ebx + 60]
1661	imul	dword [edi - 64]
1662	add	ecx, eax
1663	adc	esi, edx
1664
1665	mov	eax, [ebx + 56]
1666	imul	dword [edi - 60]
1667	add	ecx, eax
1668	adc	esi, edx
1669
1670	mov	eax, [ebx + 52]
1671	imul	dword [edi - 56]
1672	add	ecx, eax
1673	adc	esi, edx
1674
1675	mov	eax, [ebx + 48]
1676	imul	dword [edi - 52]
1677	add	ecx, eax
1678	adc	esi, edx
1679
1680	mov	eax, [ebx + 44]
1681	imul	dword [edi - 48]
1682	add	ecx, eax
1683	adc	esi, edx
1684
1685	mov	eax, [ebx + 40]
1686	imul	dword [edi - 44]
1687	add	ecx, eax
1688	adc	esi, edx
1689
1690	mov	eax, [ebx + 36]
1691	imul	dword [edi - 40]
1692	add	ecx, eax
1693	adc	esi, edx
1694
1695	mov	eax, [ebx + 32]
1696	imul	dword [edi - 36]
1697	add	ecx, eax
1698	adc	esi, edx
1699
1700	mov	eax, [ebx + 28]
1701	imul	dword [edi - 32]
1702	add	ecx, eax
1703	adc	esi, edx
1704
1705	mov	eax, [ebx + 24]
1706	imul	dword [edi - 28]
1707	add	ecx, eax
1708	adc	esi, edx
1709
1710	mov	eax, [ebx + 20]
1711	imul	dword [edi - 24]
1712	add	ecx, eax
1713	adc	esi, edx
1714
1715	mov	eax, [ebx + 16]
1716	imul	dword [edi - 20]
1717	add	ecx, eax
1718	adc	esi, edx
1719
1720	mov	eax, [ebx + 12]
1721	imul	dword [edi - 16]
1722	add	ecx, eax
1723	adc	esi, edx
1724
1725	mov	eax, [ebx + 8]
1726	imul	dword [edi - 12]
1727	add	ecx, eax
1728	adc	esi, edx
1729
1730	mov	eax, [ebx + 4]
1731	imul	dword [edi - 8]
1732	add	ecx, eax
1733	adc	esi, edx
1734
1735	mov	eax, [ebx]					; eax =  qlp_coeff[ 0] (NOTE: one byte missing from instruction)
1736	imul	dword [edi - 4]			; edx:eax =  qlp_coeff[ 0] * data[i- 1]
1737	add	ecx, eax
1738	adc	esi, edx					; sum += qlp_coeff[ 0] * data[i- 1]
1739
1740.jumper_0:
1741	mov	edx, ecx
1742;esi:edx = sum
1743	mov	ecx, [esp + 36]			; cl = lp_quantization
1744	shrd	edx, esi, cl		; edx = (sum >> lp_quantization)
1745;eax = --
1746;ecx = --
1747;edx = sum >> lp_q
1748;esi = --
1749	neg	edx						; edx = -(sum >> lp_quantization)
1750	mov	eax, [esp + 40]			; residual[] - data[]
1751	add	edx, [edi]				; edx = data[i] - (sum >> lp_quantization)
1752	mov	[edi + eax], edx
1753	add	edi, 4
1754
1755	dec	dword [esp + 24]
1756	jz	short .end
1757	xor	ecx, ecx
1758	xor	esi, esi
1759	jmp	ebp
1760
1761.end:
1762	pop	edi
1763	pop	esi
1764	pop	ebx
1765	pop	ebp
1766	ret
1767
1768; **********************************************************************
1769;
1770; void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
1771; {
1772; 	unsigned i, j;
1773; 	FLAC__int64 sum;
1774;
1775; 	FLAC__ASSERT(order > 0);
1776;
1777; 	for(i = 0; i < data_len; i++) {
1778; 		sum = 0;
1779; 		for(j = 0; j < order; j++)
1780; 			sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1];
1781; 		data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization);
1782; 	}
1783; }
1784	ALIGN	16
1785cident FLAC__lpc_restore_signal_wide_asm_ia32
1786	;[esp + 40]	data[]
1787	;[esp + 36]	lp_quantization
1788	;[esp + 32]	order
1789	;[esp + 28]	qlp_coeff[]
1790	;[esp + 24]	data_len
1791	;[esp + 20]	residual[]
1792
1793	;ASSERT(order > 0)
1794	;ASSERT(order <= 32)
1795	;ASSERT(lp_quantization <= 31)
1796
1797	push	ebp
1798	push	ebx
1799	push	esi
1800	push	edi
1801
1802	mov	ebx, [esp + 24]			; ebx = data_len
1803	test	ebx, ebx
1804	jz	near .end				; do nothing if data_len == 0
1805
1806.begin:
1807	mov	eax, [esp + 32]			; eax = order
1808	cmp	eax, 1
1809	jg	short .x87_32
1810
1811	mov	esi, [esp + 20]			; esi = residual[]
1812	mov	edi, [esp + 40]			; edi = data[]
1813	mov	ecx, [esp + 28]			; ecx = qlp_coeff[]
1814	mov	ebp, [ecx]				; ebp = qlp_coeff[0]
1815	mov	eax, [edi - 4]			; eax = data[-1]
1816	mov	ecx, [esp + 36]			; cl = lp_quantization
1817	ALIGN	16
1818.x87_1_loop_i:
1819	imul	ebp					; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1]
	shrd	eax, edx, cl		; 0 <= lp_quantization <= 31 (see ASSERT above)
1821;
1822	add	eax, [esi]
1823	mov	[edi], eax
1824;
1825	add	esi, 4
1826	add	edi, 4
1827	dec	ebx
1828	jnz	.x87_1_loop_i
1829	jmp	.end
1830
1831.mov_eip_to_eax:
1832	mov	eax, [esp]
1833	ret
1834
1835.x87_32:	; eax = order
1836	neg	eax
1837	add	eax, eax
1838	lea	ebp, [eax + eax * 4 + .jumper_0 - .get_eip0]
1839	call	.mov_eip_to_eax
1840.get_eip0:
1841	add	ebp, eax
1842	inc	ebp				; compensate for the shorter opcode on the last iteration
1843
1844	mov	ebx, [esp + 28]			; ebx = qlp_coeff[]
	mov	edi, [esp + 40]			; edi = data[]
1846	sub	[esp + 20], edi			; residual[] -= data[]
1847
1848	xor	ecx, ecx
1849	xor	esi, esi
1850	jmp	ebp
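	; Same 10-byte block layout and esi:ecx 64-bit accumulation as the wide
	; residual routine above.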
1851
1852;eax = --
1853;edx = --
1854;ecx = 0
1855;esi = 0
1856;
1857;ebx = qlp_coeff[]
1858;edi = data[]
1859;ebp = @address
1860
1861	mov	eax, [ebx + 124]			; eax =  qlp_coeff[31]
1862	imul	dword [edi - 128]		; edx:eax =  qlp_coeff[31] * data[i-32]
1863	add	ecx, eax
1864	adc	esi, edx					; sum += qlp_coeff[31] * data[i-32]
1865
1866	mov	eax, [ebx + 120]			; eax =  qlp_coeff[30]
1867	imul	dword [edi - 124]		; edx:eax =  qlp_coeff[30] * data[i-31]
1868	add	ecx, eax
1869	adc	esi, edx					; sum += qlp_coeff[30] * data[i-31]
1870
1871	mov	eax, [ebx + 116]
1872	imul	dword [edi - 120]
1873	add	ecx, eax
1874	adc	esi, edx
1875
1876	mov	eax, [ebx + 112]
1877	imul	dword [edi - 116]
1878	add	ecx, eax
1879	adc	esi, edx
1880
1881	mov	eax, [ebx + 108]
1882	imul	dword [edi - 112]
1883	add	ecx, eax
1884	adc	esi, edx
1885
1886	mov	eax, [ebx + 104]
1887	imul	dword [edi - 108]
1888	add	ecx, eax
1889	adc	esi, edx
1890
1891	mov	eax, [ebx + 100]
1892	imul	dword [edi - 104]
1893	add	ecx, eax
1894	adc	esi, edx
1895
1896	mov	eax, [ebx + 96]
1897	imul	dword [edi - 100]
1898	add	ecx, eax
1899	adc	esi, edx
1900
1901	mov	eax, [ebx + 92]
1902	imul	dword [edi - 96]
1903	add	ecx, eax
1904	adc	esi, edx
1905
1906	mov	eax, [ebx + 88]
1907	imul	dword [edi - 92]
1908	add	ecx, eax
1909	adc	esi, edx
1910
1911	mov	eax, [ebx + 84]
1912	imul	dword [edi - 88]
1913	add	ecx, eax
1914	adc	esi, edx
1915
1916	mov	eax, [ebx + 80]
1917	imul	dword [edi - 84]
1918	add	ecx, eax
1919	adc	esi, edx
1920
1921	mov	eax, [ebx + 76]
1922	imul	dword [edi - 80]
1923	add	ecx, eax
1924	adc	esi, edx
1925
1926	mov	eax, [ebx + 72]
1927	imul	dword [edi - 76]
1928	add	ecx, eax
1929	adc	esi, edx
1930
1931	mov	eax, [ebx + 68]
1932	imul	dword [edi - 72]
1933	add	ecx, eax
1934	adc	esi, edx
1935
1936	mov	eax, [ebx + 64]
1937	imul	dword [edi - 68]
1938	add	ecx, eax
1939	adc	esi, edx
1940
1941	mov	eax, [ebx + 60]
1942	imul	dword [edi - 64]
1943	add	ecx, eax
1944	adc	esi, edx
1945
1946	mov	eax, [ebx + 56]
1947	imul	dword [edi - 60]
1948	add	ecx, eax
1949	adc	esi, edx
1950
1951	mov	eax, [ebx + 52]
1952	imul	dword [edi - 56]
1953	add	ecx, eax
1954	adc	esi, edx
1955
1956	mov	eax, [ebx + 48]
1957	imul	dword [edi - 52]
1958	add	ecx, eax
1959	adc	esi, edx
1960
1961	mov	eax, [ebx + 44]
1962	imul	dword [edi - 48]
1963	add	ecx, eax
1964	adc	esi, edx
1965
1966	mov	eax, [ebx + 40]
1967	imul	dword [edi - 44]
1968	add	ecx, eax
1969	adc	esi, edx
1970
1971	mov	eax, [ebx + 36]
1972	imul	dword [edi - 40]
1973	add	ecx, eax
1974	adc	esi, edx
1975
1976	mov	eax, [ebx + 32]
1977	imul	dword [edi - 36]
1978	add	ecx, eax
1979	adc	esi, edx
1980
1981	mov	eax, [ebx + 28]
1982	imul	dword [edi - 32]
1983	add	ecx, eax
1984	adc	esi, edx
1985
1986	mov	eax, [ebx + 24]
1987	imul	dword [edi - 28]
1988	add	ecx, eax
1989	adc	esi, edx
1990
1991	mov	eax, [ebx + 20]
1992	imul	dword [edi - 24]
1993	add	ecx, eax
1994	adc	esi, edx
1995
1996	mov	eax, [ebx + 16]
1997	imul	dword [edi - 20]
1998	add	ecx, eax
1999	adc	esi, edx
2000
2001	mov	eax, [ebx + 12]
2002	imul	dword [edi - 16]
2003	add	ecx, eax
2004	adc	esi, edx
2005
2006	mov	eax, [ebx + 8]
2007	imul	dword [edi - 12]
2008	add	ecx, eax
2009	adc	esi, edx
2010
2011	mov	eax, [ebx + 4]
2012	imul	dword [edi - 8]
2013	add	ecx, eax
2014	adc	esi, edx
2015
2016	mov	eax, [ebx]					; eax =  qlp_coeff[ 0] (NOTE: one byte missing from instruction)
2017	imul	dword [edi - 4]			; edx:eax =  qlp_coeff[ 0] * data[i- 1]
2018	add	ecx, eax
2019	adc	esi, edx					; sum += qlp_coeff[ 0] * data[i- 1]
2020
2021.jumper_0:
2022	mov	edx, ecx
2023;esi:edx = sum
2024	mov	ecx, [esp + 36]			; cl = lp_quantization
2025	shrd	edx, esi, cl		; edx = (sum >> lp_quantization)
2026;eax = --
2027;ecx = --
2028;edx = sum >> lp_q
2029;esi = --
2030;
2031	mov	eax, [esp + 20]			; residual[] - data[]
2032	add	edx, [edi + eax]		; edx = residual[i] + (sum >> lp_quantization)
2033	mov	[edi], edx				; data[i] = residual[i] + (sum >> lp_quantization)
2034	add	edi, 4
2035
2036	dec	dword [esp + 24]
2037	jz	short .end
2038	xor	ecx, ecx
2039	xor	esi, esi
2040	jmp	ebp
2041
2042.end:
2043	pop	edi
2044	pop	esi
2045	pop	ebx
2046	pop	ebp
2047	ret
2048
2049; end
2050