/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009  Josh Coalson
 * Copyright (C) 2011-2016  Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
#  include <config.h>
#endif

#include "private/cpu.h"

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE2_SUPPORTED

#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <emmintrin.h> /* SSE2 */

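/* Extract the low 32-bit lane of the accumulator, apply the quantization
 * shift, and form the residual (or, for decoding, restore the sample). */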
#define RESIDUAL32_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
#define     DATA32_RESULT(xmmN) data[i] = residual[i] + (_mm_cvtsi128_si32(xmmN) >> lp_quantization);

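/*
 * 16-bit path: intended for input where the samples and the quantized
 * coefficients fit in 16 bits.  Each coefficient is masked to its low 16 bits
 * and broadcast across all four 32-bit lanes; because the upper 16 bits of
 * every coefficient lane are zero, each _mm_madd_epi16 term reduces to a
 * single 16x16->32 product per lane, so the vector loops below compute four
 * residuals per iteration.
 */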
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int32 sum;
	const __m128i cnt = _mm_cvtsi32_si128(lp_quantization);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

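	/* Specialized, fully unrolled SSE2 paths cover orders 1..12; higher
	   orders fall back to the generic scalar loop at the end. */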
	if(order <= 12) {
		if(order > 8) {
			if(order > 10) {
				if(order == 12) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
					q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
					q11 = _mm_cvtsi32_si128(0xffff & qlp_coeff[11]); q11 = _mm_shuffle_epi32(q11, _MM_SHUFFLE(0,0,0,0));

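					/* Four residuals per iteration: each pmaddwd pairs one
					   broadcast coefficient with four consecutive history samples. */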
					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q11, _mm_loadu_si128((const __m128i*)(data+i-12)));
						mull = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 11 */
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
					q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11)));
						mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 10) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10)));
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 9 */
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9)));
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
		else if(order > 4) {
			if(order > 6) {
				if(order == 8) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8)));
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 7 */
					__m128i q0, q1, q2, q3, q4, q5, q6;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7)));
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 6) {
					__m128i q0, q1, q2, q3, q4, q5;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6)));
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 5 */
					__m128i q0, q1, q2, q3, q4;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5)));
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
		else {
			if(order > 2) {
				if(order == 4) {
					__m128i q0, q1, q2, q3;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4)));
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 3 */
					__m128i q0, q1, q2;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3)));
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 2) {
					__m128i q0, q1;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2)));
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 1 */
					__m128i q0;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ;
						summ = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1)));
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
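		/* Handle the 0..3 trailing samples that the 4-wide loops above leave over. */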
		for(; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 12: sum += qlp_coeff[11] * data[i-12]; /* Falls through. */
				case 11: sum += qlp_coeff[10] * data[i-11]; /* Falls through. */
				case 10: sum += qlp_coeff[ 9] * data[i-10]; /* Falls through. */
				case 9:  sum += qlp_coeff[ 8] * data[i- 9]; /* Falls through. */
				case 8:  sum += qlp_coeff[ 7] * data[i- 8]; /* Falls through. */
				case 7:  sum += qlp_coeff[ 6] * data[i- 7]; /* Falls through. */
				case 6:  sum += qlp_coeff[ 5] * data[i- 6]; /* Falls through. */
				case 5:  sum += qlp_coeff[ 4] * data[i- 5]; /* Falls through. */
				case 4:  sum += qlp_coeff[ 3] * data[i- 4]; /* Falls through. */
				case 3:  sum += qlp_coeff[ 2] * data[i- 3]; /* Falls through. */
				case 2:  sum += qlp_coeff[ 1] * data[i- 2]; /* Falls through. */
				case 1:  sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
	else { /* order > 12 */
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32]; /* Falls through. */
				case 31: sum += qlp_coeff[30] * data[i-31]; /* Falls through. */
				case 30: sum += qlp_coeff[29] * data[i-30]; /* Falls through. */
				case 29: sum += qlp_coeff[28] * data[i-29]; /* Falls through. */
				case 28: sum += qlp_coeff[27] * data[i-28]; /* Falls through. */
				case 27: sum += qlp_coeff[26] * data[i-27]; /* Falls through. */
				case 26: sum += qlp_coeff[25] * data[i-26]; /* Falls through. */
				case 25: sum += qlp_coeff[24] * data[i-25]; /* Falls through. */
				case 24: sum += qlp_coeff[23] * data[i-24]; /* Falls through. */
				case 23: sum += qlp_coeff[22] * data[i-23]; /* Falls through. */
				case 22: sum += qlp_coeff[21] * data[i-22]; /* Falls through. */
				case 21: sum += qlp_coeff[20] * data[i-21]; /* Falls through. */
				case 20: sum += qlp_coeff[19] * data[i-20]; /* Falls through. */
				case 19: sum += qlp_coeff[18] * data[i-19]; /* Falls through. */
				case 18: sum += qlp_coeff[17] * data[i-18]; /* Falls through. */
				case 17: sum += qlp_coeff[16] * data[i-17]; /* Falls through. */
				case 16: sum += qlp_coeff[15] * data[i-16]; /* Falls through. */
				case 15: sum += qlp_coeff[14] * data[i-15]; /* Falls through. */
				case 14: sum += qlp_coeff[13] * data[i-14]; /* Falls through. */
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
}

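/*
 * General 32-bit path: coefficients and samples are used as full 32-bit
 * values.  Coefficients are interleaved into the even 32-bit lanes so that
 * _mm_mul_epu32 can form two coefficient*sample products per instruction;
 * only the low 32 bits of each 64-bit product are kept, which (modulo 2^32)
 * equal the signed products the scalar code would compute.  One residual is
 * produced per loop iteration.
 */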
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[])
{
	int i;

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

	if(order <= 12) {
		if(order > 8) { /* order == 9, 10, 11, 12 */
			if(order > 10) { /* order == 11, 12 */
				if(order == 12) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
					xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0  0  q[11] q[10]

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
					xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[11] * data[i-12];
						//sum += qlp_coeff[10] * data[i-11];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12));  // 0   0        d[i-11]  d[i-12]
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]   0        d[i-11]
						xmm7 = _mm_mul_epu32(xmm7, xmm5); /* we use _unsigned_ multiplication and discard high dword of the result values */

						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm4);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 11 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
					xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[10] * data[i-11];
						xmm7 = _mm_cvtsi32_si128(data[i-11]);
						xmm7 = _mm_mul_epu32(xmm7, xmm5);

						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm4);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
			else { /* order == 9, 10 */
				if(order == 10) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm4);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 9 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[8] * data[i-9];
						xmm7 = _mm_cvtsi32_si128(data[i-9]);
						xmm7 = _mm_mul_epu32(xmm7, xmm4);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
		}
		else if(order > 4) { /* order == 5, 6, 7, 8 */
			if(order > 6) { /* order == 7, 8 */
				if(order == 8) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm3);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 7 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[6] * data[i-7];
						xmm7 = _mm_cvtsi32_si128(data[i-7]);
						xmm7 = _mm_mul_epu32(xmm7, xmm3);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
			else { /* order == 5, 6 */
				if(order == 6) {
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm2);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 5 */
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[4] * data[i-5];
						xmm7 = _mm_cvtsi32_si128(data[i-5]);
						xmm7 = _mm_mul_epu32(xmm7, xmm2);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
		}
		else { /* order == 1, 2, 3, 4 */
			if(order > 2) { /* order == 3, 4 */
				if(order == 4) {
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm1);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 3 */
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[2] * data[i-3];
						xmm7 = _mm_cvtsi32_si128(data[i-3]);
						xmm7 = _mm_mul_epu32(xmm7, xmm1);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
			else { /* order == 1, 2 */
				if(order == 2) {
					__m128i xmm0, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm0);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 1 */
					for(i = 0; i < (int)data_len; i++)
						residual[i] = data[i] - ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
				}
			}
		}
	}
	else { /* order > 12 */
		FLAC__int32 sum;
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32]; /* Falls through. */
				case 31: sum += qlp_coeff[30] * data[i-31]; /* Falls through. */
				case 30: sum += qlp_coeff[29] * data[i-30]; /* Falls through. */
				case 29: sum += qlp_coeff[28] * data[i-29]; /* Falls through. */
				case 28: sum += qlp_coeff[27] * data[i-28]; /* Falls through. */
				case 27: sum += qlp_coeff[26] * data[i-27]; /* Falls through. */
				case 26: sum += qlp_coeff[25] * data[i-26]; /* Falls through. */
				case 25: sum += qlp_coeff[24] * data[i-25]; /* Falls through. */
				case 24: sum += qlp_coeff[23] * data[i-24]; /* Falls through. */
				case 23: sum += qlp_coeff[22] * data[i-23]; /* Falls through. */
				case 22: sum += qlp_coeff[21] * data[i-22]; /* Falls through. */
				case 21: sum += qlp_coeff[20] * data[i-21]; /* Falls through. */
				case 20: sum += qlp_coeff[19] * data[i-20]; /* Falls through. */
				case 19: sum += qlp_coeff[18] * data[i-19]; /* Falls through. */
				case 18: sum += qlp_coeff[17] * data[i-18]; /* Falls through. */
				case 17: sum += qlp_coeff[16] * data[i-17]; /* Falls through. */
				case 16: sum += qlp_coeff[15] * data[i-16]; /* Falls through. */
				case 15: sum += qlp_coeff[14] * data[i-15]; /* Falls through. */
				case 14: sum += qlp_coeff[13] * data[i-14]; /* Falls through. */
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
}

#endif /* FLAC__SSE2_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */