1 /* libFLAC - Free Lossless Audio Codec library
2 * Copyright (C) 2000-2009 Josh Coalson
3 * Copyright (C) 2011-2016 Xiph.Org Foundation
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * - Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * - Neither the name of the Xiph.org Foundation nor the names of its
17 * contributors may be used to endorse or promote products derived from
18 * this software without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
24 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
25 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
26 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32
33 #ifdef HAVE_CONFIG_H
34 # include <config.h>
35 #endif
36
37 #include "private/cpu.h"
38
39 #ifndef FLAC__INTEGER_ONLY_LIBRARY
40 #ifndef FLAC__NO_ASM
41 #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
42 #include "private/lpc.h"
43 #ifdef FLAC__AVX2_SUPPORTED
44
45 #include "FLAC/assert.h"
46 #include "FLAC/format.h"
47
48 #include <immintrin.h> /* AVX2 */
49
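/*
 * Overview of the residual kernels in this file: each one unrolls the LPC
 * prediction for orders 1..12 into a dedicated AVX2 loop selected by a small
 * if/else tree, and falls back to a scalar switch for the vector-loop
 * remainder and for order > 12.  This first kernel appears intended for
 * signals whose samples and quantized coefficients fit in 16 bits: each
 * coefficient is masked with 0xffff, so _mm256_madd_epi16 effectively
 * multiplies only the low 16-bit half of each 32-bit data lane and eight
 * 32-bit residuals are produced per iteration.
 */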
50 FLAC__SSE_TARGET("avx2")
51 void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[])
52 {
53 int i;
54 FLAC__int32 sum;
55 const __m128i cnt = _mm_cvtsi32_si128(lp_quantization);
56
57 FLAC__ASSERT(order > 0);
58 FLAC__ASSERT(order <= 32);
59
60 if(order <= 12) {
61 if(order > 8) {
62 if(order > 10) {
63 if(order == 12) {
64 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
65 q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
66 q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
67 q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
68 q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
69 q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
70 q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
71 q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
72 q7 = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
73 q8 = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
74 q9 = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);
75 q10 = _mm256_set1_epi32(0xffff & qlp_coeff[10]);
76 q11 = _mm256_set1_epi32(0xffff & qlp_coeff[11]);
77
78 for(i = 0; i < (int)data_len-7; i+=8) {
79 __m256i summ, mull;
80 summ = _mm256_madd_epi16(q11, _mm256_loadu_si256((const __m256i*)(data+i-12)));
81 mull = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull);
82 mull = _mm256_madd_epi16(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
83 mull = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
84 mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
85 mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
86 mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
87 mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
88 mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
89 mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
90 mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
91 mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
92 summ = _mm256_sra_epi32(summ, cnt);
93 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
94 }
95 }
96 else { /* order == 11 */
97 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
98 q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
99 q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
100 q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
101 q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
102 q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
103 q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
104 q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
105 q7 = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
106 q8 = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
107 q9 = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);
108 q10 = _mm256_set1_epi32(0xffff & qlp_coeff[10]);
109
110 for(i = 0; i < (int)data_len-7; i+=8) {
111 __m256i summ, mull;
112 summ = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11)));
113 mull = _mm256_madd_epi16(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
114 mull = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
115 mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
116 mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
117 mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
118 mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
119 mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
120 mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
121 mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
122 mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
123 summ = _mm256_sra_epi32(summ, cnt);
124 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
125 }
126 }
127 }
128 else {
129 if(order == 10) {
130 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
131 q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
132 q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
133 q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
134 q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
135 q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
136 q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
137 q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
138 q7 = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
139 q8 = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
140 q9 = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);
141
142 for(i = 0; i < (int)data_len-7; i+=8) {
143 __m256i summ, mull;
144 summ = _mm256_madd_epi16(q9, _mm256_loadu_si256((const __m256i*)(data+i-10)));
145 mull = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
146 mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
147 mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
148 mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
149 mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
150 mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
151 mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
152 mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
153 mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
154 summ = _mm256_sra_epi32(summ, cnt);
155 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
156 }
157 }
158 else { /* order == 9 */
159 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
160 q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
161 q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
162 q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
163 q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
164 q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
165 q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
166 q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
167 q7 = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
168 q8 = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
169
170 for(i = 0; i < (int)data_len-7; i+=8) {
171 __m256i summ, mull;
172 summ = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 )));
173 mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
174 mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
175 mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
176 mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
177 mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
178 mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
179 mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
180 mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
181 summ = _mm256_sra_epi32(summ, cnt);
182 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
183 }
184 }
185 }
186 }
187 else if(order > 4) {
188 if(order > 6) {
189 if(order == 8) {
190 __m256i q0, q1, q2, q3, q4, q5, q6, q7;
191 q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
192 q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
193 q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
194 q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
195 q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
196 q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
197 q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
198 q7 = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
199
200 for(i = 0; i < (int)data_len-7; i+=8) {
201 __m256i summ, mull;
202 summ = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 )));
203 mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
204 mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
205 mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
206 mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
207 mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
208 mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
209 mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
210 summ = _mm256_sra_epi32(summ, cnt);
211 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
212 }
213 }
214 else { /* order == 7 */
215 __m256i q0, q1, q2, q3, q4, q5, q6;
216 q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
217 q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
218 q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
219 q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
220 q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
221 q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
222 q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
223
224 for(i = 0; i < (int)data_len-7; i+=8) {
225 __m256i summ, mull;
226 summ = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 )));
227 mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
228 mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
229 mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
230 mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
231 mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
232 mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
233 summ = _mm256_sra_epi32(summ, cnt);
234 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
235 }
236 }
237 }
238 else {
239 if(order == 6) {
240 __m256i q0, q1, q2, q3, q4, q5;
241 q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
242 q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
243 q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
244 q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
245 q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
246 q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
247
248 for(i = 0; i < (int)data_len-7; i+=8) {
249 __m256i summ, mull;
250 summ = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 )));
251 mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
252 mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
253 mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
254 mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
255 mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
256 summ = _mm256_sra_epi32(summ, cnt);
257 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
258 }
259 }
260 else { /* order == 5 */
261 __m256i q0, q1, q2, q3, q4;
262 q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
263 q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
264 q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
265 q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
266 q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
267
268 for(i = 0; i < (int)data_len-7; i+=8) {
269 __m256i summ, mull;
270 summ = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 )));
271 mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
272 mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
273 mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
274 mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
275 summ = _mm256_sra_epi32(summ, cnt);
276 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
277 }
278 }
279 }
280 }
281 else {
282 if(order > 2) {
283 if(order == 4) {
284 __m256i q0, q1, q2, q3;
285 q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
286 q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
287 q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
288 q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
289
290 for(i = 0; i < (int)data_len-7; i+=8) {
291 __m256i summ, mull;
292 summ = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 )));
293 mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
294 mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
295 mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
296 summ = _mm256_sra_epi32(summ, cnt);
297 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
298 }
299 }
300 else { /* order == 3 */
301 __m256i q0, q1, q2;
302 q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
303 q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
304 q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
305
306 for(i = 0; i < (int)data_len-7; i+=8) {
307 __m256i summ, mull;
308 summ = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 )));
309 mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
310 mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
311 summ = _mm256_sra_epi32(summ, cnt);
312 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
313 }
314 }
315 }
316 else {
317 if(order == 2) {
318 __m256i q0, q1;
319 q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
320 q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
321
322 for(i = 0; i < (int)data_len-7; i+=8) {
323 __m256i summ, mull;
324 summ = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 )));
325 mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
326 summ = _mm256_sra_epi32(summ, cnt);
327 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
328 }
329 }
330 else { /* order == 1 */
331 __m256i q0;
332 q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
333
334 for(i = 0; i < (int)data_len-7; i+=8) {
335 __m256i summ;
336 summ = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 )));
337 summ = _mm256_sra_epi32(summ, cnt);
338 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
339 }
340 }
341 }
342 }
343 for(; i < (int)data_len; i++) {
344 sum = 0;
345 switch(order) {
346 case 12: sum += qlp_coeff[11] * data[i-12]; /* Falls through. */
347 case 11: sum += qlp_coeff[10] * data[i-11]; /* Falls through. */
348 case 10: sum += qlp_coeff[ 9] * data[i-10]; /* Falls through. */
349 case 9: sum += qlp_coeff[ 8] * data[i- 9]; /* Falls through. */
350 case 8: sum += qlp_coeff[ 7] * data[i- 8]; /* Falls through. */
351 case 7: sum += qlp_coeff[ 6] * data[i- 7]; /* Falls through. */
352 case 6: sum += qlp_coeff[ 5] * data[i- 6]; /* Falls through. */
353 case 5: sum += qlp_coeff[ 4] * data[i- 5]; /* Falls through. */
354 case 4: sum += qlp_coeff[ 3] * data[i- 4]; /* Falls through. */
355 case 3: sum += qlp_coeff[ 2] * data[i- 3]; /* Falls through. */
356 case 2: sum += qlp_coeff[ 1] * data[i- 2]; /* Falls through. */
357 case 1: sum += qlp_coeff[ 0] * data[i- 1];
358 }
359 residual[i] = data[i] - (sum >> lp_quantization);
360 }
361 }
362 else { /* order > 12 */
363 for(i = 0; i < (int)data_len; i++) {
364 sum = 0;
365 switch(order) {
366 case 32: sum += qlp_coeff[31] * data[i-32]; /* Falls through. */
367 case 31: sum += qlp_coeff[30] * data[i-31]; /* Falls through. */
368 case 30: sum += qlp_coeff[29] * data[i-30]; /* Falls through. */
369 case 29: sum += qlp_coeff[28] * data[i-29]; /* Falls through. */
370 case 28: sum += qlp_coeff[27] * data[i-28]; /* Falls through. */
371 case 27: sum += qlp_coeff[26] * data[i-27]; /* Falls through. */
372 case 26: sum += qlp_coeff[25] * data[i-26]; /* Falls through. */
373 case 25: sum += qlp_coeff[24] * data[i-25]; /* Falls through. */
374 case 24: sum += qlp_coeff[23] * data[i-24]; /* Falls through. */
375 case 23: sum += qlp_coeff[22] * data[i-23]; /* Falls through. */
376 case 22: sum += qlp_coeff[21] * data[i-22]; /* Falls through. */
377 case 21: sum += qlp_coeff[20] * data[i-21]; /* Falls through. */
378 case 20: sum += qlp_coeff[19] * data[i-20]; /* Falls through. */
379 case 19: sum += qlp_coeff[18] * data[i-19]; /* Falls through. */
380 case 18: sum += qlp_coeff[17] * data[i-18]; /* Falls through. */
381 case 17: sum += qlp_coeff[16] * data[i-17]; /* Falls through. */
382 case 16: sum += qlp_coeff[15] * data[i-16]; /* Falls through. */
383 case 15: sum += qlp_coeff[14] * data[i-15]; /* Falls through. */
384 case 14: sum += qlp_coeff[13] * data[i-14]; /* Falls through. */
385 case 13: sum += qlp_coeff[12] * data[i-13];
386 sum += qlp_coeff[11] * data[i-12];
387 sum += qlp_coeff[10] * data[i-11];
388 sum += qlp_coeff[ 9] * data[i-10];
389 sum += qlp_coeff[ 8] * data[i- 9];
390 sum += qlp_coeff[ 7] * data[i- 8];
391 sum += qlp_coeff[ 6] * data[i- 7];
392 sum += qlp_coeff[ 5] * data[i- 6];
393 sum += qlp_coeff[ 4] * data[i- 5];
394 sum += qlp_coeff[ 3] * data[i- 4];
395 sum += qlp_coeff[ 2] * data[i- 3];
396 sum += qlp_coeff[ 1] * data[i- 2];
397 sum += qlp_coeff[ 0] * data[i- 1];
398 }
399 residual[i] = data[i] - (sum >> lp_quantization);
400 }
401 }
402 _mm256_zeroupper();
403 }
404
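/*
 * Same control flow as the 16-bit kernel above, but using full 32-bit
 * multiplies: _mm256_mullo_epi32 keeps the low 32 bits of each product,
 * which is sufficient as long as the true predictor sum fits in 32 bits.
 */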
405 FLAC__SSE_TARGET("avx2")
406 void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[])
407 {
408 int i;
409 FLAC__int32 sum;
410 const __m128i cnt = _mm_cvtsi32_si128(lp_quantization);
411
412 FLAC__ASSERT(order > 0);
413 FLAC__ASSERT(order <= 32);
414
415 if(order <= 12) {
416 if(order > 8) {
417 if(order > 10) {
418 if(order == 12) {
419 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
420 q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
421 q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
422 q2 = _mm256_set1_epi32(qlp_coeff[2 ]);
423 q3 = _mm256_set1_epi32(qlp_coeff[3 ]);
424 q4 = _mm256_set1_epi32(qlp_coeff[4 ]);
425 q5 = _mm256_set1_epi32(qlp_coeff[5 ]);
426 q6 = _mm256_set1_epi32(qlp_coeff[6 ]);
427 q7 = _mm256_set1_epi32(qlp_coeff[7 ]);
428 q8 = _mm256_set1_epi32(qlp_coeff[8 ]);
429 q9 = _mm256_set1_epi32(qlp_coeff[9 ]);
430 q10 = _mm256_set1_epi32(qlp_coeff[10]);
431 q11 = _mm256_set1_epi32(qlp_coeff[11]);
432
433 for(i = 0; i < (int)data_len-7; i+=8) {
434 __m256i summ, mull;
435 summ = _mm256_mullo_epi32(q11, _mm256_loadu_si256((const __m256i*)(data+i-12)));
436 mull = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull);
437 mull = _mm256_mullo_epi32(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
438 mull = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9))); summ = _mm256_add_epi32(summ, mull);
439 mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull);
440 mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
441 mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
442 mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
443 mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
444 mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
445 mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
446 mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
447 summ = _mm256_sra_epi32(summ, cnt);
448 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
449 }
450 }
451 else { /* order == 11 */
452 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
453 q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
454 q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
455 q2 = _mm256_set1_epi32(qlp_coeff[2 ]);
456 q3 = _mm256_set1_epi32(qlp_coeff[3 ]);
457 q4 = _mm256_set1_epi32(qlp_coeff[4 ]);
458 q5 = _mm256_set1_epi32(qlp_coeff[5 ]);
459 q6 = _mm256_set1_epi32(qlp_coeff[6 ]);
460 q7 = _mm256_set1_epi32(qlp_coeff[7 ]);
461 q8 = _mm256_set1_epi32(qlp_coeff[8 ]);
462 q9 = _mm256_set1_epi32(qlp_coeff[9 ]);
463 q10 = _mm256_set1_epi32(qlp_coeff[10]);
464
465 for(i = 0; i < (int)data_len-7; i+=8) {
466 __m256i summ, mull;
467 summ = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11)));
468 mull = _mm256_mullo_epi32(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
469 mull = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9))); summ = _mm256_add_epi32(summ, mull);
470 mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull);
471 mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
472 mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
473 mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
474 mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
475 mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
476 mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
477 mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
478 summ = _mm256_sra_epi32(summ, cnt);
479 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
480 }
481 }
482 }
483 else {
484 if(order == 10) {
485 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
486 q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
487 q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
488 q2 = _mm256_set1_epi32(qlp_coeff[2 ]);
489 q3 = _mm256_set1_epi32(qlp_coeff[3 ]);
490 q4 = _mm256_set1_epi32(qlp_coeff[4 ]);
491 q5 = _mm256_set1_epi32(qlp_coeff[5 ]);
492 q6 = _mm256_set1_epi32(qlp_coeff[6 ]);
493 q7 = _mm256_set1_epi32(qlp_coeff[7 ]);
494 q8 = _mm256_set1_epi32(qlp_coeff[8 ]);
495 q9 = _mm256_set1_epi32(qlp_coeff[9 ]);
496
497 for(i = 0; i < (int)data_len-7; i+=8) {
498 __m256i summ, mull;
499 summ = _mm256_mullo_epi32(q9, _mm256_loadu_si256((const __m256i*)(data+i-10)));
500 mull = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9))); summ = _mm256_add_epi32(summ, mull);
501 mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull);
502 mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
503 mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
504 mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
505 mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
506 mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
507 mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
508 mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
509 summ = _mm256_sra_epi32(summ, cnt);
510 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
511 }
512 }
513 else { /* order == 9 */
514 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
515 q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
516 q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
517 q2 = _mm256_set1_epi32(qlp_coeff[2 ]);
518 q3 = _mm256_set1_epi32(qlp_coeff[3 ]);
519 q4 = _mm256_set1_epi32(qlp_coeff[4 ]);
520 q5 = _mm256_set1_epi32(qlp_coeff[5 ]);
521 q6 = _mm256_set1_epi32(qlp_coeff[6 ]);
522 q7 = _mm256_set1_epi32(qlp_coeff[7 ]);
523 q8 = _mm256_set1_epi32(qlp_coeff[8 ]);
524
525 for(i = 0; i < (int)data_len-7; i+=8) {
526 __m256i summ, mull;
527 summ = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9)));
528 mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull);
529 mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
530 mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
531 mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
532 mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
533 mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
534 mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
535 mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
536 summ = _mm256_sra_epi32(summ, cnt);
537 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
538 }
539 }
540 }
541 }
542 else if(order > 4) {
543 if(order > 6) {
544 if(order == 8) {
545 __m256i q0, q1, q2, q3, q4, q5, q6, q7;
546 q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
547 q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
548 q2 = _mm256_set1_epi32(qlp_coeff[2 ]);
549 q3 = _mm256_set1_epi32(qlp_coeff[3 ]);
550 q4 = _mm256_set1_epi32(qlp_coeff[4 ]);
551 q5 = _mm256_set1_epi32(qlp_coeff[5 ]);
552 q6 = _mm256_set1_epi32(qlp_coeff[6 ]);
553 q7 = _mm256_set1_epi32(qlp_coeff[7 ]);
554
555 for(i = 0; i < (int)data_len-7; i+=8) {
556 __m256i summ, mull;
557 summ = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8)));
558 mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
559 mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
560 mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
561 mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
562 mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
563 mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
564 mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
565 summ = _mm256_sra_epi32(summ, cnt);
566 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
567 }
568 }
569 else { /* order == 7 */
570 __m256i q0, q1, q2, q3, q4, q5, q6;
571 q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
572 q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
573 q2 = _mm256_set1_epi32(qlp_coeff[2 ]);
574 q3 = _mm256_set1_epi32(qlp_coeff[3 ]);
575 q4 = _mm256_set1_epi32(qlp_coeff[4 ]);
576 q5 = _mm256_set1_epi32(qlp_coeff[5 ]);
577 q6 = _mm256_set1_epi32(qlp_coeff[6 ]);
578
579 for(i = 0; i < (int)data_len-7; i+=8) {
580 __m256i summ, mull;
581 summ = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7)));
582 mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
583 mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
584 mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
585 mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
586 mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
587 mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
588 summ = _mm256_sra_epi32(summ, cnt);
589 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
590 }
591 }
592 }
593 else {
594 if(order == 6) {
595 __m256i q0, q1, q2, q3, q4, q5;
596 q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
597 q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
598 q2 = _mm256_set1_epi32(qlp_coeff[2 ]);
599 q3 = _mm256_set1_epi32(qlp_coeff[3 ]);
600 q4 = _mm256_set1_epi32(qlp_coeff[4 ]);
601 q5 = _mm256_set1_epi32(qlp_coeff[5 ]);
602
603 for(i = 0; i < (int)data_len-7; i+=8) {
604 __m256i summ, mull;
605 summ = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6)));
606 mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
607 mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
608 mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
609 mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
610 mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
611 summ = _mm256_sra_epi32(summ, cnt);
612 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
613 }
614 }
615 else { /* order == 5 */
616 __m256i q0, q1, q2, q3, q4;
617 q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
618 q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
619 q2 = _mm256_set1_epi32(qlp_coeff[2 ]);
620 q3 = _mm256_set1_epi32(qlp_coeff[3 ]);
621 q4 = _mm256_set1_epi32(qlp_coeff[4 ]);
622
623 for(i = 0; i < (int)data_len-7; i+=8) {
624 __m256i summ, mull;
625 summ = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5)));
626 mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
627 mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
628 mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
629 mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
630 summ = _mm256_sra_epi32(summ, cnt);
631 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
632 }
633 }
634 }
635 }
636 else {
637 if(order > 2) {
638 if(order == 4) {
639 __m256i q0, q1, q2, q3;
640 q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
641 q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
642 q2 = _mm256_set1_epi32(qlp_coeff[2 ]);
643 q3 = _mm256_set1_epi32(qlp_coeff[3 ]);
644
645 for(i = 0; i < (int)data_len-7; i+=8) {
646 __m256i summ, mull;
647 summ = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4)));
648 mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
649 mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
650 mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
651 summ = _mm256_sra_epi32(summ, cnt);
652 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
653 }
654 }
655 else { /* order == 3 */
656 __m256i q0, q1, q2;
657 q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
658 q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
659 q2 = _mm256_set1_epi32(qlp_coeff[2 ]);
660
661 for(i = 0; i < (int)data_len-7; i+=8) {
662 __m256i summ, mull;
663 summ = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3)));
664 mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
665 mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
666 summ = _mm256_sra_epi32(summ, cnt);
667 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
668 }
669 }
670 }
671 else {
672 if(order == 2) {
673 __m256i q0, q1;
674 q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
675 q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
676
677 for(i = 0; i < (int)data_len-7; i+=8) {
678 __m256i summ, mull;
679 summ = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2)));
680 mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
681 summ = _mm256_sra_epi32(summ, cnt);
682 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
683 }
684 }
685 else { /* order == 1 */
686 __m256i q0;
687 q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
688
689 for(i = 0; i < (int)data_len-7; i+=8) {
690 __m256i summ;
691 summ = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1)));
692 summ = _mm256_sra_epi32(summ, cnt);
693 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
694 }
695 }
696 }
697 }
698 for(; i < (int)data_len; i++) {
699 sum = 0;
700 switch(order) {
701 case 12: sum += qlp_coeff[11] * data[i-12]; /* Falls through. */
702 case 11: sum += qlp_coeff[10] * data[i-11]; /* Falls through. */
703 case 10: sum += qlp_coeff[ 9] * data[i-10]; /* Falls through. */
704 case 9: sum += qlp_coeff[ 8] * data[i- 9]; /* Falls through. */
705 case 8: sum += qlp_coeff[ 7] * data[i- 8]; /* Falls through. */
706 case 7: sum += qlp_coeff[ 6] * data[i- 7]; /* Falls through. */
707 case 6: sum += qlp_coeff[ 5] * data[i- 6]; /* Falls through. */
708 case 5: sum += qlp_coeff[ 4] * data[i- 5]; /* Falls through. */
709 case 4: sum += qlp_coeff[ 3] * data[i- 4]; /* Falls through. */
710 case 3: sum += qlp_coeff[ 2] * data[i- 3]; /* Falls through. */
711 case 2: sum += qlp_coeff[ 1] * data[i- 2]; /* Falls through. */
712 case 1: sum += qlp_coeff[ 0] * data[i- 1];
713 }
714 residual[i] = data[i] - (sum >> lp_quantization);
715 }
716 }
717 else { /* order > 12 */
718 for(i = 0; i < (int)data_len; i++) {
719 sum = 0;
720 switch(order) {
721 case 32: sum += qlp_coeff[31] * data[i-32]; /* Falls through. */
722 case 31: sum += qlp_coeff[30] * data[i-31]; /* Falls through. */
723 case 30: sum += qlp_coeff[29] * data[i-30]; /* Falls through. */
724 case 29: sum += qlp_coeff[28] * data[i-29]; /* Falls through. */
725 case 28: sum += qlp_coeff[27] * data[i-28]; /* Falls through. */
726 case 27: sum += qlp_coeff[26] * data[i-27]; /* Falls through. */
727 case 26: sum += qlp_coeff[25] * data[i-26]; /* Falls through. */
728 case 25: sum += qlp_coeff[24] * data[i-25]; /* Falls through. */
729 case 24: sum += qlp_coeff[23] * data[i-24]; /* Falls through. */
730 case 23: sum += qlp_coeff[22] * data[i-23]; /* Falls through. */
731 case 22: sum += qlp_coeff[21] * data[i-22]; /* Falls through. */
732 case 21: sum += qlp_coeff[20] * data[i-21]; /* Falls through. */
733 case 20: sum += qlp_coeff[19] * data[i-20]; /* Falls through. */
734 case 19: sum += qlp_coeff[18] * data[i-19]; /* Falls through. */
735 case 18: sum += qlp_coeff[17] * data[i-18]; /* Falls through. */
736 case 17: sum += qlp_coeff[16] * data[i-17]; /* Falls through. */
737 case 16: sum += qlp_coeff[15] * data[i-16]; /* Falls through. */
738 case 15: sum += qlp_coeff[14] * data[i-15]; /* Falls through. */
739 case 14: sum += qlp_coeff[13] * data[i-14]; /* Falls through. */
740 case 13: sum += qlp_coeff[12] * data[i-13];
741 sum += qlp_coeff[11] * data[i-12];
742 sum += qlp_coeff[10] * data[i-11];
743 sum += qlp_coeff[ 9] * data[i-10];
744 sum += qlp_coeff[ 8] * data[i- 9];
745 sum += qlp_coeff[ 7] * data[i- 8];
746 sum += qlp_coeff[ 6] * data[i- 7];
747 sum += qlp_coeff[ 5] * data[i- 6];
748 sum += qlp_coeff[ 4] * data[i- 5];
749 sum += qlp_coeff[ 3] * data[i- 4];
750 sum += qlp_coeff[ 2] * data[i- 3];
751 sum += qlp_coeff[ 1] * data[i- 2];
752 sum += qlp_coeff[ 0] * data[i- 1];
753 }
754 residual[i] = data[i] - (sum >> lp_quantization);
755 }
756 }
757 _mm256_zeroupper();
758 }
759
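/* Permutation pattern for _mm256_permutevar8x32_epi32: after the 64-bit sums
   are shifted, dword lanes 0, 2, 4 and 6 hold the low 32 bits of each result,
   and this pattern gathers them into the lower 128 bits so one
   _mm_storeu_si128 can write four residuals at once. */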
760 static FLAC__int32 pack_arr[8] = { 0, 2, 4, 6, 1, 3, 5, 7 };
761
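/*
 * Wide kernel: products are widened to 64 bits with _mm256_mul_epi32 (which
 * multiplies the signed low 32 bits of each 64-bit lane), accumulated in
 * 64-bit lanes, shifted, and packed back down to 32 bits, so only four
 * residuals are produced per iteration.
 */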
762 FLAC__SSE_TARGET("avx2")
763 void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[])
764 {
765 int i;
766 FLAC__int64 sum;
767 const __m128i cnt = _mm_cvtsi32_si128(lp_quantization);
768 const __m256i pack = _mm256_loadu_si256((const __m256i *)pack_arr);
769
770 FLAC__ASSERT(order > 0);
771 FLAC__ASSERT(order <= 32);
772 FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm256_sra_epi64() so we have to use _mm256_srl_epi64() */
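	/* A logical 64-bit shift yields the same low 32 bits as an arithmetic one
	   here: for a shift count n <= 32 the low 32 result bits come from input
	   bits n..n+31, which never include the sign-extension bits, and only the
	   low 32 bits of each lane survive the pack step below. */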
773
774 if(order <= 12) {
775 if(order > 8) {
776 if(order > 10) {
777 if(order == 12) {
778 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
779 q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
780 q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
781 q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
782 q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
783 q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
784 q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
785 q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
786 q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
787 q8 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
788 q9 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ]));
789 q10 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[10]));
790 q11 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[11]));
791
792 for(i = 0; i < (int)data_len-3; i+=4) {
793 __m256i summ, mull;
794 summ = _mm256_mul_epi32(q11, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-12))));
795 mull = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11)))); summ = _mm256_add_epi64(summ, mull);
796 mull = _mm256_mul_epi32(q9, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull);
797 mull = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
798 mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
799 mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
800 mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
801 mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
802 mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
803 mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
804 mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
805 mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
806 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
807 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
808 }
809 }
810 else { /* order == 11 */
811 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
812 q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
813 q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
814 q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
815 q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
816 q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
817 q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
818 q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
819 q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
820 q8 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
821 q9 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ]));
822 q10 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[10]));
823
824 for(i = 0; i < (int)data_len-3; i+=4) {
825 __m256i summ, mull;
826 summ = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11))));
827 mull = _mm256_mul_epi32(q9, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull);
828 mull = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
829 mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
830 mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
831 mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
832 mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
833 mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
834 mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
835 mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
836 mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
837 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
838 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
839 }
840 }
841 }
842 else {
843 if(order == 10) {
844 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
845 q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
846 q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
847 q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
848 q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
849 q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
850 q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
851 q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
852 q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
853 q8 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
854 q9 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ]));
855
856 for(i = 0; i < (int)data_len-3; i+=4) {
857 __m256i summ, mull;
858 summ = _mm256_mul_epi32(q9, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10))));
859 mull = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
860 mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
861 mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
862 mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
863 mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
864 mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
865 mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
866 mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
867 mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
868 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
869 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
870 }
871 }
872 else { /* order == 9 */
873 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
874 q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
875 q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
876 q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
877 q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
878 q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
879 q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
880 q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
881 q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
882 q8 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
883
884 for(i = 0; i < (int)data_len-3; i+=4) {
885 __m256i summ, mull;
886 summ = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 ))));
887 mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
888 mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
889 mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
890 mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
891 mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
892 mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
893 mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
894 mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
895 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
896 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
897 }
898 }
899 }
900 }
901 else if(order > 4) {
902 if(order > 6) {
903 if(order == 8) {
904 __m256i q0, q1, q2, q3, q4, q5, q6, q7;
905 q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
906 q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
907 q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
908 q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
909 q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
910 q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
911 q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
912 q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
913
					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 ))));
						mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 7 */
					__m256i q0, q1, q2, q3, q4, q5, q6;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
					q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
					q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 ))));
						mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
			else {
				if(order == 6) {
					__m256i q0, q1, q2, q3, q4, q5;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
					q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 ))));
						mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 5 */
					__m256i q0, q1, q2, q3, q4;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 ))));
						mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
		}
		else {
			if(order > 2) {
				if(order == 4) {
					__m256i q0, q1, q2, q3;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 ))));
						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 3 */
					__m256i q0, q1, q2;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 ))));
						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
			else {
				if(order == 2) {
					__m256i q0, q1;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 ))));
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 1 */
					__m256i q0;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ;
						summ = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 ))));
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
		}
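		/* scalar tail: handle the 0..3 samples left over by the 4-wide vector loops above */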
		for(; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 12: sum += qlp_coeff[11] * (FLAC__int64)data[i-12]; /* Falls through. */
				case 11: sum += qlp_coeff[10] * (FLAC__int64)data[i-11]; /* Falls through. */
				case 10: sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10]; /* Falls through. */
				case 9: sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9]; /* Falls through. */
				case 8: sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8]; /* Falls through. */
				case 7: sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7]; /* Falls through. */
				case 6: sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6]; /* Falls through. */
				case 5: sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5]; /* Falls through. */
				case 4: sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4]; /* Falls through. */
				case 3: sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3]; /* Falls through. */
				case 2: sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2]; /* Falls through. */
				case 1: sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
			}
			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
		}
	}
	else { /* order > 12 */
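		/* orders above 12 are not vectorized; compute the prediction entirely with 64-bit scalar arithmetic */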
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32]; /* Falls through. */
				case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31]; /* Falls through. */
				case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30]; /* Falls through. */
				case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29]; /* Falls through. */
				case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28]; /* Falls through. */
				case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27]; /* Falls through. */
				case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26]; /* Falls through. */
				case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25]; /* Falls through. */
				case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24]; /* Falls through. */
				case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23]; /* Falls through. */
				case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22]; /* Falls through. */
				case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21]; /* Falls through. */
				case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20]; /* Falls through. */
				case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19]; /* Falls through. */
				case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18]; /* Falls through. */
				case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17]; /* Falls through. */
				case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16]; /* Falls through. */
				case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15]; /* Falls through. */
				case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14]; /* Falls through. */
				case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
				         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
				         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
				         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
				         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
				         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
				         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
				         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
				         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
				         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
				         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
				         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
				         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
			}
			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
		}
	}
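	/* zero the upper halves of the YMM registers so that following SSE code avoids AVX->SSE transition penalties */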
	_mm256_zeroupper();
}

#endif /* FLAC__AVX2_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */