/*
* Copyright (c) 2018, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
*/
11
12 #include "EbDefinitions.h"
13 #include "common_dsp_rtcd.h"
14 #include <tmmintrin.h>
15 #include "EbInvTransforms.h"
16 #include "av1_inv_txfm_ssse3.h"
17 #include "av1_txfm_sse2.h"
18 #include "transpose_sse2.h"
19
// Powers of sqrt(2) in Q12 fixed point (4096 == 1.0): Sqrt2, Sqrt2^2, Sqrt2^3,
// Sqrt2^4, Sqrt2^5, i.e. 5793 ~= 4096 * sqrt(2).
// NOTE(review): presumably used later in this file to rescale coefficients of
// rectangular transforms, indexed by a TX-size difference — confirm at call sites.
static int32_t new_sqrt2list[TX_SIZES] = {5793, 2 * 4096, 2 * 5793, 4 * 4096, 4 * 5793};
22
idct4_new_sse2(const __m128i * input,__m128i * output,int8_t cos_bit)23 static void idct4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
24 (void)cos_bit;
25 const int32_t *cospi = cospi_arr(INV_COS_BIT);
26 const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
27
28 const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
29 const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
30 const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
31 const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
32
33 // stage 1
34 __m128i x[4];
35 x[0] = input[0];
36 x[1] = input[2];
37 x[2] = input[1];
38 x[3] = input[3];
39
40 // stage 2
41 btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1], __rounding);
42 btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3], __rounding);
43
44 // stage 3
45 btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
46 btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
47 }
48
idct4_w4_new_sse2(const __m128i * input,__m128i * output,int8_t cos_bit)49 static void idct4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
50 (void)cos_bit;
51 const int32_t *cospi = cospi_arr(INV_COS_BIT);
52 const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
53
54 const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
55 const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
56 const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
57 const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
58
59 // stage 1
60 __m128i x[4];
61 x[0] = input[0];
62 x[1] = input[2];
63 x[2] = input[1];
64 x[3] = input[3];
65
66 // stage 2
67 btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1], __rounding);
68 btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3], __rounding);
69
70 // stage 3
71 btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
72 btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
73 }
74
idct8_low1_new_ssse3(const __m128i * input,__m128i * output,int8_t cos_bit)75 static void idct8_low1_new_ssse3(const __m128i *input, __m128i *output, int8_t cos_bit) {
76 (void)cos_bit;
77 const int32_t *cospi = cospi_arr(INV_COS_BIT);
78
79 // stage 1
80 __m128i x[2];
81 x[0] = input[0];
82
83 // stage 2
84 // stage 3
85 btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
86
87 // stage 4
88 // stage 5
89 output[0] = x[0];
90 output[7] = x[0];
91 output[1] = x[1];
92 output[6] = x[1];
93 output[2] = x[1];
94 output[5] = x[1];
95 output[3] = x[0];
96 output[4] = x[0];
97 }
98
// Full 8-point inverse DCT over eight packed 16-bit lanes per register (SSE2).
// input:  8 registers of coefficients in natural order; output: 8 registers of
// results. cos_bit is unused — this file works at the fixed INV_COS_BIT precision.
static void idct8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi = cospi_arr(INV_COS_BIT);
    // 0.5 in Q(INV_COS_BIT), added by the butterfly macros before the downshift.
    const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    // Interleaved 16-bit cosine pairs (a, b) so one pmaddwd yields a*even + b*odd.
    const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
    const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
    const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
    const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
    const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
    const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
    const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
    const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

    // stage 1: bit-reversal-style input reordering
    __m128i x[8];
    x[0] = input[0];
    x[1] = input[4];
    x[2] = input[2];
    x[3] = input[6];
    x[4] = input[1];
    x[5] = input[5];
    x[6] = input[3];
    x[7] = input[7];

    // stage 2: rotations on the odd half
    btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7], __rounding);
    btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6], __rounding);

    // stage 3: even-half rotations plus first odd-half add/sub butterflies
    btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1], __rounding);
    btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3], __rounding);
    btf_16_adds_subs_sse2(x[4], x[5]);
    btf_16_subs_adds_sse2(x[7], x[6]);

    // stage 4: even-half butterflies; x[5]/x[6] get the half-pi rotation
    btf_16_adds_subs_sse2(x[0], x[3]);
    btf_16_adds_subs_sse2(x[1], x[2]);
    btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6], __rounding);

    // stage 5: final mirror-image butterflies, written to output
    btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
    btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
    btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
    btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
}
146
// 8-point inverse DCT for a 4-sample-wide column group. Identical stage
// structure to idct8_new_sse2, but uses the btf_16_4p_sse2 butterfly variant
// for the rotations. cos_bit is unused (fixed INV_COS_BIT precision).
static void idct8_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi = cospi_arr(INV_COS_BIT);
    // 0.5 in Q(INV_COS_BIT) for the rounded downshift inside the macros.
    const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    // Interleaved 16-bit cosine pairs for the pmaddwd-based rotations.
    const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
    const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
    const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
    const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
    const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
    const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
    const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
    const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

    // stage 1: input reordering
    __m128i x[8];
    x[0] = input[0];
    x[1] = input[4];
    x[2] = input[2];
    x[3] = input[6];
    x[4] = input[1];
    x[5] = input[5];
    x[6] = input[3];
    x[7] = input[7];

    // stage 2: odd-half rotations
    btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7], __rounding);
    btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6], __rounding);

    // stage 3: even-half rotations + odd-half add/sub butterflies
    btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1], __rounding);
    btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3], __rounding);
    btf_16_adds_subs_sse2(x[4], x[5]);
    btf_16_subs_adds_sse2(x[7], x[6]);

    // stage 4
    btf_16_adds_subs_sse2(x[0], x[3]);
    btf_16_adds_subs_sse2(x[1], x[2]);
    btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6], __rounding);

    // stage 5: mirror-image butterflies to output
    btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
    btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
    btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
    btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
}
194
idct16_stage5_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)195 static INLINE void idct16_stage5_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding,
196 int8_t cos_bit) {
197 const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
198 const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
199 btf_16_adds_subs_sse2(x[0], x[3]);
200 btf_16_adds_subs_sse2(x[1], x[2]);
201 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6], __rounding);
202 btf_16_adds_subs_sse2(x[8], x[11]);
203 btf_16_adds_subs_sse2(x[9], x[10]);
204 btf_16_subs_adds_sse2(x[15], x[12]);
205 btf_16_subs_adds_sse2(x[14], x[13]);
206 }
207
idct16_stage6_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)208 static INLINE void idct16_stage6_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding,
209 int8_t cos_bit) {
210 const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
211 const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
212 btf_16_adds_subs_sse2(x[0], x[7]);
213 btf_16_adds_subs_sse2(x[1], x[6]);
214 btf_16_adds_subs_sse2(x[2], x[5]);
215 btf_16_adds_subs_sse2(x[3], x[4]);
216 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13], __rounding);
217 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12], __rounding);
218 }
219
idct16_stage7_sse2(__m128i * output,__m128i * x)220 static INLINE void idct16_stage7_sse2(__m128i *output, __m128i *x) {
221 btf_16_adds_subs_out_sse2(output[0], output[15], x[0], x[15]);
222 btf_16_adds_subs_out_sse2(output[1], output[14], x[1], x[14]);
223 btf_16_adds_subs_out_sse2(output[2], output[13], x[2], x[13]);
224 btf_16_adds_subs_out_sse2(output[3], output[12], x[3], x[12]);
225 btf_16_adds_subs_out_sse2(output[4], output[11], x[4], x[11]);
226 btf_16_adds_subs_out_sse2(output[5], output[10], x[5], x[10]);
227 btf_16_adds_subs_out_sse2(output[6], output[9], x[6], x[9]);
228 btf_16_adds_subs_out_sse2(output[7], output[8], x[7], x[8]);
229 }
230
idct16_low1_new_ssse3(const __m128i * input,__m128i * output,int8_t cos_bit)231 static void idct16_low1_new_ssse3(const __m128i *input, __m128i *output, int8_t cos_bit) {
232 (void)cos_bit;
233 const int32_t *cospi = cospi_arr(INV_COS_BIT);
234
235 // stage 1
236 __m128i x[2];
237 x[0] = input[0];
238
239 // stage 2
240 // stage 3
241 // stage 4
242 btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
243
244 // stage 5
245 // stage 6
246 // stage 7
247 output[0] = x[0];
248 output[15] = x[0];
249 output[1] = x[1];
250 output[14] = x[1];
251 output[2] = x[1];
252 output[13] = x[1];
253 output[3] = x[0];
254 output[12] = x[0];
255 output[4] = x[0];
256 output[11] = x[0];
257 output[5] = x[1];
258 output[10] = x[1];
259 output[6] = x[1];
260 output[9] = x[1];
261 output[7] = x[0];
262 output[8] = x[0];
263 }
264
// 16-point inverse DCT when only the first 8 coefficients (per dimension) are
// non-zero. The btf_16_ssse3 single-input rotations exploit the implied zeros;
// stages 5-7 are shared with the full idct16 via the stage helpers.
static void idct16_low8_new_ssse3(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi = cospi_arr(INV_COS_BIT);
    // 0.5 in Q(INV_COS_BIT) for the rounded downshift inside the macros.
    const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
    const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
    const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
    const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);

    // stage 1: load only the 8 possibly non-zero inputs (odd slots stay unset
    // until the single-input rotations below produce them)
    __m128i x[16];
    x[0] = input[0];
    x[2] = input[4];
    x[4] = input[2];
    x[6] = input[6];
    x[8] = input[1];
    x[10] = input[5];
    x[12] = input[3];
    x[14] = input[7];

    // stage 2: one-input rotations produce both halves of each pair
    btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
    btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
    btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
    btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);

    // stage 3
    btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
    btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
    btf_16_adds_subs_sse2(x[8], x[9]);
    btf_16_subs_adds_sse2(x[11], x[10]);
    btf_16_adds_subs_sse2(x[12], x[13]);
    btf_16_subs_adds_sse2(x[15], x[14]);

    // stage 4
    btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
    btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
    btf_16_adds_subs_sse2(x[4], x[5]);
    btf_16_subs_adds_sse2(x[7], x[6]);
    btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14], __rounding);
    btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13], __rounding);

    // stages 5-7: shared with the full-width idct16
    idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
    idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
    idct16_stage7_sse2(output, x);
}
310
// Full 16-point inverse DCT over eight packed 16-bit lanes per register (SSE2).
// cos_bit is unused (fixed INV_COS_BIT precision); stages 5-7 are factored into
// the shared idct16_stage*_sse2 helpers.
static void idct16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi = cospi_arr(INV_COS_BIT);
    // 0.5 in Q(INV_COS_BIT) for the rounded downshift inside the macros.
    const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    // Interleaved 16-bit cosine pairs for the pmaddwd-based rotations.
    const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
    const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
    const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
    const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
    const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
    const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
    const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
    const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
    const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
    const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
    const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
    const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
    const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
    const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
    const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
    const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
    const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
    const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);

    // stage 1: bit-reversal-style input reordering
    __m128i x[16];
    x[0] = input[0];
    x[1] = input[8];
    x[2] = input[4];
    x[3] = input[12];
    x[4] = input[2];
    x[5] = input[10];
    x[6] = input[6];
    x[7] = input[14];
    x[8] = input[1];
    x[9] = input[9];
    x[10] = input[5];
    x[11] = input[13];
    x[12] = input[3];
    x[13] = input[11];
    x[14] = input[7];
    x[15] = input[15];

    // stage 2: rotations on the odd half (x[8..15])
    btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15], __rounding);
    btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14], __rounding);
    btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13], __rounding);
    btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12], __rounding);

    // stage 3: rotations on x[4..7], add/sub merges on the odd half
    btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7], __rounding);
    btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6], __rounding);
    btf_16_adds_subs_sse2(x[8], x[9]);
    btf_16_subs_adds_sse2(x[11], x[10]);
    btf_16_adds_subs_sse2(x[12], x[13]);
    btf_16_subs_adds_sse2(x[15], x[14]);

    // stage 4: even-quarter rotations plus odd-half corrective rotations
    btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1], __rounding);
    btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3], __rounding);
    btf_16_adds_subs_sse2(x[4], x[5]);
    btf_16_subs_adds_sse2(x[7], x[6]);
    btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14], __rounding);
    btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13], __rounding);

    // stage 5~7
    idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
    idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
    idct16_stage7_sse2(output, x);
}
382
// 16-point inverse DCT for a 4-sample-wide column group. Same stage structure
// as idct16_new_sse2 but with the btf_16_4p_sse2 butterfly variant; stages 5
// and 6 are therefore inlined here instead of calling the 8-wide helpers.
static void idct16_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi = cospi_arr(INV_COS_BIT);
    // 0.5 in Q(INV_COS_BIT) for the rounded downshift inside the macros.
    const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    // Interleaved 16-bit cosine pairs for the pmaddwd-based rotations.
    const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
    const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
    const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
    const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
    const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
    const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
    const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
    const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
    const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
    const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
    const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
    const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
    const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
    const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
    const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
    const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
    const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
    const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
    const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

    // stage 1: bit-reversal-style input reordering
    __m128i x[16];
    x[0] = input[0];
    x[1] = input[8];
    x[2] = input[4];
    x[3] = input[12];
    x[4] = input[2];
    x[5] = input[10];
    x[6] = input[6];
    x[7] = input[14];
    x[8] = input[1];
    x[9] = input[9];
    x[10] = input[5];
    x[11] = input[13];
    x[12] = input[3];
    x[13] = input[11];
    x[14] = input[7];
    x[15] = input[15];

    // stage 2: rotations on the odd half
    btf_16_4p_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15], __rounding);
    btf_16_4p_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14], __rounding);
    btf_16_4p_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13], __rounding);
    btf_16_4p_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12], __rounding);

    // stage 3
    btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7], __rounding);
    btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6], __rounding);
    btf_16_adds_subs_sse2(x[8], x[9]);
    btf_16_subs_adds_sse2(x[11], x[10]);
    btf_16_adds_subs_sse2(x[12], x[13]);
    btf_16_subs_adds_sse2(x[15], x[14]);

    // stage 4
    btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1], __rounding);
    btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3], __rounding);
    btf_16_adds_subs_sse2(x[4], x[5]);
    btf_16_subs_adds_sse2(x[7], x[6]);
    btf_16_4p_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14], __rounding);
    btf_16_4p_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13], __rounding);

    // stage 5 (4-wide version of idct16_stage5_sse2)
    btf_16_adds_subs_sse2(x[0], x[3]);
    btf_16_adds_subs_sse2(x[1], x[2]);
    btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6], __rounding);
    btf_16_adds_subs_sse2(x[8], x[11]);
    btf_16_adds_subs_sse2(x[9], x[10]);
    btf_16_subs_adds_sse2(x[15], x[12]);
    btf_16_subs_adds_sse2(x[14], x[13]);

    // stage 6 (4-wide version of idct16_stage6_sse2)
    btf_16_adds_subs_sse2(x[0], x[7]);
    btf_16_adds_subs_sse2(x[1], x[6]);
    btf_16_adds_subs_sse2(x[2], x[5]);
    btf_16_adds_subs_sse2(x[3], x[4]);
    btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13], __rounding);
    btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12], __rounding);

    // stage 7: add/sub-only, so the 8-wide helper is reusable as-is
    idct16_stage7_sse2(output, x);
}
470
idct32_high16_stage3_sse2(__m128i * x)471 static INLINE void idct32_high16_stage3_sse2(__m128i *x) {
472 btf_16_adds_subs_sse2(x[16], x[17]);
473 btf_16_subs_adds_sse2(x[19], x[18]);
474 btf_16_adds_subs_sse2(x[20], x[21]);
475 btf_16_subs_adds_sse2(x[23], x[22]);
476 btf_16_adds_subs_sse2(x[24], x[25]);
477 btf_16_subs_adds_sse2(x[27], x[26]);
478 btf_16_adds_subs_sse2(x[28], x[29]);
479 btf_16_subs_adds_sse2(x[31], x[30]);
480 }
481
idct32_high16_stage4_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)482 static INLINE void idct32_high16_stage4_sse2(__m128i *x, const int32_t *cospi,
483 const __m128i __rounding, int8_t cos_bit) {
484 const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
485 const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
486 const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
487 const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
488 const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
489 const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
490 btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30], __rounding);
491 btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29], __rounding);
492 btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26], __rounding);
493 btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25], __rounding);
494 }
495
idct32_high24_stage5_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)496 static INLINE void idct32_high24_stage5_sse2(__m128i *x, const int32_t *cospi,
497 const __m128i __rounding, int8_t cos_bit) {
498 const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
499 const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
500 const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
501 btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14], __rounding);
502 btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13], __rounding);
503 btf_16_adds_subs_sse2(x[16], x[19]);
504 btf_16_adds_subs_sse2(x[17], x[18]);
505 btf_16_subs_adds_sse2(x[23], x[20]);
506 btf_16_subs_adds_sse2(x[22], x[21]);
507 btf_16_adds_subs_sse2(x[24], x[27]);
508 btf_16_adds_subs_sse2(x[25], x[26]);
509 btf_16_subs_adds_sse2(x[31], x[28]);
510 btf_16_subs_adds_sse2(x[30], x[29]);
511 }
512
idct32_high28_stage6_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)513 static INLINE void idct32_high28_stage6_sse2(__m128i *x, const int32_t *cospi,
514 const __m128i __rounding, int8_t cos_bit) {
515 const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
516 const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
517 const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
518 const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
519 const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
520 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6], __rounding);
521 btf_16_adds_subs_sse2(x[8], x[11]);
522 btf_16_adds_subs_sse2(x[9], x[10]);
523 btf_16_subs_adds_sse2(x[15], x[12]);
524 btf_16_subs_adds_sse2(x[14], x[13]);
525 btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29], __rounding);
526 btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28], __rounding);
527 btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27], __rounding);
528 btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26], __rounding);
529 }
530
idct32_stage7_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)531 static INLINE void idct32_stage7_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding,
532 int8_t cos_bit) {
533 const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
534 const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
535 btf_16_adds_subs_sse2(x[0], x[7]);
536 btf_16_adds_subs_sse2(x[1], x[6]);
537 btf_16_adds_subs_sse2(x[2], x[5]);
538 btf_16_adds_subs_sse2(x[3], x[4]);
539 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13], __rounding);
540 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12], __rounding);
541 btf_16_adds_subs_sse2(x[16], x[23]);
542 btf_16_adds_subs_sse2(x[17], x[22]);
543 btf_16_adds_subs_sse2(x[18], x[21]);
544 btf_16_adds_subs_sse2(x[19], x[20]);
545 btf_16_subs_adds_sse2(x[31], x[24]);
546 btf_16_subs_adds_sse2(x[30], x[25]);
547 btf_16_subs_adds_sse2(x[29], x[26]);
548 btf_16_subs_adds_sse2(x[28], x[27]);
549 }
550
idct32_stage8_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)551 static INLINE void idct32_stage8_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding,
552 int8_t cos_bit) {
553 const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
554 const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
555 btf_16_adds_subs_sse2(x[0], x[15]);
556 btf_16_adds_subs_sse2(x[1], x[14]);
557 btf_16_adds_subs_sse2(x[2], x[13]);
558 btf_16_adds_subs_sse2(x[3], x[12]);
559 btf_16_adds_subs_sse2(x[4], x[11]);
560 btf_16_adds_subs_sse2(x[5], x[10]);
561 btf_16_adds_subs_sse2(x[6], x[9]);
562 btf_16_adds_subs_sse2(x[7], x[8]);
563 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27], __rounding);
564 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26], __rounding);
565 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25], __rounding);
566 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24], __rounding);
567 }
568
idct32_stage9_sse2(__m128i * output,__m128i * x)569 static INLINE void idct32_stage9_sse2(__m128i *output, __m128i *x) {
570 btf_16_adds_subs_out_sse2(output[0], output[31], x[0], x[31]);
571 btf_16_adds_subs_out_sse2(output[1], output[30], x[1], x[30]);
572 btf_16_adds_subs_out_sse2(output[2], output[29], x[2], x[29]);
573 btf_16_adds_subs_out_sse2(output[3], output[28], x[3], x[28]);
574 btf_16_adds_subs_out_sse2(output[4], output[27], x[4], x[27]);
575 btf_16_adds_subs_out_sse2(output[5], output[26], x[5], x[26]);
576 btf_16_adds_subs_out_sse2(output[6], output[25], x[6], x[25]);
577 btf_16_adds_subs_out_sse2(output[7], output[24], x[7], x[24]);
578 btf_16_adds_subs_out_sse2(output[8], output[23], x[8], x[23]);
579 btf_16_adds_subs_out_sse2(output[9], output[22], x[9], x[22]);
580 btf_16_adds_subs_out_sse2(output[10], output[21], x[10], x[21]);
581 btf_16_adds_subs_out_sse2(output[11], output[20], x[11], x[20]);
582 btf_16_adds_subs_out_sse2(output[12], output[19], x[12], x[19]);
583 btf_16_adds_subs_out_sse2(output[13], output[18], x[13], x[18]);
584 btf_16_adds_subs_out_sse2(output[14], output[17], x[14], x[17]);
585 btf_16_adds_subs_out_sse2(output[15], output[16], x[15], x[16]);
586 }
587
idct32_low1_new_ssse3(const __m128i * input,__m128i * output,int8_t cos_bit)588 static void idct32_low1_new_ssse3(const __m128i *input, __m128i *output, int8_t cos_bit) {
589 (void)cos_bit;
590 const int32_t *cospi = cospi_arr(INV_COS_BIT);
591
592 // stage 1
593 __m128i x[2];
594 x[0] = input[0];
595
596 // stage 2
597 // stage 3
598 // stage 4
599 // stage 5
600 btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
601
602 // stage 6
603 // stage 7
604 // stage 8
605 // stage 9
606 output[0] = x[0];
607 output[31] = x[0];
608 output[1] = x[1];
609 output[30] = x[1];
610 output[2] = x[1];
611 output[29] = x[1];
612 output[3] = x[0];
613 output[28] = x[0];
614 output[4] = x[0];
615 output[27] = x[0];
616 output[5] = x[1];
617 output[26] = x[1];
618 output[6] = x[1];
619 output[25] = x[1];
620 output[7] = x[0];
621 output[24] = x[0];
622 output[8] = x[0];
623 output[23] = x[0];
624 output[9] = x[1];
625 output[22] = x[1];
626 output[10] = x[1];
627 output[21] = x[1];
628 output[11] = x[0];
629 output[20] = x[0];
630 output[12] = x[0];
631 output[19] = x[0];
632 output[13] = x[1];
633 output[18] = x[1];
634 output[14] = x[1];
635 output[17] = x[1];
636 output[15] = x[0];
637 output[16] = x[0];
638 }
639
// 32-point inverse DCT when only the first 8 coefficients are non-zero.
// Uses one-input btf_16_ssse3 rotations where the partner coefficient is an
// implied zero, and plain copies where an add/sub butterfly would combine a
// value with zero; stages 4-9 reuse the shared idct32 stage helpers.
static void idct32_low8_new_ssse3(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi = cospi_arr(INV_COS_BIT);
    // 0.5 in Q(INV_COS_BIT) for the rounded downshift inside the macros.
    const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    // stage 1: load only the 8 possibly non-zero inputs
    __m128i x[32];
    x[0] = input[0];
    x[4] = input[4];
    x[8] = input[2];
    x[12] = input[6];
    x[16] = input[1];
    x[20] = input[5];
    x[24] = input[3];
    x[28] = input[7];

    // stage 2: one-input rotations produce both halves of each pair
    btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
    btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
    btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
    btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);

    // stage 3: butterflies against zero degenerate to copies
    btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
    btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
    x[17] = x[16];
    x[18] = x[19];
    x[21] = x[20];
    x[22] = x[23];
    x[25] = x[24];
    x[26] = x[27];
    x[29] = x[28];
    x[30] = x[31];

    // stage 4
    btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
    x[9] = x[8];
    x[10] = x[11];
    x[13] = x[12];
    x[14] = x[15];
    idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);

    // stage 5
    btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
    x[5] = x[4];
    x[6] = x[7];
    idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
    // stage 6
    x[3] = x[0];
    x[2] = x[1];
    idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);

    // stages 7-9: shared with the full-width idct32
    idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
    idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
    idct32_stage9_sse2(output, x);
}
696
// 32-point inverse DCT when only the first 16 coefficients are non-zero.
// All stage-2/3/4/5 rotations use the one-input btf_16_ssse3 form (the partner
// coefficient is an implied zero); stages 3-9 reuse the shared stage helpers.
static void idct32_low16_new_ssse3(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi = cospi_arr(INV_COS_BIT);
    // 0.5 in Q(INV_COS_BIT) for the rounded downshift inside the macros.
    const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    // stage 1: load the 16 possibly non-zero inputs in bit-reversal-style order
    __m128i x[32];
    x[0] = input[0];
    x[2] = input[8];
    x[4] = input[4];
    x[6] = input[12];
    x[8] = input[2];
    x[10] = input[10];
    x[12] = input[6];
    x[14] = input[14];
    x[16] = input[1];
    x[18] = input[9];
    x[20] = input[5];
    x[22] = input[13];
    x[24] = input[3];
    x[26] = input[11];
    x[28] = input[7];
    x[30] = input[15];

    // stage 2: one-input rotations on the top half
    btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
    btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
    btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
    btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
    btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
    btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
    btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
    btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);

    // stage 3
    btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
    btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
    btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
    btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
    idct32_high16_stage3_sse2(x);

    // stage 4
    btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
    btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
    btf_16_adds_subs_sse2(x[8], x[9]);
    btf_16_subs_adds_sse2(x[11], x[10]);
    btf_16_adds_subs_sse2(x[12], x[13]);
    btf_16_subs_adds_sse2(x[15], x[14]);
    idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);

    // stage 5
    btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
    btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
    btf_16_adds_subs_sse2(x[4], x[5]);
    btf_16_subs_adds_sse2(x[7], x[6]);
    idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);

    // stage 6
    btf_16_adds_subs_sse2(x[0], x[3]);
    btf_16_adds_subs_sse2(x[1], x[2]);
    idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);

    // stages 7-9: shared with the full-width idct32
    idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
    idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
    idct32_stage9_sse2(output, x);
}
762
/*
 * Full 32-point inverse DCT over 16-bit lanes (one __m128i = 8 columns).
 *
 * input:   32 coefficient vectors, loaded below in the AV1 bit-reversed
 *          butterfly order (input[0], input[16], input[8], ...).
 * output:  32 result vectors, written by the final stage-9 helper.
 * cos_bit: unused directly; all twiddle factors come from
 *          cospi_arr(INV_COS_BIT). NOTE(review): the btf_16_* macros may
 *          reference cos_bit/__rounding from this scope — keep both live.
 *
 * All btf_16_* macros operate in place on x[]; statement order within each
 * stage follows the reference butterfly network and must not be changed.
 */
static void idct32_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi      = cospi_arr(INV_COS_BIT);
    const __m128i __rounding  = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    /* Packed (w0, w1) 16-bit twiddle pairs for the rotation butterflies. */
    const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
    const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
    const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
    const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
    const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
    const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
    const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
    const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
    const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
    const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
    const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
    const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
    const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
    const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
    const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
    const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
    const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
    const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
    const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
    const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
    const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
    const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
    const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
    const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
    const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
    const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
    const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
    const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
    const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
    const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
    const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);

    // stage 1: bit-reversed permutation of the input coefficients
    __m128i x[32];
    x[0]  = input[0];
    x[1]  = input[16];
    x[2]  = input[8];
    x[3]  = input[24];
    x[4]  = input[4];
    x[5]  = input[20];
    x[6]  = input[12];
    x[7]  = input[28];
    x[8]  = input[2];
    x[9]  = input[18];
    x[10] = input[10];
    x[11] = input[26];
    x[12] = input[6];
    x[13] = input[22];
    x[14] = input[14];
    x[15] = input[30];
    x[16] = input[1];
    x[17] = input[17];
    x[18] = input[9];
    x[19] = input[25];
    x[20] = input[5];
    x[21] = input[21];
    x[22] = input[13];
    x[23] = input[29];
    x[24] = input[3];
    x[25] = input[19];
    x[26] = input[11];
    x[27] = input[27];
    x[28] = input[7];
    x[29] = input[23];
    x[30] = input[15];
    x[31] = input[31];

    // stage 2: rotations on the odd half x[16..31]
    btf_16_sse2(cospi_p62_m02, cospi_p02_p62, x[16], x[31], x[16], x[31], __rounding);
    btf_16_sse2(cospi_p30_m34, cospi_p34_p30, x[17], x[30], x[17], x[30], __rounding);
    btf_16_sse2(cospi_p46_m18, cospi_p18_p46, x[18], x[29], x[18], x[29], __rounding);
    btf_16_sse2(cospi_p14_m50, cospi_p50_p14, x[19], x[28], x[19], x[28], __rounding);
    btf_16_sse2(cospi_p54_m10, cospi_p10_p54, x[20], x[27], x[20], x[27], __rounding);
    btf_16_sse2(cospi_p22_m42, cospi_p42_p22, x[21], x[26], x[21], x[26], __rounding);
    btf_16_sse2(cospi_p38_m26, cospi_p26_p38, x[22], x[25], x[22], x[25], __rounding);
    btf_16_sse2(cospi_p06_m58, cospi_p58_p06, x[23], x[24], x[23], x[24], __rounding);

    // stage 3: rotations on x[8..15]; add/sub folding of x[16..31]
    btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15], __rounding);
    btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14], __rounding);
    btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13], __rounding);
    btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12], __rounding);
    idct32_high16_stage3_sse2(x);

    // stage 4
    btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7], __rounding);
    btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6], __rounding);
    btf_16_adds_subs_sse2(x[8], x[9]);
    btf_16_subs_adds_sse2(x[11], x[10]);
    btf_16_adds_subs_sse2(x[12], x[13]);
    btf_16_subs_adds_sse2(x[15], x[14]);
    idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);

    // stage 5
    btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1], __rounding);
    btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3], __rounding);
    btf_16_adds_subs_sse2(x[4], x[5]);
    // FIX: was btf_16_adds_subs_sse2(x[7], x[6]). The descending-index pair of
    // a butterfly group takes the subs_adds variant, matching stage 4 above
    // (x[11]/x[10], x[15]/x[14]) and the identical stage in the low-range
    // idct32 variants of this file; the adds_subs form stores the sum/diff
    // into the wrong lanes.
    btf_16_subs_adds_sse2(x[7], x[6]);
    idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);

    // stage 6
    btf_16_adds_subs_sse2(x[0], x[3]);
    btf_16_adds_subs_sse2(x[1], x[2]);
    idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);

    // stages 7-9: remaining folds, then the final add/sub into output[]
    idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
    idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
    idct32_stage9_sse2(output, x);
}
879
/*
 * Stage 4 of the 64-point inverse DCT, restricted to the odd-frequency half
 * x[32..63]: eight in-place rotation butterflies on the pairs
 * (33,62) (34,61) (37,58) (38,57) (41,54) (42,53) (45,50) (46,49).
 * The remaining lanes of that half are untouched at this stage.
 * NOTE(review): cos_bit appears unused here but may be consumed inside the
 * btf_16_sse2 macro — confirm against av1_txfm_sse2.h before removing it.
 */
static INLINE void idct64_stage4_high32_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding, int8_t cos_bit) {
    /* (w0, w1) twiddle pairs for the four distinct rotation angles (4, 36, 20, 52). */
    const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
    const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
    const __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]);
    const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
    const __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
    const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
    const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
    const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
    const __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]);
    const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
    const __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
    const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
    /* Each pair of rows below is a +angle / mirrored -angle rotation. */
    btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62], __rounding);
    btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61], __rounding);
    btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58], __rounding);
    btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57], __rounding);
    btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54], __rounding);
    btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53], __rounding);
    btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50], __rounding);
    btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49], __rounding);
}
903
/*
 * Stage 5 of the 64-point inverse DCT for the upper 48 lanes x[16..63]:
 * four rotations on the x[17..30] pairs, then add/sub butterflies folding
 * x[32..63] in groups of four (ascending pairs use adds_subs, descending
 * pairs use subs_adds). All operations are in place on x[].
 * NOTE(review): cos_bit may be consumed implicitly by btf_16_sse2 — confirm.
 */
static INLINE void idct64_stage5_high48_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding, int8_t cos_bit) {
    const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
    const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
    const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
    const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
    const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
    const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
    btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30], __rounding);
    btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29], __rounding);
    btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26], __rounding);
    btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25], __rounding);
    /* Fold each group of eight (32..39, 40..47, 48..55, 56..63) pairwise. */
    btf_16_adds_subs_sse2(x[32], x[35]);
    btf_16_adds_subs_sse2(x[33], x[34]);
    btf_16_subs_adds_sse2(x[39], x[36]);
    btf_16_subs_adds_sse2(x[38], x[37]);
    btf_16_adds_subs_sse2(x[40], x[43]);
    btf_16_adds_subs_sse2(x[41], x[42]);
    btf_16_subs_adds_sse2(x[47], x[44]);
    btf_16_subs_adds_sse2(x[46], x[45]);
    btf_16_adds_subs_sse2(x[48], x[51]);
    btf_16_adds_subs_sse2(x[49], x[50]);
    btf_16_subs_adds_sse2(x[55], x[52]);
    btf_16_subs_adds_sse2(x[54], x[53]);
    btf_16_adds_subs_sse2(x[56], x[59]);
    btf_16_adds_subs_sse2(x[57], x[58]);
    btf_16_subs_adds_sse2(x[63], x[60]);
    btf_16_subs_adds_sse2(x[62], x[61]);
}
933
/*
 * Stage 6 of the 64-point inverse DCT for x[32..63] only: eight rotation
 * butterflies using the two angles 8 and 40; consecutive index pairs share
 * the same twiddle constants. In place on x[].
 * NOTE(review): cos_bit may be consumed implicitly by btf_16_sse2 — confirm.
 */
static INLINE void idct64_stage6_high32_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding, int8_t cos_bit) {
    const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
    const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
    const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
    const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
    const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
    const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
    btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61], __rounding);
    btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60], __rounding);
    btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59], __rounding);
    btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58], __rounding);
    btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53], __rounding);
    btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52], __rounding);
    btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51], __rounding);
    btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50], __rounding);
}
951
/*
 * Stage 6 of the 64-point inverse DCT for x[16..63]: add/sub folding of the
 * two x[16..31] groups of eight, then the rotation half delegated to
 * idct64_stage6_high32_sse2() for x[32..63]. In place on x[].
 */
static INLINE void idct64_stage6_high48_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding, int8_t cos_bit) {
    btf_16_adds_subs_sse2(x[16], x[19]);
    btf_16_adds_subs_sse2(x[17], x[18]);
    btf_16_subs_adds_sse2(x[23], x[20]);
    btf_16_subs_adds_sse2(x[22], x[21]);
    btf_16_adds_subs_sse2(x[24], x[27]);
    btf_16_adds_subs_sse2(x[25], x[26]);
    btf_16_subs_adds_sse2(x[31], x[28]);
    btf_16_subs_adds_sse2(x[30], x[29]);
    idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);
}
964
/*
 * Stage 7 of the 64-point inverse DCT for x[16..63]: four rotations (angle 16)
 * on x[18..27] pairs, then add/sub folding of x[32..63] in groups of eight.
 * In place on x[].
 * NOTE(review): cos_bit may be consumed implicitly by btf_16_sse2 — confirm.
 */
static INLINE void idct64_stage7_high48_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding, int8_t cos_bit) {
    const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
    const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
    const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
    btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29], __rounding);
    btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28], __rounding);
    btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27], __rounding);
    btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26], __rounding);
    /* Fold 32..47 and 48..63 across their group midpoints. */
    btf_16_adds_subs_sse2(x[32], x[39]);
    btf_16_adds_subs_sse2(x[33], x[38]);
    btf_16_adds_subs_sse2(x[34], x[37]);
    btf_16_adds_subs_sse2(x[35], x[36]);
    btf_16_subs_adds_sse2(x[47], x[40]);
    btf_16_subs_adds_sse2(x[46], x[41]);
    btf_16_subs_adds_sse2(x[45], x[42]);
    btf_16_subs_adds_sse2(x[44], x[43]);
    btf_16_adds_subs_sse2(x[48], x[55]);
    btf_16_adds_subs_sse2(x[49], x[54]);
    btf_16_adds_subs_sse2(x[50], x[53]);
    btf_16_adds_subs_sse2(x[51], x[52]);
    btf_16_subs_adds_sse2(x[63], x[56]);
    btf_16_subs_adds_sse2(x[62], x[57]);
    btf_16_subs_adds_sse2(x[61], x[58]);
    btf_16_subs_adds_sse2(x[60], x[59]);
}
991
/*
 * Stage 8 of the 64-point inverse DCT for x[16..63]: fold x[16..31] across
 * its midpoint with add/sub butterflies, then apply eight angle-16 rotations
 * to the x[36..43] / x[52..59] pairs. In place on x[].
 * NOTE(review): cos_bit may be consumed implicitly by btf_16_sse2 — confirm.
 */
static INLINE void idct64_stage8_high48_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding, int8_t cos_bit) {
    const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
    const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
    const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
    btf_16_adds_subs_sse2(x[16], x[23]);
    btf_16_adds_subs_sse2(x[17], x[22]);
    btf_16_adds_subs_sse2(x[18], x[21]);
    btf_16_adds_subs_sse2(x[19], x[20]);
    btf_16_subs_adds_sse2(x[31], x[24]);
    btf_16_subs_adds_sse2(x[30], x[25]);
    btf_16_subs_adds_sse2(x[29], x[26]);
    btf_16_subs_adds_sse2(x[28], x[27]);
    btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59], __rounding);
    btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58], __rounding);
    btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57], __rounding);
    btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56], __rounding);
    btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55], __rounding);
    btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54], __rounding);
    btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53], __rounding);
    btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52], __rounding);
}
1014
/*
 * Stage 9 of the 64-point inverse DCT: fold x[0..15] across its midpoint,
 * rotate x[20..27] pairs by 32 (half-turn weights), and fold x[32..63] in
 * two groups of sixteen. In place on x[].
 * NOTE(review): cos_bit may be consumed implicitly by btf_16_sse2 — confirm.
 */
static INLINE void idct64_stage9_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding,
                                      int8_t cos_bit) {
    const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
    const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    btf_16_adds_subs_sse2(x[0], x[15]);
    btf_16_adds_subs_sse2(x[1], x[14]);
    btf_16_adds_subs_sse2(x[2], x[13]);
    btf_16_adds_subs_sse2(x[3], x[12]);
    btf_16_adds_subs_sse2(x[4], x[11]);
    btf_16_adds_subs_sse2(x[5], x[10]);
    btf_16_adds_subs_sse2(x[6], x[9]);
    btf_16_adds_subs_sse2(x[7], x[8]);
    btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27], __rounding);
    btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26], __rounding);
    btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25], __rounding);
    btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24], __rounding);
    btf_16_adds_subs_sse2(x[32], x[47]);
    btf_16_adds_subs_sse2(x[33], x[46]);
    btf_16_adds_subs_sse2(x[34], x[45]);
    btf_16_adds_subs_sse2(x[35], x[44]);
    btf_16_adds_subs_sse2(x[36], x[43]);
    btf_16_adds_subs_sse2(x[37], x[42]);
    btf_16_adds_subs_sse2(x[38], x[41]);
    btf_16_adds_subs_sse2(x[39], x[40]);
    btf_16_subs_adds_sse2(x[63], x[48]);
    btf_16_subs_adds_sse2(x[62], x[49]);
    btf_16_subs_adds_sse2(x[61], x[50]);
    btf_16_subs_adds_sse2(x[60], x[51]);
    btf_16_subs_adds_sse2(x[59], x[52]);
    btf_16_subs_adds_sse2(x[58], x[53]);
    btf_16_subs_adds_sse2(x[57], x[54]);
    btf_16_subs_adds_sse2(x[56], x[55]);
}
1048
/*
 * Stage 10 of the 64-point inverse DCT: fold x[0..31] across its midpoint
 * (sixteen add/sub butterflies) and rotate the x[40..47]/x[48..55] pairs by
 * 32 (half-turn weights). In place on x[].
 * NOTE(review): cos_bit may be consumed implicitly by btf_16_sse2 — confirm.
 */
static INLINE void idct64_stage10_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding,
                                       int8_t cos_bit) {
    const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
    const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    btf_16_adds_subs_sse2(x[0], x[31]);
    btf_16_adds_subs_sse2(x[1], x[30]);
    btf_16_adds_subs_sse2(x[2], x[29]);
    btf_16_adds_subs_sse2(x[3], x[28]);
    btf_16_adds_subs_sse2(x[4], x[27]);
    btf_16_adds_subs_sse2(x[5], x[26]);
    btf_16_adds_subs_sse2(x[6], x[25]);
    btf_16_adds_subs_sse2(x[7], x[24]);
    btf_16_adds_subs_sse2(x[8], x[23]);
    btf_16_adds_subs_sse2(x[9], x[22]);
    btf_16_adds_subs_sse2(x[10], x[21]);
    btf_16_adds_subs_sse2(x[11], x[20]);
    btf_16_adds_subs_sse2(x[12], x[19]);
    btf_16_adds_subs_sse2(x[13], x[18]);
    btf_16_adds_subs_sse2(x[14], x[17]);
    btf_16_adds_subs_sse2(x[15], x[16]);
    btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[40], x[55], x[40], x[55], __rounding);
    btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54], __rounding);
    btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53], __rounding);
    btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52], __rounding);
    btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51], __rounding);
    btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50], __rounding);
    btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49], __rounding);
    btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48], __rounding);
}
1078
/*
 * Stage 11 (final) of the 64-point inverse DCT: one add/sub butterfly per
 * mirrored pair (x[i], x[63-i]), writing the sum to output[i] and the
 * difference to output[63-i]. x[] is the stage-10 state; output[] holds the
 * 64 finished result vectors.
 */
static INLINE void idct64_stage11_sse2(__m128i *output, __m128i *x) {
    btf_16_adds_subs_out_sse2(output[0], output[63], x[0], x[63]);
    btf_16_adds_subs_out_sse2(output[1], output[62], x[1], x[62]);
    btf_16_adds_subs_out_sse2(output[2], output[61], x[2], x[61]);
    btf_16_adds_subs_out_sse2(output[3], output[60], x[3], x[60]);
    btf_16_adds_subs_out_sse2(output[4], output[59], x[4], x[59]);
    btf_16_adds_subs_out_sse2(output[5], output[58], x[5], x[58]);
    btf_16_adds_subs_out_sse2(output[6], output[57], x[6], x[57]);
    btf_16_adds_subs_out_sse2(output[7], output[56], x[7], x[56]);
    btf_16_adds_subs_out_sse2(output[8], output[55], x[8], x[55]);
    btf_16_adds_subs_out_sse2(output[9], output[54], x[9], x[54]);
    btf_16_adds_subs_out_sse2(output[10], output[53], x[10], x[53]);
    btf_16_adds_subs_out_sse2(output[11], output[52], x[11], x[52]);
    btf_16_adds_subs_out_sse2(output[12], output[51], x[12], x[51]);
    btf_16_adds_subs_out_sse2(output[13], output[50], x[13], x[50]);
    btf_16_adds_subs_out_sse2(output[14], output[49], x[14], x[49]);
    btf_16_adds_subs_out_sse2(output[15], output[48], x[15], x[48]);
    btf_16_adds_subs_out_sse2(output[16], output[47], x[16], x[47]);
    btf_16_adds_subs_out_sse2(output[17], output[46], x[17], x[46]);
    btf_16_adds_subs_out_sse2(output[18], output[45], x[18], x[45]);
    btf_16_adds_subs_out_sse2(output[19], output[44], x[19], x[44]);
    btf_16_adds_subs_out_sse2(output[20], output[43], x[20], x[43]);
    btf_16_adds_subs_out_sse2(output[21], output[42], x[21], x[42]);
    btf_16_adds_subs_out_sse2(output[22], output[41], x[22], x[41]);
    btf_16_adds_subs_out_sse2(output[23], output[40], x[23], x[40]);
    btf_16_adds_subs_out_sse2(output[24], output[39], x[24], x[39]);
    btf_16_adds_subs_out_sse2(output[25], output[38], x[25], x[38]);
    btf_16_adds_subs_out_sse2(output[26], output[37], x[26], x[37]);
    btf_16_adds_subs_out_sse2(output[27], output[36], x[27], x[36]);
    btf_16_adds_subs_out_sse2(output[28], output[35], x[28], x[35]);
    btf_16_adds_subs_out_sse2(output[29], output[34], x[29], x[34]);
    btf_16_adds_subs_out_sse2(output[30], output[33], x[30], x[33]);
    btf_16_adds_subs_out_sse2(output[31], output[32], x[31], x[32]);
}
1113
/*
 * 64-point inverse DCT specialized for a DC-only input (only input[0] is
 * nonzero). All butterfly stages collapse to a single scale of the DC term,
 * so every output lane receives that scaled value.
 *
 * btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]) computes both x[0]
 * and x[1] from the same source with the same weight cospi[32], so the two
 * presumably hold identical values; the alternating x[0]/x[1] assignment
 * pattern below mirrors the reference implementation — NOTE(review): confirm
 * x[0] == x[1] against the btf_16_ssse3 macro before simplifying.
 */
static void idct64_low1_new_ssse3(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi = cospi_arr(INV_COS_BIT);

    // stage 1: only the DC coefficient survives
    __m128i x[32];
    x[0] = input[0];

    // stage 2
    // stage 3
    // stage 4
    // stage 5
    // stage 6: scale DC by cospi[32] into both x[0] and x[1]
    btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);

    // stage 7
    // stage 8
    // stage 9
    // stage 10
    // stage 11: broadcast the scaled DC to all 64 outputs
    output[0] = x[0];
    output[63] = x[0];
    output[1] = x[1];
    output[62] = x[1];
    output[2] = x[1];
    output[61] = x[1];
    output[3] = x[0];
    output[60] = x[0];
    output[4] = x[0];
    output[59] = x[0];
    output[5] = x[1];
    output[58] = x[1];
    output[6] = x[1];
    output[57] = x[1];
    output[7] = x[0];
    output[56] = x[0];
    output[8] = x[0];
    output[55] = x[0];
    output[9] = x[1];
    output[54] = x[1];
    output[10] = x[1];
    output[53] = x[1];
    output[11] = x[0];
    output[52] = x[0];
    output[12] = x[0];
    output[51] = x[0];
    output[13] = x[1];
    output[50] = x[1];
    output[14] = x[1];
    output[49] = x[1];
    output[15] = x[0];
    output[48] = x[0];
    output[16] = x[0];
    output[47] = x[0];
    output[17] = x[1];
    output[46] = x[1];
    output[18] = x[1];
    output[45] = x[1];
    output[19] = x[0];
    output[44] = x[0];
    output[20] = x[0];
    output[43] = x[0];
    output[21] = x[1];
    output[42] = x[1];
    output[22] = x[1];
    output[41] = x[1];
    output[23] = x[0];
    output[40] = x[0];
    output[24] = x[0];
    output[39] = x[0];
    output[25] = x[1];
    output[38] = x[1];
    output[26] = x[1];
    output[37] = x[1];
    output[27] = x[0];
    output[36] = x[0];
    output[28] = x[0];
    output[35] = x[0];
    output[29] = x[1];
    output[34] = x[1];
    output[30] = x[1];
    output[33] = x[1];
    output[31] = x[0];
    output[32] = x[0];
}
1199
/*
 * 64-point inverse DCT specialized for inputs whose only nonzero
 * coefficients are input[0..7] (low 8 of the zig-zag scan). Early stages are
 * reduced: single-input rotations use btf_16_ssse3, lanes whose butterfly
 * partner is zero are plain copies, and the full stage helpers take over
 * once every lane is populated.
 * cos_bit is unused directly (all twiddles come from cospi_arr), but the
 * btf_16_* macros may reference it implicitly — NOTE(review): confirm.
 */
static void idct64_low8_new_ssse3(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi = cospi_arr(INV_COS_BIT);
    const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
    const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
    const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
    const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
    const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
    const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
    const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
    const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
    const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
    const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
    const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
    const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
    const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
    const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
    const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
    const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

    // stage 1: scatter the 8 live coefficients to their butterfly positions
    __m128i x[64];
    x[0] = input[0];
    x[8] = input[4];
    x[16] = input[2];
    x[24] = input[6];
    x[32] = input[1];
    x[40] = input[5];
    x[48] = input[3];
    x[56] = input[7];

    // stage 2: single-input rotations (the zero partner is dropped)
    btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
    btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
    btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
    btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);

    // stage 3: more single-input rotations; add/sub with a zero partner
    // degenerates to a copy
    btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
    btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
    x[33] = x[32];
    x[38] = x[39];
    x[41] = x[40];
    x[46] = x[47];
    x[49] = x[48];
    x[54] = x[55];
    x[57] = x[56];
    x[62] = x[63];

    // stage 4
    btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
    x[17] = x[16];
    x[22] = x[23];
    x[25] = x[24];
    x[30] = x[31];
    btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62], __rounding);
    btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57], __rounding);
    btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54], __rounding);
    btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49], __rounding);

    // stage 5
    x[9] = x[8];
    x[14] = x[15];
    btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30], __rounding);
    btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25], __rounding);
    x[35] = x[32];
    x[34] = x[33];
    x[36] = x[39];
    x[37] = x[38];
    x[43] = x[40];
    x[42] = x[41];
    x[44] = x[47];
    x[45] = x[46];
    x[51] = x[48];
    x[50] = x[49];
    x[52] = x[55];
    x[53] = x[54];
    x[59] = x[56];
    x[58] = x[57];
    x[60] = x[63];
    x[61] = x[62];

    // stage 6: from here the general stage helpers handle x[32..63]
    btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
    btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14], __rounding);
    x[19] = x[16];
    x[18] = x[17];
    x[20] = x[23];
    x[21] = x[22];
    x[27] = x[24];
    x[26] = x[25];
    x[28] = x[31];
    x[29] = x[30];
    idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);

    // stage 7
    x[3] = x[0];
    x[2] = x[1];
    x[11] = x[8];
    x[10] = x[9];
    x[12] = x[15];
    x[13] = x[14];
    idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);

    // stage 8
    x[7] = x[0];
    x[6] = x[1];
    x[5] = x[2];
    x[4] = x[3];
    btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13], __rounding);
    btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12], __rounding);
    idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);

    // stages 9-11: full-width folds and final output butterflies
    idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
    idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
    idct64_stage11_sse2(output, x);
}
1318
/*
 * 64-point inverse DCT specialized for inputs whose only nonzero
 * coefficients are input[0..15]. Like the low8 variant, early stages use the
 * single-input btf_16_ssse3 form and copy lanes whose butterfly partner is
 * zero; the general stage helpers finish once all lanes are live.
 * cos_bit is unused directly; NOTE(review): the btf_16_* macros may consume
 * it implicitly — confirm before removing.
 */
static void idct64_low16_new_ssse3(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi = cospi_arr(INV_COS_BIT);
    const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
    const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
    const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
    const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

    // stage 1: scatter the 16 live coefficients to their butterfly positions
    __m128i x[64];
    x[0] = input[0];
    x[4] = input[8];
    x[8] = input[4];
    x[12] = input[12];
    x[16] = input[2];
    x[20] = input[10];
    x[24] = input[6];
    x[28] = input[14];
    x[32] = input[1];
    x[36] = input[9];
    x[40] = input[5];
    x[44] = input[13];
    x[48] = input[3];
    x[52] = input[11];
    x[56] = input[7];
    x[60] = input[15];

    // stage 2: single-input rotations (zero partner dropped)
    btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
    btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
    btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
    btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
    btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
    btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
    btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
    btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);

    // stage 3: add/sub with a zero partner degenerates to a copy
    btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
    btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
    btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
    btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
    x[33] = x[32];
    x[34] = x[35];
    x[37] = x[36];
    x[38] = x[39];
    x[41] = x[40];
    x[42] = x[43];
    x[45] = x[44];
    x[46] = x[47];
    x[49] = x[48];
    x[50] = x[51];
    x[53] = x[52];
    x[54] = x[55];
    x[57] = x[56];
    x[58] = x[59];
    x[61] = x[60];
    x[62] = x[63];

    // stage 4: x[32..63] now fully live — hand off to the general helper
    btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
    btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
    x[17] = x[16];
    x[18] = x[19];
    x[21] = x[20];
    x[22] = x[23];
    x[25] = x[24];
    x[26] = x[27];
    x[29] = x[28];
    x[30] = x[31];
    idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);

    // stage 5
    btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
    x[9] = x[8];
    x[10] = x[11];
    x[13] = x[12];
    x[14] = x[15];
    idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);

    // stage 6
    btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
    x[5] = x[4];
    x[6] = x[7];
    btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14], __rounding);
    btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13], __rounding);
    idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);

    // stage 7
    x[3] = x[0];
    x[2] = x[1];
    btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6], __rounding);
    btf_16_adds_subs_sse2(x[8], x[11]);
    btf_16_adds_subs_sse2(x[9], x[10]);
    btf_16_subs_adds_sse2(x[15], x[12]);
    btf_16_subs_adds_sse2(x[14], x[13]);
    idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);

    // stage 8
    btf_16_adds_subs_sse2(x[0], x[7]);
    btf_16_adds_subs_sse2(x[1], x[6]);
    btf_16_adds_subs_sse2(x[2], x[5]);
    btf_16_adds_subs_sse2(x[3], x[4]);
    btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13], __rounding);
    btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12], __rounding);
    idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);

    // stages 9-11: full-width folds and final output butterflies
    idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
    idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
    idct64_stage11_sse2(output, x);
}
1433
/*
 * 64-point inverse DCT specialized for the case where only the 32
 * lowest-frequency coefficients (input[0..31]) may be non-zero.  Because the
 * missing partners of each butterfly are zero, stages 2-4 can use the
 * single-input half-butterfly form (btf_16_ssse3) instead of the full
 * two-input rotation.  Vectors hold 16-bit coefficients; cos_bit is unused —
 * the kernel is hard-wired to INV_COS_BIT.
 */
static void idct64_low32_new_ssse3(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi      = cospi_arr(INV_COS_BIT);
    const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
    const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
    const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
    const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

    // stage 1: input permutation into the even x[] slots only.  The odd
    // slots are left uninitialized here; every odd slot is written by a
    // half-butterfly in stage 2 or 3 before it is ever read.
    __m128i x[64];
    x[0]  = input[0];
    x[2]  = input[16];
    x[4]  = input[8];
    x[6]  = input[24];
    x[8]  = input[4];
    x[10] = input[20];
    x[12] = input[12];
    x[14] = input[28];
    x[16] = input[2];
    x[18] = input[18];
    x[20] = input[10];
    x[22] = input[26];
    x[24] = input[6];
    x[26] = input[22];
    x[28] = input[14];
    x[30] = input[30];
    x[32] = input[1];
    x[34] = input[17];
    x[36] = input[9];
    x[38] = input[25];
    x[40] = input[5];
    x[42] = input[21];
    x[44] = input[13];
    x[46] = input[29];
    x[48] = input[3];
    x[50] = input[19];
    x[52] = input[11];
    x[54] = input[27];
    x[56] = input[7];
    x[58] = input[23];
    x[60] = input[15];
    x[62] = input[31];

    // stage 2: half-butterflies — each surviving odd-frequency input fans
    // out into a (cos, sin) pair, filling the odd x[] slots of this band.
    btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
    btf_16_ssse3(-cospi[33], cospi[31], x[62], x[33], x[62]);
    btf_16_ssse3(cospi[47], cospi[17], x[34], x[34], x[61]);
    btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
    btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
    btf_16_ssse3(-cospi[41], cospi[23], x[58], x[37], x[58]);
    btf_16_ssse3(cospi[39], cospi[25], x[38], x[38], x[57]);
    btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
    btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
    btf_16_ssse3(-cospi[37], cospi[27], x[54], x[41], x[54]);
    btf_16_ssse3(cospi[43], cospi[21], x[42], x[42], x[53]);
    btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
    btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
    btf_16_ssse3(-cospi[45], cospi[19], x[50], x[45], x[50]);
    btf_16_ssse3(cospi[35], cospi[29], x[46], x[46], x[49]);
    btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);

    // stage 3
    btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
    btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
    btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
    btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
    btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
    btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
    btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
    btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
    btf_16_adds_subs_sse2(x[32], x[33]);
    btf_16_subs_adds_sse2(x[35], x[34]);
    btf_16_adds_subs_sse2(x[36], x[37]);
    btf_16_subs_adds_sse2(x[39], x[38]);
    btf_16_adds_subs_sse2(x[40], x[41]);
    btf_16_subs_adds_sse2(x[43], x[42]);
    btf_16_adds_subs_sse2(x[44], x[45]);
    btf_16_subs_adds_sse2(x[47], x[46]);
    btf_16_adds_subs_sse2(x[48], x[49]);
    btf_16_subs_adds_sse2(x[51], x[50]);
    btf_16_adds_subs_sse2(x[52], x[53]);
    btf_16_subs_adds_sse2(x[55], x[54]);
    btf_16_adds_subs_sse2(x[56], x[57]);
    btf_16_subs_adds_sse2(x[59], x[58]);
    btf_16_adds_subs_sse2(x[60], x[61]);
    btf_16_subs_adds_sse2(x[63], x[62]);

    // stage 4: last half-butterfly stage; x[16..63] continue with full
    // add/sub butterflies via the shared idct64 stage helpers.
    btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
    btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
    btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
    btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
    btf_16_adds_subs_sse2(x[16], x[17]);
    btf_16_subs_adds_sse2(x[19], x[18]);
    btf_16_adds_subs_sse2(x[20], x[21]);
    btf_16_subs_adds_sse2(x[23], x[22]);
    btf_16_adds_subs_sse2(x[24], x[25]);
    btf_16_subs_adds_sse2(x[27], x[26]);
    btf_16_adds_subs_sse2(x[28], x[29]);
    btf_16_subs_adds_sse2(x[31], x[30]);
    idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);

    // stage 5
    btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
    btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
    btf_16_adds_subs_sse2(x[8], x[9]);
    btf_16_subs_adds_sse2(x[11], x[10]);
    btf_16_adds_subs_sse2(x[12], x[13]);
    btf_16_subs_adds_sse2(x[15], x[14]);
    idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);

    // stage 6
    btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
    btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
    btf_16_adds_subs_sse2(x[4], x[5]);
    btf_16_subs_adds_sse2(x[7], x[6]);
    btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14], __rounding);
    btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13], __rounding);
    idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);

    // stage 7
    btf_16_adds_subs_sse2(x[0], x[3]);
    btf_16_adds_subs_sse2(x[1], x[2]);
    btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6], __rounding);
    btf_16_adds_subs_sse2(x[8], x[11]);
    btf_16_adds_subs_sse2(x[9], x[10]);
    btf_16_subs_adds_sse2(x[15], x[12]);
    btf_16_subs_adds_sse2(x[14], x[13]);
    idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);

    // stage 8
    btf_16_adds_subs_sse2(x[0], x[7]);
    btf_16_adds_subs_sse2(x[1], x[6]);
    btf_16_adds_subs_sse2(x[2], x[5]);
    btf_16_adds_subs_sse2(x[3], x[4]);
    btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13], __rounding);
    btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12], __rounding);
    idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);

    // stage 9~11: final recombination and write-out to output[0..63].
    idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
    idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
    idct64_stage11_sse2(output, x);
}
1581
/*
 * 4-point inverse ADST, 8 columns at a time (16-bit lanes).
 * Rows 0/2 and 1/3 are interleaved so that one _mm_madd_epi16 computes
 * a*c0 + b*c1 per 32-bit lane; the four outputs are then just two madds,
 * one add, a rounding shift, and a saturating pack each.
 * cos_bit is unused; the kernel is hard-wired to INV_COS_BIT.
 */
static void iadst4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
    const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
    const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
    const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
    const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
    const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
    const __m128i sinpi_0_p03   = pair_set_epi16(0, sinpi[3]);
    const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
    const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
    const __m128i rnd           = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    // Interleave (row0,row2) and (row1,row3) into 16-bit pairs.
    const __m128i ev_lo = _mm_unpacklo_epi16(input[0], input[2]);
    const __m128i ev_hi = _mm_unpackhi_epi16(input[0], input[2]);
    const __m128i od_lo = _mm_unpacklo_epi16(input[1], input[3]);
    const __m128i od_hi = _mm_unpackhi_epi16(input[1], input[3]);

    // Per-output coefficient pairs applied to (row0,row2) and (row1,row3):
    //   out0 = x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2
    //   out1 = x0*sin2 - x2*sin1 + x1*sin3 - x3*sin4
    //   out2 = x0*sin3 - x2*sin3 + x3*sin3
    //   out3 = x0*sin4 + x2*sin2 - x1*sin3 - x3*sin1
    const __m128i coef_ev[4] = {sinpi_p01_p04, sinpi_p02_m01, sinpi_p03_m03, sinpi_p04_p02};
    const __m128i coef_od[4] = {sinpi_p03_p02, sinpi_p03_m04, sinpi_0_p03, sinpi_m03_m01};

    for (int32_t r = 0; r < 4; ++r) {
        __m128i lo = _mm_add_epi32(_mm_madd_epi16(ev_lo, coef_ev[r]),
                                   _mm_madd_epi16(od_lo, coef_od[r]));
        __m128i hi = _mm_add_epi32(_mm_madd_epi16(ev_hi, coef_ev[r]),
                                   _mm_madd_epi16(od_hi, coef_od[r]));
        lo        = _mm_srai_epi32(_mm_add_epi32(lo, rnd), INV_COS_BIT);
        hi        = _mm_srai_epi32(_mm_add_epi32(hi, rnd), INV_COS_BIT);
        output[r] = _mm_packs_epi32(lo, hi);
    }
}
1642
/*
 * 4-point inverse ADST, 4-column variant: only the low four 16-bit lanes of
 * each input vector carry data, so a single unpacklo per row pair suffices.
 * Same math as iadst4_new_sse2; the result is duplicated into both halves of
 * the packed output.  cos_bit is unused (kernel fixed to INV_COS_BIT).
 */
static void iadst4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
    const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
    const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
    const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
    const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
    const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
    const __m128i sinpi_0_p03   = pair_set_epi16(0, sinpi[3]);
    const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
    const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
    const __m128i rnd           = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    // Interleave (row0,row2) and (row1,row3); only low 4 lanes are live.
    const __m128i ev = _mm_unpacklo_epi16(input[0], input[2]);
    const __m128i od = _mm_unpacklo_epi16(input[1], input[3]);

    // Coefficient pairs per output row (see iadst4_new_sse2 for the algebra).
    const __m128i coef_ev[4] = {sinpi_p01_p04, sinpi_p02_m01, sinpi_p03_m03, sinpi_p04_p02};
    const __m128i coef_od[4] = {sinpi_p03_p02, sinpi_p03_m04, sinpi_0_p03, sinpi_m03_m01};

    for (int32_t r = 0; r < 4; ++r) {
        __m128i s = _mm_add_epi32(_mm_madd_epi16(ev, coef_ev[r]),
                                  _mm_madd_epi16(od, coef_od[r]));
        s         = _mm_srai_epi32(_mm_add_epi32(s, rnd), INV_COS_BIT);
        output[r] = _mm_packs_epi32(s, s);
    }
}
1687
/*
 * 8-point inverse ADST for the case where only input[0] is non-zero.
 * Butterflies whose other operand would be zero collapse to plain copies,
 * so only the non-trivial rotations are evaluated.
 * cos_bit is unused; the kernel is hard-wired to INV_COS_BIT.
 */
static void iadst8_low1_new_ssse3(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi     = cospi_arr(INV_COS_BIT);
    const __m128i __zero     = _mm_setzero_si128();
    const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
    const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
    const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);

    // stage 1: only the DC coefficient survives.
    __m128i x[8];
    x[1] = input[0];

    // stage 2: half-butterfly spreads x[1] into the (x[0], x[1]) pair.
    btf_16_ssse3(cospi[60], -cospi[4], x[1], x[0], x[1]);

    // stage 3: add/sub against zero terms degenerates to copies.
    x[4] = x[0];
    x[5] = x[1];

    // stage 4
    btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5], __rounding);

    // stage 5: copies again (partners are zero).
    x[2] = x[0];
    x[3] = x[1];
    x[6] = x[4];
    x[7] = x[5];

    // stage 6
    btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3], __rounding);
    btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7], __rounding);

    // stage 7: output permutation with alternating sign flips.
    output[0] = x[0];
    output[1] = _mm_subs_epi16(__zero, x[4]);
    output[2] = x[6];
    output[3] = _mm_subs_epi16(__zero, x[2]);
    output[4] = x[3];
    output[5] = _mm_subs_epi16(__zero, x[7]);
    output[6] = x[5];
    output[7] = _mm_subs_epi16(__zero, x[1]);
}
1733
/*
 * Full 8-point inverse ADST over 16-bit lanes (8 columns per call).
 * Seven stages: input permutation, four rotation stages interleaved with
 * two add/sub stages, then an output permutation with alternating negation.
 * cos_bit is unused; the kernel is hard-wired to INV_COS_BIT.
 */
static void iadst8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi     = cospi_arr(INV_COS_BIT);
    const __m128i __zero     = _mm_setzero_si128();
    const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
    const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
    const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
    const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
    const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
    const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
    const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
    const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
    const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
    const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
    const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
    const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);

    // stage 1: ADST input permutation (pairs high/low frequencies).
    __m128i x[8];
    x[0] = input[7];
    x[1] = input[0];
    x[2] = input[5];
    x[3] = input[2];
    x[4] = input[3];
    x[5] = input[4];
    x[6] = input[1];
    x[7] = input[6];

    // stage 2: pairwise rotations.
    btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1], __rounding);
    btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3], __rounding);
    btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5], __rounding);
    btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7], __rounding);

    // stage 3: butterfly between halves.
    btf_16_adds_subs_sse2(x[0], x[4]);
    btf_16_adds_subs_sse2(x[1], x[5]);
    btf_16_adds_subs_sse2(x[2], x[6]);
    btf_16_adds_subs_sse2(x[3], x[7]);

    // stage 4
    btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5], __rounding);
    btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7], __rounding);

    // stage 5
    btf_16_adds_subs_sse2(x[0], x[2]);
    btf_16_adds_subs_sse2(x[1], x[3]);
    btf_16_adds_subs_sse2(x[4], x[6]);
    btf_16_adds_subs_sse2(x[5], x[7]);

    // stage 6
    btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3], __rounding);
    btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7], __rounding);

    // stage 7: output permutation; odd outputs are negated.
    output[0] = x[0];
    output[1] = _mm_subs_epi16(__zero, x[4]);
    output[2] = x[6];
    output[3] = _mm_subs_epi16(__zero, x[2]);
    output[4] = x[3];
    output[5] = _mm_subs_epi16(__zero, x[7]);
    output[6] = x[5];
    output[7] = _mm_subs_epi16(__zero, x[1]);
}
1801
/*
 * 8-point inverse ADST, 4-column variant.  Identical dataflow to
 * iadst8_new_sse2 but the rotation stages use btf_16_4p_sse2 — presumably
 * the 4-lane form of the butterfly macro (NOTE(review): confirm it operates
 * on the low four 16-bit lanes only).  cos_bit is unused.
 */
static void iadst8_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi     = cospi_arr(INV_COS_BIT);
    const __m128i __zero     = _mm_setzero_si128();
    const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
    const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
    const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
    const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
    const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
    const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
    const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
    const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
    const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
    const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
    const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
    const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);

    // stage 1: ADST input permutation.
    __m128i x[8];
    x[0] = input[7];
    x[1] = input[0];
    x[2] = input[5];
    x[3] = input[2];
    x[4] = input[3];
    x[5] = input[4];
    x[6] = input[1];
    x[7] = input[6];

    // stage 2
    btf_16_4p_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1], __rounding);
    btf_16_4p_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3], __rounding);
    btf_16_4p_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5], __rounding);
    btf_16_4p_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7], __rounding);

    // stage 3
    btf_16_adds_subs_sse2(x[0], x[4]);
    btf_16_adds_subs_sse2(x[1], x[5]);
    btf_16_adds_subs_sse2(x[2], x[6]);
    btf_16_adds_subs_sse2(x[3], x[7]);

    // stage 4
    btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5], __rounding);
    btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7], __rounding);

    // stage 5
    btf_16_adds_subs_sse2(x[0], x[2]);
    btf_16_adds_subs_sse2(x[1], x[3]);
    btf_16_adds_subs_sse2(x[4], x[6]);
    btf_16_adds_subs_sse2(x[5], x[7]);

    // stage 6
    btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3], __rounding);
    btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7], __rounding);

    // stage 7: output permutation; odd outputs are negated.
    output[0] = x[0];
    output[1] = _mm_subs_epi16(__zero, x[4]);
    output[2] = x[6];
    output[3] = _mm_subs_epi16(__zero, x[2]);
    output[4] = x[3];
    output[5] = _mm_subs_epi16(__zero, x[7]);
    output[6] = x[5];
    output[7] = _mm_subs_epi16(__zero, x[1]);
}
1869
// iadst16 stage 3: saturating add/sub butterflies between the low half
// x[0..7] and the high half x[8..15]; the macro updates both args in place.
static INLINE void iadst16_stage3_ssse3(__m128i *x) {
    btf_16_adds_subs_sse2(x[0], x[8]);
    btf_16_adds_subs_sse2(x[1], x[9]);
    btf_16_adds_subs_sse2(x[2], x[10]);
    btf_16_adds_subs_sse2(x[3], x[11]);
    btf_16_adds_subs_sse2(x[4], x[12]);
    btf_16_adds_subs_sse2(x[5], x[13]);
    btf_16_adds_subs_sse2(x[6], x[14]);
    btf_16_adds_subs_sse2(x[7], x[15]);
}
1880
// iadst16 stage 4: +/-8 and +/-40 rotations on the high half x[8..15].
// cos_bit appears unused here (kept for a uniform stage-helper signature) —
// NOTE(review): confirm the btf_16_sse2 macro does not reference it.
static INLINE void iadst16_stage4_ssse3(__m128i *x, const int32_t *cospi, const __m128i __rounding,
                                        int8_t cos_bit) {
    const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
    const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
    const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
    const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
    const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
    const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
    btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9], __rounding);
    btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11], __rounding);
    btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13], __rounding);
    btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15], __rounding);
}
1894
// iadst16 stage 5: add/sub butterflies with stride 4 inside each half.
static INLINE void iadst16_stage5_ssse3(__m128i *x) {
    btf_16_adds_subs_sse2(x[0], x[4]);
    btf_16_adds_subs_sse2(x[1], x[5]);
    btf_16_adds_subs_sse2(x[2], x[6]);
    btf_16_adds_subs_sse2(x[3], x[7]);
    btf_16_adds_subs_sse2(x[8], x[12]);
    btf_16_adds_subs_sse2(x[9], x[13]);
    btf_16_adds_subs_sse2(x[10], x[14]);
    btf_16_adds_subs_sse2(x[11], x[15]);
}
1905
// iadst16 stage 6: +/-16 rotations on lanes 4..7 and 12..15.
// cos_bit appears unused (uniform signature) — NOTE(review): confirm.
static INLINE void iadst16_stage6_ssse3(__m128i *x, const int32_t *cospi, const __m128i __rounding,
                                        int8_t cos_bit) {
    const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
    const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
    const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
    btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5], __rounding);
    btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7], __rounding);
    btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13], __rounding);
    btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15], __rounding);
}
1916
// iadst16 stage 7: add/sub butterflies with stride 2 inside each quartet.
static INLINE void iadst16_stage7_ssse3(__m128i *x) {
    btf_16_adds_subs_sse2(x[0], x[2]);
    btf_16_adds_subs_sse2(x[1], x[3]);
    btf_16_adds_subs_sse2(x[4], x[6]);
    btf_16_adds_subs_sse2(x[5], x[7]);
    btf_16_adds_subs_sse2(x[8], x[10]);
    btf_16_adds_subs_sse2(x[9], x[11]);
    btf_16_adds_subs_sse2(x[12], x[14]);
    btf_16_adds_subs_sse2(x[13], x[15]);
}
1927
// iadst16 stage 8: final +/-32 (45-degree) rotations on each odd lane pair.
// cos_bit appears unused (uniform signature) — NOTE(review): confirm.
static INLINE void iadst16_stage8_ssse3(__m128i *x, const int32_t *cospi, const __m128i __rounding,
                                        int8_t cos_bit) {
    const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
    btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3], __rounding);
    btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7], __rounding);
    btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11], __rounding);
    btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15], __rounding);
}
1937
// iadst16 stage 9: output permutation; every odd output row is negated
// (saturating subtract from zero).
static INLINE void iadst16_stage9_ssse3(__m128i *output, __m128i *x) {
    const __m128i __zero = _mm_setzero_si128();
    output[0]  = x[0];
    output[1]  = _mm_subs_epi16(__zero, x[8]);
    output[2]  = x[12];
    output[3]  = _mm_subs_epi16(__zero, x[4]);
    output[4]  = x[6];
    output[5]  = _mm_subs_epi16(__zero, x[14]);
    output[6]  = x[10];
    output[7]  = _mm_subs_epi16(__zero, x[2]);
    output[8]  = x[3];
    output[9]  = _mm_subs_epi16(__zero, x[11]);
    output[10] = x[15];
    output[11] = _mm_subs_epi16(__zero, x[7]);
    output[12] = x[5];
    output[13] = _mm_subs_epi16(__zero, x[13]);
    output[14] = x[9];
    output[15] = _mm_subs_epi16(__zero, x[1]);
}
1957
/*
 * 16-point inverse ADST for the case where only input[0] is non-zero.
 * All butterflies with a zero partner collapse to copies; only the
 * non-trivial rotations are computed.  cos_bit is unused.
 */
static void iadst16_low1_new_ssse3(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi     = cospi_arr(INV_COS_BIT);
    const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
    const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
    const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
    const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);

    // stage 1: only the DC coefficient survives.
    __m128i x[16];
    x[1] = input[0];

    // stage 2: half-butterfly spreads x[1] into (x[0], x[1]).
    btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);

    // stage 3: copies (zero partners).
    x[8] = x[0];
    x[9] = x[1];

    // stage 4
    btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9], __rounding);

    // stage 5: copies.
    x[4]  = x[0];
    x[5]  = x[1];
    x[12] = x[8];
    x[13] = x[9];

    // stage 6
    btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5], __rounding);
    btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13], __rounding);

    // stage 7: copies.
    x[2]  = x[0];
    x[3]  = x[1];
    x[6]  = x[4];
    x[7]  = x[5];
    x[10] = x[8];
    x[11] = x[9];
    x[14] = x[12];
    x[15] = x[13];

    // stages 8-9 are shared with the full-input path.
    iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
    iadst16_stage9_ssse3(output, x);
}
2005
/*
 * 16-point inverse ADST for the case where only input[0..7] may be
 * non-zero.  Stage 2 uses half-butterflies (one operand implicitly zero);
 * stages 3-9 reuse the shared full-width stage helpers.  cos_bit is unused.
 */
static void iadst16_low8_new_ssse3(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi     = cospi_arr(INV_COS_BIT);
    const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    // stage 1: input permutation; the remaining x[] slots are implicitly
    // zero and are written by the stage-2 half-butterflies before use.
    __m128i x[16];
    x[1]  = input[0];
    x[3]  = input[2];
    x[5]  = input[4];
    x[7]  = input[6];
    x[8]  = input[7];
    x[10] = input[5];
    x[12] = input[3];
    x[14] = input[1];

    // stage 2
    btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);
    btf_16_ssse3(cospi[54], -cospi[10], x[3], x[2], x[3]);
    btf_16_ssse3(cospi[46], -cospi[18], x[5], x[4], x[5]);
    btf_16_ssse3(cospi[38], -cospi[26], x[7], x[6], x[7]);
    btf_16_ssse3(cospi[34], cospi[30], x[8], x[8], x[9]);
    btf_16_ssse3(cospi[42], cospi[22], x[10], x[10], x[11]);
    btf_16_ssse3(cospi[50], cospi[14], x[12], x[12], x[13]);
    btf_16_ssse3(cospi[58], cospi[6], x[14], x[14], x[15]);

    // stages 3-9
    iadst16_stage3_ssse3(x);
    iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
    iadst16_stage5_ssse3(x);
    iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
    iadst16_stage7_ssse3(x);
    iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
    iadst16_stage9_ssse3(output, x);
}
2041
/*
 * Full 16-point inverse ADST over 16-bit lanes (8 columns per call).
 * Stage 1 permutes the input, stage 2 applies the eight initial rotations,
 * and stages 3-9 are the shared iadst16 stage helpers.  cos_bit is unused;
 * the kernel is hard-wired to INV_COS_BIT.
 */
static void iadst16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi     = cospi_arr(INV_COS_BIT);
    const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
    const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
    const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
    const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
    const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
    const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
    const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
    const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
    const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
    const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
    const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
    const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
    const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
    const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
    const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
    const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
    const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);

    // stage 1: ADST input permutation.
    __m128i x[16];
    x[0]  = input[15];
    x[1]  = input[0];
    x[2]  = input[13];
    x[3]  = input[2];
    x[4]  = input[11];
    x[5]  = input[4];
    x[6]  = input[9];
    x[7]  = input[6];
    x[8]  = input[7];
    x[9]  = input[8];
    x[10] = input[5];
    x[11] = input[10];
    x[12] = input[3];
    x[13] = input[12];
    x[14] = input[1];
    x[15] = input[14];

    // stage 2: initial pairwise rotations.
    btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1], __rounding);
    btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3], __rounding);
    btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5], __rounding);
    btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7], __rounding);
    btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9], __rounding);
    btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11], __rounding);
    btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13], __rounding);
    btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15], __rounding);

    // stage 3~9
    iadst16_stage3_ssse3(x);
    iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
    iadst16_stage5_ssse3(x);
    iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
    iadst16_stage7_ssse3(x);
    iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
    iadst16_stage9_ssse3(output, x);
}
2101
/*
 * 16-point inverse ADST, 4-column variant.  Mirrors iadst16_new_sse2 but
 * every rotation stage uses btf_16_4p_sse2 (the 4-lane butterfly form), so
 * the shared rotation helpers cannot be reused; the pure add/sub and
 * permutation stages (3, 5, 7, 9) are shared.  cos_bit is unused.
 */
static void iadst16_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi     = cospi_arr(INV_COS_BIT);
    const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
    const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
    const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
    const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
    const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
    const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
    const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
    const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
    const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
    const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
    const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
    const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
    const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
    const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
    const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
    const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
    const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
    const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
    const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
    const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
    const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
    const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
    const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
    const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
    const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
    const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);

    // stage 1: ADST input permutation.
    __m128i x[16];
    x[0]  = input[15];
    x[1]  = input[0];
    x[2]  = input[13];
    x[3]  = input[2];
    x[4]  = input[11];
    x[5]  = input[4];
    x[6]  = input[9];
    x[7]  = input[6];
    x[8]  = input[7];
    x[9]  = input[8];
    x[10] = input[5];
    x[11] = input[10];
    x[12] = input[3];
    x[13] = input[12];
    x[14] = input[1];
    x[15] = input[14];

    // stage 2
    btf_16_4p_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1], __rounding);
    btf_16_4p_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3], __rounding);
    btf_16_4p_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5], __rounding);
    btf_16_4p_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7], __rounding);
    btf_16_4p_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9], __rounding);
    btf_16_4p_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11], __rounding);
    btf_16_4p_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13], __rounding);
    btf_16_4p_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15], __rounding);

    // stage 3
    iadst16_stage3_ssse3(x);

    // stage 4 (4-lane form of iadst16_stage4_ssse3)
    btf_16_4p_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9], __rounding);
    btf_16_4p_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11], __rounding);
    btf_16_4p_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13], __rounding);
    btf_16_4p_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15], __rounding);

    // stage 5
    iadst16_stage5_ssse3(x);

    // stage 6 (4-lane form of iadst16_stage6_ssse3)
    btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5], __rounding);
    btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7], __rounding);
    btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13], __rounding);
    btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15], __rounding);

    // stage 7
    iadst16_stage7_ssse3(x);

    // stage 8 (4-lane form of iadst16_stage8_ssse3)
    btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3], __rounding);
    btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7], __rounding);
    btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11], __rounding);
    btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15], __rounding);

    // stage 9
    iadst16_stage9_ssse3(output, x);
}
2194
// Identity (IDTX) 1D inverse transform for 4 rows: scales each coefficient
// by sqrt(2). _mm_mulhrs_epi16 contributes the Q15 fractional part of the
// factor; adding the input back supplies the integral 1.0 contribution.
static void iidentity4_new_ssse3(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    // Fractional part of sqrt(2), expressed as a Q15 constant.
    const int16_t frac_q15 = (new_sqrt2 - (1 << new_sqrt2_bits)) << (15 - new_sqrt2_bits);
    const __m128i frac = _mm_set1_epi16(frac_q15);
    int32_t r = 0;
    do {
        const __m128i fractional = _mm_mulhrs_epi16(input[r], frac);
        output[r] = _mm_adds_epi16(fractional, input[r]);
    } while (++r < 4);
}
2204
// Identity 1D inverse transform for 8 rows: the 8-point identity transform
// scales by exactly 2, so each coefficient is doubled with int16 saturation.
static void iidentity8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    int32_t r = 0;
    while (r < 8) {
        output[r] = _mm_adds_epi16(input[r], input[r]);
        ++r;
    }
}
2209
// Identity 1D inverse transform for 16 rows: scales each coefficient by
// 2*sqrt(2). The Q15 constant carries the fractional part; the integral 2.0
// contribution is produced separately as a saturating src + src.
static void iidentity16_new_ssse3(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int16_t frac_q15 = (2 * (new_sqrt2 - (1 << new_sqrt2_bits)))
        << (15 - new_sqrt2_bits);
    const __m128i frac = _mm_set1_epi16(frac_q15);
    for (int32_t r = 0; r < 16; ++r) {
        const __m128i fractional = _mm_mulhrs_epi16(input[r], frac);
        const __m128i doubled = _mm_adds_epi16(input[r], input[r]);
        output[r] = _mm_adds_epi16(fractional, doubled);
    }
}
2221
lowbd_get_recon_8x8_sse2(const __m128i pred,__m128i res)2222 static INLINE __m128i lowbd_get_recon_8x8_sse2(const __m128i pred, __m128i res) {
2223 const __m128i zero = _mm_setzero_si128();
2224 __m128i x0 = _mm_adds_epi16(res, _mm_unpacklo_epi8(pred, zero));
2225 return _mm_packus_epi16(x0, x0);
2226 }
2227
lowbd_write_buffer_4xn_sse2(__m128i * in,uint8_t * output_r,int32_t stride_r,uint8_t * output_w,int32_t stride_w,int32_t flipud,const int32_t height)2228 static INLINE void lowbd_write_buffer_4xn_sse2(__m128i *in, uint8_t *output_r, int32_t stride_r,
2229 uint8_t *output_w, int32_t stride_w, int32_t flipud,
2230 const int32_t height) {
2231 int32_t j = flipud ? (height - 1) : 0;
2232 const int32_t step = flipud ? -1 : 1;
2233 const __m128i zero = _mm_setzero_si128();
2234 for (int32_t i = 0; i < height; ++i, j += step) {
2235 const __m128i v = _mm_cvtsi32_si128(*((uint32_t *)(output_r + i * stride_r)));
2236 __m128i u = _mm_adds_epi16(in[j], _mm_unpacklo_epi8(v, zero));
2237 u = _mm_packus_epi16(u, zero);
2238 *((uint32_t *)(output_w + i * stride_w)) = _mm_cvtsi128_si32(u);
2239 }
2240 }
2241
lowbd_write_buffer_8xn_sse2(__m128i * in,uint8_t * output_r,int32_t stride_r,uint8_t * output_w,int32_t stride_w,int32_t flipud,const int32_t height)2242 static INLINE void lowbd_write_buffer_8xn_sse2(__m128i *in, uint8_t *output_r, int32_t stride_r,
2243 uint8_t *output_w, int32_t stride_w, int32_t flipud,
2244 const int32_t height) {
2245 int32_t j = flipud ? (height - 1) : 0;
2246 const int32_t step = flipud ? -1 : 1;
2247 for (int32_t i = 0; i < height; ++i, j += step) {
2248 const __m128i v = _mm_loadl_epi64((__m128i const *)(output_r + i * stride_r));
2249 const __m128i u = lowbd_get_recon_8x8_sse2(v, in[j]);
2250 _mm_storel_epi64((__m128i *)(output_w + i * stride_w), u);
2251 }
2252 }
2253
// 1D inverse-transform kernels that process 8 pixels at a time.
// Indexed by [tx size class][itx 1D type: DCT / ADST / identity].
// NULL entries mark combinations that have no kernel at that size.
static const Transform1dSsse3 lowbd_txfm_all_1d_w8_arr[TX_SIZES][ITX_TYPES_1D] = {
    {idct4_new_sse2, iadst4_new_sse2, iidentity4_new_ssse3},
    {idct8_new_sse2, iadst8_new_sse2, iidentity8_new_sse2},
    {idct16_new_sse2, iadst16_new_sse2, iidentity16_new_ssse3},
    {idct32_new_sse2, NULL, NULL},
    {idct64_low32_new_ssse3, NULL, NULL},
};
2262
// Kernels for blocks whose eob falls at DC or within the top-left
// 8x8 / 16x16 / 32x32 corner. Indexed by
// [tx size class][itx 1D type][eob bucket from lowbd_txfm_all_1d_zeros_idx];
// the low1/low8/low16/low32 variants only compute the transform over the
// leading nonzero coefficients. NULL entries are unsupported combinations.
static const Transform1dSsse3 lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
    {
        {idct4_new_sse2, idct4_new_sse2, NULL, NULL},
        {iadst4_new_sse2, iadst4_new_sse2, NULL, NULL},
        {iidentity4_new_ssse3, iidentity4_new_ssse3, NULL, NULL},
    },
    {{idct8_low1_new_ssse3, idct8_new_sse2, NULL, NULL},
     {iadst8_low1_new_ssse3, iadst8_new_sse2, NULL, NULL},
     {iidentity8_new_sse2, iidentity8_new_sse2, NULL, NULL}},
    {
        {idct16_low1_new_ssse3, idct16_low8_new_ssse3, idct16_new_sse2, NULL},
        {iadst16_low1_new_ssse3, iadst16_low8_new_ssse3, iadst16_new_sse2, NULL},
        {NULL, NULL, NULL, NULL},
    },
    {{idct32_low1_new_ssse3, idct32_low8_new_ssse3, idct32_low16_new_ssse3, idct32_new_sse2},
     {NULL, NULL, NULL, NULL},
     {NULL, NULL, NULL, NULL}},
    {{idct64_low1_new_ssse3, idct64_low8_new_ssse3, idct64_low16_new_ssse3, idct64_low32_new_ssse3},
     {NULL, NULL, NULL, NULL},
     {NULL, NULL, NULL, NULL}}};
2285
// 1D inverse-transform kernels that process 4 pixels at a time.
// Used by the narrow block shapes: 4x4, 4x8, 4x16, 8x4, 16x4.
// NULL entries mark sizes with no 4-pixel-wide kernel.
static const Transform1dSsse3 lowbd_txfm_all_1d_w4_arr[TX_SIZES][ITX_TYPES_1D] = {
    {idct4_w4_new_sse2, iadst4_w4_new_sse2, iidentity4_new_ssse3},
    {idct8_w4_new_sse2, iadst8_w4_new_sse2, iidentity8_new_sse2},
    {idct16_w4_new_sse2, iadst16_w4_new_sse2, iidentity16_new_ssse3},
    {NULL, NULL, NULL},
    {NULL, NULL, NULL},
};
2295
// Identity row (horizontal) pass over one 8-column group: loads 32-bit
// coefficients, narrows them to 16 bits, and applies both the width-dependent
// identity scale (new_sqrt2list[txw_idx], Q(new_sqrt2_bits) fixed point) and
// the row-pass round shift in a single madd per half register.
// NOTE(review): `shift` appears to be the (non-positive) row shift from
// eb_inv_txfm_shift_ls — the code shifts right by new_sqrt2_bits - shift;
// confirm against callers.
static INLINE void iidentity_row_8xn_ssse3(__m128i *out, const int32_t *input, int32_t stride,
                                           int32_t shift, int32_t height, int32_t txw_idx,
                                           int32_t rect_type) {
    const int32_t *input_row = input;
    const __m128i scale = _mm_set1_epi16(new_sqrt2list[txw_idx]);
    // Combined rounding term for the scale shift and the row shift.
    const __m128i rounding = _mm_set1_epi16((1 << (new_sqrt2_bits - 1)) +
                                            (1 << (new_sqrt2_bits - shift - 1)));
    const __m128i one = _mm_set1_epi16(1);
    // Interleaving scale with rounding lets madd compute
    // src * scale + 1 * rounding in each 32-bit lane.
    const __m128i scale_rounding = _mm_unpacklo_epi16(scale, rounding);
    if (rect_type != 1 && rect_type != -1) {
        for (int32_t i = 0; i < height; ++i) {
            const __m128i src = load_32bit_to_16bit(input_row);
            input_row += stride;
            __m128i lo = _mm_unpacklo_epi16(src, one);
            __m128i hi = _mm_unpackhi_epi16(src, one);
            lo = _mm_madd_epi16(lo, scale_rounding);
            hi = _mm_madd_epi16(hi, scale_rounding);
            lo = _mm_srai_epi32(lo, new_sqrt2_bits - shift);
            hi = _mm_srai_epi32(hi, new_sqrt2_bits - shift);
            out[i] = _mm_packs_epi32(lo, hi);
        }
    } else {
        // Rectangular blocks get an extra 1/sqrt(2) pre-scale (Q15 mulhrs).
        const __m128i rect_scale = _mm_set1_epi16(new_inv_sqrt2 << (15 - new_sqrt2_bits));
        for (int32_t i = 0; i < height; ++i) {
            __m128i src = load_32bit_to_16bit(input_row);
            src = _mm_mulhrs_epi16(src, rect_scale);
            input_row += stride;
            __m128i lo = _mm_unpacklo_epi16(src, one);
            __m128i hi = _mm_unpackhi_epi16(src, one);
            lo = _mm_madd_epi16(lo, scale_rounding);
            hi = _mm_madd_epi16(hi, scale_rounding);
            lo = _mm_srai_epi32(lo, new_sqrt2_bits - shift);
            hi = _mm_srai_epi32(hi, new_sqrt2_bits - shift);
            out[i] = _mm_packs_epi32(lo, hi);
        }
    }
}
2333
// Identity column (vertical) pass over one 8-wide group: applies the
// height-dependent identity scale (new_sqrt2list[txh_idx]), then the column
// round shift, then adds the result to the prediction from output_r and
// stores the clipped 8-bit reconstruction to output_w.
// NOTE(review): `shift` is used negated (srai by -shift, rounding 1 << (-shift-1)),
// so it is expected to be negative here — confirm against callers.
static INLINE void iidentity_col_8xn_ssse3(uint8_t *output_r, int32_t stride_r, uint8_t *output_w,
                                           int32_t stride_w, __m128i *buf, int32_t shift,
                                           int32_t height, int32_t txh_idx) {
    const __m128i scale = _mm_set1_epi16(new_sqrt2list[txh_idx]);
    const __m128i scale_rounding = _mm_set1_epi16(1 << (new_sqrt2_bits - 1));
    const __m128i shift_rounding = _mm_set1_epi32(1 << (-shift - 1));
    const __m128i one = _mm_set1_epi16(1);
    // madd computes src * scale + 1 * rounding per 32-bit lane.
    const __m128i scale_coeff = _mm_unpacklo_epi16(scale, scale_rounding);
    const __m128i zero = _mm_setzero_si128();
    for (int32_t h = 0; h < height; ++h) {
        __m128i lo = _mm_unpacklo_epi16(buf[h], one);
        __m128i hi = _mm_unpackhi_epi16(buf[h], one);
        lo = _mm_madd_epi16(lo, scale_coeff);
        hi = _mm_madd_epi16(hi, scale_coeff);
        lo = _mm_srai_epi32(lo, new_sqrt2_bits);
        hi = _mm_srai_epi32(hi, new_sqrt2_bits);
        lo = _mm_add_epi32(lo, shift_rounding);
        hi = _mm_add_epi32(hi, shift_rounding);
        lo = _mm_srai_epi32(lo, -shift);
        hi = _mm_srai_epi32(hi, -shift);
        __m128i x = _mm_packs_epi32(lo, hi);

        // Blend the residual into the prediction and clip to 8 bits.
        const __m128i pred = _mm_loadl_epi64((__m128i const *)(output_r));
        x = _mm_adds_epi16(x, _mm_unpacklo_epi8(pred, zero));
        const __m128i u = _mm_packus_epi16(x, x);
        _mm_storel_epi64((__m128i *)(output_w), u);
        output_r += stride_r;
        output_w += stride_w;
    }
}
2364
lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t * input,uint8_t * output_r,int32_t stride_r,uint8_t * output_w,int32_t stride_w,TxSize tx_size)2365 static INLINE void lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input, uint8_t *output_r,
2366 int32_t stride_r, uint8_t *output_w,
2367 int32_t stride_w, TxSize tx_size) {
2368 const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
2369 const int32_t txw_idx = get_txw_idx(tx_size);
2370 const int32_t txh_idx = get_txh_idx(tx_size);
2371 const int32_t txfm_size_col = tx_size_wide[tx_size];
2372 const int32_t txfm_size_row = tx_size_high[tx_size];
2373 const int32_t input_stride = AOMMIN(32, txfm_size_col);
2374 const int32_t row_max = AOMMIN(32, txfm_size_row);
2375 const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
2376 __m128i buf[32];
2377
2378 for (int32_t i = 0; i < (input_stride >> 3); ++i) {
2379 iidentity_row_8xn_ssse3(
2380 buf, input + 8 * i, input_stride, shift[0], row_max, txw_idx, rect_type);
2381 iidentity_col_8xn_ssse3(output_r + 8 * i,
2382 stride_r,
2383 output_w + 8 * i,
2384 stride_w,
2385 buf,
2386 shift[1],
2387 row_max,
2388 txh_idx);
2389 }
2390 }
2391
// 4x4 low-bitdepth 2D inverse transform plus reconstruction.
// Row pass and column pass both use the 4-pixel-wide kernels; tx_size_ and
// eob are ignored because the size is fixed and all 16 coefficients fit in
// one pass.
static void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input, uint8_t *output_r,
                                           int32_t stride_r, uint8_t *output_w, int32_t stride_w,
                                           TxType tx_type, TxSize tx_size_, int32_t eob) {
    (void)tx_size_;
    (void)eob;
    const TxSize tx_size = TX_4X4;
    __m128i txfm_buf[4];
    const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
    const int32_t txw_idx = get_txw_idx(tx_size);
    const int32_t txh_idx = get_txh_idx(tx_size);
    const int32_t cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
    const int32_t cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
    const int32_t txfm_size_col = tx_size_wide[tx_size];
    const int32_t txfm_size_row = tx_size_high[tx_size];

    const Transform1dSsse3 row_txfm = lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
    const Transform1dSsse3 col_txfm = lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];

    int32_t ud_flip, lr_flip;
    get_flip_cfg(tx_type, &ud_flip, &lr_flip);

    // Row pass: coefficients arrive column-major, so transpose first.
    load_buffer_32bit_to_16bit_w4(input, txfm_size_col, txfm_buf, txfm_size_row);
    transpose_16bit_4x4(txfm_buf, txfm_buf);
    row_txfm(txfm_buf, txfm_buf, cos_bit_row);

    // Transpose back for the column pass, honoring a left-right flip.
    if (lr_flip) {
        __m128i flipped[4];
        flip_buf_sse2(txfm_buf, flipped, txfm_size_col);
        transpose_16bit_4x4(flipped, txfm_buf);
    } else {
        transpose_16bit_4x4(txfm_buf, txfm_buf);
    }

    col_txfm(txfm_buf, txfm_buf, cos_bit_col);
    round_shift_16bit_ssse3(txfm_buf, txfm_size_row, shift[1]);
    lowbd_write_buffer_4xn_sse2(
        txfm_buf, output_r, stride_r, output_w, stride_w, ud_flip, txfm_size_row);
}
2426
lowbd_get_recon_16x16_sse2(const __m128i pred,__m128i res0,__m128i res1)2427 static INLINE __m128i lowbd_get_recon_16x16_sse2(const __m128i pred, __m128i res0, __m128i res1) {
2428 const __m128i zero = _mm_setzero_si128();
2429 __m128i x0 = _mm_unpacklo_epi8(pred, zero);
2430 __m128i x1 = _mm_unpackhi_epi8(pred, zero);
2431 x0 = _mm_adds_epi16(res0, x0);
2432 x1 = _mm_adds_epi16(res1, x1);
2433 return _mm_packus_epi16(x0, x1);
2434 }
2435
lowbd_write_buffer_16xn_sse2(__m128i * in,uint8_t * output_r,int32_t stride_r,uint8_t * output_w,int32_t stride_w,int32_t flipud,int32_t height)2436 static INLINE void lowbd_write_buffer_16xn_sse2(__m128i *in, uint8_t *output_r, int32_t stride_r,
2437 uint8_t *output_w, int32_t stride_w, int32_t flipud,
2438 int32_t height) {
2439 int32_t j = flipud ? (height - 1) : 0;
2440 const int32_t step = flipud ? -1 : 1;
2441 for (int32_t i = 0; i < height; ++i, j += step) {
2442 __m128i v = _mm_loadu_si128((__m128i const *)(output_r + i * stride_r));
2443 __m128i u = lowbd_get_recon_16x16_sse2(v, in[j], in[j + height]);
2444 _mm_storeu_si128((__m128i *)(output_w + i * stride_w), u);
2445 }
2446 }
2447
round_shift_ssse3(const __m128i * input,__m128i * output,int32_t size)2448 static INLINE void round_shift_ssse3(const __m128i *input, __m128i *output, int32_t size) {
2449 const __m128i scale = _mm_set1_epi16(new_inv_sqrt2 * 8);
2450 for (int32_t i = 0; i < size; ++i) output[i] = _mm_mulhrs_epi16(input[i], scale);
2451 }
2452
// General 2D inverse transform (no identity stage on either axis) plus
// reconstruction; used for DCT_DCT and the remaining two-sided DCT/ADST
// combinations (see lowbd_inv_txfm2d_add_universe_ssse3). The eob position
// selects reduced-range row/column kernels that skip all-zero
// high-frequency coefficients.
static INLINE void lowbd_inv_txfm2d_add_no_identity_ssse3(const int32_t *input, uint8_t *output_r,
                                                          int32_t stride_r, uint8_t *output_w,
                                                          int32_t stride_w, TxType tx_type,
                                                          TxSize tx_size, int32_t eob) {
    __m128i buf1[64 * 8];
    int32_t eobx, eoby;
    get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
    const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
    const int32_t txw_idx = get_txw_idx(tx_size);
    const int32_t txh_idx = get_txh_idx(tx_size);
    const int32_t cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
    const int32_t cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
    const int32_t txfm_size_col = tx_size_wide[tx_size];
    const int32_t txfm_size_row = tx_size_high[tx_size];
    const int32_t buf_size_w_div8 = txfm_size_col >> 3;
    // Only 8-wide / 8-tall groups that can contain nonzero coefficients
    // (bounded by the eob coordinates) are processed.
    const int32_t buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
    const int32_t buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
    const int32_t input_stride = AOMMIN(32, txfm_size_col);
    const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
    ASSERT(eobx < 32);
    ASSERT(eoby < 32);
    const int32_t fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
    const int32_t fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
    const Transform1dSsse3 row_txfm =
        lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
    const Transform1dSsse3 col_txfm =
        lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

    assert(col_txfm != NULL);
    assert(row_txfm != NULL);
    int32_t ud_flip, lr_flip;
    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
    // Row pass: each iteration handles one band of 8 rows as transposed
    // 8x8 tiles.
    for (int32_t i = 0; i < buf_size_nonzero_h_div8; i++) {
        __m128i buf0[64];
        const int32_t *input_row = input + i * input_stride * 8;
        for (int32_t j = 0; j < buf_size_nonzero_w_div8; ++j) {
            __m128i *buf0_cur = buf0 + j * 8;
            load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8);
            transpose_16bit_8x8(buf0_cur, buf0_cur);
        }
        if (rect_type == 1 || rect_type == -1)
            round_shift_ssse3(buf0, buf0, input_stride); // rect special code
        row_txfm(buf0, buf0, cos_bit_row);
        round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
        __m128i *_buf1 = buf1 + i * 8;
        // Transpose the row-pass output into column-major order in buf1,
        // reversing the 8-wide groups when the transform is left-right
        // flipped.
        if (lr_flip) {
            for (int32_t j = 0; j < buf_size_w_div8; ++j) {
                __m128i temp[8];
                flip_buf_sse2(buf0 + 8 * j, temp, 8);
                transpose_16bit_8x8(temp, _buf1 + txfm_size_row * (buf_size_w_div8 - 1 - j));
            }
        } else {
            for (int32_t j = 0; j < buf_size_w_div8; ++j)
                transpose_16bit_8x8(buf0 + 8 * j, _buf1 + txfm_size_row * j);
        }
    }
    // Column pass over every 8-wide column group.
    for (int32_t i = 0; i < buf_size_w_div8; i++) {
        col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, cos_bit_col);
        round_shift_16bit_ssse3(buf1 + i * txfm_size_row, txfm_size_row, shift[1]);
    }

    // Reconstruction: blend the residuals into the prediction 16 or 8
    // columns at a time (width 4 never reaches this function's writers).
    if (txfm_size_col >= 16) {
        for (int32_t i = 0; i < (txfm_size_col >> 4); i++) {
            lowbd_write_buffer_16xn_sse2(buf1 + i * txfm_size_row * 2,
                                         output_r + 16 * i,
                                         stride_r,
                                         output_w + 16 * i,
                                         stride_w,
                                         ud_flip,
                                         txfm_size_row);
        }
    } else if (txfm_size_col == 8)
        lowbd_write_buffer_8xn_sse2(
            buf1, output_r, stride_r, output_w, stride_w, ud_flip, txfm_size_row);
}
2528
// 2D inverse transform whose horizontal (row) stage is identity; used for
// the V_DCT / V_ADST / V_FLIPADST types (see
// lowbd_inv_txfm2d_add_universe_ssse3). The row pass reduces to the
// identity scaling in iidentity_row_8xn_ssse3; the real transform runs down
// the columns, and the result is blended directly into the prediction.
static INLINE void lowbd_inv_txfm2d_add_h_identity_ssse3(const int32_t *input, uint8_t *output_r,
                                                         int32_t stride_r, uint8_t *output_w,
                                                         int32_t stride_w, TxType tx_type,
                                                         TxSize tx_size, int32_t eob) {
    const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
    int32_t eobx, eoby;
    get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
    const int32_t txw_idx = get_txw_idx(tx_size);
    const int32_t txh_idx = get_txh_idx(tx_size);
    const int32_t cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
    const int32_t txfm_size_col = tx_size_wide[tx_size];
    const int32_t txfm_size_row = tx_size_high[tx_size];
    // Only column groups up to the eob x-coordinate hold nonzero data.
    const int32_t buf_size_w_div8 = (eobx + 8) >> 3;
    const int32_t input_stride = AOMMIN(32, txfm_size_col);
    const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

    const int32_t fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
    ASSERT(fun_idx < 4);
    const Transform1dSsse3 col_txfm =
        lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];

    assert(col_txfm != NULL);

    int32_t ud_flip, lr_flip;
    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
    for (int32_t i = 0; i < buf_size_w_div8; i++) {
        __m128i buf0[64];
        iidentity_row_8xn_ssse3(
            buf0, input + 8 * i, input_stride, shift[0], eoby + 1, txw_idx, rect_type);
        col_txfm(buf0, buf0, cos_bit_col);
        // mulhrs by 1 << (15 + shift[1]) implements the column round shift
        // (shift[1] is non-positive for these sizes — confirm with the
        // eb_inv_txfm_shift_ls tables).
        __m128i mshift = _mm_set1_epi16(1 << (15 + shift[1]));
        int32_t k = ud_flip ? (txfm_size_row - 1) : 0;
        const int32_t step = ud_flip ? -1 : 1;
        uint8_t * out_r = output_r + 8 * i;
        uint8_t * out_w = output_w + 8 * i;
        for (int32_t j = 0; j < txfm_size_row; ++j, k += step) {
            const __m128i v = _mm_loadl_epi64((__m128i const *)(out_r));
            ASSERT(k >= 0);
            __m128i res = _mm_mulhrs_epi16(buf0[k], mshift);
            const __m128i u = lowbd_get_recon_8x8_sse2(v, res);
            _mm_storel_epi64((__m128i *)(out_w), u);
            out_r += stride_r;
            out_w += stride_w;
        }
    }
}
2575
// 2D inverse transform whose vertical (column) stage is identity; used for
// the H_DCT / H_ADST / H_FLIPADST types (see
// lowbd_inv_txfm2d_add_universe_ssse3). The real transform runs along the
// rows; the column pass reduces to the identity scaling plus reconstruction
// in iidentity_col_8xn_ssse3.
static INLINE void lowbd_inv_txfm2d_add_v_identity_ssse3(const int32_t *input, uint8_t *output_r,
                                                         int32_t stride_r, uint8_t *output_w,
                                                         int32_t stride_w, TxType tx_type,
                                                         TxSize tx_size, int32_t eob) {
    __m128i buf1[64];
    int32_t eobx, eoby;
    get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
    const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
    const int32_t txw_idx = get_txw_idx(tx_size);
    const int32_t txh_idx = get_txh_idx(tx_size);
    const int32_t cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
    const int32_t txfm_size_col = tx_size_wide[tx_size];
    const int32_t txfm_size_row = tx_size_high[tx_size];
    const int32_t buf_size_w_div8 = txfm_size_col >> 3;
    // Only row bands up to the eob y-coordinate hold nonzero data.
    const int32_t buf_size_h_div8 = (eoby + 8) >> 3;
    const int32_t input_stride = AOMMIN(32, txfm_size_col);
    const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

    const int32_t fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
    const Transform1dSsse3 row_txfm =
        lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];

    assert(row_txfm != NULL);
    int32_t ud_flip, lr_flip;
    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
    for (int32_t i = 0; i < buf_size_h_div8; i++) {
        __m128i buf0[64];
        const int32_t *input_row = input + i * input_stride * 8;
        // Coefficient columns are capped at 32, i.e. at most four 8-wide tiles.
        for (int32_t j = 0; j < AOMMIN(4, buf_size_w_div8); ++j) {
            __m128i *buf0_cur = buf0 + j * 8;
            load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8);
            transpose_16bit_8x8(buf0_cur, buf0_cur);
        }
        if (rect_type == 1 || rect_type == -1)
            round_shift_ssse3(buf0, buf0, input_stride); // rect special code
        row_txfm(buf0, buf0, cos_bit_row);
        round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
        __m128i *_buf1 = buf1;
        // Transpose back to pixel order, reversing 8-wide groups on a
        // left-right flip.
        if (lr_flip) {
            for (int32_t j = 0; j < buf_size_w_div8; ++j) {
                __m128i temp[8];
                flip_buf_sse2(buf0 + 8 * j, temp, 8);
                transpose_16bit_8x8(temp, _buf1 + 8 * (buf_size_w_div8 - 1 - j));
            }
        } else {
            for (int32_t j = 0; j < buf_size_w_div8; ++j)
                transpose_16bit_8x8(buf0 + 8 * j, _buf1 + 8 * j);
        }

        // Identity column pass + reconstruction for this band of 8 rows.
        for (int32_t j = 0; j < buf_size_w_div8; ++j) {
            iidentity_col_8xn_ssse3(output_r + i * 8 * stride_r + j * 8,
                                    stride_r,
                                    output_w + i * 8 * stride_w + j * 8,
                                    stride_w,
                                    buf1 + j * 8,
                                    shift[1],
                                    8,
                                    txh_idx);
        }
    }
}
2637
2638 // for 32x32,32x64,64x32,64x64,32x8,8x32,16x32,32x16,64x16,16x64
lowbd_inv_txfm2d_add_universe_ssse3(const int32_t * input,uint8_t * output_r,int32_t stride_r,uint8_t * output_w,int32_t stride_w,TxType tx_type,TxSize tx_size,int32_t eob)2639 static INLINE void lowbd_inv_txfm2d_add_universe_ssse3(const int32_t *input, uint8_t *output_r,
2640 int32_t stride_r, uint8_t *output_w,
2641 int32_t stride_w, TxType tx_type,
2642 TxSize tx_size, int32_t eob) {
2643 switch (tx_type) {
2644 case DCT_DCT:
2645 lowbd_inv_txfm2d_add_no_identity_ssse3(
2646 input, output_r, stride_r, output_w, stride_w, tx_type, tx_size, eob);
2647 break;
2648 case IDTX:
2649 lowbd_inv_txfm2d_add_idtx_ssse3(input, output_r, stride_r, output_w, stride_w, tx_size);
2650 break;
2651 case V_DCT:
2652 case V_ADST:
2653 case V_FLIPADST:
2654 lowbd_inv_txfm2d_add_h_identity_ssse3(
2655 input, output_r, stride_r, output_w, stride_w, tx_type, tx_size, eob);
2656 break;
2657 case H_DCT:
2658 case H_ADST:
2659 case H_FLIPADST:
2660 lowbd_inv_txfm2d_add_v_identity_ssse3(
2661 input, output_r, stride_r, output_w, stride_w, tx_type, tx_size, eob);
2662 break;
2663 default:
2664 lowbd_inv_txfm2d_add_no_identity_ssse3(
2665 input, output_r, stride_r, output_w, stride_w, tx_type, tx_size, eob);
2666 break;
2667 }
2668 }
2669
// 4x8 low-bitdepth 2D inverse transform plus reconstruction. Rows use the
// 8-pixel kernels, columns the 4-pixel kernels; the rectangular 1/sqrt(2)
// pre-scale is applied before the row pass.
static void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input, uint8_t *output_r,
                                           int32_t stride_r, uint8_t *output_w, int32_t stride_w,
                                           TxType tx_type, TxSize tx_size_, int32_t eob) {
    (void)tx_size_;
    (void)eob;
    const TxSize tx_size = TX_4X8;
    __m128i txfm_buf[8];
    const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
    const int32_t txw_idx = get_txw_idx(tx_size);
    const int32_t txh_idx = get_txh_idx(tx_size);
    const int32_t cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
    const int32_t cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
    const int32_t txfm_size_col = tx_size_wide[tx_size];
    const int32_t txfm_size_row = tx_size_high[tx_size];

    const Transform1dSsse3 row_txfm = lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
    const Transform1dSsse3 col_txfm = lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];

    int32_t ud_flip, lr_flip;
    get_flip_cfg(tx_type, &ud_flip, &lr_flip);

    load_buffer_32bit_to_16bit_w4(input, txfm_size_col, txfm_buf, txfm_size_row);
    transpose_16bit_4x8(txfm_buf, txfm_buf);
    round_shift_ssse3(txfm_buf, txfm_buf, txfm_size_col); // rect special code
    row_txfm(txfm_buf, txfm_buf, cos_bit_row);
    // No row-pass round shift: shift[0] is 0 for this size.

    if (lr_flip) {
        __m128i flipped[4];
        flip_buf_sse2(txfm_buf, flipped, txfm_size_col);
        transpose_16bit_8x4(flipped, txfm_buf);
    } else {
        transpose_16bit_8x4(txfm_buf, txfm_buf);
    }

    col_txfm(txfm_buf, txfm_buf, cos_bit_col);
    round_shift_16bit_ssse3(txfm_buf, txfm_size_row, shift[1]);
    lowbd_write_buffer_4xn_sse2(
        txfm_buf, output_r, stride_r, output_w, stride_w, ud_flip, txfm_size_row);
}
2706
// 8x4 low-bitdepth 2D inverse transform plus reconstruction. Rows use the
// 4-pixel kernels, columns the 8-pixel kernels; the rectangular 1/sqrt(2)
// pre-scale is applied before the row pass.
static void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input, uint8_t *output_r,
                                           int32_t stride_r, uint8_t *output_w, int32_t stride_w,
                                           TxType tx_type, TxSize tx_size_, int32_t eob) {
    (void)tx_size_;
    (void)eob;
    const TxSize tx_size = TX_8X4;
    __m128i txfm_buf[8];
    const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
    const int32_t txw_idx = get_txw_idx(tx_size);
    const int32_t txh_idx = get_txh_idx(tx_size);
    const int32_t cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
    const int32_t cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
    const int32_t txfm_size_col = tx_size_wide[tx_size];
    const int32_t txfm_size_row = tx_size_high[tx_size];

    const Transform1dSsse3 row_txfm = lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
    const Transform1dSsse3 col_txfm = lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];

    int32_t ud_flip, lr_flip;
    get_flip_cfg(tx_type, &ud_flip, &lr_flip);

    load_buffer_32bit_to_16bit(input, txfm_size_col, txfm_buf, txfm_size_row);
    transpose_16bit_8x4(txfm_buf, txfm_buf);
    round_shift_ssse3(txfm_buf, txfm_buf, txfm_size_col); // rect special code
    row_txfm(txfm_buf, txfm_buf, cos_bit_row);
    // No row-pass round shift: shift[0] is 0 for this size.

    if (lr_flip) {
        __m128i flipped[8];
        flip_buf_sse2(txfm_buf, flipped, txfm_size_col);
        transpose_16bit_4x8(flipped, txfm_buf);
    } else {
        transpose_16bit_4x8(txfm_buf, txfm_buf);
    }

    col_txfm(txfm_buf, txfm_buf, cos_bit_col);
    round_shift_16bit_ssse3(txfm_buf, txfm_size_row, shift[1]);
    lowbd_write_buffer_8xn_sse2(
        txfm_buf, output_r, stride_r, output_w, stride_w, ud_flip, txfm_size_row);
}
2743
// 4x16 low-bitdepth 2D inverse transform plus reconstruction. The 16 rows
// are processed in two batches of 8. The identity row transform is
// special-cased so the sqrt(2) scaling and the row round shift can be fused
// into one madd + arithmetic shift per half register.
static void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input, uint8_t *output_r,
                                            int32_t stride_r, uint8_t *output_w, int32_t stride_w,
                                            TxType tx_type, TxSize tx_size_, int32_t eob) {
    (void)tx_size_;
    (void)eob;
    __m128i buf[16];
    const TxSize tx_size = TX_4X16;
    const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
    const int32_t txw_idx = get_txw_idx(tx_size);
    const int32_t txh_idx = get_txh_idx(tx_size);
    const int32_t cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
    const int32_t cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
    const int32_t txfm_size_col = tx_size_wide[tx_size];
    const int32_t txfm_size_row = tx_size_high[tx_size];

    const Transform1dSsse3 row_txfm = lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
    const Transform1dSsse3 col_txfm = lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];

    int32_t ud_flip, lr_flip;
    get_flip_cfg(tx_type, &ud_flip, &lr_flip);

    const int32_t row_one_loop = 8;
    for (int32_t i = 0; i < 2; ++i) {
        const int32_t *input_cur = input + i * txfm_size_col * row_one_loop;
        __m128i * buf_cur = buf + i * row_one_loop;
        load_buffer_32bit_to_16bit_w4(input_cur, txfm_size_col, buf_cur, row_one_loop);
        transpose_16bit_4x8(buf_cur, buf_cur);
        if (row_txfm == iidentity4_new_ssse3) {
            // Fused identity row pass: madd computes src * new_sqrt2 plus the
            // rounding term 3 << (new_sqrt2_bits - 1) — the combined rounding
            // for the (new_sqrt2_bits + 1)-bit right shift that follows.
            const __m128i scale = pair_set_epi16(new_sqrt2, 3 << (new_sqrt2_bits - 1));
            const __m128i ones = _mm_set1_epi16(1);
            for (int j = 0; j < 4; ++j) {
                const __m128i buf_lo = _mm_unpacklo_epi16(buf_cur[j], ones);
                const __m128i buf_hi = _mm_unpackhi_epi16(buf_cur[j], ones);
                const __m128i buf_32_lo = _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale),
                                                         (new_sqrt2_bits + 1));
                const __m128i buf_32_hi = _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale),
                                                         (new_sqrt2_bits + 1));
                buf_cur[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi);
            }
        } else {
            row_txfm(buf_cur, buf_cur, cos_bit_row);
            round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]);
        }
        // Transpose this batch back for the column pass, honoring lr flips.
        if (lr_flip) {
            __m128i temp[8];
            flip_buf_sse2(buf_cur, temp, txfm_size_col);
            transpose_16bit_8x4(temp, buf_cur);
        } else
            transpose_16bit_8x4(buf_cur, buf_cur);
    }
    col_txfm(buf, buf, cos_bit_col);
    round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
    lowbd_write_buffer_4xn_sse2(
        buf, output_r, stride_r, output_w, stride_w, ud_flip, txfm_size_row);
}
2799
// 16x4 low-bitdepth 2D inverse transform plus reconstruction. The 16
// columns are handled as two 8-wide groups. The identity row transform is
// special-cased so the 2*sqrt(2) scaling and the row round shift can be
// fused into one madd + arithmetic shift per half register.
static void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input, uint8_t *output_r,
                                            int32_t stride_r, uint8_t *output_w, int32_t stride_w,
                                            TxType tx_type, TxSize tx_size_, int32_t eob) {
    (void)tx_size_;
    (void)eob;
    __m128i buf[16];
    const TxSize tx_size = TX_16X4;
    const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
    const int32_t txw_idx = get_txw_idx(tx_size);
    const int32_t txh_idx = get_txh_idx(tx_size);
    const int32_t cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
    const int32_t cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
    const int32_t txfm_size_col = tx_size_wide[tx_size];
    const int32_t txfm_size_row = tx_size_high[tx_size];
    const int32_t buf_size_w_div8 = txfm_size_col >> 3;

    const Transform1dSsse3 row_txfm = lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
    const Transform1dSsse3 col_txfm = lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];

    int32_t ud_flip, lr_flip;
    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
    const int32_t row_one_loop = 8;
    assert(buf_size_w_div8 > 0);
    // Load both 8-wide coefficient groups, transposed for the row pass.
    for (int32_t i = 0; i < buf_size_w_div8; ++i) {
        const int32_t *input_cur = input + i * row_one_loop;
        __m128i * buf_cur = buf + i * row_one_loop;
        load_buffer_32bit_to_16bit(input_cur, txfm_size_col, buf_cur, txfm_size_row);
        transpose_16bit_8x4(buf_cur, buf_cur);
    }
    if (row_txfm == iidentity16_new_ssse3) {
        // Fused identity row pass: madd computes src * (2*new_sqrt2) plus the
        // rounding term 3 << (new_sqrt2_bits - 1) — the combined rounding for
        // the (new_sqrt2_bits + 1)-bit right shift that follows.
        const __m128i scale = pair_set_epi16(2 * new_sqrt2, 3 << (new_sqrt2_bits - 1));
        const __m128i ones = _mm_set1_epi16(1);
        for (int j = 0; j < 16; ++j) {
            const __m128i buf_lo = _mm_unpacklo_epi16(buf[j], ones);
            const __m128i buf_hi = _mm_unpackhi_epi16(buf[j], ones);
            const __m128i buf_32_lo = _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale),
                                                     (new_sqrt2_bits + 1));
            const __m128i buf_32_hi = _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale),
                                                     (new_sqrt2_bits + 1));
            buf[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi);
        }
    } else {
        row_txfm(buf, buf, cos_bit_row);
        round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);
    }
    // Transpose back for the column pass, honoring left-right flips.
    if (lr_flip) {
        __m128i temp[16];
        flip_buf_sse2(buf, temp, 16);
        transpose_16bit_4x8(temp, buf);
        transpose_16bit_4x8(temp + 8, buf + 8);
    } else {
        transpose_16bit_4x8(buf, buf);
        transpose_16bit_4x8(buf + row_one_loop, buf + row_one_loop);
    }
    for (int32_t i = 0; i < buf_size_w_div8; i++) {
        col_txfm(buf + i * row_one_loop, buf + i * row_one_loop, cos_bit_col);
        round_shift_16bit_ssse3(buf + i * row_one_loop, txfm_size_row, shift[1]);
    }
    // Reconstruction, one 8-wide half at a time.
    lowbd_write_buffer_8xn_sse2(buf, output_r, stride_r, output_w, stride_w, ud_flip, 4);
    lowbd_write_buffer_8xn_sse2(
        buf + 8, output_r + 8, stride_r, output_w + 8, stride_w, ud_flip, 4);
}
2862
/*
 * Top-level low-bitdepth 2D inverse transform dispatcher.
 *
 * Routes the small transform sizes (4x4, 4x8, 8x4, 4x16, 16x4) to their
 * size-specialized SSSE3 kernels; every other size is handled by the
 * generic "universe" implementation.
 *
 * input     - dequantized 32-bit transform coefficients
 * output_r  - prediction pixels to read (reconstruction source)
 * output_w  - destination pixels to write
 * tx_type   - 1D row/column transform pair selector
 * tx_size   - transform block dimensions
 * eob       - end-of-block index of the last nonzero coefficient
 */
void svt_av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output_r, int32_t stride_r,
                                        uint8_t *output_w, int32_t stride_w, TxType tx_type,
                                        TxSize tx_size, int32_t eob) {
    if (tx_size == TX_4X4)
        lowbd_inv_txfm2d_add_4x4_ssse3(
            input, output_r, stride_r, output_w, stride_w, tx_type, tx_size, eob);
    else if (tx_size == TX_4X8)
        lowbd_inv_txfm2d_add_4x8_ssse3(
            input, output_r, stride_r, output_w, stride_w, tx_type, tx_size, eob);
    else if (tx_size == TX_8X4)
        lowbd_inv_txfm2d_add_8x4_ssse3(
            input, output_r, stride_r, output_w, stride_w, tx_type, tx_size, eob);
    else if (tx_size == TX_4X16)
        lowbd_inv_txfm2d_add_4x16_ssse3(
            input, output_r, stride_r, output_w, stride_w, tx_type, tx_size, eob);
    else if (tx_size == TX_16X4)
        lowbd_inv_txfm2d_add_16x4_ssse3(
            input, output_r, stride_r, output_w, stride_w, tx_type, tx_size, eob);
    else
        lowbd_inv_txfm2d_add_universe_ssse3(
            input, output_r, stride_r, output_w, stride_w, tx_type, tx_size, eob);
}
2893
/*
 * Inverse transform + reconstruction entry point (SSSE3).
 *
 * Lossless blocks are required to use the bit-exact C reference transform;
 * all other blocks take the SIMD low-bitdepth 2D inverse-transform path,
 * with the transform parameters unpacked from txfm_param.
 */
void svt_av1_inv_txfm_add_ssse3(const TranLow *dqcoeff, uint8_t *dst_r, int32_t stride_r,
                                uint8_t *dst_w, int32_t stride_w, const TxfmParam *txfm_param) {
    if (txfm_param->lossless) {
        svt_av1_inv_txfm_add_c(dqcoeff, dst_r, stride_r, dst_w, stride_w, txfm_param);
        return;
    }
    svt_av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff,
                                       dst_r,
                                       stride_r,
                                       dst_w,
                                       stride_w,
                                       txfm_param->tx_type,
                                       txfm_param->tx_size,
                                       txfm_param->eob);
}
2910