1 /*
2  * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
10  */
11 
12 #include "EbDefinitions.h"
13 #include "common_dsp_rtcd.h"
14 #include <tmmintrin.h>
15 #include "EbInvTransforms.h"
16 #include "av1_inv_txfm_ssse3.h"
17 #include "av1_txfm_sse2.h"
18 #include "transpose_sse2.h"
19 
// Powers of Sqrt2 in Q12 fixed point: Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5.
// 5793 ~= 2^12 * sqrt(2); 2 * 4096 == 2^13 == Sqrt2^2 in Q12, and so on.
static int32_t new_sqrt2list[TX_SIZES] = {5793, 2 * 4096, 2 * 5793, 4 * 4096, 4 * 5793};
22 
// 4-point inverse DCT on packed 16-bit coefficients (eight lanes per register).
// cos_bit is ignored: the twiddle factors are fixed at INV_COS_BIT precision.
static void idct4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi      = cospi_arr(INV_COS_BIT);
    const __m128i  __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    // Packed cosine pairs for the two stage-2 rotations.
    const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
    const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
    const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);

    // stage 1: permute inputs into butterfly order (even pair, then odd pair).
    __m128i v[4] = {input[0], input[2], input[1], input[3]};

    // stage 2: rotations on each pair.
    btf_16_sse2(cospi_p32_p32, cospi_p32_m32, v[0], v[1], v[0], v[1], __rounding);
    btf_16_sse2(cospi_p48_m16, cospi_p16_p48, v[2], v[3], v[2], v[3], __rounding);

    // stage 3: mirrored add/sub butterflies straight into the output.
    btf_16_adds_subs_out_sse2(output[0], output[3], v[0], v[3]);
    btf_16_adds_subs_out_sse2(output[1], output[2], v[1], v[2]);
}
48 
// 4-point inverse DCT, 4-wide variant (uses the btf_16_4p_sse2 kernels).
// Same flow as idct4_new_sse2; cos_bit is ignored (fixed INV_COS_BIT tables).
static void idct4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi      = cospi_arr(INV_COS_BIT);
    const __m128i  __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    // Packed cosine pairs for the two stage-2 rotations.
    const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
    const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
    const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);

    // stage 1: permute inputs into butterfly order (even pair, then odd pair).
    __m128i v[4] = {input[0], input[2], input[1], input[3]};

    // stage 2: rotations on each pair.
    btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, v[0], v[1], v[0], v[1], __rounding);
    btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, v[2], v[3], v[2], v[3], __rounding);

    // stage 3: mirrored add/sub butterflies straight into the output.
    btf_16_adds_subs_out_sse2(output[0], output[3], v[0], v[3]);
    btf_16_adds_subs_out_sse2(output[1], output[2], v[1], v[2]);
}
74 
// 8-point inverse DCT specialized for inputs where only the DC coefficient
// (input[0]) is non-zero. cos_bit is ignored (fixed INV_COS_BIT precision).
static void idct8_low1_new_ssse3(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi = cospi_arr(INV_COS_BIT);

    // stage 1: only input[0] contributes.
    __m128i v[2];
    v[0] = input[0];

    // stages 2-3 collapse to a single rotation yielding the two distinct values.
    btf_16_ssse3(cospi[32], cospi[32], v[0], v[0], v[1]);

    // stages 4-5: the outputs repeat with period four: v0, v1, v1, v0.
    for (int i = 0; i < 8; i += 4) {
        output[i + 0] = v[0];
        output[i + 1] = v[1];
        output[i + 2] = v[1];
        output[i + 3] = v[0];
    }
}
98 
// 8-point inverse DCT on packed 16-bit coefficients (eight lanes per __m128i).
// cos_bit is unused; all rotations use the fixed INV_COS_BIT-precision table.
static void idct8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi      = cospi_arr(INV_COS_BIT);
    // Rounding offset applied inside the butterfly multiplies.
    const __m128i  __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    // Packed 16-bit cosine pairs for the butterfly rotations below.
    const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
    const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
    const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
    const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
    const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
    const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
    const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
    const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

    // stage 1: permute inputs into butterfly order.
    __m128i x[8];
    x[0] = input[0];
    x[1] = input[4];
    x[2] = input[2];
    x[3] = input[6];
    x[4] = input[1];
    x[5] = input[5];
    x[6] = input[3];
    x[7] = input[7];

    // stage 2: rotations on the odd-frequency half x[4..7].
    btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7], __rounding);
    btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6], __rounding);

    // stage 3: rotations on the even half, add/sub butterflies on the odd half.
    btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1], __rounding);
    btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3], __rounding);
    btf_16_adds_subs_sse2(x[4], x[5]);
    btf_16_subs_adds_sse2(x[7], x[6]);

    // stage 4: even-half butterflies plus the +/-cospi32 rotation of x[5]/x[6].
    btf_16_adds_subs_sse2(x[0], x[3]);
    btf_16_adds_subs_sse2(x[1], x[2]);
    btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6], __rounding);

    // stage 5: mirrored butterflies writing directly to the output.
    btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
    btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
    btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
    btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
}
146 
// 8-point inverse DCT, 4-wide variant (uses the btf_16_4p_sse2 kernels).
// Identical stage structure to idct8_new_sse2; cos_bit is unused.
static void idct8_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi      = cospi_arr(INV_COS_BIT);
    // Rounding offset applied inside the butterfly multiplies.
    const __m128i  __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    // Packed 16-bit cosine pairs for the butterfly rotations below.
    const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
    const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
    const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
    const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
    const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
    const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
    const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
    const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

    // stage 1: permute inputs into butterfly order.
    __m128i x[8];
    x[0] = input[0];
    x[1] = input[4];
    x[2] = input[2];
    x[3] = input[6];
    x[4] = input[1];
    x[5] = input[5];
    x[6] = input[3];
    x[7] = input[7];

    // stage 2: rotations on the odd-frequency half x[4..7].
    btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7], __rounding);
    btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6], __rounding);

    // stage 3: rotations on the even half, add/sub butterflies on the odd half.
    btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1], __rounding);
    btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3], __rounding);
    btf_16_adds_subs_sse2(x[4], x[5]);
    btf_16_subs_adds_sse2(x[7], x[6]);

    // stage 4: even-half butterflies plus the +/-cospi32 rotation of x[5]/x[6].
    btf_16_adds_subs_sse2(x[0], x[3]);
    btf_16_adds_subs_sse2(x[1], x[2]);
    btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6], __rounding);

    // stage 5: mirrored butterflies writing directly to the output.
    btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
    btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
    btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
    btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
}
194 
idct16_stage5_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)195 static INLINE void idct16_stage5_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding,
196                                       int8_t cos_bit) {
197     const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
198     const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
199     btf_16_adds_subs_sse2(x[0], x[3]);
200     btf_16_adds_subs_sse2(x[1], x[2]);
201     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6], __rounding);
202     btf_16_adds_subs_sse2(x[8], x[11]);
203     btf_16_adds_subs_sse2(x[9], x[10]);
204     btf_16_subs_adds_sse2(x[15], x[12]);
205     btf_16_subs_adds_sse2(x[14], x[13]);
206 }
207 
idct16_stage6_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)208 static INLINE void idct16_stage6_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding,
209                                       int8_t cos_bit) {
210     const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
211     const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
212     btf_16_adds_subs_sse2(x[0], x[7]);
213     btf_16_adds_subs_sse2(x[1], x[6]);
214     btf_16_adds_subs_sse2(x[2], x[5]);
215     btf_16_adds_subs_sse2(x[3], x[4]);
216     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13], __rounding);
217     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12], __rounding);
218 }
219 
idct16_stage7_sse2(__m128i * output,__m128i * x)220 static INLINE void idct16_stage7_sse2(__m128i *output, __m128i *x) {
221     btf_16_adds_subs_out_sse2(output[0], output[15], x[0], x[15]);
222     btf_16_adds_subs_out_sse2(output[1], output[14], x[1], x[14]);
223     btf_16_adds_subs_out_sse2(output[2], output[13], x[2], x[13]);
224     btf_16_adds_subs_out_sse2(output[3], output[12], x[3], x[12]);
225     btf_16_adds_subs_out_sse2(output[4], output[11], x[4], x[11]);
226     btf_16_adds_subs_out_sse2(output[5], output[10], x[5], x[10]);
227     btf_16_adds_subs_out_sse2(output[6], output[9], x[6], x[9]);
228     btf_16_adds_subs_out_sse2(output[7], output[8], x[7], x[8]);
229 }
230 
// 16-point inverse DCT specialized for inputs where only the DC coefficient
// (input[0]) is non-zero. cos_bit is ignored (fixed INV_COS_BIT precision).
static void idct16_low1_new_ssse3(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi = cospi_arr(INV_COS_BIT);

    // stage 1: only input[0] contributes.
    __m128i v[2];
    v[0] = input[0];

    // stages 2-4 collapse to a single rotation yielding the two distinct values.
    btf_16_ssse3(cospi[32], cospi[32], v[0], v[0], v[1]);

    // stages 5-7: the outputs repeat with period four: v0, v1, v1, v0.
    for (int i = 0; i < 16; i += 4) {
        output[i + 0] = v[0];
        output[i + 1] = v[1];
        output[i + 2] = v[1];
        output[i + 3] = v[0];
    }
}
264 
// 16-point inverse DCT specialized for inputs with only the first 8
// coefficients non-zero (SSSE3). Rotations whose second operand is known to
// be zero use the one-input btf_16_ssse3 form; cos_bit is unused.
static void idct16_low8_new_ssse3(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi         = cospi_arr(INV_COS_BIT);
    const __m128i  __rounding    = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
    const __m128i  cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
    const __m128i  cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
    const __m128i  cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);

    // stage 1: scatter the 8 non-zero inputs into butterfly order.
    __m128i x[16];
    x[0]  = input[0];
    x[2]  = input[4];
    x[4]  = input[2];
    x[6]  = input[6];
    x[8]  = input[1];
    x[10] = input[5];
    x[12] = input[3];
    x[14] = input[7];

    // stage 2: one-input rotations on the odd-frequency terms.
    btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
    btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
    btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
    btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);

    // stage 3: rotations on x[4..7], butterflies on x[8..15].
    btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
    btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
    btf_16_adds_subs_sse2(x[8], x[9]);
    btf_16_subs_adds_sse2(x[11], x[10]);
    btf_16_adds_subs_sse2(x[12], x[13]);
    btf_16_subs_adds_sse2(x[15], x[14]);

    // stage 4: even-half rotations, odd-half butterflies and two rotations.
    btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
    btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
    btf_16_adds_subs_sse2(x[4], x[5]);
    btf_16_subs_adds_sse2(x[7], x[6]);
    btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14], __rounding);
    btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13], __rounding);

    // stages 5-7 are shared with the full-input path.
    idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
    idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
    idct16_stage7_sse2(output, x);
}
310 
// Full 16-point inverse DCT on packed 16-bit coefficients (eight lanes per
// __m128i). cos_bit is unused; all rotations use the fixed INV_COS_BIT table.
static void idct16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi      = cospi_arr(INV_COS_BIT);
    // Rounding offset applied inside the butterfly multiplies.
    const __m128i  __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    // Packed 16-bit cosine pairs for the butterfly rotations below.
    const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
    const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
    const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
    const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
    const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
    const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
    const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
    const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
    const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
    const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
    const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
    const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
    const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
    const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
    const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
    const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
    const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
    const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);

    // stage 1: permute inputs into butterfly order.
    __m128i x[16];
    x[0]  = input[0];
    x[1]  = input[8];
    x[2]  = input[4];
    x[3]  = input[12];
    x[4]  = input[2];
    x[5]  = input[10];
    x[6]  = input[6];
    x[7]  = input[14];
    x[8]  = input[1];
    x[9]  = input[9];
    x[10] = input[5];
    x[11] = input[13];
    x[12] = input[3];
    x[13] = input[11];
    x[14] = input[7];
    x[15] = input[15];

    // stage 2: rotations on the odd-frequency terms x[8..15].
    btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15], __rounding);
    btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14], __rounding);
    btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13], __rounding);
    btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12], __rounding);

    // stage 3: rotations on x[4..7], butterflies on x[8..15].
    btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7], __rounding);
    btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6], __rounding);
    btf_16_adds_subs_sse2(x[8], x[9]);
    btf_16_subs_adds_sse2(x[11], x[10]);
    btf_16_adds_subs_sse2(x[12], x[13]);
    btf_16_subs_adds_sse2(x[15], x[14]);

    // stage 4: even-half rotations, odd-half butterflies and two rotations.
    btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1], __rounding);
    btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3], __rounding);
    btf_16_adds_subs_sse2(x[4], x[5]);
    btf_16_subs_adds_sse2(x[7], x[6]);
    btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14], __rounding);
    btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13], __rounding);

    // stage 5~7
    idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
    idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
    idct16_stage7_sse2(output, x);
}
382 
// Full 16-point inverse DCT, 4-wide variant (uses the btf_16_4p_sse2
// kernels). Stages 5 and 6 are inlined here rather than calling the shared
// helpers, because the helpers use the 8-lane btf_16_sse2 rotation.
static void idct16_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi      = cospi_arr(INV_COS_BIT);
    // Rounding offset applied inside the butterfly multiplies.
    const __m128i  __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    // Packed 16-bit cosine pairs for the butterfly rotations below.
    const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
    const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
    const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
    const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
    const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
    const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
    const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
    const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
    const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
    const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
    const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
    const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
    const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
    const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
    const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
    const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
    const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
    const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
    const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

    // stage 1: permute inputs into butterfly order.
    __m128i x[16];
    x[0]  = input[0];
    x[1]  = input[8];
    x[2]  = input[4];
    x[3]  = input[12];
    x[4]  = input[2];
    x[5]  = input[10];
    x[6]  = input[6];
    x[7]  = input[14];
    x[8]  = input[1];
    x[9]  = input[9];
    x[10] = input[5];
    x[11] = input[13];
    x[12] = input[3];
    x[13] = input[11];
    x[14] = input[7];
    x[15] = input[15];

    // stage 2: rotations on the odd-frequency terms x[8..15].
    btf_16_4p_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15], __rounding);
    btf_16_4p_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14], __rounding);
    btf_16_4p_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13], __rounding);
    btf_16_4p_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12], __rounding);

    // stage 3: rotations on x[4..7], butterflies on x[8..15].
    btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7], __rounding);
    btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6], __rounding);
    btf_16_adds_subs_sse2(x[8], x[9]);
    btf_16_subs_adds_sse2(x[11], x[10]);
    btf_16_adds_subs_sse2(x[12], x[13]);
    btf_16_subs_adds_sse2(x[15], x[14]);

    // stage 4: even-half rotations, odd-half butterflies and two rotations.
    btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1], __rounding);
    btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3], __rounding);
    btf_16_adds_subs_sse2(x[4], x[5]);
    btf_16_subs_adds_sse2(x[7], x[6]);
    btf_16_4p_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14], __rounding);
    btf_16_4p_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13], __rounding);

    // stage 5: same structure as idct16_stage5_sse2 but with 4-wide rotations.
    btf_16_adds_subs_sse2(x[0], x[3]);
    btf_16_adds_subs_sse2(x[1], x[2]);
    btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6], __rounding);
    btf_16_adds_subs_sse2(x[8], x[11]);
    btf_16_adds_subs_sse2(x[9], x[10]);
    btf_16_subs_adds_sse2(x[15], x[12]);
    btf_16_subs_adds_sse2(x[14], x[13]);

    // stage 6: same structure as idct16_stage6_sse2 but with 4-wide rotations.
    btf_16_adds_subs_sse2(x[0], x[7]);
    btf_16_adds_subs_sse2(x[1], x[6]);
    btf_16_adds_subs_sse2(x[2], x[5]);
    btf_16_adds_subs_sse2(x[3], x[4]);
    btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13], __rounding);
    btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12], __rounding);

    // stage 7: shared output butterflies (add/sub only, lane-width agnostic).
    idct16_stage7_sse2(output, x);
}
470 
idct32_high16_stage3_sse2(__m128i * x)471 static INLINE void idct32_high16_stage3_sse2(__m128i *x) {
472     btf_16_adds_subs_sse2(x[16], x[17]);
473     btf_16_subs_adds_sse2(x[19], x[18]);
474     btf_16_adds_subs_sse2(x[20], x[21]);
475     btf_16_subs_adds_sse2(x[23], x[22]);
476     btf_16_adds_subs_sse2(x[24], x[25]);
477     btf_16_subs_adds_sse2(x[27], x[26]);
478     btf_16_adds_subs_sse2(x[28], x[29]);
479     btf_16_subs_adds_sse2(x[31], x[30]);
480 }
481 
idct32_high16_stage4_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)482 static INLINE void idct32_high16_stage4_sse2(__m128i *x, const int32_t *cospi,
483                                              const __m128i __rounding, int8_t cos_bit) {
484     const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
485     const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
486     const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
487     const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
488     const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
489     const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
490     btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30], __rounding);
491     btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29], __rounding);
492     btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26], __rounding);
493     btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25], __rounding);
494 }
495 
idct32_high24_stage5_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)496 static INLINE void idct32_high24_stage5_sse2(__m128i *x, const int32_t *cospi,
497                                              const __m128i __rounding, int8_t cos_bit) {
498     const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
499     const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
500     const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
501     btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14], __rounding);
502     btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13], __rounding);
503     btf_16_adds_subs_sse2(x[16], x[19]);
504     btf_16_adds_subs_sse2(x[17], x[18]);
505     btf_16_subs_adds_sse2(x[23], x[20]);
506     btf_16_subs_adds_sse2(x[22], x[21]);
507     btf_16_adds_subs_sse2(x[24], x[27]);
508     btf_16_adds_subs_sse2(x[25], x[26]);
509     btf_16_subs_adds_sse2(x[31], x[28]);
510     btf_16_subs_adds_sse2(x[30], x[29]);
511 }
512 
idct32_high28_stage6_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)513 static INLINE void idct32_high28_stage6_sse2(__m128i *x, const int32_t *cospi,
514                                              const __m128i __rounding, int8_t cos_bit) {
515     const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
516     const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
517     const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
518     const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
519     const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
520     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6], __rounding);
521     btf_16_adds_subs_sse2(x[8], x[11]);
522     btf_16_adds_subs_sse2(x[9], x[10]);
523     btf_16_subs_adds_sse2(x[15], x[12]);
524     btf_16_subs_adds_sse2(x[14], x[13]);
525     btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29], __rounding);
526     btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28], __rounding);
527     btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27], __rounding);
528     btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26], __rounding);
529 }
530 
idct32_stage7_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)531 static INLINE void idct32_stage7_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding,
532                                       int8_t cos_bit) {
533     const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
534     const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
535     btf_16_adds_subs_sse2(x[0], x[7]);
536     btf_16_adds_subs_sse2(x[1], x[6]);
537     btf_16_adds_subs_sse2(x[2], x[5]);
538     btf_16_adds_subs_sse2(x[3], x[4]);
539     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13], __rounding);
540     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12], __rounding);
541     btf_16_adds_subs_sse2(x[16], x[23]);
542     btf_16_adds_subs_sse2(x[17], x[22]);
543     btf_16_adds_subs_sse2(x[18], x[21]);
544     btf_16_adds_subs_sse2(x[19], x[20]);
545     btf_16_subs_adds_sse2(x[31], x[24]);
546     btf_16_subs_adds_sse2(x[30], x[25]);
547     btf_16_subs_adds_sse2(x[29], x[26]);
548     btf_16_subs_adds_sse2(x[28], x[27]);
549 }
550 
idct32_stage8_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)551 static INLINE void idct32_stage8_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding,
552                                       int8_t cos_bit) {
553     const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
554     const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
555     btf_16_adds_subs_sse2(x[0], x[15]);
556     btf_16_adds_subs_sse2(x[1], x[14]);
557     btf_16_adds_subs_sse2(x[2], x[13]);
558     btf_16_adds_subs_sse2(x[3], x[12]);
559     btf_16_adds_subs_sse2(x[4], x[11]);
560     btf_16_adds_subs_sse2(x[5], x[10]);
561     btf_16_adds_subs_sse2(x[6], x[9]);
562     btf_16_adds_subs_sse2(x[7], x[8]);
563     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27], __rounding);
564     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26], __rounding);
565     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25], __rounding);
566     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24], __rounding);
567 }
568 
idct32_stage9_sse2(__m128i * output,__m128i * x)569 static INLINE void idct32_stage9_sse2(__m128i *output, __m128i *x) {
570     btf_16_adds_subs_out_sse2(output[0], output[31], x[0], x[31]);
571     btf_16_adds_subs_out_sse2(output[1], output[30], x[1], x[30]);
572     btf_16_adds_subs_out_sse2(output[2], output[29], x[2], x[29]);
573     btf_16_adds_subs_out_sse2(output[3], output[28], x[3], x[28]);
574     btf_16_adds_subs_out_sse2(output[4], output[27], x[4], x[27]);
575     btf_16_adds_subs_out_sse2(output[5], output[26], x[5], x[26]);
576     btf_16_adds_subs_out_sse2(output[6], output[25], x[6], x[25]);
577     btf_16_adds_subs_out_sse2(output[7], output[24], x[7], x[24]);
578     btf_16_adds_subs_out_sse2(output[8], output[23], x[8], x[23]);
579     btf_16_adds_subs_out_sse2(output[9], output[22], x[9], x[22]);
580     btf_16_adds_subs_out_sse2(output[10], output[21], x[10], x[21]);
581     btf_16_adds_subs_out_sse2(output[11], output[20], x[11], x[20]);
582     btf_16_adds_subs_out_sse2(output[12], output[19], x[12], x[19]);
583     btf_16_adds_subs_out_sse2(output[13], output[18], x[13], x[18]);
584     btf_16_adds_subs_out_sse2(output[14], output[17], x[14], x[17]);
585     btf_16_adds_subs_out_sse2(output[15], output[16], x[15], x[16]);
586 }
587 
// 32-point inverse DCT specialized for inputs where only the DC coefficient
// (input[0]) is non-zero. cos_bit is ignored (fixed INV_COS_BIT precision).
static void idct32_low1_new_ssse3(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi = cospi_arr(INV_COS_BIT);

    // stage 1: only input[0] contributes.
    __m128i v[2];
    v[0] = input[0];

    // stages 2-5 collapse to a single rotation yielding the two distinct values.
    btf_16_ssse3(cospi[32], cospi[32], v[0], v[0], v[1]);

    // stages 6-9: the outputs repeat with period four: v0, v1, v1, v0.
    for (int i = 0; i < 32; i += 4) {
        output[i + 0] = v[0];
        output[i + 1] = v[1];
        output[i + 2] = v[1];
        output[i + 3] = v[0];
    }
}
639 
// 32-point inverse DCT specialized for inputs with only the first 8
// coefficients non-zero (SSSE3). Rotations whose second operand is known to
// be zero use the one-input btf_16_ssse3 form, and butterflies against a
// zero operand degenerate to plain copies. cos_bit is unused.
static void idct32_low8_new_ssse3(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi      = cospi_arr(INV_COS_BIT);
    const __m128i  __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    // stage 1: scatter the 8 non-zero inputs into butterfly order.
    __m128i x[32];
    x[0]  = input[0];
    x[4]  = input[4];
    x[8]  = input[2];
    x[12] = input[6];
    x[16] = input[1];
    x[20] = input[5];
    x[24] = input[3];
    x[28] = input[7];

    // stage 2: one-input rotations on the odd-frequency terms.
    btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
    btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
    btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
    btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);

    // stage 3: the zero-operand butterflies reduce to copies.
    btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
    btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
    x[17] = x[16];
    x[18] = x[19];
    x[21] = x[20];
    x[22] = x[23];
    x[25] = x[24];
    x[26] = x[27];
    x[29] = x[28];
    x[30] = x[31];

    // stage 4: copies in the lower half, shared rotations in the upper half.
    btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
    x[9]  = x[8];
    x[10] = x[11];
    x[13] = x[12];
    x[14] = x[15];
    idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);

    // stage 5
    btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
    x[5] = x[4];
    x[6] = x[7];
    idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
    // stage 6
    x[3] = x[0];
    x[2] = x[1];
    idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);

    // stages 7-9 are shared with the full-input path.
    idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
    idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
    idct32_stage9_sse2(output, x);
}
696 
/*
 * 32-point inverse DCT (SSSE3), specialized for the case where only the first
 * 16 input coefficients are present (the "low16" variant): stage 1 loads just
 * input[0..15], and the first-stage rotations use the single-input
 * btf_16_ssse3 half-butterfly form.
 *
 * input:   coefficient vectors (8 x int16 per __m128i), indexed per the
 *          stage-1 pattern below.
 * output:  32 result vectors.
 * cos_bit: unused; the fixed INV_COS_BIT precision is used throughout.
 */
static void idct32_low16_new_ssse3(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi      = cospi_arr(INV_COS_BIT);
    const __m128i  __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    // stage 1: scatter the 16 provided coefficients to their butterfly slots.
    __m128i x[32];
    x[0]  = input[0];
    x[2]  = input[8];
    x[4]  = input[4];
    x[6]  = input[12];
    x[8]  = input[2];
    x[10] = input[10];
    x[12] = input[6];
    x[14] = input[14];
    x[16] = input[1];
    x[18] = input[9];
    x[20] = input[5];
    x[22] = input[13];
    x[24] = input[3];
    x[26] = input[11];
    x[28] = input[7];
    x[30] = input[15];

    // stage 2: single-input half-butterflies.
    btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
    btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
    btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
    btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
    btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
    btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
    btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
    btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);

    // stage 3
    btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
    btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
    btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
    btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
    idct32_high16_stage3_sse2(x);

    // stage 4
    btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
    btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
    btf_16_adds_subs_sse2(x[8], x[9]);
    btf_16_subs_adds_sse2(x[11], x[10]);
    btf_16_adds_subs_sse2(x[12], x[13]);
    btf_16_subs_adds_sse2(x[15], x[14]);
    idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);

    // stage 5
    btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
    btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
    btf_16_adds_subs_sse2(x[4], x[5]);
    btf_16_subs_adds_sse2(x[7], x[6]);
    idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);

    // stage 6
    btf_16_adds_subs_sse2(x[0], x[3]);
    btf_16_adds_subs_sse2(x[1], x[2]);
    idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);

    // stages 7..9 are shared with the other idct32 variants.
    idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
    idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
    idct32_stage9_sse2(output, x);
}
762 
/*
 * Full 32-point inverse DCT (SSE2): all 32 input coefficients participate.
 *
 * Stage 1 loads the inputs in bit-reversed order (x[i] = input[bitrev5(i)],
 * visible in the index pattern 0,16,8,24,4,20,...), which puts each
 * coefficient in the slot its butterfly network expects.
 *
 * input:   32 coefficient vectors (8 x int16 per __m128i).
 * output:  32 result vectors.
 * cos_bit: unused; the fixed INV_COS_BIT precision is used throughout.
 */
static void idct32_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi      = cospi_arr(INV_COS_BIT);
    const __m128i  __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    // Packed 16-bit (cos, sin) rotation coefficient pairs for btf_16_sse2.
    const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
    const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
    const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
    const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
    const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
    const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
    const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
    const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
    const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
    const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
    const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
    const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
    const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
    const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
    const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
    const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
    const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
    const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
    const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
    const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
    const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
    const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
    const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
    const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
    const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
    const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
    const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
    const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
    const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
    const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
    const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);

    // stage 1: bit-reversed input permutation.
    __m128i x[32];
    x[0]  = input[0];
    x[1]  = input[16];
    x[2]  = input[8];
    x[3]  = input[24];
    x[4]  = input[4];
    x[5]  = input[20];
    x[6]  = input[12];
    x[7]  = input[28];
    x[8]  = input[2];
    x[9]  = input[18];
    x[10] = input[10];
    x[11] = input[26];
    x[12] = input[6];
    x[13] = input[22];
    x[14] = input[14];
    x[15] = input[30];
    x[16] = input[1];
    x[17] = input[17];
    x[18] = input[9];
    x[19] = input[25];
    x[20] = input[5];
    x[21] = input[21];
    x[22] = input[13];
    x[23] = input[29];
    x[24] = input[3];
    x[25] = input[19];
    x[26] = input[11];
    x[27] = input[27];
    x[28] = input[7];
    x[29] = input[23];
    x[30] = input[15];
    x[31] = input[31];

    // stage 2: rotations of the odd-indexed (x[16..31]) pairs.
    btf_16_sse2(cospi_p62_m02, cospi_p02_p62, x[16], x[31], x[16], x[31], __rounding);
    btf_16_sse2(cospi_p30_m34, cospi_p34_p30, x[17], x[30], x[17], x[30], __rounding);
    btf_16_sse2(cospi_p46_m18, cospi_p18_p46, x[18], x[29], x[18], x[29], __rounding);
    btf_16_sse2(cospi_p14_m50, cospi_p50_p14, x[19], x[28], x[19], x[28], __rounding);
    btf_16_sse2(cospi_p54_m10, cospi_p10_p54, x[20], x[27], x[20], x[27], __rounding);
    btf_16_sse2(cospi_p22_m42, cospi_p42_p22, x[21], x[26], x[21], x[26], __rounding);
    btf_16_sse2(cospi_p38_m26, cospi_p26_p38, x[22], x[25], x[22], x[25], __rounding);
    btf_16_sse2(cospi_p06_m58, cospi_p58_p06, x[23], x[24], x[23], x[24], __rounding);

    // stage 3
    btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15], __rounding);
    btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14], __rounding);
    btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13], __rounding);
    btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12], __rounding);
    idct32_high16_stage3_sse2(x);

    // stage 4
    btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7], __rounding);
    btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6], __rounding);
    btf_16_adds_subs_sse2(x[8], x[9]);
    btf_16_subs_adds_sse2(x[11], x[10]);
    btf_16_adds_subs_sse2(x[12], x[13]);
    btf_16_subs_adds_sse2(x[15], x[14]);
    idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);

    // stage 5
    btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1], __rounding);
    btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3], __rounding);
    btf_16_adds_subs_sse2(x[4], x[5]);
    btf_16_adds_subs_sse2(x[7], x[6]);
    idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);

    // stage 6
    btf_16_adds_subs_sse2(x[0], x[3]);
    btf_16_adds_subs_sse2(x[1], x[2]);
    idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);

    // stage 7~8
    idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
    idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
    idct32_stage9_sse2(output, x);
}
879 
/*
 * Stage 4 of the 64-point inverse DCT, high half: rotates eight mirrored
 * pairs within x[33..62] in place. All other elements are untouched.
 * cos_bit is accepted for signature uniformity with the other stage helpers.
 */
static INLINE void idct64_stage4_high32_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding, int8_t cos_bit) {
    // Packed 16-bit (cos, sin) rotation coefficient pairs.
    const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
    const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
    const __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]);
    const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
    const __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
    const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
    const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
    const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
    const __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]);
    const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
    const __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
    const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
    btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62], __rounding);
    btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61], __rounding);
    btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58], __rounding);
    btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57], __rounding);
    btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54], __rounding);
    btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53], __rounding);
    btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50], __rounding);
    btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49], __rounding);
}
903 
/*
 * Stage 5 of the 64-point inverse DCT, high 48 elements: four cospi
 * rotations on mirrored pairs inside x[17..30], followed by in-place
 * add/sub butterflies over x[32..63].
 * cos_bit is accepted for signature uniformity with the other stage helpers.
 */
static INLINE void idct64_stage5_high48_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding, int8_t cos_bit) {
    const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
    const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
    const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
    const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
    const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
    const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
    btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30], __rounding);
    btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29], __rounding);
    btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26], __rounding);
    btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25], __rounding);
    // Add/sub butterflies on x[32..63], in groups of four per octave.
    btf_16_adds_subs_sse2(x[32], x[35]);
    btf_16_adds_subs_sse2(x[33], x[34]);
    btf_16_subs_adds_sse2(x[39], x[36]);
    btf_16_subs_adds_sse2(x[38], x[37]);
    btf_16_adds_subs_sse2(x[40], x[43]);
    btf_16_adds_subs_sse2(x[41], x[42]);
    btf_16_subs_adds_sse2(x[47], x[44]);
    btf_16_subs_adds_sse2(x[46], x[45]);
    btf_16_adds_subs_sse2(x[48], x[51]);
    btf_16_adds_subs_sse2(x[49], x[50]);
    btf_16_subs_adds_sse2(x[55], x[52]);
    btf_16_subs_adds_sse2(x[54], x[53]);
    btf_16_adds_subs_sse2(x[56], x[59]);
    btf_16_adds_subs_sse2(x[57], x[58]);
    btf_16_subs_adds_sse2(x[63], x[60]);
    btf_16_subs_adds_sse2(x[62], x[61]);
}
933 
/*
 * Stage 6 of the 64-point inverse DCT, high 32 elements: eight cospi
 * rotations on mirrored pairs within x[34..61]. Elements outside those
 * pairs are untouched.
 * cos_bit is accepted for signature uniformity with the other stage helpers.
 */
static INLINE void idct64_stage6_high32_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding, int8_t cos_bit) {
    const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
    const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
    const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
    const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
    const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
    const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
    btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61], __rounding);
    btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60], __rounding);
    btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59], __rounding);
    btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58], __rounding);
    btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53], __rounding);
    btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52], __rounding);
    btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51], __rounding);
    btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50], __rounding);
}
951 
/*
 * Stage 6 of the 64-point inverse DCT, high 48 elements: add/sub butterflies
 * over x[16..31], then the rotations on x[34..61] via
 * idct64_stage6_high32_sse2().
 */
static INLINE void idct64_stage6_high48_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding, int8_t cos_bit) {
    btf_16_adds_subs_sse2(x[16], x[19]);
    btf_16_adds_subs_sse2(x[17], x[18]);
    btf_16_subs_adds_sse2(x[23], x[20]);
    btf_16_subs_adds_sse2(x[22], x[21]);
    btf_16_adds_subs_sse2(x[24], x[27]);
    btf_16_adds_subs_sse2(x[25], x[26]);
    btf_16_subs_adds_sse2(x[31], x[28]);
    btf_16_subs_adds_sse2(x[30], x[29]);
    idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);
}
964 
idct64_stage7_high48_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)965 static INLINE void idct64_stage7_high48_sse2(__m128i *x, const int32_t *cospi,
966                                              const __m128i __rounding, int8_t cos_bit) {
967     const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
968     const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
969     const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
970     btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29], __rounding);
971     btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28], __rounding);
972     btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27], __rounding);
973     btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26], __rounding);
974     btf_16_adds_subs_sse2(x[32], x[39]);
975     btf_16_adds_subs_sse2(x[33], x[38]);
976     btf_16_adds_subs_sse2(x[34], x[37]);
977     btf_16_adds_subs_sse2(x[35], x[36]);
978     btf_16_subs_adds_sse2(x[47], x[40]);
979     btf_16_subs_adds_sse2(x[46], x[41]);
980     btf_16_subs_adds_sse2(x[45], x[42]);
981     btf_16_subs_adds_sse2(x[44], x[43]);
982     btf_16_adds_subs_sse2(x[48], x[55]);
983     btf_16_adds_subs_sse2(x[49], x[54]);
984     btf_16_adds_subs_sse2(x[50], x[53]);
985     btf_16_adds_subs_sse2(x[51], x[52]);
986     btf_16_subs_adds_sse2(x[63], x[56]);
987     btf_16_subs_adds_sse2(x[62], x[57]);
988     btf_16_subs_adds_sse2(x[61], x[58]);
989     btf_16_subs_adds_sse2(x[60], x[59]);
990 }
991 
idct64_stage8_high48_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)992 static INLINE void idct64_stage8_high48_sse2(__m128i *x, const int32_t *cospi,
993                                              const __m128i __rounding, int8_t cos_bit) {
994     const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
995     const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
996     const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
997     btf_16_adds_subs_sse2(x[16], x[23]);
998     btf_16_adds_subs_sse2(x[17], x[22]);
999     btf_16_adds_subs_sse2(x[18], x[21]);
1000     btf_16_adds_subs_sse2(x[19], x[20]);
1001     btf_16_subs_adds_sse2(x[31], x[24]);
1002     btf_16_subs_adds_sse2(x[30], x[25]);
1003     btf_16_subs_adds_sse2(x[29], x[26]);
1004     btf_16_subs_adds_sse2(x[28], x[27]);
1005     btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59], __rounding);
1006     btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58], __rounding);
1007     btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57], __rounding);
1008     btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56], __rounding);
1009     btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55], __rounding);
1010     btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54], __rounding);
1011     btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53], __rounding);
1012     btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52], __rounding);
1013 }
1014 
idct64_stage9_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)1015 static INLINE void idct64_stage9_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding,
1016                                       int8_t cos_bit) {
1017     const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
1018     const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
1019     btf_16_adds_subs_sse2(x[0], x[15]);
1020     btf_16_adds_subs_sse2(x[1], x[14]);
1021     btf_16_adds_subs_sse2(x[2], x[13]);
1022     btf_16_adds_subs_sse2(x[3], x[12]);
1023     btf_16_adds_subs_sse2(x[4], x[11]);
1024     btf_16_adds_subs_sse2(x[5], x[10]);
1025     btf_16_adds_subs_sse2(x[6], x[9]);
1026     btf_16_adds_subs_sse2(x[7], x[8]);
1027     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27], __rounding);
1028     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26], __rounding);
1029     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25], __rounding);
1030     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24], __rounding);
1031     btf_16_adds_subs_sse2(x[32], x[47]);
1032     btf_16_adds_subs_sse2(x[33], x[46]);
1033     btf_16_adds_subs_sse2(x[34], x[45]);
1034     btf_16_adds_subs_sse2(x[35], x[44]);
1035     btf_16_adds_subs_sse2(x[36], x[43]);
1036     btf_16_adds_subs_sse2(x[37], x[42]);
1037     btf_16_adds_subs_sse2(x[38], x[41]);
1038     btf_16_adds_subs_sse2(x[39], x[40]);
1039     btf_16_subs_adds_sse2(x[63], x[48]);
1040     btf_16_subs_adds_sse2(x[62], x[49]);
1041     btf_16_subs_adds_sse2(x[61], x[50]);
1042     btf_16_subs_adds_sse2(x[60], x[51]);
1043     btf_16_subs_adds_sse2(x[59], x[52]);
1044     btf_16_subs_adds_sse2(x[58], x[53]);
1045     btf_16_subs_adds_sse2(x[57], x[54]);
1046     btf_16_subs_adds_sse2(x[56], x[55]);
1047 }
1048 
idct64_stage10_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)1049 static INLINE void idct64_stage10_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding,
1050                                        int8_t cos_bit) {
1051     const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
1052     const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
1053     btf_16_adds_subs_sse2(x[0], x[31]);
1054     btf_16_adds_subs_sse2(x[1], x[30]);
1055     btf_16_adds_subs_sse2(x[2], x[29]);
1056     btf_16_adds_subs_sse2(x[3], x[28]);
1057     btf_16_adds_subs_sse2(x[4], x[27]);
1058     btf_16_adds_subs_sse2(x[5], x[26]);
1059     btf_16_adds_subs_sse2(x[6], x[25]);
1060     btf_16_adds_subs_sse2(x[7], x[24]);
1061     btf_16_adds_subs_sse2(x[8], x[23]);
1062     btf_16_adds_subs_sse2(x[9], x[22]);
1063     btf_16_adds_subs_sse2(x[10], x[21]);
1064     btf_16_adds_subs_sse2(x[11], x[20]);
1065     btf_16_adds_subs_sse2(x[12], x[19]);
1066     btf_16_adds_subs_sse2(x[13], x[18]);
1067     btf_16_adds_subs_sse2(x[14], x[17]);
1068     btf_16_adds_subs_sse2(x[15], x[16]);
1069     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[40], x[55], x[40], x[55], __rounding);
1070     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54], __rounding);
1071     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53], __rounding);
1072     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52], __rounding);
1073     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51], __rounding);
1074     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50], __rounding);
1075     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49], __rounding);
1076     btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48], __rounding);
1077 }
1078 
idct64_stage11_sse2(__m128i * output,__m128i * x)1079 static INLINE void idct64_stage11_sse2(__m128i *output, __m128i *x) {
1080     btf_16_adds_subs_out_sse2(output[0], output[63], x[0], x[63]);
1081     btf_16_adds_subs_out_sse2(output[1], output[62], x[1], x[62]);
1082     btf_16_adds_subs_out_sse2(output[2], output[61], x[2], x[61]);
1083     btf_16_adds_subs_out_sse2(output[3], output[60], x[3], x[60]);
1084     btf_16_adds_subs_out_sse2(output[4], output[59], x[4], x[59]);
1085     btf_16_adds_subs_out_sse2(output[5], output[58], x[5], x[58]);
1086     btf_16_adds_subs_out_sse2(output[6], output[57], x[6], x[57]);
1087     btf_16_adds_subs_out_sse2(output[7], output[56], x[7], x[56]);
1088     btf_16_adds_subs_out_sse2(output[8], output[55], x[8], x[55]);
1089     btf_16_adds_subs_out_sse2(output[9], output[54], x[9], x[54]);
1090     btf_16_adds_subs_out_sse2(output[10], output[53], x[10], x[53]);
1091     btf_16_adds_subs_out_sse2(output[11], output[52], x[11], x[52]);
1092     btf_16_adds_subs_out_sse2(output[12], output[51], x[12], x[51]);
1093     btf_16_adds_subs_out_sse2(output[13], output[50], x[13], x[50]);
1094     btf_16_adds_subs_out_sse2(output[14], output[49], x[14], x[49]);
1095     btf_16_adds_subs_out_sse2(output[15], output[48], x[15], x[48]);
1096     btf_16_adds_subs_out_sse2(output[16], output[47], x[16], x[47]);
1097     btf_16_adds_subs_out_sse2(output[17], output[46], x[17], x[46]);
1098     btf_16_adds_subs_out_sse2(output[18], output[45], x[18], x[45]);
1099     btf_16_adds_subs_out_sse2(output[19], output[44], x[19], x[44]);
1100     btf_16_adds_subs_out_sse2(output[20], output[43], x[20], x[43]);
1101     btf_16_adds_subs_out_sse2(output[21], output[42], x[21], x[42]);
1102     btf_16_adds_subs_out_sse2(output[22], output[41], x[22], x[41]);
1103     btf_16_adds_subs_out_sse2(output[23], output[40], x[23], x[40]);
1104     btf_16_adds_subs_out_sse2(output[24], output[39], x[24], x[39]);
1105     btf_16_adds_subs_out_sse2(output[25], output[38], x[25], x[38]);
1106     btf_16_adds_subs_out_sse2(output[26], output[37], x[26], x[37]);
1107     btf_16_adds_subs_out_sse2(output[27], output[36], x[27], x[36]);
1108     btf_16_adds_subs_out_sse2(output[28], output[35], x[28], x[35]);
1109     btf_16_adds_subs_out_sse2(output[29], output[34], x[29], x[34]);
1110     btf_16_adds_subs_out_sse2(output[30], output[33], x[30], x[33]);
1111     btf_16_adds_subs_out_sse2(output[31], output[32], x[31], x[32]);
1112 }
1113 
/*
 * 64-point inverse DCT specialized for a DC-only input: all stages except
 * the single cospi[32] scaling collapse, and the two butterfly results are
 * fanned out to all 64 outputs.
 * cos_bit is unused; the fixed INV_COS_BIT precision is used.
 */
static void idct64_low1_new_ssse3(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi = cospi_arr(INV_COS_BIT);

    // stage 1: only the DC coefficient is loaded.
    __m128i x[2];
    x[0] = input[0];

    // stages 2-5 are identities for a DC-only input.
    // stage 6: one cospi[32]/cospi[32] half-butterfly produces the two values
    // the remaining stages replicate.
    btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);

    // stages 7-11 reduce to stores. This loop reproduces the original
    // unrolled store order exactly: output[i] and output[63 - i] receive
    // x[0] when (i & 3) is 0 or 3, and x[1] otherwise.
    for (int i = 0; i < 32; ++i) {
        const int     sel = i & 3;
        const __m128i v   = (sel == 0 || sel == 3) ? x[0] : x[1];
        output[i]      = v;
        output[63 - i] = v;
    }
}
1199 
/*
 * 64-point inverse DCT (SSSE3), specialized for the case where only the first
 * 8 input coefficients are present (the "low8" variant): stage 1 loads just
 * input[0..7], the early rotations use the single-input btf_16_ssse3 form,
 * and several butterfly stages degenerate to plain copies.
 *
 * input:   coefficient vectors (8 x int16 per __m128i), indexed per the
 *          stage-1 pattern below.
 * output:  64 result vectors.
 * cos_bit: unused; the fixed INV_COS_BIT precision is used throughout.
 */
static void idct64_low8_new_ssse3(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int32_t *cospi         = cospi_arr(INV_COS_BIT);
    const __m128i  __rounding    = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
    // Packed 16-bit (cos, sin) rotation coefficient pairs.
    const __m128i  cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
    const __m128i  cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
    const __m128i  cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
    const __m128i  cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
    const __m128i  cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
    const __m128i  cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
    const __m128i  cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
    const __m128i  cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
    const __m128i  cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
    const __m128i  cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
    const __m128i  cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
    const __m128i  cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
    const __m128i  cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    const __m128i  cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
    const __m128i  cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
    const __m128i  cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

    // stage 1: scatter the 8 provided coefficients to their butterfly slots.
    __m128i x[64];
    x[0]  = input[0];
    x[8]  = input[4];
    x[16] = input[2];
    x[24] = input[6];
    x[32] = input[1];
    x[40] = input[5];
    x[48] = input[3];
    x[56] = input[7];

    // stage 2: single-input half-butterflies.
    btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
    btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
    btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
    btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);

    // stage 3: the add/sub pairs reduce to copies in this variant.
    btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
    btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
    x[33] = x[32];
    x[38] = x[39];
    x[41] = x[40];
    x[46] = x[47];
    x[49] = x[48];
    x[54] = x[55];
    x[57] = x[56];
    x[62] = x[63];

    // stage 4
    btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
    x[17] = x[16];
    x[22] = x[23];
    x[25] = x[24];
    x[30] = x[31];
    btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62], __rounding);
    btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57], __rounding);
    btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54], __rounding);
    btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49], __rounding);

    // stage 5
    x[9]  = x[8];
    x[14] = x[15];
    btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30], __rounding);
    btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25], __rounding);
    x[35] = x[32];
    x[34] = x[33];
    x[36] = x[39];
    x[37] = x[38];
    x[43] = x[40];
    x[42] = x[41];
    x[44] = x[47];
    x[45] = x[46];
    x[51] = x[48];
    x[50] = x[49];
    x[52] = x[55];
    x[53] = x[54];
    x[59] = x[56];
    x[58] = x[57];
    x[60] = x[63];
    x[61] = x[62];

    // stage 6
    btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
    btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14], __rounding);
    x[19] = x[16];
    x[18] = x[17];
    x[20] = x[23];
    x[21] = x[22];
    x[27] = x[24];
    x[26] = x[25];
    x[28] = x[31];
    x[29] = x[30];
    idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);

    // stage 7
    x[3]  = x[0];
    x[2]  = x[1];
    x[11] = x[8];
    x[10] = x[9];
    x[12] = x[15];
    x[13] = x[14];
    idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);

    // stage 8
    x[7] = x[0];
    x[6] = x[1];
    x[5] = x[2];
    x[4] = x[3];
    btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13], __rounding);
    btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12], __rounding);
    idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);

    // stages 9..11 are shared with the other idct64 variants.
    idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
    idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
    idct64_stage11_sse2(output, x);
}
1318 
// 64-point inverse DCT fast path for when only the first 16 input
// coefficients (per column group) are non-zero. Stages that would combine a
// value with a known-zero partner collapse into single-coefficient rotations
// (btf_16_ssse3) or plain copies; shared idct64_stage*_sse2 helpers finish
// the remaining lanes and the final stages.
static void idct64_low16_new_ssse3(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit; // fixed INV_COS_BIT precision is used throughout
    const int32_t *cospi      = cospi_arr(INV_COS_BIT);
    const __m128i  __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
    const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
    const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
    const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

    // stage 1: bit-reversal style reordering of the 16 live inputs into the
    // 64-entry working array; untouched entries are defined by later stages.
    __m128i x[64];
    x[0]  = input[0];
    x[4]  = input[8];
    x[8]  = input[4];
    x[12] = input[12];
    x[16] = input[2];
    x[20] = input[10];
    x[24] = input[6];
    x[28] = input[14];
    x[32] = input[1];
    x[36] = input[9];
    x[40] = input[5];
    x[44] = input[13];
    x[48] = input[3];
    x[52] = input[11];
    x[56] = input[7];
    x[60] = input[15];

    // stage 2: single-input butterflies (the zero partner is implied)
    btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
    btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
    btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
    btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
    btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
    btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
    btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
    btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);

    // stage 3: the add/sub step degenerates to copies because one operand
    // of each pair is still zero
    btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
    btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
    btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
    btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
    x[33] = x[32];
    x[34] = x[35];
    x[37] = x[36];
    x[38] = x[39];
    x[41] = x[40];
    x[42] = x[43];
    x[45] = x[44];
    x[46] = x[47];
    x[49] = x[48];
    x[50] = x[51];
    x[53] = x[52];
    x[54] = x[55];
    x[57] = x[56];
    x[58] = x[59];
    x[61] = x[60];
    x[62] = x[63];

    // stage 4
    btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
    btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
    x[17] = x[16];
    x[18] = x[19];
    x[21] = x[20];
    x[22] = x[23];
    x[25] = x[24];
    x[26] = x[27];
    x[29] = x[28];
    x[30] = x[31];
    idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);

    // stage 5
    btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
    x[9]  = x[8];
    x[10] = x[11];
    x[13] = x[12];
    x[14] = x[15];
    idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);

    // stage 6
    btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
    x[5] = x[4];
    x[6] = x[7];
    btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14], __rounding);
    btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13], __rounding);
    idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);

    // stage 7: from here on the data is dense; use full add/sub butterflies
    x[3] = x[0];
    x[2] = x[1];
    btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6], __rounding);
    btf_16_adds_subs_sse2(x[8], x[11]);
    btf_16_adds_subs_sse2(x[9], x[10]);
    btf_16_subs_adds_sse2(x[15], x[12]);
    btf_16_subs_adds_sse2(x[14], x[13]);
    idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);

    // stage 8
    btf_16_adds_subs_sse2(x[0], x[7]);
    btf_16_adds_subs_sse2(x[1], x[6]);
    btf_16_adds_subs_sse2(x[2], x[5]);
    btf_16_adds_subs_sse2(x[3], x[4]);
    btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13], __rounding);
    btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12], __rounding);
    idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);

    // stages 9-11 are identical for all idct64 variants
    idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
    idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
    idct64_stage11_sse2(output, x);
}
1433 
// 64-point inverse DCT fast path for when only the first 32 input
// coefficients are non-zero. Stage 2 and 3 rotations still use the
// single-input btf_16_ssse3 form (the odd-indexed partners are zero), but
// the subsequent add/sub butterflies are fully populated, unlike the
// low-8/low-16 variants which replace them with copies.
static void idct64_low32_new_ssse3(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit; // fixed INV_COS_BIT precision is used throughout
    const int32_t *cospi      = cospi_arr(INV_COS_BIT);
    const __m128i  __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
    const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
    const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
    const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

    // stage 1: bit-reversal style reordering of the 32 live inputs
    __m128i x[64];
    x[0]  = input[0];
    x[2]  = input[16];
    x[4]  = input[8];
    x[6]  = input[24];
    x[8]  = input[4];
    x[10] = input[20];
    x[12] = input[12];
    x[14] = input[28];
    x[16] = input[2];
    x[18] = input[18];
    x[20] = input[10];
    x[22] = input[26];
    x[24] = input[6];
    x[26] = input[22];
    x[28] = input[14];
    x[30] = input[30];
    x[32] = input[1];
    x[34] = input[17];
    x[36] = input[9];
    x[38] = input[25];
    x[40] = input[5];
    x[42] = input[21];
    x[44] = input[13];
    x[46] = input[29];
    x[48] = input[3];
    x[50] = input[19];
    x[52] = input[11];
    x[54] = input[27];
    x[56] = input[7];
    x[58] = input[23];
    x[60] = input[15];
    x[62] = input[31];

    // stage 2: single-input rotations (zero partner implied)
    btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
    btf_16_ssse3(-cospi[33], cospi[31], x[62], x[33], x[62]);
    btf_16_ssse3(cospi[47], cospi[17], x[34], x[34], x[61]);
    btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
    btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
    btf_16_ssse3(-cospi[41], cospi[23], x[58], x[37], x[58]);
    btf_16_ssse3(cospi[39], cospi[25], x[38], x[38], x[57]);
    btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
    btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
    btf_16_ssse3(-cospi[37], cospi[27], x[54], x[41], x[54]);
    btf_16_ssse3(cospi[43], cospi[21], x[42], x[42], x[53]);
    btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
    btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
    btf_16_ssse3(-cospi[45], cospi[19], x[50], x[45], x[50]);
    btf_16_ssse3(cospi[35], cospi[29], x[46], x[46], x[49]);
    btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);

    // stage 3
    btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
    btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
    btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
    btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
    btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
    btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
    btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
    btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
    btf_16_adds_subs_sse2(x[32], x[33]);
    btf_16_subs_adds_sse2(x[35], x[34]);
    btf_16_adds_subs_sse2(x[36], x[37]);
    btf_16_subs_adds_sse2(x[39], x[38]);
    btf_16_adds_subs_sse2(x[40], x[41]);
    btf_16_subs_adds_sse2(x[43], x[42]);
    btf_16_adds_subs_sse2(x[44], x[45]);
    btf_16_subs_adds_sse2(x[47], x[46]);
    btf_16_adds_subs_sse2(x[48], x[49]);
    btf_16_subs_adds_sse2(x[51], x[50]);
    btf_16_adds_subs_sse2(x[52], x[53]);
    btf_16_subs_adds_sse2(x[55], x[54]);
    btf_16_adds_subs_sse2(x[56], x[57]);
    btf_16_subs_adds_sse2(x[59], x[58]);
    btf_16_adds_subs_sse2(x[60], x[61]);
    btf_16_subs_adds_sse2(x[63], x[62]);

    // stage 4
    btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
    btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
    btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
    btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
    btf_16_adds_subs_sse2(x[16], x[17]);
    btf_16_subs_adds_sse2(x[19], x[18]);
    btf_16_adds_subs_sse2(x[20], x[21]);
    btf_16_subs_adds_sse2(x[23], x[22]);
    btf_16_adds_subs_sse2(x[24], x[25]);
    btf_16_subs_adds_sse2(x[27], x[26]);
    btf_16_adds_subs_sse2(x[28], x[29]);
    btf_16_subs_adds_sse2(x[31], x[30]);
    idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);

    // stage 5
    btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
    btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
    btf_16_adds_subs_sse2(x[8], x[9]);
    btf_16_subs_adds_sse2(x[11], x[10]);
    btf_16_adds_subs_sse2(x[12], x[13]);
    btf_16_subs_adds_sse2(x[15], x[14]);
    idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);

    // stage 6
    btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
    btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
    btf_16_adds_subs_sse2(x[4], x[5]);
    btf_16_subs_adds_sse2(x[7], x[6]);
    btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14], __rounding);
    btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13], __rounding);
    idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);

    // stage 7
    btf_16_adds_subs_sse2(x[0], x[3]);
    btf_16_adds_subs_sse2(x[1], x[2]);
    btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6], __rounding);
    btf_16_adds_subs_sse2(x[8], x[11]);
    btf_16_adds_subs_sse2(x[9], x[10]);
    btf_16_subs_adds_sse2(x[15], x[12]);
    btf_16_subs_adds_sse2(x[14], x[13]);
    idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);

    // stage 8
    btf_16_adds_subs_sse2(x[0], x[7]);
    btf_16_adds_subs_sse2(x[1], x[6]);
    btf_16_adds_subs_sse2(x[2], x[5]);
    btf_16_adds_subs_sse2(x[3], x[4]);
    btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13], __rounding);
    btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12], __rounding);
    idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);

    // stage 9~11
    idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
    idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
    idct64_stage11_sse2(output, x);
}
1581 
iadst4_new_sse2(const __m128i * input,__m128i * output,int8_t cos_bit)1582 static void iadst4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
1583     (void)cos_bit;
1584     const int32_t *sinpi         = sinpi_arr(INV_COS_BIT);
1585     const __m128i  sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
1586     const __m128i  sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
1587     const __m128i  sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
1588     const __m128i  sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
1589     const __m128i  sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
1590     const __m128i  sinpi_0_p03   = pair_set_epi16(0, sinpi[3]);
1591     const __m128i  sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
1592     const __m128i  sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
1593     __m128i        x0[4];
1594     x0[0] = input[0];
1595     x0[1] = input[1];
1596     x0[2] = input[2];
1597     x0[3] = input[3];
1598 
1599     __m128i u[4];
1600     u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
1601     u[1] = _mm_unpackhi_epi16(x0[0], x0[2]);
1602     u[2] = _mm_unpacklo_epi16(x0[1], x0[3]);
1603     u[3] = _mm_unpackhi_epi16(x0[1], x0[3]);
1604 
1605     __m128i x1[16];
1606     x1[0]  = _mm_madd_epi16(u[0], sinpi_p01_p04); // x0*sin1 + x2*sin4
1607     x1[1]  = _mm_madd_epi16(u[1], sinpi_p01_p04);
1608     x1[2]  = _mm_madd_epi16(u[0], sinpi_p02_m01); // x0*sin2 - x2*sin1
1609     x1[3]  = _mm_madd_epi16(u[1], sinpi_p02_m01);
1610     x1[4]  = _mm_madd_epi16(u[2], sinpi_p03_p02); // x1*sin3 + x3*sin2
1611     x1[5]  = _mm_madd_epi16(u[3], sinpi_p03_p02);
1612     x1[6]  = _mm_madd_epi16(u[2], sinpi_p03_m04); // x1*sin3 - x3*sin4
1613     x1[7]  = _mm_madd_epi16(u[3], sinpi_p03_m04);
1614     x1[8]  = _mm_madd_epi16(u[0], sinpi_p03_m03); // x0*sin3 - x2*sin3
1615     x1[9]  = _mm_madd_epi16(u[1], sinpi_p03_m03);
1616     x1[10] = _mm_madd_epi16(u[2], sinpi_0_p03); // x2*sin3
1617     x1[11] = _mm_madd_epi16(u[3], sinpi_0_p03);
1618     x1[12] = _mm_madd_epi16(u[0], sinpi_p04_p02); // x0*sin4 + x2*sin2
1619     x1[13] = _mm_madd_epi16(u[1], sinpi_p04_p02);
1620     x1[14] = _mm_madd_epi16(u[2], sinpi_m03_m01); // -x1*sin3 - x3*sin1
1621     x1[15] = _mm_madd_epi16(u[3], sinpi_m03_m01);
1622 
1623     __m128i x2[8];
1624     x2[0] = _mm_add_epi32(x1[0], x1[4]); // x0*sin1 +x2*sin4 +x1*sin3 +x3*sin2
1625     x2[1] = _mm_add_epi32(x1[1], x1[5]);
1626     x2[2] = _mm_add_epi32(x1[2], x1[6]); // x0*sin2 -x2*sin1 +x1*sin3 -x3*sin4
1627     x2[3] = _mm_add_epi32(x1[3], x1[7]);
1628     x2[4] = _mm_add_epi32(x1[8], x1[10]); // x0*sin3 -x2*sin3 +x3*sin3
1629     x2[5] = _mm_add_epi32(x1[9], x1[11]);
1630     x2[6] = _mm_add_epi32(x1[12], x1[14]); // x0*sin1 +x2*sin4 +x0*sin2 -x2*sin1
1631     x2[7] = _mm_add_epi32(x1[13], x1[15]);
1632 
1633     const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
1634     for (int32_t i = 0; i < 4; ++i) {
1635         __m128i out0 = _mm_add_epi32(x2[2 * i], rounding);
1636         __m128i out1 = _mm_add_epi32(x2[2 * i + 1], rounding);
1637         out0         = _mm_srai_epi32(out0, INV_COS_BIT);
1638         out1         = _mm_srai_epi32(out1, INV_COS_BIT);
1639         output[i]    = _mm_packs_epi32(out0, out1);
1640     }
1641 }
1642 
iadst4_w4_new_sse2(const __m128i * input,__m128i * output,int8_t cos_bit)1643 static void iadst4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
1644     (void)cos_bit;
1645     const int32_t *sinpi         = sinpi_arr(INV_COS_BIT);
1646     const __m128i  sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
1647     const __m128i  sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
1648     const __m128i  sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
1649     const __m128i  sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
1650     const __m128i  sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
1651     const __m128i  sinpi_0_p03   = pair_set_epi16(0, sinpi[3]);
1652     const __m128i  sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
1653     const __m128i  sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
1654     __m128i        x0[4];
1655     x0[0] = input[0];
1656     x0[1] = input[1];
1657     x0[2] = input[2];
1658     x0[3] = input[3];
1659 
1660     __m128i u[2];
1661     u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
1662     u[1] = _mm_unpacklo_epi16(x0[1], x0[3]);
1663 
1664     __m128i x1[8];
1665     x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04); // x0*sin1 + x2*sin4
1666     x1[1] = _mm_madd_epi16(u[0], sinpi_p02_m01); // x0*sin2 - x2*sin1
1667     x1[2] = _mm_madd_epi16(u[1], sinpi_p03_p02); // x1*sin3 + x3*sin2
1668     x1[3] = _mm_madd_epi16(u[1], sinpi_p03_m04); // x1*sin3 - x3*sin4
1669     x1[4] = _mm_madd_epi16(u[0], sinpi_p03_m03); // x0*sin3 - x2*sin3
1670     x1[5] = _mm_madd_epi16(u[1], sinpi_0_p03); // x2*sin3
1671     x1[6] = _mm_madd_epi16(u[0], sinpi_p04_p02); // x0*sin4 + x2*sin2
1672     x1[7] = _mm_madd_epi16(u[1], sinpi_m03_m01); // -x1*sin3 - x3*sin1
1673 
1674     __m128i x2[4];
1675     x2[0] = _mm_add_epi32(x1[0], x1[2]); // x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2
1676     x2[1] = _mm_add_epi32(x1[1], x1[3]); // x0*sin2 - x2*sin1 + x1*sin3 - x3*sin4
1677     x2[2] = _mm_add_epi32(x1[4], x1[5]); // x0*sin3 - x2*sin3 + x3*sin3
1678     x2[3] = _mm_add_epi32(x1[6], x1[7]); // x0*sin4 + x2*sin2 - x1*sin3 - x3*sin1
1679 
1680     const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
1681     for (int32_t i = 0; i < 4; ++i) {
1682         __m128i out0 = _mm_add_epi32(x2[i], rounding);
1683         out0         = _mm_srai_epi32(out0, INV_COS_BIT);
1684         output[i]    = _mm_packs_epi32(out0, out0);
1685     }
1686 }
1687 
// 8-point inverse ADST specialized for inputs with only coefficient 0
// non-zero: the stage-3/5 add/sub butterflies degenerate to copies because
// their second operand is zero.
static void iadst8_low1_new_ssse3(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit; // fixed INV_COS_BIT precision
    const int32_t *cospi      = cospi_arr(INV_COS_BIT);
    const __m128i  __zero     = _mm_setzero_si128();
    const __m128i  __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
    const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
    const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);

    // stage 1: only the DC coefficient is live
    __m128i x[8];
    x[1] = input[0];

    // stage 2: single-input rotation (zero partner implied)
    btf_16_ssse3(cospi[60], -cospi[4], x[1], x[0], x[1]);

    // stage 3: add/sub with zero reduces to copies
    x[4] = x[0];
    x[5] = x[1];

    // stage 4
    btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5], __rounding);

    // stage 5: copies again (second operands still zero)
    x[2] = x[0];
    x[3] = x[1];
    x[6] = x[4];
    x[7] = x[5];

    // stage 6
    btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3], __rounding);
    btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7], __rounding);

    // stage 7: output permutation with sign flips on odd outputs
    output[0] = x[0];
    output[1] = _mm_subs_epi16(__zero, x[4]);
    output[2] = x[6];
    output[3] = _mm_subs_epi16(__zero, x[2]);
    output[4] = x[3];
    output[5] = _mm_subs_epi16(__zero, x[7]);
    output[6] = x[5];
    output[7] = _mm_subs_epi16(__zero, x[1]);
}
1733 
// Full 8-point inverse ADST on 8 columns of 16-bit coefficients.
// btf_16_sse2 performs a 2x2 rotation on a register pair in place; the
// final stage permutes results and negates the odd outputs.
static void iadst8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit; // fixed INV_COS_BIT precision
    const int32_t *cospi      = cospi_arr(INV_COS_BIT);
    const __m128i  __zero     = _mm_setzero_si128();
    const __m128i  __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
    const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
    const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
    const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
    const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
    const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
    const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
    const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
    const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
    const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
    const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
    const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);

    // stage 1: ADST input reordering
    __m128i x[8];
    x[0] = input[7];
    x[1] = input[0];
    x[2] = input[5];
    x[3] = input[2];
    x[4] = input[3];
    x[5] = input[4];
    x[6] = input[1];
    x[7] = input[6];

    // stage 2: pairwise rotations
    btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1], __rounding);
    btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3], __rounding);
    btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5], __rounding);
    btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7], __rounding);

    // stage 3: saturating add/sub butterflies
    btf_16_adds_subs_sse2(x[0], x[4]);
    btf_16_adds_subs_sse2(x[1], x[5]);
    btf_16_adds_subs_sse2(x[2], x[6]);
    btf_16_adds_subs_sse2(x[3], x[7]);

    // stage 4
    btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5], __rounding);
    btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7], __rounding);

    // stage 5
    btf_16_adds_subs_sse2(x[0], x[2]);
    btf_16_adds_subs_sse2(x[1], x[3]);
    btf_16_adds_subs_sse2(x[4], x[6]);
    btf_16_adds_subs_sse2(x[5], x[7]);

    // stage 6
    btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3], __rounding);
    btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7], __rounding);

    // stage 7: output permutation; odd outputs are negated
    output[0] = x[0];
    output[1] = _mm_subs_epi16(__zero, x[4]);
    output[2] = x[6];
    output[3] = _mm_subs_epi16(__zero, x[2]);
    output[4] = x[3];
    output[5] = _mm_subs_epi16(__zero, x[7]);
    output[6] = x[5];
    output[7] = _mm_subs_epi16(__zero, x[1]);
}
1801 
// 8-point inverse ADST for 4-column blocks: identical data flow to
// iadst8_new_sse2 but the rotations use btf_16_4p_sse2, which only
// processes the low four lanes of each register.
static void iadst8_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit; // fixed INV_COS_BIT precision
    const int32_t *cospi      = cospi_arr(INV_COS_BIT);
    const __m128i  __zero     = _mm_setzero_si128();
    const __m128i  __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
    const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
    const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
    const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
    const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
    const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
    const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
    const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
    const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
    const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
    const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
    const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);

    // stage 1: ADST input reordering
    __m128i x[8];
    x[0] = input[7];
    x[1] = input[0];
    x[2] = input[5];
    x[3] = input[2];
    x[4] = input[3];
    x[5] = input[4];
    x[6] = input[1];
    x[7] = input[6];

    // stage 2: 4-lane pairwise rotations
    btf_16_4p_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1], __rounding);
    btf_16_4p_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3], __rounding);
    btf_16_4p_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5], __rounding);
    btf_16_4p_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7], __rounding);

    // stage 3
    btf_16_adds_subs_sse2(x[0], x[4]);
    btf_16_adds_subs_sse2(x[1], x[5]);
    btf_16_adds_subs_sse2(x[2], x[6]);
    btf_16_adds_subs_sse2(x[3], x[7]);

    // stage 4
    btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5], __rounding);
    btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7], __rounding);

    // stage 5
    btf_16_adds_subs_sse2(x[0], x[2]);
    btf_16_adds_subs_sse2(x[1], x[3]);
    btf_16_adds_subs_sse2(x[4], x[6]);
    btf_16_adds_subs_sse2(x[5], x[7]);

    // stage 6
    btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3], __rounding);
    btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7], __rounding);

    // stage 7: output permutation; odd outputs are negated
    output[0] = x[0];
    output[1] = _mm_subs_epi16(__zero, x[4]);
    output[2] = x[6];
    output[3] = _mm_subs_epi16(__zero, x[2]);
    output[4] = x[3];
    output[5] = _mm_subs_epi16(__zero, x[7]);
    output[6] = x[5];
    output[7] = _mm_subs_epi16(__zero, x[1]);
}
1869 
// ADST16 stage 3: saturating add/sub butterflies pairing x[i] with x[i+8].
static INLINE void iadst16_stage3_ssse3(__m128i *x) {
    btf_16_adds_subs_sse2(x[0], x[8]);
    btf_16_adds_subs_sse2(x[1], x[9]);
    btf_16_adds_subs_sse2(x[2], x[10]);
    btf_16_adds_subs_sse2(x[3], x[11]);
    btf_16_adds_subs_sse2(x[4], x[12]);
    btf_16_adds_subs_sse2(x[5], x[13]);
    btf_16_adds_subs_sse2(x[6], x[14]);
    btf_16_adds_subs_sse2(x[7], x[15]);
}
1880 
// ADST16 stage 4: rotations on the upper half (x[8..15]) using the
// 8/56 and 40/24 cospi pairs.
static INLINE void iadst16_stage4_ssse3(__m128i *x, const int32_t *cospi, const __m128i __rounding,
                                        int8_t cos_bit) {
    const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
    const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
    const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
    const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
    const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
    const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
    btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9], __rounding);
    btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11], __rounding);
    btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13], __rounding);
    btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15], __rounding);
}
1894 
// ADST16 stage 5: add/sub butterflies pairing x[i] with x[i+4] in each half.
static INLINE void iadst16_stage5_ssse3(__m128i *x) {
    btf_16_adds_subs_sse2(x[0], x[4]);
    btf_16_adds_subs_sse2(x[1], x[5]);
    btf_16_adds_subs_sse2(x[2], x[6]);
    btf_16_adds_subs_sse2(x[3], x[7]);
    btf_16_adds_subs_sse2(x[8], x[12]);
    btf_16_adds_subs_sse2(x[9], x[13]);
    btf_16_adds_subs_sse2(x[10], x[14]);
    btf_16_adds_subs_sse2(x[11], x[15]);
}
1905 
// ADST16 stage 6: 16/48 cospi rotations applied to x[4..7] and x[12..15].
static INLINE void iadst16_stage6_ssse3(__m128i *x, const int32_t *cospi, const __m128i __rounding,
                                        int8_t cos_bit) {
    const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
    const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
    const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
    btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5], __rounding);
    btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7], __rounding);
    btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13], __rounding);
    btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15], __rounding);
}
1916 
// ADST16 stage 7: add/sub butterflies pairing x[i] with x[i+2] in each quad.
static INLINE void iadst16_stage7_ssse3(__m128i *x) {
    btf_16_adds_subs_sse2(x[0], x[2]);
    btf_16_adds_subs_sse2(x[1], x[3]);
    btf_16_adds_subs_sse2(x[4], x[6]);
    btf_16_adds_subs_sse2(x[5], x[7]);
    btf_16_adds_subs_sse2(x[8], x[10]);
    btf_16_adds_subs_sse2(x[9], x[11]);
    btf_16_adds_subs_sse2(x[12], x[14]);
    btf_16_adds_subs_sse2(x[13], x[15]);
}
1927 
// ADST16 stage 8: final 32/32 cospi rotations on the odd-pair registers.
static INLINE void iadst16_stage8_ssse3(__m128i *x, const int32_t *cospi, const __m128i __rounding,
                                        int8_t cos_bit) {
    const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
    btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3], __rounding);
    btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7], __rounding);
    btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11], __rounding);
    btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15], __rounding);
}
1937 
iadst16_stage9_ssse3(__m128i * output,__m128i * x)1938 static INLINE void iadst16_stage9_ssse3(__m128i *output, __m128i *x) {
1939     const __m128i __zero = _mm_setzero_si128();
1940     output[0]            = x[0];
1941     output[1]            = _mm_subs_epi16(__zero, x[8]);
1942     output[2]            = x[12];
1943     output[3]            = _mm_subs_epi16(__zero, x[4]);
1944     output[4]            = x[6];
1945     output[5]            = _mm_subs_epi16(__zero, x[14]);
1946     output[6]            = x[10];
1947     output[7]            = _mm_subs_epi16(__zero, x[2]);
1948     output[8]            = x[3];
1949     output[9]            = _mm_subs_epi16(__zero, x[11]);
1950     output[10]           = x[15];
1951     output[11]           = _mm_subs_epi16(__zero, x[7]);
1952     output[12]           = x[5];
1953     output[13]           = _mm_subs_epi16(__zero, x[13]);
1954     output[14]           = x[9];
1955     output[15]           = _mm_subs_epi16(__zero, x[1]);
1956 }
1957 
iadst16_low1_new_ssse3(const __m128i * input,__m128i * output,int8_t cos_bit)1958 static void iadst16_low1_new_ssse3(const __m128i *input, __m128i *output, int8_t cos_bit) {
1959     (void)cos_bit;
1960     const int32_t *cospi      = cospi_arr(INV_COS_BIT);
1961     const __m128i  __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
1962 
1963     const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
1964     const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
1965     const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
1966     const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
1967 
1968     // stage 1
1969     __m128i x[16];
1970     x[1] = input[0];
1971 
1972     // stage 2
1973     btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);
1974 
1975     // stage 3
1976     x[8] = x[0];
1977     x[9] = x[1];
1978 
1979     // stage 4
1980     btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9], __rounding);
1981 
1982     // stage 5
1983     x[4]  = x[0];
1984     x[5]  = x[1];
1985     x[12] = x[8];
1986     x[13] = x[9];
1987 
1988     // stage 6
1989     btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5], __rounding);
1990     btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13], __rounding);
1991 
1992     // stage 7
1993     x[2]  = x[0];
1994     x[3]  = x[1];
1995     x[6]  = x[4];
1996     x[7]  = x[5];
1997     x[10] = x[8];
1998     x[11] = x[9];
1999     x[14] = x[12];
2000     x[15] = x[13];
2001 
2002     iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
2003     iadst16_stage9_ssse3(output, x);
2004 }
2005 
// 16-point inverse ADST specialized for inputs whose non-zero coefficients
// lie within the first 8 rows: stage 2 uses single-input butterflies
// (btf_16_ssse3) since the paired terms are known to be zero.
static void iadst16_low8_new_ssse3(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit; // cosine bit-depth is fixed at INV_COS_BIT in this path
    const int32_t *cospi      = cospi_arr(INV_COS_BIT);
    const __m128i  __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    // stage 1 -- ADST input permutation restricted to rows 0..7.
    __m128i x[16];
    x[1]  = input[0];
    x[3]  = input[2];
    x[5]  = input[4];
    x[7]  = input[6];
    x[8]  = input[7];
    x[10] = input[5];
    x[12] = input[3];
    x[14] = input[1];

    // stage 2 -- one-sided butterflies (the other operand of each pair is 0).
    btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);
    btf_16_ssse3(cospi[54], -cospi[10], x[3], x[2], x[3]);
    btf_16_ssse3(cospi[46], -cospi[18], x[5], x[4], x[5]);
    btf_16_ssse3(cospi[38], -cospi[26], x[7], x[6], x[7]);
    btf_16_ssse3(cospi[34], cospi[30], x[8], x[8], x[9]);
    btf_16_ssse3(cospi[42], cospi[22], x[10], x[10], x[11]);
    btf_16_ssse3(cospi[50], cospi[14], x[12], x[12], x[13]);
    btf_16_ssse3(cospi[58], cospi[6], x[14], x[14], x[15]);

    // stage 3 onwards is identical to the full transform.
    iadst16_stage3_ssse3(x);
    iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
    iadst16_stage5_ssse3(x);
    iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
    iadst16_stage7_ssse3(x);
    iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
    iadst16_stage9_ssse3(output, x);
}
2041 
// Full 16-point inverse ADST operating on 8 pixels (one __m128i of int16)
// per coefficient row. Stages 3-9 are shared helpers; only the input
// permutation and stage-2 butterflies are written out here.
static void iadst16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit; // cosine bit-depth is fixed at INV_COS_BIT in this path
    const int32_t *cospi         = cospi_arr(INV_COS_BIT);
    const __m128i  __rounding    = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
    // Interleaved (even, odd) cosine pairs consumed by the stage-2 butterflies.
    const __m128i  cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
    const __m128i  cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
    const __m128i  cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
    const __m128i  cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
    const __m128i  cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
    const __m128i  cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
    const __m128i  cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
    const __m128i  cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
    const __m128i  cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
    const __m128i  cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
    const __m128i  cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
    const __m128i  cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
    const __m128i  cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
    const __m128i  cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
    const __m128i  cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
    const __m128i  cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);

    // stage 1 -- ADST input permutation (alternating high/low rows).
    __m128i x[16];
    x[0]  = input[15];
    x[1]  = input[0];
    x[2]  = input[13];
    x[3]  = input[2];
    x[4]  = input[11];
    x[5]  = input[4];
    x[6]  = input[9];
    x[7]  = input[6];
    x[8]  = input[7];
    x[9]  = input[8];
    x[10] = input[5];
    x[11] = input[10];
    x[12] = input[3];
    x[13] = input[12];
    x[14] = input[1];
    x[15] = input[14];

    // stage 2 -- paired rotations on all eight coefficient pairs.
    btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1], __rounding);
    btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3], __rounding);
    btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5], __rounding);
    btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7], __rounding);
    btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9], __rounding);
    btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11], __rounding);
    btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13], __rounding);
    btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15], __rounding);

    // stage 3~9
    iadst16_stage3_ssse3(x);
    iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
    iadst16_stage5_ssse3(x);
    iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
    iadst16_stage7_ssse3(x);
    iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
    iadst16_stage9_ssse3(output, x);
}
2101 
// 16-point inverse ADST for 4-pixel-wide blocks: identical dataflow to
// iadst16_new_sse2, but every butterfly uses the 4-lane variant
// (btf_16_4p_sse2), so the even stages are written out explicitly instead of
// reusing the 8-lane stage helpers.
static void iadst16_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit; // cosine bit-depth is fixed at INV_COS_BIT in this path
    const int32_t *cospi      = cospi_arr(INV_COS_BIT);
    const __m128i  __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

    // Cosine pairs for stages 2, 4, 6 and 8.
    const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
    const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
    const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
    const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
    const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
    const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
    const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
    const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
    const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
    const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
    const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
    const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
    const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
    const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
    const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
    const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
    const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
    const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
    const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
    const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
    const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
    const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
    const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
    const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
    const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
    const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
    const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);

    // stage 1 -- ADST input permutation.
    __m128i x[16];
    x[0]  = input[15];
    x[1]  = input[0];
    x[2]  = input[13];
    x[3]  = input[2];
    x[4]  = input[11];
    x[5]  = input[4];
    x[6]  = input[9];
    x[7]  = input[6];
    x[8]  = input[7];
    x[9]  = input[8];
    x[10] = input[5];
    x[11] = input[10];
    x[12] = input[3];
    x[13] = input[12];
    x[14] = input[1];
    x[15] = input[14];

    // stage 2 -- 4-lane paired rotations.
    btf_16_4p_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1], __rounding);
    btf_16_4p_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3], __rounding);
    btf_16_4p_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5], __rounding);
    btf_16_4p_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7], __rounding);
    btf_16_4p_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9], __rounding);
    btf_16_4p_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11], __rounding);
    btf_16_4p_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13], __rounding);
    btf_16_4p_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15], __rounding);

    // stage 3 -- add/sub stage shared with the 8-lane transform.
    iadst16_stage3_ssse3(x);

    // stage 4 -- rotations on the upper half (4-lane variant).
    btf_16_4p_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9], __rounding);
    btf_16_4p_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11], __rounding);
    btf_16_4p_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13], __rounding);
    btf_16_4p_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15], __rounding);

    // stage 5
    iadst16_stage5_ssse3(x);

    // stage 6
    btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5], __rounding);
    btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7], __rounding);
    btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13], __rounding);
    btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15], __rounding);

    // stage 7
    iadst16_stage7_ssse3(x);

    // stage 8 -- final +/-32 rotations.
    btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3], __rounding);
    btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7], __rounding);
    btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11], __rounding);
    btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15], __rounding);

    // stage 9 -- output permutation and sign flips.
    iadst16_stage9_ssse3(output, x);
}
2194 
// 4-point inverse identity transform: scale each row by sqrt(2).
// Computed as x + frac(sqrt2)*x, where frac(sqrt2) is the fractional part
// of sqrt(2) expressed in Q15 for _mm_mulhrs_epi16.
static void iidentity4_new_ssse3(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int16_t frac_q15   = (new_sqrt2 - (1 << new_sqrt2_bits)) << (15 - new_sqrt2_bits);
    const __m128i frac_scale = _mm_set1_epi16(frac_q15);
    for (int32_t row = 0; row < 4; ++row) {
        const __m128i frac_part = _mm_mulhrs_epi16(input[row], frac_scale);
        output[row]             = _mm_adds_epi16(frac_part, input[row]);
    }
}
2204 
// 8-point inverse identity transform: scale each row by 2 with signed
// saturation.
static void iidentity8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    int32_t row = 0;
    do {
        const __m128i src = input[row];
        output[row]       = _mm_adds_epi16(src, src);
    } while (++row < 8);
}
2209 
// 16-point inverse identity transform: scale each row by 2*sqrt(2).
// Computed as 2*x + 2*frac(sqrt2)*x, with the doubled fractional part of
// sqrt(2) held in Q15 for _mm_mulhrs_epi16 and a saturating add for 2*x.
static void iidentity16_new_ssse3(const __m128i *input, __m128i *output, int8_t cos_bit) {
    (void)cos_bit;
    const int16_t frac2_q15 = (2 * (new_sqrt2 - (1 << new_sqrt2_bits))) << (15 - new_sqrt2_bits);
    const __m128i frac2     = _mm_set1_epi16(frac2_q15);
    for (int32_t row = 0; row < 16; ++row) {
        const __m128i src       = input[row];
        const __m128i doubled   = _mm_adds_epi16(src, src);
        const __m128i frac_part = _mm_mulhrs_epi16(src, frac2);
        output[row]             = _mm_adds_epi16(frac_part, doubled);
    }
}
2221 
lowbd_get_recon_8x8_sse2(const __m128i pred,__m128i res)2222 static INLINE __m128i lowbd_get_recon_8x8_sse2(const __m128i pred, __m128i res) {
2223     const __m128i zero = _mm_setzero_si128();
2224     __m128i       x0   = _mm_adds_epi16(res, _mm_unpacklo_epi8(pred, zero));
2225     return _mm_packus_epi16(x0, x0);
2226 }
2227 
// Add a column of 4-wide residual rows (`in`) to the prediction read from
// output_r and store the clamped 8-bit result to output_w. When flipud is
// set the residual rows are consumed bottom-up while the destination is
// still walked top-down.
static INLINE void lowbd_write_buffer_4xn_sse2(__m128i *in, uint8_t *output_r, int32_t stride_r,
                                               uint8_t *output_w, int32_t stride_w, int32_t flipud,
                                               const int32_t height) {
    int32_t       j    = flipud ? (height - 1) : 0;
    const int32_t step = flipud ? -1 : 1;
    const __m128i zero = _mm_setzero_si128();
    for (int32_t i = 0; i < height; ++i, j += step) {
        // NOTE(review): the uint32_t loads/stores assume the byte pointers are
        // 4-byte addressable without alignment faults -- presumably guaranteed
        // by the callers' frame strides; confirm before reusing elsewhere.
        const __m128i v = _mm_cvtsi32_si128(*((uint32_t *)(output_r + i * stride_r)));
        // Widen prediction to 16-bit, add residual with saturation...
        __m128i       u = _mm_adds_epi16(in[j], _mm_unpacklo_epi8(v, zero));
        // ...then pack back to 8-bit with unsigned saturation and store 4 bytes.
        u               = _mm_packus_epi16(u, zero);
        *((uint32_t *)(output_w + i * stride_w)) = _mm_cvtsi128_si32(u);
    }
}
2241 
lowbd_write_buffer_8xn_sse2(__m128i * in,uint8_t * output_r,int32_t stride_r,uint8_t * output_w,int32_t stride_w,int32_t flipud,const int32_t height)2242 static INLINE void lowbd_write_buffer_8xn_sse2(__m128i *in, uint8_t *output_r, int32_t stride_r,
2243                                                uint8_t *output_w, int32_t stride_w, int32_t flipud,
2244                                                const int32_t height) {
2245     int32_t       j    = flipud ? (height - 1) : 0;
2246     const int32_t step = flipud ? -1 : 1;
2247     for (int32_t i = 0; i < height; ++i, j += step) {
2248         const __m128i v = _mm_loadl_epi64((__m128i const *)(output_r + i * stride_r));
2249         const __m128i u = lowbd_get_recon_8x8_sse2(v, in[j]);
2250         _mm_storel_epi64((__m128i *)(output_w + i * stride_w), u);
2251     }
2252 }
2253 
// 1D inverse-transform functions that process 8 pixels at one time,
// indexed by [tx size][DCT/ADST/IDTX]. NULL entries are sizes with no
// ADST/IDTX variant.
static const Transform1dSsse3 lowbd_txfm_all_1d_w8_arr[TX_SIZES][ITX_TYPES_1D] = {
    {idct4_new_sse2, iadst4_new_sse2, iidentity4_new_ssse3},
    {idct8_new_sse2, iadst8_new_sse2, iidentity8_new_sse2},
    {idct16_new_sse2, iadst16_new_sse2, iidentity16_new_ssse3},
    {idct32_new_sse2, NULL, NULL},
    {idct64_low32_new_ssse3, NULL, NULL},
};
2262 
// Functions for blocks with eob at DC and within the top-left 8x8, 16x16,
// 32x32 corner, indexed by [tx size][DCT/ADST/IDTX][eob bucket]. Smaller
// eob buckets select the cheaper "low" variants that skip zero rows.
static const Transform1dSsse3 lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
    {
        {idct4_new_sse2, idct4_new_sse2, NULL, NULL},
        {iadst4_new_sse2, iadst4_new_sse2, NULL, NULL},
        {iidentity4_new_ssse3, iidentity4_new_ssse3, NULL, NULL},
    },
    {{idct8_low1_new_ssse3, idct8_new_sse2, NULL, NULL},
     {iadst8_low1_new_ssse3, iadst8_new_sse2, NULL, NULL},
     {iidentity8_new_sse2, iidentity8_new_sse2, NULL, NULL}},
    {
        {idct16_low1_new_ssse3, idct16_low8_new_ssse3, idct16_new_sse2, NULL},
        {iadst16_low1_new_ssse3, iadst16_low8_new_ssse3, iadst16_new_sse2, NULL},
        {NULL, NULL, NULL, NULL},
    },
    {{idct32_low1_new_ssse3, idct32_low8_new_ssse3, idct32_low16_new_ssse3, idct32_new_sse2},
     {NULL, NULL, NULL, NULL},
     {NULL, NULL, NULL, NULL}},
    {{idct64_low1_new_ssse3, idct64_low8_new_ssse3, idct64_low16_new_ssse3, idct64_low32_new_ssse3},
     {NULL, NULL, NULL, NULL},
     {NULL, NULL, NULL, NULL}}};
2285 
// 1D inverse-transform functions that process 4 pixels at one time;
// used in 4x4, 4x8, 4x16, 8x4, 16x4 blocks.
static const Transform1dSsse3 lowbd_txfm_all_1d_w4_arr[TX_SIZES][ITX_TYPES_1D] = {
    {idct4_w4_new_sse2, iadst4_w4_new_sse2, iidentity4_new_ssse3},
    {idct8_w4_new_sse2, iadst8_w4_new_sse2, iidentity8_new_sse2},
    {idct16_w4_new_sse2, iadst16_w4_new_sse2, iidentity16_new_ssse3},
    {NULL, NULL, NULL},
    {NULL, NULL, NULL},
};
2295 
// Row pass of the identity transform for an 8-wide column of coefficients:
// load 32-bit input, scale by sqrt2^k (k from txw_idx) and apply the row
// round-shift in a single _mm_madd_epi16 per half, writing 16-bit rows to
// `out`. Rectangular blocks (rect_type +/-1) are pre-scaled by 1/sqrt(2).
static INLINE void iidentity_row_8xn_ssse3(__m128i *out, const int32_t *input, int32_t stride,
                                           int32_t shift, int32_t height, int32_t txw_idx,
                                           int32_t rect_type) {
    const int32_t *input_row      = input;
    const __m128i  scale          = _mm_set1_epi16(new_sqrt2list[txw_idx]);
    // Combined rounding constant: covers both the sqrt2 descale and the
    // row shift (note shift is applied as part of the same right shift below).
    const __m128i  rounding       = _mm_set1_epi16((1 << (new_sqrt2_bits - 1)) +
                                            (1 << (new_sqrt2_bits - shift - 1)));
    const __m128i  one            = _mm_set1_epi16(1);
    // Interleave (scale, rounding) so that madd of (src, 1) with it computes
    // src*scale + rounding in one instruction.
    const __m128i  scale_rounding = _mm_unpacklo_epi16(scale, rounding);
    if (rect_type != 1 && rect_type != -1) {
        // Square (or 4:1) path: no extra 1/sqrt2 pre-scale.
        for (int32_t i = 0; i < height; ++i) {
            const __m128i src = load_32bit_to_16bit(input_row);
            input_row += stride;
            __m128i lo = _mm_unpacklo_epi16(src, one);
            __m128i hi = _mm_unpackhi_epi16(src, one);
            lo         = _mm_madd_epi16(lo, scale_rounding);
            hi         = _mm_madd_epi16(hi, scale_rounding);
            lo         = _mm_srai_epi32(lo, new_sqrt2_bits - shift);
            hi         = _mm_srai_epi32(hi, new_sqrt2_bits - shift);
            out[i]     = _mm_packs_epi32(lo, hi);
        }
    } else {
        // Rectangular (2:1 / 1:2) path: pre-multiply by 1/sqrt2 in Q15.
        const __m128i rect_scale = _mm_set1_epi16(new_inv_sqrt2 << (15 - new_sqrt2_bits));
        for (int32_t i = 0; i < height; ++i) {
            __m128i src = load_32bit_to_16bit(input_row);
            src         = _mm_mulhrs_epi16(src, rect_scale);
            input_row += stride;
            __m128i lo = _mm_unpacklo_epi16(src, one);
            __m128i hi = _mm_unpackhi_epi16(src, one);
            lo         = _mm_madd_epi16(lo, scale_rounding);
            hi         = _mm_madd_epi16(hi, scale_rounding);
            lo         = _mm_srai_epi32(lo, new_sqrt2_bits - shift);
            hi         = _mm_srai_epi32(hi, new_sqrt2_bits - shift);
            out[i]     = _mm_packs_epi32(lo, hi);
        }
    }
}
2333 
// Column pass of the identity transform for an 8-wide column: scale each
// 16-bit row in `buf` by sqrt2^k (k from txh_idx), apply the final
// round-shift (shift is negative, so the shift amount is -shift), add the
// prediction read from output_r, and store the clamped 8-bit result.
static INLINE void iidentity_col_8xn_ssse3(uint8_t *output_r, int32_t stride_r, uint8_t *output_w,
                                           int32_t stride_w, __m128i *buf, int32_t shift,
                                           int32_t height, int32_t txh_idx) {
    const __m128i scale          = _mm_set1_epi16(new_sqrt2list[txh_idx]);
    const __m128i scale_rounding = _mm_set1_epi16(1 << (new_sqrt2_bits - 1));
    const __m128i shift_rounding = _mm_set1_epi32(1 << (-shift - 1));
    const __m128i one            = _mm_set1_epi16(1);
    // (scale, rounding) interleaved so madd with (src, 1) yields
    // src*scale + rounding in one instruction.
    const __m128i scale_coeff    = _mm_unpacklo_epi16(scale, scale_rounding);
    const __m128i zero           = _mm_setzero_si128();
    for (int32_t h = 0; h < height; ++h) {
        __m128i lo = _mm_unpacklo_epi16(buf[h], one);
        __m128i hi = _mm_unpackhi_epi16(buf[h], one);
        lo         = _mm_madd_epi16(lo, scale_coeff);
        hi         = _mm_madd_epi16(hi, scale_coeff);
        // Descale the sqrt2 multiply, then round-shift by -shift.
        lo         = _mm_srai_epi32(lo, new_sqrt2_bits);
        hi         = _mm_srai_epi32(hi, new_sqrt2_bits);
        lo         = _mm_add_epi32(lo, shift_rounding);
        hi         = _mm_add_epi32(hi, shift_rounding);
        lo         = _mm_srai_epi32(lo, -shift);
        hi         = _mm_srai_epi32(hi, -shift);
        __m128i x  = _mm_packs_epi32(lo, hi);

        // Add prediction and pack to 8-bit with unsigned saturation.
        const __m128i pred = _mm_loadl_epi64((__m128i const *)(output_r));
        x                  = _mm_adds_epi16(x, _mm_unpacklo_epi8(pred, zero));
        const __m128i u    = _mm_packus_epi16(x, x);
        _mm_storel_epi64((__m128i *)(output_w), u);
        output_r += stride_r;
        output_w += stride_w;
    }
}
2364 
// 2D IDTX (identity/identity) inverse transform + reconstruction: no
// butterflies are needed, so each 8-wide column strip is row-scaled into
// `buf` and immediately column-scaled/added to the prediction. Coefficients
// beyond 32 rows/cols are implicitly zero (input_stride/row_max clamp).
static INLINE void lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input, uint8_t *output_r,
                                                   int32_t stride_r, uint8_t *output_w,
                                                   int32_t stride_w, TxSize tx_size) {
    const int8_t *shift         = eb_inv_txfm_shift_ls[tx_size];
    const int32_t txw_idx       = get_txw_idx(tx_size);
    const int32_t txh_idx       = get_txh_idx(tx_size);
    const int32_t txfm_size_col = tx_size_wide[tx_size];
    const int32_t txfm_size_row = tx_size_high[tx_size];
    const int32_t input_stride  = AOMMIN(32, txfm_size_col);
    const int32_t row_max       = AOMMIN(32, txfm_size_row);
    const int32_t rect_type     = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
    __m128i       buf[32];

    // Process one 8-pixel-wide strip per iteration.
    for (int32_t i = 0; i < (input_stride >> 3); ++i) {
        iidentity_row_8xn_ssse3(
            buf, input + 8 * i, input_stride, shift[0], row_max, txw_idx, rect_type);
        iidentity_col_8xn_ssse3(output_r + 8 * i,
                                stride_r,
                                output_w + 8 * i,
                                stride_w,
                                buf,
                                shift[1],
                                row_max,
                                txh_idx);
    }
}
2391 
// 2D inverse transform + reconstruction for 4x4 blocks: row transform,
// transpose, column transform, final round-shift, then add to prediction.
// tx_size_ and eob are ignored -- the block is always treated as TX_4X4.
static void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input, uint8_t *output_r,
                                           int32_t stride_r, uint8_t *output_w, int32_t stride_w,
                                           TxType tx_type, TxSize tx_size_, int32_t eob) {
    (void)tx_size_;
    (void)eob;
    __m128i       buf[4];
    const TxSize  tx_size       = TX_4X4;
    const int8_t *shift         = eb_inv_txfm_shift_ls[tx_size];
    const int32_t txw_idx       = get_txw_idx(tx_size);
    const int32_t txh_idx       = get_txh_idx(tx_size);
    const int32_t cos_bit_row   = inv_cos_bit_row[txw_idx][txh_idx];
    const int32_t cos_bit_col   = inv_cos_bit_col[txw_idx][txh_idx];
    const int32_t txfm_size_col = tx_size_wide[tx_size];
    const int32_t txfm_size_row = tx_size_high[tx_size];

    // 1D kernels selected by the horizontal/vertical part of tx_type.
    const Transform1dSsse3 row_txfm = lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
    const Transform1dSsse3 col_txfm = lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];

    int32_t ud_flip, lr_flip;
    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
    load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row);
    transpose_16bit_4x4(buf, buf);
    row_txfm(buf, buf, cos_bit_row);
    // FLIPADST in the horizontal direction: reverse columns before the
    // transpose back into column order.
    if (lr_flip) {
        __m128i temp[4];
        flip_buf_sse2(buf, temp, txfm_size_col);
        transpose_16bit_4x4(temp, buf);
    } else
        transpose_16bit_4x4(buf, buf);
    col_txfm(buf, buf, cos_bit_col);
    round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
    // ud_flip handles vertical FLIPADST at write-out time.
    lowbd_write_buffer_4xn_sse2(
        buf, output_r, stride_r, output_w, stride_w, ud_flip, txfm_size_row);
}
2426 
lowbd_get_recon_16x16_sse2(const __m128i pred,__m128i res0,__m128i res1)2427 static INLINE __m128i lowbd_get_recon_16x16_sse2(const __m128i pred, __m128i res0, __m128i res1) {
2428     const __m128i zero = _mm_setzero_si128();
2429     __m128i       x0   = _mm_unpacklo_epi8(pred, zero);
2430     __m128i       x1   = _mm_unpackhi_epi8(pred, zero);
2431     x0                 = _mm_adds_epi16(res0, x0);
2432     x1                 = _mm_adds_epi16(res1, x1);
2433     return _mm_packus_epi16(x0, x1);
2434 }
2435 
lowbd_write_buffer_16xn_sse2(__m128i * in,uint8_t * output_r,int32_t stride_r,uint8_t * output_w,int32_t stride_w,int32_t flipud,int32_t height)2436 static INLINE void lowbd_write_buffer_16xn_sse2(__m128i *in, uint8_t *output_r, int32_t stride_r,
2437                                                 uint8_t *output_w, int32_t stride_w, int32_t flipud,
2438                                                 int32_t height) {
2439     int32_t       j    = flipud ? (height - 1) : 0;
2440     const int32_t step = flipud ? -1 : 1;
2441     for (int32_t i = 0; i < height; ++i, j += step) {
2442         __m128i v = _mm_loadu_si128((__m128i const *)(output_r + i * stride_r));
2443         __m128i u = lowbd_get_recon_16x16_sse2(v, in[j], in[j + height]);
2444         _mm_storeu_si128((__m128i *)(output_w + i * stride_w), u);
2445     }
2446 }
2447 
round_shift_ssse3(const __m128i * input,__m128i * output,int32_t size)2448 static INLINE void round_shift_ssse3(const __m128i *input, __m128i *output, int32_t size) {
2449     const __m128i scale = _mm_set1_epi16(new_inv_sqrt2 * 8);
2450     for (int32_t i = 0; i < size; ++i) output[i] = _mm_mulhrs_epi16(input[i], scale);
2451 }
2452 
// General 2D inverse transform + reconstruction for DCT/ADST combinations
// (no identity in either direction). The eob position selects reduced
// "low" kernels that skip known-zero coefficient regions. Data flows:
// load/transpose 8x8 tiles -> row transform -> transpose into column order
// in buf1 -> column transform -> add to prediction.
static INLINE void lowbd_inv_txfm2d_add_no_identity_ssse3(const int32_t *input, uint8_t *output_r,
                                                          int32_t stride_r, uint8_t *output_w,
                                                          int32_t stride_w, TxType tx_type,
                                                          TxSize tx_size, int32_t eob) {
    __m128i buf1[64 * 8]; // column-order intermediate, one __m128i per 8-pixel row segment
    int32_t eobx, eoby;
    get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
    const int8_t *shift                   = eb_inv_txfm_shift_ls[tx_size];
    const int32_t txw_idx                 = get_txw_idx(tx_size);
    const int32_t txh_idx                 = get_txh_idx(tx_size);
    const int32_t cos_bit_col             = inv_cos_bit_col[txw_idx][txh_idx];
    const int32_t cos_bit_row             = inv_cos_bit_row[txw_idx][txh_idx];
    const int32_t txfm_size_col           = tx_size_wide[tx_size];
    const int32_t txfm_size_row           = tx_size_high[tx_size];
    const int32_t buf_size_w_div8         = txfm_size_col >> 3;
    const int32_t buf_size_nonzero_w_div8 = (eobx + 8) >> 3; // 8x8 tiles with non-zero coeffs
    const int32_t buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
    const int32_t input_stride            = AOMMIN(32, txfm_size_col);
    const int32_t rect_type               = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
    ASSERT(eobx < 32);
    ASSERT(eoby < 32);
    // Pick reduced kernels based on how far the non-zero region extends.
    const int32_t          fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
    const int32_t          fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
    const Transform1dSsse3 row_txfm =
        lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
    const Transform1dSsse3 col_txfm =
        lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

    assert(col_txfm != NULL);
    assert(row_txfm != NULL);
    int32_t ud_flip, lr_flip;
    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
    // Row pass: only tiles containing non-zero coefficients are processed.
    for (int32_t i = 0; i < buf_size_nonzero_h_div8; i++) {
        __m128i        buf0[64];
        const int32_t *input_row = input + i * input_stride * 8;
        for (int32_t j = 0; j < buf_size_nonzero_w_div8; ++j) {
            __m128i *buf0_cur = buf0 + j * 8;
            load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8);
            transpose_16bit_8x8(buf0_cur, buf0_cur);
        }
        if (rect_type == 1 || rect_type == -1)
            round_shift_ssse3(buf0, buf0, input_stride); // rect special code
        row_txfm(buf0, buf0, cos_bit_row);
        round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
        __m128i *_buf1 = buf1 + i * 8;
        // Transpose results into column order; lr_flip reverses the column
        // tiles (and each tile's contents via flip_buf_sse2) for FLIPADST.
        if (lr_flip) {
            for (int32_t j = 0; j < buf_size_w_div8; ++j) {
                __m128i temp[8];
                flip_buf_sse2(buf0 + 8 * j, temp, 8);
                transpose_16bit_8x8(temp, _buf1 + txfm_size_row * (buf_size_w_div8 - 1 - j));
            }
        } else {
            for (int32_t j = 0; j < buf_size_w_div8; ++j)
                transpose_16bit_8x8(buf0 + 8 * j, _buf1 + txfm_size_row * j);
        }
    }
    // Column pass over every 8-wide strip.
    for (int32_t i = 0; i < buf_size_w_div8; i++) {
        col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, cos_bit_col);
        round_shift_16bit_ssse3(buf1 + i * txfm_size_row, txfm_size_row, shift[1]);
    }

    // Write-out: 16-wide strips when possible, otherwise the 8-wide path.
    if (txfm_size_col >= 16) {
        for (int32_t i = 0; i < (txfm_size_col >> 4); i++) {
            lowbd_write_buffer_16xn_sse2(buf1 + i * txfm_size_row * 2,
                                         output_r + 16 * i,
                                         stride_r,
                                         output_w + 16 * i,
                                         stride_w,
                                         ud_flip,
                                         txfm_size_row);
        }
    } else if (txfm_size_col == 8)
        lowbd_write_buffer_8xn_sse2(
            buf1, output_r, stride_r, output_w, stride_w, ud_flip, txfm_size_row);
}
2528 
// 2D inverse transform for H_DCT/H_ADST/H_FLIPADST types: the horizontal
// direction is identity (handled by iidentity_row_8xn_ssse3, no transpose
// needed), while the vertical direction runs a real 1D transform per
// 8-wide strip. The final round-shift is folded into a Q15 multiply
// (mshift) applied at write-out.
static INLINE void lowbd_inv_txfm2d_add_h_identity_ssse3(const int32_t *input, uint8_t *output_r,
                                                         int32_t stride_r, uint8_t *output_w,
                                                         int32_t stride_w, TxType tx_type,
                                                         TxSize tx_size, int32_t eob) {
    const int8_t *shift = eb_inv_txfm_shift_ls[tx_size];
    int32_t       eobx, eoby;
    get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
    const int32_t txw_idx         = get_txw_idx(tx_size);
    const int32_t txh_idx         = get_txh_idx(tx_size);
    const int32_t cos_bit_col     = inv_cos_bit_col[txw_idx][txh_idx];
    const int32_t txfm_size_col   = tx_size_wide[tx_size];
    const int32_t txfm_size_row   = tx_size_high[tx_size];
    const int32_t buf_size_w_div8 = (eobx + 8) >> 3; // strips with non-zero coeffs
    const int32_t input_stride    = AOMMIN(32, txfm_size_col);
    const int32_t rect_type       = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

    // Reduced column kernel chosen from the vertical eob extent.
    const int32_t fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
    ASSERT(fun_idx < 4);
    const Transform1dSsse3 col_txfm =
        lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];

    assert(col_txfm != NULL);

    int32_t ud_flip, lr_flip;
    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
    for (int32_t i = 0; i < buf_size_w_div8; i++) {
        __m128i buf0[64];
        // Identity row pass directly into column order (only eoby+1 rows).
        iidentity_row_8xn_ssse3(
            buf0, input + 8 * i, input_stride, shift[0], eoby + 1, txw_idx, rect_type);
        col_txfm(buf0, buf0, cos_bit_col);
        // shift[1] is negative; 1 << (15 + shift[1]) is the matching Q15
        // rounding multiplier for _mm_mulhrs_epi16.
        __m128i       mshift = _mm_set1_epi16(1 << (15 + shift[1]));
        int32_t       k      = ud_flip ? (txfm_size_row - 1) : 0;
        const int32_t step   = ud_flip ? -1 : 1;
        uint8_t *     out_r  = output_r + 8 * i;
        uint8_t *     out_w  = output_w + 8 * i;
        for (int32_t j = 0; j < txfm_size_row; ++j, k += step) {
            const __m128i v = _mm_loadl_epi64((__m128i const *)(out_r));
            ASSERT(k >= 0);
            __m128i       res = _mm_mulhrs_epi16(buf0[k], mshift);
            const __m128i u   = lowbd_get_recon_8x8_sse2(v, res);
            _mm_storel_epi64((__m128i *)(out_w), u);
            out_r += stride_r;
            out_w += stride_w;
        }
    }
}
2575 
lowbd_inv_txfm2d_add_v_identity_ssse3(const int32_t * input,uint8_t * output_r,int32_t stride_r,uint8_t * output_w,int32_t stride_w,TxType tx_type,TxSize tx_size,int32_t eob)2576 static INLINE void lowbd_inv_txfm2d_add_v_identity_ssse3(const int32_t *input, uint8_t *output_r,
2577                                                          int32_t stride_r, uint8_t *output_w,
2578                                                          int32_t stride_w, TxType tx_type,
2579                                                          TxSize tx_size, int32_t eob) {
2580     __m128i buf1[64];
2581     int32_t eobx, eoby;
2582     get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
2583     const int8_t *shift           = eb_inv_txfm_shift_ls[tx_size];
2584     const int32_t txw_idx         = get_txw_idx(tx_size);
2585     const int32_t txh_idx         = get_txh_idx(tx_size);
2586     const int32_t cos_bit_row     = inv_cos_bit_row[txw_idx][txh_idx];
2587     const int32_t txfm_size_col   = tx_size_wide[tx_size];
2588     const int32_t txfm_size_row   = tx_size_high[tx_size];
2589     const int32_t buf_size_w_div8 = txfm_size_col >> 3;
2590     const int32_t buf_size_h_div8 = (eoby + 8) >> 3;
2591     const int32_t input_stride    = AOMMIN(32, txfm_size_col);
2592     const int32_t rect_type       = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
2593 
2594     const int32_t          fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
2595     const Transform1dSsse3 row_txfm =
2596         lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
2597 
2598     assert(row_txfm != NULL);
2599     int32_t ud_flip, lr_flip;
2600     get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2601     for (int32_t i = 0; i < buf_size_h_div8; i++) {
2602         __m128i        buf0[64];
2603         const int32_t *input_row = input + i * input_stride * 8;
2604         for (int32_t j = 0; j < AOMMIN(4, buf_size_w_div8); ++j) {
2605             __m128i *buf0_cur = buf0 + j * 8;
2606             load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8);
2607             transpose_16bit_8x8(buf0_cur, buf0_cur);
2608         }
2609         if (rect_type == 1 || rect_type == -1)
2610             round_shift_ssse3(buf0, buf0, input_stride); // rect special code
2611         row_txfm(buf0, buf0, cos_bit_row);
2612         round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
2613         __m128i *_buf1 = buf1;
2614         if (lr_flip) {
2615             for (int32_t j = 0; j < buf_size_w_div8; ++j) {
2616                 __m128i temp[8];
2617                 flip_buf_sse2(buf0 + 8 * j, temp, 8);
2618                 transpose_16bit_8x8(temp, _buf1 + 8 * (buf_size_w_div8 - 1 - j));
2619             }
2620         } else {
2621             for (int32_t j = 0; j < buf_size_w_div8; ++j)
2622                 transpose_16bit_8x8(buf0 + 8 * j, _buf1 + 8 * j);
2623         }
2624 
2625         for (int32_t j = 0; j < buf_size_w_div8; ++j) {
2626             iidentity_col_8xn_ssse3(output_r + i * 8 * stride_r + j * 8,
2627                                     stride_r,
2628                                     output_w + i * 8 * stride_w + j * 8,
2629                                     stride_w,
2630                                     buf1 + j * 8,
2631                                     shift[1],
2632                                     8,
2633                                     txh_idx);
2634         }
2635     }
2636 }
2637 
2638 // for 32x32,32x64,64x32,64x64,32x8,8x32,16x32,32x16,64x16,16x64
lowbd_inv_txfm2d_add_universe_ssse3(const int32_t * input,uint8_t * output_r,int32_t stride_r,uint8_t * output_w,int32_t stride_w,TxType tx_type,TxSize tx_size,int32_t eob)2639 static INLINE void lowbd_inv_txfm2d_add_universe_ssse3(const int32_t *input, uint8_t *output_r,
2640                                                        int32_t stride_r, uint8_t *output_w,
2641                                                        int32_t stride_w, TxType tx_type,
2642                                                        TxSize tx_size, int32_t eob) {
2643     switch (tx_type) {
2644     case DCT_DCT:
2645         lowbd_inv_txfm2d_add_no_identity_ssse3(
2646             input, output_r, stride_r, output_w, stride_w, tx_type, tx_size, eob);
2647         break;
2648     case IDTX:
2649         lowbd_inv_txfm2d_add_idtx_ssse3(input, output_r, stride_r, output_w, stride_w, tx_size);
2650         break;
2651     case V_DCT:
2652     case V_ADST:
2653     case V_FLIPADST:
2654         lowbd_inv_txfm2d_add_h_identity_ssse3(
2655             input, output_r, stride_r, output_w, stride_w, tx_type, tx_size, eob);
2656         break;
2657     case H_DCT:
2658     case H_ADST:
2659     case H_FLIPADST:
2660         lowbd_inv_txfm2d_add_v_identity_ssse3(
2661             input, output_r, stride_r, output_w, stride_w, tx_type, tx_size, eob);
2662         break;
2663     default:
2664         lowbd_inv_txfm2d_add_no_identity_ssse3(
2665             input, output_r, stride_r, output_w, stride_w, tx_type, tx_size, eob);
2666         break;
2667     }
2668 }
2669 
// 4x8 low-bitdepth 2D inverse transform + reconstruction add.
// Row pass on the 8-wide transposed data, column pass on 4-wide data, then
// the result is added to output_r and stored to output_w.
static void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input, uint8_t *output_r,
                                           int32_t stride_r, uint8_t *output_w, int32_t stride_w,
                                           TxType tx_type, TxSize tx_size_, int32_t eob) {
    (void)tx_size_;
    (void)eob;
    __m128i       txfm_buf[8];
    const TxSize  real_tx_size = TX_4X8;
    const int8_t *shift        = eb_inv_txfm_shift_ls[real_tx_size];
    const int32_t txw_idx      = get_txw_idx(real_tx_size);
    const int32_t txh_idx      = get_txh_idx(real_tx_size);
    const int32_t cos_bit_row  = inv_cos_bit_row[txw_idx][txh_idx];
    const int32_t cos_bit_col  = inv_cos_bit_col[txw_idx][txh_idx];
    const int32_t width        = tx_size_wide[real_tx_size];
    const int32_t height       = tx_size_high[real_tx_size];

    const Transform1dSsse3 row_txfm = lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
    const Transform1dSsse3 col_txfm = lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];

    int32_t ud_flip, lr_flip;
    get_flip_cfg(tx_type, &ud_flip, &lr_flip);

    // Row pass: load coefficients, transpose into row order, apply the
    // rectangular-block 1/sqrt(2) scale, then transform.
    load_buffer_32bit_to_16bit_w4(input, width, txfm_buf, height);
    transpose_16bit_4x8(txfm_buf, txfm_buf);
    round_shift_ssse3(txfm_buf, txfm_buf, width); // rect special code
    row_txfm(txfm_buf, txfm_buf, cos_bit_row);
    // shift[0] is 0 for this size, so the row round-shift is omitted.

    // Back to column order, mirroring horizontally when required.
    if (lr_flip) {
        __m128i mirrored[4];
        flip_buf_sse2(txfm_buf, mirrored, width);
        transpose_16bit_8x4(mirrored, txfm_buf);
    } else {
        transpose_16bit_8x4(txfm_buf, txfm_buf);
    }

    // Column pass, final round-shift, and add to the reconstruction.
    col_txfm(txfm_buf, txfm_buf, cos_bit_col);
    round_shift_16bit_ssse3(txfm_buf, height, shift[1]);
    lowbd_write_buffer_4xn_sse2(
        txfm_buf, output_r, stride_r, output_w, stride_w, ud_flip, height);
}
2706 
// 8x4 low-bitdepth 2D inverse transform + reconstruction add.
// Mirror image of the 4x8 kernel: row pass on 4-wide data, column pass on
// 8-wide data, result added to output_r and stored to output_w.
static void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input, uint8_t *output_r,
                                           int32_t stride_r, uint8_t *output_w, int32_t stride_w,
                                           TxType tx_type, TxSize tx_size_, int32_t eob) {
    (void)tx_size_;
    (void)eob;
    __m128i       txfm_buf[8];
    const TxSize  real_tx_size = TX_8X4;
    const int8_t *shift        = eb_inv_txfm_shift_ls[real_tx_size];
    const int32_t txw_idx      = get_txw_idx(real_tx_size);
    const int32_t txh_idx      = get_txh_idx(real_tx_size);
    const int32_t cos_bit_row  = inv_cos_bit_row[txw_idx][txh_idx];
    const int32_t cos_bit_col  = inv_cos_bit_col[txw_idx][txh_idx];
    const int32_t width        = tx_size_wide[real_tx_size];
    const int32_t height       = tx_size_high[real_tx_size];

    const Transform1dSsse3 row_txfm = lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
    const Transform1dSsse3 col_txfm = lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];

    int32_t ud_flip, lr_flip;
    get_flip_cfg(tx_type, &ud_flip, &lr_flip);

    // Row pass: load coefficients, transpose into row order, apply the
    // rectangular-block 1/sqrt(2) scale, then transform.
    load_buffer_32bit_to_16bit(input, width, txfm_buf, height);
    transpose_16bit_8x4(txfm_buf, txfm_buf);
    round_shift_ssse3(txfm_buf, txfm_buf, width); // rect special code
    row_txfm(txfm_buf, txfm_buf, cos_bit_row);
    // shift[0] is 0 for this size, so the row round-shift is omitted.

    // Back to column order, mirroring horizontally when required.
    if (lr_flip) {
        __m128i mirrored[8];
        flip_buf_sse2(txfm_buf, mirrored, width);
        transpose_16bit_4x8(mirrored, txfm_buf);
    } else {
        transpose_16bit_4x8(txfm_buf, txfm_buf);
    }

    // Column pass, final round-shift, and add to the reconstruction.
    col_txfm(txfm_buf, txfm_buf, cos_bit_col);
    round_shift_16bit_ssse3(txfm_buf, height, shift[1]);
    lowbd_write_buffer_8xn_sse2(
        txfm_buf, output_r, stride_r, output_w, stride_w, ud_flip, height);
}
2743 
// 4x16 low-bitdepth 2D inverse transform + reconstruction add.
// The 16 rows are handled as two 8-row halves: each half is loaded,
// transposed, row-transformed and transposed back before one 16-point
// column transform runs over the whole buffer.
static void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input, uint8_t *output_r,
                                            int32_t stride_r, uint8_t *output_w, int32_t stride_w,
                                            TxType tx_type, TxSize tx_size_, int32_t eob) {
    (void)tx_size_;
    (void)eob;
    __m128i       buf[16];
    const TxSize  tx_size       = TX_4X16;
    const int8_t *shift         = eb_inv_txfm_shift_ls[tx_size];
    const int32_t txw_idx       = get_txw_idx(tx_size);
    const int32_t txh_idx       = get_txh_idx(tx_size);
    const int32_t cos_bit_row   = inv_cos_bit_row[txw_idx][txh_idx];
    const int32_t cos_bit_col   = inv_cos_bit_col[txw_idx][txh_idx];
    const int32_t txfm_size_col = tx_size_wide[tx_size];
    const int32_t txfm_size_row = tx_size_high[tx_size];

    const Transform1dSsse3 row_txfm = lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
    const Transform1dSsse3 col_txfm = lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];

    int32_t ud_flip, lr_flip;
    get_flip_cfg(tx_type, &ud_flip, &lr_flip);

    const int32_t row_one_loop = 8;
    for (int32_t i = 0; i < 2; ++i) {
        const int32_t *input_cur = input + i * txfm_size_col * row_one_loop;
        __m128i *      buf_cur   = buf + i * row_one_loop;
        load_buffer_32bit_to_16bit_w4(input_cur, txfm_size_col, buf_cur, row_one_loop);
        transpose_16bit_4x8(buf_cur, buf_cur);
        if (row_txfm == iidentity4_new_ssse3) {
            // Identity rows: fuse the identity multiply (new_sqrt2) with the
            // row round-shift in a single madd + arithmetic shift.
            // NOTE(review): the 3 << (bits - 1) rounding bias paired with a
            // (bits + 1) shift presumably folds shift[0] into the scale —
            // verify against the reference implementation.
            const __m128i scale = pair_set_epi16(new_sqrt2, 3 << (new_sqrt2_bits - 1));
            const __m128i ones  = _mm_set1_epi16(1);
            for (int j = 0; j < 4; ++j) {
                // madd with interleaved ones computes x * scale_lo + bias_hi
                // per 32-bit lane.
                const __m128i buf_lo    = _mm_unpacklo_epi16(buf_cur[j], ones);
                const __m128i buf_hi    = _mm_unpackhi_epi16(buf_cur[j], ones);
                const __m128i buf_32_lo = _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale),
                                                         (new_sqrt2_bits + 1));
                const __m128i buf_32_hi = _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale),
                                                         (new_sqrt2_bits + 1));
                buf_cur[j]              = _mm_packs_epi32(buf_32_lo, buf_32_hi);
            }
        } else {
            row_txfm(buf_cur, buf_cur, cos_bit_row);
            round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]);
        }
        // Return this half to column order, mirroring horizontally if needed.
        if (lr_flip) {
            __m128i temp[8];
            flip_buf_sse2(buf_cur, temp, txfm_size_col);
            transpose_16bit_8x4(temp, buf_cur);
        } else
            transpose_16bit_8x4(buf_cur, buf_cur);
    }
    // Single 16-point column transform over both halves, then final
    // round-shift and add to the reconstruction.
    col_txfm(buf, buf, cos_bit_col);
    round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
    lowbd_write_buffer_4xn_sse2(
        buf, output_r, stride_r, output_w, stride_w, ud_flip, txfm_size_row);
}
2799 
// 16x4 low-bitdepth 2D inverse transform + reconstruction add.
// The 16 columns are handled as two 8-wide halves: both halves are loaded
// and transposed, one 16-point row transform runs over the whole buffer,
// then each half gets its own 4-point column transform and store.
static void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input, uint8_t *output_r,
                                            int32_t stride_r, uint8_t *output_w, int32_t stride_w,
                                            TxType tx_type, TxSize tx_size_, int32_t eob) {
    (void)tx_size_;
    (void)eob;
    __m128i       buf[16];
    const TxSize  tx_size         = TX_16X4;
    const int8_t *shift           = eb_inv_txfm_shift_ls[tx_size];
    const int32_t txw_idx         = get_txw_idx(tx_size);
    const int32_t txh_idx         = get_txh_idx(tx_size);
    const int32_t cos_bit_row     = inv_cos_bit_row[txw_idx][txh_idx];
    const int32_t cos_bit_col     = inv_cos_bit_col[txw_idx][txh_idx];
    const int32_t txfm_size_col   = tx_size_wide[tx_size];
    const int32_t txfm_size_row   = tx_size_high[tx_size];
    const int32_t buf_size_w_div8 = txfm_size_col >> 3; // 2 halves of 8 columns

    const Transform1dSsse3 row_txfm = lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
    const Transform1dSsse3 col_txfm = lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];

    int32_t ud_flip, lr_flip;
    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
    const int32_t row_one_loop = 8;
    assert(buf_size_w_div8 > 0);
    // Load both 8-column halves and transpose each into row order.
    for (int32_t i = 0; i < buf_size_w_div8; ++i) {
        const int32_t *input_cur = input + i * row_one_loop;
        __m128i *      buf_cur   = buf + i * row_one_loop;
        load_buffer_32bit_to_16bit(input_cur, txfm_size_col, buf_cur, txfm_size_row);
        transpose_16bit_8x4(buf_cur, buf_cur);
    }
    if (row_txfm == iidentity16_new_ssse3) {
        // Identity rows: fuse the identity multiply (2 * new_sqrt2 for the
        // 16-point identity) with the row round-shift in one madd + shift.
        // NOTE(review): bias/shift constants assumed to fold shift[0] into
        // the scale — verify against the reference implementation.
        const __m128i scale = pair_set_epi16(2 * new_sqrt2, 3 << (new_sqrt2_bits - 1));
        const __m128i ones  = _mm_set1_epi16(1);
        for (int j = 0; j < 16; ++j) {
            // madd with interleaved ones computes x * scale_lo + bias_hi
            // per 32-bit lane.
            const __m128i buf_lo    = _mm_unpacklo_epi16(buf[j], ones);
            const __m128i buf_hi    = _mm_unpackhi_epi16(buf[j], ones);
            const __m128i buf_32_lo = _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale),
                                                     (new_sqrt2_bits + 1));
            const __m128i buf_32_hi = _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale),
                                                     (new_sqrt2_bits + 1));
            buf[j]                  = _mm_packs_epi32(buf_32_lo, buf_32_hi);
        }
    } else {
        row_txfm(buf, buf, cos_bit_row);
        round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);
    }
    // Return to column order (all 16 registers), mirroring horizontally if
    // needed; the flip reverses the full 16-register buffer so the two
    // halves also swap places.
    if (lr_flip) {
        __m128i temp[16];
        flip_buf_sse2(buf, temp, 16);
        transpose_16bit_4x8(temp, buf);
        transpose_16bit_4x8(temp + 8, buf + 8);
    } else {
        transpose_16bit_4x8(buf, buf);
        transpose_16bit_4x8(buf + row_one_loop, buf + row_one_loop);
    }
    // Column transform + final round-shift, independently per 8-column half.
    for (int32_t i = 0; i < buf_size_w_div8; i++) {
        col_txfm(buf + i * row_one_loop, buf + i * row_one_loop, cos_bit_col);
        round_shift_16bit_ssse3(buf + i * row_one_loop, txfm_size_row, shift[1]);
    }
    // Add each half to the reconstruction at its own column offset.
    lowbd_write_buffer_8xn_sse2(buf, output_r, stride_r, output_w, stride_w, ud_flip, 4);
    lowbd_write_buffer_8xn_sse2(
        buf + 8, output_r + 8, stride_r, output_w + 8, stride_w, ud_flip, 4);
}
2862 
// Top-level low-bitdepth 2D inverse transform dispatcher: picks the kernel
// specialized for tx_size (dedicated small-size kernels, generic "universe"
// path otherwise), then invokes it once.
void svt_av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output_r, int32_t stride_r,
                                        uint8_t *output_w, int32_t stride_w, TxType tx_type,
                                        TxSize tx_size, int32_t eob) {
    void (*kernel)(const int32_t *, uint8_t *, int32_t, uint8_t *, int32_t, TxType, TxSize,
                   int32_t);
    switch (tx_size) {
    case TX_4X4: kernel = lowbd_inv_txfm2d_add_4x4_ssse3; break;
    case TX_4X8: kernel = lowbd_inv_txfm2d_add_4x8_ssse3; break;
    case TX_8X4: kernel = lowbd_inv_txfm2d_add_8x4_ssse3; break;
    case TX_4X16: kernel = lowbd_inv_txfm2d_add_4x16_ssse3; break;
    case TX_16X4: kernel = lowbd_inv_txfm2d_add_16x4_ssse3; break;
    default: kernel = lowbd_inv_txfm2d_add_universe_ssse3; break;
    }
    kernel(input, output_r, stride_r, output_w, stride_w, tx_type, tx_size, eob);
}
2893 
svt_av1_inv_txfm_add_ssse3(const TranLow * dqcoeff,uint8_t * dst_r,int32_t stride_r,uint8_t * dst_w,int32_t stride_w,const TxfmParam * txfm_param)2894 void svt_av1_inv_txfm_add_ssse3(const TranLow *dqcoeff, uint8_t *dst_r, int32_t stride_r,
2895                                 uint8_t *dst_w, int32_t stride_w, const TxfmParam *txfm_param) {
2896     const TxType tx_type = txfm_param->tx_type;
2897     if (!txfm_param->lossless) {
2898         svt_av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff,
2899                                            dst_r,
2900                                            stride_r,
2901                                            dst_w,
2902                                            stride_w,
2903                                            tx_type,
2904                                            txfm_param->tx_size,
2905                                            txfm_param->eob);
2906     } else {
2907         svt_av1_inv_txfm_add_c(dqcoeff, dst_r, stride_r, dst_w, stride_w, txfm_param);
2908     }
2909 }
2910